├── .gitattributes
├── Dockerfile
├── README.md
├── app.py
├── facewrapper
    ├── dict
    │   ├── data1.bin
    │   ├── data2.bin
    │   └── data3.bin
    ├── facewrapper.py
    └── libs
    │   ├── libimutils.so
    │   ├── libimutils.so_for_ubuntu22
    │   └── libttvfaceengine7.so
├── gradio
    ├── demo.py
    └── examples
    │   ├── 1.jpg
    │   ├── 2.jpg
    │   ├── 3.jpg
    │   └── 4.jpg
├── openvino
    ├── cache.json
    ├── libgna.so
    ├── libgna.so.2
    ├── libgna.so.3.0.0.1455
    ├── libopenvino.so
    ├── libopenvino_auto_batch_plugin.so
    ├── libopenvino_auto_plugin.so
    ├── libopenvino_c.so
    ├── libopenvino_gapi_preproc.so
    ├── libopenvino_hetero_plugin.so
    ├── libopenvino_intel_cpu_plugin.so
    ├── libopenvino_intel_gna_plugin.so
    ├── libopenvino_intel_hddl_plugin.so
    ├── libopenvino_intel_myriad_plugin.so
    ├── libopenvino_ir_frontend.so
    ├── libopenvino_onnx_frontend.so
    ├── libopenvino_paddle_frontend.so
    ├── libopenvino_tensorflow_fe.so
    ├── pcie-ma2x8x.mvcmd
    ├── plugins.xml
    ├── usb-ma2x8x.mvcmd
    └── vpu_custom_kernels
    │   ├── binarization.bin
    │   ├── binarization.cl
    │   ├── binary_convolution.bin
    │   ├── binary_convolution.cl
    │   ├── binary_convolution1x1.bin
    │   ├── binary_convolution1x1.cl
    │   ├── binary_convolution3x3.bin
    │   ├── binary_convolution3x3.cl
    │   ├── convolution1x1_chw.bin
    │   ├── convolution1x1_chw.cl
    │   ├── convolution1x1_hwc.bin
    │   ├── convolution1x1_hwc.cl
    │   ├── convolution3x3.bin
    │   ├── convolution3x3.cl
    │   ├── correlate.bin
    │   ├── correlate.cl
    │   ├── ctc.bin
    │   ├── ctc.cl
    │   ├── customLayerBindings.xml
    │   ├── cvtf32f16.bin
    │   ├── cvtf32f16.cl
    │   ├── cvtu8f16.bin
    │   ├── cvtu8f16.cl
    │   ├── detectron_prior_grid_gen.bin
    │   ├── detectron_prior_grid_gen.cl
    │   ├── fakequantize.bin
    │   ├── fakequantize.cl
    │   ├── grn.bin
    │   ├── grn.cl
    │   ├── mvn_reduction.bin
    │   ├── mvn_reduction.cl
    │   ├── mvn_scale.bin
    │   ├── mvn_scale.cl
    │   ├── region_chw.bin
    │   ├── region_chw.cl
    │   ├── region_hwc.bin
    │   ├── region_hwc.cl
    │   ├── reorg_chw.bin
    │   ├── reorg_chw.cl
    │   ├── reorg_hwc.bin
    │   ├── reorg_hwc.cl
    │   ├── reorg_hwc_naive.bin
    │   ├── reorg_hwc_naive.cl
    │   ├── resample_AA.bin
    │   ├── resample_AA.cl
    │   ├── resample_noAA.bin
    │   ├── resample_noAA.cl
    │   ├── shuffle_channels.bin
    │   ├── shuffle_channels.cl
    │   ├── st.bin
    │   └── st.cl
├── requirements.txt
└── run.sh


/.gitattributes:
--------------------------------------------------------------------------------
 1 | *.7z filter=lfs diff=lfs merge=lfs -text
 2 | *.arrow filter=lfs diff=lfs merge=lfs -text
 3 | *.bin filter=lfs diff=lfs merge=lfs -text
 4 | *.bz2 filter=lfs diff=lfs merge=lfs -text
 5 | *.ckpt filter=lfs diff=lfs merge=lfs -text
 6 | *.ftz filter=lfs diff=lfs merge=lfs -text
 7 | *.gz filter=lfs diff=lfs merge=lfs -text
 8 | *.h5 filter=lfs diff=lfs merge=lfs -text
 9 | *.joblib filter=lfs diff=lfs merge=lfs -text
10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text
11 | *.mlmodel filter=lfs diff=lfs merge=lfs -text
12 | *.model filter=lfs diff=lfs merge=lfs -text
13 | *.msgpack filter=lfs diff=lfs merge=lfs -text
14 | *.npy filter=lfs diff=lfs merge=lfs -text
15 | *.npz filter=lfs diff=lfs merge=lfs -text
16 | *.onnx filter=lfs diff=lfs merge=lfs -text
17 | *.ot filter=lfs diff=lfs merge=lfs -text
18 | *.parquet filter=lfs diff=lfs merge=lfs -text
19 | *.pb filter=lfs diff=lfs merge=lfs -text
20 | *.pickle filter=lfs diff=lfs merge=lfs -text
21 | *.pkl filter=lfs diff=lfs merge=lfs -text
22 | *.pt filter=lfs diff=lfs merge=lfs -text
23 | *.pth filter=lfs diff=lfs merge=lfs -text
24 | *.rar filter=lfs diff=lfs merge=lfs -text
25 | *.safetensors filter=lfs diff=lfs merge=lfs -text
26 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27 | *.tar.* filter=lfs diff=lfs merge=lfs -text
28 | *.tar filter=lfs diff=lfs merge=lfs -text
29 | *.tflite filter=lfs diff=lfs merge=lfs -text
30 | *.tgz filter=lfs diff=lfs merge=lfs -text
31 | *.wasm filter=lfs diff=lfs merge=lfs -text
32 | *.xz filter=lfs diff=lfs merge=lfs -text
33 | *.zip filter=lfs diff=lfs merge=lfs -text
34 | *.zst filter=lfs diff=lfs merge=lfs -text
35 | *tfevents* filter=lfs diff=lfs merge=lfs -text
36 | facewrapper/libs/libttvfaceengine7.so filter=lfs diff=lfs merge=lfs -text
37 | openvino/libgna.so filter=lfs diff=lfs merge=lfs -text
38 | openvino/libgna.so.2 filter=lfs diff=lfs merge=lfs -text
39 | openvino/libgna.so.3.0.0.1455 filter=lfs diff=lfs merge=lfs -text
40 | openvino/libopenvino_gapi_preproc.so filter=lfs diff=lfs merge=lfs -text
41 | openvino/libopenvino_intel_cpu_plugin.so filter=lfs diff=lfs merge=lfs -text
42 | openvino/libopenvino_intel_gna_plugin.so filter=lfs diff=lfs merge=lfs -text
43 | openvino/libopenvino_intel_hddl_plugin.so filter=lfs diff=lfs merge=lfs -text
44 | openvino/libopenvino_intel_myriad_plugin.so filter=lfs diff=lfs merge=lfs -text
45 | openvino/libopenvino_onnx_frontend.so filter=lfs diff=lfs merge=lfs -text
46 | openvino/libopenvino_tensorflow_fe.so filter=lfs diff=lfs merge=lfs -text
47 | openvino/libopenvino.so filter=lfs diff=lfs merge=lfs -text
48 | openvino/pcie-ma2x8x.mvcmd filter=lfs diff=lfs merge=lfs -text
49 | openvino/usb-ma2x8x.mvcmd filter=lfs diff=lfs merge=lfs -text
50 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM ubuntu:20.04
 2 | # Timezone baked into the image; override with --build-arg CONTAINER_TIMEZONE=<Area/City>.
 3 | # Without a declared default the variable expands to nothing and /etc/timezone is written empty.
 4 | ARG CONTAINER_TIMEZONE=Etc/UTC
 5 | RUN ln -snf /usr/share/zoneinfo/$CONTAINER_TIMEZONE /etc/localtime && echo $CONTAINER_TIMEZONE > /etc/timezone
 6 | # Update and install in a single layer so a stale cached package index is never reused;
 7 | # DEBIAN_FRONTEND=noninteractive keeps package configuration from prompting during the build.
 8 | RUN apt-get update -y && DEBIAN_FRONTEND=noninteractive apt-get install -y \
 9 |     python3 python3-pip python3-opencv libcurl4-openssl-dev libssl-dev
10 | # WORKDIR creates the directory when it does not exist, so no explicit mkdir is needed.
11 | WORKDIR /home/FaceOnLive_v7
12 | COPY ./facewrapper ./facewrapper
13 | COPY ./facewrapper/libs/libimutils.so /usr/lib
14 | COPY ./gradio ./gradio
15 | COPY ./openvino /usr/lib
16 | COPY ./app.py ./app.py
17 | COPY ./run.sh .
18 | COPY ./requirements.txt ./requirements.txt
19 | RUN pip3 install -r requirements.txt
20 | RUN chmod a+x run.sh
21 | CMD ["./run.sh"]
22 | # NOTE(review): app.py defaults to port 8000 and gradio/demo.py listens on 7860; confirm 9000 matches what run.sh starts.
23 | EXPOSE 9000


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | <div align="center">
 2 |   <a href="https://join.slack.com/t/faceonlive/shared_invite/zt-2drx19c5t-vQsR4TUGPD8oL7i7BXdKZA">Slack</a>
 3 |     ·
 4 |    <a href="https://www.faceonlive.com/">Website</a>
 5 |     ·
 6 |    <a href="https://portfolio.faceonlive.com">Portfolio</a>  
 7 |     ·
 8 |     <a href="https://www.huggingface.co/FaceOnLive">Hugging Face</a>
 9 |     ·
10 |     <a href="https://getapi.faceonlive.com">Free APIs</a>
11 |     ·
12 |     <a href="https://github.com/FaceOnLive/OpenKYC">OpenKYC</a>  
13 |     ·
14 |     <a href="https://github.com/FaceOnLive/Mask-Face-Attendance-App-Flutter">Face Attendance</a>  
15 |     ·
16 |     <a href="mailto:contact@faceonlive.com">Contact</a>
17 | </div>
18 | <h1 align="center">Face Liveness Detection SDK For Linux</h1>
19 | <p align="center">Fully Offline, On-Premise Face Liveness Detection SDK for Linux</p>
20 | 
21 | <br/>
22 | Documentation at https://docs.faceonlive.com
23 | <br/>
24 | 
25 | ## :tada:  Try It Yourself on our [Portfolio Website](https://portfolio.faceonlive.com/#server_sdks/server/liv)
26 | 
27 | Integrated into [Huggingface Spaces 🤗](https://huggingface.co/spaces) using [Gradio](https://github.com/gradio-app/gradio). Try out the Web Demo: [![Hugging Face Spaces](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue)](https://huggingface.co/spaces/FaceOnLive/Face-Liveness-Detection-SDK)
28 | 
29 | 
30 | https://user-images.githubusercontent.com/91896009/187945910-4ca6d27c-d058-4749-a834-44914a5a957c.mp4
31 | 
32 | 
33 | ## :clap:  Supporters
34 | [![Stargazers repo roster for @faceonlive/Face-Liveness-Detection-SDK-Linux](https://reporoster.com/stars/faceonlive/Face-Liveness-Detection-SDK-Linux)](https://github.com/faceonlive/Face-Liveness-Detection-SDK-Linux/stargazers)
35 | [![Forkers repo roster for @faceonlive/Face-Liveness-Detection-SDK-Linux](https://reporoster.com/forks/faceonlive/Face-Liveness-Detection-SDK-Linux)](https://github.com/faceonlive/Face-Liveness-Detection-SDK-Linux/network/members)
36 | <p align="center"><a href="https://github.com/nastyox/Rando.js#nastyox"><img src="http://randojs.com/images/barsSmallTransparentBackground.gif" alt="Animated footer bars" width="100%"/></a></p>
37 | 
38 | ## 🏃  How to run
39 | ### 1. Download and install dependencies
40 | To begin, follow these steps to download and install the necessary dependencies:
41 | ```
42 | git clone https://github.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux
43 | cd Face-Liveness-Detection-SDK-Linux
44 | chmod +x ./install_dependency.sh
45 | sudo ./install_dependency.sh
46 | ```
47 | ### 2. Execute the Python Flask application
48 | Next, run the Python Flask application by executing the following command:
49 | ```
50 | python3 app.py
51 | ```
52 | ### 3. Activate the SDK
53 | #### - Online License
54 | If you have an online license, set the license key we provided in the `LICENSE_KEY` environment variable, which the application reads here:
55 | https://github.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/blob/6e702fa01aeabbfb395d82c637a66dc18a93f2fb/app.py#L23-L23
56 | #### - Offline License
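57 | If you have an offline license, share your machine's HWID (Hardware ID) with us to receive a `license.txt` file; the HWID is printed at startup when online activation fails (see the sample output below). Place `license.txt` at the path configured in the following file: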
58 | https://github.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/blob/6e702fa01aeabbfb395d82c637a66dc18a93f2fb/app.py#L24-L24
59 | ```
60 | online init failed: 6
61 | hwid:  IXwjedMe8M5cZX/GwU3NEOqJRcqLwldq27HSLyFiejbGDB9XVgytA1RgJukV3mWWTNo84NwTMYU=
62 | ```
63 | ### 4. Using Docker
64 | - Build the Docker image:
65 | ```
66 | sudo docker build --pull --rm -f Dockerfile -t faceonlive_v7:latest .
67 | ```
68 | - Run Docker with online license:
69 | ```
70 | sudo docker run --network host faceonlive_v7
71 | ```
72 | - Run Docker with offline license:
73 | ```
74 | sudo docker run -v $(pwd)/license.txt:/home/FaceOnLive_v7/license.txt --network host faceonlive_v7
75 | ```
76 | ### 5. Test endpoint
77 | To test the endpoint, download the Postman Collection from the following link:
78 | [FaceOnLive.postman_collection.json](https://github.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/blob/main/FaceOnLive.postman_collection.json)
79 | 
80 | ![image](https://github.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/assets/91896009/417e4fe3-9a01-43b3-a95b-d379ad4bdf17)
81 | 
82 | ![image](https://github.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/assets/91896009/2275503e-49f0-4c72-9922-6e750a26dd62)
83 | 
84 | 


--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | sys.path.append('.')
  3 | 
  4 | from flask import Flask, request, jsonify
  5 | from time import gmtime, strftime
  6 | import os
  7 | import base64
  8 | import json
  9 | import cv2
 10 | import numpy as np
 11 | 
 12 | from facewrapper.facewrapper import ttv_version
 13 | from facewrapper.facewrapper import ttv_get_hwid
 14 | from facewrapper.facewrapper import ttv_init
 15 | from facewrapper.facewrapper import ttv_init_offline
 16 | from facewrapper.facewrapper import ttv_detect_face
 17 | 
 18 | app = Flask(__name__) 
 19 | 
 20 | app.config['SITE'] = "http://0.0.0.0:8000/"
 21 | app.config['DEBUG'] = False
 22 | 
 23 | licenseKey = os.environ.get("LICENSE_KEY")
 24 | licensePath = "license.txt"
 25 | modelFolder = os.path.abspath(os.path.dirname(__file__)) + '/facewrapper/dict'
 26 | 
 27 | version = ttv_version()
 28 | print("version: ", version.decode('utf-8'))
 29 | 
 30 | ret = ttv_init(modelFolder.encode('utf-8'), licenseKey.encode('utf-8'))
 31 | if ret != 0:
 32 |     print(f"online init failed: {ret}");
 33 | 
 34 |     hwid = ttv_get_hwid()
 35 |     print("hwid: ", hwid.decode('utf-8'))
 36 | 
 37 |     ret = ttv_init_offline(modelFolder.encode('utf-8'), licensePath.encode('utf-8'))
 38 |     if ret != 0:
 39 |         print(f"offline init failed: {ret}")
 40 |         exit(-1)
 41 |     else:
 42 |         print(f"offline init ok")
 43 | 
 44 | else:
 45 |     print(f"online init ok")
 46 | 
 47 | @app.route('/api/liveness', methods=['POST'])
 48 | def check_liveness():
 49 |   file = request.files['image']
 50 |   image = cv2.imdecode(np.fromstring(file.read(), np.uint8), cv2.IMREAD_COLOR)
 51 |   
 52 |   faceRect = np.zeros([4], dtype=np.int32)
 53 |   livenessScore = np.zeros([1], dtype=np.double)
 54 |   angles = np.zeros([3], dtype=np.double)
 55 |   ret = ttv_detect_face(image, image.shape[1], image.shape[0], faceRect, livenessScore, angles)
 56 |   if ret == -1:
 57 |       result = "license error!"
 58 |   elif ret == -2:
 59 |       result = "init error!"
 60 |   elif ret == 0:
 61 |       result = "no face detected!"
 62 |   elif ret > 1:
 63 |       result = "multiple face detected!"
 64 |   elif faceRect[0] < 0 or faceRect[1] < 0 or faceRect[2] >= image.shape[1] or faceRect[2] >= image.shape[0]:
 65 |       result = "faace is in boundary!"
 66 |   elif livenessScore[0] > 0.5:
 67 |       result = "genuine"
 68 |   else:
 69 |       result = "spoof"
 70 |   
 71 |   status = "ok"
 72 |   response = jsonify({"status": status, "data": {"result": result, "face_rect": {"x": int(faceRect[0]), "y": int(faceRect[1]), "w": int(faceRect[2] - faceRect[0] + 1), "h" : int(faceRect[3] - faceRect[1] + 1)}, "liveness_score": livenessScore[0],
 73 |     "angles": {"yaw": angles[0], "roll": angles[1], "pitch": angles[2]}}})
 74 | 
 75 |   response.status_code = 200
 76 |   response.headers["Content-Type"] = "application/json; charset=utf-8"
 77 |   return response
 78 | 
 79 | @app.route('/api/liveness_base64', methods=['POST'])
 80 | def check_liveness_base64():
 81 |   content = request.get_json()
 82 |   imageBase64 = content['image']
 83 |   image = cv2.imdecode(np.frombuffer(base64.b64decode(imageBase64), dtype=np.uint8), cv2.IMREAD_COLOR)
 84 | 
 85 |   faceRect = np.zeros([4], dtype=np.int32)
 86 |   livenessScore = np.zeros([1], dtype=np.double)
 87 |   angles = np.zeros([3], dtype=np.double)
 88 |   ret = ttv_detect_face(image, image.shape[1], image.shape[0], faceRect, livenessScore, angles)
 89 |   if ret == -1:
 90 |       result = "license error!"
 91 |   elif ret == -2:
 92 |       result = "init error!"
 93 |   elif ret == 0:
 94 |       result = "no face detected!"
 95 |   elif ret > 1:
 96 |       result = "multiple face detected!"
 97 |   elif faceRect[0] < 0 or faceRect[1] < 0 or faceRect[2] >= image.shape[1] or faceRect[2] >= image.shape[0]:
 98 |       result = "faace is in boundary!"
 99 |   elif livenessScore[0] > 0.5:
100 |       result = "genuine"
101 |   else:
102 |       result = "spoof"
103 |   
104 |   status = "ok"
105 |   response = jsonify({"status": status, "data": {"result": result, "face_rect": {"x": int(faceRect[0]), "y": int(faceRect[1]), "w": int(faceRect[2] - faceRect[0] + 1), "h" : int(faceRect[3] - faceRect[1] + 1)}, "liveness_score": livenessScore[0],
106 |     "angles": {"yaw": angles[0], "roll": angles[1], "pitch": angles[2]}}})
107 | 
108 |   response.status_code = 200
109 |   response.headers["Content-Type"] = "application/json; charset=utf-8"
110 |   return response
111 | 
112 | 
113 | if __name__ == '__main__':
114 |     port = int(os.environ.get("PORT", 8000))
115 |     app.run(host='0.0.0.0', port=port)
116 | 


--------------------------------------------------------------------------------
/facewrapper/dict/data1.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:36cf5fcc49345989a86839a53529314ec1fe5d621c377a1952bc7538d55e7f1b
3 | size 16255630
4 | 


--------------------------------------------------------------------------------
/facewrapper/dict/data2.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:6fa65c4b7df14f0c96c174868a1b1c675adc8c4a11e3c0807009f3d0cad51f5a
3 | size 280076956
4 | 


--------------------------------------------------------------------------------
/facewrapper/dict/data3.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:f25fb0cd3d70cb84c258e7109620f411c087e0875828d6ab86cc9c4838d49bec
3 | size 11875339
4 | 


--------------------------------------------------------------------------------
/facewrapper/facewrapper.py:
--------------------------------------------------------------------------------
 1 | import ctypes, ctypes.util
 2 | from ctypes import *
 3 | from numpy.ctypeslib import ndpointer
 4 | import sys
 5 | import os
 6 | sys.path.append('/opt/intel/openvino_2022/runtime/lib/intel64')
 7 | 
 8 | lib_path = os.path.abspath(os.path.dirname(__file__)) + '/libs/libttvfaceengine7.so'
 9 | liveness_engine = cdll.LoadLibrary(lib_path)
10 | 
11 | ttv_version = liveness_engine.ttv_version
12 | ttv_version.argtypes = []
13 | ttv_version.restype = ctypes.c_char_p
14 | 
15 | ttv_get_hwid = liveness_engine.ttv_get_hwid
16 | ttv_get_hwid.argtypes = []
17 | ttv_get_hwid.restype = ctypes.c_char_p
18 | 
19 | ttv_init = liveness_engine.ttv_init
20 | ttv_init.argtypes = [ctypes.c_char_p, ctypes.c_char_p]
21 | ttv_init.restype = ctypes.c_int32
22 | 
23 | ttv_init_offline = liveness_engine.ttv_init_offline
24 | ttv_init_offline.argtypes = [ctypes.c_char_p, ctypes.c_char_p]
25 | ttv_init_offline.restype = ctypes.c_int32
26 | 
27 | 
28 | ttv_detect_face = liveness_engine.ttv_detect_face
29 | ttv_detect_face.argtypes = [ndpointer(ctypes.c_ubyte, flags='C_CONTIGUOUS'), ctypes.c_int32, ctypes.c_int32, ndpointer(ctypes.c_int32, flags='C_CONTIGUOUS'), ndpointer(ctypes.c_double, flags='C_CONTIGUOUS'), ndpointer(ctypes.c_double, flags='C_CONTIGUOUS')]
30 | ttv_detect_face.restype = ctypes.c_int32
31 | 
32 | 


--------------------------------------------------------------------------------
/facewrapper/libs/libimutils.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/04d37814737b9afded1d040726c3b5622e9954d2/facewrapper/libs/libimutils.so


--------------------------------------------------------------------------------
/facewrapper/libs/libimutils.so_for_ubuntu22:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/04d37814737b9afded1d040726c3b5622e9954d2/facewrapper/libs/libimutils.so_for_ubuntu22


--------------------------------------------------------------------------------
/facewrapper/libs/libttvfaceengine7.so:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:b3d6f12326c8bd60242dd7366cfebeef69d25a296bdd9d329d3033e8b70e782f
3 | size 3664979
4 | 


--------------------------------------------------------------------------------
/gradio/demo.py:
--------------------------------------------------------------------------------
 1 | import gradio as gr
 2 | import requests
 3 | import json
 4 | 
 5 | def face_liveness(frame):
 6 |     url = "http://127.0.0.1:8000/api/liveness"
 7 |     files = None
 8 |     if frame is None:
 9 |         return ['', None]
10 | 
11 |     files = {'image': open(frame, 'rb')}
12 |     r = requests.post(url=url, files=files)
13 |     return r.json()
14 | 
15 | with gr.Blocks() as demo:
16 |     gr.Markdown(
17 |         """
18 |     # Face Liveness Detection
19 |     Get your own Face Liveness Detection Server by duplicating this space.<br/>
20 |     Contact us at contact@faceonlive.com for issues and support.<br/>
21 |     """
22 |     )
23 |     with gr.Row():
24 |         with gr.Column(scale=5):
25 |             image_input = gr.Image(type='filepath')
26 |             gr.Examples(['gradio/examples/1.jpg', 'gradio/examples/2.jpg', 'gradio/examples/3.jpg', 'gradio/examples/4.jpg'], 
27 |                             inputs=image_input)
28 |             face_liveness_button = gr.Button("Check Liveness")
29 |         with gr.Column(scale=5):
30 |             liveness_result_output = gr.JSON()
31 |     
32 |     face_liveness_button.click(face_liveness, inputs=image_input, outputs=liveness_result_output)
33 | 
34 | demo.launch(server_name="0.0.0.0", server_port=7860)


--------------------------------------------------------------------------------
/gradio/examples/1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/04d37814737b9afded1d040726c3b5622e9954d2/gradio/examples/1.jpg


--------------------------------------------------------------------------------
/gradio/examples/2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/04d37814737b9afded1d040726c3b5622e9954d2/gradio/examples/2.jpg


--------------------------------------------------------------------------------
/gradio/examples/3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/04d37814737b9afded1d040726c3b5622e9954d2/gradio/examples/3.jpg


--------------------------------------------------------------------------------
/gradio/examples/4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/04d37814737b9afded1d040726c3b5622e9954d2/gradio/examples/4.jpg


--------------------------------------------------------------------------------
/openvino/libgna.so:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:22441d86dca92b00ae7fb9d315bcb1c6a8a213ac4fe86396489753ebe76f869e
3 | size 3120536
4 | 


--------------------------------------------------------------------------------
/openvino/libgna.so.2:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:22441d86dca92b00ae7fb9d315bcb1c6a8a213ac4fe86396489753ebe76f869e
3 | size 3120536
4 | 


--------------------------------------------------------------------------------
/openvino/libgna.so.3.0.0.1455:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:22441d86dca92b00ae7fb9d315bcb1c6a8a213ac4fe86396489753ebe76f869e
3 | size 3120536
4 | 


--------------------------------------------------------------------------------
/openvino/libopenvino.so:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:fd216848c1ba78e62360c12c9684df0c160f6962f3d900e5918cc042b42b2b46
3 | size 13495416
4 | 


--------------------------------------------------------------------------------
/openvino/libopenvino_auto_batch_plugin.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/04d37814737b9afded1d040726c3b5622e9954d2/openvino/libopenvino_auto_batch_plugin.so


--------------------------------------------------------------------------------
/openvino/libopenvino_auto_plugin.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/04d37814737b9afded1d040726c3b5622e9954d2/openvino/libopenvino_auto_plugin.so


--------------------------------------------------------------------------------
/openvino/libopenvino_c.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/04d37814737b9afded1d040726c3b5622e9954d2/openvino/libopenvino_c.so


--------------------------------------------------------------------------------
/openvino/libopenvino_gapi_preproc.so:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:3ac5ce0a8f3acefb41e8aa8161f78035dafff25c4b8c3485ebc541573b2b15f0
3 | size 1312920
4 | 


--------------------------------------------------------------------------------
/openvino/libopenvino_hetero_plugin.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/04d37814737b9afded1d040726c3b5622e9954d2/openvino/libopenvino_hetero_plugin.so


--------------------------------------------------------------------------------
/openvino/libopenvino_intel_cpu_plugin.so:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:afe05ada6d5b11495a21787fa6ab0162fc40f7a9ab97be78f7b7185126d15b18
3 | size 33299880
4 | 


--------------------------------------------------------------------------------
/openvino/libopenvino_intel_gna_plugin.so:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ef15b623e7f81788160c4056ccd5e887a8184affe381e84a906646ef36cae1ab
3 | size 4067016
4 | 


--------------------------------------------------------------------------------
/openvino/libopenvino_intel_hddl_plugin.so:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:96362327fbc404e88583bdcd2a526ccbf4ca26d4ecdb8898234be7986d9b8b2b
3 | size 5894680
4 | 


--------------------------------------------------------------------------------
/openvino/libopenvino_intel_myriad_plugin.so:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e596436002565356b80400e0d7e50093d53d338f623b171f658de527477852de
3 | size 6120168
4 | 


--------------------------------------------------------------------------------
/openvino/libopenvino_ir_frontend.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/04d37814737b9afded1d040726c3b5622e9954d2/openvino/libopenvino_ir_frontend.so


--------------------------------------------------------------------------------
/openvino/libopenvino_onnx_frontend.so:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:0770ed09d471b20bffcf4ef57ab1fb002db04c4404598bd5c52a4418a67f5441
3 | size 3781640
4 | 


--------------------------------------------------------------------------------
/openvino/libopenvino_paddle_frontend.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/04d37814737b9afded1d040726c3b5622e9954d2/openvino/libopenvino_paddle_frontend.so


--------------------------------------------------------------------------------
/openvino/libopenvino_tensorflow_fe.so:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:c2dadbcd8ba32cec02873caf8dcc644d1d8856cdcd2978c603e5bac169e01bb9
3 | size 2723864
4 | 


--------------------------------------------------------------------------------
/openvino/pcie-ma2x8x.mvcmd:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:f03146453508f2bcab1589907bccaa429b48db6123a7b8a428d6ce221d1fbb4d
3 | size 2099248
4 | 


--------------------------------------------------------------------------------
/openvino/plugins.xml:
--------------------------------------------------------------------------------
 1 | <ie>
 2 |     <plugins>
 3 |         <plugin name="AUTO" location="libopenvino_auto_plugin.so">
 4 |             <properties>
 5 |                 <property key="MULTI_WORK_MODE_AS_AUTO" value="YES"/>
 6 |             </properties>
 7 |         </plugin>
 8 |         <plugin name="BATCH" location="libopenvino_auto_batch_plugin.so">
 9 |         </plugin>
10 |         <plugin name="CPU" location="libopenvino_intel_cpu_plugin.so">
11 |         </plugin>
12 |         <plugin name="GNA" location="libopenvino_intel_gna_plugin.so">
13 |         </plugin>
14 |         <plugin name="GPU" location="libopenvino_intel_gpu_plugin.so">
15 |         </plugin>
16 |         <plugin name="HETERO" location="libopenvino_hetero_plugin.so">
17 |         </plugin>
18 |         <plugin name="MULTI" location="libopenvino_auto_plugin.so">
19 |         </plugin>
20 |         <plugin name="MYRIAD" location="libopenvino_intel_myriad_plugin.so">
21 |         </plugin>
22 |         <plugin name="HDDL" location="libopenvino_intel_hddl_plugin.so">
23 |         </plugin>
24 |         <plugin name="VPUX" location="libopenvino_intel_vpux_plugin.so">
25 |         </plugin>
26 |     </plugins>
27 | </ie>


--------------------------------------------------------------------------------
/openvino/usb-ma2x8x.mvcmd:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:faf33388b88708177a358fcb4704eba04b1cf9e88d6a047f90c833d686140a2e
3 | size 2298632
4 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/binarization.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:3e0de6082c7bacca2ff5ad131f0afc44304fc792a6d99e7829399eb61491a0ac
3 | size 19632
4 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/binarization.cl:
--------------------------------------------------------------------------------
 1 | // Copyright (C) 2018-2022 Intel Corporation
 2 | // SPDX-License-Identifier: Apache-2.0
 3 | //
 4 | 
 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
 7 | 
 8 | __kernel void binarization(  // Thresholds one W*H plane of halfs per work-group (dimension 2) into two output levels.
 9 |     const __global half *__restrict src_data,  // input planes, W*H elements each
10 |     const __global half *__restrict input_low_high,  // threshold(s): one shared value, or one per index c
11 |     const __global half *__restrict dst_data,  // output planes; NOTE(review): declared const yet used as the copy destination below
12 |     int switch_out,  // non-zero swaps which of +1 / -1 is written on each side of the threshold
13 |     int input_low_high_size,  // 1 selects input_low_high[0] for every plane
14 |     int W,  // row length in elements
15 |     int H)  // number of rows
16 | {
17 |     __local half local_src[15 * 1024];  // staging buffers; NOTE(review): nothing here checks that W * H <= 15 * 1024
18 |     __local half local_dst[15 * 1024];
19 |     // Stage this work-group's input plane in local memory.
20 |     event_t e1 = async_work_group_copy(local_src, src_data + get_group_id(2) * W * H, W * H, 0);
21 |     wait_group_events(1, &e1);
22 | 
23 |     int c = get_global_id(2);
24 |     int C = get_global_size(2);  // not referenced again in this kernel
25 |     // Output levels: values <= threshold map to dst_low, all others to dst_high.
26 |     half dst_low  = switch_out ? 1.h : -1.h;
27 |     half dst_high = switch_out ? -1.h : 1.h;
28 |     // One threshold for all planes when a single value was supplied, otherwise the one at index c.
29 |     half s_ilow_ihigh = input_low_high_size == 1 ? input_low_high[0] : input_low_high[c];
30 | 
31 |     for (int h = 0; h < H; h++) {
32 | 
33 |         __local const half *__restrict addr_src = local_src + h * W;
34 |         __local half *__restrict addr_dst       = local_dst + h * W;
35 |         // Vector path: 8 elements per iteration; the scalar loop after #endif handles the remaining W % 8.
36 | #if 1
37 |         for (int w = 0; w < W / 8; w++) {
38 | 
39 |             half8 h_src_val8 = (*((__local half8 *)addr_src + w));
40 |             // Per-lane comparison results (each scalar comparison yields 0 or 1).
41 |             short8 cond1;
42 |             cond1.s0 = (h_src_val8.s0 <= s_ilow_ihigh);
43 |             cond1.s1 = (h_src_val8.s1 <= s_ilow_ihigh);
44 |             cond1.s2 = (h_src_val8.s2 <= s_ilow_ihigh);
45 |             cond1.s3 = (h_src_val8.s3 <= s_ilow_ihigh);
46 |             cond1.s4 = (h_src_val8.s4 <= s_ilow_ihigh);
47 |             cond1.s5 = (h_src_val8.s5 <= s_ilow_ihigh);
48 |             cond1.s6 = (h_src_val8.s6 <= s_ilow_ihigh);
49 |             cond1.s7 = (h_src_val8.s7 <= s_ilow_ihigh);
50 |             // Turn 1 into an all-ones mask and 0 into zero: ~(1 - 1) = ~0, ~(0 - 1) = 0.
51 |             cond1 = ~(cond1 - (short8)1);
52 |             // Bitwise select: dst_low bits where the mask is set, dst_high bits elsewhere.
53 |             short8 res = cond1 & as_short8((half8)dst_low) | ~cond1 & as_short8((half8)dst_high);
54 | 
55 |             *((__local half8 *)addr_dst + w) = as_half8(res);
56 |         }
57 | #endif
58 |         for (int w = W & (~0x7); w < W; w++) {
59 |             addr_dst[w] = (addr_src[w] <= s_ilow_ihigh) ? dst_low : dst_high;
60 |         }
61 |     }
62 | 
63 |     barrier(CLK_LOCAL_MEM_FENCE);
64 |     // Write the finished plane back to this work-group's slot in dst_data.
65 |     event_t e2 = async_work_group_copy(dst_data + get_group_id(2) * W * H, local_dst, W * H, 0);
66 |     wait_group_events(1, &e2);
67 | }
68 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/binary_convolution.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:12c349d6f73c233b158e1d67af31715c7b8bda79f191b1e759476e01e65bb64a
3 | size 10764
4 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/binary_convolution.cl:
--------------------------------------------------------------------------------
 1 | // Copyright (C) 2018-2022 Intel Corporation
 2 | // SPDX-License-Identifier: Apache-2.0
 3 | //
 4 | 
 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 6 | // Returns bit number `bit` (0 = least significant) of the packed weight byte `val` as 0 or 1.
 7 | int extract_weights(uchar val, int bit) { const int shifted = val >> bit; return shifted & 1; }
 8 | // Reference binary convolution: one work-item computes one output element (column, row, channel).
 9 | __kernel void binary_convolution(
10 |     const __global half *restrict src_data,
11 |     const __global uchar *restrict weights_data,
12 |     __global half *restrict dst_data,
13 |     float pad_value,
14 | 
15 |     int IW,
16 |     int IH,
17 |     int IC,
18 | 
19 |     int DW,
20 |     int DH,
21 | 
22 |     int GC,
23 | 
24 |     int KW,
25 |     int KH,
26 | 
27 |     int PW,
28 |     int PH,
29 | 
30 |     int SW,
31 |     int SH)
32 | {
33 |     // Work-item position inside the NDRange: dim 0 = column, dim 1 = row, dim 2 = channel.
34 |     const int ow = get_global_id(0);
35 |     const int oh = get_global_id(1);
36 |     const int ch = get_global_id(2);
37 | 
38 |     // The NDRange extents double as the output tensor extents.
39 |     const int OW = get_global_size(0);
40 |     const int OH = get_global_size(1);
41 |     const int OC = get_global_size(2);
42 | 
43 |     // Split the flat channel id into a group index and a channel index inside that group.
44 |     const int g  = ch % GC;
45 |     const int oc = ch / GC;
46 | 
47 |     // Padding is reduced to a single bit: 1 when pad_value is strictly positive.
48 |     const int pad_bit = (pad_value > 0.f) ? 1 : 0;
49 | 
50 |     // Weights are stored as packed bits, eight per byte.
51 |     const int bits_per_byte = 8;
52 | 
53 |     // The depth extent is always 1 for this kernel, so depth terms are left out of every index.
54 |     const int oidx = g * OC / GC * OH * OW + oc * OH * OW + oh * OW + ow;
55 | 
56 |     // Number of taps whose activation bit differs from the weight bit.
57 |     int mismatches = 0;
58 | 
59 |     for (int ic = 0; ic < IC / GC; ic++) {
60 |         for (int kh = 0; kh < KH; kh++) {
61 |             // Input row addressed by this kernel row; may land in the padding area.
62 |             const int ih = oh * SH - PH + kh * DH;
63 | 
64 |             for (int kw = 0; kw < KW; kw++) {
65 |                 // Input column addressed by this kernel column.
66 |                 const int iw = ow * SW - PW + kw * DW;
67 | 
68 |                 // Flat bit position of the weight for (g, oc, ic, kh, kw).
69 |                 const int widx = g * OC / GC * IC / GC * KH * KW + oc * IC / GC * KH * KW
70 |                                  + ic * KH * KW + kh * KW + kw;
71 |                 const int wbit =
72 |                     extract_weights(weights_data[widx / bits_per_byte], widx % bits_per_byte);
73 | 
74 |                 // Taps outside the input plane use the padding bit instead of reading memory.
75 |                 const int inside = (iw >= 0) && (iw < IW) && (ih >= 0) && (ih < IH);
76 | 
77 |                 int sbit = pad_bit;
78 |                 if (inside) {
79 |                     const int iidx = g * IC / GC * IH * IW + ic * IH * IW + ih * IW + iw;
80 | 
81 |                     // An activation counts as 1 only when it is strictly positive.
82 |                     sbit = (src_data[iidx] > 0.f) ? 1 : 0;
83 |                 }
84 | 
85 |                 mismatches += sbit ^ wbit;
86 |             }
87 |         }
88 |     }
89 | 
90 |     // Every tap contributes +1 when the bits agree and -1 when they differ, which sums to
91 |     // (tap count) - 2 * (mismatch count).
92 |     const int taps = IC / GC * KH * KW;
93 | 
94 |     dst_data[oidx] = (half)(taps - 2 * mismatches);
95 | }
96 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/binary_convolution1x1.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:6deff31d62aa84c643fbeba77e7dcd4ae5d9b488c1c98e07fffeb58ff8e9b945
3 | size 76316
4 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/binary_convolution1x1.cl:
--------------------------------------------------------------------------------
  1 | // Copyright (C) 2018-2022 Intel Corporation
  2 | // SPDX-License-Identifier: Apache-2.0
  3 | //
  4 | 
  5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
  6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
  7 | // Returns bit number `bit` (0 = least significant) of the packed weight byte `val` as 0 or 1.
  8 | ushort extract_weights(uchar val, int bit) { const int shifted = val >> bit; return shifted & 1; }
  9 | // Binary 1x1 convolution: each work-group writes one OW-element output row (oh) of one output channel (oc).
 10 | __kernel void binary_convolution(
 11 |     const __global half *restrict src_data,
 12 |     const __global uchar *restrict weights_data,
 13 |     __global half *restrict dst_data,
 14 |     float pad_value,
 15 | 
 16 |     int IW,
 17 |     int IH,
 18 |     int IC,
 19 | 
 20 |     int DW,
 21 |     int DH,
 22 | 
 23 |     int GC,
 24 | 
 25 |     int KW,
 26 |     int KH,
 27 | 
 28 |     int PW,
 29 |     int PH,
 30 | 
 31 |     int SW,
 32 |     int SH,
 33 | 
 34 |     int OW)
 35 | {
 36 |     __local half src_local[32 * 1024]; // input row SH * oh of each group channel; channel ic is read at ic * IW
 37 |     __local half dst_local[2 * 1024]; // the OW results of this work-group
 38 | 
 39 |     const int oh = get_group_id(0);
 40 |     const int oc = get_group_id(1);
 41 |     const int OH = get_global_size(0);
 42 |     const int OC = get_global_size(1);
 43 | 
 44 |     const int gc = oc / (OC / GC); // convolution group that output channel oc belongs to
 45 |     // Input row exists: copy it for all IC / GC channels of the group into src_local.
 46 |     if (oh * SH >= 0 && oh * SH <= IH - 1) {
 47 |         const __global half *src = src_data + (gc * IC / GC) * IW * IH + (SH * oh) * IW;
 48 | 
 49 |         event_t e1 = async_work_group_copy_2D2D(
 50 |             src_local, // dst
 51 |             src, // src
 52 |             IW, // num_elements_per_line,
 53 |             IC / GC, // num_lines,
 54 |             IH * IW - IW, // src_line_stride,
 55 |             0, // dst_line_stride,
 56 |             0);
 57 |         wait_group_events(1, &e1);
 58 |     }
 59 | 
 60 |     half pad_value_half = convert_half(pad_value);
 61 | 
 62 |     // padding row: the input row index is past IH - 1, so every channel row is filled with pad_value
 63 |     if (oh * SH > IH - 1) {
 64 |         __local half *dst = src_local;
 65 |         for (int c = 0; c < IC / GC; c++) {
 66 |             #pragma unroll 8
 67 |             for (int j = 0; j < IW; j++) {
 68 |                 dst[j] = pad_value_half;
 69 |             }
 70 |             dst += IW;
 71 |         }
 72 |     }
 73 |     // OWS: number of input columns spanned by OW outputs taken every SW columns.
 74 |     int OWS = SW * OW;
 75 |     ushort8 in;
 76 |     // Walk the row in blocks of 8 input columns; val counts activation/weight bit mismatches per column.
 77 |     for (int ows8 = 0; ows8 < (OWS + 7) / 8; ows8++) {
 78 |         ushort8 val = {0, 0, 0, 0, 0, 0, 0, 0};
 79 |         for (int ic = 0; ic < IC / GC; ++ic) {
 80 |             __local half *src = (__local half *)((__local half8 *)(src_local + ic * IW) + ows8);
 81 |             int weight_pos    = oc * IC / GC + ic;
 82 |             ushort w =
 83 |                 extract_weights(weights_data[((weight_pos + 0)) / 8], ((weight_pos + 0) % 8));
 84 |             // Load 8 raw fp16 bit patterns when the block starts inside the row.
 85 |             if ((ows8 * 8) <= IW - 1) {
 86 |                 in = *((__local ushort8 *)(src));
 87 |             }
 88 | 
 89 |             // padding column: lanes past the last input column are overwritten with pad_value
 90 |             if (ows8 * 8 + 7 > IW - 1) {
 91 |                 int boundary = (IW - 1) - ows8 * 8 + 1;
 92 |                 boundary     = boundary < 0 ? 0 : boundary;
 93 |                 for (int offset = boundary; offset < 8; offset++) {
 94 |                     *((half *)(&in) + offset) = pad_value_half;
 95 |                 }
 96 |             }
 97 |             // Broadcast the weight bit to all 8 lanes.
 98 |             ushort8 w8 = (ushort8)(w);
 99 |             // Lane is 1 when the fp16 bit pattern lies in (0x0000, 0x8000): sign bit clear and not all zero.
100 |             ushort8 cond =
101 |                 (((in) < (ushort8)0x8000) && (in > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0);
102 |             // XOR is 1 where the activation bit and the weight bit differ.
103 |             val += (cond ^ w8);
104 |         }
105 |         // Per lane the result is IC / GC - 2 * val; only lanes at multiples of SW are stored.
106 |         ushort8 val_shift = val << 1;
107 |         int boundary      = (ows8 * 8 + 7) / SW < OW - 1 ? (ows8 * 8 + 7) / SW : OW - 1;
108 |         for (int ow = (ows8 * 8 + SW - 1) / SW; ow <= boundary; ow++) {
109 |             *(dst_local + ow) = (half)(IC / GC - *((ushort *)(&val_shift) + ow * SW - ows8 * 8));
110 |         }
111 |     }
112 | 
113 |     barrier(CLK_LOCAL_MEM_FENCE);
114 |     // Write the finished row to row oh of channel oc in dst_data.
115 |     event_t e2 = async_work_group_copy(dst_data + oc * OW * OH + oh * OW, dst_local, OW, 0);
116 |     wait_group_events(1, &e2);
117 | }
118 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/binary_convolution3x3.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:55e3c3f8863ff7a3583bcc7340d1e226775f5f14cfb11dd32bd671764570f7cb
3 | size 104136
4 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/binary_convolution3x3.cl:
--------------------------------------------------------------------------------
  1 | // Copyright (C) 2018-2022 Intel Corporation
  2 | // SPDX-License-Identifier: Apache-2.0
  3 | //
  4 | 
  5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
  6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
  7 | // Returns bit number `bit` (0 = least significant) of the packed weight byte `val` as 0 or 1.
  8 | ushort extract_weights(uchar val, int bit) { const int shifted = val >> bit; return shifted & 1; }
  9 | // Binary 3x3 convolution: each work-group writes one OW-element output row (oh) of one output channel (oc).
 10 | __kernel void binary_convolution(
 11 |     const __global half *restrict src_data,
 12 |     const __global uchar *restrict weights_data,
 13 |     __global half *restrict dst_data, // destination of the final copy, therefore not const
 14 |     float pad_value,
 15 | 
 16 |     int IW,
 17 |     int IH,
 18 |     int IC,
 19 | 
 20 |     int DW,
 21 |     int DH,
 22 | 
 23 |     int GC,
 24 | 
 25 |     int KW,
 26 |     int KH,
 27 | 
 28 |     int PW,
 29 |     int PH,
 30 | 
 31 |     int SW,
 32 |     int SH,
 33 | 
 34 |     int OW)
 35 | {
 36 |     __local half src_local[32 * 1024]; // three input rows per group channel; channel ic is read at ic * IW * 3
 37 |     __local half dst_local[2 * 1024]; // the OW results of this work-group
 38 | 
 39 |     const int oh = get_group_id(0);
 40 |     const int oc = get_group_id(1);
 41 |     const int OH = get_global_size(0);
 42 |     const int OC = get_global_size(1);
 43 | 
 44 |     const int gc = oc / (OC / GC); // convolution group that output channel oc belongs to
 45 |     // Stage rows SH * oh - 1, + DH and + 2 * DH of every group channel: one copy if all exist, else row by row.
 46 |     if (oh * SH - 1 >= 0 && oh * SH + DH + DH - 1 <= IH - 1) //dma for 3 rows
 47 |     {
 48 |         event_t e = async_work_group_copy_3D3D(
 49 |             src_local, // dst
 50 |             src_data + (gc * IC / GC) * IW * IH + (SH * oh - 1) * IW, // src
 51 |             IW, // num_elements_per_line
 52 |             3, // num_lines
 53 |             DH * IW - IW, // src_line_stride
 54 |             0, // dst_line_stride
 55 |             IC / GC, // num planes
 56 |             IH * IW - 3 * DH * IW, // src plane stride
 57 |             0, // dst plane stride
 58 |             0);
 59 |         wait_group_events(1, &e);
 60 |     } else {
 61 |         int ih = oh * SH - 1;
 62 |         if (ih >= 0 && ih <= IH - 1) //dma for first row
 63 |         {
 64 |             event_t e = async_work_group_copy_2D2D(
 65 |                 src_local, // dst
 66 |                 src_data + (gc * IC / GC) * IW * IH + ih * IW, // src
 67 |                 IW, // num_elements_per_line,
 68 |                 IC / GC, // num_lines,
 69 |                 IH * IW - IW, // src_line_stride,
 70 |                 2 * IW, // dst_line_stride,
 71 |                 0);
 72 | 
 73 |             wait_group_events(1, &e);
 74 |         }
 75 |         ih = oh * SH - 1 + DH;
 76 |         if (ih >= 0 && ih <= IH - 1) //dma for second row
 77 |         {
 78 |             event_t e = async_work_group_copy_2D2D(
 79 |                 src_local + IW, // dst
 80 |                 src_data + (gc * IC / GC) * IW * IH + ih * IW, // src
 81 |                 IW, // num_elements_per_line,
 82 |                 IC / GC, // num_lines,
 83 |                 IH * IW - IW, // src_line_stride,
 84 |                 2 * IW, // dst_line_stride,
 85 |                 0);
 86 |             wait_group_events(1, &e);
 87 |         }
 88 |         ih = oh * SH - 1 + 2 * DH;
 89 |         if (ih >= 0 && ih <= IH - 1) //dma for third row
 90 |         {
 91 |             event_t e = async_work_group_copy_2D2D(
 92 |                 src_local + 2 * IW, // dst
 93 |                 src_data + (gc * IC / GC) * IW * IH + ih * IW, // src
 94 |                 IW, // num_elements_per_line,
 95 |                 IC / GC, // num_lines,
 96 |                 IH * IW - IW, // src_line_stride,
 97 |                 2 * IW, // dst_line_stride,
 98 |                 0);
 99 |             wait_group_events(1, &e);
100 |         }
101 |     }
102 | 
103 |     half pad_value_half = convert_half(pad_value);
104 | 
105 |     // padding row: each of the three rows that lies outside the image is filled with pad_value
106 |     if (oh * SH - 1 < 0 || oh * SH - 1 > IH - 1) {
107 |         __local half *dst = src_local;
108 |         for (int c = 0; c < IC / GC; c++) {
109 |             #pragma unroll 8
110 |             for (int j = 0; j < IW; j++) {
111 |                 dst[j] = pad_value_half;
112 |             }
113 |             dst += 3 * IW;
114 |         }
115 |     }
116 |     if (oh * SH + DH - 1 > IH - 1) {
117 |         __local half *dst = src_local + IW;
118 |         for (int c = 0; c < IC / GC; c++) {
119 |             #pragma unroll 8
120 |             for (int j = 0; j < IW; j++) {
121 |                 dst[j] = pad_value_half;
122 |             }
123 |             dst += 3 * IW;
124 |         }
125 |     }
126 |     if (oh * SH + DH + DH - 1 > IH - 1) {
127 |         __local half *dst = src_local + 2 * IW;
128 |         for (int c = 0; c < IC / GC; c++) {
129 |             #pragma unroll 8
130 |             for (int j = 0; j < IW; j++) {
131 |                 dst[j] = pad_value_half;
132 |             }
133 |             dst += 3 * IW;
134 |         }
135 |     }
136 |     // OWS: number of input columns spanned by OW outputs taken every SW columns.
137 |     int OWS = SW * OW;
138 |     // The nine window taps (row, column), each holding 8 neighbouring columns as raw fp16 bit patterns.
139 |     ushort8 in00;
140 |     ushort8 in01;
141 |     ushort8 in02;
142 |     ushort8 in10;
143 |     ushort8 in11;
144 |     ushort8 in12;
145 |     ushort8 in20;
146 |     ushort8 in21;
147 |     ushort8 in22;
148 |     // Walk the row in blocks of 8 columns; val counts activation/weight bit mismatches per column.
149 |     for (int ows8 = 0; ows8 < (OWS + 7) / 8; ows8++) {
150 |         ushort8 val = {0, 0, 0, 0, 0, 0, 0, 0};
151 |         for (int ic = 0; ic < IC / GC; ++ic) {
152 |             __local half *src =
153 |                 (__local half *)((__local half8 *)(src_local + ic * IW * 3 + IW + DW - 1) + ows8);
154 |             int weight_pos = oc * IC / GC * 3 * 3 + ic * 3 * 3;
155 |             ushort w0 = extract_weights(weights_data[((weight_pos + 0)) / 8], ((weight_pos + 0) % 8));
156 |             ushort w1 = extract_weights(weights_data[((weight_pos + 1)) / 8], ((weight_pos + 1) % 8));
157 |             ushort w2 = extract_weights(weights_data[((weight_pos + 2)) / 8], ((weight_pos + 2) % 8));
158 |             ushort w3 = extract_weights(weights_data[((weight_pos + 3)) / 8], ((weight_pos + 3) % 8));
159 |             ushort w4 = extract_weights(weights_data[((weight_pos + 4)) / 8], ((weight_pos + 4) % 8));
160 |             ushort w5 = extract_weights(weights_data[((weight_pos + 5)) / 8], ((weight_pos + 5) % 8));
161 |             ushort w6 = extract_weights(weights_data[((weight_pos + 6)) / 8], ((weight_pos + 6) % 8));
162 |             ushort w7 = extract_weights(weights_data[((weight_pos + 7)) / 8], ((weight_pos + 7) % 8));
163 |             ushort w8 = extract_weights(weights_data[((weight_pos + 8)) / 8], ((weight_pos + 8) % 8));
164 |             // Load all nine taps when the block starts inside the row; src points at the centre tap.
165 |             if ((ows8 * 8) - 1 <= IW - 1) {
166 |                 in00 = *((__local ushort8 *)(src - IW - DW));
167 |                 in01 = *((__local ushort8 *)(src - IW));
168 |                 in02 = *((__local ushort8 *)(src - IW + DW));
169 | 
170 |                 in10 = *((__local ushort8 *)(src - DW));
171 |                 in11 = *((__local ushort8 *)(src));
172 |                 in12 = *((__local ushort8 *)(src + DW));
173 | 
174 |                 in20 = *((__local ushort8 *)(src + IW - DW));
175 |                 in21 = *((__local ushort8 *)(src + IW));
176 |                 in22 = *((__local ushort8 *)(src + IW + DW));
177 |             }
178 | 
179 |             // padding column: lanes left of column 0 or right of column IW - 1 are overwritten with pad_value
180 |             if (ows8 * 8 - 1 < 0) {
181 |                 int boundary = 1 - ows8 * 8;
182 |                 boundary     = boundary > 8 ? 8 : boundary;
183 |                 for (int offset = 0; offset < boundary; offset++) {
184 |                     *((half *)(&in00) + offset) = pad_value_half;
185 |                     *((half *)(&in10) + offset) = pad_value_half;
186 |                     *((half *)(&in20) + offset) = pad_value_half;
187 |                 }
188 |             }
189 |             if ((ows8 * 8 + 7) + DW + DW - 1 > IW - 1) {
190 |                 int boundary = (IW - DW - 1 - DW + 1) - ows8 * 8 + 1;
191 |                 boundary     = boundary < 0 ? 0 : boundary;
192 |                 for (int offset = boundary; offset < 8; offset++) {
193 |                     *((half *)(&in02) + offset) = pad_value_half;
194 |                     *((half *)(&in12) + offset) = pad_value_half;
195 |                     *((half *)(&in22) + offset) = pad_value_half;
196 |                 }
197 |             }
198 |             if ((ows8 * 8 + 7) + DW - 1 > IW - 1) {
199 |                 int boundary = (IW - 1 - DW + 1) - ows8 * 8 + 1;
200 |                 boundary     = boundary < 0 ? 0 : boundary;
201 |                 for (int offset = boundary; offset < 8; offset++) {
202 |                     *((half *)(&in01) + offset) = pad_value_half;
203 |                     *((half *)(&in11) + offset) = pad_value_half;
204 |                     *((half *)(&in21) + offset) = pad_value_half;
205 |                 }
206 |             }
207 |             if ((ows8 * 8 + 7) - 1 > IW - 1) {
208 |                 int boundary = (IW - 1 + 1) - ows8 * 8 + 1;
209 |                 boundary     = boundary < 0 ? 0 : boundary;
210 |                 for (int offset = boundary; offset < 8; offset++) {
211 |                     *((half *)(&in00) + offset) = pad_value_half;
212 |                     *((half *)(&in10) + offset) = pad_value_half;
213 |                     *((half *)(&in20) + offset) = pad_value_half;
214 |                 }
215 |             }
216 |             // Broadcast each weight bit to all 8 lanes.
217 |             ushort8 w00 = (ushort8)(w0);
218 |             ushort8 w01 = (ushort8)(w1);
219 |             ushort8 w02 = (ushort8)(w2);
220 |             ushort8 w10 = (ushort8)(w3);
221 |             ushort8 w11 = (ushort8)(w4);
222 |             ushort8 w12 = (ushort8)(w5);
223 |             ushort8 w20 = (ushort8)(w6);
224 |             ushort8 w21 = (ushort8)(w7);
225 |             ushort8 w22 = (ushort8)(w8);
226 |             // Lane is 1 when the fp16 bit pattern lies in (0x0000, 0x8000): sign bit clear and not all zero.
227 |             ushort8 cond0 = (((in00) < (ushort8)0x8000) && (in00 > (ushort8)0x0000)) ?
228 |                                 (ushort8)(1) :
229 |                                 (ushort8)(0);
230 |             ushort8 cond1 = (((in01) < (ushort8)0x8000) && (in01 > (ushort8)0x0000)) ?
231 |                                 (ushort8)(1) :
232 |                                 (ushort8)(0);
233 |             ushort8 cond2 = (((in02) < (ushort8)0x8000) && (in02 > (ushort8)0x0000)) ?
234 |                                 (ushort8)(1) :
235 |                                 (ushort8)(0);
236 |             ushort8 cond3 = (((in10) < (ushort8)0x8000) && (in10 > (ushort8)0x0000)) ?
237 |                                 (ushort8)(1) :
238 |                                 (ushort8)(0);
239 |             ushort8 cond4 = (((in11) < (ushort8)0x8000) && (in11 > (ushort8)0x0000)) ?
240 |                                 (ushort8)(1) :
241 |                                 (ushort8)(0);
242 |             ushort8 cond5 = (((in12) < (ushort8)0x8000) && (in12 > (ushort8)0x0000)) ?
243 |                                 (ushort8)(1) :
244 |                                 (ushort8)(0);
245 |             ushort8 cond6 = (((in20) < (ushort8)0x8000) && (in20 > (ushort8)0x0000)) ?
246 |                                 (ushort8)(1) :
247 |                                 (ushort8)(0);
248 |             ushort8 cond7 = (((in21) < (ushort8)0x8000) && (in21 > (ushort8)0x0000)) ?
249 |                                 (ushort8)(1) :
250 |                                 (ushort8)(0);
251 |             ushort8 cond8 = (((in22) < (ushort8)0x8000) && (in22 > (ushort8)0x0000)) ?
252 |                                 (ushort8)(1) :
253 |                                 (ushort8)(0);
254 |             // XOR is 1 where the activation bit and the weight bit differ; sum that over the nine taps.
255 |             val += (cond0 ^ w00);
256 |             val += (cond1 ^ w01);
257 |             val += (cond2 ^ w02);
258 |             val += (cond3 ^ w10);
259 |             val += (cond4 ^ w11);
260 |             val += (cond5 ^ w12);
261 |             val += (cond6 ^ w20);
262 |             val += (cond7 ^ w21);
263 |             val += (cond8 ^ w22);
264 |         }
265 |         // Per lane the result is IC / GC * KH * KW - 2 * val; only lanes at multiples of SW are stored.
266 |         ushort8 val_shift = val << 1;
267 |         int boundary      = (ows8 * 8 + 7) / SW <= OW - 1 ? (ows8 * 8 + 7) / SW : OW - 1;
268 |         for (int ow = (ows8 * 8 + SW - 1) / SW; ow <= boundary; ow++) {
269 |             *(dst_local + ow) =
270 |                 (half)(IC / GC * KH * KW - *((ushort *)(&val_shift) + ow * SW - ows8 * 8));
271 |         }
272 |     }
273 | 
274 |     barrier(CLK_LOCAL_MEM_FENCE);
275 |     // Write the finished row to row oh of channel oc in dst_data.
276 |     event_t e2 = async_work_group_copy(dst_data + oc * OW * OH + oh * OW, dst_local, OW, 0);
277 |     wait_group_events(1, &e2);
278 | }
279 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/convolution1x1_chw.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:8717c8429d41a69337007871137f06a9e6b38c685b5b3fecc634fade0eaa7e7f
3 | size 9220
4 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/convolution1x1_chw.cl:
--------------------------------------------------------------------------------
  1 | // Copyright (C) 2018-2022 Intel Corporation
  2 | // SPDX-License-Identifier: Apache-2.0
  3 | //
  4 | 
  5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
  6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
  7 | // 1x1 convolution over channel-major input: each work-group writes one OW-element output row.
  8 | __kernel void Convolution1x1_NCHW(
  9 |     const __global half *in,
 10 |     __global half *out, // destination of the final copy, therefore not const
 11 |     const __global half *w, // IC weights per output channel, channel oc starting at w[oc * IC]
 12 |     int IW,
 13 |     int IH,
 14 |     int IC,
 15 |     int OW,
 16 |     int OH,
 17 |     int OC)
 18 | {
 19 |     __local half in_local[8 * 1024];
 20 |     __local half out_local[8 * 1024];
 21 |     // Stage input row get_group_id(0) of all IC channels; the loops below read channel ic at ic * IW.
 22 |     event_t e1 = async_work_group_copy_2D2D(
 23 |         in_local, // dst
 24 |         in + get_group_id(0) * IW, // src
 25 |         IW, // num_elements_per_line,
 26 |         IC, // num_lines,
 27 |         IW * IH - IW, // src_line_stride,
 28 |         0, // dst_line_stride,
 29 |         0);
 30 |     wait_group_events(1, &e1);
 31 | 
 32 |     // Output channel of this work-item; it selects which row of w is used.
 33 |     const int oc = get_global_id(1);
 34 | 
 35 |     // Two read-only views of the IC weights of channel oc: 8-wide vectors for the blocked
 36 |     // channel loop and scalars for the remaining IC % 8 channels.
 37 |     const __global half8 *w8 = (const __global half8 *)(&w[oc * IC]);
 38 |     const __global half *w1  = &w[oc * IC];
 39 | 
 40 |     // Full blocks of 8 output columns. The kernel is 1x1, so the input column index is
 41 |     // taken to be the output column index.
 42 |     for (uint ow = 0; ow < (OW & (~0x7)); ow += 8) {
 43 |         const uint iw = ow;
 44 | 
 45 |         // One lane per output column of the block.
 46 |         half8 val8_0 = 0.0f;
 47 | 
 48 |         __local half8 *in8_0 = (__local half8 *)(&in_local[iw + 0 * IW]);
 49 |         __local half8 *in8_1 = (__local half8 *)(&in_local[iw + 1 * IW]);
 50 |         __local half8 *in8_2 = (__local half8 *)(&in_local[iw + 2 * IW]);
 51 |         __local half8 *in8_3 = (__local half8 *)(&in_local[iw + 3 * IW]);
 52 |         __local half8 *in8_4 = (__local half8 *)(&in_local[iw + 4 * IW]);
 53 |         __local half8 *in8_5 = (__local half8 *)(&in_local[iw + 5 * IW]);
 54 |         __local half8 *in8_6 = (__local half8 *)(&in_local[iw + 6 * IW]);
 55 |         __local half8 *in8_7 = (__local half8 *)(&in_local[iw + 7 * IW]);
 56 |         // Eight channels per iteration: indexing a half8 pointer by ic * IW advances 8 * IW halves.
 57 |         for (uint ic = 0; ic < IC / 8; ic++) {
 58 |             val8_0 += (in8_0[ic * IW]) * ((half8)w8[ic].s0);
 59 |             val8_0 += (in8_1[ic * IW]) * ((half8)w8[ic].s1);
 60 |             val8_0 += (in8_2[ic * IW]) * ((half8)w8[ic].s2);
 61 |             val8_0 += (in8_3[ic * IW]) * ((half8)w8[ic].s3);
 62 |             val8_0 += (in8_4[ic * IW]) * ((half8)w8[ic].s4);
 63 |             val8_0 += (in8_5[ic * IW]) * ((half8)w8[ic].s5);
 64 |             val8_0 += (in8_6[ic * IW]) * ((half8)w8[ic].s6);
 65 |             val8_0 += (in8_7[ic * IW]) * ((half8)w8[ic].s7);
 66 |         }
 67 |         // Remaining IC % 8 channels, one at a time.
 68 |         for (uint ic = (IC & (~0x7)); ic < IC; ++ic) {
 69 |             val8_0 += *((__local half8 *)(&in_local[iw + ic * IW])) * ((half8)w1[ic]);
 70 |         }
 71 |         *((__local half8 *)&out_local[ow + 0]) = (val8_0);
 72 |     }
 73 | 
 74 |     // Last partial block: computed for 8 columns regardless, only OW % 8 lanes are stored.
 75 |     const uint iw = (OW & (~0x7));
 76 | 
 77 |     half8 val8_0 = 0.0f;
 78 | 
 79 |     __local half8 *in8_0 = (__local half8 *)(&in_local[iw + 0 * IW]);
 80 |     __local half8 *in8_1 = (__local half8 *)(&in_local[iw + 1 * IW]);
 81 |     __local half8 *in8_2 = (__local half8 *)(&in_local[iw + 2 * IW]);
 82 |     __local half8 *in8_3 = (__local half8 *)(&in_local[iw + 3 * IW]);
 83 |     __local half8 *in8_4 = (__local half8 *)(&in_local[iw + 4 * IW]);
 84 |     __local half8 *in8_5 = (__local half8 *)(&in_local[iw + 5 * IW]);
 85 |     __local half8 *in8_6 = (__local half8 *)(&in_local[iw + 6 * IW]);
 86 |     __local half8 *in8_7 = (__local half8 *)(&in_local[iw + 7 * IW]);
 87 | 
 88 |     for (uint ic = 0; ic < IC / 8; ic++) {
 89 |         val8_0 += (in8_0[ic * IW]) * ((half8)w8[ic].s0);
 90 |         val8_0 += (in8_1[ic * IW]) * ((half8)w8[ic].s1);
 91 |         val8_0 += (in8_2[ic * IW]) * ((half8)w8[ic].s2);
 92 |         val8_0 += (in8_3[ic * IW]) * ((half8)w8[ic].s3);
 93 |         val8_0 += (in8_4[ic * IW]) * ((half8)w8[ic].s4);
 94 |         val8_0 += (in8_5[ic * IW]) * ((half8)w8[ic].s5);
 95 |         val8_0 += (in8_6[ic * IW]) * ((half8)w8[ic].s6);
 96 |         val8_0 += (in8_7[ic * IW]) * ((half8)w8[ic].s7);
 97 |     }
 98 | 
 99 |     for (uint ic = (IC & (~0x7)); ic < IC; ++ic) {
100 |         val8_0 += *((__local half8 *)(&in_local[iw + ic * IW])) * ((half8)w1[ic]);
101 |     }
102 |     for (uint ow = (OW & (~0x7)); ow < OW; ow++) {
103 |         out_local[ow + 0] = (val8_0[ow % 8]);
104 |     }
105 | 
106 |     barrier(CLK_LOCAL_MEM_FENCE);
107 |     // Write the finished row to row get_group_id(0) of plane get_group_id(1) in out.
108 |     event_t e2 = async_work_group_copy(
109 |         out + get_group_id(1) * OW * OH + get_group_id(0) * OW,
110 |         out_local,
111 |         OW,
112 |         0);
113 |     wait_group_events(1, &e2);
114 | }
115 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/convolution1x1_hwc.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:5b6122a6bf6f50d2c7fc612d4e286559f9c96746e166892d192e1264e1ce5a2c
3 | size 4304
4 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/convolution1x1_hwc.cl:
--------------------------------------------------------------------------------
  1 | // Copyright (C) 2018-2022 Intel Corporation
  2 | // SPDX-License-Identifier: Apache-2.0
  3 | //
  4 | 
  5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
  6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
  7 | // 1x1 convolution over channel-interleaved input: each work-group writes one OW-element output row.
  8 | __kernel void Convolution1x1_NHWC(
  9 |     const __global half *in,
 10 |     __global half *out, // destination of the final copy, therefore not const
 11 |     const __global half *w, // IC weights per output channel, channel oc starting at w[oc * IC]
 12 |     int IW,
 13 |     int IH,
 14 |     int IC,
 15 |     int OW,
 16 |     int OH,
 17 |     int OC)
 18 | {
 19 | 
 20 |     __local half in_local[8 * 1024];
 21 |     __local half out_local[8 * 1024];
 22 |     // One input row is IW pixels of IC channels each, stored contiguously.
 23 |     const int sizeAct = IW * IC;
 24 | 
 25 |     event_t e1 = async_work_group_copy(in_local, in + get_group_id(0) * sizeAct, sizeAct, 0);
 26 |     wait_group_events(1, &e1);
 27 | 
 28 |     // Output channel of this work-item; it selects which row of w is used.
 29 |     const int oc = get_global_id(1);
 30 | 
 31 |     // Two read-only views of the IC weights of channel oc: 8-wide vectors for the blocked
 32 |     // channel loop and scalars for the remaining IC % 8 channels.
 33 |     const __global half8 *w8 = (const __global half8 *)(&w[oc * IC]);
 34 |     const __global half *w1  = &w[oc * IC];
 35 | 
 36 |     // Full blocks of 8 output columns; every column gets its own vector accumulator whose
 37 |     // lanes are summed horizontally at the end.
 38 |     for (uint ow = 0; ow < (OW & (~0x7)); ow += 8) {
 39 |         // The kernel is 1x1, so the input column index is taken to be the output column index.
 40 |         const uint iw = ow;
 41 | 
 42 |         half8 val8_0 = 0.0f;
 43 |         half8 val8_1 = 0.0f;
 44 |         half8 val8_2 = 0.0f;
 45 |         half8 val8_3 = 0.0f;
 46 |         half8 val8_4 = 0.0f;
 47 |         half8 val8_5 = 0.0f;
 48 |         half8 val8_6 = 0.0f;
 49 |         half8 val8_7 = 0.0f;
 50 | 
 51 |         __local half8 *in8_0 = (__local half8 *)(&in_local[(iw + 0) * IC]);
 52 |         __local half8 *in8_1 = (__local half8 *)(&in_local[(iw + 1) * IC]);
 53 |         __local half8 *in8_2 = (__local half8 *)(&in_local[(iw + 2) * IC]);
 54 |         __local half8 *in8_3 = (__local half8 *)(&in_local[(iw + 3) * IC]);
 55 |         __local half8 *in8_4 = (__local half8 *)(&in_local[(iw + 4) * IC]);
 56 |         __local half8 *in8_5 = (__local half8 *)(&in_local[(iw + 5) * IC]);
 57 |         __local half8 *in8_6 = (__local half8 *)(&in_local[(iw + 6) * IC]);
 58 |         __local half8 *in8_7 = (__local half8 *)(&in_local[(iw + 7) * IC]);
 59 |         // Eight channels per iteration for each of the 8 columns.
 60 |         for (uint ic = 0; ic < IC / 8; ++ic) {
 61 |             val8_0 += (in8_0[ic]) * (w8[ic]);
 62 |             val8_1 += (in8_1[ic]) * (w8[ic]);
 63 |             val8_2 += (in8_2[ic]) * (w8[ic]);
 64 |             val8_3 += (in8_3[ic]) * (w8[ic]);
 65 |             val8_4 += (in8_4[ic]) * (w8[ic]);
 66 |             val8_5 += (in8_5[ic]) * (w8[ic]);
 67 |             val8_6 += (in8_6[ic]) * (w8[ic]);
 68 |             val8_7 += (in8_7[ic]) * (w8[ic]);
 69 |         }
 70 |         // Remaining IC % 8 channels go through scalar accumulators.
 71 |         half val_0 = 0.0f;
 72 |         half val_1 = 0.0f;
 73 |         half val_2 = 0.0f;
 74 |         half val_3 = 0.0f;
 75 |         half val_4 = 0.0f;
 76 |         half val_5 = 0.0f;
 77 |         half val_6 = 0.0f;
 78 |         half val_7 = 0.0f;
 79 |         for (uint ic = IC & (~0x7); ic < IC; ++ic) {
 80 |             val_0 += *((__local half *)in8_0 + ic) * w1[ic];
 81 |             val_1 += *((__local half *)in8_1 + ic) * w1[ic];
 82 |             val_2 += *((__local half *)in8_2 + ic) * w1[ic];
 83 |             val_3 += *((__local half *)in8_3 + ic) * w1[ic];
 84 |             val_4 += *((__local half *)in8_4 + ic) * w1[ic];
 85 |             val_5 += *((__local half *)in8_5 + ic) * w1[ic];
 86 |             val_6 += *((__local half *)in8_6 + ic) * w1[ic];
 87 |             val_7 += *((__local half *)in8_7 + ic) * w1[ic];
 88 |         }
 89 |         out_local[ow + 0] = __builtin_shave_sau_sumx_f16_r(val8_0) + val_0;
 90 |         out_local[ow + 1] = __builtin_shave_sau_sumx_f16_r(val8_1) + val_1;
 91 |         out_local[ow + 2] = __builtin_shave_sau_sumx_f16_r(val8_2) + val_2;
 92 |         out_local[ow + 3] = __builtin_shave_sau_sumx_f16_r(val8_3) + val_3;
 93 |         out_local[ow + 4] = __builtin_shave_sau_sumx_f16_r(val8_4) + val_4;
 94 |         out_local[ow + 5] = __builtin_shave_sau_sumx_f16_r(val8_5) + val_5;
 95 |         out_local[ow + 6] = __builtin_shave_sau_sumx_f16_r(val8_6) + val_6;
 96 |         out_local[ow + 7] = __builtin_shave_sau_sumx_f16_r(val8_7) + val_7;
 97 |     }
 98 |     for (uint ow = (OW & (~0x7)); ow < OW; ow++) {
 99 |         // Remaining OW % 8 columns, one at a time.
100 |         const uint iw = ow;
101 | 
102 |         // Vector part of the dot product over the channels.
103 |         half8 val8 = 0.0f;
104 | 
105 |         __local half8 *in8 = (__local half8 *)(&in_local[iw * IC]);
106 | 
107 |         for (uint ic = 0; ic < IC / 8; ++ic) {
108 |             val8 += (in8[ic]) * (w8[ic]);
109 |         }
110 |         // Scalar part for the last IC % 8 channels.
111 |         half val = 0.0f;
112 |         for (uint ic = (IC & (~0x7)); ic < IC; ++ic) {
113 |             val += (*((__local half *)in8 + ic)) * w1[ic];
114 |         }
115 |         out_local[ow] = __builtin_shave_sau_sumx_f16_r(val8) + val;
116 |     }
117 | 
118 |     barrier(CLK_LOCAL_MEM_FENCE);
119 |     // Write the finished row to row get_group_id(0) of plane get_group_id(1) in out.
120 |     event_t e2 = async_work_group_copy(
121 |         out + get_group_id(1) * OW * OH + get_group_id(0) * OW,
122 |         out_local,
123 |         OW,
124 |         0);
125 |     wait_group_events(1, &e2);
126 | }
127 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/convolution3x3.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:021bb40840ff35506972e6f6a7dea1b5f40a8db0927aaa9a6c116b152e386851
3 | size 5748
4 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/convolution3x3.cl:
--------------------------------------------------------------------------------
  1 | // Copyright (C) 2018-2022 Intel Corporation
  2 | // SPDX-License-Identifier: Apache-2.0
  3 | //
  4 | 
  5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
  6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
  7 | 
  8 | // 3x3 convolution: each work-group produces one output row (group id 0) of one
  9 | // output channel (group id 1). Three input rows of all IC channels and the
 10 | // IC * 3 * 3 filter are staged in local memory, outputs are computed eight
 11 | // columns at a time (only for stride 1x1 or 2x2) and the row is copied to `out`.
 12 | // OC, KX, KY and pad_y are accepted but never read by the body.
 13 | __kernel void Convolution3x3(
 14 |     const __global half *in_param,
 15 |     const __global half *out,
 16 |     const __global half *w,
 17 |     int IW,
 18 |     int IH,
 19 |     int IC,
 20 |     int OW,
 21 |     int OH,
 22 |     int OC,
 23 |     int KX,
 24 |     int KY,
 25 |     int stride_x,
 26 |     int stride_y,
 27 |     int pad_x,
 28 |     int pad_y,
 29 |     int dilation_x,
 30 |     int dilation_y)
 31 | {
 32 |     __local half in_local[8 * 1024];
 33 |     __local half out_local[8 * 1024];
 34 |     __local half w_local[8 * 1024];
 35 | 
 36 |     event_t e1 = async_work_group_copy_2D2D(
 37 |         in_local, // dst
 38 |         in_param + get_group_id(0) * stride_y * IW, // src
 39 |         3 * IW, // num_elements_per_line,
 40 |         IC, // num_lines,
 41 |         IW * IH - 3 * IW, // src_line_stride,
 42 |         0, // dst_line_stride,
 43 |         0);
 44 |     wait_group_events(1, &e1);
 45 | 
 46 |     const int sizeWeight = IC * 3 * 3;
 47 |     e1 = async_work_group_copy(w_local, w + get_group_id(1) * sizeWeight, sizeWeight, 0);
 48 |     wait_group_events(1, &e1);
 49 | 
 50 |     __local half *in = (__local half *)in_local + 1;
 51 | 
 52 |     // Zero-initialised so that an unsupported stride combination skips the vector
 53 |     // loop below instead of iterating on an indeterminate trip count.
 54 |     int stride       = 0;
 55 |     int write_output = 0;
 56 | 
 57 |     if ((stride_x == 1) && (stride_y == 1)) {
 58 |         stride       = OW / 8;
 59 |         write_output = 1;
 60 |     }
 61 |     if ((stride_x == 2) && (stride_y == 2)) {
 62 |         stride       = OW / 4;
 63 |         write_output = 2;
 64 |     }
 65 | 
 66 |     for (int ow = 0; ow < stride; ow++) {
 67 |         float8 val = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
 68 |         for (int ic = 0; ic < IC; ++ic) {
 69 |             __local half *src = (__local half *)((__local half8 *)(in + ic * IW * 3) + ow);
 70 |             __local half *k   = (__local half *)(w_local + ic * 3 * 3);
 71 |             // Load the previous, current and next half8 of each of the three rows.
 72 |             half8 aux_in00 = *((__local half8 *)src - 1);
 73 |             half8 aux_in01 = *((__local half8 *)src + 0);
 74 |             half8 aux_in02 = *((__local half8 *)src + 1);
 75 |             half8 aux_in10 = *((__local half8 *)(src + IW) - 1);
 76 |             half8 aux_in11 = *((__local half8 *)(src + IW) + 0);
 77 |             half8 aux_in12 = *((__local half8 *)(src + IW) + 1);
 78 |             half8 aux_in20 = *((__local half8 *)(src + IW * 2) - 1);
 79 |             half8 aux_in21 = *((__local half8 *)(src + IW * 2) + 0);
 80 |             half8 aux_in22 = *((__local half8 *)(src + IW * 2) + 1);
 81 |             // Reinterpret the bits as short8, the type the align builtin operates on.
 82 |             short8 in00 = *((short8 *)&aux_in00);
 83 |             short8 in01 = *((short8 *)&aux_in01);
 84 |             short8 in02 = *((short8 *)&aux_in02);
 85 |             short8 in10 = *((short8 *)&aux_in10);
 86 |             short8 in11 = *((short8 *)&aux_in11);
 87 |             short8 in12 = *((short8 *)&aux_in12);
 88 |             short8 in20 = *((short8 *)&aux_in20);
 89 |             short8 in21 = *((short8 *)&aux_in21);
 90 |             short8 in22 = *((short8 *)&aux_in22);
 91 |             // Presumably byte-offset vector aligns yielding the kx = 0 / 2 windows - TODO confirm.
 92 |             short8 aux_aux00 = __builtin_shave_cmu_alignvec_rri_short8(in00, in01, 14);
 93 |             short8 aux_aux01 = in01;
 94 |             short8 aux_aux02 = __builtin_shave_cmu_alignvec_rri_short8(in01, in02, 2);
 95 |             short8 aux_aux10 = __builtin_shave_cmu_alignvec_rri_short8(in10, in11, 14);
 96 |             short8 aux_aux11 = in11;
 97 |             short8 aux_aux12 = __builtin_shave_cmu_alignvec_rri_short8(in11, in12, 2);
 98 |             short8 aux_aux20 = __builtin_shave_cmu_alignvec_rri_short8(in20, in21, 14);
 99 |             short8 aux_aux21 = in21;
100 |             short8 aux_aux22 = __builtin_shave_cmu_alignvec_rri_short8(in21, in22, 2);
101 |             // Back to half8 for the arithmetic.
102 |             half8 aux00 = *((half8 *)&aux_aux00);
103 |             half8 aux01 = *((half8 *)&aux_aux01);
104 |             half8 aux02 = *((half8 *)&aux_aux02);
105 |             half8 aux10 = *((half8 *)&aux_aux10);
106 |             half8 aux11 = *((half8 *)&aux_aux11);
107 |             half8 aux12 = *((half8 *)&aux_aux12);
108 |             half8 aux20 = *((half8 *)&aux_aux20);
109 |             half8 aux21 = *((half8 *)&aux_aux21);
110 |             half8 aux22 = *((half8 *)&aux_aux22);
111 |             // Broadcast each of the nine filter taps of this channel across all lanes.
112 |             half8 w00 = (half8)(*(k + 0));
113 |             half8 w01 = (half8)(*(k + 1));
114 |             half8 w02 = (half8)(*(k + 2));
115 |             half8 w10 = (half8)(*(k + 3));
116 |             half8 w11 = (half8)(*(k + 4));
117 |             half8 w12 = (half8)(*(k + 5));
118 |             half8 w20 = (half8)(*(k + 6));
119 |             half8 w21 = (half8)(*(k + 7));
120 |             half8 w22 = (half8)(*(k + 8));
121 |             // Accumulate in float; the result is narrowed to half only when stored.
122 |             val += convert_float8(aux00) * convert_float8(w00);
123 |             val += convert_float8(aux01) * convert_float8(w01);
124 |             val += convert_float8(aux02) * convert_float8(w02);
125 |             val += convert_float8(aux10) * convert_float8(w10);
126 |             val += convert_float8(aux11) * convert_float8(w11);
127 |             val += convert_float8(aux12) * convert_float8(w12);
128 |             val += convert_float8(aux20) * convert_float8(w20);
129 |             val += convert_float8(aux21) * convert_float8(w21);
130 |             val += convert_float8(aux22) * convert_float8(w22);
131 |         }
132 |         if (write_output == 2) *((__local half4 *)(out_local) + ow) = convert_half4(val.s0246); // stride 2: even lanes only
133 |         if (write_output == 1) *((__local half8 *)(out_local) + ow) = convert_half8(val);
134 |     }
135 | 
136 |     for (int ow = OW & ~(0x7); ow < OW; ow++) { // scalar path for columns past the last multiple of 8
137 |         float val = 0.0f;
138 |         for (int ic = 0; ic < IC; ++ic) {
139 |             for (int ky = 0; ky < 3; ++ky) {
140 |                 for (int kx = 0; kx < 3; ++kx) {
141 |                     int iw = ow * stride_x - pad_x + kx * dilation_x;
142 |                     val += convert_float(in[ic * IW * 3 + (ky * dilation_y) * IW + iw])
143 |                            * convert_float(w_local[ic * 3 * 3 + ky * 3 + kx]);
144 |                 }
145 |             }
146 |         }
147 |         out_local[ow] = convert_half(val);
148 |     }
149 | 
150 |     barrier(CLK_LOCAL_MEM_FENCE);
151 | 
152 |     event_t e2 = async_work_group_copy(
153 |         out + get_group_id(1) * OW * OH + get_group_id(0) * OW,
154 |         out_local,
155 |         OW,
156 |         0);
157 |     wait_group_events(1, &e2);
158 | }
159 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/correlate.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e2b24b1b5bfd1786128682ee814230653b4b63aad5b472feec9c6f4a4c833e2f
3 | size 14336
4 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/correlate.cl:
--------------------------------------------------------------------------------
  1 | // Copyright (C) 2018-2022 Intel Corporation
  2 | // SPDX-License-Identifier: Apache-2.0
  3 | //
  4 | 
  5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
  6 | 
  7 | #define MAX_OPENCL_BUFF_SIZE (64 * 1024) // scratch size in bytes; parenthesised so it expands as a single operand
  8 | 
  9 | #define USE_DMA 1 // enables the code paths guarded by `#if defined(USE_DMA)`
 10 | 
 11 | #if defined(USE_DMA)
 12 | // Starts a strided work-item DMA from global `from` to private `to` and waits on its event; src_stride is the 5th transaction argument.
 13 | void dmacpyLineSrcStrideStart(global half *from, private half *to, int size, int src_width, int src_stride)
 14 | {
 15 |     item_dma_event_t dma_done = WorkItemDmaCreateStrideTransaction(from, to, src_width, src_width, src_stride, src_width, size, 0);
 16 |     WaitWorkItemDmaEvents(1, &dma_done);
 17 | }
 18 | 
 19 | // Starts a strided work-item DMA from private `from` to global `to` and waits on its event; src_stride is the 6th transaction argument (presumably the destination stride, despite its name).
 20 | void dmacpyLineDstStrideStart(private half *from, global half *to, int size, int src_width, int src_stride)
 21 | {
 22 |     item_dma_event_t dma_done = WorkItemDmaCreateStrideTransaction(from, to, src_width, src_width, src_width, src_stride, size, 0);
 23 |     WaitWorkItemDmaEvents(1, &dma_done);
 24 | }
 25 | #endif
 26 | 
 27 | // Zeroes `num` bytes starting at `ptr`: whole 16-byte float4 stores first, then a byte-wise tail.
 28 | void memzero(void *ptr, size_t num)
 29 | {
 30 |     const size_t vec_count = num / 16; // number of complete float4 chunks
 31 |     float4 *vec_dst        = (float4 *)ptr;
 32 |     uchar *byte_dst        = (uchar *)ptr;
 33 | 
 34 |     #pragma unroll 16
 35 |     for (int v = 0; v < vec_count; v++) vec_dst[v] = (float4){0.f, 0.f, 0.f, 0.f};
 36 |     // the remaining num % 16 bytes are cleared one at a time
 37 |     for (int b = vec_count * 16; b < num; b++) byte_dst[b] = 0;
 38 | }
 39 | 
 40 | void __attribute__((noinline)) crosscorrh( // adds (+=) channel dot products of line0 column x1 and line1 column x2 into dline
 41 |     __private const half *restrict line0, // first input row; column x starts at line0 + x * max_channels
 42 |     __private const half *restrict line1, // second input row, indexed the same way
 43 |     __private half *restrict dline, // accumulator: one run of topwidth entries per horizontal displacement index
 44 |     int topwidth, // number of output columns (blockIdx_x range)
 45 |     int max_displacement,
 46 |     int neighborhood_grid_radius, // displacement indices span [-radius, +radius]
 47 |     int kernel_size, // number of horizontal taps i
 48 |     int padding,
 49 |     int bottomwidth, // valid input columns are [0, bottomwidth)
 50 |     int stride1, // step of x1 between consecutive output columns
 51 |     int stride2, // step of x2 between consecutive displacement indices
 52 |     int max_channels, // element stride between consecutive columns of line0 / line1
 53 |     int cur_subchannels) // channels summed per column in the generic path; not read when max_channels == 64
 54 | {
 55 |     if (max_channels == 64) { // fast path: each column is exactly 8 half8 vectors
 56 |         for (int i = 0; i < kernel_size; i++) {
 57 |             int x1      = max_displacement - padding + i; // line0 column for output column 0 and tap i
 58 |             int offset1 = x1 >= 0 ? 0 : (-x1 + stride1 - 1) / stride1; // output columns to skip until x1 >= 0
 59 |             x1 += offset1 * stride1;
 60 | 
 61 |             for (int blockIdx_x = offset1; blockIdx_x < topwidth && x1 < bottomwidth; blockIdx_x++, x1 += stride1) {
 62 |                 int x2      = x1 - neighborhood_grid_radius * stride2; // leftmost candidate column in line1
 63 |                 int offset2 = x2 >= 0 ? 0 : (-x2 + stride2 - 1) / stride2; // displacement steps to skip until x2 >= 0
 64 |                 x2 += offset2 * stride2;
 65 | 
 66 |                 for (int top_channel_x = offset2 - neighborhood_grid_radius;
 67 |                      top_channel_x <= neighborhood_grid_radius && x2 < bottomwidth;
 68 |                      top_channel_x++, x2 += stride2) {
 69 |                     half8 sum4 = (half8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
 70 | 
 71 |                     half8 *src0 = (half8 *)(line0 + x1 * max_channels);
 72 |                     half8 *src1 = (half8 *)(line1 + x2 * max_channels);
 73 | 
 74 |                     #pragma unroll 8
 75 |                     for (int ch = 0; ch < max_channels / 8; ch++) sum4 += (src0[ch]) * (src1[ch]);
 76 | 
 77 |                     half sum = __builtin_shave_sau_sumx_f16_r(sum4); // presumably a horizontal sum of the 8 lanes - TODO confirm
 78 |                     dline[(top_channel_x + neighborhood_grid_radius) * topwidth + blockIdx_x] += (sum);
 79 |                 }
 80 |             }
 81 |         }
 82 |     } else { // generic path: channel count taken from cur_subchannels, with vector and scalar tails
 83 |         int neighborhood_grid_width = 2 * neighborhood_grid_radius + 1;
 84 | 
 85 |         for (int blockIdx_x = 0; blockIdx_x < topwidth; blockIdx_x++) {
 86 |             for (int i = 0; i < kernel_size; i++) {
 87 |                 int x1 = blockIdx_x * stride1 + max_displacement + i - padding;
 88 | 
 89 |                 if ((x1 >= 0) && (x1 < bottomwidth)) {
 90 |                     int o_min = -neighborhood_grid_radius * stride2; // o is the column offset x2 - x1
 91 |                     int o_max = neighborhood_grid_width * stride2 - neighborhood_grid_radius * stride2; // exclusive bound: (radius + 1) * stride2
 92 |                     if ((o_min) < (-x1)) { // x1 + o_min would be negative: advance o_min in stride2 steps
 93 |                         o_min -= ((x1 + o_min - (stride2 - 1)) / stride2) * stride2;
 94 |                     }
 95 |                     if ((o_max) >= (bottomwidth + stride2 - x1)) { // trim o_max in stride2 steps so x1 + o stays below bottomwidth
 96 |                         o_max -= ((x1 + o_max - bottomwidth) / stride2) * stride2;
 97 |                     }
 98 | 
 99 |                     int o = o_min;
100 |                     for (; o <= o_max - 4 * stride2; o += 4 * stride2) { // four displacements per iteration
101 |                         half8 *bottom0   = (half8 *)(line0 + x1 * max_channels);
102 |                         half8 *bottom1_0 = (half8 *)(line1 + (x1 + o + 0 * stride2) * max_channels);
103 |                         half8 *bottom1_1 = (half8 *)(line1 + (x1 + o + 1 * stride2) * max_channels);
104 |                         half8 *bottom1_2 = (half8 *)(line1 + (x1 + o + 2 * stride2) * max_channels);
105 |                         half8 *bottom1_3 = (half8 *)(line1 + (x1 + o + 3 * stride2) * max_channels);
106 | 
107 |                         int c = 0;
108 | 
109 |                         half8 sum40 = 0;
110 |                         half8 sum41 = 0;
111 |                         half8 sum42 = 0;
112 |                         half8 sum43 = 0;
113 | 
114 |                         for (; c <= cur_subchannels / 8 - 4; c += 4) { // channel loop unrolled by four half8 vectors
115 |                             sum40 += bottom0[c + 0] * bottom1_0[c + 0];
116 |                             sum40 += bottom0[c + 1] * bottom1_0[c + 1];
117 |                             sum40 += bottom0[c + 2] * bottom1_0[c + 2];
118 |                             sum40 += bottom0[c + 3] * bottom1_0[c + 3];
119 | 
120 |                             sum41 += bottom0[c + 0] * bottom1_1[c + 0];
121 |                             sum41 += bottom0[c + 1] * bottom1_1[c + 1];
122 |                             sum41 += bottom0[c + 2] * bottom1_1[c + 2];
123 |                             sum41 += bottom0[c + 3] * bottom1_1[c + 3];
124 | 
125 |                             sum42 += bottom0[c + 0] * bottom1_2[c + 0];
126 |                             sum42 += bottom0[c + 1] * bottom1_2[c + 1];
127 |                             sum42 += bottom0[c + 2] * bottom1_2[c + 2];
128 |                             sum42 += bottom0[c + 3] * bottom1_2[c + 3];
129 | 
130 |                             sum43 += bottom0[c + 0] * bottom1_3[c + 0];
131 |                             sum43 += bottom0[c + 1] * bottom1_3[c + 1];
132 |                             sum43 += bottom0[c + 2] * bottom1_3[c + 2];
133 |                             sum43 += bottom0[c + 3] * bottom1_3[c + 3];
134 |                         }
135 | 
136 |                         for (; c < cur_subchannels / 8; c++) { // leftover whole half8 vectors
137 |                             sum40 += bottom0[c] * bottom1_0[c];
138 |                             sum41 += bottom0[c] * bottom1_1[c];
139 |                             sum42 += bottom0[c] * bottom1_2[c];
140 |                             sum43 += bottom0[c] * bottom1_3[c];
141 |                         }
142 | 
143 |                         half sum0 = __builtin_shave_sau_sumx_f16_r(sum40);
144 |                         half sum1 = __builtin_shave_sau_sumx_f16_r(sum41);
145 |                         half sum2 = __builtin_shave_sau_sumx_f16_r(sum42);
146 |                         half sum3 = __builtin_shave_sau_sumx_f16_r(sum43);
147 | 
148 |                         for (c = c * 8; c < cur_subchannels; c++) { // scalar tail: the cur_subchannels % 8 trailing channels
149 |                             sum0 += line0[x1 * max_channels + c] * line1[(x1 + o + 0 * stride2) * max_channels + c];
150 |                             sum1 += line0[x1 * max_channels + c] * line1[(x1 + o + 1 * stride2) * max_channels + c];
151 |                             sum2 += line0[x1 * max_channels + c] * line1[(x1 + o + 2 * stride2) * max_channels + c];
152 |                             sum3 += line0[x1 * max_channels + c] * line1[(x1 + o + 3 * stride2) * max_channels + c];
153 |                         }
154 | 
155 |                         dline[blockIdx_x + (((o / stride2) + 0) * topwidth + neighborhood_grid_radius * topwidth)] += // row index = o / stride2 + radius
156 |                             sum0;
157 |                         dline[blockIdx_x + (((o / stride2) + 1) * topwidth + neighborhood_grid_radius * topwidth)] +=
158 |                             sum1;
159 |                         dline[blockIdx_x + (((o / stride2) + 2) * topwidth + neighborhood_grid_radius * topwidth)] +=
160 |                             sum2;
161 |                         dline[blockIdx_x + (((o / stride2) + 3) * topwidth + neighborhood_grid_radius * topwidth)] +=
162 |                             sum3;
163 |                     }
164 | 
165 |                     for (; o < o_max; o += 1 * stride2) { // remaining displacements, one at a time
166 |                         half8 *bottom0 = (half8 *)(line0 + x1 * max_channels);
167 |                         half8 *bottom1 = (half8 *)(line1 + (x1 + o) * max_channels);
168 | 
169 |                         int c = 0;
170 | 
171 |                         half8 sum4 = 0;
172 |                         for (; c <= cur_subchannels / 8 - 4; c += 4) {
173 |                             sum4 += bottom0[c + 0] * bottom1[c + 0];
174 |                             sum4 += bottom0[c + 1] * bottom1[c + 1];
175 |                             sum4 += bottom0[c + 2] * bottom1[c + 2];
176 |                             sum4 += bottom0[c + 3] * bottom1[c + 3];
177 |                         }
178 |                         for (; c < cur_subchannels / 8; c++) {
179 |                             sum4 += bottom0[c] * bottom1[c];
180 |                         }
181 | 
182 |                         half sum = __builtin_shave_sau_sumx_f16_r(sum4);
183 | 
184 |                         for (c = c * 8; c < cur_subchannels; c++) { // scalar tail for the trailing channels
185 |                             sum += line0[x1 * max_channels + c] * line1[(x1 + o) * max_channels + c];
186 |                         }
187 | 
188 |                         dline[blockIdx_x + (((o + neighborhood_grid_radius * stride2) / stride2) * topwidth)] += sum; // same row index as above: o / stride2 + radius
189 |                     }
190 |                 }
191 |             }
192 |         }
193 |     }
194 | }
195 | 
196 | __kernel void correlate2_half(
197 |     __global const half *restrict bottom0,
198 |     __global const half *restrict bottom1,
199 |     __global half *restrict top,
200 |     int topwidth,
201 |     int topheight,
202 |     int bottomwidth,
203 |     int bottomheight,
204 |     int bottomchannels,
205 |     int max_displacement,
206 |     int padding,
207 |     int neighborhood_grid_radius,
208 |     int neighborhood_grid_width,
209 |     int kernel_size,
210 |     int stride1,
211 |     int stride2)
212 | {
213 |     int max_channels = (MAX_OPENCL_BUFF_SIZE / sizeof(half) - topwidth * neighborhood_grid_width) / (3 * bottomwidth);
214 |     if (max_channels > 64) max_channels = 64;
215 |     int subchannels_count = (bottomchannels + max_channels - 1) / max_channels;
216 |     int subchannels       = (bottomchannels + subchannels_count - 1) / subchannels_count;
217 |     if (subchannels < max_channels) subchannels = max_channels;
218 | 
219 |     const int sumelems = kernel_size * kernel_size * bottomchannels;
220 | 
221 |     __private half cmx[MAX_OPENCL_BUFF_SIZE / sizeof(half)];
222 | 
223 |     __private half *line0 = cmx;
224 |     __private half *line1 = line0 + bottomwidth * subchannels;
225 |     __private half *dline = line1 + bottomwidth * subchannels;
226 | 
227 |     int blockIdx_y = get_global_id(0);
228 | 
229 | #if defined(USE_DMA)
230 |     __private half *dmabuf = dline + topwidth * neighborhood_grid_width;
231 | #endif
232 | 
233 |     int y1 = blockIdx_y * stride1 + max_displacement;
234 | 
235 |     for (int j = 0; j < kernel_size; j++) {
236 |         for (int bottomchannel = 0; bottomchannel < bottomchannels; bottomchannel += subchannels) {
237 |             // configure channel batching
238 |             int startchannel = bottomchannel;
239 |             int endchannel = startchannel + subchannels > bottomchannels ? bottomchannels : startchannel + subchannels;
240 |             int deltachannels = endchannel - startchannel;
241 | 
242 |             // load line form blob 0 with repackaging
243 |             if (y1 + j - padding >= 0 && y1 + j - padding < bottomheight) {
244 | #if defined(USE_DMA)
245 |                 __global const half *curr =
246 |                     bottom0 + startchannel * bottomheight * bottomwidth + (y1 + j - padding) * bottomwidth;
247 |                 dmacpyLineSrcStrideStart(
248 |                     curr,
249 |                     dmabuf,
250 |                     bottomwidth * deltachannels * sizeof(half),
251 |                     bottomwidth * sizeof(half),
252 |                     bottomwidth * bottomheight * sizeof(half));
253 | 
254 |                 for (int ch = 0; ch < deltachannels; ch++) {
255 |                     for (int blockIdx_x = 0; blockIdx_x < bottomwidth / 8; blockIdx_x++) {
256 |                         half8 val = ((half8 *)(dmabuf + ch * bottomwidth))[blockIdx_x];
257 |                         line0[(blockIdx_x * 8 + 0) * max_channels + ch] = val[0];
258 |                         line0[(blockIdx_x * 8 + 1) * max_channels + ch] = val[1];
259 |                         line0[(blockIdx_x * 8 + 2) * max_channels + ch] = val[2];
260 |                         line0[(blockIdx_x * 8 + 3) * max_channels + ch] = val[3];
261 | 
262 |                         line0[(blockIdx_x * 8 + 4) * max_channels + ch] = val[4];
263 |                         line0[(blockIdx_x * 8 + 5) * max_channels + ch] = val[5];
264 |                         line0[(blockIdx_x * 8 + 6) * max_channels + ch] = val[6];
265 |                         line0[(blockIdx_x * 8 + 7) * max_channels + ch] = val[7];
266 |                     }
267 | 
268 |                     for (int blockIdx_x = bottomwidth / 8 * 8; blockIdx_x < bottomwidth; blockIdx_x++) {
269 |                         line0[(blockIdx_x)*max_channels + ch] = dmabuf[blockIdx_x + ch * bottomwidth];
270 |                     }
271 |                 }
272 | 
273 |                 if (deltachannels < subchannels)
274 |                     for (int blockIdx_x = 0; blockIdx_x < bottomwidth; blockIdx_x++)
275 |                         memzero(
276 |                             line0 + blockIdx_x * max_channels + deltachannels,
277 |                             (subchannels - deltachannels) * sizeof(half));
278 | #else
279 |                 for (int blockIdx_x = 0; blockIdx_x < bottomwidth; blockIdx_x++) {
280 |                     for (int ch = 0; ch < deltachannels; ch++)
281 |                         line0[blockIdx_x * max_channels + ch] = bottom0
282 |                             [(ch + startchannel) * bottomheight * bottomwidth + (y1 + j - padding) * bottomwidth
283 |                              + blockIdx_x];
284 | 
285 |                     if (deltachannels < subchannels)
286 |                         memzero(
287 |                             line0 + blockIdx_x * max_channels + deltachannels,
288 |                             (subchannels - deltachannels) * sizeof(half));
289 |                 }
290 | #endif
291 |             } else
292 |                 memzero(line0, max_channels * bottomwidth * sizeof(half));
293 | 
294 |             for (int top_channel_y = 0; top_channel_y < neighborhood_grid_width; top_channel_y++) {
295 |                 int y2 = y1 + (top_channel_y - neighborhood_grid_radius) * stride2;
296 | 
297 |                 if (y2 + j - padding >= 0 && y2 + j - padding < bottomheight) {
298 | #if defined(USE_DMA)
299 |                     __global const half *curr =
300 |                         bottom1 + startchannel * bottomheight * bottomwidth + (y2 + j - padding) * bottomwidth;
301 |                     dmacpyLineSrcStrideStart(
302 |                         curr,
303 |                         dmabuf,
304 |                         bottomwidth * deltachannels * sizeof(half),
305 |                         bottomwidth * sizeof(half),
306 |                         bottomwidth * bottomheight * sizeof(half));
307 | 
308 |                     for (int ch = 0; ch < deltachannels; ch++) {
309 |                         for (int blockIdx_x = 0; blockIdx_x < bottomwidth / 8; blockIdx_x++) {
310 |                             half8 val = ((half8 *)(dmabuf + ch * bottomwidth))[blockIdx_x];
311 |                             line1[(blockIdx_x * 8 + 0) * max_channels + ch] = val[0];
312 |                             line1[(blockIdx_x * 8 + 1) * max_channels + ch] = val[1];
313 |                             line1[(blockIdx_x * 8 + 2) * max_channels + ch] = val[2];
314 |                             line1[(blockIdx_x * 8 + 3) * max_channels + ch] = val[3];
315 | 
316 |                             line1[(blockIdx_x * 8 + 4) * max_channels + ch] = val[4];
317 |                             line1[(blockIdx_x * 8 + 5) * max_channels + ch] = val[5];
318 |                             line1[(blockIdx_x * 8 + 6) * max_channels + ch] = val[6];
319 |                             line1[(blockIdx_x * 8 + 7) * max_channels + ch] = val[7];
320 |                         }
321 | 
322 |                         for (int blockIdx_x = bottomwidth / 8 * 8; blockIdx_x < bottomwidth; blockIdx_x++) {
323 |                             line1[(blockIdx_x)*max_channels + ch] = dmabuf[blockIdx_x + ch * bottomwidth];
324 |                         }
325 |                     }
326 | #else
327 |                     for (int ch = 0; ch < deltachannels; ch++) {
328 |                         for (int blockIdx_x = 0; blockIdx_x < bottomwidth / 8; blockIdx_x++) {
329 |                             half8 val = ((
330 |                                 __global half8
331 |                                     *)(bottom1 + (ch + startchannel) * bottomheight * bottomwidth + (y2 + j - padding) * bottomwidth))
332 |                                 [blockIdx_x];
333 |                             line1[(blockIdx_x * 8 + 0) * max_channels + ch] = val[0];
334 |                             line1[(blockIdx_x * 8 + 1) * max_channels + ch] = val[1];
335 |                             line1[(blockIdx_x * 8 + 2) * max_channels + ch] = val[2];
336 |                             line1[(blockIdx_x * 8 + 3) * max_channels + ch] = val[3];
337 | 
338 |                             line1[(blockIdx_x * 8 + 4) * max_channels + ch] = val[4];
339 |                             line1[(blockIdx_x * 8 + 5) * max_channels + ch] = val[5];
340 |                             line1[(blockIdx_x * 8 + 6) * max_channels + ch] = val[6];
341 |                             line1[(blockIdx_x * 8 + 7) * max_channels + ch] = val[7];
342 |                         }
343 |                         for (int blockIdx_x = bottomwidth / 8 * 8; blockIdx_x < bottomwidth; blockIdx_x++) {
344 |                             half val =
345 |                                 (bottom1 + (ch + startchannel) * bottomheight * bottomwidth
346 |                                  + (y2 + j - padding) * bottomwidth)[blockIdx_x];
347 |                             line1[(blockIdx_x)*max_channels + ch] = val;
348 |                         }
349 |                     }
350 | #endif
351 |                     for (int blockIdx_x = 0; blockIdx_x < bottomwidth; blockIdx_x++) {
352 |                         if (deltachannels < subchannels)
353 |                             memzero(
354 |                                 line1 + blockIdx_x * max_channels + deltachannels,
355 |                                 (subchannels - deltachannels) * sizeof(half));
356 |                     }
357 |                 } else
358 |                     memzero(line1, max_channels * bottomwidth * sizeof(half));
359 | 
360 |                 if (j == 0 && startchannel == 0) {
361 |                     memzero(dline, neighborhood_grid_width * topwidth * sizeof(half));
362 |                 } else {
363 | #if defined(USE_DMA)
364 |                     dmacpyLineSrcStrideStart(
365 |                         top + top_channel_y * neighborhood_grid_width * topheight * topwidth + blockIdx_y * topwidth,
366 |                         dline,
367 |                         topwidth * neighborhood_grid_width * sizeof(half),
368 |                         topwidth * sizeof(half),
369 |                         topwidth * topheight * sizeof(half));
370 | #else
371 |                     for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++) {
372 |                         for (int blockIdx_x = 0; blockIdx_x < topwidth / 8; blockIdx_x++) {
373 |                             half8 val = ((
374 |                                 __global half8
375 |                                     *)(top + ((top_channel_y * neighborhood_grid_width + top_channel_x) * topheight * topwidth + blockIdx_y * topwidth)))
376 |                                 [blockIdx_x];
377 |                             ((half8 *)(dline + top_channel_x * topwidth))[blockIdx_x] = val;
378 |                         }
379 |                         for (int blockIdx_x = (topwidth / 8) * 8; blockIdx_x < topwidth; blockIdx_x++) {
380 |                             dline[top_channel_x * topwidth + blockIdx_x] =
381 |                                 top[(top_channel_y * neighborhood_grid_width + top_channel_x) * topheight * topwidth
382 |                                     + blockIdx_y * topwidth + blockIdx_x];
383 |                         }
384 |                     }
385 | #endif
386 |                 }
387 | 
388 |                 if (y1 + j - padding >= 0 && y1 + j - padding < bottomheight && y2 + j - padding >= 0
389 |                     && y2 + j - padding < bottomheight) {
390 |                     crosscorrh(
391 |                         line0,
392 |                         line1,
393 |                         dline,
394 |                         topwidth,
395 |                         max_displacement,
396 |                         neighborhood_grid_radius,
397 |                         kernel_size,
398 |                         padding,
399 |                         bottomwidth,
400 |                         stride1,
401 |                         stride2,
402 |                         max_channels,
403 |                         subchannels);
404 |                 }
405 | 
406 |                 if (j == kernel_size - 1 && endchannel == bottomchannels) {
407 |                     half8 scale = (half8){
408 |                         (half)sumelems,
409 |                         (half)sumelems,
410 |                         (half)sumelems,
411 |                         (half)sumelems,
412 |                         (half)sumelems,
413 |                         (half)sumelems,
414 |                         (half)sumelems,
415 |                         (half)sumelems};
416 |                     for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++) {
417 |                         for (int blockIdx_x = 0; blockIdx_x < topwidth / 8; blockIdx_x++) {
418 |                             ((half8 *)(dline + top_channel_x * topwidth))[blockIdx_x] =
419 |                                 ((half8 *)(dline + top_channel_x * topwidth))[blockIdx_x] / scale;
420 |                         }
421 |                         for (int blockIdx_x = (topwidth / 8) * 8; blockIdx_x < topwidth; blockIdx_x++) {
422 |                             dline[top_channel_x * topwidth + blockIdx_x] =
423 |                                 dline[top_channel_x * topwidth + blockIdx_x] / (half)sumelems;
424 |                         }
425 |                     }
426 |                 }
427 | 
428 | #if defined(USE_DMA)
429 |                 dmacpyLineDstStrideStart(
430 |                     dline,
431 |                     top + top_channel_y * neighborhood_grid_width * topheight * topwidth + blockIdx_y * topwidth,
432 |                     topwidth * neighborhood_grid_width * sizeof(half),
433 |                     topwidth * sizeof(half),
434 |                     topwidth * topheight * sizeof(half));
435 | #else
436 |                 for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++) {
437 |                     for (int blockIdx_x = 0; blockIdx_x < topwidth / 8; blockIdx_x++) {
438 |                         ((__global half8
439 |                               *)(top + ((top_channel_y * neighborhood_grid_width + top_channel_x) * topheight * topwidth + blockIdx_y * topwidth)))
440 |                             [blockIdx_x] = ((half8 *)(dline + top_channel_x * topwidth))[blockIdx_x]
441 |                                            + (half8){0, 0, 0, 0, 0, 0, 0, 0};
442 |                     }
443 |                     for (int blockIdx_x = (topwidth / 8) * 8; blockIdx_x < topwidth; blockIdx_x++) {
444 |                         top[(top_channel_y * neighborhood_grid_width + top_channel_x) * topheight * topwidth
445 |                             + blockIdx_y * topwidth + blockIdx_x] =
446 |                             dline[top_channel_x * topwidth + blockIdx_x] + (half)0;
447 |                     }
448 |                 }
449 | #endif
450 |             }
451 |         }
452 |     }
453 | }
454 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/ctc.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:292de0fbb8dc6ead6970576d1b9a26a323fc9febfceb92c3af6b84496d523def
3 | size 10196
4 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/ctc.cl:
--------------------------------------------------------------------------------
 1 | // Copyright (C) 2018-2022 Intel Corporation
 2 | // SPDX-License-Identifier: Apache-2.0
 3 | //
 4 | 
 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
 7 | 
 8 | __global const half *find(__global const half *begin, __global const half *end, half value)
 9 | {
10 |     // Linear scan of [begin, end): stop at the first element equal to `value`.
11 |     // The return type is const-qualified so the read-only input pointer is not
12 |     // silently stripped of its qualifier; `end` is returned when nothing matches.
13 |     while (begin != end && *begin != value) {
14 |         ++begin;
15 |     }
16 |     return begin;
17 | }
18 | 
19 | __kernel void CTCDecoder(
20 |     __global half *restrict probabilities, // class scores, indexed below as [t][b][c]
21 |     __global half *restrict sequence_indicators, // B rows of T flags; first 0 after step 0 ends the row
22 |     __global half *restrict output, // B rows of T decoded class indices; unused slots hold -1
23 |     int width, // C: number of classes
24 |     int height, // B: number of batches
25 |     int channels) // T: number of time steps
26 | {
27 |     __local half local_src[88 * 1 * 77]; // NOTE(review): fixed capacity, presumably T=88, B=1, C=77 - confirm
28 |     __local half local_dst[88 * 1];
29 |     // Stage all width * height * channels input scores in local memory before decoding.
30 |     event_t e1 = async_work_group_copy_2D2D(
31 |         local_src, // dst
32 |         probabilities, // src
33 |         width, // num_elements_per_line,
34 |         height * channels, // num_lines,
35 |         width * (height - 1), // src_line_stride,
36 |         width * (height - 1), // dst_line_stride,
37 |         0);
38 | 
39 |     wait_group_events(1, &e1);
40 | 
41 |     const int T = channels; // Time
42 |     const int B = height; // Batches
43 |     const int C = width; // Chars
44 |     // Pre-fill the result with -1 so slots that never receive a class stay marked as empty.
45 |     #pragma unroll 4
46 |     for (int i = 0; i < B * T; i++) {
47 |         local_dst[i] = -1.h;
48 |     }
49 | 
50 |     // Decode each batch row independently: pick the arg-max class at every time step.
51 | 
52 |     for (int b = 0; b < B; ++b) {
53 |         __global const half *restrict seq_ind = sequence_indicators + b * T;
54 |         const int seq_len = find(seq_ind + 1, seq_ind + T, 0.h) - seq_ind;
55 |         const int time    = min(seq_len, T);
56 |         int output_index   = 0; // per-row write cursor; sharing it across rows would shift later rows
57 |         int prev_class_idx = -1;
58 | 
59 |         #pragma unroll 4
60 |         for (int t = 0; t < time; ++t) {
61 |             __local const half *restrict probs = local_src + b * C + t * C * B;
62 |             // Arg-max over the C class scores of this (time step, batch) pair.
63 |             int max_class_idx = 0;
64 |             half max_prob     = probs[0];
65 |             for (int c = 1; c < C; ++c) {
66 |                 const half prob = probs[c];
67 |                 if (prob > max_prob) {
68 |                     max_class_idx = c;
69 |                     max_prob      = prob;
70 |                 }
71 |             }
72 |             // Emit only if the winner is not the last class (C - 1) and differs from the previous step.
73 |             if (max_class_idx < C - 1 && max_class_idx != prev_class_idx) {
74 |                 local_dst[b * T + output_index] = (half)max_class_idx;
75 |                 output_index++;
76 |             }
77 | 
78 |             prev_class_idx = max_class_idx;
79 |         }
80 |     }
81 | 
82 |     barrier(CLK_LOCAL_MEM_FENCE);
83 |     // Write the decoded rows back: height lines of channels elements each.
84 |     event_t e2 = async_work_group_copy_2D2D(
85 |         output, // dst
86 |         local_dst, // src
87 |         channels, // num_elements_per_line,
88 |         height, // num_lines,
89 |         0, // src_line_stride,
90 |         0, // dst_line_stride,
91 |         0);
92 | 
93 |     wait_group_events(1, &e2);
94 | }
95 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/customLayerBindings.xml:
--------------------------------------------------------------------------------
  1 | <CustomLayer name="ReorgYolo" type="MVCL" version="1">
  2 |     <Kernel entry="reorg_hwc_naive">
  3 |         <Source filename="reorg_hwc_naive.bin"/>
  4 |         <Parameters>
  5 |             <Tensor arg-name="src" type="input" port-index="0" format="BYXF"/>
  6 |             <Tensor arg-name="dst" type="output" port-index="0" format="BYXF"/>
  7 |             <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
  8 |             <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
  9 |             <Scalar arg-name="C" type="int" port-index="0" source="I.F"/>
 10 |             <Scalar arg-name="stride" type="int" source="stride"/>
 11 |         </Parameters>
 12 |         <WorkSizes dim="input,0" global="F,1,1" local="stride*stride,1,1"/>
 13 |     </Kernel>
 14 | </CustomLayer>
 15 | 
 16 | <CustomLayer name="ReorgYolo" type="MVCL" version="1">
 17 |     <Kernel entry="reorg_chw">
 18 |         <Source filename="reorg_chw.bin"/>
 19 |         <Parameters>
 20 |             <Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
 21 |             <Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
 22 |             <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
 23 |             <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
 24 |             <Scalar arg-name="C" type="int" port-index="0" source="I.F"/>
 25 |             <Scalar arg-name="stride" type="int" source="stride"/>
 26 |         </Parameters>
 27 |         <WorkSizes dim="input,0" global="Y*F/(stride*stride),stride*stride,1" local="stride,stride,1"/>
 28 |     </Kernel>
 29 | </CustomLayer>
 30 | 
 31 | <CustomLayer name="RegionYolo" type="MVCL" version="1">
 32 |     <Where do_softmax="1"/>
 33 |     <Kernel entry="region_chw">
 34 |         <Source filename="region_chw.bin"/>
 35 |         <Parameters>
 36 |             <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
 37 |             <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
 38 |             <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
 39 |             <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
 40 |             <Scalar arg-name="classes" type="int" source="classes"/>
 41 |             <Scalar arg-name="coords" type="int" source="coords"/>
 42 |             <Scalar arg-name="num" type="int" source="num"/>
 43 |             <Scalar arg-name="maskSize" type="int" source="3"/>
 44 |             <Scalar arg-name="doSoftmax" type="int" source="do_softmax"/>
 45 |         </Parameters>
 46 |         <WorkSizes global="((X*Y+7)/8)*8,num,1" local="((X*Y+7)/8)*8,1,1" dim="input,0"/>
 47 |     </Kernel>
 48 | </CustomLayer>
 49 | 
 50 | <CustomLayer name="RegionYolo" type="MVCL" version="1">
 51 |    <Where do_softmax="0" mask="0,1,2"/>
 52 |    <Kernel entry="region_chw">
 53 |        <Source filename="region_chw.bin"/>
 54 |        <Parameters>
 55 |            <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
 56 |            <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
 57 |            <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
 58 |            <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
 59 |            <Scalar arg-name="classes" type="int" source="classes"/>
 60 |            <Scalar arg-name="coords" type="int" source="coords"/>
 61 |            <Scalar arg-name="num" type="int" source="num"/>
 62 |            <Scalar arg-name="maskSize" type="int" source="3"/>
 63 |            <Scalar arg-name="doSoftmax" type="int" source="do_softmax"/>
 64 |        </Parameters>
 65 |        <WorkSizes global="((X*Y+7)/8)*8,3,1" local="((X*Y+7)/8)*8,1,1" dim="input,0"/>
 66 |    </Kernel>
 67 | </CustomLayer>
 68 | 
 69 | <CustomLayer name="RegionYolo" type="MVCL" version="1">
 70 |    <Where do_softmax="1"/>
 71 |    <Kernel entry="region_hwc">
 72 |        <Source filename="region_hwc.bin"/>
 73 |        <Parameters>
 74 |            <Tensor arg-name="src" type="input" port-index="0" format="BYXF"/>
 75 |            <Tensor arg-name="dst" type="output" port-index="0" format="BYXF"/>
 76 |            <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
 77 |            <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
 78 |            <Scalar arg-name="classes" type="int" source="classes"/>
 79 |            <Scalar arg-name="coords" type="int" source="coords"/>
 80 |            <Scalar arg-name="num" type="int" source="num"/>
 81 |            <Scalar arg-name="maskSize" type="int" source="3"/>
 82 |            <Scalar arg-name="doSoftmax" type="int" source="do_softmax"/>
 83 |        </Parameters>
 84 |        <WorkSizes global="((X*Y+7)/8)*8,num,1" local="((X*Y+7)/8)*8,1,1" dim="input,0"/>
 85 |    </Kernel>
 86 | </CustomLayer>
 87 | 
 88 | <CustomLayer name="RegionYolo" type="MVCL" version="1">
 89 |    <Where do_softmax="0" mask="0,1,2"/>
 90 |    <Kernel entry="region_hwc">
 91 |        <Source filename="region_hwc.bin"/>
 92 |        <Parameters>
 93 |            <Tensor arg-name="src" type="input" port-index="0" format="BYXF"/>
 94 |            <Tensor arg-name="dst" type="output" port-index="0" format="BYXF"/>
 95 |            <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
 96 |            <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
 97 |            <Scalar arg-name="classes" type="int" source="classes"/>
 98 |            <Scalar arg-name="coords" type="int" source="coords"/>
 99 |            <Scalar arg-name="num" type="int" source="num"/>
100 |            <Scalar arg-name="maskSize" type="int" source="3"/>
101 |            <Scalar arg-name="doSoftmax" type="int" source="do_softmax"/>
102 |        </Parameters>
103 |        <WorkSizes global="((X*Y+7)/8)*8,3,1" local="((X*Y+7)/8)*8,1,1" dim="input,0"/>
104 |    </Kernel>
105 | </CustomLayer>
106 | 
107 | <!-- Pixel-wise kernel binding; the local work group is configured as one line of the input tensor -->
108 | <CustomLayer name="GRN" type="MVCL" version="1">
109 |     <Kernel entry="grn">
110 |         <Source filename="grn.bin"/>
111 |         <Parameters>
112 |             <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
113 |             <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
114 |             <Scalar arg-name="C" type="int" port-index="0" source="I.F"/>
115 |             <Scalar arg-name="bias" type="float" source="bias"/>
116 |         </Parameters>
117 |         <WorkSizes dim="input,0" global="X,Y,1" local="X,1,1"/>
118 |     </Kernel>
119 | </CustomLayer>
120 | 
121 | <!-- Two-stage layer binding: the first kernel computes mean and variance, the second one normalizes the input tensor -->
122 | <CustomLayer name="MVN" type="MVCL" version="1">
123 |     <Kernel entry="reduction_mean" stage="0">
124 |         <Source filename="mvn_reduction.bin"/>
125 |         <Parameters>
126 |             <Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
127 |             <Tensor arg-name="mean" type="output_buffer" port-index="0" dim="output,0" size="Y*F*4"/>
128 |             <Tensor arg-name="variance" type="output_buffer" port-index="1" dim="output,0" size="Y*F*4"/>
129 |             <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
130 |             <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
131 |             <Scalar arg-name="across_channels" type="int" source="across_channels"/>
132 |         </Parameters>
133 |         <WorkSizes dim="output,0" global="1,Y,F" local="1,1,1"/>
134 |     </Kernel>
135 |     <Kernel entry="mvn_scale" stage="1">
136 |         <Source filename="mvn_scale.bin"/>
137 |         <Parameters>
138 |             <Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
139 |             <Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
140 |             <Tensor arg-name="mean_part" type="input_buffer" port-index="0" dim="output,0" size="Y*F*4"/>
141 |             <Tensor arg-name="power_mean" type="input_buffer" port-index="1" dim="output,0" size="Y*F*4"/>
142 |             <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
143 |             <Scalar arg-name="H1" type="int" port-index="0" source="I.Y"/>
144 |             <Scalar arg-name="across_channels" type="int" source="across_channels"/>
145 |             <Scalar arg-name="normalize_variance" type="int" source="normalize_variance"/>
146 |             <Scalar arg-name="nparts" type="int" port-index="0" source="I.Y"/>
147 |         </Parameters>
148 |         <WorkSizes dim="output,0" global="1,Y,F" local="1,1,1"/>
149 |     </Kernel>
150 | </CustomLayer>
151 | 
152 | <!-- Single work-group kernel for a use case that is not embarrassingly parallel -->
153 | <CustomLayer name="CTCGreedyDecoder" type="MVCL" version="1" max-shaves="1">
154 |     <Kernel entry="CTCDecoder">
155 |         <Source filename="ctc.bin"/>
156 |         <Parameters>
157 |             <Tensor arg-name="probabilities" type="input" port-index="0" format="FYX"/>
158 |             <Tensor arg-name="sequence_indicators" type="input" port-index="1" format="BF"/>
159 |             <Tensor arg-name="output" type="output" port-index="0" format="BFYX"/>
160 |             <Scalar arg-name="width" type="int" port-index="0" source="I.X"/>
161 |             <Scalar arg-name="height" type="int" port-index="0" source="I.Y"/>
162 |             <Scalar arg-name="channels" type="int" port-index="0" source="I.F"/>
163 |         </Parameters>
164 |         <WorkSizes dim="output,0" global="1,1,1" local="1,1,1"/>
165 |     </Kernel>
166 | </CustomLayer>
167 | 
168 | <CustomLayer name="ShuffleChannel" type="MVCL" version="1">
169 |     <!-- Where clause artificially added for testing purposes; the kernel itself supports arbitrary grouping -->
170 |     <!-- <Where group="2"/> -->
171 |     <Kernel entry="ShuffleChannel">
172 |         <Source filename="shuffle_channels.bin"/>
173 |         <Parameters>
174 |             <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
175 |             <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
176 |             <Scalar arg-name="C" type="int" port-index="0" source="I.F"/>
177 |             <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
178 |             <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
179 |             <Scalar arg-name="G" type="int" source="group"/>
180 |         </Parameters>
181 |         <WorkSizes dim="input,0" global="F,1,1" local="1,1,1"/>
182 |     </Kernel>
183 | </CustomLayer>
184 | 
185 | <!-- Reference version of generic quantize layer, should be changed to FakeQuantize-->
186 | <CustomLayer name="FakeQuantize" type="MVCL" version="1">
187 |     <Kernel entry="quantize">
188 |         <Source filename="fakequantize.bin"/>
189 |         <Parameters>
190 |             <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
191 |             <Tensor arg-name="input_low" type="input" port-index="1" format="ANY"/>
192 |             <Tensor arg-name="input_high" type="input" port-index="2" format="ANY"/>
193 |             <Tensor arg-name="output_low" type="input" port-index="3" format="ANY"/>
194 |             <Tensor arg-name="output_high" type="input" port-index="4" format="ANY"/>
195 |             <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
196 |             <Scalar arg-name="levels" type="int" source="levels"/>
197 |             <Scalar arg-name="input_low_size" type="int" port-index="1" source="I.F"/>
198 |             <Scalar arg-name="input_high_size" type="int" port-index="2" source="I.F"/>
199 |             <Scalar arg-name="output_low_size" type="int" port-index="3" source="I.F"/>
200 |             <Scalar arg-name="output_high_size" type="int" port-index="4" source="I.F"/>
201 |             <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
202 |             <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
203 |         </Parameters>
204 |         <WorkSizes dim="input,0" global="1,Y,F" local="1,Y,1"/>
205 |     </Kernel>
206 | </CustomLayer>
207 | 
208 | <!-- Reference version of generic quantize layer, should be changed to FakeQuantize-->
209 | <CustomLayer name="FakeQuantizeBin" type="MVCL" version="1">
210 |     <Where levels="2"/>
211 |     <Kernel entry="binarization">
212 |         <Source filename="binarization.bin"/>
213 |         <Parameters>
214 |             <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
215 |             <Tensor arg-name="input_low_high" type="input" port-index="1" format="BFYX"/>
216 |             <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
217 |             <Scalar arg-name="switch_out" type="int" source="switch_out"/>
218 |             <Scalar arg-name="input_low_high_size" type="int" source="input_low_size"/>
219 |             <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
220 |             <Scalar arg-name="H" type="int" port-index="0" source="I.Y"/>
221 |             <Tensor arg-name="input_high" type="input" port-index="2" format="BFYX"/>
222 |             <Tensor arg-name="output_low" type="input" port-index="3" format="BFYX"/>
223 |             <Tensor arg-name="output_high" type="input" port-index="4" format="BFYX"/>
224 |             <Scalar arg-name="input_high_size" type="int" source="input_high_size"/>
225 |             <Scalar arg-name="output_low_size" type="int" source="output_low_size"/>
226 |             <Scalar arg-name="output_high_size" type="int" source="output_high_size"/>
227 |             <Data arg-name="src_local" type="local_data" dim="input,0" size="X*Y*2"/>
228 |             <Data arg-name="dst_local" type="local_data" dim="input,0" size="X*Y*2"/>
229 |         </Parameters>
230 |         <WorkSizes dim="input,0" global="1,1,F" local="1,1,1"/>
231 |     </Kernel>
232 | </CustomLayer>
233 | 
234 | <CustomLayer name="BinaryConvolution" type="MVCL" version="1">
235 |     <Where kernel="3,3"/>
236 |     <Kernel entry="binary_convolution">
237 |         <Source filename="binary_convolution3x3.bin"/>
238 |         <Parameters>
239 |             <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
240 |             <Data arg-name="weights_data" type="data" source="weights" format="ANY"/>
241 |             <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
242 |             <Scalar arg-name="pad_value" type="float" source="pad_value"/>
243 |             <Scalar arg-name="IW" type="int" port-index="0" source="I.X"/>
244 |             <Scalar arg-name="IH" type="int" port-index="0" source="I.Y"/>
245 |             <Scalar arg-name="IC" type="int" port-index="0" source="I.F"/>
246 |             <Scalar arg-name="DW" type="int" port-index="0" source="dilations"/>
247 |             <Scalar arg-name="DH" type="int" port-index="1" source="dilations"/>
248 |             <Scalar arg-name="GC" type="int" source="group"/>
249 |             <Scalar arg-name="KW" type="int" port-index="0" source="kernel"/>
250 |             <Scalar arg-name="KH" type="int" port-index="1" source="kernel"/>
251 |             <Scalar arg-name="PW" type="int" port-index="0" source="pads_begin"/>
252 |             <Scalar arg-name="PH" type="int" port-index="1" source="pads_begin"/>
253 |             <Scalar arg-name="SW" type="int" port-index="0" source="strides"/>
254 |             <Scalar arg-name="SH" type="int" port-index="1" source="strides"/>
255 |             <Scalar arg-name="OW" type="int" port-index="0" source="O.X"/>
256 |         </Parameters>
257 |         <WorkSizes dim="output,0" global="Y,F,1" local="1,1,1"/>
258 |     </Kernel>
259 | </CustomLayer>
260 | 
261 | <CustomLayer name="BinaryConvolution" type="MVCL" version="1">
262 |     <Where kernel="1,1"/>
263 |     <Kernel entry="binary_convolution">
264 |         <Source filename="binary_convolution1x1.bin"/>
265 |         <Parameters>
266 |             <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
267 |             <Data arg-name="weights_data" type="data" source="weights" format="ANY"/>
268 |             <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
269 |             <Scalar arg-name="pad_value" type="float" source="pad_value"/>
270 |             <Scalar arg-name="IW" type="int" port-index="0" source="I.X"/>
271 |             <Scalar arg-name="IH" type="int" port-index="0" source="I.Y"/>
272 |             <Scalar arg-name="IC" type="int" port-index="0" source="I.F"/>
273 |             <Scalar arg-name="DW" type="int" port-index="0" source="dilations"/>
274 |             <Scalar arg-name="DH" type="int" port-index="1" source="dilations"/>
275 |             <Scalar arg-name="GC" type="int" source="group"/>
276 |             <Scalar arg-name="KW" type="int" port-index="0" source="kernel"/>
277 |             <Scalar arg-name="KH" type="int" port-index="1" source="kernel"/>
278 |             <Scalar arg-name="PW" type="int" port-index="0" source="pads_begin"/>
279 |             <Scalar arg-name="PH" type="int" port-index="1" source="pads_begin"/>
280 |             <Scalar arg-name="SW" type="int" port-index="0" source="strides"/>
281 |             <Scalar arg-name="SH" type="int" port-index="1" source="strides"/>
282 |             <Scalar arg-name="OW" type="int" port-index="0" source="O.X"/>
283 |         </Parameters>
284 |         <WorkSizes dim="output,0" global="Y,F,1" local="1,1,1"/>
285 |     </Kernel>
286 | </CustomLayer>
287 | 
288 | <!-- Reference version of generic quantize binary convolution -->
289 | <!-- An example of a kernel binding that uses data blob from IR -->
290 | <CustomLayer name="BinaryConvolution" type="MVCL" version="1">
291 |     <Kernel entry="binary_convolution">
292 |         <Source filename="binary_convolution.bin"/>
293 |         <Parameters>
294 |             <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
295 |             <Data arg-name="weights_data" type="data" source="weights" format="ANY"/>
296 |             <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
297 |             <Scalar arg-name="pad_value" type="float" source="pad_value"/>
298 |             <Scalar arg-name="IW" type="int" port-index="0" source="I.X"/>
299 |             <Scalar arg-name="IH" type="int" port-index="0" source="I.Y"/>
300 |             <Scalar arg-name="IC" type="int" port-index="0" source="I.F"/>
301 |             <Scalar arg-name="DW" type="int" port-index="0" source="dilations"/>
302 |             <Scalar arg-name="DH" type="int" port-index="1" source="dilations"/>
303 |             <Scalar arg-name="GC" type="int" source="group"/>
304 |             <Scalar arg-name="KW" type="int" port-index="0" source="kernel"/>
305 |             <Scalar arg-name="KH" type="int" port-index="1" source="kernel"/>
306 |             <Scalar arg-name="PW" type="int" port-index="0" source="pads_begin"/>
307 |             <Scalar arg-name="PH" type="int" port-index="1" source="pads_begin"/>
308 |             <Scalar arg-name="SW" type="int" port-index="0" source="strides"/>
309 |             <Scalar arg-name="SH" type="int" port-index="1" source="strides"/>
310 |         </Parameters>
311 |         <WorkSizes dim="output,0" global="X,Y,F" local="1,1,1"/>
312 |     </Kernel>
313 | </CustomLayer>
314 | 
315 | <CustomLayer name="Resample" type="MVCL" version="1">
316 |     <Where antialias="0"/>
317 |     <Kernel entry="resample_nearest">
318 |         <Source filename="resample_noAA.bin"/>
319 |         <Parameters>
320 |             <Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
321 |             <Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
322 |             <Scalar arg-name="iw" type="int" port-index="0" source="I.X"/>
323 |             <Scalar arg-name="ih" type="int" port-index="0" source="I.Y"/>
324 |             <Scalar arg-name="factor" type="float" source="factor"/>
325 |             <Scalar arg-name="ow" type="int" port-index="0" source="O.X"/>
326 |             <Scalar arg-name="oh" type="int" port-index="0" source="O.Y"/>
327 |             <Scalar arg-name="channels" type="int" port-index="0" source="I.F"/>
328 |         </Parameters>
329 |         <WorkSizes global="1,Y,1" local="1,1,1" dim="output,0"/>
330 |     </Kernel>
331 | </CustomLayer>
332 | 
333 | <CustomLayer name="Resample" type="MVCL" version="1">
334 |     <Where antialias="1"/>
335 |     <Kernel entry="resample_with_antialias">
336 |         <Source filename="resample_AA.bin"/>
337 |         <Parameters>
338 |             <Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
339 |             <Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
340 |             <Scalar arg-name="iw" type="int" port-index="0" source="I.X"/>
341 |             <Scalar arg-name="ih" type="int" port-index="0" source="I.Y"/>
342 |             <Scalar arg-name="factor" type="float" source="factor"/>
343 |             <Scalar arg-name="ow" type="int" port-index="0" source="O.X"/>
344 |             <Scalar arg-name="oh" type="int" port-index="0" source="O.Y"/>
345 |             <Scalar arg-name="channels" type="int" port-index="0" source="I.F"/>
346 |         </Parameters>
347 |         <WorkSizes global="1,round(Y*factor),F" local="1,1,F" dim="input,0"/>
348 |     </Kernel>
349 | </CustomLayer>
350 | 
351 | <CustomLayer name="Convolution" type="MVCL" version="1">
352 |     <Where kernel="1,1" dilation="1,1"/>
353 |     <Kernel entry="Convolution1x1_NCHW">
354 |        <Source filename="convolution1x1_chw.bin"/>
355 |        <Parameters>
356 |            <Tensor arg-name="in" type="input" port-index="0" format="BFYX"/>
357 |            <Tensor arg-name="out" type="output" port-index="0" format="BFYX"/>
358 |            <Data arg-name="w" type="data" source="weights" format="ANY"/>
359 |            <Scalar arg-name="IW" type="int" port-index="0" source="I.X"/>
360 |            <Scalar arg-name="IH" type="int" port-index="0" source="I.Y"/>
361 |            <Scalar arg-name="IC" type="int" port-index="0" source="I.F"/>
362 |            <Scalar arg-name="OW" type="int" port-index="0" source="O.X"/>
363 |            <Scalar arg-name="OH" type="int" port-index="0" source="O.Y"/>
364 |            <Scalar arg-name="OC" type="int" port-index="0" source="O.F"/>
365 | 
366 |            <Scalar arg-name="stride-x" type="int" port-index="0" source="stride"/>
367 |            <Scalar arg-name="stride-y" type="int" port-index="1" source="stride"/>
368 |            <Scalar arg-name="pad-x" type="int" port-index="0" source="pads_begin"/>
369 |            <Scalar arg-name="pad-y" type="int" port-index="1" source="pads_begin"/>
370 |            <Scalar arg-name="kernel-x" type="int" port-index="0" source="kernel"/>
371 |            <Scalar arg-name="kernel-y" type="int" port-index="1" source="kernel"/>
372 |            <Scalar arg-name="output" type="int" port-index="0" source="output"/>
373 |            <Scalar arg-name="group" type="int" port-index="0" source="group"/>
374 |        </Parameters>
375 |        <WorkSizes global="Y,F,B" local="1,1,1" dim="output,0"/>
376 |    </Kernel>
377 | </CustomLayer>
378 | 
379 | <CustomLayer name="Convolution" type="MVCL" version="1"> <!-- Convolution binding for the case described by the Where element below (kernel 1,1 and dilation 1,1); TODO confirm the exact matching rules against the plugin documentation. -->
380 |     <Where kernel="1,1" dilation="1,1"/>
381 |     <Kernel entry="Convolution1x1_NHWC"> <!-- kernel entry point, taken from the precompiled convolution1x1_hwc.bin named below -->
382 |        <Source filename="convolution1x1_hwc.bin"/>
383 |        <Parameters>
384 |            <Tensor arg-name="in" type="input" port-index="0" format="BYXF"/> <!-- input is declared BYXF while the output below is declared BFYX -->
385 |            <Tensor arg-name="out" type="output" port-index="0" format="BFYX"/>
386 |            <Data arg-name="w" type="data" source="weights" format="ANY"/>
387 |            <Scalar arg-name="IW" type="int" port-index="0" source="I.X"/>
388 |            <Scalar arg-name="IH" type="int" port-index="0" source="I.Y"/>
389 |            <Scalar arg-name="IC" type="int" port-index="0" source="I.F"/>
390 |            <Scalar arg-name="OW" type="int" port-index="0" source="O.X"/>
391 |            <Scalar arg-name="OH" type="int" port-index="0" source="O.Y"/>
392 |            <Scalar arg-name="OC" type="int" port-index="0" source="O.F"/>
393 | 
394 |            <Scalar arg-name="stride-x" type="int" port-index="0" source="stride"/> <!-- the x/y pairs below read the same source attribute with port-index 0 and 1 respectively -->
395 |            <Scalar arg-name="stride-y" type="int" port-index="1" source="stride"/>
396 |            <Scalar arg-name="pad-x" type="int" port-index="0" source="pads_begin"/>
397 |            <Scalar arg-name="pad-y" type="int" port-index="1" source="pads_begin"/>
398 |            <Scalar arg-name="kernel-x" type="int" port-index="0" source="kernel"/>
399 |            <Scalar arg-name="kernel-y" type="int" port-index="1" source="kernel"/>
400 |            <Scalar arg-name="output" type="int" port-index="0" source="output"/>
401 |            <Scalar arg-name="group" type="int" port-index="0" source="group"/>
402 |        </Parameters>
403 |        <WorkSizes global="Y,F,B" local="1,1,1" dim="output,0"/> <!-- global sizes are the Y, F, B extents of the tensor named by dim (output port 0); every work-group is a single work-item -->
404 |    </Kernel>
405 | </CustomLayer>
406 | 
407 | <CustomLayer name="Convolution" type="MVCL" version="1"> <!-- Convolution binding for the case described by the Where element below (kernel 3,3 and dilation 1,1); TODO confirm the exact matching rules against the plugin documentation. -->
408 |     <Where kernel="3,3" dilation="1,1"/>
409 |     <Kernel entry="Convolution3x3"> <!-- kernel entry point, taken from the precompiled convolution3x3.bin named below -->
410 |         <Source filename="convolution3x3.bin"/>
411 |         <Parameters>
412 |             <Tensor arg-name="in_param" type="input" port-index="0" format="BFYX"/>
413 |             <Tensor arg-name="out" type="output" port-index="0" format="BFYX"/>
414 |             <Data arg-name="w" type="data" source="weights" format="BFYX"/>
415 |             <Scalar arg-name="IW" type="int" port-index="0" source="I.X"/>
416 |             <Scalar arg-name="IH" type="int" port-index="0" source="I.Y"/>
417 |             <Scalar arg-name="IC" type="int" port-index="0" source="I.F"/>
418 |             <Scalar arg-name="OW" type="int" port-index="0" source="O.X"/>
419 |             <Scalar arg-name="OH" type="int" port-index="0" source="O.Y"/>
420 |             <Scalar arg-name="OC" type="int" port-index="0" source="O.F"/>
421 |             <Scalar arg-name="KX" type="int" port-index="0" source="kernel"/> <!-- the x/y pairs below read the same source attribute with port-index 0 and 1 respectively -->
422 |             <Scalar arg-name="KY" type="int" port-index="1" source="kernel"/>
423 |             <Scalar arg-name="stride_x" type="int" port-index="0" source="stride"/>
424 |             <Scalar arg-name="stride_y" type="int" port-index="1" source="stride"/>
425 |             <Scalar arg-name="pad_x" type="int" port-index="0" source="pads_begin"/>
426 |             <Scalar arg-name="pad_y" type="int" port-index="1" source="pads_begin"/>
427 |             <Scalar arg-name="dilation_x" type="int" port-index="0" source="dilation"/>
428 |             <Scalar arg-name="dilation_y" type="int" port-index="1" source="dilation"/>
429 |             <Scalar arg-name="output" type="int" port-index="0" source="output"/>
430 | 
431 |             <Data arg-name="in_local" type="local_data" dim="input,0" size="X*F*3*2"/> <!-- local_data sizes are expressions over the tensor named by dim; the trailing *2 is presumably the byte size of a half element - TODO confirm -->
432 |             <Data arg-name="out_local" type="local_data" dim="output,0" size="X*F*2"/>
433 |             <Data arg-name="w_local" type="local_data" dim="input,0" size="3*3*F*2"/>
434 |         </Parameters>
435 |         <WorkSizes global="Y,F,B" local="1,1,1" dim="output,0"/> <!-- global sizes are the Y, F, B extents of the tensor named by dim (output port 0); every work-group is a single work-item -->
436 |     </Kernel>
437 | </CustomLayer>
438 | 
439 | <CustomLayer name="ExperimentalDetectronPriorGridGenerator" type="MVCL" version="1"> <!-- argument names and their order mirror the kernel signature in detectron_prior_grid_gen.cl -->
440 |     <Kernel entry="experimental_detectron_prior_grid_generator">
441 |         <Source filename="detectron_prior_grid_gen.bin"/>
442 |         <Parameters>
443 |             <Tensor arg-name="input_priors" type="input" port-index="0" format="BFYX"/>
444 |             <Tensor arg-name="input_feature_map" type="input" port-index="1" format="BFYX"/>
445 |             <Tensor arg-name="input_rois" type="input" port-index="2" format="BFYX"/>
446 |             <Tensor arg-name="output" type="output" port-index="0" format="BFYX"/>
447 |             <Scalar arg-name="grid_h" type="int" port-index="1" source="I.Y"/> <!-- grid size is read from input port 1 (the feature map tensor above) -->
448 |             <Scalar arg-name="grid_w" type="int" port-index="1" source="I.X"/>
449 |             <Scalar arg-name="stride_h" type="float" source="stride_h"/>
450 |             <Scalar arg-name="stride_w" type="float" source="stride_w"/>
451 |             <Scalar arg-name="num_priors" type="int" port-index="0" source="I.Y"/> <!-- prior counts are read from input port 0 (the priors tensor above) -->
452 |             <Scalar arg-name="num_anchors_per_prior" type="int" port-index="0" source="I.X"/>
453 |         </Parameters>
454 |         <WorkSizes dim="input,1" global="((X+31)/32)*32,Y,1" local="32,1,1"/> <!-- X is rounded up to a multiple of 32 so it divides evenly into the 32-wide work-groups -->
455 |     </Kernel>
456 | </CustomLayer>
457 | 
458 | <CustomLayer name="Convert" type="MVCL" version="1"> <!-- arguments match cvtu8f16(src, dst, scale, bias) as declared in cvtu8f16.cl -->
459 |     <Kernel entry="cvtu8f16">
460 |         <Source filename="cvtu8f16.bin"/>
461 |         <Parameters>
462 |             <Tensor arg-name="src" type="input" port-index="0" format="BFYX"/>
463 |             <Tensor arg-name="dst" type="output" port-index="0" format="BFYX"/>
464 |             <Scalar arg-name="scale" type="float" source="scale"/>
465 |             <Scalar arg-name="bias" type="float" source="bias"/>
466 |         </Parameters>
467 |         <WorkSizes dim="input,0" global="X,Y,F" local="X,1,1"/> <!-- local size X,1,1: each work-group spans one full row; cvtu8f16.cl stages it in 8 * 1024-element local buffers, so this presumably requires X <= 8192 - TODO confirm -->
468 |     </Kernel>
469 | </CustomLayer>
470 | 
471 | <CustomLayer name="Correlate" type="MVCL" version="1"> <!-- maps Correlate to entry correlate2_half in the precompiled correlate.bin -->
472 |     <Kernel entry="correlate2_half">
473 |         <Source filename="correlate.bin"/>
474 |         <Parameters>
475 |             <Tensor arg-name="bottom0" type="input" port-index="0" format="BFYX"/>
476 |             <Tensor arg-name="bottom1" type="input" port-index="1" format="BFYX"/>
477 |             <Tensor arg-name="top" type="output" port-index="0" format="BFYX"/>
478 |             <Scalar arg-name="topwidth" type="int" source="top_width"/>
479 |             <Scalar arg-name="topheight" type="int" source="top_height"/>
480 |             <Scalar arg-name="bottomwidth" type="int" port-index="0" source="I.X"/> <!-- bottom* sizes are read from input port 0 -->
481 |             <Scalar arg-name="bottomheight" type="int" port-index="0" source="I.Y"/>
482 |             <Scalar arg-name="bottomchannels" type="int" port-index="0" source="I.F"/>
483 |             <Scalar arg-name="max_displacement" type="int" source="displacement"/>
484 |             <Scalar arg-name="padding" type="int" source="pad"/>
485 |             <Scalar arg-name="neighborhood_grid_radius" type="int" source="neighborhood_grid_radius"/>
486 |             <Scalar arg-name="neighborhood_grid_width" type="int" source="neighborhood_grid_width"/>
487 |             <Scalar arg-name="kernel_size" type="int" source="kernel_size"/>
488 |             <Scalar arg-name="stride1" type="int" port-index="0" source="stride"/> <!-- stride1/stride2 read the same source attribute with port-index 0 and 1 respectively -->
489 |             <Scalar arg-name="stride2" type="int" port-index="1" source="stride"/>
490 |         </Parameters>
491 |         <WorkSizes dim="input,0" global="top_height,1,1" local="1,1,1"/> <!-- the global size names top_height, which is also the source of the topheight scalar above, rather than a tensor dimension letter; presumably one work-item per output row - TODO confirm -->
492 |     </Kernel>
493 | </CustomLayer>
494 | 
495 | <CustomLayer name="SpatialTransform" type="MVCL" version="1"> <!-- maps SpatialTransform to entry ocl_st in the precompiled st.bin -->
496 |     <Kernel entry="ocl_st">
497 |         <Source filename="st.bin"/>
498 |         <Parameters>
499 |             <Tensor arg-name="src_data" type="input" port-index="0" format="BFYX"/>
500 |             <Tensor arg-name="theta" type="input" port-index="1" format="ANY"/>
501 |             <Tensor arg-name="dst_data" type="output" port-index="0" format="BFYX"/>
502 |             <Scalar arg-name="C" type="int" port-index="0" source="I.F"/> <!-- C and W are the F and X extents of input port 0 -->
503 |             <Scalar arg-name="W" type="int" port-index="0" source="I.X"/>
504 |         </Parameters>
505 |         <WorkSizes dim="input,0" global="(X+511)/512,Y,1" local="1,1,1"/> <!-- (X+511)/512 is ceil(X/512): one single-item work-group per block of up to 512 columns, per row -->
506 |     </Kernel>
507 | </CustomLayer>
508 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/cvtf32f16.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:703ef56f84299e76d36b3ba5a632ae3d5e3ecd54761dcfe0006ca69ddce4bc6d
3 | size 2664
4 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/cvtf32f16.cl:
--------------------------------------------------------------------------------
 1 | // Copyright (C) 2018-2022 Intel Corporation
 2 | // SPDX-License-Identifier: Apache-2.0
 3 | //
 4 | 
 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 6 | 
 7 | __kernel void cvtf32f16(const __global float* restrict inImage,
 8 |                               __global half*  restrict outImage,
 9 |                                        float   scale,
10 |                                        float   bais)
11 | {   // Per element: outImage[i] = half(inImage[i] * scale + bais), where i linearizes the 3-D global id.
12 |     const size_t row_len   = get_global_size(0);
13 |     const size_t plane_len = row_len * get_global_size(1);
14 |     int idx = get_global_id(0) + get_global_id(1) * row_len + get_global_id(2) * plane_len;
15 | 
16 |     outImage[idx] = convert_half(inImage[idx]*scale+bais);
17 | }
18 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/cvtu8f16.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:46c943e08f37cedac77f727f55835637d4878edcc20aaa24f16ed5888d13bd43
3 | size 4588
4 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/cvtu8f16.cl:
--------------------------------------------------------------------------------
 1 | // Copyright (C) 2018-2022 Intel Corporation
 2 | // SPDX-License-Identifier: Apache-2.0
 3 | //
 4 | 
 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
 7 | 
 8 | __kernel void cvtu8f16(__global const uchar *restrict src, __global half *restrict dst, float scale, float bias)
 9 | {
10 |     // Per-work-group staging buffers for one tile: raw uchar input and converted half output.
11 |     __local uchar local_src[8 * 1024];
12 |     __local half local_dst[8 * 1024];
13 | 
14 |     // Element offset of this work-group's tile within the global buffers; used by both copies.
15 |     const size_t tile_offset = get_group_id(0) * get_local_size(0)
16 |                              + get_group_id(1) * get_local_size(1) * get_global_size(0)
17 |                              + get_group_id(2) * get_local_size(2) * get_global_size(0) * get_global_size(1);
18 |     // Values passed as the global-side line/plane stride arguments; the local side always passes 0.
19 |     const size_t line_gap  = get_global_size(0) - get_local_size(0);
20 |     const size_t plane_gap = get_global_size(0) * (get_global_size(1) - get_local_size(1));
21 | 
22 |     // Bring the input tile into local memory and wait for the copy to finish.
23 |     event_t load_done = async_work_group_copy_3D3D(
24 |         local_src, src + tile_offset, // dst, src
25 |         get_local_size(0), get_local_size(1), // num_elements_per_line, num_lines
26 |         line_gap, 0, // src_line_stride, dst_line_stride
27 |         get_local_size(2), // num_planes
28 |         plane_gap, 0, // src_plane_stride, dst_plane_stride
29 |         0);
30 |     wait_group_events(1, &load_done);
31 | 
32 |     // Position of this work-item inside the local tile.
33 |     const size_t row_len = get_local_size(0);
34 |     size_t idx = get_local_id(0) + row_len * (get_local_id(1) + get_local_id(2) * get_local_size(1));
35 | 
36 |     local_dst[idx] = convert_half(local_src[idx]) * (half)scale + (half)bias;
37 | 
38 |     barrier(CLK_LOCAL_MEM_FENCE); // all work-items finish writing local_dst before the group-wide write-back
39 | 
40 |     event_t store_done = async_work_group_copy_3D3D(
41 |         dst + tile_offset, local_dst, // dst, src
42 |         get_local_size(0), get_local_size(1), // num_elements_per_line, num_lines
43 |         0, line_gap, // src_line_stride, dst_line_stride
44 |         get_local_size(2), // num_planes
45 |         0, plane_gap, // src_plane_stride, dst_plane_stride
46 |         0);
47 |     wait_group_events(1, &store_done);
48 | }
49 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/detectron_prior_grid_gen.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:4c80d556d23f1c959fa10c00ff1cd9c3ae10aba607b37c7a0620d903fc7cedd8
3 | size 6972
4 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/detectron_prior_grid_gen.cl:
--------------------------------------------------------------------------------
 1 | // Copyright (C) 2018-2022 Intel Corporation
 2 | // SPDX-License-Identifier: Apache-2.0
 3 | //
 4 | 
 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
 7 | 
 8 | __kernel void experimental_detectron_prior_grid_generator( // For grid row get_group_id(1): adds a per-cell (column/row + 0.5) * stride offset to every prior's 4 values.
 9 |     __global const half *restrict input_priors,
10 |     __global const half *restrict input_feature_map, // not read in this kernel body
11 |     __global const half *restrict input_rois, // not read in this kernel body
12 |     __global half *restrict output,
13 |     int grid_h, // not read in this kernel body
14 |     int grid_w,
15 |     float stride_h,
16 |     float stride_w,
17 |     int num_priors,
18 |     int num_anchors_per_prior)
19 | {
20 |     __local half local_input_priors[8 * 1024];
21 |     __local half local_output[8 * 1024];
22 | 
23 |     event_t e1 = async_work_group_copy( // stage num_anchors_per_prior * num_priors prior values in local memory
24 |         local_input_priors,
25 |         input_priors,
26 |         num_anchors_per_prior * num_priors,
27 |         0);
28 |     wait_group_events(1, &e1);
29 | 
30 |     int width_start = get_group_id(0) * get_local_size(0); // first grid column handled by this work-group
31 |     int width_end   = min(width_start + get_local_size(0), (unsigned)grid_w); // clamp the span to grid_w
32 |     int width       = width_end - width_start; // number of grid columns this work-group produces
33 | 
34 |     int h     = get_group_id(1);
35 |     int w_idx = get_group_id(0) * get_local_size(0); // same expression as width_start
36 |     for (int w = 0; w < width; ++w) { // NOTE(review): the loop does not use get_local_id, so each work-item appears to run it in full - confirm
37 |         #pragma unroll 4
38 |         for (int p = 0; p < num_priors; ++p) {
39 |             local_output[(w * num_priors + p) * num_anchors_per_prior + 0] = // NOTE(review): priors are read with stride 4 but written with stride num_anchors_per_prior; these agree only if it equals 4 - confirm
40 |                 local_input_priors[4 * p + 0]
41 |                 + convert_half(stride_w) * (convert_half(w_idx + w) + 0.5);
42 |             local_output[(w * num_priors + p) * num_anchors_per_prior + 1] =
43 |                 local_input_priors[4 * p + 1] + convert_half(stride_h) * (convert_half(h) + 0.5);
44 |             local_output[(w * num_priors + p) * num_anchors_per_prior + 2] = // components 0 and 2 get the horizontal offset, 1 and 3 the vertical one
45 |                 local_input_priors[4 * p + 2]
46 |                 + convert_half(stride_w) * (convert_half(w_idx + w) + 0.5);
47 |             local_output[(w * num_priors + p) * num_anchors_per_prior + 3] =
48 |                 local_input_priors[4 * p + 3] + convert_half(stride_h) * (convert_half(h) + 0.5);
49 |         }
50 |     }
51 | 
52 |     barrier(CLK_LOCAL_MEM_FENCE);
53 | 
54 |     event_t e2 = async_work_group_copy_2D2D( // write this work-group's span of the row back as a single line
55 |         output + get_group_id(0) * get_local_size(0) * num_anchors_per_prior * num_priors
56 |             + get_group_id(1) * get_local_size(1) * grid_w * num_anchors_per_prior
57 |                   * num_priors, // dst
58 |         local_output, // src
59 |         width * num_anchors_per_prior * num_priors, // num_elements_per_line
60 |         1, // num_lines
61 |         (grid_w - width) * num_anchors_per_prior * num_priors, // src_line_stride
62 |         (grid_w - width) * num_anchors_per_prior * num_priors, // dst_line_stride
63 |         0);
64 |     wait_group_events(1, &e2);
65 | }
66 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/fakequantize.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:d17659bbf12a172849085003a055bfb4b91d3bb5bdc7f820395820eaa90b46ef
3 | size 15688
4 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/fakequantize.cl:
--------------------------------------------------------------------------------
  1 | // Copyright (C) 2018-2022 Intel Corporation
  2 | // SPDX-License-Identifier: Apache-2.0
  3 | //
  4 | 
  5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
  6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
  7 | 
  8 | __kernel void quantize( // Clamps each value to [input_low, input_high], snaps it to a level, and maps it into [output_low, output_high].
  9 |     __global const half *restrict src_data,
 10 |     __global const half *restrict input_low,
 11 |     __global const half *restrict input_high,
 12 |     __global const half *restrict output_low,
 13 |     __global const half *restrict output_high,
 14 |     __global half *restrict dst_data,
 15 |     int levels,
 16 |     int input_low_size, // a *_size of 1 selects element 0 for every channel; otherwise the limit is indexed by channel
 17 |     int input_high_size,
 18 |     int output_low_size,
 19 |     int output_high_size,
 20 |     int W,
 21 |     int H)
 22 | {
 23 |     __local half local_src[15 * 1024];
 24 |     __local half local_dst[15 * 1024];
 25 | 
 26 |     event_t e1 = async_work_group_copy(local_src, src_data + get_group_id(2) * W * H, W * H, 0); // stage the W * H plane selected by get_group_id(2)
 27 |     wait_group_events(1, &e1);
 28 | 
 29 |     int c = get_group_id(2);
 30 | 
 31 |     half h_ilow  = (input_low_size == 1 ? input_low[0] : input_low[c]);
 32 |     half h_ihigh = (input_high_size == 1 ? input_high[0] : input_high[c]);
 33 |     half h_olow  = (output_low_size == 1 ? output_low[0] : output_low[c]);
 34 |     half h_ohigh = (output_high_size == 1 ? output_high[0] : output_high[c]);
 35 | 
 36 |     half const1 = (half)( // input scale: (levels - 1) / (ihigh - ilow), or 0 when ihigh == ilow
 37 |         !(h_ihigh - h_ilow) ? 0.0f : convert_float(levels - 1) / (convert_float(h_ihigh) - convert_float(h_ilow)));
 38 |     half const2 = // output step: (ohigh - olow) / (levels - 1), or 0 when levels == 1
 39 |         (half)(!(levels - 1) ? 0.0f : (convert_float(h_ohigh) - convert_float(h_olow)) / convert_float(levels - 1));
 40 | 
 41 |     __local const half *restrict src = local_src + W * get_local_id(1); // each work-item processes the row selected by get_local_id(1)
 42 |     __local half       *restrict dst = local_dst + W * get_local_id(1);
 43 | 
 44 |     for (int w = 0; w < W / 8; w++) { // vector path: 8 elements per iteration
 45 |         half8 val = *((__local half8 *)src + w);
 46 |         half8 aux = (val - (half8)h_ilow) * (half8)const1 + (half8)0.5h; // scaled value plus 0.5, so the truncation below rounds
 47 | 
 48 |         aux = (half8){ // the round trip through short drops the fractional part of every lane
 49 |             (half)(short)(aux.s0),
 50 |             (half)(short)(aux.s1),
 51 |             (half)(short)(aux.s2),
 52 |             (half)(short)(aux.s3),
 53 |             (half)(short)(aux.s4),
 54 |             (half)(short)(aux.s5),
 55 |             (half)(short)(aux.s6),
 56 |             (half)(short)(aux.s7)};
 57 | 
 58 |         aux = aux * (half8)const2 + (half8)h_olow; // level index mapped into the output range
 59 | 
 60 |         short8 a; // per-lane flag: value is at or below the input low limit
 61 |         short8 b; // per-lane flag: value is above the input high limit
 62 |         a.s0 = (val.s0 <= h_ilow);
 63 |         a.s1 = (val.s1 <= h_ilow);
 64 |         a.s2 = (val.s2 <= h_ilow);
 65 |         a.s3 = (val.s3 <= h_ilow);
 66 |         a.s4 = (val.s4 <= h_ilow);
 67 |         a.s5 = (val.s5 <= h_ilow);
 68 |         a.s6 = (val.s6 <= h_ilow);
 69 |         a.s7 = (val.s7 <= h_ilow);
 70 | 
 71 |         b.s0 = (val.s0 > h_ihigh);
 72 |         b.s1 = (val.s1 > h_ihigh);
 73 |         b.s2 = (val.s2 > h_ihigh);
 74 |         b.s3 = (val.s3 > h_ihigh);
 75 |         b.s4 = (val.s4 > h_ihigh);
 76 |         b.s5 = (val.s5 > h_ihigh);
 77 |         b.s6 = (val.s6 > h_ihigh);
 78 |         b.s7 = (val.s7 > h_ihigh);
 79 | 
 80 |         a = ~(a - (short8)1); // turn each 0/1 flag into an all-zeros / all-ones bit mask
 81 |         b = ~(b - (short8)1);
 82 | 
 83 |         short8 c1 = (~a & b); // mask: above the high limit (and not below the low one)
 84 |         short8 c2 = (~a & ~b); // mask: strictly inside the input range
 85 | 
 86 |         short8 res = (a & as_short8((half8)h_olow)) | (c1 & as_short8((half8)h_ohigh)) | (c2 & as_short8(aux)); // bitwise select: olow, ohigh, or the quantized value
 87 | 
 88 |         *((__local half8 *)dst + w) = as_half8(res);
 89 |     }
 90 | 
 91 |     for (int w = W & (~0x7); w < W; w++) { // scalar tail: the remaining W % 8 elements, same select logic; rounding here uses round()
 92 |         half val = src[w];
 93 |         short a  = val <= h_ilow;
 94 |         a        = ~(a - 1);
 95 |         short b  = val > h_ihigh;
 96 |         b        = ~(b - 1);
 97 | 
 98 |         short c1 = (~a & b);
 99 |         short c2 = (~a & ~b);
100 | 
101 |         short res = (a & as_short(h_olow)) | (c1 & as_short(h_ohigh))
102 |                     | (c2 & as_short(((half)(round((val - h_ilow) * const1) * const2) + h_olow)));
103 | 
104 |         dst[w] = as_half(res);
105 |     }
106 | 
107 |     barrier(CLK_LOCAL_MEM_FENCE);
108 | 
109 |     event_t e2 = async_work_group_copy(dst_data + get_group_id(2) * W * H, local_dst, W * H, 0); // write the whole plane back
110 |     wait_group_events(1, &e2);
111 | }
112 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/grn.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:6e3dbe5173ca93f39fecaf29f820e1704bcb485affc1a09554e4c86f8de46214
3 | size 7972
4 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/grn.cl:
--------------------------------------------------------------------------------
 1 | // Copyright (C) 2018-2022 Intel Corporation
 2 | // SPDX-License-Identifier: Apache-2.0
 3 | //
 4 | 
 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
 7 | 
 8 | __kernel void grn(__global const half *restrict src_data, __global half *restrict dst_data, int C, float bias) // Scales every pixel's C channel values by 1 / sqrt(bias + 1e-9 + sum of squares over the channels).
 9 | {
10 |     __local half src[8 * 1024];
11 |     __local half dst[8 * 1024];
12 | 
13 |     const size_t index = get_group_id(0) * get_local_size(0) + get_group_id(1) * get_local_size(1) * get_global_size(0); // offset of this work-group's tile inside one plane
14 | 
15 |     event_t e1 = async_work_group_copy_3D3D( // load the tile for all C planes into local memory
16 |         src, // dst
17 |         src_data + index, // src
18 |         get_local_size(0), // num_elements_per_line,
19 |         get_local_size(1), // num_lines,
20 |         get_global_size(0) - get_local_size(0), // src_line_stride,
21 |         0, // dst_line_stride,
22 |         C, // num_planes,
23 |         get_global_size(0) * (get_global_size(1) - get_local_size(1)), // src_plane_stride
24 |         0, // dst_plane_stride
25 |         0);
26 |     wait_group_events(1, &e1);
27 | 
28 |     float variance = bias + 1e-9f; // accumulator starts at bias plus a small epsilon
29 | 
30 |     #pragma unroll 8
31 |     for (int c = 0; c < C; c++) { // sum of squares over the channels of this work-item's pixel
32 |         float val = (float)src[c * get_local_size(1) * get_local_size(0)
33 |                                + get_local_id(1) * get_local_size(0)
34 |                                + get_local_id(0)];
35 |         variance += val * val;
36 |     }
37 | 
38 |     half hvariance = (half)(native_rsqrt((half)(variance / 16.f)) * 0.25f); // algebraically rsqrt(variance): rsqrt(v / 16) * 0.25 == 1 / sqrt(v); presumably pre-scaled to keep the half argument in range - TODO confirm
39 | 
40 |     #pragma unroll 8
41 |     for (int c = 0; c < C; c++) { // scale every channel of this pixel by the reciprocal norm
42 |         dst[c * get_local_size(1) * get_local_size(0)
43 |             + get_local_id(1) * get_local_size(0)
44 |             + get_local_id(0)] =
45 |             src[c * get_local_size(1) * get_local_size(0)
46 |                   + get_local_id(1) * get_local_size(0) + get_local_id(0)] * hvariance;
47 |     }
48 | 
49 |     barrier(CLK_LOCAL_MEM_FENCE);
50 | 
51 |     event_t e2 = async_work_group_copy_3D3D( // write the tile back with the load's layout, source and destination swapped
52 |         dst_data + index, // dst
53 |         dst, // src
54 |         get_local_size(0), // num_elements_per_line,
55 |         get_local_size(1), // num_lines,
56 |         0, // src_line_stride,
57 |         get_global_size(0) - get_local_size(0), // dst_line_stride,
58 |         C, // num_planes,
59 |         0, // src_plane_stride
60 |         get_global_size(0) * (get_global_size(1) - get_local_size(1)), // dst_plane_stride
61 |         0);
62 |     wait_group_events(1, &e2);
63 | }
64 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/mvn_reduction.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:581206fc5c0e0d429094bc7076d8772dc6ba69199a3ee75d269f13dc2f0d7ac8
3 | size 7840
4 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/mvn_reduction.cl:
--------------------------------------------------------------------------------
  1 | // Copyright (C) 2018-2022 Intel Corporation
  2 | // SPDX-License-Identifier: Apache-2.0
  3 | //
  4 | 
  5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
  6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
  7 | 
  8 | // Set to 1 only if output is zeroed before kernel execution
  9 | #define USE_ATOMICS 0
 10 | 
 11 | void atomic_add_global(volatile __global float *source, const float operand)
 12 | {
 13 |     // View a float's bits as an unsigned int so the sum can be published with an integer compare-and-swap.
 14 |     typedef union {
 15 |         unsigned int bits;
 16 |         float value;
 17 |     } float_bits_t;
 18 |     volatile __global unsigned int *target = (volatile __global unsigned int *)source;
 19 |     float_bits_t expected;
 20 |     float_bits_t desired;
 21 | 
 22 |     do { // retry while the stored bits no longer match the value that was read
 23 |         expected.value = *source;
 24 |         desired.value  = expected.value + operand;
 25 |     } while (atomic_cmpxchg(target, expected.bits, desired.bits) != expected.bits);
 26 | }
 27 | 
 28 | __kernel void reduction_mean( // Produces per-work-group partial sums of x and x * x over rows of src.
 29 |     __global const half *restrict src,
 30 |     __global float *restrict mean, // receives sums of values (no division by the element count happens here)
 31 |     __global float *restrict variance, // receives sums of squared values, not a variance
 32 |     int W,
 33 |     int H,
 34 |     int across_channels) // only read in the USE_ATOMICS branch
 35 | {
 36 |     __local half src_line[4 * 1024];
 37 |     event_t e;
 38 | 
 39 |     e = async_work_group_copy_2D2D( // stage this work-group's rows in local memory
 40 |         src_line, // dst
 41 |         src + get_group_id(1) * get_local_size(1) * W
 42 |             + get_group_id(2) * get_local_size(2) * W * get_global_size(1), // src
 43 |         W * get_local_size(1), // num_elements_per_line,
 44 |         get_local_size(2), // num_lines,
 45 |         W * (get_global_size(1) - get_local_size(1)), // src_line_stride,
 46 |         0, // dst_line_stride,
 47 |         0);
 48 | 
 49 |     wait_group_events(1, &e);
 50 | 
 51 |     int h = get_global_id(1); // row index, checked against H below
 52 |     int c = get_global_id(2); // used as the channel index when computing idx below
 53 | 
 54 |     const int MAX_LOCAL_SIZE = 8; // mbuf/vbuf are indexed by get_local_id(1); assumes get_local_size(1) <= 8 - TODO confirm against the host work sizes
 55 | 
 56 |     __local float mbuf[MAX_LOCAL_SIZE]; // per-work-item row sums
 57 |     __local float vbuf[MAX_LOCAL_SIZE]; // per-work-item row sums of squares
 58 | 
 59 |     mbuf[get_local_id(1)] = 0;
 60 |     vbuf[get_local_id(1)] = 0;
 61 | 
 62 |     if (h < H) { // work-items at or beyond row H leave the zeros stored above
 63 |         float sum  = 0.f;
 64 |         float sum2 = 0.f;
 65 | 
 66 |         float8 sum4  = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f}; // 8-lane accumulators, despite the 4 in the names
 67 |         float8 sum24 = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
 68 | 
 69 |         const __local half8 *restrict lsrc = ((const __local half8 *)(src_line + get_local_id(1) * W));
 70 | 
 71 |         #pragma unroll 16
 72 |         for (size_t w = 0; w < W / 8; w++) { // vector path: 8 elements per iteration, accumulated in float
 73 |             half8 sh    = lsrc[w];
 74 |             float8 valf = convert_float8(sh);
 75 | 
 76 |             sum4 += valf;
 77 |             sum24 += valf * valf;
 78 |         }
 79 | 
 80 |         for (size_t w = W / 8 * 8; w < W; w++) { // scalar tail: the remaining W % 8 elements
 81 |             float val = (float)src_line[get_local_id(1) * W + w];
 82 |             sum += val;
 83 |             sum2 += val * val;
 84 |         }
 85 | 
 86 |         mbuf[get_local_id(1)] = sum4.s0 + sum4.s1 + sum4.s2 + sum4.s3 + sum4.s4 + sum4.s5 + sum4.s6 + sum4.s7 + sum; // horizontal add of the lanes plus the tail
 87 |         vbuf[get_local_id(1)] =
 88 |             sum24.s0 + sum24.s1 + sum24.s2 + sum24.s3 + sum24.s4 + sum24.s5 + sum24.s6 + sum24.s7 + sum2;
 89 |     }
 90 | 
 91 |     barrier(CLK_LOCAL_MEM_FENCE);
 92 | 
 93 |     if (get_local_id(1) == 0) { // the first work-item along dimension 1 folds the per-row results
 94 |         float res  = 0;
 95 |         float res2 = 0;
 96 | 
 97 |         for (int i = 0; i < get_local_size(1); i++) {
 98 |             res += mbuf[i];
 99 |             res2 += vbuf[i];
100 |         }
101 | 
102 | // requires memory reset before layer execution
103 | #if USE_ATOMICS
104 |         int idx = (across_channels == 0) ? c : 0;
105 | 
106 |         atomic_add_global(mean + idx, res);
107 |         atomic_add_global(variance + idx, res2);
108 | #else
109 |         int idx = c * get_num_groups(1) + get_group_id(1); // one slot per (c, work-group along dimension 1)
110 | 
111 |         mean[idx]     = res;
112 |         variance[idx] = res2;
113 | #endif
114 |     }
115 | }
116 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/mvn_scale.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:527d82ec9e71bbbfaf86c1e3f1b2beea02875cf719a45592b1f3d5e244e5c15c
3 | size 3564
4 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/mvn_scale.cl:
--------------------------------------------------------------------------------
 1 | // Copyright (C) 2018-2022 Intel Corporation
 2 | // SPDX-License-Identifier: Apache-2.0
 3 | //
 4 | 
 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
 7 | 
 8 | // Set to 1 only if output is zeroed before kernel execution
 9 | #define USE_ATOMICS 0
10 | 
11 | __attribute__((reqd_work_group_size(1, 1, 1))) __kernel void mvn_scale( // Normalizes one W-element row per work-group using partial sums read from mean_part / power_mean.
12 |     const __global half *restrict src,
13 |     __global float *restrict mean_part, // partial sums of values
14 |     __global float *restrict power_mean, // partial sums of squared values
15 |     __global half *restrict dst,
16 |     int W,
17 |     int H1, // not read in this kernel body; H is taken from get_global_size(1) instead
18 |     int across_channels,
19 |     int normalize_variance,
20 |     int nparts)
21 | {
22 |     __local half src_line[4 * 1024];
23 |     __local half dst_line[4 * 1024];
24 | 
25 |     int c = get_group_id(2);
26 |     int C = get_global_size(2);
27 | 
28 |     int h = get_group_id(1);
29 |     int H = get_global_size(1);
30 | 
31 |     event_t e1 = async_work_group_copy(src_line, src + c * H * W + h * W, W, 0); // stage row h of plane c
32 |     wait_group_events(1, &e1);
33 | 
34 |     int idx     = (across_channels == 0) ? nparts * c : 0; // first partial-sum slot to read
35 |     float scale = (across_channels == 0) ? H * W : H * W * C; // number of elements the statistics cover
36 | 
37 | #if USE_ATOMICS
38 |     float mean     = mean_part[idx];
39 |     float variance = power_mean[idx];
40 | #else
41 | 
42 |     int total      = (across_channels == 0) ? nparts : nparts * C; // number of partial sums to fold together
43 |     float mean     = 0.f;
44 |     float variance = 0.f;
45 | 
46 |     for (int i = 0; i < total; i++) {
47 |         mean += mean_part[idx + i];
48 |         variance += power_mean[idx + i];
49 |     }
50 | #endif
51 | 
52 |     mean     = mean / scale;
53 |     variance = variance / scale; // mean of squares
54 |     variance = variance - mean * mean; // E[x^2] - E[x]^2
55 |     variance = native_sqrt(variance) + 1e-9f; // from here on this holds the standard deviation plus epsilon; NOTE(review): no guard if rounding made the difference negative
56 | 
57 |     half hmean     = mean;
58 |     half hvariance = (normalize_variance == 0) ? 1.f : (1.f / variance); // multiplier applied to the centered values (1 when normalization is off)
59 | 
60 |     for (size_t w = 0; w < W; w++) {
61 |         dst_line[w] = (src_line[w] - hmean) * hvariance;
62 |     }
63 | 
64 |     barrier(CLK_LOCAL_MEM_FENCE);
65 | 
66 |     event_t e2 = async_work_group_copy(dst + c * H * W + h * W, dst_line, W, 0); // write the normalized row back to the same position
67 |     wait_group_events(1, &e2);
68 | }
69 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/region_chw.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:5c02ada5f9718e59c1c908799a77ab383e6ca333d46c9577608bdb9c3bf15388
3 | size 22828
4 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/region_chw.cl:
--------------------------------------------------------------------------------
  1 | // Copyright (C) 2018-2022 Intel Corporation
  2 | // SPDX-License-Identifier: Apache-2.0
  3 | //
  4 | 
  5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
  6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
  7 | 
  8 | __constant static half log_2_e = (half)1.442695040888963; // log2(exp(1.0))
  9 | 
 10 | #define ALLOW_EARLY_RETURN 1
 11 | 
 12 | static void inline logistic_activate(__local const half *restrict src, __local half *restrict dst, int offset)
 13 | {
 14 |     // Logistic sigmoid 1 / (1 + e^-x) of src[offset]; e^-x is evaluated as exp2(x * -log2(e)).
 15 |     const half decay = exp2(src[offset] * -log_2_e);
 16 |     dst[offset]      = 1.0h / (1.0h + decay);
 17 | }
 18 | // region_chw: each work-group stages one box (selected by get_group_id(1)) in local memory, activates it per pixel, and copies it back out.
 19 | __kernel void region_chw(
 20 |     __global const half *restrict src_data, // input; the box for group g starts at g * box_sz
 21 |     __global half *restrict dst_data, // output; written at the same per-box offset
 22 |     int W, // W * H is the element count of one plane
 23 |     int H,
 24 |     int classes, // number of class planes after the first coords + 1 planes
 25 |     int coords, // the class planes start at plane index coords + 1
 26 |     int num, // not referenced in this kernel
 27 |     int maskSize, // not referenced in this kernel
 28 |     int doSoftmax) // non-zero: softmax across the class planes; zero: logistic on each class plane
 29 | {
 30 |     __local half local_src[13 * 13 * (4 + 1 + 80)]; // fixed-capacity staging buffers; box_sz is not checked against this size
 31 |     __local half local_dst[13 * 13 * (4 + 1 + 80)];
 32 |     // Stage this group's box in local_src and wait until the copy has completed.
 33 |     const int box_sz = W * H * (classes + coords + 1);
 34 |     event_t e1       = async_work_group_copy(local_src, src_data + get_group_id(1) * box_sz, box_sz, 0);
 35 |     wait_group_events(1, &e1);
 36 | 
 37 |     const int pixel_pos = get_local_id(0); // one work-item per pixel position
 38 |     const int stride    = W * H; // distance between the same pixel in consecutive planes
 39 |     // With ALLOW_EARLY_RETURN set, work-items beyond the last pixel skip the compute block.
 40 | #if ALLOW_EARLY_RETURN
 41 |     if (pixel_pos < W * H)
 42 | #endif
 43 |     {
 44 |         __local const half *restrict src = local_src + pixel_pos;
 45 |         __local half *restrict dst       = local_dst + pixel_pos;
 46 |         // Planes 0 and 1 get the logistic activation.
 47 |         logistic_activate(src, dst, 0 * stride);
 48 |         logistic_activate(src, dst, 1 * stride);
 49 | 
 50 |         // Planes 2 and 3 are copied through unchanged.
 51 |         dst[2 * stride] = src[2 * stride];
 52 |         dst[3 * stride] = src[3 * stride];
 53 |         // Plane 4 gets the logistic activation (the index is hard-coded, independent of coords).
 54 |         logistic_activate(src, dst, 4 * stride);
 55 |         // Advance both pointers past the first coords + 1 planes, to the class planes.
 56 |         src += (coords + 1) * stride;
 57 |         dst += (coords + 1) * stride;
 58 | 
 59 |         if (doSoftmax) {
 60 |             half max_val = src[0]; // running maximum over the class planes, subtracted before exponentiation
 61 |             #pragma unroll 4
 62 |             for (int c = 1; c < classes; c++) {
 63 |                 max_val = max(max_val, src[c * stride]);
 64 |             }
 65 |             // Store exp(x - max_val), computed through exp2, and accumulate the sum.
 66 |             half expSum = 0.0h;
 67 |             #pragma unroll 4
 68 |             for (int c = 0; c < classes; c++) {
 69 |                 const half e    = src[c * stride] - max_val;
 70 |                 const half tmp  = exp2(e * log_2_e);
 71 |                 dst[c * stride] = tmp;
 72 |                 expSum += tmp;
 73 |             }
 74 |             // Scale the stored values by 1 / expSum: four classes per iteration, then the remaining ones.
 75 |             const half recip = 1.h / expSum;
 76 |             int c            = 0;
 77 |             for (; c < (classes & ~0x3); c += 4) {
 78 |                 const half t0 = dst[(c + 0) * stride];
 79 |                 const half t1 = dst[(c + 1) * stride];
 80 |                 const half t2 = dst[(c + 2) * stride];
 81 |                 const half t3 = dst[(c + 3) * stride];
 82 | 
 83 |                 const half e0 = t0 * recip;
 84 |                 const half e1 = t1 * recip;
 85 |                 const half e2 = t2 * recip;
 86 |                 const half e3 = t3 * recip;
 87 | 
 88 |                 dst[(c + 0) * stride] = e0;
 89 |                 dst[(c + 1) * stride] = e1;
 90 |                 dst[(c + 2) * stride] = e2;
 91 |                 dst[(c + 3) * stride] = e3;
 92 |             }
 93 |             for (; c < classes; c++) {
 94 |                 dst[c * stride] *= recip;
 95 |             }
 96 |         } else {
 97 |             #pragma unroll 4
 98 |             for (int c = 0; c < classes; c++) {
 99 |                 logistic_activate(src, dst, c * stride);
100 |             }
101 |         }
102 |     }
103 |     // All local_dst writes must be complete before the group copies the box out.
104 |     barrier(CLK_LOCAL_MEM_FENCE);
105 | 
106 |     event_t e2 = async_work_group_copy(dst_data + get_group_id(1) * box_sz, local_dst, box_sz, 0);
107 |     wait_group_events(1, &e2);
108 | }
109 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/region_hwc.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:88f7bb144d85d08e9f7879e43ed6f8722bb2f93534e5becd8c7ff2a220cdd9f3
3 | size 81896
4 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/region_hwc.cl:
--------------------------------------------------------------------------------
  1 | // Copyright (C) 2018-2022 Intel Corporation
  2 | // SPDX-License-Identifier: Apache-2.0
  3 | //
  4 | 
  5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
  6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
  7 | 
  8 | __constant static half log_2_e = (half)1.442695040888963; // log2(exp(1.0))
  9 | 
 10 | #define ALLOW_EARLY_RETURN 1
 11 | 
 12 | static void inline logistic_activate_hwc(
 13 |     __local const half *restrict src,
 14 |     __local half *restrict dst,
 15 |     int offset,
 16 |     int stride)
 17 | {
 18 |     // Logistic sigmoid of src[offset], stored at dst[offset * stride]; e^-x is evaluated as exp2(-x * log2(e)).
 19 |     const half x         = src[offset];
 20 |     dst[offset * stride] = 1.0h / (1.0h + exp2(x * -log_2_e));
 21 | }
 22 | // region_hwc: per-box logistic/softmax activation; reads local_C consecutive values per pixel and writes them out as planes of H * W elements.
 23 | __kernel void region_hwc(
 24 |     __global const half *restrict src,
 25 |     __global half *restrict dst,
 26 |     int W,
 27 |     int H,
 28 |     int classes, // number of class values after the first coords + 1 values of a pixel
 29 |     int coords, // the class values start at index coords + 1
 30 |     int num, // multiplier for local_C in the source channel total when doSoftmax != 0
 31 |     int maskSize, // used in place of num when doSoftmax == 0
 32 |     int doSoftmax) // non-zero: softmax across the classes; zero: logistic on each class
 33 | {
 34 |     __local half local_src[13 * 13 * (4 + 1 + 80)]; // fixed-capacity staging buffers; no size check is performed
 35 |     __local half local_dst[13 * 13 * (4 + 1 + 80)];
 36 | 
 37 |     const int pixel_pos = get_local_id(0);
 38 | 
 39 |     const int local_C = classes + coords + 1; // values per pixel handled by one work-group
 40 |     const int c       = get_group_id(1) * local_C; // first source channel of this group's box
 41 |     const int h       = get_group_id(0);
 42 |     // Branch-free select of num (softmax) or maskSize (no softmax); C is the channel total used to address src.
 43 |     num         = (doSoftmax != 0) * num + (doSoftmax == 0) * maskSize;
 44 |     const int C = local_C * num;
 45 |     // Strided copy: H * W lines of local_C elements, skipping C - local_C source elements between lines, packed in local_src.
 46 |     event_t e1 = async_work_group_copy_2D2D(
 47 |         local_src, // dst
 48 |         src + h * W * C + c, // src
 49 |         local_C, // num_elements_per_line,
 50 |         H * W, // num_lines,
 51 |         C - local_C, // src_line_stride,
 52 |         0, // dst_line_stride,
 53 |         0);
 54 | 
 55 |     wait_group_events(1, &e1);
 56 |     // With ALLOW_EARLY_RETURN set, work-items beyond the last pixel skip the compute block.
 57 | #if ALLOW_EARLY_RETURN
 58 |     if (pixel_pos < W * H)
 59 | #endif
 60 |     {
 61 |         const int w = pixel_pos % W;
 62 |         const int h = pixel_pos / W; // shadows the outer h (group id)
 63 |         // Inside this block src/dst name the local staging buffers and shadow the __global kernel parameters.
 64 |         __local const half *restrict src = local_src + h * W * local_C + w * local_C;
 65 |         __local half *restrict dst       = local_dst + h * W + w;
 66 | 
 67 |         const int stride = H * W; // distance between output planes
 68 |         logistic_activate_hwc(src, dst, 0, stride);
 69 |         logistic_activate_hwc(src, dst, 1, stride);
 70 | 
 71 |         // Channels 2 and 3 are copied through unchanged.
 72 |         dst[2 * stride] = src[2];
 73 |         dst[3 * stride] = src[3];
 74 |         // Channel 4 gets the logistic activation (the index is hard-coded, independent of coords).
 75 |         logistic_activate_hwc(src, dst, 4, stride);
 76 |         // Skip the first coords + 1 channels: src advances by elements, dst by whole planes.
 77 |         src += coords + 1;
 78 |         dst += (coords + 1) * stride;
 79 | 
 80 |         if (doSoftmax) {
 81 |             half max_val = src[0]; // running maximum over the classes, subtracted before exponentiation
 82 |             #pragma unroll 4
 83 |             for (int c = 1; c < classes; c++) {
 84 |                 max_val = max(max_val, src[c]);
 85 |             }
 86 |             // Store exp(x - max_val), computed through exp2, and accumulate the sum.
 87 |             half expSum = 0.0h;
 88 |             #pragma unroll 4
 89 |             for (int c = 0; c < classes; c++) {
 90 |                 const half e    = src[c] - max_val;
 91 |                 const half tmp  = exp2(e * log_2_e);
 92 |                 dst[c * stride] = tmp;
 93 |                 expSum += tmp;
 94 |             }
 95 |             // Scale each stored exponential by 1 / expSum.
 96 |             const half invExpSum = 1.0h / expSum;
 97 |             #pragma unroll 4
 98 |             for (int c = 0; c < classes; c++) {
 99 |                 dst[c * stride] *= invExpSum;
100 |             }
101 |         } else {
102 |             #pragma unroll 4
103 |             for (int c = 0; c < classes; c++) {
104 |                 logistic_activate_hwc(src, dst, c, stride);
105 |             }
106 |         }
107 |     }
108 |     // All local_dst writes must be complete before the group copies the box out.
109 |     barrier(CLK_LOCAL_MEM_FENCE);
110 |     // Here dst is the __global parameter again; the local alias above is out of scope.
111 |     const int box_sz = W * H * (classes + coords + 1);
112 |     event_t e2       = async_work_group_copy(dst + get_group_id(1) * box_sz, local_dst, box_sz, 0);
113 |     wait_group_events(1, &e2);
114 | }
115 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/reorg_chw.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:16758c3a629f5e397b7b51d417686fd745603c119e1f5d9985b05f4f3ef7efc7
3 | size 12208
4 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/reorg_chw.cl:
--------------------------------------------------------------------------------
 1 | // Copyright (C) 2018-2022 Intel Corporation
 2 | // SPDX-License-Identifier: Apache-2.0
 3 | //
 4 | 
 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
 7 | // reorg_chw: stages lines of W * stride elements in local memory; each work-item gathers every stride-th element of one line into a contiguous row of W outputs.
 8 | __kernel void reorg_chw(
 9 |     __global const half *restrict src,
10 |     __global half *restrict dst,
11 |     int W, // output row length; one staged source line holds W * stride elements
12 |     int H, // not referenced in this kernel
13 |     int C, // not referenced in this kernel
14 |     int stride) // gather step inside a staged line
15 | {
16 |     __local half local_src[8 * 1024]; // fixed-capacity staging buffers; no check against W, stride or the work-group size
17 |     __local half local_dst[8 * 1024];
18 |     // Load get_local_size(0) lines of W * stride elements each into local_src, packed back to back.
19 |     event_t e1 = async_work_group_copy_2D2D(
20 |         local_src, // dst
21 |         src + get_group_id(1) * W * stride
22 |             + get_group_id(0) * W * stride * stride, // src
23 |         W * stride, // num_elements_per_line,
24 |         get_local_size(0), // num_lines,
25 |         W * stride * (stride * get_num_groups(0) - 1), // src_line_stride,
26 |         0, // dst_line_stride,
27 |         0);
28 |     wait_group_events(1, &e1);
29 | 
30 |     const int c        = get_local_id(0); // index of the staged line this work-item reads
31 |     const int stride_x = get_local_id(1); // starting offset inside each group of stride elements
32 | 
33 |     const int srcIdx = stride_x + c * W * stride;
34 |     const int dstIdx = stride_x * W * get_local_size(0) + c * W;
35 |     // Main loop: eight gathered elements per iteration, written with a single half8 store.
36 |     int x = 0;
37 |     for (; x <= W - 8; x += 8) {
38 |         half8 data = (half8){
39 |             local_src[srcIdx + (x + 0) * stride],
40 |             local_src[srcIdx + (x + 1) * stride],
41 |             local_src[srcIdx + (x + 2) * stride],
42 |             local_src[srcIdx + (x + 3) * stride],
43 |             local_src[srcIdx + (x + 4) * stride],
44 |             local_src[srcIdx + (x + 5) * stride],
45 |             local_src[srcIdx + (x + 6) * stride],
46 |             local_src[srcIdx + (x + 7) * stride]};
47 |         // NOTE(review): the half8 store assumes &local_dst[dstIdx + x] is suitably aligned — confirm.
48 |         *((__local half8 *)(&local_dst[dstIdx + x])) = data;
49 |     }
50 |     // Tail: the remaining W % 8 elements, one at a time.
51 |     for (; x < W; x++) {
52 |         local_dst[dstIdx + x] = local_src[srcIdx + x * stride];
53 |     }
54 |     // All local_dst writes must be complete before the group copies the rows out.
55 |     barrier(CLK_LOCAL_MEM_FENCE);
56 |     // Write get_local_size(0) * stride rows of W elements to dst, W * (get_num_groups(0) - 1) elements apart.
57 |     event_t e2 = async_work_group_copy_2D2D(
58 |         dst + get_group_id(0) * W
59 |             + get_group_id(1) * W * stride * get_global_size(0), // dst
60 |         local_dst, // src
61 |         W, // num_elements_per_line
62 |         get_local_size(0) * stride, // num_lines
63 |         0, // src_line_stride
64 |         W * (get_num_groups(0) - 1), // dst_line_stride
65 |         0);
66 |     wait_group_events(1, &e2);
67 | }
68 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/reorg_hwc.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:d57494f39baecc4011f87ab5241c1b0a19a07a7bbd14e20b135a94a0d7ecb3c1
3 | size 42144
4 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/reorg_hwc.cl:
--------------------------------------------------------------------------------
 1 | // Copyright (C) 2018-2022 Intel Corporation
 2 | // SPDX-License-Identifier: Apache-2.0
 3 | //
 4 | 
 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
 7 | // reorg_hwc: stages H * W / stride lines of stride elements in local memory, permutes them, and writes them back with the same line geometry.
 8 | __kernel void reorg_hwc(
 9 |     __global half const *restrict src,
10 |     __global half *restrict dst,
11 |     int W,
12 |     int H,
13 |     int C, // used only for global addressing (group offset and line stride)
14 |     int stride) // line length of the staged data; also the block size of the permutation
15 | {
16 |     __local half local_src[8 * 1024]; // fixed-capacity staging buffers; no check against H * W
17 |     __local half local_dst[8 * 1024];
18 |     // Load H * W / stride lines of stride elements, skipping (C - 1) * stride source elements between lines.
19 |     event_t e1 = async_work_group_copy_2D2D(
20 |         local_src, // dst
21 |         src + get_group_id(0) * stride + get_group_id(1) * C, // src
22 |         stride, // num_elements_per_line
23 |         H * W / stride, // num_lines
24 |         (C - 1) * stride, // src_line_stride
25 |         0, // dst_line_stride
26 |         0);
27 |     wait_group_events(1, &e1);
28 | 
29 |     const int stride_y = get_local_id(1);
30 |     const int blocks   = get_local_size(0);
31 |     const int b        = get_local_id(0);
32 |     // Dimensions used to index the staged tile on the output (O*) and input (I*) side.
33 |     const int OC = stride * stride;
34 |     const int OH = H / stride;
35 |     const int OW = W / stride;
36 |     const int IC = stride;
37 |     const int IH = H;
38 |     const int IW = W / stride;
39 |     // Each work-item handles stride source lines; a line index is split into (c, h) by IH and into (oc, oh) by OH.
40 |     for (int block_h = 0; block_h < stride; block_h++) {
41 |         const int src_line = b * stride * stride + stride_y * stride + block_h;
42 |         const int c        = src_line / IH;
43 |         const int h        = src_line % IH;
44 | 
45 |         const int dst_line = b * stride + stride_y * blocks * stride + block_h;
46 |         const int oc       = dst_line / OH;
47 |         const int oh       = dst_line % OH;
48 |         // Copy one line, moving each element from its input position to its output position.
49 |         for (int w = 0; w < W / stride; w++) {
50 |             local_dst[oh * OW * OC + w * OC + oc] = local_src[h * IW * IC + w * IC + c];
51 |         }
52 |     }
53 |     // All local_dst writes must be complete before the group copies the tile out.
54 |     barrier(CLK_LOCAL_MEM_FENCE);
55 |     // Write the tile back with the same line length and line stride as the load.
56 |     event_t e2 = async_work_group_copy_2D2D(
57 |         dst + get_group_id(1) * C + get_group_id(0) * stride, // dst
58 |         local_dst, // src
59 |         stride, // num_elements_per_line
60 |         W * H / stride, // num_lines
61 |         0, // src_line_stride
62 |         C * stride - stride, // dst_line_stride
63 |         0);
64 |     wait_group_events(1, &e2);
65 | }
66 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/reorg_hwc_naive.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:14d3b747218694b644afe03e205955c1a5be042b2d7e62d261973a4ea1b8aaa8
3 | size 13396
4 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/reorg_hwc_naive.cl:
--------------------------------------------------------------------------------
 1 | // Copyright (C) 2018-2022 Intel Corporation
 2 | // SPDX-License-Identifier: Apache-2.0
 3 | //
 4 | 
 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 6 | // reorg_hwc_naive: work-item c = get_global_id(0) copies H * W elements one at a time from src to dst with recomputed indices; no local staging.
 7 | __kernel void reorg_hwc_naive(
 8 |     __global half const *restrict src,
 9 |     __global half *restrict dst,
10 |     int W,
11 |     int H,
12 |     int C,
13 |     int stride)
14 | {
15 |     const int out_c = C / (stride * stride); // divisor that splits c into (c2, offset) below
16 |     const int oc    = C * (stride * stride); // multiplier for new_x (and, with ow, new_y) in new_index
17 |     const int oh    = H / stride;
18 |     const int ow    = W / stride;
19 | 
20 |     const int c = get_global_id(0);
21 | 
22 |     for (int h = 0; h < H; ++h) {
23 |         int in_index  = W * (h + H * c) + (0); // equals c * H * W + h * W: first element of row h for this c
24 |         int new_z     = in_index / (oh * ow);
25 |         int new_y     = (in_index % (oh * ow)) / ow;
26 |         int new_x     = (in_index % (oh * ow)) % ow;
27 |         int new_index = new_z + new_x * oc + new_y * oc * ow; // dst index of the first element of the row
28 | 
29 |         in_index++;
30 |         // Split c into (c2, offset) and build the starting read position for this row.
31 |         int c2        = c % out_c;
32 |         int offset    = c / out_c;
33 |         int w2        = 0 * stride + offset % stride;
34 |         int h2        = h * stride + offset / stride;
35 |         int out_index = w2 + W * stride * (h2 + H * stride * c2);
36 |         // Walk the row: the read position advances by stride, in_index by one.
37 |         #pragma unroll 2
38 |         for (int i = 0; i < W; ++i, out_index += stride, in_index++) {
39 |             // repacking coordinates: split out_index into (k0, j0, i0) by H * W and W, then re-linearize with C as the innermost extent
40 |             int k0               = out_index / (H * W);
41 |             int j0               = (out_index % (H * W)) / W;
42 |             int i0               = (out_index % (H * W)) % W;
43 |             int out_index_repack = k0 + C * i0 + C * W * j0;
44 | 
45 |             dst[new_index] = src[out_index_repack];
46 |             // dst index for the next element; these locals shadow the outer new_z / new_y / new_x.
47 |             int new_z = in_index / (oh * ow);
48 |             int new_y = (in_index % (oh * ow)) / ow;
49 |             int new_x = (in_index % (oh * ow)) % ow;
50 |             new_index = new_z + new_x * oc + new_y * oc * ow;
51 |         }
52 |     }
53 | }
54 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/resample_AA.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:98ca582aaa70d3e7cc339aba88150eb12839ab67a0fac9c563f8c8ea37e705e2
3 | size 67860
4 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/resample_AA.cl:
--------------------------------------------------------------------------------
  1 | // Copyright (C) 2018-2022 Intel Corporation
  2 | // SPDX-License-Identifier: Apache-2.0
  3 | //
  4 | 
  5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
  6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
  7 | 
  8 | #define USE_OPTIMIZED_ROUND
  9 | 
 10 | #ifdef USE_OPTIMIZED_ROUND
 11 | #define ROUND(x) ((int)((x) + 0.5f))
 12 | #else
 13 | #define ROUND(x) (int)(round(x))
 14 | #endif
 15 | // out_to_in: maps output coordinate ox to an input index for scale f; with USE_OPTIMIZED_ROUND the int cast does the rounding.
 16 | inline int out_to_in(float ox, float f)
 17 | {
 18 | #ifdef USE_OPTIMIZED_ROUND
 19 |     return (int)((ox + 0.5f) / f); // same value ROUND(v - 0.5f) yields when ROUND(x) is (int)(x + 0.5f)
 20 | #else
 21 |     return ROUND((ox + 0.5f) / f - 0.5f);
 22 | #endif
 23 | }
 24 | // triangleCoeff: triangle filter weight 1 - |x|; the result is not clamped, so it is negative for |x| > 1.
 25 | static inline float triangleCoeff(float x) { return 1.0f - fabs(x); }
 26 | // triangleCoeff4: component-wise 1 - |x| on four lanes; not clamped.
 27 | static inline float4 triangleCoeff4(float4 x) { return 1.0f - fabs(x); }
 28 | // resample_with_antialias: triangle-weighted resampling of input rows staged in local memory; each work-item produces all ow columns of one output row.
 29 | __kernel void resample_with_antialias(
 30 |     __global const half *restrict src,
 31 |     __global half *restrict dst,
 32 |     int iw, // input row length
 33 |     int ih, // input row count per image; ih * iw separates consecutive images in src
 34 |     float factor, // input coordinate = (output coordinate + 0.5) / factor - 0.5
 35 |     int ow, // output row length
 36 |     int oh, // not referenced in this kernel
 37 |     int channels) // not referenced in this kernel
 38 | {
 39 |     __local half local_src[20 * 1024]; // fixed-capacity staging buffers; tile sizes are not checked against them
 40 |     __local half local_dst[8 * 1024];
 41 | 
 42 |     const int r        = (factor > 1.0f) ? 2 : ceil(1.0f / factor); // filter radius in input pixels
 43 |     const int oy_first = get_group_id(1) * get_local_size(1);
 44 |     const int oy_last  = (get_group_id(1) + 1) * get_local_size(1) - 1;
 45 |     const int iy_first = max(out_to_in(oy_first, factor) - r, 0);
 46 |     const int iy_last  = min(out_to_in(oy_last, factor) + r, ih - 1);
 47 |     const int iy_size  = iy_last - iy_first + 1;
 48 |     // Stage input rows [iy_first, iy_last] of get_local_size(2) consecutive ih * iw images, packed back to back.
 49 |     event_t e1 = async_work_group_copy_2D2D(
 50 |         local_src, // dst
 51 |         src + get_group_id(2) * get_local_size(2) * ih * iw + iy_first * iw, // src
 52 |         iy_size * iw, // num_elements_per_line,
 53 |         get_local_size(2), // num_lines,
 54 |         (ih - iy_size) * iw, // src_line_stride,
 55 |         0, // dst_line_stride,
 56 |         0);
 57 |     wait_group_events(1, &e1);
 58 |     // iy_f is this work-item's source row position relative to the first staged row.
 59 |     const int oy     = get_global_id(1);
 60 |     const float iy_f = ((oy + 0.5f) / factor - 0.5f) - iy_first;
 61 |     const int iy     = ROUND(iy_f); // not used below
 62 |     // NOTE(review): both bases include a get_local_id(1) row offset while y below spans the whole staged tile — confirm the launch uses get_local_size(1) == 1.
 63 |     __local half const *restrict start_src =
 64 |         local_src + iw * get_local_id(1) + iw * iy_size * get_local_id(2);
 65 |     __local half *restrict start_dst =
 66 |         local_dst + ow * get_local_id(1) + ow * get_local_size(1) * get_local_id(2);
 67 | 
 68 |     for (int ox = 0; ox < ow; ox++) {
 69 |         const float ix_f = (float)((ox + 0.5f) / factor) - 0.5f;
 70 |         const int ix_i   = ROUND(ix_f);
 71 |         // Weighted sum and weight sum, kept in four lanes and reduced after the loops.
 72 |         float4 v_sum  = 0.f;
 73 |         float4 v_wsum = 0.f;
 74 |         for (int y = 0; y < iy_size; y++) {
 75 |             float dy  = iy_f - y;
 76 |             int x     = max(ix_i - r, 0);
 77 |             int end_x = min(ix_i + r, iw - 1);
 78 |             // dx holds the horizontal distances for the four consecutive taps x .. x + 3.
 79 |             float4 dx;
 80 |             for (int i = 0; i < 4; i++) dx[i] = ix_f - x - i;
 81 |             // Vector path: four taps per iteration.
 82 |             for (; x < end_x - 3; x += 4, dx -= 4) {
 83 |                 float4 w =
 84 |                     factor * triangleCoeff4(factor * dx) * factor * triangleCoeff(factor * dy);
 85 |                 float4 src_vec = {
 86 |                     start_src[y * iw + x + 0],
 87 |                     start_src[y * iw + x + 1],
 88 |                     start_src[y * iw + x + 2],
 89 |                     start_src[y * iw + x + 3]};
 90 | 
 91 |                 v_sum += w * src_vec;
 92 |                 v_wsum += w;
 93 |             }
 94 |             // Remaining taps up to and including end_x accumulate into lane 0.
 95 |             for (; x <= end_x; x++) {
 96 |                 float dx = ix_f - x;
 97 |                 float w = factor * triangleCoeff(factor * dx) * factor * triangleCoeff(factor * dy);
 98 | 
 99 |                 v_sum[0] += w * start_src[y * iw + x];
100 |                 v_wsum[0] += w;
101 |             }
102 |         }
103 |         // Reduce the four lanes into lane 0.
104 |         v_sum[0]  = v_sum[0] + v_sum[1] + v_sum[2] + v_sum[3];
105 |         v_wsum[0] = v_wsum[0] + v_wsum[1] + v_wsum[2] + v_wsum[3];
106 |         // Normalize; a zero weight sum yields 0. NOTE(review): start_dst already contains ow * get_local_id(1) and the index adds it again — confirm.
107 |         start_dst[get_local_id(1) * ow + ox] = (!v_wsum[0]) ? 0.0f : (half)(v_sum[0] / v_wsum[0]);
108 |     }
109 |     // All local_dst writes must be complete before the group copies the rows out.
110 |     barrier(CLK_LOCAL_MEM_FENCE);
111 |     // Write get_local_size(1) rows for each of the get_local_size(2) images of this group.
112 |     event_t e2 = async_work_group_copy_2D2D(
113 |         dst + get_group_id(2) * get_local_size(2) * get_global_size(1) * ow
114 |             + get_group_id(1) * get_local_size(1) * ow, // dst
115 |         local_dst, // src
116 |         get_local_size(1) * ow, // num_elements_per_line,
117 |         get_local_size(2), // num_lines,
118 |         0, // src_line_stride,
119 |         (get_global_size(1) - get_local_size(1)) * ow, // dst_line_stride,
120 |         0);
121 |     wait_group_events(1, &e2);
122 | }
123 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/resample_noAA.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:9c67917cd959fe4add69b139f44ef07e36c4ca37bc2b8a47b2bdfe48e8a3f559
3 | size 68828
4 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/resample_noAA.cl:
--------------------------------------------------------------------------------
  1 | // Copyright (C) 2018-2022 Intel Corporation
  2 | // SPDX-License-Identifier: Apache-2.0
  3 | //
  4 | 
  5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
  6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
  7 | 
  8 | #define USE_OPTIMIZED_ROUND
  9 | 
 10 | #ifdef USE_OPTIMIZED_ROUND
 11 | #define ROUND(x) ((int)((x) + 0.5f))
 12 | #else
 13 | #define ROUND(x) (int)(round(x))
 14 | #endif
 15 | // out_to_in: multiplies (ox + 0.5) by f and converts the product to int with a cast.
 16 | inline int out_to_in(float ox, float f) { return (int)((ox + 0.5f) * f); }
 17 | // interpolationCHW_nn: for each of C rows, fills OW output columns in pdst with the value at the rounded, clamped source column of psrc (row length IW).
 18 | void interpolationCHW_nn(__local half *psrc, __local half *pdst, int OW, int IW, int C, float rw, float rh)
 19 | {
 20 |     float alpha = rh / 2.0f - 0.5f; // offset added to every column position; derived from rh (the visible caller passes rw == rh)
 21 |     // Main loop: eight output columns per iteration.
 22 |     for (int w = 0; w < OW / 8; w++) {
 23 |         float fw0 = rw * (w * 8 + 0) + alpha;
 24 |         float fw1 = rw * (w * 8 + 1) + alpha;
 25 |         float fw2 = rw * (w * 8 + 2) + alpha;
 26 |         float fw3 = rw * (w * 8 + 3) + alpha;
 27 | 
 28 |         float fw4 = rw * (w * 8 + 4) + alpha;
 29 |         float fw5 = rw * (w * 8 + 5) + alpha;
 30 |         float fw6 = rw * (w * 8 + 6) + alpha;
 31 |         float fw7 = rw * (w * 8 + 7) + alpha;
 32 |         // Round each position to a source column and clamp it to the last column.
 33 |         int iw0 = min((int)ROUND(fw0), IW - 1);
 34 |         int iw1 = min((int)ROUND(fw1), IW - 1);
 35 |         int iw2 = min((int)ROUND(fw2), IW - 1);
 36 |         int iw3 = min((int)ROUND(fw3), IW - 1);
 37 | 
 38 |         int iw4 = min((int)ROUND(fw4), IW - 1);
 39 |         int iw5 = min((int)ROUND(fw5), IW - 1);
 40 |         int iw6 = min((int)ROUND(fw6), IW - 1);
 41 |         int iw7 = min((int)ROUND(fw7), IW - 1);
 42 |         // Gather eight source values per row and write them with one half8 store.
 43 |         for (int c = 0; c < C; c++) {
 44 |             half8 val = {
 45 |                 *((__local half *)(psrc + c * IW + iw0)),
 46 |                 *((__local half *)(psrc + c * IW + iw1)),
 47 |                 *((__local half *)(psrc + c * IW + iw2)),
 48 |                 *((__local half *)(psrc + c * IW + iw3)),
 49 | 
 50 |                 *((__local half *)(psrc + c * IW + iw4)),
 51 |                 *((__local half *)(psrc + c * IW + iw5)),
 52 |                 *((__local half *)(psrc + c * IW + iw6)),
 53 |                 *((__local half *)(psrc + c * IW + iw7)),
 54 |             };
 55 |             *((__local half8 *)(pdst + c * OW + w * 8)) = val; // NOTE(review): assumes pdst + c * OW + w * 8 is suitably aligned for half8 — confirm
 56 |         }
 57 |     }
 58 |     // Tail: the remaining OW % 8 columns, one at a time.
 59 |     for (int w = OW / 8 * 8; w < OW; w++) {
 60 |         float fw = rw * w + alpha;
 61 |         int iw0  = min((int)ROUND(fw), IW - 1);
 62 | 
 63 |         for (int c = 0; c < C; c++) {
 64 |             *((__local half *)(pdst + c * OW + w)) = *((__local half *)(psrc + c * IW + iw0));
 65 |         }
 66 |     }
 67 | }
 68 | // resample_nearest: stages input rows for one work-group, resamples columns with interpolationCHW_nn, and writes the result rows to dst.
 69 | kernel void resample_nearest(
 70 |     __global const half *restrict src,
 71 |     __global half *restrict dst,
 72 |     int iw, // input row length
 73 |     int ih, // input row count per plane
 74 |     float factor, // its reciprocal is the scale passed to out_to_in and interpolationCHW_nn
 75 |     int ow, // output row length
 76 |     int oh, // not referenced in this kernel
 77 |     int channels) // number of planes copied and processed per work-group
 78 | {
 79 |     __local half local_src[14 * 1024]; // fixed-capacity staging buffers; tile sizes are not checked against them
 80 |     __local half local_dst[14 * 1024];
 81 |     // Output rows covered by this group and the matching input row range.
 82 |     const int oy_first = get_group_id(1) * get_local_size(1);
 83 |     const int oy_last  = (get_group_id(1) + 1) * get_local_size(1) - 1;
 84 |     const int iy_first = out_to_in(oy_first, 1.0 / factor); // unsuffixed 1.0: a double literal unless the build treats constants as single precision
 85 |     const int iy_last  = out_to_in(oy_last, 1.0 / factor);
 86 | 
 87 |     const int iy_size = iy_last - iy_first + 1;
 88 |     // Copy iy_size rows from each of the channels planes into local_src, packed back to back.
 89 |     event_t e1 = async_work_group_copy_2D2D(
 90 |         local_src, // dst
 91 |         src + get_group_id(2) * channels * ih * iw + iy_first * iw, // src
 92 |         iy_size * iw, // num_elements_per_line,
 93 |         channels, // num_lines,
 94 |         ih * iw - iy_size * iw, // src_line_stride,
 95 |         0, // dst_line_stride,
 96 |         0);
 97 | 
 98 |     wait_group_events(1, &e1);
 99 |     // NOTE(review): the helper indexes rows as c * iw, which matches the staged layout only when iy_size == 1; there is also no barrier before the copy-out — confirm.
100 |     interpolationCHW_nn(local_src, local_dst, ow, iw, channels, 1.0 / factor, 1.0 / factor);
101 | 
102 |     event_t e2 = async_work_group_copy_2D2D(
103 |         dst + get_group_id(2) * channels * get_global_size(1) * ow + get_group_id(1) * get_local_size(1) * ow, // dst
104 |         local_dst, // src
105 |         get_local_size(1) * ow, // size_t num_elements_per_line,
106 |         channels, // size_t num_lines,
107 |         0, // size_t src_line_stride,
108 |         get_global_size(1) * ow - get_local_size(1) * ow, // size_t dst_line_stride,
109 |         0);
110 | 
111 |     wait_group_events(1, &e2);
112 | }
113 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/shuffle_channels.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:230aa8e01a387beb4de512de1f3599867cc74dc36578359d78f5c856af9428cd
3 | size 10740
4 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/shuffle_channels.cl:
--------------------------------------------------------------------------------
 1 | // Copyright (C) 2018-2022 Intel Corporation
 2 | // SPDX-License-Identifier: Apache-2.0
 3 | //
 4 | 
 5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 6 | // ShuffleChannel: work-item c copies one H * W plane from source channel (c % G) * (C / G) + c / G to destination channel (c / G) * G + c % G (i.e. c).
 7 | __kernel void ShuffleChannel(
 8 |     __global const half *restrict src_data,
 9 |     __global half *restrict dst_data,
10 |     int C,
11 |     int H,
12 |     int W,
13 |     int G)
14 | {
15 |     const int chan = get_global_id(0);
16 |     if (chan >= C) return; // surplus work-items do nothing
17 |     const int group_idx = chan % G;
18 |     const int in_group  = chan / G;
19 | 
20 |     // Element offsets of the source and destination planes.
21 |     const int src_off = group_idx * (C / G) * H * W + in_group * H * W;
22 |     const int dst_off = in_group * G * H * W + group_idx * H * W;
23 | 
24 |     // Bulk of the plane: eight halfs per access.
25 |     __global const half8 *src_vec = (__global const half8 *)(src_data + src_off);
26 |     __global half8 *dst_vec       = (__global half8 *)(dst_data + dst_off);
27 |     for (int v = 0; v < W * H / 8; v++) dst_vec[v] = src_vec[v];
28 | 
29 |     // Tail: the last W * H % 8 elements, one half at a time.
30 |     for (int e = W * H / 8 * 8; e < W * H; e++) {
31 |         dst_data[dst_off + e] = src_data[src_off + e];
32 |     }
33 | }
34 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/st.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:0f4e46dc0a701ea9d32ed113d8be30448306fea92560e617bed9605e24d1d6fb
3 | size 20376
4 | 


--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/st.cl:
--------------------------------------------------------------------------------
  1 | // Copyright (C) 2018-2022 Intel Corporation
  2 | // SPDX-License-Identifier: Apache-2.0
  3 | //
  4 | 
  5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
  6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
  7 | 
  8 | #define MAX_WIDTH 512
  9 | // calcInd: for `length` positions starting at column x0 of row y, applies the affine theta and stores four bilinear weights and four source indices per position.
 10 | __attribute__((noinline)) void calcInd(
 11 |     __global const half *restrict theta, // coefficients theta[0] .. theta[5] are read
 12 |     __local half *restrict weight, // out: four weight planes, `step` elements apart
 13 |     __local int *restrict ind, // out: four index planes, `step` elements apart
 14 |     int y,
 15 |     int H,
 16 |     int x0, // first column handled by this call
 17 |     int length, // number of consecutive positions to compute
 18 |     int step,
 19 |     int W)
 20 | {
 21 |     float a = (float)y * 1.0f / H * 2 - 1; // row coordinate rescaled as y / H * 2 - 1
 22 | 
 23 |     int x = 0;
 24 | 
 25 |     float8 va  = (float8){a, a, a, a, a, a, a, a};
 26 |     float8 vxy = (float8){x0 + 0, x0 + 1, x0 + 2, x0 + 3, x0 + 4, x0 + 5, x0 + 6, x0 + 7};
 27 |     // Vector path: eight positions per iteration.
 28 |     for (; x <= length - 8; x += 8, vxy += 8) {
 29 |         float8 va1 = vxy * 1.0f / W * 2 - 1.f;
 30 |         // Apply the six theta coefficients, then rescale with (v + 1) / 2 * H and (v + 1) / 2 * W.
 31 |         float8 vx = (va * theta[0] + va1 * theta[1] + theta[2] + 1.f) / 2.f * H;
 32 |         float8 vy = (va * theta[3] + va1 * theta[4] + theta[5] + 1.f) / 2.f * W;
 33 |         // Convert to int, then subtract 1 in the lanes whose value is negative.
 34 |         const int8 ix = convert_int8(vx) - ((vx < 0) & 1);
 35 |         const int8 iy = convert_int8(vy) - ((vy < 0) & 1);
 36 |         // Fractional parts and their complements.
 37 |         float8 ax = vx - convert_float8(ix);
 38 |         float8 ay = vy - convert_float8(iy);
 39 |         float8 bx = 1.f - ax;
 40 |         float8 by = 1.f - ay;
 41 |         // Reinterpreting as unsigned folds the tests 0 <= v and v < limit into one comparison; the results are per-lane masks.
 42 |         union {
 43 |             int8 d;
 44 |             uint8 i;
 45 |         } check_x;
 46 | 
 47 |         check_x.d = ix;
 48 |         int8 b01  = check_x.i < (uint8)H;
 49 | 
 50 |         check_x.d = ix + 1;
 51 |         int8 b45  = check_x.i < (uint8)H;
 52 | 
 53 |         union {
 54 |             int8 d;
 55 |             uint8 i;
 56 |         } check_y;
 57 | 
 58 |         check_y.d = iy;
 59 |         int8 b23  = check_y.i < (uint8)W;
 60 | 
 61 |         check_y.d = iy + 1;
 62 |         int8 b67  = check_y.i < (uint8)W;
 63 |         // Combine the ix-range and iy-range masks for each of the four corners.
 64 |         int8 b0123 = b01 & b23;
 65 |         int8 b0167 = b01 & b67;
 66 |         int8 b4523 = b45 & b23;
 67 |         int8 b4567 = b45 & b67;
 68 |         // Linear indices ix * W + iy for the four corners; multiplied by 0 where the corner is out of range.
 69 |         int8 TL_id = ((ix + 0) * W + (iy + 0)) * (b0123 & 1);
 70 |         int8 BL_id = ((ix + 1) * W + (iy + 0)) * (b4523 & 1);
 71 |         int8 TR_id = ((ix + 0) * W + (iy + 1)) * (b0167 & 1);
 72 |         int8 BR_id = ((ix + 1) * W + (iy + 1)) * (b4567 & 1);
 73 |         // The unions give bitwise access to the float weights so they can be ANDed with the masks below.
 74 |         union {
 75 |             float8 f;
 76 |             int8 i;
 77 |         } w0;
 78 |         w0.f = bx * by;
 79 |         union {
 80 |             float8 f;
 81 |             int8 i;
 82 |         } w1;
 83 |         w1.f = ax * by;
 84 |         union {
 85 |             float8 f;
 86 |             int8 i;
 87 |         } w2;
 88 |         w2.f = bx * ay;
 89 |         union {
 90 |             float8 f;
 91 |             int8 i;
 92 |         } w3;
 93 |         w3.f = ax * ay;
 94 |         // Mask the weight bits with the corner masks; a zero mask leaves all-zero bits, i.e. 0.0f.
 95 |         w0.i = w0.i & b0123;
 96 |         w1.i = w1.i & b4523;
 97 |         w2.i = w2.i & b0167;
 98 |         w3.i = w3.i & b4567;
 99 |         // NOTE(review): the half8 / int8 stores assume weight + x and ind + x (plus k * step) are suitably aligned — confirm.
100 |         *((__local half8 *)(weight + x + 0 * step)) = convert_half8(w0.f);
101 |         *((__local half8 *)(weight + x + 1 * step)) = convert_half8(w1.f);
102 |         *((__local half8 *)(weight + x + 2 * step)) = convert_half8(w2.f);
103 |         *((__local half8 *)(weight + x + 3 * step)) = convert_half8(w3.f);
104 | 
105 |         *((__local int8 *)(ind + x + 0 * step)) = TL_id;
106 |         *((__local int8 *)(ind + x + 1 * step)) = BL_id;
107 |         *((__local int8 *)(ind + x + 2 * step)) = TR_id;
108 |         *((__local int8 *)(ind + x + 3 * step)) = BR_id;
109 |     }
110 |     // Scalar tail for the remaining positions: same math with explicit 0/1 range flags.
111 |     for (; x < length; x++) {
112 |         float a1 = (float)(x0 + x) * 1.0f / W * 2 - 1;
113 | 
114 |         float fx = (a * theta[0] + a1 * theta[1] + theta[2] + 1) / 2 * H;
115 |         float fy = (a * theta[3] + a1 * theta[4] + theta[5] + 1) / 2 * W;
116 |         // Convert to int, then subtract 1 when the value is negative.
117 |         const int ix = (int)(fx) - (fx < 0);
118 |         const int iy = (int)(fy) - (fy < 0);
119 | 
120 |         float ax = fx - ix;
121 |         float ay = fy - iy;
122 |         float bx = 1 - ax;
123 |         float by = 1 - ay;
124 |         // b0/b1 bound ix in [0, H); b4/b5 bound ix + 1 in [0, H).
125 |         int b0 = ix >= 0;
126 |         int b4 = ix >= -1;
127 |         int b1 = ix < H;
128 |         int b5 = ix < H - 1;
129 |         // b2/b3 bound iy in [0, W); b6/b7 bound iy + 1 in [0, W).
130 |         int b2 = iy >= 0;
131 |         int b6 = iy >= -1;
132 |         int b3 = iy < W;
133 |         int b7 = iy < W - 1;
134 | 
135 |         int b01 = b0 & b1;
136 |         int b23 = b2 & b3;
137 |         int b45 = b4 & b5;
138 |         int b67 = b6 & b7;
139 |         // One validity flag per corner.
140 |         int b0123 = b01 & b23;
141 |         int b0167 = b01 & b67;
142 |         int b4523 = b45 & b23;
143 |         int b4567 = b45 & b67;
144 |         // Corner indices, forced to 0 when the corner is out of range.
145 |         int TL_id = ((ix + 0) * W + (iy + 0)) * b0123;
146 |         int BL_id = ((ix + 1) * W + (iy + 0)) * b4523;
147 |         int TR_id = ((ix + 0) * W + (iy + 1)) * b0167;
148 |         int BR_id = ((ix + 1) * W + (iy + 1)) * b4567;
149 |         // Corner weights, forced to 0 when the corner is out of range.
150 |         half w0 = bx * by * b0123;
151 |         half w1 = ax * by * b4523;
152 |         half w2 = bx * ay * b0167;
153 |         half w3 = ax * ay * b4567;
154 | 
155 |         weight[x + 0 * step] = w0;
156 |         weight[x + 1 * step] = w1;
157 |         weight[x + 2 * step] = w2;
158 |         weight[x + 3 * step] = w3;
159 | 
160 |         ind[x + 0 * step] = TL_id;
161 |         ind[x + 1 * step] = BL_id;
162 |         ind[x + 2 * step] = TR_id;
163 |         ind[x + 3 * step] = BR_id;
164 |     }
165 | }
166 | 
167 | __attribute__((noinline)) void apply( // Blend four gathered src samples per output: dst[x] = sum over k=0..3 of weight[x + k*step] * src[ind[x + k*step]].
168 |     __global half const *restrict src, // data that the indices stored in ind point into
169 |     __local half const *restrict weight, // four weight planes; plane k starts at offset k * step
170 |     __local int const *restrict ind, // four index planes laid out the same way as weight
171 |     __local half *restrict dst, // receives one blended value per x in [0, src_stride)
172 |     int src_stride, // number of output elements to produce (upper bound of x)
173 |     int step) // distance, in elements, between consecutive planes of weight / ind
174 | {
175 |     int x = 0;
176 |     for (; x <= src_stride - 8; x += 8) { // main loop: eight outputs per iteration using int8 / half8 accesses
177 |         int8 TL_id = *((__local int8 *)(ind + x + 0 * step)); // NOTE(review): vector casts assume ind/weight/dst + x are suitably aligned — confirm against caller
178 |         int8 BL_id = *((__local int8 *)(ind + x + 1 * step));
179 |         int8 TR_id = *((__local int8 *)(ind + x + 2 * step));
180 |         int8 BR_id = *((__local int8 *)(ind + x + 3 * step));
181 |         // Weights for the same eight positions, one vector per plane.
182 |         half8 w00 = *((__local half8 *)(weight + x + 0 * step));
183 |         half8 w01 = *((__local half8 *)(weight + x + 1 * step));
184 |         half8 w02 = *((__local half8 *)(weight + x + 2 * step));
185 |         half8 w03 = *((__local half8 *)(weight + x + 3 * step));
186 |         // Gather: each lane reads src at its own index, so the vectors are assembled element by element.
187 |         half8 TL = (half8){
188 |             src[TL_id[0]], src[TL_id[1]],
189 |             src[TL_id[2]], src[TL_id[3]],
190 |             src[TL_id[4]], src[TL_id[5]],
191 |             src[TL_id[6]], src[TL_id[7]]};
192 |         half8 TR = (half8){
193 |             src[TR_id[0]], src[TR_id[1]],
194 |             src[TR_id[2]], src[TR_id[3]],
195 |             src[TR_id[4]], src[TR_id[5]],
196 |             src[TR_id[6]], src[TR_id[7]]};
197 |         half8 BL = (half8){
198 |             src[BL_id[0]], src[BL_id[1]],
199 |             src[BL_id[2]], src[BL_id[3]],
200 |             src[BL_id[4]], src[BL_id[5]],
201 |             src[BL_id[6]], src[BL_id[7]]};
202 |         half8 BR = (half8){
203 |             src[BR_id[0]], src[BR_id[1]],
204 |             src[BR_id[2]], src[BR_id[3]],
205 |             src[BR_id[4]], src[BR_id[5]],
206 |             src[BR_id[6]], src[BR_id[7]]};
207 |         // Weights pair with samples by plane: 0 -> TL, 1 -> BL, 2 -> TR, 3 -> BR; the sum is evaluated in half precision.
208 |         half8 res = w00 * TL + w01 * BL + w02 * TR + w03 * BR;
209 | 
210 |         *((__local half8 *)(dst + x)) = res;
211 |     }
212 |     // Scalar tail: handles the remaining (fewer than eight) elements with the same formula.
213 |     for (; x < src_stride; x++) {
214 |         int TL_id = ind[x + 0 * step];
215 |         int BL_id = ind[x + 1 * step];
216 |         int TR_id = ind[x + 2 * step];
217 |         int BR_id = ind[x + 3 * step];
218 | 
219 |         half w00 = weight[x + 0 * step];
220 |         half w01 = weight[x + 1 * step];
221 |         half w02 = weight[x + 2 * step];
222 |         half w03 = weight[x + 3 * step];
223 | 
224 |         half TL = src[TL_id];
225 |         half TR = src[TR_id];
226 |         half BL = src[BL_id];
227 |         half BR = src[BR_id];
228 | 
229 |         half res = w00 * TL + w01 * BL + w02 * TR + w03 * BR;
230 | 
231 |         dst[x] = res;
232 |     }
233 | }
234 | 
235 | __kernel void ocl_st( // For one MAX_WIDTH-wide column segment of row y: build indices/weights with calcInd, blend every channel with apply, then copy the results out.
236 |     __global half const *const restrict src_data, // input; channel c is addressed at src_data + c * H * W
237 |     __global half const *const restrict theta, // passed unchanged to calcInd, which reads theta[0..5]
238 |     __global half *const restrict dst_data, // output written by the async copy at the end
239 |     int C, // number of channels looped over
240 |     int W) // row width; also the upper clamp for the segment end x1
241 | {
242 |     __local int ind[4 * MAX_WIDTH] __attribute__((aligned(16))); // four index planes, MAX_WIDTH entries apart (step passed below)
243 |     __local half weight[4 * MAX_WIDTH] __attribute__((aligned(16))); // four weight planes with the same layout as ind
244 |     __local half local_dst[4 * 1024]; // staging buffer for results; NOTE(review): nothing here checks C * get_local_size(1) * src_stride <= 4 * 1024
245 |     // NOTE(review): ind/weight are __local yet filled from this work-item's own y — presumably launched with local size 1 in dimension 1; confirm.
246 |     int w = get_group_id(0);
247 | 
248 |     int y = get_global_id(1);
249 |     int H = get_global_size(1);
250 |     // Segment [x0, x1) handled by this group along dimension 0; the last segment is clipped to W.
251 |     const int x0         = w * MAX_WIDTH;
252 |     const int x1         = min(x0 + MAX_WIDTH, W);
253 |     const int src_stride = x1 - x0;
254 |     // Fill weight[] and ind[] for the src_stride positions starting at x0 on row y.
255 |     calcInd(theta, weight, ind, y, H, x0, src_stride, MAX_WIDTH, W);
256 |     // Blend each channel into its own slice of local_dst: one src_stride-long line per (channel, local row).
257 |     for (int c = 0; c < C; c++) {
258 |         __global half const *restrict src = src_data + c * H * W;
259 |         __local half *restrict dst = local_dst + c * get_local_size(1) * src_stride + get_local_id(1) * src_stride;
260 | 
261 |         apply(src, weight, ind, dst, src_stride, MAX_WIDTH);
262 |     }
263 |     // All writes to local_dst must be complete before the group-wide copy below reads it.
264 |     barrier(CLK_LOCAL_MEM_FENCE);
265 |     // Copy local_dst out to dst_data; the stride arguments appear to be gaps between lines/planes (0 = packed) — TODO confirm against the intrinsic's documentation.
266 |     event_t e = async_work_group_copy_3D3D(
267 |         dst_data + get_group_id(1) * get_local_size(1) * W + x0, // dst
268 |         local_dst, // src
269 |         src_stride, // num_elements_per_line
270 |         get_local_size(1), // num_lines
271 |         0, // src_line_stride
272 |         W - src_stride, // dst_line_stride
273 |         C, // num planes
274 |         0, // src plane stride
275 |         W * (get_global_size(1) - get_local_size(1)), // dst plane stride
276 |         0);
277 |     wait_group_events(1, &e); // block until the asynchronous copy has finished
278 | }
279 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | flask
2 | flask-cors
3 | gradio
4 | opencv-python
5 | numpy==1.20.3


--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Start app.py in the background, then hand this shell over to gradio/demo.py.
3 | exec python3 app.py &  # '&' backgrounds this command, so the script continues to the next line
4 | exec python3 gradio/demo.py  # exec replaces the shell, so demo.py becomes the foreground process


--------------------------------------------------------------------------------