├── .gitattributes
├── Dockerfile
├── README.md
├── app.py
├── facewrapper
├── dict
│ ├── data1.bin
│ ├── data2.bin
│ └── data3.bin
├── facewrapper.py
└── libs
│ ├── libimutils.so
│ ├── libimutils.so_for_ubuntu22
│ └── libttvfaceengine7.so
├── gradio
├── demo.py
└── examples
│ ├── 1.jpg
│ ├── 2.jpg
│ ├── 3.jpg
│ └── 4.jpg
├── openvino
├── cache.json
├── libgna.so
├── libgna.so.2
├── libgna.so.3.0.0.1455
├── libopenvino.so
├── libopenvino_auto_batch_plugin.so
├── libopenvino_auto_plugin.so
├── libopenvino_c.so
├── libopenvino_gapi_preproc.so
├── libopenvino_hetero_plugin.so
├── libopenvino_intel_cpu_plugin.so
├── libopenvino_intel_gna_plugin.so
├── libopenvino_intel_hddl_plugin.so
├── libopenvino_intel_myriad_plugin.so
├── libopenvino_ir_frontend.so
├── libopenvino_onnx_frontend.so
├── libopenvino_paddle_frontend.so
├── libopenvino_tensorflow_fe.so
├── pcie-ma2x8x.mvcmd
├── plugins.xml
├── usb-ma2x8x.mvcmd
└── vpu_custom_kernels
│ ├── binarization.bin
│ ├── binarization.cl
│ ├── binary_convolution.bin
│ ├── binary_convolution.cl
│ ├── binary_convolution1x1.bin
│ ├── binary_convolution1x1.cl
│ ├── binary_convolution3x3.bin
│ ├── binary_convolution3x3.cl
│ ├── convolution1x1_chw.bin
│ ├── convolution1x1_chw.cl
│ ├── convolution1x1_hwc.bin
│ ├── convolution1x1_hwc.cl
│ ├── convolution3x3.bin
│ ├── convolution3x3.cl
│ ├── correlate.bin
│ ├── correlate.cl
│ ├── ctc.bin
│ ├── ctc.cl
│ ├── customLayerBindings.xml
│ ├── cvtf32f16.bin
│ ├── cvtf32f16.cl
│ ├── cvtu8f16.bin
│ ├── cvtu8f16.cl
│ ├── detectron_prior_grid_gen.bin
│ ├── detectron_prior_grid_gen.cl
│ ├── fakequantize.bin
│ ├── fakequantize.cl
│ ├── grn.bin
│ ├── grn.cl
│ ├── mvn_reduction.bin
│ ├── mvn_reduction.cl
│ ├── mvn_scale.bin
│ ├── mvn_scale.cl
│ ├── region_chw.bin
│ ├── region_chw.cl
│ ├── region_hwc.bin
│ ├── region_hwc.cl
│ ├── reorg_chw.bin
│ ├── reorg_chw.cl
│ ├── reorg_hwc.bin
│ ├── reorg_hwc.cl
│ ├── reorg_hwc_naive.bin
│ ├── reorg_hwc_naive.cl
│ ├── resample_AA.bin
│ ├── resample_AA.cl
│ ├── resample_noAA.bin
│ ├── resample_noAA.cl
│ ├── shuffle_channels.bin
│ ├── shuffle_channels.cl
│ ├── st.bin
│ └── st.cl
├── requirements.txt
└── run.sh
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.7z filter=lfs diff=lfs merge=lfs -text
2 | *.arrow filter=lfs diff=lfs merge=lfs -text
3 | *.bin filter=lfs diff=lfs merge=lfs -text
4 | *.bz2 filter=lfs diff=lfs merge=lfs -text
5 | *.ckpt filter=lfs diff=lfs merge=lfs -text
6 | *.ftz filter=lfs diff=lfs merge=lfs -text
7 | *.gz filter=lfs diff=lfs merge=lfs -text
8 | *.h5 filter=lfs diff=lfs merge=lfs -text
9 | *.joblib filter=lfs diff=lfs merge=lfs -text
10 | *.lfs.* filter=lfs diff=lfs merge=lfs -text
11 | *.mlmodel filter=lfs diff=lfs merge=lfs -text
12 | *.model filter=lfs diff=lfs merge=lfs -text
13 | *.msgpack filter=lfs diff=lfs merge=lfs -text
14 | *.npy filter=lfs diff=lfs merge=lfs -text
15 | *.npz filter=lfs diff=lfs merge=lfs -text
16 | *.onnx filter=lfs diff=lfs merge=lfs -text
17 | *.ot filter=lfs diff=lfs merge=lfs -text
18 | *.parquet filter=lfs diff=lfs merge=lfs -text
19 | *.pb filter=lfs diff=lfs merge=lfs -text
20 | *.pickle filter=lfs diff=lfs merge=lfs -text
21 | *.pkl filter=lfs diff=lfs merge=lfs -text
22 | *.pt filter=lfs diff=lfs merge=lfs -text
23 | *.pth filter=lfs diff=lfs merge=lfs -text
24 | *.rar filter=lfs diff=lfs merge=lfs -text
25 | *.safetensors filter=lfs diff=lfs merge=lfs -text
26 | saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27 | *.tar.* filter=lfs diff=lfs merge=lfs -text
28 | *.tar filter=lfs diff=lfs merge=lfs -text
29 | *.tflite filter=lfs diff=lfs merge=lfs -text
30 | *.tgz filter=lfs diff=lfs merge=lfs -text
31 | *.wasm filter=lfs diff=lfs merge=lfs -text
32 | *.xz filter=lfs diff=lfs merge=lfs -text
33 | *.zip filter=lfs diff=lfs merge=lfs -text
34 | *.zst filter=lfs diff=lfs merge=lfs -text
35 | *tfevents* filter=lfs diff=lfs merge=lfs -text
36 | facewrapper/libs/libttvfaceengine7.so filter=lfs diff=lfs merge=lfs -text
37 | openvino/libgna.so filter=lfs diff=lfs merge=lfs -text
38 | openvino/libgna.so.2 filter=lfs diff=lfs merge=lfs -text
39 | openvino/libgna.so.3.0.0.1455 filter=lfs diff=lfs merge=lfs -text
40 | openvino/libopenvino_gapi_preproc.so filter=lfs diff=lfs merge=lfs -text
41 | openvino/libopenvino_intel_cpu_plugin.so filter=lfs diff=lfs merge=lfs -text
42 | openvino/libopenvino_intel_gna_plugin.so filter=lfs diff=lfs merge=lfs -text
43 | openvino/libopenvino_intel_hddl_plugin.so filter=lfs diff=lfs merge=lfs -text
44 | openvino/libopenvino_intel_myriad_plugin.so filter=lfs diff=lfs merge=lfs -text
45 | openvino/libopenvino_onnx_frontend.so filter=lfs diff=lfs merge=lfs -text
46 | openvino/libopenvino_tensorflow_fe.so filter=lfs diff=lfs merge=lfs -text
47 | openvino/libopenvino.so filter=lfs diff=lfs merge=lfs -text
48 | openvino/pcie-ma2x8x.mvcmd filter=lfs diff=lfs merge=lfs -text
49 | openvino/usb-ma2x8x.mvcmd filter=lfs diff=lfs merge=lfs -text
50 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM ubuntu:20.04

# Time zone written into the image. It was previously referenced without ever
# being declared, so it always expanded to an empty string. Override with
# `--build-arg CONTAINER_TIMEZONE=<Area/City>`.
ARG CONTAINER_TIMEZONE=Etc/UTC
RUN ln -snf /usr/share/zoneinfo/$CONTAINER_TIMEZONE /etc/localtime && echo $CONTAINER_TIMEZONE > /etc/timezone

# Update and install in a single layer so a cached package index is never
# reused with a changed package list; stay non-interactive so package
# configuration cannot block the build.
RUN apt-get update -y && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y \
        python3 python3-pip python3-opencv \
        libcurl4-openssl-dev libssl-dev && \
    rm -rf /var/lib/apt/lists/*

RUN mkdir -p /home/FaceOnLive_v7/facewrapper
WORKDIR /home/FaceOnLive_v7

# SDK wrapper plus the native libraries it loads at runtime.
COPY ./facewrapper ./facewrapper
COPY ./facewrapper/libs/libimutils.so /usr/lib
COPY ./gradio ./gradio
COPY ./openvino /usr/lib
COPY ./app.py ./app.py
COPY ./run.sh .
COPY ./requirements.txt ./requirements.txt
RUN pip3 install -r requirements.txt
RUN chmod a+x run.sh
CMD ["./run.sh"]
# NOTE(review): app.py listens on $PORT (default 8000) and gradio/demo.py on
# 7860; nothing visible here uses 9000 - confirm the intended port.
EXPOSE 9000
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
18 | Face Liveness Detection SDK For Linux
19 | Fully Offline, On-Premise Face Liveness Detection SDK for Linux
20 |
21 |
22 | Documentation at https://docs.faceonlive.com
23 |
24 |
25 | ## :tada: Try It Yourself on our [Portfolio Website](https://portfolio.faceonlive.com/#server_sdks/server/liv)
26 |
27 | Integrated into [Huggingface Spaces 🤗](https://huggingface.co/spaces) using [Gradio](https://github.com/gradio-app/gradio). Try out the Web Demo: [](https://huggingface.co/spaces/FaceOnLive/Face-Liveness-Detection-SDK)
28 |
29 |
30 | https://user-images.githubusercontent.com/91896009/187945910-4ca6d27c-d058-4749-a834-44914a5a957c.mp4
31 |
32 |
33 | ## :clap: Supporters
34 | [](https://github.com/faceonlive/Face-Liveness-Detection-SDK-Linux/stargazers)
35 | [](https://github.com/faceonlive/Face-Liveness-Detection-SDK-Linux/network/members)
36 | 
37 |
38 | ## 🏃 How to run
39 | ### 1. Download and install dependencies
40 | To begin, follow these steps to download and install the necessary dependencies:
41 | ```
42 | git clone https://github.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux
43 | cd Face-Liveness-Detection-SDK-Linux
44 | chmod +x ./install_dependency.sh
45 | sudo ./install_dependency.sh
46 | ```
47 | ### 2. Execute the Python Flask application
48 | Next, run the Python Flask application by executing the following command:
49 | ```
50 | python3 app.py
51 | ```
52 | ### 3. Activate the SDK
53 | #### - Online License
54 | If you have an online license, please update the license key provided by us in the following file:
55 | https://github.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/blob/6e702fa01aeabbfb395d82c637a66dc18a93f2fb/app.py#L23-L23
56 | #### - Offline License
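57 | If you have an offline license, please share your machine's HWID (Hardware ID) with us to receive the license.txt file. The HWID is printed at startup when online activation fails (see the sample output below). Then set the path to your license.txt in the following file: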
58 | https://github.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/blob/6e702fa01aeabbfb395d82c637a66dc18a93f2fb/app.py#L24-L24
59 | ```
60 | online init failed: 6
61 | hwid: IXwjedMe8M5cZX/GwU3NEOqJRcqLwldq27HSLyFiejbGDB9XVgytA1RgJukV3mWWTNo84NwTMYU=
62 | ```
63 | ### 4. Using Docker
64 | - Build the Docker image:
65 | ```
66 | sudo docker build --pull --rm -f Dockerfile -t faceonlive_v7:latest .
67 | ```
68 | - Run Docker with online license:
69 | ```
70 | sudo docker run -e LICENSE_KEY=<your_license_key> --network host faceonlive_v7
71 | ```
72 | - Run Docker with offline license:
73 | ```
74 | sudo docker run -v "$(pwd)/license.txt":/home/FaceOnLive_v7/license.txt --network host faceonlive_v7
75 | ```
76 | ### 5. Test endpoint
77 | To test the endpoint, download the Postman Collection from the following link:
78 | [FaceOnLive.postman_collection.json](https://github.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/blob/main/FaceOnLive.postman_collection.json)
79 |
80 | 
81 |
82 | 
83 |
84 |
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append('.')
3 |
4 | from flask import Flask, request, jsonify
5 | from time import gmtime, strftime
6 | import os
7 | import base64
8 | import json
9 | import cv2
10 | import numpy as np
11 |
12 | from facewrapper.facewrapper import ttv_version
13 | from facewrapper.facewrapper import ttv_get_hwid
14 | from facewrapper.facewrapper import ttv_init
15 | from facewrapper.facewrapper import ttv_init_offline
16 | from facewrapper.facewrapper import ttv_detect_face
17 |
app = Flask(__name__)

app.config['SITE'] = "http://0.0.0.0:8000/"
app.config['DEBUG'] = False

# Online license key. Default to an empty string so that a missing
# LICENSE_KEY makes online activation fail cleanly (and fall back to the
# offline license) instead of crashing on `None.encode(...)`.
licenseKey = os.environ.get("LICENSE_KEY", "")
# Offline license file, resolved relative to the current working directory.
licensePath = "license.txt"
modelFolder = os.path.abspath(os.path.dirname(__file__)) + '/facewrapper/dict'

version = ttv_version()
print("version: ", version.decode('utf-8'))

# Try online activation first; any non-zero return code is a failure.
ret = ttv_init(modelFolder.encode('utf-8'), licenseKey.encode('utf-8'))
if ret != 0:
    print(f"online init failed: {ret}")

    # Print the hardware ID so it can be used to request an offline license.
    hwid = ttv_get_hwid()
    print("hwid: ", hwid.decode('utf-8'))

    ret = ttv_init_offline(modelFolder.encode('utf-8'), licensePath.encode('utf-8'))
    if ret != 0:
        print(f"offline init failed: {ret}")
        # sys.exit is always available; the bare `exit` helper is injected
        # by the `site` module and is not guaranteed to exist.
        sys.exit(-1)
    else:
        print("offline init ok")

else:
    print("online init ok")
46 |
@app.route('/api/liveness', methods=['POST'])
def check_liveness():
    """Run liveness detection on an image uploaded as multipart form data.

    The picture is read from the ``image`` file field of the request.

    Returns:
        A JSON response. With HTTP 200, ``data`` holds the verdict in
        ``result`` together with ``face_rect``, ``liveness_score`` and
        ``angles``. HTTP 400 is returned when the upload is empty or cannot
        be decoded as an image.
    """
    file = request.files['image']
    payload = file.read()

    # np.frombuffer replaces the deprecated np.fromstring. An empty upload
    # has nothing to decode, so it is treated like an undecodable image.
    image = None
    if payload:
        image = cv2.imdecode(np.frombuffer(payload, np.uint8), cv2.IMREAD_COLOR)
    if image is None:
        response = jsonify({"status": "error", "data": {"result": "invalid image!"}})
        response.status_code = 400
        response.headers["Content-Type"] = "application/json; charset=utf-8"
        return response

    height, width = image.shape[0], image.shape[1]

    # Output buffers filled in by the native engine.
    faceRect = np.zeros([4], dtype=np.int32)
    livenessScore = np.zeros([1], dtype=np.double)
    angles = np.zeros([3], dtype=np.double)
    ret = ttv_detect_face(image, width, height, faceRect, livenessScore, angles)
    if ret == -1:
        result = "license error!"
    elif ret == -2:
        result = "init error!"
    elif ret == 0:
        result = "no face detected!"
    elif ret > 1:
        result = "multiple face detected!"
    # faceRect is (left, top, right, bottom): the bottom edge (index 3) is
    # checked against the image height. The original compared the right edge
    # (index 2) to the height by mistake.
    elif faceRect[0] < 0 or faceRect[1] < 0 or faceRect[2] >= width or faceRect[3] >= height:
        result = "faace is in boundary!"
    elif livenessScore[0] > 0.5:
        result = "genuine"
    else:
        result = "spoof"

    status = "ok"
    response = jsonify({
        "status": status,
        "data": {
            "result": result,
            "face_rect": {
                "x": int(faceRect[0]),
                "y": int(faceRect[1]),
                "w": int(faceRect[2] - faceRect[0] + 1),
                "h": int(faceRect[3] - faceRect[1] + 1),
            },
            "liveness_score": float(livenessScore[0]),
            "angles": {
                "yaw": float(angles[0]),
                "roll": float(angles[1]),
                "pitch": float(angles[2]),
            },
        },
    })

    response.status_code = 200
    response.headers["Content-Type"] = "application/json; charset=utf-8"
    return response
78 |
@app.route('/api/liveness_base64', methods=['POST'])
def check_liveness_base64():
    """Run liveness detection on a base64-encoded image sent as JSON.

    The request body must be a JSON object whose ``image`` member holds the
    base64-encoded image file.

    Returns:
        A JSON response. With HTTP 200, ``data`` holds the verdict in
        ``result`` together with ``face_rect``, ``liveness_score`` and
        ``angles``. HTTP 400 is returned when ``image`` is missing, is not
        valid base64, or cannot be decoded as an image.
    """
    content = request.get_json()
    try:
        payload = base64.b64decode(content['image'])
    except (TypeError, KeyError, ValueError):
        # Body is not a JSON object, has no "image" member, or the value is
        # not valid base64 (binascii.Error is a ValueError).
        payload = b''

    # An empty payload has nothing to decode, so it is treated like an
    # undecodable image.
    image = None
    if payload:
        image = cv2.imdecode(np.frombuffer(payload, dtype=np.uint8), cv2.IMREAD_COLOR)
    if image is None:
        response = jsonify({"status": "error", "data": {"result": "invalid image!"}})
        response.status_code = 400
        response.headers["Content-Type"] = "application/json; charset=utf-8"
        return response

    height, width = image.shape[0], image.shape[1]

    # Output buffers filled in by the native engine.
    faceRect = np.zeros([4], dtype=np.int32)
    livenessScore = np.zeros([1], dtype=np.double)
    angles = np.zeros([3], dtype=np.double)
    ret = ttv_detect_face(image, width, height, faceRect, livenessScore, angles)
    if ret == -1:
        result = "license error!"
    elif ret == -2:
        result = "init error!"
    elif ret == 0:
        result = "no face detected!"
    elif ret > 1:
        result = "multiple face detected!"
    # faceRect is (left, top, right, bottom): the bottom edge (index 3) is
    # checked against the image height. The original compared the right edge
    # (index 2) to the height by mistake.
    elif faceRect[0] < 0 or faceRect[1] < 0 or faceRect[2] >= width or faceRect[3] >= height:
        result = "faace is in boundary!"
    elif livenessScore[0] > 0.5:
        result = "genuine"
    else:
        result = "spoof"

    status = "ok"
    response = jsonify({
        "status": status,
        "data": {
            "result": result,
            "face_rect": {
                "x": int(faceRect[0]),
                "y": int(faceRect[1]),
                "w": int(faceRect[2] - faceRect[0] + 1),
                "h": int(faceRect[3] - faceRect[1] + 1),
            },
            "liveness_score": float(livenessScore[0]),
            "angles": {
                "yaw": float(angles[0]),
                "roll": float(angles[1]),
                "pitch": float(angles[2]),
            },
        },
    })

    response.status_code = 200
    response.headers["Content-Type"] = "application/json; charset=utf-8"
    return response
111 |
112 |
if __name__ == '__main__':
    # Serve on every interface; the PORT environment variable overrides the
    # default port 8000.
    port = int(os.getenv("PORT", 8000))
    app.run(port=port, host='0.0.0.0')
116 |
--------------------------------------------------------------------------------
/facewrapper/dict/data1.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:36cf5fcc49345989a86839a53529314ec1fe5d621c377a1952bc7538d55e7f1b
3 | size 16255630
4 |
--------------------------------------------------------------------------------
/facewrapper/dict/data2.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:6fa65c4b7df14f0c96c174868a1b1c675adc8c4a11e3c0807009f3d0cad51f5a
3 | size 280076956
4 |
--------------------------------------------------------------------------------
/facewrapper/dict/data3.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:f25fb0cd3d70cb84c258e7109620f411c087e0875828d6ab86cc9c4838d49bec
3 | size 11875339
4 |
--------------------------------------------------------------------------------
/facewrapper/facewrapper.py:
--------------------------------------------------------------------------------
1 | import ctypes, ctypes.util
2 | from ctypes import *
3 | from numpy.ctypeslib import ndpointer
4 | import sys
5 | import os
# NOTE(review): sys.path only influences Python module imports; confirm
# whether this entry is still needed for anything.
sys.path.append('/opt/intel/openvino_2022/runtime/lib/intel64')

# The native engine is loaded from the libs/ folder next to this module.
lib_path = os.path.abspath(os.path.dirname(__file__)) + '/libs/libttvfaceengine7.so'
liveness_engine = cdll.LoadLibrary(lib_path)

# Argument-less calls that return a C string.
ttv_version = liveness_engine.ttv_version
ttv_get_hwid = liveness_engine.ttv_get_hwid
for _fn in (ttv_version, ttv_get_hwid):
    _fn.argtypes = []
    _fn.restype = ctypes.c_char_p

# Initialisers: two C strings in, a 32-bit status code out.
ttv_init = liveness_engine.ttv_init
ttv_init_offline = liveness_engine.ttv_init_offline
for _fn in (ttv_init, ttv_init_offline):
    _fn.argtypes = [ctypes.c_char_p, ctypes.c_char_p]
    _fn.restype = ctypes.c_int32
del _fn

# C-contiguous NumPy array types accepted by ttv_detect_face.
_u8_array = ndpointer(ctypes.c_ubyte, flags='C_CONTIGUOUS')
_i32_array = ndpointer(ctypes.c_int32, flags='C_CONTIGUOUS')
_f64_array = ndpointer(ctypes.c_double, flags='C_CONTIGUOUS')

# Detection entry point: a byte array, two 32-bit integers, then one int32
# and two double output arrays; returns a 32-bit integer.
ttv_detect_face = liveness_engine.ttv_detect_face
ttv_detect_face.argtypes = [
    _u8_array,
    ctypes.c_int32,
    ctypes.c_int32,
    _i32_array,
    _f64_array,
    _f64_array,
]
ttv_detect_face.restype = ctypes.c_int32
31 |
32 |
--------------------------------------------------------------------------------
/facewrapper/libs/libimutils.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/04d37814737b9afded1d040726c3b5622e9954d2/facewrapper/libs/libimutils.so
--------------------------------------------------------------------------------
/facewrapper/libs/libimutils.so_for_ubuntu22:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/04d37814737b9afded1d040726c3b5622e9954d2/facewrapper/libs/libimutils.so_for_ubuntu22
--------------------------------------------------------------------------------
/facewrapper/libs/libttvfaceengine7.so:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:b3d6f12326c8bd60242dd7366cfebeef69d25a296bdd9d329d3033e8b70e782f
3 | size 3664979
4 |
--------------------------------------------------------------------------------
/gradio/demo.py:
--------------------------------------------------------------------------------
1 | import gradio as gr
2 | import requests
3 | import json
4 |
def face_liveness(frame):
    """Send an image file to the local liveness API and return its JSON reply.

    Args:
        frame: Path of the image file to check, or ``None`` when no image
            has been selected.

    Returns:
        The decoded JSON body of the API response, or ``['', None]`` when
        ``frame`` is ``None``.
    """
    url = "http://127.0.0.1:8000/api/liveness"
    if frame is None:
        return ['', None]

    # Open the image in a context manager so the file handle is closed once
    # the upload finishes (it was previously left open).
    with open(frame, 'rb') as image_file:
        r = requests.post(url=url, files={'image': image_file})
    return r.json()
14 |
# Markdown header rendered at the top of the page.
_intro_markdown = """
# Face Liveness Detection
Get your own Face Liveness Detection Server by duplicating this space.
Contact us at contact@faceonlive.com for issues and support.
"""

# Sample pictures offered below the upload widget.
_example_images = [
    'gradio/examples/1.jpg',
    'gradio/examples/2.jpg',
    'gradio/examples/3.jpg',
    'gradio/examples/4.jpg',
]

with gr.Blocks() as demo:
    gr.Markdown(_intro_markdown)
    with gr.Row():
        # Left column: image selection and the trigger button.
        with gr.Column(scale=5):
            image_input = gr.Image(type='filepath')
            gr.Examples(_example_images, inputs=image_input)
            face_liveness_button = gr.Button("Check Liveness")
        # Right column: JSON reply of the liveness API.
        with gr.Column(scale=5):
            liveness_result_output = gr.JSON()

    face_liveness_button.click(
        face_liveness,
        inputs=image_input,
        outputs=liveness_result_output,
    )

demo.launch(server_name="0.0.0.0", server_port=7860)
--------------------------------------------------------------------------------
/gradio/examples/1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/04d37814737b9afded1d040726c3b5622e9954d2/gradio/examples/1.jpg
--------------------------------------------------------------------------------
/gradio/examples/2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/04d37814737b9afded1d040726c3b5622e9954d2/gradio/examples/2.jpg
--------------------------------------------------------------------------------
/gradio/examples/3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/04d37814737b9afded1d040726c3b5622e9954d2/gradio/examples/3.jpg
--------------------------------------------------------------------------------
/gradio/examples/4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/04d37814737b9afded1d040726c3b5622e9954d2/gradio/examples/4.jpg
--------------------------------------------------------------------------------
/openvino/libgna.so:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:22441d86dca92b00ae7fb9d315bcb1c6a8a213ac4fe86396489753ebe76f869e
3 | size 3120536
4 |
--------------------------------------------------------------------------------
/openvino/libgna.so.2:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:22441d86dca92b00ae7fb9d315bcb1c6a8a213ac4fe86396489753ebe76f869e
3 | size 3120536
4 |
--------------------------------------------------------------------------------
/openvino/libgna.so.3.0.0.1455:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:22441d86dca92b00ae7fb9d315bcb1c6a8a213ac4fe86396489753ebe76f869e
3 | size 3120536
4 |
--------------------------------------------------------------------------------
/openvino/libopenvino.so:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:fd216848c1ba78e62360c12c9684df0c160f6962f3d900e5918cc042b42b2b46
3 | size 13495416
4 |
--------------------------------------------------------------------------------
/openvino/libopenvino_auto_batch_plugin.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/04d37814737b9afded1d040726c3b5622e9954d2/openvino/libopenvino_auto_batch_plugin.so
--------------------------------------------------------------------------------
/openvino/libopenvino_auto_plugin.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/04d37814737b9afded1d040726c3b5622e9954d2/openvino/libopenvino_auto_plugin.so
--------------------------------------------------------------------------------
/openvino/libopenvino_c.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/04d37814737b9afded1d040726c3b5622e9954d2/openvino/libopenvino_c.so
--------------------------------------------------------------------------------
/openvino/libopenvino_gapi_preproc.so:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:3ac5ce0a8f3acefb41e8aa8161f78035dafff25c4b8c3485ebc541573b2b15f0
3 | size 1312920
4 |
--------------------------------------------------------------------------------
/openvino/libopenvino_hetero_plugin.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/04d37814737b9afded1d040726c3b5622e9954d2/openvino/libopenvino_hetero_plugin.so
--------------------------------------------------------------------------------
/openvino/libopenvino_intel_cpu_plugin.so:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:afe05ada6d5b11495a21787fa6ab0162fc40f7a9ab97be78f7b7185126d15b18
3 | size 33299880
4 |
--------------------------------------------------------------------------------
/openvino/libopenvino_intel_gna_plugin.so:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ef15b623e7f81788160c4056ccd5e887a8184affe381e84a906646ef36cae1ab
3 | size 4067016
4 |
--------------------------------------------------------------------------------
/openvino/libopenvino_intel_hddl_plugin.so:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:96362327fbc404e88583bdcd2a526ccbf4ca26d4ecdb8898234be7986d9b8b2b
3 | size 5894680
4 |
--------------------------------------------------------------------------------
/openvino/libopenvino_intel_myriad_plugin.so:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e596436002565356b80400e0d7e50093d53d338f623b171f658de527477852de
3 | size 6120168
4 |
--------------------------------------------------------------------------------
/openvino/libopenvino_ir_frontend.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/04d37814737b9afded1d040726c3b5622e9954d2/openvino/libopenvino_ir_frontend.so
--------------------------------------------------------------------------------
/openvino/libopenvino_onnx_frontend.so:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:0770ed09d471b20bffcf4ef57ab1fb002db04c4404598bd5c52a4418a67f5441
3 | size 3781640
4 |
--------------------------------------------------------------------------------
/openvino/libopenvino_paddle_frontend.so:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/FaceOnLive/Face-Liveness-Detection-SDK-Linux/04d37814737b9afded1d040726c3b5622e9954d2/openvino/libopenvino_paddle_frontend.so
--------------------------------------------------------------------------------
/openvino/libopenvino_tensorflow_fe.so:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:c2dadbcd8ba32cec02873caf8dcc644d1d8856cdcd2978c603e5bac169e01bb9
3 | size 2723864
4 |
--------------------------------------------------------------------------------
/openvino/pcie-ma2x8x.mvcmd:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:f03146453508f2bcab1589907bccaa429b48db6123a7b8a428d6ce221d1fbb4d
3 | size 2099248
4 |
--------------------------------------------------------------------------------
/openvino/plugins.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/openvino/usb-ma2x8x.mvcmd:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:faf33388b88708177a358fcb4704eba04b1cf9e88d6a047f90c833d686140a2e
3 | size 2298632
4 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/binarization.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:3e0de6082c7bacca2ff5ad131f0afc44304fc792a6d99e7829399eb61491a0ac
3 | size 19632
4 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/binarization.cl:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2018-2022 Intel Corporation
2 | // SPDX-License-Identifier: Apache-2.0
3 | //
4 |
5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7 |
8 | __kernel void binarization(
9 | const __global half *__restrict src_data,
10 | const __global half *__restrict input_low_high,
11 | const __global half *__restrict dst_data,
12 | int switch_out,
13 | int input_low_high_size,
14 | int W,
15 | int H)
16 | {
17 | __local half local_src[15 * 1024];
18 | __local half local_dst[15 * 1024];
19 |
20 | event_t e1 = async_work_group_copy(local_src, src_data + get_group_id(2) * W * H, W * H, 0);
21 | wait_group_events(1, &e1);
22 |
23 | int c = get_global_id(2);
24 | int C = get_global_size(2);
25 |
26 | half dst_low = switch_out ? 1.h : -1.h;
27 | half dst_high = switch_out ? -1.h : 1.h;
28 |
29 | half s_ilow_ihigh = input_low_high_size == 1 ? input_low_high[0] : input_low_high[c];
30 |
31 | for (int h = 0; h < H; h++) {
32 |
33 | __local const half *__restrict addr_src = local_src + h * W;
34 | __local half *__restrict addr_dst = local_dst + h * W;
35 |
36 | #if 1
37 | for (int w = 0; w < W / 8; w++) {
38 |
39 | half8 h_src_val8 = (*((__local half8 *)addr_src + w));
40 |
41 | short8 cond1;
42 | cond1.s0 = (h_src_val8.s0 <= s_ilow_ihigh);
43 | cond1.s1 = (h_src_val8.s1 <= s_ilow_ihigh);
44 | cond1.s2 = (h_src_val8.s2 <= s_ilow_ihigh);
45 | cond1.s3 = (h_src_val8.s3 <= s_ilow_ihigh);
46 | cond1.s4 = (h_src_val8.s4 <= s_ilow_ihigh);
47 | cond1.s5 = (h_src_val8.s5 <= s_ilow_ihigh);
48 | cond1.s6 = (h_src_val8.s6 <= s_ilow_ihigh);
49 | cond1.s7 = (h_src_val8.s7 <= s_ilow_ihigh);
50 |
51 | cond1 = ~(cond1 - (short8)1);
52 |
53 | short8 res = cond1 & as_short8((half8)dst_low) | ~cond1 & as_short8((half8)dst_high);
54 |
55 | *((__local half8 *)addr_dst + w) = as_half8(res);
56 | }
57 | #endif
58 | for (int w = W & (~0x7); w < W; w++) {
59 | addr_dst[w] = (addr_src[w] <= s_ilow_ihigh) ? dst_low : dst_high;
60 | }
61 | }
62 |
63 | barrier(CLK_LOCAL_MEM_FENCE);
64 |
65 | event_t e2 = async_work_group_copy(dst_data + get_group_id(2) * W * H, local_dst, W * H, 0);
66 | wait_group_events(1, &e2);
67 | }
68 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/binary_convolution.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:12c349d6f73c233b158e1d67af31715c7b8bda79f191b1e759476e01e65bb64a
3 | size 10764
4 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/binary_convolution.cl:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2018-2022 Intel Corporation
2 | // SPDX-License-Identifier: Apache-2.0
3 | //
4 |
5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6 |
7 | int extract_weights(uchar val, int bit) { return ((val >> bit) & 1); }
8 |
// Reference binary convolution: each work-item produces one output element.
//
//   src_data      input activations; only their sign is used (> 0 -> 1, else 0)
//   weights_data  weights packed one bit each, 8 per byte
//   dst_data      output buffer
//   pad_value     padding value; only its sign is used (> 0 -> 1, else 0)
//   IW, IH, IC    input width, height, channel count
//   DW, DH        dilation along width / height
//   GC            group count
//   KW, KH        kernel width / height
//   PW, PH        padding along width / height
//   SW, SH        stride along width / height
//
// The output extents OW, OH, OC are taken from the global work size
// (dimensions 0, 1, 2).
__kernel void binary_convolution(
    const __global half *restrict src_data,
    const __global uchar *restrict weights_data,
    __global half *restrict dst_data,
    float pad_value,

    int IW,
    int IH,
    int IC,

    int DW,
    int DH,

    int GC,

    int KW,
    int KH,

    int PW,
    int PH,

    int SW,
    int SH)
{
    // Binarized padding value used for out-of-range input positions.
    int ipad_value = ((pad_value > 0.f) ? 1 : 0);
    int c          = get_global_id(2);
    int y          = get_global_id(1);
    int x          = get_global_id(0);

    int OC = get_global_size(2);
    int OH = get_global_size(1);
    int OW = get_global_size(0);

    // Depth dimension is fixed to a single slice, so the depth loops below
    // run exactly once.
    int KD = 1;
    int SD = 0;
    int DD = 0;
    int PD = 0;
    int ID = 1;
    int OD = 1;

    // Number of weights stored per byte of weights_data.
    int nbits = 8;

    // Split the channel index into a group index and a channel-in-group index.
    int g  = c % GC;
    int oc = c / GC;
    int oh = y;
    int ow = x;

    for (int od = 0; od < OD; od++) {
        int oidx = g * OC / GC * OD * OH * OW + oc * OD * OH * OW + od * OH * OW + oh * OW + ow;

        // Counts positions where the input bit and the weight bit differ.
        int res = 0;

        for (int ic = 0; ic < IC / GC; ic++) {
            for (int kd = 0; kd < KD; kd++) {
                for (int kh = 0; kh < KH; kh++) {
                    for (int kw = 0; kw < KW; kw++) {
                        // Flat index of this weight bit.
                        int widx = g * OC / GC * IC / GC * KD * KH * KW
                                   + oc * IC / GC * KD * KH * KW + ic * KD * KH * KW + kd * KH * KW
                                   + kh * KW + kw;

                        // Byte widx / nbits, bit widx % nbits within that byte.
                        int w = extract_weights(weights_data[widx / nbits], (widx % nbits));

                        int s;

                        // Input coordinates covered by this kernel tap.
                        int iw = ow * SW - PW + kw * DW;
                        int ih = oh * SH - PH + kh * DH;
                        int id = od * SD - PD + kd * DD;

                        if (iw < 0 || iw >= (int)IW || ih < 0 || ih >= (int)IH || id < 0
                            || id >= (int)ID) {
                            // Outside the input: use the binarized padding value.
                            s = ipad_value;
                        } else {
                            int iidx = g * IC / GC * ID * IH * IW + ic * ID * IH * IW + id * IH * IW
                                       + ih * IW + iw;

                            // Binarize the activation by sign.
                            s = ((src_data[iidx] > 0.f) ? 1 : 0);
                        }

                        res += s ^ w;
                    }
                }
            }
        }

        // Total number of taps minus twice the number of mismatches.
        dst_data[oidx] = (half)(IC / GC * KD * KH * KW - 2 * res);
    }
}
96 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/binary_convolution1x1.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:6deff31d62aa84c643fbeba77e7dcd4ae5d9b488c1c98e07fffeb58ff8e9b945
3 | size 76316
4 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/binary_convolution1x1.cl:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2018-2022 Intel Corporation
2 | // SPDX-License-Identifier: Apache-2.0
3 | //
4 |
5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7 |
8 | ushort extract_weights(uchar val, int bit) { const int shifted = val >> bit; return shifted & 1; } // Returns bit number `bit` of `val` (0 = least significant) as 0 or 1.
9 |
10 | __kernel void binary_convolution( // 1x1 binary convolution: each work-group produces one output row (oh) of one output channel (oc).
11 | const __global half *restrict src_data, // input values; only strictly positive ones count as bit 1
12 | const __global uchar *restrict weights_data, // weights packed one bit each, 8 per byte (bit 0 = lowest index)
13 | __global half *restrict dst_data, // output values
14 | float pad_value, // value written into positions that lie outside the input
15 | // Input width, height and channel count.
16 | int IW,
17 | int IH,
18 | int IC,
19 | // DW and DH are not referenced by this kernel.
20 | int DW,
21 | int DH,
22 | // Number of groups.
23 | int GC,
24 | // KW and KH are not referenced by this kernel.
25 | int KW,
26 | int KH,
27 | // PW and PH are not referenced by this kernel.
28 | int PW,
29 | int PH,
30 | // Stride along width and height.
31 | int SW,
32 | int SH,
33 | // Output row width.
34 | int OW)
35 | {
36 | __local half src_local[32 * 1024]; // one input row per channel of the group, IW elements each
37 | __local half dst_local[2 * 1024]; // staging buffer for the output row
38 |
39 | const int oh = get_group_id(0);
40 | const int oc = get_group_id(1);
41 | const int OH = get_global_size(0);
42 | const int OC = get_global_size(1);
43 |
44 | const int gc = oc / (OC / GC); // group that output channel oc belongs to
45 | // Input row oh * SH lies inside the image: fetch that row for each of the IC / GC channels of the group.
46 | if (oh * SH >= 0 && oh * SH <= IH - 1) {
47 | const __global half *src = src_data + (gc * IC / GC) * IW * IH + (SH * oh) * IW;
48 | // NOTE(review): the stride arguments presumably give the gap between the end of one line and the start of the next - confirm against the extension spec.
49 | event_t e1 = async_work_group_copy_2D2D(
50 | src_local, // dst
51 | src, // src
52 | IW, // num_elements_per_line,
53 | IC / GC, // num_lines,
54 | IH * IW - IW, // src_line_stride,
55 | 0, // dst_line_stride,
56 | 0);
57 | wait_group_events(1, &e1);
58 | }
59 |
60 | half pad_value_half = convert_half(pad_value);
61 |
62 | //padding row: the requested input row is below the image, so fill the local rows with the pad value
63 | if (oh * SH > IH - 1) {
64 | __local half *dst = src_local;
65 | for (int c = 0; c < IC / GC; c++) {
66 | #pragma unroll 8
67 | for (int j = 0; j < IW; j++) {
68 | dst[j] = pad_value_half;
69 | }
70 | dst += IW;
71 | }
72 | }
73 |
74 | int OWS = SW * OW; // number of input columns spanned by the output row
75 | ushort8 in; // raw 16-bit patterns of eight consecutive input values
76 | // Walk the input row in chunks of eight columns.
77 | for (int ows8 = 0; ows8 < (OWS + 7) / 8; ows8++) {
78 | ushort8 val = {0, 0, 0, 0, 0, 0, 0, 0}; // per-column count of input/weight mismatches
79 | for (int ic = 0; ic < IC / GC; ++ic) {
80 | __local half *src = (__local half *)((__local half8 *)(src_local + ic * IW) + ows8);
81 | int weight_pos = oc * IC / GC + ic; // bit index of the single weight for (oc, ic)
82 | ushort w =
83 | extract_weights(weights_data[((weight_pos + 0)) / 8], ((weight_pos + 0) % 8));
84 | // Load only when the chunk starts inside the row.
85 | if ((ows8 * 8) <= IW - 1) {
86 | in = *((__local ushort8 *)(src));
87 | }
88 |
89 | //padding column: overwrite the lanes that lie past the last input column
90 | if (ows8 * 8 + 7 > IW - 1) {
91 | int boundary = (IW - 1) - ows8 * 8 + 1;
92 | boundary = boundary < 0 ? 0 : boundary;
93 | for (int offset = boundary; offset < 8; offset++) {
94 | *((half *)(&in) + offset) = pad_value_half;
95 | }
96 | }
97 |
98 | ushort8 w8 = (ushort8)(w);
99 | // On the raw bit pattern, 0x0000 < in < 0x8000 means sign bit clear and not +0.
100 | ushort8 cond =
101 | (((in) < (ushort8)0x8000) && (in > (ushort8)0x0000)) ? (ushort8)(1) : (ushort8)(0);
102 |
103 | val += (cond ^ w8);
104 | }
105 | // Output = channels per group minus twice the mismatch count; only columns that are multiples of SW are stored.
106 | ushort8 val_shift = val << 1;
107 | int boundary = (ows8 * 8 + 7) / SW < OW - 1 ? (ows8 * 8 + 7) / SW : OW - 1;
108 | for (int ow = (ows8 * 8 + SW - 1) / SW; ow <= boundary; ow++) {
109 | *(dst_local + ow) = (half)(IC / GC - *((ushort *)(&val_shift) + ow * SW - ows8 * 8));
110 | }
111 | }
112 |
113 | barrier(CLK_LOCAL_MEM_FENCE);
114 | // Copy the finished output row to its place in dst_data.
115 | event_t e2 = async_work_group_copy(dst_data + oc * OW * OH + oh * OW, dst_local, OW, 0);
116 | wait_group_events(1, &e2);
117 | }
118 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/binary_convolution3x3.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:55e3c3f8863ff7a3583bcc7340d1e226775f5f14cfb11dd32bd671764570f7cb
3 | size 104136
4 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/binary_convolution3x3.cl:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2018-2022 Intel Corporation
2 | // SPDX-License-Identifier: Apache-2.0
3 | //
4 |
5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7 |
8 | ushort extract_weights(uchar val, int bit) { const int shifted = val >> bit; return shifted & 1; } // Returns bit number `bit` of `val` (0 = least significant) as 0 or 1.
9 |
10 | __kernel void binary_convolution( // 3x3 binary convolution: each work-group produces one output row (oh) of one output channel (oc).
11 | const __global half *restrict src_data, // input values; only strictly positive ones count as bit 1
12 | const __global uchar *restrict weights_data, // weights packed one bit each, 8 per byte (bit 0 = lowest index)
13 | const __global half *restrict dst_data, // output values; NOTE(review): declared const although it is the destination of the final copy
14 | float pad_value, // value written into positions that lie outside the input
15 | // Input width, height and channel count.
16 | int IW,
17 | int IH,
18 | int IC,
19 | // Dilation along width and height.
20 | int DW,
21 | int DH,
22 | // Number of groups.
23 | int GC,
24 | // Kernel width and height; only used to compute the tap count of the final result.
25 | int KW,
26 | int KH,
27 | // PW and PH are not referenced by this kernel.
28 | int PW,
29 | int PH,
30 | // Stride along width and height.
31 | int SW,
32 | int SH,
33 | // Output row width.
34 | int OW)
35 | {
36 | __local half src_local[32 * 1024]; // three input rows per channel, stored back to back (channel stride 3 * IW)
37 | __local half dst_local[2 * 1024]; // staging buffer for the output row
38 |
39 | const int oh = get_group_id(0);
40 | const int oc = get_group_id(1);
41 | const int OH = get_global_size(0);
42 | const int OC = get_global_size(1);
43 |
44 | const int gc = oc / (OC / GC); // group that output channel oc belongs to
45 | // The three tap rows are oh * SH - 1, oh * SH - 1 + DH and oh * SH - 1 + 2 * DH.
46 | if (oh * SH - 1 >= 0 && oh * SH + DH + DH - 1 <= IH - 1) //dma for 3 rows
47 | {
48 | event_t e = async_work_group_copy_3D3D(
49 | src_local, // dst
50 | src_data + (gc * IC / GC) * IW * IH + (SH * oh - 1) * IW, // src
51 | IW, // num_elements_per_line
52 | 3, // num_lines
53 | DH * IW - IW, // src_line_stride
54 | 0, // dst_line_stride
55 | IC / GC, // num planes
56 | IH * IW - 3 * DH * IW, // src plane stride
57 | 0, // dst plane stride
58 | 0);
59 | wait_group_events(1, &e);
60 | } else {
61 | int ih = oh * SH - 1; // at least one tap row is outside the image: fetch each in-range row separately
62 | if (ih >= 0 && ih <= IH - 1) //dma for first row
63 | {
64 | event_t e = async_work_group_copy_2D2D(
65 | src_local, // dst
66 | src_data + (gc * IC / GC) * IW * IH + ih * IW, // src
67 | IW, // num_elements_per_line,
68 | IC / GC, // num_lines,
69 | IH * IW - IW, // src_line_stride,
70 | 2 * IW, // dst_line_stride,
71 | 0);
72 |
73 | wait_group_events(1, &e);
74 | }
75 | ih = oh * SH - 1 + DH;
76 | if (ih >= 0 && ih <= IH - 1) //dma for second row
77 | {
78 | event_t e = async_work_group_copy_2D2D(
79 | src_local + IW, // dst
80 | src_data + (gc * IC / GC) * IW * IH + ih * IW, // src
81 | IW, // num_elements_per_line,
82 | IC / GC, // num_lines,
83 | IH * IW - IW, // src_line_stride,
84 | 2 * IW, // dst_line_stride,
85 | 0);
86 | wait_group_events(1, &e);
87 | }
88 | ih = oh * SH - 1 + 2 * DH;
89 | if (ih >= 0 && ih <= IH - 1) //dma for third row
90 | {
91 | event_t e = async_work_group_copy_2D2D(
92 | src_local + 2 * IW, // dst
93 | src_data + (gc * IC / GC) * IW * IH + ih * IW, // src
94 | IW, // num_elements_per_line,
95 | IC / GC, // num_lines,
96 | IH * IW - IW, // src_line_stride,
97 | 2 * IW, // dst_line_stride,
98 | 0);
99 | wait_group_events(1, &e);
100 | }
101 | }
102 |
103 | half pad_value_half = convert_half(pad_value);
104 |
105 | //padding row: fill each tap row that lies outside the image with the pad value (first, second, third row in turn)
106 | if (oh * SH - 1 < 0 || oh * SH - 1 > IH - 1) {
107 | __local half *dst = src_local;
108 | for (int c = 0; c < IC / GC; c++) {
109 | #pragma unroll 8
110 | for (int j = 0; j < IW; j++) {
111 | dst[j] = pad_value_half;
112 | }
113 | dst += 3 * IW;
114 | }
115 | }
116 | if (oh * SH + DH - 1 > IH - 1) {
117 | __local half *dst = src_local + IW;
118 | for (int c = 0; c < IC / GC; c++) {
119 | #pragma unroll 8
120 | for (int j = 0; j < IW; j++) {
121 | dst[j] = pad_value_half;
122 | }
123 | dst += 3 * IW;
124 | }
125 | }
126 | if (oh * SH + DH + DH - 1 > IH - 1) {
127 | __local half *dst = src_local + 2 * IW;
128 | for (int c = 0; c < IC / GC; c++) {
129 | #pragma unroll 8
130 | for (int j = 0; j < IW; j++) {
131 | dst[j] = pad_value_half;
132 | }
133 | dst += 3 * IW;
134 | }
135 | }
136 |
137 | int OWS = SW * OW; // number of input columns spanned by the output row
138 | // Raw 16-bit patterns of the nine taps (inRC: row R, column C) for eight consecutive columns.
139 | ushort8 in00;
140 | ushort8 in01;
141 | ushort8 in02;
142 | ushort8 in10;
143 | ushort8 in11;
144 | ushort8 in12;
145 | ushort8 in20;
146 | ushort8 in21;
147 | ushort8 in22;
148 | // Walk the input row in chunks of eight columns.
149 | for (int ows8 = 0; ows8 < (OWS + 7) / 8; ows8++) {
150 | ushort8 val = {0, 0, 0, 0, 0, 0, 0, 0}; // per-column count of input/weight mismatches
151 | for (int ic = 0; ic < IC / GC; ++ic) {
152 | __local half *src =
153 | (__local half *)((__local half8 *)(src_local + ic * IW * 3 + IW + DW - 1) + ows8); // centre tap: middle row, column ows8 * 8 + DW - 1
154 | int weight_pos = oc * IC / GC * 3 * 3 + ic * 3 * 3; // bit index of the first of the nine weights for (oc, ic)
155 | ushort w0 = extract_weights(weights_data[((weight_pos + 0)) / 8], ((weight_pos + 0) % 8));
156 | ushort w1 = extract_weights(weights_data[((weight_pos + 1)) / 8], ((weight_pos + 1) % 8));
157 | ushort w2 = extract_weights(weights_data[((weight_pos + 2)) / 8], ((weight_pos + 2) % 8));
158 | ushort w3 = extract_weights(weights_data[((weight_pos + 3)) / 8], ((weight_pos + 3) % 8));
159 | ushort w4 = extract_weights(weights_data[((weight_pos + 4)) / 8], ((weight_pos + 4) % 8));
160 | ushort w5 = extract_weights(weights_data[((weight_pos + 5)) / 8], ((weight_pos + 5) % 8));
161 | ushort w6 = extract_weights(weights_data[((weight_pos + 6)) / 8], ((weight_pos + 6) % 8));
162 | ushort w7 = extract_weights(weights_data[((weight_pos + 7)) / 8], ((weight_pos + 7) % 8));
163 | ushort w8 = extract_weights(weights_data[((weight_pos + 8)) / 8], ((weight_pos + 8) % 8));
164 | // Rows are IW apart in local memory; columns of the window are DW apart.
165 | if ((ows8 * 8) - 1 <= IW - 1) {
166 | in00 = *((__local ushort8 *)(src - IW - DW));
167 | in01 = *((__local ushort8 *)(src - IW));
168 | in02 = *((__local ushort8 *)(src - IW + DW));
169 |
170 | in10 = *((__local ushort8 *)(src - DW));
171 | in11 = *((__local ushort8 *)(src));
172 | in12 = *((__local ushort8 *)(src + DW));
173 |
174 | in20 = *((__local ushort8 *)(src + IW - DW));
175 | in21 = *((__local ushort8 *)(src + IW));
176 | in22 = *((__local ushort8 *)(src + IW + DW));
177 | }
178 |
179 | //padding column: overwrite lanes whose column is left of the row start (first check) or past its end (remaining checks)
180 | if (ows8 * 8 - 1 < 0) {
181 | int boundary = 1 - ows8 * 8;
182 | boundary = boundary > 8 ? 8 : boundary;
183 | for (int offset = 0; offset < boundary; offset++) {
184 | *((half *)(&in00) + offset) = pad_value_half;
185 | *((half *)(&in10) + offset) = pad_value_half;
186 | *((half *)(&in20) + offset) = pad_value_half;
187 | }
188 | }
189 | if ((ows8 * 8 + 7) + DW + DW - 1 > IW - 1) {
190 | int boundary = (IW - DW - 1 - DW + 1) - ows8 * 8 + 1;
191 | boundary = boundary < 0 ? 0 : boundary;
192 | for (int offset = boundary; offset < 8; offset++) {
193 | *((half *)(&in02) + offset) = pad_value_half;
194 | *((half *)(&in12) + offset) = pad_value_half;
195 | *((half *)(&in22) + offset) = pad_value_half;
196 | }
197 | }
198 | if ((ows8 * 8 + 7) + DW - 1 > IW - 1) {
199 | int boundary = (IW - 1 - DW + 1) - ows8 * 8 + 1;
200 | boundary = boundary < 0 ? 0 : boundary;
201 | for (int offset = boundary; offset < 8; offset++) {
202 | *((half *)(&in01) + offset) = pad_value_half;
203 | *((half *)(&in11) + offset) = pad_value_half;
204 | *((half *)(&in21) + offset) = pad_value_half;
205 | }
206 | }
207 | if ((ows8 * 8 + 7) - 1 > IW - 1) {
208 | int boundary = (IW - 1 + 1) - ows8 * 8 + 1;
209 | boundary = boundary < 0 ? 0 : boundary;
210 | for (int offset = boundary; offset < 8; offset++) {
211 | *((half *)(&in00) + offset) = pad_value_half;
212 | *((half *)(&in10) + offset) = pad_value_half;
213 | *((half *)(&in20) + offset) = pad_value_half;
214 | }
215 | }
216 | // Broadcast each weight bit to all eight lanes.
217 | ushort8 w00 = (ushort8)(w0);
218 | ushort8 w01 = (ushort8)(w1);
219 | ushort8 w02 = (ushort8)(w2);
220 | ushort8 w10 = (ushort8)(w3);
221 | ushort8 w11 = (ushort8)(w4);
222 | ushort8 w12 = (ushort8)(w5);
223 | ushort8 w20 = (ushort8)(w6);
224 | ushort8 w21 = (ushort8)(w7);
225 | ushort8 w22 = (ushort8)(w8);
226 | // Binarize each tap: on the raw bit pattern, 0x0000 < in < 0x8000 means sign bit clear and not +0.
227 | ushort8 cond0 = (((in00) < (ushort8)0x8000) && (in00 > (ushort8)0x0000)) ?
228 | (ushort8)(1) :
229 | (ushort8)(0);
230 | ushort8 cond1 = (((in01) < (ushort8)0x8000) && (in01 > (ushort8)0x0000)) ?
231 | (ushort8)(1) :
232 | (ushort8)(0);
233 | ushort8 cond2 = (((in02) < (ushort8)0x8000) && (in02 > (ushort8)0x0000)) ?
234 | (ushort8)(1) :
235 | (ushort8)(0);
236 | ushort8 cond3 = (((in10) < (ushort8)0x8000) && (in10 > (ushort8)0x0000)) ?
237 | (ushort8)(1) :
238 | (ushort8)(0);
239 | ushort8 cond4 = (((in11) < (ushort8)0x8000) && (in11 > (ushort8)0x0000)) ?
240 | (ushort8)(1) :
241 | (ushort8)(0);
242 | ushort8 cond5 = (((in12) < (ushort8)0x8000) && (in12 > (ushort8)0x0000)) ?
243 | (ushort8)(1) :
244 | (ushort8)(0);
245 | ushort8 cond6 = (((in20) < (ushort8)0x8000) && (in20 > (ushort8)0x0000)) ?
246 | (ushort8)(1) :
247 | (ushort8)(0);
248 | ushort8 cond7 = (((in21) < (ushort8)0x8000) && (in21 > (ushort8)0x0000)) ?
249 | (ushort8)(1) :
250 | (ushort8)(0);
251 | ushort8 cond8 = (((in22) < (ushort8)0x8000) && (in22 > (ushort8)0x0000)) ?
252 | (ushort8)(1) :
253 | (ushort8)(0);
254 | // XOR is 1 exactly when the input bit and the weight bit differ.
255 | val += (cond0 ^ w00);
256 | val += (cond1 ^ w01);
257 | val += (cond2 ^ w02);
258 | val += (cond3 ^ w10);
259 | val += (cond4 ^ w11);
260 | val += (cond5 ^ w12);
261 | val += (cond6 ^ w20);
262 | val += (cond7 ^ w21);
263 | val += (cond8 ^ w22);
264 | }
265 | // Output = total taps minus twice the mismatch count; only columns that are multiples of SW are stored.
266 | ushort8 val_shift = val << 1;
267 | int boundary = (ows8 * 8 + 7) / SW <= OW - 1 ? (ows8 * 8 + 7) / SW : OW - 1;
268 | for (int ow = (ows8 * 8 + SW - 1) / SW; ow <= boundary; ow++) {
269 | *(dst_local + ow) =
270 | (half)(IC / GC * KH * KW - *((ushort *)(&val_shift) + ow * SW - ows8 * 8));
271 | }
272 | }
273 |
274 | barrier(CLK_LOCAL_MEM_FENCE);
275 | // Copy the finished output row to its place in dst_data.
276 | event_t e2 = async_work_group_copy(dst_data + oc * OW * OH + oh * OW, dst_local, OW, 0);
277 | wait_group_events(1, &e2);
278 | }
279 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/convolution1x1_chw.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:8717c8429d41a69337007871137f06a9e6b38c685b5b3fecc634fade0eaa7e7f
3 | size 9220
4 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/convolution1x1_chw.cl:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2018-2022 Intel Corporation
2 | // SPDX-License-Identifier: Apache-2.0
3 | //
4 |
5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7 |
8 | __kernel void Convolution1x1_NCHW( // 1x1 convolution over a planar (channel-major) input: each work-group produces one output row of one output channel.
9 | const __global half *in, // input tensor
10 | const __global half *out, // output tensor; NOTE(review): declared const although it is the destination of the final copy
11 | const __global half *w, // weights, IC consecutive values per output channel
12 | int IW, // input width
13 | int IH, // input height
14 | int IC, // input channel count
15 | int OW, // output width
16 | int OH, // output height
17 | int OC) // not referenced by this kernel
18 | {
19 | __local half in_local[8 * 1024]; // row get_group_id(0) of each input channel, IW elements per channel
20 | __local half out_local[8 * 1024]; // staging buffer for the output row
21 | // Gather the same row from each of the IC channel planes.
22 | event_t e1 = async_work_group_copy_2D2D(
23 | in_local, // dst
24 | in + get_group_id(0) * IW, // src
25 | IW, // num_elements_per_line,
26 | IC, // num_lines,
27 | IW * IH - IW, // src_line_stride,
28 | 0, // dst_line_stride,
29 | 0);
30 | wait_group_events(1, &e1);
31 |
32 | int oh = get_global_id(0);
33 | int oc = get_global_id(1);
34 | // stride, write_output and src are declared but never used in this kernel.
35 | int stride;
36 | int write_output = 0;
37 | __global half *src;
38 | // Two views of the weights of output channel oc: eight at a time and one at a time.
39 | __global half8 *w8 = (__global half8 *)(&w[oc * IC]);
40 | __global half *w1 = (__global half *)(&w[oc * IC]);
41 | // Main loop: eight output columns per iteration, covering the largest multiple of 8 not above OW.
42 | for (uint ow = 0; ow < (OW & (~0x7)); ow += 8) {
43 | uint iw = ow;
44 | uint ih = oh;
45 |
46 | half8 val8_0 = 0.0f;
47 | // in8_k points at columns iw..iw+7 of channel k; being half8-typed, in8_k[ic * IW] advances 8 * ic channels.
48 | __local half8 *in8_0 = (__local half8 *)(&in_local[iw + 0 * IW]);
49 | __local half8 *in8_1 = (__local half8 *)(&in_local[iw + 1 * IW]);
50 | __local half8 *in8_2 = (__local half8 *)(&in_local[iw + 2 * IW]);
51 | __local half8 *in8_3 = (__local half8 *)(&in_local[iw + 3 * IW]);
52 | __local half8 *in8_4 = (__local half8 *)(&in_local[iw + 4 * IW]);
53 | __local half8 *in8_5 = (__local half8 *)(&in_local[iw + 5 * IW]);
54 | __local half8 *in8_6 = (__local half8 *)(&in_local[iw + 6 * IW]);
55 | __local half8 *in8_7 = (__local half8 *)(&in_local[iw + 7 * IW]);
56 | // Accumulate eight channels per iteration, each scaled by its own weight.
57 | for (uint ic = 0; ic < IC / 8; ic++) {
58 | val8_0 += (in8_0[ic * IW]) * ((half8)w8[ic].s0);
59 | val8_0 += (in8_1[ic * IW]) * ((half8)w8[ic].s1);
60 | val8_0 += (in8_2[ic * IW]) * ((half8)w8[ic].s2);
61 | val8_0 += (in8_3[ic * IW]) * ((half8)w8[ic].s3);
62 | val8_0 += (in8_4[ic * IW]) * ((half8)w8[ic].s4);
63 | val8_0 += (in8_5[ic * IW]) * ((half8)w8[ic].s5);
64 | val8_0 += (in8_6[ic * IW]) * ((half8)w8[ic].s6);
65 | val8_0 += (in8_7[ic * IW]) * ((half8)w8[ic].s7);
66 | }
67 | // Remaining IC % 8 channels, one at a time.
68 | for (uint ic = (IC & (~0x7)); ic < IC; ++ic) {
69 | val8_0 += *((__local half8 *)(&in_local[iw + ic * IW])) * ((half8)w1[ic]);
70 | }
71 | *((__local half8 *)&out_local[ow + 0]) = (val8_0);
72 | }
73 | // Tail: compute one more group of eight columns, of which only the first OW % 8 lanes are stored below.
74 | uint iw = (OW & (~0x7));
75 | uint ih = oh;
76 |
77 | half8 val8_0 = 0.0f;
78 |
79 | __local half8 *in8_0 = (__local half8 *)(&in_local[iw + 0 * IW]);
80 | __local half8 *in8_1 = (__local half8 *)(&in_local[iw + 1 * IW]);
81 | __local half8 *in8_2 = (__local half8 *)(&in_local[iw + 2 * IW]);
82 | __local half8 *in8_3 = (__local half8 *)(&in_local[iw + 3 * IW]);
83 | __local half8 *in8_4 = (__local half8 *)(&in_local[iw + 4 * IW]);
84 | __local half8 *in8_5 = (__local half8 *)(&in_local[iw + 5 * IW]);
85 | __local half8 *in8_6 = (__local half8 *)(&in_local[iw + 6 * IW]);
86 | __local half8 *in8_7 = (__local half8 *)(&in_local[iw + 7 * IW]);
87 |
88 | for (uint ic = 0; ic < IC / 8; ic++) {
89 | val8_0 += (in8_0[ic * IW]) * ((half8)w8[ic].s0);
90 | val8_0 += (in8_1[ic * IW]) * ((half8)w8[ic].s1);
91 | val8_0 += (in8_2[ic * IW]) * ((half8)w8[ic].s2);
92 | val8_0 += (in8_3[ic * IW]) * ((half8)w8[ic].s3);
93 | val8_0 += (in8_4[ic * IW]) * ((half8)w8[ic].s4);
94 | val8_0 += (in8_5[ic * IW]) * ((half8)w8[ic].s5);
95 | val8_0 += (in8_6[ic * IW]) * ((half8)w8[ic].s6);
96 | val8_0 += (in8_7[ic * IW]) * ((half8)w8[ic].s7);
97 | }
98 |
99 | for (uint ic = (IC & (~0x7)); ic < IC; ++ic) {
100 | val8_0 += *((__local half8 *)(&in_local[iw + ic * IW])) * ((half8)w1[ic]);
101 | }
102 | for (uint ow = (OW & (~0x7)); ow < OW; ow++) {
103 | out_local[ow + 0] = (val8_0[ow % 8]);
104 | }
105 |
106 | barrier(CLK_LOCAL_MEM_FENCE);
107 | // Copy the finished row to row get_group_id(0) of output plane get_group_id(1).
108 | event_t e2 = async_work_group_copy(
109 | out + get_group_id(1) * OW * OH + get_group_id(0) * OW,
110 | out_local,
111 | OW,
112 | 0);
113 | wait_group_events(1, &e2);
114 | }
115 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/convolution1x1_hwc.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:5b6122a6bf6f50d2c7fc612d4e286559f9c96746e166892d192e1264e1ce5a2c
3 | size 4304
4 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/convolution1x1_hwc.cl:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2018-2022 Intel Corporation
2 | // SPDX-License-Identifier: Apache-2.0
3 | //
4 |
5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7 |
8 | __kernel void Convolution1x1_NHWC( // 1x1 convolution over a channel-interleaved input: each work-group produces one output row of one output channel.
9 | const __global half *in, // input tensor
10 | const __global half *out, // output tensor; NOTE(review): declared const although it is the destination of the final copy
11 | const __global half *w, // weights, IC consecutive values per output channel
12 | int IW, // input width
13 | int IH, // not referenced by this kernel
14 | int IC, // input channel count
15 | int OW, // output width
16 | int OH, // output height
17 | int OC) // not referenced by this kernel
18 | {
19 |
20 | __local half in_local[8 * 1024]; // one input row: IW pixels of IC channels each
21 | __local half out_local[8 * 1024]; // staging buffer for the output row
22 |
23 | const int sizeAct = IW * IC; // elements in one input row
24 | // Fetch input row get_group_id(0) as one contiguous block.
25 | event_t e1 = async_work_group_copy(in_local, in + get_group_id(0) * sizeAct, sizeAct, 0);
26 | wait_group_events(1, &e1);
27 |
28 | int oh = get_global_id(0);
29 | int oc = get_global_id(1);
30 | // stride, write_output and src are declared but never used in this kernel.
31 | int stride;
32 | int write_output = 0;
33 | __global half *src;
34 | // Weights of output channel oc; w1 is declared but not used below.
35 | __global half8 *w8 = (__global half8 *)(&w[oc * IC]);
36 | __global half *w1 = (__global half *)(&w[oc * IC]);
37 | // Main loop: eight output pixels per iteration, covering the largest multiple of 8 not above OW.
38 | for (uint ow = 0; ow < (OW & (~0x7)); ow += 8) {
39 | uint iw = ow;
40 | uint ih = oh;
41 | // One vector accumulator per output pixel; each lane holds a partial sum over channels.
42 | half8 val8_0 = 0.0f;
43 | half8 val8_1 = 0.0f;
44 | half8 val8_2 = 0.0f;
45 | half8 val8_3 = 0.0f;
46 | half8 val8_4 = 0.0f;
47 | half8 val8_5 = 0.0f;
48 | half8 val8_6 = 0.0f;
49 | half8 val8_7 = 0.0f;
50 | // in8_k points at the channel vector of pixel iw + k.
51 | __local half8 *in8_0 = (__local half8 *)(&in_local[(iw + 0) * IC]);
52 | __local half8 *in8_1 = (__local half8 *)(&in_local[(iw + 1) * IC]);
53 | __local half8 *in8_2 = (__local half8 *)(&in_local[(iw + 2) * IC]);
54 | __local half8 *in8_3 = (__local half8 *)(&in_local[(iw + 3) * IC]);
55 | __local half8 *in8_4 = (__local half8 *)(&in_local[(iw + 4) * IC]);
56 | __local half8 *in8_5 = (__local half8 *)(&in_local[(iw + 5) * IC]);
57 | __local half8 *in8_6 = (__local half8 *)(&in_local[(iw + 6) * IC]);
58 | __local half8 *in8_7 = (__local half8 *)(&in_local[(iw + 7) * IC]);
59 | // Multiply-accumulate eight channels at a time for all eight pixels.
60 | for (uint ic = 0; ic < IC / 8; ++ic) {
61 | val8_0 += (in8_0[ic]) * (w8[ic]);
62 | val8_1 += (in8_1[ic]) * (w8[ic]);
63 | val8_2 += (in8_2[ic]) * (w8[ic]);
64 | val8_3 += (in8_3[ic]) * (w8[ic]);
65 | val8_4 += (in8_4[ic]) * (w8[ic]);
66 | val8_5 += (in8_5[ic]) * (w8[ic]);
67 | val8_6 += (in8_6[ic]) * (w8[ic]);
68 | val8_7 += (in8_7[ic]) * (w8[ic]);
69 | }
70 | // Scalar accumulators for the remaining IC % 8 channels.
71 | half val_0 = 0.0f;
72 | half val_1 = 0.0f;
73 | half val_2 = 0.0f;
74 | half val_3 = 0.0f;
75 | half val_4 = 0.0f;
76 | half val_5 = 0.0f;
77 | half val_6 = 0.0f;
78 | half val_7 = 0.0f;
79 | for (uint ic = IC & (~0x7); ic < IC; ++ic) {
80 | val_0 += *((__local half *)in8_0 + ic) * (*((__global half *)w8 + ic));
81 | val_1 += *((__local half *)in8_1 + ic) * (*((__global half *)w8 + ic));
82 | val_2 += *((__local half *)in8_2 + ic) * (*((__global half *)w8 + ic));
83 | val_3 += *((__local half *)in8_3 + ic) * (*((__global half *)w8 + ic));
84 | val_4 += *((__local half *)in8_4 + ic) * (*((__global half *)w8 + ic));
85 | val_5 += *((__local half *)in8_5 + ic) * (*((__global half *)w8 + ic));
86 | val_6 += *((__local half *)in8_6 + ic) * (*((__global half *)w8 + ic));
87 | val_7 += *((__local half *)in8_7 + ic) * (*((__global half *)w8 + ic));
88 | }
89 | out_local[ow + 0] = __builtin_shave_sau_sumx_f16_r(val8_0) + val_0; // NOTE(review): the builtin presumably sums the eight lanes - confirm against the SHAVE intrinsic docs
90 | out_local[ow + 1] = __builtin_shave_sau_sumx_f16_r(val8_1) + val_1;
91 | out_local[ow + 2] = __builtin_shave_sau_sumx_f16_r(val8_2) + val_2;
92 | out_local[ow + 3] = __builtin_shave_sau_sumx_f16_r(val8_3) + val_3;
93 | out_local[ow + 4] = __builtin_shave_sau_sumx_f16_r(val8_4) + val_4;
94 | out_local[ow + 5] = __builtin_shave_sau_sumx_f16_r(val8_5) + val_5;
95 | out_local[ow + 6] = __builtin_shave_sau_sumx_f16_r(val8_6) + val_6;
96 | out_local[ow + 7] = __builtin_shave_sau_sumx_f16_r(val8_7) + val_7;
97 | }
98 | for (uint ow = (OW & (~0x7)); ow < OW; ow++) {
99 | // Tail: the remaining OW % 8 pixels, one per iteration, using the same vector-then-scalar scheme.
100 | uint iw = ow;
101 | uint ih = oh;
102 |
103 | half8 val8 = 0.0f;
104 |
105 | __local half8 *in8 = (__local half8 *)(&in_local[iw * IC]);
106 |
107 | for (uint ic = 0; ic < IC / 8; ++ic) {
108 | val8 += (in8[ic]) * (w8[ic]);
109 | }
110 |
111 | half val = 0.0f;
112 | for (uint ic = (IC & (~0x7)); ic < IC; ++ic) {
113 | val += (*((__local half *)in8 + ic)) * (*((__global half *)w8 + ic));
114 | }
115 | out_local[ow] = __builtin_shave_sau_sumx_f16_r(val8) + val;
116 | }
117 |
118 | barrier(CLK_LOCAL_MEM_FENCE);
119 | // Copy the finished row to offset get_group_id(1) * OW * OH + get_group_id(0) * OW of the output.
120 | event_t e2 = async_work_group_copy(
121 | out + get_group_id(1) * OW * OH + get_group_id(0) * OW,
122 | out_local,
123 | OW,
124 | 0);
125 | wait_group_events(1, &e2);
126 | }
127 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/convolution3x3.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:021bb40840ff35506972e6f6a7dea1b5f40a8db0927aaa9a6c116b152e386851
3 | size 5748
4 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/convolution3x3.cl:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2018-2022 Intel Corporation
2 | // SPDX-License-Identifier: Apache-2.0
3 | //
4 |
5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7 |
8 | __kernel void Convolution3x3( // 3x3 convolution: each work-group produces one output row of one output channel.
9 | const __global half *in_param, // input tensor, one IW x IH plane per channel
10 | const __global half *out, // output tensor; NOTE(review): declared const although it is the destination of the final copy
11 | const __global half *w, // weights, IC * 3 * 3 consecutive values per output channel
12 | int IW, // input width
13 | int IH, // input height
14 | int IC, // input channel count
15 | int OW, // output width
16 | int OH, // output height
17 | int OC, // not referenced by this kernel
18 | int KX, // not referenced by this kernel (the window is hard-coded to 3x3)
19 | int KY, // not referenced by this kernel
20 | int stride_x, // the vector path below handles only stride (1,1) and (2,2)
21 | int stride_y,
22 | int pad_x, // used only by the scalar tail loop
23 | int pad_y,
24 | int dilation_x, // used only by the scalar tail loop
25 | int dilation_y)
26 | {
27 | __local half in_local[8 * 1024]; // three consecutive input rows per channel (3 * IW elements each)
28 | __local half out_local[8 * 1024]; // staging buffer for the output row
29 | __local half w_local[8 * 1024]; // weights of the current output channel
30 |
31 | const int sizePlane = IW * IH;
32 | event_t e1 = async_work_group_copy_2D2D(
33 | in_local, // dst
34 | in_param + get_group_id(0) * stride_y * IW, // src
35 | 3 * IW, // num_elements_per_line,
36 | IC, // num_lines,
37 | IW * IH - 3 * IW, // src_line_stride,
38 | 0, // dst_line_stride,
39 | 0);
40 | wait_group_events(1, &e1);
41 |
42 | const int sizeWeight = IC * 3 * 3; // weights per output channel
43 | e1 = async_work_group_copy(w_local, w + get_group_id(1) * sizeWeight, sizeWeight, 0);
44 | wait_group_events(1, &e1);
45 |
46 | int oh = get_global_id(0);
47 | int oc = get_global_id(1);
48 | // NOTE(review): the +1 presumably accounts for the left padding column of the vector taps - confirm.
49 | __local half *in = (__local half *)in_local + 1;
50 | // Start at 0 so an unsupported stride skips the vector loop instead of using an indeterminate bound.
51 | int stride = 0;
52 | int write_output = 0;
53 | __local half *src;
54 | // stride is the number of vector iterations: 8 outputs each for stride 1, 4 outputs each for stride 2.
55 | if ((stride_x == 1) && (stride_y == 1)) {
56 | stride = OW / 8;
57 | write_output = 1;
58 | }
59 | if ((stride_x == 2) && (stride_y == 2)) {
60 | stride = OW / 4;
61 | write_output = 2;
62 | }
63 | // Vector path: every iteration evaluates the window at eight consecutive input columns.
64 | for (int ow = 0; ow < stride; ow++) {
65 | float8 val = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
66 | for (int ic = 0; ic < IC; ++ic) {
67 | src = (__local half *)((__local half8 *)(in + ic * IW * 3) + ow);
68 | __local half *k = (__local half *)(w_local + ic * 3 * 3); // the nine weights for channel ic
69 | // Previous, current and next group of eight values from each of the three rows.
70 | half8 aux_in00 = *((__local half8 *)src - 1);
71 | half8 aux_in01 = *((__local half8 *)src + 0);
72 | half8 aux_in02 = *((__local half8 *)src + 1);
73 | half8 aux_in10 = *((__local half8 *)(src + IW) - 1);
74 | half8 aux_in11 = *((__local half8 *)(src + IW) + 0);
75 | half8 aux_in12 = *((__local half8 *)(src + IW) + 1);
76 | half8 aux_in20 = *((__local half8 *)(src + IW * 2) - 1);
77 | half8 aux_in21 = *((__local half8 *)(src + IW * 2) + 0);
78 | half8 aux_in22 = *((__local half8 *)(src + IW * 2) + 1);
79 | // Reinterpret as short8 because the alignment builtin operates on short8.
80 | short8 in00 = *((short8 *)&aux_in00);
81 | short8 in01 = *((short8 *)&aux_in01);
82 | short8 in02 = *((short8 *)&aux_in02);
83 | short8 in10 = *((short8 *)&aux_in10);
84 | short8 in11 = *((short8 *)&aux_in11);
85 | short8 in12 = *((short8 *)&aux_in12);
86 | short8 in20 = *((short8 *)&aux_in20);
87 | short8 in21 = *((short8 *)&aux_in21);
88 | short8 in22 = *((short8 *)&aux_in22);
89 | // NOTE(review): alignvec presumably extracts 16 bytes from the pair at the given byte offset, i.e. the window shifted one element left (14) or right (2) - confirm against the SHAVE intrinsic docs.
90 | short8 aux_aux00 = __builtin_shave_cmu_alignvec_rri_short8(in00, in01, 14);
91 | short8 aux_aux01 = in01;
92 | short8 aux_aux02 = __builtin_shave_cmu_alignvec_rri_short8(in01, in02, 2);
93 | short8 aux_aux10 = __builtin_shave_cmu_alignvec_rri_short8(in10, in11, 14);
94 | short8 aux_aux11 = in11;
95 | short8 aux_aux12 = __builtin_shave_cmu_alignvec_rri_short8(in11, in12, 2);
96 | short8 aux_aux20 = __builtin_shave_cmu_alignvec_rri_short8(in20, in21, 14);
97 | short8 aux_aux21 = in21;
98 | short8 aux_aux22 = __builtin_shave_cmu_alignvec_rri_short8(in21, in22, 2);
99 | // Back to half8 for the arithmetic.
100 | half8 aux00 = *((half8 *)&aux_aux00);
101 | half8 aux01 = *((half8 *)&aux_aux01);
102 | half8 aux02 = *((half8 *)&aux_aux02);
103 | half8 aux10 = *((half8 *)&aux_aux10);
104 | half8 aux11 = *((half8 *)&aux_aux11);
105 | half8 aux12 = *((half8 *)&aux_aux12);
106 | half8 aux20 = *((half8 *)&aux_aux20);
107 | half8 aux21 = *((half8 *)&aux_aux21);
108 | half8 aux22 = *((half8 *)&aux_aux22);
109 | // Broadcast each of the nine weights to all eight lanes.
110 | half8 w00 = (half8)(*(k + 0));
111 | half8 w01 = (half8)(*(k + 1));
112 | half8 w02 = (half8)(*(k + 2));
113 | half8 w10 = (half8)(*(k + 3));
114 | half8 w11 = (half8)(*(k + 4));
115 | half8 w12 = (half8)(*(k + 5));
116 | half8 w20 = (half8)(*(k + 6));
117 | half8 w21 = (half8)(*(k + 7));
118 | half8 w22 = (half8)(*(k + 8));
119 | // Accumulate in float, converting to half only when the result is stored.
120 | val += convert_float8(aux00) * convert_float8(w00);
121 | val += convert_float8(aux01) * convert_float8(w01);
122 | val += convert_float8(aux02) * convert_float8(w02);
123 | val += convert_float8(aux10) * convert_float8(w10);
124 | val += convert_float8(aux11) * convert_float8(w11);
125 | val += convert_float8(aux12) * convert_float8(w12);
126 | val += convert_float8(aux20) * convert_float8(w20);
127 | val += convert_float8(aux21) * convert_float8(w21);
128 | val += convert_float8(aux22) * convert_float8(w22);
129 | }
130 | if (write_output == 2) *((__local half4 *)(out_local) + ow) = convert_half4(val.s0246); // stride 2: keep every second lane
131 | if (write_output == 1) *((__local half8 *)(out_local) + ow) = convert_half8(val); // stride 1: keep all eight lanes
132 | }
133 | // Scalar path for the output columns from the last multiple of 8 up to OW.
134 | for (int ow = OW & ~(0x7); ow < OW; ow++) {
135 | float val = 0.0f;
136 | for (int ic = 0; ic < IC; ++ic) {
137 | for (int ky = 0; ky < 3; ++ky) {
138 | for (int kx = 0; kx < 3; ++kx) {
139 | int iw = ow * stride_x - pad_x + kx * dilation_x;
140 | int ih = oh * stride_y - pad_y + ky * dilation_y; // computed but not used: rows are addressed relative to the three cached rows
141 |
142 | val += convert_float(in[ic * IW * 3 + (ky * dilation_y) * IW + iw])
143 | * convert_float(w_local[ic * 3 * 3 + ky * 3 + kx]);
144 | }
145 | }
146 | }
147 | out_local[ow] = convert_half(val);
148 | }
149 |
150 | barrier(CLK_LOCAL_MEM_FENCE);
151 | // Copy the finished row to offset get_group_id(1) * OW * OH + get_group_id(0) * OW of the output.
152 | event_t e2 = async_work_group_copy(
153 | out + get_group_id(1) * OW * OH + get_group_id(0) * OW,
154 | out_local,
155 | OW,
156 | 0);
157 | wait_group_events(1, &e2);
158 | }
159 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/correlate.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e2b24b1b5bfd1786128682ee814230653b4b63aad5b472feec9c6f4a4c833e2f
3 | size 14336
4 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/correlate.cl:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2018-2022 Intel Corporation
2 | // SPDX-License-Identifier: Apache-2.0
3 | //
4 |
5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6 |
7 | #define MAX_OPENCL_BUFF_SIZE 64 * 1024
8 |
9 | #define USE_DMA 1
10 |
11 | #if defined(USE_DMA)
12 | void dmacpyLineSrcStrideStart(global half *from, private half *to, int size, int src_width, int src_stride) // Strided DMA copy from global `from` to private `to`; returns after the transfer has been waited on.
13 | {
14 |     item_dma_event_t copyEvent = // NOTE(review): argument order presumed (src, dst, src_width, dst_width, src_stride, dst_stride, size, event) - confirm against the DMA API
15 |         WorkItemDmaCreateStrideTransaction(from, to, src_width, src_width, src_stride, src_width, size, 0);
16 |     WaitWorkItemDmaEvents(1, &copyEvent); // address-of was garbled to a copyright sign in this copy of the source
17 | }
18 |
19 | void dmacpyLineDstStrideStart(private half *from, global half *to, int size, int src_width, int src_stride) // Strided DMA copy from private `from` to global `to`; returns after the transfer has been waited on.
20 | {
21 |     item_dma_event_t copyEvent = // NOTE(review): argument order presumed (src, dst, src_width, dst_width, src_stride, dst_stride, size, event) - confirm against the DMA API
22 |         WorkItemDmaCreateStrideTransaction(from, to, src_width, src_width, src_width, src_stride, size, 0);
23 |     WaitWorkItemDmaEvents(1, &copyEvent); // address-of was garbled to a copyright sign in this copy of the source
24 | }
25 | #endif
26 |
27 | void memzero(void *ptr, size_t num) // Sets `num` bytes starting at `ptr` to zero.
28 | {
29 |     float4 *vec_dst = (float4 *)ptr; // whole 16-byte chunks are cleared with float4 stores
30 | #pragma unroll 16
31 |     for (int chunk = 0; chunk < num / 16; chunk++) {
32 |         vec_dst[chunk] = (float4)(0.f);
33 |     }
34 |     uchar *byte_dst = (uchar *)ptr; // the remaining num % 16 bytes are cleared one at a time
35 |     for (int pos = num / 16 * 16; pos < num; pos++) {
36 |         byte_dst[pos] = 0;
37 |     }
38 | }
39 |
// Accumulates horizontal cross-correlation terms for one pair of repacked
// input rows into `dline`.
//
// line0, line1  Rows stored pixel-major: the channels of pixel x start at
//               index x * max_channels.
// dline         Accumulator laid out as [displacement row][output x] with
//               `topwidth` elements per row; results are ADDED to it.
// topwidth      Number of output columns.
// bottomwidth   Number of pixels in each input row.
// stride1       Step between source pixels of consecutive output columns.
// stride2       Step between consecutive displacements in line1.
// max_channels  Per-pixel pitch of line0/line1, in halves.
// cur_subchannels  Number of channels summed per pixel in the generic path.
//
// Two paths exist:
//  * max_channels == 64: every dot product runs over all 64 channels with
//    half8 arithmetic (cur_subchannels is not consulted).
//  * otherwise: the displacement loop is unrolled four-wide, followed by a
//    one-at-a-time remainder loop; channels are summed 8 at a time with a
//    scalar tail for cur_subchannels % 8.
// NOTE(review): __builtin_shave_sau_sumx_f16_r is presumably a horizontal sum
// of the eight half lanes -- confirm against the SHAVE compiler documentation.
void __attribute__((noinline)) crosscorrh(
    __private const half *restrict line0,
    __private const half *restrict line1,
    __private half *restrict dline,
    int topwidth,
    int max_displacement,
    int neighborhood_grid_radius,
    int kernel_size,
    int padding,
    int bottomwidth,
    int stride1,
    int stride2,
    int max_channels,
    int cur_subchannels)
{
    if (max_channels == 64) {
        for (int k = 0; k < kernel_size; k++) {
            int x1 = max_displacement - padding + k;
            // Number of leading output columns whose line0 pixel would fall
            // left of the row; start at the first in-range one.
            const int skip1 = x1 >= 0 ? 0 : (-x1 + stride1 - 1) / stride1;
            x1 += skip1 * stride1;

            for (int out_x = skip1; out_x < topwidth && x1 < bottomwidth; out_x++, x1 += stride1) {
                half8 *src0 = (half8 *)(line0 + x1 * max_channels);

                int x2 = x1 - neighborhood_grid_radius * stride2;
                // Same clamping for the displaced pixel in line1.
                const int skip2 = x2 >= 0 ? 0 : (-x2 + stride2 - 1) / stride2;
                x2 += skip2 * stride2;

                for (int disp = skip2 - neighborhood_grid_radius;
                     disp <= neighborhood_grid_radius && x2 < bottomwidth;
                     disp++, x2 += stride2) {
                    half8 acc = (half8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
                    half8 *src1 = (half8 *)(line1 + x2 * max_channels);

#pragma unroll 8
                    for (int v = 0; v < max_channels / 8; v++) acc += (src0[v]) * (src1[v]);

                    half dot = __builtin_shave_sau_sumx_f16_r(acc);
                    dline[(disp + neighborhood_grid_radius) * topwidth + out_x] += (dot);
                }
            }
        }
        return;
    }

    // Generic path: arbitrary channel pitch / channel count.
    int neighborhood_grid_width = 2 * neighborhood_grid_radius + 1;
    const int vec_count = cur_subchannels / 8;  // whole half8 groups per pixel

    for (int out_x = 0; out_x < topwidth; out_x++) {
        for (int k = 0; k < kernel_size; k++) {
            int x1 = out_x * stride1 + max_displacement + k - padding;

            if ((x1 < 0) || (x1 >= bottomwidth)) continue;

            // Displacement offsets (in pixels) relative to x1, then clamped so
            // that x1 + o stays inside the row.
            int o_min = -neighborhood_grid_radius * stride2;
            int o_max = neighborhood_grid_width * stride2 - neighborhood_grid_radius * stride2;
            if ((o_min) < (-x1)) {
                o_min -= ((x1 + o_min - (stride2 - 1)) / stride2) * stride2;
            }
            if ((o_max) >= (bottomwidth + stride2 - x1)) {
                o_max -= ((x1 + o_max - bottomwidth) / stride2) * stride2;
            }

            half8 *bottom0 = (half8 *)(line0 + x1 * max_channels);

            int o = o_min;

            // Four displacements per iteration, sharing the line0 loads.
            for (; o <= o_max - 4 * stride2; o += 4 * stride2) {
                half8 *tap0 = (half8 *)(line1 + (x1 + o + 0 * stride2) * max_channels);
                half8 *tap1 = (half8 *)(line1 + (x1 + o + 1 * stride2) * max_channels);
                half8 *tap2 = (half8 *)(line1 + (x1 + o + 2 * stride2) * max_channels);
                half8 *tap3 = (half8 *)(line1 + (x1 + o + 3 * stride2) * max_channels);

                int c = 0;

                half8 acc0 = 0;
                half8 acc1 = 0;
                half8 acc2 = 0;
                half8 acc3 = 0;

                for (; c <= vec_count - 4; c += 4) {
                    acc0 += bottom0[c + 0] * tap0[c + 0];
                    acc0 += bottom0[c + 1] * tap0[c + 1];
                    acc0 += bottom0[c + 2] * tap0[c + 2];
                    acc0 += bottom0[c + 3] * tap0[c + 3];

                    acc1 += bottom0[c + 0] * tap1[c + 0];
                    acc1 += bottom0[c + 1] * tap1[c + 1];
                    acc1 += bottom0[c + 2] * tap1[c + 2];
                    acc1 += bottom0[c + 3] * tap1[c + 3];

                    acc2 += bottom0[c + 0] * tap2[c + 0];
                    acc2 += bottom0[c + 1] * tap2[c + 1];
                    acc2 += bottom0[c + 2] * tap2[c + 2];
                    acc2 += bottom0[c + 3] * tap2[c + 3];

                    acc3 += bottom0[c + 0] * tap3[c + 0];
                    acc3 += bottom0[c + 1] * tap3[c + 1];
                    acc3 += bottom0[c + 2] * tap3[c + 2];
                    acc3 += bottom0[c + 3] * tap3[c + 3];
                }

                for (; c < vec_count; c++) {
                    acc0 += bottom0[c] * tap0[c];
                    acc1 += bottom0[c] * tap1[c];
                    acc2 += bottom0[c] * tap2[c];
                    acc3 += bottom0[c] * tap3[c];
                }

                half sum0 = __builtin_shave_sau_sumx_f16_r(acc0);
                half sum1 = __builtin_shave_sau_sumx_f16_r(acc1);
                half sum2 = __builtin_shave_sau_sumx_f16_r(acc2);
                half sum3 = __builtin_shave_sau_sumx_f16_r(acc3);

                // Scalar tail for channel counts that are not a multiple of 8.
                for (c = c * 8; c < cur_subchannels; c++) {
                    const half a = line0[x1 * max_channels + c];
                    sum0 += a * line1[(x1 + o + 0 * stride2) * max_channels + c];
                    sum1 += a * line1[(x1 + o + 1 * stride2) * max_channels + c];
                    sum2 += a * line1[(x1 + o + 2 * stride2) * max_channels + c];
                    sum3 += a * line1[(x1 + o + 3 * stride2) * max_channels + c];
                }

                // Row of dline for displacement `o`; the next three
                // displacements occupy the following rows.
                const int row = (o / stride2) + neighborhood_grid_radius;
                dline[out_x + (row + 0) * topwidth] += sum0;
                dline[out_x + (row + 1) * topwidth] += sum1;
                dline[out_x + (row + 2) * topwidth] += sum2;
                dline[out_x + (row + 3) * topwidth] += sum3;
            }

            // Remaining displacements, one at a time.
            for (; o < o_max; o += 1 * stride2) {
                half8 *bottom1 = (half8 *)(line1 + (x1 + o) * max_channels);

                int c = 0;

                half8 acc = 0;
                for (; c <= vec_count - 4; c += 4) {
                    acc += bottom0[c + 0] * bottom1[c + 0];
                    acc += bottom0[c + 1] * bottom1[c + 1];
                    acc += bottom0[c + 2] * bottom1[c + 2];
                    acc += bottom0[c + 3] * bottom1[c + 3];
                }
                for (; c < vec_count; c++) {
                    acc += bottom0[c] * bottom1[c];
                }

                half sum = __builtin_shave_sau_sumx_f16_r(acc);

                for (c = c * 8; c < cur_subchannels; c++) {
                    sum += line0[x1 * max_channels + c] * line1[(x1 + o) * max_channels + c];
                }

                dline[out_x + (((o + neighborhood_grid_radius * stride2) / stride2) * topwidth)] += sum;
            }
        }
    }
}
195 |
// Correlation layer kernel: each work-item (get_global_id(0)) produces one
// output row `blockIdx_y` for every displacement channel of `top`.
//
// bottom0, bottom1  Input blobs, indexed as [channel][row][column].
// top               Output blob, indexed as [displacement][row][column] with
//                   neighborhood_grid_width * neighborhood_grid_width
//                   displacement planes.
//
// Work is split over kernel rows `j` and over batches of input channels. For
// each (j, channel batch, vertical displacement) the kernel:
//   1. loads one row of bottom0 / bottom1 and repacks it pixel-major
//      (pitch `max_channels`) into line0 / line1, zero-padding unused channels;
//   2. loads the partial sums accumulated so far from `top` into dline
//      (or clears dline on the very first pass);
//   3. calls crosscorrh to add this pass's contribution;
//   4. on the last pass divides by kernel_size^2 * bottomchannels;
//   5. writes dline back to `top`.
//
// All scratch lines live in the private buffer `cmx`, in this order:
// line0, line1, dline, and (DMA build only) dmabuf.
// NOTE(review): the "3 * bottomwidth" divisor in the max_channels computation
// presumably budgets line0, line1 and dmabuf -- confirm.
__kernel void correlate2_half(
    __global const half *restrict bottom0,
    __global const half *restrict bottom1,
    __global half *restrict top,
    int topwidth,
    int topheight,
    int bottomwidth,
    int bottomheight,
    int bottomchannels,
    int max_displacement,
    int padding,
    int neighborhood_grid_radius,
    int neighborhood_grid_width,
    int kernel_size,
    int stride1,
    int stride2)
{
    // Channel pitch that fits the scratch buffer, capped at 64.
    int max_channels = (MAX_OPENCL_BUFF_SIZE / sizeof(half) - topwidth * neighborhood_grid_width) / (3 * bottomwidth);
    max_channels = min(max_channels, 64);
    // Split the channels into equally sized batches, each at least max_channels wide.
    int subchannels_count = (bottomchannels + max_channels - 1) / max_channels;
    int subchannels = (bottomchannels + subchannels_count - 1) / subchannels_count;
    subchannels = max(subchannels, max_channels);

    // Normalization divisor applied once all contributions are accumulated.
    const int sumelems = kernel_size * kernel_size * bottomchannels;

    __private half cmx[MAX_OPENCL_BUFF_SIZE / sizeof(half)];

    __private half *line0 = cmx;
    __private half *line1 = line0 + bottomwidth * subchannels;
    __private half *dline = line1 + bottomwidth * subchannels;

    int blockIdx_y = get_global_id(0);

#if defined(USE_DMA)
    // Staging area for channel-major rows fetched by DMA before repacking.
    __private half *dmabuf = dline + topwidth * neighborhood_grid_width;
#endif

    int y1 = blockIdx_y * stride1 + max_displacement;

    for (int j = 0; j < kernel_size; j++) {
        for (int startchannel = 0; startchannel < bottomchannels; startchannel += subchannels) {
            // configure channel batching
            const int endchannel = min(startchannel + subchannels, bottomchannels);
            const int deltachannels = endchannel - startchannel;
            // Bytes of zero padding per pixel when the batch is short.
            const size_t pad_bytes = (subchannels - deltachannels) * sizeof(half);

            // load line from blob 0 with repackaging
            const int row0 = y1 + j - padding;
            const int row0_valid = row0 >= 0 && row0 < bottomheight;

            if (row0_valid) {
#if defined(USE_DMA)
                __global const half *curr =
                    bottom0 + startchannel * bottomheight * bottomwidth + row0 * bottomwidth;
                dmacpyLineSrcStrideStart(
                    curr,
                    dmabuf,
                    bottomwidth * deltachannels * sizeof(half),
                    bottomwidth * sizeof(half),
                    bottomwidth * bottomheight * sizeof(half));

                // Transpose from channel-major (dmabuf) to pixel-major (line0).
                for (int ch = 0; ch < deltachannels; ch++) {
                    for (int vx = 0; vx < bottomwidth / 8; vx++) {
                        half8 val = ((half8 *)(dmabuf + ch * bottomwidth))[vx];
                        line0[(vx * 8 + 0) * max_channels + ch] = val.s0;
                        line0[(vx * 8 + 1) * max_channels + ch] = val.s1;
                        line0[(vx * 8 + 2) * max_channels + ch] = val.s2;
                        line0[(vx * 8 + 3) * max_channels + ch] = val.s3;

                        line0[(vx * 8 + 4) * max_channels + ch] = val.s4;
                        line0[(vx * 8 + 5) * max_channels + ch] = val.s5;
                        line0[(vx * 8 + 6) * max_channels + ch] = val.s6;
                        line0[(vx * 8 + 7) * max_channels + ch] = val.s7;
                    }

                    for (int x = bottomwidth / 8 * 8; x < bottomwidth; x++) {
                        line0[(x)*max_channels + ch] = dmabuf[x + ch * bottomwidth];
                    }
                }

                if (deltachannels < subchannels)
                    for (int x = 0; x < bottomwidth; x++)
                        memzero(line0 + x * max_channels + deltachannels, pad_bytes);
#else
                for (int x = 0; x < bottomwidth; x++) {
                    for (int ch = 0; ch < deltachannels; ch++)
                        line0[x * max_channels + ch] = bottom0
                            [(ch + startchannel) * bottomheight * bottomwidth + row0 * bottomwidth + x];

                    if (deltachannels < subchannels)
                        memzero(line0 + x * max_channels + deltachannels, pad_bytes);
                }
#endif
            } else
                memzero(line0, max_channels * bottomwidth * sizeof(half));

            for (int top_channel_y = 0; top_channel_y < neighborhood_grid_width; top_channel_y++) {
                int y2 = y1 + (top_channel_y - neighborhood_grid_radius) * stride2;

                const int row1 = y2 + j - padding;
                const int row1_valid = row1 >= 0 && row1 < bottomheight;

                // First output plane of this vertical displacement, at row blockIdx_y.
                __global half *top_row =
                    top + top_channel_y * neighborhood_grid_width * topheight * topwidth + blockIdx_y * topwidth;

                // load line from blob 1 with repackaging
                if (row1_valid) {
#if defined(USE_DMA)
                    __global const half *curr =
                        bottom1 + startchannel * bottomheight * bottomwidth + row1 * bottomwidth;
                    dmacpyLineSrcStrideStart(
                        curr,
                        dmabuf,
                        bottomwidth * deltachannels * sizeof(half),
                        bottomwidth * sizeof(half),
                        bottomwidth * bottomheight * sizeof(half));

                    for (int ch = 0; ch < deltachannels; ch++) {
                        for (int vx = 0; vx < bottomwidth / 8; vx++) {
                            half8 val = ((half8 *)(dmabuf + ch * bottomwidth))[vx];
                            line1[(vx * 8 + 0) * max_channels + ch] = val.s0;
                            line1[(vx * 8 + 1) * max_channels + ch] = val.s1;
                            line1[(vx * 8 + 2) * max_channels + ch] = val.s2;
                            line1[(vx * 8 + 3) * max_channels + ch] = val.s3;

                            line1[(vx * 8 + 4) * max_channels + ch] = val.s4;
                            line1[(vx * 8 + 5) * max_channels + ch] = val.s5;
                            line1[(vx * 8 + 6) * max_channels + ch] = val.s6;
                            line1[(vx * 8 + 7) * max_channels + ch] = val.s7;
                        }

                        for (int x = bottomwidth / 8 * 8; x < bottomwidth; x++) {
                            line1[(x)*max_channels + ch] = dmabuf[x + ch * bottomwidth];
                        }
                    }
#else
                    for (int ch = 0; ch < deltachannels; ch++) {
                        __global const half *src_row =
                            bottom1 + (ch + startchannel) * bottomheight * bottomwidth + row1 * bottomwidth;

                        for (int vx = 0; vx < bottomwidth / 8; vx++) {
                            half8 val = ((__global half8 *)(src_row))[vx];
                            line1[(vx * 8 + 0) * max_channels + ch] = val.s0;
                            line1[(vx * 8 + 1) * max_channels + ch] = val.s1;
                            line1[(vx * 8 + 2) * max_channels + ch] = val.s2;
                            line1[(vx * 8 + 3) * max_channels + ch] = val.s3;

                            line1[(vx * 8 + 4) * max_channels + ch] = val.s4;
                            line1[(vx * 8 + 5) * max_channels + ch] = val.s5;
                            line1[(vx * 8 + 6) * max_channels + ch] = val.s6;
                            line1[(vx * 8 + 7) * max_channels + ch] = val.s7;
                        }
                        for (int x = bottomwidth / 8 * 8; x < bottomwidth; x++) {
                            half val = src_row[x];
                            line1[(x)*max_channels + ch] = val;
                        }
                    }
#endif
                    for (int x = 0; x < bottomwidth; x++) {
                        if (deltachannels < subchannels)
                            memzero(line1 + x * max_channels + deltachannels, pad_bytes);
                    }
                } else
                    memzero(line1, max_channels * bottomwidth * sizeof(half));

                // Start from zero on the very first pass; otherwise resume from
                // the partial sums stored in `top` by the previous pass.
                if (j == 0 && startchannel == 0) {
                    memzero(dline, neighborhood_grid_width * topwidth * sizeof(half));
                } else {
#if defined(USE_DMA)
                    dmacpyLineSrcStrideStart(
                        top_row,
                        dline,
                        topwidth * neighborhood_grid_width * sizeof(half),
                        topwidth * sizeof(half),
                        topwidth * topheight * sizeof(half));
#else
                    for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++) {
                        __global half *top_plane = top_row + top_channel_x * topheight * topwidth;

                        for (int vx = 0; vx < topwidth / 8; vx++) {
                            half8 val = ((__global half8 *)(top_plane))[vx];
                            ((half8 *)(dline + top_channel_x * topwidth))[vx] = val;
                        }
                        for (int x = (topwidth / 8) * 8; x < topwidth; x++) {
                            dline[top_channel_x * topwidth + x] = top_plane[x];
                        }
                    }
#endif
                }

                // Rows outside either input contribute nothing.
                if (row0_valid && row1_valid) {
                    crosscorrh(
                        line0,
                        line1,
                        dline,
                        topwidth,
                        max_displacement,
                        neighborhood_grid_radius,
                        kernel_size,
                        padding,
                        bottomwidth,
                        stride1,
                        stride2,
                        max_channels,
                        subchannels);
                }

                // Last kernel row and last channel batch: normalize the sums.
                if (j == kernel_size - 1 && endchannel == bottomchannels) {
                    const half scale1 = (half)sumelems;
                    const half8 scale8 = (half8)(scale1);

                    for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++) {
                        for (int vx = 0; vx < topwidth / 8; vx++) {
                            ((half8 *)(dline + top_channel_x * topwidth))[vx] =
                                ((half8 *)(dline + top_channel_x * topwidth))[vx] / scale8;
                        }
                        for (int x = (topwidth / 8) * 8; x < topwidth; x++) {
                            dline[top_channel_x * topwidth + x] = dline[top_channel_x * topwidth + x] / scale1;
                        }
                    }
                }

                // Store the (partial or final) sums back to the output blob.
#if defined(USE_DMA)
                dmacpyLineDstStrideStart(
                    dline,
                    top_row,
                    topwidth * neighborhood_grid_width * sizeof(half),
                    topwidth * sizeof(half),
                    topwidth * topheight * sizeof(half));
#else
                for (int top_channel_x = 0; top_channel_x < neighborhood_grid_width; top_channel_x++) {
                    __global half *top_plane = top_row + top_channel_x * topheight * topwidth;

                    for (int vx = 0; vx < topwidth / 8; vx++) {
                        ((__global half8 *)(top_plane))[vx] =
                            ((half8 *)(dline + top_channel_x * topwidth))[vx] + (half8){0, 0, 0, 0, 0, 0, 0, 0};
                    }
                    for (int x = (topwidth / 8) * 8; x < topwidth; x++) {
                        top_plane[x] = dline[top_channel_x * topwidth + x] + (half)0;
                    }
                }
#endif
            }
        }
    }
}
454 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/ctc.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:292de0fbb8dc6ead6970576d1b9a26a323fc9febfceb92c3af6b84496d523def
3 | size 10196
4 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/ctc.cl:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2018-2022 Intel Corporation
2 | // SPDX-License-Identifier: Apache-2.0
3 | //
4 |
5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7 |
// Linear search over the half-open range [begin, end).
//
// Returns a pointer to the first element equal to `value`, or `end` when no
// element matches (including when the range is empty).
//
// The inputs are const-qualified while the return type is not; the qualifier
// is dropped with an explicit cast so the conversion is deliberate rather
// than an implicit qualifier discard.
__global half *find(__global const half *begin, __global const half *end, half value)
{
    for (; begin != end; ++begin) {
        if (*begin == value) {
            break;
        }
    }
    // Either the first match, or `end` when the loop ran to completion.
    return (__global half *)begin;
}
18 |
// Greedy CTC decoder.
//
// probabilities        Class scores; read as [time][batch][class] with
//                      T = channels, B = height, C = width.
// sequence_indicators  Per batch, T flags; the sequence length is the index
//                      of the first 0 found from position 1 onward (T if
//                      none is found).
// output               Per batch, T values: the decoded class indices
//                      followed by -1 padding.
//
// For every time step the arg-max class is taken; it is emitted unless it is
// the last class (index C - 1, the blank) or repeats the previous step's class.
//
// NOTE(review): the local buffers are fixed at 88 * 77 and 88 halves, so the
// kernel presumably targets B == 1 with T <= 88 and C <= 77 -- confirm against
// the layer binding. The stride arguments of async_work_group_copy_2D2D are
// kept exactly as they were.
__kernel void CTCDecoder(
    __global half *restrict probabilities,
    __global half *restrict sequence_indicators,
    __global half *restrict output,
    int width,
    int height,
    int channels)
{
    __local half local_src[88 * 1 * 77];
    __local half local_dst[88 * 1];

    event_t e1 = async_work_group_copy_2D2D(
        local_src,             // dst
        probabilities,         // src
        width,                 // num_elements_per_line,
        height * channels,     // num_lines,
        width * (height - 1),  // src_line_stride,
        width * (height - 1),  // dst_line_stride,
        0);

    wait_group_events(1, &e1);

    const int T = channels;  // Time
    const int B = height;    // Batches
    const int C = width;     // Chars

    // Pre-fill the whole result with the "no label" marker.
#pragma unroll 4
    for (int i = 0; i < B * T; i++) {
        local_dst[i] = -1.h;
    }

    for (int b = 0; b < B; ++b) {
        __global const half *restrict seq_ind = sequence_indicators + b * T;
        const int seq_len = find(seq_ind + 1, seq_ind + T, 0.h) - seq_ind;
        const int time = min(seq_len, T);

        // Each batch writes its labels from the start of its own row. Keeping
        // one counter across batches would shift later rows by the number of
        // labels already emitted and could run past the end of local_dst.
        int output_index = 0;
        int prev_class_idx = -1;

#pragma unroll 4
        for (int t = 0; t < time; ++t) {
            __local const half *restrict probs = local_src + b * C + t * C * B;

            // Arg-max over the classes; ties keep the lowest index.
            int max_class_idx = 0;
            half max_prob = probs[0];
            for (int c = 1; c < C; ++c) {
                const half prob = probs[c];
                if (prob > max_prob) {
                    max_class_idx = c;
                    max_prob = prob;
                }
            }

            // Drop blanks (last class) and collapse repeated labels.
            if (max_class_idx < C - 1 && max_class_idx != prev_class_idx) {
                local_dst[b * T + output_index] = (half)max_class_idx;
                output_index++;
            }

            prev_class_idx = max_class_idx;
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    event_t e2 = async_work_group_copy_2D2D(
        output,     // dst
        local_dst,  // src
        channels,   // num_elements_per_line,
        height,     // num_lines,
        0,          // src_line_stride,
        0,          // dst_line_stride,
        0);

    wait_group_events(1, &e2);
}
95 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/customLayerBindings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 | -->
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
222 |
223 |
224 |
225 |
226 |
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
235 |
236 |
237 |
238 |
239 |
240 |
241 |
242 |
243 |
244 |
245 |
246 |
247 |
248 |
249 |
250 |
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 |
263 |
264 |
265 |
266 |
267 |
268 |
269 |
270 |
271 |
272 |
273 |
274 |
275 |
276 |
277 |
278 |
279 |
280 |
281 |
282 |
283 |
284 |
285 |
286 |
287 |
288 |
289 |
290 |
291 |
292 |
293 |
294 |
295 |
296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 |
306 |
307 |
308 |
309 |
310 |
311 |
312 |
313 |
314 |
315 |
316 |
317 |
318 |
319 |
320 |
321 |
322 |
323 |
324 |
325 |
326 |
327 |
328 |
329 |
330 |
331 |
332 |
333 |
334 |
335 |
336 |
337 |
338 |
339 |
340 |
341 |
342 |
343 |
344 |
345 |
346 |
347 |
348 |
349 |
350 |
351 |
352 |
353 |
354 |
355 |
356 |
357 |
358 |
359 |
360 |
361 |
362 |
363 |
364 |
365 |
366 |
367 |
368 |
369 |
370 |
371 |
372 |
373 |
374 |
375 |
376 |
377 |
378 |
379 |
380 |
381 |
382 |
383 |
384 |
385 |
386 |
387 |
388 |
389 |
390 |
391 |
392 |
393 |
394 |
395 |
396 |
397 |
398 |
399 |
400 |
401 |
402 |
403 |
404 |
405 |
406 |
407 |
408 |
409 |
410 |
411 |
412 |
413 |
414 |
415 |
416 |
417 |
418 |
419 |
420 |
421 |
422 |
423 |
424 |
425 |
426 |
427 |
428 |
429 |
430 |
431 |
432 |
433 |
434 |
435 |
436 |
437 |
438 |
439 |
440 |
441 |
442 |
443 |
444 |
445 |
446 |
447 |
448 |
449 |
450 |
451 |
452 |
453 |
454 |
455 |
456 |
457 |
458 |
459 |
460 |
461 |
462 |
463 |
464 |
465 |
466 |
467 |
468 |
469 |
470 |
471 |
472 |
473 |
474 |
475 |
476 |
477 |
478 |
479 |
480 |
481 |
482 |
483 |
484 |
485 |
486 |
487 |
488 |
489 |
490 |
491 |
492 |
493 |
494 |
495 |
496 |
497 |
498 |
499 |
500 |
501 |
502 |
503 |
504 |
505 |
506 |
507 |
508 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/cvtf32f16.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:703ef56f84299e76d36b3ba5a632ae3d5e3ecd54761dcfe0006ca69ddce4bc6d
3 | size 2664
4 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/cvtf32f16.cl:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2018-2022 Intel Corporation
2 | // SPDX-License-Identifier: Apache-2.0
3 | //
4 |
5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6 |
// Converts a float image to half precision, applying an affine transform:
//   outImage[i] = (half)(inImage[i] * scale + bais)
//
// One work-item handles one element; the element index is the work-item's
// linearized 3-D global id (dimension 0 varies fastest).
// The parameter name `bais` is kept as-is because kernel arguments may be
// bound by name outside this file.
__kernel void cvtf32f16(const __global float* restrict inImage,
                        __global half*  restrict outImage,
                        float   scale,
                        float   bais)
{
    const size_t row_pitch   = get_global_size(0);
    const size_t plane_pitch = row_pitch * get_global_size(1);

    int idx = get_global_id(0)
            + get_global_id(1) * row_pitch
            + get_global_id(2) * plane_pitch;

    const float scaled = inImage[idx] * scale + bais;
    outImage[idx] = convert_half(scaled);
}
18 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/cvtu8f16.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:46c943e08f37cedac77f727f55835637d4878edcc20aaa24f16ed5888d13bd43
3 | size 4588
4 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/cvtu8f16.cl:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2018-2022 Intel Corporation
2 | // SPDX-License-Identifier: Apache-2.0
3 | //
4 |
5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7 |
// Converts an 8-bit unsigned image to half precision with an affine transform:
//   dst[i] = (half)src[i] * (half)scale + (half)bias
//
// Each work-group stages its 3-D tile in local memory: the tile is copied in
// with one async 3-D copy, every work-item converts one element, and the tile
// is copied back out with a second async 3-D copy. The local buffers hold
// 8 * 1024 elements each.
// NOTE(review): the tile size is assumed not to exceed those buffers --
// confirm against the launch configuration.
__kernel void cvtu8f16(__global const uchar *restrict src, __global half *restrict dst, float scale, float bias)
{
    __local uchar local_src[8 * 1024];
    __local half local_dst[8 * 1024];

    const size_t local_w = get_local_size(0);
    const size_t local_h = get_local_size(1);
    const size_t local_d = get_local_size(2);
    const size_t global_w = get_global_size(0);
    const size_t global_h = get_global_size(1);

    // Offset of this work-group's tile inside the full image; identical for
    // the input and the output.
    const size_t tile_offset = get_group_id(0) * local_w
                             + get_group_id(1) * local_h * global_w
                             + get_group_id(2) * local_d * global_w * global_h;

    // Gaps between the end of one tile line/plane and the start of the next
    // in the global image; the local copy is dense (stride 0).
    const size_t line_gap = global_w - local_w;
    const size_t plane_gap = global_w * (global_h - local_h);

    event_t e1 = async_work_group_copy_3D3D(
        local_src,          // dst
        src + tile_offset,  // src
        local_w,            // num_elements_per_line
        local_h,            // num_lines (same value the write-back below uses)
        line_gap,           // src_line_stride
        0,                  // dst_line_stride
        local_d,            // num planes
        plane_gap,          // src plane stride
        0,                  // dst plane stride
        0);
    wait_group_events(1, &e1);

    // Position of this work-item inside the dense local tile.
    size_t idx = get_local_id(0)
               + get_local_id(1) * local_w
               + get_local_id(2) * local_w * local_h;

    local_dst[idx] = convert_half(local_src[idx]) * (half)scale + (half)bias;

    // All elements of the tile must be converted before it is copied out.
    barrier(CLK_LOCAL_MEM_FENCE);

    event_t e2 = async_work_group_copy_3D3D(
        dst + tile_offset,  // dst
        local_dst,          // src
        local_w,            // num_elements_per_line
        local_h,            // num_lines
        0,                  // src_line_stride
        line_gap,           // dst_line_stride
        local_d,            // num_planes
        0,                  // src_plane_stride
        plane_gap,          // dst_plane_stride
        0);
    wait_group_events(1, &e2);
}
49 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/detectron_prior_grid_gen.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:4c80d556d23f1c959fa10c00ff1cd9c3ae10aba607b37c7a0620d903fc7cedd8
3 | size 6972
4 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/detectron_prior_grid_gen.cl:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2018-2022 Intel Corporation
2 | // SPDX-License-Identifier: Apache-2.0
3 | //
4 |
5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7 |
// Generates the prior-box grid: every prior is shifted to the centre of each
// grid cell handled by this work-group.
//
// For grid column x and row h, each prior's four values become
//   [p0 + stride_w * (x + 0.5), p1 + stride_h * (h + 0.5),
//    p2 + stride_w * (x + 0.5), p3 + stride_h * (h + 0.5)].
//
// A work-group covers `get_local_size(0)` consecutive columns (clipped to
// grid_w) of the row given by get_group_id(1). The priors are staged in local
// memory, the results are built in local memory and written out with one
// async copy. `input_feature_map` and `input_rois` are not read here.
// NOTE(review): priors are read with a fixed pitch of 4 while results are
// written with pitch `num_anchors_per_prior`; these agree only when
// num_anchors_per_prior == 4 -- confirm that is always the case.
__kernel void experimental_detectron_prior_grid_generator(
    __global const half *restrict input_priors,
    __global const half *restrict input_feature_map,
    __global const half *restrict input_rois,
    __global half *restrict output,
    int grid_h,
    int grid_w,
    float stride_h,
    float stride_w,
    int num_priors,
    int num_anchors_per_prior)
{
    __local half local_input_priors[8 * 1024];
    __local half local_output[8 * 1024];

    event_t e1 = async_work_group_copy(local_input_priors, input_priors, num_anchors_per_prior * num_priors, 0);
    wait_group_events(1, &e1);

    // Column range [width_start, width_end) of this work-group, clipped to the grid.
    int width_start = get_group_id(0) * get_local_size(0);
    int width_end = min(width_start + get_local_size(0), (unsigned)grid_w);
    int width = width_end - width_start;

    int h = get_group_id(1);

    const half step_w = convert_half(stride_w);
    const half step_h = convert_half(stride_h);
    const half row = convert_half(h);

    for (int w = 0; w < width; ++w) {
        const half col = convert_half(width_start + w);

#pragma unroll 4
        for (int p = 0; p < num_priors; ++p) {
            const int base = (w * num_priors + p) * num_anchors_per_prior;
            __local const half *prior = local_input_priors + 4 * p;

            local_output[base + 0] = prior[0] + step_w * (col + 0.5);
            local_output[base + 1] = prior[1] + step_h * (row + 0.5);
            local_output[base + 2] = prior[2] + step_w * (col + 0.5);
            local_output[base + 3] = prior[3] + step_h * (row + 0.5);
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    event_t e2 = async_work_group_copy_2D2D(
        output + get_group_id(0) * get_local_size(0) * num_anchors_per_prior * num_priors
            + get_group_id(1) * get_local_size(1) * grid_w * num_anchors_per_prior * num_priors,  // dst
        local_output,                                                                             // src
        width * num_anchors_per_prior * num_priors,             // num_elements_per_line
        1,                                                      // num_lines
        (grid_w - width) * num_anchors_per_prior * num_priors,  // src_line_stride
        (grid_w - width) * num_anchors_per_prior * num_priors,  // dst_line_stride
        0);
    wait_group_events(1, &e2);
}
66 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/fakequantize.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:d17659bbf12a172849085003a055bfb4b91d3bb5bdc7f820395820eaa90b46ef
3 | size 15688
4 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/fakequantize.cl:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2018-2022 Intel Corporation
2 | // SPDX-License-Identifier: Apache-2.0
3 | //
4 |
5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7 |
// FakeQuantize over one channel plane per work-group.
//
// For each input value v of channel c = get_group_id(2):
//   v <= input_low                 -> output_low
//   v >  input_high                -> output_high
//   otherwise                      -> q * const2 + output_low, where
//        q = (v - input_low) * const1 rounded to an integer,
//        const1 = (levels - 1) / (input_high - input_low)   (0 if the range is empty),
//        const2 = (output_high - output_low) / (levels - 1) (0 if levels == 1).
// Each of the four limit arrays is either a single value (size == 1) or has
// one entry per channel.
//
// The W * H plane is staged in local memory; the work-item with local id
// get_local_id(1) processes row get_local_id(1), eight values at a time, with
// a scalar loop for the last W % 8 values. The vector path rounds by adding
// 0.5 and truncating through `short`; the scalar tail uses round().
// Selection between the three outcomes is done branch-free with bit masks.
__kernel void quantize(
    __global const half *restrict src_data,
    __global const half *restrict input_low,
    __global const half *restrict input_high,
    __global const half *restrict output_low,
    __global const half *restrict output_high,
    __global half *restrict dst_data,
    int levels,
    int input_low_size,
    int input_high_size,
    int output_low_size,
    int output_high_size,
    int W,
    int H)
{
    __local half local_src[15 * 1024];
    __local half local_dst[15 * 1024];

    event_t e1 = async_work_group_copy(local_src, src_data + get_group_id(2) * W * H, W * H, 0);
    wait_group_events(1, &e1);

    int c = get_group_id(2);

    // Per-channel limits, falling back to the single shared value.
    half h_ilow  = input_low[input_low_size == 1 ? 0 : c];
    half h_ihigh = input_high[input_high_size == 1 ? 0 : c];
    half h_olow  = output_low[output_low_size == 1 ? 0 : c];
    half h_ohigh = output_high[output_high_size == 1 ? 0 : c];

    // Scale factors; each degenerates to 0 instead of dividing by zero.
    half const1 = (half)(
        (h_ihigh - h_ilow) ? convert_float(levels - 1) / (convert_float(h_ihigh) - convert_float(h_ilow)) : 0.0f);
    half const2 =
        (half)((levels - 1) ? (convert_float(h_ohigh) - convert_float(h_olow)) / convert_float(levels - 1) : 0.0f);

    __local const half *restrict src = local_src + W * get_local_id(1);
    __local half *restrict dst       = local_dst + W * get_local_id(1);

    const half8 ilow8  = (half8)h_ilow;
    const half8 ihigh8 = (half8)h_ihigh;
    const half8 olow8  = (half8)h_olow;
    const half8 ohigh8 = (half8)h_ohigh;

    for (int w = 0; w < W / 8; w++) {
        half8 val = *((__local half8 *)src + w);

        // Quantized value for the in-range case.
        half8 aux = (val - ilow8) * (half8)const1 + (half8)0.5h;

        aux = (half8){
            (half)(short)(aux.s0),
            (half)(short)(aux.s1),
            (half)(short)(aux.s2),
            (half)(short)(aux.s3),
            (half)(short)(aux.s4),
            (half)(short)(aux.s5),
            (half)(short)(aux.s6),
            (half)(short)(aux.s7)};

        aux = aux * (half8)const2 + olow8;

        // Vector comparisons yield all-ones lanes where true, zero elsewhere.
        short8 below = (val <= ilow8);
        short8 above = (val > ihigh8);

        short8 pick_high = (~below & above);
        short8 pick_quant = (~below & ~above);

        short8 res = (below & as_short8(olow8)) | (pick_high & as_short8(ohigh8)) | (pick_quant & as_short8(aux));

        *((__local half8 *)dst + w) = as_half8(res);
    }

    // Scalar tail for the last W % 8 values of the row.
    for (int w = W & (~0x7); w < W; w++) {
        half val = src[w];

        short below = (val <= h_ilow) ? (short)-1 : (short)0;
        short above = (val > h_ihigh) ? (short)-1 : (short)0;

        short pick_high = (~below & above);
        short pick_quant = (~below & ~above);

        half quant = (half)(round((val - h_ilow) * const1) * const2) + h_olow;

        short res = (below & as_short(h_olow)) | (pick_high & as_short(h_ohigh)) | (pick_quant & as_short(quant));

        dst[w] = as_half(res);
    }

    barrier(CLK_LOCAL_MEM_FENCE);

    event_t e2 = async_work_group_copy(dst_data + get_group_id(2) * W * H, local_dst, W * H, 0);
    wait_group_events(1, &e2);
}
112 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/grn.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:6e3dbe5173ca93f39fecaf29f820e1704bcb485affc1a09554e4c86f8de46214
3 | size 7972
4 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/grn.cl:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2018-2022 Intel Corporation
2 | // SPDX-License-Identifier: Apache-2.0
3 | //
4 |
5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7 |
8 | __kernel void grn(__global const half *restrict src_data, __global half *restrict dst_data, int C, float bias)
9 | { // Scales each element by 1 / sqrt(bias + 1e-9 + sum of the squared values of the C planes at the same tile position).
10 | __local half src[8 * 1024];
11 | __local half dst[8 * 1024];
12 | // NOTE(review): nothing here checks that C * get_local_size(0) * get_local_size(1) fits into the 8 * 1024 element buffers.
13 | const size_t index = get_group_id(0) * get_local_size(0) + get_group_id(1) * get_local_size(1) * get_global_size(0);
14 | // Gather this work-group's tile from all C planes into src; the zero local-side strides presumably mean "tightly packed" - confirm against the async copy extension.
15 | event_t e1 = async_work_group_copy_3D3D(
16 | src, // dst
17 | src_data + index, // src
18 | get_local_size(0), // num_elements_per_line,
19 | get_local_size(1), // num_lines,
20 | get_global_size(0) - get_local_size(0), // src_line_stride,
21 | 0, // dst_line_stride,
22 | C, // num_planes,
23 | get_global_size(0) * (get_global_size(1) - get_local_size(1)), // src_plane_stride
24 | 0, // dst_plane_stride
25 | 0);
26 | wait_group_events(1, &e1);
27 | // Accumulate the sum of squares in float, seeded with bias plus a small epsilon.
28 | float variance = bias + 1e-9f;
29 |
30 | #pragma unroll 8
31 | for (int c = 0; c < C; c++) {
32 | float val = (float)src[c * get_local_size(1) * get_local_size(0)
33 | + get_local_id(1) * get_local_size(0)
34 | + get_local_id(0)];
35 | variance += val * val;
36 | }
37 | // Algebraically rsqrt(v / 16) * 0.25 == rsqrt(v); presumably the pre-division keeps the half argument in range - TODO confirm.
38 | half hvariance = (half)(native_rsqrt((half)(variance / 16.f)) * 0.25f);
39 |
40 | #pragma unroll 8
41 | for (int c = 0; c < C; c++) {
42 | dst[c * get_local_size(1) * get_local_size(0)
43 | + get_local_id(1) * get_local_size(0)
44 | + get_local_id(0)] =
45 | src[c * get_local_size(1) * get_local_size(0)
46 | + get_local_id(1) * get_local_size(0) + get_local_id(0)] * hvariance;
47 | }
48 | // Every work-item must have written its dst elements before the group-wide copy below.
49 | barrier(CLK_LOCAL_MEM_FENCE);
50 | // Scatter the packed local tile back to global memory, mirroring the strides of the gather.
51 | event_t e2 = async_work_group_copy_3D3D(
52 | dst_data + index, // dst
53 | dst, // src
54 | get_local_size(0), // num_elements_per_line,
55 | get_local_size(1), // num_lines,
56 | 0, // src_line_stride,
57 | get_global_size(0) - get_local_size(0), // dst_line_stride,
58 | C, // num_planes,
59 | 0, // src_plane_stride
60 | get_global_size(0) * (get_global_size(1) - get_local_size(1)), // dst_plane_stride
61 | 0);
62 | wait_group_events(1, &e2);
63 | }
64 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/mvn_reduction.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:581206fc5c0e0d429094bc7076d8772dc6ba69199a3ee75d269f13dc2f0d7ac8
3 | size 7840
4 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/mvn_reduction.cl:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2018-2022 Intel Corporation
2 | // SPDX-License-Identifier: Apache-2.0
3 | //
4 |
5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7 |
8 | // Set to 1 only if output is zeroed before kernel execution
9 | #define USE_ATOMICS 0
10 |
11 | void atomic_add_global(volatile __global float *source, const float operand)
12 | {
13 | // Float addition emulated with a 32-bit compare-and-swap retry loop on the value's bit pattern.
14 | typedef union {
15 | unsigned int bits;
16 | float value;
17 | } word_t;
18 | volatile __global unsigned int *cell = (volatile __global unsigned int *)source;
19 | word_t seen;
20 | word_t wanted;
21 |
22 | do {
23 | seen.value = *source;
24 | wanted.value = seen.value + operand;
25 | } while (atomic_cmpxchg(cell, seen.bits, wanted.bits) != seen.bits); // retry if the cell changed since it was read
26 | }
27 |
28 | __kernel void reduction_mean(
29 | __global const half *restrict src,
30 | __global float *restrict mean,
31 | __global float *restrict variance,
32 | int W,
33 | int H,
34 | int across_channels)
35 | { // Writes per-work-group sums of values and of squared values; with USE_ATOMICS == 0 `mean` and `variance` receive these raw sums.
36 | __local half src_line[4 * 1024];
37 | event_t e;
38 | // Stage get_local_size(1) rows of W elements for this work-group into src_line.
39 | e = async_work_group_copy_2D2D(
40 | src_line, // dst
41 | src + get_group_id(1) * get_local_size(1) * W
42 | + get_group_id(2) * get_local_size(2) * W * get_global_size(1), // src
43 | W * get_local_size(1), // num_elements_per_line,
44 | get_local_size(2), // num_lines,
45 | W * (get_global_size(1) - get_local_size(1)), // src_line_stride,
46 | 0, // dst_line_stride,
47 | 0);
48 |
49 | wait_group_events(1, &e);
50 | // h / c: row and channel handled by this work-item.
51 | int h = get_global_id(1);
52 | int c = get_global_id(2);
53 | // NOTE(review): the scratch arrays hold 8 entries, so get_local_size(1) must not exceed 8 - not checked here.
54 | const int MAX_LOCAL_SIZE = 8;
55 |
56 | __local float mbuf[MAX_LOCAL_SIZE];
57 | __local float vbuf[MAX_LOCAL_SIZE];
58 | // Zero first so that rows with h >= H contribute nothing to the group total.
59 | mbuf[get_local_id(1)] = 0;
60 | vbuf[get_local_id(1)] = 0;
61 |
62 | if (h < H) {
63 | float sum = 0.f;
64 | float sum2 = 0.f;
65 |
66 | float8 sum4 = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
67 | float8 sum24 = (float8){0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f};
68 | // Keep a scalar half pointer: rows start at local_id(1) * W, which is not half8-aligned when W % 8 != 0, so a half8 cast is unsafe.
69 | const __local half *restrict lsrc = src_line + get_local_id(1) * W;
70 |
71 | #pragma unroll 16
72 | for (size_t w = 0; w < W / 8; w++) {
73 | half8 sh = vload8(w, lsrc); // reads lsrc[8 * w .. 8 * w + 7]; only element alignment is required
74 | float8 valf = convert_float8(sh);
75 |
76 | sum4 += valf;
77 | sum24 += valf * valf;
78 | }
79 | // Scalar tail for the last W % 8 elements of the row.
80 | for (size_t w = W / 8 * 8; w < W; w++) {
81 | float val = (float)lsrc[w];
82 | sum += val;
83 | sum2 += val * val;
84 | }
85 | // Fold the eight vector lanes and the scalar tail into this row's totals.
86 | mbuf[get_local_id(1)] = sum4.s0 + sum4.s1 + sum4.s2 + sum4.s3 + sum4.s4 + sum4.s5 + sum4.s6 + sum4.s7 + sum;
87 | vbuf[get_local_id(1)] =
88 | sum24.s0 + sum24.s1 + sum24.s2 + sum24.s3 + sum24.s4 + sum24.s5 + sum24.s6 + sum24.s7 + sum2;
89 | }
90 | // All per-row totals must be visible before the first work-item combines them.
91 | barrier(CLK_LOCAL_MEM_FENCE);
92 |
93 | if (get_local_id(1) == 0) {
94 | float res = 0;
95 | float res2 = 0;
96 |
97 | for (int i = 0; i < get_local_size(1); i++) {
98 | res += mbuf[i];
99 | res2 += vbuf[i];
100 | }
101 |
102 | // requires memory reset before layer execution
103 | #if USE_ATOMICS
104 | int idx = (across_channels == 0) ? c : 0;
105 |
106 | atomic_add_global(mean + idx, res);
107 | atomic_add_global(variance + idx, res2);
108 | #else
109 | int idx = c * get_num_groups(1) + get_group_id(1);
110 | // One slot per (channel, row-group): partial results are stored, not accumulated.
111 | mean[idx] = res;
112 | variance[idx] = res2;
113 | #endif
114 | }
115 | }
116 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/mvn_scale.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:527d82ec9e71bbbfaf86c1e3f1b2beea02875cf719a45592b1f3d5e244e5c15c
3 | size 3564
4 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/mvn_scale.cl:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2018-2022 Intel Corporation
2 | // SPDX-License-Identifier: Apache-2.0
3 | //
4 |
5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7 |
8 | // Set to 1 only if output is zeroed before kernel execution
9 | #define USE_ATOMICS 0
10 |
11 | __attribute__((reqd_work_group_size(1, 1, 1))) __kernel void mvn_scale(
12 | const __global half *restrict src,
13 | __global float *restrict mean_part,
14 | __global float *restrict power_mean,
15 | __global half *restrict dst,
16 | int W,
17 | int H1,
18 | int across_channels,
19 | int normalize_variance,
20 | int nparts)
21 | { // Normalizes one row of W elements: dst = (src - mean) * k, with k = 1 / (stddev + 1e-9), or k = 1 when normalize_variance == 0.
22 | __local half src_line[4 * 1024];
23 | __local half dst_line[4 * 1024];
24 | // One single-work-item group per (row h, channel c); H and C come from the launch size, not from the H1 argument.
25 | int c = get_group_id(2);
26 | int C = get_global_size(2);
27 |
28 | int h = get_group_id(1);
29 | int H = get_global_size(1);
30 | // Fetch the row into local memory.
31 | event_t e1 = async_work_group_copy(src_line, src + c * H * W + h * W, W, 0);
32 | wait_group_events(1, &e1);
33 | // idx: first partial sum to read; scale: number of elements the statistics cover.
34 | int idx = (across_channels == 0) ? nparts * c : 0;
35 | float scale = (across_channels == 0) ? H * W : H * W * C;
36 |
37 | #if USE_ATOMICS
38 | float mean = mean_part[idx];
39 | float variance = power_mean[idx];
40 | #else
41 | // Add up the partial sums of values (mean_part) and of squared values (power_mean).
42 | int total = (across_channels == 0) ? nparts : nparts * C;
43 | float mean = 0.f;
44 | float variance = 0.f;
45 |
46 | for (int i = 0; i < total; i++) {
47 | mean += mean_part[idx + i];
48 | variance += power_mean[idx + i];
49 | }
50 | #endif
51 | // E[x^2] - E[x]^2 can round to a slightly negative value (e.g. constant input); clamp so native_sqrt never returns NaN.
52 | mean = mean / scale;
53 | variance = variance / scale;
54 | variance = fmax(variance - mean * mean, 0.f);
55 | variance = native_sqrt(variance) + 1e-9f;
56 | // From here on `variance` holds the standard deviation plus epsilon; hvariance is the multiplier applied to each element.
57 | half hmean = mean;
58 | half hvariance = (normalize_variance == 0) ? 1.f : (1.f / variance);
59 |
60 | for (size_t w = 0; w < W; w++) {
61 | dst_line[w] = (src_line[w] - hmean) * hvariance;
62 | }
63 |
64 | barrier(CLK_LOCAL_MEM_FENCE);
65 | // Write the normalized row back to the same position in dst.
66 | event_t e2 = async_work_group_copy(dst + c * H * W + h * W, dst_line, W, 0);
67 | wait_group_events(1, &e2);
68 | }
69 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/region_chw.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:5c02ada5f9718e59c1c908799a77ab383e6ca333d46c9577608bdb9c3bf15388
3 | size 22828
4 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/region_chw.cl:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2018-2022 Intel Corporation
2 | // SPDX-License-Identifier: Apache-2.0
3 | //
4 |
5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7 |
8 | __constant static half log_2_e = (half)1.442695040888963; // log2(exp(1.0))
9 |
10 | #define ALLOW_EARLY_RETURN 1
11 |
12 | static void inline logistic_activate(__local const half *restrict src, __local half *restrict dst, int offset)
13 | {
14 | // sigmoid(x) = 1 / (1 + e^-x), with e^-x evaluated as exp2(-x * log2(e)).
15 | const half decay = exp2(src[offset] * -log_2_e);
16 | dst[offset] = 1.0h / (1.0h + decay);
17 | }
18 |
19 | __kernel void region_chw(
20 | __global const half *restrict src_data,
21 | __global half *restrict dst_data,
22 | int W,
23 | int H,
24 | int classes,
25 | int coords,
26 | int num,
27 | int maskSize,
28 | int doSoftmax)
29 | { // Applies logistic / softmax activations to one box of (classes + coords + 1) planes of W * H elements; num and maskSize are not read.
30 | __local half local_src[13 * 13 * (4 + 1 + 80)];
31 | __local half local_dst[13 * 13 * (4 + 1 + 80)];
32 | // NOTE(review): the buffers hold 13 * 13 * 85 elements; box_sz is not checked against that size.
33 | const int box_sz = W * H * (classes + coords + 1);
34 | event_t e1 = async_work_group_copy(local_src, src_data + get_group_id(1) * box_sz, box_sz, 0);
35 | wait_group_events(1, &e1);
36 | // One work-item per pixel; stride is the distance between consecutive planes.
37 | const int pixel_pos = get_local_id(0);
38 | const int stride = W * H;
39 | // Work-items beyond W * H skip the math but still reach the barrier and the copy below.
40 | #if ALLOW_EARLY_RETURN
41 | if (pixel_pos < W * H)
42 | #endif
43 | {
44 | __local const half *restrict src = local_src + pixel_pos;
45 | __local half *restrict dst = local_dst + pixel_pos;
46 | // Planes 0 and 1 go through the logistic function.
47 | logistic_activate(src, dst, 0 * stride);
48 | logistic_activate(src, dst, 1 * stride);
49 |
50 | // Planes 2 and 3 are copied unchanged.
51 | dst[2 * stride] = src[2 * stride];
52 | dst[3 * stride] = src[3 * stride];
53 | // Plane 4 goes through the logistic function. NOTE(review): indices 0..4 are hard-coded, which matches coords == 4 - confirm.
54 | logistic_activate(src, dst, 4 * stride);
55 | // Skip the coords + 1 planes handled above; src / dst now address the class planes.
56 | src += (coords + 1) * stride;
57 | dst += (coords + 1) * stride;
58 | // Class planes: softmax across planes when doSoftmax is set, otherwise a per-plane logistic.
59 | if (doSoftmax) {
60 | half max_val = src[0];
61 | #pragma unroll 4
62 | for (int c = 1; c < classes; c++) {
63 | max_val = max(max_val, src[c * stride]);
64 | }
65 | // exp(x - max) is evaluated as exp2((x - max) * log2(e)); subtracting the maximum keeps the exponent <= 0.
66 | half expSum = 0.0h;
67 | #pragma unroll 4
68 | for (int c = 0; c < classes; c++) {
69 | const half e = src[c * stride] - max_val;
70 | const half tmp = exp2(e * log_2_e);
71 | dst[c * stride] = tmp;
72 | expSum += tmp;
73 | }
74 | // Normalize by the sum: groups of four planes first, then the remaining classes % 4 planes.
75 | const half recip = 1.h / expSum;
76 | int c = 0;
77 | for (; c < (classes & ~0x3); c += 4) {
78 | const half t0 = dst[(c + 0) * stride];
79 | const half t1 = dst[(c + 1) * stride];
80 | const half t2 = dst[(c + 2) * stride];
81 | const half t3 = dst[(c + 3) * stride];
82 |
83 | const half e0 = t0 * recip;
84 | const half e1 = t1 * recip;
85 | const half e2 = t2 * recip;
86 | const half e3 = t3 * recip;
87 |
88 | dst[(c + 0) * stride] = e0;
89 | dst[(c + 1) * stride] = e1;
90 | dst[(c + 2) * stride] = e2;
91 | dst[(c + 3) * stride] = e3;
92 | }
93 | for (; c < classes; c++) {
94 | dst[c * stride] *= recip;
95 | }
96 | } else {
97 | #pragma unroll 4
98 | for (int c = 0; c < classes; c++) {
99 | logistic_activate(src, dst, c * stride);
100 | }
101 | }
102 | }
103 | // All pixels must be written to local_dst before the group-wide copy.
104 | barrier(CLK_LOCAL_MEM_FENCE);
105 | // Store the whole box back at the offset it was loaded from.
106 | event_t e2 = async_work_group_copy(dst_data + get_group_id(1) * box_sz, local_dst, box_sz, 0);
107 | wait_group_events(1, &e2);
108 | }
109 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/region_hwc.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:88f7bb144d85d08e9f7879e43ed6f8722bb2f93534e5becd8c7ff2a220cdd9f3
3 | size 81896
4 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/region_hwc.cl:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2018-2022 Intel Corporation
2 | // SPDX-License-Identifier: Apache-2.0
3 | //
4 |
5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7 |
8 | __constant static half log_2_e = (half)1.442695040888963; // log2(exp(1.0))
9 |
10 | #define ALLOW_EARLY_RETURN 1
11 | // Logistic function of src[offset], stored at dst[offset * stride].
12 | static void inline logistic_activate_hwc(
13 | __local const half *restrict src,
14 | __local half *restrict dst,
15 | int offset,
16 | int stride)
17 | {
18 | // sigmoid(x) = 1 / (1 + e^-x), with e^-x evaluated as exp2(-x * log2(e)).
19 | const half decay = exp2(src[offset] * -log_2_e);
20 | dst[offset * stride] = 1.0h / (1.0h + decay);
21 | }
22 |
23 | __kernel void region_hwc(
24 | __global const half *restrict src,
25 | __global half *restrict dst,
26 | int W,
27 | int H,
28 | int classes,
29 | int coords,
30 | int num,
31 | int maskSize,
32 | int doSoftmax)
33 | { // Reads pixel-interleaved input (local_C consecutive values per pixel), applies logistic / softmax, and writes planes H * W apart.
34 | __local half local_src[13 * 13 * (4 + 1 + 80)];
35 | __local half local_dst[13 * 13 * (4 + 1 + 80)];
36 | // NOTE(review): the buffers hold 13 * 13 * 85 elements; H * W * local_C is not checked against that size.
37 | const int pixel_pos = get_local_id(0);
38 | // local_C: values per pixel for one box; c: channel offset of this work-group's box.
39 | const int local_C = classes + coords + 1;
40 | const int c = get_group_id(1) * local_C;
41 | const int h = get_group_id(0);
42 | // Boxes per pixel: num when doSoftmax is set, maskSize otherwise (branch-free select).
43 | num = (doSoftmax != 0) * num + (doSoftmax == 0) * maskSize;
44 | const int C = local_C * num;
45 | // Gather local_C values for each of H * W pixels; C - local_C is passed as the source line stride.
46 | event_t e1 = async_work_group_copy_2D2D(
47 | local_src, // dst
48 | src + h * W * C + c, // src
49 | local_C, // num_elements_per_line,
50 | H * W, // num_lines,
51 | C - local_C, // src_line_stride,
52 | 0, // dst_line_stride,
53 | 0);
54 |
55 | wait_group_events(1, &e1);
56 | // Work-items beyond W * H skip the math but still reach the barrier and the copy below.
57 | #if ALLOW_EARLY_RETURN
58 | if (pixel_pos < W * H)
59 | #endif
60 | {
61 | const int w = pixel_pos % W;
62 | const int h = pixel_pos / W;
63 | // NOTE: inside this block h, src and dst shadow the outer h and the kernel's global src / dst.
64 | __local const half *restrict src = local_src + h * W * local_C + w * local_C;
65 | __local half *restrict dst = local_dst + h * W + w;
66 | // Values 0 and 1 of the pixel go through the logistic function into planes 0 and 1.
67 | const int stride = H * W;
68 | logistic_activate_hwc(src, dst, 0, stride);
69 | logistic_activate_hwc(src, dst, 1, stride);
70 |
71 | // Values 2 and 3 are copied unchanged into planes 2 and 3.
72 | dst[2 * stride] = src[2];
73 | dst[3 * stride] = src[3];
74 | // NOTE(review): indices 0..4 are hard-coded, which matches coords == 4 - confirm.
75 | logistic_activate_hwc(src, dst, 4, stride);
76 | // Skip the coords + 1 values handled above; src / dst now address the class scores.
77 | src += coords + 1;
78 | dst += (coords + 1) * stride;
79 | // Class scores: softmax when doSoftmax is set, otherwise a per-class logistic.
80 | if (doSoftmax) {
81 | half max_val = src[0];
82 | #pragma unroll 4
83 | for (int c = 1; c < classes; c++) {
84 | max_val = max(max_val, src[c]);
85 | }
86 | // exp(x - max) is evaluated as exp2((x - max) * log2(e)); subtracting the maximum keeps the exponent <= 0.
87 | half expSum = 0.0h;
88 | #pragma unroll 4
89 | for (int c = 0; c < classes; c++) {
90 | const half e = src[c] - max_val;
91 | const half tmp = exp2(e * log_2_e);
92 | dst[c * stride] = tmp;
93 | expSum += tmp;
94 | }
95 | // Normalize the stored exponentials by their sum.
96 | const half invExpSum = 1.0h / expSum;
97 | #pragma unroll 4
98 | for (int c = 0; c < classes; c++) {
99 | dst[c * stride] *= invExpSum;
100 | }
101 | } else {
102 | #pragma unroll 4
103 | for (int c = 0; c < classes; c++) {
104 | logistic_activate_hwc(src, dst, c, stride);
105 | }
106 | }
107 | }
108 | // All pixels must be written to local_dst before the group-wide copy.
109 | barrier(CLK_LOCAL_MEM_FENCE);
110 | // The output is written as one contiguous box of box_sz elements per work-group.
111 | const int box_sz = W * H * (classes + coords + 1);
112 | event_t e2 = async_work_group_copy(dst + get_group_id(1) * box_sz, local_dst, box_sz, 0);
113 | wait_group_events(1, &e2);
114 | }
115 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/reorg_chw.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:16758c3a629f5e397b7b51d417686fd745603c119e1f5d9985b05f4f3ef7efc7
3 | size 12208
4 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/reorg_chw.cl:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2018-2022 Intel Corporation
2 | // SPDX-License-Identifier: Apache-2.0
3 | //
4 |
5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7 |
8 | __kernel void reorg_chw(
9 | __global const half *restrict src,
10 | __global half *restrict dst,
11 | int W,
12 | int H,
13 | int C,
14 | int stride)
15 | { // Stages rows of W * stride elements in local memory and de-interleaves every stride-th element into rows of W; H and C are not read.
16 | __local half local_src[8 * 1024];
17 | __local half local_dst[8 * 1024];
18 | // Load get_local_size(0) lines of W * stride elements each into local_src, packed back to back.
19 | event_t e1 = async_work_group_copy_2D2D(
20 | local_src, // dst
21 | src + get_group_id(1) * W * stride
22 | + get_group_id(0) * W * stride * stride, // src
23 | W * stride, // num_elements_per_line,
24 | get_local_size(0), // num_lines,
25 | W * stride * (stride * get_num_groups(0) - 1), // src_line_stride,
26 | 0, // dst_line_stride,
27 | 0);
28 | wait_group_events(1, &e1);
29 | // c selects the staged line, stride_x the phase (0 .. stride - 1) picked out of each group of `stride` elements.
30 | const int c = get_local_id(0);
31 | const int stride_x = get_local_id(1);
32 |
33 | const int srcIdx = stride_x + c * W * stride;
34 | const int dstIdx = stride_x * W * get_local_size(0) + c * W;
35 | // Eight outputs per iteration: gather with a step of `stride`, store contiguously.
36 | int x = 0;
37 | for (; x <= W - 8; x += 8) {
38 | half8 data = (half8){
39 | local_src[srcIdx + (x + 0) * stride],
40 | local_src[srcIdx + (x + 1) * stride],
41 | local_src[srcIdx + (x + 2) * stride],
42 | local_src[srcIdx + (x + 3) * stride],
43 | local_src[srcIdx + (x + 4) * stride],
44 | local_src[srcIdx + (x + 5) * stride],
45 | local_src[srcIdx + (x + 6) * stride],
46 | local_src[srcIdx + (x + 7) * stride]};
47 | // dstIdx is a multiple of W, so it is not half8-aligned when W % 8 != 0; vstore8 only needs element alignment.
48 | vstore8(data, 0, &local_dst[dstIdx + x]);
49 | }
50 | // Scalar tail for the last W % 8 outputs.
51 | for (; x < W; x++) {
52 | local_dst[dstIdx + x] = local_src[srcIdx + x * stride];
53 | }
54 | // All work-items must have filled their rows before the group-wide copy.
55 | barrier(CLK_LOCAL_MEM_FENCE);
56 | // Write get_local_size(0) * stride rows of W elements back to dst.
57 | event_t e2 = async_work_group_copy_2D2D(
58 | dst + get_group_id(0) * W
59 | + get_group_id(1) * W * stride * get_global_size(0), // dst
60 | local_dst, // src
61 | W, // num_elements_per_line
62 | get_local_size(0) * stride, // num_lines
63 | 0, // src_line_stride
64 | W * (get_num_groups(0) - 1), // dst_line_stride
65 | 0);
66 | wait_group_events(1, &e2);
67 | }
68 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/reorg_hwc.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:d57494f39baecc4011f87ab5241c1b0a19a07a7bbd14e20b135a94a0d7ecb3c1
3 | size 42144
4 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/reorg_hwc.cl:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2018-2022 Intel Corporation
2 | // SPDX-License-Identifier: Apache-2.0
3 | //
4 |
5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7 |
8 | __kernel void reorg_hwc(
9 | __global half const *restrict src,
10 | __global half *restrict dst,
11 | int W,
12 | int H,
13 | int C,
14 | int stride)
15 | { // Gathers H * W / stride runs of `stride` elements into local memory, permutes them, and scatters runs of the same length back.
16 | __local half local_src[8 * 1024];
17 | __local half local_dst[8 * 1024];
18 | // NOTE(review): H * W is not checked against the 8 * 1024 element local buffers.
19 | event_t e1 = async_work_group_copy_2D2D(
20 | local_src, // dst
21 | src + get_group_id(0) * stride + get_group_id(1) * C, // src
22 | stride, // num_elements_per_line
23 | H * W / stride, // num_lines
24 | (C - 1) * stride, // src_line_stride
25 | 0, // dst_line_stride
26 | 0);
27 | wait_group_events(1, &e1);
28 | // Work-item coordinates inside the group.
29 | const int stride_y = get_local_id(1);
30 | const int blocks = get_local_size(0);
31 | const int b = get_local_id(0);
32 | // Local views: the source is indexed as IH x IW x IC and the destination as OH x OW x OC, with the channel index fastest in both.
33 | const int OC = stride * stride;
34 | const int OH = H / stride;
35 | const int OW = W / stride;
36 | const int IC = stride;
37 | const int IH = H;
38 | const int IW = W / stride;
39 | // Each work-item handles `stride` consecutive line numbers.
40 | for (int block_h = 0; block_h < stride; block_h++) {
41 | const int src_line = b * stride * stride + stride_y * stride + block_h;
42 | const int c = src_line / IH;
43 | const int h = src_line % IH;
44 | // The destination line number uses the same (b, stride_y, block_h) triple with different weights; quotient = channel, remainder = row.
45 | const int dst_line = b * stride + stride_y * blocks * stride + block_h;
46 | const int oc = dst_line / OH;
47 | const int oh = dst_line % OH;
48 | // Move one element per column w from its (h, w, c) slot to its (oh, w, oc) slot.
49 | for (int w = 0; w < W / stride; w++) {
50 | local_dst[oh * OW * OC + w * OC + oc] = local_src[h * IW * IC + w * IC + c];
51 | }
52 | }
53 | // All work-items must have written local_dst before the group-wide copy.
54 | barrier(CLK_LOCAL_MEM_FENCE);
55 | // Scatter with the same run length and starting offset as the gather above.
56 | event_t e2 = async_work_group_copy_2D2D(
57 | dst + get_group_id(1) * C + get_group_id(0) * stride, // dst
58 | local_dst, // src
59 | stride, // num_elements_per_line
60 | W * H / stride, // num_lines
61 | 0, // src_line_stride
62 | C * stride - stride, // dst_line_stride
63 | 0);
64 | wait_group_events(1, &e2);
65 | }
66 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/reorg_hwc_naive.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:14d3b747218694b644afe03e205955c1a5be042b2d7e62d261973a4ea1b8aaa8
3 | size 13396
4 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/reorg_hwc_naive.cl:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2018-2022 Intel Corporation
2 | // SPDX-License-Identifier: Apache-2.0
3 | //
4 |
5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6 |
7 | __kernel void reorg_hwc_naive(
8 | __global half const *restrict src,
9 | __global half *restrict dst,
10 | int W,
11 | int H,
12 | int C,
13 | int stride)
14 | {
15 | // One work-item per channel index get_global_id(0); elements are moved one at a time through global memory.
16 | const int block = stride * stride;
17 | const int out_c = C / block;
18 | const int oc = C * block;
19 | const int oh = H / stride;
20 | const int ow = W / stride;
21 | const int out_plane = oh * ow;
22 | const int in_plane = H * W;
23 | const int c = get_global_id(0);
24 |
25 | for (int h = 0; h < H; ++h) {
26 | const int c2 = c % out_c;
27 | const int offset = c / out_c;
28 | const int w2 = offset % stride;
29 | const int h2 = h * stride + offset / stride;
30 | const int in_base = W * (h + H * c);
31 | const int out_base = w2 + W * stride * (h2 + H * stride * c2);
32 |
33 | #pragma unroll 2
34 | for (int i = 0; i < W; ++i) {
35 | // write position: split the running index over the oh x ow grid, then recombine with the channel index fastest
36 | const int in_index = in_base + i;
37 | const int rem = in_index % out_plane;
38 | const int new_z = in_index / out_plane;
39 | const int new_y = rem / ow;
40 | const int new_x = rem % ow;
41 | const int new_index = new_z + new_x * oc + new_y * oc * ow;
42 |
43 | // read position: split the strided index over H x W planes, then recombine with the channel index fastest
44 | const int out_index = out_base + i * stride;
45 | const int k0 = out_index / in_plane;
46 | const int j0 = (out_index % in_plane) / W;
47 | const int i0 = (out_index % in_plane) % W;
48 | const int out_index_repack = k0 + C * i0 + C * W * j0;
49 |
50 | dst[new_index] = src[out_index_repack];
51 | }
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/resample_AA.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:98ca582aaa70d3e7cc339aba88150eb12839ab67a0fac9c563f8c8ea37e705e2
3 | size 67860
4 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/resample_AA.cl:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2018-2022 Intel Corporation
2 | // SPDX-License-Identifier: Apache-2.0
3 | //
4 |
5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7 |
8 | #define USE_OPTIMIZED_ROUND
9 | // ROUND(x): (int)(x + 0.5f) when USE_OPTIMIZED_ROUND is defined, otherwise (int)round(x); the two disagree for negative x.
10 | #ifdef USE_OPTIMIZED_ROUND
11 | #define ROUND(x) ((int)((x) + 0.5f))
12 | #else
13 | #define ROUND(x) (int)(round(x))
14 | #endif
15 |
16 | inline int out_to_in(float ox, float f)
17 | { // Maps output coordinate ox to an input index by dividing its pixel centre (ox + 0.5) by f.
18 | #ifndef USE_OPTIMIZED_ROUND
19 | return ROUND((ox + 0.5f) / f - 0.5f);
20 | #else
21 | return (int)((0.5f + ox) / f);
22 | #endif
23 | }
24 |
25 | static inline float triangleCoeff(float x) { const float dist = fabs(x); return 1.0f - dist; } // triangle weight: 1 at x == 0, 0 at |x| == 1
26 |
27 | static inline float4 triangleCoeff4(float4 x) { const float4 dist = fabs(x); return 1.0f - dist; } // per-lane triangle weight
28 |
29 | __kernel void resample_with_antialias(
30 | __global const half *restrict src,
31 | __global half *restrict dst,
32 | int iw,
33 | int ih,
34 | float factor,
35 | int ow,
36 | int oh,
37 | int channels)
38 | { // Each work-item produces one output row as a triangle-weighted average of nearby input pixels; oh and channels are not read.
39 | __local half local_src[20 * 1024];
40 | __local half local_dst[8 * 1024];
41 | // r: half-width of the filter window in input pixels (2 when factor > 1, otherwise ceil(1 / factor)).
42 | const int r = (factor > 1.0f) ? 2 : ceil(1.0f / factor);
43 | const int oy_first = get_group_id(1) * get_local_size(1);
44 | const int oy_last = (get_group_id(1) + 1) * get_local_size(1) - 1;
45 | const int iy_first = max(out_to_in(oy_first, factor) - r, 0);
46 | const int iy_last = min(out_to_in(oy_last, factor) + r, ih - 1);
47 | const int iy_size = iy_last - iy_first + 1;
48 | // Stage input rows iy_first .. iy_last of get_local_size(2) channels into local memory.
49 | event_t e1 = async_work_group_copy_2D2D(
50 | local_src, // dst
51 | src + get_group_id(2) * get_local_size(2) * ih * iw + iy_first * iw, // src
52 | iy_size * iw, // num_elements_per_line,
53 | get_local_size(2), // num_lines,
54 | (ih - iy_size) * iw, // src_line_stride,
55 | 0, // dst_line_stride,
56 | 0);
57 | wait_group_events(1, &e1);
58 | // iy_f: centre of this output row in input coordinates, relative to the first staged row. NOTE(review): iy is never read.
59 | const int oy = get_global_id(1);
60 | const float iy_f = ((oy + 0.5f) / factor - 0.5f) - iy_first;
61 | const int iy = ROUND(iy_f);
62 | // NOTE(review): start_src is advanced by get_local_id(1) rows while dy below is measured from row 0, and the store adds get_local_id(1) * ow to start_dst a second time; both only line up when get_local_id(1) == 0 - confirm the launch uses a local size of 1 in dimension 1.
63 | __local half const *restrict start_src =
64 | local_src + iw * get_local_id(1) + iw * iy_size * get_local_id(2);
65 | __local half *restrict start_dst =
66 | local_dst + ow * get_local_id(1) + ow * get_local_size(1) * get_local_id(2);
67 |
68 | for (int ox = 0; ox < ow; ox++) {
69 | const float ix_f = (float)((ox + 0.5f) / factor) - 0.5f;
70 | const int ix_i = ROUND(ix_f);
71 | // Four partial sums (values and weights) for the vectorized inner loop; lane 0 also collects the scalar tail.
72 | float4 v_sum = 0.f;
73 | float4 v_wsum = 0.f;
74 | for (int y = 0; y < iy_size; y++) {
75 | float dy = iy_f - y;
76 | int x = max(ix_i - r, 0);
77 | int end_x = min(ix_i + r, iw - 1);
78 | // dx holds the horizontal distances for four consecutive columns starting at x.
79 | float4 dx;
80 | for (int i = 0; i < 4; i++) dx[i] = ix_f - x - i;
81 | // Four columns at a time while at least four remain before end_x.
82 | for (; x < end_x - 3; x += 4, dx -= 4) {
83 | float4 w =
84 | factor * triangleCoeff4(factor * dx) * factor * triangleCoeff(factor * dy);
85 | float4 src_vec = {
86 | start_src[y * iw + x + 0],
87 | start_src[y * iw + x + 1],
88 | start_src[y * iw + x + 2],
89 | start_src[y * iw + x + 3]};
90 |
91 | v_sum += w * src_vec;
92 | v_wsum += w;
93 | }
94 | // Remaining columns up to and including end_x, one at a time.
95 | for (; x <= end_x; x++) {
96 | float dx = ix_f - x;
97 | float w = factor * triangleCoeff(factor * dx) * factor * triangleCoeff(factor * dy);
98 |
99 | v_sum[0] += w * start_src[y * iw + x];
100 | v_wsum[0] += w;
101 | }
102 | }
103 | // Horizontal reduction of the four lanes.
104 | v_sum[0] = v_sum[0] + v_sum[1] + v_sum[2] + v_sum[3];
105 | v_wsum[0] = v_wsum[0] + v_wsum[1] + v_wsum[2] + v_wsum[3];
106 | // Weighted average; a zero weight sum yields 0 instead of dividing by zero.
107 | start_dst[get_local_id(1) * ow + ox] = (!v_wsum[0]) ? 0.0f : (half)(v_sum[0] / v_wsum[0]);
108 | }
109 | // All rows must be written to local_dst before the group-wide copy.
110 | barrier(CLK_LOCAL_MEM_FENCE);
111 | // Write get_local_size(1) output rows for each of get_local_size(2) channels.
112 | event_t e2 = async_work_group_copy_2D2D(
113 | dst + get_group_id(2) * get_local_size(2) * get_global_size(1) * ow
114 | + get_group_id(1) * get_local_size(1) * ow, // dst
115 | local_dst, // src
116 | get_local_size(1) * ow, // num_elements_per_line,
117 | get_local_size(2), // num_lines,
118 | 0, // src_line_stride,
119 | (get_global_size(1) - get_local_size(1)) * ow, // dst_line_stride,
120 | 0);
121 | wait_group_events(1, &e2);
122 | }
123 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/resample_noAA.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:9c67917cd959fe4add69b139f44ef07e36c4ca37bc2b8a47b2bdfe48e8a3f559
3 | size 68828
4 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/resample_noAA.cl:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2018-2022 Intel Corporation
2 | // SPDX-License-Identifier: Apache-2.0
3 | //
4 |
5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7 |
8 | #define USE_OPTIMIZED_ROUND
9 | // ROUND(x): (int)(x + 0.5f) when USE_OPTIMIZED_ROUND is defined, otherwise (int)round(x); the two disagree for negative x.
10 | #ifdef USE_OPTIMIZED_ROUND
11 | #define ROUND(x) ((int)((x) + 0.5f))
12 | #else
13 | #define ROUND(x) (int)(round(x))
14 | #endif
15 |
16 | inline int out_to_in(float ox, float f) { const float centre = ox + 0.5f; return (int)(centre * f); } // pixel centre scaled by f, truncated
17 |
18 | void interpolationCHW_nn(__local half *psrc, __local half *pdst, int OW, int IW, int C, float rw, float rh)
19 | { // Nearest-neighbour resize of C rows: output row c of width OW is picked from input row c of width IW.
20 | float alpha = rh / 2.0f - 0.5f; // NOTE(review): the horizontal offset is derived from rh, not rw; the visible caller passes equal values - confirm intent.
21 |
22 | for (int w = 0; w < OW / 8; w++) {
23 | float fw0 = rw * (w * 8 + 0) + alpha;
24 | float fw1 = rw * (w * 8 + 1) + alpha;
25 | float fw2 = rw * (w * 8 + 2) + alpha;
26 | float fw3 = rw * (w * 8 + 3) + alpha;
27 |
28 | float fw4 = rw * (w * 8 + 4) + alpha;
29 | float fw5 = rw * (w * 8 + 5) + alpha;
30 | float fw6 = rw * (w * 8 + 6) + alpha;
31 | float fw7 = rw * (w * 8 + 7) + alpha;
32 | // Round to the nearest input column and clamp to the last valid one.
33 | int iw0 = min((int)ROUND(fw0), IW - 1);
34 | int iw1 = min((int)ROUND(fw1), IW - 1);
35 | int iw2 = min((int)ROUND(fw2), IW - 1);
36 | int iw3 = min((int)ROUND(fw3), IW - 1);
37 |
38 | int iw4 = min((int)ROUND(fw4), IW - 1);
39 | int iw5 = min((int)ROUND(fw5), IW - 1);
40 | int iw6 = min((int)ROUND(fw6), IW - 1);
41 | int iw7 = min((int)ROUND(fw7), IW - 1);
42 | // The same eight source columns are reused for every row c.
43 | for (int c = 0; c < C; c++) {
44 | half8 val = {
45 | *((__local half *)(psrc + c * IW + iw0)),
46 | *((__local half *)(psrc + c * IW + iw1)),
47 | *((__local half *)(psrc + c * IW + iw2)),
48 | *((__local half *)(psrc + c * IW + iw3)),
49 |
50 | *((__local half *)(psrc + c * IW + iw4)),
51 | *((__local half *)(psrc + c * IW + iw5)),
52 | *((__local half *)(psrc + c * IW + iw6)),
53 | *((__local half *)(psrc + c * IW + iw7)),
54 | };
55 | vstore8(val, 0, pdst + c * OW + w * 8); // c * OW is not half8-aligned when OW % 8 != 0; vstore8 only needs element alignment
56 | }
57 | }
58 | // Scalar tail for the last OW % 8 output columns.
59 | for (int w = OW / 8 * 8; w < OW; w++) {
60 | float fw = rw * w + alpha;
61 | int iw0 = min((int)ROUND(fw), IW - 1);
62 |
63 | for (int c = 0; c < C; c++) {
64 | *((__local half *)(pdst + c * OW + w)) = *((__local half *)(psrc + c * IW + iw0));
65 | }
66 | }
67 | }
68 |
69 | kernel void resample_nearest(
70 | __global const half *restrict src,
71 | __global half *restrict dst,
72 | int iw,
73 | int ih,
74 | float factor,
75 | int ow,
76 | int oh,
77 | int channels)
78 | { // Nearest-neighbour resample: stages the needed input rows of `channels` planes, resizes them in local memory and writes them back; oh is not read.
79 | __local half local_src[14 * 1024];
80 | __local half local_dst[14 * 1024];
81 | const float inv_factor = 1.0f / factor; // float literal: keeps the division in single precision instead of promoting to double
82 | const int oy_first = get_group_id(1) * get_local_size(1);
83 | const int oy_last = (get_group_id(1) + 1) * get_local_size(1) - 1;
84 | const int iy_first = out_to_in(oy_first, inv_factor);
85 | const int iy_last = out_to_in(oy_last, inv_factor);
86 | // Number of input rows that cover this group's output rows.
87 | const int iy_size = iy_last - iy_first + 1;
88 | // NOTE(review): rows are staged iy_size * iw apart per channel, but interpolationCHW_nn steps iw per channel; these agree only when iy_size == 1 - confirm.
89 | event_t e1 = async_work_group_copy_2D2D(
90 | local_src, // dst
91 | src + get_group_id(2) * channels * ih * iw + iy_first * iw, // src
92 | iy_size * iw, // num_elements_per_line,
93 | channels, // num_lines,
94 | ih * iw - iy_size * iw, // src_line_stride,
95 | 0, // dst_line_stride,
96 | 0);
97 |
98 | wait_group_events(1, &e1);
99 | // Every work-item runs the full interpolation over the shared local buffers.
100 | interpolationCHW_nn(local_src, local_dst, ow, iw, channels, inv_factor, inv_factor);
101 | barrier(CLK_LOCAL_MEM_FENCE); // local_dst must be completely written by all work-items before the group-wide copy reads it
102 | event_t e2 = async_work_group_copy_2D2D(
103 | dst + get_group_id(2) * channels * get_global_size(1) * ow + get_group_id(1) * get_local_size(1) * ow, // dst
104 | local_dst, // src
105 | get_local_size(1) * ow, // size_t num_elements_per_line,
106 | channels, // size_t num_lines,
107 | 0, // size_t src_line_stride,
108 | get_global_size(1) * ow - get_local_size(1) * ow, // size_t dst_line_stride,
109 | 0);
110 |
111 | wait_group_events(1, &e2);
112 | }
113 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/shuffle_channels.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:230aa8e01a387beb4de512de1f3599867cc74dc36578359d78f5c856af9428cd
3 | size 10740
4 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/shuffle_channels.cl:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2018-2022 Intel Corporation
2 | // SPDX-License-Identifier: Apache-2.0
3 | //
4 |
5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6 |
7 | __kernel void ShuffleChannel( // Kernel: each work-item copies one H * W plane from source channel cy * CX + cx to destination channel cx * CY + cy
8 | __global const half *restrict src_data, // input planes
9 | __global half *restrict dst_data, // output planes
10 | int C, // number of channels; work-items with id >= C exit immediately
11 | int H, // plane height (only the product H * W is used)
12 | int W, // plane width
13 | int G) // group count used to split the channel index
14 | {
15 | int c = get_global_id(0);
16 | if (c >= C) return;
17 | int CX = C / G; // integer division
18 | int CY = G;
19 | int cy = c % G;
20 | int cx = c / G;
21 | // Plane base pointers viewed as half8 for eight-element copies. NOTE(review): assumes both plane offsets are suitably aligned for half8 - confirm.
22 | __global const half8 *src_line =
23 | ((__global const half8 *)(src_data + cy * CX * H * W + cx * H * W));
24 | __global half8 *dst_line = ((__global half8 *)(dst_data + cx * CY * H * W + cy * H * W));
25 | // Bulk copy: eight halfs per iteration.
26 | for (int i = 0; i < W * H / 8; i++) {
27 | dst_line[i] = src_line[i];
28 | }
29 | // Tail: the remaining (W * H) % 8 elements, copied one by one.
30 | for (int i = W * H / 8 * 8; i < W * H; i++) {
31 | dst_data[cx * CY * H * W + cy * H * W + i] = src_data[cy * CX * H * W + cx * H * W + i];
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/st.bin:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:0f4e46dc0a701ea9d32ed113d8be30448306fea92560e617bed9605e24d1d6fb
3 | size 20376
4 |
--------------------------------------------------------------------------------
/openvino/vpu_custom_kernels/st.cl:
--------------------------------------------------------------------------------
1 | // Copyright (C) 2018-2022 Intel Corporation
2 | // SPDX-License-Identifier: Apache-2.0
3 | //
4 |
5 | #pragma OPENCL EXTENSION cl_khr_fp16 : enable
6 | #pragma OPENCL EXTENSION cl_khr_extended_async_copies : enable
7 |
8 | #define MAX_WIDTH 512
9 |
10 | __attribute__((noinline)) void calcInd( // For `length` columns starting at x0 in row y: applies the affine transform theta and stores four neighbour indices and four bilinear weights per column
11 | __global const half *restrict theta, // six affine coefficients, read as theta[0..5]
12 | __local half *restrict weight, // out: four planes of `step` elements, in the order TL, BL, TR, BR
13 | __local int *restrict ind, // out: four planes of `step` linear source indices, same order
14 | int y, // row being processed
15 | int H, // row count used for normalisation and range checks
16 | int x0, // first column of the segment
17 | int length, // number of columns to process
18 | int step, // distance between consecutive planes in weight and ind
19 | int W) // column count used for normalisation, range checks and as the row stride of the indices
20 | {
21 | float a = (float)y * 1.0f / H * 2 - 1; // maps y from [0, H) onto [-1, 1)
22 |
23 | int x = 0;
24 |
25 | float8 va = (float8){a, a, a, a, a, a, a, a};
26 | float8 vxy = (float8){x0 + 0, x0 + 1, x0 + 2, x0 + 3, x0 + 4, x0 + 5, x0 + 6, x0 + 7}; // absolute column numbers of the current eight lanes
27 | // Vector path: eight columns per iteration.
28 | for (; x <= length - 8; x += 8, vxy += 8) {
29 | float8 va1 = vxy * 1.0f / W * 2 - 1.f; // column mapped from [0, W) onto [-1, 1)
30 | // Affine transform, then map back from [-1, 1]: vx is scaled by H (row position), vy by W (column position).
31 | float8 vx = (va * theta[0] + va1 * theta[1] + theta[2] + 1.f) / 2.f * H;
32 | float8 vy = (va * theta[3] + va1 * theta[4] + theta[5] + 1.f) / 2.f * W;
33 | // Truncate toward zero, then subtract one in lanes where the value is negative.
34 | const int8 ix = convert_int8(vx) - ((vx < 0) & 1);
35 | const int8 iy = convert_int8(vy) - ((vy < 0) & 1);
36 | // Distances to the integer position (ax, ay) and their complements (bx, by) form the bilinear weights.
37 | float8 ax = vx - convert_float8(ix);
38 | float8 ay = vy - convert_float8(iy);
39 | float8 bx = 1.f - ax;
40 | float8 by = 1.f - ay;
41 | // The unions reinterpret the indices as unsigned so that one compare tests 0 <= value < limit (negatives become large unsigned values).
42 | union {
43 | int8 d;
44 | uint8 i;
45 | } check_x;
46 |
47 | check_x.d = ix;
48 | int8 b01 = check_x.i < (uint8)H; // ix within [0, H)
49 |
50 | check_x.d = ix + 1;
51 | int8 b45 = check_x.i < (uint8)H; // ix + 1 within [0, H)
52 |
53 | union {
54 | int8 d;
55 | uint8 i;
56 | } check_y;
57 |
58 | check_y.d = iy;
59 | int8 b23 = check_y.i < (uint8)W; // iy within [0, W)
60 |
61 | check_y.d = iy + 1;
62 | int8 b67 = check_y.i < (uint8)W; // iy + 1 within [0, W)
63 | // Per-neighbour validity masks (vector compares yield all bits set for true).
64 | int8 b0123 = b01 & b23;
65 | int8 b0167 = b01 & b67;
66 | int8 b4523 = b45 & b23;
67 | int8 b4567 = b45 & b67;
68 | // Linear index row * W + column of each neighbour, forced to 0 when the neighbour is out of range.
69 | int8 TL_id = ((ix + 0) * W + (iy + 0)) * (b0123 & 1);
70 | int8 BL_id = ((ix + 1) * W + (iy + 0)) * (b4523 & 1);
71 | int8 TR_id = ((ix + 0) * W + (iy + 1)) * (b0167 & 1);
72 | int8 BR_id = ((ix + 1) * W + (iy + 1)) * (b4567 & 1);
73 | // Weights are held in float/int unions so their bits can be masked with the validity masks.
74 | union {
75 | float8 f;
76 | int8 i;
77 | } w0;
78 | w0.f = bx * by;
79 | union {
80 | float8 f;
81 | int8 i;
82 | } w1;
83 | w1.f = ax * by;
84 | union {
85 | float8 f;
86 | int8 i;
87 | } w2;
88 | w2.f = bx * ay;
89 | union {
90 | float8 f;
91 | int8 i;
92 | } w3;
93 | w3.f = ax * ay;
94 | // Clear the weight bits of every out-of-range neighbour.
95 | w0.i = w0.i & b0123;
96 | w1.i = w1.i & b4523;
97 | w2.i = w2.i & b0167;
98 | w3.i = w3.i & b4567;
99 | // Store the weights as half, one plane per neighbour.
100 | *((__local half8 *)(weight + x + 0 * step)) = convert_half8(w0.f);
101 | *((__local half8 *)(weight + x + 1 * step)) = convert_half8(w1.f);
102 | *((__local half8 *)(weight + x + 2 * step)) = convert_half8(w2.f);
103 | *((__local half8 *)(weight + x + 3 * step)) = convert_half8(w3.f);
104 | // Store the indices, one plane per neighbour.
105 | *((__local int8 *)(ind + x + 0 * step)) = TL_id;
106 | *((__local int8 *)(ind + x + 1 * step)) = BL_id;
107 | *((__local int8 *)(ind + x + 2 * step)) = TR_id;
108 | *((__local int8 *)(ind + x + 3 * step)) = BR_id;
109 | }
110 | // Scalar tail: the same computation for the remaining columns.
111 | for (; x < length; x++) {
112 | float a1 = (float)(x0 + x) * 1.0f / W * 2 - 1;
113 |
114 | float fx = (a * theta[0] + a1 * theta[1] + theta[2] + 1) / 2 * H;
115 | float fy = (a * theta[3] + a1 * theta[4] + theta[5] + 1) / 2 * W;
116 |
117 | const int ix = (int)(fx) - (fx < 0);
118 | const int iy = (int)(fy) - (fy < 0);
119 |
120 | float ax = fx - ix;
121 | float ay = fy - iy;
122 | float bx = 1 - ax;
123 | float by = 1 - ay;
124 | // Explicit range checks: b0 & b1 tests ix in [0, H), b4 & b5 tests ix + 1 in [0, H).
125 | int b0 = ix >= 0;
126 | int b4 = ix >= -1;
127 | int b1 = ix < H;
128 | int b5 = ix < H - 1;
129 | // Likewise for the column: b2 & b3 tests iy in [0, W), b6 & b7 tests iy + 1 in [0, W).
130 | int b2 = iy >= 0;
131 | int b6 = iy >= -1;
132 | int b3 = iy < W;
133 | int b7 = iy < W - 1;
134 |
135 | int b01 = b0 & b1;
136 | int b23 = b2 & b3;
137 | int b45 = b4 & b5;
138 | int b67 = b6 & b7;
139 |
140 | int b0123 = b01 & b23;
141 | int b0167 = b01 & b67;
142 | int b4523 = b45 & b23;
143 | int b4567 = b45 & b67;
144 | // Out-of-range neighbours get index 0 and weight 0 (the flags are 0 or 1 here).
145 | int TL_id = ((ix + 0) * W + (iy + 0)) * b0123;
146 | int BL_id = ((ix + 1) * W + (iy + 0)) * b4523;
147 | int TR_id = ((ix + 0) * W + (iy + 1)) * b0167;
148 | int BR_id = ((ix + 1) * W + (iy + 1)) * b4567;
149 |
150 | half w0 = bx * by * b0123;
151 | half w1 = ax * by * b4523;
152 | half w2 = bx * ay * b0167;
153 | half w3 = ax * ay * b4567;
154 |
155 | weight[x + 0 * step] = w0;
156 | weight[x + 1 * step] = w1;
157 | weight[x + 2 * step] = w2;
158 | weight[x + 3 * step] = w3;
159 |
160 | ind[x + 0 * step] = TL_id;
161 | ind[x + 1 * step] = BL_id;
162 | ind[x + 2 * step] = TR_id;
163 | ind[x + 3 * step] = BR_id;
164 | }
165 | }
166 |
167 | __attribute__((noinline)) void apply( // For each of src_stride columns: reads four src samples through the stored indices and writes their weighted sum to dst
168 | __global half const *restrict src, // source samples, addressed by the values stored in ind
169 | __local half const *restrict weight, // four planes of weights, `step` elements apart
170 | __local int const *restrict ind, // four planes of indices, `step` elements apart
171 | __local half *restrict dst, // out: src_stride results
172 | int src_stride, // number of columns to produce
173 | int step) // distance between consecutive planes in weight and ind
174 | {
175 | int x = 0;
176 | for (; x <= src_stride - 8; x += 8) { // vector path: eight columns per iteration
177 | int8 TL_id = *((__local int8 *)(ind + x + 0 * step));
178 | int8 BL_id = *((__local int8 *)(ind + x + 1 * step));
179 | int8 TR_id = *((__local int8 *)(ind + x + 2 * step));
180 | int8 BR_id = *((__local int8 *)(ind + x + 3 * step));
181 |
182 | half8 w00 = *((__local half8 *)(weight + x + 0 * step));
183 | half8 w01 = *((__local half8 *)(weight + x + 1 * step));
184 | half8 w02 = *((__local half8 *)(weight + x + 2 * step));
185 | half8 w03 = *((__local half8 *)(weight + x + 3 * step));
186 | // Gather the four neighbours lane by lane.
187 | half8 TL = (half8){
188 | src[TL_id[0]], src[TL_id[1]],
189 | src[TL_id[2]], src[TL_id[3]],
190 | src[TL_id[4]], src[TL_id[5]],
191 | src[TL_id[6]], src[TL_id[7]]};
192 | half8 TR = (half8){
193 | src[TR_id[0]], src[TR_id[1]],
194 | src[TR_id[2]], src[TR_id[3]],
195 | src[TR_id[4]], src[TR_id[5]],
196 | src[TR_id[6]], src[TR_id[7]]};
197 | half8 BL = (half8){
198 | src[BL_id[0]], src[BL_id[1]],
199 | src[BL_id[2]], src[BL_id[3]],
200 | src[BL_id[4]], src[BL_id[5]],
201 | src[BL_id[6]], src[BL_id[7]]};
202 | half8 BR = (half8){
203 | src[BR_id[0]], src[BR_id[1]],
204 | src[BR_id[2]], src[BR_id[3]],
205 | src[BR_id[4]], src[BR_id[5]],
206 | src[BR_id[6]], src[BR_id[7]]};
207 | // Weighted sum; planes 0..3 pair with TL, BL, TR, BR respectively.
208 | half8 res = w00 * TL + w01 * BL + w02 * TR + w03 * BR;
209 |
210 | *((__local half8 *)(dst + x)) = res;
211 | }
212 | // Scalar tail: the same computation for the remaining columns.
213 | for (; x < src_stride; x++) {
214 | int TL_id = ind[x + 0 * step];
215 | int BL_id = ind[x + 1 * step];
216 | int TR_id = ind[x + 2 * step];
217 | int BR_id = ind[x + 3 * step];
218 |
219 | half w00 = weight[x + 0 * step];
220 | half w01 = weight[x + 1 * step];
221 | half w02 = weight[x + 2 * step];
222 | half w03 = weight[x + 3 * step];
223 |
224 | half TL = src[TL_id];
225 | half TR = src[TR_id];
226 | half BL = src[BL_id];
227 | half BR = src[BR_id];
228 |
229 | half res = w00 * TL + w01 * BL + w02 * TR + w03 * BR;
230 |
231 | dst[x] = res;
232 | }
233 | }
234 |
235 | __kernel void ocl_st( // Kernel: computes indices/weights for one row segment with calcInd, applies them to each of C channels, and copies the results to dst_data
236 | __global half const *const restrict src_data, // input planes, H * W elements per channel
237 | __global half const *const restrict theta, // affine coefficients forwarded to calcInd
238 | __global half *const restrict dst_data, // output planes
239 | int C, // number of channels
240 | int W) // row width
241 | {
242 | __local int ind[4 * MAX_WIDTH] __attribute__((aligned(16))); // four planes of MAX_WIDTH indices
243 | __local half weight[4 * MAX_WIDTH] __attribute__((aligned(16))); // four planes of MAX_WIDTH weights
244 | __local half local_dst[4 * 1024]; // NOTE(review): writes below reach C * get_local_size(1) * src_stride elements - confirm this always fits
245 |
246 | int w = get_group_id(0); // index of the MAX_WIDTH-wide column tile
247 |
248 | int y = get_global_id(1); // row handled by this work-item
249 | int H = get_global_size(1); // the row count is taken from the global size in dimension 1
250 |
251 | const int x0 = w * MAX_WIDTH;
252 | const int x1 = min(x0 + MAX_WIDTH, W);
253 | const int src_stride = x1 - x0; // number of columns in this tile
254 | // NOTE(review): ind and weight are __local yet filled from the per-work-item y - verify this is safe when get_local_size(1) > 1.
255 | calcInd(theta, weight, ind, y, H, x0, src_stride, MAX_WIDTH, W);
256 |
257 | for (int c = 0; c < C; c++) {
258 | __global half const *restrict src = src_data + c * H * W;
259 | __local half *restrict dst = local_dst + c * get_local_size(1) * src_stride + get_local_id(1) * src_stride; // local_dst is laid out as [channel][local row][column]
260 |
261 | apply(src, weight, ind, dst, src_stride, MAX_WIDTH);
262 | }
263 |
264 | barrier(CLK_LOCAL_MEM_FENCE); // local_dst writes must be complete before the group copy
265 | // Copy the tile of every channel from local_dst to its place in dst_data.
266 | event_t e = async_work_group_copy_3D3D(
267 | dst_data + get_group_id(1) * get_local_size(1) * W + x0, // dst
268 | local_dst, // src
269 | src_stride, // num_elements_per_line
270 | get_local_size(1), // num_lines
271 | 0, // src_line_stride
272 | W - src_stride, // dst_line_stride
273 | C, // num planes
274 | 0, // src plane stride
275 | W * (get_global_size(1) - get_local_size(1)), // dst plane stride
276 | 0);
277 | wait_group_events(1, &e);
278 | }
279 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | flask
2 | flask-cors
3 | gradio
4 | opencv-python
5 | numpy==1.20.3
--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Launch app.py in the background, then run the gradio demo in the foreground.
3 | exec python3 app.py & # started as a background job
4 | exec python3 gradio/demo.py # exec replaces this shell with the demo process
--------------------------------------------------------------------------------