├── .gitattributes
├── .gitignore
├── .gitmodules
├── README.md
├── archive
├── compute_linear_param_encodings.py
├── convert_model.py
├── export_quantized_model.py
└── make_calibration_samples.py
├── assets
├── b_rwkv_vocab_v20230424.txt
├── lambada_test.txt
├── mmlu_dev_dataset
│ ├── data-00000-of-00001.arrow
│ ├── dataset_info.json
│ └── state.json
├── mmlu_test_dataset.json
├── mmlu_test_dataset
│ ├── data-00000-of-00001.arrow
│ ├── dataset_info.json
│ └── state.json
├── rwkv_vocab_v20230424.txt
└── rwkv_vocab_v20230424_tts.txt
├── build_hexagon_wkv_kernel.sh
├── compute_quant_encodings_experimental.py
├── convert_model.py
├── convert_model_dlc.py
├── convert_vocab.py
├── docs
├── optrace.md
└── xelite_npu_rwkv.png
├── hexagon
├── CPU
│ └── RwkvWkvOpPackage
│ │ ├── Makefile
│ │ ├── config
│ │ └── RwkvWkvOpPackageCPU.xml
│ │ ├── makefiles
│ │ ├── Android.mk
│ │ ├── Application.mk
│ │ └── Makefile.linux-x86_64
│ │ └── src
│ │ ├── CpuCustomOpPackage.cpp
│ │ ├── RwkvWkvOpPackageInterface.cpp
│ │ ├── ops
│ │ ├── wkv6.cpp
│ │ ├── wkv7_output.cpp
│ │ └── wkv7_state.cpp
│ │ └── utils
│ │ ├── BackendUtils.hpp
│ │ ├── CPU
│ │ ├── CpuBackendUtils.cpp
│ │ └── CpuBackendUtils.hpp
│ │ └── CustomOpUtils.hpp
├── HTP
│ ├── RwkvWkvOpPackage
│ │ ├── Makefile
│ │ ├── config
│ │ │ └── RwkvWkvOpPackageHTP.xml
│ │ └── src
│ │ │ ├── RwkvWkvOpPackageInterface.cpp
│ │ │ └── ops
│ │ │ ├── wkv6.cpp
│ │ │ ├── wkv7.cpp.old
│ │ │ ├── wkv7_output.cpp
│ │ │ └── wkv7_state.cpp
│ └── prebuilt
│ │ ├── libQnnRwkvWkvOpPackageV68.so
│ │ ├── libQnnRwkvWkvOpPackageV69.so
│ │ ├── libQnnRwkvWkvOpPackageV73.so
│ │ ├── libQnnRwkvWkvOpPackageV75.so
│ │ └── libQnnRwkvWkvOpPackageV79.so
└── test
│ ├── test_qnn_wkv_kernel.py
│ └── wkv_custom.py
├── librwkv-qualcomm
├── CMakeLists.txt
├── Makefile
├── make
│ ├── Android-demo.mk
│ ├── Android-eval.mk
│ ├── Android-mmlu.mk
│ ├── Android.mk
│ ├── Application.mk
│ ├── Makefile.linux-x86_64
│ ├── Makefile.oe-linux-aarch64-gcc11.2
│ ├── Makefile.oe-linux-aarch64-gcc8.2
│ ├── Makefile.oe-linux-aarch64-gcc9.3
│ └── Makefile.ubuntu-aarch64-gcc9.4
└── src
│ ├── CMakeLists.txt
│ ├── Interfaces.hpp
│ ├── Log
│ ├── LogUtils.cpp
│ ├── LogUtils.hpp
│ ├── Logger.cpp
│ └── Logger.hpp
│ ├── PAL
│ ├── include
│ │ └── PAL
│ │ │ ├── Debug.hpp
│ │ │ ├── Directory.hpp
│ │ │ ├── DynamicLoading.hpp
│ │ │ ├── FileOp.hpp
│ │ │ ├── Path.hpp
│ │ │ └── StringOp.hpp
│ └── src
│ │ ├── common
│ │ └── StringOp.cpp
│ │ ├── linux
│ │ ├── Directory.cpp
│ │ ├── DynamicLoading.cpp
│ │ ├── FileOp.cpp
│ │ └── Path.cpp
│ │ └── windows
│ │ ├── Common.cpp
│ │ ├── Common.hpp
│ │ ├── Directory.cpp
│ │ ├── DynamicLoading.cpp
│ │ ├── FileOp.cpp
│ │ └── Path.cpp
│ ├── QnnTypeDef.hpp
│ ├── QnnTypeMacros.hpp
│ ├── Utils
│ ├── BuildId.hpp
│ ├── ClientBuffer.cpp
│ ├── ClientBuffer.hpp
│ ├── DataUtil.cpp
│ ├── DataUtil.hpp
│ ├── DmaBufAllocator.cpp
│ ├── DmaBufAllocator.hpp
│ ├── DynamicLoadUtil.cpp
│ ├── DynamicLoadUtil.hpp
│ ├── IBufferAlloc.hpp
│ ├── IOTensor.cpp
│ ├── IOTensor.hpp
│ ├── RpcMem.cpp
│ ├── RpcMem.hpp
│ ├── Utils.cpp
│ ├── Utils.hpp
│ ├── dlwrap.cpp
│ └── dlwrap.hpp
│ ├── WrapperUtils
│ ├── QnnWrapperUtils.cpp
│ └── QnnWrapperUtils.hpp
│ ├── eval_text.cpp
│ ├── half.hpp
│ ├── json.hpp
│ ├── librwkv-qualcomm-app.cpp
│ ├── librwkv-qualcomm-app.hpp
│ ├── librwkv-qualcomm.cpp
│ ├── librwkv-qualcomm.h
│ ├── main.cpp
│ ├── mmlu.cpp
│ ├── soc_detect.cpp
│ ├── soc_detect.h
│ ├── tokenizer.cpp
│ ├── tokenizer.h
│ └── trie.hpp
├── make_context_cache_binary.py
├── make_context_cache_binary_dlc.py
├── quant_encodings
└── README.md
├── quantizers
├── advanced_ptq
│ └── actmse_quantizer.py
├── base_quantizer.py
├── configs
│ ├── backend_aware_htp_quantsim_config_v75.json
│ ├── default_per_channel_config.json
│ ├── htp_quantsim_config_v75.json
│ ├── htp_quantsim_config_v75_per_channel.json
│ ├── qsim_config_per_channel_with_exceptions.json
│ ├── rwkv_activation_exceptions.json
│ └── rwkv_gptq_exceptions.json
└── exceptions.py
├── quantsim_eval_lambada.py
├── quantsim_eval_mmlu.py
├── rwkv_src
├── elemwise_ops.py
├── rwkv_model.py
├── rwkv_tokenizer.py
├── rwkv_v5_modules.py
├── rwkv_v6_modules.py
├── rwkv_v7_modules.py
├── rwkv_v7_modules_conv.py
└── wkv_custom.py
└── utils
├── dataset_builder.py
├── htp_devices_config.py
├── indexed_dataset.py
├── model_preparer.py
├── model_utils.py
└── split_onnx.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.encodings filter=lfs diff=lfs merge=lfs -text
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | onnx/
2 | libs/
3 | lib/
4 | build/
5 | output/
6 | .pkl_memoize_py3/
7 | gmon.out
8 | qacc_temp/
9 | obj/
10 | bin/
11 | *.pyc
12 | __pycache__
13 | .vscode/
14 | tmp/
15 | samples*/
16 | input_list*
17 | dataset_cache/
18 | quant_export/
19 | trace_output/
20 | test_wkv*
21 | test_data*
22 | QNN/
23 | v7_*_quant/
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MollySophia/rwkv-qualcomm/fd403b7c9f3b6c4ac5a810f334a0a51c8693fb42/.gitmodules
--------------------------------------------------------------------------------
/archive/compute_linear_param_encodings.py:
--------------------------------------------------------------------------------
1 | # from rwkv_src.modeling_rwkv6 import Rwkv6ForCausalLM
2 | from rwkv_src.rwkv_model import RWKV_RNN
3 | from transformers import AutoConfig, AutoTokenizer
4 | import types
5 | import torch
6 | import torch.nn as nn
7 | from transformers.tokenization_utils_base import BatchEncoding
8 |
9 | from utils.model_utils import get_dummy_input_for_rwkv_causal_llm
10 | from quantizers.advanced_ptq.actmse_quantizer import ActMSEQuantizer
11 | from utils.dataset_builder import DatasetBuilder
12 |
13 | import argparse
14 | from pathlib import Path
15 |
16 | parser = argparse.ArgumentParser(description='Compute param encodings for linear modules')
17 | parser.add_argument('model', type=Path, help='Path to RWKV pth file')
18 | parser.add_argument('--weights_bitwidth', type=int, default=4, help='Weights bitwidth')
19 | parser.add_argument('--use_cuda', action='store_true', default=True, help='Use CUDA')
20 | parser.add_argument('--strategy', type=str, choices=['symqt', 'symfp', 'asym'], default='asym', help='Quantization strategy')
21 | args_parser = parser.parse_args()
22 |
23 | args = types.SimpleNamespace()
24 | ##############################
25 | args.quant_scheme = "tf"
26 | args.activation_bit_width = 32
27 | args.parameter_bit_width = args_parser.weights_bitwidth
28 | args.in_place_quantsim = False
29 | args.config_file = "quantizers/configs/default_per_channel_config.json"
30 | args.num_cands = 20
31 | args.export_dir = "quant_export"
32 | args.output_dir = "quant_export"
33 | args.model_name = str(args_parser.model).replace(".pth", "").split("/")[-1]
34 | args.input_symmetry = args_parser.strategy
35 | args.exceptions_file = "quantizers/configs/rwkv_gptq_exceptions.json"
36 | args.act_mse_loss_type = "mse"
37 | args.parameter_encoding_file = None
38 | args.encoding_path = None
39 | args.do_actmse = True
40 | args.disable_act_quantizers = True
41 | args.fp16 = False
42 | args.do_train = False
43 | args.clip_activation = None
44 | args.load_sim_checkpoint = False
45 | args.save_sim_checkpoint = False
46 | ##############################
47 | args.calib_dataset_name = "wikitext"
48 | args.calib_dataset_config_name = "wikitext-2-raw-v1"
49 | args.dataset_cache_dir = "./dataset_cache"
50 | args.calib_dataset_split = None
51 | args.calib_dataset_preprocessor = "gpt2"
52 | args.eval_dataset_name = "wikitext"
53 | args.eval_dataset_config_name = "wikitext-103-raw-v1"
54 | args.eval_dataset_split = "test"
55 | args.eval_dataset_preprocessor = "gptq"
56 | args.num_calibration_batches = 20
57 | args.per_device_calib_batch_size = 1
58 | args.per_device_eval_batch_size = 1
59 | args.block_size = 1024
60 | args.seed = 1234
61 | ##############################
62 |
63 | device = torch.device("cuda") if args_parser.use_cuda and torch.cuda.is_available() else torch.device("cpu")
64 | args.device = device
65 |
66 | model_args = types.SimpleNamespace()
67 | model_args.USE_CUDA = args_parser.use_cuda
68 | model_args.fp16 = False
69 | model_args.wkv_customop = False
70 | model_args.USE_EMBEDDING = True
71 | model_args.MODEL_NAME = str(args_parser.model)
72 | model_args.RESCALE_LAYER = 0
73 | model_args.eos_token_id = 0
74 | model = RWKV_RNN(model_args)
75 |
76 | tokenizer = AutoTokenizer.from_pretrained("RWKV/rwkv-5-world-1b5", trust_remote_code=True)
77 | tokenizer.model_max_length = 1024
78 |
79 | dummy_input = get_dummy_input_for_rwkv_causal_llm(1, 1, device, model_cfg=model.args)
80 |
81 | dataset_builder = DatasetBuilder(args)
82 | dataset_builder.make_dataset(tokenizer=tokenizer, args=args, column_name="text", shuffle=True)
83 |
84 | quantizer = ActMSEQuantizer(model, args, model.args)
85 | quantizer.orig_model = model
86 | quantizer.prepare_quantsim(dummy_input, args, dataset_builder.train_dataloader, tokenizer)
87 |
--------------------------------------------------------------------------------
/archive/make_calibration_samples.py:
--------------------------------------------------------------------------------
1 | from rwkv_src.rwkv_tokenizer import RWKV_TOKENIZER
2 | from rwkv_src.rwkv_model import RWKV_RNN, make_chunks, run_prompt
3 | import types
4 | import os, sys
5 | import torch
6 | import argparse
7 | from pathlib import Path
8 |
9 | from torchvision import datasets
10 | from datasets import load_dataset
11 |
12 | def main():
13 | parser = argparse.ArgumentParser(description='Make calibration sample files')
14 | parser.add_argument('model', type=Path, help='Path to RWKV pth file')
15 | parser.add_argument('output', type=Path, help='Path to output folder')
16 | parser.add_argument('chunks', type=int, help='Number of chunks')
17 | parser.add_argument('--ext_embedding', action='store_true', default=False, help='Use external embedding')
18 | parser.add_argument('--prefill', action='store_true', default=False, help='Prefill model')
19 | args = parser.parse_args()
20 |
21 | seq_length = 32 if args.prefill else 1
22 |
23 | model_args = types.SimpleNamespace()
24 | model_args.USE_CUDA = torch.cuda.is_available()
25 | model_args.fp16 = False
26 | model_args.USE_EMBEDDING = False if args.ext_embedding else True
27 | model_args.RESCALE_LAYER = 0
28 | model_args.wkv_customop = False
29 |
30 | model_args.MODEL_NAME = str(args.model)
31 |
32 | tokenizer = RWKV_TOKENIZER("./assets/rwkv_vocab_v20230424.txt")
33 |
34 | model = make_chunks(args.chunks, model_args) if args.chunks > 1 else RWKV_RNN(model_args)
35 |
36 | dataset = load_dataset('wikitext', 'wikitext-2-raw-v1', split='train')
37 | print("dataset len:", len(dataset['text']))
38 | for i in range(20):
39 | run_prompt(model, dataset['text'][i], tokenizer=tokenizer, length=0, seq_length=seq_length, generate_samples=True, samples_output=str(args.output))
40 |
41 | if __name__ == '__main__':
42 | main()
--------------------------------------------------------------------------------
/assets/mmlu_dev_dataset/data-00000-of-00001.arrow:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MollySophia/rwkv-qualcomm/fd403b7c9f3b6c4ac5a810f334a0a51c8693fb42/assets/mmlu_dev_dataset/data-00000-of-00001.arrow
--------------------------------------------------------------------------------
/assets/mmlu_dev_dataset/dataset_info.json:
--------------------------------------------------------------------------------
1 | {
2 | "builder_name": "parquet",
3 | "citation": "",
4 | "config_name": "all",
5 | "dataset_name": "mmlu",
6 | "dataset_size": 168871380,
7 | "description": "",
8 | "download_checksums": {
9 | "hf://datasets/cais/mmlu@c30699e8356da336a370243923dbaf21066bb9fe/all/test-00000-of-00001.parquet": {
10 | "num_bytes": 3504718,
11 | "checksum": null
12 | },
13 | "hf://datasets/cais/mmlu@c30699e8356da336a370243923dbaf21066bb9fe/all/validation-00000-of-00001.parquet": {
14 | "num_bytes": 408449,
15 | "checksum": null
16 | },
17 | "hf://datasets/cais/mmlu@c30699e8356da336a370243923dbaf21066bb9fe/all/dev-00000-of-00001.parquet": {
18 | "num_bytes": 76504,
19 | "checksum": null
20 | },
21 | "hf://datasets/cais/mmlu@c30699e8356da336a370243923dbaf21066bb9fe/all/auxiliary_train-00000-of-00001.parquet": {
22 | "num_bytes": 47513731,
23 | "checksum": null
24 | }
25 | },
26 | "download_size": 51503402,
27 | "features": {
28 | "question": {
29 | "dtype": "string",
30 | "_type": "Value"
31 | },
32 | "subject": {
33 | "dtype": "string",
34 | "_type": "Value"
35 | },
36 | "choices": {
37 | "feature": {
38 | "dtype": "string",
39 | "_type": "Value"
40 | },
41 | "_type": "Sequence"
42 | },
43 | "answer": {
44 | "names": [
45 | "A",
46 | "B",
47 | "C",
48 | "D"
49 | ],
50 | "_type": "ClassLabel"
51 | }
52 | },
53 | "homepage": "",
54 | "license": "",
55 | "size_in_bytes": 220374782,
56 | "splits": {
57 | "test": {
58 | "name": "test",
59 | "num_bytes": 6969209,
60 | "num_examples": 14042,
61 | "dataset_name": "mmlu"
62 | },
63 | "validation": {
64 | "name": "validation",
65 | "num_bytes": 763676,
66 | "num_examples": 1531,
67 | "dataset_name": "mmlu"
68 | },
69 | "dev": {
70 | "name": "dev",
71 | "num_bytes": 125389,
72 | "num_examples": 285,
73 | "dataset_name": "mmlu"
74 | },
75 | "auxiliary_train": {
76 | "name": "auxiliary_train",
77 | "num_bytes": 161013106,
78 | "num_examples": 99842,
79 | "dataset_name": "mmlu"
80 | }
81 | },
82 | "version": {
83 | "version_str": "0.0.0",
84 | "major": 0,
85 | "minor": 0,
86 | "patch": 0
87 | }
88 | }
--------------------------------------------------------------------------------
/assets/mmlu_dev_dataset/state.json:
--------------------------------------------------------------------------------
1 | {
2 | "_data_files": [
3 | {
4 | "filename": "data-00000-of-00001.arrow"
5 | }
6 | ],
7 | "_fingerprint": "ca7a71e4c243f30b",
8 | "_format_columns": null,
9 | "_format_kwargs": {},
10 | "_format_type": null,
11 | "_output_all_columns": false,
12 | "_split": "dev"
13 | }
--------------------------------------------------------------------------------
/assets/mmlu_test_dataset/data-00000-of-00001.arrow:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MollySophia/rwkv-qualcomm/fd403b7c9f3b6c4ac5a810f334a0a51c8693fb42/assets/mmlu_test_dataset/data-00000-of-00001.arrow
--------------------------------------------------------------------------------
/assets/mmlu_test_dataset/dataset_info.json:
--------------------------------------------------------------------------------
1 | {
2 | "builder_name": "parquet",
3 | "citation": "",
4 | "config_name": "all",
5 | "dataset_name": "mmlu",
6 | "dataset_size": 168871380,
7 | "description": "",
8 | "download_checksums": {
9 | "hf://datasets/cais/mmlu@c30699e8356da336a370243923dbaf21066bb9fe/all/test-00000-of-00001.parquet": {
10 | "num_bytes": 3504718,
11 | "checksum": null
12 | },
13 | "hf://datasets/cais/mmlu@c30699e8356da336a370243923dbaf21066bb9fe/all/validation-00000-of-00001.parquet": {
14 | "num_bytes": 408449,
15 | "checksum": null
16 | },
17 | "hf://datasets/cais/mmlu@c30699e8356da336a370243923dbaf21066bb9fe/all/dev-00000-of-00001.parquet": {
18 | "num_bytes": 76504,
19 | "checksum": null
20 | },
21 | "hf://datasets/cais/mmlu@c30699e8356da336a370243923dbaf21066bb9fe/all/auxiliary_train-00000-of-00001.parquet": {
22 | "num_bytes": 47513731,
23 | "checksum": null
24 | }
25 | },
26 | "download_size": 51503402,
27 | "features": {
28 | "question": {
29 | "dtype": "string",
30 | "_type": "Value"
31 | },
32 | "subject": {
33 | "dtype": "string",
34 | "_type": "Value"
35 | },
36 | "choices": {
37 | "feature": {
38 | "dtype": "string",
39 | "_type": "Value"
40 | },
41 | "_type": "Sequence"
42 | },
43 | "answer": {
44 | "names": [
45 | "A",
46 | "B",
47 | "C",
48 | "D"
49 | ],
50 | "_type": "ClassLabel"
51 | }
52 | },
53 | "homepage": "",
54 | "license": "",
55 | "size_in_bytes": 220374782,
56 | "splits": {
57 | "test": {
58 | "name": "test",
59 | "num_bytes": 6969209,
60 | "num_examples": 14042,
61 | "dataset_name": "mmlu"
62 | },
63 | "validation": {
64 | "name": "validation",
65 | "num_bytes": 763676,
66 | "num_examples": 1531,
67 | "dataset_name": "mmlu"
68 | },
69 | "dev": {
70 | "name": "dev",
71 | "num_bytes": 125389,
72 | "num_examples": 285,
73 | "dataset_name": "mmlu"
74 | },
75 | "auxiliary_train": {
76 | "name": "auxiliary_train",
77 | "num_bytes": 161013106,
78 | "num_examples": 99842,
79 | "dataset_name": "mmlu"
80 | }
81 | },
82 | "version": {
83 | "version_str": "0.0.0",
84 | "major": 0,
85 | "minor": 0,
86 | "patch": 0
87 | }
88 | }
--------------------------------------------------------------------------------
/assets/mmlu_test_dataset/state.json:
--------------------------------------------------------------------------------
1 | {
2 | "_data_files": [
3 | {
4 | "filename": "data-00000-of-00001.arrow"
5 | }
6 | ],
7 | "_fingerprint": "436299c1c09696bb",
8 | "_format_columns": null,
9 | "_format_kwargs": {},
10 | "_format_type": null,
11 | "_output_all_columns": false,
12 | "_split": "test"
13 | }
--------------------------------------------------------------------------------
/build_hexagon_wkv_kernel.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | rm -rf hexagon/HTP/RwkvWkvOpPackage/build
4 | make -C hexagon/HTP/RwkvWkvOpPackage/ htp_x86 htp_v68 htp_v69 htp_v73 htp_v75 htp_v79 -j4
5 |
6 | make -C hexagon/CPU/RwkvWkvOpPackage/ -j4
7 |
8 | rm -rf hexagon/HTP/prebuilt
9 | mkdir -p hexagon/HTP/prebuilt
10 |
11 | cp hexagon/HTP/RwkvWkvOpPackage/build/hexagon-v68/libQnnRwkvWkvOpPackage.so hexagon/HTP/prebuilt/libQnnRwkvWkvOpPackageV68.so
12 | cp hexagon/HTP/RwkvWkvOpPackage/build/hexagon-v69/libQnnRwkvWkvOpPackage.so hexagon/HTP/prebuilt/libQnnRwkvWkvOpPackageV69.so
13 | cp hexagon/HTP/RwkvWkvOpPackage/build/hexagon-v73/libQnnRwkvWkvOpPackage.so hexagon/HTP/prebuilt/libQnnRwkvWkvOpPackageV73.so
14 | cp hexagon/HTP/RwkvWkvOpPackage/build/hexagon-v75/libQnnRwkvWkvOpPackage.so hexagon/HTP/prebuilt/libQnnRwkvWkvOpPackageV75.so
15 | cp hexagon/HTP/RwkvWkvOpPackage/build/hexagon-v79/libQnnRwkvWkvOpPackage.so hexagon/HTP/prebuilt/libQnnRwkvWkvOpPackageV79.so
--------------------------------------------------------------------------------
/convert_vocab.py:
--------------------------------------------------------------------------------
1 | import sys, ast
2 |
3 | vocab_file = sys.argv[1]
4 | vocab = None
5 | with open(vocab_file, 'r') as f:
6 | vocab = f.readlines()
7 |
8 | vocab_new = []
9 | for line in vocab:
10 | parts = line.split(' ')
11 | assert len(parts) >= 3
12 | idx, token, token_len = int(parts[0]), ast.literal_eval(' '.join(parts[1:-1])), int(parts[-1])
13 | token = token.encode("utf-8") if isinstance(token, str) else token
14 | token_raw = "b'"
15 | for byte in token:
16 | token_raw += '\\x' + hex(byte)[2:].zfill(2)
17 | token_raw += "'"
18 | vocab_new.append(f"{idx} {token_raw} {token_len}\n")
19 |
20 | with open("b_" + vocab_file, 'w') as f:
21 | f.writelines(vocab_new)
--------------------------------------------------------------------------------
/docs/optrace.md:
--------------------------------------------------------------------------------
1 | ```
2 | rm -rf trace_output
3 | ./qnn-net-run --profiling_level detailed --profiling_option optrace --output_data_type float_and_native --retrieve_context RWKV-x070-World-1.5B-v3-20250127-ctx4096.bin --backend libQnnHtp.so --input_list ./input_list.txt --output_dir ./trace_output --log_level info --perf_profile burst --io_tensor_mem_handle_type=ion
4 | # or with customop:
5 | ./qnn-net-run --profiling_level detailed --profiling_option optrace --output_data_type float_and_native --retrieve_context RWKV-x070-World-1.5B-v3-20250127-ctx4096.bin --backend libQnnHtp.so --input_list ./input_list.txt --output_dir ./trace_output --log_level info --perf_profile burst --io_tensor_mem_handle_type=ion --op_packages libQnnRwkvWkvOpPackage.so:RwkvWkvOpPackageInterfaceProvider
6 | ```
7 |
8 | ```
9 | adb pull /data/local/tmp/rwkv/trace_output
10 | qnn-profile-viewer --reader $QNN_SDK_ROOT/lib/x86_64-linux-clang/libQnnHtpOptraceProfilingReader.so --input_log ./trace_output/qnn-profiling-data_0.log --schematic ./RWKV-x070-World-1.5B-v3-20250127-ctx4096_schematic.bin --output ./chrometrace.json
11 | ```
--------------------------------------------------------------------------------
/docs/xelite_npu_rwkv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MollySophia/rwkv-qualcomm/fd403b7c9f3b6c4ac5a810f334a0a51c8693fb42/docs/xelite_npu_rwkv.png
--------------------------------------------------------------------------------
/hexagon/CPU/RwkvWkvOpPackage/Makefile:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
3 | # All rights reserved.
4 | # Confidential and Proprietary - Qualcomm Technologies, Inc.
5 | #
6 |
7 | # define default
8 | default: all
9 |
10 | # define package name
11 | export PACKAGE_NAME := $(notdir $(shell pwd))
12 |
13 | # define library prerequisites list
14 | lib_cpu := src
15 | make_dir := makefiles
14 | LIB_SOURCES = $(lib_cpu)
17 |
18 | # define target_architecture
19 | export TARGET_AARCH_VARS:= -march=x86-64
20 |
21 | # define target name
22 | export TARGET = linux-x86_64
23 |
24 | # specify compiler
25 | export CXX ?= clang++-9
26 |
27 | # define default Android ABI
28 | PLATFORM ?= arm64-v8a
29 |
30 | .PHONY: all $(LIB_SOURCES) all_android all_x86 cpu cpu_x86 cpu_android
31 | all: $(LIB_SOURCES) all_x86 all_android
32 |
33 | # Combined Targets
34 | cpu: cpu_x86 cpu_android
35 | clean: clean_x86 clean_android clean_qnx clean_qos
36 |
37 | # x86 Targets
38 | all_x86: cpu_x86
39 |
40 | cpu_x86:
41 | $(call build_if_exists,$(lib_cpu),-$(MAKE) -f $(make_dir)/Makefile.linux-x86_64)
42 |
43 | clean_x86:
44 | @rm -rf libs obj
45 |
46 | # qnx Targets
47 | all_qnx: cpu_qnx cpu_qos
48 |
49 | cpu_qnx: check_qnx
50 | $(call build_if_exists,$(lib_cpu),-$(MAKE) -f $(make_dir)/Makefile.qnx-aarch64)
51 |
52 | clean_qnx:
53 | @rm -rf libs obj
54 |
55 | cpu_qos: check_qnx
56 | $(call build_if_exists,$(lib_cpu),-$(MAKE) -f $(make_dir)/Makefile.qos224-aarch64)
57 |
58 | clean_qos:
59 | @rm -rf libs obj
60 |
61 | # Android Targets
62 |
63 | all_android: cpu_android
64 |
65 | cpu_android: cpu_aarch64-android
66 |
67 | cpu_aarch64-android: check_ndk clean_aarch64-android
68 | $(call build_if_exists,$(lib_cpu),$(ANDROID_NDK_ROOT)/ndk-build APP_ALLOW_MISSING_DEPS=true APP_ABI="arm64-v8a" NDK_PROJECT_PATH=./ NDK_APPLICATION_MK=$(make_dir)/Application.mk APP_BUILD_SCRIPT=$(make_dir)/Android.mk)
69 | @$(rename_target_dirs)
70 |
71 | clean_android: check_ndk clean_aarch64-android
72 |
73 | clean_aarch64-android:
74 | @rm -rf libs/aarch64-android
75 | @rm -rf obj/local/aarch64-android
76 |
77 | # utilities
78 | # Syntax: $(call build_if_exists,<src_dir>,<cmd>)
79 | build_if_exists = $(if $(wildcard $(1)),$(2),$(warning WARNING: $(1) does not exist. Skipping Compilation))
80 | rename_target_dirs = find . -type d -execdir rename 's/arm64-v8a/aarch64-android/' '{}' \+ \
81 |
82 | check_ndk:
83 | ifeq ($(ANDROID_NDK_ROOT),)
84 | $(error ERROR: ANDROID_NDK_ROOT not set, skipping compilation for Android platform(s).)
85 | endif
86 |
87 | check_qnx:
88 | ifeq ($(QNX_HOST),)
89 | $(error ERROR: QNX_HOST not set, skipping compilation for QNX platform.)
90 | endif
91 | ifeq ($(QNX_TARGET),)
92 | $(error ERROR: QNX_TARGET not set, skipping compilation for QNX platform.)
93 | endif
94 |
--------------------------------------------------------------------------------
/hexagon/CPU/RwkvWkvOpPackage/makefiles/Android.mk:
--------------------------------------------------------------------------------
1 | # ==============================================================================
2 | #
3 | # Copyright (c) 2020, 2023-2024 Qualcomm Technologies, Inc.
4 | # All Rights Reserved.
5 | # Confidential and Proprietary - Qualcomm Technologies, Inc.
6 | #
7 | # ===============================================================
8 |
9 | LOCAL_PATH := $(call my-dir)
10 | SUPPORTED_TARGET_ABI := arm64-v8a x86 x86_64
11 |
12 | #============================ Verify Target Info and Application Variables =========================================
13 | ifneq ($(filter $(TARGET_ARCH_ABI),$(SUPPORTED_TARGET_ABI)),)
14 | ifneq ($(APP_STL), c++_shared)
15 | $(error Unsupported APP_STL: "$(APP_STL)")
16 | endif
17 | else
18 | $(error Unsupported TARGET_ARCH_ABI: '$(TARGET_ARCH_ABI)')
19 | endif
20 |
21 | #============================ Define Common Variables ===============================================================
22 | # Include paths
23 | UTIL_SRC_DIR := $(LOCAL_PATH)/../src/utils
24 | # QNN_SDK_ROOT should be set and points to the SDK path, it will be used.
25 | ifdef QNN_SDK_ROOT
26 | # define directories
27 | CUSTOM_OP_DIR :=$(QNN_SDK_ROOT)/share/QNN/OpPackageGenerator/CustomOp
28 |
29 | # setup include paths
30 | PACKAGE_C_INCLUDES += -I $(QNN_SDK_ROOT)/include/QNN -I $(QNN_SDK_ROOT)/include/QNN/CPU -I $(LOCAL_PATH)/../include/ -I $(UTIL_SRC_DIR) -I $(UTIL_SRC_DIR)/CPU -I $(CUSTOM_OP_DIR)
31 | # copy source files from SDK if not present
32 | $(info Copying custom op source files from SDK)
33 | COPYFILES := $(shell find $(CUSTOM_OP_DIR)/CPU -name "*.cpp" -exec cp -rf {} $(LOCAL_PATH)/../src 2>/dev/null \;)
34 | else
35 | $(error QNN_SDK_ROOT: Please set QNN_SDK_ROOT)
36 | endif
37 |
38 | #========================== Define OpPackage Library Build Variables =============================================
39 | include $(CLEAR_VARS)
40 | LOCAL_C_INCLUDES := $(PACKAGE_C_INCLUDES)
41 | MY_SRC_FILES = $(wildcard $(LOCAL_PATH)/../src/*.cpp) $(wildcard $(LOCAL_PATH)/../src/utils/*.cpp) $(wildcard $(LOCAL_PATH)/../src/utils/CPU/*.cpp) $(wildcard $(LOCAL_PATH)/../src/ops/*.cpp)
42 | LOCAL_MODULE := RwkvWkvOpPackage
43 | LOCAL_SRC_FILES := $(subst makefiles/,,$(MY_SRC_FILES))
44 | LOCAL_LDLIBS := -lGLESv2 -lEGL
45 | include $(BUILD_SHARED_LIBRARY)
46 |
--------------------------------------------------------------------------------
/hexagon/CPU/RwkvWkvOpPackage/makefiles/Application.mk:
--------------------------------------------------------------------------------
1 | # ==============================================================================
2 | #
3 | # Copyright (c) 2020, 2023 Qualcomm Technologies, Inc.
4 | # All Rights Reserved.
5 | # Confidential and Proprietary - Qualcomm Technologies, Inc.
6 | #
7 | # ===============================================================
8 |
9 | APP_ABI := arm64-v8a
10 | APP_STL := c++_shared
11 | APP_PLATFORM := android-21
12 | APP_CPPFLAGS += -std=c++11 -O3 -fvisibility=hidden -DQNN_API="__attribute__((visibility(\"default\")))"
13 | APP_LDFLAGS += -lc -lm -ldl
--------------------------------------------------------------------------------
/hexagon/CPU/RwkvWkvOpPackage/makefiles/Makefile.linux-x86_64:
--------------------------------------------------------------------------------
1 | # ==============================================================================
2 | #
3 | # Copyright (c) Qualcomm Technologies, Inc. and/or its subsidiaries.
4 | # All rights reserved.
5 | # Confidential and Proprietary - Qualcomm Technologies, Inc.
6 | #
7 | # ==============================================================================
8 |
9 | # define relevant directories
10 | SRC_DIR := src
11 | SRC_DIR_OPS := src/ops
12 | SRC_DIR_UTILS := src/utils/CPU
13 |
14 | # Checking if clang++-9 is present. If not switch to clang++
15 | ifeq ($(shell $(CXX) -v 2>&1 | grep -c "clang version"), 0)
16 | CXX := clang++
17 | endif
18 |
19 | # define library name and corresponding directory
20 | QNN_TARGET ?= x86_64-linux-clang
21 | export LIB_DIR := ./libs/$(QNN_TARGET)
22 |
23 | ifdef PACKAGE_NAME
24 | library := $(LIB_DIR)/lib$(PACKAGE_NAME).so
25 | else
26 | library :=$(LIB_DIR)/libCpuCustomPackage.so
27 | endif
28 |
29 | # define target architecture if not previously defined, default is x86
30 | ifndef TARGET_AARCH_VARS
31 | TARGET_AARCH_VARS:= -march=x86-64
32 | endif
33 |
34 | # Include paths
35 | # QNN_SDK_ROOT should be set and points to the SDK path, it will be used.
36 | ifdef QNN_SDK_ROOT
37 | # setup custom op directory path
38 | CUSTOM_OP_DIR :=$(QNN_SDK_ROOT)/share/QNN/OpPackageGenerator/CustomOp
39 |
40 | # setup include paths
41 |
42 | INCLUDES += -I$(QNN_SDK_ROOT)/include/QNN -I include -I$(QNN_SDK_ROOT)/include/QNN/CPU -I $(CUSTOM_OP_DIR)
43 | INCLUDES += -I $(SRC_DIR)/utils -I $(SRC_DIR)/utils/CPU
44 |
45 | # copy source files from custom op directory
46 | $(info Copying custom op source files from SDK)
47 | COPYFILES := $(shell find $(CUSTOM_OP_DIR)/CPU -name "*.cpp" -exec cp -rf {} $(SRC_DIR) 2>/dev/null \;)
48 | else
49 | $(error QNN_SDK_ROOT: Please set QNN_SDK_ROOT)
50 | endif
51 |
52 | # set compiler flags
53 | COMMON_CXXFLAGS = -std=c++11 -fno-exceptions -fPIC -pg $(INCLUDES)
54 | COMMON_LDFLAGS = -shared -s -fPIC
55 |
56 | ifdef QNN_DEBUG_ENABLE
57 | CXXFLAGS += $(COMMON_CXXFLAGS) -march=x86-64 -O0 -g -DQNN_API=""
58 | LDFLAGS += $(COMMON_LDFLAGS)
59 | else
60 | CXXFLAGS += $(COMMON_CXXFLAGS) -march=x86-64 -O3 -Wno-write-strings -fvisibility=hidden -DQNN_API="__attribute__((visibility(\"default\")))"
61 | LDFLAGS += $(COMMON_LDFLAGS) -fvisibility=hidden -flto
62 | endif
63 |
64 | # define library sources
65 | SOURCES := $(wildcard $(SRC_DIR)/*.cpp)
66 | SOURCES_OPS := $(wildcard $(SRC_DIR_OPS)/*.cpp)
67 | SOURCE_UTILS := $(wildcard $(SRC_DIR_UTILS)/*.cpp)
68 |
69 | # define object directories
70 | OBJ_DIR := obj/$(QNN_TARGET)
71 | OBJ_DIR_OPS := obj/$(QNN_TARGET)/ops
72 | OBJ_DIR_UTILS := obj/$(QNN_TARGET)/utils
73 |
74 | # setup object files in object directory
75 | OBJECTS := $(patsubst %.cpp,$(OBJ_DIR)/%.o,$(foreach x,$(SOURCES),$(notdir $(x))))
76 | OBJECTS_OPS := $(patsubst %.cpp,$(OBJ_DIR_OPS)/%.o,$(foreach x,$(SOURCES_OPS),$(notdir $(x))))
77 | OBJECTS_UTILS := $(patsubst %.cpp,$(OBJ_DIR_UTILS)/%.o,$(foreach x,$(SOURCE_UTILS),$(notdir $(x))))
78 |
79 | # Rule to make library
80 | .PHONY: library
81 | library: $(library)
82 |
83 | # Implicit rule to compile and link object files
84 | $(OBJ_DIR)/%.o: $(SRC_DIR)/%.cpp
85 | $(CXX) $(CXXFLAGS) -c $^ -o $@
86 |
87 | $(OBJ_DIR_UTILS)/%.o: $(SRC_DIR_UTILS)/%.cpp
88 | $(CXX) $(CXXFLAGS) -c $^ -o $@
89 |
90 | # set up resources
91 | directories := $(LIB_DIR) $(OBJ_DIR) $(OBJ_DIR_OPS) $(OBJ_DIR_UTILS)
92 |
93 | # Compile
94 | $(library): $(OBJECTS) $(OBJECTS_OPS) $(OBJECTS_UTILS) | $(directories)
95 | $(CXX) $(CXXFLAGS) $(LINKFLAGS) -shared $^ -o $@
96 |
97 | # rule for object directory resource
98 | $(OBJECTS): | $(OBJ_DIR) $(COPYFILES)
99 | $(OBJECTS_OPS): | $(OBJ_DIR_OPS)
100 | $(OBJECTS_UTILS): | $(OBJ_DIR_UTILS)
101 |
102 | # rule to create directories
103 | $(directories):
104 | mkdir -p $@
105 |
106 | .PHONY: clean
107 | clean:
108 | rm -rf $(OBJ_DIR) $(LIB_DIR)
109 |
--------------------------------------------------------------------------------
/hexagon/CPU/RwkvWkvOpPackage/src/CpuCustomOpPackage.cpp:
--------------------------------------------------------------------------------
1 | //=============================================================================
2 | //
3 | // Copyright (c) 2020-2022 Qualcomm Technologies, Inc.
4 | // All Rights Reserved.
5 | // Confidential and Proprietary - Qualcomm Technologies, Inc.
6 | //
7 | //=============================================================================
8 |
9 | #include "CPU/QnnCpuOpPackage.h"
10 | #include "CustomBEMacros.hpp"
11 | #include "CustomOpPackage.hpp"
12 | #include "QnnSdkBuildId.h"
13 |
14 | using namespace qnn::custom;
15 | using namespace qnn::custom::utils;
16 |
17 | static Qnn_ErrorHandle_t QnnOpPackage_execute(void* opPkgNodeData) {
18 | auto opPkg = CustomOpPackage::getInstance();
19 | std::shared_ptr op;
20 |
21 | opPkg->getOpResolver()->getCustomOp((opHandle)opPkgNodeData, op);
22 | auto opRegistration = opPkg->getOpRegistration(op->m_typeName);
23 |
24 | QNN_CUSTOM_BE_ENSURE(opPkg, QNN_OP_PACKAGE_ERROR_GENERAL);
25 | QNN_CUSTOM_BE_ENSURE_STATUS(opRegistration->execute(op.get()));
26 |
27 | return QNN_SUCCESS;
28 | }
29 |
30 | std::mutex CustomOpPackage::s_mtx;
31 | std::shared_ptr CustomOpPackage ::s_opPackageInstance;
32 | bool CustomOpPackage::s_isInitialized;
33 |
34 | Qnn_ErrorHandle_t CustomOpPackage::getPackageInfo(const QnnOpPackage_Info_t** info) {
35 | QNN_CUSTOM_BE_ENSURE(info, QNN_OP_PACKAGE_ERROR_INVALID_INFO)
36 |
37 | for (auto op : m_registered_ops) {
38 | m_operationNames.push_back(op.first.c_str());
39 | }
40 |
41 | m_sdkApiVersion = QNN_CPU_API_VERSION_INIT;
42 | m_packageInfo = QNN_OP_PACKAGE_INFO_INIT;
43 | m_packageInfo.packageName = m_packageName;
44 | m_packageInfo.operationNames = m_operationNames.data();
45 | m_packageInfo.numOperations = static_cast(m_operationNames.size());
46 | m_packageInfo.sdkBuildId = QNN_SDK_BUILD_ID;
47 | m_packageInfo.sdkApiVersion = &m_sdkApiVersion;
48 | *info = &m_packageInfo;
49 |
50 | return QNN_SUCCESS;
51 | }
52 |
53 | Qnn_ErrorHandle_t CustomOpPackage::createOpImpl(
54 | QnnOpPackage_GraphInfrastructure_t graphInfrastructure,
55 | QnnOpPackage_Node_t node,
56 | QnnOpPackage_OpImpl_t* opImplPtr) {
57 | // initialize op resolver if not already set
58 | if (!m_opResolver) {
59 | m_opResolver.reset(new CustomOpResolver());
60 | }
61 | auto cpuNode = reinterpret_cast(node);
62 | auto customOp = std::shared_ptr(new CustomOp(cpuNode->name, cpuNode->typeName));
63 | const auto opRegistration = m_registered_ops[cpuNode->typeName];
64 |
65 | // Get op from op factory
66 | QNN_CUSTOM_BE_ENSURE_STATUS(
67 | opRegistration->initialize(node, graphInfrastructure, customOp.get()));
68 |
69 | // Update op reference
70 | auto opImpl = std::make_shared();
71 | opImpl->opImplFn = QnnOpPackage_execute;
72 | opImpl->userData = (void*)m_opResolver->registerCustomOp(std::move(customOp));
73 |
74 | // update out kernel param
75 | auto cpuImpl = reinterpret_cast(opImplPtr);
76 | *cpuImpl = opImpl.get();
77 |
78 | // update opImpl list
79 | m_OpImplList.emplace_back(opImpl);
80 |
81 | return QNN_SUCCESS;
82 | }
83 |
84 | Qnn_ErrorHandle_t CustomOpPackage::freeOpImpl(QnnOpPackage_OpImpl_t opImpl) {
85 | QNN_CUSTOM_BE_ENSURE(opImpl, QNN_OP_PACKAGE_ERROR_GENERAL);
86 |
87 | auto op = std::shared_ptr(new CustomOp());
88 |
89 | auto cpuOpImpl = reinterpret_cast(opImpl);
90 | m_opResolver->getCustomOp((opHandle)cpuOpImpl->userData, op);
91 |
92 | auto opRegistration = m_registered_ops[op->m_typeName];
93 | QNN_CUSTOM_BE_ENSURE_STATUS(m_opResolver->removeCustomOp((opHandle)cpuOpImpl->userData));
94 |
95 | if (opRegistration->free) {
96 | opRegistration->free(*op);
97 | }
98 |
99 | return QNN_SUCCESS;
100 | }
101 |
102 | std::shared_ptr CustomOpPackage::getInstance() noexcept {
103 | std::lock_guard locker(s_mtx);
104 | if (!s_opPackageInstance) {
105 | s_opPackageInstance.reset(new (std::nothrow) CustomOpPackage());
106 | }
107 | return s_opPackageInstance;
108 | }
109 |
110 | void CustomOpPackage::setIsInitialized(bool isInitialized) {
111 | std::lock_guard locker(s_mtx);
112 | s_isInitialized = isInitialized;
113 | }
114 |
115 | bool CustomOpPackage::getIsInitialized() {
116 | std::lock_guard locker(s_mtx);
117 | return s_isInitialized;
118 | }
119 |
120 | void CustomOpPackage::destroyInstance() {
121 | if (s_opPackageInstance && s_isInitialized) s_opPackageInstance.reset();
122 | s_isInitialized = false;
123 | }
124 |
125 | void CustomOpPackage::freeResolver() {
126 | if (m_opResolver) m_opResolver.reset();
127 | }
128 |
--------------------------------------------------------------------------------
/hexagon/CPU/RwkvWkvOpPackage/src/RwkvWkvOpPackageInterface.cpp:
--------------------------------------------------------------------------------
1 | //==============================================================================
2 | // Auto Generated Code for RwkvWkvOpPackage
3 | //==============================================================================
4 | #include "QnnCpuOpPackage.h"
5 | #include "CustomOpPackage.hpp"
6 |
7 | using namespace qnn::custom;
8 | using namespace qnn::custom::macros;
9 |
// One-time package initialization: creates the package singleton and registers
// the three RWKV kernels (wkv6, wkv7_state, wkv7_output).  Re-initialization
// without an intervening terminate is rejected.
static Qnn_ErrorHandle_t RwkvWkvOpPackageInitialize(
    QnnOpPackage_GlobalInfrastructure_t globalInfrastructure) {

  // Refuse double-initialization; the backend must call terminate first.
  QNN_CUSTOM_BE_ENSURE(!(CustomOpPackage::getIsInitialized()),QNN_OP_PACKAGE_ERROR_LIBRARY_ALREADY_INITIALIZED);

  // Macros from CustomOpPackage.hpp: set up the package instance ...
  INIT_BE_OP_PACKAGE(RwkvWkvOpPackage)

  // ... and register each op's CustomOpRegistration_t table (see the
  // register_* functions in src/ops/*.cpp).
  REGISTER_PACKAGE_OP(wkv6)
  REGISTER_PACKAGE_OP(wkv7_state)
  REGISTER_PACKAGE_OP(wkv7_output)

  // INIT_BE_PACKAGE_OPTIMIZATIONS();

  CustomOpPackage::setIsInitialized(true);

  return QNN_SUCCESS;
}
27 |
28 | static Qnn_ErrorHandle_t RwkvWkvOpPackageGetInfo(const QnnOpPackage_Info_t** info) {
29 | auto opPkg = CustomOpPackage::getInstance();
30 |
31 | QNN_CUSTOM_BE_ENSURE(opPkg, QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED);
32 |
33 | QNN_CUSTOM_BE_ENSURE_STATUS(opPkg->getPackageInfo(info));
34 |
35 | return QNN_SUCCESS;
36 | }
37 |
38 | static Qnn_ErrorHandle_t RwkvWkvOpPackageValidateOpConfig(Qnn_OpConfig_t opConfig) {
39 | auto opPkg = CustomOpPackage::getInstance();
40 |
41 | QNN_CUSTOM_BE_ENSURE(opPkg, QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED);
42 |
43 | auto opRegistration = opPkg->getOpRegistration(opConfig.v1.typeName);
44 |
45 | QNN_CUSTOM_BE_ENSURE(opRegistration, QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE)
46 |
47 | QNN_CUSTOM_BE_ENSURE_STATUS(opRegistration->validateOpConfig(opConfig));
48 |
49 | return QNN_SUCCESS;
50 | }
51 |
52 | static Qnn_ErrorHandle_t RwkvWkvOpPackageCreateOpImpl(
53 | QnnOpPackage_GraphInfrastructure_t graphInfrastructure,
54 | QnnOpPackage_Node_t node,
55 | QnnOpPackage_OpImpl_t* opImpl) {
56 | auto opPkg = CustomOpPackage::getInstance();
57 |
58 | QNN_CUSTOM_BE_ENSURE(opPkg, QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED);
59 |
60 | QNN_CUSTOM_BE_ENSURE_STATUS(
61 | opPkg->createOpImpl(graphInfrastructure, node, opImpl));
62 |
63 | return QNN_SUCCESS;
64 | }
65 |
66 | static Qnn_ErrorHandle_t RwkvWkvOpPackageFreeOpImpl(
67 | QnnCpuOpPackage_OpImpl_t* opImpl) {
68 | auto opPkg = CustomOpPackage::getInstance();
69 |
70 | QNN_CUSTOM_BE_ENSURE(opPkg, QNN_OP_PACKAGE_ERROR_LIBRARY_NOT_INITIALIZED);
71 |
72 | QNN_CUSTOM_BE_ENSURE_STATUS(opPkg->freeOpImpl(opImpl));
73 |
74 | return QNN_SUCCESS;
75 | }
76 |
// Package teardown.  The local shared_ptr taken here keeps the object alive
// across destroyInstance(), so the subsequent freeResolver() call is made on
// a still-valid object even after the static singleton pointer is reset.
static Qnn_ErrorHandle_t RwkvWkvOpPackageTerminate() {
  auto opPkg = CustomOpPackage::getInstance();

  CustomOpPackage::destroyInstance();
  opPkg->freeResolver();

  return QNN_SUCCESS;
}
85 |
86 | static Qnn_ErrorHandle_t RwkvWkvOpPackageLogInitialize(
87 | QnnLog_Callback_t callback, QnnLog_Level_t maxLogLevel) {
88 | // function should be used if at least two backends support it
89 | // USER SHOULD NOTE THIS FUNCTION IS UNUSED BY BE
90 |
91 | return QNN_SUCCESS;
92 | }
93 |
94 | static Qnn_ErrorHandle_t RwkvWkvOpPackageLogSetLevel(
95 | QnnLog_Level_t maxLogLevel) {
96 | // USER SHOULD NOTE THIS FUNCTION IS UNUSED BY CPU BE
97 |
98 | return QNN_SUCCESS;
99 | }
100 |
101 | static Qnn_ErrorHandle_t RwkvWkvOpPackageLogTerminate() {
102 | // USER SHOULD NOTE THIS FUNCTION IS UNUSED BY CPU BE
103 |
104 | return QNN_SUCCESS;
105 | }
106 |
107 |
108 | extern "C" QNN_API Qnn_ErrorHandle_t RwkvWkvOpPackageInterfaceProvider(
109 | QnnOpPackage_Interface_t* interface) {
110 | interface->interfaceVersion.major = 1;
111 | interface->interfaceVersion.minor = 4;
112 | interface->interfaceVersion.patch = 0;
113 | interface->v1_4.init = RwkvWkvOpPackageInitialize;
114 | interface->v1_4.terminate = RwkvWkvOpPackageTerminate;
115 | interface->v1_4.getInfo = RwkvWkvOpPackageGetInfo;
116 | interface->v1_4.validateOpConfig = RwkvWkvOpPackageValidateOpConfig;
117 | interface->v1_4.createOpImpl = RwkvWkvOpPackageCreateOpImpl;
118 | interface->v1_4.freeOpImpl = RwkvWkvOpPackageFreeOpImpl;
119 | interface->v1_4.logInitialize = RwkvWkvOpPackageLogInitialize;
120 | interface->v1_4.logSetLevel = RwkvWkvOpPackageLogSetLevel;
121 | interface->v1_4.logTerminate = RwkvWkvOpPackageLogTerminate;
122 | return QNN_SUCCESS;
123 | }
124 |
125 |
--------------------------------------------------------------------------------
/hexagon/CPU/RwkvWkvOpPackage/src/ops/wkv6.cpp:
--------------------------------------------------------------------------------
1 | //==============================================================================
2 | // Auto Generated Code for RwkvWkvOpPackage
3 | //==============================================================================
4 | #include
5 | #include
6 |
7 | #include "CpuBackendUtils.hpp"
8 | #include "CustomOpPackage.hpp"
9 |
10 | using namespace qnn::custom;
11 | using namespace qnn::custom::utils;
12 |
13 | namespace wkv6 {
14 |
// RWKV v6 WKV recurrence over a flattened [seq_length * num_heads, head_size]
// token stream.  Inputs: k, v, r, state_in, tf, td (tf/td presumably
// time_first/time_decay per RWKV naming -- confirm against the exporter);
// outputs: per-token mix and the final recurrent state.
Qnn_ErrorHandle_t execute(CustomOp* operation) {
  /*
   * To have good performance and stability, it is required to avoid heap memory
   * allocation in this function. The heap memory allocation includes but not
   * limited to calling malloc, operator new, constructing STL container objects
   * like std::vector with default allocator, and adding items like calling
   * std::vector::push_back to STL container objects with default allocator.
   *
   * Please check in SDK documentation for more information.
   */

  float* k = (float*)operation->getInput(0)->data;
  float* v = (float*)operation->getInput(1)->data;
  float* r = (float*)operation->getInput(2)->data;
  float* state_in = (float*)operation->getInput(3)->data;
  float* tf = (float*)operation->getInput(4)->data;       // per-head, not per-token
  float* td = (float*)operation->getInput(5)->data;
  float* output = (float*)operation->getOutput(0)->data;
  float* state_out = (float*)operation->getOutput(1)->data;

  // State is laid out [num_heads, head_size, head_size]; input 0's leading
  // dimension is seq_length * num_heads, hence the division.
  int num_heads = operation->getInput(3)->currentDimensions[0];
  int head_size = operation->getInput(3)->currentDimensions[1];
  int seq_length = operation->getInput(0)->currentDimensions[0] / num_heads;

  // Output is accumulated with += below, so it must start zeroed.
  memset(output, 0, seq_length * num_heads * head_size * sizeof(float));
  for (int t = 0; t < seq_length; t++) {
    // From the second token on, read the state written at t-1: the recurrence
    // chains through state_out in place.
    if (t > 0) state_in = state_out;
    for (int h = 0; h < num_heads; h++) {
      for (int i = 0; i < head_size; i++) {
        auto k_val = k[t * num_heads * head_size + h * head_size + i];
        auto r_val = r[t * num_heads * head_size + h * head_size + i];
        auto td_val = td[t * num_heads * head_size + h * head_size + i];
        auto tf_val = tf[h * head_size + i];
        for (int j = 0; j < head_size; j++) {
          auto v_val = v[t * num_heads * head_size + h * head_size + j];
          auto kv_val = k_val * v_val;
          auto prev_state_val = state_in[h * head_size * head_size + i * head_size + j];
          // out[j] += r[i] * (k[i]*v[j]*tf[i] + S[i][j])  (sum over i)
          output[t * num_heads * head_size + h * head_size + j] += r_val * (kv_val * tf_val + prev_state_val);
          // S[i][j] = S[i][j]*td[i] + k[i]*v[j]
          state_out[h * head_size * head_size + i * head_size + j] = prev_state_val * td_val + kv_val;
        }
      }
    }
  }

  return QNN_SUCCESS;
}
61 |
62 | Qnn_ErrorHandle_t finalize(const CustomOp* operation) {
63 | QNN_CUSTOM_BE_ENSURE_EQ(operation->numInput(), 6, QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE)
64 | QNN_CUSTOM_BE_ENSURE_EQ(operation->numOutput(), 2, QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE)
65 |
66 | /**
67 | * Add code here
68 | **/
69 |
70 | return QNN_SUCCESS;
71 | }
72 |
73 | Qnn_ErrorHandle_t free(CustomOp& operation) {
74 |
75 | /**
76 | * Add code here
77 | **/
78 |
79 | return QNN_SUCCESS;
80 | }
81 |
82 | Qnn_ErrorHandle_t populateFromNode(const QnnOpPackage_Node_t node,
83 | QnnOpPackage_GraphInfrastructure_t graphInfrastructure,
84 | CustomOp* operation) {
85 | // Add input
86 | for (uint32_t i = 0; i < numInputs(node); i++) {
87 | operation->addInput(getInput(node, i));
88 | }
89 |
90 | // Add output
91 | for (uint32_t i = 0; i < numOutputs(node); i++) {
92 | operation->addOutput(getOutput(node, i));
93 | }
94 |
95 |
96 | return QNN_SUCCESS;
97 | }
98 |
99 | Qnn_ErrorHandle_t validateOpConfig(Qnn_OpConfig_t opConfig) {
100 | QNN_CUSTOM_BE_ENSURE_EQ(
101 | strcmp(opConfig.v1.typeName, "wkv6"), 0, QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT)
102 |
103 | QNN_CUSTOM_BE_ENSURE_EQ(opConfig.v1.numOfInputs, 6, QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE)
104 | QNN_CUSTOM_BE_ENSURE_EQ(opConfig.v1.numOfOutputs, 2, QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE)
105 |
106 | return QNN_SUCCESS;
107 | }
108 | } // namespace wkv6
109 |
110 | CustomOpRegistration_t* register_Wkv6CustomOp() {
111 | using namespace wkv6;
112 | static CustomOpRegistration_t WkvRegister = {execute, finalize, free, validateOpConfig, populateFromNode};
113 | return &WkvRegister;
114 | }
115 |
116 | REGISTER_OP(wkv6, register_Wkv6CustomOp);
117 |
--------------------------------------------------------------------------------
/hexagon/CPU/RwkvWkvOpPackage/src/ops/wkv7_output.cpp:
--------------------------------------------------------------------------------
1 | //==============================================================================
2 | // Auto Generated Code for RwkvWkvOpPackage
3 | //==============================================================================
4 | #include
5 | #include
6 |
7 | #include "CpuBackendUtils.hpp"
8 | #include "CustomOpPackage.hpp"
9 |
10 | using namespace qnn::custom;
11 | using namespace qnn::custom::utils;
12 |
13 | namespace wkv7_output {
14 |
// Intentionally a no-op on the CPU backend: wkv7_state::execute in this
// package already writes the per-token output together with the updated
// state.  NOTE(review): presumably this op exists so graphs exported for the
// HTP backend (where state and output are separate kernels) also load here --
// confirm against the HTP op package.
Qnn_ErrorHandle_t execute(CustomOp* operation) {
  /*
   * To have good performance and stability, it is required to avoid heap memory
   * allocation in this function. The heap memory allocation includes but not
   * limited to calling malloc, operator new, constructing STL container objects
   * like std::vector with default allocator, and adding items like calling
   * std::vector::push_back to STL container objects with default allocator.
   *
   * Please check in SDK documentation for more information.
   */


  return QNN_SUCCESS;
}
29 |
30 | Qnn_ErrorHandle_t finalize(const CustomOp* operation) {
31 | QNN_CUSTOM_BE_ENSURE_EQ(operation->numInput(), 2, QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE)
32 | QNN_CUSTOM_BE_ENSURE_EQ(operation->numOutput(), 1, QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE)
33 |
34 | /**
35 | * Add code here
36 | **/
37 |
38 | return QNN_SUCCESS;
39 | }
40 |
41 | Qnn_ErrorHandle_t free(CustomOp& operation) {
42 |
43 | /**
44 | * Add code here
45 | **/
46 |
47 | return QNN_SUCCESS;
48 | }
49 |
50 | Qnn_ErrorHandle_t populateFromNode(const QnnOpPackage_Node_t node,
51 | QnnOpPackage_GraphInfrastructure_t graphInfrastructure,
52 | CustomOp* operation) {
53 | // Add input
54 | for (uint32_t i = 0; i < numInputs(node); i++) {
55 | operation->addInput(getInput(node, i));
56 | }
57 |
58 | // Add output
59 | for (uint32_t i = 0; i < numOutputs(node); i++) {
60 | operation->addOutput(getOutput(node, i));
61 | }
62 |
63 |
64 | return QNN_SUCCESS;
65 | }
66 |
67 | Qnn_ErrorHandle_t validateOpConfig(Qnn_OpConfig_t opConfig) {
68 | QNN_CUSTOM_BE_ENSURE_EQ(
69 | strcmp(opConfig.v1.typeName, "wkv7_output"), 0, QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT)
70 |
71 | QNN_CUSTOM_BE_ENSURE_EQ(opConfig.v1.numOfInputs, 2, QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE)
72 | QNN_CUSTOM_BE_ENSURE_EQ(opConfig.v1.numOfOutputs, 1, QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE)
73 |
74 | return QNN_SUCCESS;
75 | }
76 | } // namespace wkv7_output
77 |
78 | CustomOpRegistration_t* register_Wkv7OutputCustomOp() {
79 | using namespace wkv7_output;
80 | static CustomOpRegistration_t WkvRegister = {execute, finalize, free, validateOpConfig, populateFromNode};
81 | return &WkvRegister;
82 | }
83 |
84 | REGISTER_OP(wkv7_output, register_Wkv7OutputCustomOp);
85 |
--------------------------------------------------------------------------------
/hexagon/CPU/RwkvWkvOpPackage/src/ops/wkv7_state.cpp:
--------------------------------------------------------------------------------
1 | //==============================================================================
2 | // Auto Generated Code for RwkvWkvOpPackage
3 | //==============================================================================
4 | #include
5 | #include
6 |
7 | #include "CpuBackendUtils.hpp"
8 | #include "CustomOpPackage.hpp"
9 |
10 | using namespace qnn::custom;
11 | using namespace qnn::custom::utils;
12 |
13 | namespace wkv7_state {
14 |
// RWKV v7 recurrence over a flattened [seq_length * num_heads, head_size]
// token stream.  Seven inputs: r, w, k, v, a, b, state_in; two outputs: the
// per-token output and the updated recurrent state.
Qnn_ErrorHandle_t execute(CustomOp* operation) {
  /*
   * To have good performance and stability, it is required to avoid heap memory
   * allocation in this function. The heap memory allocation includes but not
   * limited to calling malloc, operator new, constructing STL container objects
   * like std::vector with default allocator, and adding items like calling
   * std::vector::push_back to STL container objects with default allocator.
   *
   * Please check in SDK documentation for more information.
   */

  float* r = (float*)operation->getInput(0)->data;
  float* w = (float*)operation->getInput(1)->data;
  float* k = (float*)operation->getInput(2)->data;
  float* v = (float*)operation->getInput(3)->data;
  float* a = (float*)operation->getInput(4)->data;
  float* b = (float*)operation->getInput(5)->data;
  float* state_in = (float*)operation->getInput(6)->data;
  float* output = (float*)operation->getOutput(0)->data;
  float* state_out = (float*)operation->getOutput(1)->data;

  // State is laid out [num_heads, head_size, head_size]; input 0's leading
  // dimension is seq_length * num_heads, hence the division.
  int num_heads = operation->getInput(6)->currentDimensions[0];
  int head_size = operation->getInput(6)->currentDimensions[1];
  // int seq_length = operation->getInput(0)->currentDimensions[0];
  int seq_length = operation->getInput(0)->currentDimensions[0] / num_heads;

  for (int t = 0; t < seq_length; t++) {
    // From the second token on, read the state written at t-1: the recurrence
    // chains through state_out in place.
    if (t > 0) state_in = state_out;
    for (int h = 0; h < num_heads; h++) {
      for (int i = 0; i < head_size; i++) {
        auto v_val = v[t * num_heads * head_size + h * head_size + i];

        // sa = dot(a[t,h,:], S[h][i][:])
        float sa = 0, result = 0;
        for (int j = 0; j < head_size; j++) {
          sa += a[t * num_heads * head_size + h * head_size + j] * state_in[h * head_size * head_size + i * head_size + j];
        }

        for (int j = 0; j < head_size; j++) {
          auto r_val = r[t * num_heads * head_size + h * head_size + j];
          auto w_val = w[t * num_heads * head_size + h * head_size + j];
          auto k_val = k[t * num_heads * head_size + h * head_size + j];
          auto b_val = b[t * num_heads * head_size + h * head_size + j];
          auto kv_val = k_val * v_val;
          // S[i][j] = S[i][j]*w[j] + k[j]*v[i] + sa*b[j]
          auto state_val = state_in[h * head_size * head_size + i * head_size + j] * w_val + kv_val + sa * b_val;
          // out[i] = sum_j S[i][j] * r[j]
          result += state_val * r_val;
          state_out[h * head_size * head_size + i * head_size + j] = state_val;
        }
        output[t * num_heads * head_size + h * head_size + i] = result;
      }
    }
  }

  return QNN_SUCCESS;
}
69 |
70 | Qnn_ErrorHandle_t finalize(const CustomOp* operation) {
71 | QNN_CUSTOM_BE_ENSURE_EQ(operation->numInput(), 6, QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE)
72 | QNN_CUSTOM_BE_ENSURE_EQ(operation->numOutput(), 1, QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE)
73 |
74 | /**
75 | * Add code here
76 | **/
77 |
78 | return QNN_SUCCESS;
79 | }
80 |
81 | Qnn_ErrorHandle_t free(CustomOp& operation) {
82 |
83 | /**
84 | * Add code here
85 | **/
86 |
87 | return QNN_SUCCESS;
88 | }
89 |
90 | Qnn_ErrorHandle_t populateFromNode(const QnnOpPackage_Node_t node,
91 | QnnOpPackage_GraphInfrastructure_t graphInfrastructure,
92 | CustomOp* operation) {
93 | // Add input
94 | for (uint32_t i = 0; i < numInputs(node); i++) {
95 | operation->addInput(getInput(node, i));
96 | }
97 |
98 | // Add output
99 | for (uint32_t i = 0; i < numOutputs(node); i++) {
100 | operation->addOutput(getOutput(node, i));
101 | }
102 |
103 |
104 | return QNN_SUCCESS;
105 | }
106 |
107 | Qnn_ErrorHandle_t validateOpConfig(Qnn_OpConfig_t opConfig) {
108 | QNN_CUSTOM_BE_ENSURE_EQ(
109 | strcmp(opConfig.v1.typeName, "wkv7_state"), 0, QNN_OP_PACKAGE_ERROR_INVALID_ARGUMENT)
110 |
111 | QNN_CUSTOM_BE_ENSURE_EQ(opConfig.v1.numOfInputs, 6, QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE)
112 | QNN_CUSTOM_BE_ENSURE_EQ(opConfig.v1.numOfOutputs, 1, QNN_OP_PACKAGE_ERROR_VALIDATION_FAILURE)
113 |
114 | return QNN_SUCCESS;
115 | }
116 | } // namespace wkv7_state
117 |
118 | CustomOpRegistration_t* register_Wkv7StateCustomOp() {
119 | using namespace wkv7_state;
120 | static CustomOpRegistration_t WkvRegister = {execute, finalize, free, validateOpConfig, populateFromNode};
121 | return &WkvRegister;
122 | }
123 |
124 | REGISTER_OP(wkv7_state, register_Wkv7StateCustomOp);
125 |
--------------------------------------------------------------------------------
/hexagon/CPU/RwkvWkvOpPackage/src/utils/BackendUtils.hpp:
--------------------------------------------------------------------------------
1 | //==============================================================================
2 | //
3 | // Copyright (c) 2020-2023 Qualcomm Technologies, Inc.
4 | // All Rights Reserved.
5 | // Confidential and Proprietary - Qualcomm Technologies, Inc.
6 | //
7 | //==============================================================================
8 |
9 | #pragma once
10 | #include
11 | #include
12 |
13 | #include
14 | #include
15 |
16 | #include "QnnOpPackage.h"
17 | #include "QnnTypes.h"
18 |
19 | //============================================================================
20 | // Backend Defined Behavior
21 | //=============================================================================
22 | // A required backend defined tensor object which designates an input or output tensor
23 | typedef struct CustomOpTensor* CustomOpTensorPtr_t;
24 |
25 | // A required backend defined parameter object which designates scalar, tensor and string parameters
26 | typedef struct CustomOpParam* CustomOpParamPtr_t;
27 |
28 | // A backend defined object which contains additional info about an operation such as connectivity,
29 | // buffers etc
30 | typedef struct CustomOpContext* CustomOpContextPtr_t;
31 |
32 | // A backend defined object which contains information about a kernel such as its string path, its
33 | // buffers, assigned memory, local dimensions etc.
34 | typedef struct CustomOpKernelContext* CustomOpKernelContextPtr_t;
35 |
36 | namespace qnn {
37 |
38 | namespace custom {
39 |
40 | namespace utils {
41 |
42 | // Each backend is expected to define these utilities to aid users in accessing basic info about
43 | // an operation package node.
44 | const CustomOpTensorPtr_t* getInput(QnnOpPackage_Node_t node);
45 |
46 | const CustomOpTensorPtr_t* getOutput(QnnOpPackage_Node_t node);
47 |
48 | const CustomOpParamPtr_t* getParam(QnnOpPackage_Node_t node);
49 |
50 | const CustomOpTensorPtr_t getInput(QnnOpPackage_Node_t node, size_t idx);
51 |
52 | CustomOpTensorPtr_t getOutput(QnnOpPackage_Node_t node, size_t idx);
53 |
54 | const std::pair getParam(QnnOpPackage_Node_t node,
55 | const std::string& paramName);
56 |
57 | uint32_t numInputs(QnnOpPackage_Node_t node);
58 |
59 | uint32_t numOutputs(QnnOpPackage_Node_t node);
60 |
61 | uint32_t numDimensions(CustomOpTensorPtr_t tensor);
62 |
63 | const uint32_t* getTensorShape(CustomOpTensorPtr_t tensor);
64 |
65 | void* getTensorData(CustomOpTensorPtr_t tensor);
66 |
67 | uint32_t numTensorSize(CustomOpTensorPtr_t tensor);
68 | // Additional backend utilities should be included under this namespace
69 | namespace backend_utils {}
70 | } // namespace utils
71 | } // namespace custom
72 | } // namespace qnn
73 |
--------------------------------------------------------------------------------
/hexagon/CPU/RwkvWkvOpPackage/src/utils/CPU/CpuBackendUtils.cpp:
--------------------------------------------------------------------------------
1 | //==============================================================================
2 | //
3 | // Copyright (c) 2020, 2023 Qualcomm Technologies, Inc.
4 | // All Rights Reserved.
5 | // Confidential and Proprietary - Qualcomm Technologies, Inc.
6 | //
7 | //==============================================================================
8 |
9 | #include
10 |
11 | #include
12 |
13 | #include "CpuBackendUtils.hpp"
14 |
15 | namespace qnn {
16 |
17 | namespace custom {
18 |
19 | namespace utils {
20 |
21 | // Each backend is expected to define these utilities to aid users in accessing basic info about
22 | // an operation package node.
23 | const CustomOpTensorPtr_t* getInput(QnnOpPackage_Node_t node) {
24 | return (CustomOpTensorPtr_t*)reinterpret_cast(node)->inputs;
25 | }
26 |
27 | const CustomOpTensorPtr_t* getOutput(QnnOpPackage_Node_t node) {
28 | return (CustomOpTensorPtr_t*)reinterpret_cast(node)->outputs;
29 | }
30 |
31 | const CustomOpParamPtr_t* getParam(QnnOpPackage_Node_t node) {
32 | return (CustomOpParamPtr_t*)reinterpret_cast(node)->params;
33 | }
34 |
35 | const std::pair getParam(QnnOpPackage_Node_t node,
36 | const std::string& name) {
37 | auto cpuNode = reinterpret_cast(node);
38 | auto params = (CustomOpParamPtr_t*)cpuNode->params;
39 |
40 | for (uint32_t idx = 0; idx < cpuNode->numOfParams; idx++) {
41 | auto paramName = params[idx]->name;
42 |
43 | if (strcmp(paramName, name.c_str()) == 0) {
44 | return {true, params[idx]};
45 | }
46 | }
47 |
48 | return {false, nullptr};
49 | }
50 |
51 | const CustomOpTensorPtr_t getInput(QnnOpPackage_Node_t node, size_t idx) {
52 | return (CustomOpTensorPtr_t) reinterpret_cast(node)->inputs[idx];
53 | }
54 |
55 | CustomOpTensorPtr_t getOutput(QnnOpPackage_Node_t node, size_t idx) {
56 | return (CustomOpTensorPtr_t) reinterpret_cast(node)->outputs[idx];
57 | }
58 |
59 | uint32_t numInputs(QnnOpPackage_Node_t node) {
60 | return reinterpret_cast(node)->numOfInputs;
61 | }
62 |
63 | uint32_t numOutputs(QnnOpPackage_Node_t node) {
64 | return reinterpret_cast(node)->numOfOutputs;
65 | }
66 |
67 | uint32_t numDimensions(CustomOpTensorPtr_t tensor) {
68 | return reinterpret_cast(tensor)->rank;
69 | }
70 |
71 | uint32_t numTensorSize(CustomOpTensorPtr_t tensor) {
72 | uint32_t size = 1;
73 | auto cpuTensor = reinterpret_cast(tensor);
74 |
75 | for (uint32_t i = 0; i < cpuTensor->rank; i++) {
76 | size *= cpuTensor->currentDimensions[i];
77 | }
78 | return size;
79 | }
80 |
81 | const uint32_t* getTensorShape(CustomOpTensorPtr_t tensor) {
82 | return reinterpret_cast(tensor)->currentDimensions;
83 | }
84 |
85 | template
86 | const T* getTensorData(CustomOpTensorPtr_t tensor) {
87 | auto tempTensor = reinterpret_cast(tensor);
88 | auto dataRef = reinterpret_cast(tempTensor->data);
89 | return const_cast(dataRef);
90 | }
91 |
92 | template
93 | T& getTensorDataRef(CustomOpTensorPtr_t tensor) {
94 | auto tempTensor = reinterpret_cast(tensor);
95 | auto dataRef = reinterpret_cast(tempTensor->data);
96 | return &dataRef;
97 | }
98 |
99 | namespace backend_utils {
100 |
101 | const double getScalarParam(const CustomOpParamPtr_t param) {
102 | auto cpuParam = reinterpret_cast(param);
103 | return cpuParam->scalarParam;
104 | }
105 |
106 | const CustomOpTensorPtr_t getTensorParam(const CustomOpParamPtr_t param) {
107 | auto cpuParam = reinterpret_cast(param);
108 | return (CustomOpTensorPtr_t)cpuParam->tensorParam;
109 | }
110 |
111 | } // namespace backend_utils
112 | } // namespace utils
113 | } // namespace custom
114 | } // namespace qnn
115 |
--------------------------------------------------------------------------------
/hexagon/CPU/RwkvWkvOpPackage/src/utils/CPU/CpuBackendUtils.hpp:
--------------------------------------------------------------------------------
1 | //==============================================================================
2 | //
3 | // Copyright (c) 2020 Qualcomm Technologies, Inc.
4 | // All Rights Reserved.
5 | // Confidential and Proprietary - Qualcomm Technologies, Inc.
6 | //
7 | //==============================================================================
8 |
9 | #pragma once
10 |
11 | #include "BackendUtils.hpp"
12 | #include "QnnCpuOpPackage.h"
13 |
14 | // Tensor and parameter definitions
// Concrete CPU-backend tensor type: inherits the QNN CPU tensor layout
// unchanged, so pointers are freely reinterpret-castable between the two.
struct CustomOpTensor : public QnnCpuOpPackage_Tensor_t {};

// Concrete CPU-backend parameter type: inherits the QNN CPU param layout.
struct CustomOpParam : public QnnCpuOpPackage_Param_t {};

namespace qnn {
namespace custom {
namespace utils {
namespace backend_utils {

// Scalar value carried by a parameter (defined in CpuBackendUtils.cpp).
const double getScalarParam(const CustomOpParamPtr_t param);

// Tensor payload carried by a parameter (defined in CpuBackendUtils.cpp).
const CustomOpTensorPtr_t getTensorParam(const CustomOpParamPtr_t param);
} // namespace backend_utils
} // namespace utils
} // namespace custom
} // namespace qnn
--------------------------------------------------------------------------------
/hexagon/CPU/RwkvWkvOpPackage/src/utils/CustomOpUtils.hpp:
--------------------------------------------------------------------------------
1 | //==============================================================================
2 | //
3 | // Copyright (c) 2020 Qualcomm Technologies, Inc.
4 | // All Rights Reserved.
5 | // Confidential and Proprietary - Qualcomm Technologies, Inc.
6 | //
7 | //==============================================================================
8 |
9 | #pragma once
10 |
11 | #include