├── index.js ├── .npmignore ├── bin └── download-prebuilds.js ├── set-optional-deps.cjs ├── README.md ├── LICENSE ├── .gitignore ├── binding.gyp ├── package.json ├── .github └── workflows │ └── prebuild.yml └── src └── extract.cpp /index.js: -------------------------------------------------------------------------------- 1 | module.exports = require('node-gyp-build-optional-packages')(__dirname) 2 | -------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | # Dependency directories 2 | node_modules/ 3 | tests/samples 4 | .vs 5 | build/ 6 | .DS_Store -------------------------------------------------------------------------------- /bin/download-prebuilds.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const { dirname } = require('path'); 4 | const { fileURLToPath } = require('url'); 5 | const { exec } = require('child_process'); 6 | 7 | process.chdir(dirname(__dirname)); 8 | exec('prebuildify-ci download', (error, stdout, stderr) => { 9 | console.error(stderr); 10 | console.log(stdout); 11 | }); 12 | -------------------------------------------------------------------------------- /set-optional-deps.cjs: -------------------------------------------------------------------------------- 1 | let fs = require('fs'); 2 | packageData = JSON.parse(fs.readFileSync('package.json')); 3 | let prebuilds = fs.readdirSync('prebuilds'); 4 | let platformDeps = packageData.optionalDependencies = {}; 5 | let packageName = packageData.name; 6 | let version = packageData.version; 7 | for (let prebuild of prebuilds) { 8 | platformDeps['@' + packageName + '/' + packageName + '-' + prebuild] = version; 9 | } 10 | fs.writeFileSync('package.json', JSON.stringify(packageData, null, 2)); 11 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | ## Summary 2 | This module is designed to do fast and efficient native/C-level extraction of strings from CBOR binary data. This works by calling `extractStrings(position, size, firstStringSize, buffer)` (the start offset, the end offset, the byte length of the first string, and the source buffer), and it will extract strings by doing partial CBOR parsing, and scanning to find the string data in the range specified in the buffer. It will return an array of strings that it finds. When it finds strings that can be represented with latin-1/one-byte strings (an important V8 optimization), it will attempt to return a continuous string of CBOR data that contains multiple sub-strings, so the decoder can slice off strings by offset. When a string contains non-latin characters, and must be represented as a two-byte string, this will always be returned as the string alone without combination with any other strings. The extractor will return an array of a maximum of 256 strings. The decoder can call `extractStrings` again, with a new offset, to continue extracting more strings as necessary. 3 | 4 | ## License 5 | MIT -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Kris Zyp 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | dist 8 | 9 | # Runtime data 10 | pids 11 | *.pid 12 | *.seed 13 | *.pid.lock 14 | 15 | # Directory for instrumented libs generated by jscoverage/JSCover 16 | lib-cov 17 | 18 | # Coverage directory used by tools like istanbul 19 | coverage 20 | 21 | # nyc test coverage 22 | .nyc_output 23 | 24 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 25 | .grunt 26 | 27 | # Bower dependency directory (https://bower.io/) 28 | bower_components 29 | 30 | # node-waf configuration 31 | .lock-wscript 32 | 33 | # Compiled binary addons (http://nodejs.org/api/addons.html) 34 | build/Release 35 | prebuilds 36 | 37 | # Dependency directories 38 | node_modules/ 39 | jspm_packages/ 40 | 41 | package-lock.json 42 | # Typescript v1 declaration files 43 | typings/ 44 | 45 | # Optional npm cache directory 46 | .npm 47 | 48 | # Optional eslint cache 49 | .eslintcache 50 | 51 | # Optional REPL history 52 | .node_repl_history 53 | 54 | # Output of 'npm pack' 55 | *.tgz 56 | 57 | # Yarn Integrity file 58 | .yarn-integrity 59 | 60 | # dotenv environment variables file 61 | .env 62 | tests/samples 63 | 64 | # Visual Studio Code directory 65 | .vscode 66 | .vs 67 | 68 | .DS_Store 69 | build 
-------------------------------------------------------------------------------- /binding.gyp: -------------------------------------------------------------------------------- 1 | { 2 | "variables": { 3 | "os_linux_compiler%": "gcc", 4 | "enable_v8%": "true", 5 | "enable_pointer_compression%": "false", 6 | "build_v8_with_gn": "false" 7 | }, 8 | "conditions": [ 9 | ['OS=="win"', { 10 | "variables": { 11 | "enable_v8%": "=7", { 39 | "cflags": [ 40 | "-Wimplicit-fallthrough=2", 41 | ], 42 | }], 43 | ], 44 | "ldflags": [ 45 | "-fPIC", 46 | "-fvisibility=hidden" 47 | ], 48 | "cflags": [ 49 | "-fPIC", 50 | "-fvisibility=hidden", 51 | "-O3" 52 | ], 53 | }], 54 | ["enable_v8!='false'", { 55 | "defines": ["ENABLE_V8_API=1"] 56 | }], 57 | ], 58 | } 59 | ] 60 | } 61 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "cbor-extract", 3 | "author": "Kris Zyp", 4 | "version": "2.2.0", 5 | "description": "Node addon for string extraction for cbor-x", 6 | "license": "MIT", 7 | "repository": { 8 | "type": "git", 9 | "url": "http://github.com/kriszyp/cbor-extract" 10 | }, 11 | "scripts": { 12 | "install": "node-gyp-build-optional-packages", 13 | "recompile": "node-gyp rebuild", 14 | "before-publish": "prebuildify-ci download && node set-optional-deps.cjs", 15 | "prebuild": "prebuildify-platform-packages --target 20.10.0", 16 | "prebuild-win32": "prebuildify-platform-packages --target 20.10.0 && set ENABLE_V8_FUNCTIONS=false&& prebuildify-platform-packages --platform-packages --napi --target 20.10.0", 17 | "prebuild-macos": "prebuildify-platform-packages --target 20.0.0 && ENABLE_V8_FUNCTIONS=false prebuildify-platform-packages --napi --platform-packages --target 20.10.0", 18 | "prebuild-libc": "prebuildify-platform-packages --tag-libc --target 20.10.0 && prebuildify-platform-packages --platform-packages --napi --tag-libc --target 16.14.2 && 
ENABLE_V8_FUNCTIONS=false prebuildify-platform-packages --platform-packages --napi --tag-libc --target 20.10.0", 19 | "prebuild-libc-alpine": "prebuildify-cross --image alpine --tag-libc --target 20.10.0", 20 | "publish-all": "cd prebuilds/win32-x64 && npm publish --access public && cd ../darwin-x64 && npm publish --access public && cd ../darwin-arm64 && npm publish --access public && cd ../linux-x64 && npm publish --access public && cd ../linux-arm64 && npm publish --access public && cd ../linux-arm && npm publish --access public && cd ../.. && npm publish --access public", 21 | "test": "node ./index.js" 22 | }, 23 | "main": "./index.js", 24 | "gypfile": true, 25 | "dependencies": { 26 | "node-gyp-build-optional-packages": "5.1.1" 27 | }, 28 | "files": [ 29 | "index.js", 30 | "/src", 31 | "/*.gyp", 32 | "/bin" 33 | ], 34 | "bin": { 35 | "download-cbor-prebuilds": "./bin/download-prebuilds.js" 36 | }, 37 | "devDependencies": { 38 | "prebuildify-platform-packages": "5.0.4", 39 | "prebuildify-ci": "^1.0.5", 40 | "prebuildify-cross": "5.0.0" 41 | }, 42 | "optionalDependencies": { 43 | "@cbor-extract/cbor-extract-darwin-arm64": "2.2.0", 44 | "@cbor-extract/cbor-extract-darwin-x64": "2.2.0", 45 | "@cbor-extract/cbor-extract-linux-arm": "2.2.0", 46 | "@cbor-extract/cbor-extract-linux-arm64": "2.2.0", 47 | "@cbor-extract/cbor-extract-linux-x64": "2.2.0", 48 | "@cbor-extract/cbor-extract-win32-x64": "2.2.0" 49 | } 50 | } -------------------------------------------------------------------------------- /.github/workflows/prebuild.yml: -------------------------------------------------------------------------------- 1 | name: Prebuild 2 | on: [push] 3 | jobs: 4 | build-test-macos: 5 | if: startsWith(github.ref, 'refs/tags/') 6 | runs-on: macos-11 7 | steps: 8 | - uses: actions/checkout@v3 9 | - name: Setup node 10 | uses: actions/setup-node@v3 11 | with: 12 | node-version: 18 13 | - run: python3 -m pip install setuptools 14 | - run: npm install 15 | - run: npm test 16 | - run: 
npm run prebuild-macos 17 | if: startsWith(github.ref, 'refs/tags/') 18 | - run: npm run prebuild-macos 19 | if: startsWith(github.ref, 'refs/tags/') 20 | env: 21 | PREBUILD_ARCH: arm64 22 | - run: tar --create --format ustar --verbose --file=prebuild-darwin.tar -C prebuilds . 23 | if: startsWith(github.ref, 'refs/tags/') 24 | - name: Prebuild 25 | uses: softprops/action-gh-release@v1 26 | if: startsWith(github.ref, 'refs/tags/') 27 | with: 28 | files: prebuild-darwin.tar 29 | build-test-win32: 30 | if: startsWith(github.ref, 'refs/tags/') 31 | runs-on: windows-latest 32 | steps: 33 | - uses: actions/checkout@v3 34 | - name: Setup node 35 | uses: actions/setup-node@v3 36 | with: 37 | node-version: 16 38 | - run: npm install 39 | - run: npm run prebuild-win32 40 | - run: tar --create --verbose --file=prebuild-win32.tar -C prebuilds . 41 | #if: startsWith(github.ref, 'refs/tags/') 42 | - name: Prebuild 43 | uses: softprops/action-gh-release@v1 44 | if: startsWith(github.ref, 'refs/tags/') 45 | with: 46 | files: prebuild-win32.tar 47 | build-centos-7: 48 | #if: startsWith(github.ref, 'refs/tags/') 49 | runs-on: ubuntu-latest 50 | container: quay.io/pypa/manylinux2014_x86_64 51 | steps: 52 | #- run: ldd --version ldd 53 | #- uses: actions/checkout@v3 54 | # with: 55 | # repository: 'kriszyp/musl-bins' 56 | #- run: tar -xf aarch64-linux-musl-cross.tgz && pwd && ls 57 | - uses: actions/checkout@v3 58 | - name: Setup node 59 | uses: actions/setup-node@v3 60 | with: 61 | node-version: 14 62 | - run: curl https://raw.githubusercontent.com/kriszyp/musl-bins/main/aarch64-linux-musl-cross.tgz --output aarch64-linux-musl-cross.tgz 63 | - run: tar -xf aarch64-linux-musl-cross.tgz && pwd && ls 64 | - run: curl https://raw.githubusercontent.com/kriszyp/musl-bins/main/armv7l-linux-musleabihf-cross.tgz --output armv7l-linux-musleabihf-cross.tgz 65 | - run: tar -xf armv7l-linux-musleabihf-cross.tgz && pwd && ls 66 | - run: curl 
https://raw.githubusercontent.com/kriszyp/musl-bins/main/x86_64-linux-musl-native.tgz --output x86_64-linux-musl-native.tgz 67 | - run: tar -xf x86_64-linux-musl-native.tgz && pwd && ls 68 | - run: npm install 69 | - run: npm run prebuild-libc 70 | if: startsWith(github.ref, 'refs/tags/') 71 | env: 72 | PREBUILD_LIBC: musl 73 | PREBUILD_ARCH: arm64 74 | CC: ${PWD}/aarch64-linux-musl-cross/bin/aarch64-linux-musl-gcc 75 | CXX: ${PWD}/aarch64-linux-musl-cross/bin/aarch64-linux-musl-g++ 76 | - run: npm run prebuild-libc 77 | if: startsWith(github.ref, 'refs/tags/') 78 | env: 79 | PREBUILD_LIBC: musl 80 | PREBUILD_ARCH: arm 81 | PREBUILD_ARMV: 7 82 | CC: ${PWD}/armv7l-linux-musleabihf-cross/bin/armv7l-linux-musleabihf-gcc 83 | CXX: ${PWD}/armv7l-linux-musleabihf-cross/bin/armv7l-linux-musleabihf-g++ 84 | - run: npm run prebuild-libc 85 | if: startsWith(github.ref, 'refs/tags/') 86 | env: 87 | PREBUILD_LIBC: musl 88 | PREBUILD_ARCH: x64 89 | CC: ${PWD}/x86_64-linux-musl-native/bin/x86_64-linux-musl-gcc 90 | CXX: ${PWD}/x86_64-linux-musl-native/bin/x86_64-linux-musl-g++ 91 | - run: npm run prebuild-libc 92 | - run: npm test 93 | - run: tar --create --verbose --file=prebuild-linux.tar -C prebuilds . 
94 | - name: Prebuild 95 | if: startsWith(github.ref, 'refs/tags/') 96 | uses: softprops/action-gh-release@v1 97 | with: 98 | files: prebuild-linux.tar 99 | build-linux-arm: 100 | if: startsWith(github.ref, 'refs/tags/') 101 | runs-on: ubuntu-latest 102 | #container: quay.io/pypa/manylinux_2_24_x86_64 103 | steps: 104 | - run: sudo apt-get update 105 | - run: sudo apt-get install -y gcc-aarch64-linux-gnu gcc-arm-linux-gnueabihf g++-aarch64-linux-gnu g++-arm-linux-gnueabihf 106 | - run: ldd --version ldd 107 | - uses: actions/checkout@v3 108 | - name: Setup node 109 | uses: actions/setup-node@v3 110 | with: 111 | node-version: 16 112 | - run: npm install 113 | - run: npm run prebuild-libc 114 | if: startsWith(github.ref, 'refs/tags/') 115 | env: 116 | PREBUILD_ARCH: arm64 117 | CC: aarch64-linux-gnu-gcc 118 | CXX: aarch64-linux-gnu-g++ 119 | - run: npm run prebuild-libc 120 | env: 121 | PREBUILD_ARCH: arm 122 | PREBUILD_ARMV: 7 123 | CC: arm-linux-gnueabihf-gcc 124 | CXX: arm-linux-gnueabihf-g++ 125 | - run: tar --create --verbose --file=prebuild-linux-arm.tar -C prebuilds . 126 | - name: Prebuild 127 | if: startsWith(github.ref, 'refs/tags/') 128 | uses: softprops/action-gh-release@v1 129 | with: 130 | files: prebuild-linux-arm.tar 131 | -------------------------------------------------------------------------------- /src/extract.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | This is responsible for extracting the strings, in bulk, from a CBOR buffer. Creating strings from buffers can 3 | be one of the biggest performance bottlenecks of parsing, but creating an array of extracted strings all at once 4 | provides much better performance. This will parse and produce up to 256 strings at once. The JS parser can call this multiple 5 | times as necessary to get more strings. This must be partially capable of parsing CBOR so it can know where to 6 | find the string tokens and determine their position and length. 
All strings are decoded as UTF-8. 7 | */ 8 | #include 9 | #if ENABLE_V8_API 10 | #include 11 | #endif 12 | 13 | #ifndef thread_local 14 | #ifdef __GNUC__ 15 | # define thread_local __thread 16 | #elif __STDC_VERSION__ >= 201112L 17 | # define thread_local _Thread_local 18 | #elif defined(_MSC_VER) 19 | # define thread_local __declspec(thread) 20 | #else 21 | # define thread_local 22 | #endif 23 | #endif 24 | 25 | const int MAX_TARGET_SIZE = 255; 26 | napi_value unexpectedEnd(napi_env env) { 27 | napi_value returnValue; 28 | napi_get_undefined(env, &returnValue); 29 | napi_throw_type_error(env, NULL, "Unexpected end of buffer reading string"); 30 | return returnValue; 31 | } 32 | class Extractor { 33 | public: 34 | napi_value target[MAX_TARGET_SIZE + 1]; // leave one for the queued string 35 | 36 | uint8_t* source; 37 | uint32_t position = 0; 38 | uint32_t writePosition = 0; 39 | uint32_t stringStart = 0; 40 | uint32_t lastStringEnd = 0; 41 | 42 | void readString(napi_env env, uint32_t length, bool allowStringBlocks) { 43 | uint32_t start = position; 44 | uint32_t end = position + length; 45 | if (allowStringBlocks) { // for larger strings, we don't bother to check every character for being latin, and just go right to creating a new string 46 | while(position < end) { 47 | if (source[position] < 0x80) // ensure we character is latin and can be decoded as one byte 48 | position++; 49 | else { 50 | break; 51 | } 52 | } 53 | } 54 | if (position < end) { 55 | // non-latin character 56 | if (lastStringEnd) { 57 | napi_value value; 58 | napi_create_string_latin1(env, (const char*) source + stringStart, lastStringEnd - stringStart, &value); 59 | target[writePosition++] = value; 60 | lastStringEnd = 0; 61 | } 62 | // use standard utf-8 conversion 63 | napi_value value; 64 | napi_create_string_utf8(env, (const char*) source + start, (int) length, &value); 65 | target[writePosition++] = value; 66 | position = end; 67 | return; 68 | } 69 | 70 | if (lastStringEnd) { 71 | if 
(start - lastStringEnd > 40 || end - stringStart > 6000) { 72 | napi_value value; 73 | napi_create_string_latin1(env, (const char*) source + stringStart, lastStringEnd - stringStart, &value); 74 | target[writePosition++] = value; 75 | stringStart = start; 76 | } 77 | } else { 78 | stringStart = start; 79 | } 80 | lastStringEnd = end; 81 | } 82 | napi_value extractStrings(napi_env env, uint32_t startingPosition, uint32_t size, uint32_t firstStringSize, uint8_t* inputSource) { 83 | writePosition = 0; 84 | lastStringEnd = 0; 85 | position = startingPosition; 86 | source = inputSource; 87 | readString(env, firstStringSize, firstStringSize < 0x100); 88 | while (position < size) { 89 | uint8_t token = source[position++]; 90 | uint8_t majorType = token >> 5; 91 | token = token & 0x1f; 92 | if (majorType == 2 || majorType == 3) { 93 | uint32_t length; 94 | switch (token) { 95 | case 0x18: 96 | if (position + 1 > size) { 97 | return unexpectedEnd(env); 98 | } 99 | length = source[position++]; 100 | break; 101 | case 0x19: 102 | if (position + 2 > size) { 103 | return unexpectedEnd(env); 104 | } 105 | length = source[position++] << 8; 106 | length += source[position++]; 107 | break; 108 | case 0x1a: 109 | if (position + 4 > size) { 110 | return unexpectedEnd(env); 111 | } 112 | length = source[position++] << 24; 113 | length += source[position++] << 16; 114 | length += source[position++] << 8; 115 | length += source[position++]; 116 | break; 117 | case 0x1b: 118 | return unexpectedEnd(env); 119 | default: 120 | length = token; 121 | } 122 | if (majorType == 3) { 123 | // string 124 | if (length + position > size) { 125 | return unexpectedEnd(env); 126 | } 127 | readString(env, length, length < 0x100); 128 | if (writePosition >= MAX_TARGET_SIZE) 129 | break; 130 | } else { // binary data 131 | position += length; 132 | } 133 | 134 | } else { // all other tokens 135 | switch (token) { 136 | case 0x18: 137 | position++; 138 | break; 139 | case 0x19: 140 | position += 2; 141 | 
break; 142 | case 0x1a: 143 | position += 4; 144 | break; 145 | case 0x1b: 146 | position += 8; 147 | break; 148 | } 149 | } 150 | } 151 | if (lastStringEnd) { 152 | napi_value value; 153 | napi_create_string_latin1(env, (const char*) source + stringStart, lastStringEnd - stringStart, &value); 154 | if (writePosition == 0) { 155 | return value; 156 | } 157 | target[writePosition++] = value; 158 | } else if (writePosition == 1) { 159 | return target[0]; 160 | } 161 | napi_value array; 162 | #if ENABLE_V8_API 163 | v8::Local v8Array = v8::Array::New(v8::Isolate::GetCurrent(), (v8::Local*) target, writePosition); 164 | memcpy(&array, &v8Array, sizeof(array)); 165 | #else 166 | napi_create_array_with_length(env, writePosition, &array); 167 | for (int i = 0; i < writePosition; i++) { 168 | napi_set_element(env, array, i, target[i]); 169 | } 170 | #endif 171 | return array; 172 | } 173 | }; 174 | 175 | static thread_local Extractor* extractor; 176 | 177 | napi_value extractStrings(napi_env env, napi_callback_info info) { 178 | size_t argc = 4; 179 | napi_value args[4]; 180 | napi_get_cb_info(env, info, &argc, args, NULL, NULL); 181 | uint32_t position; 182 | uint32_t size; 183 | uint32_t firstStringSize; 184 | napi_get_value_uint32(env, args[0], &position); 185 | napi_get_value_uint32(env, args[1], &size); 186 | napi_get_value_uint32(env, args[2], &firstStringSize); 187 | uint8_t* source; 188 | size_t buffer_size; 189 | napi_status status = napi_get_buffer_info(env, args[3], (void**) &source, &buffer_size); 190 | if (status) { 191 | napi_throw_type_error(env, NULL, "Unexpected buffer type, expected a Buffer"); 192 | return args[0]; 193 | } 194 | return extractor->extractStrings(env, position, size, firstStringSize, source); 195 | } 196 | #define EXPORT_NAPI_FUNCTION(name, func) { napi_property_descriptor desc = { name, 0, func, 0, 0, 0, (napi_property_attributes) (napi_writable | napi_configurable), 0 }; napi_define_properties(env, exports, 1, &desc); } 197 | 198 | 
NAPI_MODULE_INIT() { 199 | extractor = new Extractor(); // create our thread-local extractor 200 | EXPORT_NAPI_FUNCTION("extractStrings", extractStrings); 201 | return exports; 202 | } --------------------------------------------------------------------------------