├── .c8rc.json ├── .github ├── dependabot.yml ├── release-drafter.yml └── workflows │ ├── ci.yml │ ├── codeql.yml │ ├── post-dependabot-prs.yml │ ├── release-drafter.yml │ └── update-license.yml ├── .gitignore ├── .mocharc.json ├── .yarnrc.yml ├── LICENSE.txt ├── README.md ├── biome.jsonc ├── lib ├── AbstractTokenizer.ts ├── BufferTokenizer.ts ├── FileTokenizer.ts ├── ReadStreamTokenizer.ts ├── core.ts ├── index.ts ├── tsconfig.json └── types.ts ├── package.json ├── test ├── resources │ ├── id3v1.mp3 │ ├── test1.dat │ ├── test2.dat │ └── test3.dat ├── test.ts ├── tsconfig.json └── util.ts ├── tsconfig.json └── yarn.lock /.c8rc.json: -------------------------------------------------------------------------------- 1 | { 2 | "reporter": [ 3 | "lcov", 4 | "text" 5 | ], 6 | "include": ["lib/**"], 7 | "exclude": [".yarn/**"] 8 | } 9 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | 4 | # ECMAScript Module (ESM) 5 | - package-ecosystem: npm 6 | directory: "/" 7 | schedule: 8 | interval: weekly 9 | time: "06:00" 10 | open-pull-requests-limit: 30 11 | versioning-strategy: increase 12 | target-branch: "master" 13 | labels: 14 | - dependencies 15 | groups: 16 | remark: 17 | dependency-type: "development" 18 | patterns: 19 | - "remark*" 20 | types: 21 | dependency-type: "development" 22 | patterns: 23 | - "@types/*" 24 | -------------------------------------------------------------------------------- /.github/release-drafter.yml: -------------------------------------------------------------------------------- 1 | # Release Drafter template 2 | # Ref: https://github.com/marketplace/actions/release-drafter 3 | 4 | name-template: 'v$RESOLVED_VERSION' 5 | tag-template: 'v$RESOLVED_VERSION' 6 | categories: 7 | - title: 💥 API Changes 8 | labels: 9 | - API change 10 | - title: 🚀 Enhancements 11 | labels: 12 | - enhancement 13 | - title: 🎨 Improvements 14 | labels: 15 | - improvement 16 | - title: 🐛 Bug Fixes 17 | labels: 18 | - bug 19 | - title: 🔧 Under the hood 20 | labels: 21 | - debt 22 | - title: ⬆️ Dependencies 23 | labels: 24 | - dependencies 25 | - title: 📝 Documentation 26 | labels: 27 | - documentation 28 | exclude-labels: 29 | - 'DevOps' 30 | - dev-dependencies 31 | change-template: '- $TITLE @$AUTHOR (#$NUMBER)' 32 | change-title-escapes: '\<*_&' # You can add # and @ to disable mentions, and add ` to disable code blocks. 
33 | version-resolver: 34 | major: 35 | labels: 36 | - 'major' 37 | minor: 38 | labels: 39 | - 'minor' 40 | patch: 41 | labels: 42 | - 'patch' 43 | default: patch 44 | template: | 45 | ## Changes 46 | 47 | $CHANGES 48 | 49 | ## 📦 NPM release 50 | NPM release: [$REPOSITORY@$RESOLVED_VERSION](https://www.npmjs.com/package/$REPOSITORY/v/$RESOLVED_VERSION) 51 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: 3 | pull_request: 4 | branches: [ "master" ] 5 | push: 6 | 7 | jobs: 8 | 9 | build: 10 | name: "Build module" 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | 15 | - uses: actions/checkout@v4 16 | 17 | - uses: actions/setup-node@v4 18 | with: 19 | node-version: 20.x 20 | 21 | - name: Enable Corepack 22 | run: corepack enable 23 | 24 | - name: Install dependencies 25 | run: yarn install 26 | 27 | - name: Lint TypeScript 28 | run: yarn run lint-ts 29 | 30 | - name: Lint Markdown 31 | run: yarn run lint-md 32 | 33 | - name: Build 34 | run: yarn run build 35 | 36 | - name: Upload build 37 | uses: actions/upload-artifact@v4 38 | with: 39 | name: build 40 | path: | 41 | lib/**/*.js 42 | lib/**/*.js.map 43 | lib/**/*.d.ts 44 | test/**/*.js 45 | test/**/*.js.map 46 | 47 | test-nodejs: 48 | name: "Test with Node.js (V8)" 49 | runs-on: ubuntu-latest 50 | needs: build 51 | 52 | env: 53 | YARN_IGNORE_NODE: 1 54 | 55 | strategy: 56 | matrix: 57 | node-version: [18.x, 20.x, 22.x] 58 | 59 | steps: 60 | 61 | - name: 'Checkout the repository' 62 | uses: actions/checkout@v4 63 | 64 | - name: Setup Node.js ${{ matrix.node-version }} 65 | uses: actions/setup-node@v4 66 | with: 67 | node-version: ${{ matrix.node-version }} 68 | 69 | - name: Enable Corepack 70 | run: corepack enable 71 | 72 | - name: Install dependencies 73 | run: yarn install 74 | 75 | - name: Download build 76 | uses: actions/download-artifact@v4 77 | with: 78 | name: build 79 | 80 | - name: Test with Node.js ${{ matrix.node-version }} 81 | run: yarn run test-coverage 82 | 83 | - name: Coveralls Parallel 84 | uses: coverallsapp/github-action@v2 85 | with: 86 | github-token: ${{ secrets.github_token }} 87 | flag-name: run-node-${{ matrix.test_number }} 88 | parallel: true 89 | 90 | test-bun: 91 | name: "Test with Bun (JavaScriptCore)" 92 | runs-on: ubuntu-latest 93 | needs: build 94 | 95 | strategy: 96 | matrix: 97 | bun-version: [1.2] 98 | 99 | env: 100 | YARN_IGNORE_NODE: 1 101 | 102 | steps: 103 | 104 | - name: 'Checkout the repository' 105 | uses: actions/checkout@v4 106 | 107 | - name: Setup Bun ${{ matrix.bun-version }} 108 | uses: oven-sh/setup-bun@v2 109 | with: 110 | bun-version: ${{ matrix.bun-version }} 111 | 112 | - name: Enable Corepack 113 | run: corepack enable 114 | 115 | - name: Install dependencies 116 | run: yarn install 117 | 118 | - name: Download build 119 | uses: actions/download-artifact@v4 120 | with: 121 | name: build 122 | 123 | - name: Unit tests with Bun ${{ matrix.bun-version }} 124 | run: bun run bun:test 125 | 126 | finish: 127 | needs: 128 | - test-nodejs 129 | - test-bun 130 | runs-on: ubuntu-latest 131 | steps: 132 | - name: Coveralls Finished 133 | uses: coverallsapp/github-action@v2 134 | with: 135 | github-token: ${{ secrets.github_token }} 136 | parallel-finished: true 137 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: 
-------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | branches: [ "master" ] 6 | pull_request: 7 | branches: [ "master" ] 8 | schedule: 9 | - cron: "32 12 * * 6" 10 | 11 | jobs: 12 | analyze: 13 | name: Analyze 14 | runs-on: ubuntu-latest 15 | permissions: 16 | actions: read 17 | contents: read 18 | security-events: write 19 | 20 | strategy: 21 | fail-fast: false 22 | matrix: 23 | language: [ javascript ] 24 | 25 | steps: 26 | - name: Checkout 27 | uses: actions/checkout@v4 28 | 29 | - name: Initialize CodeQL 30 | uses: github/codeql-action/init@v3 31 | with: 32 | languages: ${{ matrix.language }} 33 | queries: +security-and-quality 34 | 35 | - name: Autobuild 36 | uses: github/codeql-action/autobuild@v3 37 | 38 | - name: Perform CodeQL Analysis 39 | uses: github/codeql-action/analyze@v3 40 | with: 41 | category: "/language:${{ matrix.language }}" 42 | -------------------------------------------------------------------------------- /.github/workflows/post-dependabot-prs.yml: -------------------------------------------------------------------------------- 1 | name: Dependabot Pull Request 2 | on: pull_request_target 3 | jobs: 4 | build: 5 | runs-on: ubuntu-latest 6 | if: ${{ github.event.pull_request.user.login == 'dependabot[bot]' }} 7 | steps: 8 | - name: Fetch Dependabot metadata 9 | id: dependabot-metadata 10 | uses: dependabot/fetch-metadata@v2 11 | with: 12 | github-token: "${{ secrets.GITHUB_TOKEN }}" 13 | - name: Add dev-dependencies label 14 | uses: actions-ecosystem/action-add-labels@v1 15 | if: ${{ steps.dependabot-metadata.outputs.dependency-type == 'direct:development' }} 16 | with: 17 | labels: dev-dependencies 18 | - name: Remove dependencies label 19 | uses: actions-ecosystem/action-remove-labels@v1 20 | if: ${{ steps.dependabot-metadata.outputs.dependency-type == 'direct:development' }} 21 | with: 22 | labels: dependencies 23 | -------------------------------------------------------------------------------- /.github/workflows/release-drafter.yml: -------------------------------------------------------------------------------- 1 | name: Release Drafter 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | types: [opened, reopened, synchronize] 9 | 10 | permissions: 11 | contents: read 12 | 13 | jobs: 14 | update_release_draft: 15 | permissions: 16 | contents: write 17 | pull-requests: write 18 | runs-on: ubuntu-latest 19 | steps: 20 | - uses: release-drafter/release-drafter@v6 21 | env: 22 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 23 | -------------------------------------------------------------------------------- /.github/workflows/update-license.yml: -------------------------------------------------------------------------------- 1 | name: Update License Year 2 | 3 | on: 4 | schedule: 5 | - cron: "0 0 1 1 *" # Runs on January 1st every year 6 | workflow_dispatch: # Allows manual triggering 7 | 8 | jobs: 9 | update-license: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - name: Checkout repository 14 | uses: actions/checkout@v3 15 | with: 16 | token: ${{ secrets.GITHUB_TOKEN }} 17 | 18 | - name: Update LICENSE year 19 | run: | 20 | CURRENT_YEAR=$(date +"%Y") 21 | sed -E -i "s/(Copyright © )[0-9]{4}/\1$CURRENT_YEAR/" LICENSE.txt 22 | 23 | - name: Commit and push changes 24 | run: | 25 | CURRENT_YEAR=$(date +"%Y") 26 | git config --global user.name "Borewit" 27 | git config --global user.email "Borewit@users.noreply.github.com" 28 | git diff --quiet LICENSE.txt || (git add 
LICENSE.txt && git commit -m "Update license year to $CURRENT_YEAR" && git push) 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Node module 2 | node_modules 3 | 4 | #IntelliJ IDEA: 5 | .idea 6 | *.iml 7 | 8 | # Yarn: 9 | .pnp.* 10 | .yarn/* 11 | 12 | # Project 13 | lib/**/*.js 14 | lib/**/*.js.map 15 | lib/**/*.d.ts 16 | test/**/*.js 17 | test/**/*.js.map 18 | test/**/*.d.ts 19 | test/resources/tmp.dat 20 | coverage 21 | -------------------------------------------------------------------------------- /.mocharc.json: -------------------------------------------------------------------------------- 1 | { 2 | "extension": ["ts", "tsx"], 3 | "watch-files": ["lib/**/*.ts", "test/**/*.ts"], 4 | "spec": ["test/*.ts"], 5 | "loader": ["ts-node/esm"], 6 | "extensions": ["ts", "tsx"] 7 | } 8 | -------------------------------------------------------------------------------- /.yarnrc.yml: -------------------------------------------------------------------------------- 1 | nodeLinker: node-modules 2 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright © 2025 Borewit 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | [![Node.js CI](https://github.com/Borewit/strtok3/actions/workflows/ci.yml/badge.svg)](https://github.com/Borewit/strtok3/actions/workflows/ci.yml)
 2 | [![CodeQL](https://github.com/Borewit/strtok3/actions/workflows/codeql.yml/badge.svg?branch=master)](https://github.com/Borewit/strtok3/actions/workflows/codeql.yml)
 3 | [![NPM version](https://badge.fury.io/js/strtok3.svg)](https://npmjs.org/package/strtok3)
 4 | [![npm downloads](http://img.shields.io/npm/dm/strtok3.svg)](https://npmcharts.com/compare/strtok3,token-types?start=1200&interval=30)
 5 | [![DeepScan grade](https://deepscan.io/api/teams/5165/projects/8526/branches/103329/badge/grade.svg)](https://deepscan.io/dashboard#view=project&tid=5165&pid=8526&bid=103329)
 6 | [![Known Vulnerabilities](https://snyk.io/test/github/Borewit/strtok3/badge.svg?targetFile=package.json)](https://snyk.io/test/github/Borewit/strtok3?targetFile=package.json)
 7 | [![Codacy Badge](https://api.codacy.com/project/badge/Grade/59dd6795e61949fb97066ca52e6097ef)](https://www.codacy.com/app/Borewit/strtok3?utm_source=github.com&utm_medium=referral&utm_content=Borewit/strtok3&utm_campaign=Badge_Grade)
 8 | # strtok3
 9 | 
10 | A promise-based streaming [*tokenizer*](#tokenizer-object) for [Node.js](http://nodejs.org) and browsers.
11 | 
12 | The `strtok3` module provides several methods for creating a [*tokenizer*](#tokenizer-object) from various input sources.
13 | Designed for:
14 | * Seamless support in streaming environments.
15 | * Efficient decoding of binary data, strings, and numbers.
16 | * Reading [predefined](https://github.com/Borewit/token-types) or custom tokens.
17 | * Offering [*tokenizers*](#tokenizer-object) for reading from [files](#fromfile-function), [streams](#fromstream-function) or [Uint8Arrays](#frombuffer-function).
18 | 
19 | ### Features
20 | `strtok3` can read from:
21 | * Files, using a file path as input.
22 | * Node.js [streams](https://nodejs.org/api/stream.html).
23 | * [Buffer](https://nodejs.org/api/buffer.html) or [Uint8Array](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Uint8Array).
24 | * HTTP chunked transfer provided by [@tokenizer/http](https://github.com/Borewit/tokenizer-http).
25 | * [Amazon S3](https://aws.amazon.com/s3) chunks with [@tokenizer/s3](https://github.com/Borewit/tokenizer-s3).
26 | 
27 | ## Installation
28 | 
29 | ```sh
30 | npm install strtok3
31 | ```
32 | 
33 | ### Compatibility
34 | 
35 | Starting with version 7, the module has migrated from [CommonJS](https://en.wikipedia.org/wiki/CommonJS) to [pure ECMAScript Module (ESM)](https://gist.github.com/sindresorhus/a39789f98801d908bbc7ff3ecc99d99c).
36 | The distributed JavaScript codebase is compliant with the [ECMAScript 2020 (11th Edition)](https://en.wikipedia.org/wiki/ECMAScript_version_history#11th_Edition_%E2%80%93_ECMAScript_2020) standard.
37 | 
38 | Requires a modern browser, a Node.js (V8) ≥ 18 engine, or Bun (JavaScriptCore) ≥ 1.2.
39 | 
40 | For TypeScript CommonJS backward compatibility, you can use [load-esm](https://github.com/Borewit/load-esm).
41 | 
42 | > [!NOTE]
43 | > This module requires a [Node.js ≥ 18](https://nodejs.org/en/about/previous-releases) engine.
44 | > It can also be used in a browser environment when bundled with a module bundler.
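As an illustration of the CommonJS interop mentioned above, the sketch below loads the ESM-only `strtok3` from a CommonJS TypeScript module. The `loadEsm` named export and its generic signature are assumptions based on the [load-esm](https://github.com/Borewit/load-esm) project; a plain dynamic `import('strtok3')` works as well when the compiler does not transpile `import()` into `require()`.

```ts
// CommonJS consumer (sketch): load the ESM-only strtok3 via a dynamic import
import { loadEsm } from 'load-esm'; // assumed named export of load-esm

export async function readFirstByte(path: string): Promise<number> {
  // Dynamically import the ESM module without tsc rewriting it to require()
  const strtok3 = await loadEsm<typeof import('strtok3')>('strtok3');
  const tokenizer = await strtok3.fromFile(path);
  try {
    const buffer = new Uint8Array(1);
    await tokenizer.readBuffer(buffer); // read a single byte
    return buffer[0];
  } finally {
    await tokenizer.close();
  }
}
```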
45 | 
46 | ## Support the Project
47 | If you find this project useful and would like to support its development, consider sponsoring or contributing:
48 | 
49 | - [Become a sponsor to Borewit](https://github.com/sponsors/Borewit)
50 | 
51 | - Buy me a coffee:
52 | 
53 |   Buy me A coffee
54 | 
55 | ## API Documentation
56 | 
57 | ### strtok3 methods
58 | 
59 | Use one of the following methods to instantiate an [*abstract tokenizer*](#tokenizer-object):
60 | - [fromFile](#fromfile-function)*
61 | - [fromStream](#fromstream-function)*
62 | - [fromWebStream](#fromwebstream-function)
63 | - [fromBuffer](#frombuffer-function)
64 | 
65 | > **_NOTE:_** * `fromFile` and `fromStream` are only available when importing this module with Node.js.
66 | 
67 | All methods return a [`Tokenizer`](#tokenizer-object), either directly or via a promise.
68 | 
69 | #### `fromFile` function
70 | 
71 | Creates a [*tokenizer*](#tokenizer-object) from a local file.
72 | 
73 | ```ts
74 | function fromFile(sourceFilePath: string): Promise<FileTokenizer>
75 | ```
76 | 
77 | | Parameter      | Type     | Description                |
78 | |----------------|----------|----------------------------|
79 | | sourceFilePath | `string` | Path to file to read from  |
80 | 
81 | > [!NOTE]
82 | > - Only available for Node.js engines
83 | > - `fromFile` automatically embeds [file-information](#file-information)
84 | 
85 | Returns, via a promise, a [*tokenizer*](#tokenizer-object) which can be used to parse a file.
86 | 
87 | ```js
88 | import * as strtok3 from 'strtok3';
89 | import * as Token from 'token-types';
90 | 
91 | (async () => {
92 | 
93 |   const tokenizer = await strtok3.fromFile("somefile.bin");
94 |   try {
95 |     const myNumber = await tokenizer.readToken(Token.UINT8);
96 |     console.log(`My number: ${myNumber}`);
97 |   } finally {
98 |     await tokenizer.close(); // Close the file
99 |   }
100 | })();
101 | ```
102 | 
103 | #### `fromStream` function
104 | 
105 | Creates a [*tokenizer*](#tokenizer-object) from a Node.js [readable stream](https://nodejs.org/api/stream.html#stream_class_stream_readable).
106 | 
107 | ```ts
108 | function fromStream(stream: Readable, options?: ITokenizerOptions): Promise<ReadStreamTokenizer>
109 | ```
110 | 
111 | | Parameter | Optional | Type                                                                         | Description                                           |
112 | |-----------|----------|------------------------------------------------------------------------------|-------------------------------------------------------|
113 | | stream    | no       | [Readable](https://nodejs.org/api/stream.html#stream_class_stream_readable) | Stream to read from                                   |
114 | | options   | yes      | `ITokenizerOptions`                                                          | Tokenizer options, including [IFileInfo](#IFileInfo)  |
115 | 
116 | Returns a Promise providing a [*tokenizer*](#tokenizer-object).
117 | 
118 | > [!NOTE]
119 | > - Only available for Node.js engines
120 | 
121 | #### `fromWebStream` function
122 | 
123 | Creates a [*tokenizer*](#tokenizer-object) from a [WHATWG ReadableStream](https://nodejs.org/api/webstreams.html#web-streams-api).
124 | 
125 | ```ts
126 | function fromWebStream(webStream: AnyWebByteStream, options?: ITokenizerOptions): ReadStreamTokenizer
127 | ```
128 | 
129 | | Parameter | Optional | Type                                                                      | Description                                           |
130 | |-----------|----------|----------------------------------------------------------------------------|------------------------------------------------------|
131 | | webStream | no       | [ReadableStream](https://nodejs.org/api/webstreams.html#web-streams-api) | WHATWG ReadableStream to read from                    |
132 | | options   | yes      | `ITokenizerOptions`                                                       | Tokenizer options, including [IFileInfo](#IFileInfo)  |
133 | 
134 | Returns a [*tokenizer*](#tokenizer-object).
135 | 
136 | ```js
137 | import * as strtok3 from 'strtok3';
138 | import * as Token from 'token-types';
139 | 
140 | const tokenizer = strtok3.fromWebStream(readableStream);
141 | tokenizer.readToken(Token.UINT8).then(myUint8Number => {
142 |   console.log(`My number: ${myUint8Number}`);
143 | });
144 | 
145 | ```
146 | 
147 | #### `fromBuffer()` function
148 | 
149 | Creates a tokenizer from memory ([Uint8Array](https://nodejs.org/api/buffer.html)).
150 | 
151 | ```ts
152 | function fromBuffer(uint8Array: Uint8Array, options?: ITokenizerOptions): BufferTokenizer
153 | ```
154 | 
155 | | Parameter  | Optional | Type                                              | Description                                           |
156 | |------------|----------|----------------------------------------------------|------------------------------------------------------|
157 | | uint8Array | no       | [Uint8Array](https://nodejs.org/api/buffer.html)  | Uint8Array or Buffer to read from                     |
158 | | options    | yes      | `ITokenizerOptions`                               | Tokenizer options, including [IFileInfo](#IFileInfo)  |
159 | 
160 | Returns a [*tokenizer*](#tokenizer-object).
161 | 
162 | ```js
163 | import * as strtok3 from 'strtok3';
164 | import * as Token from 'token-types';
165 | 
166 | const tokenizer = strtok3.fromBuffer(buffer);
167 | 
168 | tokenizer.readToken(Token.UINT8).then(myUint8Number => {
169 |   console.log(`My number: ${myUint8Number}`);
170 | });
171 | ```
172 | ### `Tokenizer` object
173 | The *tokenizer* is an abstraction of a [stream](https://nodejs.org/api/stream.html), file or [Uint8Array](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Uint8Array), allowing _reading_ or _peeking_ from the stream.
174 | The same abstraction can also be mapped onto chunked reads, as done in [@tokenizer/http](https://github.com/Borewit/tokenizer-http).
175 | 
176 | #### Key Features:
177 | 
178 | - Supports skipping ahead in the stream using `tokenizer.ignore()`.
179 | - Offers `peek` methods to preview data without advancing the read pointer.
180 | - Maintains the read position via `tokenizer.position`.
181 | 
182 | #### Tokenizer functions
183 | 
184 | _Read_ methods advance the stream pointer, while _peek_ methods do not.
185 | 
186 | There are two kinds of functions:
187 | 1. *read* methods: used to read a *token* or [Buffer](https://nodejs.org/api/buffer.html) from the [*tokenizer*](#tokenizer-object). The position of the *tokenizer-stream* will advance by the size of the token.
188 | 2. *peek* methods: same as the *read* methods, but they will *not* advance the pointer, allowing you to read (peek) ahead.
189 | 
190 | #### `readBuffer` function
191 | 
192 | Reads data from the _tokenizer_ into the provided buffer (`Uint8Array`).
193 | `readBuffer(buffer, options?)`
194 | 
195 | ```ts
196 | readBuffer(buffer: Uint8Array, options?: IReadChunkOptions): Promise<number>;
197 | ```
198 | 
199 | | Parameter | Type                                                       | Description                              |
200 | |-----------|------------------------------------------------------------|------------------------------------------|
201 | | buffer    | [Buffer](https://nodejs.org/api/buffer.html) \| Uint8Array | Target buffer to write the data read to  |
202 | | options   | [IReadChunkOptions](#ireadchunkoptions)                    | Read behaviour options                   |
203 | 
204 | Returns a promise with the number of bytes read.
205 | The number of bytes read may be less than requested if the *mayBeLess* flag was set.
206 | 
207 | #### `peekBuffer` function
208 | 
209 | Peek (read ahead), from the [*tokenizer*](#tokenizer-object), into the buffer without advancing the stream pointer.
210 | 
211 | ```ts
212 | peekBuffer(uint8Array: Uint8Array, options?: IReadChunkOptions): Promise<number>;
213 | ```
214 | 
215 | | Parameter  | Type                                                       | Description                                         |
216 | |------------|------------------------------------------------------------|-----------------------------------------------------|
217 | | uint8Array | [Buffer](https://nodejs.org/api/buffer.html) \| Uint8Array | Target buffer to write the data read (peeked) to.   |
218 | | options    | [IReadChunkOptions](#ireadchunkoptions)                    | Read behaviour options.                             |
219 | 
220 | Returns a `Promise<number>` with the number of bytes peeked. The number of bytes peeked may be less than requested if the *mayBeLess* flag was set.
221 | 
222 | #### `readToken` function
223 | 
224 | Read a *token* from the tokenizer-stream.
225 | 
226 | ```ts
227 | readToken<T>(token: IGetToken<T>, position: number = this.position): Promise<T>
228 | ```
229 | 
230 | | Parameter | Type                    | Description                                                                                                            |
231 | |-----------|-------------------------|------------------------------------------------------------------------------------------------------------------------|
232 | | token     | [IGetToken](#IGetToken) | Token to read from the tokenizer-stream.                                                                               |
233 | | position? | number                  | Offset where to begin reading within the file. If position is null, data will be read from the current file position. |
234 | 
235 | Returns a promise with the token value read from the *tokenizer-stream*.
236 | 
237 | #### `peekToken` function
238 | 
239 | Peek a *token* from the [*tokenizer*](#tokenizer-object).
240 | 
241 | ```ts
242 | peekToken<T>(token: IGetToken<T>, position: number = this.position): Promise<T>
243 | ```
244 | 
245 | | Parameter | Type                    | Description                                                                                                            |
246 | |-----------|-------------------------|------------------------------------------------------------------------------------------------------------------------|
247 | | token     | [IGetToken](#IGetToken) | Token to peek from the tokenizer-stream.                                                                               |
248 | | position? | number                  | Offset where to begin reading within the file. If position is null, data will be read from the current file position. |
249 | 
250 | Returns a promise with the token value peeked from the [*tokenizer*](#tokenizer-object).
251 | 
252 | #### `readNumber` function
253 | 
254 | Read a numeric [*token*](#token-object) from the [*tokenizer*](#tokenizer-object).
255 | 
256 | ```ts
257 | readNumber(token: IToken<number>): Promise<number>
258 | ```
259 | 
260 | | Parameter | Type                    | Description                                       |
261 | |-----------|-------------------------|----------------------------------------------------|
262 | | token     | [IGetToken](#IGetToken) | Numeric token to read from the tokenizer-stream.   |
263 | 
264 | Returns a promise with the decoded numeric value from the *tokenizer-stream*.
265 | 
266 | #### `ignore` function
267 | 
268 | Advances the offset pointer by the number of bytes provided.
269 | 
270 | ```ts
271 | ignore(length: number): Promise<number>
272 | ```
273 | 
274 | | Parameter | Type   | Description                                                       |
275 | |-----------|--------|--------------------------------------------------------------------|
276 | | length    | number | Number of bytes to ignore. Will advance the `tokenizer.position`   |
277 | 
278 | Returns a promise with the number of bytes actually ignored.
279 | 
280 | #### `close` function
281 | Clean up resources, such as closing a file pointer if applicable.
282 | 
283 | #### `Tokenizer` attributes
284 | 
285 | - `fileInfo`
286 | 
287 |   Optional attribute describing the file information, see [IFileInfo](#IFileInfo).
288 | 
289 | - `position`
290 | 
291 |   Pointer to the current position in the [*tokenizer*](#tokenizer-object) stream.
292 |   If a *position* is provided to a _read_ or _peek_ method, it should be equal to or greater than this value.
293 | 
294 | ### `IReadChunkOptions` interface
295 | 
296 | Each attribute is optional:
297 | 
298 | | Attribute | Type    | Description                                                                                                                                                                                                                              |
299 | |-----------|---------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
300 | | length    | number  | Requested number of bytes to read.                                                                                                                                                                                                      |
301 | | position  | number  | Position where to begin reading from the file. If position is null, data will be read from the [current file position](#attribute-tokenizerposition). Position may not be less than [tokenizer.position](#attribute-tokenizerposition) |
302 | | mayBeLess | boolean | If set, no EOF error will be thrown if fewer bytes than requested could be read.                                                                                                                                                        |
303 | 
304 | Example usage:
305 | ```js
306 | tokenizer.peekBuffer(buffer, {mayBeLess: true});
307 | ```
308 | 
309 | ### `IFileInfo` interface
310 | 
311 | Provides optional metadata about the file being tokenized.
312 | 
313 | | Attribute | Type   | Description                                                                                        |
314 | |-----------|--------|-----------------------------------------------------------------------------------------------------|
315 | | size      | number | File size in bytes                                                                                 |
316 | | mimeType  | string | [MIME-type](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types) of file.  |
317 | | path      | string | File path                                                                                          |
318 | | url       | string | File URL                                                                                           |
319 | 
320 | ### `Token` object
321 | 
322 | The *token* is basically a description of what to read from the [*tokenizer-stream*](#tokenizer-object).
323 | A basic set of *token types* can be found here: [*token-types*](https://github.com/Borewit/token-types).
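Custom tokens can be defined as well. As a minimal, hypothetical sketch (implementing the `IGetToken` interface shown below), a 24-bit big-endian unsigned integer token could look like this:

```ts
import type { IGetToken } from 'strtok3';

// Hypothetical custom token: 24-bit big-endian unsigned integer
const UINT24_BE: IGetToken<number> = {
  len: 3, // number of bytes read from the tokenizer-stream
  get(buf: Uint8Array, off: number): number {
    // Combine three bytes, most significant byte first
    return (buf[off] << 16) | (buf[off + 1] << 8) | buf[off + 2];
  }
};

// Usage sketch: const value = await tokenizer.readToken(UINT24_BE);
```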
324 | 
325 | A token is something which implements the following interface:
326 | ```ts
327 | export interface IGetToken<T> {
328 | 
329 |   /**
330 |    * Length in bytes of encoded value
331 |    */
332 |   len: number;
333 | 
334 |   /**
335 |    * Decode value from buffer at offset
336 |    * @param buf Buffer to read the decoded value from
337 |    * @param off Decode offset
338 |    */
339 |   get(buf: Uint8Array, off: number): T;
340 | }
341 | ```
342 | The *tokenizer* reads `token.len` bytes from the *tokenizer-stream* into a Buffer.
343 | `token.get` is then called with that Buffer; it is responsible for converting the buffer into the desired output type.
344 | 
345 | ### Working with Web-API readable stream
346 | To convert a [Web-API readable stream](https://developer.mozilla.org/en-US/docs/Web/API/ReadableStreamDefaultReader) into a [Node.js readable stream](https://nodejs.org/api/stream.html#stream_readable_streams), you can use [readable-web-to-node-stream](https://github.com/Borewit/readable-web-to-node-stream).
347 | 
348 | ```js
349 | import { fromStream } from 'strtok3';
350 | import { ReadableWebToNodeStream } from 'readable-web-to-node-stream';
351 | 
352 | (async () => {
353 | 
354 |   const response = await fetch(url);
355 |   const readableWebStream = response.body; // Web-API readable stream
356 |   const nodeStream = new ReadableWebToNodeStream(readableWebStream); // convert to a Node.js readable stream
357 | 
358 |   const tokenizer = await fromStream(nodeStream); // And we now have a tokenizer in a web environment
359 | })();
360 | ```
361 | 
362 | ## Dependencies
363 | 
364 | The diagram below illustrates the primary dependencies of `strtok3`:
365 | 
366 | ```mermaid
367 | graph TD;
368 |   S(strtok3)-->P(peek-readable)
369 |   S(strtok3)-->TO("@tokenizer/token")
370 | ```
371 | 
372 | - [peek-readable](https://github.com/Borewit/peek-readable): Manages reading operations with peeking capabilities, allowing data to be previewed without advancing the read pointer.
373 | - [@tokenizer/token](https://github.com/Borewit/tokenizer-token): Provides token definitions and utilities used by `strtok3` for interpreting binary data.
374 | 
375 | ## Licence
376 | 
377 | This project is licensed under the [MIT License](LICENSE.txt). Feel free to use, modify, and distribute as needed.
378 | -------------------------------------------------------------------------------- /biome.jsonc: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://biomejs.dev/schemas/1.9.3/schema.json", 3 | "organizeImports": { 4 | "enabled": false 5 | }, 6 | "formatter": { 7 | "enabled": false 8 | }, 9 | "linter": { 10 | "enabled": true, 11 | "rules": { 12 | "correctness": { 13 | "noUnusedImports": "error", 14 | "noNodejsModules": "error" 15 | }, 16 | "recommended": true, 17 | "complexity": { 18 | "noForEach": "off" 19 | }, 20 | "suspicious": { 21 | "noEmptyBlockStatements": "error", 22 | "useErrorMessage": "error" 23 | }, 24 | "nursery":{ 25 | "noRestrictedImports": { 26 | "level": "error", 27 | "options": { 28 | "paths": { 29 | "node:buffer": "Use Uint8Array instead of Buffer" 30 | } 31 | } 32 | }}, 33 | "style":{ 34 | "useConsistentBuiltinInstantiation": "error", 35 | "useThrowNewError": "error", 36 | "useThrowOnlyError": "error" 37 | } 38 | } 39 | }, 40 | "files": { 41 | "ignoreUnknown": true, 42 | "ignore": [ 43 | "./coverage", 44 | "./yarn", 45 | "./.pnp.*", 46 | "./lib/**/*.d.ts", 47 | "./lib/**/*.js", 48 | "./test/**/*.d.ts", 49 | "./test/**/*.js" 50 | ] 51 | }, 52 | "overrides": [ 53 | { 54 | "include": ["./test/**/*", "./lib/index.ts", "./lib/FileTokenizer.ts"], 55 | "linter": { 56 | "rules": { 57 | "correctness": { 58 | "noNodejsModules": "off" 59 | } 60 | } 61 | } 62 | } 63 | ] 64 | } 65 | -------------------------------------------------------------------------------- /lib/AbstractTokenizer.ts: -------------------------------------------------------------------------------- 1 | import type { ITokenizer, IFileInfo, IReadChunkOptions, OnClose, ITokenizerOptions } from './types.js'; 2 | import type { IGetToken, IToken } from '@tokenizer/token'; 3 | import { EndOfStreamError } from 'peek-readable'; 4 | 5 | interface INormalizedReadChunkOptions extends IReadChunkOptions { 6 | length: number; 7 | position: number; 8 | mayBeLess?: boolean; 9 | } 10 | 11 | /** 12 | * Core tokenizer 13 | */ 14 | export abstract class AbstractTokenizer implements ITokenizer { 15 | 16 | private onClose?: OnClose; 17 | private numBuffer = new Uint8Array(8); 18 | 19 | public abstract fileInfo: IFileInfo; 20 | 21 | /** 22 | * Tokenizer-stream position 23 | */ 24 | public position = 0; 25 | 26 | 27 | /** 28 | * Constructor 29 | * @param options Tokenizer options 30 | * @protected 31 | */ 32 | protected constructor(options?: ITokenizerOptions) { 33 | this.onClose = options?.onClose; 34 | if (options?.abortSignal) { 35 | options.abortSignal.addEventListener('abort', () => { 36 | this.abort(); 37 | }) 38 | } 39 | } 40 | 41 | abstract supportsRandomAccess(): boolean; 42 | 43 | /** 44 | * Read buffer from tokenizer 45 | * @param buffer - Target buffer to fill with data read from the tokenizer-stream 46 | * @param options - Additional read options 47 | * @returns Promise with number of bytes read 48 | */ 49 | public abstract readBuffer(buffer: Uint8Array, options?: IReadChunkOptions): Promise; 50 | 51 | /** 52 | * Peek (read ahead) buffer from tokenizer 53 | * @param uint8Array - Target buffer to fill with data peeked from the tokenizer-stream 54 | * @param options - Peek behaviour options 55 | * @returns Promise with number of bytes read 56 | */ 57 | public abstract peekBuffer(uint8Array: Uint8Array, options?: IReadChunkOptions): Promise; 58 | 59 | /** 60 | * Read a token from the tokenizer-stream 61 | * @param token - The token to read 62 | * @param position - If 
provided, the desired position in the tokenizer-stream 63 | * @returns Promise with token data 64 | */ 65 | public async readToken(token: IGetToken, position: number = this.position): Promise { 66 | const uint8Array = new Uint8Array(token.len); 67 | const len = await this.readBuffer(uint8Array, {position}); 68 | if (len < token.len) 69 | throw new EndOfStreamError(); 70 | return token.get(uint8Array, 0); 71 | } 72 | 73 | /** 74 | * Peek a token from the tokenizer-stream. 75 | * @param token - Token to peek from the tokenizer-stream. 76 | * @param position - Offset where to begin reading within the file. If position is null, data will be read from the current file position. 77 | * @returns Promise with token data 78 | */ 79 | public async peekToken(token: IGetToken, position: number = this.position): Promise { 80 | const uint8Array = new Uint8Array(token.len); 81 | const len = await this.peekBuffer(uint8Array, {position}); 82 | if (len < token.len) 83 | throw new EndOfStreamError(); 84 | return token.get(uint8Array, 0); 85 | } 86 | 87 | /** 88 | * Read a numeric token from the stream 89 | * @param token - Numeric token 90 | * @returns Promise with number 91 | */ 92 | public async readNumber(token: IToken): Promise { 93 | const len = await this.readBuffer(this.numBuffer, {length: token.len}); 94 | if (len < token.len) 95 | throw new EndOfStreamError(); 96 | return token.get(this.numBuffer, 0); 97 | } 98 | 99 | /** 100 | * Read a numeric token from the stream 101 | * @param token - Numeric token 102 | * @returns Promise with number 103 | */ 104 | public async peekNumber(token: IToken): Promise { 105 | const len = await this.peekBuffer(this.numBuffer, {length: token.len}); 106 | if (len < token.len) 107 | throw new EndOfStreamError(); 108 | return token.get(this.numBuffer, 0); 109 | } 110 | 111 | /** 112 | * Ignore number of bytes, advances the pointer in under tokenizer-stream. 
113 | * @param length - Number of bytes to ignore 114 | * @return resolves the number of bytes ignored, equals length if this available, otherwise the number of bytes available 115 | */ 116 | public async ignore(length: number): Promise { 117 | if (this.fileInfo.size !== undefined) { 118 | const bytesLeft = this.fileInfo.size - this.position; 119 | if (length > bytesLeft) { 120 | this.position += bytesLeft; 121 | return bytesLeft; 122 | } 123 | } 124 | this.position += length; 125 | return length; 126 | } 127 | 128 | public async close(): Promise { 129 | await this.abort(); 130 | await this.onClose?.(); 131 | } 132 | 133 | protected normalizeOptions(uint8Array: Uint8Array, options?: IReadChunkOptions): INormalizedReadChunkOptions { 134 | 135 | if (!this.supportsRandomAccess() && options && options.position !== undefined && options.position < this.position) { 136 | throw new Error('`options.position` must be equal or greater than `tokenizer.position`'); 137 | } 138 | 139 | return { 140 | ...{ 141 | mayBeLess: false, 142 | offset: 0, 143 | length: uint8Array.length, 144 | position: this.position 145 | }, ...options 146 | }; 147 | } 148 | 149 | public abort(): Promise { 150 | return Promise.resolve(); // Ignore abort signal 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /lib/BufferTokenizer.ts: -------------------------------------------------------------------------------- 1 | import type {ITokenizerOptions, IReadChunkOptions, IRandomAccessFileInfo, IRandomAccessTokenizer} from './types.js'; 2 | import { EndOfStreamError } from 'peek-readable'; 3 | import { AbstractTokenizer } from './AbstractTokenizer.js'; 4 | 5 | export class BufferTokenizer extends AbstractTokenizer implements IRandomAccessTokenizer { 6 | 7 | public fileInfo: IRandomAccessFileInfo; 8 | 9 | /** 10 | * Construct BufferTokenizer 11 | * @param uint8Array - Uint8Array to tokenize 12 | * @param options Tokenizer options 13 | */ 14 | constructor(private uint8Array: Uint8Array, options?: ITokenizerOptions) { 15 | super(options); 16 | this.fileInfo = {...options?.fileInfo ?? 
{}, ...{size: uint8Array.length}}; 17 | } 18 | 19 | /** 20 | * Read buffer from tokenizer 21 | * @param uint8Array - Uint8Array to tokenize 22 | * @param options - Read behaviour options 23 | * @returns {Promise} 24 | */ 25 | public async readBuffer(uint8Array: Uint8Array, options?: IReadChunkOptions): Promise { 26 | 27 | if (options?.position) { 28 | this.position = options.position; 29 | } 30 | 31 | const bytesRead = await this.peekBuffer(uint8Array, options); 32 | this.position += bytesRead; 33 | return bytesRead; 34 | } 35 | 36 | /** 37 | * Peek (read ahead) buffer from tokenizer 38 | * @param uint8Array 39 | * @param options - Read behaviour options 40 | * @returns {Promise} 41 | */ 42 | public async peekBuffer(uint8Array: Uint8Array, options?: IReadChunkOptions): Promise { 43 | 44 | const normOptions = this.normalizeOptions(uint8Array, options); 45 | 46 | const bytes2read = Math.min(this.uint8Array.length - normOptions.position, normOptions.length); 47 | if ((!normOptions.mayBeLess) && bytes2read < normOptions.length) { 48 | throw new EndOfStreamError(); 49 | } 50 | uint8Array.set(this.uint8Array.subarray(normOptions.position, normOptions.position + bytes2read)); 51 | return bytes2read; 52 | } 53 | 54 | public close(): Promise { 55 | return super.close(); 56 | } 57 | 58 | supportsRandomAccess(): boolean { 59 | return true; 60 | } 61 | 62 | setPosition(position: number): void { 63 | this.position = position; 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /lib/FileTokenizer.ts: -------------------------------------------------------------------------------- 1 | import { AbstractTokenizer } from './AbstractTokenizer.js'; 2 | import { EndOfStreamError } from 'peek-readable'; 3 | import type {IRandomAccessTokenizer, IRandomAccessFileInfo, IReadChunkOptions, ITokenizerOptions} from './types.js'; 4 | import { type FileHandle, open as fsOpen } from 'node:fs/promises'; 5 | 6 | interface IFileTokenizerOptions extends ITokenizerOptions { 7 | /** 8 | * Pass additional file information to the tokenizer 9 | */ 10 | fileInfo: IRandomAccessFileInfo; 11 | } 12 | 13 | export class FileTokenizer extends AbstractTokenizer implements IRandomAccessTokenizer { 14 | 15 | public fileInfo: IRandomAccessFileInfo; 16 | 17 | /** 18 | * Create tokenizer from provided file path 19 | * @param sourceFilePath File path 20 | */ 21 | static async fromFile(sourceFilePath: string): Promise { 22 | const fileHandle = await fsOpen(sourceFilePath, 'r'); 23 | const stat = await fileHandle.stat(); 24 | return new FileTokenizer(fileHandle, {fileInfo: {path: sourceFilePath, size: stat.size}}); 25 | } 26 | 27 | protected constructor(private fileHandle: FileHandle, options: IFileTokenizerOptions) { 28 | super(options); 29 | this.fileInfo = options.fileInfo; 30 | } 31 | 32 | /** 33 | * Read buffer from file 34 | * @param uint8Array - Uint8Array to write result to 35 | * @param options - Read behaviour options 36 | * @returns Promise number of bytes read 37 | */ 38 | public async readBuffer(uint8Array: Uint8Array, options?: IReadChunkOptions): Promise { 39 | const normOptions = this.normalizeOptions(uint8Array, options); 40 | this.position = normOptions.position; 41 | if (normOptions.length === 0) return 0; 42 | const res = await this.fileHandle.read(uint8Array, 0, normOptions.length, normOptions.position); 43 | this.position += res.bytesRead; 44 | if (res.bytesRead < normOptions.length && (!options || !options.mayBeLess)) { 45 | throw new EndOfStreamError(); 46 | } 47 | return 
res.bytesRead; 48 | } 49 | 50 | /** 51 | * Peek buffer from file 52 | * @param uint8Array - Uint8Array (or Buffer) to write data to 53 | * @param options - Read behaviour options 54 | * @returns Promise number of bytes read 55 | */ 56 | public async peekBuffer(uint8Array: Uint8Array, options?: IReadChunkOptions): Promise { 57 | 58 | const normOptions = this.normalizeOptions(uint8Array, options); 59 | 60 | const res = await this.fileHandle.read(uint8Array, 0, normOptions.length, normOptions.position); 61 | if ((!normOptions.mayBeLess) && res.bytesRead < normOptions.length) { 62 | throw new EndOfStreamError(); 63 | } 64 | return res.bytesRead; 65 | } 66 | 67 | public async close(): Promise { 68 | await this.fileHandle.close(); 69 | return super.close(); 70 | } 71 | 72 | setPosition(position: number): void { 73 | this.position = position; 74 | } 75 | 76 | supportsRandomAccess(): boolean { 77 | return true; 78 | } 79 | } 80 | 81 | 82 | -------------------------------------------------------------------------------- /lib/ReadStreamTokenizer.ts: -------------------------------------------------------------------------------- 1 | import { AbstractTokenizer } from './AbstractTokenizer.js'; 2 | import { EndOfStreamError, type IStreamReader } from 'peek-readable'; 3 | import type {IFileInfo, IReadChunkOptions, ITokenizerOptions} from './types.js'; 4 | 5 | const maxBufferSize = 256000; 6 | 7 | export class ReadStreamTokenizer extends AbstractTokenizer { 8 | 9 | public fileInfo: IFileInfo; 10 | 11 | /** 12 | * Constructor 13 | * @param streamReader stream-reader to read from 14 | * @param options Tokenizer options 15 | */ 16 | public constructor(private streamReader: IStreamReader, options?: ITokenizerOptions) { 17 | super(options); 18 | this.fileInfo = options?.fileInfo ?? 
{}; 19 | } 20 | 21 | /** 22 | * Read buffer from tokenizer 23 | * @param uint8Array - Target Uint8Array to fill with data read from the tokenizer-stream 24 | * @param options - Read behaviour options 25 | * @returns Promise with number of bytes read 26 | */ 27 | public async readBuffer(uint8Array: Uint8Array, options?: IReadChunkOptions): Promise { 28 | const normOptions = this.normalizeOptions(uint8Array, options); 29 | const skipBytes = normOptions.position - this.position; 30 | if (skipBytes > 0) { 31 | await this.ignore(skipBytes); 32 | return this.readBuffer(uint8Array, options); 33 | } 34 | if (skipBytes < 0) { 35 | throw new Error('`options.position` must be equal or greater than `tokenizer.position`'); 36 | } 37 | if (normOptions.length === 0) { 38 | return 0; 39 | } 40 | const bytesRead = await this.streamReader.read(uint8Array.subarray(0, normOptions.length), normOptions.mayBeLess); 41 | this.position += bytesRead; 42 | if ((!options || !options.mayBeLess) && bytesRead < normOptions.length) { 43 | throw new EndOfStreamError(); 44 | } 45 | return bytesRead; 46 | } 47 | 48 | /** 49 | * Peek (read ahead) buffer from tokenizer 50 | * @param uint8Array - Uint8Array (or Buffer) to write data to 51 | * @param options - Read behaviour options 52 | * @returns Promise with number of bytes peeked 53 | */ 54 | public async peekBuffer(uint8Array: Uint8Array, options?: IReadChunkOptions): Promise { 55 | 56 | const normOptions = this.normalizeOptions(uint8Array, options); 57 | let bytesRead = 0; 58 | 59 | if (normOptions.position) { 60 | const skipBytes = normOptions.position - this.position; 61 | if (skipBytes > 0) { 62 | const skipBuffer = new Uint8Array(normOptions.length + skipBytes); 63 | bytesRead = await this.peekBuffer(skipBuffer, {mayBeLess: normOptions.mayBeLess}); 64 | uint8Array.set(skipBuffer.subarray(skipBytes)); 65 | return bytesRead - skipBytes; 66 | } 67 | if (skipBytes < 0) { 68 | throw new Error('Cannot peek from a negative offset in a stream'); 69 | } 70 | } 71 | 72 | if (normOptions.length > 0) { 73 | try { 74 | bytesRead = await this.streamReader.peek(uint8Array.subarray(0, normOptions.length), normOptions.mayBeLess); 75 | } catch (err) { 76 | if (options?.mayBeLess && err instanceof EndOfStreamError) { 77 | return 0; 78 | } 79 | throw err; 80 | } 81 | if ((!normOptions.mayBeLess) && bytesRead < normOptions.length) { 82 | throw new EndOfStreamError(); 83 | } 84 | } 85 | 86 | return bytesRead; 87 | } 88 | 89 | public async ignore(length: number): Promise { 90 | // debug(`ignore ${this.position}...${this.position + length - 1}`); 91 | const bufSize = Math.min(maxBufferSize, length); 92 | const buf = new Uint8Array(bufSize); 93 | let totBytesRead = 0; 94 | while (totBytesRead < length) { 95 | const remaining = length - totBytesRead; 96 | const bytesRead = await this.readBuffer(buf, {length: Math.min(bufSize, remaining)}); 97 | if (bytesRead < 0) { 98 | return bytesRead; 99 | } 100 | totBytesRead += bytesRead; 101 | } 102 | return totBytesRead; 103 | } 104 | 105 | public abort(): Promise { 106 | return this.streamReader.abort(); 107 | } 108 | 109 | public async close(): Promise { 110 | return this.streamReader.close(); 111 | } 112 | 113 | supportsRandomAccess(): boolean { 114 | return false; 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /lib/core.ts: -------------------------------------------------------------------------------- 1 | import type { Readable } from 'node:stream'; 2 | import { StreamReader, 
makeWebStreamReader, type AnyWebByteStream } from 'peek-readable'; 3 | 4 | import { ReadStreamTokenizer } from './ReadStreamTokenizer.js'; 5 | import { BufferTokenizer } from './BufferTokenizer.js'; 6 | import type { ITokenizerOptions } from './types.js'; 7 | 8 | export { EndOfStreamError, AbortError, type AnyWebByteStream } from 'peek-readable'; 9 | export type { ITokenizer, IRandomAccessTokenizer, IFileInfo, IRandomAccessFileInfo, ITokenizerOptions, IReadChunkOptions, OnClose } from './types.js'; 10 | export type { IToken, IGetToken } from '@tokenizer/token'; 11 | export { AbstractTokenizer } from './AbstractTokenizer.js'; 12 | 13 | /** 14 | * Construct ReadStreamTokenizer from given Stream. 15 | * Will set fileSize, if provided given Stream has set the .path property/ 16 | * @param stream - Read from Node.js Stream.Readable 17 | * @param options - Tokenizer options 18 | * @returns ReadStreamTokenizer 19 | */ 20 | export function fromStream(stream: Readable, options?: ITokenizerOptions): ReadStreamTokenizer { 21 | const streamReader= new StreamReader(stream); 22 | const _options: ITokenizerOptions = options ?? {}; 23 | const chainedClose = _options.onClose; 24 | _options.onClose = async () => { 25 | await streamReader.close(); 26 | if(chainedClose) { 27 | return chainedClose(); 28 | } 29 | }; 30 | return new ReadStreamTokenizer(streamReader, _options); 31 | } 32 | 33 | /** 34 | * Construct ReadStreamTokenizer from given ReadableStream (WebStream API). 35 | * Will set fileSize, if provided given Stream has set the .path property/ 36 | * @param webStream - Read from Node.js Stream.Readable (must be a byte stream) 37 | * @param options - Tokenizer options 38 | * @returns ReadStreamTokenizer 39 | */ 40 | export function fromWebStream(webStream: AnyWebByteStream, options?: ITokenizerOptions): ReadStreamTokenizer { 41 | const webStreamReader= makeWebStreamReader(webStream); 42 | const _options: ITokenizerOptions = options ?? {}; 43 | const chainedClose = _options.onClose; 44 | _options.onClose = async () => { 45 | await webStreamReader.close(); 46 | if(chainedClose) { 47 | return chainedClose(); 48 | } 49 | }; 50 | return new ReadStreamTokenizer(webStreamReader, _options); 51 | } 52 | 53 | /** 54 | * Construct ReadStreamTokenizer from given Buffer. 55 | * @param uint8Array - Uint8Array to tokenize 56 | * @param options - Tokenizer options 57 | * @returns BufferTokenizer 58 | */ 59 | export function fromBuffer(uint8Array: Uint8Array, options?: ITokenizerOptions): BufferTokenizer { 60 | return new BufferTokenizer(uint8Array, options); 61 | } 62 | -------------------------------------------------------------------------------- /lib/index.ts: -------------------------------------------------------------------------------- 1 | import type { Readable } from 'node:stream'; 2 | import type { ReadStreamTokenizer } from './ReadStreamTokenizer.js'; 3 | import { stat as fsStat } from 'node:fs/promises'; 4 | import { type ITokenizerOptions, fromStream as coreFromStream } from './core.js'; 5 | import {FileTokenizer} from "./FileTokenizer.js"; 6 | 7 | export { FileTokenizer } from './FileTokenizer.js'; 8 | export * from './core.js'; 9 | export type { IToken, IGetToken } from '@tokenizer/token'; 10 | 11 | interface StreamWithFile extends Readable { 12 | /** 13 | * Informal property set by `node:fs.createReadStream` 14 | */ 15 | path?: string; 16 | } 17 | 18 | /** 19 | * Construct ReadStreamTokenizer from given Stream. 20 | * Will set fileSize, if provided given Stream has set the .path property. 
21 | * @param stream - Node.js Stream.Readable 22 | * @param options - Pass additional file information to the tokenizer 23 | * @returns Tokenizer 24 | */ 25 | export async function fromStream(stream: Readable, options?: ITokenizerOptions): Promise { 26 | const rst = coreFromStream(stream, options); 27 | if ((stream as StreamWithFile).path) { 28 | const stat = await fsStat((stream as StreamWithFile).path as string); 29 | rst.fileInfo.path = (stream as StreamWithFile).path; 30 | rst.fileInfo.size = stat.size; 31 | } 32 | return rst; 33 | } 34 | 35 | export const fromFile = FileTokenizer.fromFile; 36 | -------------------------------------------------------------------------------- /lib/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "../tsconfig.json", 3 | "compilerOptions": { 4 | "declaration": true 5 | } 6 | } 7 | 8 | -------------------------------------------------------------------------------- /lib/types.ts: -------------------------------------------------------------------------------- 1 | import type { IGetToken } from '@tokenizer/token'; 2 | 3 | export interface IFileInfo { 4 | /** 5 | * File size in bytes 6 | */ 7 | size?: number; 8 | /** 9 | * MIME-type of file 10 | */ 11 | mimeType?: string; 12 | 13 | /** 14 | * File path 15 | */ 16 | path?: string; 17 | 18 | /** 19 | * File URL 20 | */ 21 | url?: string; 22 | } 23 | 24 | export interface IRandomAccessFileInfo extends IFileInfo { 25 | /** 26 | * File size in bytes 27 | */ 28 | size: number; 29 | } 30 | 31 | export interface IReadChunkOptions { 32 | 33 | /** 34 | * Number of bytes to read. 35 | */ 36 | length?: number; 37 | 38 | /** 39 | * Position where to begin reading from the file. 40 | * Default it is `tokenizer.position`. 41 | * Position may not be less than `tokenizer.position`, unless `supportsRandomAccess()` returns `true`. 42 | */ 43 | position?: number; 44 | 45 | /** 46 | * If set, will not throw an EOF error if not all off the requested data could be read 47 | */ 48 | mayBeLess?: boolean; 49 | } 50 | 51 | export interface IRandomAccessTokenizer extends ITokenizer { 52 | 53 | /** 54 | * Provide access to information of the underlying information stream or file. 55 | */ 56 | fileInfo: IRandomAccessFileInfo; 57 | 58 | /** 59 | * Change the position (offset) of the tokenizer 60 | * @param position New position 61 | */ 62 | setPosition(position: number): void; 63 | } 64 | 65 | /** 66 | * The tokenizer allows us to read or peek from the tokenizer-stream. 67 | * The tokenizer-stream is an abstraction of a stream, file or Buffer. 68 | */ 69 | export interface ITokenizer { 70 | 71 | /** 72 | * Provide access to information of the underlying information stream or file. 
73 | */ 74 | readonly fileInfo: IFileInfo; 75 | 76 | /** 77 | * Offset in bytes (= number of bytes read) since beginning of file or stream 78 | */ 79 | readonly position: number; 80 | 81 | /** 82 | * Peek (read ahead) buffer from tokenizer 83 | * @param buffer - Target buffer to fill with data peek from the tokenizer-stream 84 | * @param options - Read behaviour options 85 | * @returns Promise with number of bytes read 86 | */ 87 | peekBuffer(buffer: Uint8Array, options?: IReadChunkOptions): Promise; 88 | 89 | /** 90 | * Peek (read ahead) buffer from tokenizer 91 | * @param buffer - Target buffer to fill with data peeked from the tokenizer-stream 92 | * @param options - Additional read options 93 | * @returns Promise with number of bytes read 94 | */ 95 | readBuffer(buffer: Uint8Array, options?: IReadChunkOptions): Promise; 96 | 97 | /** 98 | * Peek a token from the tokenizer-stream. 99 | * @param token - Token to peek from the tokenizer-stream. 100 | * @param position - Offset where to begin reading within the file. If position is null, data will be read from the current file position. 101 | * @param maybeless - If set, will not throw an EOF error if the less then the requested length could be read. 102 | */ 103 | peekToken(token: IGetToken, position?: number | null, maybeless?: boolean): Promise; 104 | 105 | /** 106 | * Read a token from the tokenizer-stream. 107 | * @param token - Token to peek from the tokenizer-stream. 108 | * @param position - Offset where to begin reading within the file. If position is null, data will be read from the current file position. 109 | */ 110 | readToken(token: IGetToken, position?: number): Promise; 111 | 112 | /** 113 | * Peek a numeric token from the stream 114 | * @param token - Numeric token 115 | * @returns Promise with number 116 | */ 117 | peekNumber(token: IGetToken): Promise; 118 | 119 | /** 120 | * Read a numeric token from the stream 121 | * @param token - Numeric token 122 | * @returns Promise with number 123 | */ 124 | readNumber(token: IGetToken): Promise; 125 | 126 | /** 127 | * Ignore given number of bytes 128 | * @param length - Number of bytes ignored 129 | */ 130 | ignore(length: number): Promise; 131 | 132 | /** 133 | * Clean up resources. 134 | * It does not close the stream for StreamReader, but is does close the file-descriptor. 
135 | */ 136 | close(): Promise; 137 | 138 | /** 139 | * Abort pending asynchronous operations 140 | */ 141 | abort(): Promise; 142 | 143 | /** 144 | * Returns true when the underlying file supports random access 145 | */ 146 | supportsRandomAccess(): boolean; 147 | } 148 | 149 | export type OnClose = () => Promise; 150 | 151 | export interface ITokenizerOptions { 152 | /** 153 | * Pass additional file information to the tokenizer 154 | */ 155 | fileInfo?: IFileInfo; 156 | 157 | /** 158 | * On tokenizer close handler 159 | */ 160 | onClose?: OnClose; 161 | 162 | /** 163 | * Pass `AbortSignal` which can stop active async operations 164 | * Ref: https://developer.mozilla.org/en-US/docs/Web/API/AbortSignal 165 | */ 166 | abortSignal?: AbortSignal; 167 | } 168 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "strtok3", 3 | "version": "10.2.2", 4 | "description": "A promise based streaming tokenizer", 5 | "author": { 6 | "name": "Borewit", 7 | "url": "https://github.com/Borewit" 8 | }, 9 | "funding": { 10 | "type": "github", 11 | "url": "https://github.com/sponsors/Borewit" 12 | }, 13 | "scripts": { 14 | "clean": "del-cli 'lib/**/*.js' 'lib/**/*.js.map' 'lib/**/*.d.ts' 'test/**/*.js' 'test/**/*.js.map'", 15 | "compile-src": "tsc -p lib", 16 | "compile-test": "tsc -p test", 17 | "compile": "yarn run compile-src && yarn run compile-test", 18 | "build": "yarn run clean && yarn run compile", 19 | "eslint": "eslint lib test", 20 | "lint-md": "remark -u preset-lint-recommended .", 21 | "lint-ts": "biome check", 22 | "lint": "yarn run lint-md && yarn run lint-ts", 23 | "fix": "yarn run biome lint --write", 24 | "test": "mocha", 25 | "bun:test": "bun run --bun test", 26 | "test-coverage": "c8 yarn run test", 27 | "send-codacy": "c8 report --reporter=text-lcov | codacy-coverage", 28 | "start": "yarn run compile && yarn run lint && yarn run cover-test" 29 | }, 30 | "engines": { 31 | "node": ">=18" 32 | }, 33 | "repository": { 34 | "type": "git", 35 | "url": "https://github.com/Borewit/strtok3.git" 36 | }, 37 | "license": "MIT", 38 | "type": "module", 39 | "exports": { 40 | ".": { 41 | "node": "./lib/index.js", 42 | "default": "./lib/core.js" 43 | }, 44 | "./core": "./lib/core.js" 45 | }, 46 | "types": "lib/index.d.ts", 47 | "files": [ 48 | "lib/**/*.js", 49 | "lib/**/*.d.ts" 50 | ], 51 | "bugs": { 52 | "url": "https://github.com/Borewit/strtok3/issues" 53 | }, 54 | "dependencies": { 55 | "@tokenizer/token": "^0.3.0", 56 | "peek-readable": "^7.0.0" 57 | }, 58 | "devDependencies": { 59 | "@biomejs/biome": "^1.9.4", 60 | "@types/chai": "^5.2.2", 61 | "@types/chai-as-promised": "^8.0.2", 62 | "@types/debug": "^4.1.12", 63 | "@types/mocha": "^10.0.10", 64 | "@types/node": "^22.15.19", 65 | "c8": "^10.1.3", 66 | "chai": "^5.2.0", 67 | "chai-as-promised": "^8.0.1", 68 | "del-cli": "^6.0.0", 69 | "mocha": "^11.5.0", 70 | "node-readable-to-web-readable-stream": "^0.4.2", 71 | "remark-cli": "^12.0.1", 72 | "remark-preset-lint-recommended": "^7.0.1", 73 | "token-types": "^6.0.0", 74 | "ts-node": "^10.9.2", 75 | "typescript": "^5.8.3", 76 | "uint8array-extras": "^1.4.0" 77 | }, 78 | "keywords": [ 79 | "tokenizer", 80 | "reader", 81 | "token", 82 | "async", 83 | "promise", 84 | "parser", 85 | "decoder", 86 | "binary", 87 | "endian", 88 | "uint", 89 | "stream", 90 | "streaming" 91 | ], 92 | "packageManager": "yarn@4.9.1" 93 | } 94 | 
-------------------------------------------------------------------------------- /test/resources/id3v1.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Borewit/strtok3/5e7c191bd1930140438dd48fd837515c449365a3/test/resources/id3v1.mp3 -------------------------------------------------------------------------------- /test/resources/test1.dat: -------------------------------------------------------------------------------- 1 |  -------------------------------------------------------------------------------- /test/resources/test2.dat: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Borewit/strtok3/5e7c191bd1930140438dd48fd837515c449365a3/test/resources/test2.dat -------------------------------------------------------------------------------- /test/resources/test3.dat: -------------------------------------------------------------------------------- 1 |  -------------------------------------------------------------------------------- /test/test.ts: -------------------------------------------------------------------------------- 1 | import { PassThrough } from 'node:stream'; 2 | import * as fs from 'node:fs/promises'; 3 | import { createReadStream } from 'node:fs'; 4 | import { dirname } from 'node:path'; 5 | import { fileURLToPath } from 'node:url'; 6 | 7 | import * as Token from 'token-types'; 8 | import { assert, expect, use } from 'chai'; 9 | import chaiAsPromised from 'chai-as-promised'; 10 | import { 11 | fromBuffer, 12 | fromFile, 13 | fromStream, 14 | fromWebStream, 15 | type ITokenizer, 16 | type IRandomAccessTokenizer 17 | } from '../lib/index.js'; 18 | import Path from 'node:path'; 19 | import { EndOfStreamError } from 'peek-readable'; 20 | 21 | import mocha from 'mocha'; 22 | import { stringToUint8Array } from 'uint8array-extras'; 23 | 24 | import { DelayedStream, makeByteReadableStreamFromFile } from './util.js'; 25 | import process from 'node:process'; 26 | 27 | use(chaiAsPromised); 28 | 29 | const __dirname = dirname(fileURLToPath(import.meta.url)); 30 | 31 | const {describe, it} = mocha; 32 | 33 | interface ITokenizerTest { 34 | name: string; 35 | loadTokenizer: (testFile: string, delay?: number, abortSignal?: AbortSignal) => Promise<ITokenizer>; 36 | hasFileInfo: boolean; 37 | abortable: boolean; 38 | randomRead: boolean; 39 | } 40 | 41 | function getResourcePath(testFile: string) { 42 | return Path.join(__dirname, 'resources', testFile); 43 | } 44 | 45 | async function getTokenizerWithData(testData: string, test: ITokenizerTest, delay?: number, abortSignal?: AbortSignal): Promise<ITokenizer> { 46 | const testPath = getResourcePath('tmp.dat'); 47 | await fs.writeFile(testPath, testData, {encoding: 'latin1'}); 48 | return test.loadTokenizer('tmp.dat', delay, abortSignal); 49 | } 50 | 51 | describe('Matrix tests', () => { 52 | 53 | const tokenizerTests: ITokenizerTest[] = [ 54 | { 55 | name: 'fromStream()', 56 | loadTokenizer: async (testFile, delay, abortSignal?: AbortSignal) => { 57 | const stream = createReadStream(getResourcePath(testFile)); 58 | const delayedStream = new DelayedStream(stream, delay); 59 | return fromStream(delayedStream, {abortSignal}); 60 | }, 61 | hasFileInfo: true, 62 | abortable: true, 63 | randomRead: false 64 | }, { 65 | name: 'fromWebStream()', 66 | loadTokenizer: async (testFile, delay, abortSignal?: AbortSignal) => { 67 | const fileStream = makeByteReadableStreamFromFile(Path.join(__dirname, 'resources', testFile), delay); 68 | return 
fromWebStream(fileStream, {abortSignal}); 69 | }, 70 | hasFileInfo: false, 71 | abortable: true, 72 | randomRead: false 73 | }, { 74 | name: 'fromFile()', 75 | loadTokenizer: async testFile => { 76 | return fromFile(Path.join(__dirname, 'resources', testFile)); 77 | }, 78 | hasFileInfo: true, 79 | abortable: false, 80 | randomRead: true 81 | }, { 82 | name: 'fromBuffer()', 83 | loadTokenizer: async testFile => { 84 | const data = await fs.readFile(Path.join(__dirname, 'resources', testFile)); 85 | return fromBuffer(data); 86 | }, 87 | hasFileInfo: true, 88 | abortable: false, 89 | randomRead: true 90 | } 91 | ]; 92 | 93 | tokenizerTests 94 | // .filter((x, n) => n === 1) 95 | .forEach(tokenizerType => { 96 | describe(tokenizerType.name, () => { 97 | 98 | describe('tokenizer read options', () => { 99 | 100 | it('option.offset', async () => { 101 | const buf = new Uint8Array(7); 102 | const rst = await getTokenizerWithData('\x01\x02\x03\x04\x05\x06', tokenizerType); 103 | assert.strictEqual(await rst.readBuffer(buf.subarray(1), {length: 6}), 6); 104 | await rst.close(); 105 | }); 106 | 107 | it('option.length', async () => { 108 | const buf = new Uint8Array(7); 109 | const rst = await getTokenizerWithData('\x01\x02\x03\x04\x05\x06', tokenizerType); 110 | assert.strictEqual(await rst.readBuffer(buf, {length: 2}), 2); 111 | await rst.close(); 112 | }); 113 | 114 | it('default length', async () => { 115 | const buf = new Uint8Array(6); 116 | const rst = await getTokenizerWithData('\x01\x02\x03\x04\x05\x06', tokenizerType); 117 | assert.strictEqual(await rst.readBuffer(buf.subarray(1)), 5, 'default length = buffer.length - option.offset'); 118 | await rst.close(); 119 | }); 120 | 121 | it('option.maybeLess = true', async () => { 122 | const buffer = new Uint8Array(4); 123 | const rst = await getTokenizerWithData('\x89\x54\x40', tokenizerType); 124 | const len = await rst.readBuffer(buffer, {mayBeLess: true}); 125 | assert.strictEqual(len, 3, 'should return 3 because no more bytes are available'); 126 | await rst.close(); 127 | }); 128 | 129 | it('option.position', async () => { 130 | const buffer = new Uint8Array(5); 131 | const rst = await getTokenizerWithData('\x01\x02\x03\x04\x05\x06', tokenizerType); 132 | const len = await rst.readBuffer(buffer, {position: 1}); 133 | assert.strictEqual(len, 5, 'return value'); 134 | assert.deepEqual(buffer, Uint8Array.from([0x02, 0x03, 0x04, 0x05, 0x06])); 135 | await rst.close(); 136 | }); 137 | 138 | }); 139 | 140 | describe('tokenizer peek options', () => { 141 | 142 | it('option.offset', async () => { 143 | const buf = new Uint8Array(7); 144 | const rst = await getTokenizerWithData('\x01\x02\x03\x04\x05\x06', tokenizerType); 145 | assert.strictEqual(await rst.peekBuffer(buf.subarray(1), {length: 6}), 6); 146 | await rst.close(); 147 | }); 148 | 149 | it('option.length', async () => { 150 | const buf = new Uint8Array(7); 151 | const rst = await getTokenizerWithData('\x01\x02\x03\x04\x05\x06', tokenizerType); 152 | assert.strictEqual(await rst.peekBuffer(buf, {length: 2}), 2); 153 | await rst.close(); 154 | }); 155 | 156 | it('default length', async () => { 157 | const buf = new Uint8Array(6); 158 | const rst = await getTokenizerWithData('\x01\x02\x03\x04\x05\x06', tokenizerType); 159 | assert.strictEqual(await rst.peekBuffer(buf.subarray(1)), 5, 'default length = buffer.length - option.offset'); 160 | await rst.close(); 161 | }); 162 | 163 | it('option.maybeLess = true', async () => { 164 | const buffer = new Uint8Array(4); 165 | const rst = await 
getTokenizerWithData('\x89\x54\x40', tokenizerType); 166 | const len = await rst.peekBuffer(buffer, {mayBeLess: true}); 167 | assert.strictEqual(len, 3, 'should return 3 because no more bytes are available'); 168 | await rst.close(); 169 | }); 170 | 171 | it('option.position', async () => { 172 | const buffer = new Uint8Array(5); 173 | const rst = await getTokenizerWithData('\x01\x02\x03\x04\x05\x06', tokenizerType); 174 | const len = await rst.peekBuffer(buffer, {position: 1}); 175 | assert.strictEqual(len, 5, 'return value'); 176 | assert.deepEqual(buffer, Uint8Array.from([0x02, 0x03, 0x04, 0x05, 0x06])); 177 | await rst.close(); 178 | }); 179 | 180 | }); 181 | 182 | it('should decode buffer', async () => { 183 | 184 | const rst = await getTokenizerWithData('\x05peter', tokenizerType); 185 | // should decode UINT8 from chunk 186 | assert.strictEqual(rst.position, 0); 187 | let value: string | number = await rst.readToken(Token.UINT8); 188 | assert.strictEqual(typeof value, 'number'); 189 | assert.strictEqual(value, 5, '0x05 == 5'); 190 | // should decode string from chunk 191 | assert.strictEqual(rst.position, 1); 192 | value = await rst.readToken(new Token.StringType(5, 'utf-8')); 193 | assert.strictEqual(typeof value, 'string'); 194 | assert.strictEqual(value, 'peter'); 195 | assert.strictEqual(rst.position, 6); 196 | // should should reject at the end of the stream 197 | try { 198 | await rst.readToken(Token.UINT8); 199 | assert.fail('Should reject due to end-of-stream'); 200 | } catch (err) { 201 | assert.instanceOf(err, EndOfStreamError); 202 | } finally { 203 | await rst.close(); 204 | } 205 | }); 206 | 207 | it('should be able to read from an absolute offset', async () => { 208 | 209 | const rst = await getTokenizerWithData('\x05peter', tokenizerType); 210 | // should decode UINT8 from chunk 211 | assert.strictEqual(rst.position, 0); 212 | const value: string | number = await rst.readToken(new Token.StringType(5, 'utf-8'), 1); 213 | assert.strictEqual(typeof value, 'string'); 214 | assert.strictEqual(value, 'peter'); 215 | assert.strictEqual(rst.position, 6); 216 | 217 | try { 218 | await rst.readToken(Token.UINT8); 219 | assert.fail('Should reject due to end-of-stream'); 220 | } catch (err) { 221 | assert.instanceOf(err, EndOfStreamError); 222 | } finally { 223 | await rst.close(); 224 | } 225 | 226 | }); 227 | 228 | it('should pick length from buffer, if length is not explicit defined', async () => { 229 | 230 | const rst = await getTokenizerWithData('\x05peter', tokenizerType); 231 | 232 | const buf = new Uint8Array(4); 233 | 234 | // should decode UINT8 from chunk 235 | assert.strictEqual(rst.position, 0); 236 | const bufferLength = await rst.readBuffer(buf); 237 | assert.strictEqual(bufferLength, buf.length); 238 | assert.strictEqual(rst.position, buf.length); 239 | await rst.close(); 240 | }); 241 | 242 | it('should contain fileSize if constructed from file-read-stream', async () => { 243 | if (tokenizerType.hasFileInfo) { 244 | const rst = await tokenizerType.loadTokenizer('test1.dat'); 245 | assert.strictEqual(rst.fileInfo.size, 16, ' ReadStreamTokenizer.fileSize.size'); 246 | await rst.close(); 247 | } 248 | }); 249 | 250 | describe('Parsing binary numbers', () => { 251 | 252 | it('should encode signed 8-bit integer (INT8)', () => { 253 | 254 | const b = new Uint8Array(1); 255 | 256 | Token.INT8.put(b, 0, 0x00); 257 | assert.deepEqual(b, Uint8Array.from([0x00])); 258 | 259 | Token.INT8.put(b, 0, 0x22); 260 | assert.deepEqual(b, Uint8Array.from([0x22])); 261 | 262 | 
Token.INT8.put(b, 0, -0x22); 263 | assert.deepEqual(b, Uint8Array.from([0xde])); 264 | }); 265 | 266 | it('should decode signed 8-bit integer (INT8)', async () => { 267 | 268 | const rst = await getTokenizerWithData('\x00\x7f\x80\xff\x81', tokenizerType); 269 | 270 | let value: number = await rst.readToken(Token.INT8); 271 | assert.strictEqual(typeof value, 'number'); 272 | assert.strictEqual(value, 0, 'INT8 #1 == 0'); 273 | value = await rst.readToken(Token.INT8); 274 | assert.strictEqual(typeof value, 'number'); 275 | assert.strictEqual(value, 127, 'INT8 #2 == 127'); 276 | value = await rst.readToken(Token.INT8); 277 | assert.strictEqual(typeof value, 'number'); 278 | assert.strictEqual(value, -128, 'INT8 #3 == -128'); 279 | value = await rst.readToken(Token.INT8); 280 | assert.strictEqual(typeof value, 'number'); 281 | assert.strictEqual(value, -1, 'INT8 #4 == -1'); 282 | value = await rst.readToken(Token.INT8); 283 | assert.strictEqual(typeof value, 'number'); 284 | assert.strictEqual(value, -127, 'INT8 #5 == -127'); 285 | 286 | await rst.close(); 287 | 288 | }); 289 | 290 | it('should encode signed 16-bit big-endian integer (INT16_BE)', () => { 291 | 292 | const b = new Uint8Array(2); 293 | 294 | Token.INT16_BE.put(b, 0, 0x00); 295 | assert.deepEqual(b, Uint8Array.from([0x00, 0x00])); 296 | 297 | Token.INT16_BE.put(b, 0, 0x0f0b); 298 | assert.deepEqual(b, Uint8Array.from([0x0f, 0x0b])); 299 | 300 | Token.INT16_BE.put(b, 0, -0x0f0b); 301 | assert.deepEqual(b, Uint8Array.from([0xf0, 0xf5])); 302 | }); 303 | 304 | it('should decode signed 16-bit big-endian integer (INT16_BE)', async () => { 305 | 306 | const rst = await getTokenizerWithData('\x0a\x1a\x00\x00\xff\xff\x80\x00', tokenizerType); 307 | 308 | let value: number = await rst.readToken(Token.INT16_BE); 309 | assert.strictEqual(typeof value, 'number'); 310 | assert.strictEqual(value, 2586, 'INT16_BE#1'); 311 | value = await rst.readToken(Token.INT16_BE); 312 | assert.strictEqual(typeof value, 'number'); 313 | assert.strictEqual(value, 0, 'INT16_BE#2'); 314 | value = await rst.readToken(Token.INT16_BE); 315 | assert.strictEqual(typeof value, 'number'); 316 | assert.strictEqual(value, -1, 'INT16_BE#3'); 317 | value = await rst.readToken(Token.INT16_BE); 318 | assert.strictEqual(typeof value, 'number'); 319 | assert.strictEqual(value, -32768, 'INT16_BE#4'); 320 | 321 | await rst.close(); 322 | }); 323 | 324 | it('should encode signed 24-bit big-endian integer (INT24_BE)', async () => { 325 | 326 | const b = new Uint8Array(3); 327 | 328 | Token.INT24_BE.put(b, 0, 0x00); 329 | assert.deepEqual(b, Uint8Array.from([0x00, 0x00, 0x00])); 330 | 331 | Token.INT24_BE.put(b, 0, 0x0f0ba0); 332 | assert.deepEqual(b, Uint8Array.from([0x0f, 0x0b, 0xa0])); 333 | 334 | Token.INT24_BE.put(b, 0, -0x0f0bcc); 335 | assert.deepEqual(b, Uint8Array.from([0xf0, 0xf4, 0x34])); 336 | }); 337 | 338 | it('should decode signed 24-bit big-endian integer (INT24_BE)', async () => { 339 | 340 | const rst = await getTokenizerWithData('\x00\x00\x00\xff\xff\xff\x10\x00\xff\x80\x00\x00', tokenizerType); 341 | 342 | let value: number = await rst.readToken(Token.INT24_BE); 343 | assert.strictEqual(typeof value, 'number'); 344 | assert.strictEqual(value, 0, 'INT24_BE#1'); 345 | value = await rst.readToken(Token.INT24_BE); 346 | assert.strictEqual(typeof value, 'number'); 347 | assert.strictEqual(value, -1, 'INT24_BE#2'); 348 | value = await rst.readToken(Token.INT24_BE); 349 | assert.strictEqual(typeof value, 'number'); 350 | assert.strictEqual(value, 1048831, 
'INT24_BE#3'); 351 | value = await rst.readToken(Token.INT24_BE); 352 | assert.strictEqual(typeof value, 'number'); 353 | assert.strictEqual(value, -8388608, 'INT24_BE#4'); 354 | await rst.close(); 355 | }); 356 | 357 | // ToDo: test decoding: INT24_LE 358 | 359 | it('should encode signed 32-bit big-endian integer (INT32_BE)', () => { 360 | 361 | const b = new Uint8Array(4); 362 | 363 | Token.INT32_BE.put(b, 0, 0x00); 364 | assert.deepEqual(b, Uint8Array.from([0x00, 0x00, 0x00, 0x00])); 365 | 366 | Token.INT32_BE.put(b, 0, 0x0f0bcca0); 367 | assert.deepEqual(b, Uint8Array.from([0x0f, 0x0b, 0xcc, 0xa0])); 368 | 369 | Token.INT32_BE.put(b, 0, -0x0f0bcca0); 370 | assert.deepEqual(b, Uint8Array.from([0xf0, 0xf4, 0x33, 0x60])); 371 | }); 372 | 373 | it('should decode signed 32-bit big-endian integer (INT32_BE)', async () => { 374 | 375 | const rst = await getTokenizerWithData('\x00\x00\x00\x00\xff\xff\xff\xff\x00\x10\x00\xff\x80\x00\x00\x00', tokenizerType); 376 | 377 | let value: number = await rst.readToken(Token.INT32_BE); 378 | assert.strictEqual(typeof value, 'number'); 379 | assert.strictEqual(value, 0, 'INT32_BE #1'); 380 | value = await rst.readToken(Token.INT32_BE); 381 | assert.strictEqual(typeof value, 'number'); 382 | assert.strictEqual(value, -1, 'INT32_BE #2'); 383 | value = await rst.readToken(Token.INT32_BE); 384 | assert.strictEqual(typeof value, 'number'); 385 | assert.strictEqual(value, 1048831, 'INT32_BE #3'); 386 | value = await rst.readToken(Token.INT32_BE); 387 | assert.strictEqual(typeof value, 'number'); 388 | assert.strictEqual(value, -2147483648, 'INT32_BE #4'); 389 | await rst.close(); 390 | }); 391 | 392 | it('should encode signed 8-bit big-endian integer (INT8)', () => { 393 | 394 | const b = new Uint8Array(1); 395 | 396 | Token.UINT8.put(b, 0, 0x00); 397 | assert.deepEqual(b, Uint8Array.from([0x00])); 398 | 399 | Token.UINT8.put(b, 0, 0xff); 400 | assert.deepEqual(b, Uint8Array.from([0xff])); 401 | }); 402 | 403 | it('should decode unsigned 8-bit integer (UINT8)', async () => { 404 | 405 | const rst = await getTokenizerWithData('\x00\x1a\xff', tokenizerType); 406 | 407 | let value: number = await rst.readToken(Token.UINT8); 408 | assert.strictEqual(typeof value, 'number'); 409 | assert.strictEqual(value, 0, 'UINT8 #1'); 410 | value = await rst.readToken(Token.UINT8); 411 | assert.strictEqual(typeof value, 'number'); 412 | assert.strictEqual(value, 26, 'UINT8 #2'); 413 | value = await rst.readToken(Token.UINT8); 414 | assert.strictEqual(typeof value, 'number'); 415 | assert.strictEqual(value, 255, 'UINT8 #3'); 416 | await rst.close(); 417 | }); 418 | 419 | it('should encode unsigned 16-bit big-endian integer (UINT16_LE)', () => { 420 | 421 | const b = new Uint8Array(4); 422 | 423 | Token.UINT16_LE.put(b, 0, 0x00); 424 | Token.UINT16_LE.put(b, 2, 0xffaa); 425 | assert.deepEqual(b, Uint8Array.from([0x00, 0x00, 0xaa, 0xff])); 426 | }); 427 | 428 | it('should encode unsigned 16-bit little-endian integer (UINT16_BE)', () => { 429 | const b = new Uint8Array(4); 430 | Token.UINT16_BE.put(b, 0, 0xf); 431 | Token.UINT16_BE.put(b, 2, 0xffaa); 432 | assert.deepEqual(b, Uint8Array.from([0x00, 0x0f, 0xff, 0xaa])); 433 | }); 434 | 435 | it('should encode unsigned 16-bit mixed little/big-endian integers', () => { 436 | const b = new Uint8Array(4); 437 | Token.UINT16_BE.put(b, 0, 0xffaa); 438 | Token.UINT16_LE.put(b, 2, 0xffaa); 439 | assert.deepEqual(b, Uint8Array.from([0xff, 0xaa, 0xaa, 0xff])); 440 | }); 441 | 442 | it('should decode unsigned mixed 16-bit big/little-endian 
integer', async () => { 443 | 444 | const rst = await getTokenizerWithData('\x1a\x00\x1a\x00\x1a\x00\x1a\x00', tokenizerType); 445 | 446 | let value: number = await rst.readToken(Token.UINT16_LE); 447 | assert.strictEqual(typeof value, 'number'); 448 | assert.strictEqual(value, 0x001a, 'UINT16_LE #1'); 449 | value = await rst.readToken(Token.UINT16_BE); 450 | assert.strictEqual(typeof value, 'number'); 451 | assert.strictEqual(value, 0x1a00, 'UINT16_BE #2'); 452 | value = await rst.readToken(Token.UINT16_LE); 453 | assert.strictEqual(typeof value, 'number'); 454 | assert.strictEqual(value, 0x001a, 'UINT16_BE #3'); 455 | value = await rst.readToken(Token.UINT16_BE); 456 | assert.strictEqual(typeof value, 'number'); 457 | assert.strictEqual(value, 0x1a00, 'UINT16_LE #4'); 458 | 459 | await rst.close(); 460 | }); 461 | 462 | it('should encode unsigned 24-bit little-endian integer (UINT24_LE)', () => { 463 | 464 | const b = new Uint8Array(3); 465 | 466 | Token.UINT24_LE.put(b, 0, 0x00); 467 | assert.deepEqual(b, Uint8Array.from([0x00, 0x000, 0x00])); 468 | 469 | Token.UINT24_LE.put(b, 0, 0xff); 470 | assert.deepEqual(b, Uint8Array.from([0xff, 0x00, 0x00])); 471 | 472 | Token.UINT24_LE.put(b, 0, 0xaabbcc); 473 | assert.deepEqual(b, Uint8Array.from([0xcc, 0xbb, 0xaa])); 474 | }); 475 | 476 | it('should encode unsigned 24-bit big-endian integer (UINT24_BE)', () => { 477 | 478 | const b = new Uint8Array(3); 479 | 480 | Token.UINT24_BE.put(b, 0, 0x00); 481 | assert.deepEqual(b, Uint8Array.from([0x00, 0x00, 0x00])); 482 | 483 | Token.UINT24_BE.put(b, 0, 0xff); 484 | assert.deepEqual(b, Uint8Array.from([0x00, 0x00, 0xff])); 485 | 486 | Token.UINT24_BE.put(b, 0, 0xaabbcc); 487 | assert.deepEqual(b, Uint8Array.from([0xaa, 0xbb, 0xcc])); 488 | }); 489 | 490 | it('should decode signed 24-bit big/little-endian integer (UINT24_LE/INT24_BE)', async () => { 491 | 492 | const rst = await getTokenizerWithData('\x1a\x1a\x00\x1a\x1a\x00\x1a\x1a\x00\x1a\x1a\x00', tokenizerType); 493 | 494 | let value: number = await rst.readToken(Token.UINT24_LE); 495 | assert.strictEqual(typeof value, 'number'); 496 | assert.strictEqual(value, 0x001a1a, 'INT24_LE#1'); 497 | value = await rst.readToken(Token.UINT24_BE); 498 | assert.strictEqual(typeof value, 'number'); 499 | assert.strictEqual(value, 0x1a1a00, 'INT24_BE#2'); 500 | value = await rst.readToken(Token.UINT24_LE); 501 | assert.strictEqual(typeof value, 'number'); 502 | assert.strictEqual(value, 0x001a1a, 'INT24_LE#3'); 503 | value = await rst.readToken(Token.UINT24_BE); 504 | assert.strictEqual(typeof value, 'number'); 505 | assert.strictEqual(value, 0x1a1a00, 'INT24_BE#4'); 506 | 507 | await rst.close(); 508 | }); 509 | 510 | it('should encode unsigned 32-bit little-endian integer (UINT32_LE)', () => { 511 | 512 | const b = new Uint8Array(4); 513 | 514 | Token.UINT32_LE.put(b, 0, 0x00); 515 | assert.deepEqual(b, Uint8Array.from([0x00, 0x00, 0x00, 0x00])); 516 | 517 | Token.UINT32_LE.put(b, 0, 0xff); 518 | assert.deepEqual(b, Uint8Array.from([0xff, 0x00, 0x00, 0x00])); 519 | 520 | Token.UINT32_LE.put(b, 0, 0xaabbccdd); 521 | assert.deepEqual(b, Uint8Array.from([0xdd, 0xcc, 0xbb, 0xaa])); 522 | }); 523 | 524 | it('should encode unsigned 32-bit big-endian integer (INT32_BE)', () => { 525 | 526 | const b = new Uint8Array(4); 527 | 528 | Token.UINT32_BE.put(b, 0, 0x00); 529 | assert.deepEqual(b, Uint8Array.from([0x00, 0x00, 0x00, 0x00])); 530 | 531 | Token.UINT32_BE.put(b, 0, 0xff); 532 | assert.deepEqual(b, Uint8Array.from([0x00, 0x00, 0x00, 0xff])); 533 | 534 | 
Token.UINT32_BE.put(b, 0, 0xaabbccdd); 535 | assert.deepEqual(b, Uint8Array.from([0xaa, 0xbb, 0xcc, 0xdd])); 536 | }); 537 | 538 | it('should decode unsigned 32-bit little/big-endian integer (UINT32_LE/UINT32_BE)', async () => { 539 | 540 | const rst = await getTokenizerWithData('\x1a\x00\x1a\x00\x1a\x00\x1a\x00\x1a\x00\x1a\x00\x1a\x00\x1a\x00', tokenizerType); 541 | 542 | let value: number = await rst.readToken(Token.UINT32_LE); 543 | assert.strictEqual(typeof value, 'number'); 544 | assert.strictEqual(value, 0x001a001a, 'UINT24_LE #1'); 545 | value = await rst.readToken(Token.UINT32_BE); 546 | assert.strictEqual(typeof value, 'number'); 547 | assert.strictEqual(value, 0x1a001a00, 'UINT32_BE #2'); 548 | value = await rst.readToken(Token.UINT32_LE); 549 | assert.strictEqual(typeof value, 'number'); 550 | assert.strictEqual(value, 0x001a001a, 'UINT32_LE #3'); 551 | value = await rst.readToken(Token.UINT32_BE); 552 | assert.strictEqual(typeof value, 'number'); 553 | assert.strictEqual(value, 0x1a001a00, 'UINT32_BE #4'); 554 | 555 | await rst.close(); 556 | }); 557 | 558 | }); 559 | 560 | it('Transparency', async function() { 561 | 562 | this.timeout(5000); 563 | 564 | const size = 10 * 1024; 565 | const buf = new Uint8Array(size); 566 | 567 | for (let i = 0; i < size; ++i) { 568 | buf[i] = i % 255; 569 | } 570 | 571 | const testFile = 'test2.dat'; 572 | const pathTestFile = Path.join(__dirname, 'resources', testFile); 573 | await fs.writeFile(pathTestFile, buf); 574 | 575 | const rst = await tokenizerType.loadTokenizer(testFile); 576 | let expected = 0; 577 | 578 | try { 579 | let v: number; 580 | do { 581 | v = await rst.readNumber(Token.UINT8); 582 | assert.strictEqual(v, expected % 255, `offset=${expected}`); 583 | ++expected; 584 | } while (v > 0); 585 | } catch (err) { 586 | assert.instanceOf(err, EndOfStreamError); 587 | assert.strictEqual(expected, size, 'total number of parsed bytes'); 588 | } 589 | 590 | await rst.close(); 591 | }); 592 | 593 | it('Handle peek token', async () => { 594 | 595 | async function peekOnData(tokenizer: ITokenizer): Promise { 596 | assert.strictEqual(tokenizer.position, 0); 597 | 598 | let value = await tokenizer.peekToken(Token.UINT32_LE); 599 | assert.strictEqual(typeof value, 'number'); 600 | assert.strictEqual(value, 0x001a001a, 'UINT24_LE #1'); 601 | assert.strictEqual(tokenizer.position, 0); 602 | 603 | value = await tokenizer.peekToken(Token.UINT32_LE); 604 | assert.strictEqual(typeof value, 'number'); 605 | assert.strictEqual(value, 0x001a001a, 'UINT24_LE sequential peek #2'); 606 | assert.strictEqual(tokenizer.position, 0); 607 | value = await tokenizer.readToken(Token.UINT32_LE); 608 | 609 | assert.strictEqual(typeof value, 'number'); 610 | assert.strictEqual(value, 0x001a001a, 'UINT24_LE #3'); 611 | assert.strictEqual(tokenizer.position, 4); 612 | value = await tokenizer.readToken(Token.UINT32_BE); 613 | assert.strictEqual(typeof value, 'number'); 614 | assert.strictEqual(value, 0x1a001a00, 'UINT32_BE #4'); 615 | assert.strictEqual(tokenizer.position, 8); 616 | value = await tokenizer.readToken(Token.UINT32_LE); 617 | 618 | assert.strictEqual(typeof value, 'number'); 619 | assert.strictEqual(value, 0x001a001a, 'UINT32_LE #5'); 620 | assert.strictEqual(tokenizer.position, 12); 621 | value = await tokenizer.readToken(Token.UINT32_BE); 622 | 623 | assert.strictEqual(typeof value, 'number'); 624 | assert.strictEqual(value, 0x1a001a00, 'UINT32_BE #6'); 625 | assert.strictEqual(tokenizer.position, 16); 626 | 627 | } 628 | 629 | const rst = await 
tokenizerType.loadTokenizer('test1.dat'); 630 | 631 | if (rst.supportsRandomAccess()) { 632 | assert.strictEqual(rst.fileInfo.size, 16, 'check file size property'); 633 | } 634 | await peekOnData(rst); 635 | await rst.close(); 636 | }); 637 | 638 | it('Overlapping peeks', async () => { 639 | 640 | const rst = await getTokenizerWithData('\x01\x02\x03\x04\x05', tokenizerType); 641 | const peekBuffer = new Uint8Array(3); 642 | const readBuffer = new Uint8Array(1); 643 | 644 | assert.strictEqual(0, rst.position); 645 | let len = await rst.peekBuffer(peekBuffer, {length: 3}); // Peek #1 646 | assert.strictEqual(3, len); 647 | assert.deepEqual(peekBuffer, stringToUint8Array('\x01\x02\x03'), 'Peek #1'); 648 | assert.strictEqual(rst.position, 0); 649 | len = await rst.readBuffer(readBuffer, {length: 1}); // Read #1 650 | assert.strictEqual(len, 1); 651 | assert.strictEqual(rst.position, 1); 652 | assert.deepEqual(readBuffer, stringToUint8Array('\x01'), 'Read #1'); 653 | len = await rst.peekBuffer(peekBuffer, {length: 3}); // Peek #2 654 | assert.strictEqual(len, 3); 655 | assert.strictEqual(rst.position, 1); 656 | assert.deepEqual(peekBuffer, stringToUint8Array('\x02\x03\x04'), 'Peek #2'); 657 | len = await rst.readBuffer(readBuffer, {length: 1}); // Read #2 658 | assert.strictEqual(len, 1); 659 | assert.strictEqual(rst.position, 2); 660 | assert.deepEqual(readBuffer, stringToUint8Array('\x02'), 'Read #2'); 661 | len = await rst.peekBuffer(peekBuffer, {length: 3}); // Peek #3 662 | assert.strictEqual(len, 3); 663 | assert.strictEqual(rst.position, 2); 664 | assert.deepEqual(peekBuffer, stringToUint8Array('\x03\x04\x05'), 'Peek #3'); 665 | len = await rst.readBuffer(readBuffer, {length: 1}); // Read #3 666 | assert.strictEqual(len, 1); 667 | assert.strictEqual(rst.position, 3); 668 | assert.deepEqual(readBuffer, stringToUint8Array('\x03'), 'Read #3'); 669 | len = await rst.peekBuffer(peekBuffer, {length: 2}); // Peek #4 670 | assert.strictEqual(len, 2, '3 bytes requested to peek, only 2 bytes left'); 671 | assert.strictEqual(rst.position, 3); 672 | assert.deepEqual(peekBuffer, stringToUint8Array('\x04\x05\x05'), 'Peek #4'); 673 | len = await rst.readBuffer(readBuffer, {length: 1}); // Read #4 674 | assert.strictEqual(len, 1); 675 | assert.strictEqual(rst.position, 4); 676 | assert.deepEqual(readBuffer, stringToUint8Array('\x04'), 'Read #4'); 677 | 678 | await rst.close(); 679 | }); 680 | 681 | it('should be able to read at position ahead', async () => { 682 | 683 | const rst = await getTokenizerWithData('\x05peter', tokenizerType); 684 | // should decode string from chunk 685 | assert.strictEqual(rst.position, 0); 686 | const value = await rst.readToken(new Token.StringType(5, 'utf-8'), 1); 687 | assert.strictEqual(typeof value, 'string'); 688 | assert.strictEqual(value, 'peter'); 689 | assert.strictEqual(rst.position, 6); 690 | // should should reject at the end of the stream 691 | try { 692 | await rst.readToken(Token.UINT8); 693 | assert.fail('Should reject due to end-of-stream'); 694 | } catch (err) { 695 | assert.instanceOf(err, EndOfStreamError); 696 | } finally { 697 | await rst.close(); 698 | } 699 | }); 700 | 701 | it('should be able to peek at position ahead', async () => { 702 | 703 | const rst = await getTokenizerWithData('\x05peter', tokenizerType); 704 | // should decode string from chunk 705 | assert.strictEqual(rst.position, 0); 706 | const value = await rst.peekToken(new Token.StringType(5, 'latin1'), 1); 707 | assert.strictEqual(typeof value, 'string'); 708 | 
assert.strictEqual(value, 'peter'); 709 | assert.strictEqual(rst.position, 0); 710 | 711 | await rst.close(); 712 | }); 713 | 714 | it('number', async () => { 715 | const tokenizer = await tokenizerType.loadTokenizer('test3.dat'); 716 | assert.isDefined(tokenizer.fileInfo, 'tokenizer.fileInfo'); 717 | // @ts-ignore 718 | await tokenizer.ignore(1); 719 | const x = await tokenizer.peekNumber(Token.INT32_BE); 720 | assert.strictEqual(x, 33752069); 721 | 722 | await tokenizer.close(); 723 | }); 724 | 725 | it('should throw an Error if we reach EOF while peeking a number', async () => { 726 | const tokenizer = await tokenizerType.loadTokenizer('test3.dat'); 727 | if (tokenizerType.hasFileInfo) { 728 | assert.isDefined(tokenizer.fileInfo, 'tokenizer.fileInfo'); 729 | } 730 | // @ts-ignore 731 | await tokenizer.ignore(2); 732 | try { 733 | await tokenizer.peekNumber(Token.INT32_BE); 734 | assert.fail('Should throw Error: End-Of-File'); 735 | } catch (err) { 736 | assert.instanceOf(err, EndOfStreamError); 737 | } 738 | await tokenizer.close(); 739 | }); 740 | 741 | it('should be able to handle multiple ignores', async () => { 742 | const tokenizer = await tokenizerType.loadTokenizer('test1.dat'); 743 | let value = await tokenizer.readToken(Token.UINT32_LE); 744 | assert.strictEqual(typeof value, 'number'); 745 | assert.strictEqual(value, 0x001a001a, 'UINT24_LE #1'); 746 | await tokenizer.ignore(Token.UINT32_BE.len); 747 | await tokenizer.ignore(Token.UINT32_LE.len); 748 | value = await tokenizer.readToken(Token.UINT32_BE); 749 | assert.strictEqual(typeof value, 'number'); 750 | assert.strictEqual(value, 0x1a001a00, 'UINT32_BE #4'); 751 | await tokenizer.close(); 752 | }); 753 | 754 | it('should be able to ignore (skip)', async () => { 755 | 756 | const tokenizer = await tokenizerType.loadTokenizer('test1.dat'); 757 | assert.strictEqual(tokenizer.position, 0); 758 | await tokenizer.ignore(4); 759 | assert.strictEqual(tokenizer.position, 4); 760 | let value = await tokenizer.readToken(Token.UINT32_BE); 761 | assert.strictEqual(typeof value, 'number'); 762 | assert.strictEqual(value, 0x1a001a00, 'UINT32_BE #2'); 763 | value = await tokenizer.readToken(Token.UINT32_LE); 764 | assert.strictEqual(typeof value, 'number'); 765 | assert.strictEqual(value, 0x001a001a, 'UINT32_LE #3'); 766 | value = await tokenizer.readToken(Token.UINT32_BE); 767 | assert.strictEqual(typeof value, 'number'); 768 | assert.strictEqual(value, 0x1a001a00, 'UINT32_BE #4'); 769 | await tokenizer.close(); 770 | }); 771 | 772 | describe('End-Of-File exception behaviour', () => { 773 | 774 | it('should not throw an Error if we read exactly until the end of the file', async () => { 775 | 776 | const rst = await getTokenizerWithData('\x89\x54\x40', tokenizerType); 777 | const num = await rst.readToken(Token.UINT24_BE); 778 | assert.strictEqual(num, 9000000); 779 | await rst.close(); 780 | }); 781 | 782 | it('readBuffer()', async () => { 783 | 784 | const testFile = 'test1.dat'; 785 | 786 | const stat = await fs.stat(getResourcePath(testFile)); 787 | const tokenizer = await tokenizerType.loadTokenizer(testFile); 788 | const buf = new Uint8Array(stat.size); 789 | const bytesRead = await tokenizer.readBuffer(buf); 790 | assert.ok(typeof bytesRead === 'number', 'readBuffer promise should provide a number'); 791 | assert.strictEqual(stat.size, bytesRead); 792 | try { 793 | await tokenizer.readBuffer(buf); 794 | assert.fail('Should throw EOF'); 795 | } catch (err) { 796 | assert.instanceOf(err, EndOfStreamError); 797 | } finally { 798 | 
await tokenizer.close(); 799 | } 800 | }); 801 | 802 | it('should handle zero byte read', async () => { 803 | 804 | const rst = await getTokenizerWithData('\x00\x00\x00', tokenizerType); 805 | const uint8Array = await rst.readToken(new Token.Uint8ArrayType(0)); 806 | assert.strictEqual(uint8Array.length, 0); 807 | await rst.close(); 808 | }); 809 | 810 | it('should not throw an Error if we read exactly until the end of the file', async () => { 811 | 812 | const rst = await getTokenizerWithData('\x89\x54\x40', tokenizerType); 813 | const num = await rst.readToken(Token.UINT24_BE); 814 | assert.strictEqual(num, 9000000); 815 | await rst.close(); 816 | }); 817 | 818 | it('should be thrown if a token EOF reached in the middle of a token', async () => { 819 | 820 | const rst = await getTokenizerWithData('\x89\x54\x40', tokenizerType); 821 | try { 822 | await rst.readToken(Token.INT32_BE); 823 | assert.fail('It should throw EndOfFile Error'); 824 | } catch (err) { 825 | assert.instanceOf(err, EndOfStreamError); 826 | } finally { 827 | await rst.close(); 828 | } 829 | }); 830 | 831 | it('should throw an EOF if we read to buffer', async () => { 832 | const buffer = new Uint8Array(4); 833 | 834 | const rst = await getTokenizerWithData('\x89\x54\x40', tokenizerType); 835 | try { 836 | await rst.readBuffer(buffer); 837 | assert.fail('It should throw EndOfFile Error'); 838 | } catch (err) { 839 | assert.instanceOf(err, EndOfStreamError); 840 | } finally { 841 | await rst.close(); 842 | } 843 | }); 844 | 845 | it('should throw an EOF if we peek to buffer', async () => { 846 | 847 | const buffer = new Uint8Array(4); 848 | const rst = await getTokenizerWithData('\x89\x54\x40', tokenizerType); 849 | try { 850 | await rst.peekBuffer(buffer); 851 | assert.fail('It should throw EndOfFile Error'); 852 | } catch (err) { 853 | assert.instanceOf(err, EndOfStreamError); 854 | } finally { 855 | await rst.close(); 856 | } 857 | }); 858 | 859 | }); 860 | 861 | it('should be able to read from a file', async () => { 862 | 863 | const tokenizer = await tokenizerType.loadTokenizer('test1.dat'); 864 | if (tokenizerType.hasFileInfo) { 865 | assert.strictEqual(tokenizer.fileInfo.size, 16, 'check file size property'); 866 | } 867 | let value = await tokenizer.readToken(Token.UINT32_LE); 868 | assert.strictEqual(typeof value, 'number'); 869 | assert.strictEqual(value, 0x001a001a, 'UINT24_LE #1'); 870 | value = await tokenizer.readToken(Token.UINT32_BE); 871 | assert.strictEqual(typeof value, 'number'); 872 | assert.strictEqual(value, 0x1a001a00, 'UINT32_BE #2'); 873 | value = await tokenizer.readToken(Token.UINT32_LE); 874 | assert.strictEqual(typeof value, 'number'); 875 | assert.strictEqual(value, 0x001a001a, 'UINT32_LE #3'); 876 | value = await tokenizer.readToken(Token.UINT32_BE); 877 | assert.strictEqual(typeof value, 'number'); 878 | assert.strictEqual(value, 0x1a001a00, 'UINT32_BE #4'); 879 | await tokenizer.close(); 880 | }); 881 | 882 | it('should be able to parse the IgnoreType-token', async () => { 883 | const tokenizer = await tokenizerType.loadTokenizer('test1.dat'); 884 | await tokenizer.readToken(new Token.IgnoreType(4)); 885 | let value = await tokenizer.readToken(Token.UINT32_BE); 886 | assert.strictEqual(typeof value, 'number'); 887 | assert.strictEqual(value, 0x1a001a00, 'UINT32_BE #2'); 888 | value = await tokenizer.readToken(Token.UINT32_LE); 889 | assert.strictEqual(typeof value, 'number'); 890 | assert.strictEqual(value, 0x001a001a, 'UINT32_LE #3'); 891 | value = await 
tokenizer.readToken(Token.UINT32_BE); 892 | assert.strictEqual(typeof value, 'number'); 893 | assert.strictEqual(value, 0x1a001a00, 'UINT32_BE #4'); 894 | await tokenizer.close(); 895 | }); 896 | 897 | it('should be able to read 0 bytes from a file', async () => { 898 | const bufZero = new Uint8Array(0); 899 | const tokenizer = await tokenizerType.loadTokenizer('test1.dat'); 900 | try { 901 | await tokenizer.readBuffer(bufZero); 902 | } finally { 903 | await tokenizer.close(); 904 | } 905 | }); 906 | 907 | if (tokenizerType.abortable) { 908 | 909 | describe('Abort delayed read', () => { 910 | 911 | it('without aborting', async () => { 912 | const fileReadStream = await getTokenizerWithData('123', tokenizerType, 500); 913 | try { 914 | const promise = fileReadStream.readToken(new Token.StringType(3, 'utf-8'), 0); 915 | assert.strictEqual(await promise, '123'); 916 | } finally { 917 | await fileReadStream.close(); 918 | } 919 | }); 920 | 921 | it('abort async operation using `abort()`', async function() { 922 | if (process.versions.bun) { 923 | this.skip(); // Fails with Bun 1.2 924 | } 925 | const fileReadStream = await getTokenizerWithData('123', tokenizerType, 500); 926 | try { 927 | const promise = fileReadStream.readToken(new Token.StringType(3, 'utf-8'), 0); 928 | await fileReadStream.abort(); 929 | await expect(promise).to.be.rejectedWith(Error); 930 | } finally { 931 | await fileReadStream.close(); 932 | } 933 | }); 934 | 935 | it('abort async operation using `close()`', async function() { 936 | if (process.versions.bun) { 937 | this.skip(); // Fails with Bun 1.2 938 | } 939 | const fileReadStream = await getTokenizerWithData('123', tokenizerType, 500); 940 | const promise = fileReadStream.readToken(new Token.StringType(3, 'utf-8'), 0); 941 | await fileReadStream.close(); 942 | await expect(promise).to.be.rejectedWith(Error); 943 | }); 944 | 945 | it('abort async operation using `AbortController`', async function() { 946 | 947 | if (process.versions.bun) { 948 | this.skip(); // Fails with Bun 1.2 949 | } 950 | 951 | const abortController = new AbortController(); 952 | const fileReadStream = await getTokenizerWithData('123', tokenizerType, 500, abortController.signal); 953 | try { 954 | const promise = fileReadStream.readToken(new Token.StringType(3, 'utf-8'), 0); 955 | abortController.abort(); 956 | await expect(promise).to.be.rejectedWith(Error); 957 | } finally { 958 | await fileReadStream.close(); 959 | } 960 | }); 961 | 962 | }); 963 | } 964 | 965 | }); // End of test "Tokenizer-types" 966 | }); 967 | 968 | describe('Random-read-access', async () => { 969 | 970 | tokenizerTests 971 | .filter(tokenizerType => tokenizerType.randomRead) 972 | .forEach(tokenizerType => { 973 | describe(tokenizerType.name, () => { 974 | 975 | it('Read ID3v1 header at the end of the file', async () => { 976 | const tokenizer = await tokenizerType.loadTokenizer('id3v1.mp3') as IRandomAccessTokenizer; 977 | try { 978 | assert.isTrue(tokenizer.supportsRandomAccess(), 'Tokenizer should support random reads'); 979 | const id3HeaderSize = 128; 980 | const id3Header = new Uint8Array(id3HeaderSize); 981 | await tokenizer.readBuffer(id3Header, {position: tokenizer.fileInfo.size - id3HeaderSize}); 982 | const id3Tag = new TextDecoder('utf-8').decode(id3Header.subarray(0, 3)); 983 | assert.strictEqual(id3Tag, 'TAG'); 984 | assert.strictEqual(tokenizer.position, tokenizer.fileInfo.size, 'Tokenizer position should be at the end of the file'); 985 | tokenizer.setPosition(0); 986 | 
assert.strictEqual(tokenizer.position, 0, 'Tokenizer position should be at the beginning of the file'); 987 | } finally { 988 | await tokenizer.close(); 989 | } 990 | }); 991 | 992 | it('Be able to random read from position 0', async () => { 993 | const tokenizer = await fromFile(getResourcePath('id3v1.mp3')); 994 | try { 995 | // Advance tokenizer.position 996 | await tokenizer.ignore(20); 997 | const mpegSync = new Uint8Array(2); 998 | await tokenizer.readBuffer(mpegSync, {position: 0}); 999 | assert.strictEqual(mpegSync[0], 255, 'First sync byte'); 1000 | assert.strictEqual(mpegSync[1], 251, 'Second sync byte'); 1001 | } finally { 1002 | await tokenizer.close(); 1003 | } 1004 | 1005 | }); 1006 | }); 1007 | }); 1008 | 1009 | }); 1010 | }); 1011 | 1012 | describe('fromStream with mayBeLess flag', () => { 1013 | 1014 | it('mayBeLess=true', async () => { 1015 | // Initialize empty stream 1016 | const stream = new PassThrough(); 1017 | const tokenizer = await fromStream(stream); 1018 | try { 1019 | stream.end(); 1020 | 1021 | // Try to read 5 bytes from empty stream, with mayBeLess flag enabled 1022 | const buffer = new Uint8Array(5); 1023 | const bytesRead = await tokenizer.peekBuffer(buffer, {mayBeLess: true}); 1024 | assert.strictEqual(bytesRead, 0); 1025 | } finally { 1026 | await tokenizer.close(); 1027 | } 1028 | }); 1029 | 1030 | it('mayBeLess=false', async () => { 1031 | // Initialize empty stream 1032 | const stream = new PassThrough(); 1033 | const tokenizer = await fromStream(stream); 1034 | try { 1035 | stream.end(); 1036 | 1037 | // Try to read 5 bytes from empty stream, with mayBeLess flag enabled 1038 | const buffer = new Uint8Array(5); 1039 | await tokenizer.peekBuffer(buffer, {mayBeLess: false}); 1040 | } catch (err) { 1041 | if (err instanceof Error) { 1042 | assert.strictEqual(err.message, 'End-Of-Stream'); 1043 | } else { 1044 | assert.fail('Expected: err instanceof Error'); 1045 | } 1046 | return; 1047 | } finally { 1048 | if (tokenizer) { 1049 | await tokenizer.close(); 1050 | } 1051 | } 1052 | assert.fail('Should throw End-Of-Stream error'); 1053 | }); 1054 | 1055 | }); 1056 | 1057 | it('should determine the file size using a file stream', async () => { 1058 | const stream = createReadStream(Path.join(__dirname, 'resources', 'test1.dat')); 1059 | const tokenizer = await fromStream(stream); 1060 | try { 1061 | assert.isDefined(tokenizer.fileInfo, '`fileInfo` should be defined'); 1062 | assert.strictEqual(tokenizer.fileInfo.size, 16, 'fileInfo.size'); 1063 | } finally { 1064 | await tokenizer.close(); 1065 | } 1066 | }); 1067 | 1068 | it('should release stream after close', async () => { 1069 | 1070 | const fileStream = makeByteReadableStreamFromFile(Path.join(__dirname, 'resources', 'test1.dat'), 0); 1071 | assert.isFalse(fileStream.locked, 'stream is unlocked before initializing tokenizer'); 1072 | const webStreamTokenizer = fromWebStream(fileStream); 1073 | assert.isTrue(fileStream.locked, 'stream is locked after initializing tokenizer'); 1074 | await webStreamTokenizer.close(); 1075 | assert.isFalse(fileStream.locked, 'stream is unlocked after closing tokenizer'); 1076 | }); 1077 | -------------------------------------------------------------------------------- /test/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": "../tsconfig.json" 3 | } 4 | -------------------------------------------------------------------------------- /test/util.ts: 
-------------------------------------------------------------------------------- 1 | import { createReadStream } from 'node:fs'; 2 | import { Transform, Readable } from 'node:stream'; 3 | import { makeByteReadableStreamFromNodeReadable } from 'node-readable-to-web-readable-stream'; 4 | 5 | export function makeByteReadableStreamFromFile(filename: string, delay = 0): ReadableStream { 6 | 7 | // Create a Node.js Readable stream 8 | const nodeReadable = createReadStream(filename); 9 | 10 | // Create a Transform stream to introduce delay 11 | const delayTransform = new Transform({ 12 | transform(chunk, encoding, callback) { 13 | setTimeout(() => callback(null, chunk), delay); 14 | } 15 | }); 16 | 17 | // Pipe through the delay transform 18 | const delayedNodeStream = nodeReadable.pipe(delayTransform); 19 | 20 | return makeByteReadableStreamFromNodeReadable(delayedNodeStream); 21 | } 22 | 23 | export class DelayedStream extends Readable { 24 | 25 | private buffer: (Uint8Array | null)[]; 26 | private isReading: boolean; 27 | private path: string | undefined; 28 | 29 | constructor(private sourceStream: Readable, private delay = 0) { 30 | super(); 31 | this.path = (sourceStream as unknown as {path: string}).path; 32 | this.buffer = []; 33 | this.isReading = false; 34 | 35 | this.sourceStream.on('data', (chunk) => { 36 | this.buffer.push(chunk); 37 | this.emitDelayed(); 38 | }); 39 | 40 | this.sourceStream.on('end', () => { 41 | this.buffer.push(null); // Signal the end of the stream 42 | this.emitDelayed(); 43 | }); 44 | } 45 | 46 | _read() { 47 | if (!this.isReading && this.buffer.length > 0) { 48 | this.emitDelayed(); 49 | } 50 | } 51 | 52 | emitDelayed() { 53 | if (this.isReading) return; 54 | 55 | if (this.buffer.length > 0) { 56 | this.isReading = true; 57 | const chunk = this.buffer.shift(); 58 | 59 | setTimeout(() => { 60 | this.push(chunk); 61 | this.isReading = false; 62 | 63 | if (this.buffer.length > 0) { 64 | this.emitDelayed(); 65 | } 66 | }, this.delay); 67 | } 68 | } 69 | } 70 | 71 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "inlineSources": false, 4 | "module": "node16", 5 | "moduleResolution": "node16", 6 | "target": "ES2020", 7 | "esModuleInterop": true, 8 | "strict": true, 9 | "verbatimModuleSyntax": true 10 | } 11 | } 12 | 13 | --------------------------------------------------------------------------------
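To round off the test utilities above, here is a minimal sketch of how the `DelayedStream` and `makeByteReadableStreamFromFile` helpers from `test/util.ts` are typically wired into the `fromStream` and `fromWebStream` factories exercised in `test/test.ts`. It is illustrative only and not part of the repository; the import path `'strtok3'` and the relative fixture path `'resources/test1.dat'` are assumptions for the sake of the example.

```ts
// Illustrative sketch only (not part of the repository).
// Assumes the published 'strtok3' package; 'resources/test1.dat' mirrors the fixture used in the tests above.
import { createReadStream } from 'node:fs';
import { fromStream, fromWebStream } from 'strtok3';
import * as Token from 'token-types';
import { DelayedStream, makeByteReadableStreamFromFile } from './util.js';

async function demo(): Promise<void> {
  // Node.js Readable whose chunks are emitted with a 100 ms delay.
  const delayed = new DelayedStream(createReadStream('resources/test1.dat'), 100);
  const nodeTokenizer = await fromStream(delayed);
  console.log(await nodeTokenizer.readToken(Token.UINT32_LE));
  await nodeTokenizer.close();

  // WHATWG byte stream wrapped around the same file, consumed via fromWebStream().
  const webStream = makeByteReadableStreamFromFile('resources/test1.dat', 100);
  const webTokenizer = fromWebStream(webStream);
  console.log(await webTokenizer.readToken(Token.UINT32_BE));
  await webTokenizer.close();
}

demo().catch(console.error);
```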