├── test ├── words.txt ├── .gitignore ├── test-sax.ts ├── tsconfig.json ├── test-expat.ts ├── node-expat.d.ts └── test.ts ├── autogypi.json ├── .gitignore ├── src ├── tokenizer │ ├── ErrorType.ts │ ├── TokenSpace.ts │ ├── CodeType.ts │ ├── TokenSet.ts │ └── Patricia.ts ├── schema │ ├── AttributeGroup.ts │ ├── Attribute.ts │ ├── Group.ts │ ├── Member.ts │ ├── ComplexType.ts │ ├── Element.ts │ └── SimpleSchema.ts ├── parser │ ├── ParserLib.ts │ ├── TokenChunk.ts │ ├── ParserStream.ts │ ├── Stitcher.ts │ ├── InternalToken.ts │ ├── ParserNamespace.ts │ ├── Lib.d.ts │ ├── JSX.ts │ ├── Token.ts │ ├── ParserConfig.ts │ └── Parser.ts ├── tsconfig.json ├── index.ts ├── builder │ ├── BuilderConfig.ts │ ├── RuleSet.ts │ └── Builder.ts ├── Namespace.ts ├── CRC32.ts ├── Buffer.ts └── writer │ ├── JsonWriter.ts │ └── Writer.ts ├── .npmignore ├── lib ├── Namespace.cc ├── Patricia.cc ├── Namespace.h ├── Patricia.h ├── ParserConfig.cc ├── PatriciaCursor.h ├── ParserConfig.h ├── README.md ├── PatriciaCursor.cc ├── Parser.h └── Parser.cc ├── binding.gyp ├── .travis.yml ├── appveyor.yml ├── README.md ├── LICENSE └── package.json /test/words.txt: -------------------------------------------------------------------------------- 1 | foobar 2 | foo 3 | -------------------------------------------------------------------------------- /test/.gitignore: -------------------------------------------------------------------------------- 1 | *.js 2 | *.d.ts 3 | -------------------------------------------------------------------------------- /autogypi.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": [ 3 | "nbind" 4 | ], 5 | "includes": [] 6 | } 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | build/ 3 | dist/ 4 | package-lock.json 5 | auto*.gypi 6 | *.lock 7 | *.log.* 8 | *.log 9 | *.tgz 
/** Collects the attribute specs that belong together as one group. */
export class AttributeGroup {

	/** Allowed attributes and attribute groups, in the order they were added. */
	list: AttributeSpec[] = [];

	/** Append one more attribute spec to the end of the list. */
	addAttribute(spec: AttributeSpec): void {
		const { list } = this;

		list.push(spec);
	}

}
-------------------------------------------------------------------------------- /test/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compileOnSave": true, 3 | "compilerOptions": { 4 | "declaration": false, 5 | "lib": ["es5", "es2015.collection"], 6 | "module": "commonjs", 7 | "moduleResolution": "node", 8 | "noImplicitAny": true, 9 | "noImplicitThis": true, 10 | "removeComments": false, 11 | "sourceMap": false, 12 | "strictNullChecks": true, 13 | "target": "es5" 14 | }, 15 | "files": [ 16 | "test.ts" 17 | ] 18 | } 19 | -------------------------------------------------------------------------------- /lib/Patricia.cc: -------------------------------------------------------------------------------- 1 | #include "Patricia.h" 2 | #include "PatriciaCursor.h" 3 | 4 | uint32_t Patricia :: find(const char *needle) { 5 | PatriciaCursor cursor; 6 | char c; 7 | 8 | cursor.init(*this); 9 | while((c = *needle++)) cursor.advance(c); 10 | 11 | return(cursor.getData()); 12 | } 13 | 14 | #include 15 | 16 | #ifdef NBIND_CLASS 17 | 18 | NBIND_CLASS(Patricia) { 19 | construct<>(); 20 | method(setBuffer); 21 | method(find); 22 | } 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /src/parser/ParserLib.ts: -------------------------------------------------------------------------------- 1 | import * as path from 'path'; 2 | import * as nbind from 'nbind'; 3 | import * as Lib from './Lib'; 4 | 5 | export const lib = nbind.init(path.resolve(__dirname, '../..')).lib; 6 | 7 | export const NativeParser = lib.Parser; 8 | export type NativeParser = Lib.Parser; 9 | 10 | export const NativeNamespace = lib.Namespace; 11 | export type NativeNamespace = Lib.Namespace; 12 | 13 | export const NativeConfig = lib.ParserConfig; 14 | export type NativeConfig = Lib.ParserConfig; 15 | -------------------------------------------------------------------------------- /test/test-expat.ts: 
-------------------------------------------------------------------------------- 1 | import * as fs from 'fs'; 2 | import * as stream from 'stream'; 3 | import * as expat from 'node-expat'; 4 | 5 | const xml = new expat.Parser(null); 6 | 7 | xml.on('startElement', (name: string, attributeTbl: {[name: string]: string}) => { 8 | // console.log(name); 9 | // console.log(attributeTbl); 10 | }); 11 | 12 | const file = fs.createReadStream(process.argv[2]); 13 | 14 | file.on('data', (data: Buffer) => xml.parse(data, false)); 15 | 16 | file.on('end', () => xml.parse('', true)); 17 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | version: 0.0.{build} 2 | skip_tags: true 3 | os: Visual Studio 2015 4 | shallow_clone: true 5 | init: 6 | - ps: Install-Product node $env:nodejs_version 7 | environment: 8 | matrix: 9 | - nodejs_version: "9" 10 | - nodejs_version: "8" 11 | - nodejs_version: "4" 12 | - nodejs_version: "0.12" 13 | matrix: 14 | allow_failures: 15 | - nodejs_version: "0.12" 16 | install: 17 | - set PATH=%APPDATA%\npm;%PATH% 18 | - npm install 19 | build: off 20 | test_script: 21 | - node --version 22 | - npm --version 23 | - npm test 24 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | cxml 2 | ==== 3 | 4 | Build status 5 | Build status 6 | 7 | This branch is a complete rewrite in progress. 
/** Holds the element specs (and nested groups) allowed by one group of a given kind. */
export class Group {

	/** List of allowed elements and groups, in insertion order. */
	list: (SimpleElementSpec | ElementSpec)[] = [];

	/** Specs that carry metadata, indexed by the numeric ID of their token. */
	tbl: { [id: number]: SimpleElementSpec | ElementSpec } = {};

	constructor(public kind: GroupKind) {}

	/** Register a spec. It is always appended to the list, and additionally
	  * indexed in the table when it has metadata. */
	addElement(spec: SimpleElementSpec | ElementSpec) {
		const meta = spec.meta;

		this.list.push(spec);

		// Specs without metadata have no token to index them by.
		if(!meta) return;

		this.tbl[meta.token.id!] = spec;
	}

}
/** Token buffer wrapper, recycled through a static singly linked free list. */
export class TokenChunk {

	/** Head of the free list of chunks released with free(). */
	private static first: TokenChunk | undefined;

	length: number;
	buffer: TokenBuffer;
	/** Next chunk in the free list while this chunk is pooled. */
	next: TokenChunk | undefined;
	namespaceList: (Namespace | undefined)[] | undefined;

	/** Get a chunk wrapping the given buffer, reusing a released chunk when
	  * one is available and constructing a new one otherwise. */
	static allocate(buffer: TokenBuffer = []) {
		const recycled = TokenChunk.first;
		const chunk = recycled || new TokenChunk();

		// Pop the reused chunk off the free list.
		if(recycled) TokenChunk.first = recycled.next;

		chunk.buffer = buffer;
		chunk.length = buffer.length;
		chunk.namespaceList = void 0;
		// Drop the free list pointer so an allocated chunk never keeps other
		// pooled chunks reachable, even if free() is never called on it.
		chunk.next = void 0;

		return(chunk);
	}

	/** Release this chunk by pushing it onto the front of the free list. */
	free() {
		const previousHead = TokenChunk.first;

		TokenChunk.first = this;
		this.next = previousHead;
	}

}
/** Shared configuration from which builders for individual namespaces are created. */
export class BuilderConfig {

	/** Parser options, taken from the parser configuration passed to the constructor. */
	options: ParserOptions;

	/** Rule sets indexed by namespace URI. Only schema entries whose spec
	  * has a `document` member get a rule set. */
	ruleSetTbl: { [uri: string]: RuleSet } = {};

	/** @param parserConfig Source of the parser options, also handed to every schema.
	  * @param schemaSpec Table whose entries unpack to
	  * [ default prefix, namespace URI, spec ]. */
	constructor(parserConfig: ParserConfig, schemaSpec: SimpleSchemaSpecTbl) {
		this.options = parserConfig.options;

		Object.keys(schemaSpec).forEach((prefix: string) => {
			const [ defaultPrefix, nsUri, spec ] = schemaSpec[prefix];
			// The namespace object is constructed for every entry,
			// whether or not a rule set ends up using it.
			const ns = new Namespace(defaultPrefix, nsUri);

			if(!spec['document']) return;

			const schema = new SimpleSchema(parserConfig, ns, spec);

			this.ruleSetTbl[nsUri] = new RuleSet(schema);
		});
	}

	/** Create a builder bound to this configuration and the given namespace URI. */
	createBuilder(nsUri: string) {
		const builder = new Builder(this, nsUri);

		return(builder);
	}

}
*/ 15 | 16 | export class ParserStream extends stream.Transform { 17 | 18 | constructor(config: ParserConfig, public parser = config.createParser()) { 19 | super({ objectMode: true }); 20 | } 21 | 22 | _flush( flush: (err: any, chunk: TokenChunk | null) => void) { 23 | this.parser.destroy(flush); 24 | flush(null, null); 25 | } 26 | 27 | _transform( 28 | chunk: string | ArrayType, 29 | enc: string, 30 | flush: (err: any, chunk: TokenChunk | null) => void 31 | ) { 32 | this.parser.write(chunk, enc, flush); 33 | } 34 | 35 | } 36 | -------------------------------------------------------------------------------- /src/schema/Member.ts: -------------------------------------------------------------------------------- 1 | import { Namespace } from '../Namespace'; 2 | import { MemberToken } from '../parser/Token'; 3 | 4 | import { ComplexType } from './ComplexType'; 5 | 6 | /** SimpleType equivalent JavaScript data types. */ 7 | 8 | export type SimpleValue = string | number | boolean; 9 | 10 | /** Configuration for elements and attributes as type members. */ 11 | 12 | export class MemberSpec { 13 | 14 | constructor( 15 | public min = 1, 16 | public max = 1 17 | ) {} 18 | 19 | meta?: MemberMeta; 20 | 21 | } 22 | 23 | /** Definition of a type with only text content. 24 | * Applicable to both elements and attributes. */ 25 | 26 | export class SimpleType { 27 | 28 | base?: SimpleType; 29 | 30 | } 31 | 32 | export class MemberMeta { 33 | 34 | /** @param token Token with element or attribute name and namespace. 35 | * A single token may have different types depending on its parent. 
*/ 36 | constructor(public token: MemberToken) {} 37 | 38 | exists = true; 39 | 40 | type: SimpleType | ComplexType; 41 | 42 | } 43 | -------------------------------------------------------------------------------- /test/node-expat.d.ts: -------------------------------------------------------------------------------- 1 | declare module "node-expat" { 2 | import * as events from "events"; 3 | 4 | export class Parser extends events.EventEmitter { 5 | constructor(encoding: string | null); 6 | 7 | parse(data: string | Buffer, isFinal: boolean): boolean; 8 | 9 | setEncoding(encoding: string): boolean; 10 | // setUnknownEncoding() TODO 11 | 12 | // getError() TODO 13 | 14 | stop(): boolean; 15 | // Same return value as stop(). 16 | pause(): boolean; 17 | resume(): boolean; 18 | 19 | destroy(): void; 20 | destroySoon(): void; 21 | 22 | // Same data argument and return value as parse() but emits errors and isFinal is false. 23 | write(data: string | Buffer): boolean; 24 | // Same data argument and return value as parse() but emits errors and isFinal is true. 
/** An XML namespace together with the names registered for it. */
export class Namespace {

	/** Element names registered through addElement, in insertion order. */
	elementNameList: string[] = [];
	/** Attribute names registered through addAttribute, in insertion order. */
	attributeNameList: string[] = [];
	/** Schema location URLs registered through addLocation, in insertion order. */
	schemaLocationList: string[] = [];

	/** Source of automatically assigned IDs, incremented every time
	  * a namespace is constructed without an explicit ID. */
	static idLast = 0;

	// The predefined namespaces below must come after idLast,
	// because constructing them may read and increment it.
	static unknown = new Namespace('', '', 0, true);
	static processing = new Namespace('?', '?', 0, true);
	static xml1998 = new Namespace('xml', 'http://www.w3.org/XML/1998/namespace');

	constructor(
		/** Prefix used by default for xmlns declarations when serializing to XML. */
		public defaultPrefix: string,
		/** Identifier that makes the namespace unique, should be a valid URI. */
		public uri: string,
		/** Numeric ID, for faster mapping of namespaces to local prefixes. */
		public id = Namespace.idLast++,
		/** Special namespaces represent processing instructions (always defined). */
		public isSpecial = false
	) {}

	/** Register an element name in this namespace. */
	addElement(name: string) {
		this.elementNameList.push(name);
	}

	/** Register an attribute name in this namespace. */
	addAttribute(name: string) {
		this.attributeNameList.push(name);
	}

	/** Register a schema location URL for this namespace. */
	addLocation(url: string) {
		this.schemaLocationList.push(url);
	}

}

// The processing instruction namespace always knows the "xml" name.
Namespace.processing.addElement('xml');
/** Hands out IDs that tell apart tokens of the same kind, and keeps the
  * tokens in a list indexed by those IDs. */
export class TokenSpace {

	/** True while the token list may be shared with another object,
	  * in which case it gets copied before the next write. */
	private isLinked: boolean;
	/** Most recently allocated ID. */
	private idLast: number;

	/** Tokens indexed by ID. */
	list: InternalToken[];

	/** @param kind Kind given to every token created here.
	  * @param parent Optional space to continue from: its ID counter is
	  * copied and its token list is shared until this object writes to it. */
	constructor(private kind: TokenKind, parent?: TokenSpace) {
		this.isLinked = !!parent;
		this.idLast = parent ? parent.idLast : 0;
		this.list = parent ? parent.list : [];
	}

	/** Flag the token list as shared, so the next createToken call
	  * works on a copy of it. */
	link() {
		this.isLinked = true;
	}

	/** Replace a shared token list with a private copy. */
	private unlink() {
		if(this.isLinked) {
			this.isLinked = false;
			this.list = this.list.slice(0);
		}
	}

	/** Create a token with the next free ID and store it in the list. */
	createToken(name: string, ns?: ParserNamespace) {
		this.unlink();

		const id = ++this.idLast;
		const token = new InternalToken(id, this.kind, name, ns);

		this.list[token.id] = token;

		return(token);
	}

}
/** Incremental 32-bit hash. Every append() continues from the data hashed so far. */
export interface Hasher32 {
	/** Feed more data into the hash.
	  * @return Checksum of all data appended so far, as an unsigned 32-bit integer. */
	append(data: string | ArrayType): number;
}

/** Table-driven CRC state, created through CRC32.create(). */
class Hasher implements Hasher32 {
	/** @param tbl 256-entry lookup table built by CRC32 for its polynomial. */
	constructor(private tbl: number[]) {}

	append(data: string | ArrayType) {
		const tbl = this.tbl;
		// Strings are converted to bytes with encodeArray before hashing.
		const bytes = typeof data === 'string' ? encodeArray(data) : data;
		const len = bytes.length;
		let crc = this.crc;

		for(let pos = 0; pos < len; ++pos) {
			crc = (crc >>> 8) ^ tbl[(crc & 0xff) ^ bytes[pos]];
		}

		// Store the raw register so later calls continue the same checksum.
		this.crc = crc;

		// Final inversion, coerced to an unsigned value.
		return((crc ^ 0xffffffff) >>> 0);
	}

	/** Running CRC register before the final inversion, starts as all ones. */
	crc = 0xffffffff;
}

/** 32-bit Cyclic Redundancy Check. */

export class CRC32 {
	/** Precompute the lookup table for one byte of input.
	  * @param poly Reversed generator polynomial, default edb88320 (Ethernet, GZIP, PNG).
	  * Other good choices are 82f63b78 (Castagnoli) used in Btrfs and eb31d82e (Koopman). */

	constructor(public poly = 0xedb88320) {
		for(let n = 0; n < 256; ++n) {
			let crc = n;

			// Process the 8 bits of the byte, lowest bit first.
			for(let bit = 0; bit < 8; ++bit) {
				// -(crc & 1) is all ones when the low bit is set, so the
				// polynomial gets XORed in only in that case.
				crc = ((crc >>> 1) ^ (-(crc & 1) & poly)) >>> 0;
			}

			this.tbl[n] = crc;
		}
	}

	/** Start a new, independent checksum that uses this object's table. */
	create(): Hasher32 {
		return(new Hasher(this.tbl));
	}

	/** Lookup table with one entry per possible byte value. */
	tbl: number[] = [];
}
20 | namespaceList.push_back(nullptr); 21 | } 22 | 23 | bool ParserConfig :: addUri(uint32_t uri, uint32_t ns) { 24 | if(ns < namespaceList.size()) { 25 | if(uri >= namespaceByUriToken.size()) { 26 | namespaceByUriToken.resize(uri + 1); 27 | } 28 | 29 | namespaceByUriToken[uri] = std::make_pair(ns, namespaceList[ns].get()); 30 | 31 | return(true); 32 | } 33 | 34 | return(false); 35 | } 36 | 37 | #include 38 | 39 | #ifdef NBIND_CLASS 40 | 41 | NBIND_CLASS(ParserConfig) { 42 | construct(); 43 | 44 | method(addNamespace); 45 | method(addUri); 46 | method(bindPrefix); 47 | 48 | method(setUriTrie); 49 | method(setPrefixTrie); 50 | } 51 | 52 | #endif 53 | -------------------------------------------------------------------------------- /src/schema/ComplexType.ts: -------------------------------------------------------------------------------- 1 | import { AttributeSpec } from './Attribute'; 2 | import { AttributeGroup } from './AttributeGroup'; 3 | import { SimpleElementSpec, ElementSpec, ElementBase } from './Element'; 4 | import { Group, GroupKind } from './Group'; 5 | 6 | export interface ElementTypeConstructor { 7 | new(): ElementClass; 8 | }; 9 | 10 | /** Definition of a type containing other elements and attributes. Only applicable to elements. */ 11 | 12 | export class ComplexType { 13 | 14 | addAttribute(spec: AttributeSpec) { 15 | if(!this.attributes) this.attributes = new AttributeGroup(); 16 | this.attributes.addAttribute(spec); 17 | } 18 | 19 | addAll(spec: SimpleElementSpec | ElementSpec) { 20 | if(!this.elements) { 21 | this.elements = new ElementSpec(); 22 | this.elements.group = new Group(GroupKind.all); 23 | } 24 | 25 | this.elements.group!.addElement(spec); 26 | } 27 | 28 | createProto() { 29 | if(!this.XMLType) { 30 | const BaseType = this.base ? 
this.base.createProto() : ElementBase; 31 | this.XMLType = class XMLType extends BaseType {}; 32 | } 33 | 34 | return(this.XMLType as ElementTypeConstructor); 35 | } 36 | 37 | base?: ComplexType; 38 | 39 | XMLType: ElementTypeConstructor; 40 | 41 | attributes?: AttributeGroup; 42 | elements?: ElementSpec; 43 | 44 | } 45 | -------------------------------------------------------------------------------- /lib/PatriciaCursor.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include "Patricia.h" 6 | 7 | /** Cursor for finding a string in the trie, in steps of one character. */ 8 | class PatriciaCursor { 9 | 10 | public: 11 | 12 | /** Start scanning a trie from the first input character. */ 13 | void init(const Patricia &trie); 14 | 15 | /** Try to match previous input using a different trie. On failure, 16 | * the cursor remains unchanged. */ 17 | bool transfer(const Patricia &trie); 18 | 19 | /** Advance to the next input character, updating pointer to any associated 20 | * value found. */ 21 | bool advance(unsigned char c); 22 | 23 | /** Find the ID of the (lexicographically) first descendant leaf 24 | * after advance has failed. The cursor position is unchanged. */ 25 | uint32_t findLeaf(); 26 | 27 | /** Get the data value associated with the string. 28 | * Valid values are from 0 to 0x7ffffe and 0x7fffff indicates no data. 29 | * Values are 3 bytes and the highest bit is an internal flag whether 30 | * the trie node has no children. */ 31 | uint32_t getData(); 32 | 33 | private: 34 | 35 | const unsigned char *root = nullptr; 36 | const unsigned char *ptr = nullptr; 37 | const unsigned char *found; 38 | uint16_t len; 39 | 40 | /** Handle to the JavaScript buffer with inserted data, 41 | * to prevent garbage collecting it too early. 
/** Minimal declaration of the platform's `TextEncoder`, used when the Node.js
  * `Buffer` class is unavailable. The `decode` member is kept only so that
  * existing type references keep compiling: real `TextEncoder` objects have
  * no such method, and this module decodes through `TextDecoder` instead. */
export declare class TextEncoder {
	constructor(encoding: string);

	encode(data: string): Uint8Array;
	decode(data: Uint8Array): string;
}

/** Minimal declaration of the platform's `TextDecoder`. */
declare class TextDecoder {
	constructor(encoding?: string, options?: { ignoreBOM?: boolean });

	decode(data: Uint8Array): string;
}

/** Byte array type used for parser input and encoded tries. */
export type ArrayType = Buffer | Uint8Array;
/** Constructor matching the byte array type available on this platform. */
export let ArrayType: { new(size: number): ArrayType };

/** Encode a string as UTF-8 bytes. */
export let encodeArray: (text: string) => ArrayType;
/** Decode UTF-8 bytes, optionally only the range [start, end), into a string. */
export let decodeArray: (data: ArrayType, start?: number, end?: number) => string;
/** Concatenate byte arrays into a new array of `len` bytes. */
export let concatArray: (list: ArrayType[], len: number) => ArrayType;

/** The set of platform-dependent helpers exported by this module. */
interface ArrayCodec {
	ArrayType: { new(size: number): ArrayType };
	encodeArray: (text: string) => ArrayType;
	decodeArray: (data: ArrayType, start?: number, end?: number) => string;
	concatArray: (list: ArrayType[], len: number) => ArrayType;
}

/** Helpers built on the Node.js `Buffer` class. */
function createNodeCodec(): ArrayCodec {
	return({
		ArrayType: Buffer,

		// Buffer.from replaces the deprecated `new Buffer(string)` constructor.
		encodeArray: (text: string) => Buffer.from(text),

		// A plain Uint8Array inherits a toString that ignores the encoding and
		// prints comma-separated numbers, so wrap it in a Buffer view
		// (sharing memory, no copy) before decoding.
		decodeArray: (data: ArrayType, start?: number, end?: number) => (
			Buffer.isBuffer(data) ? data : Buffer.from(data.buffer, data.byteOffset, data.byteLength)
		).toString('utf-8', start, end),

		concatArray: (list: ArrayType[], len: number) => Buffer.concat(list as Buffer[], len)
	});
}

/** Helpers built on `Uint8Array`, `TextEncoder` and `TextDecoder`,
  * for platforms without `Buffer`. */
function createWebCodec(): ArrayCodec {
	const encoder = new TextEncoder('utf-8');
	// Keep any byte order mark in the output, like Buffer.toString does.
	const decoder = new TextDecoder('utf-8', { ignoreBOM: true });

	return({
		ArrayType: Uint8Array,

		encodeArray: (text: string) => encoder.encode(text),

		// subarray creates a view instead of copying, and treats undefined
		// bounds as "from the start" / "to the end".
		decodeArray: (data: ArrayType, start?: number, end?: number) => decoder.decode(
			data.subarray(start, end)
		),

		concatArray: (list: ArrayType[], len: number) => {
			const buf = new Uint8Array(len);

			let offset = 0;
			for(let part of list) {
				buf.set(part, offset);
				offset += part.length;
			}

			return(buf);
		}
	});
}

// Prefer Buffer when it exists. If neither API is present the exports stay
// unassigned, so that merely importing this module never throws.
const codec = (
	typeof(Buffer) === 'function' ? createNodeCodec() :
	typeof(TextEncoder) === 'function' && typeof(TextDecoder) === 'function' ? createWebCodec() :
	null
);

if(codec) {
	ArrayType = codec.ArrayType;

	encodeArray = codec.encodeArray;
	decodeArray = codec.decodeArray;

	concatArray = codec.concatArray;
}
/** Settings shared by parser instances: the list of known namespaces,
  * lookup tables from URI and prefix token IDs to namespaces, IDs of some
  * special tokens, and the tries for recognizing URIs and prefixes.
  *
  * NOTE(review): several template argument lists in this class
  * (std::shared_ptr, std::vector, std::pair) appear to have been lost
  * when this listing was extracted; restore them from the original file. */
class ParserConfig {

	// Parser reads the private tables and tokens below directly.
	friend class Parser;

public:

	/** Number of entries in namespacePrefixTbl. bindPrefix rejects
	  * prefix IDs that are not below this limit. */
	static constexpr uint32_t namespacePrefixTblSize = 256;

	ParserConfig(uint32_t xmlnsToken, uint32_t emptyPrefixToken, uint32_t xmlnsPrefixToken, uint32_t processingPrefixToken);

	/** Pass a buffer to the trie used for namespace URIs. */
	void setUriTrie(nbind::Buffer buffer) { uriTrie.setBuffer(buffer); }
	/** Pass a buffer to the trie used for namespace prefixes. */
	void setPrefixTrie(nbind::Buffer buffer) { prefixTrie.setBuffer(buffer); }

	/** Append a namespace to the list of known namespaces.
	  * @return Index of the new entry in namespaceList. */
	uint32_t addNamespace(const std::shared_ptr ns) {
		namespaceList.push_back(ns);

		return(namespaceList.size() - 1);
	}

	/** Associate a URI token ID with the namespace at index ns
	  * (defined in ParserConfig.cc).
	  * @return false if ns is not a valid index in namespaceList. */
	bool addUri(uint32_t uri, uint32_t ns);

	/** Point a prefix at the namespace currently registered for a URI token,
	  * by copying the entry from namespaceByUriToken.
	  * @return false if either ID is out of range, leaving tables unchanged. */
	bool bindPrefix(uint32_t idPrefix, uint32_t uri) {
		if(idPrefix >= namespacePrefixTblSize) return(false);
		if(uri >= namespaceByUriToken.size()) return(false);

		namespacePrefixTbl[idPrefix] = namespaceByUriToken[uri];
		return(true);
	}

private:

	/** Known namespaces, indexed by the number addNamespace returned. */
	std::vector> namespaceList;
	/** Namespace entries indexed by URI token ID, filled in by addUri. */
	std::vector > namespaceByUriToken;
	/** Namespace entries indexed by prefix token ID, filled in by bindPrefix. */
	std::pair namespacePrefixTbl[namespacePrefixTblSize];

	uint32_t xmlnsToken;

	uint32_t emptyPrefixToken;
	uint32_t xmlnsPrefixToken;
	uint32_t processingPrefixToken;

	Patricia uriTrie;
	Patricia prefixTrie;

};
/** A set of named tokens with a lookup table and a trie for encoding them.
  * A set created from a parent shares the parent's data until it is first
  * modified, at which point it takes a private copy (copy-on-write). */

export class TokenSet {

	/** @param space Token space used for creating new tokens.
	  * @param parent Optional set whose contents this set initially shares. */
	constructor(private space: TokenSpace, parent?: TokenSet) {
		if(parent) {
			this.isLinked = true;

			this.tbl = parent.tbl;
			this.trie = parent.trie;
		} else {
			this.isLinked = false;

			// No prototype, so that names like "constructor" or "toString"
			// are not mistaken for existing tokens.
			this.tbl = Object.create(null);
			this.trie = new Patricia();
		}
	}

	/** Mark this set as sharing its data with another set,
	  * so that the next modification copies the data first. */
	link() {
		this.isLinked = true;
	}

	/** Take a private copy of shared data before modifying it.
	  * Does nothing if the data is not shared. */
	private unlink() {
		if(!this.isLinked) return;
		this.isLinked = false;

		const tbl: { [ name: string ]: InternalToken } = Object.create(null);
		for(let key of Object.keys(this.tbl)) {
			tbl[key] = this.tbl[key];
		}

		this.tbl = tbl;
		this.trie = this.trie.clone();
	}

	/** Get the token with the given name, creating and storing it if missing.
	  * @param name Token name.
	  * @param ns Namespace passed on to the token space for new tokens.
	  * @return The existing or newly created token. */
	createToken(name: string, ns?: ParserNamespace) {
		let token = this.tbl[name];

		if(!token) {
			this.unlink();

			token = this.space.createToken(name, ns);

			this.tbl[name] = token;
			// Tokens with an empty name are stored in the table only.
			if(token.name) {
				this.dirty = true;
				this.trie.insertNode(token);
			}
		}

		return(token);
	}

	/** Store an existing token in the set. Tokens with an empty name are ignored. */
	addToken(token: InternalToken) {
		if(token.name) {
			// Like createToken, never modify data shared with another set.
			this.unlink();

			this.dirty = true;
			this.tbl[token.name] = token;
			this.trie.insertNode(token);
		}
	}

	/** Encode the trie of all named tokens in the set. */
	encodeTrie() {
		return(this.trie.encode());
	}

	/** If true, object is a clone sharing data with another object. */
	private isLinked: boolean;

	/** Tokens in the set, by name. */
	private tbl: { [ name: string ]: InternalToken };
	private trie: Patricia;

	/** Set when the trie has changed. Callers clear it after using encodeTrie. */
	public dirty = true;

}
/** Build a trie from a word list, load its encoded form into the native
  * trie and check that looking up every word yields the ID of its token.
  * Words are read from the file named by the first command line argument,
  * or words.txt next to this script. Exits with status 1 on a mismatch. */
function testPatricia() {
	const tokenSpace = new TokenSpace(0);
	const trie = new Patricia();
	const rawTrie = new lib.Patricia();

	const wordPath = process.argv[2] || path.resolve(__dirname, 'words.txt');
	const lineList = fs.readFileSync(wordPath, { encoding: 'utf-8' }).split('\n');

	// Lines shorter than 2 characters are skipped.
	const tokenList = [];

	for(let name of lineList) {
		if(name.length > 1) tokenList.push(tokenSpace.createToken(name));
	}

	trie.insertList(tokenList);
	rawTrie.setBuffer(trie.encode());

	for(let token of tokenList) {
		const result: number = rawTrie.find(token.name);

		if(result == token.id) continue;

		console.error('ERROR in ' + result + ' ' + token.name);
		process.exit(1);
	}
}
/** Parser-specific state of a namespace: its native counterpart and the
  * sets of element and attribute tokens known in it. */

export class ParserNamespace {

	/** @param parent Parser-independent namespace definition, or another
	  * parser namespace to derive from. A derived namespace gets a clone of
	  * the native object and token sets linked to those of its parent.
	  * @param config Parser configuration supplying the token spaces. */
	constructor(public parent: Namespace | ParserNamespace, config: ParserConfig) {
		if(!(parent instanceof ParserNamespace)) {
			// Start from scratch using the names listed in the definition.
			this.base = parent;
			this.native = new NativeNamespace(parent.uri);

			this.elementSet = new TokenSet(config.elementSpace);
			this.attributeSet = new TokenSet(config.attributeSpace);

			this.attributeSet.addToken(config.xmlnsToken);

			for(let elementName of parent.elementNameList) {
				this.addElement(elementName);
			}

			for(let attributeName of parent.attributeNameList) {
				this.addAttribute(attributeName);
			}

			return;
		}

		this.base = parent.base;
		this.native = parent.native.clone();

		this.elementSet = new TokenSet(config.elementSpace, parent.elementSet);
		this.attributeSet = new TokenSet(config.attributeSpace, parent.attributeSet);

		this.uriToken = parent.uriToken;
	}

	/** Send the encoded trie of each token set flagged as dirty to the
	  * native namespace, clearing the flag.
	  * @return The native namespace object. */
	registerNative(): NativeNamespace {
		const native = this.native;
		const elementSet = this.elementSet;
		const attributeSet = this.attributeSet;

		if(elementSet.dirty) {
			native.setElementTrie(elementSet.encodeTrie());
			elementSet.dirty = false;
		}

		if(attributeSet.dirty) {
			native.setAttributeTrie(attributeSet.encodeTrie());
			attributeSet.dirty = false;
		}

		return(native);
	}

	/** Get the token for an element name in this namespace, creating it if needed. */
	addElement(name: string) {
		const token = this.elementSet.createToken(name, this);
		return(token);
	}

	/** Get the token for an attribute name in this namespace, creating it if needed. */
	addAttribute(name: string) {
		const token = this.attributeSet.createToken(name, this);
		return(token);
	}

	/** Parser-independent namespace definition. */
	public base: Namespace;
	private native: NativeNamespace;

	/** Index in parser's namespaceList. */
	public id: number;

	uriToken: Token;

	private elementSet: TokenSet;
	private attributeSet: TokenSet;

}
number, p2: number, p3: number); 51 | 52 | /** uint32_t addNamespace(std::shared_ptr); */ 53 | addNamespace(p0: Namespace): number; 54 | 55 | /** bool addUri(uint32_t, uint32_t); */ 56 | addUri(p0: number, p1: number): boolean; 57 | 58 | /** bool bindPrefix(uint32_t, uint32_t); */ 59 | bindPrefix(p0: number, p1: number): boolean; 60 | 61 | /** void setUriTrie(Buffer); */ 62 | setUriTrie(p0: number[] | ArrayBuffer | DataView | Uint8Array | Buffer): void; 63 | 64 | /** void setPrefixTrie(Buffer); */ 65 | setPrefixTrie(p0: number[] | ArrayBuffer | DataView | Uint8Array | Buffer): void; 66 | } 67 | 68 | export class Patricia extends NBindBase { 69 | /** Patricia(); */ 70 | constructor(); 71 | 72 | /** void setBuffer(Buffer); */ 73 | setBuffer(p0: number[] | ArrayBuffer | DataView | Uint8Array | Buffer): void; 74 | 75 | /** uint32_t find(const char *); */ 76 | find(p0: string): number; 77 | } 78 | -------------------------------------------------------------------------------- /lib/README.md: -------------------------------------------------------------------------------- 1 | Tokenizer library 2 | ================= 3 | 4 | This is an XML tokenizer library for `cxml`, written in C++ for speed. 5 | Fundamentally it's a small, manually designed DFA (state machine). 6 | 7 | Every recognized element or attribute from a known namespace is a specific 8 | token. Otherwise tokens are different kinds of offsets to the input buffer. 9 | 10 | Structure 11 | --------- 12 | 13 | - `Parser.cc` contains the main state machine. 14 | - `PatriciaCursor.cc` handles traversing Patricia tries containing known 15 | text string tokens. 16 | - `ParserConfig.h` contains the API for initializing parser settings. 17 | Creating new parser instances from the same config object is fast. 18 | 19 | Design 20 | ------ 21 | 22 | ### What choices make C++ suitable and why is it faster? 
23 | 24 | Some reasons: 25 | 26 | - The code would be almost the same if written in JavaScript, but a 27 | JavaScript JIT compiler would need to make countless correct guesses 28 | to produce equally optimized output. 29 | - The state machine structure is encoded in assignments to an integer 30 | `state` variable and `switch`, `case` and `goto` statements. 31 | An integer-based jump table is very fast. 32 | - Every `goto` could be replaced with `continue`, but then the compiler 33 | may not understand that the jump table can be skipped. 34 | - Length of text content is calculated in a very tight loop using a pointer. 35 | Compiled JavaScript would require more safety checks. 36 | - Output is only tokens with offsets to input, nothing is copied. 37 | - Input is treated as bytes without decoding UTF-8. 38 | Recognized tokens never need such decoding. 39 | - Calls between languages are always slow and thus only used to notify when 40 | a buffer has become full. No arguments are passed, to avoid type conversion. 41 | - Both languages directly access the same buffers, sharing memory. 42 | - Code dealing with pointers and character literals is clearer and less 43 | verbose when written in C++. 44 | 45 | ### Counter-arguments and justifications for C++ 46 | 47 | - For safety, C++ does require more careful programming, especially when 48 | using pointers. 49 | - Incorrect memory reads in C++ could crash when JavaScript would produce 50 | invalid output, allowing denial of service attacks. Information from other 51 | variables could also be leaked. 52 | - Invalid memory writes through pointers lead to remote code execution 53 | exploits, compromising all security. 54 | - They should be avoided, or audited carefully and surrounded with checks. 55 | - For speed, this tokenizer uses pointers extensively but carefully. 56 | - When reading, it avoids many run-time checks that JavaScript would do. 
57 | - However, it does not output what was read, only where it found something 58 | interesting. This avoids leaking information. 59 | - Writes are done very carefully in a single, small function. 60 | - Between various checks, only one number is written at a time. 61 | - Elsewhere, `const` pointers prevent accidental memory writes. 62 | - Written data is not directly copied from input. 63 | -------------------------------------------------------------------------------- /src/schema/Element.ts: -------------------------------------------------------------------------------- 1 | import { ElementToken } from '../parser/Token'; 2 | 3 | import { ComplexType, ElementTypeConstructor } from './ComplexType'; 4 | import { Group } from './Group'; 5 | import { MemberSpec, MemberMeta, SimpleType, SimpleValue } from './Member'; 6 | 7 | export class SimpleElementSpec extends MemberSpec { 8 | 9 | /** Name and other info. */ 10 | meta: SimpleElementMeta; 11 | 12 | } 13 | 14 | /** Configuration for elements as type members. */ 15 | 16 | export class ElementSpec extends MemberSpec { 17 | 18 | /** Name and other info, also available in the prototype of all element instances. */ 19 | meta?: ElementMeta; 20 | 21 | group?: Group; 22 | 23 | } 24 | 25 | /** Metadata for elements without children or attributes in builder output. */ 26 | 27 | export class SimpleElementMeta extends MemberMeta { 28 | 29 | /** Substitution group head. */ 30 | substitutes?: SimpleElementMeta; 31 | 32 | /** Token with element name and namespace. 33 | * A single token may have different types depending on its parent. */ 34 | token: ElementToken; 35 | 36 | type: SimpleType; 37 | 38 | } 39 | 40 | /** Metadata for elements in builder output. 
/** Metadata describing an element with complex content in builder output. */

export class ElementMeta extends MemberMeta {

	/** Get the constructor for instances of this element, creating it on
	  * first call. It extends the constructor of the element's type, and its
	  * prototype carries this metadata in the non-enumerable property `_`. */
	createProto() {
		if(this.XMLType) return(this.XMLType);

		const BaseType: ElementTypeConstructor = this.type.createProto();

		const Derived = class XMLType extends BaseType implements ElementInstance {
			_: ElementMeta;
		} as ElementConstructor;

		this.XMLType = Derived;

		const proto = Derived.prototype;

		// No value is given, so this only updates the flags of the
		// constructor property, making sure it is not enumerable.
		Object.defineProperty(proto, 'constructor', {
			configurable: true,
			enumerable: false,
			writable: true
		});

		Object.defineProperty(proto, '_', {
			configurable: true,
			enumerable: false,
			value: this,
			writable: true
		});

		return(Derived);
	}

	/** Cached constructor produced by createProto. */
	XMLType?: ElementConstructor;

	/** A singleton object to use if the element is missing. */
	placeholder?: ElementClass;

	/** Substitution group head. */
	substitutes?: ElementMeta;

	/** Token with element name and namespace.
	  * A single token may have different types depending on its parent. */
	token: ElementToken;

	type: ComplexType;

}
/** JSX factory function. Packs its arguments into an array node:
  * element kind first, then attributes, followed by any children. */
export function jsxElement(
	kind: string,
	attr: Attributes,
	...children: XMLNode[]
): XMLElementNode;

export function jsxElement(...parts: any[]): any[] {
	// The rest parameter is already a fresh array holding every argument.
	return(parts);
}
/** Compile a JSX template into a function producing its expanded form.
  *
  * The generator is called once, with a function for declaring named
  * arguments. The resulting template is expanded up front, so that the
  * returned function only needs to expand argument values and splice them
  * between the precomputed parts.
  *
  * Argument values are looked up in `spec` by using the argument name as
  * a property key. Names are no longer interpolated into evaluated source
  * code, so any string is safe and usable as a name, and compiling works
  * where `eval` is forbidden by a content security policy.
  *
  * @param config Parser configuration passed on to jsxExpand.
  * @param generate Template generator. Receives a function mapping an
  * argument name to a placeholder node.
  * @return Function taking an object with argument values and returning
  * the expanded output. A falsy or missing value expands as an empty string.
  * For templates without arguments, every call returns the same array. */
export function jsxCompile(
	config: ParserConfig,
	generate: (...args: any[]) => XMLElementNode
): (spec: any) => any[] {
	const template = generate((name: string) => new XMLArgumentNode(name));

	// jsxExpand appends placeholders and the parts following them.
	const parts: (any[] | XMLArgumentNode)[] = [[]];
	jsxExpand(config, template, parts);

	const head = parts[0] as any[];
	const rest = parts.slice(1);

	return(function compiled(spec: any) {
		// Without arguments, the first part is the entire output.
		if(!rest.length) return(head);

		const tail = rest.map((part) => (
			part instanceof XMLArgumentNode ? (
				// Expand parts representing arguments.
				jsxExpand(config, spec[part.name] || '', [[]])
			) : (
				// Output already expanded parts as-is.
				part
			)
		));

		// Equivalent to head.concat(tail[0], tail[1], ...).
		return(Array.prototype.concat.apply(head, tail));
	});
}
SpecialToken(TokenKind.sgmlText, 'SGML text'); 63 | 64 | } 65 | 66 | export class PrefixToken extends Token { 67 | 68 | constructor(public name: string, public id?: number) { super(); } 69 | 70 | } 71 | PrefixToken.prototype.kind = TokenKind.prefix; 72 | PrefixToken.prototype.kindString = 'prefix'; 73 | 74 | export class UriToken extends Token { 75 | 76 | constructor(public ns: Namespace) { super(); } 77 | 78 | } 79 | UriToken.prototype.kind = TokenKind.uri; 80 | UriToken.prototype.kindString = 'uri'; 81 | 82 | export abstract class MemberToken extends Token { 83 | 84 | constructor(public name: string, public ns: Namespace, public id?: number) { super(); } 85 | 86 | abstract resolve(ns: ParserNamespace): Token; 87 | 88 | } 89 | 90 | export class ElementToken extends MemberToken { 91 | 92 | resolve(ns: ParserNamespace) { 93 | return(ns.addElement(this.name).tokenList[this.kind as number]!); 94 | } 95 | 96 | } 97 | 98 | export class AttributeToken extends MemberToken { 99 | 100 | resolve(ns: ParserNamespace) { 101 | return(ns.addAttribute(this.name).tokenList[this.kind as number]!); 102 | } 103 | 104 | } 105 | 106 | export class OpenToken extends ElementToken { 107 | emitted = new EmittedToken(this.name, this.ns, this.id); 108 | close = new CloseToken(this.name, this.ns, this.id); 109 | } 110 | OpenToken.prototype.kind = TokenKind.open; 111 | OpenToken.prototype.kindString = 'open'; 112 | 113 | export class CloseToken extends ElementToken {} 114 | CloseToken.prototype.kind = TokenKind.close; 115 | CloseToken.prototype.kindString = 'close'; 116 | 117 | export class EmittedToken extends ElementToken {} 118 | EmittedToken.prototype.kind = TokenKind.emitted; 119 | EmittedToken.prototype.kindString = 'emitted'; 120 | 121 | export class StringToken extends AttributeToken {} 122 | StringToken.prototype.kind = TokenKind.string; 123 | StringToken.prototype.kindString = 'string'; 124 | 125 | export class SgmlToken extends Token { 126 | 127 | constructor(public name: string, 
/** Parsing rule for one type: which child elements and attributes it
  * accepts (keyed by token ID) and which constructor builds its instances. */
export class Rule {

	/** Register a child element under its token ID. */
	addElement(member: RuleMember) {
		this.elements[member.id] = member;
	}

	/** Register an attribute under its token ID. */
	addAttribute(member: RuleMember) {
		this.attributes[member.id] = member;
	}

	elements: { [id: number]: RuleMember } = {};
	attributes: { [id: number]: RuleMember } = {};

	/** Shared rule used for attributes and for child elements
	  * whose metadata is not an ElementMeta. */
	static string = new Rule();

	/** Constructor for objects matching this rule. Only set by
	  * RuleSet.createRule when element metadata is available. */
	XMLType: ElementConstructor;

}

/** A rule together with the ID and occurrence limits of one member using it. */
export class RuleMember {

	constructor(public rule: Rule, public spec: MemberSpec) {
		this.id = spec.meta!.token.id!;
		this.min = spec.min;
		this.max = spec.max;
	}

	id: number;
	min: number;
	max: number;

}

/** Create an empty object inheriting from parent,
  * so properties set on the result shadow the parent's without changing it. */
function link<Type extends object>(parent: Type) {
	return(Object.create(parent));
}

/** Linked list of ancestors seen while creating rules, innermost first. */
export interface RuleStack {
	meta?: ElementMeta;
	rule?: Rule;
	parent?: RuleStack;
}

export class RuleSet {

	/** Recursively build the rule for a type and all its members.
	  * @param type Type definition listing child elements and attributes.
	  * @param meta Metadata of the element using the type, if any.
	  * Its prototype receives placeholders for the type's children.
	  * @param parent Ancestors of the element, used to detect recursive types.
	  * @return The new rule. */
	createRule(type: ComplexType, meta?: ElementMeta, parent?: RuleStack) {
		const rule = new Rule();
		let childRule: Rule | undefined;
		// Without metadata, placeholders go into a throwaway object.
		let proto: { [key: string]: any } = {};

		if(meta) {
			rule.XMLType = meta.createProto();
			proto = rule.XMLType.prototype;
		}

		if(type.elements && type.elements.group) {
			for(let childSpec of type.elements.group.list) {
				const memberMeta = childSpec.meta;

				if(memberMeta) {
					if(memberMeta instanceof ElementMeta) {
						childRule = void 0;

						for(let item = parent; item; item = item.parent) {
							if(item.meta == memberMeta) {
								// If the child element type matches an ancestor's,
								// re-use its rule to avoid infinite recursion.

								childRule = item.rule;
								break;
							}
						}

						if(!childRule) {
							childRule = this.createRule(memberMeta.type, memberMeta, { meta, rule, parent });
						}

						// Subclass type metadata and clear existence flag to indicate a placeholder.
						let fakeMeta = link(memberMeta);
						fakeMeta.exists = false;

						let placeholder: ElementInstance | ElementInstance[] | null = new childRule.XMLType();
						placeholder._ = fakeMeta;
						memberMeta.placeholder = placeholder;

						if(childSpec.max > 1) {
							// Use arrays as placeholders for arrays of children.
							placeholder = childSpec.min > 0 ? [ placeholder ] : [];
						} else if(childSpec.min < 1) {
							// Optional single children get no placeholder.
							placeholder = null;
						}

						if(placeholder) {
							// Non-enumerable, so placeholders stay hidden when
							// listing the properties of an instance.
							Object.defineProperty(proto, memberMeta.token.name, {
								configurable: true,
								enumerable: false,
								value: placeholder,
								writable: true
							});
						}
					} else childRule = Rule.string;

					rule.addElement(new RuleMember(childRule, childSpec));
				}
			}
		}

		if(type.attributes) {
			for(let attributeSpec of type.attributes.list) {
				const memberMeta = attributeSpec.meta;

				if(memberMeta) {
					// All attributes share the string rule.
					childRule = Rule.string;

					rule.addAttribute(new RuleMember(childRule, attributeSpec));
				}
			}
		}

		return(rule);
	}

	/** Build rules for every type reachable from the schema's document type. */
	constructor(public schema: SimpleSchema) {
		this.rootRule = this.createRule(schema.document);
	}

	/** Rule for the document itself. */
	rootRule: Rule;

}
/** Transform stream turning token chunks into JSON text.
  * Each element becomes an array starting with its name, followed by
  * [ "$name", value ] pairs for attributes, [ "$", text ] pairs for text
  * and nested arrays for child elements. */
export class JsonWriter extends stream.Transform {

	/** @param data Arbitrary data passed to any custom serializers. */

	constructor(private data?: any) {
		super({ objectMode: true });
	}

	/** Serialize one chunk of tokens.
	  * Writer state carries over between calls, so elements may span chunks.
	  * @return Output fragments, to be joined without a separator. */
	transform(chunk: TokenChunk) {
		let state = this.state;
		let depth = this.depth;
		let indent = this.indent;
		let nsElement = this.nsElement;
		const buffer = chunk.buffer;
		let token: typeof buffer[0];
		let member: MemberToken;
		let serialized: any;

		const partList: string[] = [];
		let partNum = -1;
		const lastNum = chunk.length - 1;
		let tokenNum = -1;

		while(tokenNum < lastNum) {

			token = buffer[++tokenNum];

			if(token instanceof Token) {
				switch(token.kind) {
					case TokenKind.open:

						member = token as MemberToken;
						nsElement = member.ns;

						if(nsElement.isSpecial && nsElement.defaultPrefix == '?') {
							// Processing instructions produce no element array.
							state = State.PROCESSING;
						} else {
							++depth;
							// Names are escaped like any other JSON string, so
							// unusual characters cannot break the output.
							partList[++partNum] = indent + '[ ' + JSON.stringify(member.name);
							state = State.ELEMENT;
						}

						indent = ',' + indentPattern.substring(0, depth);
						break;

					case TokenKind.emitted:

						state = State.TEXT;
						break;

					case TokenKind.close:

						if(state != State.PROCESSING) {
							indent = indentPattern.substring(0, --depth);

							// Put the closing bracket on its own line only if
							// the element's last item was not text.
							if(state == State.TEXT) {
								partList[++partNum] = indent + ']';
							} else {
								partList[++partNum] = ' ]';
							}

							indent = ',' + indent;
						}

						state = State.TEXT;
						break;

					case TokenKind.string:

						member = token as MemberToken;

						// Attribute names get a $ prefix. The value and closing
						// bracket are written when the value arrives.
						partList[++partNum] = ', [ ' + JSON.stringify('$' + member.name);
						break;

					case TokenKind.comment:

						state = State.COMMENT;
						break;

					case TokenKind.other:

						if(token.serializeJson) {
							serialized = token.serializeJson(indent, this.data);
							if(typeof(serialized) != 'string') serialized = JSON.stringify(serialized);

							partList[++partNum] = indent + serialized;
							state = State.AFTER_TEXT;
						}
						break;
				}
			} else {
				switch(state) {
					case State.TEXT:

						partList[++partNum] = ', [ "$", ' + JSON.stringify(token) + ' ]';
						state = State.AFTER_TEXT;
						break;

					case State.ELEMENT:
					case State.PROCESSING:

						// Attribute value, closing the pair opened by its name.
						partList[++partNum] = ', ' + JSON.stringify(token) + ' ]';
						break;

					case State.COMMENT:

						// Comment contents are dropped.
						break;

				}
			}
		}

		this.state = state;
		this.depth = depth;
		this.indent = indent;
		this.nsElement = nsElement;

		return(partList);
	}

	/** Stream interface: serialize a chunk and pass the text downstream. */
	_transform(chunk: TokenChunk | null, enc: string, flush: (err: any, chunk: string) => void) {
		if(!chunk) {
			flush(null, '');
			return;
		}

		const partList = this.transform(chunk);
		flush(null, partList.join(''));
	}

	/** Stream interface: end the output with a line break. */
	_flush( flush: (err: any, chunk: string) => void) {
		flush(null, '\n');
	}

	private state = State.TEXT as State;
	private depth = Indent.MIN_DEPTH;
	private indent = '';
	private nsElement: Namespace;

}
/** Point the cursor at the root node of a trie and forget any earlier match. */
void PatriciaCursor :: init(const Patricia &trie) {
	if(trie.root != root) {
		root = trie.root;
		// Hold on to trie data used by the cursor in case it gets garbage collected.
		buffer = trie.buffer;
	}

	// A node starts with its length; the cursor is left just after it.
	ptr = root;
	len = *ptr++;

	found = nullptr;
}

/** Try to move the cursor forward by one input byte.
  * On a match ending an inserted string, found is set to its data value.
  * @return True if the byte matched, false otherwise. */
bool PatriciaCursor :: advance(unsigned char c) {
	const unsigned char *p = ptr;
	unsigned char delta;

	// Loop until the current trie branch node contains an entire byte.
	while(len < 8) {
		if(len) {
			// Compare input with branch node contents by using XOR, and
			// shift away trailing bits not contained in the node.
			delta = (c ^ *p++) >> (7 - len);
		} else {
			// If the branch doesn't depend on any bits inside the byte,
			// it must be the last byte of an inserted string.
			// An associated data value will follow so jump over it.
			delta = 0;

			// The high bit of the associated data value signals that no
			// longer strings with this prefix exist.
			if(*p & 0x80) {
				ptr = p;
				return(false);
			}
		}

		if(delta) {
			// If input differs from branch node contents in bits before
			// the last one, then it was not found in the trie.
			if(delta > 1) {
				ptr = p - 1;
				return(false);
			}

			// If the last bit differs, find pointer to the second child.
			// It must exist, otherwise there would be no branch here.
			// The 3-byte big-endian offset is relative to its own position.
			p += (p[0] << 16) + (p[1] << 8) + p[2];
		} else {
			// This branch is conditioned on a bit so it has a pointer
			// to a second child, or it ends on a byte boundary so it has
			// a data pointer. In either case, jump over a pointer to find
			// the first child node.
			p += 3;
		}

		// Entered a new node, so read its length.
		len = *p++;
	}

	len -= 8;

	// If the node contains a full byte but the input doesn't match,
	// then it was not found in the trie.
	// NOTE(review): ptr is left after the mismatching byte and len is
	// already reduced here — confirm callers expect that.
	if(c != *p++) {
		ptr = p;
		return(false);
	}

	if(!len) {
		// If the branch doesn't depend on any bits inside the byte,
		// it must be the last byte of an inserted string.
		// Store the location of its data value.
		found = p;

		// NOTE: Nodes longer than 32 bytes must be split, so intermediate
		// nodes represent partial strings not actually inserted. Their
		// associated value is Patricia :: notFound, so results are unaffected.
	}

	ptr = p;
	return(true);
}

/** Unfinished: meant to move the cursor to the matching position in another trie.
  * The replay loop is disabled (its condition starts with 0), so this
  * currently just resets the cursor to the root of trie and returns true. */
bool PatriciaCursor :: transfer(const Patricia &trie) {
	const unsigned char *p = trie.root;
	const unsigned char *target = ptr;
	// Never assigned; only read inside the disabled loop below.
	unsigned char c;
	PatriciaCursor other;

	other.init(trie);

	// TODO!
	while(0 && p < target) {
		// c = ...

		if(!other.advance(c)) return(false);
	}

	*this = other;

	return(true);
}

/** Starting from the cursor position, follow first children until reaching
  * a node with a data value, and return that value. Sets found as a side
  * effect; ptr and the member len are left untouched. */
uint32_t PatriciaCursor :: findLeaf() {
	const unsigned char *p = ptr;
	uint16_t len = this->len;
	uint32_t data;

	do {
		// Skip to reference to current node's data or second child.
		p += (len + 7) / 8;

		// A length not divisible by 8 means the node branches on a bit,
		// so its reference is a second child offset instead of data.
		while(len & 7) {
			// Read length from beginning of first child
			// (just after the data reference).
			len = p[3];
			// Skip current node's data or second child reference, the first child's
			// length and its contents, moving to its data or second child reference.
			p += (len + 7) / 8 + 4;
		}

		// Length of the following node, needed if the loop repeats.
		len = p[3];
		found = p;
		data = getData();

		p += 4;
		// After splitting nodes at 32 chars, avoid returning a split node.
	} while(data == Patricia :: notFound && !(*p & 0x80));

	return(data);
}

/** Decode the 3-byte big-endian data value at found, masked with
  * Patricia :: idMask, or return Patricia :: notFound if nothing was found. */
uint32_t PatriciaCursor :: getData() {
	if(!found) return(Patricia :: notFound);

	return( ( (found[0] << 16) + (found[1] << 8) + found[2] ) & Patricia :: idMask );
}
/** Splits a member or type name into an optional "$" attribute prefix,
  * the bare name, an optional "[]" array suffix and an optional "?" suffix.
  * The name must exclude "?", otherwise it swallows the optional suffix. */
const memberNamePattern = /(\$?)([^\[?]+)(\[\])?(\?)?/;

/** Schema built from a compact JSON-style description, mapping each
  * type name to a list of its members. */
export class SimpleSchema {

	/** @param parserConfig Parser configuration supplying element and attribute tokens.
	  * @param ns Namespace of all members, registered with the parser configuration.
	  * @param spec Member lists of all types, keyed by type name.
	  * @param root Members of the document, used only if spec has no "document" type. */
	constructor(private parserConfig: ParserConfig, public ns: Namespace, spec: SimpleSchemaSpec, root = spec['document']) {
		const typeTbl = this.typeTbl;

		// SimpleType expands to a plain string without support for attributes,
		// ComplexType expands to { $: "..." } allowing parseUnknown to work.
		// const stringType = new SimpleType();
		const stringType = new ComplexType();
		typeTbl['xs:string'] = stringType;

		parserConfig.addNamespace(ns);

		// Create placeholder objects for all types.
		for(let typeName of Object.keys(spec)) {
			typeTbl[typeName] = new ComplexType();
		}

		// Define types, using placeholders when referring to undefined types.
		for(let typeName of Object.keys(spec)) {
			this.defineType(spec[typeName], typeTbl[typeName] as ComplexType);
		}

		this.document = (typeTbl['document'] || this.defineType(root)) as ComplexType;
	}

	/** Add members to a type.
	  * Each member is a name, or an object mapping member names to type names.
	  * A "$" prefix marks an attribute, a "[]" suffix allows any number of
	  * occurrences and a "?" suffix makes the member optional. Prefix and
	  * suffixes may appear in the member name, the type name or both.
	  * Elements referring to an unknown type are skipped.
	  * @param spec List of members.
	  * @param type Type to extend, by default a new empty type.
	  * @return The extended type. */
	defineType(spec: SimpleMemberSpec[], type: ComplexType = new ComplexType()) {
		for(const child of spec) {
			// A plain string declares a member with a type of the same name.
			const memberTbl: { [memberName: string]: string } = (
				typeof(child) == 'string' ? { [child]: child } : child
			);

			for(const memberName of Object.keys(memberTbl)) {
				const typeSpec = memberTbl[memberName];

				// Parse element or attribute name with type prefix / suffix.
				let parts = memberName.match(memberNamePattern);
				if(!parts) continue;

				let [, prefix, name, arraySuffix, optionalSuffix] = parts;

				// Parse type name if it differs from element/attribute name.
				if(typeSpec != memberName) {
					parts = typeSpec.match(memberNamePattern);
					if(!parts) continue;

					// Type prefix / suffix behave identically in member and type names.
					prefix = prefix || parts[1];
					arraySuffix = arraySuffix || parts[3];
					optionalSuffix = optionalSuffix || parts[4];
				}

				const min = optionalSuffix ? 0 : 1;
				const max = arraySuffix ? Infinity : 1;

				const memberTypeName = parts[2];
				const memberType = this.typeTbl[memberTypeName];

				// Prefix $ marks attributes.
				if(prefix == '$') {
					const token = this.parserConfig.getAttributeTokens(this.ns, name)[TokenKind.string]!;
					const attributeSpec = new AttributeSpec(min, max);
					const attributeMeta = new AttributeMeta(token);

					// attributeMeta.type = xsd:string
					attributeSpec.meta = attributeMeta;
					type.addAttribute(attributeSpec);
				} else if(memberType) {
					const token = this.parserConfig.getElementTokens(this.ns, name)[TokenKind.open]!;
					let elementSpec: SimpleElementSpec | ElementSpec;
					let elementMeta: SimpleElementMeta | ElementMeta;

					if(memberType instanceof ComplexType) {
						elementSpec = new ElementSpec(min, max);
						elementMeta = new ElementMeta(token);
					} else {
						elementSpec = new SimpleElementSpec(min, max);
						elementMeta = new SimpleElementMeta(token);
					}

					elementMeta.type = memberType;
					elementSpec.meta = elementMeta;
					type.addAll(elementSpec);
				}
			}
		}

		return(type);
	}

	/** All known types by name, including the built-in "xs:string". */
	typeTbl: { [ typeName: string ]: SimpleType | ComplexType } = {};

	/** Type of the document itself. */
	document: ComplexType;

}
const enum State {
	ELEMENT = 0,
	PROCESSING,
	TEXT,
	COMMENT
}

/** Own-property test that keeps working when the tested object has a
  * member called "hasOwnProperty", as documents may define one. */
const hasOwn = Object.prototype.hasOwnProperty;

/** Builds a tree of JavaScript objects from chunks of parsed tokens. */
export class Builder {

	/** @param config Configuration holding rule sets by namespace URI, and options.
	  * @param nsUri Namespace URI of the rule set describing the document.
	  * @throws Error if no rule set is registered for the namespace. */
	constructor(private config: BuilderConfig, public nsUri: string) {
		const ruleSet = this.config.ruleSetTbl[nsUri];

		if(!ruleSet) throw(new Error('Unknown XML namespace ' + nsUri));

		this.rule = ruleSet.rootRule;
	}

	/** Get a constructor for an element missing from the schema.
	  * Its spec is created on first use and then looked up by token ID. */
	getUnknownProto(token: ElementToken) {
		let elementSpec: ElementSpec | undefined = this.unknownType.elements && this.unknownType.elements.group!.tbl[token.id!] as ElementSpec;

		if(!elementSpec) {
			elementSpec = new ElementSpec(0, Infinity);
			const elementMeta = new ElementMeta(token);

			elementMeta.type = new ComplexType();
			elementSpec.meta = elementMeta;

			this.unknownType.addAll(elementSpec);
		}

		return(elementSpec.meta!.createProto());
	}

	/** Process one chunk of tokens, extending the document tree.
	  * State carries over between calls, so elements may span chunks.
	  * The chunk is freed afterwards.
	  * @return The document built so far, or nothing if no chunk was given. */
	write(chunk: TokenChunk) {
		if(!chunk) return;

		const parseUnknown = this.config.options.parseUnknown;
		let unknownDepth = this.unknownDepth;
		let state = this.state;
		let item = this.item;
		let rule = this.rule;
		let member = this.member;
		let target = this.target;
		let stackPos = this.stackPos;

		const ruleStack = this.ruleStack;
		const itemStack = this.itemStack;

		const buffer = chunk.buffer;
		let token: typeof buffer[0];
		let dataType: string;
		let kind: number;
		let id: number;
		let name: string;

		let itemNext: any;
		let ruleNext: Rule | undefined;

		const lastNum = chunk.length - 1;
		let tokenNum = -1;

		while(tokenNum < lastNum) {

			token = buffer[++tokenNum];
			dataType = typeof(token);

			if(unknownDepth) {
				// Inside an ignored unknown element: only track nesting
				// to find where it ends.
				if(dataType == 'object') {
					kind = (token as Token).kind;

					if(kind == TokenKind.open) ++unknownDepth;
					else if(kind == TokenKind.close) --unknownDepth;
				}
			} else if(dataType == 'object') {
				kind = (token as Token).kind;

				switch(kind) {
					case TokenKind.open:

						id = (token as OpenToken).id!;
						name = (token as OpenToken).name;
						member = rule && rule.elements[id];

						if(member) {
							ruleNext = member.rule;

							if(ruleNext == Rule.string) {
								// NOTE: If the string element has attributes,
								// they're added to its parent element!
								target = name;
								itemNext = item;
							} else {
								itemNext = new ruleNext.XMLType();
								if(member.max > 1) {
									// Own check: the prototype may hold a placeholder.
									if(!hasOwn.call(item, name)) item[name] = [];
									item[name].push(itemNext);
								} else item[name] = itemNext;
							}
						} else if(!parseUnknown) {
							++unknownDepth;

							state = State.TEXT;
							break;
						} else {

							ruleNext = void 0;
							itemNext = new (this.getUnknownProto(token as OpenToken))();

							// Repeated unknown elements turn into an array.
							if(!hasOwn.call(item, name)) item[name] = itemNext;
							else if(item[name] instanceof Array) item[name].push(itemNext);
							else item[name] = [item[name], itemNext];
						}

						itemStack[stackPos] = item;
						ruleStack[stackPos++] = rule;
						item = itemNext;
						rule = ruleNext;

						state = State.ELEMENT;
						break;

					case TokenKind.close:

						item = itemStack[--stackPos];
						rule = ruleStack[stackPos];

						// Text after the closing tag belongs to the parent.
						if(rule != Rule.string) target = '$';

						state = State.TEXT;
						break;

					case TokenKind.emitted:

						// Text content goes in "$", except for string elements
						// which already target a property of their parent.
						if(rule != Rule.string) target = '$';

						state = State.TEXT;
						break;

					case TokenKind.string:

						id = (token as StringToken).id!;
						member = rule && rule.attributes[id];
						if(member || parseUnknown) {
							target = (token as StringToken).name;
						} else {
							// Drop the value of an unknown attribute.
							target = void 0;
						}

						break;

					case TokenKind.comment:

						state = State.COMMENT;
						break;
				}
			} else {
				switch(state) {
					case State.TEXT:
					case State.ELEMENT:

						if(target) {
							// Members allowing several values hold space-separated lists.
							item[target] = (member && member.max > 1) ? (token + '').split(/ +/) : token;
							target = void 0;
						}

						break;
				}
			}
		}

		this.unknownDepth = unknownDepth;
		this.state = state;
		this.item = item;
		this.rule = rule;
		this.member = member;
		this.target = target;
		this.stackPos = stackPos;

		chunk.free();

		return(this.document);
	}

	/** Root of the tree being built. */
	document: any = {};
	/** Object currently receiving members. */
	private item = this.document;
	private rule?: Rule;
	private member?: RuleMember;
	/** Name of the property receiving the next text or attribute value. */
	private target?: string;

	/** Collects specs of elements missing from the schema. */
	private unknownType = new ComplexType();
	/** Nesting depth inside an ignored unknown element, 0 when outside. */
	private unknownDepth = 0;

	private stackPos = 0;
	private ruleStack: (Rule | undefined)[] = [];
	private itemStack: any[] = [];

	private state = State.TEXT;

}
/** Trie node holding len bits of key data in buf.
  * Children are reached through first and second, see PatriciaCursor.advance. */
class Node {
	constructor(
		/** Token of the inserted string ending at this node, if any. */
		public token: InternalToken | null,
		public buf: ArrayType,
		/** Number of bits in this node. */
		public len: number,
		public first?: Node,
		public second?: Node
	) {}

	/** Copy this node and, recursively, its children.
	  * Tokens and key buffers are shared with the original. */
	clone(): Node {
		const other = new Node(
			this.token,
			this.buf,
			this.len,
			this.first && this.first.clone(),
			this.second && this.second.clone()
		);

		return(other);
	}
}

/** Maximum number of bits per node (number must fit in 1 byte). */
const MAX_LEN = 255; // Test edge cases by using smaller numbers (>= 8) here!

/** Must equal Patricia :: notFound on C++ side. */
export const NOT_FOUND = 0x7fffff;

/** Position inside a trie: a node, a byte offset in its buffer and
  * the number of its bits not yet matched. */
class PatriciaCursor {
	constructor(public node: Node) {
		this.pos = 0;
		this.len = node.len;
	}

	/** Try to match one more byte.
	  * On failure the cursor stays at, or moves to, the node and byte where
	  * the mismatch occurred, which is where insertNode splits the trie.
	  * @return True if the byte matched and the cursor moved past it. */
	advance(c: number) {
		let node = this.node;
		let b = node.buf;
		let p = this.pos;
		let len = this.len;
		let delta = 0;

		// Descend until the current node has a whole byte left to compare.
		while(len < 8) {
			if(len) {
				// Compare the remaining bits of the node and the bit just
				// after them, which selects a child.
				delta = (c ^ b[p++]) >> (7 - len);
			} else {
				// End of node at a byte boundary: only a first child can follow.
				// The cursor is left unchanged if there is none.
				if(!node.first) return(false);
				delta = 0;
			}

			if(delta) {
				if(delta > 1) {
					// Mismatch in bits belonging to the node itself.
					this.node = node;
					this.pos = p - 1;
					this.len = len;

					return(false);
				}

				// Only the bit after the node differs.
				node = node.second!;
			} else {
				node = node.first!;
			}

			b = node.buf;
			p = 0;
			len = node.len;
		}

		if(c != b[p++]) {
			this.node = node;
			this.pos = p - 1;
			this.len = len;

			return(false);
		}

		len -= 8;

		this.node = node;
		this.pos = p;
		this.len = len;

		return(true);
	}

	pos: number;
	len: number;
}

/** Binary trie of tokens keyed by the bytes of their names. */
export class Patricia {
	/** Copy the trie structure. Tokens and key buffers are shared. */
	clone() {
		const other = new Patricia();

		if(this.root) other.root = this.root.clone();

		return(other);
	}

	/** Insert a token, splitting an existing node where its name diverges.
	  * @throws Error for an empty name, or for a name that already ends
	  * exactly at the end of an existing node. */
	insertNode(token: InternalToken) {
		let pos = 0;
		let root = this.root;

		if(!token.name) {
			throw(new Error('Empty strings not supported'));
		}

		if(!root) {
			root = new Node(token, token.buf, token.buf.length * 8);
			this.root = root;
			return;
		}

		let cursor = new PatriciaCursor(root);

		// Follow the part of the name already present in the trie.
		while(pos < token.buf.length && cursor.advance(token.buf[pos])) ++pos;

		const node = cursor.node;
		let rest: Node | undefined;

		// Unmatched remainder of the name, if any, becomes a new node.
		if(pos < token.buf.length) {
			rest = new Node(
				token,
				token.buf.slice(pos),
				(token.buf.length - pos) * 8
			);
		}

		if(cursor.len) {
			let bit = 0;

			if(rest) {
				// Count matching leading bits of the first differing byte.
				// NOTE(review): this loops forever if the two bytes are equal;
				// presumably advance() only stops on a differing byte — confirm.
				let c = token.buf[pos] ^ node.buf[cursor.pos];

				while(!(c & 0x80)) {
					c <<= 1;
					++bit;
				}
			} else {
				// The new node is a prefix of this node.
				// Cut this node at a byte boundary.
			}

			// Split the node.

			// The tail starts from the byte containing the split point
			// and inherits the old token and children.
			node.first = new Node(
				node.token!,
				node.buf.slice(cursor.pos),
				node.len - cursor.pos * 8,
				node.first,
				node.second
			);

			node.second = rest;

			// The head keeps the common bits, and the new token if the
			// inserted name ends here.
			// NOTE(review): if the bytes differ in their top bit, bit is 0
			// and the head ends on a byte boundary, where advance() always
			// follows first — confirm such names (e.g. ASCII next to
			// non-ASCII) cannot occur.
			node.token = rest ? null : token;
			node.buf = node.buf.slice(0, cursor.pos + ((bit + 7) >> 3));
			node.len = cursor.pos * 8 + bit;
		} else if(!rest) {
			throw(new Error('Duplicates not supported: ' + token.name));
		} else {
			// The new node only extends an existing node.
			node.first = rest;
		}
	}

	/** Insert several tokens, then look each one up again.
	  * @throws Error if a token cannot be found afterwards. */
	insertList(tokenList: InternalToken[]) {
		for(let token of tokenList) {
			this.insertNode(token);
		}

		// Verify that the tokens were correctly inserted!

		for(let token of tokenList) {
			let pos = 0;
			let root = this.root;

			let cursor = new PatriciaCursor(root);

			while(pos < token.buf.length) {
				if(!cursor.advance(token.buf[pos++])) {
					throw(new Error('Inserted token missing: ' + token.name));
				}
			}

			if(cursor.node.token != token) {
				throw(new Error('Wrong token inserted for: ' + token.name));
			}
		}
	}

	/** Encode a node and its subtree, appending the parts to dataList.
	  * Each part holds a length byte (in bits), the key bytes and a
	  * 3-byte big-endian reference. Nodes over MAX_LEN bits become several parts.
	  * @return Total number of bytes appended. */
	private static encodeNode(
		node: Node,
		dataList: ArrayType[]
	) {
		let len = node.len;
		let partLen: number;
		let byteLen: number;
		let totalByteLen = 0;
		let posIn = -1;
		let posOut: number;

		while(len) {
			partLen = len;
			// Split long nodes at a byte boundary.
			if(partLen > MAX_LEN) partLen = MAX_LEN & ~7;

			// Convert bit to byte length rounding up, add 1 byte for length
			// header and 3 bytes for reference
			// (token ID or offset to second child).
			byteLen = (partLen + 7) >> 3;
			const data = new ArrayType(byteLen + 4);

			dataList.push(data);
			totalByteLen += byteLen + 4;

			posOut = 0;

			data[posOut] = partLen;
			while(posOut < byteLen) data[++posOut] = node.buf[++posIn];

			let ref: number;

			if(len > MAX_LEN) {
				// More parts follow, so no inserted string ends here.
				ref = NOT_FOUND;
			} else {
				let nextTotalLen = 0;
				if(node.first) nextTotalLen += Patricia.encodeNode(node.first, dataList);

				if(node.second) {
					// Offset from the reference to the second child:
					// the reference itself plus the entire first subtree.
					ref = nextTotalLen + 3;
					nextTotalLen += Patricia.encodeNode(node.second, dataList);
				} else {
					// ref = tokenSet.encode(node.token!) || 0;
					ref = node.token!.id;
					if(!node.first) ref |= 0x800000; // See 0x80 in PatriciaCursor.cc
				}

				totalByteLen += nextTotalLen;
			}

			data[++posOut] = ref >> 16;
			data[++posOut] = ref >> 8;
			data[++posOut] = ref;

			len -= partLen;
		}

		return(totalByteLen);
	}

	/** Encode the whole trie into a single buffer.
	  * An empty trie is encoded as the sentinel node. */
	encode() {
		const dataList: ArrayType[] = [];

		// Encode trie contents into a buffer.
		const dataLen = Patricia.encodeNode(
			this.root || Patricia.sentinel,
			dataList
		);

		return(concatArray(dataList, dataLen));
	}

	/** Represents the root of an empty tree. */
	private static sentinel = new Node(
		InternalToken.empty,
		InternalToken.empty.buf,
		InternalToken.empty.buf.length * 8
	);

	/** Root node, unset until the first insert. */
	private root: Node;
}
*/ 14 | bool isQualified; 15 | /** Namespace of this element. */ 16 | Namespace *nsElement; 17 | /** Default xmlns before entering this element. */ 18 | Namespace *nsOuterDefault; 19 | /** Number of new xmlns mappings made by this element. */ 20 | uint32_t xmlnsMapCount; 21 | 22 | }; 23 | 24 | struct PrefixDefinition { 25 | 26 | PrefixDefinition(uint32_t idPrefix = 0, uint32_t idNamespace = 0) : 27 | idPrefix(idPrefix), idNamespace(idNamespace) {} 28 | 29 | uint32_t idPrefix; 30 | uint32_t idNamespace; 31 | 32 | }; 33 | 34 | struct Element { 35 | 36 | Element(size_t prefixStackOffset, uint32_t crc32) : 37 | prefixStackOffset(prefixStackOffset), crc32(crc32) {} 38 | 39 | size_t prefixStackOffset; 40 | // TODO: verify open and close tags match by using CRC. 41 | uint32_t crc32; 42 | 43 | }; 44 | 45 | /** Fast streaming XML parser. */ 46 | 47 | class Parser { 48 | 49 | public: 50 | 51 | static constexpr uint32_t namespacePrefixTblSize = ParserConfig :: namespacePrefixTblSize; 52 | 53 | /** Parser states. 
*/ 54 | 55 | enum class State : uint32_t { 56 | BEGIN, 57 | MATCH, MATCH_SPARSE, QUOTE, 58 | BEFORE_TEXT, TEXT, 59 | BEFORE_CDATA, CDATA, 60 | AFTER_LT, 61 | BEFORE_NAME, MATCH_TRIE, NAME, UNKNOWN_NAME, 62 | STORE_ELEMENT_NAME, AFTER_ELEMENT_NAME, 63 | AFTER_CLOSE_ELEMENT_NAME, 64 | BEFORE_ATTRIBUTE_VALUE, AFTER_ATTRIBUTE_VALUE, 65 | DEFINE_XMLNS_BEFORE_PREFIX_NAME, DEFINE_XMLNS_AFTER_PREFIX_NAME, 66 | BEFORE_VALUE, VALUE, UNKNOWN_VALUE, DEFINE_XMLNS_AFTER_URI, 67 | BEFORE_SGML, SGML_DECLARATION, 68 | AFTER_PROCESSING_NAME, AFTER_PROCESSING_VALUE, 69 | BEFORE_COMMENT, COMMENT, 70 | EXPECT, 71 | PARSE_ERROR 72 | }; 73 | 74 | enum class TagType : uint32_t { 75 | ELEMENT, 76 | SGML_DECLARATION, 77 | PROCESSING 78 | }; 79 | 80 | enum class MatchTarget : uint32_t { 81 | ELEMENT, 82 | ELEMENT_NAMESPACE, 83 | ATTRIBUTE, 84 | ATTRIBUTE_NAMESPACE 85 | }; 86 | 87 | static constexpr unsigned int TOKEN_SHIFT = 5; 88 | 89 | #define export 90 | #define const 91 | #define enum enum class 92 | 93 | #define CodeType TokenType : uint32_t 94 | #include "../src/tokenizer/CodeType.ts" 95 | #undef CodeType 96 | 97 | #define ErrorType ErrorType : uint32_t 98 | #include "../src/tokenizer/ErrorType.ts" 99 | #undef ErrorType 100 | 101 | #undef enum 102 | #undef const 103 | #undef export 104 | 105 | Parser(const ParserConfig &config); 106 | 107 | ErrorType destroy(); 108 | 109 | ParserConfig *getConfig() { return(&config); } 110 | 111 | /** Parse a chunk of incoming data. 
*/ 112 | ErrorType parse(nbind::Buffer chunk); 113 | 114 | void setCodeBuffer(nbind::Buffer tokenBuffer, nbind::cbFunction &flushTokens) { 115 | this->flushTokens = std::unique_ptr(new nbind::cbFunction(flushTokens)); 116 | this->tokenBuffer = tokenBuffer; 117 | 118 | tokenList = reinterpret_cast(tokenBuffer.data()); 119 | tokenBufferEnd = tokenList + tokenBuffer.length() / 4; 120 | 121 | flushTokens.reset(); 122 | } 123 | 124 | inline void flush(uint32_t *&tokenPtr) { 125 | (*flushTokens)(); 126 | tokenList[0] = 0; 127 | tokenPtr = tokenList + 1; 128 | } 129 | 130 | bool updateElementStack(TokenType nameTokenType) { 131 | if(nameTokenType == TokenType :: OPEN_ELEMENT_ID) { 132 | // TODO: Ensure stack is not too large. 133 | elementStack.emplace_back(prefixStack.size(), 0); 134 | } else if(nameTokenType == TokenType :: CLOSE_ELEMENT_ID) { 135 | if(elementStack.empty()) return(false); 136 | 137 | const Element &element = elementStack.back(); 138 | size_t oldSize = element.prefixStackOffset; 139 | 140 | for(size_t size = prefixStack.size(); size > oldSize; --size) { 141 | const PrefixDefinition &old = prefixStack.back(); 142 | Namespace *ns = config.namespaceList[old.idNamespace].get(); 143 | // For efficiency, never undefine an xmlns prefix 144 | // because it may be redefined identically later. 145 | if(ns) { 146 | config.namespacePrefixTbl[old.idPrefix] = std::make_pair(old.idNamespace, ns); 147 | } 148 | prefixStack.pop_back(); 149 | } 150 | 151 | elementStack.pop_back(); 152 | } 153 | 154 | return(true); 155 | } 156 | 157 | /** Output a token. This is the only function writing to memory, so safety 158 | * from code execution exploits depends on this and nothing else. */ 159 | 160 | inline void writeToken(TokenType kind, uint32_t token, uint32_t *&tokenPtr) { 161 | if(tokenPtr >= tokenBufferEnd) flush(tokenPtr); 162 | 163 | // Buffer content length is stored at its beginning. 
164 | ++tokenList[0]; 165 | 166 | // This must never write outside the range 167 | // from tokenList to tokenBufferEnd (exclusive). 168 | *tokenPtr++ = static_cast(kind) + (token << TOKEN_SHIFT); 169 | } 170 | 171 | void setPrefix(uint32_t idPrefix) { 172 | if(idPrefix < namespacePrefixTblSize) this->idPrefix = idPrefix; 173 | memberPrefix->idPrefix = idPrefix; 174 | memberPrefix->idNamespace = config.namespacePrefixTbl[config.emptyPrefixToken].first; 175 | } 176 | 177 | bool bindPrefix(uint32_t idPrefix, uint32_t uri) { 178 | uint32_t nsOld = config.namespacePrefixTbl[idPrefix].first; 179 | 180 | if(config.bindPrefix(idPrefix, uri)) { 181 | // Push old prefix binding to stack, to restore it after closing tag. 182 | prefixStack.emplace_back(idPrefix, nsOld); 183 | if(elementPrefix.idPrefix == idPrefix) { 184 | elementPrefix.idNamespace = config.namespacePrefixTbl[idPrefix].first; 185 | } 186 | return(true); 187 | } 188 | 189 | return(false); 190 | } 191 | 192 | bool addUri(uint32_t uri, uint32_t idNamespace); 193 | 194 | // Emit content for a partially matched token. 195 | // If the input buffer was drained, emit the match length and some 196 | // valid token beginning identically, to recover the complete name. 197 | inline void emitPartialName( 198 | const unsigned char *p, 199 | size_t offset, 200 | TokenType tokenType, 201 | uint32_t *&tokenPtr 202 | ); 203 | 204 | inline void updateRowCol(unsigned char c); 205 | 206 | inline uint32_t getRow() { return(row); } 207 | inline uint32_t getCol() { return(col); } 208 | 209 | ParserConfig config; 210 | 211 | /** Prefix and namespace of current element. 
*/ 212 | PrefixDefinition elementPrefix; 213 | PrefixDefinition attributePrefix; 214 | PrefixDefinition *memberPrefix = &attributePrefix; 215 | 216 | std::vector prefixStack; 217 | std::vector elementStack; 218 | 219 | PatriciaCursor cursor; 220 | 221 | unsigned char *nameCharTbl; 222 | unsigned char *nameStartCharTbl; 223 | const char *pattern; 224 | 225 | State state; 226 | State matchState; 227 | State noMatchState; 228 | State partialMatchState; 229 | State afterNameState; 230 | State afterTextState; 231 | State afterMatchTrieState; 232 | /** Next state after reading an element, attribute or processing instruction 233 | * name, a text node or an attribute value. */ 234 | State nextState; 235 | /** Next state if the current character was not the expected one. */ 236 | State otherState; 237 | /** Next state after reading an attribute value. Regular elements and 238 | * processing instructions need different handling. */ 239 | State afterValueState; 240 | /** Flag whether the previously emitted name was found in a trie. */ 241 | bool knownName; 242 | 243 | TagType tagType; 244 | MatchTarget matchTarget; 245 | 246 | unsigned char textEndChar; 247 | 248 | /** Expected character for moving to another state. 
*/ 249 | unsigned char expected; 250 | 251 | size_t pos; 252 | 253 | uint32_t row; 254 | uint32_t col; 255 | 256 | uint32_t idToken; 257 | uint32_t idPrefix; 258 | 259 | uint32_t idElement; 260 | 261 | uint32_t sgmlNesting; 262 | 263 | TokenType nameTokenType; 264 | TokenType textTokenType; 265 | TokenType valueTokenType; 266 | const unsigned char *tokenStart; 267 | 268 | // TODO: Maybe this could be std::function 269 | std::unique_ptr flushTokens; 270 | 271 | Patricia Namespace :: *trie; 272 | 273 | nbind::Buffer tokenBuffer; 274 | uint32_t *tokenList; 275 | const uint32_t *tokenBufferEnd; 276 | 277 | }; 278 | -------------------------------------------------------------------------------- /src/writer/Writer.ts: -------------------------------------------------------------------------------- 1 | import * as stream from 'stream'; 2 | 3 | import { Namespace } from '../Namespace'; 4 | import { TokenChunk } from '../parser/TokenChunk'; 5 | import { Token, TokenBuffer, TokenKind, MemberToken, SgmlToken } from '../parser/Token'; 6 | 7 | export const enum Indent { 8 | MIN_DEPTH = 1, 9 | MAX_DEPTH = 256 10 | } 11 | 12 | export const enum State { 13 | ELEMENT = 0, 14 | PROCESSING, 15 | SGML, 16 | SGML_TEXT, 17 | TEXT, 18 | AFTER_TEXT, 19 | COMMENT, 20 | CDATA 21 | } 22 | 23 | export const indentPattern = '\n' + new Array(Indent.MAX_DEPTH).join('\t'); 24 | 25 | export class Writer extends stream.Transform { 26 | 27 | /** @param data Arbitrary data passed to any custom serializers. 
*/ 28 | 29 | constructor(private data?: any) { 30 | super({ objectMode: true }); 31 | } 32 | 33 | transform(chunk: TokenChunk | TokenBuffer | string, partList: string[]) { 34 | const prefixList = this.prefixList; 35 | const chunkCount = this.chunkCount++; 36 | let buffer: TokenBuffer; 37 | let state = this.state; 38 | let depth = this.depth; 39 | let indent = this.indent; 40 | let nsElement = this.nsElement; 41 | let token: typeof buffer[0]; 42 | let member: MemberToken; 43 | let prefix: string; 44 | let serialized: string | TokenBuffer; 45 | 46 | let partNum = partList.length - 1; 47 | let lastNum = chunk.length - 1; 48 | let tokenNum = -1; 49 | let namespaceList: (Namespace | undefined)[] | undefined; 50 | 51 | if(typeof(chunk) == 'string') { 52 | partList.push(chunk); 53 | return(partList); 54 | } else if(chunk instanceof TokenChunk) { 55 | buffer = chunk.buffer; 56 | namespaceList = chunk.namespaceList; 57 | } else { 58 | buffer = chunk; 59 | } 60 | 61 | if(!chunkCount) { 62 | if(!namespaceList) { 63 | namespaceList = []; 64 | 65 | while(tokenNum < lastNum) { 66 | token = buffer[++tokenNum]; 67 | 68 | if(token instanceof MemberToken) { 69 | namespaceList[token.ns.id] = token.ns; 70 | } 71 | } 72 | 73 | tokenNum = -1; 74 | } 75 | 76 | this.copyPrefixes(namespaceList); 77 | } 78 | 79 | while(tokenNum < lastNum) { 80 | 81 | token = buffer[++tokenNum]; 82 | 83 | if(token instanceof Token) { 84 | switch(token.kind) { 85 | case TokenKind.open: 86 | 87 | member = token as MemberToken; 88 | nsElement = member.ns; 89 | partList[++partNum] = indent + '<' + prefixList[nsElement.id] + member.name; 90 | 91 | if(nsElement.isSpecial && nsElement.defaultPrefix == '?') { 92 | state = State.PROCESSING; 93 | } else { 94 | if(depth++ == Indent.MIN_DEPTH) partList[++partNum] = this.xmlnsDefinitions; 95 | state = State.ELEMENT; 96 | } 97 | 98 | indent = indentPattern.substr(0, depth); 99 | break; 100 | 101 | case TokenKind.sgmlEmitted: 102 | 103 | this.sgmlSeparator = ''; 109 | 110 
| state = State.TEXT; 111 | break; 112 | 113 | case TokenKind.close: 114 | 115 | if(state == State.PROCESSING) { 116 | partList[++partNum] = '?>'; 117 | } else { 118 | member = token as MemberToken; 119 | indent = indentPattern.substr(0, --depth); 120 | 121 | if(state == State.ELEMENT) { 122 | partList[++partNum] = '/>'; 123 | } else { 124 | if(state != State.AFTER_TEXT) partList[++partNum] = indent; 125 | partList[++partNum] = '' 126 | } 127 | } 128 | 129 | state = State.TEXT; 130 | break; 131 | 132 | case TokenKind.string: 133 | 134 | member = token as MemberToken; 135 | // Omit prefixes for attributes in the same namespace 136 | // as their parent element. 137 | if(member.ns == nsElement) prefix = ''; 138 | else prefix = prefixList[member.ns.id]; 139 | 140 | partList[++partNum] = ' ' + prefix + member.name + '='; 141 | break; 142 | 143 | case TokenKind.sgml: 144 | 145 | prefix = (token as SgmlToken).prefix; 146 | 147 | partList[++partNum] = this.sgmlSeparator + prefix + (prefix && ':') + (token as SgmlToken).name; 148 | this.sgmlSeparator = ' '; 149 | break; 150 | 151 | case TokenKind.comment: 152 | 153 | state = State.COMMENT; 154 | break; 155 | 156 | case TokenKind.cdata: 157 | 158 | state = State.CDATA; 159 | break; 160 | 161 | case TokenKind.sgmlNestedStart: 162 | 163 | partList[++partNum] = this.sgmlSeparator + '['; 164 | this.sgmlSeparator = ''; 211 | state = State.AFTER_TEXT; 212 | break; 213 | 214 | case State.ELEMENT: 215 | case State.PROCESSING: 216 | 217 | partList[++partNum] = '"' + token + '"'; 218 | break; 219 | 220 | case State.COMMENT: 221 | 222 | partList[++partNum] = indent + ' 271 | case '!': 272 | 273 | tagType = TagType :: ELEMENT; 274 | state = State :: BEFORE_SGML; 275 | break; 276 | 277 | // An SGML or an XML processing 278 | // instruction. 
279 | case '?': 280 | 281 | afterNameState = State :: AFTER_PROCESSING_NAME; 282 | afterValueState = State :: AFTER_PROCESSING_VALUE; 283 | nameTokenType = TokenType :: OPEN_ELEMENT_ID; 284 | 285 | tagType = TagType :: PROCESSING; 286 | matchTarget = MatchTarget :: ELEMENT; 287 | 288 | // Put unknown processing instructions in a placeholder namespace. 289 | elementPrefix.idPrefix = config.processingPrefixToken; 290 | elementPrefix.idNamespace = config.namespacePrefixTbl[config.processingPrefixToken].first; 291 | memberPrefix = &elementPrefix; 292 | 293 | ns = config.namespacePrefixTbl[config.processingPrefixToken].second; 294 | 295 | cursor.init(ns->*trie); 296 | 297 | tokenStart = p; 298 | 299 | state = State :: MATCH_TRIE; 300 | afterMatchTrieState = State :: NAME; 301 | break; 302 | 303 | // A closing element (no whitespace after '<'). 304 | case '/': 305 | afterNameState = State :: AFTER_CLOSE_ELEMENT_NAME; 306 | nameTokenType = TokenType :: CLOSE_ELEMENT_ID; 307 | 308 | tagType = TagType :: ELEMENT; 309 | matchTarget = MatchTarget :: ELEMENT; 310 | state = State :: BEFORE_NAME; 311 | break; 312 | 313 | // An element . May be self-closing. 314 | default: 315 | afterNameState = State :: STORE_ELEMENT_NAME; 316 | afterValueState = State :: AFTER_ATTRIBUTE_VALUE; 317 | nameTokenType = TokenType :: OPEN_ELEMENT_ID; 318 | memberPrefix = &elementPrefix; 319 | 320 | tagType = TagType :: ELEMENT; 321 | matchTarget = MatchTarget :: ELEMENT; 322 | state = State :: BEFORE_NAME; 323 | // Avoid consuming the first character. 324 | goto BEFORE_NAME; 325 | } 326 | 327 | break; 328 | 329 | // Skip any whitespace before an element name. XML doesn't 330 | // actually allow any, so this state could be removed for 331 | // stricter parsing. 
332 | /* 333 | case State :: BEFORE_ELEMENT_NAME: BEFORE_ELEMENT_NAME: 334 | 335 | if(whiteCharTbl[c]) break; 336 | 337 | state = State :: BEFORE_NAME; 338 | goto BEFORE_NAME; 339 | */ 340 | 341 | // ----------------------------------------- 342 | // Element and attribute name parsing begins 343 | // ----------------------------------------- 344 | 345 | // Start matching a name to known names in a Patricia trie. 346 | case State :: BEFORE_NAME: BEFORE_NAME: 347 | 348 | // The current character must be the valid first character of 349 | // an element or attribute name, anything else is an error. 350 | if(!nameStartCharTbl[c]) { 351 | return(ErrorType :: INVALID_CHAR); 352 | } 353 | 354 | // Look for a ":" separator indicating a qualified name (starts 355 | // with a namespace prefix). If the entire name doesn't fit in 356 | // the input buffer, we first try to parse as a qualified name. 357 | // This is an optional lookup to avoid later reprocessing. 358 | for(ahead = 0; ahead + 1 < len && nameCharTbl[p[ahead]]; ++ahead) {} 359 | 360 | if(matchTarget == MatchTarget :: ELEMENT) { 361 | elementPrefix.idPrefix = config.emptyPrefixToken; 362 | elementPrefix.idNamespace = config.namespacePrefixTbl[config.emptyPrefixToken].first; 363 | ns = config.namespacePrefixTbl[config.emptyPrefixToken].second; 364 | } else { 365 | // By default, attributes belong to the same namespace as their parent element. 366 | attributePrefix.idPrefix = elementPrefix.idPrefix; 367 | attributePrefix.idNamespace = elementPrefix.idNamespace; 368 | ns = config.namespaceList[elementPrefix.idNamespace].get(); 369 | // If element namespace prefix was known but undefined, 370 | // try the default namespace to allow matching the magic xmlns attribute. 371 | if(ns == nullptr) ns = config.namespacePrefixTbl[config.emptyPrefixToken].second; 372 | } 373 | 374 | // Prepare Patricia tree cursor for parsing. 
375 | if(ahead + 1 >= len || p[ahead] == ':') { 376 | // If the input ran out, assume the name contains a colon 377 | // in the next input buffer chunk. If a colon is found, the 378 | // name starts with a namespace prefix. 379 | 380 | if(matchTarget == MatchTarget :: ELEMENT) { 381 | matchTarget = MatchTarget :: ELEMENT_NAMESPACE; 382 | } else { 383 | matchTarget = MatchTarget :: ATTRIBUTE_NAMESPACE; 384 | } 385 | cursor.init(config.prefixTrie); 386 | } else { 387 | if(ns == nullptr) { 388 | // No default namespace is defined, so this element 389 | // cannot be matched with anything. 390 | writeToken(TokenType :: PREFIX_ID, (memberPrefix->idNamespace << 14) | memberPrefix->idPrefix, tokenPtr); 391 | writeToken(TokenType :: UNKNOWN_START_OFFSET, p - 1 - chunkBuffer, tokenPtr); 392 | 393 | idToken = Patricia :: notFound; 394 | state = State :: UNKNOWN_NAME; 395 | goto UNKNOWN_NAME; 396 | } 397 | 398 | cursor.init(ns->*trie); 399 | } 400 | 401 | tokenStart = p - 1; 402 | 403 | state = State :: MATCH_TRIE; 404 | afterMatchTrieState = State :: NAME; 405 | goto MATCH_TRIE; 406 | 407 | case State :: MATCH_TRIE: MATCH_TRIE: 408 | 409 | // Fast inner loop for matching to known element and attribute names. 410 | while(cursor.advance(c)) { 411 | updateRowCol(c); 412 | if(!--len) { 413 | pos += p - tokenStart; 414 | return(ErrorType :: OK); 415 | } 416 | c = *p++; 417 | } 418 | 419 | state = afterMatchTrieState; 420 | continue; 421 | 422 | case State :: NAME: 423 | 424 | if(!nameCharTbl[c]) { 425 | // If the whole name was matched, get associated reference. 426 | idToken = cursor.getData(); 427 | 428 | // Test for an attribute "xmlns:..." defining a namespace 429 | // prefix. 
430 | 431 | if(tagType == TagType :: ELEMENT && ( 432 | ( 433 | matchTarget == MatchTarget :: ATTRIBUTE_NAMESPACE && 434 | idToken == config.xmlnsPrefixToken 435 | ) || ( 436 | matchTarget == MatchTarget :: ATTRIBUTE && 437 | idToken == config.xmlnsToken 438 | ) 439 | )) { 440 | if(c == ':') { 441 | pos = 0; 442 | state = State :: DEFINE_XMLNS_BEFORE_PREFIX_NAME; 443 | break; 444 | } else { 445 | // Prepare to set the default namespace. 446 | nameTokenType = TokenType :: XMLNS_ID; 447 | afterNameState = State :: DEFINE_XMLNS_AFTER_PREFIX_NAME; 448 | idToken = config.emptyPrefixToken; 449 | } 450 | } 451 | 452 | if(idToken != Patricia :: notFound) { 453 | if(c == ':' && tagType == TagType :: ELEMENT) { 454 | // If matching a namespace, use it. 455 | if( 456 | matchTarget == MatchTarget :: ELEMENT_NAMESPACE || 457 | matchTarget == MatchTarget :: ATTRIBUTE_NAMESPACE 458 | ) { 459 | if(idToken >= namespacePrefixTblSize) { 460 | return(ErrorType :: TOO_MANY_PREFIXES); 461 | } 462 | 463 | memberPrefix->idPrefix = idToken; 464 | memberPrefix->idNamespace = config.namespacePrefixTbl[idToken].first; 465 | 466 | if(matchTarget == MatchTarget :: ELEMENT_NAMESPACE) { 467 | matchTarget = MatchTarget :: ELEMENT; 468 | } else { 469 | matchTarget = MatchTarget :: ATTRIBUTE; 470 | } 471 | 472 | ns = config.namespacePrefixTbl[idToken].second; 473 | 474 | if(ns == nullptr) { 475 | // Found a known but undeclared namespace 476 | // prefix, valid if declared with an xmlns 477 | // attribute in the same element. 
478 | 479 | writeToken(TokenType :: PREFIX_ID, (memberPrefix->idNamespace << 14) | memberPrefix->idPrefix, tokenPtr); 480 | writeToken(TokenType :: UNKNOWN_START_OFFSET, p - chunkBuffer, tokenPtr); 481 | 482 | idToken = Patricia :: notFound; 483 | pos = 0; 484 | state = State :: UNKNOWN_NAME; 485 | break; 486 | } 487 | 488 | pos = 0; 489 | tokenStart = p; 490 | cursor.init(ns->*trie); 491 | 492 | state = State :: MATCH_TRIE; 493 | break; 494 | } else { 495 | // TODO: Reintepret token up to cursor as a 496 | // namespace prefix. 497 | } 498 | break; 499 | } else if( 500 | matchTarget == MatchTarget :: ELEMENT_NAMESPACE || 501 | matchTarget == MatchTarget :: ATTRIBUTE_NAMESPACE 502 | ) { 503 | // TODO: Reintepret token up to cursor as an 504 | // element or attribute name according to 505 | // nameTokenType. 506 | } 507 | 508 | if(nameTokenType != TokenType :: XMLNS_ID) { 509 | if(!updateElementStack(nameTokenType)) return(ErrorType :: OTHER); 510 | writeToken(TokenType :: PREFIX_ID, (memberPrefix->idNamespace << 14) | memberPrefix->idPrefix, tokenPtr); 511 | } 512 | writeToken(nameTokenType, idToken, tokenPtr); 513 | 514 | knownName = true; 515 | pos = 0; 516 | state = afterNameState; 517 | continue; 518 | } else { 519 | // TODO: Verify emitting partial name works in this case. 520 | } 521 | } 522 | 523 | pos += p - tokenStart; 524 | 525 | // For partial matches, emit the matched part of a name. 526 | emitPartialName( 527 | p, 528 | static_cast(p - chunkBuffer), 529 | ( 530 | matchTarget == MatchTarget :: ELEMENT ? 531 | TokenType :: PARTIAL_ELEMENT_ID : ( 532 | matchTarget == MatchTarget :: ATTRIBUTE ? 533 | TokenType :: PARTIAL_ATTRIBUTE_ID : 534 | TokenType :: PARTIAL_PREFIX_ID 535 | ) 536 | ), 537 | tokenPtr 538 | ); 539 | 540 | idToken = Patricia :: notFound; 541 | pos = 0; 542 | state = State :: UNKNOWN_NAME; 543 | goto UNKNOWN_NAME; 544 | 545 | // From this part onwards, the name was not found in any applicable 546 | // Patricia trie. 
547 | case State :: UNKNOWN_NAME: UNKNOWN_NAME: 548 | 549 | while(nameCharTbl[c]) { 550 | updateRowCol(c); 551 | if(!--len) return(ErrorType :: OK); 552 | c = *p++; 553 | } 554 | 555 | if(c == ':' && tagType == TagType :: ELEMENT) { 556 | // Found a new, undeclared namespace prefix, valid if 557 | // declared with an xmlns attribute in the same element. 558 | 559 | writeToken( 560 | TokenType :: UNKNOWN_PREFIX_END_OFFSET, 561 | p - chunkBuffer - 1, 562 | tokenPtr 563 | ); 564 | 565 | // Flush tokens to regenerate prefix trie in JavaScript. 566 | flush(tokenPtr); 567 | 568 | // Namespace is unknown so prepare to emit the name. 569 | writeToken(TokenType :: UNKNOWN_START_OFFSET, p - chunkBuffer, tokenPtr); 570 | break; 571 | } 572 | 573 | if(nameTokenType != TokenType :: XMLNS_ID) { 574 | if(!updateElementStack(nameTokenType)) return(ErrorType :: OTHER); 575 | writeToken(TokenType :: PREFIX_ID, (memberPrefix->idNamespace << 14) | memberPrefix->idPrefix, tokenPtr); 576 | } 577 | writeToken( 578 | static_cast( 579 | static_cast(TokenType :: UNKNOWN_OPEN_ELEMENT_END_OFFSET) - 580 | static_cast(TokenType :: OPEN_ELEMENT_ID) + 581 | static_cast(nameTokenType) 582 | ), 583 | p - chunkBuffer - 1, 584 | tokenPtr 585 | ); 586 | 587 | knownName = false; 588 | state = afterNameState; 589 | continue; 590 | 591 | // --------------------------------------- 592 | // Element and attribute name parsing ends 593 | // --------------------------------------- 594 | 595 | case State :: STORE_ELEMENT_NAME: 596 | 597 | // Store element name ID (already output) to verify closing element. 598 | // TODO: Push to a stack and verify! 599 | idElement = idToken; 600 | 601 | state = State :: AFTER_ELEMENT_NAME; 602 | goto AFTER_ELEMENT_NAME; 603 | 604 | // Inside an element start tag with the name already parsed. 
605 | case State :: AFTER_ELEMENT_NAME: AFTER_ELEMENT_NAME: 606 | 607 | switch(c) { 608 | case '/': 609 | 610 | if(!updateElementStack(TokenType :: CLOSE_ELEMENT_ID)) return(ErrorType :: OTHER); 611 | writeToken(TokenType :: CLOSED_ELEMENT_EMITTED, idElement, tokenPtr); 612 | 613 | expected = '>'; 614 | nextState = State :: BEFORE_TEXT; 615 | otherState = State :: PARSE_ERROR; 616 | 617 | state = State :: EXPECT; 618 | break; 619 | 620 | case '>': 621 | 622 | writeToken(TokenType :: ELEMENT_EMITTED, idElement, tokenPtr); 623 | 624 | state = State :: BEFORE_TEXT; 625 | break; 626 | 627 | default: 628 | 629 | if(whiteCharTbl[c]) break; 630 | else { 631 | // First read an attribute name. 632 | state = State :: BEFORE_NAME; 633 | matchTarget = MatchTarget :: ATTRIBUTE; 634 | nameTokenType = TokenType :: ATTRIBUTE_ID; 635 | memberPrefix = &attributePrefix; 636 | trie = &Namespace :: attributeTrie; 637 | 638 | // Then equals sign and opening double quote. 639 | afterNameState = State :: MATCH_SPARSE; 640 | pattern = "=\""; 641 | noMatchState = State :: PARSE_ERROR; 642 | partialMatchState = State :: QUOTE; 643 | 644 | // Finally text content up to closing double quote. 645 | matchState = State :: TEXT; 646 | textTokenType = TokenType :: VALUE_START_OFFSET; 647 | textEndChar = '"'; 648 | afterTextState = afterValueState; 649 | 650 | // Attribute name. 651 | goto BEFORE_NAME; 652 | } 653 | } 654 | 655 | break; 656 | 657 | case State :: AFTER_CLOSE_ELEMENT_NAME: 658 | if(c == '>') { 659 | state = State :: BEFORE_TEXT; 660 | } else if(!whiteCharTbl[c]) { 661 | return(ErrorType :: PROHIBITED_WHITESPACE); 662 | } 663 | 664 | break; 665 | 666 | // ------------------------------ 667 | // Attribute value parsing begins 668 | // ------------------------------ 669 | 670 | // Enforce whitespace between attributes. 
671 | case State :: AFTER_ATTRIBUTE_VALUE: AFTER_ATTRIBUTE_VALUE: 672 | 673 | switch(c) { 674 | case '/': 675 | case '>': 676 | 677 | // Switch states without consuming character. 678 | state = State :: AFTER_ELEMENT_NAME; 679 | goto AFTER_ELEMENT_NAME; 680 | 681 | default: 682 | 683 | if(whiteCharTbl[c]) { 684 | state = State :: AFTER_ELEMENT_NAME; 685 | break; 686 | } else { 687 | return(ErrorType :: INVALID_CHAR); 688 | } 689 | } 690 | 691 | break; 692 | 693 | // Finished reading an attribute name beginning "xmlns:". 694 | // Parse the namespace prefix it defines. 695 | case State :: DEFINE_XMLNS_BEFORE_PREFIX_NAME: 696 | 697 | tokenStart = p - 1; 698 | 699 | // Prepare Patricia tree cursor for parsing an xmlns prefix. 700 | state = State :: MATCH_TRIE; 701 | cursor.init(config.prefixTrie); 702 | 703 | // TODO: Better use a state without handling of the : char. 704 | afterMatchTrieState = State :: NAME; 705 | 706 | afterNameState = State :: DEFINE_XMLNS_AFTER_PREFIX_NAME; 707 | // Prepare to emit the chosen namespace prefix. 708 | nameTokenType = TokenType :: XMLNS_ID; 709 | 710 | goto MATCH_TRIE; 711 | 712 | case State :: DEFINE_XMLNS_AFTER_PREFIX_NAME: 713 | 714 | if(knownName) { 715 | // Store index of namespace prefix in prefix mapping table 716 | // for assigning a new namespace URI. 717 | idPrefix = idToken; 718 | } else { 719 | // If the name was unrecognized, flush tokens so JavaScript 720 | // updates the namespace prefix trie and this tokenizer can 721 | // recognize it in the future. 722 | flush(tokenPtr); 723 | } 724 | 725 | // Match equals sign and namespace URI in double quotes. 
726 | state = State :: MATCH_SPARSE; 727 | pattern = "=\""; 728 | noMatchState = State :: PARSE_ERROR; 729 | partialMatchState = State :: QUOTE; 730 | 731 | matchState = State :: BEFORE_VALUE; 732 | cursor.init(config.uriTrie); 733 | valueTokenType = TokenType :: URI_ID; 734 | textEndChar = '"'; 735 | 736 | afterValueState = State :: DEFINE_XMLNS_AFTER_URI; 737 | 738 | goto MATCH_SPARSE; 739 | 740 | case State :: BEFORE_VALUE: 741 | 742 | tokenStart = p - 1; 743 | 744 | state = State :: MATCH_TRIE; 745 | afterMatchTrieState = State :: VALUE; 746 | goto MATCH_TRIE; 747 | 748 | // Parse a value that should match a known set. Similar to 749 | // State :: NAME but reads up to and consumes a final double quote. 750 | case State :: VALUE: 751 | 752 | if(c == textEndChar) { 753 | // If the whole value was matched, get associated reference. 754 | idToken = cursor.getData(); 755 | 756 | if(idToken != Patricia :: notFound) { 757 | if(valueTokenType == TokenType :: URI_ID) { 758 | valueTokenType = TokenType :: NAMESPACE_ID; 759 | idToken = config.namespaceByUriToken[idToken].first; 760 | } 761 | writeToken(valueTokenType, idToken, tokenPtr); 762 | 763 | knownName = true; 764 | pos = 0; 765 | state = afterValueState; 766 | break; 767 | } else { 768 | // TODO: Verify emitting partial name works in this case. 769 | } 770 | } 771 | 772 | pos += p - tokenStart; 773 | 774 | emitPartialName( 775 | p, 776 | static_cast(p - chunkBuffer), 777 | TokenType :: PARTIAL_URI_ID, 778 | tokenPtr 779 | ); 780 | 781 | idToken = Patricia :: notFound; 782 | pos = 0; 783 | state = State :: UNKNOWN_VALUE; 784 | goto UNKNOWN_VALUE; 785 | 786 | case State :: UNKNOWN_VALUE: UNKNOWN_VALUE: 787 | 788 | while(1) { 789 | if(!valueCharTbl[c]) { 790 | if(c == textEndChar) break; 791 | 792 | switch(c) { 793 | case '&': 794 | 795 | // TODO: Handle entities. 796 | break; 797 | 798 | case '"': 799 | case '\'': 800 | case '<': 801 | case '>': 802 | 803 | // TODO: Stricter parsing would ban these. 
804 | break; 805 | 806 | case ']': 807 | 808 | break; 809 | 810 | default: 811 | 812 | // Disallow nonsense bytes. 813 | return(ErrorType :: INVALID_CHAR); 814 | } 815 | } 816 | 817 | updateRowCol(c); 818 | if(!--len) return(ErrorType :: OK); 819 | c = *p++; 820 | } 821 | 822 | writeToken( 823 | static_cast( 824 | static_cast(TokenType :: UNKNOWN_OPEN_ELEMENT_END_OFFSET) - 825 | static_cast(TokenType :: OPEN_ELEMENT_ID) + 826 | static_cast(valueTokenType) 827 | ), 828 | p - chunkBuffer - 1, 829 | tokenPtr 830 | ); 831 | 832 | knownName = false; 833 | state = afterValueState; 834 | break; 835 | 836 | case State :: DEFINE_XMLNS_AFTER_URI: 837 | 838 | if(knownName) { 839 | bindPrefix(idPrefix, idToken); 840 | } else { 841 | // If the value was unrecognized, flush tokens so JavaScript 842 | // updates the uri trie and this tokenizer can recognize it 843 | // in the future. 844 | flush(tokenPtr); 845 | 846 | // Reset element namespace to correctly match any following attributes. 847 | elementPrefix.idNamespace = config.namespacePrefixTbl[elementPrefix.idPrefix].first; 848 | } 849 | 850 | afterValueState = State :: AFTER_ATTRIBUTE_VALUE; 851 | 852 | state = State :: AFTER_ATTRIBUTE_VALUE; 853 | goto AFTER_ATTRIBUTE_VALUE; 854 | 855 | // ---------------------------- 856 | // Attribute value parsing ends 857 | // ---------------------------- 858 | 859 | // Tag starting with 877 | case '-': 878 | 879 | expected = '-'; 880 | nextState = State :: BEFORE_COMMENT; 881 | otherState = State :: PARSE_ERROR; 882 | 883 | state = State :: EXPECT; 884 | break; 885 | 886 | default: 887 | 888 | // writeToken(TokenType :: SGML_START, 0, tokenPtr); 889 | goto SGML_DECLARATION; 890 | } 891 | break; 892 | 893 | case State :: SGML_DECLARATION: SGML_DECLARATION: 894 | 895 | if(whiteCharTbl[c]) break; 896 | 897 | switch(c) { 898 | case '"': 899 | case '\'': 900 | 901 | textTokenType = TokenType :: SGML_TEXT_START_OFFSET; 902 | textEndChar = c; 903 | afterTextState = State :: SGML_DECLARATION; 
904 | 905 | state = State :: TEXT; 906 | break; 907 | 908 | case '>': 909 | 910 | writeToken(TokenType :: SGML_EMITTED, 0, tokenPtr); 911 | 912 | nameCharTbl = xmlNameCharTbl; 913 | nameStartCharTbl = xmlNameStartCharTbl; 914 | 915 | state = State :: BEFORE_TEXT; 916 | break; 917 | 918 | default: 919 | 920 | matchTarget = MatchTarget :: ELEMENT; 921 | nameTokenType = TokenType :: SGML_ID; 922 | memberPrefix = &elementPrefix; 923 | 924 | nameCharTbl = dtdNameCharTbl; 925 | nameStartCharTbl = dtdNameCharTbl; 926 | afterNameState = State :: SGML_DECLARATION; 927 | 928 | state = State :: BEFORE_NAME; 929 | goto BEFORE_NAME; 930 | 931 | case '[': 932 | 933 | // Signal start of DTD embedded in DOCTYPE. 934 | writeToken(TokenType :: SGML_NESTED_START, 0, tokenPtr); 935 | ++sgmlNesting; 936 | 937 | nameCharTbl = xmlNameCharTbl; 938 | nameStartCharTbl = xmlNameStartCharTbl; 939 | 940 | state = State :: BEFORE_TEXT; 941 | break; 942 | } 943 | break; 944 | 945 | // Inside a processing instruction with the name already parsed. 946 | case State :: AFTER_PROCESSING_NAME: AFTER_PROCESSING_NAME: 947 | 948 | switch(c) { 949 | case '?': 950 | 951 | // End of an XML processing instruction. 952 | // Handle like a self-closing element. 953 | c = '/'; 954 | state = State :: AFTER_ELEMENT_NAME; 955 | goto AFTER_ELEMENT_NAME; 956 | 957 | case '>': 958 | 959 | // End of an SGML processing instruction. 960 | if(!updateElementStack(TokenType :: CLOSE_ELEMENT_ID)) return(ErrorType :: OTHER); 961 | writeToken(TokenType :: CLOSED_ELEMENT_EMITTED, idElement, tokenPtr); 962 | 963 | state = State :: BEFORE_TEXT; 964 | break; 965 | 966 | case '/': 967 | 968 | return(ErrorType :: INVALID_CHAR); 969 | 970 | default: 971 | 972 | // Switch states without consuming character. 973 | state = State :: AFTER_ELEMENT_NAME; 974 | goto AFTER_ELEMENT_NAME; 975 | } 976 | 977 | break; 978 | 979 | // Enforce whitespace between processing instruction attributes. 
980 | case State :: AFTER_PROCESSING_VALUE: 981 | 982 | switch(c) { 983 | case '?': 984 | case '>': 985 | 986 | // Switch states without consuming character. 987 | state = State :: AFTER_PROCESSING_NAME; 988 | goto AFTER_PROCESSING_NAME; 989 | 990 | default: 991 | 992 | if(whiteCharTbl[c]) { 993 | state = State :: AFTER_PROCESSING_NAME; 994 | break; 995 | } else { 996 | return(ErrorType :: INVALID_CHAR); 997 | } 998 | } 999 | 1000 | break; 1001 | 1002 | case State :: BEFORE_COMMENT: 1003 | 1004 | writeToken(TokenType :: COMMENT_START_OFFSET, p - chunkBuffer - 1, tokenPtr); 1005 | 1006 | state = State :: COMMENT; 1007 | goto COMMENT; 1008 | 1009 | // Note: the terminating "-->" is included in the output byte range. 1010 | case State :: COMMENT: COMMENT: 1011 | 1012 | while(1) { 1013 | if(c == '-') { 1014 | ++pos; 1015 | } else if(c == '>' && pos >= 2) { 1016 | break; 1017 | } else { 1018 | pos = 0; 1019 | } 1020 | 1021 | updateRowCol(c); 1022 | if(!--len) return(ErrorType :: OK); 1023 | c = *p++; 1024 | } 1025 | 1026 | writeToken( 1027 | TokenType :: COMMENT_END_OFFSET, 1028 | p - chunkBuffer, 1029 | tokenPtr 1030 | ); 1031 | 1032 | pos = 0; 1033 | state = State :: BEFORE_TEXT; 1034 | break; 1035 | 1036 | case State :: EXPECT: 1037 | 1038 | state = (c == expected) ? nextState : otherState; 1039 | 1040 | if(state == State :: PARSE_ERROR) goto PARSE_ERROR; 1041 | break; 1042 | 1043 | case State :: PARSE_ERROR: PARSE_ERROR: 1044 | 1045 | return(ErrorType :: OTHER); 1046 | 1047 | default: 1048 | 1049 | break; 1050 | } 1051 | 1052 | // Only read the next character at the end of the loop, to allow 1053 | // reprocessing the same character (changing states without 1054 | // consuming input) by using "continue". 
1055 | updateRowCol(c); 1056 | if(!--len) return(ErrorType :: OK); 1057 | c = *p++; 1058 | } 1059 | } 1060 | 1061 | inline void Parser :: emitPartialName( 1062 | const unsigned char *p, 1063 | size_t offset, 1064 | TokenType tokenType, 1065 | uint32_t *&tokenPtr 1066 | ) { 1067 | // Test if the number of characters consumed is more than one, 1068 | // and more than past characters still left in the input buffer. 1069 | // Otherwise we can still take the other, faster branch. 1070 | if(pos > 1 && (pos > offset || DEBUG_PARTIAL_NAME_RECOVERY)) { 1071 | // NOTE: This is a very rare and complicated edge case. 1072 | // Test it with the debug flag to run it more often. 1073 | 1074 | uint32_t id = cursor.findLeaf(); 1075 | 1076 | if(id != Patricia :: notFound) { 1077 | // Emit part length. 1078 | writeToken(TokenType :: PARTIAL_LEN, pos - 1, tokenPtr); 1079 | // Emit the first descendant leaf node, which by definition 1080 | // will begin with this name part (any descendant leaf would work). 1081 | writeToken(tokenType, id, tokenPtr); 1082 | } 1083 | // Emit the offset of the remaining part of the name. 1084 | writeToken(TokenType :: UNKNOWN_START_OFFSET, offset - 1, tokenPtr); 1085 | } else { 1086 | // The consumed part of the name still remains in the 1087 | // input buffer. Simply emit its starting offset. 
1088 | writeToken(TokenType :: UNKNOWN_START_OFFSET, offset - pos, tokenPtr); 1089 | } 1090 | } 1091 | 1092 | struct Init { 1093 | void setRange(unsigned char *tbl, const char *ranges, unsigned char flag) { 1094 | unsigned char c, last; 1095 | 1096 | while((c = *ranges++)) { 1097 | last = *ranges++; 1098 | while(c <= last) tbl[c++] = flag; 1099 | } 1100 | } 1101 | 1102 | Init() { 1103 | for(unsigned int i = 0; i <= 0xff; ++i) { 1104 | whiteCharTbl[i] = 0; 1105 | valueCharTbl[i] = (i >= ' ' && i <= 0xf7); 1106 | xmlNameStartCharTbl[i] = 0; 1107 | xmlNameCharTbl[i] = 0; 1108 | dtdNameCharTbl[i] = 0; 1109 | } 1110 | 1111 | for(unsigned char c : "\r\n\t ") c && (valueCharTbl[c] = 1, whiteCharTbl[c] = 1); 1112 | 1113 | for(unsigned char c : "\"'&<>]\x7f") c && (valueCharTbl[c] = 0); 1114 | 1115 | setRange(xmlNameStartCharTbl, "__AZaz\x80\xf7", 1); 1116 | setRange(xmlNameCharTbl, "..--09__AZaz\x80\xf7", 1); 1117 | setRange(dtdNameCharTbl, "##%%..--09__AZaz\x80\xf7", 1); 1118 | } 1119 | }; 1120 | 1121 | Init init; 1122 | 1123 | #include 1124 | 1125 | #ifdef NBIND_CLASS 1126 | 1127 | NBIND_ALIAS(Parser :: ErrorType, int32_t); 1128 | 1129 | NBIND_CLASS(Parser) { 1130 | construct(); 1131 | method(getConfig); 1132 | method(setCodeBuffer); 1133 | method(setPrefix); 1134 | method(bindPrefix); 1135 | getter(getRow); 1136 | getter(getCol); 1137 | method(parse); 1138 | method(destroy); 1139 | } 1140 | 1141 | #endif 1142 | --------------------------------------------------------------------------------