├── test
    ├── words.txt
    ├── .gitignore
    ├── test-sax.ts
    ├── tsconfig.json
    ├── test-expat.ts
    ├── node-expat.d.ts
    └── test.ts
├── autogypi.json
├── .gitignore
├── src
    ├── tokenizer
    │   ├── ErrorType.ts
    │   ├── TokenSpace.ts
    │   ├── CodeType.ts
    │   ├── TokenSet.ts
    │   └── Patricia.ts
    ├── schema
    │   ├── AttributeGroup.ts
    │   ├── Attribute.ts
    │   ├── Group.ts
    │   ├── Member.ts
    │   ├── ComplexType.ts
    │   ├── Element.ts
    │   └── SimpleSchema.ts
    ├── parser
    │   ├── ParserLib.ts
    │   ├── TokenChunk.ts
    │   ├── ParserStream.ts
    │   ├── Stitcher.ts
    │   ├── InternalToken.ts
    │   ├── ParserNamespace.ts
    │   ├── Lib.d.ts
    │   ├── JSX.ts
    │   ├── Token.ts
    │   ├── ParserConfig.ts
    │   └── Parser.ts
    ├── tsconfig.json
    ├── index.ts
    ├── builder
    │   ├── BuilderConfig.ts
    │   ├── RuleSet.ts
    │   └── Builder.ts
    ├── Namespace.ts
    ├── CRC32.ts
    ├── Buffer.ts
    └── writer
    │   ├── JsonWriter.ts
    │   └── Writer.ts
├── .npmignore
├── lib
    ├── Namespace.cc
    ├── Patricia.cc
    ├── Namespace.h
    ├── Patricia.h
    ├── ParserConfig.cc
    ├── PatriciaCursor.h
    ├── ParserConfig.h
    ├── README.md
    ├── PatriciaCursor.cc
    ├── Parser.h
    └── Parser.cc
├── binding.gyp
├── .travis.yml
├── appveyor.yml
├── README.md
├── LICENSE
└── package.json


/test/words.txt:
--------------------------------------------------------------------------------
1 | foobar
2 | foo
3 | 


--------------------------------------------------------------------------------
/test/.gitignore:
--------------------------------------------------------------------------------
1 | *.js
2 | *.d.ts
3 | 


--------------------------------------------------------------------------------
/autogypi.json:
--------------------------------------------------------------------------------
1 | {
2 | 	"dependencies": [
3 | 		"nbind"
4 | 	],
5 | 	"includes": []
6 | }
7 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | node_modules/
 2 | build/
 3 | dist/
 4 | package-lock.json
 5 | auto*.gypi
 6 | *.lock
 7 | *.log.*
 8 | *.log
 9 | *.tgz
10 | 


--------------------------------------------------------------------------------
/src/tokenizer/ErrorType.ts:
--------------------------------------------------------------------------------
1 | export const enum ErrorType { // Error codes; members are numbered in declaration order.
2 | 	OK = 0, // Explicit zero; the members below auto-increment to 1, 2, 3 and 4.
3 | 	INVALID_CHAR,
4 | 	PROHIBITED_WHITESPACE,
5 | 	TOO_MANY_PREFIXES,
6 | 	OTHER
7 | }; // NOTE(review): .npmignore re-includes this file in the package, presumably because other code depends on these numeric values — confirm before reordering.
8 | 


--------------------------------------------------------------------------------
/.npmignore:
--------------------------------------------------------------------------------
 1 | node_modules/
 2 | build/
 3 | src/
 4 | test/
 5 | package-lock.json
 6 | appveyor.yml
 7 | .travis.yml
 8 | *.lock
 9 | *.log.*
10 | *.log
11 | *.tgz
12 | !src/tokenizer/CodeType.ts
13 | !src/tokenizer/ErrorType.ts
14 | 


--------------------------------------------------------------------------------
/src/schema/AttributeGroup.ts:
--------------------------------------------------------------------------------
 1 | import { AttributeSpec } from './Attribute';
 2 | 
 3 | export class AttributeGroup { // Ordered collection of attribute specs.
 4 | 	/** Append an attribute spec to the end of the list. */
 5 | 	addAttribute(spec: AttributeSpec) {
 6 | 		this.list.push(spec);
 7 | 	}
 8 | 
 9 | 	/** List of allowed attributes and attribute groups, in the order they were added. */
10 | 	list: AttributeSpec[] = []
11 | 
12 | }
13 | 


--------------------------------------------------------------------------------
/lib/Namespace.cc:
--------------------------------------------------------------------------------
 1 | #include "Namespace.h"
 2 | 
 3 | #include <nbind/nbind.h>
 4 | 
 5 | #ifdef NBIND_CLASS
 6 | // Binding declarations, compiled only when NBIND_CLASS is defined.
 7 | NBIND_CLASS(Namespace) {
 8 | 	construct<std::string>(); // Constructor taking a single string argument.
 9 | 	method(clone);
10 | 	method(setElementTrie);
11 | 	method(setAttributeTrie);
12 | 	// TODO:
13 | 	// method(setValueTrie);
14 | }
15 | 
16 | #endif
17 | 


--------------------------------------------------------------------------------
/test/test-sax.ts:
--------------------------------------------------------------------------------
 1 | import * as fs from 'fs';
 2 | import * as stream from 'stream';
 3 | import * as sax from 'sax';
 4 | 
 5 | const xml = sax.createStream(true, { position: true }); // First argument true selects sax strict mode.
 6 | // Register an 'opentag' handler that does nothing (its logging is commented out).
 7 | xml.on('opentag', (node: sax.Tag) => {
 8 | 	// console.log(node);
 9 | });
10 | // Pipe the file named by the first command line argument into the parser stream.
11 | fs.createReadStream(process.argv[2]).pipe(xml);
12 | 


--------------------------------------------------------------------------------
/binding.gyp:
--------------------------------------------------------------------------------
 1 | {
 2 | 	"targets": [
 3 | 		{
 4 | 			"includes": [
 5 | 				"auto.gypi"
 6 | 			],
 7 | 			"sources": [
 8 | 				"lib/Patricia.cc",
 9 | 				"lib/PatriciaCursor.cc",
10 | 				"lib/Namespace.cc",
11 | 				"lib/ParserConfig.cc",
12 | 				"lib/Parser.cc"
13 | 			]
14 | 		}
15 | 	],
16 | 	"includes": [
17 | 		"auto-top.gypi"
18 | 	]
19 | }
20 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: node_js
 2 | 
 3 | node_js:
 4 |   - "9"
 5 |   - "8"
 6 |   - "4"
 7 |   - "0.12"
 8 | 
 9 | env:
10 | #  - CC=clang CXX=clang
11 |   - CC=gcc-4.8 CXX=g++-4.8
12 | 
13 | addons:
14 |   apt:
15 |     sources:
16 |       - ubuntu-toolchain-r-test
17 | #      - llvm-toolchain-precise-3.8
18 |     packages:
19 |       - gcc-4.8
20 |       - g++-4.8
21 | #      - clang-3.8
22 | 


--------------------------------------------------------------------------------
/test/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 | 	"compileOnSave": true,
 3 | 	"compilerOptions": {
 4 | 		"declaration": false,
 5 | 		"lib": ["es5", "es2015.collection"],
 6 | 		"module": "commonjs",
 7 | 		"moduleResolution": "node",
 8 | 		"noImplicitAny": true,
 9 | 		"noImplicitThis": true,
10 | 		"removeComments": false,
11 | 		"sourceMap": false,
12 | 		"strictNullChecks": true,
13 | 		"target": "es5"
14 | 	},
15 | 	"files": [
16 | 		"test.ts"
17 | 	]
18 | }
19 | 


--------------------------------------------------------------------------------
/lib/Patricia.cc:
--------------------------------------------------------------------------------
 1 | #include "Patricia.h"
 2 | #include "PatriciaCursor.h"
 3 | 
 4 | uint32_t Patricia :: find(const char *needle) { // needle must be NUL-terminated.
 5 | 	PatriciaCursor cursor;
 6 | 	char c;
 7 | 	// Position a cursor on this trie, then feed it the needle one character at a time.
 8 | 	cursor.init(*this);
 9 | 	while((c = *needle++)) cursor.advance(c); // Stops at the terminating NUL; advance()'s result is not checked.
10 | 	// NOTE(review): presumably yields Patricia::notFound for a missing needle — confirm in PatriciaCursor.
11 | 	return(cursor.getData());
12 | }
13 | 
14 | #include <nbind/nbind.h>
15 | 
16 | #ifdef NBIND_CLASS
17 | // Binding declarations, compiled only when NBIND_CLASS is defined.
18 | NBIND_CLASS(Patricia) {
19 | 	construct<>(); // Default constructor, no arguments.
20 | 	method(setBuffer);
21 | 	method(find);
22 | }
23 | 
24 | #endif
25 | 


--------------------------------------------------------------------------------
/src/parser/ParserLib.ts:
--------------------------------------------------------------------------------
 1 | import * as path from 'path';
 2 | import * as nbind from 'nbind';
 3 | import * as Lib from './Lib';
 4 | 
 5 | export const lib = nbind.init<typeof Lib>(path.resolve(__dirname, '../..')).lib; // Initialize nbind with the directory two levels above this file.
 6 | // Each native class below is exported twice under one name: as a value (the constructor) and as a type.
 7 | export const NativeParser = lib.Parser;
 8 | export type NativeParser = Lib.Parser;
 9 | 
10 | export const NativeNamespace = lib.Namespace;
11 | export type NativeNamespace = Lib.Namespace;
12 | 
13 | export const NativeConfig = lib.ParserConfig;
14 | export type NativeConfig = Lib.ParserConfig;
15 | 


--------------------------------------------------------------------------------
/test/test-expat.ts:
--------------------------------------------------------------------------------
 1 | import * as fs from 'fs';
 2 | import * as stream from 'stream';
 3 | import * as expat from 'node-expat';
 4 | 
 5 | const xml = new expat.Parser(null); // null: no encoding is passed to the parser.
 6 | // Register a 'startElement' handler that does nothing (its logging is commented out).
 7 | xml.on('startElement', (name: string, attributeTbl: {[name: string]: string}) => {
 8 | 	// console.log(name);
 9 | 	// console.log(attributeTbl);
10 | });
11 | // Read the file named by the first command line argument.
12 | const file = fs.createReadStream(process.argv[2]);
13 | // Hand each chunk to the parser as non-final input.
14 | file.on('data', (data: Buffer) => xml.parse(data, false));
15 | // At end of file, pass an empty final chunk.
16 | file.on('end', () => xml.parse('', true));
17 | 


--------------------------------------------------------------------------------
/appveyor.yml:
--------------------------------------------------------------------------------
 1 | version: 0.0.{build}
 2 | skip_tags: true
 3 | os: Visual Studio 2015
 4 | shallow_clone: true
 5 | init:
 6 | - ps: Install-Product node $env:nodejs_version
 7 | environment:
 8 |   matrix:
 9 |   - nodejs_version: "9"
10 |   - nodejs_version: "8"
11 |   - nodejs_version: "4"
12 |   - nodejs_version: "0.12"
13 | matrix:
14 |   allow_failures:
15 |   - nodejs_version: "0.12"
16 | install:
17 | - set PATH=%APPDATA%\npm;%PATH%
18 | - npm install
19 | build: off
20 | test_script:
21 | - node --version
22 | - npm --version
23 | - npm test
24 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | cxml
 2 | ====
 3 | 
 4 | <a href="http://travis-ci.org/charto/cxml"><img src="https://travis-ci.org/charto/cxml.svg?branch=master" alt="Build status"></a>
 5 | <a href="https://ci.appveyor.com/project/jjrv/cxml/branch/master"><img src="https://ci.appveyor.com/api/projects/status/towoy5r7xopeffdy/branch/master?svg=true" alt="Build status"></a>
 6 | 
 7 | This branch is a complete rewrite in progress.
 8 | 
 9 | License
10 | =======
11 | 
12 | [The MIT License](https://raw.githubusercontent.com/charto/cxml/master/LICENSE)
13 | 
14 | Copyright (c) 2017 BusFaster Ltd
15 | 


--------------------------------------------------------------------------------
/src/schema/Attribute.ts:
--------------------------------------------------------------------------------
 1 | import { AttributeGroup } from './AttributeGroup';
 2 | import { MemberSpec, MemberMeta, SimpleType, SimpleValue } from './Member';
 3 | 
 4 | /** Configuration for attributes as type members. */
 5 | 
 6 | export class AttributeSpec extends MemberSpec {
 7 | 
 8 | 	/** Default value to use if the element or attribute is missing. */
 9 | 	default?: SimpleValue;
10 | 	/** Name and other info. Narrows the inherited meta member to AttributeMeta. */
11 | 	meta?: AttributeMeta;
12 | 	/** NOTE(review): presumably set when this spec stands for an attribute group rather than a single attribute — confirm. */
13 | 	group?: AttributeGroup;
14 | 
15 | }
16 | 
17 | export class AttributeMeta extends MemberMeta {
18 | 	/** Redeclares the inherited type member as SimpleType only. */
19 | 	type: SimpleType;
20 | 
21 | }
22 | 


--------------------------------------------------------------------------------
/src/schema/Group.ts:
--------------------------------------------------------------------------------
 1 | import { SimpleElementSpec, ElementSpec } from './Element';
 2 | 
3 | export const enum GroupKind { // Kinds of element group, implicitly numbered 0 to 3.
4 | 	group,
5 | 	all,
6 | 	choice,
7 | 	sequence
8 | } // NOTE(review): names presumably follow the XSD group / all / choice / sequence constructs — confirm.
 9 | 
10 | /** Collection of element specs belonging to one group of the given kind. */
11 | export class Group {
12 | 
13 | 	constructor( public kind: GroupKind ) {}
14 | 
15 | 	/** Append a spec to the list. Specs that carry metadata
16 | 	  * are additionally indexed by the ID of their token. */
17 | 	addElement(spec: SimpleElementSpec | ElementSpec) {
18 | 		this.list.push(spec);
19 | 
20 | 		const meta = spec.meta;
21 | 
22 | 		if(meta) {
23 | 			this.tbl[meta.token.id!] = spec;
24 | 		}
25 | 	}
26 | 
27 | 	/** List of allowed elements and groups. */
28 | 	list: (SimpleElementSpec | ElementSpec)[] = []
29 | 
30 | 	/** Specs that have metadata, keyed by the numeric ID of their token. */
31 | 	tbl: { [id: number]: SimpleElementSpec | ElementSpec } = {};
32 | 
33 | }
34 | 


--------------------------------------------------------------------------------
/src/tsconfig.json:
--------------------------------------------------------------------------------
 1 | {
 2 | 	"compileOnSave": true,
 3 | 	"compilerOptions": {
 4 | 		"declaration": true,
 5 | 		"experimentalDecorators": true,
 6 | 		"lib": [ "dom", "es5", "es2015.promise", "es2015.collection" ],
 7 | 		"module": "commonjs",
 8 | 		"moduleResolution": "node",
 9 | 		"noImplicitAny": true,
10 | 		"noImplicitThis": true,
11 | 		"outDir": "../dist",
12 | 		"removeComments": false,
13 | 		"sourceMap": false,
14 | 		"strictFunctionTypes": true,
15 | 		"strictNullChecks": true,
16 | 		"strictPropertyInitialization": false,
17 | 		"target": "es5"
18 | 	},
19 | 	"files": [
20 | 		"index.ts"
21 | 	]
22 | }
23 | 


--------------------------------------------------------------------------------
/lib/Namespace.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <string>
 4 | 
 5 | #include <nbind/api.h>
 6 | 
 7 | #include "Patricia.h"
 8 | 
 9 | class Namespace {
10 | 	// Namespace URI together with tries for its element and attribute names.
11 | public:
12 | 	/** @param uri Stored in the uri member. */
13 | 	explicit Namespace(std::string uri) : uri(uri) {}
14 | 	/** Return a copy of this object. */
15 | 	Namespace clone() { return(*this); }
16 | 	/** Hand the buffer to the element name trie. */
17 | 	void setElementTrie(nbind::Buffer buffer) {
18 | 		elementTrie.setBuffer(buffer);
19 | 	}
20 | 	/** Hand the buffer to the attribute name trie. */
21 | 	void setAttributeTrie(nbind::Buffer buffer) {
22 | 		attributeTrie.setBuffer(buffer);
23 | 	}
24 | 
25 | 	// TODO:
26 | 	// void setValueTrie(nbind::Buffer buffer) {
27 | 		// valueTrie.setBuffer(buffer);
28 | 	// }
29 | 
30 | 	std::string uri;
31 | 
32 | 	Patricia elementTrie;
33 | 	Patricia attributeTrie;
34 | 	// TODO:
35 | 	// Patricia valueTrie;
36 | 
37 | };
38 | 


--------------------------------------------------------------------------------
/src/parser/TokenChunk.ts:
--------------------------------------------------------------------------------
 1 | import { Namespace } from '../Namespace';
 2 | import { Token, TokenBuffer, TokenKind } from './Token';
 3 | 
 4 | /** Token buffer wrapper. Freed chunks are kept on a static free list and reused. */
 5 | export class TokenChunk {
 6 | 
 7 | 	/** Get a chunk wrapping the given buffer, reusing a freed chunk if one is available.
 8 | 	  * @param buffer Tokens for the chunk, defaults to a new empty array. */
 9 | 	static allocate(buffer: TokenBuffer = []) {
10 | 		const recycled = TokenChunk.first;
11 | 		const chunk = recycled || new TokenChunk();
12 | 
13 | 		// Pop the reused chunk off the head of the free list.
14 | 		if(recycled) TokenChunk.first = recycled.next;
15 | 
16 | 		chunk.buffer = buffer;
17 | 		chunk.length = buffer.length;
18 | 		chunk.namespaceList = void 0;
19 | 		// Clear free list pointer to help GC find garbage also if free() is not called.
20 | 		chunk.next = void 0;
21 | 
22 | 		return(chunk);
23 | 	}
24 | 
25 | 	/** Put this chunk at the head of the free list. Its other members are left untouched. */
26 | 	free() {
27 | 		const head = TokenChunk.first;
28 | 
29 | 		TokenChunk.first = this;
30 | 		this.next = head;
31 | 	}
32 | 
33 | 	/** Set to the buffer length by allocate(). */
34 | 	length: number;
35 | 	buffer: TokenBuffer;
36 | 	/** Next chunk in the free list, cleared by allocate(). */
37 | 	next: TokenChunk | undefined;
38 | 	/** Reset to undefined by allocate(). */
39 | 	namespaceList: (Namespace | undefined)[] | undefined;
40 | 
41 | 	/** Head of the free list. */
42 | 	private static first: TokenChunk | undefined;
43 | 
44 | }
45 | 


--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
 1 | export { ArrayType, encodeArray, decodeArray, concatArray } from './Buffer';
 2 | export { CRC32, Hasher32 } from './CRC32';
 3 | 
 4 | import { Namespace } from './Namespace';
 5 | export { Namespace };
 6 | export { ParserConfig, ParserOptions, TokenTbl, Registry } from './parser/ParserConfig';
 7 | export { Parser, ParseError } from './parser/Parser';
 8 | export { Builder } from './builder/Builder';
 9 | export { Writer } from './writer/Writer';
10 | export { JsonWriter } from './writer/JsonWriter';
11 | export { defineElement, defineAttribute, jsxElement, jsxCompile, jsxExpand } from './parser/JSX';
12 | export { TokenChunk } from './parser/TokenChunk';
13 | export { ElementMeta } from './schema/Element';
14 | export { AttributeMeta } from './schema/Attribute';
15 | export * from './parser/Token';
16 | 
17 | export const processing = Namespace.processing; // Special namespace representing processing instructions.
18 | export const anonymous = Namespace.unknown; // Note the export name differs from the static it aliases.
19 | export const xml1998 = Namespace.xml1998; // Namespace with URI http://www.w3.org/XML/1998/namespace.
20 | 


--------------------------------------------------------------------------------
/src/builder/BuilderConfig.ts:
--------------------------------------------------------------------------------
 1 | import { Namespace } from '../Namespace';
 2 | import { ParserConfig, ParserOptions } from '../parser/ParserConfig';
 3 | import { SimpleSchema, SimpleSchemaSpecTbl } from '../schema/SimpleSchema';
 4 | import { RuleSet } from './RuleSet';
 5 | import { Builder } from './Builder';
 6 | 
 7 | /** Rule sets for all schema namespaces that define a document, keyed by namespace URI. */
 8 | export class BuilderConfig {
 9 | 
10 | 	/** @param parserConfig Supplies the parser options and is passed on to each SimpleSchema.
11 | 	  * @param schemaSpec Table with a [ defaultPrefix, nsUri, spec ] entry per key. */
12 | 	constructor(parserConfig: ParserConfig, schemaSpec: SimpleSchemaSpecTbl) {
13 | 		this.options = parserConfig.options;
14 | 
15 | 		Object.keys(schemaSpec).forEach((prefix) => {
16 | 			const entry = schemaSpec[prefix];
17 | 			const nsUri = entry[1];
18 | 			const spec = entry[2];
19 | 			// A namespace object gets created for every entry, also ones without a document.
20 | 			const ns = new Namespace(entry[0], nsUri);
21 | 
22 | 			if(spec['document']) {
23 | 				this.ruleSetTbl[nsUri] = new RuleSet(new SimpleSchema(parserConfig, ns, spec));
24 | 			}
25 | 		});
26 | 	}
27 | 
28 | 	/** Create a builder bound to this configuration and the given namespace URI. */
29 | 	createBuilder(nsUri: string) {
30 | 		return(new Builder(this, nsUri));
31 | 	}
32 | 
33 | 	/** Options object taken from the parser configuration. */
34 | 	options: ParserOptions;
35 | 	/** Rule sets by namespace URI. */
36 | 	ruleSetTbl: { [uri: string]: RuleSet } = {};
37 | 
38 | }
39 | 


--------------------------------------------------------------------------------
/src/parser/ParserStream.ts:
--------------------------------------------------------------------------------
 1 | import * as stream from 'stream';
 2 | 
 3 | import { ArrayType } from '../Buffer';
 4 | import { Namespace } from '../Namespace';
 5 | import { ParserConfig } from './ParserConfig';
 6 | import { Parser } from './Parser';
 7 | import { TokenChunk } from './TokenChunk';
 8 | import {
 9 | 	Token,
10 | 	TokenBuffer,
11 | 	TokenKind,
12 | } from './Token';
13 | 
14 |  /** XML parser stream, emits tokens with fully qualified names. */
15 | 
16 | export class ParserStream extends stream.Transform {
17 | 	/** @param parser Parser to wrap, by default a new one created from config. The stream runs in object mode. */
18 | 	constructor(config: ParserConfig, public parser = config.createParser()) {
19 | 		super({ objectMode: true });
20 | 	}
21 | 	/** End of input: passes the callback to parser.destroy() and then also calls it directly. NOTE(review): if destroy() invokes the callback too, it runs twice — confirm against Parser.destroy. */
22 | 	_flush( flush: (err: any, chunk: TokenChunk | null) => void) {
23 | 		this.parser.destroy(flush);
24 | 		flush(null, null);
25 | 	}
26 | 	/** Forwards each input chunk, its encoding and the stream callback to parser.write(). */
27 | 	_transform(
28 | 		chunk: string | ArrayType,
29 | 		enc: string,
30 | 		flush: (err: any, chunk: TokenChunk | null) => void
31 | 	) {
32 | 		this.parser.write(chunk, enc, flush);
33 | 	}
34 | 
35 | }
36 | 


--------------------------------------------------------------------------------
/src/schema/Member.ts:
--------------------------------------------------------------------------------
 1 | import { Namespace } from '../Namespace';
 2 | import { MemberToken } from '../parser/Token';
 3 | 
 4 | import { ComplexType } from './ComplexType';
 5 | 
 6 | /** SimpleType equivalent JavaScript data types. */
 7 | 
 8 | export type SimpleValue = string | number | boolean;
 9 | 
10 | /** Configuration for elements and attributes as type members. */
11 | 
12 | export class MemberSpec {
13 | 	/** Both min and max default to 1. NOTE(review): presumably minimum and maximum occurrence counts — confirm. */
14 | 	constructor(
15 | 		public min = 1,
16 | 		public max = 1
17 | 	) {}
18 | 	/** Optional metadata, not set by the constructor. */
19 | 	meta?: MemberMeta;
20 | 
21 | }
22 | 
23 | /** Definition of a type with only text content.
24 |   * Applicable to both elements and attributes. */
25 | 
26 | export class SimpleType {
27 | 	/** Optional reference to another simple type. NOTE(review): presumably the type this one derives from — confirm. */
28 | 	base?: SimpleType;
29 | 
30 | }
31 | 
32 | export class MemberMeta {
33 | 
34 | 	/** @param token Token with element or attribute name and namespace.
35 | 	  * A single token may have different types depending on its parent. */
36 | 	constructor(public token: MemberToken) {}
37 | 	/** True on every new instance. NOTE(review): what a false value means is not visible here — check where this flag is cleared. */
38 | 	exists = true;
39 | 	/** Simple or complex type of the member; declared without an initializer. */
40 | 	type: SimpleType | ComplexType;
41 | 
42 | }
43 | 


--------------------------------------------------------------------------------
/test/node-expat.d.ts:
--------------------------------------------------------------------------------
 1 | declare module "node-expat" {
 2 | 	import * as events from "events";
 3 | 	// Partial typings: members marked TODO below are not declared.
 4 | 	export class Parser extends events.EventEmitter {
 5 | 		constructor(encoding: string | null);
 6 | 		// Parse a chunk of input; isFinal flags the last chunk.
 7 | 		parse(data: string | Buffer, isFinal: boolean): boolean;
 8 | 
 9 | 		setEncoding(encoding: string): boolean;
10 | 		// setUnknownEncoding() TODO
11 | 
12 | 		// getError() TODO
13 | 
14 | 		stop(): boolean;
15 | 		// Same return value as stop().
16 | 		pause(): boolean;
17 | 		resume(): boolean;
18 | 
19 | 		destroy(): void;
20 | 		destroySoon(): void;
21 | 
22 | 		// Same data argument and return value as parse() but emits errors and isFinal is false.
23 | 		write(data: string | Buffer): boolean;
24 | 		// Same data argument and return value as parse() but emits errors and isFinal is true.
25 | 		end(data: string | Buffer): boolean;
26 | 
27 | 		reset(): boolean;
28 | 		// Position getters, each returning a number.
29 | 		getCurrentLineNumber(): number;
30 | 		getCurrentColumnNumber(): number;
31 | 		getCurrentByteIndex(): number;
32 | 	}
33 | 
34 | 	// export function createParser(cb: ???): Parser TODO
35 | }   
36 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | The MIT License (MIT)
 2 | 
 3 | Copyright (c) 2017 BusFaster Ltd
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 


--------------------------------------------------------------------------------
/src/Namespace.ts:
--------------------------------------------------------------------------------
 1 | /** Basic XML namespace definition. */
 2 | 
 3 | export class Namespace {
 4 | 
 5 | 	constructor(
 6 | 		/** Default xmlns prefix for serializing to XML. */
 7 | 		public defaultPrefix: string,
 8 | 		/** Unique identifier for the namespace, should be a valid URI. */
 9 | 		public uri: string,
10 | 		/** Numeric ID for faster mapping of namespaces to local prefixes. */
11 | 		public id = Namespace.idLast++,
12 | 		/** Special namespaces represent processing instructions (always defined). */
13 | 		public isSpecial = false
14 | 	) {}
15 | 	/** Each add* method appends to the matching list below, without checking for duplicates. */
16 | 	addElement(name: string) { this.elementNameList.push(name); }
17 | 	addAttribute(name: string) { this.attributeNameList.push(name); }
18 | 	addLocation(url: string) { this.schemaLocationList.push(url); }
19 | 	/** Names and schema locations in the order they were added. */
20 | 	elementNameList: string[] = [];
21 | 	attributeNameList: string[] = [];
22 | 	schemaLocationList: string[] = [];
23 | 	/** Next ID used when the constructor's id argument is omitted. NOTE(review): unknown and processing pass id 0 explicitly and xml1998 then gets 0 from idLast++, so all three share ID 0 here — confirm this is intended or that IDs are reassigned elsewhere. */
24 | 	static idLast = 0;
25 | 	static unknown = new Namespace('', '', 0, true);
26 | 	static processing = new Namespace('?', '?', 0, true);
27 | 	static xml1998 = new Namespace('xml', 'http://www.w3.org/XML/1998/namespace');
28 | 
29 | }
30 | // Register 'xml' as an element name in the processing namespace.
31 | Namespace.processing.addElement('xml');
32 | 


--------------------------------------------------------------------------------
/lib/Patricia.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <nbind/api.h>
 4 | 
 5 | /*
 6 | 	A trie node contains data and 4 extra bytes:
 7 | 
 8 | 	- Length in bits (1 byte)
 9 | 	- Data (1 - 32 bytes)
10 | 	- If state is accepted (can only have one child):
11 | 	  - Offset to data pointer (3 bytes)
12 | 	- If node has two children (cannot be an accepted state):
13 | 	  - Offset to other child (3 bytes)
14 | 
15 | 	First child node immediately follows.
16 | 
17 | 	Total data size is limited to 16 megabytes.
18 | */
19 | 
20 | /** Patricia trie. */
21 | 
22 | class Patricia {
23 | 
24 | 	friend class PatriciaCursor;
25 | 
26 | public:
27 | 	/** Point the trie at the given data. No buffer handle is stored by this call. */
28 | 	void setRoot(const unsigned char *root) { this->root = root; }
29 | 	/** Keep a handle to the buffer and use its data as the trie root. */
30 | 	void setBuffer(nbind::Buffer buffer) {
31 | 		this->buffer = buffer;
32 | 		root = buffer.data();
33 | 	}
34 | 	/** Look up a NUL-terminated string by walking the trie with a PatriciaCursor. */
35 | 	uint32_t find(const char *needle);
36 | 
37 | 	static constexpr uint32_t notFound = 0x7fffff;
38 | 	static constexpr uint32_t idMask = 0x7fffff;
39 | 
40 | private:
41 | 	// Initialized so a default-constructed trie never holds an indeterminate pointer.
42 | 	/** Trie root, null until setRoot() or setBuffer() provides one. */
43 | 	const unsigned char *root = nullptr;
44 | 
45 | 	/** Handle to the JavaScript buffer with inserted data,
46 | 	  * to prevent garbage collecting it too early. */
47 | 	nbind::Buffer buffer;
48 | 
49 | };
50 | 


--------------------------------------------------------------------------------
/src/tokenizer/TokenSpace.ts:
--------------------------------------------------------------------------------
 1 | import { ParserNamespace } from '../parser/ParserNamespace';
 2 | import { InternalToken } from '../parser/InternalToken';
 3 | import { TokenKind } from '../parser/Token';
 4 | 
 5 | /** Allocates IDs for distinguishing between tokens of the same type. */
 6 | 
 7 | export class TokenSpace {
 8 | 
 9 | 	/** @param kind Kind given to every token created through this object.
10 | 	  * @param parent If present, the new object starts out sharing the parent's
11 | 	  * token list and continues numbering from the parent's latest ID. */
12 | 	constructor(private kind: TokenKind, parent?: TokenSpace) {
13 | 		this.isLinked = !!parent;
14 | 		this.idLast = parent ? parent.idLast : 0;
15 | 		this.list = parent ? parent.list : [];
16 | 	}
17 | 
18 | 	/** Mark the token list as shared, so it is copied before the next token is created. */
19 | 	link() {
20 | 		this.isLinked = true;
21 | 	}
22 | 
23 | 	/** Swap a shared token list for a private shallow copy. No effect if not shared. */
24 | 	private unlink() {
25 | 		if(this.isLinked) {
26 | 			this.isLinked = false;
27 | 			this.list = this.list.slice(0);
28 | 		}
29 | 	}
30 | 
31 | 	/** Create a token with the next ID and store it in the list at the index given by its id. */
32 | 	createToken(name: string, ns?: ParserNamespace) {
33 | 		this.unlink();
34 | 
35 | 		const id = ++this.idLast;
36 | 		const token = new InternalToken(id, this.kind, name, ns);
37 | 
38 | 		this.list[token.id] = token;
39 | 
40 | 		return(token);
41 | 	}
42 | 
43 | 	/** If true, object is a clone sharing data with another object. */
44 | 	private isLinked: boolean;
45 | 	/** Latest ID handed out; incremented before each new token is created. */
46 | 	private idLast: number;
47 | 
48 | 	/** Created tokens, each stored at the index given by its id. */
49 | 	list: InternalToken[];
50 | 
51 | }
52 | 


--------------------------------------------------------------------------------
/src/tokenizer/CodeType.ts:
--------------------------------------------------------------------------------
 1 | // TODO: cdata start/end (disables entity parsing on JS side)
 2 | export const enum CodeType { // Members are implicitly numbered in declaration order, starting from the explicit 0.
 3 | 	OPEN_ELEMENT_ID = 0,
 4 | 	CLOSE_ELEMENT_ID,
 5 | 	ATTRIBUTE_ID,
 6 | 	PREFIX_ID,
 7 | 	XMLNS_ID,
 8 | 	URI_ID,
 9 | 	SGML_ID,
10 | 
11 | 	ELEMENT_EMITTED,
12 | 	CLOSED_ELEMENT_EMITTED,
13 | 	SGML_EMITTED,
14 | 
15 | 	NAMESPACE_ID,
16 | 
17 | 	VALUE_START_OFFSET,
18 | 	VALUE_END_OFFSET,
19 | 
20 | 	TEXT_START_OFFSET,
21 | 	TEXT_END_OFFSET,
22 | 
23 | 	CDATA_START_OFFSET,
24 | 	CDATA_END_OFFSET,
25 | 
26 | 	COMMENT_START_OFFSET,
27 | 	COMMENT_END_OFFSET,
28 | 
29 | 	SGML_NESTED_START,
30 | 	SGML_NESTED_END,
31 | 
32 | 	SGML_TEXT_START_OFFSET,
33 | 	SGML_TEXT_END_OFFSET,
34 | 
35 | 	// Unrecognized element name.
36 | 	UNKNOWN_START_OFFSET,
37 | 	// With the current ordering, each UNKNOWN_*_END_OFFSET equals the matching *_ID value plus UNKNOWN_OPEN_ELEMENT_END_OFFSET.
38 | 	// The order of these must match OPEN_ELEMENT_ID, CLOSE_ELEMENT_ID...
39 | 	UNKNOWN_OPEN_ELEMENT_END_OFFSET,
40 | 	UNKNOWN_CLOSE_ELEMENT_END_OFFSET,
41 | 	UNKNOWN_ATTRIBUTE_END_OFFSET,
42 | 	UNKNOWN_PREFIX_END_OFFSET,
43 | 	UNKNOWN_XMLNS_END_OFFSET,
44 | 	UNKNOWN_URI_END_OFFSET,
45 | 	UNKNOWN_SGML_END_OFFSET,
46 | 
47 | 	// Recognized part from an unrecognized name.
48 | 	PARTIAL_ELEMENT_ID,
49 | 	PARTIAL_ATTRIBUTE_ID,
50 | 	PARTIAL_PREFIX_ID,
51 | 	PARTIAL_URI_ID,
52 | 	PARTIAL_LEN
53 | };
54 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "cxml",
 3 |   "version": "0.3.1",
 4 |   "description": "Advanced schema-aware streaming XML parser",
 5 |   "main": "dist/index.js",
 6 |   "typings": "dist/index.d.ts",
 7 |   "scripts": {
 8 |     "autogypi": "autogypi",
 9 |     "node-gyp": "node-gyp",
10 |     "emcc-path": "emcc-path",
11 |     "copyasm": "copyasm",
12 |     "ndts": "ndts",
13 |     "tsc": "tsc",
14 |     "prepublish": "ndts > src/parser/Lib.d.ts && tsc -p src && ndts > dist/parser/Lib.d.ts",
15 |     "install": "autogypi && node-gyp configure build",
16 |     "test": "tsc -p test && node test/test.js"
17 |   },
18 |   "author": "Juha Järvi",
19 |   "license": "MIT",
20 |   "repository": {
21 |     "type": "git",
22 |     "url": "git+https://github.com/charto/cxml.git"
23 |   },
24 |   "bugs": {
25 |     "url": "https://github.com/charto/cxml/issues"
26 |   },
27 |   "homepage": "https://github.com/charto/cxml#readme",
28 |   "keywords": [
29 |     "xml",
30 |     "streaming",
31 |     "schema",
32 |     "parser",
33 |     "xsd",
34 |     "dts",
35 |     "typescript"
36 |   ],
37 |   "devDependencies": {
38 |     "typescript": "^3.2.2"
39 |   },
40 |   "dependencies": {
41 |     "@types/node": "^10.12.18",
42 |     "autogypi": "^0.2.2",
43 |     "nbind": "^0.3.15"
44 |   }
45 | }
46 | 


--------------------------------------------------------------------------------
/src/CRC32.ts:
--------------------------------------------------------------------------------
 1 | import { ArrayType, encodeArray } from './Buffer';
 2 | 
 3 | export interface Hasher32 {
 4 | 	append(data: string | ArrayType): number;
 5 | }
 6 | 
 7 | /** Incremental checksum state using a lookup table shared with its CRC32 creator. */
 8 | class Hasher implements Hasher32 {
 9 | 	/** @param tbl Lookup table with one entry per byte value. */
10 | 	constructor(private tbl: number[]) {}
11 | 
12 | 	/** Feed more data into the running checksum.
13 | 	  * @param data Bytes to hash. Strings are first converted using encodeArray.
14 | 	  * @return Checksum of all data appended so far, as an unsigned 32-bit integer. */
15 | 	append(data: string | ArrayType) {
16 | 		const tbl = this.tbl;
17 | 		let crc = this.crc;
18 | 
19 | 		if(typeof(data) === 'string') data = encodeArray(data);
20 | 
21 | 		const len = data.length;
22 | 
23 | 		for(let pos = 0; pos < len; ++pos) {
24 | 			crc = (crc >>> 8) ^ tbl[(crc & 0xff) ^ data[pos]];
25 | 		}
26 | 
27 | 		// Keep the un-inverted register so later calls continue where this one stopped.
28 | 		this.crc = crc;
29 | 
30 | 		// Invert all bits for the result; >>> 0 converts the signed XOR result to unsigned.
31 | 		return((crc ^ 0xffffffff) >>> 0);
32 | 	}
33 | 
34 | 	/** Running CRC register, starts with all bits set. */
35 | 	crc = 0xffffffff;
36 | }
37 | 
38 | /** 32-bit Cyclic Redundancy Check. */
39 | 
40 | export class CRC32 {
41 | 	/** @param poly Reversed generator polynomial, default edb88320 (Ethernet, GZIP, PNG).
42 | 	  * Other good choices are 82f63b78 (Castagnoli) used in Btrfs and eb31d82e (Koopman). */
43 | 
44 | 	constructor(public poly = 0xedb88320) {
45 | 		// Precompute the table entry for every possible byte value, one bit at a time.
46 | 		for(let n = 0; n < 256; ++n) {
47 | 			let crc = n;
48 | 			let b = 8;
49 | 
50 | 			while(b--) {
51 | 				// -(crc & 1) is an all-ones mask when the low bit is set, so poly is applied conditionally.
52 | 				crc = ((crc >>> 1) ^ (-(crc & 1) & poly)) >>> 0;
53 | 			}
54 | 
55 | 			this.tbl[n] = crc;
56 | 		}
57 | 	}
58 | 
59 | 	/** Create a new hasher with fresh state, sharing this object's lookup table. */
60 | 	create(): Hasher32 {
61 | 		return(new Hasher(this.tbl));
62 | 	}
63 | 
64 | 	/** Lookup table with one entry per byte value, filled in by the constructor. */
65 | 	tbl: number[] = [];
66 | }
67 | 


--------------------------------------------------------------------------------
/lib/ParserConfig.cc:
--------------------------------------------------------------------------------
 1 | #include "ParserConfig.h"
 2 | #include "PatriciaCursor.h"
 3 | 
 4 | ParserConfig :: ParserConfig( // Stores the four token values and initializes the namespace tables.
 5 | 	uint32_t xmlnsToken,
 6 | 	uint32_t emptyPrefixToken,
 7 | 	uint32_t xmlnsPrefixToken,
 8 | 	uint32_t processingPrefixToken
 9 | ) :
10 | 	xmlnsToken(xmlnsToken),
11 | 	emptyPrefixToken(emptyPrefixToken),
12 | 	xmlnsPrefixToken(xmlnsPrefixToken),
13 | 	processingPrefixToken(processingPrefixToken)
14 | {
15 | 	for(unsigned int i = 0; i < namespacePrefixTblSize; ++i) {
16 | 		namespacePrefixTbl[i] = std::make_pair(0, nullptr); // Every prefix slot starts as index 0 with a null pointer.
17 | 	}
18 | 	// Ensure that valid namespace indices start from 1.
19 | 	// TODO: Do we still need this?
20 | 	namespaceList.push_back(nullptr);
21 | }
22 | 
23 | bool ParserConfig :: addUri(uint32_t uri, uint32_t ns) { // Maps a URI token to namespace index ns; returns false if ns is out of range.
24 | 	if(ns < namespaceList.size()) {
25 | 		if(uri >= namespaceByUriToken.size()) {
26 | 			namespaceByUriToken.resize(uri + 1); // Grow the table so that index uri is valid.
27 | 		}
28 | 		// Store the index together with the raw pointer obtained from the list entry.
29 | 		namespaceByUriToken[uri] = std::make_pair(ns, namespaceList[ns].get());
30 | 
31 | 		return(true);
32 | 	}
33 | 	// ns does not refer to an existing namespaceList entry; nothing was changed.
34 | 	return(false);
35 | }
36 | 
37 | #include <nbind/nbind.h>
38 | 
39 | #ifdef NBIND_CLASS
40 | // Binding declarations, compiled only when NBIND_CLASS is defined.
41 | NBIND_CLASS(ParserConfig) {
42 | 	construct<uint32_t, uint32_t, uint32_t, uint32_t>(); // Matches the four-token constructor.
43 | 
44 | 	method(addNamespace);
45 | 	method(addUri);
46 | 	method(bindPrefix);
47 | 
48 | 	method(setUriTrie);
49 | 	method(setPrefixTrie);
50 | }
51 | 
52 | #endif
53 | 


--------------------------------------------------------------------------------
/src/schema/ComplexType.ts:
--------------------------------------------------------------------------------
 1 | import { AttributeSpec } from './Attribute';
 2 | import { AttributeGroup } from './AttributeGroup';
 3 | import { SimpleElementSpec, ElementSpec, ElementBase } from './Element';
 4 | import { Group, GroupKind } from './Group';
 5 | 
 6 | export interface ElementTypeConstructor<ElementClass extends ElementBase = ElementBase> { // Constructor without arguments, producing ElementClass instances.
 7 | 	new(): ElementClass;
 8 | };
 9 | 
10 | /** Definition of a type containing other elements and attributes. Only applicable to elements. */
11 | 
12 | export class ComplexType {
13 | 
14 | 	/** Register an attribute, creating the attribute group on first use. */
15 | 	addAttribute(spec: AttributeSpec) {
16 | 		if(!this.attributes) {
17 | 			this.attributes = new AttributeGroup();
18 | 		}
19 | 
20 | 		this.attributes.addAttribute(spec);
21 | 	}
22 | 
23 | 	/** Add a member to the element group, first creating the group
24 | 	  * with kind `all` if this type has no elements yet. */
25 | 	addAll(spec: SimpleElementSpec | ElementSpec) {
26 | 		let elements = this.elements;
27 | 
28 | 		if(!elements) {
29 | 			elements = new ElementSpec();
30 | 			this.elements = elements;
31 | 			elements.group = new Group(GroupKind.all);
32 | 		}
33 | 
34 | 		elements.group!.addElement(spec);
35 | 	}
36 | 
37 | 	/** Get the class for this type, creating it on first call as a subclass
38 | 	  * of the base type's class, or of ElementBase if there is no base type. */
39 | 	createProto<ElementClass = ElementBase>() {
40 | 		if(!this.XMLType) {
41 | 			let BaseType: ElementTypeConstructor = ElementBase;
42 | 			if(this.base) BaseType = this.base.createProto();
43 | 
44 | 			this.XMLType = class XMLType extends BaseType {};
45 | 		}
46 | 
47 | 		return(this.XMLType as ElementTypeConstructor<ElementClass>);
48 | 	}
49 | 
50 | 	/** Type this one inherits from, if any. */
51 | 	base?: ComplexType;
52 | 
53 | 	/** Class created by createProto, cached after the first call. */
54 | 	XMLType: ElementTypeConstructor;
55 | 
56 | 	attributes?: AttributeGroup;
57 | 	elements?: ElementSpec;
58 | 
59 | }
60 | 


--------------------------------------------------------------------------------
/lib/PatriciaCursor.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <cstdint>
 4 | 
 5 | #include "Patricia.h"
 6 | 
 7 | /** Cursor for finding a string in the trie, in steps of one character. */
 8 | class PatriciaCursor {
 9 | 
10 | public:
11 | 
12 | 	/** Start scanning a trie from the first input character. */
13 | 	void init(const Patricia &trie);
14 | 
15 | 	/** Try to match previous input using a different trie. On failure,
16 | 	  * the cursor remains unchanged. */
17 | 	bool transfer(const Patricia &trie);
18 | 
19 | 	/** Advance to the next input character, updating pointer to any associated
20 | 	  * value found. */
21 | 	bool advance(unsigned char c);
22 | 
23 | 	/** Find the ID of the (lexicographically) first descendant leaf
24 | 	  * after advance has failed. The cursor position is unchanged. */
25 | 	uint32_t findLeaf();
26 | 
27 | 	/** Get the data value associated with the string.
28 | 	  * Valid values are from 0 to 0x7ffffe and 0x7fffff indicates no data.
29 | 	  * Values are 3 bytes and the highest bit is an internal flag whether
30 | 	  * the trie node has no children. */
31 | 	uint32_t getData();
32 | 
33 | private:
34 | 
35 | 	const unsigned char *root = nullptr;
36 | 	const unsigned char *ptr = nullptr;
37 | 	const unsigned char *found = nullptr; // Initialized like root and ptr, so a cursor used before init() never holds an indeterminate pointer.
38 | 	uint16_t len = 0; // Initialized for the same reason as found.
39 | 
40 | 	/** Handle to the JavaScript buffer with inserted data,
41 | 	  * to prevent garbage collecting it too early. */
42 | 	nbind::Buffer buffer;
43 | 
44 | };
45 | 


--------------------------------------------------------------------------------
/src/Buffer.ts:
--------------------------------------------------------------------------------
 1 | /** Subset of the TextEncoder API used below. */
 2 | export declare class TextEncoder {
 3 | 	constructor(encoding: string);
 4 | 
 5 | 	encode(data: string): Uint8Array;
 6 | 	/** Not part of the standard TextEncoder API (decoding belongs to TextDecoder).
 7 | 	  * Kept in the declaration only so existing type references still compile. */
 8 | 	decode(data: Uint8Array): string;
 9 | }
10 | 
11 | /** Subset of the TextDecoder API used below. */
12 | export declare class TextDecoder {
13 | 	constructor(encoding: string);
14 | 
15 | 	decode(data: Uint8Array): string;
16 | }
17 | 
18 | /** Byte array type: Buffer when available, otherwise Uint8Array. */
19 | export type ArrayType = Buffer | Uint8Array;
20 | export let ArrayType: { new(size: number): ArrayType };
21 | 
22 | /** Encode a string as UTF-8 bytes. */
23 | export let encodeArray: (text: string) => ArrayType;
24 | /** Decode UTF-8 bytes, optionally only the range from start (inclusive) to end (exclusive). */
25 | export let decodeArray: (data: ArrayType, start?: number, end?: number) => string;
26 | /** Concatenate byte arrays. len is the total length of all arrays in the list. */
27 | export let concatArray: (list: ArrayType[], len: number) => ArrayType;
28 | 
29 | if(typeof(Buffer) == 'function') {
30 | 	ArrayType = Buffer;
31 | 
32 | 	// NOTE(review): new Buffer(string) is deprecated in current Node.js;
33 | 	// consider Buffer.from once the minimum supported version allows it.
34 | 	encodeArray = (text: string) => new Buffer(text);
35 | 	decodeArray = (data: ArrayType, start?: number, end?: number) => (data as Buffer).toString('utf-8', start, end);
36 | 
37 | 	concatArray = Buffer.concat as any;
38 | } else if(typeof(TextEncoder) == 'function' && typeof(TextDecoder) == 'function') {
39 | 	ArrayType = Uint8Array;
40 | 
41 | 	const encoder = new TextEncoder('utf-8');
42 | 	const decoder = new TextDecoder('utf-8');
43 | 
44 | 	// Encode the argument. This used to reference an unrelated identifier "name".
45 | 	encodeArray = (text: string) => encoder.encode(text);
46 | 	// Decoding needs a TextDecoder; TextEncoder has no decode method.
47 | 	// subarray creates a view of the requested range without copying it.
48 | 	decodeArray = (data: ArrayType, start?: number, end?: number) => decoder.decode(
49 | 		(start || end || end === 0) ? data.subarray(start, end) : data
50 | 	);
51 | 
52 | 	concatArray = (list: ArrayType[], len: number) => {
53 | 		const buf = new Uint8Array(len);
54 | 
55 | 		let offset = 0;
56 | 		for(let part of list) {
57 | 			buf.set(part, offset);
58 | 			offset += part.length;
59 | 		}
60 | 
61 | 		return(buf);
62 | 	}
63 | }
64 | 


--------------------------------------------------------------------------------
/src/parser/Stitcher.ts:
--------------------------------------------------------------------------------
 1 | import { ArrayType, encodeArray, decodeArray, concatArray } from '../Buffer';
 2 | 
 3 | export class Stitcher {
 4 | 
 5 | 	/** Set the input buffer that subsequent slice offsets refer to. */
 6 | 	setChunk(chunk: ArrayType) {
 7 | 		this.chunk = chunk;
 8 | 	}
 9 | 
10 | 	/** Replace any stored parts with the first len bytes of buf. */
11 | 	reset(buf: ArrayType, len: number) {
12 | 		const head = buf.slice(0, len);
13 | 
14 | 		this.partList = [ head ];
15 | 		this.byteLen = len;
16 | 	}
17 | 
18 | 	/** Store part of the current chunk for later concatenation.
19 | 	  * An omitted end means the rest of the chunk; an end of zero stores nothing. */
20 | 	storeSlice(start: number, end?: number) {
21 | 		if(!this.partList) this.partList = [];
22 | 		if(end === 0) return;
23 | 
24 | 		const part = this.chunk.slice(start, end);
25 | 
26 | 		this.partList.push(part);
27 | 		this.byteLen += (end || this.chunk.length) - start;
28 | 	}
29 | 
30 | 	/** getSlice helper: store a final slice, decode all stored parts
31 | 	  * as a single string and forget the stored parts. */
32 | 	private buildSlice(start: number, end?: number) {
33 | 		this.storeSlice(start, end);
34 | 
35 | 		const joined = concatArray(this.partList!, this.byteLen);
36 | 		const text = decodeArray(joined);
37 | 
38 | 		this.partList = null;
39 | 		this.byteLen = 0;
40 | 
41 | 		return(text);
42 | 	}
43 | 
44 | 	/** Get a string from the input buffer, prepending any parts stored from
45 | 	  * previous buffers. The line breaks \r\n, \r and \n\r are replaced with \n. */
46 | 	getSlice(start: number, end?: number) {
47 | 		let text: string;
48 | 
49 | 		if(this.partList) {
50 | 			text = this.buildSlice(start, end);
51 | 		} else {
52 | 			text = decodeArray(this.chunk, start, end);
53 | 		}
54 | 
55 | 		return(text.replace(/\r\n?|\n\r/g, '\n'));
56 | 	}
57 | 
58 | 	/** Current input buffer. */
59 | 	private chunk: ArrayType;
60 | 
61 | 	/** Parts of a string split between chunks of input, or null if there are none. */
62 | 	private partList: ArrayType[] | null = null;
63 | 	/** Total length of the stored parts in bytes. */
64 | 	private byteLen = 0;
65 | 
66 | }
67 | 


--------------------------------------------------------------------------------
/lib/ParserConfig.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <vector>
 4 | #include <memory>
 5 | 
 6 | #include "Namespace.h"
 7 | #include "Patricia.h"
 8 | 
 9 | class ParserConfig { // Parser settings: namespace tables, special token IDs and the URI and prefix tries.
10 | 
11 | 	friend class Parser; // Gives Parser access to the private members below.
12 | 
13 | public:
14 | 
15 | 	static constexpr uint32_t namespacePrefixTblSize = 256; // Number of entries in namespacePrefixTbl.
16 | 
17 | 	ParserConfig(uint32_t xmlnsToken, uint32_t emptyPrefixToken, uint32_t xmlnsPrefixToken, uint32_t processingPrefixToken);
18 | 
19 | 	void setUriTrie(nbind::Buffer buffer) { uriTrie.setBuffer(buffer); }
20 | 	void setPrefixTrie(nbind::Buffer buffer) { prefixTrie.setBuffer(buffer); }
21 | 
22 | 	uint32_t addNamespace(const std::shared_ptr<Namespace> ns) { // Appends ns and returns its index in namespaceList.
23 | 		namespaceList.push_back(ns);
24 | 
25 | 		return(namespaceList.size() - 1);
26 | 	}
27 | 
28 | 	bool addUri(uint32_t uri, uint32_t ns); // Maps a URI token to a namespace index; false if the index is not in namespaceList.
29 | 
30 | 	bool bindPrefix(uint32_t idPrefix, uint32_t uri) { // Copies the entry for uri into the prefix table; false if either argument is out of range.
31 | 		if(idPrefix >= namespacePrefixTblSize) return(false);
32 | 		if(uri >= namespaceByUriToken.size()) return(false);
33 | 
34 | 		namespacePrefixTbl[idPrefix] = namespaceByUriToken[uri];
35 | 		return(true);
36 | 	}
37 | 
38 | private:
39 | 
40 | 	std::vector<std::shared_ptr<Namespace>> namespaceList; // Index 0 holds a null entry added by the constructor.
41 | 	std::vector<std::pair<uint32_t, const Namespace *> > namespaceByUriToken; // Indexed by URI token; holds (namespace index, namespace pointer).
42 | 	std::pair<uint32_t, const Namespace *> namespacePrefixTbl[namespacePrefixTblSize]; // Indexed by prefix ID; same pair layout as above.
43 | 
44 | 	uint32_t xmlnsToken;
45 | 
46 | 	uint32_t emptyPrefixToken;
47 | 	uint32_t xmlnsPrefixToken;
48 | 	uint32_t processingPrefixToken;
49 | 
50 | 	Patricia uriTrie;
51 | 	Patricia prefixTrie;
52 | 
53 | };
54 | 


--------------------------------------------------------------------------------
/src/tokenizer/TokenSet.ts:
--------------------------------------------------------------------------------
 1 | import { Patricia } from './Patricia';
 2 | import { ParserNamespace } from '../parser/ParserNamespace';
 3 | import { TokenSpace } from './TokenSpace';
 4 | import { InternalToken } from '../parser/InternalToken';
 5 | 
 6 | export class TokenSet {
 7 | 
 8 | 	/** @param space Token space used for creating new tokens.
 9 | 	  * @param parent Optional set whose name table and trie are shared
10 | 	  * until this set gets modified. */
11 | 	constructor(private space: TokenSpace, parent?: TokenSet) {
12 | 		if(parent) {
13 | 			// NOTE(review): only the clone is flagged as linked here. Tokens created
14 | 			// in the parent afterwards also appear in this set's table unless
15 | 			// parent.link() was called -- confirm whether that is intended.
16 | 			this.isLinked = true;
17 | 
18 | 			this.tbl = parent.tbl;
19 | 			this.trie = parent.trie;
20 | 		} else {
21 | 			this.isLinked = false;
22 | 
23 | 			// Use a table without a prototype, so names like "constructor"
24 | 			// or "toString" are not mistaken for existing tokens.
25 | 			this.tbl = Object.create(null);
26 | 			this.trie = new Patricia();
27 | 		}
28 | 	}
29 | 
30 | 	/** Mark this set as sharing its data, so the next change copies it first. */
31 | 	link() {
32 | 		this.isLinked = true;
33 | 	}
34 | 
35 | 	/** If data is shared, replace it with a private copy before a change. */
36 | 	private unlink() {
37 | 		if(!this.isLinked) return;
38 | 		this.isLinked = false;
39 | 
40 | 		const tbl: { [ name: string ]: InternalToken } = Object.create(null);
41 | 		for(let key of Object.keys(this.tbl)) {
42 | 			tbl[key] = this.tbl[key];
43 | 		}
44 | 
45 | 		this.tbl = tbl;
46 | 		this.trie = this.trie.clone();
47 | 	}
48 | 
49 | 	/** Get the token with the given name, creating it if it is not in the set.
50 | 	  * @param ns Namespace passed on to the token space for a new token. */
51 | 	createToken(name: string, ns?: ParserNamespace) {
52 | 		let token = this.tbl[name];
53 | 
54 | 		if(!token) {
55 | 			this.unlink();
56 | 
57 | 			token = this.space.createToken(name, ns);
58 | 
59 | 			this.tbl[name] = token;
60 | 			if(token.name) {
61 | 				this.dirty = true;
62 | 				this.trie.insertNode(token);
63 | 			}
64 | 		}
65 | 
66 | 		return(token);
67 | 	}
68 | 
69 | 	/** Add an existing token to the set. Tokens with an empty name are ignored. */
70 | 	addToken(token: InternalToken) {
71 | 		if(token.name) {
72 | 			// Copy shared data first, like createToken does, so the set
73 | 			// this one was cloned from is left untouched.
74 | 			this.unlink();
75 | 
76 | 			this.dirty = true;
77 | 			this.tbl[token.name] = token;
78 | 			this.trie.insertNode(token);
79 | 		}
80 | 	}
81 | 
82 | 	/** Encode the trie of token names using Patricia.encode. */
83 | 	encodeTrie() {
84 | 		return(this.trie.encode());
85 | 	}
86 | 
87 | 	/** If true, object is a clone sharing data with another object. */
88 | 	private isLinked: boolean;
89 | 
90 | 	/** Tokens in the set by name. */
91 | 	private tbl: { [ name: string ]: InternalToken };
92 | 	private trie: Patricia;
93 | 
94 | 	/** Set when a token was inserted in the trie. Cleared by ParserNamespace
95 | 	  * after passing the encoded trie to native code. */
96 | 	public dirty = true;
97 | 
98 | }
99 | 


--------------------------------------------------------------------------------
/src/parser/InternalToken.ts:
--------------------------------------------------------------------------------
 1 | import { Namespace } from '../Namespace';
 2 | import { ParserNamespace } from './ParserNamespace';
 3 | import { ArrayType, encodeArray } from '../Buffer';
 4 | import { TokenKind, OpenToken, CloseToken, EmittedToken, StringToken, PrefixToken, UriToken } from './Token';
 5 | 
 6 | export class InternalToken { // Groups the token objects created for one name of a given kind.
 7 | 	constructor(
 8 | 		public id: number,
 9 | 		kind: TokenKind,
10 | 		public name: string,
11 | 		public ns?: ParserNamespace
12 | 	) {
13 | 		this.buf = encodeArray(name); // Name encoded as bytes.
14 | 		const nsBase = ns ? ns.base : Namespace.unknown; // Without a parser namespace, tokens get Namespace.unknown.
15 | 
16 | 		switch(kind) {
17 | 			case TokenKind.element:
18 | 
19 | 				this.open = new OpenToken(name, nsBase, id);
20 | 				this.close = this.open.close; // Close and emitted tokens are taken from the open token.
21 | 				this.emitted = this.open.emitted;
22 | 				this.tokenList = [
23 | 					this.open,
24 | 					this.close,
25 | 					this.emitted,
26 | 					null
27 | 				];
28 | 				break;
29 | 
30 | 			case TokenKind.attribute:
31 | 
32 | 				this.string = new StringToken(name, nsBase, id);
33 | 				this.tokenList = [
34 | 					null,
35 | 					null,
36 | 					null,
37 | 					this.string
38 | 				];
39 | 				break;
40 | 
41 | 			case TokenKind.prefix:
42 | 				this.prefix = new PrefixToken(name, id); // tokenList is left unset for prefix tokens.
43 | 				break;
44 | 
45 | 			case TokenKind.uri:
46 | 				this.uri = new UriToken(ns!.base); // NOTE(review): throws if ns is omitted for a uri token; tokenList is left unset.
47 | 				break;
48 | 
49 | 			default:
50 | 
51 | 				break; // Other kinds create no token objects.
52 | 		}
53 | 	}
54 | 
55 | 	// TODO: Should be an empty string instead.
56 | 	static empty = new InternalToken(0, TokenKind.element, '\0');
57 | 
58 | 	buf: ArrayType;
59 | 
60 | 	open: OpenToken; // Only set for element tokens, like close and emitted.
61 | 	close: CloseToken;
62 | 	emitted: EmittedToken;
63 | 
64 | 	string: StringToken; // Only set for attribute tokens.
65 | 
66 | 	prefix: PrefixToken; // Only set for prefix tokens.
67 | 	uri: UriToken; // Only set for uri tokens.
68 | 
69 | 	// Order must match TokenKind.
70 | 	tokenList: [
71 | 		OpenToken | null,
72 | 		CloseToken | null,
73 | 		EmittedToken | null,
74 | 		StringToken | null
75 | 	];
76 | }
77 | 


--------------------------------------------------------------------------------
/test/test.ts:
--------------------------------------------------------------------------------
 1 | import * as fs from 'fs';
 2 | import * as path from 'path';
 3 | 
 4 | import * as nbind from 'nbind';
 5 | import * as cxml from '..';
 6 | import * as Lib from '../dist/parser/Lib';
 7 | 
 8 | import { ParserStream } from '../dist/parser/ParserStream';
 9 | 
10 | import { TokenSpace } from '../dist/tokenizer/TokenSpace';
11 | import { Patricia } from '../dist/tokenizer/Patricia';
12 | 
13 | const lib = nbind.init<typeof Lib>(path.resolve(__dirname, '..')).lib;
14 | 
15 | /** Check that the native trie finds every word of a word list by its token ID. */
16 | function testPatricia() {
17 | 	const tokenSpace = new TokenSpace(0);
18 | 	const trie = new Patricia();
19 | 	const rawTrie = new lib.Patricia();
20 | 
21 | 	// The word list file may be given on the command line.
22 | 	const wordPath = process.argv[2] || path.resolve(__dirname, 'words.txt');
23 | 	const wordList = fs.readFileSync(wordPath, { encoding: 'utf-8' }).split('\n');
24 | 
25 | 	// Names shorter than two characters (such as blank lines) are skipped.
26 | 	const tokenList = wordList.filter(
27 | 		(name: string) => name.length > 1
28 | 	).map(
29 | 		(name: string) => tokenSpace.createToken(name)
30 | 	);
31 | 	trie.insertList(tokenList);
32 | 	rawTrie.setBuffer(trie.encode());
33 | 
34 | 	for(const token of tokenList) {
35 | 		const result = rawTrie.find(token.name);
36 | 		if(result != token.id) {
37 | 			console.error('ERROR in ' + result + ' ' + token.name);
38 | 			process.exit(1);
39 | 		}
40 | 	}
41 | }
42 | 
43 | function testParser() { // Pipes a small XML document through ParserStream and Writer to standard output.
44 | 	const xmlConfig = new cxml.ParserConfig();
45 | 
46 | 	const xmlParser = new ParserStream(xmlConfig);
47 | 
48 | 	xmlParser.pipe(new cxml.Writer()).pipe(process.stdout);
49 | 
50 | 	xmlParser.write('<?xml version="1.0" encoding="UTF-8"?><foo xmlns="urn:test:a1"><bar xmlns="urn:test:a2" /><bar /></foo>');
51 | 	// xmlParser.write('<foo xmlns="urn:test:a1"><bar xmlns="urn:test:a2" /><bar /></foo>');
52 | 	// xmlParser.write('<a:foo xmlns:a="urn:test:a1"><a:bar xmlns:a="urn:test:a2" /><a:bar /></a:foo>');
53 | 	// xmlParser.write('</a></a>');
54 | 
55 | 	xmlParser.end(); // The output is not verified automatically; it is only printed.
56 | }
57 | 
58 | testPatricia();
59 | testParser();
60 | 


--------------------------------------------------------------------------------
/src/parser/ParserNamespace.ts:
--------------------------------------------------------------------------------
 1 | import { NativeNamespace } from './ParserLib';
 2 | 
 3 | import { Namespace } from '../Namespace';
 4 | import { ParserConfig } from './ParserConfig';
 5 | import { Token } from './Token';
 6 | import { TokenSet } from '../tokenizer/TokenSet';
 7 | import { InternalToken } from './InternalToken';
 8 | 
 9 | export class ParserNamespace {
10 | 
11 | 	/** @param parent Parser-independent namespace definition, or another ParserNamespace to clone. @param config Provides the token spaces and the xmlns token. */
12 | 	constructor(public parent: Namespace | ParserNamespace, config: ParserConfig) {
13 | 		if(parent instanceof ParserNamespace) {
14 | 			this.base = parent.base;
15 | 			this.native = parent.native.clone();
16 | 
17 | 			this.elementSet = new TokenSet(config.elementSpace, parent.elementSet); // Token sets share data with the parent's sets.
18 | 			this.attributeSet = new TokenSet(config.attributeSpace, parent.attributeSet);
19 | 
20 | 			this.uriToken = parent.uriToken;
21 | 		} else {
22 | 			this.base = parent;
23 | 			this.native = new NativeNamespace(parent.uri);
24 | 
25 | 			this.elementSet = new TokenSet(config.elementSpace);
26 | 			this.attributeSet = new TokenSet(config.attributeSpace);
27 | 
28 | 			this.attributeSet.addToken(config.xmlnsToken);
29 | 
30 | 			for(let name of parent.elementNameList) { // Create tokens for all names listed in the namespace definition.
31 | 				this.addElement(name);
32 | 			}
33 | 
34 | 			for(let name of parent.attributeNameList) {
35 | 				this.addAttribute(name);
36 | 			}
37 | 		}
38 | 	}
39 | 
40 | 	registerNative(): NativeNamespace { // Passes encoded tries of changed token sets to the native namespace and returns it.
41 | 		if(this.elementSet.dirty) {
42 | 			this.native.setElementTrie(this.elementSet.encodeTrie());
43 | 			this.elementSet.dirty = false;
44 | 		}
45 | 		if(this.attributeSet.dirty) {
46 | 			this.native.setAttributeTrie(this.attributeSet.encodeTrie());
47 | 			this.attributeSet.dirty = false;
48 | 		}
49 | 		return(this.native);
50 | 	}
51 | 
52 | 	addElement(name: string) { // Returns the element token with this name, creating it if needed.
53 | 		return(this.elementSet.createToken(name, this));
54 | 	}
55 | 
56 | 	addAttribute(name: string) { // Returns the attribute token with this name, creating it if needed.
57 | 		return(this.attributeSet.createToken(name, this));
58 | 	}
59 | 
60 | 	public base: Namespace; // Parser-independent namespace definition.
61 | 	private native: NativeNamespace;
62 | 
63 | 	/** Index in parser's namespaceList. */
64 | 	public id: number; // Not assigned in this class.
65 | 
66 | 	uriToken: Token; // Only assigned here when cloning another ParserNamespace.
67 | 
68 | 	private elementSet: TokenSet;
69 | 	private attributeSet: TokenSet;
70 | 
71 | }
72 | 


--------------------------------------------------------------------------------
/src/parser/Lib.d.ts:
--------------------------------------------------------------------------------
 1 | import { Buffer } from "nbind/dist/shim";
 2 | 
 3 | export class NBindBase { free?(): void } // Base of all classes declared below.
 4 | 
 5 | export class Namespace extends NBindBase { // Native class declaration; the doc comments show the C++ signatures.
 6 | 	/** Namespace(std::string); */
 7 | 	constructor(p0: string);
 8 | 
 9 | 	/** Namespace clone(); */
10 | 	clone(): Namespace;
11 | 
12 | 	/** void setElementTrie(Buffer); */
13 | 	setElementTrie(p0: number[] | ArrayBuffer | DataView | Uint8Array | Buffer): void;
14 | 
15 | 	/** void setAttributeTrie(Buffer); */
16 | 	setAttributeTrie(p0: number[] | ArrayBuffer | DataView | Uint8Array | Buffer): void;
17 | }
18 | 
19 | export class Parser extends NBindBase { // Native class declaration; the doc comments show the C++ signatures.
20 | 	/** Parser(const ParserConfig &); */
21 | 	constructor(p0: ParserConfig);
22 | 
23 | 	/** ParserConfig * getConfig(); */
24 | 	getConfig(): ParserConfig | null;
25 | 
26 | 	/** void setCodeBuffer(Buffer, cbFunction &); */
27 | 	setCodeBuffer(p0: number[] | ArrayBuffer | DataView | Uint8Array | Buffer, p1: (...args: any[]) => any): void;
28 | 
29 | 	/** void setPrefix(uint32_t); */
30 | 	setPrefix(p0: number): void;
31 | 
32 | 	/** bool bindPrefix(uint32_t, uint32_t); */
33 | 	bindPrefix(p0: number, p1: number): boolean;
34 | 
35 | 	/** int32_t parse(Buffer); */
36 | 	parse(p0: number[] | ArrayBuffer | DataView | Uint8Array | Buffer): number;
37 | 
38 | 	/** int32_t destroy(); */
39 | 	destroy(): number;
40 | 
41 | 	/** uint32_t row; -- Read-only */
42 | 	row: number; // NOTE(review): documented as read-only but not declared readonly.
43 | 
44 | 	/** uint32_t col; -- Read-only */
45 | 	col: number; // NOTE(review): documented as read-only but not declared readonly.
46 | }
47 | 
48 | export class ParserConfig extends NBindBase { // Native class declaration; the doc comments show the C++ signatures.
49 | 	/** ParserConfig(uint32_t, uint32_t, uint32_t, uint32_t); */
50 | 	constructor(p0: number, p1: number, p2: number, p3: number);
51 | 
52 | 	/** uint32_t addNamespace(std::shared_ptr<Namespace>); */
53 | 	addNamespace(p0: Namespace): number;
54 | 
55 | 	/** bool addUri(uint32_t, uint32_t); */
56 | 	addUri(p0: number, p1: number): boolean;
57 | 
58 | 	/** bool bindPrefix(uint32_t, uint32_t); */
59 | 	bindPrefix(p0: number, p1: number): boolean;
60 | 
61 | 	/** void setUriTrie(Buffer); */
62 | 	setUriTrie(p0: number[] | ArrayBuffer | DataView | Uint8Array | Buffer): void;
63 | 
64 | 	/** void setPrefixTrie(Buffer); */
65 | 	setPrefixTrie(p0: number[] | ArrayBuffer | DataView | Uint8Array | Buffer): void;
66 | }
67 | 
68 | export class Patricia extends NBindBase { // Native class declaration; the doc comments show the C++ signatures.
69 | 	/** Patricia(); */
70 | 	constructor();
71 | 
72 | 	/** void setBuffer(Buffer); */
73 | 	setBuffer(p0: number[] | ArrayBuffer | DataView | Uint8Array | Buffer): void;
74 | 
75 | 	/** uint32_t find(const char *); */
76 | 	find(p0: string): number;
77 | }
78 | 


--------------------------------------------------------------------------------
/lib/README.md:
--------------------------------------------------------------------------------
 1 | Tokenizer library
 2 | =================
 3 | 
 4 | This is an XML tokenizer library for `cxml`, written in C++ for speed.
 5 | Fundamentally it's a small, manually designed DFA (state machine).
 6 | 
 7 | Every recognized element or attribute from a known namespace is a specific
 8 | token. Otherwise tokens are different kinds of offsets to the input buffer.
 9 | 
10 | Structure
11 | ---------
12 | 
13 | - `Parser.cc` contains the main state machine.
14 | - `PatriciaCursor.cc` handles traversing Patricia tries containing known
15 |   text string tokens.
16 | - `ParserConfig.h` contains the API for initializing parser settings.
17 |   Creating new parser instances from the same config object is fast.
18 | 
19 | Design
20 | ------
21 | 
22 | ### What choices make C++ suitable and why is it faster?
23 | 
24 | Some reasons:
25 | 
26 | - The code would be almost the same if written in JavaScript, but a
27 |   JavaScript JIT compiler would need to make countless correct guesses
28 |   to produce equally optimized output.
29 |   - The state machine structure is encoded in assignments to an integer
30 |     `state` variable and `switch`, `case` and `goto` statements.
31 |     An integer-based jump table is very fast.
32 |   - Every `goto` could be replaced with `continue`, but then the compiler
33 |     may not understand that the jump table can be skipped.
34 | - Length of text content is calculated in a very tight loop using a pointer.
35 |   Compiled JavaScript would require more safety checks.
36 | - Output is only tokens with offsets to input, nothing is copied.
37 | - Input is treated as bytes without decoding UTF-8.
38 |   Recognized tokens never need such decoding.
39 | - Calls between languages are always slow and thus only used to notify when
40 |   a buffer has become full. No arguments are passed, to avoid type conversion.
41 |   - Both languages directly access the same buffers, sharing memory.
42 | - Code dealing with pointers and character literals is clearer and less
43 |   verbose when written in C++.
44 | 
45 | ### Counter-arguments and justifications for C++
46 | 
47 | - For safety, C++ does require more careful programming, especially when
48 |   using pointers.
49 |   - Incorrect memory reads in C++ could crash the process in cases where
50 |     JavaScript would merely produce invalid output, allowing denial of service
51 |     attacks. Information from other variables could also be leaked.
52 |   - Invalid memory writes through pointers lead to remote code execution
53 |     exploits, compromising all security.
54 |     - They should be avoided, or audited carefully and surrounded with checks.
55 | - For speed, this tokenizer uses pointers extensively but carefully.
56 |   - When reading, it avoids many run-time checks that JavaScript would do.
57 |     - However, it does not output what was read, only where it found something
58 |       interesting. This avoids leaking information.
59 |   - Writes are done very carefully in a single, small function.
60 |     - Between various checks, only one number is written at a time.
61 |     - Elsewhere, `const` pointers prevent accidental memory writes.
62 |     - Written data is not directly copied from input.
63 | 


--------------------------------------------------------------------------------
/src/schema/Element.ts:
--------------------------------------------------------------------------------
  1 | import { ElementToken } from '../parser/Token';
  2 | 
  3 | import { ComplexType, ElementTypeConstructor } from './ComplexType';
  4 | import { Group } from './Group';
  5 | import { MemberSpec, MemberMeta, SimpleType, SimpleValue } from './Member';
  6 | 
  7 | export class SimpleElementSpec extends MemberSpec { // Member configuration whose metadata is a SimpleElementMeta.
  8 | 
  9 | 	/** Name and other info. */
 10 | 	meta: SimpleElementMeta;
 11 | 
 12 | }
 13 | 
 14 | /** Configuration for elements as type members. */
 15 | 
 16 | export class ElementSpec extends MemberSpec {
 17 | 
 18 | 	/** Name and other info, also available in the prototype of all element instances. */
 19 | 	meta?: ElementMeta;
 20 | 
 21 | 	group?: Group; // Group of child members; ComplexType.addAll sets this to a group of kind `all`.
 22 | 
 23 | }
 24 | 
 25 | /** Metadata for elements without children or attributes in builder output. */
 26 | 
 27 | export class SimpleElementMeta extends MemberMeta {
 28 | 
 29 | 	/** Substitution group head. */
 30 | 	substitutes?: SimpleElementMeta;
 31 | 
 32 | 	/** Token with element name and namespace.
 33 | 	  * A single token may have different types depending on its parent. */
 34 | 	token: ElementToken;
 35 | 
 36 | 	type: SimpleType; // Type of the element's content, as opposed to ComplexType in ElementMeta.
 37 | 
 38 | }
 39 | 
 40 | /** Metadata for elements in builder output. */
 41 | 
 42 | export class ElementMeta<ElementClass extends ElementInstance = ElementInstance> extends MemberMeta {
 43 | 
 44 | 	createProto() { // Returns the class for this element, creating it on first call.
 45 | 		if(!this.XMLType) {
 46 | 			const BaseType: ElementTypeConstructor = this.type.createProto<ElementClass>(); // The element class extends the class of its type.
 47 | 
 48 | 			this.XMLType = class XMLType extends BaseType implements ElementInstance {
 49 | 				_: ElementMeta<this>;
 50 | 			} as ElementConstructor<ElementClass>;
 51 | 
 52 | 			Object.defineProperty(this.XMLType.prototype, 'constructor', { // Only attribute flags are given here, no value.
 53 | 				configurable: true,
 54 | 				enumerable: false,
 55 | 				writable: true
 56 | 			});
 57 | 
 58 | 			Object.defineProperty(this.XMLType.prototype, '_', { // Instances get this metadata object through their prototype, hidden from enumeration.
 59 | 				configurable: true,
 60 | 				enumerable: false,
 61 | 				value: this,
 62 | 				writable: true
 63 | 			});
 64 | 		}
 65 | 
 66 | 		return(this.XMLType);
 67 | 	}
 68 | 
 69 | 	XMLType?: ElementConstructor<ElementClass>; // Set by createProto.
 70 | 
 71 | 	/** A singleton object to use if the element is missing. */
 72 | 	placeholder?: ElementClass;
 73 | 
 74 | 	/** Substitution group head. */
 75 | 	substitutes?: ElementMeta;
 76 | 
 77 | 	/** Token with element name and namespace.
 78 | 	  * A single token may have different types depending on its parent. */
 79 | 	token: ElementToken;
 80 | 
 81 | 	type: ComplexType; // Type whose class the element class extends, see createProto.
 82 | 
 83 | }
 84 | 
 85 | /** Base class for elements defined in the schema. Inherited by a hierarchy of types,
 86 |  *  each branch terminating in an element definition. */
 87 | 
 88 | export class ElementBase {} // Declares no members of its own.
 89 | 
 90 | /** Represents any element defined in the schema. */
 91 | 
 92 | export interface ElementInstance extends ElementBase {
 93 | 
 94 | 	/** Builder metadata. Defined in the prototypes of parsed objects,
 95 | 	  * or properties of placeholders for non-existent members. */
 96 | 	_: ElementMeta<this>; // ElementMeta.createProto defines this on the prototype.
 97 | 
 98 | 	/** Possible text content. */
 99 | 	$?: SimpleValue;
100 | 
101 | }
102 | 
103 | export interface ElementConstructor<ElementClass extends ElementInstance = ElementInstance> { // Constructor without arguments, producing ElementClass instances.
104 | 	new(): ElementClass;
105 | };
106 | 


--------------------------------------------------------------------------------
/src/parser/JSX.ts:
--------------------------------------------------------------------------------
  1 | import { Token, TokenKind, ElementToken, OpenToken, AttributeToken } from './Token';
  2 | import { ParserConfig } from './ParserConfig';
  3 | 
  4 | export function defineElement(): ElementToken { // Returns `true` typed as an element token; presumably a marker replaced with a real token elsewhere -- TODO confirm.
  5 | 	return(true as any);
  6 | }
  7 | 
  8 | export function defineAttribute(): AttributeToken { // Returns `false` typed as an attribute token; presumably a marker replaced with a real token elsewhere -- TODO confirm.
  9 | 	return(false as any);
 10 | }
 11 | 
 12 | export interface XModule { // Maps names to strings, element tokens or attribute tokens.
 13 | 	[name: string]: string | ElementToken | AttributeToken;
 14 | }
 15 | 
 16 | export interface XModuleTable { // Maps prefixes to modules.
 17 | 	[prefix: string]: XModule;
 18 | }
 19 | 
 20 | export interface XMLElementNode<Attributes> extends Array<string | Attributes | XMLNode[]> { // Array form of an element, as produced by jsxElement.
 21 | 	0: any; // Element; jsxExpand expects an OpenToken here in nodes that are not yet expanded.
 22 | 	1: Attributes;
 23 | 	2: XMLNode[]; // NOTE(review): jsxExpand treats every entry from index 2 onwards as one child node, not as an array of children -- confirm.
 24 | }
 25 | 
 26 | export class XMLArgumentNode { // Placeholder in a template for a named argument; jsxCompile reads it from `spec` by name.
 27 | 	constructor(public name: string) {}
 28 | }
 29 | 
 30 | export type XMLNode = XMLElementNode<any> | XMLArgumentNode | string | number | boolean | undefined;
 31 | 
 32 | export function jsxElement<Attributes>(
 33 | 	kind: string,
 34 | 	attr: Attributes,
 35 | 	...children: XMLNode[]
 36 | ): XMLElementNode<Attributes>;
 37 | // Implementation: the node is simply the list of arguments as an array.
 38 | export function jsxElement(...args: any[]) { return(args); }
 39 | 
 40 | export function jsxExpand( // Flattens a node tree into tokens and values appended to output parts; returns the part appended to last.
 41 | 	config: ParserConfig,
 42 | 	node: XMLNode,
 43 | 	output: (any[] | XMLArgumentNode)[],
 44 | 	part = output[0] as any[] // Array currently appended to; defaults to the first output entry.
 45 | ) {
 46 | 	if(typeof(node) != 'object') { // Strings, numbers, booleans and undefined are appended as-is.
 47 | 		part.push(node);
 48 | 	} else if(node instanceof Array) {
 49 | 		const element = node[0];
 50 | 		const attributes = node[1] || {};
 51 | 
 52 | 		// If the first element is not a token or the second element is,
 53 | 		// then the node is already expanded!
 54 | 		// An attribute, emitted or close token always follows
 55 | 		// an open token in expanded nodes.
 56 | 
 57 | 		if(!(element instanceof OpenToken) || attributes instanceof Token) {
 58 | 			// Flatten and output the already expanded node.
 59 | 			return(element instanceof Array ? Array.prototype.concat.apply([], node) : node); // NOTE(review): returns the node itself without appending it to part -- confirm this is intended for nested calls.
 60 | 		}
 61 | 
 62 | 		part.push(element);
 63 | 
 64 | 		for(let name of Object.keys(attributes)) {
 65 | 			const attr = attributes[name];
 66 | 
 67 | 			part.push(config.getAttributeTokens(element.ns, name)[TokenKind.string]!); // Attribute name token, looked up in the namespace of the element.
 68 | 
 69 | 			if(attr instanceof XMLArgumentNode) { // Arguments become separate output entries, followed by a fresh part.
 70 | 				output.push(attr);
 71 | 				part = [];
 72 | 				output.push(part);
 73 | 			} else {
 74 | 				part.push(attr);
 75 | 			}
 76 | 		}
 77 | 
 78 | 		if(node.length > 2) { // Entries from index 2 onwards are child nodes.
 79 | 			part.push(element.emitted);
 80 | 
 81 | 			for(let num = 2; num < node.length; ++num) {
 82 | 				part = jsxExpand(config, node[num], output, part);
 83 | 			}
 84 | 		}
 85 | 
 86 | 		part.push(element.close);
 87 | 	} else if(node instanceof XMLArgumentNode) {
 88 | 		output.push(node);
 89 | 		part = [];
 90 | 		output.push(part);
 91 | 	}
 92 | 
 93 | 	return(part);
 94 | }
 95 | 
 96 | export function jsxCompile( // Expands a template once and returns a function that fills in its arguments from a `spec` object.
 97 | 	config: ParserConfig,
 98 | 	generate: (...args: any[]) => XMLElementNode<any>
 99 | ) {
100 | 	const template = generate((name: string) => new XMLArgumentNode(name)); // Calling the passed function yields a placeholder for a named argument.
101 | 
102 | 	// console.log(require('util').inspect(template, { depth: null }));
103 | 
104 | 	const parts: any[][] = [[]];
105 | 	jsxExpand(config, template, parts);
106 | 
107 | 	const rest = parts.slice(1);
108 | 
109 | 	// Compile a function that expands and interpolates arguments into the template.
110 | 
111 | 	return(eval( // The generated source refers to the local names parts, rest, config and jsxExpand, so they must not be renamed.
112 | 		// The function returns the first part of expanded output...
113 | 		'(function compiled(spec) {return(parts[0]' +
114 | 		// ...with other parts appended, if any.
115 | 		(!rest.length ? '' :
116 | 			'.concat(' + rest.map(
117 | 				(part, pos: number) => (part instanceof XMLArgumentNode ? (
118 | 					// Expand parts representing arguments.
119 | 					'jsxExpand(config,spec.' + part.name + '||"",[[]])' // NOTE(review): part.name is inserted into evaluated code unescaped; it must be a trusted, valid identifier.
120 | 				) : (
121 | 					// Output already expanded parts as-is.
122 | 					'rest[' + pos + ']'
123 | 				))
124 | 			).join(',') + ')'
125 | 		) +
126 | 		');})'
127 | 	));
128 | }
129 | 


--------------------------------------------------------------------------------
/src/parser/Token.ts:
--------------------------------------------------------------------------------
  1 | import { Namespace } from '../Namespace';
  2 | import { ParserNamespace } from './ParserNamespace';
  3 | import { ParserConfig } from './ParserConfig';
  4 | 
  5 | export type TokenBuffer = (Token | number | string)[];
  6 | 
  7 | // Order must match InternalToken.tokenList.
  8 | export const enum TokenKind {
  9 | 	// External element token types
 10 | 	open, // 0, the index of the open token in InternalToken.tokenList.
 11 | 	close, // 1
 12 | 	emitted, // 2
 13 | 	elementEnd = emitted, // Alias for the last element token type.
 14 | 
 15 | 	// External attribute token types
 16 | 	string, // 3, the index of the string token in InternalToken.tokenList.
 17 | 	number, // 4
 18 | 	attributeEnd = number, // Alias for the last attribute token type.
 19 | 
 20 | 	comment,
 21 | 	cdata,
 22 | 	blank,
 23 | 	sgml,
 24 | 	sgmlEmitted,
 25 | 	sgmlNestedStart,
 26 | 	sgmlNestedEnd,
 27 | 	sgmlText,
 28 | 
 29 | 	// Internal token types
 30 | 	uri,
 31 | 	prefix,
 32 | 	element,
 33 | 	attribute,
 34 | 
 35 | 	other // Default kind set on Token.prototype.
 36 | }
 37 | 
 38 | export abstract class Token { // Base class of all token types in this file.
 39 | 
 40 | 	constructor() {}
 41 | 
 42 | 	serialize?(indent?: string, data?: any): string | TokenBuffer;
 43 | 	serializeJson?(indent?: string, data?: any): any;
 44 | 
 45 | 	kind: TokenKind;
 46 | 	kindString: string;
 47 | 
 48 | }
 49 | Token.prototype.kind = TokenKind.other; // Defaults are stored in the prototype, not in each instance.
 50 | Token.prototype.kindString = 'other';
 51 | 
 52 | export class SpecialToken extends Token {
 53 | 
 54 | 	constructor(public kind: TokenKind, public kindString: string) { super(); }
 55 | 
 56 | 	static comment = new SpecialToken(TokenKind.comment, 'comment');
 57 | 	static cdata = new SpecialToken(TokenKind.cdata, 'cdata');
 58 | 	static blank = new SpecialToken(TokenKind.blank, 'blank');
 59 | 	static sgmlEmitted = new SpecialToken(TokenKind.sgmlEmitted, 'SGML emitted');
 60 | 	static sgmlNestedStart = new SpecialToken(TokenKind.sgmlNestedStart, 'DTD start');
 61 | 	static sgmlNestedEnd = new SpecialToken(TokenKind.sgmlNestedEnd, 'DTD end');
 62 | 	static sgmlText = new SpecialToken(TokenKind.sgmlText, 'SGML text');
 63 | 
 64 | }
 65 | 
 66 | export class PrefixToken extends Token {
 67 | 
 68 | 	constructor(public name: string, public id?: number) { super(); }
 69 | 
 70 | }
 71 | PrefixToken.prototype.kind = TokenKind.prefix;
 72 | PrefixToken.prototype.kindString = 'prefix';
 73 | 
 74 | export class UriToken extends Token {
 75 | 
 76 | 	constructor(public ns: Namespace) { super(); }
 77 | 
 78 | }
 79 | UriToken.prototype.kind = TokenKind.uri;
 80 | UriToken.prototype.kindString = 'uri';
 81 | 
 82 | export abstract class MemberToken extends Token {
 83 | 
 84 | 	constructor(public name: string, public ns: Namespace, public id?: number) { super(); }
 85 | 
 86 | 	abstract resolve(ns: ParserNamespace): Token;
 87 | 
 88 | }
 89 | 
 90 | export class ElementToken extends MemberToken {
 91 | 
 92 | 	resolve(ns: ParserNamespace) {
 93 | 		return(ns.addElement(this.name).tokenList[this.kind as number]!);
 94 | 	}
 95 | 
 96 | }
 97 | 
 98 | export class AttributeToken extends MemberToken {
 99 | 
100 | 	resolve(ns: ParserNamespace) {
101 | 		return(ns.addAttribute(this.name).tokenList[this.kind as number]!);
102 | 	}
103 | 
104 | }
105 | 
/** Element opening token; owns the matching "emitted" and "close" tokens for the same element. */
export class OpenToken extends ElementToken {
	emitted: EmittedToken;
	close: CloseToken;

	constructor(name: string, ns: Namespace, id?: number) {
		super(name, ns, id);

		// Companion tokens share this token's name, namespace and ID.
		this.emitted = new EmittedToken(this.name, this.ns, this.id);
		this.close = new CloseToken(this.name, this.ns, this.id);
	}
}

Object.assign(OpenToken.prototype, {
	kind: TokenKind.open,
	kindString: 'open'
});
112 | 
/** Element token with kind `close`. */
export class CloseToken extends ElementToken {}
CloseToken.prototype.kind = TokenKind.close;
CloseToken.prototype.kindString = 'close';

/** Element token with kind `emitted`. */
export class EmittedToken extends ElementToken {}
EmittedToken.prototype.kind = TokenKind.emitted;
EmittedToken.prototype.kindString = 'emitted';

/** Attribute token with kind `string`. */
export class StringToken extends AttributeToken {}
StringToken.prototype.kind = TokenKind.string;
StringToken.prototype.kindString = 'string';
124 | 
/** Token for an SGML declaration, identified by a name and a prefix string. */
export class SgmlToken extends Token {

	name: string;
	prefix: string;

	constructor(name: string, prefix: string) {
		super();
		this.name = name;
		this.prefix = prefix;
	}

}

// Kind information is shared through the prototype.
Object.assign(SgmlToken.prototype, {
	kind: TokenKind.sgml,
	kindString: 'sgml'
});
132 | 


--------------------------------------------------------------------------------
/src/builder/RuleSet.ts:
--------------------------------------------------------------------------------
  1 | import { SimpleSchema } from '../schema/SimpleSchema';
  2 | import { ComplexType } from '../schema/ComplexType';
  3 | import { MemberSpec } from '../schema/Member';
  4 | import { ElementInstance, ElementMeta, ElementConstructor } from '../schema/Element';
  5 | 
  6 | export class Rule {
  7 | 
  8 | 	addElement(member: RuleMember) {
  9 | 		this.elements[member.id] = member;
 10 | 	}
 11 | 
 12 | 	addAttribute(member: RuleMember) {
 13 | 		this.attributes[member.id] = member;
 14 | 	}
 15 | 
 16 | 	elements: { [id: number]: RuleMember } = {};
 17 | 	attributes: { [id: number]: RuleMember } = {};
 18 | 
 19 | 	static string = new Rule();
 20 | 
 21 | 	XMLType: ElementConstructor;
 22 | 
 23 | }
 24 | 
 25 | export class RuleMember {
 26 | 
 27 | 	constructor(public rule: Rule, public spec: MemberSpec) {
 28 | 		this.id = spec.meta!.token.id!;
 29 | 		this.min = spec.min;
 30 | 		this.max = spec.max;
 31 | 	}
 32 | 
 33 | 	id: number;
 34 | 	min: number;
 35 | 	max: number;
 36 | 
 37 | }
 38 | 
 39 | function link<Type>(parent: Type) {
 40 | 	function Result() {}
 41 | 	Result.prototype = parent;
 42 | 	return(new (Result as any)());
 43 | }
 44 | 
/**
 * Linked list of the element types currently being compiled by
 * RuleSet.createRule, from the innermost towards the outermost.
 */
export interface RuleStack {
	/** Metadata of the element type at this level. */
	meta?: ElementMeta;
	/** Rule being built for the element type at this level. */
	rule?: Rule;
	/** Next entry outwards, if any. */
	parent?: RuleStack;
}
 50 | 
/** Rules compiled from a SimpleSchema, starting from the schema's document type. */
export class RuleSet {

	/**
	 * Recursively compile a rule for a complex type.
	 *
	 * @param type Type whose child elements and attributes are registered in the rule.
	 * @param meta Metadata of the element having this type. When given, a
	 * prototype is created for it and receives placeholders for child elements.
	 * @param parent Chain of element types enclosing this one, used to detect recursion.
	 * @return The new rule.
	 */
	createRule(type: ComplexType, meta?: ElementMeta, parent?: RuleStack) {
		const rule = new Rule();
		let childRule: Rule | undefined;
		// Without metadata, placeholders end up in this throwaway object.
		let proto: { [key: string]: any } = {};

		if(meta) {
			rule.XMLType = meta.createProto();
			proto = rule.XMLType.prototype;
		}

		if(type.elements && type.elements.group) {
			for(let childSpec of type.elements.group.list) {
				const memberMeta = childSpec.meta;

				if(memberMeta) {
					if(memberMeta instanceof ElementMeta) {
						childRule = void 0;

						for(let item = parent; item; item = item.parent) {
							if(item.meta == memberMeta) {
								// If the child element type matches an ancestor's,
								// re-use its rule to avoid infinite recursion.

								childRule = item.rule;
								break;
							}
						}

						if(!childRule) {
							childRule = this.createRule(memberMeta.type, memberMeta, { meta, rule, parent });
						}

						// Subclass type metadata and clear existence flag to indicate a placeholder.
						let fakeMeta = link(memberMeta);
						fakeMeta.exists = false;

						let placeholder: ElementInstance | ElementInstance[] | null = new childRule.XMLType();
						placeholder._ = fakeMeta;
						memberMeta.placeholder = placeholder;

						if(childSpec.max > 1) {
							// Use arrays as placeholders for arrays of children.
							placeholder = childSpec.min > 0 ? [ placeholder ] : [];
						} else if(childSpec.min < 1) {
							// Optional single children get no placeholder on the prototype.
							placeholder = null;
						}

						if(placeholder) {
							// Define on the prototype, hidden from enumeration,
							// so instances can still override it with a real child.
							Object.defineProperty(proto, memberMeta.token.name, {
								configurable: true,
								enumerable: false,
								value: placeholder,
								writable: true
							});
						}
					} else childRule = Rule.string;

					rule.addElement(new RuleMember(childRule, childSpec));
				}
			}
		}

		if(type.attributes) {
			for(let attributeSpec of type.attributes.list) {
				const memberMeta = attributeSpec.meta;

				if(memberMeta) {
					// const token = memberMeta.token;

					// All attributes use the shared string rule.
					childRule = Rule.string;

					rule.addAttribute(new RuleMember(childRule, attributeSpec));
				}
			}
		}

		return(rule);
	}

	/** @param schema Schema whose document type is compiled into the root rule. */
	constructor(public schema: SimpleSchema) {
		this.rootRule = this.createRule(schema.document);
	}

	/** Rule compiled from the schema's document type. */
	rootRule: Rule;

}
139 | 


--------------------------------------------------------------------------------
/src/writer/JsonWriter.ts:
--------------------------------------------------------------------------------
  1 | import * as stream from 'stream';
  2 | 
  3 | import { Namespace } from '../Namespace';
  4 | import { Token, TokenKind, MemberToken } from '../parser/Token';
  5 | import { TokenChunk } from '../parser/TokenChunk';
  6 | 
  7 | import { Indent, State, indentPattern } from './Writer';
  8 | 
  9 | export class JsonWriter extends stream.Transform {
 10 | 
 11 | 	/** @param data Arbitrary data passed to any custom serializers. */
 12 | 
 13 | 	constructor(private data?: any) {
 14 | 		super({ objectMode: true });
 15 | 	}
 16 | 
 17 | 	transform(chunk: TokenChunk) {
 18 | 		let state = this.state;
 19 | 		let depth = this.depth;
 20 | 		let indent = this.indent;
 21 | 		let nsElement = this.nsElement;
 22 | 		const buffer = chunk.buffer;
 23 | 		let token: typeof buffer[0];
 24 | 		let member: MemberToken;
 25 | 		let prefix: string;
 26 | 		let serialized: any;
 27 | 
 28 | 		let partList: string[] = [];
 29 | 		let partNum = -1;
 30 | 		let lastNum = chunk.length - 1;
 31 | 		let tokenNum = -1;
 32 | 
 33 | 		while(tokenNum < lastNum) {
 34 | 
 35 | 			token = buffer[++tokenNum];
 36 | 
 37 | 			if(token instanceof Token) {
 38 | 				switch(token.kind) {
 39 | 					case TokenKind.open:
 40 | 
 41 | 						member = token as MemberToken;
 42 | 						nsElement = member.ns;
 43 | 
 44 | 						if(nsElement.isSpecial && nsElement.defaultPrefix == '?') {
 45 | 							state = State.PROCESSING;
 46 | 						} else {
 47 | 							++depth;
 48 | 							partList[++partNum] = indent + '[ "' + member.name + '"';
 49 | 							state = State.ELEMENT;
 50 | 						}
 51 | 
 52 | 						indent = ',' + indentPattern.substr(0, depth);
 53 | 						break;
 54 | 
 55 | 					case TokenKind.emitted:
 56 | 
 57 | 						state = State.TEXT;
 58 | 						break;
 59 | 
 60 | 					case TokenKind.close:
 61 | 
 62 | 						if(state != State.PROCESSING) {
 63 | 							member = token as MemberToken;
 64 | 							indent = indentPattern.substr(0, --depth);
 65 | 
 66 | 							if(state == State.TEXT) {
 67 | 								partList[++partNum] = indent + ']';
 68 | 							} else {
 69 | 								partList[++partNum] = ' ]';
 70 | 							}
 71 | 
 72 | 							indent = ',' + indent;
 73 | 						}
 74 | 
 75 | 						state = State.TEXT;
 76 | 						break;
 77 | 
 78 | 					case TokenKind.string:
 79 | 
 80 | 						member = token as MemberToken;
 81 | 
 82 | 						partList[++partNum] = ', [ "$' + member.name + '"';
 83 | 						break;
 84 | 
 85 | 					case TokenKind.comment:
 86 | 
 87 | 						state = State.COMMENT;
 88 | 						break;
 89 | 
 90 | 					case TokenKind.other:
 91 | 
 92 | 						if(token.serializeJson) {
 93 | 							serialized = token.serializeJson(indent, this.data);
 94 | 							if(typeof(serialized) != 'string') serialized = JSON.stringify(serialized);
 95 | 
 96 | 							partList[++partNum] = indent + serialized;
 97 | 							state = State.AFTER_TEXT;
 98 | 						}
 99 | 						break;
100 | 				}
101 | 			} else {
102 | 				switch(state) {
103 | 					case State.TEXT:
104 | 
105 | 						partList[++partNum] = ', [ "$", ' + JSON.stringify(token) + ' ]';
106 | 						state = State.AFTER_TEXT;
107 | 						break;
108 | 
109 | 					case State.ELEMENT:
110 | 					case State.PROCESSING:
111 | 
112 | 						partList[++partNum] = ', ' + JSON.stringify(token) + ' ]';
113 | 						break;
114 | 
115 | 					case State.COMMENT:
116 | 
117 | 						break;
118 | 
119 | 				}
120 | 			}
121 | 		}
122 | 
123 | 		this.state = state;
124 | 		this.depth = depth;
125 | 		this.indent = indent;
126 | 		this.nsElement = nsElement;
127 | 
128 | 		return(partList);
129 | 	}
130 | 
131 | 	_transform(chunk: TokenChunk | null, enc: string, flush: (err: any, chunk: string) => void) {
132 | 		if(!chunk) {
133 | 			flush(null, '');
134 | 			return;
135 | 		}
136 | 
137 | 		const partList = this.transform(chunk);
138 | 		flush(null, partList.join(''));
139 | 	}
140 | 
141 | 	_flush( flush: (err: any, chunk: string) => void) {
142 | 		flush(null, '\n');
143 | 	}
144 | 
145 | 	private state = State.TEXT as State;
146 | 	private depth = Indent.MIN_DEPTH;
147 | 	private indent = '';
148 | 	private nsElement: Namespace;
149 | 
150 | }
151 | 


--------------------------------------------------------------------------------
/lib/PatriciaCursor.cc:
--------------------------------------------------------------------------------
  1 | #include <cstdio>
  2 | 
  3 | #include "PatriciaCursor.h"
  4 | 
  5 | void PatriciaCursor :: init(const Patricia &trie) {
  6 | 	if(trie.root != root) {
  7 | 		root = trie.root;
  8 | 		// Hold on to trie data used by the cursor in case it gets garbage collected.
  9 | 		buffer = trie.buffer;
 10 | 	}
 11 | 
 12 | 	ptr = root;
 13 | 	len = *ptr++;
 14 | 
 15 | 	found = nullptr;
 16 | }
 17 | 
// Advance the cursor by one input byte.
// Returns false as soon as the input is known to differ from everything
// reachable from the current position, true otherwise. When the consumed
// byte ends a node exactly, `found` is set to point at that node's data value.
// Updates `ptr` and `len` (not declared locally, so presumably members).
bool PatriciaCursor :: advance(unsigned char c) {
	const unsigned char *p = ptr;
	unsigned char delta;

	// Loop until the current trie branch node contains an entire byte.
	while(len < 8) {
		if(len) {
			// Compare input with branch node contents by using XOR, and
			// shift away trailing bits not contained in the node.
			delta = (c ^ *p++) >> (7 - len);
		} else {
			// If the branch doesn't depend on any bits inside the byte,
			// it must be the last byte of an inserted string.
			// An associated data value will follow so jump over it.
			delta = 0;

			// High bit of associated data value signals no longer strings
			// with this prefix exist.
			if(*p & 0x80) {
				ptr = p;
				return(false);
			}
		}

		if(delta) {
			// If input differs from branch node contents in bits before
			// the last one, then it was not found in the trie.
			if(delta > 1) {
				ptr = p - 1;
				return(false);
			}

			// If the last bit differs, find pointer to the second child.
			// It must exist, otherwise there would be no branch here.
			// The pointer is a 3-byte big-endian offset relative to its own position.
			p += (p[0] << 16) + (p[1] << 8) + p[2];
		} else {
			// This branch is conditioned on a bit so it has a pointer
			// to a second child, or it ends on a byte boundary so it has
			// a data pointer. In either case, jump over a pointer to find
			// the first child node.
			p += 3;
		}

		// Entered a new node, so read its length.
		len = *p++;
	}

	// One full byte of the node is consumed by this call.
	len -= 8;

	// If the node contains a full byte but the input doesn't match,
	// then it was not found in the trie.
	if(c != *p++) {
		ptr = p;
		return(false);
	}

	if(!len) {
		// If the branch doesn't depend on any bits inside the byte,
		// it must be the last byte of an inserted string.
		// Store the location of its data value.
		found = p;

		// NOTE: Nodes longer than 32 bytes must be split, so intermediate
		// nodes represent partial strings not actually inserted. Their
		// associated value is Patricia :: notFound, so results are unaffected.
	}

	ptr = p;
	return(true);
}
 88 | 
// Move this cursor over to another trie.
// NOTE(review): unfinished. The replay loop below is disabled by its `0 &&`
// condition, so this currently just re-initializes the cursor at the root of
// the given trie and returns true.
bool PatriciaCursor :: transfer(const Patricia &trie) {
	const unsigned char *p = trie.root;
	const unsigned char *target = ptr;
	// Never assigned; only read inside the disabled loop.
	unsigned char c;
	PatriciaCursor other;

	other.init(trie);

	// TODO!
	while(0 && p < target) {
		// c = ...

		if(!other.advance(c)) return(false);
	}

	*this = other;

	return(true);
}
108 | 
// Starting from the cursor position, descend through first children until a
// node ending on a byte boundary is reached, and return its data value.
// Repeats while that value is Patricia :: notFound, unless the stop flag
// tested in the loop condition is set.
// Side effect: `found` is left pointing at the last data value read.
// `ptr` and the member `len` are not modified (a local `len` shadows the member).
uint32_t PatriciaCursor :: findLeaf() {
	const unsigned char *p = ptr;
	uint16_t len = this->len;
	uint32_t data;

	do {
		// Skip to reference to current node's data or second child.
		p += (len + 7) / 8;

		// A length not divisible by 8 means the node ends inside a byte,
		// so it is a branch; keep following first children.
		while(len & 7) {
			// Read length from beginning of first child
			// (just after the data reference).
			len = p[3];
			// Skip current node's data or second child reference, the first child's
			// length and its contents, moving to its data or second child reference.
			p += (len + 7) / 8 + 4;
		}

		len = p[3];
		found = p;
		data = getData();

		p += 4;
		// After splitting nodes at 32 chars, avoid returning a split node.
		// NOTE(review): confirm that the byte tested for 0x80 here is the one
		// carrying the "no longer strings" flag written by the encoder.
	} while(data == Patricia :: notFound && !(*p & 0x80));

	return(data);
}
137 | 
138 | uint32_t PatriciaCursor :: getData() {
139 | 	if(!found) return(Patricia :: notFound);
140 | 
141 | 	return( ( (found[0] << 16) + (found[1] << 8) + found[2] ) & Patricia :: idMask );
142 | }
143 | 


--------------------------------------------------------------------------------
/src/schema/SimpleSchema.ts:
--------------------------------------------------------------------------------
  1 | import { TokenKind } from '../parser/Token';
  2 | import { ParserConfig } from '../parser/ParserConfig';
  3 | import { Namespace } from '../Namespace';
  4 | import { ComplexType } from './ComplexType';
  5 | import { SimpleType } from './Member';
  6 | import { AttributeSpec, AttributeMeta } from './Attribute';
  7 | import { SimpleElementSpec, SimpleElementMeta, ElementSpec, ElementMeta } from './Element';
  8 | 
/**
 * One member of a type: either a bare name (used as both the member name and
 * its type name) or an object mapping member names to type names.
 */
export type SimpleMemberSpec = string | { [ memberName: string]: string };

/** Maps type names to the list of members each type contains. */
export type SimpleSchemaSpec = { [ typeName: string ]: SimpleMemberSpec[] };

/**
 * Schema specs indexed by prefix.
 * NOTE(review): the meaning of the two leading strings in each entry is not
 * visible here — confirm against callers.
 */
export type SimpleSchemaSpecTbl = { [prefix: string]: [ string, string, SimpleSchemaSpec ] };
 14 | 
 15 | export class SimpleSchema {
 16 | 
 17 | 	constructor(private parserConfig: ParserConfig, public ns: Namespace, spec: SimpleSchemaSpec, root = spec['document']) {
 18 | 		const typeTbl = this.typeTbl;
 19 | 
 20 | 		// SimpleType expands to a plain string without support for attributes,
 21 | 		// ComplexType expands to { $: "..." } allowing parseUnknown to work.
 22 | 		// const stringType = new SimpleType();
 23 | 		const stringType = new ComplexType();
 24 | 		typeTbl['xs:string'] = stringType;
 25 | 
 26 | 		parserConfig.addNamespace(ns);
 27 | 
 28 | 		// Create placeholder objects for all types.
 29 | 		for(let typeName of Object.keys(spec)) {
 30 | 			typeTbl[typeName] = new ComplexType();
 31 | 		}
 32 | 
 33 | 		// Define types, using placeholders when referring to undefined types.
 34 | 		for(let typeName of Object.keys(spec)) {
 35 | 			this.defineType(spec[typeName], typeTbl[typeName] as ComplexType);
 36 | 		}
 37 | 
 38 | 		this.document = (typeTbl['document'] || this.defineType(root)) as ComplexType;
 39 | 	}
 40 | 
 41 | 	defineType(spec: SimpleMemberSpec[], type: ComplexType = new ComplexType()) {
 42 | 		let memberName: string;
 43 | 
 44 | 		for(let child of spec) {
 45 | 			if(typeof(child) == 'string') {
 46 | 				memberName = child;
 47 | 				child = {};
 48 | 				child[memberName] = memberName;
 49 | 			}
 50 | 
 51 | 			for(memberName of Object.keys(child)) {
 52 | 				let min = 1, max = 1;
 53 | 
 54 | 				// Parse element or attribute name with type prefix / suffix.
 55 | 				let parts = memberName.match(/(\$?)([^\[]+)(\[\])?(\?)?/);
 56 | 				if(!parts) continue;
 57 | 
 58 | 				let [, prefix, name, arraySuffix, optionalSuffix] = parts;
 59 | 
 60 | 				// Parse type name if it differs from element/attribute name.
 61 | 				if(child[memberName] != memberName) {
 62 | 					parts = child[memberName].match(/(\$?)([^\[]+)(\[\])?(\?)?/);
 63 | 					if(!parts) continue;
 64 | 
 65 | 					// Type prefix / suffix behave identically in member and type names.
 66 | 					prefix = prefix || parts[1];
 67 | 					arraySuffix = arraySuffix || parts[3];
 68 | 					optionalSuffix = optionalSuffix || parts[4];
 69 | 				}
 70 | 
 71 | 				if(optionalSuffix) min = 0;
 72 | 				if(arraySuffix) max = Infinity;
 73 | 
 74 | 				const memberTypeName = parts[2];
 75 | 				const memberType = this.typeTbl[memberTypeName];
 76 | 
 77 | 				// Prefix $ marks attributes.
 78 | 				if(prefix == '$') {
 79 | 					const token = this.parserConfig.getAttributeTokens(this.ns, name)[TokenKind.string]!;
 80 | 					const attributeSpec = new AttributeSpec(min, max);
 81 | 					const attributeMeta = new AttributeMeta(token);
 82 | 
 83 | 					// attributeMeta.type = xsd:string
 84 | 					attributeSpec.meta = attributeMeta;
 85 | 					type.addAttribute(attributeSpec);
 86 | 				} else if(memberType) {
 87 | 					const token = this.parserConfig.getElementTokens(this.ns, name)[TokenKind.open]!;
 88 | 					let elementSpec: SimpleElementSpec | ElementSpec;
 89 | 					let elementMeta: SimpleElementMeta | ElementMeta;
 90 | 
 91 | 					if(memberType instanceof ComplexType) {
 92 | 						elementSpec = new ElementSpec(min, max);
 93 | 						elementMeta = new ElementMeta(token);
 94 | 					} else {
 95 | 						elementSpec = new SimpleElementSpec(min, max);
 96 | 						elementMeta = new SimpleElementMeta(token);
 97 | 					}
 98 | 
 99 | 					elementMeta.type = memberType;
100 | 					elementSpec.meta = elementMeta;
101 | 					type.addAll(elementSpec);
102 | 				}
103 | 			}
104 | 		}
105 | 
106 | 		return(type);
107 | 	}
108 | 
109 | 	typeTbl: { [ typeName: string ]: SimpleType | ComplexType } = {};
110 | 
111 | 	document: ComplexType;
112 | 
113 | }
114 | 


--------------------------------------------------------------------------------
/src/builder/Builder.ts:
--------------------------------------------------------------------------------
  1 | import { Namespace } from '../Namespace';
  2 | import { TokenChunk } from '../parser/TokenChunk';
  3 | import { Token, TokenBuffer, TokenKind, OpenToken, CloseToken, StringToken } from '../parser/Token';
  4 | import { ParserConfig, ParserOptions } from '../parser/ParserConfig';
  5 | import { Parser } from '../parser/Parser';
  6 | import { SimpleSchema, SimpleSchemaSpecTbl } from '../schema/SimpleSchema';
  7 | import { RuleSet, Rule, RuleMember } from './RuleSet';
  8 | 
  9 | import { ComplexType } from '../schema/ComplexType';
 10 | import { ElementInstance, ElementSpec, ElementMeta, ElementConstructor } from '../schema/Element';
 11 | import { ElementToken } from '../parser/Token';
 12 | import { BuilderConfig } from './BuilderConfig';
 13 | 
/** States stored in Builder's `state` field between tokens. */
const enum State {
	ELEMENT = 0,
	PROCESSING,
	TEXT,
	COMMENT
}
 20 | 
/**
 * Builds a tree of JavaScript objects from a stream of parsed tokens,
 * guided by the rules compiled for one XML namespace.
 */
export class Builder {

	/**
	 * @param config Configuration holding compiled rule sets and options.
	 * @param nsUri URI of the namespace whose root rule applies to the document.
	 * @throws Error if no rule set is registered for the namespace.
	 */
	constructor(private config: BuilderConfig, public nsUri: string) {
		const ruleSet = this.config.ruleSetTbl[nsUri];

		if(!ruleSet) throw(new Error('Unknown XML namespace ' + nsUri));

		this.rule = ruleSet.rootRule;
	}

	/**
	 * Get a constructor for elements not covered by any rule. Element specs
	 * are registered in `unknownType` by token ID on first use and re-used
	 * for later tokens with the same ID.
	 */
	getUnknownProto(token: ElementToken) {
		let elementSpec: ElementSpec | undefined = this.unknownType.elements && this.unknownType.elements.group!.tbl[token.id!] as ElementSpec;

		if(!elementSpec) {
			elementSpec = new ElementSpec(0, Infinity);
			const elementMeta = new ElementMeta(token);

			elementMeta.type = new ComplexType();
			elementSpec.meta = elementMeta;

			this.unknownType.addAll(elementSpec);
		}

		return(elementSpec.meta!.createProto());
	}

	/**
	 * Consume one chunk of tokens, extending the document built so far.
	 * State is carried over between calls, so an element may span chunks.
	 * The chunk is freed afterwards.
	 *
	 * @param chunk Tokens and strings to process; a missing chunk is ignored.
	 * @return The document object (possibly still incomplete), or undefined
	 * if no chunk was given.
	 */
	write(chunk: TokenChunk) {
		if(!chunk) return;

		// Work on local copies of the builder state and store them back at the end.
		const parseUnknown = this.config.options.parseUnknown;
		let unknownDepth = this.unknownDepth;
		let state = this.state;
		let item = this.item;
		let rule = this.rule;
		let member = this.member;
		let target = this.target;
		let stackPos = this.stackPos;

		const ruleStack = this.ruleStack;
		const itemStack = this.itemStack;

		const buffer = chunk.buffer;
		let token: typeof buffer[0];
		let dataType: string;
		let kind: number;
		let id: number;
		let name: string;

		let itemNext: any;
		let ruleNext: Rule | undefined;

		let lastNum = chunk.length - 1;
		let tokenNum = -1;

		while(tokenNum < lastNum) {

			token = buffer[++tokenNum];
			dataType = typeof(token);

			if(unknownDepth) {
				// Inside an ignored element: only track nesting
				// to know when it ends.
				if(dataType == 'object') {
					kind = (token as Token).kind;

					if(kind == TokenKind.open) ++unknownDepth;
					else if(kind == TokenKind.close) --unknownDepth;
				}
			} else if(dataType == 'object') {
				kind = (token as Token).kind;

				switch(kind) {
					case TokenKind.open:

						id = (token as OpenToken).id!;
						name = (token as OpenToken).name;
						member = rule && rule.elements[id];

						if(member) {
							ruleNext = member.rule;

							if(ruleNext == Rule.string) {
								// NOTE: If the string element has attributes,
								// they're added to its parent element!
								target = name;
								itemNext = item;
							} else {
								itemNext = new ruleNext.XMLType();
								if(member.max > 1) {
									// Check own properties only, so an inherited
									// value does not count as already present.
									if(!item.hasOwnProperty(name)) item[name] = [];
									item[name].push(itemNext);
								} else item[name] = itemNext;
							}
						} else if(!parseUnknown) {
							// Skip the unknown element and everything inside it.
							++unknownDepth;

							state = State.TEXT;
							break;
						} else {

							ruleNext = void 0;
							itemNext = new (this.getUnknownProto(token as OpenToken))();

							// Unknown elements become arrays only when repeated.
							if(!item.hasOwnProperty(name)) item[name] = itemNext;
							else if(item[name] instanceof Array) item[name].push(itemNext);
							else item[name] = [item[name], itemNext];
						}

						// Remember the parent to return to when the element closes.
						itemStack[stackPos] = item;
						ruleStack[stackPos++] = rule;
						item = itemNext;
						rule = ruleNext;

						state = State.ELEMENT;
						break;

					case TokenKind.close:

						item = itemStack[--stackPos];
						rule = ruleStack[stackPos];

					// Fallthru
					case TokenKind.emitted:

						// Text content is stored under "$", except inside string
						// elements where the target set on opening is kept.
						if(rule != Rule.string) target = '$';

						state = State.TEXT;
						break;

					case TokenKind.string:

						id = (token as StringToken).id!;
						member = rule && rule.attributes[id];
						if(member || parseUnknown) {
							target = (token as StringToken).name;
						} else {
							// Unknown attribute: its value will be dropped.
							target = void 0;
						}

						break;

					case TokenKind.comment:

						state = State.COMMENT;
						break;
				}
			} else {
				// Plain value: store it under the pending target, if any.
				switch(state) {
					case State.TEXT:
					case State.ELEMENT:

						if(target) {
							// Members allowing several values are split on spaces.
							item[target] = (member && member.max > 1) ? (token + '').split(/ +/) : token;
							target = void 0;
						}

						break;
				}
			}
		}

		this.unknownDepth = unknownDepth;
		this.state = state;
		this.item = item;
		this.rule = rule;
		this.member = member;
		this.target = target;
		this.stackPos = stackPos;

		chunk.free();

		return(this.document);
	}

	/** Root object receiving the parsed contents. */
	document: any = {};
	/** Object representing the element currently being filled. */
	private item = this.document;
	/** Rule for the current element, if it is a known one. */
	private rule?: Rule;
	/** Rule member matched by the latest element or attribute lookup. */
	private member?: RuleMember;
	/** Property name that the next plain value gets stored under. */
	private target?: string;

	/** Collects element specs created for unknown elements. */
	private unknownType = new ComplexType();
	/** Nesting depth inside an ignored unknown element, 0 when not ignoring. */
	private unknownDepth = 0;

	/** Number of entries in use on ruleStack and itemStack. */
	private stackPos = 0;
	private ruleStack: (Rule | undefined)[] = [];
	private itemStack: any[] = [];

	private state = State.TEXT;

}
209 | 


--------------------------------------------------------------------------------
/src/tokenizer/Patricia.ts:
--------------------------------------------------------------------------------
  1 | import { ArrayType, concatArray } from '../Buffer';
  2 | import { InternalToken } from '../parser/InternalToken';
  3 | 
  4 | class Node {
  5 | 	constructor(
  6 | 		public token: InternalToken | null,
  7 | 		public buf: ArrayType,
  8 | 		public len: number,
  9 | 		public first?: Node,
 10 | 		public second?: Node
 11 | 	) {}
 12 | 
 13 | 	clone(): Node {
 14 | 		const other = new Node(
 15 | 			this.token,
 16 | 			this.buf,
 17 | 			this.len,
 18 | 			this.first && this.first.clone(),
 19 | 			this.second && this.second.clone()
 20 | 		);
 21 | 
 22 | 		return(other);
 23 | 	}
 24 | }
 25 | 
/**
 * Maximum number of bits per encoded node (number must fit in 1 byte).
 * Longer nodes are encoded in several parts.
 */
const MAX_LEN = 255; // Test edge cases by using smaller numbers (>= 8) here!

/**
 * Reference value written for node parts having no token of their own.
 * Must equal Patricia :: notFound on C++ side.
 */
export const NOT_FOUND = 0x7fffff;
 31 | 
/** Position inside a trie of Node objects, advanced one input byte at a time. */
class PatriciaCursor {
	/** @param node Node to start from. */
	constructor(public node: Node) {
		this.pos = 0;
		this.len = node.len;
	}

	/**
	 * Try to consume one input byte.
	 *
	 * @param c Byte to match.
	 * @return True if the byte was matched and the cursor moved past it.
	 * On a mismatch the cursor is left at the node and byte where matching
	 * failed. If the current node is fully consumed and has no first child,
	 * false is returned with the cursor unchanged.
	 */
	advance(c: number) {
		let node = this.node;
		let b = node.buf;
		let p = this.pos;
		let len = this.len;
		let delta = 0;

		// Fewer than 8 bits left in the node: the byte continues in a child.
		while(len < 8) {
			if(len) {
				// Compare only the leading bits held by this node.
				delta = (c ^ b[p++]) >> (7 - len);
			} else {
				if(!node.first) return(false);
				delta = 0;
			}

			if(delta) {
				if(delta > 1) {
					// Mismatch in a bit before the one this node branches on.
					this.node = node;
					this.pos = p - 1;
					this.len = len;

					return(false);
				}

				// Only the branching bit differs.
				node = node.second!;
			} else {
				node = node.first!;
			}

			b = node.buf;
			p = 0;
			len = node.len;
		}

		if(c != b[p++]) {
			this.node = node;
			this.pos = p - 1;
			this.len = len;

			return(false);
		}

		len -= 8;

		this.node = node;
		this.pos = p;
		this.len = len;

		return(true);
	}

	/** Index of the next byte to match in the current node's buffer. */
	pos: number;
	/** Number of bits left to match in the current node. */
	len: number;
}
 92 | 
 93 | export class Patricia {
	/** Create a new trie holding a copy of this trie's nodes (an empty trie if there is no root). */
	clone() {
		const copy = new Patricia();
		const root = this.root;

		if(root) {
			copy.root = root.clone();
		}

		return(copy);
	}
101 | 
	/**
	 * Insert a token into the trie, keyed by the bytes in its `buf`.
	 *
	 * @param token Token to insert; its name must not be empty.
	 * @throws Error if the name is empty or the same bytes were already inserted.
	 */
	insertNode(token: InternalToken) {
		let pos = 0;
		let root = this.root;

		if(!token.name) {
			throw(new Error('Empty strings not supported'));
		}

		if(!root) {
			// First insert: a single node holds the entire string.
			root = new Node(token, token.buf, token.buf.length * 8);
			this.root = root;
			return;
		}

		let cursor = new PatriciaCursor(root);

		// Follow existing nodes as far as they match the new string.
		while(pos < token.buf.length && cursor.advance(token.buf[pos])) ++pos;

		const node = cursor.node;
		let rest: Node | undefined;

		if(pos < token.buf.length) {
			// Node for the part of the new string not found in the trie.
			rest = new Node(
				token,
				token.buf.slice(pos),
				(token.buf.length - pos) * 8
			);
		}

		if(cursor.len) {
			// Matching stopped in the middle of a node.
			let bit = 0;

			if(rest) {
				// Count leading bits shared by the first differing byte.
				let c = token.buf[pos] ^ node.buf[cursor.pos];

				while(!(c & 0x80)) {
					c <<= 1;
					++bit;
				}
			} else {
				// The new node is a prefix of this node.
				// Cut this node at a byte boundary.
			}

			// Split the node.

			// The first child takes over the old contents from the
			// split point on, together with the old token and children.
			node.first = new Node(
				node.token!,
				node.buf.slice(cursor.pos),
				node.len - cursor.pos * 8,
				node.first,
				node.second
			);

			node.second = rest;

			// A node cut inside a byte has no token of its own.
			node.token = rest ? null : token;
			node.buf = node.buf.slice(0, cursor.pos + ((bit + 7) >> 3));
			node.len = cursor.pos * 8 + bit;
		} else if(!rest) {
			throw(new Error('Duplicates not supported: ' + token.name));
		} else {
			// The new node only extends an existing node.
			node.first = rest;
		}
	}
168 | 
169 | 	insertList(tokenList: InternalToken[]) {
170 | 		for(let token of tokenList) {
171 | 			this.insertNode(token);
172 | 		}
173 | 
174 | 		// Verify that the tokens were correctly inserted!
175 | 
176 | 		for(let token of tokenList) {
177 | 			let pos = 0;
178 | 			let root = this.root;
179 | 
180 | 			let cursor = new PatriciaCursor(root);
181 | 
182 | 			while(pos < token.buf.length) {
183 | 				if(!cursor.advance(token.buf[pos++])) {
184 | 					throw(new Error('Inserted token missing: ' + token.name));
185 | 				}
186 | 			}
187 | 
188 | 			if(cursor.node.token != token) {
189 | 				throw(new Error('Wrong token inserted for: ' + token.name));
190 | 			}
191 | 		}
192 | 	}
193 | 
	/** Recursively serialize a node and its subtree, appending the encoded
	  * pieces to dataList in output order.
	  *
	  * Each emitted piece is laid out as: 1 header byte holding the bit length
	  * of the piece, the key bytes, then a 3-byte big-endian reference.
	  * A node longer than MAX_LEN bits is emitted as several pieces; every
	  * piece except the last carries NOT_FOUND as its reference.
	  *
	  * Note that a node whose len is 0 never enters the loop: nothing is
	  * emitted for it and its children are not visited.
	  *
	  * @param node Node to encode.
	  * @param dataList Receives one array per emitted piece.
	  * @return Total number of bytes emitted for this node and its subtree. */
	private static encodeNode(
		node: Node,
		dataList: ArrayType[]
	) {
		// Remaining bit length of the node still to be emitted.
		let len = node.len;
		let partLen: number;
		let byteLen: number;
		let totalByteLen = 0;
		// Read position in node.buf, pre-incremented before each read.
		let posIn = -1;
		let posOut: number;

		while(len) {
			partLen = len;
			// Oversized pieces are cut at a whole-byte boundary.
			if(partLen > MAX_LEN) partLen = MAX_LEN & ~7;

			// Convert bit to byte length rounding up, add 1 byte for length
			// header and 3 bytes for reference
			// (token ID or offset to second child).
			byteLen = (partLen + 7) >> 3;
			const data = new ArrayType(byteLen + 4);

			// Pushed before recursing, so this piece precedes its children.
			dataList.push(data);
			totalByteLen += byteLen + 4;

			posOut = 0;

			data[posOut] = partLen;
			while(posOut < byteLen) data[++posOut] = node.buf[++posIn];

			let ref: number;

			if(len > MAX_LEN) {
				// More pieces of this same node follow.
				ref = NOT_FOUND;
			} else {
				// Last piece: the first child (if any) is encoded directly
				// after it, followed by the second child.
				let nextTotalLen = 0;
				if(node.first) nextTotalLen += Patricia.encodeNode(node.first, dataList);

				if(node.second) {
					// Encoded size of the first subtree plus 3.
					ref = nextTotalLen + 3;
					nextTotalLen += Patricia.encodeNode(node.second, dataList);
				} else {
					// ref = tokenSet.encode(node.token!) || 0;
					ref = node.token!.id;
					// High bit flags a node without any children.
					if(!node.first) ref |= 0x800000; // See 0x80 in PatriciaCursor.cc
				}

				totalByteLen += nextTotalLen;
			}

			// Store the reference in 3 bytes, most significant first.
			data[++posOut] = ref >> 16;
			data[++posOut] = ref >> 8;
			data[++posOut] = ref;

			len -= partLen;
		}

		return(totalByteLen);
	}
252 | 
253 | 	encode() {
254 | 		const dataList: ArrayType[] = [];
255 | 
256 | 		// Encode trie contents into a buffer.
257 | 		const dataLen = Patricia.encodeNode(
258 | 			this.root || Patricia.sentinel,
259 | 			dataList
260 | 		);
261 | 
262 | 		return(concatArray(dataList, dataLen));
263 | 	}
264 | 
	/** Represents the root of an empty tree.
	  * Built from InternalToken.empty, with the node length given in bits. */
	private static sentinel = new Node(
		InternalToken.empty,
		InternalToken.empty.buf,
		InternalToken.empty.buf.length * 8
	);

	/** Root node of the trie. May be unset: encode() substitutes the
	  * sentinel when this is falsy. */
	private root: Node;
273 | }
274 | 


--------------------------------------------------------------------------------
/lib/Parser.h:
--------------------------------------------------------------------------------
  1 | #pragma once
  2 | 
  3 | #include <vector>
  4 | 
  5 | #include <nbind/api.h>
  6 | 
  7 | #include "Namespace.h"
  8 | #include "PatriciaCursor.h"
  9 | #include "ParserConfig.h"
 10 | 
/** Namespace-related bookkeeping for one element.
  * NOTE(review): not referenced anywhere else in this header — confirm it
  * is still in use. */
struct ParserState {

	/** Flag whether the opening tag had a namespace prefix. */
	bool isQualified;
	/** Namespace of this element. */
	Namespace *nsElement;
	/** Default xmlns before entering this element. */
	Namespace *nsOuterDefault;
	/** Number of new xmlns mappings made by this element. */
	uint32_t xmlnsMapCount;

};
 23 | 
/** Pairs an xmlns prefix ID with a namespace ID.
  * Used both for the prefix of the current element / attribute and for
  * entries of Parser::prefixStack, where it stores the binding a prefix had
  * before being rebound. */
struct PrefixDefinition {

	/** Both IDs default to 0. */
	PrefixDefinition(uint32_t idPrefix = 0, uint32_t idNamespace = 0) :
	idPrefix(idPrefix), idNamespace(idNamespace) {}

	uint32_t idPrefix;
	uint32_t idNamespace;

};
 33 | 
/** Entry of Parser::elementStack, one per currently open element. */
struct Element {

	Element(size_t prefixStackOffset, uint32_t crc32) :
	prefixStackOffset(prefixStackOffset), crc32(crc32) {}

	/** Size of the prefix stack when the element was opened. Entries above
	  * this offset are popped again when the element closes. */
	size_t prefixStackOffset;
	// TODO: verify open and close tags match by using CRC.
	/** Currently always constructed as 0 by Parser::updateElementStack. */
	uint32_t crc32;

};
 44 | 
 45 | /** Fast streaming XML parser. */
 46 | 
 47 | class Parser {
 48 | 
 49 | public:
 50 | 
 51 | 	static constexpr uint32_t namespacePrefixTblSize = ParserConfig :: namespacePrefixTblSize;
 52 | 
	/** Parser states.
	  * Besides the current state, several members declared below (matchState,
	  * noMatchState, nextState, otherState, afterValueState, ...) hold values
	  * of this type naming the state to continue from later. */

	enum class State : uint32_t {
		BEGIN,
		MATCH, MATCH_SPARSE, QUOTE,
		BEFORE_TEXT, TEXT,
		BEFORE_CDATA, CDATA,
		AFTER_LT,
		BEFORE_NAME, MATCH_TRIE, NAME, UNKNOWN_NAME,
		STORE_ELEMENT_NAME, AFTER_ELEMENT_NAME,
		AFTER_CLOSE_ELEMENT_NAME,
		BEFORE_ATTRIBUTE_VALUE, AFTER_ATTRIBUTE_VALUE,
		DEFINE_XMLNS_BEFORE_PREFIX_NAME, DEFINE_XMLNS_AFTER_PREFIX_NAME,
		BEFORE_VALUE, VALUE, UNKNOWN_VALUE, DEFINE_XMLNS_AFTER_URI,
		BEFORE_SGML, SGML_DECLARATION,
		AFTER_PROCESSING_NAME, AFTER_PROCESSING_VALUE,
		BEFORE_COMMENT, COMMENT,
		EXPECT,
		PARSE_ERROR
	};
 73 | 
	/** Kind of tag being parsed; stored in the tagType member. */
	enum class TagType : uint32_t {
		ELEMENT,
		SGML_DECLARATION,
		PROCESSING
	};
 79 | 
	/** What a name currently being matched refers to; stored in the
	  * matchTarget member. */
	enum class MatchTarget : uint32_t {
		ELEMENT,
		ELEMENT_NAMESPACE,
		ATTRIBUTE,
		ATTRIBUTE_NAMESPACE
	};
 86 | 
 87 | 	static constexpr unsigned int TOKEN_SHIFT = 5;
 88 | 
	// The token and error type enums are shared with the TypeScript code by
	// #including the .ts sources directly. The macros below blank out the
	// `export` and `const` keywords, turn `enum` into `enum class` and rename
	// each enum to include a uint32_t underlying type, so the included text
	// parses as a C++ declaration (presumably each file holds a single
	// `export const enum` — confirm against the .ts sources).
	// Nothing between the #define and #undef lines may rely on `const`.
	#define export
	#define const
	#define enum enum class

	// CodeType on the TypeScript side is called TokenType here.
	#define CodeType TokenType : uint32_t
	#include "../src/tokenizer/CodeType.ts"
	#undef CodeType

	#define ErrorType ErrorType : uint32_t
	#include "../src/tokenizer/ErrorType.ts"
	#undef ErrorType

	#undef enum
	#undef const
	#undef export
104 | 
105 | 	Parser(const ParserConfig &config);
106 | 
107 | 	ErrorType destroy();
108 | 
109 | 	ParserConfig *getConfig() { return(&config); }
110 | 
111 | 	/** Parse a chunk of incoming data. */
112 | 	ErrorType parse(nbind::Buffer chunk);
113 | 
	/** Set the output buffer for tokens and the callback invoked whenever
	  * that buffer needs to be flushed.
	  * @param tokenBuffer Buffer to write tokens into, used as an array of
	  * 32-bit words.
	  * @param flushTokens Callback to call when the buffer is full. A copy is
	  * stored in the member of the same name. */
	void setCodeBuffer(nbind::Buffer tokenBuffer, nbind::cbFunction &flushTokens) {
		// The parameter shadows the member, hence the explicit this->.
		this->flushTokens = std::unique_ptr<nbind::cbFunction>(new nbind::cbFunction(flushTokens));
		this->tokenBuffer = tokenBuffer;

		tokenList = reinterpret_cast<uint32_t *>(tokenBuffer.data());
		// Buffer length is in bytes, each token slot takes 4.
		tokenBufferEnd = tokenList + tokenBuffer.length() / 4;

		// This resets the caller's callback object (the parameter),
		// not the copy stored above.
		flushTokens.reset();
	}
123 | 
	/** Call the flush callback, then restart writing at the beginning
	  * of the token buffer.
	  * @param tokenPtr Write position, reset to the first slot after the
	  * token count stored in slot 0. */
	inline void flush(uint32_t *&tokenPtr) {
		(*flushTokens)();
		// Slot 0 holds the number of tokens in the buffer.
		tokenList[0] = 0;
		tokenPtr = tokenList + 1;
	}
129 | 
130 | 	bool updateElementStack(TokenType nameTokenType) {
131 | 		if(nameTokenType == TokenType :: OPEN_ELEMENT_ID) {
132 | 			// TODO: Ensure stack is not too large.
133 | 			elementStack.emplace_back(prefixStack.size(), 0);
134 | 		} else if(nameTokenType == TokenType :: CLOSE_ELEMENT_ID) {
135 | 			if(elementStack.empty()) return(false);
136 | 
137 | 			const Element &element = elementStack.back();
138 | 			size_t oldSize = element.prefixStackOffset;
139 | 
140 | 			for(size_t size = prefixStack.size(); size > oldSize; --size) {
141 | 				const PrefixDefinition &old = prefixStack.back();
142 | 				Namespace *ns = config.namespaceList[old.idNamespace].get();
143 | 				// For efficiency, never undefine an xmlns prefix
144 | 				// because it may be redefined identically later.
145 | 				if(ns) {
146 | 					config.namespacePrefixTbl[old.idPrefix] = std::make_pair(old.idNamespace, ns);
147 | 				}
148 | 				prefixStack.pop_back();
149 | 			}
150 | 
151 | 			elementStack.pop_back();
152 | 		}
153 | 
154 | 		return(true);
155 | 	}
156 | 
	/** Output a token. This is the only function writing to memory, so safety
	  * from code execution exploits depends on this and nothing else.
	  * @param kind Token type, stored in the low TOKEN_SHIFT bits.
	  * @param token Token payload, shifted left by TOKEN_SHIFT bits.
	  * @param tokenPtr Write position in the token buffer, advanced by one
	  * slot (after being reset first if the buffer was full). */

	inline void writeToken(TokenType kind, uint32_t token, uint32_t *&tokenPtr) {
		// NOTE(review): after a flush tokenPtr is tokenList + 1, so this
		// assumes the buffer has room for at least 2 slots — confirm.
		if(tokenPtr >= tokenBufferEnd) flush(tokenPtr);

		// Buffer content length is stored at its beginning.
		++tokenList[0];

		// This must never write outside the range
		// from tokenList to tokenBufferEnd (exclusive).
		*tokenPtr++ = static_cast<uint32_t>(kind) + (token << TOKEN_SHIFT);
	}
170 | 
	/** Record the prefix of the member (element or attribute) currently
	  * pointed to by memberPrefix.
	  * The member's namespace is set to the one the empty prefix is currently
	  * bound to, regardless of the prefix given.
	  * @param idPrefix Prefix ID. Only copied to the idPrefix member if it is
	  * below namespacePrefixTblSize, but always stored in *memberPrefix. */
	void setPrefix(uint32_t idPrefix) {
		if(idPrefix < namespacePrefixTblSize) this->idPrefix = idPrefix;
		memberPrefix->idPrefix = idPrefix;
		memberPrefix->idNamespace = config.namespacePrefixTbl[config.emptyPrefixToken].first;
	}
176 | 
	/** Bind an xmlns prefix to a namespace URI, remembering the previous
	  * binding so that it can be restored when the element closes.
	  * @param idPrefix ID of the prefix to bind.
	  * @param uri ID of the namespace URI token.
	  * @return Result of ParserConfig::bindPrefix; the old binding is only
	  * saved when that call returned true. */
	bool bindPrefix(uint32_t idPrefix, uint32_t uri) {
		// NOTE(review): the table is read before config.bindPrefix gets a
		// chance to validate idPrefix — confirm callers keep it in range.
		uint32_t nsOld = config.namespacePrefixTbl[idPrefix].first;

		if(config.bindPrefix(idPrefix, uri)) {
			// Push old prefix binding to stack, to restore it after closing tag.
			prefixStack.emplace_back(idPrefix, nsOld);
			// The current element may use the prefix just rebound,
			// so refresh its namespace.
			if(elementPrefix.idPrefix == idPrefix) {
				elementPrefix.idNamespace = config.namespacePrefixTbl[idPrefix].first;
			}
			return(true);
		}

		return(false);
	}
191 | 
192 | 	bool addUri(uint32_t uri, uint32_t idNamespace);
193 | 
194 | 	// Emit content for a partially matched token.
195 | 	// If the input buffer was drained, emit the match length and some
196 | 	// valid token beginning identically, to recover the complete name.
197 | 	inline void emitPartialName(
198 | 		const unsigned char *p,
199 | 		size_t offset,
200 | 		TokenType tokenType,
201 | 		uint32_t *&tokenPtr
202 | 	);
203 | 
204 | 	inline void updateRowCol(unsigned char c);
205 | 
206 | 	inline uint32_t getRow() { return(row); }
207 | 	inline uint32_t getCol() { return(col); }
208 | 
209 | 	ParserConfig config;
210 | 
211 | 	/** Prefix and namespace of current element. */
212 | 	PrefixDefinition elementPrefix;
213 | 	PrefixDefinition attributePrefix;
214 | 	PrefixDefinition *memberPrefix = &attributePrefix;
215 | 
216 | 	std::vector<PrefixDefinition> prefixStack;
217 | 	std::vector<Element> elementStack;
218 | 
219 | 	PatriciaCursor cursor;
220 | 
221 | 	unsigned char *nameCharTbl;
222 | 	unsigned char *nameStartCharTbl;
223 | 	const char *pattern;
224 | 
225 | 	State state;
226 | 	State matchState;
227 | 	State noMatchState;
228 | 	State partialMatchState;
229 | 	State afterNameState;
230 | 	State afterTextState;
231 | 	State afterMatchTrieState;
232 | 	/** Next state after reading an element, attribute or processing instruction
233 | 	  * name, a text node or an attribute value. */
234 | 	State nextState;
235 | 	/** Next state if the current character was not the expected one. */
236 | 	State otherState;
237 | 	/** Next state after reading an attribute value. Regular elements and
238 | 	  * processing instructions need different handling. */
239 | 	State afterValueState;
240 | 	/** Flag whether the previously emitted name was found in a trie. */
241 | 	bool knownName;
242 | 
243 | 	TagType tagType;
244 | 	MatchTarget matchTarget;
245 | 
246 | 	unsigned char textEndChar;
247 | 
248 | 	/** Expected character for moving to another state. */
249 | 	unsigned char expected;
250 | 
251 | 	size_t pos;
252 | 
253 | 	uint32_t row;
254 | 	uint32_t col;
255 | 
256 | 	uint32_t idToken;
257 | 	uint32_t idPrefix;
258 | 
259 | 	uint32_t idElement;
260 | 
261 | 	uint32_t sgmlNesting;
262 | 
263 | 	TokenType nameTokenType;
264 | 	TokenType textTokenType;
265 | 	TokenType valueTokenType;
266 | 	const unsigned char *tokenStart;
267 | 
268 | 	// TODO: Maybe this could be std::function<void ()>
269 | 	std::unique_ptr<nbind::cbFunction> flushTokens;
270 | 
271 | 	Patricia Namespace :: *trie;
272 | 
273 | 	nbind::Buffer tokenBuffer;
274 | 	uint32_t *tokenList;
275 | 	const uint32_t *tokenBufferEnd;
276 | 
277 | };
278 | 


--------------------------------------------------------------------------------
/src/writer/Writer.ts:
--------------------------------------------------------------------------------
  1 | import * as stream from 'stream';
  2 | 
  3 | import { Namespace } from '../Namespace';
  4 | import { TokenChunk } from '../parser/TokenChunk';
  5 | import { Token, TokenBuffer, TokenKind, MemberToken, SgmlToken } from '../parser/Token';
  6 | 
/** Limits for the nesting depth tracked by Writer. */
export const enum Indent {
	/** Initial depth of a Writer, before any element has been opened. */
	MIN_DEPTH = 1,
	/** Number of array slots used to build indentPattern. */
	MAX_DEPTH = 256
}
 11 | 
/** Writer state, deciding how the next non-token value in the buffer
  * is serialized and whether a closing tag gets indented. */
export const enum State {
	/** After an element open token; values are written as quoted
	  * attribute values. */
	ELEMENT = 0,
	/** After opening a processing instruction; values are quoted as well
	  * and the closing token is written as "?>". */
	PROCESSING,
	/** Set after a nested SGML section ends. */
	SGML,
	/** Next value is written quoted, preceded by the SGML separator. */
	SGML_TEXT,
	/** Values are written verbatim as text. */
	TEXT,
	/** Text was just written; a following closing tag is not indented. */
	AFTER_TEXT,
	/** Next value is written after indentation and "<!--". */
	COMMENT,
	/** Next value is wrapped in a CDATA section. */
	CDATA
}
 22 | 
/** A line break followed by MAX_DEPTH - 1 tabs. Writer takes prefixes of
  * this string as indentation, one character per depth level. */
export const indentPattern = '\n' + new Array(Indent.MAX_DEPTH).join('\t');
 24 | 
 25 | export class Writer extends stream.Transform {
 26 | 
	/** Create a transform stream running in object mode.
	  * @param data Arbitrary data passed to any custom serializers. */

	constructor(private data?: any) {
		super({ objectMode: true });
	}
 32 | 
	/** Serialize one chunk of tokens into XML text fragments.
	  * Writer state is copied into locals on entry and stored back on exit,
	  * so output can continue seamlessly in the next chunk.
	  * @param chunk Token chunk or plain token buffer to serialize.
	  * A string is appended to the output as-is.
	  * @param partList Array receiving the output fragments.
	  * @return The partList passed in. */
	transform(chunk: TokenChunk | TokenBuffer | string, partList: string[]) {
		const prefixList = this.prefixList;
		// Number of calls before this one (strings are counted too).
		const chunkCount = this.chunkCount++;
		let buffer: TokenBuffer;
		let state = this.state;
		let depth = this.depth;
		let indent = this.indent;
		let nsElement = this.nsElement;
		let token: typeof buffer[0];
		let member: MemberToken;
		let prefix: string;
		let serialized: string | TokenBuffer;

		// Index of the last fragment already in partList.
		let partNum = partList.length - 1;
		let lastNum = chunk.length - 1;
		let tokenNum = -1;
		let namespaceList: (Namespace | undefined)[] | undefined;

		if(typeof(chunk) == 'string') {
			partList.push(chunk);
			return(partList);
		} else if(chunk instanceof TokenChunk) {
			buffer = chunk.buffer;
			namespaceList = chunk.namespaceList;
		} else {
			buffer = chunk;
		}

		if(!chunkCount) {
			// First call: set up prefixes for all namespaces. Without a list
			// from the chunk, collect namespaces from tokens in the buffer.
			if(!namespaceList) {
				namespaceList = [];

				while(tokenNum < lastNum) {
					token = buffer[++tokenNum];

					if(token instanceof MemberToken) {
						namespaceList[token.ns.id] = token.ns;
					}
				}

				// Rewind for the serialization pass below.
				tokenNum = -1;
			}

			this.copyPrefixes(namespaceList);
		}

		while(tokenNum < lastNum) {

			token = buffer[++tokenNum];

			if(token instanceof Token) {
				switch(token.kind) {
					case TokenKind.open:

						member = token as MemberToken;
						nsElement = member.ns;
						partList[++partNum] = indent + '<' + prefixList[nsElement.id] + member.name;

						if(nsElement.isSpecial && nsElement.defaultPrefix == '?') {
							state = State.PROCESSING;
						} else {
							// Elements opened at the initial depth get all
							// xmlns definitions.
							if(depth++ == Indent.MIN_DEPTH) partList[++partNum] = this.xmlnsDefinitions;
							state = State.ELEMENT;
						}

						indent = indentPattern.substr(0, depth);
						break;

					case TokenKind.sgmlEmitted:

						this.sgmlSeparator = '<!';

					// Fallthru
					case TokenKind.emitted:

						partList[++partNum] = '>';

						state = State.TEXT;
						break;

					case TokenKind.close:

						if(state == State.PROCESSING) {
							partList[++partNum] = '?>';
						} else {
							member = token as MemberToken;
							indent = indentPattern.substr(0, --depth);

							if(state == State.ELEMENT) {
								// Still inside the opening tag,
								// so the element is empty.
								partList[++partNum] = '/>';
							} else {
								// Closing tag stays on the same line
								// directly after text.
								if(state != State.AFTER_TEXT) partList[++partNum] = indent;
								partList[++partNum] = '</' + prefixList[member.ns.id] + member.name + '>'
							}
						}

						state = State.TEXT;
						break;

					case TokenKind.string:

						member = token as MemberToken;
						// Omit prefixes for attributes in the same namespace
						// as their parent element.
						if(member.ns == nsElement) prefix = '';
						else prefix = prefixList[member.ns.id];

						partList[++partNum] = ' ' + prefix + member.name + '=';
						break;

					case TokenKind.sgml:

						prefix = (token as SgmlToken).prefix;

						// A colon is only added after a non-empty prefix.
						partList[++partNum] = this.sgmlSeparator + prefix + (prefix && ':') + (token as SgmlToken).name;
						this.sgmlSeparator = ' ';
						break;

					case TokenKind.comment:

						state = State.COMMENT;
						break;

					case TokenKind.cdata:

						state = State.CDATA;
						break;

					case TokenKind.sgmlNestedStart:

						partList[++partNum] = this.sgmlSeparator + '[';
						this.sgmlSeparator = '<!';
						state = State.TEXT;
						break;

					case TokenKind.sgmlNestedEnd:

						partList[++partNum] = ']';
						this.sgmlSeparator = ' ';
						state = State.SGML;
						break;

					case TokenKind.sgmlText:

						state = State.SGML_TEXT;
						break;

					case TokenKind.other:

						if(token.serialize) {

							serialized = token.serialize(indent, this.data);
							if(typeof(serialized) == 'string') {
								partList[++partNum] = serialized;
								state = State.AFTER_TEXT;
							} else {
								// The custom serializer returned more tokens.
								// Store state and serialize them recursively.
								// NOTE(review): locals are not reloaded after
								// the nested call, so state changes made by
								// it are overwritten on exit — confirm this
								// is intended.
								this.state = state;
								this.depth = depth;
								this.indent = indent;

								this.transform(TokenChunk.allocate(serialized), partList);
								partNum = partList.length - 1;
							}
						}
						break;
				}
			} else {
				// Not a token: a value whose meaning depends on the state.
				switch(state) {
					case State.TEXT:
					case State.AFTER_TEXT:

						partList[++partNum] = '' + token;
						state = State.AFTER_TEXT;
						break;

					case State.CDATA:

						partList[++partNum] = '<![CDATA[' + token + ']]>';
						state = State.AFTER_TEXT;
						break;

					case State.ELEMENT:
					case State.PROCESSING:

						// Attribute value, written without any escaping here.
						partList[++partNum] = '"' + token + '"';
						break;

					case State.COMMENT:

						partList[++partNum] = indent + '<!--' + token;
						state = State.TEXT;
						break;

					case State.SGML_TEXT:

						partList[++partNum] = this.sgmlSeparator + '"' + token + '"';
						this.sgmlSeparator = ' ';
						break;

				}
			}
		}

		this.state = state;
		this.depth = depth;
		this.indent = indent;
		this.nsElement = nsElement;

		if(chunk instanceof TokenChunk) chunk.free();

		return(partList);
	}
245 | 
246 | 	_transform(chunk: TokenChunk | TokenBuffer | null, enc: string, flush: (err: any, chunk: string) => void) {
247 | 		if(!chunk) {
248 | 			flush(null, '');
249 | 			return;
250 | 		}
251 | 
252 | 		const partList: string[] = [];
253 | 
254 | 		if(!this.chunkCount) {
255 | 			const token = chunk instanceof TokenChunk ? chunk.buffer[0] : chunk[0];
256 | 
257 | 			if(
258 | 				!(token instanceof Token) ||
259 | 				token.kind != TokenKind.open ||
260 | 				(token as MemberToken).ns != Namespace.processing ||
261 | 				(token as MemberToken).name != 'xml'
262 | 			) {
263 | 				partList.push('<?xml version="1.0" encoding="utf-8"?>\n');
264 | 			}
265 | 		}
266 | 
267 | 		this.transform(chunk, partList);
268 | 		flush(null, partList.join(''));
269 | 	}
270 | 
	/** Stream interface: terminate the output with a final line break.
	  * @param flush Callback receiving the remaining output. */
	_flush( flush: (err: any, chunk: string) => void) {
		flush(null, '\n');
	}
274 | 
	/** Assign a unique prefix to every namespace in the list and build
	  * the xmlns attributes defining them.
	  * Results are stored in prefixList (indexed like namespaceList),
	  * prefixTbl and xmlnsDefinitions.
	  * @param namespaceList Namespaces to assign prefixes for.
	  * Empty slots are skipped. */
	copyPrefixes(namespaceList: (Namespace | undefined)[]) {
		const prefixTbl = this.prefixTbl;
		const prefixList = this.prefixList;
		let prefix: string | undefined;
		let ns: Namespace | undefined;

		// Add a number to distinguish between duplicate prefix names.

		for(let i = 0; i < namespaceList.length; ++i) {
			ns = namespaceList[i];
			if(!ns) continue;

			prefix = ns.defaultPrefix;
			// Namespaces without a prefix are named in the next pass,
			// unless they are special.
			if(!prefix && !ns.isSpecial) continue;

			if(prefixTbl[prefix]) {
				// Prefix already taken: the first number tried is 2.
				let j = 1;

				do {
					prefix = ns!.defaultPrefix + (++j);
				} while(prefixTbl[prefix]);
			}

			prefixList[i] = prefix;
			// Store index + 1 so that the entry for index 0 is truthy.
			prefixTbl[prefix] = i + 1;
		}

		let j = 0;

		// Name all unnamed prefixes with "p" and a sequence number.

		for(let i = 0; i < namespaceList.length; ++i) {
			ns = namespaceList[i];
			if(!ns) continue;

			prefix = ns.defaultPrefix;
			if(prefix || ns.isSpecial) continue;

			do {
				prefix = 'p' + (++j);
			} while(prefixTbl[prefix]);

			prefixList[i] = prefix;
			prefixTbl[prefix] = i + 1;
		}

		let definitionList: string[] = [];

		// Build xmlns attributes and add the colon separating a prefix from
		// a name. Special namespaces and empty prefixes are left as they are.

		for(let i = 0; i < namespaceList.length; ++i) {
			ns = namespaceList[i];
			prefix = prefixList[i];
			if(!prefix || !ns || ns.isSpecial) continue;

			// The "xml" prefix gets no xmlns definition.
			if(prefix != 'xml') definitionList.push(' xmlns:' + prefix + '="' + ns.uri + '"');
			this.prefixList[i] = prefix + ':';
		}

		this.xmlnsDefinitions = definitionList.join('');
	}
334 | 
	/** Number of transform() calls made so far. */
	private chunkCount = 0;
	/** State carried over between chunks. */
	private state = State.TEXT as State;
	/** Current nesting depth, starting from Indent.MIN_DEPTH. */
	private depth = Indent.MIN_DEPTH;
	/** Indentation written before the next opening tag. */
	private indent = '';
	/** Text written before the next SGML token: '<!' or a space. */
	private sgmlSeparator = '<!';
	/** Namespace of the most recently opened element. */
	private nsElement: Namespace;
	/** Prefix to write for each namespace, indexed like the namespace list
	  * passed to copyPrefixes. Includes the trailing colon for ordinary
	  * (non-special) namespaces. */
	private prefixList: string[] = [];
	/** Maps prefixes in use to their namespace list index + 1. */
	private prefixTbl: { [ key: string ]: number } = {};
	/** xmlns attributes written into elements opened at the initial depth. */
	private xmlnsDefinitions = '';
344 | 
345 | }
346 | 


--------------------------------------------------------------------------------
/src/parser/ParserConfig.ts:
--------------------------------------------------------------------------------
  1 | import { NativeConfig, NativeParser } from './ParserLib';
  2 | 
  3 | import { Namespace } from '../Namespace';
  4 | import { ParserNamespace } from './ParserNamespace';
  5 | import { TokenSpace } from '../tokenizer/TokenSpace';
  6 | import { TokenSet } from '../tokenizer/TokenSet';
  7 | import { InternalToken } from './InternalToken';
  8 | import { TokenChunk } from './TokenChunk';
  9 | import { TokenKind, MemberToken, OpenToken, CloseToken, EmittedToken, StringToken } from './Token';
 10 | import { Parser } from './Parser';
 11 | import { XModuleTable } from './JSX';
 12 | 
/** Options accepted by the ParserConfig constructor. */
export interface ParserOptions {
	/** NOTE(review): not read anywhere in this file, presumably used by
	  * the parser itself — confirm. */
	parseUnknown?: boolean;
	/** If set, the constructor does not bind the default namespaces
	  * (processing, unknown and xml1998). */
	omitDefaults?: boolean;
}
 17 | 
/** Input for ParserConfig.registerTokens: element and attribute names to
  * register, grouped by namespace and keyed by prefix. */
export interface TokenTbl {
	[ prefix: string ]: {
		/** URI of the namespace. If unknown to the config,
		  * a new namespace is created using the prefix. */
		uri: string,
		/** Names of elements to register. */
		elements?: string[],
		/** Names of attributes to register. */
		attributes?: string[]
	}
}
 25 | 
/** Result of ParserConfig.registerTokens: for each prefix, the registered
  * tokens keyed by both their name and their ID. */
export interface Registry {
	[prefix: string]: {
		[ idOrName: string ]: MemberToken;
	}
}
 31 | 
 32 | /** Parser configuration for quickly instantiating new parsers.
 33 |   * Each parser instance holds a new, cloned copy. */
 34 | 
 35 | export class ParserConfig {
 36 | 
	/** XML parser configuration.
	  * Given another ParserConfig, the new object becomes a linked clone
	  * sharing its token spaces, token sets and namespace tables by reference
	  * (they get copied on the first modification, see unlink).
	  * Otherwise all of them are created from scratch.
	  * @param config Parent object for cloning, or options for a new config.
	  * @param native Reference to C++ object. For internal use only. */
	constructor(config?: ParserOptions | ParserConfig, native?: NativeConfig | null) {
		if(config instanceof ParserConfig) {
			// Mark the shared data as linked in the parent as well.
			config.link();
			this.isLinked = true;

			this.options = config.options;

			this.uriSpace = config.uriSpace;
			this.prefixSpace = config.prefixSpace;
			this.elementSpace = config.elementSpace;
			this.attributeSpace = config.attributeSpace;

			this.xmlnsToken = config.xmlnsToken;

			this.emptyPrefixToken = config.emptyPrefixToken;
			this.xmlnsPrefixToken = config.xmlnsPrefixToken;
			this.processingPrefixToken = config.processingPrefixToken;

			this.uriSet = config.uriSet;
			this.prefixSet = config.prefixSet;

			this.namespaceList = config.namespaceList;
			this.namespaceTbl = config.namespaceTbl;
			this.maxNamespace = config.maxNamespace;

			this.nsMapper = config.nsMapper;
		} else {
			this.isLinked = false;

			this.options = config || {};

			this.uriSpace = new TokenSpace(TokenKind.uri);
			this.prefixSpace = new TokenSpace(TokenKind.prefix);
			this.elementSpace = new TokenSpace(TokenKind.element);
			this.attributeSpace = new TokenSpace(TokenKind.attribute);

			this.xmlnsToken = this.attributeSpace.createToken('xmlns');

			this.uriSet = new TokenSet(this.uriSpace);
			this.prefixSet = new TokenSet(this.prefixSpace);

			this.namespaceList = [];
			this.namespaceTbl = {};
			this.maxNamespace = 0;
		}

		// this.clonedNamespaceCount = this.maxNamespace;

		if(!native) {
			// No C++ object given: create the built-in prefix tokens and
			// a new native config using their IDs.
			// NOTE(review): for a clone without a native object this calls
			// createToken on the prefix set shared with the parent — confirm
			// that is safe.
			this.emptyPrefixToken = this.prefixSet.createToken('');
			this.xmlnsPrefixToken = this.prefixSet.createToken('xmlns');
			this.processingPrefixToken = this.prefixSet.createToken('?');

			native = new NativeConfig(this.xmlnsToken.id, this.emptyPrefixToken.id, this.xmlnsPrefixToken.id, this.processingPrefixToken.id);
		}

		this.native = native;

		// Clones already share the default namespaces of their parent.
		if(!this.isLinked && !this.options.omitDefaults) {
			this.bindNamespace(Namespace.processing);
			this.bindNamespace(Namespace.unknown);
			this.bindNamespace(Namespace.xml1998);
		}
	}
104 | 
105 | 	link() {
106 | 		this.isLinked = true;
107 | 
108 | 		this.uriSpace.link();
109 | 		this.prefixSpace.link();
110 | 		this.elementSpace.link();
111 | 		this.attributeSpace.link();
112 | 
113 | 		this.uriSet.link();
114 | 		this.prefixSet.link();
115 | 	}
116 | 
	/** Stop sharing data with other configs, replacing all shared objects
	  * with new ones created from them. Called before any modification.
	  * Does nothing unless the config is currently linked. */
	unlink() {
		if(!this.isLinked) return;
		this.isLinked = false;

		// The previously shared object is passed to each constructor.
		this.uriSpace = new TokenSpace(TokenKind.uri, this.uriSpace);
		this.prefixSpace = new TokenSpace(TokenKind.prefix, this.prefixSpace);
		this.elementSpace = new TokenSpace(TokenKind.element, this.elementSpace);
		this.attributeSpace = new TokenSpace(TokenKind.attribute, this.attributeSpace);

		this.uriSet = new TokenSet(this.uriSpace, this.uriSet);
		this.prefixSet = new TokenSet(this.prefixSpace, this.prefixSet);

		// Copy the list and replace every namespace in the copy with
		// a new object owned by this config, keeping its ID.
		const namespaceList = this.namespaceList.slice(0);
		let num = namespaceList.length;

		while(num--) {
			let ns = namespaceList[num];

			// This just skips namespace 0 which never exists
			// (see ParserConfig.cc).
			if(ns) {
				ns = new ParserNamespace(ns, this);
				ns.id = num;
				namespaceList[num] = ns;
			}
		}

		// Rebuild the URI lookup table to point to the new namespaces.
		const namespaceTbl: { [ name: string ]: ParserNamespace } = {};
		for(let key of Object.keys(this.namespaceTbl)) {
			namespaceTbl[key] = namespaceList[this.namespaceTbl[key].id];
		}

		this.namespaceList = namespaceList;
		this.namespaceTbl = namespaceTbl;
	}
152 | 
153 | 	createParser() {
154 | 		// Create a native code parser which clones the native config.
155 | 		const nativeParser = new NativeParser(this.native);
156 | 		// Create a cloned config with a native object shared with the new parser.
157 | 		const config = new ParserConfig(this, nativeParser.getConfig());
158 | 
159 | 		return(new Parser(config, nativeParser));
160 | 	}
161 | 
162 | 	parseSync(data: string) {
163 | 		return(this.createParser().parseSync(data));
164 | 	}
165 | 
166 | 	getNamespace(uri: string) {
167 | 		const ns = this.namespaceTbl[uri];
168 | 		return(ns && ns.base);
169 | 	}
170 | 
	/** Register a namespace with this config and the native parser config,
	  * unless a namespace with the same (mapped) URI already exists.
	  * @param nsBase Namespace to add. Its uri is overwritten with
	  * the result of nsMapper, if one is set and returns a truthy value.
	  * @return ID of the existing or newly added parser namespace. */
	addNamespace(nsBase: Namespace) {
		let uri = (this.nsMapper && this.nsMapper(nsBase.uri)) || nsBase.uri;
		let nsParser = this.namespaceTbl[uri];

		if(nsParser) return(nsParser.id);

		// About to modify data possibly shared with other configs.
		this.unlink();

		nsBase.uri = uri;
		nsParser = new ParserNamespace(nsBase, this);
		// The ID is assigned by the native config.
		nsParser.id = this.native.addNamespace(nsParser.registerNative());

		this.namespaceList[nsParser.id] = nsParser;
		this.namespaceTbl[uri] = nsParser;

		if(nsBase.id > this.maxNamespace) this.maxNamespace = nsBase.id;

		if(nsBase.defaultPrefix) this.addPrefix(nsBase.defaultPrefix);
		this.addUri(uri, nsParser);

		return(nsParser.id);
	}
193 | 
	/** Add a namespace if needed and bind a prefix to it.
	  * @param nsBase Namespace to bind.
	  * @param prefix Prefix to bind to it. An empty string is taken as given,
	  * otherwise a missing prefix is replaced by the default prefix of
	  * the namespace.
	  * @param parser If given, the binding is made through this parser
	  * instead of the config.
	  * @return ID of the parser namespace. */
	bindNamespace(nsBase: Namespace, prefix?: string, parser?: Parser) {
		this.addNamespace(nsBase);

		let uri = (this.nsMapper && this.nsMapper(nsBase.uri)) || nsBase.uri;
		let nsParser = this.namespaceTbl[uri];

		if(!prefix && prefix != '') prefix = nsParser.base.defaultPrefix;

		// NOTE(review): nsParser is looked up before addPrefix, which may
		// call unlink() and replace the namespace objects — confirm the one
		// passed to addUri cannot be stale.
		(parser || this).bindPrefix(
			this.addPrefix(prefix),
			this.addUri(uri, nsParser)
		);

		return(nsParser.id);
	}
209 | 
210 | 	updateNamespaces() {
211 | 		const list = this.namespaceList;
212 | 		const len = list.length;
213 | 
214 | 		for(let num = 0; num < len; ++num) {
215 | 			if(list[num]) list[num].registerNative();
216 | 		}
217 | 	}
218 | 
	/** Bind a prefix to a namespace URI in the native config.
	  * @param prefix Token of the prefix.
	  * @param uri Token of the namespace URI. */
	bindPrefix(prefix: InternalToken, uri: InternalToken) {
		this.native.bindPrefix(prefix.id, uri.id);
	}
222 | 
	/** Create a token for a namespace URI and pass it to the native config,
	  * together with the updated URI trie.
	  * @param uri Namespace URI.
	  * @param ns Namespace the URI belongs to. Its uriToken gets updated.
	  * @return Token created for the URI. */
	addUri(uri: string, ns: ParserNamespace) {
		// About to modify data possibly shared with other configs.
		this.unlink();

		const token = this.uriSet.createToken(uri, ns);

		// The trie is encoded again to include the new token.
		this.native.setUriTrie(this.uriSet.encodeTrie());
		this.native.addUri(token.id, ns.id);
		ns.uriToken = token.uri;

		return(token);
	}
234 | 
	/** Create a token for an xmlns prefix and pass the updated prefix trie
	  * to the native config.
	  * @param prefix Prefix name.
	  * @return Token created for the prefix. */
	addPrefix(prefix: string) {
		// About to modify data possibly shared with other configs.
		this.unlink();

		const token = this.prefixSet.createToken(prefix);

		this.native.setPrefixTrie(this.prefixSet.encodeTrie());

		return(token);
	}
244 | 
245 | 	registerTokens(tbl: TokenTbl): Registry {
246 | 		const registry: Registry = {};
247 | 		let token: MemberToken;
248 | 
249 | 		for(let prefix of Object.keys(tbl)) {
250 | 			const spec = tbl[prefix];
251 | 			const uri = spec.uri;
252 | 			const ns = this.getNamespace(uri) || new Namespace(prefix, uri);
253 | 
254 | 			for(let name of spec.elements || []) {
255 | 				const tokens = this.getElementTokens(ns, name);
256 | 				token = tokens[TokenKind.open]!;
257 | 
258 | 				registry[prefix][name] = token;
259 | 				registry[prefix][token.id!] = token;
260 | 			}
261 | 
262 | 			for(let name of spec.attributes || []) {
263 | 				const tokens = this.getAttributeTokens(ns, name);
264 | 				token = tokens[TokenKind.string]!;
265 | 
266 | 				registry[prefix][name] = token;
267 | 				registry[prefix][token.id!] = token;
268 | 			}
269 | 		}
270 | 
271 | 		return(registry);
272 | 	}
273 | 
	/** Register the element and attribute names listed in a module table.
	  * Every top-level key of spec is a prefix, whose value lists names in
	  * one namespace and gives its URI in the "xmlns" member.
	  * Names with a truthy value are registered as elements,
	  * all others as attributes.
	  * @param spec Module table to register.
	  * @return Table with the same keys as spec, with each name replaced by
	  * its open token (elements) or string token (attributes).
	  * The "xmlns" members are copied as they are. */
	jsxRegister<Module extends XModuleTable>(spec: Module): Module;

	/** @param handler Called with the resulting table.
	  * @return Result of the handler, if one was given. */
	jsxRegister<Module extends XModuleTable, Result>(spec: Module, handler?: (result: Module) => Result): Result;

	jsxRegister<Module extends XModuleTable, Result>(spec: Module, handler?: (result: Module) => Result) {
		const result: { [prefix: string]: { [name: string]: OpenToken | StringToken | string }} = {};
		let token: OpenToken | StringToken;

		for(let prefix of Object.keys(spec)) {
			const elements = (spec as any)[prefix];
			const uri = elements.xmlns;
			// Create the namespace unless the URI is already known.
			const ns = this.getNamespace(uri) || new Namespace(prefix, uri);
			result[prefix] = {};

			for(let name of Object.keys(elements)) {
				if(name == 'xmlns') {
					// Not a name to register, pass the URI through.
					token = elements[name];
				} else if(elements[name]) {
					token = this.getElementTokens(ns, name)[TokenKind.open]!;
				} else {
					token = this.getAttributeTokens(ns, name)[TokenKind.string]!;
				}

				result[prefix][name] = token;
			}
		}

		return(handler ? handler(result as Module) : result as Module);
	}
303 | 
304 | 	getElementTokens(ns: Namespace, name: string) {
305 | 		const id = this.addNamespace(ns);
306 | 		return(this.namespaceList[id].addElement(name).tokenList);
307 | 	}
308 | 
309 | 	getAttributeTokens(ns: Namespace, name: string) {
310 | 		const id = this.addNamespace(ns);
311 | 		return(this.namespaceList[id].addAttribute(name).tokenList);
312 | 	}
313 | 
	/** If true, object is a clone sharing data with another object. */
	private isLinked: boolean;

	/** Reference to C++ object. */
	private native: NativeConfig;

	/** Parser options.
	  * NOTE(review): where these are set and read is outside this section. */
	options: ParserOptions;

	/** Represents an "attribute" defining the default xmlns. */
	xmlnsToken: InternalToken;
	/** NOTE(review): presumably the token for an empty (absent) prefix. The
	  * native parser uses a config field of the same name for unprefixed
	  * element names - confirm the two correspond. */
	emptyPrefixToken: InternalToken;
	/** Represents the "xmlns" prefix defining a named xmlns. */
	xmlnsPrefixToken: InternalToken;
	/** NOTE(review): presumably the prefix token for processing instructions.
	  * The native parser uses a config field of the same name to place
	  * them in a placeholder namespace - confirm the two correspond. */
	processingPrefixToken: InternalToken;

	/** Allocates ID numbers for xmlns uri tokens. */
	uriSpace: TokenSpace;
	/** Allocates ID numbers for xmlns prefix tokens. */
	prefixSpace: TokenSpace;
	/** Allocates ID numbers for element name tokens. */
	elementSpace: TokenSpace;
	/** Allocates ID numbers for attribute name tokens. */
	attributeSpace: TokenSpace;

	/** NOTE(review): presumably the set of known xmlns URI tokens - confirm. */
	uriSet: TokenSet;
	/** NOTE(review): presumably the set of known xmlns prefix tokens - confirm. */
	prefixSet: TokenSet;

	/** List of supported namespaces, indexed by the ID that addNamespace
	  * returns. Parser skips falsy entries when scanning it, so the list
	  * may have holes. */
	namespaceList: ParserNamespace[];
	/** Mapping from URI to namespace. */
	private namespaceTbl: { [ uri: string ]: ParserNamespace };
	/** Parser creates namespaces for unrecognized URIs with the ID
	  * maxNamespace + 1, so this is presumably the highest ID in use. */
	maxNamespace: number;

	/** Optional callback receiving a namespace URI.
	  * NOTE(review): how its result is used is not visible in this section. */
	nsMapper?: (uri: string) => string | null | false | undefined;
348 | 
349 | }
350 | 


--------------------------------------------------------------------------------
/src/parser/Parser.ts:
--------------------------------------------------------------------------------
  1 | import { ArrayType, encodeArray } from '../Buffer';
  2 | import { Namespace } from '../Namespace';
  3 | import { CodeType } from '../tokenizer/CodeType';
  4 | import { ErrorType } from '../tokenizer/ErrorType';
  5 | import { NativeParser } from './ParserLib';
  6 | import { ParserConfig } from './ParserConfig';
  7 | import { ParserNamespace } from './ParserNamespace';
  8 | import { InternalToken } from './InternalToken';
  9 | import { TokenSet } from '../tokenizer/TokenSet';
 10 | import { TokenChunk } from './TokenChunk';
 11 | import { Stitcher } from './Stitcher';
 12 | import {
 13 | 	Token,
 14 | 	TokenBuffer,
 15 | 	TokenKind,
 16 | 	SpecialToken,
 17 | 	MemberToken,
 18 | 	OpenToken,
 19 | 	CloseToken,
 20 | 	StringToken,
 21 | 	SgmlToken
 22 | } from './Token';
 23 | 
/** Number of 32-bit slots in the code buffer handed to the native parser.
  * The tiny commented-out sizes below look like debugging aids for forcing
  * frequent buffer flushes - TODO confirm. */
// const codeBufferSize = 2;
// const codeBufferSize = 3;
const codeBufferSize = 8192;

/** Largest input length passed to the native parser in one call. Longer
  * inputs are sliced in Parser.write; Infinity means input is never sliced. */
const chunkSize = Infinity;

/** Layout of one entry in the code buffer: the low SHIFT bits (selected by
  * MASK) hold the CodeType, the bits above them hold the payload. */
const enum TOKEN {
	SHIFT = 5,
	MASK = 31
}
 34 | 
 35 | export class ParseError extends Error {
 36 | 
 37 | 	constructor(public code: ErrorType, public row: number, public col: number) {
 38 | 		super('Parse error on line ' + row + ' column ' + col);
 39 | 	}
 40 | 
 41 | }
 42 | 
 43 | /** XML parser stream, emits tokens with fully qualified names. */
 44 | 
 45 | export class Parser {
 46 | 
	/** Intended to be called only from ParserConfig.createParser.
	  * Allocates the code buffer, hands it to the native parser together
	  * with a callback that processes its contents, and pre-registers the
	  * special namespaces and the one with default prefix "xml".
	  * @param config Parser configuration whose namespace list is scanned.
	  * @param native Reference to C++ parser object. */

	constructor(private config: ParserConfig, private native: NativeParser) {
		const codeBuffer = new Uint32Array(codeBufferSize);

		this.codeBuffer = codeBuffer;
		// The callback processes the buffer with pending = true.
		this.native.setCodeBuffer(codeBuffer, () => this.parseCodeBuffer(true));

		for(const parserNamespace of this.config.namespaceList) {
			// The list may contain holes.
			if(!parserNamespace) continue;

			const base = parserNamespace.base;

			if(base.isSpecial || base.defaultPrefix == 'xml') {
				this.namespaceList[base.id] = base;
			}
		}
	}
 61 | 
	/** @return The configuration object held by this parser. */
	public getConfig() {
		return(this.config);
	}
 63 | 
	/** Forward a prefix / URI binding to the native parser, by token ID.
	  * @param prefix Token of the namespace prefix.
	  * @param uri Token of the namespace URI. */
	bindPrefix(prefix: InternalToken, uri: InternalToken) {
		const idPrefix = prefix.id;
		const idUri = uri.id;

		this.native.bindPrefix(idPrefix, idUri);
	}
 67 | 
	/** Parse an entire input in one call and return its tokens.
	  * @param data Input as a string or an array of bytes.
	  * @return A newly allocated chunk holding all tokens, with
	  * namespaceList set to the list reported by write, if any.
	  * @throws The error reported by write, or an Error if the input ends
	  * inside an element tag so that write has no chunk to deliver. */
	public parseSync(data: string | ArrayType) {
		const buffer: TokenBuffer = [];
		let namespaceList: (Namespace | undefined)[] | undefined;

		// write invokes the callback synchronously, so anything thrown
		// inside it propagates to the caller of parseSync.
		this.write(data, '', (err: any, chunk: TokenChunk | null) => {
			if(err) throw(err);

			if(!chunk) {
				// write reports neither an error nor a chunk when the input
				// stops inside an element tag. This used to throw the null
				// error value itself, leaving callers with no message or
				// stack trace.
				throw(new Error('Unexpected end of input inside an element tag'));
			}

			for(let tokenNum = 0; tokenNum < chunk.length; ++tokenNum) {
				buffer.push(chunk.buffer[tokenNum]);
			}

			if(chunk.namespaceList) namespaceList = chunk.namespaceList;

			// Tokens were copied out, so the chunk is no longer needed.
			chunk.free();
		});

		const output = TokenChunk.allocate(buffer);
		output.namespaceList = namespaceList;

		return(output);
	}
 89 | 
	/** Shut down the native parser and deliver the final result.
	  * @param flush Callback receiving either a ParseError, or null and the
	  * current token chunk after remaining codes have been processed. */
	destroy(
		flush: (err: any, chunk: TokenChunk | null) => void
	) {
		const status = this.native.destroy();

		if(status == ErrorType.OK) {
			this.parseCodeBuffer(false);
			flush(null, this.tokenChunk);
			return;
		}

		// Remember the failure so later writes are rejected as well.
		const error = new ParseError(status, this.native.row + 1, this.native.col + 1);

		this.hasError = error;
		flush(error, null);
	}
103 | 
104 | 	write(
105 | 		chunk: string | ArrayType,
106 | 		enc: string,
107 | 		flush: (err: any, chunk: TokenChunk | null) => void
108 | 	) {
109 | 		if(this.hasError) {
110 | 			flush(this.hasError, null);
111 | 			return;
112 | 		}
113 | 
114 | 		if(typeof(chunk) == 'string') chunk = encodeArray(chunk);
115 | 
116 | 		const len = chunk.length;
117 | 		let nativeStatus = ErrorType.OK;
118 | 		let next: number;
119 | 
120 | 		if(len < chunkSize) {
121 | 			this.chunk = chunk;
122 | 			this.stitcher.setChunk(this.chunk);
123 | 			nativeStatus = this.native.parse(this.chunk);
124 | 			this.parseCodeBuffer(false);
125 | 		} else {
126 | 			// Limit size of buffers sent to native code.
127 | 			for(let pos = 0; pos < len; pos = next) {
128 | 				next = Math.min(pos + chunkSize, len);
129 | 
130 | 				this.chunk = chunk.slice(pos, next);
131 | 				this.stitcher.setChunk(this.chunk);
132 | 				nativeStatus = this.native.parse(this.chunk);
133 | 
134 | 				if(nativeStatus != ErrorType.OK) break;
135 | 				this.parseCodeBuffer(false);
136 | 			}
137 | 		}
138 | 
139 | 		if(nativeStatus != ErrorType.OK) {
140 | 			this.hasError = new ParseError(nativeStatus, this.native.row + 1, this.native.col + 1);
141 | 			flush(this.hasError, null);
142 | 			return;
143 | 		}
144 | 
145 | 		if(this.elementStart < 0) {
146 | 			if(this.namespacesChanged) this.tokenChunk.namespaceList = this.namespaceList;
147 | 			flush(null, this.tokenChunk);
148 | 
149 | 			this.tokenChunk = TokenChunk.allocate();
150 | 		} else {
151 | 			// Not ready to flush but have to send something to get more input.
152 | 			flush(null, null);
153 | 		}
154 | 	}
155 | 
	/** Translate the codes in the buffer shared with the native parser into
	  * tokens appended to the current token chunk.
	  *
	  * Slot 0 of the code buffer holds the number of codes. Every following
	  * slot packs a CodeType into its low TOKEN.SHIFT bits and a payload
	  * (a token ID, or an offset passed on to the stitcher) into the rest.
	  *
	  * Parser state is copied into locals for the duration of the loop and
	  * written back to the instance at the end.
	  *
	  * @param pending True when called through the callback registered with
	  * setCodeBuffer, false when called after a native parse or destroy call
	  * has returned. Only when false is a still unfinished text part handed
	  * to the stitcher for storage. */
	private parseCodeBuffer(pending: boolean) {
		const config = this.config;
		const stitcher = this.stitcher;
		const codeBuffer = this.codeBuffer;
		const codeCount = codeBuffer[0];

		// NOTE: These must be updated if config is unlinked!
		let elementList = config.elementSpace.list;
		let attributeList = config.attributeSpace.list;
		let prefixList = config.prefixSpace.list;
		let uriList = config.uriSpace.list;
		// List consulted by the PARTIAL_*_ID cases below. Holding elementList
		// means "no other list selected yet".
		let partialList = elementList;

		let codeNum = 0;
		let partStart = this.partStart;
		let partialLen = this.partialLen;
		let latestElement = this.latestElement;
		let latestPrefix = this.latestPrefix;
		let latestNamespace = this.latestNamespace;

		const tokenBuffer = this.tokenChunk.buffer;
		const prefixBuffer = this.prefixBuffer;
		const namespaceBuffer = this.namespaceBuffer;
		const unknownElementTbl = this.unknownElementTbl;
		const unknownAttributeTbl = this.unknownAttributeTbl;
		const sgmlTbl = this.sgmlTbl;
		const unknownOffsetList = this.unknownOffsetList;
		// Index of the last token written so far; incremented before each write.
		let tokenNum = this.tokenChunk.length - 1;
		let token: Token;
		let name: string;
		let prefix: string;
		let elementStart = this.elementStart;
		let unknownCount = this.unknownCount;

		// Codes occupy slots 1 to codeCount.
		while(codeNum < codeCount) {
			let code = codeBuffer[++codeNum];
			const kind = code & TOKEN.MASK;
			code >>= TOKEN.SHIFT;

			switch(kind) {
				// Opening tag of a known element; payload is the element ID.
				case CodeType.OPEN_ELEMENT_ID:

					latestElement = elementList[code].open;
					// TODO: If latestPrefix is null, use current prefix for element's namespace.
					tokenBuffer[++tokenNum] = latestElement;
					prefixBuffer[0] = latestPrefix;
					elementStart = tokenNum;
					break;

				// Closing tag of a known element; payload is the element ID.
				case CodeType.CLOSE_ELEMENT_ID:

					tokenBuffer[++tokenNum] = elementList[code].close;
					break;

				// End of an opening tag, either plain or self-closing.
				case CodeType.ELEMENT_EMITTED:
				case CodeType.CLOSED_ELEMENT_EMITTED:

					// Resolve tokens of this element that were created before
					// their namespace could be determined.
					if(unknownCount) {
						let ns: ParserNamespace;
						let offset: number;

						for(let pos = 0; pos < unknownCount; ++pos) {
							offset = unknownOffsetList[pos];
							ns = namespaceBuffer[offset]!;
							// If an xmlns definition already resolved
							// this token, ns will be null.
							if(ns) {
								// Ensure namespace is updated after config unlink.
								ns = config.namespaceList[ns.id];
								tokenBuffer[offset + elementStart] = (
									tokenBuffer[offset + elementStart] as MemberToken
								).resolve(ns);
							}
						}

						// The element token itself may have been replaced above.
						latestElement = tokenBuffer[elementStart] as OpenToken;
						unknownCount = 0;
					}

					tokenBuffer[++tokenNum] = (
						kind == CodeType.ELEMENT_EMITTED ?
						latestElement.emitted :
						latestElement.close
					)

					// No element tag is open any more.
					elementStart = -1;

					break;

				// Known attribute; payload is the attribute ID.
				case CodeType.ATTRIBUTE_ID:

					tokenBuffer[++tokenNum] = attributeList[code].string;
					// If latestPrefix is null, set attribute prefix to match its parent element.
					prefixBuffer[tokenNum - elementStart] = latestPrefix || prefixBuffer[0];
					break;

				// Payload packs a namespace index (bits 14 and up) with a
				// prefix ID (low 14 bits).
				case CodeType.PREFIX_ID:

					latestNamespace = config.namespaceList[code >> 14];
					code = code & 0x3fff;

				// Fallthru
				case CodeType.XMLNS_ID:

					latestPrefix = prefixList[code];
					break;

				// A known namespace was bound to latestPrefix; payload is the
				// namespace index. Emits the prefix and URI tokens.
				case CodeType.NAMESPACE_ID:

					this.resolve(elementStart, tokenNum, latestPrefix!, code);
					tokenBuffer[++tokenNum] = latestPrefix!.prefix;
					tokenBuffer[++tokenNum] = this.config.namespaceList[code].uriToken;
					latestPrefix = null;
					break;

				// Known SGML declaration name; payload is an element ID.
				// Tokens are cached in sgmlTbl under "prefix:name".
				case CodeType.SGML_ID:

					token = elementList[code].open;
					prefix = (token as MemberToken).ns.defaultPrefix;
					name = (token as MemberToken).name;
					token = sgmlTbl[prefix + ':' + name];

					if(!token) {
						token = new SgmlToken(name, prefix);
						sgmlTbl[prefix + ':' + name] = token as SgmlToken;
					}

					tokenBuffer[++tokenNum] = token;
					break;

				// Start of a part whose content is taken from the input;
				// remember the offset until the matching end code arrives.
				case CodeType.TEXT_START_OFFSET:
				case CodeType.CDATA_START_OFFSET:
				case CodeType.VALUE_START_OFFSET:
				case CodeType.COMMENT_START_OFFSET:
				case CodeType.SGML_TEXT_START_OFFSET:
				case CodeType.UNKNOWN_START_OFFSET:

					partStart = code;
					break;

				// Opening tag with an unrecognized name. A temporary token in
				// Namespace.unknown is used and queued for later resolution.
				case CodeType.UNKNOWN_OPEN_ELEMENT_END_OFFSET:

					name = stitcher.getSlice(partStart, code);
					latestElement = unknownElementTbl[name];

					if(!latestElement) {
						latestElement = new OpenToken(name, Namespace.unknown);
						unknownElementTbl[name] = latestElement;
					}

					tokenBuffer[++tokenNum] = latestElement;
					prefixBuffer[0] = latestPrefix;
					namespaceBuffer[0] = latestNamespace;
					elementStart = tokenNum;
					// The element itself (offset 0) is the first unresolved token.
					unknownOffsetList[0] = 0;
					unknownCount = 1;

					partStart = -1;
					break;

				// Closing tag with an unrecognized name.
				// NOTE(review): without a latestNamespace this reads
				// unknownElementTbl[name], which is undefined unless an
				// opening tag with the same name was seen earlier; `.close`
				// would then throw. Confirm the native parser rules this out.
				case CodeType.UNKNOWN_CLOSE_ELEMENT_END_OFFSET:

					name = stitcher.getSlice(partStart, code);
					tokenBuffer[++tokenNum] = (latestNamespace ?
						latestNamespace.addElement(name) :
						unknownElementTbl[name]
					).close;

					partStart = -1;
					break;

				// Attribute with an unrecognized name. A temporary token in
				// Namespace.unknown is used and queued for later resolution.
				case CodeType.UNKNOWN_ATTRIBUTE_END_OFFSET:

					name = stitcher.getSlice(partStart, code);
					token = unknownAttributeTbl[name];

					if(!token) {
						token = new StringToken(name, Namespace.unknown);
						unknownAttributeTbl[name] = token;
					}

					tokenBuffer[++tokenNum] = token;

					// Offset of this token relative to the element start.
					let pos = tokenNum - elementStart;
					prefixBuffer[pos] = latestPrefix;
					namespaceBuffer[pos] = latestNamespace;
					unknownOffsetList[unknownCount++] = pos;

					partStart = -1;
					break;

				// SGML declaration with an unrecognized name, cached in
				// sgmlTbl under "prefix:name" (empty prefix if none).
				case CodeType.UNKNOWN_SGML_END_OFFSET:

					prefix = latestPrefix ? latestPrefix.name : '';
					name = stitcher.getSlice(partStart, code);
					token = sgmlTbl[prefix + ':' + name];

					if(!token) {
						token = new SgmlToken(name, prefix);
						sgmlTbl[prefix + ':' + name] = token as SgmlToken;
					}

					tokenBuffer[++tokenNum] = token;

					partStart = -1;
					break;

				// Codes that map directly to a fixed special token.
				case CodeType.SGML_EMITTED:
				case CodeType.SGML_NESTED_START:
				case CodeType.SGML_NESTED_END:

					tokenBuffer[++tokenNum] = this.specialTokenTbl[kind];
					break;

				// Comments and SGML text emit a marker token first, then
				// their content like ordinary text below.
				case CodeType.COMMENT_END_OFFSET:
				case CodeType.SGML_TEXT_END_OFFSET:

					tokenBuffer[++tokenNum] = this.specialTokenTbl[kind];

				// Fallthru
				case CodeType.VALUE_END_OFFSET:
				case CodeType.TEXT_END_OFFSET:

					tokenBuffer[++tokenNum] = stitcher.getSlice(partStart, code);
					partStart = -1;
					break;

				// CDATA emits a marker token followed by its content. The
				// last 3 characters of the slice are dropped; per the native
				// parser they are the terminating "]]>".
				case CodeType.CDATA_END_OFFSET:

					tokenBuffer[++tokenNum] = SpecialToken.cdata;
					name = stitcher.getSlice(partStart, code);
					tokenBuffer[++tokenNum] = name.substr(0, name.length - 3);
					partStart = -1;
					break;

				case CodeType.UNKNOWN_PREFIX_END_OFFSET:
				case CodeType.UNKNOWN_XMLNS_END_OFFSET:
				case CodeType.UNKNOWN_URI_END_OFFSET:

					// Add the namespace prefix or URI to a separate trie.
					// Incoming code buffer should have been flushed immediately
					// after writing this token.

					if(kind == CodeType.UNKNOWN_URI_END_OFFSET) {
						let uri = stitcher.getSlice(partStart, code);

						/* if(uri.id > dynamicTokenTblSize) {
							// TODO: report row and column in error messages.
							throw(new Error('Too many different xmlns URIs'));
						} */

						// Create a new namespace for the unrecognized URI.
						name = latestPrefix!.name;
						const ns = new Namespace(name, uri, config.maxNamespace + 1);
						// This may unlink the config:
						const idNamespace = config.bindNamespace(ns, latestPrefix!.name, this);
						this.resolve(elementStart, tokenNum, latestPrefix!, idNamespace);
						tokenBuffer[++tokenNum] = latestPrefix!.prefix;
						tokenBuffer[++tokenNum] = this.config.namespaceList[idNamespace].uriToken;
						latestPrefix = null;
					} else {
						// This may unlink the config:
						latestPrefix = config.addPrefix(stitcher.getSlice(partStart, code));

						/* if(latestPrefix.id > dynamicTokenTblSize) {
							// TODO: report row and column in error messages.
							throw(new Error('Too many different xmlns prefixes'));
						} */

						// Tell the native parser which prefix ID was assigned.
						this.native.setPrefix(latestPrefix.id);
					}

					// Config may have been unlinked so update references to it.
					elementList = config.elementSpace.list;
					attributeList = config.attributeSpace.list;
					prefixList = config.prefixSpace.list;
					uriList = config.uriSpace.list;

					partStart = -1;
					break;

				// Length later passed to stitcher.reset by a PARTIAL_*_ID code.
				case CodeType.PARTIAL_LEN:

					partialLen = code;
					break;

				// The four PARTIAL_*_ID cases share one tail through
				// fall-through. Each stage selects its list only if
				// partialList still holds its initial value (elementList),
				// so the first stage entered decides which list is used.
				case CodeType.PARTIAL_URI_ID:

					partialList = uriList;

				// Fallthru
				case CodeType.PARTIAL_PREFIX_ID:

					if(partialList == elementList) partialList = prefixList;

				// Fallthru
				case CodeType.PARTIAL_ATTRIBUTE_ID:

					if(partialList == elementList) partialList = attributeList;

				// Fallthru
				case CodeType.PARTIAL_ELEMENT_ID:

					// NOTE(review): presumably seeds the stitcher with the first
					// partialLen bytes of the known name, for names split
					// between input chunks - confirm against Stitcher.
					stitcher.reset(partialList[code].buf, partialLen);
					// Restore the "nothing selected" marker for the next code.
					partialList = elementList;
					break;

				default:

					break;
			}
		}

		// A part is still open at the end of this input chunk: let the
		// stitcher keep its beginning and continue from offset 0 next time.
		if(!pending && partStart >= 0) {
			stitcher.storeSlice(partStart);
			partStart = 0;
		}

		// NOTE: Any active cursor in native code will still use the old trie
		// after update.
		config.updateNamespaces();

		// Write the loop state back to the instance.
		this.partStart = partStart;
		this.partialLen = partialLen;
		this.latestElement = latestElement;
		this.latestPrefix = latestPrefix;
		this.latestNamespace = latestNamespace;

		this.tokenChunk.length = tokenNum + 1;
		this.elementStart = elementStart;
		this.unknownCount = unknownCount;
	}
488 | 
489 | 	/** Resolve any prior occurrences of a recently defined prefix
490 | 	  * within the same element. */
491 | 	private resolve(elementStart: number, tokenNum: number, prefix: InternalToken, idNamespace: number) {
492 | 		const prefixBuffer = this.prefixBuffer;
493 | 		const tokenBuffer = this.tokenChunk.buffer;
494 | 		const ns = this.config.namespaceList[idNamespace];
495 | 		const len = tokenNum - elementStart;
496 | 		let token: Token | number | string;
497 | 
498 | 		if(!ns.base.defaultPrefix) {
499 | 			ns.base.defaultPrefix = prefix.name;
500 | 		}
501 | 		this.namespaceList[ns.base.id] = ns.base;
502 | 		this.namespacesChanged = true;
503 | 
504 | 		for(let pos = 0; pos <= len; ++pos) {
505 | 			if(prefixBuffer[pos] == prefix) {
506 | 				token = tokenBuffer[pos + elementStart];
507 | 				if(token instanceof MemberToken) {
508 | 					tokenBuffer[pos + elementStart] = token.resolve(ns);
509 | 					this.namespaceBuffer[pos] = null;
510 | 				}
511 | 			}
512 | 		}
513 | 	}
514 | 
	/** Produces string slices of the input for parseCodeBuffer; it is given
	  * each input chunk and keeps the start of a part left unfinished at
	  * the end of a chunk. */
	private stitcher = new Stitcher();

	/** Current element not yet emitted (closing angle bracket unseen). */
	private latestElement: OpenToken;
	/** Previous namespace prefix token, applied to the next element, attribute
	  * or xmlns definition. */
	private latestPrefix: InternalToken | null;
	/** Namespace taken from the latest PREFIX_ID code, recorded for unknown
	  * elements and attributes that follow. */
	private latestNamespace: ParserNamespace | null;

	/** Current input buffer. */
	private chunk: ArrayType;

	/** Namespaces registered with this parser so far, indexed by namespace ID. */
	private namespaceList: (Namespace | undefined)[] = [];
	/** If true, write attaches namespaceList to the chunk it flushes. */
	private namespacesChanged = true;

	/** Offset to start of text in input buffer, or -1 if not reading text. */
	private partStart = -1;

	/** Number of valid initial bytes in next token. */
	private partialLen: number;

	/** Shared with C++ library. */
	private codeBuffer: Uint32Array;
	/** Stream output buffer chunk. */
	tokenChunk = TokenChunk.allocate();

	/** Offset to start of current element definition in output buffer. */
	private elementStart = -1;
	/** Prefixes of latest tokenBuffer entries (their namespace may change
	  * if the prefix is remapped). Index 0 corresponds to elementStart. */
	private prefixBuffer: (InternalToken | null)[] = [];
	/** Namespaces recorded for unresolved tokens, indexed like prefixBuffer.
	  * An entry is set to null once its token has been resolved. */
	private namespaceBuffer: (ParserNamespace | null)[] = [];

	/** Unresolved elements (temporary tokens lacking a namespace). */
	private unknownElementTbl: { [ name: string ]: OpenToken } = {};
	/** Unresolved attributes (temporary tokens lacking a namespace). */
	private unknownAttributeTbl: { [ name: string ]: Token } = {};
	/** SGML declaration tokens, keyed by "prefix:name". */
	private sgmlTbl: { [ name: string ]: SgmlToken } = {};
	/** Offsets (relative to elementStart) of unresolved tokens in the
	  * current element. Only the first unknownCount entries are valid. */
	private unknownOffsetList: number[] = [];

	/** Number of valid entries in unknownOffsetList. */
	private unknownCount = 0;

	/** Special tokens emitted for code types that have a fixed token. */
	specialTokenTbl = {
		[CodeType.COMMENT_END_OFFSET]: SpecialToken.comment,
		[CodeType.SGML_EMITTED]: SpecialToken.sgmlEmitted,
		[CodeType.SGML_NESTED_START]: SpecialToken.sgmlNestedStart,
		[CodeType.SGML_NESTED_END]: SpecialToken.sgmlNestedEnd,
		[CodeType.SGML_TEXT_END_OFFSET]: SpecialToken.sgmlText
	};

	/** Set when the native parser reports an error; further writes then
	  * fail immediately with the same error. */
	private hasError?: ParseError;
566 | 
567 | }
568 | 


--------------------------------------------------------------------------------
/lib/Parser.cc:
--------------------------------------------------------------------------------
   1 | #include <cstring>
   2 | #include <cstdio>
   3 | 
   4 | #include "Parser.h"
   5 | 
   6 | #ifndef DEBUG_PARTIAL_NAME_RECOVERY
   7 | #	define DEBUG_PARTIAL_NAME_RECOVERY 0
   8 | #endif
   9 | 
// Lookup tables indexed by input byte value; a nonzero entry means the byte
// belongs to the class.
// NOTE(review): the code filling these tables is not visible in this section.

/** Whitespace bytes (skipped between tokens). */
unsigned char whiteCharTbl[256];
/** Bytes the text scanning loop accepts without further inspection. */
unsigned char valueCharTbl[256];
/** Bytes allowed to start an XML name (default for nameStartCharTbl). */
unsigned char xmlNameStartCharTbl[256];
/** Bytes allowed inside an XML name (default for nameCharTbl). */
unsigned char xmlNameCharTbl[256];
/** NOTE(review): presumably name bytes used while parsing DTD declarations;
  * not referenced in the visible code - confirm. */
unsigned char dtdNameCharTbl[256];
  15 | 
  16 | Parser :: Parser(const ParserConfig &config) : config(config) {
  17 | 	state = State :: MATCH;
  18 | 	nameCharTbl = xmlNameCharTbl;
  19 | 	nameStartCharTbl = xmlNameStartCharTbl;
  20 | 	pattern = "\xef\xbb\xbf";
  21 | 	matchState = State :: BEFORE_TEXT;
  22 | 	noMatchState = State :: BEFORE_TEXT;
  23 | 	partialMatchState = State :: PARSE_ERROR;
  24 | 	pos = 0;
  25 | 	row = 0;
  26 | 	col = 0;
  27 | 	sgmlNesting = 0;
  28 | }
  29 | 
/** Branchless cursor position update based on UTF-8 input byte. Assumes
  * each codepoint is a separate character printed left to right.
  * Tab stops are 8 columns apart; row and col are zero-based.
  * @param c Input byte just consumed. */
inline void Parser :: updateRowCol(unsigned char c) {
// Disabled debugging aid: prints every byte using a terminal color derived
// from the parser state.
#if 0
	unsigned int color = static_cast<unsigned int>(state);
	printf("\e[%d;%dm%c", (color & 8) >> 3, 30 + (color & 7), c);
#endif
	col = (
		// If c is a tab, round col up to just before the next tab stop.
		// (c != '\t') - 1 is 0 for other bytes and all ones for a tab,
		// so a tab sets the low three bits of col.
		(col | (((c != '\t') - 1) & 7)) +
		// Then increment col if c is not a UTF-8 continuation byte.
		((c & 0xc0) != 0x80)
	) & (
		// Finally set col to zero if c is a line feed.
		// (c == '\n') - 1 is 0 for a line feed and all ones otherwise.
		(c == '\n') - 1
	);

	// Increment row if c is a line feed.
	row += (c == '\n');
}
  50 | 
  51 | Parser :: ErrorType Parser :: destroy() {
  52 | 	uint32_t *tokenPtr = tokenList + 1;
  53 | 
  54 | 	tokenList[0] = 0;
  55 | 
  56 | 	switch(state) {
  57 | 
  58 | 		case State :: TEXT:
  59 | 
  60 | 			writeToken(static_cast<TokenType>(static_cast<uint32_t>(textTokenType) + 1), 0, tokenPtr);
  61 | 			break;
  62 | 
  63 | 		default:
  64 | 
  65 | 			break;
  66 | 
  67 | 	}
  68 | 
  69 | 	flushTokens.reset();
  70 | 
  71 | 	return(ErrorType :: OK);
  72 | }
  73 | 
  74 | /** Parse a chunk of incoming data.
  75 |   * For security from buffer overflow attacks, memory writes are only done in
  76 |   * writeToken which should be foolproof. */
  77 | 
  78 | Parser :: ErrorType Parser :: parse(nbind::Buffer chunk) {
  79 | 	size_t len = chunk.length();
  80 | 	size_t ahead;
  81 | 	const unsigned char *chunkBuffer = chunk.data();
  82 | 	const unsigned char *p = chunkBuffer;
  83 | 	unsigned char c, d = 0;
  84 | 	const Namespace *ns;
  85 | 
  86 | 	// Indicate that no tokens inside the chunk were found yet.
  87 | 	tokenList[0] = 0;
  88 | 	uint32_t *tokenPtr = tokenList + 1;
  89 | 
  90 | 	tokenStart = p;
  91 | 
  92 | 	// Read a byte of input.
  93 | 	c = *p++;
  94 | 
  95 | 	/*
  96 | 		This loop represents a DFA (deterministic finite automaton) where
  97 | 		top-level switch case labels represent states. Goto and continue
  98 | 		statements allow changing states without consuming input
  99 | 		(because input reading and loop condition test are at the end).
 100 | 
 101 | 		Element and attribute names and values, text and comments use an
 102 | 		additional tighter inner loop for speed.
 103 | 
 104 | 		Some duplicated states are avoided using the after<Name>State variables,
 105 | 		which allow execution to jump to a common state and back again.
 106 | 	*/
 107 | 
 108 | 	while(1) {
 109 | 		switch(state) {
 110 | 
 111 | 			// Parser start state at the beginning of input,
 112 | 			// when pattern is a UTF-8 BOM.
 113 | 			case State :: MATCH:
 114 | 			case State :: MATCH_SPARSE: MATCH_SPARSE:
 115 | 
 116 | 				d = pattern[pos];
 117 | 
 118 | 				if(!d) {
 119 | 					state = matchState;
 120 | 					pos = 0;
 121 | 					continue;
 122 | 				} else if(c == d) {
 123 | 					++pos;
 124 | 					break;
 125 | 				} else if(state == State :: MATCH_SPARSE && whiteCharTbl[c]) {
 126 | 					break;
 127 | 				} else {
 128 | 					state = pos ? partialMatchState : noMatchState;
 129 | 					pos = 0;
 130 | 					continue;
 131 | 				}
 132 | 
 133 | 			case State :: QUOTE:
 134 | 
 135 | 				if(d == '"' && c == '\'') {
 136 | 					textEndChar = '\'';
 137 | 					state = matchState;
 138 | 					break;
 139 | 				} else {
 140 | 					state = noMatchState;
 141 | 					continue;
 142 | 				}
 143 | 
 144 | 			// State at the beginning of input after a possible UTF-8 BOM,
 145 | 			// or after any closing tag.
 146 | 			// Skip whitespace and then read text up to an opening tag.
 147 | 			case State :: BEFORE_TEXT:
 148 | 
 149 | 				if(whiteCharTbl[c]) break;
 150 | 
 151 | 				if(c == '<') {
 152 | 					state = State :: AFTER_LT;
 153 | 					break;
 154 | 				}
 155 | 
 156 | 				textEndChar = '<';
 157 | 				afterTextState = State :: AFTER_LT;
 158 | 
 159 | 				textTokenType = TokenType :: TEXT_START_OFFSET;
 160 | 				state = State :: TEXT;
 161 | 				// Avoid consuming the first character.
 162 | 				goto TEXT;
 163 | 
 164 | 			// Read text, which can be an attribute value or a text node,
 165 | 			// until textEndChar (defined by a preceding state) is found.
 166 | 			// TODO: Detect and handle numbers in a special way for speed?
 167 | 			case State :: TEXT: TEXT:
 168 | 
 169 | 				writeToken(textTokenType, p - chunkBuffer - 1, tokenPtr);
 170 | 
 171 | 				// Fast inner loop for capturing text between elements
 172 | 				// and in attribute values.
 173 | 				while(1) {
 174 | 					if(!valueCharTbl[c]) {
 175 | 						if(c == textEndChar) break;
 176 | 
 177 | 						switch(c) {
 178 | 							case '&':
 179 | 
 180 | 								// TODO: handle entities here?
 181 | 								break;
 182 | 
 183 | 							case '"':
 184 | 							case '\'':
 185 | 							case '<':
 186 | 							case '>':
 187 | 
 188 | 								// TODO: Stricter parsing would ban these.
 189 | 								break;
 190 | 
 191 | 							case ']':
 192 | 
 193 | 								if(sgmlNesting) {
 194 | 									// Signal end of DTD embedded in DOCTYPE.
 195 | 									writeToken(TokenType :: SGML_NESTED_END, 0, tokenPtr);
 196 | 									--sgmlNesting;
 197 | 
 198 | 									textEndChar = ']';
 199 | 									afterTextState = State :: SGML_DECLARATION;
 200 | 									continue;
 201 | 								}
 202 | 								break;
 203 | 
 204 | 							default:
 205 | 
 206 | 								// Disallow nonsense bytes.
 207 | 								return(ErrorType :: INVALID_CHAR);
 208 | 						}
 209 | 					}
 210 | 
 211 | 					updateRowCol(c);
 212 | 					if(!--len) return(ErrorType :: OK);
 213 | 					c = *p++;
 214 | 				}
 215 | 
 216 | 				writeToken(
 217 | 					// End token ID is always one higher than the corresponding
 218 | 					// start token ID.
 219 | 					static_cast<TokenType>(static_cast<uint32_t>(textTokenType) + 1),
 220 | 					p - chunkBuffer - 1,
 221 | 					tokenPtr
 222 | 				);
 223 | 
 224 | 				state = afterTextState;
 225 | 				break;
 226 | 
 227 | 			case State :: BEFORE_CDATA:
 228 | 
 229 | 				writeToken(textTokenType, p - chunkBuffer - 1, tokenPtr);
 230 | 				state = State :: CDATA;
 231 | 				goto CDATA;
 232 | 
 233 | 			// Note: the terminating "]]>" is included in the output byte range.
 234 | 			case State :: CDATA: CDATA:
 235 | 
 236 | 				while(1) {
 237 | 					if(c == ']') {
 238 | 						++pos;
 239 | 					} else if(c == '>' && pos >= 2) {
 240 | 						break;
 241 | 					} else {
 242 | 						pos = 0;
 243 | 					}
 244 | 
 245 | 					updateRowCol(c);
 246 | 					if(!--len) return(ErrorType :: OK);
 247 | 					c = *p++;
 248 | 				}
 249 | 
 250 | 				writeToken(
 251 | 					// End token ID is always one higher than the corresponding
 252 | 					// start token ID.
 253 | 					static_cast<TokenType>(static_cast<uint32_t>(textTokenType) + 1),
 254 | 					p - chunkBuffer,
 255 | 					tokenPtr
 256 | 				);
 257 | 
 258 | 				pos = 0;
 259 | 				state = afterTextState;
 260 | 				break;
 261 | 
 262 | 			// The previous character was a '<' starting a tag. The current
 263 | 			// character determines what kind of tag.
 264 | 			case State :: AFTER_LT:
 265 | 
 266 | 				trie = &Namespace :: elementTrie;
 267 | 
 268 | 				switch(c) {
 269 | 					// An SGML declaration <! ... > or <![CDATA[ ... ]]>
 270 | 					// or a comment <!-- ... -->
 271 | 					case '!':
 272 | 
 273 | 						tagType = TagType :: ELEMENT;
 274 | 						state = State :: BEFORE_SGML;
 275 | 						break;
 276 | 
 277 | 					// An SGML <? ... > or an XML <? ... ?> processing
 278 | 					// instruction.
 279 | 					case '?':
 280 | 
 281 | 						afterNameState = State :: AFTER_PROCESSING_NAME;
 282 | 						afterValueState = State :: AFTER_PROCESSING_VALUE;
 283 | 						nameTokenType = TokenType :: OPEN_ELEMENT_ID;
 284 | 
 285 | 						tagType = TagType :: PROCESSING;
 286 | 						matchTarget = MatchTarget :: ELEMENT;
 287 | 
 288 | 						// Put unknown processing instructions in a placeholder namespace.
 289 | 						elementPrefix.idPrefix = config.processingPrefixToken;
 290 | 						elementPrefix.idNamespace = config.namespacePrefixTbl[config.processingPrefixToken].first;
 291 | 						memberPrefix = &elementPrefix;
 292 | 
 293 | 						ns = config.namespacePrefixTbl[config.processingPrefixToken].second;
 294 | 
 295 | 						cursor.init(ns->*trie);
 296 | 
 297 | 						tokenStart = p;
 298 | 
 299 | 						state = State :: MATCH_TRIE;
 300 | 						afterMatchTrieState = State :: NAME;
 301 | 						break;
 302 | 
 303 | 					// A closing element </NAME > (no whitespace after '<').
 304 | 					case '/':
 305 | 						afterNameState = State :: AFTER_CLOSE_ELEMENT_NAME;
 306 | 						nameTokenType = TokenType :: CLOSE_ELEMENT_ID;
 307 | 
 308 | 						tagType = TagType :: ELEMENT;
 309 | 						matchTarget = MatchTarget :: ELEMENT;
 310 | 						state = State :: BEFORE_NAME;
 311 | 						break;
 312 | 
 313 | 					// An element <NAME ... >. May be self-closing.
 314 | 					default:
 315 | 						afterNameState = State :: STORE_ELEMENT_NAME;
 316 | 						afterValueState = State :: AFTER_ATTRIBUTE_VALUE;
 317 | 						nameTokenType = TokenType :: OPEN_ELEMENT_ID;
 318 | 						memberPrefix = &elementPrefix;
 319 | 
 320 | 						tagType = TagType :: ELEMENT;
 321 | 						matchTarget = MatchTarget :: ELEMENT;
 322 | 						state = State :: BEFORE_NAME;
 323 | 						// Avoid consuming the first character.
 324 | 						goto BEFORE_NAME;
 325 | 				}
 326 | 
 327 | 				break;
 328 | 
 329 | 			// Skip any whitespace before an element name. XML doesn't
 330 | 			// actually allow any, so this state could be removed for
 331 | 			// stricter parsing.
 332 | 			/*
 333 | 			case State :: BEFORE_ELEMENT_NAME: BEFORE_ELEMENT_NAME:
 334 | 
 335 | 				if(whiteCharTbl[c]) break;
 336 | 
 337 | 				state = State :: BEFORE_NAME;
 338 | 				goto BEFORE_NAME;
 339 | 			*/
 340 | 
 341 | 			// -----------------------------------------
 342 | 			// Element and attribute name parsing begins
 343 | 			// -----------------------------------------
 344 | 
 345 | 			// Start matching a name to known names in a Patricia trie.
 346 | 			case State :: BEFORE_NAME: BEFORE_NAME:
 347 | 
 348 | 				// The current character must be the valid first character of
 349 | 				// an element or attribute name, anything else is an error.
 350 | 				if(!nameStartCharTbl[c]) {
 351 | 					return(ErrorType :: INVALID_CHAR);
 352 | 				}
 353 | 
 354 | 				// Look for a ":" separator indicating a qualified name (starts
 355 | 				// with a namespace prefix). If the entire name doesn't fit in
 356 | 				// the input buffer, we first try to parse as a qualified name.
 357 | 				// This is an optional lookup to avoid later reprocessing.
 358 | 				for(ahead = 0; ahead + 1 < len && nameCharTbl[p[ahead]]; ++ahead) {}
 359 | 
 360 | 				if(matchTarget == MatchTarget :: ELEMENT) {
 361 | 					elementPrefix.idPrefix = config.emptyPrefixToken;
 362 | 					elementPrefix.idNamespace = config.namespacePrefixTbl[config.emptyPrefixToken].first;
 363 | 					ns = config.namespacePrefixTbl[config.emptyPrefixToken].second;
 364 | 				} else {
 365 | 					// By default, attributes belong to the same namespace as their parent element.
 366 | 					attributePrefix.idPrefix = elementPrefix.idPrefix;
 367 | 					attributePrefix.idNamespace = elementPrefix.idNamespace;
 368 | 					ns = config.namespaceList[elementPrefix.idNamespace].get();
 369 | 					// If element namespace prefix was known but undefined,
 370 | 					// try the default namespace to allow matching the magic xmlns attribute.
 371 | 					if(ns == nullptr) ns = config.namespacePrefixTbl[config.emptyPrefixToken].second;
 372 | 				}
 373 | 
 374 | 				// Prepare Patricia tree cursor for parsing.
 375 | 				if(ahead + 1 >= len || p[ahead] == ':') {
 376 | 					// If the input ran out, assume the name contains a colon
 377 | 					// in the next input buffer chunk. If a colon is found, the
 378 | 					// name starts with a namespace prefix.
 379 | 
 380 | 					if(matchTarget == MatchTarget :: ELEMENT) {
 381 | 						matchTarget = MatchTarget :: ELEMENT_NAMESPACE;
 382 | 					} else {
 383 | 						matchTarget = MatchTarget :: ATTRIBUTE_NAMESPACE;
 384 | 					}
 385 | 					cursor.init(config.prefixTrie);
 386 | 				} else {
 387 | 					if(ns == nullptr) {
 388 | 						// No default namespace is defined, so this element
 389 | 						// cannot be matched with anything.
 390 | 						writeToken(TokenType :: PREFIX_ID, (memberPrefix->idNamespace << 14) | memberPrefix->idPrefix, tokenPtr);
 391 | 						writeToken(TokenType :: UNKNOWN_START_OFFSET, p - 1 - chunkBuffer, tokenPtr);
 392 | 
 393 | 						idToken = Patricia :: notFound;
 394 | 						state = State :: UNKNOWN_NAME;
 395 | 						goto UNKNOWN_NAME;
 396 | 					}
 397 | 
 398 | 					cursor.init(ns->*trie);
 399 | 				}
 400 | 
 401 | 				tokenStart = p - 1;
 402 | 
 403 | 				state = State :: MATCH_TRIE;
 404 | 				afterMatchTrieState = State :: NAME;
 405 | 				goto MATCH_TRIE;
 406 | 
 407 | 			case State :: MATCH_TRIE: MATCH_TRIE:
 408 | 
 409 | 				// Fast inner loop for matching to known element and attribute names.
 410 | 				while(cursor.advance(c)) {
 411 | 					updateRowCol(c);
 412 | 					if(!--len) {
 413 | 						pos += p - tokenStart;
 414 | 						return(ErrorType :: OK);
 415 | 					}
 416 | 					c = *p++;
 417 | 				}
 418 | 
 419 | 				state = afterMatchTrieState;
 420 | 				continue;
 421 | 
 422 | 			case State :: NAME:
 423 | 
 424 | 				if(!nameCharTbl[c]) {
 425 | 					// If the whole name was matched, get associated reference.
 426 | 					idToken = cursor.getData();
 427 | 
 428 | 					// Test for an attribute "xmlns:..." defining a namespace
 429 | 					// prefix.
 430 | 
 431 | 					if(tagType == TagType :: ELEMENT && (
 432 | 						(
 433 | 							matchTarget == MatchTarget :: ATTRIBUTE_NAMESPACE &&
 434 | 							idToken == config.xmlnsPrefixToken
 435 | 						) || (
 436 | 							matchTarget == MatchTarget :: ATTRIBUTE &&
 437 | 							idToken == config.xmlnsToken
 438 | 						)
 439 | 					)) {
 440 | 						if(c == ':') {
 441 | 							pos = 0;
 442 | 							state = State :: DEFINE_XMLNS_BEFORE_PREFIX_NAME;
 443 | 							break;
 444 | 						} else {
 445 | 							// Prepare to set the default namespace.
 446 | 							nameTokenType = TokenType :: XMLNS_ID;
 447 | 							afterNameState = State :: DEFINE_XMLNS_AFTER_PREFIX_NAME;
 448 | 							idToken = config.emptyPrefixToken;
 449 | 						}
 450 | 					}
 451 | 
 452 | 					if(idToken != Patricia :: notFound) {
 453 | 						if(c == ':' && tagType == TagType :: ELEMENT) {
 454 | 							// If matching a namespace, use it.
 455 | 							if(
 456 | 								matchTarget == MatchTarget :: ELEMENT_NAMESPACE ||
 457 | 								matchTarget == MatchTarget :: ATTRIBUTE_NAMESPACE
 458 | 							) {
 459 | 								if(idToken >= namespacePrefixTblSize) {
 460 | 									return(ErrorType :: TOO_MANY_PREFIXES);
 461 | 								}
 462 | 
 463 | 								memberPrefix->idPrefix = idToken;
 464 | 								memberPrefix->idNamespace = config.namespacePrefixTbl[idToken].first;
 465 | 
 466 | 								if(matchTarget == MatchTarget :: ELEMENT_NAMESPACE) {
 467 | 									matchTarget = MatchTarget :: ELEMENT;
 468 | 								} else {
 469 | 									matchTarget = MatchTarget :: ATTRIBUTE;
 470 | 								}
 471 | 
 472 | 								ns = config.namespacePrefixTbl[idToken].second;
 473 | 
 474 | 								if(ns == nullptr) {
 475 | 									// Found a known but undeclared namespace
 476 | 									// prefix, valid if declared with an xmlns
 477 | 									// attribute in the same element.
 478 | 
 479 | 									writeToken(TokenType :: PREFIX_ID, (memberPrefix->idNamespace << 14) | memberPrefix->idPrefix, tokenPtr);
 480 | 									writeToken(TokenType :: UNKNOWN_START_OFFSET, p - chunkBuffer, tokenPtr);
 481 | 
 482 | 									idToken = Patricia :: notFound;
 483 | 									pos = 0;
 484 | 									state = State :: UNKNOWN_NAME;
 485 | 									break;
 486 | 								}
 487 | 
 488 | 								pos = 0;
 489 | 								tokenStart = p;
 490 | 								cursor.init(ns->*trie);
 491 | 
 492 | 								state = State :: MATCH_TRIE;
 493 | 								break;
 494 | 							} else {
 495 | 								// TODO: Reinterpret token up to cursor as a
 496 | 								// namespace prefix.
 497 | 							}
 498 | 							break;
 499 | 						} else if(
 500 | 							matchTarget == MatchTarget :: ELEMENT_NAMESPACE ||
 501 | 							matchTarget == MatchTarget :: ATTRIBUTE_NAMESPACE
 502 | 						) {
 503 | 							// TODO: Reinterpret token up to cursor as an
 504 | 							// element or attribute name according to
 505 | 							// nameTokenType.
 506 | 						}
 507 | 
 508 | 						if(nameTokenType != TokenType :: XMLNS_ID) {
 509 | 							if(!updateElementStack(nameTokenType)) return(ErrorType :: OTHER);
 510 | 							writeToken(TokenType :: PREFIX_ID, (memberPrefix->idNamespace << 14) | memberPrefix->idPrefix, tokenPtr);
 511 | 						}
 512 | 						writeToken(nameTokenType, idToken, tokenPtr);
 513 | 
 514 | 						knownName = true;
 515 | 						pos = 0;
 516 | 						state = afterNameState;
 517 | 						continue;
 518 | 					} else {
 519 | 						// TODO: Verify emitting partial name works in this case.
 520 | 					}
 521 | 				}
 522 | 
 523 | 				pos += p - tokenStart;
 524 | 
 525 | 				// For partial matches, emit the matched part of a name.
 526 | 				emitPartialName(
 527 | 					p,
 528 | 					static_cast<size_t>(p - chunkBuffer),
 529 | 					(
 530 | 						matchTarget == MatchTarget :: ELEMENT ?
 531 | 						TokenType :: PARTIAL_ELEMENT_ID : (
 532 | 							matchTarget == MatchTarget :: ATTRIBUTE ?
 533 | 							TokenType :: PARTIAL_ATTRIBUTE_ID :
 534 | 							TokenType :: PARTIAL_PREFIX_ID
 535 | 						)
 536 | 					),
 537 | 					tokenPtr
 538 | 				);
 539 | 
 540 | 				idToken = Patricia :: notFound;
 541 | 				pos = 0;
 542 | 				state = State :: UNKNOWN_NAME;
 543 | 				goto UNKNOWN_NAME;
 544 | 
 545 | 			// From this part onwards, the name was not found in any applicable
 546 | 			// Patricia trie.
 547 | 			case State :: UNKNOWN_NAME: UNKNOWN_NAME:
 548 | 
 549 | 				while(nameCharTbl[c]) {
 550 | 					updateRowCol(c);
 551 | 					if(!--len) return(ErrorType :: OK);
 552 | 					c = *p++;
 553 | 				}
 554 | 
 555 | 				if(c == ':' && tagType == TagType :: ELEMENT) {
 556 | 					// Found a new, undeclared namespace prefix, valid if
 557 | 					// declared with an xmlns attribute in the same element.
 558 | 
 559 | 					writeToken(
 560 | 						TokenType :: UNKNOWN_PREFIX_END_OFFSET,
 561 | 						p - chunkBuffer - 1,
 562 | 						tokenPtr
 563 | 					);
 564 | 
 565 | 					// Flush tokens to regenerate prefix trie in JavaScript.
 566 | 					flush(tokenPtr);
 567 | 
 568 | 					// Namespace is unknown so prepare to emit the name.
 569 | 					writeToken(TokenType :: UNKNOWN_START_OFFSET, p - chunkBuffer, tokenPtr);
 570 | 					break;
 571 | 				}
 572 | 
 573 | 				if(nameTokenType != TokenType :: XMLNS_ID) {
 574 | 					if(!updateElementStack(nameTokenType)) return(ErrorType :: OTHER);
 575 | 					writeToken(TokenType :: PREFIX_ID, (memberPrefix->idNamespace << 14) | memberPrefix->idPrefix, tokenPtr);
 576 | 				}
 577 | 				writeToken(
 578 | 					static_cast<TokenType>(
 579 | 						static_cast<uint32_t>(TokenType :: UNKNOWN_OPEN_ELEMENT_END_OFFSET) -
 580 | 						static_cast<uint32_t>(TokenType :: OPEN_ELEMENT_ID) +
 581 | 						static_cast<uint32_t>(nameTokenType)
 582 | 					),
 583 | 					p - chunkBuffer - 1,
 584 | 					tokenPtr
 585 | 				);
 586 | 
 587 | 				knownName = false;
 588 | 				state = afterNameState;
 589 | 				continue;
 590 | 
 591 | 			// ---------------------------------------
 592 | 			// Element and attribute name parsing ends
 593 | 			// ---------------------------------------
 594 | 
 595 | 			case State :: STORE_ELEMENT_NAME:
 596 | 
 597 | 				// Store element name ID (already output) to verify closing element.
 598 | 				// TODO: Push to a stack and verify!
 599 | 				idElement = idToken;
 600 | 
 601 | 				state = State :: AFTER_ELEMENT_NAME;
 602 | 				goto AFTER_ELEMENT_NAME;
 603 | 
 604 | 			// Inside an element start tag with the name already parsed.
 605 | 			case State :: AFTER_ELEMENT_NAME: AFTER_ELEMENT_NAME:
 606 | 
 607 | 				switch(c) {
 608 | 					case '/':
 609 | 
 610 | 						if(!updateElementStack(TokenType :: CLOSE_ELEMENT_ID)) return(ErrorType :: OTHER);
 611 | 						writeToken(TokenType :: CLOSED_ELEMENT_EMITTED, idElement, tokenPtr);
 612 | 
 613 | 						expected = '>';
 614 | 						nextState = State :: BEFORE_TEXT;
 615 | 						otherState = State :: PARSE_ERROR;
 616 | 
 617 | 						state = State :: EXPECT;
 618 | 						break;
 619 | 
 620 | 					case '>':
 621 | 
 622 | 						writeToken(TokenType :: ELEMENT_EMITTED, idElement, tokenPtr);
 623 | 
 624 | 						state = State :: BEFORE_TEXT;
 625 | 						break;
 626 | 
 627 | 					default:
 628 | 
 629 | 						if(whiteCharTbl[c]) break;
 630 | 						else {
 631 | 							// First read an attribute name.
 632 | 							state = State :: BEFORE_NAME;
 633 | 							matchTarget = MatchTarget :: ATTRIBUTE;
 634 | 							nameTokenType = TokenType :: ATTRIBUTE_ID;
 635 | 							memberPrefix = &attributePrefix;
 636 | 							trie = &Namespace :: attributeTrie;
 637 | 
 638 | 							// Then equals sign and opening double quote.
 639 | 							afterNameState = State :: MATCH_SPARSE;
 640 | 							pattern = "=\"";
 641 | 							noMatchState = State :: PARSE_ERROR;
 642 | 							partialMatchState = State :: QUOTE;
 643 | 
 644 | 							// Finally text content up to closing double quote.
 645 | 							matchState = State :: TEXT;
 646 | 							textTokenType = TokenType :: VALUE_START_OFFSET;
 647 | 							textEndChar = '"';
 648 | 							afterTextState = afterValueState;
 649 | 
 650 | 							// Attribute name.
 651 | 							goto BEFORE_NAME;
 652 | 						}
 653 | 				}
 654 | 
 655 | 				break;
 656 | 
 657 | 			case State :: AFTER_CLOSE_ELEMENT_NAME:
 658 | 				if(c == '>') {
 659 | 					state = State :: BEFORE_TEXT;
 660 | 				} else if(!whiteCharTbl[c]) {
 661 | 					return(ErrorType :: PROHIBITED_WHITESPACE);
 662 | 				}
 663 | 
 664 | 				break;
 665 | 
 666 | 			// ------------------------------
 667 | 			// Attribute value parsing begins
 668 | 			// ------------------------------
 669 | 
 670 | 			// Enforce whitespace between attributes.
 671 | 			case State :: AFTER_ATTRIBUTE_VALUE: AFTER_ATTRIBUTE_VALUE:
 672 | 
 673 | 				switch(c) {
 674 | 					case '/':
 675 | 					case '>':
 676 | 
 677 | 						// Switch states without consuming character.
 678 | 						state = State :: AFTER_ELEMENT_NAME;
 679 | 						goto AFTER_ELEMENT_NAME;
 680 | 
 681 | 					default:
 682 | 
 683 | 						if(whiteCharTbl[c]) {
 684 | 							state = State :: AFTER_ELEMENT_NAME;
 685 | 							break;
 686 | 						} else {
 687 | 							return(ErrorType :: INVALID_CHAR);
 688 | 						}
 689 | 				}
 690 | 
 691 | 				break;
 692 | 
 693 | 			// Finished reading an attribute name beginning "xmlns:".
 694 | 			// Parse the namespace prefix it defines.
 695 | 			case State :: DEFINE_XMLNS_BEFORE_PREFIX_NAME:
 696 | 
 697 | 				tokenStart = p - 1;
 698 | 
 699 | 				// Prepare Patricia tree cursor for parsing an xmlns prefix.
 700 | 				state = State :: MATCH_TRIE;
 701 | 				cursor.init(config.prefixTrie);
 702 | 
 703 | 				// TODO: Better use a state without handling of the : char.
 704 | 				afterMatchTrieState = State :: NAME;
 705 | 
 706 | 				afterNameState = State :: DEFINE_XMLNS_AFTER_PREFIX_NAME;
 707 | 				// Prepare to emit the chosen namespace prefix.
 708 | 				nameTokenType = TokenType :: XMLNS_ID;
 709 | 
 710 | 				goto MATCH_TRIE;
 711 | 
 712 | 			case State :: DEFINE_XMLNS_AFTER_PREFIX_NAME:
 713 | 
 714 | 				if(knownName) {
 715 | 					// Store index of namespace prefix in prefix mapping table
 716 | 					// for assigning a new namespace URI.
 717 | 					idPrefix = idToken;
 718 | 				} else {
 719 | 					// If the name was unrecognized, flush tokens so JavaScript
 720 | 					// updates the namespace prefix trie and this tokenizer can
 721 | 					// recognize it in the future.
 722 | 					flush(tokenPtr);
 723 | 				}
 724 | 
 725 | 				// Match equals sign and namespace URI in double quotes.
 726 | 				state = State :: MATCH_SPARSE;
 727 | 				pattern = "=\"";
 728 | 				noMatchState = State :: PARSE_ERROR;
 729 | 				partialMatchState = State :: QUOTE;
 730 | 
 731 | 				matchState = State :: BEFORE_VALUE;
 732 | 				cursor.init(config.uriTrie);
 733 | 				valueTokenType = TokenType :: URI_ID;
 734 | 				textEndChar = '"';
 735 | 
 736 | 				afterValueState = State :: DEFINE_XMLNS_AFTER_URI;
 737 | 
 738 | 				goto MATCH_SPARSE;
 739 | 
 740 | 			case State :: BEFORE_VALUE:
 741 | 
 742 | 				tokenStart = p - 1;
 743 | 
 744 | 				state = State :: MATCH_TRIE;
 745 | 				afterMatchTrieState = State :: VALUE;
 746 | 				goto MATCH_TRIE;
 747 | 
 748 | 			// Parse a value that should match a known set. Similar to
 749 | 			// State :: NAME but reads up to and consumes a final double quote.
 750 | 			case State :: VALUE:
 751 | 
 752 | 				if(c == textEndChar) {
 753 | 					// If the whole value was matched, get associated reference.
 754 | 					idToken = cursor.getData();
 755 | 
 756 | 					if(idToken != Patricia :: notFound) {
 757 | 						if(valueTokenType == TokenType :: URI_ID) {
 758 | 							valueTokenType = TokenType :: NAMESPACE_ID;
 759 | 							idToken = config.namespaceByUriToken[idToken].first;
 760 | 						}
 761 | 						writeToken(valueTokenType, idToken, tokenPtr);
 762 | 
 763 | 						knownName = true;
 764 | 						pos = 0;
 765 | 						state = afterValueState;
 766 | 						break;
 767 | 					} else {
 768 | 						// TODO: Verify emitting partial name works in this case.
 769 | 					}
 770 | 				}
 771 | 
 772 | 				pos += p - tokenStart;
 773 | 
 774 | 				emitPartialName(
 775 | 					p,
 776 | 					static_cast<size_t>(p - chunkBuffer),
 777 | 					TokenType :: PARTIAL_URI_ID,
 778 | 					tokenPtr
 779 | 				);
 780 | 
 781 | 				idToken = Patricia :: notFound;
 782 | 				pos = 0;
 783 | 				state = State :: UNKNOWN_VALUE;
 784 | 				goto UNKNOWN_VALUE;
 785 | 
 786 | 			case State :: UNKNOWN_VALUE: UNKNOWN_VALUE:
 787 | 
 788 | 				while(1) {
 789 | 					if(!valueCharTbl[c]) {
 790 | 						if(c == textEndChar) break;
 791 | 
 792 | 						switch(c) {
 793 | 							case '&':
 794 | 
 795 | 								// TODO: Handle entities.
 796 | 								break;
 797 | 
 798 | 							case '"':
 799 | 							case '\'':
 800 | 							case '<':
 801 | 							case '>':
 802 | 
 803 | 								// TODO: Stricter parsing would ban these.
 804 | 								break;
 805 | 
 806 | 							case ']':
 807 | 
 808 | 								break;
 809 | 
 810 | 							default:
 811 | 
 812 | 								// Disallow nonsense bytes.
 813 | 								return(ErrorType :: INVALID_CHAR);
 814 | 						}
 815 | 					}
 816 | 
 817 | 					updateRowCol(c);
 818 | 					if(!--len) return(ErrorType :: OK);
 819 | 					c = *p++;
 820 | 				}
 821 | 
 822 | 				writeToken(
 823 | 					static_cast<TokenType>(
 824 | 						static_cast<uint32_t>(TokenType :: UNKNOWN_OPEN_ELEMENT_END_OFFSET) -
 825 | 						static_cast<uint32_t>(TokenType :: OPEN_ELEMENT_ID) +
 826 | 						static_cast<uint32_t>(valueTokenType)
 827 | 					),
 828 | 					p - chunkBuffer - 1,
 829 | 					tokenPtr
 830 | 				);
 831 | 
 832 | 				knownName = false;
 833 | 				state = afterValueState;
 834 | 				break;
 835 | 
 836 | 			case State :: DEFINE_XMLNS_AFTER_URI:
 837 | 
 838 | 				if(knownName) {
 839 | 					bindPrefix(idPrefix, idToken);
 840 | 				} else {
 841 | 					// If the value was unrecognized, flush tokens so JavaScript
 842 | 					// updates the uri trie and this tokenizer can recognize it
 843 | 					// in the future.
 844 | 					flush(tokenPtr);
 845 | 
 846 | 					// Reset element namespace to correctly match any following attributes.
 847 | 					elementPrefix.idNamespace = config.namespacePrefixTbl[elementPrefix.idPrefix].first;
 848 | 				}
 849 | 
 850 | 				afterValueState = State :: AFTER_ATTRIBUTE_VALUE;
 851 | 
 852 | 				state = State :: AFTER_ATTRIBUTE_VALUE;
 853 | 				goto AFTER_ATTRIBUTE_VALUE;
 854 | 
 855 | 			// ----------------------------
 856 | 			// Attribute value parsing ends
 857 | 			// ----------------------------
 858 | 
 859 | 			// Tag starting with <! (comment, cdata, entity definition...)
 860 | 			case State :: BEFORE_SGML:
 861 | 
 862 | 				switch(c) {
 863 | 					case '[':
 864 | 
 865 | 						pattern = "CDATA[";
 866 | 						matchState = State :: BEFORE_CDATA;
 867 | 						noMatchState = State :: PARSE_ERROR;
 868 | 						partialMatchState = State :: PARSE_ERROR;
 869 | 
 870 | 						textTokenType = TokenType :: CDATA_START_OFFSET;
 871 | 						afterTextState = State :: BEFORE_TEXT;
 872 | 
 873 | 						state = State :: MATCH;
 874 | 						break;
 875 | 
 876 | 					// <!-- comment -->
 877 | 					case '-':
 878 | 
 879 | 						expected = '-';
 880 | 						nextState = State :: BEFORE_COMMENT;
 881 | 						otherState = State :: PARSE_ERROR;
 882 | 
 883 | 						state = State :: EXPECT;
 884 | 						break;
 885 | 
 886 | 					default:
 887 | 
 888 | 						// writeToken(TokenType :: SGML_START, 0, tokenPtr);
 889 | 						goto SGML_DECLARATION;
 890 | 				}
 891 | 				break;
 892 | 
 893 | 			case State :: SGML_DECLARATION: SGML_DECLARATION:
 894 | 
 895 | 				if(whiteCharTbl[c]) break;
 896 | 
 897 | 				switch(c) {
 898 | 					case '"':
 899 | 					case '\'':
 900 | 
 901 | 						textTokenType = TokenType :: SGML_TEXT_START_OFFSET;
 902 | 						textEndChar = c;
 903 | 						afterTextState = State :: SGML_DECLARATION;
 904 | 
 905 | 						state = State :: TEXT;
 906 | 						break;
 907 | 
 908 | 					case '>':
 909 | 
 910 | 						writeToken(TokenType :: SGML_EMITTED, 0, tokenPtr);
 911 | 
 912 | 						nameCharTbl = xmlNameCharTbl;
 913 | 						nameStartCharTbl = xmlNameStartCharTbl;
 914 | 
 915 | 						state = State :: BEFORE_TEXT;
 916 | 						break;
 917 | 
 918 | 					default:
 919 | 
 920 | 						matchTarget = MatchTarget :: ELEMENT;
 921 | 						nameTokenType = TokenType :: SGML_ID;
 922 | 						memberPrefix = &elementPrefix;
 923 | 
 924 | 						nameCharTbl = dtdNameCharTbl;
 925 | 						nameStartCharTbl = dtdNameCharTbl;
 926 | 						afterNameState = State :: SGML_DECLARATION;
 927 | 
 928 | 						state = State :: BEFORE_NAME;
 929 | 						goto BEFORE_NAME;
 930 | 
 931 | 					case '[':
 932 | 
 933 | 						// Signal start of DTD embedded in DOCTYPE.
 934 | 						writeToken(TokenType :: SGML_NESTED_START, 0, tokenPtr);
 935 | 						++sgmlNesting;
 936 | 
 937 | 						nameCharTbl = xmlNameCharTbl;
 938 | 						nameStartCharTbl = xmlNameStartCharTbl;
 939 | 
 940 | 						state = State :: BEFORE_TEXT;
 941 | 						break;
 942 | 				}
 943 | 				break;
 944 | 
 945 | 			// Inside a processing instruction with the name already parsed.
 946 | 			case State :: AFTER_PROCESSING_NAME: AFTER_PROCESSING_NAME:
 947 | 
 948 | 				switch(c) {
 949 | 					case '?':
 950 | 
 951 | 						// End of an XML processing instruction.
 952 | 						// Handle like a self-closing element.
 953 | 						c = '/';
 954 | 						state = State :: AFTER_ELEMENT_NAME;
 955 | 						goto AFTER_ELEMENT_NAME;
 956 | 
 957 | 					case '>':
 958 | 
 959 | 						// End of an SGML processing instruction.
 960 | 						if(!updateElementStack(TokenType :: CLOSE_ELEMENT_ID)) return(ErrorType :: OTHER);
 961 | 						writeToken(TokenType :: CLOSED_ELEMENT_EMITTED, idElement, tokenPtr);
 962 | 
 963 | 						state = State :: BEFORE_TEXT;
 964 | 						break;
 965 | 
 966 | 					case '/':
 967 | 
 968 | 						return(ErrorType :: INVALID_CHAR);
 969 | 
 970 | 					default:
 971 | 
 972 | 						// Switch states without consuming character.
 973 | 						state = State :: AFTER_ELEMENT_NAME;
 974 | 						goto AFTER_ELEMENT_NAME;
 975 | 				}
 976 | 
 977 | 				break;
 978 | 
 979 | 			// Enforce whitespace between processing instruction attributes.
 980 | 			case State :: AFTER_PROCESSING_VALUE:
 981 | 
 982 | 				switch(c) {
 983 | 					case '?':
 984 | 					case '>':
 985 | 
 986 | 						// Switch states without consuming character.
 987 | 						state = State :: AFTER_PROCESSING_NAME;
 988 | 						goto AFTER_PROCESSING_NAME;
 989 | 
 990 | 					default:
 991 | 
 992 | 						if(whiteCharTbl[c]) {
 993 | 							state = State :: AFTER_PROCESSING_NAME;
 994 | 							break;
 995 | 						} else {
 996 | 							return(ErrorType :: INVALID_CHAR);
 997 | 						}
 998 | 				}
 999 | 
1000 | 				break;
1001 | 
1002 | 			case State :: BEFORE_COMMENT:
1003 | 
1004 | 				writeToken(TokenType :: COMMENT_START_OFFSET, p - chunkBuffer - 1, tokenPtr);
1005 | 
1006 | 				state = State :: COMMENT;
1007 | 				goto COMMENT;
1008 | 
1009 | 			// Note: the terminating "-->" is included in the output byte range.
1010 | 			case State :: COMMENT: COMMENT:
1011 | 
1012 | 				while(1) {
1013 | 					if(c == '-') {
1014 | 						++pos;
1015 | 					} else if(c == '>' && pos >= 2) {
1016 | 						break;
1017 | 					} else {
1018 | 						pos = 0;
1019 | 					}
1020 | 
1021 | 					updateRowCol(c);
1022 | 					if(!--len) return(ErrorType :: OK);
1023 | 					c = *p++;
1024 | 				}
1025 | 
1026 | 				writeToken(
1027 | 					TokenType :: COMMENT_END_OFFSET,
1028 | 					p - chunkBuffer,
1029 | 					tokenPtr
1030 | 				);
1031 | 
1032 | 				pos = 0;
1033 | 				state = State :: BEFORE_TEXT;
1034 | 				break;
1035 | 
1036 | 			case State :: EXPECT:
1037 | 
1038 | 				state = (c == expected) ? nextState : otherState;
1039 | 
1040 | 				if(state == State :: PARSE_ERROR) goto PARSE_ERROR;
1041 | 				break;
1042 | 
1043 | 			case State :: PARSE_ERROR: PARSE_ERROR:
1044 | 
1045 | 				return(ErrorType :: OTHER);
1046 | 
1047 | 			default:
1048 | 
1049 | 				break;
1050 | 		}
1051 | 
1052 | 		// Only read the next character at the end of the loop, to allow
1053 | 		// reprocessing the same character (changing states without
1054 | 		// consuming input) by using "continue".
1055 | 		updateRowCol(c);
1056 | 		if(!--len) return(ErrorType :: OK);
1057 | 		c = *p++;
1058 | 	}
1059 | }
1060 | 
1061 | inline void Parser :: emitPartialName(
1062 | 	const unsigned char *p,
1063 | 	size_t offset,
1064 | 	TokenType tokenType,
1065 | 	uint32_t *&tokenPtr
1066 | ) {
1067 | 	// Emit tokens describing a partially matched name. "pos" is the
1068 | 	// number of name characters consumed so far ("p" is unused here).
1069 | 	// Recovery is needed only when more than one character was consumed
1070 | 	// and they are no longer all present in the input buffer.
1071 | 	const bool needsRecovery = pos > 1 && (pos > offset || DEBUG_PARTIAL_NAME_RECOVERY);
1072 | 
1073 | 	if(!needsRecovery) {
1074 | 		// Fast branch: the consumed part of the name still remains in
1075 | 		// the input buffer, so emitting its starting offset is enough.
1076 | 		writeToken(TokenType :: UNKNOWN_START_OFFSET, offset - pos, tokenPtr);
1077 | 		return;
1078 | 	}
1079 | 
1080 | 	// NOTE: Very rare and complicated edge case; test it with the debug flag.
1081 | 	const uint32_t leafId = cursor.findLeaf();
1082 | 	if(leafId != Patricia :: notFound) {
1083 | 		// Emit the length of the consumed part.
1084 | 		writeToken(TokenType :: PARTIAL_LEN, pos - 1, tokenPtr);
1085 | 		// Emit a descendant leaf node; any such leaf begins with this part.
1086 | 		writeToken(tokenType, leafId, tokenPtr);
1087 | 	}
1088 | 	// Emit the offset of the remaining part of the name.
1089 | 	writeToken(TokenType :: UNKNOWN_START_OFFSET, offset - 1, tokenPtr);
1090 | }
1091 | 
1092 | struct Init {
1093 | 	// Set tbl[c] = flag for each byte c in the inclusive (first, last)
1094 | 	// pairs packed into the NUL-terminated string "ranges".
1095 | 	void setRange(unsigned char *tbl, const char *ranges, unsigned char flag) {
1096 | 		const unsigned char *pair = reinterpret_cast<const unsigned char *>(ranges);
1097 | 		// Stop at an unpaired byte; the wide counter cannot wrap at 0xff.
1098 | 		for(; pair[0] && pair[1]; pair += 2) {
1099 | 			for(unsigned int c = pair[0]; c <= pair[1]; ++c) tbl[c] = flag;
1100 | 		}
1101 | 	}
1102 | 	// Fill the character class lookup tables used by the parser.
1103 | 	Init() {
1104 | 		for(unsigned int i = 0; i <= 0xff; ++i) {
1105 | 			whiteCharTbl[i] = 0;
1106 | 			valueCharTbl[i] = (i >= ' ' && i <= 0xf7);
1107 | 			xmlNameStartCharTbl[i] = 0;
1108 | 			xmlNameCharTbl[i] = 0;
1109 | 			dtdNameCharTbl[i] = 0;
1110 | 		}
1111 | 		// A literal's terminating NUL is iterated too; "c &&" skips it.
1112 | 		for(unsigned char c : "\r\n\t ")    c && (valueCharTbl[c] = 1, whiteCharTbl[c] = 1);
1113 | 		// These bytes are excluded from the plain value characters.
1114 | 		for(unsigned char c : "\"'&<>]\x7f") c && (valueCharTbl[c] = 0);
1115 | 		setRange(xmlNameStartCharTbl,  "__AZaz\x80\xf7", 1);
1116 | 		setRange(xmlNameCharTbl, "..--09__AZaz\x80\xf7", 1);
1117 | 		setRange(dtdNameCharTbl, "##%%..--09__AZaz\x80\xf7", 1);
1118 | 	}
1119 | };
1120 | // Global instance: its constructor fills the tables during static initialization.
1121 | Init init;
1122 | 
1123 | #include <nbind/nbind.h>
1124 | 
1125 | #ifdef NBIND_CLASS
1126 | // NOTE(review): presumably lets nbind pass ErrorType values as int32_t - confirm.
1127 | NBIND_ALIAS(Parser :: ErrorType, int32_t);
1128 | // Parser members registered with nbind; built only if NBIND_CLASS is defined.
1129 | NBIND_CLASS(Parser) {
1130 | 	construct<const ParserConfig &>();
1131 | 	method(getConfig);
1132 | 	method(setCodeBuffer);
1133 | 	method(setPrefix);
1134 | 	method(bindPrefix);
1135 | 	getter(getRow);
1136 | 	getter(getCol);
1137 | 	method(parse);
1138 | 	method(destroy);
1139 | }
1140 | 
1141 | #endif
1142 | 


--------------------------------------------------------------------------------