├── .gitignore ├── README.md ├── examples ├── example1.ts └── example2.ts ├── package.json ├── tsconfig.json └── typed-lexer.ts /.gitignore: -------------------------------------------------------------------------------- 1 | dist/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # typed-lexer 2 | 3 | An easy-to-use lexer that ships with TypeScript type definitions. 4 | This lexer is inspired by [aaditmshah/lexer](https://github.com/aaditmshah/lexer "aaditmshah/lexer") but tries 5 | to provide a cleaner API. 6 | 7 | ## Installation 8 | 9 | `typed-lexer` can be installed via the node package manager using the command `npm install typed-lexer`. 10 | 11 | ## Usage 12 | 13 | The lexer supports state-based tokenizing. Both the state and the tokens can be typed. 14 | See the examples in the examples directory for other applications. 15 | 16 | ```typescript 17 | import { LexerFactory, matches, or } from "typed-lexer"; 18 | 19 | type State = "start" | "inRangeBlock"; 20 | type TokenType = "DefinedIdentifier" | "WS" | "ProdDef" 21 | | "Identifier" | "StringStart" | "String" | "StringEnd" 22 | | "RangeStart" | "Range" | "RangeEnd" | "HexRef" | "Invalid"; 23 | 24 | export class MyLexerFactory extends LexerFactory { 25 | constructor() { 26 | super("start"); 27 | 28 | const start = matches("start"); 29 | const inRangeBlock = matches("inRangeBlock"); 30 | 31 | this.addRuleWithRegexGroups(/([a-zA-Z][a-zA-Z0-9]*)(\s*)(::=)/, 32 | [ "DefinedIdentifier", "WS", "ProdDef" ], start); 33 | this.addSimpleRule(/[a-zA-Z_][a-zA-Z0-9_]*/, "Identifier", start); 34 | this.addSimpleRule(/\s+/, "WS", start); 35 | 36 | this.addRuleWithRegexGroups(/(")(.*?)(")/, [ "StringStart", "String", "StringEnd" ], start); 37 | this.addSimpleRule(/#x[0-9A-F]+/, "HexRef", or(start, inRangeBlock)); 38 | 39 | this.addSimpleRule("[", "RangeStart", start, "inRangeBlock"); 40 | this.addSimpleRule("]", 
"RangeEnd", inRangeBlock, "start"); 41 | this.addSimpleRule("-", "Range", inRangeBlock); 42 | this.addSimpleRule(/./, "String", inRangeBlock); 43 | 44 | this.addSimpleRule(/./, "Invalid", start); 45 | } 46 | } 47 | 48 | const result = new MyLexerFactory() 49 | .getLexerFor("foo ::= (bar (',' bar)*)?") 50 | .readAllWithStr(); 51 | 52 | for (const t of result) 53 | console.log(`${t.token} (${t.str})`); 54 | 55 | ``` 56 | 57 | ## TODOs 58 | * Add support for Jison 59 | * Add support for CodeMirror syntax highlighting 60 | * Improve documentation 61 | * Add proper tests 62 | * Improve performance by storing rules that match constant strings in a hash map or a trie -------------------------------------------------------------------------------- /examples/example1.ts: -------------------------------------------------------------------------------- 1 | import { LexerFactory, clone, TokenWithLen } from "../typed-lexer"; 2 | 3 | 4 | let lineCounter = new LexerFactory({ line: 0, column: 0 }); 5 | // either return new state 6 | lineCounter.addRule("\n", (m, ret, state) => ret.state({ line: state.line + 1, column: 0 })); 7 | // or modify state and return true to proceed. Returning false would try the next rule. 
8 | lineCounter.addDefaultRule((m, ret, state) => { state.column++; return true; }); 9 | 10 | var lineBreaks = lineCounter.getLexerFor("test\nhallo").readToEnd().getCurState(); 11 | console.log(lineBreaks); 12 | 13 | 14 | 15 | 16 | 17 | 18 | interface State { 19 | indent: number[]; 20 | start: boolean; 21 | } 22 | 23 | type Token = "Indent" | "Dedent" | "Other" | "WS" | "Identifier"; 24 | 25 | class MyLexer extends LexerFactory { 26 | constructor() { 27 | super({ indent: [0], start: true }); 28 | 29 | this.addRule(/[\t ]*/, (m, ret, state) => { 30 | state.start = false; 31 | state.indent.unshift(m.length); 32 | return ret.token("Indent", state); 33 | }, s => s.start); 34 | 35 | const notStart = (s: State) => !s.start; 36 | 37 | this.addRule(/\n[\t ]*/, (m, ret, state) => { 38 | const indent = m.length; 39 | if (indent > state.indent[0]) { 40 | state.indent.unshift(indent); 41 | return ret.token("Indent"); 42 | } 43 | const tokens: Token[] = []; 44 | while (indent < state.indent[0]) { 45 | tokens.push("Dedent"); 46 | state.indent.shift(); 47 | } 48 | return ret.tokens(tokens); 49 | 50 | }, notStart); 51 | 52 | this.addSimpleRule(/[a-zA-Z_][a-zA-Z0-9_]*/, "Identifier", notStart); 53 | this.addSimpleRule(/[ \r\t]+/, "WS", notStart); 54 | 55 | this.addDefaultSimpleRule("Other"); 56 | } 57 | } 58 | 59 | const result = new MyLexer().getLexerFor( 60 | ` 61 | class Test1 62 | foo bar 63 | return 4 64 | class Test2 65 | bazz buzz 66 | `).readAllWithStr(); 67 | 68 | for (const r of result) 69 | console.log(r); -------------------------------------------------------------------------------- /examples/example2.ts: -------------------------------------------------------------------------------- 1 | import { LexerFactory, matches, or } from "../typed-lexer"; 2 | 3 | type State = "start" | "inRangeBlock"; 4 | type TokenType = "WS" | "Identifier" | "DefinedIdentifier" | "Disj" | "CondDisj" 5 | | "Without" | "OpenParen" | "CloseParen" | "Opt" | "Star" | "PosStar" | "ProdDef" | 
"UnicodePropertyRef" 6 | | "SingleChar" | "String" | "StringStart" | "StringEnd" | "HexRef" | "Range" | "RangeStart" | "RangeEnd" | "Invalid"; 7 | 8 | 9 | export class EglLexerFactory extends LexerFactory { 10 | constructor() { 11 | super("start"); 12 | 13 | const start = matches("start"); 14 | const inRangeBlock = matches("inRangeBlock"); 15 | 16 | this.addRuleWithRegexGroups(/([a-zA-Z][a-zA-Z0-9]*)(\s*)(::=)/, [ "DefinedIdentifier", "WS", "ProdDef" ], start); 17 | this.addSimpleRule(/[a-zA-Z_][a-zA-Z0-9_]*/, "Identifier", start); 18 | this.addSimpleRule(/\s+/, "WS", start); 19 | 20 | this.addSimpleRules({ 21 | "||": "CondDisj", 22 | "|": "Disj", 23 | ".": "SingleChar", 24 | "\\": "Without", 25 | "?": "Opt", 26 | "*": "Star", 27 | "+": "PosStar", 28 | "(": "OpenParen", 29 | ")": "CloseParen", 30 | "#": "UnicodePropertyRef" 31 | }, start); 32 | 33 | this.addRuleWithRegexGroups(/(")(.*?)(")/, [ "StringStart", "String", "StringEnd" ], start); 34 | this.addRuleWithRegexGroups(/(')(.*?)(')/, [ "StringStart", "String", "StringEnd" ], start); 35 | this.addSimpleRule(/#x[0-9A-F]+/, "HexRef", or(start, inRangeBlock)); 36 | 37 | this.addSimpleRule("[", "RangeStart", start, "inRangeBlock"); 38 | this.addSimpleRule("]", "RangeEnd", inRangeBlock, "start"); 39 | this.addSimpleRule("-", "Range", inRangeBlock); 40 | this.addDefaultSimpleRule("String", inRangeBlock); 41 | 42 | this.addDefaultSimpleRule("Invalid"); 43 | } 44 | } 45 | 46 | const result = new EglLexerFactory() 47 | .getLexerFor("foo ::= (bar (',' bar)*)?") 48 | .readAllWithStr(); 49 | 50 | for (const t of result) 51 | console.log(`${t.token} (${t.str})`); 52 | 53 | /* this code prints the following to the console: 54 | 55 | DefinedIdentifier (foo) 56 | WS ( ) 57 | ProdDef (::=) 58 | WS ( ) 59 | OpenParen (() 60 | Identifier (bar) 61 | WS ( ) 62 | OpenParen (() 63 | StringStart (') 64 | String (,) 65 | StringEnd (') 66 | WS ( ) 67 | Identifier (bar) 68 | CloseParen ()) 69 | Star (*) 70 | CloseParen ()) 71 | Opt (?) 
72 | 73 | */ -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "typed-lexer", 3 | "version": "1.0.2", 4 | "description": "An easy to use lexer that features typescript typings.", 5 | "main": "dist/typed-lexer.js", 6 | "typings": "dist/typed-lexer", 7 | "files": [ "dist/" ], 8 | "scripts": { 9 | "test": "echo \"Error: no test specified\" && exit 1" 10 | }, 11 | "keywords": [ 12 | "lexer", 13 | "typescript", 14 | "tokenizer" 15 | ], 16 | "author": "Henning Dieterichs", 17 | "repository": { 18 | "type": "git", 19 | "url": "https://github.com/hediet/typed-lexer.git" 20 | }, 21 | "license": "MIT" 22 | } 23 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": "1.6.2", 3 | "compilerOptions": { 4 | "module": "commonjs", 5 | "declaration": true, 6 | "outDir": "dist/" 7 | }, 8 | "files": [ 9 | "typed-lexer.ts" 10 | ] 11 | } -------------------------------------------------------------------------------- /typed-lexer.ts: -------------------------------------------------------------------------------- 1 | export interface TokenWithPosAndLen { 2 | token: TToken; 3 | startPos: number; 4 | length: number; 5 | } 6 | 7 | export interface TokenWithLen { 8 | token: TToken; 9 | length: number; 10 | } 11 | 12 | export interface TokenWithStr { 13 | token: TToken; 14 | str: string; 15 | } 16 | 17 | export interface Result { 18 | typeDiscriminator_Result: string; 19 | } 20 | 21 | export interface ResultFactory { 22 | tokens(tokens: TToken[], nextState?: TState): Result; 23 | tokensWithPos(tokens: TokenWithPosAndLen[], nextState?: TState): Result; 24 | tokensWithLen(tokens: TokenWithLen[], nextState?: TState): Result; 25 | token(token: TToken, nextState?: TState): Result; 26 | state(nextState: TState): 
Result; 27 | nothing(): Result; 28 | } 29 | 30 | export type Handler = (matched: string, ret: ResultFactory, state: TState, matchedGroups?: RegExpExecArray) => Result | boolean; 31 | export type Predicate = (v: T) => boolean; 32 | 33 | 34 | function isString(a: any): a is string { return typeof(a) === "string"; } 35 | function isBool(a: any): a is boolean { return typeof(a) === "boolean"; } 36 | 37 | 38 | class ResultImplementation implements Result { 39 | public typeDiscriminator_Result: string; 40 | 41 | public tokens: TokenWithPosAndLen[]; 42 | public nextState: TState; // |undefined 43 | 44 | public matchedString: string; 45 | } 46 | 47 | class ResultFactoryImplementation implements ResultFactory { 48 | 49 | constructor(public matchedString: string) { 50 | } 51 | 52 | public tokensWithPos(tokens: TokenWithPosAndLen[], nextState?: TState): ResultImplementation { 53 | const r = new ResultImplementation(); 54 | r.nextState = nextState; 55 | r.matchedString = this.matchedString; 56 | r.tokens = tokens; 57 | return r; 58 | } 59 | 60 | public tokens(tokens: TToken[], nextState?: TState): ResultImplementation { 61 | if (tokens.length == 0) 62 | return this.tokensWithPos([], nextState); 63 | let t2 = tokens.map>(t => ({ token: t, startPos: 0, length: 0 })); 64 | t2[t2.length - 1].length = this.matchedString.length; 65 | return this.tokensWithPos(t2, nextState); 66 | } 67 | 68 | public tokensWithLen(tokens: TokenWithLen[], nextState?: TState): ResultImplementation { 69 | const t2 = tokens as TokenWithPosAndLen[]; 70 | 71 | let pos = 0; 72 | for (const t of t2) { 73 | t.startPos = pos; 74 | pos += t.length; 75 | } 76 | 77 | return this.tokensWithPos(t2, nextState); 78 | } 79 | 80 | public token(token: TToken, nextState?: TState): ResultImplementation { 81 | return this.tokensWithPos([{ token: token, startPos: 0, length: this.matchedString.length }], nextState); 82 | } 83 | 84 | public state(nextState: TState): ResultImplementation { 85 | return this.tokensWithPos([], 
nextState); 86 | } 87 | 88 | public nothing(): ResultImplementation { 89 | return this.tokensWithPos([]); 90 | } 91 | } 92 | 93 | abstract class Rule { 94 | constructor(private handler: Handler, private statePredicate?: Predicate) { 95 | } 96 | 97 | protected abstract internalMatch(str: string): [string, RegExpExecArray]; 98 | 99 | public match(str: string, state: TState): ResultImplementation { // | null 100 | 101 | if (this.statePredicate && !this.statePredicate(state)) return null; 102 | 103 | let [ matchedStr, matchedGroups ] = this.internalMatch(str); 104 | if (matchedStr == null) return null; 105 | 106 | const ret = new ResultFactoryImplementation(matchedStr); 107 | let result = this.handler(matchedStr, ret, state, matchedGroups) as (ResultImplementation | boolean); 108 | 109 | if (isBool(result)) { 110 | if (!result) return null; 111 | return ret.tokens([], state); 112 | } 113 | else { 114 | if (result.nextState === undefined) 115 | result.nextState = state; 116 | return result; 117 | } 118 | } 119 | } 120 | 121 | class RegExRule extends Rule { 122 | private matchRegex: RegExp; 123 | 124 | constructor(regex: RegExp, handler: Handler, statePredicate?: Predicate) { 125 | super(handler, statePredicate); 126 | this.matchRegex = new RegExp("^" + regex.source); 127 | } 128 | 129 | protected internalMatch(str: string): [string, RegExpExecArray] { 130 | let matchedGroups = this.matchRegex.exec(str); 131 | if (matchedGroups == null || matchedGroups.length == 0) return [ null, null ]; 132 | return [ matchedGroups[0], matchedGroups ]; 133 | } 134 | } 135 | 136 | class StringRule extends Rule { 137 | constructor(private matchStr: string, handler: Handler, statePredicate?: Predicate) { 138 | super(handler, statePredicate); 139 | } 140 | 141 | protected internalMatch(str: string): [string, RegExpExecArray] { 142 | const str2 = str.substr(0, this.matchStr.length); 143 | if (str2 !== this.matchStr) return [ null, null ]; 144 | return [ this.matchStr, null ]; 145 | } 146 | } 
/**
 * Collects lexing rules and creates a {@link Lexer} for each input string.
 *
 * Rules are tried in the order they were added and the first rule that
 * matches wins, so specific rules must be registered before general ones.
 *
 * NOTE(review): the type-parameter lists of this file were lost during
 * extraction. The order `<TToken, TState>` used here is an assumption and
 * must be confirmed against the sibling declarations (`Rule`, `Handler`, ...).
 *
 * @typeParam TToken Type of the tokens produced by the rules.
 * @typeParam TState Type of the lexer state the rules can inspect and change.
 */
export class LexerFactory<TToken, TState> {
    private rules: Rule<TToken, TState>[] = [];

    /**
     * @param startState State a lexer starts in when `getLexerFor` is not
     *                   given an explicit one.
     */
    constructor(private startState?: TState) {
    }

    /**
     * Registers a rule with a custom handler.
     *
     * @param regex          A literal string (compared against the start of the
     *                       remaining input) or a regular expression. Regular
     *                       expressions are rebuilt from `source` with a leading
     *                       `^`, so flags are dropped and a top-level alternation
     *                       should be wrapped in a group.
     * @param handler        Called with the matched text. Returning `false`
     *                       rejects the match (the next rule is tried), `true`
     *                       consumes it without emitting tokens.
     * @param statePredicate When given, the rule only applies in states for
     *                       which the predicate returns `true`.
     * @returns This factory, for chaining.
     */
    public addRule(regex: RegExp | string, handler: Handler<TToken, TState>, statePredicate?: Predicate<TState>): this {
        const rule: Rule<TToken, TState> = isString(regex)
            ? new StringRule<TToken, TState>(regex, handler, statePredicate)
            : new RegExRule<TToken, TState>(regex, handler, statePredicate);

        this.rules.push(rule);
        return this;
    }

    /**
     * Registers a catch-all rule that matches any single character
     * (line breaks included).
     *
     * @param handler        Handler for the character; when omitted the
     *                       character is consumed and no token is emitted.
     * @param statePredicate Optional state restriction, see `addRule`.
     * @returns This factory, for chaining.
     */
    public addDefaultRule(handler?: Handler<TToken, TState>, statePredicate?: Predicate<TState>): this {
        const effectiveHandler: Handler<TToken, TState> =
            handler !== undefined ? handler : (m, ret) => ret.nothing();

        return this.addRule(/[\s\S]/, effectiveHandler, statePredicate);
    }

    /**
     * Registers a catch-all rule that emits `token` for any single character.
     *
     * @param token          Token to emit; when omitted the character is only consumed.
     * @param statePredicate Optional state restriction, see `addRule`.
     * @returns This factory, for chaining.
     */
    public addDefaultSimpleRule(token?: TToken, statePredicate?: Predicate<TState>): this {
        return this.addSimpleRule(/[\s\S]/, token, statePredicate);
    }

    /**
     * Registers a rule that emits one token spanning the whole match.
     *
     * @param regex          Pattern, see `addRule`.
     * @param token          Token to emit; when `undefined` the match is
     *                       consumed silently and only the state is switched.
     * @param statePredicate Optional state restriction, see `addRule`.
     * @param nextState      State to switch to; when `undefined` the current
     *                       state is kept.
     * @returns This factory, for chaining.
     */
    public addSimpleRule(regex: RegExp | string, token?: TToken, statePredicate?: Predicate<TState>, nextState?: TState): this {
        if (token === undefined)
            return this.addRule(regex, (m, ret) => ret.state(nextState), statePredicate);

        return this.addRule(regex, (m, ret) => ret.token(token, nextState), statePredicate);
    }

    /**
     * Registers one literal-string rule per own property of `rules`.
     *
     * Rules are added in property enumeration order (insertion order for
     * non-numeric keys), so list a longer literal such as `"||"` before its
     * prefix `"|"`.
     *
     * @param rules          Map from literal text to the token it produces.
     * @param statePredicate Optional state restriction, see `addRule`.
     * @param nextState      State to switch to after any of the rules matched.
     * @returns This factory, for chaining.
     */
    public addSimpleRules(rules: { [char: string]: TToken }, statePredicate?: Predicate<TState>, nextState?: TState): this {
        // Object.keys only yields own properties; a bare for..in would also
        // register rules for enumerable keys inherited through the prototype.
        for (const literal of Object.keys(rules))
            this.addSimpleRule(literal, rules[literal], statePredicate, nextState);
        return this;
    }

    /**
     * Registers a regex rule that emits one token per capture group:
     * `tokens[i]` is assigned to capture group `i + 1`.
     *
     * Token positions are derived by laying the groups end to end from the
     * start of the match, so the groups should cover the match contiguously.
     *
     * @param regex          Expression with one capture group per token.
     * @param tokens         Tokens for the capture groups, in group order.
     * @param statePredicate Optional state restriction, see `addRule`.
     * @param nextState      State to switch to after a match.
     * @returns This factory, for chaining.
     */
    public addRuleWithRegexGroups(regex: RegExp, tokens: TToken[], statePredicate?: Predicate<TState>, nextState?: TState): this {
        return this.addRule(regex, (m, ret, state, groups) =>
            ret.tokensWithLen(
                // An optional group that did not participate is `undefined`;
                // treat it as an empty match instead of throwing on `.length`.
                groups.slice(1).map((g, idx) => ({ token: tokens[idx], length: g ? g.length : 0 })),
                nextState),
            statePredicate);
    }

    /**
     * Creates a lexer over `input` that uses the rules of this factory.
     *
     * @param input      Text to tokenize.
     * @param startState Initial state; passed through as-is. When omitted, a
     *                   deep copy of the factory's start state is used (or
     *                   `null` if the factory has none), because handlers are
     *                   allowed to mutate the state in place and lexers must
     *                   not leak such mutations into each other. The copy is
     *                   made with `clone`, so the factory's start state should
     *                   be plain data.
     * @returns A new lexer positioned at the start of `input`.
     */
    public getLexerFor(input: string, startState?: TState): Lexer<TToken, TState> {
        let initialState: TState;
        if (startState !== undefined)
            initialState = startState;
        else if (this.startState !== undefined)
            initialState = clone(this.startState);
        else
            initialState = null;

        return new Lexer<TToken, TState>(input, this.rules, initialState);
    }
}


/**
 * Tokenizes one input string with a fixed list of rules.
 *
 * The end of the input is signalled by `undefined`, so `undefined` must not
 * be used as a token value.
 *
 * NOTE(review): type-parameter order `<TToken, TState>` is assumed, see
 * `LexerFactory`.
 */
export class Lexer<TToken, TState> {

    private pos: number = 0;
    private cur: TokenWithPosAndLen<TToken> = null;
    // Tokens produced by the last matching rule that `next` has not returned yet.
    private restrained: TokenWithPosAndLen<TToken>[] = [];
    private rules: Rule<TToken, TState>[];

    /**
     * @param input Text to tokenize.
     * @param rules Rules to apply, in priority order.
     * @param state Initial lexer state.
     */
    constructor(private input: string, rules: any[], private state: TState) {
        this.rules = rules;
    }

    /**
     * Consumes the remaining input and discards the tokens; useful when only
     * the final state is of interest.
     *
     * @returns This lexer.
     */
    public readToEnd(): this {
        while (this.next() !== undefined) {
            // tokens are intentionally dropped
        }
        return this;
    }

    /** Consumes the remaining input and returns all remaining tokens. */
    public readAll(): TToken[] {
        const result: TToken[] = [];
        for (let token = this.next(); token !== undefined; token = this.next())
            result.push(token);
        return result;
    }

    /**
     * Consumes the remaining input and returns every remaining token together
     * with the slice of the input it covers.
     */
    public readAllWithStr(): TokenWithStr<TToken>[] {
        const result: TokenWithStr<TToken>[] = [];
        for (let token = this.next(); token !== undefined; token = this.next()) {
            const start = this.cur.startPos;
            result.push({ token: token, str: this.input.slice(start, start + this.cur.length) });
        }
        return result;
    }


    /** Returns the complete input string. */
    public getInput(): string { return this.input; }

    /** Returns the token last returned by `next` with its position, or `undefined` at the end. */
    public getCur(): TokenWithPosAndLen<TToken> { return this.cur; }
    /** Returns the token last returned by `next`, or `undefined` if there is none. */
    public getCurToken(): TToken { return this.cur ? this.cur.token : undefined; }
    /** Returns the current lexer state. */
    public getCurState(): TState { return this.state; }

    /** Returns the already matched tokens that `next` has not handed out yet. */
    public getRestrained(): TokenWithPosAndLen<TToken>[] { return this.restrained; }


    /**
     * Advances to the next token.
     *
     * @returns The next token, or `undefined` once the input is exhausted.
     * @throws Error when no rule matches the remaining input.
     */
    public next(): TToken { // |undefined

        // A rule may consume input without emitting tokens, so keep matching
        // until at least one token is queued or the input is exhausted.
        while (this.restrained.length === 0) {

            if (this.pos >= this.input.length) {
                this.cur = undefined;
                return undefined;
            }

            const curStr = this.input.slice(this.pos);

            let result: ResultImplementation<TToken, TState> = null;
            for (const r of this.rules) {
                result = r.match(curStr, this.state);
                if (result != null) break;
            }

            if (result == null) throw new Error(`${curStr} could not be matched!`);

            for (const t of result.tokens) {
                t.startPos += this.pos; // rule positions are relative to the match start
                this.restrained.push(t);
            }

            this.pos += result.matchedString.length;
            this.state = result.nextState;
        }

        this.cur = this.restrained.shift();
        return this.cur.token;
    }
}

/** Creates a predicate that is true for values strictly equal to one of `elements`. */
export function matches<T>(...elements: T[]): Predicate<T> { return (other) => elements.some(element => element === other); }
/** Creates a predicate that is true for values strictly equal to none of `elements`. */
export function matchesNot<T>(...elements: T[]): Predicate<T> { return (other) => !elements.some(element => element === other); }
/** Creates a predicate that is true when all of `ops` are true (and true for no `ops`). */
export function and<T>(...ops: Predicate<T>[]): Predicate<T> { return (other) => ops.every(o => o(other)); }
/** Creates a predicate that is true when at least one of `ops` is true. */
export function or<T>(...ops: Predicate<T>[]): Predicate<T> { return (other) => ops.some(o => o(other)); }

// from http://stackoverflow.com/questions/728360/most-elegant-way-to-clone-a-javascript-object
/**
 * Creates a deep copy of plain data.
 *
 * Primitives, `null` and `undefined` are returned as-is; `Date`s, arrays and
 * objects are copied recursively. Objects keep their prototype, only own
 * enumerable properties are copied. Cyclic structures and objects that rely
 * on internal slots (e.g. `Map`, `Set`, `RegExp`) are not supported.
 *
 * @param obj Value to copy.
 * @returns A copy that shares no mutable structure with `obj`.
 */
export function clone<T>(obj: T): T {
    // Handle the simple types, and null or undefined
    if (obj === null || obj === undefined || typeof obj !== "object") return obj;

    const source: any = obj;

    // Handle Date
    if (source instanceof Date)
        return new Date(source.getTime()) as any;

    // Handle Array
    if (Array.isArray(source)) {
        const items: any[] = [];
        for (let i = 0, len = source.length; i < len; i++)
            items[i] = clone(source[i]);
        return items as any;
    }

    // Handle Object: keep the prototype so that class instances stay usable.
    const copy: any = Object.create(Object.getPrototypeOf(source));
    for (const attr in source) {
        // Not `source.hasOwnProperty(...)`: the object may shadow that method
        // or have no prototype at all.
        if (Object.prototype.hasOwnProperty.call(source, attr))
            copy[attr] = clone(source[attr]);
    }
    return copy;
}