├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── core.js
├── index.js
├── package.json
└── tests.js

/.gitignore:
--------------------------------------------------------------------------------
node_modules
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
language: node_js
node_js:
  - "node" # latest / current
  - "lts/*"
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2015 Small Helm LLC

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# tokenizer2

[![build status](https://secure.travis-ci.org/smallhelm/tokenizer2.png)](https://travis-ci.org/smallhelm/tokenizer2)

tokenize any text stream given some basic regex rules to match tokens

**NOTE** This library works well, but I don't use it anymore. I just use `while` loops in a state-machine pattern to tokenize; no library needed (or wanted). Here are some examples: [one](https://github.com/farskipper/ecmaless/blob/5503521ccb5c28b03fcb7bbeb3d6dd81e34e1e7a/packages/ecmaless-parser2/src/tokenizer.js#L58), [two](https://github.com/Picolab/node-krl-parser/blob/478df6033ad55f239d89be95c40936cbfaf07058/src/tokenizer.js). Yes, it's stateful and verbose, but in my experience it's easier to write and maintain (using TDD, of course). Just set up a test runner, start small, and grow it to tokenize everything you want. Once you get the hang of it, it's easy to work out how to tokenize something, since you have full control of the state machine.
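As a rough illustration of that hand-rolled approach (this sketch is not part of tokenizer2; the `tokenize` helper and its token types are made up for the example), a single `while` loop with explicit `line`/`col` state is enough:

```js
//A hand-rolled state-machine tokenizer, sketched for illustration only.
//It recognizes three ad-hoc token types: whitespace, number, and symbol.
var tokenize = function(src){
  var tokens = [];
  var i = 0, line = 1, col = 1;

  var push = function(type, text){
    tokens.push({type: type, src: text, line: line, col: col});
    //advance the line/col counters past the consumed text
    var parts = text.split('\n');
    line += parts.length - 1;
    col = (parts.length > 1 ? 1 : col) + parts[parts.length - 1].length;
    i += text.length;
  };

  while(i < src.length){
    var j = i;
    if(/\s/.test(src[i])){
      while(j < src.length && /\s/.test(src[j])) j++;
      push('whitespace', src.substring(i, j));
    }else if(/[0-9]/.test(src[i])){
      while(j < src.length && /[0-9.]/.test(src[j])) j++;
      push('number', src.substring(i, j));
    }else{
      while(j < src.length && !/\s/.test(src[j])) j++;
      push('symbol', src.substring(i, j));
    }
  }
  return tokens;
};

tokenize('print 123.25\ntwice');
//=> [ {type: 'symbol'    , src: 'print' , line: 1, col: 1},
//     {type: 'whitespace', src: ' '     , line: 1, col: 6},
//     {type: 'number'    , src: '123.25', line: 1, col: 7},
//     {type: 'whitespace', src: '\n'    , line: 1, col: 13},
//     {type: 'symbol'    , src: 'twice' , line: 2, col: 1} ]
```

The `line`/`col` bookkeeping above mirrors what tokenizer2 reports with each token.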
## Example
```js
var fs = require('fs');
var tokenizer2 = require('tokenizer2');

//create a readable/writable stream
var token_stream = tokenizer2();

//make some rules
token_stream.addRule(/^[\s]+$/               , 'whitespace');
token_stream.addRule(/^"([^"]|\\")*"$/       , 'string');
token_stream.addRule(/^[-+]?[0-9]+\.?[0-9]*$/, 'number');
token_stream.addRule(/^[^"0-9\s][^\s]*$/     , 'symbol');

//write each token to the console
token_stream.on('data', function(token){
  console.log('token:', token);
});
token_stream.on('end', function(){
  console.log('DONE');
});

//pipe in some data
fs.createReadStream('./demo.txt').pipe(token_stream);
```
demo.txt
```txt
print "some multi-
lined string"

123.25 times -10
```
The output
```js
token: {type: 'symbol'    , src: 'print'                      , line: 1, col: 1 }
token: {type: 'whitespace', src: ' '                          , line: 1, col: 6 }
token: {type: 'string'    , src: '"some multi-\nlined string"', line: 1, col: 7 }
token: {type: 'whitespace', src: '\n\n'                       , line: 2, col: 14 }
token: {type: 'number'    , src: '123.25'                     , line: 4, col: 1 }
token: {type: 'whitespace', src: ' '                          , line: 4, col: 7 }
token: {type: 'symbol'    , src: 'times'                      , line: 4, col: 8 }
token: {type: 'whitespace', src: ' '                          , line: 4, col: 13 }
token: {type: 'number'    , src: '-10'                        , line: 4, col: 14 }
token: {type: 'whitespace', src: '\n'                         , line: 4, col: 17 }
DONE
```

### What if more than one rule matches a token?

`token_stream.addRule` adds rules in an order-sensitive way: the first rule that matches wins.

### Why tokenizer2

The key difference between this and [tokenizer](https://github.com/Floby/node-tokenizer) is the way rules are matched. `tokenizer` uses [disect](https://github.com/Floby/node-disect) to bisect a chunk of text. That approach is fast, but it doesn't work well when a regex rule expects specific characters at the end of the token. tokenizer2 instead starts at the beginning of the chunk and finds the longest match among its rules.

Other differences
* tokenizer2 wraps [through2.obj](https://www.npmjs.com/package/through2), so all the node stream APIs should work nicely
* tokenizer2 uses the standard `'data'` event to emit the tokens
* tokenizer2 emits line and col numbers
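For example, with a hypothetical rule set (not one of this package's shipped examples) both behaviours show up at once: the first-added rule wins a tie, and a token keeps growing for as long as some rule still matches:

```js
var tokenizer2 = require('tokenizer2');
var token_stream = tokenizer2();

//both 'keyword' and 'name' match the text "if"; 'keyword' was added first, so it wins
token_stream.addRule(/^(if|else)$/, 'keyword');
token_stream.addRule(/^[a-z]+$/   , 'name');
token_stream.addRule(/^[\s]+$/    , 'whitespace');

token_stream.on('data', function(token){
  console.log(token.type, JSON.stringify(token.src));
});

token_stream.end('if iffy');
// keyword "if"
// whitespace " "
// name "iffy"    <- the match is extended as far as possible, not cut off at "if"
```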
## Non-streaming, synchronous API

If, for whatever reason, you don't want to use the streaming API, there is a lighter-weight, synchronous API.

```js
var core = require('tokenizer2/core');

var t = core(function(token){
  //called synchronously on every token found
});

//add rules just like the streaming API
t.addRule(/^[\s]+$/, 'whitespace');

//give it strings to tokenize
t.onText("some text to tokenize");
t.onText("some more text");

//call this when it's done
t.end();//this may throw an error
```
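Because `t.end()` throws when leftover text doesn't match any rule, callers will usually wrap it in a `try`/`catch` (the streaming wrapper in `index.js` does exactly this). A minimal sketch, with a made-up rule and input:

```js
var core = require('tokenizer2/core');

var t = core(function(token){
  console.log('token:', token);
});
//a deliberately incomplete rule set, just for the demo
t.addRule(/^[0-9]+$/, 'number');

t.onText('42 oops');//emits the number token; " oops" stays buffered

try{
  t.end();
}catch(err){
  //err.tokenizer2 holds the unmatched buffer and the line/col where it starts
  console.error(String(err), err.tokenizer2);
  //=> Error: unable to tokenize { buffer: ' oops', line: 1, col: 3 }
}
```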
## License
MIT
--------------------------------------------------------------------------------
/core.js:
--------------------------------------------------------------------------------
var findMatchingRule = function(rules, text){
  var i;
  for(i=0; i<rules.length; i++){
    if(rules[i].regex.test(text)){
      return rules[i];
    }
  }
  return undefined;
};

var findMaxIndexAndRule = function(rules, text){
  //scan forward from the start of `text` and remember the longest prefix
  //that still matches some rule (i.e. the longest matching token)
  var i, rule, last_matching_rule;
  for(i=0; i<text.length; i++){
    rule = findMatchingRule(rules, text.substring(0, i + 1));
    if(rule){
      last_matching_rule = rule;
    }else if(last_matching_rule){
      return {max_index: i, rule: last_matching_rule};
    }
  }
  return last_matching_rule
    ? {max_index: text.length, rule: last_matching_rule}
    : undefined;
};

module.exports = function(onToken_orig){
  var buffer = "";
  var rules = [];
  var line = 1;
  var col = 1;

  var onToken = function(src, type){
    onToken_orig({
      type: type,
      src: src,
      line: line,
      col: col
    });
    //advance the line/col counters past the emitted token
    var lines = src.split("\n");
    line += lines.length - 1;
    col = (lines.length > 1 ? 1 : col) + lines[lines.length - 1].length;
  };

  return {
    addRule: function(regex, type){
      rules.push({regex: regex, type: type});
    },
    onText: function(text){
      var str = buffer + text;
      var m = findMaxIndexAndRule(rules, str);
      while(m && m.max_index !== str.length){
        onToken(str.substring(0, m.max_index), m.rule.type);

        //now find the next token
        str = str.substring(m.max_index);
        m = findMaxIndexAndRule(rules, str);
      }
      buffer = str;
    },
    end: function(){
      if(buffer.length === 0)
        return;

      var rule = findMatchingRule(rules, buffer);
      if(!rule){
        var err = new Error("unable to tokenize");
        err.tokenizer2 = {
          buffer: buffer,
          line: line,
          col: col
        };
        throw err;
      }

      onToken(buffer, rule.type);
    }
  };
};
--------------------------------------------------------------------------------
/index.js:
--------------------------------------------------------------------------------
var core = require("./core");
var through2 = require("through2");

module.exports = function(){
  var t = core(function(token){
    token_stream.push(token);
  });

  var token_stream = through2.obj(function(chunk, enc, done){
    t.onText(chunk.toString());
    done();
  }, function(done){
    //flush: emit (or fail on) whatever is left in the buffer
    try{
      t.end();
      done();
    }catch(err){
      done(err);
    }
  });
  token_stream.addRule = t.addRule;
  return token_stream;
};
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
{
  "name": "tokenizer2",
  "version": "2.0.1",
  "description": "tokenize any text stream given some basic regex rules to match tokens",
  "main": "index.js",
  "files": [
    "core.js",
    "index.js"
  ],
  "scripts": {
    "test": "node tests.js | tap-min"
  },
  "repository": {
    "type": "git",
    "url": "git+https://github.com/smallhelm/tokenizer2.git"
  },
  "keywords": [
    "tokenizer",
    "through",
    "stream"
  ],
  "author": "smallhelm",
  "license": "MIT",
  "bugs": {
    "url": "https://github.com/smallhelm/tokenizer2/issues"
  },
  "homepage": "https://github.com/smallhelm/tokenizer2#readme",
  "dependencies": {
    "through2": "^2.0.0"
  },
  "devDependencies": {
    "tap-min": "^1.0.0",
    "tape": "^4.0.0"
  }
}
--------------------------------------------------------------------------------
/tests.js:
--------------------------------------------------------------------------------
var test = require('tape');
var tokenizer = require('./index');

var setup = function(writes, callback){
  var tokens = [];

  var token_stream = tokenizer();

  token_stream.addRule(/^[\s]+$/, 'whitespace');
  token_stream.addRule(/^"([^"\n]|\\")*"$/, 'string');
  token_stream.addRule(/^[^"0-9\s][^\s]*$/, 'symbol');
  token_stream.addRule(/^[-+]?[0-9]+\.?[0-9]*$/, 'number');

  token_stream.on('data', function(token){
    tokens.push([token.type, token.src, token.line, token.col]);
  });
  token_stream.on('end', function(){
    callback(undefined, tokens);
  });

  var nextWrite = function(){
    var write = writes.shift();
    if(!write){
      process.nextTick(function(){
        token_stream.end();
      });
      return;
    }
    process.nextTick(function(){
      token_stream.write(write);
      nextWrite();
    });
  };
  nextWrite();
};

var assertsForTheHelloWorldString = function(t){
  return function(err, tokens){
    if(err) return t.end(err);

    t.deepEquals(tokens[0], ['symbol'    , 'hello'     , 1, 1]);
    t.deepEquals(tokens[1], ['whitespace', ' '         , 1, 6]);
    t.deepEquals(tokens[2], ['symbol'    , 'world'     , 1, 7]);
    t.deepEquals(tokens[3], ['whitespace', '\n '       , 1, 12]);
    t.deepEquals(tokens[4], ['string'    , '"a string"', 2, 2]);
    t.deepEquals(tokens[5], ['whitespace', '  '        , 2, 12]);
    t.deepEquals(tokens[6], ['number'    , '100.25'    , 2, 14]);
    t.deepEquals(tokens[7], ['whitespace', '\n'        , 2, 20]);
    t.deepEquals(tokens[8], ['symbol'    , 'one2three' , 3, 1]);

    t.equals(tokens.length, 9);
    t.end();
  };
};

test("all in one chunk", function(t){
  setup([
    'hello world\n "a string"  100.25\none2three'
  ], assertsForTheHelloWorldString(t));
});

test("broken up", function(t){
  setup([
    'hello world\n',
    ' "a string" ',
    ' 100.25\n',
    'one2three'
  ], assertsForTheHelloWorldString(t));
});

test("broken up in inconvenient places", function(t){
  setup([
    'he',
    'llo',
    ' world\n ',
    '"a ',
    'string',
    '"  100',
    '.',
    '25',
    '\none',
    '2',
    'three'
  ], assertsForTheHelloWorldString(t));
});

test("one char at a time", function(t){
  setup('hello world\n "a string"  100.25\none2three'.split(''), assertsForTheHelloWorldString(t));
});

test("error on no match", function(t){
  var token_stream = tokenizer();
  token_stream.addRule(/^[\s]+$/, 'whitespace');
  token_stream.on('data', function(token){
    t.deepEquals(token, {type: 'whitespace', src: ' ', line: 1, col: 1});
  });
  token_stream.on('error', function(err){
    t.equals(String(err), 'Error: unable to tokenize');
    t.equals(err.tokenizer2.buffer, "10 01");
    t.equals(err.tokenizer2.line, 1);
    t.equals(err.tokenizer2.col, 2);
    t.end();
  });
  token_stream.on('end', function(){
    t.fail('should\'ve failed instead of ending');
  });

  process.nextTick(function(){
    token_stream.write(' 10 01');
    process.nextTick(function(){
      token_stream.end();
    });
  });
});
--------------------------------------------------------------------------------