├── .bowerrc ├── .github └── FUNDING.yml ├── index.js ├── .gitignore ├── bower.json ├── lib ├── tokenizer │ ├── wordToken.js │ ├── lexerRule.js │ ├── lexer.js │ ├── index.js │ └── taggedWord.js └── model │ └── lexicon.js ├── test └── test.js ├── package.json ├── LICENSE ├── Gruntfile.js ├── README.md └── dist └── index.js /.bowerrc: -------------------------------------------------------------------------------- 1 | { 2 | 3 | } -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: duyet 2 | ko_fi: duyet 3 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | (function() { 4 | exports.lexer = require('./lib/tokenizer/lexer'); 5 | module.exports = require('./lib/tokenizer'); 6 | }).call(this); -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | 5 | # Runtime data 6 | pids 7 | *.pid 8 | *.seed 9 | 10 | # Directory for instrumented libs generated by jscoverage/JSCover 11 | lib-cov 12 | 13 | # Coverage directory used by tools like istanbul 14 | coverage 15 | 16 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 17 | .grunt 18 | 19 | # node-waf configuration 20 | .lock-wscript 21 | 22 | # Compiled binary addons (http://nodejs.org/api/addons.html) 23 | build/Release 24 | 25 | # Dependency directory 26 | # https://www.npmjs.org/doc/misc/npm-faq.html#should-i-check-my-node_modules-folder-into-git 27 | node_modules 28 | -------------------------------------------------------------------------------- /bower.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "node-vntokenizer", 3 | "description": "Tokenizer for Vietnamese in Nodejs and Javascript", 4 | "main": "index.js", 5 | "authors": [ 6 | "Van-Duyet Le" 7 | ], 8 | "license": "MIT", 9 | "keywords": [ 10 | "node", 11 | "nlp", 12 | "vntokenizers", 13 | "text", 14 | "processing", 15 | "ngram", 16 | "tokenizer", 17 | "vietnam", 18 | "vietnamese" 19 | ], 20 | "homepage": "https://github.com/duyetdev/node-vntokenizer", 21 | "moduleType": [ 22 | "globals", 23 | "node" 24 | ], 25 | "ignore": [ 26 | "**/.*", 27 | "node_modules", 28 | "bower_components", 29 | "test", 30 | "tests" 31 | ], 32 | "version": "0.0.1" 33 | } 34 | -------------------------------------------------------------------------------- /lib/tokenizer/wordToken.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | var TaggedWord = require('./taggedWord'); 4 | 5 | /** 6 | * A word token. It is a lexer token with 7 | * an additional information - the part of speech. But in general, 8 | * we do not use this information. 
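 * For example (illustrative only, not taken from the original source): a token
 * built as new WordToken(rule, 'Hà Nội', 1, 0, 'Np') would carry the tag 'Np'
 * (proper noun) in addition to the usual lexer-token fields.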
9 | * 10 | */ 11 | var WordToken = function(rule, text, line, column, pos) { 12 | TaggedWord.call(this, rule, text, line, column); 13 | 14 | this.pos = pos || null; 15 | }; 16 | 17 | WordToken.prototype = Object.create(TaggedWord.prototype); 18 | WordToken.prototype.constructor = WordToken; 19 | 20 | /** 21 | * Get the part-of-speech of the token 22 | * @return the part-of-speech of the token 23 | */ 24 | WordToken.prototype.getPOS = function() { 25 | return this.pos; 26 | } 27 | 28 | module.exports = WordToken; -------------------------------------------------------------------------------- /test/test.js: -------------------------------------------------------------------------------- 1 | var assert = require('assert'); 2 | var Tokenizer = require('../index.js'); 3 | var token = new Tokenizer(); 4 | 5 | describe('Tokenizer', function() { 6 | it ('should tokenize Vietnamese words', function() { 7 | assert.deepEqual([ 'Le', 'Van', 'Duyet' ], token.tokenize('Le Van Duyet')); 8 | assert.deepEqual([ 'Lê', 'Văn', 'Duyệt' ], token.tokenize('Lê Văn Duyệt')); 9 | assert.deepEqual([ 'Tôi', 'tên', 'là', 'Duyệt', 'Test', 'chơi', 'vậy', 'thôi', 'Không', 'biết', 'có', 'đúng', 'hay', 'không', 'nữa' ], token.tokenize('Tôi tên là Duyệt. Test chơi vậy thôi!! Không biết có đúng hay không nữa?')); 10 | }); 11 | 12 | it ('should remove non-word characters', function() { 13 | assert.deepEqual([ 'Lê', 'Văn', 'Duyệt' ], token.tokenize('!!!Lê!!Văn Duyệt')); 14 | }); 15 | 16 | it ('should return an empty array for non-word input', function() { 17 | assert.deepEqual([], token.tokenize(' !@# ')); 18 | }); 19 | 20 | }); 21 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "node-vntokenizer", 3 | "version": "0.0.2", 4 | "description": "Tokenizer for Vietnamese in Nodejs and Javascript", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "./node_modules/mocha/bin/mocha" 8 | }, 9 | "repository": { 10 | "type": "git", 11 | "url": "git+https://github.com/duyetdev/node-vntokenizer.git" 12 | }, 13 | "keywords": [ 14 | "node", 15 | "nlp", 16 | "vntokenizers", 17 | "text", 18 | "processing", 19 | "ngram", 20 | "tokenizer", 21 | "vietnam", 22 | "vietnamese" 23 | ], 24 | "author": "Van-Duyet Le", 25 | "license": "MIT", 26 | "bugs": { 27 | "url": "https://github.com/duyetdev/node-vntokenizer/issues" 28 | }, 29 | "homepage": "https://github.com/duyetdev/node-vntokenizer#readme", 30 | "dependencies": { 31 | "java": "^5.0.0", 32 | "lodash": "^4.17.21", 33 | "underscore": "^1.8.3" 34 | }, 35 | "devDependencies": { 36 | "grunt": "^0.4.5", 37 | "grunt-contrib-copy": "^0.8.2", 38 | "grunt-contrib-uglify": "^0.10.0", 39 | "mocha": "^2.3.3" 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Van-Duyet Le 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 |
copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /Gruntfile.js: -------------------------------------------------------------------------------- 1 | var pkgjson = require('./package.json'); 2 | 3 | var config = { 4 | pkg: pkgjson, 5 | app: 'lib', 6 | dist: 'dist' 7 | } 8 | 9 | module.exports = function(grunt) { 10 | // Configuration 11 | grunt.initConfig({ 12 | config: config, 13 | pkg: config.pkg, 14 | bower: grunt.file.readJSON('./.bowerrc'), 15 | copy: { 16 | dist: { 17 | files: [{ 18 | expand: true, 19 | cwd: '<%= config.app %>/tokenizer', 20 | src: 'index.js', 21 | dest: '<%= config.dist %>' 22 | }] 23 | } 24 | }, 25 | uglify: { 26 | options: { 27 | banner: '/*! <%= pkg.name %> lib - v<%= pkg.version %> -' + 28 | '<%= grunt.template.today("yyyy-mm-dd") %> */' 29 | }, 30 | dist: { 31 | files: { 32 | '<%= config.dist %>/js/lib.min.js': [ 33 | '<%= bower.directory %>/jquery/jquery.js', 34 | '<%= bower.directory %>/underscore/underscore.js', 35 | '<%= bower.directory %>/requirejs/require.js', 36 | ] 37 | } 38 | } 39 | } 40 | }); 41 | 42 | grunt.loadNpmTasks('grunt-contrib-copy'); 43 | grunt.loadNpmTasks('grunt-contrib-uglify'); 44 | 45 | grunt.registerTask('default', [ 46 | 'copy', 47 | // 'uglify' 48 | ]); 49 | }; 50 | -------------------------------------------------------------------------------- /lib/tokenizer/lexerRule.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | /** 4 | * Instantiate a new lexical rule with a name and a regex 5 | * 6 | * @param name 7 | * a name 8 | * @param regex 9 | * a regular expression 10 | */ 11 | var LexerRule = function (name, regex) { 12 | /** 13 | * The name of the lexical category that this rule matches 14 | */ 15 | var name = name || ''; 16 | 17 | /** 18 | * The regular expression used for matching 19 | */ 20 | var regex = regex || false; 21 | /** 22 | * A pre-compiled pattern object, kept to save processing time 23 | */ 24 | var pattern = false; 25 | 26 | /** 27 | * Get the category name 28 | * 29 | * @return the name of rule 30 | */ 31 | function getName() { 32 | return this.name; 33 | } 34 | 35 | /** 36 | * Get the regex defining the rule 37 | * 38 | * @return the regex 39 | */ 40 | function getRegex() { 41 | return this.regex; 42 | } 43 | 44 | /** 45 | * Return the pattern object. Create one if it hasn't been created already. 
46 | * 47 | * @return the pattern object 48 | */ 49 | function getPattern() { 50 | if (this.pattern == null) { 51 | this.pattern = new RegExp(this.regex); 52 | } 53 | return this.pattern; 54 | } 55 | 56 | /** 57 | * Return a string representation of the rule 58 | */ 59 | function toString() { 60 | return "[" + this.name + "]"; 61 | } 62 | } 63 | 64 | module.exports = LexerRule; -------------------------------------------------------------------------------- /lib/tokenizer/lexer.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | var lexerRule = require('./lexerRule'); 4 | var lexerModel = require('../model/lexicon'); 5 | 6 | var re = lexerModel; 7 | 8 | function LexerNode(string, regex, regexs) { 9 | this.string = string; 10 | this.children = []; 11 | 12 | if (string) { 13 | this.matches = string.match(regex); 14 | var childElements = string.split(regex); 15 | } 16 | 17 | if (!this.matches) { 18 | this.matches = []; 19 | var childElements = [string]; 20 | } 21 | 22 | if (!regexs.length) { 23 | // no more regular expressions, we're done 24 | this.children = childElements; 25 | } else { 26 | // descend recursively 27 | var nextRegex = regexs[0] 28 | , nextRegexes = regexs.slice(1); 29 | 30 | for (var i in childElements) { 31 | if (childElements.hasOwnProperty(i)) { 32 | this.children.push( 33 | new LexerNode(childElements[i], nextRegex, nextRegexes)); 34 | } 35 | } 36 | } 37 | } 38 | 39 | LexerNode.prototype.fillArray = function(array){ 40 | for (var i in this.children) { 41 | if (this.children.hasOwnProperty(i)) { 42 | var child = this.children[i]; 43 | 44 | if (child && child.fillArray) { 45 | child.fillArray(array); 46 | } else if (re.unblank.test(child)) { 47 | array.push(child); 48 | } 49 | 50 | if (i < this.matches.length) { 51 | var match = this.matches[i]; 52 | if (re.unblank.test(match)) 53 | array.push(match); 54 | } 55 | } 56 | } 57 | } 58 | 59 | LexerNode.prototype.toString = function(){ 60 | var array = []; 61 | this.fillArray(array); 62 | return array.toString(); 63 | } 64 | 65 | function Lexer(string){ 66 | var regexs = []; 67 | for (var i in lexerModel) regexs.push(lexerModel[i]); 68 | 69 | var array = [] 70 | var node = new LexerNode(string, regexs[0], regexs.slice(1)); 71 | 72 | node.fillArray(array); 73 | return array; 74 | } 75 | 76 | module.exports = Lexer; -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # node-vntokenizer 2 | 3 | Tokenizer for Vietnamese in Node.js and JavaScript. 4 | 5 | 6 | 7 | 8 | [![NPM](https://nodei.co/npm/node-vntokenizer.png?downloads=true&downloadRank=true&stars=true)](https://nodei.co/npm/node-vntokenizer/) 9 | 10 | # Installation 11 | 12 | * Via NPM 13 | ```sh 14 | npm install node-vntokenizer --save 15 | ``` 16 | 17 | * Via Bower 18 | ```sh 19 | bower install tokenizer --save 20 | ``` 21 | 22 | # Sample 23 | 24 | ```js 25 | var Tokenizer = require('node-vntokenizer'); 26 | var token = new Tokenizer(); 27 | 28 | console.log(token.tokenize('Lê Văn Duyệt')); 29 | // [ 'Lê', 'Văn', 'Duyệt' ] 30 | 31 | console.log(token.tokenize('Tôi tên là Duyệt. Test chơi vậy thôi!! 
Không biết có đúng hay không nữa?')); 32 | // [ 'Tôi', 'tên', 'là', 'Duyệt', 'Test', 'chơi', 'vậy', 'thôi', 'Không', 'biết', 'có', 'đúng', 'hay', 'không', 'nữa' ] 33 | 34 | console.log(token.tokenize('!!!Lê!!Văn Duyệt')); 35 | // [ 'Lê', 'Văn', 'Duyệt' ] 36 | ``` 37 | 38 | # Test 39 | ```sh 40 | npm test 41 | ``` 42 | 43 | # How to contribute 44 | 1. Fork the project on Github 45 | 2. Create a topic branch for your changes 46 | 3. Ensure that you provide documentation and test coverage for your changes (patches won’t be accepted without) 47 | 4. Create a pull request on Github (these are also a great place to start a conversation around a patch as early as possible) 48 | 49 | # License 50 | MIT License 51 | 52 | Copyright (c) 2015 Van-Duyet Le 53 | 54 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 55 | 56 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 57 | 58 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 59 | 60 | 61 | -------------------------------------------------------------------------------- /dist/index.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | (function() { 4 | var root = this; 5 | var has_require = typeof require !== 'undefined'; 6 | 7 | if (typeof _ === 'undefined') { 8 | if (has_require) { 9 | var util = require("util"); 10 | var _ = require('lodash'); 11 | } else 12 | throw new Error('vnTokenizer requires underscore'); 13 | } 14 | 15 | var Tokenizer = function() {}; 16 | 17 | Tokenizer.prototype.trim = function(array) { 18 | while (array[array.length - 1] == '') 19 | array.pop(); 20 | 21 | while (array[0] == '') 22 | array.shift(); 23 | 24 | return array; 25 | }; 26 | 27 | // Expose an attach function that will patch String with new methods. 
28 | Tokenizer.prototype.attach = function() { 29 | var self = this; 30 | 31 | String.prototype.tokenize = function() { 32 | return self.tokenize(this); 33 | } 34 | }; 35 | 36 | Tokenizer.prototype.tokenize = function() {}; 37 | 38 | // Base Class for RegExp Matching 39 | var RegexpTokenizer = function(options) { 40 | options = options || {}; 41 | this._pattern = /[^a-z0-9A-Z_ÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚĂĐĨŨƠàáâãèéêìíòóôõùúăđĩũơƯĂẠẢẤẦẨẪẬẮẰẲẴẶẸẺẼỀỀỂưăạảấầẩẫậắằẳẵặẹẻẽềếềểỄỆỈỊỌỎỐỒỔỖỘỚỜỞỠỢỤỦỨỪễệỉịọỏốồổỗộớờởỡợụủứừỬỮỰỲỴÝỶỸửữựỳỵỷỹ]+/; 42 | this._pattern = options.pattern || this._pattern; 43 | this.discardEmpty = options.discardEmpty || true; 44 | 45 | // Match and split on GAPS not the actual WORDS 46 | this._gaps = options.gaps; 47 | 48 | if (this._gaps === undefined) { 49 | this._gaps = true; 50 | } 51 | }; 52 | 53 | if (util) util.inherits(RegexpTokenizer, Tokenizer); 54 | 55 | RegexpTokenizer.prototype.tokenize = function(s) { 56 | var results; 57 | 58 | if (this._gaps) { 59 | results = s.split(this._pattern); 60 | return (this.discardEmpty) ? _.without(results, '', ' ') : results; 61 | } else { 62 | return s.match(this._pattern); 63 | } 64 | }; 65 | 66 | 67 | /*** 68 | * A tokenizer that divides a text into sequences of alphabetic and 69 | * non-alphabetic characters. E.g.: 70 | * 71 | * >>> WordTokenizer().tokenize("She said 'hello'.") 72 | * ['She', 'said', 'hello'] 73 | * 74 | */ 75 | var WordTokenizer = function(options) { 76 | this._pattern = /\W+/; 77 | RegexpTokenizer.call(this, options) 78 | }; 79 | 80 | if (util) util.inherits(WordTokenizer, RegexpTokenizer); 81 | 82 | /*** 83 | * A tokenizer that divides a text into sequences of alphabetic and 84 | * non-alphabetic characters. E.g.: 85 | * 86 | * >>> WordPunctTokenizer().tokenize("She said 'hello'.") 87 | * ['She', 'said', "'", 'hello', "'."] 88 | * 89 | */ 90 | var WordPunctTokenizer = function(options) { 91 | this._pattern = new RegExp(/(\w+|\!|\'|\"")/i); 92 | RegexpTokenizer.call(this, options) 93 | }; 94 | 95 | if (util) util.inherits(WordPunctTokenizer, RegexpTokenizer); 96 | 97 | 98 | //exports.Tokenizer = Tokenizer; 99 | //exports.RegexpTokenizer = RegexpTokenizer; 100 | //exports.WordTokenizer = WordTokenizer; 101 | //exports.WordPunctTokenizer = WordPunctTokenizer; 102 | //module.exports = WordTokenizer; 103 | 104 | // Exports 105 | if (typeof exports !== 'undefined') { 106 | if (typeof module !== 'undefined' && module.exports) { 107 | exports = module.exports = RegexpTokenizer; 108 | } 109 | exports.Tokenizer = RegexpTokenizer; 110 | } else { 111 | root.Tokenizer = RegexpTokenizer; 112 | } 113 | }).call(this); -------------------------------------------------------------------------------- /lib/tokenizer/index.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | (function() { 4 | var root = this; 5 | var has_require = typeof require !== 'undefined'; 6 | 7 | if (typeof _ === 'undefined') { 8 | if (has_require) { 9 | var util = require("util"); 10 | var _ = require('lodash'); 11 | } else 12 | throw new Error('vnTokenizer requires underscore'); 13 | } 14 | 15 | var Tokenizer = function() {}; 16 | 17 | Tokenizer.prototype.trim = function(array) { 18 | while (array[array.length - 1] == '') 19 | array.pop(); 20 | 21 | while (array[0] == '') 22 | array.shift(); 23 | 24 | return array; 25 | }; 26 | 27 | // Expose an attach function that will patch String with new methods. 
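  // Illustrative usage (an assumption, not taken from the original docs):
  //   var Tokenizer = require('node-vntokenizer');
  //   var tokenizer = new Tokenizer();
  //   tokenizer.attach();
  //   'Lê Văn Duyệt'.tokenize(); // => [ 'Lê', 'Văn', 'Duyệt' ]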
28 | Tokenizer.prototype.attach = function() { 29 | var self = this; 30 | 31 | String.prototype.tokenize = function() { 32 | return self.tokenize(this); 33 | } 34 | }; 35 | 36 | Tokenizer.prototype.tokenize = function() {}; 37 | 38 | // Base Class for RegExp Matching 39 | var RegexpTokenizer = function(options) { 40 | options = options || {}; 41 | this._pattern = /[^a-z0-9A-Z_ÆØÅÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚĂĐĨŨƠæøåàáâãèéêìíòóôõùúăđĩũơƯĂẠẢẤẦẨẪẬẮẰẲẴẶẸẺẼỀỀỂưăạảấầẩẫậắằẳẵặẹẻẽềếềểỄỆỈỊỌỎỐỒỔỖỘỚỜỞỠỢỤỦỨỪễệỉịọỏốồổỗộớờởỡợụủứừỬỮỰỲỴÝỶỸửữựỳỵỷỹ]+/; 42 | this._pattern = options.pattern || this._pattern; 43 | this.discardEmpty = options.discardEmpty || true; 44 | 45 | // Match and split on GAPS not the actual WORDS 46 | this._gaps = options.gaps; 47 | 48 | if (this._gaps === undefined) { 49 | this._gaps = true; 50 | } 51 | }; 52 | 53 | if (util) util.inherits(RegexpTokenizer, Tokenizer); 54 | 55 | RegexpTokenizer.prototype.tokenize = function(s) { 56 | var results; 57 | 58 | if (this._gaps) { 59 | results = s.split(this._pattern); 60 | return (this.discardEmpty) ? _.without(results, '', ' ') : results; 61 | } else { 62 | return s.match(this._pattern); 63 | } 64 | }; 65 | 66 | 67 | /*** 68 | * A tokenizer that divides a text into sequences of alphabetic and 69 | * non-alphabetic characters. E.g.: 70 | * 71 | * >>> WordTokenizer().tokenize("She said 'hello'.") 72 | * ['She', 'said', 'hello'] 73 | * 74 | */ 75 | var WordTokenizer = function(options) { 76 | this._pattern = /\W+/; 77 | RegexpTokenizer.call(this, options) 78 | }; 79 | 80 | if (util) util.inherits(WordTokenizer, RegexpTokenizer); 81 | 82 | /*** 83 | * A tokenizer that divides a text into sequences of alphabetic and 84 | * non-alphabetic characters. E.g.: 85 | * 86 | * >>> WordPunctTokenizer().tokenize("She said 'hello'.") 87 | * ['She', 'said', "'", 'hello', "'."] 88 | * 89 | */ 90 | var WordPunctTokenizer = function(options) { 91 | this._pattern = new RegExp(/(\w+|\!|\'|\"")/i); 92 | RegexpTokenizer.call(this, options) 93 | }; 94 | 95 | if (util) util.inherits(WordPunctTokenizer, RegexpTokenizer); 96 | 97 | 98 | //exports.Tokenizer = Tokenizer; 99 | //exports.RegexpTokenizer = RegexpTokenizer; 100 | //exports.WordTokenizer = WordTokenizer; 101 | //exports.WordPunctTokenizer = WordPunctTokenizer; 102 | //module.exports = WordTokenizer; 103 | 104 | // Exports 105 | if (typeof exports !== 'undefined') { 106 | if (typeof module !== 'undefined' && module.exports) { 107 | exports = module.exports = RegexpTokenizer; 108 | } 109 | exports.Tokenizer = RegexpTokenizer; 110 | } else { 111 | root.Tokenizer = RegexpTokenizer; 112 | } 113 | }).call(this); -------------------------------------------------------------------------------- /lib/model/lexicon.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | "numbersign": /#/, 3 | "ampersand": /&/, 4 | "date_mm-dd-yy": /(0*[1-9]|1[012])-(0*[1-9]|[12][0-9]|3[01])-\d\d/, 5 | "date_mm/dd/yy": /(0*[1-9]|1[012])\/(0*[1-9]|[12][0-9]|3[01])\/\d\d/, 6 | "date_mm.dd.yy": /(0*[1-9]|1[012])\.(0*[1-9]|[12][0-9]|3[01])\.\d\d/, 7 | "date_dd-mm-yy": /([12][0-9]|3[01]|0*[1-9])-(1[012]||0*[1-9])-\d\d/, 8 | "date_dd/mm/yy": /([12][0-9]|3[01]|0*[1-9])\/(1[012]||0*[1-9])\/\d\d/, 9 | "date_dd.mm.yy": /([12][0-9]|3[01]|0*[1-9])[\.](1[012]||0*[1-9])[\.]\d\d/, 10 | "date_dd-mm-yyyy": /([12][0-9]|3[01]|0*[1-9])-(1[012]||0*[1-9])-(19|20)\d\d/, 11 | "date_dd/mm/yyyy": /([12][0-9]|3[01]|0*[1-9])\/(1[012]||0*[1-9])\/(19|20)\d\d/, 12 | "date_dd.mm.yyyy": 
/([12][0-9]|3[01]|0*[1-9])\.(1[012]||0*[1-9])\.(19|20)\d\d/, 13 | "date_dd-mm": /(0*[1-9]|[12][0-9]|3[01])[-\/\.](1[012]|0*[1-9])/, 14 | "date_mm-yy": /(0*[1-9]|1[012])[-\/\.]\d\d/, 15 | "date_mm-yyyy": /(0*[1-9]|1[012])[-\/\.](19|20)\d\d/, 16 | "date_yyyy": /(19|20)\d\d/, 17 | "date_mm-dd-yyyy": /(0*[1-9]|1[012])-([12][0-9]|3[01]|0*[1-9])-(19|20)\d\d/, 18 | "date_mm/dd/yyyy": /(0*[1-9]|1[012])\/([12][0-9]|3[01]|0*[1-9])\/(19|20)\d\d/, 19 | "date_mm.dd.yyyy": /(0*[1-9]|1[012])\.([12][0-9]|3[01]|0*[1-9])\.(19|20)\d\d/, 20 | "date_yyyy-mm-dd": /(19|20)\d\d-(0*[1-9]|1[012])-([12][0-9]|3[01]|0*[1-9])/, 21 | "date_yyyy/mm/dd": /(19|20)\d\d\/(0*[1-9]|1[012])\/([12][0-9]|3[01]|0*[1-9])/, 22 | "date_yyyy.mm.dd": /(19|20)\d\d\.(0*[1-9]|1[012])\.([12][0-9]|3[01]|0*[1-9])/, 23 | "hhmmss": /([0-1]\d|[2][0-3]):[0-5]\d:[0-5]\d/, 24 | "percent": /([0-9]*[\.,])?[0-9]+%/, 25 | "name1": /[A-ZÁÂĐÐÍÔƯỨÝỶ][aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]*/, 26 | "name2": /([A-ZÁÂĐÐÍÔƯỨÝỶ][aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]*)(\s+[A-ZÁÂĐÐÍÔƯỨÝỶ][aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+)*/, 27 | //"phrase": /([AÀẢÃÁẠĂẰẲẴẮẶÂẦẨẪẤẬBCDĐÐEÈẺẼÉẸÊỀỂỄẾỆFGHIÌỈĨÍỊJKLMNOÒỎÕÓỌÔỒỔỖỐỘƠỜỞỠỚỢPQRSTUÙỦŨÚỤƯỪỬỮỨỰVWXYỲỶỸÝỴZ])?([aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz\s])*([aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz])*/, 28 | "phrase": /([aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+)/, 29 | "allcaps": /([AÀẢÃÁẠĂẰẲẴẮẶÂẦẨẪẤẬBCDĐÐEÈẺẼÉẸÊỀỂỄẾỆFGHIÌỈĨÍỊJKLMNOÒỎÕÓỌÔỒỔỖỐỘƠỜỞỠỚỢPQRSTUÙỦŨÚỤƯỪỬỮỨỰVWXYỲỶỸÝỴZ]+)(\s*[AÀẢÃÁẠĂẰẲẴẮẶÂẦẨẪẤẬBCDĐÐEÈẺẼÉẸÊỀỂỄẾỆFGHIÌỈĨÍỊJKLMNOÒỎÕÓỌÔỒỔỖỐỘƠỜỞỠỚỢPQRSTUÙỦŨÚỤƯỪỬỮỨỰVWXYỲỶỸÝỴZ]+)*[^aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz\/\)\(\?!\.;:,\-"']/, 30 | "fraction": /(\d+)\/(\d+)/, 31 | "email": /(\w[-._\w]*\w@\w[-._\w]*\w\.\w{2,3})/, 32 | "return": /(^$)/, 33 | "fslash": /\//, 34 | "langle": /</, 35 | "xmltags": /<\/*\w*>/, 36 | "equal": /=/, 37 | "rangle": />/, 38 | "aroba": /@/, 39 | "number1": /[+]?([0-9]*)?[0-9]+([\.,]\d+)*/, 40 | "number2": /[+]?([0-9]*)?[0-9]+([\.,]\d+)*(\s|tỉ|tỷ|triệu|ngàn|nghìn|trăm|chục)*/, 41 | "degree": /[-+]?([0-9]*[\.,])?[0-9]+°/, 42 | "ponctuation": /[\\?!\\.:;,\-"']/, 43 | "dollar": /\$/, 44 | "lparen": /\(/, 45 | "rparen": /\)/, 46 | "asterisk": /\*/, 47 | "plus": /\+/, 48 | "minus": /\-/, 49 | "ellipsis": /\.\.\./, 50 | "residual": /\W/, 51 | "lbracket": /\[/, 52 | "bslash": /\\/, 53 | "rbracket": /\]/, 54 | "entity0": /\d+([\.,]\d+)*[A-Zaàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+\d+$/, 55 | "entity1": /[A-Zaàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+(\d)*$/, 56 | "entity2": /[A-Zaàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz\d]+([\.\-/][\dA-Zaàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+)*[\dA-Zaàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+/, 57 | "space": /\s+/, 58 | "word": /\w/, 59 | "lcbrace": /\{/, 60 | "rcbrace": /\}/, 61 | "underscore": /_/, 62 | "pound": /£/, 63 | //number: /[0-9]*\.[0-9]+|[0-9]+/ig, 64 | // space: /\s+/ig, 65 | unblank: /\S/, 66 | 67 | }; 
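One way to use this table outside the lexer is to classify a single raw token by finding the first rule whose regex matches it in full. A minimal sketch of that idea (the `classify` helper below is illustrative, not part of the library, and assumes the module is required straight from the repository root):

```js
var lexicon = require('./lib/model/lexicon');

// Return the name of the first rule whose regex matches the entire token.
function classify(token) {
  var names = Object.keys(lexicon);
  for (var i = 0; i < names.length; i++) {
    var match = token.match(lexicon[names[i]]);
    if (match && match[0] === token) return names[i];
  }
  return null;
}

classify('user@example.com'); // 'email'
classify('25/12/2015');       // one of the "date_*" rules
```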
-------------------------------------------------------------------------------- /lib/tokenizer/taggedWord.js: -------------------------------------------------------------------------------- 1 | var LexerRule = require('./lexerRule'); 2 | 3 | /** 4 | * Create a LexerToken 5 | * 6 | * @param rule 7 | * a rule 8 | * @param text 9 | * the text 10 | * @param line 11 | * the line location of the text in a file 12 | * @param column 13 | * the column location of the text in a file 14 | */ 15 | var TaggedWord = function(rule, text, line, column) { 16 | /** 17 | * A lexer rule 18 | */ 19 | var rule = rule || null; 20 | 21 | /** 22 | * The text 23 | */ 24 | var text = text || ''; 25 | 26 | /** 27 | * The line location of the text in the file 28 | */ 29 | var line = line || -1; 30 | 31 | /** 32 | * The column location of the text in the file 33 | */ 34 | var column = column || -1; 35 | 36 | /** 37 | * Create a lexer token from a text 38 | * 39 | * @param text 40 | * a text 41 | */ 42 | public TaggedWord(String text) { 43 | this.rule = null; 44 | this.text = text; 45 | this.line = -1; 46 | this.column = -1; 47 | } 48 | 49 | /** 50 | * Return the rule that matched this token 51 | * 52 | * @return the rule that match this token 53 | */ 54 | function getRule() { 55 | return rule; 56 | } 57 | 58 | /** 59 | * Return the text that matched by this token 60 | * 61 | * @return the text matched by this token 62 | */ 63 | function getText() { 64 | return text.trim(); 65 | } 66 | 67 | /** 68 | * Test if this rule is a phrase rule. A phrase is processed 69 | * by a lexical segmenter. 70 | * 71 | * @return true/false 72 | */ 73 | function isPhrase() { 74 | return rule.getName() === "phrase" ? true : false; 75 | } 76 | 77 | function stringStartsWith (string, prefix) { 78 | return string.slice(0, prefix.length) == prefix; 79 | } 80 | 81 | /** 82 | * Test if this rule is a named entity rule. 83 | * 84 | * @return true/false 85 | */ 86 | function isNamedEntity() { 87 | return stringStartsWith(rule.getName(), "name"); 88 | } 89 | 90 | /** 91 | * @return true/false 92 | */ 93 | function isDate() { 94 | return stringStartsWith(rule.getName(), "date"); 95 | } 96 | 97 | /** 98 | * @return true/false 99 | */ 100 | function isDateDay() { 101 | return stringStartsWith(rule.getName(), "day"); 102 | } 103 | 104 | /** 105 | * @return true/false 106 | */ 107 | function isDateMonth() { 108 | return stringStartsWith(rule.getName(), "month"); 109 | } 110 | 111 | function isDateYear() { 112 | return stringStartsWith(rule.getName(), "year"); 113 | } 114 | 115 | function isNumber() { 116 | return stringStartsWith(rule.getName(), "number"); 117 | } 118 | /** 119 | * @return Returns the column. 120 | */ 121 | function getColumn() { 122 | return column; 123 | } 124 | 125 | /** 126 | * @param column 127 | * The column to set. 128 | */ 129 | function setColumn(column) { 130 | this.column = column; 131 | } 132 | 133 | /** 134 | * @return Returns the line. 135 | */ 136 | function getLine() { 137 | return line; 138 | } 139 | 140 | /** 141 | * @param line 142 | * The line to set. 
143 | */ 144 | function setLine(line) { 145 | this.line = line; 146 | } 147 | 148 | /** 149 | * Return a string representation of the token 150 | */ 151 | function toString() { 152 | // return "[\"" + text + "\"" + " at (" + line + "," + column + ")]"; 153 | // return rule.getName() + ": " + text; 154 | return text.trim(); 155 | } 156 | 157 | /* (non-Javadoc) 158 | * @see java.lang.Object#hashCode() 159 | */ 160 | function hashCode() { 161 | return getText(); // JavaScript strings have no hashCode(); the trimmed text itself serves as the key 162 | } 163 | 164 | /* (non-Javadoc) 165 | * @see java.lang.Object#equals(java.lang.Object) 166 | */ 167 | function equals(obj) { 168 | if (obj == null) return false; 169 | if (!(obj instanceof TaggedWord)) { 170 | return false; 171 | } 172 | // two tokens are considered equal if their texts are equal. 173 | // 174 | return obj.getText() === getText(); 175 | } 176 | 177 | /* (non-Javadoc) 178 | * @see java.lang.Comparable#compareTo(java.lang.Object) 179 | */ 180 | function compareTo(o) { 181 | return getText().localeCompare(o.getText()); 182 | } 183 | } 184 | 185 | module.exports = TaggedWord; --------------------------------------------------------------------------------
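The tokenizer exported from lib/tokenizer/index.js accepts an options object (`pattern`, `gaps`, `discardEmpty`). A short sketch of non-default usage, assuming the package is installed as node-vntokenizer; the digit-matching pattern is only an example:

```js
var Tokenizer = require('node-vntokenizer');

// With gaps: false the tokenizer matches the pattern itself instead of
// splitting on it, so the result is whatever the supplied pattern captures.
var digits = new Tokenizer({ pattern: /[0-9]+/g, gaps: false });
console.log(digits.tokenize('năm 2015 và 2016')); // [ '2015', '2016' ]

// The default behaviour splits on runs of non-word characters
// (Vietnamese letters, digits and underscore are kept).
var words = new Tokenizer();
console.log(words.tokenize('!!!Lê!!Văn Duyệt')); // [ 'Lê', 'Văn', 'Duyệt' ]
```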