├── .bowerrc ├── .github └── FUNDING.yml ├── index.js ├── .gitignore ├── bower.json ├── lib ├── tokenizer │ ├── wordToken.js │ ├── lexerRule.js │ ├── lexer.js │ ├── index.js │ └── taggedWord.js └── model │ └── lexicon.js ├── test └── test.js ├── package.json ├── LICENSE ├── Gruntfile.js ├── README.md └── dist └── index.js /.bowerrc: -------------------------------------------------------------------------------- 1 | { 2 | 3 | } -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: duyet 2 | ko_fi: duyet 3 | -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | (function() { 4 | exports.lexer = require('./lib/tokenizer/lexer'); 5 | module.exports = require('./lib/tokenizer'); 6 | }).call(this); -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | 5 | # Runtime data 6 | pids 7 | *.pid 8 | *.seed 9 | 10 | # Directory for instrumented libs generated by jscoverage/JSCover 11 | lib-cov 12 | 13 | # Coverage directory used by tools like istanbul 14 | coverage 15 | 16 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 17 | .grunt 18 | 19 | # node-waf configuration 20 | .lock-wscript 21 | 22 | # Compiled binary addons (http://nodejs.org/api/addons.html) 23 | build/Release 24 | 25 | # Dependency directory 26 | # https://www.npmjs.org/doc/misc/npm-faq.html#should-i-check-my-node_modules-folder-into-git 27 | node_modules 28 | -------------------------------------------------------------------------------- /bower.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "node-vntokenizer", 3 | "description": "Tokenizer for Vietnamese in Nodejs and Javascript", 4 | "main": "index.js", 5 | "authors": [ 6 | "Van-Duyet Le" 7 | ], 8 | "license": "MIT", 9 | "keywords": [ 10 | "node", 11 | "nlp", 12 | "vntokenizers", 13 | "text", 14 | "processing", 15 | "ngram", 16 | "tokenizer", 17 | "vietnam", 18 | "vietnamese" 19 | ], 20 | "homepage": "https://github.com/duyetdev/node-vntokenizer", 21 | "moduleType": [ 22 | "globals", 23 | "node" 24 | ], 25 | "ignore": [ 26 | "**/.*", 27 | "node_modules", 28 | "bower_components", 29 | "test", 30 | "tests" 31 | ], 32 | "version": "0.0.1" 33 | } 34 | -------------------------------------------------------------------------------- /lib/tokenizer/wordToken.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | var TaggedWord = require('./taggedWord'); 4 | 5 | /** 6 | * A word token. It is a lexer token with 7 | * an additional information - the part of speech. But in general, 8 | * we do not use this information. 
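 * For example (illustrative only, not taken from the original source): a token
 * built as new WordToken(rule, 'Hà Nội', 1, 0, 'Np') would carry the tag 'Np'
 * (proper noun) in addition to the usual lexer-token fields.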
9 | * 10 | */ 11 | var WordToken = function(rule, text, line, column, pos) { 12 | TaggedWord.call(this, rule, text, line, column); 13 | 14 | this.pos = pos || null; 15 | }; 16 | 17 | WordToken.prototype = Object.create(TaggedWord.prototype); 18 | WordToken.prototype.constructor = WordToken; 19 | 20 | /** 21 | * Get the part-of-speech of the token 22 | * @return the part-of-speech of the token 23 | */ 24 | WordToken.prototype.getPOS = function() { 25 | return this.pos; 26 | } 27 | 28 | module.exports = WordToken; -------------------------------------------------------------------------------- /test/test.js: -------------------------------------------------------------------------------- 1 | var assert = require('assert'); 2 | var Tokenizer = require('../index.js'); 3 | var token = new Tokenizer(); 4 | 5 | describe('Tokenizer', function() { 6 | it ('should tokenize Vietnamese words', function() { 7 | assert.deepEqual([ 'Le', 'Van', 'Duyet' ], token.tokenize('Le Van Duyet')); 8 | assert.deepEqual([ 'Lê', 'Văn', 'Duyệt' ], token.tokenize('Lê Văn Duyệt')); 9 | assert.deepEqual([ 'Tôi', 'tên', 'là', 'Duyệt', 'Test', 'chơi', 'vậy', 'thôi', 'Không', 'biết', 'có', 'đúng', 'hay', 'không', 'nữa' ], token.tokenize('Tôi tên là Duyệt. Test chơi vậy thôi!! Không biết có đúng hay không nữa?')); 10 | }); 11 | 12 | it ('should remove non-word characters', function() { 13 | assert.deepEqual([ 'Lê', 'Văn', 'Duyệt' ], token.tokenize('!!!Lê!!Văn Duyệt')); 14 | }); 15 | 16 | it ('should return an empty array for non-word input', function() { 17 | assert.deepEqual([], token.tokenize(' !@# ')); 18 | }); 19 | 20 | }); 21 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "node-vntokenizer", 3 | "version": "0.0.2", 4 | "description": "Tokenizer for Vietnamese in Nodejs and Javascript", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "./node_modules/mocha/bin/mocha" 8 | }, 9 | "repository": { 10 | "type": "git", 11 | "url": "git+https://github.com/duyetdev/node-vntokenizer.git" 12 | }, 13 | "keywords": [ 14 | "node", 15 | "nlp", 16 | "vntokenizers", 17 | "text", 18 | "processing", 19 | "ngram", 20 | "tokenizer", 21 | "vietnam", 22 | "vietnamese" 23 | ], 24 | "author": "Van-Duyet Le", 25 | "license": "MIT", 26 | "bugs": { 27 | "url": "https://github.com/duyetdev/node-vntokenizer/issues" 28 | }, 29 | "homepage": "https://github.com/duyetdev/node-vntokenizer#readme", 30 | "dependencies": { 31 | "java": "^5.0.0", 32 | "lodash": "^4.17.21", 33 | "underscore": "^1.8.3" 34 | }, 35 | "devDependencies": { 36 | "grunt": "^0.4.5", 37 | "grunt-contrib-copy": "^0.8.2", 38 | "grunt-contrib-uglify": "^0.10.0", 39 | "mocha": "^2.3.3" 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Van-Duyet Le 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 |
copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /Gruntfile.js: -------------------------------------------------------------------------------- 1 | var pkgjson = require('./package.json'); 2 | 3 | var config = { 4 | pkg: pkgjson, 5 | app: 'lib', 6 | dist: 'dist' 7 | } 8 | 9 | module.exports = function(grunt) { 10 | // Configuration 11 | grunt.initConfig({ 12 | config: config, 13 | pkg: config.pkg, 14 | bower: grunt.file.readJSON('./.bowerrc'), 15 | copy: { 16 | dist: { 17 | files: [{ 18 | expand: true, 19 | cwd: '<%= config.app %>/tokenizer', 20 | src: 'index.js', 21 | dest: '<%= config.dist %>' 22 | }] 23 | } 24 | }, 25 | uglify: { 26 | options: { 27 | banner: '/*! <%= pkg.name %> lib - v<%= pkg.version %> -' + 28 | '<%= grunt.template.today("yyyy-mm-dd") %> */' 29 | }, 30 | dist: { 31 | files: { 32 | '<%= config.dist %>/js/lib.min.js': [ 33 | '<%= bower.directory %>/jquery/jquery.js', 34 | '<%= bower.directory %>/underscore/underscore.js', 35 | '<%= bower.directory %>/requirejs/require.js', 36 | ] 37 | } 38 | } 39 | } 40 | }); 41 | 42 | grunt.loadNpmTasks('grunt-contrib-copy'); 43 | grunt.loadNpmTasks('grunt-contrib-uglify'); 44 | 45 | grunt.registerTask('default', [ 46 | 'copy', 47 | // 'uglify' 48 | ]); 49 | }; 50 | -------------------------------------------------------------------------------- /lib/tokenizer/lexerRule.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | /** 4 | * Instantiate a new lexical rule with a name and a regex 5 | * 6 | * @param name 7 | * a name 8 | * @param regex 9 | * a regular expression 10 | */ 11 | var LexerRule = function (name, regex) { 12 | /** 13 | * The name of the lexical category that this rule matches 14 | */ 15 | var name = name || ''; 16 | 17 | /** 18 | * The regular expression used for matching 19 | */ 20 | var regex = regex || false; 21 | /** 22 | * A pre-compiled pattern object, kept to save processing time 23 | */ 24 | var pattern = false; 25 | 26 | /** 27 | * Get the category name 28 | * 29 | * @return the name of rule 30 | */ 31 | function getName() { 32 | return this.name; 33 | } 34 | 35 | /** 36 | * Get the regex defining the rule 37 | * 38 | * @return the regex 39 | */ 40 | function getRegex() { 41 | return this.regex; 42 | } 43 | 44 | /** 45 | * Return the pattern object. Create one if it hasn't been created already. 
46 | * 47 | * @return the pattern object 48 | */ 49 | function getPattern() { 50 | if (this.pattern == null) { 51 | this.pattern = new RegExp(this.regex); 52 | } 53 | return this.pattern; 54 | } 55 | 56 | /** 57 | * Return a string representation of the rule 58 | */ 59 | function toString() { 60 | return "[" + this.name + "]"; 61 | } 62 | } 63 | 64 | module.exports = LexerRule; -------------------------------------------------------------------------------- /lib/tokenizer/lexer.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | var lexerRule = require('./lexerRule'); 4 | var lexerModel = require('../model/lexicon'); 5 | 6 | var re = lexerModel; 7 | 8 | function LexerNode(string, regex, regexs) { 9 | this.string = string; 10 | this.children = []; 11 | 12 | if (string) { 13 | this.matches = string.match(regex); 14 | var childElements = string.split(regex); 15 | } 16 | 17 | if (!this.matches) { 18 | this.matches = []; 19 | var childElements = [string]; 20 | } 21 | 22 | if (!regexs.length) { 23 | // no more regular expressions, we're done 24 | this.children = childElements; 25 | } else { 26 | // descend recursively 27 | var nextRegex = regexs[0] 28 | , nextRegexes = regexs.slice(1); 29 | 30 | for (var i in childElements) { 31 | if (childElements.hasOwnProperty(i)) { 32 | this.children.push( 33 | new LexerNode(childElements[i], nextRegex, nextRegexes)); 34 | } 35 | } 36 | } 37 | } 38 | 39 | LexerNode.prototype.fillArray = function(array){ 40 | for (var i in this.children) { 41 | if (this.children.hasOwnProperty(i)) { 42 | var child = this.children[i]; 43 | 44 | if (child && child.fillArray) { 45 | child.fillArray(array); 46 | } else if (re.unblank.test(child)) { 47 | array.push(child); 48 | } 49 | 50 | if (i < this.matches.length) { 51 | var match = this.matches[i]; 52 | if (re.unblank.test(match)) 53 | array.push(match); 54 | } 55 | } 56 | } 57 | } 58 | 59 | LexerNode.prototype.toString = function(){ 60 | var array = []; 61 | this.fillArray(array); 62 | return array.toString(); 63 | } 64 | 65 | function Lexer(string){ 66 | var regexs = []; 67 | for (var i in lexerModel) regexs.push(lexerModel[i]); 68 | 69 | var array = [] 70 | var node = new LexerNode(string, regexs[0], regexs.slice(1)); 71 | 72 | node.fillArray(array); 73 | return array; 74 | } 75 | 76 | module.exports = Lexer; -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # node-vntokenizer 2 | 3 | Tokenizer for Vietnamese in Node.js and JavaScript. 4 | 5 | 6 | 7 | 8 | [![NPM](https://nodei.co/npm/node-vntokenizer.png?downloads=true&downloadRank=true&stars=true)](https://nodei.co/npm/node-vntokenizer/) 9 | 10 | # Installation 11 | 12 | * Via NPM 13 | ```sh 14 | npm install node-vntokenizer --save 15 | ``` 16 | 17 | * Via Bower 18 | ```sh 19 | bower install tokenizer --save 20 | ``` 21 | 22 | # Sample 23 | 24 | ```js 25 | var Tokenizer = require('node-vntokenizer'); 26 | var token = new Tokenizer(); 27 | 28 | console.log(token.tokenize('Lê Văn Duyệt')); 29 | // [ 'Lê', 'Văn', 'Duyệt' ] 30 | 31 | console.log(token.tokenize('Tôi tên là Duyệt. Test chơi vậy thôi!! 
Không biết có đúng hay không nữa?')); 32 | // [ 'Tôi', 'tên', 'là', 'Duyệt', 'Test', 'chơi', 'vậy', 'thôi', 'Không', 'biết', 'có', 'đúng', 'hay', 'không', 'nữa' ] 33 | 34 | console.log(token.tokenize('!!!Lê!!Văn Duyệt')); 35 | // [ 'Lê', 'Văn', 'Duyệt' ] 36 | ``` 37 | 38 | # Test 39 | ```sh 40 | npm test 41 | ``` 42 | 43 | # How to contribute 44 | 1. Fork the project on Github 45 | 2. Create a topic branch for your changes 46 | 3. Ensure that you provide documentation and test coverage for your changes (patches won’t be accepted without) 47 | 4. Create a pull request on Github (these are also a great place to start a conversation around a patch as early as possible) 48 | 49 | # License 50 | MIT License 51 | 52 | Copyright (c) 2015 Van-Duyet Le 53 | 54 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 55 | 56 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 57 | 58 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 59 | 60 | 61 | -------------------------------------------------------------------------------- /dist/index.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | (function() { 4 | var root = this; 5 | var has_require = typeof require !== 'undefined'; 6 | 7 | if (typeof _ === 'undefined') { 8 | if (has_require) { 9 | var util = require("util"); 10 | var _ = require('lodash'); 11 | } else 12 | throw new Error('vnTokenizer requires underscore'); 13 | } 14 | 15 | var Tokenizer = function() {}; 16 | 17 | Tokenizer.prototype.trim = function(array) { 18 | while (array[array.length - 1] == '') 19 | array.pop(); 20 | 21 | while (array[0] == '') 22 | array.shift(); 23 | 24 | return array; 25 | }; 26 | 27 | // Expose an attach function that will patch String with new methods. 
28 | Tokenizer.prototype.attach = function() { 29 | var self = this; 30 | 31 | String.prototype.tokenize = function() { 32 | return self.tokenize(this); 33 | } 34 | }; 35 | 36 | Tokenizer.prototype.tokenize = function() {}; 37 | 38 | // Base Class for RegExp Matching 39 | var RegexpTokenizer = function(options) { 40 | options = options || {}; 41 | this._pattern = /[^a-z0-9A-Z_ÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚĂĐĨŨƠàáâãèéêìíòóôõùúăđĩũơƯĂẠẢẤẦẨẪẬẮẰẲẴẶẸẺẼỀỀỂưăạảấầẩẫậắằẳẵặẹẻẽềếềểỄỆỈỊỌỎỐỒỔỖỘỚỜỞỠỢỤỦỨỪễệỉịọỏốồổỗộớờởỡợụủứừỬỮỰỲỴÝỶỸửữựỳỵỷỹ]+/; 42 | this._pattern = options.pattern || this._pattern; 43 | this.discardEmpty = options.discardEmpty || true; 44 | 45 | // Match and split on GAPS not the actual WORDS 46 | this._gaps = options.gaps; 47 | 48 | if (this._gaps === undefined) { 49 | this._gaps = true; 50 | } 51 | }; 52 | 53 | if (util) util.inherits(RegexpTokenizer, Tokenizer); 54 | 55 | RegexpTokenizer.prototype.tokenize = function(s) { 56 | var results; 57 | 58 | if (this._gaps) { 59 | results = s.split(this._pattern); 60 | return (this.discardEmpty) ? _.without(results, '', ' ') : results; 61 | } else { 62 | return s.match(this._pattern); 63 | } 64 | }; 65 | 66 | 67 | /*** 68 | * A tokenizer that divides a text into sequences of alphabetic and 69 | * non-alphabetic characters. E.g.: 70 | * 71 | * >>> WordTokenizer().tokenize("She said 'hello'.") 72 | * ['She', 'said', 'hello'] 73 | * 74 | */ 75 | var WordTokenizer = function(options) { 76 | this._pattern = /\W+/; 77 | RegexpTokenizer.call(this, options) 78 | }; 79 | 80 | if (util) util.inherits(WordTokenizer, RegexpTokenizer); 81 | 82 | /*** 83 | * A tokenizer that divides a text into sequences of alphabetic and 84 | * non-alphabetic characters. E.g.: 85 | * 86 | * >>> WordPunctTokenizer().tokenize("She said 'hello'.") 87 | * ['She', 'said', "'", 'hello', "'."] 88 | * 89 | */ 90 | var WordPunctTokenizer = function(options) { 91 | this._pattern = new RegExp(/(\w+|\!|\'|\"")/i); 92 | RegexpTokenizer.call(this, options) 93 | }; 94 | 95 | if (util) util.inherits(WordPunctTokenizer, RegexpTokenizer); 96 | 97 | 98 | //exports.Tokenizer = Tokenizer; 99 | //exports.RegexpTokenizer = RegexpTokenizer; 100 | //exports.WordTokenizer = WordTokenizer; 101 | //exports.WordPunctTokenizer = WordPunctTokenizer; 102 | //module.exports = WordTokenizer; 103 | 104 | // Exports 105 | if (typeof exports !== 'undefined') { 106 | if (typeof module !== 'undefined' && module.exports) { 107 | exports = module.exports = RegexpTokenizer; 108 | } 109 | exports.Tokenizer = RegexpTokenizer; 110 | } else { 111 | root.Tokenizer = RegexpTokenizer; 112 | } 113 | }).call(this); -------------------------------------------------------------------------------- /lib/tokenizer/index.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | 3 | (function() { 4 | var root = this; 5 | var has_require = typeof require !== 'undefined'; 6 | 7 | if (typeof _ === 'undefined') { 8 | if (has_require) { 9 | var util = require("util"); 10 | var _ = require('lodash'); 11 | } else 12 | throw new Error('vnTokenizer requires underscore'); 13 | } 14 | 15 | var Tokenizer = function() {}; 16 | 17 | Tokenizer.prototype.trim = function(array) { 18 | while (array[array.length - 1] == '') 19 | array.pop(); 20 | 21 | while (array[0] == '') 22 | array.shift(); 23 | 24 | return array; 25 | }; 26 | 27 | // Expose an attach function that will patch String with new methods. 
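  // Illustrative usage (an assumption, not taken from the original docs):
  //   var Tokenizer = require('node-vntokenizer');
  //   var tokenizer = new Tokenizer();
  //   tokenizer.attach();
  //   'Lê Văn Duyệt'.tokenize(); // => [ 'Lê', 'Văn', 'Duyệt' ]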
28 | Tokenizer.prototype.attach = function() { 29 | var self = this; 30 | 31 | String.prototype.tokenize = function() { 32 | return self.tokenize(this); 33 | } 34 | }; 35 | 36 | Tokenizer.prototype.tokenize = function() {}; 37 | 38 | // Base Class for RegExp Matching 39 | var RegexpTokenizer = function(options) { 40 | options = options || {}; 41 | this._pattern = /[^a-z0-9A-Z_ÆØÅÀÁÂÃÈÉÊÌÍÒÓÔÕÙÚĂĐĨŨƠæøåàáâãèéêìíòóôõùúăđĩũơƯĂẠẢẤẦẨẪẬẮẰẲẴẶẸẺẼỀỀỂưăạảấầẩẫậắằẳẵặẹẻẽềếềểỄỆỈỊỌỎỐỒỔỖỘỚỜỞỠỢỤỦỨỪễệỉịọỏốồổỗộớờởỡợụủứừỬỮỰỲỴÝỶỸửữựỳỵỷỹ]+/; 42 | this._pattern = options.pattern || this._pattern; 43 | this.discardEmpty = options.discardEmpty || true; 44 | 45 | // Match and split on GAPS not the actual WORDS 46 | this._gaps = options.gaps; 47 | 48 | if (this._gaps === undefined) { 49 | this._gaps = true; 50 | } 51 | }; 52 | 53 | if (util) util.inherits(RegexpTokenizer, Tokenizer); 54 | 55 | RegexpTokenizer.prototype.tokenize = function(s) { 56 | var results; 57 | 58 | if (this._gaps) { 59 | results = s.split(this._pattern); 60 | return (this.discardEmpty) ? _.without(results, '', ' ') : results; 61 | } else { 62 | return s.match(this._pattern); 63 | } 64 | }; 65 | 66 | 67 | /*** 68 | * A tokenizer that divides a text into sequences of alphabetic and 69 | * non-alphabetic characters. E.g.: 70 | * 71 | * >>> WordTokenizer().tokenize("She said 'hello'.") 72 | * ['She', 'said', 'hello'] 73 | * 74 | */ 75 | var WordTokenizer = function(options) { 76 | this._pattern = /\W+/; 77 | RegexpTokenizer.call(this, options) 78 | }; 79 | 80 | if (util) util.inherits(WordTokenizer, RegexpTokenizer); 81 | 82 | /*** 83 | * A tokenizer that divides a text into sequences of alphabetic and 84 | * non-alphabetic characters. E.g.: 85 | * 86 | * >>> WordPunctTokenizer().tokenize("She said 'hello'.") 87 | * ['She', 'said', "'", 'hello', "'."] 88 | * 89 | */ 90 | var WordPunctTokenizer = function(options) { 91 | this._pattern = new RegExp(/(\w+|\!|\'|\"")/i); 92 | RegexpTokenizer.call(this, options) 93 | }; 94 | 95 | if (util) util.inherits(WordPunctTokenizer, RegexpTokenizer); 96 | 97 | 98 | //exports.Tokenizer = Tokenizer; 99 | //exports.RegexpTokenizer = RegexpTokenizer; 100 | //exports.WordTokenizer = WordTokenizer; 101 | //exports.WordPunctTokenizer = WordPunctTokenizer; 102 | //module.exports = WordTokenizer; 103 | 104 | // Exports 105 | if (typeof exports !== 'undefined') { 106 | if (typeof module !== 'undefined' && module.exports) { 107 | exports = module.exports = RegexpTokenizer; 108 | } 109 | exports.Tokenizer = RegexpTokenizer; 110 | } else { 111 | root.Tokenizer = RegexpTokenizer; 112 | } 113 | }).call(this); -------------------------------------------------------------------------------- /lib/model/lexicon.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | "numbersign": /#/, 3 | "ampersand": /&/, 4 | "date_mm-dd-yy": /(0*[1-9]|1[012])-(0*[1-9]|[12][0-9]|3[01])-\d\d/, 5 | "date_mm/dd/yy": /(0*[1-9]|1[012])\/(0*[1-9]|[12][0-9]|3[01])\/\d\d/, 6 | "date_mm.dd.yy": /(0*[1-9]|1[012])\.(0*[1-9]|[12][0-9]|3[01])\.\d\d/, 7 | "date_dd-mm-yy": /([12][0-9]|3[01]|0*[1-9])-(1[012]||0*[1-9])-\d\d/, 8 | "date_dd/mm/yy": /([12][0-9]|3[01]|0*[1-9])\/(1[012]||0*[1-9])\/\d\d/, 9 | "date_dd.mm.yy": /([12][0-9]|3[01]|0*[1-9])[\.](1[012]||0*[1-9])[\.]\d\d/, 10 | "date_dd-mm-yyyy": /([12][0-9]|3[01]|0*[1-9])-(1[012]||0*[1-9])-(19|20)\d\d/, 11 | "date_dd/mm/yyyy": /([12][0-9]|3[01]|0*[1-9])\/(1[012]||0*[1-9])\/(19|20)\d\d/, 12 | "date_dd.mm.yyyy": 
/([12][0-9]|3[01]|0*[1-9])\.(1[012]||0*[1-9])\.(19|20)\d\d/, 13 | "date_dd-mm": /(0*[1-9]|[12][0-9]|3[01])[-\/\.](1[012]|0*[1-9])/, 14 | "date_mm-yy": /(0*[1-9]|1[012])[-\/\.]\d\d/, 15 | "date_mm-yyyy": /(0*[1-9]|1[012])[-\/\.](19|20)\d\d/, 16 | "date_yyyy": /(19|20)\d\d/, 17 | "date_mm-dd-yyyy": /(0*[1-9]|1[012])-([12][0-9]|3[01]|0*[1-9])-(19|20)\d\d/, 18 | "date_mm/dd/yyyy": /(0*[1-9]|1[012])\/([12][0-9]|3[01]|0*[1-9])\/(19|20)\d\d/, 19 | "date_mm.dd.yyyy": /(0*[1-9]|1[012])\.([12][0-9]|3[01]|0*[1-9])\.(19|20)\d\d/, 20 | "date_yyyy-mm-dd": /(19|20)\d\d-(0*[1-9]|1[012])-([12][0-9]|3[01]|0*[1-9])/, 21 | "date_yyyy/mm/dd": /(19|20)\d\d\/(0*[1-9]|1[012])\/([12][0-9]|3[01]|0*[1-9])/, 22 | "date_yyyy.mm.dd": /(19|20)\d\d\.(0*[1-9]|1[012])\.([12][0-9]|3[01]|0*[1-9])/, 23 | "hhmmss": /([0-1]\d|[2][0-3]):[0-5]\d:[0-5]\d/, 24 | "percent": /([0-9]*[\.,])?[0-9]+%/, 25 | "name1": /[A-ZÁÂĐÐÍÔƯỨÝỶ][aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]*/, 26 | "name2": /([A-ZÁÂĐÐÍÔƯỨÝỶ][aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]*)(\s+[A-ZÁÂĐÐÍÔƯỨÝỶ][aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+)*/, 27 | //"phrase": /([AÀẢÃÁẠĂẰẲẴẮẶÂẦẨẪẤẬBCDĐÐEÈẺẼÉẸÊỀỂỄẾỆFGHIÌỈĨÍỊJKLMNOÒỎÕÓỌÔỒỔỖỐỘƠỜỞỠỚỢPQRSTUÙỦŨÚỤƯỪỬỮỨỰVWXYỲỶỸÝỴZ])?([aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz\s])*([aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz])*/, 28 | "phrase": /([aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+)/, 29 | "allcaps": /([AÀẢÃÁẠĂẰẲẴẮẶÂẦẨẪẤẬBCDĐÐEÈẺẼÉẸÊỀỂỄẾỆFGHIÌỈĨÍỊJKLMNOÒỎÕÓỌÔỒỔỖỐỘƠỜỞỠỚỢPQRSTUÙỦŨÚỤƯỪỬỮỨỰVWXYỲỶỸÝỴZ]+)(\s*[AÀẢÃÁẠĂẰẲẴẮẶÂẦẨẪẤẬBCDĐÐEÈẺẼÉẸÊỀỂỄẾỆFGHIÌỈĨÍỊJKLMNOÒỎÕÓỌÔỒỔỖỐỘƠỜỞỠỚỢPQRSTUÙỦŨÚỤƯỪỬỮỨỰVWXYỲỶỸÝỴZ]+)*[^aàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz\/\)\(\?!\.;:,\-"']/, 30 | "fraction": /(\d+)\/(\d+)/, 31 | "email": /(\w[-._\w]*\w@\w[-._\w]*\w\.\w{2,3})/, 32 | "return": /(^$)/, 33 | "fslash": /\//, 34 | "langle": /</, 35 | "xmltags": /<\/*\w*>/, 36 | "equal": /=/, 37 | "rangle": />/, 38 | "aroba": /@/, 39 | "number1": /[+]?([0-9]*)?[0-9]+([\.,]\d+)*/, 40 | "number2": /[+]?([0-9]*)?[0-9]+([\.,]\d+)*(\s|tỉ|tỷ|triệu|ngàn|nghìn|trăm|chục)*/, 41 | "degree": /[-+]?([0-9]*[\.,])?[0-9]+°/, 42 | "ponctuation": /[\\?!\\.:;,\-"']/, 43 | "dollar": /\$/, 44 | "lparen": /\(/, 45 | "rparen": /\)/, 46 | "asterisk": /\*/, 47 | "plus": /\+/, 48 | "minus": /\-/, 49 | "ellipsis": /\.\.\./, 50 | "residual": /\W/, 51 | "lbracket": /\[/, 52 | "bslash": /\\/, 53 | "rbracket": /\]/, 54 | "entity0": /\d+([\.,]\d+)*[A-Zaàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+\d+$/, 55 | "entity1": /[A-Zaàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+(\d)*$/, 56 | "entity2": /[A-Zaàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz\d]+([\.\-/][\dA-Zaàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+)*[\dA-Zaàảãáạăằẳẵắặâầẩẫấậbcdđeèẻẽéẹêềểễếệfghiìỉĩíịjklmnoòỏõóọôồổỗốộơờởỡớợpqrstuùủũúụưừửữứựvwxyỳỷỹýỵz]+/, 57 | "space": /\s+/, 58 | "word": /\w/, 59 | "lcbrace": /\{/, 60 | "rcbrace": /\}/, 61 | "underscore": /_/, 62 | "pound": /£/, 63 | //number: /[0-9]*\.[0-9]+|[0-9]+/ig, 64 | // space: /\s+/ig, 65 | unblank: /\S/, 66 | 67 | }; 
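One way to use this table outside the lexer is to classify a single raw token by finding the first rule whose regex matches it in full. A minimal sketch of that idea (the `classify` helper below is illustrative, not part of the library, and assumes the module is required straight from the repository root):

```js
var lexicon = require('./lib/model/lexicon');

// Return the name of the first rule whose regex matches the entire token.
function classify(token) {
  var names = Object.keys(lexicon);
  for (var i = 0; i < names.length; i++) {
    var match = token.match(lexicon[names[i]]);
    if (match && match[0] === token) return names[i];
  }
  return null;
}

classify('user@example.com'); // 'email'
classify('25/12/2015');       // one of the "date_*" rules
```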
-------------------------------------------------------------------------------- /lib/tokenizer/taggedWord.js: -------------------------------------------------------------------------------- 1 | var LexerRule = require('./lexerRule'); 2 | 3 | /** 4 | * Create a LexerToken 5 | * 6 | * @param rule 7 | * a rule 8 | * @param text 9 | * the text 10 | * @param line 11 | * the line location of the text in a file 12 | * @param column 13 | * the column location of the text in a file 14 | */ 15 | var TaggedWord = function(rule, text, line, column) { 16 | /** 17 | * A lexer rule 18 | */ 19 | var rule = rule || null; 20 | 21 | /** 22 | * The text 23 | */ 24 | var text = text || ''; 25 | 26 | /** 27 | * The line location of the text in the file 28 | */ 29 | var line = line || -1; 30 | 31 | /** 32 | * The column location of the text in the file 33 | */ 34 | var column = column || -1; 35 | 36 | /** 37 | * Create a lexer token from a text 38 | * 39 | * @param text 40 | * a text 41 | */ 42 | public TaggedWord(String text) { 43 | this.rule = null; 44 | this.text = text; 45 | this.line = -1; 46 | this.column = -1; 47 | } 48 | 49 | /** 50 | * Return the rule that matched this token 51 | * 52 | * @return the rule that match this token 53 | */ 54 | function getRule() { 55 | return rule; 56 | } 57 | 58 | /** 59 | * Return the text that matched by this token 60 | * 61 | * @return the text matched by this token 62 | */ 63 | function getText() { 64 | return text.trim(); 65 | } 66 | 67 | /** 68 | * Test if this rule is a phrase rule. A phrase is processed 69 | * by a lexical segmenter. 70 | * 71 | * @return true/false 72 | */ 73 | function isPhrase() { 74 | return rule.getName() === "phrase" ? true : false; 75 | } 76 | 77 | function stringStartsWith (string, prefix) { 78 | return string.slice(0, prefix.length) == prefix; 79 | } 80 | 81 | /** 82 | * Test if this rule is a named entity rule. 83 | * 84 | * @return true/false 85 | */ 86 | function isNamedEntity() { 87 | return stringStartsWith(rule.getName(), "name"); 88 | } 89 | 90 | /** 91 | * @return true/false 92 | */ 93 | function isDate() { 94 | return stringStartsWith(rule.getName(), "date"); 95 | } 96 | 97 | /** 98 | * @return true/false 99 | */ 100 | function isDateDay() { 101 | return stringStartsWith(rule.getName(), "day"); 102 | } 103 | 104 | /** 105 | * @return true/false 106 | */ 107 | function isDateMonth() { 108 | return stringStartsWith(rule.getName(), "month"); 109 | } 110 | 111 | function isDateYear() { 112 | return stringStartsWith(rule.getName(), "year"); 113 | } 114 | 115 | function isNumber() { 116 | return stringStartsWith(rule.getName(), "number"); 117 | } 118 | /** 119 | * @return Returns the column. 120 | */ 121 | function getColumn() { 122 | return column; 123 | } 124 | 125 | /** 126 | * @param column 127 | * The column to set. 128 | */ 129 | function setColumn(column) { 130 | this.column = column; 131 | } 132 | 133 | /** 134 | * @return Returns the line. 135 | */ 136 | function getLine() { 137 | return line; 138 | } 139 | 140 | /** 141 | * @param line 142 | * The line to set. 
143 | */ 144 | function setLine(line) { 145 | this.line = line; 146 | } 147 | 148 | /** 149 | * Return a string representation of the token 150 | */ 151 | function toString() { 152 | // return "[\"" + text + "\"" + " at (" + line + "," + column + ")]"; 153 | // return rule.getName() + ": " + text; 154 | return text.trim(); 155 | } 156 | 157 | /* (non-Javadoc) 158 | * @see java.lang.Object#hashCode() 159 | */ 160 | function hashCode() { 161 | return getText(); // JavaScript strings have no hashCode(); the trimmed text itself serves as the key 162 | } 163 | 164 | /* (non-Javadoc) 165 | * @see java.lang.Object#equals(java.lang.Object) 166 | */ 167 | function equals(obj) { 168 | if (obj == null) return false; 169 | if (!(obj instanceof TaggedWord)) { 170 | return false; 171 | } 172 | // two tokens are considered equal if their texts are equal. 173 | // 174 | return obj.getText() === getText(); 175 | } 176 | 177 | /* (non-Javadoc) 178 | * @see java.lang.Comparable#compareTo(java.lang.Object) 179 | */ 180 | function compareTo(o) { 181 | return getText().localeCompare(o.getText()); 182 | } 183 | } 184 | 185 | module.exports = TaggedWord; --------------------------------------------------------------------------------
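The tokenizer exported from lib/tokenizer/index.js accepts an options object (`pattern`, `gaps`, `discardEmpty`). A short sketch of non-default usage, assuming the package is installed as node-vntokenizer; the digit-matching pattern is only an example:

```js
var Tokenizer = require('node-vntokenizer');

// With gaps: false the tokenizer matches the pattern itself instead of
// splitting on it, so the result is whatever the supplied pattern captures.
var digits = new Tokenizer({ pattern: /[0-9]+/g, gaps: false });
console.log(digits.tokenize('năm 2015 và 2016')); // [ '2015', '2016' ]

// The default behaviour splits on runs of non-word characters
// (Vietnamese letters, digits and underscore are kept).
var words = new Tokenizer();
console.log(words.tokenize('!!!Lê!!Văn Duyệt')); // [ 'Lê', 'Văn', 'Duyệt' ]
```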