├── .npmignore ├── docs ├── menu.svg ├── close.svg ├── scripts │ ├── collapse.js │ ├── linenumber.js │ ├── prettify │ │ ├── lang-css.js │ │ └── Apache-License-2.0.txt │ └── search.js ├── styles │ └── prettify.css ├── string-lower-case.js.html ├── string-trim.js.html ├── string-upper-case.js.html └── string-stem.js.html ├── .travis.yml ├── .nycrc.json ├── .jsdoc.json ├── .gitignore ├── LICENSE ├── package.json ├── runkit └── example.js ├── src ├── string-lower-case.js ├── string-upper-case.js ├── string-trim.js ├── string-stem.js ├── tokens-stem.js ├── tokens-phonetize.js ├── string-soundex.js ├── tokens-soundex.js ├── string-amplify-not-elision.js ├── string-remove-extra-spaces.js ├── string-remove-spl-chars.js ├── tokens-bigrams.js ├── string-remove-html-tags.js ├── tokens-append-bigrams.js ├── string-remove-punctuations.js ├── string-remove-elisions.js ├── string-split-elisions.js ├── string-extract-run-of-capital-words.js ├── string-retain-alpha-nums.js ├── string-marker.js ├── dictionaries │ └── stop_words.json ├── string-ngram.js ├── tokens-remove-words.js ├── jsdoc-classes.js ├── tokens-sow.js ├── string-soc.js ├── string-sentences.js ├── name_cleaner_regexes.js ├── string-tokenize0.js ├── helper-return-indexer.js ├── string-extract-persons-name.js ├── string-song.js ├── string-edge-ngrams.js ├── string-compose-corpus.js ├── tokens-propagate-negations.js ├── phonetize_regexes.js ├── string-bong.js ├── helper-return-words-filter.js ├── tokens-bow.js ├── string-tokenize.js ├── helper-return-quoted-text-extractor.js ├── string-phonetize.js ├── util_regexes.js └── wink-nlp-utils.js ├── CODE_OF_CONDUCT.md ├── test └── string-edge-ngrams-specs.js ├── README.md ├── CONTRIBUTING.md └── .eslintrc.json /.npmignore: -------------------------------------------------------------------------------- 1 | test 2 | docs 3 | sourcedocs 4 | .eslintrc.json 5 | .jshintrc 6 | .travis.yml 7 | 8 | # Generic 9 | .node_repl_history 10 | .npm 11 | coverage 12 | -------------------------------------------------------------------------------- /docs/menu.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - "16.17.1" 4 | 5 | 6 | cache: 7 | directories: 8 | - "node_modules" 9 | 10 | script: 11 | - npm run pretest 12 | - npm run test 13 | - npm run coverage 14 | -------------------------------------------------------------------------------- /docs/close.svg: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /docs/scripts/collapse.js: -------------------------------------------------------------------------------- 1 | function hideAllButCurrent(){ 2 | //by default all submenut items are hidden 3 | $("nav > ul > li > ul li").hide(); 4 | 5 | //only current page (if it exists) should be opened 6 | var file = window.location.pathname.split("/").pop(); 7 | $("nav > ul > li > a[href^='"+file+"']").parent().find("> ul li").show(); 8 | } 9 | $( document ).ready(function() { 10 | hideAllButCurrent(); 11 | }); -------------------------------------------------------------------------------- /.nycrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "watermarks": { 3 | "branches": [ 4 | 90, 5 | 99.5 6 | ], 7 | "lines": [ 8 | 90, 9 | 99.5 10 | ], 11 | "functions": [ 12 | 90, 13 | 99.5 14 | ], 15 | "statements": [ 16 | 90, 17 | 99.5 18 | ] 19 | }, 20 | "branches": 99.5, 21 | "lines": 99.5, 22 | "functions": 99.5, 23 | "statements": 99.5, 24 | "check-coverage": true 25 | } -------------------------------------------------------------------------------- /.jsdoc.json: -------------------------------------------------------------------------------- 1 | { 2 | "opts": { 3 | "encoding": "utf8", 4 | "readme": "README.md", 5 | "template": "./node_modules/docdash/", 6 | "destination": "docs" 7 | }, 8 | "plugins": ["plugins/markdown"], 9 | "docdash": { 10 | "meta": { 11 | "title": "wink-nlp-utils - Wink JS" 12 | }, 13 | "menu":{ 14 | "Github":{ 15 | "href":"https://github.com/winkjs/wink-nlp-utils", 16 | "target":"_blank" 17 | } 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | 6 | # Runtime data 7 | pids 8 | *.pid 9 | *.seed 10 | 11 | # Coverage directory used by tools like istanbul 12 | coverage 13 | 14 | # nyc test coverage 15 | .nyc_output 16 | 17 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 18 | .grunt 19 | 20 | # node-waf configuration 21 | .lock-wscript 22 | 23 | # Compiled binary addons (http://nodejs.org/api/addons.html) 24 | build/Release 25 | 26 | # Dependency directories 27 | node_modules 28 | jspm_packages 29 | 30 | # Optional npm cache directory 31 | .npm 32 | 33 | # Optional REPL history 34 | .node_repl_history 35 | 36 | # Documentation 37 | sourcedocs 38 | 39 | # Mac stuff 40 | .DS_Store 41 | -------------------------------------------------------------------------------- /docs/scripts/linenumber.js: -------------------------------------------------------------------------------- 1 | /*global document */ 2 | (function() { 3 | var source = document.getElementsByClassName('prettyprint source linenums'); 4 | var i = 0; 5 | var lineNumber = 0; 6 | var lineId; 7 | var lines; 8 | var totalLines; 9 | var anchorHash; 10 | 11 | if (source && source[0]) { 12 | anchorHash = document.location.hash.substring(1); 13 | lines = source[0].getElementsByTagName('li'); 14 | totalLines = lines.length; 15 | 16 | for (; i < totalLines; i++) { 17 | lineNumber++; 18 | lineId = 'line' + lineNumber; 19 | lines[i].id = lineId; 20 | if (lineId === anchorHash) { 21 | lines[i].className += ' selected'; 22 | } 23 | } 24 | } 25 | })(); 26 | -------------------------------------------------------------------------------- /docs/scripts/prettify/lang-css.js: -------------------------------------------------------------------------------- 1 | PR.registerLangHandler(PR.createSimpleLexer([["pln",/^[\t\n\f\r ]+/,null," \t\r\n "]],[["str",/^"(?:[^\n\f\r"\\]|\\(?:\r\n?|\n|\f)|\\[\S\s])*"/,null],["str",/^'(?:[^\n\f\r'\\]|\\(?:\r\n?|\n|\f)|\\[\S\s])*'/,null],["lang-css-str",/^url\(([^"')]*)\)/i],["kwd",/^(?:url|rgb|!important|@import|@page|@media|@charset|inherit)(?=[^\w-]|$)/i,null],["lang-css-kw",/^(-?(?:[_a-z]|\\[\da-f]+ ?)(?:[\w-]|\\\\[\da-f]+ ?)*)\s*:/i],["com",/^\/\*[^*]*\*+(?:[^*/][^*]*\*+)*\//],["com", 2 | /^(?:<\!--|--\>)/],["lit",/^(?:\d+|\d*\.\d+)(?:%|[a-z]+)?/i],["lit",/^#[\da-f]{3,6}/i],["pln",/^-?(?:[_a-z]|\\[\da-f]+ ?)(?:[\w-]|\\\\[\da-f]+ ?)*/i],["pun",/^[^\s\w"']+/]]),["css"]);PR.registerLangHandler(PR.createSimpleLexer([],[["kwd",/^-?(?:[_a-z]|\\[\da-f]+ ?)(?:[\w-]|\\\\[\da-f]+ ?)*/i]]),["css-kw"]);PR.registerLangHandler(PR.createSimpleLexer([],[["str",/^[^"')]+/]]),["css-str"]); 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2017-22 GRAYPE Systems Private Limited 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies 9 | of the Software, and to permit persons to whom the Software is furnished to do 10 | so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /docs/styles/prettify.css: -------------------------------------------------------------------------------- 1 | .pln { 2 | color: #abb2bf; 3 | } 4 | 5 | /* string content */ 6 | .str { 7 | color: #98c379; 8 | } 9 | 10 | /* a keyword */ 11 | .kwd { 12 | color: #c678dd; 13 | } 14 | 15 | /* a comment */ 16 | .com { 17 | color: #5c6370; 18 | font-style: italic; 19 | } 20 | 21 | /* a type name */ 22 | .typ { 23 | color: #d19a66; 24 | } 25 | 26 | /* a literal value */ 27 | .lit { 28 | color: #56b6c2; 29 | } 30 | 31 | /* punctuation */ 32 | .pun { 33 | color: #abb2bf; 34 | } 35 | 36 | /* lisp open bracket */ 37 | .opn { 38 | color: #000000; 39 | } 40 | 41 | /* lisp close bracket */ 42 | .clo { 43 | color: #000000; 44 | } 45 | 46 | /* a markup tag name */ 47 | .tag { 48 | color: #c678dd; 49 | } 50 | 51 | /* a markup attribute name */ 52 | .atn { 53 | color: #98c379; 54 | } 55 | 56 | /* a markup attribute value */ 57 | .atv { 58 | color: #d19a66; 59 | } 60 | 61 | /* a declaration */ 62 | .dec { 63 | color: #EF5050; 64 | } 65 | 66 | /* a variable name */ 67 | .var { 68 | color: #d19a66; 69 | } 70 | 71 | /* a function name */ 72 | .fun { 73 | color: #e06c75; 74 | } 75 | 76 | /* Specify class=linenums on a pre to get line numbering */ 77 | ol.linenums { 78 | margin-top: 0; 79 | margin-bottom: 0; 80 | } 81 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "wink-nlp-utils", 3 | "version": "2.1.0", 4 | "description": "NLP Functions for amplifying negations, managing elisions, creating ngrams, stems, phonetic codes to tokens and more.", 5 | "keywords": [ 6 | "Tokenize", 7 | "Stem", 8 | "NGrams", 9 | "Bag of Words", 10 | "Phonetize", 11 | "Soundex", 12 | "Stop Words", 13 | "Sentence Breaking", 14 | "Regex", 15 | "NLP", 16 | "Natural Language Processing" 17 | ], 18 | "main": "src/wink-nlp-utils.js", 19 | "scripts": { 20 | "pretest": "npm run lint && npm run docs", 21 | "test": "nyc --reporter=html --reporter=text mocha ./test/", 22 | "coverage": "nyc report --reporter=text-lcov | coveralls", 23 | "sourcedocs": "docker -i src -o ./sourcedocs --sidebar no", 24 | "docs": "jsdoc src/*.js -c .jsdoc.json", 25 | "lint": "eslint ./src/*.js ./test/*.js ./runkit/*.js" 26 | }, 27 | "repository": { 28 | "type": "git", 29 | "url": "git+https://github.com/winkjs/wink-nlp-utils.git" 30 | }, 31 | "author": "Sanjaya Kumar Saxena", 32 | "license": "MIT", 33 | "bugs": { 34 | "url": "https://github.com/winkjs/wink-nlp-utils/issues" 35 | }, 36 | "homepage": "http://winkjs.org/", 37 | "devDependencies": { 38 | "chai": "^4.3.6", 39 | "coveralls": "^3.1.1", 40 | "docdash": "github:winkjs/docdash", 41 | "docker": "^1.0.0", 42 | "eslint": "^8.25.0", 43 | "jsdoc": "^3.6.11", 44 | "mocha": "^10.0.0", 45 | "nyc": "^15.1.0" 46 | }, 47 | "dependencies": { 48 | "wink-distance": "^2.0.1", 49 | "wink-eng-lite-web-model": "^1.4.3", 50 | "wink-helpers": "^2.0.0", 51 | "wink-nlp": "^1.12.0", 52 | "wink-porter2-stemmer": "^2.0.1", 53 | "wink-tokenizer": "^5.2.3" 54 | }, 55 | "runkitExampleFilename": "./runkit/example.js" 56 | } 57 | -------------------------------------------------------------------------------- /runkit/example.js: -------------------------------------------------------------------------------- 1 | // Load wink-nlp-utils 2 | var nlp = require( 'wink-nlp-utils' ); 3 | 4 | // Extract person's name from a string: 5 | var name = nlp.string.extractPersonsName( 'Dr. Sarah Connor M. Tech., PhD. - AI' ); 6 | console.log( name ); // eslint-disable-line no-console 7 | // -> 'Sarah Connor' 8 | 9 | // Compose all possible sentences from a string: 10 | var str = '[I] [am having|have] [a] [problem|question]'; 11 | console.log( nlp.string.composeCorpus( str ) ); // eslint-disable-line no-console 12 | // -> [ 'I am having a problem', 13 | // -> 'I am having a question', 14 | // -> 'I have a problem', 15 | // -> 'I have a question' ] 16 | 17 | // Sentence Boundary Detection. 18 | var para = 'AI Inc. is focussing on AI. I work for AI Inc. My mail is r2d2@yahoo.com'; 19 | console.log( nlp.string.sentences( para ) ); // eslint-disable-line no-console 20 | // -> [ 'AI Inc. is focussing on AI.', 21 | // 'I work for AI Inc.', 22 | // 'My mail is r2d2@yahoo.com' ] 23 | 24 | // Tokenize a sentence. 25 | var s = 'For details on wink, check out http://winkjs.org/ URL!'; 26 | console.log( nlp.string.tokenize( s, true ) ); // eslint-disable-line no-console 27 | // -> [ { value: 'For', tag: 'word' }, 28 | // { value: 'details', tag: 'word' }, 29 | // { value: 'on', tag: 'word' }, 30 | // { value: 'wink', tag: 'word' }, 31 | // { value: ',', tag: 'punctuation' }, 32 | // { value: 'check', tag: 'word' }, 33 | // { value: 'out', tag: 'word' }, 34 | // { value: 'http://winkjs.org/', tag: 'url' }, 35 | // { value: 'URL', tag: 'word' }, 36 | // { value: '!', tag: 'punctuation' } ] 37 | 38 | // Remove stop words: 39 | var t = nlp.tokens.removeWords( [ 'mary', 'had', 'a', 'little', 'lamb' ] ); 40 | console.log( t ); // eslint-disable-line no-console 41 | // -> [ 'mary', 'little', 'lamb' ] 42 | -------------------------------------------------------------------------------- /src/string-lower-case.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | 29 | // ## string 30 | 31 | // ### lowerCase 32 | /** 33 | * 34 | * Converts the input string to lower case. 35 | * 36 | * @alias string#lowerCase 37 | * @param {string} str the input string. 38 | * @return {string} input string in lower case. 39 | * @example 40 | * lowerCase( 'Lower Case' ); 41 | * // -> 'lower case' 42 | */ 43 | var lowerCase = function ( str ) { 44 | return ( str.toLowerCase() ); 45 | }; // lowerCase() 46 | 47 | module.exports = lowerCase; 48 | -------------------------------------------------------------------------------- /src/string-upper-case.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | 29 | // ## string 30 | 31 | // ### upperCase 32 | /** 33 | * 34 | * Converts the input string to upper case. 35 | * 36 | * @alias string#upperCase 37 | * @param {string} str the input string. 38 | * @return {string} input string in upper case. 39 | * @example 40 | * upperCase( 'Upper Case' ); 41 | * // -> 'UPPER CASE' 42 | */ 43 | var upperCase = function ( str ) { 44 | return ( str.toUpperCase() ); 45 | }; // upperCase() 46 | 47 | module.exports = upperCase; 48 | -------------------------------------------------------------------------------- /src/string-trim.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | 29 | // ## string 30 | 31 | // ### trim 32 | /** 33 | * 34 | * Trims leading and trailing whitespaces from the input string. 35 | * 36 | * @alias string#trim 37 | * @param {string} str the input string. 38 | * @return {string} input string with leading & trailing whitespaces removed. 39 | * @example 40 | * trim( ' Padded ' ); 41 | * // -> 'Padded' 42 | */ 43 | var trim = function ( str ) { 44 | return ( str.trim() ); 45 | }; // trim() 46 | 47 | module.exports = trim; 48 | -------------------------------------------------------------------------------- /src/string-stem.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | var porter2Stemmer = require( 'wink-porter2-stemmer' ); 29 | 30 | // ## string 31 | 32 | // ### stem 33 | /** 34 | * 35 | * Stems an inflected word using Porter2 stemming algorithm. 36 | * 37 | * @alias string#stem 38 | * @param {string} word to be stemmed. 39 | * @return {string} the stemmed word. 40 | * 41 | * @example 42 | * stem( 'consisting' ); 43 | * // -> 'consist' 44 | */ 45 | var stem = function ( word ) { 46 | return ( porter2Stemmer( word ) ); 47 | }; // stem() 48 | 49 | module.exports = stem; 50 | -------------------------------------------------------------------------------- /docs/scripts/search.js: -------------------------------------------------------------------------------- 1 | $( document ).ready(function() { 2 | var searchAttr = 'data-search-mode'; 3 | jQuery.expr[':'].Contains = function(a,i,m){ 4 | return (a.textContent || a.innerText || "").toUpperCase().indexOf(m[3].toUpperCase())>=0; 5 | }; 6 | //on search 7 | $("#nav-search").on("keyup", function(event) { 8 | var search = $(this).val(); 9 | 10 | if (!search) { 11 | //no search, show all results 12 | document.documentElement.removeAttribute(searchAttr); 13 | $("nav > ul > li").not('.level-hide').show(); 14 | 15 | if(typeof hideAllButCurrent === "function"){ 16 | //let's do what ever collapse wants to do 17 | hideAllButCurrent(); 18 | } 19 | else{ 20 | //menu by default should be opened 21 | $("nav > ul > li > ul li").show(); 22 | } 23 | } 24 | else{ 25 | //we are searching 26 | document.documentElement.setAttribute(searchAttr, ''); 27 | 28 | //show all parents 29 | $("nav > ul > li").show(); 30 | //hide all results 31 | $("nav > ul > li > ul li").hide(); 32 | //show results matching filter 33 | $("nav > ul > li > ul").find("a:Contains("+search+")").parent().show(); 34 | //hide parents without children 35 | $("nav > ul > li").each(function(){ 36 | if($(this).find("a:Contains("+search+")").length == 0 && $(this).children("ul").length === 0){ 37 | //has no child at all and does not contain text 38 | $(this).hide(); 39 | } 40 | else if($(this).find("a:Contains("+search+")").length == 0 && $(this).find("ul").children(':visible').length == 0){ 41 | //has no visible child and does not contain text 42 | $(this).hide(); 43 | } 44 | }); 45 | } 46 | }); 47 | }); -------------------------------------------------------------------------------- /src/tokens-stem.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | var porter2Stemmer = require( 'wink-porter2-stemmer' ); 29 | 30 | // ## tokens 31 | 32 | // ### stem 33 | /** 34 | * 35 | * Stems input tokens using Porter Stemming Algorithm Version 2. 36 | * 37 | * @alias tokens#stem 38 | * @param {string[]} tokens the input tokens. 39 | * @return {string[]} stemmed tokens. 40 | * @example 41 | * stem( [ 'he', 'acted', 'decisively', 'today' ] ); 42 | * // -> [ 'he', 'act', 'decis', 'today' ] 43 | */ 44 | var stem = function ( tokens ) { 45 | return tokens.map( porter2Stemmer ); 46 | }; // stem() 47 | 48 | module.exports = stem; 49 | -------------------------------------------------------------------------------- /src/tokens-phonetize.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | var stringPhonetize = require( './string-phonetize.js' ); 29 | 30 | // ## tokens 31 | 32 | // ### phonetize 33 | /** 34 | * 35 | * Phonetizes input tokens using using an algorithmic adaptation of Metaphone. 36 | * 37 | * @alias tokens#phonetize 38 | * @param {string[]} tokens the input tokens. 39 | * @return {string[]} phonetized tokens. 40 | * @example 41 | * phonetize( [ 'he', 'acted', 'decisively', 'today' ] ); 42 | * // -> [ 'h', 'aktd', 'dssvl', 'td' ] 43 | */ 44 | var phonetize = function ( tokens ) { 45 | return tokens.map( stringPhonetize ); 46 | }; // phonetize() 47 | 48 | module.exports = phonetize; 49 | -------------------------------------------------------------------------------- /src/string-soundex.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | var sndx = require( 'wink-distance/src/soundex.js' ); 29 | 30 | // ## string 31 | 32 | // ### soundex 33 | /** 34 | * 35 | * Produces the soundex code from the input `word`. 36 | * 37 | * @alias string#soundex 38 | * @param {string} word the input word. 39 | * @param {number} [maxLength=4] of soundex code to be returned. 40 | * @return {string} soundex code of `word`. 41 | * @example 42 | * soundex( 'Burroughs' ); 43 | * // -> 'B620' 44 | * soundex( 'Burrows' ); 45 | * // -> 'B620' 46 | */ 47 | var soundex = function ( word, maxLength ) { 48 | return sndx( word, maxLength ); 49 | }; // soundex() 50 | 51 | module.exports = soundex; 52 | -------------------------------------------------------------------------------- /src/tokens-soundex.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | var stringSoundex = require( './string-soundex.js' ); 29 | 30 | // ## tokens 31 | 32 | // ### soundex 33 | /** 34 | * 35 | * Generates the soundex coded tokens from the input tokens. 36 | * 37 | * @alias tokens#soundex 38 | * @param {string[]} tokens the input tokens. 39 | * @return {string[]} soundex coded tokens. 40 | * @example 41 | * soundex( [ 'he', 'acted', 'decisively', 'today' ] ); 42 | * // -> [ 'H000', 'A233', 'D221', 'T300' ] 43 | */ 44 | var soundex = function ( tokens ) { 45 | // Need to send `maxLength` as `undefined`. 46 | return tokens.map( ( t ) => stringSoundex( t ) ); 47 | }; // soundex() 48 | 49 | module.exports = soundex; 50 | -------------------------------------------------------------------------------- /src/string-amplify-not-elision.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | var rgx = require( './util_regexes.js' ); 29 | 30 | // ## string 31 | 32 | // ### amplifyNotElision 33 | /** 34 | * 35 | * Amplifies the not elision by converting it into not; for example `isn't` 36 | * becomes `is not`. 37 | * 38 | * @alias string#amplifyNotElision 39 | * @param {string} str the input string. 40 | * @return {string} input string after not elision amplification. 41 | * @example 42 | * amplifyNotElision( "someone's wallet, isn't it?" ); 43 | * // -> "someone's wallet, is not it?" 44 | */ 45 | var amplifyNotElision = function ( str ) { 46 | return str.replace( rgx.notElision, '$1 not' ); 47 | }; // amplifyNotElision() 48 | 49 | module.exports = amplifyNotElision; 50 | -------------------------------------------------------------------------------- /src/string-remove-extra-spaces.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | var rgx = require( './util_regexes.js' ); 29 | 30 | // ## string 31 | 32 | // ### removeExtraSpaces 33 | /** 34 | * 35 | * Removes leading, trailing and any extra in-between whitespaces from the input 36 | * string. 37 | * 38 | * @alias string#removeExtraSpaces 39 | * @param {string} str the input string. 40 | * @return {string} input string after removal of leading, trailing and extra 41 | * whitespaces. 42 | * @example 43 | * removeExtraSpaces( ' Padded Text ' ); 44 | * // -> 'Padded Text' 45 | */ 46 | var removeExtraSpaces = function ( str ) { 47 | return ( str 48 | .trim() 49 | .replace( rgx.spaces, ' ') 50 | ); 51 | }; // removeExtraSpaces() 52 | 53 | module.exports = removeExtraSpaces; 54 | -------------------------------------------------------------------------------- /src/string-remove-spl-chars.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | var rgx = require( './util_regexes.js' ); 29 | 30 | // ## string 31 | 32 | // ### removeSplChars 33 | /** 34 | * 35 | * Removes each special character by replacing it with a whitespace. It looks for 36 | * the following special characters — `~@#%^*+=`. 37 | * 38 | * Extra spaces, if required, may be removed using [string.removeExtraSpaces](#stringremoveextraspaces) 39 | * function. 40 | * 41 | * @alias string#removeSplChars 42 | * @param {string} str the input string. 43 | * @return {string} input string after removal of special characters. 44 | * @example 45 | * removeSplChars( '4 + 4*2 = 12' ); 46 | * // -> '4 4 2 12' 47 | */ 48 | var removeSplChars = function ( str ) { 49 | return str.replace( rgx.splChars, ' ' ); 50 | }; // removeSplChars() 51 | 52 | module.exports = removeSplChars; 53 | -------------------------------------------------------------------------------- /src/tokens-bigrams.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | 29 | // ## tokens 30 | 31 | // ### bigrams 32 | /** 33 | * 34 | * Generates bigrams from the input tokens. 35 | * 36 | * @alias tokens#bigrams 37 | * @param {string[]} tokens the input tokens. 38 | * @return {string[]} the bigrams. 39 | * @example 40 | * bigrams( [ 'he', 'acted', 'decisively', 'today' ] ); 41 | * // -> [ [ 'he', 'acted' ], 42 | * // [ 'acted', 'decisively' ], 43 | * // [ 'decisively', 'today' ] ] 44 | */ 45 | var bigrams = function ( tokens ) { 46 | // Bigrams will be stored here. 47 | var bgs = []; 48 | // Helper variables. 49 | var i, imax; 50 | // Create bigrams. 51 | for ( i = 0, imax = tokens.length - 1; i < imax; i += 1 ) { 52 | bgs.push( [ tokens[ i ], tokens[ i + 1 ] ] ); 53 | } 54 | return bgs; 55 | }; // bigrams() 56 | 57 | module.exports = bigrams; 58 | -------------------------------------------------------------------------------- /src/string-remove-html-tags.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | var rgx = require( './util_regexes.js' ); 29 | 30 | // ## string 31 | 32 | // ### removeHTMLTags 33 | /** 34 | * 35 | * Removes each HTML tag by replacing it with a whitespace. 36 | * 37 | * Extra spaces, if required, may be removed using [string.removeExtraSpaces](#stringremoveextraspaces) 38 | * function. 39 | * 40 | * @alias string#removeHTMLTags 41 | * @param {string} str the input string. 42 | * @return {string} input string after removal of HTML tags. 43 | * @example 44 | * removeHTMLTags( '

Vive la France  !

' ); 45 | * // -> ' Vive la France ! ' 46 | */ 47 | var removeHTMLTags = function ( str ) { 48 | return ( str 49 | .replace( rgx.htmlTags, ' ' ) 50 | .replace( rgx.htmlEscSeq1, ' ' ) 51 | .replace( rgx.htmlEscSeq2, ' ' ) 52 | ); 53 | }; // removeHTMLTags() 54 | 55 | module.exports = removeHTMLTags; 56 | -------------------------------------------------------------------------------- /src/tokens-append-bigrams.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | 29 | // ## tokens 30 | 31 | // ### appendBigrams 32 | /** 33 | * 34 | * Generates bigrams from the input tokens and appends them to the input tokens. 35 | * 36 | * @alias tokens#appendBigrams 37 | * @param {string[]} tokens the input tokens. 38 | * @return {string[]} the input tokens appended with their bigrams. 39 | * @example 40 | * appendBigrams( [ 'he', 'acted', 'decisively', 'today' ] ); 41 | * // -> [ 'he', 42 | * // 'acted', 43 | * // 'decisively', 44 | * // 'today', 45 | * // 'he_acted', 46 | * // 'acted_decisively', 47 | * // 'decisively_today' ] 48 | */ 49 | var appendBigrams = function ( tokens ) { 50 | var i, imax; 51 | for ( i = 0, imax = tokens.length - 1; i < imax; i += 1 ) { 52 | tokens.push( tokens[ i ] + '_' + tokens[ i + 1 ] ); 53 | } 54 | return tokens; 55 | }; // appendBigrams() 56 | 57 | module.exports = appendBigrams; 58 | -------------------------------------------------------------------------------- /src/string-remove-punctuations.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | var rgx = require( './util_regexes.js' ); 29 | 30 | // ## string 31 | 32 | // ### removePunctuations 33 | /** 34 | * 35 | * Removes each punctuation mark by replacing it with a whitespace. It looks for 36 | * the following punctuations — `.,;!?:"!'... - () [] {}`. 37 | * 38 | * Extra spaces, if required, may be removed using [string.removeExtraSpaces](#stringremoveextraspaces) 39 | * function. 40 | * 41 | * @alias string#removePunctuations 42 | * @param {string} str the input string. 43 | * @return {string} input string after removal of punctuations. 44 | * @example 45 | * removePunctuations( 'Punctuations like "\'\',;!?:"!... are removed' ); 46 | * // -> 'Punctuations like are removed' 47 | */ 48 | var removePunctuations = function ( str ) { 49 | return str.replace( rgx.punctuations, ' ' ); 50 | }; // removePunctuations() 51 | 52 | module.exports = removePunctuations; 53 | -------------------------------------------------------------------------------- /src/string-remove-elisions.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | var rgx = require( './util_regexes.js' ); 29 | 30 | // ## string 31 | 32 | // ### removeElisions 33 | /** 34 | * 35 | * Removes basic elisions found in the input string. Typical example of elisions 36 | * are `it's, let's, where's, I'd, I'm, I'll, I've, and Isn't` etc. Note it retains 37 | * apostrophe used to indicate possession. 38 | * 39 | * @alias string#removeElisions 40 | * @param {string} str the input string. 41 | * @return {string} input string after removal of elisions. 42 | * @example 43 | * removeElisions( "someone's wallet, isn't it?" ); 44 | * // -> "someone's wallet, is it?" 45 | */ 46 | var removeElisions = function ( str ) { 47 | return ( str 48 | .replace( rgx.elisionsSpl, '$2' ) 49 | .replace( rgx.elisions1, '$1' ) 50 | .replace( rgx.elisions2, '$1' ) 51 | ); 52 | }; // removeElisions() 53 | 54 | module.exports = removeElisions; 55 | -------------------------------------------------------------------------------- /src/string-split-elisions.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | var rgx = require( './util_regexes.js' ); 29 | 30 | // ## string 31 | 32 | // ### splitElisions 33 | /** 34 | * 35 | * Splits basic elisions found in the input string. Typical example of elisions 36 | * are `it's, let's, where's, I'd, I'm, I'll, I've, and Isn't` etc. Note it does 37 | * not touch apostrophe used to indicate possession. 38 | * 39 | * @alias string#splitElisions 40 | * @param {string} str the input string. 41 | * @return {string} input string after splitting of elisions. 42 | * @example 43 | * splitElisions( "someone's wallet, isn't it?" ); 44 | * // -> "someone's wallet, is n't it?" 45 | */ 46 | var splitElisions = function ( str ) { 47 | return ( str 48 | .replace( rgx.elisionsSpl, '$2 $3' ) 49 | .replace( rgx.elisions1, '$1 $2' ) 50 | .replace( rgx.elisions2, '$1 $2' ) 51 | ); 52 | }; // splitElisions() 53 | 54 | module.exports = splitElisions; 55 | -------------------------------------------------------------------------------- /src/string-extract-run-of-capital-words.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | var rgx = require( './util_regexes.js' ); 29 | var trim = require( './string-trim.js' ); 30 | // ## string 31 | 32 | // ### extractRunOfCapitalWords 33 | /** 34 | * 35 | * Extracts the array of text appearing as Title Case or in ALL CAPS from the 36 | * input string. 37 | * 38 | * @alias string#extractRunOfCapitalWords 39 | * @param {string} str the input string. 40 | * @return {string[]} of text appearing in Title Case or in ALL CAPS; if no such 41 | * text is found then `null` is returned. 42 | * @example 43 | * extractRunOfCapitalWords( 'In The Terminator, Sarah Connor is in Los Angeles' ); 44 | * // -> [ 'In The Terminator', 'Sarah Connor', 'Los Angeles' ] 45 | */ 46 | var extractRunOfCapitalWords = function ( str ) { 47 | var m = str.match( rgx.rocWords ); 48 | return ( ( m ) ? m.map( trim ) : m ); 49 | }; // extractRunOfCapitalWords() 50 | 51 | module.exports = extractRunOfCapitalWords; 52 | -------------------------------------------------------------------------------- /src/string-retain-alpha-nums.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | var rgx = require( './util_regexes.js' ); 29 | 30 | // ## string 31 | 32 | // ### retainAlphaNums 33 | /** 34 | * 35 | * Retains only apha, numerals, and removes all other characters from 36 | * the input string, including leading, trailing and extra in-between 37 | * whitespaces. 38 | * 39 | * @alias string#retainAlphaNums 40 | * @param {string} str the input string. 41 | * @return {string} input string after removal of non-alphanumeric characters, 42 | * leading, trailing and extra whitespaces. 43 | * @example 44 | * retainAlphaNums( ' This, text here, has (other) chars_! ' ); 45 | * // -> 'This text here has other chars' 46 | */ 47 | var retainAlphaNums = function ( str ) { 48 | return ( str 49 | .replace( rgx.notAlphaNumeric, ' ') 50 | .replace( rgx.spaces, ' ') 51 | .trim() 52 | ); 53 | }; // retainAlphaNums() 54 | 55 | module.exports = retainAlphaNums; 56 | -------------------------------------------------------------------------------- /src/string-marker.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | 29 | // ## string 30 | 31 | // ### marker 32 | /** 33 | * 34 | * Generates `marker` of the input string; it is defined as 1-gram, sorted 35 | * and joined back as a string again. Marker is a quick and aggressive way 36 | * to detect similarity between short strings. Its aggression may lead to more 37 | * false positives such as `Meter` and `Metre` or `no melon` and `no lemon`. 38 | * 39 | * @alias string#marker 40 | * @param {string} str the input string. 41 | * @return {string} the marker. 42 | * @example 43 | * marker( 'the quick brown fox jumps over the lazy dog' ); 44 | * // -> ' abcdefghijklmnopqrstuvwxyz' 45 | */ 46 | var marker = function ( str ) { 47 | var uniqChars = Object.create( null ); 48 | for ( var i = 0, imax = str.length; i < imax; i += 1 ) { 49 | uniqChars[ str[ i ] ] = true; 50 | } 51 | return ( Object.keys( uniqChars ).sort().join('') ); 52 | }; // marker() 53 | 54 | module.exports = marker; 55 | -------------------------------------------------------------------------------- /src/dictionaries/stop_words.json: -------------------------------------------------------------------------------- 1 | [ 2 | "i", 3 | "me", 4 | "my", 5 | "myself", 6 | "we", 7 | "our", 8 | "ours", 9 | "ourselves", 10 | "you", 11 | "your", 12 | "yours", 13 | "yourself", 14 | "yourselves", 15 | "he", 16 | "him", 17 | "his", 18 | "himself", 19 | "she", 20 | "her", 21 | "hers", 22 | "herself", 23 | "it", 24 | "its", 25 | "itself", 26 | "they", 27 | "them", 28 | "their", 29 | "theirs", 30 | "themselves", 31 | "what", 32 | "which", 33 | "who", 34 | "whom", 35 | "this", 36 | "that", 37 | "these", 38 | "those", 39 | "am", 40 | "is", 41 | "are", 42 | "was", 43 | "were", 44 | "be", 45 | "been", 46 | "being", 47 | "have", 48 | "has", 49 | "had", 50 | "having", 51 | "do", 52 | "does", 53 | "did", 54 | "doing", 55 | "would", 56 | "should", 57 | "could", 58 | "ought", 59 | "i'm", 60 | "you're", 61 | "he's", 62 | "she's", 63 | "it's", 64 | "we're", 65 | "they're", 66 | "i've", 67 | "you've", 68 | "we've", 69 | "they've", 70 | "i'd", 71 | "you'd", 72 | "he'd", 73 | "she'd", 74 | "we'd", 75 | "they'd", 76 | "i'll", 77 | "you'll", 78 | "he'll", 79 | "she'll", 80 | "we'll", 81 | "they'll", 82 | "let's", 83 | "that's", 84 | "who's", 85 | "what's", 86 | "here's", 87 | "there's", 88 | "when's", 89 | "where's", 90 | "why's", 91 | "how's", 92 | "a", 93 | "an", 94 | "the", 95 | "and", 96 | "but", 97 | "if", 98 | "or", 99 | "because", 100 | "as", 101 | "until", 102 | "while", 103 | "of", 104 | "at", 105 | "by", 106 | "for", 107 | "with", 108 | "about", 109 | "against", 110 | "between", 111 | "into", 112 | "through", 113 | "during", 114 | "before", 115 | "after", 116 | "above", 117 | "below", 118 | "to", 119 | "from", 120 | "up", 121 | "down", 122 | "in", 123 | "out", 124 | "on", 125 | "off", 126 | "over", 127 | "under", 128 | "again", 129 | "further", 130 | "then", 131 | "once", 132 | "here", 133 | "there", 134 | "when", 135 | "where", 136 | "why", 137 | "how", 138 | "all", 139 | "any", 140 | "both", 141 | "each", 142 | "few", 143 | "more", 144 | "most", 145 | "other", 146 | "some", 147 | "such", 148 | "only", 149 | "own", 150 | "same", 151 | "so", 152 | "than", 153 | "too", 154 | "very" 155 | ] 156 | -------------------------------------------------------------------------------- /src/string-ngram.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | 29 | // ## string 30 | 31 | // ### ngram 32 | /** 33 | * 34 | * Generates an array of ngrams of a specified size from the input string. The 35 | * default size is 2, which means it will generate bigrams by default. 36 | * 37 | * @alias string#ngram 38 | * @param {string} str the input string. 39 | * @param {number} [size=2] ngram's size. 40 | * @return {string[]} ngrams of `size` from `str`. 41 | * @example 42 | * ngram( 'FRANCE' ); 43 | * // -> [ 'FR', 'RA', 'AN', 'NC', 'CE' ] 44 | * ngram( 'FRENCH' ); 45 | * // -> [ 'FR', 'RE', 'EN', 'NC', 'CH' ] 46 | * ngram( 'FRANCE', 3 ); 47 | * // -> [ 'FRA', 'RAN', 'ANC', 'NCE' ] 48 | */ 49 | var ngram = function ( str, size ) { 50 | var ng = ( size || 2 ), 51 | ngramz = [], 52 | tg; 53 | for ( var i = 0, imax = str.length; i < imax; i += 1 ) { 54 | tg = str.slice( i, i + ng ); 55 | if ( tg.length === ng ) ngramz.push( tg ); 56 | } 57 | return ( ngramz ); 58 | }; // ngram() 59 | 60 | module.exports = ngram; 61 | -------------------------------------------------------------------------------- /src/tokens-remove-words.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | 29 | var defaultStopWords = require( './dictionaries/stop_words.json' ); 30 | var words = require( './helper-return-words-filter.js' ); 31 | defaultStopWords = words( defaultStopWords ); 32 | 33 | // ## tokens 34 | 35 | // ### removeWords 36 | /** 37 | * 38 | * Removes the stop words from the input array of tokens. 39 | * 40 | * @alias tokens#removeWords 41 | * @param {string[]} tokens the input tokens. 42 | * @param {wordsFilter} [stopWords=defaultStopWords] default stop words are 43 | * loaded from `stop_words.json` located under the `src/dictionaries/` directory. 44 | * Custom stop words can be created using [helper.returnWordsFilter ](#helperreturnwordsfilter). 45 | * @return {string[]} balance tokens. 46 | * @example 47 | * removeWords( [ 'this', 'is', 'a', 'cat' ] ); 48 | * // -> [ 'cat' ] 49 | */ 50 | var removeWords = function ( tokens, stopWords ) { 51 | var givenStopWords = ( stopWords || defaultStopWords ); 52 | return tokens.filter( givenStopWords.exclude ); 53 | }; // removeWords() 54 | 55 | module.exports = removeWords; 56 | -------------------------------------------------------------------------------- /src/jsdoc-classes.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | /** 28 | * @classdesc Indexer object 29 | * @class Indexer 30 | * @hideconstructor 31 | */ 32 | 33 | /** 34 | * Incrementally builds index for each `element/itsIndex` combination passed. 35 | * 36 | * @method Indexer#build 37 | * @param elements 38 | * @param itsIndex 39 | */ 40 | 41 | /** 42 | * Used to access the index. This index is in a form of an object that contains 43 | * each element as key. The value of each key is an array 44 | * containing all index positions to the element in question. Note these index positions 45 | * are nothing but each `itsIndex` value passed for the `element`. 46 | * 47 | * @method Indexer#result 48 | * @returns {Object} 49 | */ 50 | 51 | /** 52 | * @classdesc WordsFilter 53 | * @class WordsFilter 54 | * @hideconstructor 55 | */ 56 | 57 | /** 58 | * Contains the set created from the array `words`. 59 | * 60 | * @method WordsFilter#set 61 | */ 62 | 63 | /** 64 | * Used with the array's filter method to exclude the `words` or mapped 65 | * `words` if `givenMappers` are defined. 66 | * 67 | * @method WordsFilter#exclude 68 | */ 69 | -------------------------------------------------------------------------------- /src/tokens-sow.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | 29 | // ## string 30 | 31 | // ### setOfWords 32 | /** 33 | * 34 | * Generates the set of words from the input string. It also has an alias **`sow()`**. 35 | * 36 | * @alias tokens#setOfWords 37 | * @param {string[]} tokens the input tokens. 38 | * @param {function} [ifn=undefined] a function to build index; it is called for 39 | * every **member word of the set **; and it receives the word and the `idx` 40 | * as input arguments. The `build()` function of [helper.returnIndexer](#helperreturnindexer) 41 | * may be used as `ifn`. If `undefined` then index is not built. 42 | * @param {number} [idx=undefined] the index; passed as the second argument to the `ifn` 43 | * function. 44 | * @return {set} of words from tokens. 45 | * @example 46 | * setOfWords( [ 'rain', 'rain', 'go', 'away' ] ); 47 | * // -> Set { 'rain', 'go', 'away' } 48 | */ 49 | var setOfWords = function ( tokens, ifn, idx ) { 50 | var tset = new Set( tokens ); 51 | if ( typeof ifn === 'function' ) { 52 | tset.forEach( function ( m ) { 53 | ifn( m, idx ); 54 | } ); 55 | } 56 | return ( tset ); 57 | }; // bow() 58 | 59 | module.exports = setOfWords; 60 | -------------------------------------------------------------------------------- /src/string-soc.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | 29 | // ## string 30 | 31 | // ### setOfChars 32 | /** 33 | * 34 | * Creates a set of chars from the input string `s`. This is useful 35 | * in even more aggressive string matching using Jaccard or Tversky compared to 36 | * `marker()`. It also has an alias **`soc()`**. 37 | * 38 | * @alias string#setOfChars 39 | * @param {string} str the input string. 40 | * @param {function} [ifn=undefined] a function to build index; it receives the first 41 | * character of `str` and the `idx` as input arguments. The `build()` function of 42 | * [helper.returnIndexer](#helperreturnindexer) may be used as `ifn`. If `undefined` 43 | * then index is not built. 44 | * @param {number} [idx=undefined] the index; passed as the second argument to the `ifn` 45 | * function. 46 | * @return {string} the soc. 47 | * @example 48 | * setOfChars( 'the quick brown fox jumps over the lazy dog' ); 49 | * // -> ' abcdefghijklmnopqrstuvwxyz' 50 | */ 51 | var setOfChars = function ( str, ifn, idx ) { 52 | var cset = new Set( str ); 53 | if ( typeof ifn === 'function' ) { 54 | ifn( str[ 0 ], idx ); 55 | } 56 | return ( cset ); 57 | }; // soc() 58 | 59 | module.exports = setOfChars; 60 | -------------------------------------------------------------------------------- /src/string-sentences.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | // Load wink-nlp package & helpers. 29 | const winkNLP = require( 'wink-nlp' ); 30 | // Load english language model — light version. 31 | const model = require( 'wink-eng-lite-web-model' ); 32 | // Instantiate winkNLP, only use SBD. 33 | const nlp = winkNLP( model, [ 'sbd' ] ); 34 | 35 | // ## string 36 | 37 | // ### sentences 38 | /** 39 | * 40 | * Detects the sentence boundaries in the input `paragraph` and splits it into 41 | * an array of sentence(s). 42 | * 43 | * @alias string#sentences 44 | * @param {string} paragraph the input string. 45 | * @return {string[]} of sentences. 46 | * @example 47 | * sentences( 'AI Inc. is focussing on AI. I work for AI Inc. My mail is r2d2@yahoo.com' ); 48 | * // -> [ 'AI Inc. is focussing on AI.', 49 | * // 'I work for AI Inc.', 50 | * // 'My mail is r2d2@yahoo.com' ] 51 | * 52 | * sentences( 'U.S.A is my birth place. I was born on 06.12.1924. I climbed Mt. Everest.' ); 53 | * // -> [ 'U.S.A is my birth place.', 54 | * // 'I was born on 06.12.1924.', 55 | * // 'I climbed Mt. Everest.' ] 56 | */ 57 | var punkt = function ( paragraph ) { 58 | // Leverage winkNLP. 59 | return nlp.readDoc( paragraph ).sentences().out(); 60 | }; // punkt() 61 | 62 | module.exports = punkt; 63 | -------------------------------------------------------------------------------- /src/name_cleaner_regexes.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | var degrees = [ 29 | /\bm\.?\s*a\b/i, 30 | /\bb\.?\s*a\b/i, 31 | /\bb\.?\s*tech\b/i, 32 | /\bm\.?\s*tech\b/i, 33 | /\bb\.?\s*des\b/i, 34 | /\bm\.?\s*des\b/i, 35 | /\bm\.?\s*b\.?\s*a\b/i, 36 | /\bm\.?\s*c\.?\s*a\b/i, 37 | /\bb\.?\s*c\.?\s*a\b/i, 38 | /\bl\.?\s*l\.?\s*b\b/i, 39 | /\bl\.?\s*l\.?\s*m\b/i, 40 | /\bm\.?\s*b\.?\s*b\.?\s*s\b/i, 41 | /\bm\.?\s*d\b/i, 42 | /\bd\.?\s*m\b/i, 43 | /\bm\.?\s*s\b/i, 44 | /\bd\.?\s*n\.?\s*b\b/i, 45 | /\bd\.?\s*g\.?\s*o\b/i, 46 | /\bd\.?\s*l\.?\s*o\b/i, 47 | /\bb\.?\s*d\.?\s*s\b/i, 48 | /\bb\.?\s*h\.?\s*m\.?\s*s\b/i, 49 | /\bb\.?\s*a\.?\s*m\.?\s*s\b/i, 50 | /\bf\.?\s*i\.?\s*c\.?\s*s\b/i, 51 | /\bm\.?\s*n\.?\s*a\.?\s*m\.?\s*s\b/i, 52 | /\bb\.?\s*e\.?\s*m\.?\s*s\b/i, 53 | /\bd\.?\s*c\.?\s*h\b/i, 54 | /\bm\.?\s*c\.?\s*h\b/i, 55 | /\bf\.?\s*r\.?\s*c\.?\s*s\b/i, 56 | /\bm\.?\s*r\.?\s*c\.?\s*p\b/i, 57 | /\bf\.?\s*i\.?\s*a\.?\s*c\.?\s*m\b/i, 58 | /\bf\.?\s*i\.?\s*m\.?\s*s\.?\s*a\b/i, 59 | /\bp\.?\s*h\.?\s*d\b/i, 60 | ]; 61 | 62 | var titleNames = [ 'mr', 'mrs', 'miss', 'ms', 'master', 'er', 'dr', 'shri', 'shrimati', 'sir' ]; 63 | 64 | var titles = new RegExp( '^(?:' + titleNames.join( '|' ) + ')$', 'i' ); 65 | 66 | module.exports = { 67 | degrees: degrees, 68 | titles: titles 69 | }; 70 | -------------------------------------------------------------------------------- /src/string-tokenize0.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | var removeElisions = require( './string-remove-elisions.js' ); 29 | var amplifyNotElision = require( './string-amplify-not-elision.js' ); 30 | var rgx = require( './util_regexes.js' ); 31 | 32 | // ## string 33 | 34 | // ### tokenize0 35 | /** 36 | * 37 | * Tokenizes by splitting the input string on **non-words**. This means tokens would 38 | * consists of only alphas, numerals and underscores; all other characters will 39 | * be stripped as they are treated as separators. It also removes all elisions; 40 | * however negations are retained and amplified. 41 | * 42 | * @alias string#tokenize0 43 | * @param {string} str the input string. 44 | * @return {string[]} of tokens. 45 | * @example 46 | * tokenize0( "someone's wallet, isn't it?" ); 47 | * // -> [ 'someone', 's', 'wallet', 'is', 'not', 'it' ] 48 | */ 49 | var tokenize0 = function ( str ) { 50 | var tokens = removeElisions( amplifyNotElision( str ) ) 51 | .replace( rgx.cannot, '$1 $2' ) 52 | .split( rgx.nonWords ); 53 | // Check the 0th and last element of array for empty string because if 54 | // fisrt/last characters are non-words then these will be empty stings! 55 | if ( tokens[ 0 ] === '' ) tokens.shift(); 56 | if ( tokens[ tokens.length - 1 ] === '' ) tokens.pop(); 57 | return tokens; 58 | }; // tokenize0() 59 | 60 | module.exports = tokenize0; 61 | -------------------------------------------------------------------------------- /src/helper-return-indexer.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | 29 | // ## string 30 | 31 | // ### returnIndexer 32 | 33 | /** 34 | * 35 | * Returns an Indexer object that contains two functions. The first function `build()` 36 | * incrementally builds an index for each `element` using `itsIndex` — both passed as 37 | * parameters to it. The second function — `result()` allows accessing the index anytime. 38 | * 39 | * It is typically used with [string.soc](#stringsoc), [string.bong](#stringbong), 40 | * [string.song](#stringsong), and [tokens.sow](#tokenssow). 41 | * 42 | * @alias helper#returnIndexer 43 | * @return {indexer} used to build and access the index. 44 | * @example 45 | * var indexer = returnIndexer(); 46 | * // -> { build: [function], result: [function] } 47 | */ 48 | var returnIndexer = function () { 49 | var theIndex = Object.create( null ); 50 | var methods = Object.create( null ); 51 | 52 | // Builds index by adding the `element` and `itsIndex`. The `itsIndex` should 53 | // be a valid JS array index; no validation checks are performed while building 54 | // index. 55 | var build = function ( element, itsIndex ) { 56 | theIndex[ element ] = theIndex[ element ] || []; 57 | theIndex[ element ].push( itsIndex ); 58 | return true; 59 | }; // build() 60 | 61 | // Returns the index built so far. 62 | var result = function () { 63 | return theIndex; 64 | }; // result() 65 | 66 | methods.build = build; 67 | methods.result = result; 68 | 69 | return methods; 70 | }; // index() 71 | 72 | module.exports = returnIndexer; 73 | -------------------------------------------------------------------------------- /src/string-extract-persons-name.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | var rgx = require( './util_regexes.js' ); 29 | var ncrgx = require( './name_cleaner_regexes.js' ); 30 | 31 | // ## string 32 | 33 | // ### extractPersonsName 34 | /** 35 | * 36 | * Attempts to extract person's name from input string. 37 | * It assmues the following name format:
38 | * `[] []`
39 | * Entities in square brackets are optional. Note, it is not a 40 | * named entity detection mechanism. 41 | * 42 | * @alias string#extractPersonsName 43 | * @param {string} str the input string. 44 | * @return {string} extracted name. 45 | * @example 46 | * extractPersonsName( 'Dr. Sarah Connor M. Tech., PhD. - AI' ); 47 | * // -> 'Sarah Connor' 48 | */ 49 | var extractPersonsName = function ( str ) { 50 | // Remove Degrees by making the list of indexes of each degree and subsequently 51 | // finding the minimum and slicing from there! 52 | var indexes = ncrgx.degrees.map( function ( r ) { 53 | var m = r.exec( str ); 54 | return ( m ) ? m.index : 999999; 55 | } ); 56 | var sp = Math.min.apply( null, indexes ); 57 | 58 | // Generate an Array of Every Elelemnt of Name (e.g. title, first name, 59 | // sir name, honours, etc) 60 | var aeen = str.slice( 0, sp ).replace( rgx.notAlpha, ' ').replace( rgx.spaces, ' ').trim().split(' '); 61 | // Remove titles from the beginning. 62 | while ( aeen.length && ncrgx.titles.test( aeen[0] ) ) aeen.shift(); 63 | return aeen.join(' '); 64 | }; // extractPersonsName() 65 | 66 | module.exports = extractPersonsName; 67 | -------------------------------------------------------------------------------- /src/string-song.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | 29 | // ## string 30 | 31 | // ### setOfNGrams 32 | /** 33 | * 34 | * Generates the set of ngrams of `size` from the input string. The 35 | * default size is 2, which means it will generate set of bigrams by default. 36 | * It also has an alias **`song()`**. 37 | * 38 | * @alias string#setOfNGrams 39 | * @param {string} str the input string. 40 | * @param {number} [size=2] ngram size. 41 | * @param {function} [ifn=undefined] a function to build index; it is called for 42 | * every **unique occurrence of ngram** of `str`; and it receives the ngram and the `idx` 43 | * as input arguments. The `build()` function of [helper.returnIndexer](#helperreturnindexer) 44 | * may be used as `ifn`. If `undefined` then index is not built. 45 | * @param {number} [idx=undefined] the index; passed as the second argument to the `ifn` 46 | * function. 47 | * @return {set} of ngrams of `size` of `str`. 48 | * @example 49 | * setOfNGrams( 'mama' ); 50 | * // -> Set { 'ma', 'am' } 51 | * song( 'mamma' ); 52 | * // -> Set { 'ma', 'am', 'mm' } 53 | */ 54 | var setOfNGrams = function ( str, size, ifn, idx ) { 55 | var ng = ( size || 2 ), 56 | ngSet = new Set(), 57 | tg; 58 | for ( var i = 0, imax = str.length; i < imax; i += 1 ) { 59 | tg = str.slice( i, i + ng ); 60 | if ( tg.length === ng ) { 61 | if ( ( typeof ifn === 'function' ) && !ngSet.has( tg ) ) { 62 | ifn( tg, idx ); 63 | } 64 | ngSet.add( tg ); 65 | } 66 | } 67 | return ( ngSet ); 68 | }; // song() 69 | 70 | module.exports = setOfNGrams; 71 | -------------------------------------------------------------------------------- /src/string-edge-ngrams.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | 29 | // ## string 30 | 31 | // ### edgeNGrams 32 | /** 33 | * 34 | * Generates the edge ngrams from the input string. 35 | * 36 | * @alias string#edgeNGrams 37 | * @param {string} str the input string. 38 | * @param {number} [min=2] size of ngram generated. 39 | * @param {number} [max=8] size of ngram is generated. 40 | * @param {number} [delta=2] edge ngrams are generated in increments of this value. 41 | * @param {function} [ifn=undefined] a function to build index; it is called for 42 | * every edge ngram of `str`; and it receives the edge ngram and the `idx` 43 | * as input arguments. The `build()` function of [helper.returnIndexer](#helperreturnindexer) 44 | * may be used as `ifn`. If `undefined` then index is not built. 45 | * @param {number} [idx=undefined] the index; passed as the second argument to the `ifn` 46 | * function. 47 | * @return {string[]} of edge ngrams. 48 | * @example 49 | * edgeNGrams( 'decisively' ); 50 | * // -> [ 'de', 'deci', 'decisi', 'decisive' ] 51 | * edgeNGrams( 'decisively', 8, 10, 1 ); 52 | * // -> [ 'decisive', 'decisivel', 'decisively' ] 53 | */ 54 | var edgeNGrams = function ( str, min, max, delta, ifn, idx ) { 55 | var dlta = ( delta || 2 ), 56 | eg, 57 | egs = [], 58 | imax = Math.min( ( max || 8 ), str.length ) + 1, 59 | start = ( min || 2 ); 60 | 61 | // Generate edge ngrams 62 | for ( var i = start; i < imax; i += dlta ) { 63 | eg = str.slice( 0, i ); 64 | egs.push( eg ); 65 | if ( typeof ifn === 'function' ) { 66 | ifn( eg, idx ); 67 | } 68 | } 69 | return ( egs ); 70 | }; // edgeNGrams() 71 | 72 | module.exports = edgeNGrams; 73 | -------------------------------------------------------------------------------- /src/string-compose-corpus.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | var helpers = require( 'wink-helpers' ); 29 | var returnQuotedTextExtractor = require( './helper-return-quoted-text-extractor.js' ); 30 | var extractQuotedText = returnQuotedTextExtractor( '[', ']' ); 31 | // ## string 32 | 33 | // ### composeCorpus 34 | /** 35 | * 36 | * Generates all possible sentences from the input argument string. 37 | * The string s must follow a special syntax as illustrated in the 38 | * example below:
39 | * `'[I] [am having|have] [a] [problem|question]'`
40 | * 41 | * Each phrase must be quoted between `[ ]` and each possible option of phrases 42 | * (if any) must be separated by a `|` character. The corpus is composed by 43 | * computing the cartesian product of all the phrases. 44 | * 45 | * @alias string#composeCorpus 46 | * @param {string} str the input string. 47 | * @return {string[]} of all possible sentences. 48 | * @example 49 | * composeCorpus( '[I] [am having|have] [a] [problem|question]' ); 50 | * // -> [ 'I am having a problem', 51 | * // 'I am having a question', 52 | * // 'I have a problem', 53 | * // 'I have a question' ] 54 | */ 55 | var composeCorpus = function ( str ) { 56 | if ( !str || ( typeof str !== 'string' ) ) return []; 57 | 58 | var quotedTextElems = extractQuotedText( str ); 59 | var corpus = []; 60 | var finalCorpus = []; 61 | 62 | if ( !quotedTextElems ) return []; 63 | quotedTextElems.forEach( function ( e ) { 64 | corpus.push( e.split( '|' ) ); 65 | } ); 66 | 67 | helpers.array.product( corpus ).forEach( function ( e ) { 68 | finalCorpus.push( e.join( ' ' ) ); 69 | } ); 70 | return ( finalCorpus ); 71 | }; // composeCorpus() 72 | 73 | module.exports = composeCorpus; 74 | -------------------------------------------------------------------------------- /src/tokens-propagate-negations.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | var rgx = require( './util_regexes.js' ); 29 | 30 | // ## string 31 | 32 | // ### propagateNegations 33 | /** 34 | * 35 | * It looks for negation tokens in the input array of tokens and propagates 36 | * negation to subsequent `upto` tokens by prefixing them by a `!`. It is useful 37 | * in handling text containing negations during tasks like similarity detection, 38 | * classification or search. 39 | * 40 | * @alias tokens#propagateNegations 41 | * @param {string[]} tokens the input tokens. 42 | * @param {number} [upto=2] number of tokens to be negated after the negation 43 | * token. Note, tokens are only negated either `upto` tokens or up to the token 44 | * preceeding the **`, . ; : ! ?`** punctuations. 45 | * @return {string[]} tokens with negation propagated. 46 | * @example 47 | * propagateNegations( [ 'mary', 'is', 'not', 'feeling', 'good', 'today' ] ); 48 | * // -> [ 'mary', 'is', 'not', '!feeling', '!good', 'today' ] 49 | */ 50 | var propagateNegations = function ( tokens, upto ) { 51 | var i, imax, j, jmax; 52 | var tkns = tokens; 53 | var limit = upto || 2; 54 | for ( i = 0, imax = tkns.length; i < imax; i += 1 ) { 55 | if ( rgx.negations.test( tkns[ i ] ) ) { 56 | for ( j = i + 1, jmax = Math.min( imax, i + limit + 1 ); j < jmax; j += 1 ) { 57 | // Hit a punctuation mark, break out of the loop otherwise go *upto the limit*. 58 | // > TODO: promote to utilities regex, after test cases have been added. 59 | if ( ( /[\,\.\;\:\!\?]/ ).test( tkns[ j ] ) ) break; 60 | // Propoage negation: invert the token by prefixing a `!` to it. 61 | tkns[ j ] = '!' + tkns[ j ]; 62 | } 63 | i = j; 64 | } 65 | } 66 | return tkns; 67 | }; // propagateNegations() 68 | 69 | module.exports = propagateNegations; 70 | -------------------------------------------------------------------------------- /src/phonetize_regexes.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | /* eslint no-underscore-dangle: "off" */ 29 | var rgx = Object.create( null ); 30 | // Remove repeating characters. 31 | rgx.repeatingChars = /([^c])\1/g; 32 | // Drop first character from character pairs, if found in the beginning. 33 | rgx.kngnPairs = /^(kn|gn|pn|ae|wr)/; 34 | // Drop vowels that are not found in the beginning. 35 | rgx.__vowels = /(?!^)[aeiou]/g; 36 | // Replaces `ough` in the end by 'f' 37 | rgx.ough = /ough$/; 38 | // Replace following 3 instances of `dg` by `j`. 39 | rgx.dge = /dge/g; 40 | rgx.dgi = /dgi/g; 41 | rgx.dgy = /dgy/g; 42 | // Replace `sch` by `sk`. 43 | rgx.sch = /sch/g; 44 | // Drop `c` in `sci, sce, scy`. 45 | rgx.sci = /sci/g; 46 | rgx.sce = /sce/g; 47 | rgx.scy = /scy/g; 48 | // Make 'sh' out of `tio & tia`. 49 | rgx.tio = /tio/g; 50 | rgx.tia = /tia/g; 51 | // `t` is silent in `tch`. 52 | rgx.tch = /tch/g; 53 | // Drop `b` in the end if preceeded by `m`. 54 | rgx.mb_ = /mb$/; 55 | // These are pronounced as `k`. 56 | rgx.cq = /cq/g; 57 | rgx.ck = /ck/g; 58 | // Here `c` sounds like `s` 59 | rgx.ce = /ce/g; 60 | rgx.ci = /ci/g; 61 | rgx.cy = /cy/g; 62 | // And this `f`. 63 | rgx.ph = /ph/g; 64 | // The `sh` finally replaced by `x`. 65 | rgx.sh = /sh|sio|sia/g; 66 | // This is open rgx - TODO: need to finalize. 67 | rgx.vrnotvy = /([aeiou])(r)([^aeiouy])/g; 68 | // `th` sounds like theta - make it 0. 69 | rgx.th = /th/g; 70 | // `c` sounds like `k` except when it is followed by `h`. 71 | rgx.cnoth = /(c)([^h])/g; 72 | // Even `q` sounds like `k`. 73 | rgx.q = /q/g; 74 | // The first `x` sounds like `s`. 75 | rgx._x = /^x/; 76 | // Otherwise `x` is more like `ks`. 77 | rgx.x = /x/g; 78 | // Drop `y` if not followed by a vowel or appears in the end. 79 | rgx.ynotv = /(y)([^aeiou])/g; 80 | rgx.y_ = /y$/; 81 | // `z` is `s`. 82 | rgx.z = /z/g; 83 | 84 | // Export rgx. 85 | module.exports = rgx; 86 | -------------------------------------------------------------------------------- /src/string-bong.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | 29 | // ## string 30 | 31 | // ### bagOfNGrams 32 | /** 33 | * 34 | * Generates the bag of ngrams of `size` from the input string. The 35 | * default size is 2, which means it will generate bag of bigrams by default. It 36 | * also has an alias **`bong()`**. 37 | * 38 | * @alias string#bagOfNGrams 39 | * @param {string} str the input string. 40 | * @param {number} [size=2] ngram size. 41 | * @param {function} [ifn=undefined] a function to build index; it is called for 42 | * every **unique occurrence of ngram** of `str`; and it receives the ngram and the `idx` 43 | * as input arguments. The `build()` function of [helper.returnIndexer](#helperreturnindexer) 44 | * may be used as `ifn`. If `undefined` then index is not built. 45 | * @param {number} [idx=undefined] the index; passed as the second argument to the `ifn` 46 | * function. 47 | * @return {object} bag of ngrams of `size` from `str`. 48 | * @example 49 | * bagOfNGrams( 'mama' ); 50 | * // -> { ma: 2, am: 1 } 51 | * bong( 'mamma' ); 52 | * // -> { ma: 2, am: 1, mm: 1 } 53 | */ 54 | var bagOfNGrams = function ( str, size, ifn, idx ) { 55 | var ng = ( size || 2 ), 56 | ngBOW = Object.create( null ), 57 | tg; 58 | for ( var i = 0, imax = str.length; i < imax; i += 1 ) { 59 | tg = str.slice( i, i + ng ); 60 | if ( tg.length === ng ) { 61 | // Call `ifn` iff its defined and `tg` is appearing for the first time; 62 | // this avoids multiple calls to `ifn`. Strategy applies to `song()`, 63 | // and `bow()`. 64 | if ( ( typeof ifn === 'function' ) && !ngBOW[ tg ] ) { 65 | ifn( tg, idx ); 66 | } 67 | // Now define, if required and then update counts. 68 | ngBOW[ tg ] = 1 + ( ngBOW[ tg ] || 0 ); 69 | } 70 | } 71 | return ( ngBOW ); 72 | }; // bong() 73 | 74 | module.exports = bagOfNGrams; 75 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 6 | 7 | ## Our Standards 8 | 9 | Examples of behavior that contributes to creating a positive environment include: 10 | 11 | * Using welcoming and inclusive language 12 | * Being respectful of differing viewpoints and experiences 13 | * Gracefully accepting constructive criticism 14 | * Focusing on what is best for the community 15 | * Showing empathy towards other community members 16 | 17 | Examples of unacceptable behavior by participants include: 18 | 19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances 20 | * Trolling, insulting/derogatory comments, and personal or political attacks 21 | * Public or private harassment 22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission 23 | * Other conduct which could reasonably be considered inappropriate in a professional setting 24 | 25 | ## Our Responsibilities 26 | 27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. 28 | 29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. 30 | 31 | ## Scope 32 | 33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 34 | 35 | ## Enforcement 36 | 37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at ContactUs@graype.in. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. 38 | 39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. 40 | 41 | ## Attribution 42 | 43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] 44 | 45 | [homepage]: http://contributor-covenant.org 46 | [version]: http://contributor-covenant.org/version/1/4/ 47 | -------------------------------------------------------------------------------- /src/helper-return-words-filter.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | 29 | // ## string 30 | 31 | // ### returnWordsFilter 32 | 33 | /** 34 | * 35 | * Returns an object containing the following functions: (a) `set()`, which returns 36 | * a set of mapped words given in the input array `words`. (b) `exclude()` that 37 | * is suitable for array filtering operations. 38 | * 39 | * If the second argument `mappers` is provided as an array of maping functions 40 | * then these are applied on the input array before converting into a set. A 41 | * mapper function must accept a string as argument and return a string as the result. 42 | * Examples of mapper functions are typically **string** functionss of **`wink-nlp-utils`** 43 | * such as `string.lowerCase()`, `string.stem()` and 44 | * `string.soundex()`. 45 | * 46 | * @alias helper#returnWordsFilter 47 | * @param {string[]} words that can be filtered using the returned wordsFilter. 48 | * @param {function[]} [mappers=undefined] optionally used to map each word before creating 49 | * the wordsFilter. 50 | * @return {wordsFilter} object containg `set()` and `exclude()` functions for `words`. 51 | * @example 52 | * var stopWords = [ 'This', 'That', 'Are', 'Is', 'Was', 'Will', 'a' ]; 53 | * var myFilter = returnWordsFilter( stopWords, [ string.lowerCase ] ); 54 | * [ 'this', 'is', 'a', 'cat' ].filter( myFilter.exclude ); 55 | * // -> [ 'cat' ] 56 | */ 57 | var returnWordsFilter = function ( words, mappers ) { 58 | var mappedWords = words; 59 | var givenMappers = mappers || []; 60 | givenMappers.forEach( function ( m ) { 61 | mappedWords = mappedWords.map( m ); 62 | } ); 63 | 64 | mappedWords = new Set( mappedWords ); 65 | 66 | var exclude = function ( t ) { 67 | return ( !( mappedWords.has( t ) ) ); 68 | }; // exclude() 69 | 70 | var set = function () { 71 | return mappedWords; 72 | }; // set() 73 | 74 | return { 75 | set: set, 76 | exclude: exclude 77 | }; 78 | }; // returnWordsFilter() 79 | 80 | module.exports = returnWordsFilter; 81 | -------------------------------------------------------------------------------- /src/tokens-bow.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | 29 | // ## string 30 | 31 | // ### bagOfWords 32 | /** 33 | * 34 | * Generates the bag of words from the input string. By default it 35 | * uses `word count` as it's frequency; but if `logCounts` parameter is set to true then 36 | * it will use `log2( word counts + 1 )` as it's frequency. It also has an alias **`bow()`**. 37 | * 38 | * @alias tokens#bagOfWords 39 | * @param {string[]} tokens the input tokens. 40 | * @param {number} [logCounts=false] a true value flags the use of `log2( word count + 1 )` 41 | * instead of just `word count` as frequency. 42 | * @param {function} [ifn=undefined] a function to build index; it is called for 43 | * every **unique occurrence of word** in `tokens`; and it receives the word and the `idx` 44 | * as input arguments. The `build()` function of [helper.returnIndexer](#helperreturnindexer) 45 | * may be used as `ifn`. If `undefined` then index is not built. 46 | * @param {number} [idx=undefined] the index; passed as the second argument to the `ifn` 47 | * function. 48 | * @return {object} bag of words from tokens. 49 | * @example 50 | * bagOfWords( [ 'rain', 'rain', 'go', 'away' ] ); 51 | * // -> { rain: 2, go: 1, away: 1 } 52 | * bow( [ 'rain', 'rain', 'go', 'away' ], true ); 53 | * // -> { rain: 1.584962500721156, go: 1, away: 1 } 54 | */ 55 | var bagOfWords = function ( tokens, logCounts, ifn, idx ) { 56 | var bow1 = Object.create( null ), 57 | i, imax, 58 | token, 59 | words; 60 | for ( i = 0, imax = tokens.length; i < imax; i += 1 ) { 61 | token = tokens[ i ]; 62 | if ( ( typeof ifn === 'function' ) && !bow1[ token ] ) { 63 | ifn( token, idx ); 64 | } 65 | bow1[ token ] = 1 + ( bow1[ token ] || 0 ); 66 | } 67 | if ( !logCounts ) return ( bow1 ); 68 | words = Object.keys( bow1 ); 69 | for ( i = 0, imax = words.length; i < imax; i += 1 ) { 70 | // Add `1` to ensure non-zero count! (Note: log2(1) is 0) 71 | bow1[ words[ i ] ] = Math.log2( bow1[ words[ i ] ] + 1 ); 72 | } 73 | return ( bow1 ); 74 | }; // bow() 75 | 76 | module.exports = bagOfWords; 77 | -------------------------------------------------------------------------------- /test/string-edge-ngrams-specs.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | var chai = require( 'chai' ); 29 | var mocha = require( 'mocha' ); 30 | var edgeGrams = require( '../src/string-edge-ngrams.js' ); 31 | var index = require( '../src/helper-return-indexer.js' ); 32 | 33 | var expect = chai.expect; 34 | var describe = mocha.describe; 35 | var it = mocha.it; 36 | 37 | // ### Define common errors. 38 | // These are common test data for `null`, `undefined`, and `numeric` inputs 39 | // across all the functions included in the script. 40 | // The exception cases specific to the function are part of the test script of the function. 41 | var errors = [ 42 | { whenInputIs: null, expectedOutputIs: /^Cannot read.*/ }, 43 | { whenInputIs: undefined, expectedOutputIs: /^Cannot read.*/ }, 44 | { whenInputIs: 1, expectedOutputIs: /is not a function$/ } 45 | ]; 46 | 47 | 48 | // ### Create bong test cases. 49 | 50 | describe( 'string.edgeGrams()', function () { 51 | var tests = [ 52 | { whenInputIs: [ '' ], expectedOutputIs: [] }, 53 | { whenInputIs: [ 'decisively' ], expectedOutputIs: [ 'de', 'deci', 'decisi', 'decisive' ] }, 54 | { whenInputIs: [ 'decisively', 8, 10, 1 ], expectedOutputIs: [ 'decisive', 'decisivel', 'decisively' ] } 55 | ]; 56 | 57 | tests.forEach( function ( test ) { 58 | it( 'should return ' + JSON.stringify( test.expectedOutputIs ) + ' if the input is ' + JSON.stringify( test.whenInputIs ), function () { 59 | expect( edgeGrams( ...test.whenInputIs ) ).to.deep.equal( test.expectedOutputIs ); 60 | } ); 61 | } ); 62 | 63 | it( 'indexer result should return an index of 2-grams of rachna & archna', function () { 64 | var bongIndex = index(); 65 | edgeGrams( 'decision', 4, 8, 2, bongIndex.build, 'decision' ); 66 | edgeGrams( 'decisive', 4, 8, 2, bongIndex.build, 'decisive' ); 67 | var result = bongIndex.result(); 68 | expect( result ).to.deep.equal( { deci: [ 'decision', 'decisive' ], decisi: [ 'decision', 'decisive' ], decision: [ 'decision' ], decisive: [ 'decisive' ] } ); 69 | } ); 70 | 71 | errors.slice( 0, 2 ).forEach( function ( error ) { 72 | it( 'should throw ' + error.expectedOutputIs + ' if the input is ' + JSON.stringify( error.whenInputIs ), function () { 73 | expect( edgeGrams.bind( null, error.whenInputIs ) ).to.throw( error.expectedOutputIs ); 74 | } ); 75 | } ); 76 | } ); 77 | -------------------------------------------------------------------------------- /src/string-tokenize.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | // Load wink-nlp package & helpers. 29 | const winkNLP = require( 'wink-nlp' ); 30 | // Load english language model — light version. 31 | const model = require( 'wink-eng-lite-web-model' ); 32 | // Instantiate winkNLP, only use tokenization. 33 | const nlp = winkNLP( model, [] ); 34 | const its = nlp.its; 35 | 36 | // ## string 37 | 38 | // ### tokenize 39 | /** 40 | * 41 | * Tokenizes the input `sentence` according to the value of `detailed` flag. 42 | * Any occurance of `...` in the `sentence` is 43 | * converted to ellipses. In `detailed = true` mode, it 44 | * tags every token with its type; the supported tags are word, number, url, email, 45 | * mention, hashtag, emoji, emoticon, time, ordinal, currency, punctuation, symbol, 46 | * and tabCFLF. 47 | * 48 | * @alias string#tokenize 49 | * @param {string} sentence the input string. 50 | * @param {boolean} [detailed=false] if true, each token is a object cotaining 51 | * `value` and `tag` of each token; otherwise each token is a string. It's default 52 | * value of **false** ensures compatibility with previous version. 53 | * @return {(string[]|object[])} an array of strings if `detailed` is false otherwise 54 | * an array of objects. 55 | * @example 56 | * tokenize( "someone's wallet, isn't it? I'll return!" ); 57 | * // -> [ 'someone', '\'s', 'wallet', ',', 'is', 'n\'t', 'it', '?', 58 | * // 'I', '\'ll', 'return', '!' ] 59 | * 60 | * tokenize( 'For details on wink, check out http://winkjs.org/ URL!', true ); 61 | * // -> [ { value: 'For', tag: 'word' }, 62 | * // { value: 'details', tag: 'word' }, 63 | * // { value: 'on', tag: 'word' }, 64 | * // { value: 'wink', tag: 'word' }, 65 | * // { value: ',', tag: 'punctuation' }, 66 | * // { value: 'check', tag: 'word' }, 67 | * // { value: 'out', tag: 'word' }, 68 | * // { value: 'http://winkjs.org/', tag: 'url' }, 69 | * // { value: 'URL', tag: 'word' }, 70 | * // { value: '!', tag: 'punctuation' } ] 71 | */ 72 | var tokenize = function ( sentence, detailed ) { 73 | const doc = nlp.readDoc( sentence.replace( '...', '…' ) ); 74 | const tokens = []; 75 | 76 | if ( detailed ) { 77 | doc.tokens().each( ( t ) => { 78 | tokens.push( { value: t.out(), tag: t.out( its.type ) } ); 79 | } ); 80 | 81 | return tokens; 82 | } 83 | 84 | return doc.tokens().out(); 85 | }; // tokenize() 86 | 87 | module.exports = tokenize; 88 | -------------------------------------------------------------------------------- /src/helper-return-quoted-text-extractor.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | 29 | // ## string 30 | 31 | // ### returnQuotedTextExtractor 32 | 33 | /** 34 | * 35 | * Returns a function that extracts all occurrences of every quoted text 36 | * between the `lq` and the `rq` characters from its argument. This argument 37 | * must be of type string. 38 | * 39 | * @alias helper#returnQuotedTextExtractor 40 | * @param {string} [lq='"'] the left quote character. 41 | * @param {string} [rq='"'] the right quote character. 42 | * @return {function} that will accept an input string argument and return an 43 | * array of all substrings that are quoted between `lq` and `rq`. 44 | * @example 45 | * var extractQuotedText = returnQuotedTextExtractor(); 46 | * extractQuotedText( 'Raise 2 issues - "fix a bug" & "run tests"' ); 47 | * // -> [ 'fix a bug', 'run tests' ] 48 | */ 49 | var returnQuotedTextExtractor = function ( lq, rq ) { 50 | var // Index variable for *for-loop* 51 | i, 52 | // Set defaults for left quote, if required. 53 | lq1 = ( ( lq && ( typeof lq === 'string' ) ) ? lq : '"' ), 54 | // Extracts its length 55 | lqLen = lq1.length, 56 | // The regular expression is created here. 57 | regex = null, 58 | // The string containing the regular expression builds here. 59 | rgxStr = '', 60 | // Set defaults for right quote, if required. 61 | rq1 = ( ( rq && ( typeof rq === 'string' ) ) ? rq : lq1 ), 62 | // Extract its length. 63 | rqLen = rq1.length; 64 | 65 | // Build `rgxStr` 66 | for ( i = 0; i < lqLen; i += 1 ) rgxStr += '\\' + lq1.charAt( i ); 67 | rgxStr += '.*?'; 68 | for ( i = 0; i < rqLen; i += 1 ) rgxStr += '\\' + rq1.charAt( i ); 69 | // Create regular expression. 70 | regex = new RegExp( rgxStr, 'g' ); 71 | // Return the extractor function. 72 | return ( function ( s ) { 73 | if ( !s || ( typeof s !== 'string' ) ) return null; 74 | var // Extracted elements are captured here. 75 | elements = [], 76 | // Extract matches with quotes 77 | matches = s.match( regex ); 78 | if ( !matches || ( matches.length === 0 ) ) return null; 79 | // Collect elements after removing the quotes. 80 | for ( var k = 0, kmax = matches.length; k < kmax; k += 1 ) { 81 | elements.push( matches[ k ].substr( lqLen, matches[ k ].length - ( rqLen + lqLen ) ) ); 82 | } 83 | return ( elements ); 84 | } ); 85 | }; // returnQuotedTextExtractor() 86 | 87 | module.exports = returnQuotedTextExtractor; 88 | -------------------------------------------------------------------------------- /src/string-phonetize.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | var phnrgx = require( './phonetize_regexes.js' ); 29 | /* eslint no-underscore-dangle: "off" */ 30 | 31 | // ## string 32 | 33 | // ### phonetize 34 | /** 35 | * 36 | * Phonetizes the input string using an algorithmic adaptation of Metaphone; It 37 | * is not an exact implementation of Metaphone. 38 | * 39 | * @alias string#phonetize 40 | * @param {string} word the input word. 41 | * @return {string} phonetic code of `word`. 42 | * @example 43 | * phonetize( 'perspective' ); 44 | * // -> 'prspktv' 45 | * phonetize( 'phenomenon' ); 46 | * // -> 'fnmnn' 47 | */ 48 | var phonetize = function ( word ) { 49 | var p = word.toLowerCase(); 50 | // Remove repeating letters. 51 | p = p.replace( phnrgx.repeatingChars, '$1'); 52 | // Drop first character of `kgknPairs`. 53 | if ( phnrgx.kngnPairs.test( p ) ) { 54 | p = p.substr( 1, p.length - 1 ); 55 | } 56 | // Run Regex Express now! 57 | p = p 58 | // Change `ough` in the end as `f`, 59 | .replace( phnrgx.ough, 'f' ) 60 | // Change `dg` to `j`, in `dge, dgi, dgy`. 61 | .replace( phnrgx.dge, 'je' ) 62 | .replace( phnrgx.dgi, 'ji' ) 63 | .replace( phnrgx.dgy, 'jy' ) 64 | // Change `c` to `k` in `sch` 65 | .replace( phnrgx.sch, 'sk' ) 66 | // Drop `c` in `sci, sce, scy`. 67 | .replace( phnrgx.sci, 'si' ) 68 | .replace( phnrgx.sce, 'se' ) 69 | .replace( phnrgx.scy, 'sy' ) 70 | // Drop `t` if it appears as `tch`. 71 | .replace( phnrgx.tch, 'ch' ) 72 | // Replace `tio & tia` by `sh`. 73 | .replace( phnrgx.tio, 'sh' ) 74 | .replace( phnrgx.tia, 'sh' ) 75 | // Drop `b` if it appears as `mb` in the end. 76 | .replace( phnrgx.mb_, 'm' ) 77 | // Drop `r` if it preceeds a vowel and not followed by a vowel or `y` 78 | // .replace( rgx.vrnotvy, '$1$3' ) 79 | // Replace `c` by `s` in `ce, ci, cy`. 80 | .replace( phnrgx.ce, 'se' ) 81 | .replace( phnrgx.ci, 'si' ) 82 | .replace( phnrgx.cy, 'sy' ) 83 | // Replace `cq` by `q`. 84 | .replace( phnrgx.cq, 'q' ) 85 | // Replace `ck` by `k`. 86 | .replace( phnrgx.ck, 'k' ) 87 | // Replace `ph` by `f`. 88 | .replace( phnrgx.ph, 'f' ) 89 | // Replace `th` by `0` (theta look alike!). 90 | .replace( phnrgx.th, '0' ) 91 | // Replace `c` by `k` if it is not followed by `h`. 92 | .replace( phnrgx.cnoth, 'k$2' ) 93 | // Replace `q` by `k`. 94 | .replace( phnrgx.q, 'k' ) 95 | // Replace `x` by `s` if it appears in the beginning. 96 | .replace( phnrgx._x, 's' ) 97 | // Other wise replace `x` by `ks`. 98 | .replace( phnrgx.x, 'ks' ) 99 | // Replace `sh, sia, sio` by `x`. Needs to be done post `x` processing! 100 | .replace( phnrgx.sh, 'x' ) 101 | // Drop `y` if it is now followed by a **vowel**. 102 | .replace( phnrgx.ynotv, '$2' ) 103 | .replace( phnrgx.y_, '' ) 104 | // Replace `z` by `s`. 105 | .replace( phnrgx.z, 's' ) 106 | // Drop all **vowels** excluding the first one. 107 | .replace( phnrgx.__vowels, '' ); 108 | 109 | return ( p ); 110 | }; // phonetize() 111 | 112 | module.exports = phonetize; 113 | -------------------------------------------------------------------------------- /src/util_regexes.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | var rgx = Object.create( null ); 29 | 30 | // Matches standard english punctuations in a text. 31 | rgx.punctuations = /[\’\'\‘\’\`\“\”\"\[\]\(\)\{\}\…\,\.\!\;\?\/\-\:]/ig; 32 | // End Of Sentence Punctuations - useful for splitting text into sentences. 33 | rgx.eosPunctuations = /([\.\?\!])\s*(?=[a-z]|\s+\d)/gi; 34 | 35 | // Matches special characters: `* + % # @ ^ = ~ | \` in a text. 36 | rgx.splChars = /[\*\+\%\#\@\^\=\~\|\\]/ig; 37 | 38 | // Matches common english elisions including n't. 39 | // These are special ones as 's otherwise may be apostrophe! 40 | rgx.elisionsSpl = /(\b)(it|let|that|who|what|here|there|when|where|why|how)(\'s)\b/gi; 41 | // Single (1) character elisions. 42 | rgx.elisions1 = /([a-z])(\'d|\'m)\b/gi; 43 | // Two (2) character elisions. 44 | rgx.elisions2 = /([a-z])(\'ll|\'ve|\'re|n\'t)\b/gi; 45 | // Sperate not elision 'nt. 46 | rgx.notElision = /([a-z])(n\'t)\b/gi; 47 | // Specially handle cannot 48 | rgx.cannot = /\b(can)(not)\b/gi; 49 | 50 | // Matches space, tab, or new line characters in text. 51 | rgx.spaces = /\s+/ig; 52 | // Matches anything other than space, tab, or new line characters. 53 | rgx.notSpace = /\S/g; 54 | // Matches alpha and space characters in a text. 55 | rgx.alphaSpace = /[a-z\s]/ig; 56 | // Matches alphanumerals and space characters in a text. 57 | rgx.alphaNumericSpace = /[a-z0-9\s]/ig; 58 | // Matches non alpha characters in a text. 59 | rgx.notAlpha = /[^a-z]/ig; 60 | // Matches non alphanumerals in a text. 61 | rgx.notAlphaNumeric = /[^a-z0-9]/ig; 62 | // Matches one or more non-words characters. 63 | rgx.nonWords = /\W+/ig; 64 | // Matches complete negation token 65 | rgx.negations = /^(never|none|not|no)$/i; 66 | 67 | // Matches run of capital words in a text. 68 | rgx.rocWords = /(?:\b[A-Z][A-Za-z]*\s*){2,}/g; 69 | 70 | // Matches integer, decimal, JS floating point numbers in a text. 71 | rgx.number = /[0-9]*\.[0-9]+e[\+\-]{1}[0-9]+|[0-9]*\.[0-9]+|[0-9]+/ig; 72 | 73 | // Matches time in 12 hour am/pm format in a text. 74 | rgx.timeIn12HrAMPM = /(?:[0-9]|0[0-9]|1[0-2])((:?:[0-5][0-9])){0,1}\s?(?:[aApP][mM])/ig; 75 | 76 | // Matches HTML tags - in fact any thing enclosed in angular brackets including 77 | // the brackets. 78 | rgx.htmlTags = /(?:<[^>]*>)/g; 79 | // Matches the HTML Esc Sequences 80 | // Esc Seq of type `<` or ` ` 81 | rgx.htmlEscSeq1 = /(?:&[a-z]{2,6};)/gi; 82 | // Esc Seq of type ` ` 83 | rgx.htmlEscSeq2 = /(?:&#[0-9]{2,4};)/gi; 84 | 85 | // Tests if a given string is possibly in the Indian mobile telephone number format. 86 | rgx.mobileIndian = /^(0|\+91)?[789]\d{9}$/; 87 | // Tests if a given string is in the valid email format. 88 | rgx.email = /^(([^<>()[\]\\.,;:\s@"]+(\.[^<>()[\]\\.,;:\s@"]+)*)|(".+"))@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))$/; 89 | 90 | // Extracts any number and text from a format text. 91 | // Useful in extracting value and UoM from strings like `2.7 Kgs`. 92 | rgx.separateNumAndText = /([0-9]*\.[0-9]+e[\+\-]{1}[0-9]+|[0-9]*\.[0-9]+|[0-9]+)[\s]*(.*)/i; 93 | 94 | // Crude date parser for a string containg date in a valid format. 95 | // > TODO: Need to improve this one! 96 | rgx.date = /(\d+)/ig; 97 | 98 | // Following 3 regexes are specially coded for `tokenize()` in prepare_text. 99 | // Matches punctuations that are not a part of a number. 100 | rgx.nonNumPunctuations = /[\.\,\-](?=\D)/gi; 101 | rgx.otherPunctuations = /[\’\'\‘\’\`\“\”\"\[\]\(\)\{\}\…\!\;\?\/\:]/ig; 102 | // > TODO: Add more currency symbols here. 103 | rgx.currency = /[\$\£\¥\€]/ig; 104 | 105 | // 106 | module.exports = rgx; 107 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # wink-nlp-utils 3 | 4 | NLP Functions for amplifying negations, managing elisions, creating ngrams, stems, phonetic codes to tokens and more. 5 | 6 | ### [![Build Status](https://app.travis-ci.com/winkjs/wink-nlp-utils.svg?branch=master)](https://app.travis-ci.com/github/winkjs/wink-nlp-utils) [![Coverage Status](https://coveralls.io/repos/github/winkjs/wink-nlp-utils/badge.svg?branch=master)](https://coveralls.io/github/winkjs/wink-nlp-utils?branch=master) [![Gitter](https://img.shields.io/gitter/room/nwjs/nw.js.svg)](https://gitter.im/winkjs/Lobby) 7 | 8 | [](http://wink.org.in/) 9 | 10 | Prepare raw text for Natural Language Processing (NLP) using **`wink-nlp-utils`**. It offers a set of [APIs](http://wink.org.in/wink-nlp-utils/) to work on [strings](http://wink.org.in/wink-nlp-utils/#string) such as names, sentences, paragraphs and [tokens](http://wink.org.in/wink-nlp-utils/#tokens) represented as an array of strings/words. They perform the required pre-processing for many ML tasks such as [semantic search](https://www.npmjs.com/package/wink-bm25-text-search), and [classification](https://www.npmjs.com/package/wink-naive-bayes-text-classifier). 11 | 12 |

👉🏽

13 | We recommend using winkNLP for core natural language processing tasks.

It performs Tokenization, Sentence Boundary Detection, and Named Entity Recognition at a blazing fast speeds. It supports all your text processing needs starting from Sentiment Analysis, POS Tagging, Lemmatization, Stemming, Stop Word Removal, Negation Handling, Bigrams to Frequency Table Creation and more.

WinkNLP features user-friendly declarative APIs for Iteration, Filtering, and Text Visualization, and runs on web browsers. 14 |
15 | 16 | ### Installation 17 | Use [npm](https://www.npmjs.com/package/wink-nlp-utils) to install: 18 | ``` 19 | npm install wink-nlp-utils --save 20 | ``` 21 | 22 | 23 | ### Getting Started 24 | The `wink-nlp-utils` provides over **36 utility functions** for Natural Language Processing tasks. Some representative examples are extracting person's name from a string, compose training corpus for a chat bot, sentence boundary detection, tokenization and stop words removal: 25 | ```javascript 26 | 27 | // Load wink-nlp-utils 28 | var nlp = require( 'wink-nlp-utils' ); 29 | 30 | // Extract person's name from a string: 31 | var name = nlp.string.extractPersonsName( 'Dr. Sarah Connor M. Tech., PhD. - AI' ); 32 | console.log( name ); 33 | // -> 'Sarah Connor' 34 | 35 | // Compose all possible sentences from a string: 36 | var str = '[I] [am having|have] [a] [problem|question]'; 37 | console.log( nlp.string.composeCorpus( str ) ); 38 | // -> [ 'I am having a problem', 39 | // -> 'I am having a question', 40 | // -> 'I have a problem', 41 | // -> 'I have a question' ] 42 | 43 | // Sentence Boundary Detection. 44 | var para = 'AI Inc. is focussing on AI. I work for AI Inc. My mail is r2d2@yahoo.com'; 45 | console.log( nlp.string.sentences( para ) ); 46 | // -> [ 'AI Inc. is focussing on AI.', 47 | // 'I work for AI Inc.', 48 | // 'My mail is r2d2@yahoo.com' ] 49 | 50 | // Tokenize a sentence. 51 | var s = 'For details on wink, check out http://winkjs.org/ URL!'; 52 | console.log( nlp.string.tokenize( s, true ) ); 53 | // -> [ { value: 'For', tag: 'word' }, 54 | // { value: 'details', tag: 'word' }, 55 | // { value: 'on', tag: 'word' }, 56 | // { value: 'wink', tag: 'word' }, 57 | // { value: ',', tag: 'punctuation' }, 58 | // { value: 'check', tag: 'word' }, 59 | // { value: 'out', tag: 'word' }, 60 | // { value: 'http://winkjs.org/', tag: 'url' }, 61 | // { value: 'URL', tag: 'word' }, 62 | // { value: '!', tag: 'punctuation' } ] 63 | 64 | // Remove stop words: 65 | var t = nlp.tokens.removeWords( [ 'mary', 'had', 'a', 'little', 'lamb' ] ); 66 | console.log( t ); 67 | // -> [ 'mary', 'little', 'lamb' ] 68 | 69 | ``` 70 | 71 | Try [experimenting with these examples on Runkit](https://npm.runkit.com/wink-nlp-utils) in the browser. 72 | 73 | ### Documentation 74 | Check out the [wink NLP utilities API](http://winkjs.org/wink-nlp-utils/) documentation to learn more. 75 | 76 | ### Need Help? 77 | If you spot a bug and the same has not yet been reported, raise a new [issue](https://github.com/winkjs/wink-nlp-utils/issues) or consider fixing it and sending a pull request. 78 | 79 | ### About wink 80 | [Wink](http://winkjs.org/) is a family of open source packages for **Statistical Analysis**, **Natural Language Processing** and **Machine Learning** in NodeJS. The code is **thoroughly documented** for easy human comprehension and has a **test coverage of ~100%** for reliability to build production grade solutions. 81 | 82 | 83 | ### Copyright & License 84 | **wink-nlp-utils** is copyright 2017-22 [GRAYPE Systems Private Limited](http://graype.in/). 85 | 86 | It is licensed under the terms of the MIT License. 87 | -------------------------------------------------------------------------------- /src/wink-nlp-utils.js: -------------------------------------------------------------------------------- 1 | // wink-nlp-utils 2 | // NLP Functions for amplifying negations, managing elisions, 3 | // creating ngrams, stems, phonetic codes to tokens and more. 4 | // 5 | // Copyright (C) GRAYPE Systems Private Limited 6 | // 7 | // This file is part of “wink-nlp-utils”. 8 | // 9 | // Permission is hereby granted, free of charge, to any person obtaining a 10 | // copy of this software and associated documentation files (the "Software"), 11 | // to deal in the Software without restriction, including without limitation 12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense, 13 | // and/or sell copies of the Software, and to permit persons to whom the 14 | // Software is furnished to do so, subject to the following conditions: 15 | // 16 | // The above copyright notice and this permission notice shall be included 17 | // in all copies or substantial portions of the Software. 18 | // 19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 25 | // DEALINGS IN THE SOFTWARE. 26 | 27 | // 28 | var porter2Stemmer = require( 'wink-porter2-stemmer' ); 29 | 30 | // ### Prepare Name Space 31 | 32 | // Create prepare name space. 33 | var prepare = Object.create( null ); 34 | 35 | /** 36 | * Helper 37 | * @namespace helper 38 | */ 39 | prepare.helper = Object.create( null ); 40 | 41 | // Words 42 | prepare.helper.returnWordsFilter = require( './helper-return-words-filter.js' ); 43 | prepare.helper.words = prepare.helper.returnWordsFilter; 44 | // Make better **alias** name for the `word()` function. 45 | 46 | // Index 47 | prepare.helper.index = require( './helper-return-indexer.js' ); 48 | // Make better **alias** name for the `index()` function. 49 | prepare.helper.returnIndexer = prepare.helper.index; 50 | 51 | // Return Quoted Text Extractor 52 | prepare.helper.returnQuotedTextExtractor = require( './helper-return-quoted-text-extractor.js' ); 53 | 54 | /** 55 | * String 56 | * @namespace string 57 | */ 58 | prepare.string = Object.create( null ); 59 | 60 | // Lower Case 61 | prepare.string.lowerCase = require( './string-lower-case.js' ); 62 | // Upper Case 63 | prepare.string.upperCase = require( './string-upper-case.js' ); 64 | // Trim 65 | prepare.string.trim = require( './string-trim.js' ); 66 | // Remove Extra Spaces 67 | prepare.string.removeExtraSpaces = require( './string-remove-extra-spaces.js' ); 68 | // Retain Alpha-numerics 69 | prepare.string.retainAlphaNums = require( './string-retain-alpha-nums.js' ); 70 | // Extract Person's Name 71 | prepare.string.extractPersonsName = require( './string-extract-persons-name.js' ); 72 | // Extract Run of Capital Words 73 | prepare.string.extractRunOfCapitalWords = require( './string-extract-run-of-capital-words.js' ); 74 | // Remove Punctuations 75 | prepare.string.removePunctuations = require( './string-remove-punctuations.js' ); 76 | // Remove Special Chars 77 | prepare.string.removeSplChars = require( './string-remove-spl-chars.js' ); 78 | // Remove HTML Tags 79 | prepare.string.removeHTMLTags = require( './string-remove-html-tags.js' ); 80 | // Remove Elisions 81 | prepare.string.removeElisions = require( './string-remove-elisions.js' ); 82 | // Split Elisions 83 | prepare.string.splitElisions = require( './string-split-elisions.js' ); 84 | // Amplify Not Elision 85 | prepare.string.amplifyNotElision = require( './string-amplify-not-elision' ); 86 | // Marker 87 | prepare.string.marker = require( './string-marker.js' ); 88 | // SOC 89 | prepare.string.soc = require( './string-soc.js' ); 90 | prepare.string.setOfChars = require( './string-soc.js' ); 91 | // NGrams 92 | prepare.string.ngram = require( './string-ngram.js' ); 93 | // Edge NGrams 94 | prepare.string.edgeNGrams = require( './string-edge-ngrams.js' ); 95 | // BONG 96 | prepare.string.bong = require( './string-bong.js' ); 97 | prepare.string.bagOfNGrams = require( './string-bong.js' ); 98 | // SONG 99 | prepare.string.song = require( './string-song.js' ); 100 | prepare.string.setOfNGrams = require( './string-song.js' ); 101 | // Sentences 102 | prepare.string.sentences = require( './string-sentences.js' ); 103 | // Compose Corpus 104 | prepare.string.composeCorpus = require( './string-compose-corpus.js' ); 105 | // Tokenize0 106 | prepare.string.tokenize0 = require( './string-tokenize0.js' ); 107 | // Tokenize 108 | prepare.string.tokenize = require( './string-tokenize.js' ); 109 | // #### Stem 110 | prepare.string.stem = porter2Stemmer; 111 | // Phonetize 112 | prepare.string.phonetize = require( './string-phonetize.js' ); 113 | // Soundex 114 | prepare.string.soundex = require( './string-soundex.js' ); 115 | 116 | /** 117 | * Tokens 118 | * @namespace tokens 119 | */ 120 | prepare.tokens = Object.create( null ); 121 | 122 | // Stem 123 | prepare.tokens.stem = require( './tokens-stem.js' ); 124 | // Phonetize 125 | prepare.tokens.phonetize = require( './tokens-phonetize.js' ); 126 | // Soundex 127 | prepare.tokens.soundex = require( './tokens-soundex.js' ); 128 | // Remove Words 129 | prepare.tokens.removeWords = require( './tokens-remove-words.js' ); 130 | // BOW 131 | prepare.tokens.bow = require( './tokens-bow.js' ); 132 | prepare.tokens.bagOfWords = require( './tokens-bow.js' ); 133 | // SOW 134 | prepare.tokens.sow = require( './tokens-sow.js' ); 135 | prepare.tokens.setOfWords = require( './tokens-sow.js' ); 136 | // Propagate Negations 137 | prepare.tokens.propagateNegations = require( './tokens-propagate-negations.js' ); 138 | // Bigrams 139 | prepare.tokens.bigrams = require( './tokens-bigrams.js' ); 140 | // Append Bigrams 141 | prepare.tokens.appendBigrams = require( './tokens-append-bigrams.js' ); 142 | 143 | // Export prepare. 144 | module.exports = prepare; 145 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Wink 2 | 3 | Thank you for taking time to contribute. We are delighted to receive contributions from the community. For wink every contribution matters — whether you are reporting a **bug**, posting a **question**, submitting a **pull request** or updating the **documentation**. 4 | 5 | ## Getting Started 6 | 1. Fork the repository from github 7 | 2. Develop your code changes 8 | 3. Ensure that the API is properly documented 9 | 4. Capture the logic in comments 10 | 4. Ensure proper linting via `npm run pretest` 11 | 5. Run tests using `npm run test` 12 | 6. Make sure coverage either stays at the current levels or improves 13 | 7. Commit your changes in compliance with commit guidelines 14 | 8. Push to your fork 15 | 9. Sign the CLA if you are contributing for the first time 16 | 10. Finally, submit a pull request. 17 | 18 | 19 | ## Code of Conduct 20 | By contributing, you are expected to uphold [wink’s code of conduct](CODE_OF_CONDUCT.md). In essence, each one of us should: 21 | 22 | 1. respect fellow contributors, irrespective of their level of experience, race, religion, gender, sexual orientation, and age; 23 | 2. collaborate constructively; 24 | 3. never engage in any form of offense, harassment, insult, personal attack, provocation and/or use of inappropriate language; 25 | 26 | 27 | 28 | ## Things to know 29 | ### About Wink 30 | Wink is a growing open source project focusing on **Natural Language Processing**, **Machine Learning** and **Statistics**. It contains multiple repositories or packages. All packages expose consistent and uniform APIs, thus minimizing the need to learn a new interface for each task. Do take out some time in understanding the structure of APIs, before attempting any enhancements. In wink, we prefer **functions** and **closures** over objects. 31 | 32 | Like artisans, we too need a toolset and process to create beautiful software. The process is orchestrated by [Travis CI](https://travis-ci.org/) in accordance to the configuration files present in each repository. The details and tools used are outlined below. 33 | 34 | 35 | ### Linting 36 | Well defined linting rules helps us in making code more consistent and avoid bugs. [ESLint](https://eslint.org) enforces these rules via its configuration file. This file is located in the root of each repository. 37 | 38 | 39 | ### Documenting 40 | We believe that the documentation must not only explain the API but also narrate the story of logic, algorithms and references used. Wink uses the [JSDoc](https://jsdoc.app/) standard for API documentation and [Literate-Programming Standards](https://en.wikipedia.org/wiki/Literate_programming) for documenting the logic using [docker](http://jbt.github.io/docker/src/docker.js.html). The API documentation quality is measured using [Inch CI](https://inch-ci.org/) and we expect that your contribution will improve or maintain the current levels. 41 | 42 | ### Testing 43 | Wink requires a test coverage of **atleast > 99.5%** and aims for 100%. Any new contribution must maintain the existing test coverage level. We use [Chai](http://chaijs.com/), [Mocha](https://mochajs.org/) and [Istanbul](https://istanbul.js.org/), [Coveralls](https://coveralls.io/) to run tests and determine coverage. 44 | 45 | ### Committing 46 | We follow [commit guidelines](https://github.com/angular/angular.js/blob/master/DEVELOPERS.md#commits) from the Google's [Angular Project](https://angular.io/), whose documentation is licensed under [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/). See important excerpts for quick reference below: 47 | 48 | #### Commit Message Format 49 | Each commit message consists of a **header**, a **body** and a **footer**. The header has a special format that includes a **type**, a **scope** and a **subject**: 50 | 51 | (): 52 | 53 | 54 | 55 |