├── .npmignore
├── docs
├── menu.svg
├── close.svg
├── scripts
│ ├── collapse.js
│ ├── linenumber.js
│ ├── prettify
│ │ ├── lang-css.js
│ │ └── Apache-License-2.0.txt
│ └── search.js
├── styles
│ └── prettify.css
├── string-lower-case.js.html
├── string-trim.js.html
├── string-upper-case.js.html
└── string-stem.js.html
├── .travis.yml
├── .nycrc.json
├── .jsdoc.json
├── .gitignore
├── LICENSE
├── package.json
├── runkit
└── example.js
├── src
├── string-lower-case.js
├── string-upper-case.js
├── string-trim.js
├── string-stem.js
├── tokens-stem.js
├── tokens-phonetize.js
├── string-soundex.js
├── tokens-soundex.js
├── string-amplify-not-elision.js
├── string-remove-extra-spaces.js
├── string-remove-spl-chars.js
├── tokens-bigrams.js
├── string-remove-html-tags.js
├── tokens-append-bigrams.js
├── string-remove-punctuations.js
├── string-remove-elisions.js
├── string-split-elisions.js
├── string-extract-run-of-capital-words.js
├── string-retain-alpha-nums.js
├── string-marker.js
├── dictionaries
│ └── stop_words.json
├── string-ngram.js
├── tokens-remove-words.js
├── jsdoc-classes.js
├── tokens-sow.js
├── string-soc.js
├── string-sentences.js
├── name_cleaner_regexes.js
├── string-tokenize0.js
├── helper-return-indexer.js
├── string-extract-persons-name.js
├── string-song.js
├── string-edge-ngrams.js
├── string-compose-corpus.js
├── tokens-propagate-negations.js
├── phonetize_regexes.js
├── string-bong.js
├── helper-return-words-filter.js
├── tokens-bow.js
├── string-tokenize.js
├── helper-return-quoted-text-extractor.js
├── string-phonetize.js
├── util_regexes.js
└── wink-nlp-utils.js
├── CODE_OF_CONDUCT.md
├── test
└── string-edge-ngrams-specs.js
├── README.md
├── CONTRIBUTING.md
└── .eslintrc.json
/.npmignore:
--------------------------------------------------------------------------------
1 | test
2 | docs
3 | sourcedocs
4 | .eslintrc.json
5 | .jshintrc
6 | .travis.yml
7 |
8 | # Generic
9 | .node_repl_history
10 | .npm
11 | coverage
12 |
--------------------------------------------------------------------------------
/docs/menu.svg:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: node_js
2 | node_js:
3 | - "16.17.1"
4 |
5 |
6 | cache:
7 | directories:
8 | - "node_modules"
9 |
10 | script:
11 | - npm run pretest
12 | - npm run test
13 | - npm run coverage
14 |
--------------------------------------------------------------------------------
/docs/close.svg:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/docs/scripts/collapse.js:
--------------------------------------------------------------------------------
1 | function hideAllButCurrent(){
2 | //by default all submenut items are hidden
3 | $("nav > ul > li > ul li").hide();
4 |
5 | //only current page (if it exists) should be opened
6 | var file = window.location.pathname.split("/").pop();
7 | $("nav > ul > li > a[href^='"+file+"']").parent().find("> ul li").show();
8 | }
9 | $( document ).ready(function() {
10 | hideAllButCurrent();
11 | });
--------------------------------------------------------------------------------
/.nycrc.json:
--------------------------------------------------------------------------------
1 | {
2 | "watermarks": {
3 | "branches": [
4 | 90,
5 | 99.5
6 | ],
7 | "lines": [
8 | 90,
9 | 99.5
10 | ],
11 | "functions": [
12 | 90,
13 | 99.5
14 | ],
15 | "statements": [
16 | 90,
17 | 99.5
18 | ]
19 | },
20 | "branches": 99.5,
21 | "lines": 99.5,
22 | "functions": 99.5,
23 | "statements": 99.5,
24 | "check-coverage": true
25 | }
--------------------------------------------------------------------------------
/.jsdoc.json:
--------------------------------------------------------------------------------
1 | {
2 | "opts": {
3 | "encoding": "utf8",
4 | "readme": "README.md",
5 | "template": "./node_modules/docdash/",
6 | "destination": "docs"
7 | },
8 | "plugins": ["plugins/markdown"],
9 | "docdash": {
10 | "meta": {
11 | "title": "wink-nlp-utils - Wink JS"
12 | },
13 | "menu":{
14 | "Github":{
15 | "href":"https://github.com/winkjs/wink-nlp-utils",
16 | "target":"_blank"
17 | }
18 | }
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Logs
2 | logs
3 | *.log
4 | npm-debug.log*
5 |
6 | # Runtime data
7 | pids
8 | *.pid
9 | *.seed
10 |
11 | # Coverage directory used by tools like istanbul
12 | coverage
13 |
14 | # nyc test coverage
15 | .nyc_output
16 |
17 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
18 | .grunt
19 |
20 | # node-waf configuration
21 | .lock-wscript
22 |
23 | # Compiled binary addons (http://nodejs.org/api/addons.html)
24 | build/Release
25 |
26 | # Dependency directories
27 | node_modules
28 | jspm_packages
29 |
30 | # Optional npm cache directory
31 | .npm
32 |
33 | # Optional REPL history
34 | .node_repl_history
35 |
36 | # Documentation
37 | sourcedocs
38 |
39 | # Mac stuff
40 | .DS_Store
41 |
--------------------------------------------------------------------------------
/docs/scripts/linenumber.js:
--------------------------------------------------------------------------------
1 | /*global document */
2 | (function() { // Tags each <li> of the first source listing with an id ("line1", "line2", ...) and marks the one named by the URL hash.
3 | var source = document.getElementsByClassName('prettyprint source linenums');
4 | var i = 0;
5 | var lineNumber = 0;
6 | var lineId;
7 | var lines;
8 | var totalLines;
9 | var anchorHash;
10 | // Nothing happens unless at least one element carries all three classes; only the first such element is processed.
11 | if (source && source[0]) {
12 | anchorHash = document.location.hash.substring(1); // URL fragment without its leading "#"
13 | lines = source[0].getElementsByTagName('li');
14 | totalLines = lines.length;
15 |
16 | for (; i < totalLines; i++) {
17 | lineNumber++; // ids are 1-based, so this is always i + 1
18 | lineId = 'line' + lineNumber;
19 | lines[i].id = lineId;
20 | if (lineId === anchorHash) {
21 | lines[i].className += ' selected'; // flag the row the URL fragment points at
22 | }
23 | }
24 | }
25 | })();
26 |
--------------------------------------------------------------------------------
/docs/scripts/prettify/lang-css.js:
--------------------------------------------------------------------------------
1 | PR.registerLangHandler(PR.createSimpleLexer([["pln",/^[\t\n\f\r ]+/,null," \t\r\n"]],[["str",/^"(?:[^\n\f\r"\\]|\\(?:\r\n?|\n|\f)|\\[\S\s])*"/,null],["str",/^'(?:[^\n\f\r'\\]|\\(?:\r\n?|\n|\f)|\\[\S\s])*'/,null],["lang-css-str",/^url\(([^"')]*)\)/i],["kwd",/^(?:url|rgb|!important|@import|@page|@media|@charset|inherit)(?=[^\w-]|$)/i,null],["lang-css-kw",/^(-?(?:[_a-z]|\\[\da-f]+ ?)(?:[\w-]|\\\\[\da-f]+ ?)*)\s*:/i],["com",/^\/\*[^*]*\*+(?:[^*/][^*]*\*+)*\//],["com",
2 | /^(?:<\!--|--\>)/],["lit",/^(?:\d+|\d*\.\d+)(?:%|[a-z]+)?/i],["lit",/^#[\da-f]{3,6}/i],["pln",/^-?(?:[_a-z]|\\[\da-f]+ ?)(?:[\w-]|\\\\[\da-f]+ ?)*/i],["pun",/^[^\s\w"']+/]]),["css"]);PR.registerLangHandler(PR.createSimpleLexer([],[["kwd",/^-?(?:[_a-z]|\\[\da-f]+ ?)(?:[\w-]|\\\\[\da-f]+ ?)*/i]]),["css-kw"]);PR.registerLangHandler(PR.createSimpleLexer([],[["str",/^[^"')]+/]]),["css-str"]);
3 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | The MIT License
2 |
3 | Copyright (c) 2017-22 GRAYPE Systems Private Limited
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy of
6 | this software and associated documentation files (the "Software"), to deal in
7 | the Software without restriction, including without limitation the rights to
8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
9 | of the Software, and to permit persons to whom the Software is furnished to do
10 | so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 |
--------------------------------------------------------------------------------
/docs/styles/prettify.css:
--------------------------------------------------------------------------------
1 | .pln {
2 | color: #abb2bf;
3 | }
4 |
5 | /* string content */
6 | .str {
7 | color: #98c379;
8 | }
9 |
10 | /* a keyword */
11 | .kwd {
12 | color: #c678dd;
13 | }
14 |
15 | /* a comment */
16 | .com {
17 | color: #5c6370;
18 | font-style: italic;
19 | }
20 |
21 | /* a type name */
22 | .typ {
23 | color: #d19a66;
24 | }
25 |
26 | /* a literal value */
27 | .lit {
28 | color: #56b6c2;
29 | }
30 |
31 | /* punctuation */
32 | .pun {
33 | color: #abb2bf;
34 | }
35 |
36 | /* lisp open bracket */
37 | .opn {
38 | color: #000000;
39 | }
40 |
41 | /* lisp close bracket */
42 | .clo {
43 | color: #000000;
44 | }
45 |
46 | /* a markup tag name */
47 | .tag {
48 | color: #c678dd;
49 | }
50 |
51 | /* a markup attribute name */
52 | .atn {
53 | color: #98c379;
54 | }
55 |
56 | /* a markup attribute value */
57 | .atv {
58 | color: #d19a66;
59 | }
60 |
61 | /* a declaration */
62 | .dec {
63 | color: #EF5050;
64 | }
65 |
66 | /* a variable name */
67 | .var {
68 | color: #d19a66;
69 | }
70 |
71 | /* a function name */
72 | .fun {
73 | color: #e06c75;
74 | }
75 |
76 | /* Specify class=linenums on a pre to get line numbering */
77 | ol.linenums {
78 | margin-top: 0;
79 | margin-bottom: 0;
80 | }
81 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "wink-nlp-utils",
3 | "version": "2.1.0",
4 | "description": "NLP Functions for amplifying negations, managing elisions, creating ngrams, stems, phonetic codes to tokens and more.",
5 | "keywords": [
6 | "Tokenize",
7 | "Stem",
8 | "NGrams",
9 | "Bag of Words",
10 | "Phonetize",
11 | "Soundex",
12 | "Stop Words",
13 | "Sentence Breaking",
14 | "Regex",
15 | "NLP",
16 | "Natural Language Processing"
17 | ],
18 | "main": "src/wink-nlp-utils.js",
19 | "scripts": {
20 | "pretest": "npm run lint && npm run docs",
21 | "test": "nyc --reporter=html --reporter=text mocha ./test/",
22 | "coverage": "nyc report --reporter=text-lcov | coveralls",
23 | "sourcedocs": "docker -i src -o ./sourcedocs --sidebar no",
24 | "docs": "jsdoc src/*.js -c .jsdoc.json",
25 | "lint": "eslint ./src/*.js ./test/*.js ./runkit/*.js"
26 | },
27 | "repository": {
28 | "type": "git",
29 | "url": "git+https://github.com/winkjs/wink-nlp-utils.git"
30 | },
31 | "author": "Sanjaya Kumar Saxena",
32 | "license": "MIT",
33 | "bugs": {
34 | "url": "https://github.com/winkjs/wink-nlp-utils/issues"
35 | },
36 | "homepage": "http://winkjs.org/",
37 | "devDependencies": {
38 | "chai": "^4.3.6",
39 | "coveralls": "^3.1.1",
40 | "docdash": "github:winkjs/docdash",
41 | "docker": "^1.0.0",
42 | "eslint": "^8.25.0",
43 | "jsdoc": "^3.6.11",
44 | "mocha": "^10.0.0",
45 | "nyc": "^15.1.0"
46 | },
47 | "dependencies": {
48 | "wink-distance": "^2.0.1",
49 | "wink-eng-lite-web-model": "^1.4.3",
50 | "wink-helpers": "^2.0.0",
51 | "wink-nlp": "^1.12.0",
52 | "wink-porter2-stemmer": "^2.0.1",
53 | "wink-tokenizer": "^5.2.3"
54 | },
55 | "runkitExampleFilename": "./runkit/example.js"
56 | }
57 |
--------------------------------------------------------------------------------
/runkit/example.js:
--------------------------------------------------------------------------------
1 | // RunKit example: load wink-nlp-utils, then try a few of its helpers.
2 | var nlp = require( 'wink-nlp-utils' );
3 |
4 | // Extract a person's name from a string:
5 | var name = nlp.string.extractPersonsName( 'Dr. Sarah Connor M. Tech., PhD. - AI' );
6 | console.log( name ); // eslint-disable-line no-console
7 | // -> 'Sarah Connor'
8 |
9 | // Compose all possible sentences from a string:
10 | var str = '[I] [am having|have] [a] [problem|question]';
11 | console.log( nlp.string.composeCorpus( str ) ); // eslint-disable-line no-console
12 | // -> [ 'I am having a problem',
13 | // 'I am having a question',
14 | // 'I have a problem',
15 | // 'I have a question' ]
16 |
17 | // Sentence boundary detection:
18 | var para = 'AI Inc. is focussing on AI. I work for AI Inc. My mail is r2d2@yahoo.com';
19 | console.log( nlp.string.sentences( para ) ); // eslint-disable-line no-console
20 | // -> [ 'AI Inc. is focussing on AI.',
21 | // 'I work for AI Inc.',
22 | // 'My mail is r2d2@yahoo.com' ]
23 |
24 | // Tokenize a sentence; the second argument `true` requests tagged tokens as shown below:
25 | var s = 'For details on wink, check out http://winkjs.org/ URL!';
26 | console.log( nlp.string.tokenize( s, true ) ); // eslint-disable-line no-console
27 | // -> [ { value: 'For', tag: 'word' },
28 | // { value: 'details', tag: 'word' },
29 | // { value: 'on', tag: 'word' },
30 | // { value: 'wink', tag: 'word' },
31 | // { value: ',', tag: 'punctuation' },
32 | // { value: 'check', tag: 'word' },
33 | // { value: 'out', tag: 'word' },
34 | // { value: 'http://winkjs.org/', tag: 'url' },
35 | // { value: 'URL', tag: 'word' },
36 | // { value: '!', tag: 'punctuation' } ]
37 |
38 | // Remove stop words:
39 | var t = nlp.tokens.removeWords( [ 'mary', 'had', 'a', 'little', 'lamb' ] );
40 | console.log( t ); // eslint-disable-line no-console
41 | // -> [ 'mary', 'little', 'lamb' ]
42 |
--------------------------------------------------------------------------------
/src/string-lower-case.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 |
29 | // ## string
30 |
31 | // ### lowerCase
32 | /**
33 | *
34 | * Converts the input string to lower case.
35 | * Delegates to `str.toLowerCase()`; `str` is not validated, so a value lacking that method throws.
36 | * @alias string#lowerCase
37 | * @param {string} str the input string.
38 | * @return {string} input string in lower case.
39 | * @example
40 | * lowerCase( 'Lower Case' );
41 | * // -> 'lower case'
42 | */
43 | var lowerCase = function ( str ) {
44 | return ( str.toLowerCase() );
45 | }; // lowerCase()
46 |
47 | module.exports = lowerCase;
48 |
--------------------------------------------------------------------------------
/src/string-upper-case.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 |
29 | // ## string
30 |
31 | // ### upperCase
32 | /**
33 | *
34 | * Converts the input string to upper case.
35 | * Delegates to `str.toUpperCase()`; `str` is not validated, so a value lacking that method throws.
36 | * @alias string#upperCase
37 | * @param {string} str the input string.
38 | * @return {string} input string in upper case.
39 | * @example
40 | * upperCase( 'Upper Case' );
41 | * // -> 'UPPER CASE'
42 | */
43 | var upperCase = function ( str ) {
44 | return ( str.toUpperCase() );
45 | }; // upperCase()
46 |
47 | module.exports = upperCase;
48 |
--------------------------------------------------------------------------------
/src/string-trim.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 |
29 | // ## string
30 |
31 | // ### trim
32 | /**
33 | *
34 | * Trims leading and trailing whitespace from the input string.
35 | * Delegates to `str.trim()`; `str` is not validated, so a value lacking that method throws.
36 | * @alias string#trim
37 | * @param {string} str the input string.
38 | * @return {string} input string with leading & trailing whitespace removed.
39 | * @example
40 | * trim( ' Padded ' );
41 | * // -> 'Padded'
42 | */
43 | var trim = function ( str ) {
44 | return ( str.trim() );
45 | }; // trim()
46 |
47 | module.exports = trim;
48 |
--------------------------------------------------------------------------------
/src/string-stem.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 | var porter2Stemmer = require( 'wink-porter2-stemmer' );
29 |
30 | // ## string
31 |
32 | // ### stem
33 | /**
34 | *
35 | * Stems an inflected word using the Porter2 stemming algorithm.
36 | * The work is done by the `wink-porter2-stemmer` module; `word` is handed to it unchanged.
37 | * @alias string#stem
38 | * @param {string} word to be stemmed.
39 | * @return {string} the stemmed word.
40 | *
41 | * @example
42 | * stem( 'consisting' );
43 | * // -> 'consist'
44 | */
45 | var stem = function ( word ) {
46 | return ( porter2Stemmer( word ) );
47 | }; // stem()
48 |
49 | module.exports = stem;
50 |
--------------------------------------------------------------------------------
/docs/scripts/search.js:
--------------------------------------------------------------------------------
1 | $( document ).ready(function() {
2 | var searchAttr = 'data-search-mode';
3 | jQuery.expr[':'].Contains = function(a,i,m){
4 | return (a.textContent || a.innerText || "").toUpperCase().indexOf(m[3].toUpperCase())>=0;
5 | };
6 | //on search
7 | $("#nav-search").on("keyup", function(event) {
8 | var search = $(this).val();
9 |
10 | if (!search) {
11 | //no search, show all results
12 | document.documentElement.removeAttribute(searchAttr);
13 | $("nav > ul > li").not('.level-hide').show();
14 |
15 | if(typeof hideAllButCurrent === "function"){
16 | //let's do what ever collapse wants to do
17 | hideAllButCurrent();
18 | }
19 | else{
20 | //menu by default should be opened
21 | $("nav > ul > li > ul li").show();
22 | }
23 | }
24 | else{
25 | //we are searching
26 | document.documentElement.setAttribute(searchAttr, '');
27 |
28 | //show all parents
29 | $("nav > ul > li").show();
30 | //hide all results
31 | $("nav > ul > li > ul li").hide();
32 | //show results matching filter
33 | $("nav > ul > li > ul").find("a:Contains("+search+")").parent().show();
34 | //hide parents without children
35 | $("nav > ul > li").each(function(){
36 | if($(this).find("a:Contains("+search+")").length == 0 && $(this).children("ul").length === 0){
37 | //has no child at all and does not contain text
38 | $(this).hide();
39 | }
40 | else if($(this).find("a:Contains("+search+")").length == 0 && $(this).find("ul").children(':visible').length == 0){
41 | //has no visible child and does not contain text
42 | $(this).hide();
43 | }
44 | });
45 | }
46 | });
47 | });
--------------------------------------------------------------------------------
/src/tokens-stem.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 | var porter2Stemmer = require( 'wink-porter2-stemmer' );
29 |
30 | // ## tokens
31 |
32 | // ### stem
33 | /**
34 | *
35 | * Stems input tokens using Porter Stemming Algorithm Version 2.
36 | *
37 | * @alias tokens#stem
38 | * @param {string[]} tokens the input tokens.
39 | * @return {string[]} stemmed tokens.
40 | * @example
41 | * stem( [ 'he', 'acted', 'decisively', 'today' ] );
42 | * // -> [ 'he', 'act', 'decis', 'today' ]
43 | */
44 | var stem = function ( tokens ) {
45 | return tokens.map( porter2Stemmer );
46 | }; // stem()
47 |
48 | module.exports = stem;
49 |
--------------------------------------------------------------------------------
/src/tokens-phonetize.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 | var stringPhonetize = require( './string-phonetize.js' );
29 |
30 | // ## tokens
31 |
32 | // ### phonetize
33 | /**
34 | *
35 | * Phonetizes input tokens using using an algorithmic adaptation of Metaphone.
36 | *
37 | * @alias tokens#phonetize
38 | * @param {string[]} tokens the input tokens.
39 | * @return {string[]} phonetized tokens.
40 | * @example
41 | * phonetize( [ 'he', 'acted', 'decisively', 'today' ] );
42 | * // -> [ 'h', 'aktd', 'dssvl', 'td' ]
43 | */
44 | var phonetize = function ( tokens ) {
45 | return tokens.map( stringPhonetize );
46 | }; // phonetize()
47 |
48 | module.exports = phonetize;
49 |
--------------------------------------------------------------------------------
/src/string-soundex.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 | var sndx = require( 'wink-distance/src/soundex.js' );
29 |
30 | // ## string
31 |
32 | // ### soundex
33 | /**
34 | *
35 | * Produces the soundex code from the input `word`.
36 | * Both arguments are forwarded as-is to `wink-distance/src/soundex.js`.
37 | * @alias string#soundex
38 | * @param {string} word the input word.
39 | * @param {number} [maxLength=4] length of the soundex code to be returned; when omitted, `undefined` is forwarded and the default is presumably applied by the wink-distance implementation (TODO confirm).
40 | * @return {string} soundex code of `word`.
41 | * @example
42 | * soundex( 'Burroughs' );
43 | * // -> 'B620'
44 | * soundex( 'Burrows' );
45 | * // -> 'B620'
46 | */
47 | var soundex = function ( word, maxLength ) {
48 | return sndx( word, maxLength );
49 | }; // soundex()
50 |
51 | module.exports = soundex;
52 |
--------------------------------------------------------------------------------
/src/tokens-soundex.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 | var stringSoundex = require( './string-soundex.js' );
29 |
30 | // ## tokens
31 |
32 | // ### soundex
/**
 *
 * Converts every token of the input array into its soundex code and returns
 * the codes as a new array, in the same order as the input tokens.
 *
 * @alias tokens#soundex
 * @param {string[]} tokens the input tokens.
 * @return {string[]} soundex coded tokens.
 * @example
 * soundex( [ 'he', 'acted', 'decisively', 'today' ] );
 * // -> [ 'H000', 'A233', 'D221', 'T300' ]
 */
function soundex( tokens ) {
  var codes = tokens.map( function ( token ) {
    // Pass the token alone: `map()` also supplies the index, which must not
    // reach `stringSoundex` as its `maxLength` — that has to stay `undefined`.
    return stringSoundex( token );
  } );
  return codes;
} // soundex()
48 |
49 | module.exports = soundex;
50 |
--------------------------------------------------------------------------------
/src/string-amplify-not-elision.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 | var rgx = require( './util_regexes.js' );
29 |
30 | // ## string
31 |
32 | // ### amplifyNotElision
/**
 *
 * Expands the not elision into a separate `not` word; for example `isn't`
 * becomes `is not`. Every match of the `notElision` pattern is replaced by
 * its first captured group followed by ` not`.
 *
 * @alias string#amplifyNotElision
 * @param {string} str the input string.
 * @return {string} input string after not elision amplification.
 * @example
 * amplifyNotElision( "someone's wallet, isn't it?" );
 * // -> "someone's wallet, is not it?"
 */
function amplifyNotElision( str ) {
  var amplified = str.replace( rgx.notElision, '$1 not' );
  return amplified;
} // amplifyNotElision()
48 |
49 | module.exports = amplifyNotElision;
50 |
--------------------------------------------------------------------------------
/src/string-remove-extra-spaces.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 | var rgx = require( './util_regexes.js' );
29 |
30 | // ## string
31 |
32 | // ### removeExtraSpaces
/**
 *
 * Strips the leading and trailing whitespaces from the input string and then
 * replaces every match of the `spaces` pattern by a single space, so that no
 * extra in-between whitespaces remain.
 *
 * @alias string#removeExtraSpaces
 * @param {string} str the input string.
 * @return {string} input string after removal of leading, trailing and extra
 * whitespaces.
 * @example
 * removeExtraSpaces( '   Padded   Text   ' );
 * // -> 'Padded Text'
 */
function removeExtraSpaces( str ) {
  // Trim first, then collapse the in-between runs.
  var trimmed = str.trim();
  return trimmed.replace( rgx.spaces, ' ' );
} // removeExtraSpaces()
52 |
53 | module.exports = removeExtraSpaces;
54 |
--------------------------------------------------------------------------------
/src/string-remove-spl-chars.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 | var rgx = require( './util_regexes.js' );
29 |
30 | // ## string
31 |
32 | // ### removeSplChars
/**
 *
 * Replaces each special character found in the input string by a whitespace.
 * The special characters looked for are — `~@#%^*+=`.
 *
 * The result may contain runs of spaces; if required, they may be removed
 * using [string.removeExtraSpaces](#stringremoveextraspaces) function.
 *
 * @alias string#removeSplChars
 * @param {string} str the input string.
 * @return {string} input string after removal of special characters.
 * @example
 * removeSplChars( '4 + 4*2 = 12' );
 * // -> '4   4 2   12'
 */
function removeSplChars( str ) {
  var cleaned = str.replace( rgx.splChars, ' ' );
  return cleaned;
} // removeSplChars()
51 |
52 | module.exports = removeSplChars;
53 |
--------------------------------------------------------------------------------
/src/tokens-bigrams.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 |
29 | // ## tokens
30 |
31 | // ### bigrams
/**
 *
 * Builds the bigrams of the input tokens. Each bigram is a two element array
 * holding a token and the token that immediately follows it; the input array
 * is left untouched. Fewer than two tokens yield an empty array.
 *
 * @alias tokens#bigrams
 * @param {string[]} tokens the input tokens.
 * @return {Array.<string[]>} the bigrams, one `[ token, nextToken ]` pair per
 * adjacent pair of tokens.
 * @example
 * bigrams( [ 'he', 'acted', 'decisively', 'today' ] );
 * // -> [ [ 'he', 'acted' ],
 * //      [ 'acted', 'decisively' ],
 * //      [ 'decisively', 'today' ] ]
 */
function bigrams( tokens ) {
  // Collected pairs.
  const pairs = [];
  // Walk from the second token onwards, pairing each with its predecessor.
  for ( let next = 1; next < tokens.length; next += 1 ) {
    pairs.push( [ tokens[ next - 1 ], tokens[ next ] ] );
  }
  return pairs;
} // bigrams()
56 |
57 | module.exports = bigrams;
58 |
--------------------------------------------------------------------------------
/src/string-remove-html-tags.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 | var rgx = require( './util_regexes.js' );
29 |
30 | // ## string
31 |
32 | // ### removeHTMLTags
/**
 *
 * Replaces each HTML tag found in the input string by a whitespace. Matches
 * of the two HTML escape sequence patterns (`htmlEscSeq1` and `htmlEscSeq2`)
 * are replaced by a whitespace as well.
 *
 * The result may contain runs of spaces; if required, they may be removed
 * using [string.removeExtraSpaces](#stringremoveextraspaces) function.
 *
 * @alias string#removeHTMLTags
 * @param {string} str the input string.
 * @return {string} input string after removal of HTML tags.
 * @example
 * removeHTMLTags( '<p>Vive la France !</p>' );
 * // -> ' Vive la France ! '
 */
function removeHTMLTags( str ) {
  // Tags go first, followed by the two kinds of escape sequences.
  var withoutTags = str.replace( rgx.htmlTags, ' ' );
  var withoutEscSeq1 = withoutTags.replace( rgx.htmlEscSeq1, ' ' );
  return withoutEscSeq1.replace( rgx.htmlEscSeq2, ' ' );
} // removeHTMLTags()
54 |
55 | module.exports = removeHTMLTags;
56 |
--------------------------------------------------------------------------------
/src/tokens-append-bigrams.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 |
29 | // ## tokens
30 |
31 | // ### appendBigrams
/**
 *
 * Creates a bigram for every pair of adjacent input tokens — the two tokens
 * joined by an underscore — and appends these bigrams to the input tokens.
 * The input array itself is extended in place and is also the return value.
 *
 * @alias tokens#appendBigrams
 * @param {string[]} tokens the input tokens.
 * @return {string[]} the input tokens appended with their bigrams.
 * @example
 * appendBigrams( [ 'he', 'acted', 'decisively', 'today' ] );
 * // -> [ 'he',
 * //      'acted',
 * //      'decisively',
 * //      'today',
 * //      'he_acted',
 * //      'acted_decisively',
 * //      'decisively_today' ]
 */
function appendBigrams( tokens ) {
  // Fix the count up front: the array grows while the bigrams are pushed.
  const bigramCount = tokens.length - 1;
  let at = 0;
  while ( at < bigramCount ) {
    tokens.push( tokens[ at ] + '_' + tokens[ at + 1 ] );
    at += 1;
  }
  return tokens;
} // appendBigrams()
56 |
57 | module.exports = appendBigrams;
58 |
--------------------------------------------------------------------------------
/src/string-remove-punctuations.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 | var rgx = require( './util_regexes.js' );
29 |
30 | // ## string
31 |
32 | // ### removePunctuations
/**
 *
 * Replaces each punctuation mark found in the input string by a whitespace.
 * The punctuations looked for are — `.,;!?:"!'... - () [] {}`.
 *
 * The result may contain runs of spaces; if required, they may be removed
 * using [string.removeExtraSpaces](#stringremoveextraspaces) function.
 *
 * @alias string#removePunctuations
 * @param {string} str the input string.
 * @return {string} input string after removal of punctuations.
 * @example
 * removePunctuations( 'Wait, what?' );
 * // -> 'Wait  what '
 */
function removePunctuations( str ) {
  var cleaned = str.replace( rgx.punctuations, ' ' );
  return cleaned;
} // removePunctuations()
51 |
52 | module.exports = removePunctuations;
53 |
--------------------------------------------------------------------------------
/src/string-remove-elisions.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 | var rgx = require( './util_regexes.js' );
29 |
30 | // ## string
31 |
32 | // ### removeElisions
/**
 *
 * Drops the basic elisions found in the input string. Typical example of
 * elisions are `it's, let's, where's, I'd, I'm, I'll, I've, and Isn't` etc.
 * Note it retains apostrophe used to indicate possession.
 *
 * Three patterns are applied one after the other: a match of `elisionsSpl` is
 * reduced to its second captured group, while a match of `elisions1` or
 * `elisions2` is reduced to its first captured group.
 *
 * @alias string#removeElisions
 * @param {string} str the input string.
 * @return {string} input string after removal of elisions.
 * @example
 * removeElisions( "someone's wallet, isn't it?" );
 * // -> "someone's wallet, is it?"
 */
function removeElisions( str ) {
  var afterSpl = str.replace( rgx.elisionsSpl, '$2' );
  var afterFirst = afterSpl.replace( rgx.elisions1, '$1' );
  return afterFirst.replace( rgx.elisions2, '$1' );
} // removeElisions()
53 |
54 | module.exports = removeElisions;
55 |
--------------------------------------------------------------------------------
/src/string-split-elisions.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 | var rgx = require( './util_regexes.js' );
29 |
30 | // ## string
31 |
32 | // ### splitElisions
/**
 *
 * Separates the basic elisions found in the input string from the word they
 * are attached to. Typical example of elisions are
 * `it's, let's, where's, I'd, I'm, I'll, I've, and Isn't` etc. Note it does
 * not touch apostrophe used to indicate possession.
 *
 * Three patterns are applied one after the other: a match of `elisionsSpl`
 * becomes its second and third captured groups separated by a space, while a
 * match of `elisions1` or `elisions2` becomes its first and second captured
 * groups separated by a space.
 *
 * @alias string#splitElisions
 * @param {string} str the input string.
 * @return {string} input string after splitting of elisions.
 * @example
 * splitElisions( "someone's wallet, isn't it?" );
 * // -> "someone's wallet, is n't it?"
 */
function splitElisions( str ) {
  var afterSpl = str.replace( rgx.elisionsSpl, '$2 $3' );
  var afterFirst = afterSpl.replace( rgx.elisions1, '$1 $2' );
  return afterFirst.replace( rgx.elisions2, '$1 $2' );
} // splitElisions()
53 |
54 | module.exports = splitElisions;
55 |
--------------------------------------------------------------------------------
/src/string-extract-run-of-capital-words.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 | var rgx = require( './util_regexes.js' );
29 | var trim = require( './string-trim.js' );
30 | // ## string
31 |
32 | // ### extractRunOfCapitalWords
/**
 *
 * Collects every piece of text that appears as Title Case or in ALL CAPS in
 * the input string. Each piece is whatever the `rocWords` pattern matches,
 * passed through the `trim` helper.
 *
 * @alias string#extractRunOfCapitalWords
 * @param {string} str the input string.
 * @return {string[]} of text appearing in Title Case or in ALL CAPS; if no such
 * text is found then `null` is returned.
 * @example
 * extractRunOfCapitalWords( 'In The Terminator, Sarah Connor is in Los Angeles' );
 * // -> [ 'In The Terminator', 'Sarah Connor', 'Los Angeles' ]
 */
function extractRunOfCapitalWords( str ) {
  var runs = str.match( rgx.rocWords );
  // No match: hand back the falsy match result (`null`) unchanged.
  if ( !runs ) {
    return runs;
  }
  return runs.map( trim );
} // extractRunOfCapitalWords()
50 |
51 | module.exports = extractRunOfCapitalWords;
52 |
--------------------------------------------------------------------------------
/src/string-retain-alpha-nums.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 | var rgx = require( './util_regexes.js' );
29 |
30 | // ## string
31 |
32 | // ### retainAlphaNums
/**
 *
 * Keeps only the alphabets and numerals of the input string. Every other
 * character is replaced by a space, after which the extra in-between
 * whitespaces are collapsed and the leading and trailing ones are removed.
 *
 * @alias string#retainAlphaNums
 * @param {string} str the input string.
 * @return {string} input string after removal of non-alphanumeric characters,
 * leading, trailing and extra whitespaces.
 * @example
 * retainAlphaNums( ' This, text here, has  (other) chars_! ' );
 * // -> 'This text here has other chars'
 */
function retainAlphaNums( str ) {
  // Blank out everything that is not alphanumeric.
  var blanked = str.replace( rgx.notAlphaNumeric, ' ' );
  // Collapse the runs of whitespace this leaves behind.
  var collapsed = blanked.replace( rgx.spaces, ' ' );
  return collapsed.trim();
} // retainAlphaNums()
54 |
55 | module.exports = retainAlphaNums;
56 |
--------------------------------------------------------------------------------
/src/string-marker.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 |
29 | // ## string
30 |
31 | // ### marker
/**
 *
 * Generates `marker` of the input string; it is defined as 1-gram, sorted
 * and joined back as a string again. Marker is a quick and aggressive way
 * to detect similarity between short strings. Its aggression may lead to more
 * false positives such as `Meter` and `Metre` or `no melon` and `no lemon`.
 *
 * The string is split by Unicode code point, so a character outside the
 * Basic Multilingual Plane (an emoji, for instance) is kept whole instead of
 * being broken into two surrogate halves. The unique characters are sorted
 * using the default sort order of `Array.prototype.sort()`.
 *
 * @alias string#marker
 * @param {string} str the input string.
 * @return {string} the marker.
 * @example
 * marker( 'the quick brown fox jumps over the lazy dog' );
 * // -> ' abcdefghijklmnopqrstuvwxyz'
 */
var marker = function ( str ) {
  // `Array.from()` walks a string code point by code point, whereas indexing
  // with `str[ i ]` yields UTF-16 code units and would split surrogate pairs.
  var uniqChars = new Set( Array.from( str ) );
  return ( Array.from( uniqChars ).sort().join( '' ) );
}; // marker()
53 |
54 | module.exports = marker;
55 |
--------------------------------------------------------------------------------
/src/dictionaries/stop_words.json:
--------------------------------------------------------------------------------
1 | [
2 | "i",
3 | "me",
4 | "my",
5 | "myself",
6 | "we",
7 | "our",
8 | "ours",
9 | "ourselves",
10 | "you",
11 | "your",
12 | "yours",
13 | "yourself",
14 | "yourselves",
15 | "he",
16 | "him",
17 | "his",
18 | "himself",
19 | "she",
20 | "her",
21 | "hers",
22 | "herself",
23 | "it",
24 | "its",
25 | "itself",
26 | "they",
27 | "them",
28 | "their",
29 | "theirs",
30 | "themselves",
31 | "what",
32 | "which",
33 | "who",
34 | "whom",
35 | "this",
36 | "that",
37 | "these",
38 | "those",
39 | "am",
40 | "is",
41 | "are",
42 | "was",
43 | "were",
44 | "be",
45 | "been",
46 | "being",
47 | "have",
48 | "has",
49 | "had",
50 | "having",
51 | "do",
52 | "does",
53 | "did",
54 | "doing",
55 | "would",
56 | "should",
57 | "could",
58 | "ought",
59 | "i'm",
60 | "you're",
61 | "he's",
62 | "she's",
63 | "it's",
64 | "we're",
65 | "they're",
66 | "i've",
67 | "you've",
68 | "we've",
69 | "they've",
70 | "i'd",
71 | "you'd",
72 | "he'd",
73 | "she'd",
74 | "we'd",
75 | "they'd",
76 | "i'll",
77 | "you'll",
78 | "he'll",
79 | "she'll",
80 | "we'll",
81 | "they'll",
82 | "let's",
83 | "that's",
84 | "who's",
85 | "what's",
86 | "here's",
87 | "there's",
88 | "when's",
89 | "where's",
90 | "why's",
91 | "how's",
92 | "a",
93 | "an",
94 | "the",
95 | "and",
96 | "but",
97 | "if",
98 | "or",
99 | "because",
100 | "as",
101 | "until",
102 | "while",
103 | "of",
104 | "at",
105 | "by",
106 | "for",
107 | "with",
108 | "about",
109 | "against",
110 | "between",
111 | "into",
112 | "through",
113 | "during",
114 | "before",
115 | "after",
116 | "above",
117 | "below",
118 | "to",
119 | "from",
120 | "up",
121 | "down",
122 | "in",
123 | "out",
124 | "on",
125 | "off",
126 | "over",
127 | "under",
128 | "again",
129 | "further",
130 | "then",
131 | "once",
132 | "here",
133 | "there",
134 | "when",
135 | "where",
136 | "why",
137 | "how",
138 | "all",
139 | "any",
140 | "both",
141 | "each",
142 | "few",
143 | "more",
144 | "most",
145 | "other",
146 | "some",
147 | "such",
148 | "only",
149 | "own",
150 | "same",
151 | "so",
152 | "than",
153 | "too",
154 | "very"
155 | ]
156 |
--------------------------------------------------------------------------------
/src/string-ngram.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 |
29 | // ## string
30 |
31 | // ### ngram
/**
 *
 * Slides a window of `size` characters over the input string, one position
 * at a time, and collects every full-width window as an ngram. The default
 * size is 2, i.e. bigrams are produced when `size` is not supplied.
 *
 * @alias string#ngram
 * @param {string} str the input string.
 * @param {number} [size=2] ngram's size.
 * @return {string[]} ngrams of `size` from `str`; empty when `str` is shorter
 * than `size`.
 * @example
 * ngram( 'FRANCE' );
 * // -> [ 'FR', 'RA', 'AN', 'NC', 'CE' ]
 * ngram( 'FRENCH' );
 * // -> [ 'FR', 'RE', 'EN', 'NC', 'CH' ]
 * ngram( 'FRANCE', 3 );
 * // -> [ 'FRA', 'RAN', 'ANC', 'NCE' ]
 */
var ngram = function ( str, size ) {
  var width = size || 2;
  var total = str.length;
  var grams = [];
  var start = 0;
  while ( start < total ) {
    var piece = str.slice( start, start + width );
    // Windows that run past the end of the string come out short; drop them.
    if ( piece.length === width ) grams.push( piece );
    start += 1;
  }
  return grams;
}; // ngram()
59 |
60 | module.exports = ngram;
61 |
--------------------------------------------------------------------------------
/src/tokens-remove-words.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 |
29 | var defaultStopWords = require( './dictionaries/stop_words.json' );
30 | var words = require( './helper-return-words-filter.js' );
31 | defaultStopWords = words( defaultStopWords );
32 |
33 | // ## tokens
34 |
35 | // ### removeWords
/**
 *
 * Filters the input array of tokens through a words filter, returning only
 * the tokens that the filter's `exclude()` predicate lets through.
 *
 * @alias tokens#removeWords
 * @param {string[]} tokens the input tokens.
 * @param {wordsFilter} [stopWords=defaultStopWords] default stop words are
 * loaded from `stop_words.json` located under the `src/dictionaries/` directory.
 * Custom stop words can be created using [helper.returnWordsFilter ](#helperreturnwordsfilter).
 * @return {string[]} balance tokens.
 * @example
 * removeWords( [ 'this', 'is', 'a', 'cat' ] );
 * // -> [ 'cat' ]
 */
var removeWords = function ( tokens, stopWords ) {
  // Fall back to the module-level default filter only when none is supplied.
  var wordsFilter = stopWords || defaultStopWords;
  var keep = wordsFilter.exclude;
  return tokens.filter( keep );
}; // removeWords()
54 |
55 | module.exports = removeWords;
56 |
--------------------------------------------------------------------------------
/src/jsdoc-classes.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | /**
28 | * @classdesc Indexer object
29 | * @class Indexer
30 | * @hideconstructor
31 | */
32 |
33 | /**
34 | * Incrementally builds index for each `element/itsIndex` combination passed.
35 | *
36 | * @method Indexer#build
37 | * @param element
38 | * @param itsIndex
39 | */
40 |
41 | /**
42 | * Used to access the index. This index is in a form of an object that contains
43 | * each element as key. The value of each key is an array
44 | * containing all index positions to the element in question. Note these index positions
45 | * are nothing but each `itsIndex` value passed for the `element`.
46 | *
47 | * @method Indexer#result
48 | * @returns {Object}
49 | */
50 |
51 | /**
52 | * @classdesc WordsFilter
53 | * @class WordsFilter
54 | * @hideconstructor
55 | */
56 |
57 | /**
58 | * Contains the set created from the array `words`.
59 | *
60 | * @method WordsFilter#set
61 | */
62 |
63 | /**
64 | * Used with the array's filter method to exclude the `words` or mapped
65 | * `words` if `givenMappers` are defined.
66 | *
67 | * @method WordsFilter#exclude
68 | */
69 |
--------------------------------------------------------------------------------
/src/tokens-sow.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 |
29 | // ## tokens
30 |
31 | // ### setOfWords
/**
 *
 * Generates the set of unique words from the input tokens. It also has an
 * alias **`sow()`**.
 *
 * @alias tokens#setOfWords
 * @param {string[]} tokens the input tokens.
 * @param {function} [ifn=undefined] a function to build index; it is called for
 * every **member word of the set**; and it receives the word and the `idx`
 * as input arguments. The `build()` function of [helper.returnIndexer](#helperreturnindexer)
 * may be used as `ifn`. If `undefined` then index is not built.
 * @param {number} [idx=undefined] the index; passed as the second argument to the `ifn`
 * function.
 * @return {set} of words from tokens.
 * @example
 * setOfWords( [ 'rain', 'rain', 'go', 'away' ] );
 * // -> Set { 'rain', 'go', 'away' }
 */
var setOfWords = function ( tokens, ifn, idx ) {
  var uniqueWords = new Set( tokens );
  // Nothing to index unless a callable `ifn` was supplied.
  if ( typeof ifn !== 'function' ) return uniqueWords;
  // Visit each member once, in insertion order.
  for ( var word of uniqueWords ) {
    ifn( word, idx );
  }
  return uniqueWords;
}; // setOfWords()
58 |
59 | module.exports = setOfWords;
60 |
--------------------------------------------------------------------------------
/src/string-soc.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 |
29 | // ## string
30 |
31 | // ### setOfChars
/**
 *
 * Creates a set of chars from the input string `str`. This is useful
 * in even more aggressive string matching using Jaccard or Tversky compared to
 * `marker()`. It also has an alias **`soc()`**.
 *
 * @alias string#setOfChars
 * @param {string} str the input string.
 * @param {function} [ifn=undefined] a function to build index; it is called
 * once, receiving the first character of `str` and the `idx` as input
 * arguments. The `build()` function of [helper.returnIndexer](#helperreturnindexer)
 * may be used as `ifn`. If `undefined` then index is not built.
 * @param {number} [idx=undefined] the index; passed as the second argument to the `ifn`
 * function.
 * @return {set} of the unique characters of `str`.
 * @example
 * setOfChars( 'mama' );
 * // -> Set { 'm', 'a' }
 */
var setOfChars = function ( str, ifn, idx ) {
  var chars = new Set( str );
  if ( typeof ifn !== 'function' ) return chars;
  // Only the leading character is handed to the index builder.
  ifn( str[ 0 ], idx );
  return chars;
}; // soc()
58 |
59 | module.exports = setOfChars;
60 |
--------------------------------------------------------------------------------
/src/string-sentences.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 | // Load wink-nlp package & helpers.
29 | const winkNLP = require( 'wink-nlp' );
30 | // Load english language model — light version.
31 | const model = require( 'wink-eng-lite-web-model' );
32 | // Instantiate winkNLP, only use SBD.
33 | const nlp = winkNLP( model, [ 'sbd' ] );
34 |
35 | // ## string
36 |
37 | // ### sentences
/**
 *
 * Splits the input `paragraph` into an array of sentence(s) by delegating
 * sentence boundary detection to the module-level winkNLP instance.
 *
 * @alias string#sentences
 * @param {string} paragraph the input string.
 * @return {string[]} of sentences.
 * @example
 * sentences( 'AI Inc. is focussing on AI. I work for AI Inc. My mail is r2d2@yahoo.com' );
 * // -> [ 'AI Inc. is focussing on AI.',
 * //      'I work for AI Inc.',
 * //      'My mail is r2d2@yahoo.com' ]
 *
 * sentences( 'U.S.A is my birth place. I was born on 06.12.1924. I climbed Mt. Everest.' );
 * // -> [ 'U.S.A is my birth place.',
 * //      'I was born on 06.12.1924.',
 * //      'I climbed Mt. Everest.' ]
 */
var punkt = function ( paragraph ) {
  // Read the text into a winkNLP document, then emit its sentences.
  var doc = nlp.readDoc( paragraph );
  var detected = doc.sentences();
  return detected.out();
}; // punkt()
61 |
62 | module.exports = punkt;
63 |
--------------------------------------------------------------------------------
/src/name_cleaner_regexes.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
// Each entry spells out one degree abbreviation as space-separated parts
// (e.g. 'm b b s' for MBBS, 'b tech' for B.Tech). The order is preserved in
// the generated `degrees` array.
var degreeSpecs = [
  'm a',
  'b a',
  'b tech',
  'm tech',
  'b des',
  'm des',
  'm b a',
  'm c a',
  'b c a',
  'l l b',
  'l l m',
  'm b b s',
  'm d',
  'd m',
  'm s',
  'd n b',
  'd g o',
  'd l o',
  'b d s',
  'b h m s',
  'b a m s',
  'f i c s',
  'm n a m s',
  'b e m s',
  'd c h',
  'm c h',
  'f r c s',
  'm r c p',
  'f i a c m',
  'f i m s a',
  'p h d'
];

// Case-insensitive regexes, one per degree: the parts must appear as a whole
// word, with an optional period and optional whitespace between them.
var degrees = degreeSpecs.map( function ( spec ) {
  var body = spec.split( ' ' ).join( '\\.?\\s*' );
  return new RegExp( '\\b' + body + '\\b', 'i' );
} );

var titleNames = [ 'mr', 'mrs', 'miss', 'ms', 'master', 'er', 'dr', 'shri', 'shrimati', 'sir' ];

// Matches a token that is exactly one of the `titleNames`, ignoring case.
var titleAlternatives = titleNames.join( '|' );
var titles = new RegExp( '^(?:' + titleAlternatives + ')$', 'i' );
65 |
66 | module.exports = {
67 | degrees: degrees,
68 | titles: titles
69 | };
70 |
--------------------------------------------------------------------------------
/src/string-tokenize0.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 | var removeElisions = require( './string-remove-elisions.js' );
29 | var amplifyNotElision = require( './string-amplify-not-elision.js' );
30 | var rgx = require( './util_regexes.js' );
31 |
32 | // ## string
33 |
34 | // ### tokenize0
/**
 *
 * Tokenizes by splitting the input string on **non-words**. This means tokens
 * consist of only alphas, numerals and underscores; all other characters are
 * stripped as they are treated as separators. It also removes all elisions;
 * however negations are retained and amplified.
 *
 * @alias string#tokenize0
 * @param {string} str the input string.
 * @return {string[]} of tokens.
 * @example
 * tokenize0( "someone's wallet, isn't it?" );
 * // -> [ 'someone', 's', 'wallet', 'is', 'not', 'it' ]
 */
var tokenize0 = function ( str ) {
  // Negations are amplified first, so they survive the elision removal.
  var amplified = amplifyNotElision( str );
  var withoutElisions = removeElisions( amplified );
  // Whatever `rgx.cannot` captures as two groups is re-joined with a space
  // between them (presumably turning "cannot" into "can not").
  var normalized = withoutElisions.replace( rgx.cannot, '$1 $2' );
  var tokens = normalized.split( rgx.nonWords );
  // If the first/last characters are non-words, the split leaves an empty
  // string at the corresponding end of the array; discard it.
  if ( tokens[ 0 ] === '' ) tokens.shift();
  if ( tokens[ tokens.length - 1 ] === '' ) tokens.pop();
  return tokens;
}; // tokenize0()
59 |
60 | module.exports = tokenize0;
61 |
--------------------------------------------------------------------------------
/src/helper-return-indexer.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 |
29 | // ## helper
30 |
31 | // ### returnIndexer
32 |
/**
 *
 * Returns an Indexer object: two functions that share one private index.
 * The first function `build()` incrementally adds `itsIndex` to the list kept
 * for `element` — both passed as parameters to it. The second function —
 * `result()` gives access to the index built so far, at any time.
 *
 * It is typically used with [string.soc](#stringsoc), [string.bong](#stringbong),
 * [string.song](#stringsong), and [tokens.sow](#tokenssow).
 *
 * @alias helper#returnIndexer
 * @return {indexer} used to build and access the index.
 * @example
 * var indexer = returnIndexer();
 * // -> { build: [function], result: [function] }
 */
var returnIndexer = function () {
  // Both objects are created without a prototype, so an element such as
  // `toString` cannot collide with an inherited property.
  var lookup = Object.create( null );
  var indexer = Object.create( null );

  // Appends `itsIndex` to the positions recorded for `element`, creating the
  // list on first sight. `itsIndex` should be a valid JS array index; it is
  // stored as-is, without any validation.
  function build( element, itsIndex ) {
    var positions = lookup[ element ];
    if ( !positions ) {
      positions = [];
      lookup[ element ] = positions;
    }
    positions.push( itsIndex );
    return true;
  } // build()

  // Exposes the live index object (not a copy).
  function result() {
    return lookup;
  } // result()

  indexer.build = build;
  indexer.result = result;

  return indexer;
}; // returnIndexer()
71 |
72 | module.exports = returnIndexer;
73 |
--------------------------------------------------------------------------------
/src/string-extract-persons-name.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 | var rgx = require( './util_regexes.js' );
29 | var ncrgx = require( './name_cleaner_regexes.js' );
30 |
31 | // ## string
32 |
33 | // ### extractPersonsName
/**
 *
 * Attempts to extract person's name from input string.
 * It assumes the following name format:
 * `[<titles>] <name> [<degrees>]`
 * Entities in square brackets are optional. Note, it is not a
 * named entity detection mechanism.
 *
 * @alias string#extractPersonsName
 * @param {string} str the input string.
 * @return {string} extracted name.
 * @example
 * extractPersonsName( 'Dr. Sarah Connor M. Tech., PhD. - AI' );
 * // -> 'Sarah Connor'
 */
var extractPersonsName = function ( str ) {
  // Locate the earliest position at which any known degree starts; everything
  // from there onwards is dropped. A degree that does not occur contributes
  // the sentinel 999999 instead of a position.
  var cutAt = ncrgx.degrees.reduce( function ( earliest, degree ) {
    var hit = degree.exec( str );
    return Math.min( earliest, ( hit ) ? hit.index : 999999 );
  }, Infinity );

  // Break what remains into the individual elements of the name (e.g. title,
  // first name, surname): matches of `rgx.notAlpha` become spaces, matches of
  // `rgx.spaces` become a single space, and the trimmed result is split on
  // spaces.
  var head = str.slice( 0, cutAt );
  var spaced = head.replace( rgx.notAlpha, ' ' );
  var squeezed = spaced.replace( rgx.spaces, ' ' );
  var parts = squeezed.trim().split( ' ' );

  // Skip over every leading title.
  var first = 0;
  while ( first < parts.length && ncrgx.titles.test( parts[ first ] ) ) first += 1;
  return parts.slice( first ).join( ' ' );
}; // extractPersonsName()
65 |
66 | module.exports = extractPersonsName;
67 |
--------------------------------------------------------------------------------
/src/string-song.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 |
29 | // ## string
30 |
31 | // ### setOfNGrams
/**
 *
 * Generates the set of unique ngrams of `size` from the input string. The
 * default size is 2, which means it will generate set of bigrams by default.
 * It also has an alias **`song()`**.
 *
 * @alias string#setOfNGrams
 * @param {string} str the input string.
 * @param {number} [size=2] ngram size.
 * @param {function} [ifn=undefined] a function to build index; it is called for
 * every **unique occurrence of ngram** of `str`; and it receives the ngram and the `idx`
 * as input arguments. The `build()` function of [helper.returnIndexer](#helperreturnindexer)
 * may be used as `ifn`. If `undefined` then index is not built.
 * @param {number} [idx=undefined] the index; passed as the second argument to the `ifn`
 * function.
 * @return {set} of ngrams of `size` of `str`.
 * @example
 * setOfNGrams( 'mama' );
 * // -> Set { 'ma', 'am' }
 * song( 'mamma' );
 * // -> Set { 'ma', 'am', 'mm' }
 */
var setOfNGrams = function ( str, size, ifn, idx ) {
  var width = size || 2;
  var total = str.length;
  var grams = new Set();
  for ( var start = 0; start < total; start += 1 ) {
    var piece = str.slice( start, start + width );
    // Windows that run past the end of the string come out short; skip them.
    if ( piece.length !== width ) continue;
    // Report an ngram to the index builder only the first time it is seen.
    if ( ( typeof ifn === 'function' ) && !grams.has( piece ) ) {
      ifn( piece, idx );
    }
    grams.add( piece );
  }
  return grams;
}; // song()
69 |
70 | module.exports = setOfNGrams;
71 |
--------------------------------------------------------------------------------
/src/string-edge-ngrams.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 |
29 | // ## string
30 |
31 | // ### edgeNGrams
/**
 *
 * Generates the edge ngrams from the input string, i.e. the prefixes of
 * `str` whose lengths start at `min` and grow in steps of `delta` for as long
 * as they do not exceed `max` or the length of `str`.
 *
 * @alias string#edgeNGrams
 * @param {string} str the input string.
 * @param {number} [min=2] size of the smallest edge ngram generated.
 * @param {number} [max=8] upper limit on the size of the edge ngrams
 * generated; it is capped at the length of `str`.
 * @param {number} [delta=2] edge ngrams are generated in increments of this value.
 * @param {function} [ifn=undefined] a function to build index; it is called for
 * every edge ngram of `str`; and it receives the edge ngram and the `idx`
 * as input arguments. The `build()` function of [helper.returnIndexer](#helperreturnindexer)
 * may be used as `ifn`. If it is not a function then no index is built.
 * @param {number} [idx=undefined] the index; passed as the second argument to the `ifn`
 * function.
 * @return {string[]} of edge ngrams.
 * @example
 * edgeNGrams( 'decisively' );
 * // -> [ 'de', 'deci', 'decisi', 'decisive' ]
 * edgeNGrams( 'decisively', 8, 10, 1 );
 * // -> [ 'decisive', 'decisivel', 'decisively' ]
 */
var edgeNGrams = function ( str, min, max, delta, ifn, idx ) {
  var step = ( delta || 2 );
  // One past the largest prefix length that may be emitted.
  var stop = Math.min( ( max || 8 ), str.length ) + 1;
  var size = ( min || 2 );
  var notify = ( typeof ifn === 'function' );
  var grams = [];
  var gram;

  while ( size < stop ) {
    gram = str.slice( 0, size );
    grams.push( gram );
    if ( notify ) ifn( gram, idx );
    size += step;
  }
  return grams;
}; // edgeNGrams()
71 |
72 | module.exports = edgeNGrams;
73 |
--------------------------------------------------------------------------------
/src/string-compose-corpus.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 | var helpers = require( 'wink-helpers' );
29 | var returnQuotedTextExtractor = require( './helper-return-quoted-text-extractor.js' );
30 | var extractQuotedText = returnQuotedTextExtractor( '[', ']' );
31 | // ## string
32 |
33 | // ### composeCorpus
/**
 *
 * Composes every sentence that can be formed from the input string, which
 * must be written in a special syntax, for example:
 * `'[I] [am having|have] [a] [problem|question]'`
 *
 * Every phrase is enclosed in `[ ]`, and the alternatives for a phrase
 * (if any) are separated by the `|` character. The corpus is the cartesian
 * product of all the phrases, with the chosen alternatives joined by a
 * single space.
 *
 * @alias string#composeCorpus
 * @param {string} str the input string.
 * @return {string[]} of all possible sentences; it is empty if `str` is not
 * a non-empty string or if the quoted text extractor finds nothing in it.
 * @example
 * composeCorpus( '[I] [am having|have] [a] [problem|question]' );
 * // -> [ 'I am having a problem',
 * //      'I am having a question',
 * //      'I have a problem',
 * //      'I have a question' ]
 */
var composeCorpus = function ( str ) {
  // Anything other than a non-empty string yields an empty corpus.
  if ( ( typeof str !== 'string' ) || str.length === 0 ) return [];

  var phrases = extractQuotedText( str );
  if ( !phrases ) return [];

  // Each bracketed phrase becomes the list of its `|` separated options.
  var options = phrases.map( function ( phrase ) {
    return phrase.split( '|' );
  } );

  return helpers.array.product( options ).map( function ( choice ) {
    return choice.join( ' ' );
  } );
}; // composeCorpus()
72 |
73 | module.exports = composeCorpus;
74 |
--------------------------------------------------------------------------------
/src/tokens-propagate-negations.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 | var rgx = require( './util_regexes.js' );
29 |
30 | // ## tokens
31 |
32 | // ### propagateNegations
/**
 *
 * It looks for negation tokens in the input array of tokens and propagates
 * negation to subsequent `upto` tokens by prefixing them by a `!`. It is useful
 * in handling text containing negations during tasks like similarity detection,
 * classification or search.
 *
 * Note that the `tokens` array is updated **in place**; the returned value is
 * the same array that was passed in.
 *
 * @alias tokens#propagateNegations
 * @param {string[]} tokens the input tokens; modified in place.
 * @param {number} [upto=2] number of tokens to be negated after the negation
 * token. Note, tokens are only negated either `upto` tokens or up to the token
 * preceding the **`, . ; : ! ?`** punctuations. A falsy value (including `0`)
 * selects the default of 2.
 * @return {string[]} tokens with negation propagated.
 * @example
 * propagateNegations( [ 'mary', 'is', 'not', 'feeling', 'good', 'today' ] );
 * // -> [ 'mary', 'is', 'not', '!feeling', '!good', 'today' ]
 */
var propagateNegations = function ( tokens, upto ) {
  var i, imax, j, jmax;
  var tkns = tokens;
  var limit = upto || 2;
  // Punctuation that stops the propagation of a negation.
  // > TODO: promote to utilities regex, after test cases have been added.
  var punctuation = /[\,\.\;\:\!\?]/;
  for ( i = 0, imax = tkns.length; i < imax; i += 1 ) {
    if ( rgx.negations.test( tkns[ i ] ) ) {
      for ( j = i + 1, jmax = Math.min( imax, i + limit + 1 ); j < jmax; j += 1 ) {
        // Hit a punctuation mark, break out of the loop otherwise go *upto the limit*.
        if ( punctuation.test( tkns[ j ] ) ) break;
        // Propagate negation: invert the token by prefixing a `!` to it.
        tkns[ j ] = '!' + tkns[ j ];
      }
      // `j` is the first token that was not negated. Step back by one so that
      // the outer loop's increment lands exactly on `j`; otherwise a negation
      // token sitting right after the negated window would be skipped.
      i = j - 1;
    }
  }
  return tkns;
}; // propagateNegations()
68 |
69 | module.exports = propagateNegations;
70 |
--------------------------------------------------------------------------------
/src/phonetize_regexes.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 | /* eslint no-underscore-dangle: "off" */
// Table of the regular expressions used for phonetization. It is created
// without a prototype, so it only holds the entries listed below.
var rgx = Object.assign( Object.create( null ), {
  // Remove repeating characters.
  repeatingChars: /([^c])\1/g,
  // Drop first character from character pairs, if found in the beginning.
  kngnPairs: /^(kn|gn|pn|ae|wr)/,
  // Drop vowels that are not found in the beginning.
  __vowels: /(?!^)[aeiou]/g,
  // Replaces `ough` in the end by 'f'.
  ough: /ough$/,
  // Replace the following 3 instances of `dg` by `j`.
  dge: /dge/g,
  dgi: /dgi/g,
  dgy: /dgy/g,
  // Replace `sch` by `sk`.
  sch: /sch/g,
  // Drop `c` in `sci, sce, scy`.
  sci: /sci/g,
  sce: /sce/g,
  scy: /scy/g,
  // Make 'sh' out of `tio & tia`.
  tio: /tio/g,
  tia: /tia/g,
  // `t` is silent in `tch`.
  tch: /tch/g,
  // Drop `b` in the end if preceded by `m`.
  mb_: /mb$/,
  // These are pronounced as `k`.
  cq: /cq/g,
  ck: /ck/g,
  // Here `c` sounds like `s`.
  ce: /ce/g,
  ci: /ci/g,
  cy: /cy/g,
  // And this `f`.
  ph: /ph/g,
  // The `sh` finally replaced by `x`.
  sh: /sh|sio|sia/g,
  // This is open rgx - TODO: need to finalize.
  vrnotvy: /([aeiou])(r)([^aeiouy])/g,
  // `th` sounds like theta - make it 0.
  th: /th/g,
  // `c` sounds like `k` except when it is followed by `h`.
  cnoth: /(c)([^h])/g,
  // Even `q` sounds like `k`.
  q: /q/g,
  // The first `x` sounds like `s`.
  _x: /^x/,
  // Otherwise `x` is more like `ks`.
  x: /x/g,
  // Drop `y` if not followed by a vowel or appears in the end.
  ynotv: /(y)([^aeiou])/g,
  y_: /y$/,
  // `z` is `s`.
  z: /z/g
} );
83 |
84 | // Export rgx.
85 | module.exports = rgx;
86 |
--------------------------------------------------------------------------------
/src/string-bong.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 |
29 | // ## string
30 |
31 | // ### bagOfNGrams
/**
 *
 * Builds the bag of ngrams of `size` from the input string, i.e. an object
 * that maps every ngram to the number of times it occurs in `str`. When
 * `size` is omitted (or falsy) it falls back to 2, which produces a bag of
 * bigrams. It also has an alias **`bong()`**.
 *
 * @alias string#bagOfNGrams
 * @param {string} str the input string.
 * @param {number} [size=2] ngram size.
 * @param {function} [ifn=undefined] a function to build index; it is called
 * once for every **unique ngram** of `str`, at its first occurrence, and it
 * receives the ngram and the `idx` as input arguments. The `build()` function
 * of [helper.returnIndexer](#helperreturnindexer) may be used as `ifn`. If it
 * is not a function then no index is built.
 * @param {number} [idx=undefined] the index; passed as the second argument to
 * the `ifn` function.
 * @return {object} bag of ngrams of `size` from `str`; the object is created
 * without a prototype.
 * @example
 * bagOfNGrams( 'mama' );
 * // -> { ma: 2, am: 1 }
 * bong( 'mamma' );
 * // -> { ma: 2, am: 1, mm: 1 }
 */
var bagOfNGrams = function ( str, size, ifn, idx ) {
  var width = ( size || 2 );
  var bag = Object.create( null );
  var notify = ( typeof ifn === 'function' );
  var total = str.length;
  var gram, seen;

  for ( var at = 0; at < total; at += 1 ) {
    gram = str.slice( at, at + width );
    // Slices taken too close to the end come out short; ignore them.
    if ( gram.length !== width ) continue;
    seen = ( bag[ gram ] || 0 );
    // Report an ngram to the indexer only on its first appearance; this
    // avoids multiple calls to `ifn` for the same ngram.
    if ( notify && seen === 0 ) {
      ifn( gram, idx );
    }
    bag[ gram ] = seen + 1;
  }
  return bag;
}; // bong()
73 |
74 | module.exports = bagOfNGrams;
75 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation.
6 |
7 | ## Our Standards
8 |
9 | Examples of behavior that contributes to creating a positive environment include:
10 |
11 | * Using welcoming and inclusive language
12 | * Being respectful of differing viewpoints and experiences
13 | * Gracefully accepting constructive criticism
14 | * Focusing on what is best for the community
15 | * Showing empathy towards other community members
16 |
17 | Examples of unacceptable behavior by participants include:
18 |
19 | * The use of sexualized language or imagery and unwelcome sexual attention or advances
20 | * Trolling, insulting/derogatory comments, and personal or political attacks
21 | * Public or private harassment
22 | * Publishing others' private information, such as a physical or electronic address, without explicit permission
23 | * Other conduct which could reasonably be considered inappropriate in a professional setting
24 |
25 | ## Our Responsibilities
26 |
27 | Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
28 |
29 | Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
30 |
31 | ## Scope
32 |
33 | This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.
34 |
35 | ## Enforcement
36 |
37 | Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at ContactUs@graype.in. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.
38 |
39 | Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership.
40 |
41 | ## Attribution
42 |
43 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version]
44 |
45 | [homepage]: http://contributor-covenant.org
46 | [version]: http://contributor-covenant.org/version/1/4/
47 |
--------------------------------------------------------------------------------
/src/helper-return-words-filter.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 |
29 | // ## helper
30 |
31 | // ### returnWordsFilter
32 |
/**
 *
 * Returns an object containing the following functions: (a) `set()`, which
 * returns the set of mapped words built from the input array `words`; and
 * (b) `exclude()`, which returns `true` for any token that is **not** in that
 * set and is therefore suitable for array filtering operations.
 *
 * If the second argument `mappers` is provided as an array of mapping
 * functions then these are applied, one after the other, on the input array
 * before it is converted into a set. A mapper function must accept a string
 * as argument and return a string as the result. Examples of mapper functions
 * are typically the **string** functions of **`wink-nlp-utils`** such as
 * `string.lowerCase()`, `string.stem()` and `string.soundex()`.
 *
 * @alias helper#returnWordsFilter
 * @param {string[]} words that can be filtered using the returned wordsFilter.
 * @param {function[]} [mappers=undefined] optionally used to map each word before creating
 * the wordsFilter.
 * @return {wordsFilter} object containing `set()` and `exclude()` functions for `words`.
 * @example
 * var stopWords = [ 'This', 'That', 'Are', 'Is', 'Was', 'Will', 'a' ];
 * var myFilter = returnWordsFilter( stopWords, [ string.lowerCase ] );
 * [ 'this', 'is', 'a', 'cat' ].filter( myFilter.exclude );
 * // -> [ 'cat' ]
 */
var returnWordsFilter = function ( words, mappers ) {
  // Run the words through every mapper in turn, then keep the unique results.
  var wordSet = new Set(
    ( mappers || [] ).reduce( function ( mapped, mapper ) {
      return mapped.map( mapper );
    }, words )
  );

  return {
    set: function () {
      return wordSet;
    }, // set()
    exclude: function ( t ) {
      return !wordSet.has( t );
    } // exclude()
  };
}; // returnWordsFilter()
79 |
80 | module.exports = returnWordsFilter;
81 |
--------------------------------------------------------------------------------
/src/tokens-bow.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 |
29 | // ## tokens
30 |
31 | // ### bagOfWords
/**
 *
 * Generates the bag of words from the input tokens, i.e. an object that maps
 * every word to its frequency. By default the frequency is the `word count`;
 * but if the `logCounts` parameter is truthy then `log2( word count + 1 )` is
 * used as the frequency instead. It also has an alias **`bow()`**.
 *
 * @alias tokens#bagOfWords
 * @param {string[]} tokens the input tokens.
 * @param {boolean} [logCounts=false] a true value flags the use of `log2( word count + 1 )`
 * instead of just `word count` as frequency.
 * @param {function} [ifn=undefined] a function to build index; it is called
 * once for every **unique word** in `tokens`, at its first occurrence, and it
 * receives the word and the `idx` as input arguments. The `build()` function
 * of [helper.returnIndexer](#helperreturnindexer) may be used as `ifn`. If it
 * is not a function then no index is built.
 * @param {number} [idx=undefined] the index; passed as the second argument to the `ifn`
 * function.
 * @return {object} bag of words from tokens; the object is created without a
 * prototype.
 * @example
 * bagOfWords( [ 'rain', 'rain', 'go', 'away' ] );
 * // -> { rain: 2, go: 1, away: 1 }
 * bow( [ 'rain', 'rain', 'go', 'away' ], true );
 * // -> { rain: 1.584962500721156, go: 1, away: 1 }
 */
var bagOfWords = function ( tokens, logCounts, ifn, idx ) {
  var bag = Object.create( null );
  var notify = ( typeof ifn === 'function' );
  var total = tokens.length;
  var at = 0;
  var word, seen;

  while ( at < total ) {
    word = tokens[ at ];
    at += 1;
    seen = ( bag[ word ] || 0 );
    // Report a word to the indexer only on its first appearance.
    if ( notify && seen === 0 ) {
      ifn( word, idx );
    }
    bag[ word ] = seen + 1;
  }

  if ( logCounts ) {
    Object.keys( bag ).forEach( function ( key ) {
      // Add `1` before taking the log so that a count of one does not
      // collapse to zero (log2(1) is 0).
      bag[ key ] = Math.log2( bag[ key ] + 1 );
    } );
  }
  return bag;
}; // bow()
75 |
76 | module.exports = bagOfWords;
77 |
--------------------------------------------------------------------------------
/test/string-edge-ngrams-specs.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 | var chai = require( 'chai' );
29 | var mocha = require( 'mocha' );
30 | var edgeGrams = require( '../src/string-edge-ngrams.js' );
31 | var index = require( '../src/helper-return-indexer.js' );
32 |
33 | var expect = chai.expect;
34 | var describe = mocha.describe;
35 | var it = mocha.it;
36 |
37 | // ### Define common errors.
38 | // These are common test data for `null`, `undefined`, and `numeric` inputs
39 | // across all the functions included in the script.
40 | // The exception cases specific to the function are part of the test script of the function.
// Messages expected when the input is `null`, `undefined` or a number. Only
// the first two entries are exercised by the spec below.
var errors = [
  { whenInputIs: null, expectedOutputIs: /^Cannot read.*/ },
  { whenInputIs: undefined, expectedOutputIs: /^Cannot read.*/ },
  { whenInputIs: 1, expectedOutputIs: /is not a function$/ }
];


// ### Create edge ngrams test cases.

describe( 'string.edgeNGrams()', function () {
  // Each `whenInputIs` is the argument list that is spread into the call.
  var tests = [
    { whenInputIs: [ '' ], expectedOutputIs: [] },
    { whenInputIs: [ 'decisively' ], expectedOutputIs: [ 'de', 'deci', 'decisi', 'decisive' ] },
    { whenInputIs: [ 'decisively', 8, 10, 1 ], expectedOutputIs: [ 'decisive', 'decisivel', 'decisively' ] }
  ];

  tests.forEach( function ( test ) {
    it( 'should return ' + JSON.stringify( test.expectedOutputIs ) + ' if the input is ' + JSON.stringify( test.whenInputIs ), function () {
      expect( edgeGrams( ...test.whenInputIs ) ).to.deep.equal( test.expectedOutputIs );
    } );
  } );

  it( 'indexer result should return an index of edge ngrams of decision & decisive', function () {
    var edgeIndex = index();
    edgeGrams( 'decision', 4, 8, 2, edgeIndex.build, 'decision' );
    edgeGrams( 'decisive', 4, 8, 2, edgeIndex.build, 'decisive' );
    var result = edgeIndex.result();
    expect( result ).to.deep.equal( { deci: [ 'decision', 'decisive' ], decisi: [ 'decision', 'decisive' ], decision: [ 'decision' ], decisive: [ 'decisive' ] } );
  } );

  // A numeric input is not covered here, hence only the first two errors.
  errors.slice( 0, 2 ).forEach( function ( error ) {
    it( 'should throw ' + error.expectedOutputIs + ' if the input is ' + JSON.stringify( error.whenInputIs ), function () {
      expect( edgeGrams.bind( null, error.whenInputIs ) ).to.throw( error.expectedOutputIs );
    } );
  } );
} );
77 |
--------------------------------------------------------------------------------
/src/string-tokenize.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 | // Load wink-nlp package & helpers.
29 | const winkNLP = require( 'wink-nlp' );
30 | // Load english language model — light version.
31 | const model = require( 'wink-eng-lite-web-model' );
32 | // Instantiate winkNLP, only use tokenization.
33 | const nlp = winkNLP( model, [] );
34 | const its = nlp.its;
35 |
36 | // ## string
37 |
38 | // ### tokenize
/**
 *
 * Tokenizes the input `sentence` according to the value of `detailed` flag.
 * Every occurrence of `...` in the `sentence` is converted to an ellipsis (`…`)
 * before tokenization. In `detailed = true` mode, it
 * tags every token with its type; the supported tags are word, number, url, email,
 * mention, hashtag, emoji, emoticon, time, ordinal, currency, punctuation, symbol,
 * and tabCRLF.
 *
 * @alias string#tokenize
 * @param {string} sentence the input string.
 * @param {boolean} [detailed=false] if true, each token is an object containing
 * `value` and `tag` of each token; otherwise each token is a string. Its default
 * value of **false** ensures compatibility with previous version.
 * @return {(string[]|object[])} an array of strings if `detailed` is false otherwise
 * an array of objects.
 * @example
 * tokenize( "someone's wallet, isn't it? I'll return!" );
 * // -> [ 'someone', '\'s', 'wallet', ',', 'is', 'n\'t', 'it', '?',
 * //      'I', '\'ll', 'return', '!' ]
 *
 * tokenize( 'For details on wink, check out http://winkjs.org/ URL!', true );
 * // -> [ { value: 'For', tag: 'word' },
 * //      { value: 'details', tag: 'word' },
 * //      { value: 'on', tag: 'word' },
 * //      { value: 'wink', tag: 'word' },
 * //      { value: ',', tag: 'punctuation' },
 * //      { value: 'check', tag: 'word' },
 * //      { value: 'out', tag: 'word' },
 * //      { value: 'http://winkjs.org/', tag: 'url' },
 * //      { value: 'URL', tag: 'word' },
 * //      { value: '!', tag: 'punctuation' } ]
 */
var tokenize = function ( sentence, detailed ) {
  // A global regex is required here: `replace()` with a string pattern would
  // convert only the first `...` and leave the remaining ones untouched.
  const doc = nlp.readDoc( sentence.replace( /\.\.\./g, '…' ) );

  if ( detailed ) {
    const tokens = [];
    // Pair every token's text with its type.
    doc.tokens().each( ( t ) => {
      tokens.push( { value: t.out(), tag: t.out( its.type ) } );
    } );

    return tokens;
  }

  return doc.tokens().out();
}; // tokenize()
86 |
87 | module.exports = tokenize;
88 |
--------------------------------------------------------------------------------
/src/helper-return-quoted-text-extractor.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 |
29 | // ## string
30 |
31 | // ### returnQuotedTextExtractor
32 |
/**
 *
 * Returns a function that extracts all occurrences of every quoted text
 * between the `lq` and the `rq` quotes from its argument. This argument
 * must be of type string. The quotes may be single or multi-character strings,
 * e.g. `'"'`, `'('` & `')'` or `'<b>'` & `'</b>'`; they are always matched
 * literally. Quoted text is matched lazily and never spans a line break.
 *
 * @alias helper#returnQuotedTextExtractor
 * @param {string} [lq='"'] the left quote; any non-string or empty value
 * falls back to the default.
 * @param {string} [rq=lq] the right quote; any non-string or empty value
 * falls back to the left quote in use.
 * @return {function} that will accept an input string argument and return an
 * array of all substrings that are quoted between `lq` and `rq`; it returns
 * `null` if the argument is empty, is not a string or contains no quoted text.
 * @example
 * var extractQuotedText = returnQuotedTextExtractor();
 * extractQuotedText( 'Raise 2 issues - "fix a bug" & "run tests"' );
 * // -> [ 'fix a bug', 'run tests' ]
 */
var returnQuotedTextExtractor = function ( lq, rq ) {
  // Escape only the regex meta-characters. Prefixing every character with a
  // backslash would turn letters and digits into escapes such as `\b`, `\d`
  // or `\1`, which no longer match the quote literally.
  const escapeForRegex = function ( str ) {
    return str.replace( /[.*+?^${}()|[\]\\]/g, '\\$&' );
  };
  // Set defaults for the left and the right quotes, if required.
  const leftQuote = ( lq && ( typeof lq === 'string' ) ) ? lq : '"';
  const rightQuote = ( rq && ( typeof rq === 'string' ) ) ? rq : leftQuote;
  // Lazy `.*?` stops at the nearest right quote, so that adjacent quoted
  // texts are captured as separate matches.
  const regex = new RegExp( escapeForRegex( leftQuote ) + '.*?' + escapeForRegex( rightQuote ), 'g' );

  // Return the extractor function.
  return function ( s ) {
    if ( !s || ( typeof s !== 'string' ) ) return null;
    // Extract matches along with their quotes.
    const matches = s.match( regex );
    if ( !matches ) return null;
    // Collect elements after removing the quotes.
    return matches.map( function ( quoted ) {
      return quoted.slice( leftQuote.length, quoted.length - rightQuote.length );
    } );
  };
}; // returnQuotedTextExtractor()
86 |
87 | module.exports = returnQuotedTextExtractor;
88 |
--------------------------------------------------------------------------------
/src/string-phonetize.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 | var phnrgx = require( './phonetize_regexes.js' );
29 | /* eslint no-underscore-dangle: "off" */
30 |
31 | // ## string
32 |
33 | // ### phonetize
/**
 *
 * Computes the phonetic code of the input word using an algorithmic adaptation
 * of Metaphone; it is not an exact implementation of Metaphone.
 *
 * @alias string#phonetize
 * @param {string} word the input word.
 * @return {string} phonetic code of `word`.
 * @example
 * phonetize( 'perspective' );
 * // -> 'prspktv'
 * phonetize( 'phenomenon' );
 * // -> 'fnmnn'
 */
var phonetize = function ( word ) {
  // Lower case the word and collapse the repeating letters.
  var code = word.toLowerCase().replace( phnrgx.repeatingChars, '$1' );
  // Drop the first character if `kngnPairs` matches.
  if ( phnrgx.kngnPairs.test( code ) ) code = code.slice( 1 );

  // Rewrite rules as `[ pattern, replacement ]` pairs. They are applied strictly
  // in the listed order, because later rules rely on the output of earlier ones.
  var rules = [
    // Change `ough` in the end as `f`.
    [ phnrgx.ough, 'f' ],
    // Change `dg` to `j`, in `dge, dgi, dgy`.
    [ phnrgx.dge, 'je' ],
    [ phnrgx.dgi, 'ji' ],
    [ phnrgx.dgy, 'jy' ],
    // Change `c` to `k` in `sch`.
    [ phnrgx.sch, 'sk' ],
    // Drop `c` in `sci, sce, scy`.
    [ phnrgx.sci, 'si' ],
    [ phnrgx.sce, 'se' ],
    [ phnrgx.scy, 'sy' ],
    // Drop `t` if it appears as `tch`.
    [ phnrgx.tch, 'ch' ],
    // Replace `tio & tia` by `sh`.
    [ phnrgx.tio, 'sh' ],
    [ phnrgx.tia, 'sh' ],
    // Drop `b` if it appears as `mb` in the end.
    [ phnrgx.mb_, 'm' ],
    // NOTE: the rule that drops `r` when it is preceded by a vowel and is not
    // followed by a vowel or `y` is intentionally not applied.
    // Replace `c` by `s` in `ce, ci, cy`.
    [ phnrgx.ce, 'se' ],
    [ phnrgx.ci, 'si' ],
    [ phnrgx.cy, 'sy' ],
    // Replace `cq` by `q`.
    [ phnrgx.cq, 'q' ],
    // Replace `ck` by `k`.
    [ phnrgx.ck, 'k' ],
    // Replace `ph` by `f`.
    [ phnrgx.ph, 'f' ],
    // Replace `th` by `0` (theta look alike!).
    [ phnrgx.th, '0' ],
    // Replace `c` by `k` if it is not followed by `h`.
    [ phnrgx.cnoth, 'k$2' ],
    // Replace `q` by `k`.
    [ phnrgx.q, 'k' ],
    // Replace `x` by `s` if it appears in the beginning.
    [ phnrgx._x, 's' ],
    // Otherwise replace `x` by `ks`.
    [ phnrgx.x, 'ks' ],
    // Replace `sh, sia, sio` by `x`. Needs to be done post `x` processing!
    [ phnrgx.sh, 'x' ],
    // Handle `y` via the `ynotv` & `y_` patterns (see `phonetize_regexes.js`).
    [ phnrgx.ynotv, '$2' ],
    [ phnrgx.y_, '' ],
    // Replace `z` by `s`.
    [ phnrgx.z, 's' ],
    // Drop all **vowels** excluding the first one.
    [ phnrgx.__vowels, '' ]
  ];

  // Run Regex Express now!
  return rules.reduce( function ( acc, rule ) {
    return acc.replace( rule[ 0 ], rule[ 1 ] );
  }, code );
}; // phonetize()
111 |
112 | module.exports = phonetize;
113 |
--------------------------------------------------------------------------------
/src/util_regexes.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
// Collection of the regular expressions shared across the utilities; it is
// created without a prototype so that it contains nothing but the regexes.
var rgx = Object.create( null );

// Matches standard english punctuations in a text.
rgx.punctuations = /[\’\'\‘\’\`\“\”\"\[\]\(\)\{\}\…\,\.\!\;\?\/\-\:]/ig;
// End Of Sentence Punctuations - useful for splitting text into sentences.
rgx.eosPunctuations = /([\.\?\!])\s*(?=[a-z]|\s+\d)/gi;

// Matches special characters: `* + % # @ ^ = ~ | \` in a text.
rgx.splChars = /[\*\+\%\#\@\^\=\~\|\\]/ig;

// Matches common english elisions including n't.
// These are special ones as 's otherwise may be apostrophe!
rgx.elisionsSpl = /(\b)(it|let|that|who|what|here|there|when|where|why|how)(\'s)\b/gi;
// Single (1) character elisions.
rgx.elisions1 = /([a-z])(\'d|\'m)\b/gi;
// Two (2) character elisions.
rgx.elisions2 = /([a-z])(\'ll|\'ve|\'re|n\'t)\b/gi;
// Separate the not elision, i.e. n't.
rgx.notElision = /([a-z])(n\'t)\b/gi;
// Specially handle cannot
rgx.cannot = /\b(can)(not)\b/gi;

// Matches space, tab, or new line characters in text.
rgx.spaces = /\s+/ig;
// Matches anything other than space, tab, or new line characters.
rgx.notSpace = /\S/g;
// Matches alpha and space characters in a text.
rgx.alphaSpace = /[a-z\s]/ig;
// Matches alphanumerals and space characters in a text.
rgx.alphaNumericSpace = /[a-z0-9\s]/ig;
// Matches non alpha characters in a text.
rgx.notAlpha = /[^a-z]/ig;
// Matches non alphanumerals in a text.
rgx.notAlphaNumeric = /[^a-z0-9]/ig;
// Matches one or more non-words characters.
rgx.nonWords = /\W+/ig;
// Matches complete negation token
rgx.negations = /^(never|none|not|no)$/i;

// Matches run of capital words in a text.
rgx.rocWords = /(?:\b[A-Z][A-Za-z]*\s*){2,}/g;

// Matches integer, decimal, JS floating point numbers in a text.
rgx.number = /[0-9]*\.[0-9]+e[\+\-]{1}[0-9]+|[0-9]*\.[0-9]+|[0-9]+/ig;

// Matches time in 12 hour am/pm format in a text.
// NOTE(review): `(:?:` reads like a typo for the non-capturing `(?::`; it is
// left untouched because changing it would alter the capture groups.
rgx.timeIn12HrAMPM = /(?:[0-9]|0[0-9]|1[0-2])((:?:[0-5][0-9])){0,1}\s?(?:[aApP][mM])/ig;

// Matches HTML tags - in fact any thing enclosed in angular brackets including
// the brackets.
rgx.htmlTags = /(?:<[^>]*>)/g;
// Matches the HTML Esc Sequences
// Esc Seq of type `&lt;` or `&nbsp;`
rgx.htmlEscSeq1 = /(?:&[a-z]{2,6};)/gi;
// Esc Seq of type `&#32;` i.e. numeric character references. The `&#` prefix
// is mandatory; without it, plain text such as `911;` would match as well.
rgx.htmlEscSeq2 = /(?:&#[0-9]{2,4};)/gi;

// Tests if a given string is possibly in the Indian mobile telephone number format.
rgx.mobileIndian = /^(0|\+91)?[789]\d{9}$/;
// Tests if a given string is in the valid email format.
rgx.email = /^(([^<>()[\]\\.,;:\s@"]+(\.[^<>()[\]\\.,;:\s@"]+)*)|(".+"))@((\[[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}])|(([a-zA-Z\-0-9]+\.)+[a-zA-Z]{2,}))$/;

// Extracts any number and text from a format text.
// Useful in extracting value and UoM from strings like `2.7 Kgs`.
rgx.separateNumAndText = /([0-9]*\.[0-9]+e[\+\-]{1}[0-9]+|[0-9]*\.[0-9]+|[0-9]+)[\s]*(.*)/i;

// Crude date parser for a string containing date in a valid format.
// > TODO: Need to improve this one!
rgx.date = /(\d+)/ig;

// Following 3 regexes are specially coded for `tokenize()` in prepare_text.
// Matches punctuations that are not a part of a number.
rgx.nonNumPunctuations = /[\.\,\-](?=\D)/gi;
rgx.otherPunctuations = /[\’\'\‘\’\`\“\”\"\[\]\(\)\{\}\…\!\;\?\/\:]/ig;
// > TODO: Add more currency symbols here.
rgx.currency = /[\$\£\¥\€]/ig;
105 | //
106 | module.exports = rgx;
107 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # wink-nlp-utils
3 |
4 | NLP Functions for amplifying negations, managing elisions, creating ngrams, stems, phonetic codes to tokens and more.
5 |
6 | ### [](https://app.travis-ci.com/github/winkjs/wink-nlp-utils) [](https://coveralls.io/github/winkjs/wink-nlp-utils?branch=master) [](https://gitter.im/winkjs/Lobby)
7 |
8 | [](http://wink.org.in/)
9 |
10 | Prepare raw text for Natural Language Processing (NLP) using **`wink-nlp-utils`**. It offers a set of [APIs](http://wink.org.in/wink-nlp-utils/) to work on [strings](http://wink.org.in/wink-nlp-utils/#string) such as names, sentences, paragraphs and [tokens](http://wink.org.in/wink-nlp-utils/#tokens) represented as an array of strings/words. They perform the required pre-processing for many ML tasks such as [semantic search](https://www.npmjs.com/package/wink-bm25-text-search), and [classification](https://www.npmjs.com/package/wink-naive-bayes-text-classifier).
11 |
12 |
👉🏽
13 | We recommend using winkNLP for core natural language processing tasks.
It performs Tokenization, Sentence Boundary Detection, and Named Entity Recognition at blazing fast speeds. It supports all your text processing needs, ranging from Sentiment Analysis, POS Tagging, Lemmatization, Stemming, Stop Word Removal, Negation Handling and Bigrams to Frequency Table Creation and more.
15 |
16 | ### Installation
17 | Use [npm](https://www.npmjs.com/package/wink-nlp-utils) to install:
18 | ```
19 | npm install wink-nlp-utils --save
20 | ```
21 |
22 |
23 | ### Getting Started
24 | The `wink-nlp-utils` package provides over **36 utility functions** for Natural Language Processing tasks. Some representative examples are extracting a person's name from a string, composing a training corpus for a chatbot, sentence boundary detection, tokenization and stop word removal:
25 | ```javascript
26 |
27 | // Load wink-nlp-utils
28 | var nlp = require( 'wink-nlp-utils' );
29 |
30 | // Extract person's name from a string:
31 | var name = nlp.string.extractPersonsName( 'Dr. Sarah Connor M. Tech., PhD. - AI' );
32 | console.log( name );
33 | // -> 'Sarah Connor'
34 |
35 | // Compose all possible sentences from a string:
36 | var str = '[I] [am having|have] [a] [problem|question]';
37 | console.log( nlp.string.composeCorpus( str ) );
38 | // -> [ 'I am having a problem',
39 | // -> 'I am having a question',
40 | // -> 'I have a problem',
41 | // -> 'I have a question' ]
42 |
43 | // Sentence Boundary Detection.
44 | var para = 'AI Inc. is focussing on AI. I work for AI Inc. My mail is r2d2@yahoo.com';
45 | console.log( nlp.string.sentences( para ) );
46 | // -> [ 'AI Inc. is focussing on AI.',
47 | // 'I work for AI Inc.',
48 | // 'My mail is r2d2@yahoo.com' ]
49 |
50 | // Tokenize a sentence.
51 | var s = 'For details on wink, check out http://winkjs.org/ URL!';
52 | console.log( nlp.string.tokenize( s, true ) );
53 | // -> [ { value: 'For', tag: 'word' },
54 | // { value: 'details', tag: 'word' },
55 | // { value: 'on', tag: 'word' },
56 | // { value: 'wink', tag: 'word' },
57 | // { value: ',', tag: 'punctuation' },
58 | // { value: 'check', tag: 'word' },
59 | // { value: 'out', tag: 'word' },
60 | // { value: 'http://winkjs.org/', tag: 'url' },
61 | // { value: 'URL', tag: 'word' },
62 | // { value: '!', tag: 'punctuation' } ]
63 |
64 | // Remove stop words:
65 | var t = nlp.tokens.removeWords( [ 'mary', 'had', 'a', 'little', 'lamb' ] );
66 | console.log( t );
67 | // -> [ 'mary', 'little', 'lamb' ]
68 |
69 | ```
70 |
71 | Try [experimenting with these examples on Runkit](https://npm.runkit.com/wink-nlp-utils) in the browser.
72 |
73 | ### Documentation
74 | Check out the [wink NLP utilities API](http://winkjs.org/wink-nlp-utils/) documentation to learn more.
75 |
76 | ### Need Help?
77 | If you spot a bug and the same has not yet been reported, raise a new [issue](https://github.com/winkjs/wink-nlp-utils/issues) or consider fixing it and sending a pull request.
78 |
79 | ### About wink
80 | [Wink](http://winkjs.org/) is a family of open source packages for **Statistical Analysis**, **Natural Language Processing** and **Machine Learning** in NodeJS. The code is **thoroughly documented** for easy human comprehension and has a **test coverage of ~100%** for reliability to build production grade solutions.
81 |
82 |
83 | ### Copyright & License
84 | **wink-nlp-utils** is copyright 2017-22 [GRAYPE Systems Private Limited](http://graype.in/).
85 |
86 | It is licensed under the terms of the MIT License.
87 |
--------------------------------------------------------------------------------
/src/wink-nlp-utils.js:
--------------------------------------------------------------------------------
1 | // wink-nlp-utils
2 | // NLP Functions for amplifying negations, managing elisions,
3 | // creating ngrams, stems, phonetic codes to tokens and more.
4 | //
5 | // Copyright (C) GRAYPE Systems Private Limited
6 | //
7 | // This file is part of “wink-nlp-utils”.
8 | //
9 | // Permission is hereby granted, free of charge, to any person obtaining a
10 | // copy of this software and associated documentation files (the "Software"),
11 | // to deal in the Software without restriction, including without limitation
12 | // the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 | // and/or sell copies of the Software, and to permit persons to whom the
14 | // Software is furnished to do so, subject to the following conditions:
15 | //
16 | // The above copyright notice and this permission notice shall be included
17 | // in all copies or substantial portions of the Software.
18 | //
19 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 | // OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 | // THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24 | // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25 | // DEALINGS IN THE SOFTWARE.
26 |
27 | //
28 | var porter2Stemmer = require( 'wink-porter2-stemmer' );
29 |
// ### Prepare Name Space

// Create prepare name space; it is assembled below from three prototype-less
// name spaces: `helper`, `string` and `tokens`.
var prepare = Object.create( null );

/**
 * Helper
 * @namespace helper
 */
prepare.helper = Object.assign( Object.create( null ), {
  // Words filter, along with `words` as its alias.
  returnWordsFilter: require( './helper-return-words-filter.js' ),
  words: require( './helper-return-words-filter.js' ),
  // Indexer, along with `returnIndexer` as its better alias name.
  index: require( './helper-return-indexer.js' ),
  returnIndexer: require( './helper-return-indexer.js' ),
  // Return Quoted Text Extractor
  returnQuotedTextExtractor: require( './helper-return-quoted-text-extractor.js' )
} );

/**
 * String
 * @namespace string
 */
prepare.string = Object.assign( Object.create( null ), {
  // Lower Case
  lowerCase: require( './string-lower-case.js' ),
  // Upper Case
  upperCase: require( './string-upper-case.js' ),
  // Trim
  trim: require( './string-trim.js' ),
  // Remove Extra Spaces
  removeExtraSpaces: require( './string-remove-extra-spaces.js' ),
  // Retain Alpha-numerics
  retainAlphaNums: require( './string-retain-alpha-nums.js' ),
  // Extract Person's Name
  extractPersonsName: require( './string-extract-persons-name.js' ),
  // Extract Run of Capital Words
  extractRunOfCapitalWords: require( './string-extract-run-of-capital-words.js' ),
  // Remove Punctuations
  removePunctuations: require( './string-remove-punctuations.js' ),
  // Remove Special Chars
  removeSplChars: require( './string-remove-spl-chars.js' ),
  // Remove HTML Tags
  removeHTMLTags: require( './string-remove-html-tags.js' ),
  // Remove Elisions
  removeElisions: require( './string-remove-elisions.js' ),
  // Split Elisions
  splitElisions: require( './string-split-elisions.js' ),
  // Amplify Not Elision
  amplifyNotElision: require( './string-amplify-not-elision' ),
  // Marker
  marker: require( './string-marker.js' ),
  // SOC, along with its descriptive alias.
  soc: require( './string-soc.js' ),
  setOfChars: require( './string-soc.js' ),
  // NGrams
  ngram: require( './string-ngram.js' ),
  // Edge NGrams
  edgeNGrams: require( './string-edge-ngrams.js' ),
  // BONG, along with its descriptive alias.
  bong: require( './string-bong.js' ),
  bagOfNGrams: require( './string-bong.js' ),
  // SONG, along with its descriptive alias.
  song: require( './string-song.js' ),
  setOfNGrams: require( './string-song.js' ),
  // Sentences
  sentences: require( './string-sentences.js' ),
  // Compose Corpus
  composeCorpus: require( './string-compose-corpus.js' ),
  // Tokenize0
  tokenize0: require( './string-tokenize0.js' ),
  // Tokenize
  tokenize: require( './string-tokenize.js' ),
  // Stem: this one comes from the already loaded porter2 stemmer.
  stem: porter2Stemmer,
  // Phonetize
  phonetize: require( './string-phonetize.js' ),
  // Soundex
  soundex: require( './string-soundex.js' )
} );

/**
 * Tokens
 * @namespace tokens
 */
prepare.tokens = Object.assign( Object.create( null ), {
  // Stem
  stem: require( './tokens-stem.js' ),
  // Phonetize
  phonetize: require( './tokens-phonetize.js' ),
  // Soundex
  soundex: require( './tokens-soundex.js' ),
  // Remove Words
  removeWords: require( './tokens-remove-words.js' ),
  // BOW, along with its descriptive alias.
  bow: require( './tokens-bow.js' ),
  bagOfWords: require( './tokens-bow.js' ),
  // SOW, along with its descriptive alias.
  sow: require( './tokens-sow.js' ),
  setOfWords: require( './tokens-sow.js' ),
  // Propagate Negations
  propagateNegations: require( './tokens-propagate-negations.js' ),
  // Bigrams
  bigrams: require( './tokens-bigrams.js' ),
  // Append Bigrams
  appendBigrams: require( './tokens-append-bigrams.js' )
} );
143 | // Export prepare.
144 | module.exports = prepare;
145 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to Wink
2 |
3 | Thank you for taking time to contribute. We are delighted to receive contributions from the community. For wink every contribution matters — whether you are reporting a **bug**, posting a **question**, submitting a **pull request** or updating the **documentation**.
4 |
5 | ## Getting Started
6 | 1. Fork the repository from GitHub
7 | 2. Develop your code changes
8 | 3. Ensure that the API is properly documented
9 | 4. Capture the logic in comments
10 | 5. Ensure proper linting via `npm run pretest`
11 | 6. Run tests using `npm run test`
12 | 7. Make sure coverage either stays at the current levels or improves
13 | 8. Commit your changes in compliance with commit guidelines
14 | 9. Push to your fork
15 | 10. Sign the CLA if you are contributing for the first time
16 | 11. Finally, submit a pull request.
17 |
18 |
19 | ## Code of Conduct
20 | By contributing, you are expected to uphold [wink’s code of conduct](CODE_OF_CONDUCT.md). In essence, each one of us should:
21 |
22 | 1. respect fellow contributors, irrespective of their level of experience, race, religion, gender, sexual orientation, and age;
23 | 2. collaborate constructively;
24 | 3. never engage in any form of offense, harassment, insult, personal attack, provocation and/or use of inappropriate language;
25 |
26 |
27 |
28 | ## Things to know
29 | ### About Wink
30 | Wink is a growing open source project focusing on **Natural Language Processing**, **Machine Learning** and **Statistics**. It contains multiple repositories or packages. All packages expose consistent and uniform APIs, thus minimizing the need to learn a new interface for each task. Do take out some time in understanding the structure of APIs, before attempting any enhancements. In wink, we prefer **functions** and **closures** over objects.
31 |
32 | Like artisans, we too need a toolset and process to create beautiful software. The process is orchestrated by [Travis CI](https://travis-ci.org/) in accordance with the configuration files present in each repository. The details and tools used are outlined below.
33 |
34 |
35 | ### Linting
36 | Well-defined linting rules help us make code more consistent and avoid bugs. [ESLint](https://eslint.org) enforces these rules via its configuration file. This file is located in the root of each repository.
37 |
38 |
39 | ### Documenting
40 | We believe that the documentation must not only explain the API but also narrate the story of logic, algorithms and references used. Wink uses the [JSDoc](https://jsdoc.app/) standard for API documentation and [Literate-Programming Standards](https://en.wikipedia.org/wiki/Literate_programming) for documenting the logic using [docker](http://jbt.github.io/docker/src/docker.js.html). The API documentation quality is measured using [Inch CI](https://inch-ci.org/) and we expect that your contribution will improve or maintain the current levels.
41 |
42 | ### Testing
43 | Wink requires a test coverage of **at least 99.5%** and aims for 100%. Any new contribution must maintain the existing test coverage level. We use [Chai](http://chaijs.com/), [Mocha](https://mochajs.org/), [Istanbul](https://istanbul.js.org/) and [Coveralls](https://coveralls.io/) to run tests and determine coverage.
44 |
45 | ### Committing
46 | We follow the [commit guidelines](https://github.com/angular/angular.js/blob/master/DEVELOPERS.md#commits) from Google's [Angular Project](https://angular.io/), whose documentation is licensed under [CC BY 4.0](https://creativecommons.org/licenses/by/4.0/). See important excerpts for quick reference below:
47 |
48 | #### Commit Message Format
49 | Each commit message consists of a **header**, a **body** and a **footer**. The header has a special format that includes a **type**, a **scope** and a **subject**:
50 |
51 | <type>(<scope>): <subject>
52 | <BLANK LINE>
53 | <body>
54 | <BLANK LINE>
55 | <footer>