├── .babelrc ├── .github └── FUNDING.yml ├── .gitignore ├── .npmignore ├── .travis.yml ├── BIBLIOGRAPHY.md ├── CHANGELOG.md ├── LICENSE.txt ├── NOTES.md ├── README.md ├── babel-plugin.js ├── benchmark └── levenshtein.js ├── package-lock.json ├── package.json ├── paper ├── algorithms.bib ├── paper.bib └── paper.md ├── scripts └── format-bibliography.js ├── src ├── clustering │ ├── abstract.js │ ├── blocking.js │ ├── canopy.js │ ├── helpers.js │ ├── key-collision.js │ ├── leader.js │ ├── naive.js │ ├── nn-descent.js │ ├── sorted-neighborhood.js │ └── vp-tree.js ├── hash │ ├── crc32.js │ └── minhash.js ├── helpers │ ├── frequencies.js │ ├── index.js │ └── vectors.js ├── inflectors │ └── spanish │ │ ├── index.js │ │ └── noun.js ├── keyers │ ├── fingerprint.js │ ├── html-text.js │ ├── name-power-set.js │ ├── name-sig.js │ ├── normalize.js │ ├── omission.js │ └── skeleton.js ├── keyword-extraction │ └── rake.js ├── metrics │ ├── bag.js │ ├── canberra.js │ ├── chebyshev.js │ ├── cosine.js │ ├── damerau-levenshtein.js │ ├── dice.js │ ├── euclidean.js │ ├── eudex.js │ ├── guth.js │ ├── hamming.js │ ├── identity.js │ ├── jaccard.js │ ├── jaro-winkler.js │ ├── jaro.js │ ├── lcs.js │ ├── length.js │ ├── levenshtein.js │ ├── lig.js │ ├── manhattan.js │ ├── minhash.js │ ├── minkowski.js │ ├── mlipns.js │ ├── monge-elkan.js │ ├── mra.js │ ├── overlap.js │ ├── prefix.js │ ├── ratcliff-obershelp.js │ ├── sift4.js │ ├── smith-waterman.js │ ├── sorensen.js │ ├── suffix.js │ └── tversky.js ├── parsers │ ├── brown.js │ └── conll.js ├── phonetics │ ├── alpha-sis.js │ ├── caverphone.js │ ├── daitch-mokotoff.js │ ├── double-metaphone.js │ ├── eudex.js │ ├── french │ │ ├── fonem.js │ │ ├── phonetic.js │ │ ├── phonex.js │ │ ├── sonnex.js │ │ ├── soundex.js │ │ └── soundex2.js │ ├── fuzzy-soundex.js │ ├── german │ │ ├── cologne.js │ │ └── phonem.js │ ├── lein.js │ ├── metaphone.js │ ├── mra.js │ ├── nysiis.js │ ├── onca.js │ ├── phonex.js │ ├── roger-root.js │ ├── sound-d.js │ ├── soundex.js 
│ └── statcan.js ├── regexp │ ├── classes.js │ └── index.js ├── stemmers │ ├── french │ │ ├── carry.js │ │ ├── eda.js │ │ ├── porter.js │ │ └── unine.js │ ├── german │ │ └── caumanns.js │ ├── ispell.js │ ├── lancaster.js │ ├── latin │ │ └── schinke.js │ ├── lovins.js │ ├── porter.js │ ├── s-stemmer.js │ ├── spanish │ │ └── unine.js │ └── uea-lite.js └── tokenizers │ ├── fingerprint │ ├── index.js │ └── name.js │ ├── hyphenation │ └── liang.js │ ├── lines │ ├── index.js │ └── naive.js │ ├── ngrams │ └── index.js │ ├── paragraphs │ ├── index.js │ └── naive.js │ ├── sentences │ ├── index.js │ ├── naive.js │ └── punkt.js │ ├── skipgrams │ └── index.js │ ├── syllables │ ├── legalipy.js │ └── sonoripy.js │ ├── tweets │ └── casual.js │ └── words │ ├── gersam.js │ ├── index.js │ ├── naive.js │ └── treebank.js └── test ├── _resources ├── brown │ └── ca02.txt ├── conll2000 │ └── excerpt.txt └── stopwords │ └── fox.txt ├── clustering ├── abstract.js ├── blocking.js ├── canopy.js ├── helpers.js ├── key-collision.js ├── leader.js ├── naive.js ├── nn-descent.js ├── sorted-neighborhood.js └── vp-tree.js ├── endpoint.js ├── hash ├── crc32.js └── minhash.js ├── helpers.js ├── helpers ├── frequencies.js ├── index.js └── vectors.js ├── inflectors └── spanish │ └── noun.js ├── keyers ├── html-text.js ├── name-power-set.js ├── name-sig.js ├── normalize.js ├── omission.js └── skeleton.js ├── keyword-extraction └── rake.js ├── metrics ├── bag.js ├── canberra.js ├── chebyshev.js ├── cosine.js ├── damerau-levenshtein.js ├── dice.js ├── euclidean.js ├── eudex.js ├── guth.js ├── hamming.js ├── identity.js ├── jaccard.js ├── jaro-winkler.js ├── lcs.js ├── length.js ├── levenshtein.js ├── lig.js ├── manhattan.js ├── minkowski.js ├── mlipns.js ├── monge-elkan.js ├── mra.js ├── overlap.js ├── prefix.js ├── ratcliff-obershelp.js ├── sift4.js ├── smith-waterman.js └── suffix.js ├── parsers ├── brown.js └── conll.js ├── phonetics ├── alpha-sis.js ├── caverphone.js ├── daitch-mokotoff.js ├── 
double-metaphone.js ├── eudex.js ├── french │ ├── fonem.js │ ├── phonetic.js │ ├── phonex.js │ ├── sonnex.js │ ├── soundex.js │ └── soundex2.js ├── fuzzy-soundex.js ├── german │ ├── cologne.js │ └── phonem.js ├── lein.js ├── metaphone.js ├── mra.js ├── nysiis.js ├── onca.js ├── phonex.js ├── roger-root.js ├── sound-d.js ├── soundex.js └── statcan.js ├── regexp └── index.js ├── stemmers ├── french │ ├── carry.js │ ├── eda.js │ ├── porter.js │ └── unine.js ├── german │ └── caumanns.js ├── lancaster.js ├── latin │ └── schinke.js ├── lovins.js ├── porter.js ├── s-stemmer.js ├── spanish │ └── unine.js └── uea-lite.js └── tokenizers ├── fingerprint.js ├── hyphenation └── liang.js ├── lines └── naive.js ├── ngrams.js ├── paragraphs └── naive.js ├── sentences ├── naive.js └── punkt.js ├── skipgrams.js ├── syllables ├── legalipy.js └── sonoripy.js ├── tweets └── casual.js └── words ├── gersam.js └── treebank.js /.babelrc: -------------------------------------------------------------------------------- 1 | { 2 | "presets": [ 3 | "es2015" 4 | ], 5 | "plugins": [ 6 | [ 7 | "transform-es2015-classes", 8 | { 9 | "loose": true 10 | } 11 | ], 12 | [ 13 | "transform-es2015-destructuring", 14 | { 15 | "loose": true 16 | } 17 | ], 18 | "./babel-plugin.js" 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | github: Yomguithereal 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Common 2 | node_modules 3 | experiments 4 | build 5 | *.log 6 | *.lock 7 | *.pl 8 | .DS_Store 9 | TODO.md 10 | 11 | # Transpiled modules 12 | /clustering 13 | /hash 14 | /helpers 15 | /inflectors 16 | /keyers 17 | /keyword-extraction 18 | /metrics 19 | /parsers 20 | /phonetics 21 | /regexp 22 | /stemmers 23 | /tokenizers 24 | 
-------------------------------------------------------------------------------- /.npmignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | experiments 3 | benchmark 4 | .babelrc 5 | .gitignore 6 | .npmignore 7 | *.yml 8 | build 9 | src 10 | test 11 | babel-plugin.js 12 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: node_js 2 | node_js: 3 | - "10" 4 | - "11" 5 | - "12" 6 | - "13" 7 | - "14" 8 | - "15" 9 | - "16" 10 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016-2020 Guillaume Plique (Yomguithereal) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /NOTES.md: -------------------------------------------------------------------------------- 1 | # Notes 2 | 3 | ## Roadmap 4 | 5 | * Minhash (CRC32?) & Fuzzyhash (ssdeep) & simhash 6 | * LSH Binning 7 | * Method to get the number of expected calculations 8 | * Distances 9 | * Phonetics 10 | * MVP Tree 11 | * Higher order VP Tree 12 | * Suffix Tree clustering 13 | * NN-Descent 14 | * NNCTPH 15 | * Fast online K-NN 16 | * Bitap 17 | * KNN clustering 18 | * LSH & MinHash & Rabin-Karp 19 | * Inverted Index (Complex & Simple) / Array to store doc id + weight + number of positions + positions for memory efficiency (Integer Array) 20 | * Write about the rationale behind the naive clustering composition methods 21 | 22 | * Create keyers => with phonetics & fingerprint keyers 23 | 24 | ## Clustering 25 | 26 | * Abstract a class for similarity clusterers to enable asynchronous work, possibility to abort computation etc. 27 | * It should be possible to make some optimization to the naive clusterer (whose worst case would perform the same amount of computation) by comparing new elements to only one item of an already existing cluster. 28 | * Method 3 should be possible to do without computing a graph but by holding a hashmap of items. 29 | * Method to return the similarity graph and to get a cluster index rather. (canopy then blocking for instance). 30 | * Method to return the time elapsed to compute. 31 | * Clusterer should hold the number of comparisons made 32 | * Clusterer should have chunk, async & emit progress events 33 | * SNM clustering is quite efficient when using ngram fingerprinting & a really small window. 34 | * The similarity graph must be undirected if you want the clusters to have a full diameter instead of radius somehow. 35 | * Possible to create modes for the naive clusterer `normal`, `minLengthFirst`, `maxLengthFirst`, or even `full`? 
36 | * Check Java library about knng cluster extraction. 37 | 38 | ## UI 39 | 40 | * Pre-processing. 41 | * Inverted-Index of unique values to cut computations. 42 | * Should shuffle the values before applying clustering. 43 | * Difference between merge & harmonize. 44 | * Cluster expansion through inverted index sorted by occurrences. 45 | * Suggest methods based on size of the dataset. 46 | 47 | ## Recipes 48 | 49 | * Ngram blocking or SNM. 50 | * Double Metaphone key collision. 51 | * Overlap coefficient on names. 52 | * Minhash + ngrams for documents. 53 | -------------------------------------------------------------------------------- /benchmark/levenshtein.js: -------------------------------------------------------------------------------- 1 | require('babel-core/register'); 2 | 3 | var levenshtein = require('../src/metrics/distance/levenshtein'); 4 | var leven = require('leven'); 5 | 6 | var limited = levenshtein.limited; 7 | 8 | function run(fn) { 9 | fn('a', 'b'); 10 | fn('ab', 'ac'); 11 | fn('ac', 'bc'); 12 | fn('abc', 'axc'); 13 | fn('kitten', 'sitting'); 14 | fn('xabxcdxxefxgx', '1ab2cd34ef5g6'); 15 | fn('cat', 'cow'); 16 | fn('xabxcdxxefxgx', 'abcdefg'); 17 | fn('javawasneat', 'scalaisgreat'); 18 | fn('example', 'samples'); 19 | fn('sturgeon', 'urgently'); 20 | fn('levenshtein', 'frankenstein'); 21 | fn('distance', 'difference'); 22 | fn('abcde', 'tes'); 23 | fn('因為我是中國人所以我會說中文', '因為我是英國人所以我會說英文'); 24 | } 25 | 26 | suite('Levenshtein', function() { 27 | bench('talisman', function() { 28 | run(levenshtein); 29 | }); 30 | 31 | bench('leven', function() { 32 | run(leven); 33 | }); 34 | 35 | bench('limited', function() { 36 | run(limited.bind(null, 2)); 37 | }); 38 | }); 39 | -------------------------------------------------------------------------------- /scripts/format-bibliography.js: -------------------------------------------------------------------------------- 1 | const fs = require('fs'); 2 | const Cite = require('citation-js'); 3 | 4 | const bib = 
new Cite(fs.readFileSync('./paper/algorithms.bib', 'utf-8')); 5 | 6 | let output = bib.format('bibliography', { 7 | format: 'text', 8 | template: 'mla', 9 | lang: 'en-US' 10 | }); 11 | 12 | output = output 13 | .trim() 14 | .split('\n') 15 | .map(line => { 16 | return `> ${line}\n`; 17 | }) 18 | .join('\n'); 19 | 20 | console.log('# Talisman Bibliography\n'); 21 | console.log('[BibTex file](https://raw.githubusercontent.com/Yomguithereal/talisman/master/paper/algorithms.bib)\n'); 22 | console.log(output); 23 | -------------------------------------------------------------------------------- /src/clustering/abstract.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman clustering/abstract 3 | * ============================= 4 | * 5 | * Abstract class used by every record-linkage clusterer to expose a same 6 | * interface. 7 | */ 8 | 9 | /** 10 | * Defaults. 11 | */ 12 | const DEFAULTS = { 13 | minClusterSize: 2 14 | }; 15 | 16 | /** 17 | * Record Linkage Clusterer class. 18 | * 19 | * @constructor 20 | * @param {object} params - Clusterer parameters. 21 | * @param {array} items - Items to cluster. 
22 | */ 23 | export default class RecordLinkageClusterer { 24 | constructor(params, items) { 25 | if (!params || typeof params !== 'object') 26 | throw new Error('talisman/clustering/record-linkage: the given params should be an object.'); 27 | 28 | if (!Array.isArray(items)) 29 | throw new Error('talisman/clustering/record-linkage: the given items should be an array.'); 30 | 31 | // Properties 32 | this.items = items; 33 | this.params = { 34 | minClusterSize: params.minClusterSize || DEFAULTS.minClusterSize 35 | }; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/clustering/canopy.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman clustering/canopy 3 | * =========================== 4 | * 5 | * Canopy clustering implementation. 6 | */ 7 | import RecordLinkageClusterer from './abstract'; 8 | 9 | /** 10 | * Canopy Clusterer class. 11 | * 12 | * @constructor 13 | */ 14 | export class CanopyClusterer extends RecordLinkageClusterer { 15 | constructor(params, items) { 16 | super(params, items); 17 | 18 | // Validating the distance function 19 | if (typeof params.distance !== 'function') 20 | throw new Error('talisman/clustering/record-linkage/canopy: the given distance is not a function.'); 21 | 22 | // Validating the thresholds 23 | if (typeof params.loose !== 'number') 24 | throw new Error('talisman/clustering/record-linkage/canopy: the given loose distance is not a number.'); 25 | if (typeof params.tight !== 'number') 26 | throw new Error('talisman/clustering/record-linkage/canopy: the given tight distance is not a number.'); 27 | 28 | if (params.loose < params.tight) 29 | throw new Error('talisman/clustering/record-linkage/canopy: loose distance should be greater than tight distance.'); 30 | 31 | this.distance = params.distance; 32 | this.params.loose = params.loose; 33 | this.params.tight = params.tight; 34 | } 35 | 36 | run() { 37 | const itemsIndex = 
{}, 38 | clusters = []; 39 | 40 | for (let i = 0, l = this.items.length; i < l; i++) 41 | itemsIndex[i] = true; 42 | 43 | for (const k in itemsIndex) { 44 | const a = this.items[k]; 45 | 46 | // Starting a new canopy 47 | delete itemsIndex[k]; 48 | const cluster = [a]; 49 | 50 | // Comparing to other elements in the set 51 | for (const k2 in itemsIndex) { 52 | const b = this.items[k2], 53 | d = this.distance(a, b); 54 | 55 | if (d <= this.params.loose) 56 | cluster.push(b); 57 | 58 | if (d <= this.params.tight) 59 | delete itemsIndex[k2]; 60 | } 61 | 62 | clusters.push(cluster); 63 | } 64 | 65 | return clusters; 66 | } 67 | } 68 | 69 | /** 70 | * Shortcut function for the canopy clusterer. 71 | * 72 | * @param {object} params - Clusterer parameters. 73 | * @param {array} items - Items to cluster. 74 | */ 75 | export default function canopy(params, items) { 76 | const clusterer = new CanopyClusterer(params, items); 77 | 78 | return clusterer.run(); 79 | } 80 | -------------------------------------------------------------------------------- /src/clustering/leader.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman clustering/leader 3 | * =========================== 4 | * 5 | * The Leader clustering algorithm is a quite simple algorithm used to partition 6 | * arbitrary data and running in O(ln) time complexity, l being the number of 7 | * clusters. 8 | * 9 | * It's also important to note that the resulting partition might change with 10 | * the order of given items. 11 | */ 12 | import RecordLinkageClusterer from './abstract'; 13 | 14 | /** 15 | * Leader Clusterer class. 
16 | * 17 | * @constructor 18 | */ 19 | export class LeaderClusterer extends RecordLinkageClusterer { 20 | constructor(params, items) { 21 | super(params, items); 22 | 23 | // Validating the distance function 24 | if (typeof params.distance !== 'function') 25 | throw new Error('talisman/clustering/record-linkage/leader: the given distance is not a function.'); 26 | 27 | // Validating the thresholds 28 | if (typeof params.threshold !== 'number') 29 | throw new Error('talisman/clustering/record-linkage/leader: the given threshold is not a number.'); 30 | 31 | this.distance = params.distance; 32 | this.params.threshold = params.threshold; 33 | } 34 | 35 | run() { 36 | const clusters = []; 37 | 38 | for (let i = 0, l = this.items.length; i < l; i++) { 39 | const item = this.items[i]; 40 | 41 | let closestClusterIndex = null, 42 | closest = Infinity; 43 | 44 | for (let j = 0, m = clusters.length; j < m; j++) { 45 | const clusterLeader = clusters[j][0], 46 | distance = this.distance(clusterLeader, item); 47 | 48 | if (distance < closest) { 49 | closest = distance; 50 | closestClusterIndex = j; 51 | } 52 | } 53 | 54 | if (closest <= this.params.threshold) { 55 | clusters[closestClusterIndex].push(item); 56 | } 57 | else { 58 | clusters.push([item]); 59 | } 60 | } 61 | 62 | return clusters; 63 | } 64 | } 65 | 66 | /** 67 | * Shortcut function for the leader clusterer. 68 | * 69 | * @param {object} params - Clusterer parameters. 70 | * @param {array} items - Items to cluster. 
71 | */ 72 | export default function leader(params, items) { 73 | const clusterer = new LeaderClusterer(params, items); 74 | 75 | return clusterer.run(); 76 | } 77 | -------------------------------------------------------------------------------- /src/clustering/naive.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman clustering/naive 3 | * ========================== 4 | * 5 | * Naive clustering working by performing the n(n-1)/2 distance calculations 6 | * between all relevant pairs. Time complexity of such a clustering is therefore 7 | * O(n^2), which is quite bad. 8 | * 9 | * Note that the produced clusters are fuzzy. 10 | */ 11 | import RecordLinkageClusterer from './abstract'; 12 | import { 13 | handleSimilarityPolymorphisms, 14 | clustersFromArrayGraph 15 | } from './helpers'; 16 | 17 | /** 18 | * Naive Clusterer class. 19 | * 20 | * @constructor 21 | */ 22 | export class NaiveClusterer extends RecordLinkageClusterer { 23 | constructor(params, items) { 24 | super(params, items); 25 | handleSimilarityPolymorphisms(this, params); 26 | } 27 | 28 | run() { 29 | const graph = {}; 30 | 31 | // Iterating over the needed pairs 32 | for (let i = 0, l = this.items.length; i < l; i++) { 33 | const a = this.items[i]; 34 | 35 | for (let j = i + 1; j < l; j++) { 36 | const b = this.items[j]; 37 | 38 | if (this.similarity(a, b)) { 39 | graph[i] = graph[i] || []; 40 | graph[i].push(j); 41 | 42 | // NOTE: undirected link seems to be mandatory for it to work 43 | graph[j] = graph[j] || []; 44 | graph[j].push(i); 45 | } 46 | } 47 | } 48 | 49 | // Computing clusters 50 | return clustersFromArrayGraph( 51 | this.items, 52 | graph, 53 | this.params.minClusterSize 54 | ); 55 | } 56 | } 57 | 58 | /** 59 | * Shortcut function for the naive clusterer. 60 | * 61 | * @param {object} params - Clusterer parameters. 62 | * @param {array} items - Items to cluster. 
63 | */ 64 | export default function naive(params, items) { 65 | const clusterer = new NaiveClusterer(params, items); 66 | 67 | return clusterer.run(); 68 | } 69 | -------------------------------------------------------------------------------- /src/clustering/vp-tree.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman clustering/vp-tree 3 | * ============================ 4 | * 5 | * Clustering method using a Vantage Point Tree (VPTree) to find the clusters 6 | * more efficiently. 7 | */ 8 | import VPTree from 'mnemonist/vp-tree'; 9 | import RecordLinkageClusterer from './abstract'; 10 | 11 | /** 12 | * Vantage Point Tree Clusterer class. 13 | * 14 | * @constructor 15 | */ 16 | export class VPTreeClusterer extends RecordLinkageClusterer { 17 | constructor(params, items) { 18 | super(params, items); 19 | 20 | // Validating radius 21 | if (typeof params.radius !== 'number') 22 | throw new Error('talisman/clustering/record-linkage/vp-tree: the given radius is not a number.'); 23 | 24 | // Validating the distance function 25 | if (typeof params.distance !== 'function') 26 | throw new Error('talisman/clustering/record-linkage/vp-tree: the given distance is not a function.'); 27 | 28 | // Properties 29 | this.radius = params.radius; 30 | this.distance = params.distance; 31 | } 32 | 33 | run() { 34 | 35 | // Building the tree 36 | const tree = new VPTree(this.distance, this.items); 37 | 38 | // Retrieving the clusters 39 | const clusters = [], 40 | visited = new Set(); 41 | 42 | for (let i = 0, l = this.items.length; i < l; i++) { 43 | const item = this.items[i]; 44 | 45 | if (visited.has(item)) 46 | continue; 47 | 48 | const neighbors = tree.neighbors(this.radius, item); 49 | 50 | const cluster = new Array(neighbors.length); 51 | 52 | for (let j = 0, m = neighbors.length; j < m; j++) { 53 | visited.add(neighbors[j].item); 54 | cluster[j] = neighbors[j].item; 55 | } 56 | 57 | if (cluster.length >= 
this.params.minClusterSize) 58 | clusters.push(cluster); 59 | } 60 | 61 | return clusters; 62 | } 63 | } 64 | 65 | /** 66 | * Shortcut function for the vantage point tree clusterer. 67 | * 68 | * @param {object} params - Clusterer parameters. 69 | * @param {array} items - Items to cluster. 70 | */ 71 | export default function vpTree(params, items) { 72 | const clusterer = new VPTreeClusterer(params, items); 73 | 74 | return clusterer.run(); 75 | } 76 | -------------------------------------------------------------------------------- /src/hash/crc32.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman hash/crc32 3 | * ==================== 4 | * 5 | * JavaScript implementation of the CRC32 hash for UTF-8 strings. 6 | * 7 | * [Reference]: https://en.wikipedia.org/wiki/Cyclic_redundancy_check 8 | */ 9 | 10 | /** 11 | * Constants. 12 | */ 13 | const TABLE = new Int32Array(256); 14 | 15 | for (let c = 0, n = 0; n !== 256; n++) { 16 | c = n; 17 | c = ((c & 1) ? (-306674912 ^ (c >>> 1)) : (c >>> 1)); 18 | c = ((c & 1) ? (-306674912 ^ (c >>> 1)) : (c >>> 1)); 19 | c = ((c & 1) ? (-306674912 ^ (c >>> 1)) : (c >>> 1)); 20 | c = ((c & 1) ? (-306674912 ^ (c >>> 1)) : (c >>> 1)); 21 | c = ((c & 1) ? (-306674912 ^ (c >>> 1)) : (c >>> 1)); 22 | c = ((c & 1) ? (-306674912 ^ (c >>> 1)) : (c >>> 1)); 23 | c = ((c & 1) ? (-306674912 ^ (c >>> 1)) : (c >>> 1)); 24 | c = ((c & 1) ? (-306674912 ^ (c >>> 1)) : (c >>> 1)); 25 | TABLE[n] = c; 26 | } 27 | 28 | /** 29 | * Computes the CRC32 hash for the given UTF-8 string. 30 | * 31 | * @param {string} string - The string to hash. 32 | * @return {number} - The signed CRC32 hash. 
33 | */ 34 | export default function crc32(string) { 35 | let C = -1; 36 | 37 | for (let i = 0, l = string.length, c, d; i < l;) { 38 | c = string.charCodeAt(i++); 39 | 40 | if (c < 0x80) { 41 | C = (C >>> 8) ^ TABLE[(C ^ c) & 0xFF]; 42 | } 43 | else if (c < 0x800) { 44 | C = (C >>> 8) ^ TABLE[(C ^ (192 | ((c >> 6) & 31))) & 0xFF]; 45 | C = (C >>> 8) ^ TABLE[(C ^ (128 | (c & 63))) & 0xFF]; 46 | } 47 | else if (c >= 0xD800 && c < 0xE000) { 48 | c = (c & 1023) + 64; 49 | d = string.charCodeAt(i++) & 1023; 50 | C = (C >>> 8) ^ TABLE[(C ^ (240 | ((c >> 8) & 7))) & 0xFF]; 51 | C = (C >>> 8) ^ TABLE[(C ^ (128 | ((c >> 2) & 63))) & 0xFF]; 52 | C = (C >>> 8) ^ TABLE[(C ^ (128 | ((d >> 6) & 15) | ((c & 3) << 4))) & 0xFF]; 53 | C = (C >>> 8) ^ TABLE[(C ^ (128 | (d & 63))) & 0xFF]; 54 | } 55 | else { 56 | C = (C >>> 8) ^ TABLE[(C ^ (224 | ((c >> 12) & 15))) & 0xFF]; 57 | C = (C >>> 8) ^ TABLE[(C ^ (128 | ((c >> 6) & 63))) & 0xFF]; 58 | C = (C >>> 8) ^ TABLE[(C ^ (128 | (c & 63))) & 0xFF]; 59 | } 60 | } 61 | 62 | return C ^ -1; 63 | } 64 | -------------------------------------------------------------------------------- /src/helpers/index.js: -------------------------------------------------------------------------------- 1 | /* eslint no-cond-assign: 0 */ 2 | /** 3 | * Talisman helpers 4 | * ================= 5 | * 6 | * Miscellaneous helper functions. 7 | */ 8 | 9 | /** 10 | * Function returning all the matches of a regular expression over the given 11 | * string. 12 | * 13 | * @param {RegExp} pattern - The regular expression to apply. 14 | * @param {string} string - The string to match. 15 | * @return {array} - An array of matches. 
16 | */ 17 | export function findall(pattern, string) { 18 | const matches = []; 19 | 20 | if (!pattern.global) { 21 | const result = pattern.exec(string); 22 | 23 | if (result) 24 | matches.push(result); 25 | 26 | return matches; 27 | } 28 | 29 | let match; 30 | while (match = pattern.exec(string)) 31 | matches.push(match); 32 | 33 | // Resetting state of the Regex for safety 34 | pattern.lastIndex = 0; 35 | 36 | return matches; 37 | } 38 | 39 | /** 40 | * Function normalizing the given variable into a proper array sequence. 41 | * 42 | * @param {mixed} target - The variable to normalize as a sequence. 43 | * @return {array} - The resulting sequence. 44 | */ 45 | export function seq(target) { 46 | return typeof target === 'string' ? target.split('') : target; 47 | } 48 | 49 | /** 50 | * Function squeezing the given sequence by dropping consecutive duplicates. 51 | * 52 | * Note: the name was actually chosen to mimic Ruby's naming since I did not 53 | * find any equivalent in other standard libraries. 54 | * 55 | * @param {mixed} target - The sequence to squeeze. 56 | * @return {array} - The resulting sequence. 57 | */ 58 | export function squeeze(target) { 59 | const isString = typeof target === 'string', 60 | sequence = seq(target), 61 | squeezed = [sequence[0]]; 62 | 63 | for (let i = 1, l = sequence.length; i < l; i++) { 64 | if (sequence[i] !== sequence[i - 1]) 65 | squeezed.push(sequence[i]); 66 | } 67 | 68 | return isString ? squeezed.join('') : squeezed; 69 | } 70 | 71 | /** 72 | * Function creating an index of mapped letters. 73 | * 74 | * @param {string} first - First letters. 75 | * @param {string} second - Second letters. 76 | * @return {object} - The resulting index. 
77 | */ 78 | export function translation(first, second) { 79 | const index = {}; 80 | 81 | first = first.split(''); 82 | second = second.split(''); 83 | 84 | if (first.length !== second.length) 85 | throw Error('talisman/helpers#translation: given strings don\'t have the same length.'); 86 | 87 | for (let i = 0, l = first.length; i < l; i++) 88 | index[first[i]] = second[i]; 89 | 90 | return index; 91 | } 92 | -------------------------------------------------------------------------------- /src/helpers/vectors.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman helpers/vectors 3 | * ========================= 4 | * 5 | * Compilation of various helpers to deal with vectors. 6 | */ 7 | 8 | /** 9 | * Function creating a vector of n dimensions and filling it with a single 10 | * value if required. 11 | * 12 | * @param {number} n - Dimensions of the vector to create. 13 | * @param {mixed} fill - Value to be used to fill the vector. 14 | * @return {array} - The resulting vector. 15 | */ 16 | export function vec(n, fill) { 17 | const vector = new Array(n); 18 | 19 | if (arguments.length > 1) { 20 | for (let i = 0; i < n; i++) 21 | vector[i] = fill; 22 | } 23 | 24 | return vector; 25 | } 26 | 27 | /** 28 | * Function adding two vectors. 29 | * 30 | * @param {array} a - The first vector. 31 | * @param {array} b - The second vector. 32 | * @return {array} - The resulting vector. 33 | */ 34 | export function add(a, b) { 35 | const dimensions = a.length, 36 | vector = vec(dimensions); 37 | 38 | for (let i = 0; i < dimensions; i++) 39 | vector[i] = a[i] + b[i]; 40 | 41 | return vector; 42 | } 43 | 44 | /** 45 | * Function multiplying a vector & a scalar. 46 | * 47 | * @param {array} v - The first vector. 48 | * @param {array} s - The scalar. 49 | * @return {array} - The resulting vector. 
50 | */ 51 | export function scale(v, s) { 52 | const dimensions = v.length, 53 | vector = vec(dimensions); 54 | 55 | for (let i = 0; i < dimensions; i++) 56 | vector[i] = v[i] * s; 57 | 58 | return vector; 59 | } 60 | 61 | /** 62 | * Function returning the mean of a list of vectors. 63 | * 64 | * @param {array} vectors - The list of vectors to process. 65 | * @return {array} - A mean vector. 66 | */ 67 | export function mean(vectors) { 68 | const sum = vec(vectors[0].length, 0); 69 | 70 | for (let i = 0, l = vectors.length; i < l; i++) { 71 | const vector = vectors[i]; 72 | 73 | for (let j = 0, m = vector.length; j < m; j++) 74 | sum[j] += vector[j]; 75 | } 76 | 77 | for (let i = 0, l = sum.length; i < l; i++) 78 | sum[i] /= vectors.length; 79 | 80 | return sum; 81 | } 82 | 83 | /** 84 | * Function returning the scalar product of two vectors. 85 | * 86 | * @param {array} a - The first vector. 87 | * @param {array} b - The second vector. 88 | * @return {number} - The scalar product. 89 | */ 90 | export function dot(a, b) { 91 | let product = 0; 92 | 93 | for (let i = 0, l = a.length; i < l; i++) 94 | product += (a[i] * b[i]); 95 | 96 | return product; 97 | } 98 | -------------------------------------------------------------------------------- /src/inflectors/spanish/index.js: -------------------------------------------------------------------------------- 1 | import {singularize} from './noun'; 2 | 3 | export { 4 | singularize 5 | }; 6 | -------------------------------------------------------------------------------- /src/keyers/fingerprint.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman keyers/fingerprint 3 | * ============================ 4 | * 5 | * Keyer based on the fingerprint tokenizer. 
6 | */ 7 | import {createTokenizer} from '../tokenizers/fingerprint'; 8 | import nameFingerprintTokenizer from '../tokenizers/fingerprint/name'; 9 | 10 | export function createKeyer(options) { 11 | options = options || {}; 12 | 13 | const tokenizer = createTokenizer(options); 14 | 15 | if (options.ngrams) 16 | return (n, string) => tokenizer(n, string).join(''); 17 | 18 | return string => tokenizer(string).join(' '); 19 | } 20 | 21 | export default createKeyer(); 22 | 23 | const ngramsFingerprint = createKeyer({ngrams: true}); 24 | 25 | const nameFingerprint = name => nameFingerprintTokenizer(name).join(' '); 26 | 27 | export {ngramsFingerprint, nameFingerprint}; 28 | -------------------------------------------------------------------------------- /src/keyers/name-sig.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman keyers/name-sig 3 | * ========================= 4 | * 5 | * The Name Significance "NameSig" similarity key. The algorithm is slightly 6 | * modified to align itself with the string fingerprint. 7 | * 8 | * [Article]: 9 | * Similarity Analysis of Patients’ Data: Bangladesh Perspective. 10 | * Shahidul Islam Khan, Abu Sayed Md. Latiful Hoque. 11 | * December 17, 2016 12 | */ 13 | import deburr from 'lodash/deburr'; 14 | 15 | /** 16 | * Constants. 17 | */ 18 | const TITLES = [ 19 | 'Dr', 20 | 'Jr', 21 | 'Md', 22 | 'Mgr', 23 | 'Mr', 24 | 'Mrs', 25 | 'Ms', 26 | 'Mme', 27 | 'Mlle', 28 | 'M', 29 | 'Prof', 30 | 'Phd', 31 | 'St', 32 | 'Sree', 33 | 'Sr' 34 | ]; 35 | 36 | const TITLE_REGEX = new RegExp(`(?:${TITLES.join('|')})\\.?\\s+`, 'gi'), 37 | UNDESIRABLES_REGEX = /[^a-z]/g, 38 | VOWELS_REGEX = /(\S)[aeiou]+/g, 39 | WHITESPACE_REGEX = /\s/g; 40 | 41 | const CONVERSIONS = [ 42 | [/[jz]/g, 'g'], 43 | [/[qc]/g, 'k'] 44 | ]; 45 | 46 | /** 47 | * Function taking a name string and returning its NameSig key. 48 | * 49 | * @param {string} name - Target name string. 50 | * @return {string} - The NameSig key. 
/**
 * Function taking a name string and returning its NameSig key.
 *
 * @param {string} name - Target name string.
 * @return {string} - The NameSig key.
 */
export default function nameSig(name) {

  // Deburr & lowercase first, then strip titles, non-letters,
  // ambiguous vowels & whitespace — the order matters
  let key = deburr(name)
    .toLowerCase()
    .replace(TITLE_REGEX, '')
    .replace(UNDESIRABLES_REGEX, '')
    .replace(VOWELS_REGEX, '$1')
    .replace(WHITESPACE_REGEX, '');

  // Applying letter conversions (j/z -> g, q/c -> k)
  for (let i = 0, l = CONVERSIONS.length; i < l; i++)
    key = key.replace(CONVERSIONS[i][0], CONVERSIONS[i][1]);

  return key;
}
/**
 * Function creating a normalizer function.
 *
 * @param {object} params - Options:
 * @param {boolean} keepAccents - Whether to keep accents.
 * @param {boolean} keepCase - Whether to keep the case.
 * @return {function}
 */
export function createNormalizer(params) {
  const options = params || {};

  const keepAccents = options.keepAccents === true;
  const keepCase = options.keepCase === true;

  /**
   * Function returning a normalized string.
   *
   * @param {string} string - String to normalize.
   * @return {string}
   */
  return function normalizer(string) {
    let result = keepCase ? string : string.toLowerCase();

    // Compressing whitespace, dropping control characters & harmonizing
    // the various typographic variants of quotes, hyphens & commas
    result = result
      .trim()
      .replace(WHITESPACE_COMPRESSION, ' ')
      .replace(CONTROL_CHARACTERS, '')
      .replace(SINGLE_QUOTES, '\'')
      .replace(DOUBLE_QUOTES, '"')
      .replace(HYPHENS, '-')
      .replace(COMMAS, ',');

    for (const [pattern, replacement] of CONVERSIONS)
      result = result.replace(pattern, replacement);

    return keepAccents ? result : deburr(result);
  };
}
/**
 * Omission key function.
 *
 * @param {string} string - Target string.
 * @return {string} - The omission key.
 */
export default function omission(string) {

  // Deburring, then keeping only uppercase ASCII letters
  const normalized = deburr(string)
    .toUpperCase()
    .replace(UNDESIRABLES, '');

  const letters = new Set(normalized);

  let key = '';

  // Consonants first, in their fixed frequency order
  for (const consonant of CONSONANTS) {
    if (letters.has(consonant))
      key += consonant;
  }

  // Then vowels, in the order they first appeared in the word
  const seenVowels = new Set();

  for (const letter of normalized) {
    if (!CONSONANTS_SET.has(letter) && !seenVowels.has(letter)) {
      seenVowels.add(letter);
      key += letter;
    }
  }

  return key;
}
/**
 * Helpers.
 */
function consume(set) {
  return Array.from(set).join('');
}

/**
 * Skeleton key function.
 *
 * @param {string} string - Target string.
 * @return {string} - The skeleton key.
 */
export default function skeleton(string) {

  // Deburring, then keeping only uppercase ASCII letters
  const normalized = deburr(string)
    .toUpperCase()
    .replace(UNDESIRABLES, '');

  if (!normalized)
    return '';

  const firstLetter = normalized[0];

  const consonants = new Set();
  const vowels = new Set();

  // Collecting remaining unique letters, ignoring the first one
  for (const letter of normalized.slice(1)) {
    if (letter === firstLetter)
      continue;

    (VOWELS.has(letter) ? vowels : consonants).add(letter);
  }

  // First letter, then unique consonants, then unique vowels
  return firstLetter + consume(consonants) + consume(vowels);
}
/**
 * Function returning the bag distance, i.e. the max of the size of the
 * difference of multiset a & multiset b and the size of the difference
 * of multiset b & multiset a.
 *
 * @param {mixed} a - The first sequence.
 * @param {mixed} b - The second sequence.
 * @return {number} - The bag distance.
 */
export default function bag(a, b) {
  if (a === b)
    return 0;

  let da = a.length;
  let db = b.length;

  if (!da)
    return db;
  if (!db)
    return da;

  // Building both multisets (null-prototype objects as counters)
  const ma = Object.create(null);
  const mb = Object.create(null);

  for (let i = 0; i < da; i++) {
    const value = a[i];

    ma[value] = (ma[value] || 0) + 1;
  }

  for (let i = 0; i < db; i++) {
    const value = b[i];

    mb[value] = (mb[value] || 0) + 1;
  }

  // Subtracting the size of each multiset intersection
  for (const k in ma) {
    if (mb[k])
      da -= Math.min(ma[k], mb[k]);
  }

  for (const k in mb) {
    if (ma[k])
      db -= Math.min(mb[k], ma[k]);
  }

  return Math.max(da, db);
}
/**
 * Function returning the Canberra distance between two vectors.
 *
 * Terms where both coordinates are 0 contribute 0 to the distance
 * instead of producing NaN through a 0/0 division, which matches the
 * usual convention (e.g. scipy.spatial.distance.canberra).
 *
 * @param {mixed} a - The first vector.
 * @param {mixed} b - The second vector.
 * @return {number} - The Canberra distance between a & b.
 *
 * @throws {Error} The function expects vectors of same dimension.
 */
export default function canberra(a, b) {
  if (a.length !== b.length)
    throw Error('talisman/metrics/distance/canberra: the given vectors are not of the same dimension.');

  let distance = 0;

  for (let i = 0, l = a.length; i < l; i++) {
    const denominator = Math.abs(a[i]) + Math.abs(b[i]);

    // Skipping 0/0 terms to avoid poisoning the sum with NaN
    if (denominator)
      distance += Math.abs(a[i] - b[i]) / denominator;
  }

  return distance;
}
/**
 * Function returning the cosine similarity between two vectors.
 *
 * @param {mixed} a - The first vector.
 * @param {mixed} b - The second vector.
 * @return {number} - The cosine similarity between a & b.
 *
 * @throws {Error} The function expects vectors of same dimension.
 */
export default function cosine(a, b) {
  if (a.length !== b.length)
    throw Error('talisman/metrics/distance/cosine: the given vectors are not of the same dimension.');

  let normA = 0;
  let normB = 0;
  let dotProduct = 0;

  for (let i = 0, l = a.length; i < l; i++) {
    normA += a[i] * a[i];
    normB += b[i] * b[i];
    dotProduct += a[i] * b[i];
  }

  // dot(a, b) / (|a| * |b|)
  return dotProduct / Math.sqrt(normA * normB);
}
/**
 * Dice coefficient is just the Tversky index with alpha = beta = 0.5,
 * computed over the sequences' bigrams.
 */
const dice = function(x, y) {

  // Identical sequences
  if (x === y)
    return 1;

  // Two distinct single-element sequences cannot share any bigram
  if (x.length === 1 && y.length === 1)
    return 0;

  // Delegating to Tversky over the sequences' bigrams
  return tversky({alpha: 0.5, beta: 0.5}, bigrams(x), bigrams(y));
};
/**
 * Function returning the euclidean distance between two vectors, i.e.
 * the square root of the squared euclidean distance.
 *
 * @param {mixed} a - The first vector.
 * @param {mixed} b - The second vector.
 * @return {number} - The euclidean distance between a & b.
 *
 * @throws {Error} The function expects vectors of same dimension.
 */
export default function euclidean(a, b) {
  const squaredDistance = squared(a, b);

  return Math.sqrt(squaredDistance);
}
/**
 * Function returning the distance between two strings hashed by Eudex.
 *
 * The two hashes are xored, then each byte's popcount is weighted by
 * its significance: 1 for the lowest byte, then 2, 4, ..., 128.
 *
 * @param {mixed} a - The first string.
 * @param {mixed} b - The second string.
 * @return {number} - The distance.
 */
export function distance(a, b) {
  const xored = eudex(a).xor(eudex(b));

  // Lowest byte carries weight 1
  let sum = bits(xored.and(0xFF));

  // Remaining bytes carry weight 2^i
  for (let i = 1; i < 8; i++) {
    const weighted = bits(xored.shiftRight(8 * i).and(0xFF)).mul(1 << i);

    sum = sum.add(weighted);
  }

  return sum.low;
}
/**
 * Function returning the Guth distance between two sequences.
 *
 * @param {mixed} a - The first sequence to process.
 * @param {mixed} b - The second sequence to process.
 * @return {number} - The Guth distance between a & b.
 */
export default function guth(a, b) {
  if (a === b)
    return 0;

  // Swapping so that a is the shortest sequence
  if (a.length > b.length) {
    const swap = a;
    a = b;
    b = swap;
  }

  const la = a.length;
  const lb = b.length;

  let distance = 0;

  for (let i = 0; i < lb; i++) {

    // Early termination when b is really longer than a
    if (i > la + 1) {
      distance += lb - i;
      break;
    }

    // A position matches if the letters agree in a window of +/- 2
    const matched =
      a[i] === b[i] ||
      (i + 1 < lb && a[i] === b[i + 1]) ||
      (i + 2 < lb && a[i] === b[i + 2]) ||
      (i && a[i] === b[i - 1]) ||
      (i && a[i - 1] === b[i]) ||
      (i + 1 < la && a[i + 1] === b[i]) ||
      (i + 2 < la && a[i + 2] === b[i]) ||
      (i + 1 < la && i + 1 < lb && a[i + 1] === b[i + 1]) ||
      (i + 2 < la && i + 2 < lb && a[i + 2] === b[i + 2]);

    if (!matched)
      distance++;
  }

  return distance;
}
/**
 * Identity similarity.
 *
 * @param {array|string} a - First sequence.
 * @param {array|string} b - Second sequence.
 * @param {number} - Similarity between 0 & 1.
 */
export function similarity(a, b) {

  // Strings can be compared directly
  if (typeof a === 'string')
    return a === b ? 1 : 0;

  // Same reference
  if (a === b)
    return 1;

  if (a.length !== b.length)
    return 0;

  // Element-wise comparison
  let i = a.length;

  while (i--) {
    if (a[i] !== b[i])
      return 0;
  }

  return 1;
}
/**
 * Function returning the Jaccard similarity score between two sequences.
 *
 * @param {mixed} a - The first sequence.
 * @param {mixed} b - The second sequence.
 * @return {number} - The Jaccard similarity score between a & b.
 */
function jaccard(a, b) {
  if (a === b)
    return 1;

  if (!a.length || !b.length)
    return 0;

  // Plain objects used as sets (keys are stringified elements)
  const seenA = {};
  const seenB = {};

  let intersection = 0;
  let sizeA = 0;
  let sizeB = 0;

  for (let i = 0, l = a.length; i < l; i++) {
    if (!Object.prototype.hasOwnProperty.call(seenA, a[i])) {
      seenA[a[i]] = true;
      sizeA++;
    }
  }

  for (let i = 0, l = b.length; i < l; i++) {
    if (!Object.prototype.hasOwnProperty.call(seenB, b[i])) {
      seenB[b[i]] = true;
      sizeB++;

      // Counting intersection on first sight of each element of b
      if (Object.prototype.hasOwnProperty.call(seenA, b[i]))
        intersection++;
    }
  }

  // |A ∪ B| = |A| + |B| - |A ∩ B|
  return intersection / (sizeA + sizeB - intersection);
}
/**
 * Function returning the Jaro-Winkler score between two sequences.
 *
 * @param {object} options - Custom options.
 * @param {mixed} a - The first sequence.
 * @param {mixed} b - The second sequence.
 * @return {number} - The Jaro-Winkler score between a & b.
 */
function customJaroWinkler(options, a, b) {
  const {
    boostThreshold = 0.7,
    scalingFactor = 0.1
  } = options || {};

  if (scalingFactor > 0.25)
    throw Error('talisman/metrics/distance/jaro-winkler: the scaling factor should not exceed 0.25.');

  if (boostThreshold < 0 || boostThreshold > 1)
    throw Error('talisman/metrics/distance/jaro-winkler: the boost threshold should be comprised between 0 and 1.');

  // Fast break
  if (a === b)
    return 1;

  const score = jaro(a, b);

  // The prefix boost only applies above the threshold
  if (score < boostThreshold)
    return score;

  // Counting the common prefix, up to 4 characters
  const limit = Math.min(a.length, b.length, 4);

  let prefix = 0;

  while (prefix < limit && a[prefix] === b[prefix])
    prefix++;

  return score + prefix * scalingFactor * (1 - score);
}
/**
 * Length similarity, i.e. the ratio of the shorter length over the
 * longer length.
 *
 * @param {array|string} a - First sequence.
 * @param {array|string} b - Second sequence.
 */
export function similarity(a, b) {
  if (a === b)
    return 1;

  const la = a.length;
  const lb = b.length;

  if (!la || !lb)
    return 0;

  return Math.min(la, lb) / Math.max(la, lb);
}
/**
 * LIG2 similarity metric.
 *
 * @param {string|array} a - First sequence.
 * @param {string|array} b - Second sequence.
 * @return {number}
 */
export function lig2(a, b) {
  if (a === b)
    return 1;

  // Swapping so that a is the shortest sequence
  if (a.length > b.length) {
    const swap = a;
    a = b;
    b = swap;
  }

  // C is the edit cost, I the number of "agreements"
  const C = levenshtein(a, b);
  const I = b.length - C;

  return I / (I + C);
}
/**
 * Function returning the Manhattan distance between two vectors.
 *
 * @param {mixed} a - The first vector.
 * @param {mixed} b - The second vector.
 * @return {number} - The Manhattan distance between a & b.
 *
 * @throws {Error} The function expects vectors of same dimension.
 */
export default function manhattan(a, b) {
  const dimensions = a.length;

  if (dimensions !== b.length)
    throw Error('talisman/metrics/distance/manhattan: the given vectors are not of the same dimension.');

  let sum = 0;

  for (let i = 0; i < dimensions; i++)
    sum += Math.abs(a[i] - b[i]);

  return sum;
}
/**
 * Function returning the similarity between two MinHash signatures,
 * i.e. the proportion of positions on which both signatures agree.
 *
 * @param {mixed} a - The first signature.
 * @param {mixed} b - The second signature.
 * @return {number} - The similarity between a & b.
 *
 * @throws {Error} The function expects signatures of same length.
 */
export function similarity(a, b) {
  if (a.length !== b.length)
    throw Error('talisman/metrics/distance/minhash: the given signatures are not of same length.');

  const length = a.length;

  let matches = 0;

  for (let i = 0; i < length; i++) {
    if (a[i] === b[i])
      matches++;
  }

  return matches / length;
}
/**
 * Function returning the Minkowski distance between two vectors: the p-th
 * root of the sum of the absolute coordinate differences raised to p.
 *
 * @param {number} p - The value for p.
 * @param {mixed} a - The first vector.
 * @param {mixed} b - The second vector.
 * @return {number} - The Minkowski distance between a & b.
 *
 * @throws {Error} The function expects a p value >= 1.
 * @throws {Error} The function expects vectors of same dimension.
 */
export default function minkowski(p, a, b) {
  if (p < 1)
    throw Error('talisman/metrics/distance/minkowski: the given p-value should be >= 1.');

  if (a.length !== b.length)
    throw Error('talisman/metrics/distance/minkowski: the given vectors are not of the same dimension.');

  let total = 0;

  for (let i = a.length - 1; i >= 0; i--)
    total += Math.pow(Math.abs(a[i] - b[i]), p);

  return Math.pow(total, 1 / p);
}
/**
 * Function computing the Monge-Elkan similarity: for every item of the
 * source sequence we keep its best similarity against the target's items,
 * then average those maxima. Note that the measure is asymmetric.
 *
 * @param {function} similarity - Similarity function to use.
 * @param {array|string} source - Source sequence.
 * @param {array|string} target - Target sequence.
 * @return {number} - Monge-Elkan similarity.
 */
export default function mongeElkan(similarity, source, target) {
  if (source === target)
    return 1;
  if (!source.length && !target.length)
    return 1;
  if (!source.length || !target.length)
    return 0;

  let total = 0;

  for (let i = 0, l = source.length; i < l; i++) {
    const item = source[i];

    let best = -Infinity;

    // Best match of the current source item within the target
    for (let j = 0, m = target.length; j < m; j++)
      best = Math.max(best, similarity(item, target[j]));

    total += best;
  }

  return total / source.length;
}

/**
 * Function computing the symmetric Monge-Elkan similarity.
 * This is achieved by computing the mean of me(a, b) & me(b, a).
 */
export function symmetric(similarity, source, target) {
  return (
    mongeElkan(similarity, source, target) +
    mongeElkan(similarity, target, source)
  ) / 2;
}

/**
 * Aliases.
 */
const similarity = mongeElkan;

export {
  similarity
};
/**
 * Prefix similarity: the ratio of the length of the common prefix to the
 * length of the shortest sequence.
 *
 * @param {array|string} a - First sequence.
 * @param {array|string} b - Second sequence.
 * @return {number} - Similarity between 0 & 1.
 */
export function similarity(a, b) {
  if (a === b)
    return 1;

  if (!a.length || !b.length)
    return 0;

  // Ensuring a is the shortest sequence
  if (a.length > b.length)
    [a, b] = [b, a];

  const shortest = a.length;

  let matching = 0;

  // Walking both sequences until they diverge
  while (matching < shortest && a[matching] === b[matching])
    matching++;

  return matching / shortest;
}
/**
 * Prefix distance, defined as 1 minus the prefix similarity above.
 *
 * @param {array|string} a - First sequence.
 * @param {array|string} b - Second sequence.
 * @return {number} - Distance between 0 & 1.
 */
export function distance(a, b) {
  return 1 - similarity(a, b);
}
/**
 * Function returning the Smith-Waterman local-alignment score between two
 * sequences. The dynamic-programming matrix is stored as m + 1 rows of only
 * two columns: column `j % 2` is the one currently being filled while
 * `(j - 1) % 2` holds the previous column, keeping memory linear in `a`.
 *
 * @param {object} options - Options:
 * @param {number} gap - Gap cost (defaults to 1).
 * @param {function} similarity - Similarity function (defaults to 1/0 strict equality).
 * @param {mixed} a - The first sequence to process.
 * @param {mixed} b - The second sequence to process.
 * @return {number} - The Smith-Waterman score between a & b.
 */
export function score(options, a, b) {
  const {gap = 1, similarity = SIMILARITY} = options;

  // Early terminations
  // NOTE(review): this shortcut assumes similarity(x, x) === 1; with a
  // custom similarity scoring matches differently the result may diverge
  // from the full DP — confirm.
  if (a === b)
    return a.length;

  const m = a.length,
        n = b.length;

  // An empty sequence cannot align with anything
  if (!m || !n)
    return 0;

  // TODO: Possibility to optimize for common prefix, but need to know max substitution cost

  // d[i] is the two-column rolling window for row i
  const d = new Array(m + 1);

  // D tracks the running maximum cell, i.e. the final local-alignment score
  let D = 0;

  // The matrix's first column is all zeros
  for (let i = 0; i <= m; i++) {
    d[i] = new Array(2);
    d[i][0] = 0;
  }

  for (let j = 1; j <= n; j++) {

    // The matrix's first row is also zero
    d[0][j % 2] = 0;

    for (let i = 1; i <= m; i++) {
      const cost = similarity(a[i - 1], b[j - 1]);

      d[i][j % 2] = Math.max(
        0, // Start over (a local alignment may restart anywhere)
        d[i - 1][(j - 1) % 2] + cost, // Substitution
        d[i - 1][j % 2] - gap, // Insertion
        d[i][(j - 1) % 2] - gap // Deletion
      );

      // Storing max
      if (d[i][j % 2] > D)
        D = d[i][j % 2];
    }
  }

  return D;
}
/**
 * Suffix similarity: the ratio of the length of the common suffix to the
 * length of the shortest sequence.
 *
 * @param {array|string} a - First sequence.
 * @param {array|string} b - Second sequence.
 * @return {number} - Similarity between 0 & 1.
 */
export function similarity(a, b) {
  if (a === b)
    return 1;

  if (!a.length || !b.length)
    return 0;

  // Ensuring a is the shortest sequence
  if (a.length > b.length)
    [a, b] = [b, a];

  const la = a.length,
        lb = b.length;

  let matching = 0;

  // Walking both sequences backwards until they diverge
  while (
    matching < la &&
    a[la - matching - 1] === b[lb - matching - 1]
  )
    matching++;

  return matching / la;
}

/**
 * Suffix distance, defined as 1 minus the suffix similarity.
 *
 * @param {array|string} a - First sequence.
 * @param {array|string} b - Second sequence.
 * @return {number} - Distance between 0 & 1.
 */
export function distance(a, b) {
  return 1 - similarity(a, b);
}
/**
 * Function taking a CONLL corpus' text and returning an array of sentences
 * being arrays of (word, brill_tag, wsj_tag). Sentences are separated by
 * blank lines; each non-blank line holds one space-delimited token triple.
 *
 * @param {string} text - The text to parse.
 * @return {array} - The tokens.
 */
export default function conll(text) {
  const sentences = [];

  let current = [];

  text.split('\n').forEach(line => {
    if (line) {
      current.push(line.split(' '));
    }
    else if (current.length) {
      // Blank line: closing the pending sentence
      sentences.push(current);
      current = [];
    }
  });

  // Flushing a trailing sentence not followed by a blank line
  if (current.length)
    sentences.push(current);

  return sentences;
}
/**
 * Function taking a single name and computing its French Soundex code.
 *
 * @param {string} name - The name to process.
 * @return {string} - The Soundex code.
 *
 * @throws {Error} The function expects the name to be a string.
 */
export default function soundex(name) {
  if (typeof name !== 'string')
    throw Error('talisman/phonetics/french/soundex: the given name is not a string.');

  // Converting ç & œ before deburring (which would otherwise drop them),
  // then keeping only plain A-Z letters
  const prepared = deburr(
    name
      .toUpperCase()
      .replace(/Ç/g, 'S')
      .replace(/Œ/g, 'E')
  ).replace(/[^A-Z]/g, '');

  const firstLetter = prepared.charAt(0);

  // Encoding every letter after the first, skipping the 'D' (vowel-like) class
  let tail = '';

  for (let i = 1, l = prepared.length; i < l; i++) {
    const encoded = TRANSLATIONS[prepared[i]];

    if (encoded !== 'D')
      tail += encoded;
  }

  // Dropping the tail's first code if it duplicates the initial's own code
  if (tail.charAt(0) === TRANSLATIONS[firstLetter])
    tail = tail.slice(1);

  // Squeezing duplicates, dropping zeros & padding to 4 characters
  return pad(firstLetter + squeeze(tail).replace(/0/g, ''));
}
/**
 * Rules.
 */
// Multigraph substitutions, applied in order on the uppercased name before
// the single-character translation below.
const SUBSTITUTIONS = [
  [/(?:SC|SZ|CZ|TZ|TS)/g, 'C'],
  [/KS/g, 'X'],
  [/(?:PF|PH)/g, 'V'],
  [/QU/g, 'KW'],
  [/UE/g, 'Y'],
  [/AE/g, 'E'],
  [/OE/g, 'Ö'],
  [/E[IY]/g, 'AY'],
  [/EU/g, 'OY'],

  // '§' is an internal placeholder for the AU/OU "U"-like sound; the
  // translation table below maps it back to 'U'.
  [/AU/g, 'A§'],
  [/OU/g, '§']
];

// Character-for-character translation collapsing equivalent letters
// (e.g. Z/K/G/Q/Ç -> C, accented vowels -> their base vowel class).
const TRANSLATION = translation(
  'ZKGQÇÑßFWPTÁÀÂÃÅÄÆÉÈÊËIJÌÍÎÏÜݧÚÙÛÔÒÓÕØ',
  'CCCCCNSVVBDAAAAAEEEEEEYYYYYYYYUUUUOOOOÖ'
);

// Only these letters may appear in the final code; anything else is dropped.
const ACCEPTABLE_LETTERS = new Set('ABCDLMNORSUVWXYÖ');
/**
 * Function taking a single name and computing its phonem code.
 *
 * @param {string} name - The name to process.
 * @return {string} - The phonem code.
 *
 * @throws {Error} The function expects the name to be a string.
 */
export default function phonem(name) {
  if (typeof name !== 'string')
    throw Error('talisman/phonetics/german/phonem: the given name is not a string.');

  // 1 -- Applying the multigraph substitutions on the uppercased name
  let code = SUBSTITUTIONS.reduce(
    (current, [pattern, replacement]) => current.replace(pattern, replacement),
    name.toUpperCase()
  );

  // 2 -- Translating single characters
  let translated = '';

  for (const character of code)
    translated += TRANSLATION[character] || character;

  // 3 -- Squeezing consecutive duplicates
  translated = squeeze(translated);

  // 4 -- Keeping only acceptable letters
  code = '';

  for (const character of translated) {
    if (ACCEPTABLE_LETTERS.has(character))
      code += character;
  }

  return code;
}
/**
 * Function taking a single name and computing its lein code.
 *
 * @param {string} name - The name to process.
 * @return {string} - The lein code.
 *
 * @throws {Error} The function expects the name to be a string.
 */
export default function lein(name) {
  if (typeof name !== 'string')
    throw Error('talisman/phonetics/lein: the given name is not a string.');

  const prepared = deburr(name)
    .toUpperCase()
    .replace(/[^A-Z\s]/g, '');

  // 1-- Keeping the first letter
  const first = prepared[0];

  // 2-- Dropping vowels and Y, W & H from the remainder
  // 3-- Dropping consecutive duplicates and truncating to 4 characters
  const kept = squeeze(prepared.slice(1).replace(DROPPED, '')).slice(0, 4);

  // 4-- Translating the remaining consonants to digits
  let code = '';

  for (let i = 0, l = kept.length; i < l; i++)
    code += TRANSLATION[kept[i]] || kept[i];

  // Padding with zeros up to 4 characters
  return pad(first + code);
}
/**
 * Function taking a single word and computing its metaphone code.
 *
 * @param {string} word - The word to process.
 * @return {string} - The metaphone code.
 *
 * @throws {Error} The function expects the word to be a string.
 */
export default function metaphone(word) {
  if (typeof word !== 'string')
    throw Error('talisman/phonetics/metaphone: the given word is not a string.');

  // Deburring the string & dropping any non-alphabetical character
  const prepared = deburr(word).toLowerCase().replace(/[^a-z]/g, '');

  // Applying every rule, in order
  const code = RULES.reduce(
    (current, [pattern, replacement]) => current.replace(pattern, replacement),
    prepared
  );

  return code.toUpperCase();
}
/**
 * Function taking a single name and computing its MRA codex: the first
 * three letters of the reduced name plus its last three (shorter codices
 * keep their full tail).
 *
 * @param {string} name - The name to process.
 * @return {string} - The MRA codex.
 *
 * @throws {Error} The function expects the name to be a string.
 */
export default function mra(name) {
  if (typeof name !== 'string')
    throw Error('talisman/phonetics/mra: the given name is not a string.');

  // Preparing the name
  let codex = deburr(name)
    .toUpperCase()
    .replace(/[^A-Z]/g, '');

  // Dropping non-leading vowels
  codex = codex.charAt(0) + codex.slice(1).replace(/[AEIOU]/g, '');

  // Dropping consecutive consonants
  codex = squeeze(codex);

  // Returning the codex
  const offset = Math.min(3, codex.length - 3);

  // Using the standard String#slice rather than the deprecated String#substr;
  // when offset <= 0 (codex shorter than 3) both forms yield an empty tail.
  return codex.slice(0, 3) + codex.slice(codex.length - offset);
}
/**
 * Function taking a single name and computing its ONCA code: the name is
 * first compressed through NYSIIS, then the resulting code is run through
 * Soundex.
 *
 * @param {string} name - The name to process.
 * @return {string} - The ONCA code.
 */
export default function onca(name) {
  return soundex(nysiis(name));
}
/**
 * Function taking a single name and computing its SoundD code.
 *
 * @param {string} name - The name to process.
 * @return {string} - The SoundD code.
 *
 * @throws {Error} The function expects the name to be a string.
 */
export default function soundD(name) {
  if (typeof name !== 'string')
    throw Error('talisman/phonetics/sound-d: the given name is not a string.');

  name = deburr(name)
    .toUpperCase()
    .replace(/[^A-Z]/g, '');

  // Handling some initials
  if (INITIALS.has(name.slice(0, 2)))
    name = name.slice(1);
  else if (name[0] === 'X')
    name = 'S' + name.slice(1);
  else if (name.slice(0, 2) === 'WH')
    name = 'W' + name.slice(2);

  // Process the code for the name's tail
  let tail = '';

  for (let i = 0, l = name.length; i < l; i++) {
    const letter = name[i];

    // Handling 'DGE' & 'DGI'
    if (letter === 'D' &&
        name[i + 1] === 'G' &&
        (name[i + 2] === 'E' || name[i + 2] === 'I')) {
      tail += '2';
      i += 2;

      continue;
    }

    // Handling 'GH'
    if (letter === 'G' && name[i + 1] === 'H') {
      tail += '0';
      i++;

      continue;
    }

    // Fix: a plain 'D' was previously dropped entirely because the
    // `continue` sat outside the 'DGE'/'DGI' test; it must encode through
    // TRANSLATIONS (D -> '3') like every other consonant.
    tail += TRANSLATIONS[letter];
  }

  // Composing the code from the tail
  const code = squeeze(tail).replace(/0/g, '');

  return pad(code);
}
/**
 * Function escaping a string for insertion in a regular expression: every
 * character with a special meaning in a pattern gets backslash-prefixed.
 *
 * @param {string} string - The string to escape.
 * @return {string} - The escaped string.
 */
const SPECIAL_CHARACTERS = /([|\\{}()[\]^$+*?.\-])/g;

export function escapeRegexp(string) {
  return string.replace(SPECIAL_CHARACTERS, '\\$1');
}
/**
 * Function creating a fuzzy matching pattern from the given query: each
 * character becomes an escaped capture group, lazily separated by `.*?`.
 *
 * @param {string} query - The query to compile.
 * @return {string} - The created pattern.
 */
export function createFuzzyPattern(query) {
  const groups = [];

  // Iterating by UTF-16 unit, like String#split('') would
  for (let i = 0, l = query.length; i < l; i++)
    groups.push('(' + escapeRegexp(query[i]) + ')');

  return groups.join('.*?');
}
/**
 * Function stemming the given word using the "S-Stemmer", a conservative
 * stemmer only normalizing plural-looking endings ('-s', '-ies').
 *
 * @param {string} word - The word to stem.
 * @return {string} - The resulting stem.
 */
export default function sStemmer(word) {
  const length = word.length;

  // Too short, or not ending in 's': nothing to strip
  if (length < 3 || word[length - 1] !== 's')
    return word;

  const penultimate = word[length - 2];

  // '-us' & '-ss' endings are kept untouched
  if (penultimate === 'u' || penultimate === 's')
    return word;

  if (penultimate === 'e') {
    const antepenultimate = word[length - 3];

    // '-ies' becomes '-y', unless preceded by 'a' or 'e'
    if (length > 3 &&
        antepenultimate === 'i' &&
        word[length - 4] !== 'a' &&
        word[length - 4] !== 'e')
      return word.slice(0, -3) + 'y';

    // '-ies', '-aes', '-oes' & '-ees' endings are kept untouched
    if (antepenultimate === 'i' ||
        antepenultimate === 'a' ||
        antepenultimate === 'o' ||
        antepenultimate === 'e')
      return word;
  }

  // Default: stripping the final 's'
  return word.slice(0, -1);
}
/**
 * Function returning the fingerprint of the given name: the name is
 * lowercased, normalized through the RULES above, then tokenized and each
 * token squeezed of consecutive duplicate letters.
 *
 * @param {string} name - Target name.
 * @return {array} - The fingerprint's tokens.
 */
export default function nameFingerprint(name) {
  let normalized = name.toLowerCase();

  // Applying rules
  for (const [pattern, replacement] of RULES)
    normalized = normalized.replace(pattern, replacement);

  return tokenizer(normalized).map(squeeze);
}
/**
 * Function taking a sequence and computing its ngrams.
 *
 * @param  {number} n        - Nb of elements in the subsequence.
 * @param  {mixed}  sequence - The sequence to process (string or array).
 * @return {array}           - The array of resulting ngrams.
 *
 * @throws {Error} The function expects a positive n > 0.
 */
export default function ngrams(n, sequence) {
  if (n < 1)
    throw new Error('talisman/tokenizers/ngrams: first argument should be a positive integer > 0.');

  const isString = typeof sequence === 'string';

  const subsequences = [];

  // Sliding a window of length n over the sequence.
  for (let i = 0, l = sequence.length; i < l - n + 1; i++) {
    const subsequence = [];

    for (let j = 0; j < n; j++)
      subsequence.push(sequence[i + j]);

    // String input yields string grams, array input yields array grams.
    subsequences.push(isString ? subsequence.join('') : subsequence);
  }

  return subsequences;
}
/**
 * Splitting pattern: one line break (any of \n, \r, \r\n, \n\r), optional
 * blank space, then at least one more line break.
 */
const SPLITTER = /(?:\n\r|\r\n|\r|\n)[\t\s]*(?:\n\r|\r\n|\r|\n)+/;

/**
 * Function tokenizing raw text into a sequence of paragraphs.
 *
 * @param  {string} text - The text to tokenize.
 * @return {array}       - The tokens.
 */
export default function paragraphs(text) {
  return text.split(SPLITTER);
}
/**
 * Sentinel object used to pad the sequence's tail so that grams reaching
 * past the end can be generated, then filtered out.
 */
const SENTINEL = {};

/**
 * Function taking a sequence and computing its skipgrams.
 *
 * @param  {number} k        - Nb of elements to skip.
 * @param  {number} n        - Nb of elements in the subsequence.
 * @param  {mixed}  sequence - The sequence to process.
 * @return {array}           - The array of resulting skipgrams.
 *
 * @throws {Error} The function expects a positive k > 0.
 * @throws {Error} The function expects a positive n > 0.
 * @throws {Error} n should be greater than or equal to k.
 */
export default function skipgrams(k, n, sequence) {
  if (k < 1)
    throw new Error('talisman/tokenizers/skipgrams: `k` should be a positive integer > 0.');

  if (n < 1)
    throw new Error('talisman/tokenizers/skipgrams: `n` should be a positive integer > 0.');

  if (n < k)
    throw new Error('talisman/tokenizers/skipgrams: `n` should be greater than `k`.');

  const isString = typeof sequence === 'string';

  sequence = seq(sequence);

  // NOTE: should be n or k?
  const padding = vec(n, SENTINEL);

  const subsequences = [],
        grams = ngrams(n + k, sequence.concat(padding));

  for (let i = 0, l = grams.length; i < l; i++) {
    const head = grams[i][0],
          tail = grams[i].slice(1);

    // Every combination of n - 1 elements drawn from the gram's tail.
    const iterator = combinations(tail, n - 1);

    let step;

    while ((step = iterator.next(), !step.done)) {
      const skipTail = step.value;

      // Combinations ending in padding are artifacts: drop them.
      if (skipTail[skipTail.length - 1] === SENTINEL)
        continue;

      if (isString)
        subsequences.push(head + skipTail.join(''));
      else
        subsequences.push([head].concat(skipTail));
    }
  }

  return subsequences;
}
6 | * 7 | * [Reference]: https://github.com/lodash/lodash 8 | */ 9 | import words from 'lodash/words'; 10 | 11 | export default words; 12 | -------------------------------------------------------------------------------- /test/clustering/abstract.js: -------------------------------------------------------------------------------- 1 | /* eslint no-new: 0 */ 2 | /** 3 | * Talisman clustering/record-linkage/abstract tests 4 | * ================================================== 5 | * 6 | */ 7 | import assert from 'assert'; 8 | import RecordLinkageClusterer from '../../src/clustering/abstract'; 9 | 10 | describe('abstract', function() { 11 | 12 | it('should throw on invalid arguments.', function() { 13 | 14 | assert.throws(function() { 15 | new RecordLinkageClusterer(null); 16 | }, /params/); 17 | 18 | assert.throws(function() { 19 | new RecordLinkageClusterer({}, null); 20 | }, /items/); 21 | }); 22 | }); 23 | -------------------------------------------------------------------------------- /test/clustering/blocking.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman clustering/record-linkage/blocking tests 3 | * ================================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import levenshtein from '../../src/metrics/levenshtein'; 8 | import blocking from '../../src/clustering/blocking'; 9 | 10 | const DATA = [ 11 | 'abc', 12 | 'bde', 13 | 'bd', 14 | 'bde', 15 | 'bcde', 16 | 'abcde', 17 | 'ab' 18 | ]; 19 | 20 | describe('blocking', function() { 21 | 22 | it('should throw if the blocker function is invalid.', function() { 23 | assert.throws(function() { 24 | blocking({blocker: null, similarity: Function.prototype}, []); 25 | }, /blocker/); 26 | }); 27 | 28 | it('should correctly cluster data.', function() { 29 | const clusters = blocking({ 30 | block: a => a[0], 31 | distance: levenshtein, 32 | radius: 1 33 | }, DATA); 34 | 35 | assert.deepEqual(clusters, [ 36 | ['abc', 'ab'], 
37 | ['bde', 'bd', 'bde', 'bcde'] 38 | ]); 39 | }); 40 | 41 | it('should be possible to map items to multiple blocks.', function() { 42 | const clusters = blocking({ 43 | blocks: a => a.split(''), 44 | similarity: (a, b) => levenshtein(a, b) <= 1 45 | }, DATA); 46 | 47 | assert.deepEqual(clusters, [ 48 | ['abc', 'ab'], 49 | ['bde', 'bd', 'bde', 'bcde'], 50 | ['abcde', 'bcde'] 51 | ]); 52 | }); 53 | 54 | it('should provide the index of the item to the blocker function.', function() { 55 | const blocks = DATA.map(item => item[0]); 56 | 57 | const clusters = blocking({ 58 | block: (item, i) => blocks[i], 59 | distance: levenshtein, 60 | radius: 1 61 | }, DATA); 62 | 63 | assert.deepEqual(clusters, [ 64 | ['abc', 'ab'], 65 | ['bde', 'bd', 'bde', 'bcde'] 66 | ]); 67 | }); 68 | }); 69 | -------------------------------------------------------------------------------- /test/clustering/canopy.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman clustering/record-linkage/canopy tests 3 | * ================================================ 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import canopy from '../../src/clustering/canopy'; 8 | import levenshtein from '../../src/metrics/levenshtein'; 9 | 10 | const DATA = [ 11 | 'abc', 12 | 'ab', 13 | 'bd', 14 | 'bde', 15 | 'bcde', 16 | 'abcde', 17 | 'abcdef', 18 | 'abcdefg' 19 | ]; 20 | 21 | describe('canopy', function() { 22 | 23 | it('should throw if the arguments are invalid.', function() { 24 | assert.throws(function() { 25 | canopy({distance: null}, []); 26 | }, /distance/); 27 | 28 | assert.throws(function() { 29 | canopy({distance: Function.prototype}, []); 30 | }, /loose/); 31 | 32 | assert.throws(function() { 33 | canopy({distance: Function.prototype, loose: 8}, []); 34 | }, /tight/); 35 | 36 | assert.throws(function() { 37 | canopy({distance: Function.prototype, loose: 4, tight: 7}, []); 38 | }, /greater/); 39 | }); 40 | 41 | it('should correctly compute 
clusters.', function() { 42 | const clusters = canopy({ 43 | distance: levenshtein, 44 | loose: 2, 45 | tight: 1 46 | }, DATA); 47 | 48 | assert.deepEqual(clusters, [ 49 | ['abc', 'ab', 'bd', 'abcde'], 50 | ['bd', 'bde', 'bcde'], 51 | ['bcde', 'abcde', 'abcdef'], 52 | ['abcdef', 'abcdefg'] 53 | ]); 54 | }); 55 | }); 56 | -------------------------------------------------------------------------------- /test/clustering/helpers.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman clustering/record-linkage/helpers tests 3 | * ================================================= 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import {clustersFromArrayGraph} from '../../src/clustering/helpers'; 8 | 9 | describe('helpers', function() { 10 | 11 | describe('#.clustersFromArrayGraph', function() { 12 | 13 | it('should return correct clusters.', function() { 14 | const items = [ 15 | 'a', 16 | 'b', 17 | 'c', 18 | 'a', 19 | 'a', 20 | 'b', 21 | 'b', 22 | 'c' 23 | ]; 24 | 25 | const graph = { 26 | 0: [3, 4], 27 | 1: [5, 6], 28 | 2: [7], 29 | 3: [4], 30 | 5: [6] 31 | }; 32 | 33 | const clusters = clustersFromArrayGraph(items, graph, 2); 34 | 35 | assert.deepEqual(clusters, [ 36 | ['a', 'a', 'a'], 37 | ['b', 'b', 'b'], 38 | ['c', 'c'] 39 | ]); 40 | 41 | const limitedClusters = clustersFromArrayGraph(items, graph, 3); 42 | 43 | assert.deepEqual(limitedClusters, [ 44 | ['a', 'a', 'a'], 45 | ['b', 'b', 'b'] 46 | ]); 47 | }); 48 | }); 49 | }); 50 | -------------------------------------------------------------------------------- /test/clustering/leader.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman clustering/record-linkage/leader tests 3 | * ================================================ 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import leader from '../../src/clustering/leader'; 8 | import levenshtein from '../../src/metrics/levenshtein'; 9 | 10 | const 
BASIC_DATA = [ 11 | 1, 12 | 2, 13 | 3, 14 | 4, 15 | 10, 16 | 11, 17 | 24 18 | ]; 19 | 20 | const SHIFTED_BASIC_DATA = [ 21 | 3, 22 | 2, 23 | 1, 24 | 4, 25 | 10, 26 | 11, 27 | 24 28 | ]; 29 | 30 | const STRING_DATA = [ 31 | 'abc', 32 | 'abd', 33 | 'dbc', 34 | 'zyx', 35 | 'zxx', 36 | 'xxx' 37 | ]; 38 | 39 | describe('leader', function() { 40 | 41 | it('should throw if the arguments are invalid.', function() { 42 | assert.throws(function() { 43 | leader({distance: null}, []); 44 | }, /distance/); 45 | 46 | assert.throws(function() { 47 | leader({distance: Function.prototype}, []); 48 | }, /threshold/); 49 | }); 50 | 51 | it('should correctly compute clusters.', function() { 52 | const distance = (a, b) => Math.abs(a - b); 53 | 54 | let clusters = leader({ 55 | distance, 56 | threshold: 2 57 | }, BASIC_DATA); 58 | 59 | assert.deepEqual(clusters, [ 60 | [1, 2, 3], 61 | [4], 62 | [10, 11], 63 | [24] 64 | ]); 65 | 66 | clusters = leader({ 67 | distance, 68 | threshold: 2 69 | }, SHIFTED_BASIC_DATA); 70 | 71 | assert.deepEqual(clusters, [ 72 | [3, 2, 1, 4], 73 | [10, 11], 74 | [24] 75 | ]); 76 | 77 | clusters = leader({ 78 | distance: levenshtein, 79 | threshold: 1 80 | }, STRING_DATA); 81 | 82 | assert.deepEqual(clusters, [ 83 | ['abc', 'abd', 'dbc'], 84 | ['zyx', 'zxx'], 85 | ['xxx'] 86 | ]); 87 | }); 88 | }); 89 | -------------------------------------------------------------------------------- /test/clustering/naive.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman clustering/record-linkage/naive tests 3 | * =============================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import naive from '../../src/clustering/naive'; 8 | import levenshtein from '../../src/metrics/levenshtein'; 9 | 10 | const SIMPLE = [ 11 | 'a', 12 | 'b', 13 | 'c', 14 | 'a', 15 | 'a', 16 | 'b', 17 | 'b', 18 | 'c' 19 | ]; 20 | 21 | const CHAIN = [ 22 | 'abc', 23 | 'bcd', 24 | 'cde', 25 | 'def', 26 | 'efg', 27 | 
'fgh', 28 | 'ghi' 29 | ]; 30 | 31 | const COMPLEX = [ 32 | 'abc', 33 | 'abc', 34 | 'bde', 35 | 'bd', 36 | 'bde', 37 | 'bcde', 38 | 'abcde', 39 | 'abcdef', 40 | 'abcdefg' 41 | ]; 42 | 43 | describe('naive', function() { 44 | 45 | it('should correctly cluster basic data.', function() { 46 | const clusters = naive({ 47 | similarity: (a, b) => a === b 48 | }, SIMPLE); 49 | 50 | assert.deepEqual(clusters, [ 51 | ['a', 'a', 'a'], 52 | ['b', 'b', 'b'], 53 | ['c', 'c'] 54 | ]); 55 | }); 56 | 57 | it('should correctly cluster objects.', function() { 58 | const data = SIMPLE.map(value => ({value})); 59 | 60 | const clusters = naive({ 61 | similarity: (a, b) => a.value === b.value 62 | }, data); 63 | 64 | assert.deepEqual(clusters.map(c => c.map(d => d.value)), [ 65 | ['a', 'a', 'a'], 66 | ['b', 'b', 'b'], 67 | ['c', 'c'] 68 | ]); 69 | }); 70 | 71 | it('should correctly cluster chains.', function() { 72 | const clusters = naive({ 73 | distance: levenshtein, 74 | radius: 2 75 | }, CHAIN); 76 | 77 | assert.deepEqual(clusters, [ 78 | ['abc', 'bcd'], 79 | ['cde', 'bcd', 'def'], 80 | ['efg', 'def', 'fgh'], 81 | ['ghi', 'fgh'] 82 | ]); 83 | }); 84 | 85 | it('should correctly cluster complex data.', function() { 86 | const clusters = naive({ 87 | similarity: (a, b) => levenshtein(a, b) <= 2 88 | }, COMPLEX); 89 | 90 | assert.deepEqual(clusters, [ 91 | ['abc', 'abc', 'bd', 'abcde'], 92 | ['bde', 'bd', 'bde', 'bcde', 'abcde'], 93 | ['abcdef', 'bcde', 'abcde', 'abcdefg'] 94 | ]); 95 | }); 96 | }); 97 | -------------------------------------------------------------------------------- /test/clustering/nn-descent.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman clustering/record-linkage/nn-descent tests 3 | * ==================================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import nnDescent from '../../src/clustering/nn-descent'; 8 | 9 | describe('nn-descent', function() { 10 | 11 | it('should 
throw if the arguments are invalid.', function() { 12 | assert.throws(function() { 13 | nnDescent({similarity: null}, []); 14 | }, /similarity/); 15 | 16 | assert.throws(function() { 17 | nnDescent({similarity: Function.prototype, rng: 'test'}, []); 18 | }, /rng/); 19 | 20 | assert.throws(function() { 21 | nnDescent({rho: -25}, []); 22 | }, /rho/); 23 | 24 | assert.throws(function() { 25 | nnDescent({delta: -45}, []); 26 | }, /delta/); 27 | 28 | assert.throws(function() { 29 | nnDescent({maxIterations: -65}, []); 30 | }, /maxIterations/); 31 | 32 | assert.throws(function() { 33 | nnDescent({k: 0}, []); 34 | }, /k/); 35 | }); 36 | 37 | it('should correctly compute clusters.', function() { 38 | 39 | }); 40 | }); 41 | -------------------------------------------------------------------------------- /test/clustering/sorted-neighborhood.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman clustering/record-linkage/sorted-neighborhood tests 3 | * ================================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import levenshtein from '../../src/metrics/levenshtein'; 8 | import sortedNeighborhood from '../../src/clustering/sorted-neighborhood'; 9 | 10 | const DATA = [ 11 | 'aaa', 12 | 'zzz', 13 | 'bbb', 14 | 'bbb', 15 | 'aaa', 16 | 'zzz', 17 | 'aaz', 18 | 'zza', 19 | ]; 20 | 21 | describe('sorted-neighborhood', function() { 22 | 23 | it('should throw if the window is invalid.', function() { 24 | assert.throws(function() { 25 | sortedNeighborhood({window: null, similarity: Function.prototype}, []); 26 | }, /window/); 27 | }); 28 | 29 | it('should throw if the comparator functions are invalid.', function() { 30 | assert.throws(function() { 31 | sortedNeighborhood({window: 3, comparator: null, similarity: Function.prototype}, []); 32 | }, /comparator/); 33 | }); 34 | 35 | it('should correctly cluster data.', function() { 36 | const clusters = sortedNeighborhood({ 37 | comparator: (a, b) 
=> { 38 | if (a < b) 39 | return -1; 40 | if (a > b) 41 | return 1; 42 | return 0; 43 | }, 44 | distance: levenshtein, 45 | radius: 1, 46 | window: 1 47 | }, DATA); 48 | 49 | assert.deepEqual(clusters, [ 50 | ['aaa', 'aaa'], 51 | ['zzz', 'zza', 'zzz'], 52 | ['bbb', 'bbb'], 53 | ['aaz', 'aaa'] 54 | ]); 55 | }); 56 | 57 | it('should be possible to map items to multiple blocks.', function() { 58 | const clusters = sortedNeighborhood({ 59 | comparators: [ 60 | (a, b) => { 61 | if (a < b) 62 | return -1; 63 | if (a > b) 64 | return 1; 65 | return 0; 66 | }, 67 | (a, b) => { 68 | if (a < b) 69 | return 1; 70 | if (a > b) 71 | return -1; 72 | return 0; 73 | } 74 | ], 75 | distance: levenshtein, 76 | radius: 1, 77 | window: 1 78 | }, DATA); 79 | 80 | assert.deepEqual(clusters, [ 81 | ['aaa', 'aaa', 'aaz'], 82 | ['zzz', 'zza', 'zzz'], 83 | ['bbb', 'bbb'] 84 | ]); 85 | }); 86 | }); 87 | -------------------------------------------------------------------------------- /test/clustering/vp-tree.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman clustering/record-linkage/vp-tree tests 3 | * ================================================= 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import vpTree from '../../src/clustering/vp-tree'; 8 | import levenshtein from '../../src/metrics/levenshtein'; 9 | 10 | const CHAIN = [ 11 | 'abc', 12 | 'bcd', 13 | 'cde', 14 | 'def', 15 | 'efg', 16 | 'fgh', 17 | 'ghi' 18 | ]; 19 | 20 | const COMPLEX = [ 21 | 'abc', 22 | 'abc', 23 | 'bde', 24 | 'bd', 25 | 'bde', 26 | 'bcde', 27 | 'abcde', 28 | 'abcdef', 29 | 'abcdefg' 30 | ]; 31 | 32 | function serializeClusters(clusters) { 33 | const result = new Set(); 34 | 35 | clusters.forEach(cluster => { 36 | result.add(cluster.sort().join('$')); 37 | }); 38 | 39 | return result; 40 | } 41 | 42 | describe('vp-tree', function() { 43 | 44 | it('should throw if the arguments are invalid.', function() { 45 | assert.throws(function() { 46 | 
vpTree({distance: null, radius: 2}, []); 47 | }, /distance/); 48 | 49 | assert.throws(function() { 50 | vpTree({distance: Function.prototype, radius: null}, []); 51 | }, /radius/); 52 | }); 53 | 54 | it('should correctly cluster chains.', function() { 55 | const clusters = vpTree({ 56 | distance: levenshtein, 57 | radius: 2 58 | }, CHAIN); 59 | 60 | assert.deepStrictEqual(serializeClusters(clusters), serializeClusters([ 61 | ['bcd', 'abc'], 62 | ['def', 'cde', 'bcd'], 63 | ['fgh', 'efg', 'def'], 64 | ['ghi', 'fgh'] 65 | ])); 66 | }); 67 | 68 | it('should correctly cluster complex data.', function() { 69 | const clusters = vpTree({ 70 | distance: levenshtein, 71 | radius: 2 72 | }, COMPLEX); 73 | 74 | assert.deepStrictEqual(serializeClusters(clusters), serializeClusters([ 75 | ['abcde', 'bd', 'abc', 'abc'], 76 | ['abcde', 'bcde', 'bde', 'bd', 'bde'], 77 | ['abcdefg', 'abcdef', 'bcde', 'abcde'] 78 | ])); 79 | }); 80 | }); 81 | -------------------------------------------------------------------------------- /test/hash/crc32.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman helpers/crc32 tests 3 | * ============================= 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import crc32 from '../../src/hash/crc32'; 8 | 9 | describe('crc32', function() { 10 | 11 | it('should correctly hash the given strings.', function() { 12 | const tests = [ 13 | ['This is a string', 141976383], 14 | ['This is a string with éééà', 391581305], 15 | ['ßø⊂', -1838769021], 16 | ['\u2603', -1743909036] 17 | ]; 18 | 19 | tests.forEach(function([string, hash]) { 20 | assert.strictEqual(crc32(string), hash, `${string} => ${hash}`); 21 | }); 22 | }); 23 | }); 24 | -------------------------------------------------------------------------------- /test/hash/minhash.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman helpers/minhash tests 3 | * =============================== 4 | * 
5 | */ 6 | import assert from 'assert'; 7 | import createMinHash from '../../src/hash/minhash'; 8 | import {similarity, distance} from '../../src/metrics/minhash'; 9 | import seedrandom from 'seedrandom'; 10 | 11 | describe('minhash', function() { 12 | 13 | it('should produce the correct signature.', function() { 14 | const rng = seedrandom('shawarma'), 15 | minhash = createMinHash({rng, hashes: 6}); 16 | 17 | assert.deepEqual(Array.from(minhash('this is a string')), [ 18 | 75288857, 19 | 241855118, 20 | 149375312, 21 | 5249094, 22 | 339091736, 23 | 369835310 24 | ]); 25 | 26 | assert.deepEqual(Array.from(minhash(['this', 'is', 'a', 'string'])), [ 27 | -2497302731, 28 | -2872246020, 29 | -3540234138, 30 | -4187033817, 31 | -1454124627, 32 | -2422446200 33 | ]); 34 | }); 35 | 36 | it('should be possible to compute similarity between MinHash signatures.', function() { 37 | const rng = seedrandom('shawarma'), 38 | minhash = createMinHash({rng, hashes: 512}); 39 | 40 | const tests = [ 41 | ['abc', '', 0], 42 | ['', 'abc', 0], 43 | ['', '', 1], 44 | ['abc', 'abc', 1], 45 | ['abc', 'xyz', 0], 46 | ['night', 'nacht', 0.421875], 47 | ['context', 'contact', 0.55859375], 48 | [['mouse', 'eats', 'cheese'], ['cat', 'eats', 'mouse'], 0.48828125], 49 | ['ht', 'nacht', 0.376953125] 50 | ]; 51 | 52 | tests.forEach(function([a, b, j]) { 53 | const minA = minhash(a), 54 | minB = minhash(b); 55 | 56 | const s = similarity(minA, minB), 57 | d = distance(minA, minB); 58 | 59 | assert.strictEqual(s, j, `${a}/${b}`); 60 | assert.strictEqual(d, 1 - j, `${a}/${b}`); 61 | }); 62 | }); 63 | }); 64 | -------------------------------------------------------------------------------- /test/helpers.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman unit tests helpers 3 | * ============================ 4 | * 5 | */ 6 | import {default as parseCsv} from 'csv-parse/lib/sync'; 7 | import path from 'path'; 8 | import fs from 'fs'; 9 | 10 | /** 11 
/**
 * Function spying on the execution of the provided function to ease some
 * tests, notably related to event handling.
 *
 * The returned spy exposes `called` (boolean) and `times` (call count),
 * and forwards its arguments to the wrapped target, if any.
 *
 * @param {function} target - Target function.
 * @param {function} - The spy.
 */
export function spy(target) {
  function wrapper(...args) {
    wrapper.called = true;
    wrapper.times++;

    if (typeof target === 'function')
      return target.apply(null, args);
  }

  wrapper.called = false;
  wrapper.times = 0;

  return wrapper;
}
function() { 44 | assert.deepEqual( 45 | relative([1, 2, 3, 3, 4, 4, 4, 5]), 46 | { 47 | 1: 1 / 8, 48 | 2: 1 / 8, 49 | 3: 2 / 8, 50 | 4: 3 / 8, 51 | 5: 1 / 8 52 | } 53 | ); 54 | }); 55 | 56 | it('should also work on strings.', function() { 57 | assert.deepEqual( 58 | relative('test'), 59 | { 60 | t: 0.5, 61 | e: 0.25, 62 | s: 0.25 63 | } 64 | ); 65 | }); 66 | 67 | it('should be possible to pass absolute frequencies.', function() { 68 | assert.deepEqual( 69 | relative(absolute([1, 2, 3, 3, 4, 4, 4, 5])), 70 | { 71 | 1: 1 / 8, 72 | 2: 1 / 8, 73 | 3: 2 / 8, 74 | 4: 3 / 8, 75 | 5: 1 / 8 76 | } 77 | ); 78 | }); 79 | }); 80 | 81 | describe('#.updateFrequencies', function() { 82 | 83 | it('should correctly update frequencies with the given sequence.', function() { 84 | const previousFrequencies = { 85 | 1: 1, 86 | 2: 1, 87 | 3: 2, 88 | 4: 3, 89 | 5: 1 90 | }; 91 | 92 | assert.deepEqual(updateFrequencies(previousFrequencies, [7, 1, 1, 1, 2]), { 93 | 1: 4, 94 | 2: 2, 95 | 3: 2, 96 | 4: 3, 97 | 5: 1, 98 | 7: 1 99 | }); 100 | }); 101 | }); 102 | }); 103 | -------------------------------------------------------------------------------- /test/helpers/index.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman helpers tests 3 | * ======================= 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import { 8 | findall, 9 | seq, 10 | squeeze, 11 | translation 12 | } from '../../src/helpers'; 13 | 14 | describe('index', function() { 15 | 16 | describe('#.findall', function() { 17 | it('will correctly return an array of matches.', function() { 18 | assert.deepEqual( 19 | findall(/t/g, 'test').map(m => ([m[0], m.index])), 20 | [['t', 0], ['t', 3]] 21 | ); 22 | }); 23 | 24 | it('won\'t trigger an infinite loop if the regex is not global.', function() { 25 | assert.deepEqual( 26 | findall(/t/, 'test').map(m => ([m[0], m.index])), 27 | [['t', 0]] 28 | ); 29 | }); 30 | }); 31 | 32 | describe('#.seq', function() { 33 | 34 | 
it('should produce an array sequence from different variables.', function() { 35 | 36 | assert.deepEqual(seq('hello'), ['h', 'e', 'l', 'l', 'o']); 37 | assert.deepEqual(seq([1, 2, 3]), [1, 2, 3]); 38 | }); 39 | }); 40 | 41 | describe('#.squeeze', function() { 42 | 43 | it('should work with strings.', function() { 44 | assert.strictEqual(squeeze('test'), 'test'); 45 | assert.strictEqual(squeeze('hello yellow'), 'helo yelow'); 46 | }); 47 | 48 | it('should work with arbitrary sequences.', function() { 49 | assert.deepEqual(squeeze([1, 2, 3]), [1, 2, 3]); 50 | assert.deepEqual(squeeze([1, 1, 2, 3, 3]), [1, 2, 3]); 51 | }); 52 | }); 53 | 54 | describe('#.translation', function() { 55 | it('should throw if given strings don\'t have the same length.', function() { 56 | assert.throws(function() { 57 | translation('123', '1234'); 58 | }, /length/); 59 | }); 60 | 61 | it('should produce indexes.', function() { 62 | assert.deepEqual(translation('abc', '123'), {a: 1, b: 2, c: 3}); 63 | }); 64 | }); 65 | }); 66 | -------------------------------------------------------------------------------- /test/inflectors/spanish/noun.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman inflectors/spanish/noun 3 | * ================================= 4 | */ 5 | import assert from 'assert'; 6 | import {singularize} from '../../../src/inflectors/spanish/noun'; 7 | 8 | describe('noun', function() { 9 | 10 | describe('#.singularize', function() { 11 | 12 | it('should correclty singularize Spanish words.', function() { 13 | 14 | const tests = [ 15 | ['álbumes', 'álbum'], 16 | ['almacenes', 'almacén'], 17 | ['androides', 'androide'], 18 | ['antifaces', 'antifaz'], 19 | ['árboles', 'árbol'], 20 | ['atlas', 'atlas'], 21 | ['autobuses', 'autobús'], 22 | ['bases', 'base'], 23 | ['bebés', 'bebé'], 24 | ['camiones', 'camión'], 25 | ['casas', 'casa'], 26 | ['ceutíes', 'ceutí'], 27 | ['chimpancés', 'chimpancé'], 28 | ['clanes', 'clan'], 29 | 
['compases', 'compás'], 30 | ['convoyes', 'convoy'], 31 | ['coxis', 'coxis'], 32 | ['crisis', 'crisis'], 33 | ['déficits', 'déficit'], 34 | ['ejes', 'eje'], 35 | ['espíritus', 'espíritu'], 36 | ['flashes', 'flash'], 37 | ['fracs', 'frac'], 38 | ['gafas', 'gafas'], 39 | ['hipótesis', 'hipótesis'], 40 | ['ingleses', 'inglés'], 41 | ['lápices', 'lápiz'], 42 | ['luces', 'luz'], 43 | ['montajes', 'montaje'], 44 | ['noes', 'no'], 45 | ['otitis', 'otitis'], 46 | ['padres', 'padre'], 47 | ['países', 'país'], 48 | ['papás', 'papá'], 49 | ['parkings', 'parking'], 50 | ['portaequipajes', 'portaequipaje'], 51 | ['radiocasetes', 'radiocasete'], 52 | ['shows', 'show'], 53 | ['sis', 'si'], 54 | ['síes', 'sí'], 55 | ['tabúes', 'tabú'], 56 | ['tamices', 'tamiz'], 57 | ['tanques', 'tanque'], 58 | ['taxis', 'taxi'], 59 | ['tijeras', 'tijeras'], 60 | ['trenes', 'tren'], 61 | ['virus', 'virus'], 62 | ['Trenes', 'Tren'], 63 | ['trEnes', 'trEn'] 64 | ]; 65 | 66 | tests.forEach(function([plural, singular]) { 67 | assert.strictEqual(singularize(plural), singular); 68 | }); 69 | }); 70 | }); 71 | }); 72 | -------------------------------------------------------------------------------- /test/keyers/html-text.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman keyers/html-text tests 3 | * ================================ 4 | */ 5 | import assert from 'assert'; 6 | import htmlText from '../../src/keyers/html-text'; 7 | 8 | // const BASIC_HTML = ` 9 | //
What are you doing?
12 | // Link towards something 13 | //Hello
World!
Hello
', 'Hello'], 27 | ['Hello
', 'Hello'], 28 | ['This is a é no?
', 'This is a é no?'], 29 | ['Some text ok?', 'Some text ok?'] 30 | // [BASIC_HTML, 'Hello What are you doing?Link towards somethingWorld!'] 31 | ]; 32 | 33 | tests.forEach(function([html, text]) { 34 | assert.strictEqual(htmlText(html), text); 35 | }); 36 | }); 37 | }); 38 | -------------------------------------------------------------------------------- /test/keyers/name-power-set.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman keyers/name-power-set tests 3 | * ===================================== 4 | */ 5 | import {assert} from 'chai'; 6 | import namePowerSet from '../../src/keyers/name-power-set'; 7 | 8 | describe('name-power-set', function() { 9 | 10 | const hasher = list => list.join('§'); 11 | 12 | const hashedNamePowerSet = name => namePowerSet(name).map(hasher); 13 | 14 | it('should return a correct name power set.', function() { 15 | assert.sameMembers(hashedNamePowerSet('Henry'), [ 16 | ['Henry'] 17 | ].map(hasher)); 18 | 19 | assert.sameMembers(hashedNamePowerSet('John Henry'), [ 20 | ['Henry', 'John'], 21 | ['H', 'John'], 22 | ['Henry', 'J'] 23 | ].map(hasher)); 24 | 25 | assert.sameMembers(hashedNamePowerSet('John Philip Henry'), [ 26 | ['Henry', 'John'], 27 | ['H', 'John'], 28 | ['Henry', 'J'], 29 | ['Henry', 'John', 'Philip'], 30 | ['H', 'John', 'Philip'], 31 | ['Henry', 'J', 'Philip'], 32 | ['H', 'J', 'Philip'], 33 | ['Henry', 'John', 'P'], 34 | ['H', 'John', 'P'], 35 | ['Henry', 'J', 'P'], 36 | ['Henry', 'Philip'], 37 | ['H', 'Philip'], 38 | ['Henry', 'P'], 39 | ['John', 'Philip'], 40 | ['J', 'Philip'], 41 | ['John', 'P'] 42 | ].map(hasher)); 43 | 44 | assert.sameMembers(hashedNamePowerSet('J.R.R. 
Tolkien'), [ 45 | ['J', 'R', 'Tolkien'], 46 | ['J', 'Tolkien'], 47 | ['R', 'Tolkien'] 48 | ].map(hasher)); 49 | }); 50 | 51 | it('should also work on already tokenized names.', function() { 52 | assert.sameMembers(hashedNamePowerSet(['john', 'henry']), [ 53 | ['henry', 'john'], 54 | ['h', 'john'], 55 | ['henry', 'j'] 56 | ].map(hasher)); 57 | }); 58 | }); 59 | -------------------------------------------------------------------------------- /test/keyers/name-sig.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman keyers/name-sig tests 3 | * =============================== 4 | */ 5 | import assert from 'assert'; 6 | import nameSig from '../../src/keyers/name-sig'; 7 | 8 | describe('name-sig', function() { 9 | 10 | it('should return proper namesig keys.', function() { 11 | const tests = [ 12 | ['Mr. Abdul Haque', 'abdlhk'], 13 | ['Mr. Md. Abdul Hoque', 'abdlhk'], 14 | ['Abdul Hoque', 'abdlhk'], 15 | ['Mr. Sobuj Saha', 'sbgsh'], 16 | ['Sree sabuj saha', 'sbgsh'], 17 | ['Sree Sobuz saha', 'sbgsh'], 18 | ['Marjorie', 'mrgr'], 19 | ['Amrishnav', 'amrshnv'] 20 | ]; 21 | 22 | tests.forEach(function([string, key]) { 23 | assert.strictEqual(nameSig(string), key, `${string} => ${key}`); 24 | }); 25 | }); 26 | }); 27 | -------------------------------------------------------------------------------- /test/keyers/normalize.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman keyers/normalize tests 3 | * ================================ 4 | */ 5 | import assert from 'assert'; 6 | import normalize, {createNormalizer} from '../../src/keyers/normalize'; 7 | 8 | describe('normalize', function() { 9 | 10 | it('should properly normalize the given strings.', function() { 11 | const tests = [ 12 | ['Hello, World', 'hello, world', 'Hello, World'], 13 | ['\x00Hello', 'hello', 'Hello'], 14 | [' \n this \t\t\t ', 'this'], 15 | [' this space\t', 'this space'], 16 | ['é Oh', 'e oh', 'e 
Oh', 'é oh'], 17 | ['æther', 'aether'], 18 | ['œuf', 'oeuf'], 19 | ['Straß', 'strass', 'Strass'], 20 | ['What now…', 'what now...', 'What now...'], 21 | ['It’s uncanny!', 'it\'s uncanny!', 'It\'s uncanny!'], 22 | ['Not a «problem» \t \t\n', 'not a "problem"', 'Not a "problem"'], 23 | ['so — annoying', 'so - annoying'], 24 | ['oh my، god...', 'oh my, god...'] 25 | ]; 26 | 27 | const keepCaseNormalizer = createNormalizer({keepCase: true}), 28 | keepAccentsNormalizer = createNormalizer({keepAccents: true}); 29 | 30 | tests.forEach(function([string, normalized, caseIntact, accentsIntact]) { 31 | if (!caseIntact) 32 | caseIntact = normalized; 33 | 34 | if (!accentsIntact) 35 | accentsIntact = normalized; 36 | 37 | assert.strictEqual(normalize(string), normalized, `Normalize: (${string}) => (${normalized})`); 38 | assert.strictEqual(keepCaseNormalizer(string), caseIntact, `Keep Case: (${string}) => (${caseIntact})`); 39 | assert.strictEqual(keepAccentsNormalizer(string), accentsIntact, `Keep Accents: (${string}) => (${accentsIntact})`); 40 | }); 41 | }); 42 | }); 43 | -------------------------------------------------------------------------------- /test/keyers/omission.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman keyers/omission tests 3 | * =============================== 4 | */ 5 | import assert from 'assert'; 6 | import omission from '../../src/keyers/omission'; 7 | 8 | describe('omission', function() { 9 | 10 | it('should return proper omission keys.', function() { 11 | const tests = [ 12 | ['', ''], 13 | ['hello', 'HLEO'], 14 | ['The quick brown fox jumped over the lazy dog.', 'JKQXZVWYBFMGPDHCLNTREUIOA'], 15 | ['Christopher', 'PHCTSRIOE'], 16 | ['Niall', 'LNIA'], 17 | ['caramel', 'MCLRAE'], 18 | ['Carlson', 'CLNSRAO'], 19 | ['Karlsson', 'KLNSRAO'], 20 | ['microeletronics', 'MCLNTSRIOE'], 21 | ['Circumstantial', 'MCLNTSRIUA'], 22 | ['LUMINESCENT', 'MCLNTSUIE'], 23 | ['multinucleate', 'MCLNTUIEA'], 24 | 
['multinucleon', 'MCLNTUIEO'], 25 | ['cumulene', 'MCLNUE'], 26 | ['luminance', 'MCLNUIAE'], 27 | ['cœlomic', 'MCLOEI'], 28 | ['Molecule', 'MCLOEU'], 29 | ['Cameral', 'MCLRAE'], 30 | ['Maceral', 'MCLRAE'], 31 | ['Lacrimal', 'MCLRAI'] 32 | ]; 33 | 34 | tests.forEach(function([string, key]) { 35 | assert.strictEqual(omission(string), key, `${string} => ${key}`); 36 | }); 37 | }); 38 | }); 39 | -------------------------------------------------------------------------------- /test/keyers/skeleton.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman keyers/skeleton tests 3 | * =============================== 4 | */ 5 | import assert from 'assert'; 6 | import skeleton from '../../src/keyers/skeleton'; 7 | 8 | describe('skeleton', function() { 9 | 10 | it('should return proper skeleton keys.', function() { 11 | const tests = [ 12 | ['', ''], 13 | ['hello', 'HLEO'], 14 | ['The quick brown fox jumped over the lazy dog.', 'THQCKBRWNFXJMPDVLZYGEUIOA'], 15 | ['Christopher', 'CHRSTPIOE'], 16 | ['Niall', 'NLIA'], 17 | ['CHEMOGENIC', 'CHMGNEOI'], 18 | ['chemomagnetic', 'CHMGNTEOAI'], 19 | ['Chemcal', 'CHMLEA'], 20 | ['Chemcial', 'CHMLEIA'], 21 | ['Chemical', 'CHMLEIA'], 22 | ['Chemicial', 'CHMLEIA'], 23 | ['Chimical', 'CHMLIA'], 24 | ['Chemiluminescence', 'CHMLNSEIU'], 25 | ['Chemiluminescent', 'CHMLNSTEIU'], 26 | ['Chemically', 'CHMLYEIA'] 27 | ]; 28 | 29 | tests.forEach(function([string, key]) { 30 | assert.strictEqual(skeleton(string), key, `${string} => ${key}`); 31 | }); 32 | }); 33 | }); 34 | -------------------------------------------------------------------------------- /test/keyword-extraction/rake.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman keyword-extraction/rake tests 3 | * ======================================= 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import sentences from '../../src/tokenizers/sentences'; 8 | import words from 
'../../src/tokenizers/words/treebank'; 9 | import createExtractor from '../../src/keyword-extraction/rake'; 10 | import {loadResource} from '../helpers'; 11 | 12 | const FOX = loadResource('stopwords/fox.txt'), 13 | STOPWORDS = FOX.split('\n').slice(0, -1); 14 | 15 | const DOCUMENT = ` 16 | Compatibility of systems of linear constraints over the set of natural numbers. 17 | 18 | Criteria of compatibility of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating sets of solutions for all types of systems are given. These criteria and the corresponding algorithms for constructing a minimal supporting set of solutions can be used in solving all the considered types of systems and systems of mixed types. 19 | `.replace(/\n+/g, ' ').replace(/\s+/, ' '); 20 | 21 | const TOKENIZED_DOCUMENT = sentences(DOCUMENT.replace(/\n+/g, ' ')).map(sentence => words(sentence.toLowerCase())); 22 | 23 | describe('rake', function() { 24 | 25 | it('should throw if given an invalid list of stopwords.', function() { 26 | 27 | assert.throws(function() { 28 | createExtractor(null); 29 | }, /stopwords/); 30 | 31 | assert.throws(function() { 32 | createExtractor({stopwords: 34}); 33 | }, /stopwords/); 34 | }); 35 | 36 | it('should properly extract keywords.', function() { 37 | const rake = createExtractor({stopwords: STOPWORDS}); 38 | 39 | const keywords = rake(TOKENIZED_DOCUMENT); 40 | 41 | assert.deepEqual(keywords, [ 42 | ['minimal', 'generating', 'sets'], 43 | ['linear', 'diophantine', 'equations'], 44 | ['minimal', 'supporting', 'set'], 45 | ['minimal', 'set'], 46 | ['linear', 'constraints'], 47 | ['upper', 'bounds'], 48 | ['strict', 'inequations'], 49 | ['nonstrict', 'inequations'] 50 | ]); 51 | }); 52 | }); 53 | -------------------------------------------------------------------------------- /test/metrics/bag.js: 
-------------------------------------------------------------------------------- 1 | /** 2 | * Talisman metrics/distance/bag tests 3 | * ==================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import bag from '../../src/metrics/bag'; 8 | 9 | describe('bag', function() { 10 | 11 | it('should correctly compute the Bag distance.', function() { 12 | const tests = [ 13 | ['cat', 'cat', 0], 14 | ['cat', '', 3], 15 | ['', 'cat', 3], 16 | ['cat', 'hat', 1], 17 | ['Niall', 'Neil', 2], 18 | ['aluminum', 'Catalan', 5], 19 | ['ATCG', 'TAGC', 0] 20 | ]; 21 | 22 | tests.forEach(function([a, b, distance]) { 23 | assert.strictEqual(bag(a, b), distance, `${a}, ${b}`); 24 | }); 25 | }); 26 | }); 27 | -------------------------------------------------------------------------------- /test/metrics/canberra.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman metrics/distance/canberra tests 3 | * ========================================= 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import canberra from '../../src/metrics/canberra'; 8 | 9 | describe('canberra', function() { 10 | 11 | const tests = [ 12 | { 13 | a: [2], 14 | b: [4], 15 | distance: 2 / 6 16 | }, 17 | { 18 | a: [1, 3], 19 | b: [4, 5], 20 | distance: 0.85 21 | }, 22 | { 23 | a: [1, 3, 5], 24 | b: [2, 1, 4], 25 | distance: 1 / 3 + 1 / 2 + 1 / 9 26 | } 27 | ]; 28 | 29 | it('should throw if the given vectors are not of the same dimension.', function() { 30 | assert.throws(function() { 31 | canberra([1, 2], [1, 2, 3]); 32 | }, /dimension/); 33 | }); 34 | 35 | it('should correctly compute the canberra distance of n-dimensions vectors.', function() { 36 | tests.forEach(function({a, b, distance}) { 37 | assert.strictEqual(canberra(a, b), distance); 38 | }); 39 | }); 40 | }); 41 | -------------------------------------------------------------------------------- /test/metrics/chebyshev.js: 
-------------------------------------------------------------------------------- 1 | /** 2 | * Talisman metrics/distance/chebyshev tests 3 | * ========================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import chebyshev from '../../src/metrics/chebyshev'; 8 | 9 | describe('chebyshev', function() { 10 | 11 | const tests = [ 12 | { 13 | a: [2], 14 | b: [4], 15 | distance: 2 16 | }, 17 | { 18 | a: [1, 3], 19 | b: [4, 5], 20 | distance: 3 21 | }, 22 | { 23 | a: [1, 3, 5], 24 | b: [2, 1, 4], 25 | distance: 2 26 | } 27 | ]; 28 | 29 | it('should throw if the given vectors are not of the same dimension.', function() { 30 | assert.throws(function() { 31 | chebyshev([1, 2], [1, 2, 3]); 32 | }, /dimension/); 33 | }); 34 | 35 | it('should correctly compute the chebyshev distance of n-dimensions vectors.', function() { 36 | tests.forEach(function({a, b, distance}) { 37 | assert.strictEqual(chebyshev(a, b), distance); 38 | }); 39 | }); 40 | }); 41 | -------------------------------------------------------------------------------- /test/metrics/cosine.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman metrics/distance/cosine tests 3 | * ======================================= 4 | * 5 | */ 6 | import {assert} from 'chai'; 7 | import cosine, {distance} from '../../src/metrics/cosine'; 8 | 9 | describe('cosine', function() { 10 | 11 | const tests = [ 12 | { 13 | a: [2], 14 | b: [4], 15 | similarity: 1 16 | }, 17 | { 18 | a: [1, 3], 19 | b: [4, 5], 20 | similarity: 0.94 21 | }, 22 | { 23 | a: [1, 3, 5], 24 | b: [2, 1, 4], 25 | similarity: 0.92 26 | } 27 | ]; 28 | 29 | it('should throw if the given vectors are not of the same dimension.', function() { 30 | assert.throws(function() { 31 | cosine([1, 2], [1, 2, 3]); 32 | }, /dimension/); 33 | }); 34 | 35 | it('should correctly compute the cosine similarity of n-dimensions vectors.', function() { 36 | tests.forEach(function({a, b, similarity}) { 37 | 
assert.approximately(cosine(a, b), similarity, 0.01); 38 | }); 39 | }); 40 | 41 | it('should correctly compute the cosine distance of n-dimensions vectors.', function() { 42 | tests.forEach(function({a, b, similarity}) { 43 | assert.approximately(distance(a, b), 1 - similarity, 0.01); 44 | }); 45 | }); 46 | }); 47 | -------------------------------------------------------------------------------- /test/metrics/damerau-levenshtein.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman metrics/distance/damerau-levenshtein tests 3 | * ==================================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import damerauLevenshtein, {limited} from '../../src/metrics/damerau-levenshtein'; 8 | 9 | describe('damerau-levenshtein', function() { 10 | const tests = [ 11 | [['a', 'b', 'c'], ['a', 'b', 'c'], 0], 12 | [['b', 'o', 'o', 'k'], ['b', 'a', 'c', 'k'], 2], 13 | ['abc', 'cba', 2], 14 | ['one', 'once upon', 6], 15 | ['ahk', 'ahk', 0], 16 | ['he', 'ben', 2], 17 | ['this', 'tihs', 1], 18 | ['toralf', 'titan', 4], 19 | ['google', 'goggle', 1], 20 | ['NawKtYu', '', 7], 21 | ['', 'NawKtYu', 7], 22 | ['NawKtYu', 'NawKtYu', 0], 23 | ['NawKtYu', 'tKNwYua', 6], 24 | ['Jdc', 'dJc', 1], 25 | ['sUzSOwx', 'zsSxUwO', 6], 26 | ['eOqoHAta', 'tAeaqHoO', 7], 27 | ['glSbo', 'lgSbo', 1], 28 | ['NJtQKcJE', 'cJEtQKJN', 4], 29 | ['GitIEVs', 'EGItVis', 5], 30 | ['MiWK', 'WKiM', 4], 31 | ]; 32 | 33 | it('should correctly compute the Damerau-Levenshtein distance.', function() { 34 | tests.forEach(function([a, b, distance]) { 35 | assert.strictEqual(damerauLevenshtein(a, b), distance, `${a} <=> ${b}`); 36 | }); 37 | }); 38 | 39 | it('should be possible to use the limited version.', function() { 40 | tests.forEach(function([a, b, distance]) { 41 | assert.strictEqual(limited(2, a, b), distance > 2 ? 
Infinity : distance, `${a} <=> ${b}`); 42 | }); 43 | }); 44 | }); 45 | -------------------------------------------------------------------------------- /test/metrics/dice.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman metrics/distance/dice tests 3 | * ===================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import dice, { 8 | index, 9 | similarity, 10 | distance 11 | } from '../../src/metrics/dice'; 12 | import sorensen, { 13 | index as sorensenIndex, 14 | similarity as sorensenSimilarity, 15 | distance as sorensenDistance 16 | } from '../../src/metrics/sorensen'; 17 | 18 | describe('dice', function() { 19 | 20 | it('should compute the Dice index & aliases correctly.', function() { 21 | const tests = [ 22 | ['healed', 'healed', 1], 23 | ['healed', 'sealed', 0.8], 24 | ['healed', 'healthy', 6 / 11], 25 | ['healed', 'heard', 4 / 9], 26 | ['healed', 'herded', 0.4], 27 | ['healed', 'help', 0.25], 28 | ['healed', 'sold', 0], 29 | ['tomato', 'tomato', 1], 30 | ['h', 'help', 0], 31 | ['h', 'h', 1], 32 | ['', '', 1], 33 | ['h', 'g', 0] 34 | ]; 35 | 36 | tests.forEach(function([x, y, i]) { 37 | assert.strictEqual(dice(x, y), i, `${x} / ${y}`); 38 | assert.strictEqual(dice(x, y), index(x, y)); 39 | assert.strictEqual(dice(x, y), similarity(x, y)); 40 | assert.strictEqual(1 - dice(x, y), distance(x, y)); 41 | }); 42 | }); 43 | 44 | it('Sorensen index should be the same as Dice.', function() { 45 | const compared = ['healed', 'sealed']; 46 | 47 | assert.strictEqual(dice(...compared), sorensen(...compared)); 48 | assert.strictEqual(index(...compared), sorensenIndex(...compared)); 49 | assert.strictEqual(similarity(...compared), sorensenSimilarity(...compared)); 50 | assert.strictEqual(distance(...compared), sorensenDistance(...compared)); 51 | }); 52 | }); 53 | -------------------------------------------------------------------------------- /test/metrics/euclidean.js: 
-------------------------------------------------------------------------------- 1 | /** 2 | * Talisman metrics/distance/euclidean tests 3 | * ========================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import euclidean, {squared} from '../../src/metrics/euclidean'; 8 | 9 | describe('euclidean', function() { 10 | 11 | const tests = [ 12 | { 13 | a: [2], 14 | b: [4], 15 | distance: 2, 16 | squaredDistance: 4 17 | }, 18 | { 19 | a: [1, 3], 20 | b: [4, 5], 21 | distance: Math.sqrt(13), 22 | squaredDistance: 13 23 | }, 24 | { 25 | a: [1, 3, 5], 26 | b: [2, 1, 4], 27 | distance: Math.sqrt(6), 28 | squaredDistance: 6 29 | } 30 | ]; 31 | 32 | it('should throw if the given vectors are not of the same dimension.', function() { 33 | assert.throws(function() { 34 | euclidean([1, 2], [1, 2, 3]); 35 | }, /dimension/); 36 | }); 37 | 38 | it('should correctly compute the euclidean distance of n-dimensions vectors.', function() { 39 | tests.forEach(function({a, b, distance}) { 40 | assert.strictEqual(euclidean(a, b), distance); 41 | }); 42 | }); 43 | 44 | it('should be possible to compute the squared distance instead.', function() { 45 | tests.forEach(function({a, b, squaredDistance}) { 46 | assert.strictEqual(squared(a, b), squaredDistance); 47 | }); 48 | }); 49 | }); 50 | -------------------------------------------------------------------------------- /test/metrics/eudex.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman metrics/distance/eudex tests 3 | * ========================================= 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import {distance, isSimilar} from '../../src/metrics/eudex'; 8 | 9 | describe('eudex', function() { 10 | it('should compute distances correctly.', function() { 11 | assert.strictEqual(distance('jumpo', 'jumbo'), 2); 12 | assert.strictEqual(distance('jumpo', 'trol'), 408); 13 | 14 | assert(distance('lizzard', 'wizzard') > distance('rick', 'rolled')); 
15 | assert(distance('bannana', 'panana') >= distance('apple', 'abple')); 16 | assert(distance('trump', 'drumpf') < distance('gangam', 'style')); 17 | }); 18 | 19 | it('distance reflexivity.', function() { 20 | const pairs = [ 21 | ['a', 'b'], 22 | ['youtube', 'facebook'], 23 | ['Rust', 'Go'], 24 | ['rick', 'rolled'] 25 | ]; 26 | 27 | pairs.forEach(function([one, two]) { 28 | assert.strictEqual(distance(one, two), distance(two, one)); 29 | }); 30 | }); 31 | 32 | it('similarity function should work correctly.', function() { 33 | 34 | const similar = [ 35 | ['yay', 'yuy'], 36 | ['what', 'wat'], 37 | ['jesus', 'jeuses'], 38 | ['', ''], 39 | ['lol', 'lulz'], 40 | ['maier', 'meyer'], 41 | ['möier', 'meyer'], 42 | ['fümlaut', 'fymlaut'] 43 | ]; 44 | 45 | similar.forEach(function([one, two]) { 46 | assert(isSimilar(one, two), `${one} =~ ${two}`); 47 | }); 48 | 49 | const different = [ 50 | ['youtube', 'reddit'], 51 | ['yet', 'vet'], 52 | ['hacker', '4chan'], 53 | ['awesome', 'me'], 54 | ['prisco', 'vkisco'], 55 | ['no', 'go'], 56 | ['horse', 'norse'], 57 | ['nice', 'mice'] 58 | ]; 59 | 60 | different.forEach(function([one, two]) { 61 | assert(!isSimilar(one, two), `${one} =~ ${two}`); 62 | }); 63 | }); 64 | }); 65 | -------------------------------------------------------------------------------- /test/metrics/guth.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman metrics/distance/guth tests 3 | * ===================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import guth from '../../src/metrics/guth'; 8 | 9 | describe('guth', function() { 10 | it('should correctly compute the Guth distance.', function() { 11 | const tests = [ 12 | ['HELLO', 'HELLO', 0], 13 | ['NOEL', 'LEON', 2], 14 | ['NOEN', 'LEON', 1], 15 | ['NOEL', 'NEON', 1], 16 | ['GLAVIN', 'GLAWYN', 0], 17 | ['MERIT', 'MERITS', 1], 18 | ['MERIST', 'MERITS', 0], 19 | ['MERIS', 'MERITS', 0], 20 | ['SMITH', 'SMYTH', 0], 21 | ['SMITH', 
'SMYSS', 3], 22 | ['HELLO'.split(''), 'HELLO'.split(''), 0], 23 | ['ABC', 'DEFGHIJKLMNOPQRST', 17], 24 | ['DEFGHIJKLMNOPQRST', 'ABC', 17] 25 | ]; 26 | 27 | tests.forEach(function([a, b, d]) { 28 | assert.strictEqual(guth(a, b), d, `${a}/${b} => ${d}`); 29 | }); 30 | }); 31 | }); 32 | -------------------------------------------------------------------------------- /test/metrics/hamming.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman metrics/distance/hamming tests 3 | * ======================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import hamming, { 8 | bitwise, 9 | normalizedDistance, 10 | normalizedSimilarity 11 | } from '../../src/metrics/hamming'; 12 | 13 | describe('hamming', function() { 14 | 15 | it('should throw if the given sequences are not of equal length.', function() { 16 | assert.throws(function() { 17 | hamming('hello', 'goodbye'); 18 | }, /equal/); 19 | }); 20 | 21 | it('should correctly compute the Hamming distance.', function() { 22 | const tests = [ 23 | ['1011101', '1001001', 2], 24 | ['2143896', '2233796', 3], 25 | ['ramer', 'cases', 3], 26 | ['abc', 'abc', 0], 27 | ['abc', 'abd', 1], 28 | ['night', 'nacht', 2], 29 | [[0, 1, 0, 1], [1, 2, 0, 1], 2] 30 | ]; 31 | 32 | tests.forEach(function([a, b, distance]) { 33 | assert.strictEqual(hamming(a, b), distance, `${a} / ${b}`); 34 | }); 35 | }); 36 | 37 | it('should correctly compute the normalized Hamming distance/similarity.', function() { 38 | const tests = [ 39 | ['cat', 'hat', 1 / 3], 40 | ['Niall', 'Neil', 0.6], 41 | ['aluminum', 'Catalan', 1], 42 | ['ATCG', 'TAGC', 1], 43 | ['Estelle', 'Estrella', 0.5] 44 | ]; 45 | 46 | tests.forEach(function([a, b, distance]) { 47 | assert.strictEqual(normalizedDistance(a, b), distance, `${a} / ${b}`); 48 | assert.strictEqual(normalizedSimilarity(a, b), 1 - distance, `${a} / ${b}`); 49 | }); 50 | }); 51 | 52 | 53 | it('should correctly compute bitwise Hamming distance.', 
function() { 54 | const tests = [ 55 | ['1011101', '1001001', 2], 56 | ['1111', '1111', 0], 57 | ['0110', '1001', 4] 58 | ].map(t => [parseInt(t[0], 2), parseInt(t[1], 2), t[2]]); 59 | 60 | tests.forEach(function([a, b, distance]) { 61 | assert.strictEqual(bitwise(a, b), distance); 62 | }); 63 | }); 64 | }); 65 | -------------------------------------------------------------------------------- /test/metrics/identity.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman metrics/distance/identity tests 3 | * ========================================= 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import {distance, similarity} from '../../src/metrics/identity'; 8 | 9 | describe('identity', function() { 10 | 11 | it('should correctly compute identity distance/similarity.', function() { 12 | const ref = [1, 2]; 13 | 14 | const tests = [ 15 | ['a', 'b', 1], 16 | ['a', 'a', 0], 17 | [ref, ref, 0], 18 | ['ab', 'abc', 1], 19 | [[1, 2], [1, 2], 0], 20 | [[1, 3], [1, 2], 1] 21 | ]; 22 | 23 | tests.forEach(function([a, b, d]) { 24 | assert.strictEqual(distance(a, b), d, `${a} / ${b} => ${d}`); 25 | assert.strictEqual(similarity(a, b), +!d, `${a} / ${b} => ${d}`); 26 | }); 27 | }); 28 | }); 29 | -------------------------------------------------------------------------------- /test/metrics/jaccard.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman metrics/distance/jaccard tests 3 | * ======================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import jaccard, { 8 | index, 9 | similarity, 10 | distance 11 | } from '../../src/metrics/jaccard'; 12 | 13 | describe('jaccard', function() { 14 | 15 | it('should compute the jaccard index & aliases correctly.', function() { 16 | const tests = [ 17 | ['abc', '', 0], 18 | ['', 'abc', 0], 19 | ['', '', 1], 20 | ['abc', 'abc', 1], 21 | ['abc', 'xyz', 0], 22 | ['night', 'nacht', 3 / 7], 23 | ['context', 
'contact', 4 / 7], 24 | ['ht', 'nacht', 2 / 5] 25 | ]; 26 | 27 | tests.forEach(function([x, y, i]) { 28 | assert.strictEqual(jaccard(x, y), i, `${x} / ${y}`); 29 | assert.strictEqual(jaccard(x, y), index(x, y)); 30 | assert.strictEqual(jaccard(x, y), similarity(x, y)); 31 | assert.strictEqual(1 - jaccard(x, y), distance(x, y)); 32 | }); 33 | }); 34 | }); 35 | -------------------------------------------------------------------------------- /test/metrics/jaro-winkler.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman metrics/distance/jaro-winkler tests 3 | * ============================================= 4 | * 5 | */ 6 | import {assert} from 'chai'; 7 | import jaro, {distance, similarity} from '../../src/metrics/jaro'; 8 | import jaroWinkler, { 9 | distance as jaroWinklerDistance, 10 | similarity as jaroWinklerSimilarity, 11 | custom 12 | } from '../../src/metrics/jaro-winkler'; 13 | 14 | describe('jaro', function() { 15 | 16 | it('should compute the Jaro distance correctly.', function() { 17 | const tests = [ 18 | ['Duane', 'Duane', 1], 19 | ['Dwayne', 'Duane', 0.82], 20 | ['Dwayne'.split(''), 'Duane'.split(''), 0.82], 21 | ['Martha', 'Marhta', 0.94], 22 | ['Dixon', 'Dicksonx', 0.77], 23 | ['Duane', 'Freakishlylongstring', 0.47] 24 | ]; 25 | 26 | tests.forEach(function([a, b, d]) { 27 | assert.approximately(jaro(a, b), d, 0.01, `${a} / ${b}`); 28 | assert.approximately(distance(a, b), 1 - d, 0.01, `${a} / ${b}`); 29 | assert.approximately(similarity(a, b), d, 0.01, `${a} / ${b}`); 30 | }); 31 | }); 32 | }); 33 | 34 | describe('jaro-winkler', function() { 35 | 36 | it('should compute the Jaro-Winkler distance correctly.', function() { 37 | const tests = [ 38 | ['Duane', 'Duane', 1], 39 | ['Dwayne', 'Duane', 0.84], 40 | ['Dwayne'.split(''), 'Duane'.split(''), 0.84], 41 | ['Martha', 'Marhta', 0.96], 42 | ['Dixon', 'Dicksonx', 0.81], 43 | ['Duane', 'Freakishlylongstring', 0.47], 44 | ['commonlongprefixword', 
'commonlongprefixworm', 0.98] 45 | ]; 46 | 47 | tests.forEach(function([a, b, d]) { 48 | assert.approximately(jaroWinkler(a, b), d, 0.01, `${a} / ${b}`); 49 | assert.approximately(jaroWinklerDistance(a, b), 1 - d, 0.01, `${a} / ${b}`); 50 | assert.approximately(jaroWinklerSimilarity(a, b), d, 0.01, `${a} / ${b}`); 51 | }); 52 | }); 53 | 54 | it('should throw when passing wrong parameters to the algorithm.', function() { 55 | assert.throws(function() { 56 | custom({boostThreshold: 2}, 'Duane', 'Dwayne'); 57 | }, /comprised/); 58 | 59 | assert.throws(function() { 60 | custom({scalingFactor: 0.40}, 'Duane', 'Dwayne'); 61 | }, /scaling/); 62 | }); 63 | 64 | it('should be possible to use a custom version of the algorithm.', function() { 65 | assert.approximately(custom({boostThreshold: 0.6}, 'Duane', 'Dwayne'), 0.84, 0.01); 66 | assert.approximately(custom({scalingFactor: 0.15}, 'Duane', 'Dwayne'), 0.84, 0.01); 67 | }); 68 | }); 69 | -------------------------------------------------------------------------------- /test/metrics/lcs.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman metrics/distance/lcs tests 3 | * ==================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import {distance, similarity} from '../../src/metrics/lcs'; 8 | 9 | describe('lcs', function() { 10 | 11 | it('should correctly compute lcs distance/similarity.', function() { 12 | const tests = [ 13 | ['test', 'test', 1], 14 | ['test', '', 0], 15 | ['', '', 1], 16 | ['', 'test', 0], 17 | ['cat', 'hat', 2 / 3], 18 | ['Niall', 'Neil', 1 / 5], 19 | ['aluminum', 'Catalan', 0.25], 20 | ['ATCG', 'TAGC', 0.25], 21 | ['chat', 'cat', 1 / 2], 22 | [['h', 'a', 't'], ['c', 'a', 't'], 2 / 3] 23 | ]; 24 | 25 | tests.forEach(function([a, b, d]) { 26 | assert.strictEqual(similarity(a, b), d, `${a} / ${b} => ${d}`); 27 | assert.strictEqual(distance(a, b), 1 - d, `${a} / ${b} => ${1 - d}`); 28 | }); 29 | }); 30 | }); 31 | 
-------------------------------------------------------------------------------- /test/metrics/length.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman metrics/distance/length tests 3 | * ========================================= 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import {distance, similarity} from '../../src/metrics/length'; 8 | 9 | describe('length', function() { 10 | const tests = [ 11 | ['test', 'test', 1], 12 | ['hello', '', 0], 13 | ['', 'hello', 0], 14 | ['cat', 'hat', 1], 15 | [[0, 1, 1], [0, 0, 1], 1], 16 | ['Niall', 'Neil', 0.8], 17 | ['aluminum', 'Catalan', 0.875], 18 | ['ATCG', 'TAGC', 1] 19 | ]; 20 | 21 | tests.forEach(function([a, b, d]) { 22 | assert.strictEqual(similarity(a, b), d); 23 | assert.strictEqual(distance(a, b), 1 - d); 24 | }); 25 | }); 26 | -------------------------------------------------------------------------------- /test/metrics/levenshtein.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman metrics/distance/levenshtein tests 3 | * ============================================ 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import levenshtein, {limited} from '../../src/metrics/levenshtein'; 8 | 9 | describe('levenshtein', function() { 10 | const tests = [ 11 | [['b', 'o', 'o', 'k'], ['b', 'a', 'c', 'k'], 2], 12 | [['the', 'cat', 'eats', 'mouse'], ['the', 'mouse', 'likes', 'mouse'], 2], 13 | ['book', 'back', 2], 14 | ['bbbbookkkk', 'bbbbackkkk', 2], 15 | ['hello', 'helo', 1], 16 | ['good sir', 'baal', 8], 17 | ['say', 'shiver', 5], 18 | ['feature', 'get-project-features', 13], 19 | ['example', 'samples', 3], 20 | ['sturgeon', 'urgently', 6], 21 | ['levenshtein', 'frankenstein', 6], 22 | ['distance', 'difference', 5], 23 | ['a', 'b', 1], 24 | ['ab', 'ac', 1], 25 | ['ac', 'bc', 1], 26 | ['abc', 'axc', 1], 27 | ['xabxcdxxefxgx', '1ab2cd34ef5g6', 6], 28 | ['a', '', 1], 29 | ['ab', 'a', 1], 30 | ['ab', 'b', 1], 
31 | ['abc', 'ac', 1], 32 | ['xabxcdxxefxgx', 'abcdefg', 6], 33 | ['', 'a', 1], 34 | ['a', 'ab', 1], 35 | ['b', 'ab', 1], 36 | ['ac', 'abc', 1], 37 | ['abcdefg', 'xabxcdxxefxgx', 6], 38 | ['', '', 0], 39 | ['a', 'a', 0], 40 | ['abc', 'abc', 0], 41 | ['', '', 0], 42 | ['a', '', 1], 43 | ['', 'a', 1], 44 | ['abc', '', 3], 45 | ['', 'abc', 3], 46 | ['因為我是中國人所以我會說中文', '因為我是英國人所以我會說英文', 2], 47 | ['因為我是中國人所以我會說中文'.split(''), '因為我是英國人所以我會說英文'.split(''), 2], 48 | [['🔥', '👨👩👧👦'], ['🔥', '🌈'], 1] 49 | ]; 50 | 51 | it('should correctly compute the Levenshtein distance.', function() { 52 | tests.forEach(function([a, b, distance]) { 53 | assert.strictEqual(levenshtein(a, b), distance, `${a} <=> ${b}`); 54 | }); 55 | }); 56 | 57 | it('should be possible to use the limited version.', function() { 58 | tests.forEach(function([a, b, distance]) { 59 | assert.strictEqual(limited(2, a, b), distance > 2 ? Infinity : distance, `${a} <=> ${b}`); 60 | }); 61 | }); 62 | }); 63 | -------------------------------------------------------------------------------- /test/metrics/lig.js: -------------------------------------------------------------------------------- 1 | /* eslint no-unused-vars: 0 */ 2 | /** 3 | * Talisman metrics/distance/lig tests 4 | * ==================================== 5 | * 6 | */ 7 | import {assert} from 'chai'; 8 | import {lig2, lig3} from '../../src/metrics/lig'; 9 | 10 | describe('lig', function() { 11 | 12 | it('should correctly compute the LIG1, LIG2 & LIG3 distance.', function() { 13 | const tests = [ 14 | ['', '', 1, 1, 1], 15 | ['Hello', 'Hello', 1, 1, 1], 16 | ['abc', 'def', 0, 0, 0], 17 | ['Glavin', 'Glawyn', 0.5, 0.67, 0.80], 18 | ['Williams', 'Vylliems', 0.45, 0.63, 0.77], 19 | ['Lewis', 'Louis', 0.43, 0.6, 0.75], 20 | ['Alex', 'Alexander', 0.44, 0.44, 0.62], 21 | ['Wild', 'Wildsmith', 0.44, 0.44, 0.62], 22 | ['Bram', 'Bramberley', 0.4, 0.4, 0.58] 23 | ]; 24 | 25 | tests.forEach(function([a, b, lig1Distance, lig2Distance, lig3Distance]) { 26 | 
assert.approximately(lig2(a, b), lig2Distance, 0.01, `LIG2: ${a}, ${b}`); 27 | assert.approximately(lig3(a, b), lig3Distance, 0.01, `LIG3: ${a}, ${b}`); 28 | }); 29 | }); 30 | }); 31 | -------------------------------------------------------------------------------- /test/metrics/manhattan.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman metrics/distance/manhattan tests 3 | * ========================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import manhattan from '../../src/metrics/manhattan'; 8 | 9 | describe('manhattan', function() { 10 | 11 | const tests = [ 12 | { 13 | a: [2], 14 | b: [4], 15 | distance: 2 16 | }, 17 | { 18 | a: [1, 3], 19 | b: [4, 5], 20 | distance: 5 21 | }, 22 | { 23 | a: [1, 3, 5], 24 | b: [2, 1, 4], 25 | distance: 4 26 | } 27 | ]; 28 | 29 | it('should throw if the given vectors are not of the same dimension.', function() { 30 | assert.throws(function() { 31 | manhattan([1, 2], [1, 2, 3]); 32 | }, /dimension/); 33 | }); 34 | 35 | it('should correctly compute the manhattan distance of n-dimensions vectors.', function() { 36 | tests.forEach(function({a, b, distance}) { 37 | assert.strictEqual(manhattan(a, b), distance); 38 | }); 39 | }); 40 | }); 41 | -------------------------------------------------------------------------------- /test/metrics/minkowski.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman metrics/distance/minkowski tests 3 | * ========================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import minkowski from '../../src/metrics/minkowski'; 8 | import euclidean from '../../src/metrics/euclidean'; 9 | import manhattan from '../../src/metrics/manhattan'; 10 | 11 | describe('minkowski', function() { 12 | 13 | it('should correctly compute the Minkowski distance.', function() { 14 | const vectors = [[1, 3], [4, 5]]; 15 | 16 | assert.strictEqual(minkowski(1, 
...vectors), manhattan(...vectors)); 17 | assert.strictEqual(minkowski(2, ...vectors), euclidean(...vectors)); 18 | }); 19 | }); 20 | -------------------------------------------------------------------------------- /test/metrics/mlipns.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman metrics/distance/mlipns tests 3 | * ======================================= 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import mlipns from '../../src/metrics/mlipns'; 8 | 9 | describe('mlipns', function() { 10 | 11 | it('should correctly compute the MLIPNS distance.', function() { 12 | const tests = [ 13 | ['cat', 'cat', 1], 14 | ['cat', '', 0], 15 | ['', 'cat', 0], 16 | ['cat', 'hat', 1], 17 | ['Niall', 'Neil', 0], 18 | ['aluminum', 'Catalan', 0], 19 | ['ATCG', 'TAGC', 0] 20 | ]; 21 | 22 | tests.forEach(function([a, b, distance]) { 23 | assert.strictEqual(mlipns(a, b), distance, `${a}, ${b}`); 24 | }); 25 | }); 26 | }); 27 | -------------------------------------------------------------------------------- /test/metrics/monge-elkan.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman metrics/distance/monge-elkan tests 3 | * ============================================ 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import mongeElkan, {symmetric} from '../../src/metrics/monge-elkan'; 8 | import {similarity as identity} from '../../src/metrics/identity'; 9 | 10 | describe('monge-elkan', function() { 11 | 12 | it('should properly compute Monge-Elkan distance.', function() { 13 | 14 | assert.strictEqual( 15 | mongeElkan(identity, ['test'], ['test', 'test2']), 16 | 1 17 | ); 18 | 19 | assert.strictEqual( 20 | symmetric(identity, ['test', 'test2'], ['test']), 21 | 0.75 22 | ); 23 | 24 | const tests = [ 25 | ['test string1', 'test string2', 0.5], 26 | ['test', 'test string2', 0.75], 27 | ['', 'test string2', 0], 28 | ['aaa bbb ccc ddd', 'aaa bbb ccc eee', 0.75], 29 | ['a b 
c d', 'a b c e', 0.75], 30 | ['Sam J Chapman', 'Samuel John Chapman', 1 / 3], 31 | ['Sam Chapman', 'S Chapman', 0.5], 32 | ['John Smith', 'Samuel John Chapman', 0.41666666666666663], 33 | ['John Smith', 'Sam Chapman', 0], 34 | ['John Smith', 'Sam J Chapman', 0], 35 | ['John Smith', 'S Chapman', 0], 36 | ['', '', 1], 37 | ['test', 'test', 1] 38 | ]; 39 | 40 | tests.forEach(function([a, b, s]) { 41 | assert.strictEqual( 42 | symmetric( 43 | identity, 44 | a.split(' '), 45 | b.split(' ') 46 | ), 47 | s, 48 | `${a} <=> ${b} (${s})` 49 | ); 50 | }); 51 | }); 52 | }); 53 | -------------------------------------------------------------------------------- /test/metrics/mra.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman metrics/distance/mra tests 3 | * ==================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import mra from '../../src/metrics/mra'; 8 | 9 | describe('mra', function() { 10 | 11 | const tests = [ 12 | { 13 | a: 'Byrne', 14 | b: 'Boern', 15 | result: { 16 | minimum: 4, 17 | similarity: 5, 18 | codex: ['BYRN', 'BRN'], 19 | matching: true 20 | } 21 | }, 22 | { 23 | a: 'Smith', 24 | b: 'Smyth', 25 | result: { 26 | minimum: 3, 27 | similarity: 5, 28 | codex: ['SMTH', 'SMYTH'], 29 | matching: true 30 | } 31 | }, 32 | { 33 | a: 'Catherine', 34 | b: 'Kathryn', 35 | result: { 36 | minimum: 3, 37 | similarity: 4, 38 | codex: ['CTHRN', 'KTHRYN'], 39 | matching: true 40 | } 41 | }, 42 | { 43 | a: 'Wilfred', 44 | b: 'Manning', 45 | result: { 46 | minimum: 3, 47 | similarity: 1, 48 | codex: ['WLFRD', 'MNG'], 49 | matching: false 50 | } 51 | } 52 | ]; 53 | 54 | it('should throw if the given names are not strings.', function() { 55 | assert.throws(function() { 56 | mra(null, [1, 2, 3]); 57 | }, /string/); 58 | }); 59 | 60 | it('should correctly compute the euclidean distance of n-dimensions vectors.', function() { 61 | tests.forEach(function({a, b, result}) { 62 | assert.deepEqual(mra(a, 
b), result); 63 | }); 64 | }); 65 | }); 66 | -------------------------------------------------------------------------------- /test/metrics/overlap.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman metrics/distance/overlap tests 3 | * ======================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import overlap from '../../src/metrics/overlap'; 8 | 9 | describe('overlap', function() { 10 | 11 | it('should correctly compute the overlap coefficient.', function() { 12 | const tests = [ 13 | ['abc', 'abc', 1], 14 | ['abc', 'def', 0], 15 | ['abc', 'abd', 2 / 3], 16 | ['abc', 'abcde', 1], 17 | ['abcdefij', 'abc', 1], 18 | ['abcdefij'.split(''), 'abc'.split(''), 1], 19 | [[1, 2, 3], [1, 2], 1] 20 | ]; 21 | 22 | tests.forEach(function([a, b, distance]) { 23 | assert.strictEqual(overlap(a, b), distance, `${a} / ${b}`); 24 | }); 25 | }); 26 | }); 27 | -------------------------------------------------------------------------------- /test/metrics/prefix.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman metrics/distance/prefix tests 3 | * ======================================= 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import {distance, similarity} from '../../src/metrics/prefix'; 8 | 9 | describe('prefix', function() { 10 | 11 | it('should correctly compute prefix distance/similarity.', function() { 12 | const tests = [ 13 | ['test', 'test', 1], 14 | ['test', '', 0], 15 | ['', '', 1], 16 | ['', 'test', 0], 17 | ['cat', 'hat', 0], 18 | ['Niall', 'Neil', 0.25], 19 | ['aluminum', 'Catalan', 0], 20 | ['ATCG', 'TAGC', 0], 21 | ['ATCG', 'ATCH', 0.75], 22 | ['ATCG', 'ATCGHI', 1] 23 | ]; 24 | 25 | tests.forEach(function([a, b, d]) { 26 | assert.strictEqual(similarity(a, b), d, `${a} / ${b} => ${d}`); 27 | assert.strictEqual(distance(a, b), 1 - d, `${a} / ${b} => ${1 - d}`); 28 | }); 29 | }); 30 | }); 31 | 
-------------------------------------------------------------------------------- /test/metrics/ratcliff-obershelp.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman metrics/distance/lcs tests 3 | * ==================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import {distance, similarity} from '../../src/metrics/ratcliff-obershelp'; 8 | 9 | describe('ratcliff-obershelp', function() { 10 | 11 | it('should correctly compute ratcliff-obershelp distance/similarity.', function() { 12 | const tests = [ 13 | ['test', 'test', 1], 14 | ['test', '', 0], 15 | ['', '', 1], 16 | ['', 'test', 0], 17 | ['mathematics', 'matematica', 18 / 21], 18 | ['mathematics'.split(''), 'matematica'.split(''), 18 / 21], 19 | ['cat', 'hat', 2 / 3], 20 | ['Niall', 'Neil', 2 / 3], 21 | ['aluminum', 'Catalan', 0.4], 22 | ['ATCG', 'TAGC', 0.5] 23 | ]; 24 | 25 | tests.forEach(function([a, b, d]) { 26 | assert.strictEqual(similarity(a, b), d, `${a} / ${b} => ${d}`); 27 | assert.strictEqual(distance(a, b), 1 - d, `${a} / ${b} => ${1 - d}`); 28 | }); 29 | }); 30 | }); 31 | -------------------------------------------------------------------------------- /test/metrics/sift4.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman metrics/distance/sift4 tests 3 | * ======================================= 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import sift4, {custom} from '../../src/metrics/sift4'; 8 | 9 | describe('sift4', function() { 10 | 11 | it('should correctly compute the SIFT4 distance.', function() { 12 | const tests = [ 13 | ['', '', 0], 14 | ['cat', 'cat', 0], 15 | ['cat', '', 3], 16 | ['', 'cat', 3], 17 | ['cat', 'hat', 1], 18 | ['levenshtein', 'frankenstein', 10] 19 | ]; 20 | 21 | tests.forEach(function([a, b, distance]) { 22 | assert.strictEqual(sift4(a, b), distance, `${a}, ${b}`); 23 | assert.strictEqual(custom({symmetric: true}, a, b), distance, 
`${a}, ${b}`); 24 | }); 25 | }); 26 | 27 | it('should be possible to change the maxOffset.', function() { 28 | assert.strictEqual( 29 | custom({transpositions: true, maxOffset: 5}, 'levenshtein', 'frankenstein'), 30 | 6 31 | ); 32 | 33 | assert.strictEqual( 34 | custom({transpositions: true, maxOffset: 2}, 'levenshtein', 'frankenstein'), 35 | 7 36 | ); 37 | 38 | assert.strictEqual( 39 | custom({transpositions: true, maxOffset: 1}, 'levenshtein', 'frankenstein'), 40 | 12 41 | ); 42 | }); 43 | 44 | it('should be possible to compute transpositions.', function() { 45 | assert.strictEqual( 46 | sift4('levenshtein', 'frankenstein'), 47 | 10 48 | ); 49 | 50 | assert.strictEqual( 51 | custom({transpositions: true}, 'levenshtein', 'frankenstein'), 52 | 6 53 | ); 54 | }); 55 | 56 | it('should be possible to set a maximum distance.', function() { 57 | assert.strictEqual( 58 | custom({ 59 | maxDistance: 10 60 | }, 'levenshtein', 'frankenstein'), 61 | 10 62 | ); 63 | 64 | assert.strictEqual( 65 | custom({ 66 | maxDistance: 5 67 | }, 'levenshtein', 'frankenstein'), 68 | Infinity 69 | ); 70 | 71 | assert.strictEqual( 72 | custom({ 73 | maxDistance: 10, 74 | transpositions: true 75 | }, 'levenshtein', 'frankenstein'), 76 | 6 77 | ); 78 | 79 | assert.strictEqual( 80 | custom({ 81 | maxDistance: 3, 82 | transpositions: true 83 | }, 'levenshtein', 'frankenstein'), 84 | Infinity 85 | ); 86 | }); 87 | }); 88 | -------------------------------------------------------------------------------- /test/metrics/smith-waterman.js: -------------------------------------------------------------------------------- 1 | /* eslint no-confusing-arrow: 0 */ 2 | /** 3 | * Talisman metrics/distance/smith-waterman tests 4 | * =============================================== 5 | * 6 | */ 7 | import assert from 'assert'; 8 | import {score} from '../../src/metrics/smith-waterman'; 9 | 10 | describe('smith-waterman', function() { 11 | 12 | it('should correctly compute the smith-waterman distance.', function() { 
13 | const tests = [ 14 | ['hello', 'hello', 5], 15 | ['hello', '', 0], 16 | ['', 'hello', 0], 17 | ['cat', 'hat', 2], 18 | ['xxxxABCx', 'yABCyyyy', 3], 19 | ['dva', 'deeve', 1, {gap: 2.2}], 20 | [[0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0], [1, 1, 1, 1], 3], 21 | ['dva', 'deeve', 2, {similarity: (a, b) => a === b ? 2 : -1}], 22 | ['GCATAGCU', 'GATTACA', 6.5, {gap: 1.4, similarity: (a, b) => a === b ? 1.5 : 0.5}] 23 | ]; 24 | 25 | tests.forEach(function([a, b, result, options = {}]) { 26 | assert.strictEqual(score(options, a, b), result, `${a} / ${b}`); 27 | }); 28 | }); 29 | }); 30 | -------------------------------------------------------------------------------- /test/metrics/suffix.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman metrics/distance/suffix tests 3 | * ======================================= 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import {distance, similarity} from '../../src/metrics/suffix'; 8 | 9 | describe('suffix', function() { 10 | 11 | it('should correctly compute suffix distance/similarity.', function() { 12 | const tests = [ 13 | ['test', 'test', 1], 14 | ['test', '', 0], 15 | ['', '', 1], 16 | ['', 'test', 0], 17 | ['cat', 'hat', 2 / 3], 18 | ['Niall', 'Neil', 0.25], 19 | ['aluminum', 'Catalan', 0], 20 | ['ATCG', 'TAGC', 0], 21 | ['ATCG', 'ATCH', 0], 22 | ['Test', 'test', 3 / 4] 23 | ]; 24 | 25 | tests.forEach(function([a, b, d]) { 26 | assert.strictEqual(similarity(a, b), d, `${a} / ${b} => ${d}`); 27 | assert.strictEqual(distance(a, b), 1 - d, `${a} / ${b} => ${1 - d}`); 28 | }); 29 | }); 30 | }); 31 | -------------------------------------------------------------------------------- /test/parsers/brown.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman parsers/brown tests 3 | * ============================= 4 | */ 5 | import assert from 'assert'; 6 | import brown from '../../src/parsers/brown'; 7 | import {loadResource} from 
'../helpers'; 8 | 9 | const ca02 = loadResource('brown/ca02.txt'); 10 | 11 | describe('brown', function() { 12 | 13 | it('should correctly parse Brown corpus text.', function() { 14 | 15 | const tokens = brown(ca02); 16 | 17 | assert.strictEqual(tokens.length, 2277); 18 | 19 | assert.deepEqual( 20 | tokens.slice(0, 10), 21 | [ 22 | ['Austin', 'np-hl'], 23 | [',', ',-hl'], 24 | ['Texas', 'np-hl'], 25 | ['--', '--'], 26 | ['Committee', 'nn'], 27 | ['approval', 'nn'], 28 | ['of', 'in'], 29 | ['Gov.', 'nn-tl'], 30 | ['Price', 'np'], 31 | ['Daniel\'s', 'np$'] 32 | ] 33 | ); 34 | }); 35 | }); 36 | -------------------------------------------------------------------------------- /test/phonetics/alpha-sis.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman phonetics/alpha-sis tests 3 | * =================================== 4 | * 5 | */ 6 | import {assert} from 'chai'; 7 | import alphaSis from '../../src/phonetics/alpha-sis'; 8 | 9 | describe('alpha-sis', function() { 10 | 11 | it('should throw if the given word is not a string.', function() { 12 | assert.throws(function() { 13 | alphaSis([]); 14 | }, /string/); 15 | }); 16 | 17 | it('should compute the Alpha SIS code correctly.', function() { 18 | const tests = [ 19 | ['', ['00000000000000']], 20 | ['Christopher', ['06401840000000', '07040184000000', '04018400000000']], 21 | ['Niall', ['02500000000000']], 22 | ['Smith', ['03100000000000']], 23 | ['Schmidt', ['06310000000000']], 24 | ['Rodgers', ['04740000000000']], 25 | ['Rogers', ['04740000000000']], 26 | ['Kant', ['07210000000000', '06210000000000']], 27 | ['Knuth', ['02100000000000']], 28 | ['Harper', ['24940000000000']], 29 | ['Collier', ['07540000000000', '06540000000000']], 30 | ['Schultz', ['06500000000000', '06510000000000']], 31 | ['Livingston', ['05827012000000']], 32 | ['Nichols', ['02650000000000', '02705000000000', '02050000000000']], 33 | ['Chavez', ['06800000000000', '07080000000000', '08000000000000']], 
34 | ['Ohrbock', ['14970000000000', '14960000000000']], 35 | ['Ohrbach', ['14960000000000', '14970000000000', '14900000000000']], 36 | ['Lyle', ['05500000000000']], 37 | ['Lisle', ['05050000000000']], 38 | ['Catz', ['07000000000000', '06000000000000', '07100000000000', '06100000000000']], 39 | ['Chritz', ['06400000000000', '07040000000000', '04000000000000', '06410000000000', '07041000000000', '04100000000000']], 40 | ['Chrichritz', ['06464000000000', '07046400000000', '04640000000000', '06470400000000', '07047040000000', '04704000000000', '06404000000000', '07040400000000', '04040000000000', '06464100000000', '07046410000000', '04641000000000', '06470410000000', '07047041000000', '04704100000000', '06404100000000', '07040410000000', '04041000000000']] 41 | ]; 42 | 43 | tests.forEach(function([name, codes]) { 44 | assert.sameMembers(alphaSis(name), codes, name); 45 | }); 46 | }); 47 | }); 48 | -------------------------------------------------------------------------------- /test/phonetics/caverphone.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman phonetics/caverphone tests 3 | * ==================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import caverphone, {revisited} from '../../src/phonetics/caverphone'; 8 | 9 | describe('caverphone', function() { 10 | 11 | it('should throw if the given word is not a string.', function() { 12 | assert.throws(function() { 13 | caverphone([]); 14 | }, /string/); 15 | }); 16 | 17 | it('should compute the caverphone code correctly.', function() { 18 | const tests = [ 19 | ['ANRKSN1111', 'Henrichsen'], 20 | ['ANRKSN1111', 'Henricsson'], 21 | ['ANRKSN1111', 'Henriksson'], 22 | ['ANRKSN1111', 'Hinrichsen'], 23 | ['ASKKA11111', 'Izchaki'], 24 | ['MKLFTA1111', 'Maclaverty'], 25 | ['MKLFTA1111', 'Macleverty'], 26 | ['MKLFTA1111', 'Mcclifferty'], 27 | ['MKLFTA1111', 'Mclafferty'], 28 | ['MKLFTA1111', 'Mclaverty'], 29 | ['SLKMP11111', 'Slocomb'], 30 | 
['SLKMP11111', 'Slocombe'], 31 | ['SLKMP11111', 'Slocumb'], 32 | ['WTLM111111', 'Whitlam'] 33 | ]; 34 | 35 | tests.forEach(function([code, word]) { 36 | assert.strictEqual(caverphone(word), code, `${word} => ${code}`); 37 | }); 38 | }); 39 | 40 | it('should compute the revisited version of the code correctly.', function() { 41 | const tests = [ 42 | ['PTA1111111', 'Peter'], 43 | ['ANRKSN1111', 'Henrichsen'], 44 | ['STFNSN1111', 'Stevenson'] 45 | ]; 46 | 47 | tests.forEach(function([code, word]) { 48 | assert.strictEqual(revisited(word), code, `${word} => ${code}`); 49 | }); 50 | }); 51 | }); 52 | -------------------------------------------------------------------------------- /test/phonetics/daitch-mokotoff.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman phonetics/daitch-mokotoff tests 3 | * ========================================= 4 | * 5 | */ 6 | import {assert} from 'chai'; 7 | import daitchMokotoff from '../../src/phonetics/daitch-mokotoff'; 8 | 9 | describe('daitch-mokotoff', function() { 10 | 11 | it('should throw if the given word is not a string.', function() { 12 | assert.throws(function() { 13 | daitchMokotoff([]); 14 | }, /string/); 15 | }); 16 | 17 | it('should compute the Daitch-Mokotoff code correctly.', function() { 18 | const tests = [ 19 | ['Alpert', ['087930']], 20 | ['Breuer', ['791900']], 21 | ['Golden', ['583600']], 22 | ['Haber', ['579000']], 23 | ['Manheim', ['665600']], 24 | ['Topf', ['370000']], 25 | ['Kleinman', ['586660']], 26 | ['Peters', ['739400', '734000']], 27 | ['Peterson', ['739460', '734600']], 28 | ['Moskowitz', ['645740']], 29 | ['Moskovitz', ['645740']], 30 | ['Auerbach', ['097500', '097400']], 31 | ['Ohrbach', ['097500', '097400']], 32 | ['Uhrbach', ['097500', '097400']], 33 | ['Lipshitz', ['874400']], 34 | ['Lippszyc', ['874500', '874400']], 35 | ['Lewinsky', ['876450']], 36 | ['Levinsky', ['876450']], 37 | ['Szlamawicz', ['486740']], 38 | ['Shlamovitz', 
['486740']], 39 | ['Jackson', ['154600', '454600', '145460', '445460']], 40 | ['Jackson-Jackson', ['154654', '454654', '145465', '445465', '154645', '454645', '145464', '445464', '154644', '454644']], 41 | ['augsburg', ['054795']], 42 | ['halberstadt', ['587943', '587433']], 43 | ['mannheim', ['665600']], 44 | ['chernowitz', ['596740', '496740']], 45 | ['cherkassy', ['595400', '495400']], 46 | ['berlin', ['798600']], 47 | ['mintz', ['664000']], 48 | ['eisenstadt', ['046433']], 49 | ['izenstadt', ['046433']], 50 | ['lewin', ['876000']], 51 | ['levine', ['876000']], 52 | ['szlachter', ['485390', '484390']], 53 | ['chelm', ['586000', '486000']], 54 | ['chelmie', ['586000', '486000']], 55 | ['chelma', ['586000', '486000']], 56 | ['helm', ['586000']], 57 | ['daitch', ['340000']], 58 | ['levy', ['870000']], 59 | ['mokotoff', ['653700']], 60 | ['chajackachac', ['515550', '415550', '514555', '414555', '515450', '415450', '514545', '414545', '515540', '415540', '514554', '414554', '515440', '415440', '514544', '414544']] 61 | ]; 62 | 63 | tests.forEach(function([name, codes]) { 64 | assert.sameMembers(daitchMokotoff(name), codes, name); 65 | }); 66 | }); 67 | }); 68 | -------------------------------------------------------------------------------- /test/phonetics/eudex.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman phonetics/eudex tests 3 | * =============================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import eudex from '../../src/phonetics/eudex'; 8 | 9 | describe('eudex', function() { 10 | 11 | it('should produce correct hashes.', function() { 12 | 13 | const identicalHashes = [ 14 | ['JAva', 'jAva'], 15 | ['co!mputer', 'computer'], 16 | ['comp-uter', 'computer'], 17 | ['comp@u#te?r', 'computer'], 18 | ['java', 'jiva'], 19 | ['lal', 'lel'], 20 | ['rindom', 'ryndom'], 21 | ['riiiindom', 'ryyyyyndom'], 22 | ['riyiyiiindom', 'ryyyyyndom'], 23 | ['triggered', 'TRIGGERED'], 24 | ['repert', 
'ropert'] 25 | ]; 26 | 27 | identicalHashes.forEach(function([one, two]) { 28 | assert(eudex(one).equals(eudex(two)), `${one} = ${two}`); 29 | }); 30 | 31 | const differentHashes = [ 32 | ['reddit', 'eddit'], 33 | ['lol', 'lulz'], 34 | ['ijava', 'java'], 35 | ['jesus', 'iesus'], 36 | ['aesus', 'iesus'], 37 | ['iesus', 'yesus'], 38 | ['rupirt', 'ropert'], 39 | ['ripert', 'ropyrt'], 40 | ['rrr', 'rraaaa'], 41 | ['randomal', 'randomai'] 42 | ]; 43 | 44 | differentHashes.forEach(function([one, two]) { 45 | assert(!eudex(one).equals(eudex(two)), `${one} != ${two}`); 46 | }); 47 | }); 48 | }); 49 | -------------------------------------------------------------------------------- /test/phonetics/french/phonex.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman phonetics/french/phonex tests 3 | * ======================================= 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import phonex from '../../../src/phonetics/french/phonex'; 8 | 9 | describe('phonex', function() { 10 | it('should throw if the given word is not a string.', function() { 11 | assert.throws(function() { 12 | phonex([]); 13 | }, /string/); 14 | }); 15 | 16 | it('should compute the Phonex code correctly.', function() { 17 | const tests = [ 18 | ['PHYLAURHEIMSMET', 'FILOR4SNY'], 19 | ['Martin', 'NORTIN'], 20 | ['Bernard', 'FYRNOR'], 21 | ['Faure', 'FORE'], 22 | ['Perez', 'TYRYZ'], 23 | ['Gros', 'GROS'], 24 | ['Chapuis', '5OTUIS'], 25 | ['Boyer', 'F2YR'], 26 | ['Gauthier', 'KOTIYR'], 27 | ['Rey', 'RY'], 28 | ['Barthélémy', 'FORTILINI'], 29 | ['Henry', 'H1RI'], 30 | ['Moulin', 'N3LIN'], 31 | ['Rousseau', 'R3SO'] 32 | ]; 33 | 34 | tests.forEach(function([word, code]) { 35 | assert.strictEqual(phonex(word), code, `${word} => ${code}`); 36 | }); 37 | }); 38 | }); 39 | -------------------------------------------------------------------------------- /test/phonetics/french/soundex.js: 
-------------------------------------------------------------------------------- 1 | /** 2 | * Talisman phonetics/french/soundex tests 3 | * ======================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import soundex from '../../../src/phonetics/french/soundex'; 8 | 9 | describe('soundex', function() { 10 | it('should throw if the given word is not a string.', function() { 11 | assert.throws(function() { 12 | soundex([]); 13 | }, /string/); 14 | }); 15 | 16 | it('should compute the Soundex code correctly.', function() { 17 | const tests = [ 18 | ['Florentin', 'F465'], 19 | ['Michael', 'M240'], 20 | ['Schœlcher', 'S242'], 21 | ['François', 'F658'], 22 | ['Christophe', 'C683'], 23 | ['Leffe', 'L900'], 24 | ['Elizabeth', 'E481'], 25 | ['Dupont', 'D153'], 26 | ['Dupond', 'D153'] 27 | ]; 28 | 29 | tests.forEach(function([word, code]) { 30 | assert.strictEqual(soundex(word), code, `${word} => ${code}`); 31 | }); 32 | 33 | const faure = soundex('Faure'); 34 | assert.strictEqual(soundex('Ferey'), faure); 35 | assert.strictEqual(soundex('Fery'), faure); 36 | assert.strictEqual(soundex('Frey'), faure); 37 | assert.strictEqual(soundex('Fueri'), faure); 38 | assert(soundex('Fort') !== faure); 39 | }); 40 | }); 41 | -------------------------------------------------------------------------------- /test/phonetics/french/soundex2.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman phonetics/french/soundex2 tests 3 | * ========================================= 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import soundex2 from '../../../src/phonetics/french/soundex2'; 8 | 9 | describe('soundex2', function() { 10 | it('should throw if the given word is not a string.', function() { 11 | assert.throws(function() { 12 | soundex2([]); 13 | }, /string/); 14 | }); 15 | 16 | it('should compute the Soundex2 code correctly.', function() { 17 | const tests = [ 18 | ['Asamian', 'AZMN'], 19 | ['Knight', 
'NG'], 20 | ['MacKenzie', 'MKNZ'], 21 | ['Pfeifer', 'FR'], 22 | ['Philippe', 'FLP'], 23 | ['Schindler', 'SNDL'], 24 | ['Chateau', 'CHT'], 25 | ['Habitat', 'HBT'], 26 | ['Téhéran', 'TRN'], 27 | ['Essayer', 'ESYR'], 28 | ['Crayon', 'CRYN'], 29 | ['Plyne', 'PLN'], 30 | ['Barad', 'BR'], 31 | ['Martin', 'MRTN'], 32 | ['Bernard', 'BRNR'], 33 | ['Faure', 'FR'], 34 | ['Perez', 'PRZ'], 35 | ['Gros', 'GR'], 36 | ['Chapuis', 'CHP'], 37 | ['Boyer', 'BYR'], 38 | ['Gauthier', 'KTR'], 39 | ['Rey', 'RY'], 40 | ['Barthélémy', 'BRTL'], 41 | ['Henry', 'HNR'], 42 | ['Moulin', 'MLN'], 43 | ['Rousseau', 'RS'] 44 | ]; 45 | 46 | tests.forEach(function([word, code]) { 47 | assert.strictEqual(soundex2(word), code, `${word} => ${code}`); 48 | }); 49 | 50 | assert.strictEqual(soundex2('Faure'), soundex2('Phaure')); 51 | }); 52 | }); 53 | -------------------------------------------------------------------------------- /test/phonetics/fuzzy-soundex.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman phonetics/fuzzy-soundex tests 3 | * ======================================= 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import fuzzySoundex from '../../src/phonetics/fuzzy-soundex'; 8 | 9 | describe('fuzzy-soundex', function() { 10 | 11 | it('should throw if the given word is not a string.', function() { 12 | assert.throws(function() { 13 | fuzzySoundex([]); 14 | }, /string/); 15 | }); 16 | 17 | it('should compute the fuzzy Soundex code correctly.', function() { 18 | const tests = [ 19 | ['', ''], 20 | ['Kristen', 'K6935'], 21 | ['Krissy', 'K69'], 22 | ['Christen', 'K6935'], 23 | ['peter', 'P36'], 24 | ['pete', 'P3'], 25 | ['pedro', 'P36'], 26 | ['stephen', 'S315'], 27 | ['steve', 'S31'], 28 | ['smith', 'S53'], 29 | ['smythe', 'S53'], 30 | ['gail', 'G4'], 31 | ['gayle', 'G4'], 32 | ['guillaume', 'G45'], 33 | ['christine', 'K6935'], 34 | ['christina', 'K6935'], 35 | ['kristina', 'K6935'], 36 | ['Wight', 'W3'], 37 | ['Hardt', 'H6'], 38 | 
['Knight', 'N3'], 39 | ['Czech', 'S7'], 40 | ['Tsech', 'S7'], 41 | ['gnomic', 'N59'], 42 | ['Wright', 'R3'], 43 | ['Hrothgar', 'R376'], 44 | ['Hwaet', 'W3'], 45 | ['Grant', 'G63'], 46 | ['Hart', 'H6'], 47 | ['Hardt', 'H6'] 48 | ]; 49 | 50 | tests.forEach(function([word, code]) { 51 | assert.strictEqual(fuzzySoundex(word), code, `${word} => ${code}`); 52 | }); 53 | }); 54 | }); 55 | -------------------------------------------------------------------------------- /test/phonetics/german/cologne.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman phonetics/german/cologne tests 3 | * ======================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import cologne from '../../../src/phonetics/german/cologne'; 8 | 9 | describe('cologne', function() { 10 | 11 | it('should throw if the given word is not a string.', function() { 12 | assert.throws(function() { 13 | cologne([]); 14 | }, /string/); 15 | }); 16 | 17 | it('should compute the cologne code correctly.', function() { 18 | const tests = [ 19 | ['65752682', 'Müller-Lüdenscheidt'], 20 | ['17863', 'Breschnew'], 21 | ['3412', 'Wikipedia'], 22 | ['4837', 'Xavier'], 23 | ['478237', 'Christopher'], 24 | ['3556', 'Wilhelm'], 25 | ['351', 'Philip'], 26 | ['1274', 'Patrick'], 27 | ['051742', 'Albrecht'] 28 | ]; 29 | 30 | tests.forEach(function([code, word]) { 31 | assert.strictEqual(cologne(word), code, `${word} => ${code}`); 32 | }); 33 | 34 | assert(cologne('Meyer') !== cologne('Müller')); 35 | assert(cologne('Meyer') === cologne('Mayr')); 36 | }); 37 | }); 38 | -------------------------------------------------------------------------------- /test/phonetics/german/phonem.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman phonetics/german/phonem tests 3 | * ======================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import phonem from 
'../../../src/phonetics/german/phonem'; 8 | 9 | describe('phonem', function() { 10 | 11 | it('should throw if the given word is not a string.', function() { 12 | assert.throws(function() { 13 | phonem([]); 14 | }, /string/); 15 | }); 16 | 17 | it('should compute the phonem code correctly.', function() { 18 | const tests = [ 19 | ['', ''], 20 | ['müller', 'MYLR'], 21 | ['schmidt', 'CMYD'], 22 | ['schneider', 'CNAYDR'], 23 | ['fischer', 'VYCR'], 24 | ['weber', 'VBR'], 25 | ['meyer', 'MAYR'], 26 | ['wagner', 'VACNR'], 27 | ['schulz', 'CULC'], 28 | ['becker', 'BCR'], 29 | ['hoffmann', 'OVMAN'], 30 | ['schäfer', 'CVR'], 31 | ['mair', 'MAYR'], 32 | ['bäker', 'BCR'], 33 | ['schaeffer', 'CVR'], 34 | ['computer', 'COMBUDR'], 35 | ['pfeifer', 'VAYVR'], 36 | ['pfeiffer', 'VAYVR'] 37 | ]; 38 | 39 | tests.forEach(function([word, code]) { 40 | assert.strictEqual(phonem(word), code, `${word} => ${code}`); 41 | }); 42 | }); 43 | }); 44 | -------------------------------------------------------------------------------- /test/phonetics/lein.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman phonetics/lein tests 3 | * ============================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import lein from '../../src/phonetics/lein'; 8 | 9 | describe('lein', function() { 10 | 11 | it('should throw if the given word is not a string.', function() { 12 | assert.throws(function() { 13 | lein([]); 14 | }, /string/); 15 | }); 16 | 17 | it('should compute the lein code correctly.', function() { 18 | const tests = [ 19 | ['Guillaume', 'G320'], 20 | ['Dabbs', 'D450'], 21 | ['Daves', 'D450'], 22 | ['Davies', 'D450'], 23 | ['Davis', 'D450'], 24 | ['Debaca', 'D450'], 25 | ['Debose', 'D450'], 26 | ['Debus', 'D450'], 27 | ['Defazio', 'D450'], 28 | ['Defigh', 'D450'], 29 | ['Deveaux', 'D450'], 30 | ['Devese', 'D450'], 31 | ['Devies', 'D450'], 32 | ['Devos', 'D450'], 33 | ['Dipiazza', 'D450'], 34 | ['Divish', 'D450'], 35 | ['Dobak', 
'D450'], 36 | ['Dobbs', 'D450'], 37 | ['Dobis', 'D450'], 38 | ['Dobish', 'D450'], 39 | ['Dobosh', 'D450'], 40 | ['Doepke', 'D450'], 41 | ['Dopps', 'D450'], 42 | ['Doubek', 'D450'], 43 | ['Doviak', 'D450'], 44 | ['Dubbs', 'D450'], 45 | ['Dubke', 'D450'], 46 | ['Dubois', 'D450'], 47 | ['Duboise', 'D450'], 48 | ['Dubose', 'D450'], 49 | ['Dubs', 'D450'], 50 | ['Dubukey', 'D450'], 51 | ['Dubus', 'D450'], 52 | ['Dufek', 'D450'], 53 | ['Duffek', 'D450'], 54 | ['Dupas', 'D450'], 55 | ['Dupois', 'D450'], 56 | ['Dupuis', 'D450'], 57 | ['Arlène', 'A332'], 58 | ['Lüdenscheidt', 'L125'] 59 | ]; 60 | 61 | tests.forEach(function([word, code]) { 62 | assert.strictEqual(lein(word), code, `${word} => ${code}`); 63 | }); 64 | }); 65 | }); 66 | -------------------------------------------------------------------------------- /test/phonetics/metaphone.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman phonetics/metaphone tests 3 | * =================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import metaphone from '../../src/phonetics/metaphone'; 8 | 9 | describe('metaphone', function() { 10 | 11 | it('should throw if the given word is not a string.', function() { 12 | assert.throws(function() { 13 | metaphone([]); 14 | }, /string/); 15 | }); 16 | 17 | it('should compute the metaphone code correctly.', function() { 18 | const tests = [ 19 | ['TSKRMNXN', 'discrimination'], 20 | ['HL', 'hello'], 21 | ['TRT', 'droid'], 22 | ['HPKRT', 'hypocrite'], 23 | ['WL', 'well'], 24 | ['AM', 'am'], 25 | ['S', 'say'], 26 | ['FSNT', 'pheasant'], 27 | ['KT', 'god'] 28 | ]; 29 | 30 | tests.forEach(function([code, word]) { 31 | assert.strictEqual(metaphone(word), code); 32 | }); 33 | }); 34 | }); 35 | -------------------------------------------------------------------------------- /test/phonetics/mra.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman phonetics/mra tests 
3 | * ============================= 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import mra from '../../src/phonetics/mra'; 8 | 9 | describe('mra', function() { 10 | 11 | it('should throw if the given word is not a string.', function() { 12 | assert.throws(function() { 13 | mra([]); 14 | }, /string/); 15 | }); 16 | 17 | it('should compute the MRA codex correctly.', function() { 18 | const tests = [ 19 | ['BYRN', 'Byrne'], 20 | ['BRN', 'Boern'], 21 | ['SMTH', 'Smith'], 22 | ['SMYTH', 'Smyth'], 23 | ['CTHRN', 'Catherine'], 24 | ['KTHRYN', 'Kathryn'] 25 | ]; 26 | 27 | tests.forEach(function([code, word]) { 28 | assert.strictEqual(mra(word), code, `${word} => ${code}`); 29 | }); 30 | }); 31 | }); 32 | -------------------------------------------------------------------------------- /test/phonetics/nysiis.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman phonetics/nysiis tests 3 | * ================================ 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import nysiis, {refined} from '../../src/phonetics/nysiis'; 8 | 9 | describe('nysiis', function() { 10 | 11 | it('should throw if the given word is not a string.', function() { 12 | assert.throws(function() { 13 | nysiis([]); 14 | }, /string/); 15 | }); 16 | 17 | it('should compute the nysiis code correctly.', function() { 18 | const tests = [ 19 | ['ANDR', 'Andrew'], 20 | ['RABARTSAN', 'Robertson'], 21 | ['NALAN', 'Nolan'], 22 | ['LASXV', 'Louis XVI'], 23 | ['CAS', 'Case'], 24 | ['MCLAGLAN', 'Mclaughlin'], 25 | ['AWAL', 'Awale'], 26 | ['AAGAR', 'Aegir'], 27 | ['LANDGRAN', 'Lundgren'], 28 | ['FFALBAD', 'Philbert'], 29 | ['HARY', 'Harry'], 30 | ['MCANSY', 'Mackenzie'], 31 | ['ANAD', 'ANANND'] 32 | ]; 33 | 34 | tests.forEach(function([code, word]) { 35 | assert.strictEqual(nysiis(word), code, `${word} => ${code}`); 36 | }); 37 | }); 38 | 39 | it('should compute the refined version of the code correctly.', function() { 40 | const tests = [ 41 | 
['ANDR', 'Andrew'], 42 | ['RABARTSAN', 'Robertson'], 43 | ['NALAN', 'Nolan'], 44 | ['LASXV', 'Louis XVI'], 45 | ['CAS', 'Case'], 46 | ['MCLAGHLAN', 'Mclaughlin'], 47 | ['AL', 'Awale'], 48 | ['AGAR', 'Aegir'], 49 | ['LANGRAN', 'Lundgren'], 50 | ['FALBAD', 'Philbert'], 51 | ['HARY', 'Harry'], 52 | ['MCANSY', 'Mackenzie'], 53 | ['ANAD', 'ANANND'] 54 | ]; 55 | 56 | tests.forEach(function([code, word]) { 57 | assert.strictEqual(refined(word), code, `${word} => ${code}`); 58 | }); 59 | }); 60 | }); 61 | -------------------------------------------------------------------------------- /test/phonetics/onca.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman phonetics/onca tests 3 | * ============================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import onca from '../../src/phonetics/onca'; 8 | 9 | describe('onca', function() { 10 | 11 | it('should compute the ONCA code correctly.', function() { 12 | const tests = [ 13 | ['andersen', 'A536'], 14 | ['Anderson', 'A536'], 15 | ['Brian', 'B650'], 16 | ['Brown', 'B650'], 17 | ['brun', 'B650'], 18 | ['cap', 'C100'], 19 | ['cope', 'C100'], 20 | ['copp', 'C100'], 21 | ['kipp', 'C100'], 22 | ['dane', 'D500'], 23 | ['dean', 'D500'], 24 | ['dionne', 'D500'], 25 | ['smith', 'S530'], 26 | ['schmit', 'S530'], 27 | ['schmidt', 'S530'], 28 | ['truman', 'T655'], 29 | ['trueman', 'T655'] 30 | ]; 31 | 32 | tests.forEach(function([word, code]) { 33 | assert.strictEqual(onca(word), code, `${word} => ${code}`); 34 | }); 35 | }); 36 | }); 37 | -------------------------------------------------------------------------------- /test/phonetics/phonex.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman phonetics/phonex tests 3 | * ================================ 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import phonex from '../../src/phonetics/phonex'; 8 | 9 | describe('phonex', function() { 10 | 11 | it('should throw 
if the given word is not a string.', function() { 12 | assert.throws(function() { 13 | phonex([]); 14 | }, /string/); 15 | }); 16 | 17 | it('should compute the phonex code correctly.', function() { 18 | const tests = [ 19 | ['', ''], 20 | ['Guillaume', 'G45'], 21 | ['Ewell', 'A4'], 22 | ['Filp', 'F1'], 23 | ['Heames', 'A5'], 24 | ['Kneves', 'N1'], 25 | ['River', 'R16'], 26 | ['Corley', 'C4'], 27 | ['Carton', 'C35'], 28 | ['Cachpole', 'C214'], 29 | ['Saxon', 'S25'], 30 | ['Wright', 'R23'], 31 | ['Ai', 'A'], 32 | ['Barth', 'B3'], 33 | ['Perry', 'B6'], 34 | ['Garth', 'G3'], 35 | ['Jerry', 'G6'], 36 | ['Gerry', 'G6'], 37 | ['Camden', 'C5'], 38 | ['Ganges', 'G5'], 39 | ['A-1', 'A'] 40 | ]; 41 | 42 | const identical = [ 43 | ['Ewell', 'Ule'], 44 | ['Filp', 'Philp'], 45 | ['Yule', 'Ewell'], 46 | ['Heames', 'Eames'], 47 | ['Kneves', 'Neves'], 48 | ['River', 'Rivers'], 49 | ['Corley', 'Coley'], 50 | ['Carton', 'Carlton'], 51 | ['Cachpole', 'Catchpole'], 52 | ]; 53 | 54 | tests.forEach(function([word, code]) { 55 | assert.strictEqual(phonex(word), code, `${word} => ${code}`); 56 | }); 57 | 58 | identical.forEach(function([one, two]) { 59 | assert.strictEqual(phonex(one), phonex(two)); 60 | }); 61 | }); 62 | }); 63 | -------------------------------------------------------------------------------- /test/phonetics/roger-root.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman phonetics/roger-root tests 3 | * ==================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import rogerRoot from '../../src/phonetics/roger-root'; 8 | 9 | describe('roger-root', function() { 10 | 11 | it('should throw if the given word is not a string.', function() { 12 | assert.throws(function() { 13 | rogerRoot([]); 14 | }, /string/); 15 | }); 16 | 17 | it('should compute the roger-root code correctly.', function() { 18 | const tests = [ 19 | ['Guillaume', '07530'], 20 | ['Arlène', '14520'], 21 | ['Lüdenscheidt', 
'05126'], 22 | ['Chalman', '06532'], 23 | ['Ching', '06270'], 24 | ['Anderson', '12140'], 25 | ['Overstreet', '18401'], 26 | ['Heckel', '27500'], 27 | ['Wyszynski', '40207'], 28 | ['Whitted', '41100'], 29 | ['Ongoog', '12770'], 30 | ['Johnson', '32020'], 31 | ['Williams', '45300'], 32 | ['Smith', '00310'], 33 | ['Jones', '32000'], 34 | ['Brown', '09420'], 35 | ['Davis', '01800'], 36 | ['Jackson', '37020'], 37 | ['Wilson', '45020'], 38 | ['Lee', '05000'], 39 | ['Thomas', '01300'] 40 | ]; 41 | 42 | tests.forEach(function([word, code]) { 43 | assert.strictEqual(rogerRoot(word), code, `${word} => ${code}`); 44 | }); 45 | }); 46 | }); 47 | -------------------------------------------------------------------------------- /test/phonetics/sound-d.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman phonetics/sound-d tests 3 | * ================================= 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import soundD from '../../src/phonetics/sound-d'; 8 | 9 | describe('SoundD', function() { 10 | 11 | it('should throw if the given word is not a string.', function() { 12 | assert.throws(function() { 13 | soundD([]); 14 | }, /string/); 15 | }); 16 | 17 | it('should compute the SoundD code correctly.', function() { 18 | const tests = [ 19 | ['5630', 'Martha'], 20 | ['6262', 'Rogers'], 21 | ['6262', 'Rodgers'], 22 | ['2520', 'Hodgins'], 23 | ['2520', 'Hojins'], 24 | ['3500', 'Houghton'], 25 | ['5300', 'Knight'], 26 | ['5300', 'Night'], 27 | ['5550', 'Gnomon'], 28 | ['5550', 'Nomon'], 29 | ['5100', 'Pnaf'], 30 | ['5100', 'Naf'], 31 | ['2600', 'Ackroyd'], 32 | ['2600', 'Ckroyd'], 33 | ['6300', 'Wright'], 34 | ['6300', 'Right'], 35 | ['2160', 'Xavier'], 36 | ['2160', 'Savior'], 37 | ['3500', 'Whitney'], 38 | ['3500', 'Witney'] 39 | ]; 40 | 41 | tests.forEach(function([code, word]) { 42 | assert.strictEqual(soundD(word), code, `${word} => ${code}`); 43 | }); 44 | }); 45 | }); 46 | 
-------------------------------------------------------------------------------- /test/phonetics/statcan.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman phonetics/statcan tests 3 | * ================================= 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import statcan from '../../src/phonetics/statcan'; 8 | 9 | describe('statcan', function() { 10 | 11 | it('should throw if the given word is not a string.', function() { 12 | assert.throws(function() { 13 | statcan([]); 14 | }, /string/); 15 | }); 16 | 17 | it('should compute the statcan code correctly.', function() { 18 | const tests = [ 19 | ['Guillaume', 'GLM'], 20 | ['Daves', 'DVS'], 21 | ['Davies', 'DVS'], 22 | ['Davis', 'DVS'], 23 | ['Devese', 'DVS'], 24 | ['Devies', 'DVS'], 25 | ['Devos', 'DVS'], 26 | ['Dove', 'DV'], 27 | ['Divish', 'DVSH'], 28 | ['Arlène', 'ARLN'], 29 | ['Lüdenscheidt', 'LDNS'] 30 | ]; 31 | 32 | tests.forEach(function([word, code]) { 33 | assert.strictEqual(statcan(word), code, `${word} => ${code}`); 34 | }); 35 | }); 36 | }); 37 | -------------------------------------------------------------------------------- /test/regexp/index.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman regexp tests 3 | * ====================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import { 8 | createFuzzyPattern, 9 | escapeRegexp 10 | } from '../../src/regexp'; 11 | 12 | describe('regexp', function() { 13 | 14 | describe('#.escapeRegexp', function() { 15 | 16 | it('should correctly escape strings.', function() { 17 | assert.strictEqual(escapeRegexp('[]'), '\\[\\]'); 18 | }); 19 | }); 20 | 21 | describe('#.createFuzzyPattern', function() { 22 | 23 | it('should create the expected pattern.', function() { 24 | assert.strictEqual(createFuzzyPattern('ajs'), '(a).*?(j).*?(s)'); 25 | }); 26 | }); 27 | }); 28 | 
-------------------------------------------------------------------------------- /test/stemmers/french/carry.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman stemmers/french/carry tests 3 | * ===================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import carry from '../../../src/stemmers/french/carry'; 8 | 9 | describe('carry', function() { 10 | 11 | it('should correctly stem the given words.', function() { 12 | const tests = [ 13 | ['Chiennes', 'chien'], 14 | ['Tissaient', 'tis'], 15 | ['Tisser', 'tis'], 16 | ['Tisserand', 'tisserand'], 17 | ['enflammer', 'enflam'], 18 | ['groseilles', 'groseil'], 19 | ['tentateur', 'ten'], 20 | ['tentateurs', 'ten'], 21 | ['tentatrice', 'ten'], 22 | ['tenter', 'ten'], 23 | ['tenteras', 'ten'], 24 | ['formateur', 'form'], 25 | ['formatrice', 'form'], 26 | ['former', 'form'], 27 | ['formes', 'form'] 28 | ]; 29 | 30 | tests.forEach(function([word, stem]) { 31 | assert.strictEqual(carry(word), stem, `${word} => ${stem}`); 32 | }); 33 | }); 34 | }); 35 | -------------------------------------------------------------------------------- /test/stemmers/french/eda.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman stemmers/french/eda tests 3 | * =================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import eda from '../../../src/stemmers/french/eda'; 8 | 9 | describe('eda', function() { 10 | 11 | it('should correctly stem the given words.', function() { 12 | const tests = [ 13 | ['intestin', 'intestin'], 14 | ['intestins', 'intestin'], 15 | ['intestine', 'intestin'], 16 | ['intestines', 'intestin'], 17 | ['intestinal', 'intestin'], 18 | ['intestinaux', 'intestin'], 19 | ['intestinales', 'intestin'], 20 | ['intestinale', 'intestin'] 21 | ]; 22 | 23 | tests.forEach(function([word, stem]) { 24 | assert.strictEqual(eda(word), stem); 25 | }); 26 | }); 27 | }); 28 | 
-------------------------------------------------------------------------------- /test/stemmers/german/caumanns.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman stemmers/german/caumanns tests 3 | * ======================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import caumanns from '../../../src/stemmers/german/caumanns'; 8 | 9 | describe('caumanns', function() { 10 | it('should correctly stem the given words.', function() { 11 | const tests = [ 12 | ['', ''], 13 | ['lesen', 'les'], 14 | ['graues', 'grau'], 15 | ['buchstabieren', 'buchstabier'], 16 | ['singt', 'sing'], 17 | ['singen', 'sing'], 18 | ['beliebt', 'belieb'], 19 | ['beliebtester', 'belieb'], 20 | ['stören', 'stor'], 21 | ['stöhnen', 'stoh'], 22 | ['Kuß', 'kuss'], 23 | ['Küsse', 'kuss'], 24 | ['Verlierer', 'verlier'], 25 | ['Verlies', 'verlie'], 26 | ['Maus', 'mau'], 27 | ['Mauer', 'mau'], 28 | ['Störsender', 'stor'], 29 | ['Müllerinnen', 'mullerin'], 30 | ['Matrix', 'matrix'], 31 | ['Matrizen', 'matrix'], 32 | ['häufig', 'haufig'], 33 | ['üor', 'uor'], 34 | ['björk', 'bjork'], 35 | ['abschließen', 'abschliess'], 36 | ['abschließender', 'abschliess'], 37 | ['abschließendes', 'abschliess'], 38 | ['abschließenden', 'abschliess'], 39 | ['Tisch', 'tisch'], 40 | ['Tische', 'tisch'], 41 | ['Tischen', 'tisch'], 42 | ['geheimtür', 'geheimtur'], 43 | ['Haus', 'hau'], 44 | ['Hauses', 'hau'], 45 | ['Häuser', 'hau'], 46 | ['Häusern', 'hau'], 47 | ['hauen', 'hau'], 48 | ['Drama', 'drama'], 49 | ['Dramen', 'dram'], 50 | ['Ausmaß', 'ausmass'], 51 | ['xxxxxe', 'xxxxx'], 52 | ['xxxxxs', 'xxxxx'], 53 | ['xxxxxn', 'xxxxx'], 54 | ['xxxxxt', 'xxxxx'], 55 | ['xxxxxem', 'xxxxx'], 56 | ['xxxxxer', 'xxxxx'], 57 | ['xxxxxnd', 'xxxxx'], 58 | ['xxxxxetende', 'xxxxx'], 59 | ['xxe', 'xxe'], 60 | ['xxem', 'xxem'], 61 | ['xxer', 'xxer'], 62 | ['xxxnd', 'xxxnd'] 63 | ]; 64 | 65 | tests.forEach(function([word, stem]) { 66 | 
assert.strictEqual(caumanns(word), stem); 67 | }); 68 | }); 69 | }); 70 | -------------------------------------------------------------------------------- /test/stemmers/lancaster.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman stemmers/lancaster tests 3 | * ================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import lancaster from '../../src/stemmers/lancaster'; 8 | 9 | describe('lancaster', function() { 10 | it('should correctly stem the given words.', function() { 11 | const tests = [ 12 | ['worker', 'work'], 13 | ['marks', 'mark'], 14 | ['MARKS', 'mark'], 15 | ['living', 'liv'], 16 | ['thing', 'thing'], 17 | ['ear', 'ear'], 18 | ['string', 'string'], 19 | ['triplicate', 'triply'], 20 | ['classified', 'class'], 21 | ['maximum', 'maxim'], 22 | ['presumably', 'presum'], 23 | ['exceed', 'excess'], 24 | ['anguish', 'anct'], 25 | ['affluxion', 'affluct'], 26 | ['discept', 'disceiv'] 27 | ]; 28 | 29 | tests.forEach(function([word, stem]) { 30 | assert.strictEqual(lancaster(word), stem); 31 | }); 32 | }); 33 | }); 34 | -------------------------------------------------------------------------------- /test/stemmers/lovins.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman stemmers/lovins tests 3 | * =============================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import lovins from '../../src/stemmers/lovins'; 8 | 9 | describe('lovins', function() { 10 | it('should correctly stem the given words.', function() { 11 | const tests = [ 12 | ['nationally', 'nat'], 13 | ['sitting', 'sit'], 14 | ['matrix', 'matric'], 15 | ['matrices', 'matric'], 16 | ['rubbing', 'rub'], 17 | ['rubb', 'rub'], 18 | ['believe', 'belief'], 19 | ['consumption', 'consum'], 20 | ['induction', 'induc'], 21 | ['absorption', 'absorb'], 22 | ['recursive', 'recur'], 23 | ['administrate', 'administer'], 24 | ['parametric', 'parameter'], 25 | 
['dissolved', 'dissolut'], 26 | ['angular', 'angl'], 27 | ['vibex', 'vibic'], 28 | ['index', 'indic'], 29 | ['apex', 'apic'], 30 | ['cortex', 'cortic'], 31 | ['anthrax', 'anthrac'], 32 | ['persuade', 'persuas'], 33 | ['evade', 'evas'], 34 | ['decide', 'dec'], 35 | ['elide', 'el'], 36 | ['deride', 'der'], 37 | ['expand', 'expans'], 38 | ['defend', 'defens'], 39 | ['respond', 'respons'], 40 | ['collusion', 'collus'], 41 | ['obstrusion', 'obstrus'], 42 | ['adhesion', 'adhes'], 43 | ['remit', 'remis'], 44 | ['extent', 'extens'], 45 | ['converted', 'convers'], 46 | ['parenthetic', 'parenthes'], 47 | ['analytic', 'analys'], 48 | ['analyzed', 'analys'] 49 | ]; 50 | 51 | tests.forEach(function([word, stem]) { 52 | assert.strictEqual(lovins(word), stem, `${word} => ${stem}`); 53 | }); 54 | }); 55 | }); 56 | -------------------------------------------------------------------------------- /test/stemmers/porter.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman stemmers/porter tests 3 | * =============================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import porter from '../../src/stemmers/porter'; 8 | 9 | describe('porter', function() { 10 | it('should correctly stem the given words.', function() { 11 | const tests = [ 12 | ['you', 'you'], 13 | ['catastrophe', 'catastroph'], 14 | ['anathema', 'anathema'], 15 | ['mathematics', 'mathemat'], 16 | ['adjective', 'adject'], 17 | ['mushroom', 'mushroom'], 18 | ['building', 'build'], 19 | ['spiteful', 'spite'], 20 | ['external', 'extern'], 21 | ['exterior', 'exterior'], 22 | ['coffee', 'coffe'] 23 | ]; 24 | 25 | tests.forEach(function([word, stem]) { 26 | assert.strictEqual(porter(word), stem); 27 | }); 28 | }); 29 | }); 30 | -------------------------------------------------------------------------------- /test/stemmers/s-stemmer.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman stemmers/s-stemmer tests 
3 | * ================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import sStemmer from '../../src/stemmers/s-stemmer'; 8 | 9 | describe('s-stemmer', function() { 10 | it('should correctly stem the given words.', function() { 11 | const tests = [ 12 | ['', ''], 13 | ['one', 'one'], 14 | ['is', 'is'], 15 | ['reciprocity', 'reciprocity'], 16 | ['queries', 'query'], 17 | ['phrases', 'phrase'], 18 | ['corpus', 'corpus'], 19 | ['stress', 'stress'], 20 | ['kings', 'king'], 21 | ['panels', 'panel'], 22 | ['aerodynamics', 'aerodynamic'], 23 | ['congress', 'congress'], 24 | ['serious', 'serious'] 25 | ]; 26 | 27 | tests.forEach(function([word, stem]) { 28 | assert.strictEqual(sStemmer(word), stem, `${word} => ${stem}`); 29 | }); 30 | }); 31 | }); 32 | -------------------------------------------------------------------------------- /test/stemmers/uea-lite.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman stemmers/uea-lite tests 3 | * ================================= 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import ueaLite, {withRule} from '../../src/stemmers/uea-lite'; 8 | 9 | describe('uea-lite', function() { 10 | it('should be possible to retrieve the number of the matched rule.', function() { 11 | assert.deepEqual(withRule('ordained'), { 12 | rule: '13.6', 13 | stem: 'ordain' 14 | }); 15 | 16 | assert.deepEqual(withRule('during'), { 17 | rule: '90', 18 | stem: 'during' 19 | }); 20 | }); 21 | 22 | it('should correctly stem the given words.', function() { 23 | const tests = [ 24 | ['is', 'is'], 25 | ['man', 'man'], 26 | ['happiness', 'happiness'], 27 | ['theses', 'thesis'], 28 | ['bases', 'base'], 29 | ['ordained', 'ordain'], 30 | ['killed', 'kill'], 31 | ['liked', 'like'], 32 | ['helped', 'help'], 33 | ['scarred', 'scar'], 34 | ['invited', 'invite'], 35 | ['exited', 'exit'], 36 | ['exited', 'exit'], 37 | ['debited', 'debit'], 38 | ['smited', 'smite'], 39 | ['running', 'run'], 40 | 
['settings', 'set'], 41 | ['timing', 'time'], 42 | ['dying', 'die'], 43 | ['undying', 'undie'], 44 | ['untying', 'untie'], 45 | ['flying', 'fly'], 46 | ['lying', 'lie'], 47 | ['harping', 'harp'], 48 | ['charring', 'char'], 49 | ['changes', 'change'], 50 | ['deaths', 'death'], 51 | ['shadows', 'shadow'], 52 | ['flies', 'fly'], 53 | ['things', 'thing'], 54 | ['nothings', 'nothing'], 55 | ['witches', 'witch'], 56 | ['makes', 'make'], 57 | ['smokes', 'smoke'], 58 | ['does', 'do'], 59 | ['abodes', 'abode'], 60 | ['escapades', 'escapade'], 61 | ['crusades', 'crusade'], 62 | ['grades', 'grade'], 63 | ['wires', 'wire'], 64 | ['acres', 'acre'], 65 | ['fires', 'fire'], 66 | ['cares', 'care'], 67 | ['USA', 'USA'], 68 | ['FLOSS', 'FLOSS'], 69 | ['MREs', 'MRE'], 70 | ['USAED', 'USAED'] 71 | ]; 72 | 73 | tests.forEach(function([word, stem]) { 74 | assert.strictEqual(ueaLite(word), stem, `${word} => ${stem}`); 75 | }); 76 | }); 77 | }); 78 | -------------------------------------------------------------------------------- /test/tokenizers/hyphenation/liang.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman tokenizers/hyphenation/liang tests 3 | * ============================================ 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import liang from '../../../src/tokenizers/hyphenation/liang'; 8 | 9 | describe('liang', function() { 10 | it('should correctly tokenize liang.', function() { 11 | const tests = [ 12 | ['project', ['project']], 13 | ['hyphenation', ['hy', 'phen', 'ation']], 14 | ['supercalifragilisticexpialidocious', ['su', 'per', 'cal', 'ifrag', 'ilis', 'tic', 'ex', 'pi', 'ali', 'do', 'cious']], 15 | ['computer', ['com', 'put', 'er']], 16 | ['subdivision', ['sub', 'di', 'vi', 'sion']], 17 | ['creative', ['cre', 'ative']], 18 | ['disciplines', ['dis', 'ci', 'plines']], 19 | ['philanthropic', ['phil', 'an', 'thropic']] 20 | ]; 21 | 22 | tests.forEach(function([word, tokens]) { 23 | 
assert.deepEqual(liang(word), tokens); 24 | }); 25 | }); 26 | }); 27 | -------------------------------------------------------------------------------- /test/tokenizers/lines/naive.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman tokenizers/lines/naive tests 3 | * ====================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import lines from '../../../src/tokenizers/lines/naive'; 8 | 9 | describe('naive', function() { 10 | it('should properly tokenize lines.', function() { 11 | const text = 'First.\n \nSecond.\r\nThird.\rFourth.\n\rFifth.\n\nSixth.\n'; 12 | 13 | const tokens = lines(text); 14 | 15 | assert.deepEqual(tokens, [ 16 | 'First.', 17 | ' ', 18 | 'Second.', 19 | 'Third.', 20 | 'Fourth.', 21 | 'Fifth.', 22 | '', 23 | 'Sixth.', 24 | '' 25 | ]); 26 | }); 27 | }); 28 | -------------------------------------------------------------------------------- /test/tokenizers/ngrams.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman tokenizers/ngrams tests 3 | * ================================= 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import ngrams, { 8 | bigrams, 9 | trigrams, 10 | quadrigrams 11 | } from '../../src/tokenizers/ngrams'; 12 | 13 | describe('ngrams', function() { 14 | 15 | it('should throw if n is < 1.', function() { 16 | assert.throws(function() { 17 | ngrams(-1, [1, 2, 3]); 18 | }, Error); 19 | }); 20 | 21 | it('should properly compute ngrams.', function() { 22 | const solutions = { 23 | 1: [['h'], ['e'], ['l'], ['l'], ['o']], 24 | 2: [['h', 'e'], ['e', 'l'], ['l', 'l'], ['l', 'o']], 25 | 3: [['h', 'e', 'l'], ['e', 'l', 'l'], ['l', 'l', 'o']], 26 | 4: [['h', 'e', 'l', 'l'], ['e', 'l', 'l', 'o']] 27 | }; 28 | 29 | Object.keys(solutions).forEach(n => { 30 | assert.deepEqual(ngrams(n, 'hello'.split('')), solutions[n], `n = ${n}`); 31 | assert.deepEqual(ngrams(n, 'hello'), solutions[n].map(s => s.join('')), 
`n = ${n}`); 32 | }); 33 | }); 34 | 35 | it('popular aliases should also work.', function() { 36 | assert.deepEqual(bigrams('hello'), ngrams(2, 'hello')); 37 | assert.deepEqual(trigrams('hello'), ngrams(3, 'hello')); 38 | assert.deepEqual(quadrigrams('hello'), ngrams(4, 'hello')); 39 | }); 40 | }); 41 | -------------------------------------------------------------------------------- /test/tokenizers/paragraphs/naive.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman tokenizers/paragraphs/naive tests 3 | * =========================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import paragraphs from '../../../src/tokenizers/paragraphs/naive'; 8 | 9 | describe('naive', function() { 10 | it('should properly tokenize paragraphs.', function() { 11 | const text = [ 12 | 'Hello first paragraph.\n\nWhat do you do?\r\n\r\nHello Mom!\r\n\r\n', 13 | 'Another paragraph. Multiple sentences.\nYou see?\n\n\n', 14 | 'Here.\n\t\nThere.\n \nOver there!\n\n', 15 | 'One.\r\rTwo.\n\r \n\rThree.' 16 | ].join(''); 17 | 18 | const tokens = paragraphs(text); 19 | 20 | assert.deepEqual(tokens, [ 21 | 'Hello first paragraph.', 22 | 'What do you do?', 23 | 'Hello Mom!', 24 | 'Another paragraph. Multiple sentences.\nYou see?', 25 | 'Here.', 26 | 'There.', 27 | 'Over there!', 28 | 'One.', 29 | 'Two.', 30 | 'Three.' 
31 | ]); 32 | }); 33 | }); 34 | -------------------------------------------------------------------------------- /test/tokenizers/skipgrams.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman tokenizers/skipgrams tests 3 | * ==================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import skipgrams from '../../src/tokenizers/skipgrams'; 8 | 9 | const WORDS = 'Insurgents killed in ongoing fighting'.split(' '); 10 | 11 | describe('skipgrams', function() { 12 | 13 | it('should throw if n is < k.', function() { 14 | assert.throws(function() { 15 | skipgrams(2, 1, [1, 2, 3]); 16 | }, Error); 17 | }); 18 | 19 | it('should throw if k is < 1.', function() { 20 | assert.throws(function() { 21 | skipgrams(-1, -1, [1, 2, 3]); 22 | }, Error); 23 | }); 24 | 25 | it('should throw if n is < 1.', function() { 26 | assert.throws(function() { 27 | skipgrams(1, -1, [1, 2, 3]); 28 | }, Error); 29 | }); 30 | 31 | it('should properly compute skipgrams.', function() { 32 | const twoSkipBigrams = skipgrams(2, 2, WORDS); 33 | 34 | assert.deepEqual(twoSkipBigrams, [ 35 | ['Insurgents', 'killed'], 36 | ['Insurgents', 'in'], 37 | ['Insurgents', 'ongoing'], 38 | ['killed', 'in'], 39 | ['killed', 'ongoing'], 40 | ['killed', 'fighting'], 41 | ['in', 'ongoing'], 42 | ['in', 'fighting'], 43 | ['ongoing', 'fighting'] 44 | ]); 45 | 46 | const twoSkipTrigrams = skipgrams(2, 3, WORDS); 47 | 48 | assert.deepEqual(twoSkipTrigrams, [ 49 | ['Insurgents', 'killed', 'in'], 50 | ['Insurgents', 'killed', 'ongoing'], 51 | ['Insurgents', 'killed', 'fighting'], 52 | ['Insurgents', 'in', 'ongoing'], 53 | ['Insurgents', 'in', 'fighting'], 54 | ['Insurgents', 'ongoing', 'fighting'], 55 | ['killed', 'in', 'ongoing'], 56 | ['killed', 'in', 'fighting'], 57 | ['killed', 'ongoing', 'fighting'], 58 | ['in', 'ongoing', 'fighting'] 59 | ]); 60 | }); 61 | 62 | it('should also work with strings.', function() { 63 | const grams = skipgrams(1, 
2, 'abcd'); 64 | 65 | assert.deepEqual(grams, ['ab', 'ac', 'bc', 'bd', 'cd']); 66 | }); 67 | }); 68 | -------------------------------------------------------------------------------- /test/tokenizers/syllables/sonoripy.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman tokenizers/syllables/sonoripy tests 3 | * ============================================= 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import defaultTokenizer, { 8 | createTokenizer, 9 | merge 10 | } from '../../../src/tokenizers/syllables/sonoripy'; 11 | 12 | const VOWELS_REGEX = /[aeiouy]/; 13 | 14 | const PHONOGRAM_HIERARCHY = [ 15 | 'aeéɛøoɔiuʌyãẽõ', 16 | 'jwɥh', 17 | 'rl', 18 | 'mn', 19 | 'zvðʒ', 20 | 'sfθʃ', 21 | 'bdg', 22 | 'ptkx' 23 | ]; 24 | 25 | describe('sonoripy', function() { 26 | it('should merge tokens correctly.', function() { 27 | assert.deepEqual(merge(VOWELS_REGEX, ['his', 'to', 'ry']), ['his', 'to', 'ry']); 28 | assert.deepEqual(merge(VOWELS_REGEX, ['grr', 'ad', 'ual']), ['grrad', 'ual']); 29 | assert.deepEqual(merge(VOWELS_REGEX, ['pro', 'grr', 'am']), ['progrr', 'am']); 30 | }); 31 | 32 | it('should tokenize words correctly.', function() { 33 | const tests = [ 34 | ['history', ['his', 'to', 'ry']], 35 | ['History', ['His', 'to', 'ry']], 36 | ['justification', ['jus', 'ti', 'fi', 'ca', 'tion']], 37 | ['channel', ['chan', 'nel']], 38 | ['chapel', ['cha', 'pel']], 39 | ['unconstitutional', ['un', 'cons', 'ti', 'tu', 'tio', 'nal']] 40 | ]; 41 | 42 | tests.forEach(function([word, syllables]) { 43 | assert.deepEqual(defaultTokenizer(word), syllables); 44 | }); 45 | }); 46 | 47 | it('should be possible to create a custom tokenizer.', function() { 48 | const phonogramTokenizer = createTokenizer({ 49 | hierarchy: PHONOGRAM_HIERARCHY 50 | }); 51 | 52 | // NOTE: see if breaking on diphtongue 53 | 54 | const tests = [ 55 | ['ametist', ['a', 'me', 'tist']], 56 | // ['aɛd', ['a', 'ɛd']], 57 | ['arl', ['arl']], 58 | ['arlezjɛn', 
['ar', 'le', 'zjɛn']], 59 | ['ãbjãs', ['ã', 'bjãs']], 60 | // ['diazot', ['di', 'a', 'zot']], 61 | ['djazot', ['dja', 'zot']], 62 | ['ãtikõstitysjõ', ['ã', 'ti', 'kõs', 'ti', 'ty', 'sjõ']], 63 | ['pubel', ['pu', 'bel']], 64 | ['tjar', ['tjar']], 65 | // ['frao', ['fra', 'o']], 66 | ['fraw', ['fraw']], 67 | ['illegal', ['il', 'le', 'gal']], 68 | ['imminã', ['im', 'mi', 'nã']] 69 | ]; 70 | 71 | tests.forEach(function([word, syllables]) { 72 | assert.deepEqual(phonogramTokenizer(word), syllables); 73 | }); 74 | }); 75 | }); 76 | -------------------------------------------------------------------------------- /test/tokenizers/tweets/casual.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman tokenizers/tweets/casual tests 3 | * ======================================== 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import tokenizer from '../../../src/tokenizers/tweets/casual'; 8 | 9 | describe('casual', function() { 10 | it('should properly tokenize tweets.', function() { 11 | const tests = [ 12 | { 13 | tweet: 'This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--', 14 | tokens: ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--'] 15 | }, 16 | { 17 | tweet: '@remy: This is waaaaayyyy too much for you!!!!!!', 18 | tokens: ['@remy', ':', 'This', 'is', 'waaaaayyyy', 'too', 'much', 'for', 'you', '!', '!', '!'] 19 | }, 20 | // { 21 | // tweet: '@myke: Let\'s test these words: resumé España München français', 22 | // tokens: ['@myke', ':', 'Let\'s', 'test', 'these', 'words', ':', 'résumé', 'España', 'München', 'français'] 23 | // }, 24 | { 25 | tweet: 'What is this ugly © entity?', 26 | tokens: ['What', 'is', 'this', 'ugly', '©', 'entity', '?'] 27 | }, 28 | { 29 | tweet: 'Oh!, A url: https://google.com', 30 | tokens: ['Oh', '!', ',', 'A', 'url', ':', 'https://google.com'] 31 | } 32 | ]; 33 | 34 | tests.forEach(function({tweet, tokens}) { 
35 | assert.deepEqual(tokenizer(tweet), tokens); 36 | }); 37 | }); 38 | }); 39 | -------------------------------------------------------------------------------- /test/tokenizers/words/treebank.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Talisman tokenizers/words/treebank tests 3 | * ========================================= 4 | * 5 | */ 6 | import assert from 'assert'; 7 | import words from '../../../src/tokenizers/words/treebank'; 8 | 9 | describe('treebank', function() { 10 | it('should correctly tokenize words.', function() { 11 | const tests = [ 12 | { 13 | text: 'Good muffins cost $3.88\nin New York. Please buy me\ntwo of them.\nThanks.', 14 | tokens: ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.'] 15 | }, 16 | { 17 | text: 'They\'ll save and invest more.', 18 | tokens: ['They', '\'ll', 'save', 'and', 'invest', 'more', '.'] 19 | }, 20 | { 21 | text: 'hi, my name can\'t hello,', 22 | tokens: ['hi', ',', 'my', 'name', 'ca', "n't", 'hello', ','] 23 | }, 24 | { 25 | text: 'O.N.U.', 26 | tokens: ['O.N.U', '.'] 27 | }, 28 | { 29 | text: '"Hello", Good sir (this is appaling)...', 30 | tokens: ['``', 'Hello', '\'\'', ',', 'Good', 'sir', '(', 'this', 'is', 'appaling', ')', '...'] 31 | } 32 | ]; 33 | 34 | tests.forEach(function({text, tokens}) { 35 | assert.deepEqual(words(text), tokens); 36 | }); 37 | }); 38 | }); 39 | --------------------------------------------------------------------------------