├── .gitmodules ├── typedoc.json ├── docs └── assets │ └── images │ ├── icons.png │ ├── widgets.png │ ├── icons@2x.png │ └── widgets@2x.png ├── ts ├── nlp │ ├── index.ts │ └── tokenizer.ts ├── distance │ └── index.ts ├── matchers │ ├── index.ts │ └── matcherconfig.ts └── index.ts ├── src ├── cs │ ├── PhoneticMatchingPerfTests │ │ ├── Properties │ │ │ └── launchSettings.json │ │ ├── Transcription.cs │ │ ├── TestElement.cs │ │ ├── TestQuery.cs │ │ ├── PhoneticMatchingPerfTests.csproj │ │ ├── Settings.StyleCop │ │ └── Program.cs │ ├── Microsoft.PhoneticMatching │ │ ├── Microsoft.PhoneticMatching.csproj │ │ ├── Nlp │ │ │ ├── Tokenizer │ │ │ │ ├── ITokenizer.cs │ │ │ │ ├── WhitespaceTokenizer.cs │ │ │ │ ├── Token.cs │ │ │ │ ├── Interval.cs │ │ │ │ └── SplittingTokenizer.cs │ │ │ └── Preprocessor │ │ │ │ ├── IPreProcessor.cs │ │ │ │ ├── CaseFoldingPreProcessor.cs │ │ │ │ ├── UnicodePreProcessor.cs │ │ │ │ ├── WhiteSpacePreProcessor.cs │ │ │ │ ├── ChainedRuleBasedPreProcessor.cs │ │ │ │ ├── EnPreProcessor.cs │ │ │ │ └── EnPlacesPreProcessor.cs │ │ ├── ManagedCallback.cs │ │ ├── Matchers │ │ │ ├── ContactMatcher │ │ │ │ ├── ContactFields.cs │ │ │ │ └── ContactMatcherConfig.cs │ │ │ ├── PlaceMatcher │ │ │ │ ├── PlaceFields.cs │ │ │ │ └── PlaceMatcherConfig.cs │ │ │ ├── Target.cs │ │ │ ├── FuzzyMatcher │ │ │ │ ├── IFuzzyMatcher.cs │ │ │ │ ├── Normalized │ │ │ │ │ ├── StringFuzzyMatcher.cs │ │ │ │ │ ├── EnPhoneticFuzzyMatcher.cs │ │ │ │ │ └── EnHybridFuzzyMatcher.cs │ │ │ │ ├── FuzzyMatcher.cs │ │ │ │ └── FuzzyMatcherBase.cs │ │ │ ├── MatcherConfig.cs │ │ │ └── BaseMatcher.cs │ │ ├── Distance │ │ │ ├── IDistance.cs │ │ │ ├── DistanceInput.cs │ │ │ ├── StringDistance.cs │ │ │ └── EnPhoneticDistance.cs │ │ ├── Settings.StyleCop │ │ ├── Match.cs │ │ └── EnPronouncer.cs │ ├── PhoneticMatchingTests │ │ ├── PhoneticMatchingTests.csproj │ │ ├── EnPronouncerTests.cs │ │ ├── Distance │ │ │ ├── BaseDistanceTester.cs │ │ │ ├── StringDistanceTests.cs │ │ │ ├── EnPhoneticDistanceTests.cs 
│ │ │ └── EnHybridDistanceTests.cs │ │ ├── Settings.StyleCop │ │ ├── NativeResourceWrapperTests.cs │ │ ├── Nlp │ │ │ ├── TokenizerTests.cs │ │ │ └── PreprocessorTests.cs │ │ ├── Matchers │ │ │ ├── BaseContactMatcherTester.cs │ │ │ └── ContactMatcherTests.cs │ │ └── EnPronunciationTests.cs │ ├── nuget │ │ ├── build │ │ │ └── Microsoft.PhoneticMatching.targets │ │ └── Microsoft.PhoneticMatching.nuspec │ └── PhoneticMatching.sln └── maluuba │ ├── speech │ ├── phoneticdistance │ │ └── phoneticdistance.cpp │ ├── nodejs │ │ ├── performance.hpp │ │ ├── phone.hpp │ │ ├── match.hpp │ │ ├── enpronouncer.hpp │ │ ├── stringdistance.hpp │ │ ├── enhybriddistance.hpp │ │ ├── enphoneticdistance.hpp │ │ ├── main.cpp │ │ ├── enpronunciation.hpp │ │ ├── performance │ │ │ └── performance.cpp │ │ ├── match │ │ │ └── match.cpp │ │ ├── stringdistance │ │ │ └── stringdistance.cpp │ │ ├── enpronouncer │ │ │ └── enpronouncer.cpp │ │ └── enphoneticdistance │ │ │ └── enphoneticdistance.cpp │ ├── pronouncer.hpp │ ├── csharp │ │ └── csharp.hpp │ ├── pronunciation │ │ ├── pronunciation.cpp │ │ ├── arpabet.cpp │ │ ├── phone.cpp │ │ └── impl.hpp │ ├── hybriddistance.hpp │ ├── pronouncer │ │ └── pronouncer.cpp │ └── phoneticdistance.hpp │ ├── xtd │ ├── optional.hpp │ └── string_view.hpp │ ├── metric.hpp │ ├── debug.hpp │ ├── unicode.hpp │ ├── unicode │ └── unicode.cpp │ └── levenshtein.hpp ├── jestConfig.json ├── tests ├── enpronouncer.test.ts ├── matchers │ ├── testsets │ │ ├── soundex.testset.spec.ts │ │ └── soundex.ts │ ├── contactmatcher.test.ts │ └── placematcher.test.ts ├── nlp │ ├── tokenizer.test.ts │ └── preprocessor.test.ts ├── distance │ ├── stringdistance.test.ts │ ├── enphoneticdistance.test.ts │ └── enhybriddistance.test.ts └── enpronunciation.test.ts ├── LICENSE ├── .gitignore ├── package.json ├── .gitattributes └── SECURITY.md /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "flite"] 2 | path = src/flite 3 | url = 
https://github.com/festvox/flite.git 4 | -------------------------------------------------------------------------------- /typedoc.json: -------------------------------------------------------------------------------- 1 | { 2 | "out": "docs", 3 | "excludePrivate": true, 4 | "gitRevision": "master" 5 | } 6 | -------------------------------------------------------------------------------- /docs/assets/images/icons.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PhoneticMatching/HEAD/docs/assets/images/icons.png -------------------------------------------------------------------------------- /docs/assets/images/widgets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PhoneticMatching/HEAD/docs/assets/images/widgets.png -------------------------------------------------------------------------------- /docs/assets/images/icons@2x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PhoneticMatching/HEAD/docs/assets/images/icons@2x.png -------------------------------------------------------------------------------- /docs/assets/images/widgets@2x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/PhoneticMatching/HEAD/docs/assets/images/widgets@2x.png -------------------------------------------------------------------------------- /ts/nlp/index.ts: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | export * from "./preprocessor"; 5 | export * from "./tokenizer"; 6 | -------------------------------------------------------------------------------- /ts/distance/index.ts: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | export {StringDistance, EnPhoneticDistance, EnHybridDistance, DistanceInput} from "../maluuba"; 5 | -------------------------------------------------------------------------------- /src/cs/PhoneticMatchingPerfTests/Properties/launchSettings.json: -------------------------------------------------------------------------------- 1 | { 2 | "profiles": { 3 | "PhoneticMatchingPerfTests": { 4 | "commandName": "Project", 5 | "commandLineArgs": "contact 120000 accuracy" 6 | } 7 | } 8 | } -------------------------------------------------------------------------------- /ts/matchers/index.ts: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | export {FuzzyMatcher, AcceleratedFuzzyMatcher} from "../maluuba"; 5 | export * from "./contactmatcher"; 6 | export * from "./placematcher"; 7 | export * from "./matcherconfig"; -------------------------------------------------------------------------------- /src/cs/Microsoft.PhoneticMatching/Microsoft.PhoneticMatching.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | netstandard2.1 5 | x64 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /jestConfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "globals": { 3 | "ts-jest": { 4 | "tsconfig": "ts/tsconfig.json", 5 | "diagnostics": false 6 | } 7 | }, 8 | "testRegex": "(/__tests__/.*|(\\.|/)(test|spec))\\.(jsx?|tsx?)$", 9 | "testPathIgnorePatterns": [ 10 | "/build/", 11 | "/docs/", 12 | "/lib/", 13 | "/src/", 14 | "/node_modules/" 15 | ], 16 | "preset": "ts-jest/presets/js-with-ts", 17 | "testMatch": null 18 | } -------------------------------------------------------------------------------- /ts/index.ts: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | export {EnPronouncer, EnPronunciation, Speech} from "./maluuba"; 5 | 6 | /** 7 | * Bubble-up re-exports of __nlp__ module for convenience. 8 | */ 9 | export * from "./nlp"; 10 | 11 | /** 12 | * Bubble-up re-exports of __matchers__ module for convenience. 13 | */ 14 | export * from "./matchers"; 15 | 16 | /** 17 | * Bubble-up re-exports of __distance__ module for convenience. 
18 | */ 19 | export * from "./distance"; 20 | -------------------------------------------------------------------------------- /src/maluuba/speech/phoneticdistance/phoneticdistance.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include "maluuba/speech/phoneticdistance.hpp" 5 | 6 | namespace maluuba 7 | { 8 | namespace speech 9 | { 10 | PhoneticDistance::~PhoneticDistance() = default; 11 | 12 | EnPhoneticDistance::~EnPhoneticDistance() = default; 13 | 14 | double 15 | EnPhoneticDistance::operator()(const EnPronunciation& a, const EnPronunciation& b) const 16 | { 17 | return PhoneticDistance::operator()(phonetic_embedding(a), phonetic_embedding(b)); 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/cs/PhoneticMatchingPerfTests/Transcription.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | namespace PhoneticMatchingPerfTests 5 | { 6 | public class Transcription 7 | { 8 | /// 9 | /// Gets or sets A label to track what made this transcription. 10 | /// 11 | public string Source { get; set; } 12 | 13 | /// 14 | /// Gets or sets What was actually heard/spoken (possible ASR/STT errors). 15 | /// 16 | public string Utterance { get; set; } 17 | } 18 | } -------------------------------------------------------------------------------- /src/cs/PhoneticMatchingPerfTests/TestElement.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | namespace PhoneticMatchingPerfTests 5 | { 6 | internal class TestElement 7 | { 8 | /// 9 | /// Gets or sets A unique ID to refer back to this element. 
10 | /// 11 | public T Element { get; set; } 12 | 13 | /// 14 | /// Gets or sets Test queries with the intent targeting this element in some way. 15 | /// 16 | public TestQuery[] Queries { get; set; } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/cs/PhoneticMatchingPerfTests/TestQuery.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | namespace PhoneticMatchingPerfTests 5 | { 6 | public class TestQuery 7 | { 8 | /// 9 | /// Gets or sets What the intention is. What should be heard or what was read. 10 | /// 11 | public string Query { get; set; } 12 | 13 | /// 14 | /// Gets or sets The records for this test query. What was actually heard or what was written. 15 | /// 16 | public Transcription[] Transcriptions { get; set; } 17 | } 18 | } -------------------------------------------------------------------------------- /src/cs/Microsoft.PhoneticMatching/Nlp/Tokenizer/ITokenizer.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | namespace Microsoft.PhoneticMatching.Nlp.Tokenizer 5 | { 6 | using System.Collections.Generic; 7 | 8 | /// 9 | /// Tokenizer interface for strings. 10 | /// 11 | public interface ITokenizer 12 | { 13 | /// 14 | /// Tokenize the query. 15 | /// 16 | /// Query to tokenize. 17 | /// Collection of tokens. 18 | IList Tokenize(string query); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/cs/Microsoft.PhoneticMatching/Nlp/Tokenizer/WhitespaceTokenizer.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | namespace Microsoft.PhoneticMatching.Nlp.Tokenizer 5 | { 6 | using System.Text.RegularExpressions; 7 | 8 | /// 9 | /// Tokenizer that splits on whitespace. 10 | /// 11 | public class WhitespaceTokenizer : SplittingTokenizer 12 | { 13 | /// 14 | /// Initializes a new instance of the class. 15 | /// 16 | public WhitespaceTokenizer() : base(new Regex(@"\s+")) 17 | { 18 | } 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /tests/enpronouncer.test.ts: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | import {EnPronouncer} from "../ts"; 5 | 6 | test("English pronouncer.", () => { 7 | const pronouncer = new EnPronouncer(); 8 | expect(pronouncer.pronounce("This, is a test.").ipa).toBe("ðɪsɪzətɛst"); 9 | }); 10 | 11 | test("ctor used as function exception.", () => { 12 | expect(() => { 13 | const pronouncer = (EnPronouncer as any)(); 14 | }).toThrow(); 15 | }); 16 | 17 | test("Pronouncing undefined exception.", () => { 18 | expect(() => { 19 | const pronouncer = new EnPronouncer(); 20 | pronouncer.pronounce(undefined as any); 21 | }).toThrow(); 22 | }); 23 | -------------------------------------------------------------------------------- /src/cs/Microsoft.PhoneticMatching/ManagedCallback.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | namespace Microsoft.PhoneticMatching 5 | { 6 | using System; 7 | 8 | /// 9 | /// Used to keep track of the last exception that occurred during a managed callback was invoked from native code. Otherwise, native code swallows the exception. 
10 | /// 11 | internal static class ManagedCallback 12 | { 13 | /// 14 | /// Gets or sets the last exception that occurred while a managed callback was invoked from native code. 15 | /// 16 | public static Exception LastError { get; set; } 17 | } 18 | } -------------------------------------------------------------------------------- /src/cs/Microsoft.PhoneticMatching/Nlp/Preprocessor/IPreProcessor.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | namespace Microsoft.PhoneticMatching.Nlp.Preprocessor 5 | { 6 | /// 7 | /// A Pre-processor interface. To transform a string before any classification or understanding is known about it. 8 | /// 9 | public interface IPreProcessor 10 | { 11 | /// 12 | /// Function to perform the pre-processing. 13 | /// 14 | /// The string to pre-process. 15 | /// The pre-processed string. 16 | string PreProcess(string query); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /tests/matchers/testsets/soundex.testset.spec.ts: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | import Soundex from "./soundex"; 5 | 6 | test("Soundex.", () => { 7 | expect(Soundex.encode("")).toBe(""); 8 | expect(Soundex.encode(" ")).toBe(""); 9 | 10 | expect(Soundex.encode("Robert")).toBe("R163"); 11 | expect(Soundex.encode("Rupert")).toBe("R163"); 12 | expect(Soundex.encode("Rubin")).toBe("R150"); 13 | expect(Soundex.encode("Ashcraft")).toBe("A261"); 14 | expect(Soundex.encode("Ashcroft")).toBe("A261"); 15 | expect(Soundex.encode("Tymczak")).toBe("T522"); 16 | expect(Soundex.encode("Pfister")).toBe("P236"); 17 | expect(Soundex.encode("Honeyman")).toBe("H555"); 18 | 19 | expect(Soundex.encode("Robert Robert")).toBe("R163 R163"); 20 | }); 21 | -------------------------------------------------------------------------------- /src/cs/Microsoft.PhoneticMatching/Matchers/ContactMatcher/ContactFields.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | namespace Microsoft.PhoneticMatching.Matchers.ContactMatcher 5 | { 6 | using System.Collections.Generic; 7 | 8 | /// 9 | /// Fields made available from the user defined Contact object for pronunciation and distance functions. 10 | /// 11 | public class ContactFields 12 | { 13 | /// 14 | /// Gets or sets the name of the contact. 15 | /// 16 | public string Name { get; set; } 17 | 18 | /// 19 | /// Gets or sets the aliases the contact also goes by. 20 | /// 21 | public IList Aliases { get; set; } 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/cs/Microsoft.PhoneticMatching/Distance/IDistance.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | namespace Microsoft.PhoneticMatching.Distance 5 | { 6 | /// 7 | /// Distance interface. 
Distance objects are used to compute the distance between two objects. 8 | /// 9 | /// Type of elements between which we compute distance. 10 | public interface IDistance 11 | { 12 | /// 13 | /// Computes the distance between first and second. 14 | /// 15 | /// First element. 16 | /// Second element. 17 | /// The distance between first and second. 18 | double Distance(T first, T second); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/maluuba/speech/nodejs/performance.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file 3 | * Performance utility to make trace events. 4 | * 5 | * Copyright (c) Microsoft Corporation. All rights reserved. 6 | * Licensed under the MIT License. 7 | */ 8 | 9 | #ifndef MALUUBA_SPEECH_NODEJS_PERFORMANCE_HPP 10 | #define MALUUBA_SPEECH_NODEJS_PERFORMANCE_HPP 11 | 12 | #include 13 | #include 14 | 15 | namespace maluuba 16 | { 17 | namespace speech 18 | { 19 | namespace nodejs 20 | { 21 | class Performance 22 | { 23 | public: 24 | static void Init(v8::Local module); 25 | 26 | static void Mark(const std::string& name); 27 | static void Measure(const std::string& name, const std::string& start_mark, const std::string& end_mark); 28 | 29 | private: 30 | static v8::Persistent s_performance; 31 | }; 32 | } 33 | } 34 | } 35 | 36 | #endif // MALUUBA_SPEECH_NODEJS_PERFORMANCE_HPP 37 | -------------------------------------------------------------------------------- /src/cs/PhoneticMatchingTests/PhoneticMatchingTests.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | netcoreapp3.1 5 | 6 | false 7 | 8 | AnyCPU;x64 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /src/maluuba/xtd/optional.hpp: -------------------------------------------------------------------------------- 1 | // 
Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #ifndef MALUUBA_XTD_OPTIONAL_HPP 5 | #define MALUUBA_XTD_OPTIONAL_HPP 6 | 7 | #if __cplusplus >= 201703L && __has_include() 8 | 9 | #include 10 | 11 | namespace maluuba 12 | { 13 | namespace xtd 14 | { 15 | using std::optional; 16 | using std::bad_optional_access; 17 | using std::nullopt_t; 18 | using std::nullopt; 19 | using std::make_optional; 20 | } 21 | } 22 | 23 | #else 24 | 25 | #include 26 | 27 | namespace maluuba 28 | { 29 | namespace xtd 30 | { 31 | using std::experimental::optional; 32 | using std::experimental::bad_optional_access; 33 | using std::experimental::nullopt_t; 34 | using std::experimental::nullopt; 35 | using std::experimental::make_optional; 36 | } 37 | } 38 | 39 | #endif // __cplusplus 40 | 41 | #endif // MALUUBA_XTD_OPTIONAL_HPP 42 | -------------------------------------------------------------------------------- /src/maluuba/xtd/string_view.hpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | #ifndef MALUUBA_XTD_STRING_VIEW_HPP 5 | #define MALUUBA_XTD_STRING_VIEW_HPP 6 | 7 | #if __cplusplus >= 201703L 8 | 9 | #include 10 | 11 | namespace maluuba 12 | { 13 | namespace xtd 14 | { 15 | using std::basic_string_view; 16 | using std::string_view; 17 | using std::wstring_view; 18 | using std::u16string_view; 19 | using std::u32string_view; 20 | } 21 | } 22 | 23 | #else 24 | 25 | #include 26 | 27 | namespace maluuba 28 | { 29 | namespace xtd 30 | { 31 | using std::experimental::basic_string_view; 32 | using std::experimental::string_view; 33 | using std::experimental::wstring_view; 34 | using std::experimental::u16string_view; 35 | using std::experimental::u32string_view; 36 | } 37 | } 38 | 39 | #endif // __cplusplus 40 | 41 | #endif // MALUUBA_XTD_STRING_VIEW_HPP 42 | -------------------------------------------------------------------------------- /src/cs/Microsoft.PhoneticMatching/Matchers/PlaceMatcher/PlaceFields.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | namespace Microsoft.PhoneticMatching.Matchers.PlaceMatcher 5 | { 6 | using System.Collections.Generic; 7 | 8 | /// 9 | /// Fields made available from the user defined Place object for pronunciation and distance functions. 10 | /// 11 | public class PlaceFields 12 | { 13 | /// 14 | /// Gets or sets the name of the place. 15 | /// 16 | public string Name { get; set; } 17 | 18 | /// 19 | /// Gets or sets The address of the place. 20 | /// 21 | public string Address { get; set; } 22 | 23 | /// 24 | /// Gets or sets The tags/categories defining the place. 
25 | /// 26 | public IList Types { get; set; } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/cs/Microsoft.PhoneticMatching/Nlp/Preprocessor/CaseFoldingPreProcessor.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | namespace Microsoft.PhoneticMatching.Nlp.Preprocessor 5 | { 6 | using System; 7 | 8 | /// 9 | /// Pre-Processor to perform the pre-processing with case. 10 | /// 11 | internal class CaseFoldingPreProcessor : IPreProcessor 12 | { 13 | /// 14 | /// Function to perform the pre-processing with case. 15 | /// 16 | /// The string to pre-process. 17 | /// The pre-processed string. 18 | public string PreProcess(string query) 19 | { 20 | if (query == null) 21 | { 22 | throw new ArgumentNullException("query can't be null"); 23 | } 24 | 25 | return query.ToLowerInvariant(); 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/cs/nuget/build/Microsoft.PhoneticMatching.targets: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | linux-x64 6 | osx-x64 7 | win-x64 8 | 9 | 10 | 11 | 12 | PreserveNewest 13 | maluubaspeech-csharp.dll 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /src/cs/Microsoft.PhoneticMatching/Nlp/Preprocessor/UnicodePreProcessor.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | namespace Microsoft.PhoneticMatching.Nlp.Preprocessor 5 | { 6 | using System; 7 | using System.Text; 8 | 9 | /// 10 | /// Unicode pre-processor 11 | /// 12 | internal class UnicodePreProcessor : IPreProcessor 13 | { 14 | /// 15 | /// Function to perform the pre-processing with Unicode normalization form. 16 | /// 17 | /// The string to pre-process. 18 | /// The pre-processed string. 19 | public string PreProcess(string query) 20 | { 21 | if (query == null) 22 | { 23 | throw new ArgumentNullException("query can't be null"); 24 | } 25 | 26 | return query.Normalize(NormalizationForm.FormKC); 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/cs/nuget/Microsoft.PhoneticMatching.nuspec: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Microsoft.PhoneticMatching 5 | 0.0.7 6 | Microsoft 7 | Microsoft 8 | https://opensource.org/licenses/MIT 9 | https://github.com/Microsoft/PhoneticMatching 10 | http://go.microsoft.com/fwlink/?LinkID=288890 11 | false 12 | PhoneticMatching C# project. 13 | Initial version. 14 | © Microsoft Corporation. All rights reserved. 15 | phone ipa match query target pronunciation hybrid fuzzy matcher arpabet distance pronouncer syllable 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /src/maluuba/speech/nodejs/phone.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file 3 | * Phones wrapped in NodeJS. 4 | * 5 | * Copyright (c) Microsoft Corporation. All rights reserved. 6 | * Licensed under the MIT License. 
7 | */ 8 | 9 | #ifndef MALUUBA_SPEECH_NODEJS_PHONE_HPP 10 | #define MALUUBA_SPEECH_NODEJS_PHONE_HPP 11 | 12 | #include "maluuba/speech/pronunciation.hpp" 13 | #include 14 | #include 15 | 16 | namespace maluuba 17 | { 18 | namespace speech 19 | { 20 | namespace nodejs 21 | { 22 | class Phone: public node::ObjectWrap 23 | { 24 | public: 25 | static void Init(v8::Local exports); 26 | static v8::Local constructor(v8::Isolate* isolate); 27 | 28 | Phone(speech::Phone phone); 29 | const speech::Phone& phone() const; 30 | 31 | private: 32 | static void New(const v8::FunctionCallbackInfo& args); 33 | static v8::Persistent s_constructor; 34 | speech::Phone m_phone; 35 | }; 36 | } 37 | } 38 | } 39 | 40 | #endif // MALUUBA_SPEECH_NODEJS_PHONE_HPP 41 | -------------------------------------------------------------------------------- /src/cs/PhoneticMatchingPerfTests/PhoneticMatchingPerfTests.csproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Exe 5 | netcoreapp3.1 6 | x64 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /src/cs/PhoneticMatchingTests/EnPronouncerTests.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | namespace PhoneticMatchingTests 5 | { 6 | using System; 7 | using Microsoft.PhoneticMatching; 8 | using Microsoft.VisualStudio.TestTools.UnitTesting; 9 | 10 | [TestClass] 11 | public class EnPronouncerTests 12 | { 13 | private EnPronouncer pronouncer = EnPronouncer.Instance; 14 | 15 | [TestMethod] 16 | public void GivenPronunciation_ExpectPositiveMatch() 17 | { 18 | var pronunciation = this.pronouncer.Pronounce("This, is a test."); 19 | Assert.AreEqual("ðɪsɪzətɛst", pronunciation.Ipa); 20 | } 21 | 22 | [TestMethod] 23 | public void GivenNullArgument_ExpectException() 24 | { 25 | Assert.ThrowsException(() => 26 | { 27 | var pronunciation = this.pronouncer.Pronounce(null); 28 | }); 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/cs/Microsoft.PhoneticMatching/Settings.StyleCop: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | arpabet 5 | fənɛtɪk 6 | ipa 7 | rhotic 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | False 16 | 17 | 18 | 19 | 20 | False 21 | 22 | 23 | 24 | 25 | True 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /src/cs/Microsoft.PhoneticMatching/Nlp/Preprocessor/WhiteSpacePreProcessor.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | namespace Microsoft.PhoneticMatching.Nlp.Preprocessor 5 | { 6 | using System; 7 | using System.Text.RegularExpressions; 8 | 9 | /// 10 | /// Pre-processor that removes consecutive and trailing white spaces. 11 | /// 12 | internal class WhiteSpacePreProcessor : IPreProcessor 13 | { 14 | private readonly Regex pattern = new Regex(@"\s{2,}"); 15 | 16 | /// 17 | /// Function to preform the pre-processing. 18 | /// 19 | /// The string to pre-process. 20 | /// The pre-processed string. 
21 | public string PreProcess(string query) 22 | { 23 | if (query == null) 24 | { 25 | throw new ArgumentNullException("query can't be null"); 26 | } 27 | 28 | return this.pattern.Replace(query.Trim(), " "); 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /tests/nlp/tokenizer.test.ts: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | import { WhitespaceTokenizer, Token } from "../../ts/nlp"; 5 | 6 | function values(tokens: Token[]): string[] { 7 | return tokens.map((token) => token.value); 8 | } 9 | 10 | describe("WhiteSpaceTokenizer", () => { 11 | const tokenizer = new WhitespaceTokenizer(); 12 | 13 | test("empty string", () => { 14 | expect(values(tokenizer.tokenize(""))).toEqual([]); 15 | }); 16 | 17 | test("no whitespace", () => { 18 | expect(values(tokenizer.tokenize("example"))).toEqual(["example"]); 19 | }); 20 | 21 | test("Not ending with spaces", () => { 22 | expect(values(tokenizer.tokenize(" There are some words, here! #blessed"))) 23 | .toEqual(["There", "are", "some", "words,", "here!", "#blessed"]); 24 | }); 25 | 26 | test("Ends with spaces", () => { 27 | expect(values(tokenizer.tokenize(" There are some words, here! #blessed "))) 28 | .toEqual(["There", "are", "some", "words,", "here!", "#blessed"]); 29 | }); 30 | }); 31 | -------------------------------------------------------------------------------- /src/cs/Microsoft.PhoneticMatching/Nlp/Tokenizer/Token.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | namespace Microsoft.PhoneticMatching.Nlp.Tokenizer 5 | { 6 | /// 7 | /// The substring token of the original string with its interval location. 
8 | /// 9 | public class Token 10 | { 11 | /// 12 | /// Initializes a new instance of the class. 13 | /// 14 | /// Value of the token. 15 | /// Interval of the value. 16 | public Token(string value, Interval interval) 17 | { 18 | this.Value = value; 19 | this.Interval = interval; 20 | } 21 | 22 | /// 23 | /// Gets the value of the token. 24 | /// 25 | public string Value { get; private set; } 26 | 27 | /// 28 | /// Gets the interval of the token. 29 | /// 30 | public Interval Interval { get; private set; } 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/maluuba/metric.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file 3 | * Distance metrics. 4 | * 5 | * @author Tavian Barnes (tavian.barnes@microsoft.com) 6 | * 7 | * Copyright (c) Microsoft Corporation. All rights reserved. 8 | * Licensed under the MIT License. 9 | */ 10 | 11 | #ifndef MALUUBA_METRIC_HPP 12 | #define MALUUBA_METRIC_HPP 13 | 14 | #include 15 | #include 16 | 17 | namespace maluuba 18 | { 19 | /** 20 | * Infers the result type of a distance metric. 21 | */ 22 | template 23 | using MetricResult = std::result_of_t; 24 | 25 | /** 26 | * Equality distance metric. 27 | * 28 | * @author Tavian Barnes (tavian.barnes@microsoft.com) 29 | */ 30 | class EqualityMetric 31 | { 32 | public: 33 | /** 34 | * Compute the distance between @p t and @p u. 35 | * 36 | * @return 0 if t == u, 1 otherwise. 37 | */ 38 | template 39 | int 40 | operator()(const T& t, const U& u) const 41 | { 42 | return t == u ? 0 : 1; 43 | } 44 | }; 45 | } 46 | 47 | #endif // MALUUBA_METRIC_HPP 48 | -------------------------------------------------------------------------------- /tests/distance/stringdistance.test.ts: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | import {StringDistance} from "../../ts/distance" 5 | 6 | test("String distance equality.", () => { 7 | const dist = new StringDistance(); 8 | expect(dist.distance("This, is a test.", "This, is a test.")).toBe(0); 9 | }); 10 | 11 | test("String distance.", () => { 12 | const dist = new StringDistance(); 13 | 14 | expect(dist.distance("aaa", "bbb")).toBe(3); 15 | expect(dist.distance("aaa", "aaa")).toBe(0); 16 | expect(dist.distance("aaa", "aba")).toBe(1); 17 | expect(dist.distance("", "")).toBe(0); 18 | expect(dist.distance("", "aaa")).toBe(3); 19 | expect(dist.distance("aaa", "")).toBe(3); 20 | }); 21 | 22 | test("ctor used as function exception.", () => { 23 | expect(() => { 24 | const distance = (StringDistance as any)(); 25 | }).toThrow(); 26 | }); 27 | 28 | test("Distance on undefined exception.", () => { 29 | expect(() => { 30 | const dist = new StringDistance(); 31 | dist.distance(undefined, undefined); 32 | }).toThrow(); 33 | }); 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) Microsoft Corporation. All rights reserved. 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/cs/Microsoft.PhoneticMatching/Match.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | namespace Microsoft.PhoneticMatching 5 | { 6 | /// 7 | /// A matched element with its distance score. 8 | /// 9 | /// The element type. 10 | public class Match 11 | { 12 | /// 13 | /// Initializes a new instance of the class. 14 | /// 15 | /// the element wrapped 16 | /// the distance with query target 17 | public Match(T element, double distance) 18 | { 19 | this.Element = element; 20 | this.Distance = distance; 21 | } 22 | 23 | /// 24 | /// Gets the element. 25 | /// 26 | public T Element { get; private set; } 27 | 28 | /// 29 | /// Gets the distance with the target matched. 30 | /// 31 | public double Distance { get; private set; } 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/maluuba/speech/nodejs/match.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file 3 | * Match wrapped in NodeJS. 4 | * 5 | * Copyright (c) Microsoft Corporation. All rights reserved. 6 | * Licensed under the MIT License. 
7 | */ 8 | 9 | #ifndef MALUUBA_SPEECH_NODEJS_MATCH_HPP 10 | #define MALUUBA_SPEECH_NODEJS_MATCH_HPP 11 | 12 | #include "maluuba/speech/fuzzymatcher.hpp" 13 | #include 14 | #include 15 | 16 | namespace maluuba 17 | { 18 | namespace speech 19 | { 20 | namespace nodejs 21 | { 22 | class Match: public node::ObjectWrap 23 | { 24 | using NodeJsTarget = v8::UniquePersistent; 25 | using MatchType = speech::FuzzyMatcher::Match; 26 | 27 | public: 28 | static void Init(v8::Local exports); 29 | static v8::Local constructor(v8::Isolate* isolate); 30 | 31 | Match(MatchType match); 32 | const MatchType& match() const; 33 | 34 | private: 35 | static void New(const v8::FunctionCallbackInfo& args); 36 | static v8::Persistent s_constructor; 37 | MatchType m_match; 38 | }; 39 | } 40 | } 41 | } 42 | 43 | #endif // MALUUBA_SPEECH_NODEJS_MATCH_HPP 44 | -------------------------------------------------------------------------------- /src/maluuba/speech/nodejs/enpronouncer.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file 3 | * English Pronouncer wrapped in NodeJS. 4 | * 5 | * Copyright (c) Microsoft Corporation. All rights reserved. 6 | * Licensed under the MIT License. 
7 | */ 8 | 9 | #ifndef MALUUBA_SPEECH_NODEJS_ENPRONOUNCER_HPP 10 | #define MALUUBA_SPEECH_NODEJS_ENPRONOUNCER_HPP 11 | 12 | #include "maluuba/speech/pronouncer.hpp" 13 | #include 14 | #include 15 | 16 | namespace maluuba 17 | { 18 | namespace speech 19 | { 20 | namespace nodejs 21 | { 22 | class EnPronouncer: public node::ObjectWrap 23 | { 24 | public: 25 | static void Init(v8::Local exports); 26 | 27 | const speech::EnPronouncer& pronouncer() const; 28 | 29 | private: 30 | explicit EnPronouncer(speech::EnPronouncer pronouncer); 31 | ~EnPronouncer(); 32 | 33 | static void New(const v8::FunctionCallbackInfo& args); 34 | static void Pronounce(const v8::FunctionCallbackInfo& args); 35 | static v8::Persistent s_constructor; 36 | speech::EnPronouncer m_pronouncer; 37 | }; 38 | } 39 | } 40 | } 41 | 42 | #endif // MALUUBA_SPEECH_NODEJS_ENPRONOUNCER_HPP 43 | -------------------------------------------------------------------------------- /src/maluuba/speech/pronouncer.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file 3 | * Pronouncer. 4 | * 5 | * @author Benedicte Pierrejean 6 | * @author Tavian Barnes (tavian.barnes@microsoft.com) 7 | * 8 | * Copyright (c) Microsoft Corporation. All rights reserved. 9 | * Licensed under the MIT License. 
10 | */ 11 | 12 | #ifndef MALUUBA_SPEECH_PRONOUNCER_HPP 13 | #define MALUUBA_SPEECH_PRONOUNCER_HPP 14 | 15 | #include "maluuba/speech/pronunciation.hpp" 16 | #include 17 | #include 18 | 19 | namespace maluuba 20 | { 21 | namespace speech 22 | { 23 | class Pronouncer 24 | { 25 | public: 26 | Pronouncer() = default; 27 | virtual ~Pronouncer() = 0; 28 | 29 | Pronouncer(Pronouncer&& other) = default; 30 | Pronouncer& operator=(Pronouncer&& other) = default; 31 | }; 32 | 33 | class EnPronouncer: public Pronouncer 34 | { 35 | public: 36 | EnPronouncer(); 37 | virtual ~EnPronouncer(); 38 | 39 | EnPronouncer(EnPronouncer&& other); 40 | EnPronouncer& operator=(EnPronouncer&& other); 41 | 42 | EnPronunciation pronounce(const std::string& text) const; 43 | 44 | private: 45 | struct Impl; 46 | std::unique_ptr m_impl; 47 | }; 48 | } 49 | } 50 | 51 | #endif // MALUUBA_SPEECH_PRONOUNCER_HPP 52 | -------------------------------------------------------------------------------- /src/cs/PhoneticMatchingTests/Distance/BaseDistanceTester.cs: -------------------------------------------------------------------------------- 1 | //----------------------------------------------------------------------- 2 | // 3 | // Copyright (c) Microsoft Corporation. All rights reserved. 4 | // Licensed under the MIT License. 
5 | // 6 | //----------------------------------------------------------------------- 7 | 8 | namespace PhoneticMatchingTests.Distance 9 | { 10 | using System; 11 | using Microsoft.VisualStudio.TestTools.UnitTesting; 12 | using Microsoft.PhoneticMatching.Distance; 13 | 14 | public abstract class BaseDistanceTester where T : class 15 | { 16 | public BaseDistanceTester() 17 | { 18 | this.Distance = this.CreateDistanceOperator(); 19 | } 20 | 21 | protected IDistance Distance { get; private set; } 22 | 23 | [TestMethod] 24 | public void GivenNull_ExpectException() 25 | { 26 | Assert.ThrowsException(() => 27 | { 28 | var dist = this.Distance.Distance(null, null); 29 | }); 30 | } 31 | 32 | protected abstract IDistance CreateDistanceOperator(); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/maluuba/speech/nodejs/stringdistance.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file 3 | * String Distance wrapped in NodeJS. 4 | * 5 | * Copyright (c) Microsoft Corporation. All rights reserved. 6 | * Licensed under the MIT License. 
7 | */ 8 | 9 | #ifndef MALUUBA_SPEECH_NODEJS_STRINGDISTANCE_HPP 10 | #define MALUUBA_SPEECH_NODEJS_STRINGDISTANCE_HPP 11 | 12 | #include "maluuba/levenshtein.hpp" 13 | #include 14 | #include 15 | 16 | namespace maluuba 17 | { 18 | namespace speech 19 | { 20 | namespace nodejs 21 | { 22 | class StringDistance: public node::ObjectWrap 23 | { 24 | public: 25 | static void Init(v8::Local exports); 26 | static v8::Local type(v8::Isolate* isolate); 27 | 28 | const LevenshteinDistance<>& distance() const; 29 | 30 | private: 31 | explicit StringDistance(LevenshteinDistance<> distance); 32 | ~StringDistance(); 33 | 34 | static void New(const v8::FunctionCallbackInfo& args); 35 | static void Distance(const v8::FunctionCallbackInfo& args); 36 | static v8::Persistent s_constructor; 37 | static v8::Persistent s_type; 38 | LevenshteinDistance<> m_distance; 39 | }; 40 | } 41 | } 42 | } 43 | 44 | #endif // MALUUBA_SPEECH_NODEJS_STRINGDISTANCE_HPP 45 | -------------------------------------------------------------------------------- /src/maluuba/speech/csharp/csharp.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file 3 | * Macro to export symbols. 4 | * 5 | * Copyright (c) Microsoft Corporation. All rights reserved. 6 | * Licensed under the MIT License. 7 | */ 8 | 9 | #ifndef MALUUBA_SPEECH_CSHARP_CSHARP_HPP 10 | #define MALUUBA_SPEECH_CSHARP_CSHARP_HPP 11 | 12 | 13 | #if defined _WIN32 || defined __CYGWIN__ 14 | #define STDCALL __stdcall 15 | #if 1 //def BUILDING_DLL 16 | #ifdef __GNUC__ 17 | #define DLL_PUBLIC __attribute__ ((dllexport)) 18 | #else 19 | #define DLL_PUBLIC __declspec(dllexport) // Note: gcc also appears to support this syntax. 20 | #endif 21 | #else 22 | #ifdef __GNUC__ 23 | #define DLL_PUBLIC __attribute__ ((dllimport)) 24 | #else 25 | #define DLL_PUBLIC __declspec(dllimport) // Note: gcc also appears to support this syntax. 
26 | #endif 27 | #endif // BUILDING_DLL 28 | #define DLL_LOCAL 29 | #else 30 | #define STDCALL 31 | #if __GNUC__ >= 4 32 | #define DLL_PUBLIC __attribute__ ((visibility ("default"))) 33 | #define DLL_LOCAL __attribute__ ((visibility ("hidden"))) 34 | #else 35 | #define DLL_PUBLIC 36 | #define DLL_LOCAL 37 | #endif // __GNUC__ >= 4 38 | #endif // defined _WIN32 || defined __CYGWIN__ 39 | 40 | #endif // MALUUBA_SPEECH_CSHARP_CSHARP_HPP -------------------------------------------------------------------------------- /src/cs/Microsoft.PhoneticMatching/Distance/DistanceInput.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | namespace Microsoft.PhoneticMatching.Distance 5 | { 6 | using System; 7 | 8 | /// 9 | /// Input object for . Hold the text and the pronunciation of that text. 10 | /// 11 | public class DistanceInput 12 | { 13 | /// 14 | /// Initializes a new instance of the class. 15 | /// 16 | /// the text to compute distance on 17 | /// the pronunciation to compute distance on 18 | public DistanceInput(string phrase, EnPronunciation pronunciation) 19 | { 20 | this.Phrase = phrase; 21 | this.Pronunciation = pronunciation; 22 | } 23 | 24 | /// 25 | /// Gets the text to compute distance on. 26 | /// 27 | public string Phrase { get; private set; } 28 | 29 | /// 30 | /// Gets the pronunciation to compute distance on. 31 | /// 32 | public EnPronunciation Pronunciation { get; private set; } 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/maluuba/speech/nodejs/enhybriddistance.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file 3 | * English Phonetic + String Distance wrapped in NodeJS. 4 | * 5 | * Copyright (c) Microsoft Corporation. All rights reserved. 6 | * Licensed under the MIT License. 
7 | */ 8 | 9 | #ifndef MALUUBA_SPEECH_NODEJS_ENHYBRIDDISTANCE_HPP 10 | #define MALUUBA_SPEECH_NODEJS_ENHYBRIDDISTANCE_HPP 11 | 12 | #include "maluuba/speech/hybriddistance.hpp" 13 | #include 14 | #include 15 | 16 | namespace maluuba 17 | { 18 | namespace speech 19 | { 20 | namespace nodejs 21 | { 22 | class EnHybridDistance: public node::ObjectWrap 23 | { 24 | 25 | public: 26 | static void Init(v8::Local exports); 27 | static v8::Local type(v8::Isolate* isolate); 28 | 29 | const speech::HybridDistance<>& distance() const; 30 | 31 | private: 32 | explicit EnHybridDistance(speech::HybridDistance<> distance); 33 | ~EnHybridDistance(); 34 | 35 | static void New(const v8::FunctionCallbackInfo& args); 36 | static void Distance(const v8::FunctionCallbackInfo& args); 37 | static v8::Persistent s_constructor; 38 | static v8::Persistent s_type; 39 | speech::HybridDistance<> m_distance; 40 | }; 41 | } 42 | } 43 | } 44 | 45 | #endif // MALUUBA_SPEECH_NODEJS_ENHYBRIDDISTANCE_HPP 46 | -------------------------------------------------------------------------------- /src/maluuba/speech/nodejs/enphoneticdistance.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file 3 | * English Phonetic Distance wrapped in NodeJS. 4 | * 5 | * Copyright (c) Microsoft Corporation. All rights reserved. 6 | * Licensed under the MIT License. 
7 | */ 8 | 9 | #ifndef MALUUBA_SPEECH_NODEJS_ENPHONETICDISTANCE_HPP 10 | #define MALUUBA_SPEECH_NODEJS_ENPHONETICDISTANCE_HPP 11 | 12 | #include "maluuba/speech/phoneticdistance.hpp" 13 | #include 14 | #include 15 | 16 | namespace maluuba 17 | { 18 | namespace speech 19 | { 20 | namespace nodejs 21 | { 22 | class EnPhoneticDistance: public node::ObjectWrap 23 | { 24 | public: 25 | static void Init(v8::Local exports); 26 | static v8::Local type(v8::Isolate* isolate); 27 | 28 | const speech::EnPhoneticDistance& distance() const; 29 | 30 | private: 31 | explicit EnPhoneticDistance(speech::EnPhoneticDistance distance); 32 | ~EnPhoneticDistance(); 33 | 34 | static void New(const v8::FunctionCallbackInfo& args); 35 | static void Distance(const v8::FunctionCallbackInfo& args); 36 | static v8::Persistent s_constructor; 37 | static v8::Persistent s_type; 38 | speech::EnPhoneticDistance m_distance; 39 | }; 40 | } 41 | } 42 | } 43 | 44 | #endif // MALUUBA_SPEECH_NODEJS_ENPHONETICDISTANCE_HPP 45 | -------------------------------------------------------------------------------- /src/cs/Microsoft.PhoneticMatching/Matchers/Target.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | namespace Microsoft.PhoneticMatching.Matchers 5 | { 6 | /// 7 | /// Target of a matcher 8 | /// 9 | /// Type of the target 10 | public class Target 11 | { 12 | /// 13 | /// Initializes a new instance of the class. 14 | /// 15 | /// Target value 16 | /// Target phrase 17 | /// Target identifier 18 | public Target(T value, string phrase, int id) 19 | { 20 | this.Value = value; 21 | this.Phrase = phrase; 22 | this.Id = id; 23 | } 24 | 25 | /// 26 | /// Gets the Target element value. 27 | /// 28 | public T Value { get; private set; } 29 | 30 | /// 31 | /// Gets the Target element phrase. 
32 | /// 33 | public string Phrase { get; private set; } 34 | 35 | /// 36 | /// Gets the Target element identifier. 37 | /// 38 | public int Id { get; private set; } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/cs/Microsoft.PhoneticMatching/Nlp/Tokenizer/Interval.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | namespace Microsoft.PhoneticMatching.Nlp.Tokenizer 5 | { 6 | /// 7 | /// An Interval holds the first and last index bounds. 8 | /// 9 | public class Interval 10 | { 11 | /// 12 | /// Initializes a new instance of the class. 13 | /// 14 | /// Starting index (inclusive). 15 | /// Ending index (exclusive). 16 | public Interval(int first, int last) 17 | { 18 | this.First = first; 19 | this.Last = last; 20 | } 21 | 22 | /// 23 | /// Gets the Starting index (inclusive). 24 | /// 25 | public int First { get; private set; } 26 | 27 | /// 28 | /// Gets the Ending index (exclusive). 29 | /// 30 | public int Last { get; private set; } 31 | 32 | /// 33 | /// Gets the length of the token. 
34 | /// 35 | public int Length 36 | { 37 | get 38 | { 39 | return this.Last - this.First; 40 | } 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Typescript output 2 | /lib 3 | 4 | # node-gyp build 5 | /build 6 | /bindings 7 | 8 | # gdb outputs 9 | .gdb_history 10 | 11 | # vs files 12 | .vs/ 13 | .vscode/ 14 | 15 | # Logs 16 | logs 17 | *.log 18 | npm-debug.log* 19 | yarn-debug.log* 20 | yarn-error.log* 21 | 22 | # Runtime data 23 | pids 24 | *.pid 25 | *.seed 26 | *.pid.lock 27 | 28 | # Directory for instrumented libs generated by jscoverage/JSCover 29 | lib-cov 30 | 31 | # Coverage directory used by tools like istanbul 32 | coverage 33 | 34 | # nyc test coverage 35 | .nyc_output 36 | 37 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 38 | .grunt 39 | 40 | # Bower dependency directory (https://bower.io/) 41 | bower_components 42 | 43 | # node-waf configuration 44 | .lock-wscript 45 | 46 | # Compiled binary addons (http://nodejs.org/api/addons.html) 47 | build/Release 48 | 49 | # Dependency directories 50 | node_modules/ 51 | jspm_packages/ 52 | 53 | # Typescript v1 declaration files 54 | typings/ 55 | 56 | # Optional npm cache directory 57 | .npm 58 | 59 | # Optional eslint cache 60 | .eslintcache 61 | 62 | # Optional REPL history 63 | .node_repl_history 64 | 65 | # Output of 'npm pack' 66 | *.tgz 67 | 68 | # Yarn Integrity file 69 | .yarn-integrity 70 | 71 | # dotenv environment variables file 72 | .env 73 | 74 | # .net 75 | bin/ 76 | obj/ 77 | packages/ 78 | *.Cache 79 | *.nupkg 80 | *.csproj.user 81 | -------------------------------------------------------------------------------- /src/maluuba/speech/nodejs/main.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 
2 | // Licensed under the MIT License. 3 | 4 | #include "maluuba/speech/nodejs/enhybriddistance.hpp" 5 | #include "maluuba/speech/nodejs/enphoneticdistance.hpp" 6 | #include "maluuba/speech/nodejs/enpronouncer.hpp" 7 | #include "maluuba/speech/nodejs/enpronunciation.hpp" 8 | #include "maluuba/speech/nodejs/fuzzymatcher.hpp" 9 | #include "maluuba/speech/nodejs/match.hpp" 10 | // #include "maluuba/speech/nodejs/performance.hpp" 11 | #include "maluuba/speech/nodejs/phone.hpp" 12 | #include "maluuba/speech/nodejs/stringdistance.hpp" 13 | #include 14 | 15 | namespace maluuba 16 | { 17 | namespace speech 18 | { 19 | namespace nodejs 20 | { 21 | namespace 22 | { 23 | void 24 | Init(v8::Local exports, v8::Local module) 25 | { 26 | // Performance::Init(module); 27 | EnHybridDistance::Init(exports); 28 | EnPhoneticDistance::Init(exports); 29 | FuzzyMatcher::Init(exports, "FuzzyMatcher"); 30 | FuzzyMatcher::Init(exports, "AcceleratedFuzzyMatcher"); 31 | EnPronouncer::Init(exports); 32 | EnPronunciation::Init(exports); 33 | Match::Init(exports); 34 | Phone::Init(exports); 35 | StringDistance::Init(exports); 36 | } 37 | } 38 | 39 | NODE_MODULE(NODE_GYP_MODULE_NAME, Init) 40 | } 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/maluuba/speech/nodejs/enpronunciation.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file 3 | * English Pronunciation wrapped in NodeJS. 4 | * 5 | * Copyright (c) Microsoft Corporation. All rights reserved. 6 | * Licensed under the MIT License. 
7 | */ 8 | 9 | #ifndef MALUUBA_SPEECH_NODEJS_ENPRONUNCIATION_HPP 10 | #define MALUUBA_SPEECH_NODEJS_ENPRONUNCIATION_HPP 11 | 12 | #include "maluuba/speech/pronunciation.hpp" 13 | #include 14 | #include 15 | 16 | namespace maluuba 17 | { 18 | namespace speech 19 | { 20 | namespace nodejs 21 | { 22 | class EnPronunciation: public node::ObjectWrap 23 | { 24 | public: 25 | static void Init(v8::Local exports); 26 | static v8::Local constructor(v8::Isolate* isolate); 27 | static v8::Local type(v8::Isolate* isolate); 28 | 29 | EnPronunciation(speech::EnPronunciation pronunciation); 30 | const speech::EnPronunciation& pronunciation() const; 31 | 32 | private: 33 | static void New(const v8::FunctionCallbackInfo& args); 34 | static void FromIpa(const v8::FunctionCallbackInfo& args); 35 | static void FromArpabet(const v8::FunctionCallbackInfo& args); 36 | static v8::Persistent s_constructor; 37 | static v8::Persistent s_type; 38 | speech::EnPronunciation m_pronunciation; 39 | }; 40 | } 41 | } 42 | } 43 | 44 | #endif // MALUUBA_SPEECH_NODEJS_ENPRONUNCIATION_HPP 45 | -------------------------------------------------------------------------------- /tests/nlp/preprocessor.test.ts: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | import { EnPreProcessor } from "../../ts/nlp"; 5 | 6 | describe("EnPreProcessor", () => { 7 | const processor = new EnPreProcessor(); 8 | 9 | test("Hi", () => { 10 | // "Híffi" 11 | // í has a combining acute accent, ffi is a ligature 12 | expect(processor.preProcess("Hi\u0301\uFB03")).toBe("h\u00EDffi"); 13 | }); 14 | 15 | test("Digits", () => { 16 | expect(processor.preProcess("123 King St")).toBe("123 king st"); 17 | expect(processor.preProcess("2 Wildwood Place")).toBe("2 wildwood place"); 18 | }); 19 | 20 | test("Punctuation", () => { 21 | expect(processor.preProcess("!omg! 
ch!ll ?how?")).toBe("omg ch ll how"); 22 | }); 23 | 24 | test("Apostrophe and case", () => { 25 | expect(processor.preProcess("Justin's haus")).toBe("justin s haus"); 26 | }); 27 | 28 | test("simple tokenization", () => { 29 | expect(processor.preProcess("call mom")).toBe("call mom"); 30 | expect(processor.preProcess("call MoM!")).toBe("call mom"); 31 | expect(processor.preProcess("*(*&call, MoM! )_+")).toBe("call mom"); 32 | expect(processor.preProcess(":call/mom")).toBe("call mom"); 33 | expect(processor.preProcess("Call mom.")).toBe("call mom"); 34 | expect(processor.preProcess("Call mom .")).toBe("call mom"); 35 | expect(processor.preProcess("Call mom .")).toBe("call mom"); 36 | }); 37 | }); 38 | -------------------------------------------------------------------------------- /src/maluuba/debug.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file 3 | * Debugging utilities. 4 | * 5 | * @author Tavian Barnes (tavian.barnes@microsoft.com) 6 | * 7 | * Copyright (c) Microsoft Corporation. All rights reserved. 8 | * Licensed under the MIT License. 9 | */ 10 | 11 | #ifndef MALUUBA_DEBUG_HPP 12 | #define MALUUBA_DEBUG_HPP 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | namespace maluuba 20 | { 21 | /** 22 | * Check that a condition is true, and throw an exception if not. 23 | * 24 | * @tparam E The type of exception to throw. 25 | * @param condition The condition that must hold. 26 | * @param args Arguments to pass to the exception constructor. 27 | * @throws E If !condition. 28 | */ 29 | template 30 | void 31 | check(Cond&& condition, Args&&... args) 32 | { 33 | if (!static_cast(condition)) { 34 | throw E(std::forward(args)...); 35 | } 36 | } 37 | 38 | /** 39 | * Check that a condition is true, and throw @c std::logic_error if not. 40 | * 41 | * @param condition The condition that must hold. 42 | * @param message The exception message for failures. 43 | * @throws std::logic_error If !condition. 
44 | */ 45 | template 46 | void 47 | check_logic(Cond&& condition, const char* message) 48 | { 49 | check(std::forward(condition), message); 50 | } 51 | } 52 | 53 | #endif // MALUUBA_DEBUG_HPP 54 | -------------------------------------------------------------------------------- /src/cs/PhoneticMatchingTests/Distance/StringDistanceTests.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | namespace PhoneticMatchingTests.Distance 5 | { 6 | using System; 7 | using Microsoft.VisualStudio.TestTools.UnitTesting; 8 | using Microsoft.PhoneticMatching; 9 | using Microsoft.PhoneticMatching.Distance; 10 | 11 | [TestClass] 12 | public class StringDistanceTests : BaseDistanceTester 13 | { 14 | [TestMethod] 15 | public void GivenExactString_ExpectZeroDistance() 16 | { 17 | Assert.AreEqual(0, this.Distance.Distance("This, is a test.", "This, is a test.")); 18 | } 19 | 20 | [TestMethod] 21 | public void GivenKnownDistances_ExpectPositiveMatches() 22 | { 23 | const string Aaa = "aaa"; 24 | const string Bbb = "bbb"; 25 | const string Aba = "aba"; 26 | 27 | Assert.AreEqual(3, this.Distance.Distance(Aaa, Bbb)); 28 | Assert.AreEqual(0, this.Distance.Distance(Aaa, Aaa)); 29 | Assert.AreEqual(1, this.Distance.Distance(Aaa, Aba)); 30 | Assert.AreEqual(0, this.Distance.Distance(string.Empty, string.Empty)); 31 | Assert.AreEqual(3, this.Distance.Distance(string.Empty, Aaa)); 32 | Assert.AreEqual(3, this.Distance.Distance(Aaa, string.Empty)); 33 | } 34 | 35 | protected override IDistance CreateDistanceOperator() 36 | { 37 | return new StringDistance(); 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/cs/Microsoft.PhoneticMatching/Matchers/ContactMatcher/ContactMatcherConfig.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft 
Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | namespace Microsoft.PhoneticMatching.Matchers.ContactMatcher 5 | { 6 | /// 7 | /// Configurations to tweak the accuracy of the contact matcher. 8 | /// 9 | public class ContactMatcherConfig : MatcherConfig 10 | { 11 | /// 12 | /// Initializes a new instance of the class. 13 | /// 14 | /// Weighting trade-off between the phonetic distance and the lexical distance scores. 15 | /// Maximum number of contacts the matcher can return 16 | /// Maximum distance to a match. Normalized to 0 for exact match, 1 for nothing matches 17 | /// Candidate cutoff given by Math.max({best matched distance} * bestDistanceMultiplier, maxDistanceMarginReturns) 18 | /// best distance multiplier 19 | public ContactMatcherConfig( 20 | double phoneticWeightPercentage = 0.7, 21 | int maxReturns = 4, 22 | double findThreshold = 0.35, 23 | double maxDistanceMarginReturns = 0.02, 24 | double bestDistanceMultiplier = 1.1) 25 | : base(phoneticWeightPercentage, maxReturns, findThreshold, maxDistanceMarginReturns, bestDistanceMultiplier) 26 | { 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/cs/Microsoft.PhoneticMatching/Nlp/Preprocessor/ChainedRuleBasedPreProcessor.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | namespace Microsoft.PhoneticMatching.Nlp.Preprocessor 5 | { 6 | using System; 7 | using System.Collections.Generic; 8 | using System.Text.RegularExpressions; 9 | 10 | /// 11 | /// Pre-processes by applying a list of rules sequentially. First rules added are applied first. 12 | /// 13 | public class ChainedRuleBasedPreProcessor : IPreProcessor 14 | { 15 | private readonly List> rules = new List>(); 16 | 17 | /// 18 | /// Function to perform the pre-processing. 19 | /// 20 | /// The string to pre-process. 
21 | /// The pre-processed string. 22 | public string PreProcess(string query) 23 | { 24 | string result = query; 25 | foreach (var rule in this.rules) 26 | { 27 | result = rule.Item1.Replace(result, rule.Item2); 28 | } 29 | 30 | return result; 31 | } 32 | 33 | /// 34 | /// Add a replacement rule 35 | /// 36 | /// Pattern to replace 37 | /// String to replace with. 38 | public void AddRule(Regex pattern, string replacement) 39 | { 40 | this.rules.Add(new Tuple(pattern, replacement)); 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/cs/Microsoft.PhoneticMatching/Matchers/PlaceMatcher/PlaceMatcherConfig.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | namespace Microsoft.PhoneticMatching.Matchers.PlaceMatcher 5 | { 6 | using System; 7 | 8 | /// 9 | /// Configurations to tweak the accuracy of the place matcher. 10 | /// 11 | public class PlaceMatcherConfig : MatcherConfig 12 | { 13 | /// 14 | /// Initializes a new instance of the class. 15 | /// 16 | /// Weighting trade-off between the phonetic distance and the lexical distance scores. 17 | /// Maximum number of places the matcher can return 18 | /// Maximum distance to a match. 
Normalized to 0 for exact match, 1 for nothing matches 19 | /// Candidate cutoff given by Math.max({best matched distance} * bestDistanceMultiplier, maxDistanceMarginReturns) 20 | /// best distance multiplier 21 | public PlaceMatcherConfig( 22 | double phoneticWeightPercentage = 0.7, 23 | int maxReturns = 8, 24 | double findThreshold = 0.35, 25 | double maxDistanceMarginReturns = 0.02, 26 | double bestDistanceMultiplier = 1.1) 27 | : base(phoneticWeightPercentage, maxReturns, findThreshold, maxDistanceMarginReturns, bestDistanceMultiplier) 28 | { 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/cs/PhoneticMatchingTests/Settings.StyleCop: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | rhotic 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | False 13 | 14 | 15 | 16 | 17 | False 18 | 19 | 20 | 21 | 22 | False 23 | 24 | 25 | 26 | 27 | False 28 | 29 | 30 | 31 | 32 | True 33 | 34 | 35 | 36 | 37 | 38 | 39 | False 40 | 41 | 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /src/cs/PhoneticMatchingPerfTests/Settings.StyleCop: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | rhotic 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | False 13 | 14 | 15 | 16 | 17 | False 18 | 19 | 20 | 21 | 22 | False 23 | 24 | 25 | 26 | 27 | False 28 | 29 | 30 | 31 | 32 | True 33 | 34 | 35 | 36 | 37 | 38 | 39 | False 40 | 41 | 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /src/maluuba/unicode.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file 3 | * Utilities for working with Unicode text. 4 | * 5 | * @author Tavian Barnes (tavian.barnes@microsoft.com) 6 | * 7 | * Copyright (c) Microsoft Corporation. All rights reserved. 8 | * Licensed under the MIT License. 
9 | */ 10 | 11 | #ifndef MALUUBA_UNICODE_HPP 12 | #define MALUUBA_UNICODE_HPP 13 | 14 | #include "maluuba/xtd/string_view.hpp" 15 | #include 16 | 17 | namespace maluuba 18 | { 19 | 20 | template 21 | String unicode_cast(const xtd::string_view utf8); 22 | 23 | template 24 | String unicode_cast(const xtd::u16string_view utf16); 25 | 26 | /** 27 | * No-op conversion for UTF-8. 28 | * 29 | * @param utf8 A UTF-8 encoded string. 30 | * @return The equivalent UTF-8 encoded string. 31 | */ 32 | template <> 33 | std::string unicode_cast(const xtd::string_view utf8); 34 | 35 | /** 36 | * Convert the given UTF-16 encoded string to UTF-8. 37 | * 38 | * @param utf16 A UTF-16 encoded string. 39 | * @return The equivalent UTF-8 encoded string. 40 | */ 41 | template <> 42 | std::string unicode_cast(const xtd::u16string_view utf16); 43 | 44 | /** 45 | * Convert the given UTF-8 encoded string to UTF-16. 46 | * 47 | * @param utf8 A UTF-8 encoded string. 48 | * @return The equivalent UTF-16 encoded string. 49 | */ 50 | template <> 51 | std::u16string unicode_cast(const xtd::string_view utf8); 52 | 53 | /** 54 | * No-op conversion for UTF-16. 55 | * 56 | * @param utf16 A UTF-16 encoded string. 57 | * @return The equivalent UTF-16 encoded string. 58 | */ 59 | template <> 60 | std::u16string unicode_cast(const xtd::u16string_view utf16); 61 | } 62 | 63 | #endif // MALUUBA_UNICODE_HPP 64 | -------------------------------------------------------------------------------- /src/maluuba/speech/pronunciation/pronunciation.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | #include "maluuba/speech/pronunciation.hpp" 5 | #include "maluuba/unicode.hpp" 6 | #include 7 | 8 | namespace maluuba 9 | { 10 | namespace speech 11 | { 12 | Pronunciation::Pronunciation() = default; 13 | 14 | Pronunciation::~Pronunciation() = default; 15 | 16 | Pronunciation::iterator 17 | Pronunciation::begin() const 18 | { 19 | return m_phones.begin(); 20 | } 21 | 22 | Pronunciation::iterator 23 | Pronunciation::end() const 24 | { 25 | return m_phones.end(); 26 | } 27 | 28 | bool 29 | Pronunciation::empty() const 30 | { 31 | return m_phones.empty(); 32 | } 33 | 34 | Pronunciation::size_type 35 | Pronunciation::size() const 36 | { 37 | return m_phones.size(); 38 | } 39 | 40 | std::string 41 | Pronunciation::to_ipa() const 42 | { 43 | return unicode_cast(m_ipa); 44 | } 45 | 46 | EnPronunciation::~EnPronunciation() = default; 47 | 48 | EnPronunciation::EnPronunciation(const EnPronunciation& other) = default; 49 | 50 | EnPronunciation::EnPronunciation(EnPronunciation&& other) = default; 51 | 52 | EnPronunciation& 53 | EnPronunciation::operator=(const EnPronunciation& other) = default; 54 | 55 | EnPronunciation& 56 | EnPronunciation::operator=(EnPronunciation&& other) = default; 57 | 58 | EnPronunciation::EnPronunciation() 59 | : Pronunciation{} 60 | { } 61 | 62 | EnPronunciation::EnPronunciation(std::u16string ipa) 63 | : Pronunciation{ipa} 64 | { } 65 | 66 | std::string 67 | to_string(const Pronunciation& pron) 68 | { 69 | return pron.to_ipa(); 70 | } 71 | 72 | std::ostream& 73 | operator<<(std::ostream& stream, const Pronunciation& pron) 74 | { 75 | return stream << to_string(pron); 76 | } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/maluuba/unicode/unicode.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | #include "maluuba/unicode.hpp" 5 | #include 6 | #include 7 | 8 | namespace maluuba 9 | { 10 | template <> 11 | std::string 12 | unicode_cast(const xtd::string_view utf8) 13 | { 14 | return std::string{utf8}; 15 | } 16 | 17 | // https://stackoverflow.com/a/35103224 18 | #if MALUUBA_CODECVT_BUG 19 | template <> 20 | std::string 21 | unicode_cast(const xtd::u16string_view utf16) 22 | { 23 | std::wstring_convert, int16_t> convertor; 24 | auto p = reinterpret_cast(utf16.data()); 25 | return convertor.to_bytes(p, p + utf16.size()); 26 | } 27 | 28 | template <> 29 | std::u16string 30 | unicode_cast(const xtd::string_view utf8) 31 | { 32 | std::wstring_convert, int16_t> convertor; 33 | auto w = convertor.from_bytes(utf8.data(), utf8.data() + utf8.size()); 34 | return {reinterpret_cast(w.data()), w.size()}; 35 | } 36 | #else 37 | template <> 38 | std::string 39 | unicode_cast(const xtd::u16string_view utf16) 40 | { 41 | std::wstring_convert, char16_t> convertor; 42 | return convertor.to_bytes(utf16.data(), utf16.data() + utf16.size()); 43 | } 44 | 45 | template <> 46 | std::u16string 47 | unicode_cast(const xtd::string_view utf8) 48 | { 49 | std::wstring_convert, char16_t> convertor; 50 | return convertor.from_bytes(utf8.data(), utf8.data() + utf8.size()); 51 | } 52 | #endif // MALUUBA_CODECVT_BUG 53 | 54 | template <> 55 | std::u16string 56 | unicode_cast(const xtd::u16string_view utf16) 57 | { 58 | return std::u16string{utf16}; 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "phoneticmatching", 3 | "version": "0.3.8", 4 | "description": "A text utility to do string comparisons at a phonetic level.", 5 | "main": "./lib/index.js", 6 | "types": "./lib/index.d.ts", 7 | "scripts": { 8 | "install": "node-pre-gyp install --fallback-to-build", 9 | "prepare": "npm run tsc", 10 | "package": "node-pre-gyp 
package", 11 | "test": "jest --config jestConfig.json test[.]", 12 | "test:debug": "npm test -- --verbose -i", 13 | "test:testsets": "jest --config jestConfig.json testset.spec", 14 | "build-docs": "typedoc --options typedoc.json --tsconfig ts/tsconfig.json ts/", 15 | "tsc": "tsc --project ts/", 16 | "watch": "npm run tsc -- --watch", 17 | "build": "node-pre-gyp build --build-from-source", 18 | "build:debug": "npm run build -- --debug", 19 | "rebuild": "node-pre-gyp rebuild --build-from-source", 20 | "rebuild:debug": "npm run rebuild -- --debug", 21 | "release": "npm run rebuild && npm run tsc && npm run build-docs" 22 | }, 23 | "homepage": "https://microsoft.github.io/PhoneticMatching/", 24 | "repository": { 25 | "type": "git", 26 | "url": "https://github.com/Microsoft/PhoneticMatching" 27 | }, 28 | "author": "madixon@microsoft.com", 29 | "license": "MIT", 30 | "engines": { 31 | "node": ">=8.11.2" 32 | }, 33 | "devDependencies": { 34 | "@types/jest": "^25.2.3", 35 | "@types/node": "^11.15.54", 36 | "@types/xregexp": "^3.0.30", 37 | "jest": "^25.5.4", 38 | "ts-jest": "^25.5.1", 39 | "typedoc": "^0.20.37", 40 | "typescript": "^3.9.10" 41 | }, 42 | "dependencies": { 43 | "@mapbox/node-pre-gyp": "^1.0.10", 44 | "xregexp": "^4.4.1" 45 | }, 46 | "files": [ 47 | "binding.gyp", 48 | "lib/", 49 | "src/", 50 | "!src/cs/" 51 | ], 52 | "binary": { 53 | "module_name": "maluubaspeech", 54 | "module_path": "./bindings/lib/", 55 | "package_name": "{module_name}-{node_abi}-{platform}-{arch}.tar.gz", 56 | "remote_path": "{version}", 57 | "host": "https://github.com/Microsoft/PhoneticMatching/releases/download/" 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /ts/matchers/matcherconfig.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * @file 3 | * Matcher config. 4 | * 5 | * Copyright (c) Microsoft Corporation. All rights reserved. 6 | * Licensed under the MIT License. 
/**
 * Configurations to tweak the accuracy of a matcher.
 *
 * @export
 * @class MatcherConfig
 */
export class MatcherConfig {
    public readonly phoneticWeightPercentage: number;
    public maxReturns: number;
    public findThreshold: number;
    public maxDistanceMarginReturns: number;
    public bestDistanceMultiplier: number;

    /**
     * Creates an instance of MatcherConfig.
     *
     * @param {number} phoneticWeightPercentage Between 0 and 1. Weighting trade-off between the phonetic
     *      distance and the lexical distance scores. 1 meaning 100% phonetic score and 0% lexical score.
     * @param {number} maxReturns The maximum number of places the matcher can return.
     * @param {number} findThreshold The maximum distance to a match. Normalized to 0 for exact match,
     *      1 for nothing matches. Can be >1 if the lengths do not match.
     * @param {number} maxDistanceMarginReturns Candidate cutoff given by
     *      Math.max({best matched distance} * bestDistanceMultiplier, maxDistanceMarginReturns).
     * @param {number} bestDistanceMultiplier The multiplier applied to the best matched distance in the
     *      cutoff formula above.
     * @throws {TypeError} If phoneticWeightPercentage is not a number within [0, 1].
     * @memberof MatcherConfig
     */
    constructor(
        phoneticWeightPercentage: number,
        maxReturns: number,
        findThreshold: number,
        maxDistanceMarginReturns: number,
        bestDistanceMultiplier: number) {
        // Validate before assigning anything. The check is a type test plus a
        // negated in-range test on purpose: `x < 0 || x > 1` is false for NaN and
        // undefined, and null coerces to 0, so all three used to be accepted.
        if (typeof phoneticWeightPercentage !== "number"
            || !(phoneticWeightPercentage >= 0 && phoneticWeightPercentage <= 1)) {
            throw new TypeError("require 0 <= phoneticWeightPercentage <= 1");
        }

        this.phoneticWeightPercentage = phoneticWeightPercentage;
        this.maxReturns = maxReturns;
        this.findThreshold = findThreshold;
        this.maxDistanceMarginReturns = maxDistanceMarginReturns;
        this.bestDistanceMultiplier = bestDistanceMultiplier;
    }
}
namespace PhoneticMatchingTests
{
    using System;
    using System.Runtime.InteropServices;
    using System.Text;
    using Microsoft.VisualStudio.TestTools.UnitTesting;
    using Microsoft.PhoneticMatching;

    /// <summary>
    /// Tests the error-message buffer handshake of the native interop layer.
    /// </summary>
    [TestClass]
    public class NativeResourceWrapperTests
    {
        /// <summary>
        /// A native call given a too-small message buffer must report that and the size it needs.
        /// </summary>
        [TestMethod]
        public void GivenBufferTooSmall_ExpectErrorCode()
        {
            TestNativeWrapper.TestBufferTooSmall();
        }

        /// <summary>
        /// Derives from the wrapper base class and declares its own import so the test can call a native entry point directly.
        /// </summary>
        private abstract class TestNativeWrapper : NativeResourceWrapper
        {
            /// <summary>
            /// Calls the native function twice with a null handle: first with a 2-character buffer, then with the size reported back.
            /// </summary>
            public static void TestBufferTooSmall()
            {
                // 2 is obviously too small to contain any error
                const int InitialBufferSize = 2;

                double distance;
                int requiredSize = InitialBufferSize;
                StringBuilder message = new StringBuilder(requiredSize);

                // First attempt: passing IntPtr.Zero as the handle makes the native side report an error.
                NativeResult result = StringDistance_Distance(IntPtr.Zero, "123", "456", out distance, message, ref requiredSize);

                Assert.AreEqual(NativeResult.BufferTooSmall, result);
                Assert.IsTrue(requiredSize > InitialBufferSize);
                Assert.AreEqual(string.Empty, message.ToString());

                // Second attempt: grow the buffer to the size returned by native and retry.
                message.Capacity = requiredSize;
                result = StringDistance_Distance(IntPtr.Zero, "123", "456", out distance, message, ref requiredSize);

                Assert.AreEqual(NativeResult.InvalidParameter, result);
                Assert.AreEqual("pointer is null", message.ToString());
            }

            // Random dll import to test
            [DllImport("maluubaspeech-csharp.dll")]
            private static extern NativeResult StringDistance_Distance(IntPtr ptr, string s1, string s2, out double distance, StringBuilder buffer, ref int bufferSize);
        }
    }
}
// Licensed under the MIT License.

import {EnPronunciation} from "../../ts";
import {EnPhoneticDistance} from "../../ts/distance"

test("English phonetic distance equality.", () => {
    const dist = new EnPhoneticDistance();
    // This, is a test.
    const phrase = EnPronunciation.fromIpa("ðɪsɪzətɛst");
    expect(dist.distance(phrase, phrase)).toBe(0);
});

test("English phonetic distance.", () => {
    const dist = new EnPhoneticDistance();

    // sam pasupalak
    const sam = EnPronunciation.fromIpa("sæmpɑsupələk");
    // santa super black
    const santa = EnPronunciation.fromIpa("sæntəsupɝblæk");
    // samples pollux
    const samples = EnPronunciation.fromIpa("sæmpəlzpɑləks");

    // Check identity of indiscernibles
    expect(dist.distance(sam, sam)).toBe(0.0);
    expect(dist.distance(santa, santa)).toBe(0.0);
    expect(dist.distance(samples, samples)).toBe(0.0);

    // NOTE: `expect(a == b)` / `expect(a < b)` without a matcher asserts nothing,
    // so every comparison below goes through an explicit matcher.

    // Check symmetry (toBeCloseTo so floating-point rounding cannot fail it)
    expect(dist.distance(sam, santa)).toBeCloseTo(dist.distance(santa, sam), 10);
    expect(dist.distance(sam, samples)).toBeCloseTo(dist.distance(samples, sam), 10);
    expect(dist.distance(santa, samples)).toBeCloseTo(dist.distance(samples, santa), 10);

    // Check triangle inequality
    expect(dist.distance(sam, samples))
        .toBeLessThanOrEqual(dist.distance(sam, santa) + dist.distance(santa, samples));
    expect(dist.distance(sam, santa))
        .toBeLessThanOrEqual(dist.distance(sam, samples) + dist.distance(samples, santa));
    expect(dist.distance(santa, samples))
        .toBeLessThanOrEqual(dist.distance(santa, sam) + dist.distance(sam, samples));

    // Check performance
    expect(dist.distance(sam, santa)).toBeLessThan(dist.distance(sam, samples));
    expect(dist.distance(sam, samples)).toBeLessThan(dist.distance(santa, samples));
});

test("ctor used as function exception.", () => {
    expect(() => {
        // Calling the class without `new` must throw.
        (EnPhoneticDistance as any)();
    }).toThrow();
});

test("Distance on undefined exception.", () => {
    expect(() => {
        const dist = new EnPhoneticDistance();
        dist.distance(undefined, undefined);
    }).toThrow();
});
51 | if (index < query.Length) 52 | { 53 | var interval = new Interval(index, query.Length); 54 | var token = new Token(query.Substring(interval.First, interval.Length), interval); 55 | result.Add(token); 56 | } 57 | 58 | return result; 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/cs/PhoneticMatchingTests/Nlp/TokenizerTests.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | namespace PhoneticMatchingTests.Nlp 5 | { 6 | using System.Collections.Generic; 7 | using Microsoft.VisualStudio.TestTools.UnitTesting; 8 | using Microsoft.PhoneticMatching.Nlp.Tokenizer; 9 | 10 | [TestClass] 11 | public class TokenizerTests 12 | { 13 | private readonly WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(); 14 | 15 | [TestMethod] 16 | public void GivenEmptyString_ExpectNoToken() 17 | { 18 | var result = this.tokenizer.Tokenize(string.Empty); 19 | Assert.AreEqual(0, result.Count, "Expect no token for empty query"); 20 | } 21 | 22 | [TestMethod] 23 | public void GivenNoWhitespace_ExpectIdentity() 24 | { 25 | const string Query = "example"; 26 | var result = this.tokenizer.Tokenize(Query); 27 | Assert.AreEqual(1, result.Count); 28 | Assert.AreEqual(Query, result[0].Value); 29 | } 30 | 31 | [TestMethod] 32 | public void GivenQueryNotEndingWithSpaces_ExpectNoWhitespaceOrEmpty() 33 | { 34 | var result = this.tokenizer.Tokenize(" There are some words, here! #blessed"); 35 | var expected = new string[] { "There", "are", "some", "words,", "here!", "#blessed" }; 36 | this.AssertTokensAreEquals(expected, result); 37 | } 38 | 39 | [TestMethod] 40 | public void GivenQueryEndingWithSpaces_ExpectNoWhitespaceOrEmpty() 41 | { 42 | var result = this.tokenizer.Tokenize(" There are some words, here! 
#blessed "); 43 | var expected = new string[] { "There", "are", "some", "words,", "here!", "#blessed" }; 44 | this.AssertTokensAreEquals(expected, result); 45 | } 46 | 47 | private void AssertTokensAreEquals(string[] expectedValues, IList tokens) 48 | { 49 | Assert.AreEqual(expectedValues.Length, tokens.Count, "Tokenizer didn't return the expected result."); 50 | for (int idx = 0; idx < expectedValues.Length; ++idx) 51 | { 52 | Assert.AreEqual(expectedValues[idx], tokens[idx].Value); 53 | } 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/maluuba/speech/nodejs/performance/performance.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include "maluuba/speech/nodejs/performance.hpp" 5 | 6 | namespace maluuba 7 | { 8 | namespace speech 9 | { 10 | namespace nodejs 11 | { 12 | v8::Persistent Performance::s_performance; 13 | 14 | void 15 | Performance::Init(v8::Local module) 16 | { 17 | auto isolate = module->GetIsolate(); 18 | v8::Local context = isolate->GetCurrentContext(); 19 | 20 | auto require = module->Get(v8::String::NewFromUtf8(isolate, "require")).As(); 21 | const auto argc = 1; 22 | v8::Local argv[argc] = { v8::String::NewFromUtf8(isolate, "perf_hooks") }; 23 | auto perf_hooks = require->Call(context, module, argc, argv).ToLocalChecked().As(); 24 | auto performance = perf_hooks->Get(v8::String::NewFromUtf8(isolate, "performance")).As(); 25 | s_performance.Reset(isolate, performance); 26 | } 27 | 28 | void 29 | Performance::Mark(const std::string& name) 30 | { 31 | auto isolate = v8::Isolate::GetCurrent(); 32 | v8::Local context = isolate->GetCurrentContext(); 33 | 34 | auto performance = s_performance.Get(isolate); 35 | auto mark = performance->Get(v8::String::NewFromUtf8(isolate, "mark")).As(); 36 | const auto argc = 1; 37 | v8::Local argv[argc] = { 
v8::String::NewFromUtf8(isolate, name.data()) }; 38 | mark->Call(context, performance, argc, argv); 39 | } 40 | 41 | void 42 | Performance::Measure(const std::string& name, const std::string& start_mark, const std::string& end_mark) 43 | { 44 | auto isolate = v8::Isolate::GetCurrent(); 45 | v8::Local context = isolate->GetCurrentContext(); 46 | 47 | auto performance = s_performance.Get(isolate); 48 | auto measure = performance->Get(v8::String::NewFromUtf8(isolate, "measure")).As(); 49 | const auto argc = 3; 50 | v8::Local argv[argc] = { v8::String::NewFromUtf8(isolate, name.data()), 51 | v8::String::NewFromUtf8(isolate, start_mark.data()), v8::String::NewFromUtf8(isolate, end_mark.data()) }; 52 | measure->Call(context, performance, argc, argv); 53 | } 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /tests/distance/enhybriddistance.test.ts: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
import { EnPronouncer } from "../../ts";
import { DistanceInput, EnHybridDistance } from "../../ts/distance"

const pronouncer = new EnPronouncer();

/** Pairs a phrase with the pronunciation the shared pronouncer produces for it. */
const makeInput = (phrase: string): DistanceInput => ({
    phrase,
    pronunciation: pronouncer.pronounce(phrase)
});

test("English hybrid distance equality.", () => {
    const hybrid = new EnHybridDistance(0.7);
    const left = makeInput("This, is a test.");
    const right = makeInput("This, is a test.");
    expect(hybrid.distance(left, right)).toBe(0);
});

test("English hybrid distance get phoneticWeightPercentage.", () => {
    const hybrid = new EnHybridDistance(0.7);
    expect(hybrid.phoneticWeightPercentage).toBe(0.7)
});

test("English hybrid distance.", () => {
    const hybrid = new EnHybridDistance(0.7);

    // Different phrases are some distance apart; identical ones (even empty) are not.
    expect(hybrid.distance(makeInput("aaa"), makeInput("bbb"))).toBeGreaterThan(0);
    expect(hybrid.distance(makeInput("aaa"), makeInput("aaa"))).toBe(0);
    expect(hybrid.distance(makeInput(""), makeInput(""))).toBe(0);
});

test("ctor used as function exception.", () => {
    // The class must be invoked with `new`.
    expect(() => {
        (EnHybridDistance as any)();
    }).toThrow();
});

test("Distance on undefined exception.", () => {
    expect(() => {
        const hybrid = new EnHybridDistance(0.7);
        hybrid.distance(undefined, undefined);
    }).toThrow();
});

test("Distance on empty objects.", () => {
    expect(() => {
        const hybrid = new EnHybridDistance(0.7);
        hybrid.distance({} as any, {} as any);
    }).toThrow();
});

test("Distance on empty input.", () => {
    expect(() => {
        const hybrid = new EnHybridDistance(0.7);
        hybrid.distance(
            {phrase:"", pronunciation: undefined},
            {phrase:"", pronunciation: undefined} as any);
    }).toThrow();
});

test("phoneticWeightPercentage undefined.", () => {
    expect(() => {
        new EnHybridDistance(undefined);
    }).toThrow();
});

test("phoneticWeightPercentage out of range.", () => {
    expect(() => {
        new EnHybridDistance(2);
    }).toThrow();
});
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

namespace Microsoft.PhoneticMatching.Nlp.Preprocessor
{
    using System.Text.RegularExpressions;

    /// <summary>
    /// Pre-processor for English queries.
    /// </summary>
    public class EnPreProcessor : IPreProcessor
    {
        /// <summary>
        /// Rewrite rules run on the query after case folding and before white space pre-processing.
        /// Rules are applied in the order they were added to the collection.
        /// </summary>
        protected readonly ChainedRuleBasedPreProcessor Rules = new ChainedRuleBasedPreProcessor();

        // Alternation of the words removed from every query.
        private const string StopWords = "a|an|at|by|el|i|in|la|las|los|my|of|on|san|santa|some|the|with|you";

        // TODO this belongs in native code to provide functionality cross language/platform. Will probably have to use libicu in some way.
        private readonly UnicodePreProcessor unicode = new UnicodePreProcessor();
        private readonly CaseFoldingPreProcessor caseFold = new CaseFoldingPreProcessor();
        private readonly WhiteSpacePreProcessor whitespace = new WhiteSpacePreProcessor();

        /// <summary>
        /// Initializes a new instance of the <see cref="EnPreProcessor"/> class.
        /// </summary>
        public EnPreProcessor()
        {
            // A whole-word match of any stop word.
            const string StopWordGroup = @"\b(" + StopWords + @")\b";

            // remove stop words: first together with one following space, then with one preceding space
            this.Rules.AddRule(new Regex(StopWordGroup + " ?"), string.Empty);
            this.Rules.AddRule(new Regex(" ?" + StopWordGroup), string.Empty);

            // clear punctuation: each run of punctuation/symbol characters becomes one space
            this.Rules.AddRule(new Regex(@"[\p{P}\p{S}]+"), " ");
        }

        /// <summary>
        /// Pre-process a string.
        /// </summary>
        /// <param name="query">The string to pre-process.</param>
        /// <returns>The pre-processed string.</returns>
        public string PreProcess(string query)
        {
            // Pipeline order: unicode -> case folding -> rewrite rules -> white space.
            string normalized = this.unicode.PreProcess(query);
            string folded = this.caseFold.PreProcess(normalized);
            string rewritten = this.Rules.PreProcess(folded);
            return this.whitespace.PreProcess(rewritten);
        }
    }
}
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

namespace Microsoft.PhoneticMatching.Matchers.FuzzyMatcher.Normalized
{
    using System;
    using System.Collections.Generic;
    using FuzzyMatcher;
    using PhoneticMatching.Distance;

    /// <summary>
    /// A string fuzzy matcher which normalizes results based on length of queries. The fuzziness is determined by the provided distance function.
    /// </summary>
    /// <typeparam name="Target">The type of the returned matched object.</typeparam>
    public class StringFuzzyMatcher<Target> : NormalizedFuzzyMatcher<Target, string>
    {
        /// <summary>
        /// Initializes a new instance of the <see cref="StringFuzzyMatcher{Target}"/> class.
        /// </summary>
        /// <param name="targets">The set of objects that will be matched against. The order of equal targets is not guaranteed to be preserved.</param>
        /// <param name="targetToExtraction">A mapping of the input types to the query(extraction) type. Note that Extraction == string for normalized cases.</param>
        /// <param name="isAccelerated">Whether the fuzzy matcher uses accelerated implementation or not.</param>
        public StringFuzzyMatcher(IList<Target> targets, Func<Target, string> targetToExtraction = null, bool isAccelerated = true)
        {
            this.GenericFuzzyMatcher = new FuzzyMatcher<Target, string>(targets, new StringDistance(), targetToExtraction, isAccelerated);
        }

        /// <summary>
        /// Find the __k__ nearest elements.
        /// </summary>
        /// <param name="query">The search target.</param>
        /// <param name="limit">The maximum distance to a match.</param>
        /// <param name="count">The maximum number of result to return.</param>
        /// <returns>The __k__ nearest matches to target within limit</returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="query"/> is null.</exception>
        public override IList<Match<Target>> FindNearestWithin(string query, double limit, int count)
        {
            if (query == null)
            {
                // The single-string constructor takes the parameter NAME, not the message;
                // pass both so ParamName and Message are each correct.
                throw new ArgumentNullException("query", "query can't be null");
            }

            // The limit is normalized by the length of the query string.
            double thresholdScale = query.Length;
            return this.FindNearestWithinNormalized(query, limit, count, thresholdScale);
        }
    }
}
23 | /// Whether the fuzzy matcher uses accelerated implementation or not. 24 | public FuzzyMatcher(IList targets, IDistance distance, Func targetToExtraction = null, bool isAccelerated = false) 25 | : this(targets, distance.Distance, targetToExtraction, isAccelerated) 26 | { 27 | } 28 | 29 | /// 30 | /// Initializes a new instance of the class. 31 | /// 32 | /// The set of objects that will be matched against. The order of equal targets is not guaranteed to be preserved. 33 | /// The distance delegate. 34 | /// A mapping of the input types to the query(extraction) type. Note that Extraction == Pronounceable for the usual case. 35 | /// Whether the fuzzy matcher uses accelerated implementation or not. 36 | public FuzzyMatcher(IList targets, DistanceFunc distance, Func targetToExtraction = null, bool isAccelerated = false) 37 | : base(isAccelerated, targets, distance, null, targetToExtraction) 38 | { 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /tests/enpronunciation.test.ts: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
import {EnPronunciation, Speech} from "../ts";

test("From ARPABET.", () => {
    const arpabet = EnPronunciation.fromArpabet(["dh", "ih1", "s", "ih1", "z", "ax0", "t", "eh1", "s", "t"]);
    expect(arpabet.ipa).toBe("ðɪsɪzətɛst");
    expect(arpabet.phones.length).toBeGreaterThan(0);
});

test("From IPA.", () => {
    const ipa = EnPronunciation.fromIpa("ðɪsɪzətɛst");
    expect(ipa.ipa).toBe("ðɪsɪzətɛst");
    expect(ipa.phones.length).toBeGreaterThan(0);
});

test("Phones.", () => {
    const pron = EnPronunciation.fromArpabet(["P", "R", "OW0", "N", "AH2", "N", "S", "IY0", "EY1", "SH", "AX0", "N"]);
    expect(pron.ipa).toBe("proʊ̯nʌnsieɪ̯ʃən");
    expect(pron.phones.length).toBeGreaterThan(3);

    // NOTE: `expect(value)` without a matcher asserts nothing, so the syllabic
    // checks below use toBeTruthy()/toBeFalsy() to actually test the flag.

    // p
    let phone = pron.phones[0];
    expect(phone.type).toBe(Speech.PhoneType.CONSONANT);
    expect(phone.phonation).toBe(Speech.Phonation.VOICELESS);
    expect(phone.place).toBe(Speech.PlaceOfArticulation.BILABIAL);
    expect(phone.manner).toBe(Speech.MannerOfArticulation.PLOSIVE);
    expect(phone.isSyllabic).toBeFalsy();

    // o
    phone = pron.phones[2];
    expect(phone.type).toBe(Speech.PhoneType.VOWEL);
    expect(phone.phonation).toBe(Speech.Phonation.MODAL);
    expect(phone.height).toBe(Speech.VowelHeight.CLOSE_MID);
    expect(phone.backness).toBe(Speech.VowelBackness.BACK);
    expect(phone.roundedness).toBe(Speech.VowelRoundedness.ROUNDED);
    expect(phone.isSyllabic).toBeTruthy();

    // ʊ̯
    phone = pron.phones[3];
    expect(phone.type).toBe(Speech.PhoneType.VOWEL);
    expect(phone.phonation).toBe(Speech.Phonation.MODAL);
    expect(phone.height).toBe(Speech.VowelHeight.NEAR_CLOSE);
    expect(phone.backness).toBe(Speech.VowelBackness.NEAR_BACK);
    expect(phone.roundedness).toBe(Speech.VowelRoundedness.ROUNDED);
    expect(phone.isSyllabic).toBeFalsy();
});

test("Invalid ARPABET character (has space)", () => {
    expect(() => {
        // "B " carries a trailing space and must be rejected.
        EnPronunciation.fromArpabet(["F","B ","N","EH","T","IH","K"]);
    }).toThrow("Unrecognized");
});

test("Object import called as function exception.", () => {
    expect(() => {
        (EnPronunciation as any)();
    }).toThrow();
});

test("Object import called as ctor exception.", () => {
    expect(() => {
        new (EnPronunciation as any)();
    }).toThrow();
});
35 | */ 36 | HybridDistance(double phonetic_weight_percentage) 37 | : m_phonetic_weight_percentage{phonetic_weight_percentage} 38 | { 39 | check(m_phonetic_weight_percentage >= 0.0 && m_phonetic_weight_percentage <= 1.0, 40 | "require 0 <= phonetic_weight_percentage <= 1"); 41 | } 42 | 43 | /** 44 | * @return The phonetic weight percentage being used. 45 | */ 46 | double 47 | phonetic_weight_percentage() const 48 | { 49 | return m_phonetic_weight_percentage; 50 | } 51 | 52 | /** 53 | * @return The combined phonetic and lexical distance between @p a and @p b. 54 | */ 55 | template 56 | double operator()(const StringInput& a_string, const PhoneticInput& a_pronunciation, const StringInput& b_string, const PhoneticInput& b_pronunciation) const 57 | { 58 | double string_weight = 0.0; 59 | double phonetic_weight = 0.0; 60 | if (m_phonetic_weight_percentage > 0.0) { 61 | phonetic_weight = m_phonetic_weight_percentage * m_phonetic_distance(a_pronunciation, b_pronunciation); 62 | } 63 | if (m_phonetic_weight_percentage < 1.0) { 64 | string_weight = (1.0 - m_phonetic_weight_percentage) * m_string_distance(a_string, b_string); 65 | } 66 | return phonetic_weight + string_weight; 67 | } 68 | 69 | private: 70 | double m_phonetic_weight_percentage; 71 | StringDistance m_string_distance; 72 | PhoneticDistance m_phonetic_distance; 73 | }; 74 | } 75 | } 76 | 77 | #endif // MALUUBA_SPEECH_HYBRID_DISTANCE_HPP 78 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Set default behavior to automatically normalize line endings. 3 | ############################################################################### 4 | * text=auto 5 | 6 | ############################################################################### 7 | # Set default behavior for command prompt diff. 
8 | # 9 | # This is needed for earlier builds of msysgit that do not have it on by 10 | # default for csharp files. 11 | # Note: This is only used by command line 12 | ############################################################################### 13 | #*.cs diff=csharp 14 | 15 | ############################################################################### 16 | # Set the merge driver for project and solution files 17 | # 18 | # Merging from the command prompt will add diff markers to the files if there 19 | # are conflicts (Merging from VS is not affected by the settings below, in VS 20 | # the diff markers are never inserted). Diff markers may cause the following 21 | # file extensions to fail to load in VS. An alternative would be to treat 22 | # these files as binary and thus will always conflict and require user 23 | # intervention with every merge. To do so, just uncomment the entries below 24 | ############################################################################### 25 | #*.sln merge=binary 26 | #*.csproj merge=binary 27 | #*.vbproj merge=binary 28 | #*.vcxproj merge=binary 29 | #*.vcproj merge=binary 30 | #*.dbproj merge=binary 31 | #*.fsproj merge=binary 32 | #*.lsproj merge=binary 33 | #*.wixproj merge=binary 34 | #*.modelproj merge=binary 35 | #*.sqlproj merge=binary 36 | #*.wwaproj merge=binary 37 | 38 | ############################################################################### 39 | # behavior for image files 40 | # 41 | # image files are treated as binary by default. 42 | ############################################################################### 43 | #*.jpg binary 44 | #*.png binary 45 | #*.gif binary 46 | 47 | ############################################################################### 48 | # diff behavior for common document formats 49 | # 50 | # Convert binary document formats to text before diffing them. This feature 51 | # is only available from the command line. Turn it on by uncommenting the 52 | # entries below. 
53 | ############################################################################### 54 | #*.doc diff=astextplain 55 | #*.DOC diff=astextplain 56 | #*.docx diff=astextplain 57 | #*.DOCX diff=astextplain 58 | #*.dot diff=astextplain 59 | #*.DOT diff=astextplain 60 | #*.pdf diff=astextplain 61 | #*.PDF diff=astextplain 62 | #*.rtf diff=astextplain 63 | #*.RTF diff=astextplain 64 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). 16 | 17 | You should receive a response within 24 hours. 
If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /src/maluuba/speech/pronouncer/pronouncer.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | #include "maluuba/speech/pronouncer.hpp" 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | namespace maluuba 11 | { 12 | namespace speech 13 | { 14 | namespace 15 | { 16 | cst_utterance* 17 | no_wave_synth(cst_utterance* u) 18 | { 19 | return u; 20 | } 21 | 22 | cst_voice* 23 | no_wave_voice() 24 | { 25 | flite_init(); 26 | 27 | cst_voice* v = new_voice(); 28 | cst_lexicon* lex; 29 | 30 | v->name = "no_wave_voice"; 31 | 32 | // Set up basic values for synthesizing with this voice 33 | usenglish_init(v); 34 | feat_set_string(v->features, "name", "cmu_us_no_wave"); 35 | 36 | // Lexicon 37 | lex = cmu_lex_init(); 38 | feat_set(v->features, "lexicon", lexicon_val(lex)); 39 | 40 | // Post lexical rules 41 | feat_set(v->features, "postlex_func", uttfunc_val(lex->postlex)); 42 | 43 | // Waveform synthesis: diphone_synth 44 | feat_set(v->features, "wave_synth_func", uttfunc_val(&no_wave_synth)); 45 | 46 | return v; 47 | } 48 | } 49 | 50 | Pronouncer::~Pronouncer() = default; 51 | 52 | using VoiceHandle = std::unique_ptr; 53 | 54 | struct EnPronouncer::Impl 55 | { 56 | VoiceHandle voice; 57 | 58 | Impl() 59 | : voice{no_wave_voice(), delete_voice} 60 | { } 61 | }; 62 | 63 | EnPronouncer::EnPronouncer() 64 | : m_impl{std::make_unique()} 65 | { } 66 | 67 | EnPronouncer::~EnPronouncer() = default; 68 | 69 | EnPronouncer::EnPronouncer(EnPronouncer&& other) = default; 70 | 71 | EnPronouncer& 72 | EnPronouncer::operator=(EnPronouncer&& other) = default; 73 | 74 | EnPronunciation 75 | EnPronouncer::pronounce(const std::string& text) const 76 | { 77 | using UtteranceHandle = std::unique_ptr; 78 | 79 | std::vector phonemes; 80 | 81 | auto utt = flite_synth_text(text.c_str(), m_impl->voice.get()); 82 | UtteranceHandle utt_handle{utt, delete_utterance}; 83 | 84 | for (auto s = relation_head(utt_relation(utt, "Segment")); s; s = item_next(s)) { 85 | std::string name = item_feat_string(s, "name"); 86 | if (name == "pau") { 87 | continue; 88 | } 89 | 90 | if 
(strcmp("+", ffeature_string(s, "ph_vc")) == 0) { 91 | // If the phoneme is a vowel, add stress value 92 | name += ffeature_string(s, "R:SylStructure.parent.stress"); 93 | } 94 | phonemes.push_back(std::move(name)); 95 | } 96 | 97 | return EnPronunciation::from_arpabet(phonemes); 98 | } 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /src/maluuba/speech/phoneticdistance.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file 3 | * Phonetic distance. 4 | * 5 | * @author Benedicte Pierrejean 6 | * @author Tavian Barnes (tavian.barnes@microsoft.com) 7 | * 8 | * Copyright (c) Microsoft Corporation. All rights reserved. 9 | * Licensed under the MIT License. 10 | */ 11 | 12 | #ifndef MALUUBA_SPEECH_PHONETIC_DISTANCE_HPP 13 | #define MALUUBA_SPEECH_PHONETIC_DISTANCE_HPP 14 | 15 | #include "maluuba/speech/pronunciation.hpp" 16 | #include 17 | 18 | namespace maluuba 19 | { 20 | namespace speech 21 | { 22 | /** 23 | * A phoneme embedded in a metric space for similarity measurement. 24 | */ 25 | class PhonemeVector 26 | { 27 | public: 28 | /** 29 | * Initialize a @c PhonemeVector. 30 | * 31 | * @param v The three-dimensional embedding of this phoneme. 32 | * @param syllabic Whether the phoneme is syllabic. 33 | */ 34 | PhonemeVector(float v[3], bool syllabic); 35 | 36 | /** 37 | * @return The @p i'th dimension (out of 3) of the vector representation. 38 | */ 39 | float operator[](std::size_t i) const; 40 | 41 | /** 42 | * @return Whether this phoneme is syllabic. 43 | */ 44 | bool is_syllabic() const; 45 | 46 | private: 47 | friend bool operator==(const PhonemeVector&, const PhonemeVector&); 48 | 49 | float m_v[3]; 50 | bool m_syllabic; 51 | }; 52 | 53 | bool operator==(const PhonemeVector& lhs, const PhonemeVector& rhs); 54 | 55 | /** 56 | * An entire pronunciation embedded in a metric space. 
57 | */ 58 | using PronunciationVector = std::vector; 59 | 60 | /** 61 | * Compute the vector representation of a pronunciation for similarity 62 | * measurement. 63 | * 64 | * @param pronunciation The pronunciation to embed. 65 | * @return A metric space embedding of the pronunciation. 66 | */ 67 | PronunciationVector phonetic_embedding(const Pronunciation& pronunciation); 68 | 69 | /** 70 | * Compute the phonetic distance between pronunciations. 71 | */ 72 | class PhoneticDistance 73 | { 74 | public: 75 | virtual ~PhoneticDistance() = 0; 76 | 77 | protected: 78 | /** 79 | * @return The phonetic distance of phonemes between @p a and @p b. 80 | */ 81 | double operator()(const PronunciationVector& a, const PronunciationVector& b) const; 82 | }; 83 | 84 | /** 85 | * Compute the phonetic distance between English pronunciations. 86 | */ 87 | class EnPhoneticDistance: public PhoneticDistance 88 | { 89 | public: 90 | virtual ~EnPhoneticDistance(); 91 | 92 | /** 93 | * @return The phonetic distance between English pronuncations @p a and @p b. 94 | */ 95 | double operator()(const EnPronunciation& a, const EnPronunciation& b) const; 96 | }; 97 | } 98 | } 99 | 100 | #endif // MALUUBA_SPEECH_PHONETIC_DISTANCE_HPP 101 | -------------------------------------------------------------------------------- /src/cs/PhoneticMatchingTests/Matchers/BaseContactMatcherTester.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
namespace PhoneticMatchingTests.Matchers
{
    /// <summary>
    /// Shared fixture data for the contact matcher tests: the same five contacts
    /// expressed once as plain strings and once as <see cref="TestContact"/> objects.
    /// </summary>
    public class BaseContactMatcherTester
    {
        /// <summary>
        /// Contact names as plain strings.
        /// </summary>
        protected readonly string[] TargetStrings =
        {
            "Andrew Smith",
            "Andrew",
            "John B",
            "John C",
            "Jennifer"
        };

        /// <summary>
        /// Contact objects; two of them carry no Id and an empty last name.
        /// </summary>
        protected readonly TestContact[] Targets =
        {
            new TestContact { FirstName = "Andrew", LastName = "Smith", Id = "1234567" },
            new TestContact { FirstName = "Andrew", LastName = string.Empty },
            new TestContact { FirstName = "John", LastName = "B", Id = "7654321" },
            new TestContact { FirstName = "John", LastName = "C", Id = "2222222" },
            new TestContact { FirstName = "Jennifer", LastName = string.Empty }
        };

        /// <summary>
        /// Minimal contact record with value equality over first name, last name and id.
        /// </summary>
        protected class TestContact
        {
            public string FirstName { get; set; }

            public string LastName { get; set; }

            public string Id { get; set; }

            /// <summary>
            /// Gets the first and last name joined by a single space.
            /// </summary>
            public string FullName
            {
                get { return string.Format("{0} {1}", this.FirstName, this.LastName); }
            }

            /// <summary>
            /// Two contacts are equal when they have the same runtime type and identical
            /// first name, last name and id.
            /// </summary>
            /// <param name="obj">The object to compare with.</param>
            /// <returns>True when <paramref name="obj"/> is an equal contact.</returns>
            public override bool Equals(object obj)
            {
                if (object.ReferenceEquals(this, obj))
                {
                    return true;
                }

                if (obj == null || obj.GetType() != this.GetType())
                {
                    return false;
                }

                var that = (TestContact)obj;
                return that.FirstName == this.FirstName
                    && that.LastName == this.LastName
                    && that.Id == this.Id;
            }

            /// <summary>
            /// Hash of the concatenated first name, last name and id.
            /// </summary>
            /// <returns>The hash code.</returns>
            public override int GetHashCode()
            {
                return string.Concat(this.FirstName, this.LastName, this.Id).GetHashCode();
            }

            /// <summary>
            /// Returns <see cref="FullName"/>.
            /// </summary>
            /// <returns>The full name.</returns>
            public override string ToString()
            {
                return this.FullName;
            }
        }
    }
}
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

namespace Microsoft.PhoneticMatching.Nlp.Preprocessor
{
    using System.Text.RegularExpressions;

    /// <summary>
    /// English pre-processor that additionally expands abbreviations common in place names.
    /// </summary>
    public class EnPlacesPreProcessor : EnPreProcessor
    {
        /// <summary>
        /// Initializes a new instance of the <see cref="EnPlacesPreProcessor"/> class and
        /// registers the place-specific rules, in the order listed below.
        /// </summary>
        public EnPlacesPreProcessor()
        {
            // Each row is { pattern, replacement }. Rows are registered top to bottom.
            string[,] placeRules =
            {
                // Cardinal directions.
                { @"\be\b", "east" },
                { @"\bn\b", "north" },
                { @"\bs\b", "south" },
                { @"\bw\b", "west" },
                { @"\bne\b", "north east" },
                { @"\bnw\b", "north west" },
                { @"\bse\b", "south east" },
                { @"\bsw\b", "south west" },

                // Address abbreviations. A word boundary does not match after the
                // optional ".", so the end of the token is detected with a look-ahead.
                { @"\baly\.?(?=[\s\p{P}\p{S}]|$)", "alley" },
                { @"\bave?\.?(?=[\s\p{P}\p{S}]|$)", "avenue" },
                { @"\bblvd\.?(?=[\s\p{P}\p{S}]|$)", "boulevard" },
                { @"\bbnd\.?(?=[\s\p{P}\p{S}]|$)", "bend" },
                { @"\bcres\.?(?=[\s\p{P}\p{S}]|$)", "crescent" },
                { @"\bcir\.?(?=[\s\p{P}\p{S}]|$)", "circle" },
                { @"\bct\.?(?=[\s\p{P}\p{S}]|$)", "court" },
                { @"\bdr\.?(?=[\s\p{P}\p{S}]|$)", "drive" },
                { @"\best\.?(?=[\s\p{P}\p{S}]|$)", "estate" },
                { @"\bln\.?(?=[\s\p{P}\p{S}]|$)", "lane" },
                { @"\bpkwy\.?(?=[\s\p{P}\p{S}]|$)", "parkway" },
                { @"\bpl\.?(?=[\s\p{P}\p{S}]|$)", "place" },
                { @"\brd\.?(?=[\s\p{P}\p{S}]|$)", "road" },

                // "st" at the very beginning is assumed to mean "saint".
                { @"^st\.?(?=[\s\p{P}\p{S}]|$)", "saint" },

                // Anywhere else we cannot know whether "st" means "saint" or "street";
                // it is expanded to "street".
                { @"\bst\.?(?=[\s\p{P}\p{S}]|$)", "street" },
                { @"\bxing\.?(?=[\s\p{P}\p{S}]|$)", "crossing" }
            };

            for (int row = 0; row < placeRules.GetLength(0); ++row)
            {
                this.Rules.AddRule(new Regex(placeRules[row, 0]), placeRules[row, 1]);
            }
        }
    }
}
Debug|Any CPU 27 | {E78BAD40-0AA8-49D5-B219-111D5645D6E7}.Debug|Any CPU.Build.0 = Debug|Any CPU 28 | {E78BAD40-0AA8-49D5-B219-111D5645D6E7}.Debug|x64.ActiveCfg = Debug|x64 29 | {E78BAD40-0AA8-49D5-B219-111D5645D6E7}.Debug|x64.Build.0 = Debug|x64 30 | {E78BAD40-0AA8-49D5-B219-111D5645D6E7}.Release|Any CPU.ActiveCfg = Release|Any CPU 31 | {E78BAD40-0AA8-49D5-B219-111D5645D6E7}.Release|Any CPU.Build.0 = Release|Any CPU 32 | {E78BAD40-0AA8-49D5-B219-111D5645D6E7}.Release|x64.ActiveCfg = Release|x64 33 | {E78BAD40-0AA8-49D5-B219-111D5645D6E7}.Release|x64.Build.0 = Release|x64 34 | {BA07F296-D5C6-4521-96E5-E764D7419EEE}.Debug|Any CPU.ActiveCfg = Debug|x64 35 | {BA07F296-D5C6-4521-96E5-E764D7419EEE}.Debug|x64.ActiveCfg = Debug|x64 36 | {BA07F296-D5C6-4521-96E5-E764D7419EEE}.Debug|x64.Build.0 = Debug|x64 37 | {BA07F296-D5C6-4521-96E5-E764D7419EEE}.Release|Any CPU.ActiveCfg = Release|x64 38 | {BA07F296-D5C6-4521-96E5-E764D7419EEE}.Release|Any CPU.Build.0 = Release|x64 39 | {BA07F296-D5C6-4521-96E5-E764D7419EEE}.Release|x64.ActiveCfg = Release|x64 40 | {BA07F296-D5C6-4521-96E5-E764D7419EEE}.Release|x64.Build.0 = Release|x64 41 | EndGlobalSection 42 | GlobalSection(SolutionProperties) = preSolution 43 | HideSolutionNode = FALSE 44 | EndGlobalSection 45 | GlobalSection(ExtensibilityGlobals) = postSolution 46 | SolutionGuid = {0D6EBE7F-D0D5-4B91-B0B5-EFF497F76B6A} 47 | EndGlobalSection 48 | EndGlobal 49 | -------------------------------------------------------------------------------- /src/cs/Microsoft.PhoneticMatching/Matchers/FuzzyMatcher/FuzzyMatcherBase.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | namespace Microsoft.PhoneticMatching.Matchers 5 | { 6 | using System; 7 | using System.Runtime.InteropServices; 8 | using System.Text; 9 | 10 | /// 11 | /// Abstract class to define static imports for generic fuzzy matcher. 
/// <summary>
/// Abstract class to define static imports for generic fuzzy matcher.
/// All native entry points are resolved from "maluubaspeech-csharp.dll".
/// </summary>
public abstract class FuzzyMatcherBase : NativeResourceWrapper
{
    /// <summary>
    /// Initializes a new instance of the <see cref="FuzzyMatcherBase"/> class.
    /// </summary>
    /// <param name="args">Parameter(s) required to initialize the native object if any.</param>
    public FuzzyMatcherBase(params object[] args) : base(args)
    {
    }

    /// <summary>
    /// Delegate type passed to native code to access the managed objects using their indexes and compute distance on them.
    /// </summary>
    /// <param name="firstIdx">Index of the first managed object.</param>
    /// <param name="secondIdx">Index of the second managed object.</param>
    /// <returns>The distance between the first and second managed objects.</returns>
    protected delegate double DistanceDelegate(int firstIdx, int secondIdx);

    // Native import that creates a matcher and returns its pointer through `fuzzyMatcher`.
    // NOTE(review): the meaning of count/errorMsg/bufferSize is inferred from the parameter
    // names only -- confirm against the native bindings.
    [DllImport("maluubaspeech-csharp.dll")]
    protected static extern NativeResult FuzzyMatcher_Create(int count, DistanceDelegate distance, bool isAccelerated, out IntPtr fuzzyMatcher, StringBuilder errorMsg, ref int bufferSize);

    // Native nearest-neighbour query; the two arrays are marshalled in both directions ([In, Out]).
    [DllImport("maluubaspeech-csharp.dll")]
    protected static extern NativeResult FuzzyMatcher_FindNearestWithin(IntPtr native, int count, double limit, [In, Out] int[] nearestIdx, [In, Out] double[] distances, StringBuilder buffer, ref int bufferSize);

    // Same signature as FuzzyMatcher_FindNearestWithin, bound to the "Accelerated" native entry point.
    [DllImport("maluubaspeech-csharp.dll")]
    protected static extern NativeResult AcceleratedFuzzyMatcher_FindNearestWithin(IntPtr native, int count, double limit, [In, Out] int[] nearestIdx, [In, Out] double[] distances, StringBuilder buffer, ref int bufferSize);

    /// <summary>
    /// Delete the native pointer using the type specified in native bindings.
    /// </summary>
    /// <param name="native">Pointer to the native object.</param>
    /// <param name="buffer">Buffer for any error message.</param>
    /// <param name="bufferSize">Size of the buffer, to be adjusted if error doesn't fit the current size.</param>
    /// <returns>The result code from native library.</returns>
    protected override NativeResult NativeDelete(IntPtr native, StringBuilder buffer, ref int bufferSize)
    {
        return FuzzyMatcher_Delete(native, buffer, ref bufferSize);
    }

    // Native import used by NativeDelete above.
    [DllImport("maluubaspeech-csharp.dll")]
    private static extern NativeResult FuzzyMatcher_Delete(IntPtr ptr, StringBuilder buffer, ref int bufferSize);
}
target as string : targetToExtractionPhrase(target); 29 | if (phrase == null) 30 | { 31 | throw new InvalidCastException($"Can't cast Target type [{typeof(Target)}] to Extraction type [string]. You must provide a conversion function 'targetToExtractionPhrase'."); 32 | } 33 | 34 | return this.pronouncer.Pronounce(phrase); 35 | }; 36 | this.GenericFuzzyMatcher = new FuzzyMatcher(targets, new EnPhoneticDistance(), targetToExtraction, isAccelerated); 37 | } 38 | 39 | /// 40 | /// Find the __k__ nearest elements. 41 | /// 42 | /// The search target. 43 | /// The maximum distance to a match. 44 | /// The maximum number of result to return. 45 | /// The __k__ nearest matches to target within limit 46 | public override IList> FindNearestWithin(string query, double limit, int count) 47 | { 48 | var pronunciation = this.pronouncer.Pronounce(query); 49 | double thresholdScale = pronunciation.Ipa.Length; 50 | return this.FindNearestWithinNormalized(pronunciation, limit, count, thresholdScale); 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/maluuba/speech/nodejs/match/match.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | #include "maluuba/speech/nodejs/match.hpp" 5 | #include 6 | 7 | namespace maluuba 8 | { 9 | namespace speech 10 | { 11 | namespace nodejs 12 | { 13 | namespace 14 | { 15 | void 16 | getDistance(v8::Local property, const v8::PropertyCallbackInfo& info) 17 | { 18 | auto isolate = info.GetIsolate(); 19 | auto obj = node::ObjectWrap::Unwrap(info.Holder()); 20 | auto distance = obj->match().distance(); 21 | info.GetReturnValue().Set(v8::Number::New(isolate, distance)); 22 | } 23 | 24 | void 25 | getElement(v8::Local property, const v8::PropertyCallbackInfo& info) 26 | { 27 | auto isolate = info.GetIsolate(); 28 | auto obj = node::ObjectWrap::Unwrap(info.Holder()); 29 | auto element = obj->match().element().Get(isolate); 30 | info.GetReturnValue().Set(element); 31 | } 32 | 33 | void 34 | setThrow(v8::Local property, v8::Local value, const v8::PropertyCallbackInfo& info) 35 | { 36 | auto isolate = info.GetIsolate(); 37 | isolate->ThrowException(v8::Exception::Error( 38 | v8::String::NewFromUtf8(isolate, "Object is immutable, setters not allowed."))); 39 | return; 40 | } 41 | } 42 | 43 | v8::Persistent Match::s_constructor; 44 | 45 | Match::Match(Match::MatchType match) 46 | : m_match{std::move(match)} 47 | { } 48 | 49 | v8::Local 50 | Match::constructor(v8::Isolate* isolate) 51 | { 52 | return v8::Local::New(isolate, s_constructor); 53 | } 54 | 55 | void 56 | Match::Init(v8::Local exports) 57 | { 58 | auto isolate = exports->GetIsolate(); 59 | v8::Local context = isolate->GetCurrentContext(); 60 | 61 | auto tpl = v8::FunctionTemplate::New(isolate, New); 62 | tpl->SetClassName(v8::String::NewFromUtf8(isolate, "Match")); 63 | tpl->InstanceTemplate()->SetInternalFieldCount(1); 64 | tpl->InstanceTemplate()->SetAccessor(v8::String::NewFromUtf8(isolate, "distance"), getDistance, setThrow); 65 | tpl->InstanceTemplate()->SetAccessor(v8::String::NewFromUtf8(isolate, "element"), getElement, setThrow); 66 | 67 | s_constructor.Reset(isolate, 
tpl->GetFunction(context).ToLocalChecked()); 68 | } 69 | 70 | void 71 | Match::New(const v8::FunctionCallbackInfo& args) 72 | { 73 | auto isolate = args.GetIsolate(); 74 | 75 | if (!args[0]->IsExternal()) { 76 | isolate->ThrowException(v8::Exception::TypeError( 77 | v8::String::NewFromUtf8(isolate, "Not Expected to initialize directly, use a Fuzzy Matcher."))); 78 | return; 79 | } 80 | 81 | auto self = args.Holder(); 82 | auto external = args[0].As(); 83 | auto obj = static_cast(external->Value()); 84 | obj->Wrap(self); 85 | args.GetReturnValue().Set(self); 86 | } 87 | 88 | const Match::MatchType& 89 | Match::match() const 90 | { 91 | return m_match; 92 | } 93 | } 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/maluuba/speech/nodejs/stringdistance/stringdistance.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | #include "maluuba/speech/nodejs/stringdistance.hpp" 5 | #include 6 | #include 7 | 8 | namespace maluuba 9 | { 10 | namespace speech 11 | { 12 | namespace nodejs 13 | { 14 | v8::Persistent StringDistance::s_constructor; 15 | v8::Persistent StringDistance::s_type; 16 | 17 | StringDistance::StringDistance(LevenshteinDistance<> distance) 18 | : m_distance{std::move(distance)} 19 | { } 20 | 21 | StringDistance::~StringDistance() = default; 22 | 23 | v8::Local 24 | StringDistance::type(v8::Isolate* isolate) 25 | { 26 | return s_type.Get(isolate); 27 | } 28 | 29 | void 30 | StringDistance::Init(v8::Local exports) 31 | { 32 | auto isolate = exports->GetIsolate(); 33 | v8::Local context = isolate->GetCurrentContext(); 34 | 35 | auto tpl = v8::FunctionTemplate::New(isolate, New); 36 | tpl->SetClassName(v8::String::NewFromUtf8(isolate, "StringDistance")); 37 | tpl->InstanceTemplate()->SetInternalFieldCount(1); 38 | 39 | NODE_SET_PROTOTYPE_METHOD(tpl, "distance", Distance); 40 | 41 | s_constructor.Reset(isolate, tpl->GetFunction(context).ToLocalChecked()); 42 | s_type.Reset(isolate, tpl); 43 | exports->Set(context, v8::String::NewFromUtf8(isolate, "StringDistance"), tpl->GetFunction(context).ToLocalChecked()); 44 | } 45 | 46 | void 47 | StringDistance::New(const v8::FunctionCallbackInfo& args) 48 | { 49 | auto isolate = args.GetIsolate(); 50 | 51 | if (args.IsConstructCall()) { 52 | LevenshteinDistance<> distance{}; 53 | auto obj = new StringDistance(std::move(distance)); 54 | obj->Wrap(args.This()); 55 | args.GetReturnValue().Set(args.This()); 56 | } else { 57 | isolate->ThrowException(v8::Exception::SyntaxError( 58 | v8::String::NewFromUtf8(isolate, "Not invoked as constructor, change to: `new StringDistance()`"))); 59 | return; 60 | } 61 | } 62 | 63 | void 64 | StringDistance::Distance(const v8::FunctionCallbackInfo& args) 65 | { 66 | auto isolate = args.GetIsolate(); 67 | 68 | if (args.Length() < 2) { 69 | isolate->ThrowException(v8::Exception::TypeError( 70 | 
v8::String::NewFromUtf8(isolate, "Expected 2 arguments."))); 71 | return; 72 | } 73 | 74 | if (!args[0]->IsString() || !args[1]->IsString()) { 75 | isolate->ThrowException(v8::Exception::TypeError( 76 | v8::String::NewFromUtf8(isolate, "Expected arguments to be string."))); 77 | return; 78 | } 79 | 80 | auto obj = ObjectWrap::Unwrap(args.Holder()); 81 | std::string a{*v8::String::Utf8Value{isolate, args[0]}}; 82 | std::string b{*v8::String::Utf8Value{isolate, args[1]}}; 83 | auto distance = obj->distance()(a, b); 84 | 85 | args.GetReturnValue().Set(v8::Number::New(isolate, distance)); 86 | } 87 | 88 | const LevenshteinDistance<>& 89 | StringDistance::distance() const 90 | { 91 | return m_distance; 92 | } 93 | } 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /src/cs/Microsoft.PhoneticMatching/Matchers/MatcherConfig.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | namespace Microsoft.PhoneticMatching.Matchers 5 | { 6 | using System; 7 | 8 | /// 9 | /// Simple matcher configuration without default values. 10 | /// 11 | public class MatcherConfig 12 | { 13 | /// 14 | /// Initializes a new instance of the class. 15 | /// 16 | /// Weighting trade-off between the phonetic distance and the lexical distance scores. 17 | /// Maximum number of places the matcher can return 18 | /// Maximum distance to a match. 
Normalized to 0 for exact match, 1 for nothing matches 19 | /// Candidate cutoff given by Math.max({best matched distance} * bestDistanceMultiplier, maxDistanceMarginReturns) 20 | /// best distance multiplier 21 | public MatcherConfig( 22 | double phoneticWeightPercentage, 23 | int maxReturns, 24 | double findThreshold, 25 | double maxDistanceMarginReturns, 26 | double bestDistanceMultiplier) 27 | { 28 | this.PhoneticWeightPercentage = phoneticWeightPercentage; 29 | this.MaxReturns = maxReturns; 30 | this.FindThreshold = findThreshold; 31 | this.MaxDistanceMarginReturns = maxDistanceMarginReturns; 32 | this.BestDistanceMultiplier = bestDistanceMultiplier; 33 | 34 | if (this.PhoneticWeightPercentage < 0 || this.PhoneticWeightPercentage > 1) 35 | { 36 | throw new ArgumentException("require 0 <= phoneticWeightPercentage <= 1"); 37 | } 38 | } 39 | 40 | /// 41 | /// Gets or sets the Weighting trade-off between the phonetic distance and 42 | /// the lexical distance scores. Between 0 and 1. 1 meaning 100% phonetic score and 0% lexical score. 43 | /// 44 | public double PhoneticWeightPercentage { get; protected set; } 45 | 46 | /// 47 | /// Gets or sets the maximum number of places the matcher can return. 48 | /// 49 | public int MaxReturns { get; set; } 50 | 51 | /// 52 | /// Gets or sets the maximum distance to a match. Normalized to 0 for exact match, 1 for nothing matches. 53 | /// Can be >1 if the lengths do not match. 54 | /// 55 | public double FindThreshold { get; set; } 56 | 57 | /// 58 | /// Gets or sets the candidate cutoff given by Math.max({best matched distance} * bestDistanceMultiplier, maxDistanceMarginReturns). 59 | /// 60 | public double MaxDistanceMarginReturns { get; set; } 61 | 62 | /// 63 | /// Gets or sets the best distance multiplier. 
64 | /// 65 | public double BestDistanceMultiplier { get; set; } 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/maluuba/speech/nodejs/enpronouncer/enpronouncer.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include "maluuba/speech/nodejs/enpronouncer.hpp" 5 | #include "maluuba/speech/nodejs/enpronunciation.hpp" 6 | #include 7 | 8 | namespace maluuba 9 | { 10 | namespace speech 11 | { 12 | namespace nodejs 13 | { 14 | v8::Persistent EnPronouncer::s_constructor; 15 | 16 | EnPronouncer::EnPronouncer(speech::EnPronouncer pronouncer) 17 | : m_pronouncer{std::move(pronouncer)} 18 | { } 19 | 20 | EnPronouncer::~EnPronouncer() = default; 21 | 22 | void 23 | EnPronouncer::Init(v8::Local exports) 24 | { 25 | auto isolate = exports->GetIsolate(); 26 | v8::Local context = isolate->GetCurrentContext(); 27 | 28 | auto tpl = v8::FunctionTemplate::New(isolate, New); 29 | tpl->SetClassName(v8::String::NewFromUtf8(isolate, "EnPronouncer")); 30 | tpl->InstanceTemplate()->SetInternalFieldCount(1); 31 | 32 | NODE_SET_PROTOTYPE_METHOD(tpl, "pronounce", Pronounce); 33 | 34 | s_constructor.Reset(isolate, tpl->GetFunction(context).ToLocalChecked()); 35 | exports->Set(context, v8::String::NewFromUtf8(isolate, "EnPronouncer"), tpl->GetFunction(context).ToLocalChecked()); 36 | } 37 | 38 | void 39 | EnPronouncer::New(const v8::FunctionCallbackInfo& args) 40 | { 41 | auto isolate = args.GetIsolate(); 42 | 43 | if (args.IsConstructCall()) { 44 | speech::EnPronouncer pronouncer{}; 45 | auto obj = new EnPronouncer(std::move(pronouncer)); 46 | obj->Wrap(args.This()); 47 | args.GetReturnValue().Set(args.This()); 48 | } else { 49 | isolate->ThrowException(v8::Exception::SyntaxError( 50 | v8::String::NewFromUtf8(isolate, "Not invoked as constructor, change to: `new EnPronouncer()`"))); 51 | 
return; 52 | } 53 | } 54 | 55 | void 56 | EnPronouncer::Pronounce(const v8::FunctionCallbackInfo& args) 57 | { 58 | auto isolate = args.GetIsolate(); 59 | 60 | if (args.Length() < 1) { 61 | isolate->ThrowException(v8::Exception::TypeError( 62 | v8::String::NewFromUtf8(isolate, "Expected 1 argument."))); 63 | return; 64 | } 65 | 66 | if (!args[0]->IsString()) { 67 | isolate->ThrowException(v8::Exception::TypeError( 68 | v8::String::NewFromUtf8(isolate, "Expected argument to be a string."))); 69 | return; 70 | } 71 | 72 | auto obj = ObjectWrap::Unwrap(args.Holder()); 73 | v8::String::Utf8Value phrase{isolate, args[0]}; 74 | try { 75 | auto pronunciation = obj->pronouncer().pronounce(*phrase); 76 | 77 | auto wrap = new EnPronunciation(std::move(pronunciation)); 78 | const auto argc = 1; 79 | v8::Local argv[argc] = { v8::External::New(isolate, wrap) }; 80 | auto context = isolate->GetCurrentContext(); 81 | auto instance = EnPronunciation::constructor(isolate)->NewInstance(context, argc, argv).ToLocalChecked(); 82 | args.GetReturnValue().Set(instance); 83 | } catch (const std::exception& e) { 84 | isolate->ThrowException(v8::Exception::Error( 85 | v8::String::NewFromUtf8(isolate, e.what()))); 86 | return; 87 | } 88 | } 89 | 90 | const speech::EnPronouncer& 91 | EnPronouncer::pronouncer() const 92 | { 93 | return m_pronouncer; 94 | } 95 | } 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/maluuba/speech/pronunciation/arpabet.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include "maluuba/speech/pronunciation.hpp" 5 | #include "maluuba/xtd/string_view.hpp" 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | namespace maluuba 12 | { 13 | namespace speech 14 | { 15 | namespace 16 | { 17 | /** Map from Arpabet phonemes to IPA pronunciations. 
*/ 18 | const std::u16string& 19 | arpabet_to_ipa(const xtd::string_view phoneme) 20 | { 21 | static std::unordered_map arpabet_map = { 22 | // Vowels 23 | 24 | // Monophthongs 25 | {"AO", u"ɔ"}, 26 | {"AA", u"ɑ"}, 27 | {"IY", u"i"}, 28 | {"UW", u"u"}, 29 | {"EH", u"ɛ"}, 30 | {"IH", u"ɪ"}, 31 | {"UH", u"ʊ"}, 32 | {"AH", u"ʌ"}, 33 | {"AX", u"ə"}, 34 | {"AE", u"æ"}, 35 | 36 | // Diphthongs 37 | {"EY", u"eɪ̯"}, 38 | {"AY", u"aɪ̯"}, 39 | {"OW", u"oʊ̯"}, 40 | {"AW", u"aʊ̯"}, 41 | {"OY", u"ɔɪ̯"}, 42 | 43 | // Rhotic 44 | {"ER", u"ɝ"}, 45 | {"AXR", u"ɚ"}, 46 | 47 | // Consonants 48 | 49 | // Stops 50 | {"P", u"p"}, 51 | {"B", u"b"}, 52 | {"T", u"t"}, 53 | {"D", u"d"}, 54 | {"K", u"k"}, 55 | {"G", u"ɡ"}, 56 | 57 | // Affricates 58 | {"CH", u"tʃ"}, 59 | {"JH", u"dʒ"}, 60 | 61 | // Fricatives 62 | {"F", u"f"}, 63 | {"V", u"v"}, 64 | {"TH", u"θ"}, 65 | {"DH", u"ð"}, 66 | {"S", u"s"}, 67 | {"Z", u"z"}, 68 | {"SH", u"ʃ"}, 69 | {"ZH", u"ʒ"}, 70 | {"HH", u"h"}, 71 | 72 | // Nasals 73 | {"M", u"m"}, 74 | {"EM", u"m̩"}, 75 | {"N", u"n"}, 76 | {"EN", u"n̩"}, 77 | {"NG", u"ŋ"}, 78 | {"ENG", u"ŋ̍"}, 79 | 80 | // Liquids 81 | {"L", u"lˠ"}, 82 | {"EL", u"l̩ˠ"}, 83 | {"R", u"r"}, 84 | {"DX", u"ɾ"}, 85 | {"NX", u"ɾ̃"}, 86 | 87 | // Semivowels 88 | {"Y", u"j"}, 89 | {"W", u"w"}, 90 | {"Q", u"ʔ"}, 91 | 92 | // Suprasegmentals 93 | {" ", u" "}, 94 | }; 95 | 96 | auto found = arpabet_map.find(phoneme); 97 | if (found == arpabet_map.end()) { 98 | throw std::domain_error("Unrecognized ARPABET phoneme `" + std::string{phoneme} + "`."); 99 | } 100 | return found->second; 101 | } 102 | } 103 | 104 | EnPronunciation 105 | EnPronunciation::from_arpabet(const std::vector& arpabet) 106 | { 107 | std::u16string ipa; 108 | 109 | for (const auto& phoneme : arpabet) { 110 | std::string copy{phoneme.begin(), phoneme.end()}; 111 | 112 | // Convert to uppercase 113 | for (auto& c : copy) { 114 | if (c >= 'a' && c <= 'z') { 115 | c += 'A' - 'a'; 116 | } 117 | } 118 | 119 | if (!copy.empty()) { 120 | auto last 
= copy[copy.length() - 1]; 121 | if (last >= '0' && last <= '2') { 122 | copy.resize(copy.length() - 1); 123 | } 124 | } 125 | 126 | ipa += arpabet_to_ipa(copy); 127 | } 128 | 129 | return {std::move(ipa)}; 130 | } 131 | } 132 | } 133 | -------------------------------------------------------------------------------- /src/maluuba/speech/nodejs/enphoneticdistance/enphoneticdistance.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | #include "maluuba/speech/nodejs/enphoneticdistance.hpp" 5 | #include "maluuba/speech/nodejs/enpronunciation.hpp" 6 | #include 7 | 8 | namespace maluuba 9 | { 10 | namespace speech 11 | { 12 | namespace nodejs 13 | { 14 | v8::Persistent EnPhoneticDistance::s_constructor; 15 | v8::Persistent EnPhoneticDistance::s_type; 16 | 17 | EnPhoneticDistance::EnPhoneticDistance(speech::EnPhoneticDistance distance) 18 | : m_distance{std::move(distance)} 19 | { } 20 | 21 | EnPhoneticDistance::~EnPhoneticDistance() = default; 22 | 23 | v8::Local 24 | EnPhoneticDistance::type(v8::Isolate* isolate) 25 | { 26 | return s_type.Get(isolate); 27 | } 28 | 29 | void 30 | EnPhoneticDistance::Init(v8::Local exports) 31 | { 32 | auto isolate = exports->GetIsolate(); 33 | v8::Local context = isolate->GetCurrentContext(); 34 | 35 | auto tpl = v8::FunctionTemplate::New(isolate, New); 36 | tpl->SetClassName(v8::String::NewFromUtf8(isolate, "EnPhoneticDistance")); 37 | tpl->InstanceTemplate()->SetInternalFieldCount(1); 38 | 39 | NODE_SET_PROTOTYPE_METHOD(tpl, "distance", Distance); 40 | 41 | s_constructor.Reset(isolate, tpl->GetFunction(context).ToLocalChecked()); 42 | s_type.Reset(isolate, tpl); 43 | exports->Set(context, v8::String::NewFromUtf8(isolate, "EnPhoneticDistance"), tpl->GetFunction(context).ToLocalChecked()); 44 | } 45 | 46 | void 47 | EnPhoneticDistance::New(const v8::FunctionCallbackInfo& args) 48 | { 49 | auto 
isolate = args.GetIsolate(); 50 | 51 | if (args.IsConstructCall()) { 52 | speech::EnPhoneticDistance distance{}; 53 | auto obj = new EnPhoneticDistance(std::move(distance)); 54 | obj->Wrap(args.This()); 55 | args.GetReturnValue().Set(args.This()); 56 | } else { 57 | isolate->ThrowException(v8::Exception::SyntaxError( 58 | v8::String::NewFromUtf8(isolate, "Not invoked as constructor, change to: `new EnPhoneticDistance()`"))); 59 | return; 60 | } 61 | } 62 | 63 | void 64 | EnPhoneticDistance::Distance(const v8::FunctionCallbackInfo& args) 65 | { 66 | auto isolate = args.GetIsolate(); 67 | 68 | if (args.Length() < 2) { 69 | isolate->ThrowException(v8::Exception::TypeError( 70 | v8::String::NewFromUtf8(isolate, "Expected 2 arguments."))); 71 | return; 72 | } 73 | 74 | auto enPronunciationType = EnPronunciation::type(isolate); 75 | if (!enPronunciationType->HasInstance(args[0]) || !enPronunciationType->HasInstance(args[1])) { 76 | isolate->ThrowException(v8::Exception::TypeError( 77 | v8::String::NewFromUtf8(isolate, "Expected arguments to be EnPronunciation."))); 78 | return; 79 | } 80 | 81 | auto obj = ObjectWrap::Unwrap(args.Holder()); 82 | auto a = ObjectWrap::Unwrap(args[0].As()); 83 | auto b = ObjectWrap::Unwrap(args[1].As()); 84 | auto distance = obj->distance()(a->pronunciation(), b->pronunciation()); 85 | 86 | args.GetReturnValue().Set(v8::Number::New(isolate, distance)); 87 | } 88 | 89 | const speech::EnPhoneticDistance& 90 | EnPhoneticDistance::distance() const 91 | { 92 | return m_distance; 93 | } 94 | } 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/cs/PhoneticMatchingTests/Nlp/PreprocessorTests.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | namespace PhoneticMatchingTests.Nlp 5 | { 6 | using Microsoft.VisualStudio.TestTools.UnitTesting; 7 | using Microsoft.PhoneticMatching.Nlp.Preprocessor; 8 | 9 | /// 10 | /// Tests for the preprocessors. 11 | /// 12 | [TestClass] 13 | public class PreprocessorTests 14 | { 15 | private readonly EnPreProcessor englishPreProcessor = new EnPreProcessor(); 16 | private readonly EnPlacesPreProcessor englishPlacesPreProcessor = new EnPlacesPreProcessor(); 17 | 18 | [TestMethod] 19 | public void GivenStreetAndSaint_ToPlacesProcessor_ExpectProperFormatting() 20 | { 21 | var result = this.englishPlacesPreProcessor.PreProcess("St Maurice St"); 22 | Assert.AreEqual("saint maurice street", result, "Place pre-processing doesn't return the expected result."); 23 | } 24 | 25 | [TestMethod] 26 | public void GivenCombiningAcuteAndLigature_ToEnglishPreprocessor_ExpectProperFormatting() 27 | { 28 | // "Híffi" 29 | // í has a combining acute accent, ffi is a ligature 30 | var result = this.englishPreProcessor.PreProcess("Hi\u0301\uFB03"); 31 | Assert.AreEqual("h\u00EDffi", result); 32 | } 33 | 34 | [TestMethod] 35 | public void GivenDigits_ToEnglishPreprocessor_ExpectProperFormatting() 36 | { 37 | var result = this.englishPreProcessor.PreProcess("123 King St"); 38 | Assert.AreEqual("123 king st", result); 39 | 40 | result = this.englishPreProcessor.PreProcess("2 Wildwood Place"); 41 | Assert.AreEqual("2 wildwood place", result); 42 | } 43 | 44 | [TestMethod] 45 | public void GivenPunctuation_ToEnglishPreprocessor_ExpectProperFormatting() 46 | { 47 | var result = this.englishPreProcessor.PreProcess("!omg! 
ch!ll ?how?"); 48 | Assert.AreEqual("omg ch ll how", result); 49 | } 50 | 51 | [TestMethod] 52 | public void GivenApostropheAndCase_ToEnglishPreprocessor_ExpectProperFormatting() 53 | { 54 | var result = this.englishPreProcessor.PreProcess("Justin's haus"); 55 | Assert.AreEqual("justin s haus", result); 56 | } 57 | 58 | [TestMethod] 59 | public void GivenSimpleTokenization_ToEnglishPreprocessor_ExpectProperFormatting() 60 | { 61 | var result = this.englishPreProcessor.PreProcess("call mom"); 62 | Assert.AreEqual("call mom", result); 63 | 64 | result = this.englishPreProcessor.PreProcess("call MoM!"); 65 | Assert.AreEqual("call mom", result); 66 | 67 | result = this.englishPreProcessor.PreProcess("*(*&call, MoM! )_+"); 68 | Assert.AreEqual("call mom", result); 69 | 70 | result = this.englishPreProcessor.PreProcess(":call/mom"); 71 | Assert.AreEqual("call mom", result); 72 | 73 | result = this.englishPreProcessor.PreProcess("Call mom."); 74 | Assert.AreEqual("call mom", result); 75 | 76 | result = this.englishPreProcessor.PreProcess("Call mom ."); 77 | Assert.AreEqual("call mom", result); 78 | 79 | result = this.englishPreProcessor.PreProcess("Call mom ."); 80 | Assert.AreEqual("call mom", result); 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /ts/nlp/tokenizer.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * @file 3 | * Tokenizers. 4 | * 5 | * Copyright (c) Microsoft Corporation. All rights reserved. 6 | * Licensed under the MIT License. 7 | */ 8 | 9 | /** 10 | * An Interval holds the first and last index bounds. 11 | * 12 | * @export 13 | * @class Interval 14 | */ 15 | export class Interval { 16 | /** 17 | * Creates an instance of Interval. 18 | * 19 | * @param {number} first Starting index (inclusive). 20 | * @param {number} last Ending index (exclusive). 
21 | * @memberof Interval 22 | */ 23 | constructor(readonly first: number, readonly last: number) { } 24 | 25 | /** 26 | * The length of the interval (last - first). 27 | * 28 | * @returns {number} The length. 29 | * @memberof Interval 30 | */ 31 | length(): number { 32 | return this.last - this.first; 33 | } 34 | } 35 | 36 | /** 37 | * The substring token of the original string with its interval location. 38 | * 39 | * @export 40 | * @class Token 41 | */ 42 | export class Token { 43 | /** 44 | * Creates an instance of Token. 45 | * 46 | * @param {string} value The substring. 47 | * @param {Interval} interval The interval location. 48 | * @memberof Token 49 | */ 50 | constructor(readonly value: string, readonly interval: Interval) { } 51 | } 52 | 53 | /** 54 | * Tokenizer interface for strings. 55 | * 56 | * @export 57 | * @interface Tokenizer 58 | */ 59 | export interface Tokenizer { 60 | /** 61 | * Tokenizes a string. 62 | * 63 | * @param {string} query The string to tokenize. 64 | * @returns {Token[]} The tokens. 65 | * @memberof Tokenizer 66 | */ 67 | tokenize(query: string): Token[]; 68 | } 69 | 70 | /** 71 | * Tokenizing base-class that will split on the given RegExp. 72 | * 73 | * @export 74 | * @abstract 75 | * @class SplittingTokenizer 76 | * @implements {Tokenizer} 77 | */ 78 | export abstract class SplittingTokenizer implements Tokenizer { 79 | /** 80 | * Creates an instance of SplittingTokenizer. 81 | * 82 | * @param {RegExp} pattern The pattern to split on. 
83 | * @memberof SplittingTokenizer 84 | */ 85 | constructor(private readonly pattern: RegExp) { } 86 | 87 | tokenize(query: string): Token[] { 88 | const result: Token[] = []; 89 | let boundary = 0; 90 | let match; 91 | while ((match = this.pattern.exec(query)) !== null) { 92 | if (boundary < match.index) { 93 | const interval = new Interval(boundary, match.index); 94 | const token = new Token(query.substring(interval.first, interval.last), interval); 95 | result.push(token); 96 | } 97 | boundary = this.pattern.lastIndex; 98 | } 99 | 100 | if (boundary < query.length) { 101 | // Add the rest. 102 | const interval = new Interval(boundary, query.length); 103 | const token = new Token(query.substring(interval.first, interval.last), interval); 104 | result.push(token); 105 | } 106 | return result; 107 | } 108 | } 109 | 110 | /** 111 | * Tokenizer that splits on whitespace. 112 | * 113 | * @export 114 | * @class WhitespaceTokenizer 115 | * @extends {SplittingTokenizer} 116 | */ 117 | export class WhitespaceTokenizer extends SplittingTokenizer { 118 | constructor() { 119 | super(/\s+/g); 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /tests/matchers/contactmatcher.test.ts: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | import { ContactFields, EnContactMatcher } from "../../ts/matchers"; 5 | 6 | interface TestContact { 7 | firstName: string; 8 | lastName: string; 9 | tele?: string; 10 | } 11 | 12 | const targets: Array = [ 13 | { 14 | firstName: "Andrew", 15 | lastName: "Smith", 16 | tele: "1234567", 17 | }, 18 | { 19 | firstName: "Andrew", 20 | lastName: "", 21 | }, 22 | { 23 | firstName: "John", 24 | lastName: "B", 25 | tele: "7654321", 26 | }, 27 | { 28 | firstName: "John", 29 | lastName: "C", 30 | tele: "2222222", 31 | }, 32 | { 33 | firstName: "Jennifer", 34 | lastName: "", 35 | } 36 | ]; 37 | 38 | function extractContactFields(contact: TestContact): ContactFields { 39 | return { 40 | name: `${contact.firstName} ${contact.lastName}` 41 | } 42 | } 43 | 44 | describe("EnContactMatcher", () => { 45 | test("Phonetic weight.", () => { 46 | const matcher = new EnContactMatcher(targets, extractContactFields); 47 | const results = matcher.find("andru"); 48 | 49 | expect(results.length).toBe(2); 50 | expect(results).toEqual(expect.arrayContaining([ 51 | expect.objectContaining({ 52 | firstName: "Andrew", 53 | lastName: "", 54 | }), 55 | expect.objectContaining({ 56 | firstName: "Andrew", 57 | lastName: "Smith", 58 | }) 59 | ])) 60 | }); 61 | 62 | test("Duplicate names.", () => { 63 | const matcher = new EnContactMatcher(targets, extractContactFields); 64 | const results = matcher.find("john"); 65 | 66 | expect(results.length).toBe(2); 67 | expect(results).toEqual(expect.arrayContaining([ 68 | expect.objectContaining({ 69 | firstName: "John", 70 | lastName: "B", 71 | }), 72 | expect.objectContaining({ 73 | firstName: "John", 74 | lastName: "C", 75 | }) 76 | ])) 77 | }); 78 | 79 | test("Exact match.", () => { 80 | const matcher = new EnContactMatcher(targets, extractContactFields); 81 | const results = matcher.find("Andrew Smith"); 82 | 83 | expect(results.length).toBe(1); 84 | expect(results).toEqual(expect.arrayContaining([ 85 | expect.objectContaining({ 86 | firstName: 
"Andrew", 87 | lastName: "Smith", 88 | tele: "1234567", 89 | }), 90 | ])) 91 | }); 92 | 93 | test("Find empty.", () => { 94 | const matcher = new EnContactMatcher(targets, extractContactFields); 95 | const results = matcher.find(""); 96 | expect(results).toEqual([]); 97 | }); 98 | 99 | test("ctor used as function exception.", () => { 100 | expect(() => { 101 | const matcher = (EnContactMatcher as any)(targets, extractContactFields); 102 | }).toThrow(); 103 | }); 104 | 105 | test("Find undefined exception.", () => { 106 | expect(() => { 107 | const matcher = new EnContactMatcher(targets, extractContactFields); 108 | matcher.find(undefined as any); 109 | }).toThrow(); 110 | }); 111 | }); 112 | -------------------------------------------------------------------------------- /src/cs/Microsoft.PhoneticMatching/Distance/StringDistance.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | namespace Microsoft.PhoneticMatching.Distance 5 | { 6 | using System; 7 | using System.Runtime.InteropServices; 8 | using System.Text; 9 | 10 | /// 11 | /// String distance utility. 12 | /// 13 | public class StringDistance : NativeResourceWrapper, IDistance 14 | { 15 | /// 16 | /// Computes a string edit distance metric. 17 | /// 18 | /// First string to compare 19 | /// Second string to compare 20 | /// Returns the distance between string a and b. 
21 | public double Distance(string first, string second) 22 | { 23 | if (first == null || second == null) 24 | { 25 | throw new ArgumentNullException("distance input can't be null"); 26 | } 27 | 28 | double distance = 0; 29 | NativeResourceWrapper.CallNative((buffer) => 30 | { 31 | int bufferSize = NativeResourceWrapper.BufferSize; 32 | var result = StringDistance_Distance(this.Native, first, second, out distance, buffer, ref bufferSize); 33 | NativeResourceWrapper.BufferSize = bufferSize; 34 | return result; 35 | }); 36 | return distance; 37 | } 38 | 39 | /// 40 | /// Instantiate the native resource wrapped 41 | /// 42 | /// The parameter is not used. 43 | /// A pointer to the native resource. 44 | protected override IntPtr CreateNativeResources(params object[] args) 45 | { 46 | IntPtr native = IntPtr.Zero; 47 | NativeResourceWrapper.CallNative((buffer) => 48 | { 49 | int bufferSize = NativeResourceWrapper.BufferSize; 50 | var result = StringDistance_Create(out native, buffer, ref bufferSize); 51 | NativeResourceWrapper.BufferSize = bufferSize; 52 | return result; 53 | }); 54 | return native; 55 | } 56 | 57 | /// 58 | /// Delete the native pointer using the type specified in native bindings. 59 | /// 60 | /// Pointer to the native object. 61 | /// Buffer for any error message 62 | /// Size of the buffer, to be adjusted if error doesn't fit the current size. 63 | /// The result code from native library. 
64 | protected override NativeResult NativeDelete(IntPtr native, StringBuilder buffer, ref int bufferSize) 65 | { 66 | return StringDistance_Delete(native, buffer, ref bufferSize); 67 | } 68 | 69 | [DllImport("maluubaspeech-csharp.dll")] 70 | private static extern NativeResult StringDistance_Delete(IntPtr ptr, StringBuilder buffer, ref int bufferSize); 71 | 72 | [DllImport("maluubaspeech-csharp.dll")] 73 | private static extern NativeResult StringDistance_Create(out IntPtr native, StringBuilder buffer, ref int bufferSize); 74 | 75 | [DllImport("maluubaspeech-csharp.dll")] 76 | private static extern NativeResult StringDistance_Distance(IntPtr ptr, string s1, string s2, out double distance, StringBuilder buffer, ref int bufferSize); 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/cs/Microsoft.PhoneticMatching/Matchers/FuzzyMatcher/Normalized/EnHybridFuzzyMatcher.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | namespace Microsoft.PhoneticMatching.Matchers.FuzzyMatcher.Normalized 5 | { 6 | using System; 7 | using System.Collections.Generic; 8 | using PhoneticMatching.Distance; 9 | 10 | /// 11 | /// A hybrid fuzzy matcher which normalizes results based on length of queries. The fuzziness is determined by the provided distance function. 12 | /// 13 | /// The type of the returned matched object. 14 | public class EnHybridFuzzyMatcher : NormalizedFuzzyMatcher 15 | { 16 | private double phoneticWeightPercentage = 0; 17 | private EnPronouncer pronouncer = EnPronouncer.Instance; 18 | 19 | /// 20 | /// Initializes a new instance of the class. 21 | /// 22 | /// The set of objects that will be matched against. The order of equal targets is not guaranteed to be preserved. 23 | /// Between 0 and 1. 
24 | /// Weighting trade-off between the phonetic distance and the lexical distance scores. 25 | /// 1 meaning 100% phonetic score and 0% lexical score. 26 | /// A mapping of the input types to the query(extraction) type. Note that Extraction == string for normalized cases. 27 | /// Whether the fuzzy matcher uses accelerated implementation or not. 28 | public EnHybridFuzzyMatcher(IList targets, double phoneticWeightPercentage, Func targetToExtractionPhrase = null, bool isAccelerated = true) 29 | { 30 | this.phoneticWeightPercentage = phoneticWeightPercentage; 31 | 32 | Func targetToExtraction = (target) => 33 | { 34 | string phrase = targetToExtractionPhrase == null ? target as string : targetToExtractionPhrase(target); 35 | if (phrase == null) 36 | { 37 | throw new InvalidCastException($"Can't cast Target type [{typeof(Target)}] to Extraction type [string]. You must provide a conversion function 'targetToExtractionPhrase'."); 38 | } 39 | 40 | return new DistanceInput(phrase, this.pronouncer.Pronounce(phrase)); 41 | }; 42 | this.GenericFuzzyMatcher = new FuzzyMatcher(targets, new EnHybridDistance(phoneticWeightPercentage), targetToExtraction, isAccelerated); 43 | } 44 | 45 | /// 46 | /// Find the __k__ nearest elements. 47 | /// 48 | /// The search target. 49 | /// The maximum distance to a match. 50 | /// The maximum number of results to return. 
51 | /// The __k__ nearest matches to target within limit 52 | public override IList> FindNearestWithin(string query, double limit, int count) 53 | { 54 | var input = new DistanceInput(query, this.pronouncer.Pronounce(query)); 55 | double thresholdScale = (this.phoneticWeightPercentage * input.Pronunciation.Phones.Count) + ((1 - this.phoneticWeightPercentage) * input.Phrase.Length); 56 | return this.FindNearestWithinNormalized(input, limit, count, thresholdScale); 57 | } 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/cs/Microsoft.PhoneticMatching/Distance/EnPhoneticDistance.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 3 | 4 | namespace Microsoft.PhoneticMatching.Distance 5 | { 6 | using System; 7 | using System.Runtime.InteropServices; 8 | using System.Text; 9 | 10 | /// 11 | /// English phonetic distance utility. 12 | /// 13 | public class EnPhoneticDistance : NativeResourceWrapper, IDistance 14 | { 15 | /// 16 | /// Computes an English phonetic distance metric. 
17 | /// 18 | /// First pronunciation to compare 19 | /// Second pronunciation to compare 20 | /// The English phonetic distance between the first and second pronunciations 21 | public double Distance(EnPronunciation first, EnPronunciation second) 22 | { 23 | if (first == null || second == null) 24 | { 25 | throw new ArgumentNullException("distance input can't be null"); 26 | } 27 | 28 | double distance = 0; 29 | NativeResourceWrapper.CallNative((buffer) => 30 | { 31 | int bufferSize = NativeResourceWrapper.BufferSize; 32 | var result = EnPhoneticDistance_Distance(this.Native, first.Native, second.Native, out distance, buffer, ref bufferSize); 33 | NativeResourceWrapper.BufferSize = bufferSize; 34 | return result; 35 | }); 36 | return distance; 37 | } 38 | 39 | /// 40 | /// Instantiates the wrapped native resource 41 | /// 42 | /// The parameter is not used. 43 | /// A pointer to the native resource. 44 | protected override IntPtr CreateNativeResources(params object[] args) 45 | { 46 | IntPtr native = IntPtr.Zero; 47 | NativeResourceWrapper.CallNative((buffer) => 48 | { 49 | int bufferSize = NativeResourceWrapper.BufferSize; 50 | var result = EnPhoneticDistance_Create(out native, buffer, ref bufferSize); 51 | NativeResourceWrapper.BufferSize = bufferSize; 52 | return result; 53 | }); 54 | return native; 55 | } 56 | 57 | /// 58 | /// Delete the native pointer using the type specified in native bindings. 59 | /// 60 | /// Pointer to the native object. 61 | /// Buffer for any error message 62 | /// Size of the buffer, to be adjusted if error doesn't fit the current size. 63 | /// The result code from native library. 
64 | protected override NativeResult NativeDelete(IntPtr native, StringBuilder buffer, ref int bufferSize) 65 | { 66 | return EnPhoneticDistance_Delete(native, buffer, ref bufferSize); 67 | } 68 | 69 | [DllImport("maluubaspeech-csharp.dll")] 70 | private static extern NativeResult EnPhoneticDistance_Delete(IntPtr ptr, StringBuilder buffer, ref int bufferSize); 71 | 72 | [DllImport("maluubaspeech-csharp.dll")] 73 | private static extern NativeResult EnPhoneticDistance_Create(out IntPtr native, StringBuilder buffer, ref int bufferSize); 74 | 75 | [DllImport("maluubaspeech-csharp.dll")] 76 | private static extern NativeResult EnPhoneticDistance_Distance(IntPtr native, IntPtr first, IntPtr second, out double distance, StringBuilder buffer, ref int bufferSize); 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/cs/PhoneticMatchingTests/Matchers/ContactMatcherTests.cs: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
namespace PhoneticMatchingTests.Matchers
{
    using System;
    using Microsoft.VisualStudio.TestTools.UnitTesting;
    using Microsoft.PhoneticMatching.Matchers.ContactMatcher;

    /// <summary>
    /// Tests for the English contact matcher, run against the targets supplied by the base tester.
    /// </summary>
    [TestClass]
    public class ContactMatcherTests : BaseContactMatcherTester
    {
        /// <summary>
        /// A phonetically similar (misspelled) query returns every contact named "Andrew".
        /// </summary>
        [TestMethod]
        public void GivenSimilarPhoneticWeight_ExpectPositiveMatch()
        {
            // The matcher is generic over the target type; the type argument was lost in this
            // copy of the file and is restored from the extractor's parameter type.
            var matcher = new EnContactMatcher<TestContact>(this.Targets, this.ContactFieldsExtrator);
            var results = matcher.Find("andru");

            Assert.AreEqual(2, results.Count);
            var expected = new TestContact()
            {
                FirstName = "Andrew",
                LastName = string.Empty
            };
            Assert.IsTrue(results.Contains(expected));
            expected = new TestContact()
            {
                FirstName = "Andrew",
                LastName = "Smith",
                Id = "1234567"
            };
            Assert.IsTrue(results.Contains(expected));
        }

        /// <summary>
        /// Two contacts sharing a first name are both returned.
        /// </summary>
        [TestMethod]
        public void GivenDuplicateNames_ExpectPositiveMatch()
        {
            var matcher = new EnContactMatcher<TestContact>(this.Targets, this.ContactFieldsExtrator);
            var results = matcher.Find("john");

            Assert.AreEqual(2, results.Count);
            var expected = new TestContact()
            {
                FirstName = "John",
                LastName = "B",
                Id = "7654321"
            };
            Assert.IsTrue(results.Contains(expected));
            expected = new TestContact()
            {
                FirstName = "John",
                LastName = "C",
                Id = "2222222"
            };
            Assert.IsTrue(results.Contains(expected));
        }

        /// <summary>
        /// An exact full-name query returns only that contact.
        /// </summary>
        [TestMethod]
        public void GivenExactMatch_ExpectPositiveMatch()
        {
            var matcher = new EnContactMatcher<TestContact>(this.Targets, this.ContactFieldsExtrator);
            var results = matcher.Find("Andrew Smith");

            Assert.AreEqual(1, results.Count);
            var expected = new TestContact()
            {
                FirstName = "Andrew",
                LastName = "Smith",
                Id = "1234567"
            };
            Assert.AreEqual(expected, results[0]);
        }

        /// <summary>
        /// An empty query yields no matches.
        /// </summary>
        [TestMethod]
        public void GivenEmptyQuery_ExpectEmptyResult()
        {
            var matcher = new EnContactMatcher<TestContact>(this.Targets, this.ContactFieldsExtrator);
            var results = matcher.Find(string.Empty);
            Assert.AreEqual(0, results.Count);
        }

        /// <summary>
        /// A null query is rejected with an exception.
        /// </summary>
        [TestMethod]
        public void GivenNullQuery_ExpectException()
        {
            // NOTE(review): the generic type argument of ThrowsException (the expected exception
            // type) is missing in this copy of the file and cannot be determined from here.
            // Restore it from the upstream source before compiling.
            Assert.ThrowsException(() =>
            {
                var matcher = new EnContactMatcher<TestContact>(this.Targets, this.ContactFieldsExtrator);
                matcher.Find(null);
            });
        }

        /// <summary>
        /// Maps a test contact to the fields the matcher indexes (only the full name).
        /// </summary>
        /// <param name="contact">The contact to extract fields from.</param>
        /// <returns>The contact fields used for matching.</returns>
        private ContactFields ContactFieldsExtrator(TestContact contact)
        {
            return new ContactFields()
            {
                Name = contact.FullName
            };
        }
    }
}

--------------------------------------------------------------------------------
/src/cs/PhoneticMatchingTests/Distance/EnPhoneticDistanceTests.cs:
--------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

namespace PhoneticMatchingTests.Distance
{
    using System;
    using Microsoft.VisualStudio.TestTools.UnitTesting;
    using Microsoft.PhoneticMatching;
    using Microsoft.PhoneticMatching.Distance;

    /// <summary>
    /// Checks that the English phonetic distance behaves like a metric on a few sample pronunciations.
    /// </summary>
    // NOTE(review): the type arguments on BaseDistanceTester / IDistance were lost in this copy;
    // EnPronunciation is inferred from the values passed to Distance() below — confirm upstream.
    [TestClass]
    public class EnPhoneticDistanceTests : BaseDistanceTester<EnPronunciation>
    {
        /// <summary>
        /// Sam pasupalak
        /// </summary>
        private EnPronunciation sam = EnPronunciation.FromIpa("sæmpɑsupələk");

        /// <summary>
        /// Santa super black
        /// </summary>
        private EnPronunciation santa = EnPronunciation.FromIpa("sæntəsupɝblæk");

        /// <summary>
        /// Samples pollux
        /// </summary>
        private EnPronunciation samples = EnPronunciation.FromIpa("sæmpəlzpɑləks");

        /// <summary>
        /// The distance from a pronunciation to itself is zero.
        /// </summary>
        [TestMethod]
        public void GivenSamePronunciation_ExpectZeroDistance()
        {
            const string ThisIsATest = "ðɪsɪzətɛst";
            var test = EnPronunciation.FromIpa(ThisIsATest);
            var dist = this.Distance.Distance(test, test);
            Assert.AreEqual(0, dist);
        }

        /// <summary>
        /// Check identity of indiscernibles
        /// </summary>
        [TestMethod]
        public void GivenPhoneticDistances_ExpectIdentity()
        {
            Assert.AreEqual(0, this.Distance.Distance(this.sam, this.sam));
            Assert.AreEqual(0, this.Distance.Distance(this.santa, this.santa));
            Assert.AreEqual(0, this.Distance.Distance(this.samples, this.samples));
        }

        /// <summary>
        /// Check symmetry
        /// </summary>
        [TestMethod]
        public void GivenPhoneticDistances_ExpectSymmetry()
        {
            Assert.AreEqual(this.Distance.Distance(this.sam, this.santa), this.Distance.Distance(this.santa, this.sam));
            Assert.AreEqual(this.Distance.Distance(this.sam, this.samples), this.Distance.Distance(this.samples, this.sam));
            Assert.AreEqual(this.Distance.Distance(this.samples, this.santa), this.Distance.Distance(this.santa, this.samples));
        }

        /// <summary>
        /// Check triangle inequality
        /// </summary>
        [TestMethod]
        public void GivenPhoneticDistances_ExpectInequality()
        {
            // The triangle inequality is non-strict: d(a, c) <= d(a, b) + d(b, c).
            // A strict comparison would reject a valid metric whenever equality holds.
            Assert.IsTrue(this.Distance.Distance(this.sam, this.samples) <= this.Distance.Distance(this.sam, this.santa) + this.Distance.Distance(this.santa, this.samples));
            Assert.IsTrue(this.Distance.Distance(this.sam, this.santa) <= this.Distance.Distance(this.sam, this.samples) + this.Distance.Distance(this.samples, this.santa));
            Assert.IsTrue(this.Distance.Distance(this.santa, this.samples) <= this.Distance.Distance(this.santa, this.sam) + this.Distance.Distance(this.sam, this.samples));
        }

        /// <summary>
        /// Check performance: the expected relative ordering of the three sample distances.
        /// </summary>
        [TestMethod]
        public void GivenPhoneticDistances_ExpectPerformance()
        {
            Assert.IsTrue(this.Distance.Distance(this.sam, this.santa) < this.Distance.Distance(this.sam, this.samples));
            Assert.IsTrue(this.Distance.Distance(this.sam, this.samples) < this.Distance.Distance(this.santa, this.samples));
        }

        /// <summary>
        /// Creates the distance operator under test.
        /// </summary>
        /// <returns>A new English phonetic distance.</returns>
        protected override IDistance<EnPronunciation> CreateDistanceOperator()
        {
            return new EnPhoneticDistance();
        }
    }
}
-------------------------------------------------------------------------------- /src/maluuba/levenshtein.hpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file 3 | * Levenshtein (min edit) distance. 4 | * 5 | * @author Benedicte Pierrejean 6 | * @author Tavian Barnes (tavian.barnes@microsoft.com) 7 | * 8 | * Copyright (c) Microsoft Corporation. All rights reserved. 9 | * Licensed under the MIT License. 10 | */ 11 | 12 | #ifndef MALUUBA_LEVENSHTEIN_HPP 13 | #define MALUUBA_LEVENSHTEIN_HPP 14 | 15 | #include "maluuba/metric.hpp" 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | namespace maluuba 23 | { 24 | /** 25 | * Cost functor that always returns the same value. 26 | * 27 | * @tparam T 28 | * The numeric type to return. 29 | * 30 | * @author Tavian Barnes (tavian.barnes@microsoft.com) 31 | */ 32 | template 33 | class ConstantCost 34 | { 35 | public: 36 | ConstantCost(T cost = T{1}) 37 | : m_cost{cost} 38 | { } 39 | 40 | template 41 | T 42 | operator()(const U& u) const 43 | { 44 | return m_cost; 45 | } 46 | 47 | private: 48 | T m_cost; 49 | }; 50 | 51 | /** 52 | * Levenshtein distance metric. 53 | * 54 | * @tparam SubstitutionMetric 55 | * The metric used to compute substitution costs. 56 | * @tparam CostFunction 57 | * The cost function for insertions/deletions. 58 | * 59 | * @author Benedicte Pierrejean 60 | * @author Tavian Barnes (tavian.barnes@microsoft.com) 61 | */ 62 | template >> 64 | class LevenshteinDistance 65 | { 66 | template 67 | using ValueType = typename std::iterator_traits::value_type; 68 | 69 | template 70 | using ResultType = MetricResult, ValueType>; 71 | 72 | public: 73 | /** 74 | * Create a @c LevenshteinDistance. 75 | * 76 | * @param sub_metric The metric used to compute substitution costs. 77 | * @param cost The cost (function) for insertions/deletions (default: 1). 
78 | */ 79 | LevenshteinDistance(SubstitutionMetric sub_metric = SubstitutionMetric{}, 80 | CostFunction cost = CostFunction{}) 81 | : m_sub_metric(std::move(sub_metric)), 82 | m_cost(std::move(cost)) 83 | { } 84 | 85 | template 86 | ResultType 87 | operator()(const T& t_seq, const U& u_seq) const 88 | { 89 | // Wagner-Fischer algorithm with two active rows 90 | 91 | using Number = ResultType; 92 | 93 | auto cols = u_seq.size() + 1; 94 | auto row0 = std::make_unique(cols); 95 | auto row1 = std::make_unique(cols); 96 | 97 | Number initial_cost{}; 98 | std::size_t i = 0; 99 | row0[i] = initial_cost; 100 | for (const auto& u : u_seq) { 101 | ++i; 102 | initial_cost += m_cost(u); 103 | row0[i] = initial_cost; 104 | } 105 | 106 | for (const auto& t : t_seq) { 107 | auto t_cost = m_cost(t); 108 | row1[0] = row0[0] + t_cost; 109 | 110 | i = 1; 111 | for (const auto& u : u_seq) { 112 | auto sub_cost = row0[i - 1] + m_sub_metric(t, u); 113 | auto del_cost = row0[i] + t_cost; 114 | auto ins_cost = row1[i - 1] + m_cost(u); 115 | row1[i] = std::min(sub_cost, std::min(del_cost, ins_cost)); 116 | ++i; 117 | } 118 | 119 | std::swap(row0, row1); 120 | } 121 | 122 | return row0[cols - 1]; 123 | } 124 | 125 | private: 126 | SubstitutionMetric m_sub_metric; 127 | CostFunction m_cost; 128 | }; 129 | } 130 | 131 | #endif // MALUUBA_LEVENSHTEIN_HPP 132 | -------------------------------------------------------------------------------- /src/maluuba/speech/pronunciation/phone.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
3 | 4 | #include "maluuba/speech/pronunciation/impl.hpp" 5 | #include "maluuba/speech/pronunciation.hpp" 6 | #include "maluuba/debug.hpp" 7 | #include 8 | 9 | namespace maluuba 10 | { 11 | namespace speech 12 | { 13 | using namespace internal; 14 | 15 | namespace 16 | { 17 | void 18 | check_consonant(const Phone& phone) 19 | { 20 | check_logic(phone.type() == PhoneType::CONSONANT, "This phone is not a consonant."); 21 | } 22 | 23 | void 24 | check_vowel(const Phone& phone) 25 | { 26 | check_logic(phone.type() == PhoneType::VOWEL, "This phone is not a vowel."); 27 | } 28 | } 29 | 30 | Phone::Phone(PhoneType type) 31 | : Phone{phone_encode(type, type_start)} 32 | { } 33 | 34 | Phone::Phone(std::uint16_t repr) 35 | : m_repr{repr} 36 | { } 37 | 38 | PhoneType 39 | Phone::type() const 40 | { 41 | return phone_decode(m_repr, type_start, type_end); 42 | } 43 | 44 | Phonation 45 | Phone::phonation() const 46 | { 47 | return phone_decode(m_repr, phonation_start, phonation_end); 48 | } 49 | 50 | void 51 | Phone::phonation(Phonation phonation) 52 | { 53 | m_repr = phone_encode(m_repr, phonation, phonation_start, phonation_end); 54 | } 55 | 56 | PlaceOfArticulation 57 | Phone::place() const 58 | { 59 | check_consonant(*this); 60 | return phone_decode(m_repr, place_start, place_end); 61 | } 62 | 63 | void 64 | Phone::place(PlaceOfArticulation place) 65 | { 66 | check_consonant(*this); 67 | m_repr = phone_encode(m_repr, place, place_start, place_end); 68 | } 69 | 70 | MannerOfArticulation 71 | Phone::manner() const 72 | { 73 | check_consonant(*this); 74 | return phone_decode(m_repr, manner_start, manner_end); 75 | } 76 | 77 | void 78 | Phone::manner(MannerOfArticulation manner) 79 | { 80 | check_consonant(*this); 81 | m_repr = phone_encode(m_repr, manner, manner_start, manner_end); 82 | } 83 | 84 | VowelHeight 85 | Phone::height() const 86 | { 87 | check_vowel(*this); 88 | return phone_decode(m_repr, height_start, height_end); 89 | } 90 | 91 | void 92 | Phone::height(VowelHeight 
height) 93 | { 94 | check_vowel(*this); 95 | m_repr = phone_encode(m_repr, height, height_start, height_end); 96 | } 97 | 98 | VowelBackness 99 | Phone::backness() const 100 | { 101 | check_vowel(*this); 102 | return phone_decode(m_repr, backness_start, backness_end); 103 | } 104 | 105 | void 106 | Phone::backness(VowelBackness backness) 107 | { 108 | check_vowel(*this); 109 | m_repr = phone_encode(m_repr, backness, backness_start, backness_end); 110 | } 111 | 112 | VowelRoundedness 113 | Phone::roundedness() const 114 | { 115 | check_vowel(*this); 116 | return phone_decode(m_repr, roundedness_start, roundedness_end); 117 | } 118 | 119 | void 120 | Phone::roundedness(VowelRoundedness roundedness) 121 | { 122 | check_vowel(*this); 123 | m_repr = phone_encode(m_repr, roundedness, roundedness_start, roundedness_end); 124 | } 125 | 126 | bool 127 | Phone::is_rhotic() const 128 | { 129 | check_vowel(*this); 130 | return phone_decode(m_repr, rhotic_start, rhotic_end); 131 | } 132 | 133 | void 134 | Phone::rhotic(bool rhotic) 135 | { 136 | check_vowel(*this); 137 | m_repr = phone_encode(m_repr, rhotic, rhotic_start, rhotic_end); 138 | } 139 | 140 | bool 141 | Phone::is_syllabic() const 142 | { 143 | return phone_decode(m_repr, syllabic_start, syllabic_end); 144 | } 145 | 146 | void 147 | Phone::syllabic(bool syllabic) 148 | { 149 | m_repr = phone_encode(m_repr, syllabic, syllabic_start, syllabic_end); 150 | } 151 | } 152 | } 153 | -------------------------------------------------------------------------------- /tests/matchers/placematcher.test.ts: -------------------------------------------------------------------------------- 1 | // Copyright (c) Microsoft Corporation. All rights reserved. 2 | // Licensed under the MIT License. 
import {PlaceFields, EnPlaceMatcher} from "../../ts/matchers";

/** Shape of the places used as match targets in these tests. */
interface TestPlace {
    name: string;
    address: string;
    tele?: string;
    categories?: string[];
}

// The element type of this array was stripped in this copy of the file; it is
// restored from the interface above and from extractPlaceFields' parameter type.
const targets: Array<TestPlace> = [
    {
        name: "Marbles Restaurant",
        address: "8 William Street E",
        categories: ["Canadian (New)"],
        tele: "1234567",
    },
    {
        name: "Beertown",
        address: "75 King Street S",
        categories: ["Canadian (New)", "Beer, Wine & Spirits", "Bars"],
        tele: "7654321",
    },
    {
        name: "Nick and Nat's Uptown 21",
        address: "21 King St N",
        categories: ["Canadian (New)"],
    },
    {
        name: "The Shops",
        address: "7 Fake Cres. Toronto",
    }
];

/** Maps a test place to the fields the matcher indexes (name, address, types). */
function extractPlaceFields(place: TestPlace): PlaceFields {
    return {
        name: place.name,
        address: place.address,
        types: place.categories,
    };
}

describe("EnPlaceMatcher", () => {
    test("By Address.", () => {
        const matcher = new EnPlaceMatcher(targets, extractPlaceFields);
        const results = matcher.find("king street");

        expect(results.length).toBe(2);
        expect(results).toEqual(expect.arrayContaining([
            expect.objectContaining({
                name: "Beertown",
                address: "75 King Street S",
            }),
            expect.objectContaining({
                name: "Nick and Nat's Uptown 21",
                address: "21 King St N",
            })
        ]));
    });

    test("Address expansions.", () => {
        const matcher = new EnPlaceMatcher(targets, extractPlaceFields);
        // "crescent" must match the abbreviated "Cres." in the stored address.
        const results = matcher.find("fake crescent");

        expect(results.length).toBe(1);
        expect(results).toEqual(expect.arrayContaining([
            expect.objectContaining({
                name: "The Shops",
                address: "7 Fake Cres. Toronto",
            })
        ]));
    });

    test("By Types.", () => {
        const matcher = new EnPlaceMatcher(targets, extractPlaceFields);
        const results = matcher.find("Bars");

        expect(results.length).toBe(1);
        expect(results).toEqual(expect.arrayContaining([
            expect.objectContaining({
                name: "Beertown",
                address: "75 King Street S",
                categories: ["Canadian (New)", "Beer, Wine & Spirits", "Bars"],
                tele: "7654321",
            })
        ]));
    });

    test("Exact match.", () => {
        const matcher = new EnPlaceMatcher(targets, extractPlaceFields);
        const results = matcher.find("The Shops");

        expect(results.length).toBe(1);
        expect(results).toEqual(expect.arrayContaining([
            expect.objectContaining({
                name: "The Shops",
                address: "7 Fake Cres. Toronto",
            }),
        ]));
    });

    test("Find empty.", () => {
        const matcher = new EnPlaceMatcher(targets, extractPlaceFields);
        const results = matcher.find("");
        expect(results).toEqual([]);
    });

    test("ctor used as function exception.", () => {
        // Calling the constructor without `new` must throw; the result is irrelevant.
        expect(() => {
            (EnPlaceMatcher as any)(targets, extractPlaceFields);
        }).toThrow();
    });

    test("Find undefined exception.", () => {
        expect(() => {
            const matcher = new EnPlaceMatcher(targets, extractPlaceFields);
            matcher.find(undefined as any);
        }).toThrow();
    });
});

--------------------------------------------------------------------------------
/src/cs/PhoneticMatchingTests/Distance/EnHybridDistanceTests.cs:
--------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
namespace PhoneticMatchingTests.Distance
{
    using System;
    using Microsoft.VisualStudio.TestTools.UnitTesting;
    using Microsoft.PhoneticMatching;
    using Microsoft.PhoneticMatching.Distance;

    /// <summary>
    /// Tests for the hybrid (phonetic + string) English distance.
    /// </summary>
    // NOTE(review): the type arguments on BaseDistanceTester / IDistance were lost in this copy;
    // DistanceInput is inferred from the values passed to Distance() below — confirm upstream.
    [TestClass]
    public class EnHybridDistanceTests : BaseDistanceTester<DistanceInput>
    {
        /// <summary>
        /// The weight passed to the constructor is exposed unchanged.
        /// </summary>
        [TestMethod]
        public void GivenHybridDistance_ExpectPhoneticWeightPercentage()
        {
            var hybrid = this.Distance as EnHybridDistance;

            // Fail with a clear assertion, not a NullReferenceException, if the cast does not apply.
            Assert.IsNotNull(hybrid);
            Assert.AreEqual(0.7, hybrid.PhoneticWeightPercentage);
        }

        /// <summary>
        /// Two inputs built from the same phrase are at distance zero.
        /// </summary>
        [TestMethod]
        public void GivenExactString_ExpectPositiveMatch()
        {
            const string Phrase = "This, is a test.";
            var inputA = CreateInput(Phrase);
            var inputB = CreateInput(Phrase);
            var dist = this.Distance.Distance(inputA, inputB);
            Assert.AreEqual(0, dist);
        }

        /// <summary>
        /// Identical inputs (including empty ones) are at distance zero; different ones are not.
        /// </summary>
        [TestMethod]
        public void GivenHybridDistance_ExpectValidDistance()
        {
            const string PhraseA = "aaa";
            const string PhraseB = "bbb";

            var inputA = CreateInput(PhraseA);
            var inputB = CreateInput(PhraseB);
            var inputEmpty = CreateInput(string.Empty);

            var dist = this.Distance.Distance(inputA, inputA);
            Assert.AreEqual(0, dist);
            dist = this.Distance.Distance(inputA, inputB);
            Assert.IsTrue(dist > 0);
            dist = this.Distance.Distance(inputEmpty, inputEmpty);
            Assert.AreEqual(0, dist);
        }

        /// <summary>
        /// A one-vowel change is closer than a completely different phrase.
        /// </summary>
        [TestMethod]
        public void GivenSimilarAndDifferent_ExpectGreaterDistanceForDifferent()
        {
            var inputI = CreateInput("aaiaa");
            var inputE = CreateInput("aaeaa");
            var inputZ = CreateInput("zzrzz");

            var distSimilar = this.Distance.Distance(inputI, inputE);
            var distDifferent = this.Distance.Distance(inputI, inputZ);
            Assert.IsTrue(distDifferent > distSimilar);
        }

        // NOTE(review): in the four tests below the generic type argument of ThrowsException
        // (the expected exception type) is missing in this copy of the file and cannot be
        // determined from here. Restore it from the upstream source before compiling.

        /// <summary>
        /// An input with neither phrase nor pronunciation is rejected.
        /// </summary>
        [TestMethod]
        public void GivenInvalidInput_ExpectException()
        {
            Assert.ThrowsException(() =>
            {
                var input = new DistanceInput(null, null);
                var dist = this.Distance.Distance(input, input);
            });
        }

        /// <summary>
        /// An input with an empty phrase and no pronunciation is rejected.
        /// </summary>
        [TestMethod]
        public void GivenEmptyInput_ExpectException()
        {
            Assert.ThrowsException(() =>
            {
                var input = new DistanceInput(string.Empty, null);
                var dist = this.Distance.Distance(input, input);
            });
        }

        /// <summary>
        /// A negative phonetic weight is rejected by the constructor.
        /// </summary>
        [TestMethod]
        public void GivenOutOfBoundLower_ExpectException()
        {
            Assert.ThrowsException(() =>
            {
                var distance = new EnHybridDistance(-0.1);
            });
        }

        /// <summary>
        /// A phonetic weight of 2 is rejected by the constructor.
        /// </summary>
        [TestMethod]
        public void GivenOutOfBoundUpper_ExpectException()
        {
            Assert.ThrowsException(() =>
            {
                var distance = new EnHybridDistance(2);
            });
        }

        /// <summary>
        /// Creates the distance operator under test, with a phonetic weight of 0.7.
        /// </summary>
        /// <returns>A new hybrid distance.</returns>
        protected override IDistance<DistanceInput> CreateDistanceOperator()
        {
            return new EnHybridDistance(0.7);
        }

        /// <summary>
        /// Builds a distance input from a phrase and its pronunciation.
        /// </summary>
        /// <param name="phrase">The phrase to pronounce.</param>
        /// <returns>The phrase paired with the pronunciation produced by <see cref="EnPronouncer"/>.</returns>
        private static DistanceInput CreateInput(string phrase)
        {
            EnPronouncer pronouncer = EnPronouncer.Instance;

            return new DistanceInput(phrase, pronouncer.Pronounce(phrase));
        }
    }
}

--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/EnPronouncer.cs:
--------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

namespace Microsoft.PhoneticMatching
{
    using System;
    using System.Runtime.InteropServices;
    using System.Text;

    /// <summary>
    /// Pronounces English texts.
    /// </summary>
    public sealed class EnPronouncer : NativeResourceWrapper
    {
        /// <summary>
        /// This implementation using System.Lazy makes the singleton thread-safe
        /// </summary>
        private static readonly Lazy<EnPronouncer> LazyInstance = new Lazy<EnPronouncer>(() => new EnPronouncer());

        /// <summary>
        /// Prevents a default instance of the <see cref="EnPronouncer"/> class from being created.
        /// </summary>
        private EnPronouncer()
        {
        }

        /// <summary>
        /// Gets the singleton instance.
        /// </summary>
        public static EnPronouncer Instance
        {
            get
            {
                return LazyInstance.Value;
            }
        }

        /// <summary>
        /// Pronounce text.
        /// </summary>
        /// <param name="phrase">The text to pronounce.</param>
        /// <returns>The English Pronunciation.</returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="phrase"/> is null.</exception>
        public EnPronunciation Pronounce(string phrase)
        {
            if (phrase == null)
            {
                // The single-string constructor interprets its argument as the parameter name,
                // so pass the name and the message separately.
                throw new ArgumentNullException(nameof(phrase), "phrase can't be null");
            }

            IntPtr nativePronunciation = IntPtr.Zero;
            NativeResourceWrapper.CallNative((buffer) =>
            {
                // The native call may update the buffer size; store it back for subsequent calls.
                int bufferSize = NativeResourceWrapper.BufferSize;
                var result = EnPronouncer_Pronounce(this.Native, phrase, out nativePronunciation, buffer, ref bufferSize);
                NativeResourceWrapper.BufferSize = bufferSize;
                return result;
            });

            return new EnPronunciation(nativePronunciation);
        }

        /// <summary>
        /// Instantiate the native resource wrapped.
        /// </summary>
        /// <param name="args">The parameter is not used.</param>
        /// <returns>A pointer to the native resource.</returns>
        protected override IntPtr CreateNativeResources(params object[] args)
        {
            IntPtr native = IntPtr.Zero;
            NativeResourceWrapper.CallNative((buffer) =>
            {
                int bufferSize = NativeResourceWrapper.BufferSize;
                var result = EnPronouncer_Create(out native, buffer, ref bufferSize);
                NativeResourceWrapper.BufferSize = bufferSize;
                return result;
            });
            return native;
        }

        /// <summary>
        /// Delete the native pointer using the type specified in native bindings.
        /// </summary>
        /// <param name="native">Pointer to the native object.</param>
        /// <param name="buffer">Buffer for any error message</param>
        /// <param name="bufferSize">Size of the buffer, to be adjusted if error doesn't fit the current size.</param>
        /// <returns>The result code from native library.</returns>
        protected override NativeResult NativeDelete(IntPtr native, StringBuilder buffer, ref int bufferSize)
        {
            return EnPronouncer_Delete(native, buffer, ref bufferSize);
        }

        [DllImport("maluubaspeech-csharp.dll")]
        private static extern NativeResult EnPronouncer_Delete(IntPtr ptr, StringBuilder buffer, ref int bufferSize);

        [DllImport("maluubaspeech-csharp.dll")]
        private static extern NativeResult EnPronouncer_Create(out IntPtr native, StringBuilder buffer, ref int bufferSize);

        // NOTE(review): no CharSet/marshalling attribute is specified for the string arguments;
        // confirm the native side expects the default string marshalling for non-ASCII phrases.
        [DllImport("maluubaspeech-csharp.dll")]
        private static extern NativeResult EnPronouncer_Pronounce(IntPtr nativePtr, string phrase, out IntPtr pronunciation, StringBuilder buffer, ref int bufferSize);
    }
}

--------------------------------------------------------------------------------
/src/cs/PhoneticMatchingTests/EnPronunciationTests.cs:
--------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
namespace PhoneticMatchingTests
{
    using System;
    using Microsoft.VisualStudio.TestTools.UnitTesting;
    using Microsoft.PhoneticMatching;

    /// <summary>
    /// Tests for building pronunciations from ARPABET and IPA and for reading back their phones.
    /// </summary>
    [TestClass]
    public class EnPronunciationTests
    {
        /// <summary>
        /// An ARPABET sequence converts to the expected IPA string.
        /// </summary>
        [TestMethod]
        public void GivenIpaFromArpabet_ExpectPositiveMatch()
        {
            var arpabet = EnPronunciation.FromArpabet(new string[] { "dh", "ih1", "s", "ih1", "z", "ax0", "t", "eh1", "s", "t" });
            Assert.IsTrue(arpabet.Phones.Count > 0);
            Assert.AreEqual("ðɪsɪzətɛst", arpabet.Ipa);
        }

        /// <summary>
        /// An IPA string round-trips unchanged.
        /// </summary>
        [TestMethod]
        public void GivenIpaFromIpa_ExpectPositiveMatch()
        {
            var ipa = EnPronunciation.FromIpa("ðɪsɪzətɛst");
            Assert.IsTrue(ipa.Phones.Count > 0);
            Assert.AreEqual("ðɪsɪzətɛst", ipa.Ipa);
        }

        /// <summary>
        /// Individual phones expose the expected features; features that do not apply
        /// to the phone's type (vowel features on a consonant and vice versa) are null.
        /// </summary>
        [TestMethod]
        public void GivenPronunciationFromArpabet_ExpectPositiveMatch()
        {
            var pron = EnPronunciation.FromArpabet(new string[] { "P", "R", "OW0", "N", "AH2", "N", "S", "IY0", "EY1", "SH", "AX0", "N" });
            Assert.IsTrue(pron.Phones.Count > 0);
            Assert.AreEqual("proʊ̯nʌnsieɪ̯ʃən", pron.Ipa);

            // p
            var phone = pron.Phones[0];
            Assert.AreEqual(PhoneType.Consonant, phone.Type);
            Assert.AreEqual(Phonation.Voiceless, phone.Phonation);
            Assert.AreEqual(PlaceOfArticulation.Bilabial, phone.Place);
            Assert.AreEqual(MannerOfArticulation.Plosive, phone.Manner);
            Assert.IsFalse(phone.IsSyllabic);
            Assert.IsNull(phone.Height);
            Assert.IsNull(phone.Backness);
            Assert.IsNull(phone.Roundedness);
            Assert.IsNull(phone.IsRhotic);

            // o
            phone = pron.Phones[2];
            Assert.AreEqual(PhoneType.Vowel, phone.Type);
            Assert.AreEqual(Phonation.Modal, phone.Phonation);
            Assert.AreEqual(VowelHeight.CloseMid, phone.Height);
            Assert.AreEqual(VowelBackness.Back, phone.Backness);
            Assert.AreEqual(VowelRoundedness.Rounded, phone.Roundedness);
            Assert.IsTrue(phone.IsSyllabic);

            // ʊ̯
            phone = pron.Phones[3];
            Assert.AreEqual(PhoneType.Vowel, phone.Type);
            Assert.AreEqual(Phonation.Modal, phone.Phonation);
            Assert.AreEqual(VowelHeight.NearClose, phone.Height);
            Assert.AreEqual(VowelBackness.NearBack, phone.Backness);
            Assert.AreEqual(VowelRoundedness.Rounded, phone.Roundedness);
            Assert.IsFalse(phone.IsSyllabic);
            Assert.IsNull(phone.Place);
            Assert.IsNull(phone.Manner);
        }

        // NOTE(review): in the four tests below the generic type argument of ThrowsException
        // (the expected exception type) is missing in this copy of the file and cannot be
        // determined from here. Restore it from the upstream source before compiling.

        /// <summary>
        /// An ARPABET symbol containing a trailing space ("B ") is rejected.
        /// </summary>
        [TestMethod]
        public void GivenInvalidSpaceInArpabet_ExpectException()
        {
            Assert.ThrowsException(() =>
            {
                var arpabet = EnPronunciation.FromArpabet(new string[] { "F", "B ", "N", "EH", "T", "IH", "K" });
            });
        }

        /// <summary>
        /// <c>FromIpa(null)</c> is rejected.
        /// </summary>
        [TestMethod]
        public void GivenNullArgument_FromIpa_ExpectException()
        {
            // This test previously called FromArpabet, contradicting its name.
            Assert.ThrowsException(() =>
            {
                var ipa = EnPronunciation.FromIpa(null);
            });
        }

        /// <summary>
        /// <c>FromArpabet(null)</c> is rejected.
        /// </summary>
        [TestMethod]
        public void GivenNullArgument_FromArpabet_ExpectException()
        {
            // This test previously called FromIpa, contradicting its name.
            Assert.ThrowsException(() =>
            {
                var arpabet = EnPronunciation.FromArpabet(null);
            });
        }

        /// <summary>
        /// A string that is not valid IPA is rejected.
        /// </summary>
        [TestMethod]
        public void GivenInvalidIpa_FromIpa_ExpectException()
        {
            Assert.ThrowsException(() =>
            {
                var ipa = EnPronunciation.FromIpa("This is not an IPA");
            });
        }
    }
}

--------------------------------------------------------------------------------
/src/cs/PhoneticMatchingPerfTests/Program.cs:
--------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
namespace PhoneticMatchingPerfTests
{
    using System;
    using System.Diagnostics;
    using System.IO;
    using System.Linq;
    using Newtonsoft.Json;
    using Microsoft.PhoneticMatching.Matchers.ContactMatcher;
    using Microsoft.PhoneticMatching.Matchers.PlaceMatcher;

    /// <summary>
    /// Command-line entry point for the matcher performance/accuracy tests.
    /// </summary>
    internal class Program
    {
        private const string Contact = "contact";
        private const string Place = "place";
        private const int MaxReturns = 3;

        /// <summary>
        /// Usage ".\PhoneticMatcherPerfTests contact|place timeoutMilliseconds [accuracy]"
        /// </summary>
        /// <example>
        /// ".\PhoneticMatcherPerfTests contact 20000" Runs queries for 20 seconds so the user can profile performance results.
        /// </example>
        /// <param name="args">Command line arguments</param>
        /// <exception cref="ArgumentException">Thrown when arguments are missing, the timeout is not an integer, or the type is unknown.</exception>
        /// <exception cref="ArgumentOutOfRangeException">Thrown when the timeout is outside 1 ms .. one week.</exception>
        private static void Main(string[] args)
        {
            // Both positional arguments are mandatory; fail with a usage message
            // and not with an IndexOutOfRangeException.
            if (args == null || args.Length < 2)
            {
                throw new ArgumentException("Usage: PhoneticMatcherPerfTests contact|place timeoutMilliseconds [accuracy]");
            }

            string type = args[0];
            int timeoutMilliseconds;
            double maxTimeout = TimeSpan.FromDays(7).TotalMilliseconds;
            string errorTimeout = $"second argument is the time during the profiling will last. It must be a valid integer between 1 and {maxTimeout} (one week)";
            if (!int.TryParse(args[1], out timeoutMilliseconds))
            {
                throw new ArgumentException(errorTimeout);
            }

            if (timeoutMilliseconds < 1 || timeoutMilliseconds > maxTimeout)
            {
                // The single-string constructor would treat the message as the parameter name.
                throw new ArgumentOutOfRangeException(
                    nameof(timeoutMilliseconds),
                    timeoutMilliseconds,
                    errorTimeout + $" - current value : {timeoutMilliseconds}");
            }

            bool isAccuracyTest = false;
            if (args.Length > 2)
            {
                isAccuracyTest = string.Equals(args[2], "accuracy", StringComparison.OrdinalIgnoreCase);
            }

            Console.WriteLine("Starting tests...");
            var sw = new Stopwatch();
            sw.Start();
            switch (type.ToLowerInvariant())
            {
                case Contact:
                    {
                        // Path.Combine keeps the relative path valid on non-Windows hosts (".\" is Windows-only).
                        // Generic type arguments in this method were stripped in this copy; ContactFields /
                        // PlaceFields are restored from the identity extractor passed to each matcher.
                        // NOTE(review): FuzzyMatcherPerfTester's type argument is assumed — confirm upstream.
                        TestElement<ContactFields>[] contacts = JsonConvert.DeserializeObject<TestElement<ContactFields>[]>(File.ReadAllText(Path.Combine(".", "contacts.json")));
                        Console.WriteLine($"Took {sw.Elapsed} to deserialize contact fields.");
                        sw.Restart();
                        var contactFields = contacts.Select(c => c.Element).ToArray();
                        var matcher = new EnContactMatcher<ContactFields>(contactFields, c => c, new ContactMatcherConfig(maxReturns: MaxReturns));
                        var tester = new FuzzyMatcherPerfTester<ContactFields>(matcher, contacts);
                        Console.WriteLine($"Took {sw.Elapsed} to instantiate Contact Matcher with {contactFields.Length} contacts.");
                        tester.Run(TimeSpan.FromMilliseconds(timeoutMilliseconds), isAccuracyTest);
                        break;
                    }

                case Place:
                    {
                        TestElement<PlaceFields>[] places = JsonConvert.DeserializeObject<TestElement<PlaceFields>[]>(File.ReadAllText(Path.Combine(".", "places.json")));
                        Console.WriteLine($"Took {sw.Elapsed} to deserialize place fields.");
                        sw.Restart();
                        var placeFields = places.Select(c => c.Element).ToArray();
                        var matcher = new EnPlaceMatcher<PlaceFields>(placeFields, c => c, new PlaceMatcherConfig(maxReturns: MaxReturns));
                        var tester = new FuzzyMatcherPerfTester<PlaceFields>(matcher, places);
                        Console.WriteLine($"Took {sw.Elapsed} to instantiate Place Matcher with {placeFields.Length} places.");

                        tester.Run(TimeSpan.FromMilliseconds(timeoutMilliseconds), isAccuracyTest);
                        break;
                    }

                default:
                    throw new ArgumentException($"Type must be 'place' or 'contact'. Current value: {type}");
            }
        }
    }
}

--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Matchers/BaseMatcher.cs:
--------------------------------------------------------------------------------
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

namespace Microsoft.PhoneticMatching.Matchers
{
    using System;
    using System.Collections.Generic;

    /// <summary>
    /// Abstract matcher to implement common logic between various kind of simplified matchers.
    /// </summary>
    /// <typeparam name="T">Type of elements being matched.</typeparam>
    // NOTE(review): generic parameter lists were stripped in this copy of the file. They are
    // restored from the typeparam doc and from member usage (Match has Distance/Element;
    // Target has Id/Value/Phrase) — confirm the exact names against upstream.
    public abstract class BaseMatcher<T>
    {
        /// <summary>
        /// Initializes a new instance of the <see cref="BaseMatcher{T}"/> class.
        /// </summary>
        /// <param name="config">Matcher configurations</param>
        protected BaseMatcher(MatcherConfig config)
        {
            this.Config = config;
        }

        /// <summary>
        /// Gets or sets the matcher configurations
        /// </summary>
        public MatcherConfig Config { get; protected set; }

        /// <summary>
        /// Find the elements matching a query.
        /// </summary>
        /// <param name="query">The search query.</param>
        /// <returns>The matched elements.</returns>
        public abstract IList<T> Find(string query);

        /// <summary>
        /// Select the best matches from the candidates according to the matcher's configurations.
        /// </summary>
        /// <param name="candidates">Matches candidates; the first one is taken as the best match.</param>
        /// <returns>A collection of items matching the target with the limit configured.</returns>
        protected IList<T> SelectMatches(IList<Match<Target<T>>> candidates)
        {
            var matches = new List<T>();
            if (candidates.Count != 0)
            {
                // Accept anything closer than the larger of: a multiple of the best
                // distance, or the configured absolute margin.
                var bestDistance = candidates[0].Distance;
                var maxDistance = Math.Max(
                    bestDistance * this.Config.BestDistanceMultiplier,
                    this.Config.MaxDistanceMarginReturns);

                // Several candidates can refer to the same target; return each target once.
                var dedupe = new HashSet<int>();
                foreach (var candidate in candidates)
                {
                    // supports MaxReturns == 0
                    if (matches.Count >= this.Config.MaxReturns)
                    {
                        break;
                    }

                    // HashSet.Add returns false for an id already seen, so a single
                    // call both tests and records it.
                    if (candidate.Distance < maxDistance && dedupe.Add(candidate.Element.Id))
                    {
                        matches.Add(candidate.Element.Value);
                    }
                }
            }

            return matches;
        }

        /// <summary>
        /// Target equality comparer based on phrase and target identifier.
        /// </summary>
        protected class TargetEqualityComparer : IEqualityComparer<Target<T>>
        {
            /// <summary>
            /// Returns true only if x and y have the same phrase and target identifier
            /// </summary>
            /// <param name="x">first target</param>
            /// <param name="y">second target</param>
            /// <returns>True only if x and y have the same phrase and target identifier</returns>
            public bool Equals(Target<T> x, Target<T> y)
            {
                if (x == null && y == null)
                {
                    return true;
                }
                else if (x == null || y == null)
                {
                    return false;
                }

                return string.Equals(x.Phrase, y.Phrase) && int.Equals(x.Id, y.Id);
            }

            /// <summary>
            /// Computes the hash code based on phrase and target identifier.
            /// </summary>
            /// <param name="obj">target object</param>
            /// <returns>Hash code of anonymous object constructed with phrase and target identifier</returns>
            public int GetHashCode(Target<T> obj)
            {
                if (obj == null)
                {
                    return 0;
                }

                return new { obj.Phrase, obj.Id }.GetHashCode();
            }
        }
    }
}

--------------------------------------------------------------------------------
/src/maluuba/speech/pronunciation/impl.hpp:
--------------------------------------------------------------------------------
/**
 * @file
 * Speech implementation details.
 *
 * @author Benedicte Pierrejean
 * @author Tavian Barnes (tavian.barnes@microsoft.com)
 *
 * Copyright (c) Microsoft Corporation. All rights reserved.
 * Licensed under the MIT License.
 */

#ifndef MALUUBA_SPEECH_PRONUNCIATION_IMPL_HPP
#define MALUUBA_SPEECH_PRONUNCIATION_IMPL_HPP

#include "maluuba/speech/pronunciation.hpp"
// NOTE(review): the name of this standard header was stripped in this copy of the file;
// <cstdint> is assumed because the file uses std::uint16_t — confirm upstream.
#include <cstdint>

namespace maluuba
{
  namespace speech
  {
    namespace internal
    {
      // Phones are implemented as "manual" bitfields.
      // Bits are laid out like this:
      //
      // struct Phone (14) {
      //   PhoneType type : 1;
      //   Phonation phonation : 3;
      //   bool syllabic : 1;
      //
      //   union {
      //     struct Consonant (8) {
      //       PlaceOfArticulation place : 4;
      //       MannerOfArticulation manner : 4;
      //     };
      //     struct Vowel (9) {
      //       VowelHeight height : 3;
      //       VowelBackness backness : 3;
      //       VowelRoundedness roundedness : 2;
      //       bool rhotic : 1;
      //     };
      //   };
      // }
      //
      // Each field is the half-open bit range [x_start, x_end); a field starts
      // where the previous one ends. The consonant and vowel fields both start
      // at syllabic_end, so they overlap like the union above.

      static constexpr std::uint16_t type_start = 0;
      static constexpr std::uint16_t type_end = type_start + 1;

      static constexpr std::uint16_t phonation_start = type_end;
      static constexpr std::uint16_t phonation_end = phonation_start + 3;

      static constexpr std::uint16_t syllabic_start = phonation_end;
      static constexpr std::uint16_t syllabic_end = syllabic_start + 1;

      static constexpr std::uint16_t place_start = syllabic_end;
      static constexpr std::uint16_t place_end = place_start + 4;

      static constexpr std::uint16_t manner_start = place_end;
      static constexpr std::uint16_t manner_end = manner_start + 4;

      static constexpr std::uint16_t height_start = syllabic_end;
      static constexpr std::uint16_t height_end = height_start + 3;

      static constexpr std::uint16_t backness_start = height_end;
      static constexpr std::uint16_t backness_end = backness_start + 3;

      static constexpr std::uint16_t roundedness_start = backness_end;
      static constexpr std::uint16_t roundedness_end = roundedness_start + 2;

      static constexpr std::uint16_t rhotic_start = roundedness_end;
      static constexpr std::uint16_t rhotic_end = rhotic_start + 1;

      /**
       * Compute a bitmask from a bit range.
       *
       * @return A mask of (end - start) low-order one bits (not shifted to @p start).
       */
      constexpr std::uint16_t
      phone_mask(std::uint16_t start, std::uint16_t end)
      {
        return (1 << (end - start)) - 1;
      }

      /**
       * Decode some bits in the phone representation.
       *
       * @tparam T The type to convert the extracted bits to (must be given explicitly).
       * @return The bits [start, end) of @p repr, converted to @c T.
       */
      template <typename T>
      constexpr T
      phone_decode(std::uint16_t repr, std::uint16_t start, std::uint16_t end)
      {
        return static_cast<T>((repr >> start) & phone_mask(start, end));
      }

      /**
       * Encode some bits in the phone representation.
       *
       * @return @p t converted to an integer and shifted to bit position @p start.
       *         The value is not masked; it is expected to fit its field.
       */
      template <typename T>
      constexpr std::uint16_t
      phone_encode(T t, std::uint16_t start)
      {
        return static_cast<std::uint16_t>(t) << start;
      }

      /**
       * Encode some bits in the phone representation.
       *
       * @return @p repr with the bits [start, end) replaced by @p t.
       */
      template <typename T>
      constexpr std::uint16_t
      phone_encode(std::uint16_t repr, T t, std::uint16_t start, std::uint16_t end)
      {
        // Clear the field, then write the new value. The value is masked to the
        // field width so an out-of-range value cannot overwrite neighbouring fields.
        return (repr & ~(phone_mask(start, end) << start))
          | ((static_cast<std::uint16_t>(t) & phone_mask(start, end)) << start);
      }

      /**
       * Create a phone representation for a consonant.
       */
      constexpr std::uint16_t
      consonant(Phonation phonation, PlaceOfArticulation place, MannerOfArticulation manner)
      {
        return phone_encode(PhoneType::CONSONANT, type_start)
          | phone_encode(phonation, phonation_start)
          | phone_encode(place, place_start)
          | phone_encode(manner, manner_start);
      }

      /**
       * Create a phone representation for a vowel.
127 | */ 128 | constexpr std::uint16_t 129 | vowel(VowelHeight height, VowelBackness backness, VowelRoundedness roundedness, bool rhotic = false) 130 | { 131 | return phone_encode(PhoneType::VOWEL, type_start) 132 | | phone_encode(Phonation::MODAL, phonation_start) 133 | | phone_encode(true, syllabic_start) 134 | | phone_encode(height, height_start) 135 | | phone_encode(backness, backness_start) 136 | | phone_encode(roundedness, roundedness_start) 137 | | phone_encode(rhotic, rhotic_start); 138 | } 139 | } 140 | } 141 | } 142 | 143 | #endif // MALUUBA_SPEECH_PRONUNCIATION_IMPL_HPP 144 | --------------------------------------------------------------------------------