├── .gitmodules
├── typedoc.json
├── docs
└── assets
│ └── images
│ ├── icons.png
│ ├── widgets.png
│ ├── icons@2x.png
│ └── widgets@2x.png
├── ts
├── nlp
│ ├── index.ts
│ └── tokenizer.ts
├── distance
│ └── index.ts
├── matchers
│ ├── index.ts
│ └── matcherconfig.ts
└── index.ts
├── src
├── cs
│ ├── PhoneticMatchingPerfTests
│ │ ├── Properties
│ │ │ └── launchSettings.json
│ │ ├── Transcription.cs
│ │ ├── TestElement.cs
│ │ ├── TestQuery.cs
│ │ ├── PhoneticMatchingPerfTests.csproj
│ │ ├── Settings.StyleCop
│ │ └── Program.cs
│ ├── Microsoft.PhoneticMatching
│ │ ├── Microsoft.PhoneticMatching.csproj
│ │ ├── Nlp
│ │ │ ├── Tokenizer
│ │ │ │ ├── ITokenizer.cs
│ │ │ │ ├── WhitespaceTokenizer.cs
│ │ │ │ ├── Token.cs
│ │ │ │ ├── Interval.cs
│ │ │ │ └── SplittingTokenizer.cs
│ │ │ └── Preprocessor
│ │ │ │ ├── IPreProcessor.cs
│ │ │ │ ├── CaseFoldingPreProcessor.cs
│ │ │ │ ├── UnicodePreProcessor.cs
│ │ │ │ ├── WhiteSpacePreProcessor.cs
│ │ │ │ ├── ChainedRuleBasedPreProcessor.cs
│ │ │ │ ├── EnPreProcessor.cs
│ │ │ │ └── EnPlacesPreProcessor.cs
│ │ ├── ManagedCallback.cs
│ │ ├── Matchers
│ │ │ ├── ContactMatcher
│ │ │ │ ├── ContactFields.cs
│ │ │ │ └── ContactMatcherConfig.cs
│ │ │ ├── PlaceMatcher
│ │ │ │ ├── PlaceFields.cs
│ │ │ │ └── PlaceMatcherConfig.cs
│ │ │ ├── Target.cs
│ │ │ ├── FuzzyMatcher
│ │ │ │ ├── IFuzzyMatcher.cs
│ │ │ │ ├── Normalized
│ │ │ │ │ ├── StringFuzzyMatcher.cs
│ │ │ │ │ ├── EnPhoneticFuzzyMatcher.cs
│ │ │ │ │ └── EnHybridFuzzyMatcher.cs
│ │ │ │ ├── FuzzyMatcher.cs
│ │ │ │ └── FuzzyMatcherBase.cs
│ │ │ ├── MatcherConfig.cs
│ │ │ └── BaseMatcher.cs
│ │ ├── Distance
│ │ │ ├── IDistance.cs
│ │ │ ├── DistanceInput.cs
│ │ │ ├── StringDistance.cs
│ │ │ └── EnPhoneticDistance.cs
│ │ ├── Settings.StyleCop
│ │ ├── Match.cs
│ │ └── EnPronouncer.cs
│ ├── PhoneticMatchingTests
│ │ ├── PhoneticMatchingTests.csproj
│ │ ├── EnPronouncerTests.cs
│ │ ├── Distance
│ │ │ ├── BaseDistanceTester.cs
│ │ │ ├── StringDistanceTests.cs
│ │ │ ├── EnPhoneticDistanceTests.cs
│ │ │ └── EnHybridDistanceTests.cs
│ │ ├── Settings.StyleCop
│ │ ├── NativeResourceWrapperTests.cs
│ │ ├── Nlp
│ │ │ ├── TokenizerTests.cs
│ │ │ └── PreprocessorTests.cs
│ │ ├── Matchers
│ │ │ ├── BaseContactMatcherTester.cs
│ │ │ └── ContactMatcherTests.cs
│ │ └── EnPronunciationTests.cs
│ ├── nuget
│ │ ├── build
│ │ │ └── Microsoft.PhoneticMatching.targets
│ │ └── Microsoft.PhoneticMatching.nuspec
│ └── PhoneticMatching.sln
└── maluuba
│ ├── speech
│ ├── phoneticdistance
│ │ └── phoneticdistance.cpp
│ ├── nodejs
│ │ ├── performance.hpp
│ │ ├── phone.hpp
│ │ ├── match.hpp
│ │ ├── enpronouncer.hpp
│ │ ├── stringdistance.hpp
│ │ ├── enhybriddistance.hpp
│ │ ├── enphoneticdistance.hpp
│ │ ├── main.cpp
│ │ ├── enpronunciation.hpp
│ │ ├── performance
│ │ │ └── performance.cpp
│ │ ├── match
│ │ │ └── match.cpp
│ │ ├── stringdistance
│ │ │ └── stringdistance.cpp
│ │ ├── enpronouncer
│ │ │ └── enpronouncer.cpp
│ │ └── enphoneticdistance
│ │ │ └── enphoneticdistance.cpp
│ ├── pronouncer.hpp
│ ├── csharp
│ │ └── csharp.hpp
│ ├── pronunciation
│ │ ├── pronunciation.cpp
│ │ ├── arpabet.cpp
│ │ ├── phone.cpp
│ │ └── impl.hpp
│ ├── hybriddistance.hpp
│ ├── pronouncer
│ │ └── pronouncer.cpp
│ └── phoneticdistance.hpp
│ ├── xtd
│ ├── optional.hpp
│ └── string_view.hpp
│ ├── metric.hpp
│ ├── debug.hpp
│ ├── unicode.hpp
│ ├── unicode
│ └── unicode.cpp
│ └── levenshtein.hpp
├── jestConfig.json
├── tests
├── enpronouncer.test.ts
├── matchers
│ ├── testsets
│ │ ├── soundex.testset.spec.ts
│ │ └── soundex.ts
│ ├── contactmatcher.test.ts
│ └── placematcher.test.ts
├── nlp
│ ├── tokenizer.test.ts
│ └── preprocessor.test.ts
├── distance
│ ├── stringdistance.test.ts
│ ├── enphoneticdistance.test.ts
│ └── enhybriddistance.test.ts
└── enpronunciation.test.ts
├── LICENSE
├── .gitignore
├── package.json
├── .gitattributes
└── SECURITY.md
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "flite"]
2 | path = src/flite
3 | url = https://github.com/festvox/flite.git
4 |
--------------------------------------------------------------------------------
/typedoc.json:
--------------------------------------------------------------------------------
1 | {
2 | "out": "docs",
3 | "excludePrivate": true,
4 | "gitRevision": "master"
5 | }
6 |
--------------------------------------------------------------------------------
/docs/assets/images/icons.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PhoneticMatching/HEAD/docs/assets/images/icons.png
--------------------------------------------------------------------------------
/docs/assets/images/widgets.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PhoneticMatching/HEAD/docs/assets/images/widgets.png
--------------------------------------------------------------------------------
/docs/assets/images/icons@2x.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PhoneticMatching/HEAD/docs/assets/images/icons@2x.png
--------------------------------------------------------------------------------
/docs/assets/images/widgets@2x.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/PhoneticMatching/HEAD/docs/assets/images/widgets@2x.png
--------------------------------------------------------------------------------
/ts/nlp/index.ts:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | export * from "./preprocessor";
5 | export * from "./tokenizer";
6 |
--------------------------------------------------------------------------------
/ts/distance/index.ts:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | export {StringDistance, EnPhoneticDistance, EnHybridDistance, DistanceInput} from "../maluuba";
5 |
--------------------------------------------------------------------------------
/src/cs/PhoneticMatchingPerfTests/Properties/launchSettings.json:
--------------------------------------------------------------------------------
1 | {
2 | "profiles": {
3 | "PhoneticMatchingPerfTests": {
4 | "commandName": "Project",
5 | "commandLineArgs": "contact 120000 accuracy"
6 | }
7 | }
8 | }
--------------------------------------------------------------------------------
/ts/matchers/index.ts:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | export {FuzzyMatcher, AcceleratedFuzzyMatcher} from "../maluuba";
5 | export * from "./contactmatcher";
6 | export * from "./placematcher";
7 | export * from "./matcherconfig";
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Microsoft.PhoneticMatching.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | netstandard2.1
5 | x64
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/jestConfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "globals": {
3 | "ts-jest": {
4 | "tsconfig": "ts/tsconfig.json",
5 | "diagnostics": false
6 | }
7 | },
8 | "testRegex": "(/__tests__/.*|(\\.|/)(test|spec))\\.(jsx?|tsx?)$",
9 | "testPathIgnorePatterns": [
10 | "/build/",
11 | "/docs/",
12 | "/lib/",
13 | "/src/",
14 | "/node_modules/"
15 | ],
16 | "preset": "ts-jest/presets/js-with-ts",
17 | "testMatch": null
18 | }
--------------------------------------------------------------------------------
/ts/index.ts:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | export {EnPronouncer, EnPronunciation, Speech} from "./maluuba";
5 |
6 | /**
7 | * Bubble-up re-exports of __nlp__ module for convenience.
8 | */
9 | export * from "./nlp";
10 |
11 | /**
12 | * Bubble-up re-exports of __matchers__ module for convenience.
13 | */
14 | export * from "./matchers";
15 |
16 | /**
17 | * Bubble-up re-exports of __distance__ module for convenience.
18 | */
19 | export * from "./distance";
20 |
--------------------------------------------------------------------------------
/src/maluuba/speech/phoneticdistance/phoneticdistance.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | #include "maluuba/speech/phoneticdistance.hpp"
5 |
6 | namespace maluuba
7 | {
8 | namespace speech
9 | {
10 | PhoneticDistance::~PhoneticDistance() = default; // Destructor defined out of line with the compiler-generated body.
11 |
12 | EnPhoneticDistance::~EnPhoneticDistance() = default; // Destructor defined out of line with the compiler-generated body.
13 | // Distance between two English pronunciations: each argument goes through phonetic_embedding() and the pair is handed to PhoneticDistance::operator().
14 | double
15 | EnPhoneticDistance::operator()(const EnPronunciation& a, const EnPronunciation& b) const
16 | {
17 | return PhoneticDistance::operator()(phonetic_embedding(a), phonetic_embedding(b));
18 | }
19 | } // namespace speech
20 | } // namespace maluuba
21 |
--------------------------------------------------------------------------------
/src/cs/PhoneticMatchingPerfTests/Transcription.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace PhoneticMatchingPerfTests
5 | {
6 | public class Transcription
7 | {
8 | ///
9 | /// Gets or sets a label that tracks what produced this transcription.
10 | ///
11 | public string Source { get; set; }
12 |
13 | ///
14 | /// Gets or sets what was actually heard/spoken (may contain ASR/STT errors).
15 | ///
16 | public string Utterance { get; set; }
17 | }
18 | }
--------------------------------------------------------------------------------
/src/cs/PhoneticMatchingPerfTests/TestElement.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace PhoneticMatchingPerfTests
5 | {
6 | internal class TestElement
7 | {
8 | ///
9 | /// Gets or sets a unique ID to refer back to this element.
10 | ///
11 | public T Element { get; set; }
12 |
13 | ///
14 | /// Gets or sets the test queries whose intent targets this element in some way.
15 | ///
16 | public TestQuery[] Queries { get; set; }
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/src/cs/PhoneticMatchingPerfTests/TestQuery.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace PhoneticMatchingPerfTests
5 | {
6 | public class TestQuery
7 | {
8 | ///
9 | /// Gets or sets the intention: what should be heard or what was read.
10 | ///
11 | public string Query { get; set; }
12 |
13 | ///
14 | /// Gets or sets the records for this test query: what was actually heard or what was written.
15 | ///
16 | public Transcription[] Transcriptions { get; set; }
17 | }
18 | }
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Nlp/Tokenizer/ITokenizer.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace Microsoft.PhoneticMatching.Nlp.Tokenizer
5 | {
6 | using System.Collections.Generic;
7 |
8 | ///
9 | /// Tokenizer interface for strings.
10 | ///
11 | public interface ITokenizer
12 | {
13 | ///
14 | /// Splits the query into tokens.
15 | ///
16 | /// Query to tokenize.
17 | /// Collection of tokens.
18 | IList Tokenize(string query);
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Nlp/Tokenizer/WhitespaceTokenizer.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace Microsoft.PhoneticMatching.Nlp.Tokenizer
5 | {
6 | using System.Text.RegularExpressions;
7 |
8 | ///
9 | /// Tokenizer that splits on whitespace.
10 | ///
11 | public class WhitespaceTokenizer : SplittingTokenizer
12 | {
13 | ///
14 | /// Initializes a new instance of the WhitespaceTokenizer class, passing the pattern \s+ (one or more whitespace characters) to the SplittingTokenizer constructor.
15 | ///
16 | public WhitespaceTokenizer() : base(new Regex(@"\s+"))
17 | {
18 | }
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/tests/enpronouncer.test.ts:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | import {EnPronouncer} from "../ts";
5 |
6 | test("English pronouncer.", () => { // Pins the IPA produced for a short punctuated sentence.
7 | const pronouncer = new EnPronouncer();
8 | expect(pronouncer.pronounce("This, is a test.").ipa).toBe("ðɪsɪzətɛst");
9 | });
10 |
11 | test("ctor used as function exception.", () => { // Calling the class without `new` must throw.
12 | expect(() => {
13 | (EnPronouncer as any)();
14 | }).toThrow();
15 | });
16 |
17 | test("Pronouncing undefined exception.", () => {
18 | const pronouncer = new EnPronouncer(); // Built outside the callback so only pronounce() is asserted to throw.
19 | expect(() => {
20 | pronouncer.pronounce(undefined as any);
21 | }).toThrow();
22 | });
23 |
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/ManagedCallback.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace Microsoft.PhoneticMatching
5 | {
6 | using System;
7 |
8 | ///
9 | /// Keeps track of the last exception that occurred while a managed callback was being invoked from native code. Otherwise, native code swallows the exception.
10 | ///
11 | internal static class ManagedCallback
12 | {
13 | ///
14 | /// Gets or sets the last exception that occurred while a managed callback was being invoked from native code.
15 | ///
16 | public static Exception LastError { get; set; }
17 | }
18 | }
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Nlp/Preprocessor/IPreProcessor.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace Microsoft.PhoneticMatching.Nlp.Preprocessor
5 | {
6 | ///
7 | /// A pre-processor interface, used to transform a string before any classification or understanding is known about it.
8 | ///
9 | public interface IPreProcessor
10 | {
11 | ///
12 | /// Function to perform the pre-processing.
13 | ///
14 | /// The string to pre-process.
15 | /// The pre-processed string.
16 | string PreProcess(string query);
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/tests/matchers/testsets/soundex.testset.spec.ts:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | import Soundex from "./soundex";
5 |
6 | // Checks Soundex.encode against known input/code pairs, in order.
7 | test("Soundex.", () => {
8 | const expectations: Array<[string, string]> = [
9 | // Empty and blank input.
10 | ["", ""],
11 | [" ", ""],
12 | // Single names.
13 | ["Robert", "R163"],
14 | ["Rupert", "R163"],
15 | ["Rubin", "R150"],
16 | ["Ashcraft", "A261"],
17 | ["Ashcroft", "A261"],
18 | ["Tymczak", "T522"],
19 | ["Pfister", "P236"],
20 | ["Honeyman", "H555"],
21 | // Several words in one input.
22 | ["Robert Robert", "R163 R163"],
23 | ];
24 |
25 | for (const [input, code] of expectations) {
26 | expect(Soundex.encode(input)).toBe(code);
27 | }
28 | });
29 |
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Matchers/ContactMatcher/ContactFields.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace Microsoft.PhoneticMatching.Matchers.ContactMatcher
5 | {
6 | using System.Collections.Generic;
7 |
8 | ///
9 | /// Fields made available from the user-defined Contact object for pronunciation and distance functions.
10 | ///
11 | public class ContactFields
12 | {
13 | ///
14 | /// Gets or sets the name of the contact.
15 | ///
16 | public string Name { get; set; }
17 |
18 | ///
19 | /// Gets or sets the aliases the contact also goes by.
20 | ///
21 | public IList Aliases { get; set; }
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Distance/IDistance.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace Microsoft.PhoneticMatching.Distance
5 | {
6 | ///
7 | /// Distance interface. Distance objects are used to compute the distance between two elements.
8 | ///
9 | /// Type of elements between which we compute distance.
10 | public interface IDistance
11 | {
12 | ///
13 | /// Computes the distance between first and second.
14 | ///
15 | /// First element.
16 | /// Second element.
17 | /// The distance between first and second.
18 | double Distance(T first, T second);
19 | }
20 | }
21 |
--------------------------------------------------------------------------------
/src/maluuba/speech/nodejs/performance.hpp:
--------------------------------------------------------------------------------
1 | /**
2 | * @file
3 | * Performance utility to make trace events.
4 | *
5 | * Copyright (c) Microsoft Corporation. All rights reserved.
6 | * Licensed under the MIT License.
7 | */
8 |
9 | #ifndef MALUUBA_SPEECH_NODEJS_PERFORMANCE_HPP
10 | #define MALUUBA_SPEECH_NODEJS_PERFORMANCE_HPP
11 |
12 | #include
13 | #include
14 |
15 | namespace maluuba
16 | {
17 | namespace speech
18 | {
19 | namespace nodejs
20 | {
21 | class Performance // All visible members are static; per the file header, a utility for making trace events.
22 | {
23 | public:
24 | static void Init(v8::Local module); // NOTE(review): presumably sets up s_performance from the module — definition not visible here.
25 |
26 | static void Mark(const std::string& name); // Takes the name of a mark; what is recorded is defined in the .cpp.
27 | static void Measure(const std::string& name, const std::string& start_mark, const std::string& end_mark); // Takes a measure name plus the names of its start and end marks.
28 |
29 | private:
30 | static v8::Persistent s_performance; // Persistent V8 handle held as a static member — TODO confirm what object it refers to.
31 | };
32 | }
33 | }
34 | }
35 |
36 | #endif // MALUUBA_SPEECH_NODEJS_PERFORMANCE_HPP
37 |
--------------------------------------------------------------------------------
/src/cs/PhoneticMatchingTests/PhoneticMatchingTests.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | netcoreapp3.1
5 |
6 | false
7 |
8 | AnyCPU;x64
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
--------------------------------------------------------------------------------
/src/maluuba/xtd/optional.hpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | #ifndef MALUUBA_XTD_OPTIONAL_HPP
5 | #define MALUUBA_XTD_OPTIONAL_HPP
6 |
7 | #if __cplusplus >= 201703L && __has_include(<optional>)
8 | // C++17 or later with <optional> available: re-export the standard names.
9 | #include <optional>
10 |
11 | namespace maluuba
12 | {
13 | namespace xtd
14 | {
15 | using std::optional;
16 | using std::bad_optional_access;
17 | using std::nullopt_t;
18 | using std::nullopt;
19 | using std::make_optional;
20 | }
21 | }
22 |
23 | #else
24 | // Otherwise fall back to the std::experimental names.
25 | #include <experimental/optional>
26 |
27 | namespace maluuba
28 | {
29 | namespace xtd
30 | {
31 | using std::experimental::optional;
32 | using std::experimental::bad_optional_access;
33 | using std::experimental::nullopt_t;
34 | using std::experimental::nullopt;
35 | using std::experimental::make_optional;
36 | }
37 | }
38 |
39 | #endif // __cplusplus
40 |
41 | #endif // MALUUBA_XTD_OPTIONAL_HPP
42 |
--------------------------------------------------------------------------------
/src/maluuba/xtd/string_view.hpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | #ifndef MALUUBA_XTD_STRING_VIEW_HPP
5 | #define MALUUBA_XTD_STRING_VIEW_HPP
6 |
7 | #if __cplusplus >= 201703L
8 | // C++17 or later: re-export the standard string_view names.
9 | #include <string_view>
10 |
11 | namespace maluuba
12 | {
13 | namespace xtd
14 | {
15 | using std::basic_string_view;
16 | using std::string_view;
17 | using std::wstring_view;
18 | using std::u16string_view;
19 | using std::u32string_view;
20 | }
21 | }
22 |
23 | #else
24 | // Otherwise fall back to the std::experimental names.
25 | #include <experimental/string_view>
26 |
27 | namespace maluuba
28 | {
29 | namespace xtd
30 | {
31 | using std::experimental::basic_string_view;
32 | using std::experimental::string_view;
33 | using std::experimental::wstring_view;
34 | using std::experimental::u16string_view;
35 | using std::experimental::u32string_view;
36 | }
37 | }
38 |
39 | #endif // __cplusplus
40 |
41 | #endif // MALUUBA_XTD_STRING_VIEW_HPP
42 |
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Matchers/PlaceMatcher/PlaceFields.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace Microsoft.PhoneticMatching.Matchers.PlaceMatcher
5 | {
6 | using System.Collections.Generic;
7 |
8 | ///
9 | /// Fields made available from the user-defined Place object for pronunciation and distance functions.
10 | ///
11 | public class PlaceFields
12 | {
13 | ///
14 | /// Gets or sets the name of the place.
15 | ///
16 | public string Name { get; set; }
17 |
18 | ///
19 | /// Gets or sets the address of the place.
20 | ///
21 | public string Address { get; set; }
22 |
23 | ///
24 | /// Gets or sets the tags/categories defining the place.
25 | ///
26 | public IList Types { get; set; }
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Nlp/Preprocessor/CaseFoldingPreProcessor.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace Microsoft.PhoneticMatching.Nlp.Preprocessor
5 | {
6 | using System;
7 |
8 | ///
9 | /// Pre-processor that folds case by lower-casing the query with the invariant culture.
10 | ///
11 | internal class CaseFoldingPreProcessor : IPreProcessor
12 | {
13 | ///
14 | /// Lower-cases the query using the invariant culture. Throws ArgumentNullException when query is null.
15 | ///
16 | /// The string to pre-process.
17 | /// The pre-processed string.
18 | public string PreProcess(string query)
19 | {
20 | if (query == null)
21 | {
22 | throw new ArgumentNullException(nameof(query), "query can't be null"); // (paramName, message): the one-string overload would treat the message as the parameter name.
23 | }
24 |
25 | return query.ToLowerInvariant();
26 | }
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/src/cs/nuget/build/Microsoft.PhoneticMatching.targets:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | linux-x64
6 | osx-x64
7 | win-x64
8 |
9 |
10 |
11 |
12 | PreserveNewest
13 | maluubaspeech-csharp.dll
14 |
15 |
16 |
17 |
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Nlp/Preprocessor/UnicodePreProcessor.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace Microsoft.PhoneticMatching.Nlp.Preprocessor
5 | {
6 | using System;
7 | using System.Text;
8 |
9 | ///
10 | /// Pre-processor that applies Unicode normalization form KC.
11 | ///
12 | internal class UnicodePreProcessor : IPreProcessor
13 | {
14 | ///
15 | /// Normalizes the query with NormalizationForm.FormKC. Throws ArgumentNullException when query is null.
16 | ///
17 | /// The string to pre-process.
18 | /// The pre-processed string.
19 | public string PreProcess(string query)
20 | {
21 | if (query == null)
22 | {
23 | throw new ArgumentNullException(nameof(query), "query can't be null"); // (paramName, message): the one-string overload would treat the message as the parameter name.
24 | }
25 |
26 | return query.Normalize(NormalizationForm.FormKC);
27 | }
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/src/cs/nuget/Microsoft.PhoneticMatching.nuspec:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Microsoft.PhoneticMatching
5 | 0.0.7
6 | Microsoft
7 | Microsoft
8 | https://opensource.org/licenses/MIT
9 | https://github.com/Microsoft/PhoneticMatching
10 | http://go.microsoft.com/fwlink/?LinkID=288890
11 | false
12 | PhoneticMatching C# project.
13 | Initial version.
14 | © Microsoft Corporation. All rights reserved.
15 | phone ipa match query target pronunciation hybrid fuzzy matcher arpabet distance pronouncer syllable
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
--------------------------------------------------------------------------------
/src/maluuba/speech/nodejs/phone.hpp:
--------------------------------------------------------------------------------
1 | /**
2 | * @file
3 | * Phones wrapped in NodeJS.
4 | *
5 | * Copyright (c) Microsoft Corporation. All rights reserved.
6 | * Licensed under the MIT License.
7 | */
8 |
9 | #ifndef MALUUBA_SPEECH_NODEJS_PHONE_HPP
10 | #define MALUUBA_SPEECH_NODEJS_PHONE_HPP
11 |
12 | #include "maluuba/speech/pronunciation.hpp"
13 | #include
14 | #include
15 |
16 | namespace maluuba
17 | {
18 | namespace speech
19 | {
20 | namespace nodejs
21 | {
22 | class Phone: public node::ObjectWrap // Node.js object wrapper holding a speech::Phone value.
23 | {
24 | public:
25 | static void Init(v8::Local exports); // NOTE(review): presumably registers the class on the exports object — definition not visible here.
26 | static v8::Local constructor(v8::Isolate* isolate); // Returns a local V8 handle for the given isolate — presumably derived from s_constructor; TODO confirm.
27 |
28 | Phone(speech::Phone phone); // Takes the phone to wrap by value.
29 | const speech::Phone& phone() const; // Read-only access to a speech::Phone — presumably m_phone.
30 |
31 | private:
32 | static void New(const v8::FunctionCallbackInfo& args); // V8 function callback; presumably the JavaScript constructor entry point — TODO confirm.
33 | static v8::Persistent s_constructor; // Persistent V8 handle held as a static member.
34 | speech::Phone m_phone; // The wrapped phone value.
35 | };
36 | }
37 | }
38 | }
39 |
40 | #endif // MALUUBA_SPEECH_NODEJS_PHONE_HPP
41 |
--------------------------------------------------------------------------------
/src/cs/PhoneticMatchingPerfTests/PhoneticMatchingPerfTests.csproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Exe
5 | netcoreapp3.1
6 | x64
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
--------------------------------------------------------------------------------
/src/cs/PhoneticMatchingTests/EnPronouncerTests.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace PhoneticMatchingTests
5 | {
6 | using System;
7 | using Microsoft.PhoneticMatching;
8 | using Microsoft.VisualStudio.TestTools.UnitTesting;
9 |
10 | [TestClass]
11 | public class EnPronouncerTests
12 | {
13 | private readonly EnPronouncer pronouncer = EnPronouncer.Instance; // Never reassigned, hence readonly.
14 |
15 | [TestMethod]
16 | public void GivenPronunciation_ExpectPositiveMatch()
17 | {
18 | var pronunciation = this.pronouncer.Pronounce("This, is a test."); // The IPA for this sentence is pinned below.
19 | Assert.AreEqual("ðɪsɪzətɛst", pronunciation.Ipa);
20 | }
21 |
22 | [TestMethod]
23 | public void GivenNullArgument_ExpectException()
24 | {
25 | Assert.ThrowsException(() =>
26 | {
27 | this.pronouncer.Pronounce(null); // Result is irrelevant; only the throw is asserted.
28 | });
29 | }
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Settings.StyleCop:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | arpabet
5 | fənɛtɪk
6 | ipa
7 | rhotic
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 | False
16 |
17 |
18 |
19 |
20 | False
21 |
22 |
23 |
24 |
25 | True
26 |
27 |
28 |
29 |
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Nlp/Preprocessor/WhiteSpacePreProcessor.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace Microsoft.PhoneticMatching.Nlp.Preprocessor
5 | {
6 | using System;
7 | using System.Text.RegularExpressions;
8 |
9 | ///
10 | /// Pre-processor that trims the query and collapses each run of two or more whitespace characters into a single space.
11 | ///
12 | internal class WhiteSpacePreProcessor : IPreProcessor
13 | {
14 | private readonly Regex pattern = new Regex(@"\s{2,}");
15 |
16 | ///
17 | /// Function to perform the pre-processing. Throws ArgumentNullException when query is null.
18 | ///
19 | /// The string to pre-process.
20 | /// The pre-processed string.
21 | public string PreProcess(string query)
22 | {
23 | if (query == null)
24 | {
25 | throw new ArgumentNullException(nameof(query), "query can't be null"); // (paramName, message): the one-string overload would treat the message as the parameter name.
26 | }
27 |
28 | return this.pattern.Replace(query.Trim(), " ");
29 | }
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/tests/nlp/tokenizer.test.ts:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | import { WhitespaceTokenizer, Token } from "../../ts/nlp";
5 |
6 | function values(tokens: Token[]): string[] { // Projects each token onto its string value.
7 | return tokens.map(({ value }) => value);
8 | }
9 |
10 | describe("WhiteSpaceTokenizer", () => {
11 | const tokenizer = new WhitespaceTokenizer();
12 | // Tokenizes the text and keeps only the token strings.
13 | const split = (text: string): string[] => values(tokenizer.tokenize(text));
14 | // Both sentence fixtures below are expected to produce these words.
15 | const words = ["There", "are", "some", "words,", "here!", "#blessed"];
16 |
17 | test("empty string", () => {
18 | expect(split("")).toEqual([]);
19 | });
20 |
21 | test("no whitespace", () => {
22 | expect(split("example")).toEqual(["example"]);
23 | });
24 |
25 | test("Not ending with spaces", () => {
26 | expect(split(" There are some words, here! #blessed")).toEqual(words);
27 | });
28 |
29 | test("Ends with spaces", () => {
30 | expect(split(" There are some words, here! #blessed ")).toEqual(words);
31 | });
32 | });
33 |
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Nlp/Tokenizer/Token.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace Microsoft.PhoneticMatching.Nlp.Tokenizer
5 | {
6 | ///
7 | /// A substring token of the original string together with its interval location.
8 | ///
9 | public class Token
10 | {
11 | ///
12 | /// Initializes a new instance of the Token class.
13 | ///
14 | /// Value of the token.
15 | /// Interval of the value.
16 | public Token(string value, Interval interval)
17 | {
18 | this.Value = value;
19 | this.Interval = interval;
20 | }
21 |
22 | ///
23 | /// Gets the value of the token.
24 | ///
25 | public string Value { get; private set; }
26 |
27 | ///
28 | /// Gets the interval of the token.
29 | ///
30 | public Interval Interval { get; private set; }
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/src/maluuba/metric.hpp:
--------------------------------------------------------------------------------
1 | /**
2 | * @file
3 | * Distance metrics.
4 | *
5 | * @author Tavian Barnes (tavian.barnes@microsoft.com)
6 | *
7 | * Copyright (c) Microsoft Corporation. All rights reserved.
8 | * Licensed under the MIT License.
9 | */
10 |
11 | #ifndef MALUUBA_METRIC_HPP
12 | #define MALUUBA_METRIC_HPP
13 |
14 | #include
15 | #include
16 |
17 | namespace maluuba
18 | {
19 | /**
20 | * Infers the result type of a distance metric.
21 | */
22 | template
23 | using MetricResult = std::result_of_t;
24 |
25 | /**
26 | * Equality distance metric.
27 | *
28 | * @author Tavian Barnes (tavian.barnes@microsoft.com)
29 | */
30 | class EqualityMetric
31 | {
32 | public:
33 | /**
34 | * Compute the distance between @p t and @p u.
35 | *
36 | * @return 0 if t == u, 1 otherwise.
37 | */
38 | template
39 | int
40 | operator()(const T& t, const U& u) const
41 | {
42 | return t == u ? 0 : 1;
43 | }
44 | };
45 | }
46 |
47 | #endif // MALUUBA_METRIC_HPP
48 |
--------------------------------------------------------------------------------
/tests/distance/stringdistance.test.ts:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | import {StringDistance} from "../../ts/distance"
5 |
6 | test("String distance equality.", () => {
7 | const dist = new StringDistance();
8 | expect(dist.distance("This, is a test.", "This, is a test.")).toBe(0);
9 | });
10 |
11 | test("String distance.", () => {
12 | const dist = new StringDistance();
13 |
14 | expect(dist.distance("aaa", "bbb")).toBe(3);
15 | expect(dist.distance("aaa", "aaa")).toBe(0);
16 | expect(dist.distance("aaa", "aba")).toBe(1);
17 | expect(dist.distance("", "")).toBe(0);
18 | expect(dist.distance("", "aaa")).toBe(3);
19 | expect(dist.distance("aaa", "")).toBe(3);
20 | });
21 |
22 | test("ctor used as function exception.", () => {
23 | expect(() => {
24 | const distance = (StringDistance as any)();
25 | }).toThrow();
26 | });
27 |
28 | test("Distance on undefined exception.", () => {
29 | expect(() => {
30 | const dist = new StringDistance();
31 | dist.distance(undefined, undefined);
32 | }).toThrow();
33 | });
34 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) Microsoft Corporation. All rights reserved.
2 |
3 | MIT License
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Match.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace Microsoft.PhoneticMatching
5 | {
6 | ///
7 | /// A matched element with its distance score.
8 | ///
9 | /// The element type.
10 | public class Match
11 | {
12 | ///
13 | /// Initializes a new instance of the class.
14 | ///
15 | /// the element wrapped
16 | /// the distance with query target
17 | public Match(T element, double distance)
18 | {
19 | this.Element = element;
20 | this.Distance = distance;
21 | }
22 |
23 | ///
24 | /// Gets the element.
25 | ///
26 | public T Element { get; private set; }
27 |
28 | ///
29 | /// Gets the distance with the target matched.
30 | ///
31 | public double Distance { get; private set; }
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/src/maluuba/speech/nodejs/match.hpp:
--------------------------------------------------------------------------------
1 | /**
2 | * @file
3 | * Match wrapped in NodeJS.
4 | *
5 | * Copyright (c) Microsoft Corporation. All rights reserved.
6 | * Licensed under the MIT License.
7 | */
8 |
9 | #ifndef MALUUBA_SPEECH_NODEJS_MATCH_HPP
10 | #define MALUUBA_SPEECH_NODEJS_MATCH_HPP
11 |
12 | #include "maluuba/speech/fuzzymatcher.hpp"
13 | #include
14 | #include
15 |
16 | namespace maluuba
17 | {
18 | namespace speech
19 | {
20 | namespace nodejs
21 | {
22 | class Match: public node::ObjectWrap
23 | {
24 | using NodeJsTarget = v8::UniquePersistent;
25 | using MatchType = speech::FuzzyMatcher::Match;
26 |
27 | public:
28 | static void Init(v8::Local exports);
29 | static v8::Local constructor(v8::Isolate* isolate);
30 |
31 | Match(MatchType match);
32 | const MatchType& match() const;
33 |
34 | private:
35 | static void New(const v8::FunctionCallbackInfo& args);
36 | static v8::Persistent s_constructor;
37 | MatchType m_match;
38 | };
39 | }
40 | }
41 | }
42 |
43 | #endif // MALUUBA_SPEECH_NODEJS_MATCH_HPP
44 |
--------------------------------------------------------------------------------
/src/maluuba/speech/nodejs/enpronouncer.hpp:
--------------------------------------------------------------------------------
1 | /**
2 | * @file
3 | * English Pronouncer wrapped in NodeJS.
4 | *
5 | * Copyright (c) Microsoft Corporation. All rights reserved.
6 | * Licensed under the MIT License.
7 | */
8 |
9 | #ifndef MALUUBA_SPEECH_NODEJS_ENPRONOUNCER_HPP
10 | #define MALUUBA_SPEECH_NODEJS_ENPRONOUNCER_HPP
11 |
12 | #include "maluuba/speech/pronouncer.hpp"
13 | #include
14 | #include
15 |
16 | namespace maluuba
17 | {
18 | namespace speech
19 | {
20 | namespace nodejs
21 | {
22 | class EnPronouncer: public node::ObjectWrap
23 | {
24 | public:
25 | static void Init(v8::Local exports);
26 |
27 | const speech::EnPronouncer& pronouncer() const;
28 |
29 | private:
30 | explicit EnPronouncer(speech::EnPronouncer pronouncer);
31 | ~EnPronouncer();
32 |
33 | static void New(const v8::FunctionCallbackInfo& args);
34 | static void Pronounce(const v8::FunctionCallbackInfo& args);
35 | static v8::Persistent s_constructor;
36 | speech::EnPronouncer m_pronouncer;
37 | };
38 | }
39 | }
40 | }
41 |
42 | #endif // MALUUBA_SPEECH_NODEJS_ENPRONOUNCER_HPP
43 |
--------------------------------------------------------------------------------
/src/maluuba/speech/pronouncer.hpp:
--------------------------------------------------------------------------------
1 | /**
2 | * @file
3 | * Pronouncer.
4 | *
5 | * @author Benedicte Pierrejean
6 | * @author Tavian Barnes (tavian.barnes@microsoft.com)
7 | *
8 | * Copyright (c) Microsoft Corporation. All rights reserved.
9 | * Licensed under the MIT License.
10 | */
11 |
12 | #ifndef MALUUBA_SPEECH_PRONOUNCER_HPP
13 | #define MALUUBA_SPEECH_PRONOUNCER_HPP
14 |
15 | #include "maluuba/speech/pronunciation.hpp"
16 | #include
17 | #include
18 |
19 | namespace maluuba
20 | {
21 | namespace speech
22 | {
23 | class Pronouncer
24 | {
25 | public:
26 | Pronouncer() = default;
27 | virtual ~Pronouncer() = 0;
28 |
29 | Pronouncer(Pronouncer&& other) = default;
30 | Pronouncer& operator=(Pronouncer&& other) = default;
31 | };
32 |
33 | class EnPronouncer: public Pronouncer
34 | {
35 | public:
36 | EnPronouncer();
37 | virtual ~EnPronouncer();
38 |
39 | EnPronouncer(EnPronouncer&& other);
40 | EnPronouncer& operator=(EnPronouncer&& other);
41 |
42 | EnPronunciation pronounce(const std::string& text) const;
43 |
44 | private:
45 | struct Impl;
46 | std::unique_ptr m_impl;
47 | };
48 | }
49 | }
50 |
51 | #endif // MALUUBA_SPEECH_PRONOUNCER_HPP
52 |
--------------------------------------------------------------------------------
/src/cs/PhoneticMatchingTests/Distance/BaseDistanceTester.cs:
--------------------------------------------------------------------------------
1 | //-----------------------------------------------------------------------
2 | //
3 | // Copyright (c) Microsoft Corporation. All rights reserved.
4 | // Licensed under the MIT License.
5 | //
6 | //-----------------------------------------------------------------------
7 |
8 | namespace PhoneticMatchingTests.Distance
9 | {
10 | using System;
11 | using Microsoft.VisualStudio.TestTools.UnitTesting;
12 | using Microsoft.PhoneticMatching.Distance;
13 |
14 | public abstract class BaseDistanceTester where T : class
15 | {
16 | public BaseDistanceTester()
17 | {
18 | this.Distance = this.CreateDistanceOperator();
19 | }
20 |
21 | protected IDistance Distance { get; private set; }
22 |
23 | [TestMethod]
24 | public void GivenNull_ExpectException()
25 | {
26 | Assert.ThrowsException(() =>
27 | {
28 | var dist = this.Distance.Distance(null, null);
29 | });
30 | }
31 |
32 | protected abstract IDistance CreateDistanceOperator();
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/src/maluuba/speech/nodejs/stringdistance.hpp:
--------------------------------------------------------------------------------
1 | /**
2 | * @file
3 | * String Distance wrapped in NodeJS.
4 | *
5 | * Copyright (c) Microsoft Corporation. All rights reserved.
6 | * Licensed under the MIT License.
7 | */
8 |
9 | #ifndef MALUUBA_SPEECH_NODEJS_STRINGDISTANCE_HPP
10 | #define MALUUBA_SPEECH_NODEJS_STRINGDISTANCE_HPP
11 |
12 | #include "maluuba/levenshtein.hpp"
13 | #include
14 | #include
15 |
16 | namespace maluuba
17 | {
18 | namespace speech
19 | {
20 | namespace nodejs
21 | {
22 | class StringDistance: public node::ObjectWrap
23 | {
24 | public:
25 | static void Init(v8::Local exports);
26 | static v8::Local type(v8::Isolate* isolate);
27 |
28 | const LevenshteinDistance<>& distance() const;
29 |
30 | private:
31 | explicit StringDistance(LevenshteinDistance<> distance);
32 | ~StringDistance();
33 |
34 | static void New(const v8::FunctionCallbackInfo& args);
35 | static void Distance(const v8::FunctionCallbackInfo& args);
36 | static v8::Persistent s_constructor;
37 | static v8::Persistent s_type;
38 | LevenshteinDistance<> m_distance;
39 | };
40 | }
41 | }
42 | }
43 |
44 | #endif // MALUUBA_SPEECH_NODEJS_STRINGDISTANCE_HPP
45 |
--------------------------------------------------------------------------------
/src/maluuba/speech/csharp/csharp.hpp:
--------------------------------------------------------------------------------
1 | /**
2 | * @file
3 | * Macro to export symbols.
4 | *
5 | * Copyright (c) Microsoft Corporation. All rights reserved.
6 | * Licensed under the MIT License.
7 | */
8 |
9 | #ifndef MALUUBA_SPEECH_CSHARP_CSHARP_HPP
10 | #define MALUUBA_SPEECH_CSHARP_CSHARP_HPP
11 |
12 |
13 | #if defined _WIN32 || defined __CYGWIN__
14 | #define STDCALL __stdcall
15 | #if 1 //def BUILDING_DLL
16 | #ifdef __GNUC__
17 | #define DLL_PUBLIC __attribute__ ((dllexport))
18 | #else
19 | #define DLL_PUBLIC __declspec(dllexport) // Note: gcc also seems to support this syntax.
20 | #endif
21 | #else
22 | #ifdef __GNUC__
23 | #define DLL_PUBLIC __attribute__ ((dllimport))
24 | #else
25 | #define DLL_PUBLIC __declspec(dllimport) // Note: gcc also seems to support this syntax.
26 | #endif
27 | #endif // BUILDING_DLL
28 | #define DLL_LOCAL
29 | #else
30 | #define STDCALL
31 | #if __GNUC__ >= 4
32 | #define DLL_PUBLIC __attribute__ ((visibility ("default")))
33 | #define DLL_LOCAL __attribute__ ((visibility ("hidden")))
34 | #else
35 | #define DLL_PUBLIC
36 | #define DLL_LOCAL
37 | #endif // __GNUC__ >= 4
38 | #endif // defined _WIN32 || defined __CYGWIN__
39 |
40 | #endif // MALUUBA_SPEECH_CSHARP_CSHARP_HPP
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Distance/DistanceInput.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace Microsoft.PhoneticMatching.Distance
5 | {
6 | using System;
7 |
8 | ///
9 | /// Input object for distance computation. Holds the text and the pronunciation of that text.
10 | ///
11 | public class DistanceInput
12 | {
13 | ///
14 | /// Initializes a new instance of the class.
15 | ///
16 | /// the text to compute distance on
17 | /// the pronunciation to compute distance on
18 | public DistanceInput(string phrase, EnPronunciation pronunciation)
19 | {
20 | this.Phrase = phrase;
21 | this.Pronunciation = pronunciation;
22 | }
23 |
24 | ///
25 | /// Gets the text to compute distance on.
26 | ///
27 | public string Phrase { get; private set; }
28 |
29 | ///
30 | /// Gets the pronunciation to compute distance on.
31 | ///
32 | public EnPronunciation Pronunciation { get; private set; }
33 | }
34 | }
35 |
--------------------------------------------------------------------------------
/src/maluuba/speech/nodejs/enhybriddistance.hpp:
--------------------------------------------------------------------------------
1 | /**
2 | * @file
3 | * English Phonetic + String Distance wrapped in NodeJS.
4 | *
5 | * Copyright (c) Microsoft Corporation. All rights reserved.
6 | * Licensed under the MIT License.
7 | */
8 |
9 | #ifndef MALUUBA_SPEECH_NODEJS_ENHYBRIDDISTANCE_HPP
10 | #define MALUUBA_SPEECH_NODEJS_ENHYBRIDDISTANCE_HPP
11 |
12 | #include "maluuba/speech/hybriddistance.hpp"
13 | #include
14 | #include
15 |
16 | namespace maluuba
17 | {
18 | namespace speech
19 | {
20 | namespace nodejs
21 | {
22 | class EnHybridDistance: public node::ObjectWrap
23 | {
24 |
25 | public:
26 | static void Init(v8::Local exports);
27 | static v8::Local type(v8::Isolate* isolate);
28 |
29 | const speech::HybridDistance<>& distance() const;
30 |
31 | private:
32 | explicit EnHybridDistance(speech::HybridDistance<> distance);
33 | ~EnHybridDistance();
34 |
35 | static void New(const v8::FunctionCallbackInfo& args);
36 | static void Distance(const v8::FunctionCallbackInfo& args);
37 | static v8::Persistent s_constructor;
38 | static v8::Persistent s_type;
39 | speech::HybridDistance<> m_distance;
40 | };
41 | }
42 | }
43 | }
44 |
45 | #endif // MALUUBA_SPEECH_NODEJS_ENHYBRIDDISTANCE_HPP
46 |
--------------------------------------------------------------------------------
/src/maluuba/speech/nodejs/enphoneticdistance.hpp:
--------------------------------------------------------------------------------
1 | /**
2 | * @file
3 | * English Phonetic Distance wrapped in NodeJS.
4 | *
5 | * Copyright (c) Microsoft Corporation. All rights reserved.
6 | * Licensed under the MIT License.
7 | */
8 |
9 | #ifndef MALUUBA_SPEECH_NODEJS_ENPHONETICDISTANCE_HPP
10 | #define MALUUBA_SPEECH_NODEJS_ENPHONETICDISTANCE_HPP
11 |
12 | #include "maluuba/speech/phoneticdistance.hpp"
13 | #include
14 | #include
15 |
16 | namespace maluuba
17 | {
18 | namespace speech
19 | {
20 | namespace nodejs
21 | {
22 | class EnPhoneticDistance: public node::ObjectWrap
23 | {
24 | public:
25 | static void Init(v8::Local exports);
26 | static v8::Local type(v8::Isolate* isolate);
27 |
28 | const speech::EnPhoneticDistance& distance() const;
29 |
30 | private:
31 | explicit EnPhoneticDistance(speech::EnPhoneticDistance distance);
32 | ~EnPhoneticDistance();
33 |
34 | static void New(const v8::FunctionCallbackInfo& args);
35 | static void Distance(const v8::FunctionCallbackInfo& args);
36 | static v8::Persistent s_constructor;
37 | static v8::Persistent s_type;
38 | speech::EnPhoneticDistance m_distance;
39 | };
40 | }
41 | }
42 | }
43 |
44 | #endif // MALUUBA_SPEECH_NODEJS_ENPHONETICDISTANCE_HPP
45 |
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Matchers/Target.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace Microsoft.PhoneticMatching.Matchers
5 | {
6 | ///
7 | /// Target of a matcher
8 | ///
9 | /// Type of the target
10 | public class Target
11 | {
12 | ///
13 | /// Initializes a new instance of the class.
14 | ///
15 | /// Target value
16 | /// Target phrase
17 | /// Target identifier
18 | public Target(T value, string phrase, int id)
19 | {
20 | this.Value = value;
21 | this.Phrase = phrase;
22 | this.Id = id;
23 | }
24 |
25 | ///
26 | /// Gets the Target element value.
27 | ///
28 | public T Value { get; private set; }
29 |
30 | ///
31 | /// Gets the Target element phrase.
32 | ///
33 | public string Phrase { get; private set; }
34 |
35 | ///
36 | /// Gets the Target element identifier.
37 | ///
38 | public int Id { get; private set; }
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Nlp/Tokenizer/Interval.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace Microsoft.PhoneticMatching.Nlp.Tokenizer
5 | {
6 | ///
7 | /// An Interval holds the first and last index bounds.
8 | ///
9 | public class Interval
10 | {
11 | ///
12 | /// Initializes a new instance of the class.
13 | ///
14 | /// Starting index (inclusive).
15 | /// Ending index (exclusive).
16 | public Interval(int first, int last)
17 | {
18 | this.First = first;
19 | this.Last = last;
20 | }
21 |
22 | ///
23 | /// Gets the Starting index (inclusive).
24 | ///
25 | public int First { get; private set; }
26 |
27 | ///
28 | /// Gets the Ending index (exclusive).
29 | ///
30 | public int Last { get; private set; }
31 |
32 | ///
33 | /// Gets the length of the interval (Last - First).
34 | ///
35 | public int Length
36 | {
37 | get
38 | {
39 | return this.Last - this.First;
40 | }
41 | }
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Typescript output
2 | /lib
3 |
4 | # node-gyp build
5 | /build
6 | /bindings
7 |
8 | # gdb outputs
9 | .gdb_history
10 |
11 | # vs files
12 | .vs/
13 | .vscode/
14 |
15 | # Logs
16 | logs
17 | *.log
18 | npm-debug.log*
19 | yarn-debug.log*
20 | yarn-error.log*
21 |
22 | # Runtime data
23 | pids
24 | *.pid
25 | *.seed
26 | *.pid.lock
27 |
28 | # Directory for instrumented libs generated by jscoverage/JSCover
29 | lib-cov
30 |
31 | # Coverage directory used by tools like istanbul
32 | coverage
33 |
34 | # nyc test coverage
35 | .nyc_output
36 |
37 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
38 | .grunt
39 |
40 | # Bower dependency directory (https://bower.io/)
41 | bower_components
42 |
43 | # node-waf configuration
44 | .lock-wscript
45 |
46 | # Compiled binary addons (http://nodejs.org/api/addons.html)
47 | build/Release
48 |
49 | # Dependency directories
50 | node_modules/
51 | jspm_packages/
52 |
53 | # Typescript v1 declaration files
54 | typings/
55 |
56 | # Optional npm cache directory
57 | .npm
58 |
59 | # Optional eslint cache
60 | .eslintcache
61 |
62 | # Optional REPL history
63 | .node_repl_history
64 |
65 | # Output of 'npm pack'
66 | *.tgz
67 |
68 | # Yarn Integrity file
69 | .yarn-integrity
70 |
71 | # dotenv environment variables file
72 | .env
73 |
74 | # .net
75 | bin/
76 | obj/
77 | packages/
78 | *.Cache
79 | *.nupkg
80 | *.csproj.user
81 |
--------------------------------------------------------------------------------
/src/maluuba/speech/nodejs/main.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | #include "maluuba/speech/nodejs/enhybriddistance.hpp"
5 | #include "maluuba/speech/nodejs/enphoneticdistance.hpp"
6 | #include "maluuba/speech/nodejs/enpronouncer.hpp"
7 | #include "maluuba/speech/nodejs/enpronunciation.hpp"
8 | #include "maluuba/speech/nodejs/fuzzymatcher.hpp"
9 | #include "maluuba/speech/nodejs/match.hpp"
10 | // #include "maluuba/speech/nodejs/performance.hpp"
11 | #include "maluuba/speech/nodejs/phone.hpp"
12 | #include "maluuba/speech/nodejs/stringdistance.hpp"
13 | #include
14 |
15 | namespace maluuba
16 | {
17 | namespace speech
18 | {
19 | namespace nodejs
20 | {
21 | namespace
22 | {
23 | void
24 | Init(v8::Local exports, v8::Local module)
25 | {
26 | // Performance::Init(module);
27 | EnHybridDistance::Init(exports);
28 | EnPhoneticDistance::Init(exports);
29 | FuzzyMatcher::Init(exports, "FuzzyMatcher");
30 | FuzzyMatcher::Init(exports, "AcceleratedFuzzyMatcher");
31 | EnPronouncer::Init(exports);
32 | EnPronunciation::Init(exports);
33 | Match::Init(exports);
34 | Phone::Init(exports);
35 | StringDistance::Init(exports);
36 | }
37 | }
38 |
39 | NODE_MODULE(NODE_GYP_MODULE_NAME, Init)
40 | }
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/src/maluuba/speech/nodejs/enpronunciation.hpp:
--------------------------------------------------------------------------------
1 | /**
2 | * @file
3 | * English Pronunciation wrapped in NodeJS.
4 | *
5 | * Copyright (c) Microsoft Corporation. All rights reserved.
6 | * Licensed under the MIT License.
7 | */
8 |
9 | #ifndef MALUUBA_SPEECH_NODEJS_ENPRONUNCIATION_HPP
10 | #define MALUUBA_SPEECH_NODEJS_ENPRONUNCIATION_HPP
11 |
12 | #include "maluuba/speech/pronunciation.hpp"
13 | #include
14 | #include
15 |
16 | namespace maluuba
17 | {
18 | namespace speech
19 | {
20 | namespace nodejs
21 | {
22 | class EnPronunciation: public node::ObjectWrap
23 | {
24 | public:
25 | static void Init(v8::Local exports);
26 | static v8::Local constructor(v8::Isolate* isolate);
27 | static v8::Local type(v8::Isolate* isolate);
28 |
29 | EnPronunciation(speech::EnPronunciation pronunciation);
30 | const speech::EnPronunciation& pronunciation() const;
31 |
32 | private:
33 | static void New(const v8::FunctionCallbackInfo& args);
34 | static void FromIpa(const v8::FunctionCallbackInfo& args);
35 | static void FromArpabet(const v8::FunctionCallbackInfo& args);
36 | static v8::Persistent s_constructor;
37 | static v8::Persistent s_type;
38 | speech::EnPronunciation m_pronunciation;
39 | };
40 | }
41 | }
42 | }
43 |
44 | #endif // MALUUBA_SPEECH_NODEJS_ENPRONUNCIATION_HPP
45 |
--------------------------------------------------------------------------------
/tests/nlp/preprocessor.test.ts:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | import { EnPreProcessor } from "../../ts/nlp";
5 |
6 | describe("EnPreProcessor", () => {
7 | const processor = new EnPreProcessor();
8 |
9 | test("Hi", () => {
10 | // "Híffi"
11 | // "i" is followed by a combining acute accent (U+0301); "ffi" is the single ligature character U+FB03
12 | expect(processor.preProcess("Hi\u0301\uFB03")).toBe("h\u00EDffi");
13 | });
14 |
15 | test("Digits", () => {
16 | expect(processor.preProcess("123 King St")).toBe("123 king st");
17 | expect(processor.preProcess("2 Wildwood Place")).toBe("2 wildwood place");
18 | });
19 |
20 | test("Punctuation", () => {
21 | expect(processor.preProcess("!omg! ch!ll ?how?")).toBe("omg ch ll how");
22 | });
23 |
24 | test("Apostrophe and case", () => {
25 | expect(processor.preProcess("Justin's haus")).toBe("justin s haus");
26 | });
27 |
28 | test("simple tokenization", () => {
29 | expect(processor.preProcess("call mom")).toBe("call mom");
30 | expect(processor.preProcess("call MoM!")).toBe("call mom");
31 | expect(processor.preProcess("*(*&call, MoM! )_+")).toBe("call mom");
32 | expect(processor.preProcess(":call/mom")).toBe("call mom");
33 | expect(processor.preProcess("Call mom.")).toBe("call mom");
34 | expect(processor.preProcess("Call mom .")).toBe("call mom");
35 | expect(processor.preProcess("Call mom .")).toBe("call mom");
36 | });
37 | });
38 |
--------------------------------------------------------------------------------
/src/maluuba/debug.hpp:
--------------------------------------------------------------------------------
1 | /**
2 | * @file
3 | * Debugging utilities.
4 | *
5 | * @author Tavian Barnes (tavian.barnes@microsoft.com)
6 | *
7 | * Copyright (c) Microsoft Corporation. All rights reserved.
8 | * Licensed under the MIT License.
9 | */
10 |
11 | #ifndef MALUUBA_DEBUG_HPP
12 | #define MALUUBA_DEBUG_HPP
13 |
14 | #include
15 | #include
16 | #include
17 | #include
18 |
19 | namespace maluuba
20 | {
21 | /**
22 | * Check that a condition is true, and throw an exception if not.
23 | *
24 | * @tparam E The type of exception to throw.
25 | * @param condition The condition that must hold.
26 | * @param args Arguments to pass to the exception constructor.
27 | * @throws E If !condition.
28 | */
29 | template
30 | void
31 | check(Cond&& condition, Args&&... args)
32 | {
33 | if (!static_cast(condition)) {
34 | throw E(std::forward(args)...);
35 | }
36 | }
37 |
38 | /**
39 | * Check that a condition is true, and throw @c std::logic_error if not.
40 | *
41 | * @param condition The condition that must hold.
42 | * @param message The exception message for failures.
43 | * @throws std::logic_error If !condition.
44 | */
45 | template
46 | void
47 | check_logic(Cond&& condition, const char* message)
48 | {
49 | check(std::forward(condition), message);
50 | }
51 | }
52 |
53 | #endif // MALUUBA_DEBUG_HPP
54 |
--------------------------------------------------------------------------------
/src/cs/PhoneticMatchingTests/Distance/StringDistanceTests.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace PhoneticMatchingTests.Distance
5 | {
6 | using System;
7 | using Microsoft.VisualStudio.TestTools.UnitTesting;
8 | using Microsoft.PhoneticMatching;
9 | using Microsoft.PhoneticMatching.Distance;
10 |
11 | [TestClass]
12 | public class StringDistanceTests : BaseDistanceTester
13 | {
14 | [TestMethod]
15 | public void GivenExactString_ExpectZeroDistance()
16 | {
17 | Assert.AreEqual(0, this.Distance.Distance("This, is a test.", "This, is a test."));
18 | }
19 |
20 | [TestMethod]
21 | public void GivenKnownDistances_ExpectPositiveMatches()
22 | {
23 | const string Aaa = "aaa";
24 | const string Bbb = "bbb";
25 | const string Aba = "aba";
26 |
27 | Assert.AreEqual(3, this.Distance.Distance(Aaa, Bbb));
28 | Assert.AreEqual(0, this.Distance.Distance(Aaa, Aaa));
29 | Assert.AreEqual(1, this.Distance.Distance(Aaa, Aba));
30 | Assert.AreEqual(0, this.Distance.Distance(string.Empty, string.Empty));
31 | Assert.AreEqual(3, this.Distance.Distance(string.Empty, Aaa));
32 | Assert.AreEqual(3, this.Distance.Distance(Aaa, string.Empty));
33 | }
34 |
35 | protected override IDistance CreateDistanceOperator()
36 | {
37 | return new StringDistance();
38 | }
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Matchers/ContactMatcher/ContactMatcherConfig.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace Microsoft.PhoneticMatching.Matchers.ContactMatcher
5 | {
6 | ///
7 | /// Configurations to tweak the accuracy of the contact matcher.
8 | ///
9 | public class ContactMatcherConfig : MatcherConfig
10 | {
11 | ///
12 | /// Initializes a new instance of the class.
13 | ///
14 | /// Weighting trade-off between the phonetic distance and the lexical distance scores.
15 | /// Maximum number of contacts the matcher can return
16 | /// Maximum distance to a match. Normalized to 0 for exact match, 1 for nothing matches
17 | /// Candidate cutoff given by Math.max({best matched distance} * bestDistanceMultiplier, maxDistanceMarginReturns)
18 | /// best distance multiplier
19 | public ContactMatcherConfig(
20 | double phoneticWeightPercentage = 0.7,
21 | int maxReturns = 4,
22 | double findThreshold = 0.35,
23 | double maxDistanceMarginReturns = 0.02,
24 | double bestDistanceMultiplier = 1.1)
25 | : base(phoneticWeightPercentage, maxReturns, findThreshold, maxDistanceMarginReturns, bestDistanceMultiplier)
26 | {
27 | }
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Nlp/Preprocessor/ChainedRuleBasedPreProcessor.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace Microsoft.PhoneticMatching.Nlp.Preprocessor
5 | {
6 | using System;
7 | using System.Collections.Generic;
8 | using System.Text.RegularExpressions;
9 |
10 | ///
11 | /// Pre-processes by applying a list of rules sequentially. First rules added are applied first.
12 | ///
13 | public class ChainedRuleBasedPreProcessor : IPreProcessor
14 | {
15 | private readonly List> rules = new List>();
16 |
17 | ///
18 | /// Function to perform the pre-processing.
19 | ///
20 | /// The string to pre-process.
21 | /// The pre-processed string.
22 | public string PreProcess(string query)
23 | {
24 | string result = query;
25 | foreach (var rule in this.rules)
26 | {
27 | result = rule.Item1.Replace(result, rule.Item2);
28 | }
29 |
30 | return result;
31 | }
32 |
33 | ///
34 | /// Add a replacement rule
35 | ///
36 | /// Pattern to replace
37 | /// String to replace with.
38 | public void AddRule(Regex pattern, string replacement)
39 | {
40 | this.rules.Add(new Tuple(pattern, replacement));
41 | }
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Matchers/PlaceMatcher/PlaceMatcherConfig.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace Microsoft.PhoneticMatching.Matchers.PlaceMatcher
5 | {
6 | using System;
7 |
8 | ///
9 | /// Configurations to tweak the accuracy of the place matcher.
10 | ///
11 | public class PlaceMatcherConfig : MatcherConfig
12 | {
13 | ///
14 | /// Initializes a new instance of the class.
15 | ///
16 | /// Weighting trade-off between the phonetic distance and the lexical distance scores.
17 | /// Maximum number of places the matcher can return
18 | /// Maximum distance to a match. Normalized to 0 for exact match, 1 for nothing matches
19 | /// Candidate cutoff given by Math.max({best matched distance} * bestDistanceMultiplier, maxDistanceMarginReturns)
20 | /// best distance multiplier
21 | public PlaceMatcherConfig(
22 | double phoneticWeightPercentage = 0.7,
23 | int maxReturns = 8,
24 | double findThreshold = 0.35,
25 | double maxDistanceMarginReturns = 0.02,
26 | double bestDistanceMultiplier = 1.1)
27 | : base(phoneticWeightPercentage, maxReturns, findThreshold, maxDistanceMarginReturns, bestDistanceMultiplier)
28 | {
29 | }
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/src/cs/PhoneticMatchingTests/Settings.StyleCop:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | rhotic
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 | False
13 |
14 |
15 |
16 |
17 | False
18 |
19 |
20 |
21 |
22 | False
23 |
24 |
25 |
26 |
27 | False
28 |
29 |
30 |
31 |
32 | True
33 |
34 |
35 |
36 |
37 |
38 |
39 | False
40 |
41 |
42 |
43 |
44 |
45 |
46 |
--------------------------------------------------------------------------------
/src/cs/PhoneticMatchingPerfTests/Settings.StyleCop:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | rhotic
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 | False
13 |
14 |
15 |
16 |
17 | False
18 |
19 |
20 |
21 |
22 | False
23 |
24 |
25 |
26 |
27 | False
28 |
29 |
30 |
31 |
32 | True
33 |
34 |
35 |
36 |
37 |
38 |
39 | False
40 |
41 |
42 |
43 |
44 |
45 |
46 |
--------------------------------------------------------------------------------
/src/maluuba/unicode.hpp:
--------------------------------------------------------------------------------
1 | /**
2 | * @file
3 | * Utilities for working with Unicode text.
4 | *
5 | * @author Tavian Barnes (tavian.barnes@microsoft.com)
6 | *
7 | * Copyright (c) Microsoft Corporation. All rights reserved.
8 | * Licensed under the MIT License.
9 | */
10 |
11 | #ifndef MALUUBA_UNICODE_HPP
12 | #define MALUUBA_UNICODE_HPP
13 |
14 | #include "maluuba/xtd/string_view.hpp"
15 | #include
16 |
17 | namespace maluuba
18 | {
19 |
20 | template
21 | String unicode_cast(const xtd::string_view utf8);
22 |
23 | template
24 | String unicode_cast(const xtd::u16string_view utf16);
25 |
26 | /**
27 | * No-op conversion for UTF-8.
28 | *
29 | * @param utf8 A UTF-8 encoded string.
30 | * @return The equivalent UTF-8 encoded string.
31 | */
32 | template <>
33 | std::string unicode_cast(const xtd::string_view utf8);
34 |
35 | /**
36 | * Convert the given UTF-16 encoded string to UTF-8.
37 | *
38 | * @param utf16 A UTF-16 encoded string.
39 | * @return The equivalent UTF-8 encoded string.
40 | */
41 | template <>
42 | std::string unicode_cast(const xtd::u16string_view utf16);
43 |
44 | /**
45 | * Convert the given UTF-8 encoded string to UTF-16.
46 | *
47 | * @param utf8 A UTF-8 encoded string.
48 | * @return The equivalent UTF-16 encoded string.
49 | */
50 | template <>
51 | std::u16string unicode_cast(const xtd::string_view utf8);
52 |
53 | /**
54 | * No-op conversion for UTF-16.
55 | *
56 | * @param utf16 A UTF-16 encoded string.
57 | * @return The equivalent UTF-16 encoded string.
58 | */
59 | template <>
60 | std::u16string unicode_cast(const xtd::u16string_view utf16);
61 | }
62 |
63 | #endif // MALUUBA_UNICODE_HPP
64 |
--------------------------------------------------------------------------------
/src/maluuba/speech/pronunciation/pronunciation.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | #include "maluuba/speech/pronunciation.hpp"
5 | #include "maluuba/unicode.hpp"
6 | #include
7 |
8 | namespace maluuba
9 | {
10 | namespace speech
11 | {
12 | Pronunciation::Pronunciation() = default;
13 |
14 | Pronunciation::~Pronunciation() = default;
15 |
16 | Pronunciation::iterator
17 | Pronunciation::begin() const
18 | {
19 | return m_phones.begin();
20 | }
21 |
22 | Pronunciation::iterator
23 | Pronunciation::end() const
24 | {
25 | return m_phones.end();
26 | }
27 |
28 | bool
29 | Pronunciation::empty() const
30 | {
31 | return m_phones.empty();
32 | }
33 |
34 | Pronunciation::size_type
35 | Pronunciation::size() const
36 | {
37 | return m_phones.size();
38 | }
39 |
40 | std::string
41 | Pronunciation::to_ipa() const
42 | {
43 | return unicode_cast(m_ipa);
44 | }
45 |
46 | EnPronunciation::~EnPronunciation() = default;
47 |
48 | EnPronunciation::EnPronunciation(const EnPronunciation& other) = default;
49 |
50 | EnPronunciation::EnPronunciation(EnPronunciation&& other) = default;
51 |
52 | EnPronunciation&
53 | EnPronunciation::operator=(const EnPronunciation& other) = default;
54 |
55 | EnPronunciation&
56 | EnPronunciation::operator=(EnPronunciation&& other) = default;
57 |
58 | EnPronunciation::EnPronunciation()
59 | : Pronunciation{}
60 | { }
61 |
62 | EnPronunciation::EnPronunciation(std::u16string ipa)
63 | : Pronunciation{ipa}
64 | { }
65 |
66 | std::string
67 | to_string(const Pronunciation& pron)
68 | {
69 | return pron.to_ipa();
70 | }
71 |
72 | std::ostream&
73 | operator<<(std::ostream& stream, const Pronunciation& pron)
74 | {
75 | return stream << to_string(pron);
76 | }
77 | }
78 | }
79 |
--------------------------------------------------------------------------------
/src/maluuba/unicode/unicode.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | #include "maluuba/unicode.hpp"
5 | #include
6 | #include
7 |
8 | namespace maluuba
9 | {
10 | template <>
11 | std::string
12 | unicode_cast(const xtd::string_view utf8)
13 | {
14 | return std::string{utf8};
15 | }
16 |
17 | // https://stackoverflow.com/a/35103224
18 | #if MALUUBA_CODECVT_BUG
19 | template <>
20 | std::string
21 | unicode_cast(const xtd::u16string_view utf16)
22 | {
23 | std::wstring_convert, int16_t> convertor;
24 | auto p = reinterpret_cast(utf16.data());
25 | return convertor.to_bytes(p, p + utf16.size());
26 | }
27 |
28 | template <>
29 | std::u16string
30 | unicode_cast(const xtd::string_view utf8)
31 | {
32 | std::wstring_convert, int16_t> convertor;
33 | auto w = convertor.from_bytes(utf8.data(), utf8.data() + utf8.size());
34 | return {reinterpret_cast(w.data()), w.size()};
35 | }
36 | #else
37 | template <>
38 | std::string
39 | unicode_cast(const xtd::u16string_view utf16)
40 | {
41 | std::wstring_convert, char16_t> convertor;
42 | return convertor.to_bytes(utf16.data(), utf16.data() + utf16.size());
43 | }
44 |
45 | template <>
46 | std::u16string
47 | unicode_cast(const xtd::string_view utf8)
48 | {
49 | std::wstring_convert, char16_t> convertor;
50 | return convertor.from_bytes(utf8.data(), utf8.data() + utf8.size());
51 | }
52 | #endif // MALUUBA_CODECVT_BUG
53 |
54 | template <>
55 | std::u16string
56 | unicode_cast(const xtd::u16string_view utf16)
57 | {
58 | return std::u16string{utf16};
59 | }
60 | }
61 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "phoneticmatching",
3 | "version": "0.3.8",
4 | "description": "A text utility to do string comparisons at a phonetic level.",
5 | "main": "./lib/index.js",
6 | "types": "./lib/index.d.ts",
7 | "scripts": {
8 | "install": "node-pre-gyp install --fallback-to-build",
9 | "prepare": "npm run tsc",
10 | "package": "node-pre-gyp package",
11 | "test": "jest --config jestConfig.json test[.]",
12 | "test:debug": "npm test -- --verbose -i",
13 | "test:testsets": "jest --config jestConfig.json testset.spec",
14 | "build-docs": "typedoc --options typedoc.json --tsconfig ts/tsconfig.json ts/",
15 | "tsc": "tsc --project ts/",
16 | "watch": "npm run tsc -- --watch",
17 | "build": "node-pre-gyp build --build-from-source",
18 | "build:debug": "npm run build -- --debug",
19 | "rebuild": "node-pre-gyp rebuild --build-from-source",
20 | "rebuild:debug": "npm run rebuild -- --debug",
21 | "release": "npm run rebuild && npm run tsc && npm run build-docs"
22 | },
23 | "homepage": "https://microsoft.github.io/PhoneticMatching/",
24 | "repository": {
25 | "type": "git",
26 | "url": "https://github.com/Microsoft/PhoneticMatching"
27 | },
28 | "author": "madixon@microsoft.com",
29 | "license": "MIT",
30 | "engines": {
31 | "node": ">=8.11.2"
32 | },
33 | "devDependencies": {
34 | "@types/jest": "^25.2.3",
35 | "@types/node": "^11.15.54",
36 | "@types/xregexp": "^3.0.30",
37 | "jest": "^25.5.4",
38 | "ts-jest": "^25.5.1",
39 | "typedoc": "^0.20.37",
40 | "typescript": "^3.9.10"
41 | },
42 | "dependencies": {
43 | "@mapbox/node-pre-gyp": "^1.0.10",
44 | "xregexp": "^4.4.1"
45 | },
46 | "files": [
47 | "binding.gyp",
48 | "lib/",
49 | "src/",
50 | "!src/cs/"
51 | ],
52 | "binary": {
53 | "module_name": "maluubaspeech",
54 | "module_path": "./bindings/lib/",
55 | "package_name": "{module_name}-{node_abi}-{platform}-{arch}.tar.gz",
56 | "remote_path": "{version}",
57 | "host": "https://github.com/Microsoft/PhoneticMatching/releases/download/"
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/ts/matchers/matcherconfig.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * @file
3 | * Matcher config.
4 | *
5 | * Copyright (c) Microsoft Corporation. All rights reserved.
6 | * Licensed under the MIT License.
7 | */
8 |
9 | /**
10 | * Configurations to tweak the accuracy of a matcher.
11 | *
12 | * @export
13 | * @class MatcherConfig
14 | */
export class MatcherConfig {
    /**
     * Between 0 and 1. Weighting trade-off between the phonetic distance and the
     * lexical distance scores: 1 means 100% phonetic score and 0% lexical score.
     */
    public readonly phoneticWeightPercentage: number;
    /** The maximum number of results the matcher can return. */
    public maxReturns: number;
    /**
     * The maximum distance to a match. Normalized to 0 for an exact match and 1 for
     * nothing matching; can be >1 if the lengths do not match.
     */
    public findThreshold: number;
    /**
     * Floor of the candidate cutoff, which is given by
     * Math.max({best matched distance} * bestDistanceMultiplier, maxDistanceMarginReturns).
     */
    public maxDistanceMarginReturns: number;
    /** Multiplier applied to the best matched distance when computing the candidate cutoff. */
    public bestDistanceMultiplier: number;

    /**
     * Creates an instance of MatcherConfig.
     *
     * @param phoneticWeightPercentage Between 0 and 1. Weighting trade-off between the phonetic
     *      distance and the lexical distance scores. 1 meaning 100% phonetic score and 0% lexical score.
     * @param maxReturns The maximum number of results the matcher can return.
     * @param findThreshold The maximum distance to a match. Normalized to 0 for exact match,
     *      1 for nothing matches. Can be >1 if the lengths do not match.
     * @param maxDistanceMarginReturns Candidate cutoff given by
     *      Math.max({best matched distance} * bestDistanceMultiplier, maxDistanceMarginReturns).
     * @param bestDistanceMultiplier Multiplier applied to the best matched distance in the cutoff above.
     * @throws {TypeError} If phoneticWeightPercentage is not a number within [0, 1].
     * @memberof MatcherConfig
     */
    constructor(
        phoneticWeightPercentage: number,
        maxReturns: number,
        findThreshold: number,
        maxDistanceMarginReturns: number,
        bestDistanceMultiplier: number) {
        // Expressed as a negated "in range" test so that NaN and undefined, for which
        // every comparison is false, are rejected instead of silently accepted.
        if (!(phoneticWeightPercentage >= 0 && phoneticWeightPercentage <= 1)) {
            throw new TypeError("require 0 <= phoneticWeightPercentage <= 1");
        }

        this.phoneticWeightPercentage = phoneticWeightPercentage;
        this.maxReturns = maxReturns;
        this.findThreshold = findThreshold;
        this.maxDistanceMarginReturns = maxDistanceMarginReturns;
        this.bestDistanceMultiplier = bestDistanceMultiplier;
    }
}
--------------------------------------------------------------------------------
/src/cs/PhoneticMatchingTests/NativeResourceWrapperTests.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace PhoneticMatchingTests
5 | {
6 | using System;
7 | using System.Runtime.InteropServices;
8 | using System.Text;
9 | using Microsoft.VisualStudio.TestTools.UnitTesting;
10 | using Microsoft.PhoneticMatching;
11 |
12 | [TestClass]
13 | public class NativeResourceWrapperTests
14 | {
15 | [TestMethod]
16 | public void GivenBufferTooSmall_ExpectErrorCode()
17 | {
18 | TestNativeWrapper.TestBufferTooSmall();
19 | }
20 |
21 | private abstract class TestNativeWrapper : NativeResourceWrapper
22 | {
23 | public static void TestBufferTooSmall()
24 | {
25 | double distance;
26 |
27 | // 2 is obviously too small to contain any error
28 | const int InitialBufferSize = 2;
29 | int bufferSize = InitialBufferSize;
30 | StringBuilder buffer = new StringBuilder(bufferSize);
31 |
32 | // IntPtr.Zero is a null pointer, so the native call is expected to report an error
33 | var code = StringDistance_Distance(IntPtr.Zero, "123", "456", out distance, buffer, ref bufferSize);
34 |
35 | Assert.AreEqual(NativeResult.BufferTooSmall, code);
36 | Assert.IsTrue(bufferSize > InitialBufferSize);
37 | Assert.AreEqual(string.Empty, buffer.ToString());
38 |
39 | // use the new buffer size returned by native
40 | buffer.Capacity = bufferSize;
41 | code = StringDistance_Distance(IntPtr.Zero, "123", "456", out distance, buffer, ref bufferSize);
42 |
43 | Assert.AreEqual(NativeResult.InvalidParameter, code);
44 | Assert.AreEqual("pointer is null", buffer.ToString());
45 | }
46 |
47 | // Random dll import to test
48 | [DllImport("maluubaspeech-csharp.dll")]
49 | private static extern NativeResult StringDistance_Distance(IntPtr ptr, string s1, string s2, out double distance, StringBuilder buffer, ref int bufferSize);
50 | }
51 | }
52 | }
53 |
--------------------------------------------------------------------------------
/tests/distance/enphoneticdistance.test.ts:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | import {EnPronunciation} from "../../ts";
5 | import {EnPhoneticDistance} from "../../ts/distance"
6 |
// Distance from a pronunciation to itself must be zero.
test("English phonetic distance equality.", () => {
    const dist = new EnPhoneticDistance();
    // This, is a test.
    const pron = EnPronunciation.fromIpa("ðɪsɪzətɛst");
    expect(dist.distance(pron, pron)).toBe(0);
});

// Metric properties of the distance on three similar-sounding phrases.
// Every expectation goes through a matcher: a bare `expect(a < b)` asserts nothing.
test("English phonetic distance.", () => {
    const dist = new EnPhoneticDistance();

    // sam pasupalak
    const sam = EnPronunciation.fromIpa("sæmpɑsupələk");
    // santa super black
    const santa = EnPronunciation.fromIpa("sæntəsupɝblæk");
    // samples pollux
    const samples = EnPronunciation.fromIpa("sæmpəlzpɑləks");

    // Check identity of indiscernibles
    expect(dist.distance(sam, sam)).toBe(0.0);
    expect(dist.distance(santa, santa)).toBe(0.0);
    expect(dist.distance(samples, samples)).toBe(0.0);

    // Check symmetry
    expect(dist.distance(sam, santa)).toBe(dist.distance(santa, sam));
    expect(dist.distance(sam, samples)).toBe(dist.distance(samples, sam));
    expect(dist.distance(santa, samples)).toBe(dist.distance(samples, santa));

    // Check triangle inequality
    expect(dist.distance(sam, samples)).toBeLessThan(dist.distance(sam, santa) + dist.distance(santa, samples));
    expect(dist.distance(sam, santa)).toBeLessThan(dist.distance(sam, samples) + dist.distance(samples, santa));
    expect(dist.distance(santa, samples)).toBeLessThan(dist.distance(santa, sam) + dist.distance(sam, samples));

    // Check performance
    expect(dist.distance(sam, santa)).toBeLessThan(dist.distance(sam, samples));
    expect(dist.distance(sam, samples)).toBeLessThan(dist.distance(santa, samples));
});

// The constructor must be invoked with `new`.
test("ctor used as function exception.", () => {
    expect(() => {
        const distance = (EnPhoneticDistance as any)();
    }).toThrow();
});

// Undefined operands are rejected rather than treated as empty pronunciations.
test("Distance on undefined exception.", () => {
    expect(() => {
        const dist = new EnPhoneticDistance();
        dist.distance(undefined, undefined);
    }).toThrow();
});
56 |
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Nlp/Tokenizer/SplittingTokenizer.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace Microsoft.PhoneticMatching.Nlp.Tokenizer
5 | {
6 | using System.Collections.Generic;
7 | using System.Text.RegularExpressions;
8 |
9 | ///
10 | /// Tokenizing base-class that will split on the given RegExp.
11 | ///
12 | public class SplittingTokenizer : ITokenizer
13 | {
14 | private readonly Regex pattern;
15 |
16 | ///
17 | /// Initializes a new instance of the class.
18 | ///
19 | /// Pattern that splits the query when matched.
20 | public SplittingTokenizer(Regex pattern)
21 | {
22 | this.pattern = pattern;
23 | }
24 |
/// <summary>
/// Splits the query at every match of the pattern and returns the non-empty pieces found between matches.
/// </summary>
/// <param name="query">Query to tokenize.</param>
/// <returns>Collection of tokens.</returns>
public IList<Token> Tokenize(string query)
{
    var tokens = new List<Token>();
    int cursor = 0;

    foreach (Match separator in this.pattern.Matches(query))
    {
        if (cursor > separator.Index)
        {
            // A separator starting before the cursor leaves the cursor untouched.
            continue;
        }

        int consumed = separator.Length;
        if (cursor < separator.Index)
        {
            // Text between the cursor and the separator becomes a token.
            var span = new Interval(cursor, separator.Index);
            tokens.Add(new Token(query.Substring(span.First, span.Length), span));
            consumed += span.Length;
        }

        cursor += consumed;
    }

    // Whatever follows the last separator is the final token.
    if (cursor < query.Length)
    {
        var tail = new Interval(cursor, query.Length);
        tokens.Add(new Token(query.Substring(tail.First, tail.Length), tail));
    }

    return tokens;
}
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/src/cs/PhoneticMatchingTests/Nlp/TokenizerTests.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace PhoneticMatchingTests.Nlp
5 | {
6 | using System.Collections.Generic;
7 | using Microsoft.VisualStudio.TestTools.UnitTesting;
8 | using Microsoft.PhoneticMatching.Nlp.Tokenizer;
9 |
10 | [TestClass]
11 | public class TokenizerTests
12 | {
13 | private readonly WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
14 |
15 | [TestMethod]
16 | public void GivenEmptyString_ExpectNoToken()
17 | {
18 | var result = this.tokenizer.Tokenize(string.Empty);
19 | Assert.AreEqual(0, result.Count, "Expect no token for empty query");
20 | }
21 |
22 | [TestMethod]
23 | public void GivenNoWhitespace_ExpectIdentity()
24 | {
25 | const string Query = "example";
26 | var result = this.tokenizer.Tokenize(Query);
27 | Assert.AreEqual(1, result.Count);
28 | Assert.AreEqual(Query, result[0].Value);
29 | }
30 |
31 | [TestMethod]
32 | public void GivenQueryNotEndingWithSpaces_ExpectNoWhitespaceOrEmpty()
33 | {
34 | var result = this.tokenizer.Tokenize(" There are some words, here! #blessed");
35 | var expected = new string[] { "There", "are", "some", "words,", "here!", "#blessed" };
36 | this.AssertTokensAreEquals(expected, result);
37 | }
38 |
39 | [TestMethod]
40 | public void GivenQueryEndingWithSpaces_ExpectNoWhitespaceOrEmpty()
41 | {
42 | var result = this.tokenizer.Tokenize(" There are some words, here! #blessed ");
43 | var expected = new string[] { "There", "are", "some", "words,", "here!", "#blessed" };
44 | this.AssertTokensAreEquals(expected, result);
45 | }
46 |
47 | private void AssertTokensAreEquals(string[] expectedValues, IList tokens)
48 | {
49 | Assert.AreEqual(expectedValues.Length, tokens.Count, "Tokenizer didn't return the expected result.");
50 | for (int idx = 0; idx < expectedValues.Length; ++idx)
51 | {
52 | Assert.AreEqual(expectedValues[idx], tokens[idx].Value);
53 | }
54 | }
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/src/maluuba/speech/nodejs/performance/performance.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | #include "maluuba/speech/nodejs/performance.hpp"
5 |
6 | namespace maluuba
7 | {
8 | namespace speech
9 | {
10 | namespace nodejs
11 | {
12 | v8::Persistent Performance::s_performance;
13 |
14 | void
15 | Performance::Init(v8::Local module)
16 | {
17 | auto isolate = module->GetIsolate();
18 | v8::Local context = isolate->GetCurrentContext();
19 |
20 | auto require = module->Get(v8::String::NewFromUtf8(isolate, "require")).As();
21 | const auto argc = 1;
22 | v8::Local argv[argc] = { v8::String::NewFromUtf8(isolate, "perf_hooks") };
23 | auto perf_hooks = require->Call(context, module, argc, argv).ToLocalChecked().As();
24 | auto performance = perf_hooks->Get(v8::String::NewFromUtf8(isolate, "performance")).As();
25 | s_performance.Reset(isolate, performance);
26 | }
27 |
28 | void
29 | Performance::Mark(const std::string& name)
30 | {
31 | auto isolate = v8::Isolate::GetCurrent();
32 | v8::Local context = isolate->GetCurrentContext();
33 |
34 | auto performance = s_performance.Get(isolate);
35 | auto mark = performance->Get(v8::String::NewFromUtf8(isolate, "mark")).As();
36 | const auto argc = 1;
37 | v8::Local argv[argc] = { v8::String::NewFromUtf8(isolate, name.data()) };
38 | mark->Call(context, performance, argc, argv);
39 | }
40 |
41 | void
42 | Performance::Measure(const std::string& name, const std::string& start_mark, const std::string& end_mark)
43 | {
44 | auto isolate = v8::Isolate::GetCurrent();
45 | v8::Local context = isolate->GetCurrentContext();
46 |
47 | auto performance = s_performance.Get(isolate);
48 | auto measure = performance->Get(v8::String::NewFromUtf8(isolate, "measure")).As();
49 | const auto argc = 3;
50 | v8::Local argv[argc] = { v8::String::NewFromUtf8(isolate, name.data()),
51 | v8::String::NewFromUtf8(isolate, start_mark.data()), v8::String::NewFromUtf8(isolate, end_mark.data()) };
52 | measure->Call(context, performance, argc, argv);
53 | }
54 | }
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/tests/distance/enhybriddistance.test.ts:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | import { EnPronouncer } from "../../ts";
5 | import { DistanceInput, EnHybridDistance } from "../../ts/distance"
6 |
const pronouncer = new EnPronouncer();

/** Pairs a phrase with the pronunciation the English pronouncer produces for it. */
function makeInput(phrase: string): DistanceInput {
    const pronunciation = pronouncer.pronounce(phrase);
    return { phrase, pronunciation };
}

test("English hybrid distance equality.", () => {
    const hybrid = new EnHybridDistance(0.7);
    const left = makeInput("This, is a test.");
    const right = makeInput("This, is a test.");
    expect(hybrid.distance(left, right)).toBe(0);
});

test("English hybrid distance get phoneticWeightPercentage.", () => {
    const hybrid = new EnHybridDistance(0.7);
    expect(hybrid.phoneticWeightPercentage).toBe(0.7);
});

test("English hybrid distance.", () => {
    const hybrid = new EnHybridDistance(0.7);
    const between = (a: string, b: string) => hybrid.distance(makeInput(a), makeInput(b));

    expect(between("aaa", "bbb")).toBeGreaterThan(0);
    expect(between("aaa", "aaa")).toBe(0);
    expect(between("", "")).toBe(0);
});

// The constructor must be invoked with `new`.
test("ctor used as function exception.", () => {
    expect(() => (EnHybridDistance as any)()).toThrow();
});

test("Distance on undefined exception.", () => {
    const attempt = () => {
        const hybrid = new EnHybridDistance(0.7);
        hybrid.distance(undefined, undefined);
    };
    expect(attempt).toThrow();
});

test("Distance on empty objects.", () => {
    const attempt = () => {
        const hybrid = new EnHybridDistance(0.7);
        hybrid.distance({} as any, {} as any);
    };
    expect(attempt).toThrow();
});

test("Distance on empty input.", () => {
    const attempt = () => {
        const hybrid = new EnHybridDistance(0.7);
        hybrid.distance({phrase:"", pronunciation: undefined}, {phrase:"", pronunciation: undefined} as any);
    };
    expect(attempt).toThrow();
});

test("phoneticWeightPercentage undefined.", () => {
    expect(() => new EnHybridDistance(undefined)).toThrow();
});

test("phoneticWeightPercentage out of range.", () => {
    expect(() => new EnHybridDistance(2)).toThrow();
});
71 |
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Matchers/FuzzyMatcher/IFuzzyMatcher.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace Microsoft.PhoneticMatching.Matchers
5 | {
6 | using System;
7 | using System.Collections.Generic;
8 |
9 | ///
10 | /// Fuzzy Matcher interface.
11 | ///
12 | /// The type of the returned matched object.
13 | /// The type of the query object.
14 | public interface IFuzzyMatcher
15 | {
16 | ///
17 | /// Gets the size of the matcher. The number of targets constructed with.
18 | ///
19 | int Count { get; }
20 |
21 | ///
22 | /// Find the nearest element.
23 | ///
24 | /// The search target.
25 | /// The closest match to target, or null if the initial targets list was empty.
26 | Match FindNearest(Extraction query);
27 |
28 | ///
29 | /// Find the __k__ nearest elements.
30 | ///
31 | /// The search target.
32 | /// The maximum number of result to return.
33 | /// The __k__ nearest matches to target.
34 | IList> FindNearest(Extraction query, int count);
35 |
36 | ///
37 | /// Find the nearest element.
38 | ///
39 | /// The search target.
40 | /// The maximum distance to a match.
41 | /// The closest match to target within limit, or null if no match is found.
42 | Match FindNearestWithin(Extraction query, double limit);
43 |
44 | ///
45 | /// Find the __k__ nearest elements.
46 | ///
47 | /// The search target.
48 | /// The maximum distance to a match.
49 | /// The maximum number of result to return.
50 | /// The __k__ nearest matches to target within limit
51 | IList> FindNearestWithin(Extraction query, double limit, int count);
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Nlp/Preprocessor/EnPreProcessor.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace Microsoft.PhoneticMatching.Nlp.Preprocessor
5 | {
6 | using System.Text.RegularExpressions;
7 |
8 | ///
9 | /// English Pre-processor.
10 | ///
11 | public class EnPreProcessor : IPreProcessor
12 | {
13 | ///
14 | /// Rules to apply in chain to the query before pre-processing white spaces. Rules are applied in the order they are added to the collection.
15 | ///
16 | protected readonly ChainedRuleBasedPreProcessor Rules = new ChainedRuleBasedPreProcessor();
17 |
18 | private const string StopWords = "a|an|at|by|el|i|in|la|las|los|my|of|on|san|santa|some|the|with|you";
19 |
20 | // TODO this belongs in native code to provide functionality cross language/platform. Will probably have to use libicu in some way.
21 | private readonly UnicodePreProcessor unicode = new UnicodePreProcessor();
22 | private readonly CaseFoldingPreProcessor caseFold = new CaseFoldingPreProcessor();
23 | private readonly WhiteSpacePreProcessor whitespace = new WhiteSpacePreProcessor();
24 |
25 | ///
26 | /// Initializes a new instance of the class.
27 | ///
28 | public EnPreProcessor()
29 | {
30 | // remove stop words
31 | this.Rules.AddRule(new Regex(string.Format(@"\b({0})\b ?", StopWords)), string.Empty);
32 | this.Rules.AddRule(new Regex(string.Format(@" ?\b({0})\b", StopWords)), string.Empty);
33 |
34 | // clear punctuation
35 | this.Rules.AddRule(new Regex(@"[\p{P}\p{S}]+"), " ");
36 | }
37 |
38 | ///
39 | /// Pre-process a string.
40 | ///
41 | /// The string to pre-process.
42 | /// The pre-processed string.
43 | public string PreProcess(string query)
44 | {
45 | string result = query;
46 | result = this.unicode.PreProcess(result);
47 | result = this.caseFold.PreProcess(result);
48 | result = this.Rules.PreProcess(result);
49 | result = this.whitespace.PreProcess(result);
50 | return result;
51 | }
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/tests/matchers/testsets/soundex.ts:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | import { WhitespaceTokenizer } from "../../../ts/nlp";
5 |
/** Soundex digit for every coded consonant; characters missing from the table have no digit. */
const SOUNDEX_DIGITS = new Map<string, string>([
    ["B", "1"], ["F", "1"], ["P", "1"], ["V", "1"],
    ["C", "2"], ["G", "2"], ["J", "2"], ["K", "2"],
    ["Q", "2"], ["S", "2"], ["X", "2"], ["Z", "2"],
    ["D", "3"], ["T", "3"],
    ["L", "4"],
    ["M", "5"], ["N", "5"],
    ["R", "6"],
]);

/**
 * Maps an upper-case consonant to its Soundex digit.
 *
 * @param c The character to encode.
 * @returns The digit ("1" to "6") for a coded consonant; any other input is returned unchanged.
 */
function soundexNumber(c: string) {
    return SOUNDEX_DIGITS.get(c) ?? c;
}
37 |
/**
 * Encodes a single word as a fixed-length (4 character) Soundex code.
 *
 * @param word The word to encode; callers in this file pass upper-cased text.
 * @returns The first character followed by up to three digits, zero-padded to
 *      four characters, or the empty string for an empty word.
 */
function encodeWord(word: string): string {
    if (word.length === 0) {
        return "";
    }

    const first = word.charAt(0);
    let code = first;
    // Digit of the most recently seen character, used to collapse runs of equal digits.
    let previous = soundexNumber(first);

    for (let pos = 1; pos < word.length; pos++) {
        const letter = word.charAt(pos);

        // Completely ignore H and W
        if (letter === "H" || letter === "W") {
            continue;
        }

        const digit = soundexNumber(letter);
        if (digit === letter) {
            // Not a coded consonant (e.g. a vowel): emit nothing, but reset the run so
            // equal consonants on either side are both encoded (i.e., "SIS" => "22").
            previous = "0";
        } else if (digit !== previous) {
            previous = digit;
            code += digit;
        }
    }

    return code.padEnd(4, "0").slice(0, 4);
}
75 |
76 |
77 | /**
78 | * Modified version of Soundex to apply the original fixed-length Soundex on each word,
79 | * then concatenate those encoded results together.
80 | *
81 | * @abstract
82 | * @class Soundex
83 | */
abstract class Soundex {
    /** Shared tokenizer used to split the input into words. */
    private static readonly tokenizer = new WhitespaceTokenizer();

    /**
     * Upper-cases the text, encodes every token separately and joins the codes with single spaces.
     *
     * @param text The text to encode.
     * @returns The per-word Soundex codes separated by spaces.
     */
    static encode(text: string): string {
        const upperCased = text.toUpperCase();
        const codes = Soundex.tokenizer
            .tokenize(upperCased)
            .map(({ value }) => encodeWord(value));
        return codes.join(" ");
    }
}
92 |
93 | export default Soundex;
94 |
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Matchers/FuzzyMatcher/Normalized/StringFuzzyMatcher.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace Microsoft.PhoneticMatching.Matchers.FuzzyMatcher.Normalized
5 | {
6 | using System;
7 | using System.Collections.Generic;
8 | using FuzzyMatcher;
9 | using PhoneticMatching.Distance;
10 |
11 | ///
12 | /// A string fuzzy matcher which normalizes results based on length of queries. The fuzziness is determined by the provided distance function.
13 | ///
14 | /// The type of the returned matched object.
15 | public class StringFuzzyMatcher : NormalizedFuzzyMatcher
16 | {
17 | ///
18 | /// Initializes a new instance of the class.
19 | ///
20 | /// The set of objects that will be matched against. The order of equal targets is not guaranteed to be preserved.
21 | /// A mapping of the input types to the query(extraction) type. Note that Extraction == string for normalized cases.
22 | /// Whether the fuzzy matcher uses accelerated implementation or not.
23 | public StringFuzzyMatcher(IList targets, Func targetToExtraction = null, bool isAccelerated = true)
24 | {
25 | this.GenericFuzzyMatcher = new FuzzyMatcher(targets, new StringDistance(), targetToExtraction, isAccelerated);
26 | }
27 |
/// <summary>
/// Find the __k__ nearest elements.
/// </summary>
/// <param name="query">The search target.</param>
/// <param name="limit">The maximum distance to a match.</param>
/// <param name="count">The maximum number of result to return.</param>
/// <returns>The __k__ nearest matches to target within limit</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="query"/> is null.</exception>
public override IList<Match<Target>> FindNearestWithin(string query, double limit, int count)
{
    if (query == null)
    {
        // The single-string constructor interprets its argument as the parameter name,
        // so pass the name and the message separately.
        throw new ArgumentNullException(nameof(query), "query can't be null");
    }

    // The query length is passed as the threshold scale for the normalized search.
    double thresholdScale = query.Length;
    return this.FindNearestWithinNormalized(query, limit, count, thresholdScale);
}
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Matchers/FuzzyMatcher/FuzzyMatcher.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace Microsoft.PhoneticMatching.Matchers.FuzzyMatcher
5 | {
6 | using System;
7 | using System.Collections.Generic;
8 | using PhoneticMatching.Distance;
9 |
10 | ///
11 | /// A fuzzy matcher. The fuzziness is determined by the provided distance function.
12 | ///
13 | /// The type of the returned matched object.
14 | /// The type of the query object.
15 | public class FuzzyMatcher : AbstractFuzzyMatcher
16 | {
17 | ///
18 | /// Initializes a new instance of the class. Delegates to the overload below, passing the distance object's Distance method.
19 | ///
20 | /// The set of objects that will be matched against. The order of equal targets is not guaranteed to be preserved.
21 | /// The distance operator. Dereferenced immediately, so it must not be null.
22 | /// A mapping of the input types to the query(extraction) type. Note that Extraction == Pronounceable for the usual case.
23 | /// Whether the fuzzy matcher uses accelerated implementation or not.
24 | public FuzzyMatcher(IList targets, IDistance distance, Func targetToExtraction = null, bool isAccelerated = false)
25 | : this(targets, distance.Distance, targetToExtraction, isAccelerated)
26 | {
27 | }
28 |
29 | ///
30 | /// Initializes a new instance of the class. All arguments are forwarded to the base class constructor.
31 | ///
32 | /// The set of objects that will be matched against. The order of equal targets is not guaranteed to be preserved.
33 | /// The distance delegate.
34 | /// A mapping of the input types to the query(extraction) type. Note that Extraction == Pronounceable for the usual case.
35 | /// Whether the fuzzy matcher uses accelerated implementation or not.
36 | public FuzzyMatcher(IList targets, DistanceFunc distance, Func targetToExtraction = null, bool isAccelerated = false)
37 | : base(isAccelerated, targets, distance, null, targetToExtraction)
38 | {
39 | }
40 | }
41 | }
42 |
--------------------------------------------------------------------------------
/tests/enpronunciation.test.ts:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | import {EnPronunciation, Speech} from "../ts";
5 |
6 | test("From ARPABET.", () => {
7 | const {ipa, phones} = EnPronunciation.fromArpabet(["dh", "ih1", "s", "ih1", "z", "ax0", "t", "eh1", "s", "t"]);
8 | expect(ipa).toBe("ðɪsɪzətɛst"); // lower-case ARPABET symbols are accepted here
9 | expect(phones.length).toBeGreaterThan(0);
10 | });
11 |
12 | test("From IPA.", () => {
13 | const pronunciation = EnPronunciation.fromIpa("ðɪsɪzətɛst");
14 | expect(pronunciation.ipa).toBe("ðɪsɪzətɛst"); // the IPA string round-trips unchanged
15 | expect(pronunciation.phones.length).toBeGreaterThan(0);
16 | });
17 |
18 | test("Phones.", () => {
19 | const pron = EnPronunciation.fromArpabet(["P", "R", "OW0", "N", "AH2", "N", "S", "IY0", "EY1", "SH", "AX0", "N"]);
20 | expect(pron.ipa).toBe("proʊ̯nʌnsieɪ̯ʃən");
21 | expect(pron.phones.length).toBeGreaterThan(3);
22 |
23 | // p
24 | let phone = pron.phones[0];
25 | expect(phone.type).toBe(Speech.PhoneType.CONSONANT);
26 | expect(phone.phonation).toBe(Speech.Phonation.VOICELESS);
27 | expect(phone.place).toBe(Speech.PlaceOfArticulation.BILABIAL);
28 | expect(phone.manner).toBe(Speech.MannerOfArticulation.PLOSIVE);
29 | expect(phone.isSyllabic).toBeFalsy(); // expect() without a matcher asserts nothing, so a matcher is required
30 |
31 | // o
32 | phone = pron.phones[2];
33 | expect(phone.type).toBe(Speech.PhoneType.VOWEL);
34 | expect(phone.phonation).toBe(Speech.Phonation.MODAL);
35 | expect(phone.height).toBe(Speech.VowelHeight.CLOSE_MID);
36 | expect(phone.backness).toBe(Speech.VowelBackness.BACK);
37 | expect(phone.roundedness).toBe(Speech.VowelRoundedness.ROUNDED);
38 | expect(phone.isSyllabic).toBeTruthy();
39 |
40 | // ʊ̯
41 | phone = pron.phones[3];
42 | expect(phone.type).toBe(Speech.PhoneType.VOWEL);
43 | expect(phone.phonation).toBe(Speech.Phonation.MODAL);
44 | expect(phone.height).toBe(Speech.VowelHeight.NEAR_CLOSE);
45 | expect(phone.backness).toBe(Speech.VowelBackness.NEAR_BACK);
46 | expect(phone.roundedness).toBe(Speech.VowelRoundedness.ROUNDED);
47 | expect(phone.isSyllabic).toBeFalsy();
48 | });
49 |
50 | test("Invalid ARPABET character (has space)", () => {
51 | const parsePaddedSymbol = () => EnPronunciation.fromArpabet(["F", "B ", "N", "EH", "T", "IH", "K"]);
52 | // "B " carries a trailing space; the parser is expected to reject it.
53 | expect(parsePaddedSymbol).toThrow("Unrecognized");
54 | });
55 |
56 | test("Object import called as function exception.", () => {
57 | // Invoking the export as a plain function (without `new`) is expected to throw.
58 | const callAsFunction = () => (EnPronunciation as any)();
59 | expect(callAsFunction).toThrow();
60 | });
61 |
62 | test("Object import called as ctor exception.", () => {
63 | // Constructing the export directly with `new` is expected to throw.
64 | const callAsConstructor = () => new (EnPronunciation as any)();
65 | expect(callAsConstructor).toThrow();
66 | });
67 |
--------------------------------------------------------------------------------
/src/maluuba/speech/hybriddistance.hpp:
--------------------------------------------------------------------------------
1 | /**
2 | * @file
3 | * Hybrid distance combining strings and phonemes.
4 | *
5 | * Copyright (c) Microsoft Corporation. All rights reserved.
6 | * Licensed under the MIT License.
7 | */
8 |
9 | #ifndef MALUUBA_SPEECH_HYBRID_DISTANCE_HPP
10 | #define MALUUBA_SPEECH_HYBRID_DISTANCE_HPP
11 |
12 | #include "maluuba/speech/phoneticdistance.hpp"
13 | #include "maluuba/debug.hpp"
14 | #include "maluuba/levenshtein.hpp"
15 |
16 | namespace maluuba
17 | {
18 | namespace speech
19 | {
20 | /**
21 | * Compute a hybrid distance: a weighted blend of a lexical (string) distance and a phonetic distance.
22 | *
23 | * @tparam StringDistance Functor for the lexical distance (the original note names LevenshteinDistance<>, presumably the default; confirm).
24 | * @tparam PhoneticDistance Functor for the phonetic distance; defaults to EnPhoneticDistance.
25 | */
26 | template , typename PhoneticDistance = EnPhoneticDistance>
27 | class HybridDistance
28 | {
29 | public:
30 | /**
31 | * Construct a new Hybrid Distance metric.
32 | *
33 | * @param phonetic_weight_percentage Between 0 and 1. Weighting trade-off between the phonetic
34 | * distance and the lexical distance scores. 1 meaning 100% phonetic score and 0% lexical score.
35 | */
36 | HybridDistance(double phonetic_weight_percentage)
37 | : m_phonetic_weight_percentage{phonetic_weight_percentage}
38 | {
39 | check(m_phonetic_weight_percentage >= 0.0 && m_phonetic_weight_percentage <= 1.0,
40 | "require 0 <= phonetic_weight_percentage <= 1");
41 | }
42 |
43 | /**
44 | * @return The phonetic weight percentage being used.
45 | */
46 | double
47 | phonetic_weight_percentage() const
48 | {
49 | return m_phonetic_weight_percentage;
50 | }
51 |
52 | /**
53 | * @return w * phonetic(a_pronunciation, b_pronunciation) + (1 - w) * string(a_string, b_string), with w the phonetic weight; a term whose weight is zero is not evaluated.
54 | */
55 | template
56 | double operator()(const StringInput& a_string, const PhoneticInput& a_pronunciation, const StringInput& b_string, const PhoneticInput& b_pronunciation) const
57 | {
58 | double string_weight = 0.0;
59 | double phonetic_weight = 0.0;
60 | if (m_phonetic_weight_percentage > 0.0) {
61 | phonetic_weight = m_phonetic_weight_percentage * m_phonetic_distance(a_pronunciation, b_pronunciation);
62 | }
63 | if (m_phonetic_weight_percentage < 1.0) {
64 | string_weight = (1.0 - m_phonetic_weight_percentage) * m_string_distance(a_string, b_string);
65 | }
66 | return phonetic_weight + string_weight;
67 | }
68 |
69 | private:
70 | double m_phonetic_weight_percentage;
71 | StringDistance m_string_distance;
72 | PhoneticDistance m_phonetic_distance;
73 | };
74 | }
75 | }
76 |
77 | #endif // MALUUBA_SPEECH_HYBRID_DISTANCE_HPP
78 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | ###############################################################################
2 | # Set default behavior to automatically normalize line endings.
3 | ###############################################################################
4 | * text=auto
5 |
6 | ###############################################################################
7 | # Set default behavior for command prompt diff.
8 | #
9 | # This is needed for earlier builds of msysgit that do not have it on by
10 | # default for csharp files.
11 | # Note: This is only used by command line
12 | ###############################################################################
13 | #*.cs diff=csharp
14 |
15 | ###############################################################################
16 | # Set the merge driver for project and solution files
17 | #
18 | # Merging from the command prompt will add diff markers to the files if there
19 | # are conflicts (Merging from VS is not affected by the settings below, in VS
20 | # the diff markers are never inserted). Diff markers may cause the following
21 | # file extensions to fail to load in VS. An alternative would be to treat
22 | # these files as binary and thus will always conflict and require user
23 | # intervention with every merge. To do so, just uncomment the entries below
24 | ###############################################################################
25 | #*.sln merge=binary
26 | #*.csproj merge=binary
27 | #*.vbproj merge=binary
28 | #*.vcxproj merge=binary
29 | #*.vcproj merge=binary
30 | #*.dbproj merge=binary
31 | #*.fsproj merge=binary
32 | #*.lsproj merge=binary
33 | #*.wixproj merge=binary
34 | #*.modelproj merge=binary
35 | #*.sqlproj merge=binary
36 | #*.wwaproj merge=binary
37 |
38 | ###############################################################################
39 | # behavior for image files
40 | #
41 | # image files are treated as binary by default.
42 | ###############################################################################
43 | #*.jpg binary
44 | #*.png binary
45 | #*.gif binary
46 |
47 | ###############################################################################
48 | # diff behavior for common document formats
49 | #
50 | # Convert binary document formats to text before diffing them. This feature
51 | # is only available from the command line. Turn it on by uncommenting the
52 | # entries below.
53 | ###############################################################################
54 | #*.doc diff=astextplain
55 | #*.DOC diff=astextplain
56 | #*.docx diff=astextplain
57 | #*.DOCX diff=astextplain
58 | #*.dot diff=astextplain
59 | #*.DOT diff=astextplain
60 | #*.pdf diff=astextplain
61 | #*.PDF diff=astextplain
62 | #*.rtf diff=astextplain
63 | #*.RTF diff=astextplain
64 |
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Security
4 |
5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/).
6 |
7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below.
8 |
9 | ## Reporting Security Issues
10 |
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 |
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report).
14 |
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey).
16 |
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc).
18 |
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 |
21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 | * Full paths of source file(s) related to the manifestation of the issue
23 | * The location of the affected source code (tag/branch/commit or direct URL)
24 | * Any special configuration required to reproduce the issue
25 | * Step-by-step instructions to reproduce the issue
26 | * Proof-of-concept or exploit code (if possible)
27 | * Impact of the issue, including how an attacker might exploit the issue
28 |
29 | This information will help us triage your report more quickly.
30 |
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs.
32 |
33 | ## Preferred Languages
34 |
35 | We prefer all communications to be in English.
36 |
37 | ## Policy
38 |
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd).
40 |
41 |
42 |
--------------------------------------------------------------------------------
/src/maluuba/speech/pronouncer/pronouncer.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | #include "maluuba/speech/pronouncer.hpp"
5 | #include
6 | #include
7 | #include
8 | #include
9 |
10 | namespace maluuba
11 | {
12 | namespace speech
13 | {
14 | namespace
15 | {
16 | cst_utterance*
17 | no_wave_synth(cst_utterance* u)
18 | {
19 | return u;
20 | }
21 |
22 | cst_voice*
23 | no_wave_voice()
24 | {
25 | flite_init();
26 |
27 | cst_voice* v = new_voice();
28 | cst_lexicon* lex;
29 |
30 | v->name = "no_wave_voice";
31 |
32 | // Set up basic values for synthesizing with this voice
33 | usenglish_init(v);
34 | feat_set_string(v->features, "name", "cmu_us_no_wave");
35 |
36 | // Lexicon
37 | lex = cmu_lex_init();
38 | feat_set(v->features, "lexicon", lexicon_val(lex));
39 |
40 | // Post lexical rules
41 | feat_set(v->features, "postlex_func", uttfunc_val(lex->postlex));
42 |
43 | // Waveform synthesis: install no_wave_synth, which hands the utterance back unchanged
44 | feat_set(v->features, "wave_synth_func", uttfunc_val(&no_wave_synth));
45 |
46 | return v; // the caller takes ownership of the returned voice
47 | }
48 | }
49 |
50 | Pronouncer::~Pronouncer() = default;
51 |
52 | using VoiceHandle = std::unique_ptr;
53 |
54 | struct EnPronouncer::Impl
55 | {
56 | VoiceHandle voice;
57 |
58 | Impl()
59 | : voice{no_wave_voice(), delete_voice}
60 | { }
61 | };
62 |
63 | EnPronouncer::EnPronouncer()
64 | : m_impl{std::make_unique()}
65 | { }
66 |
67 | EnPronouncer::~EnPronouncer() = default;
68 |
69 | EnPronouncer::EnPronouncer(EnPronouncer&& other) = default;
70 |
71 | EnPronouncer&
72 | EnPronouncer::operator=(EnPronouncer&& other) = default;
73 |
74 | EnPronunciation
75 | EnPronouncer::pronounce(const std::string& text) const
76 | {
77 | using UtteranceHandle = std::unique_ptr;
78 |
79 | std::vector phonemes;
80 |
81 | auto utt = flite_synth_text(text.c_str(), m_impl->voice.get()); // NOTE(review): utt is used below without a null check; confirm it cannot be NULL
82 | UtteranceHandle utt_handle{utt, delete_utterance}; // releases the utterance on every exit path
83 |
84 | for (auto s = relation_head(utt_relation(utt, "Segment")); s; s = item_next(s)) {
85 | std::string name = item_feat_string(s, "name");
86 | if (name == "pau") { // presumably the pause/silence segment; it is left out of the result
87 | continue;
88 | }
89 |
90 | if (strcmp("+", ffeature_string(s, "ph_vc")) == 0) {
91 | // If the phoneme is a vowel, append its stress value to the symbol
92 | name += ffeature_string(s, "R:SylStructure.parent.stress");
93 | }
94 | phonemes.push_back(std::move(name));
95 | }
96 |
97 | return EnPronunciation::from_arpabet(phonemes); // the collected names are parsed as ARPABET symbols
98 | }
99 | }
100 | }
101 |
--------------------------------------------------------------------------------
/src/maluuba/speech/phoneticdistance.hpp:
--------------------------------------------------------------------------------
1 | /**
2 | * @file
3 | * Phonetic distance.
4 | *
5 | * @author Benedicte Pierrejean
6 | * @author Tavian Barnes (tavian.barnes@microsoft.com)
7 | *
8 | * Copyright (c) Microsoft Corporation. All rights reserved.
9 | * Licensed under the MIT License.
10 | */
11 |
12 | #ifndef MALUUBA_SPEECH_PHONETIC_DISTANCE_HPP
13 | #define MALUUBA_SPEECH_PHONETIC_DISTANCE_HPP
14 |
15 | #include "maluuba/speech/pronunciation.hpp"
16 | #include
17 |
18 | namespace maluuba
19 | {
20 | namespace speech
21 | {
22 | /**
23 | * A phoneme embedded in a metric space for similarity measurement.
24 | */
25 | class PhonemeVector
26 | {
27 | public:
28 | /**
29 | * Initialize a @c PhonemeVector.
30 | *
31 | * @param v The three-dimensional embedding of this phoneme.
32 | * @param syllabic Whether the phoneme is syllabic.
33 | */
34 | PhonemeVector(float v[3], bool syllabic);
35 |
36 | /**
37 | * @return The @p i'th dimension (out of 3) of the vector representation.
38 | */
39 | float operator[](std::size_t i) const;
40 |
41 | /**
42 | * @return Whether this phoneme is syllabic.
43 | */
44 | bool is_syllabic() const;
45 |
46 | private:
47 | friend bool operator==(const PhonemeVector&, const PhonemeVector&);
48 |
49 | float m_v[3];
50 | bool m_syllabic;
51 | };
52 |
53 | bool operator==(const PhonemeVector& lhs, const PhonemeVector& rhs);
54 |
55 | /**
56 | * An entire pronunciation embedded in a metric space.
57 | */
58 | using PronunciationVector = std::vector;
59 |
60 | /**
61 | * Compute the vector representation of a pronunciation for similarity
62 | * measurement.
63 | *
64 | * @param pronunciation The pronunciation to embed.
65 | * @return A metric space embedding of the pronunciation.
66 | */
67 | PronunciationVector phonetic_embedding(const Pronunciation& pronunciation);
68 |
69 | /**
70 | * Compute the phonetic distance between pronunciations.
71 | */
72 | class PhoneticDistance
73 | {
74 | public:
75 | virtual ~PhoneticDistance() = 0;
76 |
77 | protected:
78 | /**
79 | * @return The phonetic distance of phonemes between @p a and @p b.
80 | */
81 | double operator()(const PronunciationVector& a, const PronunciationVector& b) const;
82 | };
83 |
84 | /**
85 | * Compute the phonetic distance between English pronunciations.
86 | */
87 | class EnPhoneticDistance: public PhoneticDistance
88 | {
89 | public:
90 | virtual ~EnPhoneticDistance();
91 |
92 | /**
93 | * @return The phonetic distance between English pronunciations @p a and @p b.
94 | */
95 | double operator()(const EnPronunciation& a, const EnPronunciation& b) const;
96 | };
97 | }
98 | }
99 |
100 | #endif // MALUUBA_SPEECH_PHONETIC_DISTANCE_HPP
101 |
--------------------------------------------------------------------------------
/src/cs/PhoneticMatchingTests/Matchers/BaseContactMatcherTester.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace PhoneticMatchingTests.Matchers
5 | {
6 | public class BaseContactMatcherTester
7 | {
8 | protected readonly string[] TargetStrings =
9 | {
10 | "Andrew Smith",
11 | "Andrew",
12 | "John B",
13 | "John C",
14 | "Jennifer"
15 | };
16 |
17 | protected readonly TestContact[] Targets =
18 | {
19 | new TestContact()
20 | {
21 | FirstName = "Andrew",
22 | LastName = "Smith",
23 | Id = "1234567"
24 | },
25 | new TestContact()
26 | {
27 | FirstName = "Andrew",
28 | LastName = string.Empty,
29 | },
30 | new TestContact()
31 | {
32 | FirstName = "John",
33 | LastName = "B",
34 | Id = "7654321"
35 | },
36 | new TestContact()
37 | {
38 | FirstName = "John",
39 | LastName = "C",
40 | Id = "2222222"
41 | },
42 | new TestContact()
43 | {
44 | FirstName = "Jennifer",
45 | LastName = string.Empty
46 | }
47 | };
48 |
49 | protected class TestContact // contact fixture compared by value (see Equals/GetHashCode)
50 | {
51 | public string FirstName { get; set; }
52 |
53 | public string LastName { get; set; }
54 |
55 | public string Id { get; set; }
56 |
57 | public string FullName
58 | {
59 | get
60 | {
61 | return string.Format("{0} {1}", this.FirstName, this.LastName); // note: leaves a trailing space when LastName is empty
62 | }
63 | }
64 |
65 | public override bool Equals(object obj)
66 | {
67 | if (this == obj) // same instance
68 | {
69 | return true;
70 | }
71 |
72 | if (obj != null)
73 | {
74 | if (obj.GetType() == this.GetType()) // exact runtime type only; a derived type never compares equal
75 | {
76 | var other = (TestContact)obj;
77 | return other.FirstName == this.FirstName &&
78 | other.LastName == this.LastName &&
79 | other.Id == this.Id;
80 | }
81 | }
82 |
83 | return false;
84 | }
85 |
86 | public override int GetHashCode()
87 | {
88 | return (this.FirstName + this.LastName + this.Id).GetHashCode(); // built from the same three fields Equals compares
89 | }
90 |
91 | public override string ToString()
92 | {
93 | return this.FullName;
94 | }
95 | }
96 | }
97 | }
98 |
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Nlp/Preprocessor/EnPlacesPreProcessor.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace Microsoft.PhoneticMatching.Nlp.Preprocessor
5 | {
6 | using System.Text.RegularExpressions;
7 |
8 | ///
9 | /// English Pre-processor with specific rules for places.
10 | ///
11 | public class EnPlacesPreProcessor : EnPreProcessor
12 | {
13 | ///
14 | /// Initializes a new instance of the class. Registers the place-specific replacement rules listed below.
15 | ///
16 | public EnPlacesPreProcessor()
17 | {
18 | // Cardinal directions (patterns are lower-case; presumably input is case-folded beforehand - confirm)
19 | this.Rules.AddRule(new Regex(@"\be\b"), "east");
20 | this.Rules.AddRule(new Regex(@"\bn\b"), "north");
21 | this.Rules.AddRule(new Regex(@"\bs\b"), "south");
22 | this.Rules.AddRule(new Regex(@"\bw\b"), "west");
23 |
24 | this.Rules.AddRule(new Regex(@"\bne\b"), "north east");
25 | this.Rules.AddRule(new Regex(@"\bnw\b"), "north west");
26 | this.Rules.AddRule(new Regex(@"\bse\b"), "south east");
27 | this.Rules.AddRule(new Regex(@"\bsw\b"), "south west");
28 |
29 | // Address Abbreviations
30 | // Word boundary doesn't work after the "." so we need a look-ahead for whitespace, punctuation, a symbol, or end of input.
31 | this.Rules.AddRule(new Regex(@"\baly\.?(?=[\s\p{P}\p{S}]|$)"), "alley");
32 | this.Rules.AddRule(new Regex(@"\bave?\.?(?=[\s\p{P}\p{S}]|$)"), "avenue");
33 | this.Rules.AddRule(new Regex(@"\bblvd\.?(?=[\s\p{P}\p{S}]|$)"), "boulevard");
34 | this.Rules.AddRule(new Regex(@"\bbnd\.?(?=[\s\p{P}\p{S}]|$)"), "bend");
35 | this.Rules.AddRule(new Regex(@"\bcres\.?(?=[\s\p{P}\p{S}]|$)"), "crescent");
36 | this.Rules.AddRule(new Regex(@"\bcir\.?(?=[\s\p{P}\p{S}]|$)"), "circle");
37 | this.Rules.AddRule(new Regex(@"\bct\.?(?=[\s\p{P}\p{S}]|$)"), "court");
38 | this.Rules.AddRule(new Regex(@"\bdr\.?(?=[\s\p{P}\p{S}]|$)"), "drive");
39 | this.Rules.AddRule(new Regex(@"\best\.?(?=[\s\p{P}\p{S}]|$)"), "estate");
40 | this.Rules.AddRule(new Regex(@"\bln\.?(?=[\s\p{P}\p{S}]|$)"), "lane");
41 | this.Rules.AddRule(new Regex(@"\bpkwy\.?(?=[\s\p{P}\p{S}]|$)"), "parkway");
42 | this.Rules.AddRule(new Regex(@"\bpl\.?(?=[\s\p{P}\p{S}]|$)"), "place");
43 | this.Rules.AddRule(new Regex(@"\brd\.?(?=[\s\p{P}\p{S}]|$)"), "road");
44 |
45 | // Assume "st" at the beginning is for "saint". This rule is registered before the generic "st" rule below.
46 | this.Rules.AddRule(new Regex(@"^st\.?(?=[\s\p{P}\p{S}]|$)"), "saint");
47 |
48 | // If "st" does not occur at the start of the string, then we cannot know if it is for "saint" or "street"; "street" is used.
49 | this.Rules.AddRule(new Regex(@"\bst\.?(?=[\s\p{P}\p{S}]|$)"), "street");
50 | this.Rules.AddRule(new Regex(@"\bxing\.?(?=[\s\p{P}\p{S}]|$)"), "crossing");
51 | }
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/src/cs/PhoneticMatching.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio 15
4 | VisualStudioVersion = 15.0.28010.2003
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Microsoft.PhoneticMatching", "Microsoft.PhoneticMatching\Microsoft.PhoneticMatching.csproj", "{25881C63-77D8-4DB4-B9E4-9537BF8DD182}"
7 | EndProject
8 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "PhoneticMatchingTests", "PhoneticMatchingTests\PhoneticMatchingTests.csproj", "{E78BAD40-0AA8-49D5-B219-111D5645D6E7}"
9 | EndProject
10 | Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "PhoneticMatchingPerfTests", "PhoneticMatchingPerfTests\PhoneticMatchingPerfTests.csproj", "{BA07F296-D5C6-4521-96E5-E764D7419EEE}"
11 | EndProject
12 | Global
13 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
14 | Debug|Any CPU = Debug|Any CPU
15 | Debug|x64 = Debug|x64
16 | Release|Any CPU = Release|Any CPU
17 | Release|x64 = Release|x64
18 | EndGlobalSection
19 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
20 | {25881C63-77D8-4DB4-B9E4-9537BF8DD182}.Debug|Any CPU.ActiveCfg = Debug|x64
21 | {25881C63-77D8-4DB4-B9E4-9537BF8DD182}.Debug|x64.ActiveCfg = Debug|x64
22 | {25881C63-77D8-4DB4-B9E4-9537BF8DD182}.Debug|x64.Build.0 = Debug|x64
23 | {25881C63-77D8-4DB4-B9E4-9537BF8DD182}.Release|Any CPU.ActiveCfg = Release|x64
24 | {25881C63-77D8-4DB4-B9E4-9537BF8DD182}.Release|x64.ActiveCfg = Release|x64
25 | {25881C63-77D8-4DB4-B9E4-9537BF8DD182}.Release|x64.Build.0 = Release|x64
26 | {E78BAD40-0AA8-49D5-B219-111D5645D6E7}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
27 | {E78BAD40-0AA8-49D5-B219-111D5645D6E7}.Debug|Any CPU.Build.0 = Debug|Any CPU
28 | {E78BAD40-0AA8-49D5-B219-111D5645D6E7}.Debug|x64.ActiveCfg = Debug|x64
29 | {E78BAD40-0AA8-49D5-B219-111D5645D6E7}.Debug|x64.Build.0 = Debug|x64
30 | {E78BAD40-0AA8-49D5-B219-111D5645D6E7}.Release|Any CPU.ActiveCfg = Release|Any CPU
31 | {E78BAD40-0AA8-49D5-B219-111D5645D6E7}.Release|Any CPU.Build.0 = Release|Any CPU
32 | {E78BAD40-0AA8-49D5-B219-111D5645D6E7}.Release|x64.ActiveCfg = Release|x64
33 | {E78BAD40-0AA8-49D5-B219-111D5645D6E7}.Release|x64.Build.0 = Release|x64
34 | {BA07F296-D5C6-4521-96E5-E764D7419EEE}.Debug|Any CPU.ActiveCfg = Debug|x64
35 | {BA07F296-D5C6-4521-96E5-E764D7419EEE}.Debug|x64.ActiveCfg = Debug|x64
36 | {BA07F296-D5C6-4521-96E5-E764D7419EEE}.Debug|x64.Build.0 = Debug|x64
37 | {BA07F296-D5C6-4521-96E5-E764D7419EEE}.Release|Any CPU.ActiveCfg = Release|x64
38 | {BA07F296-D5C6-4521-96E5-E764D7419EEE}.Release|Any CPU.Build.0 = Release|x64
39 | {BA07F296-D5C6-4521-96E5-E764D7419EEE}.Release|x64.ActiveCfg = Release|x64
40 | {BA07F296-D5C6-4521-96E5-E764D7419EEE}.Release|x64.Build.0 = Release|x64
41 | EndGlobalSection
42 | GlobalSection(SolutionProperties) = preSolution
43 | HideSolutionNode = FALSE
44 | EndGlobalSection
45 | GlobalSection(ExtensibilityGlobals) = postSolution
46 | SolutionGuid = {0D6EBE7F-D0D5-4B91-B0B5-EFF497F76B6A}
47 | EndGlobalSection
48 | EndGlobal
49 |
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Matchers/FuzzyMatcher/FuzzyMatcherBase.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace Microsoft.PhoneticMatching.Matchers
5 | {
6 | using System;
7 | using System.Runtime.InteropServices;
8 | using System.Text;
9 |
10 | ///
11 | /// Abstract class to define static imports for generic fuzzy matcher.
12 | ///
13 | public abstract class FuzzyMatcherBase : NativeResourceWrapper
14 | {
15 | ///
16 | /// Initializes a new instance of the class.
17 | ///
18 | /// Parameter(s) required to initialize the native object if any.
19 | public FuzzyMatcherBase(params object[] args) : base(args)
20 | {
21 | }
22 |
23 | ///
24 | /// Delegate type passed to native code to access the managed objects using their indexes and compute distance on them.
25 | ///
26 | /// Index of the first managed object
27 | /// Index of the second managed object.
28 | /// The distance between the first and second managed objects.
29 | protected delegate double DistanceDelegate(int firstIdx, int secondIdx);
30 |
31 | [DllImport("maluubaspeech-csharp.dll")]
32 | protected static extern NativeResult FuzzyMatcher_Create(int count, DistanceDelegate distance, bool isAccelerated, out IntPtr fuzzyMatcher, StringBuilder errorMsg, ref int bufferSize);
33 |
34 | [DllImport("maluubaspeech-csharp.dll")]
35 | protected static extern NativeResult FuzzyMatcher_FindNearestWithin(IntPtr native, int count, double limit, [In, Out] int[] nearestIdx, [In, Out] double[] distances, StringBuilder buffer, ref int bufferSize);
36 |
37 | [DllImport("maluubaspeech-csharp.dll")]
38 | protected static extern NativeResult AcceleratedFuzzyMatcher_FindNearestWithin(IntPtr native, int count, double limit, [In, Out] int[] nearestIdx, [In, Out] double[] distances, StringBuilder buffer, ref int bufferSize);
39 |
40 | ///
41 | /// Delete the native pointer using the type specified in native bindings.
42 | ///
43 | /// Pointer to the native object.
44 | /// Buffer for any error message
45 | /// Size of the buffer, to be adjusted if error doesn't fit the current size.
46 | /// The result code from native library.
47 | protected override NativeResult NativeDelete(IntPtr native, StringBuilder buffer, ref int bufferSize)
48 | {
49 | return FuzzyMatcher_Delete(native, buffer, ref bufferSize);
50 | }
51 |
52 | [DllImport("maluubaspeech-csharp.dll")]
53 | private static extern NativeResult FuzzyMatcher_Delete(IntPtr ptr, StringBuilder buffer, ref int bufferSize);
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Matchers/FuzzyMatcher/Normalized/EnPhoneticFuzzyMatcher.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace Microsoft.PhoneticMatching.Matchers.FuzzyMatcher.Normalized
5 | {
6 | using System;
7 | using System.Collections.Generic;
8 | using PhoneticMatching.Distance;
9 |
10 | ///
11 | /// An English pronunciation fuzzy matcher which normalizes results based on length of queries. The fuzziness is determined by the provided distance function.
12 | ///
13 | /// The type of the returned matched object.
14 | public class EnPhoneticFuzzyMatcher : NormalizedFuzzyMatcher
15 | {
16 | private EnPronouncer pronouncer = EnPronouncer.Instance;
17 |
18 | ///
19 | /// Initializes a new instance of the class.
20 | ///
21 | /// The set of objects that will be matched against. The order of equal targets is not guaranteed to be preserved.
22 | /// A mapping of the input types to the phrase (string) that gets pronounced. When null, each target itself must be a string.
23 | /// Whether the fuzzy matcher uses accelerated implementation or not.
24 | public EnPhoneticFuzzyMatcher(IList targets, Func targetToExtractionPhrase = null, bool isAccelerated = true)
25 | {
26 | Func targetToExtraction = (target) =>
27 | {
28 | string phrase = targetToExtractionPhrase == null ? target as string : targetToExtractionPhrase(target);
29 | if (phrase == null) // also reached when a supplied targetToExtractionPhrase returns null
30 | {
31 | throw new InvalidCastException($"Can't cast Target type [{typeof(Target)}] to Extraction type [string]. You must provide a conversion function 'targetToExtractionPhrase'.");
32 | }
33 |
34 | return this.pronouncer.Pronounce(phrase);
35 | };
36 | this.GenericFuzzyMatcher = new FuzzyMatcher(targets, new EnPhoneticDistance(), targetToExtraction, isAccelerated);
37 | }
38 |
39 | ///
40 | /// Find the __k__ nearest elements.
41 | ///
42 | /// The search target. It is pronounced first and the resulting pronunciation is what gets matched.
43 | /// The maximum distance to a match.
44 | /// The maximum number of results to return.
45 | /// The __k__ nearest matches to target within limit
46 | public override IList> FindNearestWithin(string query, double limit, int count)
47 | {
48 | var pronunciation = this.pronouncer.Pronounce(query); // NOTE(review): query is not null-checked here; confirm Pronounce rejects null
49 | double thresholdScale = pronunciation.Ipa.Length; // the IPA string length is handed on as the normalization scale
50 | return this.FindNearestWithinNormalized(pronunciation, limit, count, thresholdScale);
51 | }
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/src/maluuba/speech/nodejs/match/match.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | #include "maluuba/speech/nodejs/match.hpp"
5 | #include
6 |
7 | namespace maluuba
8 | {
9 | namespace speech
10 | {
11 | namespace nodejs
12 | {
13 | namespace // file-local accessor callbacks; Match::Init installs them on the instance template
14 | {
15 | void // Getter for the "distance" property: returns the wrapped match's distance as a JS number.
16 | getDistance(v8::Local property, const v8::PropertyCallbackInfo& info)
17 | {
18 | auto isolate = info.GetIsolate();
19 | auto obj = node::ObjectWrap::Unwrap(info.Holder());
20 | auto distance = obj->match().distance();
21 | info.GetReturnValue().Set(v8::Number::New(isolate, distance));
22 | }
23 |
24 | void // Getter for the "element" property: returns the element handle stored in the wrapped match.
25 | getElement(v8::Local property, const v8::PropertyCallbackInfo& info)
26 | {
27 | auto isolate = info.GetIsolate();
28 | auto obj = node::ObjectWrap::Unwrap(info.Holder());
29 | auto element = obj->match().element().Get(isolate); // Get(isolate) turns the stored handle into a local one
30 | info.GetReturnValue().Set(element);
31 | }
32 |
33 | void // Setter shared by both properties: always throws, so assignments from JS never change the object.
34 | setThrow(v8::Local property, v8::Local value, const v8::PropertyCallbackInfo& info)
35 | {
36 | auto isolate = info.GetIsolate();
37 | isolate->ThrowException(v8::Exception::Error(
38 | v8::String::NewFromUtf8(isolate, "Object is immutable, setters not allowed.")));
39 | return;
40 | }
41 | }
42 |
43 | v8::Persistent Match::s_constructor; // JS constructor function, stored by Init and handed out by constructor()
44 |
45 | Match::Match(Match::MatchType match) // Takes ownership of the native match by moving it into the wrapper.
46 | : m_match{std::move(match)}
47 | { }
48 |
49 | v8::Local // Returns a local handle to the constructor stored by Init.
50 | Match::constructor(v8::Isolate* isolate)
51 | {
52 | return v8::Local::New(isolate, s_constructor);
53 | }
54 |
55 | void // Builds the "Match" function template; note that this function only stores the constructor and sets nothing on exports.
56 | Match::Init(v8::Local exports)
57 | {
58 | auto isolate = exports->GetIsolate();
59 | v8::Local context = isolate->GetCurrentContext();
60 |
61 | auto tpl = v8::FunctionTemplate::New(isolate, New);
62 | tpl->SetClassName(v8::String::NewFromUtf8(isolate, "Match"));
63 | tpl->InstanceTemplate()->SetInternalFieldCount(1); // one internal field, used by ObjectWrap to hold the native pointer
64 | tpl->InstanceTemplate()->SetAccessor(v8::String::NewFromUtf8(isolate, "distance"), getDistance, setThrow);
65 | tpl->InstanceTemplate()->SetAccessor(v8::String::NewFromUtf8(isolate, "element"), getElement, setThrow);
66 |
67 | s_constructor.Reset(isolate, tpl->GetFunction(context).ToLocalChecked());
68 | }
69 |
70 | void // JS constructor callback: wraps an already-created native object that is passed in as a v8::External.
71 | Match::New(const v8::FunctionCallbackInfo& args)
72 | {
73 | auto isolate = args.GetIsolate();
74 |
75 | if (!args[0]->IsExternal()) { // anything but an External means the constructor was called directly from script
76 | isolate->ThrowException(v8::Exception::TypeError(
77 | v8::String::NewFromUtf8(isolate, "Not Expected to initialize directly, use a Fuzzy Matcher.")));
78 | return;
79 | }
80 |
81 | auto self = args.Holder();
82 | auto external = args[0].As();
83 | auto obj = static_cast(external->Value()); // native pointer carried by the External
84 | obj->Wrap(self); // ties the native object to the new JS object
85 | args.GetReturnValue().Set(self);
86 | }
87 |
88 | const Match::MatchType& // Read-only access to the wrapped native match.
89 | Match::match() const
90 | {
91 | return m_match;
92 | }
93 | }
94 | }
95 | }
96 |
--------------------------------------------------------------------------------
/src/maluuba/speech/nodejs/stringdistance/stringdistance.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | #include "maluuba/speech/nodejs/stringdistance.hpp"
5 | #include
6 | #include
7 |
8 | namespace maluuba
9 | {
10 | namespace speech
11 | {
12 | namespace nodejs
13 | {
14 | v8::Persistent StringDistance::s_constructor; // JS constructor function, stored by Init
15 | v8::Persistent StringDistance::s_type; // function template, stored by Init and returned by type()
16 |
17 | StringDistance::StringDistance(LevenshteinDistance<> distance) // Moves the native distance functor into the wrapper.
18 | : m_distance{std::move(distance)}
19 | { }
20 |
21 | StringDistance::~StringDistance() = default;
22 |
23 | v8::Local // Returns a local handle to the function template stored by Init.
24 | StringDistance::type(v8::Isolate* isolate)
25 | {
26 | return s_type.Get(isolate);
27 | }
28 |
29 | void // Registers the "StringDistance" class, with a "distance" prototype method, on exports.
30 | StringDistance::Init(v8::Local exports)
31 | {
32 | auto isolate = exports->GetIsolate();
33 | v8::Local context = isolate->GetCurrentContext();
34 |
35 | auto tpl = v8::FunctionTemplate::New(isolate, New);
36 | tpl->SetClassName(v8::String::NewFromUtf8(isolate, "StringDistance"));
37 | tpl->InstanceTemplate()->SetInternalFieldCount(1); // one internal field, used by ObjectWrap to hold the native pointer
38 |
39 | NODE_SET_PROTOTYPE_METHOD(tpl, "distance", Distance);
40 |
41 | s_constructor.Reset(isolate, tpl->GetFunction(context).ToLocalChecked());
42 | s_type.Reset(isolate, tpl);
43 | exports->Set(context, v8::String::NewFromUtf8(isolate, "StringDistance"), tpl->GetFunction(context).ToLocalChecked());
44 | }
45 |
46 | void // JS constructor callback: creates the native object for `new StringDistance()`; throws a SyntaxError when called without `new`.
47 | StringDistance::New(const v8::FunctionCallbackInfo& args)
48 | {
49 | auto isolate = args.GetIsolate();
50 |
51 | if (args.IsConstructCall()) {
52 | LevenshteinDistance<> distance{};
53 | auto obj = new StringDistance(std::move(distance));
54 | obj->Wrap(args.This()); // ties the native object to the new JS object
55 | args.GetReturnValue().Set(args.This());
56 | } else {
57 | isolate->ThrowException(v8::Exception::SyntaxError(
58 | v8::String::NewFromUtf8(isolate, "Not invoked as constructor, change to: `new StringDistance()`")));
59 | return;
60 | }
61 | }
62 |
63 | void // JS method distance(a, b): validates two string arguments and returns the wrapped functor's result as a JS number.
64 | StringDistance::Distance(const v8::FunctionCallbackInfo& args)
65 | {
66 | auto isolate = args.GetIsolate();
67 |
68 | if (args.Length() < 2) { // extra arguments beyond the first two are ignored
69 | isolate->ThrowException(v8::Exception::TypeError(
70 | v8::String::NewFromUtf8(isolate, "Expected 2 arguments.")));
71 | return;
72 | }
73 |
74 | if (!args[0]->IsString() || !args[1]->IsString()) {
75 | isolate->ThrowException(v8::Exception::TypeError(
76 | v8::String::NewFromUtf8(isolate, "Expected arguments to be string.")));
77 | return;
78 | }
79 |
80 | auto obj = ObjectWrap::Unwrap(args.Holder());
81 | std::string a{*v8::String::Utf8Value{isolate, args[0]}}; // UTF-8 copy of the first argument; built from a C string, so it ends at the first NUL byte
82 | std::string b{*v8::String::Utf8Value{isolate, args[1]}}; // same conversion for the second argument
83 | auto distance = obj->distance()(a, b);
84 |
85 | args.GetReturnValue().Set(v8::Number::New(isolate, distance));
86 | }
87 |
88 | const LevenshteinDistance<>& // Read-only access to the wrapped distance functor.
89 | StringDistance::distance() const
90 | {
91 | return m_distance;
92 | }
93 | }
94 | }
95 | }
96 |
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Matchers/MatcherConfig.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace Microsoft.PhoneticMatching.Matchers
5 | {
6 | using System;
7 |
8 | /// <summary>
9 | /// Simple matcher configuration without default values.
10 | /// </summary>
11 | public class MatcherConfig
12 | {
13 | /// <summary>
14 | /// Initializes a new instance of the <see cref="MatcherConfig"/> class. Throws ArgumentException when phoneticWeightPercentage is below 0 or above 1.
15 | /// </summary>
16 | /// <param name="phoneticWeightPercentage">Weighting trade-off between the phonetic distance and the lexical distance scores.</param>
17 | /// <param name="maxReturns">Maximum number of results the matcher can return.</param>
18 | /// <param name="findThreshold">Maximum distance to a match. Normalized to 0 for exact match, 1 for nothing matches.</param>
19 | /// <param name="maxDistanceMarginReturns">Candidate cutoff given by Math.max({best matched distance} * bestDistanceMultiplier, maxDistanceMarginReturns).</param>
20 | /// <param name="bestDistanceMultiplier">Best distance multiplier.</param>
21 | public MatcherConfig(
22 | double phoneticWeightPercentage,
23 | int maxReturns,
24 | double findThreshold,
25 | double maxDistanceMarginReturns,
26 | double bestDistanceMultiplier)
27 | {
28 | this.PhoneticWeightPercentage = phoneticWeightPercentage;
29 | this.MaxReturns = maxReturns;
30 | this.FindThreshold = findThreshold;
31 | this.MaxDistanceMarginReturns = maxDistanceMarginReturns;
32 | this.BestDistanceMultiplier = bestDistanceMultiplier;
33 |
34 | if (this.PhoneticWeightPercentage < 0 || this.PhoneticWeightPercentage > 1) // only the weight is validated; NOTE(review): NaN passes because both comparisons are false
35 | {
36 | throw new ArgumentException("require 0 <= phoneticWeightPercentage <= 1");
37 | }
38 | }
39 |
40 | /// <summary>
41 | /// Gets or sets the Weighting trade-off between the phonetic distance and
42 | /// the lexical distance scores. Between 0 and 1. 1 meaning 100% phonetic score and 0% lexical score.
43 | /// </summary>
44 | public double PhoneticWeightPercentage { get; protected set; }
45 |
46 | /// <summary>
47 | /// Gets or sets the maximum number of results the matcher can return.
48 | /// </summary>
49 | public int MaxReturns { get; set; }
50 |
51 | /// <summary>
52 | /// Gets or sets the maximum distance to a match. Normalized to 0 for exact match, 1 for nothing matches.
53 | /// Can be >1 if the lengths do not match.
54 | /// </summary>
55 | public double FindThreshold { get; set; }
56 |
57 | /// <summary>
58 | /// Gets or sets the candidate cutoff given by Math.max({best matched distance} * bestDistanceMultiplier, maxDistanceMarginReturns).
59 | /// </summary>
60 | public double MaxDistanceMarginReturns { get; set; }
61 |
62 | /// <summary>
63 | /// Gets or sets the best distance multiplier.
64 | /// </summary>
65 | public double BestDistanceMultiplier { get; set; }
66 | }
67 | }
68 |
--------------------------------------------------------------------------------
/src/maluuba/speech/nodejs/enpronouncer/enpronouncer.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | #include "maluuba/speech/nodejs/enpronouncer.hpp"
5 | #include "maluuba/speech/nodejs/enpronunciation.hpp"
6 | #include
7 |
8 | namespace maluuba
9 | {
10 | namespace speech
11 | {
12 | namespace nodejs
13 | {
14 | v8::Persistent EnPronouncer::s_constructor; // JS constructor function, stored by Init
15 |
16 | EnPronouncer::EnPronouncer(speech::EnPronouncer pronouncer) // Moves the native pronouncer into the wrapper.
17 | : m_pronouncer{std::move(pronouncer)}
18 | { }
19 |
20 | EnPronouncer::~EnPronouncer() = default;
21 |
22 | void // Registers the "EnPronouncer" class, with a "pronounce" prototype method, on exports.
23 | EnPronouncer::Init(v8::Local exports)
24 | {
25 | auto isolate = exports->GetIsolate();
26 | v8::Local context = isolate->GetCurrentContext();
27 |
28 | auto tpl = v8::FunctionTemplate::New(isolate, New);
29 | tpl->SetClassName(v8::String::NewFromUtf8(isolate, "EnPronouncer"));
30 | tpl->InstanceTemplate()->SetInternalFieldCount(1); // one internal field, used by ObjectWrap to hold the native pointer
31 |
32 | NODE_SET_PROTOTYPE_METHOD(tpl, "pronounce", Pronounce);
33 |
34 | s_constructor.Reset(isolate, tpl->GetFunction(context).ToLocalChecked());
35 | exports->Set(context, v8::String::NewFromUtf8(isolate, "EnPronouncer"), tpl->GetFunction(context).ToLocalChecked());
36 | }
37 |
38 | void // JS constructor callback: creates the native object for `new EnPronouncer()`; throws a SyntaxError when called without `new`.
39 | EnPronouncer::New(const v8::FunctionCallbackInfo& args)
40 | {
41 | auto isolate = args.GetIsolate();
42 |
43 | if (args.IsConstructCall()) {
44 | speech::EnPronouncer pronouncer{};
45 | auto obj = new EnPronouncer(std::move(pronouncer));
46 | obj->Wrap(args.This()); // ties the native object to the new JS object
47 | args.GetReturnValue().Set(args.This());
48 | } else {
49 | isolate->ThrowException(v8::Exception::SyntaxError(
50 | v8::String::NewFromUtf8(isolate, "Not invoked as constructor, change to: `new EnPronouncer()`")));
51 | return;
52 | }
53 | }
54 |
55 | void // JS method pronounce(phrase): pronounces a string and returns the result wrapped in a new EnPronunciation JS object.
56 | EnPronouncer::Pronounce(const v8::FunctionCallbackInfo& args)
57 | {
58 | auto isolate = args.GetIsolate();
59 |
60 | if (args.Length() < 1) {
61 | isolate->ThrowException(v8::Exception::TypeError(
62 | v8::String::NewFromUtf8(isolate, "Expected 1 argument.")));
63 | return;
64 | }
65 |
66 | if (!args[0]->IsString()) {
67 | isolate->ThrowException(v8::Exception::TypeError(
68 | v8::String::NewFromUtf8(isolate, "Expected argument to be a string.")));
69 | return;
70 | }
71 |
72 | auto obj = ObjectWrap::Unwrap(args.Holder());
73 | v8::String::Utf8Value phrase{isolate, args[0]}; // UTF-8 view of the JS string argument
74 | try {
75 | auto pronunciation = obj->pronouncer().pronounce(*phrase);
76 |
77 | auto wrap = new EnPronunciation(std::move(pronunciation)); // heap wrapper handed to the JS constructor through an External
78 | const auto argc = 1;
79 | v8::Local argv[argc] = { v8::External::New(isolate, wrap) };
80 | auto context = isolate->GetCurrentContext();
81 | auto instance = EnPronunciation::constructor(isolate)->NewInstance(context, argc, argv).ToLocalChecked();
82 | args.GetReturnValue().Set(instance);
83 | } catch (const std::exception& e) { // native failures surface in JS as an Error carrying e.what()
84 | isolate->ThrowException(v8::Exception::Error(
85 | v8::String::NewFromUtf8(isolate, e.what())));
86 | return;
87 | }
88 | }
89 |
90 | const speech::EnPronouncer& // Read-only access to the wrapped native pronouncer.
91 | EnPronouncer::pronouncer() const
92 | {
93 | return m_pronouncer;
94 | }
95 | }
96 | }
97 | }
98 |
--------------------------------------------------------------------------------
/src/maluuba/speech/pronunciation/arpabet.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | #include "maluuba/speech/pronunciation.hpp"
5 | #include "maluuba/xtd/string_view.hpp"
6 | #include
7 | #include
8 | #include
9 | #include
10 |
11 | namespace maluuba
12 | {
13 | namespace speech
14 | {
15 | namespace
16 | {
17 | /** Looks up the IPA string for one ARPABET phoneme (exact, case-sensitive key match); throws std::domain_error when the phoneme is not in the table. */
18 | const std::u16string&
19 | arpabet_to_ipa(const xtd::string_view phoneme)
20 | {
21 | static std::unordered_map arpabet_map = { // function-local static: built once, on the first call
22 | // Vowels
23 |
24 | // Monophthongs
25 | {"AO", u"ɔ"},
26 | {"AA", u"ɑ"},
27 | {"IY", u"i"},
28 | {"UW", u"u"},
29 | {"EH", u"ɛ"},
30 | {"IH", u"ɪ"},
31 | {"UH", u"ʊ"},
32 | {"AH", u"ʌ"},
33 | {"AX", u"ə"},
34 | {"AE", u"æ"},
35 |
36 | // Diphthongs
37 | {"EY", u"eɪ̯"},
38 | {"AY", u"aɪ̯"},
39 | {"OW", u"oʊ̯"},
40 | {"AW", u"aʊ̯"},
41 | {"OY", u"ɔɪ̯"},
42 |
43 | // Rhotic
44 | {"ER", u"ɝ"},
45 | {"AXR", u"ɚ"},
46 |
47 | // Consonants
48 |
49 | // Stops
50 | {"P", u"p"},
51 | {"B", u"b"},
52 | {"T", u"t"},
53 | {"D", u"d"},
54 | {"K", u"k"},
55 | {"G", u"ɡ"},
56 |
57 | // Affricates
58 | {"CH", u"tʃ"},
59 | {"JH", u"dʒ"},
60 |
61 | // Fricatives
62 | {"F", u"f"},
63 | {"V", u"v"},
64 | {"TH", u"θ"},
65 | {"DH", u"ð"},
66 | {"S", u"s"},
67 | {"Z", u"z"},
68 | {"SH", u"ʃ"},
69 | {"ZH", u"ʒ"},
70 | {"HH", u"h"},
71 |
72 | // Nasals
73 | {"M", u"m"},
74 | {"EM", u"m̩"},
75 | {"N", u"n"},
76 | {"EN", u"n̩"},
77 | {"NG", u"ŋ"},
78 | {"ENG", u"ŋ̍"},
79 |
80 | // Liquids
81 | {"L", u"lˠ"},
82 | {"EL", u"l̩ˠ"},
83 | {"R", u"r"},
84 | {"DX", u"ɾ"},
85 | {"NX", u"ɾ̃"},
86 |
87 | // Semivowels
88 | {"Y", u"j"},
89 | {"W", u"w"},
90 | {"Q", u"ʔ"},
91 |
92 | // Suprasegmentals
93 | {" ", u" "},
94 | };
95 |
96 | auto found = arpabet_map.find(phoneme);
97 | if (found == arpabet_map.end()) {
98 | throw std::domain_error("Unrecognized ARPABET phoneme `" + std::string{phoneme} + "`.");
99 | }
100 | return found->second;
101 | }
102 | }
103 |
104 | EnPronunciation // Builds a pronunciation by concatenating the IPA string of every ARPABET phoneme, in order.
105 | EnPronunciation::from_arpabet(const std::vector& arpabet)
106 | {
107 | std::u16string ipa;
108 |
109 | for (const auto& phoneme : arpabet) {
110 | std::string copy{phoneme.begin(), phoneme.end()}; // mutable copy, normalized below before the table lookup
111 |
112 | // Convert to uppercase
113 | for (auto& c : copy) {
114 | if (c >= 'a' && c <= 'z') { // ASCII letters only; the table keys are uppercase
115 | c += 'A' - 'a';
116 | }
117 | }
118 |
119 | if (!copy.empty()) {
120 | auto last = copy[copy.length() - 1];
121 | if (last >= '0' && last <= '2') { // drop a trailing 0/1/2 (presumably the ARPABET stress digit) so the table lookup matches
122 | copy.resize(copy.length() - 1);
123 | }
124 | }
125 |
126 | ipa += arpabet_to_ipa(copy); // throws std::domain_error for an unknown phoneme, including an empty one
127 | }
128 |
129 | return {std::move(ipa)};
130 | }
131 | }
132 | }
133 |
--------------------------------------------------------------------------------
/src/maluuba/speech/nodejs/enphoneticdistance/enphoneticdistance.cpp:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | #include "maluuba/speech/nodejs/enphoneticdistance.hpp"
5 | #include "maluuba/speech/nodejs/enpronunciation.hpp"
6 | #include
7 |
8 | namespace maluuba
9 | {
10 | namespace speech
11 | {
12 | namespace nodejs
13 | {
14 | v8::Persistent EnPhoneticDistance::s_constructor; // JS constructor function, stored by Init
15 | v8::Persistent EnPhoneticDistance::s_type; // function template, stored by Init and returned by type()
16 |
17 | EnPhoneticDistance::EnPhoneticDistance(speech::EnPhoneticDistance distance) // Moves the native distance functor into the wrapper.
18 | : m_distance{std::move(distance)}
19 | { }
20 |
21 | EnPhoneticDistance::~EnPhoneticDistance() = default;
22 |
23 | v8::Local // Returns a local handle to the function template stored by Init.
24 | EnPhoneticDistance::type(v8::Isolate* isolate)
25 | {
26 | return s_type.Get(isolate);
27 | }
28 |
29 | void // Registers the "EnPhoneticDistance" class, with a "distance" prototype method, on exports.
30 | EnPhoneticDistance::Init(v8::Local exports)
31 | {
32 | auto isolate = exports->GetIsolate();
33 | v8::Local context = isolate->GetCurrentContext();
34 |
35 | auto tpl = v8::FunctionTemplate::New(isolate, New);
36 | tpl->SetClassName(v8::String::NewFromUtf8(isolate, "EnPhoneticDistance"));
37 | tpl->InstanceTemplate()->SetInternalFieldCount(1); // one internal field, used by ObjectWrap to hold the native pointer
38 |
39 | NODE_SET_PROTOTYPE_METHOD(tpl, "distance", Distance);
40 |
41 | s_constructor.Reset(isolate, tpl->GetFunction(context).ToLocalChecked());
42 | s_type.Reset(isolate, tpl);
43 | exports->Set(context, v8::String::NewFromUtf8(isolate, "EnPhoneticDistance"), tpl->GetFunction(context).ToLocalChecked());
44 | }
45 |
46 | void // JS constructor callback: creates the native object for `new EnPhoneticDistance()`; throws a SyntaxError when called without `new`.
47 | EnPhoneticDistance::New(const v8::FunctionCallbackInfo& args)
48 | {
49 | auto isolate = args.GetIsolate();
50 |
51 | if (args.IsConstructCall()) {
52 | speech::EnPhoneticDistance distance{};
53 | auto obj = new EnPhoneticDistance(std::move(distance));
54 | obj->Wrap(args.This()); // ties the native object to the new JS object
55 | args.GetReturnValue().Set(args.This());
56 | } else {
57 | isolate->ThrowException(v8::Exception::SyntaxError(
58 | v8::String::NewFromUtf8(isolate, "Not invoked as constructor, change to: `new EnPhoneticDistance()`")));
59 | return;
60 | }
61 | }
62 |
63 | void // JS method distance(a, b): validates two EnPronunciation arguments and returns the wrapped functor's result as a JS number.
64 | EnPhoneticDistance::Distance(const v8::FunctionCallbackInfo& args)
65 | {
66 | auto isolate = args.GetIsolate();
67 |
68 | if (args.Length() < 2) { // extra arguments beyond the first two are ignored
69 | isolate->ThrowException(v8::Exception::TypeError(
70 | v8::String::NewFromUtf8(isolate, "Expected 2 arguments.")));
71 | return;
72 | }
73 |
74 | auto enPronunciationType = EnPronunciation::type(isolate);
75 | if (!enPronunciationType->HasInstance(args[0]) || !enPronunciationType->HasInstance(args[1])) { // both must have been created from the EnPronunciation template before they are unwrapped
76 | isolate->ThrowException(v8::Exception::TypeError(
77 | v8::String::NewFromUtf8(isolate, "Expected arguments to be EnPronunciation.")));
78 | return;
79 | }
80 |
81 | auto obj = ObjectWrap::Unwrap(args.Holder());
82 | auto a = ObjectWrap::Unwrap(args[0].As());
83 | auto b = ObjectWrap::Unwrap(args[1].As());
84 | auto distance = obj->distance()(a->pronunciation(), b->pronunciation());
85 |
86 | args.GetReturnValue().Set(v8::Number::New(isolate, distance));
87 | }
88 |
89 | const speech::EnPhoneticDistance& // Read-only access to the wrapped distance functor.
90 | EnPhoneticDistance::distance() const
91 | {
92 | return m_distance;
93 | }
94 | }
95 | }
96 | }
97 |
--------------------------------------------------------------------------------
/src/cs/PhoneticMatchingTests/Nlp/PreprocessorTests.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace PhoneticMatchingTests.Nlp
5 | {
6 | using Microsoft.VisualStudio.TestTools.UnitTesting;
7 | using Microsoft.PhoneticMatching.Nlp.Preprocessor;
8 |
9 | /// <summary>
10 | /// Tests for the preprocessors: the general English pre-processor and the English places pre-processor.
11 | /// </summary>
12 | [TestClass]
13 | public class PreprocessorTests
14 | {
15 | private readonly EnPreProcessor englishPreProcessor = new EnPreProcessor();
16 | private readonly EnPlacesPreProcessor englishPlacesPreProcessor = new EnPlacesPreProcessor();
17 |
18 | [TestMethod]
19 | public void GivenStreetAndSaint_ToPlacesProcessor_ExpectProperFormatting()
20 | {
21 | var result = this.englishPlacesPreProcessor.PreProcess("St Maurice St"); // the same abbreviation is expected to expand differently at each end
22 | Assert.AreEqual("saint maurice street", result, "Place pre-processing doesn't return the expected result.");
23 | }
24 |
25 | [TestMethod]
26 | public void GivenCombiningAcuteAndLigature_ToEnglishPreprocessor_ExpectProperFormatting()
27 | {
28 | // "Híffi"
29 | // í has a combining acute accent, ffi is a ligature
30 | var result = this.englishPreProcessor.PreProcess("Hi\u0301\uFB03");
31 | Assert.AreEqual("h\u00EDffi", result); // expects the precomposed U+00ED and the ligature expanded to plain "ffi"
32 | }
33 |
34 | [TestMethod]
35 | public void GivenDigits_ToEnglishPreprocessor_ExpectProperFormatting()
36 | {
37 | var result = this.englishPreProcessor.PreProcess("123 King St");
38 | Assert.AreEqual("123 king st", result); // digits are kept, and the general pre-processor does not expand "St"
39 |
40 | result = this.englishPreProcessor.PreProcess("2 Wildwood Place");
41 | Assert.AreEqual("2 wildwood place", result);
42 | }
43 |
44 | [TestMethod]
45 | public void GivenPunctuation_ToEnglishPreprocessor_ExpectProperFormatting()
46 | {
47 | var result = this.englishPreProcessor.PreProcess("!omg! ch!ll ?how?");
48 | Assert.AreEqual("omg ch ll how", result); // punctuation inside a word becomes a space; at the edges it disappears
49 | }
50 |
51 | [TestMethod]
52 | public void GivenApostropheAndCase_ToEnglishPreprocessor_ExpectProperFormatting()
53 | {
54 | var result = this.englishPreProcessor.PreProcess("Justin's haus");
55 | Assert.AreEqual("justin s haus", result); // the apostrophe is treated like other punctuation
56 | }
57 |
58 | [TestMethod]
59 | public void GivenSimpleTokenization_ToEnglishPreprocessor_ExpectProperFormatting()
60 | {
61 | var result = this.englishPreProcessor.PreProcess("call mom"); // already normalized input comes back unchanged
62 | Assert.AreEqual("call mom", result);
63 |
64 | result = this.englishPreProcessor.PreProcess("call MoM!");
65 | Assert.AreEqual("call mom", result);
66 |
67 | result = this.englishPreProcessor.PreProcess("*(*&call, MoM! )_+");
68 | Assert.AreEqual("call mom", result);
69 |
70 | result = this.englishPreProcessor.PreProcess(":call/mom");
71 | Assert.AreEqual("call mom", result);
72 |
73 | result = this.englishPreProcessor.PreProcess("Call mom.");
74 | Assert.AreEqual("call mom", result);
75 |
76 | result = this.englishPreProcessor.PreProcess("Call mom .");
77 | Assert.AreEqual("call mom", result);
78 |
79 | result = this.englishPreProcessor.PreProcess("Call mom ."); // NOTE(review): identical to the previous case as written here; possibly meant to differ in whitespace - confirm
80 | Assert.AreEqual("call mom", result);
81 | }
82 | }
83 | }
84 |
--------------------------------------------------------------------------------
/ts/nlp/tokenizer.ts:
--------------------------------------------------------------------------------
1 | /**
2 | * @file
3 | * Tokenizers.
4 | *
5 | * Copyright (c) Microsoft Corporation. All rights reserved.
6 | * Licensed under the MIT License.
7 | */
8 |
9 | /**
10 | * An Interval holds the first (inclusive) and last (exclusive) index bounds.
11 | *
12 | * @export
13 | * @class Interval
14 | */
15 | export class Interval {
16 | /**
17 | * Creates an instance of Interval. Both bounds are stored as read-only properties.
18 | *
19 | * @param {number} first Starting index (inclusive).
20 | * @param {number} last Ending index (exclusive).
21 | * @memberof Interval
22 | */
23 | constructor(readonly first: number, readonly last: number) { }
24 |
25 | /**
26 | * The length of the interval, i.e. `last - first`.
27 | *
28 | * @returns {number} The length.
29 | * @memberof Interval
30 | */
31 | length(): number {
32 | return this.last - this.first;
33 | }
34 | }
35 |
36 | /**
37 | * A substring of the original string together with the interval it was taken from.
38 | *
39 | * @export
40 | * @class Token
41 | */
42 | export class Token {
43 | /**
44 | * Creates an instance of Token. Both values are stored as read-only properties.
45 | *
46 | * @param {string} value The substring.
47 | * @param {Interval} interval The interval location.
48 | * @memberof Token
49 | */
50 | constructor(readonly value: string, readonly interval: Interval) { }
51 | }
52 |
53 | /**
54 | * Tokenizer interface for strings.
55 | *
56 | * @export
57 | * @interface Tokenizer
58 | */
59 | export interface Tokenizer {
60 | /**
61 | * Tokenizes a string. Each returned Token carries its substring and an interval.
62 | *
63 | * @param {string} query The string to tokenize.
64 | * @returns {Token[]} The tokens.
65 | * @memberof Tokenizer
66 | */
67 | tokenize(query: string): Token[];
68 | }
69 |
/**
 * Tokenizing base-class that will split on the given RegExp.
 *
 * Every non-empty match of the pattern is a separator; the text between
 * separators (and before the first / after the last one) becomes the tokens.
 * Empty tokens are never produced.
 *
 * @export
 * @abstract
 * @class SplittingTokenizer
 * @implements {Tokenizer}
 */
export abstract class SplittingTokenizer implements Tokenizer {
    /** Private, always-global copy of the separator pattern; never shared with the caller. */
    private readonly pattern: RegExp;

    /**
     * Creates an instance of SplittingTokenizer.
     *
     * @param {RegExp} pattern The pattern to split on. It is copied, so the
     * caller's RegExp (and its `lastIndex`) is neither read nor modified later.
     * The global flag is added to the copy when missing.
     * @memberof SplittingTokenizer
     */
    constructor(pattern: RegExp) {
        // exec() only advances through the input when the regex is global;
        // without "g" every call restarts at index 0 and the scan never ends.
        const flags = pattern.global ? pattern.flags : `${pattern.flags}g`;
        this.pattern = new RegExp(pattern.source, flags);
    }

    /**
     * Splits the query on every non-empty match of the pattern.
     *
     * @param {string} query The string to tokenize.
     * @returns {Token[]} The non-empty pieces between separators, in order,
     * each with its [first, last) interval in the query.
     * @memberof SplittingTokenizer
     */
    tokenize(query: string): Token[] {
        const result: Token[] = [];
        const pattern = this.pattern;
        // Always scan from the beginning, whatever state a previous call left behind.
        pattern.lastIndex = 0;

        let boundary = 0;
        let match: RegExpExecArray | null;
        while ((match = pattern.exec(query)) !== null) {
            if (pattern.lastIndex === match.index) {
                // Zero-length match: it separates nothing and exec() would
                // return it forever, so step over it manually.
                pattern.lastIndex += 1;
                continue;
            }
            if (boundary < match.index) {
                const interval = new Interval(boundary, match.index);
                const token = new Token(query.substring(interval.first, interval.last), interval);
                result.push(token);
            }
            // The next token starts right after this separator.
            boundary = pattern.lastIndex;
        }

        if (boundary < query.length) {
            // Add the rest.
            const interval = new Interval(boundary, query.length);
            const token = new Token(query.substring(interval.first, interval.last), interval);
            result.push(token);
        }
        return result;
    }
}
109 |
110 | /**
111 | * Tokenizer that splits on runs of one or more whitespace characters.
112 | *
113 | * @export
114 | * @class WhitespaceTokenizer
115 | * @extends {SplittingTokenizer}
116 | */
117 | export class WhitespaceTokenizer extends SplittingTokenizer {
118 | constructor() {
119 | super(/\s+/g); // "+" makes consecutive whitespace a single separator; the pattern is global
120 | }
121 | }
122 |
--------------------------------------------------------------------------------
/tests/matchers/contactmatcher.test.ts:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | import { ContactFields, EnContactMatcher } from "../../ts/matchers";
5 |
6 | interface TestContact { // shape of the contacts used as match targets in this file
7 | firstName: string;
8 | lastName: string;
9 | tele?: string;
10 | }
11 |
12 | const targets: Array = [ // two "Andrew" and two "John" entries, so one query can match several contacts
13 | {
14 | firstName: "Andrew",
15 | lastName: "Smith",
16 | tele: "1234567",
17 | },
18 | {
19 | firstName: "Andrew",
20 | lastName: "",
21 | },
22 | {
23 | firstName: "John",
24 | lastName: "B",
25 | tele: "7654321",
26 | },
27 | {
28 | firstName: "John",
29 | lastName: "C",
30 | tele: "2222222",
31 | },
32 | {
33 | firstName: "Jennifer",
34 | lastName: "",
35 | }
36 | ];
37 |
38 | function extractContactFields(contact: TestContact): ContactFields { // maps a test contact to the fields handed to the matcher
39 | return {
40 | name: `${contact.firstName} ${contact.lastName}` // an empty lastName leaves a trailing space in the name
41 | }
42 | }
43 |
44 | describe("EnContactMatcher", () => {
45 | test("Phonetic weight.", () => {
46 | const matcher = new EnContactMatcher(targets, extractContactFields);
47 | const results = matcher.find("andru"); // misspelled query; both "Andrew" contacts are expected back
48 |
49 | expect(results.length).toBe(2);
50 | expect(results).toEqual(expect.arrayContaining([ // arrayContaining: result order is not asserted
51 | expect.objectContaining({
52 | firstName: "Andrew",
53 | lastName: "",
54 | }),
55 | expect.objectContaining({
56 | firstName: "Andrew",
57 | lastName: "Smith",
58 | })
59 | ]))
60 | });
61 |
62 | test("Duplicate names.", () => {
63 | const matcher = new EnContactMatcher(targets, extractContactFields);
64 | const results = matcher.find("john"); // both contacts sharing the first name are expected back
65 |
66 | expect(results.length).toBe(2);
67 | expect(results).toEqual(expect.arrayContaining([
68 | expect.objectContaining({
69 | firstName: "John",
70 | lastName: "B",
71 | }),
72 | expect.objectContaining({
73 | firstName: "John",
74 | lastName: "C",
75 | })
76 | ]))
77 | });
78 |
79 | test("Exact match.", () => {
80 | const matcher = new EnContactMatcher(targets, extractContactFields);
81 | const results = matcher.find("Andrew Smith"); // the full name must return only that contact, not the other "Andrew"
82 |
83 | expect(results.length).toBe(1);
84 | expect(results).toEqual(expect.arrayContaining([
85 | expect.objectContaining({
86 | firstName: "Andrew",
87 | lastName: "Smith",
88 | tele: "1234567",
89 | }),
90 | ]))
91 | });
92 |
93 | test("Find empty.", () => {
94 | const matcher = new EnContactMatcher(targets, extractContactFields);
95 | const results = matcher.find(""); // an empty query yields no results rather than an exception
96 | expect(results).toEqual([]);
97 | });
98 |
99 | test("ctor used as function exception.", () => {
100 | expect(() => {
101 | const matcher = (EnContactMatcher as any)(targets, extractContactFields); // cast to any to call the class without `new`
102 | }).toThrow();
103 | });
104 |
105 | test("Find undefined exception.", () => {
106 | expect(() => {
107 | const matcher = new EnContactMatcher(targets, extractContactFields);
108 | matcher.find(undefined as any); // cast to any to bypass the type checker
109 | }).toThrow();
110 | });
111 | });
112 |
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Distance/StringDistance.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace Microsoft.PhoneticMatching.Distance
5 | {
6 | using System;
7 | using System.Runtime.InteropServices;
8 | using System.Text;
9 |
10 | /// <summary>
11 | /// String distance utility.
12 | /// </summary>
13 | public class StringDistance : NativeResourceWrapper, IDistance
14 | {
15 | /// <summary>
16 | /// Computes a string edit distance metric by calling the native library. Throws ArgumentNullException, naming the offending parameter, when either input is null.
17 | /// </summary>
18 | /// <param name="first">First string to compare</param>
19 | /// <param name="second">Second string to compare</param>
20 | /// <returns>Returns the distance between string a and b.</returns>
21 | public double Distance(string first, string second)
22 | {
23 | if (first == null || second == null)
24 | {
25 | throw new ArgumentNullException(first == null ? nameof(first) : nameof(second), "distance input can't be null"); // two-argument form: the single-string constructor would treat the message as the parameter name
26 | }
27 |
28 | double distance = 0;
29 | NativeResourceWrapper.CallNative((buffer) =>
30 | {
31 | int bufferSize = NativeResourceWrapper.BufferSize;
32 | var result = StringDistance_Distance(this.Native, first, second, out distance, buffer, ref bufferSize);
33 | NativeResourceWrapper.BufferSize = bufferSize; // keep whatever buffer size the native call reported back
34 | return result;
35 | });
36 | return distance;
37 | }
38 |
39 | /// <summary>
40 | /// Instantiate the wrapped native resource by calling StringDistance_Create.
41 | /// </summary>
42 | /// <param name="args">The parameter is not used.</param>
43 | /// <returns>A pointer to the native resource.</returns>
44 | protected override IntPtr CreateNativeResources(params object[] args)
45 | {
46 | IntPtr native = IntPtr.Zero;
47 | NativeResourceWrapper.CallNative((buffer) =>
48 | {
49 | int bufferSize = NativeResourceWrapper.BufferSize;
50 | var result = StringDistance_Create(out native, buffer, ref bufferSize);
51 | NativeResourceWrapper.BufferSize = bufferSize; // keep whatever buffer size the native call reported back
52 | return result;
53 | });
54 | return native;
55 | }
56 |
57 | /// <summary>
58 | /// Delete the native pointer using the type specified in native bindings.
59 | /// </summary>
60 | /// <param name="native">Pointer to the native object.</param>
61 | /// <param name="buffer">Buffer for any error message</param>
62 | /// <param name="bufferSize">Size of the buffer, to be adjusted if error doesn't fit the current size.</param>
63 | /// <returns>The result code from native library.</returns>
64 | protected override NativeResult NativeDelete(IntPtr native, StringBuilder buffer, ref int bufferSize)
65 | {
66 | return StringDistance_Delete(native, buffer, ref bufferSize);
67 | }
68 |
69 | [DllImport("maluubaspeech-csharp.dll")] // native entry point called by NativeDelete
70 | private static extern NativeResult StringDistance_Delete(IntPtr ptr, StringBuilder buffer, ref int bufferSize);
71 |
72 | [DllImport("maluubaspeech-csharp.dll")] // native entry point called by CreateNativeResources; the new object is returned through "native"
73 | private static extern NativeResult StringDistance_Create(out IntPtr native, StringBuilder buffer, ref int bufferSize);
74 |
75 | [DllImport("maluubaspeech-csharp.dll")] // native entry point called by Distance; the result is returned through "distance"
76 | private static extern NativeResult StringDistance_Distance(IntPtr ptr, string s1, string s2, out double distance, StringBuilder buffer, ref int bufferSize);
77 | }
78 | }
79 |
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Matchers/FuzzyMatcher/Normalized/EnHybridFuzzyMatcher.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace Microsoft.PhoneticMatching.Matchers.FuzzyMatcher.Normalized
5 | {
6 | using System;
7 | using System.Collections.Generic;
8 | using PhoneticMatching.Distance;
9 |
10 | /// <summary>
11 | /// A hybrid fuzzy matcher which normalizes results based on the length of queries. The fuzziness is determined by the hybrid distance (EnHybridDistance) built from the phonetic weight percentage.
12 | /// </summary>
13 | /// <typeparam name="Target">The type of the returned matched object.</typeparam>
14 | public class EnHybridFuzzyMatcher : NormalizedFuzzyMatcher
15 | {
16 | private double phoneticWeightPercentage = 0;
17 | private EnPronouncer pronouncer = EnPronouncer.Instance;
18 |
19 | /// <summary>
20 | /// Initializes a new instance of the <see cref="EnHybridFuzzyMatcher{Target}"/> class.
21 | /// </summary>
22 | /// <param name="targets">The set of objects that will be matched against. The order of equal targets is not guaranteed to be preserved.</param>
23 | /// <param name="phoneticWeightPercentage">Between 0 and 1.
24 | /// Weighting trade-off between the phonetic distance and the lexical distance scores.
25 | /// 1 meaning 100% phonetic score and 0% lexical score.</param>
26 | /// <param name="targetToExtractionPhrase">A mapping of the input types to the query(extraction) type. Note that Extraction == string for normalized cases. When null, each target is used as a string via "as string"; a null phrase makes the extraction throw InvalidCastException.</param>
27 | /// <param name="isAccelerated">Whether the fuzzy matcher uses accelerated implementation or not.</param>
28 | public EnHybridFuzzyMatcher(IList targets, double phoneticWeightPercentage, Func targetToExtractionPhrase = null, bool isAccelerated = true)
29 | {
30 | this.phoneticWeightPercentage = phoneticWeightPercentage; // kept for the threshold scaling in FindNearestWithin
31 |
32 | Func targetToExtraction = (target) => // target -> (phrase, pronunciation) pair, used as the extraction handed to the generic matcher
33 | {
34 | string phrase = targetToExtractionPhrase == null ? target as string : targetToExtractionPhrase(target);
35 | if (phrase == null) // either the target is not a string or the caller's mapping returned null
36 | {
37 | throw new InvalidCastException($"Can't cast Target type [{typeof(Target)}] to Extraction type [string]. You must provide a conversion function 'targetToExtractionPhrase'.");
38 | }
39 |
40 | return new DistanceInput(phrase, this.pronouncer.Pronounce(phrase));
41 | };
42 | this.GenericFuzzyMatcher = new FuzzyMatcher(targets, new EnHybridDistance(phoneticWeightPercentage), targetToExtraction, isAccelerated);
43 | }
44 |
45 | /// <summary>
46 | /// Find the __k__ nearest elements.
47 | /// </summary>
48 | /// <param name="query">The search target.</param>
49 | /// <param name="limit">The maximum distance to a match.</param>
50 | /// <param name="count">The maximum number of results to return.</param>
51 | /// <returns>The __k__ nearest matches to target within limit.</returns>
52 | public override IList> FindNearestWithin(string query, double limit, int count)
53 | {
54 | var input = new DistanceInput(query, this.pronouncer.Pronounce(query));
55 | double thresholdScale = (this.phoneticWeightPercentage * input.Pronunciation.Phones.Count) + ((1 - this.phoneticWeightPercentage) * input.Phrase.Length); // normalization scale: phone count and phrase length blended by phoneticWeightPercentage
56 | return this.FindNearestWithinNormalized(input, limit, count, thresholdScale);
57 | }
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/src/cs/Microsoft.PhoneticMatching/Distance/EnPhoneticDistance.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace Microsoft.PhoneticMatching.Distance
5 | {
6 | using System;
7 | using System.Runtime.InteropServices;
8 | using System.Text;
9 |
10 | /// <summary>
11 | /// English phonetic distance utility.
12 | /// </summary>
13 | public class EnPhoneticDistance : NativeResourceWrapper, IDistance<EnPronunciation> // NOTE(review): type argument restored from the Distance signature below - confirm against IDistance.
14 | {
15 | /// <summary>
16 | /// Computes an English phonetic distance metric. Throws <see cref="ArgumentNullException"/> when either input is null.
17 | /// </summary>
18 | /// <param name="first">First pronunciation to compare</param>
19 | /// <param name="second">Second pronunciation to compare</param>
20 | /// <returns>The english phonetic distance between first and second</returns>
21 | public double Distance(EnPronunciation first, EnPronunciation second)
22 | {
23 | if (first == null || second == null)
24 | {
25 | throw new ArgumentNullException(first == null ? nameof(first) : nameof(second), "distance input can't be null");
26 | }
27 |
28 | double distance = 0;
29 | NativeResourceWrapper.CallNative((buffer) =>
30 | {
31 | int bufferSize = NativeResourceWrapper.BufferSize;
32 | var result = EnPhoneticDistance_Distance(this.Native, first.Native, second.Native, out distance, buffer, ref bufferSize);
33 | NativeResourceWrapper.BufferSize = bufferSize;
34 | return result;
35 | });
36 | return distance;
37 | }
38 |
39 | /// <summary>
40 | /// Instantiate the native resource wrapped
41 | /// </summary>
42 | /// <param name="args">The parameter is not used.</param>
43 | /// <returns>A pointer to the native resource.</returns>
44 | protected override IntPtr CreateNativeResources(params object[] args)
45 | {
46 | IntPtr native = IntPtr.Zero;
47 | NativeResourceWrapper.CallNative((buffer) =>
48 | {
49 | int bufferSize = NativeResourceWrapper.BufferSize;
50 | var result = EnPhoneticDistance_Create(out native, buffer, ref bufferSize);
51 | NativeResourceWrapper.BufferSize = bufferSize;
52 | return result;
53 | });
54 | return native;
55 | }
56 |
57 | /// <summary>
58 | /// Delete the native pointer using the type specified in native bindings.
59 | /// </summary>
60 | /// <param name="native">Pointer to the native object.</param>
61 | /// <param name="buffer">Buffer for any error message</param>
62 | /// <param name="bufferSize">Size of the buffer, to be adjusted if error doesn't fit the current size.</param>
63 | /// <returns>The result code from native library.</returns>
64 | protected override NativeResult NativeDelete(IntPtr native, StringBuilder buffer, ref int bufferSize)
65 | {
66 | return EnPhoneticDistance_Delete(native, buffer, ref bufferSize);
67 | }
68 |
69 | [DllImport("maluubaspeech-csharp.dll")]
70 | private static extern NativeResult EnPhoneticDistance_Delete(IntPtr ptr, StringBuilder buffer, ref int bufferSize);
71 |
72 | [DllImport("maluubaspeech-csharp.dll")]
73 | private static extern NativeResult EnPhoneticDistance_Create(out IntPtr native, StringBuilder buffer, ref int bufferSize);
74 |
75 | [DllImport("maluubaspeech-csharp.dll")]
76 | private static extern NativeResult EnPhoneticDistance_Distance(IntPtr native, IntPtr first, IntPtr second, out double distance, StringBuilder buffer, ref int bufferSize);
77 | }
78 | }
79 |
--------------------------------------------------------------------------------
/src/cs/PhoneticMatchingTests/Matchers/ContactMatcherTests.cs:
--------------------------------------------------------------------------------
1 | // Copyright (c) Microsoft Corporation. All rights reserved.
2 | // Licensed under the MIT License.
3 |
4 | namespace PhoneticMatchingTests.Matchers
5 | {
6 | using System;
7 | using Microsoft.VisualStudio.TestTools.UnitTesting;
8 | using Microsoft.PhoneticMatching.Matchers.ContactMatcher;
9 |
10 | [TestClass]
11 | public class ContactMatcherTests : BaseContactMatcherTester
12 | {
13 | [TestMethod]
14 | public void GivenSimilarPhoneticWeight_ExpectPositiveMatch()
15 | {
16 | var contactMatcher = new EnContactMatcher(this.Targets, this.ContactFieldsExtrator);
17 | var matches = contactMatcher.Find("andru");
18 | // The misspelled query "andru" is expected to return both contacts whose first name is "Andrew".
19 | Assert.AreEqual(2, matches.Count);
20 | var andrewOnly = new TestContact
21 | {
22 | FirstName = "Andrew",
23 | LastName = string.Empty,
24 | };
25 | Assert.IsTrue(matches.Contains(andrewOnly));
26 | var andrewSmith = new TestContact
27 | {
28 | FirstName = "Andrew",
29 | LastName = "Smith",
30 | Id = "1234567",
31 | };
32 | Assert.IsTrue(matches.Contains(andrewSmith));
33 | }
34 |
35 | [TestMethod]
36 | public void GivenDuplicateNames_ExpectPositiveMatch()
37 | {
38 | var contactMatcher = new EnContactMatcher(this.Targets, this.ContactFieldsExtrator);
39 | var matches = contactMatcher.Find("john");
40 | // The query "john" is expected to return both contacts that share the first name "John".
41 | Assert.AreEqual(2, matches.Count);
42 | var johnB = new TestContact
43 | {
44 | FirstName = "John",
45 | LastName = "B",
46 | Id = "7654321",
47 | };
48 | Assert.IsTrue(matches.Contains(johnB));
49 | var johnC = new TestContact
50 | {
51 | FirstName = "John",
52 | LastName = "C",
53 | Id = "2222222",
54 | };
55 | Assert.IsTrue(matches.Contains(johnC));
56 | }
57 |
58 | [TestMethod]
59 | public void GivenExactMatch_ExpectPositiveMatch()
60 | {
61 | var contactMatcher = new EnContactMatcher(this.Targets, this.ContactFieldsExtrator);
62 | var matches = contactMatcher.Find("Andrew Smith");
63 | // A full-name query is expected to yield exactly one contact.
64 | Assert.AreEqual(1, matches.Count);
65 | var andrewSmith = new TestContact
66 | {
67 | FirstName = "Andrew",
68 | LastName = "Smith",
69 | Id = "1234567",
70 | };
71 | Assert.AreEqual(andrewSmith, matches[0]);
72 | }
73 |
74 | [TestMethod]
75 | public void GivenEmptyQuery_ExpectEmptyResult()
76 | {
77 | // An empty query is expected to match no contacts.
78 | var contactMatcher = new EnContactMatcher(this.Targets, this.ContactFieldsExtrator);
79 | Assert.AreEqual(0, contactMatcher.Find(string.Empty).Count);
80 | }
81 |
82 | [TestMethod]
83 | public void GivenNullQuery_ExpectException()
84 | {
85 | Assert.ThrowsException