├── .github └── workflows │ └── ci.yml ├── .gitignore ├── .golangci.yml ├── CHANGELOG.md ├── LICENSE ├── Makefile ├── README.md ├── cmp.go ├── cmp_test.go ├── examples └── example_test.go ├── go.mod ├── go.sum ├── normalize.go ├── normalize_test.go └── normalizers.go /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: ci 2 | 3 | on: 4 | - push 5 | - pull_request 6 | 7 | jobs: 8 | build: 9 | name: Build 10 | strategy: 11 | matrix: 12 | os: [ ubuntu-latest ] 13 | runs-on: ${{ matrix.os }} 14 | 15 | steps: 16 | - name: Set up Go 17 | uses: actions/setup-go@v2 18 | with: 19 | go-version: ^1.15 20 | id: go 21 | 22 | - name: Check out code 23 | uses: actions/checkout@v2 24 | 25 | - name: Get dependencies 26 | run: make deps 27 | 28 | - name: Build 29 | run: make build 30 | 31 | - name: Lint 32 | uses: golangci/golangci-lint-action@v2 33 | with: 34 | args: --config=.golangci.yml ./... 35 | 36 | - name: Test 37 | run: make test 38 | 39 | - name: Upload coverage to Codecov 40 | uses: codecov/codecov-action@v1 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | coverage.txt -------------------------------------------------------------------------------- /.golangci.yml: -------------------------------------------------------------------------------- 1 | # More info on config here: https://github.com/golangci/golangci-lint#config-file 2 | run: 3 | deadline: 10m 4 | issues-exit-code: 1 5 | tests: true 6 | skip-dirs: 7 | - bin 8 | - vendor 9 | - var 10 | - tmp 11 | - internal/generated 12 | 13 | output: 14 | format: colored-line-number 15 | print-issued-lines: true 16 | print-linter-name: true 17 | 18 | linters-settings: 19 | govet: 20 | check-shadowing: true 21 | golint: 22 | min-confidence: 0 23 | dupl: 24 | threshold: 100 25 | goconst: 26 | min-len: 2 27 | min-occurrences: 2 
28 | 29 | linters: 30 | disable-all: true 31 | enable: 32 | - golint 33 | - govet 34 | - errcheck 35 | - deadcode 36 | - structcheck 37 | - varcheck 38 | - ineffassign 39 | - typecheck 40 | - goconst 41 | - gosec 42 | - goimports 43 | - gosimple 44 | - unused 45 | - staticcheck # enable before push 46 | # - dupl # - it's very slow, enable if you really know why you need it 47 | 48 | issues: 49 | exclude-use-default: false 50 | exclude: 51 | # _ instead of err checks 52 | - G104 53 | # for "public interface + private struct implementation" cases only! 54 | - exported func * returns unexported type *, which can be annoying to use 55 | # can be removed in the development phase 56 | - (comment on exported (method|function|type|const)|should have( a package)? comment|comment should be of the form) 57 | # not for the active development - can be removed in the stable phase 58 | - should have a package comment, unless it's in another file for this package 59 | - don't use an underscore in package name 60 | # errcheck: Almost all programs ignore errors on these functions and in most cases it's ok 61 | - Error return value of .((os\.)?std(out|err)\..*|.*Close|.*Flush|os\.Remove(All)?|.*printf?|os\.(Un)?Setenv|.*Rollback). is not checked 62 | - should check returned error before deferring 63 | - "not declared by package utf8" 64 | - "unicode/utf8/utf8.go" 65 | # Dns instead of DNS 66 | - (struct field|func|const) .*Dns.* should be .*DNS.* -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## [0.0.1] - 2021-03-12 4 | 5 | ### Added 6 | 7 | - Initial version. 
8 | 9 | [0.0.1]: https://stash.msk.avito.ru/projects/GL/repos/strcmp/browse?at=refs%2Ftags%2Fv0.0.1 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Avito 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: deps 2 | deps: 3 | go mod tidy && go mod verify 4 | 5 | .PHONY: build 6 | build: 7 | go build -v . 8 | 9 | .PHONY: lint 10 | lint: 11 | golangci-lint run --new-from-rev=origin/master --config=.golangci.yml ./... 12 | 13 | .PHONY: lint-full 14 | lint-full: 15 | golangci-lint run --config=.golangci.yml ./... 16 | 17 | .PHONY: fmt 18 | fmt: 19 | go fmt ./... 
20 | goimports -w ./ 21 | 22 | .PHONY: test 23 | test: 24 | go test -v -coverprofile="coverage.txt" -covermode=atomic -race -count 1 -timeout 20s ./... 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # normalize 2 | [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) 3 | [![Go Reference](https://pkg.go.dev/badge/github.com/avito-tech/normalize.svg)](https://pkg.go.dev/github.com/avito-tech/normalize) 4 | [![ci](https://github.com/avito-tech/normalize/actions/workflows/ci.yml/badge.svg)](https://github.com/avito-tech/normalize/actions/workflows/ci.yml) 5 | [![codecov](https://codecov.io/gh/avito-tech/normalize/branch/master/graph/badge.svg?token=DJMFEBX8H7)](https://codecov.io/gh/avito-tech/normalize) 6 | [![Go Report Card](https://goreportcard.com/badge/github.com/avito-tech/normalize?style=flat)](https://goreportcard.com/report/github.com/avito-tech/normalize) 7 | 8 | Simple library for fuzzy text sanitizing, normalizing and comparison. 9 | 10 | ## Why 11 | People type differently. This may be a problem if you need to associate user input with some internal entity or compare two inputs of different users. Say `abc-01` and `ABC 01` must be considered to be the same strings in your system. There are many heuristics we can apply to make this work: 12 | 13 | * Remove special characters. 14 | * Convert everything to lowercase. 15 | * etc. 16 | 17 | This library is essentially an easily configurable set of useful helpers implementing all these transformations. 
18 | ## Installation 19 | ```bash 20 | go get -u github.com/avito-tech/normalize 21 | ``` 22 | ## Features 23 | ### Normalize fuzzy text 24 | ```go 25 | package main 26 | 27 | import ( 28 | "fmt" 29 | "github.com/avito-tech/normalize" 30 | ) 31 | 32 | func main() { 33 | fuzzy := "VAG-1101" 34 | clean := normalize.Normalize(fuzzy) 35 | fmt.Print(clean) // vag1101 36 | 37 | manyFuzzy := []string{"VAG-1101", "VAG-1102"} 38 | manyClean := normalize.Many(manyFuzzy) 39 | fmt.Print(manyClean) // [vag1101 vag1102] 40 | } 41 | ``` 42 | 43 | #### Default rules (in order of actual application): 44 | * Any char except latin/cyrillic letters, German umlauts (`ä`, `ö`, `ü`) and digits is removed. 45 | * Rare cyrillic letters `ё` and `й` are replaced with common equivalents `е` and `и`. 46 | * Latin/cyrillic look-alike pairs are normalized to latin letters, so `В (в)` becomes `B (b)`. Please check all replacement pairs in `WithCyrillicToLatinLookAlike` normalizer in `normalizers.go`. 47 | * German umlauts `ä`, `ö`, `ü` get converted to latin `a`, `o`, `u`. 48 | * The whole string gets lower cased. 49 | 50 | ### Compare fuzzy texts 51 | Compare two strings with all normalizations described above applied. Provide a threshold parameter to tweak how similar strings must be to make the function return `true`. 52 | `threshold` is a relative value, so `0.5` roughly means *"strings are 50% different after all normalizations applied"*. 53 | 54 | [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance) is used under the hood to compute distance between strings. 
55 | 56 | ```go 57 | package main 58 | 59 | import ( 60 | "fmt" 61 | "github.com/avito-tech/normalize" 62 | ) 63 | 64 | func main() { 65 | fuzzy := "Hyundai-Kia" 66 | otherFuzzy := "HYUNDAI" 67 | similarityThreshold := 0.3 68 | result := normalize.AreStringsSimilar(fuzzy, otherFuzzy, similarityThreshold) 69 | 70 | // distance(hyundaikia, hyundai) = 3 71 | // 3 / len(hyundaikia) = 0.3 72 | fmt.Print(result) // true 73 | } 74 | ``` 75 | 76 | #### Default rules 77 | * Apply default normalization (described above). 78 | * Calculate Levenshtein distance and return `true` if `distance / strlen <= threshold`, where `strlen` is the length of the longer normalized string. 79 | 80 | 81 | ### Configuration 82 | Both `AreStringsSimilar` and `Normalize` accept an arbitrary number of normalizers as an optional parameter. 83 | A normalizer is any function that accepts a string and returns a string. 84 | 85 | For example, the following option leaves the string unchanged. 86 | 87 | ```go 88 | package main 89 | 90 | import "github.com/avito-tech/normalize" 91 | 92 | func WithNoNormalization() normalize.Option { 93 | return func(str string) string { 94 | return str 95 | } 96 | } 97 | ``` 98 | 99 | You can configure normalizing to use only those options you need. For example, you can use only lower casing and cyr2lat conversion during normalization. Note that the order of options matters. 
100 | ```go 101 | package main 102 | 103 | import ( 104 | "fmt" 105 | "github.com/avito-tech/normalize" 106 | ) 107 | 108 | func main() { 109 | fuzzy := "АВ-123" 110 | clean := normalize.Normalize(fuzzy, normalize.WithLowerCase(), normalize.WithCyrillicToLatinLookAlike()) 111 | fmt.Print(clean) // ab-123 112 | } 113 | ``` 114 | -------------------------------------------------------------------------------- /cmp.go: -------------------------------------------------------------------------------- 1 | package normalize 2 | 3 | import "github.com/agnivade/levenshtein" 4 | 5 | // AreStringsSimilar returns true if relative distance between 2 strings after normalization is lower than provided threshold. 6 | func AreStringsSimilar(one string, other string, threshold float64, normalizers ...Option) bool { 7 | one, other = Normalize(one, normalizers...), Normalize(other, normalizers...) 8 | greatestLen := greatest(len(one), len(other)) 9 | distance := levenshtein.ComputeDistance(one, other) 10 | return float64(distance)/float64(greatestLen) <= threshold 11 | } 12 | 13 | func greatest(a, b int) int { 14 | if a > b { 15 | return a 16 | } 17 | return b 18 | } 19 | -------------------------------------------------------------------------------- /cmp_test.go: -------------------------------------------------------------------------------- 1 | package normalize_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/avito-tech/normalize" 7 | ) 8 | 9 | func Test_areStringsSimilar(t *testing.T) { 10 | tests := []struct { 11 | name string 12 | one string 13 | other string 14 | threshold float64 15 | normalizers []normalize.Option 16 | want bool 17 | }{ 18 | { 19 | name: "same_strings_must_be_similar", 20 | one: "hello", 21 | other: "hello", 22 | want: true, 23 | }, 24 | { 25 | name: "non_normalized_strings_distance", 26 | one: "hella", 27 | other: "hello", 28 | want: false, 29 | }, 30 | { 31 | name: "non_normalized_strings_distance_with_threshold", 32 | one: "hella", 33 | other: 
"hello", 34 | threshold: 0.25, 35 | want: true, 36 | }, 37 | { 38 | name: "non_normalized_strings_distance_with_threshold", 39 | one: "hela", 40 | other: "hello", 41 | threshold: 0.25, 42 | want: false, 43 | }, 44 | { 45 | name: "non_normalized_stringds_of_different_length", 46 | one: "hell", 47 | other: "hello", 48 | threshold: 0.34, 49 | want: true, 50 | }, 51 | { 52 | name: "non_normalized_stringds_of_different_length_flipped", 53 | one: "hello", 54 | other: "hell", 55 | threshold: 0.34, 56 | want: true, 57 | }, 58 | { 59 | name: "non_normalized_strings_of_different_length_flipped", 60 | one: "hello", 61 | other: "hell", 62 | threshold: 0.34, 63 | want: true, 64 | }, 65 | { 66 | name: "normalized_strings", 67 | one: "A b", 68 | other: "АВ", // all cyrillic 69 | want: true, 70 | }, 71 | { 72 | name: "normalized_strings_with_threshold", 73 | one: "AB-test", 74 | other: "АВ тест", // all cyrillic 75 | threshold: 0.17, 76 | want: true, 77 | }, 78 | { 79 | name: "normalized_strings_with_custom_options", 80 | one: "AB", 81 | other: "АВ", // all cyrillic 82 | normalizers: []normalize.Option{normalize.WithLowerCase()}, // no cyr2lat 83 | want: false, 84 | }, 85 | { 86 | name: "normalized_strings_with_custom_options", 87 | one: "AB", 88 | other: "АВ", // all cyrillic 89 | normalizers: []normalize.Option{normalize.WithLowerCase(), normalize.WithCyrillicToLatinLookAlike()}, 90 | want: true, 91 | }, 92 | } 93 | for _, tt := range tests { 94 | t.Run(tt.name, func(t *testing.T) { 95 | if got := normalize.AreStringsSimilar(tt.one, tt.other, tt.threshold, tt.normalizers...); got != tt.want { 96 | t.Errorf("AreStringsSimilar() = %v, want %v", got, tt.want) 97 | } 98 | }) 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /examples/example_test.go: -------------------------------------------------------------------------------- 1 | package examples_test 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/avito-tech/normalize" 7 | ) 8 
| 9 | func ExampleNormalize() { 10 | fuzzy := "VAG-1101" 11 | clean := normalize.Normalize(fuzzy) 12 | fmt.Print(clean) // Output: vag1101 13 | } 14 | 15 | func ExampleNormalizeMany() { 16 | manyFuzzy := []string{"VAG-1101", "VAG-1102"} 17 | manyClean := normalize.Many(manyFuzzy) 18 | fmt.Print(manyClean) // Output: [vag1101 vag1102] 19 | } 20 | 21 | func ExampleNormalize_withOptions() { 22 | fuzzy := "АВ-123" 23 | clean := normalize.Normalize(fuzzy, normalize.WithLowerCase(), normalize.WithCyrillicToLatinLookAlike()) 24 | fmt.Print(clean) 25 | // Output: ab-123 26 | } 27 | 28 | func ExampleAreStringsSimilar() { 29 | // nolint:goconst 30 | fuzzy := "Hyundai-Kia" 31 | // nolint:goconst 32 | otherFuzzy := "HYUNDAI" 33 | similarityThreshold := 0.3 34 | result := normalize.AreStringsSimilar(fuzzy, otherFuzzy, similarityThreshold) 35 | fmt.Print(result) 36 | // Output: true 37 | } 38 | 39 | func ExampleAreStringsSimilar_withOptions() { 40 | fuzzy := "Hyundai-Kia" 41 | otherFuzzy := "HYUNDAI" 42 | similarityThreshold := 0.3 43 | result := normalize.AreStringsSimilar(fuzzy, otherFuzzy, similarityThreshold, normalize.WithLowerCase(), normalize.WithCyrillicToLatinLookAlike()) 44 | fmt.Print(result) 45 | // Output: false 46 | } 47 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/avito-tech/normalize 2 | 3 | go 1.15 4 | 5 | require github.com/agnivade/levenshtein v1.1.0 6 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/agnivade/levenshtein v1.1.0 h1:n6qGwyHG61v3ABce1rPVZklEYRT8NFpCMrpZdBUbYGM= 2 | github.com/agnivade/levenshtein v1.1.0/go.mod h1:veldBMzWxcCG2ZvUTKD2kJNRdCk5hVbJomOvKkmgYbo= 3 | github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0 
h1:jfIu9sQUG6Ig+0+Ap1h4unLjW6YQJpKZVmUzxsD4E/Q= 4 | github.com/arbovm/levenshtein v0.0.0-20160628152529-48b4e1c0c4d0/go.mod h1:t2tdKJDJF9BV14lnkjHmOQgcvEKgtqs5a1N3LNdJhGE= 5 | github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48 h1:fRzb/w+pyskVMQ+UbP35JkH8yB7MYb4q/qhBarqZE6g= 6 | github.com/dgryski/trifles v0.0.0-20200323201526-dd97f9abfb48/go.mod h1:if7Fbed8SFyPtHLHbg49SI7NAdJiC5WIA09pe59rfAA= 7 | -------------------------------------------------------------------------------- /normalize.go: -------------------------------------------------------------------------------- 1 | package normalize 2 | 3 | var defaultNormalizers = []Option{ 4 | WithRemoveSpecialChars(), 5 | WithFixRareCyrillicChars(), 6 | WithCyrillicToLatinLookAlike(), 7 | WithUmlautToLatinLookAlike(), 8 | WithLowerCase(), 9 | } 10 | 11 | // Normalize returns normalized string. 12 | // If not normalizers specified default set of normalizers is used. 13 | func Normalize(str string, normalizers ...Option) string { 14 | if len(normalizers) == 0 { 15 | normalizers = defaultNormalizers 16 | } 17 | result := str 18 | for _, normalizer := range normalizers { 19 | result = normalizer(result) 20 | } 21 | return result 22 | } 23 | 24 | // Many normalizes slice of strings returning new slice with normalized elements. 
25 | func Many(strings []string, normalizers ...Option) []string { 26 | normalizedStrings := make([]string, 0, len(strings)) 27 | for _, str := range strings { 28 | normalizedStrings = append(normalizedStrings, Normalize(str, normalizers...)) 29 | } 30 | return normalizedStrings 31 | } 32 | -------------------------------------------------------------------------------- /normalize_test.go: -------------------------------------------------------------------------------- 1 | package normalize_test 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/avito-tech/normalize" 7 | ) 8 | 9 | func Test_Normalize(t *testing.T) { 10 | tests := []struct { 11 | name string 12 | str string 13 | normalizers []normalize.Option 14 | want string 15 | }{ 16 | { 17 | name: "default_options_string_gets_lowercased", 18 | str: "PART123", 19 | want: "part123", 20 | }, 21 | { 22 | name: "default_options_lowercase_string_is_unchanged", 23 | str: "part123", 24 | want: "part123", 25 | }, 26 | { 27 | name: "default_options_spaces_are_trimmed", 28 | str: " part123 ", 29 | want: "part123", 30 | }, 31 | { 32 | name: "default_options_special_characters_and_spaces_are_removed", 33 | str: "part - #123_50", 34 | want: "part12350", 35 | }, 36 | { 37 | name: "default_options_cyrillic_are_not_removed", 38 | str: "Часть-123", 39 | want: "чactь123", 40 | }, 41 | { 42 | name: "default_options_cyrillic_in_string_is_converted_to_latin", 43 | str: "Wewtаренбсхдумтокдлвгф", 44 | want: "wewtapehbcxdymtokdlbgf", 45 | }, 46 | { 47 | name: "default_options_ё_replaced_with_e_latin", 48 | str: "ёжикЁжик", 49 | want: "eжиkeжиk", 50 | }, 51 | { 52 | name: "default_options_й_replaced_with_и", 53 | str: "йодЙод", 54 | want: "иodиod", 55 | }, 56 | { 57 | name: "default_options_umlauts_converted_to_latin", 58 | str: "äöüÄÖÜ", 59 | want: "aouaou", 60 | }, 61 | { 62 | name: "no_normalizers_options", 63 | str: "Wewtаренбсхдумтокдлвгф", 64 | normalizers: []normalize.Option{normalize.WithNoNormalization()}, 65 | want: 
"Wewtаренбсхдумтокдлвгф", 66 | }, 67 | { 68 | name: "only_some_options", 69 | str: "АВ-тест", 70 | normalizers: []normalize.Option{normalize.WithLowerCase(), normalize.WithCyrillicToLatinLookAlike()}, 71 | want: "ab-tect", 72 | }, 73 | { 74 | name: "only_cyr2lat_cyrillic_letters_are_fixed_with_keeping_case", 75 | str: "АВС", 76 | normalizers: []normalize.Option{normalize.WithCyrillicToLatinLookAlike()}, 77 | want: "ABC", 78 | }, 79 | { 80 | name: "only_fix_rare_cyrillic_rare_cyrillic_letters_are_fixed_with_keeping_case", 81 | str: "ЙЁ", 82 | normalizers: []normalize.Option{normalize.WithFixRareCyrillicChars()}, 83 | want: "ИЕ", 84 | }, 85 | { 86 | name: "only_fix_umlaut2lat_umlauts_converted_to_latin_keeping_case", 87 | str: "äöüÄÖÜ", 88 | normalizers: []normalize.Option{normalize.WithUmlautToLatinLookAlike()}, 89 | want: "aouAOU", 90 | }, 91 | } 92 | for _, tt := range tests { 93 | t.Run(tt.name, func(t *testing.T) { 94 | got := normalize.Normalize(tt.str, tt.normalizers...) 95 | if tt.want != got { 96 | t.Errorf("want = %s, got = %s", tt.want, got) 97 | } 98 | }) 99 | } 100 | } 101 | 102 | func Test_NormalizeMany(t *testing.T) { 103 | tests := []struct { 104 | name string 105 | manyStr []string 106 | want []string 107 | }{ 108 | { 109 | name: "normalize_several_strings", 110 | manyStr: []string{"Part-1", "Часть 2", "WewtаренБсхдумтОк"}, 111 | want: []string{"part1", "чactь2", "wewtapehbcxdymtok"}, 112 | }, 113 | } 114 | for _, tt := range tests { 115 | t.Run(tt.name, func(t *testing.T) { 116 | got := normalize.Many(tt.manyStr) 117 | if len(tt.want) != len(got) { 118 | t.Errorf("got slices of different length, want = %s, got = %s", tt.want, got) 119 | } 120 | for i, wantStr := range tt.want { 121 | if wantStr != got[i] { 122 | t.Errorf("at pos %d want = %s, got = %s", i, tt.want, got) 123 | } 124 | } 125 | }) 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /normalizers.go: 
// Option is a single normalization step: a function that takes a string and
// returns its transformed version. Options are applied in the order in which
// they are passed to Normalize.
type Option func(string) string

// specialCharsPattern matches every character that is not a latin letter, a
// cyrillic letter, a German umlaut (ä, ö, ü) or a digit, ignoring case.
//
// NOTE(review): because the class is compiled case-insensitively, Unicode
// case-fold equivalents of the listed letters (for example U+212A KELVIN SIGN
// for k and U+017F LATIN SMALL LETTER LONG S for s) are presumably kept as
// well; confirm whether that is intended.
var specialCharsPattern = regexp.MustCompile(`(?i:[^äöüa-zа-яё0-9])`)

// WithRemoveSpecialChars returns an Option that removes every character except
// latin/cyrillic letters, German umlauts (`ä`, `ö`, `ü`) and digits.
func WithRemoveSpecialChars() Option {
	return func(str string) string {
		return specialCharsPattern.ReplaceAllString(str, "")
	}
}

// rareCyrillicChars maps rare cyrillic letters to their common equivalents,
// in both lower and upper case.
var rareCyrillicChars = withUpperPairs(map[rune]rune{
	'ё': 'е',
	'й': 'и',
})

// WithFixRareCyrillicChars returns an Option that replaces the rare cyrillic
// letters `ё` and `й` with their common equivalents `е` and `и`, keeping case.
func WithFixRareCyrillicChars() Option {
	return WithRuneMapping(rareCyrillicChars)
}

// cyrillicTolatinsLookAlike maps cyrillic letters to the latin letters used in
// their place, in both lower and upper case.
var cyrillicTolatinsLookAlike = withUpperPairs(map[rune]rune{
	'а': 'a',
	'е': 'e',
	'т': 't',
	'у': 'y',
	'о': 'o',
	'р': 'p',
	'н': 'h',
	'к': 'k',
	'х': 'x',
	'с': 'c',
	'б': 'b',
	'м': 'm',
	'д': 'd',
	'л': 'l',
	'в': 'b',
	'г': 'g',
	'ф': 'f',
})

// WithCyrillicToLatinLookAlike returns an Option that normalizes
// latin/cyrillic look-alike pairs to latin letters, so `В (в)` becomes
// `B (b)`, etc. Case is kept.
func WithCyrillicToLatinLookAlike() Option {
	return WithRuneMapping(cyrillicTolatinsLookAlike)
}

// umlautsToLatin maps German umlauts to plain latin vowels, in both lower and
// upper case.
var umlautsToLatin = withUpperPairs(map[rune]rune{
	'ä': 'a',
	'ö': 'o',
	'ü': 'u',
})

// WithUmlautToLatinLookAlike returns an Option that converts the German
// umlauts `ä`, `ö`, `ü` to latin `a`, `o`, `u`, keeping case.
func WithUmlautToLatinLookAlike() Option {
	return WithRuneMapping(umlautsToLatin)
}

// WithRuneMapping returns an Option that replaces every rune found as a key in
// mapping with the corresponding value and leaves all other runes untouched.
// The lookup is case sensitive. The returned Option keeps a reference to
// mapping rather than a copy of it.
func WithRuneMapping(mapping map[rune]rune) Option {
	return func(str string) string {
		return strings.Map(func(letter rune) rune {
			if newLetter, ok := mapping[letter]; ok {
				return newLetter
			}
			return letter
		}, str)
	}
}

// withUpperPairs returns a new map holding every pair of m plus, for each
// pair, its upper-case counterpart (upper(from) -> upper(to)).
//
// The result is built in a fresh map: inserting into a map while ranging over
// it leaves it unspecified whether the new entries are visited, and the
// caller's map should not change under its feet. A pair given explicitly in m
// is never overwritten by a derived upper-case pair.
func withUpperPairs(m map[rune]rune) map[rune]rune {
	pairs := make(map[rune]rune, 2*len(m))
	for from, to := range m {
		pairs[from] = to
	}
	for from, to := range m {
		upper := unicode.ToUpper(from)
		if _, explicit := m[upper]; explicit {
			continue
		}
		pairs[upper] = unicode.ToUpper(to)
	}
	return pairs
}

// WithLowerCase returns an Option that converts the string to lower case.
func WithLowerCase() Option {
	return strings.ToLower
}

// WithNoNormalization returns an Option that leaves the string unchanged.
// Passing it to Normalize prevents the default normalizers from being used.
func WithNoNormalization() Option {
	return func(str string) string {
		return str
	}
}