├── .github └── workflows │ └── go.yml ├── LICENSE ├── README.md ├── collate.go ├── collate_test.go ├── go.mod └── go.sum /.github/workflows/go.yml: -------------------------------------------------------------------------------- 1 | name: Go 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | 11 | build: 12 | name: Build 13 | runs-on: ubuntu-latest 14 | steps: 15 | 16 | - name: Set up Go 1.x 17 | uses: actions/setup-go@v2 18 | with: 19 | go-version: ^1.13 20 | 21 | - name: Check out code into the Go module directory 22 | uses: actions/checkout@v2 23 | 24 | - name: Get dependencies 25 | run: | 26 | go get -v -t -d ./... 27 | if [ -f Gopkg.toml ]; then 28 | curl https://raw.githubusercontent.com/golang/dep/master/install.sh | sh 29 | dep ensure 30 | fi 31 | 32 | - name: Build 33 | run: go build -v . 34 | 35 | - name: Test 36 | run: go test -v . 37 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Josh Baker 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Collate 2 | 3 | [![GoDoc](https://godoc.org/github.com/tidwall/collate?status.svg)](https://godoc.org/github.com/tidwall/collate) 4 | 5 | 6 | Collate is a simple collation library for comparing strings in various languages for Go. 7 | It's designed for the [BuntDB](https://github.com/tidwall/buntdb) project, and 8 | is similar to the 9 | [collation](https://msdn.microsoft.com/en-us/library/ms174596.aspx) that is 10 | found in traditional database systems. 11 | 12 | The idea is that you call a function with a collation name and it generates 13 | a `Less(a, b string) bool` function that can be used for sorting using the 14 | `sort` package or with B-Tree style databases. 15 | 16 | ## Install 17 | 18 | ``` 19 | go get -u github.com/tidwall/collate 20 | ``` 21 | 22 | ## Example 23 | 24 | ```go 25 | // create a case-insensitive collation for spanish. 26 | less := collate.IndexString("SPANISH_CI") 27 | println(less("Hola", "hola")) 28 | println(less("hola", "Hola")) 29 | // Output: 30 | // false 31 | // false 32 | ``` 33 | 34 | ## Options 35 | 36 | ### Case Sensitivity 37 | Add `_CI` to the collation name to specify case-insensitive comparing. 38 | Add `_CS` for case-sensitive compares; this is the default. 39 | 40 | ```go 41 | collate.IndexString("SPANISH_CI") // Case-insensitive collation for spanish 42 | collate.IndexString("SPANISH_CS") // Case-sensitive collation for spanish 43 | ``` 44 | 45 | ### Loose Compares 46 | Add `_LOOSE` to ignore diacritics, case and weight. 
47 | 48 | ### Numeric Compares 49 | Add `_NUM` to specify that numbers should sort numerically ("2" < "12"). 50 | 51 | ### JSON 52 | You can also compare fields in JSON documents using the `IndexJSON` function. 53 | The [GJSON](https://github.com/tidwall/gjson) library is used under the hood. 54 | 55 | ```go 56 | var jsonA = `{"name":{"last":"Miller"}}` 57 | var jsonB = `{"name":{"last":"anderson"}}` 58 | less := collate.IndexJSON("ENGLISH_CI", "name.last") 59 | println(less(jsonA, jsonB)) 60 | println(less(jsonB, jsonA)) 61 | // Output: 62 | // false 63 | // true 64 | ``` 65 | 66 | ## Supported Languages 67 | 68 | Afrikaans 69 | Albanian 70 | AmericanEnglish 71 | Amharic 72 | Arabic 73 | Armenian 74 | Azerbaijani 75 | Bengali 76 | BrazilianPortuguese 77 | BritishEnglish 78 | Bulgarian 79 | Burmese 80 | CanadianFrench 81 | Catalan 82 | Chinese 83 | Croatian 84 | Czech 85 | Danish 86 | Dutch 87 | English 88 | Estonian 89 | EuropeanPortuguese 90 | EuropeanSpanish 91 | Filipino 92 | Finnish 93 | French 94 | Georgian 95 | German 96 | Greek 97 | Gujarati 98 | Hebrew 99 | Hindi 100 | Hungarian 101 | Icelandic 102 | Indonesian 103 | Italian 104 | Japanese 105 | Kannada 106 | Kazakh 107 | Khmer 108 | Kirghiz 109 | Korean 110 | Lao 111 | LatinAmericanSpanish 112 | Latvian 113 | Lithuanian 114 | Macedonian 115 | Malay 116 | Malayalam 117 | Marathi 118 | ModernStandardArabic 119 | Mongolian 120 | Nepali 121 | Norwegian 122 | Persian 123 | Polish 124 | Portuguese 125 | Punjabi 126 | Romanian 127 | Russian 128 | Serbian 129 | SerbianLatin 130 | SimplifiedChinese 131 | Sinhala 132 | Slovak 133 | Slovenian 134 | Spanish 135 | Swahili 136 | Swedish 137 | Tamil 138 | Telugu 139 | Thai 140 | TraditionalChinese 141 | Turkish 142 | Ukrainian 143 | Urdu 144 | Uzbek 145 | Vietnamese 146 | Zulu 147 | 148 | 149 | ## Contact 150 | 151 | Josh Baker [@tidwall](http://twitter.com/tidwall) 152 | 153 | ## License 154 | 155 | Collate source code is available under the MIT [License](/LICENSE). 
156 | 157 | 158 | -------------------------------------------------------------------------------- /collate.go: -------------------------------------------------------------------------------- 1 | package collate 2 | 3 | import ( 4 | "sort" 5 | "strings" 6 | 7 | "github.com/tidwall/gjson" 8 | "golang.org/x/text/collate" 9 | "golang.org/x/text/language" 10 | ) 11 | 12 | // SupportedLangs returns all of the languages that Index() supports. 13 | func SupportedLangs() []string { 14 | var langs []string 15 | for _, tag := range langMap { 16 | langs = append(langs, tag.name) 17 | } 18 | sort.Strings(langs) 19 | return langs 20 | } 21 | 22 | // IndexString returns a Less function that can be used to compare if 23 | // string "a" is less than string "b". 24 | // The "name" parameter should be a valid collate definition. 25 | // 26 | // Examples of collation names 27 | // -------------------------------------------------------------------- 28 | // ENGLISH, EN -- English 29 | // AMERICANENGLISH, EN-US -- English US 30 | // FRENCH, FR -- French 31 | // CHINESE, ZH -- Chinese 32 | // SIMPLIFIEDCHINESE, ZH-HANS -- Simplified Chinese 33 | // ... 34 | // 35 | // Case insensitive: add the CI tag to the name 36 | // -------------------------------------------------------------------- 37 | // ENGLISH_CI 38 | // FR_CI 39 | // ZH-HANS_CI 40 | // ... 41 | // 42 | // Case sensitive: add the CS tag to the name 43 | // -------------------------------------------------------------------- 44 | // ENGLISH_CS 45 | // FR_CS 46 | // ZH-HANS_CS 47 | // ... 48 | // 49 | // For numerics: add the NUM tag to the name 50 | // Specifies that numbers should sort numerically ("2" < "12") 51 | // -------------------------------------------------------------------- 52 | // DUTCH_NUM 53 | // JAPANESE_NUM 54 | // ... 
55 | // 56 | // For loosness: add the LOOSE tag to the name 57 | // Ignores diacritics, case and weight 58 | // -------------------------------------------------------------------- 59 | // JA_LOOSE 60 | // CHINESE_LOOSE 61 | // ... 62 | // 63 | func IndexString(name string) (less func(a, b string) bool) { 64 | t, opts := parseCollation(name) 65 | c := collate.New(t, opts...) 66 | return func(a, b string) bool { 67 | return c.CompareString(a, b) == -1 68 | } 69 | } 70 | 71 | // IndexJSON is like IndexString expect for json. 72 | // The "name" parameter should be a valid collate definition. 73 | // The "path" parameter should be a valid gjson path. 74 | func IndexJSON(name, path string) (less func(a, b string) bool) { 75 | t, opts := parseCollation(name) 76 | c := collate.New(t, opts...) 77 | return func(a, b string) bool { 78 | ra := gjson.Get(a, path) 79 | rb := gjson.Get(b, path) 80 | if ra.Type == gjson.String || rb.Type == gjson.String { 81 | return c.CompareString(ra.String(), rb.String()) < 0 82 | } 83 | return ra.Less(rb, false) 84 | } 85 | } 86 | 87 | func parseCollation(s string) (tag language.Tag, opts []collate.Option) { 88 | parts := strings.Split(s, "_") 89 | if lt, ok := langMap[strings.ToLower(parts[0])]; ok { 90 | tag = lt.tag 91 | } else { 92 | tag = language.Make(parts[0]) 93 | } 94 | if tag == language.Und { 95 | tag = language.English 96 | } 97 | opts = append(opts, collate.OptionsFromTag(tag)) 98 | for i := 1; i < len(parts); i++ { 99 | switch strings.ToLower(parts[i]) { 100 | case "ci": 101 | opts = append(opts, collate.IgnoreCase) 102 | case "num": 103 | opts = append(opts, collate.Numeric) 104 | case "loose": 105 | opts = append(opts, collate.Loose) 106 | } 107 | } 108 | return tag, opts 109 | } 110 | 111 | type tlang struct { 112 | name string 113 | tag language.Tag 114 | } 115 | 116 | var langMap = map[string]tlang{ 117 | "afrikaans": tlang{"Afrikaans", language.Afrikaans}, 118 | "amharic": tlang{"Amharic", language.Amharic}, 119 | 
"arabic": tlang{"Arabic", language.Arabic}, 120 | "modernstandardarabic": tlang{"ModernStandardArabic", language.ModernStandardArabic}, 121 | "azerbaijani": tlang{"Azerbaijani", language.Azerbaijani}, 122 | "bulgarian": tlang{"Bulgarian", language.Bulgarian}, 123 | "bengali": tlang{"Bengali", language.Bengali}, 124 | "catalan": tlang{"Catalan", language.Catalan}, 125 | "czech": tlang{"Czech", language.Czech}, 126 | "danish": tlang{"Danish", language.Danish}, 127 | "german": tlang{"German", language.German}, 128 | "greek": tlang{"Greek", language.Greek}, 129 | "english": tlang{"English", language.English}, 130 | "americanenglish": tlang{"AmericanEnglish", language.AmericanEnglish}, 131 | "britishenglish": tlang{"BritishEnglish", language.BritishEnglish}, 132 | "spanish": tlang{"Spanish", language.Spanish}, 133 | "europeanspanish": tlang{"EuropeanSpanish", language.EuropeanSpanish}, 134 | "latinamericanspanish": tlang{"LatinAmericanSpanish", language.LatinAmericanSpanish}, 135 | "estonian": tlang{"Estonian", language.Estonian}, 136 | "persian": tlang{"Persian", language.Persian}, 137 | "finnish": tlang{"Finnish", language.Finnish}, 138 | "filipino": tlang{"Filipino", language.Filipino}, 139 | "french": tlang{"French", language.French}, 140 | "canadianfrench": tlang{"CanadianFrench", language.CanadianFrench}, 141 | "gujarati": tlang{"Gujarati", language.Gujarati}, 142 | "hebrew": tlang{"Hebrew", language.Hebrew}, 143 | "hindi": tlang{"Hindi", language.Hindi}, 144 | "croatian": tlang{"Croatian", language.Croatian}, 145 | "hungarian": tlang{"Hungarian", language.Hungarian}, 146 | "armenian": tlang{"Armenian", language.Armenian}, 147 | "indonesian": tlang{"Indonesian", language.Indonesian}, 148 | "icelandic": tlang{"Icelandic", language.Icelandic}, 149 | "italian": tlang{"Italian", language.Italian}, 150 | "japanese": tlang{"Japanese", language.Japanese}, 151 | "georgian": tlang{"Georgian", language.Georgian}, 152 | "kazakh": tlang{"Kazakh", language.Kazakh}, 153 | 
"khmer": tlang{"Khmer", language.Khmer}, 154 | "kannada": tlang{"Kannada", language.Kannada}, 155 | "korean": tlang{"Korean", language.Korean}, 156 | "kirghiz": tlang{"Kirghiz", language.Kirghiz}, 157 | "lao": tlang{"Lao", language.Lao}, 158 | "lithuanian": tlang{"Lithuanian", language.Lithuanian}, 159 | "latvian": tlang{"Latvian", language.Latvian}, 160 | "macedonian": tlang{"Macedonian", language.Macedonian}, 161 | "malayalam": tlang{"Malayalam", language.Malayalam}, 162 | "mongolian": tlang{"Mongolian", language.Mongolian}, 163 | "marathi": tlang{"Marathi", language.Marathi}, 164 | "malay": tlang{"Malay", language.Malay}, 165 | "burmese": tlang{"Burmese", language.Burmese}, 166 | "nepali": tlang{"Nepali", language.Nepali}, 167 | "dutch": tlang{"Dutch", language.Dutch}, 168 | "norwegian": tlang{"Norwegian", language.Norwegian}, 169 | "punjabi": tlang{"Punjabi", language.Punjabi}, 170 | "polish": tlang{"Polish", language.Polish}, 171 | "portuguese": tlang{"Portuguese", language.Portuguese}, 172 | "brazilianportuguese": tlang{"BrazilianPortuguese", language.BrazilianPortuguese}, 173 | "europeanportuguese": tlang{"EuropeanPortuguese", language.EuropeanPortuguese}, 174 | "romanian": tlang{"Romanian", language.Romanian}, 175 | "russian": tlang{"Russian", language.Russian}, 176 | "sinhala": tlang{"Sinhala", language.Sinhala}, 177 | "slovak": tlang{"Slovak", language.Slovak}, 178 | "slovenian": tlang{"Slovenian", language.Slovenian}, 179 | "albanian": tlang{"Albanian", language.Albanian}, 180 | "serbian": tlang{"Serbian", language.Serbian}, 181 | "serbianlatin": tlang{"SerbianLatin", language.SerbianLatin}, 182 | "swedish": tlang{"Swedish", language.Swedish}, 183 | "swahili": tlang{"Swahili", language.Swahili}, 184 | "tamil": tlang{"Tamil", language.Tamil}, 185 | "telugu": tlang{"Telugu", language.Telugu}, 186 | "thai": tlang{"Thai", language.Thai}, 187 | "turkish": tlang{"Turkish", language.Turkish}, 188 | "ukrainian": tlang{"Ukrainian", language.Ukrainian}, 189 | 
"urdu": tlang{"Urdu", language.Urdu}, 190 | "uzbek": tlang{"Uzbek", language.Uzbek}, 191 | "vietnamese": tlang{"Vietnamese", language.Vietnamese}, 192 | "chinese": tlang{"Chinese", language.Chinese}, 193 | "simplifiedchinese": tlang{"SimplifiedChinese", language.SimplifiedChinese}, 194 | "traditionalchinese": tlang{"TraditionalChinese", language.TraditionalChinese}, 195 | "zulu": tlang{"Zulu", language.Zulu}, 196 | } 197 | -------------------------------------------------------------------------------- /collate_test.go: -------------------------------------------------------------------------------- 1 | package collate 2 | 3 | import ( 4 | "fmt" 5 | "testing" 6 | ) 7 | 8 | func TestString(t *testing.T) { 9 | if len(SupportedLangs()) == 0 { 10 | t.Fatal("expected something greater than zero") 11 | } 12 | less := IndexString("ENGLISH_CI") 13 | if !less("a", "b") { 14 | t.Fatal("expected true, got false") 15 | } 16 | } 17 | 18 | func ExampleIndexJSON() { 19 | var jsonA = `{"name":{"last":"Miller"}}` 20 | var jsonB = `{"name":{"last":"anderson"}}` 21 | less := IndexJSON("ENGLISH_CI", "name.last") 22 | fmt.Printf("%t\n", less(jsonA, jsonB)) 23 | fmt.Printf("%t\n", less(jsonB, jsonA)) 24 | // Output: 25 | // false 26 | // true 27 | } 28 | 29 | func ExampleIndexString() { 30 | var nameA = "Miller" 31 | var nameB = "anderson" 32 | less := IndexString("ENGLISH_CI") 33 | fmt.Printf("%t\n", less(nameA, nameB)) 34 | fmt.Printf("%t\n", less(nameB, nameA)) 35 | // Output: 36 | // false 37 | // true 38 | } 39 | 40 | func ExampleSpanish() { 41 | less := IndexString("SPANISH_CI") 42 | fmt.Printf("%t\n", less("Hola", "hola")) 43 | fmt.Printf("%t\n", less("hola", "Hola")) 44 | // Output: 45 | // false 46 | // false 47 | } 48 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/tidwall/collate 2 | 3 | go 1.13 4 | 5 | require ( 6 | 
github.com/tidwall/gjson v1.3.4 7 | golang.org/x/text v0.3.2 8 | ) 9 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/tidwall/gjson v1.3.4 h1:On5waDnyKKk3SWE4EthbjjirAWXp43xx5cKCUZY1eZw= 2 | github.com/tidwall/gjson v1.3.4/go.mod h1:P256ACg0Mn+j1RXIDXoss50DeIABTYK1PULOJHhxOls= 3 | github.com/tidwall/match v1.0.1 h1:PnKP62LPNxHKTwvHHZZzdOAOCtsJTjo6dZLCwpKm5xc= 4 | github.com/tidwall/match v1.0.1/go.mod h1:LujAq0jyVjBy028G1WhWfIzbpQfMO8bBZ6Tyb0+pL9E= 5 | github.com/tidwall/pretty v1.0.0 h1:HsD+QiTn7sK6flMKIvNmpqz1qrpP3Ps6jOKIKMooyg4= 6 | github.com/tidwall/pretty v1.0.0/go.mod h1:XNkn88O1ChpSDQmQeStsy+sBenx6DDtFZJxhVysOjyk= 7 | golang.org/x/text v0.3.2 h1:tW2bmiBqwgJj/UpqtC8EpXEZVYOwU0yG4iWbprSVAcs= 8 | golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= 9 | golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 10 | --------------------------------------------------------------------------------