├── go.sum ├── go.mod ├── .travis.yml ├── swedish ├── preprocess.go ├── README.md ├── step2.go ├── stem.go ├── step3.go ├── step1.go ├── common.go └── swedish_test.go ├── norwegian ├── preprocess.go ├── README.md ├── step2.go ├── stem.go ├── step3.go ├── step1.go ├── common.go └── norwegian_test.go ├── english ├── postprocess.go ├── step0.go ├── preprocess.go ├── step1c.go ├── README.md ├── stem.go ├── step5.go ├── step1a.go ├── step4.go ├── step3.go ├── step2.go ├── step1b.go ├── common.go └── english_test.go ├── spanish ├── postprocess.go ├── preprocess.go ├── README.md ├── step2a.go ├── step3.go ├── stem.go ├── step2b.go ├── step0.go ├── step1.go ├── common.go └── spanish_test.go ├── russian ├── preprocess.go ├── step2.go ├── step3.go ├── stem.go ├── step4.go ├── README.md ├── common.go └── step1.go ├── french ├── preprocess.go ├── step5.go ├── postprocess.go ├── step3.go ├── step6.go ├── step2a.go ├── stem.go ├── step2b.go ├── step4.go ├── common.go ├── step1.go └── french_test.go ├── .gitignore ├── .github └── workflows │ └── test.yml ├── hungarian ├── common_test.go ├── README.md ├── stem_test.go ├── common.go └── stem.go ├── HISTORY.md ├── romance ├── common.go └── testing_helpers.go ├── gostem └── gostem.go ├── snowball.go ├── LICENSE ├── snowball_test.go ├── snowballword ├── snowballword_test.go └── snowballword.go └── README.md /go.sum: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/kljensen/snowball 2 | 3 | go 1.19 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Travis-CI configuration. See 2 | # http://about.travis-ci.org/docs 3 | language: go 4 | install: echo "Skipping default travis install step" 5 | script: 6 | - curl https://raw.github.com/daaku/go.travis/master/install | sh -------------------------------------------------------------------------------- /swedish/preprocess.go: -------------------------------------------------------------------------------- 1 | package swedish 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | // Get the r1 of the word 8 | // 9 | func preprocess(word *snowballword.SnowballWord) { 10 | // Find the region R1. R2 is not used 11 | word.R1start = r1(word) 12 | } 13 | -------------------------------------------------------------------------------- /norwegian/preprocess.go: -------------------------------------------------------------------------------- 1 | package norwegian 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | // Get the r1 of the word 8 | // 9 | func preprocess(word *snowballword.SnowballWord) { 10 | // Find the region R1. R2 is not used 11 | word.R1start = r1(word) 12 | } 13 | -------------------------------------------------------------------------------- /english/postprocess.go: -------------------------------------------------------------------------------- 1 | package english 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | // Applies transformations necessary after 8 | // a word has been completely processed. 
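// For English this amounts to lowercasing any "Y" runes that
// preprocessing capitalized to mark a consonant-like y.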
9 | // 10 | func postprocess(word *snowballword.SnowballWord) { 11 | 12 | uncapitalizeYs(word) 13 | } 14 | -------------------------------------------------------------------------------- /spanish/postprocess.go: -------------------------------------------------------------------------------- 1 | package spanish 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | // Applies transformations necessary after 8 | // a word has been completely processed. 9 | // 10 | func postprocess(word *snowballword.SnowballWord) { 11 | 12 | removeAccuteAccents(word) 13 | } 14 | -------------------------------------------------------------------------------- /spanish/preprocess.go: -------------------------------------------------------------------------------- 1 | package spanish 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | func preprocess(word *snowballword.SnowballWord) { 8 | r1start, r2start, rvstart := findRegions(word) 9 | word.R1start = r1start 10 | word.R2start = r2start 11 | word.RVstart = rvstart 12 | } 13 | -------------------------------------------------------------------------------- /russian/preprocess.go: -------------------------------------------------------------------------------- 1 | package russian 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | func preprocess(word *snowballword.SnowballWord) { 8 | 9 | r1start, r2start, rvstart := findRegions(word) 10 | word.R1start = r1start 11 | word.R2start = r2start 12 | word.RVstart = rvstart 13 | 14 | } 15 | -------------------------------------------------------------------------------- /russian/step2.go: -------------------------------------------------------------------------------- 1 | package russian 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | // Step 2 is the removal of the "и" suffix. 
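// The suffix is removed only when it lies within the RV region.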
8 | func step2(word *snowballword.SnowballWord) bool { 9 | suffix := word.RemoveFirstSuffixIn(word.RVstart, "и") 10 | if suffix != "" { 11 | return true 12 | } 13 | return false 14 | } 15 | -------------------------------------------------------------------------------- /french/preprocess.go: -------------------------------------------------------------------------------- 1 | package french 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | func preprocess(word *snowballword.SnowballWord) { 8 | 9 | capitalizeYUI(word) 10 | 11 | r1start, r2start, rvstart := findRegions(word) 12 | word.R1start = r1start 13 | word.R2start = r2start 14 | word.RVstart = rvstart 15 | 16 | } 17 | -------------------------------------------------------------------------------- /french/step5.go: -------------------------------------------------------------------------------- 1 | package french 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | // Step 5 Undouble non-vowel endings 8 | func step5(word *snowballword.SnowballWord) bool { 9 | 10 | suffix := word.FirstSuffix("enn", "onn", "ett", "ell", "eill") 11 | if suffix != "" { 12 | word.RemoveLastNRunes(1) 13 | } 14 | return false 15 | } 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | *.exe 23 | 24 | tmp/* 25 | */tmp/* -------------------------------------------------------------------------------- /english/step0.go: -------------------------------------------------------------------------------- 1 | package english 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 0 is to strip off apostrophes and "s". 10 | func step0(w *snowballword.SnowballWord) bool { 11 | suffix := w.FirstSuffix("'s'", "'s", "'") 12 | if suffix == "" { 13 | return false 14 | } 15 | suffixLength := utf8.RuneCountInString(suffix) 16 | w.RemoveLastNRunes(suffixLength) 17 | return true 18 | } 19 | -------------------------------------------------------------------------------- /russian/step3.go: -------------------------------------------------------------------------------- 1 | package russian 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | // Step 3 is the removal of the derivational suffix. 8 | func step3(word *snowballword.SnowballWord) bool { 9 | 10 | // Search for a DERIVATIONAL ending in R2 (i.e. the entire 11 | // ending must lie in R2), and if one is found, remove it. 12 | 13 | suffix := word.RemoveFirstSuffixIn(word.R2start, "ост", "ость") 14 | if suffix != "" { 15 | return true 16 | } 17 | return false 18 | } 19 | -------------------------------------------------------------------------------- /swedish/README.md: -------------------------------------------------------------------------------- 1 | Snowball Swedish 2 | ================ 3 | 4 | This package implements the Swedish language 5 | [Snowball stemmer](http://snowball.tartarus.org/algorithms/swedish/stemmer.html). 6 | 7 | ## Implementation 8 | 9 | The Swedish language stemmer comprises preprocessing and 3 steps. 
10 | Each of these is defined in a separate file in this 11 | package. All of the steps operate on a `SnowballWord` from the 12 | `snowballword` package and *modify the word in place*. 13 | 14 | ## Caveats 15 | 16 | None -------------------------------------------------------------------------------- /french/postprocess.go: -------------------------------------------------------------------------------- 1 | package french 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | func postprocess(word *snowballword.SnowballWord) { 8 | 9 | // Turn "I", "U", and "Y" into "i", "u", and "y". 10 | // Equivalently, unicode code points 11 | // 73 85 89 -> 105 117 121 12 | 13 | for i := 0; i < len(word.RS); i++ { 14 | switch word.RS[i] { 15 | case 73: 16 | word.RS[i] = 105 17 | case 85: 18 | word.RS[i] = 117 19 | case 89: 20 | word.RS[i] = 121 21 | } 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /norwegian/README.md: -------------------------------------------------------------------------------- 1 | Snowball Norwegian 2 | ================ 3 | 4 | This package implements the Norwegian language 5 | [Snowball stemmer](http://snowball.tartarus.org/algorithms/norwegian/stemmer.html). 6 | 7 | ## Implementation 8 | 9 | The Norwegian language stemmer comprises preprocessing and 3 steps. 10 | Each of these is defined in a separate file in this 11 | package. All of the steps operate on a `SnowballWord` from the 12 | `snowballword` package and *modify the word in place*. 13 | 14 | ## Caveats 15 | 16 | None 17 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | on: [push, pull_request] 2 | name: Build 3 | jobs: 4 | test: 5 | strategy: 6 | matrix: 7 | go-version: [1.19.x, 1.20.x, 1.21.x, 1.22.x] 8 | os: [ubuntu-latest, macos-latest, windows-latest] 9 | runs-on: ${{ matrix.os }} 10 | steps: 11 | - name: Install Go 12 | uses: actions/setup-go@v5 13 | with: 14 | go-version: ${{ matrix.go-version }} 15 | - name: Checkout code 16 | uses: actions/checkout@v4 17 | - name: Test 18 | run: go test ./... 19 | -------------------------------------------------------------------------------- /spanish/README.md: -------------------------------------------------------------------------------- 1 | Snowball Spanish 2 | ================ 3 | 4 | This package implements the 5 | [Spanish language Snowball stemmer](http://snowball.tartarus.org/algorithms/spanish/stemmer.html). 6 | 7 | ## Implementation 8 | 9 | The Spanish language stemmer comprises preprocessing, a number of steps, 10 | and postprocessing. Each of these is defined in a separate file in this 11 | package. All of the steps operate on a `SnowballWord` from the 12 | `snowballword` package and *modify the word in place*. 13 | 14 | ## Caveats 15 | 16 | None yet. 
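
## Example

As a reference, here is a minimal sketch of calling this package directly; the example input word is arbitrary, and the top-level `snowball` package exposes the same stemmer through `snowball.Stem`.

```go
package main

import (
	"fmt"

	"github.com/kljensen/snowball/spanish"
)

func main() {
	// The second argument controls whether stop words are
	// stemmed or returned unchanged.
	fmt.Println(spanish.Stem("albergues", false))
}
```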
-------------------------------------------------------------------------------- /hungarian/common_test.go: -------------------------------------------------------------------------------- 1 | package hungarian 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | func TestFindRegions(t *testing.T) { 10 | for k, want := range map[string]int{ 11 | "tóban": 2, // consonant-vowel 12 | "ablakan": 2, // vowel-consonant 13 | "acsony": 3, // vowel-digraph 14 | "cvs": 3, // null R1 region 15 | } { 16 | got := findRegions(snowballword.New(k)) 17 | if got != want { 18 | t.Errorf("%q: got %d, wanted %d", k, got, want) 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /hungarian/README.md: -------------------------------------------------------------------------------- 1 | Snowball Hungarian 2 | ================ 3 | 4 | This package implements the 5 | [Hungarian language Snowball stemmer](https://snowballstem.org/algorithms/hungarian/stemmer.html) 6 | algorithm by [atordai@science.uval.nl](Anna Tordai). 7 | 8 | ## Implementation 9 | 10 | The Hungarian language stemmer comprises preprocessing, a number of steps, 11 | and postprocessing. Each of these is defined in a separate file in this 12 | package. All of the steps operate on a `SnowballWord` from the 13 | `snowballword` package and *modify the word in place*. 14 | 15 | -------------------------------------------------------------------------------- /norwegian/step2.go: -------------------------------------------------------------------------------- 1 | package norwegian 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 2: Search for one of the following suffixes in R1, 10 | // and if found delete the last letter. 11 | func step2(w *snowballword.SnowballWord) bool { 12 | 13 | suffix := w.FirstSuffix("dt", "vt") 14 | suffixLength := utf8.RuneCountInString(suffix) 15 | 16 | // If it is not in R1, do nothing 17 | if suffix == "" || suffixLength > len(w.RS)-w.R1start { 18 | return false 19 | } 20 | w.RemoveLastNRunes(1) 21 | return true 22 | } 23 | -------------------------------------------------------------------------------- /HISTORY.md: -------------------------------------------------------------------------------- 1 | History 2 | ======= 3 | 4 | ### v0.3.4 / 2013-05-19 5 | 6 | Add gostem program 7 | 8 | ### v0.3.3 / 2013-05-19 9 | 10 | Add large vocabulary tests for each language 11 | 12 | ### v0.3.1 / 2013-05-18 13 | 14 | Meaningless bump 15 | 16 | ### v0.3.0 / 2013-05-18 17 | 18 | Add Russian stemmer. 19 | 20 | ### v0.2.0 / 2013-05-17 21 | 22 | Add French stemmer and move more common code for romance 23 | languages into the `romance` package. 24 | 25 | ### v0.1.1 / 2013-05-14 26 | 27 | Documentation fixes. 28 | 29 | ### v0.1.0 / 2013-05-13 30 | 31 | Added Spanish stemmer and started versioning the project. -------------------------------------------------------------------------------- /french/step3.go: -------------------------------------------------------------------------------- 1 | package french 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | // Step 3 is the cleaning up of "Y" and "ç" suffixes. 
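// Both characters can be left over from earlier processing: "Y" is
// introduced by capitalizeYUI during preprocessing, and a final "ç"
// may be exposed once a suffix has been stripped.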
8 | // 9 | func step3(word *snowballword.SnowballWord) bool { 10 | 11 | // Replace final Y with i or final ç with c 12 | if idx := len(word.RS) - 1; idx >= 0 { 13 | 14 | switch word.RS[idx] { 15 | 16 | case 89: 17 | // Replace Y (89) with "i" (105) 18 | word.RS[idx] = 105 19 | return true 20 | 21 | case 231: 22 | // Replace ç (231) with "c" (99) 23 | word.RS[idx] = 99 24 | return true 25 | } 26 | } 27 | return false 28 | } 29 | -------------------------------------------------------------------------------- /swedish/step2.go: -------------------------------------------------------------------------------- 1 | package swedish 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 2: Search for one of the following suffixes in R1, 10 | // and if found delete the last letter. 11 | func step2(w *snowballword.SnowballWord) bool { 12 | 13 | suffix := w.FirstSuffix( 14 | "dd", "gd", "nn", "dt", "gt", "kt", "tt", 15 | ) 16 | suffixLength := utf8.RuneCountInString(suffix) 17 | 18 | // If it is not in R1, do nothing 19 | if suffix == "" || suffixLength > len(w.RS)-w.R1start { 20 | return false 21 | } 22 | w.RemoveLastNRunes(1) 23 | return true 24 | } 25 | -------------------------------------------------------------------------------- /russian/stem.go: -------------------------------------------------------------------------------- 1 | package russian 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | "strings" 6 | ) 7 | 8 | // Stem an Russian word. This is the only exported 9 | // function in this package. 10 | // 11 | func Stem(word string, stemStopwWords bool) string { 12 | 13 | word = strings.ToLower(strings.TrimSpace(word)) 14 | w := snowballword.New(word) 15 | 16 | // Return small words and stop words 17 | if len(w.RS) <= 2 || (stemStopwWords == false && IsStopWord(word)) { 18 | return word 19 | } 20 | 21 | preprocess(w) 22 | step1(w) 23 | step2(w) 24 | step3(w) 25 | step4(w) 26 | return w.String() 27 | 28 | } 29 | -------------------------------------------------------------------------------- /english/preprocess.go: -------------------------------------------------------------------------------- 1 | package english 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | // Applies various transformations necessary for the 8 | // other, subsequent stemming steps. Most important 9 | // of which is defining the two regions R1 & R2. 10 | // 11 | func preprocess(word *snowballword.SnowballWord) { 12 | 13 | // Clean up apostrophes 14 | normalizeApostrophes(word) 15 | trimLeftApostrophes(word) 16 | 17 | // Capitalize Y's that are not behaving 18 | // as vowels. 19 | capitalizeYs(word) 20 | 21 | // Find the two regions, R1 & R2 22 | r1start, r2start := r1r2(word) 23 | word.R1start = r1start 24 | word.R2start = r2start 25 | } 26 | -------------------------------------------------------------------------------- /english/step1c.go: -------------------------------------------------------------------------------- 1 | package english 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | // Step 1c is the normalization of various "y" endings. 8 | // 9 | func step1c(w *snowballword.SnowballWord) bool { 10 | 11 | rsLen := len(w.RS) 12 | 13 | // Replace suffix y or Y by i if preceded by a non-vowel which is not 14 | // the first letter of the word (so cry -> cri, by -> by, say -> say) 15 | // 16 | // Note: the unicode code points for 17 | // y, Y, & i are 121, 89, & 105 respectively. 
18 | // 19 | if len(w.RS) > 2 && (w.RS[rsLen-1] == 121 || w.RS[rsLen-1] == 89) && !isLowerVowel(w.RS[rsLen-2]) { 20 | w.RS[rsLen-1] = 105 21 | return true 22 | } 23 | return false 24 | } 25 | -------------------------------------------------------------------------------- /english/README.md: -------------------------------------------------------------------------------- 1 | Snowball English 2 | ================ 3 | 4 | This package implements the English language 5 | [Snowball stemmer](http://snowball.tartarus.org/algorithms/english/stemmer.html). 6 | 7 | ## Implementation 8 | 9 | The English language stemmer comprises preprocessing, a number of steps, 10 | and postprocessing. Each of these is defined in a separate file in this 11 | package. All of the steps operate on a `SnowballWord` from the 12 | `snowballword` package and *modify the word in place*. 13 | 14 | ## Caveats 15 | 16 | There is a single difference between this implementation and the original. 17 | Here, all apostrophes on the left hand side of a word are stripped off before 18 | the word is stemmed. -------------------------------------------------------------------------------- /norwegian/stem.go: -------------------------------------------------------------------------------- 1 | package norwegian 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | "strings" 6 | ) 7 | 8 | // Stem a Norwegian word. This is the only exported 9 | // function in this package. 10 | // 11 | func Stem(word string, stemStopwWords bool) string { 12 | 13 | word = strings.ToLower(strings.TrimSpace(word)) 14 | 15 | // Return small words and stop words 16 | if len(word) <= 2 || (stemStopwWords == false && IsStopWord(word)) { 17 | return word 18 | } 19 | 20 | w := snowballword.New(word) 21 | 22 | // Stem the word. Note, each of these 23 | // steps will alter `w` in place. 24 | // 25 | preprocess(w) 26 | step1(w) 27 | step2(w) 28 | step3(w) 29 | 30 | return w.String() 31 | 32 | } 33 | -------------------------------------------------------------------------------- /swedish/stem.go: -------------------------------------------------------------------------------- 1 | package swedish 2 | 3 | import ( 4 | "strings" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Stem a Swedish word. This is the only exported 10 | // function in this package. 11 | // 12 | func Stem(word string, stemStopwWords bool) string { 13 | 14 | word = strings.ToLower(strings.TrimSpace(word)) 15 | 16 | // Return small words and stop words 17 | if len(word) <= 2 || (stemStopwWords == false && IsStopWord(word)) { 18 | return word 19 | } 20 | 21 | w := snowballword.New(word) 22 | 23 | // Stem the word. Note, each of these 24 | // steps will alter `w` in place. 25 | // 26 | preprocess(w) 27 | step1(w) 28 | step2(w) 29 | step3(w) 30 | 31 | return w.String() 32 | 33 | } 34 | -------------------------------------------------------------------------------- /norwegian/step3.go: -------------------------------------------------------------------------------- 1 | package norwegian 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | // Step 3: 8 | // Search for the longest among the following suffixes, 9 | // and, if found and in R1, delete. 10 | 11 | func step3(w *snowballword.SnowballWord) bool { 12 | // Possible sufficies for this step, longest first. 
13 | suffix := w.FirstSuffixIn(w.R1start, len(w.RS), 14 | "hetslov", "eleg", "elig", "elov", "slov", 15 | "leg", "eig", "lig", "els", "lov", "ig", 16 | ) 17 | suffixRunes := []rune(suffix) 18 | 19 | // If it is not in R1, do nothing 20 | if suffix == "" || len(suffixRunes) > len(w.RS)-w.R1start { 21 | return false 22 | } 23 | 24 | w.ReplaceSuffixRunes(suffixRunes, []rune(""), true) 25 | return true 26 | 27 | } 28 | -------------------------------------------------------------------------------- /spanish/step2a.go: -------------------------------------------------------------------------------- 1 | package spanish 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 2a is the removal of verb suffixes beginning y, 10 | // Search for the longest among the following suffixes 11 | // in RV, and if found, delete if preceded by u. 12 | func step2a(word *snowballword.SnowballWord) bool { 13 | suffix := word.FirstSuffixIn(word.RVstart, len(word.RS), "ya", "ye", "yan", "yen", "yeron", "yendo", "yo", "yó", "yas", "yes", "yais", "yamos") 14 | if suffix != "" { 15 | suffixLength := utf8.RuneCountInString(suffix) 16 | idx := len(word.RS) - suffixLength - 1 17 | if idx >= 0 && word.RS[idx] == 117 { 18 | word.RemoveLastNRunes(suffixLength) 19 | return true 20 | } 21 | } 22 | return false 23 | } 24 | -------------------------------------------------------------------------------- /romance/common.go: -------------------------------------------------------------------------------- 1 | package romance 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | // A function type that accepts a rune and 8 | // returns a bool. In this particular case, 9 | // it is used for identifying vowels. 10 | type isVowelFunc func(rune) bool 11 | 12 | // Finds the region after the first non-vowel following a vowel, 13 | // or a the null region at the end of the word if there is no 14 | // such non-vowel. Returns the index in the Word where the 15 | // region starts; optionally skips the first `start` characters. 16 | // 17 | func VnvSuffix(word *snowballword.SnowballWord, f isVowelFunc, start int) int { 18 | for i := 1; i < len(word.RS[start:]); i++ { 19 | j := start + i 20 | if f(word.RS[j-1]) && !f(word.RS[j]) { 21 | return j + 1 22 | } 23 | } 24 | return len(word.RS) 25 | } 26 | -------------------------------------------------------------------------------- /spanish/step3.go: -------------------------------------------------------------------------------- 1 | package spanish 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 3 is the removal of residual suffixes. 10 | func step3(word *snowballword.SnowballWord) bool { 11 | suffix := word.FirstSuffixIfIn(word.RVstart, len(word.RS), 12 | "os", "a", "o", "á", "í", "ó", "e", "é", 13 | ) 14 | 15 | // No suffix found, nothing to do. 
16 | // 17 | if suffix == "" { 18 | return false 19 | } 20 | suffixLength := utf8.RuneCountInString(suffix) 21 | 22 | // Remove all these suffixes 23 | word.RemoveLastNRunes(suffixLength) 24 | 25 | if suffix == "e" || suffix == "é" { 26 | 27 | // If preceded by gu with the u in RV delete the u 28 | // 29 | guSuffix := word.FirstSuffix("gu") 30 | if guSuffix != "" { 31 | word.RemoveLastNRunes(1) 32 | } 33 | } 34 | return true 35 | } 36 | -------------------------------------------------------------------------------- /english/stem.go: -------------------------------------------------------------------------------- 1 | package english 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | "strings" 6 | ) 7 | 8 | // Stem an English word. This is the only exported 9 | // function in this package. 10 | // 11 | func Stem(word string, stemStopwWords bool) string { 12 | 13 | word = strings.ToLower(strings.TrimSpace(word)) 14 | 15 | // Return small words and stop words 16 | if len(word) <= 2 || (stemStopwWords == false && IsStopWord(word)) { 17 | return word 18 | } 19 | 20 | // Return special words immediately 21 | if specialVersion := stemSpecialWord(word); specialVersion != "" { 22 | word = specialVersion 23 | return word 24 | } 25 | 26 | w := snowballword.New(word) 27 | 28 | // Stem the word. Note, each of these 29 | // steps will alter `w` in place. 30 | // 31 | preprocess(w) 32 | step0(w) 33 | step1a(w) 34 | step1b(w) 35 | step1c(w) 36 | step2(w) 37 | step3(w) 38 | step4(w) 39 | step5(w) 40 | postprocess(w) 41 | 42 | return w.String() 43 | 44 | } 45 | -------------------------------------------------------------------------------- /russian/step4.go: -------------------------------------------------------------------------------- 1 | package russian 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | // Step 4 is the undoubling of double non-vowel endings 8 | // and removal of superlative endings. 9 | func step4(word *snowballword.SnowballWord) bool { 10 | 11 | // (1) Undouble "н", or, 2) if the word ends with a SUPERLATIVE ending, 12 | // (remove it and undouble н n), or 3) if the word ends ь (') (soft sign) 13 | // remove it. 14 | 15 | // Undouble "н" 16 | if word.HasSuffixRunes([]rune("нн")) { 17 | word.RemoveLastNRunes(1) 18 | return true 19 | } 20 | 21 | // Remove superlative endings 22 | suffix := word.RemoveFirstSuffix("ейше", "ейш") 23 | if suffix != "" { 24 | // Undouble "н" 25 | if word.HasSuffixRunes([]rune("нн")) { 26 | word.RemoveLastNRunes(1) 27 | } 28 | return true 29 | } 30 | 31 | // Remove soft sign 32 | if rsLen := len(word.RS); rsLen > 0 && word.RS[rsLen-1] == 'ь' { 33 | word.RemoveLastNRunes(1) 34 | return true 35 | } 36 | return false 37 | } 38 | -------------------------------------------------------------------------------- /spanish/stem.go: -------------------------------------------------------------------------------- 1 | package spanish 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | "log" 6 | "strings" 7 | ) 8 | 9 | func printDebug(debug bool, w *snowballword.SnowballWord) { 10 | if debug { 11 | log.Println(w.DebugString()) 12 | } 13 | } 14 | 15 | // Stem an Spanish word. This is the only exported 16 | // function in this package. 
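// Very short words are returned unchanged, as are Spanish stop
// words unless stemStopwWords is true.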
17 | // 18 | func Stem(word string, stemStopwWords bool) string { 19 | 20 | word = strings.ToLower(strings.TrimSpace(word)) 21 | 22 | // Return small words and stop words 23 | if len(word) <= 2 || (stemStopwWords == false && IsStopWord(word)) { 24 | return word 25 | } 26 | 27 | w := snowballword.New(word) 28 | 29 | // Stem the word. Note, each of these 30 | // steps will alter `w` in place. 31 | // 32 | 33 | preprocess(w) 34 | step0(w) 35 | changeInStep1 := step1(w) 36 | if changeInStep1 == false { 37 | changeInStep2a := step2a(w) 38 | if changeInStep2a == false { 39 | step2b(w) 40 | } 41 | } 42 | step3(w) 43 | postprocess(w) 44 | 45 | return w.String() 46 | 47 | } 48 | -------------------------------------------------------------------------------- /gostem/gostem.go: -------------------------------------------------------------------------------- 1 | // 2 | // Creates a binary `gostem` that stems an input file. 3 | // 4 | package main 5 | 6 | import ( 7 | "bufio" 8 | "flag" 9 | "fmt" 10 | "github.com/kljensen/snowball" 11 | "io" 12 | "log" 13 | "os" 14 | "strings" 15 | ) 16 | 17 | func main() { 18 | 19 | var language *string = flag.String("l", "english", "Language") 20 | var infile *string = flag.String("i", "", "Input file for stemming") 21 | flag.Parse() 22 | 23 | f, err := os.Open(*infile) 24 | if err != nil { 25 | log.Fatal(err) 26 | } 27 | 28 | bf := bufio.NewReader(f) 29 | 30 | for { 31 | line, isPrefix, err := bf.ReadLine() 32 | 33 | if err == io.EOF { 34 | break 35 | } 36 | 37 | if err != nil { 38 | log.Fatal(err) 39 | } 40 | 41 | if isPrefix { 42 | log.Fatal("Error: Unexpected long line reading", f.Name()) 43 | } 44 | 45 | word := strings.TrimSpace(string(line)) 46 | stemmed, err := snowball.Stem(word, *language, true) 47 | if err != nil { 48 | log.Println(err) 49 | break 50 | } 51 | fmt.Println(stemmed) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /french/step6.go: -------------------------------------------------------------------------------- 1 | package french 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | // Step 6 Un-accent 8 | // 9 | func step6(word *snowballword.SnowballWord) bool { 10 | 11 | // If the words ends é or è (unicode code points 233 and 232) 12 | // followed by at least one non-vowel, remove the accent from the e. 13 | 14 | // Note, this step is oddly articulated on Porter's Snowball website: 15 | // http://snowball.tartarus.org/algorithms/french/stemmer.html 16 | // More clearly stated, we should replace é or è with e in the 17 | // case where the suffix of the word is é or è followed by 18 | // one-or-more non-vowels. 
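// (Illustrative example, not taken from the original comments: a stem
// left as "complét" by the verb-suffix steps ends in é followed by the
// non-vowel t, so this step turns it into "complet".)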
19 | 20 | numNonVowels := 0 21 | for i := len(word.RS) - 1; i >= 0; i-- { 22 | r := word.RS[i] 23 | 24 | if isLowerVowel(r) == false { 25 | numNonVowels += 1 26 | } else { 27 | 28 | // `r` is a vowel 29 | 30 | if (r == 233 || r == 232) && numNonVowels > 0 { 31 | 32 | // Replace with "e", or unicode code point 101 33 | word.RS[i] = 101 34 | return true 35 | 36 | } 37 | return false 38 | } 39 | 40 | } 41 | return false 42 | } 43 | -------------------------------------------------------------------------------- /snowball.go: -------------------------------------------------------------------------------- 1 | package snowball 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/kljensen/snowball/english" 7 | "github.com/kljensen/snowball/french" 8 | "github.com/kljensen/snowball/hungarian" 9 | "github.com/kljensen/snowball/norwegian" 10 | "github.com/kljensen/snowball/russian" 11 | "github.com/kljensen/snowball/spanish" 12 | "github.com/kljensen/snowball/swedish" 13 | ) 14 | 15 | const ( 16 | VERSION string = "v0.7.0" 17 | ) 18 | 19 | // Stem a word in the specified language. 20 | func Stem(word, language string, stemStopWords bool) (stemmed string, err error) { 21 | 22 | var f func(string, bool) string 23 | switch language { 24 | case "english": 25 | f = english.Stem 26 | case "spanish": 27 | f = spanish.Stem 28 | case "french": 29 | f = french.Stem 30 | case "russian": 31 | f = russian.Stem 32 | case "swedish": 33 | f = swedish.Stem 34 | case "norwegian": 35 | f = norwegian.Stem 36 | case "hungarian": 37 | f = hungarian.Stem 38 | default: 39 | err = fmt.Errorf("Unknown language: %s", language) 40 | return 41 | } 42 | stemmed = f(word, stemStopWords) 43 | return 44 | 45 | } 46 | -------------------------------------------------------------------------------- /french/step2a.go: -------------------------------------------------------------------------------- 1 | package french 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 2a is the removal of Verb suffixes beginning 10 | // with "i" in the RV region. 11 | func step2a(word *snowballword.SnowballWord) bool { 12 | 13 | // Search for the longest among the following suffixes 14 | // in RV and if found, delete if preceded by a non-vowel. 15 | 16 | suffix := word.FirstSuffixIn(word.RVstart, len(word.RS), 17 | "issantes", "issaIent", "issions", "issants", "issante", 18 | "iraIent", "issons", "issiez", "issent", "issant", "issait", 19 | "issais", "irions", "issez", "isses", "iront", "irons", "iriez", 20 | "irent", "irait", "irais", "îtes", "îmes", "isse", "irez", 21 | "iras", "irai", "ira", "ies", "ît", "it", "is", "ir", "ie", "i", 22 | ) 23 | 24 | if suffix != "" { 25 | suffixLength := utf8.RuneCountInString(suffix) 26 | idx := len(word.RS) - suffixLength - 1 27 | if idx >= 0 && word.FitsInRV(suffixLength+1) && isLowerVowel(word.RS[idx]) == false { 28 | word.RemoveLastNRunes(suffixLength) 29 | return true 30 | } 31 | } 32 | return false 33 | } 34 | -------------------------------------------------------------------------------- /swedish/step3.go: -------------------------------------------------------------------------------- 1 | package swedish 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 3: 10 | // Search for the longest among the following suffixes, 11 | // and, if found and in R1, perform the action indicated. 
12 | 13 | // Delete: 14 | // lig, els & ig 15 | // Replace: 16 | // fullt: full, löst: lös 17 | 18 | func step3(w *snowballword.SnowballWord) bool { 19 | // Possible sufficies for this step, longest first. 20 | suffix := w.FirstSuffixIn(w.R1start, len(w.RS), 21 | "fullt", "löst", "lig", "els", "ig", 22 | ) 23 | suffixLength := utf8.RuneCountInString(suffix) 24 | 25 | // If it is not in R1, do nothing 26 | if suffix == "" || suffixLength > len(w.RS)-w.R1start { 27 | return false 28 | } 29 | 30 | // Handle a suffix that was found, which is going 31 | // to be replaced with a different suffix. 32 | // 33 | var repl string 34 | switch suffix { 35 | case "fullt": 36 | repl = "full" 37 | case "löst": 38 | repl = "lös" 39 | case "lig", "ig", "els": 40 | repl = "" 41 | } 42 | w.ReplaceSuffixRunes([]rune(suffix), []rune(repl), true) 43 | return true 44 | 45 | } 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) The project creators and maintainers 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /english/step5.go: -------------------------------------------------------------------------------- 1 | package english 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | // Step 5 is the stemming of "e" and "l" sufficies 8 | // found in R2. 9 | // 10 | func step5(w *snowballword.SnowballWord) bool { 11 | 12 | // Last rune index = `lri` 13 | lri := len(w.RS) - 1 14 | 15 | // If R1 is emtpy, R2 is also empty, and we 16 | // need not do anything in step 5. 17 | // 18 | if w.R1start > lri { 19 | return false 20 | } 21 | 22 | if w.RS[lri] == 101 { 23 | 24 | // The word ends with "e", which is unicode code point 101. 25 | 26 | // Delete "e" suffix if in R2, or in R1 and not preceded 27 | // by a short syllable. 28 | if w.R2start <= lri || !endsShortSyllable(w, lri) { 29 | w.ReplaceSuffix("e", "", true) 30 | return true 31 | } 32 | return false 33 | 34 | } else if w.R2start <= lri && w.RS[lri] == 108 && lri-1 >= 0 && w.RS[lri-1] == 108 { 35 | 36 | // The word ends in double "l", and the final "l" is 37 | // in R2. (Note, the unicode code point for "l" is 108.) 38 | 39 | // Delete the second "l". 
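// For example, "controlling" loses "ing" in step 1b, leaving
// "controll"; this branch then removes the final "l" to give "control".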
40 | w.ReplaceSuffix("l", "", true) 41 | return true 42 | 43 | } 44 | return false 45 | } 46 | -------------------------------------------------------------------------------- /english/step1a.go: -------------------------------------------------------------------------------- 1 | package english 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 1a is normalization of various special "s"-endings. 10 | func step1a(w *snowballword.SnowballWord) bool { 11 | 12 | suffix := w.FirstSuffix("sses", "ied", "ies", "us", "ss", "s") 13 | switch suffix { 14 | 15 | case "sses": 16 | 17 | // Replace by ss 18 | w.ReplaceSuffixRunes([]rune(suffix), []rune("ss"), true) 19 | return true 20 | 21 | case "ies", "ied": 22 | 23 | // Replace by i if preceded by more than one letter, 24 | // otherwise by ie (so ties -> tie, cries -> cri). 25 | 26 | var repl string 27 | if len(w.RS) > 4 { 28 | repl = "i" 29 | } else { 30 | repl = "ie" 31 | } 32 | w.ReplaceSuffixRunes([]rune(suffix), []rune(repl), true) 33 | return true 34 | 35 | case "us", "ss": 36 | 37 | // Do nothing 38 | return false 39 | 40 | case "s": 41 | // Delete if the preceding word part contains a vowel 42 | // not immediately before the s (so gas and this retain 43 | // the s, gaps and kiwis lose it) 44 | // 45 | suffixLength := utf8.RuneCountInString(suffix) 46 | for i := 0; i < len(w.RS)-2; i++ { 47 | if isLowerVowel(w.RS[i]) { 48 | w.RemoveLastNRunes(suffixLength) 49 | return true 50 | } 51 | } 52 | } 53 | return false 54 | } 55 | -------------------------------------------------------------------------------- /french/stem.go: -------------------------------------------------------------------------------- 1 | package french 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | "strings" 6 | ) 7 | 8 | // Stem an French word. This is the only exported 9 | // function in this package. 10 | // 11 | func Stem(word string, stemStopwWords bool) string { 12 | 13 | word = strings.ToLower(strings.TrimSpace(word)) 14 | 15 | // Return small words and stop words 16 | if len(word) <= 2 || (stemStopwWords == false && IsStopWord(word)) { 17 | return word 18 | } 19 | 20 | w := snowballword.New(word) 21 | 22 | // Stem the word. Note, each of these 23 | // steps will alter `w` in place. 24 | // 25 | 26 | preprocess(w) 27 | var ( 28 | changeInStep1 bool 29 | changeInStep2a bool 30 | changeInStep2b bool 31 | ) 32 | 33 | changeInStep1 = step1(w) 34 | if changeInStep1 == false { 35 | changeInStep2a = step2a(w) 36 | if changeInStep2a == false { 37 | changeInStep2b = step2b(w) 38 | } 39 | } 40 | 41 | // If the last step was successful, do step 3. Note that, 42 | // since we only do 2a if 1 is unsuccessful, the following 43 | // "if" condition tests to see if the previous step was 44 | // successful. 45 | // 46 | if changeInStep1 || changeInStep2a || changeInStep2b { 47 | step3(w) 48 | } else { 49 | step4(w) 50 | } 51 | 52 | step5(w) 53 | step6(w) 54 | postprocess(w) 55 | return w.String() 56 | 57 | } 58 | -------------------------------------------------------------------------------- /english/step4.go: -------------------------------------------------------------------------------- 1 | package english 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 4: 10 | // Search for the longest among the following suffixes, 11 | // and, if found and in R2, perform the action indicated. 
12 | 13 | // al, ance, ence, er, ic, able, ible, ant, ement, ment, 14 | // ent, ism, ate, iti, ous, ive, ize 15 | // delete 16 | // 17 | // ion 18 | // delete if preceded by s or t 19 | func step4(w *snowballword.SnowballWord) bool { 20 | 21 | // Find all endings in R1 22 | suffix := w.FirstSuffix( 23 | "ement", "ance", "ence", "able", "ible", "ment", 24 | "ent", "ant", "ism", "ate", "iti", "ous", "ive", 25 | "ize", "ion", "al", "er", "ic", 26 | ) 27 | suffixLength := utf8.RuneCountInString(suffix) 28 | 29 | // If it does not fit in R2, do nothing. 30 | if suffixLength > len(w.RS)-w.R2start { 31 | return false 32 | } 33 | 34 | // Handle special cases 35 | switch suffix { 36 | case "": 37 | return false 38 | 39 | case "ion": 40 | // Replace by og if preceded by l 41 | // l = 108 42 | rsLen := len(w.RS) 43 | if rsLen >= 4 { 44 | switch w.RS[rsLen-4] { 45 | case 115, 116: 46 | w.RemoveLastNRunes(suffixLength) 47 | return true 48 | } 49 | 50 | } 51 | return false 52 | } 53 | 54 | // Handle basic replacements 55 | w.RemoveLastNRunes(suffixLength) 56 | return true 57 | 58 | } 59 | -------------------------------------------------------------------------------- /english/step3.go: -------------------------------------------------------------------------------- 1 | package english 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 3 is the stemming of various longer sufficies 10 | // found in R1. 11 | func step3(w *snowballword.SnowballWord) bool { 12 | 13 | suffix := w.FirstSuffix( 14 | "ational", "tional", "alize", "icate", "ative", 15 | "iciti", "ical", "ful", "ness", 16 | ) 17 | 18 | suffixLength := utf8.RuneCountInString(suffix) 19 | 20 | // If it is not in R1, do nothing 21 | if suffix == "" || suffixLength > len(w.RS)-w.R1start { 22 | return false 23 | } 24 | 25 | // Handle special cases where we're not just going to 26 | // replace the suffix with another suffix: there are 27 | // other things we need to do. 28 | // 29 | if suffix == "ative" { 30 | 31 | // If in R2, delete. 32 | // 33 | if len(w.RS)-w.R2start >= 5 { 34 | w.RemoveLastNRunes(suffixLength) 35 | return true 36 | } 37 | return false 38 | } 39 | 40 | // Handle a suffix that was found, which is going 41 | // to be replaced with a different suffix. 42 | // 43 | var repl string 44 | switch suffix { 45 | case "ational": 46 | repl = "ate" 47 | case "tional": 48 | repl = "tion" 49 | case "alize": 50 | repl = "al" 51 | case "icate", "iciti", "ical": 52 | repl = "ic" 53 | case "ful", "ness": 54 | repl = "" 55 | } 56 | w.ReplaceSuffixRunes([]rune(suffix), []rune(repl), true) 57 | return true 58 | 59 | } 60 | -------------------------------------------------------------------------------- /russian/README.md: -------------------------------------------------------------------------------- 1 | Snowball Russian 2 | ================ 3 | 4 | This package implements the 5 | [Russian language Snowball stemmer](http://snowball.tartarus.org/algorithms/russian/stemmer.html). 6 | 7 | ## Russian overview 8 | 9 | Russian has 33 letters, 11 Vowels, 20 consonants 10 | and 2 unpronounced signs. The capital letters 11 | look the same as the lower case letters, with 12 | the exception of cursive capital letter and 13 | lower case. 14 | 15 | ## Implementation 16 | 17 | The Russian language stemmer comprises preprocessing, a number of steps. 18 | Each of these is defined in a separate file in this 19 | package. 
All of the steps operate on a `SnowballWord` from the 20 | `snowballword` package and *modify the word in place*. 21 | 22 | ## Caveats 23 | 24 | The [example vocabulary for the original Russian snowball stemmer](http://snowball.tartarus.org/algorithms/russian/voc.txt) contains the word "злейший", which means "worst" in English. 25 | This word contains the adjectival suffix "ий" preceded by the superlative suffix "ейш". 26 | The [output for the example vocabulary](http://snowball.tartarus.org/algorithms/russian/output.txt) 27 | indicates that this word should be stemmed to "злейш". However, this implementation stems 28 | the word to "зл". 29 | The [Python NLTK](https://github.com/nltk/nltk/blob/master/nltk/stem/snowball.py#L2879) 30 | implementation also stems "злейший" to "зл". 31 | It is unclear to me how the original snowball implementation would possibly produce "злейш". 32 | So, I removed that word from the tests. -------------------------------------------------------------------------------- /swedish/step1.go: -------------------------------------------------------------------------------- 1 | package swedish 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 1 is the stemming of various endings found in 10 | // R1 including "heterna", "ornas", and "andet". 11 | func step1(w *snowballword.SnowballWord) bool { 12 | 13 | // Possible sufficies for this step, longest first. 14 | suffixes := []string{ 15 | "heterna", "hetens", "anden", "heten", "heter", "arnas", 16 | "ernas", "ornas", "andes", "arens", "andet", "arna", "erna", 17 | "orna", "ande", "arne", "aste", "aren", "ades", "erns", "ade", 18 | "are", "ern", "ens", "het", "ast", "ad", "en", "ar", "er", 19 | "or", "as", "es", "at", "a", "e", "s", 20 | } 21 | 22 | // Using FirstSuffixIn since there are overlapping suffixes, where some might not be in the R1, 23 | // while another might. For example: "ärade" 24 | suffix := w.FirstSuffixIn(w.R1start, len(w.RS), suffixes...) 25 | suffixLength := utf8.RuneCountInString(suffix) 26 | 27 | // If it is not in R1, do nothing 28 | if suffix == "" || suffixLength > len(w.RS)-w.R1start { 29 | return false 30 | } 31 | 32 | if suffix == "s" { 33 | // Delete if preceded by a valid s-ending. Valid s-endings inlude the 34 | // following charaters: bcdfghjklmnoprtvy. 35 | // 36 | rsLen := len(w.RS) 37 | if rsLen >= 2 { 38 | switch w.RS[rsLen-2] { 39 | case 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 40 | 'l', 'm', 'n', 'o', 'p', 'r', 't', 'v', 'y': 41 | w.RemoveLastNRunes(suffixLength) 42 | return true 43 | } 44 | } 45 | return false 46 | } 47 | // Remove the suffix 48 | w.RemoveLastNRunes(suffixLength) 49 | return true 50 | } 51 | -------------------------------------------------------------------------------- /french/step2b.go: -------------------------------------------------------------------------------- 1 | package french 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 2b is the removal of Verb suffixes in RV 10 | // that do not begin with "i". 11 | func step2b(word *snowballword.SnowballWord) bool { 12 | 13 | // Search for the longest among the following suffixes in RV. 
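// Unlike step 2a, no preceding non-vowel is required; what happens to
// a match depends on which group it falls into in the switch below.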
14 | // 15 | suffix := word.FirstSuffixIn(word.RVstart, len(word.RS), 16 | "eraIent", "assions", "erions", "assiez", "assent", 17 | "èrent", "eront", "erons", "eriez", "erait", "erais", 18 | "asses", "antes", "aIent", "âtes", "âmes", "ions", 19 | "erez", "eras", "erai", "asse", "ants", "ante", "ées", 20 | "iez", "era", "ant", "ait", "ais", "és", "ée", "ât", 21 | "ez", "er", "as", "ai", "é", "a", 22 | ) 23 | 24 | suffixLen := utf8.RuneCountInString(suffix) 25 | switch suffix { 26 | case "ions": 27 | 28 | // Delete if in R2 29 | if word.FitsInR2(suffixLen) { 30 | word.RemoveLastNRunes(suffixLen) 31 | return true 32 | } 33 | return false 34 | 35 | case "é", "ée", "ées", "és", "èrent", "er", "era", 36 | "erai", "eraIent", "erais", "erait", "eras", "erez", 37 | "eriez", "erions", "erons", "eront", "ez", "iez": 38 | 39 | // Delete 40 | word.RemoveLastNRunes(suffixLen) 41 | return true 42 | 43 | case "âmes", "ât", "âtes", "a", "ai", "aIent", 44 | "ais", "ait", "ant", "ante", "antes", "ants", "as", 45 | "asse", "assent", "asses", "assiez", "assions": 46 | 47 | // Delete 48 | word.RemoveLastNRunes(suffixLen) 49 | 50 | // If preceded by e (unicode code point 101), delete 51 | // 52 | idx := len(word.RS) - 1 53 | if idx >= 0 && word.RS[idx] == 101 && word.FitsInRV(1) { 54 | word.RemoveLastNRunes(1) 55 | } 56 | return true 57 | 58 | } 59 | return false 60 | } 61 | -------------------------------------------------------------------------------- /spanish/step2b.go: -------------------------------------------------------------------------------- 1 | package spanish 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 2b is the removal of verb suffixes beginning y, 10 | // Search for the longest among the following suffixes 11 | // in RV, and if found, delete if preceded by u. 
12 | func step2b(word *snowballword.SnowballWord) bool { 13 | suffix := word.FirstSuffixIn(word.RVstart, len(word.RS), 14 | "iésemos", "iéramos", "iríamos", "eríamos", "aríamos", "ásemos", 15 | "áramos", "ábamos", "isteis", "iríais", "iremos", "ieseis", 16 | "ierais", "eríais", "eremos", "asteis", "aríais", "aremos", 17 | "íamos", "irías", "irían", "iréis", "ieses", "iesen", "ieron", 18 | "ieras", "ieran", "iendo", "erías", "erían", "eréis", "aseis", 19 | "arías", "arían", "aréis", "arais", "abais", "íais", "iste", 20 | "iría", "irás", "irán", "imos", "iese", "iera", "idos", "idas", 21 | "ería", "erás", "erán", "aste", "ases", "asen", "aría", "arás", 22 | "arán", "aron", "aras", "aran", "ando", "amos", "ados", "adas", 23 | "abas", "aban", "ías", "ían", "éis", "áis", "iré", "irá", "ido", 24 | "ida", "eré", "erá", "emos", "ase", "aré", "ará", "ara", "ado", 25 | "ada", "aba", "ís", "ía", "ió", "ir", "id", "es", "er", "en", 26 | "ed", "as", "ar", "an", "ad", 27 | ) 28 | suffixLength := utf8.RuneCountInString(suffix) 29 | 30 | switch suffix { 31 | case "": 32 | return false 33 | 34 | case "en", "es", "éis", "emos": 35 | 36 | // Delete, and if preceded by gu delete the u (the gu need not be in RV) 37 | word.RemoveLastNRunes(suffixLength) 38 | guSuffix := word.FirstSuffix("gu") 39 | if guSuffix != "" { 40 | word.RemoveLastNRunes(1) 41 | } 42 | 43 | default: 44 | 45 | // Delete 46 | word.RemoveLastNRunes(suffixLength) 47 | } 48 | return true 49 | } 50 | -------------------------------------------------------------------------------- /norwegian/step1.go: -------------------------------------------------------------------------------- 1 | package norwegian 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 1 is the stemming of various endings found in 10 | // R1 including "hetene", "endes", and "ande". 11 | func step1(w *snowballword.SnowballWord) bool { 12 | 13 | // Possible sufficies for this step, longest first. 14 | suffixes := []string{ 15 | "hetenes", "hetene", "hetens", "endes", "heter", "heten", "ende", 16 | "ande", "edes", "enes", "ene", "ane", "ets", "ers", "ede", "ast", 17 | "ens", "het", "as", "es", "en", "ar", "er", "et", "e", "a", "s", 18 | } 19 | 20 | // Using FirstSuffixIn since there are overlapping suffixes, where some might not be in the R1, 21 | suffix := w.FirstSuffixIn(w.R1start, len(w.RS), suffixes...) 22 | suffixLength := utf8.RuneCountInString(suffix) 23 | 24 | if suffix == "s" { 25 | // Delete if preceded by a valid s-ending. 
Valid s-endings inlude the 26 | // following charaters: bcdfghjlmnoprtvyz or k not preceded by a vowel 27 | rsLen := len(w.RS) 28 | 29 | if rsLen >= 2 { 30 | switch w.RS[rsLen-2] { 31 | case 'b', 'c', 'd', 'f', 'g', 'h', 'j', 32 | 'l', 'm', 'n', 'o', 'p', 'r', 't', 'v', 'y', 'z': 33 | 34 | w.RemoveLastNRunes(suffixLength) 35 | return true 36 | case 'k': 37 | if !isLowerVowel(w.RS[rsLen-3]) { 38 | w.RemoveLastNRunes(suffixLength) 39 | return true 40 | } 41 | } 42 | } 43 | 44 | return false 45 | } 46 | 47 | // Remove the suffix 48 | w.RemoveLastNRunes(suffixLength) 49 | 50 | // replace "erte" and "ert" with "er" 51 | suffix = w.FirstSuffix("erte", "ert") 52 | suffixLength = utf8.RuneCountInString(suffix) 53 | 54 | if suffix == "" || suffixLength > len(w.RS)-w.R1start { 55 | return false 56 | } 57 | 58 | w.ReplaceSuffixRunes([]rune(suffix), []rune("er"), true) 59 | 60 | return true 61 | } 62 | -------------------------------------------------------------------------------- /spanish/step0.go: -------------------------------------------------------------------------------- 1 | package spanish 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 0 is the removal of attached pronouns 10 | func step0(word *snowballword.SnowballWord) bool { 11 | 12 | // Search for the longest among the following suffixes 13 | suffix1 := word.FirstSuffixIn(word.RVstart, len(word.RS), 14 | "selas", "selos", "sela", "selo", "las", "les", 15 | "los", "nos", "me", "se", "la", "le", "lo", 16 | ) 17 | 18 | // If the suffix empty or not in RV, we have nothing to do. 19 | if suffix1 == "" { 20 | return false 21 | } 22 | s1Len := utf8.RuneCountInString(suffix1) 23 | 24 | // We'll remove suffix1, if comes after one of the following 25 | suffix2 := word.FirstSuffixIn(word.RVstart, len(word.RS)-len(suffix1), 26 | "iéndo", "iendo", "yendo", "ando", "ándo", 27 | "ár", "ér", "ír", "ar", "er", "ir", 28 | ) 29 | switch suffix2 { 30 | case "": 31 | 32 | // Nothing to do 33 | return false 34 | 35 | case "iéndo", "ándo", "ár", "ér", "ír": 36 | 37 | // In these cases, deletion is followed by removing 38 | // the acute accent (e.g., haciéndola -> haciendo). 39 | 40 | var suffix2repl string 41 | switch suffix2 { 42 | case "": 43 | return false 44 | case "iéndo": 45 | suffix2repl = "iendo" 46 | case "ándo": 47 | suffix2repl = "ando" 48 | case "ár": 49 | suffix2repl = "ar" 50 | case "ír": 51 | suffix2repl = "ir" 52 | } 53 | word.RemoveLastNRunes(s1Len) 54 | word.ReplaceSuffixRunes([]rune(suffix2), []rune(suffix2repl), true) 55 | return true 56 | 57 | case "ando", "iendo", "ar", "er", "ir": 58 | word.RemoveLastNRunes(s1Len) 59 | return true 60 | 61 | case "yendo": 62 | 63 | // In the case of "yendo", the "yendo" must lie in RV, 64 | // and be preceded by a "u" somewhere in the word. 65 | 66 | for i := 0; i < len(word.RS)-(len(suffix1)+len(suffix2)); i++ { 67 | 68 | // Note, the unicode code point for "u" is 117. 69 | if word.RS[i] == 117 { 70 | word.RemoveLastNRunes(s1Len) 71 | return true 72 | } 73 | } 74 | } 75 | return false 76 | } 77 | -------------------------------------------------------------------------------- /swedish/common.go: -------------------------------------------------------------------------------- 1 | package swedish 2 | 3 | import ( 4 | "github.com/kljensen/snowball/romance" 5 | "github.com/kljensen/snowball/snowballword" 6 | ) 7 | 8 | // Find the starting point of the region R1. 
9 | // 10 | // R1 is the region after the first non-vowel following a vowel, 11 | // or is the null region at the end of the word if there is no 12 | // such non-vowel. R2 is not used in Swedish 13 | // 14 | // See http://snowball.tartarus.org/texts/r1r2.html 15 | // 16 | func r1(word *snowballword.SnowballWord) (r1start int) { 17 | // Like the German R1, the length of the Swedish R1 is adjusted to be at least three. 18 | r1start = romance.VnvSuffix(word, isLowerVowel, 0) 19 | if r1start < 3 && len(word.RS) >= 3 { 20 | r1start = 3 21 | } 22 | return 23 | } 24 | 25 | // Checks if a rune is a lowercase Swedish vowel. 26 | // 27 | func isLowerVowel(r rune) bool { 28 | switch r { 29 | case 'a', 'e', 'i', 'o', 'u', 'y', 'å', 'ä', 'ö': 30 | return true 31 | } 32 | return false 33 | } 34 | 35 | // Return `true` if the input `word` is a Swedish stop word. 36 | // 37 | func IsStopWord(word string) bool { 38 | switch word { 39 | case "och", "det", "att", "i", "en", "jag", "hon", "som", "han", 40 | "på", "den", "med", "var", "sig", "för", "så", "till", "är", "men", 41 | "ett", "om", "hade", "de", "av", "icke", "mig", "du", "henne", "då", 42 | "sin", "nu", "har", "inte", "hans", "honom", "skulle", "hennes", 43 | "där", "min", "man", "ej", "vid", "kunde", "något", "från", "ut", 44 | "när", "efter", "upp", "vi", "dem", "vara", "vad", "över", "än", 45 | "dig", "kan", "sina", "här", "ha", "mot", "alla", "under", "någon", 46 | "eller", "allt", "mycket", "sedan", "ju", "denna", "själv", "detta", 47 | "åt", "utan", "varit", "hur", "ingen", "mitt", "ni", "bli", "blev", 48 | "oss", "din", "dessa", "några", "deras", "blir", "mina", "samma", 49 | "vilken", "er", "sådan", "vår", "blivit", "dess", "inom", "mellan", 50 | "sådant", "varför", "varje", "vilka", "ditt", "vem", "vilket", 51 | "sitta", "sådana", "vart", "dina", "vars", "vårt", "våra", 52 | "ert", "era", "vilkas": 53 | return true 54 | } 55 | return false 56 | } 57 | -------------------------------------------------------------------------------- /french/step4.go: -------------------------------------------------------------------------------- 1 | package french 2 | 3 | import ( 4 | "log" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 4 is the cleaning up of residual suffixes. 10 | func step4(word *snowballword.SnowballWord) bool { 11 | 12 | hadChange := false 13 | 14 | if word.String() == "voudrion" { 15 | log.Println("...", word) 16 | } 17 | 18 | // If the word ends s (unicode code point 115), 19 | // not preceded by a, i, o, u, è or s, delete it. 20 | // 21 | if idx := len(word.RS) - 1; idx >= 1 && word.RS[idx] == 115 { 22 | switch word.RS[idx-1] { 23 | 24 | case 97, 105, 111, 117, 232, 115: 25 | 26 | // Do nothing, preceded by a, i, o, u, è or s 27 | return false 28 | 29 | default: 30 | word.RemoveLastNRunes(1) 31 | hadChange = true 32 | 33 | } 34 | } 35 | 36 | // Note: all the following are restricted to the RV region. 37 | 38 | // Search for the longest among the following suffixes in RV. 
39 | // 40 | suffix := word.FirstSuffixIn(word.RVstart, len(word.RS), 41 | "Ière", "ière", "Ier", "ier", "ion", "e", "ë", 42 | ) 43 | 44 | switch suffix { 45 | case "": 46 | return hadChange 47 | case "ion": 48 | 49 | // Delete if in R2 and preceded by s or t in RV 50 | 51 | const suffixLength int = 3 // equivalently, len(suffixRunes) 52 | idx := len(word.RS) - suffixLength - 1 53 | if word.FitsInR2(suffixLength) && idx >= 0 && word.FitsInRV(suffixLength+1) { 54 | if word.RS[idx] == 115 || word.RS[idx] == 116 { 55 | word.RemoveLastNRunes(suffixLength) 56 | return true 57 | } 58 | } 59 | return hadChange 60 | 61 | case "ier", "ière", "Ier", "Ière": 62 | // Replace with i 63 | suffixRunes := []rune(suffix) 64 | word.ReplaceSuffixRunes(suffixRunes, []rune("i"), true) 65 | return true 66 | 67 | case "e": 68 | word.RemoveLastNRunes(1) 69 | return true 70 | 71 | case "ë": 72 | 73 | // If preceded by gu (unicode code point 103 & 117), delete 74 | idx := len(word.RS) - 1 75 | if idx >= 2 && word.RS[idx-2] == 103 && word.RS[idx-1] == 117 { 76 | word.RemoveLastNRunes(1) 77 | return true 78 | } 79 | return hadChange 80 | } 81 | 82 | return true 83 | } 84 | -------------------------------------------------------------------------------- /norwegian/common.go: -------------------------------------------------------------------------------- 1 | package norwegian 2 | 3 | import ( 4 | "github.com/kljensen/snowball/romance" 5 | "github.com/kljensen/snowball/snowballword" 6 | ) 7 | 8 | // Find the starting point of the region R1. 9 | // 10 | // R1 is the region after the first non-vowel following a vowel, 11 | // or is the null region at the end of the word if there is no 12 | // such non-vowel. R2 is not used in Norwegian 13 | // 14 | // See http://snowball.tartarus.org/texts/r1r2.html 15 | // 16 | func r1(word *snowballword.SnowballWord) (r1start int) { 17 | // Like the German R1, the length of the Norwegian R1 is adjusted to be at least three. 18 | r1start = romance.VnvSuffix(word, isLowerVowel, 0) 19 | if r1start < 3 && len(word.RS) >= 3 { 20 | r1start = 3 21 | } 22 | return 23 | } 24 | 25 | // Checks if a rune is a lowercase Norwegian vowel. 26 | // 27 | func isLowerVowel(r rune) bool { 28 | switch r { 29 | case 'a', 'e', 'i', 'o', 'u', 'y', 'æ', 'ø', 'å': 30 | return true 31 | } 32 | return false 33 | } 34 | 35 | // Return `true` if the input `word` is a Norwegian stop word. 
36 | // 37 | func IsStopWord(word string) bool { 38 | switch word { 39 | case "ut", "få", "hadde", "hva", "tilbake", "vil", "han", "meget", "men", "vi", "en", "før", 40 | "samme", "stille", "inn", "er", "kan", "makt", "ved", "forsøke", "hvis", "part", "rett", 41 | "måte", "denne", "mer", "i", "lang", "ny", "hans", "hvilken", "tid", "vite", "her", "opp", 42 | "var", "navn", "mye", "om", "sant", "tilstand", "der", "ikke", "mest", "punkt", "hvem", 43 | "skulle", "mange", "over", "vårt", "alle", "arbeid", "lik", "like", "gå", "når", "siden", 44 | "å", "begge", "bruke", "eller", "og", "til", "da", "et", "hvorfor", "nå", "sist", "slutt", 45 | "deres", "det", "hennes", "så", "mens", "bra", "din", "fordi", "gjøre", "god", "ha", "start", 46 | "andre", "må", "med", "under", "meg", "oss", "innen", "på", "verdi", "ville", "kunne", "uten", 47 | "vår", "slik", "ene", "folk", "min", "riktig", "enhver", "bort", "enn", "nei", "som", "våre", "disse", 48 | "gjorde", "lage", "si", "du", "fra", "også", "hvordan", "av", "eneste", "for", "hvor", "først", "hver": 49 | return true 50 | } 51 | return false 52 | } 53 | -------------------------------------------------------------------------------- /russian/common.go: -------------------------------------------------------------------------------- 1 | package russian 2 | 3 | import ( 4 | "github.com/kljensen/snowball/romance" 5 | "github.com/kljensen/snowball/snowballword" 6 | ) 7 | 8 | // Checks if a rune is a lowercase Russian vowel. 9 | // 10 | func isLowerVowel(r rune) bool { 11 | 12 | // The Russian vowels are "аеиоуыэюя", which 13 | // are referenced by their unicode code points 14 | // in the switch statement below. 15 | switch r { 16 | case 1072, 1077, 1080, 1086, 1091, 1099, 1101, 1102, 1103: 17 | return true 18 | } 19 | return false 20 | } 21 | 22 | // Return `true` if the input `word` is a Russian stop word. 23 | // 24 | func IsStopWord(word string) bool { 25 | switch word { 26 | case "и", "в", "во", "не", "что", "он", "на", "я", "с", 27 | "со", "как", "а", "то", "все", "она", "так", "его", 28 | "но", "да", "ты", "к", "у", "же", "вы", "за", "бы", 29 | "по", "только", "ее", "мне", "было", "вот", "от", 30 | "меня", "еще", "нет", "о", "из", "ему", "теперь", 31 | "когда", "даже", "ну", "вдруг", "ли", "если", "уже", 32 | "или", "ни", "быть", "был", "него", "до", "вас", 33 | "нибудь", "опять", "уж", "вам", "ведь", "там", "потом", 34 | "себя", "ничего", "ей", "может", "они", "тут", "где", 35 | "есть", "надо", "ней", "для", "мы", "тебя", "их", 36 | "чем", "была", "сам", "чтоб", "без", "будто", "чего", 37 | "раз", "тоже", "себе", "под", "будет", "ж", "тогда", 38 | "кто", "этот", "того", "потому", "этого", "какой", 39 | "совсем", "ним", "здесь", "этом", "один", "почти", 40 | "мой", "тем", "чтобы", "нее", "сейчас", "были", "куда", 41 | "зачем", "всех", "никогда", "можно", "при", "наконец", 42 | "два", "об", "другой", "хоть", "после", "над", "больше", 43 | "тот", "через", "эти", "нас", "про", "всего", "них", 44 | "какая", "много", "разве", "три", "эту", "моя", 45 | "впрочем", "хорошо", "свою", "этой", "перед", "иногда", 46 | "лучше", "чуть", "том", "нельзя", "такой", "им", "более", 47 | "всегда", "конечно", "всю", "между": 48 | return true 49 | } 50 | return false 51 | } 52 | 53 | // Find the starting point of the regions R1, R2, & RV 54 | // 55 | func findRegions(word *snowballword.SnowballWord) (r1start, r2start, rvstart int) { 56 | 57 | // R1 & R2 are defined in the standard manner.
58 | r1start = romance.VnvSuffix(word, isLowerVowel, 0) 59 | r2start = romance.VnvSuffix(word, isLowerVowel, r1start) 60 | 61 | // Set RV, by default, as empty. 62 | rvstart = len(word.RS) 63 | 64 | // RV is the region after the first vowel, or the end of 65 | // the word if it contains no vowel. 66 | // 67 | for i := 0; i < len(word.RS); i++ { 68 | if isLowerVowel(word.RS[i]) { 69 | rvstart = i + 1 70 | break 71 | } 72 | } 73 | 74 | return 75 | } 76 | -------------------------------------------------------------------------------- /snowball_test.go: -------------------------------------------------------------------------------- 1 | package snowball 2 | 3 | import ( 4 | "regexp" 5 | "testing" 6 | ) 7 | 8 | func Test_Stem(t *testing.T) { 9 | testCases := []struct { 10 | in string 11 | language string 12 | stemStopWords bool 13 | out string 14 | nilErr bool 15 | }{ 16 | {"aberration", "english", true, "aberr", true}, 17 | {"abruptness", "english", true, "abrupt", true}, 18 | {"absolute", "english", true, "absolut", true}, 19 | {"abated", "english", true, "abat", true}, 20 | {"acclivity", "english", true, "accliv", true}, 21 | {"accumulations", "english", true, "accumul", true}, 22 | {"agreement", "english", true, "agreement", true}, 23 | {"breed", "english", true, "breed", true}, 24 | {"ape", "english", true, "ape", true}, 25 | {"skating", "english", true, "skate", true}, 26 | {"fluently", "english", true, "fluentli", true}, 27 | {"ied", "english", true, "ie", true}, 28 | {"ies", "english", true, "ie", true}, 29 | // Change stemStopWords 30 | {"above", "english", true, "abov", true}, 31 | {"because", "english", false, "because", true}, 32 | // Give invalid language 33 | {"because", "klingon", false, "", false}, 34 | 35 | // Spanish tests, a few 36 | {"lejana", "spanish", true, "lejan", true}, 37 | {"preocuparse", "spanish", true, "preocup", true}, 38 | {"oposición", "spanish", true, "oposicion", true}, 39 | {"prisionero", "spanish", true, "prisioner", true}, 40 | {"ridiculización", "spanish", true, "ridiculiz", true}, 41 | {"cotidianeidad", "spanish", true, "cotidian", true}, 42 | {"portezuela", "spanish", true, "portezuel", true}, 43 | {"enriquecerse", "spanish", true, "enriquec", true}, 44 | {"campesinos", "spanish", true, "campesin", true}, 45 | {"desalojó", "spanish", true, "desaloj", true}, 46 | {"anticipadas", "spanish", true, "anticip", true}, 47 | {"goyesca", "spanish", true, "goyesc", true}, 48 | {"band", "spanish", true, "band", true}, 49 | } 50 | for _, testCase := range testCases { 51 | out, err := Stem(testCase.in, testCase.language, testCase.stemStopWords) 52 | nilErr := true 53 | if err != nil { 54 | nilErr = false 55 | } 56 | if out != testCase.out || nilErr != testCase.nilErr { 57 | t.Errorf("Stem(\"%v\", \"%v\", %v) = \"%v, %v\", but expected %v, %v", 58 | testCase.in, testCase.language, testCase.stemStopWords, 59 | out, nilErr, testCase.out, testCase.nilErr, 60 | ) 61 | } 62 | 63 | } 64 | } 65 | 66 | // Test if the VERSION constant is correctly formatted 67 | // 68 | func Test_Version(t *testing.T) { 69 | validVersionRegexp := regexp.MustCompile(`^v\d+\.\d+\.\d+$`) 70 | if validVersionRegexp.MatchString(VERSION) == false { 71 | t.Errorf("Invalid version specified: %v", VERSION) 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /english/step2.go: -------------------------------------------------------------------------------- 1 | package english 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | 
"github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 2 is the stemming of various endings found in 10 | // R1 including "al", "ness", and "li". 11 | func step2(w *snowballword.SnowballWord) bool { 12 | 13 | // Possible sufficies for this step, longest first. 14 | suffix := w.FirstSuffix( 15 | "ational", "fulness", "iveness", "ization", "ousness", 16 | "biliti", "lessli", "tional", "alism", "aliti", "ation", 17 | "entli", "fulli", "iviti", "ousli", "anci", "abli", 18 | "alli", "ator", "enci", "izer", "bli", "ogi", "li", 19 | ) 20 | suffixLength := utf8.RuneCountInString(suffix) 21 | 22 | // If it is not in R1, do nothing 23 | if suffix == "" || suffixLength > len(w.RS)-w.R1start { 24 | return false 25 | } 26 | 27 | // Handle special cases where we're not just going to 28 | // replace the suffix with another suffix: there are 29 | // other things we need to do. 30 | // 31 | switch suffix { 32 | 33 | case "li": 34 | 35 | // Delete if preceded by a valid li-ending. Valid li-endings inlude the 36 | // following charaters: cdeghkmnrt. (Note, the unicode code points for 37 | // these characters are, respectively, as follows: 38 | // 99 100 101 103 104 107 109 110 114 116) 39 | // 40 | rsLen := len(w.RS) 41 | if rsLen >= 3 { 42 | switch w.RS[rsLen-3] { 43 | case 99, 100, 101, 103, 104, 107, 109, 110, 114, 116: 44 | w.RemoveLastNRunes(suffixLength) 45 | return true 46 | } 47 | } 48 | return false 49 | 50 | case "ogi": 51 | 52 | // Replace by og if preceded by l. 53 | // (Note, the unicode code point for l is 108) 54 | // 55 | rsLen := len(w.RS) 56 | if rsLen >= 4 && w.RS[rsLen-4] == 108 { 57 | w.ReplaceSuffixRunes([]rune(suffix), []rune("og"), true) 58 | } 59 | return true 60 | } 61 | 62 | // Handle a suffix that was found, which is going 63 | // to be replaced with a different suffix. 64 | // 65 | var repl string 66 | switch suffix { 67 | case "tional": 68 | repl = "tion" 69 | case "enci": 70 | repl = "ence" 71 | case "anci": 72 | repl = "ance" 73 | case "abli": 74 | repl = "able" 75 | case "entli": 76 | repl = "ent" 77 | case "izer", "ization": 78 | repl = "ize" 79 | case "ational", "ation", "ator": 80 | repl = "ate" 81 | case "alism", "aliti", "alli": 82 | repl = "al" 83 | case "fulness": 84 | repl = "ful" 85 | case "ousli", "ousness": 86 | repl = "ous" 87 | case "iveness", "iviti": 88 | repl = "ive" 89 | case "biliti", "bli": 90 | repl = "ble" 91 | case "fulli": 92 | repl = "ful" 93 | case "lessli": 94 | repl = "less" 95 | } 96 | w.ReplaceSuffixRunes([]rune(suffix), []rune(repl), true) 97 | return true 98 | 99 | } 100 | -------------------------------------------------------------------------------- /english/step1b.go: -------------------------------------------------------------------------------- 1 | package english 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 1b is the normalization of various "ly" and "ed" sufficies. 
10 | func step1b(w *snowballword.SnowballWord) bool { 11 | 12 | suffix := w.FirstSuffix("eedly", "ingly", "edly", "ing", "eed", "ed") 13 | suffixLength := utf8.RuneCountInString(suffix) 14 | 15 | switch suffix { 16 | 17 | case "": 18 | // No suffix found 19 | return false 20 | 21 | case "eed", "eedly": 22 | 23 | // Replace by ee if in R1 24 | if suffixLength <= len(w.RS)-w.R1start { 25 | w.ReplaceSuffixRunes([]rune(suffix), []rune("ee"), true) 26 | } 27 | return true 28 | 29 | case "ed", "edly", "ing", "ingly": 30 | hasLowerVowel := false 31 | for i := 0; i < len(w.RS)-suffixLength; i++ { 32 | if isLowerVowel(w.RS[i]) { 33 | hasLowerVowel = true 34 | break 35 | } 36 | } 37 | if hasLowerVowel { 38 | 39 | // This case requires a two-step transformation and, due 40 | // to the way we've implemented the `ReplaceSuffix` method 41 | // here, information about R1 and R2 would be lost between 42 | // the two. Therefore, we need to keep track of the 43 | // original R1 & R2, so that we may set them below, at the 44 | // end of this case. 45 | // 46 | originalR1start := w.R1start 47 | originalR2start := w.R2start 48 | 49 | // Delete if the preceding word part contains a vowel 50 | w.RemoveLastNRunes(suffixLength) 51 | 52 | // ...and after the deletion... 53 | 54 | newSuffix := w.FirstSuffix("at", "bl", "iz", "bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt") 55 | switch newSuffix { 56 | 57 | case "": 58 | 59 | // If the word is short, add "e" 60 | if isShortWord(w) { 61 | 62 | // By definition, r1 and r2 are the empty string for 63 | // short words. 64 | w.RS = append(w.RS, []rune("e")...) 65 | w.R1start = len(w.RS) 66 | w.R2start = len(w.RS) 67 | return true 68 | } 69 | 70 | case "at", "bl", "iz": 71 | 72 | // If the word ends "at", "bl" or "iz" add "e" 73 | w.ReplaceSuffixRunes([]rune(newSuffix), []rune(newSuffix+"e"), true) 74 | 75 | case "bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt": 76 | 77 | // If the word ends with a double remove the last letter. 78 | // Note that, "double" does not include all possible doubles, 79 | // just those shown above. 80 | // 81 | w.RemoveLastNRunes(1) 82 | } 83 | 84 | // Because we did a double replacement, we need to fix 85 | // R1 and R2 manually. This is just becase of how we've 86 | // implemented the `ReplaceSuffix` method. 87 | // 88 | rsLen := len(w.RS) 89 | if originalR1start < rsLen { 90 | w.R1start = originalR1start 91 | } else { 92 | w.R1start = rsLen 93 | } 94 | if originalR2start < rsLen { 95 | w.R2start = originalR2start 96 | } else { 97 | w.R2start = rsLen 98 | } 99 | 100 | return true 101 | } 102 | 103 | } 104 | 105 | return false 106 | } 107 | -------------------------------------------------------------------------------- /romance/testing_helpers.go: -------------------------------------------------------------------------------- 1 | /* 2 | This file contains test runners that are common to 3 | the romance languages. 
4 | */ 5 | package romance 6 | 7 | import ( 8 | "fmt" 9 | "github.com/kljensen/snowball/snowballword" 10 | "testing" 11 | ) 12 | 13 | type stepFunc func(*snowballword.SnowballWord) bool 14 | type StepTestCase struct { 15 | WordIn string 16 | R1start int 17 | R2start int 18 | RVstart int 19 | Changed bool 20 | WordOut string 21 | R1startOut int 22 | R2startOut int 23 | RVstartOut int 24 | } 25 | 26 | func RunStepTest(t *testing.T, f stepFunc, tcs []StepTestCase) { 27 | for _, testCase := range tcs { 28 | w := snowballword.New(testCase.WordIn) 29 | w.R1start = testCase.R1start 30 | w.R2start = testCase.R2start 31 | w.RVstart = testCase.RVstart 32 | retval := f(w) 33 | if retval != testCase.Changed || w.String() != testCase.WordOut || w.R1start != testCase.R1startOut || w.R2start != testCase.R2startOut || w.RVstart != testCase.RVstartOut { 34 | t.Errorf("Expected %v -> \"{%v, %v, %v, %v, %v}\", but got \"{%v, %v, %v, %v, %v}\"", testCase.WordIn, testCase.WordOut, testCase.R1startOut, testCase.R2startOut, testCase.RVstartOut, testCase.Changed, w.String(), w.R1start, w.R2start, w.RVstart, retval) 35 | } 36 | if w.String() != testCase.WordOut { 37 | fmt.Printf("{\"%v\", %v, %v, %v, true, \"%v\", %v, %v, %v},\n", testCase.WordIn, testCase.R1start, testCase.R2start, testCase.RVstart, testCase.WordOut, w.R1start, w.R2start, w.RVstart) 38 | } 39 | } 40 | } 41 | 42 | // Test case for functions that take a word and return a bool. 43 | type WordBoolTestCase struct { 44 | Word string 45 | Result bool 46 | } 47 | 48 | // Test runner for functions that take a word and return a bool. 49 | // 50 | func RunWordBoolTest(t *testing.T, f func(string) bool, tcs []WordBoolTestCase) { 51 | for _, testCase := range tcs { 52 | result := f(testCase.Word) 53 | if result != testCase.Result { 54 | t.Errorf("Expected %v -> %v, but got %v", testCase.Word, testCase.Result, result) 55 | } 56 | } 57 | } 58 | 59 | // Test runner for functions that should be fed each rune of 60 | // a string and that return a bool for each rune. Usually used 61 | // to test functions that return true if a rune is a vowel, etc. 62 | // 63 | func RunRunewiseBoolTest(t *testing.T, f func(rune) bool, tcs []WordBoolTestCase) { 64 | for _, testCase := range tcs { 65 | for _, r := range testCase.Word { 66 | result := f(r) 67 | if result != testCase.Result { 68 | t.Errorf("Expected %v -> %v, but got %v", r, testCase.Result, result) 69 | } 70 | } 71 | } 72 | } 73 | 74 | type FindRegionsTestCase struct { 75 | Word string 76 | R1start int 77 | R2start int 78 | RVstart int 79 | } 80 | 81 | // Test isLowerVowel for things we know should be true 82 | // or false. 
83 | // 84 | func RunFindRegionsTest(t *testing.T, f func(*snowballword.SnowballWord) (int, int, int), tcs []FindRegionsTestCase) { 85 | for _, testCase := range tcs { 86 | w := snowballword.New(testCase.Word) 87 | r1start, r2start, rvstart := f(w) 88 | if r1start != testCase.R1start || r2start != testCase.R2start || rvstart != testCase.RVstart { 89 | t.Errorf("Expect \"%v\" -> %v, %v, %v, but got %v, %v, %v", 90 | testCase.Word, testCase.R1start, testCase.R2start, testCase.RVstart, 91 | r1start, r2start, rvstart, 92 | ) 93 | } 94 | 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /spanish/step1.go: -------------------------------------------------------------------------------- 1 | package spanish 2 | 3 | import ( 4 | "log" 5 | "unicode/utf8" 6 | 7 | "github.com/kljensen/snowball/snowballword" 8 | ) 9 | 10 | // Step 1 is the removal of standard suffixes 11 | func step1(word *snowballword.SnowballWord) bool { 12 | 13 | // Possible suffixes, longest first 14 | suffix := word.FirstSuffix( 15 | "amientos", "imientos", "aciones", "amiento", "imiento", 16 | "uciones", "logías", "idades", "encias", "ancias", "amente", 17 | "adores", "adoras", "ución", "mente", "logía", "istas", 18 | "ismos", "ibles", "encia", "anzas", "antes", "ancia", 19 | "adora", "ación", "ables", "osos", "osas", "ivos", "ivas", 20 | "ista", "ismo", "idad", "icos", "icas", "ible", "anza", 21 | "ante", "ador", "able", "oso", "osa", "ivo", "iva", 22 | "ico", "ica", 23 | ) 24 | suffixLength := utf8.RuneCountInString(suffix) 25 | 26 | isInR1 := (word.R1start <= len(word.RS)-suffixLength) 27 | isInR2 := (word.R2start <= len(word.RS)-suffixLength) 28 | 29 | // Deal with special cases first. All of these will 30 | // return if they are hit. 31 | // 32 | switch suffix { 33 | case "": 34 | 35 | // Nothing to do 36 | return false 37 | 38 | case "amente": 39 | 40 | if isInR1 { 41 | // Delete if in R1 42 | word.RemoveLastNRunes(suffixLength) 43 | 44 | // if preceded by iv, delete if in R2 (and if further preceded by at, 45 | // delete if in R2), otherwise, 46 | // if preceded by os, ic or ad, delete if in R2 47 | newSuffix := word.RemoveFirstSuffixIfIn(word.R2start, "iv", "os", "ic", "ad") 48 | if newSuffix == "iv" { 49 | word.RemoveFirstSuffixIfIn(word.R2start, "at") 50 | } 51 | return true 52 | } 53 | return false 54 | } 55 | 56 | // All the following cases require the found suffix 57 | // to be in R2. 58 | if isInR2 == false { 59 | return false 60 | } 61 | 62 | // Compound replacement cases. All these cases return 63 | // if they are hit. 64 | // 65 | compoundReplacement := func(otherSuffixes ...string) bool { 66 | word.RemoveLastNRunes(suffixLength) 67 | word.RemoveFirstSuffixIfIn(word.R2start, otherSuffixes...) 68 | return true 69 | } 70 | 71 | switch suffix { 72 | case "adora", "ador", "ación", "adoras", "adores", "aciones", "ante", "antes", "ancia", "ancias": 73 | return compoundReplacement("ic") 74 | case "mente": 75 | return compoundReplacement("ante", "able", "ible") 76 | case "idad", "idades": 77 | return compoundReplacement("abil", "ic", "iv") 78 | case "iva", "ivo", "ivas", "ivos": 79 | return compoundReplacement("at") 80 | } 81 | 82 | // Simple replacement & deletion cases are all that remain. 
83 | // 84 | simpleReplacement := func(repl string) bool { 85 | word.ReplaceSuffixRunes([]rune(suffix), []rune(repl), true) 86 | return true 87 | } 88 | switch suffix { 89 | case "logía", "logías": 90 | return simpleReplacement("log") 91 | case "ución", "uciones": 92 | return simpleReplacement("u") 93 | case "encia", "encias": 94 | return simpleReplacement("ente") 95 | case "anza", "anzas", "ico", "ica", "icos", "icas", 96 | "ismo", "ismos", "able", "ables", "ible", "ibles", 97 | "ista", "istas", "oso", "osa", "osos", "osas", 98 | "amiento", "amientos", "imiento", "imientos": 99 | word.RemoveLastNRunes(suffixLength) 100 | return true 101 | } 102 | 103 | log.Panicln("Unhandled suffix:", suffix) 104 | return false 105 | } 106 | -------------------------------------------------------------------------------- /norwegian/norwegian_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Herein lie all the tests of the norwegian snowball stemmer. 3 | TODO 4 | */ 5 | package norwegian 6 | 7 | import ( 8 | "testing" 9 | 10 | "github.com/kljensen/snowball/snowballword" 11 | ) 12 | 13 | // Test stopWords for things we know should be true 14 | // or false. 15 | // 16 | func Test_stopWords(t *testing.T) { 17 | 18 | // Test true 19 | knownTrueStopwords := [...]string{ 20 | "og", 21 | "for", 22 | "mye", 23 | "ikke", 24 | } 25 | for _, word := range knownTrueStopwords { 26 | if IsStopWord(word) == false { 27 | t.Errorf("Expected %v, to be in stopWords", word) 28 | } 29 | } 30 | 31 | // Test false 32 | knownFalseStopwords := [...]string{ 33 | "truck", 34 | "deoxyribonucleic", 35 | "farse", 36 | "bullschnizzle", 37 | } 38 | for _, word := range knownFalseStopwords { 39 | if IsStopWord(word) == true { 40 | t.Errorf("Expected %v, to be in stopWords", word) 41 | } 42 | } 43 | } 44 | 45 | func Test_r1(t *testing.T) { 46 | var wordTests = []struct { 47 | word string 48 | r1 string 49 | }{ 50 | {"åpnet", "et"}, 51 | {"åpner", "er"}, 52 | {"hvems", "s"}, 53 | {"ørene", "ne"}, 54 | // Special cases below 55 | } 56 | for _, testCase := range wordTests { 57 | w := snowballword.New(testCase.word) 58 | r1start := r1(w) 59 | w.R1start = r1start 60 | if w.R1String() != testCase.r1 { 61 | t.Errorf("Expected \"{%v}\", but got \"{%v}\"", testCase.r1, w.R1String()) 62 | } 63 | } 64 | } 65 | 66 | type stepFunc func(*snowballword.SnowballWord) bool 67 | type stepTest struct { 68 | wordIn string 69 | r1start int 70 | wordOut string 71 | r1out string 72 | } 73 | 74 | func runStepTest(t *testing.T, f stepFunc, tcs []stepTest) { 75 | for _, testCase := range tcs { 76 | w := snowballword.New(testCase.wordIn) 77 | w.R1start = testCase.r1start 78 | _ = f(w) 79 | if w.String() != testCase.wordOut || w.R1String() != testCase.r1out { 80 | t.Errorf("Expected \"{%v, %v}\", but got \"{%v, %v}\"", testCase.wordOut, testCase.r1out, w.String(), w.R1String()) 81 | } 82 | } 83 | } 84 | 85 | func Test_step1(t *testing.T) { 86 | var testCases = []stepTest{ 87 | {"høytidlighetene", 3, "høytidlig", "tidlig"}, 88 | {"øyets", 3, "øyet", "t"}, 89 | {"ørets", 3, "øret", "t"}, 90 | } 91 | runStepTest(t, step1, testCases) 92 | } 93 | 94 | func Test_step2(t *testing.T) { 95 | var testCases = []stepTest{} 96 | runStepTest(t, step2, testCases) 97 | } 98 | 99 | func Test_step3(t *testing.T) { 100 | var testCases = []stepTest{ 101 | {"årlig", 3, "årl", ""}, 102 | } 103 | runStepTest(t, step3, testCases) 104 | } 105 | 106 | func Test_Stem(t *testing.T) { 107 | var testCases = []struct { 108 | in string 109 | 
stemStopWords bool 110 | out string 111 | }{ 112 | {"havnedistrikt", true, "havnedistrikt"}, 113 | {"havnedistriktene", true, "havnedistrikt"}, 114 | {"havnedistrikter", true, "havnedistrikt"}, 115 | {"havnedistriktets", true, "havnedistrikt"}, 116 | {"havnedistriktets", true, "havnedistrikt"}, 117 | {"opp", true, "opp"}, 118 | {"oppad", true, "oppad"}, 119 | {"opning", true, "opning"}, 120 | {"havneinteresser", true, "havneinteress"}, 121 | {"oppbygginga", true, "oppbygging"}, 122 | {"oppbyggingen", true, "oppbygging"}, 123 | {"oppdaterte", true, "oppdater"}, 124 | {"tredjepersons", true, "tredjeperson"}, 125 | {"uspesisfisert", true, "uspesisfiser"}, 126 | {"voks", true, "voks"}, 127 | } 128 | for _, tc := range testCases { 129 | stemmed := Stem(tc.in, tc.stemStopWords) 130 | if stemmed != tc.out { 131 | t.Errorf("Expected %v to stem to %v, but got %v", tc.in, tc.out, stemmed) 132 | } 133 | } 134 | 135 | } 136 | -------------------------------------------------------------------------------- /swedish/swedish_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Herein lie all the tests of the Swedish snowball stemmer. 3 | 4 | */ 5 | package swedish 6 | 7 | import ( 8 | "testing" 9 | 10 | "github.com/kljensen/snowball/snowballword" 11 | ) 12 | 13 | // Test stopWords for things we know should be true 14 | // or false. 15 | // 16 | func Test_stopWords(t *testing.T) { 17 | 18 | // Test true 19 | knownTrueStopwords := [...]string{ 20 | "och", 21 | "för", 22 | "att", 23 | "inte", 24 | } 25 | for _, word := range knownTrueStopwords { 26 | if IsStopWord(word) == false { 27 | t.Errorf("Expected %v, to be in stopWords", word) 28 | } 29 | } 30 | 31 | // Test false 32 | knownFalseStopwords := [...]string{ 33 | "truck", 34 | "deoxyribonucleic", 35 | "farse", 36 | "bullschnizzle", 37 | } 38 | for _, word := range knownFalseStopwords { 39 | if IsStopWord(word) == true { 40 | t.Errorf("Expected %v, to be in stopWords", word) 41 | } 42 | } 43 | } 44 | 45 | func Test_r1(t *testing.T) { 46 | var wordTests = []struct { 47 | word string 48 | r1 string 49 | }{ 50 | {"öppnade", "nade"}, 51 | {"örnar", "ar"}, 52 | {"vems", "s"}, 53 | {"årorna", "rna"}, 54 | // Special cases below 55 | } 56 | for _, testCase := range wordTests { 57 | w := snowballword.New(testCase.word) 58 | r1start := r1(w) 59 | w.R1start = r1start 60 | if w.R1String() != testCase.r1 { 61 | t.Errorf("Expected \"{%v}\", but got \"{%v}\"", testCase.r1, w.R1String()) 62 | } 63 | } 64 | } 65 | 66 | type stepFunc func(*snowballword.SnowballWord) bool 67 | type stepTest struct { 68 | wordIn string 69 | r1start int 70 | wordOut string 71 | r1out string 72 | } 73 | 74 | func runStepTest(t *testing.T, f stepFunc, tcs []stepTest) { 75 | for _, testCase := range tcs { 76 | w := snowballword.New(testCase.wordIn) 77 | w.R1start = testCase.r1start 78 | _ = f(w) 79 | if w.String() != testCase.wordOut || w.R1String() != testCase.r1out { 80 | t.Errorf("Expected \"{%v, %v}\", but got \"{%v, %v}\"", testCase.wordOut, testCase.r1out, w.String(), w.R1String()) 81 | } 82 | } 83 | } 84 | 85 | func Test_step1(t *testing.T) { 86 | var testCases = []stepTest{ 87 | {"högtidligheterna", 3, "högtidlig", "tidlig"}, 88 | {"ögats", 3, "ögat", "t"}, 89 | {"ärade", 3, "ärad", "d"}, 90 | } 91 | runStepTest(t, step1, testCases) 92 | } 93 | 94 | func Test_step2(t *testing.T) { 95 | var testCases = []stepTest{} 96 | runStepTest(t, step2, testCases) 97 | } 98 | 99 | func Test_step3(t *testing.T) { 100 | var testCases = []stepTest{ 101 | 
{"årlig", 3, "årl", ""}, 102 | } 103 | runStepTest(t, step3, testCases) 104 | } 105 | 106 | func Test_Stem(t *testing.T) { 107 | var testCases = []struct { 108 | in string 109 | stemStopWords bool 110 | out string 111 | }{ 112 | {"jaktkarlar", true, "jaktkarl"}, 113 | {"jaktkarlarne", true, "jaktkarl"}, 114 | {"klokaste", true, "klok"}, 115 | {"klokheten", true, "klok"}, 116 | {"friskt", true, "frisk"}, 117 | {"fröken", true, "frök"}, 118 | {"kloliknande", true, "klolikn"}, 119 | {"hopplöst", true, "hopplös"}, 120 | {"hopplöshet", true, "hopplös"}, 121 | {"årorna", true, "årorn"}, 122 | // {"skating", true, "skate"}, 123 | // {"fluently", true, "fluentli"}, 124 | // {"ied", true, "ie"}, 125 | // {"ies", true, "ie"}, 126 | // Stop words 127 | {"vilkas", true, "vilk"}, 128 | {"vilkas", false, "vilkas"}, 129 | // {"above", true, "abov"}, 130 | // {"above", false, "above"}, 131 | } 132 | for _, tc := range testCases { 133 | stemmed := Stem(tc.in, tc.stemStopWords) 134 | if stemmed != tc.out { 135 | t.Errorf("Expected %v to stem to %v, but got %v", tc.in, tc.out, stemmed) 136 | } 137 | } 138 | 139 | } 140 | -------------------------------------------------------------------------------- /french/common.go: -------------------------------------------------------------------------------- 1 | package french 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/romance" 7 | "github.com/kljensen/snowball/snowballword" 8 | ) 9 | 10 | // Return `true` if the input `word` is a French stop word. 11 | func IsStopWord(word string) bool { 12 | switch word { 13 | case "au", "aux", "avec", "ce", "ces", "dans", "de", "des", "du", 14 | "elle", "en", "et", "eux", "il", "je", "la", "le", "leur", 15 | "lui", "ma", "mais", "me", "même", "mes", "moi", "mon", "ne", 16 | "nos", "notre", "nous", "on", "ou", "par", "pas", "pour", "qu", 17 | "que", "qui", "sa", "se", "ses", "son", "sur", "ta", "te", 18 | "tes", "toi", "ton", "tu", "un", "une", "vos", "votre", "vous", 19 | "c", "d", "j", "l", "à", "m", "n", "s", "t", "y", "été", 20 | "étée", "étées", "étés", "étant", "étante", "étants", "étantes", 21 | "suis", "es", "est", "sommes", "êtes", "sont", "serai", 22 | "seras", "sera", "serons", "serez", "seront", "serais", 23 | "serait", "serions", "seriez", "seraient", "étais", "était", 24 | "étions", "étiez", "étaient", "fus", "fut", "fûmes", "fûtes", 25 | "furent", "sois", "soit", "soyons", "soyez", "soient", "fusse", 26 | "fusses", "fût", "fussions", "fussiez", "fussent", "ayant", 27 | "ayante", "ayantes", "ayants", "eu", "eue", "eues", "eus", 28 | "ai", "as", "avons", "avez", "ont", "aurai", "auras", "aura", 29 | "aurons", "aurez", "auront", "aurais", "aurait", "aurions", 30 | "auriez", "auraient", "avais", "avait", "avions", "aviez", 31 | "avaient", "eut", "eûmes", "eûtes", "eurent", "aie", "aies", 32 | "ait", "ayons", "ayez", "aient", "eusse", "eusses", "eût", 33 | "eussions", "eussiez", "eussent": 34 | return true 35 | } 36 | return false 37 | } 38 | 39 | // Checks if a rune is a lowercase French vowel. 40 | func isLowerVowel(r rune) bool { 41 | 42 | // The French vowels are "aeiouyâàëéêèïîôûù", which 43 | // are referenced by their unicode code points 44 | // in the switch statement below. 45 | switch r { 46 | case 97, 101, 105, 111, 117, 121, 226, 224, 235, 233, 234, 232, 239, 238, 244, 251, 249: 47 | return true 48 | } 49 | return false 50 | } 51 | 52 | // Capitalize Y, I, and U runes that are acting as consanants. 
53 | // Put into upper case "u" or "i" preceded and followed by a 54 | // vowel, and "y" preceded or followed by a vowel. "u" after q is 55 | // also put into upper case. 56 | func capitalizeYUI(word *snowballword.SnowballWord) { 57 | 58 | // Keep track of vowels that we see 59 | vowelPreviously := false 60 | 61 | // Peak ahead to see if the next rune is a vowel 62 | vowelNext := func(j int) bool { 63 | return (j+1 < len(word.RS) && isLowerVowel(word.RS[j+1])) 64 | } 65 | 66 | // Look at all runes 67 | for i := 0; i < len(word.RS); i++ { 68 | 69 | // Nothing to do for non-vowels 70 | if isLowerVowel(word.RS[i]) == false { 71 | vowelPreviously = false 72 | continue 73 | } 74 | 75 | vowelHere := true 76 | 77 | switch word.RS[i] { 78 | case 121: // y 79 | 80 | // Is this "y" preceded OR followed by a vowel? 81 | if vowelPreviously || vowelNext(i) { 82 | word.RS[i] = 89 // Y 83 | vowelHere = false 84 | } 85 | 86 | case 117: // u 87 | 88 | // Is this "u" is flanked by vowels OR preceded by a "q"? 89 | if (vowelPreviously && vowelNext(i)) || (i >= 1 && word.RS[i-1] == 113) { 90 | word.RS[i] = 85 // U 91 | vowelHere = false 92 | } 93 | 94 | case 105: // i 95 | 96 | // Is this "i" is flanked by vowels? 97 | if vowelPreviously && vowelNext(i) { 98 | word.RS[i] = 73 // I 99 | vowelHere = false 100 | } 101 | } 102 | vowelPreviously = vowelHere 103 | } 104 | } 105 | 106 | // Find the starting point of the regions R1, R2, & RV 107 | func findRegions(word *snowballword.SnowballWord) (r1start, r2start, rvstart int) { 108 | 109 | // R1 & R2 are defined in the standard manner. 110 | r1start = romance.VnvSuffix(word, isLowerVowel, 0) 111 | r2start = romance.VnvSuffix(word, isLowerVowel, r1start) 112 | 113 | // Set RV, by default, as empty. 114 | rvstart = len(word.RS) 115 | 116 | // Handle the three special cases: "par", "col", & "tap" 117 | // 118 | prefix := word.FirstPrefix("par", "col", "tap") 119 | if prefix != "" { 120 | rvstart = utf8.RuneCountInString(prefix) 121 | return 122 | } 123 | 124 | // If the word begins with two vowels, RV is the region after the third letter 125 | if len(word.RS) >= 3 && isLowerVowel(word.RS[0]) && isLowerVowel(word.RS[1]) { 126 | rvstart = 3 127 | return 128 | } 129 | 130 | // Otherwise the region after the first vowel not at the beginning of the word. 131 | for i := 1; i < len(word.RS); i++ { 132 | if isLowerVowel(word.RS[i]) { 133 | rvstart = i + 1 134 | return 135 | } 136 | } 137 | 138 | return 139 | } 140 | -------------------------------------------------------------------------------- /hungarian/stem_test.go: -------------------------------------------------------------------------------- 1 | package hungarian 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | 7 | "github.com/kljensen/snowball/snowballword" 8 | ) 9 | 10 | func TestStemSentence(t *testing.T) { 11 | var pairs [][2]string 12 | var got []string 13 | for k, want := range map[string][]string{ 14 | `Tisztelettel az alábbi bankszámlára szeretném kérni az utalás. Raiffeisen 15 | Bank:999999999999999999999999.Tisztelettel:Horváth Péter 16 | 17 | Az alábbi email a KöBE hálózatán kívüli forrásból érkezett, kérjük, legyen óvatos a beágyazott linkekkel és csatolmányokkal! 
18 | `: []string{ 19 | "tisztel", "az", "alább", "bankszáml", "szeretne", "kérn", "az", "utalás", 20 | "raiffeis", "ba", "999999999999999999999999", "tisztel", "horváth", "péter", 21 | "az", "alább", "email", "a", "kö", "hálózat", "kívül", "forrás", "érkezet", 22 | "kér", "legyen", "óvatos", "a", "beágyazot", "link", "és", "csatolmány", 23 | }, 24 | } { 25 | pairs = StemSentence(pairs[:0], k) 26 | got = got[:0] 27 | for _, p := range pairs { 28 | got = append(got, p[1]) 29 | } 30 | if !reflect.DeepEqual(got, want) { 31 | t.Errorf("%q: got %q, wanted %q", k, got, want) 32 | } 33 | } 34 | } 35 | func TestStem(t *testing.T) { 36 | for k, want := range map[string]string{ 37 | "fiaiéi": "fi", 38 | "megkelkáposztásíthatatlanságoskodásaitokért": "megkelkáposztásíthatatlanságoskodás", 39 | } { 40 | if got := Stem(k, false); got != want { 41 | t.Errorf("%q: got %q, wanted %q", k, got, want) 42 | } 43 | } 44 | } 45 | 46 | func TestStep1(t *testing.T) { 47 | for k, want := range map[string]string{ 48 | "taccsal": "tacs", 49 | "téttel": "tét", 50 | "paddal": "pad", 51 | "padló": "padló", 52 | } { 53 | w := snowballword.New(k) 54 | preprocess(w) 55 | step1(w) 56 | if got := string(w.RS); got != want { 57 | t.Errorf("%q: got %q, wanted %q", k, got, want) 58 | } 59 | } 60 | } 61 | 62 | func TestStep2(t *testing.T) { 63 | for k, want := range map[string]string{ 64 | "padonként": "pad", 65 | "tétről": "tét", 66 | "palából": "pala", 67 | } { 68 | w := snowballword.New(k) 69 | preprocess(w) 70 | step2(w) 71 | if got := string(w.RS); got != want { 72 | t.Errorf("%q: got %q, wanted %q", k, got, want) 73 | } 74 | } 75 | } 76 | func TestStep3(t *testing.T) { 77 | for k, want := range map[string]string{ 78 | "banánként": "bana", 79 | "bányánként": "bánya", 80 | "lepkén": "lepke", 81 | } { 82 | w := snowballword.New(k) 83 | preprocess(w) 84 | step3(w) 85 | if got := string(w.RS); got != want { 86 | t.Errorf("%q: got %q, wanted %q", k, got, want) 87 | } 88 | } 89 | } 90 | func TestStep4(t *testing.T) { 91 | for k, want := range map[string]string{ 92 | "házastul": "ház", 93 | "képestül": "kép", 94 | "akóstul": "akó", 95 | "ruhástul": "ruha", 96 | "vízeséstül": "vízese", 97 | } { 98 | w := snowballword.New(k) 99 | preprocess(w) 100 | step4(w) 101 | if got := string(w.RS); got != want { 102 | t.Errorf("%q: got %q, wanted %q", k, got, want) 103 | } 104 | } 105 | } 106 | func TestStep5(t *testing.T) { 107 | for k, want := range map[string]string{ 108 | "fiaié": "fiaié", 109 | "blatté": "blat", 110 | } { 111 | w := snowballword.New(k) 112 | preprocess(w) 113 | step5(w) 114 | if got := string(w.RS); got != want { 115 | t.Errorf("%q: got %q, wanted %q", k, got, want) 116 | } 117 | } 118 | } 119 | func TestStep6(t *testing.T) { 120 | for k, want := range map[string]string{ 121 | "fiatoké": "fiat", 122 | "fiáéi": "fia", 123 | } { 124 | w := snowballword.New(k) 125 | preprocess(w) 126 | step6(w) 127 | if got := string(w.RS); got != want { 128 | t.Errorf("%q: got %q, wanted %q", k, got, want) 129 | } 130 | } 131 | } 132 | func TestStep7(t *testing.T) { 133 | for k, want := range map[string]string{ 134 | "mamájuk": "mama", 135 | "fenéjük": "fene", 136 | "bánatod": "bánat", 137 | } { 138 | w := snowballword.New(k) 139 | preprocess(w) 140 | step7(w) 141 | if got := string(w.RS); got != want { 142 | t.Errorf("%q: got %q, wanted %q", k, got, want) 143 | } 144 | } 145 | } 146 | func TestStep8(t *testing.T) { 147 | for k, want := range map[string]string{ 148 | "mamáid": "mama", 149 | "fenéitek": "fene", 150 | "bánatai": "bánat", 151 
| } { 152 | w := snowballword.New(k) 153 | preprocess(w) 154 | step8(w) 155 | if got := string(w.RS); got != want { 156 | t.Errorf("%q: got %q, wanted %q", k, got, want) 157 | } 158 | } 159 | } 160 | func TestStep9(t *testing.T) { 161 | for k, want := range map[string]string{ 162 | "mamák": "mama", 163 | "fenék": "fene", 164 | "bánatok": "bánat", 165 | } { 166 | w := snowballword.New(k) 167 | preprocess(w) 168 | step9(w) 169 | if got := string(w.RS); got != want { 170 | t.Errorf("%q: got %q, wanted %q", k, got, want) 171 | } 172 | } 173 | } 174 | -------------------------------------------------------------------------------- /russian/step1.go: -------------------------------------------------------------------------------- 1 | package russian 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | // "log" 8 | ) 9 | 10 | // Step 1 is the removal of standard suffixes, all of which must 11 | // occur in RV. 12 | // 13 | // Search for a PERFECTIVE GERUND ending. If one is found remove it, and 14 | // that is then the end of step 1. Otherwise try and remove a REFLEXIVE 15 | // ending, and then search in turn for (1) an ADJECTIVAL, (2) a VERB or 16 | // (3) a NOUN ending. As soon as one of the endings (1) to (3) is found 17 | // remove it, and terminate step 1. 18 | func step1(word *snowballword.SnowballWord) bool { 19 | 20 | // `stop` will be used to signal early termination 21 | var stop bool 22 | 23 | // Search for a PERFECTIVE GERUND ending 24 | stop = removePerfectiveGerundEnding(word) 25 | if stop { 26 | return true 27 | } 28 | 29 | // Next remove reflexive endings 30 | word.RemoveFirstSuffixIn(word.RVstart, "ся", "сь") 31 | 32 | // Next remove adjectival endings 33 | stop = removeAdjectivalEnding(word) 34 | if stop { 35 | return true 36 | } 37 | 38 | // Next remove verb endings 39 | stop = removeVerbEnding(word) 40 | if stop { 41 | return true 42 | } 43 | 44 | // Next remove noun endings 45 | suffix := word.RemoveFirstSuffixIn(word.RVstart, 46 | "иями", "ями", "иях", "иям", "ием", "ией", "ами", "ях", 47 | "ям", "ья", "ью", "ье", "ом", "ой", "ов", "ия", "ию", 48 | "ий", "ии", "ие", "ем", "ей", "еи", "ев", "ах", "ам", 49 | "я", "ю", "ь", "ы", "у", "о", "й", "и", "е", "а", 50 | ) 51 | if suffix != "" { 52 | return true 53 | } 54 | 55 | return false 56 | } 57 | 58 | // Remove perfective gerund endings and return true if one was removed. 59 | func removePerfectiveGerundEnding(word *snowballword.SnowballWord) bool { 60 | suffix := word.FirstSuffixIn(word.RVstart, len(word.RS), 61 | "ившись", "ывшись", "вшись", "ивши", "ывши", "вши", "ив", "ыв", "в", 62 | ) 63 | suffixLength := utf8.RuneCountInString(suffix) 64 | switch suffix { 65 | case "в", "вши", "вшись": 66 | 67 | // These are "Group 1" perfective gerund endings. 68 | // Group 1 endings must follow а (a) or я (ia) in RV. 69 | if precededByARinRV(word, suffixLength) == false { 70 | suffix = "" 71 | } 72 | 73 | } 74 | 75 | if suffix != "" { 76 | word.RemoveLastNRunes(suffixLength) 77 | return true 78 | } 79 | return false 80 | } 81 | 82 | // Remove adjectival endings and return true if one was removed. 83 | func removeAdjectivalEnding(word *snowballword.SnowballWord) bool { 84 | 85 | // Remove adjectival endings. Start by looking for 86 | // an adjective ending. 
87 | // 88 | suffix := word.RemoveFirstSuffixIn(word.RVstart, 89 | "ими", "ыми", "его", "ого", "ему", "ому", "ее", "ие", 90 | "ые", "ое", "ей", "ий", "ый", "ой", "ем", "им", "ым", 91 | "ом", "их", "ых", "ую", "юю", "ая", "яя", "ою", "ею", 92 | ) 93 | if suffix != "" { 94 | 95 | // We found an adjective ending. Remove optional participle endings. 96 | // 97 | newSuffix := word.FirstSuffixIn(word.RVstart, len(word.RS), 98 | "ивш", "ывш", "ующ", 99 | "ем", "нн", "вш", "ющ", "щ", 100 | ) 101 | suffixLength := utf8.RuneCountInString(newSuffix) 102 | 103 | switch newSuffix { 104 | case "ем", "нн", "вш", "ющ", "щ": 105 | 106 | // These are "Group 1" participle endings. 107 | // Group 1 endings must follow а (a) or я (ia) in RV. 108 | if precededByARinRV(word, suffixLength) == false { 109 | newSuffix = "" 110 | } 111 | } 112 | 113 | if newSuffix != "" { 114 | word.RemoveLastNRunes(suffixLength) 115 | } 116 | return true 117 | } 118 | return false 119 | } 120 | 121 | // Remove verb endings and return true if one was removed. 122 | func removeVerbEnding(word *snowballword.SnowballWord) bool { 123 | suffix := word.FirstSuffixIn(word.RVstart, len(word.RS), 124 | "уйте", "ейте", "ыть", "ыло", "ыли", "ыла", "уют", "ует", 125 | "нно", "йте", "ишь", "ить", "ите", "ило", "или", "ила", 126 | "ешь", "ете", "ены", "ено", "ена", "ят", "ют", "ыт", "ым", 127 | "ыл", "ую", "уй", "ть", "ны", "но", "на", "ло", "ли", "ла", 128 | "ит", "им", "ил", "ет", "ен", "ем", "ей", "ю", "н", "л", "й", 129 | ) 130 | suffixLength := utf8.RuneCountInString(suffix) 131 | 132 | switch suffix { 133 | case "ла", "на", "ете", "йте", "ли", "й", "л", "ем", "н", 134 | "ло", "но", "ет", "ют", "ны", "ть", "ешь", "нно": 135 | 136 | // These are "Group 1" verb endings. 137 | // Group 1 endings must follow а (a) or я (ia) in RV. 138 | if precededByARinRV(word, suffixLength) == false { 139 | suffix = "" 140 | } 141 | 142 | } 143 | 144 | if suffix != "" { 145 | word.RemoveLastNRunes(suffixLength) 146 | return true 147 | } 148 | return false 149 | } 150 | 151 | // There are multiple classes of endings that must be 152 | // preceded by а (a) or я (ia) in RV in order to be removed. 153 | func precededByARinRV(word *snowballword.SnowballWord, suffixLen int) bool { 154 | idx := len(word.RS) - suffixLen - 1 155 | if idx >= word.RVstart && (word.RS[idx] == 'а' || word.RS[idx] == 'я') { 156 | return true 157 | } 158 | return false 159 | } 160 | -------------------------------------------------------------------------------- /snowballword/snowballword_test.go: -------------------------------------------------------------------------------- 1 | package snowballword 2 | 3 | import "testing" 4 | 5 | func Test_New(t *testing.T) { 6 | w := New("kyle") 7 | if w.String() != "kyle" { 8 | t.Errorf("Expected \"%v\" but got \"%v\"", "kyle", w.String()) 9 | } 10 | } 11 | 12 | func Test_FirstPrefix(t *testing.T) { 13 | var testCases = []struct { 14 | input string 15 | prefixes []string 16 | prefix string 17 | }{ 18 | {"firehose", []string{"x", "fi"}, "fi"}, 19 | {"firehose", []string{"x", "fix", "fi"}, "fi"}, 20 | {"firehose", []string{"x", "fi"}, "fi"}, 21 | {"firehose", []string{"fire", "fi"}, "fire"}, 22 | {"firehose", []string{"fixre", "xfi"}, ""}, 23 | {"firehose", []string{"firehosex"}, ""}, 24 | } 25 | for _, tc := range testCases { 26 | w := New(tc.input) 27 | prefix := w.FirstPrefix(tc.prefixes...) 
28 | if prefix != tc.prefix { 29 | t.Errorf("Expected \"{%v}\" but got \"{%v}\"", tc.prefix, prefix) 30 | } 31 | } 32 | } 33 | 34 | func Test_FirstSuffix(t *testing.T) { 35 | var testCases = []struct { 36 | input string 37 | suffixes []string 38 | suffix string 39 | }{ 40 | {"firehose", []string{"x", "fi"}, ""}, 41 | {"firehose", []string{"x", "hose", "fi"}, "hose"}, 42 | {"firehose", []string{"x", "se"}, "se"}, 43 | {"firehose", []string{"fire", "xfirehose"}, ""}, 44 | } 45 | for _, tc := range testCases { 46 | w := New(tc.input) 47 | suffix := w.FirstSuffix(tc.suffixes...) 48 | if suffix != tc.suffix { 49 | t.Errorf("Expected \"{%v}\" but got \"{%v}\"", tc.suffix, suffix) 50 | } 51 | } 52 | } 53 | func Test_FirstSuffixIfIn(t *testing.T) { 54 | var testCases = []struct { 55 | input string 56 | startPos int 57 | endPos int 58 | suffixes []string 59 | suffix string 60 | }{ 61 | {"firehose", 0, 6, []string{"x", "fi"}, ""}, 62 | {"firehose", 0, 6, []string{"x", "eho", "fi"}, "eho"}, 63 | {"firehose", 0, 4, []string{"re", "se"}, "re"}, 64 | {"firehose", 0, 4, []string{"se", "xfirehose"}, ""}, 65 | {"firehose", 0, 4, []string{"fire", "xxx"}, "fire"}, 66 | {"firehose", 1, 5, []string{"fire", "xxx"}, ""}, 67 | // The follwoing tests shows how FirstSuffixIfIn works. It 68 | // first checks for the matching suffix and only then checks 69 | // to see if it is starts at or before startPos. This 70 | // is the behavior desired for many stemming steps but 71 | // is somewhat counterintuitive. 72 | {"firehose", 1, 5, []string{"fireh", "ireh", "h"}, ""}, 73 | {"firehose", 1, 5, []string{"ireh", "fireh", "h"}, "ireh"}, 74 | } 75 | for _, tc := range testCases { 76 | w := New(tc.input) 77 | suffix := w.FirstSuffixIfIn(tc.startPos, tc.endPos, tc.suffixes...) 78 | if suffix != tc.suffix { 79 | t.Errorf("Expected \"{%v}\" but got \"{%v}\"", tc.suffix, suffix) 80 | } 81 | } 82 | } 83 | 84 | func Test_ReplaceSuffixRunes(t *testing.T) { 85 | var testCases = []struct { 86 | input string 87 | suffix string 88 | repl string 89 | force bool 90 | output string 91 | }{ 92 | {"tonydanza", "danza", "yyy", true, "tonyyyy"}, 93 | {"tonydanza", "danza", "yyy", false, "tonyyyy"}, 94 | {"tonydanza", "danzad", "yyy", false, "tonydanza"}, 95 | {"tonydanza", "danzad", "yyy", true, "tonyyy"}, 96 | } 97 | for _, tc := range testCases { 98 | w := New(tc.input) 99 | w.ReplaceSuffixRunes([]rune(tc.suffix), []rune(tc.repl), tc.force) 100 | if w.String() != tc.output { 101 | t.Errorf("Expected %v -> \"%v\", but got \"%v\"", tc.input, tc.output, w.String()) 102 | } 103 | } 104 | 105 | } 106 | 107 | func Test_ReplaceSuffix(t *testing.T) { 108 | var testCases = []struct { 109 | input string 110 | r1start int 111 | r2start int 112 | suffix string 113 | repl string 114 | output string 115 | outputR1String string 116 | outputR2String string 117 | }{ 118 | {"accliviti", 2, 6, "iviti", "ive", "acclive", "clive", "e"}, 119 | {"skating", 4, 6, "ing", "e", "skate", "e", ""}, 120 | {"convirtiéndo", 3, 6, "iéndo", "iendo", "convirtiendo", "virtiendo", "tiendo"}, 121 | } 122 | for _, tc := range testCases { 123 | w := New(tc.input) 124 | w.R1start = tc.r1start 125 | w.R2start = tc.r2start 126 | w.ReplaceSuffix(tc.suffix, tc.repl, true) 127 | if w.String() != tc.output || w.R1String() != tc.outputR1String || w.R2String() != tc.outputR2String { 128 | t.Errorf("Expected %v -> \"{%v, %v, %v}\" but got \"{%v, %v, %v}\"", tc.input, tc.output, tc.outputR1String, tc.outputR2String, w.String(), w.R1String(), w.R2String()) 129 | } 130 | } 131 | } 132 | 133 | 
func Test_RemoveLastNRunes(t *testing.T) { 134 | var testCases = []struct { 135 | input string 136 | r1start int 137 | r2start int 138 | n int 139 | output string 140 | outputR1String string 141 | outputR2String string 142 | }{ 143 | {"aabbccddee", 8, 9, 0, "aabbccddee", "ee", "e"}, 144 | {"aabbccddee", 8, 9, 5, "aabbc", "", ""}, 145 | {"aabbccddee", 8, 9, 1, "aabbccdde", "e", ""}, 146 | } 147 | for _, tc := range testCases { 148 | w := New(tc.input) 149 | w.R1start = tc.r1start 150 | w.R2start = tc.r2start 151 | w.RemoveLastNRunes(tc.n) 152 | if w.String() != tc.output || w.R1String() != tc.outputR1String || w.R2String() != tc.outputR2String { 153 | t.Errorf("Expected %v -> \"{%v, %v, %v}\" but got \"{%v, %v, %v}\"", tc.input, tc.output, tc.outputR1String, tc.outputR2String, w.String(), w.R1String(), w.R2String()) 154 | } 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /spanish/common.go: -------------------------------------------------------------------------------- 1 | package spanish 2 | 3 | import ( 4 | "github.com/kljensen/snowball/romance" 5 | "github.com/kljensen/snowball/snowballword" 6 | ) 7 | 8 | // Change the vowels "áéíóú" into "aeiou". 9 | // 10 | func removeAccuteAccents(word *snowballword.SnowballWord) (didReplacement bool) { 11 | for i := 0; i < len(word.RS); i++ { 12 | switch word.RS[i] { 13 | case 225: 14 | // á -> a 15 | word.RS[i] = 97 16 | didReplacement = true 17 | case 233: 18 | // é -> e 19 | word.RS[i] = 101 20 | didReplacement = true 21 | case 237: 22 | // í -> i 23 | word.RS[i] = 105 24 | didReplacement = true 25 | case 243: 26 | // ó -> o 27 | word.RS[i] = 111 28 | didReplacement = true 29 | case 250: 30 | // ú -> u 31 | word.RS[i] = 117 32 | didReplacement = true 33 | } 34 | } 35 | return 36 | } 37 | 38 | // Find the starting point of the regions R1, R2, & RV 39 | // 40 | func findRegions(word *snowballword.SnowballWord) (r1start, r2start, rvstart int) { 41 | 42 | r1start = romance.VnvSuffix(word, isLowerVowel, 0) 43 | r2start = romance.VnvSuffix(word, isLowerVowel, r1start) 44 | rvstart = len(word.RS) 45 | 46 | if len(word.RS) >= 3 { 47 | switch { 48 | 49 | case !isLowerVowel(word.RS[1]): 50 | 51 | // If the second letter is a consonant, RV is the region after the 52 | // next following vowel. 53 | for i := 2; i < len(word.RS); i++ { 54 | if isLowerVowel(word.RS[i]) { 55 | rvstart = i + 1 56 | break 57 | } 58 | } 59 | 60 | case isLowerVowel(word.RS[0]) && isLowerVowel(word.RS[1]): 61 | 62 | // Or if the first two letters are vowels, RV 63 | // is the region after the next consonant. 64 | for i := 2; i < len(word.RS); i++ { 65 | if !isLowerVowel(word.RS[i]) { 66 | rvstart = i + 1 67 | break 68 | } 69 | } 70 | default: 71 | 72 | // Otherwise (consonant-vowel case) RV is the region after the 73 | // third letter. But RV is the end of the word if these 74 | // positions cannot be found. 75 | rvstart = 3 76 | } 77 | } 78 | 79 | return 80 | } 81 | 82 | // Checks if a rune is a lowercase Spanish vowel. 83 | // 84 | func isLowerVowel(r rune) bool { 85 | 86 | // The spanish vowels are "aeiouáéíóúü", which 87 | // are referenced by their unicode code points 88 | // in the switch statement below. 89 | switch r { 90 | case 97, 101, 105, 111, 117, 225, 233, 237, 243, 250, 252: 91 | return true 92 | } 93 | return false 94 | } 95 | 96 | // Return `true` if the input `word` is a Spanish stop word. 
97 | // 98 | func IsStopWord(word string) bool { 99 | switch word { 100 | case "de", "la", "que", "el", "en", "y", "a", "los", "del", "se", "las", 101 | "por", "un", "para", "con", "no", "una", "su", "al", "lo", "como", 102 | "más", "pero", "sus", "le", "ya", "o", "este", "sí", "porque", "esta", 103 | "entre", "cuando", "muy", "sin", "sobre", "también", "me", "hasta", 104 | "hay", "donde", "quien", "desde", "todo", "nos", "durante", "todos", 105 | "uno", "les", "ni", "contra", "otros", "ese", "eso", "ante", "ellos", 106 | "e", "esto", "mí", "antes", "algunos", "qué", "unos", "yo", "otro", 107 | "otras", "otra", "él", "tanto", "esa", "estos", "mucho", "quienes", 108 | "nada", "muchos", "cual", "poco", "ella", "estar", "estas", "algunas", 109 | "algo", "nosotros", "mi", "mis", "tú", "te", "ti", "tu", "tus", "ellas", 110 | "nosotras", "vosostros", "vosostras", "os", "mío", "mía", "míos", "mías", 111 | "tuyo", "tuya", "tuyos", "tuyas", "suyo", "suya", "suyos", "suyas", 112 | "nuestro", "nuestra", "nuestros", "nuestras", "vuestro", "vuestra", 113 | "vuestros", "vuestras", "esos", "esas", "estoy", "estás", "está", "estamos", 114 | "estáis", "están", "esté", "estés", "estemos", "estéis", "estén", "estaré", 115 | "estarás", "estará", "estaremos", "estaréis", "estarán", "estaría", 116 | "estarías", "estaríamos", "estaríais", "estarían", "estaba", "estabas", 117 | "estábamos", "estabais", "estaban", "estuve", "estuviste", "estuvo", 118 | "estuvimos", "estuvisteis", "estuvieron", "estuviera", "estuvieras", 119 | "estuviéramos", "estuvierais", "estuvieran", "estuviese", "estuvieses", 120 | "estuviésemos", "estuvieseis", "estuviesen", "estando", "estado", 121 | "estada", "estados", "estadas", "estad", "he", "has", "ha", "hemos", 122 | "habéis", "han", "haya", "hayas", "hayamos", "hayáis", "hayan", 123 | "habré", "habrás", "habrá", "habremos", "habréis", "habrán", "habría", 124 | "habrías", "habríamos", "habríais", "habrían", "había", "habías", 125 | "habíamos", "habíais", "habían", "hube", "hubiste", "hubo", "hubimos", 126 | "hubisteis", "hubieron", "hubiera", "hubieras", "hubiéramos", "hubierais", 127 | "hubieran", "hubiese", "hubieses", "hubiésemos", "hubieseis", "hubiesen", 128 | "habiendo", "habido", "habida", "habidos", "habidas", "soy", "eres", 129 | "es", "somos", "sois", "son", "sea", "seas", "seamos", "seáis", "sean", 130 | "seré", "serás", "será", "seremos", "seréis", "serán", "sería", "serías", 131 | "seríamos", "seríais", "serían", "era", "eras", "éramos", "erais", 132 | "eran", "fui", "fuiste", "fue", "fuimos", "fuisteis", "fueron", "fuera", 133 | "fueras", "fuéramos", "fuerais", "fueran", "fuese", "fueses", "fuésemos", 134 | "fueseis", "fuesen", "sintiendo", "sentido", "sentida", "sentidos", 135 | "sentidas", "siente", "sentid", "tengo", "tienes", "tiene", "tenemos", 136 | "tenéis", "tienen", "tenga", "tengas", "tengamos", "tengáis", "tengan", 137 | "tendré", "tendrás", "tendrá", "tendremos", "tendréis", "tendrán", 138 | "tendría", "tendrías", "tendríamos", "tendríais", "tendrían", "tenía", 139 | "tenías", "teníamos", "teníais", "tenían", "tuve", "tuviste", "tuvo", 140 | "tuvimos", "tuvisteis", "tuvieron", "tuviera", "tuvieras", "tuviéramos", 141 | "tuvierais", "tuvieran", "tuviese", "tuvieses", "tuviésemos", "tuvieseis", 142 | "tuviesen", "teniendo", "tenido", "tenida", "tenidos", "tenidas", "tened": 143 | return true 144 | } 145 | return false 146 | } 147 | -------------------------------------------------------------------------------- /hungarian/common.go: 
-------------------------------------------------------------------------------- 1 | package hungarian 2 | 3 | import ( 4 | "sync" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | var ( 10 | runesMapMu sync.Mutex 11 | runesMap = make(map[string][]rune) 12 | ) 13 | 14 | func runesOf(s string) []rune { 15 | runesMapMu.Lock() 16 | rs := runesMap[s] 17 | if rs == nil { 18 | rs = []rune(s) 19 | runesMap[s] = rs 20 | } 21 | runesMapMu.Unlock() 22 | return rs 23 | } 24 | 25 | // findRegions returns start of R1. 26 | // 27 | // If the word begins with a vowel, R1 is defined as the region after the first consonant or digraph in the word. 28 | // If the word begins with a consonant, it is defined as the region after the first vowel in the word. 29 | // If the word does not contain both a vowel and consonant, R1 is the null region at the end of the word. 30 | func findRegions(word *snowballword.SnowballWord) (r1start int) { 31 | if len(word.RS) < 2 { 32 | return 0 33 | } 34 | 35 | // If the word begins with a vowel, R1 is defined as the region 36 | // after the first consonant or digraph in the word. 37 | if isVowel(word.RS[0]) { 38 | for i := 1; i < len(word.RS); i++ { 39 | if isVowel(word.RS[i]) { 40 | continue 41 | } 42 | if j := isDigraph(word.RS[i:]); j > 0 { 43 | return i + j 44 | } 45 | // consonant 46 | return i + 1 47 | } 48 | return len(word.RS) 49 | } 50 | 51 | // If the word begins with a consonant, it is defined as the region 52 | // after the first vowel in the word. 53 | for i := 1; i < len(word.RS); i++ { 54 | if isVowel(word.RS[i]) { 55 | return i + 1 56 | } 57 | } 58 | return len(word.RS) 59 | } 60 | 61 | func isVowel(r rune) bool { 62 | switch r { 63 | case 'a', 'á', 'e', 'é', 'i', 'í', 'o', 'ó', 'ö', 'ő', 'u', 'ú', 'ü', 'ű': 64 | return true 65 | } 66 | return false 67 | } 68 | func isDigraph(rs []rune) int { 69 | if len(rs) < 2 { 70 | return 0 71 | } 72 | switch rs[0] { 73 | case 'c', 'z': // cs, zs 74 | if rs[1] == 's' { 75 | return 2 76 | } 77 | case 'd': 78 | if rs[1] == 'z' { 79 | if len(rs) > 2 && rs[2] == 's' { // dzs 80 | return 3 81 | } 82 | return 2 // dz 83 | } 84 | case 'g', 'l', 'n', 't': 85 | if rs[1] == 'y' { 86 | return 2 87 | } 88 | } 89 | return 0 90 | } 91 | 92 | func isConsonant(r rune) bool { 93 | switch r { 94 | case 'b', 'c', 'd', 'f', 'g', 'j', 'k', 'l', 'm', 'n', 'p', 'r', 's', 't', 'v', 'z': 95 | return true 96 | } 97 | return false 98 | } 99 | func isDoubleConsonant(rs []rune) int { 100 | if len(rs) < 2 || !isConsonant(rs[0]) || rs[0] != rs[1] { 101 | return 0 102 | } 103 | if len(rs) > 2 { 104 | switch rs[0] { 105 | case 'c', 'z': 106 | if rs[2] == 's' { 107 | return 3 108 | } 109 | case 's': 110 | if rs[2] == 'z' { 111 | return 3 112 | } 113 | case 'g', 'l', 'n', 't': 114 | if rs[2] == 'y' { 115 | return 3 116 | } 117 | } 118 | } 119 | return 2 120 | } 121 | 122 | // IsStopWord returns true it the word is a stop word. 
123 | // 124 | // # Hungarian stop word list prepared by Anna Tordai 125 | // 126 | // https://snowballstem.org/algorithms/hungarian/stop.txt 127 | func IsStopWord(word string) bool { 128 | switch word { 129 | case "a", 130 | "ahogy", 131 | "ahol", 132 | "aki", 133 | "akik", 134 | "akkor", 135 | "alatt", 136 | "által", 137 | "általában", 138 | "amely", 139 | "amelyek", 140 | "amelyekben", 141 | "amelyeket", 142 | "amelyet", 143 | "amelynek", 144 | "ami", 145 | "amit", 146 | "amolyan", 147 | "amíg", 148 | "amikor", 149 | "át", 150 | "abban", 151 | "ahhoz", 152 | "annak", 153 | "arra", 154 | "arról", 155 | "az", 156 | "azok", 157 | "azon", 158 | "azt", 159 | "azzal", 160 | "azért", 161 | "aztán", 162 | "azután", 163 | "azonban", 164 | "bár", 165 | "be", 166 | "belül", 167 | "benne", 168 | "cikk", 169 | "cikkek", 170 | "cikkeket", 171 | "csak", 172 | "de", 173 | "e", 174 | "eddig", 175 | "egész", 176 | "egy", 177 | "egyes", 178 | "egyetlen", 179 | "egyéb", 180 | "egyik", 181 | "egyre", 182 | "ekkor", 183 | "el", 184 | "elég", 185 | "ellen", 186 | "elő", 187 | "először", 188 | "előtt", 189 | "első", 190 | "én", 191 | "éppen", 192 | "ebben", 193 | "ehhez", 194 | "emilyen", 195 | "ennek", 196 | "erre", 197 | "ez", 198 | "ezt", 199 | "ezek", 200 | "ezen", 201 | "ezzel", 202 | "ezért", 203 | "és", 204 | "fel", 205 | "felé", 206 | "hanem", 207 | "hiszen", 208 | "hogy", 209 | "hogyan", 210 | "igen", 211 | "így", 212 | "illetve", 213 | "ill.", 214 | "ill", 215 | "ilyen", 216 | "ilyenkor", 217 | "ison", 218 | "ismét", 219 | "itt", 220 | "jó", 221 | "jól", 222 | "jobban", 223 | "kell", 224 | "kellett", 225 | "keresztül", 226 | "keressünk", 227 | "ki", 228 | "kívül", 229 | "között", 230 | "közül", 231 | "legalább", 232 | "lehet", 233 | "lehetett", 234 | "legyen", 235 | "lenne", 236 | "lenni", 237 | "lesz", 238 | "lett", 239 | "maga", 240 | "magát", 241 | "majd", 242 | "már", 243 | "más", 244 | "másik", 245 | "meg", 246 | "még", 247 | "mellett", 248 | "mert", 249 | "mely", 250 | "melyek", 251 | "mi", 252 | "mit", 253 | "míg", 254 | "miért", 255 | "milyen", 256 | "mikor", 257 | "minden", 258 | "mindent", 259 | "mindenki", 260 | "mindig", 261 | "mint", 262 | "mintha", 263 | "mivel", 264 | "most", 265 | "nagy", 266 | "nagyobb", 267 | "nagyon", 268 | "ne", 269 | "néha", 270 | "nekem", 271 | "neki", 272 | "nem", 273 | "néhány", 274 | "nélkül", 275 | "nincs", 276 | "olyan", 277 | "ott", 278 | "össze", 279 | "ő", 280 | "ők", 281 | "őket", 282 | "pedig", 283 | "persze", 284 | "rá", 285 | "s", 286 | "saját", 287 | "sem", 288 | "semmi", 289 | "sok", 290 | "sokat", 291 | "sokkal", 292 | "számára", 293 | "szemben", 294 | "szerint", 295 | "szinte", 296 | "talán", 297 | "tehát", 298 | "teljes", 299 | "tovább", 300 | "továbbá", 301 | "több", 302 | "úgy", 303 | "ugyanis", 304 | "új", 305 | "újabb", 306 | "újra", 307 | "után", 308 | "utána", 309 | "utolsó", 310 | "vagy", 311 | "vagyis", 312 | "valaki", 313 | "valami", 314 | "valamint", 315 | "való", 316 | "vagyok", 317 | "van", 318 | "vannak", 319 | "volt", 320 | "voltam", 321 | "voltak", 322 | "voltunk", 323 | "vissza", 324 | "vele", 325 | "viszont", 326 | "volna": 327 | return true 328 | } 329 | return false 330 | } 331 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Snowball 2 | ======== 3 | 4 | 5 | A [Go (golang)](http://golang.org) implementation of the 6 | [Snowball stemmer](http://snowball.tartarus.org/) 7 | for natural language 
processing. 8 | 9 | 10 | | | Status | 11 | | -------------------- | ------------------------- | 12 | | Latest release | [v0.10.0](https://github.com/kljensen/snowball/tags) (2024-08-13) | 13 | | Latest build status | [![Build](https://github.com/kljensen/snowball/workflows/Build/badge.svg?event=push)](https://github.com/kljensen/snowball/actions) | 14 | | Languages available | English, Spanish (español), French (le français), Russian (ру́сский язы́к), Swedish (svenska), Norwegian (norsk), Hungarian (magyar) | 15 | | License | MIT | 16 | 17 | 18 | ## Usage 19 | 20 | 21 | Here is a minimal Go program that uses this package 22 | to stem a single word. 23 | 24 | ```go 25 | package main 26 | import ( 27 | "fmt" 28 | "github.com/kljensen/snowball" 29 | ) 30 | func main() { 31 | stemmed, err := snowball.Stem("Accumulations", "english", true) 32 | if err == nil { 33 | fmt.Println(stemmed) // Prints "accumul" 34 | } 35 | } 36 | ``` 37 | 38 | 39 | ## Organization & Implementation 40 | 41 | The code is organized as follows: 42 | 43 | * The top-level `snowball` package has a single exported function `snowball.Stem`, 44 | which is defined in `snowball/snowball.go`. 45 | * The stemmer for each language is defined in a "sub-package", e.g. `snowball/spanish`. 46 | * Each language exports a `Stem` function: e.g. `spanish.Stem`, 47 | which is defined in `snowball/spanish/stem.go`. 48 | * Code that is common to multiple languages may go in a separate package, 49 | e.g. the small `romance` package. 50 | 51 | Some notes about the implementation: 52 | 53 | * In order to ensure the code is easily extended to non-English languages, 54 | I avoided using bytes and byte arrays, and instead perform all operations 55 | on runes. See `snowball/snowballword/snowballword.go` and the 56 | `SnowballWord` struct. 57 | * In order to avoid casting strings into slices of runes numerous times, 58 | this implementation uses a single slice of runes stored in the `SnowballWord` 59 | struct for each word that needs to be stemmed. 60 | * In spite of the foregoing, readability requires that some strings be 61 | kept around and repeatedly cast into slices of runes. For example, 62 | in the Spanish stemmer, one step requires removing suffixes with acute 63 | accents such as "ución", "logía", and "logías". If I were to hard-code those 64 | suffixes as slices of runes, the code would be substantially less readable. 65 | * Instead of carrying around the word regions R1, R2, & RV as separate strings 66 | (or slices of runes, or whatever), we carry around the index where each of 67 | these regions begins. These are stored as `R1start`, `R2start`, & `RVstart` 68 | on the `SnowballWord` struct. I believe this is a relatively efficient way of 69 | storing these regions. (See the "Using a sub-package directly" sketch below.) 70 | * The code does not use any maps or regular expressions 1) for kicks, and 2) because 71 | I thought they'd negatively impact the performance. (But, mostly for #1; I realize 72 | #2 is silly.) 73 | * I end up refactoring the `snowballword` package a bit every time I implement a 74 | new language. 75 | * Clearly, the Go implementation of these stemmers is verbose relative to the 76 | Snowball language. However, it is much better than the 77 | [Java version](https://github.com/weavejester/snowball-stemmer/blob/master/src/java/org/tartarus/snowball/ext/frenchStemmer.java) 78 | and [others](https://github.com/patch/lingua-stem-unine-pm5/blob/master/src/frenchStemmerPlus.txt). 79 | 80 | ## Testing 81 | 82 | To run the tests, do `go test ./...` in the top-level directory.
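## Using a sub-package directly

As a small illustration of the layout described under "Organization & Implementation" above, here is a minimal sketch that calls the English sub-package directly and pokes at a `SnowballWord` through the exported pieces of the `snowballword` package. It is only a sketch of the public surface, not the recommended workflow; the top-level `snowball.Stem` shown under "Usage" remains the usual entry point.

```go
package main

import (
	"fmt"

	"github.com/kljensen/snowball/english"
	"github.com/kljensen/snowball/snowballword"
)

func main() {
	// Each language sub-package exports its own Stem function; the
	// second argument controls whether stop words are stemmed.
	fmt.Println(english.Stem("accumulations", true)) // accumul
	fmt.Println(english.IsStopWord("because"))       // true

	// A SnowballWord holds the word as a slice of runes plus the start
	// indices of the R1, R2, and RV regions. New() leaves the indices
	// at the end of the word; they are normally filled in by each
	// language's own (unexported) preprocessing step.
	w := snowballword.New("beautiful")
	fmt.Println(string(w.RS), w.R1start, w.R2start, w.RVstart) // beautiful 9 9 9

	// Suffix handling works on the rune slice in place and keeps the
	// region indices within bounds.
	if w.ReplaceSuffix("ful", "", false) {
		fmt.Println(w.String()) // beauti
	}
}
```

Because the regions are carried as start indices rather than as copies of the substrings, the per-language step functions can share one representation of the word and simply adjust those indices as suffixes are removed.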
83 | 84 | ## Future work 85 | 86 | I'd like to implement the Snowball stemmer in more languages. 87 | If you can help, I would greatly appreciate it: please fork the project and send 88 | a pull request! 89 | 90 | (Also, if you are interested in creating a larger NLP project for Go, please get in touch.) 91 | 92 | ## Related work 93 | 94 | I know of a few other stemmers available in Go: 95 | 96 | * [stemmer](https://github.com/dchest/stemmer) by [Dmitry Chestnykh](https://github.com/dchest). 97 | His project also 98 | implements the Snowball (Porter2) English stemmer as well as the Snowball German stemmer. 99 | * [porter-stemmer](https://github.com/a2800276/porter-stemmer.go) - an implementation of the 100 | original Porter stemming algorithm. 101 | * [go-stem](https://github.com/agonopol/go-stem) by [Alex Gonopolskiy](https://github.com/agonopol). 102 | Also the original Porter algorithm. 103 | * [paicehusk](https://github.com/Rookii/paicehusk) by [Aaron Groves](https://github.com/rookii). 104 | This package implements the 105 | [Paice/Husk](http://www.comp.lancs.ac.uk/computing/research/stemming/) 106 | stemmer. 107 | * [golibstemmer](https://github.com/rjohnsondev/golibstemmer) 108 | by [Richard Johnson](https://github.com/rjohnsondev). This provides Go bindings for the 109 | [libstemmer](http://snowball.tartarus.org/download.php) C library. 110 | * [snowball](https://bitbucket.org/tebeka/snowball) by [Miki Tebeka](http://web.mikitebeka.com/). 111 | Also, I believe, Go bindings for the C library. 112 | 113 | ## Contributors 114 | 115 | * Kyle Jensen (kljensen@gmail.com, [@DataKyle](http://twitter.com/datakyle)) 116 | * [Shawn Smith](https://github.com/shawnps) 117 | * [Herman Schaaf](https://github.com/hermanschaaf) 118 | * [Anton Södergren](https://github.com/AAAton) 119 | * [Eivind Moland](https://github.com/eivindam) 120 | * [Tamás Gulácsi](https://github.com/tgulacsi) 121 | * [@clipperhouse](https://github.com/clipperhouse) 122 | * Your name should be here! 123 | 124 | 125 | ## License (MIT) 126 | 127 | Copyright (c) the Contributors (see above) 128 | 129 | Permission is hereby granted, free of charge, to any person obtaining 130 | a copy of this software and associated documentation files (the 131 | "Software"), to deal in the Software without restriction, including 132 | without limitation the rights to use, copy, modify, merge, publish, 133 | distribute, sublicense, and/or sell copies of the Software, and to 134 | permit persons to whom the Software is furnished to do so, subject to 135 | the following conditions: 136 | 137 | The above copyright notice and this permission notice shall be 138 | included in all copies or substantial portions of the Software. 139 | 140 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 141 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 142 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 143 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 144 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 145 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 146 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
147 | -------------------------------------------------------------------------------- /french/step1.go: -------------------------------------------------------------------------------- 1 | package french 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 1 is the removal of standard suffixes 10 | func step1(word *snowballword.SnowballWord) bool { 11 | suffix := word.FirstSuffix( 12 | "issements", "issement", "atrices", "utions", "usions", "logies", 13 | "emment", "ements", "atrice", "ations", "ateurs", "amment", "ution", 14 | "usion", "ments", "logie", "istes", "ismes", "iqUes", "euses", 15 | "ences", "ement", "ation", "ateur", "ances", "ables", "ment", 16 | "ités", "iste", "isme", "iqUe", "euse", "ence", "eaux", "ance", 17 | "able", "ives", "ité", "eux", "aux", "ive", "ifs", "if", 18 | ) 19 | 20 | if suffix == "" { 21 | return false 22 | } 23 | suffixLength := utf8.RuneCountInString(suffix) 24 | 25 | isInR1 := (word.R1start <= len(word.RS)-suffixLength) 26 | isInR2 := (word.R2start <= len(word.RS)-suffixLength) 27 | isInRV := (word.RVstart <= len(word.RS)-suffixLength) 28 | 29 | // Handle simple replacements & deletions in R2 first 30 | if isInR2 { 31 | 32 | // Handle simple replacements in R2 33 | repl := "" 34 | switch suffix { 35 | case "logie", "logies": 36 | repl = "log" 37 | case "usion", "ution", "usions", "utions": 38 | repl = "u" 39 | case "ence", "ences": 40 | repl = "ent" 41 | } 42 | if repl != "" { 43 | word.ReplaceSuffixRunes([]rune(suffix), []rune(repl), true) 44 | return true 45 | } 46 | 47 | // Handle simple deletions in R2 48 | switch suffix { 49 | case "ance", "iqUe", "isme", "able", "iste", "eux", "ances", "iqUes", "ismes", "ables", "istes": 50 | word.RemoveLastNRunes(suffixLength) 51 | return true 52 | } 53 | } 54 | 55 | // Handle simple replacements in RV 56 | if isInRV { 57 | 58 | // NOTE: these are "special" suffixes in that 59 | // we must still do steps 2a and 2b of the 60 | // French stemmer even when these suffixes are 61 | // found in step1. Therefore, we are returning 62 | // `false` here. 63 | 64 | repl := "" 65 | switch suffix { 66 | case "amment": 67 | repl = "ant" 68 | case "emment": 69 | repl = "ent" 70 | } 71 | if repl != "" { 72 | word.ReplaceSuffixRunes([]rune(suffix), []rune(repl), true) 73 | return false 74 | } 75 | 76 | // Delete if preceded by a vowel that is also in RV 77 | if suffix == "ment" || suffix == "ments" { 78 | idx := len(word.RS) - suffixLength - 1 79 | if idx >= word.RVstart && isLowerVowel(word.RS[idx]) { 80 | word.RemoveLastNRunes(suffixLength) 81 | return false 82 | } 83 | return false 84 | } 85 | } 86 | 87 | // Handle all the other "special" cases. All of these 88 | // return true immediately after changing the word. 
89 | // 90 | switch suffix { 91 | case "eaux": 92 | 93 | // Replace with eau 94 | word.ReplaceSuffixRunes([]rune(suffix), []rune("eau"), true) 95 | return true 96 | 97 | case "aux": 98 | 99 | // Replace with al if in R1 100 | if isInR1 { 101 | word.ReplaceSuffixRunes([]rune(suffix), []rune("al"), true) 102 | return true 103 | } 104 | 105 | case "euse", "euses": 106 | 107 | // Delete if in R2, else replace by eux if in R1 108 | if isInR2 { 109 | word.RemoveLastNRunes(suffixLength) 110 | return true 111 | } else if isInR1 { 112 | word.ReplaceSuffixRunes([]rune(suffix), []rune("eux"), true) 113 | return true 114 | } 115 | 116 | case "issement", "issements": 117 | 118 | // Delete if in R1 and preceded by a non-vowel 119 | if isInR1 { 120 | idx := len(word.RS) - suffixLength - 1 121 | if idx >= 0 && isLowerVowel(word.RS[idx]) == false { 122 | word.RemoveLastNRunes(suffixLength) 123 | return true 124 | } 125 | } 126 | return false 127 | 128 | case "atrice", "ateur", "ation", "atrices", "ateurs", "ations": 129 | 130 | // Delete if in R2 131 | if isInR2 { 132 | word.RemoveLastNRunes(suffixLength) 133 | 134 | // If preceded by "ic", delete if in R2, else replace by "iqU". 135 | newSuffix := word.FirstSuffix("ic") 136 | newSuffixRunes := []rune(newSuffix) 137 | if newSuffix != "" { 138 | if word.FitsInR2(len(newSuffixRunes)) { 139 | word.RemoveLastNRunes(len(newSuffixRunes)) 140 | } else { 141 | word.ReplaceSuffixRunes(newSuffixRunes, []rune("iqU"), true) 142 | } 143 | } 144 | return true 145 | } 146 | 147 | case "ement", "ements": 148 | 149 | if isInRV { 150 | 151 | // Delete if in RV 152 | word.RemoveLastNRunes(suffixLength) 153 | 154 | // If preceded by "iv", delete if in R2 155 | // (and if further preceded by "at", delete if in R2) 156 | newSuffix := word.RemoveFirstSuffixIfIn(word.R2start, "iv") 157 | newSuffixRunes := []rune(newSuffix) 158 | if newSuffix != "" { 159 | word.RemoveFirstSuffixIfIn(word.R2start, "at") 160 | return true 161 | } 162 | 163 | // If preceded by "eus", delete if in R2, else replace by "eux" if in R1 164 | newSuffix = word.FirstSuffix("eus") 165 | newSuffixRunes = []rune(newSuffix) 166 | if newSuffix != "" { 167 | newSuffixLen := len(newSuffixRunes) 168 | if word.FitsInR2(newSuffixLen) { 169 | word.RemoveLastNRunes(newSuffixLen) 170 | } else if word.FitsInR1(newSuffixLen) { 171 | word.ReplaceSuffixRunes(newSuffixRunes, []rune("eux"), true) 172 | } 173 | return true 174 | } 175 | 176 | // If preceded by abl or iqU, delete if in R2, otherwise, 177 | newSuffix = word.FirstSuffix("abl", "iqU") 178 | if newSuffix != "" { 179 | newSuffixLen := utf8.RuneCountInString(newSuffix) 180 | if word.FitsInR2(newSuffixLen) { 181 | word.RemoveLastNRunes(newSuffixLen) 182 | } 183 | return true 184 | } 185 | 186 | // If preceded by ièr or Ièr, replace by i if in RV 187 | newSuffix = word.FirstSuffix("ièr", "Ièr") 188 | newSuffixRunes = []rune(newSuffix) 189 | if newSuffix != "" { 190 | if word.FitsInRV(len(newSuffixRunes)) { 191 | word.ReplaceSuffixRunes(newSuffixRunes, []rune("i"), true) 192 | } 193 | return true 194 | } 195 | 196 | return true 197 | } 198 | 199 | case "ité", "ités": 200 | 201 | if isInR2 { 202 | 203 | // Delete if in R2 204 | word.RemoveLastNRunes(suffixLength) 205 | 206 | // If preceded by "abil", delete if in R2, else replace by "abl" 207 | newSuffix := word.FirstSuffix("abil") 208 | if newSuffix != "" { 209 | newSuffixLen := utf8.RuneCountInString(newSuffix) 210 | if word.FitsInR2(newSuffixLen) { 211 | word.RemoveLastNRunes(newSuffixLen) 212 | } else { 213 | 
word.ReplaceSuffixRunes([]rune(newSuffix), []rune("abl"), true) 214 | } 215 | return true 216 | } 217 | 218 | // If preceded by "ic", delete if in R2, else replace by "iqU" 219 | newSuffix = word.FirstSuffix("ic") 220 | if newSuffix != "" { 221 | newSuffixLen := utf8.RuneCountInString(newSuffix) 222 | if word.FitsInR2(newSuffixLen) { 223 | word.RemoveLastNRunes(newSuffixLen) 224 | } else { 225 | word.ReplaceSuffixRunes([]rune(newSuffix), []rune("iqU"), true) 226 | } 227 | return true 228 | } 229 | 230 | // If preceded by "iv", delete if in R2 231 | newSuffix = word.RemoveFirstSuffixIfIn(word.R2start, "iv") 232 | return true 233 | } 234 | case "if", "ive", "ifs", "ives": 235 | 236 | if isInR2 { 237 | 238 | // Delete if in R2 239 | word.RemoveLastNRunes(suffixLength) 240 | 241 | // If preceded by at, delete if in R2 242 | newSuffix := word.RemoveFirstSuffixIfIn(word.R2start, "at") 243 | if newSuffix != "" { 244 | 245 | // And if further preceded by ic, delete if in R2, else replace by iqU 246 | newSuffix = word.FirstSuffix("ic") 247 | if newSuffix != "" { 248 | newSuffixLen := utf8.RuneCountInString(newSuffix) 249 | if word.FitsInR2(newSuffixLen) { 250 | word.RemoveLastNRunes(newSuffixLen) 251 | } else { 252 | word.ReplaceSuffixRunes([]rune(newSuffix), []rune("iqU"), true) 253 | } 254 | } 255 | } 256 | return true 257 | 258 | } 259 | } 260 | return false 261 | } 262 | -------------------------------------------------------------------------------- /english/common.go: -------------------------------------------------------------------------------- 1 | package english 2 | 3 | import ( 4 | "github.com/kljensen/snowball/romance" 5 | "github.com/kljensen/snowball/snowballword" 6 | ) 7 | 8 | // Replaces all different kinds of apostrophes with a single 9 | // kind: "'" -- that is, "\x27", or unicode codepoint 39. 10 | func normalizeApostrophes(word *snowballword.SnowballWord) (numSubstitutions int) { 11 | for i, r := range word.RS { 12 | switch r { 13 | 14 | // The rune is one of "\u2019", "\u2018", or "\u201B"; 15 | // equivalently, unicode code points 8217, 8216, & 8219. 16 | case 8217, 8216, 8219: 17 | 18 | // (Note: the unicode code point for ' is 39.) 19 | 20 | word.RS[i] = 39 21 | numSubstitutions += 1 22 | } 23 | } 24 | return 25 | } 26 | 27 | // Trim off leading apostropes. (Slight variation from 28 | // NLTK implementation here, in which only the first is removed.) 
29 | func trimLeftApostrophes(word *snowballword.SnowballWord) { 30 | var ( 31 | numApostrophes int 32 | r rune 33 | ) 34 | 35 | for numApostrophes, r = range word.RS { 36 | 37 | // Check for "'", which is unicode code point 39 38 | if r != 39 { 39 | break 40 | } 41 | } 42 | if numApostrophes > 0 { 43 | word.RS = word.RS[numApostrophes:] 44 | word.R1start = word.R1start - numApostrophes 45 | word.R2start = word.R2start - numApostrophes 46 | } 47 | } 48 | 49 | // Capitalize all 'Y's preceded by vowels or starting a word 50 | func capitalizeYs(word *snowballword.SnowballWord) (numCapitalizations int) { 51 | for i, r := range word.RS { 52 | 53 | // (Note: Y & y unicode code points = 89 & 121) 54 | 55 | if r == 121 && (i == 0 || isLowerVowel(word.RS[i-1])) { 56 | word.RS[i] = 89 57 | numCapitalizations += 1 58 | } 59 | } 60 | return 61 | } 62 | 63 | // Uncapitalize all 'Y's 64 | func uncapitalizeYs(word *snowballword.SnowballWord) { 65 | for i, r := range word.RS { 66 | 67 | // (Note: Y & y unicode code points = 89 & 121) 68 | 69 | if r == 89 { 70 | word.RS[i] = 121 71 | } 72 | } 73 | return 74 | } 75 | 76 | // Find the starting point of the two regions R1 & R2. 77 | // 78 | // R1 is the region after the first non-vowel following a vowel, 79 | // or is the null region at the end of the word if there is no 80 | // such non-vowel. 81 | // 82 | // R2 is the region after the first non-vowel following a vowel 83 | // in R1, or is the null region at the end of the word if there 84 | // is no such non-vowel. 85 | // 86 | // See http://snowball.tartarus.org/texts/r1r2.html 87 | func r1r2(word *snowballword.SnowballWord) (r1start, r2start int) { 88 | 89 | specialPrefix := word.FirstPrefix("gener", "commun", "arsen") 90 | 91 | if specialPrefix != "" { 92 | r1start = len(specialPrefix) 93 | } else { 94 | r1start = romance.VnvSuffix(word, isLowerVowel, 0) 95 | } 96 | r2start = romance.VnvSuffix(word, isLowerVowel, r1start) 97 | return 98 | } 99 | 100 | // Checks if a rune is a lowercase English vowel. 101 | func isLowerVowel(r rune) bool { 102 | switch r { 103 | case 97, 101, 105, 111, 117, 121: 104 | return true 105 | } 106 | return false 107 | } 108 | 109 | // Returns the stemmed version of a word if it is a special 110 | // case, otherwise returns the empty string. 
111 | func stemSpecialWord(word string) (stemmed string) { 112 | switch word { 113 | case "skis": 114 | stemmed = "ski" 115 | case "skies": 116 | stemmed = "sky" 117 | case "dying": 118 | stemmed = "die" 119 | case "lying": 120 | stemmed = "lie" 121 | case "tying": 122 | stemmed = "tie" 123 | case "idly": 124 | stemmed = "idl" 125 | case "gently": 126 | stemmed = "gentl" 127 | case "ugly": 128 | stemmed = "ugli" 129 | case "early": 130 | stemmed = "earli" 131 | case "only": 132 | stemmed = "onli" 133 | case "singly": 134 | stemmed = "singl" 135 | case "sky": 136 | stemmed = "sky" 137 | case "news": 138 | stemmed = "news" 139 | case "howe": 140 | stemmed = "howe" 141 | case "atlas": 142 | stemmed = "atlas" 143 | case "cosmos": 144 | stemmed = "cosmos" 145 | case "bias": 146 | stemmed = "bias" 147 | case "andes": 148 | stemmed = "andes" 149 | case "inning": 150 | stemmed = "inning" 151 | case "innings": 152 | stemmed = "inning" 153 | case "outing": 154 | stemmed = "outing" 155 | case "outings": 156 | stemmed = "outing" 157 | case "canning": 158 | stemmed = "canning" 159 | case "cannings": 160 | stemmed = "canning" 161 | case "herring": 162 | stemmed = "herring" 163 | case "herrings": 164 | stemmed = "herring" 165 | case "earring": 166 | stemmed = "earring" 167 | case "earrings": 168 | stemmed = "earring" 169 | case "proceed": 170 | stemmed = "proceed" 171 | case "proceeds": 172 | stemmed = "proceed" 173 | case "proceeded": 174 | stemmed = "proceed" 175 | case "proceeding": 176 | stemmed = "proceed" 177 | case "exceed": 178 | stemmed = "exceed" 179 | case "exceeds": 180 | stemmed = "exceed" 181 | case "exceeded": 182 | stemmed = "exceed" 183 | case "exceeding": 184 | stemmed = "exceed" 185 | case "succeed": 186 | stemmed = "succeed" 187 | case "succeeds": 188 | stemmed = "succeed" 189 | case "succeeded": 190 | stemmed = "succeed" 191 | case "succeeding": 192 | stemmed = "succeed" 193 | } 194 | return 195 | } 196 | 197 | // Return `true` if the input `word` is an English stop word. 198 | func IsStopWord(word string) bool { 199 | switch word { 200 | case "a", "about", "above", "after", "again", "against", "all", "am", "an", 201 | "and", "any", "are", "as", "at", "be", "because", "been", "before", 202 | "being", "below", "between", "both", "but", "by", "can", "did", "do", 203 | "does", "doing", "don", "down", "during", "each", "few", "for", "from", 204 | "further", "had", "has", "have", "having", "he", "her", "here", "hers", 205 | "herself", "him", "himself", "his", "how", "i", "if", "in", "into", "is", 206 | "it", "its", "itself", "just", "me", "more", "most", "my", "myself", 207 | "no", "nor", "not", "now", "of", "off", "on", "once", "only", "or", 208 | "other", "our", "ours", "ourselves", "out", "over", "own", "s", "same", 209 | "she", "should", "so", "some", "such", "t", "than", "that", "the", "their", 210 | "theirs", "them", "themselves", "then", "there", "these", "they", 211 | "this", "those", "through", "to", "too", "under", "until", "up", 212 | "very", "was", "we", "were", "what", "when", "where", "which", "while", 213 | "who", "whom", "why", "will", "with", "you", "your", "yours", "yourself", 214 | "yourselves": 215 | return true 216 | } 217 | return false 218 | } 219 | 220 | // A word is called short if it ends in a short syllable, and if R1 is null. 
221 | func isShortWord(w *snowballword.SnowballWord) (isShort bool) { 222 | 223 | // If r1 is not empty, the word is not short 224 | if w.R1start < len(w.RS) { 225 | return 226 | } 227 | 228 | // Otherwise it must end in a short syllable 229 | return endsShortSyllable(w, len(w.RS)) 230 | } 231 | 232 | // Return true if the indicies at `w.RS[:i]` end in a short syllable. 233 | // Define a short syllable in a word as either 234 | // (a) a vowel followed by a non-vowel other than w, x or Y 235 | // 236 | // and preceded by a non-vowel, or 237 | // 238 | // (b) a vowel at the beginning of the word followed by a non-vowel. 239 | func endsShortSyllable(w *snowballword.SnowballWord, i int) bool { 240 | 241 | if i == 2 { 242 | 243 | // Check for a vowel at the beginning of the word followed by a non-vowel. 244 | if isLowerVowel(w.RS[0]) && !isLowerVowel(w.RS[1]) { 245 | return true 246 | } else { 247 | return false 248 | } 249 | 250 | } else if i >= 3 { 251 | 252 | // The runes 1, 2, & 3 positions to the left of `i`. 253 | s1 := w.RS[i-1] 254 | s2 := w.RS[i-2] 255 | s3 := w.RS[i-3] 256 | 257 | // Check for a vowel followed by a non-vowel other than w, x or Y 258 | // and preceded by a non-vowel. 259 | // (Note: w, x, Y rune codepoints = 119, 120, 89) 260 | // 261 | if !isLowerVowel(s1) && s1 != 119 && s1 != 120 && s1 != 89 && isLowerVowel(s2) && !isLowerVowel(s3) { 262 | return true 263 | } else { 264 | return false 265 | } 266 | 267 | } 268 | return false 269 | } 270 | -------------------------------------------------------------------------------- /snowballword/snowballword.go: -------------------------------------------------------------------------------- 1 | /* 2 | This package defines a SnowballWord struct that is used 3 | to encapsulate most of the "state" variables we must track 4 | when stemming a word. The SnowballWord struct also has 5 | a few methods common to stemming in a variety of languages. 6 | */ 7 | package snowballword 8 | 9 | import ( 10 | "fmt" 11 | "unicode/utf8" 12 | ) 13 | 14 | // SnowballWord represents a word that is going to be stemmed. 15 | type SnowballWord struct { 16 | 17 | // A slice of runes 18 | RS []rune 19 | 20 | // The index in RS where the R1 region begins 21 | R1start int 22 | 23 | // The index in RS where the R2 region begins 24 | R2start int 25 | 26 | // The index in RS where the RV region begins 27 | RVstart int 28 | } 29 | 30 | // Create a new SnowballWord struct 31 | func New(in string) (word *SnowballWord) { 32 | word = &SnowballWord{RS: []rune(in)} 33 | word.R1start = len(word.RS) 34 | word.R2start = len(word.RS) 35 | word.RVstart = len(word.RS) 36 | return 37 | } 38 | 39 | // Replace a suffix and adjust R1start and R2start as needed. 40 | // If `force` is false, check to make sure the suffix exists first. 41 | func (w *SnowballWord) ReplaceSuffix(suffix, replacement string, force bool) bool { 42 | 43 | var ( 44 | doReplacement bool 45 | suffixRunes []rune 46 | ) 47 | if force { 48 | doReplacement = true 49 | suffixRunes = []rune(suffix) 50 | } else { 51 | var foundSuffix string 52 | foundSuffix = w.FirstSuffix(suffix) 53 | suffixRunes = []rune(foundSuffix) 54 | if foundSuffix == suffix { 55 | doReplacement = true 56 | } 57 | } 58 | if doReplacement == false { 59 | return false 60 | } 61 | w.ReplaceSuffixRunes(suffixRunes, []rune(replacement), true) 62 | return true 63 | } 64 | 65 | // Remove the last `n` runes from the SnowballWord. 
66 | func (w *SnowballWord) RemoveLastNRunes(n int) { 67 | w.RS = w.RS[:len(w.RS)-n] 68 | w.resetR1R2() 69 | } 70 | 71 | // Replace a suffix and adjust R1start and R2start as needed. 72 | // If `force` is false, check to make sure the suffix exists first. 73 | func (w *SnowballWord) ReplaceSuffixRunes(suffixRunes []rune, replacementRunes []rune, force bool) bool { 74 | 75 | if force || w.HasSuffixRunes(suffixRunes) { 76 | lenWithoutSuffix := len(w.RS) - len(suffixRunes) 77 | w.RS = append(w.RS[:lenWithoutSuffix], replacementRunes...) 78 | 79 | // If R, R2, & RV are now beyond the length 80 | // of the word, they are set to the length 81 | // of the word. Otherwise, they are left 82 | // as they were. 83 | w.resetR1R2() 84 | return true 85 | } 86 | return false 87 | } 88 | 89 | // Resets R1start and R2start to ensure they 90 | // are within bounds of the current rune slice. 91 | func (w *SnowballWord) resetR1R2() { 92 | rsLen := len(w.RS) 93 | if w.R1start > rsLen { 94 | w.R1start = rsLen 95 | } 96 | if w.R2start > rsLen { 97 | w.R2start = rsLen 98 | } 99 | if w.RVstart > rsLen { 100 | w.RVstart = rsLen 101 | } 102 | } 103 | 104 | // Return a slice of w.RS, allowing the start 105 | // and stop to be out of bounds. 106 | func (w *SnowballWord) slice(start, stop int) []rune { 107 | startMin := 0 108 | if start < startMin { 109 | start = startMin 110 | } 111 | max := len(w.RS) - 1 112 | if start > max { 113 | start = max 114 | } 115 | if stop > max { 116 | stop = max 117 | } 118 | return w.RS[start:stop] 119 | } 120 | 121 | // Returns true if `x` runes would fit into R1. 122 | func (w *SnowballWord) FitsInR1(x int) bool { 123 | return w.R1start <= len(w.RS)-x 124 | } 125 | 126 | // Returns true if `x` runes would fit into R2. 127 | func (w *SnowballWord) FitsInR2(x int) bool { 128 | return w.R2start <= len(w.RS)-x 129 | } 130 | 131 | // Returns true if `x` runes would fit into RV. 132 | func (w *SnowballWord) FitsInRV(x int) bool { 133 | return w.RVstart <= len(w.RS)-x 134 | } 135 | 136 | // Return the R1 region as a slice of runes 137 | func (w *SnowballWord) R1() []rune { 138 | return w.RS[w.R1start:] 139 | } 140 | 141 | // Return the R1 region as a string 142 | func (w *SnowballWord) R1String() string { 143 | return string(w.R1()) 144 | } 145 | 146 | // Return the R2 region as a slice of runes 147 | func (w *SnowballWord) R2() []rune { 148 | return w.RS[w.R2start:] 149 | } 150 | 151 | // Return the R2 region as a string 152 | func (w *SnowballWord) R2String() string { 153 | return string(w.R2()) 154 | } 155 | 156 | // Return the RV region as a slice of runes 157 | func (w *SnowballWord) RV() []rune { 158 | return w.RS[w.RVstart:] 159 | } 160 | 161 | // Return the RV region as a string 162 | func (w *SnowballWord) RVString() string { 163 | return string(w.RV()) 164 | } 165 | 166 | // Return the SnowballWord as a string 167 | func (w *SnowballWord) String() string { 168 | return string(w.RS) 169 | } 170 | 171 | func (w *SnowballWord) DebugString() string { 172 | return fmt.Sprintf("{\"%s\", %d, %d, %d}", w.String(), w.R1start, w.R2start, w.RVstart) 173 | } 174 | 175 | // Return the first prefix found or the empty string. 
176 | func (w *SnowballWord) FirstPrefix(prefixes ...string) (foundPrefix string) { 177 | found := false 178 | rsLen := len(w.RS) 179 | 180 | for _, prefix := range prefixes { 181 | prefixRunes := []rune(prefix) 182 | if len(prefixRunes) > rsLen { 183 | continue 184 | } 185 | 186 | found = true 187 | for i, r := range prefixRunes { 188 | if i > rsLen-1 || (w.RS)[i] != r { 189 | found = false 190 | break 191 | } 192 | } 193 | if found { 194 | foundPrefix = prefix 195 | break 196 | } 197 | } 198 | return 199 | } 200 | 201 | // Return true if `w.RS[startPos:endPos]` ends with runes from `suffixRunes`. 202 | // That is, the slice of runes between startPos and endPos have a suffix of 203 | // suffixRunes. 204 | func (w *SnowballWord) HasSuffixRunesIn(startPos, endPos int, suffixRunes []rune) bool { 205 | maxLen := endPos - startPos 206 | suffixLen := len(suffixRunes) 207 | if suffixLen > maxLen { 208 | return false 209 | } 210 | 211 | numMatching := 0 212 | for i := 0; i < maxLen && i < suffixLen; i++ { 213 | if w.RS[endPos-i-1] != suffixRunes[suffixLen-i-1] { 214 | break 215 | } else { 216 | numMatching += 1 217 | } 218 | } 219 | if numMatching == suffixLen { 220 | return true 221 | } 222 | return false 223 | } 224 | 225 | // Return true if `w` ends with `suffixRunes` 226 | func (w *SnowballWord) HasSuffixRunes(suffixRunes []rune) bool { 227 | return w.HasSuffixRunesIn(0, len(w.RS), suffixRunes) 228 | } 229 | 230 | // Find the first suffix that ends at `endPos` in the word among 231 | // those provided; then, 232 | // check to see if it begins after startPos. If it does, return 233 | // it, else return the empty string and empty rune slice. This 234 | // may seem a counterintuitive manner to do this. However, it 235 | // matches what is required most of the time by the Snowball 236 | // stemmer steps. 237 | func (w *SnowballWord) FirstSuffixIfIn(startPos, endPos int, suffixes ...string) (suffix string) { 238 | for _, suffix := range suffixes { 239 | suffixRunes := []rune(suffix) 240 | if w.HasSuffixRunesIn(0, endPos, suffixRunes) { 241 | if endPos-len(suffixRunes) >= startPos { 242 | return suffix 243 | } else { 244 | return "" 245 | } 246 | } 247 | } 248 | 249 | return "" 250 | } 251 | 252 | func (w *SnowballWord) FirstSuffixIn(startPos, endPos int, suffixes ...string) (suffix string) { 253 | for _, suffix := range suffixes { 254 | suffixRunes := []rune(suffix) 255 | if w.HasSuffixRunesIn(startPos, endPos, suffixRunes) { 256 | return suffix 257 | } 258 | } 259 | 260 | return "" 261 | } 262 | 263 | // Find the first suffix in the word among those provided; then, 264 | // check to see if it begins after startPos. If it does, 265 | // remove it. 266 | func (w *SnowballWord) RemoveFirstSuffixIfIn(startPos int, suffixes ...string) (suffix string) { 267 | suffix = w.FirstSuffixIfIn(startPos, len(w.RS), suffixes...) 268 | suffixLength := utf8.RuneCountInString(suffix) 269 | if suffix != "" { 270 | w.RemoveLastNRunes(suffixLength) 271 | } 272 | return 273 | } 274 | 275 | // Removes the first suffix found that is in `word.RS[startPos:len(word.RS)]` 276 | func (w *SnowballWord) RemoveFirstSuffixIn(startPos int, suffixes ...string) (suffix string) { 277 | suffix = w.FirstSuffixIn(startPos, len(w.RS), suffixes...) 
278 | suffixLength := utf8.RuneCountInString(suffix) 279 | if suffix != "" { 280 | w.RemoveLastNRunes(suffixLength) 281 | } 282 | return 283 | } 284 | 285 | // Removes the first suffix found 286 | func (w *SnowballWord) RemoveFirstSuffix(suffixes ...string) (suffix string) { 287 | return w.RemoveFirstSuffixIn(0, suffixes...) 288 | } 289 | 290 | // Return the first suffix found or the empty string. 291 | func (w *SnowballWord) FirstSuffix(suffixes ...string) (suffix string) { 292 | return w.FirstSuffixIfIn(0, len(w.RS), suffixes...) 293 | } 294 | -------------------------------------------------------------------------------- /hungarian/stem.go: -------------------------------------------------------------------------------- 1 | package hungarian 2 | 3 | import ( 4 | "log" 5 | "strings" 6 | "unicode" 7 | 8 | "github.com/kljensen/snowball/snowballword" 9 | ) 10 | 11 | func printDebug(debug bool, w *snowballword.SnowballWord) { 12 | if debug { 13 | log.Println(w.DebugString()) 14 | } 15 | } 16 | 17 | func StemSentence(pairs [][2]string, s string) [][2]string { 18 | for _, word := range strings.FieldsFunc(s, func(r rune) bool { 19 | return unicode.IsPunct(r) || unicode.IsSpace(r) 20 | }) { 21 | pairs = append(pairs, [2]string{word, Stem(word, false)}) 22 | } 23 | return pairs 24 | } 25 | 26 | // Stem an Hungarian word. This is the only exported 27 | // function in this package. 28 | // 29 | // This stemming algorithm removes the inflectional suffixes of nouns. Nouns are inflected for case, person/possession and number. 30 | // 31 | // Letters in Hungarian include the following accented forms, 32 | // 33 | // á é í ó ö ő ú ü ű 34 | // 35 | // The following letters are vowels: 36 | // 37 | // a á e é i í o ó ö ő u ú ü ű 38 | // 39 | // The following letters are digraphs: 40 | // 41 | // cs dz dzs gy ly ny ty zs 42 | // 43 | // A double consonant is defined as: 44 | // 45 | // bb cc ccs dd ff gg ggy jj kk ll lly mm nn nny pp rr ss ssz tt tty vv zz zzs 46 | func Stem(word string, stemStopwWords bool) string { 47 | 48 | word = strings.ToLower(strings.TrimSpace(word)) 49 | 50 | // Return small words and stop words 51 | if len(word) <= 2 || (!stemStopwWords && IsStopWord(word)) { 52 | return word 53 | } 54 | 55 | w := snowballword.New(word) 56 | 57 | // Stem the word. Note, each of these 58 | // steps will alter `w` in place. 59 | // 60 | 61 | preprocess(w) 62 | step1(w) 63 | step2(w) 64 | step3(w) 65 | step4(w) 66 | step5(w) 67 | step6(w) 68 | step7(w) 69 | step8(w) 70 | step9(w) 71 | 72 | return w.String() 73 | 74 | } 75 | 76 | func preprocess(w *snowballword.SnowballWord) { 77 | w.R1start = findRegions(w) 78 | } 79 | 80 | // step1 Remove instrumental case 81 | // 82 | // Search for one of the following suffixes and perform the action indicated. 83 | // 84 | // al el 85 | // 86 | // delete if in R1 and preceded by a double consonant, 87 | // and remove one of the double consonants. 88 | // (In the case of consonant plus digraph, such as ccs, remove a c). 89 | func step1(w *snowballword.SnowballWord) { 90 | n := len(w.RS) 91 | if n < 2 || 92 | !(w.RS[n-1] == 'l' && 93 | (w.RS[n-2] == 'a' || w.RS[n-2] == 'e')) { 94 | return 95 | } 96 | // in R1 97 | if w.R1start > n-2 || n < 4 { 98 | return 99 | } 100 | // (In the case of consonant plus digraph, such as ccs, remove a c). 
101 | if n >= 5 && isDoubleConsonant(w.RS[n-5:n-2]) > 2 { 102 | w.RS[n-5], w.RS[n-4] = w.RS[n-4], w.RS[n-3] 103 | w.RemoveLastNRunes(3) 104 | } else if n >= 4 && isDoubleConsonant(w.RS[n-4:n-2]) > 1 { 105 | // preceded by a double consonant 106 | w.RemoveLastNRunes(3) 107 | } 108 | } 109 | 110 | // Step 2: Remove frequent cases 111 | // 112 | // Search for the longest among the following suffixes and perform the action indicated. 113 | // 114 | // ban ben ba be ra re nak nek val vel tól től ról ről ból ből hoz hez höz nál nél ig at et ot öt ért képp képpen kor ul ül vá vé onként enként anként ként en on an ön n t 115 | // 116 | // delete if in R1 117 | // 118 | // if the remaining word ends á replace by a 119 | // if the remaining word ends é replace by e 120 | func step2(w *snowballword.SnowballWord) { 121 | if suffix := firstSuffixInR1(w, []string{ 122 | "onként", "enként", "anként", 123 | "képpen", 124 | "ként", 125 | "képp", 126 | "kor", 127 | "ban", "ben", "nak", "nek", "val", "vel", "tól", "től", "ról", "ről", "ból", "ből", "hoz", "hez", "höz", "nál", "nél", 128 | "ért", 129 | "ba", "be", "ra", "re", "ig", "at", "et", "ot", "öt", 130 | "ul", "ül", "vá", "vé", 131 | "en", "on", "an", "ön", 132 | "n", "t", 133 | }); suffix != "" { 134 | rs := runesOf(suffix) 135 | // delete if in R1 136 | w.RemoveLastNRunes(len(rs)) 137 | if len(w.RS) == 0 { 138 | return 139 | } 140 | switch w.RS[len(w.RS)-1] { 141 | case 'á': 142 | // if the remaining word ends á replace by a 143 | w.RS[len(w.RS)-1] = 'a' 144 | case 'é': 145 | // if the remaining word ends é replace by e 146 | w.RS[len(w.RS)-1] = 'e' 147 | } 148 | } 149 | } 150 | 151 | // step3: Remove special cases: 152 | // 153 | // Search for the longest among the following suffixes and perform the action indicated. 154 | // 155 | // án ánként 156 | // 157 | // replace by a if in R1 158 | // 159 | // én 160 | // 161 | // replace by e if in R1 162 | func step3(w *snowballword.SnowballWord) { 163 | if suffix := firstSuffixInR1(w, []string{ 164 | "ánként", "án", 165 | "én", 166 | }); suffix != "" { 167 | rs := runesOf(suffix) 168 | repl := 'a' 169 | if rs[0] == 'é' { 170 | repl = 'e' 171 | } 172 | w.RS[len(w.RS)-len(rs)] = repl 173 | w.RemoveLastNRunes(len(rs) - 1) 174 | } 175 | } 176 | 177 | // step4: Remove other cases: 178 | // 179 | // Search for the longest among the following suffixes and perform the action indicated 180 | // 181 | // astul estül stul stül 182 | // 183 | // delete if in R1 184 | // 185 | // ástul 186 | // 187 | // replace with a if in R1 188 | // 189 | // éstül 190 | // 191 | // replace with e if in R1 192 | func step4(w *snowballword.SnowballWord) { 193 | if suffix := firstSuffixInR1(w, []string{"ástul"}); suffix != "" { 194 | w.RemoveLastNRunes(4) 195 | w.RS[len(w.RS)-1] = 'a' 196 | return 197 | } 198 | if suffix := firstSuffixInR1(w, []string{"éstül"}); suffix != "" { 199 | w.RemoveLastNRunes(4) 200 | w.RS[len(w.RS)-1] = 'e' 201 | return 202 | } 203 | // astul estül stul stül 204 | if suffix := firstSuffixInR1(w, []string{"astul", "estül", "stul", "stül"}); suffix != "" { 205 | w.RemoveLastNRunes(len(runesOf(suffix))) 206 | return 207 | } 208 | } 209 | 210 | // step5: Remove factive case 211 | // 212 | // Search for one of the following suffixes and perform the action indicated. 213 | // 214 | // á é 215 | // 216 | // delete if in R1 and preceded by a double consonant, 217 | // and remove one of the double consonants (as in step 1). 
218 | func step5(w *snowballword.SnowballWord) { 219 | n := len(w.RS) 220 | if n < 3 || w.R1start >= n || !(w.RS[n-1] == 'á' || w.RS[n-1] == 'é') { 221 | return 222 | } 223 | // (In the case of consonant plus digraph, such as ccs, remove a c). 224 | if n >= 4 && isDoubleConsonant(w.RS[n-4:n-1]) > 2 { 225 | w.RS[n-4], w.RS[n-3] = w.RS[n-3], w.RS[n-1] 226 | w.RemoveLastNRunes(2) 227 | } else if isDoubleConsonant(w.RS[n-3:n-1]) > 1 { 228 | // preceded by a double consonant 229 | w.RemoveLastNRunes(2) 230 | } 231 | } 232 | 233 | // step6: Remove owned 234 | // Search for the longest among the following suffixes and perform the action indicated. 235 | // 236 | // oké öké aké eké ké éi é 237 | // 238 | // delete if in R1 239 | // 240 | // áké áéi 241 | // 242 | // replace with a if in R1 243 | // 244 | // éké ééi éé 245 | // 246 | // replace with e if in R1 247 | func step6(w *snowballword.SnowballWord) { 248 | if suffix := firstSuffixInR1(w, []string{ 249 | "áké", "áéi", 250 | "éké", "ééi", "éé", 251 | "oké", "öké", "aké", "eké", "ké", "éi", "é", 252 | }); suffix != "" { 253 | switch suffix { 254 | 255 | case "áké", "áéi": 256 | w.RemoveLastNRunes(2) 257 | w.RS[len(w.RS)-1] = 'a' 258 | 259 | case "éké", "ééi", "éé": 260 | w.RemoveLastNRunes(len(runesOf(suffix)) - 1) 261 | w.RS[len(w.RS)-1] = 'e' 262 | 263 | default: 264 | w.RemoveLastNRunes(len(runesOf(suffix))) 265 | } 266 | } 267 | } 268 | 269 | // step7: Remove singular owner suffixes 270 | // 271 | // Search for the longest among the following suffixes and perform the action indicated. 272 | // 273 | // ünk unk nk juk jük uk ük em om am m od ed ad öd d ja je a e o 274 | // 275 | // delete if in R1 276 | // 277 | // ánk ájuk ám ád á 278 | // 279 | // replace with a if in R1 280 | // 281 | // énk éjük ém éd é 282 | // 283 | // replace with e if in R1 284 | func step7(w *snowballword.SnowballWord) { 285 | if suffix := firstSuffixInR1(w, []string{ 286 | "ájuk", "éjük", 287 | "énk", 288 | "ünk", "unk", 289 | "juk", "jük", 290 | "ánk", 291 | "nk", 292 | "uk", "ük", "em", "om", "am", 293 | "od", "ed", "ad", "öd", "ja", "je", 294 | "ám", "ád", "ém", "éd", 295 | "m", "d", 296 | "a", "e", "o", 297 | "á", "é", 298 | }); suffix != "" { 299 | n := len(runesOf(suffix)) 300 | switch suffix { 301 | case "ánk", "ájuk", "ám", "ád", "á": 302 | w.RemoveLastNRunes(n - 1) 303 | w.RS[len(w.RS)-1] = 'a' 304 | case "énk", "éjük", "ém", "éd", "é": 305 | w.RemoveLastNRunes(n - 1) 306 | w.RS[len(w.RS)-1] = 'e' 307 | default: 308 | w.RemoveLastNRunes(n) 309 | } 310 | } 311 | } 312 | 313 | // step8: Remove plural owner suffixes 314 | // Search for the longest among the following suffixes and perform the action indicated. 
315 | // 316 | // jaim jeim aim eim im jaid jeid aid eid id jai jei ai ei i jaink jeink eink aink ink jaitok jeitek aitok eitek itek jeik jaik aik eik ik 317 | // 318 | // delete if in R1 319 | // 320 | // áim áid ái áink áitok áik 321 | // 322 | // replace with a if in R1 323 | // 324 | // éim éid éi éink éitek éik 325 | // 326 | // replace with e if in R1 327 | func step8(w *snowballword.SnowballWord) { 328 | if suffix := firstSuffixInR1(w, []string{ 329 | "jaitok", "jeitek", 330 | "jaink", "jeink", "aitok", "eitek", "áitok", "éitek", 331 | "áink", "éink", "itek", "jeik", "jaik", 332 | "eink", "aink", "jaim", "jeim", "jaid", "jeid", 333 | "áim", "áid", "áik", "éim", "éid", "éik", 334 | "ink", "aik", "eik", "jai", "jei", 335 | "aim", "eim", "aid", "eid", 336 | "ái", "éi", "ik", "id", "ai", "ei", 337 | "im", 338 | "i", 339 | }); suffix != "" { 340 | n := len(runesOf(suffix)) 341 | switch suffix { 342 | case "áim", "áid", "ái", "áink", "áitok", "áik": 343 | w.RemoveLastNRunes(n - 1) 344 | w.RS[len(w.RS)-1] = 'a' 345 | case "éim", "éid", "éi", "éink", "éitek", "éik": 346 | w.RemoveLastNRunes(n - 1) 347 | w.RS[len(w.RS)-1] = 'e' 348 | default: 349 | w.RemoveLastNRunes(n) 350 | } 351 | } 352 | } 353 | 354 | // step9: Remove plural suffixes 355 | // 356 | // Search for the longest among the following suffixes and perform the action indicated. 357 | // 358 | // ák 359 | // 360 | // replace with a if in R1 361 | // replace with e if in R1 362 | // 363 | // ök ok ek ak k 364 | // 365 | // delete if in R1 366 | func step9(w *snowballword.SnowballWord) { 367 | if suffix := firstSuffixInR1(w, []string{ 368 | "ák", "ék", 369 | "ök", "ok", "ek", "ak", "k", 370 | }); suffix != "" { 371 | switch suffix { 372 | case "ák": 373 | w.RemoveLastNRunes(1) 374 | w.RS[len(w.RS)-1] = 'a' 375 | case "ék": 376 | w.RemoveLastNRunes(1) 377 | w.RS[len(w.RS)-1] = 'e' 378 | default: 379 | w.RemoveLastNRunes(len(runesOf(suffix))) 380 | } 381 | } 382 | } 383 | 384 | func firstSuffixInR1(w *snowballword.SnowballWord, suffixes []string) string { 385 | for _, suffix := range suffixes { 386 | rs := runesOf(suffix) 387 | if len(w.RS)-w.R1start >= len(rs) && w.HasSuffixRunes(rs) { 388 | return suffix 389 | } 390 | } 391 | return "" 392 | } 393 | -------------------------------------------------------------------------------- /english/english_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Herein lie all the tests of the Snowball English stemmer. 3 | 4 | Many of the tests are drawn from cases where this implementation 5 | did not match the results of the Python NLTK implementation. 6 | */ 7 | package english 8 | 9 | import ( 10 | "testing" 11 | 12 | "github.com/kljensen/snowball/romance" 13 | "github.com/kljensen/snowball/snowballword" 14 | ) 15 | 16 | // Test stopWords for things we know should be true 17 | // or false. 
18 | func Test_stopWords(t *testing.T) { 19 | 20 | // Test true 21 | knownTrueStopwords := [...]string{ 22 | "a", 23 | "for", 24 | "be", 25 | "was", 26 | } 27 | for _, word := range knownTrueStopwords { 28 | if IsStopWord(word) == false { 29 | t.Errorf("Expected %v, to be in stopWords", word) 30 | } 31 | } 32 | 33 | // Test false 34 | knownFalseStopwords := [...]string{ 35 | "truck", 36 | "deoxyribonucleic", 37 | "farse", 38 | "bullschnizzle", 39 | } 40 | for _, word := range knownFalseStopwords { 41 | if IsStopWord(word) == true { 42 | t.Errorf("Expected %v, to be in stopWords", word) 43 | } 44 | } 45 | } 46 | 47 | // Test specialWords for things we know should be present 48 | // and not present. 49 | func Test_specialWords(t *testing.T) { 50 | 51 | // Test true 52 | knownTrueSpecialwords := [...]string{ 53 | "exceeding", 54 | "early", 55 | "outing", 56 | } 57 | for _, word := range knownTrueSpecialwords { 58 | if stemmed := stemSpecialWord(word); stemmed == "" { 59 | t.Errorf("Expected %v, to be in specialWords", word) 60 | } 61 | } 62 | 63 | // Test false 64 | knownFalseSpecialwords := [...]string{ 65 | "truck", 66 | "deoxyribonucleic", 67 | "farse", 68 | "bullschnizzle", 69 | } 70 | for _, word := range knownFalseSpecialwords { 71 | if stemmed := stemSpecialWord(word); stemmed != "" { 72 | t.Errorf("Expected %v, to NOT be in specialWords", word) 73 | } 74 | } 75 | } 76 | 77 | func Test_normalizeApostrophes(t *testing.T) { 78 | variants := [...]string{ 79 | "\u2019xxx\u2019", 80 | "\u2018xxx\u2018", 81 | "\u201Bxxx\u201B", 82 | "’xxx’", 83 | "‘xxx‘", 84 | "‛xxx‛", 85 | } 86 | for _, v := range variants { 87 | w := snowballword.New(v) 88 | normalizeApostrophes(w) 89 | if w.String() != "'xxx'" { 90 | t.Errorf("Expected \"'xxx'\", not \"%v\"", w.String()) 91 | } 92 | } 93 | } 94 | 95 | func Test_capitalizeYs(t *testing.T) { 96 | var wordTests = []struct { 97 | in string 98 | out string 99 | }{ 100 | {"ysdcsdeysdfsysdfsdiyoyyyxyxayxey", "YsdcsdeYsdfsysdfsdiYoYyYxyxaYxeY"}, 101 | } 102 | for _, wt := range wordTests { 103 | w := snowballword.New(wt.in) 104 | capitalizeYs(w) 105 | if w.String() != wt.out { 106 | t.Errorf("Expected \"%v\", not \"%v\"", wt.out, w.String()) 107 | } 108 | } 109 | } 110 | func Test_preprocess(t *testing.T) { 111 | var wordTests = []struct { 112 | in string 113 | out string 114 | }{ 115 | {"arguing", "arguing"}, 116 | {"'catty", "catty"}, 117 | {"kyle’s", "kyle's"}, 118 | {"toy", "toY"}, 119 | } 120 | for _, wt := range wordTests { 121 | w := snowballword.New(wt.in) 122 | preprocess(w) 123 | if w.String() != wt.out { 124 | t.Errorf("Expected \"%v\", not \"%v\"", wt.out, w.String()) 125 | } 126 | } 127 | } 128 | 129 | func Test_vnvSuffix(t *testing.T) { 130 | var wordTests = []struct { 131 | word string 132 | start int 133 | pos int 134 | }{ 135 | {"crepuscular", 0, 4}, 136 | {"uscular", 0, 2}, 137 | } 138 | for _, tc := range wordTests { 139 | w := snowballword.New(tc.word) 140 | pos := romance.VnvSuffix(w, isLowerVowel, tc.start) 141 | if pos != tc.pos { 142 | t.Errorf("Expected %v, but got %v", tc.pos, pos) 143 | } 144 | } 145 | } 146 | 147 | func Test_r1r2(t *testing.T) { 148 | var wordTests = []struct { 149 | word string 150 | r1 string 151 | r2 string 152 | }{ 153 | {"crepuscular", "uscular", "cular"}, 154 | {"beautiful", "iful", "ul"}, 155 | {"beauty", "y", ""}, 156 | {"eucharist", "harist", "ist"}, 157 | {"animadversion", "imadversion", "adversion"}, 158 | {"mistresses", "tresses", "ses"}, 159 | {"sprinkled", "kled", ""}, 160 | // Special cases below 161 | 
{"communism", "ism", "m"}, 162 | {"arsenal", "al", ""}, 163 | {"generalities", "alities", "ities"}, 164 | {"embed", "bed", ""}, 165 | } 166 | for _, testCase := range wordTests { 167 | w := snowballword.New(testCase.word) 168 | r1start, r2start := r1r2(w) 169 | w.R1start = r1start 170 | w.R2start = r2start 171 | if w.R1String() != testCase.r1 || w.R2String() != testCase.r2 { 172 | t.Errorf("Expected \"{%v, %v}\", but got \"{%v, %v}\"", testCase.r1, testCase.r2, w.R1String(), w.R2String()) 173 | } 174 | } 175 | } 176 | 177 | func Test_isShortWord(t *testing.T) { 178 | var testCases = []struct { 179 | word string 180 | isShort bool 181 | }{ 182 | {"bed", true}, 183 | {"shed", true}, 184 | {"shred", true}, 185 | {"bead", false}, 186 | {"embed", false}, 187 | {"beds", false}, 188 | } 189 | for _, testCase := range testCases { 190 | w := snowballword.New(testCase.word) 191 | r1start, r2start := r1r2(w) 192 | w.R1start = r1start 193 | w.R2start = r2start 194 | isShort := isShortWord(w) 195 | if isShort != testCase.isShort { 196 | t.Errorf("Expected %v, but got %v for \"{%v, %v}\"", testCase.isShort, isShort, testCase.word, w.R1String()) 197 | } 198 | } 199 | } 200 | 201 | func Test_endsShortSyllable(t *testing.T) { 202 | var testCases = []struct { 203 | word string 204 | pos int 205 | result bool 206 | }{ 207 | {"absolute", 7, true}, 208 | {"ape", 2, true}, 209 | {"rap", 3, true}, 210 | {"trap", 4, true}, 211 | {"entrap", 6, true}, 212 | {"uproot", 6, false}, 213 | {"bestow", 6, false}, 214 | {"disturb", 7, false}, 215 | } 216 | for _, testCase := range testCases { 217 | w := snowballword.New(testCase.word) 218 | result := endsShortSyllable(w, testCase.pos) 219 | if result != testCase.result { 220 | t.Errorf("Expected endsShortSyllable(%v, %v) to return %v, not %v", testCase.word, testCase.pos, testCase.result, result) 221 | } 222 | } 223 | 224 | } 225 | 226 | type stepFunc func(*snowballword.SnowballWord) bool 227 | type stepTest struct { 228 | wordIn string 229 | r1start int 230 | r2start int 231 | wordOut string 232 | r1out string 233 | r2out string 234 | } 235 | 236 | func runStepTest(t *testing.T, f stepFunc, tcs []stepTest) { 237 | for _, testCase := range tcs { 238 | w := snowballword.New(testCase.wordIn) 239 | w.R1start = testCase.r1start 240 | w.R2start = testCase.r2start 241 | _ = f(w) 242 | if w.String() != testCase.wordOut || w.R1String() != testCase.r1out || w.R2String() != testCase.r2out { 243 | t.Errorf("Expected \"{%v, %v, %v}\", but got \"{%v, %v, %v}\"", testCase.wordOut, testCase.r1out, testCase.r2out, w.String(), w.R1String(), w.R2String()) 244 | } 245 | } 246 | } 247 | 248 | func Test_step0(t *testing.T) { 249 | var testCases = []stepTest{ 250 | {"general's", 5, 9, "general", "al", ""}, 251 | {"general's'", 5, 10, "general", "al", ""}, 252 | {"spices'", 4, 7, "spices", "es", ""}, 253 | } 254 | runStepTest(t, step0, testCases) 255 | } 256 | 257 | func Test_step1a(t *testing.T) { 258 | var testCases = []stepTest{ 259 | {"ties", 0, 0, "tie", "tie", "tie"}, 260 | {"cries", 0, 0, "cri", "cri", "cri"}, 261 | {"mistresses", 3, 7, "mistress", "tress", "s"}, 262 | {"ied", 3, 3, "ie", "", ""}, 263 | } 264 | runStepTest(t, step1a, testCases) 265 | } 266 | 267 | func Test_step1b(t *testing.T) { 268 | 269 | // I could find immediately conjure up true words to 270 | // which these cases apply; so, I made some up. 
271 | 272 | var testCases = []stepTest{ 273 | {"exxeedly", 1, 8, "exxee", "xxee", ""}, 274 | {"exxeed", 1, 7, "exxee", "xxee", ""}, 275 | {"luxuriated", 3, 5, "luxuriate", "uriate", "iate"}, 276 | {"luxuribled", 3, 5, "luxurible", "urible", "ible"}, 277 | {"luxuriized", 3, 5, "luxuriize", "uriize", "iize"}, 278 | {"luxuriedly", 3, 5, "luxuri", "uri", "i"}, 279 | {"vetted", 3, 6, "vet", "", ""}, 280 | {"hopping", 3, 7, "hop", "", ""}, 281 | {"breed", 5, 5, "breed", "", ""}, 282 | {"skating", 4, 6, "skate", "e", ""}, 283 | } 284 | runStepTest(t, step1b, testCases) 285 | } 286 | 287 | func Test_step1c(t *testing.T) { 288 | var testCases = []stepTest{ 289 | {"cry", 3, 3, "cri", "", ""}, 290 | {"say", 3, 3, "say", "", ""}, 291 | {"by", 2, 2, "by", "", ""}, 292 | {"xexby", 2, 5, "xexbi", "xbi", ""}, 293 | } 294 | runStepTest(t, step1c, testCases) 295 | } 296 | 297 | func Test_step2(t *testing.T) { 298 | // Here I've faked R1 & R2 for simplicity 299 | var testCases = []stepTest{ 300 | {"fluentli", 5, 8, "fluentli", "tli", ""}, 301 | // Test "tional" 302 | {"xxxtional", 3, 5, "xxxtion", "tion", "on"}, 303 | // Test when "tional" doesn't fit in R1 304 | {"xxxtional", 4, 5, "xxxtional", "ional", "onal"}, 305 | // Test "li" 306 | {"xxxcli", 3, 6, "xxxc", "c", ""}, 307 | // Test "li", non-valid li letter preceeding 308 | {"xxxxli", 3, 6, "xxxxli", "xli", ""}, 309 | // Test "ogi" 310 | {"xxlogi", 2, 6, "xxlog", "log", ""}, 311 | // Test "ogi", not preceeded by "l" 312 | {"xxxogi", 2, 6, "xxxogi", "xogi", ""}, 313 | // Test the others, which are simple replacements 314 | {"xxxxenci", 3, 7, "xxxxence", "xence", "e"}, 315 | {"xxxxanci", 3, 7, "xxxxance", "xance", "e"}, 316 | {"xxxxabli", 3, 7, "xxxxable", "xable", "e"}, 317 | {"xxxxentli", 3, 8, "xxxxent", "xent", ""}, 318 | {"xxxxizer", 3, 7, "xxxxize", "xize", ""}, 319 | {"xxxxization", 3, 10, "xxxxize", "xize", ""}, 320 | {"xxxxational", 3, 10, "xxxxate", "xate", ""}, 321 | {"xxxxation", 3, 8, "xxxxate", "xate", ""}, 322 | {"xxxxator", 3, 7, "xxxxate", "xate", ""}, 323 | {"xxxxalism", 3, 8, "xxxxal", "xal", ""}, 324 | {"xxxxaliti", 3, 8, "xxxxal", "xal", ""}, 325 | {"xxxxalli", 3, 7, "xxxxal", "xal", ""}, 326 | {"xxxxfulness", 3, 10, "xxxxful", "xful", ""}, 327 | {"xxxxousli", 3, 8, "xxxxous", "xous", ""}, 328 | {"xxxxousness", 3, 10, "xxxxous", "xous", ""}, 329 | {"xxxxiveness", 3, 10, "xxxxive", "xive", ""}, 330 | {"xxxxiviti", 3, 8, "xxxxive", "xive", ""}, 331 | {"xxxxbiliti", 3, 9, "xxxxble", "xble", ""}, 332 | {"xxxxbli", 3, 6, "xxxxble", "xble", "e"}, 333 | {"xxxxfulli", 3, 8, "xxxxful", "xful", ""}, 334 | {"xxxxlessli", 3, 8, "xxxxless", "xless", ""}, 335 | // Some of the same words, this time not in our fake R1 336 | {"xxxxenci", 8, 8, "xxxxenci", "", ""}, 337 | {"xxxxanci", 8, 8, "xxxxanci", "", ""}, 338 | {"xxxxabli", 8, 8, "xxxxabli", "", ""}, 339 | {"xxxxentli", 9, 9, "xxxxentli", "", ""}, 340 | {"xxxxizer", 8, 8, "xxxxizer", "", ""}, 341 | {"xxxxization", 11, 11, "xxxxization", "", ""}, 342 | {"xxxxational", 11, 11, "xxxxational", "", ""}, 343 | {"xxxxation", 9, 9, "xxxxation", "", ""}, 344 | {"xxxxator", 8, 8, "xxxxator", "", ""}, 345 | } 346 | runStepTest(t, step2, testCases) 347 | } 348 | 349 | func Test_step4(t *testing.T) { 350 | var testCases = []stepTest{ 351 | {"accumulate", 2, 5, "accumul", "cumul", "ul"}, 352 | {"agreement", 2, 6, "agreement", "reement", "ent"}, 353 | } 354 | runStepTest(t, step4, testCases) 355 | } 356 | func Test_step5(t *testing.T) { 357 | var testCases = []stepTest{ 358 | {"skate", 4, 5, "skate", "e", ""}, 
359 | } 360 | runStepTest(t, step5, testCases) 361 | } 362 | 363 | func Test_Stem(t *testing.T) { 364 | var testCases = []struct { 365 | in string 366 | stemStopWords bool 367 | out string 368 | }{ 369 | {"aberration", true, "aberr"}, 370 | {"abruptness", true, "abrupt"}, 371 | {"absolute", true, "absolut"}, 372 | {"abated", true, "abat"}, 373 | {"acclivity", true, "accliv"}, 374 | {"accumulations", true, "accumul"}, 375 | {"agreement", true, "agreement"}, 376 | {"breed", true, "breed"}, 377 | {"ape", true, "ape"}, 378 | {"skating", true, "skate"}, 379 | {"fluently", true, "fluentli"}, 380 | {"ied", true, "ie"}, 381 | {"ies", true, "ie"}, 382 | // Stop words 383 | {"because", true, "becaus"}, 384 | {"because", false, "because"}, 385 | {"above", true, "abov"}, 386 | {"above", false, "above"}, 387 | } 388 | for _, tc := range testCases { 389 | stemmed := Stem(tc.in, tc.stemStopWords) 390 | if stemmed != tc.out { 391 | t.Errorf("Expected %v to stem to %v, but got %v", tc.in, tc.out, stemmed) 392 | } 393 | } 394 | 395 | } 396 | -------------------------------------------------------------------------------- /spanish/spanish_test.go: -------------------------------------------------------------------------------- 1 | package spanish 2 | 3 | import ( 4 | "github.com/kljensen/snowball/romance" 5 | "testing" 6 | ) 7 | 8 | // Test stopWords for things we know should be true 9 | // or false. 10 | // 11 | func Test_stopWords(t *testing.T) { 12 | testCases := []romance.WordBoolTestCase{ 13 | {"el", true}, 14 | {"queso", false}, 15 | } 16 | romance.RunWordBoolTest(t, IsStopWord, testCases) 17 | } 18 | 19 | // Test isLowerVowel for things we know should be true 20 | // or false. 21 | // 22 | func Test_isLowerVowel(t *testing.T) { 23 | testCases := []romance.WordBoolTestCase{ 24 | // These are all vowels. 25 | {"aeiouáéíóúü", true}, 26 | // None of these are vowels. 27 | {"cbfqhkl", false}, 28 | } 29 | romance.RunRunewiseBoolTest(t, isLowerVowel, testCases) 30 | } 31 | 32 | // Test isLowerVowel for things we know should be true 33 | // or false. 
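The findRegions test below uses tuples of the form {word, R1start, R2start, RVstart}, the rune offsets at which the R1, R2, and RV regions begin. A rough illustration of the first row (the wrapper name is made up; findRegions is unexported, so this would sit inside package spanish with "fmt" imported):

func sketchFindRegions() {
	w := snowballword.New("macho")
	r1, r2, rv := findRegions(w)
	fmt.Println(r1, r2, rv) // per the first row below: 3 5 3, i.e. R1 = "ho", R2 = "", RV = "ho"
}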
34 | // 35 | func Test_findRegions(t *testing.T) { 36 | testCases := []romance.FindRegionsTestCase{ 37 | {"macho", 3, 5, 3}, 38 | {"olivia", 2, 4, 3}, 39 | {"trabajo", 4, 6, 3}, 40 | {"áureo", 3, 5, 3}, 41 | {"piñaolayas", 3, 6, 3}, 42 | {"terminales", 3, 6, 3}, 43 | {"durmió", 3, 6, 3}, 44 | {"cobija", 3, 5, 3}, 45 | {"anderson", 2, 5, 4}, 46 | {"cervezas", 3, 6, 3}, 47 | {"climáticas", 4, 6, 3}, 48 | {"expide", 2, 5, 4}, 49 | {"cenizas", 3, 5, 3}, 50 | {"maximiliano", 3, 5, 3}, 51 | {"específicos", 2, 5, 4}, 52 | {"menor", 3, 5, 3}, 53 | {"generis", 3, 5, 3}, 54 | {"casero", 3, 5, 3}, 55 | {"pululan", 3, 5, 3}, 56 | {"suscitado", 3, 6, 3}, 57 | {"pesadez", 3, 5, 3}, 58 | {"interno", 2, 5, 4}, 59 | {"agredido", 2, 5, 4}, 60 | {"desprendía", 3, 7, 3}, 61 | {"vistazo", 3, 6, 3}, 62 | {"frecuentan", 4, 7, 3}, 63 | {"noviembre", 3, 6, 3}, 64 | {"sintética", 3, 6, 3}, 65 | {"newagismo", 3, 5, 3}, 66 | {"eliseo", 2, 4, 3}, 67 | {"desbordado", 3, 6, 3}, 68 | {"dispongo", 3, 6, 3}, 69 | {"dilatar", 3, 5, 3}, 70 | {"xochitl", 3, 6, 3}, 71 | {"proporcionaba", 4, 6, 3}, 72 | {"pue", 3, 3, 3}, 73 | {"alpargatado", 2, 5, 4}, 74 | {"exigida", 2, 4, 3}, 75 | {"céntricas", 3, 7, 3}, 76 | {"prende", 4, 6, 3}, 77 | {"estructural", 2, 6, 5}, 78 | {"ilegalmente", 2, 4, 3}, 79 | {"freeport", 5, 7, 3}, 80 | {"sonrisas", 3, 6, 3}, 81 | {"cobró", 3, 5, 3}, 82 | {"dioses", 4, 6, 3}, 83 | {"consistieron", 3, 6, 3}, 84 | {"policiales", 3, 5, 3}, 85 | {"conciliador", 3, 6, 3}, 86 | {"fierro", 4, 6, 3}, 87 | {"aparadores", 2, 4, 3}, 88 | {"coreados", 3, 6, 3}, 89 | {"posición", 3, 5, 3}, 90 | {"adversidades", 2, 5, 4}, 91 | {"comprometido", 3, 7, 3}, 92 | {"aventuras", 2, 4, 3}, 93 | {"santiso", 3, 6, 3}, 94 | {"talentos", 3, 5, 3}, 95 | {"apreciar", 2, 5, 4}, 96 | {"sprints", 5, 7, 4}, 97 | {"zarco", 3, 5, 3}, 98 | {"concretos", 3, 7, 3}, 99 | {"gavica", 3, 5, 3}, 100 | {"suavemente", 4, 6, 3}, 101 | {"españolitos", 2, 5, 4}, 102 | {"grabará", 4, 6, 3}, 103 | {"entregados", 2, 6, 5}, 104 | {"gustaría", 3, 6, 3}, 105 | {"nickin", 3, 6, 3}, 106 | {"sogem", 3, 5, 3}, 107 | {"prohíbe", 4, 6, 3}, 108 | {"espinoso", 2, 5, 4}, 109 | {"atraviesan", 2, 5, 4}, 110 | {"bancomext", 3, 6, 3}, 111 | {"paraguay", 3, 5, 3}, 112 | {"amamos", 2, 4, 3}, 113 | {"consigna", 3, 6, 3}, 114 | {"funcionarios", 3, 7, 3}, 115 | {"marquis", 3, 7, 3}, 116 | {"desactivaron", 3, 5, 3}, 117 | {"concentrados", 3, 6, 3}, 118 | {"democratizante", 3, 5, 3}, 119 | {"afianzadora", 2, 5, 3}, 120 | {"homicidio", 3, 5, 3}, 121 | {"promovidos", 4, 6, 3}, 122 | {"maquiladora", 3, 6, 3}, 123 | {"bike", 3, 4, 3}, 124 | {"recuerdos", 3, 6, 3}, 125 | {"géneros", 3, 5, 3}, 126 | {"rechaza", 3, 6, 3}, 127 | {"sentarían", 3, 6, 3}, 128 | {"quererlo", 4, 6, 3}, 129 | {"sofisticado", 3, 5, 3}, 130 | {"miriam", 3, 6, 3}, 131 | {"echara", 2, 5, 4}, 132 | {"mico", 3, 4, 3}, 133 | {"enferma", 2, 5, 4}, 134 | {"reforzamiento", 3, 5, 3}, 135 | {"circunscrito", 3, 6, 3}, 136 | {"indiana", 2, 6, 4}, 137 | {"metrópoli", 3, 6, 3}, 138 | {"libreta", 3, 6, 3}, 139 | {"gonzalez", 3, 6, 3}, 140 | {"antidemocrática", 2, 5, 4}, 141 | } 142 | romance.RunFindRegionsTest(t, findRegions, testCases) 143 | } 144 | 145 | // Test step0, the removal of pronoun suffixes. 
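The step tests below use romance.StepTestCase rows, which appear to read {wordIn, R1start, R2start, RVstart, changed, wordOut, R1startOut, R2startOut, RVstartOut}. A sketch of the first confirmed step0 row, which strips the attached pronoun "-lo" from the "-ar" infinitive (illustrative wrapper name; step0 is unexported, so this belongs inside package spanish with "fmt" imported):

func sketchStep0() {
	w := snowballword.New("liberarlo")
	w.R1start, w.R2start, w.RVstart = 3, 5, 3
	changed := step0(w)
	fmt.Println(changed, w.String()) // per the table below: true "liberar"
}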
146 | // 147 | func Test_step0(t *testing.T) { 148 | testCases := []romance.StepTestCase{ 149 | {"liberarlo", 3, 5, 3, true, "liberar", 3, 5, 3}, 150 | {"ejecutarse", 2, 4, 3, true, "ejecutar", 2, 4, 3}, 151 | {"convirtiéndolas", 3, 6, 3, true, "convirtiendo", 3, 6, 3}, 152 | {"perfeccionarlo", 3, 6, 3, true, "perfeccionar", 3, 6, 3}, 153 | {"formarlo", 3, 6, 3, true, "formar", 3, 6, 3}, 154 | {"negociarlo", 3, 5, 3, true, "negociar", 3, 5, 3}, 155 | {"dirigirla", 3, 5, 3, true, "dirigir", 3, 5, 3}, 156 | {"malograrlas", 3, 5, 3, true, "malograr", 3, 5, 3}, 157 | {"atacarlos", 2, 4, 3, true, "atacar", 2, 4, 3}, 158 | {"originarla", 2, 4, 3, true, "originar", 2, 4, 3}, 159 | {"ponerlos", 3, 5, 3, true, "poner", 3, 5, 3}, 160 | {"ubicándolo", 2, 4, 3, true, "ubicando", 2, 4, 3}, 161 | {"dejarme", 3, 5, 3, true, "dejar", 3, 5, 3}, 162 | {"regalarnos", 3, 5, 3, true, "regalar", 3, 5, 3}, 163 | {"resolverlas", 3, 5, 3, true, "resolver", 3, 5, 3}, 164 | {"esperarse", 2, 5, 4, true, "esperar", 2, 5, 4}, 165 | {"cuidarlo", 4, 6, 3, true, "cuidar", 4, 6, 3}, 166 | {"empezarlos", 2, 5, 4, true, "empezar", 2, 5, 4}, 167 | {"gastarla", 3, 6, 3, true, "gastar", 3, 6, 3}, 168 | {"levantarme", 3, 5, 3, true, "levantar", 3, 5, 3}, 169 | {"ausentarse", 3, 5, 3, true, "ausentar", 3, 5, 3}, 170 | {"colocándose", 3, 5, 3, true, "colocando", 3, 5, 3}, 171 | {"suponerse", 3, 5, 3, true, "suponer", 3, 5, 3}, 172 | {"someterlos", 3, 5, 3, true, "someter", 3, 5, 3}, 173 | {"criticarlos", 4, 6, 3, true, "criticar", 4, 6, 3}, 174 | {"consolidarlo", 3, 6, 3, true, "consolidar", 3, 6, 3}, 175 | {"globalizarse", 4, 6, 3, true, "globalizar", 4, 6, 3}, 176 | {"corregirla", 3, 6, 3, true, "corregir", 3, 6, 3}, 177 | {"aplicarle", 2, 5, 4, true, "aplicar", 2, 5, 4}, 178 | {"casarse", 3, 5, 3, true, "casar", 3, 5, 3}, 179 | {"costándole", 3, 6, 3, true, "costando", 3, 6, 3}, 180 | {"rescindirlo", 3, 6, 3, true, "rescindir", 3, 6, 3}, 181 | {"quitándole", 4, 6, 3, true, "quitando", 4, 6, 3}, 182 | {"conservarse", 3, 6, 3, true, "conservar", 3, 6, 3}, 183 | {"venderlo", 3, 6, 3, true, "vender", 3, 6, 3}, 184 | {"garantizarse", 3, 5, 3, true, "garantizar", 3, 5, 3}, 185 | {"disfrutarse", 3, 7, 3, true, "disfrutar", 3, 7, 3}, 186 | {"comunicarse", 3, 5, 3, true, "comunicar", 3, 5, 3}, 187 | {"propiciarse", 4, 6, 3, true, "propiciar", 4, 6, 3}, 188 | {"otorgarnos", 2, 4, 3, true, "otorgar", 2, 4, 3}, 189 | {"contorsionarse", 3, 6, 3, true, "contorsionar", 3, 6, 3}, 190 | {"motivarlas", 3, 5, 3, true, "motivar", 3, 5, 3}, 191 | {"congelarse", 3, 6, 3, true, "congelar", 3, 6, 3}, 192 | {"generandoles", 3, 5, 3, true, "generando", 3, 5, 3}, 193 | {"evitarlo", 2, 4, 3, true, "evitar", 2, 4, 3}, 194 | {"atenderlos", 2, 4, 3, true, "atender", 2, 4, 3}, 195 | {"apoyándola", 2, 4, 3, true, "apoyando", 2, 4, 3}, 196 | {"pasarse", 3, 5, 3, true, "pasar", 3, 5, 3}, 197 | {"escucharlos", 2, 5, 4, true, "escuchar", 2, 5, 4}, 198 | {"intervenirse", 2, 5, 4, true, "intervenir", 2, 5, 4}, 199 | {"contratarle", 3, 7, 3, true, "contratar", 3, 7, 3}, 200 | {"retirándose", 3, 5, 3, true, "retirando", 3, 5, 3}, 201 | {"quitarles", 4, 6, 3, true, "quitar", 4, 6, 3}, 202 | {"reforzarlas", 3, 5, 3, true, "reforzar", 3, 5, 3}, 203 | {"obtenerla", 2, 5, 4, true, "obtener", 2, 5, 4}, 204 | {"considerarlo", 3, 6, 3, true, "considerar", 3, 6, 3}, 205 | {"regresarse", 3, 6, 3, true, "regresar", 3, 6, 3}, 206 | {"ponerse", 3, 5, 3, true, "poner", 3, 5, 3}, 207 | {"llevándose", 4, 6, 3, true, "llevando", 4, 6, 3}, 208 | {"ocuparse", 2, 4, 3, true, "ocupar", 2, 
4, 3}, 209 | {"aprovecharse", 2, 5, 4, true, "aprovechar", 2, 5, 4}, 210 | {"corregirlo", 3, 6, 3, true, "corregir", 3, 6, 3}, 211 | {"probarle", 4, 6, 3, true, "probar", 4, 6, 3}, 212 | {"comernos", 3, 5, 3, true, "comer", 3, 5, 3}, 213 | {"iniciarme", 2, 4, 3, true, "iniciar", 2, 4, 3}, 214 | {"concentrarse", 3, 6, 3, true, "concentrar", 3, 6, 3}, 215 | {"llevarse", 4, 6, 3, true, "llevar", 4, 6, 3}, 216 | {"difundirlo", 3, 5, 3, true, "difundir", 3, 5, 3}, 217 | {"basándose", 3, 5, 3, true, "basando", 3, 5, 3}, 218 | {"destinarlos", 3, 6, 3, true, "destinar", 3, 6, 3}, 219 | {"reubicarse", 4, 6, 3, true, "reubicar", 4, 6, 3}, 220 | {"manteniéndose", 3, 6, 3, true, "manteniendo", 3, 6, 3}, 221 | {"colocarla", 3, 5, 3, true, "colocar", 3, 5, 3}, 222 | {"pasarles", 3, 5, 3, true, "pasar", 3, 5, 3}, 223 | {"depositarse", 3, 5, 3, true, "depositar", 3, 5, 3}, 224 | {"tragarse", 4, 6, 3, true, "tragar", 4, 6, 3}, 225 | {"eliminarla", 2, 4, 3, true, "eliminar", 2, 4, 3}, 226 | {"eliminarse", 2, 4, 3, true, "eliminar", 2, 4, 3}, 227 | {"apegarnos", 2, 4, 3, true, "apegar", 2, 4, 3}, 228 | {"asociarse", 2, 4, 3, true, "asociar", 2, 4, 3}, 229 | {"cambiarlos", 3, 7, 3, true, "cambiar", 3, 7, 3}, 230 | {"envolviéndose", 2, 5, 4, true, "envolviendo", 2, 5, 4}, 231 | {"lograrse", 3, 6, 3, true, "lograr", 3, 6, 3}, 232 | {"mostrarse", 3, 7, 3, true, "mostrar", 3, 7, 3}, 233 | {"pasarle", 3, 5, 3, true, "pasar", 3, 5, 3}, 234 | {"enfrentándose", 2, 6, 5, true, "enfrentando", 2, 6, 5}, 235 | {"permitirse", 3, 6, 3, true, "permitir", 3, 6, 3}, 236 | {"sanearlas", 3, 6, 3, true, "sanear", 3, 6, 3}, 237 | {"refugiarse", 3, 5, 3, true, "refugiar", 3, 5, 3}, 238 | {"relacionarse", 3, 5, 3, true, "relacionar", 3, 5, 3}, 239 | {"sacarlo", 3, 5, 3, true, "sacar", 3, 5, 3}, 240 | {"organizarse", 2, 5, 4, true, "organizar", 2, 5, 4}, 241 | {"familiarizarse", 3, 5, 3, true, "familiarizar", 3, 5, 3}, 242 | {"decidirse", 3, 5, 3, true, "decidir", 3, 5, 3}, 243 | {"tomarle", 3, 5, 3, true, "tomar", 3, 5, 3}, 244 | {"volverlas", 3, 6, 3, true, "volver", 3, 6, 3}, 245 | {"efectuarse", 2, 4, 3, true, "efectuar", 2, 4, 3}, 246 | {"elegirse", 2, 4, 3, true, "elegir", 2, 4, 3}, 247 | {"establecerse", 2, 5, 4, true, "establecer", 2, 5, 4}, 248 | {"ponerles", 3, 5, 3, true, "poner", 3, 5, 3}, 249 | } 250 | romance.RunStepTest(t, step0, testCases) 251 | } 252 | 253 | // Test step1, the removal of standard suffixes. 
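Step 1 strips the standard derivational suffixes ("-aciones", "-amente", "-idad", "-ismo", and so on) and rewrites a few, for example "-encia" becomes "-ente", as the "inocencia" and "adherencia" rows below show. A minimal sketch of one confirmed row (illustrative wrapper name; step1 is unexported, so this belongs inside package spanish with "fmt" imported):

func sketchStep1() {
	w := snowballword.New("inocencia")
	w.R1start, w.R2start, w.RVstart = 2, 4, 3
	step1(w)
	fmt.Println(w.String()) // per the table below: "inocente"
}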
254 | // 255 | func Test_step1(t *testing.T) { 256 | testCases := []romance.StepTestCase{ 257 | {"retrospectiva", 3, 6, 3, true, "retrospect", 3, 6, 3}, 258 | {"emperador", 2, 5, 4, true, "emper", 2, 5, 4}, 259 | {"instalaciones", 2, 6, 5, true, "instal", 2, 6, 5}, 260 | {"finiquitación", 3, 5, 3, true, "finiquit", 3, 5, 3}, 261 | {"definitivamente", 3, 5, 3, true, "definit", 3, 5, 3}, 262 | {"turísticas", 3, 5, 3, true, "turíst", 3, 5, 3}, 263 | {"puntualizaciones", 3, 7, 3, true, "puntualiz", 3, 7, 3}, 264 | {"fehacientemente", 3, 5, 3, true, "fehaciente", 3, 5, 3}, 265 | {"determinaciones", 3, 5, 3, true, "determin", 3, 5, 3}, 266 | {"irrelevante", 2, 5, 4, true, "irrelev", 2, 5, 4}, 267 | {"autoritarismo", 3, 5, 3, true, "autoritar", 3, 5, 3}, 268 | {"paralizante", 3, 5, 3, true, "paraliz", 3, 5, 3}, 269 | {"pediátrica", 3, 6, 3, true, "pediátr", 3, 6, 3}, 270 | {"británicos", 4, 6, 3, true, "britán", 4, 6, 3}, 271 | {"ayuntamientos", 2, 4, 3, true, "ayunt", 2, 4, 3}, 272 | {"sobrecalentamiento", 3, 6, 3, true, "sobrecalent", 3, 6, 3}, 273 | {"inocencia", 2, 4, 3, true, "inocente", 2, 4, 3}, 274 | {"amabilidad", 2, 4, 3, true, "amabil", 2, 4, 3}, 275 | {"personalidad", 3, 6, 3, true, "personal", 3, 6, 3}, 276 | {"vacunación", 3, 5, 3, true, "vacun", 3, 5, 3}, 277 | {"digestivos", 3, 5, 3, true, "digest", 3, 5, 3}, 278 | {"mecánica", 3, 5, 3, true, "mecán", 3, 5, 3}, 279 | {"sistemáticas", 3, 6, 3, true, "sistemát", 3, 6, 3}, 280 | {"programático", 4, 7, 3, true, "programát", 4, 7, 3}, 281 | {"incitación", 2, 5, 4, true, "incit", 2, 5, 4}, 282 | {"inicialmente", 2, 4, 3, true, "inicial", 2, 4, 3}, 283 | {"derivación", 3, 5, 3, true, "deriv", 3, 5, 3}, 284 | {"fraccionamientos", 4, 8, 3, true, "fraccion", 4, 8, 3}, 285 | {"frecuentemente", 4, 7, 3, true, "frecuente", 4, 7, 3}, 286 | {"econometristas", 2, 4, 3, true, "econometr", 2, 4, 3}, 287 | {"mentirosas", 3, 6, 3, true, "mentir", 3, 6, 3}, 288 | {"eficientemente", 2, 4, 3, true, "eficiente", 2, 4, 3}, 289 | {"utilidades", 2, 4, 3, true, "util", 2, 4, 3}, 290 | {"rehabilitación", 3, 5, 3, true, "rehabilit", 3, 5, 3}, 291 | {"adquisitivo", 2, 6, 4, true, "adquisit", 2, 6, 4}, 292 | {"consignación", 3, 6, 3, true, "consign", 3, 6, 3}, 293 | {"concursante", 3, 6, 3, true, "concurs", 3, 6, 3}, 294 | {"criminalidad", 4, 6, 3, true, "criminal", 4, 6, 3}, 295 | {"invitación", 2, 5, 4, true, "invit", 2, 5, 4}, 296 | {"adherencia", 2, 5, 4, true, "adherente", 2, 5, 4}, 297 | {"animalización", 2, 4, 3, true, "animaliz", 2, 4, 3}, 298 | {"enteramente", 2, 5, 4, true, "enter", 2, 5, 4}, 299 | {"deportivos", 3, 5, 3, true, "deport", 3, 5, 3}, 300 | {"controladora", 3, 7, 3, true, "control", 3, 7, 3}, 301 | {"edifico", 2, 4, 3, true, "edif", 2, 4, 3}, 302 | {"planificación", 4, 6, 3, true, "planif", 4, 6, 3}, 303 | {"expectación", 2, 5, 4, true, "expect", 2, 5, 4}, 304 | {"cosméticos", 3, 6, 3, true, "cosmét", 3, 6, 3}, 305 | {"cómodamente", 3, 5, 3, true, "cómod", 3, 5, 3}, 306 | {"intempestivamente", 2, 5, 4, true, "intempest", 2, 5, 4}, 307 | {"cetemistas", 3, 5, 3, true, "cetem", 3, 5, 3}, 308 | {"presumiblemente", 4, 6, 3, true, "presum", 4, 6, 3}, 309 | {"reivindicación", 4, 6, 3, true, "reivind", 4, 6, 3}, 310 | {"ventajosa", 3, 6, 3, true, "ventaj", 3, 6, 3}, 311 | {"atmosférico", 2, 5, 4, true, "atmosfér", 2, 5, 4}, 312 | {"subprocurador", 3, 7, 3, true, "subprocur", 3, 7, 3}, 313 | {"estadísticas", 2, 5, 4, true, "estadíst", 2, 5, 4}, 314 | {"respetuoso", 3, 6, 3, true, "respetu", 3, 6, 3}, 315 | {"procedimiento", 4, 6, 3, true, "proced", 
4, 6, 3}, 316 | {"seguramente", 3, 5, 3, true, "segur", 3, 5, 3}, 317 | {"autocalifica", 3, 5, 3, true, "autocalif", 3, 5, 3}, 318 | {"esporádica", 2, 5, 4, true, "esporád", 2, 5, 4}, 319 | {"caudalosos", 4, 6, 3, true, "caudal", 4, 6, 3}, 320 | {"imperdonable", 2, 5, 4, true, "imperdon", 2, 5, 4}, 321 | {"magníficas", 3, 6, 3, true, "magníf", 3, 6, 3}, 322 | {"erróneamente", 2, 5, 4, true, "erróne", 2, 5, 4}, 323 | {"conmemorativa", 3, 6, 3, true, "conmemor", 3, 6, 3}, 324 | {"simulación", 3, 5, 3, true, "simul", 3, 5, 3}, 325 | {"arrendadora", 2, 5, 4, true, "arrend", 2, 5, 4}, 326 | {"moralización", 3, 5, 3, true, "moraliz", 3, 5, 3}, 327 | {"accesibles", 2, 5, 4, true, "acces", 2, 5, 4}, 328 | {"infidelidades", 2, 5, 4, true, "infidel", 2, 5, 4}, 329 | {"abdicación", 2, 5, 4, true, "abdic", 2, 5, 4}, 330 | {"airecombustible", 3, 5, 3, true, "airecombust", 3, 5, 3}, 331 | {"escuetamente", 2, 6, 4, true, "escuet", 2, 6, 4}, 332 | {"exóticos", 2, 4, 3, true, "exót", 2, 4, 3}, 333 | {"volcánica", 3, 6, 3, true, "volcán", 3, 6, 3}, 334 | {"oceánico", 2, 5, 3, true, "oceán", 2, 5, 3}, 335 | {"simulador", 3, 5, 3, true, "simul", 3, 5, 3}, 336 | {"compañerismo", 3, 6, 3, true, "compañer", 3, 6, 3}, 337 | {"fagotistas", 3, 5, 3, true, "fagot", 3, 5, 3}, 338 | {"cardenistas", 3, 6, 3, true, "carden", 3, 6, 3}, 339 | {"periférico", 3, 5, 3, true, "perifér", 3, 5, 3}, 340 | {"petroquímica", 3, 6, 3, true, "petroquím", 3, 6, 3}, 341 | {"columnista", 3, 5, 3, true, "column", 3, 5, 3}, 342 | {"comportamientos", 3, 6, 3, true, "comport", 3, 6, 3}, 343 | {"fanáticos", 3, 5, 3, true, "fanát", 3, 5, 3}, 344 | {"significativo", 3, 6, 3, true, "signific", 3, 6, 3}, 345 | {"turísticos", 3, 5, 3, true, "turíst", 3, 5, 3}, 346 | {"divergencias", 3, 5, 3, true, "divergente", 3, 5, 3}, 347 | {"lamentable", 3, 5, 3, true, "lament", 3, 5, 3}, 348 | {"estratosféricas", 2, 6, 5, true, "estratosfér", 2, 6, 5}, 349 | {"emigrantes", 2, 4, 3, true, "emigr", 2, 4, 3}, 350 | {"ahorrador", 2, 4, 3, true, "ahorr", 2, 4, 3}, 351 | {"transportaciones", 4, 8, 3, true, "transport", 4, 8, 3}, 352 | {"atomizador", 2, 4, 3, true, "atomiz", 2, 4, 3}, 353 | {"dolorosa", 3, 5, 3, true, "dolor", 3, 5, 3}, 354 | {"enervantes", 2, 4, 3, true, "enerv", 2, 4, 3}, 355 | {"gobernador", 3, 5, 3, true, "gobern", 3, 5, 3}, 356 | {"inexplicable", 2, 4, 3, true, "inexplic", 2, 4, 3}, 357 | } 358 | romance.RunStepTest(t, step1, testCases) 359 | } 360 | -------------------------------------------------------------------------------- /french/french_test.go: -------------------------------------------------------------------------------- 1 | package french 2 | 3 | import ( 4 | "github.com/kljensen/snowball/romance" 5 | "github.com/kljensen/snowball/snowballword" 6 | "testing" 7 | ) 8 | 9 | // Test stopWords for things we know should be true 10 | // or false. 11 | // 12 | func Test_stopWords(t *testing.T) { 13 | testCases := []romance.WordBoolTestCase{ 14 | {"eussiez", true}, 15 | {"machine", false}, 16 | } 17 | romance.RunWordBoolTest(t, IsStopWord, testCases) 18 | } 19 | 20 | // Test isLowerVowel for things we know should be true 21 | // or false. 22 | // 23 | func Test_isLowerVowel(t *testing.T) { 24 | testCases := []romance.WordBoolTestCase{ 25 | // These are all vowels. 26 | {"aeiouyâàëéêèïîôûù", true}, 27 | // None of these are vowels. 28 | {"cbfqhkl", false}, 29 | } 30 | romance.RunRunewiseBoolTest(t, isLowerVowel, testCases) 31 | } 32 | 33 | // Test capitalization of vowels acting as non-vowels. 
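Before any suffix removal, the French stemmer upper-cases u, i, and y where they act as consonants next to vowels (and u after q), so later steps can treat them as non-vowels; the rows below confirm that "jouer" becomes "joUer" and "quand" becomes "qUand". A sketch of one case (illustrative wrapper name; capitalizeYUI is unexported, so this belongs inside package french with "fmt" imported):

func sketchCapitalizeYUI() {
	w := snowballword.New("jouer")
	capitalizeYUI(w)
	fmt.Println(w.String()) // per the table below: "joUer"
}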
34 | // 35 | func Test_capitalizeYUI(t *testing.T) { 36 | testCases := []struct { 37 | wordIn string 38 | wordOut string 39 | }{ 40 | {"jouer", "joUer"}, 41 | {"ennuie", "ennuIe"}, 42 | {"yeux", "Yeux"}, 43 | {"quand", "qUand"}, 44 | } 45 | 46 | for _, testCase := range testCases { 47 | w := snowballword.New(testCase.wordIn) 48 | capitalizeYUI(w) 49 | if w.String() != testCase.wordOut { 50 | t.Errorf("Expect %v -> %v, but got %v", testCase.wordIn, testCase.wordOut, w.String()) 51 | } 52 | } 53 | } 54 | func Test_findRegions(t *testing.T) { 55 | testCases := []romance.FindRegionsTestCase{ 56 | {"iriez", 2, 5, 3}, 57 | {"reçoivent", 3, 6, 2}, 58 | {"rébarbatif", 3, 5, 2}, 59 | {"paraîtrons", 3, 6, 3}, 60 | {"prétendus", 4, 6, 3}, 61 | {"boUilli", 3, 5, 2}, 62 | {"destitué", 3, 6, 2}, 63 | {"bataillons", 3, 6, 2}, 64 | {"buffa", 3, 5, 2}, 65 | {"suffisante", 3, 6, 2}, 66 | {"excepté", 2, 5, 4}, 67 | {"audace", 3, 5, 3}, 68 | {"vertueuses", 3, 8, 2}, 69 | {"écrièrent", 2, 6, 4}, 70 | {"provoqUer", 4, 6, 3}, 71 | {"barbotement", 3, 6, 2}, 72 | {"contribua", 3, 7, 2}, 73 | {"ensuit", 2, 6, 4}, 74 | {"confédéré", 3, 6, 2}, 75 | {"affairé", 2, 6, 4}, 76 | {"incompatibles", 2, 5, 4}, 77 | {"talma", 3, 5, 2}, 78 | {"péchais", 3, 7, 2}, 79 | {"abusé", 2, 4, 3}, 80 | {"plaisir", 5, 7, 3}, 81 | {"foretells", 3, 5, 2}, 82 | {"walbah", 3, 6, 2}, 83 | {"confucius", 3, 6, 2}, 84 | {"attelée", 2, 5, 4}, 85 | {"tirailler", 3, 6, 2}, 86 | {"vin", 3, 3, 2}, 87 | {"toucher", 4, 7, 2}, 88 | {"reprendrons", 3, 6, 2}, 89 | {"hé", 2, 2, 2}, 90 | {"intéressant", 2, 5, 4}, 91 | {"malebar", 3, 5, 2}, 92 | {"alimenter", 2, 4, 3}, 93 | {"inventée", 2, 5, 4}, 94 | {"rechargez", 3, 6, 2}, 95 | {"revêtu", 3, 5, 2}, 96 | {"étaYé", 2, 4, 3}, 97 | {"maladresse", 3, 5, 2}, 98 | {"envié", 2, 5, 4}, 99 | {"secoUaIent", 3, 5, 2}, 100 | {"parler", 3, 6, 3}, 101 | {"marécages", 3, 5, 2}, 102 | {"privilèges", 4, 6, 3}, 103 | {"examinez", 2, 4, 3}, 104 | {"contraria", 3, 7, 2}, 105 | {"sotte", 3, 5, 2}, 106 | {"méchantes", 3, 6, 2}, 107 | {"coffres", 3, 7, 2}, 108 | {"tressaillir", 4, 8, 3}, 109 | {"charlatanisme", 4, 7, 3}, 110 | {"appuYais", 2, 5, 4}, 111 | {"interdis", 2, 5, 4}, 112 | {"baissa", 4, 6, 2}, 113 | {"sanglotant", 3, 7, 2}, 114 | {"rencontrerai", 3, 6, 2}, 115 | {"subis", 3, 5, 2}, 116 | {"empestée", 2, 5, 4}, 117 | {"communiqUa", 3, 6, 2}, 118 | {"huit", 4, 4, 2}, 119 | {"heurter", 4, 7, 2}, 120 | {"premiers", 4, 7, 3}, 121 | {"brusqUe", 4, 7, 3}, 122 | {"inanimé", 2, 4, 3}, 123 | {"congédia", 3, 6, 2}, 124 | {"souffrir", 4, 8, 2}, 125 | {"élévations", 2, 4, 3}, 126 | {"sablé", 3, 5, 2}, 127 | {"salure", 3, 5, 2}, 128 | {"résigna", 3, 5, 2}, 129 | {"compatriotes", 3, 6, 2}, 130 | {"écrient", 2, 6, 4}, 131 | {"chanoine", 4, 7, 3}, 132 | {"conçois", 3, 7, 2}, 133 | {"lançaIent", 3, 6, 2}, 134 | {"pékin", 3, 5, 2}, 135 | {"poneYs", 3, 5, 2}, 136 | {"pratiqUer", 4, 6, 3}, 137 | {"bâtonne", 3, 5, 2}, 138 | {"possibilités", 3, 6, 2}, 139 | {"aiguille", 3, 6, 3}, 140 | {"ténor", 3, 5, 2}, 141 | {"déchirés", 3, 6, 2}, 142 | {"anoblit", 2, 4, 3}, 143 | {"tombât", 3, 6, 2}, 144 | {"paralysé", 3, 5, 3}, 145 | {"dot", 3, 3, 2}, 146 | {"aigre", 3, 5, 3}, 147 | {"ramena", 3, 5, 2}, 148 | {"appartiennent", 2, 5, 4}, 149 | {"premières", 4, 7, 3}, 150 | {"tentez", 3, 6, 2}, 151 | {"pari", 3, 4, 3}, 152 | {"coudes", 4, 6, 2}, 153 | {"étonnerait", 2, 4, 3}, 154 | {"embrunir", 2, 6, 5}, 155 | {"mobile", 3, 5, 2}, 156 | } 157 | 158 | romance.RunFindRegionsTest(t, findRegions, testCases) 159 | } 160 | 161 | // Test step1, the removal of 
standard suffixes. 162 | // 163 | func Test_step1(t *testing.T) { 164 | testCases := []romance.StepTestCase{ 165 | {"rapidement", 3, 5, 2, true, "rapid", 3, 5, 2}, 166 | {"paresseuse", 3, 5, 3, true, "paress", 3, 5, 3}, 167 | {"prosaïqUement", 4, 7, 3, true, "prosaïqU", 4, 7, 3}, 168 | {"nonchalance", 3, 7, 2, true, "nonchal", 3, 7, 2}, 169 | {"apostoliqUes", 2, 4, 3, true, "apostol", 2, 4, 3}, 170 | {"assiduités", 2, 5, 4, true, "assidu", 2, 5, 4}, 171 | {"philosophiqUement", 4, 6, 3, true, "philosoph", 4, 6, 3}, 172 | {"despotiqUement", 3, 6, 2, true, "despot", 3, 6, 2}, 173 | {"incontestablement", 2, 5, 4, true, "incontest", 2, 5, 4}, 174 | {"diminution", 3, 5, 2, true, "diminu", 3, 5, 2}, 175 | {"séditieuse", 3, 5, 2, true, "séditi", 3, 5, 2}, 176 | {"anonymement", 2, 4, 3, true, "anonym", 2, 4, 3}, 177 | {"conservation", 3, 6, 2, true, "conserv", 3, 6, 2}, 178 | {"fâcheuses", 3, 7, 2, true, "fâcheux", 3, 7, 2}, 179 | {"houleuse", 4, 7, 2, true, "houleux", 4, 7, 2}, 180 | {"historiqUes", 3, 6, 2, true, "histor", 3, 6, 2}, 181 | {"impérieusement", 2, 5, 4, true, "impéri", 2, 5, 4}, 182 | {"complaisances", 3, 8, 2, true, "complais", 3, 8, 2}, 183 | {"confessionnaux", 3, 6, 2, true, "confessionnal", 3, 6, 2}, 184 | {"grandement", 4, 7, 3, true, "grand", 4, 5, 3}, 185 | {"passablement", 3, 6, 2, true, "passabl", 3, 6, 2}, 186 | {"strictement", 5, 8, 4, true, "strict", 5, 6, 4}, 187 | {"physiqUement", 4, 6, 3, true, "physiqU", 4, 6, 3}, 188 | {"serieusement", 3, 7, 2, true, "serieux", 3, 7, 2}, 189 | {"roulement", 4, 6, 2, true, "roul", 4, 4, 2}, 190 | {"appartement", 2, 5, 4, true, "appart", 2, 5, 4}, 191 | {"reconnaissance", 3, 5, 2, true, "reconnaiss", 3, 5, 2}, 192 | {"aigrement", 3, 6, 3, true, "aigr", 3, 4, 3}, 193 | {"impertinences", 2, 5, 4, true, "impertinent", 2, 5, 4}, 194 | {"parlement", 3, 6, 3, true, "parl", 3, 4, 3}, 195 | {"malicieux", 3, 5, 2, true, "malici", 3, 5, 2}, 196 | {"suffisance", 3, 6, 2, true, "suffis", 3, 6, 2}, 197 | {"prémédité", 4, 6, 3, true, "préméd", 4, 6, 3}, 198 | {"métalliqUes", 3, 5, 2, true, "métall", 3, 5, 2}, 199 | {"météorologiste", 3, 6, 2, true, "météorolog", 3, 6, 2}, 200 | {"prononciation", 4, 6, 3, true, "prononci", 4, 6, 3}, 201 | {"nombreuse", 3, 8, 2, true, "nombreux", 3, 8, 2}, 202 | {"extatiqUe", 2, 5, 4, true, "extat", 2, 5, 4}, 203 | {"magnifiqUement", 3, 6, 2, true, "magnif", 3, 6, 2}, 204 | {"gymnastiqUe", 3, 6, 2, true, "gymnast", 3, 6, 2}, 205 | {"dramatiqUe", 4, 6, 3, true, "dramat", 4, 6, 3}, 206 | {"simplicité", 3, 7, 2, true, "simpliqU", 3, 7, 2}, 207 | {"roYalistes", 3, 5, 2, true, "roYal", 3, 5, 2}, 208 | {"fortifications", 3, 6, 2, true, "fortif", 3, 6, 2}, 209 | {"attendrissement", 2, 5, 4, true, "attendr", 2, 5, 4}, 210 | {"respectueusement", 3, 6, 2, true, "respectu", 3, 6, 2}, 211 | {"patriotisme", 3, 7, 2, true, "patriot", 3, 7, 2}, 212 | {"curieuse", 3, 7, 2, true, "curieux", 3, 7, 2}, 213 | {"fascination", 3, 6, 2, true, "fascin", 3, 6, 2}, 214 | {"effectivement", 2, 5, 4, true, "effect", 2, 5, 4}, 215 | {"condoléance", 3, 6, 2, true, "condolé", 3, 6, 2}, 216 | {"malignité", 3, 5, 2, true, "malign", 3, 5, 2}, 217 | {"capricieuse", 3, 6, 2, true, "caprici", 3, 6, 2}, 218 | {"applaudissements", 2, 7, 5, true, "applaud", 2, 7, 5}, 219 | {"praticable", 4, 6, 3, true, "pratic", 4, 6, 3}, 220 | {"rivaux", 3, 6, 2, true, "rival", 3, 5, 2}, 221 | {"augmentation", 3, 6, 3, true, "augment", 3, 6, 3}, 222 | {"ameublement", 2, 5, 3, true, "ameubl", 2, 5, 3}, 223 | {"honorables", 3, 5, 2, true, "honor", 3, 5, 2}, 224 | 
{"effervescence", 2, 5, 4, true, "effervescent", 2, 5, 4}, 225 | {"excentricité", 2, 5, 4, true, "excentr", 2, 5, 4}, 226 | {"misérable", 3, 5, 2, true, "misér", 3, 5, 2}, 227 | {"capitulation", 3, 5, 2, true, "capitul", 3, 5, 2}, 228 | {"enjoUement", 2, 5, 4, true, "enjoU", 2, 5, 4}, 229 | {"sévérité", 3, 5, 2, true, "sévér", 3, 5, 2}, 230 | {"perplexités", 3, 7, 2, true, "perplex", 3, 7, 2}, 231 | {"consentement", 3, 6, 2, true, "consent", 3, 6, 2}, 232 | {"convocation", 3, 6, 2, true, "convoc", 3, 6, 2}, 233 | {"assurances", 2, 5, 4, true, "assur", 2, 5, 4}, 234 | {"ébloUissement", 2, 5, 4, true, "ébloU", 2, 5, 4}, 235 | {"méridionaux", 3, 5, 2, true, "méridional", 3, 5, 2}, 236 | {"dérangements", 3, 5, 2, true, "dérang", 3, 5, 2}, 237 | {"domination", 3, 5, 2, true, "domin", 3, 5, 2}, 238 | {"incroYable", 2, 6, 5, true, "incroY", 2, 6, 5}, 239 | {"réjoUissances", 3, 5, 2, true, "réjoUiss", 3, 5, 2}, 240 | {"décadence", 3, 5, 2, true, "décadent", 3, 5, 2}, 241 | {"bâillement", 4, 7, 2, true, "bâill", 4, 5, 2}, 242 | {"habillement", 3, 5, 2, true, "habill", 3, 5, 2}, 243 | {"irréparablement", 2, 5, 4, true, "irrépar", 2, 5, 4}, 244 | {"diplomatiqUes", 3, 6, 2, true, "diplomat", 3, 6, 2}, 245 | {"distribution", 3, 7, 2, true, "distribu", 3, 7, 2}, 246 | {"pétulance", 3, 5, 2, true, "pétul", 3, 5, 2}, 247 | {"considérable", 3, 6, 2, true, "considér", 3, 6, 2}, 248 | {"éducation", 2, 4, 3, true, "éduc", 2, 4, 3}, 249 | {"indications", 2, 5, 4, true, "indiqU", 2, 5, 4}, 250 | {"cupidité", 3, 5, 2, true, "cupid", 3, 5, 2}, 251 | {"traîtreusement", 5, 9, 3, true, "traîtreux", 5, 9, 3}, 252 | {"silencieuse", 3, 5, 2, true, "silenci", 3, 5, 2}, 253 | {"pessimisme", 3, 6, 2, true, "pessim", 3, 6, 2}, 254 | {"préoccupation", 5, 8, 3, true, "préoccup", 5, 8, 3}, 255 | // Special cases that should return false despite 256 | // being changed. They "don't count". 257 | {"compliment", 3, 7, 2, false, "compli", 3, 6, 2}, 258 | {"vraiment", 5, 7, 3, false, "vrai", 4, 4, 3}, 259 | {"remercîment", 3, 5, 2, false, "remercî", 3, 5, 2}, 260 | {"puissamment", 4, 7, 2, false, "puissant", 4, 7, 2}, 261 | {"absolument", 2, 5, 4, false, "absolu", 2, 5, 4}, 262 | {"décidément", 3, 5, 2, false, "décidé", 3, 5, 2}, 263 | {"condiments", 3, 6, 2, false, "condi", 3, 5, 2}, 264 | } 265 | romance.RunStepTest(t, step1, testCases) 266 | 267 | } 268 | 269 | // the removal of Verb suffixes beginning 270 | // with "i" in the RV region. 271 | // Test step1, the removal of standard suffixes. 
272 | // 273 | func Test_step2a(t *testing.T) { 274 | testCases := []romance.StepTestCase{ 275 | {"épanoUit", 2, 4, 3, true, "épanoU", 2, 4, 3}, 276 | {"faillirent", 4, 7, 2, true, "faill", 4, 5, 2}, 277 | {"acabit", 2, 4, 3, true, "acab", 2, 4, 3}, 278 | {"établissait", 2, 4, 3, true, "établ", 2, 4, 3}, 279 | {"découvrir", 3, 6, 2, true, "découvr", 3, 6, 2}, 280 | {"réjoUissait", 3, 5, 2, true, "réjoU", 3, 5, 2}, 281 | {"trahiront", 4, 6, 3, true, "trah", 4, 4, 3}, 282 | {"maintenir", 4, 7, 2, true, "mainten", 4, 7, 2}, 283 | {"vendit", 3, 6, 2, true, "vend", 3, 4, 2}, 284 | {"repartit", 3, 5, 2, true, "repart", 3, 5, 2}, 285 | {"giletti", 3, 5, 2, true, "gilett", 3, 5, 2}, 286 | {"rienzi", 4, 6, 2, true, "rienz", 4, 5, 2}, 287 | {"punie", 3, 5, 2, true, "pun", 3, 3, 2}, 288 | {"accueillir", 2, 7, 4, true, "accueill", 2, 7, 4}, 289 | {"rétablit", 3, 5, 2, true, "rétabl", 3, 5, 2}, 290 | {"ravis", 3, 5, 2, true, "rav", 3, 3, 2}, 291 | {"xviIi", 4, 5, 3, true, "xviI", 4, 4, 3}, 292 | {"blottie", 4, 7, 3, true, "blott", 4, 5, 3}, 293 | {"approfondie", 2, 6, 5, true, "approfond", 2, 6, 5}, 294 | {"infirmerie", 2, 5, 4, true, "infirmer", 2, 5, 4}, 295 | {"scotti", 4, 6, 3, true, "scott", 4, 5, 3}, 296 | {"adoucissait", 2, 5, 3, true, "adouc", 2, 5, 3}, 297 | {"finissait", 3, 5, 2, true, "fin", 3, 3, 2}, 298 | {"promit", 4, 6, 3, true, "prom", 4, 4, 3}, 299 | {"franchies", 4, 9, 3, true, "franch", 4, 6, 3}, 300 | {"franchissant", 4, 8, 3, true, "franch", 4, 6, 3}, 301 | {"micheli", 3, 6, 2, true, "michel", 3, 6, 2}, 302 | {"éteignit", 2, 5, 3, true, "éteign", 2, 5, 3}, 303 | {"puni", 3, 4, 2, true, "pun", 3, 3, 2}, 304 | {"apoplexie", 2, 4, 3, true, "apoplex", 2, 4, 3}, 305 | {"désira", 3, 5, 2, true, "dés", 3, 3, 2}, 306 | {"étourdi", 2, 5, 3, true, "étourd", 2, 5, 3}, 307 | {"giovanni", 4, 6, 2, true, "giovann", 4, 6, 2}, 308 | {"apprécie", 2, 6, 5, true, "appréc", 2, 6, 5}, 309 | {"poésies", 4, 7, 2, true, "poés", 4, 4, 2}, 310 | {"pairie", 4, 6, 2, true, "pair", 4, 4, 2}, 311 | {"sortit", 3, 6, 2, true, "sort", 3, 4, 2}, 312 | {"subi", 3, 4, 2, true, "sub", 3, 3, 2}, 313 | {"aigrirait", 3, 6, 3, true, "aigr", 3, 4, 3}, 314 | {"assailli", 2, 6, 4, true, "assaill", 2, 6, 4}, 315 | {"bertolotti", 3, 6, 2, true, "bertolott", 3, 6, 2}, 316 | {"recouvrir", 3, 6, 2, true, "recouvr", 3, 6, 2}, 317 | {"visconti", 3, 6, 2, true, "viscont", 3, 6, 2}, 318 | {"surgir", 3, 6, 2, true, "surg", 3, 4, 2}, 319 | {"remercie", 3, 5, 2, true, "remerc", 3, 5, 2}, 320 | {"joUissaIent", 3, 5, 2, true, "joU", 3, 3, 2}, 321 | {"bondissant", 3, 6, 2, true, "bond", 3, 4, 2}, 322 | {"saisi", 4, 5, 2, true, "sais", 4, 4, 2}, 323 | {"missouri", 3, 7, 2, true, "missour", 3, 7, 2}, 324 | {"remplirent", 3, 7, 2, true, "rempl", 3, 5, 2}, 325 | {"envahi", 2, 5, 4, true, "envah", 2, 5, 4}, 326 | {"tandis", 3, 6, 2, true, "tand", 3, 4, 2}, 327 | {"trahit", 4, 6, 3, true, "trah", 4, 4, 3}, 328 | {"trahissaIent", 4, 6, 3, true, "trah", 4, 4, 3}, 329 | {"réunie", 4, 6, 2, true, "réun", 4, 4, 2}, 330 | {"avarie", 2, 4, 3, true, "avar", 2, 4, 3}, 331 | {"dilettanti", 3, 5, 2, true, "dilettant", 3, 5, 2}, 332 | {"raidie", 4, 6, 2, true, "raid", 4, 4, 2}, 333 | {"écuries", 2, 4, 3, true, "écur", 2, 4, 3}, 334 | {"recouvrît", 3, 6, 2, true, "recouvr", 3, 6, 2}, 335 | {"parsis", 3, 6, 3, true, "pars", 3, 4, 3}, 336 | {"monti", 3, 5, 2, true, "mont", 3, 4, 2}, 337 | {"reproduisit", 3, 6, 2, true, "reproduis", 3, 6, 2}, 338 | {"étendit", 2, 4, 3, true, "étend", 2, 4, 3}, 339 | {"suffi", 3, 5, 2, true, "suff", 3, 4, 2}, 340 | {"pillaji", 
3, 6, 2, true, "pillaj", 3, 6, 2}, 341 | {"rougir", 4, 6, 2, true, "roug", 4, 4, 2}, 342 | {"désirez", 3, 5, 2, true, "dés", 3, 3, 2}, 343 | {"subit", 3, 5, 2, true, "sub", 3, 3, 2}, 344 | {"fondirent", 3, 6, 2, true, "fond", 3, 4, 2}, 345 | {"coqUineries", 3, 6, 2, true, "coqUiner", 3, 6, 2}, 346 | {"venir", 3, 5, 2, true, "ven", 3, 3, 2}, 347 | {"plaidoirie", 5, 8, 3, true, "plaidoir", 5, 8, 3}, 348 | {"fournissant", 4, 7, 2, true, "fourn", 4, 5, 2}, 349 | {"bonzeries", 3, 6, 2, true, "bonzer", 3, 6, 2}, 350 | {"flétri", 4, 6, 3, true, "flétr", 4, 5, 3}, 351 | {"faillit", 4, 7, 2, true, "faill", 4, 5, 2}, 352 | {"hardie", 3, 6, 2, true, "hard", 3, 4, 2}, 353 | {"compagnie", 3, 6, 2, true, "compagn", 3, 6, 2}, 354 | {"vernis", 3, 6, 2, true, "vern", 3, 4, 2}, 355 | {"attendit", 2, 5, 4, true, "attend", 2, 5, 4}, 356 | {"blanchies", 4, 9, 3, true, "blanch", 4, 6, 3}, 357 | {"choisie", 5, 7, 3, true, "chois", 5, 5, 3}, 358 | {"rafraîchir", 3, 7, 2, true, "rafraîch", 3, 7, 2}, 359 | {"choisir", 5, 7, 3, true, "chois", 5, 5, 3}, 360 | {"nourrisse", 4, 7, 2, true, "nourr", 4, 5, 2}, 361 | {"chancellerie", 4, 7, 3, true, "chanceller", 4, 7, 3}, 362 | {"repartie", 3, 5, 2, true, "repart", 3, 5, 2}, 363 | {"redira", 3, 5, 2, true, "red", 3, 3, 2}, 364 | {"sentira", 3, 6, 2, true, "sent", 3, 4, 2}, 365 | {"surgirait", 3, 6, 2, true, "surg", 3, 4, 2}, 366 | {"cani", 3, 4, 2, true, "can", 3, 3, 2}, 367 | {"gratis", 4, 6, 3, true, "grat", 4, 4, 3}, 368 | {"médît", 3, 5, 2, true, "méd", 3, 3, 2}, 369 | {"avertis", 2, 4, 3, true, "avert", 2, 4, 3}, 370 | {"chirurgie", 4, 6, 3, true, "chirurg", 4, 6, 3}, 371 | {"ironie", 2, 4, 3, true, "iron", 2, 4, 3}, 372 | {"punîtes", 3, 5, 2, true, "pun", 3, 3, 2}, 373 | {"compromis", 3, 7, 2, true, "comprom", 3, 7, 2}, 374 | {"simonie", 3, 5, 2, true, "simon", 3, 5, 2}, 375 | } 376 | romance.RunStepTest(t, step2a, testCases) 377 | } 378 | 379 | // Test the removal of Verb suffixes in RV that 380 | // do not begin with "i". 
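Step 2b covers the remaining verb endings in RV that do not start with "i": the "-é/-ée/-és/-èrent" family, "-ait/-ant/-aIent", "-er/-ez", and so on; the rows below confirm that "posée" reduces to "pos" and "contentait" to "content". A sketch of one confirmed row (illustrative wrapper name; step2b is unexported, so this belongs inside package french with "fmt" imported):

func sketchStep2b() {
	w := snowballword.New("contentait")
	w.R1start, w.R2start, w.RVstart = 3, 6, 2
	step2b(w)
	fmt.Println(w.String()) // per the table below: "content"
}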
381 | // 382 | func Test_step2b(t *testing.T) { 383 | testCases := []romance.StepTestCase{ 384 | {"posée", 3, 5, 2, true, "pos", 3, 3, 2}, 385 | {"contentait", 3, 6, 2, true, "content", 3, 6, 2}, 386 | {"évita", 2, 4, 3, true, "évit", 2, 4, 3}, 387 | {"cantonnées", 3, 6, 2, true, "cantonn", 3, 6, 2}, 388 | {"tender", 3, 6, 2, true, "tend", 3, 4, 2}, 389 | {"survenait", 3, 6, 2, true, "surven", 3, 6, 2}, 390 | {"plongeaIent", 4, 8, 3, true, "plong", 4, 5, 3}, 391 | {"modéra", 3, 5, 2, true, "modér", 3, 5, 2}, 392 | {"copier", 3, 6, 2, true, "copi", 3, 4, 2}, 393 | {"bougez", 4, 6, 2, true, "boug", 4, 4, 2}, 394 | {"déploYaIent", 3, 6, 2, true, "déploY", 3, 6, 2}, 395 | {"entendra", 2, 5, 4, true, "entendr", 2, 5, 4}, 396 | {"blâmer", 4, 6, 3, true, "blâm", 4, 4, 3}, 397 | {"déshonorait", 3, 6, 2, true, "déshonor", 3, 6, 2}, 398 | {"concentrés", 3, 6, 2, true, "concentr", 3, 6, 2}, 399 | {"mangeant", 3, 7, 2, true, "mang", 3, 4, 2}, 400 | {"écouteront", 2, 5, 3, true, "écout", 2, 5, 3}, 401 | {"pressaIent", 4, 7, 3, true, "press", 4, 5, 3}, 402 | {"ébréché", 2, 5, 4, true, "ébréch", 2, 5, 4}, 403 | {"frapper", 4, 7, 3, true, "frapp", 4, 5, 3}, 404 | {"côtoYé", 3, 5, 2, true, "côtoY", 3, 5, 2}, 405 | {"réfugié", 3, 5, 2, true, "réfugi", 3, 5, 2}, 406 | {"jeûnant", 4, 6, 2, true, "jeûn", 4, 4, 2}, 407 | {"succombé", 3, 6, 2, true, "succomb", 3, 6, 2}, 408 | {"irrité", 2, 5, 4, true, "irrit", 2, 5, 4}, 409 | {"danger", 3, 6, 2, true, "dang", 3, 4, 2}, 410 | {"sachant", 3, 6, 2, true, "sach", 3, 4, 2}, 411 | {"reparaissaIent", 3, 5, 2, true, "reparaiss", 3, 5, 2}, 412 | {"reconnaissant", 3, 5, 2, true, "reconnaiss", 3, 5, 2}, 413 | {"faisant", 4, 6, 2, true, "fais", 4, 4, 2}, 414 | {"arrangés", 2, 5, 4, true, "arrang", 2, 5, 4}, 415 | {"emparés", 2, 5, 4, true, "empar", 2, 5, 4}, 416 | {"choqUée", 4, 7, 3, true, "choqU", 4, 5, 3}, 417 | {"gênait", 3, 6, 2, true, "gên", 3, 3, 2}, 418 | {"croissante", 5, 8, 3, true, "croiss", 5, 6, 3}, 419 | {"scié", 4, 4, 3, true, "sci", 3, 3, 3}, 420 | {"reconnaissez", 3, 5, 2, true, "reconnaiss", 3, 5, 2}, 421 | {"pliaIent", 5, 7, 3, true, "pli", 3, 3, 3}, 422 | {"expédia", 2, 5, 4, true, "expédi", 2, 5, 4}, 423 | {"déshabillaIent", 3, 6, 2, true, "déshabill", 3, 6, 2}, 424 | {"appréciée", 2, 6, 5, true, "appréci", 2, 6, 5}, 425 | {"amputés", 2, 5, 4, true, "amput", 2, 5, 4}, 426 | {"dominait", 3, 5, 2, true, "domin", 3, 5, 2}, 427 | {"vexantes", 3, 5, 2, true, "vex", 3, 3, 2}, 428 | {"fabriqUées", 3, 6, 2, true, "fabriqU", 3, 6, 2}, 429 | {"retomber", 3, 5, 2, true, "retomb", 3, 5, 2}, 430 | {"exercer", 2, 4, 3, true, "exerc", 2, 4, 3}, 431 | {"entourait", 2, 6, 4, true, "entour", 2, 6, 4}, 432 | {"voYait", 3, 6, 2, true, "voY", 3, 3, 2}, 433 | {"soupait", 4, 7, 2, true, "soup", 4, 4, 2}, 434 | {"apportiez", 2, 5, 4, true, "apport", 2, 5, 4}, 435 | {"tuée", 4, 4, 2, true, "tu", 2, 2, 2}, 436 | {"proposait", 4, 6, 3, true, "propos", 4, 6, 3}, 437 | {"citations", 3, 5, 2, true, "citat", 3, 5, 2}, 438 | {"distinguée", 3, 6, 2, true, "distingu", 3, 6, 2}, 439 | {"parlerez", 3, 6, 3, true, "parl", 3, 4, 3}, 440 | {"stanislas", 4, 6, 3, true, "stanisl", 4, 6, 3}, 441 | {"enlevée", 2, 5, 4, true, "enlev", 2, 5, 4}, 442 | {"irriguaIent", 2, 5, 4, true, "irrigu", 2, 5, 4}, 443 | {"contenant", 3, 6, 2, true, "conten", 3, 6, 2}, 444 | {"empêchèrent", 2, 5, 4, true, "empêch", 2, 5, 4}, 445 | {"inspirées", 2, 6, 5, true, "inspir", 2, 6, 5}, 446 | {"basée", 3, 5, 2, true, "bas", 3, 3, 2}, 447 | {"consultait", 3, 6, 2, true, "consult", 3, 6, 2}, 448 | {"retardait", 3, 5, 2, 
true, "retard", 3, 5, 2}, 449 | {"enlevât", 2, 5, 4, true, "enlev", 2, 5, 4}, 450 | {"convenaIent", 3, 6, 2, true, "conven", 3, 6, 2}, 451 | {"portât", 3, 6, 2, true, "port", 3, 4, 2}, 452 | {"admirée", 2, 5, 4, true, "admir", 2, 5, 4}, 453 | {"copiée", 3, 6, 2, true, "copi", 3, 4, 2}, 454 | {"démenaIent", 3, 5, 2, true, "démen", 3, 5, 2}, 455 | {"fortifiées", 3, 6, 2, true, "fortifi", 3, 6, 2}, 456 | {"apercevrait", 2, 4, 3, true, "apercevr", 2, 4, 3}, 457 | {"risqUer", 3, 7, 2, true, "risqU", 3, 5, 2}, 458 | {"réclamer", 3, 6, 2, true, "réclam", 3, 6, 2}, 459 | {"tremblaIent", 4, 8, 3, true, "trembl", 4, 6, 3}, 460 | {"calomnier", 3, 5, 2, true, "calomni", 3, 5, 2}, 461 | {"réclamée", 3, 6, 2, true, "réclam", 3, 6, 2}, 462 | {"déposât", 3, 5, 2, true, "dépos", 3, 5, 2}, 463 | {"filé", 3, 4, 2, true, "fil", 3, 3, 2}, 464 | {"déchirée", 3, 6, 2, true, "déchir", 3, 6, 2}, 465 | {"prononça", 4, 6, 3, true, "prononç", 4, 6, 3}, 466 | {"précédé", 4, 6, 3, true, "précéd", 4, 6, 3}, 467 | {"asseYait", 2, 5, 4, true, "asseY", 2, 5, 4}, 468 | {"emploYés", 2, 6, 5, true, "emploY", 2, 6, 5}, 469 | {"chagriner", 4, 7, 3, true, "chagrin", 4, 7, 3}, 470 | {"dévorât", 3, 5, 2, true, "dévor", 3, 5, 2}, 471 | {"remonté", 3, 5, 2, true, "remont", 3, 5, 2}, 472 | {"emploYant", 2, 6, 5, true, "emploY", 2, 6, 5}, 473 | {"redoublait", 3, 6, 2, true, "redoubl", 3, 6, 2}, 474 | {"marchant", 3, 7, 2, true, "march", 3, 5, 2}, 475 | {"pétrifiée", 3, 6, 2, true, "pétrifi", 3, 6, 2}, 476 | {"enlevées", 2, 5, 4, true, "enlev", 2, 5, 4}, 477 | {"donnassent", 3, 6, 2, true, "donn", 3, 4, 2}, 478 | {"recomptait", 3, 5, 2, true, "recompt", 3, 5, 2}, 479 | {"masqUait", 3, 8, 2, true, "masqU", 3, 5, 2}, 480 | {"renouvelèrent", 3, 6, 2, true, "renouvel", 3, 6, 2}, 481 | {"recoucher", 3, 6, 2, true, "recouch", 3, 6, 2}, 482 | {"abrégea", 2, 5, 4, true, "abrég", 2, 5, 4}, 483 | {"flattait", 4, 8, 3, true, "flatt", 4, 5, 3}, 484 | } 485 | romance.RunStepTest(t, step2b, testCases) 486 | } 487 | 488 | // Test the cleaning up of "Y" and "ç" suffixes. 
489 | // 490 | func Test_step3(t *testing.T) { 491 | testCases := []romance.StepTestCase{ 492 | {"ennuY", 5, 5, 5, true, "ennui", 5, 5, 5}, 493 | {"envoY", 5, 5, 4, true, "envoi", 5, 5, 4}, 494 | {"aboY", 4, 4, 3, true, "aboi", 4, 4, 3}, 495 | {"essaY", 5, 5, 4, true, "essai", 5, 5, 4}, 496 | {"effroY", 6, 6, 6, true, "effroi", 6, 6, 6}, 497 | {"désennuY", 8, 8, 8, true, "désennui", 8, 8, 8}, 498 | {"renvoY", 6, 6, 6, true, "renvoi", 6, 6, 6}, 499 | {"prononç", 7, 7, 3, true, "prononc", 7, 7, 3}, 500 | {"asseY", 5, 5, 5, true, "assei", 5, 5, 5}, 501 | {"croY", 4, 4, 3, true, "croi", 4, 4, 3}, 502 | {"asseY", 5, 5, 4, true, "assei", 5, 5, 4}, 503 | {"plaç", 4, 4, 3, true, "plac", 4, 4, 3}, 504 | {"ennuY", 5, 5, 5, true, "ennui", 5, 5, 5}, 505 | {"impaY", 5, 5, 5, true, "impai", 5, 5, 5}, 506 | {"déploY", 6, 6, 2, true, "déploi", 6, 6, 2}, 507 | {"avanç", 5, 5, 3, true, "avanc", 5, 5, 3}, 508 | {"recommenç", 9, 9, 2, true, "recommenc", 9, 9, 2}, 509 | {"pitoY", 5, 5, 5, true, "pitoi", 5, 5, 5}, 510 | {"renvoY", 6, 6, 6, true, "renvoi", 6, 6, 6}, 511 | {"choY", 4, 4, 4, true, "choi", 4, 4, 4}, 512 | {"effroY", 6, 6, 6, true, "effroi", 6, 6, 6}, 513 | {"forç", 4, 4, 2, true, "forc", 4, 4, 2}, 514 | {"envoY", 5, 5, 5, true, "envoi", 5, 5, 5}, 515 | {"paY", 3, 3, 3, true, "pai", 3, 3, 3}, 516 | {"bunhY", 5, 5, 2, true, "bunhi", 5, 5, 2}, 517 | } 518 | romance.RunStepTest(t, step3, testCases) 519 | } 520 | 521 | // Test 522 | // 523 | func Test_step4(t *testing.T) { 524 | testCases := []romance.StepTestCase{ 525 | {"défendues", 3, 5, 2, true, "défendu", 3, 5, 2}, 526 | {"mormones", 3, 6, 2, true, "mormon", 3, 6, 2}, 527 | {"souvienne", 4, 7, 2, true, "souvienn", 4, 7, 2}, 528 | {"poumons", 4, 6, 2, true, "poumon", 4, 6, 2}, 529 | {"relâche", 3, 5, 2, true, "relâch", 3, 5, 2}, 530 | {"ressource", 3, 7, 2, true, "ressourc", 3, 7, 2}, 531 | {"petits", 3, 5, 2, true, "petit", 3, 5, 2}, 532 | {"obstacles", 2, 6, 5, true, "obstacl", 2, 6, 5}, 533 | {"voisine", 4, 6, 2, true, "voisin", 4, 6, 2}, 534 | {"tunnels", 3, 6, 2, true, "tunnel", 3, 6, 2}, 535 | {"politesse", 3, 5, 2, true, "politess", 3, 5, 2}, 536 | {"obéisse", 2, 5, 3, true, "obéiss", 2, 5, 3}, 537 | {"brûlons", 4, 6, 3, true, "brûlon", 4, 6, 3}, 538 | {"tâchons", 3, 6, 2, true, "tâchon", 3, 6, 2}, 539 | {"gothiqUes", 3, 6, 2, true, "gothiqU", 3, 6, 2}, 540 | {"acqUise", 2, 6, 5, true, "acqUis", 2, 6, 5}, 541 | {"pigeons", 3, 6, 2, true, "pigeon", 3, 6, 2}, 542 | {"focs", 3, 4, 2, true, "foc", 3, 3, 2}, 543 | {"profondeurs", 4, 6, 3, true, "profondeur", 4, 6, 3}, 544 | {"mettrons", 3, 7, 2, true, "mettron", 3, 7, 2}, 545 | {"bavards", 3, 5, 2, true, "bavard", 3, 5, 2}, 546 | {"nigauds", 3, 6, 2, true, "nigaud", 3, 6, 2}, 547 | {"déesse", 4, 6, 2, true, "déess", 4, 5, 2}, 548 | {"libraires", 3, 7, 2, true, "librair", 3, 7, 2}, 549 | {"sentimentales", 3, 6, 2, true, "sentimental", 3, 6, 2}, 550 | {"libre", 3, 5, 2, true, "libr", 3, 4, 2}, 551 | {"matérielles", 3, 5, 2, true, "matériell", 3, 5, 2}, 552 | {"habitudes", 3, 5, 2, true, "habitud", 3, 5, 2}, 553 | {"blushes", 4, 7, 3, true, "blush", 4, 5, 3}, 554 | {"suppose", 3, 6, 2, true, "suppos", 3, 6, 2}, 555 | {"décrépitude", 3, 6, 2, true, "décrépitud", 3, 6, 2}, 556 | {"incluse", 2, 6, 5, true, "inclus", 2, 6, 5}, 557 | {"files", 3, 5, 2, true, "fil", 3, 3, 2}, 558 | {"côtes", 3, 5, 2, true, "côt", 3, 3, 2}, 559 | {"spirales", 4, 6, 3, true, "spiral", 4, 6, 3}, 560 | {"bamboches", 3, 6, 2, true, "bamboch", 3, 6, 2}, 561 | {"qUête", 4, 5, 3, true, "qUêt", 4, 4, 3}, 562 | {"siècles", 4, 7, 2, 
true, "siècl", 4, 5, 2}, 563 | {"glisse", 4, 6, 3, true, "gliss", 4, 5, 3}, 564 | {"carrosses", 3, 6, 2, true, "carross", 3, 6, 2}, 565 | {"supprime", 3, 7, 2, true, "supprim", 3, 7, 2}, 566 | {"officielle", 2, 5, 4, true, "officiell", 2, 5, 4}, 567 | {"vifs", 3, 4, 2, true, "vif", 3, 3, 2}, 568 | {"adresses", 2, 5, 4, true, "adress", 2, 5, 4}, 569 | {"hussards", 3, 6, 2, true, "hussard", 3, 6, 2}, 570 | {"colle", 3, 5, 3, true, "coll", 3, 4, 3}, 571 | {"amendes", 2, 4, 3, true, "amend", 2, 4, 3}, 572 | {"qUeUe", 4, 5, 3, true, "qUeU", 4, 4, 3}, 573 | {"écharpe", 2, 5, 4, true, "écharp", 2, 5, 4}, 574 | {"débute", 3, 5, 2, true, "début", 3, 5, 2}, 575 | {"refuse", 3, 5, 2, true, "refus", 3, 5, 2}, 576 | {"légers", 3, 5, 2, true, "léger", 3, 5, 2}, 577 | {"entrailles", 2, 7, 5, true, "entraill", 2, 7, 5}, 578 | {"écarlate", 2, 4, 3, true, "écarlat", 2, 4, 3}, 579 | {"manufacturières", 3, 5, 2, true, "manufacturi", 3, 5, 2}, 580 | {"instruire", 2, 8, 6, true, "instruir", 2, 8, 6}, 581 | {"danses", 3, 6, 2, true, "dans", 3, 4, 2}, 582 | {"lits", 3, 4, 2, true, "lit", 3, 3, 2}, 583 | {"cours", 4, 5, 2, true, "cour", 4, 4, 2}, 584 | {"belgirate", 3, 6, 2, true, "belgirat", 3, 6, 2}, 585 | {"délire", 3, 5, 2, true, "délir", 3, 5, 2}, 586 | {"offenses", 2, 5, 4, true, "offens", 2, 5, 4}, 587 | {"athènes", 2, 5, 4, true, "athèn", 2, 5, 4}, 588 | {"alphabets", 2, 6, 5, true, "alphabet", 2, 6, 5}, 589 | {"ascagne", 2, 5, 4, true, "ascagn", 2, 5, 4}, 590 | {"lièvre", 4, 6, 2, true, "lièvr", 4, 5, 2}, 591 | {"hercule", 3, 6, 2, true, "hercul", 3, 6, 2}, 592 | {"casqUe", 3, 6, 2, true, "casqU", 3, 5, 2}, 593 | {"cachons", 3, 6, 2, true, "cachon", 3, 6, 2}, 594 | {"herbe", 3, 5, 2, true, "herb", 3, 4, 2}, 595 | {"banqUette", 3, 7, 2, true, "banqUett", 3, 7, 2}, 596 | {"actuelles", 2, 6, 4, true, "actuell", 2, 6, 4}, 597 | {"intercession", 2, 5, 4, true, "intercess", 2, 5, 4}, 598 | {"pêle", 3, 4, 2, true, "pêl", 3, 3, 2}, 599 | {"grossières", 4, 8, 3, true, "grossi", 4, 6, 3}, 600 | {"qUelle", 4, 6, 3, true, "qUell", 4, 5, 3}, 601 | {"séduits", 3, 6, 2, true, "séduit", 3, 6, 2}, 602 | {"vengeance", 3, 7, 2, true, "vengeanc", 3, 7, 2}, 603 | {"indécentes", 2, 5, 4, true, "indécent", 2, 5, 4}, 604 | {"bergères", 3, 6, 2, true, "bergèr", 3, 6, 2}, 605 | {"fenestrelles", 3, 5, 2, true, "fenestrell", 3, 5, 2}, 606 | {"croupe", 5, 6, 3, true, "croup", 5, 5, 3}, 607 | {"légitime", 3, 5, 2, true, "légitim", 3, 5, 2}, 608 | {"ferrare", 3, 6, 2, true, "ferrar", 3, 6, 2}, 609 | {"briqUe", 4, 6, 3, true, "briqU", 4, 5, 3}, 610 | {"étrangère", 2, 5, 4, true, "étrangèr", 2, 5, 4}, 611 | {"arqUés", 2, 6, 5, true, "arqUé", 2, 5, 5}, 612 | {"guèbres", 4, 7, 2, true, "guèbr", 4, 5, 2}, 613 | {"partons", 3, 6, 3, true, "parton", 3, 6, 3}, 614 | {"distingue", 3, 6, 2, true, "distingu", 3, 6, 2}, 615 | {"paratonnerres", 3, 5, 3, true, "paratonnerr", 3, 5, 3}, 616 | {"anonyme", 2, 4, 3, true, "anonym", 2, 4, 3}, 617 | {"volutes", 3, 5, 2, true, "volut", 3, 5, 2}, 618 | {"décence", 3, 5, 2, true, "décenc", 3, 5, 2}, 619 | {"coupure", 4, 6, 2, true, "coupur", 4, 6, 2}, 620 | {"avarice", 2, 4, 3, true, "avaric", 2, 4, 3}, 621 | {"sensible", 3, 6, 2, true, "sensibl", 3, 6, 2}, 622 | {"cramponne", 4, 7, 3, true, "cramponn", 4, 7, 3}, 623 | {"sympathise", 3, 6, 2, true, "sympathis", 3, 6, 2}, 624 | {"assidue", 2, 5, 4, true, "assidu", 2, 5, 4}, 625 | } 626 | romance.RunStepTest(t, step4, testCases) 627 | } 628 | 629 | // Test a large set of words for which we know 630 | // the correct stemmed form. 
631 | // 632 | func Test_FrenchVocabulary(t *testing.T) { 633 | testCases := []struct { 634 | in string 635 | out string 636 | }{ 637 | {"battements", "batt"}, 638 | {"mélangé", "mélang"}, 639 | {"impériales", "impérial"}, 640 | {"paragraphe", "paragraph"}, 641 | {"charité", "charit"}, 642 | {"reproche", "reproch"}, 643 | {"belvédère", "belvéder"}, 644 | {"illisible", "illisibl"}, 645 | {"pleurs", "pleur"}, 646 | {"passait", "pass"}, 647 | {"heaviest", "heaviest"}, 648 | {"correspondance", "correspond"}, 649 | {"c", "c"}, 650 | {"profitable", "profit"}, 651 | {"remontrance", "remontr"}, 652 | {"ramasseraient", "ramass"}, 653 | {"arrivera", "arriv"}, 654 | {"canta", "cant"}, 655 | {"évanouie", "évanou"}, 656 | {"bleuâtres", "bleuâtr"}, 657 | {"achetées", "achet"}, 658 | {"bazars", "bazar"}, 659 | {"affections", "affect"}, 660 | {"luttent", "luttent"}, 661 | {"recouvra", "recouvr"}, 662 | {"regorgent", "regorgent"}, 663 | {"pruderie", "pruder"}, 664 | {"entomologique", "entomolog"}, 665 | {"jansénisme", "jansen"}, 666 | {"tourne", "tourn"}, 667 | {"tuer", "tu"}, 668 | {"concluantes", "conclu"}, 669 | {"subi", "sub"}, 670 | {"agent", "agent"}, 671 | {"instantanément", "instantan"}, 672 | {"gustave", "gustav"}, 673 | {"colossales", "colossal"}, 674 | {"nothing", "nothing"}, 675 | {"quantièmes", "quantiem"}, 676 | {"aidez", "aid"}, 677 | {"horlogerie", "horloger"}, 678 | {"ranimer", "ranim"}, 679 | {"landau", "landau"}, 680 | {"mêler", "mêl"}, 681 | {"scrupuleusement", "scrupul"}, 682 | {"poitrail", "poitrail"}, 683 | {"chaudement", "chaud"}, 684 | {"impiété", "impiet"}, 685 | {"redoublaient", "redoubl"}, 686 | {"punira", "pun"}, 687 | {"proposa", "propos"}, 688 | {"envolés", "envol"}, 689 | {"réparer", "répar"}, 690 | {"inventer", "invent"}, 691 | {"précision", "précis"}, 692 | {"déguisa", "déguis"}, 693 | {"plantations", "plantat"}, 694 | {"appliqua", "appliqu"}, 695 | {"plat", "plat"}, 696 | {"préfète", "préfet"}, 697 | {"baisers", "baiser"}, 698 | {"calmèrent", "calm"}, 699 | {"tressé", "tress"}, 700 | {"consulta", "consult"}, 701 | {"dédaigneux", "dédaign"}, 702 | {"dithyrambe", "dithyramb"}, 703 | {"obligera", "oblig"}, 704 | {"nommés", "nomm"}, 705 | {"mousseux", "mousseux"}, 706 | {"pusillanimes", "pusillanim"}, 707 | {"richissime", "richissim"}, 708 | {"weber", "web"}, 709 | {"groupes", "group"}, 710 | {"rentra", "rentr"}, 711 | {"persécuté", "persécut"}, 712 | {"nuiraient", "nuir"}, 713 | {"ayant", "ayant"}, 714 | {"joueraient", "jou"}, 715 | {"attenante", "atten"}, 716 | {"formait", "form"}, 717 | {"encombrées", "encombr"}, 718 | {"sifflait", "siffl"}, 719 | {"lire", "lir"}, 720 | {"faciliter", "facilit"}, 721 | {"casse", "cass"}, 722 | {"remit", "rem"}, 723 | {"profond", "profond"}, 724 | {"sortez", "sort"}, 725 | {"boiteux", "boiteux"}, 726 | {"flatteuses", "flatteux"}, 727 | {"plafonds", "plafond"}, 728 | {"trahît", "trah"}, 729 | {"lesquelles", "lesquel"}, 730 | {"fantaisies", "fantais"}, 731 | {"séduite", "séduit"}, 732 | {"consolée", "consol"}, 733 | {"estomac", "estomac"}, 734 | {"adverbe", "adverb"}, 735 | {"promenés", "promen"}, 736 | {"côte", "côt"}, 737 | {"flegme", "flegm"}, 738 | {"végétaient", "véget"}, 739 | {"annoncerait", "annonc"}, 740 | {"quais", "quais"}, 741 | {"hissa", "hiss"}, 742 | {"protection", "protect"}, 743 | {"destine", "destin"}, 744 | {"justice", "justic"}, 745 | {"fili", "fil"}, 746 | {"conduite", "conduit"}, 747 | {"narra", "narr"}, 748 | {"torturé", "tortur"}, 749 | {"couloirs", "couloir"}, 750 | {"bronché", "bronch"}, 751 | {"oeuvres", "oeuvr"}, 
752 | {"retire", "retir"}, 753 | {"laisserai", "laiss"}, 754 | {"rassura", "rassur"}, 755 | {"leipsick", "leipsick"}, 756 | {"gâte", "gât"}, 757 | {"désormais", "désorm"}, 758 | {"pain", "pain"}, 759 | {"pianos", "pianos"}, 760 | {"opérée", "oper"}, 761 | {"effrayèrent", "effrai"}, 762 | {"sachez", "sach"}, 763 | {"répétées", "répet"}, 764 | {"time", "tim"}, 765 | {"golgonda", "golgond"}, 766 | {"occupèrent", "occup"}, 767 | {"embrasserais", "embrass"}, 768 | {"dévorante", "dévor"}, 769 | {"soutenant", "souten"}, 770 | {"voluptueuse", "voluptu"}, 771 | {"vicomtes", "vicomt"}, 772 | {"constante", "const"}, 773 | {"admirable", "admir"}, 774 | {"déroger", "dérog"}, 775 | {"survit", "surv"}, 776 | {"manquerais", "manqu"}, 777 | {"remontrer", "remontr"}, 778 | {"exercent", "exercent"}, 779 | {"outrageantes", "outrag"}, 780 | {"dépôt", "dépôt"}, 781 | {"engagées", "engag"}, 782 | {"rouvray", "rouvray"}, 783 | {"comprenez", "compren"}, 784 | {"imprudentes", "imprudent"}, 785 | {"billards", "billard"}, 786 | {"tremblante", "trembl"}, 787 | {"impie", "impi"}, 788 | {"peu", "peu"}, 789 | {"indigène", "indigen"}, 790 | {"social", "social"}, 791 | {"consigne", "consign"}, 792 | {"emporterait", "emport"}, 793 | {"rocky", "rocky"}, 794 | {"cosmopolite", "cosmopolit"}, 795 | {"police", "polic"}, 796 | {"jeun", "jeun"}, 797 | {"lourdes", "lourd"}, 798 | {"extraordinaire", "extraordinair"}, 799 | {"dérangeait", "dérang"}, 800 | {"long", "long"}, 801 | {"empressées", "empress"}, 802 | {"capitulation", "capitul"}, 803 | {"giration", "girat"}, 804 | {"guidés", "guid"}, 805 | {"bourbiers", "bourbi"}, 806 | {"provisions", "provis"}, 807 | {"dois", "dois"}, 808 | {"squelette", "squelet"}, 809 | {"extravagante", "extravag"}, 810 | {"bruns", "brun"}, 811 | {"considérerais", "consider"}, 812 | {"entièrement", "entier"}, 813 | {"suffocations", "suffoc"}, 814 | {"diminue", "diminu"}, 815 | {"froissants", "froiss"}, 816 | {"avalé", "aval"}, 817 | {"détacher", "détach"}, 818 | {"remplace", "remplac"}, 819 | {"exagérait", "exager"}, 820 | {"élévations", "élev"}, 821 | {"exagérant", "exager"}, 822 | {"promenaient", "promen"}, 823 | {"antidatée", "antidat"}, 824 | {"touchait", "touch"}, 825 | {"aimerait", "aim"}, 826 | {"lope", "lop"}, 827 | {"tranchait", "tranch"}, 828 | {"environnent", "environnent"}, 829 | {"inondation", "inond"}, 830 | {"frayeur", "frayeur"}, 831 | {"solaire", "solair"}, 832 | {"oysters", "oyster"}, 833 | {"rêveuse", "rêveux"}, 834 | {"concession", "concess"}, 835 | {"existé", "exist"}, 836 | {"promener", "promen"}, 837 | } 838 | for _, testCase := range testCases { 839 | result := Stem(testCase.in, true) 840 | if result != testCase.out { 841 | t.Errorf("Expected %v -> %v, but got %v", testCase.in, testCase.out, result) 842 | } 843 | } 844 | } 845 | --------------------------------------------------------------------------------
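Taken together, these language test files also document the per-language API: the packages exercised above each expose a Stem(word, stemStopWords) function returning the stemmed string, and the Spanish and French packages additionally export IsStopWord. A small cross-language sketch built only from results the tests above confirm:

package main

import (
	"fmt"

	"github.com/kljensen/snowball/english"
	"github.com/kljensen/snowball/french"
	"github.com/kljensen/snowball/spanish"
)

func main() {
	fmt.Println(english.Stem("skating", true)) // "skate", per Test_Stem in english_test.go
	fmt.Println(french.Stem("passait", true))  // "pass", per Test_FrenchVocabulary
	fmt.Println(spanish.IsStopWord("el"))      // true, per Test_stopWords in spanish_test.go
}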