├── go.sum ├── go.mod ├── .travis.yml ├── swedish ├── preprocess.go ├── README.md ├── step2.go ├── stem.go ├── step3.go ├── step1.go ├── common.go └── swedish_test.go ├── norwegian ├── preprocess.go ├── README.md ├── step2.go ├── stem.go ├── step3.go ├── step1.go ├── common.go └── norwegian_test.go ├── english ├── postprocess.go ├── step0.go ├── preprocess.go ├── step1c.go ├── README.md ├── stem.go ├── step5.go ├── step1a.go ├── step4.go ├── step3.go ├── step2.go ├── step1b.go ├── common.go └── english_test.go ├── spanish ├── postprocess.go ├── preprocess.go ├── README.md ├── step2a.go ├── step3.go ├── stem.go ├── step2b.go ├── step0.go ├── step1.go ├── common.go └── spanish_test.go ├── russian ├── preprocess.go ├── step2.go ├── step3.go ├── stem.go ├── step4.go ├── README.md ├── common.go └── step1.go ├── french ├── preprocess.go ├── step5.go ├── postprocess.go ├── step3.go ├── step6.go ├── step2a.go ├── stem.go ├── step2b.go ├── step4.go ├── common.go ├── step1.go └── french_test.go ├── .gitignore ├── .github └── workflows │ └── test.yml ├── hungarian ├── common_test.go ├── README.md ├── stem_test.go ├── common.go └── stem.go ├── HISTORY.md ├── romance ├── common.go └── testing_helpers.go ├── gostem └── gostem.go ├── snowball.go ├── LICENSE ├── snowball_test.go ├── snowballword ├── snowballword_test.go └── snowballword.go └── README.md /go.sum: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/kljensen/snowball 2 | 3 | go 1.19 4 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Travis-CI configuration. See 2 | # http://about.travis-ci.org/docs 3 | language: go 4 | install: echo "Skipping default travis install step" 5 | script: 6 | - curl https://raw.github.com/daaku/go.travis/master/install | sh -------------------------------------------------------------------------------- /swedish/preprocess.go: -------------------------------------------------------------------------------- 1 | package swedish 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | // Get the r1 of the word 8 | // 9 | func preprocess(word *snowballword.SnowballWord) { 10 | // Find the region R1. R2 is not used 11 | word.R1start = r1(word) 12 | } 13 | -------------------------------------------------------------------------------- /norwegian/preprocess.go: -------------------------------------------------------------------------------- 1 | package norwegian 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | // Get the r1 of the word 8 | // 9 | func preprocess(word *snowballword.SnowballWord) { 10 | // Find the region R1. R2 is not used 11 | word.R1start = r1(word) 12 | } 13 | -------------------------------------------------------------------------------- /english/postprocess.go: -------------------------------------------------------------------------------- 1 | package english 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | // Applies transformations necessary after 8 | // a word has been completely processed. 
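// For English this amounts to lowercasing any "Y" runes that
// preprocessing capitalized to mark a consonant-like y.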
9 | // 10 | func postprocess(word *snowballword.SnowballWord) { 11 | 12 | uncapitalizeYs(word) 13 | } 14 | -------------------------------------------------------------------------------- /spanish/postprocess.go: -------------------------------------------------------------------------------- 1 | package spanish 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | // Applies transformations necessary after 8 | // a word has been completely processed. 9 | // 10 | func postprocess(word *snowballword.SnowballWord) { 11 | 12 | removeAccuteAccents(word) 13 | } 14 | -------------------------------------------------------------------------------- /spanish/preprocess.go: -------------------------------------------------------------------------------- 1 | package spanish 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | func preprocess(word *snowballword.SnowballWord) { 8 | r1start, r2start, rvstart := findRegions(word) 9 | word.R1start = r1start 10 | word.R2start = r2start 11 | word.RVstart = rvstart 12 | } 13 | -------------------------------------------------------------------------------- /russian/preprocess.go: -------------------------------------------------------------------------------- 1 | package russian 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | func preprocess(word *snowballword.SnowballWord) { 8 | 9 | r1start, r2start, rvstart := findRegions(word) 10 | word.R1start = r1start 11 | word.R2start = r2start 12 | word.RVstart = rvstart 13 | 14 | } 15 | -------------------------------------------------------------------------------- /russian/step2.go: -------------------------------------------------------------------------------- 1 | package russian 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | // Step 2 is the removal of the "и" suffix. 
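// The suffix is removed only when it lies within the RV region.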
8 | func step2(word *snowballword.SnowballWord) bool { 9 | suffix := word.RemoveFirstSuffixIn(word.RVstart, "и") 10 | if suffix != "" { 11 | return true 12 | } 13 | return false 14 | } 15 | -------------------------------------------------------------------------------- /french/preprocess.go: -------------------------------------------------------------------------------- 1 | package french 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | func preprocess(word *snowballword.SnowballWord) { 8 | 9 | capitalizeYUI(word) 10 | 11 | r1start, r2start, rvstart := findRegions(word) 12 | word.R1start = r1start 13 | word.R2start = r2start 14 | word.RVstart = rvstart 15 | 16 | } 17 | -------------------------------------------------------------------------------- /french/step5.go: -------------------------------------------------------------------------------- 1 | package french 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | // Step 5 Undouble non-vowel endings 8 | func step5(word *snowballword.SnowballWord) bool { 9 | 10 | suffix := word.FirstSuffix("enn", "onn", "ett", "ell", "eill") 11 | if suffix != "" { 12 | word.RemoveLastNRunes(1) 13 | } 14 | return false 15 | } 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | 10 | # Architecture specific extensions/prefixes 11 | *.[568vq] 12 | [568vq].out 13 | 14 | *.cgo1.go 15 | *.cgo2.c 16 | _cgo_defun.c 17 | _cgo_gotypes.go 18 | _cgo_export.* 19 | 20 | _testmain.go 21 | 22 | *.exe 23 | 24 | tmp/* 25 | */tmp/* -------------------------------------------------------------------------------- /english/step0.go: -------------------------------------------------------------------------------- 1 | package english 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 0 is to strip off apostrophes and "s". 10 | func step0(w *snowballword.SnowballWord) bool { 11 | suffix := w.FirstSuffix("'s'", "'s", "'") 12 | if suffix == "" { 13 | return false 14 | } 15 | suffixLength := utf8.RuneCountInString(suffix) 16 | w.RemoveLastNRunes(suffixLength) 17 | return true 18 | } 19 | -------------------------------------------------------------------------------- /russian/step3.go: -------------------------------------------------------------------------------- 1 | package russian 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | // Step 3 is the removal of the derivational suffix. 8 | func step3(word *snowballword.SnowballWord) bool { 9 | 10 | // Search for a DERIVATIONAL ending in R2 (i.e. the entire 11 | // ending must lie in R2), and if one is found, remove it. 12 | 13 | suffix := word.RemoveFirstSuffixIn(word.R2start, "ост", "ость") 14 | if suffix != "" { 15 | return true 16 | } 17 | return false 18 | } 19 | -------------------------------------------------------------------------------- /swedish/README.md: -------------------------------------------------------------------------------- 1 | Snowball Swedish 2 | ================ 3 | 4 | This package implements the Swedish language 5 | [Snowball stemmer](http://snowball.tartarus.org/algorithms/swedish/stemmer.html). 6 | 7 | ## Implementation 8 | 9 | The Swedish language stemmer comprises preprocessing and 3 steps. 
10 | Each of these is defined in a separate file in this 11 | package. All of the steps operate on a `SnowballWord` from the 12 | `snowballword` package and *modify the word in place*. 13 | 14 | ## Caveats 15 | 16 | None -------------------------------------------------------------------------------- /french/postprocess.go: -------------------------------------------------------------------------------- 1 | package french 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | func postprocess(word *snowballword.SnowballWord) { 8 | 9 | // Turn "I", "U", and "Y" into "i", "u", and "y". 10 | // Equivalently, unicode code points 11 | // 73 85 89 -> 105 117 121 12 | 13 | for i := 0; i < len(word.RS); i++ { 14 | switch word.RS[i] { 15 | case 73: 16 | word.RS[i] = 105 17 | case 85: 18 | word.RS[i] = 117 19 | case 89: 20 | word.RS[i] = 121 21 | } 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /norwegian/README.md: -------------------------------------------------------------------------------- 1 | Snowball Norwegian 2 | ================ 3 | 4 | This package implements the Norwegian language 5 | [Snowball stemmer](http://snowball.tartarus.org/algorithms/norwegian/stemmer.html). 6 | 7 | ## Implementation 8 | 9 | The Norwegian language stemmer comprises preprocessing and 3 steps. 10 | Each of these is defined in a separate file in this 11 | package. All of the steps operate on a `SnowballWord` from the 12 | `snowballword` package and *modify the word in place*. 13 | 14 | ## Caveats 15 | 16 | None 17 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | on: [push, pull_request] 2 | name: Build 3 | jobs: 4 | test: 5 | strategy: 6 | matrix: 7 | go-version: [1.19.x, 1.20.x, 1.21.x, 1.22.x] 8 | os: [ubuntu-latest, macos-latest, windows-latest] 9 | runs-on: ${{ matrix.os }} 10 | steps: 11 | - name: Install Go 12 | uses: actions/setup-go@v5 13 | with: 14 | go-version: ${{ matrix.go-version }} 15 | - name: Checkout code 16 | uses: actions/checkout@v4 17 | - name: Test 18 | run: go test ./... 19 | -------------------------------------------------------------------------------- /spanish/README.md: -------------------------------------------------------------------------------- 1 | Snowball Spanish 2 | ================ 3 | 4 | This package implements the 5 | [Spanish language Snowball stemmer](http://snowball.tartarus.org/algorithms/spanish/stemmer.html). 6 | 7 | ## Implementation 8 | 9 | The Spanish language stemmer comprises preprocessing, a number of steps, 10 | and postprocessing. Each of these is defined in a separate file in this 11 | package. All of the steps operate on a `SnowballWord` from the 12 | `snowballword` package and *modify the word in place*. 13 | 14 | ## Caveats 15 | 16 | None yet. 
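
## Example

As a reference, here is a minimal sketch of calling this package directly; the example input word is arbitrary, and the top-level `snowball` package exposes the same stemmer through `snowball.Stem`.

```go
package main

import (
	"fmt"

	"github.com/kljensen/snowball/spanish"
)

func main() {
	// The second argument controls whether stop words are
	// stemmed or returned unchanged.
	fmt.Println(spanish.Stem("albergues", false))
}
```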
-------------------------------------------------------------------------------- /hungarian/common_test.go: -------------------------------------------------------------------------------- 1 | package hungarian 2 | 3 | import ( 4 | "testing" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | func TestFindRegions(t *testing.T) { 10 | for k, want := range map[string]int{ 11 | "tóban": 2, // consonant-vowel 12 | "ablakan": 2, // vowel-consonant 13 | "acsony": 3, // vowel-digraph 14 | "cvs": 3, // null R1 region 15 | } { 16 | got := findRegions(snowballword.New(k)) 17 | if got != want { 18 | t.Errorf("%q: got %d, wanted %d", k, got, want) 19 | } 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /hungarian/README.md: -------------------------------------------------------------------------------- 1 | Snowball Hungarian 2 | ================ 3 | 4 | This package implements the 5 | [Hungarian language Snowball stemmer](https://snowballstem.org/algorithms/hungarian/stemmer.html) 6 | algorithm by [atordai@science.uval.nl](Anna Tordai). 7 | 8 | ## Implementation 9 | 10 | The Hungarian language stemmer comprises preprocessing, a number of steps, 11 | and postprocessing. Each of these is defined in a separate file in this 12 | package. All of the steps operate on a `SnowballWord` from the 13 | `snowballword` package and *modify the word in place*. 14 | 15 | -------------------------------------------------------------------------------- /norwegian/step2.go: -------------------------------------------------------------------------------- 1 | package norwegian 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 2: Search for one of the following suffixes in R1, 10 | // and if found delete the last letter. 11 | func step2(w *snowballword.SnowballWord) bool { 12 | 13 | suffix := w.FirstSuffix("dt", "vt") 14 | suffixLength := utf8.RuneCountInString(suffix) 15 | 16 | // If it is not in R1, do nothing 17 | if suffix == "" || suffixLength > len(w.RS)-w.R1start { 18 | return false 19 | } 20 | w.RemoveLastNRunes(1) 21 | return true 22 | } 23 | -------------------------------------------------------------------------------- /HISTORY.md: -------------------------------------------------------------------------------- 1 | History 2 | ======= 3 | 4 | ### v0.3.4 / 2013-05-19 5 | 6 | Add gostem program 7 | 8 | ### v0.3.3 / 2013-05-19 9 | 10 | Add large vocabulary tests for each language 11 | 12 | ### v0.3.1 / 2013-05-18 13 | 14 | Meaningless bump 15 | 16 | ### v0.3.0 / 2013-05-18 17 | 18 | Add Russian stemmer. 19 | 20 | ### v0.2.0 / 2013-05-17 21 | 22 | Add French stemmer and move more common code for romance 23 | languages into the `romance` package. 24 | 25 | ### v0.1.1 / 2013-05-14 26 | 27 | Documentation fixes. 28 | 29 | ### v0.1.0 / 2013-05-13 30 | 31 | Added Spanish stemmer and started versioning the project. -------------------------------------------------------------------------------- /french/step3.go: -------------------------------------------------------------------------------- 1 | package french 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | // Step 3 is the cleaning up of "Y" and "ç" suffixes. 
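// Both characters can be left over from earlier processing: "Y" is
// introduced by capitalizeYUI during preprocessing, and a final "ç"
// may be exposed once a suffix has been stripped.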
8 | // 9 | func step3(word *snowballword.SnowballWord) bool { 10 | 11 | // Replace final Y with i or final ç with c 12 | if idx := len(word.RS) - 1; idx >= 0 { 13 | 14 | switch word.RS[idx] { 15 | 16 | case 89: 17 | // Replace Y (89) with "i" (105) 18 | word.RS[idx] = 105 19 | return true 20 | 21 | case 231: 22 | // Replace ç (231) with "c" (99) 23 | word.RS[idx] = 99 24 | return true 25 | } 26 | } 27 | return false 28 | } 29 | -------------------------------------------------------------------------------- /swedish/step2.go: -------------------------------------------------------------------------------- 1 | package swedish 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 2: Search for one of the following suffixes in R1, 10 | // and if found delete the last letter. 11 | func step2(w *snowballword.SnowballWord) bool { 12 | 13 | suffix := w.FirstSuffix( 14 | "dd", "gd", "nn", "dt", "gt", "kt", "tt", 15 | ) 16 | suffixLength := utf8.RuneCountInString(suffix) 17 | 18 | // If it is not in R1, do nothing 19 | if suffix == "" || suffixLength > len(w.RS)-w.R1start { 20 | return false 21 | } 22 | w.RemoveLastNRunes(1) 23 | return true 24 | } 25 | -------------------------------------------------------------------------------- /russian/stem.go: -------------------------------------------------------------------------------- 1 | package russian 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | "strings" 6 | ) 7 | 8 | // Stem an Russian word. This is the only exported 9 | // function in this package. 10 | // 11 | func Stem(word string, stemStopwWords bool) string { 12 | 13 | word = strings.ToLower(strings.TrimSpace(word)) 14 | w := snowballword.New(word) 15 | 16 | // Return small words and stop words 17 | if len(w.RS) <= 2 || (stemStopwWords == false && IsStopWord(word)) { 18 | return word 19 | } 20 | 21 | preprocess(w) 22 | step1(w) 23 | step2(w) 24 | step3(w) 25 | step4(w) 26 | return w.String() 27 | 28 | } 29 | -------------------------------------------------------------------------------- /english/preprocess.go: -------------------------------------------------------------------------------- 1 | package english 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | // Applies various transformations necessary for the 8 | // other, subsequent stemming steps. Most important 9 | // of which is defining the two regions R1 & R2. 10 | // 11 | func preprocess(word *snowballword.SnowballWord) { 12 | 13 | // Clean up apostrophes 14 | normalizeApostrophes(word) 15 | trimLeftApostrophes(word) 16 | 17 | // Capitalize Y's that are not behaving 18 | // as vowels. 19 | capitalizeYs(word) 20 | 21 | // Find the two regions, R1 & R2 22 | r1start, r2start := r1r2(word) 23 | word.R1start = r1start 24 | word.R2start = r2start 25 | } 26 | -------------------------------------------------------------------------------- /english/step1c.go: -------------------------------------------------------------------------------- 1 | package english 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | // Step 1c is the normalization of various "y" endings. 8 | // 9 | func step1c(w *snowballword.SnowballWord) bool { 10 | 11 | rsLen := len(w.RS) 12 | 13 | // Replace suffix y or Y by i if preceded by a non-vowel which is not 14 | // the first letter of the word (so cry -> cri, by -> by, say -> say) 15 | // 16 | // Note: the unicode code points for 17 | // y, Y, & i are 121, 89, & 105 respectively. 
18 | // 19 | if len(w.RS) > 2 && (w.RS[rsLen-1] == 121 || w.RS[rsLen-1] == 89) && !isLowerVowel(w.RS[rsLen-2]) { 20 | w.RS[rsLen-1] = 105 21 | return true 22 | } 23 | return false 24 | } 25 | -------------------------------------------------------------------------------- /english/README.md: -------------------------------------------------------------------------------- 1 | Snowball English 2 | ================ 3 | 4 | This package implements the English language 5 | [Snowball stemmer](http://snowball.tartarus.org/algorithms/english/stemmer.html). 6 | 7 | ## Implementation 8 | 9 | The English language stemmer comprises preprocessing, a number of steps, 10 | and postprocessing. Each of these is defined in a separate file in this 11 | package. All of the steps operate on a `SnowballWord` from the 12 | `snowballword` package and *modify the word in place*. 13 | 14 | ## Caveats 15 | 16 | There is a single difference between this implementation and the original. 17 | Here, all apostrophes on the left hand side of a word are stripped off before 18 | the word is stemmed. -------------------------------------------------------------------------------- /norwegian/stem.go: -------------------------------------------------------------------------------- 1 | package norwegian 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | "strings" 6 | ) 7 | 8 | // Stem a Norwegian word. This is the only exported 9 | // function in this package. 10 | // 11 | func Stem(word string, stemStopwWords bool) string { 12 | 13 | word = strings.ToLower(strings.TrimSpace(word)) 14 | 15 | // Return small words and stop words 16 | if len(word) <= 2 || (stemStopwWords == false && IsStopWord(word)) { 17 | return word 18 | } 19 | 20 | w := snowballword.New(word) 21 | 22 | // Stem the word. Note, each of these 23 | // steps will alter `w` in place. 24 | // 25 | preprocess(w) 26 | step1(w) 27 | step2(w) 28 | step3(w) 29 | 30 | return w.String() 31 | 32 | } 33 | -------------------------------------------------------------------------------- /swedish/stem.go: -------------------------------------------------------------------------------- 1 | package swedish 2 | 3 | import ( 4 | "strings" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Stem a Swedish word. This is the only exported 10 | // function in this package. 11 | // 12 | func Stem(word string, stemStopwWords bool) string { 13 | 14 | word = strings.ToLower(strings.TrimSpace(word)) 15 | 16 | // Return small words and stop words 17 | if len(word) <= 2 || (stemStopwWords == false && IsStopWord(word)) { 18 | return word 19 | } 20 | 21 | w := snowballword.New(word) 22 | 23 | // Stem the word. Note, each of these 24 | // steps will alter `w` in place. 25 | // 26 | preprocess(w) 27 | step1(w) 28 | step2(w) 29 | step3(w) 30 | 31 | return w.String() 32 | 33 | } 34 | -------------------------------------------------------------------------------- /norwegian/step3.go: -------------------------------------------------------------------------------- 1 | package norwegian 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | // Step 3: 8 | // Search for the longest among the following suffixes, 9 | // and, if found and in R1, delete. 10 | 11 | func step3(w *snowballword.SnowballWord) bool { 12 | // Possible sufficies for this step, longest first. 
13 | suffix := w.FirstSuffixIn(w.R1start, len(w.RS), 14 | "hetslov", "eleg", "elig", "elov", "slov", 15 | "leg", "eig", "lig", "els", "lov", "ig", 16 | ) 17 | suffixRunes := []rune(suffix) 18 | 19 | // If it is not in R1, do nothing 20 | if suffix == "" || len(suffixRunes) > len(w.RS)-w.R1start { 21 | return false 22 | } 23 | 24 | w.ReplaceSuffixRunes(suffixRunes, []rune(""), true) 25 | return true 26 | 27 | } 28 | -------------------------------------------------------------------------------- /spanish/step2a.go: -------------------------------------------------------------------------------- 1 | package spanish 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 2a is the removal of verb suffixes beginning y, 10 | // Search for the longest among the following suffixes 11 | // in RV, and if found, delete if preceded by u. 12 | func step2a(word *snowballword.SnowballWord) bool { 13 | suffix := word.FirstSuffixIn(word.RVstart, len(word.RS), "ya", "ye", "yan", "yen", "yeron", "yendo", "yo", "yó", "yas", "yes", "yais", "yamos") 14 | if suffix != "" { 15 | suffixLength := utf8.RuneCountInString(suffix) 16 | idx := len(word.RS) - suffixLength - 1 17 | if idx >= 0 && word.RS[idx] == 117 { 18 | word.RemoveLastNRunes(suffixLength) 19 | return true 20 | } 21 | } 22 | return false 23 | } 24 | -------------------------------------------------------------------------------- /romance/common.go: -------------------------------------------------------------------------------- 1 | package romance 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | // A function type that accepts a rune and 8 | // returns a bool. In this particular case, 9 | // it is used for identifying vowels. 10 | type isVowelFunc func(rune) bool 11 | 12 | // Finds the region after the first non-vowel following a vowel, 13 | // or a the null region at the end of the word if there is no 14 | // such non-vowel. Returns the index in the Word where the 15 | // region starts; optionally skips the first `start` characters. 16 | // 17 | func VnvSuffix(word *snowballword.SnowballWord, f isVowelFunc, start int) int { 18 | for i := 1; i < len(word.RS[start:]); i++ { 19 | j := start + i 20 | if f(word.RS[j-1]) && !f(word.RS[j]) { 21 | return j + 1 22 | } 23 | } 24 | return len(word.RS) 25 | } 26 | -------------------------------------------------------------------------------- /spanish/step3.go: -------------------------------------------------------------------------------- 1 | package spanish 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 3 is the removal of residual suffixes. 10 | func step3(word *snowballword.SnowballWord) bool { 11 | suffix := word.FirstSuffixIfIn(word.RVstart, len(word.RS), 12 | "os", "a", "o", "á", "í", "ó", "e", "é", 13 | ) 14 | 15 | // No suffix found, nothing to do. 
16 | // 17 | if suffix == "" { 18 | return false 19 | } 20 | suffixLength := utf8.RuneCountInString(suffix) 21 | 22 | // Remove all these suffixes 23 | word.RemoveLastNRunes(suffixLength) 24 | 25 | if suffix == "e" || suffix == "é" { 26 | 27 | // If preceded by gu with the u in RV delete the u 28 | // 29 | guSuffix := word.FirstSuffix("gu") 30 | if guSuffix != "" { 31 | word.RemoveLastNRunes(1) 32 | } 33 | } 34 | return true 35 | } 36 | -------------------------------------------------------------------------------- /english/stem.go: -------------------------------------------------------------------------------- 1 | package english 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | "strings" 6 | ) 7 | 8 | // Stem an English word. This is the only exported 9 | // function in this package. 10 | // 11 | func Stem(word string, stemStopwWords bool) string { 12 | 13 | word = strings.ToLower(strings.TrimSpace(word)) 14 | 15 | // Return small words and stop words 16 | if len(word) <= 2 || (stemStopwWords == false && IsStopWord(word)) { 17 | return word 18 | } 19 | 20 | // Return special words immediately 21 | if specialVersion := stemSpecialWord(word); specialVersion != "" { 22 | word = specialVersion 23 | return word 24 | } 25 | 26 | w := snowballword.New(word) 27 | 28 | // Stem the word. Note, each of these 29 | // steps will alter `w` in place. 30 | // 31 | preprocess(w) 32 | step0(w) 33 | step1a(w) 34 | step1b(w) 35 | step1c(w) 36 | step2(w) 37 | step3(w) 38 | step4(w) 39 | step5(w) 40 | postprocess(w) 41 | 42 | return w.String() 43 | 44 | } 45 | -------------------------------------------------------------------------------- /russian/step4.go: -------------------------------------------------------------------------------- 1 | package russian 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | // Step 4 is the undoubling of double non-vowel endings 8 | // and removal of superlative endings. 9 | func step4(word *snowballword.SnowballWord) bool { 10 | 11 | // (1) Undouble "н", or, 2) if the word ends with a SUPERLATIVE ending, 12 | // (remove it and undouble н n), or 3) if the word ends ь (') (soft sign) 13 | // remove it. 14 | 15 | // Undouble "н" 16 | if word.HasSuffixRunes([]rune("нн")) { 17 | word.RemoveLastNRunes(1) 18 | return true 19 | } 20 | 21 | // Remove superlative endings 22 | suffix := word.RemoveFirstSuffix("ейше", "ейш") 23 | if suffix != "" { 24 | // Undouble "н" 25 | if word.HasSuffixRunes([]rune("нн")) { 26 | word.RemoveLastNRunes(1) 27 | } 28 | return true 29 | } 30 | 31 | // Remove soft sign 32 | if rsLen := len(word.RS); rsLen > 0 && word.RS[rsLen-1] == 'ь' { 33 | word.RemoveLastNRunes(1) 34 | return true 35 | } 36 | return false 37 | } 38 | -------------------------------------------------------------------------------- /spanish/stem.go: -------------------------------------------------------------------------------- 1 | package spanish 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | "log" 6 | "strings" 7 | ) 8 | 9 | func printDebug(debug bool, w *snowballword.SnowballWord) { 10 | if debug { 11 | log.Println(w.DebugString()) 12 | } 13 | } 14 | 15 | // Stem an Spanish word. This is the only exported 16 | // function in this package. 
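// Very short words are returned unchanged, as are Spanish stop
// words unless stemStopwWords is true.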
17 | // 18 | func Stem(word string, stemStopwWords bool) string { 19 | 20 | word = strings.ToLower(strings.TrimSpace(word)) 21 | 22 | // Return small words and stop words 23 | if len(word) <= 2 || (stemStopwWords == false && IsStopWord(word)) { 24 | return word 25 | } 26 | 27 | w := snowballword.New(word) 28 | 29 | // Stem the word. Note, each of these 30 | // steps will alter `w` in place. 31 | // 32 | 33 | preprocess(w) 34 | step0(w) 35 | changeInStep1 := step1(w) 36 | if changeInStep1 == false { 37 | changeInStep2a := step2a(w) 38 | if changeInStep2a == false { 39 | step2b(w) 40 | } 41 | } 42 | step3(w) 43 | postprocess(w) 44 | 45 | return w.String() 46 | 47 | } 48 | -------------------------------------------------------------------------------- /gostem/gostem.go: -------------------------------------------------------------------------------- 1 | // 2 | // Creates a binary `gostem` that stems an input file. 3 | // 4 | package main 5 | 6 | import ( 7 | "bufio" 8 | "flag" 9 | "fmt" 10 | "github.com/kljensen/snowball" 11 | "io" 12 | "log" 13 | "os" 14 | "strings" 15 | ) 16 | 17 | func main() { 18 | 19 | var language *string = flag.String("l", "english", "Language") 20 | var infile *string = flag.String("i", "", "Input file for stemming") 21 | flag.Parse() 22 | 23 | f, err := os.Open(*infile) 24 | if err != nil { 25 | log.Fatal(err) 26 | } 27 | 28 | bf := bufio.NewReader(f) 29 | 30 | for { 31 | line, isPrefix, err := bf.ReadLine() 32 | 33 | if err == io.EOF { 34 | break 35 | } 36 | 37 | if err != nil { 38 | log.Fatal(err) 39 | } 40 | 41 | if isPrefix { 42 | log.Fatal("Error: Unexpected long line reading", f.Name()) 43 | } 44 | 45 | word := strings.TrimSpace(string(line)) 46 | stemmed, err := snowball.Stem(word, *language, true) 47 | if err != nil { 48 | log.Println(err) 49 | break 50 | } 51 | fmt.Println(stemmed) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /french/step6.go: -------------------------------------------------------------------------------- 1 | package french 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | // Step 6 Un-accent 8 | // 9 | func step6(word *snowballword.SnowballWord) bool { 10 | 11 | // If the words ends é or è (unicode code points 233 and 232) 12 | // followed by at least one non-vowel, remove the accent from the e. 13 | 14 | // Note, this step is oddly articulated on Porter's Snowball website: 15 | // http://snowball.tartarus.org/algorithms/french/stemmer.html 16 | // More clearly stated, we should replace é or è with e in the 17 | // case where the suffix of the word is é or è followed by 18 | // one-or-more non-vowels. 
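// (Illustrative example, not taken from the original comments: a stem
// left as "complét" by the verb-suffix steps ends in é followed by the
// non-vowel t, so this step turns it into "complet".)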
19 | 20 | numNonVowels := 0 21 | for i := len(word.RS) - 1; i >= 0; i-- { 22 | r := word.RS[i] 23 | 24 | if isLowerVowel(r) == false { 25 | numNonVowels += 1 26 | } else { 27 | 28 | // `r` is a vowel 29 | 30 | if (r == 233 || r == 232) && numNonVowels > 0 { 31 | 32 | // Replace with "e", or unicode code point 101 33 | word.RS[i] = 101 34 | return true 35 | 36 | } 37 | return false 38 | } 39 | 40 | } 41 | return false 42 | } 43 | -------------------------------------------------------------------------------- /snowball.go: -------------------------------------------------------------------------------- 1 | package snowball 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/kljensen/snowball/english" 7 | "github.com/kljensen/snowball/french" 8 | "github.com/kljensen/snowball/hungarian" 9 | "github.com/kljensen/snowball/norwegian" 10 | "github.com/kljensen/snowball/russian" 11 | "github.com/kljensen/snowball/spanish" 12 | "github.com/kljensen/snowball/swedish" 13 | ) 14 | 15 | const ( 16 | VERSION string = "v0.7.0" 17 | ) 18 | 19 | // Stem a word in the specified language. 20 | func Stem(word, language string, stemStopWords bool) (stemmed string, err error) { 21 | 22 | var f func(string, bool) string 23 | switch language { 24 | case "english": 25 | f = english.Stem 26 | case "spanish": 27 | f = spanish.Stem 28 | case "french": 29 | f = french.Stem 30 | case "russian": 31 | f = russian.Stem 32 | case "swedish": 33 | f = swedish.Stem 34 | case "norwegian": 35 | f = norwegian.Stem 36 | case "hungarian": 37 | f = hungarian.Stem 38 | default: 39 | err = fmt.Errorf("Unknown language: %s", language) 40 | return 41 | } 42 | stemmed = f(word, stemStopWords) 43 | return 44 | 45 | } 46 | -------------------------------------------------------------------------------- /french/step2a.go: -------------------------------------------------------------------------------- 1 | package french 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 2a is the removal of Verb suffixes beginning 10 | // with "i" in the RV region. 11 | func step2a(word *snowballword.SnowballWord) bool { 12 | 13 | // Search for the longest among the following suffixes 14 | // in RV and if found, delete if preceded by a non-vowel. 15 | 16 | suffix := word.FirstSuffixIn(word.RVstart, len(word.RS), 17 | "issantes", "issaIent", "issions", "issants", "issante", 18 | "iraIent", "issons", "issiez", "issent", "issant", "issait", 19 | "issais", "irions", "issez", "isses", "iront", "irons", "iriez", 20 | "irent", "irait", "irais", "îtes", "îmes", "isse", "irez", 21 | "iras", "irai", "ira", "ies", "ît", "it", "is", "ir", "ie", "i", 22 | ) 23 | 24 | if suffix != "" { 25 | suffixLength := utf8.RuneCountInString(suffix) 26 | idx := len(word.RS) - suffixLength - 1 27 | if idx >= 0 && word.FitsInRV(suffixLength+1) && isLowerVowel(word.RS[idx]) == false { 28 | word.RemoveLastNRunes(suffixLength) 29 | return true 30 | } 31 | } 32 | return false 33 | } 34 | -------------------------------------------------------------------------------- /swedish/step3.go: -------------------------------------------------------------------------------- 1 | package swedish 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 3: 10 | // Search for the longest among the following suffixes, 11 | // and, if found and in R1, perform the action indicated. 
12 | 13 | // Delete: 14 | // lig, els & ig 15 | // Replace: 16 | // fullt: full, löst: lös 17 | 18 | func step3(w *snowballword.SnowballWord) bool { 19 | // Possible sufficies for this step, longest first. 20 | suffix := w.FirstSuffixIn(w.R1start, len(w.RS), 21 | "fullt", "löst", "lig", "els", "ig", 22 | ) 23 | suffixLength := utf8.RuneCountInString(suffix) 24 | 25 | // If it is not in R1, do nothing 26 | if suffix == "" || suffixLength > len(w.RS)-w.R1start { 27 | return false 28 | } 29 | 30 | // Handle a suffix that was found, which is going 31 | // to be replaced with a different suffix. 32 | // 33 | var repl string 34 | switch suffix { 35 | case "fullt": 36 | repl = "full" 37 | case "löst": 38 | repl = "lös" 39 | case "lig", "ig", "els": 40 | repl = "" 41 | } 42 | w.ReplaceSuffixRunes([]rune(suffix), []rune(repl), true) 43 | return true 44 | 45 | } 46 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) The project creators and maintainers 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /english/step5.go: -------------------------------------------------------------------------------- 1 | package english 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | ) 6 | 7 | // Step 5 is the stemming of "e" and "l" sufficies 8 | // found in R2. 9 | // 10 | func step5(w *snowballword.SnowballWord) bool { 11 | 12 | // Last rune index = `lri` 13 | lri := len(w.RS) - 1 14 | 15 | // If R1 is emtpy, R2 is also empty, and we 16 | // need not do anything in step 5. 17 | // 18 | if w.R1start > lri { 19 | return false 20 | } 21 | 22 | if w.RS[lri] == 101 { 23 | 24 | // The word ends with "e", which is unicode code point 101. 25 | 26 | // Delete "e" suffix if in R2, or in R1 and not preceded 27 | // by a short syllable. 28 | if w.R2start <= lri || !endsShortSyllable(w, lri) { 29 | w.ReplaceSuffix("e", "", true) 30 | return true 31 | } 32 | return false 33 | 34 | } else if w.R2start <= lri && w.RS[lri] == 108 && lri-1 >= 0 && w.RS[lri-1] == 108 { 35 | 36 | // The word ends in double "l", and the final "l" is 37 | // in R2. (Note, the unicode code point for "l" is 108.) 38 | 39 | // Delete the second "l". 
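// For example, "controlling" loses "ing" in step 1b, leaving
// "controll"; this branch then removes the final "l" to give "control".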
40 | w.ReplaceSuffix("l", "", true) 41 | return true 42 | 43 | } 44 | return false 45 | } 46 | -------------------------------------------------------------------------------- /english/step1a.go: -------------------------------------------------------------------------------- 1 | package english 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 1a is normalization of various special "s"-endings. 10 | func step1a(w *snowballword.SnowballWord) bool { 11 | 12 | suffix := w.FirstSuffix("sses", "ied", "ies", "us", "ss", "s") 13 | switch suffix { 14 | 15 | case "sses": 16 | 17 | // Replace by ss 18 | w.ReplaceSuffixRunes([]rune(suffix), []rune("ss"), true) 19 | return true 20 | 21 | case "ies", "ied": 22 | 23 | // Replace by i if preceded by more than one letter, 24 | // otherwise by ie (so ties -> tie, cries -> cri). 25 | 26 | var repl string 27 | if len(w.RS) > 4 { 28 | repl = "i" 29 | } else { 30 | repl = "ie" 31 | } 32 | w.ReplaceSuffixRunes([]rune(suffix), []rune(repl), true) 33 | return true 34 | 35 | case "us", "ss": 36 | 37 | // Do nothing 38 | return false 39 | 40 | case "s": 41 | // Delete if the preceding word part contains a vowel 42 | // not immediately before the s (so gas and this retain 43 | // the s, gaps and kiwis lose it) 44 | // 45 | suffixLength := utf8.RuneCountInString(suffix) 46 | for i := 0; i < len(w.RS)-2; i++ { 47 | if isLowerVowel(w.RS[i]) { 48 | w.RemoveLastNRunes(suffixLength) 49 | return true 50 | } 51 | } 52 | } 53 | return false 54 | } 55 | -------------------------------------------------------------------------------- /french/stem.go: -------------------------------------------------------------------------------- 1 | package french 2 | 3 | import ( 4 | "github.com/kljensen/snowball/snowballword" 5 | "strings" 6 | ) 7 | 8 | // Stem an French word. This is the only exported 9 | // function in this package. 10 | // 11 | func Stem(word string, stemStopwWords bool) string { 12 | 13 | word = strings.ToLower(strings.TrimSpace(word)) 14 | 15 | // Return small words and stop words 16 | if len(word) <= 2 || (stemStopwWords == false && IsStopWord(word)) { 17 | return word 18 | } 19 | 20 | w := snowballword.New(word) 21 | 22 | // Stem the word. Note, each of these 23 | // steps will alter `w` in place. 24 | // 25 | 26 | preprocess(w) 27 | var ( 28 | changeInStep1 bool 29 | changeInStep2a bool 30 | changeInStep2b bool 31 | ) 32 | 33 | changeInStep1 = step1(w) 34 | if changeInStep1 == false { 35 | changeInStep2a = step2a(w) 36 | if changeInStep2a == false { 37 | changeInStep2b = step2b(w) 38 | } 39 | } 40 | 41 | // If the last step was successful, do step 3. Note that, 42 | // since we only do 2a if 1 is unsuccessful, the following 43 | // "if" condition tests to see if the previous step was 44 | // successful. 45 | // 46 | if changeInStep1 || changeInStep2a || changeInStep2b { 47 | step3(w) 48 | } else { 49 | step4(w) 50 | } 51 | 52 | step5(w) 53 | step6(w) 54 | postprocess(w) 55 | return w.String() 56 | 57 | } 58 | -------------------------------------------------------------------------------- /english/step4.go: -------------------------------------------------------------------------------- 1 | package english 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 4: 10 | // Search for the longest among the following suffixes, 11 | // and, if found and in R2, perform the action indicated. 
12 | 13 | // al, ance, ence, er, ic, able, ible, ant, ement, ment, 14 | // ent, ism, ate, iti, ous, ive, ize 15 | // delete 16 | // 17 | // ion 18 | // delete if preceded by s or t 19 | func step4(w *snowballword.SnowballWord) bool { 20 | 21 | // Find all endings in R1 22 | suffix := w.FirstSuffix( 23 | "ement", "ance", "ence", "able", "ible", "ment", 24 | "ent", "ant", "ism", "ate", "iti", "ous", "ive", 25 | "ize", "ion", "al", "er", "ic", 26 | ) 27 | suffixLength := utf8.RuneCountInString(suffix) 28 | 29 | // If it does not fit in R2, do nothing. 30 | if suffixLength > len(w.RS)-w.R2start { 31 | return false 32 | } 33 | 34 | // Handle special cases 35 | switch suffix { 36 | case "": 37 | return false 38 | 39 | case "ion": 40 | // Replace by og if preceded by l 41 | // l = 108 42 | rsLen := len(w.RS) 43 | if rsLen >= 4 { 44 | switch w.RS[rsLen-4] { 45 | case 115, 116: 46 | w.RemoveLastNRunes(suffixLength) 47 | return true 48 | } 49 | 50 | } 51 | return false 52 | } 53 | 54 | // Handle basic replacements 55 | w.RemoveLastNRunes(suffixLength) 56 | return true 57 | 58 | } 59 | -------------------------------------------------------------------------------- /english/step3.go: -------------------------------------------------------------------------------- 1 | package english 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 3 is the stemming of various longer sufficies 10 | // found in R1. 11 | func step3(w *snowballword.SnowballWord) bool { 12 | 13 | suffix := w.FirstSuffix( 14 | "ational", "tional", "alize", "icate", "ative", 15 | "iciti", "ical", "ful", "ness", 16 | ) 17 | 18 | suffixLength := utf8.RuneCountInString(suffix) 19 | 20 | // If it is not in R1, do nothing 21 | if suffix == "" || suffixLength > len(w.RS)-w.R1start { 22 | return false 23 | } 24 | 25 | // Handle special cases where we're not just going to 26 | // replace the suffix with another suffix: there are 27 | // other things we need to do. 28 | // 29 | if suffix == "ative" { 30 | 31 | // If in R2, delete. 32 | // 33 | if len(w.RS)-w.R2start >= 5 { 34 | w.RemoveLastNRunes(suffixLength) 35 | return true 36 | } 37 | return false 38 | } 39 | 40 | // Handle a suffix that was found, which is going 41 | // to be replaced with a different suffix. 42 | // 43 | var repl string 44 | switch suffix { 45 | case "ational": 46 | repl = "ate" 47 | case "tional": 48 | repl = "tion" 49 | case "alize": 50 | repl = "al" 51 | case "icate", "iciti", "ical": 52 | repl = "ic" 53 | case "ful", "ness": 54 | repl = "" 55 | } 56 | w.ReplaceSuffixRunes([]rune(suffix), []rune(repl), true) 57 | return true 58 | 59 | } 60 | -------------------------------------------------------------------------------- /russian/README.md: -------------------------------------------------------------------------------- 1 | Snowball Russian 2 | ================ 3 | 4 | This package implements the 5 | [Russian language Snowball stemmer](http://snowball.tartarus.org/algorithms/russian/stemmer.html). 6 | 7 | ## Russian overview 8 | 9 | Russian has 33 letters, 11 Vowels, 20 consonants 10 | and 2 unpronounced signs. The capital letters 11 | look the same as the lower case letters, with 12 | the exception of cursive capital letter and 13 | lower case. 14 | 15 | ## Implementation 16 | 17 | The Russian language stemmer comprises preprocessing, a number of steps. 18 | Each of these is defined in a separate file in this 19 | package. 
All of the steps operate on a `SnowballWord` from the 20 | `snowballword` package and *modify the word in place*. 21 | 22 | ## Caveats 23 | 24 | The [example vocabulary for the original Russian snowball stemmer](http://snowball.tartarus.org/algorithms/russian/voc.txt) contains the word "злейший", which means "worst" in English. 25 | This word contains the adjectival suffix "ий" preceded by the superlative suffix "ейш". 26 | The [output for the example vocabulary](http://snowball.tartarus.org/algorithms/russian/output.txt) 27 | indicates that this word should be stemmed to "злейш". However, this implementation stems 28 | the word to "зл". 29 | The [Python NLTK](https://github.com/nltk/nltk/blob/master/nltk/stem/snowball.py#L2879) 30 | implementation also stems "злейший" to "зл". 31 | It is unclear to me how the original snowball implementation would possibly produce "злейш". 32 | So, I removed that word from the tests. -------------------------------------------------------------------------------- /swedish/step1.go: -------------------------------------------------------------------------------- 1 | package swedish 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 1 is the stemming of various endings found in 10 | // R1 including "heterna", "ornas", and "andet". 11 | func step1(w *snowballword.SnowballWord) bool { 12 | 13 | // Possible sufficies for this step, longest first. 14 | suffixes := []string{ 15 | "heterna", "hetens", "anden", "heten", "heter", "arnas", 16 | "ernas", "ornas", "andes", "arens", "andet", "arna", "erna", 17 | "orna", "ande", "arne", "aste", "aren", "ades", "erns", "ade", 18 | "are", "ern", "ens", "het", "ast", "ad", "en", "ar", "er", 19 | "or", "as", "es", "at", "a", "e", "s", 20 | } 21 | 22 | // Using FirstSuffixIn since there are overlapping suffixes, where some might not be in the R1, 23 | // while another might. For example: "ärade" 24 | suffix := w.FirstSuffixIn(w.R1start, len(w.RS), suffixes...) 25 | suffixLength := utf8.RuneCountInString(suffix) 26 | 27 | // If it is not in R1, do nothing 28 | if suffix == "" || suffixLength > len(w.RS)-w.R1start { 29 | return false 30 | } 31 | 32 | if suffix == "s" { 33 | // Delete if preceded by a valid s-ending. Valid s-endings inlude the 34 | // following charaters: bcdfghjklmnoprtvy. 35 | // 36 | rsLen := len(w.RS) 37 | if rsLen >= 2 { 38 | switch w.RS[rsLen-2] { 39 | case 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 40 | 'l', 'm', 'n', 'o', 'p', 'r', 't', 'v', 'y': 41 | w.RemoveLastNRunes(suffixLength) 42 | return true 43 | } 44 | } 45 | return false 46 | } 47 | // Remove the suffix 48 | w.RemoveLastNRunes(suffixLength) 49 | return true 50 | } 51 | -------------------------------------------------------------------------------- /french/step2b.go: -------------------------------------------------------------------------------- 1 | package french 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 2b is the removal of Verb suffixes in RV 10 | // that do not begin with "i". 11 | func step2b(word *snowballword.SnowballWord) bool { 12 | 13 | // Search for the longest among the following suffixes in RV. 
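// Unlike step 2a, no preceding non-vowel is required; what happens to
// a match depends on which group it falls into in the switch below.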
14 | // 15 | suffix := word.FirstSuffixIn(word.RVstart, len(word.RS), 16 | "eraIent", "assions", "erions", "assiez", "assent", 17 | "èrent", "eront", "erons", "eriez", "erait", "erais", 18 | "asses", "antes", "aIent", "âtes", "âmes", "ions", 19 | "erez", "eras", "erai", "asse", "ants", "ante", "ées", 20 | "iez", "era", "ant", "ait", "ais", "és", "ée", "ât", 21 | "ez", "er", "as", "ai", "é", "a", 22 | ) 23 | 24 | suffixLen := utf8.RuneCountInString(suffix) 25 | switch suffix { 26 | case "ions": 27 | 28 | // Delete if in R2 29 | if word.FitsInR2(suffixLen) { 30 | word.RemoveLastNRunes(suffixLen) 31 | return true 32 | } 33 | return false 34 | 35 | case "é", "ée", "ées", "és", "èrent", "er", "era", 36 | "erai", "eraIent", "erais", "erait", "eras", "erez", 37 | "eriez", "erions", "erons", "eront", "ez", "iez": 38 | 39 | // Delete 40 | word.RemoveLastNRunes(suffixLen) 41 | return true 42 | 43 | case "âmes", "ât", "âtes", "a", "ai", "aIent", 44 | "ais", "ait", "ant", "ante", "antes", "ants", "as", 45 | "asse", "assent", "asses", "assiez", "assions": 46 | 47 | // Delete 48 | word.RemoveLastNRunes(suffixLen) 49 | 50 | // If preceded by e (unicode code point 101), delete 51 | // 52 | idx := len(word.RS) - 1 53 | if idx >= 0 && word.RS[idx] == 101 && word.FitsInRV(1) { 54 | word.RemoveLastNRunes(1) 55 | } 56 | return true 57 | 58 | } 59 | return false 60 | } 61 | -------------------------------------------------------------------------------- /spanish/step2b.go: -------------------------------------------------------------------------------- 1 | package spanish 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 2b is the removal of verb suffixes beginning y, 10 | // Search for the longest among the following suffixes 11 | // in RV, and if found, delete if preceded by u. 
12 | func step2b(word *snowballword.SnowballWord) bool { 13 | suffix := word.FirstSuffixIn(word.RVstart, len(word.RS), 14 | "iésemos", "iéramos", "iríamos", "eríamos", "aríamos", "ásemos", 15 | "áramos", "ábamos", "isteis", "iríais", "iremos", "ieseis", 16 | "ierais", "eríais", "eremos", "asteis", "aríais", "aremos", 17 | "íamos", "irías", "irían", "iréis", "ieses", "iesen", "ieron", 18 | "ieras", "ieran", "iendo", "erías", "erían", "eréis", "aseis", 19 | "arías", "arían", "aréis", "arais", "abais", "íais", "iste", 20 | "iría", "irás", "irán", "imos", "iese", "iera", "idos", "idas", 21 | "ería", "erás", "erán", "aste", "ases", "asen", "aría", "arás", 22 | "arán", "aron", "aras", "aran", "ando", "amos", "ados", "adas", 23 | "abas", "aban", "ías", "ían", "éis", "áis", "iré", "irá", "ido", 24 | "ida", "eré", "erá", "emos", "ase", "aré", "ará", "ara", "ado", 25 | "ada", "aba", "ís", "ía", "ió", "ir", "id", "es", "er", "en", 26 | "ed", "as", "ar", "an", "ad", 27 | ) 28 | suffixLength := utf8.RuneCountInString(suffix) 29 | 30 | switch suffix { 31 | case "": 32 | return false 33 | 34 | case "en", "es", "éis", "emos": 35 | 36 | // Delete, and if preceded by gu delete the u (the gu need not be in RV) 37 | word.RemoveLastNRunes(suffixLength) 38 | guSuffix := word.FirstSuffix("gu") 39 | if guSuffix != "" { 40 | word.RemoveLastNRunes(1) 41 | } 42 | 43 | default: 44 | 45 | // Delete 46 | word.RemoveLastNRunes(suffixLength) 47 | } 48 | return true 49 | } 50 | -------------------------------------------------------------------------------- /norwegian/step1.go: -------------------------------------------------------------------------------- 1 | package norwegian 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 1 is the stemming of various endings found in 10 | // R1 including "hetene", "endes", and "ande". 11 | func step1(w *snowballword.SnowballWord) bool { 12 | 13 | // Possible sufficies for this step, longest first. 14 | suffixes := []string{ 15 | "hetenes", "hetene", "hetens", "endes", "heter", "heten", "ende", 16 | "ande", "edes", "enes", "ene", "ane", "ets", "ers", "ede", "ast", 17 | "ens", "het", "as", "es", "en", "ar", "er", "et", "e", "a", "s", 18 | } 19 | 20 | // Using FirstSuffixIn since there are overlapping suffixes, where some might not be in the R1, 21 | suffix := w.FirstSuffixIn(w.R1start, len(w.RS), suffixes...) 22 | suffixLength := utf8.RuneCountInString(suffix) 23 | 24 | if suffix == "s" { 25 | // Delete if preceded by a valid s-ending. 
Valid s-endings inlude the 26 | // following charaters: bcdfghjlmnoprtvyz or k not preceded by a vowel 27 | rsLen := len(w.RS) 28 | 29 | if rsLen >= 2 { 30 | switch w.RS[rsLen-2] { 31 | case 'b', 'c', 'd', 'f', 'g', 'h', 'j', 32 | 'l', 'm', 'n', 'o', 'p', 'r', 't', 'v', 'y', 'z': 33 | 34 | w.RemoveLastNRunes(suffixLength) 35 | return true 36 | case 'k': 37 | if !isLowerVowel(w.RS[rsLen-3]) { 38 | w.RemoveLastNRunes(suffixLength) 39 | return true 40 | } 41 | } 42 | } 43 | 44 | return false 45 | } 46 | 47 | // Remove the suffix 48 | w.RemoveLastNRunes(suffixLength) 49 | 50 | // replace "erte" and "ert" with "er" 51 | suffix = w.FirstSuffix("erte", "ert") 52 | suffixLength = utf8.RuneCountInString(suffix) 53 | 54 | if suffix == "" || suffixLength > len(w.RS)-w.R1start { 55 | return false 56 | } 57 | 58 | w.ReplaceSuffixRunes([]rune(suffix), []rune("er"), true) 59 | 60 | return true 61 | } 62 | -------------------------------------------------------------------------------- /spanish/step0.go: -------------------------------------------------------------------------------- 1 | package spanish 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 0 is the removal of attached pronouns 10 | func step0(word *snowballword.SnowballWord) bool { 11 | 12 | // Search for the longest among the following suffixes 13 | suffix1 := word.FirstSuffixIn(word.RVstart, len(word.RS), 14 | "selas", "selos", "sela", "selo", "las", "les", 15 | "los", "nos", "me", "se", "la", "le", "lo", 16 | ) 17 | 18 | // If the suffix empty or not in RV, we have nothing to do. 19 | if suffix1 == "" { 20 | return false 21 | } 22 | s1Len := utf8.RuneCountInString(suffix1) 23 | 24 | // We'll remove suffix1, if comes after one of the following 25 | suffix2 := word.FirstSuffixIn(word.RVstart, len(word.RS)-len(suffix1), 26 | "iéndo", "iendo", "yendo", "ando", "ándo", 27 | "ár", "ér", "ír", "ar", "er", "ir", 28 | ) 29 | switch suffix2 { 30 | case "": 31 | 32 | // Nothing to do 33 | return false 34 | 35 | case "iéndo", "ándo", "ár", "ér", "ír": 36 | 37 | // In these cases, deletion is followed by removing 38 | // the acute accent (e.g., haciéndola -> haciendo). 39 | 40 | var suffix2repl string 41 | switch suffix2 { 42 | case "": 43 | return false 44 | case "iéndo": 45 | suffix2repl = "iendo" 46 | case "ándo": 47 | suffix2repl = "ando" 48 | case "ár": 49 | suffix2repl = "ar" 50 | case "ír": 51 | suffix2repl = "ir" 52 | } 53 | word.RemoveLastNRunes(s1Len) 54 | word.ReplaceSuffixRunes([]rune(suffix2), []rune(suffix2repl), true) 55 | return true 56 | 57 | case "ando", "iendo", "ar", "er", "ir": 58 | word.RemoveLastNRunes(s1Len) 59 | return true 60 | 61 | case "yendo": 62 | 63 | // In the case of "yendo", the "yendo" must lie in RV, 64 | // and be preceded by a "u" somewhere in the word. 65 | 66 | for i := 0; i < len(word.RS)-(len(suffix1)+len(suffix2)); i++ { 67 | 68 | // Note, the unicode code point for "u" is 117. 69 | if word.RS[i] == 117 { 70 | word.RemoveLastNRunes(s1Len) 71 | return true 72 | } 73 | } 74 | } 75 | return false 76 | } 77 | -------------------------------------------------------------------------------- /swedish/common.go: -------------------------------------------------------------------------------- 1 | package swedish 2 | 3 | import ( 4 | "github.com/kljensen/snowball/romance" 5 | "github.com/kljensen/snowball/snowballword" 6 | ) 7 | 8 | // Find the starting point of the region R1. 
9 | // 10 | // R1 is the region after the first non-vowel following a vowel, 11 | // or is the null region at the end of the word if there is no 12 | // such non-vowel. R2 is not used in Swedish 13 | // 14 | // See http://snowball.tartarus.org/texts/r1r2.html 15 | // 16 | func r1(word *snowballword.SnowballWord) (r1start int) { 17 | // Like the German R1, the length of the Swedish R1 is adjusted to be at least three. 18 | r1start = romance.VnvSuffix(word, isLowerVowel, 0) 19 | if r1start < 3 && len(word.RS) >= 3 { 20 | r1start = 3 21 | } 22 | return 23 | } 24 | 25 | // Checks if a rune is a lowercase Swedish vowel. 26 | // 27 | func isLowerVowel(r rune) bool { 28 | switch r { 29 | case 'a', 'e', 'i', 'o', 'u', 'y', 'å', 'ä', 'ö': 30 | return true 31 | } 32 | return false 33 | } 34 | 35 | // Return `true` if the input `word` is a Swedish stop word. 36 | // 37 | func IsStopWord(word string) bool { 38 | switch word { 39 | case "och", "det", "att", "i", "en", "jag", "hon", "som", "han", 40 | "på", "den", "med", "var", "sig", "för", "så", "till", "är", "men", 41 | "ett", "om", "hade", "de", "av", "icke", "mig", "du", "henne", "då", 42 | "sin", "nu", "har", "inte", "hans", "honom", "skulle", "hennes", 43 | "där", "min", "man", "ej", "vid", "kunde", "något", "från", "ut", 44 | "när", "efter", "upp", "vi", "dem", "vara", "vad", "över", "än", 45 | "dig", "kan", "sina", "här", "ha", "mot", "alla", "under", "någon", 46 | "eller", "allt", "mycket", "sedan", "ju", "denna", "själv", "detta", 47 | "åt", "utan", "varit", "hur", "ingen", "mitt", "ni", "bli", "blev", 48 | "oss", "din", "dessa", "några", "deras", "blir", "mina", "samma", 49 | "vilken", "er", "sådan", "vår", "blivit", "dess", "inom", "mellan", 50 | "sådant", "varför", "varje", "vilka", "ditt", "vem", "vilket", 51 | "sitta", "sådana", "vart", "dina", "vars", "vårt", "våra", 52 | "ert", "era", "vilkas": 53 | return true 54 | } 55 | return false 56 | } 57 | -------------------------------------------------------------------------------- /french/step4.go: -------------------------------------------------------------------------------- 1 | package french 2 | 3 | import ( 4 | "log" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 4 is the cleaning up of residual suffixes. 10 | func step4(word *snowballword.SnowballWord) bool { 11 | 12 | hadChange := false 13 | 14 | if word.String() == "voudrion" { 15 | log.Println("...", word) 16 | } 17 | 18 | // If the word ends s (unicode code point 115), 19 | // not preceded by a, i, o, u, è or s, delete it. 20 | // 21 | if idx := len(word.RS) - 1; idx >= 1 && word.RS[idx] == 115 { 22 | switch word.RS[idx-1] { 23 | 24 | case 97, 105, 111, 117, 232, 115: 25 | 26 | // Do nothing, preceded by a, i, o, u, è or s 27 | return false 28 | 29 | default: 30 | word.RemoveLastNRunes(1) 31 | hadChange = true 32 | 33 | } 34 | } 35 | 36 | // Note: all the following are restricted to the RV region. 37 | 38 | // Search for the longest among the following suffixes in RV. 
39 | // 40 | suffix := word.FirstSuffixIn(word.RVstart, len(word.RS), 41 | "Ière", "ière", "Ier", "ier", "ion", "e", "ë", 42 | ) 43 | 44 | switch suffix { 45 | case "": 46 | return hadChange 47 | case "ion": 48 | 49 | // Delete if in R2 and preceded by s or t in RV 50 | 51 | const suffixLength int = 3 // equivalently, len(suffixRunes) 52 | idx := len(word.RS) - suffixLength - 1 53 | if word.FitsInR2(suffixLength) && idx >= 0 && word.FitsInRV(suffixLength+1) { 54 | if word.RS[idx] == 115 || word.RS[idx] == 116 { 55 | word.RemoveLastNRunes(suffixLength) 56 | return true 57 | } 58 | } 59 | return hadChange 60 | 61 | case "ier", "ière", "Ier", "Ière": 62 | // Replace with i 63 | suffixRunes := []rune(suffix) 64 | word.ReplaceSuffixRunes(suffixRunes, []rune("i"), true) 65 | return true 66 | 67 | case "e": 68 | word.RemoveLastNRunes(1) 69 | return true 70 | 71 | case "ë": 72 | 73 | // If preceded by gu (unicode code point 103 & 117), delete 74 | idx := len(word.RS) - 1 75 | if idx >= 2 && word.RS[idx-2] == 103 && word.RS[idx-1] == 117 { 76 | word.RemoveLastNRunes(1) 77 | return true 78 | } 79 | return hadChange 80 | } 81 | 82 | return true 83 | } 84 | -------------------------------------------------------------------------------- /norwegian/common.go: -------------------------------------------------------------------------------- 1 | package norwegian 2 | 3 | import ( 4 | "github.com/kljensen/snowball/romance" 5 | "github.com/kljensen/snowball/snowballword" 6 | ) 7 | 8 | // Find the starting point of the region R1. 9 | // 10 | // R1 is the region after the first non-vowel following a vowel, 11 | // or is the null region at the end of the word if there is no 12 | // such non-vowel. R2 is not used in Norwegian 13 | // 14 | // See http://snowball.tartarus.org/texts/r1r2.html 15 | // 16 | func r1(word *snowballword.SnowballWord) (r1start int) { 17 | // Like the German R1, the length of the Norwegian R1 is adjusted to be at least three. 18 | r1start = romance.VnvSuffix(word, isLowerVowel, 0) 19 | if r1start < 3 && len(word.RS) >= 3 { 20 | r1start = 3 21 | } 22 | return 23 | } 24 | 25 | // Checks if a rune is a lowercase Norwegian vowel. 26 | // 27 | func isLowerVowel(r rune) bool { 28 | switch r { 29 | case 'a', 'e', 'i', 'o', 'u', 'y', 'æ', 'ø', 'å': 30 | return true 31 | } 32 | return false 33 | } 34 | 35 | // Return `true` if the input `word` is a Norwegian stop word. 
36 | // 37 | func IsStopWord(word string) bool { 38 | switch word { 39 | case "ut", "få", "hadde", "hva", "tilbake", "vil", "han", "meget", "men", "vi", "en", "før", 40 | "samme", "stille", "inn", "er", "kan", "makt", "ved", "forsøke", "hvis", "part", "rett", 41 | "måte", "denne", "mer", "i", "lang", "ny", "hans", "hvilken", "tid", "vite", "her", "opp", 42 | "var", "navn", "mye", "om", "sant", "tilstand", "der", "ikke", "mest", "punkt", "hvem", 43 | "skulle", "mange", "over", "vårt", "alle", "arbeid", "lik", "like", "gå", "når", "siden", 44 | "å", "begge", "bruke", "eller", "og", "til", "da", "et", "hvorfor", "nå", "sist", "slutt", 45 | "deres", "det", "hennes", "så", "mens", "bra", "din", "fordi", "gjøre", "god", "ha", "start", 46 | "andre", "må", "med", "under", "meg", "oss", "innen", "på", "verdi", "ville", "kunne", "uten", 47 | "vår", "slik", "ene", "folk", "min", "riktig", "enhver", "bort", "enn", "nei", "som", "våre", "disse", 48 | "gjorde", "lage", "si", "du", "fra", "også", "hvordan", "av", "eneste", "for", "hvor", "først", "hver": 49 | return true 50 | } 51 | return false 52 | } 53 | -------------------------------------------------------------------------------- /russian/common.go: -------------------------------------------------------------------------------- 1 | package russian 2 | 3 | import ( 4 | "github.com/kljensen/snowball/romance" 5 | "github.com/kljensen/snowball/snowballword" 6 | ) 7 | 8 | // Checks if a rune is a lowercase Russian vowel. 9 | // 10 | func isLowerVowel(r rune) bool { 11 | 12 | // The Russian vowels are "аеиоуыэюя", which 13 | // are referenced by their unicode code points 14 | // in the switch statement below. 15 | switch r { 16 | case 1072, 1077, 1080, 1086, 1091, 1099, 1101, 1102, 1103: 17 | return true 18 | } 19 | return false 20 | } 21 | 22 | // Return `true` if the input `word` is a Russian stop word. 23 | // 24 | func IsStopWord(word string) bool { 25 | switch word { 26 | case "и", "в", "во", "не", "что", "он", "на", "я", "с", 27 | "со", "как", "а", "то", "все", "она", "так", "его", 28 | "но", "да", "ты", "к", "у", "же", "вы", "за", "бы", 29 | "по", "только", "ее", "мне", "было", "вот", "от", 30 | "меня", "еще", "нет", "о", "из", "ему", "теперь", 31 | "когда", "даже", "ну", "вдруг", "ли", "если", "уже", 32 | "или", "ни", "быть", "был", "него", "до", "вас", 33 | "нибудь", "опять", "уж", "вам", "ведь", "там", "потом", 34 | "себя", "ничего", "ей", "может", "они", "тут", "где", 35 | "есть", "надо", "ней", "для", "мы", "тебя", "их", 36 | "чем", "была", "сам", "чтоб", "без", "будто", "чего", 37 | "раз", "тоже", "себе", "под", "будет", "ж", "тогда", 38 | "кто", "этот", "того", "потому", "этого", "какой", 39 | "совсем", "ним", "здесь", "этом", "один", "почти", 40 | "мой", "тем", "чтобы", "нее", "сейчас", "были", "куда", 41 | "зачем", "всех", "никогда", "можно", "при", "наконец", 42 | "два", "об", "другой", "хоть", "после", "над", "больше", 43 | "тот", "через", "эти", "нас", "про", "всего", "них", 44 | "какая", "много", "разве", "три", "эту", "моя", 45 | "впрочем", "хорошо", "свою", "этой", "перед", "иногда", 46 | "лучше", "чуть", "том", "нельзя", "такой", "им", "более", 47 | "всегда", "конечно", "всю", "между": 48 | return true 49 | } 50 | return false 51 | } 52 | 53 | // Find the starting point of the regions R1, R2, & RV 54 | // 55 | func findRegions(word *snowballword.SnowballWord) (r1start, r2start, rvstart int) { 56 | 57 | // R1 & R2 are defined in the standard manner.
58 | r1start = romance.VnvSuffix(word, isLowerVowel, 0) 59 | r2start = romance.VnvSuffix(word, isLowerVowel, r1start) 60 | 61 | // Set RV, by default, as empty. 62 | rvstart = len(word.RS) 63 | 64 | // RV is the region after the first vowel, or the end of 65 | // the word if it contains no vowel. 66 | // 67 | for i := 0; i < len(word.RS); i++ { 68 | if isLowerVowel(word.RS[i]) { 69 | rvstart = i + 1 70 | break 71 | } 72 | } 73 | 74 | return 75 | } 76 | -------------------------------------------------------------------------------- /snowball_test.go: -------------------------------------------------------------------------------- 1 | package snowball 2 | 3 | import ( 4 | "regexp" 5 | "testing" 6 | ) 7 | 8 | func Test_Stem(t *testing.T) { 9 | testCases := []struct { 10 | in string 11 | language string 12 | stemStopWords bool 13 | out string 14 | nilErr bool 15 | }{ 16 | {"aberration", "english", true, "aberr", true}, 17 | {"abruptness", "english", true, "abrupt", true}, 18 | {"absolute", "english", true, "absolut", true}, 19 | {"abated", "english", true, "abat", true}, 20 | {"acclivity", "english", true, "accliv", true}, 21 | {"accumulations", "english", true, "accumul", true}, 22 | {"agreement", "english", true, "agreement", true}, 23 | {"breed", "english", true, "breed", true}, 24 | {"ape", "english", true, "ape", true}, 25 | {"skating", "english", true, "skate", true}, 26 | {"fluently", "english", true, "fluentli", true}, 27 | {"ied", "english", true, "ie", true}, 28 | {"ies", "english", true, "ie", true}, 29 | // Change stemStopWords 30 | {"above", "english", true, "abov", true}, 31 | {"because", "english", false, "because", true}, 32 | // Give invalid language 33 | {"because", "klingon", false, "", false}, 34 | 35 | // Spanish tests, a few 36 | {"lejana", "spanish", true, "lejan", true}, 37 | {"preocuparse", "spanish", true, "preocup", true}, 38 | {"oposición", "spanish", true, "oposicion", true}, 39 | {"prisionero", "spanish", true, "prisioner", true}, 40 | {"ridiculización", "spanish", true, "ridiculiz", true}, 41 | {"cotidianeidad", "spanish", true, "cotidian", true}, 42 | {"portezuela", "spanish", true, "portezuel", true}, 43 | {"enriquecerse", "spanish", true, "enriquec", true}, 44 | {"campesinos", "spanish", true, "campesin", true}, 45 | {"desalojó", "spanish", true, "desaloj", true}, 46 | {"anticipadas", "spanish", true, "anticip", true}, 47 | {"goyesca", "spanish", true, "goyesc", true}, 48 | {"band", "spanish", true, "band", true}, 49 | } 50 | for _, testCase := range testCases { 51 | out, err := Stem(testCase.in, testCase.language, testCase.stemStopWords) 52 | nilErr := true 53 | if err != nil { 54 | nilErr = false 55 | } 56 | if out != testCase.out || nilErr != testCase.nilErr { 57 | t.Errorf("Stem(\"%v\", \"%v\", %v) = \"%v, %v\", but expected %v, %v", 58 | testCase.in, testCase.language, testCase.stemStopWords, 59 | out, nilErr, testCase.out, testCase.nilErr, 60 | ) 61 | } 62 | 63 | } 64 | } 65 | 66 | // Test if the VERSION constant is correctly formatted 67 | // 68 | func Test_Version(t *testing.T) { 69 | validVersionRegexp := regexp.MustCompile(`^v\d+\.\d+\.\d+$`) 70 | if validVersionRegexp.MatchString(VERSION) == false { 71 | t.Errorf("Invalid version specified: %v", VERSION) 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /english/step2.go: -------------------------------------------------------------------------------- 1 | package english 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | 
"github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 2 is the stemming of various endings found in 10 | // R1 including "al", "ness", and "li". 11 | func step2(w *snowballword.SnowballWord) bool { 12 | 13 | // Possible sufficies for this step, longest first. 14 | suffix := w.FirstSuffix( 15 | "ational", "fulness", "iveness", "ization", "ousness", 16 | "biliti", "lessli", "tional", "alism", "aliti", "ation", 17 | "entli", "fulli", "iviti", "ousli", "anci", "abli", 18 | "alli", "ator", "enci", "izer", "bli", "ogi", "li", 19 | ) 20 | suffixLength := utf8.RuneCountInString(suffix) 21 | 22 | // If it is not in R1, do nothing 23 | if suffix == "" || suffixLength > len(w.RS)-w.R1start { 24 | return false 25 | } 26 | 27 | // Handle special cases where we're not just going to 28 | // replace the suffix with another suffix: there are 29 | // other things we need to do. 30 | // 31 | switch suffix { 32 | 33 | case "li": 34 | 35 | // Delete if preceded by a valid li-ending. Valid li-endings inlude the 36 | // following charaters: cdeghkmnrt. (Note, the unicode code points for 37 | // these characters are, respectively, as follows: 38 | // 99 100 101 103 104 107 109 110 114 116) 39 | // 40 | rsLen := len(w.RS) 41 | if rsLen >= 3 { 42 | switch w.RS[rsLen-3] { 43 | case 99, 100, 101, 103, 104, 107, 109, 110, 114, 116: 44 | w.RemoveLastNRunes(suffixLength) 45 | return true 46 | } 47 | } 48 | return false 49 | 50 | case "ogi": 51 | 52 | // Replace by og if preceded by l. 53 | // (Note, the unicode code point for l is 108) 54 | // 55 | rsLen := len(w.RS) 56 | if rsLen >= 4 && w.RS[rsLen-4] == 108 { 57 | w.ReplaceSuffixRunes([]rune(suffix), []rune("og"), true) 58 | } 59 | return true 60 | } 61 | 62 | // Handle a suffix that was found, which is going 63 | // to be replaced with a different suffix. 64 | // 65 | var repl string 66 | switch suffix { 67 | case "tional": 68 | repl = "tion" 69 | case "enci": 70 | repl = "ence" 71 | case "anci": 72 | repl = "ance" 73 | case "abli": 74 | repl = "able" 75 | case "entli": 76 | repl = "ent" 77 | case "izer", "ization": 78 | repl = "ize" 79 | case "ational", "ation", "ator": 80 | repl = "ate" 81 | case "alism", "aliti", "alli": 82 | repl = "al" 83 | case "fulness": 84 | repl = "ful" 85 | case "ousli", "ousness": 86 | repl = "ous" 87 | case "iveness", "iviti": 88 | repl = "ive" 89 | case "biliti", "bli": 90 | repl = "ble" 91 | case "fulli": 92 | repl = "ful" 93 | case "lessli": 94 | repl = "less" 95 | } 96 | w.ReplaceSuffixRunes([]rune(suffix), []rune(repl), true) 97 | return true 98 | 99 | } 100 | -------------------------------------------------------------------------------- /english/step1b.go: -------------------------------------------------------------------------------- 1 | package english 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 1b is the normalization of various "ly" and "ed" sufficies. 
10 | func step1b(w *snowballword.SnowballWord) bool { 11 | 12 | suffix := w.FirstSuffix("eedly", "ingly", "edly", "ing", "eed", "ed") 13 | suffixLength := utf8.RuneCountInString(suffix) 14 | 15 | switch suffix { 16 | 17 | case "": 18 | // No suffix found 19 | return false 20 | 21 | case "eed", "eedly": 22 | 23 | // Replace by ee if in R1 24 | if suffixLength <= len(w.RS)-w.R1start { 25 | w.ReplaceSuffixRunes([]rune(suffix), []rune("ee"), true) 26 | } 27 | return true 28 | 29 | case "ed", "edly", "ing", "ingly": 30 | hasLowerVowel := false 31 | for i := 0; i < len(w.RS)-suffixLength; i++ { 32 | if isLowerVowel(w.RS[i]) { 33 | hasLowerVowel = true 34 | break 35 | } 36 | } 37 | if hasLowerVowel { 38 | 39 | // This case requires a two-step transformation and, due 40 | // to the way we've implemented the `ReplaceSuffix` method 41 | // here, information about R1 and R2 would be lost between 42 | // the two. Therefore, we need to keep track of the 43 | // original R1 & R2, so that we may set them below, at the 44 | // end of this case. 45 | // 46 | originalR1start := w.R1start 47 | originalR2start := w.R2start 48 | 49 | // Delete if the preceding word part contains a vowel 50 | w.RemoveLastNRunes(suffixLength) 51 | 52 | // ...and after the deletion... 53 | 54 | newSuffix := w.FirstSuffix("at", "bl", "iz", "bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt") 55 | switch newSuffix { 56 | 57 | case "": 58 | 59 | // If the word is short, add "e" 60 | if isShortWord(w) { 61 | 62 | // By definition, r1 and r2 are the empty string for 63 | // short words. 64 | w.RS = append(w.RS, []rune("e")...) 65 | w.R1start = len(w.RS) 66 | w.R2start = len(w.RS) 67 | return true 68 | } 69 | 70 | case "at", "bl", "iz": 71 | 72 | // If the word ends "at", "bl" or "iz" add "e" 73 | w.ReplaceSuffixRunes([]rune(newSuffix), []rune(newSuffix+"e"), true) 74 | 75 | case "bb", "dd", "ff", "gg", "mm", "nn", "pp", "rr", "tt": 76 | 77 | // If the word ends with a double remove the last letter. 78 | // Note that, "double" does not include all possible doubles, 79 | // just those shown above. 80 | // 81 | w.RemoveLastNRunes(1) 82 | } 83 | 84 | // Because we did a double replacement, we need to fix 85 | // R1 and R2 manually. This is just becase of how we've 86 | // implemented the `ReplaceSuffix` method. 87 | // 88 | rsLen := len(w.RS) 89 | if originalR1start < rsLen { 90 | w.R1start = originalR1start 91 | } else { 92 | w.R1start = rsLen 93 | } 94 | if originalR2start < rsLen { 95 | w.R2start = originalR2start 96 | } else { 97 | w.R2start = rsLen 98 | } 99 | 100 | return true 101 | } 102 | 103 | } 104 | 105 | return false 106 | } 107 | -------------------------------------------------------------------------------- /romance/testing_helpers.go: -------------------------------------------------------------------------------- 1 | /* 2 | This file contains test runners that are common to 3 | the romance languages. 
4 | */ 5 | package romance 6 | 7 | import ( 8 | "fmt" 9 | "github.com/kljensen/snowball/snowballword" 10 | "testing" 11 | ) 12 | 13 | type stepFunc func(*snowballword.SnowballWord) bool 14 | type StepTestCase struct { 15 | WordIn string 16 | R1start int 17 | R2start int 18 | RVstart int 19 | Changed bool 20 | WordOut string 21 | R1startOut int 22 | R2startOut int 23 | RVstartOut int 24 | } 25 | 26 | func RunStepTest(t *testing.T, f stepFunc, tcs []StepTestCase) { 27 | for _, testCase := range tcs { 28 | w := snowballword.New(testCase.WordIn) 29 | w.R1start = testCase.R1start 30 | w.R2start = testCase.R2start 31 | w.RVstart = testCase.RVstart 32 | retval := f(w) 33 | if retval != testCase.Changed || w.String() != testCase.WordOut || w.R1start != testCase.R1startOut || w.R2start != testCase.R2startOut || w.RVstart != testCase.RVstartOut { 34 | t.Errorf("Expected %v -> \"{%v, %v, %v, %v, %v}\", but got \"{%v, %v, %v, %v, %v}\"", testCase.WordIn, testCase.WordOut, testCase.R1startOut, testCase.R2startOut, testCase.RVstartOut, testCase.Changed, w.String(), w.R1start, w.R2start, w.RVstart, retval) 35 | } 36 | if w.String() != testCase.WordOut { 37 | fmt.Printf("{\"%v\", %v, %v, %v, true, \"%v\", %v, %v, %v},\n", testCase.WordIn, testCase.R1start, testCase.R2start, testCase.RVstart, testCase.WordOut, w.R1start, w.R2start, w.RVstart) 38 | } 39 | } 40 | } 41 | 42 | // Test case for functions that take a word and return a bool. 43 | type WordBoolTestCase struct { 44 | Word string 45 | Result bool 46 | } 47 | 48 | // Test runner for functions that take a word and return a bool. 49 | // 50 | func RunWordBoolTest(t *testing.T, f func(string) bool, tcs []WordBoolTestCase) { 51 | for _, testCase := range tcs { 52 | result := f(testCase.Word) 53 | if result != testCase.Result { 54 | t.Errorf("Expected %v -> %v, but got %v", testCase.Word, testCase.Result, result) 55 | } 56 | } 57 | } 58 | 59 | // Test runner for functions that should be fed each rune of 60 | // a string and that return a bool for each rune. Usually used 61 | // to test functions that return true if a rune is a vowel, etc. 62 | // 63 | func RunRunewiseBoolTest(t *testing.T, f func(rune) bool, tcs []WordBoolTestCase) { 64 | for _, testCase := range tcs { 65 | for _, r := range testCase.Word { 66 | result := f(r) 67 | if result != testCase.Result { 68 | t.Errorf("Expected %v -> %v, but got %v", r, testCase.Result, result) 69 | } 70 | } 71 | } 72 | } 73 | 74 | type FindRegionsTestCase struct { 75 | Word string 76 | R1start int 77 | R2start int 78 | RVstart int 79 | } 80 | 81 | // Test isLowerVowel for things we know should be true 82 | // or false. 
83 | // 84 | func RunFindRegionsTest(t *testing.T, f func(*snowballword.SnowballWord) (int, int, int), tcs []FindRegionsTestCase) { 85 | for _, testCase := range tcs { 86 | w := snowballword.New(testCase.Word) 87 | r1start, r2start, rvstart := f(w) 88 | if r1start != testCase.R1start || r2start != testCase.R2start || rvstart != testCase.RVstart { 89 | t.Errorf("Expect \"%v\" -> %v, %v, %v, but got %v, %v, %v", 90 | testCase.Word, testCase.R1start, testCase.R2start, testCase.RVstart, 91 | r1start, r2start, rvstart, 92 | ) 93 | } 94 | 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /spanish/step1.go: -------------------------------------------------------------------------------- 1 | package spanish 2 | 3 | import ( 4 | "log" 5 | "unicode/utf8" 6 | 7 | "github.com/kljensen/snowball/snowballword" 8 | ) 9 | 10 | // Step 1 is the removal of standard suffixes 11 | func step1(word *snowballword.SnowballWord) bool { 12 | 13 | // Possible suffixes, longest first 14 | suffix := word.FirstSuffix( 15 | "amientos", "imientos", "aciones", "amiento", "imiento", 16 | "uciones", "logías", "idades", "encias", "ancias", "amente", 17 | "adores", "adoras", "ución", "mente", "logía", "istas", 18 | "ismos", "ibles", "encia", "anzas", "antes", "ancia", 19 | "adora", "ación", "ables", "osos", "osas", "ivos", "ivas", 20 | "ista", "ismo", "idad", "icos", "icas", "ible", "anza", 21 | "ante", "ador", "able", "oso", "osa", "ivo", "iva", 22 | "ico", "ica", 23 | ) 24 | suffixLength := utf8.RuneCountInString(suffix) 25 | 26 | isInR1 := (word.R1start <= len(word.RS)-suffixLength) 27 | isInR2 := (word.R2start <= len(word.RS)-suffixLength) 28 | 29 | // Deal with special cases first. All of these will 30 | // return if they are hit. 31 | // 32 | switch suffix { 33 | case "": 34 | 35 | // Nothing to do 36 | return false 37 | 38 | case "amente": 39 | 40 | if isInR1 { 41 | // Delete if in R1 42 | word.RemoveLastNRunes(suffixLength) 43 | 44 | // if preceded by iv, delete if in R2 (and if further preceded by at, 45 | // delete if in R2), otherwise, 46 | // if preceded by os, ic or ad, delete if in R2 47 | newSuffix := word.RemoveFirstSuffixIfIn(word.R2start, "iv", "os", "ic", "ad") 48 | if newSuffix == "iv" { 49 | word.RemoveFirstSuffixIfIn(word.R2start, "at") 50 | } 51 | return true 52 | } 53 | return false 54 | } 55 | 56 | // All the following cases require the found suffix 57 | // to be in R2. 58 | if isInR2 == false { 59 | return false 60 | } 61 | 62 | // Compound replacement cases. All these cases return 63 | // if they are hit. 64 | // 65 | compoundReplacement := func(otherSuffixes ...string) bool { 66 | word.RemoveLastNRunes(suffixLength) 67 | word.RemoveFirstSuffixIfIn(word.R2start, otherSuffixes...) 68 | return true 69 | } 70 | 71 | switch suffix { 72 | case "adora", "ador", "ación", "adoras", "adores", "aciones", "ante", "antes", "ancia", "ancias": 73 | return compoundReplacement("ic") 74 | case "mente": 75 | return compoundReplacement("ante", "able", "ible") 76 | case "idad", "idades": 77 | return compoundReplacement("abil", "ic", "iv") 78 | case "iva", "ivo", "ivas", "ivos": 79 | return compoundReplacement("at") 80 | } 81 | 82 | // Simple replacement & deletion cases are all that remain. 
83 | // 84 | simpleReplacement := func(repl string) bool { 85 | word.ReplaceSuffixRunes([]rune(suffix), []rune(repl), true) 86 | return true 87 | } 88 | switch suffix { 89 | case "logía", "logías": 90 | return simpleReplacement("log") 91 | case "ución", "uciones": 92 | return simpleReplacement("u") 93 | case "encia", "encias": 94 | return simpleReplacement("ente") 95 | case "anza", "anzas", "ico", "ica", "icos", "icas", 96 | "ismo", "ismos", "able", "ables", "ible", "ibles", 97 | "ista", "istas", "oso", "osa", "osos", "osas", 98 | "amiento", "amientos", "imiento", "imientos": 99 | word.RemoveLastNRunes(suffixLength) 100 | return true 101 | } 102 | 103 | log.Panicln("Unhandled suffix:", suffix) 104 | return false 105 | } 106 | -------------------------------------------------------------------------------- /norwegian/norwegian_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Herein lie all the tests of the norwegian snowball stemmer. 3 | TODO 4 | */ 5 | package norwegian 6 | 7 | import ( 8 | "testing" 9 | 10 | "github.com/kljensen/snowball/snowballword" 11 | ) 12 | 13 | // Test stopWords for things we know should be true 14 | // or false. 15 | // 16 | func Test_stopWords(t *testing.T) { 17 | 18 | // Test true 19 | knownTrueStopwords := [...]string{ 20 | "og", 21 | "for", 22 | "mye", 23 | "ikke", 24 | } 25 | for _, word := range knownTrueStopwords { 26 | if IsStopWord(word) == false { 27 | t.Errorf("Expected %v, to be in stopWords", word) 28 | } 29 | } 30 | 31 | // Test false 32 | knownFalseStopwords := [...]string{ 33 | "truck", 34 | "deoxyribonucleic", 35 | "farse", 36 | "bullschnizzle", 37 | } 38 | for _, word := range knownFalseStopwords { 39 | if IsStopWord(word) == true { 40 | t.Errorf("Expected %v, to be in stopWords", word) 41 | } 42 | } 43 | } 44 | 45 | func Test_r1(t *testing.T) { 46 | var wordTests = []struct { 47 | word string 48 | r1 string 49 | }{ 50 | {"åpnet", "et"}, 51 | {"åpner", "er"}, 52 | {"hvems", "s"}, 53 | {"ørene", "ne"}, 54 | // Special cases below 55 | } 56 | for _, testCase := range wordTests { 57 | w := snowballword.New(testCase.word) 58 | r1start := r1(w) 59 | w.R1start = r1start 60 | if w.R1String() != testCase.r1 { 61 | t.Errorf("Expected \"{%v}\", but got \"{%v}\"", testCase.r1, w.R1String()) 62 | } 63 | } 64 | } 65 | 66 | type stepFunc func(*snowballword.SnowballWord) bool 67 | type stepTest struct { 68 | wordIn string 69 | r1start int 70 | wordOut string 71 | r1out string 72 | } 73 | 74 | func runStepTest(t *testing.T, f stepFunc, tcs []stepTest) { 75 | for _, testCase := range tcs { 76 | w := snowballword.New(testCase.wordIn) 77 | w.R1start = testCase.r1start 78 | _ = f(w) 79 | if w.String() != testCase.wordOut || w.R1String() != testCase.r1out { 80 | t.Errorf("Expected \"{%v, %v}\", but got \"{%v, %v}\"", testCase.wordOut, testCase.r1out, w.String(), w.R1String()) 81 | } 82 | } 83 | } 84 | 85 | func Test_step1(t *testing.T) { 86 | var testCases = []stepTest{ 87 | {"høytidlighetene", 3, "høytidlig", "tidlig"}, 88 | {"øyets", 3, "øyet", "t"}, 89 | {"ørets", 3, "øret", "t"}, 90 | } 91 | runStepTest(t, step1, testCases) 92 | } 93 | 94 | func Test_step2(t *testing.T) { 95 | var testCases = []stepTest{} 96 | runStepTest(t, step2, testCases) 97 | } 98 | 99 | func Test_step3(t *testing.T) { 100 | var testCases = []stepTest{ 101 | {"årlig", 3, "årl", ""}, 102 | } 103 | runStepTest(t, step3, testCases) 104 | } 105 | 106 | func Test_Stem(t *testing.T) { 107 | var testCases = []struct { 108 | in string 109 | 
stemStopWords bool 110 | out string 111 | }{ 112 | {"havnedistrikt", true, "havnedistrikt"}, 113 | {"havnedistriktene", true, "havnedistrikt"}, 114 | {"havnedistrikter", true, "havnedistrikt"}, 115 | {"havnedistriktets", true, "havnedistrikt"}, 116 | {"havnedistriktets", true, "havnedistrikt"}, 117 | {"opp", true, "opp"}, 118 | {"oppad", true, "oppad"}, 119 | {"opning", true, "opning"}, 120 | {"havneinteresser", true, "havneinteress"}, 121 | {"oppbygginga", true, "oppbygging"}, 122 | {"oppbyggingen", true, "oppbygging"}, 123 | {"oppdaterte", true, "oppdater"}, 124 | {"tredjepersons", true, "tredjeperson"}, 125 | {"uspesisfisert", true, "uspesisfiser"}, 126 | {"voks", true, "voks"}, 127 | } 128 | for _, tc := range testCases { 129 | stemmed := Stem(tc.in, tc.stemStopWords) 130 | if stemmed != tc.out { 131 | t.Errorf("Expected %v to stem to %v, but got %v", tc.in, tc.out, stemmed) 132 | } 133 | } 134 | 135 | } 136 | -------------------------------------------------------------------------------- /swedish/swedish_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Herein lie all the tests of the Swedish snowball stemmer. 3 | 4 | */ 5 | package swedish 6 | 7 | import ( 8 | "testing" 9 | 10 | "github.com/kljensen/snowball/snowballword" 11 | ) 12 | 13 | // Test stopWords for things we know should be true 14 | // or false. 15 | // 16 | func Test_stopWords(t *testing.T) { 17 | 18 | // Test true 19 | knownTrueStopwords := [...]string{ 20 | "och", 21 | "för", 22 | "att", 23 | "inte", 24 | } 25 | for _, word := range knownTrueStopwords { 26 | if IsStopWord(word) == false { 27 | t.Errorf("Expected %v, to be in stopWords", word) 28 | } 29 | } 30 | 31 | // Test false 32 | knownFalseStopwords := [...]string{ 33 | "truck", 34 | "deoxyribonucleic", 35 | "farse", 36 | "bullschnizzle", 37 | } 38 | for _, word := range knownFalseStopwords { 39 | if IsStopWord(word) == true { 40 | t.Errorf("Expected %v, to be in stopWords", word) 41 | } 42 | } 43 | } 44 | 45 | func Test_r1(t *testing.T) { 46 | var wordTests = []struct { 47 | word string 48 | r1 string 49 | }{ 50 | {"öppnade", "nade"}, 51 | {"örnar", "ar"}, 52 | {"vems", "s"}, 53 | {"årorna", "rna"}, 54 | // Special cases below 55 | } 56 | for _, testCase := range wordTests { 57 | w := snowballword.New(testCase.word) 58 | r1start := r1(w) 59 | w.R1start = r1start 60 | if w.R1String() != testCase.r1 { 61 | t.Errorf("Expected \"{%v}\", but got \"{%v}\"", testCase.r1, w.R1String()) 62 | } 63 | } 64 | } 65 | 66 | type stepFunc func(*snowballword.SnowballWord) bool 67 | type stepTest struct { 68 | wordIn string 69 | r1start int 70 | wordOut string 71 | r1out string 72 | } 73 | 74 | func runStepTest(t *testing.T, f stepFunc, tcs []stepTest) { 75 | for _, testCase := range tcs { 76 | w := snowballword.New(testCase.wordIn) 77 | w.R1start = testCase.r1start 78 | _ = f(w) 79 | if w.String() != testCase.wordOut || w.R1String() != testCase.r1out { 80 | t.Errorf("Expected \"{%v, %v}\", but got \"{%v, %v}\"", testCase.wordOut, testCase.r1out, w.String(), w.R1String()) 81 | } 82 | } 83 | } 84 | 85 | func Test_step1(t *testing.T) { 86 | var testCases = []stepTest{ 87 | {"högtidligheterna", 3, "högtidlig", "tidlig"}, 88 | {"ögats", 3, "ögat", "t"}, 89 | {"ärade", 3, "ärad", "d"}, 90 | } 91 | runStepTest(t, step1, testCases) 92 | } 93 | 94 | func Test_step2(t *testing.T) { 95 | var testCases = []stepTest{} 96 | runStepTest(t, step2, testCases) 97 | } 98 | 99 | func Test_step3(t *testing.T) { 100 | var testCases = []stepTest{ 101 | 
{"årlig", 3, "årl", ""}, 102 | } 103 | runStepTest(t, step3, testCases) 104 | } 105 | 106 | func Test_Stem(t *testing.T) { 107 | var testCases = []struct { 108 | in string 109 | stemStopWords bool 110 | out string 111 | }{ 112 | {"jaktkarlar", true, "jaktkarl"}, 113 | {"jaktkarlarne", true, "jaktkarl"}, 114 | {"klokaste", true, "klok"}, 115 | {"klokheten", true, "klok"}, 116 | {"friskt", true, "frisk"}, 117 | {"fröken", true, "frök"}, 118 | {"kloliknande", true, "klolikn"}, 119 | {"hopplöst", true, "hopplös"}, 120 | {"hopplöshet", true, "hopplös"}, 121 | {"årorna", true, "årorn"}, 122 | // {"skating", true, "skate"}, 123 | // {"fluently", true, "fluentli"}, 124 | // {"ied", true, "ie"}, 125 | // {"ies", true, "ie"}, 126 | // Stop words 127 | {"vilkas", true, "vilk"}, 128 | {"vilkas", false, "vilkas"}, 129 | // {"above", true, "abov"}, 130 | // {"above", false, "above"}, 131 | } 132 | for _, tc := range testCases { 133 | stemmed := Stem(tc.in, tc.stemStopWords) 134 | if stemmed != tc.out { 135 | t.Errorf("Expected %v to stem to %v, but got %v", tc.in, tc.out, stemmed) 136 | } 137 | } 138 | 139 | } 140 | -------------------------------------------------------------------------------- /french/common.go: -------------------------------------------------------------------------------- 1 | package french 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/romance" 7 | "github.com/kljensen/snowball/snowballword" 8 | ) 9 | 10 | // Return `true` if the input `word` is a French stop word. 11 | func IsStopWord(word string) bool { 12 | switch word { 13 | case "au", "aux", "avec", "ce", "ces", "dans", "de", "des", "du", 14 | "elle", "en", "et", "eux", "il", "je", "la", "le", "leur", 15 | "lui", "ma", "mais", "me", "même", "mes", "moi", "mon", "ne", 16 | "nos", "notre", "nous", "on", "ou", "par", "pas", "pour", "qu", 17 | "que", "qui", "sa", "se", "ses", "son", "sur", "ta", "te", 18 | "tes", "toi", "ton", "tu", "un", "une", "vos", "votre", "vous", 19 | "c", "d", "j", "l", "à", "m", "n", "s", "t", "y", "été", 20 | "étée", "étées", "étés", "étant", "étante", "étants", "étantes", 21 | "suis", "es", "est", "sommes", "êtes", "sont", "serai", 22 | "seras", "sera", "serons", "serez", "seront", "serais", 23 | "serait", "serions", "seriez", "seraient", "étais", "était", 24 | "étions", "étiez", "étaient", "fus", "fut", "fûmes", "fûtes", 25 | "furent", "sois", "soit", "soyons", "soyez", "soient", "fusse", 26 | "fusses", "fût", "fussions", "fussiez", "fussent", "ayant", 27 | "ayante", "ayantes", "ayants", "eu", "eue", "eues", "eus", 28 | "ai", "as", "avons", "avez", "ont", "aurai", "auras", "aura", 29 | "aurons", "aurez", "auront", "aurais", "aurait", "aurions", 30 | "auriez", "auraient", "avais", "avait", "avions", "aviez", 31 | "avaient", "eut", "eûmes", "eûtes", "eurent", "aie", "aies", 32 | "ait", "ayons", "ayez", "aient", "eusse", "eusses", "eût", 33 | "eussions", "eussiez", "eussent": 34 | return true 35 | } 36 | return false 37 | } 38 | 39 | // Checks if a rune is a lowercase French vowel. 40 | func isLowerVowel(r rune) bool { 41 | 42 | // The French vowels are "aeiouyâàëéêèïîôûù", which 43 | // are referenced by their unicode code points 44 | // in the switch statement below. 45 | switch r { 46 | case 97, 101, 105, 111, 117, 121, 226, 224, 235, 233, 234, 232, 239, 238, 244, 251, 249: 47 | return true 48 | } 49 | return false 50 | } 51 | 52 | // Capitalize Y, I, and U runes that are acting as consanants. 
53 | // Put into upper case "u" or "i" preceded and followed by a 54 | // vowel, and "y" preceded or followed by a vowel. "u" after q is 55 | // also put into upper case. 56 | func capitalizeYUI(word *snowballword.SnowballWord) { 57 | 58 | // Keep track of vowels that we see 59 | vowelPreviously := false 60 | 61 | // Peak ahead to see if the next rune is a vowel 62 | vowelNext := func(j int) bool { 63 | return (j+1 < len(word.RS) && isLowerVowel(word.RS[j+1])) 64 | } 65 | 66 | // Look at all runes 67 | for i := 0; i < len(word.RS); i++ { 68 | 69 | // Nothing to do for non-vowels 70 | if isLowerVowel(word.RS[i]) == false { 71 | vowelPreviously = false 72 | continue 73 | } 74 | 75 | vowelHere := true 76 | 77 | switch word.RS[i] { 78 | case 121: // y 79 | 80 | // Is this "y" preceded OR followed by a vowel? 81 | if vowelPreviously || vowelNext(i) { 82 | word.RS[i] = 89 // Y 83 | vowelHere = false 84 | } 85 | 86 | case 117: // u 87 | 88 | // Is this "u" is flanked by vowels OR preceded by a "q"? 89 | if (vowelPreviously && vowelNext(i)) || (i >= 1 && word.RS[i-1] == 113) { 90 | word.RS[i] = 85 // U 91 | vowelHere = false 92 | } 93 | 94 | case 105: // i 95 | 96 | // Is this "i" is flanked by vowels? 97 | if vowelPreviously && vowelNext(i) { 98 | word.RS[i] = 73 // I 99 | vowelHere = false 100 | } 101 | } 102 | vowelPreviously = vowelHere 103 | } 104 | } 105 | 106 | // Find the starting point of the regions R1, R2, & RV 107 | func findRegions(word *snowballword.SnowballWord) (r1start, r2start, rvstart int) { 108 | 109 | // R1 & R2 are defined in the standard manner. 110 | r1start = romance.VnvSuffix(word, isLowerVowel, 0) 111 | r2start = romance.VnvSuffix(word, isLowerVowel, r1start) 112 | 113 | // Set RV, by default, as empty. 114 | rvstart = len(word.RS) 115 | 116 | // Handle the three special cases: "par", "col", & "tap" 117 | // 118 | prefix := word.FirstPrefix("par", "col", "tap") 119 | if prefix != "" { 120 | rvstart = utf8.RuneCountInString(prefix) 121 | return 122 | } 123 | 124 | // If the word begins with two vowels, RV is the region after the third letter 125 | if len(word.RS) >= 3 && isLowerVowel(word.RS[0]) && isLowerVowel(word.RS[1]) { 126 | rvstart = 3 127 | return 128 | } 129 | 130 | // Otherwise the region after the first vowel not at the beginning of the word. 131 | for i := 1; i < len(word.RS); i++ { 132 | if isLowerVowel(word.RS[i]) { 133 | rvstart = i + 1 134 | return 135 | } 136 | } 137 | 138 | return 139 | } 140 | -------------------------------------------------------------------------------- /hungarian/stem_test.go: -------------------------------------------------------------------------------- 1 | package hungarian 2 | 3 | import ( 4 | "reflect" 5 | "testing" 6 | 7 | "github.com/kljensen/snowball/snowballword" 8 | ) 9 | 10 | func TestStemSentence(t *testing.T) { 11 | var pairs [][2]string 12 | var got []string 13 | for k, want := range map[string][]string{ 14 | `Tisztelettel az alábbi bankszámlára szeretném kérni az utalás. Raiffeisen 15 | Bank:999999999999999999999999.Tisztelettel:Horváth Péter 16 | 17 | Az alábbi email a KöBE hálózatán kívüli forrásból érkezett, kérjük, legyen óvatos a beágyazott linkekkel és csatolmányokkal! 
18 | `: []string{ 19 | "tisztel", "az", "alább", "bankszáml", "szeretne", "kérn", "az", "utalás", 20 | "raiffeis", "ba", "999999999999999999999999", "tisztel", "horváth", "péter", 21 | "az", "alább", "email", "a", "kö", "hálózat", "kívül", "forrás", "érkezet", 22 | "kér", "legyen", "óvatos", "a", "beágyazot", "link", "és", "csatolmány", 23 | }, 24 | } { 25 | pairs = StemSentence(pairs[:0], k) 26 | got = got[:0] 27 | for _, p := range pairs { 28 | got = append(got, p[1]) 29 | } 30 | if !reflect.DeepEqual(got, want) { 31 | t.Errorf("%q: got %q, wanted %q", k, got, want) 32 | } 33 | } 34 | } 35 | func TestStem(t *testing.T) { 36 | for k, want := range map[string]string{ 37 | "fiaiéi": "fi", 38 | "megkelkáposztásíthatatlanságoskodásaitokért": "megkelkáposztásíthatatlanságoskodás", 39 | } { 40 | if got := Stem(k, false); got != want { 41 | t.Errorf("%q: got %q, wanted %q", k, got, want) 42 | } 43 | } 44 | } 45 | 46 | func TestStep1(t *testing.T) { 47 | for k, want := range map[string]string{ 48 | "taccsal": "tacs", 49 | "téttel": "tét", 50 | "paddal": "pad", 51 | "padló": "padló", 52 | } { 53 | w := snowballword.New(k) 54 | preprocess(w) 55 | step1(w) 56 | if got := string(w.RS); got != want { 57 | t.Errorf("%q: got %q, wanted %q", k, got, want) 58 | } 59 | } 60 | } 61 | 62 | func TestStep2(t *testing.T) { 63 | for k, want := range map[string]string{ 64 | "padonként": "pad", 65 | "tétről": "tét", 66 | "palából": "pala", 67 | } { 68 | w := snowballword.New(k) 69 | preprocess(w) 70 | step2(w) 71 | if got := string(w.RS); got != want { 72 | t.Errorf("%q: got %q, wanted %q", k, got, want) 73 | } 74 | } 75 | } 76 | func TestStep3(t *testing.T) { 77 | for k, want := range map[string]string{ 78 | "banánként": "bana", 79 | "bányánként": "bánya", 80 | "lepkén": "lepke", 81 | } { 82 | w := snowballword.New(k) 83 | preprocess(w) 84 | step3(w) 85 | if got := string(w.RS); got != want { 86 | t.Errorf("%q: got %q, wanted %q", k, got, want) 87 | } 88 | } 89 | } 90 | func TestStep4(t *testing.T) { 91 | for k, want := range map[string]string{ 92 | "házastul": "ház", 93 | "képestül": "kép", 94 | "akóstul": "akó", 95 | "ruhástul": "ruha", 96 | "vízeséstül": "vízese", 97 | } { 98 | w := snowballword.New(k) 99 | preprocess(w) 100 | step4(w) 101 | if got := string(w.RS); got != want { 102 | t.Errorf("%q: got %q, wanted %q", k, got, want) 103 | } 104 | } 105 | } 106 | func TestStep5(t *testing.T) { 107 | for k, want := range map[string]string{ 108 | "fiaié": "fiaié", 109 | "blatté": "blat", 110 | } { 111 | w := snowballword.New(k) 112 | preprocess(w) 113 | step5(w) 114 | if got := string(w.RS); got != want { 115 | t.Errorf("%q: got %q, wanted %q", k, got, want) 116 | } 117 | } 118 | } 119 | func TestStep6(t *testing.T) { 120 | for k, want := range map[string]string{ 121 | "fiatoké": "fiat", 122 | "fiáéi": "fia", 123 | } { 124 | w := snowballword.New(k) 125 | preprocess(w) 126 | step6(w) 127 | if got := string(w.RS); got != want { 128 | t.Errorf("%q: got %q, wanted %q", k, got, want) 129 | } 130 | } 131 | } 132 | func TestStep7(t *testing.T) { 133 | for k, want := range map[string]string{ 134 | "mamájuk": "mama", 135 | "fenéjük": "fene", 136 | "bánatod": "bánat", 137 | } { 138 | w := snowballword.New(k) 139 | preprocess(w) 140 | step7(w) 141 | if got := string(w.RS); got != want { 142 | t.Errorf("%q: got %q, wanted %q", k, got, want) 143 | } 144 | } 145 | } 146 | func TestStep8(t *testing.T) { 147 | for k, want := range map[string]string{ 148 | "mamáid": "mama", 149 | "fenéitek": "fene", 150 | "bánatai": "bánat", 151 
| } { 152 | w := snowballword.New(k) 153 | preprocess(w) 154 | step8(w) 155 | if got := string(w.RS); got != want { 156 | t.Errorf("%q: got %q, wanted %q", k, got, want) 157 | } 158 | } 159 | } 160 | func TestStep9(t *testing.T) { 161 | for k, want := range map[string]string{ 162 | "mamák": "mama", 163 | "fenék": "fene", 164 | "bánatok": "bánat", 165 | } { 166 | w := snowballword.New(k) 167 | preprocess(w) 168 | step9(w) 169 | if got := string(w.RS); got != want { 170 | t.Errorf("%q: got %q, wanted %q", k, got, want) 171 | } 172 | } 173 | } 174 | -------------------------------------------------------------------------------- /russian/step1.go: -------------------------------------------------------------------------------- 1 | package russian 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | // "log" 8 | ) 9 | 10 | // Step 1 is the removal of standard suffixes, all of which must 11 | // occur in RV. 12 | // 13 | // Search for a PERFECTIVE GERUND ending. If one is found remove it, and 14 | // that is then the end of step 1. Otherwise try and remove a REFLEXIVE 15 | // ending, and then search in turn for (1) an ADJECTIVAL, (2) a VERB or 16 | // (3) a NOUN ending. As soon as one of the endings (1) to (3) is found 17 | // remove it, and terminate step 1. 18 | func step1(word *snowballword.SnowballWord) bool { 19 | 20 | // `stop` will be used to signal early termination 21 | var stop bool 22 | 23 | // Search for a PERFECTIVE GERUND ending 24 | stop = removePerfectiveGerundEnding(word) 25 | if stop { 26 | return true 27 | } 28 | 29 | // Next remove reflexive endings 30 | word.RemoveFirstSuffixIn(word.RVstart, "ся", "сь") 31 | 32 | // Next remove adjectival endings 33 | stop = removeAdjectivalEnding(word) 34 | if stop { 35 | return true 36 | } 37 | 38 | // Next remove verb endings 39 | stop = removeVerbEnding(word) 40 | if stop { 41 | return true 42 | } 43 | 44 | // Next remove noun endings 45 | suffix := word.RemoveFirstSuffixIn(word.RVstart, 46 | "иями", "ями", "иях", "иям", "ием", "ией", "ами", "ях", 47 | "ям", "ья", "ью", "ье", "ом", "ой", "ов", "ия", "ию", 48 | "ий", "ии", "ие", "ем", "ей", "еи", "ев", "ах", "ам", 49 | "я", "ю", "ь", "ы", "у", "о", "й", "и", "е", "а", 50 | ) 51 | if suffix != "" { 52 | return true 53 | } 54 | 55 | return false 56 | } 57 | 58 | // Remove perfective gerund endings and return true if one was removed. 59 | func removePerfectiveGerundEnding(word *snowballword.SnowballWord) bool { 60 | suffix := word.FirstSuffixIn(word.RVstart, len(word.RS), 61 | "ившись", "ывшись", "вшись", "ивши", "ывши", "вши", "ив", "ыв", "в", 62 | ) 63 | suffixLength := utf8.RuneCountInString(suffix) 64 | switch suffix { 65 | case "в", "вши", "вшись": 66 | 67 | // These are "Group 1" perfective gerund endings. 68 | // Group 1 endings must follow а (a) or я (ia) in RV. 69 | if precededByARinRV(word, suffixLength) == false { 70 | suffix = "" 71 | } 72 | 73 | } 74 | 75 | if suffix != "" { 76 | word.RemoveLastNRunes(suffixLength) 77 | return true 78 | } 79 | return false 80 | } 81 | 82 | // Remove adjectival endings and return true if one was removed. 83 | func removeAdjectivalEnding(word *snowballword.SnowballWord) bool { 84 | 85 | // Remove adjectival endings. Start by looking for 86 | // an adjective ending. 
87 | // 88 | suffix := word.RemoveFirstSuffixIn(word.RVstart, 89 | "ими", "ыми", "его", "ого", "ему", "ому", "ее", "ие", 90 | "ые", "ое", "ей", "ий", "ый", "ой", "ем", "им", "ым", 91 | "ом", "их", "ых", "ую", "юю", "ая", "яя", "ою", "ею", 92 | ) 93 | if suffix != "" { 94 | 95 | // We found an adjective ending. Remove optional participle endings. 96 | // 97 | newSuffix := word.FirstSuffixIn(word.RVstart, len(word.RS), 98 | "ивш", "ывш", "ующ", 99 | "ем", "нн", "вш", "ющ", "щ", 100 | ) 101 | suffixLength := utf8.RuneCountInString(newSuffix) 102 | 103 | switch newSuffix { 104 | case "ем", "нн", "вш", "ющ", "щ": 105 | 106 | // These are "Group 1" participle endings. 107 | // Group 1 endings must follow а (a) or я (ia) in RV. 108 | if precededByARinRV(word, suffixLength) == false { 109 | newSuffix = "" 110 | } 111 | } 112 | 113 | if newSuffix != "" { 114 | word.RemoveLastNRunes(suffixLength) 115 | } 116 | return true 117 | } 118 | return false 119 | } 120 | 121 | // Remove verb endings and return true if one was removed. 122 | func removeVerbEnding(word *snowballword.SnowballWord) bool { 123 | suffix := word.FirstSuffixIn(word.RVstart, len(word.RS), 124 | "уйте", "ейте", "ыть", "ыло", "ыли", "ыла", "уют", "ует", 125 | "нно", "йте", "ишь", "ить", "ите", "ило", "или", "ила", 126 | "ешь", "ете", "ены", "ено", "ена", "ят", "ют", "ыт", "ым", 127 | "ыл", "ую", "уй", "ть", "ны", "но", "на", "ло", "ли", "ла", 128 | "ит", "им", "ил", "ет", "ен", "ем", "ей", "ю", "н", "л", "й", 129 | ) 130 | suffixLength := utf8.RuneCountInString(suffix) 131 | 132 | switch suffix { 133 | case "ла", "на", "ете", "йте", "ли", "й", "л", "ем", "н", 134 | "ло", "но", "ет", "ют", "ны", "ть", "ешь", "нно": 135 | 136 | // These are "Group 1" verb endings. 137 | // Group 1 endings must follow а (a) or я (ia) in RV. 138 | if precededByARinRV(word, suffixLength) == false { 139 | suffix = "" 140 | } 141 | 142 | } 143 | 144 | if suffix != "" { 145 | word.RemoveLastNRunes(suffixLength) 146 | return true 147 | } 148 | return false 149 | } 150 | 151 | // There are multiple classes of endings that must be 152 | // preceded by а (a) or я (ia) in RV in order to be removed. 153 | func precededByARinRV(word *snowballword.SnowballWord, suffixLen int) bool { 154 | idx := len(word.RS) - suffixLen - 1 155 | if idx >= word.RVstart && (word.RS[idx] == 'а' || word.RS[idx] == 'я') { 156 | return true 157 | } 158 | return false 159 | } 160 | -------------------------------------------------------------------------------- /snowballword/snowballword_test.go: -------------------------------------------------------------------------------- 1 | package snowballword 2 | 3 | import "testing" 4 | 5 | func Test_New(t *testing.T) { 6 | w := New("kyle") 7 | if w.String() != "kyle" { 8 | t.Errorf("Expected \"%v\" but got \"%v\"", "kyle", w.String()) 9 | } 10 | } 11 | 12 | func Test_FirstPrefix(t *testing.T) { 13 | var testCases = []struct { 14 | input string 15 | prefixes []string 16 | prefix string 17 | }{ 18 | {"firehose", []string{"x", "fi"}, "fi"}, 19 | {"firehose", []string{"x", "fix", "fi"}, "fi"}, 20 | {"firehose", []string{"x", "fi"}, "fi"}, 21 | {"firehose", []string{"fire", "fi"}, "fire"}, 22 | {"firehose", []string{"fixre", "xfi"}, ""}, 23 | {"firehose", []string{"firehosex"}, ""}, 24 | } 25 | for _, tc := range testCases { 26 | w := New(tc.input) 27 | prefix := w.FirstPrefix(tc.prefixes...) 
28 | if prefix != tc.prefix { 29 | t.Errorf("Expected \"{%v}\" but got \"{%v}\"", tc.prefix, prefix) 30 | } 31 | } 32 | } 33 | 34 | func Test_FirstSuffix(t *testing.T) { 35 | var testCases = []struct { 36 | input string 37 | suffixes []string 38 | suffix string 39 | }{ 40 | {"firehose", []string{"x", "fi"}, ""}, 41 | {"firehose", []string{"x", "hose", "fi"}, "hose"}, 42 | {"firehose", []string{"x", "se"}, "se"}, 43 | {"firehose", []string{"fire", "xfirehose"}, ""}, 44 | } 45 | for _, tc := range testCases { 46 | w := New(tc.input) 47 | suffix := w.FirstSuffix(tc.suffixes...) 48 | if suffix != tc.suffix { 49 | t.Errorf("Expected \"{%v}\" but got \"{%v}\"", tc.suffix, suffix) 50 | } 51 | } 52 | } 53 | func Test_FirstSuffixIfIn(t *testing.T) { 54 | var testCases = []struct { 55 | input string 56 | startPos int 57 | endPos int 58 | suffixes []string 59 | suffix string 60 | }{ 61 | {"firehose", 0, 6, []string{"x", "fi"}, ""}, 62 | {"firehose", 0, 6, []string{"x", "eho", "fi"}, "eho"}, 63 | {"firehose", 0, 4, []string{"re", "se"}, "re"}, 64 | {"firehose", 0, 4, []string{"se", "xfirehose"}, ""}, 65 | {"firehose", 0, 4, []string{"fire", "xxx"}, "fire"}, 66 | {"firehose", 1, 5, []string{"fire", "xxx"}, ""}, 67 | // The follwoing tests shows how FirstSuffixIfIn works. It 68 | // first checks for the matching suffix and only then checks 69 | // to see if it is starts at or before startPos. This 70 | // is the behavior desired for many stemming steps but 71 | // is somewhat counterintuitive. 72 | {"firehose", 1, 5, []string{"fireh", "ireh", "h"}, ""}, 73 | {"firehose", 1, 5, []string{"ireh", "fireh", "h"}, "ireh"}, 74 | } 75 | for _, tc := range testCases { 76 | w := New(tc.input) 77 | suffix := w.FirstSuffixIfIn(tc.startPos, tc.endPos, tc.suffixes...) 78 | if suffix != tc.suffix { 79 | t.Errorf("Expected \"{%v}\" but got \"{%v}\"", tc.suffix, suffix) 80 | } 81 | } 82 | } 83 | 84 | func Test_ReplaceSuffixRunes(t *testing.T) { 85 | var testCases = []struct { 86 | input string 87 | suffix string 88 | repl string 89 | force bool 90 | output string 91 | }{ 92 | {"tonydanza", "danza", "yyy", true, "tonyyyy"}, 93 | {"tonydanza", "danza", "yyy", false, "tonyyyy"}, 94 | {"tonydanza", "danzad", "yyy", false, "tonydanza"}, 95 | {"tonydanza", "danzad", "yyy", true, "tonyyy"}, 96 | } 97 | for _, tc := range testCases { 98 | w := New(tc.input) 99 | w.ReplaceSuffixRunes([]rune(tc.suffix), []rune(tc.repl), tc.force) 100 | if w.String() != tc.output { 101 | t.Errorf("Expected %v -> \"%v\", but got \"%v\"", tc.input, tc.output, w.String()) 102 | } 103 | } 104 | 105 | } 106 | 107 | func Test_ReplaceSuffix(t *testing.T) { 108 | var testCases = []struct { 109 | input string 110 | r1start int 111 | r2start int 112 | suffix string 113 | repl string 114 | output string 115 | outputR1String string 116 | outputR2String string 117 | }{ 118 | {"accliviti", 2, 6, "iviti", "ive", "acclive", "clive", "e"}, 119 | {"skating", 4, 6, "ing", "e", "skate", "e", ""}, 120 | {"convirtiéndo", 3, 6, "iéndo", "iendo", "convirtiendo", "virtiendo", "tiendo"}, 121 | } 122 | for _, tc := range testCases { 123 | w := New(tc.input) 124 | w.R1start = tc.r1start 125 | w.R2start = tc.r2start 126 | w.ReplaceSuffix(tc.suffix, tc.repl, true) 127 | if w.String() != tc.output || w.R1String() != tc.outputR1String || w.R2String() != tc.outputR2String { 128 | t.Errorf("Expected %v -> \"{%v, %v, %v}\" but got \"{%v, %v, %v}\"", tc.input, tc.output, tc.outputR1String, tc.outputR2String, w.String(), w.R1String(), w.R2String()) 129 | } 130 | } 131 | } 132 | 133 | 
func Test_RemoveLastNRunes(t *testing.T) { 134 | var testCases = []struct { 135 | input string 136 | r1start int 137 | r2start int 138 | n int 139 | output string 140 | outputR1String string 141 | outputR2String string 142 | }{ 143 | {"aabbccddee", 8, 9, 0, "aabbccddee", "ee", "e"}, 144 | {"aabbccddee", 8, 9, 5, "aabbc", "", ""}, 145 | {"aabbccddee", 8, 9, 1, "aabbccdde", "e", ""}, 146 | } 147 | for _, tc := range testCases { 148 | w := New(tc.input) 149 | w.R1start = tc.r1start 150 | w.R2start = tc.r2start 151 | w.RemoveLastNRunes(tc.n) 152 | if w.String() != tc.output || w.R1String() != tc.outputR1String || w.R2String() != tc.outputR2String { 153 | t.Errorf("Expected %v -> \"{%v, %v, %v}\" but got \"{%v, %v, %v}\"", tc.input, tc.output, tc.outputR1String, tc.outputR2String, w.String(), w.R1String(), w.R2String()) 154 | } 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /spanish/common.go: -------------------------------------------------------------------------------- 1 | package spanish 2 | 3 | import ( 4 | "github.com/kljensen/snowball/romance" 5 | "github.com/kljensen/snowball/snowballword" 6 | ) 7 | 8 | // Change the vowels "áéíóú" into "aeiou". 9 | // 10 | func removeAccuteAccents(word *snowballword.SnowballWord) (didReplacement bool) { 11 | for i := 0; i < len(word.RS); i++ { 12 | switch word.RS[i] { 13 | case 225: 14 | // á -> a 15 | word.RS[i] = 97 16 | didReplacement = true 17 | case 233: 18 | // é -> e 19 | word.RS[i] = 101 20 | didReplacement = true 21 | case 237: 22 | // í -> i 23 | word.RS[i] = 105 24 | didReplacement = true 25 | case 243: 26 | // ó -> o 27 | word.RS[i] = 111 28 | didReplacement = true 29 | case 250: 30 | // ú -> u 31 | word.RS[i] = 117 32 | didReplacement = true 33 | } 34 | } 35 | return 36 | } 37 | 38 | // Find the starting point of the regions R1, R2, & RV 39 | // 40 | func findRegions(word *snowballword.SnowballWord) (r1start, r2start, rvstart int) { 41 | 42 | r1start = romance.VnvSuffix(word, isLowerVowel, 0) 43 | r2start = romance.VnvSuffix(word, isLowerVowel, r1start) 44 | rvstart = len(word.RS) 45 | 46 | if len(word.RS) >= 3 { 47 | switch { 48 | 49 | case !isLowerVowel(word.RS[1]): 50 | 51 | // If the second letter is a consonant, RV is the region after the 52 | // next following vowel. 53 | for i := 2; i < len(word.RS); i++ { 54 | if isLowerVowel(word.RS[i]) { 55 | rvstart = i + 1 56 | break 57 | } 58 | } 59 | 60 | case isLowerVowel(word.RS[0]) && isLowerVowel(word.RS[1]): 61 | 62 | // Or if the first two letters are vowels, RV 63 | // is the region after the next consonant. 64 | for i := 2; i < len(word.RS); i++ { 65 | if !isLowerVowel(word.RS[i]) { 66 | rvstart = i + 1 67 | break 68 | } 69 | } 70 | default: 71 | 72 | // Otherwise (consonant-vowel case) RV is the region after the 73 | // third letter. But RV is the end of the word if these 74 | // positions cannot be found. 75 | rvstart = 3 76 | } 77 | } 78 | 79 | return 80 | } 81 | 82 | // Checks if a rune is a lowercase Spanish vowel. 83 | // 84 | func isLowerVowel(r rune) bool { 85 | 86 | // The spanish vowels are "aeiouáéíóúü", which 87 | // are referenced by their unicode code points 88 | // in the switch statement below. 89 | switch r { 90 | case 97, 101, 105, 111, 117, 225, 233, 237, 243, 250, 252: 91 | return true 92 | } 93 | return false 94 | } 95 | 96 | // Return `true` if the input `word` is a Spanish stop word. 
97 | // 98 | func IsStopWord(word string) bool { 99 | switch word { 100 | case "de", "la", "que", "el", "en", "y", "a", "los", "del", "se", "las", 101 | "por", "un", "para", "con", "no", "una", "su", "al", "lo", "como", 102 | "más", "pero", "sus", "le", "ya", "o", "este", "sí", "porque", "esta", 103 | "entre", "cuando", "muy", "sin", "sobre", "también", "me", "hasta", 104 | "hay", "donde", "quien", "desde", "todo", "nos", "durante", "todos", 105 | "uno", "les", "ni", "contra", "otros", "ese", "eso", "ante", "ellos", 106 | "e", "esto", "mí", "antes", "algunos", "qué", "unos", "yo", "otro", 107 | "otras", "otra", "él", "tanto", "esa", "estos", "mucho", "quienes", 108 | "nada", "muchos", "cual", "poco", "ella", "estar", "estas", "algunas", 109 | "algo", "nosotros", "mi", "mis", "tú", "te", "ti", "tu", "tus", "ellas", 110 | "nosotras", "vosostros", "vosostras", "os", "mío", "mía", "míos", "mías", 111 | "tuyo", "tuya", "tuyos", "tuyas", "suyo", "suya", "suyos", "suyas", 112 | "nuestro", "nuestra", "nuestros", "nuestras", "vuestro", "vuestra", 113 | "vuestros", "vuestras", "esos", "esas", "estoy", "estás", "está", "estamos", 114 | "estáis", "están", "esté", "estés", "estemos", "estéis", "estén", "estaré", 115 | "estarás", "estará", "estaremos", "estaréis", "estarán", "estaría", 116 | "estarías", "estaríamos", "estaríais", "estarían", "estaba", "estabas", 117 | "estábamos", "estabais", "estaban", "estuve", "estuviste", "estuvo", 118 | "estuvimos", "estuvisteis", "estuvieron", "estuviera", "estuvieras", 119 | "estuviéramos", "estuvierais", "estuvieran", "estuviese", "estuvieses", 120 | "estuviésemos", "estuvieseis", "estuviesen", "estando", "estado", 121 | "estada", "estados", "estadas", "estad", "he", "has", "ha", "hemos", 122 | "habéis", "han", "haya", "hayas", "hayamos", "hayáis", "hayan", 123 | "habré", "habrás", "habrá", "habremos", "habréis", "habrán", "habría", 124 | "habrías", "habríamos", "habríais", "habrían", "había", "habías", 125 | "habíamos", "habíais", "habían", "hube", "hubiste", "hubo", "hubimos", 126 | "hubisteis", "hubieron", "hubiera", "hubieras", "hubiéramos", "hubierais", 127 | "hubieran", "hubiese", "hubieses", "hubiésemos", "hubieseis", "hubiesen", 128 | "habiendo", "habido", "habida", "habidos", "habidas", "soy", "eres", 129 | "es", "somos", "sois", "son", "sea", "seas", "seamos", "seáis", "sean", 130 | "seré", "serás", "será", "seremos", "seréis", "serán", "sería", "serías", 131 | "seríamos", "seríais", "serían", "era", "eras", "éramos", "erais", 132 | "eran", "fui", "fuiste", "fue", "fuimos", "fuisteis", "fueron", "fuera", 133 | "fueras", "fuéramos", "fuerais", "fueran", "fuese", "fueses", "fuésemos", 134 | "fueseis", "fuesen", "sintiendo", "sentido", "sentida", "sentidos", 135 | "sentidas", "siente", "sentid", "tengo", "tienes", "tiene", "tenemos", 136 | "tenéis", "tienen", "tenga", "tengas", "tengamos", "tengáis", "tengan", 137 | "tendré", "tendrás", "tendrá", "tendremos", "tendréis", "tendrán", 138 | "tendría", "tendrías", "tendríamos", "tendríais", "tendrían", "tenía", 139 | "tenías", "teníamos", "teníais", "tenían", "tuve", "tuviste", "tuvo", 140 | "tuvimos", "tuvisteis", "tuvieron", "tuviera", "tuvieras", "tuviéramos", 141 | "tuvierais", "tuvieran", "tuviese", "tuvieses", "tuviésemos", "tuvieseis", 142 | "tuviesen", "teniendo", "tenido", "tenida", "tenidos", "tenidas", "tened": 143 | return true 144 | } 145 | return false 146 | } 147 | -------------------------------------------------------------------------------- /hungarian/common.go: 
-------------------------------------------------------------------------------- 1 | package hungarian 2 | 3 | import ( 4 | "sync" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | var ( 10 | runesMapMu sync.Mutex 11 | runesMap = make(map[string][]rune) 12 | ) 13 | 14 | func runesOf(s string) []rune { 15 | runesMapMu.Lock() 16 | rs := runesMap[s] 17 | if rs == nil { 18 | rs = []rune(s) 19 | runesMap[s] = rs 20 | } 21 | runesMapMu.Unlock() 22 | return rs 23 | } 24 | 25 | // findRegions returns start of R1. 26 | // 27 | // If the word begins with a vowel, R1 is defined as the region after the first consonant or digraph in the word. 28 | // If the word begins with a consonant, it is defined as the region after the first vowel in the word. 29 | // If the word does not contain both a vowel and consonant, R1 is the null region at the end of the word. 30 | func findRegions(word *snowballword.SnowballWord) (r1start int) { 31 | if len(word.RS) < 2 { 32 | return 0 33 | } 34 | 35 | // If the word begins with a vowel, R1 is defined as the region 36 | // after the first consonant or digraph in the word. 37 | if isVowel(word.RS[0]) { 38 | for i := 1; i < len(word.RS); i++ { 39 | if isVowel(word.RS[i]) { 40 | continue 41 | } 42 | if j := isDigraph(word.RS[i:]); j > 0 { 43 | return i + j 44 | } 45 | // consonant 46 | return i + 1 47 | } 48 | return len(word.RS) 49 | } 50 | 51 | // If the word begins with a consonant, it is defined as the region 52 | // after the first vowel in the word. 53 | for i := 1; i < len(word.RS); i++ { 54 | if isVowel(word.RS[i]) { 55 | return i + 1 56 | } 57 | } 58 | return len(word.RS) 59 | } 60 | 61 | func isVowel(r rune) bool { 62 | switch r { 63 | case 'a', 'á', 'e', 'é', 'i', 'í', 'o', 'ó', 'ö', 'ő', 'u', 'ú', 'ü', 'ű': 64 | return true 65 | } 66 | return false 67 | } 68 | func isDigraph(rs []rune) int { 69 | if len(rs) < 2 { 70 | return 0 71 | } 72 | switch rs[0] { 73 | case 'c', 'z': // cs, zs 74 | if rs[1] == 's' { 75 | return 2 76 | } 77 | case 'd': 78 | if rs[1] == 'z' { 79 | if len(rs) > 2 && rs[2] == 's' { // dzs 80 | return 3 81 | } 82 | return 2 // dz 83 | } 84 | case 'g', 'l', 'n', 't': 85 | if rs[1] == 'y' { 86 | return 2 87 | } 88 | } 89 | return 0 90 | } 91 | 92 | func isConsonant(r rune) bool { 93 | switch r { 94 | case 'b', 'c', 'd', 'f', 'g', 'j', 'k', 'l', 'm', 'n', 'p', 'r', 's', 't', 'v', 'z': 95 | return true 96 | } 97 | return false 98 | } 99 | func isDoubleConsonant(rs []rune) int { 100 | if len(rs) < 2 || !isConsonant(rs[0]) || rs[0] != rs[1] { 101 | return 0 102 | } 103 | if len(rs) > 2 { 104 | switch rs[0] { 105 | case 'c', 'z': 106 | if rs[2] == 's' { 107 | return 3 108 | } 109 | case 's': 110 | if rs[2] == 'z' { 111 | return 3 112 | } 113 | case 'g', 'l', 'n', 't': 114 | if rs[2] == 'y' { 115 | return 3 116 | } 117 | } 118 | } 119 | return 2 120 | } 121 | 122 | // IsStopWord returns true it the word is a stop word. 
123 | // 124 | // # Hungarian stop word list prepared by Anna Tordai 125 | // 126 | // https://snowballstem.org/algorithms/hungarian/stop.txt 127 | func IsStopWord(word string) bool { 128 | switch word { 129 | case "a", 130 | "ahogy", 131 | "ahol", 132 | "aki", 133 | "akik", 134 | "akkor", 135 | "alatt", 136 | "által", 137 | "általában", 138 | "amely", 139 | "amelyek", 140 | "amelyekben", 141 | "amelyeket", 142 | "amelyet", 143 | "amelynek", 144 | "ami", 145 | "amit", 146 | "amolyan", 147 | "amíg", 148 | "amikor", 149 | "át", 150 | "abban", 151 | "ahhoz", 152 | "annak", 153 | "arra", 154 | "arról", 155 | "az", 156 | "azok", 157 | "azon", 158 | "azt", 159 | "azzal", 160 | "azért", 161 | "aztán", 162 | "azután", 163 | "azonban", 164 | "bár", 165 | "be", 166 | "belül", 167 | "benne", 168 | "cikk", 169 | "cikkek", 170 | "cikkeket", 171 | "csak", 172 | "de", 173 | "e", 174 | "eddig", 175 | "egész", 176 | "egy", 177 | "egyes", 178 | "egyetlen", 179 | "egyéb", 180 | "egyik", 181 | "egyre", 182 | "ekkor", 183 | "el", 184 | "elég", 185 | "ellen", 186 | "elő", 187 | "először", 188 | "előtt", 189 | "első", 190 | "én", 191 | "éppen", 192 | "ebben", 193 | "ehhez", 194 | "emilyen", 195 | "ennek", 196 | "erre", 197 | "ez", 198 | "ezt", 199 | "ezek", 200 | "ezen", 201 | "ezzel", 202 | "ezért", 203 | "és", 204 | "fel", 205 | "felé", 206 | "hanem", 207 | "hiszen", 208 | "hogy", 209 | "hogyan", 210 | "igen", 211 | "így", 212 | "illetve", 213 | "ill.", 214 | "ill", 215 | "ilyen", 216 | "ilyenkor", 217 | "ison", 218 | "ismét", 219 | "itt", 220 | "jó", 221 | "jól", 222 | "jobban", 223 | "kell", 224 | "kellett", 225 | "keresztül", 226 | "keressünk", 227 | "ki", 228 | "kívül", 229 | "között", 230 | "közül", 231 | "legalább", 232 | "lehet", 233 | "lehetett", 234 | "legyen", 235 | "lenne", 236 | "lenni", 237 | "lesz", 238 | "lett", 239 | "maga", 240 | "magát", 241 | "majd", 242 | "már", 243 | "más", 244 | "másik", 245 | "meg", 246 | "még", 247 | "mellett", 248 | "mert", 249 | "mely", 250 | "melyek", 251 | "mi", 252 | "mit", 253 | "míg", 254 | "miért", 255 | "milyen", 256 | "mikor", 257 | "minden", 258 | "mindent", 259 | "mindenki", 260 | "mindig", 261 | "mint", 262 | "mintha", 263 | "mivel", 264 | "most", 265 | "nagy", 266 | "nagyobb", 267 | "nagyon", 268 | "ne", 269 | "néha", 270 | "nekem", 271 | "neki", 272 | "nem", 273 | "néhány", 274 | "nélkül", 275 | "nincs", 276 | "olyan", 277 | "ott", 278 | "össze", 279 | "ő", 280 | "ők", 281 | "őket", 282 | "pedig", 283 | "persze", 284 | "rá", 285 | "s", 286 | "saját", 287 | "sem", 288 | "semmi", 289 | "sok", 290 | "sokat", 291 | "sokkal", 292 | "számára", 293 | "szemben", 294 | "szerint", 295 | "szinte", 296 | "talán", 297 | "tehát", 298 | "teljes", 299 | "tovább", 300 | "továbbá", 301 | "több", 302 | "úgy", 303 | "ugyanis", 304 | "új", 305 | "újabb", 306 | "újra", 307 | "után", 308 | "utána", 309 | "utolsó", 310 | "vagy", 311 | "vagyis", 312 | "valaki", 313 | "valami", 314 | "valamint", 315 | "való", 316 | "vagyok", 317 | "van", 318 | "vannak", 319 | "volt", 320 | "voltam", 321 | "voltak", 322 | "voltunk", 323 | "vissza", 324 | "vele", 325 | "viszont", 326 | "volna": 327 | return true 328 | } 329 | return false 330 | } 331 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Snowball 2 | ======== 3 | 4 | 5 | A [Go (golang)](http://golang.org) implementation of the 6 | [Snowball stemmer](http://snowball.tartarus.org/) 7 | for natural language 
processing. 8 | 9 | 10 | | | Status | 11 | | -------------------- | ------------------------- | 12 | | Latest release | [v0.10.0](https://github.com/kljensen/snowball/tags) (2024-08-13) | 13 | | Latest build status | [![Build](https://github.com/kljensen/snowball/workflows/Build/badge.svg?event=push)](https://github.com/kljensen/snowball/actions) | 14 | | Languages available | English, Spanish (español), French (le français), Russian (ру́сский язы́к), Swedish (svenska), Norwegian (norsk), Hungarian (magyar) | 15 | | License | MIT | 16 | 17 | 18 | ## Usage 19 | 20 | 21 | Here is a minimal Go program that uses this package 22 | to stem a single word. 23 | 24 | ```go 25 | package main 26 | import ( 27 | "fmt" 28 | "github.com/kljensen/snowball" 29 | ) 30 | func main() { 31 | stemmed, err := snowball.Stem("Accumulations", "english", true) 32 | if err == nil { 33 | fmt.Println(stemmed) // Prints "accumul" 34 | } 35 | } 36 | ``` 37 | 38 | 39 | ## Organization & Implementation 40 | 41 | The code is organized as follows: 42 | 43 | * The top-level `snowball` package has a single exported function `snowball.Stem`, 44 | which is defined in `snowball/snowball.go`. 45 | * The stemmer for each language is defined in a "sub-package", e.g. `snowball/spanish`. 46 | * Each language exports a `Stem` function: e.g. `spanish.Stem`, 47 | which is defined in `snowball/spanish/stem.go`. 48 | * Code that is common to multiple languages may go in a separate package, 49 | e.g. the small `romance` package. 50 | 51 | Some notes about the implementation: 52 | 53 | * In order to ensure the code is easily extended to non-English languages, 54 | I avoided using bytes and byte arrays, and instead perform all operations 55 | on runes. See `snowball/snowballword/snowballword.go` and the 56 | `SnowballWord` struct. 57 | * In order to avoid casting strings into slices of runes numerous times, 58 | this implementation uses a single slice of runes stored in the `SnowballWord` 59 | struct for each word that needs to be stemmed. 60 | * In spite of the foregoing, readability requires that some strings be 61 | kept around and repeatedly cast into slices of runes. For example, 62 | in the Spanish stemmer, one step requires removing suffixes with acute 63 | accents such as "ución", "logía", and "logías". If I were to hard-code those 64 | suffixes as slices of runes, the code would be substantially less readable. 65 | * Instead of carrying around the word regions R1, R2, & RV as separate strings 66 | (or slices of runes, or whatever), we carry around the index where each of 67 | these regions begins. These are stored as `R1start`, `R2start`, & `RVstart` 68 | on the `SnowballWord` struct. I believe this is a relatively efficient way of 69 | storing these regions. (See the "Using a sub-package directly" sketch below.) 70 | * The code does not use any maps or regular expressions 1) for kicks, and 2) because 71 | I thought they'd negatively impact the performance. (But, mostly for #1; I realize 72 | #2 is silly.) 73 | * I end up refactoring the `snowballword` package a bit every time I implement a 74 | new language. 75 | * Clearly, the Go implementation of these stemmers is verbose relative to the 76 | Snowball language. However, it is much better than the 77 | [Java version](https://github.com/weavejester/snowball-stemmer/blob/master/src/java/org/tartarus/snowball/ext/frenchStemmer.java) 78 | and [others](https://github.com/patch/lingua-stem-unine-pm5/blob/master/src/frenchStemmerPlus.txt). 79 | 80 | ## Testing 81 | 82 | To run the tests, do `go test ./...` in the top-level directory.
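## Using a sub-package directly

As a small illustration of the layout described under "Organization & Implementation" above, here is a minimal sketch that calls the English sub-package directly and pokes at a `SnowballWord` through the exported pieces of the `snowballword` package. It is only a sketch of the public surface, not the recommended workflow; the top-level `snowball.Stem` shown under "Usage" remains the usual entry point.

```go
package main

import (
	"fmt"

	"github.com/kljensen/snowball/english"
	"github.com/kljensen/snowball/snowballword"
)

func main() {
	// Each language sub-package exports its own Stem function; the
	// second argument controls whether stop words are stemmed.
	fmt.Println(english.Stem("accumulations", true)) // accumul
	fmt.Println(english.IsStopWord("because"))       // true

	// A SnowballWord holds the word as a slice of runes plus the start
	// indices of the R1, R2, and RV regions. New() leaves the indices
	// at the end of the word; they are normally filled in by each
	// language's own (unexported) preprocessing step.
	w := snowballword.New("beautiful")
	fmt.Println(string(w.RS), w.R1start, w.R2start, w.RVstart) // beautiful 9 9 9

	// Suffix handling works on the rune slice in place and keeps the
	// region indices within bounds.
	if w.ReplaceSuffix("ful", "", false) {
		fmt.Println(w.String()) // beauti
	}
}
```

Because the regions are carried as start indices rather than as copies of the substrings, the per-language step functions can share one representation of the word and simply adjust those indices as suffixes are removed.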
83 | 84 | ## Future work 85 | 86 | I'd like to implement the Snowball stemmer in more languages. 87 | If you can help, I would greatly appreciate it: please fork the project and send 88 | a pull request! 89 | 90 | (Also, if you are interested in creating a larger NLP project for Go, please get in touch.) 91 | 92 | ## Related work 93 | 94 | I know of a few other stemmers available in Go: 95 | 96 | * [stemmer](https://github.com/dchest/stemmer) by [Dmitry Chestnykh](https://github.com/dchest). 97 | His project also 98 | implements the Snowball (Porter2) English stemmer as well as the Snowball German stemmer. 99 | * [porter-stemmer](https://github.com/a2800276/porter-stemmer.go) - an implementation of the 100 | original Porter stemming algorithm. 101 | * [go-stem](https://github.com/agonopol/go-stem) by [Alex Gonopolskiy](https://github.com/agonopol). 102 | Also the original Porter algorithm. 103 | * [paicehusk](https://github.com/Rookii/paicehusk) by [Aaron Groves](https://github.com/rookii). 104 | This package implements the 105 | [Paice/Husk](http://www.comp.lancs.ac.uk/computing/research/stemming/) 106 | stemmer. 107 | * [golibstemmer](https://github.com/rjohnsondev/golibstemmer) 108 | by [Richard Johnson](https://github.com/rjohnsondev). This provides Go bindings for the 109 | [libstemmer](http://snowball.tartarus.org/download.php) C library. 110 | * [snowball](https://bitbucket.org/tebeka/snowball) by [Miki Tebeka](http://web.mikitebeka.com/). 111 | Also, I believe, Go bindings for the C library. 112 | 113 | ## Contributors 114 | 115 | * Kyle Jensen (kljensen@gmail.com, [@DataKyle](http://twitter.com/datakyle)) 116 | * [Shawn Smith](https://github.com/shawnps) 117 | * [Herman Schaaf](https://github.com/hermanschaaf) 118 | * [Anton Södergren](https://github.com/AAAton) 119 | * [Eivind Moland](https://github.com/eivindam) 120 | * [Tamás Gulácsi](https://github.com/tgulacsi) 121 | * [@clipperhouse](https://github.com/clipperhouse) 122 | * Your name should be here! 123 | 124 | 125 | ## License (MIT) 126 | 127 | Copyright (c) the Contributors (see above) 128 | 129 | Permission is hereby granted, free of charge, to any person obtaining 130 | a copy of this software and associated documentation files (the 131 | "Software"), to deal in the Software without restriction, including 132 | without limitation the rights to use, copy, modify, merge, publish, 133 | distribute, sublicense, and/or sell copies of the Software, and to 134 | permit persons to whom the Software is furnished to do so, subject to 135 | the following conditions: 136 | 137 | The above copyright notice and this permission notice shall be 138 | included in all copies or substantial portions of the Software. 139 | 140 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 141 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 142 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 143 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 144 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 145 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 146 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
147 | -------------------------------------------------------------------------------- /french/step1.go: -------------------------------------------------------------------------------- 1 | package french 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/kljensen/snowball/snowballword" 7 | ) 8 | 9 | // Step 1 is the removal of standard suffixes 10 | func step1(word *snowballword.SnowballWord) bool { 11 | suffix := word.FirstSuffix( 12 | "issements", "issement", "atrices", "utions", "usions", "logies", 13 | "emment", "ements", "atrice", "ations", "ateurs", "amment", "ution", 14 | "usion", "ments", "logie", "istes", "ismes", "iqUes", "euses", 15 | "ences", "ement", "ation", "ateur", "ances", "ables", "ment", 16 | "ités", "iste", "isme", "iqUe", "euse", "ence", "eaux", "ance", 17 | "able", "ives", "ité", "eux", "aux", "ive", "ifs", "if", 18 | ) 19 | 20 | if suffix == "" { 21 | return false 22 | } 23 | suffixLength := utf8.RuneCountInString(suffix) 24 | 25 | isInR1 := (word.R1start <= len(word.RS)-suffixLength) 26 | isInR2 := (word.R2start <= len(word.RS)-suffixLength) 27 | isInRV := (word.RVstart <= len(word.RS)-suffixLength) 28 | 29 | // Handle simple replacements & deletions in R2 first 30 | if isInR2 { 31 | 32 | // Handle simple replacements in R2 33 | repl := "" 34 | switch suffix { 35 | case "logie", "logies": 36 | repl = "log" 37 | case "usion", "ution", "usions", "utions": 38 | repl = "u" 39 | case "ence", "ences": 40 | repl = "ent" 41 | } 42 | if repl != "" { 43 | word.ReplaceSuffixRunes([]rune(suffix), []rune(repl), true) 44 | return true 45 | } 46 | 47 | // Handle simple deletions in R2 48 | switch suffix { 49 | case "ance", "iqUe", "isme", "able", "iste", "eux", "ances", "iqUes", "ismes", "ables", "istes": 50 | word.RemoveLastNRunes(suffixLength) 51 | return true 52 | } 53 | } 54 | 55 | // Handle simple replacements in RV 56 | if isInRV { 57 | 58 | // NOTE: these are "special" suffixes in that 59 | // we must still do steps 2a and 2b of the 60 | // French stemmer even when these suffixes are 61 | // found in step1. Therefore, we are returning 62 | // `false` here. 63 | 64 | repl := "" 65 | switch suffix { 66 | case "amment": 67 | repl = "ant" 68 | case "emment": 69 | repl = "ent" 70 | } 71 | if repl != "" { 72 | word.ReplaceSuffixRunes([]rune(suffix), []rune(repl), true) 73 | return false 74 | } 75 | 76 | // Delete if preceded by a vowel that is also in RV 77 | if suffix == "ment" || suffix == "ments" { 78 | idx := len(word.RS) - suffixLength - 1 79 | if idx >= word.RVstart && isLowerVowel(word.RS[idx]) { 80 | word.RemoveLastNRunes(suffixLength) 81 | return false 82 | } 83 | return false 84 | } 85 | } 86 | 87 | // Handle all the other "special" cases. All of these 88 | // return true immediately after changing the word. 
89 | // 90 | switch suffix { 91 | case "eaux": 92 | 93 | // Replace with eau 94 | word.ReplaceSuffixRunes([]rune(suffix), []rune("eau"), true) 95 | return true 96 | 97 | case "aux": 98 | 99 | // Replace with al if in R1 100 | if isInR1 { 101 | word.ReplaceSuffixRunes([]rune(suffix), []rune("al"), true) 102 | return true 103 | } 104 | 105 | case "euse", "euses": 106 | 107 | // Delete if in R2, else replace by eux if in R1 108 | if isInR2 { 109 | word.RemoveLastNRunes(suffixLength) 110 | return true 111 | } else if isInR1 { 112 | word.ReplaceSuffixRunes([]rune(suffix), []rune("eux"), true) 113 | return true 114 | } 115 | 116 | case "issement", "issements": 117 | 118 | // Delete if in R1 and preceded by a non-vowel 119 | if isInR1 { 120 | idx := len(word.RS) - suffixLength - 1 121 | if idx >= 0 && isLowerVowel(word.RS[idx]) == false { 122 | word.RemoveLastNRunes(suffixLength) 123 | return true 124 | } 125 | } 126 | return false 127 | 128 | case "atrice", "ateur", "ation", "atrices", "ateurs", "ations": 129 | 130 | // Delete if in R2 131 | if isInR2 { 132 | word.RemoveLastNRunes(suffixLength) 133 | 134 | // If preceded by "ic", delete if in R2, else replace by "iqU". 135 | newSuffix := word.FirstSuffix("ic") 136 | newSuffixRunes := []rune(newSuffix) 137 | if newSuffix != "" { 138 | if word.FitsInR2(len(newSuffixRunes)) { 139 | word.RemoveLastNRunes(len(newSuffixRunes)) 140 | } else { 141 | word.ReplaceSuffixRunes(newSuffixRunes, []rune("iqU"), true) 142 | } 143 | } 144 | return true 145 | } 146 | 147 | case "ement", "ements": 148 | 149 | if isInRV { 150 | 151 | // Delete if in RV 152 | word.RemoveLastNRunes(suffixLength) 153 | 154 | // If preceded by "iv", delete if in R2 155 | // (and if further preceded by "at", delete if in R2) 156 | newSuffix := word.RemoveFirstSuffixIfIn(word.R2start, "iv") 157 | newSuffixRunes := []rune(newSuffix) 158 | if newSuffix != "" { 159 | word.RemoveFirstSuffixIfIn(word.R2start, "at") 160 | return true 161 | } 162 | 163 | // If preceded by "eus", delete if in R2, else replace by "eux" if in R1 164 | newSuffix = word.FirstSuffix("eus") 165 | newSuffixRunes = []rune(newSuffix) 166 | if newSuffix != "" { 167 | newSuffixLen := len(newSuffixRunes) 168 | if word.FitsInR2(newSuffixLen) { 169 | word.RemoveLastNRunes(newSuffixLen) 170 | } else if word.FitsInR1(newSuffixLen) { 171 | word.ReplaceSuffixRunes(newSuffixRunes, []rune("eux"), true) 172 | } 173 | return true 174 | } 175 | 176 | // If preceded by abl or iqU, delete if in R2, otherwise, 177 | newSuffix = word.FirstSuffix("abl", "iqU") 178 | if newSuffix != "" { 179 | newSuffixLen := utf8.RuneCountInString(newSuffix) 180 | if word.FitsInR2(newSuffixLen) { 181 | word.RemoveLastNRunes(newSuffixLen) 182 | } 183 | return true 184 | } 185 | 186 | // If preceded by ièr or Ièr, replace by i if in RV 187 | newSuffix = word.FirstSuffix("ièr", "Ièr") 188 | newSuffixRunes = []rune(newSuffix) 189 | if newSuffix != "" { 190 | if word.FitsInRV(len(newSuffixRunes)) { 191 | word.ReplaceSuffixRunes(newSuffixRunes, []rune("i"), true) 192 | } 193 | return true 194 | } 195 | 196 | return true 197 | } 198 | 199 | case "ité", "ités": 200 | 201 | if isInR2 { 202 | 203 | // Delete if in R2 204 | word.RemoveLastNRunes(suffixLength) 205 | 206 | // If preceded by "abil", delete if in R2, else replace by "abl" 207 | newSuffix := word.FirstSuffix("abil") 208 | if newSuffix != "" { 209 | newSuffixLen := utf8.RuneCountInString(newSuffix) 210 | if word.FitsInR2(newSuffixLen) { 211 | word.RemoveLastNRunes(newSuffixLen) 212 | } else { 213 | 
word.ReplaceSuffixRunes([]rune(newSuffix), []rune("abl"), true) 214 | } 215 | return true 216 | } 217 | 218 | // If preceded by "ic", delete if in R2, else replace by "iqU" 219 | newSuffix = word.FirstSuffix("ic") 220 | if newSuffix != "" { 221 | newSuffixLen := utf8.RuneCountInString(newSuffix) 222 | if word.FitsInR2(newSuffixLen) { 223 | word.RemoveLastNRunes(newSuffixLen) 224 | } else { 225 | word.ReplaceSuffixRunes([]rune(newSuffix), []rune("iqU"), true) 226 | } 227 | return true 228 | } 229 | 230 | // If preceded by "iv", delete if in R2 231 | newSuffix = word.RemoveFirstSuffixIfIn(word.R2start, "iv") 232 | return true 233 | } 234 | case "if", "ive", "ifs", "ives": 235 | 236 | if isInR2 { 237 | 238 | // Delete if in R2 239 | word.RemoveLastNRunes(suffixLength) 240 | 241 | // If preceded by at, delete if in R2 242 | newSuffix := word.RemoveFirstSuffixIfIn(word.R2start, "at") 243 | if newSuffix != "" { 244 | 245 | // And if further preceded by ic, delete if in R2, else replace by iqU 246 | newSuffix = word.FirstSuffix("ic") 247 | if newSuffix != "" { 248 | newSuffixLen := utf8.RuneCountInString(newSuffix) 249 | if word.FitsInR2(newSuffixLen) { 250 | word.RemoveLastNRunes(newSuffixLen) 251 | } else { 252 | word.ReplaceSuffixRunes([]rune(newSuffix), []rune("iqU"), true) 253 | } 254 | } 255 | } 256 | return true 257 | 258 | } 259 | } 260 | return false 261 | } 262 | -------------------------------------------------------------------------------- /english/common.go: -------------------------------------------------------------------------------- 1 | package english 2 | 3 | import ( 4 | "github.com/kljensen/snowball/romance" 5 | "github.com/kljensen/snowball/snowballword" 6 | ) 7 | 8 | // Replaces all different kinds of apostrophes with a single 9 | // kind: "'" -- that is, "\x27", or unicode codepoint 39. 10 | func normalizeApostrophes(word *snowballword.SnowballWord) (numSubstitutions int) { 11 | for i, r := range word.RS { 12 | switch r { 13 | 14 | // The rune is one of "\u2019", "\u2018", or "\u201B"; 15 | // equivalently, unicode code points 8217, 8216, & 8219. 16 | case 8217, 8216, 8219: 17 | 18 | // (Note: the unicode code point for ' is 39.) 19 | 20 | word.RS[i] = 39 21 | numSubstitutions += 1 22 | } 23 | } 24 | return 25 | } 26 | 27 | // Trim off leading apostropes. (Slight variation from 28 | // NLTK implementation here, in which only the first is removed.) 
29 | func trimLeftApostrophes(word *snowballword.SnowballWord) { 30 | var ( 31 | numApostrophes int 32 | r rune 33 | ) 34 | 35 | for numApostrophes, r = range word.RS { 36 | 37 | // Check for "'", which is unicode code point 39 38 | if r != 39 { 39 | break 40 | } 41 | } 42 | if numApostrophes > 0 { 43 | word.RS = word.RS[numApostrophes:] 44 | word.R1start = word.R1start - numApostrophes 45 | word.R2start = word.R2start - numApostrophes 46 | } 47 | } 48 | 49 | // Capitalize all 'Y's preceded by vowels or starting a word 50 | func capitalizeYs(word *snowballword.SnowballWord) (numCapitalizations int) { 51 | for i, r := range word.RS { 52 | 53 | // (Note: Y & y unicode code points = 89 & 121) 54 | 55 | if r == 121 && (i == 0 || isLowerVowel(word.RS[i-1])) { 56 | word.RS[i] = 89 57 | numCapitalizations += 1 58 | } 59 | } 60 | return 61 | } 62 | 63 | // Uncapitalize all 'Y's 64 | func uncapitalizeYs(word *snowballword.SnowballWord) { 65 | for i, r := range word.RS { 66 | 67 | // (Note: Y & y unicode code points = 89 & 121) 68 | 69 | if r == 89 { 70 | word.RS[i] = 121 71 | } 72 | } 73 | return 74 | } 75 | 76 | // Find the starting point of the two regions R1 & R2. 77 | // 78 | // R1 is the region after the first non-vowel following a vowel, 79 | // or is the null region at the end of the word if there is no 80 | // such non-vowel. 81 | // 82 | // R2 is the region after the first non-vowel following a vowel 83 | // in R1, or is the null region at the end of the word if there 84 | // is no such non-vowel. 85 | // 86 | // See http://snowball.tartarus.org/texts/r1r2.html 87 | func r1r2(word *snowballword.SnowballWord) (r1start, r2start int) { 88 | 89 | specialPrefix := word.FirstPrefix("gener", "commun", "arsen") 90 | 91 | if specialPrefix != "" { 92 | r1start = len(specialPrefix) 93 | } else { 94 | r1start = romance.VnvSuffix(word, isLowerVowel, 0) 95 | } 96 | r2start = romance.VnvSuffix(word, isLowerVowel, r1start) 97 | return 98 | } 99 | 100 | // Checks if a rune is a lowercase English vowel. 101 | func isLowerVowel(r rune) bool { 102 | switch r { 103 | case 97, 101, 105, 111, 117, 121: 104 | return true 105 | } 106 | return false 107 | } 108 | 109 | // Returns the stemmed version of a word if it is a special 110 | // case, otherwise returns the empty string. 
111 | func stemSpecialWord(word string) (stemmed string) { 112 | switch word { 113 | case "skis": 114 | stemmed = "ski" 115 | case "skies": 116 | stemmed = "sky" 117 | case "dying": 118 | stemmed = "die" 119 | case "lying": 120 | stemmed = "lie" 121 | case "tying": 122 | stemmed = "tie" 123 | case "idly": 124 | stemmed = "idl" 125 | case "gently": 126 | stemmed = "gentl" 127 | case "ugly": 128 | stemmed = "ugli" 129 | case "early": 130 | stemmed = "earli" 131 | case "only": 132 | stemmed = "onli" 133 | case "singly": 134 | stemmed = "singl" 135 | case "sky": 136 | stemmed = "sky" 137 | case "news": 138 | stemmed = "news" 139 | case "howe": 140 | stemmed = "howe" 141 | case "atlas": 142 | stemmed = "atlas" 143 | case "cosmos": 144 | stemmed = "cosmos" 145 | case "bias": 146 | stemmed = "bias" 147 | case "andes": 148 | stemmed = "andes" 149 | case "inning": 150 | stemmed = "inning" 151 | case "innings": 152 | stemmed = "inning" 153 | case "outing": 154 | stemmed = "outing" 155 | case "outings": 156 | stemmed = "outing" 157 | case "canning": 158 | stemmed = "canning" 159 | case "cannings": 160 | stemmed = "canning" 161 | case "herring": 162 | stemmed = "herring" 163 | case "herrings": 164 | stemmed = "herring" 165 | case "earring": 166 | stemmed = "earring" 167 | case "earrings": 168 | stemmed = "earring" 169 | case "proceed": 170 | stemmed = "proceed" 171 | case "proceeds": 172 | stemmed = "proceed" 173 | case "proceeded": 174 | stemmed = "proceed" 175 | case "proceeding": 176 | stemmed = "proceed" 177 | case "exceed": 178 | stemmed = "exceed" 179 | case "exceeds": 180 | stemmed = "exceed" 181 | case "exceeded": 182 | stemmed = "exceed" 183 | case "exceeding": 184 | stemmed = "exceed" 185 | case "succeed": 186 | stemmed = "succeed" 187 | case "succeeds": 188 | stemmed = "succeed" 189 | case "succeeded": 190 | stemmed = "succeed" 191 | case "succeeding": 192 | stemmed = "succeed" 193 | } 194 | return 195 | } 196 | 197 | // Return `true` if the input `word` is an English stop word. 198 | func IsStopWord(word string) bool { 199 | switch word { 200 | case "a", "about", "above", "after", "again", "against", "all", "am", "an", 201 | "and", "any", "are", "as", "at", "be", "because", "been", "before", 202 | "being", "below", "between", "both", "but", "by", "can", "did", "do", 203 | "does", "doing", "don", "down", "during", "each", "few", "for", "from", 204 | "further", "had", "has", "have", "having", "he", "her", "here", "hers", 205 | "herself", "him", "himself", "his", "how", "i", "if", "in", "into", "is", 206 | "it", "its", "itself", "just", "me", "more", "most", "my", "myself", 207 | "no", "nor", "not", "now", "of", "off", "on", "once", "only", "or", 208 | "other", "our", "ours", "ourselves", "out", "over", "own", "s", "same", 209 | "she", "should", "so", "some", "such", "t", "than", "that", "the", "their", 210 | "theirs", "them", "themselves", "then", "there", "these", "they", 211 | "this", "those", "through", "to", "too", "under", "until", "up", 212 | "very", "was", "we", "were", "what", "when", "where", "which", "while", 213 | "who", "whom", "why", "will", "with", "you", "your", "yours", "yourself", 214 | "yourselves": 215 | return true 216 | } 217 | return false 218 | } 219 | 220 | // A word is called short if it ends in a short syllable, and if R1 is null. 
221 | func isShortWord(w *snowballword.SnowballWord) (isShort bool) { 222 | 223 | // If r1 is not empty, the word is not short 224 | if w.R1start < len(w.RS) { 225 | return 226 | } 227 | 228 | // Otherwise it must end in a short syllable 229 | return endsShortSyllable(w, len(w.RS)) 230 | } 231 | 232 | // Return true if the indicies at `w.RS[:i]` end in a short syllable. 233 | // Define a short syllable in a word as either 234 | // (a) a vowel followed by a non-vowel other than w, x or Y 235 | // 236 | // and preceded by a non-vowel, or 237 | // 238 | // (b) a vowel at the beginning of the word followed by a non-vowel. 239 | func endsShortSyllable(w *snowballword.SnowballWord, i int) bool { 240 | 241 | if i == 2 { 242 | 243 | // Check for a vowel at the beginning of the word followed by a non-vowel. 244 | if isLowerVowel(w.RS[0]) && !isLowerVowel(w.RS[1]) { 245 | return true 246 | } else { 247 | return false 248 | } 249 | 250 | } else if i >= 3 { 251 | 252 | // The runes 1, 2, & 3 positions to the left of `i`. 253 | s1 := w.RS[i-1] 254 | s2 := w.RS[i-2] 255 | s3 := w.RS[i-3] 256 | 257 | // Check for a vowel followed by a non-vowel other than w, x or Y 258 | // and preceded by a non-vowel. 259 | // (Note: w, x, Y rune codepoints = 119, 120, 89) 260 | // 261 | if !isLowerVowel(s1) && s1 != 119 && s1 != 120 && s1 != 89 && isLowerVowel(s2) && !isLowerVowel(s3) { 262 | return true 263 | } else { 264 | return false 265 | } 266 | 267 | } 268 | return false 269 | } 270 | -------------------------------------------------------------------------------- /snowballword/snowballword.go: -------------------------------------------------------------------------------- 1 | /* 2 | This package defines a SnowballWord struct that is used 3 | to encapsulate most of the "state" variables we must track 4 | when stemming a word. The SnowballWord struct also has 5 | a few methods common to stemming in a variety of languages. 6 | */ 7 | package snowballword 8 | 9 | import ( 10 | "fmt" 11 | "unicode/utf8" 12 | ) 13 | 14 | // SnowballWord represents a word that is going to be stemmed. 15 | type SnowballWord struct { 16 | 17 | // A slice of runes 18 | RS []rune 19 | 20 | // The index in RS where the R1 region begins 21 | R1start int 22 | 23 | // The index in RS where the R2 region begins 24 | R2start int 25 | 26 | // The index in RS where the RV region begins 27 | RVstart int 28 | } 29 | 30 | // Create a new SnowballWord struct 31 | func New(in string) (word *SnowballWord) { 32 | word = &SnowballWord{RS: []rune(in)} 33 | word.R1start = len(word.RS) 34 | word.R2start = len(word.RS) 35 | word.RVstart = len(word.RS) 36 | return 37 | } 38 | 39 | // Replace a suffix and adjust R1start and R2start as needed. 40 | // If `force` is false, check to make sure the suffix exists first. 41 | func (w *SnowballWord) ReplaceSuffix(suffix, replacement string, force bool) bool { 42 | 43 | var ( 44 | doReplacement bool 45 | suffixRunes []rune 46 | ) 47 | if force { 48 | doReplacement = true 49 | suffixRunes = []rune(suffix) 50 | } else { 51 | var foundSuffix string 52 | foundSuffix = w.FirstSuffix(suffix) 53 | suffixRunes = []rune(foundSuffix) 54 | if foundSuffix == suffix { 55 | doReplacement = true 56 | } 57 | } 58 | if doReplacement == false { 59 | return false 60 | } 61 | w.ReplaceSuffixRunes(suffixRunes, []rune(replacement), true) 62 | return true 63 | } 64 | 65 | // Remove the last `n` runes from the SnowballWord. 
66 | func (w *SnowballWord) RemoveLastNRunes(n int) { 67 | w.RS = w.RS[:len(w.RS)-n] 68 | w.resetR1R2() 69 | } 70 | 71 | // Replace a suffix and adjust R1start and R2start as needed. 72 | // If `force` is false, check to make sure the suffix exists first. 73 | func (w *SnowballWord) ReplaceSuffixRunes(suffixRunes []rune, replacementRunes []rune, force bool) bool { 74 | 75 | if force || w.HasSuffixRunes(suffixRunes) { 76 | lenWithoutSuffix := len(w.RS) - len(suffixRunes) 77 | w.RS = append(w.RS[:lenWithoutSuffix], replacementRunes...) 78 | 79 | // If R, R2, & RV are now beyond the length 80 | // of the word, they are set to the length 81 | // of the word. Otherwise, they are left 82 | // as they were. 83 | w.resetR1R2() 84 | return true 85 | } 86 | return false 87 | } 88 | 89 | // Resets R1start and R2start to ensure they 90 | // are within bounds of the current rune slice. 91 | func (w *SnowballWord) resetR1R2() { 92 | rsLen := len(w.RS) 93 | if w.R1start > rsLen { 94 | w.R1start = rsLen 95 | } 96 | if w.R2start > rsLen { 97 | w.R2start = rsLen 98 | } 99 | if w.RVstart > rsLen { 100 | w.RVstart = rsLen 101 | } 102 | } 103 | 104 | // Return a slice of w.RS, allowing the start 105 | // and stop to be out of bounds. 106 | func (w *SnowballWord) slice(start, stop int) []rune { 107 | startMin := 0 108 | if start < startMin { 109 | start = startMin 110 | } 111 | max := len(w.RS) - 1 112 | if start > max { 113 | start = max 114 | } 115 | if stop > max { 116 | stop = max 117 | } 118 | return w.RS[start:stop] 119 | } 120 | 121 | // Returns true if `x` runes would fit into R1. 122 | func (w *SnowballWord) FitsInR1(x int) bool { 123 | return w.R1start <= len(w.RS)-x 124 | } 125 | 126 | // Returns true if `x` runes would fit into R2. 127 | func (w *SnowballWord) FitsInR2(x int) bool { 128 | return w.R2start <= len(w.RS)-x 129 | } 130 | 131 | // Returns true if `x` runes would fit into RV. 132 | func (w *SnowballWord) FitsInRV(x int) bool { 133 | return w.RVstart <= len(w.RS)-x 134 | } 135 | 136 | // Return the R1 region as a slice of runes 137 | func (w *SnowballWord) R1() []rune { 138 | return w.RS[w.R1start:] 139 | } 140 | 141 | // Return the R1 region as a string 142 | func (w *SnowballWord) R1String() string { 143 | return string(w.R1()) 144 | } 145 | 146 | // Return the R2 region as a slice of runes 147 | func (w *SnowballWord) R2() []rune { 148 | return w.RS[w.R2start:] 149 | } 150 | 151 | // Return the R2 region as a string 152 | func (w *SnowballWord) R2String() string { 153 | return string(w.R2()) 154 | } 155 | 156 | // Return the RV region as a slice of runes 157 | func (w *SnowballWord) RV() []rune { 158 | return w.RS[w.RVstart:] 159 | } 160 | 161 | // Return the RV region as a string 162 | func (w *SnowballWord) RVString() string { 163 | return string(w.RV()) 164 | } 165 | 166 | // Return the SnowballWord as a string 167 | func (w *SnowballWord) String() string { 168 | return string(w.RS) 169 | } 170 | 171 | func (w *SnowballWord) DebugString() string { 172 | return fmt.Sprintf("{\"%s\", %d, %d, %d}", w.String(), w.R1start, w.R2start, w.RVstart) 173 | } 174 | 175 | // Return the first prefix found or the empty string. 
176 | func (w *SnowballWord) FirstPrefix(prefixes ...string) (foundPrefix string) { 177 | found := false 178 | rsLen := len(w.RS) 179 | 180 | for _, prefix := range prefixes { 181 | prefixRunes := []rune(prefix) 182 | if len(prefixRunes) > rsLen { 183 | continue 184 | } 185 | 186 | found = true 187 | for i, r := range prefixRunes { 188 | if i > rsLen-1 || (w.RS)[i] != r { 189 | found = false 190 | break 191 | } 192 | } 193 | if found { 194 | foundPrefix = prefix 195 | break 196 | } 197 | } 198 | return 199 | } 200 | 201 | // Return true if `w.RS[startPos:endPos]` ends with runes from `suffixRunes`. 202 | // That is, the slice of runes between startPos and endPos have a suffix of 203 | // suffixRunes. 204 | func (w *SnowballWord) HasSuffixRunesIn(startPos, endPos int, suffixRunes []rune) bool { 205 | maxLen := endPos - startPos 206 | suffixLen := len(suffixRunes) 207 | if suffixLen > maxLen { 208 | return false 209 | } 210 | 211 | numMatching := 0 212 | for i := 0; i < maxLen && i < suffixLen; i++ { 213 | if w.RS[endPos-i-1] != suffixRunes[suffixLen-i-1] { 214 | break 215 | } else { 216 | numMatching += 1 217 | } 218 | } 219 | if numMatching == suffixLen { 220 | return true 221 | } 222 | return false 223 | } 224 | 225 | // Return true if `w` ends with `suffixRunes` 226 | func (w *SnowballWord) HasSuffixRunes(suffixRunes []rune) bool { 227 | return w.HasSuffixRunesIn(0, len(w.RS), suffixRunes) 228 | } 229 | 230 | // Find the first suffix that ends at `endPos` in the word among 231 | // those provided; then, 232 | // check to see if it begins after startPos. If it does, return 233 | // it, else return the empty string and empty rune slice. This 234 | // may seem a counterintuitive manner to do this. However, it 235 | // matches what is required most of the time by the Snowball 236 | // stemmer steps. 237 | func (w *SnowballWord) FirstSuffixIfIn(startPos, endPos int, suffixes ...string) (suffix string) { 238 | for _, suffix := range suffixes { 239 | suffixRunes := []rune(suffix) 240 | if w.HasSuffixRunesIn(0, endPos, suffixRunes) { 241 | if endPos-len(suffixRunes) >= startPos { 242 | return suffix 243 | } else { 244 | return "" 245 | } 246 | } 247 | } 248 | 249 | return "" 250 | } 251 | 252 | func (w *SnowballWord) FirstSuffixIn(startPos, endPos int, suffixes ...string) (suffix string) { 253 | for _, suffix := range suffixes { 254 | suffixRunes := []rune(suffix) 255 | if w.HasSuffixRunesIn(startPos, endPos, suffixRunes) { 256 | return suffix 257 | } 258 | } 259 | 260 | return "" 261 | } 262 | 263 | // Find the first suffix in the word among those provided; then, 264 | // check to see if it begins after startPos. If it does, 265 | // remove it. 266 | func (w *SnowballWord) RemoveFirstSuffixIfIn(startPos int, suffixes ...string) (suffix string) { 267 | suffix = w.FirstSuffixIfIn(startPos, len(w.RS), suffixes...) 268 | suffixLength := utf8.RuneCountInString(suffix) 269 | if suffix != "" { 270 | w.RemoveLastNRunes(suffixLength) 271 | } 272 | return 273 | } 274 | 275 | // Removes the first suffix found that is in `word.RS[startPos:len(word.RS)]` 276 | func (w *SnowballWord) RemoveFirstSuffixIn(startPos int, suffixes ...string) (suffix string) { 277 | suffix = w.FirstSuffixIn(startPos, len(w.RS), suffixes...) 
278 | suffixLength := utf8.RuneCountInString(suffix) 279 | if suffix != "" { 280 | w.RemoveLastNRunes(suffixLength) 281 | } 282 | return 283 | } 284 | 285 | // Removes the first suffix found 286 | func (w *SnowballWord) RemoveFirstSuffix(suffixes ...string) (suffix string) { 287 | return w.RemoveFirstSuffixIn(0, suffixes...) 288 | } 289 | 290 | // Return the first suffix found or the empty string. 291 | func (w *SnowballWord) FirstSuffix(suffixes ...string) (suffix string) { 292 | return w.FirstSuffixIfIn(0, len(w.RS), suffixes...) 293 | } 294 | -------------------------------------------------------------------------------- /hungarian/stem.go: -------------------------------------------------------------------------------- 1 | package hungarian 2 | 3 | import ( 4 | "log" 5 | "strings" 6 | "unicode" 7 | 8 | "github.com/kljensen/snowball/snowballword" 9 | ) 10 | 11 | func printDebug(debug bool, w *snowballword.SnowballWord) { 12 | if debug { 13 | log.Println(w.DebugString()) 14 | } 15 | } 16 | 17 | func StemSentence(pairs [][2]string, s string) [][2]string { 18 | for _, word := range strings.FieldsFunc(s, func(r rune) bool { 19 | return unicode.IsPunct(r) || unicode.IsSpace(r) 20 | }) { 21 | pairs = append(pairs, [2]string{word, Stem(word, false)}) 22 | } 23 | return pairs 24 | } 25 | 26 | // Stem an Hungarian word. This is the only exported 27 | // function in this package. 28 | // 29 | // This stemming algorithm removes the inflectional suffixes of nouns. Nouns are inflected for case, person/possession and number. 30 | // 31 | // Letters in Hungarian include the following accented forms, 32 | // 33 | // á é í ó ö ő ú ü ű 34 | // 35 | // The following letters are vowels: 36 | // 37 | // a á e é i í o ó ö ő u ú ü ű 38 | // 39 | // The following letters are digraphs: 40 | // 41 | // cs dz dzs gy ly ny ty zs 42 | // 43 | // A double consonant is defined as: 44 | // 45 | // bb cc ccs dd ff gg ggy jj kk ll lly mm nn nny pp rr ss ssz tt tty vv zz zzs 46 | func Stem(word string, stemStopwWords bool) string { 47 | 48 | word = strings.ToLower(strings.TrimSpace(word)) 49 | 50 | // Return small words and stop words 51 | if len(word) <= 2 || (!stemStopwWords && IsStopWord(word)) { 52 | return word 53 | } 54 | 55 | w := snowballword.New(word) 56 | 57 | // Stem the word. Note, each of these 58 | // steps will alter `w` in place. 59 | // 60 | 61 | preprocess(w) 62 | step1(w) 63 | step2(w) 64 | step3(w) 65 | step4(w) 66 | step5(w) 67 | step6(w) 68 | step7(w) 69 | step8(w) 70 | step9(w) 71 | 72 | return w.String() 73 | 74 | } 75 | 76 | func preprocess(w *snowballword.SnowballWord) { 77 | w.R1start = findRegions(w) 78 | } 79 | 80 | // step1 Remove instrumental case 81 | // 82 | // Search for one of the following suffixes and perform the action indicated. 83 | // 84 | // al el 85 | // 86 | // delete if in R1 and preceded by a double consonant, 87 | // and remove one of the double consonants. 88 | // (In the case of consonant plus digraph, such as ccs, remove a c). 89 | func step1(w *snowballword.SnowballWord) { 90 | n := len(w.RS) 91 | if n < 2 || 92 | !(w.RS[n-1] == 'l' && 93 | (w.RS[n-2] == 'a' || w.RS[n-2] == 'e')) { 94 | return 95 | } 96 | // in R1 97 | if w.R1start > n-2 || n < 4 { 98 | return 99 | } 100 | // (In the case of consonant plus digraph, such as ccs, remove a c). 
101 | if n >= 5 && isDoubleConsonant(w.RS[n-5:n-2]) > 2 { 102 | w.RS[n-5], w.RS[n-4] = w.RS[n-4], w.RS[n-3] 103 | w.RemoveLastNRunes(3) 104 | } else if n >= 4 && isDoubleConsonant(w.RS[n-4:n-2]) > 1 { 105 | // preceded by a double consonant 106 | w.RemoveLastNRunes(3) 107 | } 108 | } 109 | 110 | // Step 2: Remove frequent cases 111 | // 112 | // Search for the longest among the following suffixes and perform the action indicated. 113 | // 114 | // ban ben ba be ra re nak nek val vel tól től ról ről ból ből hoz hez höz nál nél ig at et ot öt ért képp képpen kor ul ül vá vé onként enként anként ként en on an ön n t 115 | // 116 | // delete if in R1 117 | // 118 | // if the remaining word ends á replace by a 119 | // if the remaining word ends é replace by e 120 | func step2(w *snowballword.SnowballWord) { 121 | if suffix := firstSuffixInR1(w, []string{ 122 | "onként", "enként", "anként", 123 | "képpen", 124 | "ként", 125 | "képp", 126 | "kor", 127 | "ban", "ben", "nak", "nek", "val", "vel", "tól", "től", "ról", "ről", "ból", "ből", "hoz", "hez", "höz", "nál", "nél", 128 | "ért", 129 | "ba", "be", "ra", "re", "ig", "at", "et", "ot", "öt", 130 | "ul", "ül", "vá", "vé", 131 | "en", "on", "an", "ön", 132 | "n", "t", 133 | }); suffix != "" { 134 | rs := runesOf(suffix) 135 | // delete if in R1 136 | w.RemoveLastNRunes(len(rs)) 137 | if len(w.RS) == 0 { 138 | return 139 | } 140 | switch w.RS[len(w.RS)-1] { 141 | case 'á': 142 | // if the remaining word ends á replace by a 143 | w.RS[len(w.RS)-1] = 'a' 144 | case 'é': 145 | // if the remaining word ends é replace by e 146 | w.RS[len(w.RS)-1] = 'e' 147 | } 148 | } 149 | } 150 | 151 | // step3: Remove special cases: 152 | // 153 | // Search for the longest among the following suffixes and perform the action indicated. 154 | // 155 | // án ánként 156 | // 157 | // replace by a if in R1 158 | // 159 | // én 160 | // 161 | // replace by e if in R1 162 | func step3(w *snowballword.SnowballWord) { 163 | if suffix := firstSuffixInR1(w, []string{ 164 | "ánként", "án", 165 | "én", 166 | }); suffix != "" { 167 | rs := runesOf(suffix) 168 | repl := 'a' 169 | if rs[0] == 'é' { 170 | repl = 'e' 171 | } 172 | w.RS[len(w.RS)-len(rs)] = repl 173 | w.RemoveLastNRunes(len(rs) - 1) 174 | } 175 | } 176 | 177 | // step4: Remove other cases: 178 | // 179 | // Search for the longest among the following suffixes and perform the action indicated 180 | // 181 | // astul estül stul stül 182 | // 183 | // delete if in R1 184 | // 185 | // ástul 186 | // 187 | // replace with a if in R1 188 | // 189 | // éstül 190 | // 191 | // replace with e if in R1 192 | func step4(w *snowballword.SnowballWord) { 193 | if suffix := firstSuffixInR1(w, []string{"ástul"}); suffix != "" { 194 | w.RemoveLastNRunes(4) 195 | w.RS[len(w.RS)-1] = 'a' 196 | return 197 | } 198 | if suffix := firstSuffixInR1(w, []string{"éstül"}); suffix != "" { 199 | w.RemoveLastNRunes(4) 200 | w.RS[len(w.RS)-1] = 'e' 201 | return 202 | } 203 | // astul estül stul stül 204 | if suffix := firstSuffixInR1(w, []string{"astul", "estül", "stul", "stül"}); suffix != "" { 205 | w.RemoveLastNRunes(len(runesOf(suffix))) 206 | return 207 | } 208 | } 209 | 210 | // step5: Remove factive case 211 | // 212 | // Search for one of the following suffixes and perform the action indicated. 213 | // 214 | // á é 215 | // 216 | // delete if in R1 and preceded by a double consonant, 217 | // and remove one of the double consonants (as in step 1). 
218 | func step5(w *snowballword.SnowballWord) { 219 | n := len(w.RS) 220 | if n < 3 || w.R1start >= n || !(w.RS[n-1] == 'á' || w.RS[n-1] == 'é') { 221 | return 222 | } 223 | // (In the case of consonant plus digraph, such as ccs, remove a c). 224 | if n >= 4 && isDoubleConsonant(w.RS[n-4:n-1]) > 2 { 225 | w.RS[n-4], w.RS[n-3] = w.RS[n-3], w.RS[n-1] 226 | w.RemoveLastNRunes(2) 227 | } else if isDoubleConsonant(w.RS[n-3:n-1]) > 1 { 228 | // preceded by a double consonant 229 | w.RemoveLastNRunes(2) 230 | } 231 | } 232 | 233 | // step6: Remove owned 234 | // Search for the longest among the following suffixes and perform the action indicated. 235 | // 236 | // oké öké aké eké ké éi é 237 | // 238 | // delete if in R1 239 | // 240 | // áké áéi 241 | // 242 | // replace with a if in R1 243 | // 244 | // éké ééi éé 245 | // 246 | // replace with e if in R1 247 | func step6(w *snowballword.SnowballWord) { 248 | if suffix := firstSuffixInR1(w, []string{ 249 | "áké", "áéi", 250 | "éké", "ééi", "éé", 251 | "oké", "öké", "aké", "eké", "ké", "éi", "é", 252 | }); suffix != "" { 253 | switch suffix { 254 | 255 | case "áké", "áéi": 256 | w.RemoveLastNRunes(2) 257 | w.RS[len(w.RS)-1] = 'a' 258 | 259 | case "éké", "ééi", "éé": 260 | w.RemoveLastNRunes(len(runesOf(suffix)) - 1) 261 | w.RS[len(w.RS)-1] = 'e' 262 | 263 | default: 264 | w.RemoveLastNRunes(len(runesOf(suffix))) 265 | } 266 | } 267 | } 268 | 269 | // step7: Remove singular owner suffixes 270 | // 271 | // Search for the longest among the following suffixes and perform the action indicated. 272 | // 273 | // ünk unk nk juk jük uk ük em om am m od ed ad öd d ja je a e o 274 | // 275 | // delete if in R1 276 | // 277 | // ánk ájuk ám ád á 278 | // 279 | // replace with a if in R1 280 | // 281 | // énk éjük ém éd é 282 | // 283 | // replace with e if in R1 284 | func step7(w *snowballword.SnowballWord) { 285 | if suffix := firstSuffixInR1(w, []string{ 286 | "ájuk", "éjük", 287 | "énk", 288 | "ünk", "unk", 289 | "juk", "jük", 290 | "ánk", 291 | "nk", 292 | "uk", "ük", "em", "om", "am", 293 | "od", "ed", "ad", "öd", "ja", "je", 294 | "ám", "ád", "ém", "éd", 295 | "m", "d", 296 | "a", "e", "o", 297 | "á", "é", 298 | }); suffix != "" { 299 | n := len(runesOf(suffix)) 300 | switch suffix { 301 | case "ánk", "ájuk", "ám", "ád", "á": 302 | w.RemoveLastNRunes(n - 1) 303 | w.RS[len(w.RS)-1] = 'a' 304 | case "énk", "éjük", "ém", "éd", "é": 305 | w.RemoveLastNRunes(n - 1) 306 | w.RS[len(w.RS)-1] = 'e' 307 | default: 308 | w.RemoveLastNRunes(n) 309 | } 310 | } 311 | } 312 | 313 | // step8: Remove plural owner suffixes 314 | // Search for the longest among the following suffixes and perform the action indicated. 
315 | // 316 | // jaim jeim aim eim im jaid jeid aid eid id jai jei ai ei i jaink jeink eink aink ink jaitok jeitek aitok eitek itek jeik jaik aik eik ik 317 | // 318 | // delete if in R1 319 | // 320 | // áim áid ái áink áitok áik 321 | // 322 | // replace with a if in R1 323 | // 324 | // éim éid éi éink éitek éik 325 | // 326 | // replace with e if in R1 327 | func step8(w *snowballword.SnowballWord) { 328 | if suffix := firstSuffixInR1(w, []string{ 329 | "jaitok", "jeitek", 330 | "jaink", "jeink", "aitok", "eitek", "áitok", "éitek", 331 | "áink", "éink", "itek", "jeik", "jaik", 332 | "eink", "aink", "jaim", "jeim", "jaid", "jeid", 333 | "áim", "áid", "áik", "éim", "éid", "éik", 334 | "ink", "aik", "eik", "jai", "jei", 335 | "aim", "eim", "aid", "eid", 336 | "ái", "éi", "ik", "id", "ai", "ei", 337 | "im", 338 | "i", 339 | }); suffix != "" { 340 | n := len(runesOf(suffix)) 341 | switch suffix { 342 | case "áim", "áid", "ái", "áink", "áitok", "áik": 343 | w.RemoveLastNRunes(n - 1) 344 | w.RS[len(w.RS)-1] = 'a' 345 | case "éim", "éid", "éi", "éink", "éitek", "éik": 346 | w.RemoveLastNRunes(n - 1) 347 | w.RS[len(w.RS)-1] = 'e' 348 | default: 349 | w.RemoveLastNRunes(n) 350 | } 351 | } 352 | } 353 | 354 | // step9: Remove plural suffixes 355 | // 356 | // Search for the longest among the following suffixes and perform the action indicated. 357 | // 358 | // ák 359 | // 360 | // replace with a if in R1 361 | // replace with e if in R1 362 | // 363 | // ök ok ek ak k 364 | // 365 | // delete if in R1 366 | func step9(w *snowballword.SnowballWord) { 367 | if suffix := firstSuffixInR1(w, []string{ 368 | "ák", "ék", 369 | "ök", "ok", "ek", "ak", "k", 370 | }); suffix != "" { 371 | switch suffix { 372 | case "ák": 373 | w.RemoveLastNRunes(1) 374 | w.RS[len(w.RS)-1] = 'a' 375 | case "ék": 376 | w.RemoveLastNRunes(1) 377 | w.RS[len(w.RS)-1] = 'e' 378 | default: 379 | w.RemoveLastNRunes(len(runesOf(suffix))) 380 | } 381 | } 382 | } 383 | 384 | func firstSuffixInR1(w *snowballword.SnowballWord, suffixes []string) string { 385 | for _, suffix := range suffixes { 386 | rs := runesOf(suffix) 387 | if len(w.RS)-w.R1start >= len(rs) && w.HasSuffixRunes(rs) { 388 | return suffix 389 | } 390 | } 391 | return "" 392 | } 393 | -------------------------------------------------------------------------------- /english/english_test.go: -------------------------------------------------------------------------------- 1 | /* 2 | Herein lie all the tests of the Snowball English stemmer. 3 | 4 | Many of the tests are drawn from cases where this implementation 5 | did not match the results of the Python NLTK implementation. 6 | */ 7 | package english 8 | 9 | import ( 10 | "testing" 11 | 12 | "github.com/kljensen/snowball/romance" 13 | "github.com/kljensen/snowball/snowballword" 14 | ) 15 | 16 | // Test stopWords for things we know should be true 17 | // or false. 
18 | func Test_stopWords(t *testing.T) { 19 | 20 | // Test true 21 | knownTrueStopwords := [...]string{ 22 | "a", 23 | "for", 24 | "be", 25 | "was", 26 | } 27 | for _, word := range knownTrueStopwords { 28 | if IsStopWord(word) == false { 29 | t.Errorf("Expected %v, to be in stopWords", word) 30 | } 31 | } 32 | 33 | // Test false 34 | knownFalseStopwords := [...]string{ 35 | "truck", 36 | "deoxyribonucleic", 37 | "farse", 38 | "bullschnizzle", 39 | } 40 | for _, word := range knownFalseStopwords { 41 | if IsStopWord(word) == true { 42 | t.Errorf("Expected %v, to be in stopWords", word) 43 | } 44 | } 45 | } 46 | 47 | // Test specialWords for things we know should be present 48 | // and not present. 49 | func Test_specialWords(t *testing.T) { 50 | 51 | // Test true 52 | knownTrueSpecialwords := [...]string{ 53 | "exceeding", 54 | "early", 55 | "outing", 56 | } 57 | for _, word := range knownTrueSpecialwords { 58 | if stemmed := stemSpecialWord(word); stemmed == "" { 59 | t.Errorf("Expected %v, to be in specialWords", word) 60 | } 61 | } 62 | 63 | // Test false 64 | knownFalseSpecialwords := [...]string{ 65 | "truck", 66 | "deoxyribonucleic", 67 | "farse", 68 | "bullschnizzle", 69 | } 70 | for _, word := range knownFalseSpecialwords { 71 | if stemmed := stemSpecialWord(word); stemmed != "" { 72 | t.Errorf("Expected %v, to NOT be in specialWords", word) 73 | } 74 | } 75 | } 76 | 77 | func Test_normalizeApostrophes(t *testing.T) { 78 | variants := [...]string{ 79 | "\u2019xxx\u2019", 80 | "\u2018xxx\u2018", 81 | "\u201Bxxx\u201B", 82 | "’xxx’", 83 | "‘xxx‘", 84 | "‛xxx‛", 85 | } 86 | for _, v := range variants { 87 | w := snowballword.New(v) 88 | normalizeApostrophes(w) 89 | if w.String() != "'xxx'" { 90 | t.Errorf("Expected \"'xxx'\", not \"%v\"", w.String()) 91 | } 92 | } 93 | } 94 | 95 | func Test_capitalizeYs(t *testing.T) { 96 | var wordTests = []struct { 97 | in string 98 | out string 99 | }{ 100 | {"ysdcsdeysdfsysdfsdiyoyyyxyxayxey", "YsdcsdeYsdfsysdfsdiYoYyYxyxaYxeY"}, 101 | } 102 | for _, wt := range wordTests { 103 | w := snowballword.New(wt.in) 104 | capitalizeYs(w) 105 | if w.String() != wt.out { 106 | t.Errorf("Expected \"%v\", not \"%v\"", wt.out, w.String()) 107 | } 108 | } 109 | } 110 | func Test_preprocess(t *testing.T) { 111 | var wordTests = []struct { 112 | in string 113 | out string 114 | }{ 115 | {"arguing", "arguing"}, 116 | {"'catty", "catty"}, 117 | {"kyle’s", "kyle's"}, 118 | {"toy", "toY"}, 119 | } 120 | for _, wt := range wordTests { 121 | w := snowballword.New(wt.in) 122 | preprocess(w) 123 | if w.String() != wt.out { 124 | t.Errorf("Expected \"%v\", not \"%v\"", wt.out, w.String()) 125 | } 126 | } 127 | } 128 | 129 | func Test_vnvSuffix(t *testing.T) { 130 | var wordTests = []struct { 131 | word string 132 | start int 133 | pos int 134 | }{ 135 | {"crepuscular", 0, 4}, 136 | {"uscular", 0, 2}, 137 | } 138 | for _, tc := range wordTests { 139 | w := snowballword.New(tc.word) 140 | pos := romance.VnvSuffix(w, isLowerVowel, tc.start) 141 | if pos != tc.pos { 142 | t.Errorf("Expected %v, but got %v", tc.pos, pos) 143 | } 144 | } 145 | } 146 | 147 | func Test_r1r2(t *testing.T) { 148 | var wordTests = []struct { 149 | word string 150 | r1 string 151 | r2 string 152 | }{ 153 | {"crepuscular", "uscular", "cular"}, 154 | {"beautiful", "iful", "ul"}, 155 | {"beauty", "y", ""}, 156 | {"eucharist", "harist", "ist"}, 157 | {"animadversion", "imadversion", "adversion"}, 158 | {"mistresses", "tresses", "ses"}, 159 | {"sprinkled", "kled", ""}, 160 | // Special cases below 161 | 
{"communism", "ism", "m"}, 162 | {"arsenal", "al", ""}, 163 | {"generalities", "alities", "ities"}, 164 | {"embed", "bed", ""}, 165 | } 166 | for _, testCase := range wordTests { 167 | w := snowballword.New(testCase.word) 168 | r1start, r2start := r1r2(w) 169 | w.R1start = r1start 170 | w.R2start = r2start 171 | if w.R1String() != testCase.r1 || w.R2String() != testCase.r2 { 172 | t.Errorf("Expected \"{%v, %v}\", but got \"{%v, %v}\"", testCase.r1, testCase.r2, w.R1String(), w.R2String()) 173 | } 174 | } 175 | } 176 | 177 | func Test_isShortWord(t *testing.T) { 178 | var testCases = []struct { 179 | word string 180 | isShort bool 181 | }{ 182 | {"bed", true}, 183 | {"shed", true}, 184 | {"shred", true}, 185 | {"bead", false}, 186 | {"embed", false}, 187 | {"beds", false}, 188 | } 189 | for _, testCase := range testCases { 190 | w := snowballword.New(testCase.word) 191 | r1start, r2start := r1r2(w) 192 | w.R1start = r1start 193 | w.R2start = r2start 194 | isShort := isShortWord(w) 195 | if isShort != testCase.isShort { 196 | t.Errorf("Expected %v, but got %v for \"{%v, %v}\"", testCase.isShort, isShort, testCase.word, w.R1String()) 197 | } 198 | } 199 | } 200 | 201 | func Test_endsShortSyllable(t *testing.T) { 202 | var testCases = []struct { 203 | word string 204 | pos int 205 | result bool 206 | }{ 207 | {"absolute", 7, true}, 208 | {"ape", 2, true}, 209 | {"rap", 3, true}, 210 | {"trap", 4, true}, 211 | {"entrap", 6, true}, 212 | {"uproot", 6, false}, 213 | {"bestow", 6, false}, 214 | {"disturb", 7, false}, 215 | } 216 | for _, testCase := range testCases { 217 | w := snowballword.New(testCase.word) 218 | result := endsShortSyllable(w, testCase.pos) 219 | if result != testCase.result { 220 | t.Errorf("Expected endsShortSyllable(%v, %v) to return %v, not %v", testCase.word, testCase.pos, testCase.result, result) 221 | } 222 | } 223 | 224 | } 225 | 226 | type stepFunc func(*snowballword.SnowballWord) bool 227 | type stepTest struct { 228 | wordIn string 229 | r1start int 230 | r2start int 231 | wordOut string 232 | r1out string 233 | r2out string 234 | } 235 | 236 | func runStepTest(t *testing.T, f stepFunc, tcs []stepTest) { 237 | for _, testCase := range tcs { 238 | w := snowballword.New(testCase.wordIn) 239 | w.R1start = testCase.r1start 240 | w.R2start = testCase.r2start 241 | _ = f(w) 242 | if w.String() != testCase.wordOut || w.R1String() != testCase.r1out || w.R2String() != testCase.r2out { 243 | t.Errorf("Expected \"{%v, %v, %v}\", but got \"{%v, %v, %v}\"", testCase.wordOut, testCase.r1out, testCase.r2out, w.String(), w.R1String(), w.R2String()) 244 | } 245 | } 246 | } 247 | 248 | func Test_step0(t *testing.T) { 249 | var testCases = []stepTest{ 250 | {"general's", 5, 9, "general", "al", ""}, 251 | {"general's'", 5, 10, "general", "al", ""}, 252 | {"spices'", 4, 7, "spices", "es", ""}, 253 | } 254 | runStepTest(t, step0, testCases) 255 | } 256 | 257 | func Test_step1a(t *testing.T) { 258 | var testCases = []stepTest{ 259 | {"ties", 0, 0, "tie", "tie", "tie"}, 260 | {"cries", 0, 0, "cri", "cri", "cri"}, 261 | {"mistresses", 3, 7, "mistress", "tress", "s"}, 262 | {"ied", 3, 3, "ie", "", ""}, 263 | } 264 | runStepTest(t, step1a, testCases) 265 | } 266 | 267 | func Test_step1b(t *testing.T) { 268 | 269 | // I could find immediately conjure up true words to 270 | // which these cases apply; so, I made some up. 
271 | 272 | var testCases = []stepTest{ 273 | {"exxeedly", 1, 8, "exxee", "xxee", ""}, 274 | {"exxeed", 1, 7, "exxee", "xxee", ""}, 275 | {"luxuriated", 3, 5, "luxuriate", "uriate", "iate"}, 276 | {"luxuribled", 3, 5, "luxurible", "urible", "ible"}, 277 | {"luxuriized", 3, 5, "luxuriize", "uriize", "iize"}, 278 | {"luxuriedly", 3, 5, "luxuri", "uri", "i"}, 279 | {"vetted", 3, 6, "vet", "", ""}, 280 | {"hopping", 3, 7, "hop", "", ""}, 281 | {"breed", 5, 5, "breed", "", ""}, 282 | {"skating", 4, 6, "skate", "e", ""}, 283 | } 284 | runStepTest(t, step1b, testCases) 285 | } 286 | 287 | func Test_step1c(t *testing.T) { 288 | var testCases = []stepTest{ 289 | {"cry", 3, 3, "cri", "", ""}, 290 | {"say", 3, 3, "say", "", ""}, 291 | {"by", 2, 2, "by", "", ""}, 292 | {"xexby", 2, 5, "xexbi", "xbi", ""}, 293 | } 294 | runStepTest(t, step1c, testCases) 295 | } 296 | 297 | func Test_step2(t *testing.T) { 298 | // Here I've faked R1 & R2 for simplicity 299 | var testCases = []stepTest{ 300 | {"fluentli", 5, 8, "fluentli", "tli", ""}, 301 | // Test "tional" 302 | {"xxxtional", 3, 5, "xxxtion", "tion", "on"}, 303 | // Test when "tional" doesn't fit in R1 304 | {"xxxtional", 4, 5, "xxxtional", "ional", "onal"}, 305 | // Test "li" 306 | {"xxxcli", 3, 6, "xxxc", "c", ""}, 307 | // Test "li", non-valid li letter preceeding 308 | {"xxxxli", 3, 6, "xxxxli", "xli", ""}, 309 | // Test "ogi" 310 | {"xxlogi", 2, 6, "xxlog", "log", ""}, 311 | // Test "ogi", not preceeded by "l" 312 | {"xxxogi", 2, 6, "xxxogi", "xogi", ""}, 313 | // Test the others, which are simple replacements 314 | {"xxxxenci", 3, 7, "xxxxence", "xence", "e"}, 315 | {"xxxxanci", 3, 7, "xxxxance", "xance", "e"}, 316 | {"xxxxabli", 3, 7, "xxxxable", "xable", "e"}, 317 | {"xxxxentli", 3, 8, "xxxxent", "xent", ""}, 318 | {"xxxxizer", 3, 7, "xxxxize", "xize", ""}, 319 | {"xxxxization", 3, 10, "xxxxize", "xize", ""}, 320 | {"xxxxational", 3, 10, "xxxxate", "xate", ""}, 321 | {"xxxxation", 3, 8, "xxxxate", "xate", ""}, 322 | {"xxxxator", 3, 7, "xxxxate", "xate", ""}, 323 | {"xxxxalism", 3, 8, "xxxxal", "xal", ""}, 324 | {"xxxxaliti", 3, 8, "xxxxal", "xal", ""}, 325 | {"xxxxalli", 3, 7, "xxxxal", "xal", ""}, 326 | {"xxxxfulness", 3, 10, "xxxxful", "xful", ""}, 327 | {"xxxxousli", 3, 8, "xxxxous", "xous", ""}, 328 | {"xxxxousness", 3, 10, "xxxxous", "xous", ""}, 329 | {"xxxxiveness", 3, 10, "xxxxive", "xive", ""}, 330 | {"xxxxiviti", 3, 8, "xxxxive", "xive", ""}, 331 | {"xxxxbiliti", 3, 9, "xxxxble", "xble", ""}, 332 | {"xxxxbli", 3, 6, "xxxxble", "xble", "e"}, 333 | {"xxxxfulli", 3, 8, "xxxxful", "xful", ""}, 334 | {"xxxxlessli", 3, 8, "xxxxless", "xless", ""}, 335 | // Some of the same words, this time not in our fake R1 336 | {"xxxxenci", 8, 8, "xxxxenci", "", ""}, 337 | {"xxxxanci", 8, 8, "xxxxanci", "", ""}, 338 | {"xxxxabli", 8, 8, "xxxxabli", "", ""}, 339 | {"xxxxentli", 9, 9, "xxxxentli", "", ""}, 340 | {"xxxxizer", 8, 8, "xxxxizer", "", ""}, 341 | {"xxxxization", 11, 11, "xxxxization", "", ""}, 342 | {"xxxxational", 11, 11, "xxxxational", "", ""}, 343 | {"xxxxation", 9, 9, "xxxxation", "", ""}, 344 | {"xxxxator", 8, 8, "xxxxator", "", ""}, 345 | } 346 | runStepTest(t, step2, testCases) 347 | } 348 | 349 | func Test_step4(t *testing.T) { 350 | var testCases = []stepTest{ 351 | {"accumulate", 2, 5, "accumul", "cumul", "ul"}, 352 | {"agreement", 2, 6, "agreement", "reement", "ent"}, 353 | } 354 | runStepTest(t, step4, testCases) 355 | } 356 | func Test_step5(t *testing.T) { 357 | var testCases = []stepTest{ 358 | {"skate", 4, 5, "skate", "e", ""}, 
359 | } 360 | runStepTest(t, step5, testCases) 361 | } 362 | 363 | func Test_Stem(t *testing.T) { 364 | var testCases = []struct { 365 | in string 366 | stemStopWords bool 367 | out string 368 | }{ 369 | {"aberration", true, "aberr"}, 370 | {"abruptness", true, "abrupt"}, 371 | {"absolute", true, "absolut"}, 372 | {"abated", true, "abat"}, 373 | {"acclivity", true, "accliv"}, 374 | {"accumulations", true, "accumul"}, 375 | {"agreement", true, "agreement"}, 376 | {"breed", true, "breed"}, 377 | {"ape", true, "ape"}, 378 | {"skating", true, "skate"}, 379 | {"fluently", true, "fluentli"}, 380 | {"ied", true, "ie"}, 381 | {"ies", true, "ie"}, 382 | // Stop words 383 | {"because", true, "becaus"}, 384 | {"because", false, "because"}, 385 | {"above", true, "abov"}, 386 | {"above", false, "above"}, 387 | } 388 | for _, tc := range testCases { 389 | stemmed := Stem(tc.in, tc.stemStopWords) 390 | if stemmed != tc.out { 391 | t.Errorf("Expected %v to stem to %v, but got %v", tc.in, tc.out, stemmed) 392 | } 393 | } 394 | 395 | } 396 | -------------------------------------------------------------------------------- /spanish/spanish_test.go: -------------------------------------------------------------------------------- 1 | package spanish 2 | 3 | import ( 4 | "github.com/kljensen/snowball/romance" 5 | "testing" 6 | ) 7 | 8 | // Test stopWords for things we know should be true 9 | // or false. 10 | // 11 | func Test_stopWords(t *testing.T) { 12 | testCases := []romance.WordBoolTestCase{ 13 | {"el", true}, 14 | {"queso", false}, 15 | } 16 | romance.RunWordBoolTest(t, IsStopWord, testCases) 17 | } 18 | 19 | // Test isLowerVowel for things we know should be true 20 | // or false. 21 | // 22 | func Test_isLowerVowel(t *testing.T) { 23 | testCases := []romance.WordBoolTestCase{ 24 | // These are all vowels. 25 | {"aeiouáéíóúü", true}, 26 | // None of these are vowels. 27 | {"cbfqhkl", false}, 28 | } 29 | romance.RunRunewiseBoolTest(t, isLowerVowel, testCases) 30 | } 31 | 32 | // Test isLowerVowel for things we know should be true 33 | // or false. 
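The findRegions test below uses tuples of the form {word, R1start, R2start, RVstart}, the rune offsets at which the R1, R2, and RV regions begin. A rough illustration of the first row (the wrapper name is made up; findRegions is unexported, so this would sit inside package spanish with "fmt" imported):

func sketchFindRegions() {
	w := snowballword.New("macho")
	r1, r2, rv := findRegions(w)
	fmt.Println(r1, r2, rv) // per the first row below: 3 5 3, i.e. R1 = "ho", R2 = "", RV = "ho"
}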
34 | // 35 | func Test_findRegions(t *testing.T) { 36 | testCases := []romance.FindRegionsTestCase{ 37 | {"macho", 3, 5, 3}, 38 | {"olivia", 2, 4, 3}, 39 | {"trabajo", 4, 6, 3}, 40 | {"áureo", 3, 5, 3}, 41 | {"piñaolayas", 3, 6, 3}, 42 | {"terminales", 3, 6, 3}, 43 | {"durmió", 3, 6, 3}, 44 | {"cobija", 3, 5, 3}, 45 | {"anderson", 2, 5, 4}, 46 | {"cervezas", 3, 6, 3}, 47 | {"climáticas", 4, 6, 3}, 48 | {"expide", 2, 5, 4}, 49 | {"cenizas", 3, 5, 3}, 50 | {"maximiliano", 3, 5, 3}, 51 | {"específicos", 2, 5, 4}, 52 | {"menor", 3, 5, 3}, 53 | {"generis", 3, 5, 3}, 54 | {"casero", 3, 5, 3}, 55 | {"pululan", 3, 5, 3}, 56 | {"suscitado", 3, 6, 3}, 57 | {"pesadez", 3, 5, 3}, 58 | {"interno", 2, 5, 4}, 59 | {"agredido", 2, 5, 4}, 60 | {"desprendía", 3, 7, 3}, 61 | {"vistazo", 3, 6, 3}, 62 | {"frecuentan", 4, 7, 3}, 63 | {"noviembre", 3, 6, 3}, 64 | {"sintética", 3, 6, 3}, 65 | {"newagismo", 3, 5, 3}, 66 | {"eliseo", 2, 4, 3}, 67 | {"desbordado", 3, 6, 3}, 68 | {"dispongo", 3, 6, 3}, 69 | {"dilatar", 3, 5, 3}, 70 | {"xochitl", 3, 6, 3}, 71 | {"proporcionaba", 4, 6, 3}, 72 | {"pue", 3, 3, 3}, 73 | {"alpargatado", 2, 5, 4}, 74 | {"exigida", 2, 4, 3}, 75 | {"céntricas", 3, 7, 3}, 76 | {"prende", 4, 6, 3}, 77 | {"estructural", 2, 6, 5}, 78 | {"ilegalmente", 2, 4, 3}, 79 | {"freeport", 5, 7, 3}, 80 | {"sonrisas", 3, 6, 3}, 81 | {"cobró", 3, 5, 3}, 82 | {"dioses", 4, 6, 3}, 83 | {"consistieron", 3, 6, 3}, 84 | {"policiales", 3, 5, 3}, 85 | {"conciliador", 3, 6, 3}, 86 | {"fierro", 4, 6, 3}, 87 | {"aparadores", 2, 4, 3}, 88 | {"coreados", 3, 6, 3}, 89 | {"posición", 3, 5, 3}, 90 | {"adversidades", 2, 5, 4}, 91 | {"comprometido", 3, 7, 3}, 92 | {"aventuras", 2, 4, 3}, 93 | {"santiso", 3, 6, 3}, 94 | {"talentos", 3, 5, 3}, 95 | {"apreciar", 2, 5, 4}, 96 | {"sprints", 5, 7, 4}, 97 | {"zarco", 3, 5, 3}, 98 | {"concretos", 3, 7, 3}, 99 | {"gavica", 3, 5, 3}, 100 | {"suavemente", 4, 6, 3}, 101 | {"españolitos", 2, 5, 4}, 102 | {"grabará", 4, 6, 3}, 103 | {"entregados", 2, 6, 5}, 104 | {"gustaría", 3, 6, 3}, 105 | {"nickin", 3, 6, 3}, 106 | {"sogem", 3, 5, 3}, 107 | {"prohíbe", 4, 6, 3}, 108 | {"espinoso", 2, 5, 4}, 109 | {"atraviesan", 2, 5, 4}, 110 | {"bancomext", 3, 6, 3}, 111 | {"paraguay", 3, 5, 3}, 112 | {"amamos", 2, 4, 3}, 113 | {"consigna", 3, 6, 3}, 114 | {"funcionarios", 3, 7, 3}, 115 | {"marquis", 3, 7, 3}, 116 | {"desactivaron", 3, 5, 3}, 117 | {"concentrados", 3, 6, 3}, 118 | {"democratizante", 3, 5, 3}, 119 | {"afianzadora", 2, 5, 3}, 120 | {"homicidio", 3, 5, 3}, 121 | {"promovidos", 4, 6, 3}, 122 | {"maquiladora", 3, 6, 3}, 123 | {"bike", 3, 4, 3}, 124 | {"recuerdos", 3, 6, 3}, 125 | {"géneros", 3, 5, 3}, 126 | {"rechaza", 3, 6, 3}, 127 | {"sentarían", 3, 6, 3}, 128 | {"quererlo", 4, 6, 3}, 129 | {"sofisticado", 3, 5, 3}, 130 | {"miriam", 3, 6, 3}, 131 | {"echara", 2, 5, 4}, 132 | {"mico", 3, 4, 3}, 133 | {"enferma", 2, 5, 4}, 134 | {"reforzamiento", 3, 5, 3}, 135 | {"circunscrito", 3, 6, 3}, 136 | {"indiana", 2, 6, 4}, 137 | {"metrópoli", 3, 6, 3}, 138 | {"libreta", 3, 6, 3}, 139 | {"gonzalez", 3, 6, 3}, 140 | {"antidemocrática", 2, 5, 4}, 141 | } 142 | romance.RunFindRegionsTest(t, findRegions, testCases) 143 | } 144 | 145 | // Test step0, the removal of pronoun suffixes. 
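The step tests below use romance.StepTestCase rows, which appear to read {wordIn, R1start, R2start, RVstart, changed, wordOut, R1startOut, R2startOut, RVstartOut}. A sketch of the first confirmed step0 row, which strips the attached pronoun "-lo" from the "-ar" infinitive (illustrative wrapper name; step0 is unexported, so this belongs inside package spanish with "fmt" imported):

func sketchStep0() {
	w := snowballword.New("liberarlo")
	w.R1start, w.R2start, w.RVstart = 3, 5, 3
	changed := step0(w)
	fmt.Println(changed, w.String()) // per the table below: true "liberar"
}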
146 | // 147 | func Test_step0(t *testing.T) { 148 | testCases := []romance.StepTestCase{ 149 | {"liberarlo", 3, 5, 3, true, "liberar", 3, 5, 3}, 150 | {"ejecutarse", 2, 4, 3, true, "ejecutar", 2, 4, 3}, 151 | {"convirtiéndolas", 3, 6, 3, true, "convirtiendo", 3, 6, 3}, 152 | {"perfeccionarlo", 3, 6, 3, true, "perfeccionar", 3, 6, 3}, 153 | {"formarlo", 3, 6, 3, true, "formar", 3, 6, 3}, 154 | {"negociarlo", 3, 5, 3, true, "negociar", 3, 5, 3}, 155 | {"dirigirla", 3, 5, 3, true, "dirigir", 3, 5, 3}, 156 | {"malograrlas", 3, 5, 3, true, "malograr", 3, 5, 3}, 157 | {"atacarlos", 2, 4, 3, true, "atacar", 2, 4, 3}, 158 | {"originarla", 2, 4, 3, true, "originar", 2, 4, 3}, 159 | {"ponerlos", 3, 5, 3, true, "poner", 3, 5, 3}, 160 | {"ubicándolo", 2, 4, 3, true, "ubicando", 2, 4, 3}, 161 | {"dejarme", 3, 5, 3, true, "dejar", 3, 5, 3}, 162 | {"regalarnos", 3, 5, 3, true, "regalar", 3, 5, 3}, 163 | {"resolverlas", 3, 5, 3, true, "resolver", 3, 5, 3}, 164 | {"esperarse", 2, 5, 4, true, "esperar", 2, 5, 4}, 165 | {"cuidarlo", 4, 6, 3, true, "cuidar", 4, 6, 3}, 166 | {"empezarlos", 2, 5, 4, true, "empezar", 2, 5, 4}, 167 | {"gastarla", 3, 6, 3, true, "gastar", 3, 6, 3}, 168 | {"levantarme", 3, 5, 3, true, "levantar", 3, 5, 3}, 169 | {"ausentarse", 3, 5, 3, true, "ausentar", 3, 5, 3}, 170 | {"colocándose", 3, 5, 3, true, "colocando", 3, 5, 3}, 171 | {"suponerse", 3, 5, 3, true, "suponer", 3, 5, 3}, 172 | {"someterlos", 3, 5, 3, true, "someter", 3, 5, 3}, 173 | {"criticarlos", 4, 6, 3, true, "criticar", 4, 6, 3}, 174 | {"consolidarlo", 3, 6, 3, true, "consolidar", 3, 6, 3}, 175 | {"globalizarse", 4, 6, 3, true, "globalizar", 4, 6, 3}, 176 | {"corregirla", 3, 6, 3, true, "corregir", 3, 6, 3}, 177 | {"aplicarle", 2, 5, 4, true, "aplicar", 2, 5, 4}, 178 | {"casarse", 3, 5, 3, true, "casar", 3, 5, 3}, 179 | {"costándole", 3, 6, 3, true, "costando", 3, 6, 3}, 180 | {"rescindirlo", 3, 6, 3, true, "rescindir", 3, 6, 3}, 181 | {"quitándole", 4, 6, 3, true, "quitando", 4, 6, 3}, 182 | {"conservarse", 3, 6, 3, true, "conservar", 3, 6, 3}, 183 | {"venderlo", 3, 6, 3, true, "vender", 3, 6, 3}, 184 | {"garantizarse", 3, 5, 3, true, "garantizar", 3, 5, 3}, 185 | {"disfrutarse", 3, 7, 3, true, "disfrutar", 3, 7, 3}, 186 | {"comunicarse", 3, 5, 3, true, "comunicar", 3, 5, 3}, 187 | {"propiciarse", 4, 6, 3, true, "propiciar", 4, 6, 3}, 188 | {"otorgarnos", 2, 4, 3, true, "otorgar", 2, 4, 3}, 189 | {"contorsionarse", 3, 6, 3, true, "contorsionar", 3, 6, 3}, 190 | {"motivarlas", 3, 5, 3, true, "motivar", 3, 5, 3}, 191 | {"congelarse", 3, 6, 3, true, "congelar", 3, 6, 3}, 192 | {"generandoles", 3, 5, 3, true, "generando", 3, 5, 3}, 193 | {"evitarlo", 2, 4, 3, true, "evitar", 2, 4, 3}, 194 | {"atenderlos", 2, 4, 3, true, "atender", 2, 4, 3}, 195 | {"apoyándola", 2, 4, 3, true, "apoyando", 2, 4, 3}, 196 | {"pasarse", 3, 5, 3, true, "pasar", 3, 5, 3}, 197 | {"escucharlos", 2, 5, 4, true, "escuchar", 2, 5, 4}, 198 | {"intervenirse", 2, 5, 4, true, "intervenir", 2, 5, 4}, 199 | {"contratarle", 3, 7, 3, true, "contratar", 3, 7, 3}, 200 | {"retirándose", 3, 5, 3, true, "retirando", 3, 5, 3}, 201 | {"quitarles", 4, 6, 3, true, "quitar", 4, 6, 3}, 202 | {"reforzarlas", 3, 5, 3, true, "reforzar", 3, 5, 3}, 203 | {"obtenerla", 2, 5, 4, true, "obtener", 2, 5, 4}, 204 | {"considerarlo", 3, 6, 3, true, "considerar", 3, 6, 3}, 205 | {"regresarse", 3, 6, 3, true, "regresar", 3, 6, 3}, 206 | {"ponerse", 3, 5, 3, true, "poner", 3, 5, 3}, 207 | {"llevándose", 4, 6, 3, true, "llevando", 4, 6, 3}, 208 | {"ocuparse", 2, 4, 3, true, "ocupar", 2, 
4, 3}, 209 | {"aprovecharse", 2, 5, 4, true, "aprovechar", 2, 5, 4}, 210 | {"corregirlo", 3, 6, 3, true, "corregir", 3, 6, 3}, 211 | {"probarle", 4, 6, 3, true, "probar", 4, 6, 3}, 212 | {"comernos", 3, 5, 3, true, "comer", 3, 5, 3}, 213 | {"iniciarme", 2, 4, 3, true, "iniciar", 2, 4, 3}, 214 | {"concentrarse", 3, 6, 3, true, "concentrar", 3, 6, 3}, 215 | {"llevarse", 4, 6, 3, true, "llevar", 4, 6, 3}, 216 | {"difundirlo", 3, 5, 3, true, "difundir", 3, 5, 3}, 217 | {"basándose", 3, 5, 3, true, "basando", 3, 5, 3}, 218 | {"destinarlos", 3, 6, 3, true, "destinar", 3, 6, 3}, 219 | {"reubicarse", 4, 6, 3, true, "reubicar", 4, 6, 3}, 220 | {"manteniéndose", 3, 6, 3, true, "manteniendo", 3, 6, 3}, 221 | {"colocarla", 3, 5, 3, true, "colocar", 3, 5, 3}, 222 | {"pasarles", 3, 5, 3, true, "pasar", 3, 5, 3}, 223 | {"depositarse", 3, 5, 3, true, "depositar", 3, 5, 3}, 224 | {"tragarse", 4, 6, 3, true, "tragar", 4, 6, 3}, 225 | {"eliminarla", 2, 4, 3, true, "eliminar", 2, 4, 3}, 226 | {"eliminarse", 2, 4, 3, true, "eliminar", 2, 4, 3}, 227 | {"apegarnos", 2, 4, 3, true, "apegar", 2, 4, 3}, 228 | {"asociarse", 2, 4, 3, true, "asociar", 2, 4, 3}, 229 | {"cambiarlos", 3, 7, 3, true, "cambiar", 3, 7, 3}, 230 | {"envolviéndose", 2, 5, 4, true, "envolviendo", 2, 5, 4}, 231 | {"lograrse", 3, 6, 3, true, "lograr", 3, 6, 3}, 232 | {"mostrarse", 3, 7, 3, true, "mostrar", 3, 7, 3}, 233 | {"pasarle", 3, 5, 3, true, "pasar", 3, 5, 3}, 234 | {"enfrentándose", 2, 6, 5, true, "enfrentando", 2, 6, 5}, 235 | {"permitirse", 3, 6, 3, true, "permitir", 3, 6, 3}, 236 | {"sanearlas", 3, 6, 3, true, "sanear", 3, 6, 3}, 237 | {"refugiarse", 3, 5, 3, true, "refugiar", 3, 5, 3}, 238 | {"relacionarse", 3, 5, 3, true, "relacionar", 3, 5, 3}, 239 | {"sacarlo", 3, 5, 3, true, "sacar", 3, 5, 3}, 240 | {"organizarse", 2, 5, 4, true, "organizar", 2, 5, 4}, 241 | {"familiarizarse", 3, 5, 3, true, "familiarizar", 3, 5, 3}, 242 | {"decidirse", 3, 5, 3, true, "decidir", 3, 5, 3}, 243 | {"tomarle", 3, 5, 3, true, "tomar", 3, 5, 3}, 244 | {"volverlas", 3, 6, 3, true, "volver", 3, 6, 3}, 245 | {"efectuarse", 2, 4, 3, true, "efectuar", 2, 4, 3}, 246 | {"elegirse", 2, 4, 3, true, "elegir", 2, 4, 3}, 247 | {"establecerse", 2, 5, 4, true, "establecer", 2, 5, 4}, 248 | {"ponerles", 3, 5, 3, true, "poner", 3, 5, 3}, 249 | } 250 | romance.RunStepTest(t, step0, testCases) 251 | } 252 | 253 | // Test step1, the removal of standard suffixes. 
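Step 1 strips the standard derivational suffixes ("-aciones", "-amente", "-idad", "-ismo", and so on) and rewrites a few, for example "-encia" becomes "-ente", as the "inocencia" and "adherencia" rows below show. A minimal sketch of one confirmed row (illustrative wrapper name; step1 is unexported, so this belongs inside package spanish with "fmt" imported):

func sketchStep1() {
	w := snowballword.New("inocencia")
	w.R1start, w.R2start, w.RVstart = 2, 4, 3
	step1(w)
	fmt.Println(w.String()) // per the table below: "inocente"
}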
254 | // 255 | func Test_step1(t *testing.T) { 256 | testCases := []romance.StepTestCase{ 257 | {"retrospectiva", 3, 6, 3, true, "retrospect", 3, 6, 3}, 258 | {"emperador", 2, 5, 4, true, "emper", 2, 5, 4}, 259 | {"instalaciones", 2, 6, 5, true, "instal", 2, 6, 5}, 260 | {"finiquitación", 3, 5, 3, true, "finiquit", 3, 5, 3}, 261 | {"definitivamente", 3, 5, 3, true, "definit", 3, 5, 3}, 262 | {"turísticas", 3, 5, 3, true, "turíst", 3, 5, 3}, 263 | {"puntualizaciones", 3, 7, 3, true, "puntualiz", 3, 7, 3}, 264 | {"fehacientemente", 3, 5, 3, true, "fehaciente", 3, 5, 3}, 265 | {"determinaciones", 3, 5, 3, true, "determin", 3, 5, 3}, 266 | {"irrelevante", 2, 5, 4, true, "irrelev", 2, 5, 4}, 267 | {"autoritarismo", 3, 5, 3, true, "autoritar", 3, 5, 3}, 268 | {"paralizante", 3, 5, 3, true, "paraliz", 3, 5, 3}, 269 | {"pediátrica", 3, 6, 3, true, "pediátr", 3, 6, 3}, 270 | {"británicos", 4, 6, 3, true, "britán", 4, 6, 3}, 271 | {"ayuntamientos", 2, 4, 3, true, "ayunt", 2, 4, 3}, 272 | {"sobrecalentamiento", 3, 6, 3, true, "sobrecalent", 3, 6, 3}, 273 | {"inocencia", 2, 4, 3, true, "inocente", 2, 4, 3}, 274 | {"amabilidad", 2, 4, 3, true, "amabil", 2, 4, 3}, 275 | {"personalidad", 3, 6, 3, true, "personal", 3, 6, 3}, 276 | {"vacunación", 3, 5, 3, true, "vacun", 3, 5, 3}, 277 | {"digestivos", 3, 5, 3, true, "digest", 3, 5, 3}, 278 | {"mecánica", 3, 5, 3, true, "mecán", 3, 5, 3}, 279 | {"sistemáticas", 3, 6, 3, true, "sistemát", 3, 6, 3}, 280 | {"programático", 4, 7, 3, true, "programát", 4, 7, 3}, 281 | {"incitación", 2, 5, 4, true, "incit", 2, 5, 4}, 282 | {"inicialmente", 2, 4, 3, true, "inicial", 2, 4, 3}, 283 | {"derivación", 3, 5, 3, true, "deriv", 3, 5, 3}, 284 | {"fraccionamientos", 4, 8, 3, true, "fraccion", 4, 8, 3}, 285 | {"frecuentemente", 4, 7, 3, true, "frecuente", 4, 7, 3}, 286 | {"econometristas", 2, 4, 3, true, "econometr", 2, 4, 3}, 287 | {"mentirosas", 3, 6, 3, true, "mentir", 3, 6, 3}, 288 | {"eficientemente", 2, 4, 3, true, "eficiente", 2, 4, 3}, 289 | {"utilidades", 2, 4, 3, true, "util", 2, 4, 3}, 290 | {"rehabilitación", 3, 5, 3, true, "rehabilit", 3, 5, 3}, 291 | {"adquisitivo", 2, 6, 4, true, "adquisit", 2, 6, 4}, 292 | {"consignación", 3, 6, 3, true, "consign", 3, 6, 3}, 293 | {"concursante", 3, 6, 3, true, "concurs", 3, 6, 3}, 294 | {"criminalidad", 4, 6, 3, true, "criminal", 4, 6, 3}, 295 | {"invitación", 2, 5, 4, true, "invit", 2, 5, 4}, 296 | {"adherencia", 2, 5, 4, true, "adherente", 2, 5, 4}, 297 | {"animalización", 2, 4, 3, true, "animaliz", 2, 4, 3}, 298 | {"enteramente", 2, 5, 4, true, "enter", 2, 5, 4}, 299 | {"deportivos", 3, 5, 3, true, "deport", 3, 5, 3}, 300 | {"controladora", 3, 7, 3, true, "control", 3, 7, 3}, 301 | {"edifico", 2, 4, 3, true, "edif", 2, 4, 3}, 302 | {"planificación", 4, 6, 3, true, "planif", 4, 6, 3}, 303 | {"expectación", 2, 5, 4, true, "expect", 2, 5, 4}, 304 | {"cosméticos", 3, 6, 3, true, "cosmét", 3, 6, 3}, 305 | {"cómodamente", 3, 5, 3, true, "cómod", 3, 5, 3}, 306 | {"intempestivamente", 2, 5, 4, true, "intempest", 2, 5, 4}, 307 | {"cetemistas", 3, 5, 3, true, "cetem", 3, 5, 3}, 308 | {"presumiblemente", 4, 6, 3, true, "presum", 4, 6, 3}, 309 | {"reivindicación", 4, 6, 3, true, "reivind", 4, 6, 3}, 310 | {"ventajosa", 3, 6, 3, true, "ventaj", 3, 6, 3}, 311 | {"atmosférico", 2, 5, 4, true, "atmosfér", 2, 5, 4}, 312 | {"subprocurador", 3, 7, 3, true, "subprocur", 3, 7, 3}, 313 | {"estadísticas", 2, 5, 4, true, "estadíst", 2, 5, 4}, 314 | {"respetuoso", 3, 6, 3, true, "respetu", 3, 6, 3}, 315 | {"procedimiento", 4, 6, 3, true, "proced", 
4, 6, 3}, 316 | {"seguramente", 3, 5, 3, true, "segur", 3, 5, 3}, 317 | {"autocalifica", 3, 5, 3, true, "autocalif", 3, 5, 3}, 318 | {"esporádica", 2, 5, 4, true, "esporád", 2, 5, 4}, 319 | {"caudalosos", 4, 6, 3, true, "caudal", 4, 6, 3}, 320 | {"imperdonable", 2, 5, 4, true, "imperdon", 2, 5, 4}, 321 | {"magníficas", 3, 6, 3, true, "magníf", 3, 6, 3}, 322 | {"erróneamente", 2, 5, 4, true, "erróne", 2, 5, 4}, 323 | {"conmemorativa", 3, 6, 3, true, "conmemor", 3, 6, 3}, 324 | {"simulación", 3, 5, 3, true, "simul", 3, 5, 3}, 325 | {"arrendadora", 2, 5, 4, true, "arrend", 2, 5, 4}, 326 | {"moralización", 3, 5, 3, true, "moraliz", 3, 5, 3}, 327 | {"accesibles", 2, 5, 4, true, "acces", 2, 5, 4}, 328 | {"infidelidades", 2, 5, 4, true, "infidel", 2, 5, 4}, 329 | {"abdicación", 2, 5, 4, true, "abdic", 2, 5, 4}, 330 | {"airecombustible", 3, 5, 3, true, "airecombust", 3, 5, 3}, 331 | {"escuetamente", 2, 6, 4, true, "escuet", 2, 6, 4}, 332 | {"exóticos", 2, 4, 3, true, "exót", 2, 4, 3}, 333 | {"volcánica", 3, 6, 3, true, "volcán", 3, 6, 3}, 334 | {"oceánico", 2, 5, 3, true, "oceán", 2, 5, 3}, 335 | {"simulador", 3, 5, 3, true, "simul", 3, 5, 3}, 336 | {"compañerismo", 3, 6, 3, true, "compañer", 3, 6, 3}, 337 | {"fagotistas", 3, 5, 3, true, "fagot", 3, 5, 3}, 338 | {"cardenistas", 3, 6, 3, true, "carden", 3, 6, 3}, 339 | {"periférico", 3, 5, 3, true, "perifér", 3, 5, 3}, 340 | {"petroquímica", 3, 6, 3, true, "petroquím", 3, 6, 3}, 341 | {"columnista", 3, 5, 3, true, "column", 3, 5, 3}, 342 | {"comportamientos", 3, 6, 3, true, "comport", 3, 6, 3}, 343 | {"fanáticos", 3, 5, 3, true, "fanát", 3, 5, 3}, 344 | {"significativo", 3, 6, 3, true, "signific", 3, 6, 3}, 345 | {"turísticos", 3, 5, 3, true, "turíst", 3, 5, 3}, 346 | {"divergencias", 3, 5, 3, true, "divergente", 3, 5, 3}, 347 | {"lamentable", 3, 5, 3, true, "lament", 3, 5, 3}, 348 | {"estratosféricas", 2, 6, 5, true, "estratosfér", 2, 6, 5}, 349 | {"emigrantes", 2, 4, 3, true, "emigr", 2, 4, 3}, 350 | {"ahorrador", 2, 4, 3, true, "ahorr", 2, 4, 3}, 351 | {"transportaciones", 4, 8, 3, true, "transport", 4, 8, 3}, 352 | {"atomizador", 2, 4, 3, true, "atomiz", 2, 4, 3}, 353 | {"dolorosa", 3, 5, 3, true, "dolor", 3, 5, 3}, 354 | {"enervantes", 2, 4, 3, true, "enerv", 2, 4, 3}, 355 | {"gobernador", 3, 5, 3, true, "gobern", 3, 5, 3}, 356 | {"inexplicable", 2, 4, 3, true, "inexplic", 2, 4, 3}, 357 | } 358 | romance.RunStepTest(t, step1, testCases) 359 | } 360 | -------------------------------------------------------------------------------- /french/french_test.go: -------------------------------------------------------------------------------- 1 | package french 2 | 3 | import ( 4 | "github.com/kljensen/snowball/romance" 5 | "github.com/kljensen/snowball/snowballword" 6 | "testing" 7 | ) 8 | 9 | // Test stopWords for things we know should be true 10 | // or false. 11 | // 12 | func Test_stopWords(t *testing.T) { 13 | testCases := []romance.WordBoolTestCase{ 14 | {"eussiez", true}, 15 | {"machine", false}, 16 | } 17 | romance.RunWordBoolTest(t, IsStopWord, testCases) 18 | } 19 | 20 | // Test isLowerVowel for things we know should be true 21 | // or false. 22 | // 23 | func Test_isLowerVowel(t *testing.T) { 24 | testCases := []romance.WordBoolTestCase{ 25 | // These are all vowels. 26 | {"aeiouyâàëéêèïîôûù", true}, 27 | // None of these are vowels. 28 | {"cbfqhkl", false}, 29 | } 30 | romance.RunRunewiseBoolTest(t, isLowerVowel, testCases) 31 | } 32 | 33 | // Test capitalization of vowels acting as non-vowels. 
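Before any suffix removal, the French stemmer upper-cases u, i, and y where they act as consonants next to vowels (and u after q), so later steps can treat them as non-vowels; the rows below confirm that "jouer" becomes "joUer" and "quand" becomes "qUand". A sketch of one case (illustrative wrapper name; capitalizeYUI is unexported, so this belongs inside package french with "fmt" imported):

func sketchCapitalizeYUI() {
	w := snowballword.New("jouer")
	capitalizeYUI(w)
	fmt.Println(w.String()) // per the table below: "joUer"
}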
34 | // 35 | func Test_capitalizeYUI(t *testing.T) { 36 | testCases := []struct { 37 | wordIn string 38 | wordOut string 39 | }{ 40 | {"jouer", "joUer"}, 41 | {"ennuie", "ennuIe"}, 42 | {"yeux", "Yeux"}, 43 | {"quand", "qUand"}, 44 | } 45 | 46 | for _, testCase := range testCases { 47 | w := snowballword.New(testCase.wordIn) 48 | capitalizeYUI(w) 49 | if w.String() != testCase.wordOut { 50 | t.Errorf("Expect %v -> %v, but got %v", testCase.wordIn, testCase.wordOut, w.String()) 51 | } 52 | } 53 | } 54 | func Test_findRegions(t *testing.T) { 55 | testCases := []romance.FindRegionsTestCase{ 56 | {"iriez", 2, 5, 3}, 57 | {"reçoivent", 3, 6, 2}, 58 | {"rébarbatif", 3, 5, 2}, 59 | {"paraîtrons", 3, 6, 3}, 60 | {"prétendus", 4, 6, 3}, 61 | {"boUilli", 3, 5, 2}, 62 | {"destitué", 3, 6, 2}, 63 | {"bataillons", 3, 6, 2}, 64 | {"buffa", 3, 5, 2}, 65 | {"suffisante", 3, 6, 2}, 66 | {"excepté", 2, 5, 4}, 67 | {"audace", 3, 5, 3}, 68 | {"vertueuses", 3, 8, 2}, 69 | {"écrièrent", 2, 6, 4}, 70 | {"provoqUer", 4, 6, 3}, 71 | {"barbotement", 3, 6, 2}, 72 | {"contribua", 3, 7, 2}, 73 | {"ensuit", 2, 6, 4}, 74 | {"confédéré", 3, 6, 2}, 75 | {"affairé", 2, 6, 4}, 76 | {"incompatibles", 2, 5, 4}, 77 | {"talma", 3, 5, 2}, 78 | {"péchais", 3, 7, 2}, 79 | {"abusé", 2, 4, 3}, 80 | {"plaisir", 5, 7, 3}, 81 | {"foretells", 3, 5, 2}, 82 | {"walbah", 3, 6, 2}, 83 | {"confucius", 3, 6, 2}, 84 | {"attelée", 2, 5, 4}, 85 | {"tirailler", 3, 6, 2}, 86 | {"vin", 3, 3, 2}, 87 | {"toucher", 4, 7, 2}, 88 | {"reprendrons", 3, 6, 2}, 89 | {"hé", 2, 2, 2}, 90 | {"intéressant", 2, 5, 4}, 91 | {"malebar", 3, 5, 2}, 92 | {"alimenter", 2, 4, 3}, 93 | {"inventée", 2, 5, 4}, 94 | {"rechargez", 3, 6, 2}, 95 | {"revêtu", 3, 5, 2}, 96 | {"étaYé", 2, 4, 3}, 97 | {"maladresse", 3, 5, 2}, 98 | {"envié", 2, 5, 4}, 99 | {"secoUaIent", 3, 5, 2}, 100 | {"parler", 3, 6, 3}, 101 | {"marécages", 3, 5, 2}, 102 | {"privilèges", 4, 6, 3}, 103 | {"examinez", 2, 4, 3}, 104 | {"contraria", 3, 7, 2}, 105 | {"sotte", 3, 5, 2}, 106 | {"méchantes", 3, 6, 2}, 107 | {"coffres", 3, 7, 2}, 108 | {"tressaillir", 4, 8, 3}, 109 | {"charlatanisme", 4, 7, 3}, 110 | {"appuYais", 2, 5, 4}, 111 | {"interdis", 2, 5, 4}, 112 | {"baissa", 4, 6, 2}, 113 | {"sanglotant", 3, 7, 2}, 114 | {"rencontrerai", 3, 6, 2}, 115 | {"subis", 3, 5, 2}, 116 | {"empestée", 2, 5, 4}, 117 | {"communiqUa", 3, 6, 2}, 118 | {"huit", 4, 4, 2}, 119 | {"heurter", 4, 7, 2}, 120 | {"premiers", 4, 7, 3}, 121 | {"brusqUe", 4, 7, 3}, 122 | {"inanimé", 2, 4, 3}, 123 | {"congédia", 3, 6, 2}, 124 | {"souffrir", 4, 8, 2}, 125 | {"élévations", 2, 4, 3}, 126 | {"sablé", 3, 5, 2}, 127 | {"salure", 3, 5, 2}, 128 | {"résigna", 3, 5, 2}, 129 | {"compatriotes", 3, 6, 2}, 130 | {"écrient", 2, 6, 4}, 131 | {"chanoine", 4, 7, 3}, 132 | {"conçois", 3, 7, 2}, 133 | {"lançaIent", 3, 6, 2}, 134 | {"pékin", 3, 5, 2}, 135 | {"poneYs", 3, 5, 2}, 136 | {"pratiqUer", 4, 6, 3}, 137 | {"bâtonne", 3, 5, 2}, 138 | {"possibilités", 3, 6, 2}, 139 | {"aiguille", 3, 6, 3}, 140 | {"ténor", 3, 5, 2}, 141 | {"déchirés", 3, 6, 2}, 142 | {"anoblit", 2, 4, 3}, 143 | {"tombât", 3, 6, 2}, 144 | {"paralysé", 3, 5, 3}, 145 | {"dot", 3, 3, 2}, 146 | {"aigre", 3, 5, 3}, 147 | {"ramena", 3, 5, 2}, 148 | {"appartiennent", 2, 5, 4}, 149 | {"premières", 4, 7, 3}, 150 | {"tentez", 3, 6, 2}, 151 | {"pari", 3, 4, 3}, 152 | {"coudes", 4, 6, 2}, 153 | {"étonnerait", 2, 4, 3}, 154 | {"embrunir", 2, 6, 5}, 155 | {"mobile", 3, 5, 2}, 156 | } 157 | 158 | romance.RunFindRegionsTest(t, findRegions, testCases) 159 | } 160 | 161 | // Test step1, the removal of 
standard suffixes. 162 | // 163 | func Test_step1(t *testing.T) { 164 | testCases := []romance.StepTestCase{ 165 | {"rapidement", 3, 5, 2, true, "rapid", 3, 5, 2}, 166 | {"paresseuse", 3, 5, 3, true, "paress", 3, 5, 3}, 167 | {"prosaïqUement", 4, 7, 3, true, "prosaïqU", 4, 7, 3}, 168 | {"nonchalance", 3, 7, 2, true, "nonchal", 3, 7, 2}, 169 | {"apostoliqUes", 2, 4, 3, true, "apostol", 2, 4, 3}, 170 | {"assiduités", 2, 5, 4, true, "assidu", 2, 5, 4}, 171 | {"philosophiqUement", 4, 6, 3, true, "philosoph", 4, 6, 3}, 172 | {"despotiqUement", 3, 6, 2, true, "despot", 3, 6, 2}, 173 | {"incontestablement", 2, 5, 4, true, "incontest", 2, 5, 4}, 174 | {"diminution", 3, 5, 2, true, "diminu", 3, 5, 2}, 175 | {"séditieuse", 3, 5, 2, true, "séditi", 3, 5, 2}, 176 | {"anonymement", 2, 4, 3, true, "anonym", 2, 4, 3}, 177 | {"conservation", 3, 6, 2, true, "conserv", 3, 6, 2}, 178 | {"fâcheuses", 3, 7, 2, true, "fâcheux", 3, 7, 2}, 179 | {"houleuse", 4, 7, 2, true, "houleux", 4, 7, 2}, 180 | {"historiqUes", 3, 6, 2, true, "histor", 3, 6, 2}, 181 | {"impérieusement", 2, 5, 4, true, "impéri", 2, 5, 4}, 182 | {"complaisances", 3, 8, 2, true, "complais", 3, 8, 2}, 183 | {"confessionnaux", 3, 6, 2, true, "confessionnal", 3, 6, 2}, 184 | {"grandement", 4, 7, 3, true, "grand", 4, 5, 3}, 185 | {"passablement", 3, 6, 2, true, "passabl", 3, 6, 2}, 186 | {"strictement", 5, 8, 4, true, "strict", 5, 6, 4}, 187 | {"physiqUement", 4, 6, 3, true, "physiqU", 4, 6, 3}, 188 | {"serieusement", 3, 7, 2, true, "serieux", 3, 7, 2}, 189 | {"roulement", 4, 6, 2, true, "roul", 4, 4, 2}, 190 | {"appartement", 2, 5, 4, true, "appart", 2, 5, 4}, 191 | {"reconnaissance", 3, 5, 2, true, "reconnaiss", 3, 5, 2}, 192 | {"aigrement", 3, 6, 3, true, "aigr", 3, 4, 3}, 193 | {"impertinences", 2, 5, 4, true, "impertinent", 2, 5, 4}, 194 | {"parlement", 3, 6, 3, true, "parl", 3, 4, 3}, 195 | {"malicieux", 3, 5, 2, true, "malici", 3, 5, 2}, 196 | {"suffisance", 3, 6, 2, true, "suffis", 3, 6, 2}, 197 | {"prémédité", 4, 6, 3, true, "préméd", 4, 6, 3}, 198 | {"métalliqUes", 3, 5, 2, true, "métall", 3, 5, 2}, 199 | {"météorologiste", 3, 6, 2, true, "météorolog", 3, 6, 2}, 200 | {"prononciation", 4, 6, 3, true, "prononci", 4, 6, 3}, 201 | {"nombreuse", 3, 8, 2, true, "nombreux", 3, 8, 2}, 202 | {"extatiqUe", 2, 5, 4, true, "extat", 2, 5, 4}, 203 | {"magnifiqUement", 3, 6, 2, true, "magnif", 3, 6, 2}, 204 | {"gymnastiqUe", 3, 6, 2, true, "gymnast", 3, 6, 2}, 205 | {"dramatiqUe", 4, 6, 3, true, "dramat", 4, 6, 3}, 206 | {"simplicité", 3, 7, 2, true, "simpliqU", 3, 7, 2}, 207 | {"roYalistes", 3, 5, 2, true, "roYal", 3, 5, 2}, 208 | {"fortifications", 3, 6, 2, true, "fortif", 3, 6, 2}, 209 | {"attendrissement", 2, 5, 4, true, "attendr", 2, 5, 4}, 210 | {"respectueusement", 3, 6, 2, true, "respectu", 3, 6, 2}, 211 | {"patriotisme", 3, 7, 2, true, "patriot", 3, 7, 2}, 212 | {"curieuse", 3, 7, 2, true, "curieux", 3, 7, 2}, 213 | {"fascination", 3, 6, 2, true, "fascin", 3, 6, 2}, 214 | {"effectivement", 2, 5, 4, true, "effect", 2, 5, 4}, 215 | {"condoléance", 3, 6, 2, true, "condolé", 3, 6, 2}, 216 | {"malignité", 3, 5, 2, true, "malign", 3, 5, 2}, 217 | {"capricieuse", 3, 6, 2, true, "caprici", 3, 6, 2}, 218 | {"applaudissements", 2, 7, 5, true, "applaud", 2, 7, 5}, 219 | {"praticable", 4, 6, 3, true, "pratic", 4, 6, 3}, 220 | {"rivaux", 3, 6, 2, true, "rival", 3, 5, 2}, 221 | {"augmentation", 3, 6, 3, true, "augment", 3, 6, 3}, 222 | {"ameublement", 2, 5, 3, true, "ameubl", 2, 5, 3}, 223 | {"honorables", 3, 5, 2, true, "honor", 3, 5, 2}, 224 | 
{"effervescence", 2, 5, 4, true, "effervescent", 2, 5, 4}, 225 | {"excentricité", 2, 5, 4, true, "excentr", 2, 5, 4}, 226 | {"misérable", 3, 5, 2, true, "misér", 3, 5, 2}, 227 | {"capitulation", 3, 5, 2, true, "capitul", 3, 5, 2}, 228 | {"enjoUement", 2, 5, 4, true, "enjoU", 2, 5, 4}, 229 | {"sévérité", 3, 5, 2, true, "sévér", 3, 5, 2}, 230 | {"perplexités", 3, 7, 2, true, "perplex", 3, 7, 2}, 231 | {"consentement", 3, 6, 2, true, "consent", 3, 6, 2}, 232 | {"convocation", 3, 6, 2, true, "convoc", 3, 6, 2}, 233 | {"assurances", 2, 5, 4, true, "assur", 2, 5, 4}, 234 | {"ébloUissement", 2, 5, 4, true, "ébloU", 2, 5, 4}, 235 | {"méridionaux", 3, 5, 2, true, "méridional", 3, 5, 2}, 236 | {"dérangements", 3, 5, 2, true, "dérang", 3, 5, 2}, 237 | {"domination", 3, 5, 2, true, "domin", 3, 5, 2}, 238 | {"incroYable", 2, 6, 5, true, "incroY", 2, 6, 5}, 239 | {"réjoUissances", 3, 5, 2, true, "réjoUiss", 3, 5, 2}, 240 | {"décadence", 3, 5, 2, true, "décadent", 3, 5, 2}, 241 | {"bâillement", 4, 7, 2, true, "bâill", 4, 5, 2}, 242 | {"habillement", 3, 5, 2, true, "habill", 3, 5, 2}, 243 | {"irréparablement", 2, 5, 4, true, "irrépar", 2, 5, 4}, 244 | {"diplomatiqUes", 3, 6, 2, true, "diplomat", 3, 6, 2}, 245 | {"distribution", 3, 7, 2, true, "distribu", 3, 7, 2}, 246 | {"pétulance", 3, 5, 2, true, "pétul", 3, 5, 2}, 247 | {"considérable", 3, 6, 2, true, "considér", 3, 6, 2}, 248 | {"éducation", 2, 4, 3, true, "éduc", 2, 4, 3}, 249 | {"indications", 2, 5, 4, true, "indiqU", 2, 5, 4}, 250 | {"cupidité", 3, 5, 2, true, "cupid", 3, 5, 2}, 251 | {"traîtreusement", 5, 9, 3, true, "traîtreux", 5, 9, 3}, 252 | {"silencieuse", 3, 5, 2, true, "silenci", 3, 5, 2}, 253 | {"pessimisme", 3, 6, 2, true, "pessim", 3, 6, 2}, 254 | {"préoccupation", 5, 8, 3, true, "préoccup", 5, 8, 3}, 255 | // Special cases that should return false despite 256 | // being changed. They "don't count". 257 | {"compliment", 3, 7, 2, false, "compli", 3, 6, 2}, 258 | {"vraiment", 5, 7, 3, false, "vrai", 4, 4, 3}, 259 | {"remercîment", 3, 5, 2, false, "remercî", 3, 5, 2}, 260 | {"puissamment", 4, 7, 2, false, "puissant", 4, 7, 2}, 261 | {"absolument", 2, 5, 4, false, "absolu", 2, 5, 4}, 262 | {"décidément", 3, 5, 2, false, "décidé", 3, 5, 2}, 263 | {"condiments", 3, 6, 2, false, "condi", 3, 5, 2}, 264 | } 265 | romance.RunStepTest(t, step1, testCases) 266 | 267 | } 268 | 269 | // the removal of Verb suffixes beginning 270 | // with "i" in the RV region. 271 | // Test step1, the removal of standard suffixes. 
272 | // 273 | func Test_step2a(t *testing.T) { 274 | testCases := []romance.StepTestCase{ 275 | {"épanoUit", 2, 4, 3, true, "épanoU", 2, 4, 3}, 276 | {"faillirent", 4, 7, 2, true, "faill", 4, 5, 2}, 277 | {"acabit", 2, 4, 3, true, "acab", 2, 4, 3}, 278 | {"établissait", 2, 4, 3, true, "établ", 2, 4, 3}, 279 | {"découvrir", 3, 6, 2, true, "découvr", 3, 6, 2}, 280 | {"réjoUissait", 3, 5, 2, true, "réjoU", 3, 5, 2}, 281 | {"trahiront", 4, 6, 3, true, "trah", 4, 4, 3}, 282 | {"maintenir", 4, 7, 2, true, "mainten", 4, 7, 2}, 283 | {"vendit", 3, 6, 2, true, "vend", 3, 4, 2}, 284 | {"repartit", 3, 5, 2, true, "repart", 3, 5, 2}, 285 | {"giletti", 3, 5, 2, true, "gilett", 3, 5, 2}, 286 | {"rienzi", 4, 6, 2, true, "rienz", 4, 5, 2}, 287 | {"punie", 3, 5, 2, true, "pun", 3, 3, 2}, 288 | {"accueillir", 2, 7, 4, true, "accueill", 2, 7, 4}, 289 | {"rétablit", 3, 5, 2, true, "rétabl", 3, 5, 2}, 290 | {"ravis", 3, 5, 2, true, "rav", 3, 3, 2}, 291 | {"xviIi", 4, 5, 3, true, "xviI", 4, 4, 3}, 292 | {"blottie", 4, 7, 3, true, "blott", 4, 5, 3}, 293 | {"approfondie", 2, 6, 5, true, "approfond", 2, 6, 5}, 294 | {"infirmerie", 2, 5, 4, true, "infirmer", 2, 5, 4}, 295 | {"scotti", 4, 6, 3, true, "scott", 4, 5, 3}, 296 | {"adoucissait", 2, 5, 3, true, "adouc", 2, 5, 3}, 297 | {"finissait", 3, 5, 2, true, "fin", 3, 3, 2}, 298 | {"promit", 4, 6, 3, true, "prom", 4, 4, 3}, 299 | {"franchies", 4, 9, 3, true, "franch", 4, 6, 3}, 300 | {"franchissant", 4, 8, 3, true, "franch", 4, 6, 3}, 301 | {"micheli", 3, 6, 2, true, "michel", 3, 6, 2}, 302 | {"éteignit", 2, 5, 3, true, "éteign", 2, 5, 3}, 303 | {"puni", 3, 4, 2, true, "pun", 3, 3, 2}, 304 | {"apoplexie", 2, 4, 3, true, "apoplex", 2, 4, 3}, 305 | {"désira", 3, 5, 2, true, "dés", 3, 3, 2}, 306 | {"étourdi", 2, 5, 3, true, "étourd", 2, 5, 3}, 307 | {"giovanni", 4, 6, 2, true, "giovann", 4, 6, 2}, 308 | {"apprécie", 2, 6, 5, true, "appréc", 2, 6, 5}, 309 | {"poésies", 4, 7, 2, true, "poés", 4, 4, 2}, 310 | {"pairie", 4, 6, 2, true, "pair", 4, 4, 2}, 311 | {"sortit", 3, 6, 2, true, "sort", 3, 4, 2}, 312 | {"subi", 3, 4, 2, true, "sub", 3, 3, 2}, 313 | {"aigrirait", 3, 6, 3, true, "aigr", 3, 4, 3}, 314 | {"assailli", 2, 6, 4, true, "assaill", 2, 6, 4}, 315 | {"bertolotti", 3, 6, 2, true, "bertolott", 3, 6, 2}, 316 | {"recouvrir", 3, 6, 2, true, "recouvr", 3, 6, 2}, 317 | {"visconti", 3, 6, 2, true, "viscont", 3, 6, 2}, 318 | {"surgir", 3, 6, 2, true, "surg", 3, 4, 2}, 319 | {"remercie", 3, 5, 2, true, "remerc", 3, 5, 2}, 320 | {"joUissaIent", 3, 5, 2, true, "joU", 3, 3, 2}, 321 | {"bondissant", 3, 6, 2, true, "bond", 3, 4, 2}, 322 | {"saisi", 4, 5, 2, true, "sais", 4, 4, 2}, 323 | {"missouri", 3, 7, 2, true, "missour", 3, 7, 2}, 324 | {"remplirent", 3, 7, 2, true, "rempl", 3, 5, 2}, 325 | {"envahi", 2, 5, 4, true, "envah", 2, 5, 4}, 326 | {"tandis", 3, 6, 2, true, "tand", 3, 4, 2}, 327 | {"trahit", 4, 6, 3, true, "trah", 4, 4, 3}, 328 | {"trahissaIent", 4, 6, 3, true, "trah", 4, 4, 3}, 329 | {"réunie", 4, 6, 2, true, "réun", 4, 4, 2}, 330 | {"avarie", 2, 4, 3, true, "avar", 2, 4, 3}, 331 | {"dilettanti", 3, 5, 2, true, "dilettant", 3, 5, 2}, 332 | {"raidie", 4, 6, 2, true, "raid", 4, 4, 2}, 333 | {"écuries", 2, 4, 3, true, "écur", 2, 4, 3}, 334 | {"recouvrît", 3, 6, 2, true, "recouvr", 3, 6, 2}, 335 | {"parsis", 3, 6, 3, true, "pars", 3, 4, 3}, 336 | {"monti", 3, 5, 2, true, "mont", 3, 4, 2}, 337 | {"reproduisit", 3, 6, 2, true, "reproduis", 3, 6, 2}, 338 | {"étendit", 2, 4, 3, true, "étend", 2, 4, 3}, 339 | {"suffi", 3, 5, 2, true, "suff", 3, 4, 2}, 340 | {"pillaji", 
3, 6, 2, true, "pillaj", 3, 6, 2}, 341 | {"rougir", 4, 6, 2, true, "roug", 4, 4, 2}, 342 | {"désirez", 3, 5, 2, true, "dés", 3, 3, 2}, 343 | {"subit", 3, 5, 2, true, "sub", 3, 3, 2}, 344 | {"fondirent", 3, 6, 2, true, "fond", 3, 4, 2}, 345 | {"coqUineries", 3, 6, 2, true, "coqUiner", 3, 6, 2}, 346 | {"venir", 3, 5, 2, true, "ven", 3, 3, 2}, 347 | {"plaidoirie", 5, 8, 3, true, "plaidoir", 5, 8, 3}, 348 | {"fournissant", 4, 7, 2, true, "fourn", 4, 5, 2}, 349 | {"bonzeries", 3, 6, 2, true, "bonzer", 3, 6, 2}, 350 | {"flétri", 4, 6, 3, true, "flétr", 4, 5, 3}, 351 | {"faillit", 4, 7, 2, true, "faill", 4, 5, 2}, 352 | {"hardie", 3, 6, 2, true, "hard", 3, 4, 2}, 353 | {"compagnie", 3, 6, 2, true, "compagn", 3, 6, 2}, 354 | {"vernis", 3, 6, 2, true, "vern", 3, 4, 2}, 355 | {"attendit", 2, 5, 4, true, "attend", 2, 5, 4}, 356 | {"blanchies", 4, 9, 3, true, "blanch", 4, 6, 3}, 357 | {"choisie", 5, 7, 3, true, "chois", 5, 5, 3}, 358 | {"rafraîchir", 3, 7, 2, true, "rafraîch", 3, 7, 2}, 359 | {"choisir", 5, 7, 3, true, "chois", 5, 5, 3}, 360 | {"nourrisse", 4, 7, 2, true, "nourr", 4, 5, 2}, 361 | {"chancellerie", 4, 7, 3, true, "chanceller", 4, 7, 3}, 362 | {"repartie", 3, 5, 2, true, "repart", 3, 5, 2}, 363 | {"redira", 3, 5, 2, true, "red", 3, 3, 2}, 364 | {"sentira", 3, 6, 2, true, "sent", 3, 4, 2}, 365 | {"surgirait", 3, 6, 2, true, "surg", 3, 4, 2}, 366 | {"cani", 3, 4, 2, true, "can", 3, 3, 2}, 367 | {"gratis", 4, 6, 3, true, "grat", 4, 4, 3}, 368 | {"médît", 3, 5, 2, true, "méd", 3, 3, 2}, 369 | {"avertis", 2, 4, 3, true, "avert", 2, 4, 3}, 370 | {"chirurgie", 4, 6, 3, true, "chirurg", 4, 6, 3}, 371 | {"ironie", 2, 4, 3, true, "iron", 2, 4, 3}, 372 | {"punîtes", 3, 5, 2, true, "pun", 3, 3, 2}, 373 | {"compromis", 3, 7, 2, true, "comprom", 3, 7, 2}, 374 | {"simonie", 3, 5, 2, true, "simon", 3, 5, 2}, 375 | } 376 | romance.RunStepTest(t, step2a, testCases) 377 | } 378 | 379 | // Test the removal of Verb suffixes in RV that 380 | // do not begin with "i". 
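Step 2b covers the remaining verb endings in RV that do not start with "i": the "-é/-ée/-és/-èrent" family, "-ait/-ant/-aIent", "-er/-ez", and so on; the rows below confirm that "posée" reduces to "pos" and "contentait" to "content". A sketch of one confirmed row (illustrative wrapper name; step2b is unexported, so this belongs inside package french with "fmt" imported):

func sketchStep2b() {
	w := snowballword.New("contentait")
	w.R1start, w.R2start, w.RVstart = 3, 6, 2
	step2b(w)
	fmt.Println(w.String()) // per the table below: "content"
}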
381 | // 382 | func Test_step2b(t *testing.T) { 383 | testCases := []romance.StepTestCase{ 384 | {"posée", 3, 5, 2, true, "pos", 3, 3, 2}, 385 | {"contentait", 3, 6, 2, true, "content", 3, 6, 2}, 386 | {"évita", 2, 4, 3, true, "évit", 2, 4, 3}, 387 | {"cantonnées", 3, 6, 2, true, "cantonn", 3, 6, 2}, 388 | {"tender", 3, 6, 2, true, "tend", 3, 4, 2}, 389 | {"survenait", 3, 6, 2, true, "surven", 3, 6, 2}, 390 | {"plongeaIent", 4, 8, 3, true, "plong", 4, 5, 3}, 391 | {"modéra", 3, 5, 2, true, "modér", 3, 5, 2}, 392 | {"copier", 3, 6, 2, true, "copi", 3, 4, 2}, 393 | {"bougez", 4, 6, 2, true, "boug", 4, 4, 2}, 394 | {"déploYaIent", 3, 6, 2, true, "déploY", 3, 6, 2}, 395 | {"entendra", 2, 5, 4, true, "entendr", 2, 5, 4}, 396 | {"blâmer", 4, 6, 3, true, "blâm", 4, 4, 3}, 397 | {"déshonorait", 3, 6, 2, true, "déshonor", 3, 6, 2}, 398 | {"concentrés", 3, 6, 2, true, "concentr", 3, 6, 2}, 399 | {"mangeant", 3, 7, 2, true, "mang", 3, 4, 2}, 400 | {"écouteront", 2, 5, 3, true, "écout", 2, 5, 3}, 401 | {"pressaIent", 4, 7, 3, true, "press", 4, 5, 3}, 402 | {"ébréché", 2, 5, 4, true, "ébréch", 2, 5, 4}, 403 | {"frapper", 4, 7, 3, true, "frapp", 4, 5, 3}, 404 | {"côtoYé", 3, 5, 2, true, "côtoY", 3, 5, 2}, 405 | {"réfugié", 3, 5, 2, true, "réfugi", 3, 5, 2}, 406 | {"jeûnant", 4, 6, 2, true, "jeûn", 4, 4, 2}, 407 | {"succombé", 3, 6, 2, true, "succomb", 3, 6, 2}, 408 | {"irrité", 2, 5, 4, true, "irrit", 2, 5, 4}, 409 | {"danger", 3, 6, 2, true, "dang", 3, 4, 2}, 410 | {"sachant", 3, 6, 2, true, "sach", 3, 4, 2}, 411 | {"reparaissaIent", 3, 5, 2, true, "reparaiss", 3, 5, 2}, 412 | {"reconnaissant", 3, 5, 2, true, "reconnaiss", 3, 5, 2}, 413 | {"faisant", 4, 6, 2, true, "fais", 4, 4, 2}, 414 | {"arrangés", 2, 5, 4, true, "arrang", 2, 5, 4}, 415 | {"emparés", 2, 5, 4, true, "empar", 2, 5, 4}, 416 | {"choqUée", 4, 7, 3, true, "choqU", 4, 5, 3}, 417 | {"gênait", 3, 6, 2, true, "gên", 3, 3, 2}, 418 | {"croissante", 5, 8, 3, true, "croiss", 5, 6, 3}, 419 | {"scié", 4, 4, 3, true, "sci", 3, 3, 3}, 420 | {"reconnaissez", 3, 5, 2, true, "reconnaiss", 3, 5, 2}, 421 | {"pliaIent", 5, 7, 3, true, "pli", 3, 3, 3}, 422 | {"expédia", 2, 5, 4, true, "expédi", 2, 5, 4}, 423 | {"déshabillaIent", 3, 6, 2, true, "déshabill", 3, 6, 2}, 424 | {"appréciée", 2, 6, 5, true, "appréci", 2, 6, 5}, 425 | {"amputés", 2, 5, 4, true, "amput", 2, 5, 4}, 426 | {"dominait", 3, 5, 2, true, "domin", 3, 5, 2}, 427 | {"vexantes", 3, 5, 2, true, "vex", 3, 3, 2}, 428 | {"fabriqUées", 3, 6, 2, true, "fabriqU", 3, 6, 2}, 429 | {"retomber", 3, 5, 2, true, "retomb", 3, 5, 2}, 430 | {"exercer", 2, 4, 3, true, "exerc", 2, 4, 3}, 431 | {"entourait", 2, 6, 4, true, "entour", 2, 6, 4}, 432 | {"voYait", 3, 6, 2, true, "voY", 3, 3, 2}, 433 | {"soupait", 4, 7, 2, true, "soup", 4, 4, 2}, 434 | {"apportiez", 2, 5, 4, true, "apport", 2, 5, 4}, 435 | {"tuée", 4, 4, 2, true, "tu", 2, 2, 2}, 436 | {"proposait", 4, 6, 3, true, "propos", 4, 6, 3}, 437 | {"citations", 3, 5, 2, true, "citat", 3, 5, 2}, 438 | {"distinguée", 3, 6, 2, true, "distingu", 3, 6, 2}, 439 | {"parlerez", 3, 6, 3, true, "parl", 3, 4, 3}, 440 | {"stanislas", 4, 6, 3, true, "stanisl", 4, 6, 3}, 441 | {"enlevée", 2, 5, 4, true, "enlev", 2, 5, 4}, 442 | {"irriguaIent", 2, 5, 4, true, "irrigu", 2, 5, 4}, 443 | {"contenant", 3, 6, 2, true, "conten", 3, 6, 2}, 444 | {"empêchèrent", 2, 5, 4, true, "empêch", 2, 5, 4}, 445 | {"inspirées", 2, 6, 5, true, "inspir", 2, 6, 5}, 446 | {"basée", 3, 5, 2, true, "bas", 3, 3, 2}, 447 | {"consultait", 3, 6, 2, true, "consult", 3, 6, 2}, 448 | {"retardait", 3, 5, 2, 
true, "retard", 3, 5, 2}, 449 | {"enlevât", 2, 5, 4, true, "enlev", 2, 5, 4}, 450 | {"convenaIent", 3, 6, 2, true, "conven", 3, 6, 2}, 451 | {"portât", 3, 6, 2, true, "port", 3, 4, 2}, 452 | {"admirée", 2, 5, 4, true, "admir", 2, 5, 4}, 453 | {"copiée", 3, 6, 2, true, "copi", 3, 4, 2}, 454 | {"démenaIent", 3, 5, 2, true, "démen", 3, 5, 2}, 455 | {"fortifiées", 3, 6, 2, true, "fortifi", 3, 6, 2}, 456 | {"apercevrait", 2, 4, 3, true, "apercevr", 2, 4, 3}, 457 | {"risqUer", 3, 7, 2, true, "risqU", 3, 5, 2}, 458 | {"réclamer", 3, 6, 2, true, "réclam", 3, 6, 2}, 459 | {"tremblaIent", 4, 8, 3, true, "trembl", 4, 6, 3}, 460 | {"calomnier", 3, 5, 2, true, "calomni", 3, 5, 2}, 461 | {"réclamée", 3, 6, 2, true, "réclam", 3, 6, 2}, 462 | {"déposât", 3, 5, 2, true, "dépos", 3, 5, 2}, 463 | {"filé", 3, 4, 2, true, "fil", 3, 3, 2}, 464 | {"déchirée", 3, 6, 2, true, "déchir", 3, 6, 2}, 465 | {"prononça", 4, 6, 3, true, "prononç", 4, 6, 3}, 466 | {"précédé", 4, 6, 3, true, "précéd", 4, 6, 3}, 467 | {"asseYait", 2, 5, 4, true, "asseY", 2, 5, 4}, 468 | {"emploYés", 2, 6, 5, true, "emploY", 2, 6, 5}, 469 | {"chagriner", 4, 7, 3, true, "chagrin", 4, 7, 3}, 470 | {"dévorât", 3, 5, 2, true, "dévor", 3, 5, 2}, 471 | {"remonté", 3, 5, 2, true, "remont", 3, 5, 2}, 472 | {"emploYant", 2, 6, 5, true, "emploY", 2, 6, 5}, 473 | {"redoublait", 3, 6, 2, true, "redoubl", 3, 6, 2}, 474 | {"marchant", 3, 7, 2, true, "march", 3, 5, 2}, 475 | {"pétrifiée", 3, 6, 2, true, "pétrifi", 3, 6, 2}, 476 | {"enlevées", 2, 5, 4, true, "enlev", 2, 5, 4}, 477 | {"donnassent", 3, 6, 2, true, "donn", 3, 4, 2}, 478 | {"recomptait", 3, 5, 2, true, "recompt", 3, 5, 2}, 479 | {"masqUait", 3, 8, 2, true, "masqU", 3, 5, 2}, 480 | {"renouvelèrent", 3, 6, 2, true, "renouvel", 3, 6, 2}, 481 | {"recoucher", 3, 6, 2, true, "recouch", 3, 6, 2}, 482 | {"abrégea", 2, 5, 4, true, "abrég", 2, 5, 4}, 483 | {"flattait", 4, 8, 3, true, "flatt", 4, 5, 3}, 484 | } 485 | romance.RunStepTest(t, step2b, testCases) 486 | } 487 | 488 | // Test the cleaning up of "Y" and "ç" suffixes. 
489 | // 490 | func Test_step3(t *testing.T) { 491 | testCases := []romance.StepTestCase{ 492 | {"ennuY", 5, 5, 5, true, "ennui", 5, 5, 5}, 493 | {"envoY", 5, 5, 4, true, "envoi", 5, 5, 4}, 494 | {"aboY", 4, 4, 3, true, "aboi", 4, 4, 3}, 495 | {"essaY", 5, 5, 4, true, "essai", 5, 5, 4}, 496 | {"effroY", 6, 6, 6, true, "effroi", 6, 6, 6}, 497 | {"désennuY", 8, 8, 8, true, "désennui", 8, 8, 8}, 498 | {"renvoY", 6, 6, 6, true, "renvoi", 6, 6, 6}, 499 | {"prononç", 7, 7, 3, true, "prononc", 7, 7, 3}, 500 | {"asseY", 5, 5, 5, true, "assei", 5, 5, 5}, 501 | {"croY", 4, 4, 3, true, "croi", 4, 4, 3}, 502 | {"asseY", 5, 5, 4, true, "assei", 5, 5, 4}, 503 | {"plaç", 4, 4, 3, true, "plac", 4, 4, 3}, 504 | {"ennuY", 5, 5, 5, true, "ennui", 5, 5, 5}, 505 | {"impaY", 5, 5, 5, true, "impai", 5, 5, 5}, 506 | {"déploY", 6, 6, 2, true, "déploi", 6, 6, 2}, 507 | {"avanç", 5, 5, 3, true, "avanc", 5, 5, 3}, 508 | {"recommenç", 9, 9, 2, true, "recommenc", 9, 9, 2}, 509 | {"pitoY", 5, 5, 5, true, "pitoi", 5, 5, 5}, 510 | {"renvoY", 6, 6, 6, true, "renvoi", 6, 6, 6}, 511 | {"choY", 4, 4, 4, true, "choi", 4, 4, 4}, 512 | {"effroY", 6, 6, 6, true, "effroi", 6, 6, 6}, 513 | {"forç", 4, 4, 2, true, "forc", 4, 4, 2}, 514 | {"envoY", 5, 5, 5, true, "envoi", 5, 5, 5}, 515 | {"paY", 3, 3, 3, true, "pai", 3, 3, 3}, 516 | {"bunhY", 5, 5, 2, true, "bunhi", 5, 5, 2}, 517 | } 518 | romance.RunStepTest(t, step3, testCases) 519 | } 520 | 521 | // Test 522 | // 523 | func Test_step4(t *testing.T) { 524 | testCases := []romance.StepTestCase{ 525 | {"défendues", 3, 5, 2, true, "défendu", 3, 5, 2}, 526 | {"mormones", 3, 6, 2, true, "mormon", 3, 6, 2}, 527 | {"souvienne", 4, 7, 2, true, "souvienn", 4, 7, 2}, 528 | {"poumons", 4, 6, 2, true, "poumon", 4, 6, 2}, 529 | {"relâche", 3, 5, 2, true, "relâch", 3, 5, 2}, 530 | {"ressource", 3, 7, 2, true, "ressourc", 3, 7, 2}, 531 | {"petits", 3, 5, 2, true, "petit", 3, 5, 2}, 532 | {"obstacles", 2, 6, 5, true, "obstacl", 2, 6, 5}, 533 | {"voisine", 4, 6, 2, true, "voisin", 4, 6, 2}, 534 | {"tunnels", 3, 6, 2, true, "tunnel", 3, 6, 2}, 535 | {"politesse", 3, 5, 2, true, "politess", 3, 5, 2}, 536 | {"obéisse", 2, 5, 3, true, "obéiss", 2, 5, 3}, 537 | {"brûlons", 4, 6, 3, true, "brûlon", 4, 6, 3}, 538 | {"tâchons", 3, 6, 2, true, "tâchon", 3, 6, 2}, 539 | {"gothiqUes", 3, 6, 2, true, "gothiqU", 3, 6, 2}, 540 | {"acqUise", 2, 6, 5, true, "acqUis", 2, 6, 5}, 541 | {"pigeons", 3, 6, 2, true, "pigeon", 3, 6, 2}, 542 | {"focs", 3, 4, 2, true, "foc", 3, 3, 2}, 543 | {"profondeurs", 4, 6, 3, true, "profondeur", 4, 6, 3}, 544 | {"mettrons", 3, 7, 2, true, "mettron", 3, 7, 2}, 545 | {"bavards", 3, 5, 2, true, "bavard", 3, 5, 2}, 546 | {"nigauds", 3, 6, 2, true, "nigaud", 3, 6, 2}, 547 | {"déesse", 4, 6, 2, true, "déess", 4, 5, 2}, 548 | {"libraires", 3, 7, 2, true, "librair", 3, 7, 2}, 549 | {"sentimentales", 3, 6, 2, true, "sentimental", 3, 6, 2}, 550 | {"libre", 3, 5, 2, true, "libr", 3, 4, 2}, 551 | {"matérielles", 3, 5, 2, true, "matériell", 3, 5, 2}, 552 | {"habitudes", 3, 5, 2, true, "habitud", 3, 5, 2}, 553 | {"blushes", 4, 7, 3, true, "blush", 4, 5, 3}, 554 | {"suppose", 3, 6, 2, true, "suppos", 3, 6, 2}, 555 | {"décrépitude", 3, 6, 2, true, "décrépitud", 3, 6, 2}, 556 | {"incluse", 2, 6, 5, true, "inclus", 2, 6, 5}, 557 | {"files", 3, 5, 2, true, "fil", 3, 3, 2}, 558 | {"côtes", 3, 5, 2, true, "côt", 3, 3, 2}, 559 | {"spirales", 4, 6, 3, true, "spiral", 4, 6, 3}, 560 | {"bamboches", 3, 6, 2, true, "bamboch", 3, 6, 2}, 561 | {"qUête", 4, 5, 3, true, "qUêt", 4, 4, 3}, 562 | {"siècles", 4, 7, 2, 
true, "siècl", 4, 5, 2}, 563 | {"glisse", 4, 6, 3, true, "gliss", 4, 5, 3}, 564 | {"carrosses", 3, 6, 2, true, "carross", 3, 6, 2}, 565 | {"supprime", 3, 7, 2, true, "supprim", 3, 7, 2}, 566 | {"officielle", 2, 5, 4, true, "officiell", 2, 5, 4}, 567 | {"vifs", 3, 4, 2, true, "vif", 3, 3, 2}, 568 | {"adresses", 2, 5, 4, true, "adress", 2, 5, 4}, 569 | {"hussards", 3, 6, 2, true, "hussard", 3, 6, 2}, 570 | {"colle", 3, 5, 3, true, "coll", 3, 4, 3}, 571 | {"amendes", 2, 4, 3, true, "amend", 2, 4, 3}, 572 | {"qUeUe", 4, 5, 3, true, "qUeU", 4, 4, 3}, 573 | {"écharpe", 2, 5, 4, true, "écharp", 2, 5, 4}, 574 | {"débute", 3, 5, 2, true, "début", 3, 5, 2}, 575 | {"refuse", 3, 5, 2, true, "refus", 3, 5, 2}, 576 | {"légers", 3, 5, 2, true, "léger", 3, 5, 2}, 577 | {"entrailles", 2, 7, 5, true, "entraill", 2, 7, 5}, 578 | {"écarlate", 2, 4, 3, true, "écarlat", 2, 4, 3}, 579 | {"manufacturières", 3, 5, 2, true, "manufacturi", 3, 5, 2}, 580 | {"instruire", 2, 8, 6, true, "instruir", 2, 8, 6}, 581 | {"danses", 3, 6, 2, true, "dans", 3, 4, 2}, 582 | {"lits", 3, 4, 2, true, "lit", 3, 3, 2}, 583 | {"cours", 4, 5, 2, true, "cour", 4, 4, 2}, 584 | {"belgirate", 3, 6, 2, true, "belgirat", 3, 6, 2}, 585 | {"délire", 3, 5, 2, true, "délir", 3, 5, 2}, 586 | {"offenses", 2, 5, 4, true, "offens", 2, 5, 4}, 587 | {"athènes", 2, 5, 4, true, "athèn", 2, 5, 4}, 588 | {"alphabets", 2, 6, 5, true, "alphabet", 2, 6, 5}, 589 | {"ascagne", 2, 5, 4, true, "ascagn", 2, 5, 4}, 590 | {"lièvre", 4, 6, 2, true, "lièvr", 4, 5, 2}, 591 | {"hercule", 3, 6, 2, true, "hercul", 3, 6, 2}, 592 | {"casqUe", 3, 6, 2, true, "casqU", 3, 5, 2}, 593 | {"cachons", 3, 6, 2, true, "cachon", 3, 6, 2}, 594 | {"herbe", 3, 5, 2, true, "herb", 3, 4, 2}, 595 | {"banqUette", 3, 7, 2, true, "banqUett", 3, 7, 2}, 596 | {"actuelles", 2, 6, 4, true, "actuell", 2, 6, 4}, 597 | {"intercession", 2, 5, 4, true, "intercess", 2, 5, 4}, 598 | {"pêle", 3, 4, 2, true, "pêl", 3, 3, 2}, 599 | {"grossières", 4, 8, 3, true, "grossi", 4, 6, 3}, 600 | {"qUelle", 4, 6, 3, true, "qUell", 4, 5, 3}, 601 | {"séduits", 3, 6, 2, true, "séduit", 3, 6, 2}, 602 | {"vengeance", 3, 7, 2, true, "vengeanc", 3, 7, 2}, 603 | {"indécentes", 2, 5, 4, true, "indécent", 2, 5, 4}, 604 | {"bergères", 3, 6, 2, true, "bergèr", 3, 6, 2}, 605 | {"fenestrelles", 3, 5, 2, true, "fenestrell", 3, 5, 2}, 606 | {"croupe", 5, 6, 3, true, "croup", 5, 5, 3}, 607 | {"légitime", 3, 5, 2, true, "légitim", 3, 5, 2}, 608 | {"ferrare", 3, 6, 2, true, "ferrar", 3, 6, 2}, 609 | {"briqUe", 4, 6, 3, true, "briqU", 4, 5, 3}, 610 | {"étrangère", 2, 5, 4, true, "étrangèr", 2, 5, 4}, 611 | {"arqUés", 2, 6, 5, true, "arqUé", 2, 5, 5}, 612 | {"guèbres", 4, 7, 2, true, "guèbr", 4, 5, 2}, 613 | {"partons", 3, 6, 3, true, "parton", 3, 6, 3}, 614 | {"distingue", 3, 6, 2, true, "distingu", 3, 6, 2}, 615 | {"paratonnerres", 3, 5, 3, true, "paratonnerr", 3, 5, 3}, 616 | {"anonyme", 2, 4, 3, true, "anonym", 2, 4, 3}, 617 | {"volutes", 3, 5, 2, true, "volut", 3, 5, 2}, 618 | {"décence", 3, 5, 2, true, "décenc", 3, 5, 2}, 619 | {"coupure", 4, 6, 2, true, "coupur", 4, 6, 2}, 620 | {"avarice", 2, 4, 3, true, "avaric", 2, 4, 3}, 621 | {"sensible", 3, 6, 2, true, "sensibl", 3, 6, 2}, 622 | {"cramponne", 4, 7, 3, true, "cramponn", 4, 7, 3}, 623 | {"sympathise", 3, 6, 2, true, "sympathis", 3, 6, 2}, 624 | {"assidue", 2, 5, 4, true, "assidu", 2, 5, 4}, 625 | } 626 | romance.RunStepTest(t, step4, testCases) 627 | } 628 | 629 | // Test a large set of words for which we know 630 | // the correct stemmed form. 
631 | // 632 | func Test_FrenchVocabulary(t *testing.T) { 633 | testCases := []struct { 634 | in string 635 | out string 636 | }{ 637 | {"battements", "batt"}, 638 | {"mélangé", "mélang"}, 639 | {"impériales", "impérial"}, 640 | {"paragraphe", "paragraph"}, 641 | {"charité", "charit"}, 642 | {"reproche", "reproch"}, 643 | {"belvédère", "belvéder"}, 644 | {"illisible", "illisibl"}, 645 | {"pleurs", "pleur"}, 646 | {"passait", "pass"}, 647 | {"heaviest", "heaviest"}, 648 | {"correspondance", "correspond"}, 649 | {"c", "c"}, 650 | {"profitable", "profit"}, 651 | {"remontrance", "remontr"}, 652 | {"ramasseraient", "ramass"}, 653 | {"arrivera", "arriv"}, 654 | {"canta", "cant"}, 655 | {"évanouie", "évanou"}, 656 | {"bleuâtres", "bleuâtr"}, 657 | {"achetées", "achet"}, 658 | {"bazars", "bazar"}, 659 | {"affections", "affect"}, 660 | {"luttent", "luttent"}, 661 | {"recouvra", "recouvr"}, 662 | {"regorgent", "regorgent"}, 663 | {"pruderie", "pruder"}, 664 | {"entomologique", "entomolog"}, 665 | {"jansénisme", "jansen"}, 666 | {"tourne", "tourn"}, 667 | {"tuer", "tu"}, 668 | {"concluantes", "conclu"}, 669 | {"subi", "sub"}, 670 | {"agent", "agent"}, 671 | {"instantanément", "instantan"}, 672 | {"gustave", "gustav"}, 673 | {"colossales", "colossal"}, 674 | {"nothing", "nothing"}, 675 | {"quantièmes", "quantiem"}, 676 | {"aidez", "aid"}, 677 | {"horlogerie", "horloger"}, 678 | {"ranimer", "ranim"}, 679 | {"landau", "landau"}, 680 | {"mêler", "mêl"}, 681 | {"scrupuleusement", "scrupul"}, 682 | {"poitrail", "poitrail"}, 683 | {"chaudement", "chaud"}, 684 | {"impiété", "impiet"}, 685 | {"redoublaient", "redoubl"}, 686 | {"punira", "pun"}, 687 | {"proposa", "propos"}, 688 | {"envolés", "envol"}, 689 | {"réparer", "répar"}, 690 | {"inventer", "invent"}, 691 | {"précision", "précis"}, 692 | {"déguisa", "déguis"}, 693 | {"plantations", "plantat"}, 694 | {"appliqua", "appliqu"}, 695 | {"plat", "plat"}, 696 | {"préfète", "préfet"}, 697 | {"baisers", "baiser"}, 698 | {"calmèrent", "calm"}, 699 | {"tressé", "tress"}, 700 | {"consulta", "consult"}, 701 | {"dédaigneux", "dédaign"}, 702 | {"dithyrambe", "dithyramb"}, 703 | {"obligera", "oblig"}, 704 | {"nommés", "nomm"}, 705 | {"mousseux", "mousseux"}, 706 | {"pusillanimes", "pusillanim"}, 707 | {"richissime", "richissim"}, 708 | {"weber", "web"}, 709 | {"groupes", "group"}, 710 | {"rentra", "rentr"}, 711 | {"persécuté", "persécut"}, 712 | {"nuiraient", "nuir"}, 713 | {"ayant", "ayant"}, 714 | {"joueraient", "jou"}, 715 | {"attenante", "atten"}, 716 | {"formait", "form"}, 717 | {"encombrées", "encombr"}, 718 | {"sifflait", "siffl"}, 719 | {"lire", "lir"}, 720 | {"faciliter", "facilit"}, 721 | {"casse", "cass"}, 722 | {"remit", "rem"}, 723 | {"profond", "profond"}, 724 | {"sortez", "sort"}, 725 | {"boiteux", "boiteux"}, 726 | {"flatteuses", "flatteux"}, 727 | {"plafonds", "plafond"}, 728 | {"trahît", "trah"}, 729 | {"lesquelles", "lesquel"}, 730 | {"fantaisies", "fantais"}, 731 | {"séduite", "séduit"}, 732 | {"consolée", "consol"}, 733 | {"estomac", "estomac"}, 734 | {"adverbe", "adverb"}, 735 | {"promenés", "promen"}, 736 | {"côte", "côt"}, 737 | {"flegme", "flegm"}, 738 | {"végétaient", "véget"}, 739 | {"annoncerait", "annonc"}, 740 | {"quais", "quais"}, 741 | {"hissa", "hiss"}, 742 | {"protection", "protect"}, 743 | {"destine", "destin"}, 744 | {"justice", "justic"}, 745 | {"fili", "fil"}, 746 | {"conduite", "conduit"}, 747 | {"narra", "narr"}, 748 | {"torturé", "tortur"}, 749 | {"couloirs", "couloir"}, 750 | {"bronché", "bronch"}, 751 | {"oeuvres", "oeuvr"}, 
752 | {"retire", "retir"}, 753 | {"laisserai", "laiss"}, 754 | {"rassura", "rassur"}, 755 | {"leipsick", "leipsick"}, 756 | {"gâte", "gât"}, 757 | {"désormais", "désorm"}, 758 | {"pain", "pain"}, 759 | {"pianos", "pianos"}, 760 | {"opérée", "oper"}, 761 | {"effrayèrent", "effrai"}, 762 | {"sachez", "sach"}, 763 | {"répétées", "répet"}, 764 | {"time", "tim"}, 765 | {"golgonda", "golgond"}, 766 | {"occupèrent", "occup"}, 767 | {"embrasserais", "embrass"}, 768 | {"dévorante", "dévor"}, 769 | {"soutenant", "souten"}, 770 | {"voluptueuse", "voluptu"}, 771 | {"vicomtes", "vicomt"}, 772 | {"constante", "const"}, 773 | {"admirable", "admir"}, 774 | {"déroger", "dérog"}, 775 | {"survit", "surv"}, 776 | {"manquerais", "manqu"}, 777 | {"remontrer", "remontr"}, 778 | {"exercent", "exercent"}, 779 | {"outrageantes", "outrag"}, 780 | {"dépôt", "dépôt"}, 781 | {"engagées", "engag"}, 782 | {"rouvray", "rouvray"}, 783 | {"comprenez", "compren"}, 784 | {"imprudentes", "imprudent"}, 785 | {"billards", "billard"}, 786 | {"tremblante", "trembl"}, 787 | {"impie", "impi"}, 788 | {"peu", "peu"}, 789 | {"indigène", "indigen"}, 790 | {"social", "social"}, 791 | {"consigne", "consign"}, 792 | {"emporterait", "emport"}, 793 | {"rocky", "rocky"}, 794 | {"cosmopolite", "cosmopolit"}, 795 | {"police", "polic"}, 796 | {"jeun", "jeun"}, 797 | {"lourdes", "lourd"}, 798 | {"extraordinaire", "extraordinair"}, 799 | {"dérangeait", "dérang"}, 800 | {"long", "long"}, 801 | {"empressées", "empress"}, 802 | {"capitulation", "capitul"}, 803 | {"giration", "girat"}, 804 | {"guidés", "guid"}, 805 | {"bourbiers", "bourbi"}, 806 | {"provisions", "provis"}, 807 | {"dois", "dois"}, 808 | {"squelette", "squelet"}, 809 | {"extravagante", "extravag"}, 810 | {"bruns", "brun"}, 811 | {"considérerais", "consider"}, 812 | {"entièrement", "entier"}, 813 | {"suffocations", "suffoc"}, 814 | {"diminue", "diminu"}, 815 | {"froissants", "froiss"}, 816 | {"avalé", "aval"}, 817 | {"détacher", "détach"}, 818 | {"remplace", "remplac"}, 819 | {"exagérait", "exager"}, 820 | {"élévations", "élev"}, 821 | {"exagérant", "exager"}, 822 | {"promenaient", "promen"}, 823 | {"antidatée", "antidat"}, 824 | {"touchait", "touch"}, 825 | {"aimerait", "aim"}, 826 | {"lope", "lop"}, 827 | {"tranchait", "tranch"}, 828 | {"environnent", "environnent"}, 829 | {"inondation", "inond"}, 830 | {"frayeur", "frayeur"}, 831 | {"solaire", "solair"}, 832 | {"oysters", "oyster"}, 833 | {"rêveuse", "rêveux"}, 834 | {"concession", "concess"}, 835 | {"existé", "exist"}, 836 | {"promener", "promen"}, 837 | } 838 | for _, testCase := range testCases { 839 | result := Stem(testCase.in, true) 840 | if result != testCase.out { 841 | t.Errorf("Expected %v -> %v, but got %v", testCase.in, testCase.out, result) 842 | } 843 | } 844 | } 845 | --------------------------------------------------------------------------------
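Taken together, these language test files also document the per-language API: the packages exercised above each expose a Stem(word, stemStopWords) function returning the stemmed string, and the Spanish and French packages additionally export IsStopWord. A small cross-language sketch built only from results the tests above confirm:

package main

import (
	"fmt"

	"github.com/kljensen/snowball/english"
	"github.com/kljensen/snowball/french"
	"github.com/kljensen/snowball/spanish"
)

func main() {
	fmt.Println(english.Stem("skating", true)) // "skate", per Test_Stem in english_test.go
	fmt.Println(french.Stem("passait", true))  // "pass", per Test_FrenchVocabulary
	fmt.Println(spanish.IsStopWord("el"))      // true, per Test_stopWords in spanish_test.go
}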