├── LICENSE
├── README.md
├── C
├── cistem.h
└── cistem.c
├── Cpp
├── cistem.hpp
└── cistem.cpp
├── Cistem.hs
├── Cistem.java
├── Cistem.pm
├── Cistem.cs
├── Cistem.swift
├── Cistem.py
└── Cistem.js
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Leonie Weißweiler
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CISTEM
2 | []()
3 |
4 | CISTEM is a stemming algorithm for the German language, developed by Leonie Weißweiler and Alexander Fraser. This repository contains official implementations in a variety of programming languages. At the moment, the following languages are available:
5 |
6 | - Python
7 | - Java
8 | - C++
9 | - C
10 | - Javascript
11 | - Go
12 | - Haskell
13 | - Perl
14 | - Swift
15 |
16 | The code for each language includes a method for stemming and a method for segmentation; the latter returns the stem together with the suffix that was stripped from it.
17 |
18 | ## Performance
19 | We performed a comparative analysis of six publicly available German stemmers, where CISTEM achieved the best results for f-measure and state-of-the-art results for runtime.
20 |
21 |
22 |
23 | ## Gold standards
24 | The *gold_standards* folder contains the two gold standards we used for evaluation. Each file is a UTF-8 text file in which each line contains all the stems of one cluster, separated by single spaces. Note that we do not supply a reference *stem* for each cluster, as we measure stemming performance as the ability to group words with the same meaning, which is more relevant for information retrieval purposes than the absolute stem. If you use these gold standards in your own research, please cite our paper: [Bibtex](http://cis.lmu.de/~weissweiler/cistem/bibtex/bibtex.php)
25 |
26 | More information on how we evaluated runtimes and stemming quality can be found in our paper:
27 |
28 | Leonie Weißweiler, Alexander Fraser (2017). [Developing a Stemmer for German Based on a Comparative Analysis of Publicly Available Stemmers.](http://cis.lmu.de/~weissweiler/cistem/) In Proceedings of the German Society for Computational Linguistics and Language Technology (GSCL), to appear.
29 |
--------------------------------------------------------------------------------
/C/cistem.h:
--------------------------------------------------------------------------------
/* CISTEM Stemmer for German

This is the official C implementation of the CISTEM stemmer.
It is based on the paper
Leonie Weißweiler, Alexander Fraser (2017). Developing a Stemmer for German Based on a Comparative Analysis of Publicly Available Stemmers. In Proceedings of the German Society for Computational Linguistics and Language Technology (GSCL)
which can be read here:
http://www.cis.lmu.de/~weissweiler/cistem/

In the paper, we conducted an analysis of publicly available stemmers, developed
two gold standards for German stemming and evaluated the stemmers based on the
two gold standards. We then proposed the stemmer implemented here and show
that it achieves slightly better f-measure than the other stemmers and is
thrice as fast as the Snowball stemmer for German while being about as fast as
most other stemmers.
*/

#ifndef CISTEM_H
#define CISTEM_H

#include <stddef.h> /* wchar_t: keeps this header usable on its own */

/* Stemming.

   stem() takes the word to be stemmed and returns the stemmed word using the
   normal, case-sensitive algorithm. stem_case_insensitive() does the same
   with case-insensitive stemming.

   Case insensitivity improves performance only if words in the text may be
   incorrectly upper case. For all-lowercase and correctly cased text, best
   performance is achieved by using the case-sensitive version.

   NOTE(review): ownership of the returned buffer is not visible from this
   header; confirm in cistem.c whether the caller has to free it.
*/
wchar_t *stem(wchar_t *word);
wchar_t *stem_case_insensitive(wchar_t *word);

/* Segmentation.

   These functions work very similarly to stem. The only difference is that in
   addition to returning the stem, they also return the rest that was removed
   at the end. To be able to return the stem unchanged so the stem and the rest
   can be concatenated to form the original word, all substitutions that
   altered the stem in any other way than by removing letters at the end were
   left out.

   The return value is an array that contains a pointer to the stem first,
   then a pointer to the suffix second.

   NOTE(review): ownership of the returned array and of the two strings is not
   visible from this header; confirm in cistem.c.
*/
wchar_t **segment(wchar_t *word);
wchar_t **segment_case_insensitive(wchar_t *word);

#endif
--------------------------------------------------------------------------------
/Cpp/cistem.hpp:
--------------------------------------------------------------------------------
/* CISTEM Stemmer for German

This is the official C++ implementation of the CISTEM stemmer.
It is based on the paper
Leonie Weißweiler, Alexander Fraser (2017). Developing a Stemmer for German Based on a Comparative Analysis of Publicly Available Stemmers. In Proceedings of the German Society for Computational Linguistics and Language Technology (GSCL)
which can be read here:
http://www.cis.lmu.de/~weissweiler/cistem/

In the paper, we conducted an analysis of publicly available stemmers, developed
two gold standards for German stemming and evaluated the stemmers based on the
two gold standards. We then proposed the stemmer implemented here and show
that it achieves slightly better f-measure than the other stemmers and is
thrice as fast as the Snowball stemmer for German while being about as fast as
most other stemmers.
*/

// The guard is distinct from the one used by the C header (CISTEM_H) so that
// including both headers in one translation unit does not hide this one.
#ifndef CISTEM_HPP
#define CISTEM_HPP

#include <array>
#include <string>

namespace Cistem {

/* This function takes the word to be stemmed and a boolean specifying whether
   case-insensitive stemming should be used, and returns the stemmed word. If
   only the word is passed or the second parameter is false, normal
   case-sensitive stemming is used; if the second parameter is true,
   case-insensitive stemming is used.

   Case insensitivity improves performance only if words in the text may be
   incorrectly upper case. For all-lowercase and correctly cased text, best
   performance is achieved by using the case-sensitive version.
*/
std::wstring stem(std::wstring word);
std::wstring stem(std::wstring word, bool caseInsensitive);

/* This function works very similarly to stem. The only difference is that in
   addition to returning the stem, it also returns the rest that was removed at
   the end. To be able to return the stem unchanged so the stem and the rest
   can be concatenated to form the original word, all substitutions that
   altered the stem in any other way than by removing letters at the end were
   left out.

   The return value is an array that holds the stem first and the removed
   suffix second.
*/
std::array<std::wstring, 2> segment(std::wstring word);
std::array<std::wstring, 2> segment(std::wstring word, bool caseInsensitive);

}

#endif
49 |
--------------------------------------------------------------------------------
/Cistem.hs:
--------------------------------------------------------------------------------
{-# LANGUAGE OverloadedStrings, RecordWildCards #-}

module NLP.Stemmer.Cistem (stem,stemCaseInsensitive,Segmentation(..),segment',segment,segment'CaseInsensitive,segmentCaseInsensitive) where

import Data.Char
import Data.Monoid
import Data.Text as T

-- | Guess the word stem. This module uses the CISTEM algorithm, published by L. Weißweiler and A. Fraser in "Developing a Stemmer for German Based on a Comparative Analysis of Publicly Available Stemmers" (2017).
--
-- A leading upper-case letter disables the stripping of a trailing "t".
-- The empty text is returned unchanged.
stem :: Text -> Text
stem t
  | T.null t  = t  -- 'T.head' is partial, so the empty word is handled first
  | otherwise = postpare $ loop (isUpper (T.head t)) $ prepare t

-- | A case insensitive variant. Use only if the text may be incorrectly upper case.
stemCaseInsensitive :: Text -> Text
stemCaseInsensitive t = postpare $ loop False $ prepare t

-- | The result of 'segment'': the stripped prefix (either empty or "ge"),
-- the stem, and the stripped suffix.
data Segmentation = Segmentation { segPrefix :: Text, segStem :: Text, segSuffix :: Text } deriving (Show,Eq)

-- | Split the word into a prefix, the stem and a suffix. In contrast to the `stem` function umlauts remain unchanged.
--
-- All three parts are taken from the lower-cased word. The empty text yields
-- three empty parts.
segment' :: Text -> Segmentation
segment' t
  | T.null t  = Segmentation "" "" ""  -- 'T.head' below is partial
  | otherwise =
      let firstUpper = isUpper (T.head t)
          lower      = T.toLower t
          prepared   = segmentPrepare t
          theStem    = postpare $ loop firstUpper prepared
          -- 'segmentPrepare' may have dropped a leading "ge"; detect that by
          -- checking whether the stem still lines up with the start of the word.
          thePrefix | theStem `isPrefixOf` lower = ""
                    | "ge" `isPrefixOf` lower    = "ge"
                    | otherwise = error ("segment' should be debugged; extracted stem: "++ unpack theStem)
          theSuffix  = T.drop (T.length thePrefix + T.length theStem) lower
      in Segmentation thePrefix theStem theSuffix

-- | Split the word into stem and suffix. This is supposed to be compatible to the `segment` function from the reference implementation.
segment :: Text -> (Text,Text)
segment t =
  let Segmentation{..} = segment' t
  in (segPrefix<>segStem, segSuffix)

-- | A case insensitive variant. Use only if the text may be incorrectly upper case.
segmentCaseInsensitive :: Text -> (Text,Text)
segmentCaseInsensitive = segment . T.toLower

-- | A case insensitive variant. Use only if the text may be incorrectly upper case.
segment'CaseInsensitive :: Text -> Segmentation
segment'CaseInsensitive = segment' . T.toLower

-- | Repeatedly strip suffixes from an already prepared word.
--
-- The first argument tells whether the original word started with an
-- upper-case letter; if it did, a trailing "t" is never stripped.
-- Words of three characters or fewer are left alone, and the two-letter
-- suffixes are only stripped from words longer than five characters.
loop :: Bool -> Text -> Text
loop u t
  | T.length t <= 3 = t
  | (T.length t > 5) && (["em","er","nd"] `isSuffixOf'` t) = loop u (stripSuffix' ["em","er","nd"] t)
  | not u && ("t" `isSuffixOf` t) = loop u (stripSuffix' ["t"] t)
  | ["e","s","n"] `isSuffixOf'` t = loop u (stripSuffix' ["e","s","n"] t)
  | otherwise = t

-- | Preparation used by 'stem': everything 'segmentPrepare' does, followed by
-- expanding "ß" to "ss", marking doubled characters again and replacing
-- umlauts by their base vowels. (Composition runs bottom to top.)
prepare :: Text -> Text
prepare =
  replace "ü" "u" .
  replace "ö" "o" .
  replace "ä" "a" .
  replxx .
  replace "ß" "ss" .
  segmentPrepare

-- | Preparation shared by stemming and segmentation: lower-case the word,
-- drop a leading "ge", mark doubled characters and fold "ei", "ie" and "sch"
-- into the single placeholder characters '%', '&' and '$'.
-- (Composition runs bottom to top.)
segmentPrepare :: Text -> Text
segmentPrepare =
  replace "sch" "$" .
  replace "ie" "&" .
  replace "ei" "%" .
  replxx .
  stripge .
  T.toLower

-- | Undo the placeholder substitutions made during preparation.
postpare :: Text -> Text
postpare =
  replace "%" "ei" .
  replace "&" "ie" .
  replace "$" "sch" .
  replxxback

-- | Replace every character that equals its predecessor by '*'.
replxx :: Text -> Text
replxx = snd . mapAccumL f '\0'
  where f prev curr | prev == curr = (curr,'*')
                    | otherwise    = (curr,curr)

-- | Replace every '*' by the most recent character that was not a '*'.
replxxback :: Text -> Text
replxxback = snd . mapAccumL f '\0'
  where f prev '*'  = (prev,prev)
        f _    curr = (curr,curr)

-- | Drop a leading "ge" from words of at least six characters.
stripge :: Text -> Text
stripge t
  | T.length t >= 6 =
      case stripPrefix "ge" t of
        Nothing   -> t
        Just rest -> rest
  | otherwise = t

-- | True if any of the given suffixes is a suffix of the text.
isSuffixOf' :: [Text] -> Text -> Bool
isSuffixOf' [] _ = False
isSuffixOf' (s:ss) t = (s `isSuffixOf` t) || (ss `isSuffixOf'` t)

-- | Strip the first of the given suffixes that matches; if none matches the
-- text is returned unchanged.
stripSuffix' :: [Text] -> Text -> Text
stripSuffix' [] hay = hay
stripSuffix' (suff:ss) hay =
  case stripSuffix suff hay of
    Just t  -> t
    Nothing -> stripSuffix' ss hay
106 |
--------------------------------------------------------------------------------
/Cistem.java:
--------------------------------------------------------------------------------
1 | import java.util.regex.Pattern;
/**
 * CISTEM stemmer for German.
 *
 * <p>{@link #stem(String, boolean)} returns the stem of a word;
 * {@link #segment(String, boolean)} additionally returns the suffix that was
 * removed from the end of the word.
 */
public class Cistem {

    private static final Pattern GE_PATTERN = Pattern.compile("^ge(.{4,})");
    private static final Pattern DOLLAR1_PATTERN = Pattern.compile("(.)\\1");
    private static final Pattern ND_PATTERN = Pattern.compile("nd$");
    private static final Pattern EMR_PATTERN = Pattern.compile("e[mr]$");
    private static final Pattern T_PATTERN = Pattern.compile("t$");
    private static final Pattern ESN_PATTERN = Pattern.compile("[esn]$");
    private static final Pattern STAR_PATTERN = Pattern.compile("(.)\\*");

    /**
     * Stems {@code word} using case-sensitive stemming.
     *
     * @param word the word to stem
     * @return the stem, in lower case
     */
    public static String stem(String word) {
        return stem(word, false);
    }

    /**
     * Stems {@code word}.
     *
     * @param word             the word to stem
     * @param case_insensitive if {@code true}, a trailing "t" is stripped even
     *                         when the word starts with an upper-case letter
     * @return the stem, in lower case; the empty string is returned unchanged
     */
    public static String stem(String word, boolean case_insensitive) {
        if (word.isEmpty()) return word;

        word = word.replace("Ü", "U");
        word = word.replace("Ö", "O");
        word = word.replace("Ä", "A");
        word = word.replace("ü", "u");
        word = word.replace("ö", "o");
        word = word.replace("ä", "a");

        boolean uppercase = Character.isUpperCase(word.charAt(0));

        // Locale.ROOT keeps the result independent of the JVM default locale
        // (under a Turkish default, "I" would lower-case to a dotless "ı").
        word = word.toLowerCase(java.util.Locale.ROOT);

        word = word.replace("ß", "ss");
        word = GE_PATTERN.matcher(word).replaceAll("$1");

        word = stripSuffixes(encode(word), !uppercase || case_insensitive);
        return decode(word);
    }

    /**
     * Segments {@code word} using case-sensitive stemming.
     *
     * @param word the word to segment
     * @return a two-element array: the stem, then the removed suffix
     */
    public static String[] segment(String word) {
        return segment(word, false);
    }

    /**
     * Splits {@code word} into a stem and the suffix removed from its end.
     * Unlike {@link #stem(String, boolean)}, umlauts, "ß" and a leading "ge"
     * are left untouched, so stem + suffix equals the lower-cased word.
     *
     * @param word             the word to segment
     * @param case_insensitive if {@code true}, a trailing "t" is stripped even
     *                         when the word starts with an upper-case letter
     * @return a two-element array: the stem, then the removed suffix (both in
     *         lower case; both empty for an empty word)
     */
    public static String[] segment(String word, boolean case_insensitive) {
        if (word.isEmpty()) {
            return new String[] {"", ""};
        }

        boolean uppercase = Character.isUpperCase(word.charAt(0));
        String original = word.toLowerCase(java.util.Locale.ROOT);

        String encoded = encode(original);
        String stripped = stripSuffixes(encoded, !uppercase || case_insensitive);

        // Only plain letters (never placeholders) are stripped, so the number
        // of characters removed from the encoded word equals the length of the
        // suffix in the original word.
        int restLength = encoded.length() - stripped.length();
        String rest = original.substring(original.length() - restLength);

        return new String[] {decode(stripped), rest};
    }

    /** Folds "sch", "ei", "ie" into '$', '%', '&' and rewrites a doubled character "xx" as "x*". */
    private static String encode(String word) {
        word = word.replace("sch", "$");
        word = word.replace("ei", "%");
        word = word.replace("ie", "&");
        return DOLLAR1_PATTERN.matcher(word).replaceAll("$1*");
    }

    /** Reverses {@link #encode(String)}. */
    private static String decode(String word) {
        word = STAR_PATTERN.matcher(word).replaceAll("$1$1");
        word = word.replace("&", "ie");
        word = word.replace("%", "ei");
        return word.replace("$", "sch");
    }

    /**
     * Repeatedly strips one suffix from the end of the encoded word until the
     * word has at most three characters or no rule applies. The two-letter
     * suffixes are only tried on words longer than five characters.
     *
     * @param stripT whether a trailing "t" may be stripped
     */
    private static String stripSuffixes(String word, boolean stripT) {
        while (word.length() > 3) {
            String shorter = word;
            if (word.length() > 5) {
                shorter = EMR_PATTERN.matcher(word).replaceAll("");
                if (shorter.equals(word)) {
                    shorter = ND_PATTERN.matcher(word).replaceAll("");
                }
            }
            if (stripT && shorter.equals(word)) {
                shorter = T_PATTERN.matcher(word).replaceAll("");
            }
            if (shorter.equals(word)) {
                shorter = ESN_PATTERN.matcher(word).replaceAll("");
            }
            if (shorter.equals(word)) {
                break;
            }
            word = shorter;
        }
        return word;
    }
}
153 |
--------------------------------------------------------------------------------
/Cistem.pm:
--------------------------------------------------------------------------------
#!/usr/bin/perl
use utf8;
package Cistem;

use strict;
use warnings;

# stem and segment can be imported on request: use Cistem qw(stem segment);
use Exporter 'import';
our @EXPORT_OK = qw(stem segment);

# Returns the stem of $word. See the METHODS section of the pod below.
sub stem{
    my ($word, $case_insensitive) = @_;

    $word =~ s/Ü/U/g;
    $word =~ s/Ö/O/g;
    $word =~ s/Ä/A/g;

    $word =~ s/üü/uu/g; #necessary because of Perl Unicode problems
    $word =~ s/öö/oo/g;
    $word =~ s/ää/aa/g;

    $word =~ s/ü/u/g;
    $word =~ s/ö/o/g;
    $word =~ s/ä/a/g;

    # True if upper-casing the first character leaves the word unchanged.
    my $upper = (ucfirst $word eq $word);

    $word = lc($word);

    $word =~ s/ß/ss/g;

    # Drop a leading "ge" when at least four characters follow it.
    $word =~ s/^ge(.{4,})/$1/;

    $word = _strip_suffixes(_encode($word), !$upper || $case_insensitive);

    return _decode($word);
}

# Returns the list (stem, removed suffix) for $word. See the pod below.
sub segment{
    my ($word, $case_insensitive) = @_;

    my $upper = (ucfirst $word eq $word);

    my $original = lc($word);

    my $encoded  = _encode($original);
    my $stripped = _strip_suffixes($encoded, !$upper || $case_insensitive);

    # Only plain letters (never placeholders) are stripped, so the number of
    # characters removed from the encoded word is the length of the suffix in
    # the original word.
    my $rest_length = length($encoded) - length($stripped);
    my $rest = $rest_length ? substr($original, - $rest_length) : "";

    return (_decode($stripped), $rest);
}

# Folds "sch", "ei", "ie" into '$', '%', '&' and rewrites a doubled
# character "xx" as "x*".
sub _encode{
    my ($word) = @_;

    $word =~ s/sch/\$/g;
    $word =~ s/ei/\%/g;
    $word =~ s/ie/\&/g;

    $word =~ s/(.)\1/${1}*/g;

    return $word;
}

# Reverses _encode.
sub _decode{
    my ($word) = @_;

    $word =~ s/(.)\*/$1$1/g;

    $word =~ s/\$/sch/g;
    $word =~ s/\%/ei/g;
    $word =~ s/\&/ie/g;

    return $word;
}

# Repeatedly strips one suffix from the end of the encoded word until the word
# has at most three characters or no rule applies. The two-letter suffixes are
# only tried on words longer than five characters; a trailing "t" is only
# stripped when $strip_t is true.
sub _strip_suffixes{
    my ($word, $strip_t) = @_;

    while(length($word)>3){
        next if length($word)>5 && ($word =~ s/e[mr]$// || $word =~ s/nd$//);
        next if $strip_t && $word =~ s/t$//;
        next if $word =~ s/[esn]$//;
        last;
    }

    return $word;
}

1;

=pod

=head1 NAME

CISTEM Stemmer for German

=head1 SYNOPSIS

    use Cistem qw(stem segment);
    my $stemmed_word = stem($word);

or, for segmentation:

    my @segments = segment($word);

Without importing, the functions can be called as C<Cistem::stem($word)> and
C<Cistem::segment($word)>.

=head1 DESCRIPTION

This is the official Perl implementation of the CISTEM stemmer.
It is based on the paper
Leonie Weißweiler, Alexander Fraser (2017). Developing a Stemmer for German Based on a Comparative Analysis of Publicly Available Stemmers. In Proceedings of the German Society for Computational Linguistics and Language Technology (GSCL)
which can be read here:
http://www.cis.lmu.de/~weissweiler/cistem/

In the paper, we conducted an analysis of publicly available stemmers, developed
two gold standards for German stemming and evaluated the stemmers based on the
two gold standards. We then proposed the stemmer implemented here and show
that it achieves slightly better f-measure than the other stemmers and is
thrice as fast as the Snowball stemmer for German while being about as fast as
most other stemmers.

=head1 METHODS

=over 8

=item stem($word, $case_insensitivity)

This method takes the word to be stemmed and a boolean specifying whether
case-insensitive stemming should be used, and returns the stemmed word. If only
the word is passed to the method or the second parameter is 0, normal
case-sensitive stemming is used; if the second parameter is 1, case-insensitive
stemming is used.

Case insensitivity improves performance only if words in the text may be incorrectly upper case.
For all-lowercase and correctly cased text, best performance is achieved by
using the case-sensitive version.

=item segment($word, $case_insensitivity)

This method works very similarly to stem. The only difference is that in
addition to returning the stem, it also returns the rest that was removed at
the end. To be able to return the stem unchanged so the stem and the rest
can be concatenated to form the original word, all substitutions that altered
the stem in any other way than by removing letters at the end were left out.

=back

=cut
164 |
--------------------------------------------------------------------------------
/Cpp/cistem.cpp:
--------------------------------------------------------------------------------
#include "cistem.hpp"

#include <algorithm>
#include <clocale>
#include <cwctype>
#include <regex>
4 |
5 | using namespace std;
6 |
namespace Cistem {

namespace {

// Pre-compiled patterns shared by stem() and segment(). They live in an
// unnamed namespace because nothing outside this file needs them.

// "ß" is expanded to "ss" (stem() only).
const std::wregex replacess(L"ß");

// "sch", "ei" and "ie" are folded into the placeholder characters '$', '%'
// and '&' before the stripping loop runs; the *Back patterns undo that.
const std::wregex replaceSch(L"sch");
const std::wregex replaceSchBack(L"\\$");
const std::wregex replaceEi(L"ei");
const std::wregex replaceEiBack(L"%");
const std::wregex replaceIe(L"ie");
const std::wregex replaceIeBack(L"&");
// A doubled character "xx" is rewritten as "x*" and restored afterwards.
const std::wregex replacexx(L"(.)\\1");
const std::wregex replacexxback(L"(.)\\*");

// A leading "ge" is dropped when at least four characters follow it.
const std::wregex stripge(L"^ge(.{4,})");

// Suffixes removed by the stripping loop.
const std::wregex stripemr(L"e[mr]$");
const std::wregex stripnd(L"nd$");
const std::wregex stript(L"t$");
const std::wregex stripesn(L"[esn]$");

// Returns a lower-cased copy of `word` (per the current C locale).
std::wstring lowercase(const std::wstring &word) {
    std::wstring lowered = word;
    std::transform(lowered.begin(), lowered.end(), lowered.begin(),
                   [](wchar_t c) { return static_cast<wchar_t>(std::towlower(c)); });
    return lowered;
}

// Applies the placeholder substitutions described above.
std::wstring encode(std::wstring text) {
    text = std::regex_replace(text, replaceSch, L"$$");  // "$$" yields a literal '$'
    text = std::regex_replace(text, replaceEi, L"%");
    text = std::regex_replace(text, replaceIe, L"&");
    return std::regex_replace(text, replacexx, L"$1*");
}

// Reverses encode().
std::wstring decode(std::wstring text) {
    text = std::regex_replace(text, replacexxback, L"$1$1");
    text = std::regex_replace(text, replaceEiBack, L"ei");
    text = std::regex_replace(text, replaceIeBack, L"ie");
    return std::regex_replace(text, replaceSchBack, L"sch");
}

// Removes the text matched by `suffix` from `text`; returns whether it matched.
bool stripOnce(std::wstring &text, const std::wregex &suffix) {
    if (!std::regex_search(text, suffix)) {
        return false;
    }
    text = std::regex_replace(text, suffix, L"");
    return true;
}

// Repeatedly strips one suffix from the end of the encoded word until it has
// at most three characters or no rule applies. The two-letter suffixes are
// only tried on words longer than five characters; a trailing "t" is only
// stripped when `stripT` is true. Returns the number of characters removed.
std::size_t stripSuffixes(std::wstring &text, bool stripT) {
    std::size_t removed = 0;
    while (text.size() > 3) {
        if (text.size() > 5 &&
            (stripOnce(text, stripemr) || stripOnce(text, stripnd))) {
            removed += 2;
            continue;
        }
        if (stripT && stripOnce(text, stript)) {
            removed += 1;
            continue;
        }
        if (stripOnce(text, stripesn)) {
            removed += 1;
            continue;
        }
        break;
    }
    return removed;
}

}  // namespace

// Returns the stem of `word` in lower case. Umlauts are replaced by their base
// vowels, "ß" by "ss", and a leading "ge" is dropped. When `caseInsensitive`
// is false, a word starting with an upper-case letter keeps a trailing "t".
// An empty word yields an empty string.
//
// Side effect: sets the process-wide C locale to "de_DE.UTF-8" (if available).
std::wstring stem(const std::wstring word, bool caseInsensitive) {
    if (word.empty()) {
        return L"";
    }

    // The locale has to be selected before the first character is classified;
    // otherwise the very first call would test it under the previous locale.
    std::setlocale(LC_ALL, "de_DE.UTF-8");
    const bool uppercase = std::iswupper(word[0]) != 0;

    std::wstring result = lowercase(word);

    std::replace(result.begin(), result.end(), L'ä', L'a');
    std::replace(result.begin(), result.end(), L'ö', L'o');
    std::replace(result.begin(), result.end(), L'ü', L'u');
    result = std::regex_replace(result, replacess, L"ss");

    result = std::regex_replace(result, stripge, L"$1");
    result = encode(result);

    stripSuffixes(result, !uppercase || caseInsensitive);

    return decode(result);
}

// Splits `word` into {stem, removed suffix}, both in lower case. Unlike
// stem(), umlauts, "ß" and a leading "ge" are left untouched, so stem + suffix
// equals the lower-cased word. An empty word yields two empty strings.
//
// Side effect: sets the process-wide C locale to "de_DE.UTF-8" (if available).
std::array<std::wstring, 2> segment(const std::wstring word, bool caseInsensitive) {
    std::array<std::wstring, 2> result;  // both elements start out empty

    if (word.empty()) {
        return result;
    }

    // See stem(): select the locale before classifying the first character.
    std::setlocale(LC_ALL, "de_DE.UTF-8");
    const bool uppercase = std::iswupper(word[0]) != 0;

    const std::wstring original = lowercase(word);
    std::wstring encoded = encode(original);

    const std::size_t restLength =
        stripSuffixes(encoded, !uppercase || caseInsensitive);

    result[0] = decode(encoded);
    result[1] = original.substr(original.length() - restLength);
    return result;
}

// Case-sensitive convenience overloads.
std::wstring stem(const std::wstring word) {
    return stem(word, false);
}

std::array<std::wstring, 2> segment(const std::wstring word) {
    return segment(word, false);
}

}
168 |
--------------------------------------------------------------------------------
/Cistem.cs:
--------------------------------------------------------------------------------
1 | using System;
2 | using System.Text.RegularExpressions;
3 |
namespace Cistem
{
    /// <summary>
    /// CISTEM stemmer for German. <see cref="Stem(String, bool)"/> returns the
    /// stem of a word; <see cref="Segment(String, bool)"/> additionally returns
    /// the suffix that was removed from the end of the word.
    /// </summary>
    public static class Cistem
    {
        private const string GE_PATTERN = "^ge(.{4,})";
        private const string DOLLAR1_PATTERN = "(.)\\1";
        private const string ND_PATTERN = "nd$";
        private const string EMR_PATTERN = "e[mr]$";
        private const string T_PATTERN = "t$";
        private const string ESN_PATTERN = "[esn]$";
        private const string STAR_PATTERN = "(.)\\*";

        /// <summary>Stems <paramref name="word"/> using case-sensitive stemming.</summary>
        public static String Stem(String word)
        {
            return Stem(word, false);
        }

        /// <summary>Stems <paramref name="word"/>.</summary>
        /// <param name="word">The word to stem.</param>
        /// <param name="case_insensitive">
        /// If true, a trailing "t" is stripped even when the word starts with
        /// an upper-case letter.
        /// </param>
        /// <returns>The stem in lower case; the empty string is returned unchanged.</returns>
        public static String Stem(String word, bool case_insensitive)
        {
            if (word.Length == 0) return word;

            word = word.Replace("Ü", "U");
            word = word.Replace("Ö", "O");
            word = word.Replace("Ä", "A");
            word = word.Replace("ü", "u");
            word = word.Replace("ö", "o");
            word = word.Replace("ä", "a");

            var uppercase = char.IsUpper(word[0]);

            // Invariant casing keeps the result independent of the current
            // culture (under tr-TR, "I" would lower-case to a dotless "ı").
            word = word.ToLowerInvariant();

            word = word.Replace("ß", "ss");
            word = Regex.Replace(word, GE_PATTERN, "$1");

            word = StripSuffixes(Encode(word), !uppercase || case_insensitive);
            return Decode(word);
        }

        /// <summary>Segments <paramref name="word"/> using case-sensitive stemming.</summary>
        public static String[] Segment(String word)
        {
            return Segment(word, false);
        }

        /// <summary>
        /// Splits <paramref name="word"/> into a stem and the suffix removed
        /// from its end. Unlike <see cref="Stem(String, bool)"/>, umlauts, "ß"
        /// and a leading "ge" are left untouched, so stem + suffix equals the
        /// lower-cased word.
        /// </summary>
        /// <param name="word">The word to segment.</param>
        /// <param name="case_insensitive">
        /// If true, a trailing "t" is stripped even when the word starts with
        /// an upper-case letter.
        /// </param>
        /// <returns>
        /// A two-element array: the stem, then the removed suffix (both empty
        /// for an empty word).
        /// </returns>
        public static String[] Segment(String word, bool case_insensitive)
        {
            if (word.Length == 0)
                return new string[] { string.Empty, string.Empty };

            var uppercase = char.IsUpper(word[0]);
            var original = word.ToLowerInvariant();

            var encoded = Encode(original);
            var stripped = StripSuffixes(encoded, !uppercase || case_insensitive);

            // Only plain letters (never placeholders) are stripped, so the
            // number of characters removed from the encoded word equals the
            // length of the suffix in the original word.
            var restLength = encoded.Length - stripped.Length;
            var rest = original.Substring(original.Length - restLength);

            return new string[] { Decode(stripped), rest };
        }

        // Folds "sch", "ei", "ie" into '$', '%', '&' and rewrites a doubled
        // character "xx" as "x*".
        private static string Encode(string word)
        {
            word = word.Replace("sch", "$");
            word = word.Replace("ei", "%");
            word = word.Replace("ie", "&");
            return Regex.Replace(word, DOLLAR1_PATTERN, "$1*");
        }

        // Reverses Encode.
        private static string Decode(string word)
        {
            word = Regex.Replace(word, STAR_PATTERN, "$1$1");
            word = word.Replace("&", "ie");
            word = word.Replace("%", "ei");
            return word.Replace("$", "sch");
        }

        // Repeatedly strips one suffix from the end of the encoded word until
        // it has at most three characters or no rule applies. The two-letter
        // suffixes are only tried on words longer than five characters; a
        // trailing "t" is only stripped when stripT is true.
        private static string StripSuffixes(string word, bool stripT)
        {
            while (word.Length > 3)
            {
                var shorter = word;

                if (word.Length > 5)
                {
                    shorter = Regex.Replace(word, EMR_PATTERN, "");
                    if (shorter == word)
                        shorter = Regex.Replace(word, ND_PATTERN, "");
                }

                if (stripT && shorter == word)
                    shorter = Regex.Replace(word, T_PATTERN, "");

                if (shorter == word)
                    shorter = Regex.Replace(word, ESN_PATTERN, "");

                if (shorter == word)
                    break;

                word = shorter;
            }

            return word;
        }
    }
}
--------------------------------------------------------------------------------
/Cistem.swift:
--------------------------------------------------------------------------------
1 | //
2 | // cistem.swift
3 | // cistem
4 | //
5 | // Created by Hendrik Noeller on 15.09.17.
6 | // Copyright © 2017 de.HendrikNoeller. All rights reserved.
7 | //
8 |
9 | import Foundation
extension String {
    /// Converts an integer offset into a `String.Index`.
    ///
    /// A non-negative `index` is counted forward from `startIndex`; a negative
    /// `index` is counted backward from `endIndex`.
    func index(_ index: Int) -> String.Index {
        let anchor = index < 0 ? self.endIndex : self.startIndex
        return self.index(anchor, offsetBy: index)
    }
}
19 |
20 | struct Cistem {
21 |
22 | static let doubleCharacterRegex = try! NSRegularExpression(pattern: "(.)\\1")
23 | static let doubleCharacterReverseRegex = try! NSRegularExpression(pattern: "(.)\\*")
24 |
25 | private static func isUppercase(_ word: String) -> Bool {
26 | let first = String(word[word.startIndex])
27 | let firstUpper = first.uppercased()
28 | return first == firstUpper
29 | }
30 |
31 | static func stem(_ word: String, caseInsensitive: Bool = false) -> String {
32 | if (word.isEmpty) {
33 | return ""
34 | }
35 | let uppercase = isUppercase(word)
36 |
37 | var result = word
38 | result = result.lowercased()
39 |
40 | result = result.replacingOccurrences(of: "ü", with: "u")
41 | result = result.replacingOccurrences(of: "ö", with: "o")
42 | result = result.replacingOccurrences(of: "ä", with: "a")
43 | result = result.replacingOccurrences(of: "ß", with: "ss")
44 |
45 | if (result.characters.count >= 6 && result.hasPrefix("ge")) {
46 | result = result[result.index(2).. 3) {
57 | if (result.characters.count > 5) {
58 | if (result.hasSuffix("em") || result.hasSuffix("er") || result.hasSuffix("nd")){
59 | result = result[result.startIndex.. [String] {
90 | if (word.isEmpty) {
91 | return ["", ""]
92 | }
93 | let uppercase = isUppercase(word)
94 | var restLength = 0
95 |
96 | var result = word
97 | result = result.lowercased()
98 | let original = result
99 |
100 | result = result.replacingOccurrences(of: "sch", with: "$")
101 | result = result.replacingOccurrences(of: "ei", with: "%")
102 | result = result.replacingOccurrences(of: "ie", with: "&")
103 |
104 |
105 | result = doubleCharacterRegex.stringByReplacingMatches(in: result, options: [], range: NSMakeRange(0, result.characters.count), withTemplate: "$1*")
106 |
107 | while (result.characters.count > 3) {
108 | if (result.characters.count > 5) {
109 | if (result.hasSuffix("em") || result.hasSuffix("er") || result.hasSuffix("nd")){
110 | restLength += 2
111 | result = result[result.startIndex.. 0) {
140 | rest = String(original[original.index(-restLength).. 3:
56 | if len(word) > 5:
57 | (word, success) = stripemr.subn("", word)
58 | if success != 0:
59 | continue
60 |
61 | (word, success) = stripnd.subn("", word)
62 | if success != 0:
63 | continue
64 |
65 | if not upper or case_insensitive:
66 | (word, success) = stript.subn("", word)
67 | if success != 0:
68 | continue
69 |
70 | (word, success) = stripesn.subn("", word)
71 | if success != 0:
72 | continue
73 | else:
74 | break
75 |
76 | word = replxxback.sub(r"\1\1", word)
77 | word = word.replace("%","ei")
78 | word = word.replace("&","ie")
79 | word = word.replace("$","sch")
80 |
81 | return word
82 |
def segment(word, case_insensitive = False):
    """
    Split a word into its stem and the rest that stemming removes.

    This works very similarly to stem. The only difference is that, in
    addition to the stem, the rest that was removed at the end is returned
    as well. So that stem and rest can be concatenated to form the
    (lower-cased) word again, all substitutions that would alter the stem in
    any way other than by removing letters at the end are left out.

    :param word: the word to segment
    :param case_insensitive: if true, the stript rule is also applied to
        words whose first character is upper case
    :return: a tuple (stem, rest); ("", "") for an empty word
    """
    if len(word) == 0:
        return ("", "")

    upper = word[0].isupper()
    word = word.lower()
    original = word

    # Collapse multi-character graphemes into single placeholder characters
    # so the suffix rules below never cut through them.
    for plain, placeholder in (("sch", "$"), ("ei", "%"), ("ie", "&")):
        word = word.replace(plain, placeholder)
    word = replxx.sub(r"\1*", word)

    # Each rule is (pattern, number of characters it removes).
    long_word_rules = ((stripemr, 2), (stripnd, 2))
    t_rules = ((stript, 1),) if (not upper or case_insensitive) else ()
    final_rules = ((stripesn, 1),)

    rest_length = 0
    while len(word) > 3:
        candidates = t_rules + final_rules
        if len(word) > 5:
            candidates = long_word_rules + candidates

        for pattern, removed in candidates:
            (word, hits) = pattern.subn("", word)
            if hits != 0:
                rest_length += removed
                break
        else:
            # No rule applied: the stem is final.
            break

    # Undo the placeholder substitutions.
    word = replxxback.sub(r"\1\1", word)
    for placeholder, plain in (("%", "ei"), ("&", "ie"), ("$", "sch")):
        word = word.replace(placeholder, plain)

    rest = original[-rest_length:] if rest_length else ""

    return (word, rest)
145 |
--------------------------------------------------------------------------------
/Cistem.js:
--------------------------------------------------------------------------------
1 | /**
2 | * CISTEM Stemmer for German
3 | *
4 | * This is the official Javascript implementation of the CISTEM stemmer.
5 | * It is based on the paper
6 | * Leonie Weißweiler, Alexander Fraser (2017). Developing a Stemmer for German Based on a Comparative Analysis of Publicly Available Stemmers. In Proceedings of the German Society for Computational Linguistics and Language Technology (GSCL)
7 | * which can be read here:
8 | * http://www.cis.lmu.de/~weissweiler/cistem/
9 | *
10 | * In the paper, we conducted an analysis of publicly available stemmers, developed
11 | * two gold standards for German stemming and evaluated the stemmers based on the
12 | * two gold standards. We then proposed the stemmer implemented here and show
13 | * that it achieves slightly better f-measure than the other stemmers and is
14 | * thrice as fast as the Snowball stemmer for German while being about as fast as
15 | * most other stemmers.
16 | */
17 |
18 |
// Prefix removal and umlaut/ß normalisation (applied by stem only).
const stripge = /^ge(.{4,})/;
const replü = /ü/g;
const replö = /ö/g;
const replä = /ä/g;
const replß = /ß/g;

// Multi-character graphemes and their single-character placeholders.
const replsch = /sch/g;
const replschback = /\$/g;
const replei = /ei/g;
const repleiback = /%/g;
const replie = /ie/g;
const replieback = /&/g;

// Doubled characters: "xx" is written as "x*" while suffixes are stripped.
const replxx = /(.)\1/g;
const replxxback = /(.)\*/g;

// Suffix rules, tried in this order by the stripping loop.
const stripemr = /e[mr]$/;
const stripnd = /nd$/;
const stript = /t$/;
const stripesn = /[esn]$/;
36 |
/**
 * Stems a German word.
 *
 * If only the word is passed, or the second parameter is falsy, normal
 * case-sensitive stemming is used: a trailing "t" is only stripped from words
 * that do not start with an upper-case character. If the second parameter is
 * truthy, case-insensitive stemming is used and the "t" rule applies to every
 * word.
 * Case insensitivity improves results only if words in the text may be
 * incorrectly upper case. For all-lowercase and correctly cased text, the best
 * results are achieved with the case-sensitive version.
 * @param {String} word the word to be stemmed
 * @param {boolean} case_insensitive whether to use case-insensitive stemming
 * @returns {String} the stemmed, lower-cased word
 */
function stem(word, case_insensitive = false) {
    if (word.length === 0) return word;

    // Declared locally: assigning to an undeclared name would create a global
    // and throws a ReferenceError in strict mode and ES modules.
    const upper = (word[0] === word[0].toUpperCase());
    word = word.toLowerCase();

    word = word.replace(replü, "u");
    word = word.replace(replö, "o");
    word = word.replace(replä, "a");
    word = word.replace(replß, "ss");

    // Drop a leading "ge", then collapse multi-character graphemes into
    // placeholders so the suffix rules cannot cut through them.
    word = word.replace(stripge, "$1");
    word = word.replace(replsch, "$");
    word = word.replace(replei, "%");
    word = word.replace(replie, "&");
    word = word.replace(replxx, "$1*");

    while (word.length > 3) {
        let result;

        if (word.length > 5) {
            result = word.replace(stripemr, "");
            if (result !== word) {
                word = result;
                continue;
            }

            result = word.replace(stripnd, "");
            if (result !== word) {
                word = result;
                continue;
            }
        }

        if (!upper || case_insensitive) {
            result = word.replace(stript, "");
            if (result !== word) {
                word = result;
                continue;
            }
        }

        result = word.replace(stripesn, "");
        if (result === word) {
            // No rule applied any more: the stem is final.
            break;
        }
        word = result;
    }

    // Undo the placeholder substitutions.
    word = word.replace(replxxback, "$1$1");
    word = word.replace(repleiback, "ei");
    word = word.replace(replieback, "ie");
    word = word.replace(replschback, "sch");

    return word;
}
105 |
/**
 * Splits a German word into its stem and the rest removed by stemming.
 *
 * This works very similarly to stem. The only difference is that, in addition
 * to the stem, it also returns the rest that was removed at the end. So that
 * the stem and the rest can be concatenated to form the (lower-cased) word
 * again, all substitutions that would alter the stem in any way other than by
 * removing letters at the end are left out.
 * @param {String} word the word to be segmented
 * @param {boolean} case_insensitive whether to use case-insensitive stemming
 * @returns {Array} a two-element array [stem, rest]
 */
function segment(word, case_insensitive = false) {
    if (word.length === 0) return ["", ""];

    let rest_length = 0;
    // Declared locally: assigning to an undeclared name would create a global
    // and throws a ReferenceError in strict mode and ES modules.
    const upper = (word[0] === word[0].toUpperCase());
    word = word.toLowerCase();

    const original = word;

    // Collapse multi-character graphemes into placeholders so the suffix
    // rules cannot cut through them.
    word = word.replace(replsch, "$");
    word = word.replace(replei, "%");
    word = word.replace(replie, "&");
    word = word.replace(replxx, "$1*");

    while (word.length > 3) {
        let result;

        if (word.length > 5) {
            result = word.replace(stripemr, "");
            if (result !== word) {
                word = result;
                rest_length += 2;
                continue;
            }

            result = word.replace(stripnd, "");
            if (result !== word) {
                word = result;
                rest_length += 2;
                continue;
            }
        }

        if (!upper || case_insensitive) {
            result = word.replace(stript, "");
            if (result !== word) {
                word = result;
                rest_length += 1;
                continue;
            }
        }

        result = word.replace(stripesn, "");
        if (result === word) {
            // No rule applied any more: the stem is final.
            break;
        }
        word = result;
        rest_length += 1;
    }

    // Undo the placeholder substitutions.
    word = word.replace(replxxback, "$1$1");
    word = word.replace(repleiback, "ei");
    word = word.replace(replieback, "ie");
    word = word.replace(replschback, "sch");

    // The rest is the tail of the lower-cased input that was stripped.
    const rest = rest_length > 0
        ? original.slice(original.length - rest_length)
        : "";

    return [word, rest];
}
184 |
--------------------------------------------------------------------------------
/C/cistem.c:
--------------------------------------------------------------------------------
1 |
2 | #include
3 | #include
4 | #include
5 |
/*
 * Code points of the German letters that receive special treatment.
 *
 * These are enumeration constants rather than `static const` objects: in C a
 * const-qualified object is not an integer constant expression, so it cannot
 * portably be used as a `case` label, whereas an enumeration constant can.
 */
enum {
    lowercase_ae = 0x00E4, /* ä */
    lowercase_oe = 0x00F6, /* ö */
    lowercase_ue = 0x00FC, /* ü */
    uppercase_ae = 0x00C4, /* Ä */
    uppercase_oe = 0x00D6, /* Ö */
    uppercase_ue = 0x00DC, /* Ü */
    scharfes_ss = 0x00DF   /* ß */
};
13 |
14 |
15 | int count_double_characters(wchar_t* word, unsigned long length) {
16 | int result = 0;
17 | for (int i = 0; i < length; i++) {
18 | switch (word[i]) {
19 | case scharfes_ss:
20 | result++;
21 | break;
22 | default:
23 | break;
24 | }
25 | }
26 | return result;
27 | }
28 |
/*
 * Case-insensitively compare `pattern` against `source` starting at `index`.
 *
 * `pattern` is expected to be lower case; each source character is lowered
 * with towlower() before the comparison. The caller must make sure that
 * `source` holds at least `index + pattern_length` characters.
 *
 * Returns 1 if all `pattern_length` characters match, 0 otherwise.
 */
int match(wchar_t* source, unsigned index, wchar_t* pattern, unsigned pattern_length) {
    for (unsigned i = 0; i < pattern_length; i++) {
        /* towlower, not tolower: the operands are wide characters, and
           tolower is undefined for values outside the unsigned char range. */
        if (towlower((wint_t)source[i + index]) != (wint_t)pattern[i]) return 0;
    }
    return 1;
}
35 |
/*
 * Check whether the first `length` characters of `source` end with the
 * `pattern_length` characters of `pattern` (exact, case-sensitive compare).
 *
 * Returns 1 on a match and 0 otherwise, including when the pattern is longer
 * than the string.
 */
int match_suffix(wchar_t* source, unsigned long length, wchar_t* pattern, unsigned pattern_length) {
    /* Without this guard `length - pattern_length` would wrap around and the
       loop would read far outside of `source`. */
    if (pattern_length > length) return 0;

    unsigned long start = length - pattern_length;
    for (unsigned i = 0; i < pattern_length; i++) {
        if (source[start + i] != pattern[i]) return 0;
    }
    return 1;
}
42 |
/*
 * Store `x` at destination[offset] and advance `offset`. Expects variables
 * named `destination` and `offset` in the enclosing scope. Wrapped in
 * do { } while (0) so the macro behaves as one statement, e.g. as the
 * unbraced body of an if/else.
 */
#define APPEND(x) do { destination[offset] = (x); offset++; } while (0)
44 |
45 | unsigned long copy_and_normalize(wchar_t* source, unsigned long length, wchar_t* destination, int modify) {
46 | // Offset in the destination
47 | unsigned long offset = 0;
48 |
49 | // Iterate over source characters
50 | for (int i = 0; i < length; i++) {
51 | if (source[i] == 0) break;
52 |
53 | // If the word after it at least 4 characters, skip ahead of a "ge"
54 | if (i == 0 && modify) {
55 | unsigned long remaining_length_with_expansion = length + count_double_characters(source,length);
56 | if (remaining_length_with_expansion >= 4 + 2 && match(source,0,L"ge",2)) {
57 | i += 1;
58 | continue;
59 | }
60 | }
61 |
62 | unsigned long remaining_chars = length - i;
63 |
64 | if (source[i] == scharfes_ss && modify) {
65 | APPEND('s');
66 | APPEND('*');
67 | } else if (source[i] == lowercase_ae && modify) {
68 | APPEND('a');
69 | } else if (source[i] == lowercase_oe && modify) {
70 | APPEND('o');
71 | } else if (source[i] == lowercase_ue && modify) {
72 | APPEND('u');
73 | } else if (source[i] == uppercase_ae && modify) {
74 | APPEND('a');
75 | } else if (source[i] == uppercase_oe && modify) {
76 | APPEND('o');
77 | } else if (source[i] == uppercase_ue && modify) {
78 | APPEND('u');
79 | } else if (remaining_chars >= 3 && match(source,i,L"sch",3)) {
80 | APPEND('$');
81 | i += 2;
82 | } else if (remaining_chars >= 2 && match(source,i,L"ei",2)) {
83 | APPEND('%');
84 | i += 1;
85 | } else if (remaining_chars >= 2 && match(source,i,L"ie",2)) {
86 | APPEND('&');
87 | i += 1;
88 | } else {
89 | APPEND(towlower(source[i]));
90 | }
91 | }
92 | APPEND(0);
93 |
94 | // Replace "xx" by "x*"
95 |
96 | unsigned destination_length = offset - 1;
97 |
98 | for (int i = 0; i < destination_length; i++) {
99 | int remaining_chars = destination_length - i;
100 |
101 | if (remaining_chars >= 2 && destination[i] == destination[i+1]) {
102 | destination[i+1] = '*';
103 | i++;
104 | }
105 | }
106 |
107 | return offset - 1;
108 | }
109 |
/*
 * Expand the stemmer's internal form back into plain text.
 *
 * A '*' that directly follows a character stands for a repetition of that
 * character, and the placeholders '$', '%' and '&' stand for "sch", "ei"
 * and "ie". The result is written to `destination` and 0-terminated.
 *
 * `source`      normalised string; it is only read, never modified
 * `length`      number of characters of `source` to consider; reading also
 *               stops at a 0
 * `destination` output buffer; needs room for the expanded text (up to three
 *               characters per source character) plus the terminating 0
 */
void copy_and_denormalize(wchar_t* source, unsigned long length, wchar_t* destination) {
    // Write position in the destination
    unsigned long offset = 0;

    for (unsigned long i = 0; i < length; i++) {
        wchar_t current = source[i];
        if (current == 0) break;

        // "x*" means "xx": emit the current character twice and consume the '*'
        int copies = 1;
        if (i + 1 < length && source[i + 1] == L'*') {
            copies = 2;
            i++;
        }

        for (; copies > 0; copies--) {
            switch (current) {
            case L'$':
                destination[offset++] = L's';
                destination[offset++] = L'c';
                destination[offset++] = L'h';
                break;
            case L'%':
                destination[offset++] = L'e';
                destination[offset++] = L'i';
                break;
            case L'&':
                destination[offset++] = L'i';
                destination[offset++] = L'e';
                break;
            default:
                destination[offset++] = current;
                break;
            }
        }
    }

    destination[offset] = 0;
}
146 |
/*
 * Determine how much of the normalised `word` remains after stripping
 * suffixes.
 *
 * While more than three characters remain, one suffix per round is removed:
 * "em", "er" or "nd" (only while more than five characters remain), else a
 * trailing "t" (only if the word is not upper case or `case_sensitive` is
 * set), else a trailing "e", "s" or "n". The first round without a match
 * ends the loop. `word` itself is not modified.
 *
 * `length` is not used; the string length is taken from wcslen(word).
 * NOTE: the callers in this file pass their `case_insensitive` flag as
 * `case_sensitive`, i.e. a non-zero value enables the "t" rule for
 * upper-case words.
 *
 * Returns the length of the stem within `word`.
 */
unsigned long strip_suffixes(wchar_t* word, unsigned long length, int uppercase, int case_sensitive) {
    const int may_strip_t = !uppercase || case_sensitive;
    unsigned long remaining = wcslen(word);

    while (remaining > 3) {
        const wchar_t last = word[remaining - 1];
        const wchar_t before_last = word[remaining - 2];

        if (remaining > 5 &&
            ((before_last == L'e' && (last == L'm' || last == L'r')) ||
             (before_last == L'n' && last == L'd'))) {
            remaining -= 2;
            continue;
        }

        if (may_strip_t && last == L't') {
            remaining -= 1;
            continue;
        }

        if (last == L'e' || last == L's' || last == L'n') {
            remaining -= 1;
            continue;
        }

        // No rule applied: the stem is final
        break;
    }

    return remaining;
}
169 |
/*
 * Stem `word` and return the result as a newly allocated string.
 *
 * `word`             0-terminated word to stem
 * `case_insensitive` non-zero to apply the "t" rule to upper-case words too
 *
 * Returns a malloc'ed, 0-terminated string that the caller must free(), or
 * NULL if `word` is NULL or memory could not be allocated.
 */
wchar_t* stem_internal(wchar_t* word, int case_insensitive) {
    if (word == NULL) return NULL;

    int uppercase = !iswlower(word[0]);

    unsigned long length = wcslen(word);
    // Every ß becomes two characters; the extra 16 are a safety margin
    unsigned long buffer_length = length + count_double_characters(word, length) + 16;
    wchar_t* intermediate = malloc(buffer_length * sizeof *intermediate);
    wchar_t* result = malloc(buffer_length * sizeof *result);
    if (intermediate == NULL || result == NULL) {
        // free(NULL) is a no-op, so both can be released unconditionally
        free(intermediate);
        free(result);
        return NULL;
    }

    copy_and_normalize(word, length, intermediate, 1);

    unsigned long stem_length = strip_suffixes(intermediate, buffer_length, uppercase, case_insensitive);

    // Cut the normalised word off after the stem
    intermediate[stem_length] = 0;

    copy_and_denormalize(intermediate, stem_length, result);

    free(intermediate);
    return result;
}
189 |
/*
 * Stem `word` with case-sensitive rules. Forwards to stem_internal() with
 * the case-insensitive flag cleared and returns its result unchanged.
 */
wchar_t* stem(wchar_t* word) {
    const int case_insensitive = 0;
    return stem_internal(word, case_insensitive);
}
193 |
/*
 * Stem `word` with case-insensitive rules. Forwards to stem_internal() with
 * the case-insensitive flag set and returns its result unchanged.
 */
wchar_t* stem_case_insensitive(wchar_t* word) {
    const int case_insensitive = 1;
    return stem_internal(word, case_insensitive);
}
197 |
/*
 * Split `word` into its stem and the suffix removed by stemming.
 *
 * Unlike stem_internal(), the word is normalised without the stem-altering
 * substitutions (copy_and_normalize() is called with modify == 0).
 *
 * `word`             0-terminated word to segment
 * `case_insensitive` non-zero to apply the "t" rule to upper-case words too
 *
 * Returns a malloc'ed array of two malloc'ed, 0-terminated strings:
 * element 0 is the stem, element 1 the stripped suffix. The caller must
 * free() both strings and the array. Returns NULL if `word` is NULL or
 * memory could not be allocated.
 */
wchar_t** segment_internal(wchar_t* word, int case_insensitive) {
    if (word == NULL) return NULL;

    int uppercase = !iswlower(word[0]);

    unsigned long length = wcslen(word);
    // Every ß may become two characters; the extra 16 are a safety margin
    unsigned long buffer_length = length + count_double_characters(word, length) + 16;

    wchar_t** result = malloc(2 * sizeof *result);
    wchar_t* intermediate = malloc(buffer_length * sizeof *intermediate);
    wchar_t* stem_result = malloc(buffer_length * sizeof *stem_result);
    wchar_t* suffix_result = malloc(buffer_length * sizeof *suffix_result);
    if (result == NULL || intermediate == NULL || stem_result == NULL || suffix_result == NULL) {
        // free(NULL) is a no-op, so everything can be released unconditionally
        free(result);
        free(intermediate);
        free(stem_result);
        free(suffix_result);
        return NULL;
    }

    unsigned long normalized_length = copy_and_normalize(word, length, intermediate, 0);

    unsigned long stem_length = strip_suffixes(intermediate, buffer_length, uppercase, case_insensitive);

    // The suffix is everything behind the stem; wcsncpy does not terminate
    // the copy here, so the 0 is written explicitly
    wcsncpy(suffix_result, intermediate + stem_length, normalized_length - stem_length);
    suffix_result[normalized_length - stem_length] = 0;

    // Cut the normalised word off after the stem
    intermediate[stem_length] = 0;

    copy_and_denormalize(intermediate, stem_length, stem_result);

    free(intermediate);

    result[0] = stem_result;
    result[1] = suffix_result;

    return result;
}
226 |
/*
 * Segment `word` with case-sensitive rules. Forwards to segment_internal()
 * with the case-insensitive flag cleared and returns its result unchanged.
 */
wchar_t** segment(wchar_t* word) {
    const int case_insensitive = 0;
    return segment_internal(word, case_insensitive);
}
230 |
/*
 * Segment `word` with case-insensitive rules. Forwards to segment_internal()
 * with the case-insensitive flag set and returns its result unchanged.
 */
wchar_t** segment_case_insensitive(wchar_t* word) {
    const int case_insensitive = 1;
    return segment_internal(word, case_insensitive);
}
--------------------------------------------------------------------------------