├── .github └── workflows │ └── tests.yml ├── .gitignore ├── LICENSE ├── README.md ├── composer.json ├── phpunit.xml.dist ├── src ├── NotFoundException.php ├── Stemmer │ ├── Catalan.php │ ├── Danish.php │ ├── Dutch.php │ ├── English.php │ ├── Finnish.php │ ├── French.php │ ├── German.php │ ├── Italian.php │ ├── Norwegian.php │ ├── Portuguese.php │ ├── Romanian.php │ ├── Russian.php │ ├── Spanish.php │ ├── Stem.php │ ├── Stemmer.php │ └── Swedish.php ├── StemmerFactory.php ├── StemmerManager.php └── Transliterate.php └── test ├── CatalanTest.php ├── CsvFileIterator.php ├── CsvFileVerboseIterator.php ├── DanishTest.php ├── DutchTest.php ├── EnglishTest.php ├── FactoryTest.php ├── FinnishTest.php ├── FrenchTest.php ├── GermanTest.php ├── ItalianTest.php ├── ManagerTest.php ├── NorwegianTest.php ├── PortugueseTest.php ├── RomanianTest.php ├── RussianTest.php ├── SpanishTest.php ├── SwedishTest.php └── files ├── ca.txt ├── de.txt ├── dk.txt ├── en.txt ├── es.txt ├── fi.txt ├── fr.txt ├── it.txt ├── nl.txt ├── no.txt ├── pt.txt ├── ro.txt ├── ru.txt └── sw.txt /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: [master] 7 | 8 | jobs: 9 | tests: 10 | runs-on: ubuntu-latest 11 | 12 | strategy: 13 | fail-fast: true 14 | matrix: 15 | php: [7.3, 7.4, 8.0, 8.1, 8.2, 8.3, 8.4] 16 | stability: [prefer-lowest, prefer-stable] 17 | 18 | name: PHP ${{ matrix.php }} - ${{ matrix.stability }} 19 | 20 | steps: 21 | - name: Checkout code 22 | uses: actions/checkout@v3 23 | 24 | - name: Setup PHP 25 | uses: shivammathur/setup-php@v2 26 | with: 27 | php-version: ${{ matrix.php }} 28 | tools: composer:v2 29 | coverage: none 30 | 31 | - name: Install dependencies 32 | uses: nick-fields/retry@v2 33 | with: 34 | timeout_minutes: 5 35 | max_attempts: 5 36 | command: composer update --${{ matrix.stability }} --prefer-dist --no-interaction --no-progress 37 | 38 | - 
name: Copy PHP Unit Settings 39 | run: cp phpunit.xml.dist phpunit.xml 40 | 41 | - name: Execute tests 42 | run: vendor/bin/phpunit --verbose 43 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Managed by Composer 2 | /vendor/ 3 | 4 | # PHPUnit 5 | /app/phpunit.xml 6 | /phpunit.xml 7 | 8 | # Build data 9 | /build/ 10 | 11 | # Composer PHAR 12 | /composer.phar 13 | 14 | /.settings/ 15 | /.buildpath 16 | /.project 17 | /composer.lock 18 | 19 | .history 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 wamania 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # php-stemmer 2 | 3 | PHP native implementation of Snowball stemmer 4 | https://snowballstem.org/ 5 | 6 | Accepts only UTF-8 input 7 | 8 | * [Languages](#languages) 9 | * [Installation](#installation) 10 | * [Usage](#usage) 11 | 12 | Languages 13 | ------------ 14 | Available: 15 | - Catalan (by Orestes Sanchez Benavente orestes@estotienearreglo.es) 16 | - Danish 17 | - Dutch 18 | - English 19 | - Finnish (by [Mikko Saari](https://github.com/msaari/)) 20 | - French 21 | - German 22 | - Italian 23 | - Norwegian 24 | - Portuguese 25 | - Romanian 26 | - Russian 27 | - Spanish 28 | - Swedish 29 | 30 | Installation 31 | ------------ 32 | 33 | For PHP5, use 1.3 34 | ``` 35 | composer require wamania/php-stemmer "^1.3" 36 | ``` 37 | 38 | For PHP7 use 2.x (branch 2.x is backward compatible with 1.x) 39 | ``` 40 | composer require wamania/php-stemmer "^2.0" 41 | ``` 42 | 43 | For PHP^7.3 and PHP^8.0 use 3.x (backward compatible, but phpunit^9 doesn't work with php < 7.3) 44 | ``` 45 | composer require wamania/php-stemmer "^3.0" 46 | ``` 47 | 48 | For PHP^8.4 use 4.x (avoids deprecations by switching from voku utf8 to [joomla/string](https://github.com/joomla-framework/string)) 49 | ``` 50 | composer require wamania/php-stemmer "^4.0" 51 | ``` 52 | 53 | Usage 54 | ----- 55 | 56 | For 2.x ~ 4.x, you should use the factory 57 | ```php 58 | use Wamania\Snowball\StemmerFactory; 59 | 60 | // use ISO_639 (2 or 3 letters) or language name in English 61 | $stemmer = StemmerFactory::create('fr'); 62 | $stemmer = StemmerFactory::create ('spanish'); 63 | 64 | // then 65 | $stem = $stemmer->stem('automóvil'); 66 | ``` 67 | 68 | Or the manager 69 | ```php 70 | use Wamania\Snowball\StemmerManager; 71 | 72 | $manager = new StemmerManager(); 73 | $stem = $manager->stem('automóvil', 'es'); 74 | ``` 75 | 76 | In 1.3, you 
must instantiate manually 77 | 78 | ```php 79 | use Wamania\Snowball\French; 80 | 81 | $stemmer = new French(); 82 | $stem = $stemmer->stem('anticonstitutionnellement'); 83 | ``` 84 | 85 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "wamania/php-stemmer", 3 | "description": "Native PHP Stemmer", 4 | "keywords": ["stemmer", "porter", "php"], 5 | "license": "MIT", 6 | "authors": [ 7 | { 8 | "name": "Wamania", 9 | "homepage": "http://wamania.com" 10 | } 11 | ], 12 | "require": { 13 | "php": ">=7.3", 14 | "joomla/string": ">=2.0.1" 15 | }, 16 | "require-dev":{ 17 | "phpunit/phpunit": "^9.0" 18 | }, 19 | "autoload": { 20 | "psr-4": { 21 | "Wamania\\Snowball\\": "src/" 22 | } 23 | }, 24 | "autoload-dev": { 25 | "psr-4": { 26 | "Wamania\\Snowball\\Tests\\": "test/" 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /phpunit.xml.dist: -------------------------------------------------------------------------------- 1 | 2 | 12 | 13 | 14 | 15 | test 16 | 17 | 18 | 19 | 20 | 21 | src 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /src/NotFoundException.php: -------------------------------------------------------------------------------- 1 | 11 | * 12 | * 13 | * Some fine tuning was necessary in this implementation of the original catalan stemmer algorithm in Snowball: 14 | * 15 | * 1. Some suffix sets have overlapping items, so here all items are sorted by decreasing size, to 16 | * prevent that a shorter suffix will skip a larger one. 17 | * 18 | * 2. Some alternatives (`or` operator in Snowball) in the original algorithm have 19 | * been rearranged to make sure they are applied in the right order. 
20 | * 21 | * Based on the reference Snowball implementation by Israel Olalla of iSOCO 22 | */ 23 | class Catalan extends Stem 24 | { 25 | 26 | /** 27 | * All catalan vowels 28 | */ 29 | protected static $vowels = ['a', 'e', 'i', 'o', 'u', 'á', 'é', 'í', 'ó', 'ú', 'à', 'è', 'ï', 'ò', 'ü']; 30 | 31 | protected static $standard_suffix_1a = [ 32 | 'allengües', 'ativitats', 'bilitats', 'ionistes', 'ialistes', 'ialismes', 'ativitat', 'atòries', 'isament', 33 | 'bilitat', 'ivitats', 'ionisme', 'ionista', 'ialista', 'ialisme', 'íssimes', 'formes', 'ivisme', 'aments', 34 | 'nça', 'ificar', 'idores', 'ancies', 'atòria', 'ivitat', 'encies', 'ències', 'atives', 'íssima', 'íssims', 35 | 'ictes', 'eries', 'itats', 'itzar', 'ament', 'ments', 'sfera', 'ícies', 'àries', 'cions', 'ístic', 'issos', 36 | 'íssem', 'íssiu', 'issem', 'isseu', 'ísseu', 'dores', 'adura', 'ívola', 'ables', 'adors', 'idors', 'adora', 37 | 'doras', 'dures', 'ancia', 'toris', 'encia', 'ència', 'ïtats', 'atius', 'ativa', 'ibles', 'asses', 'assos', 38 | 'íssim', 'ìssem', 'ìsseu', 'ìssin', 'ismes', 'istes', 'inies', 'íinia', 'ínies', 'trius', 'atge', 'icte', 39 | 'ells', 'ella', 'essa', 'eres', 'ines', 'able', 'itat', 'ives', 'ment', 'amen', 'iste', 'aire', 'eria', 40 | 'eses', 'esos', 'ícia', 'icis', 'ícis', 'ària', 'alla', 'nces', 'enca', 'issa', 'dora', 'dors', 'bles', 41 | 'ívol', 'egar', 'ejar', 'itar', 'ació', 'ants', 'tori', 'ions', 'isam', 'ores', 'aris', 'ïtat', 'atiu', 42 | 'ible', 'assa', 'ents', 'imes', 'isme', 'ista', 'inia', 'ites', 'triu', 'oses', 'osos', 'ient', 'otes', 43 | 'ell', 'esc', 'ets', 'eta', 'ers', 'ina', 'iva', 'ius', 'fer', 'als', 'era', 'ana', 'esa', 'ici', 'íci', 44 | 'ció', 'dor', 'all', 'enc', 'osa', 'ble', 'dís', 'dur', 'ant', 'ats', 'ota', 'ors', 'ora', 'ari', 'uts', 45 | 'uds', 'ent', 'ims', 'ima', 'ita', 'ar', 'és', 'ès', 'et', 'ls', 'ió', 'ot', 'al', 'or', 'il', 'ís', 'ós', 46 | 'ud', 'ots', 'ó' 47 | ]; 48 | 49 | protected static $attached_pronoun = [ 50 | 'selas', 'selos', 
'\'hi', '\'ho', '\'ls', '-les', '-nos', '\'ns', 'sela', 'selo', '\'s', '\'l', '-ls', '-la', 51 | '-li', 'vos', 'nos', '-us', '\'n', '-ns', '\'m', '-me', '-te', '\'t', 'los', 'las', 'les', 'ens', 'se', 'us', 52 | '-n', '-m', 'li', 'lo', 'me', 'le', 'la', 'ho', 'hi' 53 | ]; 54 | 55 | protected static $verb_suffixes = [ 56 | 'aríamos', 'eríamos', 'iríamos', 'eresseu', 'iéramos', 'iésemos', 'adores', 'aríais', 'aremos', 'eríais', 57 | 'eremos', 'iríais', 'iremos', 'ierais', 'ieseis', 'asteis', 'isteis', 'ábamos', 'áramos', 'ásemos', 'isquen', 58 | 'esquin', 'esquis', 'esques', 'esquen', 'ïsquen', 'ïsques', 'adora', 'adors', 'arían', 'arías', 'arian', 59 | 'arien', 'aries', 'aréis', 'erían', 'erías', 'eréis', 'erass', 'irían', 'irías', 'iréis', 'asseu', 'esseu', 60 | 'àsseu', 'àssem', 'àssim', 'àssiu', 'essen', 'esses', 'assen', 'asses', 'assim', 'assiu', 'éssen', 'ésseu', 61 | 'éssim', 'éssiu', 'éssem', 'aríem', 'aríeu', 'eixer', 'eixes', 'ieran', 'iesen', 'ieron', 'iendo', 'essin', 62 | 'essis', 'assin', 'assis', 'essim', 'èssim', 'èssiu', 'ieras', 'ieses', 'abais', 'arais', 'aseis', 'íamos', 63 | 'irien', 'iries', 'irìem', 'irìeu', 'iguem', 'igueu', 'esqui', 'eixin', 'eixis', 'eixen', 'iríem', 'iríeu', 64 | 'atges', 'issen', 'isses', 'issin', 'issis', 'issiu', 'issim', 'ïssin', 'íssiu', 'íssim', 'ïssis', 'ïguem', 65 | 'ïgueu', 'ïssen', 'ïsses', 'itzeu', 'itzis', 'ador', 'ents', 'udes', 'eren', 'arán', 'arás', 'aria', 'aràs', 66 | 'aría', 'arés', 'erán', 'erás', 'ería', 'erau', 'irán', 'irás', 'iría', 'írem', 'íreu', 'aves', 'avem', 'ávem', 67 | 'àvem', 'àveu', 'áveu', 'aven', 'ares', 'àrem', 'àreu', 'àren', 'areu', 'aren', 'tzar', 'ides', 'ïdes', 'ades', 68 | 'iera', 'iese', 'aste', 'iste', 'aban', 'aran', 'asen', 'aron', 'abas', 'adas', 'idas', 'aras', 'ases', 'íais', 69 | 'ados', 'idos', 'amos', 'imos', 'ques', 'iran', 'irem', 'iren', 'ires', 'ireu', 'iria', 'iràs', 'eixi', 'eixo', 70 | 'isin', 'isis', 'esca', 'isca', 'ïsca', 'ïren', 'ïres', 'ïxen', 'ïxes', 'ixen', 
'ixes', 'inin', 'inis', 'ineu', 71 | 'itza', 'itzi', 'itzo', 'itzà', 'arem', 'ent', 'arà', 'ará', 'ara', 'aré', 'erá', 'eré', 'irá', 'iré', 'íeu', 72 | 'ies', 'íem', 'ìeu', 'ien', 'uda', 'ava', 'ats', 'ant', 'ïen', 'ams', 'ïes', 'dre', 'eix', 'ïda', 'aba', 'ada', 73 | 'ida', 'its', 'ids', 'ase', 'ían', 'ado', 'ido', 'ieu', 'ess', 'ass', 'ías', 'áis', 'ira', 'irà', 'irè', 'sis', 74 | 'sin', 'int', 'isc', 'ïsc', 'ïra', 'ïxo', 'ixo', 'ixa', 'ini', 'itz', 'iïn', 're', 'ie', 'er', 'ia', 'at', 'ut', 75 | 'au', 'ïm', 'ïu', 'és', 'en', 'es', 'em', 'am', 'ïa', 'it', 'ït', 'ía', 'ad', 'ed', 'id', 'an', 'ió', 'ar', 76 | 'ir', 'as', 'ii', 'io', 'ià', 'ís', 'ïx', 'ix', 'in', 'às', 'iï', 'iïs', 'í' 77 | ]; 78 | 79 | protected static $residual_suffixes = [ 80 | 'itz', 'it', 'os', 'eu', 'iu', 'is', 'ir', 'ïn', 'ïs', 'a', 'o', 'á', 'à', 'í', 'ó', 'e', 'é', 'i', 's', 'ì', 81 | 'ï' 82 | ]; 83 | 84 | /** 85 | * {@inheritdoc} 86 | */ 87 | public function stem($word) 88 | { 89 | $this->word = StringHelper::strtolower($word); 90 | 91 | // Catalan stemmer does not use Rv 92 | $this->r1(); 93 | $this->r2(); 94 | 95 | // Step 0: Attached pronoun 96 | $this->step0(); 97 | 98 | $word = $this->word; 99 | // Step 1a: Standard suffix 100 | $this->step1a(); 101 | 102 | // Step 1b: Verb suffix 103 | // Do step 1b if no ending was removed by step 1a. 104 | if ($this->word == $word) { 105 | $this->step1b(); 106 | } 107 | 108 | $this->step2(); 109 | $this->finish(); 110 | 111 | return $this->word; 112 | } 113 | 114 | /** 115 | * Step 0: Attached pronoun 116 | * 117 | * Search for the longest among the following suffixes 118 | * and delete it in R1. 
119 | */ 120 | 121 | private function step0() 122 | { 123 | if (($position = $this->search(static::$attached_pronoun)) !== false) { 124 | if ($this->inR1($position)) { 125 | $this->word = StringHelper::substr($this->word, 0, $position); 126 | return true; 127 | } 128 | } 129 | return false; 130 | } 131 | 132 | /** 133 | * Step 1a: Standard suffix 134 | */ 135 | private function step1a() 136 | { 137 | // Run step 1a.2 before 1a.1, since they overlap on `cions` (1a.1) and `acions` (1a.2) 138 | // 139 | // Step 1a.2. 140 | // acions ada ades 141 | // delete if in R2 142 | if (($position = $this->search(['acions', 'ada', 'ades'])) !== false) { 143 | if ($this->inR2($position)) { 144 | $this->word = StringHelper::substr($this->word, 0, $position); 145 | } 146 | return true; 147 | } 148 | 149 | // Step 1a.1. 150 | // ar atge formes icte ictes ell ells ella és ès esc essa et ets eta eres eries ers ina ines able ls ió itat 151 | // itats itzar iva ives ivisme ius fer ment amen ament aments ments ot sfera al als era ana iste aire eria esa 152 | // eses esos or ícia ícies icis ici íci ícis ària àries alla ció cions n{c}a nces ó dor all il ístic enc enca 153 | // ís issa issos íssem íssiu issem isseu ísseu ós osa dora dores dors adura ble bles ívol ívola dís egar ejar 154 | // ificar itar ables adors idores idors adora ació doras dur dures alleng{u"}es ant ants ancia ancies atòria 155 | // atòries tori toris ats ions ota isam ors ora ores isament bilitat bilitats ivitat ivitats ari aris ionisme 156 | // ionista ionistes ialista ialistes ialisme ialismes ud uts uds encia encies ència ències ïtat ïtats atiu 157 | // atius atives ativa ativitat ativitats ible ibles assa asses assos ent ents íssim íssima íssims íssimes 158 | // ìssem ìsseu ìssin ims ima imes isme ista ismes istes inia inies íinia ínies ita ites triu trius oses osos 159 | // ient otes ots 160 | // 161 | // delete if in R1 162 | if (($position = $this->search(self::$standard_suffix_1a)) !== false) { 163 | if 
($this->inR1($position)) { 164 | $this->word = StringHelper::substr($this->word, 0, $position); 165 | } 166 | return true; 167 | } 168 | 169 | // Step 1a.3. 170 | // logía logíes logia logies logi logis lógica lógics lógiques 171 | // replace with log if in R2 172 | if (($position = $this->search( 173 | ['logía', 'logíes', 'logia', 'logies', 'logis', 'lógica', 'lógics', 'lógiques', 'logi'] 174 | )) !== false) { 175 | if ($this->inR2($position)) { 176 | $this->word = preg_replace( 177 | '#(logía|logíes|logia|logies|logis|lógica|lógics|lógiques|logi)$#u', 'log', $this->word 178 | ); 179 | } 180 | return true; 181 | } 182 | 183 | // Step 1a.4. 184 | // ic ica ics iques 185 | // replace with ic if in R2 186 | if (($position = $this->search(['ics', 'ica', 'iques', 'ic'])) !== false) { 187 | if ($this->inR2($position)) { 188 | $this->word = preg_replace('#(ics|ica|iques|ic)$#u', 'ic', $this->word); 189 | } 190 | return true; 191 | } 192 | 193 | // Step 1a.5. 194 | // quíssims quíssimes quíssima quíssim 195 | // replace with c if in R1 196 | if (($position = $this->search(['quíssima', 'quíssims', 'quíssimes', 'quíssim'])) !== false) { 197 | if ($this->inR1($position)) { 198 | $this->word = preg_replace('#(quíssima|quíssims|quíssimes|quíssim)$#u', 'c', $this->word); 199 | } 200 | return true; 201 | } 202 | 203 | return false; 204 | } 205 | 206 | /** 207 | * Step 1b: Verb suffixes 208 | * Search for the longest among the following suffixes in r1 and r2, and 209 | * perform the action indicated. 
210 | */ 211 | private function step1b() 212 | { 213 | // Step 1b.1 214 | // 215 | // aríamos eríamos iríamos eresseu iéramos iésemos adores aríais aremos eríais 216 | // eremos iríais iremos ierais ieseis asteis isteis ábamos áramos ásemos isquen 217 | // esquin esquis esques esquen ïsquen ïsques adora adors arían arías arian 218 | // arien aries aréis erían erías eréis erass irían irías iréis asseu esseu 219 | // àsseu àssem àssim àssiu essen esses assen asses assim assiu éssen ésseu 220 | // éssim éssiu éssem aríem aríeu eixer eixes ieran iesen ieron iendo essin 221 | // essis assin assis essim èssim èssiu ieras ieses abais arais aseis íamos 222 | // irien iries irìem irìeu iguem igueu esqui eixin eixis eixen iríem iríeu 223 | // atges issen isses issin issis issiu issim ïssin íssiu íssim ïssis ïguem 224 | // ïgueu ïssen ïsses itzeu itzis ador ents udes eren arán arás aria aràs 225 | // aría arés erán erás ería erau irán irás iría írem íreu aves avem ávem 226 | // àvem àveu áveu aven ares àrem àreu àren areu aren tzar ides ïdes ades 227 | // iera iese aste iste aban aran asen aron abas adas idas aras ases íais 228 | // ados idos amos imos ques iran irem iren ires ireu iria iràs eixi eixo 229 | // isin isis esca isca ïsca ïren ïres ïxen ïxes ixen ixes inin inis ineu 230 | // itza itzi itzo itzà arem ent arà ará ara aré erá eré irá iré íeu 231 | // ies íem ìeu ien uda ava ats ant ïen ams ïes dre eix ïda aba ada 232 | // ida its ids ase ían ado ido ieu ess ass ías áis ira irà irè sis 233 | // sin int isc ïsc ïra ïxo ixo ixa ini itz iïn re ie er ia at ut 234 | // au ïm ïu és en es em am ïa it ït ía ad ed id an ió ar 235 | // ir as ii io ià ís ïx ix in às iï iïs í 236 | // delete if in R1 237 | if (($position = $this->search(static::$verb_suffixes)) !== false) { 238 | if ($this->inR1($position)) { 239 | $this->word = StringHelper::substr($this->word, 0, $position); 240 | } 241 | return true; 242 | } 243 | 244 | // Step 1b.2 245 | // ando 246 | // delete if in R2 247 
| if (($position = $this->search(['ando'])) !== false) { 248 | if ($this->inR2($position)) { 249 | $this->word = StringHelper::substr($this->word, 0, $position); 250 | } 251 | return true; 252 | } 253 | return false; 254 | } 255 | 256 | /** 257 | * Step 2: residual suffix 258 | * Search for the longest among the following suffixes in R1, and perform 259 | * the action indicated. 260 | */ 261 | private function step2() 262 | { 263 | // Step 2.1 264 | // residual suffix 265 | // delete if in R1 266 | if (($position = $this->search(static::$residual_suffixes)) !== false) { 267 | if ($this->inR1($position)) { 268 | $this->word = StringHelper::substr($this->word, 0, $position); 269 | } 270 | return true; 271 | } 272 | 273 | // Step 2.2 274 | // iqu 275 | // replace with ic if in R1 276 | if (($position = $this->search(['iqu'])) !== false) { 277 | if ($this->inR1($position)) { 278 | $this->word = preg_replace('#(iqu)$#u', 'ic', $this->word); 279 | } 280 | return true; 281 | } 282 | 283 | return false; 284 | } 285 | 286 | /** 287 | * And finally: 288 | * Remove accents and l aggeminades 289 | */ 290 | private function finish() 291 | { 292 | $this->word = str_replace( 293 | ['á', 'é', 'í', 'ó', 'ú', 'à', 'è', 'ì', 'ò', 'ï', 'ü', '·'], 294 | ['a', 'e', 'i', 'o', 'u', 'a', 'e', 'i', 'o', 'i', 'u', '.'], 295 | $this->word 296 | ); 297 | } 298 | 299 | } 300 | -------------------------------------------------------------------------------- /src/Stemmer/Danish.php: -------------------------------------------------------------------------------- 1 | word = StringHelper::strtolower($word); 26 | 27 | // R2 is not used: R1 is defined in the same way as in the German stemmer 28 | $this->r1(); 29 | 30 | // then R1 is adjusted so that the region before it contains at least 3 letters. 31 | if ($this->r1Index < 3) { 32 | $this->r1Index = 3; 33 | $this->r1 = StringHelper::substr($this->word, 3); 34 | } 35 | 36 | // Do each of steps 1, 2 3 and 4. 
37 | $this->step1(); 38 | $this->step2(); 39 | $this->step3(); 40 | $this->step4(); 41 | 42 | return $this->word; 43 | } 44 | 45 | /** 46 | * Define a valid s-ending as one of 47 | * a b c d f g h j k l m n o p r t v y z å 48 | * 49 | * @param string $ending 50 | * @return boolean 51 | */ 52 | private function hasValidSEnding($word) 53 | { 54 | $lastLetter = StringHelper::substr($word, -1, 1); 55 | return in_array($lastLetter, array('a', 'b', 'c', 'd', 'f', 'g', 'h', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 't', 'v', 'y', 'z', 'å')); 56 | } 57 | 58 | /** 59 | * Step 1 60 | * Search for the longest among the following suffixes in R1, and perform the action indicated. 61 | */ 62 | private function step1() 63 | { 64 | // hed ethed ered e erede ende erende ene erne ere en heden eren er heder erer 65 | // heds es endes erendes enes ernes eres ens hedens erens ers ets erets et eret 66 | // delete 67 | if ( ($position = $this->searchIfInR1(array( 68 | 'erendes', 'erende', 'hedens', 'erede', 'ethed', 'heden', 'endes', 'erets', 'heder', 'ernes', 69 | 'erens', 'ered', 'ende', 'erne', 'eres', 'eren', 'eret', 'erer', 'enes', 'heds', 70 | 'ens', 'ene', 'ere', 'ers', 'ets', 'hed', 'es', 'et', 'er', 'en', 'e' 71 | ))) !== false) { 72 | $this->word = StringHelper::substr($this->word, 0, $position); 73 | return true; 74 | } 75 | 76 | // s 77 | // delete if preceded by a valid s-ending 78 | if ( ($position = $this->searchIfInR1(array('s'))) !== false) { 79 | $word = StringHelper::substr($this->word, 0, $position); 80 | if ($this->hasValidSEnding($word)) { 81 | $this->word = $word; 82 | } 83 | return true; 84 | } 85 | } 86 | 87 | /** 88 | * Step 2 89 | * Search for one of the following suffixes in R1, and if found delete the last letter. 
90 | * gd dt gt kt 91 | */ 92 | private function step2() 93 | { 94 | if ($this->searchIfInR1(array('gd', 'dt', 'gt', 'kt')) !== false) { 95 | $this->word = StringHelper::substr($this->word, 0, -1); 96 | } 97 | } 98 | 99 | /** 100 | * Step 3: 101 | */ 102 | private function step3() 103 | { 104 | // If the word ends igst, remove the final st. 105 | if ($this->search(array('igst')) !== false) { 106 | $this->word = StringHelper::substr($this->word, 0, -2); 107 | } 108 | 109 | // Search for the longest among the following suffixes in R1, and perform the action indicated. 110 | // ig lig elig els 111 | // delete, and then repeat step 2 112 | if ( ($position = $this->searchIfInR1(array('elig', 'lig', 'ig', 'els'))) !== false) { 113 | $this->word = StringHelper::substr($this->word, 0, $position); 114 | $this->step2(); 115 | return true; 116 | } 117 | 118 | // løst 119 | // replace with løs 120 | if ($this->searchIfInR1(array('løst')) !== false) { 121 | $this->word = StringHelper::substr($this->word, 0, -1); 122 | } 123 | } 124 | 125 | /** 126 | * Step 4: undouble 127 | * If the word ends with double consonant in R1, remove one of the consonants. 128 | */ 129 | private function step4() 130 | { 131 | $length = StringHelper::strlen($this->word); 132 | if (!$this->inR1(($length-1))) { 133 | return false; 134 | } 135 | 136 | $lastLetter = StringHelper::substr($this->word, -1, 1); 137 | if (in_array($lastLetter, self::$vowels)) { 138 | return false; 139 | } 140 | $beforeLastLetter = StringHelper::substr($this->word, -2, 1); 141 | 142 | if ($lastLetter == $beforeLastLetter) { 143 | $this->word = StringHelper::substr($this->word, 0, -1); 144 | } 145 | return true; 146 | } 147 | } 148 | -------------------------------------------------------------------------------- /src/Stemmer/Dutch.php: -------------------------------------------------------------------------------- 1 | word = StringHelper::strtolower($word); 26 | 27 | // First, remove all umlaut and acute accents. 
28 | $this->word = str_replace( 29 | array('ä', 'ë', 'ï', 'ö', 'ü', 'á', 'é', 'í', 'ó', 'ú'), 30 | array('a', 'e', 'i', 'o', 'u', 'a', 'e', 'i', 'o', 'u'), 31 | $this->word); 32 | 33 | $this->plainVowels = implode('', self::$vowels); 34 | 35 | // Put initial y, y after a vowel, and i between vowels into upper case. 36 | $this->word = preg_replace('#^y#u', 'Y', $this->word); 37 | $this->word = preg_replace('#(['.$this->plainVowels.'])y#u', '$1Y', $this->word); 38 | $this->word = preg_replace('#(['.$this->plainVowels.'])i(['.$this->plainVowels.'])#u', '$1I$2', $this->word); 39 | 40 | // R1 and R2 (see the note on R1 and R2) are then defined as in German. 41 | // R1 and R2 are first set up in the standard way 42 | $this->r1(); 43 | $this->r2(); 44 | 45 | // but then R1 is adjusted so that the region before it contains at least 3 letters. 46 | if ($this->r1Index < 3) { 47 | $this->r1Index = 3; 48 | $this->r1 = StringHelper::substr($this->word, 3); 49 | } 50 | 51 | // Do each of steps 1, 2 3 and 4. 52 | $this->step1(); 53 | $removedE = $this->step2(); 54 | $this->step3a(); 55 | $this->step3b($removedE); 56 | $this->step4(); 57 | $this->finish(); 58 | 59 | return $this->word; 60 | } 61 | 62 | /** 63 | * Define a valid s-ending as a non-vowel other than j. 64 | * @param string $ending 65 | * @return boolean 66 | */ 67 | private function hasValidSEnding($word) 68 | { 69 | $lastLetter = StringHelper::substr($word, -1, 1); 70 | return !in_array($lastLetter, array_merge(self::$vowels, array('j'))); 71 | } 72 | 73 | /** 74 | * Define a valid en-ending as a non-vowel, and not gem. 
75 | * @param string $ending 76 | * @return boolean 77 | */ 78 | private function hasValidEnEnding($word) 79 | { 80 | $lastLetter = StringHelper::substr($word, -1, 1); 81 | if (in_array($lastLetter, self::$vowels)) { 82 | return false; 83 | } 84 | 85 | $threeLastLetters = StringHelper::substr($word, -3, 3); 86 | if ($threeLastLetters == 'gem') { 87 | return false; 88 | } 89 | return true; 90 | } 91 | 92 | /** 93 | * Define undoubling the ending as removing the last letter if the word ends kk, dd or tt. 94 | */ 95 | private function unDoubling() 96 | { 97 | if ($this->search(array('kk', 'dd', 'tt')) !== false) { 98 | $this->word = StringHelper::substr($this->word, 0, -1); 99 | } 100 | } 101 | 102 | /** 103 | * Step 1 104 | * Search for the longest among the following suffixes, and perform the action indicated 105 | */ 106 | private function step1() 107 | { 108 | // heden 109 | // replace with heid if in R1 110 | if ( ($position = $this->search(array('heden'))) !== false) { 111 | if ($this->inR1($position)) { 112 | $this->word = preg_replace('#(heden)$#u', 'heid', $this->word); 113 | } 114 | return true; 115 | } 116 | 117 | // en ene 118 | // delete if in R1 and preceded by a valid en-ending, and then undouble the ending 119 | if ( ($position = $this->search(array('ene', 'en'))) !== false) { 120 | if ($this->inR1($position)) { 121 | $word = StringHelper::substr($this->word, 0, $position); 122 | if ($this->hasValidEnEnding($word)) { 123 | $this->word = $word; 124 | $this->unDoubling(); 125 | } 126 | } 127 | return true; 128 | } 129 | 130 | // s se 131 | // delete if in R1 and preceded by a valid s-ending 132 | if ( ($position = $this->search(array('se', 's'))) !== false) { 133 | if ($this->inR1($position)) { 134 | $word = StringHelper::substr($this->word, 0, $position); 135 | if ($this->hasValidSEnding($word)) { 136 | $this->word = $word; 137 | } 138 | } 139 | return true; 140 | } 141 | 142 | return false; 143 | } 144 | 145 | /** 146 | * Step 2 147 | * Delete suffix e 
if in R1 and preceded by a non-vowel, and then undouble the ending 148 | */ 149 | private function step2() 150 | { 151 | if ( ($position = $this->search(array('e'))) !== false) { 152 | if ($this->inR1($position)) { 153 | $letter = StringHelper::substr($this->word, -2, 1); 154 | if (!in_array($letter, self::$vowels)) { 155 | $this->word = StringHelper::substr($this->word, 0, $position); 156 | $this->unDoubling(); 157 | 158 | return true; 159 | } 160 | } 161 | } 162 | 163 | return false; 164 | } 165 | 166 | /** 167 | * Step 3a: heid 168 | * delete heid if in R2 and not preceded by c, and treat a preceding en as in step 1(b) 169 | */ 170 | private function step3a() 171 | { 172 | if ( ($position = $this->search(array('heid'))) !== false) { 173 | if ($this->inR2($position)) { 174 | $letter = StringHelper::substr($this->word, -5, 1); 175 | if ($letter !== 'c') { 176 | $this->word = StringHelper::substr($this->word, 0, $position); 177 | 178 | if ( ($position = $this->search(array('en'))) !== false) { 179 | if ($this->inR1($position)) { 180 | $word = StringHelper::substr($this->word, 0, $position); 181 | if ($this->hasValidEnEnding($word)) { 182 | $this->word = $word; 183 | $this->unDoubling(); 184 | } 185 | } 186 | } 187 | } 188 | } 189 | } 190 | 191 | } 192 | 193 | /** 194 | * Step 3b: d-suffixe 195 | * Search for the longest among the following suffixes, and perform the action indicated. 
196 | */ 197 | private function step3b($removedE) 198 | { 199 | // end ing 200 | // delete if in R2 201 | // if preceded by ig, delete if in R2 and not preceded by e, otherwise undouble the ending 202 | if ( ($position = $this->search(array('end', 'ing'))) !== false) { 203 | if ($this->inR2($position)) { 204 | $this->word = StringHelper::substr($this->word, 0, $position); 205 | 206 | if ( ($position2 = $this->searchIfInR2(array('ig'))) !== false) { 207 | $letter = StringHelper::substr($this->word, -3, 1); 208 | if ($letter !== 'e') { 209 | $this->word = StringHelper::substr($this->word, 0, $position2); 210 | } 211 | } else { 212 | $this->unDoubling(); 213 | } 214 | } 215 | 216 | 217 | return true; 218 | } 219 | 220 | // ig 221 | // delete if in R2 and not preceded by e 222 | if ( ($position = $this->search(array('ig'))) !== false) { 223 | if ($this->inR2($position)) { 224 | $letter = StringHelper::substr($this->word, -3, 1); 225 | if ($letter !== 'e') { 226 | $this->word = StringHelper::substr($this->word, 0, $position); 227 | } 228 | } 229 | return true; 230 | } 231 | 232 | // lijk 233 | // delete if in R2, and then repeat step 2 234 | if ( ($position = $this->search(array('lijk'))) !== false) { 235 | if ($this->inR2($position)) { 236 | $this->word = StringHelper::substr($this->word, 0, $position); 237 | $this->step2(); 238 | } 239 | return true; 240 | } 241 | 242 | // baar 243 | // delete if in R2 244 | if ( ($position = $this->search(array('baar'))) !== false) { 245 | if ($this->inR2($position)) { 246 | $this->word = StringHelper::substr($this->word, 0, $position); 247 | } 248 | return true; 249 | } 250 | 251 | // bar 252 | // delete if in R2 and if step 2 actually removed an e 253 | if ( ($position = $this->search(array('bar'))) !== false) { 254 | if ($this->inR2($position) && $removedE) { 255 | $this->word = StringHelper::substr($this->word, 0, $position); 256 | } 257 | return true; 258 | } 259 | 260 | return false; 261 | } 262 | 263 | /** 264 | * Step 4: 
undouble vowel 265 | * If the words ends CVD, where C is a non-vowel, D is a non-vowel other than I, and V is double a, e, o or u, 266 | * remove one of the vowels from V (for example, maan -> man, brood -> brod). 267 | */ 268 | private function step4() 269 | { 270 | // D is a non-vowel other than I 271 | $d = StringHelper::substr($this->word, -1, 1); 272 | if (in_array($d, array_merge(self::$vowels, array('I')))) { 273 | return false; 274 | } 275 | 276 | // V is double a, e, o or u 277 | $v = StringHelper::substr($this->word, -3, 2); 278 | if (!in_array($v, array('aa', 'ee', 'oo', 'uu'))) { 279 | return false; 280 | } 281 | $singleV = StringHelper::substr($v, 0, 1); 282 | 283 | // C is a non-vowel 284 | $c = StringHelper::substr($this->word, -4, 1); 285 | if (in_array($c, self::$vowels)) { 286 | return false; 287 | } 288 | 289 | $this->word = StringHelper::substr($this->word, 0, -4); 290 | $this->word .= $c . $singleV .$d; 291 | } 292 | 293 | /** 294 | * Finally 295 | * Turn I and Y back into lower case. 296 | */ 297 | private function finish() 298 | { 299 | $this->word = str_replace(array('I', 'Y'), array('i', 'y'), $this->word); 300 | } 301 | } 302 | -------------------------------------------------------------------------------- /src/Stemmer/English.php: -------------------------------------------------------------------------------- 1 | word = StringHelper::strtolower($word); 35 | 36 | // exceptions 37 | if (null !== ($word = $this->exception1())) { 38 | return $word; 39 | } 40 | 41 | 42 | $this->plainVowels = implode('', self::$vowels); 43 | 44 | // Remove initial ', if present. 
/**
 * Step 0
 * Remove a trailing ', 's or 's' (the longest form is looked for first).
 */
private function step0()
{
    $apostropheStart = $this->search(array("'s'", "'s", "'"));
    if ($apostropheStart === false) {
        return;
    }

    $this->word = StringHelper::substr($this->word, 0, $apostropheStart);
}

/**
 * Step 1a
 * Handles the endings sses, ied/ies, us/ss and s; only the first one found is processed.
 *
 * @return boolean true when one of the endings was found, false otherwise
 */
private function step1a()
{
    // sses: replace by ss
    if ($this->search(array('sses')) !== false) {
        $this->word = preg_replace('#(sses)$#u', 'ss', $this->word);

        return true;
    }

    // ied+ ies*: replace by i if preceded by more than one letter,
    // otherwise by ie (so ties -> tie, cries -> cri)
    $iedStart = $this->search(array('ied', 'ies'));
    if ($iedStart !== false) {
        $replacement = ($iedStart > 1) ? 'i' : 'ie';
        $this->word = preg_replace('#(ied|ies)$#u', $replacement, $this->word);

        return true;
    }

    // us+ ss: do nothing
    if ($this->search(array('us', 'ss')) !== false) {
        return true;
    }

    // s: delete if the preceding word part contains a vowel not immediately before the s
    // (so gas and this retain the s, gaps and kiwis lose it)
    $sStart = $this->search(array('s'));
    if ($sStart === false) {
        return false;
    }

    // the letter right before the s is excluded from the vowel scan
    $scanEnd = $sStart - 1;
    for ($index = 0; $index < $scanEnd; $index++) {
        if (in_array(StringHelper::substr($this->word, $index, 1), self::$vowels)) {
            $this->word = StringHelper::substr($this->word, 0, $sStart);
            break;
        }
    }

    return true;
}

/**
 * Step 1b
 * Handles eed/eedly and ed/edly/ing/ingly.
 *
 * @return boolean true when one of the endings was found, false otherwise
 */
private function step1b()
{
    // eed eedly+: replace by ee if in R1
    $eedStart = $this->search(array('eedly', 'eed'));
    if ($eedStart !== false) {
        if ($this->inR1($eedStart)) {
            $this->word = preg_replace('#(eedly|eed)$#u', 'ee', $this->word);
        }

        return true;
    }

    // ed edly+ ing ingly+
    $suffixStart = $this->search(array('edly', 'ingly', 'ed', 'ing'));
    if ($suffixStart === false) {
        return false;
    }

    // the ending is only deleted if the preceding word part contains a vowel
    $hasVowel = false;
    for ($index = 0; $index < $suffixStart; $index++) {
        if (in_array(StringHelper::substr($this->word, $index, 1), self::$vowels)) {
            $hasVowel = true;
            break;
        }
    }
    if (!$hasVowel) {
        return true;
    }

    $this->word = StringHelper::substr($this->word, 0, $suffixStart);

    // after the deletion:
    //  - if the word ends at, bl or iz add e (so luxuriat -> luxuriate), or
    //  - if the word ends with a double remove the last letter (so hopp -> hop), or
    //  - if the word is short, add e (so hop -> hope)
    if ($this->search(array('at', 'bl', 'iz')) !== false) {
        $this->word .= 'e';
    } elseif (($doubleStart = $this->search(self::$doubles)) !== false) {
        $this->word = StringHelper::substr($this->word, 0, ($doubleStart + 1));
    } elseif ($this->isShort()) {
        $this->word .= 'e';
    }

    return true;
}

/**
 * Step 1c
 * Replace suffix y or Y by i if preceded by a non-vowel which is not the first letter
 * of the word (so cry -> cri, by -> by, say -> say).
 *
 * @return boolean true when the word is shorter than three letters or ends in y/Y, false otherwise
 */
private function step1c()
{
    // words of fewer than three letters are never changed here
    if (StringHelper::strlen($this->word) < 3) {
        return true;
    }

    $yStart = $this->search(array('y', 'Y'));
    if ($yStart === false) {
        return false;
    }

    $previous = StringHelper::substr($this->word, $yStart - 1, 1);
    if (!in_array($previous, self::$vowels)) {
        $this->word = preg_replace('#(y|Y)$#u', 'i', $this->word);
    }

    return true;
}

/**
 * Step 2
 * Look for the suffix groups below in the listed order; for the first group that is found,
 * perform the action indicated if the suffix is in R1, then stop.
 *
 * @return boolean true when one of the suffixes was found, false otherwise
 */
private function step2()
{
    // Plain rewrite rules, tried top to bottom:
    // array(suffixes to search for, end-anchored pattern, replacement)
    $rules = array(
        array(array('iveness', 'iviti'), '#(iveness|iviti)$#u', 'ive'),
        array(array('ousli', 'ousness'), '#(ousli|ousness)$#u', 'ous'),
        array(array('izer', 'ization'), '#(izer|ization)$#u', 'ize'),
        array(array('ational', 'ation', 'ator'), '#(ational|ation|ator)$#u', 'ate'),
        array(array('biliti', 'bli'), '#(biliti|bli)$#u', 'ble'),
        array(array('lessli'), '#(lessli)$#u', 'less'),
        array(array('fulness', 'fulli'), '#(fulness|fulli)$#u', 'ful'),
        array(array('tional'), '#(tional)$#u', 'tion'),
        array(array('alism', 'aliti', 'alli'), '#(alism|aliti|alli)$#u', 'al'),
        array(array('enci'), '#(enci)$#u', 'ence'),
        array(array('anci'), '#(anci)$#u', 'ance'),
        array(array('abli'), '#(abli)$#u', 'able'),
        array(array('entli'), '#(entli)$#u', 'ent'),
    );

    foreach ($rules as $rule) {
        list($suffixes, $pattern, $replacement) = $rule;

        $suffixStart = $this->search($suffixes);
        if ($suffixStart === false) {
            continue;
        }

        if ($this->inR1($suffixStart)) {
            $this->word = preg_replace($pattern, $replacement, $this->word);
        }

        // a found suffix ends the step, whether or not it was in R1
        return true;
    }

    // ogi+: replace by og if preceded by l
    $ogiStart = $this->search(array('ogi'));
    if ($ogiStart !== false) {
        if ($this->inR1($ogiStart) && StringHelper::substr($this->word, $ogiStart - 1, 1) === 'l') {
            $this->word = preg_replace('#(ogi)$#u', 'og', $this->word);
        }

        return true;
    }

    // li+: delete if preceded by a valid li-ending
    $liStart = $this->search(array('li'));
    if ($liStart !== false) {
        if ($this->inR1($liStart)) {
            $previous = StringHelper::substr($this->word, ($liStart - 1), 1);

            if (in_array($previous, self::$liEnding)) {
                $this->word = StringHelper::substr($this->word, 0, $liStart);
            }
        }

        return true;
    }

    return false;
}
/**
 * Step 3
 * Look for the suffixes below, in the listed order, restricted to R1; for the first one
 * found there, perform the action indicated.
 *
 * @return boolean true when the word was changed, false otherwise
 */
private function step3()
{
    // Replacement rules: array(suffixes to search for in R1, end-anchored pattern, replacement)
    $rules = array(
        // ational+: replace by ate
        array(array('ational'), '#(ational)$#u', 'ate'),
        // tional+: replace by tion
        array(array('tional'), '#(tional)$#u', 'tion'),
        // alize: replace by al
        array(array('alize'), '#(alize)$#u', 'al'),
        // icate iciti ical: replace by ic
        array(array('icate', 'iciti', 'ical'), '#(icate|iciti|ical)$#u', 'ic'),
    );

    foreach ($rules as $rule) {
        list($suffixes, $pattern, $replacement) = $rule;

        if ($this->searchIfInR1($suffixes) !== false) {
            $this->word = preg_replace($pattern, $replacement, $this->word);

            return true;
        }
    }

    // ful ness: delete
    $suffixStart = $this->searchIfInR1(array('ful', 'ness'));
    if ($suffixStart !== false) {
        $this->word = StringHelper::substr($this->word, 0, $suffixStart);

        return true;
    }

    // ative*: delete if in R2
    $suffixStart = $this->searchIfInR1(array('ative'));
    if ($suffixStart !== false && $this->inR2($suffixStart)) {
        $this->word = StringHelper::substr($this->word, 0, $suffixStart);

        return true;
    }

    return false;
}
/**
 * Step 4
 * Look for one of the suffixes below and delete it when it lies in R2.
 *
 * @return boolean true when one of the suffixes was found, false otherwise
 */
private function step4()
{
    // ement ance ence able ible ant ment ent ism ate iti ous ive ize al er ic: delete
    $plainSuffixes = array(
        'ance', 'ence', 'ement', 'able', 'ible', 'ant', 'ment', 'ent', 'ism',
        'ate', 'iti', 'ous', 'ive', 'ize', 'al', 'er', 'ic');

    $suffixStart = $this->search($plainSuffixes);
    if ($suffixStart !== false) {
        if ($this->inR2($suffixStart)) {
            $this->word = StringHelper::substr($this->word, 0, $suffixStart);
        }

        return true;
    }

    // ion: delete if preceded by s or t
    $ionStart = $this->searchIfInR2(array('ion'));
    if ($ionStart === false) {
        return false;
    }

    $previous = StringHelper::substr($this->word, $ionStart - 1, 1);
    if (in_array($previous, array('s', 't'), true)) {
        $this->word = StringHelper::substr($this->word, 0, $ionStart);
    }

    return true;
}

/**
 * Step 5
 * Handles a final e or a final l.
 *
 * @return boolean true when the word ends in e, or ends in an l lying in R2; false otherwise
 */
private function step5()
{
    // e: delete if in R2, or in R1 and not preceded by a short syllable
    $eStart = $this->search(array('e'));
    if ($eStart !== false) {
        $deletable = $this->inR2($eStart)
            || ($this->inR1($eStart)
                && !($this->searchShortSyllabe(-4, 3) || $this->searchShortSyllabe(-3, 2)));

        if ($deletable) {
            $this->word = StringHelper::substr($this->word, 0, $eStart);
        }

        return true;
    }

    // l: delete if in R2 and preceded by l
    $lStart = $this->searchIfInR2(array('l'));
    if ($lStart === false) {
        return false;
    }

    if (StringHelper::substr($this->word, $lStart - 1, 1) === 'l') {
        $this->word = StringHelper::substr($this->word, 0, $lStart);
    }

    return true;
}

/**
 * Finally, turn every marker Y back into lower case.
 */
private function finish()
{
    $restored = str_replace('Y', 'y', $this->word);
    $this->word = $restored;
}

/**
 * Overrides R1 for words beginning with gener, commun or arsen:
 * R1 then starts right after that prefix.
 */
private function exceptionR1()
{
    // prefix => number of letters in the prefix (the new start index of R1)
    $prefixes = array(
        'gener'  => 5,
        'commun' => 6,
        'arsen'  => 5,
    );

    foreach ($prefixes as $prefix => $prefixLength) {
        if (StringHelper::strpos($this->word, $prefix) === 0) {
            $this->r1 = StringHelper::substr($this->word, $prefixLength);
            $this->r1Index = $prefixLength;

            return;
        }
    }
}

/**
 * First exception list, checked against the whole lower-cased word:
 * 1/ certain special words are mapped to a fixed stem,
 * 2/ certain words are left invariant.
 *
 * @return string|null the stem when the word is listed, null otherwise
 */
private function exception1()
{
    $listed = array(
        // special stems
        'skis'   => 'ski',
        'skies'  => 'sky',
        'dying'  => 'die',
        'lying'  => 'lie',
        'tying'  => 'tie',
        'idly'   => 'idl',
        'gently' => 'gentl',
        'ugly'   => 'ugli',
        'early'  => 'earli',
        'only'   => 'onli',
        'singly' => 'singl',
        // invariants
        'sky'    => 'sky',
        'news'   => 'news',
        'howe'   => 'howe',
        'atlas'  => 'atlas',
        'cosmos' => 'cosmos',
        'bias'   => 'bias',
        'andes'  => 'andes'
    );

    return isset($listed[$this->word]) ? $listed[$this->word] : null;
}

/**
 * Second exception list: words left invariant when they show up following step 1a.
 *
 * @return string|null the word itself when it is listed, null otherwise
 */
private function exception2()
{
    $invariants = array(
        'inning',
        'outing',
        'canning',
        'herring',
        'earring',
        'proceed',
        'exceed',
        'succeed'
    );

    return in_array($this->word, $invariants, true) ? $this->word : null;
}

/**
 * A word is called short if it ends in a short syllable, and if R1 is null.
 * Note : R1 is not really null here; the test is that the current word length
 * equals the R1 start index.
 *
 * @return boolean
 */
private function isShort()
{
    if (!$this->searchShortSyllabe(-3, 3) && !$this->searchShortSyllabe(-2, 2)) {
        return false;
    }

    return StringHelper::strlen($this->word) == $this->r1Index;
}
/**
 * Tests for a short syllable starting at a given offset of the current word.
 *
 * A short syllable is either
 *  (a) a vowel followed by a non-vowel other than w, x or Y and preceded by a non-vowel, or
 *  (b) a vowel at the beginning of the word followed by a non-vowel.
 *
 * So rap, trap, entrap end with a short syllable, and ow, on, at are classed as short syllables.
 * But uproot, bestow, disturb do not end with a short syllable.
 *
 * @param integer $from      offset of the first letter to look at; a negative value counts from
 *                           the end of the word and is clamped to the start of the word
 * @param integer $nbLetters 2 to test form (b), which is only possible at offset 0; any other
 *                           value tests form (a) on three letters
 *
 * @return boolean
 */
private function searchShortSyllabe($from, $nbLetters)
{
    if ($from < 0) {
        // translate an offset from the end into an absolute one, never before the first letter
        $from = max(0, StringHelper::strlen($this->word) + $from);
    }

    // form (b) only exists at the very beginning of the word
    if ($nbLetters == 2 && $from != 0) {
        return false;
    }

    $firstIsVowel = in_array(StringHelper::substr($this->word, $from, 1), self::$vowels);
    $secondIsVowel = in_array(StringHelper::substr($this->word, ($from + 1), 1), self::$vowels);

    if ($nbLetters == 2 && $firstIsVowel && !$secondIsVowel) {
        return true;
    }

    // Form (a). Note that a failed two-letter test also ends up here.
    $third = StringHelper::substr($this->word, ($from + 2), 1);
    $excludedThird = array_merge(self::$vowels, array('x', 'Y', 'w'));

    return !$firstIsVowel && $secondIsVowel && !in_array($third, $excludedThird);
}
/**
 * Long restricted vowels, i.e. doubled vowels.
 */
protected static $longVowels = array('aa', 'ee', 'ii', 'oo', 'uu', 'ää', 'öö');

/**
 * Set to true by step3() when it removed an ending; step5() branches on it.
 * stem() resets it for every word.
 */
private $_removedInStep3 = false;

/**
 * {@inheritdoc}
 */
public function stem($word)
{
    $this->word = StringHelper::strtolower($word);

    // The flag is per-word state: without this reset a reused stemmer instance would
    // carry the value left by the previous word into step5().
    $this->_removedInStep3 = false;

    // R1 and R2 are then defined in the usual way
    $this->r1();
    $this->r2();

    // Do each of steps 1, 2, 3, 4, 5 and 6.
    $this->step1();
    $this->step2();
    $this->step3();
    $this->step4();
    $this->step5();
    $this->step6();

    return $this->word;
}

/**
 * Step 1
 *
 * Search for the following suffixes in R1, and perform the action indicated.
 *
 * @return boolean True when one of the suffixes was found in R1, false otherwise.
 */
private function step1()
{
    // (a) kin kaan kään ko kö han hän pa pä
    // delete if preceded by n, t or a vowel
    if (($position = $this->searchIfInR1(array('kaan', 'kään', 'kin', 'han', 'hän', 'ko', 'kö', 'pa', 'pä'))) !== false) {
        $lastLetter = StringHelper::substr($this->word, ($position - 1), 1);

        if (in_array($lastLetter, array_merge(array('t', 'n'), self::$vowels), true)) {
            $this->word = StringHelper::substr($this->word, 0, $position);
            // the word got shorter, so the regions are recomputed
            $this->r1();
            $this->r2();
        }

        return true;
    }

    // (b) sti
    // delete if in R2
    if (($position = $this->searchIfInR1(array('sti'))) !== false) {
        if ($this->inR2($position)) {
            $this->word = StringHelper::substr($this->word, 0, $position);
            $this->r1();
            $this->r2();
        }

        return true;
    }

    return false;
}
/**
 * Step 2: possessives.
 *
 * Search for the following suffixes in R1, and perform the action indicated.
 * R1 and R2 are recomputed after every deletion.
 *
 * @return boolean True when a suffix was removed, false otherwise.
 */
private function step2()
{
    // si
    // delete if not preceded by k
    if (($position = $this->searchIfInR1(array('si'))) !== false) {
        $lastLetter = StringHelper::substr($this->word, ($position - 1), 1);

        if ($lastLetter !== 'k') {
            $this->word = StringHelper::substr($this->word, 0, $position);
            $this->r1();
            $this->r2();

            return true;
        }
    }

    // ni
    // delete
    if (($position = $this->searchIfInR1(array('ni'))) !== false) {
        $this->word = StringHelper::substr($this->word, 0, $position);

        // if preceded by kse, replace with ksi
        if ($this->search(array('kse')) !== false) {
            $this->word = preg_replace('#(kse)$#u', 'ksi', $this->word);
        }
        $this->r1();
        $this->r2();

        return true;
    }

    // nsa nsä mme nne
    // delete
    if (($position = $this->searchIfInR1(array('nsa', 'nsä', 'mme', 'nne'))) !== false) {
        $this->word = StringHelper::substr($this->word, 0, $position);
        $this->r1();
        $this->r2();

        return true;
    }

    // an
    // delete if preceded by one of ta ssa sta lla lta na
    if (($position = $this->searchIfInR1(array('an'))) !== false) {
        $word = StringHelper::substr($this->word, 0, $position);
        $lastThreeLetters = StringHelper::substr($word, -3, 3);
        $lastTwoLetters = StringHelper::substr($word, -2, 2);

        if (in_array($lastThreeLetters, array('ssa', 'sta', 'lla', 'lta'), true)
            || in_array($lastTwoLetters, array('na', 'ta'), true)) {
            $this->word = $word;
            $this->r1();
            $this->r2();

            return true;
        }
    }

    // än
    // delete if preceded by one of tä ssä stä llä ltä nä
    if (($position = $this->searchIfInR1(array('än'))) !== false) {
        $word = StringHelper::substr($this->word, 0, $position);
        $lastThreeLetters = StringHelper::substr($word, -3, 3);
        $lastTwoLetters = StringHelper::substr($word, -2, 2);

        if (in_array($lastThreeLetters, array('ssä', 'stä', 'llä', 'ltä'), true)
            || in_array($lastTwoLetters, array('nä', 'tä'), true)) {
            $this->word = $word;
            $this->r1();
            $this->r2();

            return true;
        }
    }

    // en
    // delete if preceded by one of lle ine
    if (($position = $this->searchIfInR1(array('en'))) !== false) {
        $word = StringHelper::substr($this->word, 0, $position);

        if (StringHelper::strlen($this->word) > 4) {
            // the three letters right before the two-letter suffix
            $lastThreeLetters = StringHelper::substr($this->word, -5, 3);

            if (in_array($lastThreeLetters, array('lle', 'ine'), true)) {
                $this->word = $word;
                $this->r1();
                $this->r2();

                return true;
            }
        }
    }

    return false;
}

/**
 * Step 3: cases
 *
 * Search for the following suffixes in R1, and perform the action indicated.
 * Every deletion sets the "removed in step 3" flag that step5() reads, and
 * recomputes R1 and R2.
 *
 * @return boolean True when a suffix was removed or an hXn ending was found, false otherwise.
 */
private function step3()
{
    // hXn
    // delete if preceded by X, where X is a V other than u (a/han, e/hen etc)
    foreach (self::$restrictedVowels as $vowel) {
        if ($vowel === 'u') {
            continue;
        }

        if (($position = $this->searchIfInR1(array('h' . $vowel . 'n'))) !== false) {
            $lastLetter = StringHelper::substr($this->word, $position - 1, 1);

            if ($lastLetter === $vowel) {
                $this->word = StringHelper::substr($this->word, 0, $position);
                $this->_removedInStep3 = true;
                $this->r1();
                $this->r2();
            }

            // an hXn ending ends the step even when it was not deleted
            return true;
        }
    }

    // siin den tten
    // delete if preceded by Vi
    if (($position = $this->searchIfInR1(array('siin', 'den', 'tten'))) !== false) {
        $lastLetter = StringHelper::substr($this->word, ($position - 1), 1);

        if ($lastLetter === 'i') {
            $nextLastLetter = StringHelper::substr($this->word, ($position - 2), 1);

            if (in_array($nextLastLetter, self::$restrictedVowels, true)) {
                $this->word = StringHelper::substr($this->word, 0, $position);
                $this->_removedInStep3 = true;
                $this->r1();
                $this->r2();

                return true;
            }
        }
    }

    // seen
    // delete if preceded by LV
    if (($position = $this->searchIfInR1(array('seen'))) !== false) {
        $lastLetters = StringHelper::substr($this->word, ($position - 2), 2);

        if (in_array($lastLetters, self::$longVowels, true)) {
            $this->word = StringHelper::substr($this->word, 0, $position);
            $this->_removedInStep3 = true;
            $this->r1();
            $this->r2();

            return true;
        }
    }

    // tta ttä
    // delete if preceded by e
    if (($position = $this->searchIfInR1(array('tta', 'ttä'))) !== false) {
        $lastLetter = StringHelper::substr($this->word, ($position - 1), 1);

        if ($lastLetter === 'e') {
            $this->word = StringHelper::substr($this->word, 0, $position);
            $this->_removedInStep3 = true;
            $this->r1();
            $this->r2();

            return true;
        }
    }

    // ta tä ssa ssä sta stä lla llä lta ltä lle na nä ksi ine
    // delete
    if (($position = $this->searchIfInR1(array('ssa', 'ssä', 'sta', 'stä', 'lla', 'llä', 'lta', 'ltä', 'lle', 'ksi', 'na', 'nä', 'ine', 'ta', 'tä'))) !== false) {
        $this->word = StringHelper::substr($this->word, 0, $position);
        $this->_removedInStep3 = true;
        $this->r1();
        $this->r2();

        return true;
    }

    // a ä
    // delete if preceded by cv
    if (($position = $this->searchIfInR1(array('a', 'ä'))) !== false) {
        $lastLetter = StringHelper::substr($this->word, ($position - 1), 1);
        $nextLastLetter = StringHelper::substr($this->word, ($position - 2), 1);

        if (in_array($lastLetter, self::$vowels, true) && in_array($nextLastLetter, self::$consonants, true)) {
            $this->word = StringHelper::substr($this->word, 0, $position);
            $this->_removedInStep3 = true;
            $this->r1();
            $this->r2();

            return true;
        }
    }

    // n
    // delete, and if preceded by LV or ie, delete the last vowel
    if (($position = $this->searchIfInR1(array('n'))) !== false) {
        $lastLetters = StringHelper::substr($this->word, ($position - 2), 2);

        if (in_array($lastLetters, self::$longVowels, true) || $lastLetters === 'ie') {
            $this->word = StringHelper::substr($this->word, 0, $position - 1);
        } else {
            $this->word = StringHelper::substr($this->word, 0, $position);
        }
        $this->r1();
        $this->r2();
        $this->_removedInStep3 = true;

        return true;
    }

    return false;
}
/**
 * Step 4: other endings
 *
 * Search for the following suffixes in R2, and perform the action indicated.
 * R1 and R2 are recomputed after a deletion.
 *
 * @return boolean True when a suffix was removed, false otherwise.
 */
private function step4()
{
    // mpi mpa mpä mmi mma mmä
    // delete if not preceded by po
    if (($position = $this->searchIfInR2(array('mpi', 'mpa', 'mpä', 'mmi', 'mma', 'mmä'))) !== false) {
        $lastLetters = StringHelper::substr($this->word, ($position - 2), 2);

        if ($lastLetters !== 'po') {
            $this->word = StringHelper::substr($this->word, 0, $position);
            $this->r1();
            $this->r2();

            return true;
        }
    }

    // impi impa impä immi imma immä eja ejä
    // delete
    if (($position = $this->searchIfInR2(array('impi', 'impa', 'impä', 'immi', 'imma', 'immä', 'eja', 'ejä'))) !== false) {
        $this->word = StringHelper::substr($this->word, 0, $position);
        $this->r1();
        $this->r2();

        return true;
    }

    return false;
}
/**
 * Step 5: plurals
 * If an ending was removed in step 3, delete a final i or j if in R1;
 * otherwise,
 * if an ending was not removed in step 3, delete a final t in R1 if it
 * follows a vowel, and, if a t is removed, delete a final mma or imma in
 * R2, unless the mma is preceded by po.
 *
 * @return boolean True when something was removed, false otherwise.
 */
private function step5()
{
    if ($this->_removedInStep3) {
        if (($position = $this->searchIfInR1(array('i', 'j'))) === false) {
            return false;
        }

        $this->word = StringHelper::substr($this->word, 0, $position);
        $this->r1();
        $this->r2();

        return true;
    }

    if (($position = $this->searchIfInR1(array('t'))) === false) {
        return false;
    }

    // the t is only removed when it follows a vowel
    $lastLetter = StringHelper::substr($this->word, ($position - 1), 1);
    if (!in_array($lastLetter, self::$vowels, true)) {
        return false;
    }

    $this->word = StringHelper::substr($this->word, 0, $position);
    $this->r1();
    $this->r2();

    if (($position2 = $this->searchIfInR2(array('imma'))) !== false) {
        $this->word = StringHelper::substr($this->word, 0, $position2);
        $this->r1();
        $this->r2();
    } elseif (($position2 = $this->searchIfInR2(array('mma'))) !== false) {
        // mma is kept when preceded by po
        $lastLetters = StringHelper::substr($this->word, ($position2 - 2), 2);

        if ($lastLetters !== 'po') {
            $this->word = StringHelper::substr($this->word, 0, $position2);
            $this->r1();
            $this->r2();
        }
    }

    // the final t was removed, whether or not mma/imma followed
    return true;
}
/**
 * Step 6: tidying up
 *
 * Runs (a), (b), (c), (d) in turn, testing against R1, and then the word-level
 * clean-up (e). R1 and R2 are recomputed after every deletion made in (a) to (d).
 */
private function step6()
{
    // a) If R1 ends LV
    // delete the last letter
    $longVowelStart = $this->searchIfInR1(self::$longVowels);
    if ($longVowelStart !== false) {
        // keep the first of the two vowels
        $this->word = StringHelper::substr($this->word, 0, $longVowelStart + 1);
        $this->r1();
        $this->r2();
    }

    // b) If R1 ends cX, c a consonant and X one of a ä e i,
    // delete the last letter
    $final = StringHelper::substr($this->r1, -1, 1);
    $beforeFinal = StringHelper::substr($this->r1, -2, 1);
    $endsConsonantX = in_array($beforeFinal, self::$consonants, true)
        && in_array($final, array('a', 'e', 'i', 'ä'));
    if ($endsConsonantX) {
        $this->word = StringHelper::substr($this->word, 0, -1);
        $this->r1();
        $this->r2();
    }

    // c) If R1 ends oj or uj
    // delete the last letter
    if (in_array(StringHelper::substr($this->r1, -2, 2), array('oj', 'uj'))) {
        $this->word = StringHelper::substr($this->word, 0, -1);
        $this->r1();
        $this->r2();
    }

    // d) If R1 ends jo
    // delete the last letter
    if (StringHelper::substr($this->r1, -2, 2) === 'jo') {
        $this->word = StringHelper::substr($this->word, 0, -1);
        $this->r1();
        $this->r2();
    }

    // e) If the word ends with a double consonant followed by zero or more
    // vowels, remove the last consonant (so eläkk -> eläk,
    // aatonaatto -> aatonaato)
    $trailingVowels = '';
    $index = StringHelper::strlen($this->word) - 1;
    while ($index > 0) {
        $current = StringHelper::substr($this->word, $index, 1);

        if (!in_array($current, self::$vowels, true)) {
            // first non-vowel from the end: drop it when it repeats the letter before it
            if (StringHelper::substr($this->word, $index - 1, 1) === $current) {
                $this->word = StringHelper::substr($this->word, 0, $index) . $trailingVowels;
            }
            break;
        }

        $trailingVowels = $current . $trailingVowels;
        $index--;
    }
}
/**
 * Assume the word is in lower case.
 * Then put into upper case u or i preceded and followed by a vowel, and y preceded or followed by a vowel.
 * u after q is also put into upper case. For example,
 *      jouer   ->  joUer
 *      ennuie  ->  ennuIe
 *      yeux    ->  Yeux
 *      quand   ->  qUand
 */
private function step0()
{
    // character class matching one vowel, captured as a group
    $vowel = '([' . $this->plainVowels . '])';

    // applied one after the other, in this order: pattern => replacement
    $markings = array(
        '#([q])u#u'                     => '$1U',
        '#' . $vowel . 'y#u'            => '$1Y',
        '#y' . $vowel . '#u'            => 'Y$1',
        '#' . $vowel . 'u' . $vowel . '#u' => '$1U$2',
        '#' . $vowel . 'i' . $vowel . '#u' => '$1I$2',
    );

    foreach ($markings as $pattern => $replacement) {
        $this->word = preg_replace($pattern, $replacement, $this->word);
    }
}
if ($this->inR2($position)) { 120 | $this->word = preg_replace('#(logies|logie)$#u', 'log', $this->word); 121 | } 122 | return 3; 123 | } 124 | 125 | // usion ution usions utions 126 | // replace with u if in R2 127 | if ( ($position = $this->search(array('usions', 'utions', 'usion', 'ution'))) !== false) { 128 | if ($this->inR2($position)) { 129 | $this->word = preg_replace('#(usion|ution|usions|utions)$#u', 'u', $this->word); 130 | } 131 | return 3; 132 | } 133 | 134 | // ence ences 135 | // replace with ent if in R2 136 | if ( ($position = $this->search(array('ences', 'ence'))) !== false) { 137 | if ($this->inR2($position)) { 138 | $this->word = preg_replace('#(ence|ences)$#u', 'ent', $this->word); 139 | } 140 | return 3; 141 | } 142 | 143 | // issement issements 144 | // delete if in R1 and preceded by a non-vowel 145 | if ( ($position = $this->search(array('issements', 'issement'))) != false) { 146 | if ($this->inR1($position)) { 147 | $before = $position - 1; 148 | $letter = StringHelper::substr($this->word, $before, 1); 149 | if (! 
in_array($letter, self::$vowels)) { 150 | $this->word = StringHelper::substr($this->word, 0, $position); 151 | } 152 | } 153 | return 3; 154 | } 155 | 156 | // ement ements 157 | // delete if in RV 158 | // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise, 159 | // if preceded by eus, delete if in R2, else replace by eux if in R1, otherwise, 160 | // if preceded by abl or iqU, delete if in R2, otherwise, 161 | // if preceded by ièr or Ièr, replace by i if in RV 162 | if ( ($position = $this->search(array('ements', 'ement'))) !== false) { 163 | 164 | // delete if in RV 165 | if ($this->inRv($position)) { 166 | $this->word = StringHelper::substr($this->word, 0, $position); 167 | } 168 | 169 | // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise, 170 | if ( ($position = $this->searchIfInR2(array('iv'))) !== false) { 171 | $this->word = StringHelper::substr($this->word, 0, $position); 172 | if ( ($position2 = $this->searchIfInR2(array('at'))) !== false) { 173 | $this->word = StringHelper::substr($this->word, 0, $position2); 174 | } 175 | 176 | // if preceded by eus, delete if in R2, else replace by eux if in R1, otherwise, 177 | } elseif ( ($position = $this->search(array('eus'))) !== false) { 178 | if ($this->inR2($position)) { 179 | $this->word = StringHelper::substr($this->word, 0, $position); 180 | 181 | } elseif ($this->inR1($position)) { 182 | $this->word = preg_replace('#(eus)$#u', 'eux', $this->word); 183 | } 184 | 185 | // if preceded by abl or iqU, delete if in R2, otherwise, 186 | } elseif ( ($position = $this->searchIfInR2(array('abl', 'iqU'))) !== false) { 187 | $this->word = StringHelper::substr($this->word, 0, $position); 188 | 189 | // if preceded by ièr or Ièr, replace by i if in RV 190 | } elseif ( ($position = $this->searchIfInRv(array('ièr', 'Ièr'))) !== false) { 191 | $this->word = preg_replace('#(ièr|Ièr)$#u', 'i', $this->word); 192 | } 193 | return 3; 194 | 
} 195 | 196 | // ité ités 197 | // delete if in R2 198 | // if preceded by abil, delete if in R2, else replace by abl, otherwise, 199 | // if preceded by ic, delete if in R2, else replace by iqU, otherwise, 200 | // if preceded by iv, delete if in R2 201 | if ( ($position = $this->search(array('ités', 'ité'))) !== false) { 202 | 203 | // delete if in R2 204 | if ($this->inR2($position)) { 205 | $this->word = StringHelper::substr($this->word, 0, $position); 206 | } 207 | 208 | // if preceded by abil, delete if in R2, else replace by abl, otherwise, 209 | if ( ($position = $this->search(array('abil'))) !== false) { 210 | if ($this->inR2($position)) { 211 | $this->word = StringHelper::substr($this->word, 0, $position); 212 | } else { 213 | $this->word = preg_replace('#(abil)$#u', 'abl', $this->word); 214 | } 215 | 216 | // if preceded by ic, delete if in R2, else replace by iqU, otherwise, 217 | } elseif ( ($position = $this->search(array('ic'))) !== false) { 218 | if ($this->inR2($position)) { 219 | $this->word = StringHelper::substr($this->word, 0, $position); 220 | } else { 221 | $this->word = preg_replace('#(ic)$#u', 'iqU', $this->word); 222 | } 223 | 224 | // if preceded by iv, delete if in R2 225 | } elseif ( ($position = $this->searchIfInR2(array('iv'))) !== false) { 226 | $this->word = StringHelper::substr($this->word, 0, $position); 227 | } 228 | 229 | return 3; 230 | } 231 | 232 | // if ive ifs ives 233 | // delete if in R2 234 | // if preceded by at, delete if in R2 (and if further preceded by ic, delete if in R2, else replace by iqU) 235 | if ( ($position = $this->search(array('ifs', 'ives', 'if', 'ive'))) !== false) { 236 | 237 | if ($this->inR2($position)) { 238 | $this->word = StringHelper::substr($this->word, 0, $position); 239 | } 240 | 241 | if ( ($position = $this->searchIfInR2(array('at'))) !== false) { 242 | $this->word = StringHelper::substr($this->word, 0, $position); 243 | 244 | if ( ($position2 = $this->search(array('ic'))) !== false) { 245 | 
if ($this->inR2($position2)) { 246 | $this->word = StringHelper::substr($this->word, 0, $position2); 247 | } else { 248 | $this->word = preg_replace('#(ic)$#u', 'iqU', $this->word); 249 | } 250 | } 251 | } 252 | 253 | return 3; 254 | } 255 | 256 | // eaux 257 | // replace with eau 258 | if ( ($position = $this->search(array('eaux'))) !== false) { 259 | $this->word = preg_replace('#(eaux)$#u', 'eau', $this->word); 260 | return 3; 261 | } 262 | 263 | // aux 264 | // replace with al if in R1 265 | if ( ($position = $this->search(array('aux'))) !== false) { 266 | if ($this->inR1($position)) { 267 | $this->word = preg_replace('#(aux)$#u', 'al', $this->word); 268 | } 269 | return 3; 270 | } 271 | 272 | // euse euses 273 | // delete if in R2, else replace by eux if in R1 274 | if ( ($position = $this->search(array('euses', 'euse'))) !== false) { 275 | if ($this->inR2($position)) { 276 | $this->word = StringHelper::substr($this->word, 0, $position); 277 | 278 | } elseif ($this->inR1($position)) { 279 | $this->word = preg_replace('#(euses|euse)$#u', 'eux', $this->word); 280 | //return 3; 281 | } 282 | return 3; 283 | } 284 | 285 | // amment 286 | // replace with ant if in RV 287 | if ( ($position = $this->search(array('amment'))) !== false) { 288 | if ($this->inRv($position)) { 289 | $this->word = preg_replace('#(amment)$#u', 'ant', $this->word); 290 | } 291 | return 2; 292 | } 293 | 294 | // emment 295 | // replace with ent if in RV 296 | if ( ($position = $this->search(array('emment'))) !== false) { 297 | if ($this->inRv($position)) { 298 | $this->word = preg_replace('#(emment)$#u', 'ent', $this->word); 299 | } 300 | return 2; 301 | } 302 | 303 | // ment ments 304 | // delete if preceded by a vowel in RV 305 | if ( ($position = $this->search(array('ments', 'ment'))) != false) { 306 | $before = $position - 1; 307 | $letter = StringHelper::substr($this->word, $before, 1); 308 | if ( $this->inRv($before) && (in_array($letter, self::$vowels)) ) { 309 | $this->word = 
StringHelper::substr($this->word, 0, $position); 310 | } 311 | 312 | return 2; 313 | } 314 | 315 | return 2; 316 | } 317 | 318 | /** 319 | * Step 2a: Verb suffixes beginning i 320 | * In steps 2a and 2b all tests are confined to the RV region. 321 | * Search for the longest among the following suffixes and if found, delete if preceded by a non-vowel. 322 | * îmes ît îtes i ie ies ir ira irai iraIent irais irait iras irent irez iriez 323 | * irions irons iront is issaIent issais issait issant issante issantes issants isse 324 | * issent isses issez issiez issions issons it 325 | * (Note that the non-vowel itself must also be in RV.) 326 | */ 327 | private function step2a() 328 | { 329 | if ( ($position = $this->searchIfInRv(array( 330 | 'îmes', 'îtes', 'ît', 'ies', 'ie', 'iraIent', 'irais', 'irait', 'irai', 'iras', 'ira', 'irent', 'irez', 'iriez', 331 | 'irions', 'irons', 'iront', 'ir', 'issaIent', 'issais', 'issait', 'issant', 'issantes', 'issante', 'issants', 332 | 'issent', 'isses', 'issez', 'isse', 'issiez', 'issions', 'issons', 'is', 'it', 'i'))) !== false) { 333 | 334 | $before = $position - 1; 335 | $letter = StringHelper::substr($this->word, $before, 1); 336 | if ( $this->inRv($before) && (!in_array($letter, self::$vowels)) ) { 337 | $this->word = StringHelper::substr($this->word, 0, $position); 338 | 339 | return true; 340 | } 341 | } 342 | 343 | return false; 344 | } 345 | 346 | /** 347 | * Do step 2b if step 2a was done, but failed to remove a suffix. 
* Step 2b: Other verb suffixes
     *
     * Three suffix groups are tried in turn; the first group that matches in RV is
     * processed and ends the step.
     *
     * @return boolean true when one of the groups matched in RV, false otherwise
     */
    private function step2b()
    {
        // é ée ées és èrent er era erai eraIent erais erait eras erez eriez erions erons eront ez iez
        //      delete
        if ( ($position = $this->searchIfInRv(array(
            'ées', 'èrent', 'erais', 'erait', 'erai', 'eraIent', 'eras', 'erez', 'eriez',
            'erions', 'erons', 'eront', 'era', 'er', 'iez', 'ez','és', 'ée', 'é'))) !== false) {

            $this->word = StringHelper::substr($this->word, 0, $position);

            return true;
        }

        // âmes ât âtes a ai aIent ais ait ant ante antes ants as asse assent asses assiez assions
        //      delete
        //      if preceded by e, delete
        if ( ($position = $this->searchIfInRv(array(
            'âmes', 'âtes', 'ât', 'aIent', 'ais', 'ait', 'antes', 'ante', 'ants', 'ant',
            'assent', 'asses', 'assiez', 'assions', 'asse', 'as', 'ai', 'a'))) !== false) {

            $before = $position - 1;
            $letter = StringHelper::substr($this->word, $before, 1);
            if ( $this->inRv($before) && ($letter == 'e') ) {
                // cut one letter earlier so the preceding e goes too
                $this->word = StringHelper::substr($this->word, 0, $before);

            } else {
                $this->word = StringHelper::substr($this->word, 0, $position);
            }

            return true;
        }

        // ions
        //      delete if in R2
        if ( ($position = $this->searchIfInRv(array('ions'))) !== false) {
            if ($this->inR2($position)) {
                $this->word = StringHelper::substr($this->word, 0, $position);
            }

            // reported as handled even when the suffix lies outside R2 and is kept
            return true;
        }

        return false;
    }

    /**
     * Step 3: Replace final Y with i or final ç with c
     */
    private function step3()
    {
        $this->word = preg_replace('#(Y)$#u', 'i', $this->word);
        $this->word = preg_replace('#(ç)$#u', 'c', $this->word);
    }

    /**
     * Step 4: Residual suffix
     *
     * A final s is dropped first; after that only the first matching rule of
     * ion / ier-ière / e / guë is applied.
     *
     * @return boolean true when one of the RV rules matched, false otherwise
     *                 (removing only the final s does not count)
     */
    private function step4()
    {
        // If the word ends s, not preceded by a, i, o, u, è or s, delete it.
        // The u modifier makes è count as one character of the class instead of two bytes.
        if (preg_match('#[^aiouès]s$#u', $this->word)) {
            $this->word = StringHelper::substr($this->word, 0, -1);
        }

        // In the rest of step 4, all tests are confined to the RV region.
        // ion
        //      delete if in R2 and preceded by s or t
        if ( (($position = $this->searchIfInRv(array('ion'))) !== false) && ($this->inR2($position)) ) {
            $before = $position - 1;
            $letter = StringHelper::substr($this->word, $before, 1);
            if ( $this->inRv($before) && (($letter == 's') || ($letter == 't')) ) {
                $this->word = StringHelper::substr($this->word, 0, $position);
            }
            return true;
        }

        // ier ière Ier Ière
        //      replace with i
        if ( ($this->searchIfInRv(array('ier', 'ière', 'Ier', 'Ière'))) !== false) {
            $this->word = preg_replace('#(ier|ière|Ier|Ière)$#u', 'i', $this->word);
            return true;
        }

        // e
        //      delete
        if ( ($this->searchIfInRv(array('e'))) !== false) {
            $this->word = StringHelper::substr($this->word, 0, -1);
            return true;
        }

        // ë
        //      if preceded by gu, delete
        if ( ($position = $this->searchIfInRv(array('guë'))) !== false) {
            // $position + 2 is the offset of the ë itself
            if ($this->inRv($position+2)) {
                $this->word = StringHelper::substr($this->word, 0, -1);
                return true;
            }
        }

        return false;
    }

    /**
     * Step 5: Undouble
     * If the word ends enn, onn, ett, ell or eill, delete the last letter
     */
    private function step5()
    {
        if ($this->search(array('enn', 'onn', 'ett', 'ell', 'eill')) !== false) {
            $this->word = StringHelper::substr($this->word, 0, -1);
        }
    }

    /**
     * Step 6: Un-accent
     * If the word ends é or è followed by at least one non-vowel, remove the accent from the e.
*/
    private function step6()
    {
        // trailing run of non-vowels, preceded by an accented e
        $pattern = '#(é|è)([^' . $this->plainVowels . ']+)$#u';

        $this->word = preg_replace($pattern, 'e$2', $this->word);
    }

    /**
     * Final clean-up:
     * the I, U and Y markers still present in the word go back to lower case.
     */
    private function finish()
    {
        $markers = array('I','U','Y');
        $plain   = array('i', 'u', 'y');

        $this->word = str_replace($markers, $plain, $this->word);
    }

    /**
     * Compute the RV region of the current word and store it in $this->rv / $this->rvIndex.
     *
     * If the word begins with two vowels, RV is the region after the third letter,
     * otherwise the region after the first vowel not at the beginning of the word,
     * or the end of the word if these positions cannot be found.
     * (Exceptionally, par, col or tap at the beginning of a word is also taken to
     * define RV as the region to their right.)
     *
     * @return boolean false only when the word has three letters or more and no vowel
     *                 after its first letter; true in every other case
     */
    protected function rv()
    {
        $wordLength = StringHelper::strlen($this->word);

        // fallback: empty RV located at the end of the word
        $this->rv = '';
        $this->rvIndex = $wordLength;

        // words shorter than three letters keep the fallback
        if ($wordLength < 3) {
            return true;
        }

        $startsWithTwoVowels = in_array(StringHelper::substr($this->word, 0, 1), self::$vowels)
            && in_array(StringHelper::substr($this->word, 1, 1), self::$vowels);

        // two leading vowels and the par / col / tap prefixes both put RV after the third letter
        if ($startsWithTwoVowels || in_array(StringHelper::substr($this->word, 0, 3), array('par', 'col', 'tap'))) {
            $this->rvIndex = 3;
            $this->rv = StringHelper::substr($this->word, 3);

            return true;
        }

        // otherwise RV starts right after the first vowel found from the second letter on
        $index = 1;
        while ($index < $wordLength) {
            if (in_array(StringHelper::substr($this->word, $index, 1), self::$vowels)) {
                $this->rvIndex = $index + 1;
                $this->rv = StringHelper::substr($this->word, $index + 1);

                return true;
            }
            $index++;
        }

        return false;
    }
}
-------------------------------------------------------------------------------- /src/Stemmer/German.php: -------------------------------------------------------------------------------- 1 | plainVowels = implode('', self::$vowels); 30 | 31 | $this->word = StringHelper::strtolower($word); 32 | 33 | // First, replace ß by ss 34 | $this->word = str_replace('ß', 'ss', $this->word); 35 | 36 | // put u and y between vowels into upper case 37 | $this->word = preg_replace('#(['.$this->plainVowels.'])y(['.$this->plainVowels.'])#u', '$1Y$2', $this->word); 38 | $this->word = preg_replace('#(['.$this->plainVowels.'])u(['.$this->plainVowels.'])#u', '$1U$2', $this->word); 39 | 40 | // R1 and R2 are first set up in the standard way 41 | $this->r1(); 42 | $this->r2(); 43 | 44 | // but then R1 is adjusted so that the region before it contains at least 3 letters. 
45 | if ($this->r1Index < 3) { 46 | $this->r1Index = 3; 47 | $this->r1 = StringHelper::substr($this->word, 3); 48 | } 49 | 50 | $this->step1(); 51 | $this->step2(); 52 | $this->step3(); 53 | $this->finish(); 54 | 55 | return $this->word; 56 | } 57 | 
    /**
     * Step 1
     * Three suffix groups are tried in turn: (a) em ern er, (b) e en es, (c) s.
     * Only the first group that matches is processed.
     *
     * NOTE(review): search(), inR1() and inR2() are defined outside this chunk; they are
     * used here as "offset of the matched suffix, or false" and "is this offset inside
     * the region" - confirm against the base class.
     *
     * @return boolean true when a suffix of one of the groups was found (even if it was
     *                 kept because it lies outside R1), false otherwise
     */
    private function step1()
    {
        // em ern er
        //      delete if in R1
        if ( ($position = $this->search(array('em', 'ern', 'er'))) !== false) {
            if ($this->inR1($position)) {
                $this->word = StringHelper::substr($this->word, 0, $position);
            }
            return true;
        }

        // e en es
        //      delete if in R1
        if ( ($position = $this->search(array('es', 'en', 'e'))) !== false) {
            if ($this->inR1($position)) {
                $this->word = StringHelper::substr($this->word, 0, $position);

                // If an ending of group (b) is deleted, and the ending is preceded by niss, delete the final s
                if ($this->search(array('niss')) !== false) {
                    $this->word = StringHelper::substr($this->word, 0, -1);
                }
            }
            return true;
        }

        // s (preceded by a valid s-ending)
        //      delete if in R1
        if ( ($position = $this->search(array('s'))) !== false) {
            if ($this->inR1($position)) {
                $before = $position - 1;
                $letter = StringHelper::substr($this->word, $before, 1);

                if (in_array($letter, self::$sEndings)) {
                    $this->word = StringHelper::substr($this->word, 0, $position);
                }
            }
            return true;
        }

        return false;
    }

    /**
     * Step 2
     * Two suffix groups are tried in turn: en er est, then st.
     *
     * @return boolean true when a suffix of one of the groups was found, false otherwise
     */
    private function step2()
    {
        // en er est
        //      delete if in R1
        if ( ($position = $this->search(array('en', 'er', 'est'))) !== false) {
            if ($this->inR1($position)) {
                $this->word = StringHelper::substr($this->word, 0, $position);
            }
            return true;
        }

        // st (preceded by a valid st-ending, itself preceded by at least 3 letters)
        //      delete if in R1
        if ( ($position = $this->search(array('st'))) !== false) {
            if ($this->inR1($position)) {
                $before = $position - 1;
                // $before is the offset of the st-ending, i.e. the number of letters in front of it
                if ($before >= 3) {
                    $letter = StringHelper::substr($this->word, $before, 1);

                    if (in_array($letter, self::$stEndings)) {
                        $this->word = StringHelper::substr($this->word, 0, $position);
                    }
                }
            }
            return true;
        }
        return false;
    }

    /**
     * Step 3: d-suffixes
     * Four suffix groups are tried in turn; only the first one that matches is processed.
     *
     * @return boolean true when a suffix of one of the groups was found, false otherwise
     */
    private function step3()
    {
        // end ung
        //      delete if in R2
        //      if preceded by ig, delete if in R2 and not preceded by e
        if ( ($position = $this->search(array('end', 'ung'))) !== false) {
            if ($this->inR2($position)) {
                $this->word = StringHelper::substr($this->word, 0, $position);
            }

            if ( ($position2 = $this->search(array('ig'))) !== false) {
                $before = $position2 - 1;
                $letter = StringHelper::substr($this->word, $before, 1);

                if ( ($this->inR2($position2)) && ($letter != 'e') ) {
                    $this->word = StringHelper::substr($this->word, 0, $position2);
                }
            }
            return true;
        }

        // ig ik isch
        //      delete if in R2 and not preceded by e
        if ( ($position = $this->search(array('ig', 'ik', 'isch'))) !== false) {
            $before = $position - 1;
            $letter = StringHelper::substr($this->word, $before, 1);

            if ( ($this->inR2($position)) && ($letter != 'e') ) {
                $this->word = StringHelper::substr($this->word, 0, $position);
            }
            return true;
        }

        // lich heit
        //      delete if in R2
        //      if preceded by er or en, delete if in R1
        // Strict comparison, as in the branches above: offset 0 is a valid match.
        if ( ($position = $this->search(array('lich', 'heit'))) !== false) {
            if ($this->inR2($position)) {
                $this->word = StringHelper::substr($this->word, 0, $position);
            }

            if ( ($position2 = $this->search(array('er', 'en'))) !== false) {
                if ($this->inR1($position2)) {
                    $this->word = StringHelper::substr($this->word, 0, $position2);
                }
            }
            return true;
        }

        // keit
        //      delete if in R2
        //      if preceded by lich or ig, delete if in R2
        if ( ($position = $this->search(array('keit'))) !== false) {
            if ($this->inR2($position)) {
                $this->word = StringHelper::substr($this->word, 0, $position);
            }

            if ( ($position2 = $this->search(array('lich', 'ig'))) !== false) {
                if ($this->inR2($position2)) {
                    $this->word = StringHelper::substr($this->word, 0, $position2);
                }
            }
            return true;
        }

        return false;
    }

    /**
     * Finally
     */
    private function finish()
    {
        // turn U and Y back into lower case, and remove the umlaut accent from a, o and u.
        $this->word = str_replace(array('U', 'Y', 'ä', 'ü', 'ö'), array('u', 'y', 'a', 'u', 'o'), $this->word);
    }
}
-------------------------------------------------------------------------------- /src/Stemmer/Italian.php: -------------------------------------------------------------------------------- 1 | plainVowels = implode('', self::$vowels); 26 | 27 | $this->word = StringHelper::strtolower($word); 28 | 29 | // First, replace all acute accents by grave accents. 30 | $this->word = str_replace(array('á', 'é', 'í', 'ó', 'ú'), array('à', 'è', 'ì', 'ò', 'ù'), $this->word); 31 | 32 | //And, as in French, put u after q, and u, i between vowels into upper case. (See note on vowel marking.) The vowels are then 33 | $this->word = preg_replace('#([q])u#u', '$1U', $this->word); 34 | $this->word = preg_replace('#(['.$this->plainVowels.'])u(['.$this->plainVowels.'])#u', '$1U$2', $this->word); 35 | $this->word = preg_replace('#(['.$this->plainVowels.'])i(['.$this->plainVowels.'])#u', '$1I$2', $this->word); 36 | 37 | $this->rv(); 38 | $this->r1(); 39 | $this->r2(); 40 | 41 | $this->step0(); 42 | 43 | $word = $this->word; 44 | $this->step1(); 45 | 46 | //Do step 2 if no ending was removed by step 1. 
47 | if ($word == $this->word) { 48 | $this->step2(); 49 | } 50 | 51 | $this->step3a(); 52 | $this->step3b(); 53 | $this->finish(); 54 | 55 | return $this->word; 56 | } 57 | 
    /**
     * Step 0: Attached pronoun
     *
     * NOTE(review): search(), searchIfInRv(), searchIfInR2(), inR1(), inR2() and inRv()
     * are defined outside this chunk; they are used here as "offset of the matched
     * suffix, or false" and "is this offset inside the region" - confirm against the
     * base class.
     *
     * @return boolean true when the word ends with one of the pronouns (whether or not
     *                 it was altered), false otherwise
     */
    private function step0()
    {
        // Search for the longest among the following suffixes
        if ( ($position = $this->search(array(
            'gliela', 'gliele', 'glieli', 'glielo', 'gliene',
            'sene', 'mela', 'mele', 'meli', 'melo', 'mene', 'tela', 'tele', 'teli', 'telo', 'tene', 'cela',
            'cele', 'celi', 'celo', 'cene', 'vela', 'vele', 'veli', 'velo', 'vene',
            'gli', 'la', 'le', 'li', 'lo', 'mi', 'ne', 'si', 'ti', 'vi', 'ci'))) !== false) {

            // the pronoun that was actually matched
            $suffixe = StringHelper::substr($this->word, $position);

            // following one of (in RV)
            // (a) ando endo
            $a = array('ando', 'endo');
            $a = array_map(function($item) use ($suffixe) {
                return $item . $suffixe;
            }, $a);
            // In case of (a) the suffix is deleted
            if ($this->searchIfInRv($a) !== false) {
                $this->word = StringHelper::substr($this->word, 0, $position);
            }

            // (b) ar er ir
            $b = array('ar', 'er', 'ir');
            $b = array_map(function($item) use ($suffixe) {
                return $item . $suffixe;
            }, $b);
            // in case (b) it is replaced by e
            if ($this->searchIfInRv($b) !== false) {
                $this->word = preg_replace('#('.$suffixe.')$#u', 'e', $this->word);
            }

            return true;
        }

        return false;
    }

    /**
     * Step 1: Standard suffix removal
     * The suffix groups are tried in turn; only the first one that matches is processed.
     *
     * @return boolean true when a suffix of one of the groups was found (whether or not
     *                 the word was altered), false otherwise
     */
    private function step1()
    {
        // amente
        //      delete if in R1
        //      if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
        //      if preceded by os, ic or abil, delete if in R2
        if ( ($position = $this->search(array('amente'))) !== false) {
            if ($this->inR1($position)) {
                $this->word = StringHelper::substr($this->word, 0, $position);
            }

            // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise,
            if ( ($position2 = $this->searchIfInR2(array('iv'))) !== false) {
                $this->word = StringHelper::substr($this->word, 0, $position2);
                if ( ($position3 = $this->searchIfInR2(array('at'))) !== false) {
                    $this->word = StringHelper::substr($this->word, 0, $position3);
                }

            // if preceded by os, ic or abil, delete if in R2
            } elseif ( ($position4 = $this->searchIfInR2(array('os', 'ic', 'abil'))) !== false) {
                $this->word = StringHelper::substr($this->word, 0, $position4);
            }
            return true;
        }

        // anza anze ico ici ica ice iche ichi ismo ismi abile abili ibile ibili ista iste isti
        // istà istè istì oso osi osa ose mente atrice atrici ante anti
        //      delete if in R2
        if ( ($position = $this->search(array(
            'ibili', 'atrice', 'abili', 'abile', 'ibile', 'atrici', 'mente',
            'anza', 'anze', 'iche', 'ichi', 'ismo', 'ismi', 'ista', 'iste', 'isti', 'istà', 'istè', 'istì', 'ante', 'anti',
            'ico', 'ici', 'ica', 'ice', 'oso', 'osi', 'osa', 'ose'
        ))) !== false) {

            if ($this->inR2($position)) {
                $this->word = StringHelper::substr($this->word, 0, $position);
            }
            return true;
        }

        // azione azioni atore atori
        //      delete if in R2
        //      if preceded by ic, delete if in R2
        if ( ($position = $this->search(array('azione', 'azioni', 'atore', 'atori'))) !== false) {
            if ($this->inR2($position)) {
                $this->word = StringHelper::substr($this->word, 0, $position);

                if ( ($position2 = $this->search(array('ic'))) !== false) {
                    if ($this->inR2($position2)) {
                        $this->word = StringHelper::substr($this->word, 0, $position2);
                    }
                }
            }
            return true;
        }

        // logia logie
        //      replace with log if in R2
        if ( ($position = $this->search(array('logia', 'logie'))) !== false) {
            if ($this->inR2($position)) {
                $this->word = preg_replace('#(logia|logie)$#u', 'log', $this->word);
            }
            return true;
        }

        // uzione uzioni usione usioni
        //      replace with u if in R2
        if ( ($position = $this->search(array('uzione', 'uzioni', 'usione', 'usioni'))) !== false) {
            if ($this->inR2($position)) {
                $this->word = preg_replace('#(uzione|uzioni|usione|usioni)$#u', 'u', $this->word);
            }
            return true;
        }

        // enza enze
        //      replace with ente if in R2
        if ( ($position = $this->search(array('enza', 'enze'))) !== false) {
            if ($this->inR2($position)) {
                $this->word = preg_replace('#(enza|enze)$#u', 'ente', $this->word);
            }
            return true;
        }

        // amento amenti imento imenti
        //      delete if in RV
        if ( ($position = $this->search(array('amento', 'amenti', 'imento', 'imenti'))) !== false) {
            if ($this->inRv($position)) {
                $this->word = StringHelper::substr($this->word, 0, $position);
            }
            return true;
        }

        // ità
        //      delete if in R2
        //      if preceded by abil, ic or iv, delete if in R2
        if ( ($position = $this->search(array('ità'))) !== false) {
            if ($this->inR2($position)) {
                $this->word = StringHelper::substr($this->word, 0, $position);
            }

            if ( ($position2 = $this->searchIfInR2(array('abil', 'ic', 'iv'))) !== false) {
                $this->word = StringHelper::substr($this->word, 0, $position2);
            }
            return true;
        }

        // ivo ivi iva ive
        //      delete if in R2
        //      if preceded by at, delete if in R2 (and if further preceded by ic, delete if in R2)
        if ( ($position = $this->search(array('ivo', 'ivi', 'iva', 'ive'))) !== false) {
            if ($this->inR2($position)) {
                $this->word = StringHelper::substr($this->word, 0, $position);
            }

            if ( ($position2 = $this->searchIfInR2(array('at'))) !== false) {
                $this->word = StringHelper::substr($this->word, 0, $position2);
                if ( ($position3 = $this->searchIfInR2(array('ic'))) !== false) {
                    $this->word = StringHelper::substr($this->word, 0, $position3);
                }
            }
            return true;
        }

        return false;
    }

    /**
     * Step 2: Verb suffixes
     * Search for the longest among the following suffixes in RV, and if found, delete.
     */
    private function step2()
    {
        if ( ($position = $this->searchIfInRv(array(
            'assimo', 'assero', 'eranno', 'erebbero', 'erebbe', 'eremmo', 'ereste', 'eresti', 'essero', 'iranno', 'irebbero', 'irebbe', 'iremmo',
            'iscano', 'ireste', 'iresti', 'iscono', 'issero',
            'avamo', 'arono', 'avano', 'avate', 'eremo', 'erete', 'erono', 'evamo', 'evano', 'evate', 'ivamo', 'ivano', 'ivate', 'iremo', 'irete', 'irono',
            'ammo', 'ando', 'asse', 'assi', 'emmo', 'enda', 'ende', 'endi', 'endo', 'erai', 'erei', 'Yamo', 'iamo', 'immo', 'irà', 'irai', 'irei',
            'isca', 'isce', 'isci', 'isco',
            'ano', 'are', 'ata', 'ate', 'ati', 'ato', 'ava', 'avi', 'avo', 'erà', 'ere', 'erò', 'ete', 'eva',
            'evi', 'evo', 'ire', 'ita', 'ite', 'iti', 'ito', 'iva', 'ivi', 'ivo', 'ono', 'uta', 'ute', 'uti', 'uto', 'irò', 'ar', 'ir'))) !== false) {

            $this->word = StringHelper::substr($this->word, 0, $position);
        }
    }

    /**
     * Step 3a
     * Delete a final a, e, i, o,
à, è, ì or ò if it is in RV, and a preceding i if it is in RV
     *
     * @return boolean true when a final vowel was removed, false otherwise
     */
    private function step3a()
    {
        if ($this->searchIfInRv(array('a', 'e', 'i', 'o', 'à', 'è', 'ì', 'ò')) !== false) {
            $this->word = StringHelper::substr($this->word, 0, -1);

            // the i now ending the word goes too when it lies in RV
            if ($this->searchIfInRv(array('i')) !== false) {
                $this->word = StringHelper::substr($this->word, 0, -1);
            }
            return true;
        }
        return false;
    }

    /**
     * Step 3b
     * Replace final ch (or gh) with c (or g) if in RV (crocch -> crocc)
     */
    private function step3b()
    {
        if ($this->searchIfInRv(array('ch')) !== false) {
            $this->word = preg_replace('#(ch)$#u', 'c', $this->word);

        } elseif ($this->searchIfInRv(array('gh')) !== false) {
            $this->word = preg_replace('#(gh)$#u', 'g', $this->word);
        }
    }

    /**
     * Finally
     * turn I and U back into lower case
     */
    private function finish()
    {
        $this->word = str_replace(array('I', 'U'), array('i', 'u'), $this->word);
    }
}
-------------------------------------------------------------------------------- /src/Stemmer/Norwegian.php: -------------------------------------------------------------------------------- 1 | word = StringHelper::strtolower($word); 26 | 27 | // R2 is not used: R1 is defined in the same way as in the German stemmer 28 | $this->r1(); 29 | 30 | // then R1 is adjusted so that the region before it contains at least 3 letters. 31 | if ($this->r1Index < 3) { 32 | $this->r1Index = 3; 33 | $this->r1 = StringHelper::substr($this->word, 3); 34 | } 35 | 36 | // Do each of steps 1, 2 3 and 4. 
37 | $this->step1(); 38 | $this->step2(); 39 | $this->step3(); 40 | 41 | return $this->word; 42 | } 43 | 
    /**
     * Define a valid s-ending as one of
     * b c d f g h j l m n o p r t v y z,
     * or k not preceded by a vowel
     *
     * @param string $word The word without its final s; only its last two letters are inspected
     * @return boolean true when the last letter of $word is a valid s-ending
     */
    private function hasValidSEnding($word)
    {
        $lastLetter = StringHelper::substr($word, -1, 1);
        if (in_array($lastLetter, array('b', 'c', 'd', 'f', 'g', 'h', 'j', 'l', 'm', 'n', 'o', 'p', 'r', 't', 'v', 'y', 'z'))) {
            return true;
        }
        // k only counts when the letter in front of it is not a vowel
        if ($lastLetter == 'k') {
            $beforeLetter = StringHelper::substr($word, -2, 1);
            if (!in_array($beforeLetter, self::$vowels)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Step 1
     * Search for the longest among the following suffixes in R1, and perform the action indicated.
     *
     * NOTE(review): searchIfInR1() is defined outside this chunk; it is used here as
     * "offset of the matched suffix when it lies in R1, or false" - confirm against
     * the base class.
     *
     * @return boolean true when one of the suffix groups matched in R1, false otherwise
     */
    private function step1()
    {
        // erte ert
        //      replace with er
        if ( ($position = $this->searchIfInR1(array('erte', 'ert'))) !== false) {
            $this->word = preg_replace('#(erte|ert)$#u', 'er', $this->word);
            return true;
        }

        // a e ede ande ende ane ene hetene en heten ar er heter as es edes endes enes hetenes ens hetens ers ets et het ast
        //      delete
        if ( ($position = $this->searchIfInR1(array(
            'hetenes', 'hetene', 'hetens', 'heten', 'endes', 'heter', 'ande', 'ende', 'enes', 'edes', 'ede', 'ane',
            'ene', 'het', 'ers', 'ets', 'ast', 'ens', 'en', 'ar', 'er', 'as', 'es', 'et', 'a', 'e'
        ))) !== false) {
            $this->word = StringHelper::substr($this->word, 0, $position);
            return true;
        }

        // s
        //      delete if preceded by a valid s-ending
        if ( ($position = $this->searchIfInR1(array('s'))) !== false) {
            $word = StringHelper::substr($this->word, 0, $position);
            if ($this->hasValidSEnding($word)) {
                $this->word = $word;
            }
            return true;
        }

        // nothing matched: report it explicitly instead of falling off the end
        return false;
    }

    /**
     * Step 2
     * If the
word ends dt or vt in R1, delete the t.
     */
    private function step2()
    {
        $matched = $this->searchIfInR1(array('dt', 'vt'));
        if ($matched === false) {
            return;
        }

        // only the trailing t is removed, the d or v stays
        $this->word = StringHelper::substr($this->word, 0, -1);
    }

    /**
     * Step 3:
     * Look in R1 for one of the suffixes below and cut the word at the match.
     */
    private function step3()
    {
        // leg eleg ig eig lig elig els lov elov slov hetslov
        $suffixes = array(
            'hetslov', 'eleg', 'elov', 'slov', 'elig', 'eig', 'lig', 'els', 'lov', 'leg', 'ig'
        );

        $cutAt = $this->searchIfInR1($suffixes);
        if ($cutAt === false) {
            return;
        }

        $this->word = StringHelper::substr($this->word, 0, $cutAt);
    }
}
-------------------------------------------------------------------------------- /src/Stemmer/Portuguese.php: -------------------------------------------------------------------------------- 1 | word = StringHelper::strtolower($word); 26 | 27 | $this->word = str_replace(array('ã', 'õ'), array('a~', 'o~'), $this->word); 28 | 29 | $this->rv(); 30 | $this->r1(); 31 | $this->r2(); 32 | 33 | $word = $this->word; 34 | $this->step1(); 35 | 36 | if ($word == $this->word) { 37 | $this->step2(); 38 | } 39 | 40 | if ($word != $this->word) { 41 | $this->step3(); 42 | } else { 43 | $this->step4(); 44 | } 45 | 46 | $this->step5(); 47 | $this->finish(); 48 | 49 | return $this->word; 50 | } 51 | 52 | /** 53 | * Step 1: Standard suffix removal 54 | */ 55 | private function step1() 56 | { 57 | // delete if in R2 58 | if ( ($position = $this->search(array( 59 | 'amentos', 'imentos', 'adoras', 'adores', 'amento', 'imento', 'adora', 'istas', 'ismos', 'antes', 'ância', 60 | 'ezas', 'eza', 'icos', 'icas', 'ismo', 'ável', 'ível', 'ista', 'oso', 61 | 'osos', 'osas', 'osa', 'ico', 'ica', 'ador', 'aça~o', 'aço~es' , 'ante'))) !== false) { 62 | 63 | if ($this->inR2($position)) { 64 | $this->word = StringHelper::substr($this->word, 0, $position); 65 | } 66 | return true; 67 | } 68 | 69 | 
// logía logías 70 | // replace with log if in R2 71 | if ( ($position = $this->search(array('logías', 'logía'))) !== false) { 72 | if ($this->inR2($position)) { 73 | $this->word = preg_replace('#(logías|logía)$#u', 'log', $this->word); 74 | } 75 | return true; 76 | } 77 | 78 | // ución uciones 79 | // replace with u if in R2 80 | if ( ($position = $this->search(array('uciones', 'ución'))) !== false) { 81 | if ($this->inR2($position)) { 82 | $this->word = preg_replace('#(uciones|ución)$#u', 'u', $this->word); 83 | } 84 | return true; 85 | } 86 | 87 | // ência ências 88 | // replace with ente if in R2 89 | if ( ($position = $this->search(array('ências', 'ência'))) !== false) { 90 | if ($this->inR2($position)) { 91 | $this->word = preg_replace('#(ências|ência)$#u', 'ente', $this->word); 92 | } 93 | return true; 94 | } 95 | 96 | // amente 97 | // delete if in R1 98 | // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise, 99 | // if preceded by os, ic or ad, delete if in R2 100 | if ( ($position = $this->search(array('amente'))) !== false) { 101 | 102 | // delete if in R1 103 | if ($this->inR1($position)) { 104 | $this->word = StringHelper::substr($this->word, 0, $position); 105 | } 106 | 107 | // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise, 108 | if ( ($position2 = $this->searchIfInR2(array('iv'))) !== false) { 109 | $this->word = StringHelper::substr($this->word, 0, $position2); 110 | if ( ($position3 = $this->searchIfInR2(array('at'))) !== false) { 111 | $this->word = StringHelper::substr($this->word, 0, $position3); 112 | } 113 | 114 | // if preceded by os, ic or ad, delete if in R2 115 | } elseif ( ($position4 = $this->searchIfInR2(array('os', 'ic', 'ad'))) !== false) { 116 | $this->word = StringHelper::substr($this->word, 0, $position4); 117 | } 118 | return true; 119 | } 120 | 121 | // mente 122 | // delete if in R2 123 | // if preceded by ante, avel or ível, delete if 
in R2 124 | if ( ($position = $this->search(array('mente'))) !== false) { 125 | 126 | // delete if in R2 127 | if ($this->inR2($position)) { 128 | $this->word = StringHelper::substr($this->word, 0, $position); 129 | } 130 | 131 | // if preceded by ante, avel or ível, delete if in R2 132 | if ( ($position2 = $this->searchIfInR2(array('ante', 'avel', 'ível'))) != false) { 133 | $this->word = StringHelper::substr($this->word, 0, $position2); 134 | } 135 | return true; 136 | } 137 | 138 | // idade idades 139 | // delete if in R2 140 | // if preceded by abil, ic or iv, delete if in R2 141 | if ( ($position = $this->search(array('idades', 'idade'))) !== false) { 142 | 143 | // delete if in R2 144 | if ($this->inR2($position)) { 145 | $this->word = StringHelper::substr($this->word, 0, $position); 146 | } 147 | 148 | // if preceded by abil, ic or iv, delete if in R2 149 | if ( ($position2 = $this->searchIfInR2(array('abil', 'ic', 'iv'))) !== false) { 150 | $this->word = StringHelper::substr($this->word, 0, $position2); 151 | } 152 | return true; 153 | } 154 | 155 | // iva ivo ivas ivos 156 | // delete if in R2 157 | // if preceded by at, delete if in R2 158 | if ( ($position = $this->search(array('ivas', 'ivos', 'iva', 'ivo'))) !== false) { 159 | 160 | // delete if in R2 161 | if ($this->inR2($position)) { 162 | $this->word = StringHelper::substr($this->word, 0, $position); 163 | } 164 | 165 | // if preceded by at, delete if in R2 166 | if ( ($position2 = $this->searchIfInR2(array('at'))) !== false) { 167 | $this->word = StringHelper::substr($this->word, 0, $position2); 168 | } 169 | return true; 170 | } 171 | 172 | // ira iras 173 | // replace with ir if in RV and preceded by e 174 | if ( ($position = $this->search(array('iras', 'ira'))) !== false) { 175 | 176 | if ($this->inRv($position)) { 177 | $before = $position -1; 178 | $letter = StringHelper::substr($this->word, $before, 1); 179 | 180 | if ($letter == 'e') { 181 | $this->word = preg_replace('#(iras|ira)$#u', 'ir', 
/**
 * Step 3: delete a final "i" when it is in RV and preceded by "c".
 *
 * @return boolean true when the word ends with "i" in RV (whether or not the
 *                 "i" was actually deleted), false otherwise
 */
private function step3()
{
    if ($this->searchIfInRv(array('i')) === false) {
        return false;
    }

    // Character immediately before the final "i".
    $previous = StringHelper::substr($this->word, -2, 1);
    if ($previous == 'c') {
        $this->word = StringHelper::substr($this->word, 0, -1);
    }

    return true;
}
/**
 * Finally: restore the nasal vowels.
 *
 * Rewrites the two-character placeholders "a~" and "o~" in the current word
 * back to "ã" and "õ".
 */
private function finish()
{
    // The two placeholders cannot overlap and neither replacement contains the
    // other placeholder, so a single simultaneous translation is sufficient.
    $this->word = strtr($this->word, array('a~' => 'ã', 'o~' => 'õ'));
}
/**
 * Step 0: Removal of plurals (and other simplifications).
 *
 * The rules below are tried in order. The first rule with a suffix that ends
 * the word terminates the step: if that suffix lies in R1 it is replaced as
 * indicated, otherwise the word is left unchanged.
 *
 * @return boolean true when one of the step-0 suffixes ends the word (even if
 *                 nothing was changed because it is outside R1), false otherwise
 */
private function step0()
{
    // suffixes => replacement; 'notAfter' blocks the rule when the suffix is
    // directly preceded by the given letters.
    $rules = array(
        // ul ului: delete
        array('suffixes' => array('ul', 'ului'), 'replacement' => '', 'notAfter' => null),
        // aua: replace with a
        array('suffixes' => array('aua'), 'replacement' => 'a', 'notAfter' => null),
        // ea ele elor: replace with e
        array('suffixes' => array('ea', 'ele', 'elor'), 'replacement' => 'e', 'notAfter' => null),
        // ii iua iei iile iilor ilor: replace with i
        array('suffixes' => array('ii', 'iua', 'iei', 'iile', 'iilor', 'ilor'), 'replacement' => 'i', 'notAfter' => null),
        // ile: replace with i if not preceded by ab
        array('suffixes' => array('ile'), 'replacement' => 'i', 'notAfter' => 'ab'),
        // atei: replace with at
        array('suffixes' => array('atei'), 'replacement' => 'at', 'notAfter' => null),
        // aţie aţia: replace with aţi
        array('suffixes' => array('aţie', 'aţia'), 'replacement' => 'aţi', 'notAfter' => null),
    );

    foreach ($rules as $rule) {
        // Strict comparison for every rule: position 0 is a valid match.
        $position = $this->search($rule['suffixes']);
        if ($position === false) {
            continue;
        }

        if ($this->inR1($position)) {
            $blocked = ($rule['notAfter'] !== null)
                && (StringHelper::substr($this->word, ($position - 2), 2) == $rule['notAfter']);

            if (!$blocked) {
                // Cut at the position search() reported instead of re-scanning
                // the word with a second, duplicated suffix pattern.
                $this->word = StringHelper::substr($this->word, 0, $position) . $rule['replacement'];
            }
        }

        return true;
    }

    return false;
}
/**
 * Step 1: Reduction of combining suffixes.
 *
 * The rules below are tried in order. The first rule with a suffix that ends
 * the word terminates the step: if that suffix lies in R1 it is replaced as
 * indicated, otherwise the word is left unchanged. The caller repeats this
 * step until the word stops changing.
 *
 * @return boolean true when one of the step-1 suffixes ends the word (even if
 *                 nothing was changed because it is outside R1), false otherwise
 */
private function step1()
{
    $rules = array(
        // abilitate abilitati abilităi abilităţi: replace with abil
        array(
            array('abilitate', 'abilitati', 'abilităi', 'abilităţi'),
            'abil',
        ),
        // ibilitate: replace with ibil
        array(
            array('ibilitate'),
            'ibil',
        ),
        // ivitate ivitati ivităi ivităţi: replace with iv
        array(
            array('ivitate', 'ivitati', 'ivităi', 'ivităţi'),
            'iv',
        ),
        // icitate icitati icităi icităţi icator icatori iciv iciva icive icivi icivă ical icala icale icali icală
        // replace with ic
        array(
            array(
                'icitate', 'icitati', 'icităi', 'icităţi', 'icatori', 'icator', 'iciva',
                'icive', 'icivi', 'icivă', 'icala', 'icale', 'icali', 'icală', 'iciv', 'ical',
            ),
            'ic',
        ),
        // ativ ativa ative ativi ativă aţiune atoare ator atori ătoare ător ători: replace with at
        array(
            array('ativa', 'ative', 'ativi', 'ativă', 'ativ', 'aţiune', 'atoare', 'atori', 'ătoare', 'ători', 'ător', 'ator'),
            'at',
        ),
        // itiv itiva itive itivi itivă iţiune itoare itor itori: replace with it
        array(
            array('itiva', 'itive', 'itivi', 'itivă', 'itiv', 'iţiune', 'itoare', 'itori', 'itor'),
            'it',
        ),
    );

    foreach ($rules as $rule) {
        list($suffixes, $replacement) = $rule;

        $position = $this->search($suffixes);
        if ($position === false) {
            continue;
        }

        if ($this->inR1($position)) {
            // Replace from the position search() reported. The previous code
            // re-matched the word against a hand-copied regex whose "ic" rule
            // listed "cator" instead of "icator", so "...icator" became
            // "...iic" (the leading i survived and "ic" was appended).
            $this->word = StringHelper::substr($this->word, 0, $position) . $replacement;
        }

        return true;
    }

    return false;
}
/**
 * Step 3: Removal of verb suffixes.
 *
 * The caller runs this step only if no suffix was removed by step 1 or step 2.
 * Two suffix groups are tried in order; the first group that matches in RV
 * ends the step.
 *
 * @return boolean true when a suffix of either group was found in RV (even if
 *                 the first group's precondition prevented its removal),
 *                 false when no suffix matched
 */
private function step3()
{
    // are ere ire âre ind ând indu ându eze ească ez ezi ează esc eşti
    // eşte ăsc ăşti ăşte am ai au eam eai ea eaţi eau iam iai ia iaţi
    // iau ui aşi arăm arăţi ară uşi urăm urăţi ură işi irăm irăţi iră âi
    // âşi ârăm ârăţi âră asem aseşi ase aserăm aserăţi aseră isem iseşi ise
    // iserăm iserăţi iseră âsem âseşi âse âserăm âserăţi âseră usem useşi use userăm userăţi useră
    // delete if preceded in RV by a consonant or u
    if ( ($position = $this->searchIfInRv(array(
        'userăţi', 'iserăţi', 'âserăţi', 'aserăţi',
        'userăm', 'iserăm', 'âserăm', 'aserăm',
        'iseră', 'âseşi', 'useră', 'âseră', 'useşi', 'iseşi', 'aseră', 'aseşi', 'ârăţi', 'irăţi', 'urăţi', 'arăţi', 'ească',
        'usem', 'âsem', 'isem', 'asem', 'ârăm', 'urăm', 'irăm', 'arăm', 'iaţi', 'eaţi', 'ăşte', 'ăşti', 'eşte', 'eşti', 'ează', 'ându', 'indu',
        'âse', 'use', 'ise', 'ase', 'âră', 'iră', 'işi', 'ură', 'uşi', 'ară', 'aşi', 'âşi', 'iau', 'iai', 'iam', 'eau', 'eai', 'eam', 'ăsc',
        'are', 'ere', 'ire', 'âre', 'ind', 'ând', 'eze', 'ezi', 'esc',
        'âi', 'ui', 'ia', 'ea', 'au', 'ai', 'am', 'ez'
    ))) !== false) {
        if ($this->inRv($position)) {
            // The preceding letter must itself be inside RV.
            $before = $position - 1;
            if ($this->inRv($before)) {
                $letter = StringHelper::substr($this->word, $before, 1);

                // consonant (i.e. not a vowel), or the vowel u
                if ( (!in_array($letter, self::$vowels)) || ($letter == 'u') ) {
                    $this->word = StringHelper::substr($this->word, 0, $position);
                }
            }
        }
        return true;
    }

    // ăm aţi em eţi im iţi âm âţi seşi serăm serăţi seră sei se sesem seseşi sese seserăm seserăţi seseră
    // delete
    if ( ($position = $this->searchIfInRv(array(
        'seserăm', 'seserăţi', 'seseră', 'seseşi', 'sesem', 'serăţi', 'serăm', 'seşi', 'sese', 'seră',
        'aţi', 'eţi', 'iţi', 'âţi', 'sei', 'se', 'ăm', 'âm', 'em', 'im'
    ))) !== false) {
        if ($this->inRv($position)) {
            $this->word = StringHelper::substr($this->word, 0, $position);
        }
        return true;
    }

    // Previously the method fell off the end here and returned null despite
    // its documented boolean return.
    return false;
}
/**
 * Step 4: Removal of final vowel.
 *
 * Looks for one of the endings "a", "ie", "e", "i", "ă" and deletes it when it
 * lies in RV.
 *
 * @return boolean always true
 */
private function step4()
{
    $finalVowels = array('a', 'ie', 'e', 'i', 'ă');

    $cut = $this->search($finalVowels);
    $removable = ($cut !== false) && $this->inRv($cut);

    if ($removable) {
        $this->word = StringHelper::substr($this->word, 0, $cut);
    }

    return true;
}
83 | // group 1 84 | if ( ($position = $this->searchIfInRv(self::$perfectiveGerund[0])) !== false) { 85 | if ( ($this->inRv($position)) && ($this->checkGroup1($position)) ) { 86 | $this->word = StringHelper::substr($this->word, 0, $position); 87 | return true; 88 | } 89 | } 90 | 91 | // group 2 92 | if ( ($position = $this->searchIfInRv(self::$perfectiveGerund[1])) !== false) { 93 | if ($this->inRv($position)) { 94 | $this->word = StringHelper::substr($this->word, 0, $position); 95 | return true; 96 | } 97 | } 98 | 99 | // Otherwise try and remove a REFLEXIVE ending 100 | if ( ($position = $this->searchIfInRv(self::$reflexive)) !== false) { 101 | if ($this->inRv($position)) { 102 | $this->word = StringHelper::substr($this->word, 0, $position); 103 | } 104 | } 105 | 106 | // then search in turn for (1) an ADJECTIVAL, (2) a VERB or (3) a NOUN ending. 107 | // As soon as one of the endings (1) to (3) is found remove it, and terminate step 1. 108 | if ( ($position = $this->searchIfInRv(self::$adjective)) !== false) { 109 | if ($this->inRv($position)) { 110 | $this->word = StringHelper::substr($this->word, 0, $position); 111 | 112 | if ( ($position2 = $this->search(self::$participle[0])) !== false) { 113 | if ( ($this->inRv($position2)) && ($this->checkGroup1($position2)) ) { 114 | $this->word = StringHelper::substr($this->word, 0, $position2); 115 | return true; 116 | } 117 | } 118 | 119 | if ( ($position2 = $this->search(self::$participle[1])) !== false) { 120 | if ($this->inRv($position2)) { 121 | $this->word = StringHelper::substr($this->word, 0, $position2); 122 | return true; 123 | } 124 | } 125 | 126 | return true; 127 | } 128 | } 129 | 130 | if ( ($position = $this->searchIfInRv(self::$verb[0])) !== false) { 131 | if ( ($this->inRv($position)) && ($this->checkGroup1($position)) ) { 132 | $this->word = StringHelper::substr($this->word, 0, $position); 133 | return true; 134 | } 135 | } 136 | 137 | if ( ($position = $this->searchIfInRv(self::$verb[1])) !== false) { 
/**
 * Step 3: Search for a DERIVATIONAL ending in R2 (i.e. the entire ending must lie in R2),
 * and if one is found, remove it.
 *
 * @return boolean true when a derivational ending was removed, false otherwise
 */
private function step3()
{
    $position = $this->searchIfInRv(self::$derivational);

    if ( ($position !== false) && ($this->inR2($position)) ) {
        $this->word = StringHelper::substr($this->word, 0, $position);
        return true;
    }

    // Explicit result for the "nothing removed" path, matching steps 1 and 2
    // (the method used to fall off the end and return null).
    return false;
}
/**
 * Step 4: (1) Undouble н (n), or, (2) if the word ends with a SUPERLATIVE ending, remove it
 * and undouble н (n), or (3) if the word ends ь (') (soft sign) remove it.
 *
 * @return boolean true when the word was modified by this step, false otherwise
 */
private function step4()
{
    $changed = false;

    // (2) if the word ends with a SUPERLATIVE ending, remove it
    if ( ($position = $this->searchIfInRv(self::$superlative)) !== false) {
        $this->word = StringHelper::substr($this->word, 0, $position);
        $changed = true;
    }

    // (1) Undouble н (n): keep the first н, drop the second
    if ( ($position = $this->searchIfInRv(array('нн'))) !== false) {
        $this->word = StringHelper::substr($this->word, 0, ($position+1));
        return true;
    }

    // (3) if the word ends ь (') (soft sign) remove it
    if ( ($position = $this->searchIfInRv(array('ь'))) !== false) {
        $this->word = StringHelper::substr($this->word, 0, $position);
        return true;
    }

    // The method used to fall off the end here (returning null), even when a
    // superlative ending had just been removed.
    return $changed;
}
/**
 * Step 0: Attached pronoun
 *
 * Looks in RV for one of the pronoun suffixes
 *      me se sela selo selas selos la le lo las les los nos
 *
 * and deletes it when it comes after one of
 *      (a) iéndo ándo ár ér ír
 *      (b) ando iendo ar er ir
 *      (c) yendo following u
 *
 * in RV. In case (a) the acute accent is removed as well
 * (for example, haciéndola -> haciendo). In case (c) only yendo has to lie in
 * RV; the preceding u is read directly from the word.
 *
 * @return boolean true when a pronoun was removed, false otherwise
 */
private function step0()
{
    $pronouns = array('selas', 'selos', 'las', 'los', 'les', 'nos', 'selo', 'sela', 'me', 'se', 'la', 'le', 'lo' );

    $position = $this->searchIfInRv($pronouns);
    if ($position == false) {
        return false;
    }

    // The pronoun actually found at the end of the word.
    $suffixe = StringHelper::substr($this->word, $position);

    // (a) accented gerund / infinitive endings followed by the pronoun
    $accented = array();
    foreach (array('iéndo', 'ándo', 'ár', 'ér', 'ír') as $ending) {
        $accented[] = $ending . $suffixe;
    }

    $start = $this->searchIfInRv($accented);
    if ($start !== false) {
        // unaccent the ending, glue it back on, then cut the pronoun off
        $head = StringHelper::substr($this->word, 0, $start);
        $tail = Transliterate::utf8_latin_to_ascii(StringHelper::substr($this->word, $start));
        $this->word = StringHelper::substr($head . $tail, 0, $position);
        return true;
    }

    // (b) unaccented gerund / infinitive endings followed by the pronoun
    $plain = array();
    foreach (array('iendo', 'ando', 'ar', 'er', 'ir') as $ending) {
        $plain[] = $ending . $suffixe;
    }

    if ($this->searchIfInRv($plain) !== false) {
        $this->word = StringHelper::substr($this->word, 0, $position);
        return true;
    }

    // (c) yendo + pronoun, with a u right before it
    $start = $this->searchIfInRv(array('yendo' . $suffixe));
    if ($start != false) {
        $before = StringHelper::substr($this->word, ($start-1), 1);
        if ($before == 'u') {
            $this->word = StringHelper::substr($this->word, 0, $position);
            return true;
        }
    }

    return false;
}
'ación', 'adores', 'ador', 'antes', 'ante', 'ancias', 'ancia'))) != false) { 134 | 135 | if ($this->inR2($position)) { 136 | $this->word = StringHelper::substr($this->word, 0, $position); 137 | } 138 | 139 | if ( ($position2 = $this->searchIfInR2(array('ic')))) { 140 | $this->word = StringHelper::substr($this->word, 0, $position2); 141 | } 142 | return true; 143 | } 144 | 145 | // logía logías 146 | // replace with log if in R2 147 | if ( ($position = $this->search(array('logías', 'logía'))) != false) { 148 | if ($this->inR2($position)) { 149 | $this->word = preg_replace('#(logías|logía)$#u', 'log', $this->word); 150 | } 151 | return true; 152 | } 153 | 154 | // ución uciones 155 | // replace with u if in R2 156 | if ( ($position = $this->search(array('uciones', 'ución'))) != false) { 157 | if ($this->inR2($position)) { 158 | $this->word = preg_replace('#(uciones|ución)$#u', 'u', $this->word); 159 | } 160 | return true; 161 | } 162 | 163 | // encia encias 164 | // replace with ente if in R2 165 | if ( ($position = $this->search(array('encias', 'encia'))) != false) { 166 | if ($this->inR2($position)) { 167 | $this->word = preg_replace('#(encias|encia)$#u', 'ente', $this->word); 168 | } 169 | return true; 170 | } 171 | 172 | // amente 173 | // delete if in R1 174 | // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise, 175 | // if preceded by os, ic or ad, delete if in R2 176 | if ( ($position = $this->search(array('amente'))) != false) { 177 | 178 | // delete if in R1 179 | if ($this->inR1($position)) { 180 | $this->word = StringHelper::substr($this->word, 0, $position); 181 | } 182 | 183 | // if preceded by iv, delete if in R2 (and if further preceded by at, delete if in R2), otherwise, 184 | if ( ($position2 = $this->searchIfInR2(array('iv'))) !== false) { 185 | $this->word = StringHelper::substr($this->word, 0, $position2); 186 | if ( ($position3 = $this->searchIfInR2(array('at'))) !== false) { 187 | $this->word = 
StringHelper::substr($this->word, 0, $position3); 188 | } 189 | 190 | // if preceded by os, ic or ad, delete if in R2 191 | } elseif ( ($position4 = $this->searchIfInR2(array('os', 'ic', 'ad'))) != false) { 192 | $this->word = StringHelper::substr($this->word, 0, $position4); 193 | } 194 | return true; 195 | } 196 | 197 | // mente 198 | // delete if in R2 199 | // if preceded by ante, able or ible, delete if in R2 200 | if ( ($position = $this->search(array('mente'))) != false) { 201 | 202 | // delete if in R2 203 | if ($this->inR2($position)) { 204 | $this->word = StringHelper::substr($this->word, 0, $position); 205 | } 206 | 207 | // if preceded by ante, able or ible, delete if in R2 208 | if ( ($position2 = $this->searchIfInR2(array('ante', 'able', 'ible'))) != false) { 209 | $this->word = StringHelper::substr($this->word, 0, $position2); 210 | } 211 | return true; 212 | } 213 | 214 | // idad idades 215 | // delete if in R2 216 | // if preceded by abil, ic or iv, delete if in R2 217 | if ( ($position = $this->search(array('idades', 'idad'))) != false) { 218 | 219 | // delete if in R2 220 | if ($this->inR2($position)) { 221 | $this->word = StringHelper::substr($this->word, 0, $position); 222 | } 223 | 224 | // if preceded by abil, ic or iv, delete if in R2 225 | if ( ($position2 = $this->searchIfInR2(array('abil', 'ic', 'iv'))) != false) { 226 | $this->word = StringHelper::substr($this->word, 0, $position2); 227 | } 228 | return true; 229 | } 230 | 231 | // iva ivo ivas ivos 232 | // delete if in R2 233 | // if preceded by at, delete if in R2 234 | if ( ($position = $this->search(array('ivas', 'ivos', 'iva', 'ivo'))) != false) { 235 | 236 | // delete if in R2 237 | if ($this->inR2($position)) { 238 | $this->word = StringHelper::substr($this->word, 0, $position); 239 | } 240 | 241 | // if preceded by at, delete if in R2 242 | if ( ($position2 = $this->searchIfInR2(array('at'))) != false) { 243 | $this->word = StringHelper::substr($this->word, 0, $position2); 244 | 
/**
 * Step 2a: Verb suffixes beginning y
 *
 * Looks for one of the y-suffixes in RV and deletes it only when the letter
 * immediately before it is "u". The preceding "u" itself need not be in RV.
 *
 * @return bool true if a suffix was removed, false otherwise
 */
private function step2a()
{
    $position = $this->searchIfInRv(array(
        'yamos', 'yendo', 'yeron', 'yan', 'yen', 'yais', 'yas', 'yes', 'yo', 'yó', 'ya', 'ye'
    ));

    // No suffix found, or the suffix starts the word and therefore has no
    // preceding letter that could be a "u".
    if ($position === false || $position < 1) {
        return false;
    }

    if (StringHelper::substr($this->word, $position - 1, 1) !== 'u') {
        return false;
    }

    $this->word = StringHelper::substr($this->word, 0, $position);

    return true;
}
/**
 * Step 3: residual suffix
 *
 * Searches RV for one of the residual suffixes and removes it:
 *  - os a o á í ó : delete
 *  - e é          : delete, and if what remains ends in "gu" with the "u" in RV,
 *                   delete that "u" as well (the "g" need not be in RV)
 *
 * @return bool true if the word was modified, false otherwise
 */
private function step3()
{
    // os a o á í ó
    $position = $this->searchIfInRv(array('os', 'a', 'o', 'á', 'í', 'ó'));
    if ($position !== false && $position > 0) {
        $this->word = StringHelper::substr($this->word, 0, $position);

        return true;
    }

    // e é
    $position = $this->searchIfInRv(array('e', 'é'));
    if ($position === false || $position < 1) {
        return false;
    }
    $this->word = StringHelper::substr($this->word, 0, $position);

    // The word now ends where the e/é used to start: drop a trailing "u"
    // that sits in RV and directly follows a "g".
    $uPosition = $this->searchIfInRv(array('u'));
    if (
        $uPosition !== false
        && $uPosition > 0
        && StringHelper::substr($this->word, $uPosition - 1, 1) === 'g'
    ) {
        $this->word = StringHelper::substr($this->word, 0, $uPosition);
    }

    // The e/é was removed regardless of the "gu" rule, so report a change.
    return true;
}
/**
 * Finds which of the given suffixes terminates the current word.
 *
 * Candidates are tried in array order and the first one that ends the word
 * (and starts at or after $offset) wins, so callers that want the longest
 * match list longer suffixes first.
 *
 * @param array $suffixes candidate suffixes
 * @param int   $offset   minimum start position of the suffix inside the word
 *
 * @return int|false start position of the matching suffix, or false if none matches
 */
protected function search($suffixes, $offset = 0)
{
    $wordLength = StringHelper::strlen($this->word);

    // An offset past the end of the word can never contain a suffix.
    if ($offset > $wordLength) {
        return false;
    }

    foreach ($suffixes as $candidate) {
        $found = StringHelper::strrpos($this->word, $candidate, $offset);
        if ($found === false) {
            continue;
        }

        // Only an occurrence that runs up to the very end of the word is a suffix.
        if ((StringHelper::strlen($candidate) + $found) == $wordLength) {
            return $found;
        }
    }

    return false;
}
/**
 * Common function for R1 and R2.
 *
 * Locates the region after the first non-vowel following a vowel in $in,
 * or the end of the string if there is no such non-vowel.
 *   R1 : $in = $this->word
 *   R2 : $in = R1
 *
 * @param string $in the string to scan
 *
 * @return array two elements: the start index of the region inside $in, and the region text
 */
protected function rx($in)
{
    $length = StringHelper::strlen($in);

    // Walk the letters once, remembering whether the previous one was a vowel.
    // Only real letters are inspected, so the end of the string is never
    // mistaken for a "non-vowel" following a final vowel.
    $previousIsVowel = false;
    for ($i = 0; $i < $length; $i++) {
        $isVowel = in_array(StringHelper::substr($in, $i, 1), static::$vowels);

        if ($previousIsVowel && !$isVowel) {
            // Region starts right after this non-vowel.
            return array($i + 1, StringHelper::substr($in, $i + 1));
        }

        $previousIsVowel = $isVowel;
    }

    // No vowel / non-vowel pair: the region is empty and sits at the end.
    return array($length, '');
}
/**
 * Computes RV for the current word and stores it in $this->rv / $this->rvIndex.
 *
 * Used by spanish, italian, portuguese, etc (but not by french).
 *
 * If the second letter is a consonant, RV is the region after the next following vowel,
 * or if the first two letters are vowels, RV is the region after the next consonant,
 * and otherwise (consonant-vowel case) RV is the region after the third letter.
 * But RV is the end of the word if these positions cannot be found.
 *
 * @return bool always true
 */
protected function rv()
{
    $length = StringHelper::strlen($this->word);

    // Default: RV is empty and starts at the end of the word.
    $this->rv = '';
    $this->rvIndex = $length;

    if ($length < 3) {
        return true;
    }

    $firstIsVowel = in_array(StringHelper::substr($this->word, 0, 1), static::$vowels);
    $secondIsVowel = in_array(StringHelper::substr($this->word, 1, 1), static::$vowels);

    // Consonant-vowel case: RV is the region after the third letter.
    if (!$firstIsVowel && $secondIsVowel) {
        $this->rvIndex = 3;
        $this->rv = StringHelper::substr($this->word, 3);

        return true;
    }

    // Second letter is a consonant -> look for the next vowel.
    // First two letters are vowels  -> look for the next consonant.
    $lookingForVowel = !$secondIsVowel;
    for ($i = 2; $i < $length; $i++) {
        $isVowel = in_array(StringHelper::substr($this->word, $i, 1), static::$vowels);

        if ($isVowel === $lookingForVowel) {
            $this->rvIndex = $i + 1;
            $this->rv = StringHelper::substr($this->word, $i + 1);
            break;
        }
    }

    // When the loop finds nothing, the end-of-word defaults set above remain.
    return true;
}
/**
 * Step 1
 * Search for the longest among the following suffixes in R1, and perform the action indicated.
 *
 *  - a arna erna heterna orna ad e ade ande arne are aste en anden aren heten
 *    ern ar er heter or as arnas ernas ornas es ades andes ens arens hetens
 *    erns at andet het ast : delete
 *  - s : delete if preceded by a valid s-ending
 *
 * @return bool true if a suffix was removed, false otherwise
 */
private function step1()
{
    $position = $this->searchIfInR1(array(
        'heterna', 'hetens', 'ornas', 'andes', 'arnas', 'heter', 'ernas', 'anden', 'heten', 'andet', 'arens',
        'orna', 'arna', 'erna', 'aren', 'ande', 'ades', 'arne', 'erns', 'aste', 'ade', 'ern', 'het',
        'ast', 'are', 'ens', 'or', 'es', 'ad', 'en', 'at', 'ar', 'as', 'er', 'a', 'e'
    ));
    if ($position !== false) {
        $this->word = StringHelper::substr($this->word, 0, $position);

        return true;
    }

    // s
    $position = $this->searchIfInR1(array('s'));
    if ($position === false) {
        return false;
    }

    // Only drop the "s" when the letter left at the end is a valid s-ending.
    $stem = StringHelper::substr($this->word, 0, $position);
    if (!$this->hasValidSEnding($stem)) {
        return false;
    }

    $this->word = $stem;

    return true;
}
/**
 * Builds the stemmer registered for a language code.
 *
 * The code is lower-cased and then looked up among the codes listed
 * for each stemmer class in self::LANGS.
 *
 * @param string $code language code or name, case-insensitive
 *
 * @return Stemmer a new instance of the matching stemmer class
 *
 * @throws NotFoundException when no stemmer is registered for the code
 */
public static function create(string $code): Stemmer
{
    $code = StringHelper::strtolower($code);

    foreach (self::LANGS as $classname => $isoCodes) {
        // Both sides are strings, so compare strictly.
        if (in_array($code, $isoCodes, true)) {
            return new $classname();
        }
    }

    throw new NotFoundException(sprintf('Stemmer not found for %s', $code));
}
/**
 * Stems a word with the stemmer for the given language code.
 *
 * Stemmers are created through StemmerFactory on first use and then
 * reused for later calls with the same $isoCode string.
 *
 * @throws NotFoundException
 */
public function stem(string $word, string $isoCode): string
{
    // Reuse the stemmer already built for this code, if any.
    if (isset($this->stemmers[$isoCode])) {
        return $this->stemmers[$isoCode]->stem($word);
    }

    $stemmer = StemmerFactory::create($isoCode);
    $this->stemmers[$isoCode] = $stemmer;

    return $stemmer->stem($word);
}
| 'đ' => 'd', 104 | 'ĵ' => 'j', 105 | 'ÿ' => 'y', 106 | 'ũ' => 'u', 107 | 'ŭ' => 'u', 108 | 'ư' => 'u', 109 | 'ţ' => 't', 110 | 'ý' => 'y', 111 | 'ő' => 'o', 112 | 'â' => 'a', 113 | 'ľ' => 'l', 114 | 'ẅ' => 'w', 115 | 'ż' => 'z', 116 | 'ī' => 'i', 117 | 'ã' => 'a', 118 | 'ġ' => 'g', 119 | 'ṁ' => 'm', 120 | 'ō' => 'o', 121 | 'ĩ' => 'i', 122 | 'ù' => 'u', 123 | 'į' => 'i', 124 | 'ź' => 'z', 125 | 'á' => 'a', 126 | 'û' => 'u', 127 | 'þ' => 'th', 128 | 'ð' => 'dh', 129 | 'æ' => 'ae', 130 | 'µ' => 'u', 131 | 'ĕ' => 'e', 132 | 'œ' => 'oe', 133 | ]; 134 | } 135 | 136 | $string = str_replace(array_keys($UTF8_LOWER_ACCENTS), array_values($UTF8_LOWER_ACCENTS), $string); 137 | } 138 | 139 | if ($case >= 0) { 140 | if (\is_null($UTF8_UPPER_ACCENTS)) { 141 | $UTF8_UPPER_ACCENTS = [ 142 | 'À' => 'A', 143 | 'Ô' => 'O', 144 | 'Ď' => 'D', 145 | 'Ḟ' => 'F', 146 | 'Ë' => 'E', 147 | 'Š' => 'S', 148 | 'Ơ' => 'O', 149 | 'Ă' => 'A', 150 | 'Ř' => 'R', 151 | 'Ț' => 'T', 152 | 'Ň' => 'N', 153 | 'Ā' => 'A', 154 | 'Ķ' => 'K', 155 | 'Ŝ' => 'S', 156 | 'Ỳ' => 'Y', 157 | 'Ņ' => 'N', 158 | 'Ĺ' => 'L', 159 | 'Ħ' => 'H', 160 | 'Ṗ' => 'P', 161 | 'Ó' => 'O', 162 | 'Ú' => 'U', 163 | 'Ě' => 'E', 164 | 'É' => 'E', 165 | 'Ç' => 'C', 166 | 'Ẁ' => 'W', 167 | 'Ċ' => 'C', 168 | 'Õ' => 'O', 169 | 'Ṡ' => 'S', 170 | 'Ø' => 'O', 171 | 'Ģ' => 'G', 172 | 'Ŧ' => 'T', 173 | 'Ș' => 'S', 174 | 'Ė' => 'E', 175 | 'Ĉ' => 'C', 176 | 'Ś' => 'S', 177 | 'Î' => 'I', 178 | 'Ű' => 'U', 179 | 'Ć' => 'C', 180 | 'Ę' => 'E', 181 | 'Ŵ' => 'W', 182 | 'Ṫ' => 'T', 183 | 'Ū' => 'U', 184 | 'Č' => 'C', 185 | 'Ö' => 'Oe', 186 | 'È' => 'E', 187 | 'Ŷ' => 'Y', 188 | 'Ą' => 'A', 189 | 'Ł' => 'L', 190 | 'Ų' => 'U', 191 | 'Ů' => 'U', 192 | 'Ş' => 'S', 193 | 'Ğ' => 'G', 194 | 'Ļ' => 'L', 195 | 'Ƒ' => 'F', 196 | 'Ž' => 'Z', 197 | 'Ẃ' => 'W', 198 | 'Ḃ' => 'B', 199 | 'Å' => 'A', 200 | 'Ì' => 'I', 201 | 'Ï' => 'I', 202 | 'Ḋ' => 'D', 203 | 'Ť' => 'T', 204 | 'Ŗ' => 'R', 205 | 'Ä' => 'Ae', 206 | 'Í' => 'I', 207 | 'Ŕ' => 'R', 208 | 'Ê' => 'E', 209 | 'Ü' 
/**
 * Moves back to the start of the file and loads its first line.
 *
 * The line is split on single spaces, empty pieces are dropped, the
 * remaining pieces are re-indexed from 0 and trimmed.
 */
public function rewind()
{
    rewind($this->file);

    $tokens = explode(' ', fgets($this->file));
    $tokens = array_map('trim', array_values(array_filter($tokens)));

    $this->current = $tokens;
    $this->key = 0;
}
/**
 * Sets the iterator key to the first element of a row.
 *
 * Uses $value when it is non-empty, otherwise falls back to the row held in
 * $this->current; the key is left untouched when both are empty.
 */
protected function _updateKey($value)
{
    if ($value && count($value)) {
        $row = $value;
    } elseif (count($this->current)) {
        $row = $this->current;
    } else {
        return;
    }

    $this->key = $row[0];
}
-------------------------------------------------------------------------------- 1 | stem($word); 17 | 18 | $this->assertEquals($stem, $snowballStem); 19 | } 20 | 21 | public function load() 22 | { 23 | return new CsvFileIterator('test/files/en.txt'); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /test/FactoryTest.php: -------------------------------------------------------------------------------- 1 | 'Wamania\\Snowball\\Stemmer\\Catalan', 14 | 'cat' => 'Wamania\\Snowball\\Stemmer\\Catalan', 15 | 'catalan' => 'Wamania\\Snowball\\Stemmer\\Catalan', 16 | 'da' => 'Wamania\\Snowball\\Stemmer\\Danish', 17 | 'dan' => 'Wamania\\Snowball\\Stemmer\\Danish', 18 | 'danish' => 'Wamania\\Snowball\\Stemmer\\Danish', 19 | 'nl' => 'Wamania\\Snowball\\Stemmer\\Dutch', 20 | 'dut' => 'Wamania\\Snowball\\Stemmer\\Dutch', 21 | 'nld' => 'Wamania\\Snowball\\Stemmer\\Dutch', 22 | 'dutch' => 'Wamania\\Snowball\\Stemmer\\Dutch', 23 | 'en' => 'Wamania\\Snowball\\Stemmer\\English', 24 | 'eng' => 'Wamania\\Snowball\\Stemmer\\English', 25 | 'english' => 'Wamania\\Snowball\\Stemmer\\English', 26 | 'fr' => 'Wamania\\Snowball\\Stemmer\\French', 27 | 'fre' => 'Wamania\\Snowball\\Stemmer\\French', 28 | 'fra' => 'Wamania\\Snowball\\Stemmer\\French', 29 | 'french' => 'Wamania\\Snowball\\Stemmer\\French', 30 | 'de' => 'Wamania\\Snowball\\Stemmer\\German', 31 | 'deu' => 'Wamania\\Snowball\\Stemmer\\German', 32 | 'ger' => 'Wamania\\Snowball\\Stemmer\\German', 33 | 'german' => 'Wamania\\Snowball\\Stemmer\\German', 34 | 'it' => 'Wamania\\Snowball\\Stemmer\\Italian', 35 | 'ita' => 'Wamania\\Snowball\\Stemmer\\Italian', 36 | 'italian' => 'Wamania\\Snowball\\Stemmer\\Italian', 37 | 'no' => 'Wamania\\Snowball\\Stemmer\\Norwegian', 38 | 'nor' => 'Wamania\\Snowball\\Stemmer\\Norwegian', 39 | 'norwegian' => 'Wamania\\Snowball\\Stemmer\\Norwegian', 40 | 'pt' => 'Wamania\\Snowball\\Stemmer\\Portuguese', 41 | 'por' => 'Wamania\\Snowball\\Stemmer\\Portuguese', 42 
| 'portuguese' => 'Wamania\\Snowball\\Stemmer\\Portuguese', 43 | 'ro' => 'Wamania\\Snowball\\Stemmer\\Romanian', 44 | 'rum' => 'Wamania\\Snowball\\Stemmer\\Romanian', 45 | 'ron' => 'Wamania\\Snowball\\Stemmer\\Romanian', 46 | 'romanian' => 'Wamania\\Snowball\\Stemmer\\Romanian', 47 | 'ru' => 'Wamania\\Snowball\\Stemmer\\Russian', 48 | 'rus' => 'Wamania\\Snowball\\Stemmer\\Russian', 49 | 'russian' => 'Wamania\\Snowball\\Stemmer\\Russian', 50 | 'es' => 'Wamania\\Snowball\\Stemmer\\Spanish', 51 | 'spa' => 'Wamania\\Snowball\\Stemmer\\Spanish', 52 | 'spanish' => 'Wamania\\Snowball\\Stemmer\\Spanish', 53 | 'sv' => 'Wamania\\Snowball\\Stemmer\\Swedish', 54 | 'swe' => 'Wamania\\Snowball\\Stemmer\\Swedish', 55 | 'swedish' => 'Wamania\\Snowball\\Stemmer\\Swedish', 56 | ]; 57 | 58 | foreach ($isoCodes as $isoCode => $classname) { 59 | $stemmer = StemmerFactory::create($isoCode); 60 | 61 | $this->assertTrue($stemmer instanceof $classname); 62 | } 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /test/FinnishTest.php: -------------------------------------------------------------------------------- 1 | stem($word); 17 | 18 | $this->assertEquals($stem, $snowballStem); 19 | } 20 | 21 | public function load() 22 | { 23 | return new CsvFileIterator('test/files/fi.txt'); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /test/FrenchTest.php: -------------------------------------------------------------------------------- 1 | stem($word); 17 | 18 | $this->assertEquals($stem, $snowballStem); 19 | } 20 | 21 | public function load() 22 | { 23 | return new CsvFileIterator('test/files/fr.txt'); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /test/GermanTest.php: -------------------------------------------------------------------------------- 1 | stem($word); 17 | 18 | $this->assertEquals($stem, $snowballStem); 19 | } 20 | 21 | public 
function load() 22 | { 23 | return new CsvFileIterator('test/files/de.txt'); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /test/ItalianTest.php: -------------------------------------------------------------------------------- 1 | stem($word); 17 | 18 | $this->assertEquals($stem, $snowballStem); 19 | } 20 | 21 | public function load() 22 | { 23 | return new CsvFileIterator('test/files/it.txt'); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /test/ManagerTest.php: -------------------------------------------------------------------------------- 1 | assertEquals('anticonstitutionnel', $stemmerManager->stem('anticonstitutionnelement', 'fr')); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /test/NorwegianTest.php: -------------------------------------------------------------------------------- 1 | stem($word); 17 | 18 | $this->assertEquals($stem, $snowballStem); 19 | } 20 | 21 | public function load() 22 | { 23 | return new CsvFileIterator('test/files/no.txt'); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /test/PortugueseTest.php: -------------------------------------------------------------------------------- 1 | stem($word); 17 | 18 | $this->assertEquals($stem, $snowballStem); 19 | } 20 | 21 | public function load() 22 | { 23 | return new CsvFileIterator('test/files/pt.txt'); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /test/RomanianTest.php: -------------------------------------------------------------------------------- 1 | stem($word); 17 | 18 | $this->assertEquals($stem, $snowballStem); 19 | } 20 | 21 | public function load() 22 | { 23 | return new CsvFileIterator('test/files/ro.txt'); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- 
/test/RussianTest.php: -------------------------------------------------------------------------------- 1 | stem($word); 17 | 18 | $this->assertEquals($stem, $snowballStem); 19 | } 20 | 21 | public function load() 22 | { 23 | return new CsvFileIterator('test/files/ru.txt'); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /test/SpanishTest.php: -------------------------------------------------------------------------------- 1 | stem($word); 17 | 18 | $this->assertEquals($stem, $snowballStem); 19 | } 20 | 21 | public function load() 22 | { 23 | return new CsvFileIterator('test/files/es.txt'); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /test/SwedishTest.php: -------------------------------------------------------------------------------- 1 | stem($word); 17 | 18 | $this->assertEquals($stem, $snowballStem); 19 | } 20 | 21 | public function load() 22 | { 23 | return new CsvFileIterator('test/files/sw.txt'); 24 | } 25 | } 26 | --------------------------------------------------------------------------------