├── .editorconfig ├── .github └── workflows │ └── run-tests.yml ├── .gitignore ├── LICENSE ├── README.md ├── composer.json ├── console ├── arr_to_regex.php ├── extractor.php ├── stopwords_en_US.json └── stopwords_en_US.txt ├── examples ├── LaravelControllerExample.php ├── de_DE_example.php ├── en_US_example.php ├── es_AR_example.php ├── fa_IR_example.php ├── fr_FR_example.php ├── it_IT_example.php └── pt_BR_example.php ├── lang ├── af_ZA.pattern ├── af_ZA.php ├── ar_AE.pattern ├── ar_AE.php ├── ckb_IQ.pattern ├── ckb_IQ.php ├── de_DE.pattern ├── de_DE.php ├── en_US.pattern ├── en_US.php ├── es_AR.pattern ├── es_AR.php ├── fa_IR.pattern ├── fa_IR.php ├── fr_FR.pattern ├── fr_FR.php ├── it_IT.pattern ├── it_IT.php ├── nl_NL.pattern ├── nl_NL.php ├── pl_PL.pattern ├── pl_PL.php ├── pt_BR.pattern ├── pt_BR.php ├── pt_PT.pattern ├── pt_PT.php ├── ru_RU.pattern ├── ru_RU.php ├── sv_SE.pattern ├── sv_SE.php ├── ta_TA.pattern ├── ta_TA.php ├── tr_TR.pattern └── tr_TR.php ├── phpunit.xml ├── src ├── ILangParseOptions.php ├── LangParseOptions.php ├── RakePlus.php └── StopwordProviders │ ├── AbstractStopwordProvider.php │ ├── StopwordsArray.php │ ├── StopwordsPHP.php │ └── StopwordsPatternFile.php └── tests ├── LangParseOptionsTest.php ├── RakePlusTest.php ├── StopwordsPHPTest.php ├── StopwordsPatternFileTest.php ├── fixtures ├── empty_lang.php ├── en_US.php └── string_lang.php ├── lang ├── en_US.ereg.pattern └── en_US.non_ereg.pattern ├── lang_ar_AE_Test.php ├── lang_ckb_IQ_Test.php ├── lang_es_AR_Test.php ├── lang_fr_FR_Test.php └── lang_sv_SE_Test.php /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | charset = utf-8 5 | indent_size = 4 6 | indent_style = space 7 | end_of_line = lf 8 | insert_final_newline = true 9 | trim_trailing_whitespace = true 10 | 11 | [*.md] 12 | trim_trailing_whitespace = false 13 | insert_final_newline = false 14 | 15 | [*.{yml,yaml,json}] 16 | indent_size = 2 17 | 
-------------------------------------------------------------------------------- /.github/workflows/run-tests.yml: -------------------------------------------------------------------------------- 1 | name: Run Tests 2 | 3 | on: 4 | push: 5 | tags: 6 | - '*' 7 | pull_request: 8 | workflow_dispatch: 9 | 10 | jobs: 11 | test: 12 | runs-on: ubuntu-latest 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | php: [7.4, 8.0, 8.1, 8.2, 8.3, 8.4] 17 | stability: [prefer-lowest, prefer-stable] 18 | 19 | name: P${{ matrix.php }} - ${{ matrix.stability }} 20 | 21 | steps: 22 | - name: Checkout code 23 | uses: actions/checkout@v4 24 | 25 | - name: Setup PHP 26 | uses: shivammathur/setup-php@v2 27 | with: 28 | php-version: ${{ matrix.php }} 29 | extensions: dom, curl, libxml, mbstring, zip, pcntl, pdo, sqlite, pdo_sqlite 30 | coverage: none 31 | 32 | - name: Get composer cache directory 33 | id: composer-cache 34 | run: echo "dir=$(composer config cache-files-dir)" >> $GITHUB_OUTPUT 35 | 36 | - name: Cache composer dependencies 37 | uses: actions/cache@v4 38 | with: 39 | path: ${{ steps.composer-cache.outputs.dir }} 40 | key: ${{ runner.os }}-php-${{ matrix.php }}-composer-${{ matrix.stability }}-${{ hashFiles('**/composer.json') }} 41 | restore-keys: ${{ runner.os }}-php-${{ matrix.php }}-composer-${{ matrix.stability }}- 42 | 43 | - name: Install dependencies 44 | run: composer update --${{ matrix.stability }} --prefer-dist --no-interaction 45 | 46 | - name: Execute tests 47 | run: vendor/bin/phpunit --no-coverage 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Project Specific # 2 | #################### 3 | /keys 4 | *.kml 5 | .idea 6 | *.sublime-project 7 | *.sublime-workspace 8 | settings.json 9 | .directory 10 | /.phpunit.cache 11 | /build 12 | /vendor 13 | /coverage 14 | Homestead.yaml 15 | Homestead.json 16 | .env 17 | 
_ide_helper.php 18 | composer.lock 19 | composer.phar 20 | .phpunit.result.cache 21 | packages.txt 22 | 23 | # Compiled source # 24 | ################### 25 | *.pyc 26 | *.com 27 | *.class 28 | *.dll 29 | *.exe 30 | *.o 31 | *.so 32 | *.bin 33 | 34 | # Packages # 35 | ############ 36 | # it's better to unpack these files and commit the raw source 37 | # git has its own built in compression methods 38 | *.7z 39 | *.dmg 40 | *.gz 41 | *.iso 42 | *.jar 43 | *.rar 44 | *.tar 45 | *.zip 46 | 47 | # Logs and databases # 48 | ###################### 49 | *.log 50 | *.sqlite 51 | 52 | # OS generated files # 53 | ###################### 54 | .DS_Store 55 | .DS_Store? 56 | ._* 57 | .Spotlight-V100 58 | .Trashes 59 | Icon? 60 | ehthumbs.db 61 | Thumbs.db 62 | 63 | # General # 64 | ########### 65 | *.key 66 | *.csv 67 | *~ 68 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Don Schoeman 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "donatello-za/rake-php-plus", 3 | "type": "library", 4 | "description": "Yet another PHP implementation of the Rapid Automatic Keyword Extraction algorithm (RAKE).", 5 | "keywords": [ 6 | "rake", 7 | "rapid", 8 | "automatic", 9 | "keyword", 10 | "extraction", 11 | "algorithm" 12 | ], 13 | "homepage": "https://github.com/Donatello-za/rake-php-plus", 14 | "support": { 15 | "issues": "https://github.com/Donatello-za/rake-php-plus/issues", 16 | "source": "https://github.com/Donatello-za/rake-php-plus" 17 | }, 18 | "license": "MIT", 19 | "authors": [ 20 | { 21 | "name": "Don Schoeman", 22 | "email": "ta.maximus@gmail.com" 23 | } 24 | ], 25 | "require": { 26 | "php": ">=7.4.0", 27 | "ext-json": "*", 28 | "ext-mbstring": "*" 29 | }, 30 | "require-dev": { 31 | "phpunit/phpunit": "^9.5|^10.1|^11.0" 32 | }, 33 | "autoload": { 34 | "psr-4": { 35 | "DonatelloZa\\RakePlus\\": "src/" 36 | } 37 | }, 38 | "autoload-dev": { 39 | "psr-4": { 40 | "Tests\\": "tests/" 41 | } 42 | }, 43 | "extra": { 44 | "branch-alias": { 45 | "dev-master": "1.0.13-dev" 46 | } 47 | }, 48 | "scripts": { 49 | "test": "./vendor/bin/phpunit --no-coverage", 50 | "test-coverage": "./vendor/bin/phpunit --coverage-html build/coverage", 51 | "phpunit": "./vendor/bin/phpunit" 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /console/arr_to_regex.php: -------------------------------------------------------------------------------- 1 | 0) { 94 | render_pattern_output($stopwords); 95 | } else { 
96 | echo "\n"; 97 | echo "Error: No stopwords found in file \"$php_file\".\n"; 98 | echo "\n"; 99 | exit(1); 100 | } 101 | } 102 | 103 | check_args($argc); 104 | 105 | $stopwords_file = get_arg($argv, 1); 106 | $stopwords = load_stopwords($stopwords_file); 107 | 108 | /** @noinspection PhpUnhandledExceptionInspection */ 109 | render_output( 110 | $stopwords, 111 | $stopwords_file 112 | ); 113 | -------------------------------------------------------------------------------- /console/extractor.php: -------------------------------------------------------------------------------- 1 | A is 26 | * important and for the tool to sort languages other than 27 | * English properly it needs to set the locale using PHP's 28 | * setlocale() function which depends on your system's 29 | * available locales. To check your locales on Linux run: 30 | * 31 | * $ locale -a 32 | * 33 | * To install more locales: 34 | * 35 | * $ sudo locale-gen es_AR 36 | * $ sudo locale-gen es_AR.utf8 37 | */ 38 | 39 | /** 40 | * @param int $arg_count 41 | */ 42 | function check_args(int $arg_count) 43 | { 44 | if ($arg_count < 2) { 45 | echo "\n"; 46 | echo "Error: Please specify the filename of the stopwords file to extract.\n"; 47 | echo "Example:\n"; 48 | echo " php ./console/extractor.php stopwords_en_US.txt --locale=en_US --output=php\n"; 49 | echo " php ./console extractor.php stopwords_en_US.json --locale=en_US --output=php\n"; 50 | echo "\n"; 51 | echo "For better RakePlus performance, use the --output argument to produce\n"; 52 | echo "regular expression pattern instead of a PHP script.\n"; 53 | echo "Example:\n"; 54 | echo " php ./console/extractor.php stopwords_en_US.txt --locale=en_US --output=pattern\n"; 55 | echo " php ./console/extractor.php stopwords_en_US.json --locale=en_US --output=pattern\n"; 56 | echo "\n"; 57 | echo "You can pipe the output of this tool directly into a\n"; 58 | echo ".php or .pattern file:\n"; 59 | echo "Example:\n"; 60 | echo " php ./console/extractor.php 
stopwords_en_US.txt --locale=en_US --output=php > en_US.php\n"; 61 | echo " php ./console/extractor.php stopwords_en_US.json --locale=en_US --output=pattern > en_US.pattern\n"; 62 | echo " php ./console/extractor.php en_US.php --locale=en_US --output=pattern > en_US.pattern\n"; 63 | echo "\n"; 64 | 65 | exit(1); 66 | } 67 | } 68 | 69 | /** 70 | * @param array $args 71 | * @param int $arg_no 72 | * @param mixed $default 73 | * 74 | * @return mixed 75 | */ 76 | function get_arg_by_index(array $args, int $arg_no, $default = null) 77 | { 78 | if ($arg_no < count($args)) { 79 | return $args[$arg_no]; 80 | } else { 81 | return $default; 82 | } 83 | } 84 | 85 | /** 86 | * @param array $args 87 | * @param string $name 88 | * @param mixed $default 89 | * 90 | * @return mixed 91 | */ 92 | function get_arg_by_name(array $args, string $name, $default = null) 93 | { 94 | foreach ($args as $arg) { 95 | list($key, $value) = array_pad(explode('=', $arg), 2, $default); 96 | if ($key == $name) { 97 | return $value; 98 | } 99 | } 100 | 101 | return $default; 102 | } 103 | 104 | /** 105 | * Returns true if one of the arguments consists 106 | * of the supplied $arg. 107 | * 108 | * @param array $args 109 | * @param string $name 110 | * 111 | * @return bool 112 | */ 113 | function has_arg(array $args, string $name): bool 114 | { 115 | if (in_array($name, $args)) { 116 | return true; 117 | } 118 | 119 | return false; 120 | } 121 | 122 | /** 123 | * Loads stopwords from a .txt, .json or .php file. 
/**
 * Loads stopwords from a .txt, .json or .php file.
 *
 * Supported formats (selected by the file extension, case-insensitively):
 *  - .txt:  one stopword per line; leading/trailing Unicode separators and
 *           control characters are trimmed, blank lines and lines starting
 *           with "#" are skipped.
 *  - .json: the file must decode to an array of stopwords.
 *  - .php:  the file must return an array of stopwords.
 *
 * Duplicates are removed and the first-seen order is kept. Every returned
 * stopword is a string. Any other extension yields an empty array.
 *
 * If the file is missing, unreadable or does not contain a list of
 * stopwords, an error message is printed and the script exits with code 1.
 *
 * @param string $stopwords_file Path of the file to read.
 *
 * @return array Unique stopwords, in the order they first appeared.
 */
function load_stopwords(string $stopwords_file): array
{
    // Reports a fatal problem in the same three-line format the rest of the
    // tool uses and terminates the script.
    $fail = static function (string $message): void {
        echo "\n";
        echo "Error: $message\n";
        echo "\n";
        exit(1);
    };

    // De-duplicates a list of words while preserving first-seen order.
    // Values that cannot be used as a stopword (arrays, null, ...) are ignored.
    $unique = static function (array $words): array {
        $seen = [];
        foreach ($words as $word) {
            if (is_string($word) || is_int($word) || is_float($word)) {
                $seen[(string) $word] = true;
            }
        }

        // PHP converts numeric-string array keys such as "10" to integers,
        // so cast them back to keep the result a list of strings.
        return array_map('strval', array_keys($seen));
    };

    if (!file_exists($stopwords_file)) {
        $fail("Stopwords file \"$stopwords_file\" not found.");
    }

    $ext = strtolower(pathinfo($stopwords_file, PATHINFO_EXTENSION));

    if ($ext === 'txt') {
        $handle = is_readable($stopwords_file) ? fopen($stopwords_file, 'r') : false;
        if ($handle === false) {
            $fail("Could not read text file \"$stopwords_file\".");
        }

        $words = [];
        try {
            while (($line = fgets($handle)) !== false) {
                $line = preg_replace('/^[\pZ\pC]+|[\pZ\pC]+$/u', '', $line);

                // preg_replace() returns null when the line is not valid
                // UTF-8; such lines are skipped. The explicit '' comparison
                // (instead of empty()) keeps the stopword "0".
                if ($line === null || $line === '' || $line[0] === '#') {
                    continue;
                }

                $words[] = $line;
            }
        } finally {
            // Always release the handle, even if reading throws.
            fclose($handle);
        }

        return $unique($words);
    }

    if ($ext === 'json') {
        $raw = file_get_contents($stopwords_file);
        $decoded = ($raw === false) ? null : json_decode($raw, true);
        if (!is_array($decoded)) {
            $fail("Could not read a list of stopwords from JSON file \"$stopwords_file\".");
        }

        return $unique($decoded);
    }

    if ($ext === 'php') {
        // NOTE(review): this executes the given file. That is acceptable for
        // a developer-run console tool, but never pass an untrusted path here.
        $loaded = require $stopwords_file;
        if (!is_array($loaded)) {
            $fail("PHP file \"$stopwords_file\" did not return an array of stopwords.");
        }

        return $unique($loaded);
    }

    return [];
}
174 | * 175 | * @param array $stopwords 176 | * 177 | * @throws Exception 178 | */ 179 | function render_php_output(array $stopwords) 180 | { 181 | $stopword_count = count($stopwords); 182 | $timestamp = (new DateTime('now', new DateTimeZone('UTC')))->format(DateTimeInterface::ATOM); 183 | 184 | echo "\xEF\xBB\xBF 0) { 255 | if ($output == 'pattern') { 256 | render_pattern_output($stopwords); 257 | } else if ($output == 'php') { 258 | render_php_output($stopwords); 259 | } else if ($output == 'json') { 260 | render_json_output($stopwords); 261 | } 262 | 263 | } else { 264 | echo "\n"; 265 | echo "Error: No stopwords found in file \"$stopwords_file\".\n"; 266 | echo "\n"; 267 | exit(1); 268 | } 269 | } 270 | 271 | check_args($argc); 272 | 273 | $stopwords_file = get_arg_by_index($argv, 1); 274 | $stopwords = load_stopwords($stopwords_file); 275 | 276 | $locale = get_arg_by_name($argv, '--locale'); 277 | if ($locale === null) { 278 | echo "Please specify the locale, e.g. --locale=en_US\n"; 279 | } 280 | 281 | if (!has_arg($argv, '--nosort')) { 282 | $result = setlocale(LC_COLLATE, $locale . '.utf8'); 283 | if (!has_arg($argv, '--ascending')) { 284 | usort($stopwords, function ($a, $b) { 285 | return strcoll($b, $a); 286 | }); 287 | } else { 288 | usort($stopwords, function ($a, $b) { 289 | return strcoll($a, $b); 290 | }); 291 | } 292 | 293 | /* 294 | if (!has_arg($argv, '--ascending')) { 295 | rsort($stopwords); 296 | } else { 297 | sort($stopwords); 298 | } 299 | */ 300 | } 301 | 302 | $OUTPUT_TYPES = ['pattern', 'php', 'json']; 303 | $output = get_arg_by_name($argv, '--output'); 304 | if (!in_array($output, $OUTPUT_TYPES)) { 305 | echo "Please specify the output format, e.g. 
--output=pattern, --output=php or --output=json\n"; 306 | exit(1); 307 | } 308 | 309 | /** @noinspection PhpUnhandledExceptionInspection */ 310 | render_output($stopwords, $stopwords_file, $output); 311 | 312 | -------------------------------------------------------------------------------- /examples/LaravelControllerExample.php: -------------------------------------------------------------------------------- 1 | input('article_text'); 19 | $keywords = RakePlus::create($text)->keywords(); 20 | 21 | // Store the array of keywords to a database table etc. 22 | // .... 23 | 24 | // Handle rest of the request... 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /examples/de_DE_example.php: -------------------------------------------------------------------------------- 1 | keywords(); 18 | print "The keywords for \"$argv[1]\" is:\n"; 19 | print_r($keywords); 20 | 21 | $phrases = RakePlus::create($argv[1], 'de_DE')->get(); 22 | print "The phrases for \"$argv[1]\" is:\n"; 23 | print_r($phrases); 24 | 25 | -------------------------------------------------------------------------------- /examples/en_US_example.php: -------------------------------------------------------------------------------- 1 | keywords(); 18 | print "The keywords for \"$argv[1]\" is:\n"; 19 | print_r($keywords); 20 | 21 | $phrases = RakePlus::create($argv[1])->get(); 22 | print "The phrases for \"$argv[1]\" is:\n"; 23 | print_r($phrases); 24 | 25 | -------------------------------------------------------------------------------- /examples/es_AR_example.php: -------------------------------------------------------------------------------- 1 | keywords(); 18 | print "Resultados de palabras clave: \"$argv[1]\"\n"; 19 | print_r($keywords); 20 | 21 | $phrases = RakePlus::create($argv[1], 'es_AR')->get(); 22 | print "Resultados de la frase: \"$argv[1]\"\n"; 23 | print_r($phrases); 24 | 
-------------------------------------------------------------------------------- /examples/fa_IR_example.php: -------------------------------------------------------------------------------- 1 | keywords(); 18 | print "The keywords for \"$argv[1]\" is:\n"; 19 | print_r($keywords); 20 | 21 | $phrases = RakePlus::create($argv[1])->get(); 22 | print "The phrases for \"$argv[1]\" is:\n"; 23 | print_r($phrases); 24 | -------------------------------------------------------------------------------- /examples/fr_FR_example.php: -------------------------------------------------------------------------------- 1 | keywords(); 18 | print "Résultats de mots clés: \"$argv[1]\"\n"; 19 | print_r($keywords); 20 | 21 | $phrases = RakePlus::create($argv[1], 'fr_FR')->get(); 22 | print "Résultats de la phrase: \"$argv[1]\"\n"; 23 | print_r($phrases); 24 | -------------------------------------------------------------------------------- /examples/it_IT_example.php: -------------------------------------------------------------------------------- 1 | keywords(); 18 | print "Parole chiave estratte da \"$argv[1]\":\n"; 19 | print_r($keywords); 20 | 21 | $phrases = RakePlus::create($argv[1])->get(); 22 | print "Frasi estratte da \"$argv[1]\":\n"; 23 | print_r($phrases); 24 | -------------------------------------------------------------------------------- /examples/pt_BR_example.php: -------------------------------------------------------------------------------- 1 | keywords(); 18 | print "Resultados de palabras clave: \"$argv[1]\"\n"; 19 | print_r($keywords); 20 | 21 | $phrases = RakePlus::create($argv[1], 'pt_BR')->scores(); 22 | print "Resultados de la frase: \"$argv[1]\"\n"; 23 | print_r($phrases); 24 | -------------------------------------------------------------------------------- /lang/af_ZA.pattern: -------------------------------------------------------------------------------- 1 | 
/\bwat\b|\bwas\b|\bvir\b|\bvan\b|\buit\b|\btoe\b|\bte\b|\bsy\b|\bso\b|\bsien\b|\bse\b|\bsal\b|\bsaam\b|\bop\b|\bons\b|\bom\b|\bnie\b|\bna\b|\bʼn(?!(-|'))\b|\b'n\b|\bmy\b|\bmet\b|\bmaar\b|\bma\b|\bkom\b|\bkan\b|\bjy\b|\bjou\b|\bis\b|\bin\b|\bhy\b|\bhulle\b|\bhom\b|\bhet\b|\bhaar\b|\bgesê\b|\bgaan\b|\ben\b|\bek\b|\been\b|\bdit\b|\bdie\b|\bdat\b|\bdag\b|\bdaar\b|\bby\b|\bbaie\b|\bas\b|\bal\b|\baf\b|\baan\b/i 2 | -------------------------------------------------------------------------------- /lang/af_ZA.php: -------------------------------------------------------------------------------- 1 |  2 | 11 | 12 | 13 | tests 14 | 15 | 16 | 17 | 18 | src 19 | 20 | 21 | src/autoload.php 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /src/ILangParseOptions.php: -------------------------------------------------------------------------------- 1 | language = $language; 27 | 28 | $this->setLineTerminator("\n"); 29 | 30 | switch ($language) { 31 | case 'ckb_IQ': 32 | case 'ar_AE': 33 | $this->setSentenceRegEx('[-؛؟،“.!?,;:\t\"\(\)]'); 34 | break; 35 | 36 | default: 37 | $this->setSentenceRegEx('[.!?,;:\t\"\(\)]'); 38 | } 39 | } 40 | 41 | /** 42 | * Instantiates a language parse options instance. 43 | * 44 | * @param string $language 45 | * 46 | * @return static 47 | */ 48 | public static function create(string $language = 'en_US'): ILangParseOptions 49 | { 50 | return (new self($language)); 51 | } 52 | 53 | /** 54 | * Returns the language that was specified when instantiating the options class. 55 | * 56 | * @return string 57 | */ 58 | public function getLanguage(): string 59 | { 60 | return $this->language; 61 | } 62 | 63 | /** 64 | * Set the text parsing options. 65 | * 66 | * @param string $sentence_regex The regular expression to use when 67 | * splitting sentences. 
68 | * 69 | * @return static 70 | */ 71 | public function setSentenceRegEx(string $sentence_regex): ILangParseOptions 72 | { 73 | $this->sentence_regex = $sentence_regex; 74 | return $this; 75 | } 76 | 77 | /** 78 | * Returns the regular expression that is used to split sentences. 79 | * 80 | * @return string 81 | */ 82 | public function getSentenceRegex(): string 83 | { 84 | return $this->sentence_regex; 85 | } 86 | 87 | /** 88 | * Returns the line terminator that is typically used in the source text. 89 | * 90 | * @return string 91 | */ 92 | public function getLineTerminator(): string 93 | { 94 | return $this->line_terminator; 95 | } 96 | 97 | /** 98 | * Sets the line terminator that is typically used in the source text. 99 | * 100 | * @param string $line_terminator 101 | * 102 | * @return $this 103 | */ 104 | public function setLineTerminator(string $line_terminator): ILangParseOptions 105 | { 106 | $this->line_terminator = $line_terminator; 107 | return $this; 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /src/StopwordProviders/AbstractStopwordProvider.php: -------------------------------------------------------------------------------- 1 | 0) { 19 | $this->stopwords = $stopwords; 20 | $this->pattern = $this->buildPatternFromArray($stopwords); 21 | } else { 22 | throw new RuntimeException('The language array can not be empty.'); 23 | } 24 | } 25 | 26 | /** 27 | * Creates a new instance of the StopwordsArray class. 28 | * 29 | * @param array $stopwords 30 | * 31 | * @return StopwordsArray 32 | */ 33 | public static function create(array $stopwords): StopwordsArray 34 | { 35 | return (new self($stopwords)); 36 | } 37 | 38 | /** 39 | * Returns a string containing a regular expression pattern. 40 | * 41 | * @return string 42 | */ 43 | public function pattern(): string 44 | { 45 | return $this->pattern; 46 | } 47 | 48 | /** 49 | * Returns an array of stopwords. 
50 | * 51 | * @return array 52 | */ 53 | public function stopwords(): array 54 | { 55 | return $this->stopwords; 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/StopwordProviders/StopwordsPHP.php: -------------------------------------------------------------------------------- 1 | filename = $filename; 21 | $this->stopwords = $this->loadLangPHPFile($filename); 22 | $this->pattern = $this->buildPatternFromArray($this->stopwords); 23 | } 24 | 25 | /** 26 | * Creates a new instance of the StopwordsPHP class. 27 | * 28 | * @param string $filename 29 | * 30 | * @return StopwordsPHP 31 | */ 32 | public static function create(string $filename): StopwordsPHP 33 | { 34 | return (new self($filename)); 35 | } 36 | 37 | /** 38 | * Creates a new instance of the StopwordsPHP class 39 | * but automatically determines the filename to use 40 | * based on the language string provided. 41 | * 42 | * The function looks in the ./lang directory for a file called 43 | * xxxx.php file where xxxx is the language string you specified. 44 | * 45 | * @param string $language (Default is en_US) 46 | * 47 | * @return StopwordsPHP 48 | */ 49 | public static function createFromLanguage(string $language = 'en_US'): StopwordsPHP 50 | { 51 | return (new self(self::languageFile($language))); 52 | } 53 | 54 | /** 55 | * Returns the full path to the language file containing the 56 | * stopwords. 57 | * 58 | * @param string $language 59 | * 60 | * @return string 61 | */ 62 | public static function languageFile(string $language = 'en_US'): string 63 | { 64 | return __DIR__ . '/../../lang/' . $language . '.php'; 65 | } 66 | 67 | /** 68 | * Returns a string containing a regular expression pattern. 69 | * 70 | * @return string 71 | */ 72 | public function pattern(): string 73 | { 74 | return $this->pattern; 75 | } 76 | 77 | /** 78 | * Returns an array of stopwords. 
79 | * 80 | * @return array 81 | */ 82 | public function stopwords(): array 83 | { 84 | return $this->stopwords; 85 | } 86 | 87 | /** 88 | * Returns the originally supplied filename 89 | * 90 | * @return string 91 | */ 92 | public function filename(): string 93 | { 94 | return $this->filename; 95 | } 96 | 97 | /** 98 | * Loads the specified language file and returns with the results. 99 | * 100 | * @param string $language_file 101 | * 102 | * @return array 103 | */ 104 | protected function loadLangPHPFile(string $language_file): array 105 | { 106 | if (!file_exists($language_file)) { 107 | throw new RuntimeException("Could not find the RAKE stopwords file: $language_file"); 108 | } 109 | 110 | $stopwords = include($language_file); 111 | 112 | if (!is_array($stopwords)) { 113 | throw new RuntimeException("Invalid results retrieved from RAKE stopwords file: $language_file"); 114 | } 115 | 116 | if (count($stopwords) < 1) { 117 | throw new RuntimeException("No words found in RAKE stopwords file: $language_file"); 118 | } 119 | 120 | return $stopwords; 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /src/StopwordProviders/StopwordsPatternFile.php: -------------------------------------------------------------------------------- 1 | filename = $filename; 19 | $this->pattern = $this->loadLangPatternFile($filename); 20 | } 21 | 22 | /** 23 | * Creates a new instance of the StopwordsPatternFile class. 24 | * 25 | * @param string $filename 26 | * 27 | * @return StopwordsPatternFile 28 | */ 29 | public static function create(string $filename): StopwordsPatternFile 30 | { 31 | return (new self($filename)); 32 | } 33 | 34 | /** 35 | * Creates a new instance of the StopwordsPHP class 36 | * but automatically determines the filename to use 37 | * based on the language string provided. 
38 | * 39 | * The function looks in the ./lang directory for a file called 40 | * xxxx.pattern file where xxxx is the language string you specified. 41 | * 42 | * @param string $language (Default is en_US) 43 | * 44 | * @return StopwordsPatternFile 45 | */ 46 | public static function createFromLanguage(string $language = 'en_US'): StopwordsPatternFile 47 | { 48 | return (new self(self::languageFile($language))); 49 | } 50 | 51 | /** 52 | * Returns the full path to the language file containing the 53 | * stopwords. 54 | * 55 | * @param string $language 56 | * 57 | * @return string 58 | */ 59 | public static function languageFile(string $language = 'en_US'): string 60 | { 61 | return __DIR__ . '/../../lang/' . $language . '.pattern'; 62 | } 63 | 64 | /** 65 | * Returns a string containing a regular expression pattern. 66 | * 67 | * @return string 68 | */ 69 | public function pattern(): string 70 | { 71 | return $this->pattern; 72 | } 73 | 74 | /** 75 | * Returns the originally supplied filename 76 | * 77 | * @return string 78 | */ 79 | public function filename(): string 80 | { 81 | return $this->filename; 82 | } 83 | 84 | /** 85 | * Loads the specified language file and returns with the results. 
86 | * 87 | * @param string $language_file 88 | * 89 | * @return false|string 90 | */ 91 | protected function loadLangPatternFile(string $language_file) 92 | { 93 | if (!file_exists($language_file)) { 94 | throw new RuntimeException("Could not find the RAKE stopwords file: $language_file"); 95 | } 96 | 97 | // Trim leading "/" character and trailing "/i" if it exists in the string 98 | $pattern = trim(file_get_contents($language_file)); 99 | 100 | if (mb_substr($pattern, 0, 1) === '/' && mb_substr($pattern, -2) === '/i') { 101 | return mb_substr($pattern, 1, -2); 102 | } 103 | 104 | return $pattern; 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /tests/LangParseOptionsTest.php: -------------------------------------------------------------------------------- 1 | assertEquals('en_US', $options->getLanguage()); 17 | $this->assertEquals('[.!?,;:\t\"\(\)]', $options->getSentenceRegex()); 18 | $this->assertEquals("\n", $options->getLineTerminator()); 19 | } 20 | 21 | public function testUnknownLanguageConstruct() 22 | { 23 | $options = LangParseOptions::create('en_Whatever'); 24 | $this->assertEquals('en_Whatever', $options->getLanguage()); 25 | $this->assertEquals('[.!?,;:\t\"\(\)]', $options->getSentenceRegex()); 26 | $this->assertEquals("\n", $options->getLineTerminator()); 27 | } 28 | 29 | public function test_en_USLanguageConstruct() 30 | { 31 | $options = LangParseOptions::create('en_US'); 32 | $this->assertEquals('en_US', $options->getLanguage()); 33 | $this->assertEquals('[.!?,;:\t\"\(\)]', $options->getSentenceRegex()); 34 | $this->assertEquals("\n", $options->getLineTerminator()); 35 | } 36 | 37 | public function test_ar_AE_LanguageConstruct() 38 | { 39 | $options = LangParseOptions::create('ar_AE'); 40 | $this->assertEquals('ar_AE', $options->getLanguage()); 41 | $this->assertEquals('[-؛؟،“.!?,;:\t\"\(\)]', $options->getSentenceRegex()); 42 | $this->assertEquals("\n", $options->getLineTerminator()); 43 | 
} 44 | 45 | public function test_ckb_IQ_LanguageConstruct() 46 | { 47 | $options = LangParseOptions::create('ckb_IQ'); 48 | $this->assertEquals('ckb_IQ', $options->getLanguage()); 49 | $this->assertEquals('[-؛؟،“.!?,;:\t\"\(\)]', $options->getSentenceRegex()); 50 | $this->assertEquals("\n", $options->getLineTerminator()); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /tests/StopwordsPHPTest.php: -------------------------------------------------------------------------------- 1 | expectException(RuntimeException::class); 13 | $this->expectExceptionMessage('file'); 14 | 15 | StopwordsPHP::create('wrong'); 16 | } 17 | 18 | public function testThrowsExceptionWhenIncorrectFile() 19 | { 20 | $this->expectException(RuntimeException::class); 21 | $this->expectExceptionMessage('Invalid results'); 22 | 23 | StopwordsPHP::create(__DIR__ . '/fixtures/string_lang.php'); 24 | } 25 | 26 | public function testThrowsExceptionWhenEmptyFile() 27 | { 28 | $this->expectException(RuntimeException::class); 29 | $this->expectExceptionMessage('No words found'); 30 | 31 | StopwordsPHP::create(__DIR__ . 
'/fixtures/empty_lang.php'); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /tests/StopwordsPatternFileTest.php: -------------------------------------------------------------------------------- 1 | expectException(RuntimeException::class); 13 | $this->expectExceptionMessage('file'); 14 | 15 | StopwordsPatternFile::create('wrong'); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /tests/fixtures/empty_lang.php: -------------------------------------------------------------------------------- 1 | get(); 12 | 13 | $this->assertCount(2, $phrases); 14 | $this->assertContains('يا أمجد', $phrases); 15 | $this->assertContains('افتح الباب', $phrases); 16 | } 17 | 18 | public function testFullStopInArabicPhrase() 19 | { 20 | $text = ".ذهب الفتى إلى الحديقة ليلعب مع أصدقائه"; 21 | $phrases = RakePlus::create($text, 'ar_AE', 0, false)->get(); 22 | 23 | $this->assertCount(2, $phrases); 24 | $this->assertContains('ذهب الفتى إلى الحديقة ليلعب', $phrases); 25 | $this->assertContains('أصدقائه', $phrases); 26 | } 27 | 28 | public function testQuotationsInArabicPhrase() 29 | { 30 | // Note that this test uses mixed (") marks, i.e. 
a quotation that starts 31 | // with Unicode character %U201C and ends with a standard ASCII (") 32 | $text = "“.قال عماد لأخيه : \"لا تنس أنني سأكون دائمًا معك، فلا داعي للقلق"; 33 | $phrases = RakePlus::create($text, 'ar_AE', 0, false)->get(); 34 | 35 | $this->assertCount(3, $phrases); 36 | $this->assertContains('عماد لأخيه', $phrases); 37 | $this->assertContains('تنس أنني سأكون دائمًا معك', $phrases); 38 | } 39 | 40 | public function testRoundBracketsInArabicPhrase() 41 | { 42 | $text = ".الظروف الطبيعية القاسية (البرد الشديد ثم الجفاف) أفسدت موسم الفواكه هذا العام"; 43 | $phrases = RakePlus::create($text, 'ar_AE', 0, false)->get(); 44 | 45 | $this->assertCount(5, $phrases); 46 | $this->assertContains('الظروف الطبيعية القاسية', $phrases); 47 | $this->assertContains('البرد الشديد', $phrases); 48 | $this->assertContains('الجفاف', $phrases); 49 | $this->assertContains('أفسدت موسم الفواكه', $phrases); 50 | $this->assertContains('العام', $phrases); 51 | } 52 | 53 | public function testColonInArabicPhrase() 54 | { 55 | $text = "“.قال عماد لأخيه : \"لا تنس أنني سأكون دائمًا معك، فلا داعي للقلق"; 56 | $phrases = RakePlus::create($text, 'ar_AE', 0, false)->get(); 57 | 58 | $this->assertCount(3, $phrases); 59 | $this->assertContains('عماد لأخيه', $phrases); 60 | $this->assertContains('تنس أنني سأكون دائمًا معك', $phrases); 61 | $this->assertContains('فلا داعي للقلق', $phrases); 62 | } 63 | 64 | public function testDashesAndQuestionMarkInArabicPhrase() 65 | { 66 | $text = "هل أعدت لندى ساعتها التي نسيتها؟- 67 | 68 | بالطبع، أعدتها لها بالأمس- 69 | 70 | ممتاز-"; 71 | 72 | $phrases = RakePlus::create($text, 'ar_AE', 0, false)->get(); 73 | 74 | // $this->assertContains('', $phrases); 75 | $this->assertCount(6, $phrases); 76 | $this->assertContains('هل أعدت لندى ساعتها', $phrases); 77 | $this->assertContains('نسيتها', $phrases); 78 | $this->assertContains('بالطبع', $phrases); 79 | $this->assertContains('أعدتها', $phrases); 80 | $this->assertContains('بالأمس', $phrases); 81 | 
$this->assertContains('ممتاز', $phrases); 82 | } 83 | 84 | public function testExclamationMarkInArabicPhrase() 85 | { 86 | $text = "“.قال عماد لأخيه : \"لا تنس أنني سأكون دائمًا معك، فلا داعي للقلق"; 87 | $phrases = RakePlus::create($text, 'ar_AE', 0, false)->get(); 88 | 89 | $this->assertCount(3, $phrases); 90 | $this->assertContains('عماد لأخيه', $phrases); 91 | $this->assertContains('تنس أنني سأكون دائمًا معك', $phrases); 92 | $this->assertContains('فلا داعي للقلق', $phrases); 93 | } 94 | 95 | public function testSemicolonInArabicPhrase() 96 | { 97 | $text = "اجتهد الطالب في مذاكرته، فكان الأول على رفاقه."; 98 | $phrases = RakePlus::create($text, 'ar_AE', 0, false)->get(); 99 | 100 | $this->assertCount(4, $phrases); 101 | $this->assertContains('اجتهد الطالب', $phrases); 102 | $this->assertContains('مذاكرته', $phrases); 103 | $this->assertContains('فكان الأول', $phrases); 104 | $this->assertContains('رفاقه', $phrases); 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /tests/lang_ckb_IQ_Test.php: -------------------------------------------------------------------------------- 1 | get(); 12 | 13 | $this->assertCount(7, $phrases); 14 | $this->assertContains('كوردێكی دانیشتوی فینلاند', $phrases); 15 | $this->assertContains('تابلۆیه‌كی به‌ ناوی', $phrases); 16 | $this->assertContains('ڤایرۆسی كۆرۆنا', $phrases); 17 | $this->assertContains('كێشا', $phrases); 18 | $this->assertContains('ئێسته‌ له‌ یه‌كێك له‌ به‌ناوبانگترین ماڵپه‌ڕه‌كانی فرۆشتنی تابلۆی ئۆنلاین', $phrases); 19 | $this->assertContains('خستویه‌تییه‌ڕو', $phrases); 20 | $this->assertContains('فرۆشتن', $phrases); 21 | } 22 | 23 | public function testGeneralKurdishKeywords() 24 | { 25 | $text = "كوردێكی دانیشتوی فینلاند، تابلۆیه‌كی به‌ ناوی \"ڤایرۆسی كۆرۆنا\" كێشا، ئێسته‌ له‌ یه‌كێك له‌ به‌ناوبانگترین ماڵپه‌ڕه‌كانی فرۆشتنی تابلۆی ئۆنلاین، خستویه‌تییه‌ڕو بۆ فرۆشتن."; 26 | $keywords = RakePlus::create($text, 'ckb_IQ', 0, false)->keywords(); 27 | 28 | 
$this->assertCount(19, $keywords); 29 | $this->assertContains('كوردێكی', $keywords); 30 | $this->assertContains('دانیشتوی', $keywords); 31 | $this->assertContains('فینلاند', $keywords); 32 | $this->assertContains('تابلۆیه‌كی', $keywords); 33 | $this->assertContains('به‌', $keywords); 34 | $this->assertContains('ناوی', $keywords); 35 | $this->assertContains('ڤایرۆسی', $keywords); 36 | $this->assertContains('كۆرۆنا', $keywords); 37 | $this->assertContains('كێشا', $keywords); 38 | $this->assertContains('ئێسته‌', $keywords); 39 | $this->assertContains('له‌', $keywords); 40 | $this->assertContains('یه‌كێك', $keywords); 41 | $this->assertContains('به‌ناوبانگترین', $keywords); 42 | $this->assertContains('ماڵپه‌ڕه‌كانی', $keywords); 43 | $this->assertContains('فرۆشتنی', $keywords); 44 | $this->assertContains('تابلۆی', $keywords); 45 | $this->assertContains('ئۆنلاین', $keywords); 46 | $this->assertContains('خستویه‌تییه‌ڕو', $keywords); 47 | $this->assertContains('فرۆشتن', $keywords); 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /tests/lang_es_AR_Test.php: -------------------------------------------------------------------------------- 1 | get(); 18 | 19 | $this->assertCount(19, $phrases); 20 | 21 | $this->assertContains('saritha', $phrases); 22 | $this->assertContains('viendo', $phrases); 23 | $this->assertContains('película', $phrases); 24 | $this->assertContains('bollywood', $phrases); 25 | $this->assertContains('novio chris', $phrases); 26 | $this->assertContains('películas', $phrases); 27 | $this->assertContains('filmadas', $phrases); 28 | $this->assertContains('lengua hindi', $phrases); 29 | $this->assertContains('industria cinematográfica', $phrases); 30 | $this->assertContains('sede', $phrases); 31 | $this->assertContains('mumbai', $phrases); 32 | $this->assertContains('india', $phrases); 33 | $this->assertContains('\'b\'', $phrases); 34 | $this->assertContains('bollywood viene', $phrases); 35 | 
$this->assertContains('\'bombay\'', $phrases); 36 | $this->assertContains('antiguo nombre', $phrases); 37 | $this->assertContains('películas tradicionales', $phrases); 38 | $this->assertContains('intención', $phrases); 39 | $this->assertContains('realistas', $phrases); 40 | } 41 | 42 | public function testKeywordsMinimumLength() 43 | { 44 | $text = "Saritha está viendo una película de Bollywood con su novio Chris. " . 45 | "Las películas de Bollywood son filmadas en lengua hindi por la industria " . 46 | "cinematográfica con sede en Mumbai, India. La 'B' de Bollywood viene de 'Bombay', " . 47 | "el antiguo nombre de Mumbai. Las películas tradicionales de Bollywood no tienen " . 48 | "la intención de ser realistas."; 49 | 50 | $rake = RakePlus::create($text, 'es_AR', 8, false); 51 | $keywords = $rake->sortByScore('desc')->keywords(); 52 | $this->assertCount(10, $keywords); 53 | 54 | $this->assertContains('industria', $keywords); 55 | $this->assertContains('cinematográfica', $keywords); 56 | $this->assertContains('películas', $keywords); 57 | $this->assertContains('tradicionales', $keywords); 58 | $this->assertContains('bollywood', $keywords); 59 | $this->assertContains('película', $keywords); 60 | $this->assertContains('filmadas', $keywords); 61 | $this->assertContains('\'bombay\'', $keywords); 62 | $this->assertContains('intención', $keywords); 63 | $this->assertContains('realistas', $keywords); 64 | } 65 | 66 | public function testKeywordsWithNumbers() 67 | { 68 | $text = "6462 Little Crest Suite 413, Lake Carlietown, WA 12643"; 69 | $keywords = RakePlus::create($text, 'es_AR', 0, false)->keywords(); 70 | 71 | $this->assertCount(9, $keywords); 72 | 73 | $this->assertContains('6462', $keywords); 74 | $this->assertContains('little', $keywords); 75 | $this->assertContains('crest', $keywords); 76 | $this->assertContains('suite', $keywords); 77 | $this->assertContains('lake', $keywords); 78 | $this->assertContains('carlietown', $keywords); 79 | 
$this->assertContains('wa', $keywords); 80 | $this->assertContains('12643', $keywords); 81 | 82 | foreach ($keywords as $keyword) { 83 | $this->assertIsString($keyword); 84 | } 85 | } 86 | 87 | public function testNumberedKeywordLimitedLengths() 88 | { 89 | $text = "6462 Little Crest Suite 413, Lake Carlietown, WA 12643"; 90 | $keywords = RakePlus::create($text, 'es_AR', 3, true)->keywords(); 91 | 92 | $this->assertCount(5, $keywords); 93 | 94 | $this->assertContains('little', $keywords); 95 | $this->assertContains('crest', $keywords); 96 | $this->assertContains('suite', $keywords); 97 | $this->assertContains('lake', $keywords); 98 | $this->assertContains('carlietown', $keywords); 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /tests/lang_fr_FR_Test.php: -------------------------------------------------------------------------------- 1 | get(); 12 | 13 | $this->assertCount(6, $phrases); 14 | 15 | $this->assertContains('l\'arabie saoudite', $phrases); 16 | $this->assertContains('l\'accueil', $phrases); 17 | $this->assertContains('dakar s\'inscrit', $phrases); 18 | $this->assertContains('plan visant', $phrases); 19 | $this->assertContains('préparer l\'', $phrases); 20 | $this->assertContains('-pétrole', $phrases); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /tests/lang_sv_SE_Test.php: -------------------------------------------------------------------------------- 1 | get(); 19 | 20 | $this->assertCount(33, $phrases); 21 | 22 | $this->assertContains('mytologiska varelser', $phrases); 23 | $this->assertContains('sägs skydda gårdar', $phrases); 24 | $this->assertContains('förförisk kvinnlig varelse', $phrases); 25 | $this->assertContains('fördärv', $phrases); 26 | $this->assertContains('svensk folklore', $phrases); 27 | $this->assertContains('landets rika kulturarv', $phrases); 28 | } 29 | } 30 | 
--------------------------------------------------------------------------------