├── .env ├── .github └── workflows │ └── tests.yml ├── .gitignore ├── LICENSE ├── README.md ├── composer.json ├── docker-compose.yml ├── docker └── cli │ └── Dockerfile ├── phpunit.xml ├── res └── sample1.txt ├── src ├── TextRankFacade.php └── Tool │ ├── Graph.php │ ├── Parser.php │ ├── Score.php │ ├── StopWords │ ├── Arabic.php │ ├── Dutch.php │ ├── English.php │ ├── French.php │ ├── German.php │ ├── Indonesian.php │ ├── Italian.php │ ├── Norwegian.php │ ├── Russian.php │ ├── Spanish.php │ ├── StopWordsAbstract.php │ └── Turkish.php │ ├── Summarize.php │ └── Text.php └── tests └── TextRankFacadeTest.php /.env: -------------------------------------------------------------------------------- 1 | COMPOSE_PROJECT_NAME=PHP-Science-TextRank 2 | 3 | PREFIX=php-science-textrank 4 | 5 | SOURCE_DIR=./ 6 | TARGET_DIR=/var/www/html 7 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: tests 2 | 3 | on: 4 | push: 5 | pull_request: 6 | 7 | jobs: 8 | run: 9 | runs-on: ${{ matrix.operating-system }} 10 | strategy: 11 | matrix: 12 | operating-system: [ubuntu-latest] 13 | php-versions: ['8.0', '8.1', '8.2', '8.3'] 14 | name: PHP ${{ matrix.php-versions }} Test on ${{ matrix.operating-system }} 15 | steps: 16 | - name: Checkout 17 | uses: actions/checkout@v2 18 | 19 | - name: Setup PHP 20 | uses: shivammathur/setup-php@v2 21 | with: 22 | php-version: ${{ matrix.php-versions }} 23 | coverage: xdebug 24 | 25 | - name: Validate composer files 26 | run: composer validate 27 | 28 | - name: Install dependencies 29 | if: steps.composer-cache.outputs.cache-hit != 'true' 30 | run: composer install --prefer-dist --no-progress --no-suggest 31 | 32 | - name: Run test suite 33 | run: composer test 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.idea 2 | /.tmp 3 | /.phpunit.result.cache 4 | /composer.lock 5 | /vendor -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | MIT License 3 | 4 | Copyright (c) 2016-2021 PHP-Science 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | TextRank 3 |

4 | 5 |

6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 |

19 | 20 |

21 | This source code is an implementation of TextRank algorithm in PHP programming language, under MIT licence.
22 |
23 |

24 | 25 | # TextRank vs. ChatGPT 26 | GPTs like ChatGPT are supervised language models that understand the context and generate new content from the given 27 | input using vast resources while TextRank is a cost-efficient/low-cost text extraction algorithm. TextRank algorithm 28 | also can be used as a pre-processor to a GPT model to reduce the text size to save on resource consumption. 29 | 30 | # TextRank or Automatic summarization 31 | > Automatic summarization is the process of reducing a text document with a computer program in order to create a summary that retains the most important points of the original document. Technologies that can make a coherent summary take into account variables such as length, writing style and syntax. Automatic data summarization is part of machine learning and data mining. The main idea of summarization is to find a representative subset of the data, which contains the information of the entire set. Summarization technologies are used in a large number of sectors in industry today. - Wikipedia 32 | 33 | The algorithm of this implementation is: 34 | * Extracts sentences, 35 | * Removes stopwords, 36 | * Adds integer values to words by finding and counting the matching words, 37 | * Weights the values of the words, 38 | * Normalizes values to get the scores, 39 | * Sorts by scores 40 | 41 | # Install to use it in your project 42 | ``` 43 | cd your-project-folder 44 | composer require php-science/textrank 45 | ``` 46 | 47 | # Install for contributing 48 | ``` 49 | cd git-project-folder 50 | docker-compose build 51 | docker-compose up -d 52 | composer install 53 | composer test 54 | ``` 55 | 56 | # Examples 57 | ```php 58 | 59 | use PhpScience\TextRank\Tool\StopWords\English; 60 | 61 | // String contains a long text, see the /res/sample1.txt file. 62 | $text = "Lorem ipsum..."; 63 | 64 | $api = new TextRankFacade(); 65 | // English implementation for stopwords/junk words: 66 | $stopWords = new English(); 67 | $api->setStopWords($stopWords); 68 | 69 | // Array of the most important keywords: 70 | $result = $api->getOnlyKeyWords($text); 71 | 72 | // Array of the sentences from the most important part of the text: 73 | $result = $api->getHighlights($text); 74 | 75 | // Array of the most important sentences from the text: 76 | $result = $api->summarizeTextBasic($text); 77 | ``` 78 | More examples: 79 | * [tests/TextRankFacadeTest.php](https://github.com/DavidBelicza/PHP-Science-TextRank/blob/master/tests/TextRankFacadeTest.php) 80 | * https://php.science 81 | 82 | # Authors, Contributors 83 | 84 | Name | GitHub user 85 | --- | --- 86 | David Belicza | @DavidBelicza 87 | Riccardo Marton | @riccardomarton 88 | Syndesi | @Syndesi 89 | vincentsch | @vincentsch 90 | Andrew Welch | @khalwat 91 | Andrey Astashov | @mvcaaa 92 | Leo Toneff | @bragle 93 | Willy Arisky | @willyarisky 94 | Robert-Jan Keizer | @KeizerDev 95 | Morty | @evil1morty 96 | Sezer Fidancı | @SezerFidanci 97 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "php-science/textrank", 3 | "description": "TextRank (automatic text summarization) for PHP.", 4 | "keywords": ["science", "textrank", "automatic", "summarization", "PHP", "PHP8", "strict", "ai", "artificial", "intelligence"], 5 | "license": "MIT", 6 | "authors": [ 7 | { 8 | "name": "David Belicza", 9 | "email": "david@belicza.com" 10 | } 11 | ], 12 | "require": { 13 | "php": ">=7.2", 14 | "ext-ctype": "*", 15 | "ext-mbstring": "*" 16 | }, 17 | "require-dev": { 18 | "phpunit/phpunit": "9.*" 19 | }, 20 | "autoload": { 21 | "psr-4": { 22 | "PhpScience\\TextRank\\": ["src/"] 23 | } 24 | }, 25 | "autoload-dev": { 26 | "psr-4": { 27 | "PhpScience\\TextRank\\": ["tests/"] 28 | } 29 | }, 30 | "scripts": { 31 | "test": "phpunit --colors='always' $(pwd)/tests" 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | 3 | services: 4 | 5 | cli: 6 | container_name: ${PREFIX}_cli 7 | image: ${PREFIX}_cli 8 | build: ./docker/cli 9 | volumes: 10 | - ${SOURCE_DIR}:${TARGET_DIR} 11 | stdin_open: true 12 | -------------------------------------------------------------------------------- /docker/cli/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM php:8.2-cli 2 | 3 | ENV TZ=Europe/Budapest 4 | ENV DEBIAN_FRONTEND=noninteractive 5 | 6 | RUN apt-get update 7 | 8 | RUN apt-get install -y \ 9 | libfreetype6-dev \ 10 | libicu-dev \ 11 | libjpeg62-turbo-dev \ 12 | libmcrypt-dev \ 13 | libxslt1-dev 14 | 15 | RUN apt-get install --no-install-recommends -y \ 16 | tzdata \ 17 | zip \ 18 | unzip \ 19 | git 20 | 21 | RUN apt-get install -y software-properties-common 22 | 23 | RUN php -r "copy('https://getcomposer.org/installer', 'composer-setup.php');" 24 | RUN php -r "if (hash_file('sha384', 'composer-setup.php') === 'e21205b207c3ff031906575712edab6f13eb0b361f2085f1f1237b7126d785e826a450292b6cfd1d64d92e6563bbde02') { echo 'Installer verified'; } else { echo 'Installer corrupt'; unlink('composer-setup.php'); } echo PHP_EOL;" 25 | RUN php composer-setup.php 26 | RUN php -r "unlink('composer-setup.php');" 27 | RUN mv composer.phar /usr/local/bin/composer 28 | 29 | WORKDIR /var/www/html 30 | -------------------------------------------------------------------------------- /phpunit.xml: -------------------------------------------------------------------------------- 1 | 2 | 11 | 15 | 16 | 17 | 18 | ./tests/ 19 | 20 | 21 | 22 | 23 | 24 | src 25 | 26 | 27 | -------------------------------------------------------------------------------- /res/sample1.txt: -------------------------------------------------------------------------------- 1 | Over the past fortnight we asked you to nominate your top extensions for the GNOME desktop. And you did just that. Having now sifted through the hundreds of entries, we’re ready to reveal your favourite GNOME Shell extensions. GNOME 3 (which is more commonly used with the GNOME Shell) has an extension framework that lets developers (and users) extend, build on, and shape how the desktop looks, acts and functions. Dash to Dock takes the GNOME Dash — this is the ‘favourites bar’ that appears on the left-hand side of the screen in the Activities overlay — and transforms it into a desktop dock. And just like Plank, Docky or AWN you can add app launchers, rearrange them, and use them to minimise, restore and switch between app windows. Dash to Dock has many of the common “Dock” features you’d expect, including autohide and intellihide, a fixed-width mode, adjustable icon size, and custom themes. My biggest pet peeve with GNOME Shell is its legacy app tray that hides in the bottom left of the screen. All extraneous non-system applets, indicators and tray icons hide down here. This makes it a little harder to use applications that rely on a system tray presence, like Skype, Franz, Telegram, and Dropbox. TopIcons Plus is the quick way to put GNOME system tray icons back where they belong: on show and in reach. The extension moves legacy tray icons from the bottom left of Gnome Shell to the right-hand side of the top panel. A well-stocked settings panel lets you adjust icon opacity, color, padding, size and tray position. Dive into the settings to adjust the sizing, styling and positioning of icons. Like the popular daily stimulant of choice, the Caffeine GNOME extension keeps your computer awake. It couldn’t be simpler to use: just click the empty mug icon. An empty cup means you’re using normal auto suspend rules – e.g., a screensaver – while a freshly brewed cup of coffee means auto suspend and screensaver are turned off. The Caffeine GNOME extension supports GNOME Shell 3.4 or later. Familiar with applications like Guake and Tilda? If so, you’ll instantly see the appeal of the (superbly named) Drop Down Terminal GNOME extension. When installed just tap the key above the tab key (though it can be changed to almost any key you wish) to get instant access to the command line. Want to speed up using workspaces? This simple tool lets you do just that. Once installed you can quickly switch between workspaces by scrolling over the top panel - no need to enter the Activities Overlay! 2 | -------------------------------------------------------------------------------- /src/TextRankFacade.php: -------------------------------------------------------------------------------- 1 | 8 | */ 9 | 10 | declare(strict_types=1); 11 | 12 | namespace PhpScience\TextRank; 13 | 14 | use PhpScience\TextRank\Tool\Graph; 15 | use PhpScience\TextRank\Tool\Parser; 16 | use PhpScience\TextRank\Tool\Score; 17 | use PhpScience\TextRank\Tool\StopWords\StopWordsAbstract; 18 | use PhpScience\TextRank\Tool\Summarize; 19 | 20 | /** 21 | * Class TextRankFacade 22 | * 23 | * This Facade class is capable to find the keywords in a raw text, weigh them 24 | * and retrieve the most important sentences from the whole text. It is an 25 | * implementation of the TextRank algorithm. 26 | * 27 | * 28 | * $stopWords = new English(); 29 | * 30 | * $textRank = new TextRankFacade(); 31 | * $textRank->setStopWords($stopWords); 32 | * 33 | * $sentences = $textRank->summarizeTextFreely( 34 | * $rawText, 35 | * 5, 36 | * 2, 37 | * Summarize::GET_ALL_IMPORTANT 38 | * ); 39 | * 40 | * 41 | * @package PhpScience\TextRank 42 | */ 43 | class TextRankFacade 44 | { 45 | /** 46 | * Stop Words 47 | * 48 | * Stop Words to ignore because of dummy words. These words will not be Key 49 | * Words. A, like, no yes, one, two, I, you for example. 50 | * 51 | * @see \PhpScience\TextRank\Tool\StopWords\English 52 | * 53 | * @var StopWordsAbstract 54 | */ 55 | protected $stopWords; 56 | 57 | /** 58 | * Set Stop Words. 59 | * 60 | * @param StopWordsAbstract $stopWords Stop Words to ignore because of 61 | * dummy words. 62 | */ 63 | public function setStopWords(StopWordsAbstract $stopWords) 64 | { 65 | $this->stopWords = $stopWords; 66 | } 67 | 68 | /** 69 | * Only Keywords 70 | * 71 | * It retrieves the possible keywords with their scores from a text. 72 | * 73 | * @param string $rawText A single raw text. 74 | * 75 | * @return array Array from Keywords. Key is the parsed word, value is the 76 | * word score. 77 | */ 78 | public function getOnlyKeyWords(string $rawText): array 79 | { 80 | $parser = new Parser(); 81 | $parser->setMinimumWordLength(3); 82 | $parser->setRawText($rawText); 83 | 84 | if ($this->stopWords) { 85 | $parser->setStopWords($this->stopWords); 86 | } 87 | 88 | $text = $parser->parse(); 89 | 90 | $graph = new Graph(); 91 | $graph->createGraph($text); 92 | 93 | $score = new Score(); 94 | 95 | return $score->calculate( 96 | $graph, $text 97 | ); 98 | } 99 | 100 | /** 101 | * Highlighted Texts 102 | * 103 | * It finds the most important sentences from a text by the most important 104 | * keywords and these keywords also found by automatically. It retrieves 105 | * the most important sentences what are 20 percent of the full text. 106 | * 107 | * @param string $rawText A single raw text. 108 | * 109 | * @return array An array from sentences. 110 | */ 111 | public function getHighlights(string $rawText): array 112 | { 113 | $parser = new Parser(); 114 | $parser->setMinimumWordLength(3); 115 | $parser->setRawText($rawText); 116 | 117 | if ($this->stopWords) { 118 | $parser->setStopWords($this->stopWords); 119 | } 120 | 121 | $text = $parser->parse(); 122 | $maximumSentences = (int) (count($text->getSentences()) * 0.2); 123 | 124 | $graph = new Graph(); 125 | $graph->createGraph($text); 126 | 127 | $score = new Score(); 128 | $scores = $score->calculate($graph, $text); 129 | 130 | $summarize = new Summarize(); 131 | 132 | return $summarize->getSummarize( 133 | $scores, 134 | $graph, 135 | $text, 136 | 12, 137 | $maximumSentences, 138 | Summarize::GET_ALL_IMPORTANT 139 | ); 140 | } 141 | 142 | /** 143 | * Compounds a Summarized Text 144 | * 145 | * It finds the three most important sentences from a text by the most 146 | * important keywords and these keywords also found by automatically. It 147 | * retrieves these important sentences. 148 | * 149 | * @param string $rawText A single raw text. 150 | * 151 | * @return array An array from sentences. 152 | */ 153 | public function summarizeTextCompound(string $rawText): array 154 | { 155 | $parser = new Parser(); 156 | $parser->setMinimumWordLength(3); 157 | $parser->setRawText($rawText); 158 | 159 | if ($this->stopWords) { 160 | $parser->setStopWords($this->stopWords); 161 | } 162 | 163 | $text = $parser->parse(); 164 | 165 | $graph = new Graph(); 166 | $graph->createGraph($text); 167 | 168 | $score = new Score(); 169 | $scores = $score->calculate($graph, $text); 170 | 171 | $summarize = new Summarize(); 172 | 173 | return $summarize->getSummarize( 174 | $scores, 175 | $graph, 176 | $text, 177 | 10, 178 | 3, 179 | Summarize::GET_ALL_IMPORTANT 180 | ); 181 | } 182 | 183 | /** 184 | * Summarized Text 185 | * 186 | * It finds the most important sentence from a text by the most important 187 | * keywords and these keywords also found by automatically. It retrieves 188 | * the most important sentence and its following sentences. 189 | * 190 | * @param string $rawText A single raw text. 191 | * 192 | * @return array An array from sentences. 193 | */ 194 | public function summarizeTextBasic(string $rawText): array 195 | { 196 | $parser = new Parser(); 197 | $parser->setMinimumWordLength(3); 198 | $parser->setRawText($rawText); 199 | 200 | if ($this->stopWords) { 201 | $parser->setStopWords($this->stopWords); 202 | } 203 | 204 | $text = $parser->parse(); 205 | 206 | $graph = new Graph(); 207 | $graph->createGraph($text); 208 | 209 | $score = new Score(); 210 | $scores = $score->calculate($graph, $text); 211 | 212 | $summarize = new Summarize(); 213 | 214 | return $summarize->getSummarize( 215 | $scores, 216 | $graph, 217 | $text, 218 | 10, 219 | 3, 220 | Summarize::GET_FIRST_IMPORTANT_AND_FOLLOWINGS 221 | ); 222 | } 223 | 224 | /** 225 | * Freely Summarized Text. 226 | * 227 | * It retrieves the most important sentences from a text by the most important 228 | * keywords and these keywords also found by automatically. 229 | * 230 | * @param string $rawText A single raw text. 231 | * @param int $analyzedKeyWords Maximum number of the most important 232 | * Key Words to analyze the text. 233 | * @param int $expectedSentences How many sentence should be retrieved. 234 | * @param int $summarizeType Highlights from the text or a part of 235 | * the text. 236 | * 237 | * @return array An array from sentences. 238 | */ 239 | public function summarizeTextFreely( 240 | string $rawText, 241 | int $analyzedKeyWords, 242 | int $expectedSentences, 243 | int $summarizeType 244 | ): array { 245 | $parser = new Parser(); 246 | $parser->setMinimumWordLength(3); 247 | $parser->setRawText($rawText); 248 | 249 | if ($this->stopWords) { 250 | $parser->setStopWords($this->stopWords); 251 | } 252 | 253 | $text = $parser->parse(); 254 | 255 | $graph = new Graph(); 256 | $graph->createGraph($text); 257 | 258 | $score = new Score(); 259 | $scores = $score->calculate($graph, $text); 260 | 261 | $summarize = new Summarize(); 262 | 263 | return $summarize->getSummarize( 264 | $scores, 265 | $graph, 266 | $text, 267 | $analyzedKeyWords, 268 | $expectedSentences, 269 | $summarizeType 270 | ); 271 | } 272 | } 273 | -------------------------------------------------------------------------------- /src/Tool/Graph.php: -------------------------------------------------------------------------------- 1 | 8 | */ 9 | 10 | declare(strict_types=1); 11 | 12 | namespace PhpScience\TextRank\Tool; 13 | 14 | /** 15 | * Class Graph 16 | * 17 | * This graph store the sentences and their words with the indexes. This graph 18 | * is the full map of the whole text. 19 | * 20 | * @package PhpScience\TextRank\Tool 21 | */ 22 | class Graph 23 | { 24 | /** 25 | * Key is the word, value is an array with the sentence IDs. 26 | * 27 | * @var array 28 | */ 29 | protected $graph = []; 30 | 31 | /** 32 | * Create Graph. 33 | * 34 | * It creates a graph and save it into the graph property. 35 | * 36 | * @param Text $text Text object contains the parsed and prepared text 37 | * data. 38 | */ 39 | public function createGraph(Text &$text) 40 | { 41 | $wordMatrix = $text->getWordMatrix(); 42 | 43 | foreach ($wordMatrix as $sentenceIdx => $words) { 44 | $idxArray = array_keys($words); 45 | 46 | foreach ($idxArray as $idxKey => $idxValue) { 47 | $connections = []; 48 | 49 | if (isset($idxArray[$idxKey - 1])) { 50 | $connections[] = $idxArray[$idxKey - 1]; 51 | } 52 | 53 | if (isset($idxArray[$idxKey + 1])) { 54 | $connections[] = $idxArray[$idxKey + 1]; 55 | } 56 | 57 | $this->graph[$words[$idxValue]][$sentenceIdx][$idxValue] = $connections; 58 | } 59 | } 60 | } 61 | 62 | /** 63 | * Graph. 64 | * 65 | * It retrieves the graph. Key is the word, value is an array with the 66 | * sentence IDs. 67 | * 68 | * 69 | * array( 70 | * 'apple' => array( // word 71 | * 2 => array( // ID of the sentence 72 | * 52 => array( // ID of the word in the sentence 73 | * 51, 53 // IDs of the closest words to the apple word 74 | * ), 75 | * 10 => array( // IDs of the closest words to the apple word 76 | * 9, 11 // IDs of the closest words to the apple word 77 | * ), 78 | * 5 => array(6) 79 | * ), 80 | * 6 => array( 81 | * 9 => array(8, 10) 82 | * ), 83 | * ), 84 | * 'orange' => array( 85 | * 1 => array( 86 | * 30 => array(29, 31) 87 | * ) 88 | * ) 89 | * ); 90 | * 91 | * 92 | * @return array 93 | */ 94 | public function getGraph(): array 95 | { 96 | return $this->graph; 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /src/Tool/Parser.php: -------------------------------------------------------------------------------- 1 | 8 | */ 9 | 10 | declare(strict_types=1); 11 | 12 | namespace PhpScience\TextRank\Tool; 13 | 14 | use PhpScience\TextRank\Tool\StopWords\StopWordsAbstract; 15 | 16 | /** 17 | * Class Parser 18 | * 19 | * This class purpose to parse a real text to sentences and array. 20 | * 21 | * @package PhpScience\TextRank\Tool 22 | */ 23 | class Parser 24 | { 25 | /** 26 | * The number of length of the smallest word. Words bellow it will be 27 | * ignored. 28 | * 29 | * @var int 30 | */ 31 | protected $minimumWordLength = 0; 32 | 33 | /** 34 | * A single text, article, book for example. 35 | * 36 | * @var string 37 | */ 38 | protected $rawText = ''; 39 | 40 | /** 41 | * The array of the punctuations. The punctuation is the value. The key 42 | * refers to the key of its sentence. 43 | * 44 | * @var array 45 | */ 46 | protected $marks = []; 47 | 48 | /** 49 | * Stop Words to ignore. These words will not be keywords. 50 | * 51 | * @var StopWordsAbstract 52 | */ 53 | protected $stopWords; 54 | 55 | /** 56 | * It sets the minimum word length. Words bellow it will be ignored. 57 | * 58 | * @param int $wordLength 59 | */ 60 | public function setMinimumWordLength(int $wordLength) 61 | { 62 | $this->minimumWordLength = $wordLength; 63 | } 64 | 65 | /** 66 | * It sets the raw text. 67 | * 68 | * @param string $rawText 69 | */ 70 | public function setRawText(string $rawText) 71 | { 72 | $this->rawText = $rawText; 73 | } 74 | 75 | /** 76 | * Set Stop Words. 77 | * 78 | * It sets the stop words to remove them from the found keywords. 79 | * 80 | * @param StopWordsAbstract $words Stop Words to ignore. These words will 81 | * not be keywords. 82 | */ 83 | public function setStopWords(StopWordsAbstract $words) 84 | { 85 | $this->stopWords = $words; 86 | } 87 | 88 | /** 89 | * It retrieves the punctuations. 90 | * 91 | * @return array Array from punctuations where key is the index to link to 92 | * the sentence and value is the punctuation. 93 | */ 94 | public function getMarks(): array 95 | { 96 | return $this->marks; 97 | } 98 | 99 | /** 100 | * Parse. 101 | * 102 | * It parses the text from the property and retrieves in Text object 103 | * prepared to scoring and to searching. 104 | * 105 | * @return Text Parsed text prepared to scoring. 106 | */ 107 | public function parse(): Text 108 | { 109 | $matrix = []; 110 | $sentences = $this->getSentences(); 111 | 112 | foreach ($sentences as $sentenceIdx => $sentence) { 113 | $matrix[$sentenceIdx] = $this->getWords($sentence); 114 | } 115 | 116 | $text = new Text(); 117 | $text->setSentences($sentences); 118 | $text->setWordMatrix($matrix); 119 | $text->setMarks($this->marks); 120 | 121 | return $text; 122 | } 123 | 124 | /** 125 | * Sentences. 126 | * 127 | * It retrieves the sentences in array without junk data. 128 | * 129 | * @return array Array from sentences. 130 | */ 131 | protected function getSentences(): array 132 | { 133 | $sentences = $sentences = preg_split( 134 | '/(\n+)|(\.\s|\?\s|\!\s)(?![^\(]*\))/', 135 | $this->rawText, 136 | -1, 137 | PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE 138 | ); 139 | 140 | return array_values( 141 | array_filter( 142 | array_map( 143 | [$this, 'cleanSentence'], 144 | $sentences 145 | ) 146 | ) 147 | ); 148 | } 149 | 150 | /** 151 | * Possible Keywords. 152 | * 153 | * It retrieves an array of possible keywords without junk characters, 154 | * spaces and stop words. 155 | * 156 | * @param string $subText It should be a sentence. 157 | * 158 | * @return array The array of the possible keywords. 159 | */ 160 | protected function getWords(string $subText): array 161 | { 162 | $words = preg_split( 163 | '/(?:(^\p{P}+)|(\p{P}*\s+\p{P}*)|(\p{P}+$))/', 164 | $subText, 165 | -1, 166 | PREG_SPLIT_NO_EMPTY 167 | ); 168 | 169 | $words = array_values( 170 | array_filter( 171 | array_map( 172 | [$this, 'cleanWord'], 173 | $words 174 | ) 175 | ) 176 | ); 177 | 178 | if ($this->stopWords) { 179 | return array_filter($words, function($word) { 180 | return !ctype_punct($word) 181 | && strlen($word) > $this->minimumWordLength 182 | && !$this->stopWords->exist($word); 183 | }); 184 | } else { 185 | return array_filter($words, function($word) { 186 | return !ctype_punct($word) 187 | && strlen($word) > $this->minimumWordLength; 188 | }); 189 | } 190 | } 191 | 192 | /** 193 | * Clean Sentence. 194 | * 195 | * It clean the sentence. If it is a punctuation it will be stored in the 196 | * property $marks. 197 | * 198 | * @param string $sentence A sentence as a string. 199 | * 200 | * @return string It is empty string when it's punctuation. Otherwise it's 201 | * the trimmed sentence itself. 202 | */ 203 | protected function cleanSentence(string $sentence): string 204 | { 205 | if (strlen(trim($sentence)) == 1) { 206 | $this->marks[] = trim($sentence); 207 | return ''; 208 | 209 | } else { 210 | return trim($sentence); 211 | } 212 | } 213 | 214 | /** 215 | * Clean Word. 216 | * 217 | * It removes the junk spaces from the word and retrieves it. 218 | * 219 | * @param string $word 220 | * 221 | * @return string Cleaned word. 222 | */ 223 | protected function cleanWord(string $word): string 224 | { 225 | return mb_strtolower(trim($word)); 226 | } 227 | } 228 | -------------------------------------------------------------------------------- /src/Tool/Score.php: -------------------------------------------------------------------------------- 1 | 8 | */ 9 | 10 | declare(strict_types=1); 11 | 12 | namespace PhpScience\TextRank\Tool; 13 | 14 | /** 15 | * Class Score 16 | * 17 | * It handles words and assigns weighted numbers to them. 18 | * 19 | * @package PhpScience\TextRank\Tool 20 | */ 21 | class Score 22 | { 23 | /** 24 | * The maximum connections by a word in the current text. 25 | * 26 | * @var int 27 | */ 28 | protected $maximumValue = 0; 29 | 30 | /** 31 | * The minimum connection by a word in the current text. 32 | * 33 | * @var int 34 | */ 35 | protected $minimumValue = 0; 36 | 37 | /** 38 | * Calculate Scores. 39 | * 40 | * It calculates the scores from word's connections and the connections' 41 | * scores. It retrieves the scores in a form of a matrix where the key is 42 | * the word and value is the score. The score is between 0 and 1. 43 | * 44 | * @param Graph $graph The graph of the text. 45 | * @param Text $text Text object what stores all text data. 46 | * 47 | * @return array Key is the word and value is the float or int type score 48 | * between 1 and 0. 49 | */ 50 | public function calculate(Graph $graph, Text &$text): array 51 | { 52 | $graphData = $graph->getGraph(); 53 | $wordMatrix = $text->getWordMatrix(); 54 | $wordConnections = $this->calculateConnectionNumbers($graphData); 55 | $scores = $this->calculateScores( 56 | $graphData, 57 | $wordMatrix, 58 | $wordConnections 59 | ); 60 | 61 | return $this->normalizeAndSortScores($scores); 62 | } 63 | 64 | /** 65 | * Connection Numbers. 66 | * 67 | * It calculates the number of connections for each word and retrieves it 68 | * in array where key is the word and value is the number of connections. 69 | * 70 | * @param array $graphData Graph data from a Graph type object. 71 | * 72 | * @return array Key is the word and value is the number of the connected 73 | * words. 74 | */ 75 | protected function calculateConnectionNumbers(array &$graphData): array 76 | { 77 | $wordConnections = []; 78 | 79 | foreach ($graphData as $wordKey => $sentences) { 80 | $connectionCount = 0; 81 | 82 | foreach ($sentences as $sentenceIdx => $wordInstances) { 83 | foreach ($wordInstances as $connections) { 84 | $connectionCount += count($connections); 85 | } 86 | } 87 | 88 | $wordConnections[$wordKey] = $connectionCount; 89 | } 90 | 91 | return $wordConnections; 92 | } 93 | 94 | /** 95 | * Calculate Scores. 96 | * 97 | * It calculates the score of the words and retrieves it in array where key 98 | * is the word and value is the score. The score depends on the number of 99 | * the connections and the closest word's connection numbers. 100 | * 101 | * @param array $graphData Graph data from a Graph type object. 102 | * @param array $wordMatrix Multidimensional array from integer keys 103 | * and string values. 104 | * @param array $wordConnections Key is the word and value is the number of 105 | * the connected words. 106 | * 107 | * @return array Scores where key is the word and value is the score. 108 | */ 109 | protected function calculateScores( 110 | array &$graphData, 111 | array &$wordMatrix, 112 | array &$wordConnections 113 | ): array { 114 | $scores = []; 115 | 116 | foreach ($graphData as $wordKey => $sentences) { 117 | $value = 0; 118 | 119 | foreach ($sentences as $sentenceIdx => $wordInstances) { 120 | foreach ($wordInstances as $connections) { 121 | foreach ($connections as $wordIdx) { 122 | $word = $wordMatrix[$sentenceIdx][$wordIdx]; 123 | $value += $wordConnections[$word]; 124 | } 125 | } 126 | } 127 | 128 | $scores[$wordKey] = $value; 129 | 130 | if ($value > $this->maximumValue) { 131 | $this->maximumValue = $value; 132 | } 133 | 134 | if ($value < $this->minimumValue || $this->minimumValue == 0) { 135 | $this->minimumValue = $value; 136 | } 137 | } 138 | 139 | return $scores; 140 | } 141 | 142 | /** 143 | * Normalize and Sort Scores. 144 | * 145 | * It recalculates the scores by normalize the score numbers to between 0 146 | * and 1. 147 | * 148 | * @param array $scores Keywords with scores. Score is the key. 149 | * 150 | * @return array Keywords with normalized and ordered scores. 151 | */ 152 | protected function normalizeAndSortScores(array &$scores): array 153 | { 154 | foreach ($scores as $key => $value) { 155 | $v = $this->normalize( 156 | $value, 157 | $this->minimumValue, 158 | $this->maximumValue 159 | ); 160 | 161 | $scores[$key] = $v; 162 | } 163 | 164 | arsort($scores); 165 | 166 | return $scores; 167 | } 168 | 169 | /** 170 | * It normalizes a number. 171 | * 172 | * @param int $value Current weight. 173 | * @param int $min Minimum weight. 174 | * @param int $max Maximum weight. 175 | * 176 | * @return float|int Normalized weight aka score. 177 | */ 178 | protected function normalize(int $value, int $min, int $max): float 179 | { 180 | $divisor = $max - $min; 181 | 182 | if ($divisor == 0) { 183 | return 0.0; 184 | } 185 | 186 | $normalized = ($value - $min) / $divisor; 187 | 188 | return $normalized; 189 | } 190 | } 191 | -------------------------------------------------------------------------------- /src/Tool/StopWords/Arabic.php: -------------------------------------------------------------------------------- 1 | 8 | */ 9 | 10 | declare(strict_types=1); 11 | 12 | namespace PhpScience\TextRank\Tool\StopWords; 13 | 14 | /** 15 | * Class English 16 | * 17 | * @package PhpScience\TextRank\Tool\StopWords 18 | */ 19 | class English extends StopWordsAbstract 20 | { 21 | /** 22 | * Stop words for avoid dummy keywords for Language English. 23 | * 24 | * @var array 25 | */ 26 | protected $words = [ 27 | 'a', 28 | 'about', 29 | 'above', 30 | 'above', 31 | 'across', 32 | 'after', 33 | 'afterwards', 34 | 'again', 35 | 'against', 36 | 'all', 37 | 'almost', 38 | 'alone', 39 | 'along', 40 | 'already', 41 | 'also', 42 | 'although', 43 | 'always', 44 | 'am', 45 | 'among', 46 | 'amongst', 47 | 'amoungst', 48 | 'amount', 49 | 'an', 50 | 'and', 51 | 'another', 52 | 'any', 53 | 'anyhow', 54 | 'anyone', 55 | 'anything', 56 | 'anyway', 57 | 'anywhere', 58 | 'are', 59 | 'around', 60 | 'as', 61 | 'at', 62 | 'back', 63 | 'be', 64 | 'became', 65 | 'because', 66 | 'become', 67 | 'becomes', 68 | 'becoming', 69 | 'been', 70 | 'before', 71 | 'beforehand', 72 | 'behind', 73 | 'being', 74 | 'below', 75 | 'beside', 76 | 'besides', 77 | 'between', 78 | 'beyond', 79 | 'bill', 80 | 'both', 81 | 'bottom', 82 | 'but', 83 | 'by', 84 | 'call', 85 | 'can', 86 | 'cannot', 87 | 'cant', 88 | 'co', 89 | 'con', 90 | 'could', 91 | 'couldnt', 92 | 'cry', 93 | 'de', 94 | 'describe', 95 | 'detail', 96 | 'do', 97 | 'done', 98 | 'down', 99 | 'due', 100 | 'during', 101 | 'each', 102 | 'eg', 103 | 'eight', 104 | 'either', 105 | 'eleven', 106 | 'else', 107 | 'elsewhere', 108 | 'empty', 109 | 'enough', 110 | 'etc', 111 | 'even', 112 | 'ever', 113 | 'every', 114 | 'everyone', 115 | 'everything', 116 | 'everywhere', 117 | 'except', 118 | 'few', 119 | 'fifteen', 120 | 'fify', 121 | 'fill', 122 | 'find', 123 | 'fire', 124 | 'first', 125 | 'five', 126 | 'for', 127 | 'former', 128 | 'formerly', 129 | 'forty', 130 | 'found', 131 | 'four', 132 | 'from', 133 | 'front', 134 | 'full', 135 | 'further', 136 | 'get', 137 | 'give', 138 | 'go', 139 | 'had', 140 | 'has', 141 | 'hasnt', 142 | 'have', 143 | 'he', 144 | 'hence', 145 | 'her', 146 | 'here', 147 | 'hereafter', 148 | 'hereby', 149 | 'herein', 150 | 'hereupon', 151 | 'hers', 152 | 'herself', 153 | 'him', 154 | 'himself', 155 | 'his', 156 | 'how', 157 | 'however', 158 | 'hundred', 159 | 'ie', 160 | 'if', 161 | 'in', 162 | 'inc', 163 | 'indeed', 164 | 'interest', 165 | 'into', 166 | 'is', 167 | 'it', 168 | 'its', 169 | 'itself', 170 | 'keep', 171 | 'last', 172 | 'latter', 173 | 'latterly', 174 | 'least', 175 | 'less', 176 | 'ltd', 177 | 'made', 178 | 'many', 179 | 'may', 180 | 'me', 181 | 'meanwhile', 182 | 'might', 183 | 'mill', 184 | 'mine', 185 | 'more', 186 | 'moreover', 187 | 'most', 188 | 'mostly', 189 | 'move', 190 | 'much', 191 | 'must', 192 | 'my', 193 | 'myself', 194 | 'name', 195 | 'namely', 196 | 'neither', 197 | 'never', 198 | 'nevertheless', 199 | 'next', 200 | 'nine', 201 | 'no', 202 | 'nobody', 203 | 'none', 204 | 'noone', 205 | 'nor', 206 | 'not', 207 | 'nothing', 208 | 'now', 209 | 'nowhere', 210 | 'of', 211 | 'off', 212 | 'often', 213 | 'on', 214 | 'once', 215 | 'one', 216 | 'only', 217 | 'onto', 218 | 'or', 219 | 'other', 220 | 'others', 221 | 'otherwise', 222 | 'our', 223 | 'ours', 224 | 'ourselves', 225 | 'out', 226 | 'over', 227 | 'own', 228 | 'part', 229 | 'per', 230 | 'perhaps', 231 | 'please', 232 | 'put', 233 | 'rather', 234 | 're', 235 | 'same', 236 | 'see', 237 | 'seem', 238 | 'seemed', 239 | 'seeming', 240 | 'seems', 241 | 'serious', 242 | 'several', 243 | 'she', 244 | 'should', 245 | 'show', 246 | 'side', 247 | 'since', 248 | 'sincere', 249 | 'six', 250 | 'sixty', 251 | 'so', 252 | 'some', 253 | 'somehow', 254 | 'someone', 255 | 'something', 256 | 'sometime', 257 | 'sometimes', 258 | 'somewhere', 259 | 'still', 260 | 'such', 261 | 'system', 262 | 'take', 263 | 'ten', 264 | 'than', 265 | 'that', 266 | 'the', 267 | 'their', 268 | 'them', 269 | 'themselves', 270 | 'then', 271 | 'thence', 272 | 'there', 273 | 'thereafter', 274 | 'thereby', 275 | 'therefore', 276 | 'therein', 277 | 'thereupon', 278 | 'these', 279 | 'they', 280 | 'thickv', 281 | 'thin', 282 | 'third', 283 | 'this', 284 | 'those', 285 | 'though', 286 | 'three', 287 | 'through', 288 | 'throughout', 289 | 'thru', 290 | 'thus', 291 | 'to', 292 | 'together', 293 | 'too', 294 | 'top', 295 | 'toward', 296 | 'towards', 297 | 'twelve', 298 | 'twenty', 299 | 'two', 300 | 'un', 301 | 'under', 302 | 'until', 303 | 'up', 304 | 'upon', 305 | 'us', 306 | 'very', 307 | 'via', 308 | 'was', 309 | 'we', 310 | 'well', 311 | 'were', 312 | 'what', 313 | 'whatever', 314 | 'when', 315 | 'whence', 316 | 'whenever', 317 | 'where', 318 | 'whereafter', 319 | 'whereas', 320 | 'whereby', 321 | 'wherein', 322 | 'whereupon', 323 | 'wherever', 324 | 'whether', 325 | 'which', 326 | 'while', 327 | 'whither', 328 | 'who', 329 | 'whoever', 330 | 'whole', 331 | 'whom', 332 | 'whose', 333 | 'why', 334 | 'will', 335 | 'with', 336 | 'within', 337 | 'without', 338 | 'would', 339 | 'yet', 340 | 'you', 341 | 'your', 342 | 'yours', 343 | 'yourself', 344 | 'yourselves' 345 | ]; 346 | } 347 | -------------------------------------------------------------------------------- /src/Tool/StopWords/French.php: -------------------------------------------------------------------------------- 1 | 8 | */ 9 | 10 | declare(strict_types=1); 11 | 12 | namespace PhpScience\TextRank\Tool\StopWords; 13 | 14 | /** 15 | * Class French 16 | * 17 | * @package PhpScience\TextRank\Tool\StopWords 18 | */ 19 | class French extends StopWordsAbstract 20 | { 21 | /** 22 | * Stop words for avoid dummy keywords for Language French. 23 | * Source: https://github.com/stopwords-iso/stopwords-fr 24 | * 25 | * @var array 26 | */ 27 | protected $words = [ 28 | 'a', 29 | 'abord', 30 | 'absolument', 31 | 'afin', 32 | 'ah', 33 | 'ai', 34 | 'aie', 35 | 'aient', 36 | 'aies', 37 | 'ailleurs', 38 | 'ainsi', 39 | 'ait', 40 | 'allaient', 41 | 'allo', 42 | 'allons', 43 | 'allô', 44 | 'alors', 45 | 'anterieur', 46 | 'anterieure', 47 | 'anterieures', 48 | 'apres', 49 | 'après', 50 | 'as', 51 | 'assez', 52 | 'attendu', 53 | 'au', 54 | 'aucun', 55 | 'aucune', 56 | 'aucuns', 57 | 'aujourd', 58 | 'aujourd\'hui', 59 | 'aupres', 60 | 'auquel', 61 | 'aura', 62 | 'aurai', 63 | 'auraient', 64 | 'aurais', 65 | 'aurait', 66 | 'auras', 67 | 'aurez', 68 | 'auriez', 69 | 'aurions', 70 | 'aurons', 71 | 'auront', 72 | 'aussi', 73 | 'autre', 74 | 'autrefois', 75 | 'autrement', 76 | 'autres', 77 | 'autrui', 78 | 'aux', 79 | 'auxquelles', 80 | 'auxquels', 81 | 'avaient', 82 | 'avais', 83 | 'avait', 84 | 'avant', 85 | 'avec', 86 | 'avez', 87 | 'aviez', 88 | 'avions', 89 | 'avoir', 90 | 'avons', 91 | 'ayant', 92 | 'ayez', 93 | 'ayons', 94 | 'b', 95 | 'bah', 96 | 'bas', 97 | 'basee', 98 | 'bat', 99 | 'beau', 100 | 'beaucoup', 101 | 'bien', 102 | 'bigre', 103 | 'bon', 104 | 'boum', 105 | 'bravo', 106 | 'brrr', 107 | 'c', 108 | 'car', 109 | 'ce', 110 | 'ceci', 111 | 'cela', 112 | 'celle', 113 | 'celle-ci', 114 | 'celle-là', 115 | 'celles', 116 | 'celles-ci', 117 | 'celles-là', 118 | 'celui', 119 | 'celui-ci', 120 | 'celui-là', 121 | 'celà', 122 | 'cent', 123 | 'cependant', 124 | 'certain', 125 | 'certaine', 126 | 'certaines', 127 | 'certains', 128 | 'certes', 129 | 'ces', 130 | 'cet', 131 | 'cette', 132 | 'ceux', 133 | 'ceux-ci', 134 | 'ceux-là', 135 | 'chacun', 136 | 'chacune', 137 | 'chaque', 138 | 'cher', 139 | 'chers', 140 | 'chez', 141 | 'chiche', 142 | 'chut', 143 | 'chère', 144 | 'chères', 145 | 'ci', 146 | 'cinq', 147 | 'cinquantaine', 148 | 'cinquante', 149 | 'cinquantième', 150 | 'cinquième', 151 | 'clac', 152 | 'clic', 153 | 'combien', 154 | 'comme', 155 | 'comment', 156 | 'comparable', 157 | 'comparables', 158 | 'compris', 159 | 'concernant', 160 | 'contre', 161 | 'couic', 162 | 'crac', 163 | 'd', 164 | 'da', 165 | 'dans', 166 | 'de', 167 | 'debout', 168 | 'dedans', 169 | 'dehors', 170 | 'deja', 171 | 'delà', 172 | 'depuis', 173 | 'dernier', 174 | 'derniere', 175 | 'derriere', 176 | 'derrière', 177 | 'des', 178 | 'desormais', 179 | 'desquelles', 180 | 'desquels', 181 | 'dessous', 182 | 'dessus', 183 | 'deux', 184 | 'deuxième', 185 | 'deuxièmement', 186 | 'devant', 187 | 'devers', 188 | 'devra', 189 | 'devrait', 190 | 'different', 191 | 'differentes', 192 | 'differents', 193 | 'différent', 194 | 'différente', 195 | 'différentes', 196 | 'différents', 197 | 'dire', 198 | 'directe', 199 | 'directement', 200 | 'dit', 201 | 'dite', 202 | 'dits', 203 | 'divers', 204 | 'diverse', 205 | 'diverses', 206 | 'dix', 207 | 'dix-huit', 208 | 'dix-neuf', 209 | 'dix-sept', 210 | 'dixième', 211 | 'doit', 212 | 'doivent', 213 | 'donc', 214 | 'dont', 215 | 'dos', 216 | 'douze', 217 | 'douzième', 218 | 'dring', 219 | 'droite', 220 | 'du', 221 | 'duquel', 222 | 'durant', 223 | 'dès', 224 | 'début', 225 | 'désormais', 226 | 'e', 227 | 'effet', 228 | 'egale', 229 | 'egalement', 230 | 'egales', 231 | 'eh', 232 | 'elle', 233 | 'elle-même', 234 | 'elles', 235 | 'elles-mêmes', 236 | 'en', 237 | 'encore', 238 | 'enfin', 239 | 'entre', 240 | 'envers', 241 | 'environ', 242 | 'es', 243 | 'essai', 244 | 'est', 245 | 'et', 246 | 'etant', 247 | 'etc', 248 | 'etre', 249 | 'eu', 250 | 'eue', 251 | 'eues', 252 | 'euh', 253 | 'eurent', 254 | 'eus', 255 | 'eusse', 256 | 'eussent', 257 | 'eusses', 258 | 'eussiez', 259 | 'eussions', 260 | 'eut', 261 | 'eux', 262 | 'eux-mêmes', 263 | 'exactement', 264 | 'excepté', 265 | 'extenso', 266 | 'exterieur', 267 | 'eûmes', 268 | 'eût', 269 | 'eûtes', 270 | 'f', 271 | 'fais', 272 | 'faisaient', 273 | 'faisant', 274 | 'fait', 275 | 'faites', 276 | 'façon', 277 | 'feront', 278 | 'fi', 279 | 'flac', 280 | 'floc', 281 | 'fois', 282 | 'font', 283 | 'force', 284 | 'furent', 285 | 'fus', 286 | 'fusse', 287 | 'fussent', 288 | 'fusses', 289 | 'fussiez', 290 | 'fussions', 291 | 'fut', 292 | 'fûmes', 293 | 'fût', 294 | 'fûtes', 295 | 'g', 296 | 'gens', 297 | 'h', 298 | 'ha', 299 | 'haut', 300 | 'hein', 301 | 'hem', 302 | 'hep', 303 | 'hi', 304 | 'ho', 305 | 'holà', 306 | 'hop', 307 | 'hormis', 308 | 'hors', 309 | 'hou', 310 | 'houp', 311 | 'hue', 312 | 'hui', 313 | 'huit', 314 | 'huitième', 315 | 'hum', 316 | 'hurrah', 317 | 'hé', 318 | 'hélas', 319 | 'i', 320 | 'ici', 321 | 'il', 322 | 'ils', 323 | 'importe', 324 | 'j', 325 | 'je', 326 | 'jusqu', 327 | 'jusque', 328 | 'juste', 329 | 'k', 330 | 'l', 331 | 'la', 332 | 'laisser', 333 | 'laquelle', 334 | 'las', 335 | 'le', 336 | 'lequel', 337 | 'les', 338 | 'lesquelles', 339 | 'lesquels', 340 | 'leur', 341 | 'leurs', 342 | 'longtemps', 343 | 'lors', 344 | 'lorsque', 345 | 'lui', 346 | 'lui-meme', 347 | 'lui-même', 348 | 'là', 349 | 'lès', 350 | 'm', 351 | 'ma', 352 | 'maint', 353 | 'maintenant', 354 | 'mais', 355 | 'malgre', 356 | 'malgré', 357 | 'maximale', 358 | 'me', 359 | 'meme', 360 | 'memes', 361 | 'merci', 362 | 'mes', 363 | 'mien', 364 | 'mienne', 365 | 'miennes', 366 | 'miens', 367 | 'mille', 368 | 'mince', 369 | 'mine', 370 | 'minimale', 371 | 'moi', 372 | 'moi-meme', 373 | 'moi-même', 374 | 'moindres', 375 | 'moins', 376 | 'mon', 377 | 'mot', 378 | 'moyennant', 379 | 'multiple', 380 | 'multiples', 381 | 'même', 382 | 'mêmes', 383 | 'n', 384 | 'na', 385 | 'naturel', 386 | 'naturelle', 387 | 'naturelles', 388 | 'ne', 389 | 'neanmoins', 390 | 'necessaire', 391 | 'necessairement', 392 | 'neuf', 393 | 'neuvième', 394 | 'ni', 395 | 'nombreuses', 396 | 'nombreux', 397 | 'nommés', 398 | 'non', 399 | 'nos', 400 | 'notamment', 401 | 'notre', 402 | 'nous', 403 | 'nous-mêmes', 404 | 'nouveau', 405 | 'nouveaux', 406 | 'nul', 407 | 'néanmoins', 408 | 'nôtre', 409 | 'nôtres', 410 | 'o', 411 | 'oh', 412 | 'ohé', 413 | 'ollé', 414 | 'olé', 415 | 'on', 416 | 'ont', 417 | 'onze', 418 | 'onzième', 419 | 'ore', 420 | 'ou', 421 | 'ouf', 422 | 'ouias', 423 | 'oust', 424 | 'ouste', 425 | 'outre', 426 | 'ouvert', 427 | 'ouverte', 428 | 'ouverts', 429 | 'o|', 430 | 'où', 431 | 'p', 432 | 'paf', 433 | 'pan', 434 | 'par', 435 | 'parce', 436 | 'parfois', 437 | 'parle', 438 | 'parlent', 439 | 'parler', 440 | 'parmi', 441 | 'parole', 442 | 'parseme', 443 | 'partant', 444 | 'particulier', 445 | 'particulière', 446 | 'particulièrement', 447 | 'pas', 448 | 'passé', 449 | 'pendant', 450 | 'pense', 451 | 'permet', 452 | 'personne', 453 | 'personnes', 454 | 'peu', 455 | 'peut', 456 | 'peuvent', 457 | 'peux', 458 | 'pff', 459 | 'pfft', 460 | 'pfut', 461 | 'pif', 462 | 'pire', 463 | 'pièce', 464 | 'plein', 465 | 'plouf', 466 | 'plupart', 467 | 'plus', 468 | 'plusieurs', 469 | 'plutôt', 470 | 'possessif', 471 | 'possessifs', 472 | 'possible', 473 | 'possibles', 474 | 'pouah', 475 | 'pour', 476 | 'pourquoi', 477 | 'pourrais', 478 | 'pourrait', 479 | 'pouvait', 480 | 'prealable', 481 | 'precisement', 482 | 'premier', 483 | 'première', 484 | 'premièrement', 485 | 'pres', 486 | 'probable', 487 | 'probante', 488 | 'procedant', 489 | 'proche', 490 | 'près', 491 | 'psitt', 492 | 'pu', 493 | 'puis', 494 | 'puisque', 495 | 'pur', 496 | 'pure', 497 | 'q', 498 | 'qu', 499 | 'quand', 500 | 'quant', 501 | 'quant-à-soi', 502 | 'quanta', 503 | 'quarante', 504 | 'quatorze', 505 | 'quatre', 506 | 'quatre-vingt', 507 | 'quatrième', 508 | 'quatrièmement', 509 | 'que', 510 | 'quel', 511 | 'quelconque', 512 | 'quelle', 513 | 'quelles', 514 | 'quelqu\'un', 515 | 'quelque', 516 | 'quelques', 517 | 'quels', 518 | 'qui', 519 | 'quiconque', 520 | 'quinze', 521 | 'quoi', 522 | 'quoique', 523 | 'r', 524 | 'rare', 525 | 'rarement', 526 | 'rares', 527 | 'relative', 528 | 'relativement', 529 | 'remarquable', 530 | 'rend', 531 | 'rendre', 532 | 'restant', 533 | 'reste', 534 | 'restent', 535 | 'restrictif', 536 | 'retour', 537 | 'revoici', 538 | 'revoilà', 539 | 'rien', 540 | 's', 541 | 'sa', 542 | 'sacrebleu', 543 | 'sait', 544 | 'sans', 545 | 'sapristi', 546 | 'sauf', 547 | 'se', 548 | 'sein', 549 | 'seize', 550 | 'selon', 551 | 'semblable', 552 | 'semblaient', 553 | 'semble', 554 | 'semblent', 555 | 'sent', 556 | 'sept', 557 | 'septième', 558 | 'sera', 559 | 'serai', 560 | 'seraient', 561 | 'serais', 562 | 'serait', 563 | 'seras', 564 | 'serez', 565 | 'seriez', 566 | 'serions', 567 | 'serons', 568 | 'seront', 569 | 'ses', 570 | 'seul', 571 | 'seule', 572 | 'seulement', 573 | 'si', 574 | 'sien', 575 | 'sienne', 576 | 'siennes', 577 | 'siens', 578 | 'sinon', 579 | 'six', 580 | 'sixième', 581 | 'soi', 582 | 'soi-même', 583 | 'soient', 584 | 'sois', 585 | 'soit', 586 | 'soixante', 587 | 'sommes', 588 | 'son', 589 | 'sont', 590 | 'sous', 591 | 'souvent', 592 | 'soyez', 593 | 'soyons', 594 | 'specifique', 595 | 'specifiques', 596 | 'speculatif', 597 | 'stop', 598 | 'strictement', 599 | 'subtiles', 600 | 'suffisant', 601 | 'suffisante', 602 | 'suffit', 603 | 'suis', 604 | 'suit', 605 | 'suivant', 606 | 'suivante', 607 | 'suivantes', 608 | 'suivants', 609 | 'suivre', 610 | 'sujet', 611 | 'superpose', 612 | 'sur', 613 | 'surtout', 614 | 't', 615 | 'ta', 616 | 'tac', 617 | 'tandis', 618 | 'tant', 619 | 'tardive', 620 | 'te', 621 | 'tel', 622 | 'telle', 623 | 'tellement', 624 | 'telles', 625 | 'tels', 626 | 'tenant', 627 | 'tend', 628 | 'tenir', 629 | 'tente', 630 | 'tes', 631 | 'tic', 632 | 'tien', 633 | 'tienne', 634 | 'tiennes', 635 | 'tiens', 636 | 'toc', 637 | 'toi', 638 | 'toi-même', 639 | 'ton', 640 | 'touchant', 641 | 'toujours', 642 | 'tous', 643 | 'tout', 644 | 'toute', 645 | 'toutefois', 646 | 'toutes', 647 | 'treize', 648 | 'trente', 649 | 'tres', 650 | 'trois', 651 | 'troisième', 652 | 'troisièmement', 653 | 'trop', 654 | 'très', 655 | 'tsoin', 656 | 'tsouin', 657 | 'tu', 658 | 'té', 659 | 'u', 660 | 'un', 661 | 'une', 662 | 'unes', 663 | 'uniformement', 664 | 'unique', 665 | 'uniques', 666 | 'uns', 667 | 'v', 668 | 'va', 669 | 'vais', 670 | 'valeur', 671 | 'vas', 672 | 'vers', 673 | 'via', 674 | 'vif', 675 | 'vifs', 676 | 'vingt', 677 | 'vivat', 678 | 'vive', 679 | 'vives', 680 | 'vlan', 681 | 'voici', 682 | 'voie', 683 | 'voient', 684 | 'voilà', 685 | 'vont', 686 | 'vos', 687 | 'votre', 688 | 'vous', 689 | 'vous-mêmes', 690 | 'vu', 691 | 'vé', 692 | 'vôtre', 693 | 'vôtres', 694 | 'w', 695 | 'x', 696 | 'y', 697 | 'z', 698 | 'zut', 699 | 'à', 700 | 'â', 701 | 'ça', 702 | 'ès', 703 | 'étaient', 704 | 'étais', 705 | 'était', 706 | 'étant', 707 | 'état', 708 | 'étiez', 709 | 'étions', 710 | 'été', 711 | 'étée', 712 | 'étées', 713 | 'étés', 714 | 'êtes', 715 | 'être', 716 | 'ô' 717 | ]; 718 | } 719 | -------------------------------------------------------------------------------- /src/Tool/StopWords/German.php: -------------------------------------------------------------------------------- 1 | 8 | */ 9 | declare(strict_types=1); 10 | namespace PhpScience\TextRank\Tool\StopWords; 11 | /** 12 | * Class German 13 | * 14 | * @package PhpScience\TextRank\Tool\StopWords 15 | */ 16 | class German extends StopWordsAbstract 17 | { 18 | /** 19 | * Stop words for avoid dummy keywords for Language German. 20 | * Word list created by Marco Götze, Steffen Geyer. 21 | * Source: https://solariz.de/de/downloads/6/german-enhanced-stopwords.htm 22 | * 23 | * @var array 24 | */ 25 | protected $words = [ 26 | 'ab', 27 | 'aber', 28 | 'alle', 29 | 'allein', 30 | 'allem', 31 | 'allen', 32 | 'aller', 33 | 'allerdings', 34 | 'allerlei', 35 | 'alles', 36 | 'allmählich', 37 | 'allzu', 38 | 'als', 39 | 'alsbald', 40 | 'also', 41 | 'am', 42 | 'an', 43 | 'and', 44 | 'ander', 45 | 'andere', 46 | 'anderem', 47 | 'anderen', 48 | 'anderer', 49 | 'andererseits', 50 | 'anderes', 51 | 'anderm', 52 | 'andern', 53 | 'andernfalls', 54 | 'anders', 55 | 'anstatt', 56 | 'auch', 57 | 'auf', 58 | 'aus', 59 | 'ausgenommen', 60 | 'ausser', 61 | 'ausserdem', 62 | 'außer', 63 | 'außerdem', 64 | 'außerhalb', 65 | 'bald', 66 | 'bei', 67 | 'beide', 68 | 'beiden', 69 | 'beiderlei', 70 | 'beides', 71 | 'beim', 72 | 'beinahe', 73 | 'bereits', 74 | 'besonders', 75 | 'besser', 76 | 'beträchtlich', 77 | 'bevor', 78 | 'bezüglich', 79 | 'bin', 80 | 'bis', 81 | 'bisher', 82 | 'bislang', 83 | 'bist', 84 | 'bloß', 85 | 'bsp.', 86 | 'bzw', 87 | 'ca', 88 | 'ca.', 89 | 'content', 90 | 'da', 91 | 'dabei', 92 | 'dadurch', 93 | 'dafür', 94 | 'dagegen', 95 | 'daher', 96 | 'dahin', 97 | 'damals', 98 | 'damit', 99 | 'danach', 100 | 'daneben', 101 | 'dann', 102 | 'daran', 103 | 'darauf', 104 | 'daraus', 105 | 'darin', 106 | 'darum', 107 | 'darunter', 108 | 'darüber', 109 | 'darüberhinaus', 110 | 'das', 111 | 'dass', 112 | 'dasselbe', 113 | 'davon', 114 | 'davor', 115 | 'dazu', 116 | 'daß', 117 | 'dein', 118 | 'deine', 119 | 'deinem', 120 | 'deinen', 121 | 'deiner', 122 | 'deines', 123 | 'dem', 124 | 'demnach', 125 | 'demselben', 126 | 'den', 127 | 'denen', 128 | 'denn', 129 | 'dennoch', 130 | 'denselben', 131 | 'der', 132 | 'derart', 133 | 'derartig', 134 | 'derem', 135 | 'deren', 136 | 'derer', 137 | 'derjenige', 138 | 'derjenigen', 139 | 'derselbe', 140 | 'derselben', 141 | 'derzeit', 142 | 'des', 143 | 'deshalb', 144 | 'desselben', 145 | 'dessen', 146 | 'desto', 147 | 'deswegen', 148 | 'dich', 149 | 'die', 150 | 'diejenige', 151 | 'dies', 152 | 'diese', 153 | 'dieselbe', 154 | 'dieselben', 155 | 'diesem', 156 | 'diesen', 157 | 'dieser', 158 | 'dieses', 159 | 'diesseits', 160 | 'dir', 161 | 'direkt', 162 | 'direkte', 163 | 'direkten', 164 | 'direkter', 165 | 'doch', 166 | 'dort', 167 | 'dorther', 168 | 'dorthin', 169 | 'drauf', 170 | 'drin', 171 | 'drunter', 172 | 'drüber', 173 | 'du', 174 | 'dunklen', 175 | 'durch', 176 | 'durchaus', 177 | 'eben', 178 | 'ebenfalls', 179 | 'ebenso', 180 | 'eher', 181 | 'eigenen', 182 | 'eigenes', 183 | 'eigentlich', 184 | 'ein', 185 | 'eine', 186 | 'einem', 187 | 'einen', 188 | 'einer', 189 | 'einerseits', 190 | 'eines', 191 | 'einfach', 192 | 'einführen', 193 | 'einführte', 194 | 'einführten', 195 | 'eingesetzt', 196 | 'einig', 197 | 'einige', 198 | 'einigem', 199 | 'einigen', 200 | 'einiger', 201 | 'einigermaßen', 202 | 'einiges', 203 | 'einmal', 204 | 'eins', 205 | 'einseitig', 206 | 'einseitige', 207 | 'einseitigen', 208 | 'einseitiger', 209 | 'einst', 210 | 'einstmals', 211 | 'einzig', 212 | 'entsprechend', 213 | 'entweder', 214 | 'er', 215 | 'erst', 216 | 'es', 217 | 'etc', 218 | 'etliche', 219 | 'etwa', 220 | 'etwas', 221 | 'euch', 222 | 'euer', 223 | 'eure', 224 | 'eurem', 225 | 'euren', 226 | 'eurer', 227 | 'eures', 228 | 'falls', 229 | 'fast', 230 | 'ferner', 231 | 'folgende', 232 | 'folgenden', 233 | 'folgender', 234 | 'folgendes', 235 | 'folglich', 236 | 'fuer', 237 | 'für', 238 | 'gab', 239 | 'ganze', 240 | 'ganzem', 241 | 'ganzen', 242 | 'ganzer', 243 | 'ganzes', 244 | 'gar', 245 | 'gegen', 246 | 'gemäss', 247 | 'ggf', 248 | 'gleich', 249 | 'gleichwohl', 250 | 'gleichzeitig', 251 | 'glücklicherweise', 252 | 'gänzlich', 253 | 'hab', 254 | 'habe', 255 | 'haben', 256 | 'haette', 257 | 'hast', 258 | 'hat', 259 | 'hatte', 260 | 'hatten', 261 | 'hattest', 262 | 'hattet', 263 | 'heraus', 264 | 'herein', 265 | 'hier', 266 | 'hier', 267 | 'hinter', 268 | 'hiermit', 269 | 'hiesige', 270 | 'hin', 271 | 'hinein', 272 | 'hinten', 273 | 'hinter', 274 | 'hinterher', 275 | 'http', 276 | 'hätt', 277 | 'hätte', 278 | 'hätten', 279 | 'höchstens', 280 | 'ich', 281 | 'igitt', 282 | 'ihm', 283 | 'ihn', 284 | 'ihnen', 285 | 'ihr', 286 | 'ihre', 287 | 'ihrem', 288 | 'ihren', 289 | 'ihrer', 290 | 'ihres', 291 | 'im', 292 | 'immer', 293 | 'immerhin', 294 | 'in', 295 | 'indem', 296 | 'indessen', 297 | 'infolge', 298 | 'innen', 299 | 'innerhalb', 300 | 'ins', 301 | 'insofern', 302 | 'inzwischen', 303 | 'irgend', 304 | 'irgendeine', 305 | 'irgendwas', 306 | 'irgendwen', 307 | 'irgendwer', 308 | 'irgendwie', 309 | 'irgendwo', 310 | 'ist', 311 | 'ja', 312 | 'je', 313 | 'jed', 314 | 'jede', 315 | 'jedem', 316 | 'jeden', 317 | 'jedenfalls', 318 | 'jeder', 319 | 'jederlei', 320 | 'jedes', 321 | 'jedoch', 322 | 'jemand', 323 | 'jene', 324 | 'jenem', 325 | 'jenen', 326 | 'jener', 327 | 'jenes', 328 | 'jenseits', 329 | 'jetzt', 330 | 'jährig', 331 | 'jährige', 332 | 'jährigen', 333 | 'jähriges', 334 | 'kam', 335 | 'kann', 336 | 'kannst', 337 | 'kaum', 338 | 'kein', 339 | 'keine', 340 | 'keinem', 341 | 'keinen', 342 | 'keiner', 343 | 'keinerlei', 344 | 'keines', 345 | 'keineswegs', 346 | 'klar', 347 | 'klare', 348 | 'klaren', 349 | 'klares', 350 | 'klein', 351 | 'kleinen', 352 | 'kleiner', 353 | 'kleines', 354 | 'koennen', 355 | 'koennt', 356 | 'koennte', 357 | 'koennten', 358 | 'komme', 359 | 'kommen', 360 | 'kommt', 361 | 'konkret', 362 | 'konkrete', 363 | 'konkreten', 364 | 'konkreter', 365 | 'konkretes', 366 | 'können', 367 | 'könnt', 368 | 'künftig', 369 | 'leider', 370 | 'machen', 371 | 'man', 372 | 'manche', 373 | 'manchem', 374 | 'manchen', 375 | 'mancher', 376 | 'mancherorts', 377 | 'manches', 378 | 'manchmal', 379 | 'mehr', 380 | 'mehrere', 381 | 'mein', 382 | 'meine', 383 | 'meinem', 384 | 'meinen', 385 | 'meiner', 386 | 'meines', 387 | 'mich', 388 | 'mir', 389 | 'mit', 390 | 'mithin', 391 | 'muessen', 392 | 'muesst', 393 | 'muesste', 394 | 'muss', 395 | 'musst', 396 | 'musste', 397 | 'mussten', 398 | 'muß', 399 | 'mußt', 400 | 'müssen', 401 | 'müsste', 402 | 'müssten', 403 | 'müßt', 404 | 'müßte', 405 | 'nach', 406 | 'nachdem', 407 | 'nachher', 408 | 'nachhinein', 409 | 'nahm', 410 | 'natürlich', 411 | 'neben', 412 | 'nebenan', 413 | 'nehmen', 414 | 'nein', 415 | 'nicht', 416 | 'nichts', 417 | 'nie', 418 | 'niemals', 419 | 'niemand', 420 | 'nirgends', 421 | 'nirgendwo', 422 | 'noch', 423 | 'nun', 424 | 'nur', 425 | 'nächste', 426 | 'nämlich', 427 | 'nötigenfalls', 428 | 'ob', 429 | 'oben', 430 | 'oberhalb', 431 | 'obgleich', 432 | 'obschon', 433 | 'obwohl', 434 | 'oder', 435 | 'oft', 436 | 'per', 437 | 'plötzlich', 438 | 'schließlich', 439 | 'schon', 440 | 'sehr', 441 | 'sehrwohl', 442 | 'seid', 443 | 'sein', 444 | 'seine', 445 | 'seinem', 446 | 'seinen', 447 | 'seiner', 448 | 'seines', 449 | 'seit', 450 | 'seitdem', 451 | 'seither', 452 | 'selber', 453 | 'selbst', 454 | 'sich', 455 | 'sicher', 456 | 'sicherlich', 457 | 'sie', 458 | 'sind', 459 | 'so', 460 | 'sobald', 461 | 'sodass', 462 | 'sodaß', 463 | 'soeben', 464 | 'sofern', 465 | 'sofort', 466 | 'sogar', 467 | 'solange', 468 | 'solch', 469 | 'solche', 470 | 'solchem', 471 | 'solchen', 472 | 'solcher', 473 | 'solches', 474 | 'soll', 475 | 'sollen', 476 | 'sollst', 477 | 'sollt', 478 | 'sollte', 479 | 'sollten', 480 | 'solltest', 481 | 'somit', 482 | 'sondern', 483 | 'sonst', 484 | 'sonstwo', 485 | 'sooft', 486 | 'soviel', 487 | 'soweit', 488 | 'sowie', 489 | 'sowohl', 490 | 'tatsächlich', 491 | 'tatsächlichen', 492 | 'tatsächlicher', 493 | 'tatsächliches', 494 | 'trotzdem', 495 | 'ueber', 496 | 'um', 497 | 'umso', 498 | 'unbedingt', 499 | 'und', 500 | 'unmöglich', 501 | 'unmögliche', 502 | 'unmöglichen', 503 | 'unmöglicher', 504 | 'uns', 505 | 'unser', 506 | 'unser', 507 | 'unsere', 508 | 'unsere', 509 | 'unserem', 510 | 'unseren', 511 | 'unserer', 512 | 'unseres', 513 | 'unter', 514 | 'usw', 515 | 'viel', 516 | 'viele', 517 | 'vielen', 518 | 'vieler', 519 | 'vieles', 520 | 'vielleicht', 521 | 'vielmals', 522 | 'vom', 523 | 'von', 524 | 'vor', 525 | 'voran', 526 | 'vorher', 527 | 'vorüber', 528 | 'völlig', 529 | 'wann', 530 | 'war', 531 | 'waren', 532 | 'warst', 533 | 'warum', 534 | 'was', 535 | 'weder', 536 | 'weil', 537 | 'weiter', 538 | 'weitere', 539 | 'weiterem', 540 | 'weiteren', 541 | 'weiterer', 542 | 'weiteres', 543 | 'weiterhin', 544 | 'weiß', 545 | 'welche', 546 | 'welchem', 547 | 'welchen', 548 | 'welcher', 549 | 'welches', 550 | 'wem', 551 | 'wen', 552 | 'wenig', 553 | 'wenige', 554 | 'weniger', 555 | 'wenigstens', 556 | 'wenn', 557 | 'wenngleich', 558 | 'wer', 559 | 'werde', 560 | 'werden', 561 | 'werdet', 562 | 'weshalb', 563 | 'wessen', 564 | 'wichtig', 565 | 'wie', 566 | 'wieder', 567 | 'wieso', 568 | 'wieviel', 569 | 'wiewohl', 570 | 'will', 571 | 'willst', 572 | 'wir', 573 | 'wird', 574 | 'wirklich', 575 | 'wirst', 576 | 'wo', 577 | 'wodurch', 578 | 'wogegen', 579 | 'woher', 580 | 'wohin', 581 | 'wohingegen', 582 | 'wohl', 583 | 'wohlweislich', 584 | 'womit', 585 | 'woraufhin', 586 | 'woraus', 587 | 'worin', 588 | 'wurde', 589 | 'wurden', 590 | 'während', 591 | 'währenddessen', 592 | 'wär', 593 | 'wäre', 594 | 'wären', 595 | 'würde', 596 | 'würden', 597 | 'z.B.', 598 | 'zB', 599 | 'zahlreich', 600 | 'zeitweise', 601 | 'zu', 602 | 'zudem', 603 | 'zuerst', 604 | 'zufolge', 605 | 'zugleich', 606 | 'zuletzt', 607 | 'zum', 608 | 'zumal', 609 | 'zur', 610 | 'zurück', 611 | 'zusammen', 612 | 'zuviel', 613 | 'zwar', 614 | 'zwischen', 615 | 'ähnlich', 616 | 'übel', 617 | 'über', 618 | 'überall', 619 | 'überallhin', 620 | 'überdies', 621 | 'übermorgen', 622 | 'übrig', 623 | 'übrigens' 624 | ]; 625 | } -------------------------------------------------------------------------------- /src/Tool/StopWords/Indonesian.php: -------------------------------------------------------------------------------- 1 | 8 | */ 9 | 10 | declare(strict_types=1); 11 | 12 | namespace PhpScience\TextRank\Tool\StopWords; 13 | 14 | /** 15 | * Class Norwegian 16 | * 17 | * @package PhpScience\TextRank\Tool\StopWords 18 | */ 19 | class Norwegian extends StopWordsAbstract 20 | { 21 | /** 22 | * Stop words for avoid dummy keywords for Language Norwegian. 23 | * Source: https://github.com/stopwords-iso/stopwords-no 24 | * 25 | * @var array 26 | */ 27 | protected $words = [ 28 | 'alle', 29 | 'andre', 30 | 'arbeid', 31 | 'at', 32 | 'av', 33 | 'bare', 34 | 'begge', 35 | 'ble', 36 | 'blei', 37 | 'bli', 38 | 'blir', 39 | 'blitt', 40 | 'bort', 41 | 'bra', 42 | 'bruke', 43 | 'både', 44 | 'båe', 45 | 'da', 46 | 'de', 47 | 'deg', 48 | 'dei', 49 | 'deim', 50 | 'deira', 51 | 'deires', 52 | 'dem', 53 | 'den', 54 | 'denne', 55 | 'der', 56 | 'dere', 57 | 'deres', 58 | 'det', 59 | 'dette', 60 | 'di', 61 | 'din', 62 | 'disse', 63 | 'ditt', 64 | 'du', 65 | 'dykk', 66 | 'dykkar', 67 | 'då', 68 | 'eg', 69 | 'ein', 70 | 'eit', 71 | 'eitt', 72 | 'eller', 73 | 'elles', 74 | 'en', 75 | 'ene', 76 | 'eneste', 77 | 'enhver', 78 | 'enn', 79 | 'er', 80 | 'et', 81 | 'ett', 82 | 'etter', 83 | 'folk', 84 | 'for', 85 | 'fordi', 86 | 'forsûke', 87 | 'fra', 88 | 'få', 89 | 'før', 90 | 'fûr', 91 | 'fûrst', 92 | 'gjorde', 93 | 'gjûre', 94 | 'god', 95 | 'gå', 96 | 'ha', 97 | 'hadde', 98 | 'han', 99 | 'hans', 100 | 'har', 101 | 'hennar', 102 | 'henne', 103 | 'hennes', 104 | 'her', 105 | 'hjå', 106 | 'ho', 107 | 'hoe', 108 | 'honom', 109 | 'hoss', 110 | 'hossen', 111 | 'hun', 112 | 'hva', 113 | 'hvem', 114 | 'hver', 115 | 'hvilke', 116 | 'hvilken', 117 | 'hvis', 118 | 'hvor', 119 | 'hvordan', 120 | 'hvorfor', 121 | 'i', 122 | 'ikke', 123 | 'ikkje', 124 | 'ingen', 125 | 'ingi', 126 | 'inkje', 127 | 'inn', 128 | 'innen', 129 | 'inni', 130 | 'ja', 131 | 'jeg', 132 | 'kan', 133 | 'kom', 134 | 'korleis', 135 | 'korso', 136 | 'kun', 137 | 'kunne', 138 | 'kva', 139 | 'kvar', 140 | 'kvarhelst', 141 | 'kven', 142 | 'kvi', 143 | 'kvifor', 144 | 'lage', 145 | 'lang', 146 | 'lik', 147 | 'like', 148 | 'makt', 149 | 'man', 150 | 'mange', 151 | 'me', 152 | 'med', 153 | 'medan', 154 | 'meg', 155 | 'meget', 156 | 'mellom', 157 | 'men', 158 | 'mens', 159 | 'mer', 160 | 'mest', 161 | 'mi', 162 | 'min', 163 | 'mine', 164 | 'mitt', 165 | 'mot', 166 | 'mye', 167 | 'mykje', 168 | 'må', 169 | 'måte', 170 | 'navn', 171 | 'ned', 172 | 'nei', 173 | 'no', 174 | 'noe', 175 | 'noen', 176 | 'noka', 177 | 'noko', 178 | 'nokon', 179 | 'nokor', 180 | 'nokre', 181 | 'ny', 182 | 'nå', 183 | 'når', 184 | 'og', 185 | 'også', 186 | 'om', 187 | 'opp', 188 | 'oss', 189 | 'over', 190 | 'part', 191 | 'punkt', 192 | 'på', 193 | 'rett', 194 | 'riktig', 195 | 'samme', 196 | 'sant', 197 | 'seg', 198 | 'selv', 199 | 'si', 200 | 'sia', 201 | 'sidan', 202 | 'siden', 203 | 'sin', 204 | 'sine', 205 | 'sist', 206 | 'sitt', 207 | 'sjøl', 208 | 'skal', 209 | 'skulle', 210 | 'slik', 211 | 'slutt', 212 | 'so', 213 | 'som', 214 | 'somme', 215 | 'somt', 216 | 'start', 217 | 'stille', 218 | 'så', 219 | 'sånn', 220 | 'tid', 221 | 'til', 222 | 'tilbake', 223 | 'tilstand', 224 | 'um', 225 | 'under', 226 | 'upp', 227 | 'ut', 228 | 'uten', 229 | 'var', 230 | 'vart', 231 | 'varte', 232 | 'ved', 233 | 'verdi', 234 | 'vere', 235 | 'verte', 236 | 'vi', 237 | 'vil', 238 | 'ville', 239 | 'vite', 240 | 'vore', 241 | 'vors', 242 | 'vort', 243 | 'vår', 244 | 'være', 245 | 'vært', 246 | 'vöre', 247 | 'vört', 248 | 'å' 249 | ]; 250 | } 251 | -------------------------------------------------------------------------------- /src/Tool/StopWords/Russian.php: -------------------------------------------------------------------------------- 1 | 8 | * @author Andrey Astashov (Russian StopWords) 9 | */ 10 | 11 | declare(strict_types=1); 12 | 13 | namespace PhpScience\TextRank\Tool\StopWords; 14 | 15 | /** 16 | * Class Russian 17 | * 18 | * @package PhpScience\TextRank\Tool\StopWords 19 | */ 20 | class Russian extends StopWordsAbstract 21 | { 22 | /** 23 | * Stop words for avoid dummy keywords for Language Russian. 24 | * 25 | * @var array 26 | */ 27 | protected $words = [ 28 | 'c', 29 | 'а', 30 | 'алло', 31 | 'без', 32 | 'белый', 33 | 'близко', 34 | 'более', 35 | 'больше', 36 | 'большой', 37 | 'будем', 38 | 'будет', 39 | 'будете', 40 | 'будешь', 41 | 'будто', 42 | 'буду', 43 | 'будут', 44 | 'будь', 45 | 'бы', 46 | 'бывает', 47 | 'бывь', 48 | 'был', 49 | 'была', 50 | 'были', 51 | 'было', 52 | 'быть', 53 | 'в', 54 | 'важная', 55 | 'важное', 56 | 'важные', 57 | 'важный', 58 | 'вам', 59 | 'вами', 60 | 'вас', 61 | 'ваш', 62 | 'ваша', 63 | 'ваше', 64 | 'ваши', 65 | 'вверх', 66 | 'вдали', 67 | 'вдруг', 68 | 'ведь', 69 | 'везде', 70 | 'вернуться', 71 | 'весь', 72 | 'вечер', 73 | 'взгляд', 74 | 'взять', 75 | 'вид', 76 | 'видел', 77 | 'видеть', 78 | 'вместе', 79 | 'вне', 80 | 'вниз', 81 | 'внизу', 82 | 'во', 83 | 'вода', 84 | 'война', 85 | 'вокруг', 86 | 'вон', 87 | 'вообще', 88 | 'вопрос', 89 | 'восемнадцатый', 90 | 'восемнадцать', 91 | 'восемь', 92 | 'восьмой', 93 | 'вот', 94 | 'впрочем', 95 | 'времени', 96 | 'время', 97 | 'все', 98 | 'все еще', 99 | 'всегда', 100 | 'всего', 101 | 'всем', 102 | 'всеми', 103 | 'всему', 104 | 'всех', 105 | 'всею', 106 | 'всю', 107 | 'всюду', 108 | 'вся', 109 | 'всё', 110 | 'второй', 111 | 'вы', 112 | 'выйти', 113 | 'г', 114 | 'где', 115 | 'главный', 116 | 'глаз', 117 | 'говорил', 118 | 'говорит', 119 | 'говорить', 120 | 'год', 121 | 'года', 122 | 'году', 123 | 'голова', 124 | 'голос', 125 | 'город', 126 | 'да', 127 | 'давать', 128 | 'давно', 129 | 'даже', 130 | 'далекий', 131 | 'далеко', 132 | 'дальше', 133 | 'даром', 134 | 'дать', 135 | 'два', 136 | 'двадцатый', 137 | 'двадцать', 138 | 'две', 139 | 'двенадцатый', 140 | 'двенадцать', 141 | 'дверь', 142 | 'двух', 143 | 'девятнадцатый', 144 | 'девятнадцать', 145 | 'девятый', 146 | 'девять', 147 | 'действительно', 148 | 'дел', 149 | 'делал', 150 | 'делать', 151 | 'делаю', 152 | 'дело', 153 | 'день', 154 | 'деньги', 155 | 'десятый', 156 | 'десять', 157 | 'для', 158 | 'до', 159 | 'довольно', 160 | 'долго', 161 | 'должен', 162 | 'должно', 163 | 'должный', 164 | 'дом', 165 | 'дорога', 166 | 'друг', 167 | 'другая', 168 | 'другие', 169 | 'других', 170 | 'друго', 171 | 'другое', 172 | 'другой', 173 | 'думать', 174 | 'душа', 175 | 'е', 176 | 'его', 177 | 'ее', 178 | 'ей', 179 | 'ему', 180 | 'если', 181 | 'есть', 182 | 'еще', 183 | 'ещё', 184 | 'ею', 185 | 'её', 186 | 'ж', 187 | 'ждать', 188 | 'же', 189 | 'жена', 190 | 'женщина', 191 | 'жизнь', 192 | 'жить', 193 | 'за', 194 | 'занят', 195 | 'занята', 196 | 'занято', 197 | 'заняты', 198 | 'затем', 199 | 'зато', 200 | 'зачем', 201 | 'здесь', 202 | 'земля', 203 | 'знать', 204 | 'значит', 205 | 'значить', 206 | 'и', 207 | 'иди', 208 | 'идти', 209 | 'из', 210 | 'или', 211 | 'им', 212 | 'имеет', 213 | 'имел', 214 | 'именно', 215 | 'иметь', 216 | 'ими', 217 | 'имя', 218 | 'иногда', 219 | 'их', 220 | 'к', 221 | 'каждая', 222 | 'каждое', 223 | 'каждые', 224 | 'каждый', 225 | 'кажется', 226 | 'казаться', 227 | 'как', 228 | 'какая', 229 | 'какой', 230 | 'кем', 231 | 'книга', 232 | 'когда', 233 | 'кого', 234 | 'ком', 235 | 'комната', 236 | 'кому', 237 | 'конец', 238 | 'конечно', 239 | 'которая', 240 | 'которого', 241 | 'которой', 242 | 'которые', 243 | 'который', 244 | 'которых', 245 | 'кроме', 246 | 'кругом', 247 | 'кто', 248 | 'куда', 249 | 'лежать', 250 | 'лет', 251 | 'ли', 252 | 'лицо', 253 | 'лишь', 254 | 'лучше', 255 | 'любить', 256 | 'люди', 257 | 'м', 258 | 'маленький', 259 | 'мало', 260 | 'мать', 261 | 'машина', 262 | 'между', 263 | 'меля', 264 | 'менее', 265 | 'меньше', 266 | 'меня', 267 | 'место', 268 | 'миллионов', 269 | 'мимо', 270 | 'минута', 271 | 'мир', 272 | 'мира', 273 | 'мне', 274 | 'много', 275 | 'многочисленная', 276 | 'многочисленное', 277 | 'многочисленные', 278 | 'многочисленный', 279 | 'мной', 280 | 'мною', 281 | 'мог', 282 | 'могу', 283 | 'могут', 284 | 'мож', 285 | 'может', 286 | 'может быть', 287 | 'можно', 288 | 'можхо', 289 | 'мои', 290 | 'мой', 291 | 'мор', 292 | 'москва', 293 | 'мочь', 294 | 'моя', 295 | 'моё', 296 | 'мы', 297 | 'на', 298 | 'наверху', 299 | 'над', 300 | 'надо', 301 | 'назад', 302 | 'наиболее', 303 | 'найти', 304 | 'наконец', 305 | 'нам', 306 | 'нами', 307 | 'народ', 308 | 'нас', 309 | 'начала', 310 | 'начать', 311 | 'наш', 312 | 'наша', 313 | 'наше', 314 | 'наши', 315 | 'не', 316 | 'него', 317 | 'недавно', 318 | 'недалеко', 319 | 'нее', 320 | 'ней', 321 | 'некоторый', 322 | 'нельзя', 323 | 'нем', 324 | 'немного', 325 | 'нему', 326 | 'непрерывно', 327 | 'нередко', 328 | 'несколько', 329 | 'нет', 330 | 'нею', 331 | 'неё', 332 | 'ни', 333 | 'нибудь', 334 | 'ниже', 335 | 'низко', 336 | 'никакой', 337 | 'никогда', 338 | 'никто', 339 | 'никуда', 340 | 'ним', 341 | 'ними', 342 | 'них', 343 | 'ничего', 344 | 'ничто', 345 | 'но', 346 | 'новый', 347 | 'нога', 348 | 'ночь', 349 | 'ну', 350 | 'нужно', 351 | 'нужный', 352 | 'нх', 353 | 'о', 354 | 'об', 355 | 'оба', 356 | 'обычно', 357 | 'один', 358 | 'одиннадцатый', 359 | 'одиннадцать', 360 | 'однажды', 361 | 'однако', 362 | 'одного', 363 | 'одной', 364 | 'оказаться', 365 | 'окно', 366 | 'около', 367 | 'он', 368 | 'она', 369 | 'они', 370 | 'оно', 371 | 'опять', 372 | 'особенно', 373 | 'остаться', 374 | 'от', 375 | 'ответить', 376 | 'отец', 377 | 'откуда', 378 | 'отовсюду', 379 | 'отсюда', 380 | 'очень', 381 | 'первый', 382 | 'перед', 383 | 'писать', 384 | 'плечо', 385 | 'по', 386 | 'под', 387 | 'подойди', 388 | 'подумать', 389 | 'пожалуйста', 390 | 'позже', 391 | 'пойти', 392 | 'пока', 393 | 'пол', 394 | 'получить', 395 | 'помнить', 396 | 'понимать', 397 | 'понять', 398 | 'пор', 399 | 'пора', 400 | 'после', 401 | 'последний', 402 | 'посмотреть', 403 | 'посреди', 404 | 'потом', 405 | 'потому', 406 | 'почему', 407 | 'почти', 408 | 'правда', 409 | 'прекрасно', 410 | 'при', 411 | 'про', 412 | 'просто', 413 | 'против', 414 | 'процентов', 415 | 'путь', 416 | 'пятнадцатый', 417 | 'пятнадцать', 418 | 'пятый', 419 | 'пять', 420 | 'работа', 421 | 'работать', 422 | 'раз', 423 | 'разве', 424 | 'рано', 425 | 'раньше', 426 | 'ребенок', 427 | 'решить', 428 | 'россия', 429 | 'рука', 430 | 'русский', 431 | 'ряд', 432 | 'рядом', 433 | 'с', 434 | 'с кем', 435 | 'сам', 436 | 'сама', 437 | 'сами', 438 | 'самим', 439 | 'самими', 440 | 'самих', 441 | 'само', 442 | 'самого', 443 | 'самой', 444 | 'самом', 445 | 'самому', 446 | 'саму', 447 | 'самый', 448 | 'свет', 449 | 'свое', 450 | 'своего', 451 | 'своей', 452 | 'свои', 453 | 'своих', 454 | 'свой', 455 | 'свою', 456 | 'сделать', 457 | 'сеаой', 458 | 'себе', 459 | 'себя', 460 | 'сегодня', 461 | 'седьмой', 462 | 'сейчас', 463 | 'семнадцатый', 464 | 'семнадцать', 465 | 'семь', 466 | 'сидеть', 467 | 'сила', 468 | 'сих', 469 | 'сказал', 470 | 'сказала', 471 | 'сказать', 472 | 'сколько', 473 | 'слишком', 474 | 'слово', 475 | 'случай', 476 | 'смотреть', 477 | 'сначала', 478 | 'снова', 479 | 'со', 480 | 'собой', 481 | 'собою', 482 | 'советский', 483 | 'совсем', 484 | 'спасибо', 485 | 'спросить', 486 | 'сразу', 487 | 'стал', 488 | 'старый', 489 | 'стать', 490 | 'стол', 491 | 'сторона', 492 | 'стоять', 493 | 'страна', 494 | 'суть', 495 | 'считать', 496 | 'т', 497 | 'та', 498 | 'так', 499 | 'такая', 500 | 'также', 501 | 'таки', 502 | 'такие', 503 | 'такое', 504 | 'такой', 505 | 'там', 506 | 'твои', 507 | 'твой', 508 | 'твоя', 509 | 'твоё', 510 | 'те', 511 | 'тебе', 512 | 'тебя', 513 | 'тем', 514 | 'теми', 515 | 'теперь', 516 | 'тех', 517 | 'то', 518 | 'тобой', 519 | 'тобою', 520 | 'товарищ', 521 | 'тогда', 522 | 'того', 523 | 'тоже', 524 | 'только', 525 | 'том', 526 | 'тому', 527 | 'тот', 528 | 'тою', 529 | 'третий', 530 | 'три', 531 | 'тринадцатый', 532 | 'тринадцать', 533 | 'ту', 534 | 'туда', 535 | 'тут', 536 | 'ты', 537 | 'тысяч', 538 | 'у', 539 | 'увидеть', 540 | 'уж', 541 | 'уже', 542 | 'улица', 543 | 'уметь', 544 | 'утро', 545 | 'хороший', 546 | 'хорошо', 547 | 'хотел бы', 548 | 'хотеть', 549 | 'хоть', 550 | 'хотя', 551 | 'хочешь', 552 | 'час', 553 | 'часто', 554 | 'часть', 555 | 'чаще', 556 | 'чего', 557 | 'человек', 558 | 'чем', 559 | 'чему', 560 | 'через', 561 | 'четвертый', 562 | 'четыре', 563 | 'четырнадцатый', 564 | 'четырнадцать', 565 | 'что', 566 | 'чтоб', 567 | 'чтобы', 568 | 'чуть', 569 | 'шестнадцатый', 570 | 'шестнадцать', 571 | 'шестой', 572 | 'шесть', 573 | 'эта', 574 | 'эти', 575 | 'этим', 576 | 'этими', 577 | 'этих', 578 | 'это', 579 | 'этого', 580 | 'этой', 581 | 'этом', 582 | 'этому', 583 | 'этот', 584 | 'эту', 585 | 'я', 586 | 'являюсь' 587 | ]; 588 | } 589 | -------------------------------------------------------------------------------- /src/Tool/StopWords/Spanish.php: -------------------------------------------------------------------------------- 1 | 8 | */ 9 | 10 | declare(strict_types=1); 11 | 12 | namespace PhpScience\TextRank\Tool\StopWords; 13 | 14 | /** 15 | * Class Spanish 16 | * 17 | * @package PhpScience\TextRank\Tool\StopWords 18 | */ 19 | class Spanish extends StopWordsAbstract 20 | { 21 | /** 22 | * Stop words for avoid dummy keywords for Language Spanish. 23 | * Source: https://github.com/stopwords-iso/stopwords-es 24 | * 25 | * @var array 26 | */ 27 | protected $words = [ 28 | 'a', 29 | 'actualmente', 30 | 'acuerdo', 31 | 'adelante', 32 | 'ademas', 33 | 'además', 34 | 'adrede', 35 | 'afirmó', 36 | 'agregó', 37 | 'ahi', 38 | 'ahora', 39 | 'ahí', 40 | 'al', 41 | 'algo', 42 | 'alguna', 43 | 'algunas', 44 | 'alguno', 45 | 'algunos', 46 | 'algún', 47 | 'alli', 48 | 'allí', 49 | 'alrededor', 50 | 'ambos', 51 | 'ampleamos', 52 | 'antano', 53 | 'antaño', 54 | 'ante', 55 | 'anterior', 56 | 'antes', 57 | 'apenas', 58 | 'aproximadamente', 59 | 'aquel', 60 | 'aquella', 61 | 'aquellas', 62 | 'aquello', 63 | 'aquellos', 64 | 'aqui', 65 | 'aquél', 66 | 'aquélla', 67 | 'aquéllas', 68 | 'aquéllos', 69 | 'aquí', 70 | 'arriba', 71 | 'arribaabajo', 72 | 'aseguró', 73 | 'asi', 74 | 'así', 75 | 'atras', 76 | 'aun', 77 | 'aunque', 78 | 'ayer', 79 | 'añadió', 80 | 'aún', 81 | 'b', 82 | 'bajo', 83 | 'bastante', 84 | 'bien', 85 | 'breve', 86 | 'buen', 87 | 'buena', 88 | 'buenas', 89 | 'bueno', 90 | 'buenos', 91 | 'c', 92 | 'cada', 93 | 'casi', 94 | 'cerca', 95 | 'cierta', 96 | 'ciertas', 97 | 'cierto', 98 | 'ciertos', 99 | 'cinco', 100 | 'claro', 101 | 'comentó', 102 | 'como', 103 | 'con', 104 | 'conmigo', 105 | 'conocer', 106 | 'conseguimos', 107 | 'conseguir', 108 | 'considera', 109 | 'consideró', 110 | 'consigo', 111 | 'consigue', 112 | 'consiguen', 113 | 'consigues', 114 | 'contigo', 115 | 'contra', 116 | 'cosas', 117 | 'creo', 118 | 'cual', 119 | 'cuales', 120 | 'cualquier', 121 | 'cuando', 122 | 'cuanta', 123 | 'cuantas', 124 | 'cuanto', 125 | 'cuantos', 126 | 'cuatro', 127 | 'cuenta', 128 | 'cuál', 129 | 'cuáles', 130 | 'cuándo', 131 | 'cuánta', 132 | 'cuántas', 133 | 'cuánto', 134 | 'cuántos', 135 | 'cómo', 136 | 'd', 137 | 'da', 138 | 'dado', 139 | 'dan', 140 | 'dar', 141 | 'de', 142 | 'debajo', 143 | 'debe', 144 | 'deben', 145 | 'debido', 146 | 'decir', 147 | 'dejó', 148 | 'del', 149 | 'delante', 150 | 'demasiado', 151 | 'demás', 152 | 'dentro', 153 | 'deprisa', 154 | 'desde', 155 | 'despacio', 156 | 'despues', 157 | 'después', 158 | 'detras', 159 | 'detrás', 160 | 'dia', 161 | 'dias', 162 | 'dice', 163 | 'dicen', 164 | 'dicho', 165 | 'dieron', 166 | 'diferente', 167 | 'diferentes', 168 | 'dijeron', 169 | 'dijo', 170 | 'dio', 171 | 'donde', 172 | 'dos', 173 | 'durante', 174 | 'día', 175 | 'días', 176 | 'dónde', 177 | 'e', 178 | 'ejemplo', 179 | 'el', 180 | 'ella', 181 | 'ellas', 182 | 'ello', 183 | 'ellos', 184 | 'embargo', 185 | 'empleais', 186 | 'emplean', 187 | 'emplear', 188 | 'empleas', 189 | 'empleo', 190 | 'en', 191 | 'encima', 192 | 'encuentra', 193 | 'enfrente', 194 | 'enseguida', 195 | 'entonces', 196 | 'entre', 197 | 'era', 198 | 'erais', 199 | 'eramos', 200 | 'eran', 201 | 'eras', 202 | 'eres', 203 | 'es', 204 | 'esa', 205 | 'esas', 206 | 'ese', 207 | 'eso', 208 | 'esos', 209 | 'esta', 210 | 'estaba', 211 | 'estabais', 212 | 'estaban', 213 | 'estabas', 214 | 'estad', 215 | 'estada', 216 | 'estadas', 217 | 'estado', 218 | 'estados', 219 | 'estais', 220 | 'estamos', 221 | 'estan', 222 | 'estando', 223 | 'estar', 224 | 'estaremos', 225 | 'estará', 226 | 'estarán', 227 | 'estarás', 228 | 'estaré', 229 | 'estaréis', 230 | 'estaría', 231 | 'estaríais', 232 | 'estaríamos', 233 | 'estarían', 234 | 'estarías', 235 | 'estas', 236 | 'este', 237 | 'estemos', 238 | 'esto', 239 | 'estos', 240 | 'estoy', 241 | 'estuve', 242 | 'estuviera', 243 | 'estuvierais', 244 | 'estuvieran', 245 | 'estuvieras', 246 | 'estuvieron', 247 | 'estuviese', 248 | 'estuvieseis', 249 | 'estuviesen', 250 | 'estuvieses', 251 | 'estuvimos', 252 | 'estuviste', 253 | 'estuvisteis', 254 | 'estuviéramos', 255 | 'estuviésemos', 256 | 'estuvo', 257 | 'está', 258 | 'estábamos', 259 | 'estáis', 260 | 'están', 261 | 'estás', 262 | 'esté', 263 | 'estéis', 264 | 'estén', 265 | 'estés', 266 | 'ex', 267 | 'excepto', 268 | 'existe', 269 | 'existen', 270 | 'explicó', 271 | 'expresó', 272 | 'f', 273 | 'fin', 274 | 'final', 275 | 'fue', 276 | 'fuera', 277 | 'fuerais', 278 | 'fueran', 279 | 'fueras', 280 | 'fueron', 281 | 'fuese', 282 | 'fueseis', 283 | 'fuesen', 284 | 'fueses', 285 | 'fui', 286 | 'fuimos', 287 | 'fuiste', 288 | 'fuisteis', 289 | 'fuéramos', 290 | 'fuésemos', 291 | 'g', 292 | 'general', 293 | 'gran', 294 | 'grandes', 295 | 'gueno', 296 | 'h', 297 | 'ha', 298 | 'haber', 299 | 'habia', 300 | 'habida', 301 | 'habidas', 302 | 'habido', 303 | 'habidos', 304 | 'habiendo', 305 | 'habla', 306 | 'hablan', 307 | 'habremos', 308 | 'habrá', 309 | 'habrán', 310 | 'habrás', 311 | 'habré', 312 | 'habréis', 313 | 'habría', 314 | 'habríais', 315 | 'habríamos', 316 | 'habrían', 317 | 'habrías', 318 | 'habéis', 319 | 'había', 320 | 'habíais', 321 | 'habíamos', 322 | 'habían', 323 | 'habías', 324 | 'hace', 325 | 'haceis', 326 | 'hacemos', 327 | 'hacen', 328 | 'hacer', 329 | 'hacerlo', 330 | 'haces', 331 | 'hacia', 332 | 'haciendo', 333 | 'hago', 334 | 'han', 335 | 'has', 336 | 'hasta', 337 | 'hay', 338 | 'haya', 339 | 'hayamos', 340 | 'hayan', 341 | 'hayas', 342 | 'hayáis', 343 | 'he', 344 | 'hecho', 345 | 'hemos', 346 | 'hicieron', 347 | 'hizo', 348 | 'horas', 349 | 'hoy', 350 | 'hube', 351 | 'hubiera', 352 | 'hubierais', 353 | 'hubieran', 354 | 'hubieras', 355 | 'hubieron', 356 | 'hubiese', 357 | 'hubieseis', 358 | 'hubiesen', 359 | 'hubieses', 360 | 'hubimos', 361 | 'hubiste', 362 | 'hubisteis', 363 | 'hubiéramos', 364 | 'hubiésemos', 365 | 'hubo', 366 | 'i', 367 | 'igual', 368 | 'incluso', 369 | 'indicó', 370 | 'informo', 371 | 'informó', 372 | 'intenta', 373 | 'intentais', 374 | 'intentamos', 375 | 'intentan', 376 | 'intentar', 377 | 'intentas', 378 | 'intento', 379 | 'ir', 380 | 'j', 381 | 'junto', 382 | 'k', 383 | 'l', 384 | 'la', 385 | 'lado', 386 | 'largo', 387 | 'las', 388 | 'le', 389 | 'lejos', 390 | 'les', 391 | 'llegó', 392 | 'lleva', 393 | 'llevar', 394 | 'lo', 395 | 'los', 396 | 'luego', 397 | 'lugar', 398 | 'm', 399 | 'mal', 400 | 'manera', 401 | 'manifestó', 402 | 'mas', 403 | 'mayor', 404 | 'me', 405 | 'mediante', 406 | 'medio', 407 | 'mejor', 408 | 'mencionó', 409 | 'menos', 410 | 'menudo', 411 | 'mi', 412 | 'mia', 413 | 'mias', 414 | 'mientras', 415 | 'mio', 416 | 'mios', 417 | 'mis', 418 | 'misma', 419 | 'mismas', 420 | 'mismo', 421 | 'mismos', 422 | 'modo', 423 | 'momento', 424 | 'mucha', 425 | 'muchas', 426 | 'mucho', 427 | 'muchos', 428 | 'muy', 429 | 'más', 430 | 'mí', 431 | 'mía', 432 | 'mías', 433 | 'mío', 434 | 'míos', 435 | 'n', 436 | 'nada', 437 | 'nadie', 438 | 'ni', 439 | 'ninguna', 440 | 'ningunas', 441 | 'ninguno', 442 | 'ningunos', 443 | 'ningún', 444 | 'no', 445 | 'nos', 446 | 'nosotras', 447 | 'nosotros', 448 | 'nuestra', 449 | 'nuestras', 450 | 'nuestro', 451 | 'nuestros', 452 | 'nueva', 453 | 'nuevas', 454 | 'nuevo', 455 | 'nuevos', 456 | 'nunca', 457 | 'o', 458 | 'ocho', 459 | 'os', 460 | 'otra', 461 | 'otras', 462 | 'otro', 463 | 'otros', 464 | 'p', 465 | 'pais', 466 | 'para', 467 | 'parece', 468 | 'parte', 469 | 'partir', 470 | 'pasada', 471 | 'pasado', 472 | 'paìs', 473 | 'peor', 474 | 'pero', 475 | 'pesar', 476 | 'poca', 477 | 'pocas', 478 | 'poco', 479 | 'pocos', 480 | 'podeis', 481 | 'podemos', 482 | 'poder', 483 | 'podria', 484 | 'podriais', 485 | 'podriamos', 486 | 'podrian', 487 | 'podrias', 488 | 'podrá', 489 | 'podrán', 490 | 'podría', 491 | 'podrían', 492 | 'poner', 493 | 'por', 494 | 'por qué', 495 | 'porque', 496 | 'posible', 497 | 'primer', 498 | 'primera', 499 | 'primero', 500 | 'primeros', 501 | 'principalmente', 502 | 'pronto', 503 | 'propia', 504 | 'propias', 505 | 'propio', 506 | 'propios', 507 | 'proximo', 508 | 'próximo', 509 | 'próximos', 510 | 'pudo', 511 | 'pueda', 512 | 'puede', 513 | 'pueden', 514 | 'puedo', 515 | 'pues', 516 | 'q', 517 | 'qeu', 518 | 'que', 519 | 'quedó', 520 | 'queremos', 521 | 'quien', 522 | 'quienes', 523 | 'quiere', 524 | 'quiza', 525 | 'quizas', 526 | 'quizá', 527 | 'quizás', 528 | 'quién', 529 | 'quiénes', 530 | 'qué', 531 | 'r', 532 | 'raras', 533 | 'realizado', 534 | 'realizar', 535 | 'realizó', 536 | 'repente', 537 | 'respecto', 538 | 's', 539 | 'sabe', 540 | 'sabeis', 541 | 'sabemos', 542 | 'saben', 543 | 'saber', 544 | 'sabes', 545 | 'sal', 546 | 'salvo', 547 | 'se', 548 | 'sea', 549 | 'seamos', 550 | 'sean', 551 | 'seas', 552 | 'segun', 553 | 'segunda', 554 | 'segundo', 555 | 'según', 556 | 'seis', 557 | 'ser', 558 | 'sera', 559 | 'seremos', 560 | 'será', 561 | 'serán', 562 | 'serás', 563 | 'seré', 564 | 'seréis', 565 | 'sería', 566 | 'seríais', 567 | 'seríamos', 568 | 'serían', 569 | 'serías', 570 | 'seáis', 571 | 'señaló', 572 | 'si', 573 | 'sido', 574 | 'siempre', 575 | 'siendo', 576 | 'siete', 577 | 'sigue', 578 | 'siguiente', 579 | 'sin', 580 | 'sino', 581 | 'sobre', 582 | 'sois', 583 | 'sola', 584 | 'solamente', 585 | 'solas', 586 | 'solo', 587 | 'solos', 588 | 'somos', 589 | 'son', 590 | 'soy', 591 | 'soyos', 592 | 'su', 593 | 'supuesto', 594 | 'sus', 595 | 'suya', 596 | 'suyas', 597 | 'suyo', 598 | 'suyos', 599 | 'sé', 600 | 'sí', 601 | 'sólo', 602 | 't', 603 | 'tal', 604 | 'tambien', 605 | 'también', 606 | 'tampoco', 607 | 'tan', 608 | 'tanto', 609 | 'tarde', 610 | 'te', 611 | 'temprano', 612 | 'tendremos', 613 | 'tendrá', 614 | 'tendrán', 615 | 'tendrás', 616 | 'tendré', 617 | 'tendréis', 618 | 'tendría', 619 | 'tendríais', 620 | 'tendríamos', 621 | 'tendrían', 622 | 'tendrías', 623 | 'tened', 624 | 'teneis', 625 | 'tenemos', 626 | 'tener', 627 | 'tenga', 628 | 'tengamos', 629 | 'tengan', 630 | 'tengas', 631 | 'tengo', 632 | 'tengáis', 633 | 'tenida', 634 | 'tenidas', 635 | 'tenido', 636 | 'tenidos', 637 | 'teniendo', 638 | 'tenéis', 639 | 'tenía', 640 | 'teníais', 641 | 'teníamos', 642 | 'tenían', 643 | 'tenías', 644 | 'tercera', 645 | 'ti', 646 | 'tiempo', 647 | 'tiene', 648 | 'tienen', 649 | 'tienes', 650 | 'toda', 651 | 'todas', 652 | 'todavia', 653 | 'todavía', 654 | 'todo', 655 | 'todos', 656 | 'total', 657 | 'trabaja', 658 | 'trabajais', 659 | 'trabajamos', 660 | 'trabajan', 661 | 'trabajar', 662 | 'trabajas', 663 | 'trabajo', 664 | 'tras', 665 | 'trata', 666 | 'través', 667 | 'tres', 668 | 'tu', 669 | 'tus', 670 | 'tuve', 671 | 'tuviera', 672 | 'tuvierais', 673 | 'tuvieran', 674 | 'tuvieras', 675 | 'tuvieron', 676 | 'tuviese', 677 | 'tuvieseis', 678 | 'tuviesen', 679 | 'tuvieses', 680 | 'tuvimos', 681 | 'tuviste', 682 | 'tuvisteis', 683 | 'tuviéramos', 684 | 'tuviésemos', 685 | 'tuvo', 686 | 'tuya', 687 | 'tuyas', 688 | 'tuyo', 689 | 'tuyos', 690 | 'tú', 691 | 'u', 692 | 'ultimo', 693 | 'un', 694 | 'una', 695 | 'unas', 696 | 'uno', 697 | 'unos', 698 | 'usa', 699 | 'usais', 700 | 'usamos', 701 | 'usan', 702 | 'usar', 703 | 'usas', 704 | 'uso', 705 | 'usted', 706 | 'ustedes', 707 | 'v', 708 | 'va', 709 | 'vais', 710 | 'valor', 711 | 'vamos', 712 | 'van', 713 | 'varias', 714 | 'varios', 715 | 'vaya', 716 | 'veces', 717 | 'ver', 718 | 'verdad', 719 | 'verdadera', 720 | 'verdadero', 721 | 'vez', 722 | 'vosotras', 723 | 'vosotros', 724 | 'voy', 725 | 'vuestra', 726 | 'vuestras', 727 | 'vuestro', 728 | 'vuestros', 729 | 'w', 730 | 'x', 731 | 'y', 732 | 'ya', 733 | 'yo', 734 | 'z', 735 | 'él', 736 | 'éramos', 737 | 'ésa', 738 | 'ésas', 739 | 'ése', 740 | 'ésos', 741 | 'ésta', 742 | 'éstas', 743 | 'éste', 744 | 'éstos', 745 | 'última', 746 | 'últimas', 747 | 'último', 748 | 'últimos', 749 | ]; 750 | } 751 | -------------------------------------------------------------------------------- /src/Tool/StopWords/StopWordsAbstract.php: -------------------------------------------------------------------------------- 1 | 8 | */ 9 | 10 | declare(strict_types=1); 11 | 12 | namespace PhpScience\TextRank\Tool\StopWords; 13 | 14 | /** 15 | * Class StopWordsAbstract 16 | * 17 | * @package PhpScience\TextRank\Tool\StopWords 18 | */ 19 | abstract class StopWordsAbstract 20 | { 21 | /** 22 | * Stop words for avoid dummy keywords. 23 | * 24 | * @var array 25 | */ 26 | protected $words = []; 27 | 28 | /** 29 | * It retrieves the word exists or does not in the list of Stop words. 30 | * 31 | * @param string $word 32 | * 33 | * @return bool It is True when it exists. 34 | */ 35 | public function exist(string $word): bool 36 | { 37 | return array_search($word, $this->words) !== false; 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/Tool/StopWords/Turkish.php: -------------------------------------------------------------------------------- 1 | 8 | */ 9 | declare(strict_types=1); 10 | namespace PhpScience\TextRank\Tool\StopWords; 11 | /** 12 | * Class Turkish 13 | * 14 | * @package PhpScience\TextRank\Tool\StopWords 15 | */ 16 | class Turkish extends StopWordsAbstract 17 | { 18 | /** 19 | * Stop words for avoid dummy keywords for Language Turkish. 20 | * Word list created by Sezer Fidancı. 21 | * Source: https://raw.githubusercontent.com/abdullahharuntahtali/turkish_stop_words/master/turkish_stopwords.txt 22 | * 23 | * @var array 24 | */ 25 | protected $words = [ 26 | "acaba", 27 | "aksine", 28 | "al", 29 | "alarak", 30 | "aldılar", 31 | "aldım", 32 | "aldırdılar", 33 | "aldırdım", 34 | "aldırmadık", 35 | "aldırmadım", 36 | "almadım", 37 | "almaktadır", 38 | "almıştır", 39 | "altmış", 40 | "altı", 41 | "alıp", 42 | "ama", 43 | "amacı", 44 | "amacında", 45 | "amacıyla", 46 | "amaçla", 47 | "amaçlanmaktadır", 48 | "an", 49 | "ancak", 50 | "anlaşılmaktadır", 51 | "arada", 52 | "arasında", 53 | "artık", 54 | "asla", 55 | "aslında", 56 | "ay", 57 | "ayrıca", 58 | "ayrılmaktadır", 59 | "ayy", 60 | "az", 61 | "azdır", 62 | "bana", 63 | "bazen", 64 | "bazı", 65 | "bazıları", 66 | "bazısı", 67 | "başda", 68 | "başlık", 69 | "başta", 70 | "belgelenmiştir", 71 | "belirlendi", 72 | "belirlenmiş", 73 | "belirlenmişdir", 74 | "belirlenmiştir", 75 | "belirli", 76 | "belki", 77 | "belli", 78 | "ben", 79 | "benden", 80 | "beni", 81 | "benim", 82 | "benimde", 83 | "beri", 84 | "beş", 85 | "beşe", 86 | "beşi", 87 | "beşinci", 88 | "beşli", 89 | "bile", 90 | "bilhassa", 91 | "bin", 92 | "bir", 93 | "biri", 94 | "birisi", 95 | "birkaç", 96 | "birkaçı", 97 | "birkez", 98 | "birlikte", 99 | "birçok", 100 | "birçokları", 101 | "birçoğu", 102 | "birşey", 103 | "birşeyi", 104 | "biz", 105 | "bizden", 106 | "bize", 107 | "bizi", 108 | "bizim", 109 | "bu", 110 | "bulunan", 111 | "bulunanlar", 112 | "bulunduk", 113 | "bulundular", 114 | "bulundum", 115 | "bulundunuz", 116 | "bulunmak", 117 | "bulunuldu", 118 | "bulunulmuştur", 119 | "buna", 120 | "bunda", 121 | "bundan", 122 | "bunlar", 123 | "bunları", 124 | "bunların", 125 | "bunu", 126 | "bunun", 127 | "bununda", 128 | "bununla", 129 | "burada", 130 | "böyle", 131 | "böylece", 132 | "bütün", 133 | "ca", 134 | "ce", 135 | "çeşitler", 136 | "çeşitli", 137 | "çok", 138 | "çoktur", 139 | "çoğu", 140 | "çoğuna", 141 | "çoğunu", 142 | "çünkü", 143 | "da", 144 | "daa", 145 | "daha", 146 | "dahi", 147 | "dair", 148 | "de", 149 | "defa", 150 | "demek", 151 | "değil", 152 | "di", 153 | "diye", 154 | "diğer", 155 | "diğeri", 156 | "diğerleri", 157 | "doksan", 158 | "dokuz", 159 | "dolayı", 160 | "dolayısıyla", 161 | "du", 162 | "durdu", 163 | "durduk", 164 | "durdular", 165 | "durdum", 166 | "durulacak", 167 | "durulacaktır", 168 | "duruldu", 169 | "durulmamış", 170 | "durulmamıştır", 171 | "durulmuştur", 172 | "durulur", 173 | "durulurlar", 174 | "durumda", 175 | "durumdur", 176 | "durunuz", 177 | "dört", 178 | "dış", 179 | "edecek", 180 | "eden", 181 | "ederek", 182 | "edilecek", 183 | "ediliyor", 184 | "edilmesi", 185 | "edilmiş", 186 | "ediyor", 187 | "elbette", 188 | "elli", 189 | "en", 190 | "en çok", 191 | "et", 192 | "etme", 193 | "etmedim", 194 | "etmek", 195 | "etmekte", 196 | "etmesi", 197 | "etti", 198 | "ettiklerini", 199 | "ettirmek", 200 | "ettiği", 201 | "ettiğini", 202 | "eğer", 203 | "fakat", 204 | "felan", 205 | "filan", 206 | "geldiler", 207 | "gelir", 208 | "geliyorlar", 209 | "gelmiş", 210 | "gelmişler", 211 | "gene", 212 | "gerektiğinde", 213 | "getirdi", 214 | "getirdik", 215 | "getirdiler", 216 | "getirdim", 217 | "getirdiniz", 218 | "getirmişler", 219 | "gibi", 220 | "gider", 221 | "gidiyorlar", 222 | "gil", 223 | "giller", 224 | "gine", 225 | "gitmişler", 226 | "gittiler", 227 | "göre", 228 | "ha", 229 | "haa", 230 | "halen", 231 | "hangi", 232 | "hangisi", 233 | "hani", 234 | "hatta", 235 | "he", 236 | "hee", 237 | "hem", 238 | "henüz", 239 | "hep", 240 | "hepsi", 241 | "hepsine", 242 | "hepsini", 243 | "her", 244 | "her biri", 245 | "herhangi", 246 | "herkes", 247 | "herkese", 248 | "herkesi", 249 | "herkesin", 250 | "hi", 251 | "hiç", 252 | "hiç kimse", 253 | "hiçbir", 254 | "hiçbiri", 255 | "hiçbirine", 256 | "hiçbirini", 257 | "hu", 258 | "huu", 259 | "hâlâ", 260 | "hı", 261 | "ın", 262 | "ıt", 263 | "iki", 264 | "ile", 265 | "ilgili", 266 | "in", 267 | "inceledik", 268 | "incelediler", 269 | "incelediniz", 270 | "incelen", 271 | "incelendi", 272 | "incelenmiş", 273 | "ise", 274 | "isimiyle", 275 | "isimle", 276 | "isimlendirildi", 277 | "isimlendirilen", 278 | "isimlendirilmiş", 279 | "isimli", 280 | "ismi ile", 281 | "isminde", 282 | "isminden", 283 | "isminin", 284 | "it", 285 | "itibaren", 286 | "itibariyle", 287 | "içerisi", 288 | "içerisinde", 289 | "içerisine", 290 | "içerisiyle", 291 | "içersi", 292 | "için", 293 | "içinde", 294 | "işte", 295 | "kadar", 296 | "kal", 297 | "kaldı", 298 | "kaldık", 299 | "kaldılar", 300 | "kaldın", 301 | "kalır", 302 | "karşın", 303 | "katrilyon", 304 | "kaç", 305 | "kendi", 306 | "kendilerine", 307 | "kendine", 308 | "kendini", 309 | "kendisi", 310 | "kendisine", 311 | "kendisini", 312 | "kez", 313 | "ki", 314 | "kikir", 315 | "kikiri", 316 | "kim", 317 | "kimden", 318 | "kime", 319 | "kimi", 320 | "kimin", 321 | "kimisi", 322 | "kimse", 323 | "kurulduk", 324 | "kuruldum", 325 | "kurulmak", 326 | "kurulmuştur", 327 | "kıkır", 328 | "kırk", 329 | "la", 330 | "lar", 331 | "le", 332 | "ler", 333 | "madem", 334 | "maksadı", 335 | "maksadı ile", 336 | "maksadıyla", 337 | "mi", 338 | "milyar", 339 | "milyon", 340 | "mu", 341 | "mü", 342 | "mı", 343 | "na", 344 | "nasıl", 345 | "nda", 346 | "nde", 347 | "ndi", 348 | "ndı", 349 | "ne", 350 | "ne kadar", 351 | "ne zaman", 352 | "neden", 353 | "nedenle", 354 | "nedir", 355 | "nerde", 356 | "nerede", 357 | "nereden", 358 | "nereye", 359 | "nesi", 360 | "neyse", 361 | "ni", 362 | "nin", 363 | "niye", 364 | "niçin", 365 | "nu", 366 | "nü", 367 | "nı", 368 | "nın", 369 | "ol", 370 | "olan", 371 | "olanlar", 372 | "olanların", 373 | "olarak", 374 | "oldu", 375 | "olduk", 376 | "olduklarını", 377 | "oldular", 378 | "oldum", 379 | "oldun", 380 | "oldunuz", 381 | "oldurdu", 382 | "oldurdular", 383 | "oldurdun", 384 | "oldurdunuz", 385 | "olduğu", 386 | "olduğunu", 387 | "olmadı", 388 | "olmadığı", 389 | "olmak", 390 | "olmaktadır", 391 | "olması", 392 | "olmayan", 393 | "olmaz", 394 | "olsa", 395 | "olsun", 396 | "olundu", 397 | "olundular", 398 | "olundum", 399 | "olundun", 400 | "olup", 401 | "olur", 402 | "olursa", 403 | "oluyor", 404 | "oluşturmaktadır", 405 | "on", 406 | "ona", 407 | "ondan", 408 | "onlar", 409 | "onlara", 410 | "onlardan", 411 | "onları", 412 | "onların", 413 | "onu", 414 | "onun", 415 | "onunda", 416 | "onunla", 417 | "orada", 418 | "ortada", 419 | "ortadalar", 420 | "ortadan", 421 | "ortadayım", 422 | "ortadayız", 423 | "ortaya", 424 | "ortayı", 425 | "otuz", 426 | "oysa", 427 | "oysaki", 428 | "öbürü", 429 | "ön", 430 | "önce", 431 | "önerdi", 432 | "önerdiler", 433 | "önerdim", 434 | "önerilmiş", 435 | "önerilmiştir", 436 | "ötürü", 437 | "öyle", 438 | "öze", 439 | "özü", 440 | "pek", 441 | "rağmen", 442 | "sadece", 443 | "sana", 444 | "sanki", 445 | "saten", 446 | "sağladığı", 447 | "sekiz", 448 | "seksen", 449 | "sen", 450 | "senden", 451 | "seni", 452 | "senin", 453 | "seninde", 454 | "siz", 455 | "sizden", 456 | "size", 457 | "sizi", 458 | "sizin", 459 | "son", 460 | "sonra", 461 | "sunulmuştur", 462 | "suretiyle", 463 | "suretle", 464 | "söylenebilir", 465 | "sürece", 466 | "süretiyle", 467 | "süretle", 468 | "sürüldü", 469 | "sürüldük", 470 | "sürüldüler", 471 | "sürüldüm", 472 | "sürüldünüz", 473 | "sıfır", 474 | "sırala", 475 | "sıraladım", 476 | "sıralamıştır", 477 | "şayet", 478 | "şekilde", 479 | "şekliyle", 480 | "şey", 481 | "şeyden", 482 | "şeye", 483 | "şeyi", 484 | "şeyler", 485 | "şimdi", 486 | "şu", 487 | "şuna", 488 | "şunda", 489 | "şundan", 490 | "şunlar", 491 | "şunları", 492 | "şunu", 493 | "şunun", 494 | "şununda", 495 | "şununla", 496 | "şöyle", 497 | "ta", 498 | "taa", 499 | "tabi", 500 | "tamam", 501 | "taraftan", 502 | "tarafından", 503 | "tartışılmıştır", 504 | "te", 505 | "tee", 506 | "tir", 507 | "trilyon", 508 | "tüm", 509 | "tümü", 510 | "tır", 511 | "ulaştık", 512 | "ulaştılar", 513 | "ulaştım", 514 | "ulaşılan", 515 | "ulaşıldı", 516 | "ulaşılmak", 517 | "ulaşılır", 518 | "üstlenir", 519 | "üt", 520 | "üzere", 521 | "üzeri", 522 | "üzerinde", 523 | "üzerinden", 524 | "üzerine", 525 | "üzerinize", 526 | "üç", 527 | "var", 528 | "vardı", 529 | "ve", 530 | "veya", 531 | "veyahut", 532 | "ya", 533 | "yaa", 534 | "yani", 535 | "yapacak", 536 | "yapma", 537 | "yapmak", 538 | "yaptı", 539 | "yaptıkları", 540 | "yaptıklarını", 541 | "yaptığı", 542 | "yaptığını", 543 | "yapılan", 544 | "yapılması", 545 | "yapıyor", 546 | "yararlanılmıştır", 547 | "ye", 548 | "yedi", 549 | "yeniden", 550 | "yerine", 551 | "yetmiş", 552 | "yine", 553 | "yinemi", 554 | "yirmi", 555 | "yla", 556 | "yle", 557 | "yoksa", 558 | "yolla", 559 | "yolladı", 560 | "yolladılar", 561 | "yolladım", 562 | "yollayalım", 563 | "yüksektir", 564 | "yüz", 565 | "zannetti", 566 | "zaten", 567 | "zenberek", 568 | "zinhar", 569 | ]; 570 | } -------------------------------------------------------------------------------- /src/Tool/Summarize.php: -------------------------------------------------------------------------------- 1 | 8 | */ 9 | 10 | declare(strict_types=1); 11 | 12 | namespace PhpScience\TextRank\Tool; 13 | 14 | /** 15 | * Class Summarize 16 | * 17 | * This is for summarize the text from parsed data. 18 | * 19 | * @package PhpScience\TextRank\Tool 20 | */ 21 | class Summarize 22 | { 23 | /** 24 | * To find all important sentences. 25 | * 26 | * @var int 27 | */ 28 | const GET_ALL_IMPORTANT = 0; 29 | 30 | /** 31 | * To find the most important sentence and its following sentences. 32 | * 33 | * @var int 34 | */ 35 | const GET_FIRST_IMPORTANT_AND_FOLLOWINGS = 1; 36 | 37 | /** 38 | * Array of sentence weight. Key is the index of the sentence and value is 39 | * the weight of the sentence. 40 | * 41 | * @var array 42 | */ 43 | protected $sentenceWeight = []; 44 | 45 | /** 46 | * Summarize text. 47 | * 48 | * It retrieves the summarized text in array. 49 | * 50 | * @param array $scores Keywords with scores. Score is the key. 51 | * @param Graph $graph The graph of the text. 52 | * @param Text $text Text object what stores all text data. 53 | * @param int $keyWordLimit How many keyword should be used to find the 54 | * important sentences. 55 | * @param int $sentenceLimit How many sentence should be retrieved. 56 | * @param int $type The type of summarizing. Possible values are 57 | * the constants of this class. 58 | * 59 | * @return array An array from sentences. 60 | */ 61 | public function getSummarize( 62 | array &$scores, 63 | Graph &$graph, 64 | Text &$text, 65 | int $keyWordLimit, 66 | int $sentenceLimit, 67 | int $type 68 | ): array { 69 | 70 | $graphData = $graph->getGraph(); 71 | $sentences = $text->getSentences(); 72 | $marks = $text->getMarks(); 73 | $this->findAndWeightSentences($scores, $graphData, $keyWordLimit); 74 | 75 | if ($type == Summarize::GET_ALL_IMPORTANT) { 76 | return $this->getAllImportant($sentences, $marks, $sentenceLimit); 77 | 78 | } else if ($type == Summarize::GET_FIRST_IMPORTANT_AND_FOLLOWINGS) { 79 | return $this->getFirstImportantAndFollowings( 80 | $sentences, 81 | $marks, 82 | $sentenceLimit 83 | ); 84 | } 85 | 86 | return []; 87 | } 88 | 89 | /** 90 | * Find and Weight Sentences. 91 | * 92 | * It finds the most important sentences and stores them into the property. 93 | * 94 | * @param array $scores Keywords with scores. Score is the key. 95 | * @param array $graphData Graph data from a Graph type object. 96 | * @param int $keyWordLimit How many keyword should be used to find the 97 | * important sentences. 98 | */ 99 | protected function findAndWeightSentences( 100 | array &$scores, 101 | array &$graphData, 102 | int $keyWordLimit 103 | ) { 104 | $i = 0; 105 | 106 | foreach ($scores as $word => $score) { 107 | if ($i >= $keyWordLimit) { 108 | break; 109 | } 110 | 111 | $i++; 112 | $wordMap = $graphData[$word]; 113 | 114 | foreach ($wordMap as $key => $value) { 115 | $this->updateSentenceWeight($key); 116 | } 117 | } 118 | 119 | arsort($this->sentenceWeight); 120 | } 121 | 122 | /** 123 | * Important Sentences. 124 | * 125 | * It retrieves the important sentences. 126 | * 127 | * @param array $sentences Sentences, ordered by weights. 128 | * @param array $marks Array of punctuations. Key is the reference 129 | * to the sentence, value is the punctuation. 130 | * @param int $sentenceLimit How many sentence should be retrieved. 131 | * 132 | * @return array An array from sentences what are the most important 133 | * sentences. 134 | */ 135 | protected function getAllImportant( 136 | array &$sentences, 137 | array &$marks, 138 | int $sentenceLimit 139 | ): array { 140 | 141 | $summary = []; 142 | $i = 0; 143 | 144 | foreach ($this->sentenceWeight as $sentenceIdx => $weight) { 145 | if ($i >= $sentenceLimit) { 146 | break; 147 | } 148 | 149 | $i++; 150 | $summary[$sentenceIdx] = $sentences[$sentenceIdx] 151 | . $this->getMark($marks, $sentenceIdx); 152 | } 153 | 154 | ksort($summary); 155 | 156 | return $summary; 157 | } 158 | 159 | /** 160 | * Most Important Sentence and Next. 161 | * 162 | * It retrieves the first most important sentence and its following 163 | * sentences. 164 | * 165 | * @param array $sentences Sentences, ordered by weights. 166 | * @param array $marks Array of punctuations. Key is the reference 167 | * to the sentence, value is the punctuation. 168 | * @param int $sentenceLimit How many sentence should be retrieved. 169 | * 170 | * @return array An array from sentences what contains the most important 171 | * sentence and its following sentences. 172 | */ 173 | protected function getFirstImportantAndFollowings( 174 | array &$sentences, 175 | array &$marks, 176 | int $sentenceLimit 177 | ): array { 178 | 179 | $summary = []; 180 | $startIdx = 0; 181 | 182 | foreach ($this->sentenceWeight as $sentenceIdx => $weight) { 183 | $summary[$sentenceIdx] = $sentences[$sentenceIdx] . 184 | $this->getMark($marks, $sentenceIdx); 185 | 186 | $startIdx = $sentenceIdx; 187 | break; 188 | } 189 | 190 | $i = 0; 191 | 192 | foreach ($sentences as $sentenceIdx => $sentence) { 193 | if ($sentenceIdx <= $startIdx) { 194 | continue; 195 | } else if ($i >= $sentenceLimit - 1) { 196 | break; 197 | } 198 | 199 | $i++; 200 | $summary[$sentenceIdx] = $sentences[$sentenceIdx] . 201 | $this->getMark($marks, $sentenceIdx); 202 | } 203 | 204 | return $summary; 205 | } 206 | 207 | /** 208 | * Update Sentence Weight. 209 | * 210 | * It updates the sentence weight what is stored in the property. 211 | * 212 | * @param int $sentenceIdx Index of the sentence. 213 | */ 214 | protected function updateSentenceWeight(int $sentenceIdx) 215 | { 216 | if (isset($this->sentenceWeight[$sentenceIdx])) { 217 | $this->sentenceWeight[$sentenceIdx] = $this->sentenceWeight[$sentenceIdx] + 1; 218 | } else { 219 | $this->sentenceWeight[$sentenceIdx] = 1; 220 | } 221 | } 222 | 223 | /** 224 | * Punctuations. 225 | * 226 | * It retrieves the punctuation of the sentence. 227 | * 228 | * @param array $marks The punctuation. Key is the reference to the 229 | * sentence, value is the punctuation. 230 | * @param int $idx Key of the punctuation. 231 | * 232 | * @return string The punctuation of the sentence. 233 | */ 234 | protected function getMark(array &$marks, int $idx) 235 | { 236 | return isset($marks[$idx]) ? $marks[$idx] : ''; 237 | } 238 | } 239 | -------------------------------------------------------------------------------- /src/Tool/Text.php: -------------------------------------------------------------------------------- 1 | 8 | */ 9 | 10 | declare(strict_types=1); 11 | 12 | namespace PhpScience\TextRank\Tool; 13 | 14 | /** 15 | * Class Text 16 | * 17 | * This class is for store the parsed texts. 18 | * 19 | * @package PhpScience\TextRank\Tool 20 | */ 21 | class Text 22 | { 23 | /** 24 | * Multidimensional array from words of the text. Key is index of the 25 | * sentence, value is an array from words where key is the index of the 26 | * word and value is the word. 27 | * 28 | * @var array 29 | */ 30 | protected $wordMatrix = []; 31 | 32 | /** 33 | * Array from sentences where key is the index and value is the sentence. 34 | * 35 | * @var array 36 | */ 37 | protected $sentences = []; 38 | 39 | /** 40 | * Array from punctuations where key is the index to link to the sentence 41 | * and value is the punctuation. 42 | * 43 | * @var array 44 | */ 45 | protected $marks = []; 46 | 47 | /** 48 | * It set the Words' matrix to the property. 49 | * 50 | * @param array $wordMatrix Multidimensional array from integer keys and 51 | * string values. 52 | */ 53 | public function setWordMatrix(array $wordMatrix) 54 | { 55 | $this->wordMatrix = $wordMatrix; 56 | } 57 | 58 | /** 59 | * It sets the sentences. 60 | * 61 | * @param array $sentences Array's key should be an int and value should be 62 | * string. 63 | */ 64 | public function setSentences(array $sentences) 65 | { 66 | $this->sentences = $sentences; 67 | } 68 | 69 | /** 70 | * It set the punctuations to the property. 71 | * 72 | * @param array $marks Array's key should be an int and value should be 73 | * string. 74 | */ 75 | public function setMarks(array $marks) 76 | { 77 | $this->marks = $marks; 78 | } 79 | 80 | /** 81 | * It retrieves the words in sentence groups. 82 | * 83 | * @return array Multidimensional array from words of the text. Key is 84 | * index of the sentence, value is an array from words 85 | * where key is the index of the word and value is the word. 86 | */ 87 | public function getWordMatrix(): array 88 | { 89 | return $this->wordMatrix; 90 | } 91 | 92 | /** 93 | * It retrieves the sentences. 94 | * 95 | * @return array Array from sentences where key is the index and value is 96 | * the sentence. 97 | */ 98 | public function getSentences(): array 99 | { 100 | return $this->sentences; 101 | } 102 | 103 | /** 104 | * It retrieves the punctuations. 105 | * 106 | * @return array Array from punctuations where key is the index to link to 107 | * the sentence and value is the punctuation. 108 | */ 109 | public function getMarks(): array 110 | { 111 | return $this->marks; 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /tests/TextRankFacadeTest.php: -------------------------------------------------------------------------------- 1 | 8 | */ 9 | 10 | declare(strict_types=1); 11 | 12 | namespace PhpScience\TextRank; 13 | 14 | use PhpScience\TextRank\Tool\StopWords\English; 15 | use PhpScience\TextRank\Tool\StopWords\Russian; 16 | use PhpScience\TextRank\Tool\Summarize; 17 | use PHPUnit\Framework\TestCase; 18 | 19 | class TextRankFacadeTest extends TestCase 20 | { 21 | protected $sampleText1; 22 | 23 | public function setUp(): void 24 | { 25 | parent::setUp(); 26 | 27 | $path = __DIR__ . DIRECTORY_SEPARATOR . '..' . DIRECTORY_SEPARATOR . 'res' 28 | . DIRECTORY_SEPARATOR . 'sample1.txt'; 29 | $file = fopen($path, 'r'); 30 | 31 | $this->sampleText1 = fread($file, filesize($path)); 32 | 33 | fclose($file); 34 | } 35 | 36 | public function testGetOnlyKeyWords() 37 | { 38 | $api = new TextRankFacade(); 39 | $stopWords = new English(); 40 | $api->setStopWords($stopWords); 41 | 42 | $result = $api->getOnlyKeyWords($this->sampleText1); 43 | 44 | $this->assertTrue(count($result) > 0); 45 | $this->assertTrue(array_values($result)[0] == 1); 46 | } 47 | 48 | public function testGetHighlights() 49 | { 50 | $api = new TextRankFacade(); 51 | $stopWords = new English(); 52 | $api->setStopWords($stopWords); 53 | 54 | $result = $api->getHighlights($this->sampleText1); 55 | 56 | $this->assertTrue(count($result) > 0); 57 | } 58 | 59 | public function testSummarizeTextCompound() 60 | { 61 | $api = new TextRankFacade(); 62 | $stopWords = new English(); 63 | $api->setStopWords($stopWords); 64 | 65 | $result = $api->summarizeTextCompound($this->sampleText1); 66 | 67 | $this->assertTrue(count($result) > 0); 68 | } 69 | 70 | public function testSummarizeTextBasic() 71 | { 72 | $api = new TextRankFacade(); 73 | $stopWords = new English(); 74 | $api->setStopWords($stopWords); 75 | 76 | $result = $api->summarizeTextBasic($this->sampleText1); 77 | 78 | $this->assertTrue(count($result) > 0); 79 | } 80 | 81 | public function testSummarizeTextFreely() 82 | { 83 | $api = new TextRankFacade(); 84 | $stopWords = new English(); 85 | $api->setStopWords($stopWords); 86 | 87 | $result = $api->summarizeTextFreely( 88 | $this->sampleText1, 89 | 5, 90 | 2, 91 | Summarize::GET_ALL_IMPORTANT 92 | ); 93 | 94 | $this->assertTrue(count($result) == 2); 95 | 96 | $result = $api->summarizeTextFreely( 97 | $this->sampleText1, 98 | 10, 99 | 1, 100 | Summarize::GET_FIRST_IMPORTANT_AND_FOLLOWINGS 101 | ); 102 | 103 | $this->assertTrue(count($result) == 1); 104 | 105 | // Stop words. 106 | $result = $api->summarizeTextFreely( 107 | 'one two. one two. three four.', 108 | 2, 109 | 10, 110 | Summarize::GET_ALL_IMPORTANT 111 | ); 112 | 113 | $this->assertTrue(count($result) == 0); 114 | 115 | // Less sentences then expected. 116 | $result = $api->summarizeTextFreely( 117 | 'lorem ipsum. lorem holy ipsum. sit dolor amet.', 118 | 2, 119 | 10, 120 | Summarize::GET_ALL_IMPORTANT 121 | ); 122 | 123 | $this->assertTrue(count($result) == 2); 124 | } 125 | 126 | public function testSmallText() 127 | { 128 | $api = new TextRankFacade(); 129 | $stopWords = new English(); 130 | $api->setStopWords($stopWords); 131 | 132 | $result = $api->getOnlyKeyWords('lorem ipsum sit'); 133 | 134 | $this->assertEquals(2, count($result)); 135 | 136 | $result = $api->getOnlyKeyWords('sit'); 137 | 138 | $this->assertEquals(0, count($result)); 139 | 140 | $result = $api->getOnlyKeyWords(''); 141 | 142 | $this->assertEquals(0, count($result)); 143 | } 144 | 145 | public function testSmallTextRu() 146 | { 147 | $api = new TextRankFacade(); 148 | $stopWords = new Russian(); 149 | $api->setStopWords($stopWords); 150 | $result = $api->getOnlyKeyWords('между холодными ладонями'); 151 | $this->assertCount(2, $result); 152 | 153 | $result = $api->getOnlyKeyWords('конец'); 154 | $this->assertCount(0, $result); 155 | 156 | $result = $api->getOnlyKeyWords(''); 157 | $this->assertCount(0, $result); 158 | } 159 | } 160 | --------------------------------------------------------------------------------