├── .coveralls.yml ├── .gitignore ├── .php_cs.dist ├── .travis.yml ├── LICENSE ├── README.md ├── composer.json ├── phpunit.xml.dist ├── src └── Shdev │ └── FlashText │ ├── FileReadException.php │ └── KeywordProcessor.php └── tests └── Shdev └── FlashText ├── KeywordProcessor ├── CircularTest.php ├── DictionaryLoadingTest.php ├── EmptyTest.php ├── ExceptionTest.php ├── ExtractSpanTest.php ├── ExtractorTest.php ├── FileLoadTest.php ├── GetAllKeywordsTest.php ├── LenTest.php ├── LoadingKeywordListTest.php ├── NoBordersTest.php ├── NonWordBoundaryTest.php ├── RemoveKeywordsTest.php ├── ReplacerTest.php ├── TermInKpTest.php ├── UmlautTest.php ├── keyword_extractor_test_cases.json ├── keyword_remover_test_cases.json ├── keywords_format_one.txt └── keywords_format_two.txt └── flashtext_vs_regex.php /.coveralls.yml: -------------------------------------------------------------------------------- 1 | 2 | repo_token: your_token # should be kept secret! 3 | service_name: travis-ci 4 | 5 | coverage_clover: var/logs/phpunit/clover.xml 6 | json_path: var/logs/coverall/coveralls-upload.json -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/phpstorm+all 3 | 4 | ### PhpStorm+all ### 5 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 6 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 7 | 8 | # User-specific stuff: 9 | .idea/**/workspace.xml 10 | .idea/**/tasks.xml 11 | .idea/dictionaries 12 | 13 | # Sensitive or high-churn files: 14 | .idea/**/dataSources/ 15 | .idea/**/dataSources.ids 16 | .idea/**/dataSources.xml 17 | .idea/**/dataSources.local.xml 18 | .idea/**/sqlDataSources.xml 19 | .idea/**/dynamic.xml 20 | .idea/**/uiDesigner.xml 21 | 22 | # Gradle: 23 | .idea/**/gradle.xml 24 | .idea/**/libraries 25 | 26 | # 
CMake 27 | cmake-build-debug/ 28 | 29 | # Mongo Explorer plugin: 30 | .idea/**/mongoSettings.xml 31 | 32 | ## File-based project format: 33 | *.iws 34 | 35 | ## Plugin-specific files: 36 | 37 | # IntelliJ 38 | /out/ 39 | 40 | # mpeltonen/sbt-idea plugin 41 | .idea_modules/ 42 | 43 | # JIRA plugin 44 | atlassian-ide-plugin.xml 45 | 46 | # Cursive Clojure plugin 47 | .idea/replstate.xml 48 | 49 | # Ruby plugin and RubyMine 50 | /.rakeTasks 51 | 52 | # Crashlytics plugin (for Android Studio and IntelliJ) 53 | com_crashlytics_export_strings.xml 54 | crashlytics.properties 55 | crashlytics-build.properties 56 | fabric.properties 57 | 58 | ### PhpStorm+all Patch ### 59 | # Ignores the whole idea folder 60 | # See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360 61 | 62 | .idea/ 63 | 64 | # End of https://www.gitignore.io/api/phpstorm+all 65 | var/logs/phpunit 66 | vendor 67 | composer.lock 68 | var/logs/coverall 69 | -------------------------------------------------------------------------------- /.php_cs.dist: -------------------------------------------------------------------------------- 1 | exclude('somedir') 5 | ->in(__DIR__) 6 | ; 7 | 8 | return PhpCsFixer\Config::create() 9 | ->setCacheFile(__DIR__.'/.php_cs.cache') 10 | ->setIndent(' ') 11 | ->setRules([ 12 | '@Symfony' => true, 13 | '@DoctrineAnnotation' => true, 14 | 'array_syntax' => array('syntax' => 'short'), 15 | 'no_singleline_whitespace_before_semicolons' => false, 16 | 'phpdoc_no_package' => false, 17 | 'phpdoc_var_without_name' => false, 18 | ]) 19 | ->setFinder($finder) 20 | ; 21 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: php 2 | php: 3 | - '5.6' 4 | - '7.0' 5 | - '7.1' 6 | - '7.2' 7 | 8 | git: 9 | depth: 1 10 | 11 | notifications: 12 | email: 13 | - mail-github@sh-dev.de 14 | 15 | install: 16 | - 
composer install 17 | 18 | script: 19 | - mkdir -p var/logs/{phpunit,coverall} 20 | - vendor/bin/phpunit --coverage-clover var/logs/phpunit/clover.xml tests/Shdev/FlashText/KeywordProcessor/ 21 | - '[ -f var/logs/phpunit/clover.xml ] && vendor/bin/php-coveralls --coverage_clover var/logs/phpunit/clover.xml -o var/logs/coverall/coveralls-upload.json || echo coverage data could not created' 22 | # - php tests/Shdev/FlashText/flashtext_vs_regex.php 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Sebastian Holtz (mail-github@sh-dev.de) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # Flashtext for PHP 3 | 4 | 5 | [![Build Status](https://travis-ci.org/shdev/phpflashtext.svg?branch=master)](https://travis-ci.org/shdev/phpflashtext) [![Coverage Status](https://coveralls.io/repos/github/shdev/phpflashtext/badge.svg?branch=master)](https://coveralls.io/github/shdev/phpflashtext?branch=master) 6 | 7 | It's a port from the wonderful Python project https://github.com/vi3k6i5/flashtext, 8 | for the internals of the algorithm, look there. 9 | 10 | This algorithm allows you to extract or replace several keywords at once. 11 | If you deal with 300 keywords, which have 5 variants each, a regex approach is slower than the flashtext approach. 12 | For 1000 keywords with 5 variants each, the regex can't be built. 13 | 14 | In PHP 5.6 using regex is really slow. In newer versions it performs better. 15 | 16 | ## Install 17 | 18 | ```bash 19 | composer require shdev/phpflashtext 20 | ``` 21 | 22 | ## Usage 23 | 24 | ```php 25 | ['java_2e', 'java programing'], 33 | 'product management' => ['product management techniques', 'product management'], 34 | ]; 35 | 36 | $keywordProcessor->addKeywordsFromAssocArray($keywords); 37 | 38 | $sentence = 'I know java_2e and product management techniques'; 39 | 40 | $keywordsExtracted = $keywordProcessor->extractKeywords($sentence); 41 | // $keywordsExtracted = ['java', 'product management'] 42 | 43 | $keywordsExtractedWithSpanInfo = $keywordProcessor->extractKeywords($sentence, true); 44 | // $keywordsExtractedWithSpanInfo = [ 45 | // ['java', 7, 14], 46 | // ['product management', 19, 48], 47 | //] 48 | 49 | 50 | $sentenceNew = $keywordProcessor->replaceKeywords($sentence); 51 | // $sentenceNew = 'I know java and product management'; 52 | 53 | ``` 54 | 55 | ## Citation 56 | 57 | 58 | The original paper published on [FlashText 
algorithm](https://arxiv.org/abs/1711.00046). 59 | 60 | ```tex 61 | @ARTICLE{2017arXiv171100046S, 62 | author = {{Singh}, V.}, 63 | title = "{Replace or Retrieve Keywords In Documents at Scale}", 64 | journal = {ArXiv e-prints}, 65 | archivePrefix = "arXiv", 66 | eprint = {1711.00046}, 67 | primaryClass = "cs.DS", 68 | keywords = {Computer Science - Data Structures and Algorithms}, 69 | year = 2017, 70 | month = oct, 71 | adsurl = {http://adsabs.harvard.edu/abs/2017arXiv171100046S}, 72 | adsnote = {Provided by the SAO/NASA Astrophysics Data System} 73 | } 74 | 75 | ``` 76 | The article published on [Medium freeCodeCamp](https://medium.freecodecamp.org/regex-was-taking-5-days-flashtext-does-it-in-15-minutes-55f04411025f). -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "shdev/phpflashtext", 3 | "license": "MIT", 4 | "description": "A port of the flashtext python implementation", 5 | "type": "library", 6 | "authors": [ 7 | { 8 | "name": "Sebastian Holtz", 9 | "email": "holtz@make-better.de" 10 | } 11 | ], 12 | "require": { 13 | "php": ">=5.6.0" 14 | }, 15 | "autoload": { 16 | "psr-4": { 17 | "": "src/" 18 | } 19 | }, 20 | "autoload-dev": { 21 | "psr-4": { 22 | "Tests\\": "tests/" 23 | } 24 | }, 25 | "require-dev": { 26 | "phpunit/phpunit": "^5.7", 27 | "symfony/var-dumper": "^3.4", 28 | "symfony/stopwatch": "^3.4", 29 | "php-coveralls/php-coveralls": "^2.0" 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /phpunit.xml.dist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | tests 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | src 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /src/Shdev/FlashText/FileReadException.php: 
-------------------------------------------------------------------------------- 1 | caseSensitiv = $caseSensitiv; 37 | $this->setNonWordBoundaries(self::INIT_NON_WORD_BOUNDARIES); 38 | } 39 | 40 | /** 41 | * @return int 42 | */ 43 | public function count() 44 | { 45 | return $this->termsInTrie; 46 | } 47 | 48 | /** 49 | * @param $word 50 | * 51 | * @return bool 52 | */ 53 | public function contains($word) 54 | { 55 | if (!$this->caseSensitiv) { 56 | $word = mb_strtolower($word); 57 | } 58 | $currentDict = $this->keywordTrieDict; 59 | $lenCovered = 0; 60 | 61 | $chars = str_split($word); 62 | foreach ($chars as $char) { 63 | if (isset($currentDict[$char])) { 64 | $currentDict = $currentDict[$char]; 65 | ++$lenCovered; 66 | } else { 67 | break; 68 | } 69 | } 70 | 71 | return isset($currentDict[self::TREE_LEAF]) && $lenCovered === strlen($word); 72 | } 73 | 74 | /** 75 | * @return string[] 76 | */ 77 | public function getNonWordBoundaries() 78 | { 79 | return $this->nonWordBoundaries; 80 | } 81 | 82 | /** 83 | * @param string|string[] $nonWordBoundaries 84 | * @return KeywordProcessor 85 | */ 86 | public function setNonWordBoundaries($nonWordBoundaries) 87 | { 88 | if (is_string($nonWordBoundaries)) { 89 | $nonWordBoundaries = str_split($nonWordBoundaries); 90 | } 91 | 92 | $this->nonWordBoundaries = $nonWordBoundaries; 93 | 94 | return $this; 95 | } 96 | 97 | /** 98 | * @param string $nonWordBoundary 99 | * @return KeywordProcessor 100 | */ 101 | public function addNonWordBoundaries($nonWordBoundary) 102 | { 103 | $this->nonWordBoundaries[] = $nonWordBoundary; 104 | 105 | return $this; 106 | } 107 | 108 | /** 109 | * @param string $keyword 110 | * @param string|null $cleanName 111 | * @return bool 112 | */ 113 | public function addKeyword($keyword, $cleanName = null) 114 | { 115 | $status = false; 116 | 117 | if (!$cleanName && $keyword) { 118 | $cleanName = $keyword; 119 | } 120 | 121 | if ($keyword && $cleanName) { 122 | if (!$this->caseSensitiv) { 123 | $keyword 
= mb_strtolower($keyword); 124 | } 125 | $currentDict = &$this->keywordTrieDict; 126 | 127 | $chars = str_split($keyword); 128 | foreach ($chars as $char) { 129 | if (!isset($currentDict[$char])) { 130 | $currentDict[$char] = []; 131 | } 132 | $currentDict = &$currentDict[$char]; 133 | } 134 | 135 | if (!isset($currentDict[self::TREE_LEAF])) { 136 | $status = true; 137 | ++$this->termsInTrie; 138 | } 139 | 140 | $currentDict[self::TREE_LEAF ] = $cleanName; 141 | } 142 | 143 | return $status; 144 | } 145 | 146 | /** 147 | * @param string $keyword 148 | * @return bool 149 | */ 150 | public function removeKeyword($keyword) 151 | { 152 | $status = false; 153 | 154 | if ($keyword) { 155 | if (!$this->caseSensitiv) { 156 | $keyword = mb_strtolower($keyword); 157 | } 158 | $currentDict = &$this->keywordTrieDict; 159 | 160 | $characterTrieList = []; 161 | 162 | $chars = str_split($keyword); 163 | foreach ($chars as $char) { 164 | if (isset($currentDict[$char])) { 165 | $characterTrieList[] = [$char, &$currentDict]; 166 | $currentDict = &$currentDict[$char]; 167 | } 168 | } 169 | 170 | if (isset($currentDict[self::TREE_LEAF])) { 171 | $characterTrieList[] = [self::TREE_LEAF, &$currentDict]; 172 | $characterTrieList = array_reverse($characterTrieList); 173 | 174 | foreach ($characterTrieList as $item) { 175 | $keyToRemove = $item[0]; 176 | $dictPointer = &$item[1]; 177 | if (1 === count(array_keys($dictPointer))) { 178 | unset($dictPointer[$keyToRemove]); 179 | } else { 180 | unset($dictPointer[$keyToRemove]); 181 | break; 182 | } 183 | } 184 | 185 | $status = true; 186 | --$this->termsInTrie; 187 | } 188 | } 189 | 190 | return $status; 191 | } 192 | 193 | /** 194 | * @param $word 195 | * @return null|string 196 | */ 197 | public function getKeyword($word) 198 | { 199 | if (!$this->caseSensitiv) { 200 | $word = mb_strtolower($word); 201 | } 202 | $currentDict = $this->keywordTrieDict; 203 | $lenCovered = 0; 204 | $chars = str_split($word); 205 | foreach ($chars as $char) { 
206 | if (isset($currentDict[$char])) { 207 | $currentDict = $currentDict[$char]; 208 | ++$lenCovered; 209 | } else { 210 | break; 211 | } 212 | } 213 | 214 | if (isset($currentDict[self::TREE_LEAF]) && $lenCovered === strlen($word)) { 215 | return $currentDict[self::TREE_LEAF]; 216 | } 217 | 218 | return null; 219 | } 220 | 221 | /** 222 | * @param $keywordFile 223 | * @return $this 224 | * @throws FileReadException 225 | */ 226 | public function addKeywordFromFile($keywordFile) 227 | { 228 | 229 | $fileContent = @file_get_contents($keywordFile); 230 | 231 | if (false === $fileContent) { 232 | throw new FileReadException(sprintf('Error during reading file \'%s\'.', $keywordFile)); 233 | } 234 | $lines = explode(PHP_EOL, $fileContent); 235 | 236 | foreach ($lines as $line) { 237 | $keyword = null; 238 | $cleanName = null; 239 | if (false === strpos($line, '=>')) { 240 | $keyword = trim($line); 241 | } else { 242 | list($keyword, $cleanName) = explode('=>', $line, 2); 243 | $keyword = trim($keyword); 244 | $cleanName = trim($cleanName); 245 | } 246 | $this->addKeyword($keyword, $cleanName); 247 | } 248 | 249 | return $this; 250 | } 251 | 252 | /** 253 | * @param string[] $array 254 | * @return KeywordProcessor 255 | */ 256 | public function addKeywordsFromAssocArray(array $array) 257 | { 258 | foreach ($array as $cleanName => $keywords) { 259 | foreach ((array)$keywords as $keyword) { 260 | $this->addKeyword($keyword, $cleanName); 261 | } 262 | } 263 | 264 | return $this; 265 | } 266 | 267 | /** 268 | * @param string[] $array 269 | * @return $this 270 | */ 271 | public function removeKeywordsFromAssocArray(array $array) 272 | { 273 | foreach ($array as $cleanName => $keywords) { 274 | foreach ((array) $keywords as $keyword) { 275 | $this->removeKeyword($keyword); 276 | } 277 | } 278 | 279 | return $this; 280 | } 281 | 282 | /** 283 | * @param array $keywords 284 | * @return $this 285 | */ 286 | public function addKeywordsFromArray(array $keywords) 287 | { 288 | 
foreach ($keywords as $keyword) { 289 | $this->addKeyword($keyword); 290 | } 291 | 292 | return $this; 293 | } 294 | 295 | /** 296 | * @param array $keywords 297 | * @return $this 298 | */ 299 | public function removeKeywordFromArray(array $keywords) 300 | { 301 | foreach ($keywords as $keyword) { 302 | $this->removeKeyword($keyword); 303 | } 304 | 305 | return $this; 306 | } 307 | 308 | /** 309 | * @return string[] 310 | */ 311 | public function getAllKeywords() 312 | { 313 | return $this->getAllKeywordsRecursive(); 314 | } 315 | 316 | /** 317 | * @param string $termSoFar 318 | * @param string[]|null $currentDict 319 | * 320 | * @return string[] 321 | */ 322 | private function getAllKeywordsRecursive($termSoFar = '', array &$currentDict = null) 323 | { 324 | $termPresent = []; 325 | 326 | if (!$termSoFar) { 327 | $termSoFar = ''; 328 | } 329 | if (null === $currentDict) { 330 | $currentDict = &$this->keywordTrieDict; 331 | } 332 | 333 | foreach ($currentDict as $key => $value) { 334 | if (self::TREE_LEAF === $key) { 335 | $termPresent[$termSoFar] = $value; 336 | } else { 337 | $subValues = $this->getAllKeywordsRecursive($termSoFar . 
$key, $currentDict[$key]); 338 | foreach ($subValues as $subKey => $subValue) { 339 | $termPresent[$subKey] = $subValue; 340 | } 341 | } 342 | } 343 | 344 | return $termPresent; 345 | } 346 | 347 | /** 348 | * @param $sentence 349 | * @param bool $spanInfo 350 | * @return array 351 | */ 352 | public function extractKeywords($sentence, $spanInfo = false) 353 | { 354 | $keywordsExtracted= []; 355 | if (!$sentence) { 356 | return $keywordsExtracted; 357 | } 358 | 359 | if (!$this->caseSensitiv) { 360 | $sentence = mb_strtolower($sentence); 361 | } 362 | 363 | $currentDict = &$this->keywordTrieDict; 364 | $sequenceStartPos = 0; 365 | $sequenceEndPos = 0; 366 | $resetCurrentDict = false; 367 | $idx = 0; 368 | $sentenceLen = strlen($sentence); 369 | $decreaseIndex = false; 370 | while ($idx < $sentenceLen) { 371 | $char = $sentence[$idx]; 372 | if (!in_array($char, $this->nonWordBoundaries, true)) { 373 | if (isset($currentDict[$char]) || isset($currentDict[self::TREE_LEAF ])) { 374 | $sequenceFound= null; 375 | $longestSequenceFound= null; 376 | $isLongerSeqFound= false; 377 | if (isset($currentDict[self::TREE_LEAF])) { 378 | $longestSequenceFound = $currentDict[self::TREE_LEAF ]; 379 | $sequenceEndPos = $idx; 380 | } 381 | if (isset($currentDict[$char])) { 382 | $currentDictContinued = &$currentDict[$char]; 383 | 384 | $idy = $idx + 1; 385 | 386 | $notBroken = true; 387 | while ($idy < $sentenceLen) { 388 | $innerChar = $sentence[$idy]; 389 | if (!in_array($innerChar, $this->nonWordBoundaries, true) && isset($currentDictContinued[self::TREE_LEAF ])) { 390 | $longestSequenceFound = $currentDictContinued[self::TREE_LEAF]; 391 | $sequenceEndPos = $idy; 392 | $isLongerSeqFound = true; 393 | } 394 | if (isset($currentDictContinued[$innerChar])) { 395 | $currentDictContinued= &$currentDictContinued[$innerChar]; 396 | } else { 397 | $notBroken = false; 398 | break; 399 | } 400 | ++$idy; 401 | } 402 | 403 | if ($notBroken && isset($currentDictContinued[self::TREE_LEAF])) { 404 
| $longestSequenceFound = $currentDictContinued[self::TREE_LEAF]; 405 | $sequenceEndPos = $idy; 406 | $isLongerSeqFound= true; 407 | } 408 | if ($isLongerSeqFound) { 409 | $idx = $sequenceEndPos; 410 | } 411 | } 412 | $currentDict = &$this->keywordTrieDict; 413 | if ($longestSequenceFound) { 414 | $keywordsExtracted[] = [$longestSequenceFound, $sequenceStartPos, $idx]; 415 | // Decrease index to match a possible candidate which starts immediately 416 | $decreaseIndex = true; 417 | } 418 | $resetCurrentDict= true; 419 | } else { 420 | $currentDict = &$this->keywordTrieDict; 421 | $resetCurrentDict = true; 422 | } 423 | } elseif (isset($currentDict[$char])) { 424 | $currentDict = &$currentDict[$char]; 425 | } else { 426 | $currentDict = &$this->keywordTrieDict; 427 | $resetCurrentDict = true; 428 | 429 | $idy = $idx + 1; 430 | while ($idy < $sentenceLen) { 431 | $char = $sentence[$idy]; 432 | if (!in_array($char, $this->nonWordBoundaries, true)) { 433 | break; 434 | } 435 | ++$idy; 436 | } 437 | $idx = $idy; 438 | } 439 | if (($idx + 1) >= $sentenceLen) { 440 | if (isset($currentDict[self::TREE_LEAF])) { 441 | $sequenceFound = $currentDict[self::TREE_LEAF]; 442 | $keywordsExtracted[] = [$sequenceFound, $sequenceStartPos, $sentenceLen]; 443 | } 444 | } 445 | $idx++; 446 | if ($resetCurrentDict) { 447 | $resetCurrentDict = false; 448 | $sequenceStartPos = $idx; 449 | $longestSequenceFound = ''; 450 | } 451 | if ($decreaseIndex) { 452 | $decreaseIndex = false; 453 | $idx--; 454 | $sequenceStartPos = $idx; 455 | } 456 | 457 | } 458 | 459 | if ($spanInfo) { 460 | return $keywordsExtracted; 461 | } 462 | 463 | return array_map(function ($value) { return $value[0]; }, $keywordsExtracted); 464 | } 465 | 466 | /** 467 | * @param $sentence 468 | * @return string 469 | */ 470 | public function replaceKeywords($sentence) 471 | { 472 | $newSentence = ''; 473 | if (!$sentence) { 474 | return $newSentence; 475 | } 476 | 477 | $origSentence= $sentence; 478 | if 
(!$this->caseSensitiv) { 479 | $sentence = mb_strtolower($sentence); 480 | } 481 | $currentWord = ''; 482 | $currentDict= &$this->keywordTrieDict; 483 | $sequenceEndPos = 0; 484 | $idx = 0; 485 | $sentenceLen= strlen($sentence); 486 | 487 | while ($idx < $sentenceLen) { 488 | $char = $sentence[$idx]; 489 | $currentWord .= $origSentence[$idx]; 490 | 491 | if (!in_array($char, $this->nonWordBoundaries, true)) { 492 | $currentWhiteSpace = $char; 493 | 494 | if (isset($currentDict[self::TREE_LEAF]) || isset($currentDict[$char])) { 495 | $sequenceFound = null; 496 | $longestSequenceFound = null; 497 | $isLongerSeqFound = false; 498 | if (isset($currentDict[self::TREE_LEAF])) { 499 | $longestSequenceFound = $currentDict[self::TREE_LEAF ]; 500 | $sequenceEndPos = $idx; 501 | } 502 | 503 | if (isset($currentDict[$char])) { 504 | $currentDictContinued = $currentDict[$char]; 505 | $currentWordContinued = $currentWord; 506 | $idy = $idx + 1; 507 | 508 | $notBroken = true; 509 | while ($idy < $sentenceLen) { 510 | $innerChar = $sentence[$idy]; 511 | $currentWordContinued .= $origSentence[$idy]; 512 | if (!in_array($innerChar, $this->nonWordBoundaries, true) && isset($currentDictContinued[self::TREE_LEAF ])) { 513 | $currentWhiteSpace = $innerChar; 514 | $longestSequenceFound = $currentDictContinued[self::TREE_LEAF]; 515 | $sequenceEndPos = $idy; 516 | $isLongerSeqFound = true; 517 | } 518 | if (isset($currentDictContinued[$innerChar])) { 519 | $currentDictContinued = &$currentDictContinued[$innerChar]; 520 | } else { 521 | $notBroken = false; 522 | break; 523 | } 524 | ++$idy; 525 | } 526 | if ($notBroken && isset($currentDictContinued[self::TREE_LEAF])) { 527 | $currentWhiteSpace = ''; 528 | $longestSequenceFound = $currentDictContinued[self::TREE_LEAF]; 529 | $sequenceEndPos = $idy; 530 | $isLongerSeqFound = true; 531 | } 532 | if ($isLongerSeqFound) { 533 | $idx = $sequenceEndPos; 534 | $currentWord = $currentWordContinued; 535 | } 536 | } 537 | 538 | $currentDict = 
&$this->keywordTrieDict; 539 | if ($longestSequenceFound) { 540 | $newSentence .= $longestSequenceFound . $currentWhiteSpace; 541 | $currentWord = ''; 542 | } else { 543 | $newSentence .= $currentWord; 544 | $currentWord = ''; 545 | } 546 | } else { 547 | $currentDict = &$this->keywordTrieDict; 548 | $newSentence .= $currentWord; 549 | $currentWord = ''; 550 | } 551 | } elseif (isset($currentDict[$char])) { 552 | $currentDict = &$currentDict[$char]; 553 | } else { 554 | $currentDict = &$this->keywordTrieDict; 555 | $idy = $idx + 1; 556 | while ($idy < $sentenceLen) { 557 | $char = $sentence[$idy]; 558 | $currentWord .= $origSentence[$idy]; 559 | if (!in_array($char, $this->nonWordBoundaries, true)) { 560 | break; 561 | } 562 | ++$idy; 563 | } 564 | $idx = $idy; 565 | $newSentence .= $currentWord; 566 | $currentWord = ''; 567 | } 568 | 569 | if (($idx + 1) >= $sentenceLen) { 570 | if (isset($currentDict[self::TREE_LEAF])) { 571 | $sequenceFound = $currentDict[self::TREE_LEAF]; 572 | $newSentence .= $sequenceFound; 573 | } elseif ($currentWord) { 574 | $newSentence .= $currentWord; 575 | } 576 | } 577 | ++$idx; 578 | } 579 | return $newSentence; 580 | } 581 | 582 | /** 583 | * @return array 584 | */ 585 | public function getKeywordTrieDict() 586 | { 587 | return $this->keywordTrieDict; 588 | } 589 | 590 | 591 | } 592 | -------------------------------------------------------------------------------- /tests/Shdev/FlashText/KeywordProcessor/CircularTest.php: -------------------------------------------------------------------------------- 1 | ['php'], 21 | 'php' => ['java'], 22 | ]; 23 | 24 | $keywordProcessor->addKeywordsFromAssocArray($keywordAssocArray); 25 | 26 | $sentence = 'I know java but I love php and java hugs php.'; 27 | 28 | $keywordsExtracted = $keywordProcessor->extractKeywords($sentence); 29 | 30 | $this->assertEquals(['php', 'java', 'php', 'java'], $keywordsExtracted); 31 | 32 | $sentenceNew = $keywordProcessor->replaceKeywords($sentence); 33 | 
$this->assertEquals('I know php but I love java and php hugs java.', $sentenceNew); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /tests/Shdev/FlashText/KeywordProcessor/DictionaryLoadingTest.php: -------------------------------------------------------------------------------- 1 | ['java_2e', 'java programing'], 22 | 'product management' => ['product management techniques', 'product management'], 23 | ]; 24 | 25 | $keywordProcessor->addKeywordsFromAssocArray($keywordAssocArray); 26 | 27 | $sentence = 'I know java_2e and product management techniques'; 28 | 29 | $keywordsExtracted= $keywordProcessor->extractKeywords($sentence); 30 | 31 | $this->assertEquals(['java', 'product management'], $keywordsExtracted, 32 | 'Failed file format one test'); 33 | 34 | $sentenceNew= $keywordProcessor->replaceKeywords($sentence); 35 | $this->assertEquals('I know java and product management', $sentenceNew, 36 | 'Failed file format one test'); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /tests/Shdev/FlashText/KeywordProcessor/EmptyTest.php: -------------------------------------------------------------------------------- 1 | assertEquals([],$keywordProcessor->extractKeywords('')); 20 | $this->assertEquals('',$keywordProcessor->replaceKeywords('')); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /tests/Shdev/FlashText/KeywordProcessor/ExceptionTest.php: -------------------------------------------------------------------------------- 1 | addKeywordFromFile('missing file'); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /tests/Shdev/FlashText/KeywordProcessor/ExtractSpanTest.php: -------------------------------------------------------------------------------- 1 | testData as $testId => $testCase) { 21 | $keywordProcessor = new KeywordProcessor(); 22 | 23 | foreach 
($testCase['keyword_dict'] as $keywords) { 24 | $keywordProcessor->addKeywordsFromArray($keywords); 25 | } 26 | $keywordsExtracted = $keywordProcessor->extractKeywords($testCase['sentence'], true); 27 | 28 | foreach ($keywordsExtracted as $kwd) { 29 | $this->assertEquals(mb_strtolower(substr($testCase['sentence'], $kwd[1], $kwd[2] - $kwd[1])), mb_strtolower($kwd[0]), 30 | sprintf('keywords span don\'t match the expected results for test case: %s', $testId) 31 | ); 32 | } 33 | } 34 | } 35 | 36 | public function testExtractKeywordsCaseSensitive() 37 | { 38 | foreach ($this->testData as $testId => $testCase) { 39 | $keywordProcessor = new KeywordProcessor(true); 40 | 41 | foreach ($testCase['keyword_dict'] as $keywords) { 42 | $keywordProcessor->addKeywordsFromArray($keywords); 43 | } 44 | $keywordsExtracted = $keywordProcessor->extractKeywords($testCase['sentence'], true); 45 | 46 | foreach ($keywordsExtracted as $kwd) { 47 | $this->assertEquals(mb_strtolower(substr($testCase['sentence'], $kwd[1], $kwd[2] - $kwd[1])), mb_strtolower($kwd[0]), 48 | sprintf('keywords span don\'t match the expected results for test case: %s', $testId) 49 | ); 50 | } 51 | } 52 | } 53 | 54 | protected function setUp() 55 | { 56 | $testData = file_get_contents(__DIR__ . 
'/keyword_extractor_test_cases.json'); 57 | $this->testData = json_decode($testData, true); 58 | } 59 | 60 | 61 | } 62 | -------------------------------------------------------------------------------- /tests/Shdev/FlashText/KeywordProcessor/ExtractorTest.php: -------------------------------------------------------------------------------- 1 | testData = json_decode($testData, true); 22 | } 23 | 24 | public function testExtractKeywords() 25 | { 26 | foreach ($this->testData as $testId => $testCase) { 27 | $keywordProcessor = new KeywordProcessor(); 28 | $keywordProcessor->addKeywordsFromAssocArray($testCase['keyword_dict']); 29 | $keywordsExtracted = $keywordProcessor->extractKeywords($testCase['sentence']); 30 | $this->assertEquals($testCase['keywords'], $keywordsExtracted, sprintf('keywords_extracted don\'t match the expected results for test case: %s', $testId)); 31 | } 32 | } 33 | 34 | public function testExtractKeywordsCaseSensitive() 35 | { 36 | foreach ($this->testData as $testId => $testCase) { 37 | $keywordProcessor= new KeywordProcessor(true); 38 | $keywordProcessor->addKeywordsFromAssocArray($testCase['keyword_dict']); 39 | $keywordsExtracted = $keywordProcessor->extractKeywords($testCase['sentence']); 40 | $this->assertEquals($testCase['keywords_case_sensitive'], $keywordsExtracted, 41 | sprintf('keywords_extracted don\'t match the expected results for test case: %s', $testId)); 42 | } 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /tests/Shdev/FlashText/KeywordProcessor/FileLoadTest.php: -------------------------------------------------------------------------------- 1 | addKeywordFromFile(__DIR__ . 
'/keywords_format_one.txt'); 20 | 21 | /* java_2e and product management techniques are left-hand entries of keywords_format_one.txt; the assertions below expect them to resolve to java and product management. */ $sentence = 'I know java_2e and product management techniques'; 22 | $keywordExtracted = $keywordProcessor->extractKeywords($sentence); 23 | $this->assertEquals(['java', 'product management'], $keywordExtracted); 24 | 25 | $sentenceNew = $keywordProcessor->replaceKeywords($sentence); 26 | $this->assertEquals('I know java and product management', $sentenceNew); 27 | } 28 | 29 | /** Loads keywords_format_two.txt (one plain keyword per line) through addKeywordFromFile(), then checks that both keywords are extracted and that replaceKeywords() returns the sentence unchanged. */ public function testFileFormatTwo() 30 | { 31 | $keywordProcessor = new KeywordProcessor(); 32 | $keywordProcessor->addKeywordFromFile(__DIR__ . '/keywords_format_two.txt'); 33 | 34 | $sentence = 'I know java and product management'; 35 | $keywordExtracted = $keywordProcessor->extractKeywords($sentence); 36 | $this->assertEquals(['java', 'product management'], $keywordExtracted); 37 | 38 | $sentenceNew = $keywordProcessor->replaceKeywords($sentence); 39 | $this->assertEquals('I know java and product management', $sentenceNew); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /tests/Shdev/FlashText/KeywordProcessor/GetAllKeywordsTest.php: -------------------------------------------------------------------------------- 1 | addKeyword('colour', 'color'); 20 | $keywordProcessor->addKeyword('j2ee', 'Java'); 21 | 22 | /* getAllKeywords() is expected to return a map from keyword to clean name for the two entries added here. */ $this->assertEquals(['colour' => 'color', 'j2ee' => 'Java'], $keywordProcessor->getAllKeywords(), 'get_all_keywords didn\'t match expected results.'); 23 | } 24 | 25 | 26 | } 27 | -------------------------------------------------------------------------------- /tests/Shdev/FlashText/KeywordProcessor/LenTest.php: -------------------------------------------------------------------------------- 1 | testData as $testId => $testCase) { 21 | $keywordProcessor = new KeywordProcessor(); 22 | $keywordProcessor->addKeywordsFromAssocArray($testCase['keyword_dict']); 23 | 24 | $kpCount= $keywordProcessor->count(); 25 | /* Expected size is the total number of keyword strings over all clean names. array_reduce() gets no initial value here, so $carry starts as null and the first addition treats it as 0. */ $kpCountExpected= array_reduce($testCase['keyword_dict'], function ($carry , $item ) { 26
| return $carry + count($item); 27 | }); 28 | 29 | $this->assertEquals($kpCountExpected, $kpCount, sprintf('keyword processor length doesn\'t match for Text ID %s', $testId)); 30 | 31 | /* After the removal the test expects count() to have dropped by the number of keyword strings listed in remove_keyword_dict. */ $keywordProcessor->removeKeywordsFromAssocArray($testCase['remove_keyword_dict']); 32 | 33 | $kpDecressedCount= $keywordProcessor->count(); 34 | $kpDecressedCountExpected = array_reduce($testCase['remove_keyword_dict'], function ($carry , $item ) { 35 | return $carry + count($item); 36 | }); 37 | 38 | $this->assertEquals($kpCountExpected - $kpDecressedCountExpected, $kpDecressedCount, sprintf('keyword processor length doesn\'t match for Text ID %s', $testId)); 39 | } 40 | } 41 | 42 | /** For every fixture case: add keyword_dict, remove remove_keyword_dict, and compare count() with a second processor that is built only from the keywords surviving the removal. */ public function testRemoveKeywordsDictionaryLen() 43 | { 44 | foreach ($this->testData as $testId => $testCase) { 45 | $keywordProcessor = new KeywordProcessor(); 46 | $keywordProcessor->addKeywordsFromAssocArray($testCase['keyword_dict']); 47 | $keywordProcessor->removeKeywordsFromAssocArray($testCase['remove_keyword_dict']); 48 | 49 | $kpCount= $keywordProcessor->count(); 50 | 51 | /* Rebuild keyword_dict without the entries that remove_keyword_dict lists under the same clean name (strict in_array comparison). */ $newDictionary= []; 52 | foreach ($testCase['keyword_dict'] as $key => $values) { 53 | foreach ($values as $value) { 54 | if (!(isset($testCase['remove_keyword_dict'][$key]) && in_array($value, $testCase['remove_keyword_dict'][$key], true)) ) { 55 | if (isset($newDictionary[$key])) { 56 | $newDictionary[$key][] = $value; 57 | } else { 58 | $newDictionary[$key] = [$value]; 59 | } 60 | } 61 | } 62 | } 63 | 64 | $keywordProcessorTwo = new KeywordProcessor(); 65 | $keywordProcessorTwo->addKeywordsFromAssocArray($newDictionary); 66 | $kpCountTwo = $keywordProcessorTwo->count(); 67 | $this->assertEquals($kpCountTwo, $kpCount, sprintf('keyword processor length doesn\'t match for Text ID %s', $testId)); 68 | } 69 | } 70 | 71 | 72 | /** Reads keyword_remover_test_cases.json from this directory and decodes it into nested arrays stored in $this->testData. NOTE(review): neither the read nor the decode result is checked. */ protected function setUp() 73 | { 74 | $testData = file_get_contents(__DIR__ .
'/keyword_remover_test_cases.json'); 75 | $this->testData = json_decode($testData, true); 76 | } 77 | 78 | 79 | } 80 | -------------------------------------------------------------------------------- /tests/Shdev/FlashText/KeywordProcessor/LoadingKeywordListTest.php: -------------------------------------------------------------------------------- 1 | addKeywordsFromArray( $keywordList); 21 | /* Expect java and product management to be extracted and the sentence to come back unchanged from replaceKeywords(). */ $sentence = 'I know java and product management'; 22 | $keywordsExtracted = $keywordProcessor->extractKeywords($sentence); 23 | 24 | $this->assertEquals(['java', 'product management'], $keywordsExtracted); 25 | $sentenceNew = $keywordProcessor->replaceKeywords($sentence); 26 | $this->assertEquals('I know java and product management', $sentenceNew); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /tests/Shdev/FlashText/KeywordProcessor/NoBordersTest.php: -------------------------------------------------------------------------------- 1 | setNonWordBoundaries(''); 20 | $keywordAssocArray = [ 21 | 'word1' => ['word1'], 22 | 'word2' => ['word2'], 23 | 'word3' => ['word3'], 24 | ]; 25 | 26 | $keywordProcessor->addKeywordsFromAssocArray($keywordAssocArray); 27 | 28 | /* The keywords follow each other without any separator; with the non-word-boundary set emptied above, each one is still expected to be found. */ $sentence = 'word1word2word3'; 29 | 30 | /* Called with true as second argument; every result is then asserted as a triple of clean name, start offset and end offset, where the end equals the start of the next match. */ $keywordsExtracted = $keywordProcessor->extractKeywords($sentence, true); 31 | 32 | $this->assertEquals(['word1', 0, 5], $keywordsExtracted[0]); 33 | $this->assertEquals(['word2', 5, 10], $keywordsExtracted[1]); 34 | $this->assertEquals(['word3', 10, 15], $keywordsExtracted[2]); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /tests/Shdev/FlashText/KeywordProcessor/NonWordBoundaryTest.php: -------------------------------------------------------------------------------- 1 | assertEquals(KeywordProcessor::INIT_NON_WORD_BOUNDARIES, implode('', $keywordProcessor->getNonWordBoundaries())); 20 | 21 | /* setNonWordBoundaries() is expected to replace the whole set; the getter must then return exactly the given list. */ $keywordProcessor->setNonWordBoundaries(['a', '1']); 22 | 23 |
$this->assertEquals(['a', '1'], $keywordProcessor->getNonWordBoundaries()); 24 | 25 | /* addNonWordBoundaries() is expected to append to the current set instead of replacing it. */ $keywordProcessor->addNonWordBoundaries('b'); 26 | $this->assertEquals(['a', '1', 'b'], $keywordProcessor->getNonWordBoundaries()); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /tests/Shdev/FlashText/KeywordProcessor/RemoveKeywordsTest.php: -------------------------------------------------------------------------------- 1 | testData as $testId => $testCase) { 21 | $keywordProcessor= new KeywordProcessor(); 22 | $keywordProcessor->addKeywordsFromAssocArray($testCase['keyword_dict']); 23 | $keywordProcessor->removeKeywordsFromAssocArray($testCase['remove_keyword_dict']); 24 | 25 | /* After the removal, extraction on the fixture sentence must equal the keywords list of the fixture. */ $keywordsExtracted = $keywordProcessor->extractKeywords($testCase['sentence']); 26 | 27 | $this->assertEquals($testCase['keywords'], $keywordsExtracted, sprintf('keywords_extracted don\'t match the expected results for test case: %s', $testId)); 28 | } 29 | } 30 | 31 | /** Same expectation as the assoc-array removal, but each keyword list of remove_keyword_dict is removed through removeKeywordFromArray(); the clean names of remove_keyword_dict are not used. */ public function testRemoveKeywordsUsingList() 32 | { 33 | foreach ($this->testData as $testId => $testCase) { 34 | $keywordProcessor = new KeywordProcessor(); 35 | $keywordProcessor->addKeywordsFromAssocArray($testCase['keyword_dict']); 36 | 37 | foreach ($testCase['remove_keyword_dict'] as $values) { 38 | $keywordProcessor->removeKeywordFromArray($values); 39 | } 40 | 41 | $keywordsExtracted= $keywordProcessor->extractKeywords($testCase['sentence']); 42 | $this->assertEquals($testCase['keywords'], $keywordsExtracted, sprintf('keywords_extracted don\'t match the expected results for test case: %s', $testId)); 43 | } 44 | } 45 | 46 | /** Compares getKeywordTrieDict() of a processor that added keyword_dict and then removed remove_keyword_dict with the trie of a processor built only from the surviving keywords. */ public function testRemoveKeywordsDictionaryCompare() 47 | { 48 | foreach ($this->testData as $testId => $testCase) { 49 | $keywordProcessor = new KeywordProcessor(); 50 | $keywordProcessor->addKeywordsFromAssocArray($testCase['keyword_dict']); 51 | $keywordProcessor->removeKeywordsFromAssocArray($testCase['remove_keyword_dict']); 52 | 53 | $keywordTrieDict =
$keywordProcessor->getKeywordTrieDict(); 54 | 55 | /* Rebuild keyword_dict without the entries that remove_keyword_dict lists under the same clean name (strict in_array comparison). */ $newDictionary= []; 56 | foreach ($testCase['keyword_dict'] as $key => $values) { 57 | foreach ($values as $value) { 58 | if (!(isset($testCase['remove_keyword_dict'][$key]) && in_array($value, $testCase['remove_keyword_dict'][$key], true)) ) { 59 | if (isset($newDictionary[$key])) { 60 | $newDictionary[$key][] = $value; 61 | } else { 62 | $newDictionary[$key] = [$value]; 63 | } 64 | } 65 | } 66 | } 67 | 68 | $keywordProcessorTwo = new KeywordProcessor(); 69 | $keywordProcessorTwo->addKeywordsFromAssocArray($newDictionary); 70 | $keywordTrieDictTwo = $keywordProcessorTwo->getKeywordTrieDict(); 71 | $this->assertEquals($keywordTrieDict, $keywordTrieDictTwo, sprintf('keywords_extracted don\'t match the expected results for test case: %s', $testId)); 72 | } 73 | } 74 | 75 | 76 | /** Reads keyword_remover_test_cases.json from this directory and decodes it into nested arrays stored in $this->testData. NOTE(review): neither the read nor the decode result is checked. */ protected function setUp() 77 | { 78 | $testData = file_get_contents(__DIR__ . '/keyword_remover_test_cases.json'); 79 | $this->testData = json_decode($testData, true); 80 | } 81 | 82 | } 83 | -------------------------------------------------------------------------------- /tests/Shdev/FlashText/KeywordProcessor/ReplacerTest.php: -------------------------------------------------------------------------------- 1 | testData as $testId => $testCase) { 22 | $keywordReplacer = new KeywordProcessor(); 23 | foreach ($testCase['keyword_dict'] as $key => $values) { 24 | foreach ($values as $value) { 25 | /* Clean names get their spaces replaced by underscores before being registered. */ $keywordReplacer->addKeyword($value, str_replace(' ', '_', $key)); 26 | } 27 | } 28 | $newSentence= $keywordReplacer->replaceKeywords($testCase['sentence']); 29 | 30 | /* Build a reference result independently of KeywordProcessor, starting from the same keyword to clean-name mapping. */ $replacedSentence= $testCase['sentence']; 31 | $keywordMapping= []; 32 | foreach ($testCase['keyword_dict'] as $key => $values) { 33 | foreach ($values as $value) { 34 | $keywordMapping[$value] = str_replace(" ", "_", $key); 35 | } 36 | } 37 | 38 | $keys = array_keys($keywordMapping); 39 | 40 | /* Sort the keywords by strlen() ascending; the array_reverse() further down then walks them longest first. NOTE(review): uasort() keeps the integer keys of this plain list. */ uasort($keys, function ($a, $b) { 41 | if (strlen($a) == strlen($b)) { 42 | return 0; 43 | }
44 | return (strlen($a) < strlen($b)) ? -1 : 1; 45 | }); 46 | 47 | /* NOTE(review): the regex construction inside this loop is truncated in this excerpt; only the final assertion is visible. */ foreach (array_reverse($keys) as $key) { 48 | $lowerCase = sprintf('/(?assertEquals($replacedSentence, $newSentence, sprintf('new_sentence don\'t match the expected results for test case: %s', $testId)); 56 | } 57 | } 58 | 59 | /** Reads keyword_extractor_test_cases.json from this directory and decodes it into nested arrays stored in $this->testData. NOTE(review): neither the read nor the decode result is checked. */ protected function setUp() 60 | { 61 | $testData = file_get_contents(__DIR__ . '/keyword_extractor_test_cases.json'); 62 | $this->testData = json_decode($testData, true); 63 | } 64 | 65 | 66 | } 67 | -------------------------------------------------------------------------------- /tests/Shdev/FlashText/KeywordProcessor/TermInKpTest.php: -------------------------------------------------------------------------------- 1 | addKeyword('j2ee', 'Java'); 20 | $keywordProcessor->addKeyword('colour', 'color'); 21 | /* NOTE(review): the result of this call is discarded; the same lookup is asserted two statements later. */ $keywordProcessor->getKeyword('j2ee'); 22 | 23 | /* getKeyword() is expected to return the clean name for a registered keyword and a null-like value (loose assertEquals) for an unknown one; contains() reports membership. */ $this->assertEquals('Java', $keywordProcessor->getKeyword('j2ee')); 24 | $this->assertEquals('color', $keywordProcessor->getKeyword('colour')); 25 | $this->assertEquals(null, $keywordProcessor->getKeyword('Test')); 26 | $this->assertTrue($keywordProcessor->contains('colour')); 27 | $this->assertFalse($keywordProcessor->contains('Test')); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /tests/Shdev/FlashText/KeywordProcessor/UmlautTest.php: -------------------------------------------------------------------------------- 1 | ['AÖÜÄß'], 21 | 'Ein Wort' => ['Ass'], 22 | ]; 23 | 24 | $keywordProcessor->addKeywordsFromAssocArray($keywordAssocArray); 25 | 26 | /* Aöüäß differs from the registered AÖÜÄß only by letter case and is expected to match (its clean name is cut off in this excerpt, presumably Kein Wort); Assü merely starts with the keyword Ass and must not produce Ein Wort. */ $sentence = 'Dies ist kein Wort: Aöüäß oder Assü'; 27 | 28 | $keywordsExtracted = $keywordProcessor->extractKeywords($sentence); 29 | 30 | $this->assertEquals(['Kein Wort'], $keywordsExtracted); 31 | 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /tests/Shdev/FlashText/KeywordProcessor/keyword_extractor_test_cases.json:
-------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "sentence": "I like python", 4 | "keyword_dict": { 5 | "Python": ["python"] 6 | }, 7 | "explanation": "Keyword at the end of the sentence.", 8 | "keywords": ["Python"], 9 | "keywords_case_sensitive": ["Python"] 10 | }, 11 | { 12 | "sentence": "python I like", 13 | "keyword_dict": { 14 | "Python": ["python"] 15 | }, 16 | "explanation": "Keyword at the beginning of the sentence.", 17 | "keywords": ["Python"], 18 | "keywords_case_sensitive": ["Python"] 19 | }, 20 | { 21 | "sentence": "I like python also", 22 | "keyword_dict": { 23 | "Python": ["python"] 24 | }, 25 | "explanation": "Keyword before the end of the sentence.", 26 | "keywords": ["Python"], 27 | "keywords_case_sensitive": ["Python"] 28 | }, 29 | { 30 | "sentence": "I like python java", 31 | "keyword_dict": { 32 | "Python": ["python"], 33 | "Java": ["java"] 34 | }, 35 | "explanation": "Multiple keywords in the end of the sentence.", 36 | "keywords": ["Python", "Java"], 37 | "keywords_case_sensitive": ["Python", "Java"] 38 | }, 39 | { 40 | "sentence": "I like python and java", 41 | "keyword_dict": { 42 | "Python": ["python"], 43 | "Java": ["java"] 44 | }, 45 | "explanation": "Multiple keywords in the sentence with other word in between.", 46 | "keywords": ["Python", "Java"], 47 | "keywords_case_sensitive": ["Python", "Java"] 48 | }, 49 | { 50 | "sentence": "python", 51 | "keyword_dict": { 52 | "Python": ["python"] 53 | }, 54 | "explanation": "Single keyword in the sentence.", 55 | "keywords": ["Python"], 56 | "keywords_case_sensitive": ["Python"] 57 | }, 58 | { 59 | "sentence": " python", 60 | "keyword_dict": { 61 | "Python": ["python"] 62 | }, 63 | "explanation": "Single keyword in the sentence with space prefix.", 64 | "keywords": ["Python"], 65 | "keywords_case_sensitive": ["Python"] 66 | }, 67 | { 68 | "sentence": "I like r", 69 | "keyword_dict": { 70 | "R": ["r"] 71 | }, 72 | "explanation": "Single char 
keyword at the end of the sentence.", 73 | "keywords": ["R"], 74 | "keywords_case_sensitive": ["R"] 75 | }, 76 | { 77 | "sentence": "r I like", 78 | "keyword_dict": { 79 | "R": ["r"] 80 | }, 81 | "explanation": "Single char keyword at the beginning of the sentence.", 82 | "keywords": ["R"], 83 | "keywords_case_sensitive": ["R"] 84 | }, 85 | { 86 | "sentence": "I like R also", 87 | "keyword_dict": { 88 | "R": ["r"] 89 | }, 90 | "explanation": "Single char keyword before the end of the sentence.", 91 | "keywords": ["R"], 92 | "keywords_case_sensitive": [] 93 | }, 94 | { 95 | "sentence": "I like R java", 96 | "keyword_dict": { 97 | "R": ["r"], 98 | "Java": ["java"] 99 | }, 100 | "explanation": "Multiple keywords in the end of the sentence.", 101 | "keywords": ["R", "Java"], 102 | "keywords_case_sensitive": ["Java"] 103 | }, 104 | { 105 | "sentence": "I like R and java", 106 | "keyword_dict": { 107 | "R": ["R"], 108 | "Java": ["java"] 109 | }, 110 | "explanation": "Multiple keywords in the sentence with other word in between.", 111 | "keywords": ["R", "Java"], 112 | "keywords_case_sensitive": ["R", "Java"] 113 | }, 114 | { 115 | "sentence": "R", 116 | "keyword_dict": { 117 | "R": ["r"] 118 | }, 119 | "explanation": "Single character keyword in the sentence.", 120 | "keywords": ["R"], 121 | "keywords_case_sensitive": [] 122 | }, 123 | { 124 | "sentence": " R", 125 | "keyword_dict": { 126 | "R": ["R"] 127 | }, 128 | "explanation": "Single character keyword in the sentence with space prefix.", 129 | "keywords": ["R"], 130 | "keywords_case_sensitive": ["R"] 131 | }, 132 | { 133 | "sentence": "I like distributed super computing", 134 | "keyword_dict": { 135 | "Distributed Super Computing": ["distributed super computing"] 136 | }, 137 | "explanation": "Multi word Keyword at the end of the sentence.", 138 | "keywords": ["Distributed Super Computing"], 139 | "keywords_case_sensitive": ["Distributed Super Computing"] 140 | }, 141 | { 142 | "sentence": "distributed super 
computing I like", 143 | "keyword_dict": { 144 | "Distributed Super Computing": ["distributed super computing"] 145 | }, 146 | "explanation": "Multi word Keyword at the beginning of the sentence.", 147 | "keywords": ["Distributed Super Computing"], 148 | "keywords_case_sensitive": ["Distributed Super Computing"] 149 | }, 150 | { 151 | "sentence": "I like distributed super computing also", 152 | "keyword_dict": { 153 | "Distributed Super Computing": ["distributed super computing"] 154 | }, 155 | "explanation": "Multi word Keyword before the end of the sentence.", 156 | "keywords": ["Distributed Super Computing"], 157 | "keywords_case_sensitive": ["Distributed Super Computing"] 158 | }, 159 | { 160 | "sentence": "I like distributed super computing java", 161 | "keyword_dict": { 162 | "Distributed Super Computing": ["distributed super computing"], 163 | "Java": ["java"] 164 | }, 165 | "explanation": "Multi word Keyword at the end of the sentence.", 166 | "keywords": ["Distributed Super Computing", "Java"], 167 | "keywords_case_sensitive": ["Distributed Super Computing", "Java"] 168 | }, 169 | { 170 | "sentence": "I like distributed super computing java programing", 171 | "keyword_dict": { 172 | "Distributed Super Computing": ["distributed super computing"], 173 | "Java": ["java programing"] 174 | }, 175 | "explanation": "Multiple Multi word Keyword at the end of the sentence.", 176 | "keywords": ["Distributed Super Computing", "Java"], 177 | "keywords_case_sensitive": ["Distributed Super Computing", "Java"] 178 | }, 179 | { 180 | "sentence": "I like distributed super computing and java", 181 | "keyword_dict": { 182 | "Distributed Super Computing": ["distributed super computing"], 183 | "Java": ["java"] 184 | }, 185 | "explanation": "Multiple keywords in the sentence with other word in between.", 186 | "keywords": ["Distributed Super Computing", "Java"], 187 | "keywords_case_sensitive": ["Distributed Super Computing", "Java"] 188 | }, 189 | { 190 | "sentence": 
"distributed super computing", 191 | "keyword_dict": { 192 | "Distributed Super Computing": ["distributed super computing"] 193 | }, 194 | "explanation": "Single Multi word Keyword in the sentence.", 195 | "keywords": ["Distributed Super Computing"], 196 | "keywords_case_sensitive": ["Distributed Super Computing"] 197 | }, 198 | { 199 | "sentence": " distributed super computing", 200 | "keyword_dict": { 201 | "Distributed Super Computing": ["distributed super computing"] 202 | }, 203 | "explanation": "Single Multi word Keyword in the sentence with space prefix.", 204 | "keywords": ["Distributed Super Computing"], 205 | "keywords_case_sensitive": ["Distributed Super Computing"] 206 | }, 207 | { 208 | "sentence": "distributed super computing distributed super computing", 209 | "keyword_dict": { 210 | "Distributed Super Computing": ["distributed super computing"] 211 | }, 212 | "explanation": "Multi word Keyword twice", 213 | "keywords": ["Distributed Super Computing", "Distributed Super Computing"], 214 | "keywords_case_sensitive": ["Distributed Super Computing", "Distributed Super Computing"] 215 | }, 216 | { 217 | "sentence": "distributed super distributed super computing", 218 | "keyword_dict": { 219 | "Distributed Super Computing": ["distributed super computing"] 220 | }, 221 | "explanation": "Multi word Keyword partial then complete.", 222 | "keywords": ["Distributed Super Computing"], 223 | "keywords_case_sensitive": ["Distributed Super Computing"] 224 | }, 225 | { 226 | "sentence": "distributed super distributed super computing java", 227 | "keyword_dict": { 228 | "Distributed Super Computing": ["distributed super computing"], 229 | "Java": ["java"] 230 | }, 231 | "explanation": "", 232 | "keywords": ["Distributed Super Computing", "Java"], 233 | "keywords_case_sensitive": ["Distributed Super Computing", "Java"] 234 | }, 235 | { 236 | "sentence": "distributed super distributed super computing institute", 237 | "keyword_dict": { 238 | "Distributed Super 
Computing": ["distributed super computing"], 239 | "Distributed Super Computing Institute": ["distributed super computing institute"] 240 | }, 241 | "explanation": "", 242 | "keywords": ["Distributed Super Computing Institute"], 243 | "keywords_case_sensitive": ["Distributed Super Computing Institute"] 244 | }, 245 | { 246 | "sentence": "distributed super distributed super computing insti", 247 | "keyword_dict": { 248 | "Distributed Super Computing": ["distributed super computing"], 249 | "Distributed Super Computing Institute": ["distributed super computing institute"] 250 | }, 251 | "explanation": "", 252 | "keywords": ["Distributed Super Computing"], 253 | "keywords_case_sensitive": ["Distributed Super Computing"] 254 | }, 255 | { 256 | "sentence": "distributed super distributed super computing insti java", 257 | "keyword_dict": { 258 | "Distributed Super Computing": ["distributed super computing"], 259 | "Distributed Super Computing Institute": ["distributed super computing institute"], 260 | "Java": ["java"] 261 | }, 262 | "explanation": "", 263 | "keywords": ["Distributed Super Computing", "Java"], 264 | "keywords_case_sensitive": ["Distributed Super Computing", "Java"] 265 | }, 266 | { 267 | "sentence": "distributed super distributed super computing institute java", 268 | "keyword_dict": { 269 | "Distributed Super Computing": ["distributed super computing"], 270 | "Distributed Super Computing Institute": ["distributed super computing institute"], 271 | "Java": ["java"] 272 | }, 273 | "explanation": "", 274 | "keywords": ["Distributed Super Computing Institute", "Java"], 275 | "keywords_case_sensitive": ["Distributed Super Computing Institute", "Java"] 276 | }, 277 | { 278 | "sentence": "distributed super distributed super computing institute and java", 279 | "keyword_dict": { 280 | "Distributed Super Computing": ["distributed super computing"], 281 | "Distributed Super Computing Institute": ["distributed super computing institute"], 282 | "Java": ["java"] 
283 | }, 284 | "explanation": "", 285 | "keywords": ["Distributed Super Computing Institute", "Java"], 286 | "keywords_case_sensitive": ["Distributed Super Computing Institute", "Java"] 287 | }, 288 | { 289 | "sentence": "distributed super distributed super computing insti r", 290 | "keyword_dict": { 291 | "Distributed Super Computing": ["distributed super computing"], 292 | "Distributed Super Computing Institute": ["distributed super computing institute"], 293 | "R": ["r"] 294 | }, 295 | "explanation": "", 296 | "keywords": ["Distributed Super Computing", "R"], 297 | "keywords_case_sensitive": ["Distributed Super Computing", "R"] 298 | }, 299 | { 300 | "sentence": "distributed super distributed super computing institute r", 301 | "keyword_dict": { 302 | "Distributed Super Computing": ["distributed super computing"], 303 | "Distributed Super Computing Institute": ["distributed super computing institute"], 304 | "R": ["r"] 305 | }, 306 | "explanation": "", 307 | "keywords": ["Distributed Super Computing Institute", "R"], 308 | "keywords_case_sensitive": ["Distributed Super Computing Institute", "R"] 309 | }, 310 | { 311 | "sentence": "distributed super distributed super computing institute and r", 312 | "keyword_dict": { 313 | "Distributed Super Computing": ["distributed super computing"], 314 | "Distributed Super Computing Institute": ["distributed super computing institute"], 315 | "R": ["r"] 316 | }, 317 | "explanation": "", 318 | "keywords": ["Distributed Super Computing Institute", "R"], 319 | "keywords_case_sensitive": ["Distributed Super Computing Institute", "R"] 320 | }, 321 | { 322 | "sentence": "distributed pronoun game", 323 | "keyword_dict": { 324 | "Distributed Programing": ["distributed programing"], 325 | "Pronoun Game": ["pronoun game"] 326 | }, 327 | "explanation": "", 328 | "keywords": ["Pronoun Game"], 329 | "keywords_case_sensitive": ["Pronoun Game"] 330 | }, 331 | { 332 | "sentence": "distributed super computer game", 333 | "keyword_dict": { 
334 | "Distributed Super Computer": ["distributed super computer"], 335 | "Computer Game": ["computer game"] 336 | }, 337 | "explanation": "", 338 | "keywords": ["Distributed Super Computer"], 339 | "keywords_case_sensitive": ["Distributed Super Computer"] 340 | }, 341 | { 342 | "sentence": "distributed super computer game", 343 | "keyword_dict": { 344 | "Distributed Super Company": ["distributed super company"], 345 | "Computer Game": ["computer game"] 346 | }, 347 | "explanation": "", 348 | "keywords": ["Computer Game"], 349 | "keywords_case_sensitive": ["Computer Game"] 350 | }, 351 | { 352 | "sentence": "distributed super computer game", 353 | "keyword_dict": { 354 | "Distributed Super Company": ["distributed super company"], 355 | "Super Computer": ["super computer"], 356 | "Computer Game": ["computer game"] 357 | }, 358 | "explanation": "", 359 | "keywords": ["Super Computer"], 360 | "keywords_case_sensitive": ["Super Computer"] 361 | }, 362 | { 363 | "sentence": "distributed super compute game", 364 | "keyword_dict": { 365 | "Distributed Super Company": ["distributed super company"], 366 | "Super Computer": ["super computer"], 367 | "Computer Game": ["computer game"] 368 | }, 369 | "explanation": "", 370 | "keywords": [], 371 | "keywords_case_sensitive": [] 372 | }, 373 | { 374 | "sentence": "computer game development", 375 | "keyword_dict": { 376 | "Computer Game": ["computer game"], 377 | "Computer Game Development": ["computer game development"] 378 | }, 379 | "explanation": "", 380 | "keywords": ["Computer Game Development"], 381 | "keywords_case_sensitive": ["Computer Game Development"] 382 | }, 383 | { 384 | "sentence": "computer game development", 385 | "keyword_dict": { 386 | "Computer Gaming": ["computer gaming"], 387 | "Computer Game Development": ["computer game development"] 388 | }, 389 | "explanation": "", 390 | "keywords": ["Computer Game Development"], 391 | "keywords_case_sensitive": ["Computer Game Development"] 392 | }, 393 | { 394 | 
"sentence": "I like .net", 395 | "keyword_dict": { 396 | ".NET": [".net"] 397 | }, 398 | "explanation": "keyword with special character", 399 | "keywords": [".NET"], 400 | "keywords_case_sensitive": [".NET"] 401 | }, 402 | { 403 | "sentence": "I like c++", 404 | "keyword_dict": { 405 | "Cpp": ["c++"] 406 | }, 407 | "explanation": "keyword with special character", 408 | "keywords": ["Cpp"], 409 | "keywords_case_sensitive": ["Cpp"] 410 | }, 411 | { 412 | "sentence": "python.", 413 | "keyword_dict": { 414 | "Python": ["python."] 415 | }, 416 | "explanation": "Ending with special character", 417 | "keywords": ["Python"], 418 | "keywords_case_sensitive": ["Python"] 419 | }, 420 | { 421 | "sentence": "python ", 422 | "keyword_dict": { 423 | "Python": ["python"] 424 | }, 425 | "explanation": "Ending with special character", 426 | "keywords": ["Python"], 427 | "keywords_case_sensitive": ["Python"] 428 | }, 429 | { 430 | "sentence": "i like python programming", 431 | "keyword_dict": { 432 | "Python": ["python prog"] 433 | }, 434 | "explanation": "Negative test case", 435 | "keywords": [], 436 | "keywords_case_sensitive": [] 437 | }, 438 | { 439 | "sentence": "distributed super distributed super computing institute java", 440 | "keyword_dict": { 441 | "Java": ["java"], 442 | "Distributed Super Computing Institutes": ["distributed super computing institutes"], 443 | "Institute": ["institute"], 444 | "Distributed Super Computing": ["distributed super computing"] 445 | }, 446 | "explanation": "Negative test case", 447 | "keywords": ["Distributed Super Computing", "Institute", "Java"], 448 | "keywords_case_sensitive": ["Distributed Super Computing", "Institute", "Java"] 449 | }, 450 | { 451 | "sentence": "targets relative to targets of the IRE1/XBP1s and PERK arms of the UPR", 452 | "keyword_dict": { 453 | "IRE1": ["IRE1"], 454 | "XBP1s": ["XBP1s"], 455 | "UPR": ["upr"] 456 | }, 457 | "explanation": "", 458 | "keywords": ["IRE1", "XBP1s", "UPR"], 459 | "keywords_case_sensitive": 
["IRE1", "XBP1s"] 460 | }, 461 | { 462 | "sentence": "spring framework", 463 | "keyword_dict": { 464 | "spring framework": ["spring", "spring framework"], 465 | "framework": ["framework"] 466 | }, 467 | "explanation": "", 468 | "keywords": ["spring framework"], 469 | "keywords_case_sensitive": ["spring framework"] 470 | }, 471 | { 472 | "sentence": "what the heck is java", 473 | "keyword_dict": { 474 | "java": ["javanese"] 475 | }, 476 | "explanation": "prefix match at the end of the sentence", 477 | "keywords": [], 478 | "keywords_case_sensitive": [] 479 | } 480 | ] 481 | -------------------------------------------------------------------------------- /tests/Shdev/FlashText/KeywordProcessor/keyword_remover_test_cases.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "sentence": "i like python programming", 4 | "keyword_dict": { 5 | "Python": ["python prog", "python programming"] 6 | }, 7 | "remove_keyword_dict": { 8 | "Python": ["python programming"] 9 | }, 10 | "keywords": [], 11 | "keywords_case_sensitive": [] 12 | }, 13 | { 14 | "sentence": "i like python programming", 15 | "keyword_dict": { 16 | "Python": ["python prog", "python programming"] 17 | }, 18 | "remove_keyword_dict": { 19 | "Python": ["python prog"] 20 | }, 21 | "keywords": ["Python"], 22 | "keywords_case_sensitive": ["Python"] 23 | }, 24 | { 25 | "sentence": "distributed super distributed super computing institute java", 26 | "keyword_dict": { 27 | "Java": ["java"], 28 | "Distributed Super Computing Institutes": ["distributed super computing institutes"], 29 | "Institute": ["institute"], 30 | "Distributed Super Computing": ["distributed super computing"] 31 | }, 32 | "remove_keyword_dict": { 33 | "Distributed Super Computing Institutes": ["distributed super computing institutes"] 34 | }, 35 | "keywords": ["Distributed Super Computing", "Institute", "Java"], 36 | "keywords_case_sensitive": ["Distributed Super Computing", "Institute", "Java"] 37 | 
}, 38 | { 39 | "sentence": "distributed super distributed super computing institute java", 40 | "keyword_dict": { 41 | "Java": ["java"], 42 | "Distributed Super Computing Institute": ["distributed super computing institute"], 43 | "Institute": ["institute"], 44 | "Distributed Super Computing": ["distributed super computing"] 45 | }, 46 | "remove_keyword_dict": { 47 | "Distributed Super Computing": ["distributed super computing"] 48 | }, 49 | "keywords": ["Distributed Super Computing Institute", "Java"], 50 | "keywords_case_sensitive": ["Distributed Super Computing Institute", "Java"] 51 | }, 52 | { 53 | "sentence": "distributed super distributed super computing institute java", 54 | "keyword_dict": { 55 | "Java": ["java"], 56 | "Distributed Super Computing Institutes": ["distributed super computing institutes"], 57 | "Institute": ["institute"], 58 | "Distributed Super Computing": ["distributed super computing"] 59 | }, 60 | "remove_keyword_dict": { 61 | "Distributed Super Computing Institutes": ["distributed super computing institutes"] 62 | }, 63 | "keywords": ["Distributed Super Computing", "Institute", "Java"], 64 | "keywords_case_sensitive": ["Distributed Super Computing", "Institute", "Java"] 65 | }, 66 | { 67 | "sentence": "distributed super distributed super computing institute java", 68 | "keyword_dict": { 69 | "Java": ["java"], 70 | "Distributed Super Computing Institute": ["distributed super computing institute"], 71 | "Institute": ["institute"], 72 | "Distributed Super Computing": ["distributed super computing"] 73 | }, 74 | "remove_keyword_dict": { 75 | "Distributed Super Computing": ["distributed super computing"] 76 | }, 77 | "keywords": ["Distributed Super Computing Institute", "Java"], 78 | "keywords_case_sensitive": ["Distributed Super Computing Institute", "Java"] 79 | }, 80 | { 81 | "sentence": "targets relative to targets of the IRE1/XBP1s and PERK arms of the UPR", 82 | "keyword_dict": { 83 | "IRE1": ["IRE1"], 84 | "XBP1s": ["XBP1s"], 85 | 
"UPR": ["upr"] 86 | }, 87 | "remove_keyword_dict": { 88 | "IRE1": ["IRE1"] 89 | }, 90 | "keywords": ["XBP1s", "UPR"], 91 | "keywords_case_sensitive": ["XBP1s"] 92 | }, 93 | { 94 | "sentence": "spring framework", 95 | "keyword_dict": { 96 | "spring framework": ["spring", "spring framework"], 97 | "framework": ["framework"] 98 | }, 99 | "remove_keyword_dict": { 100 | "spring framework": ["spring"] 101 | }, 102 | "keywords": ["spring framework"], 103 | "keywords_case_sensitive": ["spring framework"] 104 | } 105 | ] 106 | -------------------------------------------------------------------------------- /tests/Shdev/FlashText/KeywordProcessor/keywords_format_one.txt: -------------------------------------------------------------------------------- 1 | java_2e=>java 2 | java programing=>java 3 | product management=>product management 4 | product management techniques=>product management -------------------------------------------------------------------------------- /tests/Shdev/FlashText/KeywordProcessor/keywords_format_two.txt: -------------------------------------------------------------------------------- 1 | java 2 | product management -------------------------------------------------------------------------------- /tests/Shdev/FlashText/flashtext_vs_regex.php: -------------------------------------------------------------------------------- 1 | start('generate keywords'); 28 | $keywords = []; 29 | 30 | for ($i = 0; $i < KEYWORDS_COUNT; ++$i) { 31 | $keywordLength = mt_rand(MIN_WORD_LENGTH, MAX_WORD_COUNT); 32 | $keyword = ''; 33 | for ($wordPos = 0; $wordPos < $keywordLength; ++$wordPos) { 34 | $keyword .= $alphabet[mt_rand(0, $alphabetLength - 1)]; 35 | } 36 | $keywords[] = $keyword; 37 | } 38 | 39 | $keywords = array_values(array_unique($keywords)); 40 | 41 | while (count($keywords) < KEYWORDS_COUNT) { 42 | $keywordLength = mt_rand(MIN_WORD_LENGTH, MAX_WORD_COUNT); 43 | $keyword = ''; 44 | for ($wordPos = 0; $wordPos < $keywordLength; ++$wordPos) { 45 | $keyword .= 
$alphabet[mt_rand(0, $alphabetLength - 1)]; 46 | } 47 | $keywords[] = $keyword; 48 | 49 | /* De-duplicate after every added word so the loop condition counts unique keywords only. */ $keywords = array_values(array_unique($keywords)); 50 | } 51 | 52 | /* Redundant with the de-duplication at the end of the loop body; kept as is. */ $keywords = array_values(array_unique($keywords)); 53 | 54 | $event = $stopwatch->stop('generate keywords'); 55 | echo 'generate keywords '.$event->getDuration()."ms\n"; 56 | $stopwatch->start('generate documents'); 57 | /* Each document is a list of WORDS_PER_DOCUMENT_COUNT keywords picked at random, repeats allowed. */ $documents = []; 58 | for ($i = 0; $i < DOCUMENT_COUNT; ++$i) { 59 | $document = []; 60 | for ($wordIdx = 0; $wordIdx < WORDS_PER_DOCUMENT_COUNT; ++$wordIdx) { 61 | $document[] = $keywords[mt_rand(0, KEYWORDS_COUNT - 1)]; 62 | } 63 | $documents[] = $document; 64 | } 65 | $event = $stopwatch->stop('generate documents'); 66 | echo 'generate documents '.$event->getDuration()."ms\n"; 67 | $stopwatch->start('generate keywords per documents'); 68 | /* Indexed as [document index][keyword step][first word of chunk] => chunk of words, i.e. the first word of every chunk serves as clean name for the whole chunk when passed to addKeywordsFromAssocArray(). */ $keywordPerDocument = []; 69 | /* NOTE(review): $maxKeywords is filled below but not read anywhere in the visible part of the script. */ $maxKeywords = []; 70 | foreach ($documents as $index => $document) { 71 | $uniqueWords = array_values(array_unique($document)); 72 | $keywordPerDocument[$index] = []; 73 | foreach (KEYWORD_STEPS as $keywordStep) { 74 | /* Take the first $keywordStep * KEYWORD_CHUNK_SIZE unique words of the document and split them into chunks of KEYWORD_CHUNK_SIZE. */ $keywordChunks = array_chunk(array_slice($uniqueWords, 0, $keywordStep * KEYWORD_CHUNK_SIZE), KEYWORD_CHUNK_SIZE); 75 | 76 | $keywordPerDocument[$index][$keywordStep] = []; 77 | 78 | foreach ($keywordChunks as $keywordChunk) { 79 | $keywordPerDocument[$index][$keywordStep][$keywordChunk[0]] = $keywordChunk; 80 | } 81 | } 82 | $maxKeywords[] = count($uniqueWords); 83 | } 84 | $event = $stopwatch->stop('generate keywords per documents'); 85 | echo 'generate keywords per documents '.$event->getDuration()."ms\n"; 86 | echo "\n"; 87 | 88 | /* Result table: one row per keyword step with a column for each of the three timed variants. */ $strPattern = '%6s | %17s | %17s | %17s'."\n"; 89 | printf($strPattern, 'count', 'flashtext', 'flashtext_replace', 'regex'); 90 | echo str_repeat('-', 66)."\n"; 91 | /* NOTE(review): $times is never used in the visible part of the script. */ $times = []; 92 | foreach (KEYWORD_STEPS as $keywordStep) { 93 | /* One stopwatch section per keyword step; extraction, replacement and regex matching are each timed over all documents. */ $stopwatch->openSection(); 94 | foreach ($documents as $key => $document) { 95 | $keywordProcessor = new \Shdev\FlashText\KeywordProcessor(); 96 |
$keywordProcessor->addKeywordsFromAssocArray($keywordPerDocument[$key][$keywordStep]); 97 | $sentence = implode(' ', $document); 98 | /* Only the extractKeywords() call sits between start and stop; building the processor and the sentence is not timed. */ $stopwatch->start('flashtext', $keywordStep); 99 | $keywordProcessor->extractKeywords($sentence); 100 | $event = $stopwatch->stop('flashtext', $keywordStep); 101 | } 102 | foreach ($documents as $key => $document) { 103 | $keywordProcessor = new \Shdev\FlashText\KeywordProcessor(); 104 | $keywordProcessor->addKeywordsFromAssocArray($keywordPerDocument[$key][$keywordStep]); 105 | $sentence = implode(' ', $document); 106 | /* Same setup as above, but timing replaceKeywords() instead of extractKeywords(). */ $stopwatch->start('flashtext_replace', $keywordStep); 107 | $keywordProcessor->replaceKeywords($sentence); 108 | $event = $stopwatch->stop('flashtext_replace', $keywordStep); 109 | } 110 | foreach ($documents as $key => $document) { 111 | /* All keywords of the step are joined into a single case-insensitive alternation. NOTE(review): no preg_quote() is applied and errors are suppressed with @, so this is only sound if $alphabet holds no regex metacharacters - its definition is not visible here. */ $pattern = []; 112 | foreach ($keywordPerDocument[$key][$keywordStep] as $keywordChunk) { 113 | $pattern[] = implode('|', $keywordChunk); 114 | } 115 | $pattern = '/'.implode('|', $pattern).'/i'; 116 | $sentence = implode(' ', $document); 117 | // warm up the pattern once before the timed run 118 | @preg_match_all($pattern, '23423', $matches); 119 | $stopwatch->start('regex', $keywordStep); 120 | @preg_match_all($pattern, $sentence, $matches); 121 | $event = $stopwatch->stop('regex', $keywordStep); 122 | } 123 | $stopwatch->stopSection($keywordStep); 124 | 125 | $events = $stopwatch->getSectionEvents($keywordStep); 126 | 127 | /* Each duration is divided by DOCUMENT_COUNT to print a per-document figure (assuming an event accumulates all of its start and stop periods - confirm against the stopwatch class in use). */ printf( 128 | $strPattern, 129 | $keywordStep, 130 | $events['flashtext']->getDuration() / DOCUMENT_COUNT, 131 | $events['flashtext_replace']->getDuration() / DOCUMENT_COUNT, 132 | $events['regex']->getDuration() / DOCUMENT_COUNT 133 | ); 134 | } 135 | --------------------------------------------------------------------------------