├── .gitignore ├── .scrutinizer.yml ├── .travis.yml ├── LICENSE ├── README.md ├── composer.json ├── doc ├── 404.html ├── class-Sentence.html ├── class-SentenceTest.html ├── elementlist.js ├── function-Sentence_autoloader.html ├── index.html ├── resources │ ├── collapsed.png │ ├── combined.js │ ├── footer.png │ ├── inherit.png │ ├── resize.png │ ├── sort.png │ ├── style.css │ ├── tree-cleaner.png │ ├── tree-hasnext.png │ ├── tree-last.png │ └── tree-vertical.png ├── source-class-Sentence.html ├── source-class-SentenceTest.html └── source-function-Sentence_autoloader.html ├── index.html ├── phpunit.xml ├── src ├── Multibyte.php └── Sentence.php └── tests ├── MultibyteTest.php ├── SentenceTest.php └── bootstrap.php /.gitignore: -------------------------------------------------------------------------------- 1 | /.idea 2 | /nbproject 3 | /debug 4 | /vendor 5 | /composer.lock 6 | -------------------------------------------------------------------------------- /.scrutinizer.yml: -------------------------------------------------------------------------------- 1 | filter: 2 | excluded_paths: 3 | - doc/* 4 | - tests/* 5 | - example/* 6 | - vendor/* -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: php 2 | 3 | php: 4 | - 5.4 5 | - 5.5 6 | - 5.6 7 | - 7.0 8 | - 7.1 9 | - 7.2 10 | - 7.3 11 | - 7.4 12 | - 8.0 13 | - 8.1 14 | - 8.2 15 | - 8.3 16 | - hhvm 17 | 18 | matrix: 19 | include: 20 | - php: 5.3 21 | dist: precise 22 | allow_failures: 23 | - php: 5.3 24 | - php: hhvm 25 | 26 | before_script: 27 | - mkdir -p cache 28 | - composer install 29 | 30 | script: 31 | - vendor/bin/phpunit 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Martijn van der Lee 4 | 5 | Permission 
is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Sentence 2 | ======== 3 | [![License](https://img.shields.io/github/license/vanderlee/php-sentence.svg)]() 4 | [![Build Status](https://travis-ci.org/vanderlee/php-sentence.svg?branch=master)](https://travis-ci.org/vanderlee/php-sentence) 5 | [![Quality](https://scrutinizer-ci.com/g/vanderlee/php-sentence/badges/quality-score.png?b=master)](https://scrutinizer-ci.com/g/vanderlee/php-sentence) 6 | 7 | Version 1.0.8 8 | 9 | Copyright © 2016-2024 Martijn van der Lee (@vanderlee), parts copyright © 2017 @marktaw. 10 | 11 | MIT Open Source license applies. 12 | 13 | ## Introduction 14 | PHP natural language sentence segmentation (splitting) and counting. 15 | Sentence boundary disambiguation. 
16 | 17 | Still early, but should support most western languages. 18 | If you find any problems, please let me know. 19 | 20 | Supports PHP 5.3 and up, so you can use it on older servers. 21 | 22 | ## Installation 23 | Requires PHP 5.4 or greater. PHP 5.3 is supported as long as no more recent 24 | features are absolutely necessary. 25 | 26 | To install using Composer: 27 | 28 | composer require vanderlee/php-sentence 29 | 30 | ## Methods 31 | ### ***`integer`*** `count(`***`string`*** `$text)` 32 | Counts the number of sentences in the text. 33 | Provided for convenience; this is exactly the same as counting the number of 34 | returned array items from `split`, so if you need both results, just do that. 35 | 36 | ### ***`array`*** `split(`***`string`*** `$text, `***`integer`*** `$flags = 0)` 37 | Splits the text into sentences. 38 | 39 | `$flags` is zero (`0`, default) or the following class constant: 40 | 41 | - **`Sentence::SPLIT_TRIM`**: Trim whitespace off the left and right sides of 42 | each returned sentence. 43 | 44 | ## Documentation 45 | You can find documentation generated from the source code by ApiGen here: [ApiGen documentation](doc/) 46 | 47 | # Examples 48 | split($text); 60 | 61 | // Count the number of sentences 62 | $count = $Sentence->count($text); 63 | 64 | ?> 65 | 66 | # How it works 67 | The method used is not based on any of the established or published methods. 68 | It seems to work pretty well, though. 69 | 70 | The method follows a number of simple steps in splitting and re-merging the 71 | text into full sentences. You can easily check the steps in the code. 72 | 73 | Though the splitting may be a bit off, in particular abbreviations at the start 74 | of sentences tend to be merged with the preceding sentences. In most ordinary 75 | text this should pose no problem. In either case this should not affect the 76 | sentence count except in very uncommon situations. 
77 | 78 | It should be noted that this algorithm depends on reasonably grammatically 79 | correct punctuation. Do not L33t-5p3ak!!!!!1!1!11!eleven!! 80 | 81 | ## Rules 82 | The following is a rough list of the rules used to split sentences. 83 | 84 | - Each linebreak separates sentences. 85 | - The end of the text indicates the end of a sentence if not otherwise ended 86 | through proper punctuation. 87 | - Sentences must be at least two words long, unless a linebreak or end-of-text. 88 | - An empty line is not a sentence. 89 | - Each question- or exclamation mark or combination thereof, is considered 90 | the end of a sentence. 91 | - A single period is considered the end of a sentence, unless... 92 | - It is preceded by one word, or... 93 | - It is followed by one word. 94 | - A sequence of multiple periods is not considered the end of a sentence. 95 | 96 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "vanderlee/php-sentence", 3 | "description": "Simple text sentence splitting and counting. 
Supports at least english, german and dutch, possibly more.", 4 | "type": "library", 5 | "keywords": [ 6 | "sentence", 7 | "split", 8 | "count", 9 | "boundary", 10 | "disambiguation", 11 | "segmentation" 12 | ], 13 | "homepage": "https://github.com/vanderlee/php-sentence.git", 14 | "license": "MIT", 15 | "support": { 16 | "issues": "https://github.com/vanderlee/php-sentence/issues", 17 | "source": "https://github.com/vanderlee/php-sentence" 18 | }, 19 | "require": { 20 | "php": ">=5.4.0", 21 | "ext-mbstring": "*", 22 | "ext-ctype": "*" 23 | }, 24 | "autoload": { 25 | "psr-4": { 26 | "Vanderlee\\Sentence\\": "src" 27 | } 28 | }, 29 | "require-dev": { 30 | "phpunit/phpunit": "^3.6.12" 31 | }, 32 | "scripts": { 33 | "test": "vendor/bin/phpunit" 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /doc/404.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Page not found | phpSentence 8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 | 41 |
42 | 43 |
44 | 45 | 79 | 80 | 81 | 82 | 83 | -------------------------------------------------------------------------------- /doc/class-Sentence.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Class Sentence | phpSentence 7 | 8 | 9 | 10 | 11 | 12 | 13 |
14 | 40 |
41 | 42 |
43 | 44 | 237 | 238 | 239 | 240 | 241 | -------------------------------------------------------------------------------- /doc/class-SentenceTest.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Class SentenceTest | phpSentence 7 | 8 | 9 | 10 | 11 | 12 | 13 |
14 | 40 |
41 | 42 |
43 | 44 | 671 | 672 | 673 | 674 | 675 | -------------------------------------------------------------------------------- /doc/elementlist.js: -------------------------------------------------------------------------------- 1 | 2 | var ApiGen = ApiGen || {}; 3 | ApiGen.elements = [["c","Sentence"],["f","Sentence_autoloader()"],["c","SentenceTest"]]; 4 | -------------------------------------------------------------------------------- /doc/function-Sentence_autoloader.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | Function Sentence_autoloader | phpSentence 7 | 8 | 9 | 10 | 11 | 12 | 13 |
14 | 40 |
41 | 42 |
43 | 44 | 108 | 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /doc/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | phpSentence 7 | 8 | 9 | 10 | 11 | 12 | 13 |
14 | 39 |
40 | 41 |
42 | 43 | 103 | 104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /doc/resources/collapsed.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanderlee/php-sentence/348c1e5f7ae63b92fed521b794b79e09adebbe72/doc/resources/collapsed.png -------------------------------------------------------------------------------- /doc/resources/footer.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanderlee/php-sentence/348c1e5f7ae63b92fed521b794b79e09adebbe72/doc/resources/footer.png -------------------------------------------------------------------------------- /doc/resources/inherit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanderlee/php-sentence/348c1e5f7ae63b92fed521b794b79e09adebbe72/doc/resources/inherit.png -------------------------------------------------------------------------------- /doc/resources/resize.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanderlee/php-sentence/348c1e5f7ae63b92fed521b794b79e09adebbe72/doc/resources/resize.png -------------------------------------------------------------------------------- /doc/resources/sort.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanderlee/php-sentence/348c1e5f7ae63b92fed521b794b79e09adebbe72/doc/resources/sort.png -------------------------------------------------------------------------------- /doc/resources/style.css: -------------------------------------------------------------------------------- 1 | body { 2 | font: 13px/1.5 Verdana, 'Geneva CE', lucida, sans-serif; 3 | margin: 0; 4 | padding: 0; 5 | background: #ffffff; 6 | color: #333333; 7 | } 8 | 9 | h1, h2, h3, h4, caption { 10 | font-family: 
'Trebuchet MS', 'Geneva CE', lucida, sans-serif; 11 | color: #053368; 12 | } 13 | 14 | h1 { 15 | color: #1e5eb6; 16 | font-size: 230%; 17 | font-weight: normal; 18 | margin: .3em 0; 19 | } 20 | 21 | h2 { 22 | color: #1e5eb6; 23 | font-size: 150%; 24 | font-weight: normal; 25 | margin: -.3em 0 .3em 0; 26 | } 27 | 28 | h3 { 29 | font-size: 1.6em; 30 | font-weight: normal; 31 | margin-bottom: 2px; 32 | } 33 | 34 | h4 { 35 | font-size: 100%; 36 | font-weight: bold; 37 | padding: 0; 38 | margin: 0; 39 | } 40 | 41 | caption { 42 | border: 1px solid #cccccc; 43 | background: #ecede5; 44 | font-weight: bold; 45 | font-size: 1.2em; 46 | padding: 3px 5px; 47 | text-align: left; 48 | margin-bottom: 0; 49 | } 50 | 51 | p { 52 | margin: .7em 0 1em; 53 | padding: 0; 54 | } 55 | 56 | hr { 57 | margin: 2em 0 1em; 58 | border: none; 59 | border-top: 1px solid #cccccc; 60 | height: 0; 61 | } 62 | 63 | a { 64 | color: #006aeb; 65 | padding: 3px 1px; 66 | text-decoration: none; 67 | } 68 | 69 | h1 a { 70 | color: #1e5eb6; 71 | } 72 | 73 | a:hover, a:active, a:focus, a:hover b, a:hover var { 74 | background-color: #006aeb; 75 | color: #ffffff !important; 76 | } 77 | 78 | code, var, pre { 79 | font-family: monospace; 80 | } 81 | 82 | var { 83 | font-weight: bold; 84 | font-style: normal; 85 | color: #ca8a04; 86 | } 87 | 88 | pre { 89 | margin: 0; 90 | } 91 | 92 | code a b { 93 | color: #000000; 94 | } 95 | 96 | .deprecated { 97 | text-decoration: line-through; 98 | opacity: .5; 99 | } 100 | 101 | .invalid { 102 | color: #e71818; 103 | } 104 | 105 | .hidden { 106 | display: none; 107 | } 108 | 109 | /* Left side */ 110 | #left { 111 | overflow: auto; 112 | width: 270px; 113 | height: 100%; 114 | position: fixed; 115 | } 116 | 117 | /* Menu */ 118 | #menu { 119 | padding: 10px; 120 | } 121 | 122 | #menu ul { 123 | list-style: none; 124 | padding: 0; 125 | margin: 0; 126 | } 127 | 128 | #menu ul ul { 129 | padding-left: 10px; 130 | } 131 | 132 | #menu li { 133 | white-space: nowrap; 134 | 
position: relative; 135 | } 136 | 137 | #menu a { 138 | display: block; 139 | padding: 0 2px; 140 | } 141 | 142 | #menu .active > a, #menu > span { 143 | color: #333333; 144 | background: none; 145 | font-weight: bold; 146 | } 147 | 148 | #menu .active > a.invalid { 149 | color: #e71818; 150 | } 151 | 152 | #menu .active > a:hover, #menu .active > a:active, #menu .active > a:focus { 153 | background-color: #006aeb; 154 | } 155 | 156 | #menu #groups span { 157 | position: absolute; 158 | top: 4px; 159 | right: 2px; 160 | cursor: pointer; 161 | display: block; 162 | width: 12px; 163 | height: 12px; 164 | background: url('collapsed.png') transparent 0 0 no-repeat; 165 | } 166 | 167 | #menu #groups span:hover { 168 | background-position: -12px 0; 169 | } 170 | 171 | #menu #groups span.collapsed { 172 | background-position: 0 -12px; 173 | } 174 | 175 | #menu #groups span.collapsed:hover { 176 | background-position: -12px -12px; 177 | } 178 | 179 | #menu #groups ul.collapsed { 180 | display: none; 181 | } 182 | 183 | /* Right side */ 184 | #right { 185 | overflow: auto; 186 | margin-left: 275px; 187 | height: 100%; 188 | position: relative; 189 | left: 0; 190 | right: 0; 191 | } 192 | 193 | #rightInner { 194 | max-width: 1000px; 195 | min-width: 350px; 196 | } 197 | 198 | /* Search */ 199 | #search { 200 | float: right; 201 | margin: 3px 8px; 202 | } 203 | 204 | #search input.text { 205 | padding: 3px 5px; 206 | width: 250px; 207 | } 208 | 209 | /* Autocomplete */ 210 | .ac_results { 211 | padding: 0; 212 | border: 1px solid #cccccc; 213 | background-color: #ffffff; 214 | overflow: hidden; 215 | z-index: 99999; 216 | } 217 | 218 | .ac_results ul { 219 | width: 100%; 220 | list-style-position: outside; 221 | list-style: none; 222 | padding: 0; 223 | margin: 0; 224 | } 225 | 226 | .ac_results li { 227 | margin: 0; 228 | padding: 2px 5px; 229 | cursor: default; 230 | display: block; 231 | font: 12px 'Trebuchet MS', 'Geneva CE', lucida, sans-serif; 232 | line-height: 16px; 
233 | overflow: hidden; 234 | white-space: nowrap; 235 | } 236 | 237 | .ac_results li strong { 238 | color: #000000; 239 | } 240 | 241 | .ac_odd { 242 | background-color: #eeeeee; 243 | } 244 | 245 | .ac_over { 246 | background-color: #006aeb; 247 | color: #ffffff; 248 | } 249 | 250 | .ac_results li.ac_over strong { 251 | color: #ffffff; 252 | } 253 | 254 | /* Navigation */ 255 | #navigation { 256 | padding: 3px 8px; 257 | background-color: #f6f6f4; 258 | height: 26px; 259 | } 260 | 261 | #navigation ul { 262 | list-style: none; 263 | margin: 0 8px 4px 0; 264 | padding: 0; 265 | overflow: hidden; 266 | float: left; 267 | } 268 | 269 | #navigation ul + ul { 270 | border-left: 1px solid #000000; 271 | padding-left: 8px; 272 | } 273 | 274 | #navigation ul li { 275 | float: left; 276 | margin: 2px; 277 | padding: 0 3px; 278 | font-family: Verdana, 'Geneva CE', lucida, sans-serif; 279 | color: #808080; 280 | } 281 | 282 | #navigation ul li.active { 283 | background-color: #053368; 284 | color: #ffffff; 285 | font-weight: bold; 286 | } 287 | 288 | #navigation ul li a { 289 | color: #000000; 290 | font-weight: bold; 291 | padding: 0; 292 | } 293 | 294 | #navigation ul li span { 295 | float: left; 296 | padding: 0 3px; 297 | } 298 | 299 | #navigation ul li a:hover span, #navigation ul li a:active span, #navigation ul li a:focus span { 300 | background-color: #006aeb; 301 | } 302 | 303 | /* Content */ 304 | #content { 305 | clear: both; 306 | padding: 5px 15px; 307 | } 308 | 309 | .description pre { 310 | padding: .6em; 311 | background: #fcfcf7; 312 | } 313 | 314 | #content > .description { 315 | background: #ecede5; 316 | padding: 1px 8px; 317 | margin: 1.2em 0; 318 | } 319 | 320 | #content > .description pre { 321 | margin: .5em 0; 322 | } 323 | 324 | dl.tree { 325 | margin: 1.2em 0; 326 | } 327 | 328 | dl.tree dd { 329 | margin: 0; 330 | padding: 0; 331 | } 332 | 333 | .info { 334 | margin: 1.2em 0; 335 | } 336 | 337 | .summary { 338 | border: 1px solid #cccccc; 339 | 
border-collapse: collapse; 340 | font-size: 1em; 341 | width: 100%; 342 | margin: 1.2em 0 2.4em; 343 | } 344 | 345 | .summary caption { 346 | border-width: 1px 1px 0; 347 | } 348 | 349 | .summary caption.switchable { 350 | background: #ecede5 url('sort.png') no-repeat center right; 351 | cursor: pointer; 352 | } 353 | 354 | .summary td { 355 | border: 1px solid #cccccc; 356 | margin: 0; 357 | padding: 3px 10px; 358 | font-size: 1em; 359 | vertical-align: top; 360 | } 361 | 362 | .summary td:first-child { 363 | text-align: right; 364 | } 365 | 366 | .summary td hr { 367 | margin: 3px -10px; 368 | } 369 | 370 | #packages.summary td:first-child, #namespaces.summary td:first-child, .inherited.summary td:first-child, .used.summary td:first-child { 371 | text-align: left; 372 | } 373 | 374 | .summary tr:hover td { 375 | background: #f6f6f4; 376 | } 377 | 378 | .summary .description pre { 379 | border: .5em solid #ecede5; 380 | } 381 | 382 | .summary .description p { 383 | margin: 0; 384 | } 385 | 386 | .summary .description p + p, .summary .description ul { 387 | margin: 3px 0 0 0; 388 | } 389 | 390 | .summary .description.detailed h4 { 391 | margin-top: 3px; 392 | } 393 | 394 | .summary dl { 395 | margin: 0; 396 | } 397 | 398 | .summary dd { 399 | margin: 0 0 0 25px; 400 | } 401 | 402 | .name, .attributes { 403 | white-space: nowrap; 404 | } 405 | 406 | .value code { 407 | white-space: pre-wrap; 408 | } 409 | 410 | td.name, td.attributes { 411 | width: 1%; 412 | } 413 | 414 | td.attributes { 415 | width: 1%; 416 | } 417 | 418 | .class .methods .name, .class .properties .name, .class .constants .name { 419 | width: auto; 420 | white-space: normal; 421 | } 422 | 423 | .class .methods .name > div > code { 424 | white-space: pre-wrap; 425 | } 426 | 427 | .class .methods .name > div > code span, .function .value > code { 428 | white-space: nowrap; 429 | } 430 | 431 | .class .methods td.name > div, .class td.value > div { 432 | position: relative; 433 | padding-right: 1em; 
434 | } 435 | 436 | .anchor { 437 | position: absolute; 438 | top: 0; 439 | right: 0; 440 | line-height: 1; 441 | font-size: 85%; 442 | margin: 0; 443 | color: #006aeb !important; 444 | } 445 | 446 | .list { 447 | margin: 0 0 5px 25px; 448 | } 449 | 450 | div.invalid { 451 | background-color: #fae4e0; 452 | padding: 10px; 453 | } 454 | 455 | /* Splitter */ 456 | #splitter { 457 | position: fixed; 458 | height: 100%; 459 | width: 5px; 460 | left: 270px; 461 | background: #1e5eb6 url('resize.png') left center no-repeat; 462 | cursor: e-resize; 463 | } 464 | 465 | #splitter.active { 466 | opacity: .5; 467 | } 468 | 469 | /* Footer */ 470 | #footer { 471 | border-top: 1px solid #e9eeef; 472 | clear: both; 473 | color: #a7a7a7; 474 | font-size: 8pt; 475 | text-align: center; 476 | padding: 20px 0 0; 477 | margin: 3em 0 0; 478 | height: 90px; 479 | background: #ffffff url('footer.png') no-repeat center top; 480 | } 481 | 482 | /* Tree */ 483 | div.tree ul { 484 | list-style: none; 485 | background: url('tree-vertical.png') left repeat-y; 486 | padding: 0; 487 | margin-left: 20px; 488 | } 489 | 490 | div.tree li { 491 | margin: 0; 492 | padding: 0; 493 | } 494 | 495 | div.tree div { 496 | padding-left: 30px; 497 | } 498 | 499 | div.tree div.notlast { 500 | background: url('tree-hasnext.png') left 10px no-repeat; 501 | } 502 | 503 | div.tree div.last { 504 | background: url('tree-last.png') left -240px no-repeat; 505 | } 506 | 507 | div.tree li.last { 508 | background: url('tree-cleaner.png') left center repeat-y; 509 | } 510 | 511 | div.tree span.padding { 512 | padding-left: 15px; 513 | } 514 | 515 | /* Source code */ 516 | .php-keyword1 { 517 | color: #e71818; 518 | font-weight: bold; 519 | } 520 | 521 | .php-keyword2 { 522 | font-weight: bold; 523 | } 524 | 525 | .php-var { 526 | color: #d59401; 527 | font-weight: bold; 528 | } 529 | 530 | .php-num { 531 | color: #cd0673; 532 | } 533 | 534 | .php-quote { 535 | color: #008000; 536 | } 537 | 538 | .php-comment { 539 | 
color: #929292; 540 | } 541 | 542 | .xlang { 543 | color: #ff0000; 544 | font-weight: bold; 545 | } 546 | 547 | span.l { 548 | display: block; 549 | } 550 | 551 | span.l.selected { 552 | background: #f6f6f4; 553 | } 554 | 555 | span.l a { 556 | color: #333333; 557 | } 558 | 559 | span.l a:hover, div.l a:active, div.l a:focus { 560 | background: transparent; 561 | color: #333333 !important; 562 | } 563 | 564 | span.l .php-var a { 565 | color: #d59401; 566 | } 567 | 568 | span.l .php-var a:hover, span.l .php-var a:active, span.l .php-var a:focus { 569 | color: #d59401 !important; 570 | } 571 | 572 | span.l a.l { 573 | padding-left: 2px; 574 | color: #c0c0c0; 575 | } 576 | 577 | span.l a.l:hover, span.l a.l:active, span.l a.l:focus { 578 | background: transparent; 579 | color: #c0c0c0 !important; 580 | } 581 | 582 | #rightInner.medium #navigation { 583 | height: 52px; 584 | } 585 | 586 | #rightInner.medium #navigation ul:first-child + ul { 587 | clear: left; 588 | border: none; 589 | padding: 0; 590 | } 591 | 592 | #rightInner.medium .name, #rightInner.medium .attributes { 593 | white-space: normal; 594 | } 595 | 596 | #rightInner.small #search { 597 | float: left; 598 | } 599 | 600 | #rightInner.small #navigation { 601 | height: 78px; 602 | } 603 | 604 | #rightInner.small #navigation ul:first-child { 605 | clear: both; 606 | } 607 | 608 | /* global style */ 609 | .left, .summary td.left { 610 | text-align: left; 611 | } 612 | .right, .summary td.right { 613 | text-align: right; 614 | } 615 | -------------------------------------------------------------------------------- /doc/resources/tree-cleaner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanderlee/php-sentence/348c1e5f7ae63b92fed521b794b79e09adebbe72/doc/resources/tree-cleaner.png -------------------------------------------------------------------------------- /doc/resources/tree-hasnext.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanderlee/php-sentence/348c1e5f7ae63b92fed521b794b79e09adebbe72/doc/resources/tree-hasnext.png -------------------------------------------------------------------------------- /doc/resources/tree-last.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanderlee/php-sentence/348c1e5f7ae63b92fed521b794b79e09adebbe72/doc/resources/tree-last.png -------------------------------------------------------------------------------- /doc/resources/tree-vertical.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vanderlee/php-sentence/348c1e5f7ae63b92fed521b794b79e09adebbe72/doc/resources/tree-vertical.png -------------------------------------------------------------------------------- /doc/source-class-Sentence.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | File classes/Sentence.php | phpSentence 8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 | 41 |
42 | 43 |
44 | 45 | 401 | 402 | 403 | 404 | 405 | -------------------------------------------------------------------------------- /doc/source-class-SentenceTest.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | File tests/SentenceTest.php | phpSentence 8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 | 41 |
42 | 43 |
44 | 45 | 256 | 257 | 258 | 259 | 260 | -------------------------------------------------------------------------------- /doc/source-function-Sentence_autoloader.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | File classes/autoloader.php | phpSentence 8 | 9 | 10 | 11 | 12 | 13 | 14 |
15 | 41 |
42 | 43 |
44 | 45 | 95 | 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | PHP Sentence 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 98 | 187 | 188 | 189 | Fork me on GitHub 190 | 191 | 192 | 193 |
194 |
195 |
196 | 197 | 200 | 201 | 202 | -------------------------------------------------------------------------------- /phpunit.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | tests 6 | 7 | 8 | 9 | 10 | 11 | src 12 | 13 | 14 | tests 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /src/Multibyte.php: -------------------------------------------------------------------------------- 1 | "'", // U+0082⇒U+201A single low-9 quotation mark 14 | "\xC2\x84" => '"', // U+0084⇒U+201E double low-9 quotation mark 15 | "\xC2\x8B" => "'", // U+008B⇒U+2039 single left-pointing angle quotation mark 16 | "\xC2\x91" => "'", // U+0091⇒U+2018 left single quotation mark 17 | "\xC2\x92" => "'", // U+0092⇒U+2019 right single quotation mark 18 | "\xC2\x93" => '"', // U+0093⇒U+201C left double quotation mark 19 | "\xC2\x94" => '"', // U+0094⇒U+201D right double quotation mark 20 | "\xC2\x9B" => "'", // U+009B⇒U+203A single right-pointing angle quotation mark 21 | // Regular Unicode // U+0022 quotation mark (") 22 | // U+0027 apostrophe (') 23 | "\xC2\xAB" => '"', // U+00AB left-pointing double angle quotation mark 24 | "\xC2\xBB" => '"', // U+00BB right-pointing double angle quotation mark 25 | "\xE2\x80\x98" => "'", // U+2018 left single quotation mark 26 | "\xE2\x80\x99" => "'", // U+2019 right single quotation mark 27 | "\xE2\x80\x9A" => "'", // U+201A single low-9 quotation mark 28 | "\xE2\x80\x9B" => "'", // U+201B single high-reversed-9 quotation mark 29 | "\xE2\x80\x9C" => '"', // U+201C left double quotation mark 30 | "\xE2\x80\x9D" => '"', // U+201D right double quotation mark 31 | "\xE2\x80\x9E" => '"', // U+201E double low-9 quotation mark 32 | "\xE2\x80\x9F" => '"', // U+201F double high-reversed-9 quotation mark 33 | "\xE2\x80\xB9" => "'", // U+2039 single left-pointing angle quotation mark 34 | "\xE2\x80\xBA" => "'", // U+203A single right-pointing angle quotation mark 35 | ]; 
36 | 37 | /** 38 | * Replace 39 | * 40 | * @staticvar array $chr_map 41 | * @param string $string 42 | * @return string 43 | */ 44 | public static function cleanUnicode($string) 45 | { 46 | $character = array_keys(self::$unicodeCharacterMap); // but: for efficiency you should 47 | $replace = array_values(self::$unicodeCharacterMap); // pre-calculate these two arrays 48 | return str_replace($character, $replace, html_entity_decode($string, ENT_QUOTES, "UTF-8")); 49 | } 50 | 51 | /** 52 | * Multibyte.php safe version of standard trim() function. 53 | * 54 | * @param string $string 55 | * @return string 56 | */ 57 | public static function trim($string) 58 | { 59 | return mb_ereg_replace('(^\s*)|(\s*$)', '', $string); 60 | } 61 | 62 | /** 63 | * A cross between mb_split and preg_split, adding the preg_split flags 64 | * to mb_split. 65 | * 66 | * @param string $pattern 67 | * @param string $string 68 | * @param int $limit 69 | * @param int $flags 70 | * @return array 71 | */ 72 | public static function split($pattern, $string, $limit = -1, $flags = 0) 73 | { 74 | $offset_capture = (bool)($flags & PREG_SPLIT_OFFSET_CAPTURE); 75 | 76 | $lengths = self::getSplitLengths($pattern, $string); 77 | 78 | // Substrings 79 | $parts = []; 80 | $position = 0; 81 | $count = 1; 82 | foreach ($lengths as $length) { 83 | if (self::isLastPart($length, $flags, $limit, $count)) { 84 | $parts[] = self::makePart($string, $position, null, $offset_capture); 85 | return $parts; 86 | } 87 | 88 | if (self::isPart($length, $flags)) { 89 | $parts[] = self::makePart($string, $position, $length[0], $offset_capture); 90 | } 91 | 92 | $position += $length[0]; 93 | } 94 | 95 | return $parts; 96 | } 97 | 98 | /** 99 | * @param $length 100 | * @param $flags 101 | * @param $limit 102 | * @param $count 103 | * @return bool 104 | */ 105 | private static function isLastPart($length, $flags, $limit, &$count) 106 | { 107 | $split_empty = !($flags & PREG_SPLIT_NO_EMPTY) || $length[0]; 108 | $is_delimiter = 
$length[1]; 109 | 110 | return $limit > 0 111 | && !$is_delimiter 112 | && $split_empty 113 | && ++$count > $limit; 114 | } 115 | 116 | /** 117 | * @param $length 118 | * @param $flags 119 | * @return bool 120 | */ 121 | private static function isPart($length, $flags) 122 | { 123 | $split_empty = !($flags & PREG_SPLIT_NO_EMPTY) || $length[0]; 124 | $is_delimiter = $length[1]; 125 | $is_captured = ($flags & PREG_SPLIT_DELIM_CAPTURE) && $length[2]; 126 | 127 | return (!$is_delimiter 128 | || $is_captured) 129 | && $split_empty; 130 | } 131 | 132 | /** 133 | * Make part 134 | * @param string $string 135 | * @param integer $position 136 | * @param integer|null $length 137 | * @param bool $offset_capture 138 | * @return array|string 139 | */ 140 | private static function makePart($string, $position, $length = null, $offset_capture = false) 141 | { 142 | $cut = mb_strcut($string, $position, $length); 143 | 144 | return $offset_capture 145 | ? [$cut, $position] 146 | : $cut; 147 | } 148 | 149 | /** 150 | * Splits the string by pattern and for each element (part or split) returns: 151 | * [ 0 => length, 1 => is_delimiter?, 2 => 152 | * 153 | * @param $pattern 154 | * @param $string 155 | * @return array 156 | */ 157 | private static function getSplitLengths($pattern, $string) 158 | { 159 | $strlen = strlen($string); // bytes! 
160 | $lengths = []; 161 | 162 | mb_ereg_search_init($string); 163 | 164 | $position = 0; 165 | while ($position < $strlen 166 | && ($array = mb_ereg_search_pos($pattern, '')) !== false) { 167 | // capture split 168 | $lengths[] = [$array[0] - $position, false, null]; 169 | 170 | // move position 171 | $position = $array[0] + $array[1]; 172 | 173 | // capture delimiter 174 | $regs = mb_ereg_search_getregs(); 175 | $lengths[] = [$array[1], true, isset($regs[1]) && $regs[1]]; 176 | } 177 | 178 | // Add last bit, if not ending with split 179 | $lengths[] = [$strlen - $position, false, null]; 180 | 181 | return $lengths; 182 | } 183 | } -------------------------------------------------------------------------------- /src/Sentence.php: -------------------------------------------------------------------------------- 1 | replacements = []; 71 | $index = 0; 72 | foreach (array_reverse($matches[0]) as $match) { 73 | $number = $match[0]; 74 | $offset = $match[1]; 75 | $code = $this->getReplaceCode($index); 76 | 77 | $this->replacements[$index] = $number; 78 | 79 | $text = (string)substr_replace($text, $code, $offset, mb_strlen($number)); 80 | 81 | ++$index; 82 | } 83 | 84 | return $text; 85 | } 86 | 87 | /** 88 | * Restore any stored replacements 89 | * 90 | * @param string[] $text 91 | * 92 | * @return string[] 93 | */ 94 | private function restoreReplacements($text) 95 | { 96 | return array_map(function ($value) { 97 | foreach ($this->replacements as $index => $number) { 98 | $code = $this->getReplaceCode($index); 99 | $value = str_replace($code, $number, $value); 100 | } 101 | 102 | return $value; 103 | }, $text); 104 | } 105 | 106 | /** 107 | * Breaks a piece of text into lines by linebreak. 108 | * Eats up any linebreak characters as if one. 
109 | * 110 | * Multibyte.php safe 111 | * 112 | * @param string $text 113 | * 114 | * @return string[] 115 | */
    private static function linebreakSplit($text)
    {
        $lines = [];
        $line = '';

        // The capturing group makes the split return the linebreak runs as parts too.
        foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
            $line .= $part;

            // A part that trims down to nothing closes the line collected so far.
            if (Multibyte::trim($part) === '') {
                $lines[] = $line;
                $line = '';
            }
        }

        // Whatever is left after the last closing part is always appended, even when empty.
        $lines[] = $line;

        return $lines;
    }

    /**
     * Splits a line into alternating runs of non-terminal and terminal
     * characters, keeping the terminals.
     *
     * Multibyte.php safe (at least for UTF-8)
     *
     * For example:
     * "There ... is. More!"
     * ... becomes ...
     * [ "There ", "...", " is", ".", " More", "!" ]
     *
     * @param string $line
     *
     * @return string[] The runs in order; an empty array when the line has no characters.
     */
    private function punctuationSplit($line)
    {
        // With the /u modifier an empty pattern splits between UTF-8 characters.
        $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY);

        // preg_split() gives [] for an empty line and false on invalid UTF-8;
        // in both cases there is no first character to inspect.
        if (empty($chars)) {
            return [];
        }

        $parts = [];
        $part = '';
        $is_terminal = in_array($chars[0], $this->terminals);

        foreach ($chars as $char) {
            // A switch between terminal and non-terminal characters ends the current run.
            if (in_array($char, $this->terminals) !== $is_terminal) {
                $parts[] = $part;
                $part = '';
                $is_terminal = !$is_terminal;
            }
            $part .= $char;
        }

        // Compare against '' explicitly: empty() would also discard a final run of "0".
        if ($part !== '') {
            $parts[] = $part;
        }

        return $parts;
    }

172 | /** 173 | * Appends each terminal item after it's preceding 174 | * non-terminals. 175 | * 176 | * Multibyte.php safe (at least for UTF-8) 177 | * 178 | * For example: 179 | * [ "There ", "...", " is", ".", " More", "!" ] 180 | * ... becomes ... 181 | * [ "There ... is.", "More!"
] 182 | * 183 | * @param string[] $punctuations 184 | * 185 | * @return string[] 186 | */
    private function punctuationMerge($punctuations)
    {
        // Terminals that never double as abbreviation markers.
        $definite_terminals = array_diff($this->terminals, $this->abbreviators);

        $merges = [];
        $merge = '';

        foreach ($punctuations as $punctuation) {
            // Empty strings carry no text and never close a merge.
            if ($punctuation === '') {
                continue;
            }

            $merge .= $punctuation;

            // A lone terminal character always closes the merge ...
            $closes = mb_strlen($punctuation) === 1
                && in_array($punctuation, $this->terminals);

            // ... and so does any item containing a definite terminal.
            if (!$closes) {
                foreach ($definite_terminals as $terminal) {
                    if (mb_strpos($punctuation, $terminal) !== false) {
                        $closes = true;
                        break;
                    }
                }
            }

            if ($closes) {
                $merges[] = $merge;
                $merge = '';
            }
        }

        // Compare against '' explicitly: empty() would also discard a remainder of "0".
        if ($merge !== '') {
            $merges[] = $merge;
        }

        return $merges;
    }

    /**
     * Looks for capitalized abbreviations & includes them with the following fragment.
     *
     * For example:
     * [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
     * ... becomes ...
     * [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
     * [ "Mr. Comey was not available for comment." ]
     *
     * @param string[] $fragments
     *
     * @return string[]
     */
    private function abbreviationMerge($fragments)
    {
        $return_fragment = [];

        $previous_fragment = '';
        $previous_is_abbreviation = false;
        $i = 0;

        foreach ($fragments as $fragment) {
            // Judge the fragment on its own, before anything is glued in front of it.
            $is_abbreviation = self::isAbbreviation($fragment);

            // The previous fragment ended in an abbreviation: continue it with this one.
            if ($previous_is_abbreviation) {
                $fragment = $previous_fragment . $fragment;
            }

            // While the slot index does not advance, this overwrites the partial merge.
            $return_fragment[$i] = $fragment;

            $previous_is_abbreviation = $is_abbreviation;
            $previous_fragment = $fragment;

            // Only move on to the next slot if this fragment isn't an abbreviation.
            if (!$is_abbreviation) {
                $i++;
            }
        }

        return $return_fragment;
    }

    /**
     * Check if the last word of fragment starts with a Capital, the fragment
     * ends in "." & the last word has at most 3 characters.
     *
     * @param string $fragment
     *
     * @return bool
     */
    private static function isAbbreviation($fragment)
    {
        $trimmed = Multibyte::trim($fragment);
        $words = mb_split('\s+', $trimmed);

        $last_word = Multibyte::trim($words[count($words) - 1]);

        $starts_with_capital = preg_match('#^\p{Lu}#u', $last_word) === 1;
        $ends_with_period = mb_substr($trimmed, -1) === '.';

        return $starts_with_capital
            && $ends_with_period
            && mb_strlen($last_word) <= 3;
    }

    /**
     * Merges any part starting with a closing parenthesis ')' to the previous
     * part.
     *
     * @param string[] $parts
     *
     * @return string[]
     */
    private function parenthesesMerge($parts)
    {
        $subSentences = [];
        $last = -1; // index of the most recently added sub-sentence

        foreach ($parts as $part) {
            // strncmp() is safe on an empty part, where $part[0] would raise
            // an "uninitialized string offset" error.
            if ($last >= 0 && strncmp($part, ')', 1) === 0) {
                $subSentences[$last] .= $part;
            } else {
                $subSentences[] = $part;
                ++$last;
            }
        }

        return $subSentences;
    }

307 | /** 308 | * Looks for closing quotes to include them with the previous statement. 309 | * "That was very interesting," he said. 310 | * "That was very interesting."
311 | * 312 | * @param string[] $statements 313 | * 314 | * @return string[] 315 | */
    private function closeQuotesMerge($statements)
    {
        $i = 0;
        $previous_statement = '';
        $return = [];

        foreach ($statements as $statement) {
            if (self::isEndQuote($statement)) {
                // Glue onto the previous statement and overwrite its slot.
                $statement = $previous_statement . $statement;
            } else {
                // The index advances before storing, so the first regular
                // statement is stored under key 1, not 0.
                $i++;
            }

            $return[$i] = $statement;
            $previous_statement = $statement;
        }

        return $return;
    }

    /**
     * Check if the entire string is a quotation mark or quote, then space, then lowercase.
     *
     * The lowercase test uses the Unicode "Ll" property, so non-ASCII
     * lowercase letters are recognised as well.
     *
     * @param string $statement
     *
     * @return bool
     */
    private static function isEndQuote($statement)
    {
        $quotes = ['"', '\''];

        // Nothing but a single quotation mark (ignoring surrounding whitespace).
        if (in_array(Multibyte::trim($statement), $quotes, true)) {
            return true;
        }

        // Quotation mark, one space, then a lowercase letter.
        return in_array(mb_substr($statement, 0, 1), $quotes, true)
            && mb_substr($statement, 1, 1) === ' '
            && preg_match('/^\p{Ll}/u', mb_substr($statement, 2, 1)) === 1;
    }

355 | /** 356 | * Merges items into larger sentences.
357 | * Multibyte.php safe 358 | * 359 | * @param string[] $shorts 360 | * 361 | * @return string[] 362 | */
    private function sentenceMerge($shorts)
    {
        $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);

        $sentences = [];

        $sentence = '';
        $has_words = false;
        $previous_word_ending = null; // last character of the previous item, null before the first

        foreach ($shorts as $short) {
            $word_count = count(mb_split('\s+', Multibyte::trim($short)));
            $after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);

            // Close the running sentence when the previous item ended on a
            // non-abbreviating terminal, or when a second multi-word item arrives.
            if ($after_non_abbreviating_terminal
                || ($has_words && $word_count > 1)) {
                $sentences[] = $sentence;

                $sentence = '';
                $has_words = false;
            }

            if ($word_count > 1) {
                $has_words = true;
            }

            $sentence .= $short;
            $previous_word_ending = mb_substr($short, -1);
        }

        // Compare against '' explicitly: empty() would also discard a final sentence of "0".
        if ($sentence !== '') {
            $sentences[] = $sentence;
        }

        return $sentences;
    }

399 | /** 400 | * Return the sentences detected in the provided text. 401 | * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
402 | * 403 | * @param string $text 404 | * @param integer $flags 405 | * 406 | * @return string[] 407 | */
    public function split($text, $flags = 0, $pipeline = [])
    {
        // $pipeline is an ordered list of method names on this object; each
        // method receives the previous stage's output. An empty list selects
        // the default stages below.
        if (empty($pipeline)) {
            $pipeline = [
                'replaceFloatNumbers',
                'punctuationSplit',
                'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
                'punctuationMerge',
                'abbreviationMerge',
                'closeQuotesMerge',
                'sentenceMerge',
                'restoreReplacements',
            ];
        }

        // clean funny quotes
        $text = Multibyte::cleanUnicode($text);

        // Split: every line that is not blank runs through the whole pipeline
        $sentences = [];
        foreach (self::linebreakSplit($text) as $input) {
            if (Multibyte::trim($input) === '') {
                continue;
            }

            foreach ($pipeline as $method) {
                $input = $this->$method($input);
            }

            // array_merge() renumbers the keys, so the result is always a plain list.
            $sentences = array_merge($sentences, $input);
        }

        // Post process
        if ($flags & self::SPLIT_TRIM) {
            return self::trimSentences($sentences);
        }

        return $sentences;
    }

    /**
     * Multibyte.php trim each string in an array.
     *
     * @param string[] $sentences
     *
     * @return string[]
     */
    private static function trimSentences($sentences)
    {
        $trimmed = [];

        // Keys are kept, as array_map() with a single array would keep them.
        foreach ($sentences as $key => $sentence) {
            $trimmed[$key] = Multibyte::trim($sentence);
        }

        return $trimmed;
    }

459 | /** 460 | * Return the number of sentences detected in the provided text.
461 | * 462 | * @param string $text 463 | * 464 | * @return integer Number of elements returned by split($text) 465 | */ 466 | public function count($text) 467 | { 468 | return count($this->split($text)); 469 | } 470 | 471 | } 472 | -------------------------------------------------------------------------------- /tests/MultibyteTest.php: -------------------------------------------------------------------------------- 1 | assertSame($expected, Multibyte::split($pattern, $subject, $limit, $flags)); 21 | } 22 | 23 | /** 24 | * @return array[] Rows presumably ordered [expected, pattern, subject, limit, flags] - confirm against testSplit() 25 | */ 26 | public function dataSplit() 27 | { 28 | return [ 29 | [['a', 'b', 'c'], '-', 'a-b-c'], 30 | [['a', 'b', 'c'], '-', 'a-b-c', 3], 31 | [['a', 'b', 'c'], '-', 'a-b-c', -1], 32 | [['a', 'b-c'], '-', 'a-b-c', 2], 33 | [['a-b-c'], '-', 'a-b-c', 1], 34 | [['a', 'b', 'c'], '-', 'a-b-c', -1, PREG_SPLIT_DELIM_CAPTURE], 35 | [['a', '-', 'b', '-', 'c'], '(-)', 'a-b-c', -1, PREG_SPLIT_DELIM_CAPTURE], 36 | ]; 37 | } 38 | 39 | /** 40 | * @covers :: 41 | * 42 | * @dataProvider dataTrim 43 | * @param $subject String passed to Multibyte::trim() 44 | * @param $expected Expected result; null means the subject must come back unchanged 45 | * @return void 46 | */ 47 | public function testTrim($subject, $expected=null) 48 | { 49 | if ($expected === null) { 50 | $expected = $subject; 51 | } 52 | $this->assertSame($expected, Multibyte::trim($subject)); 53 | } 54 | 55 | /** 56 | * @return array[] Rows of [subject, expected] 57 | */ 58 | public function dataTrim() 59 | { 60 | return [ 61 | ['Foo bar', 'Foo bar'], 62 | [' Foo bar', 'Foo bar'], 63 | [' Foo bar ', 'Foo bar'], 64 | ['Foo bar ', 'Foo bar'], 65 | ]; 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /tests/SentenceTest.php: -------------------------------------------------------------------------------- 1 | object = new Sentence(); 26 | } 27 | 28 | /** 29 | * @covers ::count 30 | */ 31 | public function testCountEmpty() 32 | { 33 | $this->assertSame(0, $this->object->count('')); 34 | $this->assertSame(0, $this->object->count(' ')); 35 | $this->assertSame(0, $this->object->count("\n")); 36 | } 37 | 38 |
/** 39 | * @covers ::count 40 | */ 41 | public function testCountWord() 42 | { 43 | $this->assertSame(1, $this->object->count('Hello')); 44 | $this->assertSame(1, $this->object->count('Hello.')); 45 | $this->assertSame(1, $this->object->count('Hello...')); 46 | $this->assertSame(1, $this->object->count('Hello!')); 47 | $this->assertSame(1, $this->object->count('Hello?')); 48 | $this->assertSame(1, $this->object->count('Hello?!')); 49 | } 50 | 51 | /** 52 | * @covers ::count 53 | */ 54 | public function testCountTwoWords() 55 | { 56 | $this->assertSame(1, $this->object->count('Hello world')); 57 | $this->assertSame(1, $this->object->count('Hello world.')); 58 | $this->assertSame(1, $this->object->count('Hello world...')); 59 | $this->assertSame(1, $this->object->count('Hello world!')); 60 | $this->assertSame(1, $this->object->count('Hello world?')); 61 | $this->assertSame(1, $this->object->count('Hello world?!')); 62 | } 63 | 64 | /** 65 | * @covers ::count 66 | */ 67 | public function testCountMultipleWords() 68 | { 69 | $this->assertSame(2, $this->object->count('Hello world. Are you there')); 70 | $this->assertSame(2, $this->object->count('Hello world. Are you there?')); 71 | $this->assertSame(1, $this->object->count('Hello world, Are you there?')); 72 | $this->assertSame(1, $this->object->count('Hello world: Are you there?')); 73 | $this->assertSame(1, $this->object->count('Hello world...
Are you there?')); 74 | } 75 | 76 | /** 77 | * @covers ::count 78 | */ 79 | public function testCountLinebreaks() 80 | { 81 | $this->assertSame(2, $this->object->count("Hello world...\rAre you there?")); 82 | $this->assertSame(2, $this->object->count("Hello world...\nAre you there?")); 83 | $this->assertSame(2, $this->object->count("Hello world...\r\nAre you there?")); 84 | $this->assertSame(2, $this->object->count("Hello world...\r\n\rAre you there?")); 85 | $this->assertSame(2, $this->object->count("Hello world...\n\r\nAre you there?")); 86 | $this->assertSame(2, $this->object->count("Hello world...\n\nAre you there?")); 87 | $this->assertSame(2, $this->object->count("Hello world...\r\rAre you there?")); 88 | } 89 | 90 | /** 91 | * @covers ::count 92 | */ 93 | public function testCountAbbreviations() 94 | { 95 | $this->assertSame(1, $this->object->count("Hello mr. Smith.")); 96 | $this->assertSame(1, $this->object->count("Hello, OMG Kittens!")); 97 | $this->assertSame(1, $this->object->count("Hello, abbrev. Kittens!")); 98 | $this->assertSame(1, $this->object->count("Hello, O.M.G. Kittens!")); 99 | $this->assertSame(1, $this->object->count("Last week, former director of the A.B.C. John B. Smith was fired.")); 100 | $this->assertSame(1, $this->object->count("Mr. Smith was not available for comment..")); 101 | } 102 | 103 | /** 104 | * @covers ::count 105 | */ 106 | public function testCountMultiplePunctuation() 107 | { 108 | $this->assertSame(2, $this->object->count("Hello there. Brave new world.")); 109 | $this->assertSame(1, $this->object->count("Hello there... Brave new world.")); 110 | $this->assertSame(2, $this->object->count("Hello there?... Brave new world.")); 111 | $this->assertSame(2, $this->object->count("Hello there!... Brave new world.")); 112 | $this->assertSame(2, $this->object->count("Hello there!!! Brave new world.")); 113 | $this->assertSame(2, $this->object->count("Hello there???
Brave new world.")); 114 | } 115 | 116 | /** 117 | * @covers ::count 118 | */ 119 | public function testCountOneWordSentences() 120 | { 121 | $this->assertSame(2, $this->object->count("You? Smith?")); 122 | $this->assertSame(2, $this->object->count("You there? Smith?")); 123 | $this->assertSame(1, $this->object->count("You mr. Smith?")); 124 | $this->assertSame(2, $this->object->count("Are you there. Mister Smith?")); 125 | $this->assertSame(2, $this->object->count("Are you there. Smith, sir?")); 126 | $this->assertSame(2, $this->object->count("Are you there. Mr. Smith?")); 127 | } 128 | 129 | /** 130 | * @covers ::split 131 | */ 132 | public function testSplitEmpty() 133 | { 134 | $this->assertSame([], $this->object->split('')); 135 | $this->assertSame([], $this->object->split(' ')); 136 | $this->assertSame([], $this->object->split("\n")); 137 | } 138 | 139 | /** 140 | * @covers ::cleanupUnicode 141 | */ 142 | public function testCleanupUnicode() 143 | { 144 | $this->assertSame(['Fix "these" quotes'], $this->object->split('Fix "these" quotes')); 145 | $this->assertSame(['Fix "these" quotes'], $this->object->split("Fix \xC2\xABthese\xC2\xAB quotes")); 146 | } 147 | 148 | /** 149 | * @covers ::split 150 | */ 151 | public function testSplitWord() 152 | { 153 | $this->assertSame(['Hello'], $this->object->split('Hello')); 154 | $this->assertSame(['Hello.'], $this->object->split('Hello.')); 155 | $this->assertSame(['Hello...'], $this->object->split('Hello...')); 156 | $this->assertSame(['Hello!'], $this->object->split('Hello!')); 157 | $this->assertSame(['Hello?'], $this->object->split('Hello?')); 158 | $this->assertSame(['Hello?!'], $this->object->split('Hello?!')); 159 | } 160 | 161 | /** 162 | * @covers ::split 163 | */ 164 | public function testSplitMultipleWords() 165 | { 166 | $this->assertSame(['Hello world.', ' Are you there'], $this->object->split('Hello world.
Are you there')); 167 | $this->assertSame(['Hello world.', ' Are you there?'], $this->object->split('Hello world. Are you there?')); 168 | $this->assertSame(['Hello world.', 'Are you there'], $this->object->split('Hello world. Are you there', Sentence::SPLIT_TRIM)); 169 | $this->assertSame(['Hello world.', 'Are you there?'], $this->object->split('Hello world. Are you there?', Sentence::SPLIT_TRIM)); 170 | $this->assertSame(['Hello world, Are you there?'], $this->object->split('Hello world, Are you there?')); 171 | $this->assertSame(['Hello world: Are you there?'], $this->object->split('Hello world: Are you there?')); 172 | $this->assertSame(['Hello world... Are you there?'], $this->object->split('Hello world... Are you there?')); 173 | } 174 | 175 | /** 176 | * @covers ::split 177 | */ 178 | public function testSplitLinebreaks() 179 | { 180 | $this->assertSame(["Hello world...\r", "Are you there?"], $this->object->split("Hello world...\rAre you there?")); 181 | $this->assertSame(["Hello world...\n", " Are you there?"], $this->object->split("Hello world...\n Are you there?")); 182 | $this->assertSame(["Hello world...\n", "Are you there?"], $this->object->split("Hello world...\nAre you there?")); 183 | $this->assertSame(["Hello world...\r\n", "Are you there?"], $this->object->split("Hello world...\r\nAre you there?")); 184 | $this->assertSame(["Hello world...\r\n\r", "Are you there?"], $this->object->split("Hello world...\r\n\rAre you there?")); 185 | $this->assertSame(["Hello world...\n\r\n", "Are you there?"], $this->object->split("Hello world...\n\r\nAre you there?")); 186 | $this->assertSame(["Hello world...\n\n", "Are you there?"], $this->object->split("Hello world...\n\nAre you there?")); 187 | $this->assertSame(["Hello world...\r\r", "Are you there?"], $this->object->split("Hello world...\r\rAre you there?")); 188 | } 189 | 190 | /** 191 | * @covers ::split 192 | */ 193 | public function testSplitAbbreviations() 194 | { 195 | $this->assertSame(['Hello mr.
Smith.'], $this->object->split("Hello mr. Smith.")); 196 | $this->assertSame(['Hello, OMG Kittens!'], $this->object->split("Hello, OMG Kittens!")); 197 | $this->assertSame(['Hello, abbrev. Kittens!'], $this->object->split("Hello, abbrev. Kittens!")); 198 | $this->assertSame(['Hello, O.M.G. Kittens!'], $this->object->split("Hello, O.M.G. Kittens!")); 199 | $this->assertSame(['Last week, former director of the A.B.C. John B. Smith was fired.'], $this->object->split("Last week, former director of the A.B.C. John B. Smith was fired.")); 200 | $this->assertSame(['Mr. Smith was not available for comment..'], $this->object->split("Mr. Smith was not available for comment..")); 201 | $this->assertSame(['Hello mr. Smith.', ' Are you there?'], $this->object->split("Hello mr. Smith. Are you there?")); 202 | } 203 | 204 | /** 205 | * @covers ::split 206 | */ 207 | public function testSplitOneWordSentences() 208 | { 209 | $this->assertSame(["You?", " Smith?"], $this->object->split("You? Smith?")); 210 | $this->assertSame(["You there?", " Smith?"], $this->object->split("You there? Smith?")); 211 | $this->assertSame(["You mr. Smith?"], $this->object->split("You mr. Smith?")); 212 | $this->assertSame(["Are you there.", " Mister Smith?"], $this->object->split("Are you there. Mister Smith?")); 213 | $this->assertSame(["Are you there.", " Smith, sir?"], $this->object->split("Are you there. Smith, sir?")); 214 | $this->assertSame(["Are you there.", " Mr. Smith?"], $this->object->split("Are you there. Mr. Smith?")); 215 | } 216 | 217 | /** 218 | * @covers ::split 219 | */ 220 | public function testSplitParenthesis() 221 | { 222 | $this->assertSame(["You there (not here!).", " Mister Smith"], $this->object->split("You there (not here!). Mister Smith")); 223 | $this->assertSame(["You (not him!) here.", " Mister Smith"], $this->object->split("You (not him!) here. Mister Smith")); 224 | $this->assertSame(["(What!) you here.", " Mister Smith"], $this->object->split("(What!) you here.
Mister Smith")); 225 | $this->assertSame(["You there (not here).", " Mister Smith"], $this->object->split("You there (not here). Mister Smith")); 226 | $this->assertSame(["You (not him) here.", " Mister Smith"], $this->object->split("You (not him) here. Mister Smith")); 227 | $this->assertSame(["(What) you here.", " Mister Smith"], $this->object->split("(What) you here. Mister Smith")); 228 | } 229 | 230 | /** 231 | * @covers ::split 232 | */ 233 | public function testSentenceWithNumericValues() 234 | { 235 | $this->assertSame(1, $this->object->count("The price is £25.50, including postage and packing.")); 236 | $this->assertSame(1, $this->object->count("The price is 25.50, including postage and packing.")); 237 | $this->assertSame(1, $this->object->count("I went true to size at 10.5 cms.")); 238 | $this->assertSame(2, $this->object->count("The prices are £25.50 or £27.50, including postage and packing. I went true to size at 10.5 cms.")); 239 | $this->assertSame(1, $this->object->count("Prices will go up for 8.6% and because of that it is expensive.")); 240 | } 241 | 242 | /** 243 | * @covers ::replaceFloatNumbers 244 | * @covers ::restoreReplacements 245 | * 246 | * @dataProvider dataSplit 247 | * 248 | * @param string[] $expected Sentences that split($text) must return, in order 249 | * @param string $text Input text; count($text) must equal the number of expected sentences 250 | * 251 | * @return void 252 | */ 253 | public function testSplit($expected, $text) 254 | { 255 | $this->assertSame($expected, $this->object->split($text)); 256 | $this->assertSame(count($expected), $this->object->count($text)); 257 | } 258 | 259 | public function dataSplit() 260 | { 261 | return [ 262 | 'repeat 2' => [ 263 | [ 264 | 'He got £2.', 265 | ' He lost £2.', 266 | ' He had £2.', 267 | ], 268 | 'He got £2. He lost £2.
He had £2.', 269 | ], 270 | 'times' => [ 271 | [ 272 | 'If at 8:00 pm, do something, there is a good chance that by 8:45 pm we do something else.', 273 | ' This is another sentence', 274 | ], 275 | 'If at 8:00 pm, do something, there is a good chance that by 8:45 pm we do something else. This is another sentence', 276 | ], 277 | 'lead/trailing zeroes' => [ 278 | [ 279 | 'Number 00.20 it is', 280 | ], 281 | 'Number 00.20 it is', 282 | ], 283 | 'Bug report #15; ))) -1 index offset' => [ 284 | [ 285 | ')))', 286 | ], 287 | ')))', 288 | ], 289 | 'Price' => [ 290 | [ 291 | 'The price is 25.50, including postage and packing.', 292 | ], 293 | 'The price is 25.50, including postage and packing.', 294 | ], 295 | 'Recursive replacement' => [ 296 | [ 297 | 'From 11 to 12.', 298 | ' From 11 to 15.', 299 | ], 300 | 'From 11 to 12. From 11 to 15.', 301 | ], 302 | ]; 303 | } 304 | } 305 | -------------------------------------------------------------------------------- /tests/bootstrap.php: -------------------------------------------------------------------------------- 1 |