├── LICENSE-MIT ├── Readme.md ├── unittest.php └── truncateHTML.php /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | Copyright (c) 2018 Jean-Louis Grall, contributors 3 | 4 | Permission is hereby granted, free of charge, to any person 5 | obtaining a copy of this software and associated documentation 6 | files (the "Software"), to deal in the Software without 7 | restriction, including without limitation the rights to use, 8 | copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the 10 | Software is furnished to do so, subject to the following 11 | conditions: 12 | 13 | The above copyright notice and this permission notice shall be 14 | included in all copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES 18 | OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 20 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, 21 | WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 22 | FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 23 | OTHER DEALINGS IN THE SOFTWARE. 24 | -------------------------------------------------------------------------------- /Readme.md: -------------------------------------------------------------------------------- 1 | # truncateHTML 2 | 3 | A PHP function that truncates (shortens) a given HTML5 string to a max number of characters. 4 | 5 | __Example:__ truncate after 6 characters including the ellipsis: 6 | `
A red ball.
` __=>__ `A red…
` 7 | 8 | Compatible with PHP 5.6 and 7+ 9 | Uses the _mbstring_ PHP extension for UTF-8. 10 | More than 240 unit tests (see or run: [unittest.php](unittest.php)) 11 | 12 | _The function is in [truncateHTML.php](truncateHTML.php), you can just copy/paste it to your project._ 13 | 14 | 15 | ## Features: 16 | 17 | - Quickly truncate most common HTML5 sources without using a full HTML parser (which is ~100x slower). 18 | - Configurable ellipsis: `…`, `...`, `More`, etc. 19 | - Can include the length of the ellipsis in the truncated result. 20 | - Supports self-closing tags like: `5
5…
"); 264 | t( 6, "15
5
5
12");
271 | t( 0, "…");
272 | t( 1, "
1…");
273 | t( 2, "
12");
274 |
275 |
276 | /* TEST: Tags AND Include ellipsis length */
277 | sub(['includeEllipsisLength' => true], function() {
278 |
279 | input(['ellipsis' => "…"]);
280 |
281 | input("1234567");
282 | t( 0, "…");
283 | t( 1, "…");
284 | t( 2, "1…");
285 | t( 3, "12…");
286 | t( 4, "123…");
287 | t( 5, "1234…");
288 | t( 6, "12345…");
289 | t( 7, "1234567");
290 |
291 |
292 | input(['ellipsis' => "..."]);
293 |
294 | input("123456789");
295 | t( 0, "...");
296 | t( 1, "...");
297 | t( 2, "...");
298 | t( 3, "...");
299 | t( 4, "1...");
300 | t( 5, "12...");
301 | t( 6, "123...");
302 | t( 7, "1234...");
303 | t( 8, "12345...");
304 | t( 9, "123456789");
305 | });
306 |
307 |
308 | /* TEST: Don't count spaces separating tags */
309 | input(" 2 4 éa a
"); 430 | t( 0, "…"); 431 | t( 1, "…"); 432 | t( 2, "…"); 433 | t( 3, "éa…
"); 434 | t( 4, "éa a
"); 435 | }); 436 | }); 437 | 438 | 439 | 440 | /* TEST: Readme.md examples */ 441 | 442 | input("A red ball.
"); 443 | t( 6, "A red…
"); 444 | 445 | input("A lumberjack"); 446 | t( 5, "
A…"); 447 | 448 | input("
A lumberjack"); 449 | t( 5, "
A lum…", ['wholeWord' => false, 'includeEllipsisLength' => false]); 450 | 451 | input("https://php.net/docs.php"); 452 | t( 5, "…"); 453 | input("https://php.net/docs.php"); 454 | t(20, "https://php.net/doc…"); 455 | 456 | input("
A red ball.
", ['wholeWord' => false]); 469 | t( 9, "A red ba…
"); 470 | 471 | 472 | 473 | finish(); 474 | 475 | 476 | 477 | 478 | 479 | 480 | /*############################################################*/ 481 | /*############################################################*/ 482 | /*############################################################*/ 483 | /*###### UTILITY FUNCTIONS ######*/ 484 | 485 | function init() { 486 | global $unittest, $paramsStack, $params; 487 | 488 | ini_set('display_errors', 1); 489 | ini_set('display_startup_errors', 1); 490 | error_reporting(E_ALL); 491 | ini_set('assert.exception', 1); // Assertion failure will throw an exception 492 | 493 | if (!function_exists('truncateHTML')) { 494 | require_once('truncateHTML.php'); 495 | } 496 | 497 | // SETUP some globals: 498 | $unittest = []; // Data related to the executed tests, see definition in init(). 499 | $paramsStack = []; // Used by sub() to manage $params when changing contexts. 500 | $params; // Contains the current parameters for truncateHTML(), see definition in resetParams(). 501 | 502 | resetParams(); 503 | 504 | $unittest = [ 505 | 'startTime' => microtime(true), 506 | 'succeededTests' => 0, 507 | 'failedTests' => 0, 508 | 'executedTests' => 0, 509 | ]; 510 | } 511 | 512 | function finish() { 513 | global $unittest; 514 | 515 | $unittest['endTime'] = microtime(true); 516 | 517 | echo "\033[01;32mSuccess ({$unittest['succeededTests']}/{$unittest['executedTests']})\033[0m\n"; 518 | echo "Run time: " . round(($unittest['endTime'] - $unittest['startTime']) * 1000, 2) . 
" ms\n"; 519 | } 520 | 521 | function resetParams() { 522 | global $params; 523 | $params = [ 524 | 'html' => '', 525 | 'options' => [], 526 | ]; 527 | } 528 | 529 | 530 | function input($html) { 531 | global $params, $paramsStack; 532 | 533 | $args = func_get_args(); 534 | foreach ($args as $arg) { 535 | if (is_string($arg)) { 536 | $params['html'] = $arg; 537 | } 538 | else if (is_array($arg)) { 539 | $params['options'] = $arg + $params['options']; 540 | } 541 | } 542 | } 543 | 544 | function sub() { 545 | global $params, $paramsStack; 546 | 547 | $paramsStack[] = $params; 548 | 549 | $args = func_get_args(); 550 | foreach ($args as $arg) { 551 | if (is_string($arg)) { 552 | $params['html'] = $arg; 553 | } 554 | else if (is_array($arg)) { 555 | $params['options'] = $arg + $params['options']; 556 | } 557 | else if (is_callable($arg)) { 558 | $arg(); 559 | } 560 | } 561 | 562 | $params = array_pop($paramsStack); 563 | } 564 | 565 | function t($maxLength, $expect, array $options = []) { 566 | global $unittest, $params; 567 | 568 | $html = $params['html']; 569 | $options = $options + $params['options']; 570 | $out = truncateHTML($maxLength, $html, $options); 571 | $unittest['executedTests']++; 572 | if ($out !== $expect) { 573 | $unittest['failedTests']++; 574 | $trace = debug_backtrace(DEBUG_BACKTRACE_IGNORE_ARGS, 2); 575 | $line = $trace[0]['line']; 576 | echo "\033[01;31mFailed test {$unittest['executedTests']} (line $line):\033[0m\n"; 577 | echo "maxLength: $maxLength\n"; 578 | echo "html: '$html'\n"; 579 | echo "output: '$out'\n"; 580 | echo "expected: '$expect'\n"; 581 | echo "options: ".var_export($options, true)."\n"; 582 | exit(); 583 | } 584 | else { 585 | $unittest['succeededTests']++; 586 | } 587 | } -------------------------------------------------------------------------------- /truncateHTML.php: -------------------------------------------------------------------------------- 1 | (string) Ellipsis. Default: utf8 ? '…' : '...' 
14 | * 'includeEllipsisLength' => (bool) Does $maxLength include the length of ellipsis ? Default: true 15 | * 'wholeWord' => (bool) Truncate at end of last whole word. Default: true 16 | * 'cutWord' => (int>=0|false) Default: 18 17 | * 'utf8' => (bool) Default: true 18 | * ] 19 | * @return string $truncated_html 20 | */ 21 | function truncateHTML($maxLength, $html, array $options = []) { 22 | assert(is_int($maxLength), "Parameter \$maxLength must be an int"); 23 | assert(is_string($html), "Parameter \$html must be a string"); 24 | 25 | $_isUtf8 = !isset($options['utf8']) || $options['utf8'] === true; 26 | $default = [ 27 | // If utf8, ellipsis defaults to HORIZONTAL ELLIPSIS ('…' ie. '...' as a single unicode character): 28 | 'ellipsis' => $_isUtf8 ? "\xe2\x80\xa6" : '...', 29 | 'includeEllipsisLength' => true, 30 | 'wholeWord' => true, 31 | 'cutWord' => 18, // Set to 0 or false to disable 32 | 'utf8' => true, 33 | 34 | // Internal use: 35 | 'forceBacktrack' => false, 36 | 'debug' => false, 37 | ]; 38 | $options += $default; 39 | 40 | assert(is_int($options['cutWord']) || $options['cutWord'] === false, "Option \$options['cutWord'] must be an integer or FALSE"); 41 | 42 | // THE function that does all the work of finding the position for the ellipsis, 43 | // the position for the truncation, and keeping track of opened tags: 44 | $analyze = function($maxLength, $html, array $options = []) use (&$analyze) { 45 | // For UTF-8 input: 46 | $utf8_mod = $options['utf8'] ? 'u' : ''; 47 | $strlen = $options['utf8'] ? 'mb_strlen' : 'strlen'; 48 | $substr = $options['utf8'] ? 'mb_substr' : 'substr'; 49 | 50 | if ($maxLength === -1) { 51 | // Internal use only: in this case, we are only interested in the length of $html, not in really truncating it. 
52 | $maxLength = strlen($html); 53 | $options = ['ellipsis' => '', 'includeEllipsisLength' => false, 'wholeWord' => false] + $options; 54 | } 55 | 56 | $pos = 0; // Current position in $html 57 | $length = 0; // Length of $html at $pos (number of countable characters) 58 | $openedTags = []; // Stack of opened tags at $pos 59 | $isCounting = true; // Are we currently counting the characters we meet ? (false in HTML comments, ') { // End script: 335 | $re_nextTag = $re_inHTML; 336 | $isCounting = true; 337 | } 338 | elseif ($tag === '</style>') { // End style: 339 | $re_nextTag = $re_inHTML; 340 | $isCounting = true; 341 | } 342 | else { // Other tag: 343 | $tagName = strtolower($tagMatches[1][0]); 344 | 345 | // Opening tag: 346 | if ($tag[1] !== '/') { 347 | $isCountingTag = $isCounting && !in_array($tagName, $noCountingTags, true); 348 | if (!$reachedOpenTag()) break; 349 | 350 | // If not self-closing tag: 351 | if ($tag[strlen($tag) - 2] !== '/' && !in_array($tagName, $selfClosingTags, true)) { 352 | if ($tagName === '!--') { // Start HTML comment: 353 | $re_nextTag = $re_inComment; 354 | } 355 | elseif ($tagName === 'script') { // Start script: 356 | $re_nextTag = $re_inScript; 357 | } 358 | elseif ($tagName === 'style') { // Start style: 359 | $re_nextTag = $re_inStyle; 360 | } 361 | else { 362 | // Stack opened tag: 363 | $openedTags[] = ['name' => $tagName, 'wasCounting' => $isCounting]; 364 | } 365 | $isCounting = $isCountingTag; 366 | } 367 | } 368 | // Closing tag: 369 | else { 370 | $prevTag = array_pop($openedTags); 371 | 372 | if ($tagName === $prevTag['name']) { 373 | $isCounting = $prevTag['wasCounting']; 374 | } 375 | else { // Un-paired closing tag (Malformed HTML ? Mismatched or badly nested tag ?) 
376 | if ($prevTag !== null) $openedTags[] = $prevTag; 377 | if ($options['debug'] === true) throw new \Exception("Unmatched closing tag '$tag' (\$tagPos=$tagPos, \$pos=$pos, \$length=$length)"); 378 | else { 379 | // We backtrack: 380 | if ($endData_lastCountedChar['ellipsisPos'] !== -2) { 381 | $endData_maxLength = $endData_lastCountedChar; 382 | break; 383 | } 384 | // If we cannot backtrack directly, we rerun analyze() and force backtracking: 385 | else { 386 | $maxLength = ($endData_ellipsisIncluded['ellipsisPos'] === -1) ? $ellipsis_maxLength : $maxLength; 387 | return $analyze($maxLength, $html, ['forceBacktrack' => true] + $options); 388 | } 389 | } 390 | } 391 | } 392 | } 393 | 394 | // Continue after the tag: 395 | $pos += strlen($tag); 396 | } 397 | 398 | // Complete endDatas if needed with the current $pos: 399 | foreach ([&$endData_maxLength, &$endData_ellipsisIncluded] as &$endData) { 400 | if ($endData['ellipsisPos'] === -1) { // ie. we didn't reach $maxLength 401 | // So we can include all the length to $pos: 402 | $endData['ellipsisPos'] = $pos; 403 | $endData['length'] = $length; 404 | } 405 | if ($endData['truncatePos'] === -1) { // ie. we didn't reach a countable character after $maxLength 406 | // So we can include all the bytes to $pos: 407 | $endData['truncatePos'] = $pos; 408 | $endData['openedTags'] = $openedTags; 409 | } 410 | } 411 | 412 | // Should we return $endData_maxLength or $endData_ellipsisIncluded ? 
413 | // In case we must include the ellipsis length: 414 | // - if we could reach the end of $html, it means that without the added length of the ellipsis, the length of $html is less than $maxLength 415 | // - otherwise we return the end with the ellipsis length included 416 | $endData_selected = $endData_maxLength; 417 | if ($options['includeEllipsisLength'] && $endData_maxLength['truncatePos'] !== strlen($html)) { 418 | $endData_selected = $endData_ellipsisIncluded; 419 | } 420 | 421 | return $endData_selected; 422 | }; // End of analyze() 423 | 424 | 425 | // If $maxLength is negative, remove $maxLength countable characters from the end of the $html: 426 | if ($maxLength < 0) { 427 | $maxLength = $analyze(-1, $html, $options)['length'] + $maxLength; 428 | if ($maxLength < 0) $maxLength = 0; 429 | } 430 | 431 | // Analyze $html: 432 | $r = $analyze($maxLength, $html, $options); 433 | $ellipsisPos = $r['ellipsisPos']; 434 | $truncatePos = $r['truncatePos']; 435 | $openedTags = $r['openedTags']; 436 | 437 | assert(!($ellipsisPos < 0), "Not counted: \$ellipsisPos=$ellipsisPos"); 438 | assert(!($truncatePos < 0), "Not processed: \$truncatePos=$truncatePos"); 439 | assert(!($truncatePos > strlen($html)), "Read too far: \$truncatePos=$truncatePos is greater than strlen(\$html)=".strlen($html)); 440 | 441 | // If $html is shorter than $maxLength: 442 | if ($truncatePos === strlen($html)) return $html; 443 | 444 | // Close all remaining opened tags: 445 | $closingTags = ''; 446 | while (!empty($openedTags)) $closingTags .= '</'.array_pop($openedTags)['name'].'>'; 447 | 448 | // Return truncated $html with insertion of ellipsis and appended closing tags: 449 | return substr($html, 0, $ellipsisPos) 450 | . $options['ellipsis'] 451 | . substr($html, $ellipsisPos, $truncatePos - $ellipsisPos) 452 | . $closingTags; 453 | } --------------------------------------------------------------------------------