├── tests ├── tmp │ └── .gitignore ├── .gitignore ├── _output │ └── .gitignore ├── _data │ └── dump.sql ├── _bootstrap.php ├── unit │ ├── _bootstrap.php │ └── Rose │ │ ├── Entity │ │ ├── FulltextResultTest.php │ │ ├── WordPositionContainerTest.php │ │ ├── SnippetLineTest.php │ │ ├── SentenceMapTest.php │ │ ├── QueryTest.php │ │ ├── ResultSetTest.php │ │ └── SnippetTest.php │ │ ├── Helper │ │ ├── SnippetTextHelperTest.php │ │ └── StringHelperTest.php │ │ ├── Storage │ │ ├── Database │ │ │ ├── RepositoryValidationTest.php │ │ │ └── MysqlRepositoryTest.php │ │ └── SingleFileArrayStorageTest.php │ │ ├── Stemmer │ │ └── StemmerTest.php │ │ └── FinderTest.php ├── config.php.dist.sqlite ├── unit.suite.yml ├── config.php.dist.postgres ├── config.php.dist.mysql └── _support │ ├── Helper │ ├── Unit.php │ ├── Acceptance.php │ └── Functional.php │ ├── UnitTester.php │ ├── AcceptanceTester.php │ └── FunctionalTester.php ├── .gitignore ├── bin ├── codecept ├── stem └── process_test.php ├── src └── S2 │ └── Rose │ ├── Exception │ ├── ExceptionInterface.php │ ├── UnknownException.php │ ├── RuntimeException.php │ ├── ImmutableException.php │ ├── LogicException.php │ ├── InvalidArgumentException.php │ └── UnknownIdException.php │ ├── Stemmer │ ├── StemmerInterface.php │ └── AbstractStemmer.php │ ├── Storage │ ├── Exception │ │ ├── EmptyIndexException.php │ │ └── InvalidEnvironmentException.php │ ├── StorageEraseInterface.php │ ├── TransactionalStorageInterface.php │ ├── Dto │ │ ├── SnippetResult.php │ │ └── SnippetQuery.php │ ├── FulltextProxyInterface.php │ ├── StorageReadInterface.php │ ├── Database │ │ └── IdMappingStorage.php │ ├── FulltextIndexContent.php │ ├── StorageWriteInterface.php │ ├── FulltextIndexPositionBag.php │ ├── File │ │ └── SingleFileArrayStorage.php │ ├── ArrayFulltextStorage.php │ └── ArrayStorage.php │ ├── Extractor │ ├── ExtractorInterface.php │ ├── DefaultExtractorFactory.php │ ├── ExtractionResult.php │ ├── ExtractionErrors.php │ ├── ChainExtractor.php │ 
├── HtmlRegex │ │ └── RegexExtractor.php │ └── HtmlDom │ │ └── DomState.php │ ├── Entity │ ├── ContentWithMetadata.php │ ├── Metadata │ │ ├── ImgCollection.php │ │ ├── Img.php │ │ ├── SnippetSource.php │ │ ├── SentenceCollection.php │ │ └── SentenceMap.php │ ├── HighlightIntervals.php │ ├── TocEntryWithMetadata.php │ ├── ExternalIdCollection.php │ ├── ResultTrace.php │ ├── ExternalId.php │ ├── FulltextQuery.php │ ├── TocEntry.php │ ├── WordPositionContainer.php │ ├── Indexable.php │ ├── ResultItem.php │ ├── Snippet.php │ ├── Query.php │ ├── FulltextResult.php │ └── SnippetLine.php │ ├── Helper │ ├── ProfileHelper.php │ ├── SnippetTextHelper.php │ └── StringHelper.php │ ├── Snippet │ └── SnippetBuilder.php │ ├── Finder.php │ └── Indexer.php ├── .editorconfig ├── codeception.yml ├── composer.json ├── LICENSE ├── .github └── workflows │ ├── test_sqlite.yml │ ├── test_postgres.yml │ └── test_mysql.yml └── doc └── rose.svg /tests/tmp/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /tests/.gitignore: -------------------------------------------------------------------------------- 1 | config.php 2 | -------------------------------------------------------------------------------- /tests/_output/.gitignore: -------------------------------------------------------------------------------- 1 | * 2 | -------------------------------------------------------------------------------- /tests/_data/dump.sql: -------------------------------------------------------------------------------- 1 | /* Replace this file with actual dump of your database */ -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .git 2 | .idea/ 3 | vendor/ 4 | tests/_support/_generated/ 5 | composer.lock 6 | 
-------------------------------------------------------------------------------- /tests/_bootstrap.php: -------------------------------------------------------------------------------- 1 | 'sqlite:tests/_output/s2_rose_test', 6 | 'username' => '', 7 | 'passwd' => '', 8 | ]; 9 | -------------------------------------------------------------------------------- /tests/unit.suite.yml: -------------------------------------------------------------------------------- 1 | # Codeception Test Suite Configuration 2 | # 3 | # Suite for unit (internal) tests. 4 | 5 | class_name: UnitTester 6 | modules: 7 | enabled: 8 | - Asserts 9 | - \Helper\Unit 10 | -------------------------------------------------------------------------------- /src/S2/Rose/Exception/ExceptionInterface.php: -------------------------------------------------------------------------------- 1 | 'pgsql:host=127.0.0.1;dbname=s2_rose_test', 6 | 'username' => 'postgres', 7 | 'passwd' => '12345', 8 | ]; 9 | -------------------------------------------------------------------------------- /tests/config.php.dist.mysql: -------------------------------------------------------------------------------- 1 | 'mysql:host=127.0.0.1;dbname=s2_rose_test;charset=utf8', 6 | 'username' => 'root', 7 | 'passwd' => 'root', 8 | ]; 9 | -------------------------------------------------------------------------------- /src/S2/Rose/Exception/UnknownException.php: -------------------------------------------------------------------------------- 1 | nextStemmer = $nextStemmer; 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /codeception.yml: -------------------------------------------------------------------------------- 1 | actor: Tester 2 | paths: 3 | tests: tests 4 | log: tests/_output # prior to 5.0? 5 | output: tests/_output # ~5.0? 
6 | data: tests/_data 7 | support: tests/_support 8 | envs: tests/_envs 9 | bootstrap: _bootstrap.php 10 | settings: 11 | colors: true 12 | memory_limit: 1024M 13 | extensions: 14 | enabled: 15 | - Codeception\Extension\RunFailed 16 | modules: 17 | config: 18 | Db: 19 | dsn: '' 20 | user: '' 21 | password: '' 22 | dump: tests/_data/dump.sql 23 | coverage: 24 | enabled: true 25 | include: 26 | - src/* 27 | -------------------------------------------------------------------------------- /src/S2/Rose/Storage/TransactionalStorageInterface.php: -------------------------------------------------------------------------------- 1 | assertEquals(0.9889808283708308, FulltextResult::frequencyReduction(50, 2)); 20 | $this->assertEquals(0.17705374665950163, FulltextResult::frequencyReduction(50, 25)); 21 | $this->assertEquals(1, FulltextResult::frequencyReduction(3, 2)); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/S2/Rose/Extractor/DefaultExtractorFactory.php: -------------------------------------------------------------------------------- 1 | attachExtractor(new DomExtractor()); 19 | } 20 | $extractor->attachExtractor(new RegexExtractor()); 21 | 22 | return $extractor; 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /tests/_support/UnitTester.php: -------------------------------------------------------------------------------- 1 | ' . PHP_EOL); 17 | } 18 | 19 | $language = $argv[1]; 20 | $argument = $argv[2]; 21 | 22 | $stemmer = match ($language) { 23 | 'russian' => new \S2\Rose\Stemmer\PorterStemmerRussian(), 24 | 'english' => new \S2\Rose\Stemmer\PorterStemmerEnglish(), 25 | default => throw new \Exception('Unknown stemmer language: ' . 
$language), 26 | }; 27 | 28 | echo $stemmer->stemWord($argument), PHP_EOL; 29 | -------------------------------------------------------------------------------- /src/S2/Rose/Storage/Dto/SnippetResult.php: -------------------------------------------------------------------------------- 1 | data[$externalId->toString()][] = $snippet; 19 | } 20 | 21 | public function iterate(callable $callback): void 22 | { 23 | foreach ($this->data as $serializedId => $snippets) { 24 | $callback(ExternalId::fromString($serializedId), ...$snippets); 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "s2/rose", 3 | "description": "Search engine for PHP sites", 4 | "authors": [ 5 | { 6 | "name": "Roman Parpalak", 7 | "email": "roman@parpalak.com" 8 | } 9 | ], 10 | "license": "MIT", 11 | "require": { 12 | "php": ">=7.4", 13 | "ext-json": "*", 14 | "symfony/polyfill-mbstring": "^1.2", 15 | "psr/log": "^1.1|^2.0|^3.0" 16 | }, 17 | "require-dev": { 18 | "codeception/codeception": "^4.2|^5.0", 19 | "codeception/module-asserts": "^1.3|^3.0" 20 | }, 21 | "suggest": { 22 | "ext-dom": "*", 23 | "ext-pdo": "*" 24 | }, 25 | "autoload": { 26 | "psr-4": { 27 | "S2\\Rose\\": "src/S2/Rose" 28 | } 29 | }, 30 | "autoload-dev": { 31 | "psr-4": { 32 | "S2\\Rose\\Test\\": "tests/unit/Rose" 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/S2/Rose/Storage/FulltextProxyInterface.php: -------------------------------------------------------------------------------- 1 | sentenceMap = $sentenceMap; 20 | $this->imageCollection = $images; 21 | } 22 | 23 | public function getSentenceMap(): SentenceMap 24 | { 25 | return $this->sentenceMap; 26 | } 27 | 28 | public function getImageCollection(): ImgCollection 29 | { 30 | return $this->imageCollection; 31 | } 32 | } 33 | 
-------------------------------------------------------------------------------- /src/S2/Rose/Extractor/ExtractionResult.php: -------------------------------------------------------------------------------- 1 | contentWithMetadata = $contentWithMetadata; 19 | $this->errors = $errors; 20 | } 21 | 22 | public function getContentWithMetadata(): ContentWithMetadata 23 | { 24 | return $this->contentWithMetadata; 25 | } 26 | 27 | public function getErrors(): ExtractionErrors 28 | { 29 | return $this->errors; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/S2/Rose/Exception/UnknownIdException.php: -------------------------------------------------------------------------------- 1 | getId(), 18 | $externalId->getInstanceId() 19 | )); 20 | } 21 | 22 | public static function createResultMissingExternalId(ExternalId $externalId) 23 | { 24 | return new static(sprintf( 25 | 'External id "%s" for instance "%s" not found in result.', 26 | $externalId->getId(), 27 | $externalId->getInstanceId() 28 | )); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/S2/Rose/Storage/StorageReadInterface.php: -------------------------------------------------------------------------------- 1 | Img::fromArray($item), json_decode($json, true, 512, JSON_THROW_ON_ERROR))); 24 | } 25 | 26 | public function toJson(): string 27 | { 28 | /** @noinspection PhpUnhandledExceptionInspection */ 29 | return json_encode($this->getArrayCopy(), JSON_THROW_ON_ERROR | JSON_UNESCAPED_UNICODE); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/S2/Rose/Entity/HighlightIntervals.php: -------------------------------------------------------------------------------- 1 | hasPreviousInterval) { 19 | $this->highlightIntervals[] = [$start, $end]; 20 | } else { 21 | $this->highlightIntervals[\count($this->highlightIntervals) - 1][1] = $end; 22 | } 23 | 24 | 
$this->hasPreviousInterval = true; 25 | } 26 | 27 | public function skipInterval(): void 28 | { 29 | $this->hasPreviousInterval = false; 30 | } 31 | 32 | public function toArray(): array 33 | { 34 | return $this->highlightIntervals; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /tests/unit/Rose/Helper/SnippetTextHelperTest.php: -------------------------------------------------------------------------------- 1 | alert(1)', SnippetSource::FORMAT_PLAIN_TEXT, true); 20 | 21 | $this->assertSame('<script>alert(1)</script>', $result); 22 | } 23 | 24 | public function testKeepsInternalFormattingTags(): void 25 | { 26 | $result = SnippetTextHelper::prepareForOutput('\\iDanger\\I text', SnippetSource::FORMAT_INTERNAL, true); 27 | 28 | $this->assertSame('Danger text', $result); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /tests/unit/Rose/Storage/Database/RepositoryValidationTest.php: -------------------------------------------------------------------------------- 1 | expectException(InvalidArgumentException::class); 18 | new MysqlRepository(new class extends \PDO { 19 | public function __construct() {} 20 | }, 'bad;DROP', []); 21 | } 22 | 23 | public function testRejectsInvalidTableOverride(): void 24 | { 25 | $this->expectException(InvalidArgumentException::class); 26 | new MysqlRepository(new class extends \PDO { 27 | public function __construct() {} 28 | }, 'ok_prefix', ['toc' => 'toc;DROP']); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/S2/Rose/Storage/Database/IdMappingStorage.php: -------------------------------------------------------------------------------- 1 | idMapping[$externalId->toString()] = $internalId; 22 | } 23 | 24 | public function remove(ExternalId $externalId) 25 | { 26 | unset($this->idMapping[$externalId->toString()]); 27 | } 28 | 29 | public function clear() 30 | { 31 | 
$this->idMapping = []; 32 | } 33 | 34 | public function get(ExternalId $externalId) 35 | { 36 | $externalIdString = $externalId->toString(); 37 | if (!isset($this->idMapping[$externalIdString])) { 38 | return null; 39 | } 40 | 41 | return $this->idMapping[$externalIdString]; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Roman Parpalak 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /src/S2/Rose/Entity/TocEntryWithMetadata.php: -------------------------------------------------------------------------------- 1 | tocEntry = $tocEntry; 21 | $this->externalId = $externalId; 22 | $this->imgCollection = $imgCollection; 23 | } 24 | 25 | public function getTocEntry(): TocEntry 26 | { 27 | return $this->tocEntry; 28 | } 29 | 30 | public function getExternalId(): ExternalId 31 | { 32 | return $this->externalId; 33 | } 34 | 35 | /** 36 | * @return ImgCollection|Img[] 37 | */ 38 | public function getImgCollection(): ImgCollection 39 | { 40 | return $this->imgCollection; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /.github/workflows/test_sqlite.yml: -------------------------------------------------------------------------------- 1 | name: Test on SQLite 2 | 3 | on: [ push ] 4 | 5 | jobs: 6 | build: 7 | strategy: 8 | fail-fast: false 9 | matrix: 10 | operating_system: 11 | - 'ubuntu-22.04' 12 | php_versions: 13 | - '7.4' 14 | - '8.0' 15 | - '8.1' 16 | - '8.2' 17 | - '8.3' 18 | - '8.4' 19 | 20 | runs-on: '${{ matrix.operating_system }}' 21 | 22 | steps: 23 | - uses: actions/checkout@v4 24 | 25 | - name: 'Setup PHP' 26 | uses: shivammathur/setup-php@v2 27 | with: 28 | php-version: ${{ matrix.php_versions }} 29 | 30 | - name: Show SQLite version 31 | run: php --ri sqlite3 32 | 33 | - name: Install dependencies 34 | run: COMPOSER_MEMORY_LIMIT=-1 composer install --prefer-dist --no-interaction 35 | 36 | - name: Prepare config 37 | run: cp tests/config.php.dist.sqlite tests/config.php 38 | 39 | - name: Run test cases 40 | run: php bin/codecept run --skip-group profile 41 | 42 | - name: Run profiling 43 | if: success() || failure() 44 | run: php bin/codecept run -g profile -d -------------------------------------------------------------------------------- /src/S2/Rose/Helper/ProfileHelper.php: 
-------------------------------------------------------------------------------- 1 | $message, 15 | 'duration' => $duration, 16 | 'memory_usage' => memory_get_usage(), 17 | 'memory_peak_usage' => memory_get_peak_usage(), 18 | ]; 19 | } 20 | 21 | public static function formatProfilePoint(array $point): string 22 | { 23 | $point['message'] = str_pad($point['message'], 25, ' ', STR_PAD_RIGHT); 24 | $point['duration'] = str_pad(number_format($point['duration'] * 1000.0, 2, '.', ' ') . ' ms', 20, ' ', STR_PAD_LEFT); 25 | $point['memory_usage'] = str_pad(number_format($point['memory_usage'], 0, '.', ' '), 20, ' ', STR_PAD_LEFT); 26 | $point['memory_peak_usage'] = str_pad(number_format($point['memory_peak_usage'], 0, '.', ' '), 20, ' ', STR_PAD_LEFT); 27 | 28 | return implode('', $point); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /bin/process_test.php: -------------------------------------------------------------------------------- 1 | setAttribute(\PDO::ATTR_ERRMODE, \PDO::ERRMODE_EXCEPTION); 18 | 19 | $storage = new \S2\Rose\Storage\Database\PdoStorage($pdo, 'multiprocess_'); 20 | // $storage->erase(); 21 | 22 | $stemmer = new \S2\Rose\Stemmer\PorterStemmerRussian(); 23 | $indexer = new \S2\Rose\Indexer($storage, $stemmer); 24 | 25 | $filenames = glob(__DIR__ . '/../tests/Resource/data/' . '*.txt'); 26 | $filenames = array_slice($filenames, 0, TEST_FILE_NUM); 27 | 28 | foreach ($filenames as $filename) { 29 | echo 'Indexing ', $filename, "\n"; 30 | $content = file_get_contents($filename) . ' ' . 
rand(); 31 | $indexable = new \S2\Rose\Entity\Indexable( 32 | basename($filename), 33 | substr($content, 0, strpos($content, "\n")), 34 | $content 35 | ); 36 | 37 | $indexer->index($indexable); 38 | } 39 | -------------------------------------------------------------------------------- /src/S2/Rose/Storage/FulltextIndexContent.php: -------------------------------------------------------------------------------- 1 | getExternalId()->toString(); 20 | 21 | $contentPositions = $positionBag->getContentPositions(); 22 | if (\count($contentPositions) > 0) { 23 | $this->dataByExternalId[$serializedExtId][$word] = $contentPositions; 24 | } 25 | 26 | $this->dataByWord[$word][$serializedExtId] = $positionBag; 27 | } 28 | 29 | /** 30 | * @return FulltextIndexPositionBag[][] 31 | * @deprecated TODO rename or refactor this data transformation 32 | */ 33 | public function toArray(): array 34 | { 35 | return $this->dataByWord; 36 | } 37 | 38 | public function iterateContentWordPositions(\Closure $callback): void 39 | { 40 | foreach ($this->dataByExternalId as $serializedExtId => $data) { 41 | $callback(ExternalId::fromString($serializedExtId), new WordPositionContainer($data)); 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/S2/Rose/Entity/ExternalIdCollection.php: -------------------------------------------------------------------------------- 1 | externalIds = $externalIds; 32 | } 33 | 34 | /** 35 | * @param string[] $serializedExternalIds 36 | */ 37 | public static function fromStringArray(array $serializedExternalIds): self 38 | { 39 | return new self(array_map(static function ($serializedExtId) { 40 | return ExternalId::fromString($serializedExtId); 41 | }, $serializedExternalIds)); 42 | } 43 | 44 | /** 45 | * @return ExternalId[] 46 | */ 47 | public function toArray(): array 48 | { 49 | return $this->externalIds; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- 
/src/S2/Rose/Extractor/ExtractionErrors.php: -------------------------------------------------------------------------------- 1 | errors[] = [ 19 | 'message' => $message, 20 | 'code' => $code, 21 | 'line' => $line, 22 | 'column' => $column 23 | ]; 24 | 25 | return $this; 26 | } 27 | 28 | /** @noinspection PhpComposerExtensionStubsInspection */ 29 | public function addLibXmlError(\LibXMLError $error): self 30 | { 31 | return $this->addError(trim($error->message), (string)$error->code, $error->line, $error->column); 32 | } 33 | 34 | public function hasErrors(): bool 35 | { 36 | return \count($this->errors) > 0; 37 | } 38 | 39 | /** 40 | * @return string[] 41 | */ 42 | public function getFormattedLines(): array 43 | { 44 | return array_map(static fn(array $error) => sprintf( 45 | "%s:%s %s (code=%s)", 46 | $error['line'], 47 | $error['column'] ?? '?', 48 | $error['message'], 49 | $error['code'] 50 | ), $this->errors); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/S2/Rose/Extractor/ChainExtractor.php: -------------------------------------------------------------------------------- 1 | extractors[] = $extractor; 25 | } 26 | 27 | /** 28 | * {@inheritdoc} 29 | * @throws RuntimeException 30 | */ 31 | public function extract(string $text): ExtractionResult 32 | { 33 | if (\count($this->extractors) === 0) { 34 | throw new LogicException('No extractors were attached to the ChainExtractor.'); 35 | } 36 | 37 | $e = null; 38 | foreach ($this->extractors as $extractor) { 39 | try { 40 | return $extractor->extract($text); 41 | } catch (\Exception $e) { 42 | if ($this->logger) { 43 | $this->logger->error($e->getMessage(), ['exception' => $e]); 44 | } 45 | } 46 | } 47 | 48 | throw new RuntimeException($e->getMessage(), 0, $e); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/S2/Rose/Entity/ResultTrace.php: 
-------------------------------------------------------------------------------- 1 | data[$serializedExtId]['fulltext ' . $word][] = [ 20 | sprintf( 21 | '%s: match at positions [%s]', 22 | array_product($weights), 23 | implode(', ', $positions) 24 | ) => $weights, 25 | ]; 26 | } 27 | 28 | /** 29 | * @param float[]|array $weights 30 | */ 31 | public function addKeywordWeight(string $word, string $serializedExtId, array $weights): void 32 | { 33 | $this->data[$serializedExtId]['keyword ' . $word][] = [ 34 | (string)array_product($weights) => $weights, 35 | ]; 36 | } 37 | 38 | public function addNeighbourWeight(string $word1, string $word2, string $serializedExtId, float $weight, int $distance): void 39 | { 40 | $this->data[$serializedExtId]['fulltext ' . $word1 . ' - ' . $word2][] = $weight . ': matches are close (shift = ' . $distance . ')'; 41 | } 42 | 43 | public function toArray(): array 44 | { 45 | return $this->data; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/S2/Rose/Entity/Metadata/Img.php: -------------------------------------------------------------------------------- 1 | src = $src; 19 | $this->width = $width; 20 | $this->height = $height; 21 | $this->alt = $alt; 22 | } 23 | 24 | public function getSrc(): string 25 | { 26 | return $this->src; 27 | } 28 | 29 | public function getWidth(): string 30 | { 31 | return $this->width; 32 | } 33 | 34 | public function getHeight(): string 35 | { 36 | return $this->height; 37 | } 38 | 39 | public function getAlt(): string 40 | { 41 | return $this->alt; 42 | } 43 | 44 | public static function fromArray(array $img): Img 45 | { 46 | return new self($img['src'], $img['width'], $img['height'], $img['alt']); 47 | } 48 | 49 | /** 50 | * @return mixed 51 | */ 52 | #[\ReturnTypeWillChange] 53 | public function jsonSerialize() 54 | { 55 | return get_object_vars($this); 56 | } 57 | 58 | public function hasNumericDimensions(): bool 59 | { 60 | return 
is_numeric($this->width) && is_numeric($this->height); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/S2/Rose/Storage/StorageWriteInterface.php: -------------------------------------------------------------------------------- 1 | toArray() as $externalId) { 20 | $this->data[$externalId->toString()] = null; 21 | } 22 | } 23 | 24 | /** 25 | * @param int[] $positions 26 | */ 27 | public function attach(ExternalId $externalId, array $positions): void 28 | { 29 | $serializedExtId = $externalId->toString(); 30 | if (isset($this->data[$serializedExtId])) { 31 | throw new LogicException(sprintf('SnippetQuery already has id "%s".', $serializedExtId)); 32 | } 33 | $this->data[$serializedExtId] = $positions; 34 | } 35 | 36 | public function iterate(callable $callback): void 37 | { 38 | foreach ($this->data as $serializedExtId => $positions) { 39 | $callback(ExternalId::fromString($serializedExtId), $positions); 40 | } 41 | } 42 | 43 | /** 44 | * @return ExternalId[] 45 | */ 46 | public function getExternalIds(): array 47 | { 48 | return array_map(static fn(string $serializedExtId) => ExternalId::fromString($serializedExtId), array_keys($this->data)); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/S2/Rose/Entity/ExternalId.php: -------------------------------------------------------------------------------- 1 | 0)) { 22 | // @codeCoverageIgnoreStart 23 | throw new InvalidArgumentException('Instance id must be positive.'); 24 | // @codeCoverageIgnoreEnd 25 | } 26 | 27 | if (!\is_string($id) && !\is_int($id) && !\is_float($id)) { 28 | // @codeCoverageIgnoreStart 29 | throw new InvalidArgumentException('External id must be string or int or float.'); 30 | // @codeCoverageIgnoreEnd 31 | } 32 | 33 | $this->id = (string)$id; 34 | $this->instanceId = $instanceId; 35 | } 36 | 37 | public function getId(): string 38 | { 39 | return $this->id; 40 | } 41 | 42 | public 
function getInstanceId(): ?int 43 | { 44 | return $this->instanceId; 45 | } 46 | 47 | public function toString(): string 48 | { 49 | return $this->instanceId . ':' . $this->id; 50 | } 51 | 52 | public static function fromString(string $string): self 53 | { 54 | $data = explode(':', $string, 2); 55 | 56 | return new static($data[1], $data[0] !== '' ? (int)$data[0] : null); 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/S2/Rose/Entity/FulltextQuery.php: -------------------------------------------------------------------------------- 1 | words = array_values($words); 29 | $this->extractStems($stemmer); 30 | } 31 | 32 | protected function extractStems(StemmerInterface $stemmer): void 33 | { 34 | foreach ($this->words as $i => $word) { 35 | $stemWord = $stemmer->stemWord($word); 36 | if ($stemWord !== $word) { 37 | $this->additionalStems[$i] = $stemWord; 38 | } 39 | } 40 | } 41 | 42 | /** 43 | * @return string[] 44 | */ 45 | public function getWordsWithStems(): array 46 | { 47 | return array_merge($this->words, $this->additionalStems); 48 | } 49 | 50 | public function toWordPositionContainer(): WordPositionContainer 51 | { 52 | $container = new WordPositionContainer(); 53 | 54 | foreach ($this->words as $position => $word) { 55 | $container->addWordAt($word, $position); 56 | } 57 | 58 | foreach ($this->additionalStems as $position => $stem) { 59 | $container->addWordAt($stem, $position); 60 | } 61 | 62 | return $container; 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /tests/unit/Rose/Storage/Database/MysqlRepositoryTest.php: -------------------------------------------------------------------------------- 1 | capturedSql = $statement; 29 | 30 | return new class($this) extends \PDOStatement { 31 | private $pdo; 32 | 33 | public function __construct($pdo) 34 | { 35 | $this->pdo = $pdo; 36 | } 37 | 38 | public function execute($params = null): bool 39 | { 40 | 
$this->pdo->executedParams[] = $params ?? []; 41 | 42 | return true; 43 | } 44 | }; 45 | } 46 | }; 47 | 48 | $repository = new MysqlRepository($pdo, 'prefix_', []); 49 | $repository->insertWords(['test"', "danger\\word"]); 50 | 51 | $this->assertStringNotContainsString('test"', $pdo->capturedSql); 52 | $this->assertSame([['test"', "danger\\word"]], $pdo->executedParams); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /.github/workflows/test_postgres.yml: -------------------------------------------------------------------------------- 1 | name: Test on PostgreSQL 2 | 3 | on: [ push ] 4 | 5 | jobs: 6 | build: 7 | strategy: 8 | fail-fast: false 9 | matrix: 10 | operating_system: ['ubuntu-22.04'] 11 | postgresql-version: [10, 11, 12, 13, 14, 15, 16, 17, 18] 12 | php_versions: 13 | - '8.4' 14 | 15 | runs-on: '${{ matrix.operating_system }}' 16 | 17 | steps: 18 | - uses: actions/checkout@v4 19 | 20 | - name: Install PostgreSQL 21 | env: 22 | POSTGRESQL_VERSION: ${{ matrix.postgresql-version }} 23 | run: | 24 | sudo sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list' 25 | wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add - 26 | sudo apt-get update 27 | sudo apt-get -y install "postgresql-$POSTGRESQL_VERSION" 28 | sudo service postgresql start 29 | 30 | - name: Set up PostgreSQL 31 | run: | 32 | sudo -u postgres psql -c "ALTER USER postgres WITH PASSWORD '12345';" 33 | sudo -u postgres psql -c "CREATE DATABASE s2_rose_test OWNER postgres;" 34 | 35 | - name: 'Setup PHP' 36 | uses: shivammathur/setup-php@v2 37 | with: 38 | php-version: ${{ matrix.php_versions }} 39 | 40 | - name: Install dependencies 41 | run: COMPOSER_MEMORY_LIMIT=-1 composer install --prefer-dist --no-interaction 42 | 43 | - name: Prepare config 44 | run: cp tests/config.php.dist.postgres tests/config.php 45 | 46 | - name: Run test cases 47 | 
run: php bin/codecept run --skip-group profile 48 | 49 | - name: Run profiling 50 | if: success() || failure() 51 | run: php bin/codecept run -g profile -d -------------------------------------------------------------------------------- /src/S2/Rose/Storage/FulltextIndexPositionBag.php: -------------------------------------------------------------------------------- 1 | externalId = $externalId; 29 | $this->titlePositions = $titlePositions; 30 | $this->keywordPositions = $keywordPositions; 31 | $this->contentPositions = $contentPositions; 32 | $this->wordCount = $wordCount; 33 | $this->externalRelevanceRatio = $externalRelevanceRatio; 34 | } 35 | 36 | public function getExternalId(): ExternalId 37 | { 38 | return $this->externalId; 39 | } 40 | 41 | public function getTitlePositions(): array 42 | { 43 | return $this->titlePositions; 44 | } 45 | 46 | public function getKeywordPositions(): array 47 | { 48 | return $this->keywordPositions; 49 | } 50 | 51 | public function getContentPositions(): array 52 | { 53 | return $this->contentPositions; 54 | } 55 | 56 | public function getWordCount(): int 57 | { 58 | return $this->wordCount; 59 | } 60 | 61 | public function getExternalRelevanceRatio(): float 62 | { 63 | return $this->externalRelevanceRatio; 64 | } 65 | 66 | public function hasExternalRelevanceRatio(): bool 67 | { 68 | return $this->externalRelevanceRatio !== 1.0; 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/S2/Rose/Entity/TocEntry.php: -------------------------------------------------------------------------------- 1 | title = $title; 22 | $this->description = $description; 23 | $this->date = $date; 24 | $this->url = $url; 25 | $this->relevanceRatio = $relevanceRatio; 26 | $this->hash = $hash; 27 | } 28 | 29 | public function getTitle(): string 30 | { 31 | return $this->title; 32 | } 33 | 34 | public function getDescription(): string 35 | { 36 | return $this->description; 37 | } 38 | 39 | public 
function getDate(): ?\DateTime 40 | { 41 | return $this->date; 42 | } 43 | 44 | public function getUrl(): string 45 | { 46 | return $this->url; 47 | } 48 | 49 | public function getRelevanceRatio(): float 50 | { 51 | return $this->relevanceRatio; 52 | } 53 | 54 | public function getInternalId(): ?int 55 | { 56 | return $this->internalId; 57 | } 58 | 59 | public function getHash(): string 60 | { 61 | return $this->hash; 62 | } 63 | 64 | /** 65 | * @deprecated Make immutable 66 | */ 67 | public function setInternalId(int $internalId): self 68 | { 69 | $this->internalId = $internalId; 70 | 71 | return $this; 72 | } 73 | 74 | public function getFormattedDate(): ?string 75 | { 76 | return $this->date !== null ? $this->date->format('Y-m-d H:i:s') : null; 77 | } 78 | 79 | public function getTimeZone(): ?string 80 | { 81 | return $this->date !== null ? $this->date->getTimeZone()->getName() : null; 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /tests/unit/Rose/Entity/WordPositionContainerTest.php: -------------------------------------------------------------------------------- 1 | [23, 56, 74], 23 | 'word2' => [2, 57], 24 | ]); 25 | 26 | $this->assertEquals(1, $container->getClosestDistanceBetween('word1', 'word2', 0)); 27 | $this->assertEquals(-1, $container->getClosestDistanceBetween('word2', 'word1', 0)); 28 | $this->assertEquals(23 - 2 - 20, $container->getClosestDistanceBetween('word2', 'word1', 20)); 29 | $this->assertEquals(23 - 2 - 25, $container->getClosestDistanceBetween('word2', 'word1', 25)); 30 | } 31 | 32 | public function testCompare() 33 | { 34 | $container = new WordPositionContainer(); 35 | foreach (explode(' ', 'Циркуляция вектора напряженности электростатического поля вдоль замкнутого контура всегда равна нулю') as $k => $word) { 36 | $container->addWordAt($word, $k); 37 | } 38 | 39 | $this->assertEquals([['поля', 'нулю', 7]], $container->compareWith(new WordPositionContainer([ 40 | 'нулю' => [5], 41 | 
'нул' => [5], 42 | 'поля' => [6], 43 | 'пол' => [6], 44 | ]))); 45 | 46 | $this->assertEquals([['поля', 'нулю', 5]], $container->compareWith(new WordPositionContainer([ 47 | 'нулю' => [1], 48 | 'нул' => [1], 49 | 'поля' => [0], 50 | 'пол' => [0], 51 | ]))); 52 | 53 | $this->assertEquals([ 54 | ['вектора', 'поля', 2], 55 | ['вектора', 'контура', 4], 56 | ['поля', 'контура', 2], 57 | ], $container->compareWith(new WordPositionContainer([ 58 | 'вектора' => [1], 59 | 'поля' => [2], 60 | 'контура' => [3], 61 | ]))); 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/S2/Rose/Helper/SnippetTextHelper.php: -------------------------------------------------------------------------------- 1 | assertEquals( 32 | 'Testing string to highlight some test values, Test is case-sensitive.', 33 | $snippetLine->getHighlighted('%s', false) 34 | ); 35 | } 36 | 37 | public function testCreateHighlighted2() 38 | { 39 | $snippetLine = new SnippetLine( 40 | 'Testing string to highlight some test values, Test is case-sensitive.', 41 | SnippetSource::FORMAT_PLAIN_TEXT, 42 | new PorterStemmerEnglish(), 43 | ['Test'], // unknown stem, stems are normalized to lower case, however there is a match due to direct comparison 44 | 1 45 | ); 46 | 47 | $this->assertEquals( 48 | 'Testing string to highlight some test values, Test is case-sensitive.', 49 | $snippetLine->getHighlighted('%s', false) 50 | ); 51 | } 52 | 53 | public function testJoinHighlighted() 54 | { 55 | $snippetLine = new SnippetLine( 56 | 'Testing string to highlight some test values, Test is case-sensitive.', 57 | SnippetSource::FORMAT_PLAIN_TEXT, 58 | new PorterStemmerEnglish(), 59 | ['to', 'highlight'], 60 | 1 61 | ); 62 | 63 | $this->assertEquals( 64 | 'Testing string to highlight some test values, Test is case-sensitive.', 65 | $snippetLine->getHighlighted('%s', false) 66 | ); 67 | } 68 | 69 | public function testCreateHighlightedFail() 70 | { 71 | $snippetLine = new SnippetLine( 
72 | 'Testing string to highlight some test values, Test is case-sensitive.', 73 | SnippetSource::FORMAT_PLAIN_TEXT, 74 | new PorterStemmerEnglish(), 75 | ['test', 'is'], 76 | 2 77 | ); 78 | $this->expectException(RuntimeException::class); 79 | $snippetLine->getHighlighted('', false); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /doc/rose.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /src/S2/Rose/Extractor/HtmlRegex/RegexExtractor.php: -------------------------------------------------------------------------------- 1 | ', '
', '
', '
',]; 24 | $replaceTo = [' ', '', SentenceMap::LINE_SEPARATOR, SentenceMap::LINE_SEPARATOR, SentenceMap::LINE_SEPARATOR, SentenceMap::LINE_SEPARATOR]; 25 | 26 | foreach ([ 27 | '
', 28 | '
', 29 | '
', 30 | '
', 31 | '', '

', '

', '

', '

', '
', 40 | '

', '

', '
', '
  • ', 41 | '

    ', '

    ', '

    ', '

    ', '

    ', '
    ', 42 | '

    ', '

    ', '
    ', '
  • ', 43 | ] as $tag) { 44 | $replaceFrom[] = $tag; 45 | $replaceTo[] = self::PARAGRAPH_SEPARATOR . $tag; 46 | } 47 | foreach ([ 48 | '
  • ', '
  • ', '', '', '', '', 49 | '

    ', '', '', '', '', 50 | '', '', '', '', '', '', 51 | '

    ', '', '', '', '', 52 | ] as $tag) { 53 | $replaceFrom[] = $tag; 54 | $replaceTo[] = $tag . self::PARAGRAPH_SEPARATOR; 55 | } 56 | 57 | $text = str_replace($replaceFrom, $replaceTo, $text); 58 | 59 | $text = preg_replace('#<(script|style)[^>]*?>.*?#si', '', $text); 60 | $text = preg_replace('#<([a-z]+) [^>]*?index-skip[^>]*?>.*?#si', '', $text); 61 | 62 | $paragraphs = explode(self::PARAGRAPH_SEPARATOR, $text); 63 | $texts = array_map(static fn(string $string) => trim(strip_tags($string)), $paragraphs); // TODO allow some formatting 64 | $texts = array_filter($texts); 65 | 66 | $text = implode(' ', $texts); 67 | 68 | $text = html_entity_decode($text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5); 69 | 70 | return new ExtractionResult( 71 | new ContentWithMetadata((new SentenceMap(SnippetSource::FORMAT_PLAIN_TEXT))->add(0, '', $text), new ImgCollection()), 72 | new ExtractionErrors() 73 | ); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /tests/unit/Rose/Entity/SentenceMapTest.php: -------------------------------------------------------------------------------- 1 | expectException(\LogicException::class); 21 | $s = new SentenceMap(SnippetSource::FORMAT_PLAIN_TEXT); 22 | $s->add(2, '/html/body/p[2]/text()[1]', 'Second'); 23 | $s->add(2, '/html/body/p[2]/text()[1]', 'sentence. And a third one.'); 24 | } 25 | 26 | public function testToArrayManyPaths(): void 27 | { 28 | $s = new SentenceMap(SnippetSource::FORMAT_PLAIN_TEXT); 29 | $s->add(1, '/html/body/p[1]/text()', 'One sentence.'); 30 | $s->add(2, '/html/body/p[2]/text()[1]', 'Second'); 31 | $s->add(2, '/html/body/p[2]/br', ' '); 32 | $s->add(2, '/html/body/p[2]/text()[2]', 'sentence. 
And a third one...'); 33 | 34 | $sentenceArray = $s->toSentenceCollection()->toArray(); 35 | 36 | $this->assertEquals([ 37 | 'One sentence.', 38 | 'Second sentence.', 39 | 'And a third one...', 40 | ], $sentenceArray); 41 | } 42 | 43 | public function testToArrayOneLargePath(): void 44 | { 45 | $s = new SentenceMap(SnippetSource::FORMAT_PLAIN_TEXT); 46 | $s->add(0, '', 'А это цитата, ее тоже надо индексировать. В цитате могут быть абзацы. Ошибка астатически даёт более простую систему. Еще 1 раз проверим, как gt работает защита против xss-уязвимостей.'); 47 | 48 | $sentenceArray = $s->toSentenceCollection()->toArray(); 49 | $this->assertEquals([ 50 | 'А это цитата, ее тоже надо индексировать.', 51 | 'В цитате могут быть абзацы.', 52 | 'Ошибка астатически даёт более простую систему.', 53 | 'Еще 1 раз проверим, как gt работает защита против xss-уязвимостей.', 54 | ], $sentenceArray); 55 | } 56 | 57 | public function testToArrayOneLargePath2(): void 58 | { 59 | $s = new SentenceMap(SnippetSource::FORMAT_PLAIN_TEXT); 60 | $s->add(2, '/html/body/p[2]/text()[1]', 'Second sentence. 
And a third one...'); 61 | 62 | $sentenceArray = $s->toSentenceCollection()->toArray(); 63 | 64 | $this->assertEquals([ 65 | 'Second sentence.', 66 | 'And a third one...', 67 | ], $sentenceArray); 68 | } 69 | 70 | public function testToArrayPathPerSentence(): void 71 | { 72 | $s = new SentenceMap(SnippetSource::FORMAT_PLAIN_TEXT); 73 | $s->add(2, '/html/body/p[2]/text()[1]', 'Second sentence.'); 74 | $s->add(2, '/html/body/p[2]/br', ' '); 75 | $s->add(2, '/html/body/p[2]/text()[2]', 'And a third one...'); 76 | 77 | $sentenceArray = $s->toSentenceCollection()->toArray(); 78 | 79 | $this->assertEquals([ 80 | 'Second sentence.', 81 | 'And a third one...', 82 | ], $sentenceArray); 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/S2/Rose/Entity/WordPositionContainer.php: -------------------------------------------------------------------------------- 1 | [23, 56, 74], 18 | * 'word2' => [2, 57], 19 | * ] 20 | */ 21 | public function __construct(array $data = []) 22 | { 23 | $this->data = $data; 24 | } 25 | 26 | public function addWordAt(string $word, int $position): self 27 | { 28 | $this->data[$word][] = $position; 29 | 30 | sort($this->data[$word]); // TODO make more reliable requirement of input arrays to be sorted. 
31 | 32 | return $this; 33 | } 34 | 35 | public function compareWith(self $referenceContainer): array 36 | { 37 | $wordMap = array_keys($this->data); 38 | $len = \count($wordMap); 39 | 40 | $result = []; 41 | /** @noinspection ForeachInvariantsInspection */ 42 | for ($i = 0; $i < $len; $i++) { 43 | $word1 = (string)$wordMap[$i]; 44 | for ($j = $i + 1; $j < $len; $j++) { 45 | $word2 = (string)$wordMap[$j]; 46 | 47 | $referenceDistance = $referenceContainer->getClosestDistanceBetween($word1, $word2, 0); 48 | if ($referenceDistance === self::INFINITY) { 49 | continue; 50 | } 51 | 52 | $distance = $this->getClosestDistanceBetween($word1, $word2, $referenceDistance); 53 | 54 | $result[] = [$word1, $word2, $distance]; 55 | } 56 | } 57 | 58 | return $result; 59 | } 60 | 61 | /** 62 | * This method uses linear algorithm, therefore input arrays must be sorted. 63 | * Otherwise, the output is incorrect. 64 | * 65 | * @param int[] $a1 66 | * @param int[] $a2 67 | * 68 | * @return int It's important to return a signed value, not an absolute value. 
69 | */ 70 | protected static function compareArrays(array $a1, array $a2, int $shift): int 71 | { 72 | $len1 = \count($a1); 73 | $len2 = \count($a2); 74 | 75 | $result = self::INFINITY; 76 | $index1 = 0; 77 | $index2 = 0; 78 | 79 | while ($index1 < $len1 && $index2 < $len2) { 80 | $diff = $a2[$index2] - $a1[$index1] - $shift; 81 | 82 | if ($diff === 0) { 83 | return 0; 84 | } 85 | 86 | if (abs($result) > abs($diff)) { 87 | $result = $diff; 88 | } 89 | 90 | if ($diff < 0) { 91 | $index2++; 92 | } else { 93 | $index1++; 94 | } 95 | } 96 | 97 | return $result; 98 | } 99 | 100 | public function getClosestDistanceBetween(string $word1, string $word2, int $shift = 0): int 101 | { 102 | if (!isset($this->data[$word1], $this->data[$word2])) { 103 | return self::INFINITY; 104 | } 105 | 106 | return self::compareArrays($this->data[$word1], $this->data[$word2], $shift); 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /.github/workflows/test_mysql.yml: -------------------------------------------------------------------------------- 1 | name: Test on MySQL 2 | 3 | on: [ push ] 4 | 5 | env: 6 | DB_DATABASE: s2_rose_test 7 | DB_USER: root 8 | DB_PASSWORD: root 9 | 10 | jobs: 11 | build: 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | operating_system: ['ubuntu-22.04'] 16 | mysql_versions: 17 | - 'mariadb-10.2' 18 | - 'mariadb-10.3' 19 | - 'mariadb-10.4' 20 | - 'mariadb-10.5' 21 | - 'mariadb-10.6' 22 | - 'mariadb-10.7' 23 | - 'mariadb-10.8' 24 | - 'mariadb-10.9' 25 | - 'mariadb-10.10' 26 | - 'mariadb-10.11' 27 | - 'mariadb-11.0' 28 | - 'mariadb-11.1' 29 | - 'mariadb-11.2' 30 | - 'mariadb-11.3' 31 | - 'mariadb-11.4' 32 | - 'mariadb-11.5' 33 | - 'mariadb-11.6' 34 | - 'mariadb-11.7' 35 | - 'mariadb-11.8' 36 | - 'mariadb-12.0' 37 | - 'mariadb-12.1' 38 | - '5.6' 39 | - '5.7' 40 | - '8.0' 41 | - '8.1' 42 | - '8.2' 43 | - '8.3' 44 | - '8.4' 45 | - '9.0' 46 | - '9.1' 47 | - '9.2' 48 | - '9.3' 49 | - '9.4' 50 | - '9.5' 51 | 
php_versions: 52 | - '8.4' 53 | include: 54 | - 55 | operating_system: 'ubuntu-22.04' 56 | mysql_versions: 'mariadb-11.8' 57 | php_versions: '7.4' 58 | - 59 | operating_system: 'ubuntu-22.04' 60 | mysql_versions: 'mariadb-11.8' 61 | php_versions: '8.0' 62 | - 63 | operating_system: 'ubuntu-22.04' 64 | mysql_versions: 'mariadb-11.8' 65 | php_versions: '8.1' 66 | - 67 | operating_system: 'ubuntu-22.04' 68 | mysql_versions: 'mariadb-11.8' 69 | php_versions: '8.2' 70 | - 71 | operating_system: 'ubuntu-22.04' 72 | mysql_versions: 'mariadb-11.8' 73 | php_versions: '8.3' 74 | - 75 | operating_system: 'ubuntu-22.04' 76 | mysql_versions: 'mariadb-11.8' 77 | php_versions: '8.5' 78 | 79 | runs-on: '${{ matrix.operating_system }}' 80 | 81 | steps: 82 | - uses: actions/checkout@v4 83 | - uses: shogo82148/actions-setup-mysql@v1 84 | with: 85 | mysql-version: ${{ matrix.mysql_versions }} 86 | root-password: ${{ env.DB_PASSWORD }} 87 | 88 | - name: 'Setup MySQL' 89 | run: | 90 | mysql -e 'SELECT version();' -u${{ env.DB_USER }} -h127.0.0.1 -p${{ env.DB_PASSWORD }} 91 | mysql -e 'CREATE DATABASE ${{ env.DB_DATABASE }};' -u${{ env.DB_USER }} -h127.0.0.1 -p${{ env.DB_PASSWORD }} 92 | 93 | - name: 'Setup PHP' 94 | uses: shivammathur/setup-php@v2 95 | with: 96 | php-version: ${{ matrix.php_versions }} 97 | 98 | - name: Install dependencies 99 | run: COMPOSER_MEMORY_LIMIT=-1 composer install --prefer-dist --no-interaction 100 | 101 | - name: Prepare config 102 | run: cp tests/config.php.dist.mysql tests/config.php 103 | 104 | - name: Run test cases 105 | run: php -d register_argc_argv=On bin/codecept run --skip-group profile 106 | 107 | - name: Run profiling 108 | if: success() || failure() 109 | run: php -d register_argc_argv=On bin/codecept run -g profile -d -------------------------------------------------------------------------------- /src/S2/Rose/Entity/Metadata/SnippetSource.php: -------------------------------------------------------------------------------- 1 | $maxPosition) { 
47 | throw new InvalidArgumentException('Minimal word position cannot be greater than maximal.'); 48 | } 49 | 50 | if (!\in_array($formatId, self::ALLOWED_FORMATS, true)) { /* strict comparison: only exact format ids pass the whitelist */ 51 | throw new InvalidArgumentException(sprintf('Unknown snippet format "%s".', $formatId)); 52 | } 53 | 54 | $this->text = $text; 55 | $this->minPosition = $minPosition; 56 | $this->maxPosition = $maxPosition; 57 | $this->formatId = $formatId; 58 | } 59 | 60 | public function getText(): string 61 | { 62 | return $this->text; 63 | } 64 | 65 | public function getMinPosition(): int 66 | { 67 | return $this->minPosition; 68 | } 69 | 70 | public function getMaxPosition(): int 71 | { 72 | return $this->maxPosition; 73 | } 74 | 75 | public function getFormatId(): int 76 | { 77 | return $this->formatId; 78 | } 79 | 80 | /** 81 | * @param int[] $positions 82 | */ 83 | public function coversOneOfPositions(array $positions): bool 84 | { 85 | foreach ($positions as $position) { 86 | if ($position >= $this->minPosition && $position <= $this->maxPosition) { 87 | return true; 88 | } 89 | } 90 | 91 | return false; 92 | } 93 | 94 | public function __toString() 95 | { 96 | return $this->text; 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /tests/unit/Rose/Storage/SingleFileArrayStorageTest.php: -------------------------------------------------------------------------------- 1 | getTempFilename()); 32 | } 33 | 34 | public function testStorage() 35 | { 36 | $storage = new SingleFileArrayStorage($this->getTempFilename()); 37 | 38 | $storage->load(); 39 | 40 | $storage->addEntryToToc( 41 | new TocEntry('test title 1', '', new \DateTime(), '', 1, '4567890lkjhgfd'), 42 | new ExternalId('test_id_1') 43 | ); 44 | $storage->addEntryToToc( 45 | new TocEntry('test title 2', '', new \DateTime(), '', 1, 'edfghj8765rfg'), 46 | new ExternalId('test_id_2') 47 | ); 48 | 49 | $entry1 = $storage->getTocByExternalId(new ExternalId('test_id_1')); 50 | $entry2 = 
$storage->getTocByExternalId(new ExternalId('test_id_2')); 51 | $this->assertEquals(1, $entry1->getInternalId()); 52 | $this->assertEquals(2, $entry2->getInternalId()); 53 | 54 | $storage->addToFulltextIndex(['titleword'], ['keyword1', 'keyword2'], [1 => 'hello', 2 => 'world', 3=>'world'], new ExternalId('test_id_1')); 55 | 56 | $fulltextResult = $storage->fulltextResultByWords(['hello'], null); 57 | $info = $fulltextResult->toArray()['hello']; 58 | $this->assertArrayHasKey(':test_id_1', $info); 59 | $this->assertEquals([1], $info[':test_id_1']->getContentPositions()); 60 | $this->assertEquals([], $info[':test_id_1']->getTitlePositions()); 61 | $this->assertEquals([], $info[':test_id_1']->getKeywordPositions()); 62 | 63 | $fulltextResult = $storage->fulltextResultByWords(['world'], null); 64 | $info = $fulltextResult->toArray()['world']; 65 | $this->assertArrayHasKey(':test_id_1', $info); 66 | $this->assertEquals([2, 3], $info[':test_id_1']->getContentPositions()); 67 | $this->assertEquals([], $info[':test_id_1']->getTitlePositions()); 68 | $this->assertEquals([], $info[':test_id_1']->getKeywordPositions()); 69 | 70 | $storage->save(); 71 | 72 | $storage = new SingleFileArrayStorage($this->getTempFilename()); 73 | $storage->load(); 74 | 75 | $entry1 = $storage->getTocByExternalId(new ExternalId('test_id_1')); 76 | $this->assertEquals('test title 1', $entry1->getTitle()); 77 | $this->assertEquals('4567890lkjhgfd', $entry1->getHash()); 78 | 79 | $entry3 = $storage->getTocByExternalId(new ExternalId('test_id_3')); 80 | $this->assertNull($entry3); 81 | 82 | $storage->addToFulltextIndex([], [], [10 => 'hello', 20 => 'world'], new ExternalId('test_id_2')); 83 | 84 | $fulltextResult = $storage->fulltextResultByWords(['world'], null); 85 | $info = $fulltextResult->toArray()['world']; 86 | $this->assertArrayHasKey(':test_id_1', $info); 87 | $this->assertEquals([2, 3], $info[':test_id_1']->getContentPositions()); 88 | $this->assertArrayHasKey(':test_id_2', $info); 89 | 
$this->assertEquals([20], $info[':test_id_2']->getContentPositions()); 90 | 91 | $storage->save(); 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/S2/Rose/Entity/Indexable.php: -------------------------------------------------------------------------------- 1 | externalId = new ExternalId($id, $instanceId); 23 | $this->title = $title; 24 | $this->content = $content; 25 | } 26 | 27 | public function getExternalId(): ExternalId 28 | { 29 | return $this->externalId; 30 | } 31 | 32 | public function getTitle(): string 33 | { 34 | return $this->title; 35 | } 36 | 37 | public function setTitle(string $title): self 38 | { 39 | $this->title = $title; 40 | 41 | return $this; 42 | } 43 | 44 | public function getContent(): string 45 | { 46 | return $this->content; 47 | } 48 | 49 | public function setContent(string $content): self 50 | { 51 | $this->content = $content; 52 | 53 | return $this; 54 | } 55 | 56 | public function getKeywords(): string 57 | { 58 | return $this->keywords; 59 | } 60 | 61 | public function setKeywords(string $keywords): self 62 | { 63 | $this->keywords = $keywords; 64 | 65 | return $this; 66 | } 67 | 68 | public function getDescription(): string 69 | { 70 | return $this->description; 71 | } 72 | 73 | public function setDescription(string $description): self 74 | { 75 | $this->description = $description; 76 | 77 | return $this; 78 | } 79 | 80 | public function getDate(): ?\DateTime 81 | { 82 | return $this->date; 83 | } 84 | 85 | public function setDate(?\DateTime $date = null): self /* explicit nullable type; the implicit "= null" form is deprecated since PHP 8.4 */ 86 | { 87 | $this->date = $date; 88 | 89 | return $this; 90 | } 91 | 92 | public function getUrl(): string 93 | { 94 | return $this->url; 95 | } 96 | 97 | public function setUrl(string $url): self 98 | { 99 | $this->url = $url; 100 | 101 | return $this; 102 | } 103 | 104 | public function getRelevanceRatio(): float 105 | { 106 | return $this->relevanceRatio; 107 | } 108 | 109 | public function setRelevanceRatio(float 
$relevanceRatio): self 110 | { 111 | if ($relevanceRatio < 0.001) { 112 | throw new \DomainException('Relevance ratio must not be less than 0.001.'); 113 | } 114 | if ($relevanceRatio > 9999) { 115 | throw new \DomainException('Relevance ratio must not be greater than 9999.'); 116 | } 117 | 118 | $this->relevanceRatio = $relevanceRatio; 119 | 120 | return $this; 121 | } 122 | 123 | public function toTocEntry(): TocEntry 124 | { 125 | return new TocEntry( 126 | $this->getTitle(), 127 | $this->getDescription(), 128 | $this->getDate(), 129 | $this->getUrl(), 130 | $this->getRelevanceRatio(), 131 | $this->calcHash() 132 | ); 133 | } 134 | 135 | public function calcHash(): string 136 | { 137 | return md5(serialize([ 138 | $this->getTitle(), 139 | $this->getDescription(), 140 | $this->getKeywords(), 141 | $this->getContent(), 142 | ])); 143 | } 144 | } 145 | -------------------------------------------------------------------------------- /src/S2/Rose/Extractor/HtmlDom/DomState.php: -------------------------------------------------------------------------------- 1 | sentenceMap = new SentenceMap(SnippetSource::FORMAT_INTERNAL); 38 | } 39 | 40 | public function attachContent(string $path, string $textContent): void 41 | { 42 | if ($this->startNewParagraph) { 43 | $this->currentParagraphIndex++; 44 | $this->startNewParagraph = false; 45 | } 46 | 47 | /** 48 | * Decode all entities. '&' was encoded before and decoded in DOM processing. 49 | * @see \S2\Rose\Extractor\HtmlDom\DomExtractor::getDomDocument 50 | */ 51 | $textContent = html_entity_decode($textContent, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5); 52 | 53 | $textContent = $this->pendingFormatting . 
str_replace('\\', '\\\\', $textContent); 54 | $this->pendingFormatting = ''; 55 | 56 | $this->sentenceMap->add($this->currentParagraphIndex, $path, $textContent); 57 | } 58 | 59 | public function startNewParagraph(): void 60 | { 61 | $this->startNewParagraph = true; 62 | } 63 | 64 | public function startFormatting(string $formatting): void 65 | { 66 | if (!\in_array($formatting, self::ALLOWED_FORMATTING, true)) { 67 | throw new \LogicException(sprintf('Unknown formatting "%s".', $formatting)); 68 | } 69 | $this->formattingLevel[$formatting] = 1 + ($this->formattingLevel[$formatting] ?? 0); 70 | if ($this->formattingLevel[$formatting] === 1) { 71 | $this->pendingFormatting .= '\\' . $formatting; 72 | } 73 | } 74 | 75 | public function stopFormatting(string $formatting): void 76 | { 77 | if (!\in_array($formatting, self::ALLOWED_FORMATTING, true)) { 78 | throw new \LogicException(sprintf('Unknown formatting "%s".', $formatting)); 79 | } 80 | $level = $this->formattingLevel[$formatting] ?? 0; 81 | if ($level === 1) { 82 | if ($this->pendingFormatting === '') { 83 | // No format symbols are queued. This means that symbols of formatting start have already been added 84 | // to SentenceMap. So it is not empty and the last item can be modified. 85 | $this->sentenceMap->appendToLastItem('\\' . strtoupper($formatting)); 86 | } else { 87 | $this->pendingFormatting .= '\\' . 
strtoupper($formatting); 88 | } 89 | } 90 | if ($level > 0) { 91 | $this->formattingLevel[$formatting] = $level - 1; 92 | } 93 | } 94 | 95 | public function attachImg(string $src, string $width, string $height, string $alt): void 96 | { 97 | $this->images[] = new Img($src, $width, $height, $alt); 98 | } 99 | 100 | public function toContentWithMetadata(): ContentWithMetadata 101 | { 102 | return new ContentWithMetadata($this->sentenceMap, new ImgCollection(...$this->images)); 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/S2/Rose/Entity/Metadata/SentenceCollection.php: -------------------------------------------------------------------------------- 1 | formatId = $formatId; 24 | } 25 | 26 | /** 27 | * @param string $text Text content of a sentence. Must be formatted according to the constructor parameter. 28 | * @return void 29 | */ 30 | public function attach(string $text): void 31 | { 32 | $this->cachedWords = $this->cachedSnippetSources = null; /* invalidate both caches so getSnippetSources() cannot return a stale list */ 33 | $this->sentences[] = trim(preg_replace('#\\s+#', ' ', $text)); 34 | } 35 | 36 | public function getText(): string 37 | { 38 | return implode(' ', $this->sentences); 39 | } 40 | 41 | /** 42 | * @internal Used for tests only! 
43 | */ 44 | public function toArray(): array 45 | { 46 | return $this->sentences; 47 | } 48 | 49 | /** 50 | * @return string[] 51 | */ 52 | public function getWordsArray(): array 53 | { 54 | if ($this->cachedWords === null) { 55 | $this->buildWordsInfo(); 56 | } 57 | 58 | return $this->cachedWords; 59 | } 60 | 61 | /** 62 | * @return SnippetSource[] 63 | */ 64 | public function getSnippetSources(): array 65 | { 66 | if ($this->cachedSnippetSources === null) { 67 | $this->buildWordsInfo(); 68 | } 69 | 70 | return $this->cachedSnippetSources; 71 | } 72 | 73 | private function buildWordsInfo(): void 74 | { 75 | $this->cachedWords = []; 76 | $this->cachedSnippetSources = []; 77 | $oldSize = 0; 78 | foreach ($this->sentences as $idx => $sentence) { 79 | // NOTE: maybe it's worth to join sentences somehow before exploding for optimization reasons 80 | $contentWords = self::breakIntoWords( 81 | $this->formatId === SnippetSource::FORMAT_INTERNAL ? StringHelper::clearInternalFormatting($sentence) : $sentence 82 | ); 83 | $this->cachedWords[] = $contentWords; 84 | $wordsInSentence = \count($contentWords); 85 | if ($wordsInSentence === 0) { 86 | continue; 87 | } 88 | $newSize = $wordsInSentence + $oldSize; 89 | 90 | if ($wordsInSentence >= 2) { // Skip too short snippets 91 | $this->cachedSnippetSources[$idx] = new SnippetSource($sentence, $this->formatId, $oldSize, $newSize - 1); 92 | } 93 | 94 | $oldSize = $newSize; 95 | } 96 | $this->cachedWords = array_merge(...$this->cachedWords); 97 | } 98 | 99 | /** 100 | * @return string[] 101 | */ 102 | public static function breakIntoWords(string $content): array 103 | { 104 | // Replace decimal separator: ',' -> '.' 
105 | $content = preg_replace('#(?:^|[\s()])-?\d+\K,(?=\d+(?:$|[\s()]|\.\s))#', '.', $content); 106 | 107 | // We allow letters, digits and some punctuation: ".,-^_" 108 | $content = str_replace(',', ', ', $content); 109 | $content = preg_replace('#[^\\-.,0-9\\p{L}^_]+#u', ' ', $content); 110 | $content = mb_strtolower($content); 111 | $content = str_replace(['ё'], ['е'], $content); 112 | 113 | // These punctuation characters are meant to be inside words and numbers. 114 | // Remove trailing characters when splitting the words. 115 | $content = rtrim($content, '-.,'); 116 | 117 | $words = preg_split('#[\\-.,]*?[ ]+#S', $content); 118 | StringHelper::removeLongWords($words); 119 | 120 | return $words; 121 | } 122 | } 123 | -------------------------------------------------------------------------------- /tests/unit/Rose/Entity/QueryTest.php: -------------------------------------------------------------------------------- 1 | assertEquals([1, 2], (new Query('1|||2'))->valueToArray()); 22 | $this->assertEquals([1, 2], (new Query('1\\\\\\2'))->valueToArray()); 23 | $this->assertEquals(['a', 'b'], (new Query('a/b'))->valueToArray()); 24 | $this->assertEquals(['a', 'b'], (new Query(' a b '))->valueToArray()); 25 | $this->assertEquals(['..'], (new Query('..'))->valueToArray()); 26 | $this->assertEquals(['...'], (new Query('...'))->valueToArray()); 27 | $this->assertEquals(['a..b'], (new Query('a..b'))->valueToArray()); 28 | 29 | // Tests for replacing numbers 30 | $this->assertEquals(['1.2'], (new Query('1,2'))->valueToArray()); 31 | // $this->assertEquals(['-1.2'], (new Query('-1,2'))->valueToArray()); 32 | $this->assertEquals(['1.2'], (new Query('1.2'))->valueToArray()); 33 | 34 | // Tests for replacing typographic quotes 35 | $this->assertEquals(['"', 'text'], (new Query('«text»'))->valueToArray()); 36 | $this->assertEquals(['"', 'text'], (new Query('“text”'))->valueToArray()); 37 | 38 | // Tests for replacing dashes 39 | $this->assertEquals(['a--b'], (new 
Query('a--b'))->valueToArray()); 40 | $this->assertEquals(['a—b'], (new Query('a---b'))->valueToArray()); // --- to mdash 41 | $this->assertEquals(['a—b'], (new Query('a–b'))->valueToArray()); // ndash to mdash 42 | $this->assertEquals(['a-b'], (new Query('a−b'))->valueToArray()); // Minus to hyphen 43 | 44 | // Test for replacing line breaks and extra spaces 45 | $this->assertEquals(['a', 'b'], (new Query("a\n\nb"))->valueToArray()); 46 | $this->assertEquals(['a', 'b'], (new Query("a \t b"))->valueToArray()); 47 | 48 | // Tests for separating special characters 49 | $this->assertEquals(['a!b'], (new Query('a!b'))->valueToArray()); 50 | $this->assertEquals(['!', 'ab'], (new Query('!ab'))->valueToArray()); 51 | $this->assertEquals(['!', 'a!b'], (new Query('!a!b'))->valueToArray()); 52 | $this->assertEquals(['(', 'word', ')'], (new Query('(word)'))->valueToArray()); 53 | $this->assertEquals(['mysql', '--all-databases'], (new Query('mysql --all-databases'))->valueToArray()); 54 | 55 | // Test for replacing "ё" with "е" 56 | $this->assertEquals(['ё', 'полет', 'field'], (new Query('ё полёт field'))->valueToArray()); 57 | 58 | // Tests for handling commas 59 | $this->assertEquals(['a', ',', 'b'], (new Query('a,b'))->valueToArray()); 60 | $this->assertEquals(['a', ',,', 'b'], (new Query('a,,b'))->valueToArray()); 61 | $this->assertEquals(['a', ',,,', 'b'], (new Query('a,,,b'))->valueToArray()); 62 | 63 | // Tests for removing long words 64 | $this->assertEquals(['a', 'c'], (new Query('a ' . str_repeat('b', 101) . ' c'))->valueToArray()); 65 | 66 | // Tests for compatibility of multiple rules 67 | $this->assertEquals(['a—b', '"', 'text'], (new Query('a–b «text»'))->valueToArray()); 68 | $this->assertEquals(['a', ',', 'b'], (new Query(" a, \n b "))->valueToArray()); 69 | $this->assertEquals( 70 | ['похоже', ',', 'лучшие', 'времена', 'наступили', 'я', 'решил', 'доработать', 'и', 'опубликовать', 'движок'], 71 | (new Query('Похоже, лучшие времена наступили. 
/**
 * Attaching a snippet for an external id that the result set does not contain
 * must be rejected with UnknownIdException.
 *
 * The result set is created empty and is never frozen, so the id 'not found'
 * has never been registered in it.
 */
public function testNotFrozenAttachSnippet()
{
    $this->expectException(UnknownIdException::class);
    $resultSet = new ResultSet();
    // The snippet itself is a placeholder: one empty plain-text line, no stems, zero relevance.
    $resultSet->attachSnippet(new ExternalId('not found'), new Snippet('%s', new SnippetLine('', SnippetSource::FORMAT_PLAIN_TEXT, new PorterStemmerEnglish(), [], 0.0)));
}
/**
 * Builds a Snippet for one found item.
 *
 * @param array  $foundPositionsByStems Map "stem => positions" for the item. Stems with an empty
 *                                      position list are ignored (they did not match in the text).
 * @param string $highlightTemplate     Template handed to the Snippet for highlighting.
 * @param array  $relevanceByStems      Map "stem => relevance"; missing stems count as 0.
 * @param SnippetSource ...$snippetSources Candidate text fragments of the item.
 *
 * @return Snippet A snippet whose introduction is made of the first two sources and which has one
 *                 snippet line attached for every source covering at least one found position.
 */
public function buildSnippet(array $foundPositionsByStems, string $highlightTemplate, array $relevanceByStems, SnippetSource ...$snippetSources): Snippet
{
    $matchedStems     = [];
    $weightBySourceId = [];

    foreach ($foundPositionsByStems as $stem => $positions) {
        if (!empty($positions)) {
            $matchedStems[] = $stem;
            $stemWeight     = $relevanceByStems[$stem] ?? 0;

            // Every source that covers this stem accumulates the stem's relevance.
            foreach ($snippetSources as $sourceId => $source) {
                if (!$source->coversOneOfPositions($positions)) {
                    continue;
                }
                $weightBySourceId[$sourceId] = ($weightBySourceId[$sourceId] ?? 0) + $stemWeight;
            }
        }
    }

    // The introduction is always taken from the first two sources, without highlighted words.
    $introLines = [];
    foreach (\array_slice($snippetSources, 0, 2) as $key => $introSource) {
        $introLines[$key] = SnippetLine::createFromSnippetSourceWithoutFoundWords($introSource);
    }

    $snippet = new Snippet($highlightTemplate, ...$introLines);
    if ($this->snippetLineSeparator !== null) {
        $snippet->setLineSeparator($this->snippetLineSeparator);
    }

    if ($matchedStems === []) {
        // Nothing was found in the text itself, the introduction is all we have.
        return $snippet;
    }

    foreach ($snippetSources as $sourceId => $source) {
        if (isset($weightBySourceId[$sourceId])) {
            $line = new SnippetLine(
                $source->getText(),
                $source->getFormatId(),
                $this->stemmer,
                $matchedStems,
                $weightBySourceId[$sourceId]
            );
            $line->setMaskRegexArray($this->highlightMaskRegexArray);

            $snippet->attachSnippetLine($source->getMinPosition(), $source->getMaxPosition(), $line);
        }
    }

    return $snippet;
}
($this->toc as $serializedExtId => $entry) { 77 | $this->externalIdMap[$entry->getInternalId()] = ExternalId::fromString($serializedExtId); 78 | } 79 | 80 | return $return; 81 | } 82 | 83 | public function save(): void 84 | { 85 | @unlink($this->filename); 86 | file_put_contents($this->filename, 'fulltextProxy->getFulltextIndex()) . ':{'); 87 | $buffer = ''; 88 | $length = 0; 89 | foreach ($this->fulltextProxy->getFulltextIndex() as $word => $data) { 90 | $chunk = serialize($word) . serialize($data); 91 | $length += \strlen($chunk); 92 | $buffer .= $chunk; 93 | if ($length > 100000) { 94 | file_put_contents($this->filename, $buffer, FILE_APPEND); 95 | $buffer = ''; 96 | $length = 0; 97 | } 98 | } 99 | file_put_contents($this->filename, $buffer . '}' . "\n", FILE_APPEND); 100 | $this->fulltextProxy->setFulltextIndex([]); 101 | 102 | file_put_contents($this->filename, ' //' . serialize($this->excludedWords) . "\n", FILE_APPEND); 103 | $this->excludedWords = []; 104 | 105 | file_put_contents($this->filename, ' //' . serialize($this->metadata) . "\n", FILE_APPEND); 106 | $this->metadata = []; 107 | 108 | file_put_contents($this->filename, ' //' . serialize($this->toc) . 
/**
 * Cuts the first line off $data and returns the part of that line following the first "//".
 *
 * @param string $data Remaining file content. It is modified in place: the consumed line and its
 *                     line break are removed; when there is no line break, $data becomes ''.
 *
 * @return string Text after the first "//" marker of the consumed line.
 *
 * @throws \RuntimeException When the consumed line has no "//" marker.
 */
private function extractSerializedSection(string &$data): string
{
    // Split once at the first line break: head is the current line, tail is what remains.
    $headAndTail = explode("\n", $data, 2);
    $currentLine = $headAndTail[0];
    $data        = $headAndTail[1] ?? '';

    // Split once at the first marker: everything after it is the serialized payload.
    $prefixAndPayload = explode('//', $currentLine, 2);
    if (!isset($prefixAndPayload[1])) {
        throw new \RuntimeException('Broken SingleFileArrayStorage format: "//" marker not found.');
    }

    return $prefixAndPayload[1];
}
/**
 * Returns the snippet without formatting.
 *
 * Falls back to the description when no snippet object is attached or when the snippet text
 * is empty; if the description is empty as well, the snippet's text introduction is used.
 */
public function getSnippet(): string
{
    return $this->renderSnippet(false);
}

/**
 * Same as getSnippet(), but asks the snippet to include formatting.
 *
 * Note that the last-resort text introduction is still requested without formatting.
 */
public function getFormattedSnippet(): string
{
    return $this->renderSnippet(true);
}

/**
 * Shared implementation of getSnippet() and getFormattedSnippet().
 *
 * @param bool $includeFormatting Passed through to Snippet::toString().
 */
private function renderSnippet(bool $includeFormatting): string
{
    if ($this->snippet === null) {
        return $this->description;
    }

    $snippet = $this->snippet->toString($includeFormatting);
    // Loose check on purpose: null, '' and '0' all count as "no snippet text".
    if ($snippet) {
        return $snippet;
    }

    return $this->description ?: $this->snippet->getTextIntroduction();
}
/**
 * Renders the most relevant snippet lines as a single string.
 *
 * Lines with identical text are collapsed to the copy with the greatest relevance, then the
 * top self::SNIPPET_LINE_COUNT lines by relevance are taken and joined in their natural order.
 *
 * @param bool $includeFormatting Passed through to the line rendering.
 */
public function toString(bool $includeFormatting = false): ?string
{
    // Group attached lines by their text: text => [line index => relevance].
    $relevanceByText = [];
    foreach ($this->snippetLines as $lineIndex => $line) {
        $relevanceByText[$line->getLine()][$lineIndex] = $line->getRelevance();
    }

    // From every group keep only the copy with the greatest relevance.
    $bestRelevanceByIndex = [];
    foreach ($relevanceByText as $candidates) {
        arsort($candidates);
        reset($candidates);
        $winnerIndex = key($candidates);

        $bestRelevanceByIndex[$winnerIndex] = $candidates[$winnerIndex];
    }

    // Most relevant first, cut to the allowed number of lines, then back to document order.
    arsort($bestRelevanceByIndex);
    $topLines = \array_slice($bestRelevanceByIndex, 0, self::SNIPPET_LINE_COUNT, true);
    ksort($topLines);

    $selectedLines = [];
    foreach (array_keys($topLines) as $lineIndex) {
        $selectedLines[$lineIndex] = $this->snippetLines[$lineIndex];
    }

    return $this->implodeLines($selectedLines, $includeFormatting);
}
/**
 * {@inheritdoc}
 *
 * Decodes the compact index entry of a word into "id => type => positions".
 * An integer entry is a single content position stored as is. A string entry is a '|'-separated
 * list of base-36 positions, optionally prefixed with self::PREFIX_TITLE or self::PREFIX_KEYWORD.
 * Positions decoded from the string form are the strings returned by base_convert().
 */
public function getByWord(string $word): array
{
    $entriesById = $this->fulltextIndex[$word] ?? null;
    if ($entriesById === null) {
        return [];
    }

    $positionsById = [];
    foreach ($entriesById as $id => $packed) {
        if (\is_int($packed)) {
            $positionsById[$id][self::TYPE_CONTENT][] = $packed;
            continue;
        }

        foreach (explode('|', $packed) as $token) {
            $prefix = $token[0];
            if ($prefix === self::PREFIX_TITLE) {
                $type   = self::TYPE_TITLE;
                $digits = substr($token, 1);
            } elseif ($prefix === self::PREFIX_KEYWORD) {
                $type   = self::TYPE_KEYWORD;
                $digits = substr($token, 1);
            } else {
                $type   = self::TYPE_CONTENT;
                $digits = $token;
            }

            $positionsById[$id][$type][] = base_convert($digits, 36, 10);
        }
    }

    return $positionsById;
}
/**
 * {@inheritdoc}
 *
 * Collects the words that occur in more than $threshold indexed items.
 * The index itself is not modified.
 *
 * @param int $threshold Number of items a word must exceed to be reported.
 *
 * @return array Map "word => number of items containing it".
 */
public function getFrequentWords(int $threshold): array
{
    $result = [];
    foreach ($this->fulltextIndex as $word => $entriesById) {
        // One entry per item id, so the entry count is the number of items with this word.
        $num = \count($entriesById);
        if ($num > $threshold) {
            $result[$word] = $num;
        }
    }

    return $result;
}
/**
 * Hyphenated Russian words with a particle ("-нибудь", "-то", "-либо", "кое-"):
 * the particle must be kept in the result, while the other part is brought to the
 * expected base form or stem (e.g. "кого-нибудь" => "кто-нибудь", "нехитрое-то" => "нехитр-то").
 */
public function testParticles(): void
{
    $this->assertEquals('кто-нибудь', $this->russianStemmer->stemWord('кого-нибудь'));
    $this->assertEquals('когда-нибудь', $this->russianStemmer->stemWord('когда-нибудь'));
    $this->assertEquals('что-то', $this->russianStemmer->stemWord('чему-то'));
    $this->assertEquals('нехитр-то', $this->russianStemmer->stemWord('нехитрое-то'));
    $this->assertEquals('когда-либо', $this->russianStemmer->stemWord('когда-либо'));
    $this->assertEquals('что-либо', $this->russianStemmer->stemWord('чем-либо'));
    // Prefix particle "кое-"
    $this->assertEquals('кое-что', $this->russianStemmer->stemWord('кое-чем'));
    $this->assertEquals('кое-кто', $this->russianStemmer->stemWord('кое-кого'));
}
$this->assertEquals('метро', $this->russianStemmer->stemWord('метро')); 79 | 80 | $this->assertEquals('экзамен', $this->russianStemmer->stemWord('экзамен')); 81 | $this->assertEquals('экзамен', $this->russianStemmer->stemWord('экзамена')); 82 | $this->assertEquals('экзамен', $this->russianStemmer->stemWord('экзамену')); 83 | $this->assertEquals('экзамен', $this->russianStemmer->stemWord('экзаменом')); 84 | $this->assertEquals('экзамен', $this->russianStemmer->stemWord('экзамене')); 85 | $this->assertEquals('экзамен', $this->russianStemmer->stemWord('экзамены')); 86 | $this->assertEquals('экзамен', $this->russianStemmer->stemWord('экзаменов')); 87 | $this->assertEquals('экзамен', $this->russianStemmer->stemWord('экзаменам')); 88 | $this->assertEquals('экзамен', $this->russianStemmer->stemWord('экзаменами')); 89 | $this->assertEquals('экзамен', $this->russianStemmer->stemWord('экзаменах')); 90 | 91 | $this->assertEquals('домен', $this->russianStemmer->stemWord('домен')); 92 | $this->assertEquals('домен', $this->russianStemmer->stemWord('домена')); 93 | $this->assertEquals('домен', $this->russianStemmer->stemWord('домену')); 94 | $this->assertEquals('домен', $this->russianStemmer->stemWord('доменом')); 95 | $this->assertEquals('домен', $this->russianStemmer->stemWord('домене')); 96 | $this->assertEquals('домен', $this->russianStemmer->stemWord('домены')); 97 | $this->assertEquals('домен', $this->russianStemmer->stemWord('доменов')); 98 | $this->assertEquals('домен', $this->russianStemmer->stemWord('доменам')); 99 | $this->assertEquals('домен', $this->russianStemmer->stemWord('доменами')); 100 | $this->assertEquals('домен', $this->russianStemmer->stemWord('доменах')); 101 | 102 | $this->assertEquals('учитель', $this->englishStemmer->stemWord('Учитель')); 103 | $this->assertEquals('учител', $this->russianStemmer->stemWord('учитель')); 104 | $this->assertEquals('учител', $this->chainedStemmer1->stemWord('учитель')); 105 | $this->assertEquals('учител', 
$this->chainedStemmer2->stemWord('учитель')); 106 | 107 | $this->assertEquals('gun', $this->englishStemmer->stemWord('guns')); 108 | $this->assertEquals('guns', $this->russianStemmer->stemWord('guns')); 109 | 110 | $this->assertEquals('papa', $this->chainedStemmer1->stemWord('papa\'s')); 111 | $this->assertEquals('papa', $this->chainedStemmer2->stemWord('papa\'s')); 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /src/S2/Rose/Entity/Metadata/SentenceMap.php: -------------------------------------------------------------------------------- 1 | [ 18 | * '/html/body/p[1]/text()' => 'One sentence.', 19 | * ], 20 | * 2 => [ 21 | * '/html/body/p[2]/text()[1]' => 'Second', 22 | * '/html/body/p[2]/br' => ' ', 23 | * '/html/body/p[2]/text()[2]' => 'sentence. And a third one.', 24 | * ], 25 | * ] 26 | * 27 | * @var array[] 28 | */ 29 | private array $paragraphs = []; 30 | private int $formatId; 31 | 32 | /** 33 | * @param int $formatId Id of formatting. 34 | * @see SnippetSource::ALLOWED_FORMATS for formatting 35 | */ 36 | public function __construct(int $formatId) 37 | { 38 | $this->formatId = $formatId; 39 | } 40 | 41 | /** 42 | * @param int $paragraphIndex Number of current paragraph. Must be detected outside based on formatting. 43 | * @param string $path Some identifier of a content node. Must be unique for the paragraph given. 44 | * @param string $textContent Raw text content of a node. Formatting must correspond to formatId constructor parameter. 
/**
 * Appends text to the most recently added content node, i.e. to the last path
 * of the last paragraph.
 *
 * @param string $text Text to be concatenated to the existing node content.
 *
 * @throws LogicException When nothing has been added to the map yet.
 */
public function appendToLastItem(string $text): void
{
    // array_key_last() is O(1) and does not copy the map, unlike reversing the key list.
    $lastParagraphIndex = array_key_last($this->paragraphs);
    if ($lastParagraphIndex === null) {
        throw new LogicException('Cannot append to an empty sentence map.');
    }

    $lastPath = array_key_last($this->paragraphs[$lastParagraphIndex]);

    $this->paragraphs[$lastParagraphIndex][$lastPath] .= $text;
}
96 | */ 97 | private function processRegularSentences(string $text, SentenceCollection $sentenceCollection): void 98 | { 99 | $text = trim($text); 100 | $sentences = StringHelper::sentencesFromText($text, $this->formatId === SnippetSource::FORMAT_INTERNAL); 101 | 102 | if (($linesNum = 1 + substr_count($text, self::LINE_SEPARATOR)) > 3) { 103 | $totalWordNum = \count(SentenceCollection::breakIntoWords( 104 | $this->formatId === SnippetSource::FORMAT_INTERNAL ? StringHelper::clearInternalFormatting($text) : $text 105 | )); 106 | $avgWordNumInSentences = 1.0 * $totalWordNum / \count($sentences); 107 | $avgWordNumInLines = 1.0 * $totalWordNum / $linesNum; 108 | 109 | if ($avgWordNumInSentences > 20 && $avgWordNumInLines > 3 && $avgWordNumInLines < 15) { 110 | // Heuristics for lines separated by
    . 111 | // This branch is for lists like table of contents. 112 | $sentences = explode(self::LINE_SEPARATOR, $text); 113 | } 114 | } 115 | 116 | foreach ($sentences as $sentence) { 117 | if ($sentence === '') { 118 | continue; 119 | } 120 | $sentenceCollection->attach($sentence); 121 | } 122 | } 123 | 124 | /** 125 | * Breaks a source code into "sentences" using empty lines as a separator. 126 | */ 127 | private function processCodeSentences(string $text, SentenceCollection $sentenceCollection): void 128 | { 129 | $sentences = StringHelper::sentencesFromCode($text); 130 | 131 | foreach ($sentences as $sentence) { 132 | if ($sentence === '') { 133 | continue; 134 | } 135 | 136 | $sentenceCollection->attach($sentence); 137 | } 138 | } 139 | } 140 | -------------------------------------------------------------------------------- /src/S2/Rose/Finder.php: -------------------------------------------------------------------------------- 1 | storage = $storage; 40 | $this->stemmer = $stemmer; 41 | } 42 | 43 | public function setHighlightMaskRegexArray(array $highlightMaskRegexArray): self 44 | { 45 | $this->highlightMaskRegexArray = $highlightMaskRegexArray; 46 | 47 | return $this; 48 | } 49 | 50 | public function setHighlightTemplate(string $highlightTemplate): self 51 | { 52 | $this->highlightTemplate = $highlightTemplate; 53 | 54 | return $this; 55 | } 56 | 57 | public function setSnippetLineSeparator(string $snippetLineSeparator): self 58 | { 59 | $this->snippetLineSeparator = $snippetLineSeparator; 60 | 61 | return $this; 62 | } 63 | 64 | /** 65 | * @throws ImmutableException 66 | */ 67 | public function find(Query $query, bool $isDebug = false): ResultSet 68 | { 69 | $resultSet = new ResultSet($query->getLimit(), $query->getOffset(), $isDebug); 70 | if ($this->highlightTemplate !== null) { 71 | $resultSet->setHighlightTemplate($this->highlightTemplate); 72 | } 73 | 74 | $rawWords = $query->valueToArray(); 75 | $resultSet->addProfilePoint('Input cleanup'); 76 | 77 
/**
 * Threshold for ignoring words that occur in too many indexed items:
 * half of the TOC size (rounded down), but never less than 20.
 *
 * @param int $tocSize Number of indexed items.
 *
 * @return int Always an integer; integer division avoids returning a float
 *             (e.g. 50.5 for 101 items) from an int-typed function.
 */
public static function fulltextRateExcludeNum(int $tocSize): int
{
    return max(intdiv($tocSize, 2), 20);
}
($foundWordPositionsByExternalId as $serializedExtId => $wordsInfo) { 135 | if (!isset($relevanceByExternalIds[$serializedExtId])) { 136 | // Out of limit and offset scope, no need to fetch snippets. 137 | continue; 138 | } 139 | $externalId = ExternalId::fromString($serializedExtId); 140 | $allPositions = array_merge(...array_values($wordsInfo)); 141 | $snippetQuery->attach($externalId, $allPositions); 142 | } 143 | $resultSet->addProfilePoint('Snippets: make query'); 144 | 145 | $snippetResult = $this->storage->getSnippets($snippetQuery); 146 | 147 | $resultSet->addProfilePoint('Snippets: obtaining'); 148 | 149 | $sb = new SnippetBuilder($this->stemmer, $this->snippetLineSeparator); 150 | $sb->setHighlightMaskRegexArray($this->highlightMaskRegexArray); 151 | try { 152 | $sb->attachSnippets($resultSet, $snippetResult); 153 | } catch (ImmutableException|UnknownIdException $e) { 154 | throw new LogicException($e->getMessage(), 0, $e); 155 | } 156 | 157 | $resultSet->addProfilePoint('Snippets: building'); 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /src/S2/Rose/Entity/Query.php: -------------------------------------------------------------------------------- 1 | value = $value; 41 | } 42 | 43 | /** 44 | * @return int|null 45 | */ 46 | public function getLimit() 47 | { 48 | return $this->limit; 49 | } 50 | 51 | /** 52 | * @param int $limit 53 | * 54 | * @return self 55 | */ 56 | public function setLimit($limit) 57 | { 58 | $this->limit = $limit; 59 | 60 | return $this; 61 | } 62 | 63 | /** 64 | * @return int 65 | */ 66 | public function getOffset() 67 | { 68 | return $this->offset; 69 | } 70 | 71 | /** 72 | * @param int $offset 73 | * 74 | * @return self 75 | */ 76 | public function setOffset($offset) 77 | { 78 | $this->offset = $offset; 79 | 80 | return $this; 81 | } 82 | 83 | /** 84 | * @return string 85 | */ 86 | public function getValue() 87 | { 88 | return $this->value; 89 | } 90 | 91 | /** 92 | * @return 
/**
 * Splits the raw query value into a list of normalized, lowercased tokens.
 *
 * Tokens are words and stand-alone punctuation marks (e.g. 'a,b' gives 'a', ',', 'b').
 * Duplicates are removed, and at most self::MAX_WORDS tokens are returned.
 * A value that cannot be represented as a string yields an empty array.
 *
 * NOTE(review): keys of the returned array look non-sequential in some cases (entries are unset
 * and array_unique() keeps original keys); reindexing is delegated to
 * StringHelper::removeLongWords() — confirm it always reindexes before relying on list keys.
 *
 * @return string[]
 */
public function valueToArray()
{
    // Unsupported value types and empty strings are turned into '' by normalizeValue().
    $content = self::normalizeValue($this->value);
    if ($content === '') {
        return [];
    }

    $content = strip_tags($content);

    // Normalize typographic quotes to a plain double quote
    $content = str_replace(['«', '»', '“', '”', '‘', '’'], '"', $content);
    $content = str_replace('−', '-', $content); // Replace the minus sign with a hyphen
    // Normalize dashes. The minus sign was already replaced on the previous line, so listing it here has no effect.
    $content = str_replace(['---', '–', '−'], '—', $content);
    // Commas separated only by whitespace become one group of commas
    $content = self::safePregReplace('#,\\s+,#u', ',,', $content);
    // Every run of characters outside the allowed set (letters, digits and the listed punctuation) becomes a space
    $content = self::safePregReplace('#[^\\-\\p{L}0-9^_.,()";?!…:—]+#iu', ' ', $content);
    $content = mb_strtolower($content);

    // Replace decimal separators in stand-alone numbers: ',' -> '.'
    $content = self::safePregReplace('#(?<=^|\\s)(\\-?\\d+),(\\d+)(?=\\s|$)#u', '\\1.\\2', $content);

    // Separate special chars at the beginning of the word.
    // One pass detaches a single leading char per word, so repeat until nothing changes.
    while (true) {
        $content = self::safePregReplace('#(?:^|\\s)\K([—^()"?:!])(?=[^\s])#u', '\\1 ', $content, -1, $count);
        if ($count === 0 || $content === '') {
            break;
        }
    }

    // Separate special chars at the end of the word (same repeat-until-stable approach)
    while (true) {
        $content = self::safePregReplace('#(?<=[^\s])([—^()"?:!])(?=\\s|$)#u', ' \\1', $content, -1, $count);
        if ($count === 0 || $content === '') {
            break;
        }
    }

    // Separate groups of commas into tokens of their own
    $content = self::safePregReplace('#(,+)#u', ' \\1 ', $content);

    $words = preg_split('#\\s+#', $content);
    foreach ($words as $k => &$v) {
        // Replace 'ё' inside words; a stand-alone 'ё' is kept as is
        if ($v !== 'ё' && false !== strpos($v, 'ё')) {
            $v = str_replace('ё', 'е', $v);
        }

        // Empty tokens and tokens without any letter or digit (pure punctuation) are left untouched
        if ($v === '' || !preg_match('#[\\p{L}\\d]#u', $v)) {
            continue;
        }

        // Strip trailing delimiter characters listed in StringHelper::WORD_COMPONENT_DELIMITERS
        $trimmed = rtrim($v, StringHelper::WORD_COMPONENT_DELIMITERS);
        if ($trimmed === '') {
            unset($words[$k]);
            continue;
        }

        $v = $trimmed;
    }
    // Break the reference left by the by-reference loop
    unset($v);

    $words = array_unique($words);

    // Takes $words by reference; the visible part of the helper drops empty and over-long (> 100 chars) entries
    StringHelper::removeLongWords($words);

    // Fix keys
    // $words = array_values($words); // <- moved to helper

    if (\count($words) > self::MAX_WORDS) {
        $words = \array_slice($words, 0, self::MAX_WORDS);
    }

    return $words;
}
private static function normalizeUtf8(string $value): string 202 | { 203 | if ($value === '') { 204 | return ''; 205 | } 206 | 207 | if (mb_check_encoding($value, 'UTF-8')) { 208 | return $value; 209 | } 210 | 211 | $previousSubstitute = mb_substitute_character(); 212 | mb_substitute_character('none'); 213 | $converted = mb_convert_encoding($value, 'UTF-8', 'UTF-8'); 214 | mb_substitute_character($previousSubstitute); 215 | 216 | if ($converted === false) { 217 | return ''; 218 | } 219 | 220 | return $converted; 221 | } 222 | 223 | private static function safePregReplace(string $pattern, string $replacement, string $subject, int $limit = -1, ?int &$count = null): string 224 | { 225 | $result = preg_replace($pattern, $replacement, $subject, $limit, $count); 226 | 227 | return $result ?? ''; 228 | } 229 | } 230 | -------------------------------------------------------------------------------- /src/S2/Rose/Helper/StringHelper.php: -------------------------------------------------------------------------------- 1 | $word) { 27 | $len = mb_strlen($word); 28 | 29 | if ($len > 100 || $len === 0) { 30 | unset($words[$k]); 31 | $removed = true; 32 | } 33 | } 34 | if ($removed) { 35 | $words = array_values($words); 36 | } 37 | } 38 | 39 | /** 40 | * @return string[] 41 | */ 42 | public static function sentencesFromText(string $text, bool $hasFormatting): array 43 | { 44 | $text2 = preg_replace('#(\p{Lu}\p{L}*\.?)\s+(\p{Lu}\p{L}?\.)\s+(\p{Lu})#u', "\\1�\\2�\\3", $text); 45 | $text2 = preg_replace('#(\p{Lu}\p{L}?\.)(\p{Lu}\p{L}?\.)\s+(\p{Lu})#u', "\\1\\2�\\3", $text2); 46 | $text2 = preg_replace('#\s\K(Mr.|Dr.)\s(?=\p{Lu}\p{L}?)#u', "\\1�\\3", $text2); 47 | 48 | $substrings = preg_split('#(?:\.|[?!][»"]?)\K([ \n\t\r]+)(?=(?:[\p{Pd}-]\s)?[^\p{Ll}])#Su', $text2); 49 | 50 | $substrings = str_replace("�", ' ', $substrings); 51 | 52 | if ($hasFormatting) { 53 | // We keep the formatting scope through several sentences. 54 | // 55 | // For example, consider the input: 'Sentence 1. 
Sentence 2. Sentence 3.' 56 | // After processing, it becomes ['Sentence 1.', 'Sentence 2.', 'Sentence 3.']. 57 | $tagsFromPrevSentence = []; 58 | array_walk($substrings, static function (string &$text) use (&$tagsFromPrevSentence) { 59 | foreach (array_reverse($tagsFromPrevSentence) as $possibleTag => $num) { 60 | if ($num > 0) { 61 | $text = str_repeat('\\' . $possibleTag, $num) . $text; 62 | $tagsFromPrevSentence[$possibleTag] = 0; 63 | } 64 | } 65 | $text = self::fixUnbalancedInternalFormatting($text, $tagsFromPrevSentence); 66 | }); 67 | } 68 | 69 | return $substrings; 70 | } 71 |
    /**
     * Splits a code fragment into "sentences": one entry per run of line breaks,
     * each entry trimmed of surrounding whitespace.
     *
     * @return string[] Empty list when the text cannot be split (PCRE failure,
     *                  e.g. malformed UTF-8 with the "u" modifier).
     */
    public static function sentencesFromCode(string $text): array
    {
        $substrings = preg_split('#(\r?\n\r?){1,}#Su', $text);
        if ($substrings === false) {
            return [];
        }

        // array_walk() ignores the callback's return value, so walking with 'trim'
        // left every entry untouched. array_map() actually applies the trimming.
        return array_map('trim', $substrings);
    }

    /**
     * Converts the internal formatting markers (a backslash followed by a formatting
     * symbol; lowercase opens, uppercase closes) into HTML tags.
     * An escaped backslash ("\\") becomes a single backslash.
     *
     * NOTE(review): the replacement tags were empty strings in the reviewed text, which made
     * this method identical to clearInternalFormatting(). The tags below are the conventional
     * HTML elements for bold/italic/subscript/superscript — confirm against the upstream file.
     */
    public static function convertInternalFormattingToHtml(string $text): string
    {
        return strtr($text, [
            '\\\\'                               => '\\',
            '\\' . self::BOLD                    => '<b>',
            '\\' . strtoupper(self::BOLD)        => '</b>',
            '\\' . self::ITALIC                  => '<i>',
            '\\' . strtoupper(self::ITALIC)      => '</i>',
            '\\' . self::SUBSCRIPT               => '<sub>',
            '\\' . strtoupper(self::SUBSCRIPT)   => '</sub>',
            '\\' . self::SUPERSCRIPT             => '<sup>',
            '\\' . strtoupper(self::SUPERSCRIPT) => '</sup>',
        ]);
    }

    /**
     * Removes the internal formatting markers from the text.
     * An escaped backslash ("\\") becomes a single backslash.
     */
    public static function clearInternalFormatting(string $text): string
    {
        return strtr($text, [
            '\\\\'                               => '\\',
            '\\' . self::BOLD                    => '',
            '\\' . strtoupper(self::BOLD)        => '',
            '\\' . self::ITALIC                  => '',
            '\\' . strtoupper(self::ITALIC)      => '',
            '\\' . self::SUBSCRIPT               => '',
            '\\' . strtoupper(self::SUBSCRIPT)   => '',
            '\\' . self::SUPERSCRIPT             => '',
            '\\' . strtoupper(self::SUPERSCRIPT) => '',
        ]);
    }

    /**
     * Balances the formatting markers of a text fragment: missing opening markers are
     * prepended, missing closing markers are appended.
     *
     * @Note: This approach with counting formatting symbols gives wrong results for the same nested tags.
     * For example, for '\i 1 \b 2 \i 3' it returns '\i 1 \b 2 \i 3 \B\I\I', however '\i 1 \b 2 \i 3\I\B\I' is expected.
     * It's ok since nesting a formatting tag inside the same tag does not make a lot of sense.
     *
     * @param string $text    Text with internal formatting markers.
     * @param array  $tagsNum Balance per lowercase formatting symbol (opened minus closed).
     *                        Updated in place with the markers found in $text, so the caller
     *                        can carry the state over to the next fragment.
     */
    public static function fixUnbalancedInternalFormatting(string $text, array &$tagsNum): string
    {
        // Match formatting symbols preceded by an unescaped backslash; pairs of backslashes are skipped
        preg_match_all('#\\\\(?:\\\\(*SKIP)\\\\)*\K[' . self::FORMATTING_SYMBOLS . ']#i', $text, $matches);

        foreach ($matches[0] as $match) {
            $lowerMatch = strtolower($match);
            // Lowercase symbol opens (+1), uppercase closes (-1)
            $tagsNum[$lowerMatch] = ($tagsNum[$lowerMatch] ?? 0) + ($match === $lowerMatch ? 1 : -1);
        }

        $result = $text;

        // More closings than openings: add the missing opening markers in front
        foreach ($tagsNum as $possibleTag => $num) {
            if ($num < 0) {
                $result = str_repeat('\\' . $possibleTag, -$num) . $result;
            }
        }
        // More openings than closings: close them at the end, last seen tag first
        foreach (array_reverse($tagsNum) as $possibleTag => $num) {
            if ($num > 0) {
                $result .= str_repeat('\\' . strtoupper($possibleTag), $num);
            }
        }

        return $result;
    }

143 | /** 144 | * @return array{0: array, 1: array} 145 | */ 146 | public static function getUnbalancedInternalFormatting(string $text): array 147 | { 148 | preg_match_all('#\\\\(?:\\\\(*SKIP)\\\\)*\K[' . self::FORMATTING_SYMBOLS . 
']#i', $text, $matches); 149 | 150 | $openStack = []; 151 | $closeStack = []; 152 | 153 | foreach ($matches[0] as $match) { 154 | $lowerMatch = strtolower($match); 155 | if ($match === $lowerMatch) { 156 | $openStack[] = $match; 157 | continue; 158 | } 159 | 160 | $found = false; 161 | for ($i = \count($openStack); $i--;) { 162 | if ($openStack[$i] === $lowerMatch) { 163 | array_splice($openStack, $i, 1); 164 | $found = true; 165 | break; 166 | } 167 | } 168 | if (!$found) { 169 | $closeStack[] = $match; 170 | } 171 | } 172 | 173 | return [$openStack, $closeStack]; 174 | } 175 | } 176 | -------------------------------------------------------------------------------- /tests/unit/Rose/Entity/SnippetTest.php: -------------------------------------------------------------------------------- 1 | %s', 41 | SnippetLine::createFromSnippetSourceWithoutFoundWords(new SnippetSource('introduction', SnippetSource::FORMAT_PLAIN_TEXT, 0, 0)) 42 | ); 43 | $snippet 44 | ->attachSnippetLine(1, 7, $snippetLine1) 45 | ->attachSnippetLine(8, 10, $snippetLine2) 46 | ; 47 | 48 | $this->assertEquals( 49 | 'Testing string to highlight some test values. 
Test is case-sensitive.', 50 | $snippet->toString() 51 | ); 52 | } 53 | 54 | public function testSnippet2() 55 | { 56 | $data = [ 57 | [ 58 | 2, 59 | 13, 60 | 'Тут есть тонкость - нужно проверить, как происходит экранировка в сущностях вроде +.', 61 | ['сущност'], 62 | ], 63 | [ 64 | 14, 65 | 23, 66 | 'Для этого нужно включить в текст само сочетание букв "plus".', 67 | ['plus'], 68 | ], 69 | ]; 70 | 71 | $snippet = new Snippet( 72 | '%s', 73 | SnippetLine::createFromSnippetSourceWithoutFoundWords(new SnippetSource('introduction', SnippetSource::FORMAT_PLAIN_TEXT, 0, 1)) 74 | ); 75 | 76 | foreach ($data as $row) { 77 | $snippet->attachSnippetLine($row[0], $row[1], new SnippetLine($row[2], SnippetSource::FORMAT_PLAIN_TEXT, new PorterStemmerRussian(), $row[3], \count($row[3]))); 78 | } 79 | 80 | $this->assertEquals( 81 | 'Тут есть тонкость - нужно проверить, как происходит экранировка в сущностях вроде &plus;. Для этого нужно включить в текст само сочетание букв "plus".', 82 | $snippet->toString() 83 | ); 84 | } 85 | 86 | public function testSnippetsUnique() 87 | { 88 | $stemmer = new PorterStemmerEnglish(); 89 | $snippet = new Snippet('%s', new SnippetLine('introduction', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, [], 0.0)); 90 | $snippet 91 | ->attachSnippetLine(0, 3, new SnippetLine('Try to test 1.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) 92 | ->attachSnippetLine(4, 7, new SnippetLine('Try to test 1.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) 93 | ->attachSnippetLine(8, 11, new SnippetLine('Try to test 1.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) 94 | ->attachSnippetLine(12, 15, new SnippetLine('Try to test 1.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) 95 | ->attachSnippetLine(16, 19, new SnippetLine('Try to test 2.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) 96 | ->attachSnippetLine(20, 23, new SnippetLine('Try to test 2.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 
1)) 97 | ->attachSnippetLine(24, 27, new SnippetLine('Try to test 2.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) 98 | ; 99 | 100 | $this->assertEquals( 101 | 'Try to test 1... Try to test 2.', 102 | $snippet->toString() 103 | ); 104 | 105 | $snippet = new Snippet('%s', new SnippetLine('introduction', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, [], 0.0)); 106 | $snippet 107 | ->attachSnippetLine(0 * 4, 0 * 4 + 3, new SnippetLine('Try to test 1.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) 108 | ->attachSnippetLine(1 * 4, 1 * 4 + 3, new SnippetLine('Try to test 1.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) 109 | ->attachSnippetLine(2 * 4, 2 * 4 + 3, new SnippetLine('Try to test 1.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) 110 | ->attachSnippetLine(3 * 4, 3 * 4 + 3, new SnippetLine('Try to test 1.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) 111 | ->attachSnippetLine(4 * 4, 4 * 4 + 3, new SnippetLine('Try to test 2.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) 112 | ->attachSnippetLine(5 * 4, 5 * 4 + 3, new SnippetLine('Try to test 2.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) 113 | ->attachSnippetLine(6 * 4, 6 * 4 + 3, new SnippetLine('Try to test 2.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) 114 | ->attachSnippetLine(7 * 4, 7 * 4 + 3, new SnippetLine('Try to test 3.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) 115 | ->attachSnippetLine(8 * 4, 8 * 4 + 3, new SnippetLine('Try to test 3.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) 116 | ->attachSnippetLine(9 * 4, 9 * 4 + 3, new SnippetLine('Try to test 3.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) 117 | ->attachSnippetLine(10 * 4, 10 * 4 + 3, new SnippetLine('Try to test 4.', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], 1)) 118 | ->attachSnippetLine(11 * 4, 11 * 4 + 3, new SnippetLine('Try to test 4.', SnippetSource::FORMAT_PLAIN_TEXT, 
$stemmer, ['test'], 2)) 119 | ; 120 | 121 | $this->assertEquals( 122 | 'Try to test 1... Try to test 2... Try to test 4.', 123 | $snippet->toString() 124 | ); 125 | } 126 | 127 | public function testEmptySnippet() 128 | { 129 | $stemmer = new PorterStemmerEnglish(); 130 | $snippet = new Snippet('%s', new SnippetLine('introduction', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, [], 0.0)); 131 | $snippet->toString(); 132 | 133 | $snippet = new Snippet('%s', new SnippetLine('introduction', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, [], 0.0)); 134 | $snippet->attachSnippetLine(1, 1, new SnippetLine('line1', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, [], 0)); 135 | $snippet->toString(); 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /src/S2/Rose/Entity/FulltextResult.php: -------------------------------------------------------------------------------- 1 | query = $query; 21 | $this->fulltextIndexContent = $fulltextIndexContent; 22 | $this->tocSize = $tocSize; 23 | } 24 | 25 | /** 26 | * https://i.upmath.me/svg/%5Cbegin%7Btikzpicture%7D%5Bscale%3D1.0544%5D%5Csmall%0A%5Cbegin%7Baxis%7D%5Baxis%20line%20style%3Dgray%2C%0A%09samples%3D100%2C%0A%09xmin%3D-1.2%2C%20xmax%3D1.2%2C%0A%09ymin%3D0%2C%20ymax%3D1.1%2C%0A%09restrict%20y%20to%20domain%3D-0.1%3A1%2C%0A%09ytick%3D%7B1%7D%2C%0A%09xtick%3D%7B-1%2C1%7D%2C%0A%09axis%20equal%2C%0A%09axis%20x%20line%3Dcenter%2C%0A%09axis%20y%20line%3Dcenter%2C%0A%09xlabel%3D%24x%24%2Cylabel%3D%24y%24%5D%0A%5Caddplot%5Bred%2Cdomain%3D-2%3A1%2Csemithick%5D%7Bexp(-(x%2F0.38)%5E2)%7D%3B%0A%5Caddplot%5Bred%5D%20coordinates%20%7B(0.8%2C0.6)%7D%20node%7B%24y%3De%5E%7B-%5Cleft(x%2F0.38%5Cright)%5E2%7D%24%7D%3B%0A%5Cpath%20(axis%20cs%3A0%2C0)%20node%20%5Banchor%3Dnorth%20west%2Cyshift%3D-0.07cm%5D%20%7B0%7D%3B%0A%5Cend%7Baxis%7D%0A%5Cend%7Btikzpicture%7D 27 | */ 28 | public static function frequencyReduction(int $tocSize, int $foundTocEntriesNum): float 29 | { 30 | if ($tocSize < 5) { 31 | return 
1; 32 | } 33 | 34 | return exp(-(($foundTocEntriesNum / $tocSize) / 0.38) ** 2); 35 | } 36 |
    /**
     * Weight multiplier for a word that occurs several times in the indexed item:
     * grows by 0.5 per extra occurrence and is capped at 4.
     */
    protected static function repeatWeightRatio(int $repeatNum): float
    {
        $ratio = 0.5 * ($repeatNum - 1) + 1;

        return $ratio < 4 ? $ratio : 4.0;
    }

    /**
     * Weight multiplier depending on the entry size (entries of some middle size are preferred).
     * Entries shorter than 10 words always get the neutral ratio 1.
     *
     * https://i.upmath.me/g/%5Cbegin%7Btikzpicture%7D%5Bscale%3D1.0544%5D%5Csmall%0A%5Cbegin%7Baxis%7D%5Baxis%20line%20style%3Dgray%2C%0A%09samples%3D100%2C%0A%09ymin%3D0%2C%20ymax%3D5%2C%0A%09xmin%3D0%2C%20xmax%3D1100%2C%0A%09ytick%3D%7B1%2C2%7D%2C%0A%09xtick%3D%7B50%2C200%2C500%2C1000%7D%2C%0A%09axis%20x%20line%3Dcenter%2C%0A%09axis%20y%20line%3Dcenter%2C%0A%09xlabel%3D%24x%24%2Cylabel%3D%24y%24%5D%0A%5Caddplot%5Bred%2Cdomain%3D0%3A1000%2Csemithick%5D%7B1%2F(1%2Bexp((sqrt(x)-18)%5E2%2F60))%2B1%7D%3B%0A%5Caddplot%5Bblue%2Cdomain%3D0%3A1000%2Csemithick%5D%7B1%7D%3B%0A%5Caddplot%5Bred%5D%20coordinates%20%7B(600%2C3)%7D%20node%7B%24y%3D1%2F(1%2Bexp((sqrt(x)-18)%5E2%2F60))%2B1%24%7D%3B%0A%5Cend%7Baxis%7D%0A%5Cend%7Btikzpicture%7D
     */
    protected static function entrySizeWeightRatio(int $totalWordsNum): float
    {
        if ($totalWordsNum < 10) {
            return 1;
        }

        $offset = sqrt($totalWordsNum) - 18;

        return 1.0 + 1.0 / (1.0 + exp($offset ** 2 / 60.0));
    }

55 | /** 56 | * Weight ratio for a pair of words. Accepts the difference of distances 57 | * in the indexed item and the search query. 
58 | * 59 | * @param float $distance 60 | * 61 | * @return float 62 | */ 63 | protected static function neighbourWeight(float $distance): float 64 | { 65 | return 30.0 / (1 + pow($distance / 7.0, 2)); 66 | } 67 | 68 | /** 69 | * @throws ImmutableException 70 | */ 71 | public function fillResultSet(ResultSet $resultSet): void 72 | { 73 | $wordReductionRatios = []; 74 | foreach ($this->fulltextIndexContent->toArray() as $word => $indexedItems) { 75 | $reductionRatio = self::frequencyReduction($this->tocSize, \count($indexedItems)); 76 | $wordReductionRatios[$word] = $reductionRatio; 77 | 78 | foreach ($indexedItems as $positionBag) { 79 | $externalId = $positionBag->getExternalId(); 80 | $contentPositionsNum = \count($positionBag->getContentPositions()); 81 | 82 | if ($contentPositionsNum > 0) { 83 | $weights = [ 84 | 'abundance_reduction' => $reductionRatio, 85 | 'repeat_multiply' => self::repeatWeightRatio($contentPositionsNum), 86 | 'entry_size' => self::entrySizeWeightRatio($positionBag->getWordCount()), 87 | ]; 88 | if ($positionBag->hasExternalRelevanceRatio()) { 89 | $weights['external_ratio'] = $positionBag->getExternalRelevanceRatio(); 90 | } 91 | $resultSet->addWordWeight($word, $externalId, $weights, $positionBag->getContentPositions()); 92 | } 93 | 94 | if (\count($positionBag->getKeywordPositions()) > 0) { 95 | $weights = [ 96 | 'keyword' => 10, 97 | 'abundance_reduction' => $reductionRatio, 98 | ]; 99 | if ($positionBag->hasExternalRelevanceRatio()) { 100 | $weights['external_ratio'] = $positionBag->getExternalRelevanceRatio(); 101 | } 102 | $resultSet->addWordWeight($word, $externalId, $weights); 103 | } 104 | 105 | if (\count($positionBag->getTitlePositions()) > 0) { 106 | $weights = [ 107 | 'title' => 25, 108 | 'abundance_reduction' => $reductionRatio, 109 | ]; 110 | if ($positionBag->hasExternalRelevanceRatio()) { 111 | $weights['external_ratio'] = $positionBag->getExternalRelevanceRatio(); 112 | } 113 | $resultSet->addWordWeight($word, $externalId, 
$weights); 114 | } 115 | } 116 | } 117 | 118 | $referenceContainer = $this->query->toWordPositionContainer(); 119 | 120 | $this->fulltextIndexContent->iterateContentWordPositions( 121 | static function (ExternalId $id, WordPositionContainer $container) use ($referenceContainer, $wordReductionRatios, $resultSet) { 122 | $pairsDistance = $container->compareWith($referenceContainer); 123 | foreach ($pairsDistance as $pairDistance) { 124 | [$word1, $word2, $distance] = $pairDistance; 125 | $weight = self::neighbourWeight($distance); 126 | if (isset($wordReductionRatios[$word1])) { 127 | $weight *= $wordReductionRatios[$word1]; 128 | } 129 | if (isset($wordReductionRatios[$word2])) { 130 | $weight *= $wordReductionRatios[$word2]; 131 | } 132 | $resultSet->addNeighbourWeight($word1, $word2, $id, $weight, $distance); 133 | } 134 | } 135 | ); 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /tests/unit/Rose/FinderTest.php: -------------------------------------------------------------------------------- 1 | static function () { 42 | return 30; 43 | }, 44 | 'fulltextResultByWords' => static function (array $words) { 45 | $result = new FulltextIndexContent(); 46 | foreach ($words as $k => $word) { 47 | if ($word === 'find') { 48 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_3'), [], [], [1], 0, 1.0)); 49 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_2'), [], [1], [10, 20], 0, 1.0)); 50 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_1'), [1], [], [], 0, 1.0)); 51 | } 52 | if ($word === 'and') { 53 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_1'), [], [], [4, 8], 0, 1.0)); 54 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_2'), [], [], [7, 11, 34], 0, 1.0)); 55 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_3'), [], [], [28, 65], 0, 1.0)); 56 | $result->add($word, new 
FulltextIndexPositionBag(new ExternalId('id_4'), [], [], [45, 9], 0, 1.0)); 57 | 58 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_5'), [], [], [1], 0, 1.0)); 59 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_6'), [], [], [1], 0, 1.0)); 60 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_7'), [], [], [1], 0, 1.0)); 61 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_8'), [], [], [1], 0, 1.0)); 62 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_9'), [], [], [1], 0, 1.0)); 63 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_10'), [], [], [1], 0, 1.0)); 64 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_11'), [], [], [1], 0, 1.0)); 65 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_12'), [], [], [1], 0, 1.0)); 66 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_13'), [], [], [1], 0, 1.0)); 67 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_14'), [], [], [1], 0, 1.0)); 68 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_15'), [], [], [1], 0, 1.0)); 69 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_16'), [], [], [1], 0, 1.0)); 70 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_17'), [], [], [1], 0, 1.0)); 71 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_18'), [], [], [1], 0, 1.0)); 72 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_19'), [], [], [1], 0, 1.0)); 73 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_20'), [], [], [1], 0, 1.0)); 74 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_21'), [], [], [1], 0, 1.0)); 75 | } 76 | if ($word === 'replace') { 77 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_2'), [], [], [12], 0, 1.0)); 78 | $result->add($word, new 
FulltextIndexPositionBag(new ExternalId('id_1'), [1], [], [], 0, 1.0)); 79 | } 80 | 81 | unset($words[$k]); 82 | } 83 | 84 | if (!empty($words)) { 85 | throw new \RuntimeException(sprintf('Unknown words "%s" in StorageReadInterface stub.', implode(',', $words))); 86 | } 87 | 88 | return $result; 89 | }, 90 | 'getTocByExternalIds' => static function (ExternalIdCollection $ids) { 91 | return array_map(static function (ExternalId $id) { 92 | return new TocEntryWithMetadata( 93 | new TocEntry('Title ' . $id->getId(), '', null, 'url_' . $id->getId(), 1, 'hash_' . $id->getId()), 94 | $id, 95 | new ImgCollection() 96 | ); 97 | }, $ids->toArray()); 98 | }, 99 | 'getSnippets' => function (SnippetQuery $snippetQuery) use (&$storedSnippetQuery): SnippetResult { 100 | $storedSnippetQuery = $snippetQuery; 101 | return new SnippetResult(); 102 | } 103 | ]); 104 | 105 | $stemmer = new PorterStemmerRussian(); 106 | $finder = new Finder($storage, $stemmer); 107 | $resultSet = $finder->find(new Query('find and replace')); 108 | 109 | $items = $resultSet->getItems(); 110 | $this->assertCount(21, $items); 111 | 112 | $weights = $resultSet->getFoundWordPositionsByExternalId(); 113 | $this->assertCount(21, $weights); 114 | $this->assertEquals([], $weights[':id_1']['find']); 115 | $this->assertEquals([], $weights[':id_1']['replace']); 116 | $this->assertEquals([10, 20], $weights[':id_2']['find']); 117 | $this->assertEquals([12], $weights[':id_2']['replace']); 118 | $this->assertEquals([1], $weights[':id_3']['find']); 119 | 120 | $query2 = new Query('find and replace'); 121 | $query2->setLimit(10); 122 | $resultSet2 = $finder->find($query2); 123 | $this->assertCount(10, $resultSet2->getItems()); 124 | $this->assertCount(10, $storedSnippetQuery->getExternalIds()); 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /src/S2/Rose/Indexer.php: -------------------------------------------------------------------------------- 1 | storage = 
$storage; 43 | $this->stemmer = $stemmer; 44 | $this->extractor = $extractor ?? DefaultExtractorFactory::create(); 45 | $this->logger = $logger; 46 | } 47 |
    /**
     * Cleaning up an HTML string: lower-cases it, drops HTML entities and replaces every
     * run of characters that cannot be part of a word with a single space.
     *
     * @param string $content        HTML fragment (a title or a keyword list).
     * @param string $allowedSymbols Extra characters to keep; inserted as is into a regex
     *                               character class, so it must already be escaped for that context.
     *
     * @return string Cleaned text, always terminated by a space.
     */
    public static function titleStrFromHtml(string $content, string $allowedSymbols = ''): string
    {
        $content = mb_strtolower($content);

        // Non-breaking spaces (entity and raw UTF-8 form) are word separators. They must be
        // converted before the remaining entities are dropped, otherwise "a&nbsp;b" becomes "ab".
        $content = str_replace(['&nbsp;', "\xc2\xa0"], ' ', $content);
        // preg_replace() returns null on a PCRE error; fall back to an empty string
        $content = preg_replace('#&[^;]{1,20};#', '', $content) ?? '';

        // We allow letters, digits and some punctuation: ".,-"
        $content = preg_replace('#[^\\-.,0-9\\p{L}^_' . $allowedSymbols . ']+#u', ' ', $content) ?? '';

        // These punctuation characters are meant to be inside words and numbers.
        // We'll remove trailing characters when splitting the words.
        $content .= ' ';

        return $content;
    }

    /**
     * Splits a cleaned string into words. Punctuation (".,-") directly in front of a space
     * is swallowed by the separator; empty and overlong words are removed.
     *
     * @return string[]
     */
    protected static function arrayFromStr(string $contents): array
    {
        $words = preg_split('#[\\-.,]*?[ ]+#S', $contents);
        if ($words === false) {
            // PCRE failure: there is nothing to index
            return [];
        }
        StringHelper::removeLongWords($words);

        return $words;
    }

    /**
     * Writes metadata, snippets and the fulltext index records of one item to the storage.
     *
     * @param string $title    Title already processed by titleStrFromHtml().
     * @param string $keywords Keywords already processed by titleStrFromHtml().
     */
    protected function addToIndex(ExternalId $externalId, string $title, ContentWithMetadata $content, string $keywords): void
    {
        $sentenceCollection = $content->getSentenceMap()->toSentenceCollection();
        $contentWordsArray  = $sentenceCollection->getWordsArray();

        // Excluded words are removed without re-indexing: the remaining keys stay the word positions
        foreach ($contentWordsArray as $i => $word) {
            if ($this->storage->isExcludedWord($word)) {
                unset($contentWordsArray[$i]);
            }
        }

        $titleWordsArray = self::arrayFromStr($title);
        $keywordsArray   = self::arrayFromStr($keywords);

        $this->storage->addMetadata($externalId, \count($titleWordsArray) + \count($contentWordsArray), $content->getImageCollection());
        $this->storage->addSnippets($externalId, ...$sentenceCollection->getSnippetSources());
        $this->storage->addToFulltextIndex(
            $this->getStemsWithComponents($titleWordsArray),
            $this->getStemsWithComponents($keywordsArray), // TODO consider different semantics of space and comma?
            $this->getStemsWithComponents($contentWordsArray),
            $externalId
        );
    }

    /**
     * Removes an item from the fulltext index and from the table of contents.
     */
    public function removeById(string $id, ?int $instanceId): void
    {
        $externalId = new ExternalId($id, $instanceId);
        $this->storage->removeFromIndex($externalId);
        $this->storage->removeFromToc($externalId);
    }

    /**
     * Indexes an item. If the storage reports an empty index and auto-erase is enabled
     * (and supported by the storage), the storage is erased and indexing is retried once.
     *
     * @throws RuntimeException
     * @throws UnknownException
     */
    public function index(Indexable $indexable): void
    {
        try {
            $this->doIndex($indexable);
        } catch (EmptyIndexException $e) {
            if (!$this->autoErase || !$this->storage instanceof StorageEraseInterface) {
                throw $e;
            }

            $this->storage->erase();
            $this->doIndex($indexable);
        }
    }

    /**
     * Enables or disables the erase-and-retry behaviour of index().
     */
    public function setAutoErase(bool $autoErase): void
    {
        $this->autoErase = $autoErase;
    }

    /**
     * Updates the TOC entry and, if the item is new or its hash has changed, rebuilds its
     * index records. Runs inside a transaction when the storage supports transactions.
     *
     * @throws RuntimeException
     * @throws UnknownException When an exception that is not a library RuntimeException occurs.
     */
    protected function doIndex(Indexable $indexable): void
    {
        if ($this->storage instanceof TransactionalStorageInterface) {
            $this->storage->startTransaction();
        }

        try {
            $externalId  = $indexable->getExternalId();
            $oldTocEntry = $this->storage->getTocByExternalId($externalId);

            $this->storage->addEntryToToc($indexable->toTocEntry(), $externalId);

            if ($oldTocEntry === null || $oldTocEntry->getHash() !== $indexable->calcHash()) {
                $this->storage->removeFromIndex($externalId);

                $extractionResult = $this->extractor->extract($indexable->getContent());
                $extractionErrors = $extractionResult->getErrors();
                if ($this->logger && $extractionErrors->hasErrors()) {
                    $this->logger->warning(sprintf(
                        'Found warnings on indexing "%s" (id="%s", instance="%s", url="%s")',
                        $indexable->getTitle(),
                        $indexable->getExternalId()->getId(),
                        $indexable->getExternalId()->getInstanceId() ?? '',
                        $indexable->getUrl()
                    ), $extractionErrors->getFormattedLines());
                }

                // strtolower in titleStrFromHtml is important
                $this->addToIndex(
                    $externalId,
                    self::titleStrFromHtml($indexable->getTitle()),
                    $extractionResult->getContentWithMetadata(),
                    self::titleStrFromHtml($indexable->getKeywords())
                );
            }

            if ($this->storage instanceof TransactionalStorageInterface) {
                $this->storage->commitTransaction();
            }
        } catch (\Exception $e) {
            if ($this->storage instanceof TransactionalStorageInterface) {
                $this->storage->rollbackTransaction();
            }
            if (!($e instanceof RuntimeException)) {
                throw new UnknownException('Unknown exception occurred while indexing.', 0, $e);
            }
            throw $e;
        }
    }

    /**
     * Replaces words with stems. Also, this method detects compound words and adds the component stems to the result.
     *
     * The keys in the result arrays are the positions of the word. For compound words a string representation
     * of a float is used to map one index to several words. For example, for input
     *
     * [10 => 'well-known', 11 => 'facts']
     *
     * this method returns
     *
     * [10 => 'well-known', 11 => 'fact', '10.001' => 'well', '10.002' => 'known']
     *
     * @param array $words Words keyed by position; the keys may have gaps.
     * @return array Stems keyed by position, followed by the component stems.
     */
    private function getStemsWithComponents(array $words): array
    {
        $componentsOfCompoundWords = [];
        foreach ($words as $i => &$word) {
            $stemmedWord = $this->stemmer->stemWord($word, false);

            // If the word contains punctuation marks like hyphen, add a variant without it
            if (false !== strpbrk($stemmedWord, StringHelper::WORD_COMPONENT_DELIMITERS)) {
                foreach (preg_split('#(?<=[\p{L}\d])[\-.,]+|[\-.,]++(?=[\p{L}\d])#u', $word) as $k => $subWord) {
                    if ($subWord !== '' && $subWord !== $word) {
                        $componentsOfCompoundWords[(string)($i + 0.001 * ($k + 1))] = $this->stemmer->stemWord($subWord, false);
                    }
                }
            }

            $word = $stemmedWord;
        }
        unset($word);

        // The union operator keeps the integer position keys. array_merge() would renumber them
        // from zero, shifting every position behind a gap (e.g. behind a removed excluded word).
        return $words + $componentsOfCompoundWords;
    }
}
-------------------------------------------------------------------------------- /src/S2/Rose/Storage/ArrayStorage.php: -------------------------------------------------------------------------------- 1 | fulltextProxy->getByWord($word); 45 | foreach ($data as $id => $positionsByType) { 46 | $externalId = $this->externalIdFromInternalId($id); 47 | if ($externalId === null) { 48 | continue; 49 | } 50 | if ($instanceId === null || $externalId->getInstanceId() === $instanceId) { 51 | $serializedExtId = $externalId->toString(); 52 | $result->add($word, new FulltextIndexPositionBag( 53 | $externalId, 54 | $positionsByType[FulltextProxyInterface::TYPE_TITLE] ?? [], 55 | $positionsByType[FulltextProxyInterface::TYPE_KEYWORD] ?? [], 56 | $positionsByType[FulltextProxyInterface::TYPE_CONTENT] ?? [], 57 | isset($this->metadata[$id]) ? 
$this->metadata[$id]['wordCount'] : 0, 58 | isset($this->toc[$serializedExtId]) ? $this->toc[$serializedExtId]->getRelevanceRatio() : 1.0 59 | )); 60 | } 61 | } 62 | } 63 | 64 | return $result; 65 | } 66 |
    /**
     * {@inheritdoc}
     *
     * For every requested item the first two stored snippet sources are always attached;
     * further sources are attached only if they cover one of the requested positions.
     *
     * @throws UnknownIdException
     */
    public function getSnippets(SnippetQuery $snippetQuery): SnippetResult
    {
        $result = new SnippetResult();
        $snippetQuery->iterate(function (ExternalId $externalId, array $positions) use ($result) {
            $fallbackCount = 0;
            foreach ($this->metadata[$this->internalIdFromExternalId($externalId)]['snippets'] ?? [] as $snippetSource) {
                if (!$snippetSource instanceof SnippetSource) {
                    throw new LogicException('Snippets must be stored as array of SnippetSource.');
                }
                if ($fallbackCount < 2 || $snippetSource->coversOneOfPositions($positions)) {
                    $result->attach($externalId, $snippetSource);
                    $fallbackCount++;
                }
            }
        });

        return $result;
    }

    /**
     * {@inheritdoc}
     *
     * Array keys are the word positions; they are cast to int before being stored.
     *
     * @throws UnknownIdException
     */
    public function addToFulltextIndex(array $titleWords, array $keywords, array $contentWords, ExternalId $externalId): void
    {
        $id = $this->internalIdFromExternalId($externalId);
        foreach ($titleWords as $position => $word) {
            $this->fulltextProxy->addWord($word, $id, FulltextProxyInterface::TYPE_TITLE, (int)$position);
        }
        foreach ($keywords as $position => $word) {
            $this->fulltextProxy->addWord($word, $id, FulltextProxyInterface::TYPE_KEYWORD, (int)$position);
        }
        foreach ($contentWords as $position => $word) {
            $this->fulltextProxy->addWord($word, $id, FulltextProxyInterface::TYPE_CONTENT, (int)$position);
        }
    }

    /**
     * {@inheritdoc}
     */
    public function isExcludedWord(string $word): bool
    {
        return isset($this->excludedWords[$word]);
    }

    /**
     * Drops frequent words from index and remembers them as excluded words.
     */
    public function cleanup(): void
    {
        $threshold = Finder::fulltextRateExcludeNum(\count($this->toc));

        foreach ($this->fulltextProxy->getFrequentWords($threshold) as $word => $stat) {
            // Drop fulltext frequent or empty items
            $this->fulltextProxy->removeWord($word);
            $this->excludedWords[$word] = 1;
        }
    }

    /**
     * {@inheritdoc}
     *
     * Removes the fulltext records and the metadata (word count, images, snippets) of the item.
     *
     * @throws UnknownIdException
     */
    public function removeFromIndex(ExternalId $externalId): void
    {
        $internalId = $this->internalIdFromExternalId($externalId);

        $this->fulltextProxy->removeById($internalId);

        // Metadata is keyed by internal id (see addMetadata() and addSnippets()), so the whole
        // record goes away at once. Looking for $internalId *inside* each record never matched
        // and left stale snippets behind after a re-index.
        unset($this->metadata[$internalId]);
    }

    /**
     * {@inheritdoc}
     *
     * An already known external id keeps its internal id; a new one gets the highest
     * internal id currently present in the TOC plus one.
     */
    public function addEntryToToc(TocEntry $entry, ExternalId $externalId): void
    {
        try {
            $internalId = $this->internalIdFromExternalId($externalId);
            $this->removeFromToc($externalId);
        } catch (UnknownIdException $e) {
            $internalId = 0;
            foreach ($this->toc as $existingEntry) {
                $internalId = max($internalId, $existingEntry->getInternalId());
            }
            $internalId++;
        }

        $entry->setInternalId($internalId);

        $this->toc[$externalId->toString()] = $entry;
        $this->externalIdMap[$internalId]   = $externalId;
    }

    /**
     * {@inheritdoc}
     * @throws UnknownIdException
     */
    public function addMetadata(ExternalId $externalId, int $wordCount, ImgCollection $imgCollection): void
    {
        $internalId = $this->internalIdFromExternalId($externalId);
        $this->metadata[$internalId]['wordCount'] = $wordCount;
        $this->metadata[$internalId]['images']    = $imgCollection;
    }

    /**
     * Stores the snippet sources of an item. An empty list is ignored,
     * i.e. it does not overwrite snippets stored earlier.
     *
     * @throws UnknownIdException
     */
    public function addSnippets(ExternalId $externalId, SnippetSource ...$snippets): void
    {
        if (\count($snippets) === 0) {
            return;
        }
        $this->metadata[$this->internalIdFromExternalId($externalId)]['snippets'] = $snippets;
    }

    /**
     * {@inheritdoc}
     *
     * Unknown external ids are silently skipped.
     */
    public function getTocByExternalIds(ExternalIdCollection $externalIds): array
    {
        $result = [];
        foreach ($externalIds->toArray() as $externalId) {
            $serializedExtId = $externalId->toString();
            if (isset($this->toc[$serializedExtId])) {
                $result[] = new TocEntryWithMetadata(
                    $this->toc[$serializedExtId],
                    $externalId,
                    $this->metadata[$this->toc[$serializedExtId]->getInternalId()]['images'] ?? new ImgCollection()
                );
            }
        }

        return $result;
    }

    /**
     * {@inheritdoc}
     */
    public function getTocByExternalId(ExternalId $externalId): ?TocEntry
    {
        $serializedExtId = $externalId->toString();

        return $this->toc[$serializedExtId] ?? null;
    }

    /**
     * {@inheritdoc}
     *
     * Does nothing when the external id is not in the TOC.
     */
    public function removeFromToc(ExternalId $externalId): void
    {
        $serializedExtId = $externalId->toString();
        if (!isset($this->toc[$serializedExtId])) {
            return;
        }

        $internalId = $this->toc[$serializedExtId]->getInternalId();
        unset($this->externalIdMap[$internalId], $this->toc[$serializedExtId]);
    }

    /**
     * {@inheritdoc}
     *
     * NOTE(review): $instanceId is not used here, the size of the whole TOC is returned —
     * confirm this is intended for the array storage.
     */
    public function getTocSize(?int $instanceId): int
    {
        return \count($this->toc);
    }

    /**
     * Resolves the internal id assigned to a TOC entry.
     *
     * @throws UnknownIdException When the external id is not in the TOC.
     */
    private function internalIdFromExternalId(ExternalId $externalId): int
    {
        $serializedExtId = $externalId->toString();
        if (!isset($this->toc[$serializedExtId])) {
            throw UnknownIdException::createIndexMissingExternalId($externalId);
        }

        return $this->toc[$serializedExtId]->getInternalId();
    }

257 | private function 
/**
 * Data sets for the sentence splitting test: source text, expected sentences and,
 * for some sets, an optional third boolean element.
 */
public function sentenceDataProvider(): array
{
    // Sample text that is not part of the data sets (presumably a candidate for a future case):
    // "The lecturer asked: «What is the meaning of the course title?» I tried to recall
    // what he had said at the first lecture and to reproduce his words."
    $plainCases = [
        ['One sentence.', ['One sentence.']],
        ['Second sentence. And a third one 123.', ['Second sentence.', 'And a third one 123.']],
        [
            'Текст на русском. И еще предложение. 1, 2, 3 и т. д. Цифры, буквы, и т. п., могут встретиться.',
            [
                'Текст на русском.',
                'И еще предложение.',
                '1, 2, 3 и т. д.',
                'Цифры, буквы, и т. п., могут встретиться.',
            ],
        ],
    ];

    // Sets with internal formatting sequences; these carry the third element.
    $formattedCases = [
        ['Sentence \i1. Sentence 2. Sentence\I 3.', ['Sentence \i1.\I', '\iSentence 2.\I', '\iSentence\I 3.'], true],
        ['Sentence \i1. Sentence 2. Sentence\B 3.', ['Sentence \i1.\I', '\iSentence 2.\I', '\b\iSentence\B 3.\I'], true],
        ['\i\uSentence \b1\B. Sentence 2. Sentence 3.\U\I', ['\i\uSentence \b1\B.\U\I', '\i\uSentence 2.\U\I', '\i\uSentence 3.\U\I'], true],
    ];

    // Punctuation, direct speech, initials and abbreviations.
    $punctuationCases = [
        [
            'Поезд отправился из пункта А в пункт Б. Затем вернулся назад.',
            [
                'Поезд отправился из пункта А в пункт Б.',
                'Затем вернулся назад.',
            ],
        ],
        [
            'Это пример абзаца. Он содержит несколько предложений. Каждое предложение заканчивается точкой! Иногда используется вопросительный знак? И восклицательный знак! Иногда используются многоточия... Но это не всегда так.',
            [
                'Это пример абзаца.',
                'Он содержит несколько предложений.',
                'Каждое предложение заканчивается точкой!',
                'Иногда используется вопросительный знак?',
                'И восклицательный знак!',
                'Иногда используются многоточия...',
                'Но это не всегда так.',
            ],
        ],
        [
            '- Прямая речь тоже разбивается на предложения? – Да, безусловно! — Отлично, то, что нужно. - Пожалуйста.',
            [
                '- Прямая речь тоже разбивается на предложения?',
                '– Да, безусловно!',
                '— Отлично, то, что нужно.',
                '- Пожалуйста.',
            ],
        ],
        [
            '"Прямая речь может быть в другом синтаксисе", - сказал я. Противник добавил: «Как это скучно!» И следом: «Как это так». Такие дела.',
            [
                '"Прямая речь может быть в другом синтаксисе", - сказал я.',
                'Противник добавил: «Как это скучно!»',
                'И следом: «Как это так».',
                'Такие дела.',
            ],
        ],
        [
            'На первом курсе А. П. Петров вел математику. А. П. Петров делал это хорошо. Все радовались А.П. Петрову. А.П. Петров пел математику.',
            [
                'На первом курсе А. П. Петров вел математику.',
                'А. П. Петров делал это хорошо.',
                'Все радовались А.П. Петрову.',
                'А.П. Петров пел математику.',
            ],
        ],
        [
            'Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment.',
            [
                'Last week, former director of the F.B.I. James B. Comey was fired.',
                'Mr. Comey was not available for comment.',
            ],
        ],
        [
            'На первом курсе А. П. Петров (зам. декана), Д. А. Александров (преподаватель физики) и несколько студентов нашего факультета (я в том числе) отправились в Тверь на проведение окружного этапа школьной олимпиады по физике.',
            [
                'На первом курсе А. П. Петров (зам. декана), Д. А. Александров (преподаватель физики) и несколько студентов нашего факультета (я в том числе) отправились в Тверь на проведение окружного этапа школьной олимпиады по физике.',
            ],
        ],
    ];

    // array_merge() on lists keeps the original order and sequential keys.
    return array_merge($plainCases, $formattedCases, $punctuationCases);
}

/**
 * Checks both the fixed text and the tag counters reported through the by-reference argument.
 *
 * @dataProvider unbalancedInternalFormattingDataProvider
 */
public function testFixUnbalancedInternalFormatting(string $text, string $expected, array $expectedTags): void
{
    $tags   = [];
    $actual = StringHelper::fixUnbalancedInternalFormatting($text, $tags);

    $this->assertEquals($expected, $actual);
    $this->assertEquals($expectedTags, $tags);
}

/**
 * Data sets: source text, expected balanced text, expected tag counters.
 */
public function unbalancedInternalFormattingDataProvider(): array
{
    $cases = [];

    $cases[] = [
        '\\iThis is \\bformatted text\\I with \\Bspecial characters\\i.',
        '\\iThis is \\bformatted text\\I with \\Bspecial characters\\i.\\I',
        ['i' => 1, 'b' => 0],
    ];
    $cases[] = [
        'Normal text with escaped formatting symbols like \\\\draw or \\\\inline or \\\\\\\\uuu.',
        'Normal text with escaped formatting symbols like \\\\draw or \\\\inline or \\\\\\\\uuu.',
        [],
    ];
    $cases[] = ['', '', []];
    $cases[] = ['456789i', '456789i', []];

    // An odd number of backslashes before "I" forms a closing tag, an even number is escaped text.
    $cases[] = [
        '456789\\I',
        '\\i456789\\I',
        ['i' => -1],
    ];
    $cases[] = [
        '456789\\\\I',
        '456789\\\\I',
        [],
    ];
    $cases[] = [
        '456789\\\\\\I',
        '\\i456789\\\\\\I',
        ['i' => -1],
    ];
    $cases[] = [
        '456789\\\\\\\\I',
        '456789\\\\\\\\I',
        [],
    ];
    $cases[] = [
        '456789\\\\\\\\\\I',
        '\\i456789\\\\\\\\\\I',
        ['i' => -1],
    ];

    $cases[] = [
        '\\u456789',
        '\\u456789\\U',
        ['u' => 1],
    ];
    $cases[] = [
        '\\u\\D\\\\I\\b',
        '\\d\\u\\D\\\\I\\b\\B\\U',
        ['d' => -1, 'u' => 1, 'b' => 1],
    ];
    $cases[] = [
        '\i123 \b456 \i789',
        // NOTE: This is not what one expects. The current implementation does not account
        // for the same nested tags since they do not make sense.
        '\i123 \b456 \i789\B\I\I',
        ['i' => 2, 'b' => 1],
    ];
    $cases[] = [
        '\I 123 \i',
        '\I 123 \i',
        ['i' => 0],
    ];

    return $cases;
}
/**
 * Compares the reported pair of unbalanced tag lists with the expected one.
 *
 * @dataProvider getUnbalancedInternalFormattingDataProvider
 */
public function testGetUnbalancedInternalFormatting(string $text, array $expected): void
{
    $actual = StringHelper::getUnbalancedInternalFormatting($text);

    $this->assertEquals($expected, $actual);
}

/**
 * Data sets: source text and the expected pair of lists. In these sets the first list holds
 * lower-case (opening) symbols and the second one upper-case (closing) symbols.
 */
public function getUnbalancedInternalFormattingDataProvider(): array
{
    $nothingUnbalanced = [[], []];
    $unopenedItalic    = [[], ['I']];

    $cases = [];

    $cases[] = [
        '\\iThis is \\bformatted text\\I with \\Bspecial characters\\i.',
        [['i'], []],
    ];
    $cases[] = [
        'Normal text with escaped formatting symbols like \\\\draw or \\\\inline or \\\\\\\\uuu.',
        $nothingUnbalanced,
    ];
    $cases[] = ['', $nothingUnbalanced];
    $cases[] = ['456789i', $nothingUnbalanced];

    // An odd number of backslashes before "I" forms a closing tag, an even number is escaped text.
    $cases[] = ['456789\\I', $unopenedItalic];
    $cases[] = ['456789\\\\I', $nothingUnbalanced];
    $cases[] = ['456789\\\\\\I', $unopenedItalic];
    $cases[] = ['456789\\\\\\\\I', $nothingUnbalanced];
    $cases[] = ['456789\\\\\\\\\\I', $unopenedItalic];

    $cases[] = [
        '\\u456789',
        [['u'], []],
    ];
    $cases[] = [
        '\\u\\D\\\\I\\b',
        [['u', 'b'], ['D']],
    ];
    $cases[] = [
        '\i123 \b456 \i789',
        [['i', 'b', 'i'], []],
    ];
    $cases[] = [
        '\I 123 \i',
        [['i'], ['I']],
    ];

    return $cases;
}
/**
 * Builds a snippet line for a source in which no words are to be highlighted:
 * the stemmer returns every word unchanged, the stem list is empty and the relevance is 0.
 */
public static function createFromSnippetSourceWithoutFoundWords(SnippetSource $snippetSource): self
{
    $identityStemmer = new class implements StemmerInterface {
        public function stemWord(string $word, bool $normalize = true): string
        {
            return $word;
        }
    };

    return new static($snippetSource->getText(), $snippetSource->getFormatId(), $identityStemmer, [], 0);
}

/**
 * Relevance value given to the constructor.
 */
public function getRelevance(): float
{
    return $this->relevance;
}

/**
 * Stems (or words) that were matched in the line; triggers parsing on first use.
 *
 * @return string[]
 * @deprecated Not used anymore. TODO delete if not needed
 */
public function getFoundStems(): array
{
    $this->parse();

    return $this->foundStems;
}

/**
 * Source line exactly as given to the constructor.
 */
public function getLine(): string
{
    return $this->line;
}

/**
 * Format id of the source line.
 */
public function getFormatId(): int
{
    return $this->formatId;
}

/**
 * Returns the line with every found word wrapped into the highlight template.
 *
 * Formatting sequences that are unbalanced inside a highlighted fragment are balanced
 * on both sides of the template boundary, so the template always wraps a self-contained
 * fragment: sequences opened inside are closed inside and re-opened after the template,
 * sequences closed inside are emitted before the template and re-opened inside.
 *
 * @param string $highlightTemplate sprintf() template, must contain "%s"
 * @param bool   $includeFormatting passed through to SnippetTextHelper::convertFormatting()
 *
 * @throws RuntimeException if the template has no "%s" placeholder
 */
public function getHighlighted(string $highlightTemplate, bool $includeFormatting): string
{
    $hasPlaceholder = strpos($highlightTemplate, '%s') !== false;
    if (!$hasPlaceholder) {
        throw new RuntimeException('Highlight template must contain "%s" substring for sprintf() function.');
    }

    $this->parse();

    $source = $this->getLineWithoutMaskedFragments();

    // Converters from a formatting symbol to its escape sequence.
    $asIs    = static fn(string $symbol) => '\\' . $symbol;
    $toUpper = static fn(string $symbol) => '\\' . strtoupper($symbol);
    $toLower = static fn(string $symbol) => '\\' . strtolower($symbol);

    $chunks = [];
    $cursor = 0;
    foreach ($this->highlightIntervals->toArray() as [$from, $to]) {
        // Text between the previous highlighted fragment and this one is copied untouched.
        $chunks[] = substr($source, $cursor, $from - $cursor);
        $fragment = substr($source, $from, $to - $from);

        [$unclosed, $unopened] = StringHelper::getUnbalancedInternalFormatting($fragment);

        // Sequences opened inside the fragment: close them inside, re-open after the template.
        $reopenAfter = implode('', array_map($asIs, $unclosed));
        $closeInside = implode('', array_map($toUpper, array_reverse($unclosed)));

        // Sequences closed inside the fragment: emit them before the template, re-open inside.
        $closeBefore  = implode('', array_map($asIs, $unopened));
        $reopenInside = implode('', array_map($toLower, array_reverse($unopened)));

        $chunks[] = $closeBefore
            . sprintf($highlightTemplate, $reopenInside . $fragment . $closeInside)
            . $reopenAfter;

        $cursor = $to;
    }
    $chunks[] = substr($source, $cursor);

    $restored = $this->restoreMaskedFragments(implode('', $chunks));

    return SnippetTextHelper::convertFormatting($restored, $this->formatId, $includeFormatting);
}
/**
 * Sets the regular expressions passed to SnippetTextHelper::sanitize() when the masked
 * version of the line is built.
 * NOTE(review): the masked line is cached on first use, so this presumably has to be called
 * before getHighlighted()/getFoundStems() - confirm against callers.
 */
public function setMaskRegexArray(array $regexes): void
{
    $this->maskRegexArray = $regexes;
}

/**
 * Finds the words of the line that match one of the known stems and records their byte
 * intervals in $this->highlightIntervals (and the matched stems in $this->foundStems).
 *
 * The line is split into words by a delimiter regex. A word matches when the word itself or
 * its stem is among the known stems. A non-matching word whose stem contains a word component
 * delimiter is additionally checked part by part. Runs only once per instance.
 */
protected function parse(): void
{
    if ($this->highlightIntervals !== null) {
        // Already parsed
        return;
    }

    $this->highlightIntervals = new HighlightIntervals();

    $line = $this->getLineWithoutMaskedFragments();

    if (\count($this->stemsFoundSomewhere) === 0) {
        return;
    }

    if ($this->formatId === SnippetSource::FORMAT_INTERNAL) {
        $regex = '/(?x)
            [\\d\\p{L}^_]*(?:(?:\\\\[' . StringHelper::FORMATTING_SYMBOLS . '])+[\\d\\p{L}^_]*)* # matches as many word and formatting characters as possible
            (*SKIP) # do not cross this line on backtracking
            \\K # restart pattern matching to the end of the word.
            (?: # delimiter regex which includes:
                [^\\\\\\d\\p{L}^_\\-.,] # non-word character
                |[\\-.,]+(?![\\d\\p{L}\\-.,]) # [,-.] followed by a non-word character
                |\\\\(?:[' . StringHelper::FORMATTING_SYMBOLS . '](?![\\d\\p{L}\\-.,])|\\\\) # formatting sequence followed by a non-word character or escaped backslash
            )+/iu';
    } else {
        $regex = '/(?x)
            [\\d\\p{L}^_]* # matches as many word and formatting characters as possible
            (*SKIP) # do not cross this line on backtracking
            \\K # restart pattern matching to the end of the word.
            (?: # delimiter regex which includes:
                [^\\d\\p{L}^_\\-.,] # non-word character
                |[\\-.,]+(?![\\d\\p{L}\\-.,]) # [,-.] followed by a non-word character
            )+/iu';
    }
    $wordArray = preg_split($regex, $line, -1, \PREG_SPLIT_OFFSET_CAPTURE);
    if ($wordArray === false) {
        // preg_split() failed (e.g. the line is not valid UTF-8 for the /u modifier).
        // Highlighting is best-effort: keep the empty interval set instead of iterating over false.
        return;
    }

    // Flipped for O(1) membership checks.
    $flippedStems = array_flip($this->stemsFoundSomewhere);
    foreach ($wordArray as [$rawWord, $offset]) {
        $word = $this->formatId === SnippetSource::FORMAT_INTERNAL ? StringHelper::clearInternalFormatting($rawWord) : $rawWord;
        $word = str_replace(SnippetTextHelper::STORE_MARKER, '', $word);

        if ($word === '') {
            // No need to call $intervals->skipInterval() since regex may work several times on a single delimiter
            continue;
        }

        // $stem stays null when the word itself matches; otherwise it is assigned inside the condition.
        $stem = null;
        if (isset($flippedStems[$word]) || isset($flippedStems[$stem = $this->stemmer->stemWord($word)])) {
            // The interval covers the raw word, i.e. including formatting sequences inside it.
            $this->highlightIntervals->addInterval($offset, $offset + \strlen($rawWord));
            $this->foundStems[] = $stem ?? $word;
        } else {
            // Word is not found. Check if it is like a hyphenated compound word, e.g. 'test-drive' or 'long-term'
            if (false !== strpbrk($stem, StringHelper::WORD_COMPONENT_DELIMITERS)) {
                // Here is more simple regex since formatting sequences may be present.
                // The downside is appearance of empty words, but they are filtered out later.
                $subWordArray = preg_split('#[\-.,]+#u', $rawWord, -1, \PREG_SPLIT_OFFSET_CAPTURE);
                foreach ($subWordArray as [$rawSubWord, $subOffset]) {
                    $subWord = $this->formatId === SnippetSource::FORMAT_INTERNAL ? StringHelper::clearInternalFormatting($rawSubWord) : $rawSubWord;
                    $subWord = str_replace(SnippetTextHelper::STORE_MARKER, '', $subWord);

                    if ($rawSubWord === '') {
                        continue;
                    }

                    $subStem = null;
                    if (isset($flippedStems[$subWord]) || isset($flippedStems[$subStem = $this->stemmer->stemWord($subWord)])) {
                        // Sub-word offsets are relative to the raw word, hence the sum of both offsets.
                        $this->highlightIntervals->addInterval($offset + $subOffset, $offset + $subOffset + \strlen($rawSubWord));
                        $this->foundStems[] = $subStem ?? $subWord;
                    } else {
                        $this->highlightIntervals->skipInterval();
                    }
                }
            } else {
                // Not a compound word
                $this->highlightIntervals->skipInterval();
            }
        }
    }
}

/**
 * Returns the line processed by SnippetTextHelper::sanitize() with the configured mask
 * regexes. The result is computed once and cached; $this->maskedFragments is passed to
 * sanitize() and later to restore().
 */
protected function getLineWithoutMaskedFragments(): string
{
    if ($this->lineWithoutMaskedFragments !== null) {
        return $this->lineWithoutMaskedFragments;
    }

    $this->lineWithoutMaskedFragments = SnippetTextHelper::sanitize($this->line, $this->maskRegexArray, $this->maskedFragments);

    return $this->lineWithoutMaskedFragments;
}

/**
 * Counterpart of getLineWithoutMaskedFragments(): passes the line and the stored masked
 * fragments to SnippetTextHelper::restore().
 */
protected function restoreMaskedFragments(string $line): string
{
    return SnippetTextHelper::restore($line, $this->maskedFragments);
}