├── tests
├── tmp
│ └── .gitignore
├── .gitignore
├── _output
│ └── .gitignore
├── _data
│ └── dump.sql
├── _bootstrap.php
├── unit
│ ├── _bootstrap.php
│ └── Rose
│ │ ├── Entity
│ │ ├── FulltextResultTest.php
│ │ ├── WordPositionContainerTest.php
│ │ ├── SnippetLineTest.php
│ │ ├── SentenceMapTest.php
│ │ ├── QueryTest.php
│ │ ├── ResultSetTest.php
│ │ └── SnippetTest.php
│ │ ├── Helper
│ │ ├── SnippetTextHelperTest.php
│ │ └── StringHelperTest.php
│ │ ├── Storage
│ │ ├── Database
│ │ │ ├── RepositoryValidationTest.php
│ │ │ └── MysqlRepositoryTest.php
│ │ └── SingleFileArrayStorageTest.php
│ │ ├── Stemmer
│ │ └── StemmerTest.php
│ │ └── FinderTest.php
├── config.php.dist.sqlite
├── unit.suite.yml
├── config.php.dist.postgres
├── config.php.dist.mysql
└── _support
│ ├── Helper
│ ├── Unit.php
│ ├── Acceptance.php
│ └── Functional.php
│ ├── UnitTester.php
│ ├── AcceptanceTester.php
│ └── FunctionalTester.php
├── .gitignore
├── bin
├── codecept
├── stem
└── process_test.php
├── src
└── S2
│ └── Rose
│ ├── Exception
│ ├── ExceptionInterface.php
│ ├── UnknownException.php
│ ├── RuntimeException.php
│ ├── ImmutableException.php
│ ├── LogicException.php
│ ├── InvalidArgumentException.php
│ └── UnknownIdException.php
│ ├── Stemmer
│ ├── StemmerInterface.php
│ └── AbstractStemmer.php
│ ├── Storage
│ ├── Exception
│ │ ├── EmptyIndexException.php
│ │ └── InvalidEnvironmentException.php
│ ├── StorageEraseInterface.php
│ ├── TransactionalStorageInterface.php
│ ├── Dto
│ │ ├── SnippetResult.php
│ │ └── SnippetQuery.php
│ ├── FulltextProxyInterface.php
│ ├── StorageReadInterface.php
│ ├── Database
│ │ └── IdMappingStorage.php
│ ├── FulltextIndexContent.php
│ ├── StorageWriteInterface.php
│ ├── FulltextIndexPositionBag.php
│ ├── File
│ │ └── SingleFileArrayStorage.php
│ ├── ArrayFulltextStorage.php
│ └── ArrayStorage.php
│ ├── Extractor
│ ├── ExtractorInterface.php
│ ├── DefaultExtractorFactory.php
│ ├── ExtractionResult.php
│ ├── ExtractionErrors.php
│ ├── ChainExtractor.php
│ ├── HtmlRegex
│ │ └── RegexExtractor.php
│ └── HtmlDom
│ │ └── DomState.php
│ ├── Entity
│ ├── ContentWithMetadata.php
│ ├── Metadata
│ │ ├── ImgCollection.php
│ │ ├── Img.php
│ │ ├── SnippetSource.php
│ │ ├── SentenceCollection.php
│ │ └── SentenceMap.php
│ ├── HighlightIntervals.php
│ ├── TocEntryWithMetadata.php
│ ├── ExternalIdCollection.php
│ ├── ResultTrace.php
│ ├── ExternalId.php
│ ├── FulltextQuery.php
│ ├── TocEntry.php
│ ├── WordPositionContainer.php
│ ├── Indexable.php
│ ├── ResultItem.php
│ ├── Snippet.php
│ ├── Query.php
│ ├── FulltextResult.php
│ └── SnippetLine.php
│ ├── Helper
│ ├── ProfileHelper.php
│ ├── SnippetTextHelper.php
│ └── StringHelper.php
│ ├── Snippet
│ └── SnippetBuilder.php
│ ├── Finder.php
│ └── Indexer.php
├── .editorconfig
├── codeception.yml
├── composer.json
├── LICENSE
├── .github
└── workflows
│ ├── test_sqlite.yml
│ ├── test_postgres.yml
│ └── test_mysql.yml
└── doc
└── rose.svg
/tests/tmp/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 |
--------------------------------------------------------------------------------
/tests/.gitignore:
--------------------------------------------------------------------------------
1 | config.php
2 |
--------------------------------------------------------------------------------
/tests/_output/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 |
--------------------------------------------------------------------------------
/tests/_data/dump.sql:
--------------------------------------------------------------------------------
1 | /* Replace this file with actual dump of your database */
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .git
2 | .idea/
3 | vendor/
4 | tests/_support/_generated/
5 | composer.lock
6 |
--------------------------------------------------------------------------------
/tests/_bootstrap.php:
--------------------------------------------------------------------------------
1 | 'sqlite:tests/_output/s2_rose_test',
6 | 'username' => '',
7 | 'passwd' => '',
8 | ];
9 |
--------------------------------------------------------------------------------
/tests/unit.suite.yml:
--------------------------------------------------------------------------------
1 | # Codeception Test Suite Configuration
2 | #
3 | # Suite for unit (internal) tests.
4 |
5 | class_name: UnitTester
6 | modules:
7 | enabled:
8 | - Asserts
9 | - \Helper\Unit
10 |
--------------------------------------------------------------------------------
/src/S2/Rose/Exception/ExceptionInterface.php:
--------------------------------------------------------------------------------
1 | 'pgsql:host=127.0.0.1;dbname=s2_rose_test',
6 | 'username' => 'postgres',
7 | 'passwd' => '12345',
8 | ];
9 |
--------------------------------------------------------------------------------
/tests/config.php.dist.mysql:
--------------------------------------------------------------------------------
1 | 'mysql:host=127.0.0.1;dbname=s2_rose_test;charset=utf8',
6 | 'username' => 'root',
7 | 'passwd' => 'root',
8 | ];
9 |
--------------------------------------------------------------------------------
/src/S2/Rose/Exception/UnknownException.php:
--------------------------------------------------------------------------------
1 | nextStemmer = $nextStemmer;
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/codeception.yml:
--------------------------------------------------------------------------------
1 | actor: Tester
2 | paths:
3 | tests: tests
4 | log: tests/_output # key read by Codeception 4.x (renamed to "output" in 5.0)
5 | output: tests/_output # key read by Codeception 5.x
6 | data: tests/_data
7 | support: tests/_support
8 | envs: tests/_envs
9 | bootstrap: _bootstrap.php
10 | settings:
11 | colors: true
12 | memory_limit: 1024M
13 | extensions:
14 | enabled:
15 | - Codeception\Extension\RunFailed
16 | modules:
17 | config:
18 | Db:
19 | dsn: ''
20 | user: ''
21 | password: ''
22 | dump: tests/_data/dump.sql
23 | coverage:
24 | enabled: true
25 | include:
26 | - src/*
27 |
--------------------------------------------------------------------------------
/src/S2/Rose/Storage/TransactionalStorageInterface.php:
--------------------------------------------------------------------------------
1 | assertEquals(0.9889808283708308, FulltextResult::frequencyReduction(50, 2));
20 | $this->assertEquals(0.17705374665950163, FulltextResult::frequencyReduction(50, 25));
21 | $this->assertEquals(1, FulltextResult::frequencyReduction(3, 2));
22 | }
23 | }
24 |
--------------------------------------------------------------------------------
/src/S2/Rose/Extractor/DefaultExtractorFactory.php:
--------------------------------------------------------------------------------
1 | attachExtractor(new DomExtractor());
19 | }
20 | $extractor->attachExtractor(new RegexExtractor());
21 |
22 | return $extractor;
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/tests/_support/UnitTester.php:
--------------------------------------------------------------------------------
1 | ' . PHP_EOL);
17 | }
18 |
19 | $language = $argv[1];
20 | $argument = $argv[2];
21 |
22 | $stemmer = match ($language) {
23 | 'russian' => new \S2\Rose\Stemmer\PorterStemmerRussian(),
24 | 'english' => new \S2\Rose\Stemmer\PorterStemmerEnglish(),
25 | default => throw new \Exception('Unknown stemmer language: ' . $language),
26 | };
27 |
28 | echo $stemmer->stemWord($argument), PHP_EOL;
29 |
--------------------------------------------------------------------------------
/src/S2/Rose/Storage/Dto/SnippetResult.php:
--------------------------------------------------------------------------------
1 | data[$externalId->toString()][] = $snippet;
19 | }
20 |
21 |     /**
22 |      * Invokes $callback once for every key stored in this result.
23 |      *
24 |      * The callback receives the ExternalId restored from the array key,
25 |      * followed by the entries stored under that key as separate arguments.
26 |      */
27 |     public function iterate(callable $callback): void
28 |     {
29 |         foreach ($this->data as $key => $items) {
30 |             $externalId = ExternalId::fromString($key);
31 |             $callback($externalId, ...$items);
32 |         }
33 |     }
27 | }
28 |
--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "s2/rose",
3 | "description": "Search engine for PHP sites",
4 | "authors": [
5 | {
6 | "name": "Roman Parpalak",
7 | "email": "roman@parpalak.com"
8 | }
9 | ],
10 | "license": "MIT",
11 | "require": {
12 | "php": ">=7.4",
13 | "ext-json": "*",
14 | "symfony/polyfill-mbstring": "^1.2",
15 | "psr/log": "^1.1|^2.0|^3.0"
16 | },
17 | "require-dev": {
18 | "codeception/codeception": "^4.2|^5.0",
19 | "codeception/module-asserts": "^1.3|^3.0"
20 | },
21 | "suggest": {
22 | "ext-dom": "*",
23 | "ext-pdo": "*"
24 | },
25 | "autoload": {
26 | "psr-4": {
27 | "S2\\Rose\\": "src/S2/Rose"
28 | }
29 | },
30 | "autoload-dev": {
31 | "psr-4": {
32 | "S2\\Rose\\Test\\": "tests/unit/Rose"
33 | }
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/src/S2/Rose/Storage/FulltextProxyInterface.php:
--------------------------------------------------------------------------------
1 | sentenceMap = $sentenceMap;
20 | $this->imageCollection = $images;
21 | }
22 |
23 | public function getSentenceMap(): SentenceMap
24 | {
25 | return $this->sentenceMap;
26 | }
27 |
28 | public function getImageCollection(): ImgCollection
29 | {
30 | return $this->imageCollection;
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/src/S2/Rose/Extractor/ExtractionResult.php:
--------------------------------------------------------------------------------
1 | contentWithMetadata = $contentWithMetadata;
19 | $this->errors = $errors;
20 | }
21 |
22 | public function getContentWithMetadata(): ContentWithMetadata
23 | {
24 | return $this->contentWithMetadata;
25 | }
26 |
27 | public function getErrors(): ExtractionErrors
28 | {
29 | return $this->errors;
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/src/S2/Rose/Exception/UnknownIdException.php:
--------------------------------------------------------------------------------
1 | getId(),
18 | $externalId->getInstanceId()
19 | ));
20 | }
21 |
22 |     /**
23 |      * Creates the exception reporting that the given external id was not found in a result.
24 |      *
25 |      * @return static
26 |      */
27 |     public static function createResultMissingExternalId(ExternalId $externalId)
28 |     {
29 |         $message = sprintf(
30 |             'External id "%s" for instance "%s" not found in result.',
31 |             $externalId->getId(),
32 |             $externalId->getInstanceId()
33 |         );
34 |
35 |         return new static($message);
36 |     }
30 | }
31 |
--------------------------------------------------------------------------------
/src/S2/Rose/Storage/StorageReadInterface.php:
--------------------------------------------------------------------------------
1 | Img::fromArray($item), json_decode($json, true, 512, JSON_THROW_ON_ERROR)));
24 | }
25 |
26 | public function toJson(): string
27 | {
28 | /** @noinspection PhpUnhandledExceptionInspection */
29 | return json_encode($this->getArrayCopy(), JSON_THROW_ON_ERROR | JSON_UNESCAPED_UNICODE);
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/src/S2/Rose/Entity/HighlightIntervals.php:
--------------------------------------------------------------------------------
1 | hasPreviousInterval) {
19 | $this->highlightIntervals[] = [$start, $end];
20 | } else {
21 | $this->highlightIntervals[\count($this->highlightIntervals) - 1][1] = $end;
22 | }
23 |
24 | $this->hasPreviousInterval = true;
25 | }
26 |
27 | public function skipInterval(): void
28 | {
29 | $this->hasPreviousInterval = false;
30 | }
31 |
32 | public function toArray(): array
33 | {
34 | return $this->highlightIntervals;
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/tests/unit/Rose/Helper/SnippetTextHelperTest.php:
--------------------------------------------------------------------------------
1 | alert(1)', SnippetSource::FORMAT_PLAIN_TEXT, true);
20 |
21 | $this->assertSame('<script>alert(1)</script>', $result);
22 | }
23 |
24 | public function testKeepsInternalFormattingTags(): void
25 | {
26 | $result = SnippetTextHelper::prepareForOutput('\\iDanger\\I text', SnippetSource::FORMAT_INTERNAL, true);
27 |
28 | $this->assertSame('Danger text', $result);
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/tests/unit/Rose/Storage/Database/RepositoryValidationTest.php:
--------------------------------------------------------------------------------
1 | expectException(InvalidArgumentException::class);
18 | new MysqlRepository(new class extends \PDO {
19 | public function __construct() {}
20 | }, 'bad;DROP', []);
21 | }
22 |
23 | public function testRejectsInvalidTableOverride(): void
24 | {
25 | $this->expectException(InvalidArgumentException::class);
26 | new MysqlRepository(new class extends \PDO {
27 | public function __construct() {}
28 | }, 'ok_prefix', ['toc' => 'toc;DROP']);
29 | }
30 | }
31 |
--------------------------------------------------------------------------------
/src/S2/Rose/Storage/Database/IdMappingStorage.php:
--------------------------------------------------------------------------------
1 | idMapping[$externalId->toString()] = $internalId;
22 | }
23 |
24 | public function remove(ExternalId $externalId)
25 | {
26 | unset($this->idMapping[$externalId->toString()]);
27 | }
28 |
29 | public function clear()
30 | {
31 | $this->idMapping = [];
32 | }
33 |
34 |     /**
35 |      * Looks up the value stored for the given external id.
36 |      *
37 |      * @return mixed|null the stored mapping value, or null when nothing is stored
38 |      */
39 |     public function get(ExternalId $externalId)
40 |     {
41 |         return $this->idMapping[$externalId->toString()] ?? null;
42 |     }
43 | }
44 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Roman Parpalak
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/src/S2/Rose/Entity/TocEntryWithMetadata.php:
--------------------------------------------------------------------------------
1 | tocEntry = $tocEntry;
21 | $this->externalId = $externalId;
22 | $this->imgCollection = $imgCollection;
23 | }
24 |
25 | public function getTocEntry(): TocEntry
26 | {
27 | return $this->tocEntry;
28 | }
29 |
30 | public function getExternalId(): ExternalId
31 | {
32 | return $this->externalId;
33 | }
34 |
35 | /**
36 | * @return ImgCollection|Img[]
37 | */
38 | public function getImgCollection(): ImgCollection
39 | {
40 | return $this->imgCollection;
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/.github/workflows/test_sqlite.yml:
--------------------------------------------------------------------------------
1 | name: Test on SQLite
2 |
3 | on: [ push ]
4 |
5 | jobs:
6 | build:
7 | strategy:
8 | fail-fast: false
9 | matrix:
10 | operating_system:
11 | - 'ubuntu-22.04'
12 | php_versions:
13 | - '7.4'
14 | - '8.0'
15 | - '8.1'
16 | - '8.2'
17 | - '8.3'
18 | - '8.4'
19 |
20 | runs-on: '${{ matrix.operating_system }}'
21 |
22 | steps:
23 | - uses: actions/checkout@v4
24 |
25 | - name: 'Setup PHP'
26 | uses: shivammathur/setup-php@v2
27 | with:
28 | php-version: ${{ matrix.php_versions }}
29 |
30 | - name: Show SQLite version
31 | run: php --ri sqlite3
32 |
33 | - name: Install dependencies
34 | run: COMPOSER_MEMORY_LIMIT=-1 composer install --prefer-dist --no-interaction
35 |
36 | - name: Prepare config
37 | run: cp tests/config.php.dist.sqlite tests/config.php
38 |
39 | - name: Run test cases
40 | run: php bin/codecept run --skip-group profile
41 |
42 | - name: Run profiling
43 | if: success() || failure()
44 | run: php bin/codecept run -g profile -d
--------------------------------------------------------------------------------
/src/S2/Rose/Helper/ProfileHelper.php:
--------------------------------------------------------------------------------
1 | $message,
15 | 'duration' => $duration,
16 | 'memory_usage' => memory_get_usage(),
17 | 'memory_peak_usage' => memory_get_peak_usage(),
18 | ];
19 | }
20 |
21 |     /**
22 |      * Renders a profile point as one fixed-width line.
23 |      *
24 |      * The message is right-padded to 25 characters; the duration (multiplied by 1000
25 |      * and suffixed with " ms"), the memory usage and the peak memory usage are each
26 |      * left-padded to 20 characters.
27 |      */
28 |     public static function formatProfilePoint(array $point): string
29 |     {
30 |         $rightAlign = static fn(string $text): string => str_pad($text, 20, ' ', STR_PAD_LEFT);
31 |
32 |         $point['message'] = str_pad($point['message'], 25, ' ', STR_PAD_RIGHT);
33 |         $point['duration'] = $rightAlign(number_format($point['duration'] * 1000.0, 2, '.', ' ') . ' ms');
34 |         foreach (['memory_usage', 'memory_peak_usage'] as $key) {
35 |             $point[$key] = $rightAlign(number_format($point[$key], 0, '.', ' '));
36 |         }
37 |
38 |         // Every value of the array is concatenated, in the array's own key order.
39 |         return implode('', $point);
40 |     }
30 | }
31 |
--------------------------------------------------------------------------------
/bin/process_test.php:
--------------------------------------------------------------------------------
1 | setAttribute(\PDO::ATTR_ERRMODE, \PDO::ERRMODE_EXCEPTION);
18 |
19 | $storage = new \S2\Rose\Storage\Database\PdoStorage($pdo, 'multiprocess_');
20 | // $storage->erase();
21 |
22 | $stemmer = new \S2\Rose\Stemmer\PorterStemmerRussian();
23 | $indexer = new \S2\Rose\Indexer($storage, $stemmer);
24 |
25 | $filenames = glob(__DIR__ . '/../tests/Resource/data/' . '*.txt');
26 | $filenames = array_slice($filenames, 0, TEST_FILE_NUM);
27 |
28 | // Index every selected file: the first line is the title, the whole text is the content.
29 | foreach ($filenames as $filename) {
30 |     echo 'Indexing ', $filename, "\n";
31 |
32 |     $fileContent = file_get_contents($filename);
33 |     if ($fileContent === false) {
34 |         // Unreadable file: report it and go on instead of indexing a bogus document.
35 |         echo 'Cannot read ', $filename, "\n";
36 |         continue;
37 |     }
38 |
39 |     // Title is everything before the first newline. A file without a newline is used
40 |     // as the title in whole; strpos() returned false there and the title came out empty.
41 |     [$title] = explode("\n", $fileContent, 2);
42 |
43 |     // A random number is appended to the content on every run, as before.
44 |     $content = $fileContent . ' ' . rand();
45 |
46 |     $indexer->index(new \S2\Rose\Entity\Indexable(basename($filename), $title, $content));
47 | }
39 |
--------------------------------------------------------------------------------
/src/S2/Rose/Storage/FulltextIndexContent.php:
--------------------------------------------------------------------------------
1 | getExternalId()->toString();
20 |
21 | $contentPositions = $positionBag->getContentPositions();
22 | if (\count($contentPositions) > 0) {
23 | $this->dataByExternalId[$serializedExtId][$word] = $contentPositions;
24 | }
25 |
26 | $this->dataByWord[$word][$serializedExtId] = $positionBag;
27 | }
28 |
29 | /**
30 | * @return FulltextIndexPositionBag[][]
31 | * @deprecated TODO rename or refactor this data transformation
32 | */
33 | public function toArray(): array
34 | {
35 | return $this->dataByWord;
36 | }
37 |
38 |     /**
39 |      * Calls $callback for each external id that has content positions recorded,
40 |      * passing the restored ExternalId and a WordPositionContainer built from its data.
41 |      */
42 |     public function iterateContentWordPositions(\Closure $callback): void
43 |     {
44 |         foreach ($this->dataByExternalId as $key => $positionsByWord) {
45 |             $externalId = ExternalId::fromString($key);
46 |             $callback($externalId, new WordPositionContainer($positionsByWord));
47 |         }
48 |     }
44 | }
45 |
--------------------------------------------------------------------------------
/src/S2/Rose/Entity/ExternalIdCollection.php:
--------------------------------------------------------------------------------
1 | externalIds = $externalIds;
32 | }
33 |
34 |     /**
35 |      * Builds a collection from serialized ids, restoring each one with
36 |      * ExternalId::fromString(). The keys of the input array are kept.
37 |      *
38 |      * @param string[] $serializedExternalIds
39 |      */
40 |     public static function fromStringArray(array $serializedExternalIds): self
41 |     {
42 |         $externalIds = [];
43 |         foreach ($serializedExternalIds as $index => $serialized) {
44 |             $externalIds[$index] = ExternalId::fromString($serialized);
45 |         }
46 |
47 |         return new self($externalIds);
48 |     }
43 |
44 | /**
45 | * @return ExternalId[]
46 | */
47 | public function toArray(): array
48 | {
49 | return $this->externalIds;
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/src/S2/Rose/Extractor/ExtractionErrors.php:
--------------------------------------------------------------------------------
1 | errors[] = [
19 | 'message' => $message,
20 | 'code' => $code,
21 | 'line' => $line,
22 | 'column' => $column
23 | ];
24 |
25 | return $this;
26 | }
27 |
28 | /** @noinspection PhpComposerExtensionStubsInspection */
29 | public function addLibXmlError(\LibXMLError $error): self
30 | {
31 | return $this->addError(trim($error->message), (string)$error->code, $error->line, $error->column);
32 | }
33 |
34 | public function hasErrors(): bool
35 | {
36 | return \count($this->errors) > 0;
37 | }
38 |
39 |     /**
40 |      * Formats every collected error as "line:column message (code=code)".
41 |      * A missing column is shown as "?".
42 |      *
43 |      * @return string[]
44 |      */
45 |     public function getFormattedLines(): array
46 |     {
47 |         $lines = [];
48 |         foreach ($this->errors as $index => $error) {
49 |             $lines[$index] = sprintf(
50 |                 "%s:%s %s (code=%s)",
51 |                 $error['line'],
52 |                 $error['column'] ?? '?',
53 |                 $error['message'],
54 |                 $error['code']
55 |             );
56 |         }
57 |
58 |         return $lines;
59 |     }
52 | }
53 |
--------------------------------------------------------------------------------
/src/S2/Rose/Extractor/ChainExtractor.php:
--------------------------------------------------------------------------------
1 | extractors[] = $extractor;
25 | }
26 |
27 |     /**
28 |      * Tries the attached extractors in order and returns the first result obtained.
29 |      *
30 |      * An \Exception thrown by an extractor is logged (when a logger is set) and the
31 |      * next extractor is tried. Only \Exception subclasses are caught here.
32 |      *
33 |      * {@inheritdoc}
34 |      * @throws LogicException   if no extractor has been attached
35 |      * @throws RuntimeException if every extractor threw; wraps the last exception
36 |      */
37 |     public function extract(string $text): ExtractionResult
38 |     {
39 |         if (\count($this->extractors) < 1) {
40 |             throw new LogicException('No extractors were attached to the ChainExtractor.');
41 |         }
42 |
43 |         $lastFailure = null;
44 |         foreach ($this->extractors as $candidate) {
45 |             try {
46 |                 $result = $candidate->extract($text);
47 |             } catch (\Exception $failure) {
48 |                 $lastFailure = $failure;
49 |                 if ($this->logger) {
50 |                     $this->logger->error($failure->getMessage(), ['exception' => $failure]);
51 |                 }
52 |                 continue;
53 |             }
54 |
55 |             return $result;
56 |         }
57 |
58 |         throw new RuntimeException($lastFailure->getMessage(), 0, $lastFailure);
59 |     }
50 | }
51 |
--------------------------------------------------------------------------------
/src/S2/Rose/Entity/ResultTrace.php:
--------------------------------------------------------------------------------
1 | data[$serializedExtId]['fulltext ' . $word][] = [
20 | sprintf(
21 | '%s: match at positions [%s]',
22 | array_product($weights),
23 | implode(', ', $positions)
24 | ) => $weights,
25 | ];
26 | }
27 |
28 |     /**
29 |      * Appends a keyword trace entry for the word under the given serialized id.
30 |      * The entry maps the product of the weights (as a string key) to the weights themselves.
31 |      *
32 |      * @param float[]|array $weights
33 |      */
34 |     public function addKeywordWeight(string $word, string $serializedExtId, array $weights): void
35 |     {
36 |         $traceKey = 'keyword ' . $word;
37 |         $product = (string)array_product($weights);
38 |
39 |         $this->data[$serializedExtId][$traceKey][] = [$product => $weights];
40 |     }
37 |
38 |     /**
39 |      * Appends a trace note for a pair of words, recording the weight and the shift between them.
40 |      */
41 |     public function addNeighbourWeight(string $word1, string $word2, string $serializedExtId, float $weight, int $distance): void
42 |     {
43 |         $traceKey = 'fulltext ' . $word1 . ' - ' . $word2;
44 |         $note = $weight . ': matches are close (shift = ' . $distance . ')';
45 |
46 |         $this->data[$serializedExtId][$traceKey][] = $note;
47 |     }
42 |
43 | public function toArray(): array
44 | {
45 | return $this->data;
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/src/S2/Rose/Entity/Metadata/Img.php:
--------------------------------------------------------------------------------
1 | src = $src;
19 | $this->width = $width;
20 | $this->height = $height;
21 | $this->alt = $alt;
22 | }
23 |
24 | public function getSrc(): string
25 | {
26 | return $this->src;
27 | }
28 |
29 | public function getWidth(): string
30 | {
31 | return $this->width;
32 | }
33 |
34 | public function getHeight(): string
35 | {
36 | return $this->height;
37 | }
38 |
39 | public function getAlt(): string
40 | {
41 | return $this->alt;
42 | }
43 |
44 | public static function fromArray(array $img): Img
45 | {
46 | return new self($img['src'], $img['width'], $img['height'], $img['alt']);
47 | }
48 |
49 | /**
50 | * @return mixed
51 | */
52 | #[\ReturnTypeWillChange]
53 | public function jsonSerialize()
54 | {
55 | return get_object_vars($this);
56 | }
57 |
58 |     /** Tells whether both the width and the height are numeric values. */
59 |     public function hasNumericDimensions(): bool
60 |     {
61 |         if (!is_numeric($this->width)) {
62 |             return false;
63 |         }
64 |
65 |         return is_numeric($this->height);
66 |     }
62 | }
63 |
--------------------------------------------------------------------------------
/src/S2/Rose/Storage/StorageWriteInterface.php:
--------------------------------------------------------------------------------
1 | toArray() as $externalId) {
20 | $this->data[$externalId->toString()] = null;
21 | }
22 | }
23 |
24 |     /**
25 |      * Stores the positions for the given external id.
26 |      *
27 |      * @param int[] $positions
28 |      * @throws LogicException if a non-null value is already stored for this id
29 |      */
30 |     public function attach(ExternalId $externalId, array $positions): void
31 |     {
32 |         $key = $externalId->toString();
33 |         $alreadyAttached = ($this->data[$key] ?? null) !== null;
34 |         if ($alreadyAttached) {
35 |             throw new LogicException(sprintf('SnippetQuery already has id "%s".', $key));
36 |         }
37 |
38 |         $this->data[$key] = $positions;
39 |     }
35 |
36 | public function iterate(callable $callback): void
37 | {
38 | foreach ($this->data as $serializedExtId => $positions) {
39 | $callback(ExternalId::fromString($serializedExtId), $positions);
40 | }
41 | }
42 |
43 |     /**
44 |      * Restores an ExternalId from every key of the internal data array.
45 |      *
46 |      * @return ExternalId[]
47 |      */
48 |     public function getExternalIds(): array
49 |     {
50 |         $externalIds = [];
51 |         foreach (array_keys($this->data) as $key) {
52 |             $externalIds[] = ExternalId::fromString((string)$key);
53 |         }
54 |
55 |         return $externalIds;
56 |     }
50 | }
51 |
--------------------------------------------------------------------------------
/src/S2/Rose/Entity/ExternalId.php:
--------------------------------------------------------------------------------
1 | 0)) {
22 | // @codeCoverageIgnoreStart
23 | throw new InvalidArgumentException('Instance id must be positive.');
24 | // @codeCoverageIgnoreEnd
25 | }
26 |
27 | if (!\is_string($id) && !\is_int($id) && !\is_float($id)) {
28 | // @codeCoverageIgnoreStart
29 | throw new InvalidArgumentException('External id must be string or int or float.');
30 | // @codeCoverageIgnoreEnd
31 | }
32 |
33 | $this->id = (string)$id;
34 | $this->instanceId = $instanceId;
35 | }
36 |
37 | public function getId(): string
38 | {
39 | return $this->id;
40 | }
41 |
42 | public function getInstanceId(): ?int
43 | {
44 | return $this->instanceId;
45 | }
46 |
47 | public function toString(): string
48 | {
49 | return $this->instanceId . ':' . $this->id;
50 | }
51 |
52 |     /**
53 |      * Restores an ExternalId from the "instanceId:id" form produced by toString().
54 |      *
55 |      * Only the first colon separates the parts, so the id itself may contain colons.
56 |      * An empty instance part yields a null instance id.
57 |      *
58 |      * @throws InvalidArgumentException if the string contains no ":" separator
59 |      */
60 |     public static function fromString(string $string): self
61 |     {
62 |         $parts = explode(':', $string, 2);
63 |         if (\count($parts) < 2) {
64 |             // Without this guard $parts[1] is undefined: PHP emits a warning and the
65 |             // constructor then rejects the resulting null id with an unrelated message.
66 |             throw new InvalidArgumentException(sprintf('Serialized external id "%s" has no ":" separator.', $string));
67 |         }
68 |
69 |         [$instancePart, $idPart] = $parts;
70 |
71 |         return new static($idPart, $instancePart !== '' ? (int)$instancePart : null);
72 |     }
58 | }
59 |
--------------------------------------------------------------------------------
/src/S2/Rose/Entity/FulltextQuery.php:
--------------------------------------------------------------------------------
1 | words = array_values($words);
29 | $this->extractStems($stemmer);
30 | }
31 |
32 |     /**
33 |      * Stems every word and records the stem, under the word's position, whenever
34 |      * it differs from the word itself.
35 |      */
36 |     protected function extractStems(StemmerInterface $stemmer): void
37 |     {
38 |         foreach ($this->words as $position => $original) {
39 |             $stem = $stemmer->stemWord($original);
40 |             if ($stem === $original) {
41 |                 continue;
42 |             }
43 |
44 |             $this->additionalStems[$position] = $stem;
45 |         }
46 |     }
41 |
42 | /**
43 | * @return string[]
44 | */
45 | public function getWordsWithStems(): array
46 | {
47 | return array_merge($this->words, $this->additionalStems);
48 | }
49 |
50 |     /**
51 |      * Builds a WordPositionContainer holding every word at its position, followed by
52 |      * every additional stem at the position of the word it was derived from.
53 |      */
54 |     public function toWordPositionContainer(): WordPositionContainer
55 |     {
56 |         $container = new WordPositionContainer();
57 |
58 |         // Words first, stems second: the same insertion order as two separate loops.
59 |         foreach ([$this->words, $this->additionalStems] as $entriesByPosition) {
60 |             foreach ($entriesByPosition as $position => $entry) {
61 |                 $container->addWordAt($entry, $position);
62 |             }
63 |         }
64 |
65 |         return $container;
66 |     }
64 | }
65 |
--------------------------------------------------------------------------------
/tests/unit/Rose/Storage/Database/MysqlRepositoryTest.php:
--------------------------------------------------------------------------------
1 | capturedSql = $statement;
29 |
30 | return new class($this) extends \PDOStatement {
31 | private $pdo;
32 |
33 | public function __construct($pdo)
34 | {
35 | $this->pdo = $pdo;
36 | }
37 |
38 | public function execute($params = null): bool
39 | {
40 | $this->pdo->executedParams[] = $params ?? [];
41 |
42 | return true;
43 | }
44 | };
45 | }
46 | };
47 |
48 | $repository = new MysqlRepository($pdo, 'prefix_', []);
49 | $repository->insertWords(['test"', "danger\\word"]);
50 |
51 | $this->assertStringNotContainsString('test"', $pdo->capturedSql);
52 | $this->assertSame([['test"', "danger\\word"]], $pdo->executedParams);
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/.github/workflows/test_postgres.yml:
--------------------------------------------------------------------------------
1 | name: Test on PostgreSQL
2 |
3 | on: [ push ]
4 |
5 | jobs:
6 | build:
7 | strategy:
8 | fail-fast: false
9 | matrix:
10 | operating_system: ['ubuntu-22.04']
11 | postgresql-version: [10, 11, 12, 13, 14, 15, 16, 17, 18]
12 | php_versions:
13 | - '8.4'
14 |
15 | runs-on: '${{ matrix.operating_system }}'
16 |
17 | steps:
18 | - uses: actions/checkout@v4
19 |
20 | - name: Install PostgreSQL
21 | env:
22 | POSTGRESQL_VERSION: ${{ matrix.postgresql-version }}
23 | run: |
24 | sudo sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-pgdg main" > /etc/apt/sources.list.d/pgdg.list'
25 | wget --quiet -O - https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add -
26 | sudo apt-get update
27 | sudo apt-get -y install "postgresql-$POSTGRESQL_VERSION"
28 | sudo service postgresql start
29 |
30 | - name: Set up PostgreSQL
31 | run: |
32 | sudo -u postgres psql -c "ALTER USER postgres WITH PASSWORD '12345';"
33 | sudo -u postgres psql -c "CREATE DATABASE s2_rose_test OWNER postgres;"
34 |
35 | - name: 'Setup PHP'
36 | uses: shivammathur/setup-php@v2
37 | with:
38 | php-version: ${{ matrix.php_versions }}
39 |
40 | - name: Install dependencies
41 | run: COMPOSER_MEMORY_LIMIT=-1 composer install --prefer-dist --no-interaction
42 |
43 | - name: Prepare config
44 | run: cp tests/config.php.dist.postgres tests/config.php
45 |
46 | - name: Run test cases
47 | run: php bin/codecept run --skip-group profile
48 |
49 | - name: Run profiling
50 | if: success() || failure()
51 | run: php bin/codecept run -g profile -d
--------------------------------------------------------------------------------
/src/S2/Rose/Storage/FulltextIndexPositionBag.php:
--------------------------------------------------------------------------------
1 | externalId = $externalId;
29 | $this->titlePositions = $titlePositions;
30 | $this->keywordPositions = $keywordPositions;
31 | $this->contentPositions = $contentPositions;
32 | $this->wordCount = $wordCount;
33 | $this->externalRelevanceRatio = $externalRelevanceRatio;
34 | }
35 |
36 | public function getExternalId(): ExternalId
37 | {
38 | return $this->externalId;
39 | }
40 |
41 | public function getTitlePositions(): array
42 | {
43 | return $this->titlePositions;
44 | }
45 |
46 | public function getKeywordPositions(): array
47 | {
48 | return $this->keywordPositions;
49 | }
50 |
51 | public function getContentPositions(): array
52 | {
53 | return $this->contentPositions;
54 | }
55 |
56 | public function getWordCount(): int
57 | {
58 | return $this->wordCount;
59 | }
60 |
61 | public function getExternalRelevanceRatio(): float
62 | {
63 | return $this->externalRelevanceRatio;
64 | }
65 |
66 | public function hasExternalRelevanceRatio(): bool
67 | {
68 | return $this->externalRelevanceRatio !== 1.0;
69 | }
70 | }
71 |
--------------------------------------------------------------------------------
/src/S2/Rose/Entity/TocEntry.php:
--------------------------------------------------------------------------------
1 | title = $title;
22 | $this->description = $description;
23 | $this->date = $date;
24 | $this->url = $url;
25 | $this->relevanceRatio = $relevanceRatio;
26 | $this->hash = $hash;
27 | }
28 |
/**
 * Returns the title of this TOC entry.
 */
public function getTitle(): string
{
    return $this->title;
}
33 |
/**
 * Returns the description of this TOC entry.
 */
public function getDescription(): string
{
    return $this->description;
}
38 |
/**
 * Returns the date of this TOC entry, or null when no date is set.
 */
public function getDate(): ?\DateTime
{
    return $this->date;
}
43 |
/**
 * Returns the URL of this TOC entry.
 */
public function getUrl(): string
{
    return $this->url;
}
48 |
/**
 * Returns the relevance ratio of this TOC entry.
 */
public function getRelevanceRatio(): float
{
    return $this->relevanceRatio;
}
53 |
/**
 * Returns the internal id, or null when none has been assigned.
 */
public function getInternalId(): ?int
{
    return $this->internalId;
}
58 |
/**
 * Returns the hash stored in this TOC entry.
 */
public function getHash(): string
{
    return $this->hash;
}
63 |
/**
 * Assigns the internal id and returns this entry for chaining.
 *
 * @deprecated Make immutable
 */
public function setInternalId(int $internalId): self
{
    $this->internalId = $internalId;

    return $this;
}
73 |
/**
 * Returns the date formatted as 'Y-m-d H:i:s', or null when no date is set.
 */
public function getFormattedDate(): ?string
{
    if ($this->date === null) {
        return null;
    }

    return $this->date->format('Y-m-d H:i:s');
}
78 |
/**
 * Returns the name of the date's time zone, or null when no date is set.
 */
public function getTimeZone(): ?string
{
    if ($this->date === null) {
        return null;
    }

    return $this->date->getTimeZone()->getName();
}
83 | }
84 |
--------------------------------------------------------------------------------
/tests/unit/Rose/Entity/WordPositionContainerTest.php:
--------------------------------------------------------------------------------
1 | [23, 56, 74],
23 | 'word2' => [2, 57],
24 | ]);
25 |
26 | $this->assertEquals(1, $container->getClosestDistanceBetween('word1', 'word2', 0));
27 | $this->assertEquals(-1, $container->getClosestDistanceBetween('word2', 'word1', 0));
28 | $this->assertEquals(23 - 2 - 20, $container->getClosestDistanceBetween('word2', 'word1', 20));
29 | $this->assertEquals(23 - 2 - 25, $container->getClosestDistanceBetween('word2', 'word1', 25));
30 | }
31 |
/**
 * Fills a container with the words of one sentence (position = word index)
 * and checks the triples returned by compareWith() for three reference containers.
 */
public function testCompare()
{
    $sentence = 'Циркуляция вектора напряженности электростатического поля вдоль замкнутого контура всегда равна нулю';

    $container = new WordPositionContainer();
    foreach (explode(' ', $sentence) as $position => $word) {
        $container->addWordAt($word, $position);
    }

    // Reference has the two words in reversed order.
    $reversedReference = new WordPositionContainer([
        'нулю' => [5],
        'нул'  => [5],
        'поля' => [6],
        'пол'  => [6],
    ]);
    $this->assertEquals([['поля', 'нулю', 7]], $container->compareWith($reversedReference));

    // Reference has the two words adjacent, in the same order as the sentence.
    $adjacentReference = new WordPositionContainer([
        'нулю' => [1],
        'нул'  => [1],
        'поля' => [0],
        'пол'  => [0],
    ]);
    $this->assertEquals([['поля', 'нулю', 5]], $container->compareWith($adjacentReference));

    // Three reference words produce one triple per word pair.
    $threeWordReference = new WordPositionContainer([
        'вектора' => [1],
        'поля'    => [2],
        'контура' => [3],
    ]);
    $expectedTriples = [
        ['вектора', 'поля', 2],
        ['вектора', 'контура', 4],
        ['поля', 'контура', 2],
    ];
    $this->assertEquals($expectedTriples, $container->compareWith($threeWordReference));
}
63 | }
64 |
--------------------------------------------------------------------------------
/src/S2/Rose/Helper/SnippetTextHelper.php:
--------------------------------------------------------------------------------
1 | assertEquals(
32 | 'Testing string to highlight some test values, Test is case-sensitive.',
33 | $snippetLine->getHighlighted('%s', false)
34 | );
35 | }
36 |
/**
 * Builds a SnippetLine with the single stem 'Test' and compares the result
 * of getHighlighted() with the expected string.
 *
 * NOTE(review): the string literals here may have lost markup during extraction - confirm against the repository.
 */
public function testCreateHighlighted2()
{
    $snippetLine = new SnippetLine(
        'Testing string to highlight some test values, Test is case-sensitive.',
        SnippetSource::FORMAT_PLAIN_TEXT,
        new PorterStemmerEnglish(),
        ['Test'], // unknown stem, stems are normalized to lower case, however there is a match due to direct comparison
        1
    );

    $highlighted = $snippetLine->getHighlighted('%s', false);

    $this->assertEquals(
        'Testing string to highlight some test values, Test is case-sensitive.',
        $highlighted
    );
}
52 |
/**
 * Builds a SnippetLine with the stems 'to' and 'highlight' and compares the
 * result of getHighlighted() with the expected string.
 *
 * NOTE(review): the string literals here may have lost markup during extraction - confirm against the repository.
 */
public function testJoinHighlighted()
{
    $snippetLine = new SnippetLine(
        'Testing string to highlight some test values, Test is case-sensitive.',
        SnippetSource::FORMAT_PLAIN_TEXT,
        new PorterStemmerEnglish(),
        ['to', 'highlight'],
        1
    );

    $highlighted = $snippetLine->getHighlighted('%s', false);

    $this->assertEquals(
        'Testing string to highlight some test values, Test is case-sensitive.',
        $highlighted
    );
}
68 |
/**
 * Expects getHighlighted() to throw a RuntimeException for the template passed below.
 *
 * NOTE(review): the template literal may have lost markup during extraction - confirm against the repository.
 */
public function testCreateHighlightedFail()
{
    $stems = ['test', 'is'];

    $snippetLine = new SnippetLine(
        'Testing string to highlight some test values, Test is case-sensitive.',
        SnippetSource::FORMAT_PLAIN_TEXT,
        new PorterStemmerEnglish(),
        $stems,
        2
    );

    // The expectation must be registered before the throwing call.
    $this->expectException(RuntimeException::class);
    $snippetLine->getHighlighted('', false);
}
81 | }
82 |
--------------------------------------------------------------------------------
/doc/rose.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/src/S2/Rose/Extractor/HtmlRegex/RegexExtractor.php:
--------------------------------------------------------------------------------
1 | ', '
', '
', '
',];
24 | $replaceTo = [' ', '', SentenceMap::LINE_SEPARATOR, SentenceMap::LINE_SEPARATOR, SentenceMap::LINE_SEPARATOR, SentenceMap::LINE_SEPARATOR];
25 |
26 | foreach ([
27 | '
',
28 | '
',
29 | '
',
30 | '
',
31 | '
', '', '', '', '', '',
40 | '
', '
', '', '',
41 | '', '', '', '', '', '',
42 | '
', '
', '', '',
43 | ] as $tag) {
44 | $replaceFrom[] = $tag;
45 | $replaceTo[] = self::PARAGRAPH_SEPARATOR . $tag;
46 | }
47 | foreach ([
48 | '', '', '', '', '', '',
49 | '
', '', '', '', '',
50 | '', '', '', '', '', '',
51 | '', '', '', '', '',
52 | ] as $tag) {
53 | $replaceFrom[] = $tag;
54 | $replaceTo[] = $tag . self::PARAGRAPH_SEPARATOR;
55 | }
56 |
57 | $text = str_replace($replaceFrom, $replaceTo, $text);
58 |
59 | $text = preg_replace('#<(script|style)[^>]*?>.*?\\1>#si', '', $text);
60 | $text = preg_replace('#<([a-z]+) [^>]*?index-skip[^>]*?>.*?\\1>#si', '', $text);
61 |
62 | $paragraphs = explode(self::PARAGRAPH_SEPARATOR, $text);
63 | $texts = array_map(static fn(string $string) => trim(strip_tags($string)), $paragraphs); // TODO allow some formatting
64 | $texts = array_filter($texts);
65 |
66 | $text = implode(' ', $texts);
67 |
68 | $text = html_entity_decode($text, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5);
69 |
70 | return new ExtractionResult(
71 | new ContentWithMetadata((new SentenceMap(SnippetSource::FORMAT_PLAIN_TEXT))->add(0, '', $text), new ImgCollection()),
72 | new ExtractionErrors()
73 | );
74 | }
75 | }
76 |
--------------------------------------------------------------------------------
/tests/unit/Rose/Entity/SentenceMapTest.php:
--------------------------------------------------------------------------------
1 | expectException(\LogicException::class);
21 | $s = new SentenceMap(SnippetSource::FORMAT_PLAIN_TEXT);
22 | $s->add(2, '/html/body/p[2]/text()[1]', 'Second');
23 | $s->add(2, '/html/body/p[2]/text()[1]', 'sentence. And a third one.');
24 | }
25 |
/**
 * Adds fragments under several paths and two paragraph indexes, then checks
 * the sentences produced by toSentenceCollection().
 */
public function testToArrayManyPaths(): void
{
    $map = new SentenceMap(SnippetSource::FORMAT_PLAIN_TEXT);
    $map->add(1, '/html/body/p[1]/text()', 'One sentence.');
    $map->add(2, '/html/body/p[2]/text()[1]', 'Second');
    $map->add(2, '/html/body/p[2]/br', ' ');
    $map->add(2, '/html/body/p[2]/text()[2]', 'sentence. And a third one...');

    $expected = [
        'One sentence.',
        'Second sentence.',
        'And a third one...',
    ];

    $this->assertEquals($expected, $map->toSentenceCollection()->toArray());
}
42 |
/**
 * Adds one multi-sentence text under an empty path and checks that
 * toSentenceCollection() yields four separate sentences.
 */
public function testToArrayOneLargePath(): void
{
    $map = new SentenceMap(SnippetSource::FORMAT_PLAIN_TEXT);
    $map->add(0, '', 'А это цитата, ее тоже надо индексировать. В цитате могут быть абзацы. Ошибка астатически даёт более простую систему. Еще 1 раз проверим, как gt работает защита против xss-уязвимостей.');

    $expected = [
        'А это цитата, ее тоже надо индексировать.',
        'В цитате могут быть абзацы.',
        'Ошибка астатически даёт более простую систему.',
        'Еще 1 раз проверим, как gt работает защита против xss-уязвимостей.',
    ];

    $this->assertEquals($expected, $map->toSentenceCollection()->toArray());
}
56 |
/**
 * Adds one two-sentence text under a single path and checks that
 * toSentenceCollection() yields two sentences.
 */
public function testToArrayOneLargePath2(): void
{
    $map = new SentenceMap(SnippetSource::FORMAT_PLAIN_TEXT);
    $map->add(2, '/html/body/p[2]/text()[1]', 'Second sentence. And a third one...');

    $expected = [
        'Second sentence.',
        'And a third one...',
    ];

    $this->assertEquals($expected, $map->toSentenceCollection()->toArray());
}
69 |
/**
 * Adds each sentence under its own path (with a separating fragment between them)
 * and checks that toSentenceCollection() yields two sentences.
 */
public function testToArrayPathPerSentence(): void
{
    $map = new SentenceMap(SnippetSource::FORMAT_PLAIN_TEXT);
    $map->add(2, '/html/body/p[2]/text()[1]', 'Second sentence.');
    $map->add(2, '/html/body/p[2]/br', ' ');
    $map->add(2, '/html/body/p[2]/text()[2]', 'And a third one...');

    $expected = [
        'Second sentence.',
        'And a third one...',
    ];

    $this->assertEquals($expected, $map->toSentenceCollection()->toArray());
}
84 | }
85 |
--------------------------------------------------------------------------------
/src/S2/Rose/Entity/WordPositionContainer.php:
--------------------------------------------------------------------------------
1 | [23, 56, 74],
18 | * 'word2' => [2, 57],
19 | * ]
20 | */
21 | public function __construct(array $data = [])
22 | {
23 | $this->data = $data;
24 | }
25 |
/**
 * Registers one more position for the word and keeps that word's position list sorted.
 *
 * @return self This container, for chaining.
 */
public function addWordAt(string $word, int $position): self
{
    $positions   = $this->data[$word] ?? [];
    $positions[] = $position;

    // TODO make more reliable requirement of input arrays to be sorted.
    sort($positions);

    $this->data[$word] = $positions;

    return $this;
}
34 |
/**
 * Compares the distances between every pair of words in this container with
 * the distances between the same words in a reference container.
 *
 * Pairs are taken in the key order of this container. A pair is skipped when the
 * reference container reports self::INFINITY for it. For every remaining pair the
 * reference distance is passed as the shift when measuring the distance here.
 *
 * @return array[] List of [word1, word2, distance] triples.
 */
public function compareWith(self $referenceContainer): array
{
    $words = array_keys($this->data);
    $total = \count($words);
    $pairs = [];

    foreach ($words as $firstIndex => $firstKey) {
        // Array keys may have been converted to integers by PHP; cast them back to strings.
        $first = (string)$firstKey;

        for ($secondIndex = $firstIndex + 1; $secondIndex < $total; $secondIndex++) {
            $second = (string)$words[$secondIndex];

            $expected = $referenceContainer->getClosestDistanceBetween($first, $second, 0);
            if ($expected !== self::INFINITY) {
                $pairs[] = [$first, $second, $this->getClosestDistanceBetween($first, $second, $expected)];
            }
        }
    }

    return $pairs;
}
60 |
/**
 * Finds the value ($a2[j] - $a1[i] - $shift) with the smallest absolute value
 * over the pairs visited by a two-pointer walk through both arrays.
 *
 * The walk is linear, so both input arrays must be sorted in ascending order;
 * the result is incorrect otherwise. Returns 0 as soon as an exact match is found
 * and self::INFINITY when either array is empty.
 *
 * @param int[] $a1
 * @param int[] $a2
 *
 * @return int It's important to return a signed value, not an absolute value.
 */
protected static function compareArrays(array $a1, array $a2, int $shift): int
{
    $count1  = \count($a1);
    $count2  = \count($a2);
    $closest = self::INFINITY;

    for ($i = 0, $j = 0; $i < $count1 && $j < $count2;) {
        $delta = $a2[$j] - $a1[$i] - $shift;

        if ($delta === 0) {
            return 0;
        }

        // Strict comparison: on a tie the earlier candidate is kept.
        if (abs($delta) < abs($closest)) {
            $closest = $delta;
        }

        // A negative delta means the $a2 value is too small, so advance in $a2; otherwise advance in $a1.
        if ($delta < 0) {
            $j++;
        } else {
            $i++;
        }
    }

    return $closest;
}
99 |
/**
 * Returns the signed closest distance from $word1 to $word2, reduced by $shift.
 *
 * @return int self::INFINITY when either word is absent from the container.
 */
public function getClosestDistanceBetween(string $word1, string $word2, int $shift = 0): int
{
    if (isset($this->data[$word1], $this->data[$word2])) {
        return self::compareArrays($this->data[$word1], $this->data[$word2], $shift);
    }

    return self::INFINITY;
}
108 | }
109 |
--------------------------------------------------------------------------------
/.github/workflows/test_mysql.yml:
--------------------------------------------------------------------------------
1 | name: Test on MySQL
2 |
3 | on: [ push ]
4 |
5 | env:
6 | DB_DATABASE: s2_rose_test
7 | DB_USER: root
8 | DB_PASSWORD: root
9 |
10 | jobs:
11 | build:
12 | strategy:
13 | fail-fast: false
14 | matrix:
15 | operating_system: ['ubuntu-22.04']
16 | mysql_versions:
17 | - 'mariadb-10.2'
18 | - 'mariadb-10.3'
19 | - 'mariadb-10.4'
20 | - 'mariadb-10.5'
21 | - 'mariadb-10.6'
22 | - 'mariadb-10.7'
23 | - 'mariadb-10.8'
24 | - 'mariadb-10.9'
25 | - 'mariadb-10.10'
26 | - 'mariadb-10.11'
27 | - 'mariadb-11.0'
28 | - 'mariadb-11.1'
29 | - 'mariadb-11.2'
30 | - 'mariadb-11.3'
31 | - 'mariadb-11.4'
32 | - 'mariadb-11.5'
33 | - 'mariadb-11.6'
34 | - 'mariadb-11.7'
35 | - 'mariadb-11.8'
36 | - 'mariadb-12.0'
37 | - 'mariadb-12.1'
38 | - '5.6'
39 | - '5.7'
40 | - '8.0'
41 | - '8.1'
42 | - '8.2'
43 | - '8.3'
44 | - '8.4'
45 | - '9.0'
46 | - '9.1'
47 | - '9.2'
48 | - '9.3'
49 | - '9.4'
50 | - '9.5'
51 | php_versions:
52 | - '8.4'
53 | include:
54 | -
55 | operating_system: 'ubuntu-22.04'
56 | mysql_versions: 'mariadb-11.8'
57 | php_versions: '7.4'
58 | -
59 | operating_system: 'ubuntu-22.04'
60 | mysql_versions: 'mariadb-11.8'
61 | php_versions: '8.0'
62 | -
63 | operating_system: 'ubuntu-22.04'
64 | mysql_versions: 'mariadb-11.8'
65 | php_versions: '8.1'
66 | -
67 | operating_system: 'ubuntu-22.04'
68 | mysql_versions: 'mariadb-11.8'
69 | php_versions: '8.2'
70 | -
71 | operating_system: 'ubuntu-22.04'
72 | mysql_versions: 'mariadb-11.8'
73 | php_versions: '8.3'
74 | -
75 | operating_system: 'ubuntu-22.04'
76 | mysql_versions: 'mariadb-11.8'
77 | php_versions: '8.5'
78 |
79 | runs-on: '${{ matrix.operating_system }}'
80 |
81 | steps:
82 | - uses: actions/checkout@v4
83 | - uses: shogo82148/actions-setup-mysql@v1
84 | with:
85 | mysql-version: ${{ matrix.mysql_versions }}
86 | root-password: ${{ env.DB_PASSWORD }}
87 |
88 | - name: 'Setup MySQL'
89 | run: |
90 | mysql -e 'SELECT version();' -u${{ env.DB_USER }} -h127.0.0.1 -p${{ env.DB_PASSWORD }}
91 | mysql -e 'CREATE DATABASE ${{ env.DB_DATABASE }};' -u${{ env.DB_USER }} -h127.0.0.1 -p${{ env.DB_PASSWORD }}
92 |
93 | - name: 'Setup PHP'
94 | uses: shivammathur/setup-php@v2
95 | with:
96 | php-version: ${{ matrix.php_versions }}
97 |
98 | - name: Install dependencies
99 | run: COMPOSER_MEMORY_LIMIT=-1 composer install --prefer-dist --no-interaction
100 |
101 | - name: Prepare config
102 | run: cp tests/config.php.dist.mysql tests/config.php
103 |
104 | - name: Run test cases
105 | run: php -d register_argc_argv=On bin/codecept run --skip-group profile
106 |
107 | - name: Run profiling
108 | if: success() || failure()
109 | run: php -d register_argc_argv=On bin/codecept run -g profile -d
--------------------------------------------------------------------------------
/src/S2/Rose/Entity/Metadata/SnippetSource.php:
--------------------------------------------------------------------------------
1 | $maxPosition) {
47 | throw new InvalidArgumentException('Minimal word position cannot be greater than maximal.');
48 | }
49 |
50 | if (!\in_array($formatId, self::ALLOWED_FORMATS)) {
51 | throw new InvalidArgumentException(sprintf('Unknown snippet format "%s".', $formatId));
52 | }
53 |
54 | $this->text = $text;
55 | $this->minPosition = $minPosition;
56 | $this->maxPosition = $maxPosition;
57 | $this->formatId = $formatId;
58 | }
59 |
/**
 * Returns the text of this snippet source.
 */
public function getText(): string
{
    return $this->text;
}
64 |
/**
 * Returns the minimal word position of this snippet source.
 */
public function getMinPosition(): int
{
    return $this->minPosition;
}
69 |
/**
 * Returns the maximal word position of this snippet source.
 */
public function getMaxPosition(): int
{
    return $this->maxPosition;
}
74 |
/**
 * Returns the format id of this snippet source.
 */
public function getFormatId(): int
{
    return $this->formatId;
}
79 |
/**
 * Tells whether at least one of the given positions lies within
 * [minPosition, maxPosition], both bounds inclusive.
 *
 * @param int[] $positions
 */
public function coversOneOfPositions(array $positions): bool
{
    foreach ($positions as $candidate) {
        $isInside = $candidate >= $this->minPosition && $candidate <= $this->maxPosition;
        if ($isInside) {
            return true;
        }
    }

    return false;
}
93 |
/**
 * String conversion yields the snippet text.
 */
public function __toString()
{
    return $this->text;
}
98 | }
99 |
--------------------------------------------------------------------------------
/tests/unit/Rose/Storage/SingleFileArrayStorageTest.php:
--------------------------------------------------------------------------------
1 | getTempFilename());
32 | }
33 |
/**
 * Round-trip scenario for SingleFileArrayStorage: adds TOC entries and
 * full-text data, saves, reloads from the same file and checks that the data
 * is still there and can be extended.
 */
public function testStorage()
{
    $storage = new SingleFileArrayStorage($this->getTempFilename());
    $storage->load();

    // Two TOC entries; internal ids are expected to be 1 and 2.
    $firstEntry = new TocEntry('test title 1', '', new \DateTime(), '', 1, '4567890lkjhgfd');
    $storage->addEntryToToc($firstEntry, new ExternalId('test_id_1'));

    $secondEntry = new TocEntry('test title 2', '', new \DateTime(), '', 1, 'edfghj8765rfg');
    $storage->addEntryToToc($secondEntry, new ExternalId('test_id_2'));

    $entry1 = $storage->getTocByExternalId(new ExternalId('test_id_1'));
    $entry2 = $storage->getTocByExternalId(new ExternalId('test_id_2'));
    $this->assertEquals(1, $entry1->getInternalId());
    $this->assertEquals(2, $entry2->getInternalId());

    // Index title, keyword and content words for the first entry.
    $storage->addToFulltextIndex(
        ['titleword'],
        ['keyword1', 'keyword2'],
        [1 => 'hello', 2 => 'world', 3 => 'world'],
        new ExternalId('test_id_1')
    );

    $helloInfo = $storage->fulltextResultByWords(['hello'], null)->toArray()['hello'];
    $this->assertArrayHasKey(':test_id_1', $helloInfo);
    $this->assertEquals([1], $helloInfo[':test_id_1']->getContentPositions());
    $this->assertEquals([], $helloInfo[':test_id_1']->getTitlePositions());
    $this->assertEquals([], $helloInfo[':test_id_1']->getKeywordPositions());

    $worldInfo = $storage->fulltextResultByWords(['world'], null)->toArray()['world'];
    $this->assertArrayHasKey(':test_id_1', $worldInfo);
    $this->assertEquals([2, 3], $worldInfo[':test_id_1']->getContentPositions());
    $this->assertEquals([], $worldInfo[':test_id_1']->getTitlePositions());
    $this->assertEquals([], $worldInfo[':test_id_1']->getKeywordPositions());

    $storage->save();

    // Reload from the same file with a fresh instance.
    $storage = new SingleFileArrayStorage($this->getTempFilename());
    $storage->load();

    $entry1 = $storage->getTocByExternalId(new ExternalId('test_id_1'));
    $this->assertEquals('test title 1', $entry1->getTitle());
    $this->assertEquals('4567890lkjhgfd', $entry1->getHash());

    $entry3 = $storage->getTocByExternalId(new ExternalId('test_id_3'));
    $this->assertNull($entry3);

    // Extend the reloaded index with content for the second entry.
    $storage->addToFulltextIndex([], [], [10 => 'hello', 20 => 'world'], new ExternalId('test_id_2'));

    $worldInfo = $storage->fulltextResultByWords(['world'], null)->toArray()['world'];
    $this->assertArrayHasKey(':test_id_1', $worldInfo);
    $this->assertEquals([2, 3], $worldInfo[':test_id_1']->getContentPositions());
    $this->assertArrayHasKey(':test_id_2', $worldInfo);
    $this->assertEquals([20], $worldInfo[':test_id_2']->getContentPositions());

    $storage->save();
}
93 | }
94 |
--------------------------------------------------------------------------------
/src/S2/Rose/Entity/Indexable.php:
--------------------------------------------------------------------------------
1 | externalId = new ExternalId($id, $instanceId);
23 | $this->title = $title;
24 | $this->content = $content;
25 | }
26 |
/**
 * Returns the external id of this indexable item.
 */
public function getExternalId(): ExternalId
{
    return $this->externalId;
}
31 |
/**
 * Returns the title of this indexable item.
 */
public function getTitle(): string
{
    return $this->title;
}
36 |
/**
 * Replaces the title and returns this item for chaining.
 */
public function setTitle(string $title): self
{
    $this->title = $title;

    return $this;
}
43 |
/**
 * Returns the content of this indexable item.
 */
public function getContent(): string
{
    return $this->content;
}
48 |
/**
 * Replaces the content and returns this item for chaining.
 */
public function setContent(string $content): self
{
    $this->content = $content;

    return $this;
}
55 |
/**
 * Returns the keywords string of this indexable item.
 */
public function getKeywords(): string
{
    return $this->keywords;
}
60 |
/**
 * Replaces the keywords string and returns this item for chaining.
 */
public function setKeywords(string $keywords): self
{
    $this->keywords = $keywords;

    return $this;
}
67 |
/**
 * Returns the description of this indexable item.
 */
public function getDescription(): string
{
    return $this->description;
}
72 |
/**
 * Replaces the description and returns this item for chaining.
 */
public function setDescription(string $description): self
{
    $this->description = $description;

    return $this;
}
79 |
/**
 * Returns the date of this indexable item, or null when no date is set.
 */
public function getDate(): ?\DateTime
{
    return $this->date;
}
84 |
/**
 * Sets or clears the date and returns this item for chaining.
 *
 * The parameter is declared explicitly nullable: relying on a null default to
 * make a typed parameter nullable is deprecated since PHP 8.4. Callers are
 * unaffected - both setDate() and setDate(null) keep working.
 *
 * @param \DateTime|null $date New date, or null to clear it.
 */
public function setDate(?\DateTime $date = null): self
{
    $this->date = $date;

    return $this;
}
91 |
/**
 * Returns the URL of this indexable item.
 */
public function getUrl(): string
{
    return $this->url;
}
96 |
/**
 * Replaces the URL and returns this item for chaining.
 */
public function setUrl(string $url): self
{
    $this->url = $url;

    return $this;
}
103 |
/**
 * Returns the relevance ratio of this indexable item.
 */
public function getRelevanceRatio(): float
{
    return $this->relevanceRatio;
}
108 |
/**
 * Sets the relevance ratio after checking that it is within [0.001, 9999].
 *
 * @throws \DomainException When the ratio is below 0.001 or above 9999.
 */
public function setRelevanceRatio(float $relevanceRatio): self
{
    $tooSmall = $relevanceRatio < 0.001;
    if ($tooSmall) {
        throw new \DomainException('Relevance ratio must not be less than 0.001.');
    }

    $tooLarge = $relevanceRatio > 9999;
    if ($tooLarge) {
        throw new \DomainException('Relevance ratio must not be greater than 9999.');
    }

    $this->relevanceRatio = $relevanceRatio;

    return $this;
}
122 |
/**
 * Builds a TocEntry from this item's title, description, date, URL,
 * relevance ratio and the hash computed by calcHash().
 */
public function toTocEntry(): TocEntry
{
    $title          = $this->getTitle();
    $description    = $this->getDescription();
    $date           = $this->getDate();
    $url            = $this->getUrl();
    $relevanceRatio = $this->getRelevanceRatio();
    $hash           = $this->calcHash();

    return new TocEntry($title, $description, $date, $url, $relevanceRatio, $hash);
}
134 |
/**
 * Returns the MD5 of the serialized [title, description, keywords, content] list.
 *
 * Date, URL and relevance ratio are not part of the hash.
 */
public function calcHash(): string
{
    $hashedFields = [
        $this->getTitle(),
        $this->getDescription(),
        $this->getKeywords(),
        $this->getContent(),
    ];

    return md5(serialize($hashedFields));
}
144 | }
145 |
--------------------------------------------------------------------------------
/src/S2/Rose/Extractor/HtmlDom/DomState.php:
--------------------------------------------------------------------------------
1 | sentenceMap = new SentenceMap(SnippetSource::FORMAT_INTERNAL);
38 | }
39 |
/**
 * Adds a piece of text found at $path to the sentence map.
 *
 * If a new paragraph was requested, the paragraph index is advanced first.
 * HTML entities are decoded, backslashes are doubled, and any pending
 * formatting symbols are prepended before the text is stored.
 */
public function attachContent(string $path, string $textContent): void
{
    if ($this->startNewParagraph) {
        $this->startNewParagraph = false;
        $this->currentParagraphIndex++;
    }

    /**
     * Decode all entities. '&' was encoded before and decoded in DOM processing.
     * @see \S2\Rose\Extractor\HtmlDom\DomExtractor::getDomDocument
     */
    $decoded = html_entity_decode($textContent, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5);

    // Double the backslashes in the text, since formatting symbols also start with a backslash.
    $escaped = str_replace('\\', '\\\\', $decoded);

    $prefix                  = $this->pendingFormatting;
    $this->pendingFormatting = '';

    $this->sentenceMap->add($this->currentParagraphIndex, $path, $prefix . $escaped);
}
58 |
/**
 * Raises the flag that makes the next attachContent() call advance the paragraph index.
 */
public function startNewParagraph(): void
{
    $this->startNewParagraph = true;
}
63 |
/**
 * Opens one nesting level of the given formatting.
 *
 * The opening symbol (backslash + formatting code) is queued only for the
 * outermost level; nested starts just increase the counter.
 *
 * @throws \LogicException When the formatting is not in self::ALLOWED_FORMATTING.
 */
public function startFormatting(string $formatting): void
{
    if (!\in_array($formatting, self::ALLOWED_FORMATTING, true)) {
        throw new \LogicException(sprintf('Unknown formatting "%s".', $formatting));
    }

    $level = 1 + ($this->formattingLevel[$formatting] ?? 0);

    $this->formattingLevel[$formatting] = $level;

    if ($level === 1) {
        $this->pendingFormatting .= '\\' . $formatting;
    }
}
74 |
/**
 * Closes one nesting level of the given formatting.
 *
 * The closing symbol (backslash + upper-cased formatting code) is emitted only
 * when the outermost level is closed. A stop without a matching start is ignored.
 *
 * @throws \LogicException When the formatting is not in self::ALLOWED_FORMATTING.
 */
public function stopFormatting(string $formatting): void
{
    if (!\in_array($formatting, self::ALLOWED_FORMATTING, true)) {
        throw new \LogicException(sprintf('Unknown formatting "%s".', $formatting));
    }

    $level = $this->formattingLevel[$formatting] ?? 0;
    if ($level <= 0) {
        // Nothing is open for this formatting.
        return;
    }

    if ($level === 1) {
        $closingSymbol = '\\' . strtoupper($formatting);

        if ($this->pendingFormatting === '') {
            // No format symbols are queued. This means that symbols of formatting start have already been added
            // to SentenceMap. So it is not empty and the last item can be modified.
            $this->sentenceMap->appendToLastItem($closingSymbol);
        } else {
            $this->pendingFormatting .= $closingSymbol;
        }
    }

    $this->formattingLevel[$formatting] = $level - 1;
}
94 |
/**
 * Records an image with the given attributes.
 */
public function attachImg(string $src, string $width, string $height, string $alt): void
{
    $image = new Img($src, $width, $height, $alt);

    $this->images[] = $image;
}
99 |
/**
 * Wraps the collected sentence map and images into a ContentWithMetadata.
 */
public function toContentWithMetadata(): ContentWithMetadata
{
    $imgCollection = new ImgCollection(...$this->images);

    return new ContentWithMetadata($this->sentenceMap, $imgCollection);
}
104 | }
105 |
--------------------------------------------------------------------------------
/src/S2/Rose/Entity/Metadata/SentenceCollection.php:
--------------------------------------------------------------------------------
1 | formatId = $formatId;
24 | }
25 |
/**
 * Appends a sentence to the collection.
 *
 * Whitespace runs in the text are collapsed to a single space and the result is trimmed.
 * Both derived caches (words and snippet sources) are dropped, because they are
 * built from the sentence list and would otherwise be stale: getSnippetSources()
 * only rebuilds when its own cache is null.
 *
 * @param string $text Text content of a sentence. Must be formatted according to the constructor parameter.
 * @return void
 */
public function attach(string $text): void
{
    $this->cachedWords          = null;
    $this->cachedSnippetSources = null;

    $this->sentences[] = trim(preg_replace('#\\s+#', ' ', $text));
}
35 |
/**
 * Returns all sentences joined with a single space.
 */
public function getText(): string
{
    $separator = ' ';

    return implode($separator, $this->sentences);
}
40 |
/**
 * Returns the stored sentences as they are kept internally.
 *
 * @internal Used for tests only!
 */
public function toArray(): array
{
    return $this->sentences;
}
48 |
/**
 * Returns the words of all sentences, building the cache on first use.
 *
 * @return string[]
 */
public function getWordsArray(): array
{
    if ($this->cachedWords !== null) {
        return $this->cachedWords;
    }

    $this->buildWordsInfo();

    return $this->cachedWords;
}
60 |
/**
 * Returns the snippet sources built from the sentences, building the cache on first use.
 *
 * @return SnippetSource[]
 */
public function getSnippetSources(): array
{
    if ($this->cachedSnippetSources !== null) {
        return $this->cachedSnippetSources;
    }

    $this->buildWordsInfo();

    return $this->cachedSnippetSources;
}
72 |
/**
 * Rebuilds both caches from the sentence list.
 *
 * Every sentence is broken into words (internal formatting is cleared first
 * when the collection uses FORMAT_INTERNAL). Sentences with at least two words
 * get a SnippetSource covering their word-position range; positions are counted
 * continuously across sentences. Finally the per-sentence word lists are
 * flattened into one list.
 */
private function buildWordsInfo(): void
{
    $this->cachedWords          = [];
    $this->cachedSnippetSources = [];

    $offset = 0;
    foreach ($this->sentences as $idx => $sentence) {
        // NOTE: maybe it's worth to join sentences somehow before exploding for optimization reasons
        $plainSentence = $this->formatId === SnippetSource::FORMAT_INTERNAL
            ? StringHelper::clearInternalFormatting($sentence)
            : $sentence;

        $sentenceWords       = self::breakIntoWords($plainSentence);
        $this->cachedWords[] = $sentenceWords;

        $wordCount = \count($sentenceWords);
        if ($wordCount === 0) {
            continue;
        }

        $lastPosition = $offset + $wordCount - 1;

        // Skip too short snippets
        if ($wordCount >= 2) {
            $this->cachedSnippetSources[$idx] = new SnippetSource($sentence, $this->formatId, $offset, $lastPosition);
        }

        $offset = $lastPosition + 1;
    }

    $this->cachedWords = array_merge(...$this->cachedWords);
}
98 |
/**
 * Normalizes the text and splits it into words.
 *
 * @return string[]
 */
public static function breakIntoWords(string $content): array
{
    // Replace decimal separator: ',' -> '.'
    $normalized = preg_replace('#(?:^|[\s()])-?\d+\K,(?=\d+(?:$|[\s()]|\.\s))#', '.', $content);

    // We allow letters, digits and some punctuation: ".,-^_"
    $normalized = str_replace(',', ', ', $normalized);
    $normalized = preg_replace('#[^\\-.,0-9\\p{L}^_]+#u', ' ', $normalized);

    // Lower-case everything and replace 'ё' with 'е'.
    $normalized = str_replace(['ё'], ['е'], mb_strtolower($normalized));

    // These punctuation characters are meant to be inside words and numbers.
    // Remove trailing characters when splitting the words.
    $normalized = rtrim($normalized, '-.,');

    $words = preg_split('#[\\-.,]*?[ ]+#S', $normalized);
    StringHelper::removeLongWords($words);

    return $words;
}
122 | }
123 |
--------------------------------------------------------------------------------
/tests/unit/Rose/Entity/QueryTest.php:
--------------------------------------------------------------------------------
1 | assertEquals([1, 2], (new Query('1|||2'))->valueToArray());
22 | $this->assertEquals([1, 2], (new Query('1\\\\\\2'))->valueToArray());
23 | $this->assertEquals(['a', 'b'], (new Query('a/b'))->valueToArray());
24 | $this->assertEquals(['a', 'b'], (new Query(' a b '))->valueToArray());
25 | $this->assertEquals(['..'], (new Query('..'))->valueToArray());
26 | $this->assertEquals(['...'], (new Query('...'))->valueToArray());
27 | $this->assertEquals(['a..b'], (new Query('a..b'))->valueToArray());
28 |
29 | // Tests for replacing numbers
30 | $this->assertEquals(['1.2'], (new Query('1,2'))->valueToArray());
31 | // $this->assertEquals(['-1.2'], (new Query('-1,2'))->valueToArray());
32 | $this->assertEquals(['1.2'], (new Query('1.2'))->valueToArray());
33 |
34 | // Tests for replacing typographic quotes
35 | $this->assertEquals(['"', 'text'], (new Query('«text»'))->valueToArray());
36 | $this->assertEquals(['"', 'text'], (new Query('“text”'))->valueToArray());
37 |
38 | // Tests for replacing dashes
39 | $this->assertEquals(['a--b'], (new Query('a--b'))->valueToArray());
40 | $this->assertEquals(['a—b'], (new Query('a---b'))->valueToArray()); // --- to mdash
41 | $this->assertEquals(['a—b'], (new Query('a–b'))->valueToArray()); // ndash to mdash
42 | $this->assertEquals(['a-b'], (new Query('a−b'))->valueToArray()); // Minus to hyphen
43 |
44 | // Test for replacing line breaks and extra spaces
45 | $this->assertEquals(['a', 'b'], (new Query("a\n\nb"))->valueToArray());
46 | $this->assertEquals(['a', 'b'], (new Query("a \t b"))->valueToArray());
47 |
48 | // Tests for separating special characters
49 | $this->assertEquals(['a!b'], (new Query('a!b'))->valueToArray());
50 | $this->assertEquals(['!', 'ab'], (new Query('!ab'))->valueToArray());
51 | $this->assertEquals(['!', 'a!b'], (new Query('!a!b'))->valueToArray());
52 | $this->assertEquals(['(', 'word', ')'], (new Query('(word)'))->valueToArray());
53 | $this->assertEquals(['mysql', '--all-databases'], (new Query('mysql --all-databases'))->valueToArray());
54 |
55 | // Test for replacing "ё" with "е"
56 | $this->assertEquals(['ё', 'полет', 'field'], (new Query('ё полёт field'))->valueToArray());
57 |
58 | // Tests for handling commas
59 | $this->assertEquals(['a', ',', 'b'], (new Query('a,b'))->valueToArray());
60 | $this->assertEquals(['a', ',,', 'b'], (new Query('a,,b'))->valueToArray());
61 | $this->assertEquals(['a', ',,,', 'b'], (new Query('a,,,b'))->valueToArray());
62 |
63 | // Tests for removing long words
64 | $this->assertEquals(['a', 'c'], (new Query('a ' . str_repeat('b', 101) . ' c'))->valueToArray());
65 |
66 | // Tests for compatibility of multiple rules
67 | $this->assertEquals(['a—b', '"', 'text'], (new Query('a–b «text»'))->valueToArray());
68 | $this->assertEquals(['a', ',', 'b'], (new Query(" a, \n b "))->valueToArray());
69 | $this->assertEquals(
70 | ['похоже', ',', 'лучшие', 'времена', 'наступили', 'я', 'решил', 'доработать', 'и', 'опубликовать', 'движок'],
71 | (new Query('Похоже, лучшие времена наступили. Я решил доработать и опубликовать движок.'))->valueToArray()
72 | );
73 |
74 | // Invalid inputs
75 | $this->assertSame([], (new Query(null))->valueToArray());
76 | $this->assertSame([], (new Query(['foo' => 'bar']))->valueToArray());
77 | $this->assertSame(['ре'], array_values((new Query(rawurldecode('%D1%80%D0%B5%D0')))->valueToArray()));
78 | }
79 | }
80 |
--------------------------------------------------------------------------------
/tests/unit/Rose/Entity/ResultSetTest.php:
--------------------------------------------------------------------------------
1 | prepareResult(new ResultSet());
29 | $data = $result->getSortedRelevanceByExternalId();
30 | $this->assertCount(30, $data);
31 |
32 | $result = $this->prepareResult(new ResultSet(2));
33 | $data = $result->getSortedRelevanceByExternalId();
34 | $this->assertCount(2, $data);
35 | $this->assertEquals(30, $result->getTotalCount());
36 | $this->assertEquals(39, $data[':id_29']);
37 | $this->assertEquals(38, $data[':id_28']);
38 |
39 | $result = $this->prepareResult(new ResultSet(4, 3));
40 | $data = $result->getSortedRelevanceByExternalId();
41 | $this->assertCount(4, $data);
42 | $this->assertEquals(30, $result->getTotalCount());
43 | $this->assertEquals(36, $data[':id_26']);
44 | $this->assertEquals(35, $data[':id_25']);
45 | $this->assertEquals(34, $data[':id_24']);
46 | $this->assertEquals(33, $data[':id_23']);
47 | }
48 |
/**
 * A result set frozen without any added weights must expose no items.
 */
public function testEmpty()
{
    $emptyResultSet = new ResultSet();
    $emptyResultSet->freeze();

    $this->assertCount(0, $emptyResultSet->getItems());
}
56 |
/**
 * getItems() on a result set that was never frozen is expected to throw ImmutableException.
 */
public function testNotFrozenGetItems()
{
    $this->expectException(ImmutableException::class);
    $resultSet = new ResultSet();
    $resultSet->getItems();
}
63 |
/**
 * Attaching a snippet for an external id that was never added to the result set
 * is expected to throw UnknownIdException.
 */
public function testNotFrozenAttachSnippet()
{
    $this->expectException(UnknownIdException::class);
    $resultSet = new ResultSet();
    $resultSet->attachSnippet(new ExternalId('not found'), new Snippet('%s', new SnippetLine('', SnippetSource::FORMAT_PLAIN_TEXT, new PorterStemmerEnglish(), [], 0.0)));
}
70 |
/**
 * getFoundExternalIds() on a result set that was never frozen is expected to throw ImmutableException.
 */
public function testNotFrozenGetFoundExternalIds()
{
    $this->expectException(ImmutableException::class);
    $resultSet = new ResultSet();
    $resultSet->getFoundExternalIds();
}
77 |
/**
 * getFoundWordPositionsByExternalId() on a result set that was never frozen
 * is expected to throw ImmutableException.
 */
public function testNotFrozenGetFoundWordsByExternalId()
{
    $this->expectException(ImmutableException::class);
    $resultSet = new ResultSet();
    $resultSet->getFoundWordPositionsByExternalId();
}
84 |
/**
 * getSortedExternalIds() on a result set that was never frozen is expected to throw ImmutableException.
 */
public function testNotFrozenGetSortedExternalIds()
{
    $this->expectException(ImmutableException::class);
    $resultSet = new ResultSet();
    $resultSet->getSortedExternalIds();
}
91 |
/**
 * getSortedRelevanceByExternalId() on a result set that was never frozen
 * is expected to throw ImmutableException.
 */
public function testNotFrozenGetSortedRelevanceByExternalId()
{
    $this->expectException(ImmutableException::class);
    $resultSet = new ResultSet();
    $resultSet->getSortedRelevanceByExternalId();
}
98 |
/**
 * Fills the given result set with 30 external ids ("id_29" down to "id_0") and freezes it.
 *
 * Every id receives the weight ['test' => <index>] for the word "test1"
 * and the constant weight ['test' => 10] for the word "test2".
 *
 * @param ResultSet $result
 *
 * @return ResultSet The same instance, already frozen.
 * @throws ImmutableException
 * @throws \S2\Rose\Exception\InvalidArgumentException
 */
private function prepareResult(ResultSet $result)
{
    // Counting down keeps the insertion order identical to the descending index order.
    for ($index = 29; $index >= 0; $index--) {
        $externalId = new ExternalId('id_' . $index);
        $result->addWordWeight('test1', $externalId, ['test' => $index]);
        $result->addWordWeight('test2', $externalId, ['test' => 10]);
    }

    $result->freeze();

    return $result;
}
118 | }
119 |
--------------------------------------------------------------------------------
/src/S2/Rose/Snippet/SnippetBuilder.php:
--------------------------------------------------------------------------------
1 | stemmer = $stemmer;
34 | $this->snippetLineSeparator = $snippetLineSeparator;
35 | }
36 |
/**
 * Stores the regex list that buildSnippet() hands to every created SnippetLine
 * through SnippetLine::setMaskRegexArray().
 *
 * @param array $regexes
 *
 * @return self For call chaining.
 */
public function setHighlightMaskRegexArray(array $regexes): self
{
    $this->highlightMaskRegexArray = $regexes;

    return $this;
}
43 |
/**
 * Builds a snippet for every external id reported by $snippetResult
 * and attaches it to $result.
 *
 * @throws ImmutableException
 * @throws UnknownIdException
 */
public function attachSnippets(ResultSet $result, SnippetResult $snippetResult): self
{
    $positionsByExternalId = $result->getFoundWordPositionsByExternalId();

    $snippetResult->iterate(
        function (ExternalId $externalId, SnippetSource ...$snippets) use ($positionsByExternalId, $result) {
            $foundPositionsByStems = $positionsByExternalId[$externalId->toString()];
            $highlightTemplate     = $result->getHighlightTemplate();
            $relevanceByStems      = $result->getRelevanceByStemsFromId($externalId);

            $result->attachSnippet(
                $externalId,
                $this->buildSnippet($foundPositionsByStems, $highlightTemplate, $relevanceByStems, ...$snippets)
            );
        }
    );

    return $this;
}
64 |
/**
 * Builds a Snippet out of the given snippet sources.
 *
 * The first two sources always become the introduction lines. A source additionally
 * becomes a regular snippet line when it covers at least one position of a found stem;
 * its relevance is the sum of $relevanceByStems over all stems it covers.
 *
 * @param array  $foundPositionsByStems Map stem => positions; stems with empty positions are ignored.
 * @param string $highlightTemplate     Template passed to the Snippet.
 * @param array  $relevanceByStems      Map stem => relevance; a missing stem counts as 0.
 */
public function buildSnippet(array $foundPositionsByStems, string $highlightTemplate, array $relevanceByStems, SnippetSource ...$snippetSources): Snippet
{
    $stems             = [];
    $relevanceBySource = [];
    foreach ($foundPositionsByStems as $stem => $positions) {
        if (empty($positions)) {
            // Not a fulltext search result (e.g. title from single keywords)
            continue;
        }

        $stems[] = $stem;
        $stemRelevance = $relevanceByStems[$stem] ?? 0;
        foreach ($snippetSources as $sourceIndex => $source) {
            if (!$source->coversOneOfPositions($positions)) {
                continue;
            }
            $relevanceBySource[$sourceIndex] = ($relevanceBySource[$sourceIndex] ?? 0) + $stemRelevance;
        }
    }

    $introLines = array_map(
        static fn(SnippetSource $s) => SnippetLine::createFromSnippetSourceWithoutFoundWords($s),
        \array_slice($snippetSources, 0, 2)
    );
    $snippet = new Snippet($highlightTemplate, ...$introLines);

    if ($this->snippetLineSeparator !== null) {
        $snippet->setLineSeparator($this->snippetLineSeparator);
    }

    if ($stems === []) {
        // No stem had positions: only the introduction is available.
        return $snippet;
    }

    foreach ($snippetSources as $sourceIndex => $source) {
        if (isset($relevanceBySource[$sourceIndex])) {
            $line = new SnippetLine(
                $source->getText(),
                $source->getFormatId(),
                $this->stemmer,
                $stems,
                $relevanceBySource[$sourceIndex]
            );
            $line->setMaskRegexArray($this->highlightMaskRegexArray);

            $snippet->attachSnippetLine($source->getMinPosition(), $source->getMaxPosition(), $line);
        }
    }

    return $snippet;
}
119 | }
120 |
--------------------------------------------------------------------------------
/src/S2/Rose/Storage/File/SingleFileArrayStorage.php:
--------------------------------------------------------------------------------
1 | filename = $filename;
27 | $this->fulltextProxy = new ArrayFulltextStorage();
28 | }
29 |
/**
 * Loads the index from the storage file into memory.
 *
 * Nothing happens when the TOC is already populated or when the file does not exist.
 * The file is read as four consecutive lines, each holding a "//" marker followed by
 * a serialized section: fulltext index, excluded words, metadata and TOC (in that order).
 *
 * @param bool $isDebug When true, profile points for reading and unserializing are collected.
 *
 * @return array Collected profile points (empty unless $isDebug is true).
 *
 * @throws \RuntimeException When the file cannot be read or a section has no "//" marker.
 */
public function load(bool $isDebug = false): array
{
    $return = [];
    if (\count($this->toc)) {
        return $return;
    }

    if (!is_file($this->filename)) {
        return $return;
    }

    if ($isDebug) {
        $start_time = microtime(true);
    }

    $data = file_get_contents($this->filename);
    if ($data === false) {
        // Fail explicitly instead of passing false on to the section parser.
        throw new \RuntimeException(sprintf('Cannot read index file "%s".', $this->filename));
    }

    if ($isDebug) {
        $return[] = ProfileHelper::getProfilePoint('Reading index file', -$start_time + ($start_time = microtime(true)));
    }

    // Only these classes may be instantiated while unserializing the index.
    $unserializeOptions = ['allowed_classes' => [
        \DateTime::class,
        TocEntry::class,
        Img::class,
        ImgCollection::class,
        SnippetSource::class,
    ]];

    // Each call consumes one line of $data (it is passed by reference).
    $myData = $this->extractSerializedSection($data);
    $this->fulltextProxy->setFulltextIndex(unserialize($myData, $unserializeOptions) ?: []);

    $myData = $this->extractSerializedSection($data);
    $this->excludedWords = unserialize($myData, $unserializeOptions) ?: [];

    $myData = $this->extractSerializedSection($data);
    $this->metadata = unserialize($myData, $unserializeOptions) ?: [];

    $myData = $this->extractSerializedSection($data);
    $this->toc = unserialize($myData, $unserializeOptions) ?: [];

    if ($isDebug) {
        $return[] = ProfileHelper::getProfilePoint('Unserializing index', -$start_time + ($start_time = microtime(true)));
    }

    // Rebuild the internal id => external id lookup from the TOC keys.
    $this->externalIdMap = [];
    foreach ($this->toc as $serializedExtId => $entry) {
        $this->externalIdMap[$entry->getInternalId()] = ExternalId::fromString($serializedExtId);
    }

    return $return;
}
82 |
/**
 * Writes the in-memory index to the storage file and clears the in-memory state.
 *
 * The existing file is removed first. The fulltext index is serialized entry by entry
 * and flushed whenever the buffer grows beyond 100000 bytes; excluded words, metadata
 * and TOC follow, each on its own line after a "//" marker.
 */
public function save(): void
{
    @unlink($this->filename);
    // NOTE(review): the next line appears truncated in this copy (the beginning of the
    // string literal and of the count expression is missing) — restore it from the original source.
    file_put_contents($this->filename, 'fulltextProxy->getFulltextIndex()) . ':{');
    $buffer = '';
    $length = 0;
    foreach ($this->fulltextProxy->getFulltextIndex() as $word => $data) {
        $chunk = serialize($word) . serialize($data);
        $length += \strlen($chunk);
        $buffer .= $chunk;
        // Flush in chunks so the whole serialized index is never held in one string.
        if ($length > 100000) {
            file_put_contents($this->filename, $buffer, FILE_APPEND);
            $buffer = '';
            $length = 0;
        }
    }
    file_put_contents($this->filename, $buffer . '}' . "\n", FILE_APPEND);
    $this->fulltextProxy->setFulltextIndex([]);

    file_put_contents($this->filename, ' //' . serialize($this->excludedWords) . "\n", FILE_APPEND);
    $this->excludedWords = [];

    file_put_contents($this->filename, ' //' . serialize($this->metadata) . "\n", FILE_APPEND);
    $this->metadata = [];

    file_put_contents($this->filename, ' //' . serialize($this->toc) . "\n", FILE_APPEND);
    $this->toc = [];
}
111 |
/**
 * Cuts the first line off $data and returns the part of that line after the "//" marker.
 *
 * @param string $data Remaining file content; the consumed line (and its newline) is removed in place.
 *
 * @return string Text following the first "//" on the consumed line.
 *
 * @throws \RuntimeException When the consumed line contains no "//" marker.
 */
private function extractSerializedSection(string &$data): string
{
    $newlineOffset = strpos($data, "\n");
    if ($newlineOffset !== false) {
        $currentLine = substr($data, 0, $newlineOffset);
        $data        = substr($data, $newlineOffset + 1);
    } else {
        // Last line without a trailing newline: consume everything.
        $currentLine = $data;
        $data        = '';
    }

    $markerOffset = strpos($currentLine, '//');
    if ($markerOffset === false) {
        throw new \RuntimeException('Broken SingleFileArrayStorage format: "//" marker not found.');
    }

    return substr($currentLine, $markerOffset + 2);
}
130 | }
131 |
--------------------------------------------------------------------------------
/src/S2/Rose/Entity/ResultItem.php:
--------------------------------------------------------------------------------
1 | id = $id;
47 | $this->instanceId = $instanceId;
48 | $this->title = $title;
49 | $this->description = $description;
50 | $this->date = $date;
51 | $this->url = $url;
52 | $this->relevanceRatio = $relevanceRatio;
53 | $this->imgCollection = $imgCollection;
54 | $this->highlightTemplate = $highlightTemplate;
55 | }
56 |
/**
 * Stores the snippet used by getSnippet() and getFormattedSnippet().
 *
 * @return self For call chaining.
 */
public function setSnippet(Snippet $snippet): self
{
    $this->snippet = $snippet;

    return $this;
}
63 |
/**
 * Stores the relevance value returned by getRelevance().
 *
 * @return self For call chaining.
 */
public function setRelevance(float $relevance): self
{
    $this->relevance = $relevance;

    return $this;
}
70 |
/**
 * Returns the stored item id.
 */
public function getId(): string
{
    return $this->id;
}
75 |
/**
 * Returns the stored instance id, or null when none is set.
 */
public function getInstanceId(): ?int
{
    return $this->instanceId;
}
80 |
/**
 * Returns the stored title.
 */
public function getTitle(): string
{
    return $this->title;
}
85 |
/**
 * Returns the stored description.
 */
public function getDescription(): string
{
    return $this->description;
}
90 |
/**
 * Returns the stored date, or null when none is set.
 */
public function getDate(): ?\DateTime
{
    return $this->date;
}
95 |
/**
 * Returns the stored URL.
 */
public function getUrl(): string
{
    return $this->url;
}
100 |
/**
 * Returns the stored relevance ratio.
 */
public function getRelevanceRatio(): float
{
    return $this->relevanceRatio;
}
105 |
/**
 * Returns the stored relevance (see setRelevance()).
 */
public function getRelevance(): float
{
    return $this->relevance;
}
110 |
/**
 * Returns the snippet text without formatting.
 *
 * Falls back to the description when no snippet is attached; when the snippet
 * renders to an empty value, falls back to the description or, if that is empty
 * too, to the snippet's text introduction.
 */
public function getSnippet(): string
{
    return $this->renderSnippet(false);
}

/**
 * Same as getSnippet(), but the snippet is rendered with formatting included.
 */
public function getFormattedSnippet(): string
{
    return $this->renderSnippet(true);
}

/**
 * Shared implementation of getSnippet() and getFormattedSnippet().
 *
 * @param bool $includeFormatting Forwarded to Snippet::toString().
 */
private function renderSnippet(bool $includeFormatting): string
{
    if ($this->snippet === null) {
        return $this->description;
    }

    $snippet = $this->snippet->toString($includeFormatting);
    if ($snippet) {
        return $snippet;
    }

    // NOTE(review): the introduction fallback is always rendered without formatting,
    // exactly as both original methods did — confirm this is intended for the formatted variant.
    return $this->description ?: $this->snippet->getTextIntroduction();
}
138 |
/**
 * Stores the words that getHighlightedTitle() passes to SnippetLine for highlighting.
 *
 * @param string[] $words
 *
 * @return self For call chaining.
 */
public function setFoundWords(array $words): self
{
    $this->foundWords = $words;

    return $this;
}
148 |
/**
 * Returns the title with the found words highlighted using the highlight template.
 *
 * The title is treated as plain text and wrapped into a SnippetLine with zero relevance.
 *
 * @throws InvalidArgumentException When the highlight template contains no "%s" placeholder.
 * @throws RuntimeException
 */
public function getHighlightedTitle(StemmerInterface $stemmer): string
{
    if (strpos($this->highlightTemplate, '%s') === false) {
        throw new InvalidArgumentException('Highlight template must contain "%s" substring for sprintf() function.');
    }

    $titleLine = new SnippetLine($this->title, SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, $this->foundWords, 0);

    return $titleLine->getHighlighted($this->highlightTemplate, false);
}
170 |
/**
 * Returns the stored image collection.
 */
public function getImageCollection(): ImgCollection
{
    return $this->imgCollection;
}
175 | }
176 |
--------------------------------------------------------------------------------
/src/S2/Rose/Entity/Snippet.php:
--------------------------------------------------------------------------------
1 | highlightTemplate = $highlightTemplate;
35 | $this->introductionSnippetLines = $introductionSnippetLines;
36 | }
37 |
/**
 * Sets the separator that implodeLines() puts between snippet lines
 * that do not directly follow each other by word position.
 *
 * @return self For call chaining.
 */
public function setLineSeparator(string $lineSeparator): self
{
    $this->lineSeparator = $lineSeparator;

    return $this;
}
44 |
/**
 * Appends a snippet line together with its minimal and maximal word positions.
 *
 * The three values are stored in parallel arrays under the same index.
 *
 * @return self For call chaining.
 */
public function attachSnippetLine(int $minWordPosition, int $maxWordPosition, SnippetLine $snippetLine): self
{
    $this->snippetLines[] = $snippetLine;
    $this->snippetMinWordPositions[] = $minWordPosition;
    $this->snippetMaxWordPositions[] = $maxWordPosition;

    return $this;
}
53 |
/**
 * Renders all introduction lines with the highlight template and joins them with a space.
 *
 * @param bool $includeFormatting Forwarded to SnippetLine::getHighlighted().
 */
public function getTextIntroduction(bool $includeFormatting = false): string
{
    $renderedLines = array_map(
        fn($introLine) => $introLine->getHighlighted($this->highlightTemplate, $includeFormatting),
        $this->introductionSnippetLines
    );

    return implode(' ', $renderedLines);
}
63 |
/**
 * Renders the most relevant snippet lines as a single string.
 *
 * Lines with identical text are reduced to the copy with the greatest relevance,
 * the top self::SNIPPET_LINE_COUNT lines by relevance are kept, and the survivors
 * are rendered in their original attachment order.
 *
 * @param bool $includeFormatting Forwarded to the line rendering.
 */
public function toString(bool $includeFormatting = false): ?string
{
    // line text => [line index => relevance]
    $relevanceByText = [];
    foreach ($this->snippetLines as $lineIndex => $line) {
        $relevanceByText[$line->getLine()][$lineIndex] = $line->getRelevance();
    }

    // Keep a single representative (the most relevant one) per distinct text.
    $bestRelevanceByIndex = [];
    foreach ($relevanceByText as $candidates) {
        arsort($candidates);
        $bestIndex = array_key_first($candidates);
        $bestRelevanceByIndex[$bestIndex] = $candidates[$bestIndex];
    }

    // Most relevant first, then cut to the allowed number of lines.
    arsort($bestRelevanceByIndex);
    $topLines = \array_slice($bestRelevanceByIndex, 0, self::SNIPPET_LINE_COUNT, true);

    // Back to natural position order.
    ksort($topLines);

    $selectedLines = [];
    foreach (array_keys($topLines) as $lineIndex) {
        $selectedLines[$lineIndex] = $this->snippetLines[$lineIndex];
    }

    return $this->implodeLines($selectedLines, $includeFormatting);
}
98 |
/**
 * Renders the given snippet lines and joins them into one string.
 *
 * Each line is highlighted, trimmed and cleaned of unbalanced quotation marks; lines
 * rendering to an already seen string are dropped. Two consecutive kept lines are joined
 * with a space when the second starts right after the first by word position,
 * otherwise with the configured line separator.
 *
 * @param array|SnippetLine[] $snippetLines Lines keyed by their index in $this->snippetLines.
 * @param bool                $includeFormatting Forwarded to SnippetLine::getHighlighted().
 */
private function implodeLines(array $snippetLines, bool $includeFormatting): string
{
    $result = '';
    $previousMaxPosition = -1;

    $foundStrings = [];
    foreach ($snippetLines as $index => $snippetLine) {
        $lineStr = $snippetLine->getHighlighted($this->highlightTemplate, $includeFormatting);
        $lineStr = trim($lineStr);

        // Cleaning up unbalanced quotation marks
        // NOTE(review): the literals on the next two statements look like they lost HTML entities
        // in this copy. As written, str_replace() gets three search strings but two replacements,
        // so the third search string is replaced with an empty string — verify against the original source.
        /** @noinspection NotOptimalRegularExpressionsInspection */
        $lineStr = preg_replace('#«(.*?)»#Ss', '«\\1»', $lineStr);
        $lineStr = str_replace(['"', '«', '»'], ['"', ''], $lineStr);
        // An odd number of straight quotes means one is unbalanced: drop them all.
        if (substr_count($lineStr, '"') % 2) {
            $lineStr = str_replace('"', '', $lineStr);
        }

        // Remove repeating lines
        if (isset($foundStrings[$lineStr])) {
            continue;
        }
        $foundStrings[$lineStr] = 1;

        if ($previousMaxPosition === -1) {
            // First kept line: nothing to join with yet.
            $result = $lineStr;
        } else {
            $result .= ($previousMaxPosition + 1 === $this->snippetMinWordPositions[$index] ? ' ' : $this->lineSeparator) . $lineStr;
        }
        $previousMaxPosition = $this->snippetMaxWordPositions[$index];
    }

    // A line ending with "." followed by the "... " separator would yield four dots.
    if ($this->lineSeparator === '... ') {
        $result = str_replace('.... ', '... ', $result);
    }

    return $result;
}
140 | }
141 |
--------------------------------------------------------------------------------
/src/S2/Rose/Storage/ArrayFulltextStorage.php:
--------------------------------------------------------------------------------
1 | fulltextIndex;
22 | }
23 |
/**
 * Replaces the whole in-memory fulltext index.
 *
 * @param array $fulltextIndex Map word => (id => stored positions), in the format written by addWord().
 *
 * @return self For call chaining.
 */
public function setFulltextIndex(array $fulltextIndex): self
{
    $this->fulltextIndex = $fulltextIndex;

    return $this;
}
30 |
/**
 * {@inheritdoc}
 *
 * Decodes the stored positions of $word into id => type => positions.
 * A stored integer is a single content position; a stored string is a "|"-separated
 * list of base-36 positions, optionally prefixed with the title or keyword marker.
 */
public function getByWord(string $word): array
{
    $result = [];
    foreach ($this->fulltextIndex[$word] ?? [] as $id => $entries) {
        if (\is_int($entries)) {
            $result[$id][self::TYPE_CONTENT][] = $entries;
            continue;
        }

        foreach (explode('|', $entries) as $position) {
            $marker = $position[0];
            if ($marker === self::PREFIX_TITLE) {
                $type    = self::TYPE_TITLE;
                $encoded = substr($position, 1);
            } elseif ($marker === self::PREFIX_KEYWORD) {
                $type    = self::TYPE_KEYWORD;
                $encoded = substr($position, 1);
            } else {
                $type    = self::TYPE_CONTENT;
                $encoded = $position;
            }

            $result[$id][$type][] = base_convert($encoded, 36, 10);
        }
    }

    return $result;
}
60 |
/**
 * {@inheritdoc}
 *
 * Returns the number of ids stored for $word, or 0 when the word is not indexed.
 */
public function countByWord(string $word): int
{
    return isset($this->fulltextIndex[$word]) ? \count($this->fulltextIndex[$word]) : 0;
}
72 |
/**
 * {@inheritdoc}
 *
 * Storage format per word and id:
 *  - a single content position is kept as a plain integer;
 *  - anything else is a "|"-separated string of base-36 positions, where title
 *    and keyword positions carry their respective prefix.
 *
 * Empty words are ignored.
 */
public function addWord(string $word, int $id, int $type, int $position): void
{
    if ($word === '') {
        return;
    }

    // Null means a content position, which carries no prefix.
    $prefix = null;
    if ($type === self::TYPE_KEYWORD) {
        $prefix = self::PREFIX_KEYWORD;
    } elseif ($type === self::TYPE_TITLE) {
        $prefix = self::PREFIX_TITLE;
    }

    if (!isset($this->fulltextIndex[$word][$id])) {
        // If there is the only one content position in index, the position is stored as decimal number
        $this->fulltextIndex[$word][$id] = $prefix === null
            ? $position
            : $prefix . base_convert((string)$position, 10, 36);

        return;
    }

    $positionStr = ($prefix ?? '') . base_convert((string)$position, 10, 36);

    $value = $this->fulltextIndex[$word][$id];
    if (\is_int($value)) {
        // There was the only one content position, but it's no longer the case.
        // Convert to the 36-based number system.
        $value = base_convert((string)$value, 10, 36);
    }

    $this->fulltextIndex[$word][$id] = $value . '|' . $positionStr;
}
111 |
/**
 * {@inheritdoc}
 *
 * Drops the word with all its ids and positions from the in-memory index.
 */
public function removeWord(string $word): void
{
    unset($this->fulltextIndex[$word]);
}
119 |
/**
 * {@inheritdoc}
 *
 * Returns a map word => number of ids stored for it, limited to the words
 * whose number of ids is strictly greater than $threshold.
 */
public function getFrequentWords(int $threshold): array
{
    $result = [];
    foreach ($this->fulltextIndex as $word => $stat) {
        $num = \count($stat);
        if ($num > $threshold) {
            $result[$word] = $num;
        }
    }

    return $result;
}
137 |
/**
 * {@inheritdoc}
 *
 * Removes the given id from every word of the index. Words themselves are kept,
 * even when no ids remain for them.
 */
public function removeById(int $id): void
{
    foreach (array_keys($this->fulltextIndex) as $word) {
        if (isset($this->fulltextIndex[$word][$id])) {
            unset($this->fulltextIndex[$word][$id]);
        }
    }
}
150 | }
151 |
--------------------------------------------------------------------------------
/tests/unit/Rose/Stemmer/StemmerTest.php:
--------------------------------------------------------------------------------
1 | russianStemmer = new PorterStemmerRussian();
42 | $this->englishStemmer = new PorterStemmerEnglish();
43 | $this->chainedStemmer1 = new PorterStemmerRussian(new PorterStemmerEnglish());
44 | $this->chainedStemmer2 = new PorterStemmerEnglish(new PorterStemmerRussian());
45 | }
46 |
/**
 * Hook with an intentionally empty body: these tests need no cleanup.
 */
public function _after()
{
}
50 |
/**
 * Checks the Russian stemmer on two words ending in "-вшись".
 */
public function testRegexes(): void
{
    $expectedStems = [
        'ухмылявшись' => 'ухмыля',
        'добившись'   => 'доб',
    ];

    foreach ($expectedStems as $word => $stem) {
        $this->assertEquals($stem, $this->russianStemmer->stemWord($word));
    }
}
56 |
/**
 * Checks the Russian stemmer on hyphenated words with particles
 * ("-нибудь", "-то", "-либо", "кое-").
 */
public function testParticles(): void
{
    $expectedStems = [
        'кого-нибудь'  => 'кто-нибудь',
        'когда-нибудь' => 'когда-нибудь',
        'чему-то'      => 'что-то',
        'нехитрое-то'  => 'нехитр-то',
        'когда-либо'   => 'когда-либо',
        'чем-либо'     => 'что-либо',
        'кое-чем'      => 'кое-что',
        'кое-кого'     => 'кое-кто',
    ];

    foreach ($expectedStems as $word => $stem) {
        $this->assertEquals($stem, $this->russianStemmer->stemWord($word));
    }
}
68 |
/**
 * Checks stemming of Russian and English words with the standalone
 * and the chained stemmers.
 */
public function testStem(): void
{
    $russianAware = [$this->russianStemmer, $this->chainedStemmer1, $this->chainedStemmer2];

    // The English stemmer leaves a Russian word untouched, every Russian-aware stemmer stems it.
    $this->assertEquals('ухмыляться', $this->englishStemmer->stemWord('ухмыляться'));
    foreach ($russianAware as $stemmer) {
        $this->assertEquals('ухмыля', $stemmer->stemWord('ухмыляться'));
    }

    $this->assertEquals('рраф', $this->russianStemmer->stemWord('Ррафа'));

    $this->assertEquals('метро', $this->russianStemmer->stemWord('метро'));

    // All case forms of a noun must collapse to the same stem.
    $nounForms = [
        'экзамен' => [
            'экзамен', 'экзамена', 'экзамену', 'экзаменом', 'экзамене',
            'экзамены', 'экзаменов', 'экзаменам', 'экзаменами', 'экзаменах',
        ],
        'домен'   => [
            'домен', 'домена', 'домену', 'доменом', 'домене',
            'домены', 'доменов', 'доменам', 'доменами', 'доменах',
        ],
    ];
    foreach ($nounForms as $stem => $forms) {
        foreach ($forms as $form) {
            $this->assertEquals($stem, $this->russianStemmer->stemWord($form));
        }
    }

    $this->assertEquals('учитель', $this->englishStemmer->stemWord('Учитель'));
    foreach ($russianAware as $stemmer) {
        $this->assertEquals('учител', $stemmer->stemWord('учитель'));
    }

    $this->assertEquals('gun', $this->englishStemmer->stemWord('guns'));
    $this->assertEquals('guns', $this->russianStemmer->stemWord('guns'));

    foreach ([$this->chainedStemmer1, $this->chainedStemmer2] as $stemmer) {
        $this->assertEquals('papa', $stemmer->stemWord('papa\'s'));
    }
}
113 | }
114 |
--------------------------------------------------------------------------------
/src/S2/Rose/Entity/Metadata/SentenceMap.php:
--------------------------------------------------------------------------------
1 | [
18 | * '/html/body/p[1]/text()' => 'One sentence.',
19 | * ],
20 | * 2 => [
21 | * '/html/body/p[2]/text()[1]' => 'Second',
22 | * '/html/body/p[2]/br' => ' ',
23 | * '/html/body/p[2]/text()[2]' => 'sentence. And a third one.',
24 | * ],
25 | * ]
26 | *
27 | * @var array[]
28 | */
29 | private array $paragraphs = [];
30 | private int $formatId;
31 |
/**
 * Creates an empty map for content of the given format.
 *
 * @param int $formatId Id of formatting. It is passed on to the SentenceCollection
 *                      built by toSentenceCollection().
 * @see SnippetSource::ALLOWED_FORMATS for formatting
 */
public function __construct(int $formatId)
{
    $this->formatId = $formatId;
}
40 |
/**
 * Registers the text content of one node inside a paragraph.
 *
 * @param int    $paragraphIndex Number of current paragraph. Must be detected outside based on formatting.
 * @param string $path           Some identifier of a content node. Must be unique for the paragraph given.
 * @param string $textContent    Raw text content of a node. Formatting must correspond to formatId constructor parameter.
 *
 * @return self For call chaining.
 *
 * @throws \LogicException When the paragraph already has content for the given path.
 */
public function add(int $paragraphIndex, string $path, string $textContent): self
{
    $isDuplicate = isset($this->paragraphs[$paragraphIndex][$path]);
    if ($isDuplicate) {
        throw new \LogicException(sprintf('Map already has a content for paragraph "%s" and path "%s".', $paragraphIndex, $path));
    }

    $this->paragraphs[$paragraphIndex][$path] = $textContent;

    return $this;
}
55 |
/**
 * Appends text to the most recently added node of the last paragraph.
 *
 * @param string $text Text to concatenate to the last stored content.
 *
 * @throws LogicException When the map has no paragraphs yet.
 */
public function appendToLastItem(string $text): void
{
    if (\count($this->paragraphs) === 0) {
        throw new LogicException('Cannot append to an empty sentence map.');
    }

    $lastParagraphKey = array_key_last($this->paragraphs);
    $lastPathKey      = array_key_last($this->paragraphs[$lastParagraphKey]);

    $this->paragraphs[$lastParagraphKey][$lastPathKey] .= $text;
}
67 |
/**
 * Converts the collected paragraphs into a SentenceCollection.
 *
 * Within a paragraph, nodes whose path contains both "/pre" and "/code" are processed
 * as code; all other node contents are concatenated and processed as regular text.
 * Regular text accumulated before a code node is processed before that code node.
 */
public function toSentenceCollection(): SentenceCollection
{
    $collection = new SentenceCollection($this->formatId);

    foreach ($this->paragraphs as $nodes) {
        $pendingText = '';
        foreach ($nodes as $path => $content) {
            $isCodeNode = strpos($path, '/pre') !== false && strpos($path, '/code') !== false;
            if (!$isCodeNode) {
                // Merge non-code text content; it is split into sentences later.
                $pendingText .= $content;
                continue;
            }

            // Flush the regular text gathered so far, then handle the code separately.
            $this->processRegularSentences($pendingText, $collection);
            $pendingText = '';
            $this->processCodeSentences($content, $collection);
        }

        $this->processRegularSentences($pendingText, $collection);
    }

    return $collection;
}
93 |
/**
 * Breaks a regular text into sentences using heuristics based on punctuation rules.
 *
 * When the text has more than three lines (separated by self::LINE_SEPARATOR), sentences
 * are long on average (> 20 words) and lines are moderately short (between 3 and 15 words
 * on average), the text is split by lines instead, which suits lists like a table of contents.
 * Empty sentences are not attached.
 */
private function processRegularSentences(string $text, SentenceCollection $sentenceCollection): void
{
    $text             = trim($text);
    $isInternalFormat = $this->formatId === SnippetSource::FORMAT_INTERNAL;
    $sentences        = StringHelper::sentencesFromText($text, $isInternalFormat);

    $linesNum     = 1 + substr_count($text, self::LINE_SEPARATOR);
    $sentencesNum = \count($sentences);

    // The sentence count is checked to avoid a division by zero in the averages below.
    if ($linesNum > 3 && $sentencesNum > 0) {
        $totalWordNum = \count(SentenceCollection::breakIntoWords(
            $isInternalFormat ? StringHelper::clearInternalFormatting($text) : $text
        ));
        $avgWordNumInSentences = 1.0 * $totalWordNum / $sentencesNum;
        $avgWordNumInLines     = 1.0 * $totalWordNum / $linesNum;

        if ($avgWordNumInSentences > 20 && $avgWordNumInLines > 3 && $avgWordNumInLines < 15) {
            // Heuristics for lines separated by self::LINE_SEPARATOR.
            // This branch is for lists like table of contents.
            $sentences = explode(self::LINE_SEPARATOR, $text);
        }
    }

    foreach ($sentences as $sentence) {
        if ($sentence === '') {
            continue;
        }
        $sentenceCollection->attach($sentence);
    }
}
123 |
/**
 * Splits source code into "sentences" via StringHelper::sentencesFromCode()
 * and attaches every non-empty one to the collection.
 */
private function processCodeSentences(string $text, SentenceCollection $sentenceCollection): void
{
    foreach (StringHelper::sentencesFromCode($text) as $codeChunk) {
        if ($codeChunk !== '') {
            $sentenceCollection->attach($codeChunk);
        }
    }
}
139 | }
140 |
--------------------------------------------------------------------------------
/src/S2/Rose/Finder.php:
--------------------------------------------------------------------------------
1 | storage = $storage;
40 | $this->stemmer = $stemmer;
41 | }
42 |
/**
 * Stores the regex list that buildSnippets() passes to
 * SnippetBuilder::setHighlightMaskRegexArray().
 *
 * @return self For call chaining.
 */
public function setHighlightMaskRegexArray(array $highlightMaskRegexArray): self
{
    $this->highlightMaskRegexArray = $highlightMaskRegexArray;

    return $this;
}
49 |
/**
 * Stores the highlight template that find() applies to every ResultSet it creates.
 *
 * @return self For call chaining.
 */
public function setHighlightTemplate(string $highlightTemplate): self
{
    $this->highlightTemplate = $highlightTemplate;

    return $this;
}
56 |
/**
 * Stores the line separator that buildSnippets() passes to the SnippetBuilder constructor.
 *
 * @return self For call chaining.
 */
public function setSnippetLineSeparator(string $snippetLineSeparator): self
{
    $this->snippetLineSeparator = $snippetLineSeparator;

    return $this;
}
63 |
/**
 * Runs the query against the storage and returns a frozen ResultSet.
 *
 * Steps: split the query into words, run the fulltext search (when there are words),
 * freeze the result set, attach TOC entries for the sorted external ids, and build
 * snippets for the ids that have a relevance within the requested window.
 *
 * @param Query $query   Query with value, limit, offset and instance id.
 * @param bool  $isDebug Forwarded to the ResultSet constructor.
 *
 * @throws ImmutableException
 */
public function find(Query $query, bool $isDebug = false): ResultSet
{
    $resultSet = new ResultSet($query->getLimit(), $query->getOffset(), $isDebug);
    if ($this->highlightTemplate !== null) {
        $resultSet->setHighlightTemplate($this->highlightTemplate);
    }

    $queryWords = $query->valueToArray();
    $resultSet->addProfilePoint('Input cleanup');

    if (\count($queryWords) > 0) {
        $this->findFulltext($queryWords, $query->getInstanceId(), $resultSet);
        $resultSet->addProfilePoint('Fulltext search');
    }

    // After freezing, the result set only accepts reads and attachments.
    $resultSet->freeze();

    $externalIds = $resultSet->getSortedExternalIds();
    $resultSet->addProfilePoint('Sort results');

    $tocEntries = $this->storage->getTocByExternalIds($externalIds);
    foreach ($tocEntries as $tocEntryWithExternalId) {
        $resultSet->attachToc($tocEntryWithExternalId);
    }
    $resultSet->addProfilePoint('Fetch TOC');

    $relevanceMap = $resultSet->getSortedRelevanceByExternalId();
    if (\count($relevanceMap) > 0) {
        $this->buildSnippets($relevanceMap, $resultSet);
    }

    return $resultSet;
}
101 |
/**
 * Ignore frequent words encountering in indexed items.
 *
 * Returns half of the TOC size (rounded down), but never less than 20.
 *
 * @param int $tocSize Number of items in the TOC.
 *
 * @return int Threshold value; always an integer, also for odd TOC sizes.
 */
public static function fulltextRateExcludeNum(int $tocSize): int
{
    // intdiv() keeps the result an int; "$tocSize * 0.5" produced a float for the declared int return type.
    return max(intdiv($tocSize, 2), 20);
}
109 |
/**
 * Runs the fulltext search for the given words and writes the outcome into $resultSet.
 *
 * @param array     $words      Words of the query.
 * @param int|null  $instanceId Forwarded to the storage calls.
 * @param ResultSet $resultSet  Receives the fulltext result.
 *
 * @throws ImmutableException
 */
protected function findFulltext(array $words, ?int $instanceId, ResultSet $resultSet): void
{
    $query        = new FulltextQuery($words, $this->stemmer);
    $indexContent = $this->storage->fulltextResultByWords($query->getWordsWithStems(), $instanceId);
    $tocSize      = $this->storage->getTocSize($instanceId);

    (new FulltextResult($query, $indexContent, $tocSize))->fillResultSet($resultSet);
}
125 |
/**
 * Fetches snippet sources from the storage and attaches the built snippets to $resultSet.
 *
 * Only external ids present in $relevanceByExternalIds are queried; ids outside
 * of that map are skipped.
 *
 * @param array     $relevanceByExternalIds Map serialized external id => relevance.
 * @param ResultSet $resultSet              Frozen result set to read positions from and attach snippets to.
 *
 * @throws LogicException When the result set reports ImmutableException or UnknownIdException.
 */
public function buildSnippets(array $relevanceByExternalIds, ResultSet $resultSet): void
{
    $snippetQuery = new SnippetQuery(ExternalIdCollection::fromStringArray(array_keys($relevanceByExternalIds)));
    try {
        $positionsByExternalId = $resultSet->getFoundWordPositionsByExternalId();
    } catch (ImmutableException $e) {
        throw new LogicException($e->getMessage(), 0, $e);
    }

    foreach ($positionsByExternalId as $serializedExtId => $wordsInfo) {
        // Ids out of limit and offset scope need no snippets.
        if (isset($relevanceByExternalIds[$serializedExtId])) {
            $snippetQuery->attach(
                ExternalId::fromString($serializedExtId),
                array_merge(...array_values($wordsInfo))
            );
        }
    }
    $resultSet->addProfilePoint('Snippets: make query');

    $snippetResult = $this->storage->getSnippets($snippetQuery);
    $resultSet->addProfilePoint('Snippets: obtaining');

    $snippetBuilder = new SnippetBuilder($this->stemmer, $this->snippetLineSeparator);
    $snippetBuilder->setHighlightMaskRegexArray($this->highlightMaskRegexArray);
    try {
        $snippetBuilder->attachSnippets($resultSet, $snippetResult);
    } catch (ImmutableException|UnknownIdException $e) {
        throw new LogicException($e->getMessage(), 0, $e);
    }
    $resultSet->addProfilePoint('Snippets: building');
}
159 | }
160 |
--------------------------------------------------------------------------------
/src/S2/Rose/Entity/Query.php:
--------------------------------------------------------------------------------
1 | value = $value;
41 | }
42 |
/**
 * Returns the limit stored on this query.
 *
 * @return int|null
 */
public function getLimit()
{
    return $this->limit;
}
50 |
/**
 * Stores the limit on this query (fluent setter).
 *
 * @param int $limit
 *
 * @return self The same instance, for chaining.
 */
public function setLimit($limit)
{
    $this->limit = $limit;

    return $this;
}
62 |
/**
 * Returns the offset stored on this query.
 *
 * @return int
 */
public function getOffset()
{
    return $this->offset;
}
70 |
/**
 * Stores the offset on this query (fluent setter).
 *
 * @param int $offset
 *
 * @return self The same instance, for chaining.
 */
public function setOffset($offset)
{
    $this->offset = $offset;

    return $this;
}
82 |
/**
 * Returns the raw query value exactly as it is stored (no normalization is applied here).
 *
 * @return string
 */
public function getValue()
{
    return $this->value;
}
90 |
/**
 * Returns the instance id stored on this query.
 *
 * @return int|null
 */
public function getInstanceId()
{
    return $this->instanceId;
}
98 |
/**
 * Stores the instance id on this query (fluent setter).
 *
 * @param int|null $instanceId
 *
 * @return self The same instance, for chaining.
 */
public function setInstanceId($instanceId)
{
    $this->instanceId = $instanceId;

    return $this;
}
110 |
/**
 * Splits the query value into a list of lower-cased search tokens.
 *
 * Steps, in order: normalize the value to a UTF-8 string, strip tags, unify quotes and dashes,
 * replace every run of characters outside the allowed set with a space, lower-case,
 * turn decimal commas into dots, detach special characters from word edges, split on
 * whitespace, replace 'ё' with 'е' inside words, trim trailing word-component delimiters,
 * drop duplicates and empty/too long words, and cap the result at self::MAX_WORDS items.
 *
 * NOTE(review): array_unique() keeps the original keys and StringHelper::removeLongWords()
 * appears to reindex only when it removes something, so the result may have gaps in its
 * keys when duplicates were dropped - confirm whether callers rely on a list.
 *
 * @return string[]
 */
public function valueToArray()
{
    $content = self::normalizeValue($this->value);
    if ($content === '') {
        return [];
    }

    $content = strip_tags($content);

    // Normalize typographic quotes to a plain double quote
    $content = str_replace(['«', '»', '“', '”', '‘', '’'], '"', $content);
    $content = str_replace('−', '-', $content); // Replace the minus sign with a hyphen
    $content = str_replace(['---', '–', '−'], '—', $content); // Normalize dashes to an em dash
    // Collapse commas separated only by whitespace
    $content = self::safePregReplace('#,\\s+,#u', ',,', $content);
    // Everything except letters, digits and the listed punctuation becomes a single space
    $content = self::safePregReplace('#[^\\-\\p{L}0-9^_.,()";?!…:—]+#iu', ' ', $content);
    $content = mb_strtolower($content);

    // Replace decimal separators: ',' -> '.'
    $content = self::safePregReplace('#(?<=^|\\s)(\\-?\\d+),(\\d+)(?=\\s|$)#u', '\\1.\\2', $content);

    // Separate special chars at the beginning of the word.
    // Repeated until nothing is replaced, so several leading chars are detached one by one.
    while (true) {
        $content = self::safePregReplace('#(?:^|\\s)\K([—^()"?:!])(?=[^\s])#u', '\\1 ', $content, -1, $count);
        if ($count === 0 || $content === '') {
            break;
        }
    }

    // Separate special chars at the end of the word (same repeat-until-stable approach)
    while (true) {
        $content = self::safePregReplace('#(?<=[^\s])([—^()"?:!])(?=\\s|$)#u', ' \\1', $content, -1, $count);
        if ($count === 0 || $content === '') {
            break;
        }
    }

    // Separate groups of commas
    $content = self::safePregReplace('#(,+)#u', ' \\1 ', $content);

    $words = preg_split('#\\s+#', $content);
    foreach ($words as $k => &$v) {
        // Replace 'ё' inside words (a standalone 'ё' is kept as is)
        if ($v !== 'ё' && false !== strpos($v, 'ё')) {
            $v = str_replace('ё', 'е', $v);
        }

        // Empty tokens and tokens without any letter or digit are left untouched
        if ($v === '' || !preg_match('#[\\p{L}\\d]#u', $v)) {
            continue;
        }

        // Drop trailing word-component delimiters
        $trimmed = rtrim($v, StringHelper::WORD_COMPONENT_DELIMITERS);
        if ($trimmed === '') {
            unset($words[$k]);
            continue;
        }

        $v = $trimmed;
    }
    // Break the reference left by the foreach above
    unset($v);

    $words = array_unique($words);

    StringHelper::removeLongWords($words);

    // Keys are fixed inside the helper:
    // $words = array_values($words); // <- moved to helper

    if (\count($words) > self::MAX_WORDS) {
        $words = \array_slice($words, 0, self::MAX_WORDS);
    }

    return $words;
}
187 |
/**
 * Casts a raw query value to a UTF-8 string.
 *
 * Strings are taken as is; other scalars and objects that define __toString() are cast
 * to string; anything else (null, arrays, other objects) yields an empty string.
 * The string is then passed through self::normalizeUtf8().
 *
 * @param mixed $value
 *
 * @return string
 */
private static function normalizeValue($value): string
{
    if (\is_string($value)) {
        $stringValue = $value;
    } elseif (\is_scalar($value) || (\is_object($value) && method_exists($value, '__toString'))) {
        // The method check is used instead of class_exists(\Stringable::class): class_exists()
        // returns false for interfaces, so such a guard would reject every stringable object.
        $stringValue = (string)$value;
    } else {
        return '';
    }

    return self::normalizeUtf8($stringValue);
}
200 |
/**
 * Makes sure the string is valid UTF-8.
 *
 * Empty and already valid strings are returned untouched. Otherwise the string is converted
 * from UTF-8 to UTF-8 while the mbstring substitute character is temporarily set to 'none';
 * the previous substitute character is restored afterwards. A failed conversion gives ''.
 *
 * @param string $value
 *
 * @return string
 */
private static function normalizeUtf8(string $value): string
{
    if ($value === '' || mb_check_encoding($value, 'UTF-8')) {
        return $value;
    }

    $savedSubstitute = mb_substitute_character();
    mb_substitute_character('none');
    $cleaned = mb_convert_encoding($value, 'UTF-8', 'UTF-8');
    mb_substitute_character($savedSubstitute);

    return $cleaned === false ? '' : $cleaned;
}
222 |
/**
 * preg_replace() wrapper that always returns a string.
 *
 * When preg_replace() returns null, an empty string is returned instead.
 *
 * @param string   $pattern
 * @param string   $replacement
 * @param string   $subject
 * @param int      $limit Passed to preg_replace(); -1 means no limit.
 * @param int|null $count Receives the number of replacements done.
 *
 * @return string
 */
private static function safePregReplace(string $pattern, string $replacement, string $subject, int $limit = -1, ?int &$count = null): string
{
    $replaced = preg_replace($pattern, $replacement, $subject, $limit, $count);
    if ($replaced === null) {
        return '';
    }

    return $replaced;
}
229 | }
230 |
--------------------------------------------------------------------------------
/src/S2/Rose/Helper/StringHelper.php:
--------------------------------------------------------------------------------
1 | $word) {
27 | $len = mb_strlen($word);
28 |
29 | if ($len > 100 || $len === 0) {
30 | unset($words[$k]);
31 | $removed = true;
32 | }
33 | }
34 | if ($removed) {
35 | $words = array_values($words);
36 | }
37 | }
38 |
/**
 * Splits a text into sentences.
 *
 * A boundary is whitespace that follows '.', '?' or '!' (optionally with a closing quote)
 * and precedes something that is not a lowercase letter. Whitespace around capitalized
 * abbreviations/initials and after "Mr."/"Dr." is protected from splitting beforehand.
 *
 * If a regular expression fails (preg_* returns null/false, e.g. on malformed UTF-8),
 * the whole text is returned as a single sentence instead of being silently lost.
 *
 * @param string $text
 * @param bool   $hasFormatting Whether the text contains internal formatting markers
 *                              (a backslash followed by a formatting symbol) that must be
 *                              kept balanced inside every sentence.
 *
 * @return string[]
 */
public static function sentencesFromText(string $text, bool $hasFormatting): array
{
    // Replace the whitespace that must not split sentences with a placeholder character.
    $protected = preg_replace('#(\p{Lu}\p{L}*\.?)\s+(\p{Lu}\p{L}?\.)\s+(\p{Lu})#u', "\\1�\\2�\\3", $text);
    if ($protected !== null) {
        $protected = preg_replace('#(\p{Lu}\p{L}?\.)(\p{Lu}\p{L}?\.)\s+(\p{Lu})#u', "\\1\\2�\\3", $protected);
    }
    if ($protected !== null) {
        // The pattern has a single capturing group, so only "\\1" is referenced.
        $protected = preg_replace('#\s\K(Mr.|Dr.)\s(?=\p{Lu}\p{L}?)#u', "\\1�", $protected);
    }

    $substrings = $protected === null
        ? false
        : preg_split('#(?:\.|[?!][»"]?)\K([ \n\t\r]+)(?=(?:[\p{Pd}-]\s)?[^\p{Ll}])#Su', $protected);
    if ($substrings === false) {
        // PCRE failure: fall back to the unsplit text.
        $substrings = [$text];
    }

    // Turn the placeholders back into spaces.
    $substrings = str_replace("�", ' ', $substrings);

    if ($hasFormatting) {
        // We keep the formatting scope through several sentences.
        //
        // A formatting marker opened in one sentence and closed in a later one is closed
        // at the end of the first sentence (by fixUnbalancedInternalFormatting) and opened
        // again at the beginning of the following sentence, so every sentence is balanced
        // on its own.
        $tagsFromPrevSentence = [];
        array_walk($substrings, static function (string &$text) use (&$tagsFromPrevSentence) {
            foreach (array_reverse($tagsFromPrevSentence) as $possibleTag => $num) {
                if ($num > 0) {
                    // Reopen what was still open; the counter is reset because
                    // the prepended markers are counted again below.
                    $text = str_repeat('\\' . $possibleTag, $num) . $text;
                    $tagsFromPrevSentence[$possibleTag] = 0;
                }
            }
            $text = self::fixUnbalancedInternalFormatting($text, $tagsFromPrevSentence);
        });
    }

    return $substrings;
}
71 |
/**
 * Splits a code listing into lines: the text is split on runs of line breaks
 * and every piece is trimmed.
 *
 * array_map() is used for trimming because array_walk() ignores the callback's
 * return value, which would leave the pieces untrimmed.
 *
 * If the split fails (preg_split() returns false), the whole text is treated as one line.
 *
 * @param string $text
 *
 * @return string[]
 */
public static function sentencesFromCode(string $text): array
{
    $substrings = preg_split('#(\r?\n\r?){1,}#Su', $text);
    if ($substrings === false) {
        $substrings = [$text];
    }

    return array_map('trim', $substrings);
}
82 |
/**
 * Converts internal formatting markers to HTML tags.
 *
 * A marker is a backslash followed by a formatting symbol: the lowercase symbol opens
 * the formatting and the uppercase one closes it. An escaped backslash ('\\') becomes
 * a single backslash. Unlike clearInternalFormatting(), markers are replaced with the
 * corresponding tags instead of being removed.
 *
 * @param string $text Text with internal formatting.
 *
 * @return string
 */
public static function convertInternalFormattingToHtml(string $text): string
{
    return strtr($text, [
        '\\\\'                               => '\\',
        '\\' . self::BOLD                    => '<b>',
        '\\' . strtoupper(self::BOLD)        => '</b>',
        '\\' . self::ITALIC                  => '<i>',
        '\\' . strtoupper(self::ITALIC)      => '</i>',
        '\\' . self::SUBSCRIPT               => '<sub>',
        '\\' . strtoupper(self::SUBSCRIPT)   => '</sub>',
        '\\' . self::SUPERSCRIPT             => '<sup>',
        '\\' . strtoupper(self::SUPERSCRIPT) => '</sup>',
    ]);
}
97 |
/**
 * Removes internal formatting markers from the text.
 *
 * Every opening (lowercase) and closing (uppercase) marker of the bold, italic, subscript
 * and superscript symbols is dropped; an escaped backslash ('\\') becomes a single backslash.
 *
 * @param string $text Text with internal formatting.
 *
 * @return string
 */
public static function clearInternalFormatting(string $text): string
{
    $replacements = ['\\\\' => '\\'];
    foreach ([self::BOLD, self::ITALIC, self::SUBSCRIPT, self::SUPERSCRIPT] as $symbol) {
        $replacements['\\' . $symbol]             = '';
        $replacements['\\' . strtoupper($symbol)] = '';
    }

    return strtr($text, $replacements);
}
112 |
/**
 * Balances internal formatting markers in the text.
 *
 * Every opening (lowercase) marker adds 1 to the counter of its symbol in $tagsNum and every
 * closing (uppercase) marker subtracts 1. Symbols with a negative counter get opening markers
 * prepended to the text, symbols with a positive counter get closing markers appended
 * (in reverse order of the counters).
 *
 * @Note: Counting the symbols gives wrong results when the same tag is nested into itself.
 * For example, for '\i 1 \b 2 \i 3' the closing part is '\B\I\I', however '\I\B\I' is expected.
 * It's ok since nesting identical formatting tags into each other does not make a lot of sense.
 *
 * @param string $text
 * @param array  $tagsNum Counters keyed by lowercase symbol; updated in place.
 *
 * @return string
 */
public static function fixUnbalancedInternalFormatting(string $text, array &$tagsNum): string
{
    preg_match_all('#\\\\(?:\\\\(*SKIP)\\\\)*\K[' . self::FORMATTING_SYMBOLS . ']#i', $text, $found);

    foreach ($found[0] as $symbol) {
        $tag           = strtolower($symbol);
        $delta         = $symbol === $tag ? 1 : -1;
        $tagsNum[$tag] = ($tagsNum[$tag] ?? 0) + $delta;
    }

    // Missing openers go in front of the text, each next symbol in front of the previous ones.
    $prefix = '';
    foreach ($tagsNum as $tag => $balance) {
        if ($balance < 0) {
            $prefix = str_repeat('\\' . $tag, -$balance) . $prefix;
        }
    }

    // Missing closers are appended in reverse order.
    $suffix = '';
    foreach (array_reverse($tagsNum) as $tag => $balance) {
        if ($balance > 0) {
            $suffix .= str_repeat('\\' . strtoupper($tag), $balance);
        }
    }

    return $prefix . $text . $suffix;
}
142 |
/**
 * Collects formatting markers that have no pair inside the text.
 *
 * Opening (lowercase) symbols are pushed to a stack; a closing (uppercase) symbol removes
 * the most recent matching opener from it. Closers without a matching opener are collected
 * separately.
 *
 * @param string $text
 *
 * @return array{0: array, 1: array} Openers left unclosed, then closers left unopened.
 */
public static function getUnbalancedInternalFormatting(string $text): array
{
    preg_match_all('#\\\\(?:\\\\(*SKIP)\\\\)*\K[' . self::FORMATTING_SYMBOLS . ']#i', $text, $found);

    $unclosed = [];
    $unopened = [];

    foreach ($found[0] as $symbol) {
        $opener = strtolower($symbol);
        if ($symbol === $opener) {
            $unclosed[] = $symbol;
            continue;
        }

        // Look for the most recent matching opener.
        $position = null;
        foreach (array_reverse($unclosed, true) as $index => $candidate) {
            if ($candidate === $opener) {
                $position = $index;
                break;
            }
        }

        if ($position === null) {
            $unopened[] = $symbol;
        } else {
            array_splice($unclosed, $position, 1);
        }
    }

    return [$unclosed, $unopened];
}
175 | }
176 |
--------------------------------------------------------------------------------
/tests/unit/Rose/Entity/SnippetTest.php:
--------------------------------------------------------------------------------
1 | %s',
41 | SnippetLine::createFromSnippetSourceWithoutFoundWords(new SnippetSource('introduction', SnippetSource::FORMAT_PLAIN_TEXT, 0, 0))
42 | );
43 | $snippet
44 | ->attachSnippetLine(1, 7, $snippetLine1)
45 | ->attachSnippetLine(8, 10, $snippetLine2)
46 | ;
47 |
48 | $this->assertEquals(
49 | 'Testing string to highlight some test values. Test is case-sensitive.',
50 | $snippet->toString()
51 | );
52 | }
53 |
/**
 * Two plain-text lines with Russian stems are attached to a snippet and rendered with toString().
 */
public function testSnippet2()
{
    $snippet = new Snippet(
        '%s',
        SnippetLine::createFromSnippetSourceWithoutFoundWords(new SnippetSource('introduction', SnippetSource::FORMAT_PLAIN_TEXT, 0, 1))
    );

    // Each row: the two position arguments, the line text and the stems found in it.
    $rows = [
        [2, 13, 'Тут есть тонкость - нужно проверить, как происходит экранировка в сущностях вроде +.', ['сущност']],
        [14, 23, 'Для этого нужно включить в текст само сочетание букв "plus".', ['plus']],
    ];

    foreach ($rows as [$rangeStart, $rangeEnd, $lineText, $stems]) {
        $snippet->attachSnippetLine(
            $rangeStart,
            $rangeEnd,
            new SnippetLine($lineText, SnippetSource::FORMAT_PLAIN_TEXT, new PorterStemmerRussian(), $stems, \count($stems))
        );
    }

    $this->assertEquals(
        'Тут есть тонкость - нужно проверить, как происходит экранировка в сущностях вроде +. Для этого нужно включить в текст само сочетание букв "plus".',
        $snippet->toString()
    );
}
85 |
/**
 * Lines with identical text are attached several times; the rendered snippet
 * is expected to contain every distinct text only once.
 */
public function testSnippetsUnique()
{
    $stemmer = new PorterStemmerEnglish();

    $newSnippet = static function () use ($stemmer) {
        return new Snippet('%s', new SnippetLine('introduction', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, [], 0.0));
    };

    // Row number $i is attached with the position arguments 4 * $i and 4 * $i + 3.
    $attachRows = static function ($snippet, array $rows) use ($stemmer) {
        foreach ($rows as $i => [$lineText, $relevance]) {
            $snippet->attachSnippetLine(
                $i * 4,
                $i * 4 + 3,
                new SnippetLine($lineText, SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, ['test'], $relevance)
            );
        }
    };

    $snippet = $newSnippet();
    $attachRows($snippet, [
        ['Try to test 1.', 1],
        ['Try to test 1.', 1],
        ['Try to test 1.', 1],
        ['Try to test 1.', 1],
        ['Try to test 2.', 1],
        ['Try to test 2.', 1],
        ['Try to test 2.', 1],
    ]);

    $this->assertEquals(
        'Try to test 1... Try to test 2.',
        $snippet->toString()
    );

    $snippet = $newSnippet();
    $attachRows($snippet, [
        ['Try to test 1.', 1],
        ['Try to test 1.', 1],
        ['Try to test 1.', 1],
        ['Try to test 1.', 1],
        ['Try to test 2.', 1],
        ['Try to test 2.', 1],
        ['Try to test 2.', 1],
        ['Try to test 3.', 1],
        ['Try to test 3.', 1],
        ['Try to test 3.', 1],
        ['Try to test 4.', 1],
        ['Try to test 4.', 2],
    ]);

    $this->assertEquals(
        'Try to test 1... Try to test 2... Try to test 4.',
        $snippet->toString()
    );
}
126 |
/**
 * Smoke test: toString() is called on a snippet without lines and on a snippet
 * whose only line has no found words. No assertions are made on the output.
 */
public function testEmptySnippet()
{
    $stemmer = new PorterStemmerEnglish();

    $newIntroduction = static function () use ($stemmer) {
        return new SnippetLine('introduction', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, [], 0.0);
    };

    $withoutLines = new Snippet('%s', $newIntroduction());
    $withoutLines->toString();

    $withOneLine = new Snippet('%s', $newIntroduction());
    $withOneLine->attachSnippetLine(1, 1, new SnippetLine('line1', SnippetSource::FORMAT_PLAIN_TEXT, $stemmer, [], 0));
    $withOneLine->toString();
}
137 | }
138 |
--------------------------------------------------------------------------------
/src/S2/Rose/Entity/FulltextResult.php:
--------------------------------------------------------------------------------
1 | query = $query;
21 | $this->fulltextIndexContent = $fulltextIndexContent;
22 | $this->tocSize = $tocSize;
23 | }
24 |
/**
 * Reduction ratio for words found in a large share of the TOC entries:
 * exp(-(x / 0.38)^2), where x = $foundTocEntriesNum / $tocSize.
 * No reduction (ratio 1) is applied when the TOC has fewer than 5 entries.
 *
 * https://i.upmath.me/svg/%5Cbegin%7Btikzpicture%7D%5Bscale%3D1.0544%5D%5Csmall%0A%5Cbegin%7Baxis%7D%5Baxis%20line%20style%3Dgray%2C%0A%09samples%3D100%2C%0A%09xmin%3D-1.2%2C%20xmax%3D1.2%2C%0A%09ymin%3D0%2C%20ymax%3D1.1%2C%0A%09restrict%20y%20to%20domain%3D-0.1%3A1%2C%0A%09ytick%3D%7B1%7D%2C%0A%09xtick%3D%7B-1%2C1%7D%2C%0A%09axis%20equal%2C%0A%09axis%20x%20line%3Dcenter%2C%0A%09axis%20y%20line%3Dcenter%2C%0A%09xlabel%3D%24x%24%2Cylabel%3D%24y%24%5D%0A%5Caddplot%5Bred%2Cdomain%3D-2%3A1%2Csemithick%5D%7Bexp(-(x%2F0.38)%5E2)%7D%3B%0A%5Caddplot%5Bred%5D%20coordinates%20%7B(0.8%2C0.6)%7D%20node%7B%24y%3De%5E%7B-%5Cleft(x%2F0.38%5Cright)%5E2%7D%24%7D%3B%0A%5Cpath%20(axis%20cs%3A0%2C0)%20node%20%5Banchor%3Dnorth%20west%2Cyshift%3D-0.07cm%5D%20%7B0%7D%3B%0A%5Cend%7Baxis%7D%0A%5Cend%7Btikzpicture%7D
 */
public static function frequencyReduction(int $tocSize, int $foundTocEntriesNum): float
{
    if ($tocSize >= 5) {
        $scaledShare = ($foundTocEntriesNum / $tocSize) / 0.38;

        return exp(-($scaledShare ** 2));
    }

    return 1.0;
}
36 |
/**
 * Weight ratio for a word repeated in the indexed item:
 * 1 for a single occurrence, plus 0.5 per extra occurrence, capped at 4.
 */
protected static function repeatWeightRatio(int $repeatNum): float
{
    $ratio = 0.5 * ($repeatNum - 1) + 1;

    return $ratio < 4 ? $ratio : 4.0;
}
44 |
/**
 * Weight ratio for entry size (prefer some middle size).
 *
 * Entries with fewer than 10 words get the ratio 1. Otherwise the ratio is
 * 1 + 1 / (1 + exp((sqrt(n) - 18)^2 / 60)), where n is the number of words.
 *
 * https://i.upmath.me/g/%5Cbegin%7Btikzpicture%7D%5Bscale%3D1.0544%5D%5Csmall%0A%5Cbegin%7Baxis%7D%5Baxis%20line%20style%3Dgray%2C%0A%09samples%3D100%2C%0A%09ymin%3D0%2C%20ymax%3D5%2C%0A%09xmin%3D0%2C%20xmax%3D1100%2C%0A%09ytick%3D%7B1%2C2%7D%2C%0A%09xtick%3D%7B50%2C200%2C500%2C1000%7D%2C%0A%09axis%20x%20line%3Dcenter%2C%0A%09axis%20y%20line%3Dcenter%2C%0A%09xlabel%3D%24x%24%2Cylabel%3D%24y%24%5D%0A%5Caddplot%5Bred%2Cdomain%3D0%3A1000%2Csemithick%5D%7B1%2F(1%2Bexp((sqrt(x)-18)%5E2%2F60))%2B1%7D%3B%0A%5Caddplot%5Bblue%2Cdomain%3D0%3A1000%2Csemithick%5D%7B1%7D%3B%0A%5Caddplot%5Bred%5D%20coordinates%20%7B(600%2C3)%7D%20node%7B%24y%3D1%2F(1%2Bexp((sqrt(x)-18)%5E2%2F60))%2B1%24%7D%3B%0A%5Cend%7Baxis%7D%0A%5Cend%7Btikzpicture%7D
 */
protected static function entrySizeWeightRatio(int $totalWordsNum): float
{
    if ($totalWordsNum < 10) {
        return 1.0;
    }

    $deviation = sqrt($totalWordsNum) - 18;

    return 1.0 + 1.0 / (1.0 + exp($deviation ** 2 / 60.0));
}
54 |
/**
 * Weight ratio for a pair of words. Accepts the difference of distances
 * in the indexed item and the search query.
 *
 * The weight is 30 / (1 + (distance / 7)^2): 30 for a zero difference, 15 for a difference of 7.
 *
 * @param float $distance
 *
 * @return float
 */
protected static function neighbourWeight(float $distance): float
{
    $scaled = $distance / 7.0;

    return 30.0 / (1 + pow($scaled, 2));
}
67 |
/**
 * Transfers the fulltext index content into the result set as weights.
 *
 * For every word and every indexed item up to three groups of word weights are added:
 * for content positions (abundance reduction, repeat ratio, entry size ratio), for keyword
 * positions (fixed weight 10) and for title positions (fixed weight 25). The external
 * relevance ratio of the item is appended to a group when the item has one.
 * After that, neighbour weights are added for pairs of words based on the difference
 * between their distance in the content and in the query.
 *
 * @throws ImmutableException
 */
public function fillResultSet(ResultSet $resultSet): void
{
    // Appends the external relevance ratio of the item, if it has one, as the last weight.
    $appendExternalRatio = static function ($bag, array $weights): array {
        if ($bag->hasExternalRelevanceRatio()) {
            $weights['external_ratio'] = $bag->getExternalRelevanceRatio();
        }

        return $weights;
    };

    $ratiosByWord = [];
    foreach ($this->fulltextIndexContent->toArray() as $word => $indexedItems) {
        $abundanceRatio      = self::frequencyReduction($this->tocSize, \count($indexedItems));
        $ratiosByWord[$word] = $abundanceRatio;

        foreach ($indexedItems as $positionBag) {
            $externalId       = $positionBag->getExternalId();
            $contentPositions = $positionBag->getContentPositions();
            $contentHits      = \count($contentPositions);

            if ($contentHits > 0) {
                $contentWeights = $appendExternalRatio($positionBag, [
                    'abundance_reduction' => $abundanceRatio,
                    'repeat_multiply'     => self::repeatWeightRatio($contentHits),
                    'entry_size'          => self::entrySizeWeightRatio($positionBag->getWordCount()),
                ]);
                $resultSet->addWordWeight($word, $externalId, $contentWeights, $contentPositions);
            }

            if (\count($positionBag->getKeywordPositions()) > 0) {
                $keywordWeights = $appendExternalRatio($positionBag, [
                    'keyword'             => 10,
                    'abundance_reduction' => $abundanceRatio,
                ]);
                $resultSet->addWordWeight($word, $externalId, $keywordWeights);
            }

            if (\count($positionBag->getTitlePositions()) > 0) {
                $titleWeights = $appendExternalRatio($positionBag, [
                    'title'               => 25,
                    'abundance_reduction' => $abundanceRatio,
                ]);
                $resultSet->addWordWeight($word, $externalId, $titleWeights);
            }
        }
    }

    $queryContainer = $this->query->toWordPositionContainer();

    $this->fulltextIndexContent->iterateContentWordPositions(
        static function (ExternalId $id, WordPositionContainer $container) use ($queryContainer, $ratiosByWord, $resultSet) {
            foreach ($container->compareWith($queryContainer) as [$firstWord, $secondWord, $distance]) {
                // The pair weight is reduced by the abundance ratio of each word that has one.
                $weight = self::neighbourWeight($distance);
                foreach ([$firstWord, $secondWord] as $pairWord) {
                    if (isset($ratiosByWord[$pairWord])) {
                        $weight *= $ratiosByWord[$pairWord];
                    }
                }
                $resultSet->addNeighbourWeight($firstWord, $secondWord, $id, $weight, $distance);
            }
        }
    );
}
137 | }
138 |
--------------------------------------------------------------------------------
/tests/unit/Rose/FinderTest.php:
--------------------------------------------------------------------------------
1 | static function () {
42 | return 30;
43 | },
44 | 'fulltextResultByWords' => static function (array $words) {
45 | $result = new FulltextIndexContent();
46 | foreach ($words as $k => $word) {
47 | if ($word === 'find') {
48 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_3'), [], [], [1], 0, 1.0));
49 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_2'), [], [1], [10, 20], 0, 1.0));
50 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_1'), [1], [], [], 0, 1.0));
51 | }
52 | if ($word === 'and') {
53 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_1'), [], [], [4, 8], 0, 1.0));
54 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_2'), [], [], [7, 11, 34], 0, 1.0));
55 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_3'), [], [], [28, 65], 0, 1.0));
56 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_4'), [], [], [45, 9], 0, 1.0));
57 |
58 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_5'), [], [], [1], 0, 1.0));
59 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_6'), [], [], [1], 0, 1.0));
60 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_7'), [], [], [1], 0, 1.0));
61 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_8'), [], [], [1], 0, 1.0));
62 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_9'), [], [], [1], 0, 1.0));
63 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_10'), [], [], [1], 0, 1.0));
64 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_11'), [], [], [1], 0, 1.0));
65 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_12'), [], [], [1], 0, 1.0));
66 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_13'), [], [], [1], 0, 1.0));
67 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_14'), [], [], [1], 0, 1.0));
68 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_15'), [], [], [1], 0, 1.0));
69 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_16'), [], [], [1], 0, 1.0));
70 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_17'), [], [], [1], 0, 1.0));
71 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_18'), [], [], [1], 0, 1.0));
72 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_19'), [], [], [1], 0, 1.0));
73 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_20'), [], [], [1], 0, 1.0));
74 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_21'), [], [], [1], 0, 1.0));
75 | }
76 | if ($word === 'replace') {
77 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_2'), [], [], [12], 0, 1.0));
78 | $result->add($word, new FulltextIndexPositionBag(new ExternalId('id_1'), [1], [], [], 0, 1.0));
79 | }
80 |
81 | unset($words[$k]);
82 | }
83 |
84 | if (!empty($words)) {
85 | throw new \RuntimeException(sprintf('Unknown words "%s" in StorageReadInterface stub.', implode(',', $words)));
86 | }
87 |
88 | return $result;
89 | },
90 | 'getTocByExternalIds' => static function (ExternalIdCollection $ids) {
91 | return array_map(static function (ExternalId $id) {
92 | return new TocEntryWithMetadata(
93 | new TocEntry('Title ' . $id->getId(), '', null, 'url_' . $id->getId(), 1, 'hash_' . $id->getId()),
94 | $id,
95 | new ImgCollection()
96 | );
97 | }, $ids->toArray());
98 | },
99 | 'getSnippets' => function (SnippetQuery $snippetQuery) use (&$storedSnippetQuery): SnippetResult {
100 | $storedSnippetQuery = $snippetQuery;
101 | return new SnippetResult();
102 | }
103 | ]);
104 |
105 | $stemmer = new PorterStemmerRussian();
106 | $finder = new Finder($storage, $stemmer);
107 | $resultSet = $finder->find(new Query('find and replace'));
108 |
109 | $items = $resultSet->getItems();
110 | $this->assertCount(21, $items);
111 |
112 | $weights = $resultSet->getFoundWordPositionsByExternalId();
113 | $this->assertCount(21, $weights);
114 | $this->assertEquals([], $weights[':id_1']['find']);
115 | $this->assertEquals([], $weights[':id_1']['replace']);
116 | $this->assertEquals([10, 20], $weights[':id_2']['find']);
117 | $this->assertEquals([12], $weights[':id_2']['replace']);
118 | $this->assertEquals([1], $weights[':id_3']['find']);
119 |
120 | $query2 = new Query('find and replace');
121 | $query2->setLimit(10);
122 | $resultSet2 = $finder->find($query2);
123 | $this->assertCount(10, $resultSet2->getItems());
124 | $this->assertCount(10, $storedSnippetQuery->getExternalIds());
125 | }
126 | }
127 |
--------------------------------------------------------------------------------
/src/S2/Rose/Indexer.php:
--------------------------------------------------------------------------------
1 | storage = $storage;
43 | $this->stemmer = $stemmer;
44 | $this->extractor = $extractor ?? DefaultExtractorFactory::create();
45 | $this->logger = $logger;
46 | }
47 |
/**
 * Cleaning up an HTML string.
 *
 * The string is lower-cased, non-breaking spaces (both the "&nbsp;" entity and the raw
 * U+00A0 character) become regular spaces, other HTML entities are removed, and every run
 * of characters outside the allowed set is collapsed into a single space. A trailing
 * space is appended to the result.
 *
 * "&nbsp;" has to be converted before the generic entity removal, otherwise the words
 * it separates would be glued together.
 *
 * @param string $content
 * @param string $allowedSymbols Extra characters to keep. The value is inserted into a regex
 *                               character class as is, so it must already be escaped for that.
 *
 * @return string
 */
public static function titleStrFromHtml(string $content, string $allowedSymbols = ''): string
{
    $content = mb_strtolower($content);
    $content = str_replace(['&nbsp;', "\xc2\xa0"], ' ', $content);
    // preg_replace() returns null on failure; fall back to an empty string to keep the type.
    $content = preg_replace('#&[^;]{1,20};#', '', $content) ?? '';

    // We allow letters, digits and some punctuation: ".,-"
    $content = preg_replace('#[^\\-.,0-9\\p{L}^_' . $allowedSymbols . ']+#u', ' ', $content) ?? '';

    // These punctuation characters are meant to be inside words and numbers.
    // We'll remove trailing characters when splitting the words.
    $content .= ' ';

    return $content;
}
67 |
/**
 * Splits a cleaned string into words.
 *
 * Words are separated by runs of spaces; '-', '.' and ',' characters standing right before
 * the spaces are dropped together with them. The list is then passed through
 * StringHelper::removeLongWords().
 *
 * @return string[]
 */
protected static function arrayFromStr(string $contents): array
{
    $tokens = preg_split('#[\\-.,]*?[ ]+#S', $contents);

    StringHelper::removeLongWords($tokens);

    return $tokens;
}
78 |
/**
 * Writes metadata, snippets and the fulltext index data of one item to the storage.
 *
 * Content words excluded by the storage are skipped (their keys are not reused).
 * The stored word count is the number of title words plus the number of remaining content words.
 */
protected function addToIndex(ExternalId $externalId, string $title, ContentWithMetadata $content, string $keywords): void
{
    $sentences = $content->getSentenceMap()->toSentenceCollection();

    // array_filter() keeps the original keys of the remaining words.
    $contentWords = array_filter($sentences->getWordsArray(), function ($word) {
        return !$this->storage->isExcludedWord($word);
    });

    $titleWords   = self::arrayFromStr($title);
    $keywordWords = self::arrayFromStr($keywords);
    $wordCount    = \count($titleWords) + \count($contentWords);

    $this->storage->addMetadata($externalId, $wordCount, $content->getImageCollection());
    $this->storage->addSnippets($externalId, ...$sentences->getSnippetSources());
    $this->storage->addToFulltextIndex(
        $this->getStemsWithComponents($titleWords),
        $this->getStemsWithComponents($keywordWords), // TODO consider different semantics of space and comma?
        $this->getStemsWithComponents($contentWords),
        $externalId
    );
}
102 |
/**
 * Removes an item from the fulltext index and then from the TOC.
 *
 * @param string   $id
 * @param int|null $instanceId
 */
public function removeById(string $id, ?int $instanceId): void
{
    $target = new ExternalId($id, $instanceId);

    $this->storage->removeFromIndex($target);
    $this->storage->removeFromToc($target);
}
109 |
/**
 * Indexes an item.
 *
 * When indexing fails with EmptyIndexException, auto erase is enabled and the storage
 * supports erasing, the storage is erased and indexing is attempted once more.
 * Otherwise the exception is rethrown.
 *
 * @throws RuntimeException
 * @throws UnknownException
 */
public function index(Indexable $indexable): void
{
    try {
        $this->doIndex($indexable);

        return;
    } catch (EmptyIndexException $e) {
        $canErase = $this->autoErase && $this->storage instanceof StorageEraseInterface;
        if (!$canErase) {
            throw $e;
        }
    }

    $this->storage->erase();
    $this->doIndex($indexable);
}
127 |
/**
 * Switches the auto erase mode on or off.
 *
 * The flag is checked by index() when indexing fails with EmptyIndexException.
 */
public function setAutoErase(bool $autoErase): void
{
    $this->autoErase = $autoErase;
}
132 |
/**
 * Performs indexing of a single item, inside a transaction when the storage supports it.
 *
 * The TOC entry is always written. The index content is rebuilt only when there was
 * no TOC entry before or its hash differs from the hash of the item. Extraction errors
 * are logged as a warning if a logger is set.
 *
 * On any \Exception the transaction is rolled back; exceptions other than RuntimeException
 * are wrapped into UnknownException.
 *
 * @throws RuntimeException
 * @throws UnknownException
 */
protected function doIndex(Indexable $indexable): void
{
    $isTransactional = $this->storage instanceof TransactionalStorageInterface;
    if ($isTransactional) {
        $this->storage->startTransaction();
    }

    try {
        $externalId    = $indexable->getExternalId();
        $previousEntry = $this->storage->getTocByExternalId($externalId);

        $this->storage->addEntryToToc($indexable->toTocEntry(), $externalId);

        $isUnchanged = $previousEntry !== null && $previousEntry->getHash() === $indexable->calcHash();
        if (!$isUnchanged) {
            $this->storage->removeFromIndex($externalId);

            $extraction = $this->extractor->extract($indexable->getContent());
            $errors     = $extraction->getErrors();
            if ($this->logger && $errors->hasErrors()) {
                $message = sprintf(
                    'Found warnings on indexing "%s" (id="%s", instance="%s", url="%s")',
                    $indexable->getTitle(),
                    $indexable->getExternalId()->getId(),
                    $indexable->getExternalId()->getInstanceId() ?? '',
                    $indexable->getUrl()
                );
                $this->logger->warning($message, $errors->getFormattedLines());
            }

            // strtolower in titleStrFromHtml is important
            $this->addToIndex(
                $externalId,
                self::titleStrFromHtml($indexable->getTitle()),
                $extraction->getContentWithMetadata(),
                self::titleStrFromHtml($indexable->getKeywords())
            );
        }

        if ($isTransactional) {
            $this->storage->commitTransaction();
        }
    } catch (\Exception $e) {
        if ($isTransactional) {
            $this->storage->rollbackTransaction();
        }
        if ($e instanceof RuntimeException) {
            throw $e;
        }

        throw new UnknownException('Unknown exception occurred while indexing.', 0, $e);
    }
}
186 |
/**
 * Replaces words with stems. Also, this method detects compound words and adds the component stems to the result.
 *
 * The keys in the result arrays are the positions of the word. For compound words a string representation
 * of a float is used to map one index to several words. For example, for input
 *
 * [10 => 'well-known', 11 => 'facts']
 *
 * this method returns
 *
 * [10 => 'well-known', 11 => 'fact', '10.001' => 'well', '10.002' => 'known']
 *
 * The two arrays are combined with the union operator rather than array_merge(): array_merge()
 * renumbers integer keys from zero, which would shift the positions whenever the input keys
 * do not form a 0-based list (e.g. after some words have been removed from it).
 *
 * @param array $words Words keyed by their positions.
 *
 * @return array Stems keyed by position, followed by the stems of compound word components.
 */
private function getStemsWithComponents(array $words): array
{
    $componentsOfCompoundWords = [];
    foreach ($words as $i => &$word) {
        $stemmedWord = $this->stemmer->stemWord($word, false);

        // If the word contains punctuation marks like hyphen, add a variant without it
        if (false !== strpbrk($stemmedWord, StringHelper::WORD_COMPONENT_DELIMITERS)) {
            foreach (preg_split('#(?<=[\p{L}\d])[\-.,]+|[\-.,]++(?=[\p{L}\d])#u', $word) as $k => $subWord) {
                if ($subWord !== '' && $subWord !== $word) {
                    // Fractional key: the position of the compound word plus the component number.
                    $componentsOfCompoundWords[(string)($i + 0.001 * ($k + 1))] = $this->stemmer->stemWord($subWord, false);
                }
            }
        }

        $word = $stemmedWord;
    }
    // Break the reference left by the foreach above
    unset($word);

    return $words + $componentsOfCompoundWords;
}
223 | }
224 |
--------------------------------------------------------------------------------
/src/S2/Rose/Storage/ArrayStorage.php:
--------------------------------------------------------------------------------
1 | fulltextProxy->getByWord($word);
45 | foreach ($data as $id => $positionsByType) {
46 | $externalId = $this->externalIdFromInternalId($id);
47 | if ($externalId === null) {
48 | continue;
49 | }
50 | if ($instanceId === null || $externalId->getInstanceId() === $instanceId) {
51 | $serializedExtId = $externalId->toString();
52 | $result->add($word, new FulltextIndexPositionBag(
53 | $externalId,
54 | $positionsByType[FulltextProxyInterface::TYPE_TITLE] ?? [],
55 | $positionsByType[FulltextProxyInterface::TYPE_KEYWORD] ?? [],
56 | $positionsByType[FulltextProxyInterface::TYPE_CONTENT] ?? [],
57 | isset($this->metadata[$id]) ? $this->metadata[$id]['wordCount'] : 0,
58 | isset($this->toc[$serializedExtId]) ? $this->toc[$serializedExtId]->getRelevanceRatio() : 1.0
59 | ));
60 | }
61 | }
62 | }
63 |
64 | return $result;
65 | }
66 |
/**
 * {@inheritdoc}
 *
 * For every external id of the query, the first two stored snippets are always attached as a fallback;
 * after that only the snippets covering one of the requested positions are attached.
 *
 * @throws UnknownIdException
 */
public function getSnippets(SnippetQuery $snippetQuery): SnippetResult
{
    $snippetResult = new SnippetResult();

    $collect = function (ExternalId $externalId, array $positions) use ($snippetResult) {
        $internalId     = $this->internalIdFromExternalId($externalId);
        $storedSnippets = $this->metadata[$internalId]['snippets'] ?? [];
        $attachedCount  = 0;

        foreach ($storedSnippets as $storedSnippet) {
            if (!$storedSnippet instanceof SnippetSource) {
                throw new LogicException('Snippets must be stored as array of SnippetSource.');
            }

            // Once two snippets are attached, the rest must cover a requested position
            if ($attachedCount >= 2 && !$storedSnippet->coversOneOfPositions($positions)) {
                continue;
            }

            $snippetResult->attach($externalId, $storedSnippet);
            $attachedCount++;
        }
    };

    $snippetQuery->iterate($collect);

    return $snippetResult;
}
89 |
/**
 * {@inheritdoc}
 *
 * Registers title words, keywords and content words (in this order) in the fulltext proxy.
 * Array keys are used as word positions and are cast to int.
 *
 * @throws UnknownIdException
 */
public function addToFulltextIndex(array $titleWords, array $keywords, array $contentWords, ExternalId $externalId): void
{
    $internalId = $this->internalIdFromExternalId($externalId);

    // A list of pairs is used (not a map) so that the type constants are never used as array keys
    $wordGroups = [
        [FulltextProxyInterface::TYPE_TITLE, $titleWords],
        [FulltextProxyInterface::TYPE_KEYWORD, $keywords],
        [FulltextProxyInterface::TYPE_CONTENT, $contentWords],
    ];

    foreach ($wordGroups as [$type, $words]) {
        foreach ($words as $position => $word) {
            $this->fulltextProxy->addWord($word, $internalId, $type, (int)$position);
        }
    }
}
107 |
/**
 * {@inheritdoc}
 *
 * A word is considered excluded when it is present as a key in the excluded-words map
 * (cleanup() puts the dropped frequent words there).
 */
public function isExcludedWord(string $word): bool
{
    return isset($this->excludedWords[$word]);
}
115 |
/**
 * Drops frequent words from the index and remembers them as excluded words.
 *
 * The frequency threshold is derived from the current number of TOC entries.
 */
public function cleanup(): void
{
    $tocSize       = \count($this->toc);
    $frequentWords = $this->fulltextProxy->getFrequentWords(Finder::fulltextRateExcludeNum($tocSize));

    foreach ($frequentWords as $frequentWord => $unusedStat) {
        // Remove the word from the fulltext index first, then mark it as excluded
        $this->fulltextProxy->removeWord($frequentWord);
        $this->excludedWords[$frequentWord] = 1;
    }
}
129 |
/**
 * {@inheritdoc}
 *
 * Removes the fulltext data of the item and its metadata (word count, images, snippets).
 *
 * @throws UnknownIdException
 */
public function removeFromIndex(ExternalId $externalId): void
{
    $internalId = $this->internalIdFromExternalId($externalId);

    $this->fulltextProxy->removeById($internalId);

    // Metadata is keyed by the internal id at the top level ($this->metadata[$internalId]['wordCount'] etc.),
    // so the whole record is dropped at once. Otherwise stale data (e.g. old snippets) would survive
    // re-indexing of the same item.
    unset($this->metadata[$internalId]);
}
147 |
/**
 * {@inheritdoc}
 *
 * An already known external id keeps its internal id (the old TOC entry is replaced).
 * An unknown external id gets the greatest internal id present in the TOC plus one.
 */
public function addEntryToToc(TocEntry $entry, ExternalId $externalId): void
{
    try {
        $internalId = $this->internalIdFromExternalId($externalId);
        $this->removeFromToc($externalId);
    } catch (UnknownIdException $e) {
        $greatestId = array_reduce(
            $this->toc,
            static function ($carry, $existingEntry) {
                return max($carry, $existingEntry->getInternalId());
            },
            0
        );
        $internalId = $greatestId + 1;
    }

    $entry->setInternalId($internalId);

    $serializedExtId                  = $externalId->toString();
    $this->toc[$serializedExtId]      = $entry;
    $this->externalIdMap[$internalId] = $externalId;
}
169 |
/**
 * {@inheritdoc}
 *
 * Stores the word count and the image collection of the item. Other metadata keys
 * already stored for the item are left untouched.
 *
 * @throws UnknownIdException
 */
public function addMetadata(ExternalId $externalId, int $wordCount, ImgCollection $imgCollection): void
{
    $internalId = $this->internalIdFromExternalId($externalId);

    $this->metadata[$internalId] = array_replace(
        $this->metadata[$internalId] ?? [],
        [
            'wordCount' => $wordCount,
            'images'    => $imgCollection,
        ]
    );
}
180 |
/**
 * Stores the snippets of the item, replacing the previously stored ones.
 * Does nothing (and does not even resolve the id) when no snippets are given.
 *
 * @throws UnknownIdException
 */
public function addSnippets(ExternalId $externalId, SnippetSource ...$snippets): void
{
    if ($snippets !== []) {
        $internalId = $this->internalIdFromExternalId($externalId);

        $this->metadata[$internalId]['snippets'] = $snippets;
    }
}
191 |
/**
 * {@inheritdoc}
 *
 * External ids missing in the TOC are silently skipped. Items without stored images
 * get an empty ImgCollection.
 */
public function getTocByExternalIds(ExternalIdCollection $externalIds): array
{
    $entries = [];

    foreach ($externalIds->toArray() as $externalId) {
        $tocKey = $externalId->toString();
        if (!isset($this->toc[$tocKey])) {
            continue;
        }

        $tocEntry = $this->toc[$tocKey];
        $images   = $this->metadata[$tocEntry->getInternalId()]['images'] ?? new ImgCollection();

        $entries[] = new TocEntryWithMetadata($tocEntry, $externalId, $images);
    }

    return $entries;
}
211 |
/**
 * {@inheritdoc}
 *
 * Returns null when the external id is not present in the TOC.
 */
public function getTocByExternalId(ExternalId $externalId): ?TocEntry
{
    $tocKey = $externalId->toString();

    if (isset($this->toc[$tocKey])) {
        return $this->toc[$tocKey];
    }

    return null;
}
221 |
/**
 * {@inheritdoc}
 *
 * Removes both the TOC entry and the internal-to-external id mapping.
 * Unknown external ids are ignored.
 */
public function removeFromToc(ExternalId $externalId): void
{
    $tocKey = $externalId->toString();

    if (isset($this->toc[$tocKey])) {
        $mappedInternalId = $this->toc[$tocKey]->getInternalId();

        unset($this->externalIdMap[$mappedInternalId]);
        unset($this->toc[$tocKey]);
    }
}
235 |
/**
 * {@inheritdoc}
 *
 * With a null instance id the size of the whole TOC is returned. Otherwise only the entries
 * whose external id belongs to the given instance are counted, which matches the instance
 * filtering applied when fulltext results are built.
 *
 * @param int|null $instanceId Instance to count the entries of, or null for all instances.
 */
public function getTocSize(?int $instanceId): int
{
    if ($instanceId === null) {
        return \count($this->toc);
    }

    $size = 0;
    foreach ($this->toc as $tocEntry) {
        // Entries without a mapped external id cannot be attributed to an instance and are skipped
        $externalId = $this->externalIdFromInternalId($tocEntry->getInternalId());
        if ($externalId !== null && $externalId->getInstanceId() === $instanceId) {
            $size++;
        }
    }

    return $size;
}
243 |
/**
 * Resolves the internal id of the TOC entry registered for the external id.
 *
 * @throws UnknownIdException if the external id is not present in the TOC.
 */
private function internalIdFromExternalId(ExternalId $externalId): int
{
    $tocEntry = $this->toc[$externalId->toString()] ?? null;

    if ($tocEntry === null) {
        throw UnknownIdException::createIndexMissingExternalId($externalId);
    }

    return $tocEntry->getInternalId();
}
256 |
/**
 * Looks up the external id mapped to the internal id.
 *
 * @return ExternalId|null Null when nothing is mapped to the internal id.
 */
private function externalIdFromInternalId(int $internalId): ?ExternalId
{
    if (isset($this->externalIdMap[$internalId])) {
        return $this->externalIdMap[$internalId];
    }

    return null;
}
261 | }
262 |
--------------------------------------------------------------------------------
/tests/unit/Rose/Helper/StringHelperTest.php:
--------------------------------------------------------------------------------
1 | $str) {
25 | $this->assertEquals($sentences[$i], $str);
26 | }
27 | }
28 |
/**
 * Data sets for the sentence splitting test.
 *
 * Each data set is a list of: the source text, the list of expected sentences and, in some sets,
 * a third boolean element. NOTE(review): the boolean presumably switches on the handling of the
 * internal formatting sequences like "\i" ... "\I" - confirm against the signature of the test method.
 *
 * @return array
 */
public function sentenceDataProvider(): array
{
    // Sample sentence that is not among the data sets below (translated from Russian; the original uses
    // guillemet-quoted direct speech): The lecturer asked: "What is the meaning of the course title?"
    // I tried to recall what he had said in the first lecture and to reproduce his words.
    return [
        ['One sentence.', ['One sentence.']],
        ['Second sentence. And a third one 123.', ['Second sentence.', 'And a third one 123.']],
        ['Текст на русском. И еще предложение. 1, 2, 3 и т. д. Цифры, буквы, и т. п., могут встретиться.', [
            'Текст на русском.',
            'И еще предложение.',
            '1, 2, 3 и т. д.',
            'Цифры, буквы, и т. п., могут встретиться.',
        ]],
        // Formatting sequences that span several sentences are expected to be re-balanced in every sentence
        ['Sentence \i1. Sentence 2. Sentence\I 3.', ['Sentence \i1.\I', '\iSentence 2.\I', '\iSentence\I 3.'], true],
        ['Sentence \i1. Sentence 2. Sentence\B 3.', ['Sentence \i1.\I', '\iSentence 2.\I', '\b\iSentence\B 3.\I'], true],
        ['\i\uSentence \b1\B. Sentence 2. Sentence 3.\U\I', ['\i\uSentence \b1\B.\U\I', '\i\uSentence 2.\U\I', '\i\uSentence 3.\U\I'], true],
        [
            'Поезд отправился из пункта А в пункт Б. Затем вернулся назад.',
            [
                'Поезд отправился из пункта А в пункт Б.',
                'Затем вернулся назад.',
            ]],
        // Various sentence terminators: period, exclamation mark, question mark, ellipsis
        [
            'Это пример абзаца. Он содержит несколько предложений. Каждое предложение заканчивается точкой! Иногда используется вопросительный знак? И восклицательный знак! Иногда используются многоточия... Но это не всегда так.',
            [
                'Это пример абзаца.',
                'Он содержит несколько предложений.',
                'Каждое предложение заканчивается точкой!',
                'Иногда используется вопросительный знак?',
                'И восклицательный знак!',
                'Иногда используются многоточия...',
                'Но это не всегда так.',
            ]
        ],
        // Direct speech introduced by a hyphen, an en dash or an em dash
        [
            '- Прямая речь тоже разбивается на предложения? – Да, безусловно! — Отлично, то, что нужно. - Пожалуйста.',
            [
                '- Прямая речь тоже разбивается на предложения?',
                '– Да, безусловно!',
                '— Отлично, то, что нужно.',
                '- Пожалуйста.',
            ]
        ],
        // Direct speech in quotes
        [
            '"Прямая речь может быть в другом синтаксисе", - сказал я. Противник добавил: «Как это скучно!» И следом: «Как это так». Такие дела.',
            [
                '"Прямая речь может быть в другом синтаксисе", - сказал я.',
                'Противник добавил: «Как это скучно!»',
                'И следом: «Как это так».',
                'Такие дела.',
            ]
        ],
        // Initials must not end a sentence
        [
            'На первом курсе А. П. Петров вел математику. А. П. Петров делал это хорошо. Все радовались А.П. Петрову. А.П. Петров пел математику.',
            [
                'На первом курсе А. П. Петров вел математику.',
                'А. П. Петров делал это хорошо.',
                'Все радовались А.П. Петрову.',
                'А.П. Петров пел математику.',
            ]
        ],
        // Abbreviations must not end a sentence
        [
            'Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment.',
            [
                'Last week, former director of the F.B.I. James B. Comey was fired.',
                'Mr. Comey was not available for comment.',
            ]
        ],
        // A long single sentence with initials and abbreviations inside parentheses
        [
            'На первом курсе А. П. Петров (зам. декана), Д. А. Александров (преподаватель физики) и несколько студентов нашего факультета (я в том числе) отправились в Тверь на проведение окружного этапа школьной олимпиады по физике.',
            [
                'На первом курсе А. П. Петров (зам. декана), Д. А. Александров (преподаватель физики) и несколько студентов нашего факультета (я в том числе) отправились в Тверь на проведение окружного этапа школьной олимпиады по физике.',
            ]
        ],
    ];
}
104 |
/**
 * Checks both the fixed text and the tag balance reported through the by-reference argument.
 *
 * @dataProvider unbalancedInternalFormattingDataProvider
 */
public function testFixUnbalancedInternalFormatting(string $text, string $expected, array $expectedTags): void
{
    $collectedTags = [];
    $fixedText     = StringHelper::fixUnbalancedInternalFormatting($text, $collectedTags);

    $this->assertEquals($expected, $fixedText);
    $this->assertEquals($expectedTags, $collectedTags);
}
114 |
/**
 * Data sets for testFixUnbalancedInternalFormatting().
 *
 * Each data set is: the source text, the expected fixed text and the expected tag map
 * (lowercase tag letter => integer balance, as compared by the test).
 *
 * Note on quoting: in single-quoted PHP strings both '\\i' and '\i' produce a backslash followed by "i".
 *
 * @return array
 */
public function unbalancedInternalFormattingDataProvider(): array
{
    return [
        [
            '\\iThis is \\bformatted text\\I with \\Bspecial characters\\i.',
            '\\iThis is \\bformatted text\\I with \\Bspecial characters\\i.\\I',
            ['i' => 1, 'b' => 0],
        ],
        // Escaped backslashes do not start a formatting sequence
        [
            'Normal text with escaped formatting symbols like \\\\draw or \\\\inline or \\\\\\\\uuu.',
            'Normal text with escaped formatting symbols like \\\\draw or \\\\inline or \\\\\\\\uuu.',
            [],
        ],
        ['', '', []],
        ['456789i', '456789i', []],
        // The cases below differ in the number of backslashes before "I":
        // an odd number makes a closing tag, an even number is just escaped backslashes
        [
            '456789\\I',
            '\\i456789\\I',
            ['i' => -1],
        ],
        [
            '456789\\\\I',
            '456789\\\\I',
            [],
        ],
        [
            '456789\\\\\\I',
            '\\i456789\\\\\\I',
            ['i' => -1],
        ],
        [
            '456789\\\\\\\\I',
            '456789\\\\\\\\I',
            [],
        ],
        [
            '456789\\\\\\\\\\I',
            '\\i456789\\\\\\\\\\I',
            ['i' => -1],
        ],
        [
            '\\u456789',
            '\\u456789\\U',
            ['u' => 1],
        ],
        [
            '\\u\\D\\\\I\\b',
            '\\d\\u\\D\\\\I\\b\\B\\U',
            ['d' => -1, 'u' => 1, 'b' => 1],
        ],
        [
            '\i123 \b456 \i789',
            '\i123 \b456 \i789\B\I\I', // NOTE: This is not what one expects. The current implementation does not account for nested tags of the same kind since they do not make sense
            ['i' => 2, 'b' => 1],
        ],
        // A closing tag followed by an opening one gives a zero balance, and the text is left as is
        [
            '\I 123 \i',
            '\I 123 \i',
            ['i' => 0],
        ],
    ];
}
177 |
/**
 * Checks the pair of unbalanced opening and closing tags detected in the text.
 *
 * @dataProvider getUnbalancedInternalFormattingDataProvider
 */
public function testGetUnbalancedInternalFormatting(string $text, array $expected): void
{
    $unbalancedFormatting = StringHelper::getUnbalancedInternalFormatting($text);

    $this->assertEquals($expected, $unbalancedFormatting);
}
185 |
/**
 * Data sets for testGetUnbalancedInternalFormatting().
 *
 * Each data set is: the source text and the expected result, a pair of lists
 * [tags left open (lowercase letters), closing tags without an opening one (uppercase letters)].
 *
 * Note on quoting: in single-quoted PHP strings both '\\i' and '\i' produce a backslash followed by "i".
 *
 * @return array
 */
public function getUnbalancedInternalFormattingDataProvider(): array
{
    return [
        [
            '\\iThis is \\bformatted text\\I with \\Bspecial characters\\i.',
            [['i'], []],
        ],
        // Escaped backslashes do not start a formatting sequence
        [
            'Normal text with escaped formatting symbols like \\\\draw or \\\\inline or \\\\\\\\uuu.',
            [[], []],
        ],
        ['', [[], []]],
        ['456789i', [[], []]],
        // The cases below differ in the number of backslashes before "I":
        // an odd number makes a closing tag, an even number is just escaped backslashes
        [
            '456789\\I',
            [[], ['I']],
        ],
        [
            '456789\\\\I',
            [[], []],
        ],
        [
            '456789\\\\\\I',
            [[], ['I']],
        ],
        [
            '456789\\\\\\\\I',
            [[], []],
        ],
        [
            '456789\\\\\\\\\\I',
            [[], ['I']],
        ],
        [
            '\\u456789',
            [['u'], []],
        ],
        [
            '\\u\\D\\\\I\\b',
            [['u', 'b'], ['D']],
        ],
        // Nested tags of the same kind are reported once per occurrence
        [
            '\i123 \b456 \i789',
            [['i', 'b', 'i'], []],
        ],
        // A closing tag preceding an opening one: both are reported as unbalanced
        [
            '\I 123 \i',
            [['i'], ['I']],
        ],
    ];
}
237 | }
238 |
--------------------------------------------------------------------------------
/src/S2/Rose/Entity/SnippetLine.php:
--------------------------------------------------------------------------------
1 | line = $line;
51 | $this->formatId = $formatId;
52 | $this->stemmer = $stemmer;
53 | $this->stemsFoundSomewhere = $stemsFoundSomewhere;
54 | $this->relevance = $relevance;
55 | }
56 |
/**
 * Creates a snippet line from a snippet source when there are no found words:
 * the list of found stems is empty, the relevance is zero, and the stemmer returns words unchanged.
 */
public static function createFromSnippetSourceWithoutFoundWords(SnippetSource $snippetSource): self
{
    // Stemmer stub returning every word as is
    $identityStemmer = new class implements StemmerInterface {
        public function stemWord(string $word, bool $normalize = true): string
        {
            return $word;
        }
    };

    return new static($snippetSource->getText(), $snippetSource->getFormatId(), $identityStemmer, [], 0);
}
72 |
/**
 * Returns the relevance value this line was constructed with.
 */
public function getRelevance(): float
{
    return $this->relevance;
}
77 |
/**
 * Returns the words or stems that were matched in the line. Parses the line on the first call.
 *
 * @return string[]
 * @deprecated Not used anymore. TODO delete if not needed
 */
public function getFoundStems(): array
{
    // parse() fills $this->foundStems and is a no-op when the line has been parsed already
    $this->parse();

    return $this->foundStems;
}
88 |
/**
 * Returns the stored line text as is (no highlighting, no masking applied).
 */
public function getLine(): string
{
    return $this->line;
}
93 |
/**
 * Returns the format id of the line (compared with SnippetSource::FORMAT_INTERNAL during parsing).
 */
public function getFormatId(): int
{
    return $this->formatId;
}
98 |
/**
 * Returns the line with every found word wrapped into the highlight template.
 *
 * Formatting sequences that are unbalanced inside a highlighted fragment are balanced inside the template
 * and re-emitted outside of it, so the template never contains a half-open formatting.
 *
 * @param string $highlightTemplate sprintf() template with a "%s" placeholder for the highlighted fragment.
 * @param bool   $includeFormatting Passed to SnippetTextHelper::convertFormatting().
 *
 * @throws RuntimeException if the template does not contain "%s".
 */
public function getHighlighted(string $highlightTemplate, bool $includeFormatting): string
{
    if (false === strpos($highlightTemplate, '%s')) {
        throw new RuntimeException('Highlight template must contain "%s" substring for sprintf() function.');
    }

    $this->parse();

    $source = $this->getLineWithoutMaskedFragments();
    $output = '';
    $cursor = 0;

    foreach ($this->highlightIntervals->toArray() as [$from, $to]) {
        $fragment = substr($source, $from, $to - $from);

        [$unclosedTags, $unopenedTags] = StringHelper::getUnbalancedInternalFormatting($fragment);

        // Tags opened inside the fragment: close them inside the template (in reverse order)
        // and open them again right after the template.
        $reopenAfter = '';
        $closeInside = '';
        foreach ($unclosedTags as $tag) {
            $reopenAfter .= '\\' . $tag;
            $closeInside = '\\' . strtoupper($tag) . $closeInside;
        }

        // Tags closed inside the fragment: close them right before the template
        // and open them inside the template (in reverse order).
        $closeBefore = '';
        $openInside  = '';
        foreach ($unopenedTags as $tag) {
            $closeBefore .= '\\' . $tag;
            $openInside = '\\' . strtolower($tag) . $openInside;
        }

        $output .= substr($source, $cursor, $from - $cursor)
            . $closeBefore
            . sprintf($highlightTemplate, $openInside . $fragment . $closeInside)
            . $reopenAfter;

        $cursor = $to;
    }

    // Tail of the line after the last highlighted fragment
    $output .= substr($source, $cursor);

    return SnippetTextHelper::convertFormatting(
        $this->restoreMaskedFragments($output),
        $this->formatId,
        $includeFormatting
    );
}
141 |
/**
 * Sets the regexes that are passed to SnippetTextHelper::sanitize() when the line is prepared for parsing.
 *
 * The sanitized line is cached after the first use, so this setter takes effect only
 * when called before the line is parsed or highlighted.
 *
 * @param array $regexes
 */
public function setMaskRegexArray(array $regexes): void
{
    $this->maskRegexArray = $regexes;
}
146 |
/**
 * Splits the sanitized line into words and records the byte intervals of the words to be highlighted.
 *
 * A word is highlighted when the word itself or its stem is among $this->stemsFoundSomewhere.
 * Otherwise, if its stem contains one of StringHelper::WORD_COMPONENT_DELIMITERS, the word is split
 * into components and each component is checked the same way.
 *
 * The method fills $this->highlightIntervals and $this->foundStems and does nothing on repeated calls.
 */
protected function parse(): void
{
    if ($this->highlightIntervals !== null) {
        // Already parsed
        return;
    }

    $this->highlightIntervals = new HighlightIntervals();

    $line = $this->getLineWithoutMaskedFragments();

    // Nothing to look for: the intervals stay empty
    if (\count($this->stemsFoundSomewhere) === 0) {
        return;
    }

    // Build the delimiter regex. The internal format additionally has to treat
    // backslash-prefixed formatting sequences as a part of a word or of a delimiter.
    if ($this->formatId === SnippetSource::FORMAT_INTERNAL) {
        $regex = '/(?x)
            [\\d\\p{L}^_]*(?:(?:\\\\[' . StringHelper::FORMATTING_SYMBOLS . '])+[\\d\\p{L}^_]*)* # matches as many word and formatting characters as possible
            (*SKIP) # do not cross this line on backtracking
            \\K # restart pattern matching to the end of the word.
            (?: # delimiter regex which includes:
                [^\\\\\\d\\p{L}^_\\-.,] # non-word character
                |[\\-.,]+(?![\\d\\p{L}\\-.,]) # [,-.] followed by a non-word character
                |\\\\(?:[' . StringHelper::FORMATTING_SYMBOLS . '](?![\\d\\p{L}\\-.,])|\\\\) # formatting sequence followed by a non-word character or escaped backslash
            )+/iu';
    } else {
        $regex = '/(?x)
            [\\d\\p{L}^_]* # matches as many word and formatting characters as possible
            (*SKIP) # do not cross this line on backtracking
            \\K # restart pattern matching to the end of the word.
            (?: # delimiter regex which includes:
                [^\\d\\p{L}^_\\-.,] # non-word character
                |[\\-.,]+(?![\\d\\p{L}\\-.,]) # [,-.] followed by a non-word character
            )+/iu';
    }
    // Every item is [word, byte offset of the word in $line]
    $wordArray = preg_split($regex, $line, -1, \PREG_SPLIT_OFFSET_CAPTURE);

    // Flipped to use isset() for the membership test
    $flippedStems = array_flip($this->stemsFoundSomewhere);
    foreach ($wordArray as [$rawWord, $offset]) {
        // The raw word keeps formatting and store markers (needed for correct offsets);
        // the cleared word is used for matching only.
        $word = $this->formatId === SnippetSource::FORMAT_INTERNAL ? StringHelper::clearInternalFormatting($rawWord) : $rawWord;
        $word = str_replace(SnippetTextHelper::STORE_MARKER, '', $word);

        if ($word === '') {
            // No need to call $intervals->skipInterval() since regex may work several times on a single delimiter
            continue;
        }

        // $stem stays null when the word itself matches; the stemmer is called only otherwise
        $stem = null;
        if (isset($flippedStems[$word]) || isset($flippedStems[$stem = $this->stemmer->stemWord($word)])) {
            $this->highlightIntervals->addInterval($offset, $offset + \strlen($rawWord));
            $this->foundStems[] = $stem ?? $word;
        } else {
            // Word is not found. Check if it is like a hyphenated compound word, e.g. 'test-drive' or 'long-term'
            // ($stem is always a string here: the second isset() above has been evaluated)
            if (false !== strpbrk($stem, StringHelper::WORD_COMPONENT_DELIMITERS)) {
                // Here is more simple regex since formatting sequences may be present.
                // The downside is appearance of empty words, but they are filtered out later.
                $subWordArray = preg_split('#[\-.,]+#u', $rawWord, -1, \PREG_SPLIT_OFFSET_CAPTURE);
                foreach ($subWordArray as [$rawSubWord, $subOffset]) {
                    $subWord = $this->formatId === SnippetSource::FORMAT_INTERNAL ? StringHelper::clearInternalFormatting($rawSubWord) : $rawSubWord;
                    $subWord = str_replace(SnippetTextHelper::STORE_MARKER, '', $subWord);

                    // NOTE(review): the raw component is tested here, while the outer loop tests the cleared
                    // word; a component consisting only of formatting is not skipped - confirm it is intended.
                    if ($rawSubWord === '') {
                        continue;
                    }

                    $subStem = null;
                    if (isset($flippedStems[$subWord]) || isset($flippedStems[$subStem = $this->stemmer->stemWord($subWord)])) {
                        // Sub-offsets are relative to the raw word, hence the sum with the word offset
                        $this->highlightIntervals->addInterval($offset + $subOffset, $offset + $subOffset + \strlen($rawSubWord));
                        $this->foundStems[] = $subStem ?? $subWord;
                    } else {
                        $this->highlightIntervals->skipInterval();
                    }
                }
            } else {
                // Not a compound word
                $this->highlightIntervals->skipInterval();
            }
        }
    }
}
227 |
/**
 * Returns the line processed by SnippetTextHelper::sanitize() with the configured mask regexes.
 * The result is computed once and cached.
 */
protected function getLineWithoutMaskedFragments(): string
{
    if ($this->lineWithoutMaskedFragments === null) {
        $this->lineWithoutMaskedFragments = SnippetTextHelper::sanitize(
            $this->line,
            $this->maskRegexArray,
            $this->maskedFragments
        );
    }

    return $this->lineWithoutMaskedFragments;
}
238 |
/**
 * Puts the masked fragments back into the line by delegating to SnippetTextHelper::restore().
 *
 * NOTE(review): $this->maskedFragments is presumably filled by SnippetTextHelper::sanitize()
 * through a by-reference argument - confirm against the helper.
 */
protected function restoreMaskedFragments(string $line): string
{
    return SnippetTextHelper::restore($line, $this->maskedFragments);
}
243 | }
244 |
--------------------------------------------------------------------------------