├── library └── ZendSearch │ ├── Lucene │ ├── Exception │ │ ├── RuntimeException.php │ │ ├── OutOfRangeException.php │ │ ├── OutOfBoundsException.php │ │ ├── InvalidFileFormatException.php │ │ ├── ExtensionNotLoadedException.php │ │ ├── UnexpectedValueException.php │ │ ├── UnsupportedMethodCallException.php │ │ ├── ExceptionInterface.php │ │ └── InvalidArgumentException.php │ ├── Document │ │ ├── Exception │ │ │ ├── ExceptionInterface.php │ │ │ └── InvalidArgumentException.php │ │ ├── AbstractOpenXML.php │ │ ├── Docx.php │ │ ├── Field.php │ │ └── Pptx.php │ ├── Search │ │ ├── Exception │ │ │ ├── ExceptionInterface.php │ │ │ └── QueryParserException.php │ │ ├── Weight │ │ │ ├── EmptyResultWeight.php │ │ │ ├── AbstractWeight.php │ │ │ ├── Phrase.php │ │ │ ├── Term.php │ │ │ ├── Boolean.php │ │ │ └── MultiTerm.php │ │ ├── Highlighter │ │ │ ├── HighlighterInterface.php │ │ │ └── DefaultHighlighter.php │ │ ├── QueryEntry │ │ │ ├── AbstractQueryEntry.php │ │ │ ├── Subquery.php │ │ │ ├── Phrase.php │ │ │ └── Term.php │ │ ├── Similarity │ │ │ └── DefaultSimilarity.php │ │ ├── QueryHit.php │ │ ├── Query │ │ │ ├── EmptyResult.php │ │ │ ├── Insignificant.php │ │ │ ├── Preprocessing │ │ │ │ ├── AbstractPreprocessing.php │ │ │ │ └── Phrase.php │ │ │ ├── Term.php │ │ │ └── AbstractQuery.php │ │ └── QueryToken.php │ ├── Analysis │ │ ├── Analyzer │ │ │ ├── Common │ │ │ │ ├── Text │ │ │ │ │ └── CaseInsensitive.php │ │ │ │ ├── TextNum │ │ │ │ │ └── CaseInsensitive.php │ │ │ │ ├── Utf8 │ │ │ │ │ └── CaseInsensitive.php │ │ │ │ ├── Utf8Num │ │ │ │ │ └── CaseInsensitive.php │ │ │ │ ├── AbstractCommon.php │ │ │ │ ├── Text.php │ │ │ │ ├── TextNum.php │ │ │ │ ├── Utf8.php │ │ │ │ └── Utf8Num.php │ │ │ ├── AnalyzerInterface.php │ │ │ ├── Analyzer.php │ │ │ └── AbstractAnalyzer.php │ │ ├── TokenFilter │ │ │ ├── TokenFilterInterface.php │ │ │ ├── LowerCase.php │ │ │ ├── ShortWords.php │ │ │ ├── LowerCaseUtf8.php │ │ │ └── StopWords.php │ │ └── Token.php │ ├── Index │ │ ├── 
TermsPriorityQueue.php │ │ ├── FieldInfo.php │ │ ├── TermsStreamInterface.php │ │ ├── TermInfo.php │ │ ├── DocsFilter.php │ │ ├── SegmentWriter │ │ │ ├── StreamWriter.php │ │ │ └── DocumentWriter.php │ │ └── Term.php │ ├── FSMAction.php │ ├── Storage │ │ ├── Directory │ │ │ └── DirectoryInterface.php │ │ └── File │ │ │ ├── FileInterface.php │ │ │ └── Filesystem.php │ ├── Document.php │ ├── Lucene.php │ ├── AbstractPriorityQueue.php │ ├── TermStreamsPriorityQueue.php │ └── LockManager.php │ ├── Exception │ └── ExceptionInterface.php │ └── Stdlib │ └── ErrorHandler.php ├── composer.json ├── README.md ├── LICENSE └── .github └── workflows └── test-application.yaml /library/ZendSearch/Lucene/Exception/RuntimeException.php: -------------------------------------------------------------------------------- 1 | =7.2.0" 17 | }, 18 | "replace": { 19 | "zendframework/zendsearch": "self.version" 20 | }, 21 | "extra": { 22 | "branch-alias": { 23 | "dev-master": "2.0-dev" 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ZendSearch component 2 | 3 | ![Latest Version](https://img.shields.io/github/v/tag/handcraftedinthealps/zendsearch.svg) 4 | ![Test Workflow](https://img.shields.io/github/actions/workflow/status/handcraftedinthealps/zendsearch/test-application.yaml?branch=master&label=test-workflow) 5 | 6 | This is a fork of [ZendSearch](https://github.com/zendframework/ZendSearch) kept compatible with the latest PHP versions. 7 | 8 | You can install using: 9 | 10 | ``` 11 | curl -s https://getcomposer.org/installer | php 12 | php composer.phar install 13 | ``` 14 | 15 | At that point, follow the instructions in the documentation folder for actual 16 | usage of the component. (Documentation is forthcoming.) 
17 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/Exception/QueryParserException.php: -------------------------------------------------------------------------------- 1 | addFilter(new TokenFilter\LowerCase()); 26 | } 27 | } 28 | 29 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Analysis/Analyzer/Common/TextNum/CaseInsensitive.php: -------------------------------------------------------------------------------- 1 | addFilter(new TokenFilter\LowerCase()); 26 | } 27 | } 28 | 29 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Analysis/Analyzer/Common/Utf8/CaseInsensitive.php: -------------------------------------------------------------------------------- 1 | addFilter(new TokenFilter\LowerCaseUtf8()); 28 | } 29 | } 30 | 31 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Analysis/Analyzer/Common/Utf8Num/CaseInsensitive.php: -------------------------------------------------------------------------------- 1 | addFilter(new TokenFilter\LowerCaseUtf8()); 28 | } 29 | } 30 | 31 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Analysis/TokenFilter/TokenFilterInterface.php: -------------------------------------------------------------------------------- 1 | currentTerm()->key(), $termsStream2->currentTerm()->key()) < 0; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Index/FieldInfo.php: -------------------------------------------------------------------------------- 1 | name = $name; 30 | $this->isIndexed = $isIndexed; 31 | $this->number = $number; 32 | $this->storeTermVector = $storeTermVector; 33 | $this->normsOmitted = $normsOmitted; 34 | $this->payloadsStored = $payloadsStored; 35 
| } 36 | } 37 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/FSMAction.php: -------------------------------------------------------------------------------- 1 | _object = $object; 45 | $this->_method = $method; 46 | } 47 | 48 | public function doAction() 49 | { 50 | $methodName = $this->_method; 51 | $this->_object->$methodName(); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/Highlighter/HighlighterInterface.php: -------------------------------------------------------------------------------- 1 | getTermText() ), 33 | $srcToken->getStartOffset(), 34 | $srcToken->getEndOffset()); 35 | 36 | $newToken->setPositionIncrement($srcToken->getPositionIncrement()); 37 | 38 | return $newToken; 39 | } 40 | } 41 | 42 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/QueryEntry/AbstractQueryEntry.php: -------------------------------------------------------------------------------- 1 | _boost *= $boostFactor; 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Index/TermsStreamInterface.php: -------------------------------------------------------------------------------- 1 | length = $length; 38 | } 39 | 40 | /** 41 | * Normalize Token or remove it (if null is returned) 42 | * 43 | * @param \ZendSearch\Lucene\Analysis\Token $srcToken 44 | * @return \ZendSearch\Lucene\Analysis\Token 45 | */ 46 | public function normalize(Token $srcToken) 47 | { 48 | if (strlen($srcToken->getTermText()) < $this->length) { 49 | return null; 50 | } else { 51 | return $srcToken; 52 | } 53 | } 54 | } 55 | 56 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Analysis/Analyzer/Analyzer.php: 
-------------------------------------------------------------------------------- 1 | docFreq = $docFreq; 61 | $this->freqPointer = $freqPointer; 62 | $this->proxPointer = $proxPointer; 63 | $this->skipOffset = $skipOffset; 64 | $this->indexPointer = $indexPointer; 65 | } 66 | } 67 | 68 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/QueryEntry/Subquery.php: -------------------------------------------------------------------------------- 1 | _query = $query; 35 | } 36 | 37 | /** 38 | * Process modifier ('~') 39 | * 40 | * @param mixed $parameter 41 | * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException 42 | */ 43 | public function processFuzzyProximityModifier($parameter = null) 44 | { 45 | throw new \ZendSearch\Lucene\Search\Exception\QueryParserException( 46 | '\'~\' sign must follow term or phrase' 47 | ); 48 | } 49 | 50 | 51 | /** 52 | * Transform entry to a subquery 53 | * 54 | * @param string $encoding 55 | * @return \ZendSearch\Lucene\Search\Query\AbstractQuery 56 | */ 57 | public function getQuery($encoding) 58 | { 59 | $this->_query->setBoost($this->_boost); 60 | 61 | return $this->_query; 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Index/DocsFilter.php: -------------------------------------------------------------------------------- 1 | => array( => , 28 | * => , 29 | * => , 30 | * ... ), 31 | * => array( => , 32 | * => , 33 | * => , 34 | * ... ), 35 | * => array( => , 36 | * => , 37 | * => , 38 | * ... ), 39 | * ... 
40 | * ) 41 | * 42 | * @var array 43 | */ 44 | public $segmentFilters = array(); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Analysis/TokenFilter/LowerCaseUtf8.php: -------------------------------------------------------------------------------- 1 | getTermText(), 'UTF-8'), 47 | $srcToken->getStartOffset(), 48 | $srcToken->getEndOffset()); 49 | 50 | $newToken->setPositionIncrement($srcToken->getPositionIncrement()); 51 | 52 | return $newToken; 53 | } 54 | } 55 | 56 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Analysis/Analyzer/AbstractAnalyzer.php: -------------------------------------------------------------------------------- 1 | setInput($data, $encoding); 50 | 51 | $tokenList = array(); 52 | while (($nextToken = $this->nextToken()) !== null) { 53 | $tokenList[] = $nextToken; 54 | } 55 | 56 | return $tokenList; 57 | } 58 | 59 | /** 60 | * Tokenization stream API 61 | * Set input 62 | * 63 | * @param string $data 64 | */ 65 | public function setInput($data, $encoding = '') 66 | { 67 | $this->_input = $data; 68 | $this->_encoding = $encoding; 69 | $this->reset(); 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/Weight/AbstractWeight.php: -------------------------------------------------------------------------------- 1 | createWeight(). 17 | * The sumOfSquaredWeights() method is then called on the top-level 18 | * query to compute the query normalization factor Similarity->queryNorm(float). 19 | * This factor is then passed to normalize(float). At this point the weighting 20 | * is complete. 21 | * 22 | * @category Zend 23 | * @package Zend_Search_Lucene 24 | * @subpackage Search 25 | */ 26 | abstract class AbstractWeight 27 | { 28 | /** 29 | * Normalization factor. 
30 | * This value is stored only for query explanation purpose and not used in any other place 31 | * 32 | * @var float 33 | */ 34 | protected $_queryNorm; 35 | 36 | /** 37 | * AbstractWeight value 38 | * 39 | * AbstractWeight value may be initialized in sumOfSquaredWeights() or normalize() 40 | * because they both are invoked either in Query::_initWeight (for top-level query) or 41 | * in corresponding methods of parent query's weights 42 | * 43 | * @var float 44 | */ 45 | protected $_value; 46 | 47 | 48 | /** 49 | * The weight for this query. 50 | * 51 | * @return float 52 | */ 53 | public function getValue() 54 | { 55 | return $this->_value; 56 | } 57 | 58 | /** 59 | * The sum of squared weights of contained query clauses. 60 | * 61 | * @return float 62 | */ 63 | abstract public function sumOfSquaredWeights(); 64 | 65 | /** 66 | * Assigns the query normalization factor to this. 67 | * 68 | * @param $norm 69 | */ 70 | abstract public function normalize($norm); 71 | } 72 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Analysis/Analyzer/Common/AbstractCommon.php: -------------------------------------------------------------------------------- 1 | _filters[] = $filter; 45 | } 46 | 47 | /** 48 | * Apply filters to the token. Can return null when the token was removed. 
49 | * 50 | * @param \ZendSearch\Lucene\Analysis\Token $token 51 | * @return \ZendSearch\Lucene\Analysis\Token|null 52 | */ 53 | public function normalize(Analysis\Token $token) 54 | { 55 | foreach ($this->_filters as $filter) { 56 | $token = $filter->normalize($token); 57 | 58 | // resulting token can be null if the filter removes it 59 | if ($token === null) { 60 | return null; 61 | } 62 | } 63 | 64 | return $token; 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Analysis/Analyzer/Common/Text.php: -------------------------------------------------------------------------------- 1 | _position = 0; 35 | 36 | if ($this->_input === null) { 37 | return; 38 | } 39 | 40 | // convert input into ascii 41 | if (PHP_OS != 'AIX') { 42 | $this->_input = iconv($this->_encoding, 'ASCII//TRANSLIT', $this->_input); 43 | } 44 | $this->_encoding = 'ASCII'; 45 | } 46 | 47 | /** 48 | * Tokenization stream API 49 | * Get next token 50 | * Returns null at the end of stream 51 | * 52 | * @return \ZendSearch\Lucene\Analysis\Token|null 53 | */ 54 | public function nextToken() 55 | { 56 | if ($this->_input === null) { 57 | return null; 58 | } 59 | 60 | 61 | do { 62 | if (! preg_match('/[a-zA-Z]+/', $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_position)) { 63 | // It covers both cases a) there are no matches (preg_match(...) === 0) 64 | // b) error occurred (preg_match(...) 
=== FALSE) 65 | return null; 66 | } 67 | 68 | $str = $match[0][0]; 69 | $pos = $match[0][1]; 70 | $endpos = $pos + strlen($str); 71 | 72 | $this->_position = $endpos; 73 | 74 | $token = $this->normalize(new Analysis\Token($str, $pos, $endpos)); 75 | } while ($token === null); // try again if token is skipped 76 | 77 | return $token; 78 | } 79 | } 80 | 81 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Analysis/Analyzer/Common/TextNum.php: -------------------------------------------------------------------------------- 1 | _position = 0; 35 | 36 | if ($this->_input === null) { 37 | return; 38 | } 39 | 40 | // convert input into ascii 41 | if (PHP_OS != 'AIX') { 42 | $this->_input = iconv($this->_encoding, 'ASCII//TRANSLIT', $this->_input); 43 | } 44 | $this->_encoding = 'ASCII'; 45 | } 46 | 47 | /** 48 | * Tokenization stream API 49 | * Get next token 50 | * Returns null at the end of stream 51 | * 52 | * @return \ZendSearch\Lucene\Analysis\Token|null 53 | */ 54 | public function nextToken() 55 | { 56 | if ($this->_input === null) { 57 | return null; 58 | } 59 | 60 | do { 61 | if (! preg_match('/[a-zA-Z0-9]+/', $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_position)) { 62 | // It covers both cases a) there are no matches (preg_match(...) === 0) 63 | // b) error occurred (preg_match(...) 
=== FALSE) 64 | return null; 65 | } 66 | 67 | $str = $match[0][0]; 68 | $pos = $match[0][1]; 69 | $endpos = $pos + strlen($str); 70 | 71 | $this->_position = $endpos; 72 | 73 | $token = $this->normalize(new Analysis\Token($str, $pos, $endpos)); 74 | } while ($token === null); // try again if token is skipped 75 | 76 | return $token; 77 | } 78 | } 79 | 80 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/Similarity/DefaultSimilarity.php: -------------------------------------------------------------------------------- 1 | _fdxFile = $this->_directory->createFile($this->_name . '.fdx'); 41 | $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt'); 42 | 43 | $this->_files[] = $this->_name . '.fdx'; 44 | $this->_files[] = $this->_name . '.fdt'; 45 | } 46 | 47 | public function addNorm($fieldName, $normVector) 48 | { 49 | if (isset($this->_norms[$fieldName])) { 50 | $this->_norms[$fieldName] .= $normVector; 51 | } else { 52 | $this->_norms[$fieldName] = $normVector; 53 | } 54 | } 55 | 56 | /** 57 | * Close segment, write it to disk and return segment info 58 | * 59 | * @return \ZendSearch\Lucene\Index\SegmentInfo 60 | */ 61 | public function close() 62 | { 63 | if ($this->_docCount == 0) { 64 | return null; 65 | } 66 | 67 | $this->_dumpFNM(); 68 | $this->_generateCFS(); 69 | 70 | return new LuceneIndex\SegmentInfo($this->_directory, 71 | $this->_name, 72 | $this->_docCount, 73 | -1, 74 | null, 75 | true, 76 | true); 77 | } 78 | } 79 | 80 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/Weight/Phrase.php: -------------------------------------------------------------------------------- 1 | _query = $query; 58 | $this->_reader = $reader; 59 | } 60 | 61 | /** 62 | * The sum of squared weights of contained query clauses. 
63 | * 64 | * @return float 65 | */ 66 | public function sumOfSquaredWeights() 67 | { 68 | // compute idf 69 | $this->_idf = $this->_reader->getSimilarity()->idf($this->_query->getTerms(), $this->_reader); 70 | 71 | // compute query weight 72 | $this->_queryWeight = $this->_idf * $this->_query->getBoost(); 73 | 74 | // square it 75 | return $this->_queryWeight * $this->_queryWeight; 76 | } 77 | 78 | 79 | /** 80 | * Assigns the query normalization factor to this. 81 | * 82 | * @param float $queryNorm 83 | */ 84 | public function normalize($queryNorm) 85 | { 86 | $this->_queryNorm = $queryNorm; 87 | 88 | // normalize query weight 89 | $this->_queryWeight *= $queryNorm; 90 | 91 | // idf for documents 92 | $this->_value = $this->_queryWeight * $this->_idf; 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/Highlighter/DefaultHighlighter.php: -------------------------------------------------------------------------------- 1 | _doc = $document; 57 | } 58 | 59 | /** 60 | * Get document for highlighting. 61 | * 62 | * @return \ZendSearch\Lucene\Document\HTML $document 63 | */ 64 | public function getDocument() 65 | { 66 | return $this->_doc; 67 | } 68 | 69 | /** 70 | * Highlight specified words 71 | * 72 | * @param string|array $words Words to highlight. They could be organized using the array or string. 
73 | */ 74 | public function highlight($words) 75 | { 76 | $color = $this->_highlightColors[$this->_currentColorIndex]; 77 | $this->_currentColorIndex = ($this->_currentColorIndex + 1) % count($this->_highlightColors); 78 | 79 | $this->_doc->highlight($words, $color); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/QueryEntry/Phrase.php: -------------------------------------------------------------------------------- 1 | _phrase = $phrase; 59 | $this->_field = $field; 60 | } 61 | 62 | /** 63 | * Process modifier ('~') 64 | * 65 | * @param mixed $parameter 66 | */ 67 | public function processFuzzyProximityModifier($parameter = null) 68 | { 69 | $this->_proximityQuery = true; 70 | 71 | if ($parameter !== null) { 72 | $this->_wordsDistance = $parameter; 73 | } 74 | } 75 | 76 | /** 77 | * Transform entry to a subquery 78 | * 79 | * @param string $encoding 80 | * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException 81 | * @return \ZendSearch\Lucene\Search\Query\AbstractQuery 82 | */ 83 | public function getQuery($encoding) 84 | { 85 | $query = new \ZendSearch\Lucene\Search\Query\Preprocessing\Phrase($this->_phrase, 86 | $encoding, 87 | ($this->_field !== null)? 
88 | iconv($encoding, 'UTF-8', $this->_field) : 89 | null); 90 | 91 | if ($this->_proximityQuery) { 92 | $query->setSlop($this->_wordsDistance); 93 | } 94 | 95 | $query->setBoost($this->_boost); 96 | 97 | return $query; 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/QueryHit.php: -------------------------------------------------------------------------------- 1 | _index = $index; 64 | } 65 | 66 | /** 67 | * Magic method for checking the existence of a field 68 | * 69 | * @param string $offset 70 | * @return boolean TRUE if the field exists else FALSE 71 | */ 72 | public function __isset($offset) 73 | { 74 | return isset($this->getDocument()->$offset); 75 | } 76 | 77 | 78 | /** 79 | * Convenience function for getting fields from the document 80 | * associated with this hit. 81 | * 82 | * @param string $offset 83 | * @return string 84 | */ 85 | public function __get($offset) 86 | { 87 | return $this->getDocument()->getFieldValue($offset); 88 | } 89 | 90 | 91 | /** 92 | * Return the document object for this hit 93 | * 94 | * @return \ZendSearch\Lucene\Document 95 | */ 96 | public function getDocument() 97 | { 98 | if (!$this->_document instanceof Document) { 99 | $this->_document = $this->_index->getDocument($this->document_id); 100 | } 101 | 102 | return $this->_document; 103 | } 104 | 105 | 106 | /** 107 | * Return the index object for this hit 108 | * 109 | * @return \ZendSearch\Lucene\SearchIndexInterface 110 | */ 111 | public function getIndex() 112 | { 113 | return $this->_index; 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/Weight/Term.php: -------------------------------------------------------------------------------- 1 | _term = $term; 73 | $this->_query = $query; 74 | $this->_reader = $reader; 75 | } 76 | 77 | 78 | /** 79 | * The sum of squared weights of contained query clauses. 
80 | * 81 | * @return float 82 | */ 83 | public function sumOfSquaredWeights() 84 | { 85 | // compute idf 86 | $this->_idf = $this->_reader->getSimilarity()->idf($this->_term, $this->_reader); 87 | 88 | // compute query weight 89 | $this->_queryWeight = $this->_idf * $this->_query->getBoost(); 90 | 91 | // square it 92 | return $this->_queryWeight * $this->_queryWeight; 93 | } 94 | 95 | 96 | /** 97 | * Assigns the query normalization factor to this. 98 | * 99 | * @param float $queryNorm 100 | */ 101 | public function normalize($queryNorm) 102 | { 103 | $this->_queryNorm = $queryNorm; 104 | 105 | // normalize query weight 106 | $this->_queryWeight *= $queryNorm; 107 | 108 | // idf for documents 109 | $this->_value = $this->_queryWeight * $this->_idf; 110 | } 111 | } 112 | -------------------------------------------------------------------------------- /.github/workflows/test-application.yaml: -------------------------------------------------------------------------------- 1 | name: Test application 2 | 3 | on: 4 | pull_request: 5 | push: 6 | branches: 7 | - master 8 | - '[0-9]+.[0-9]+' 9 | - '[0-9]+.x' 10 | 11 | jobs: 12 | php: 13 | name: "Run tests with php ${{ matrix.php-version }}" 14 | runs-on: ubuntu-22.04 15 | 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | include: 20 | - php-version: '7.2' 21 | composer-flags: '--prefer-lowest --prefer-stable --prefer-dist --no-interaction' 22 | 23 | - php-version: '7.3' 24 | composer-flags: '--prefer-stable --prefer-dist --no-interaction' 25 | 26 | - php-version: '7.4' 27 | composer-flags: '--prefer-stable --prefer-dist --no-interaction' 28 | 29 | - php-version: '8.0' 30 | composer-flags: '--prefer-stable --prefer-dist --no-interaction' 31 | 32 | - php-version: '8.1' 33 | composer-flags: '--prefer-stable --prefer-dist --no-interaction' 34 | 35 | - php-version: '8.2' 36 | composer-flags: '--prefer-stable --prefer-dist --no-interaction' 37 | 38 | - php-version: '8.3' 39 | composer-flags: '--prefer-stable --prefer-dist 
--no-interaction' 40 | 41 | - php-version: '8.4' 42 | composer-flags: '--prefer-stable --prefer-dist --no-interaction' 43 | 44 | - php-version: '8.5' 45 | composer-flags: '--prefer-stable --prefer-dist --no-interaction' 46 | 47 | steps: 48 | - name: Checkout project 49 | uses: actions/checkout@v6 50 | 51 | - name: Install and configure PHP 52 | uses: shivammathur/setup-php@v2 53 | with: 54 | php-version: ${{ matrix.php-version }} 55 | extensions: 'iconv' 56 | tools: 'composer' 57 | 58 | - name: Get composer cache directory 59 | id: composer-cache-dir 60 | run: echo "::set-output name=dir::$(composer config cache-files-dir)" 61 | 62 | - name: Cache dependencies 63 | uses: actions/cache@v4 64 | id: composer-cache 65 | with: 66 | path: ${{ steps.composer-cache-dir.outputs.dir }} 67 | key: ${{ runner.os }}-composer-${{ hashFiles('composer.lock') }} 68 | restore-keys: | 69 | ${{ runner.os }}-composer- 70 | 71 | - name: Install dependencies 72 | run: | 73 | composer validate --strict 74 | composer update -o ${{ matrix.composer-flags }} 75 | 76 | - name: Install phpunit 77 | run: | 78 | wget https://phar.phpunit.de/phpunit-8.phar -O phpunit.phar 79 | chmod +x phpunit.phar 80 | 81 | - name: Run tests 82 | run: ./phpunit.phar -c tests/phpunit.xml.dist 83 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Analysis/TokenFilter/StopWords.php: -------------------------------------------------------------------------------- 1 | 1, 'an' => '1'); 21 | * 22 | * We do recommend to provide all words in lowercase and concatenate this class after the lowercase filter. 23 | * 24 | * @category Zend 25 | * @package Zend_Search_Lucene 26 | * @subpackage Analysis 27 | */ 28 | class StopWords implements TokenFilterInterface 29 | { 30 | /** 31 | * Stop Words 32 | * @var array 33 | */ 34 | private $_stopSet; 35 | 36 | /** 37 | * Constructs new instance of this filter. 
38 | * 39 | * @param array $stopwords array (set) of words that will be filtered out 40 | */ 41 | public function __construct($stopwords = array()) 42 | { 43 | $this->_stopSet = array_flip($stopwords); 44 | } 45 | 46 | /** 47 | * Normalize Token or remove it (if null is returned) 48 | * 49 | * @param \ZendSearch\Lucene\Analysis\Token $srcToken 50 | * @return \ZendSearch\Lucene\Analysis\Token 51 | */ 52 | public function normalize(Token $srcToken) 53 | { 54 | if (array_key_exists($srcToken->getTermText(), $this->_stopSet)) { 55 | return null; 56 | } else { 57 | return $srcToken; 58 | } 59 | } 60 | 61 | /** 62 | * Fills stopwords set from a text file. Each line contains one stopword, lines with '#' in the first 63 | * column are ignored (as comments). 64 | * 65 | * You can call this method one or more times. New stopwords are always added to current set. 66 | * 67 | * @param string $filepath full path for text file with stopwords 68 | * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException 69 | * @throws \ZendSearch\Lucene\Exception\RuntimeException 70 | */ 71 | public function loadFromFile($filepath = null) 72 | { 73 | if (! $filepath || ! file_exists($filepath)) { 74 | throw new InvalidArgumentException('You have to provide valid file path'); 75 | } 76 | $fd = fopen($filepath, "r"); 77 | if (! $fd) { 78 | throw new RuntimeException('Cannot open file ' . $filepath); 79 | } 80 | while (!feof ($fd)) { 81 | $buffer = trim(fgets($fd)); 82 | if (strlen($buffer) > 0 && $buffer[0] != '#') { 83 | $this->_stopSet[$buffer] = 1; 84 | } 85 | } 86 | if (!fclose($fd)) { 87 | throw new RuntimeException('Cannot close file ' . 
$filepath); 88 | } 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /library/ZendSearch/Stdlib/ErrorHandler.php: -------------------------------------------------------------------------------- 1 | _query = $query; 57 | $this->_reader = $reader; 58 | $this->_weights = array(); 59 | 60 | $signs = $query->getSigns(); 61 | 62 | foreach ($query->getSubqueries() as $num => $subquery) { 63 | if ($signs === null || $signs[$num] === null || $signs[$num]) { 64 | $this->_weights[$num] = $subquery->createWeight($reader); 65 | } 66 | } 67 | } 68 | 69 | 70 | /** 71 | * The weight for this query 72 | * Standard Weight::$_value is not used for boolean queries 73 | * 74 | * @return float 75 | */ 76 | public function getValue() 77 | { 78 | return $this->_query->getBoost(); 79 | } 80 | 81 | 82 | /** 83 | * The sum of squared weights of contained query clauses. 84 | * 85 | * @return float 86 | */ 87 | public function sumOfSquaredWeights() 88 | { 89 | $sum = 0; 90 | foreach ($this->_weights as $weight) { 91 | // sum sub weights 92 | $sum += $weight->sumOfSquaredWeights(); 93 | } 94 | 95 | // boost each sub-weight 96 | $sum *= $this->_query->getBoost() * $this->_query->getBoost(); 97 | 98 | // check for empty query (like '-something -another') 99 | if ($sum == 0) { 100 | $sum = 1.0; 101 | } 102 | return $sum; 103 | } 104 | 105 | 106 | /** 107 | * Assigns the query normalization factor to this. 
108 | * 109 | * @param float $queryNorm 110 | */ 111 | public function normalize($queryNorm) 112 | { 113 | // incorporate boost 114 | $queryNorm *= $this->_query->getBoost(); 115 | 116 | foreach ($this->_weights as $weight) { 117 | $weight->normalize($queryNorm); 118 | } 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/Weight/MultiTerm.php: -------------------------------------------------------------------------------- 1 | _query = $query; 57 | $this->_reader = $reader; 58 | $this->_weights = array(); 59 | 60 | $signs = $query->getSigns(); 61 | 62 | foreach ($query->getTerms() as $id => $term) { 63 | if ($signs === null || $signs[$id] === null || $signs[$id]) { 64 | $this->_weights[$id] = new Term($term, $query, $reader); 65 | $query->setWeight($id, $this->_weights[$id]); 66 | } 67 | } 68 | } 69 | 70 | 71 | /** 72 | * The weight for this query 73 | * Standard Weight::$_value is not used for boolean queries 74 | * 75 | * @return float 76 | */ 77 | public function getValue() 78 | { 79 | return $this->_query->getBoost(); 80 | } 81 | 82 | 83 | /** 84 | * The sum of squared weights of contained query clauses. 85 | * 86 | * @return float 87 | */ 88 | public function sumOfSquaredWeights() 89 | { 90 | $sum = 0; 91 | foreach ($this->_weights as $weight) { 92 | // sum sub weights 93 | $sum += $weight->sumOfSquaredWeights(); 94 | } 95 | 96 | // boost each sub-weight 97 | $sum *= $this->_query->getBoost() * $this->_query->getBoost(); 98 | 99 | // check for empty query (like '-something -another') 100 | if ($sum == 0) { 101 | $sum = 1.0; 102 | } 103 | return $sum; 104 | } 105 | 106 | 107 | /** 108 | * Assigns the query normalization factor to this. 
109 | * 110 | * @param float $queryNorm 111 | */ 112 | public function normalize($queryNorm) 113 | { 114 | // incorporate boost 115 | $queryNorm *= $this->_query->getBoost(); 116 | 117 | foreach ($this->_weights as $weight) { 118 | $weight->normalize($queryNorm); 119 | } 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Storage/Directory/DirectoryInterface.php: -------------------------------------------------------------------------------- 1 | _term = $term; 59 | $this->_field = $field; 60 | } 61 | 62 | /** 63 | * Process modifier ('~') 64 | * 65 | * @param mixed $parameter 66 | */ 67 | public function processFuzzyProximityModifier($parameter = null) 68 | { 69 | $this->_fuzzyQuery = true; 70 | 71 | if ($parameter !== null) { 72 | $this->_similarity = $parameter; 73 | } else { 74 | $this->_similarity = \ZendSearch\Lucene\Search\Query\Fuzzy::DEFAULT_MIN_SIMILARITY; 75 | } 76 | } 77 | 78 | /** 79 | * Transform entry to a subquery 80 | * 81 | * @param string $encoding 82 | * @return \ZendSearch\Lucene\Search\Query\AbstractQuery 83 | * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException 84 | */ 85 | public function getQuery($encoding) 86 | { 87 | if ($this->_fuzzyQuery) { 88 | $query = new \ZendSearch\Lucene\Search\Query\Preprocessing\Fuzzy($this->_term, 89 | $encoding, 90 | ($this->_field !== null)? 91 | iconv($encoding, 'UTF-8', $this->_field) : 92 | null, 93 | $this->_similarity 94 | ); 95 | $query->setBoost($this->_boost); 96 | return $query; 97 | } 98 | 99 | 100 | $query = new \ZendSearch\Lucene\Search\Query\Preprocessing\Term($this->_term, 101 | $encoding, 102 | ($this->_field !== null)? 
/**
 * Look up the {@link \ZendSearch\Lucene\Document\Field} stored under the given name.
 *
 * @param string $fieldName
 * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException when no field with that name was added
 * @return \ZendSearch\Lucene\Document\Field
 */
public function getField($fieldName)
{
    if (array_key_exists($fieldName, $this->_fields)) {
        return $this->_fields[$fieldName];
    }

    throw new InvalidArgumentException("Field name \"$fieldName\" not found in document.");
}
108 | * 109 | * @see __get() 110 | * @return string 111 | */ 112 | public function getFieldValue($fieldName) 113 | { 114 | return $this->getField($fieldName)->value; 115 | } 116 | 117 | /** 118 | * Returns the string value of a named field in UTF-8 encoding. 119 | * 120 | * @see __get() 121 | * @return string 122 | */ 123 | public function getFieldUtf8Value($fieldName) 124 | { 125 | return $this->getField($fieldName)->getUtf8Value(); 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/Query/EmptyResult.php: -------------------------------------------------------------------------------- 1 | '; 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/Query/Insignificant.php: -------------------------------------------------------------------------------- 1 | '; 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Index/Term.php: -------------------------------------------------------------------------------- 1 | field = ($field === null)? Lucene\Lucene::getDefaultSearchField() : $field; 50 | $this->text = $text; 51 | } 52 | 53 | 54 | /** 55 | * Returns term key 56 | * 57 | * @return string 58 | */ 59 | public function key() 60 | { 61 | return $this->field . chr(0) . $this->text; 62 | } 63 | 64 | /** 65 | * Get term prefix 66 | * 67 | * @param string $str 68 | * @param integer $length 69 | * @return string 70 | */ 71 | public static function getPrefix($str, $length) 72 | { 73 | /** 74 | * @todo !!!!!!! 
/**
 * Get UTF-8 string length (number of characters, not bytes).
 *
 * The width of each character (1 to 4 bytes) is derived from its first byte.
 * A byte that does not have both high bits set is counted as a one-byte
 * character. Counting stops at the first character whose declared width runs
 * past the end of the string; that truncated character is not counted.
 *
 * @param string $str
 * @return integer
 */
public static function getLength($str)
{
    // The byte length never changes, so compute it once rather than on every pass.
    $totalBytes = strlen($str);

    $bytes = 0;
    $chars = 0;
    while ($bytes < $totalBytes) {
        $leadByte  = ord($str[$bytes]);
        $charBytes = 1;
        if (($leadByte & 0xC0) == 0xC0) {
            // lead byte of a multi-byte sequence: at least two bytes
            $charBytes++;
            if ($leadByte & 0x20) {
                $charBytes++;
                if ($leadByte & 0x10) {
                    $charBytes++;
                }
            }
        }

        if ($bytes + $charBytes > $totalBytes) {
            // wrong character: sequence is cut off by the end of the string
            break;
        }

        $chars++;
        $bytes += $charBytes;
    }

    return $chars;
}
/**
 * Tokenization stream API: fetch the next token.
 *
 * Scans the input from the current byte position for the next run of
 * letters, converts byte offsets into character offsets, and passes the
 * resulting token through normalize(). Tokens rejected by normalize()
 * (null result) are skipped and scanning continues.
 *
 * @return \ZendSearch\Lucene\Analysis\Token|null null at the end of the stream
 */
public function nextToken()
{
    if ($this->_input === null) {
        return null;
    }

    while (true) {
        $found = preg_match('/[\p{L}]+/u', $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_bytePosition);
        if (! $found) {
            // either nothing matched (0) or the regex engine failed (false)
            return null;
        }

        // matched word and its byte offset in the input
        list($word, $wordByteStart) = $match[0];

        // text between the previous token and this one, needed to turn the
        // byte offset into a character offset
        $skipped  = substr($this->_input, $this->_bytePosition, $wordByteStart - $this->_bytePosition);
        $startPos = $this->_position + iconv_strlen($skipped, 'UTF-8');
        $endPos   = $startPos + iconv_strlen($word, 'UTF-8');

        // advance both cursors past the matched word
        $this->_bytePosition = $wordByteStart + strlen($word);
        $this->_position     = $endPos;

        $token = $this->normalize(new Analysis\Token($word, $startPos, $endPos));
        if ($token !== null) {
            return $token;
        }
        // token was filtered out: look for the next one
    }
}
nextToken() 79 | { 80 | if ($this->_input === null) { 81 | return null; 82 | } 83 | 84 | do { 85 | if (! preg_match('/[\p{L}\p{N}]+/u', $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_bytePosition)) { 86 | // It covers both cases a) there are no matches (preg_match(...) === 0) 87 | // b) error occured (preg_match(...) === FALSE) 88 | return null; 89 | } 90 | 91 | // matched string 92 | $matchedWord = $match[0][0]; 93 | 94 | // binary position of the matched word in the input stream 95 | $binStartPos = $match[0][1]; 96 | 97 | // character position of the matched word in the input stream 98 | $startPos = $this->_position + 99 | iconv_strlen(substr($this->_input, 100 | $this->_bytePosition, 101 | $binStartPos - $this->_bytePosition), 102 | 'UTF-8'); 103 | // character postion of the end of matched word in the input stream 104 | $endPos = $startPos + iconv_strlen($matchedWord, 'UTF-8'); 105 | 106 | $this->_bytePosition = $binStartPos + strlen($matchedWord); 107 | $this->_position = $endPos; 108 | 109 | $token = $this->normalize(new Analysis\Token($matchedWord, $startPos, $endPos)); 110 | } while ($token === null); // try again if token is skipped 111 | 112 | return $token; 113 | } 114 | } 115 | 116 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/Query/Preprocessing/AbstractPreprocessing.php: -------------------------------------------------------------------------------- 1 | _termText = $text; 77 | $this->_startOffset = $start; 78 | $this->_endOffset = $end; 79 | 80 | $this->_positionIncrement = 1; 81 | } 82 | 83 | 84 | /** 85 | * positionIncrement setter 86 | * 87 | * @param integer $positionIncrement 88 | */ 89 | public function setPositionIncrement($positionIncrement) 90 | { 91 | $this->_positionIncrement = $positionIncrement; 92 | } 93 | 94 | /** 95 | * Returns the position increment of this Token. 
96 | * 97 | * @return integer 98 | */ 99 | public function getPositionIncrement() 100 | { 101 | return $this->_positionIncrement; 102 | } 103 | 104 | /** 105 | * Returns the Token's term text. 106 | * 107 | * @return string 108 | */ 109 | public function getTermText() 110 | { 111 | return $this->_termText; 112 | } 113 | 114 | /** 115 | * Returns this Token's starting offset, the position of the first character 116 | * corresponding to this token in the source text. 117 | * 118 | * Note: 119 | * The difference between getEndOffset() and getStartOffset() may not be equal 120 | * to strlen(Zend_Search_Lucene_Analysis_Token::getTermText()), as the term text may have been altered 121 | * by a stemmer or some other filter. 122 | * 123 | * @return integer 124 | */ 125 | public function getStartOffset() 126 | { 127 | return $this->_startOffset; 128 | } 129 | 130 | /** 131 | * Returns this Token's ending offset, one greater than the position of the 132 | * last character corresponding to this token in the source text. 133 | * 134 | * @return integer 135 | */ 136 | public function getEndOffset() 137 | { 138 | return $this->_endOffset; 139 | } 140 | } 141 | 142 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Document/AbstractOpenXML.php: -------------------------------------------------------------------------------- 1 | getFromName("_rels/.rels")); 78 | 79 | if (\PHP_VERSION_ID < 80000) { 80 | // Restore entity loader state 81 | libxml_disable_entity_loader($loadEntities); 82 | } 83 | 84 | foreach ($relations->Relationship as $rel) { 85 | if ($rel["Type"] == self::SCHEMA_COREPROPERTIES) { 86 | // Found core properties! Read in contents... 87 | $contents = simplexml_load_string( 88 | $package->getFromName(dirname($rel["Target"]) . "/" . 
/**
 * Resolve a path inside the archive to its canonical form.
 *
 * Both slash styles are accepted as separators; empty and '.' segments are
 * dropped, and each '..' segment removes the segment before it (if any).
 * The result is joined with '/'.
 *
 * @param string $path
 * @return string
 */
protected function absoluteZipPath($path)
{
    $normalized = str_replace(array('/', '\\'), DIRECTORY_SEPARATOR, $path);

    $resolved = array();
    foreach (explode(DIRECTORY_SEPARATOR, $normalized) as $segment) {
        if ($segment === '' || $segment === '.') {
            // nothing to add for empty or "current directory" segments
            continue;
        }

        if ($segment === '..') {
            // step back one level; a no-op when already at the root
            array_pop($resolved);
            continue;
        }

        $resolved[] = $segment;
    }

    return implode('/', $resolved);
}
/**
 * Remove the least element from the queue and return it.
 *
 * The last heap element is re-inserted from the root downwards: smaller
 * children are moved up until the right slot for it is found.
 *
 * O(log(N)) time
 *
 * @return mixed null when the queue is empty
 */
public function pop()
{
    $size = count($this->_heap);
    if ($size === 0) {
        return null;
    }

    $least  = $this->_heap[0];
    $tailId = $size - 1;

    $holeId  = 0; // slot currently being filled, starting at the root
    $childId = 1; // candidate child to move into the hole

    // Pick the smaller of the root's children (index 2 only counts as a
    // child when it is not the tail element itself)
    if ($tailId > 2 && $this->_less($this->_heap[2], $this->_heap[1])) {
        $childId = 2;
    }

    while ($childId < $tailId
        && $this->_less($this->_heap[$childId], $this->_heap[$tailId])
    ) {
        // Child is smaller than the tail element: move it up into the hole
        $this->_heap[$holeId] = $this->_heap[$childId];

        $holeId  = $childId;
        $childId = 2 * $holeId + 1;

        // Prefer the right sibling when it exists and is the smaller one
        $siblingId = $childId + 1;
        if ($siblingId < $tailId
            && $this->_less($this->_heap[$siblingId], $this->_heap[$childId])
        ) {
            $childId = $siblingId;
        }
    }

    // Drop the tail element into the hole and shrink the heap
    $this->_heap[$holeId] = $this->_heap[$tailId];
    unset($this->_heap[$tailId]);

    return $least;
}
/**
 * Advance every underlying term stream to the given prefix.
 *
 * The queue is emptied first, each stream is skipped forward, and streams
 * that still have a current term are queued again. Finally the merged
 * stream is positioned on its next term.
 *
 * Prefix contains fully specified field info and portion of searched term
 *
 * @param \ZendSearch\Lucene\Index\Term $prefix
 */
public function skipTo(Index\Term $prefix)
{
    // Take all streams out of the queue
    $pending = array();
    for ($stream = $this->_termsStreamQueue->pop();
         $stream !== null;
         $stream = $this->_termsStreamQueue->pop()
    ) {
        $pending[] = $stream;
    }

    foreach ($pending as $stream) {
        $stream->skipTo($prefix);

        if ($stream->currentTerm() === null) {
            // stream is exhausted, leave it out
            continue;
        }

        $this->_termsStreamQueue->put($stream);
    }

    $this->nextTerm();
}
/**
 * Close terms stream
 *
 * Should be used for resources clean up if stream is not read up to the end.
 * Every stream still held by the queue is closed, then the queue and the
 * last-term pointer are cleared.
 *
 * Safe to call more than once: the queue is set to null here, so a repeated
 * call simply finds nothing left to close.
 */
public function closeTermsStream()
{
    if ($this->_termsStreamQueue !== null) {
        while (($termStream = $this->_termsStreamQueue->pop()) !== null) {
            $termStream->closeTermsStream();
        }
    }

    $this->_termsStreamQueue = null;
    $this->_lastTerm         = null;
}
basename($rel['Target'])) 83 | )); 84 | 85 | $contents->registerXPathNamespace('w', self::SCHEMA_WORDPROCESSINGML); 86 | $paragraphs = $contents->xpath('//w:body/w:p'); 87 | 88 | foreach ($paragraphs as $paragraph) { 89 | $runs = $paragraph->xpath('.//w:r/*[name() = "w:t" or name() = "w:br"]'); 90 | 91 | if ($runs === false) { 92 | // Paragraph doesn't contain any text or breaks 93 | continue; 94 | } 95 | 96 | foreach ($runs as $run) { 97 | if ($run->getName() == 'br') { 98 | // Break element 99 | $documentBody[] = ' '; 100 | } else { 101 | $documentBody[] = (string)$run; 102 | } 103 | } 104 | 105 | // Add space after each paragraph. So they are not bound together. 106 | $documentBody[] = ' '; 107 | } 108 | 109 | break; 110 | } 111 | } 112 | 113 | // Read core properties 114 | $coreProperties = $this->extractMetaData($package); 115 | 116 | // Close file 117 | $package->close(); 118 | 119 | // Store filename 120 | $this->addField(Field::Text('filename', $fileName, 'UTF-8')); 121 | 122 | // Store contents 123 | if ($storeContent) { 124 | $this->addField(Field::Text('body', implode('', $documentBody), 'UTF-8')); 125 | } else { 126 | $this->addField(Field::UnStored('body', implode('', $documentBody), 'UTF-8')); 127 | } 128 | 129 | // Store meta data properties 130 | foreach ($coreProperties as $key => $value) { 131 | $this->addField(Field::Text($key, $value, 'UTF-8')); 132 | } 133 | 134 | // Store title (if not present in meta data) 135 | if (! 
isset($coreProperties['title'])) { 136 | $this->addField(Field::Text('title', $fileName, 'UTF-8')); 137 | } 138 | } 139 | 140 | /** 141 | * Load Docx document from a file 142 | * 143 | * @param string $fileName 144 | * @param boolean $storeContent 145 | * @throws \ZendSearch\Lucene\Document\Exception\InvalidArgumentException 146 | * @return \ZendSearch\Lucene\Document\Docx 147 | */ 148 | public static function loadDocxFile($fileName, $storeContent = false) 149 | { 150 | if (!is_readable($fileName)) { 151 | throw new InvalidArgumentException('Provided file \'' . $fileName . '\' is not readable.'); 152 | } 153 | 154 | return new self($fileName, $storeContent); 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Storage/File/Filesystem.php: -------------------------------------------------------------------------------- 1 | _fileHandle = @fopen($filename, $mode); 52 | 53 | if ($this->_fileHandle === false) { 54 | ini_set('track_errors', $trackErrors); 55 | throw new Lucene\Exception\RuntimeException($php_errormsg); 56 | } 57 | 58 | ini_set('track_errors', $trackErrors); 59 | } 60 | 61 | /** 62 | * Sets the file position indicator and advances the file pointer. 63 | * The new position, measured in bytes from the beginning of the file, 64 | * is obtained by adding offset to the position specified by whence, 65 | * whose values are defined as follows: 66 | * SEEK_SET - Set position equal to offset bytes. 67 | * SEEK_CUR - Set position to current location plus offset. 68 | * SEEK_END - Set position to end-of-file plus offset. (To move to 69 | * a position before the end-of-file, you need to pass a negative value 70 | * in offset.) 
71 | * SEEK_CUR is the only supported offset type for compound files 72 | * 73 | * Upon success, returns 0; otherwise, returns -1 74 | * 75 | * @param integer $offset 76 | * @param integer $whence 77 | * @return integer 78 | */ 79 | public function seek($offset, $whence=SEEK_SET) 80 | { 81 | return fseek($this->_fileHandle, $offset, $whence); 82 | } 83 | 84 | 85 | /** 86 | * Get file position. 87 | * 88 | * @return integer 89 | */ 90 | public function tell() 91 | { 92 | return ftell($this->_fileHandle); 93 | } 94 | 95 | /** 96 | * Flush output. 97 | * 98 | * Returns true on success or false on failure. 99 | * 100 | * @return boolean 101 | */ 102 | public function flush() 103 | { 104 | return fflush($this->_fileHandle); 105 | } 106 | 107 | /** 108 | * Close File object 109 | */ 110 | public function close() 111 | { 112 | if ($this->_fileHandle !== null ) { 113 | ErrorHandler::start(E_WARNING); 114 | fclose($this->_fileHandle); 115 | ErrorHandler::stop(); 116 | $this->_fileHandle = null; 117 | } 118 | } 119 | 120 | /** 121 | * Get the size of the already opened file 122 | * 123 | * @return integer 124 | */ 125 | public function size() 126 | { 127 | $position = ftell($this->_fileHandle); 128 | fseek($this->_fileHandle, 0, SEEK_END); 129 | $size = ftell($this->_fileHandle); 130 | fseek($this->_fileHandle,$position); 131 | 132 | return $size; 133 | } 134 | 135 | /** 136 | * Read a $length bytes from the file and advance the file pointer. 
/**
 * Read $length bytes from the file and advance the file pointer.
 *
 * Requests below 1024 bytes are passed straight to fread(). Larger requests
 * are read in a loop until the requested amount has been collected, fread()
 * fails, or no more data is returned; the result may therefore be shorter
 * than $length.
 *
 * @param integer $length
 * @return string
 */
protected function _fread($length=1)
{
    if ($length == 0) {
        return '';
    }

    if ($length < 1024) {
        return fread($this->_fileHandle, $length);
    }

    $data = '';
    while ($length > 0) {
        $nextBlock = fread($this->_fileHandle, $length);

        // Strict checks: a loose comparison with false would also stop on a
        // block consisting of the single character "0" and truncate the data.
        if ($nextBlock === false || $nextBlock === '') {
            break;
        }

        $data   .= $nextBlock;
        $length -= strlen($nextBlock);
    }

    return $data;
}
/**
 * Re-write query into primitive queries in the context of specified index
 *
 * A term that already names a field is returned unchanged. Otherwise the
 * term text is expanded into a MultiTerm query with one term per field name
 * reported by $index->getFieldNames(true), carrying over this query's boost.
 *
 * @param \ZendSearch\Lucene\SearchIndexInterface $index
 * @return \ZendSearch\Lucene\Search\Query\AbstractQuery
 */
public function rewrite(Lucene\SearchIndexInterface $index)
{
    if ($this->_term->field != null) {
        return $this;
    }

    $expanded = new MultiTerm();
    $expanded->setBoost($this->getBoost());

    foreach ($index->getFieldNames(true) as $indexedField) {
        $expanded->addTerm(new Index\Term($this->_term->text, $indexedField));
    }

    return $expanded->rewrite($index);
}
/**
 * Score specified document
 *
 * Documents outside the matched set score 0. For matched documents the
 * score is tf(term frequency) * weight value * field norm * query boost.
 *
 * @param integer $docId
 * @param \ZendSearch\Lucene\SearchIndexInterface $reader
 * @return float
 */
public function score($docId, Lucene\SearchIndexInterface $reader)
{
    if (!isset($this->_docVector[$docId])) {
        return 0;
    }

    $termFrequencyFactor = $reader->getSimilarity()->tf($this->_termFreqs[$docId]);

    return $termFrequencyFactor *
           $this->_weight->getValue() *
           $reader->norm($docId, $this->_term->field) *
           $this->getBoost();
}
/**
 * Render the query as "field:text", followed by "^boost" when the boost
 * differs from 1 (rounded to 4 decimals).
 *
 * Intended for query visualisation only, so no characters are escaped.
 *
 * @return string
 */
public function __toString()
{
    $fieldPrefix = ($this->_term->field !== null) ? $this->_term->field . ':' : '';
    $rendered    = $fieldPrefix . $this->_term->text;

    $boost = $this->getBoost();
    if ($boost != 1) {
        $rendered .= '^' . round($boost, 4);
    }

    return $rendered;
}
126 | * 127 | * @param string $name 128 | * @param string $value 129 | * @param string $encoding 130 | * @return \ZendSearch\Lucene\Document\Field 131 | */ 132 | public static function keyword($name, $value, $encoding = 'UTF-8') 133 | { 134 | return new self($name, $value, $encoding, true, true, false); 135 | } 136 | 137 | 138 | /** 139 | * Constructs a String-valued Field that is not tokenized nor indexed, 140 | * but is stored in the index, for return with hits. 141 | * 142 | * @param string $name 143 | * @param string $value 144 | * @param string $encoding 145 | * @return \ZendSearch\Lucene\Document\Field 146 | */ 147 | public static function unIndexed($name, $value, $encoding = 'UTF-8') 148 | { 149 | return new self($name, $value, $encoding, true, false, false); 150 | } 151 | 152 | 153 | /** 154 | * Constructs a Binary String valued Field that is not tokenized nor indexed, 155 | * but is stored in the index, for return with hits. 156 | * 157 | * @param string $name 158 | * @param string $value 159 | * @param string $encoding 160 | * @return \ZendSearch\Lucene\Document\Field 161 | */ 162 | public static function binary($name, $value) 163 | { 164 | return new self($name, $value, '', true, false, false, true); 165 | } 166 | 167 | /** 168 | * Constructs a String-valued Field that is tokenized and indexed, 169 | * and is stored in the index, for return with hits. Useful for short text 170 | * fields, like "title" or "subject". Term vector will not be stored for this field. 171 | * 172 | * @param string $name 173 | * @param string $value 174 | * @param string $encoding 175 | * @return \ZendSearch\Lucene\Document\Field 176 | */ 177 | public static function text($name, $value, $encoding = 'UTF-8') 178 | { 179 | return new self($name, $value, $encoding, true, true, true); 180 | } 181 | 182 | 183 | /** 184 | * Constructs a String-valued Field that is tokenized and indexed, 185 | * but that is not stored in the index. 
186 | * 187 | * @param string $name 188 | * @param string $value 189 | * @param string $encoding 190 | * @return \ZendSearch\Lucene\Document\Field 191 | */ 192 | public static function unStored($name, $value, $encoding = 'UTF-8') 193 | { 194 | return new self($name, $value, $encoding, false, true, true); 195 | } 196 | 197 | /** 198 | * Get field value in UTF-8 encoding 199 | * 200 | * @return string 201 | */ 202 | public function getUtf8Value() 203 | { 204 | if (strcasecmp($this->encoding, 'utf8' ) == 0 || 205 | strcasecmp($this->encoding, 'utf-8') == 0 ) { 206 | return $this->value; 207 | } else { 208 | 209 | return (PHP_OS != 'AIX') ? iconv($this->encoding, 'UTF-8', $this->value) : iconv('ISO8859-1', 'UTF-8', $this->value); 210 | } 211 | } 212 | } 213 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/Query/AbstractQuery.php: -------------------------------------------------------------------------------- 1 | _boost; 49 | } 50 | 51 | /** 52 | * Sets the boost for this query clause to $boost. 
53 | * 54 | * @param float $boost 55 | */ 56 | public function setBoost($boost) 57 | { 58 | $this->_boost = $boost; 59 | } 60 | 61 | /** 62 | * Score specified document 63 | * 64 | * @param integer $docId 65 | * @param \ZendSearch\Lucene\SearchIndexInterface $reader 66 | * @return float 67 | */ 68 | abstract public function score($docId, Lucene\SearchIndexInterface $reader); 69 | 70 | /** 71 | * Get document ids likely matching the query 72 | * 73 | * It's an array with document ids as keys (performance considerations) 74 | * 75 | * @return array 76 | */ 77 | abstract public function matchedDocs(); 78 | 79 | /** 80 | * Execute query in context of index reader 81 | * It also initializes necessary internal structures 82 | * 83 | * AbstractQuery specific implementation 84 | * 85 | * @param \ZendSearch\Lucene\SearchIndexInterface $reader 86 | * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter 87 | */ 88 | abstract public function execute(Lucene\SearchIndexInterface $reader, $docsFilter = null); 89 | 90 | /** 91 | * Constructs an appropriate Weight implementation for this query. 92 | * 93 | * @param \ZendSearch\Lucene\SearchIndexInterface $reader 94 | * @return \ZendSearch\Lucene\Search\Weight\AbstractWeight 95 | */ 96 | abstract public function createWeight(Lucene\SearchIndexInterface $reader); 97 | 98 | /** 99 | * Constructs and initializes a Weight for a _top-level_query_. 100 | * 101 | * @param \ZendSearch\Lucene\SearchIndexInterface $reader 102 | */ 103 | protected function _initWeight(Lucene\SearchIndexInterface $reader) 104 | { 105 | // Check that it's a top-level query and that the query weight is not initialized yet. 
106 | if ($this->_weight !== null) { 107 | return $this->_weight; 108 | } 109 | // The return value of createWeight() is ignored; the lines below rely on it having set $this->_weight 110 | $this->createWeight($reader); 111 | $sum = $this->_weight->sumOfSquaredWeights(); 112 | $queryNorm = $reader->getSimilarity()->queryNorm($sum); 113 | $this->_weight->normalize($queryNorm); 114 | } 115 | 116 | /** 117 | * Re-write query into primitive queries in the context of specified index 118 | * 119 | * @param \ZendSearch\Lucene\SearchIndexInterface $index 120 | * @return \ZendSearch\Lucene\Search\Query\AbstractQuery 121 | */ 122 | abstract public function rewrite(Lucene\SearchIndexInterface $index); 123 | 124 | /** 125 | * Optimize query in the context of specified index 126 | * 127 | * @param \ZendSearch\Lucene\SearchIndexInterface $index 128 | * @return \ZendSearch\Lucene\Search\Query\AbstractQuery 129 | */ 130 | abstract public function optimize(Lucene\SearchIndexInterface $index); 131 | 132 | /** 133 | * Reset query (clears the cached weight), so it can be reused within other queries or 134 | * with other indices 135 | */ 136 | public function reset() 137 | { 138 | $this->_weight = null; 139 | } 140 | 141 | 142 | /** 143 | * Print a query 144 | * 145 | * @return string 146 | */ 147 | abstract public function __toString(); 148 | 149 | /** 150 | * Return query terms 151 | * 152 | * @return array 153 | */ 154 | abstract public function getQueryTerms(); 155 | 156 | /** 157 | * AbstractQuery specific matches highlighting 158 | * 159 | * @param Highlighter $highlighter Highlighter object (also contains doc for highlighting) 160 | */ 161 | abstract protected function _highlightMatches(Highlighter $highlighter); 162 | 163 | /** 164 | * Highlight matches in $inputHTML 165 | * 166 | * @param string $inputHTML 167 | * @param string $defaultEncoding HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag. 
168 | * @param Highlighter|null $highlighter 169 | * @return string 170 | */ 171 | public function highlightMatches($inputHTML, $defaultEncoding = '', $highlighter = null) 172 | { 173 | if ($highlighter === null) { 174 | $highlighter = new DefaultHighlighter(); 175 | } 176 | 177 | $doc = Document\HTML::loadHTML($inputHTML, false, $defaultEncoding); 178 | $highlighter->setDocument($doc); 179 | 180 | $this->_highlightMatches($highlighter); 181 | 182 | return $doc->getHTML(); 183 | } 184 | 185 | /** 186 | * Highlight matches in $inputHTMLFragment and return it (without HTML header and body tag) 187 | * 188 | * @param string $inputHTMLFragment 189 | * @param string $encoding Input HTML string encoding 190 | * @param Highlighter|null $highlighter 191 | * @return string 192 | */ 193 | public function htmlFragmentHighlightMatches($inputHTMLFragment, $encoding = 'UTF-8', $highlighter = null) 194 | { 195 | if ($highlighter === null) { 196 | $highlighter = new DefaultHighlighter(); 197 | } 198 | 199 | $inputHTML = '' 200 | . iconv($encoding, 'UTF-8//IGNORE', $inputHTMLFragment) . 
''; 201 | 202 | $doc = Document\HTML::loadHTML($inputHTML); 203 | $highlighter->setDocument($doc); 204 | 205 | $this->_highlightMatches($highlighter); 206 | 207 | return $doc->getHTMLBody(); 208 | } 209 | } 210 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/QueryToken.php: -------------------------------------------------------------------------------- 1 | or field:() pairs 28 | const TT_FIELD_INDICATOR = 3; // ':' 29 | const TT_REQUIRED = 4; // '+' 30 | const TT_PROHIBITED = 5; // '-' 31 | const TT_FUZZY_PROX_MARK = 6; // '~' 32 | const TT_BOOSTING_MARK = 7; // '^' 33 | const TT_RANGE_INCL_START = 8; // '[' 34 | const TT_RANGE_INCL_END = 9; // ']' 35 | const TT_RANGE_EXCL_START = 10; // '{' 36 | const TT_RANGE_EXCL_END = 11; // '}' 37 | const TT_SUBQUERY_START = 12; // '(' 38 | const TT_SUBQUERY_END = 13; // ')' 39 | const TT_AND_LEXEME = 14; // 'AND' or 'and' 40 | const TT_OR_LEXEME = 15; // 'OR' or 'or' 41 | const TT_NOT_LEXEME = 16; // 'NOT' or 'not' 42 | const TT_TO_LEXEME = 17; // 'TO' or 'to' 43 | const TT_NUMBER = 18; // Number, like: 10, 0.8, .64, .... 44 | 45 | 46 | /** 47 | * Returns all possible lexeme types. 
48 | * It's used for syntax analyzer state machine initialization 49 | * 50 | * @return array 51 | */ 52 | public static function getTypes() 53 | { 54 | return array( self::TT_WORD, 55 | self::TT_PHRASE, 56 | self::TT_FIELD, 57 | self::TT_FIELD_INDICATOR, 58 | self::TT_REQUIRED, 59 | self::TT_PROHIBITED, 60 | self::TT_FUZZY_PROX_MARK, 61 | self::TT_BOOSTING_MARK, 62 | self::TT_RANGE_INCL_START, 63 | self::TT_RANGE_INCL_END, 64 | self::TT_RANGE_EXCL_START, 65 | self::TT_RANGE_EXCL_END, 66 | self::TT_SUBQUERY_START, 67 | self::TT_SUBQUERY_END, 68 | self::TT_AND_LEXEME, 69 | self::TT_OR_LEXEME, 70 | self::TT_NOT_LEXEME, 71 | self::TT_TO_LEXEME, 72 | self::TT_NUMBER 73 | ); 74 | } 75 | 76 | 77 | /** 78 | * TokenCategories 79 | */ 80 | const TC_WORD = 0; // Word 81 | const TC_PHRASE = 1; // Phrase (one or several quoted words) 82 | const TC_NUMBER = 2; // Numbers, which are used with syntax elements. Ex. roam~0.8 83 | const TC_SYNTAX_ELEMENT = 3; // + - ( ) [ ] { } ! || && ~ ^ 84 | 85 | 86 | /** 87 | * Token type. 88 | * 89 | * @var integer 90 | */ 91 | public $type; 92 | 93 | /** 94 | * Token text. 95 | * 96 | * @var string 97 | */ 98 | public $text; 99 | 100 | /** 101 | * Token position within query. 102 | * 103 | * @var integer 104 | */ 105 | public $position; 106 | 107 | 108 | /** 109 | * Constructor needs token category, token text and token position as parameters.
110 | * 111 | * @param integer $tokenCategory One of the self::TC_* category constants 112 | * @param string $tokenText 113 | * @param integer $position Stored as $position + 1 114 | * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException 115 | */ 116 | public function __construct($tokenCategory, $tokenText, $position) 117 | { 118 | $this->text = $tokenText; 119 | $this->position = $position + 1; // Start from 1 120 | 121 | switch ($tokenCategory) { 122 | case self::TC_WORD: 123 | if ( strtolower($tokenText) == 'and') { 124 | $this->type = self::TT_AND_LEXEME; 125 | } elseif (strtolower($tokenText) == 'or') { 126 | $this->type = self::TT_OR_LEXEME; 127 | } elseif (strtolower($tokenText) == 'not') { 128 | $this->type = self::TT_NOT_LEXEME; 129 | } elseif (strtolower($tokenText) == 'to') { 130 | $this->type = self::TT_TO_LEXEME; 131 | } else { 132 | $this->type = self::TT_WORD; 133 | } 134 | break; 135 | 136 | case self::TC_PHRASE: 137 | $this->type = self::TT_PHRASE; 138 | break; 139 | 140 | case self::TC_NUMBER: 141 | $this->type = self::TT_NUMBER; 142 | break; 143 | 144 | case self::TC_SYNTAX_ELEMENT: 145 | switch ($tokenText) { 146 | case ':': 147 | $this->type = self::TT_FIELD_INDICATOR; 148 | break; 149 | 150 | case '+': 151 | $this->type = self::TT_REQUIRED; 152 | break; 153 | 154 | case '-': 155 | $this->type = self::TT_PROHIBITED; 156 | break; 157 | 158 | case '~': 159 | $this->type = self::TT_FUZZY_PROX_MARK; 160 | break; 161 | 162 | case '^': 163 | $this->type = self::TT_BOOSTING_MARK; 164 | break; 165 | 166 | case '[': 167 | $this->type = self::TT_RANGE_INCL_START; 168 | break; 169 | 170 | case ']': 171 | $this->type = self::TT_RANGE_INCL_END; 172 | break; 173 | 174 | case '{': 175 | $this->type = self::TT_RANGE_EXCL_START; 176 | break; 177 | 178 | case '}': 179 | $this->type = self::TT_RANGE_EXCL_END; 180 | break; 181 | 182 | case '(': 183 | $this->type = self::TT_SUBQUERY_START; 184 | break; 185 | 186 | case ')': 187 | $this->type = self::TT_SUBQUERY_END; 188 | break; 189 | 190 | case '!': 191 | $this->type = 
self::TT_NOT_LEXEME; 192 | break; 193 | 194 | case '&&': 195 | $this->type = self::TT_AND_LEXEME; 196 | break; 197 | 198 | case '||': 199 | $this->type = self::TT_OR_LEXEME; 200 | break; 201 | 202 | default: 203 | throw new Lucene\Exception\InvalidArgumentException( 204 | 'Unrecognized query syntax lexeme: \'' . $tokenText . '\'' 205 | ); 206 | } 207 | break; 208 | 209 | default: 210 | // Any category other than the TC_* values handled above is rejected 211 | throw new Lucene\Exception\InvalidArgumentException( 212 | 'Unrecognized lexeme type: \'' . $tokenCategory . '\'' 213 | ); 214 | } 215 | } 216 | } 217 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Document/Pptx.php: -------------------------------------------------------------------------------- 1 | open($fileName); 77 | 78 | // Read relations and search for officeDocument 79 | $relationsXml = $package->getFromName('_rels/.rels'); 80 | if ($relationsXml === false) { 81 | throw new RuntimeException('Invalid archive or corrupted .pptx file.'); 82 | } 83 | 84 | if (\PHP_VERSION_ID < 80000) { 85 | // Prevent php from loading remote resources 86 | $loadEntities = libxml_disable_entity_loader(true); 87 | } 88 | 89 | $relations = simplexml_load_string($relationsXml); 90 | 91 | if (\PHP_VERSION_ID < 80000) { 92 | // Restore entity loader state 93 | libxml_disable_entity_loader($loadEntities); 94 | } 95 | 96 | foreach ($relations->Relationship as $rel) { 97 | if ($rel["Type"] == AbstractOpenXML::SCHEMA_OFFICEDOCUMENT) { 98 | // Found office document! Search for slides... 99 | $slideRelations = simplexml_load_string($package->getFromName( $this->absoluteZipPath(dirname($rel["Target"]) . "/_rels/" . basename($rel["Target"]) . ".rels")) ); 100 | foreach ($slideRelations->Relationship as $slideRel) { 101 | if ($slideRel["Type"] == self::SCHEMA_SLIDERELATION) { 102 | // Found slide! 
103 | $slides[ str_replace( 'rId', '', (string)$slideRel["Id"] ) ] = simplexml_load_string( 104 | $package->getFromName( $this->absoluteZipPath(dirname($rel["Target"]) . "/" . dirname($slideRel["Target"]) . "/" . basename($slideRel["Target"])) ) 105 | ); 106 | 107 | // Search for slide notes 108 | $slideNotesRelations = simplexml_load_string($package->getFromName( $this->absoluteZipPath(dirname($rel["Target"]) . "/" . dirname($slideRel["Target"]) . "/_rels/" . basename($slideRel["Target"]) . ".rels")) ); 109 | foreach ($slideNotesRelations->Relationship as $slideNoteRel) { 110 | if ($slideNoteRel["Type"] == self::SCHEMA_SLIDENOTESRELATION) { 111 | // Found slide notes! 112 | $slideNotes[ str_replace( 'rId', '', (string)$slideRel["Id"] ) ] = simplexml_load_string( 113 | $package->getFromName( $this->absoluteZipPath(dirname($rel["Target"]) . "/" . dirname($slideRel["Target"]) . "/" . dirname($slideNoteRel["Target"]) . "/" . basename($slideNoteRel["Target"])) ) 114 | ); 115 | 116 | break; 117 | } 118 | } 119 | } 120 | } 121 | 122 | break; 123 | } 124 | } 125 | 126 | // Sort slides 127 | ksort($slides); 128 | ksort($slideNotes); 129 | 130 | // Extract contents from slides 131 | foreach ($slides as $slideKey => $slide) { 132 | // Register namespaces 133 | $slide->registerXPathNamespace("p", self::SCHEMA_PRESENTATIONML); 134 | $slide->registerXPathNamespace("a", self::SCHEMA_DRAWINGML); 135 | 136 | // Fetch all text 137 | $textElements = $slide->xpath('//a:t'); 138 | foreach ($textElements as $textElement) { 139 | $documentBody[] = (string)$textElement; 140 | } 141 | 142 | // Extract contents from slide notes 143 | if (isset($slideNotes[$slideKey])) { 144 | // Fetch slide note 145 | $slideNote = $slideNotes[$slideKey]; 146 | 147 | // Register namespaces 148 | $slideNote->registerXPathNamespace("p", self::SCHEMA_PRESENTATIONML); 149 | $slideNote->registerXPathNamespace("a", self::SCHEMA_DRAWINGML); 150 | 151 | // Fetch all text 152 | $textElements = 
$slideNote->xpath('//a:t'); 153 | foreach ($textElements as $textElement) { 154 | $documentBody[] = (string)$textElement; 155 | } 156 | } 157 | } 158 | 159 | // Read core properties 160 | $coreProperties = $this->extractMetaData($package); 161 | 162 | // Close file 163 | $package->close(); 164 | 165 | // Store filename 166 | $this->addField(Field::Text('filename', $fileName, 'UTF-8')); 167 | 168 | // Store contents 169 | if ($storeContent) { 170 | $this->addField(Field::Text('body', implode(' ', $documentBody), 'UTF-8')); 171 | } else { 172 | $this->addField(Field::UnStored('body', implode(' ', $documentBody), 'UTF-8')); 173 | } 174 | 175 | // Store meta data properties 176 | foreach ($coreProperties as $key => $value) { 177 | $this->addField(Field::Text($key, $value, 'UTF-8')); 178 | } 179 | 180 | // Store title (if not present in meta data) 181 | if (!isset($coreProperties['title'])) { 182 | $this->addField(Field::Text('title', $fileName, 'UTF-8')); 183 | } 184 | } 185 | 186 | /** 187 | * Load Pptx document from a file 188 | * 189 | * @param string $fileName 190 | * @param boolean $storeContent 191 | * @return \ZendSearch\Lucene\Document\Pptx 192 | */ 193 | public static function loadPptxFile($fileName, $storeContent = false) 194 | { 195 | return new self($fileName, $storeContent); 196 | } 197 | } 198 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/LockManager.php: -------------------------------------------------------------------------------- 1 | createFile(self::WRITE_LOCK_FILE); 42 | if (!$lock->lock(LOCK_EX)) { 43 | throw new RuntimeException('Can\'t obtain exclusive index lock'); 44 | } 45 | return $lock; 46 | } 47 | 48 | /** 49 | * Release exclusive write lock 50 | * 51 | * @param \ZendSearch\Lucene\Storage\Directory $lockDirectory 52 | */ 53 | public static function releaseWriteLock(Directory $lockDirectory) 54 | { 55 | $lock = $lockDirectory->getFileObject(self::WRITE_LOCK_FILE); 56 | 
$lock->unlock(); 57 | } 58 | 59 | /** 60 | * Obtain the exclusive "read escalation/de-escalation" lock 61 | * 62 | * Required to protect the escalate/de-escalate read lock process 63 | * on GFS (and potentially other) mounted filesystems. 64 | * 65 | * Why we need this: 66 | * While GFS supports cluster-wide locking via flock(), its 67 | * implementation isn't quite what it should be. The locking 68 | * semantics that work consistently on a local filesystem tend to 69 | * fail on GFS mounted filesystems. This appears to be a design defect 70 | * in the implementation of GFS. How this manifests itself is that 71 | * conditional promotion of a shared lock to exclusive will always 72 | * fail, lock release requests are honored but not immediately 73 | * processed (causing erratic failures of subsequent conditional 74 | * requests) and the releasing of the exclusive lock before the 75 | * shared lock is set when a lock is demoted (which can open a window 76 | * of opportunity for another process to gain an exclusive lock when 77 | * it shouldn't be allowed to). 78 | * 79 | * @param \ZendSearch\Lucene\Storage\Directory $lockDirectory 80 | * @return \ZendSearch\Lucene\Storage\File\FileInterface 81 | * @throws \ZendSearch\Lucene\Exception\RuntimeException 82 | */ 83 | private static function _startReadLockProcessing(Directory $lockDirectory) 84 | { 85 | $lock = $lockDirectory->createFile(self::READ_LOCK_PROCESSING_LOCK_FILE); 86 | if (!$lock->lock(LOCK_EX)) { 87 | throw new RuntimeException('Can\'t obtain exclusive lock for the read lock processing file'); 88 | } 89 | return $lock; 90 | } 91 | 92 | /** 93 | * Release the exclusive "read escalation/de-escalation" lock 94 | * 95 | * Required to protect the escalate/de-escalate read lock process 96 | * on GFS (and potentially other) mounted filesystems. 
97 | * 98 | * @param \ZendSearch\Lucene\Storage\Directory $lockDirectory 99 | */ 100 | private static function _stopReadLockProcessing(Directory $lockDirectory) 101 | { 102 | $lock = $lockDirectory->getFileObject(self::READ_LOCK_PROCESSING_LOCK_FILE); 103 | $lock->unlock(); 104 | } 105 | 106 | 107 | /** 108 | * Obtain shared read lock on the index 109 | * 110 | * It doesn't block other read or update processes, but prevents the index from premature cleaning-up 111 | * 112 | * @param \ZendSearch\Lucene\Storage\Directory $lockDirectory 113 | * @return \ZendSearch\Lucene\Storage\File\FileInterface 114 | * @throws \ZendSearch\Lucene\Exception\RuntimeException 115 | */ 116 | public static function obtainReadLock(Directory $lockDirectory) 117 | { 118 | $lock = $lockDirectory->createFile(self::READ_LOCK_FILE); 119 | if (!$lock->lock(LOCK_SH)) { 120 | throw new RuntimeException('Can\'t obtain shared reading index lock'); 121 | } 122 | return $lock; 123 | } 124 | 125 | /** 126 | * Release shared read lock 127 | * 128 | * @param \ZendSearch\Lucene\Storage\Directory $lockDirectory 129 | */ 130 | public static function releaseReadLock(Directory $lockDirectory) 131 | { 132 | $lock = $lockDirectory->getFileObject(self::READ_LOCK_FILE); 133 | $lock->unlock(); 134 | } 135 | 136 | /** 137 | * Escalate Read lock to exclusive level 138 | * 139 | * @param \ZendSearch\Lucene\Storage\Directory $lockDirectory 140 | * @return boolean True if the exclusive lock was obtained, false otherwise 141 | */ 142 | public static function escalateReadLock(Directory $lockDirectory) 143 | { 144 | self::_startReadLockProcessing($lockDirectory); 145 | 146 | $lock = $lockDirectory->getFileObject(self::READ_LOCK_FILE); 147 | 148 | // First, release the shared lock for the benefit of GFS since 149 | // it will fail the conditional request to promote the lock to 150 | // "exclusive" while the shared lock is held (even when we are 151 | // the only holder). 152 | $lock->unlock(); 153 | 154 | // GFS is really poor. 
While the above "unlock" returns, GFS 155 | // doesn't clean up its tables right away (which will potentially 156 | // cause the conditional locking for the "exclusive" lock to fail). 157 | // We will retry the conditional lock request several times on a 158 | // failure to get past this. The performance hit is negligible 159 | // in the grand scheme of things and only will occur with GFS 160 | // filesystems or if another local process has the shared lock 161 | // on local filesystems. 162 | for ($retries = 0; $retries < 10; $retries++) { 163 | if ($lock->lock(LOCK_EX, true)) { 164 | // Exclusive lock is obtained! 165 | self::_stopReadLockProcessing($lockDirectory); 166 | return true; 167 | } 168 | 169 | // wait 1 microsecond 170 | usleep(1); 171 | } 172 | 173 | // Escalation failed: restore lock state by re-acquiring the shared lock 174 | $lock->lock(LOCK_SH); 175 | 176 | self::_stopReadLockProcessing($lockDirectory); 177 | return false; 178 | } 179 | 180 | /** 181 | * De-escalate Read lock to shared level 182 | * 183 | * @param \ZendSearch\Lucene\Storage\Directory $lockDirectory 184 | */ 185 | public static function deEscalateReadLock(Directory $lockDirectory) 186 | { 187 | $lock = $lockDirectory->getFileObject(self::READ_LOCK_FILE); 188 | $lock->lock(LOCK_SH); 189 | } 190 | 191 | /** 192 | * Obtain exclusive optimization lock on the index 193 | * 194 | * Returns lock object on success and false otherwise (doesn't block execution) 195 | * 196 | * @param \ZendSearch\Lucene\Storage\Directory $lockDirectory 197 | * @return mixed Lock file object on success, false if the lock attempt fails 198 | */ 199 | public static function obtainOptimizationLock(Directory $lockDirectory) 200 | { 201 | $lock = $lockDirectory->createFile(self::OPTIMIZATION_LOCK_FILE); 202 | if (!$lock->lock(LOCK_EX, true)) { 203 | return false; 204 | } 205 | return $lock; 206 | } 207 | 208 | /** 209 | * Release exclusive optimization lock 210 | * 211 | * @param \ZendSearch\Lucene\Storage\Directory $lockDirectory 212 | */ 213 | public static function releaseOptimizationLock(Directory $lockDirectory) 214 | { 
215 | $lock = $lockDirectory->getFileObject(self::OPTIMIZATION_LOCK_FILE); 216 | $lock->unlock(); 217 | } 218 | } 219 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Index/SegmentWriter/DocumentWriter.php: -------------------------------------------------------------------------------- 1 | _termDocs = array(); 55 | $this->_termDictionary = array(); 56 | } 57 | 58 | 59 | /** 60 | * Adds a document to this segment. 61 | * 62 | * @param \ZendSearch\Lucene\Document $document 63 | * @throws LuceneException\UnsupportedMethodCallException 64 | */ 65 | public function addDocument(Document $document) 66 | { 67 | $storedFields = array(); 68 | $docNorms = array(); 69 | $similarity = AbstractSimilarity::getDefault(); 70 | 71 | foreach ($document->getFieldNames() as $fieldName) { 72 | $field = $document->getField($fieldName); 73 | 74 | if ($field->storeTermVector) { 75 | /** 76 | * @todo term vector storing support 77 | */ 78 | throw new LuceneException\UnsupportedMethodCallException('Store term vector functionality is not supported yet.'); 79 | } 80 | 81 | if ($field->isIndexed) { 82 | if ($field->isTokenized) { 83 | $analyzer = Analyzer\Analyzer::getDefault(); 84 | $analyzer->setInput($field->value, $field->encoding); 85 | 86 | $position = 0; 87 | $tokenCounter = 0; 88 | while (($token = $analyzer->nextToken()) !== null) { 89 | $tokenCounter++; 90 | 91 | $term = new Index\Term($token->getTermText(), $field->name); 92 | $termKey = $term->key(); 93 | 94 | if (!isset($this->_termDictionary[$termKey])) { 95 | // New term 96 | $this->_termDictionary[$termKey] = $term; 97 | $this->_termDocs[$termKey] = array(); 98 | $this->_termDocs[$termKey][$this->_docCount] = array(); 99 | } elseif (!isset($this->_termDocs[$termKey][$this->_docCount])) { 100 | // Existing term, but new term entry 101 | $this->_termDocs[$termKey][$this->_docCount] = array(); 102 | } 103 | $position += $token->getPositionIncrement(); 104 | 
$this->_termDocs[$termKey][$this->_docCount][] = $position; 105 | } 106 | 107 | if ($tokenCounter == 0) { 108 | // Field contains empty value. Treat it as non-indexed and non-tokenized 109 | $field = clone($field); 110 | $field->isIndexed = $field->isTokenized = false; 111 | } else { 112 | $docNorms[$field->name] = chr($similarity->encodeNorm( $similarity->lengthNorm($field->name, 113 | $tokenCounter)* 114 | $document->boost* 115 | $field->boost )); 116 | } 117 | } elseif (($fieldUtf8Value = $field->getUtf8Value()) == '') { 118 | // Field contains empty value. Treat it as non-indexed and non-tokenized 119 | $field = clone($field); 120 | $field->isIndexed = $field->isTokenized = false; 121 | } else { 122 | $term = new Index\Term($fieldUtf8Value, $field->name); 123 | $termKey = $term->key(); 124 | 125 | if (!isset($this->_termDictionary[$termKey])) { 126 | // New term 127 | $this->_termDictionary[$termKey] = $term; 128 | $this->_termDocs[$termKey] = array(); 129 | $this->_termDocs[$termKey][$this->_docCount] = array(); 130 | } elseif (!isset($this->_termDocs[$termKey][$this->_docCount])) { 131 | // Existing term, but new term entry 132 | $this->_termDocs[$termKey][$this->_docCount] = array(); 133 | } 134 | $this->_termDocs[$termKey][$this->_docCount][] = 0; // position 135 | 136 | $docNorms[$field->name] = chr($similarity->encodeNorm( $similarity->lengthNorm($field->name, 1)* 137 | $document->boost* 138 | $field->boost )); 139 | } 140 | } 141 | 142 | if ($field->isStored) { 143 | $storedFields[] = $field; 144 | } 145 | 146 | $this->addField($field); 147 | } 148 | 149 | foreach ($this->_fields as $fieldName => $field) { 150 | if (!$field->isIndexed) { 151 | continue; 152 | } 153 | 154 | if (!isset($this->_norms[$fieldName])) { 155 | $this->_norms[$fieldName] = str_repeat(chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) )), 156 | $this->_docCount); 157 | } 158 | 159 | if (isset($docNorms[$fieldName])){ 160 | $this->_norms[$fieldName] .= 
$docNorms[$fieldName]; 161 | } else { 162 | $this->_norms[$fieldName] .= chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) )); 163 | } 164 | } 165 | 166 | $this->addStoredFields($storedFields); 167 | } 168 | 169 | 170 | /** 171 | * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files 172 | */ 173 | protected function _dumpDictionary() 174 | { 175 | ksort($this->_termDictionary, SORT_STRING); 176 | 177 | $this->initializeDictionaryFiles(); 178 | 179 | foreach ($this->_termDictionary as $termId => $term) { 180 | $this->addTerm($term, $this->_termDocs[$termId]); 181 | } 182 | 183 | $this->closeDictionaryFiles(); 184 | } 185 | 186 | 187 | /** 188 | * Close segment, write it to disk and return segment info 189 | * 190 | * @return \ZendSearch\Lucene\Index\SegmentInfo|null Null when the segment's document count is zero (nothing is written) 191 | */ 192 | public function close() 193 | { 194 | if ($this->_docCount == 0) { 195 | return null; 196 | } 197 | 198 | $this->_dumpFNM(); 199 | $this->_dumpDictionary(); 200 | 201 | $this->_generateCFS(); 202 | 203 | return new Index\SegmentInfo($this->_directory, 204 | $this->_name, 205 | $this->_docCount, 206 | -1, 207 | null, 208 | true, 209 | true); 210 | } 211 | 212 | } 213 | 214 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/Query/Preprocessing/Phrase.php: -------------------------------------------------------------------------------- 1 | _phrase = $phrase; 82 | $this->_phraseEncoding = $phraseEncoding; 83 | $this->_field = $fieldName; 84 | } 85 | 86 | /** 87 | * Set slop 88 | * 89 | * @param integer $slop 90 | */ 91 | public function setSlop($slop) 92 | { 93 | $this->_slop = $slop; 94 | } 95 | 96 | 97 | /** 98 | * Get slop 99 | * 100 | * @return integer 101 | */ 102 | public function getSlop() 103 | { 104 | return $this->_slop; 105 | } 106 | 107 | /** 108 | * Re-write query into primitive queries in the context of specified index 109 | * 110 | * @param 
\ZendSearch\Lucene\SearchIndexInterface $index 111 | * @return \ZendSearch\Lucene\Search\Query\AbstractQuery 112 | */ 113 | public function rewrite(Lucene\SearchIndexInterface $index) 114 | { 115 | // Allow to use wildcards within phrases 116 | // They are either removed by text analyzer or used as a part of keyword for keyword fields 117 | // 118 | // if (strpos($this->_phrase, '?') !== false || strpos($this->_phrase, '*') !== false) { 119 | // require_once 'Zend/Search/Lucene/Search/QueryParserException.php'; 120 | // throw new Zend_Search_Lucene_Search_QueryParserException('Wildcards are only allowed in a single terms.'); 121 | // } 122 | 123 | // Split query into subqueries if field name is not specified 124 | if ($this->_field === null) { 125 | $query = new Query\Boolean(); 126 | $query->setBoost($this->getBoost()); 127 | 128 | if (Lucene\Lucene::getDefaultSearchField() === null) { 129 | $searchFields = $index->getFieldNames(true); 130 | } else { 131 | $searchFields = array(Lucene\Lucene::getDefaultSearchField()); 132 | } 133 | 134 | foreach ($searchFields as $fieldName) { 135 | $subquery = new Phrase($this->_phrase, 136 | $this->_phraseEncoding, 137 | $fieldName); 138 | $subquery->setSlop($this->getSlop()); 139 | 140 | $query->addSubquery($subquery->rewrite($index)); 141 | } 142 | 143 | $this->_matches = $query->getQueryTerms(); 144 | return $query; 145 | } 146 | 147 | // Recognize exact term matching (it corresponds to Keyword fields stored in the index) 148 | // encoding is not used since we expect binary matching 149 | $term = new Index\Term($this->_phrase, $this->_field); 150 | if ($index->hasTerm($term)) { 151 | $query = new Query\Term($term); 152 | $query->setBoost($this->getBoost()); 153 | 154 | $this->_matches = $query->getQueryTerms(); 155 | return $query; 156 | } 157 | 158 | 159 | // tokenize phrase using current analyzer and process it as a phrase query 160 | $tokens = Analyzer::getDefault()->tokenize($this->_phrase, $this->_phraseEncoding); 161 | 
162 | if (count($tokens) == 0) { 163 | $this->_matches = array(); 164 | return new Query\Insignificant(); 165 | } 166 | 167 | if (count($tokens) == 1) { 168 | $term = new Index\Term($tokens[0]->getTermText(), $this->_field); 169 | $query = new Query\Term($term); 170 | $query->setBoost($this->getBoost()); 171 | 172 | $this->_matches = $query->getQueryTerms(); 173 | return $query; 174 | } 175 | 176 | // Non-trivial phrase query: carry slop and boost over once (boost is set on the single-term queries above as well) 177 | $position = -1; 178 | $query = new Query\Phrase(); 179 | $query->setSlop($this->getSlop()); 180 | $query->setBoost($this->getBoost()); 181 | foreach ($tokens as $token) { 182 | $position += $token->getPositionIncrement(); 183 | $query->addTerm(new Index\Term($token->getTermText(), $this->_field), $position); 184 | } 185 | $this->_matches = $query->getQueryTerms(); 186 | return $query; 187 | } 188 | 189 | /** 190 | * Query specific matches highlighting 191 | * 192 | * @param Highlighter $highlighter Highlighter object (also contains doc for highlighting) 193 | */ 194 | protected function _highlightMatches(Highlighter $highlighter) 195 | { 196 | /** Skip fields detection. We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */ 197 | 198 | /** Skip exact term matching recognition, keyword fields highlighting is not supported */ 199 | 200 | /** Skip wildcard queries recognition. 
Supported wildcards are removed by text analyzer */ 201 | 202 | 203 | // tokenize phrase using current analyzer and process it as a phrase query 204 | $tokens = Analyzer::getDefault()->tokenize($this->_phrase, $this->_phraseEncoding); 205 | 206 | if (count($tokens) == 0) { 207 | // Do nothing 208 | return; 209 | } 210 | 211 | if (count($tokens) == 1) { 212 | $highlighter->highlight($tokens[0]->getTermText()); 213 | return; 214 | } 215 | 216 | //It's non-trivial phrase query 217 | $words = array(); 218 | foreach ($tokens as $token) { 219 | $words[] = $token->getTermText(); 220 | } 221 | $highlighter->highlight($words); 222 | } 223 | 224 | /** 225 | * Print a query 226 | * 227 | * @return string 228 | */ 229 | public function __toString() 230 | { 231 | // It's used only for query visualisation, so we don't care about characters escaping 232 | if ($this->_field !== null) { 233 | $query = $this->_field . ':'; 234 | } else { 235 | $query = ''; 236 | } 237 | 238 | $query .= '"' . $this->_phrase . '"'; 239 | 240 | if ($this->_slop != 0) { 241 | $query .= '~' . $this->_slop; 242 | } 243 | 244 | if ($this->getBoost() != 1) { 245 | $query .= '^' . round($this->getBoost(), 4); 246 | } 247 | 248 | return $query; 249 | } 250 | } 251 | --------------------------------------------------------------------------------