├── LICENSE.txt ├── README.md ├── composer.json └── library └── ZendSearch ├── Exception └── ExceptionInterface.php └── Lucene ├── AbstractFSM.php ├── AbstractPriorityQueue.php ├── Analysis ├── Analyzer │ ├── AbstractAnalyzer.php │ ├── Analyzer.php │ ├── AnalyzerInterface.php │ └── Common │ │ ├── AbstractCommon.php │ │ ├── Text.php │ │ ├── Text │ │ └── CaseInsensitive.php │ │ ├── TextNum.php │ │ ├── TextNum │ │ └── CaseInsensitive.php │ │ ├── Utf8.php │ │ ├── Utf8 │ │ └── CaseInsensitive.php │ │ ├── Utf8Num.php │ │ └── Utf8Num │ │ └── CaseInsensitive.php ├── Token.php └── TokenFilter │ ├── LowerCase.php │ ├── LowerCaseUtf8.php │ ├── ShortWords.php │ ├── StopWords.php │ └── TokenFilterInterface.php ├── Document.php ├── Document ├── AbstractOpenXML.php ├── Docx.php ├── Exception │ ├── ExceptionInterface.php │ └── InvalidArgumentException.php ├── Field.php ├── HTML.php ├── Pptx.php └── Xlsx.php ├── Exception ├── ExceptionInterface.php ├── ExtensionNotLoadedException.php ├── InvalidArgumentException.php ├── InvalidFileFormatException.php ├── OutOfBoundsException.php ├── OutOfRangeException.php ├── RuntimeException.php ├── UnexpectedValueException.php └── UnsupportedMethodCallException.php ├── FSMAction.php ├── Index.php ├── Index ├── DictionaryLoader.php ├── DocsFilter.php ├── FieldInfo.php ├── SegmentInfo.php ├── SegmentMerger.php ├── SegmentWriter │ ├── AbstractSegmentWriter.php │ ├── DocumentWriter.php │ └── StreamWriter.php ├── Term.php ├── TermInfo.php ├── TermsPriorityQueue.php ├── TermsStreamInterface.php └── Writer.php ├── LockManager.php ├── Lucene.php ├── MultiSearcher.php ├── Search ├── BooleanExpressionRecognizer.php ├── Exception │ ├── ExceptionInterface.php │ └── QueryParserException.php ├── Highlighter │ ├── DefaultHighlighter.php │ └── HighlighterInterface.php ├── Query │ ├── AbstractQuery.php │ ├── Boolean.php │ ├── EmptyResult.php │ ├── Fuzzy.php │ ├── Insignificant.php │ ├── MultiTerm.php │ ├── Phrase.php │ ├── Preprocessing │ │ ├── 
AbstractPreprocessing.php │ │ ├── Fuzzy.php │ │ ├── Phrase.php │ │ └── Term.php │ ├── Range.php │ ├── Term.php │ └── Wildcard.php ├── QueryEntry │ ├── AbstractQueryEntry.php │ ├── Phrase.php │ ├── Subquery.php │ └── Term.php ├── QueryHit.php ├── QueryLexer.php ├── QueryParser.php ├── QueryParserContext.php ├── QueryToken.php ├── Similarity │ ├── AbstractSimilarity.php │ └── DefaultSimilarity.php └── Weight │ ├── AbstractWeight.php │ ├── Boolean.php │ ├── EmptyResultWeight.php │ ├── MultiTerm.php │ ├── Phrase.php │ └── Term.php ├── SearchIndexInterface.php ├── Storage ├── Directory │ ├── DirectoryInterface.php │ └── Filesystem.php └── File │ ├── AbstractFile.php │ ├── FileInterface.php │ ├── Filesystem.php │ └── Memory.php └── TermStreamsPriorityQueue.php /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2005-2012, Zend Technologies USA, Inc. 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | * Neither the name of Zend Technologies USA, Inc. nor the names of its 15 | contributors may be used to endorse or promote products derived from this 16 | software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 19 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 20 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 21 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 22 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 23 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 24 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 25 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 26 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ZendSearch component 2 | 3 | > ## UNMAINTAINED 4 | > 5 | > This package is no longer maintained. 6 | 7 | You can install using: 8 | 9 | ``` 10 | curl -s https://getcomposer.org/installer | php 11 | php composer.phar install 12 | ``` 13 | 14 | At that point, follow the instructions in the documentation folder for actual 15 | usage of the component. (Documentation is forthcoming.) 
/**
 * Remove the least element from the queue and return it.
 *
 * The last heap slot is detached and sifted down from the root: at every
 * level the smaller child is pulled up into the hole until the detached
 * element is no greater than the remaining child (or the bottom is reached).
 *
 * O(log(N)) time
 *
 * @return mixed  the least element, or null when the queue is empty
 */
public function pop()
{
    $size = count($this->_heap);
    if ($size == 0) {
        return null;
    }

    $least  = $this->_heap[0];
    $tailId = $size - 1;               // slot that disappears after this call
    $tail   = $this->_heap[$tailId];   // element that has to find a new place
    $holeId = 0;                       // the root is vacated first

    while (true) {
        // Children of the hole; the tail slot itself is never a candidate.
        $candidateId = ($holeId << 1) + 1;
        $siblingId   = $candidateId + 1;

        // Prefer the smaller of the two children
        if ($siblingId < $tailId &&
            $this->_less($this->_heap[$siblingId], $this->_heap[$candidateId])
        ) {
            $candidateId = $siblingId;
        }

        // Stop when there is no child left or the tail fits into the hole
        if ($candidateId >= $tailId || !$this->_less($this->_heap[$candidateId], $tail)) {
            break;
        }

        // Pull the child up and continue one level deeper
        $this->_heap[$holeId] = $this->_heap[$candidateId];
        $holeId = $candidateId;
    }

    // Drop the tail element into the hole and shrink the heap
    $this->_heap[$holeId] = $tail;
    unset($this->_heap[$tailId]);

    return $least;
}
/**
 * Tokenization stream API
 * Get next token
 *
 * Scans the input for the next run of ASCII letters starting at the current
 * position, wraps it in a Token and passes it through normalize(). Tokens
 * that normalize() rejects (returns null for) are skipped.
 *
 * @return \ZendSearch\Lucene\Analysis\Token|null  null at the end of stream
 */
public function nextToken()
{
    if ($this->_input === null) {
        return null;
    }

    while (true) {
        $found = preg_match('/[a-zA-Z]+/', $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_position);
        if (! $found) {
            // Either nothing is left to match (0) or the match failed (false)
            return null;
        }

        list($word, $start) = $match[0];
        $end = $start + strlen($word);

        // Continue after this word on the next call / iteration
        $this->_position = $end;

        $token = $this->normalize(new Analysis\Token($word, $start, $end));
        if ($token !== null) {
            return $token;
        }
        // the token was filtered out; try the next word
    }
}
/**
 * Tokenization stream API
 * Get next token
 *
 * Finds the next run of Unicode letters, starting at the current byte
 * position. Two cursors are maintained: $_bytePosition (byte offset, used
 * as the preg_match() offset) and $_position (character offset, reported
 * in the Token). Tokens rejected by normalize() are skipped.
 *
 * @return \ZendSearch\Lucene\Analysis\Token|null  null at the end of stream
 */
public function nextToken()
{
    if ($this->_input === null) {
        return null;
    }

    while (true) {
        $found = preg_match('/[\p{L}]+/u', $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_bytePosition);
        if (! $found) {
            // Either nothing is left to match (0) or the match failed (false)
            return null;
        }

        // Matched word and its byte offset within the input
        list($word, $wordByteStart) = $match[0];

        // Text between the previous token and this one, needed to translate
        // the byte offset into a character offset
        $skipped = substr($this->_input,
                          $this->_bytePosition,
                          $wordByteStart - $this->_bytePosition);

        $charStart = $this->_position + iconv_strlen($skipped, 'UTF-8');
        $charEnd   = $charStart + iconv_strlen($word, 'UTF-8');

        // Advance both cursors past the word
        $this->_bytePosition = $wordByteStart + strlen($word);
        $this->_position     = $charEnd;

        $token = $this->normalize(new Analysis\Token($word, $charStart, $charEnd));
        if ($token !== null) {
            return $token;
        }
        // the token was filtered out; try the next word
    }
}
81 | return null; 82 | } 83 | 84 | do { 85 | if (! preg_match('/[\p{L}\p{N}]+/u', $this->_input, $match, PREG_OFFSET_CAPTURE, $this->_bytePosition)) { 86 | // It covers both cases a) there are no matches (preg_match(...) === 0) 87 | // b) error occured (preg_match(...) === FALSE) 88 | return null; 89 | } 90 | 91 | // matched string 92 | $matchedWord = $match[0][0]; 93 | 94 | // binary position of the matched word in the input stream 95 | $binStartPos = $match[0][1]; 96 | 97 | // character position of the matched word in the input stream 98 | $startPos = $this->_position + 99 | iconv_strlen(substr($this->_input, 100 | $this->_bytePosition, 101 | $binStartPos - $this->_bytePosition), 102 | 'UTF-8'); 103 | // character postion of the end of matched word in the input stream 104 | $endPos = $startPos + iconv_strlen($matchedWord, 'UTF-8'); 105 | 106 | $this->_bytePosition = $binStartPos + strlen($matchedWord); 107 | $this->_position = $endPos; 108 | 109 | $token = $this->normalize(new Analysis\Token($matchedWord, $startPos, $endPos)); 110 | } while ($token === null); // try again if token is skipped 111 | 112 | return $token; 113 | } 114 | } 115 | 116 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Analysis/Analyzer/Common/Utf8Num/CaseInsensitive.php: -------------------------------------------------------------------------------- 1 | addFilter(new TokenFilter\LowerCaseUtf8()); 28 | } 29 | } 30 | 31 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Analysis/Token.php: -------------------------------------------------------------------------------- 1 | _termText = $text; 77 | $this->_startOffset = $start; 78 | $this->_endOffset = $end; 79 | 80 | $this->_positionIncrement = 1; 81 | } 82 | 83 | 84 | /** 85 | * positionIncrement setter 86 | * 87 | * @param integer $positionIncrement 88 | */ 89 | public function setPositionIncrement($positionIncrement) 90 
| { 91 | $this->_positionIncrement = $positionIncrement; 92 | } 93 | 94 | /** 95 | * Returns the position increment of this Token. 96 | * 97 | * @return integer 98 | */ 99 | public function getPositionIncrement() 100 | { 101 | return $this->_positionIncrement; 102 | } 103 | 104 | /** 105 | * Returns the Token's term text. 106 | * 107 | * @return string 108 | */ 109 | public function getTermText() 110 | { 111 | return $this->_termText; 112 | } 113 | 114 | /** 115 | * Returns this Token's starting offset, the position of the first character 116 | * corresponding to this token in the source text. 117 | * 118 | * Note: 119 | * The difference between getEndOffset() and getStartOffset() may not be equal 120 | * to strlen(Zend_Search_Lucene_Analysis_Token::getTermText()), as the term text may have been altered 121 | * by a stemmer or some other filter. 122 | * 123 | * @return integer 124 | */ 125 | public function getStartOffset() 126 | { 127 | return $this->_startOffset; 128 | } 129 | 130 | /** 131 | * Returns this Token's ending offset, one greater than the position of the 132 | * last character corresponding to this token in the source text. 
/**
 * Normalize Token or remove it (if null is returned)
 *
 * A token is removed when its term text is shorter than the configured
 * minimum length. The length is counted in characters when the term text
 * is valid UTF-8 (which includes plain ASCII), so multi-byte words are not
 * over-counted; any other byte sequence falls back to its byte length.
 *
 * @param \ZendSearch\Lucene\Analysis\Token $srcToken
 * @return \ZendSearch\Lucene\Analysis\Token|null
 */
public function normalize(Token $srcToken)
{
    $text = $srcToken->getTermText();

    // preg_match() with the /u modifier fails on malformed UTF-8 without
    // raising a notice, which makes it a cheap validity probe.
    if (preg_match('//u', $text) === 1) {
        $termLength = iconv_strlen($text, 'UTF-8');
    } else {
        $termLength = strlen($text);
    }

    if ($termLength < $this->length) {
        return null;
    }

    return $srcToken;
}
/**
 * Token filter that removes stop words.
 *
 * Matching is an exact comparison against the token's term text, so it is
 * recommended to provide all words in lowercase and to add this filter
 * after the lowercase filter.
 *
 * @category   Zend
 * @package    Zend_Search_Lucene
 * @subpackage Analysis
 */
class StopWords implements TokenFilterInterface
{
    /**
     * Stop words, stored as array keys for constant-time lookup
     * (the values carry no meaning).
     *
     * @var array
     */
    private $_stopSet;

    /**
     * Constructs new instance of this filter.
     *
     * The words are taken from the array VALUES (the array is flipped to
     * build the lookup set), e.g. array('a', 'an', 'the'). Values must be
     * strings or integers, as required by array_flip().
     *
     * @param array $stopwords list of words that will be filtered out
     */
    public function __construct($stopwords = array())
    {
        $this->_stopSet = array_flip($stopwords);
    }

    /**
     * Normalize Token or remove it (if null is returned)
     *
     * @param \ZendSearch\Lucene\Analysis\Token $srcToken
     * @return \ZendSearch\Lucene\Analysis\Token|null  null when the term text is a stop word
     */
    public function normalize(Token $srcToken)
    {
        if (array_key_exists($srcToken->getTermText(), $this->_stopSet)) {
            return null;
        }

        return $srcToken;
    }

    /**
     * Fills stopwords set from a text file. Each line contains one stopword.
     * Lines are trimmed first; empty lines and lines whose first remaining
     * character is '#' are ignored (as comments).
     *
     * You can call this method one or more times. New stopwords are always added to current set.
     *
     * @param string $filepath full path for text file with stopwords
     * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException when the path is empty or is not a regular file
     * @throws \ZendSearch\Lucene\Exception\RuntimeException when the file cannot be opened or closed
     */
    public function loadFromFile($filepath = null)
    {
        // is_file() also rejects directories, which file_exists() would accept
        if (! $filepath || ! is_file($filepath)) {
            throw new InvalidArgumentException('You have to provide valid file path');
        }

        $fd = fopen($filepath, "r");
        if (! $fd) {
            throw new RuntimeException('Cannot open file ' . $filepath);
        }

        // Loop on the result of fgets() itself: it returns false both at end
        // of file and on a read error, so the loop always terminates and no
        // boolean is ever handed to trim().
        while (($line = fgets($fd)) !== false) {
            $word = trim($line);
            if ($word !== '' && $word[0] != '#') {
                $this->_stopSet[$word] = 1;
            }
        }

        if (!fclose($fd)) {
            throw new RuntimeException('Cannot close file ' . $filepath);
        }
    }
}
/**
 * Look up the {@link \ZendSearch\Lucene\Document\Field} object stored under
 * the given name in this document.
 *
 * @param string $fieldName
 * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException when no field with that name was added
 * @return \ZendSearch\Lucene\Document\Field
 */
public function getField($fieldName)
{
    if (array_key_exists($fieldName, $this->_fields)) {
        return $this->_fields[$fieldName];
    }

    throw new InvalidArgumentException("Field name \"$fieldName\" not found in document.");
}
/**
 * Determine absolute zip path
 *
 * Both '/' and '\' are accepted as separators. Empty segments and '.'
 * segments are dropped, a '..' segment removes the segment collected before
 * it (and is ignored when there is none). The result is always joined with
 * '/' and carries no leading or trailing separator.
 *
 * @param string $path
 * @return string
 */
protected function absoluteZipPath($path)
{
    $normalized = str_replace(array('/', '\\'), DIRECTORY_SEPARATOR, $path);

    $resolved = array();
    foreach (explode(DIRECTORY_SEPARATOR, $normalized) as $segment) {
        if ($segment === '' || $segment === '.') {
            // nothing to add for empty or "current directory" segments
            continue;
        }

        if ($segment === '..') {
            // step back one level; no-op when already at the top
            array_pop($resolved);
            continue;
        }

        $resolved[] = $segment;
    }

    return implode('/', $resolved);
}
Read in contents... 75 | $contents = simplexml_load_string($package->getFromName( 76 | $this->absoluteZipPath(dirname($rel['Target']) 77 | . '/' 78 | . basename($rel['Target'])) 79 | )); 80 | 81 | $contents->registerXPathNamespace('w', self::SCHEMA_WORDPROCESSINGML); 82 | $paragraphs = $contents->xpath('//w:body/w:p'); 83 | 84 | foreach ($paragraphs as $paragraph) { 85 | $runs = $paragraph->xpath('.//w:r/*[name() = "w:t" or name() = "w:br"]'); 86 | 87 | if ($runs === false) { 88 | // Paragraph doesn't contain any text or breaks 89 | continue; 90 | } 91 | 92 | foreach ($runs as $run) { 93 | if ($run->getName() == 'br') { 94 | // Break element 95 | $documentBody[] = ' '; 96 | } else { 97 | $documentBody[] = (string)$run; 98 | } 99 | } 100 | 101 | // Add space after each paragraph. So they are not bound together. 102 | $documentBody[] = ' '; 103 | } 104 | 105 | break; 106 | } 107 | } 108 | 109 | // Read core properties 110 | $coreProperties = $this->extractMetaData($package); 111 | 112 | // Close file 113 | $package->close(); 114 | 115 | // Store filename 116 | $this->addField(Field::Text('filename', $fileName, 'UTF-8')); 117 | 118 | // Store contents 119 | if ($storeContent) { 120 | $this->addField(Field::Text('body', implode('', $documentBody), 'UTF-8')); 121 | } else { 122 | $this->addField(Field::UnStored('body', implode('', $documentBody), 'UTF-8')); 123 | } 124 | 125 | // Store meta data properties 126 | foreach ($coreProperties as $key => $value) { 127 | $this->addField(Field::Text($key, $value, 'UTF-8')); 128 | } 129 | 130 | // Store title (if not present in meta data) 131 | if (! 
/**
 * Create a Docx document from a file on disk.
 *
 * The file is only checked for readability here; everything else is done
 * by the constructor.
 *
 * @param string  $fileName
 * @param boolean $storeContent
 * @throws \ZendSearch\Lucene\Document\Exception\InvalidArgumentException when the file is not readable
 * @return \ZendSearch\Lucene\Document\Docx
 */
public static function loadDocxFile($fileName, $storeContent = false)
{
    if (is_readable($fileName)) {
        return new self($fileName, $storeContent);
    }

    throw new InvalidArgumentException('Provided file \'' . $fileName . '\' is not readable.');
}
/**
 * Constructs a Binary String valued Field that is not tokenized nor indexed,
 * but is stored in the index, for return with hits.
 *
 * Binary fields are created with an empty encoding.
 *
 * @param string $name
 * @param string $value
 * @return \ZendSearch\Lucene\Document\Field
 */
public static function binary($name, $value)
{
    // Flag order follows the other factories of this class
    // (stored, indexed, tokenized, binary).
    $isStored    = true;
    $isIndexed   = false;
    $isTokenized = false;
    $isBinary    = true;

    return new self($name, $value, '', $isStored, $isIndexed, $isTokenized, $isBinary);
}
186 | * 187 | * @param string $name 188 | * @param string $value 189 | * @param string $encoding 190 | * @return \ZendSearch\Lucene\Document\Field 191 | */ 192 | public static function unStored($name, $value, $encoding = 'UTF-8') 193 | { 194 | return new self($name, $value, $encoding, false, true, true); 195 | } 196 | 197 | /** 198 | * Get field value in UTF-8 encoding 199 | * 200 | * @return string 201 | */ 202 | public function getUtf8Value() 203 | { 204 | if (strcasecmp($this->encoding, 'utf8' ) == 0 || 205 | strcasecmp($this->encoding, 'utf-8') == 0 ) { 206 | return $this->value; 207 | } else { 208 | 209 | return (PHP_OS != 'AIX') ? iconv($this->encoding, 'UTF-8', $this->value) : iconv('ISO8859-1', 'UTF-8', $this->value); 210 | } 211 | } 212 | } 213 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Document/Pptx.php: -------------------------------------------------------------------------------- 1 | open($fileName); 77 | 78 | // Read relations and search for officeDocument 79 | $relationsXml = $package->getFromName('_rels/.rels'); 80 | if ($relationsXml === false) { 81 | throw new RuntimeException('Invalid archive or corrupted .pptx file.'); 82 | } 83 | 84 | // Prevent php from loading remote resources 85 | $loadEntities = libxml_disable_entity_loader(true); 86 | 87 | $relations = simplexml_load_string($relationsXml); 88 | 89 | // Restore entity loader state 90 | libxml_disable_entity_loader($loadEntities); 91 | 92 | foreach ($relations->Relationship as $rel) { 93 | if ($rel["Type"] == AbstractOpenXML::SCHEMA_OFFICEDOCUMENT) { 94 | // Found office document! Search for slides... 95 | $slideRelations = simplexml_load_string($package->getFromName( $this->absoluteZipPath(dirname($rel["Target"]) . "/_rels/" . basename($rel["Target"]) . ".rels")) ); 96 | foreach ($slideRelations->Relationship as $slideRel) { 97 | if ($slideRel["Type"] == self::SCHEMA_SLIDERELATION) { 98 | // Found slide! 
99 | $slides[ str_replace( 'rId', '', (string)$slideRel["Id"] ) ] = simplexml_load_string( 100 | $package->getFromName( $this->absoluteZipPath(dirname($rel["Target"]) . "/" . dirname($slideRel["Target"]) . "/" . basename($slideRel["Target"])) ) 101 | ); 102 | 103 | // Search for slide notes 104 | $slideNotesRelations = simplexml_load_string($package->getFromName( $this->absoluteZipPath(dirname($rel["Target"]) . "/" . dirname($slideRel["Target"]) . "/_rels/" . basename($slideRel["Target"]) . ".rels")) ); 105 | foreach ($slideNotesRelations->Relationship as $slideNoteRel) { 106 | if ($slideNoteRel["Type"] == self::SCHEMA_SLIDENOTESRELATION) { 107 | // Found slide notes! 108 | $slideNotes[ str_replace( 'rId', '', (string)$slideRel["Id"] ) ] = simplexml_load_string( 109 | $package->getFromName( $this->absoluteZipPath(dirname($rel["Target"]) . "/" . dirname($slideRel["Target"]) . "/" . dirname($slideNoteRel["Target"]) . "/" . basename($slideNoteRel["Target"])) ) 110 | ); 111 | 112 | break; 113 | } 114 | } 115 | } 116 | } 117 | 118 | break; 119 | } 120 | } 121 | 122 | // Sort slides 123 | ksort($slides); 124 | ksort($slideNotes); 125 | 126 | // Extract contents from slides 127 | foreach ($slides as $slideKey => $slide) { 128 | // Register namespaces 129 | $slide->registerXPathNamespace("p", self::SCHEMA_PRESENTATIONML); 130 | $slide->registerXPathNamespace("a", self::SCHEMA_DRAWINGML); 131 | 132 | // Fetch all text 133 | $textElements = $slide->xpath('//a:t'); 134 | foreach ($textElements as $textElement) { 135 | $documentBody[] = (string)$textElement; 136 | } 137 | 138 | // Extract contents from slide notes 139 | if (isset($slideNotes[$slideKey])) { 140 | // Fetch slide note 141 | $slideNote = $slideNotes[$slideKey]; 142 | 143 | // Register namespaces 144 | $slideNote->registerXPathNamespace("p", self::SCHEMA_PRESENTATIONML); 145 | $slideNote->registerXPathNamespace("a", self::SCHEMA_DRAWINGML); 146 | 147 | // Fetch all text 148 | $textElements = 
$slideNote->xpath('//a:t'); 149 | foreach ($textElements as $textElement) { 150 | $documentBody[] = (string)$textElement; 151 | } 152 | } 153 | } 154 | 155 | // Read core properties 156 | $coreProperties = $this->extractMetaData($package); 157 | 158 | // Close file 159 | $package->close(); 160 | 161 | // Store filename 162 | $this->addField(Field::Text('filename', $fileName, 'UTF-8')); 163 | 164 | // Store contents 165 | if ($storeContent) { 166 | $this->addField(Field::Text('body', implode(' ', $documentBody), 'UTF-8')); 167 | } else { 168 | $this->addField(Field::UnStored('body', implode(' ', $documentBody), 'UTF-8')); 169 | } 170 | 171 | // Store meta data properties 172 | foreach ($coreProperties as $key => $value) { 173 | $this->addField(Field::Text($key, $value, 'UTF-8')); 174 | } 175 | 176 | // Store title (if not present in meta data) 177 | if (!isset($coreProperties['title'])) { 178 | $this->addField(Field::Text('title', $fileName, 'UTF-8')); 179 | } 180 | } 181 | 182 | /** 183 | * Load Pptx document from a file 184 | * 185 | * @param string $fileName 186 | * @param boolean $storeContent 187 | * @return \ZendSearch\Lucene\Document\Pptx 188 | */ 189 | public static function loadPptxFile($fileName, $storeContent = false) 190 | { 191 | return new self($fileName, $storeContent); 192 | } 193 | } 194 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Exception/ExceptionInterface.php: -------------------------------------------------------------------------------- 1 | _object = $object; 45 | $this->_method = $method; 46 | } 47 | 48 | public function doAction() 49 | { 50 | $methodName = $this->_method; 51 | $this->_object->$methodName(); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Index/DocsFilter.php: -------------------------------------------------------------------------------- 1 | => array( => , 28 | * => , 29 | * => , 
30 | * ... ), 31 | * => array( => , 32 | * => , 33 | * => , 34 | * ... ), 35 | * => array( => , 36 | * => , 37 | * => , 38 | * ... ), 39 | * ... 40 | * ) 41 | * 42 | * @var array 43 | */ 44 | public $segmentFilters = array(); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Index/FieldInfo.php: -------------------------------------------------------------------------------- 1 | name = $name; 30 | $this->isIndexed = $isIndexed; 31 | $this->number = $number; 32 | $this->storeTermVector = $storeTermVector; 33 | $this->normsOmitted = $normsOmitted; 34 | $this->payloadsStored = $payloadsStored; 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Index/SegmentMerger.php: -------------------------------------------------------------------------------- 1 | ][] => 56 | * 57 | * @var array 58 | */ 59 | private $_fieldsMap = array(); 60 | 61 | 62 | 63 | /** 64 | * Object constructor. 65 | * 66 | * Creates new segment merger with $directory as target to merge segments into 67 | * and $name as a name of new segment 68 | * 69 | * @param \ZendSearch\Lucene\Storage\Directory\DirectoryInterface $directory 70 | * @param string $name 71 | */ 72 | public function __construct(Directory\DirectoryInterface $directory, $name) 73 | { 74 | /** \ZendSearch\Lucene\Index\SegmentWriter\StreamWriter */ 75 | $this->_writer = new SegmentWriter\StreamWriter($directory, $name); 76 | } 77 | 78 | 79 | /** 80 | * Add segment to a collection of segments to be merged 81 | * 82 | * @param \ZendSearch\Lucene\Index\SegmentInfo $segmentInfo 83 | */ 84 | public function addSource(SegmentInfo $segmentInfo) 85 | { 86 | $this->_segmentInfos[$segmentInfo->getName()] = $segmentInfo; 87 | } 88 | 89 | 90 | /** 91 | * Do merge. 
92 | * 93 | * Returns segment info of the newly created segment 94 | * 95 | * @return \ZendSearch\Lucene\Index\SegmentInfo 96 | * @throws \ZendSearch\Lucene\Exception\RuntimeException 97 | */ 98 | public function merge() 99 | { 100 | if ($this->_mergeDone) { 101 | throw new RuntimeException('Merge is already done.'); 102 | } 103 | 104 | if (count($this->_segmentInfos) < 1) { 105 | throw new RuntimeException('Wrong number of segments to be merged (' 106 | . count($this->_segmentInfos) 107 | . ').'); 108 | } 109 | 110 | $this->_mergeFields(); 111 | $this->_mergeNorms(); 112 | $this->_mergeStoredFields(); 113 | $this->_mergeTerms(); 114 | 115 | $this->_mergeDone = true; 116 | 117 | return $this->_writer->close(); 118 | } 119 | 120 | 121 | /** 122 | * Merge fields information 123 | */ 124 | private function _mergeFields() 125 | { 126 | foreach ($this->_segmentInfos as $segName => $segmentInfo) { 127 | foreach ($segmentInfo->getFieldInfos() as $fieldInfo) { 128 | $this->_fieldsMap[$segName][$fieldInfo->number] = $this->_writer->addFieldInfo($fieldInfo); 129 | } 130 | } 131 | } 132 | 133 | /** 134 | * Merge field's normalization factors 135 | */ 136 | private function _mergeNorms() 137 | { 138 | foreach ($this->_writer->getFieldInfos() as $fieldInfo) { 139 | if ($fieldInfo->isIndexed) { 140 | foreach ($this->_segmentInfos as $segName => $segmentInfo) { 141 | if ($segmentInfo->hasDeletions()) { 142 | $srcNorm = $segmentInfo->normVector($fieldInfo->name); 143 | $norm = ''; 144 | $docs = $segmentInfo->count(); 145 | for ($count = 0; $count < $docs; $count++) { 146 | if (!$segmentInfo->isDeleted($count)) { 147 | $norm .= $srcNorm[$count]; 148 | } 149 | } 150 | $this->_writer->addNorm($fieldInfo->name, $norm); 151 | } else { 152 | $this->_writer->addNorm($fieldInfo->name, $segmentInfo->normVector($fieldInfo->name)); 153 | } 154 | } 155 | } 156 | } 157 | } 158 | 159 | /** 160 | * Merge stored fields of non-deleted documents 161 | */ 162 | private function _mergeStoredFields() 163 | { 164 | 
$this->_docCount = 0; 165 | 166 | foreach ($this->_segmentInfos as $segName => $segmentInfo) { 167 | $fdtFile = $segmentInfo->openCompoundFile('.fdt'); 168 | 169 | for ($count = 0; $count < $segmentInfo->count(); $count++) { 170 | $fieldCount = $fdtFile->readVInt(); 171 | $storedFields = array(); 172 | 173 | for ($count2 = 0; $count2 < $fieldCount; $count2++) { 174 | $fieldNum = $fdtFile->readVInt(); 175 | $bits = $fdtFile->readByte(); 176 | $fieldInfo = $segmentInfo->getField($fieldNum); 177 | 178 | if (!($bits & 2)) { // Text data 179 | $storedFields[] = 180 | new Document\Field($fieldInfo->name, 181 | $fdtFile->readString(), 182 | 'UTF-8', 183 | true, 184 | $fieldInfo->isIndexed, 185 | $bits & 1 ); 186 | } else { // Binary data 187 | $storedFields[] = 188 | new Document\Field($fieldInfo->name, 189 | $fdtFile->readBinary(), 190 | '', 191 | true, 192 | $fieldInfo->isIndexed, 193 | $bits & 1, 194 | true); 195 | } 196 | } 197 | 198 | if (!$segmentInfo->isDeleted($count)) { 199 | $this->_docCount++; 200 | $this->_writer->addStoredFields($storedFields); 201 | } 202 | } 203 | } 204 | } 205 | 206 | 207 | /** 208 | * Merge terms and their positions information 209 | */ 210 | private function _mergeTerms() 211 | { 212 | $segmentInfoQueue = new TermsPriorityQueue(); 213 | 214 | $segmentStartId = 0; 215 | foreach ($this->_segmentInfos as $segName => $segmentInfo) { 216 | $segmentStartId = $segmentInfo->resetTermsStream($segmentStartId, SegmentInfo::SM_MERGE_INFO); 217 | 218 | // Skip "empty" segments 219 | if ($segmentInfo->currentTerm() !== null) { 220 | $segmentInfoQueue->put($segmentInfo); 221 | } 222 | } 223 | 224 | $this->_writer->initializeDictionaryFiles(); 225 | 226 | $termDocs = array(); 227 | while (($segmentInfo = $segmentInfoQueue->pop()) !== null) { 228 | // Merge positions array 229 | $termDocs += $segmentInfo->currentTermPositions(); 230 | 231 | if ($segmentInfoQueue->top() === null || 232 | $segmentInfoQueue->top()->currentTerm()->key() != 233 | 
$segmentInfo->currentTerm()->key()) { 234 | // We got new term 235 | ksort($termDocs, SORT_NUMERIC); 236 | 237 | // Add term if it's contained in any document 238 | if (count($termDocs) > 0) { 239 | $this->_writer->addTerm($segmentInfo->currentTerm(), $termDocs); 240 | } 241 | $termDocs = array(); 242 | } 243 | 244 | $segmentInfo->nextTerm(); 245 | // check, if segment dictionary is finished 246 | if ($segmentInfo->currentTerm() !== null) { 247 | // Put segment back into the priority queue 248 | $segmentInfoQueue->put($segmentInfo); 249 | } 250 | } 251 | 252 | $this->_writer->closeDictionaryFiles(); 253 | } 254 | } 255 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Index/SegmentWriter/DocumentWriter.php: -------------------------------------------------------------------------------- 1 | _termDocs = array(); 55 | $this->_termDictionary = array(); 56 | } 57 | 58 | 59 | /** 60 | * Adds a document to this segment. 61 | * 62 | * @param \ZendSearch\Lucene\Document $document 63 | * @throws LuceneException\UnsupportedMethodCallException 64 | */ 65 | public function addDocument(Document $document) 66 | { 67 | $storedFields = array(); 68 | $docNorms = array(); 69 | $similarity = AbstractSimilarity::getDefault(); 70 | 71 | foreach ($document->getFieldNames() as $fieldName) { 72 | $field = $document->getField($fieldName); 73 | 74 | if ($field->storeTermVector) { 75 | /** 76 | * @todo term vector storing support 77 | */ 78 | throw new LuceneException\UnsupportedMethodCallException('Store term vector functionality is not supported yet.'); 79 | } 80 | 81 | if ($field->isIndexed) { 82 | if ($field->isTokenized) { 83 | $analyzer = Analyzer\Analyzer::getDefault(); 84 | $analyzer->setInput($field->value, $field->encoding); 85 | 86 | $position = 0; 87 | $tokenCounter = 0; 88 | while (($token = $analyzer->nextToken()) !== null) { 89 | $tokenCounter++; 90 | 91 | $term = new Index\Term($token->getTermText(), $field->name); 
92 | $termKey = $term->key(); 93 | 94 | if (!isset($this->_termDictionary[$termKey])) { 95 | // New term 96 | $this->_termDictionary[$termKey] = $term; 97 | $this->_termDocs[$termKey] = array(); 98 | $this->_termDocs[$termKey][$this->_docCount] = array(); 99 | } elseif (!isset($this->_termDocs[$termKey][$this->_docCount])) { 100 | // Existing term, but new term entry 101 | $this->_termDocs[$termKey][$this->_docCount] = array(); 102 | } 103 | $position += $token->getPositionIncrement(); 104 | $this->_termDocs[$termKey][$this->_docCount][] = $position; 105 | } 106 | 107 | if ($tokenCounter == 0) { 108 | // Field contains empty value. Treat it as non-indexed and non-tokenized 109 | $field = clone($field); 110 | $field->isIndexed = $field->isTokenized = false; 111 | } else { 112 | $docNorms[$field->name] = chr($similarity->encodeNorm( $similarity->lengthNorm($field->name, 113 | $tokenCounter)* 114 | $document->boost* 115 | $field->boost )); 116 | } 117 | } elseif (($fieldUtf8Value = $field->getUtf8Value()) == '') { 118 | // Field contains empty value. 
Treat it as non-indexed and non-tokenized 119 | $field = clone($field); 120 | $field->isIndexed = $field->isTokenized = false; 121 | } else { 122 | $term = new Index\Term($fieldUtf8Value, $field->name); 123 | $termKey = $term->key(); 124 | 125 | if (!isset($this->_termDictionary[$termKey])) { 126 | // New term 127 | $this->_termDictionary[$termKey] = $term; 128 | $this->_termDocs[$termKey] = array(); 129 | $this->_termDocs[$termKey][$this->_docCount] = array(); 130 | } elseif (!isset($this->_termDocs[$termKey][$this->_docCount])) { 131 | // Existing term, but new term entry 132 | $this->_termDocs[$termKey][$this->_docCount] = array(); 133 | } 134 | $this->_termDocs[$termKey][$this->_docCount][] = 0; // position 135 | 136 | $docNorms[$field->name] = chr($similarity->encodeNorm( $similarity->lengthNorm($field->name, 1)* 137 | $document->boost* 138 | $field->boost )); 139 | } 140 | } 141 | 142 | if ($field->isStored) { 143 | $storedFields[] = $field; 144 | } 145 | 146 | $this->addField($field); 147 | } 148 | 149 | foreach ($this->_fields as $fieldName => $field) { 150 | if (!$field->isIndexed) { 151 | continue; 152 | } 153 | 154 | if (!isset($this->_norms[$fieldName])) { 155 | $this->_norms[$fieldName] = str_repeat(chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) )), 156 | $this->_docCount); 157 | } 158 | 159 | if (isset($docNorms[$fieldName])){ 160 | $this->_norms[$fieldName] .= $docNorms[$fieldName]; 161 | } else { 162 | $this->_norms[$fieldName] .= chr($similarity->encodeNorm( $similarity->lengthNorm($fieldName, 0) )); 163 | } 164 | } 165 | 166 | $this->addStoredFields($storedFields); 167 | } 168 | 169 | 170 | /** 171 | * Dump Term Dictionary (.tis) and Term Dictionary Index (.tii) segment files 172 | */ 173 | protected function _dumpDictionary() 174 | { 175 | ksort($this->_termDictionary, SORT_STRING); 176 | 177 | $this->initializeDictionaryFiles(); 178 | 179 | foreach ($this->_termDictionary as $termId => $term) { 180 | $this->addTerm($term, 
$this->_termDocs[$termId]); 181 | } 182 | 183 | $this->closeDictionaryFiles(); 184 | } 185 | 186 | 187 | /** 188 | * Close segment, write it to disk and return segment info 189 | * 190 | * @return \ZendSearch\Lucene\Index\SegmentInfo 191 | */ 192 | public function close() 193 | { 194 | if ($this->_docCount == 0) { 195 | return null; 196 | } 197 | 198 | $this->_dumpFNM(); 199 | $this->_dumpDictionary(); 200 | 201 | $this->_generateCFS(); 202 | 203 | return new Index\SegmentInfo($this->_directory, 204 | $this->_name, 205 | $this->_docCount, 206 | -1, 207 | null, 208 | true, 209 | true); 210 | } 211 | 212 | } 213 | 214 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Index/SegmentWriter/StreamWriter.php: -------------------------------------------------------------------------------- 1 | _fdxFile = $this->_directory->createFile($this->_name . '.fdx'); 41 | $this->_fdtFile = $this->_directory->createFile($this->_name . '.fdt'); 42 | 43 | $this->_files[] = $this->_name . '.fdx'; 44 | $this->_files[] = $this->_name . 
'.fdt'; 45 | } 46 | 47 | public function addNorm($fieldName, $normVector) 48 | { 49 | if (isset($this->_norms[$fieldName])) { 50 | $this->_norms[$fieldName] .= $normVector; 51 | } else { 52 | $this->_norms[$fieldName] = $normVector; 53 | } 54 | } 55 | 56 | /** 57 | * Close segment, write it to disk and return segment info 58 | * 59 | * @return \ZendSearch\Lucene\Index\SegmentInfo 60 | */ 61 | public function close() 62 | { 63 | if ($this->_docCount == 0) { 64 | return null; 65 | } 66 | 67 | $this->_dumpFNM(); 68 | $this->_generateCFS(); 69 | 70 | return new LuceneIndex\SegmentInfo($this->_directory, 71 | $this->_name, 72 | $this->_docCount, 73 | -1, 74 | null, 75 | true, 76 | true); 77 | } 78 | } 79 | 80 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Index/Term.php: -------------------------------------------------------------------------------- 1 | field = ($field === null)? Lucene\Lucene::getDefaultSearchField() : $field; 50 | $this->text = $text; 51 | } 52 | 53 | 54 | /** 55 | * Returns term key 56 | * 57 | * @return string 58 | */ 59 | public function key() 60 | { 61 | return $this->field . chr(0) . $this->text; 62 | } 63 | 64 | /** 65 | * Get term prefix 66 | * 67 | * @param string $str 68 | * @param integer $length 69 | * @return string 70 | */ 71 | public static function getPrefix($str, $length) 72 | { 73 | /** 74 | * @todo !!!!!!! use mb_string or iconv functions if they are available 75 | */ 76 | $prefixBytes = 0; 77 | $prefixChars = 0; 78 | while (isset($str[$prefixBytes]) && $prefixChars < $length) { 79 | $charBytes = 1; 80 | if ((ord($str[$prefixBytes]) & 0xC0) == 0xC0) { 81 | $charBytes++; 82 | if (ord($str[$prefixBytes]) & 0x20 ) { 83 | $charBytes++; 84 | if (ord($str[$prefixBytes]) & 0x10 ) { 85 | $charBytes++; 86 | } 87 | } 88 | } 89 | 90 | if (! 
isset($str[$prefixBytes + $charBytes - 1])) { 91 | // incomplete (truncated) multi-byte character 92 | break; 93 | } 94 | 95 | $prefixChars++; 96 | $prefixBytes += $charBytes; 97 | } 98 | 99 | return substr($str, 0, $prefixBytes); 100 | } 101 | 102 | /** 103 | * Get UTF-8 string length (in characters) 104 | * 105 | * @param string $str 106 | * @return integer 107 | */ 108 | public static function getLength($str) 109 | { 110 | $bytes = 0; 111 | $chars = 0; 112 | while ($bytes < strlen($str)) { 113 | $charBytes = 1; 114 | if ((ord($str[$bytes]) & 0xC0) == 0xC0) { 115 | $charBytes++; 116 | if (ord($str[$bytes]) & 0x20 ) { 117 | $charBytes++; 118 | if (ord($str[$bytes]) & 0x10 ) { 119 | $charBytes++; 120 | } 121 | } 122 | } 123 | 124 | if ($bytes + $charBytes > strlen($str)) { 125 | // incomplete (truncated) multi-byte character 126 | break; 127 | } 128 | 129 | $chars++; 130 | $bytes += $charBytes; 131 | } 132 | 133 | return $chars; 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Index/TermInfo.php: -------------------------------------------------------------------------------- 1 | docFreq = $docFreq; 61 | $this->freqPointer = $freqPointer; 62 | $this->proxPointer = $proxPointer; 63 | $this->skipOffset = $skipOffset; 64 | $this->indexPointer = $indexPointer; 65 | } 66 | } 67 | 68 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Index/TermsPriorityQueue.php: -------------------------------------------------------------------------------- 1 | currentTerm()->key(), $termsStream2->currentTerm()->key()) < 0; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Index/TermsStreamInterface.php: -------------------------------------------------------------------------------- 1 | createFile(self::WRITE_LOCK_FILE); 42 | if (!$lock->lock(LOCK_EX)) { 43 | throw new RuntimeException('Can\'t obtain exclusive index lock'); 44 | } 45 | return 
$lock; 46 | } 47 | 48 | /** 49 | * Release exclusive write lock 50 | * 51 | * @param \ZendSearch\Lucene\Storage\Directory $lockDirectory 52 | */ 53 | public static function releaseWriteLock(Directory $lockDirectory) 54 | { 55 | $lock = $lockDirectory->getFileObject(self::WRITE_LOCK_FILE); 56 | $lock->unlock(); 57 | } 58 | 59 | /** 60 | * Obtain the exclusive "read escalation/de-escalation" lock 61 | * 62 | * Required to protect the escalate/de-escalate read lock process 63 | * on GFS (and potentially other) mounted filesystems. 64 | * 65 | * Why we need this: 66 | * While GFS supports cluster-wide locking via flock(), its 67 | * implementation isn't quite what it should be. The locking 68 | * semantics that work consistently on a local filesystem tend to 69 | * fail on GFS mounted filesystems. This appears to be a design defect 70 | * in the implementation of GFS. How this manifests itself is that 71 | * conditional promotion of a shared lock to exclusive will always 72 | * fail, lock release requests are honored but not immediately 73 | * processed (causing erratic failures of subsequent conditional 74 | * requests) and the releasing of the exclusive lock before the 75 | * shared lock is set when a lock is demoted (which can open a window 76 | * of opportunity for another process to gain an exclusive lock when 77 | * it shouldn't be allowed to). 
78 | * 79 | * @param \ZendSearch\Lucene\Storage\Directory $lockDirectory 80 | * @return \ZendSearch\Lucene\Storage\File\FileInterface 81 | * @throws \ZendSearch\Lucene\Exception\RuntimeException 82 | */ 83 | private static function _startReadLockProcessing(Directory $lockDirectory) 84 | { 85 | $lock = $lockDirectory->createFile(self::READ_LOCK_PROCESSING_LOCK_FILE); 86 | if (!$lock->lock(LOCK_EX)) { 87 | throw new RuntimeException('Can\'t obtain exclusive lock for the read lock processing file'); 88 | } 89 | return $lock; 90 | } 91 | 92 | /** 93 | * Release the exclusive "read escalation/de-escalation" lock 94 | * 95 | * Required to protect the escalate/de-escalate read lock process 96 | * on GFS (and potentially other) mounted filesystems. 97 | * 98 | * @param \ZendSearch\Lucene\Storage\Directory $lockDirectory 99 | */ 100 | private static function _stopReadLockProcessing(Directory $lockDirectory) 101 | { 102 | $lock = $lockDirectory->getFileObject(self::READ_LOCK_PROCESSING_LOCK_FILE); 103 | $lock->unlock(); 104 | } 105 | 106 | 107 | /** 108 | * Obtain shared read lock on the index 109 | * 110 | * It doesn't block other read or update processes, but prevents the index from premature clean-up 111 | * 112 | * @param \ZendSearch\Lucene\Storage\Directory $lockDirectory 113 | * @return \ZendSearch\Lucene\Storage\File\FileInterface 114 | * @throws \ZendSearch\Lucene\Exception\RuntimeException 115 | */ 116 | public static function obtainReadLock(Directory $lockDirectory) 117 | { 118 | $lock = $lockDirectory->createFile(self::READ_LOCK_FILE); 119 | if (!$lock->lock(LOCK_SH)) { 120 | throw new RuntimeException('Can\'t obtain shared reading index lock'); 121 | } 122 | return $lock; 123 | } 124 | 125 | /** 126 | * Release shared read lock 127 | * 128 | * @param \ZendSearch\Lucene\Storage\Directory $lockDirectory 129 | */ 130 | public static function releaseReadLock(Directory $lockDirectory) 131 | { 132 | $lock = $lockDirectory->getFileObject(self::READ_LOCK_FILE); 
133 | $lock->unlock(); 134 | } 135 | 136 | /** 137 | * Escalate Read lock to exclusive level 138 | * 139 | * @param \ZendSearch\Lucene\Storage\Directory $lockDirectory 140 | * @return boolean 141 | */ 142 | public static function escalateReadLock(Directory $lockDirectory) 143 | { 144 | self::_startReadLockProcessing($lockDirectory); 145 | 146 | $lock = $lockDirectory->getFileObject(self::READ_LOCK_FILE); 147 | 148 | // First, release the shared lock for the benefit of GFS since 149 | // it will fail the conditional request to promote the lock to 150 | // "exclusive" while the shared lock is held (even when we are 151 | // the only holder). 152 | $lock->unlock(); 153 | 154 | // GFS is really poor. While the above "unlock" returns, GFS 155 | // doesn't clean up its tables right away (which will potentially 156 | // cause the conditional locking for the "exclusive" lock to fail). 157 | // We will retry the conditional lock request several times on a 158 | // failure to get past this. The performance hit is negligible 159 | // in the grand scheme of things and only will occur with GFS 160 | // filesystems or if another local process has the shared lock 161 | // on local filesystems. 162 | for ($retries = 0; $retries < 10; $retries++) { 163 | if ($lock->lock(LOCK_EX, true)) { 164 | // Exclusive lock is obtained! 
165 | self::_stopReadLockProcessing($lockDirectory); 166 | return true; 167 | } 168 | 169 | // wait 1 microsecond 170 | usleep(1); 171 | } 172 | 173 | // Restore lock state 174 | $lock->lock(LOCK_SH); 175 | 176 | self::_stopReadLockProcessing($lockDirectory); 177 | return false; 178 | } 179 | 180 | /** 181 | * De-escalate Read lock to shared level 182 | * 183 | * @param \ZendSearch\Lucene\Storage\Directory $lockDirectory 184 | */ 185 | public static function deEscalateReadLock(Directory $lockDirectory) 186 | { 187 | $lock = $lockDirectory->getFileObject(self::READ_LOCK_FILE); 188 | $lock->lock(LOCK_SH); 189 | } 190 | 191 | /** 192 | * Obtain exclusive optimization lock on the index 193 | * 194 | * Returns lock object on success and false otherwise (doesn't block execution) 195 | * 196 | * @param \ZendSearch\Lucene\Storage\Directory $lockDirectory 197 | * @return mixed 198 | */ 199 | public static function obtainOptimizationLock(Directory $lockDirectory) 200 | { 201 | $lock = $lockDirectory->createFile(self::OPTIMIZATION_LOCK_FILE); 202 | if (!$lock->lock(LOCK_EX, true)) { 203 | return false; 204 | } 205 | return $lock; 206 | } 207 | 208 | /** 209 | * Release exclusive optimization lock 210 | * 211 | * @param \ZendSearch\Lucene\Storage\Directory $lockDirectory 212 | */ 213 | public static function releaseOptimizationLock(Directory $lockDirectory) 214 | { 215 | $lock = $lockDirectory->getFileObject(self::OPTIMIZATION_LOCK_FILE); 216 | $lock->unlock(); 217 | } 218 | } 219 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Lucene.php: -------------------------------------------------------------------------------- 1 | _doc = $document; 57 | } 58 | 59 | /** 60 | * Get document for highlighting. 
61 | * 62 | * @return \ZendSearch\Lucene\Document\HTML $document 63 | */ 64 | public function getDocument() 65 | { 66 | return $this->_doc; 67 | } 68 | 69 | /** 70 | * Highlight specified words 71 | * 72 | * @param string|array $words Words to highlight. They could be organized using the array or string. 73 | */ 74 | public function highlight($words) 75 | { 76 | $color = $this->_highlightColors[$this->_currentColorIndex]; 77 | $this->_currentColorIndex = ($this->_currentColorIndex + 1) % count($this->_highlightColors); 78 | 79 | $this->_doc->highlight($words, $color); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/Highlighter/HighlighterInterface.php: -------------------------------------------------------------------------------- 1 | _boost; 49 | } 50 | 51 | /** 52 | * Sets the boost for this query clause to $boost. 53 | * 54 | * @param float $boost 55 | */ 56 | public function setBoost($boost) 57 | { 58 | $this->_boost = $boost; 59 | } 60 | 61 | /** 62 | * Score specified document 63 | * 64 | * @param integer $docId 65 | * @param \ZendSearch\Lucene\SearchIndexInterface $reader 66 | * @return float 67 | */ 68 | abstract public function score($docId, Lucene\SearchIndexInterface $reader); 69 | 70 | /** 71 | * Get document ids likely matching the query 72 | * 73 | * It's an array with document ids as keys (performance considerations) 74 | * 75 | * @return array 76 | */ 77 | abstract public function matchedDocs(); 78 | 79 | /** 80 | * Execute query in context of index reader 81 | * It also initializes necessary internal structures 82 | * 83 | * AbstractQuery specific implementation 84 | * 85 | * @param \ZendSearch\Lucene\SearchIndexInterface $reader 86 | * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter 87 | */ 88 | abstract public function execute(Lucene\SearchIndexInterface $reader, $docsFilter = null); 89 | 90 | /** 91 | * Constructs an appropriate Weight 
implementation for this query. 92 | * 93 | * @param \ZendSearch\Lucene\SearchIndexInterface $reader 94 | * @return \ZendSearch\Lucene\Search\Weight\AbstractWeight 95 | */ 96 | abstract public function createWeight(Lucene\SearchIndexInterface $reader); 97 | 98 | /** 99 | * Constructs and initializes a Weight for a _top-level_query_. 100 | * 101 | * @param \ZendSearch\Lucene\SearchIndexInterface $reader 102 | */ 103 | protected function _initWeight(Lucene\SearchIndexInterface $reader) 104 | { 105 | // Check, that it's a top-level query and query weight is not initialized yet. 106 | if ($this->_weight !== null) { 107 | return $this->_weight; 108 | } 109 | 110 | $this->createWeight($reader); 111 | $sum = $this->_weight->sumOfSquaredWeights(); 112 | $queryNorm = $reader->getSimilarity()->queryNorm($sum); 113 | $this->_weight->normalize($queryNorm); 114 | } 115 | 116 | /** 117 | * Re-write query into primitive queries in the context of specified index 118 | * 119 | * @param \ZendSearch\Lucene\SearchIndexInterface $index 120 | * @return \ZendSearch\Lucene\Search\Query\AbstractQuery 121 | */ 122 | abstract public function rewrite(Lucene\SearchIndexInterface $index); 123 | 124 | /** 125 | * Optimize query in the context of specified index 126 | * 127 | * @param \ZendSearch\Lucene\SearchIndexInterface $index 128 | * @return \ZendSearch\Lucene\Search\Query\AbstractQuery 129 | */ 130 | abstract public function optimize(Lucene\SearchIndexInterface $index); 131 | 132 | /** 133 | * Reset query, so it can be reused within other queries or 134 | * with other indices 135 | */ 136 | public function reset() 137 | { 138 | $this->_weight = null; 139 | } 140 | 141 | 142 | /** 143 | * Print a query 144 | * 145 | * @return string 146 | */ 147 | abstract public function __toString(); 148 | 149 | /** 150 | * Return query terms 151 | * 152 | * @return array 153 | */ 154 | abstract public function getQueryTerms(); 155 | 156 | /** 157 | * AbstractQuery specific matches highlighting 158 | * 159 | 
* @param Highlighter $highlighter Highlighter object (also contains doc for highlighting) 160 | */ 161 | abstract protected function _highlightMatches(Highlighter $highlighter); 162 | 163 | /** 164 | * Highlight matches in $inputHTML 165 | * 166 | * @param string $inputHTML 167 | * @param string $defaultEncoding HTML encoding, is used if it's not specified using Content-type HTTP-EQUIV meta tag. 168 | * @param Highlighter|null $highlighter 169 | * @return string 170 | */ 171 | public function highlightMatches($inputHTML, $defaultEncoding = '', $highlighter = null) 172 | { 173 | if ($highlighter === null) { 174 | $highlighter = new DefaultHighlighter(); 175 | } 176 | 177 | $doc = Document\HTML::loadHTML($inputHTML, false, $defaultEncoding); 178 | $highlighter->setDocument($doc); 179 | 180 | $this->_highlightMatches($highlighter); 181 | 182 | return $doc->getHTML(); 183 | } 184 | 185 | /** 186 | * Highlight matches in $inputHTMLFragment and return it (without HTML header and body tag) 187 | * 188 | * @param string $inputHTMLFragment 189 | * @param string $encoding Input HTML string encoding 190 | * @param Highlighter|null $highlighter 191 | * @return string 192 | */ 193 | public function htmlFragmentHighlightMatches($inputHTMLFragment, $encoding = 'UTF-8', $highlighter = null) 194 | { 195 | if ($highlighter === null) { 196 | $highlighter = new DefaultHighlighter(); 197 | } 198 | 199 | $inputHTML = '' 200 | . iconv($encoding, 'UTF-8//IGNORE', $inputHTMLFragment) . 
''; 201 | 202 | $doc = Document\HTML::loadHTML($inputHTML); 203 | $highlighter->setDocument($doc); 204 | 205 | $this->_highlightMatches($highlighter); 206 | 207 | return $doc->getHTMLBody(); 208 | } 209 | } 210 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/Query/EmptyResult.php: -------------------------------------------------------------------------------- 1 | '; 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/Query/Insignificant.php: -------------------------------------------------------------------------------- 1 | '; 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/Query/Preprocessing/AbstractPreprocessing.php: -------------------------------------------------------------------------------- 1 | _phrase = $phrase; 82 | $this->_phraseEncoding = $phraseEncoding; 83 | $this->_field = $fieldName; 84 | } 85 | 86 | /** 87 | * Set slop 88 | * 89 | * @param integer $slop 90 | */ 91 | public function setSlop($slop) 92 | { 93 | $this->_slop = $slop; 94 | } 95 | 96 | 97 | /** 98 | * Get slop 99 | * 100 | * @return integer 101 | */ 102 | public function getSlop() 103 | { 104 | return $this->_slop; 105 | } 106 | 107 | /** 108 | * Re-write query into primitive queries in the context of specified index 109 | * 110 | * @param \ZendSearch\Lucene\SearchIndexInterface $index 111 | * @return \ZendSearch\Lucene\Search\Query\AbstractQuery 112 | */ 113 | public function rewrite(Lucene\SearchIndexInterface $index) 114 | { 115 | // Allow to use wildcards within phrases 116 | // They are either removed by text analyzer or used as a part of keyword for keyword fields 117 | // 118 | // if (strpos($this->_phrase, '?') !== false || strpos($this->_phrase, '*') !== false) { 119 | // require_once 'Zend/Search/Lucene/Search/QueryParserException.php'; 
120 | // throw new Zend_Search_Lucene_Search_QueryParserException('Wildcards are only allowed in a single terms.'); 121 | // } 122 | 123 | // Split query into subqueries if field name is not specified 124 | if ($this->_field === null) { 125 | $query = new Query\Boolean(); 126 | $query->setBoost($this->getBoost()); 127 | 128 | if (Lucene\Lucene::getDefaultSearchField() === null) { 129 | $searchFields = $index->getFieldNames(true); 130 | } else { 131 | $searchFields = array(Lucene\Lucene::getDefaultSearchField()); 132 | } 133 | 134 | foreach ($searchFields as $fieldName) { 135 | $subquery = new Phrase($this->_phrase, 136 | $this->_phraseEncoding, 137 | $fieldName); 138 | $subquery->setSlop($this->getSlop()); 139 | 140 | $query->addSubquery($subquery->rewrite($index)); 141 | } 142 | 143 | $this->_matches = $query->getQueryTerms(); 144 | return $query; 145 | } 146 | 147 | // Recognize exact term matching (it corresponds to Keyword fields stored in the index) 148 | // encoding is not used since we expect binary matching 149 | $term = new Index\Term($this->_phrase, $this->_field); 150 | if ($index->hasTerm($term)) { 151 | $query = new Query\Term($term); 152 | $query->setBoost($this->getBoost()); 153 | 154 | $this->_matches = $query->getQueryTerms(); 155 | return $query; 156 | } 157 | 158 | 159 | // tokenize phrase using current analyzer and process it as a phrase query 160 | $tokens = Analyzer::getDefault()->tokenize($this->_phrase, $this->_phraseEncoding); 161 | 162 | if (count($tokens) == 0) { 163 | $this->_matches = array(); 164 | return new Query\Insignificant(); 165 | } 166 | 167 | if (count($tokens) == 1) { 168 | $term = new Index\Term($tokens[0]->getTermText(), $this->_field); 169 | $query = new Query\Term($term); 170 | $query->setBoost($this->getBoost()); 171 | 172 | $this->_matches = $query->getQueryTerms(); 173 | return $query; 174 | } 175 | 176 | //It's non-trivial phrase query 177 | $position = -1; 178 | $query = new Query\Phrase(); 179 | foreach ($tokens as 
$token) { 180 | $position += $token->getPositionIncrement(); 181 | $term = new Index\Term($token->getTermText(), $this->_field); 182 | $query->addTerm($term, $position); 183 | $query->setSlop($this->getSlop()); 184 | } 185 | $this->_matches = $query->getQueryTerms(); 186 | return $query; 187 | } 188 | 189 | /** 190 | * Query specific matches highlighting 191 | * 192 | * @param Highlighter $highlighter Highlighter object (also contains doc for highlighting) 193 | */ 194 | protected function _highlightMatches(Highlighter $highlighter) 195 | { 196 | /** Skip fields detection. We don't need it, since we expect all fields presented in the HTML body and don't differentiate them */ 197 | 198 | /** Skip exact term matching recognition, keyword fields highlighting is not supported */ 199 | 200 | /** Skip wildcard queries recognition. Supported wildcards are removed by text analyzer */ 201 | 202 | 203 | // tokenize phrase using current analyzer and process it as a phrase query 204 | $tokens = Analyzer::getDefault()->tokenize($this->_phrase, $this->_phraseEncoding); 205 | 206 | if (count($tokens) == 0) { 207 | // Do nothing 208 | return; 209 | } 210 | 211 | if (count($tokens) == 1) { 212 | $highlighter->highlight($tokens[0]->getTermText()); 213 | return; 214 | } 215 | 216 | //It's non-trivial phrase query 217 | $words = array(); 218 | foreach ($tokens as $token) { 219 | $words[] = $token->getTermText(); 220 | } 221 | $highlighter->highlight($words); 222 | } 223 | 224 | /** 225 | * Print a query 226 | * 227 | * @return string 228 | */ 229 | public function __toString() 230 | { 231 | // It's used only for query visualisation, so we don't care about characters escaping 232 | if ($this->_field !== null) { 233 | $query = $this->_field . ':'; 234 | } else { 235 | $query = ''; 236 | } 237 | 238 | $query .= '"' . $this->_phrase . '"'; 239 | 240 | if ($this->_slop != 0) { 241 | $query .= '~' . $this->_slop; 242 | } 243 | 244 | if ($this->getBoost() != 1) { 245 | $query .= '^' . 
round($this->getBoost(), 4); 246 | } 247 | 248 | return $query; 249 | } 250 | } 251 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/Query/Term.php: -------------------------------------------------------------------------------- 1 | freq, ...) 42 | * 43 | * @var array 44 | */ 45 | private $_termFreqs; 46 | 47 | 48 | /** 49 | * Zend_Search_Lucene_Search_Query_Term constructor 50 | * 51 | * @param \ZendSearch\Lucene\Index\Term $term 52 | * @param boolean $sign 53 | */ 54 | public function __construct(Index\Term $term) 55 | { 56 | $this->_term = $term; 57 | } 58 | 59 | /** 60 | * Re-write query into primitive queries in the context of specified index 61 | * 62 | * @param \ZendSearch\Lucene\SearchIndexInterface $index 63 | * @return \ZendSearch\Lucene\Search\Query\AbstractQuery 64 | */ 65 | public function rewrite(Lucene\SearchIndexInterface $index) 66 | { 67 | if ($this->_term->field != null) { 68 | return $this; 69 | } else { 70 | $query = new MultiTerm(); 71 | $query->setBoost($this->getBoost()); 72 | 73 | foreach ($index->getFieldNames(true) as $fieldName) { 74 | $term = new Index\Term($this->_term->text, $fieldName); 75 | 76 | $query->addTerm($term); 77 | } 78 | 79 | return $query->rewrite($index); 80 | } 81 | } 82 | 83 | /** 84 | * Optimize query in the context of specified index 85 | * 86 | * @param \ZendSearch\Lucene\SearchIndexInterface $index 87 | * @return \ZendSearch\Lucene\Search\Query\AbstractQuery 88 | */ 89 | public function optimize(Lucene\SearchIndexInterface $index) 90 | { 91 | // Check, that index contains specified term 92 | if (!$index->hasTerm($this->_term)) { 93 | return new EmptyResult(); 94 | } 95 | 96 | return $this; 97 | } 98 | 99 | 100 | /** 101 | * Constructs an appropriate Weight implementation for this query. 
102 | * 103 | * @param \ZendSearch\Lucene\SearchIndexInterface $reader 104 | * @return \ZendSearch\Lucene\Search\Weight\Term 105 | */ 106 | public function createWeight(Lucene\SearchIndexInterface $reader) 107 | { 108 | $this->_weight = new Weight\Term($this->_term, $this, $reader); 109 | return $this->_weight; 110 | } 111 | 112 | /** 113 | * Execute query in context of index reader 114 | * It also initializes necessary internal structures 115 | * 116 | * @param \ZendSearch\Lucene\SearchIndexInterface $reader 117 | * @param \ZendSearch\Lucene\Index\DocsFilter|null $docsFilter 118 | */ 119 | public function execute(Lucene\SearchIndexInterface $reader, $docsFilter = null) 120 | { 121 | $this->_docVector = array_flip($reader->termDocs($this->_term, $docsFilter)); 122 | $this->_termFreqs = $reader->termFreqs($this->_term, $docsFilter); 123 | 124 | // Initialize weight if it's not done yet 125 | $this->_initWeight($reader); 126 | } 127 | 128 | /** 129 | * Get document ids likely matching the query 130 | * 131 | * It's an array with document ids as keys (performance considerations) 132 | * 133 | * @return array 134 | */ 135 | public function matchedDocs() 136 | { 137 | return $this->_docVector; 138 | } 139 | 140 | /** 141 | * Score specified document 142 | * 143 | * @param integer $docId 144 | * @param \ZendSearch\Lucene\SearchIndexInterface $reader 145 | * @return float 146 | */ 147 | public function score($docId, Lucene\SearchIndexInterface $reader) 148 | { 149 | if (isset($this->_docVector[$docId])) { 150 | return $reader->getSimilarity()->tf($this->_termFreqs[$docId]) * 151 | $this->_weight->getValue() * 152 | $reader->norm($docId, $this->_term->field) * 153 | $this->getBoost(); 154 | } else { 155 | return 0; 156 | } 157 | } 158 | 159 | /** 160 | * Return query terms 161 | * 162 | * @return array 163 | */ 164 | public function getQueryTerms() 165 | { 166 | return array($this->_term); 167 | } 168 | 169 | /** 170 | * Return query term 171 | * 172 | * @return 
\ZendSearch\Lucene\Index\Term 173 | */ 174 | public function getTerm() 175 | { 176 | return $this->_term; 177 | } 178 | 179 | /** 180 | * Query specific matches highlighting 181 | * 182 | * @param Highlighter $highlighter Highlighter object (also contains doc for highlighting) 183 | */ 184 | protected function _highlightMatches(Highlighter $highlighter) 185 | { 186 | $highlighter->highlight($this->_term->text); 187 | } 188 | 189 | /** 190 | * Print a query 191 | * 192 | * @return string 193 | */ 194 | public function __toString() 195 | { 196 | // It's used only for query visualisation, so we don't care about characters escaping 197 | if ($this->_term->field !== null) { 198 | $query = $this->_term->field . ':'; 199 | } else { 200 | $query = ''; 201 | } 202 | 203 | $query .= $this->_term->text; 204 | 205 | if ($this->getBoost() != 1) { 206 | $query = $query . '^' . round($this->getBoost(), 4); 207 | } 208 | 209 | return $query; 210 | } 211 | } 212 | 213 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/QueryEntry/AbstractQueryEntry.php: -------------------------------------------------------------------------------- 1 | _boost *= $boostFactor; 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/QueryEntry/Phrase.php: -------------------------------------------------------------------------------- 1 | _phrase = $phrase; 59 | $this->_field = $field; 60 | } 61 | 62 | /** 63 | * Process modifier ('~') 64 | * 65 | * @param mixed $parameter 66 | */ 67 | public function processFuzzyProximityModifier($parameter = null) 68 | { 69 | $this->_proximityQuery = true; 70 | 71 | if ($parameter !== null) { 72 | $this->_wordsDistance = $parameter; 73 | } 74 | } 75 | 76 | /** 77 | * Transform entry to a subquery 78 | * 79 | * @param string $encoding 80 | * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException 81 | * @return 
\ZendSearch\Lucene\Search\Query\AbstractQuery 82 | */ 83 | public function getQuery($encoding) 84 | { 85 | $query = new \ZendSearch\Lucene\Search\Query\Preprocessing\Phrase($this->_phrase, 86 | $encoding, 87 | ($this->_field !== null)? 88 | iconv($encoding, 'UTF-8', $this->_field) : 89 | null); 90 | 91 | if ($this->_proximityQuery) { 92 | $query->setSlop($this->_wordsDistance); 93 | } 94 | 95 | $query->setBoost($this->_boost); 96 | 97 | return $query; 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/QueryEntry/Subquery.php: -------------------------------------------------------------------------------- 1 | _query = $query; 35 | } 36 | 37 | /** 38 | * Process modifier ('~') 39 | * 40 | * @param mixed $parameter 41 | * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException 42 | */ 43 | public function processFuzzyProximityModifier($parameter = null) 44 | { 45 | throw new \ZendSearch\Lucene\Search\Exception\QueryParserException( 46 | '\'~\' sign must follow term or phrase' 47 | ); 48 | } 49 | 50 | 51 | /** 52 | * Transform entry to a subquery 53 | * 54 | * @param string $encoding 55 | * @return \ZendSearch\Lucene\Search\Query\AbstractQuery 56 | */ 57 | public function getQuery($encoding) 58 | { 59 | $this->_query->setBoost($this->_boost); 60 | 61 | return $this->_query; 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/QueryEntry/Term.php: -------------------------------------------------------------------------------- 1 | _term = $term; 59 | $this->_field = $field; 60 | } 61 | 62 | /** 63 | * Process modifier ('~') 64 | * 65 | * @param mixed $parameter 66 | */ 67 | public function processFuzzyProximityModifier($parameter = null) 68 | { 69 | $this->_fuzzyQuery = true; 70 | 71 | if ($parameter !== null) { 72 | $this->_similarity = $parameter; 73 | } else { 74 | $this->_similarity = 
\ZendSearch\Lucene\Search\Query\Fuzzy::DEFAULT_MIN_SIMILARITY; 75 | } 76 | } 77 | 78 | /** 79 | * Transform entry to a subquery 80 | * 81 | * @param string $encoding 82 | * @return \ZendSearch\Lucene\Search\Query\AbstractQuery 83 | * @throws \ZendSearch\Lucene\Search\Exception\QueryParserException 84 | */ 85 | public function getQuery($encoding) 86 | { 87 | if ($this->_fuzzyQuery) { 88 | $query = new \ZendSearch\Lucene\Search\Query\Preprocessing\Fuzzy($this->_term, 89 | $encoding, 90 | ($this->_field !== null)? 91 | iconv($encoding, 'UTF-8', $this->_field) : 92 | null, 93 | $this->_similarity 94 | ); 95 | $query->setBoost($this->_boost); 96 | return $query; 97 | } 98 | 99 | 100 | $query = new \ZendSearch\Lucene\Search\Query\Preprocessing\Term($this->_term, 101 | $encoding, 102 | ($this->_field !== null)? 103 | iconv($encoding, 'UTF-8', $this->_field) : 104 | null 105 | ); 106 | $query->setBoost($this->_boost); 107 | return $query; 108 | } 109 | } 110 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/QueryHit.php: -------------------------------------------------------------------------------- 1 | _index = $index; 64 | } 65 | 66 | /** 67 | * Magic method for checking the existence of a field 68 | * 69 | * @param string $offset 70 | * @return boolean TRUE if the field exists else FALSE 71 | */ 72 | public function __isset($offset) 73 | { 74 | return isset($this->getDocument()->$offset); 75 | } 76 | 77 | 78 | /** 79 | * Convenience function for getting fields from the document 80 | * associated with this hit. 
81 | * 82 | * @param string $offset 83 | * @return string 84 | */ 85 | public function __get($offset) 86 | { 87 | return $this->getDocument()->getFieldValue($offset); 88 | } 89 | 90 | 91 | /** 92 | * Return the document object for this hit 93 | * 94 | * @return \ZendSearch\Lucene\Document 95 | */ 96 | public function getDocument() 97 | { 98 | if (!$this->_document instanceof Document) { 99 | $this->_document = $this->_index->getDocument($this->document_id); 100 | } 101 | 102 | return $this->_document; 103 | } 104 | 105 | 106 | /** 107 | * Return the index object for this hit 108 | * 109 | * @return \ZendSearch\Lucene\SearchIndexInterface 110 | */ 111 | public function getIndex() 112 | { 113 | return $this->_index; 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/QueryToken.php: -------------------------------------------------------------------------------- 1 | or field:() pairs 28 | const TT_FIELD_INDICATOR = 3; // ':' 29 | const TT_REQUIRED = 4; // '+' 30 | const TT_PROHIBITED = 5; // '-' 31 | const TT_FUZZY_PROX_MARK = 6; // '~' 32 | const TT_BOOSTING_MARK = 7; // '^' 33 | const TT_RANGE_INCL_START = 8; // '[' 34 | const TT_RANGE_INCL_END = 9; // ']' 35 | const TT_RANGE_EXCL_START = 10; // '{' 36 | const TT_RANGE_EXCL_END = 11; // '}' 37 | const TT_SUBQUERY_START = 12; // '(' 38 | const TT_SUBQUERY_END = 13; // ')' 39 | const TT_AND_LEXEME = 14; // 'AND' or 'and' 40 | const TT_OR_LEXEME = 15; // 'OR' or 'or' 41 | const TT_NOT_LEXEME = 16; // 'NOT' or 'not' 42 | const TT_TO_LEXEME = 17; // 'TO' or 'to' 43 | const TT_NUMBER = 18; // Number, like: 10, 0.8, .64, .... 44 | 45 | 46 | /** 47 | * Returns all possible lexeme types. 
48 | * It's used for syntax analyzer state machine initialization 49 | * 50 | * @return array 51 | */ 52 | public static function getTypes() 53 | { 54 | return array( self::TT_WORD, 55 | self::TT_PHRASE, 56 | self::TT_FIELD, 57 | self::TT_FIELD_INDICATOR, 58 | self::TT_REQUIRED, 59 | self::TT_PROHIBITED, 60 | self::TT_FUZZY_PROX_MARK, 61 | self::TT_BOOSTING_MARK, 62 | self::TT_RANGE_INCL_START, 63 | self::TT_RANGE_INCL_END, 64 | self::TT_RANGE_EXCL_START, 65 | self::TT_RANGE_EXCL_END, 66 | self::TT_SUBQUERY_START, 67 | self::TT_SUBQUERY_END, 68 | self::TT_AND_LEXEME, 69 | self::TT_OR_LEXEME, 70 | self::TT_NOT_LEXEME, 71 | self::TT_TO_LEXEME, 72 | self::TT_NUMBER 73 | ); 74 | } 75 | 76 | 77 | /** 78 | * TokenCategories 79 | */ 80 | const TC_WORD = 0; // Word 81 | const TC_PHRASE = 1; // Phrase (one or several quoted words) 82 | const TC_NUMBER = 2; // Numbers, which are used with syntax elements. Ex. roam~0.8 83 | const TC_SYNTAX_ELEMENT = 3; // + - ( ) [ ] { } ! || && ~ ^ 84 | 85 | 86 | /** 87 | * Token type. 88 | * 89 | * @var integer 90 | */ 91 | public $type; 92 | 93 | /** 94 | * Token text. 95 | * 96 | * @var string 97 | */ 98 | public $text; 99 | 100 | /** 101 | * Token position within query. 102 | * 103 | * @var integer 104 | */ 105 | public $position; 106 | 107 | 108 | /** 109 | * Token constructor needs token category, token text and position as parameters. 
110 | * 111 | * @param integer $tokenCategory One of the self::TC_* token categories 112 | * @param string $tokenText 113 | * @param integer $position Position within query (stored incremented by one) 114 | * @throws \ZendSearch\Lucene\Exception\InvalidArgumentException 115 | */ 116 | public function __construct($tokenCategory, $tokenText, $position) 117 | { 118 | $this->text = $tokenText; 119 | $this->position = $position + 1; // Start from 1 120 | 121 | switch ($tokenCategory) { 122 | case self::TC_WORD: 123 | if ( strtolower($tokenText) == 'and') { 124 | $this->type = self::TT_AND_LEXEME; 125 | } elseif (strtolower($tokenText) == 'or') { 126 | $this->type = self::TT_OR_LEXEME; 127 | } elseif (strtolower($tokenText) == 'not') { 128 | $this->type = self::TT_NOT_LEXEME; 129 | } elseif (strtolower($tokenText) == 'to') { 130 | $this->type = self::TT_TO_LEXEME; 131 | } else { 132 | $this->type = self::TT_WORD; 133 | } 134 | break; 135 | 136 | case self::TC_PHRASE: 137 | $this->type = self::TT_PHRASE; 138 | break; 139 | 140 | case self::TC_NUMBER: 141 | $this->type = self::TT_NUMBER; 142 | break; 143 | 144 | case self::TC_SYNTAX_ELEMENT: 145 | switch ($tokenText) { 146 | case ':': 147 | $this->type = self::TT_FIELD_INDICATOR; 148 | break; 149 | 150 | case '+': 151 | $this->type = self::TT_REQUIRED; 152 | break; 153 | 154 | case '-': 155 | $this->type = self::TT_PROHIBITED; 156 | break; 157 | 158 | case '~': 159 | $this->type = self::TT_FUZZY_PROX_MARK; 160 | break; 161 | 162 | case '^': 163 | $this->type = self::TT_BOOSTING_MARK; 164 | break; 165 | 166 | case '[': 167 | $this->type = self::TT_RANGE_INCL_START; 168 | break; 169 | 170 | case ']': 171 | $this->type = self::TT_RANGE_INCL_END; 172 | break; 173 | 174 | case '{': 175 | $this->type = self::TT_RANGE_EXCL_START; 176 | break; 177 | 178 | case '}': 179 | $this->type = self::TT_RANGE_EXCL_END; 180 | break; 181 | 182 | case '(': 183 | $this->type = self::TT_SUBQUERY_START; 184 | break; 185 | 186 | case ')': 187 | $this->type = self::TT_SUBQUERY_END; 188 | break; 189 | 190 | case '!': 191 | $this->type = 
self::TT_NOT_LEXEME; 192 | break; 193 | 194 | case '&&': 195 | $this->type = self::TT_AND_LEXEME; 196 | break; 197 | 198 | case '||': 199 | $this->type = self::TT_OR_LEXEME; 200 | break; 201 | 202 | default: 203 | throw new Lucene\Exception\InvalidArgumentException( 204 | 'Unrecognized query syntax lexeme: \'' . $tokenText . '\'' 205 | ); 206 | } 207 | break; 208 | 209 | default: 210 | throw new Lucene\Exception\InvalidArgumentException( 211 | 'Unrecognized lexeme type: \'' . $tokenCategory . '\'' 212 | ); 213 | } 214 | } 215 | } 216 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/Similarity/DefaultSimilarity.php: -------------------------------------------------------------------------------- 1 | createWeight(). 17 | * The sumOfSquaredWeights() method is then called on the top-level 18 | * query to compute the query normalization factor Similarity->queryNorm(float). 19 | * This factor is then passed to normalize(float). At this point the weighting 20 | * is complete. 21 | * 22 | * @category Zend 23 | * @package Zend_Search_Lucene 24 | * @subpackage Search 25 | */ 26 | abstract class AbstractWeight 27 | { 28 | /** 29 | * Normalization factor. 30 | * This value is stored only for query explanation purpose and not used in any other place 31 | * 32 | * @var float 33 | */ 34 | protected $_queryNorm; 35 | 36 | /** 37 | * AbstractWeight value 38 | * 39 | * AbstractWeight value may be initialized in sumOfSquaredWeights() or normalize() 40 | * because they both are invoked either in Query::_initWeight (for top-level query) or 41 | * in corresponding methods of parent query's weights 42 | * 43 | * @var float 44 | */ 45 | protected $_value; 46 | 47 | 48 | /** 49 | * The weight for this query. 
50 | * 51 | * @return float 52 | */ 53 | public function getValue() 54 | { 55 | return $this->_value; 56 | } 57 | 58 | /** 59 | * The sum of squared weights of contained query clauses. 60 | * 61 | * @return float 62 | */ 63 | abstract public function sumOfSquaredWeights(); 64 | 65 | /** 66 | * Assigns the query normalization factor to this. 67 | * 68 | * @param $norm 69 | */ 70 | abstract public function normalize($norm); 71 | } 72 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/Weight/Boolean.php: -------------------------------------------------------------------------------- 1 | _query = $query; 57 | $this->_reader = $reader; 58 | $this->_weights = array(); 59 | 60 | $signs = $query->getSigns(); 61 | 62 | foreach ($query->getSubqueries() as $num => $subquery) { 63 | if ($signs === null || $signs[$num] === null || $signs[$num]) { 64 | $this->_weights[$num] = $subquery->createWeight($reader); 65 | } 66 | } 67 | } 68 | 69 | 70 | /** 71 | * The weight for this query 72 | * Standard Weight::$_value is not used for boolean queries 73 | * 74 | * @return float 75 | */ 76 | public function getValue() 77 | { 78 | return $this->_query->getBoost(); 79 | } 80 | 81 | 82 | /** 83 | * The sum of squared weights of contained query clauses. 84 | * 85 | * @return float 86 | */ 87 | public function sumOfSquaredWeights() 88 | { 89 | $sum = 0; 90 | foreach ($this->_weights as $weight) { 91 | // sum sub weights 92 | $sum += $weight->sumOfSquaredWeights(); 93 | } 94 | 95 | // boost each sub-weight 96 | $sum *= $this->_query->getBoost() * $this->_query->getBoost(); 97 | 98 | // check for empty query (like '-something -another') 99 | if ($sum == 0) { 100 | $sum = 1.0; 101 | } 102 | return $sum; 103 | } 104 | 105 | 106 | /** 107 | * Assigns the query normalization factor to this. 
108 | * 109 | * @param float $queryNorm 110 | */ 111 | public function normalize($queryNorm) 112 | { 113 | // incorporate boost 114 | $queryNorm *= $this->_query->getBoost(); 115 | 116 | foreach ($this->_weights as $weight) { 117 | $weight->normalize($queryNorm); 118 | } 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/Weight/EmptyResultWeight.php: -------------------------------------------------------------------------------- 1 | _query = $query; 57 | $this->_reader = $reader; 58 | $this->_weights = array(); 59 | 60 | $signs = $query->getSigns(); 61 | 62 | foreach ($query->getTerms() as $id => $term) { 63 | if ($signs === null || $signs[$id] === null || $signs[$id]) { 64 | $this->_weights[$id] = new Term($term, $query, $reader); 65 | $query->setWeight($id, $this->_weights[$id]); 66 | } 67 | } 68 | } 69 | 70 | 71 | /** 72 | * The weight for this query 73 | * Standard Weight::$_value is not used for boolean queries 74 | * 75 | * @return float 76 | */ 77 | public function getValue() 78 | { 79 | return $this->_query->getBoost(); 80 | } 81 | 82 | 83 | /** 84 | * The sum of squared weights of contained query clauses. 85 | * 86 | * @return float 87 | */ 88 | public function sumOfSquaredWeights() 89 | { 90 | $sum = 0; 91 | foreach ($this->_weights as $weight) { 92 | // sum sub weights 93 | $sum += $weight->sumOfSquaredWeights(); 94 | } 95 | 96 | // boost each sub-weight 97 | $sum *= $this->_query->getBoost() * $this->_query->getBoost(); 98 | 99 | // check for empty query (like '-something -another') 100 | if ($sum == 0) { 101 | $sum = 1.0; 102 | } 103 | return $sum; 104 | } 105 | 106 | 107 | /** 108 | * Assigns the query normalization factor to this. 
109 | * 110 | * @param float $queryNorm 111 | */ 112 | public function normalize($queryNorm) 113 | { 114 | // incorporate boost 115 | $queryNorm *= $this->_query->getBoost(); 116 | 117 | foreach ($this->_weights as $weight) { 118 | $weight->normalize($queryNorm); 119 | } 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/Weight/Phrase.php: -------------------------------------------------------------------------------- 1 | _query = $query; 53 | $this->_reader = $reader; 54 | } 55 | 56 | /** 57 | * The sum of squared weights of contained query clauses. 58 | * 59 | * @return float 60 | */ 61 | public function sumOfSquaredWeights() 62 | { 63 | // compute idf 64 | $this->_idf = $this->_reader->getSimilarity()->idf($this->_query->getTerms(), $this->_reader); 65 | 66 | // compute query weight 67 | $this->_queryWeight = $this->_idf * $this->_query->getBoost(); 68 | 69 | // square it 70 | return $this->_queryWeight * $this->_queryWeight; 71 | } 72 | 73 | 74 | /** 75 | * Assigns the query normalization factor to this. 76 | * 77 | * @param float $queryNorm 78 | */ 79 | public function normalize($queryNorm) 80 | { 81 | $this->_queryNorm = $queryNorm; 82 | 83 | // normalize query weight 84 | $this->_queryWeight *= $queryNorm; 85 | 86 | // idf for documents 87 | $this->_value = $this->_queryWeight * $this->_idf; 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Search/Weight/Term.php: -------------------------------------------------------------------------------- 1 | _term = $term; 73 | $this->_query = $query; 74 | $this->_reader = $reader; 75 | } 76 | 77 | 78 | /** 79 | * The sum of squared weights of contained query clauses. 
80 | * 81 | * @return float 82 | */
    public function sumOfSquaredWeights()
    {
        // compute idf
        $this->_idf = $this->_reader->getSimilarity()->idf($this->_term, $this->_reader);

        // compute query weight
        $this->_queryWeight = $this->_idf * $this->_query->getBoost();

        // square it
        return $this->_queryWeight * $this->_queryWeight;
    }


    /**
     * Assigns the query normalization factor to this.
     *
     * Stores the factor, scales the query weight by it and then derives the
     * weight value as (normalized query weight * idf). Both the query weight
     * and the idf are the ones computed by sumOfSquaredWeights().
     *
     * @param float $queryNorm
     */
    public function normalize($queryNorm)
    {
        $this->_queryNorm = $queryNorm;

        // normalize query weight
        $this->_queryWeight *= $queryNorm;

        // idf for documents
        $this->_value = $this->_queryWeight * $this->_idf;
    }
}

-------------------------------------------------------------------------------- /library/ZendSearch/Lucene/Storage/Directory/DirectoryInterface.php: -------------------------------------------------------------------------------- 1 | _fileHandle = @fopen($filename, $mode); 52 | 53 | if ($this->_fileHandle === false) { 54 | ini_set('track_errors', $trackErrors); 55 | throw new Lucene\Exception\RuntimeException($php_errormsg); 56 | } 57 | 58 | ini_set('track_errors', $trackErrors); 59 | }

    /**
     * Sets the file position indicator and advances the file pointer.
     * The new position, measured in bytes from the beginning of the file,
     * is obtained by adding offset to the position specified by whence,
     * whose values are defined as follows:
     * SEEK_SET - Set position equal to offset bytes.
     * SEEK_CUR - Set position to current location plus offset.
     * SEEK_END - Set position to end-of-file plus offset. (To move to
     * a position before the end-of-file, you need to pass a negative value
     * in offset.)
     * SEEK_CUR is the only supported offset type for compound files
     *
     * Upon success, returns 0; otherwise, returns -1
     *
     * @param integer $offset
     * @param integer $whence
     * @return integer
     */
    public function seek($offset, $whence=SEEK_SET)
    {
        return fseek($this->_fileHandle, $offset, $whence);
    }


    /**
     * Get file position.
     *
     * @return integer
     */
    public function tell()
    {
        return ftell($this->_fileHandle);
    }

    /**
     * Flush output.
     *
     * Returns true on success or false on failure.
     *
     * @return boolean
     */
    public function flush()
    {
        return fflush($this->_fileHandle);
    }

    /**
     * Close File object
     *
     * Does nothing when the handle has already been released. Warnings
     * raised by fclose() are captured by the error handler instead of being
     * emitted, and the handle is reset to null afterwards.
     */
    public function close()
    {
        if ($this->_fileHandle === null) {
            return;
        }

        ErrorHandler::start(E_WARNING);
        fclose($this->_fileHandle);
        ErrorHandler::stop();

        $this->_fileHandle = null;
    }

    /**
     * Get the size of the already opened file
     *
     * The size is measured by seeking to the end of the file and reading the
     * position there; the original position is restored before returning.
     *
     * @return integer
     */
    public function size()
    {
        $savedPosition = ftell($this->_fileHandle);

        fseek($this->_fileHandle, 0, SEEK_END);
        $endPosition = ftell($this->_fileHandle);

        // Go back to where the caller left the pointer
        fseek($this->_fileHandle, $savedPosition);

        return $endPosition;
    }

    /**
     * Read a $length bytes from the file and advance the file pointer.
     *
     * Requests shorter than 1024 bytes are served by a single fread() call
     * and its result is returned as is. Longer requests are read in a loop,
     * because fread() may deliver fewer bytes than asked for; the loop ends
     * once $length bytes were collected, or when fread() fails or returns an
     * empty string.
     *
     * @param integer $length
     * @return string
     */
    protected function _fread($length=1)
    {
        if ($length == 0) {
            return '';
        }

        if ($length < 1024) {
            return fread($this->_fileHandle, $length);
        }

        $data = '';
        while ($length > 0) {
            $nextBlock = fread($this->_fileHandle, $length);

            // Compare strictly: a chunk consisting of the single character
            // "0" is valid data, although it is loosely equal to false.
            if ($nextBlock === false || $nextBlock === '') {
                break;
            }

            $data   .= $nextBlock;
            $length -= strlen($nextBlock);
        }

        return $data;
    }


    /**
     * Writes $length number of bytes (all, if $length===null) of $data
     * through the file handle.
     *
     * The result of fwrite() is not inspected, so a failed or short write
     * is not reported to the caller.
     *
     * @param string $data
     * @param integer $length
     */
    protected function _fwrite($data, $length=null)
    {
        if ($length === null) {
            fwrite($this->_fileHandle, $data);
            return;
        }

        fwrite($this->_fileHandle, $data, $length);
    }

    /**
     * Lock file
     *
     * Lock type may be a LOCK_SH (shared lock) or a LOCK_EX (exclusive lock)
     *
     * When $nonBlockingLock is set, LOCK_NB is added to the lock type
     * before it is passed to flock().
     *
     * @param integer $lockType
     * @param boolean $nonBlockingLock
     * @return boolean
     */
    public function lock($lockType, $nonBlockingLock = false)
    {
        $operation = $nonBlockingLock ? ($lockType | LOCK_NB) : $lockType;

        return flock($this->_fileHandle, $operation);
    }

    /**
     * Unlock file
     *
     * Returns true on success. A file whose handle has already been
     * released is reported as unlocked.
     *
     * @return boolean
     */
    public function unlock()
    {
        if ($this->_fileHandle === null) {
            return true;
        }

        return flock($this->_fileHandle, LOCK_UN);
    }
}

-------------------------------------------------------------------------------- /library/ZendSearch/Lucene/TermStreamsPriorityQueue.php: -------------------------------------------------------------------------------- 1 | _termStreams = $termStreams; 50 | 51 | $this->resetTermsStream(); 52 | }

    /**
     * Reset terms stream.
     *
     * Builds a fresh priority queue, resets every underlying term stream and
     * queues those that have a current term. The stream is then positioned
     * on its first term.
     */
    public function resetTermsStream()
    {
        $this->_termsStreamQueue = new Index\TermsPriorityQueue();

        foreach ($this->_termStreams as $termStream) {
            $termStream->resetTermsStream();

            // Skip "empty" containers
            if ($termStream->currentTerm() !== null) {
                $this->_termsStreamQueue->put($termStream);
            }
        }

        $this->nextTerm();
    }

    /**
     * Skip terms stream up to specified term prefix.
     *
     * Prefix contains fully specified field info and portion of searched term
     *
     * @param \ZendSearch\Lucene\Index\Term $prefix
     */
    public function skipTo(Index\Term $prefix)
    {
        // Drain the queue first: every stream has to be repositioned
        // before it can be queued again.
        $queuedStreams = array();
        while (($termStream = $this->_termsStreamQueue->pop()) !== null) {
            $queuedStreams[] = $termStream;
        }

        foreach ($queuedStreams as $termStream) {
            $termStream->skipTo($prefix);

            // Streams with nothing left after the skip are dropped
            if ($termStream->currentTerm() !== null) {
                $this->_termsStreamQueue->put($termStream);
            }
        }

        $this->nextTerm();
    }

    /**
     * Scans term streams and returns next term
     *
     * A stream popped from the queue is reported only when no other queued
     * stream is positioned on the same term key; otherwise it is advanced
     * silently, so every distinct key is returned once.
     *
     * @return \ZendSearch\Lucene\Index\Term|null
     */
    public function nextTerm()
    {
        while (($termStream = $this->_termsStreamQueue->pop()) !== null) {
            // Capture the term and decide about it before the stream moves on
            $term       = $termStream->currentTerm();
            $nextStream = $this->_termsStreamQueue->top();
            $isNewTerm  = $nextStream === null
                       || $nextStream->currentTerm()->key() != $term->key();

            if ($termStream->nextTerm() !== null) {
                // Put segment back into the priority queue
                $this->_termsStreamQueue->put($termStream);
            }

            if ($isNewTerm) {
                // We got new term
                $this->_lastTerm = $term;

                return $this->_lastTerm;
            }
        }

        // End of stream
        $this->_lastTerm = null;

        return null;
    }

    /**
     * Returns term in current position
     *
     * @return \ZendSearch\Lucene\Index\Term|null
     */
    public function currentTerm()
    {
        return $this->_lastTerm;
    }

    /**
     * Close terms stream
     *
     * Should be used for resources clean up if stream is not read up to the end
     *
     * Closes every stream still queued, then drops the queue and the
     * current term.
     */
    public function closeTermsStream()
    {
        while (($termStream = $this->_termsStreamQueue->pop()) !== null) {
            $termStream->closeTermsStream();
        }

        $this->_termsStreamQueue = null;
        $this->_lastTerm         = null;
    }
}

--------------------------------------------------------------------------------