├── .gitignore ├── tests └── test.php ├── src └── Laurentvw │ └── Scrapher │ ├── Exceptions │ ├── SelectorNotFoundException.php │ ├── ContentNotFoundException.php │ └── MatchIdNotFoundException.php │ ├── Selectors │ ├── RegexSelector.php │ └── Selector.php │ ├── Page.php │ ├── Matcher.php │ └── Scrapher.php ├── composer.json ├── LICENSE └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | /vendor 2 | /.idea 3 | -------------------------------------------------------------------------------- /tests/test.php: -------------------------------------------------------------------------------- 1 | =5.3.0" 9 | }, 10 | "license": "MIT", 11 | "authors": [ 12 | { 13 | "name": "Laurent Van Winckel", 14 | "homepage": "http://www.laurentvw.com" 15 | } 16 | ], 17 | "autoload": { 18 | "psr-0": { 19 | "Laurentvw\\Scrapher": "src/" 20 | } 21 | }, 22 | "minimum-stability": "dev" 23 | } 24 | -------------------------------------------------------------------------------- /src/Laurentvw/Scrapher/Selectors/RegexSelector.php: -------------------------------------------------------------------------------- 1 | getExpression(), $this->getContent(), $matchLines, PREG_SET_ORDER); 12 | 13 | foreach ($matchLines as $i => $matchLine) { 14 | foreach ($this->getConfig() as $config) { 15 | if ($config['id'] == 0) { 16 | $matches[$i][$config['name']] = $this->getSourceKey(); 17 | continue; 18 | } 19 | if (!isset($matchLine[$config['id']])) { 20 | $matches[$i][$config['name']] = null; 21 | } else { 22 | $matches[$i][$config['name']] = $matchLine[$config['id']]; 23 | } 24 | } 25 | } 26 | 27 | return $matches; 28 | } 29 | 30 | } 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) Laurent Van Winckel 4 | 5 | Permission is hereby granted, free of charge, to any person 
obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so, 10 | subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 17 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 18 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 19 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 20 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
21 | -------------------------------------------------------------------------------- /src/Laurentvw/Scrapher/Selectors/Selector.php: -------------------------------------------------------------------------------- 1 | expression = $expression; 12 | $this->config = $config; 13 | } 14 | 15 | public function setContent($content) 16 | { 17 | $this->content = $content; 18 | } 19 | 20 | public function getContent() 21 | { 22 | return $this->content; 23 | } 24 | 25 | public function setSourceKey($key) 26 | { 27 | $this->sourceKey = $key; 28 | } 29 | 30 | public function getSourceKey() 31 | { 32 | return $this->sourceKey; 33 | } 34 | 35 | public function setConfig($config) 36 | { 37 | $this->config = $config; 38 | } 39 | 40 | public function getConfig() 41 | { 42 | return $this->config; 43 | } 44 | 45 | public function setExpression($expression) 46 | { 47 | $this->expression = $expression; 48 | } 49 | 50 | public function getExpression() 51 | { 52 | return $this->expression; 53 | } 54 | 55 | abstract public function getMatches(); 56 | } 57 | -------------------------------------------------------------------------------- /src/Laurentvw/Scrapher/Page.php: -------------------------------------------------------------------------------- 1 | url = $url; 12 | } 13 | 14 | public function getHTML() 15 | { 16 | $ch = curl_init(); 17 | 18 | $options = array( 19 | CURLOPT_URL => $this->url, // the URL 20 | CURLOPT_RETURNTRANSFER => true, // Don’t output any response directly to the browser 21 | CURLOPT_HEADER => false, // Don’t return the header 22 | CURLOPT_FOLLOWLOCATION => true, 23 | CURLOPT_ENCODING => '', 24 | CURLOPT_USERAGENT => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11', // Set a valid user agent 25 | CURLOPT_AUTOREFERER => true, 26 | CURLOPT_CONNECTTIMEOUT => 15, 27 | CURLOPT_TIMEOUT => 15, 28 | CURLOPT_MAXREDIRS => 3, 29 | ); 30 | 31 | curl_setopt_array($ch, $options); 32 | 33 | $data = curl_exec($ch); 34 | 35 | 
curl_close($ch); 36 | 37 | return $data; 38 | } 39 | 40 | public function getURL() 41 | { 42 | return $this->url; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/Laurentvw/Scrapher/Matcher.php: -------------------------------------------------------------------------------- 1 | setSelector($selector); 40 | $this->setFilter($filter); 41 | } 42 | 43 | /** 44 | * Set the selector. 45 | * 46 | * @param Selector $selector 47 | * 48 | * @return Matcher 49 | */ 50 | public function setSelector(Selector $selector) 51 | { 52 | $this->selector = $selector; 53 | 54 | return $this; 55 | } 56 | 57 | /** 58 | * Get the selector. 59 | * 60 | * @return Selector 61 | */ 62 | public function getSelector() 63 | { 64 | return $this->selector; 65 | } 66 | 67 | /** 68 | * Set the filter to be applied to the matches. 69 | * 70 | * @param \Closure $filter 71 | * 72 | * @return Matcher 73 | */ 74 | public function setFilter($filter = null) 75 | { 76 | $this->filter = is_callable($filter) ? $filter : null; 77 | 78 | return $this; 79 | } 80 | 81 | /** 82 | * Get detailed logs of the scraping. 83 | * 84 | * @return array 85 | */ 86 | public function getLogs() 87 | { 88 | return $this->logs; 89 | } 90 | 91 | /** 92 | * Add a log message. 
93 | * 94 | * @param string $msg message appended to the log list 95 | */ 96 | public function addLog($msg) 97 | { 98 | $this->logs[] = $msg; 99 | } 100 | 101 | /** 102 | * Run the selector on the given content and keep the matches accepted by fetch(). 103 | * @param $content content handed to the selector via setContent() 104 | * @param $sourceKey key handed to the selector via setSourceKey() and passed to the apply/validate callbacks 105 | * @return array accepted matches; empty (and a log entry is added) when the selector returns nothing 106 | */ 107 | public function getMatches($content, $sourceKey) 108 | { 109 | $filteredResults = array(); 110 | 111 | $this->getSelector()->setContent($content); 112 | $this->getSelector()->setSourceKey($sourceKey); 113 | 114 | $matches = $this->getSelector()->getMatches(); 115 | 116 | if ($matches) { 117 | foreach ($matches as $matchLine) { 118 | $filteredResult = $this->fetch($matchLine); 119 | 120 | if ($filteredResult) { 121 | $filteredResults[] = $filteredResult; 122 | } 123 | } 124 | } else { 125 | $this->addLog('The HTML or Selector expression is broken'); 126 | } 127 | 128 | return $filteredResults; 129 | } 130 | 131 | /** 132 | * Fetch the values from a match: apply the optional 'apply' callback, then the optional 'validate' callback, then the filter. 133 | * 134 | * @param array $matchLine raw values of one match, keyed by the configured 'name' 135 | * 136 | * @return array|false the named values, or false when validation or the filter rejects the match 137 | */ 138 | private function fetch(array $matchLine) 139 | { 140 | $result = array(); 141 | 142 | foreach ($this->getSelector()->getConfig() as $match) { 143 | // Get the match value, optionally apply a function to it 144 | if (isset($match['apply'])) { 145 | $result[$match['name']] = $match['apply']($matchLine[$match['name']], $this->getSelector()->getSourceKey()); 146 | } else { 147 | $result[$match['name']] = $matchLine[$match['name']]; 148 | } 149 | 150 | // Validate this match (the first failing field rejects the whole match) 151 | if (isset($match['validate'])) { 152 | if (!$match['validate']($result[$match['name']], $this->getSelector()->getSourceKey())) { 153 | $this->addLog('Skipping match because validation failed for '.$match['name'].': '.$result[$match['name']]); 154 | 155 | return false; 156 | } 157 | } 158 | } 159 | 160 | // Filter the data (a falsy return value from the filter drops the match) 161 | if ($this->filter && !call_user_func($this->filter, $result)) { 162 | $this->addLog('Filtering out match: '.var_export($result, true)); 163 | 164 | return false; 165 | } 166 | 167 | return $result; 168 | } 169 | }
170 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Scrapher 2 | =========== 3 | 4 | Scrapher is a PHP library to easily scrape data from web pages. 5 | 6 | 7 | Getting Started 8 | --------------- 9 | 10 | ### Installation 11 | 12 | Add the package to your `composer.json` and run `composer update`. 13 | 14 | { 15 | "require": { 16 | "laurentvw/scrapher": "2.*" 17 | } 18 | } 19 | 20 | *For the people still using v1.0 ("LavaCrawler"), you can find the documentation is here: * 21 | 22 | 23 | ### Basic Usage 24 | 25 | In order to start scraping, you need to set the URL(s) or HTML to scrape, and a type of selector to use (for example a regex selector, together with the data you wish to match). 26 | 27 | ```php 28 | use \Laurentvw\Scrapher\Scrapher; 29 | use \Laurentvw\Scrapher\Selectors\RegexSelector; 30 | 31 | $url = 'https://www.google.com/'; 32 | $scrapher = new Scrapher($url); 33 | 34 | // Match all links on a page 35 | $regex = '/(.*?)<\/a>/ms'; 36 | 37 | $matchConfig = array( 38 | array( 39 | 'name' => 'url', 40 | 'id' => 1, // the first match (.*?) from the regex 41 | ), 42 | array( 43 | 'name' => 'title', 44 | 'id' => 2, // the second match (.*?) from the regex 45 | ), 46 | ); 47 | 48 | $matches = $scrapher->with(new RegexSelector($regex, $matchConfig)); 49 | 50 | $results = $matches->get(); 51 | ``` 52 | 53 | This returns a list of arrays based on the match configuration that was set. 54 | 55 | array(29) { 56 | [0] => 57 | array(2) { 58 | 'url' => 59 | string(34) "https://www.google.com/webhp?tab=ww" 60 | 'title' => 61 | string(6) "Search" 62 | } 63 | ... 64 | } 65 | 66 | Documentation 67 | ------------- 68 | 69 | ### Instantiating 70 | 71 | When creating an instance of Scrapher, you may optionally pass one or more URLs. 72 | 73 | Passing multiple URLs can be useful when you want to scrape the same data on different pages. 
For example, when content is separated by pagination. 74 | 75 | ```php 76 | $scrapher = new Scrapher($url); 77 | $scrapher = new Scrapher(array($url, $url2)); 78 | ``` 79 | 80 | If you prefer to fetch the page yourself using a dedicated client/library, you may also simply pass the actual content of a page. This can also be handy if you want to scrape other content besides just web pages (e.g. local files). 81 | 82 | ```php 83 | $scrapher = new Scrapher($content); 84 | $scrapher = new Scrapher(array($content, $content2)); 85 | ``` 86 | 87 | In some cases, you may want to add (read: append) URLs or contents on the fly. 88 | 89 | ```php 90 | $scrapher->addUrl($url); 91 | $scrapher->addUrls(array($url, $url2)); 92 | $scrapher->addContent($content); 93 | $scrapher->addContents(array($content, $content2)); 94 | ``` 95 | 96 | ### Matching data using a Selector 97 | 98 | Before retrieving or sorting the matched data, you need to choose a selector to match the data you want. 99 | 100 | At the moment, Scrapher offers 1 selector out of the box, **RegexSelector**, which lets you select data using regular expressions. 101 | 102 | A Selector takes an expression and a match configuration as its arguments. 103 | 104 | For example, to match all links and their link name, you could do: 105 | 106 | ```php 107 | $regExpression = '/(.*?)<\/a>/ms'; 108 | 109 | $matchConfig = array( 110 | array( 111 | // The "name" key lets you name the data you're looking for, 112 | // and will be used when retrieving the matched data 113 | 'name' => 'url', 114 | // The "id" key is an identifier used during the regular expression search. 115 | // The id 1 corresponds to the first match in the regular expression, matching the URL.
116 | 'id' => 1, 117 | ), 118 | array( 119 | 'name' => 'title', 120 | 'id' => 2, 121 | ), 122 | ); 123 | 124 | $matches = $scrapher->with(new RegexSelector($regExpression, $matchConfig)); 125 | ``` 126 | 127 | Note that the kind of value passed to the "id" key may vary depending on what selector you're using, and can virtually be anything. You can think of the "id" key as the glue between the given expression and its selector. 128 | 129 | _**RegexSelector** uses `preg_match_all` (with the `PREG_SET_ORDER` flag) under the hood._ 130 | 131 | For your convenience, when using Regex, a match with `'id' => 0` will return the URL of the crawled page. 132 | 133 | ### Retrieving & Sorting 134 | 135 | Once you've specified a selector using the **with** method, you can start retrieving and/or sorting the data. 136 | 137 | **Retrieving** 138 | ```php 139 | // Return all matches 140 | $results = $matches->get(); 141 | 142 | // Return all matches with a subset of the data (either use multiple arguments or an array for more than one column) 143 | $results = $matches->get('title'); 144 | 145 | // Return the first match 146 | $result = $matches->first(); 147 | 148 | // Return the last match 149 | $result = $matches->last(); 150 | 151 | // Count the number of matches 152 | $numberOfMatches = $matches->count(); 153 | ``` 154 | 155 | **Offset & limit** 156 | ```php 157 | // Take the first N matches 158 | $results = $matches->take(5)->get(); 159 | 160 | // Skip the first N matches 161 | $results = $matches->skip(1)->get(); 162 | 163 | // Take 5 matches starting from the second one. 164 | $results = $matches->skip(1)->take(5)->get(); 165 | ``` 166 | 167 | **Sorting** 168 | ```php 169 | // Order by title 170 | $results = $matches->orderBy('title')->get(); 171 | 172 | // Order by title, then by URL 173 | $results = $matches->orderBy('title')->orderBy('url', 'desc')->get(); 174 | 175 | // Custom sorting: For values that do not lend themselves well to sorting, e.g. dates*.
176 | $results = $matches->orderBy('date', 'desc', 'date_create')->get(); 177 | 178 | // Simply reverse the order of the results 179 | $results = $matches->reverse()->get(); 180 | ``` 181 | * See [date_create](http://php.net/manual/en/function.date-create.php) 182 | 183 | 184 | ### Filtering 185 | 186 | You can filter the matched data to refine your result set. Return `true` to keep the match, `false` to filter it out. 187 | ```php 188 | $matches->filter(function($match) { 189 | // Return only matches that contain 'Google' in the link title. 190 | return stristr($match['title'], 'Google') ? true : false; 191 | }); 192 | ``` 193 | 194 | ### Mutating 195 | 196 | In order to handle inconsistencies or formatting issues, you can alter the matched values to a more desirable value. Altering happens before filtering and sorting the result set. You can do so by using the `apply` index in the match configuration array with a closure that takes 2 arguments: the matched value and the URL of the crawled page. 197 | 198 | ```php 199 | $matchConfig = array( 200 | array( 201 | 'name' => 'url', 202 | 'id' => 1, 203 | // Add domain to relative URLs 204 | 'apply' => function($match, $sourceUrl) 205 | { 206 | if (!stristr($match, 'http')) { 207 | return $sourceUrl . trim($match, '/'); 208 | } 209 | return $match; 210 | }, 211 | ), 212 | array( 213 | 'name' => 'title', 214 | 'id' => 2, 215 | // Remove all html tags inside the link title 216 | 'apply' => function($match) { 217 | return strip_tags($match); 218 | }, 219 | ), 220 | ... 221 | ); 222 | ``` 223 | 224 | ### Validation 225 | 226 | You may validate the matched data to ensure that the result set always contains the desired result. Validation happens after optionally mutating the data set with `apply`. To add the validation rules that should be applied to the data, use the `validate` index in the match configuration array with a closure that takes 2 arguments: the matched value and the URL of the crawled page.
The closure should return `true` if the validation succeeded, and `false` if the validation failed. Matches that fail the validation will be removed from the result set. 227 | 228 | ```php 229 | $matchConfig = array( 230 | array( 231 | 'name' => 'url', 232 | 'id' => 1, 233 | // Make sure it is a valid url 234 | 'validate' => function($match) { 235 | return filter_var($match, FILTER_VALIDATE_URL); 236 | }, 237 | ), 238 | array( 239 | 'name' => 'title', 240 | 'id' => 2, 241 | // We only want titles that are between 1 and 50 characters long. 242 | 'validate' => function($match) { 243 | return strlen($match) >= 1 && strlen($match) <= 50; 244 | }, 245 | ), 246 | ... 247 | ); 248 | ``` 249 | 250 | * To make validation easier, we recommend using in your project. 251 | 252 | ### Logging 253 | 254 | If you wish to see the matches that were filtered out, or removed due to failed validation, you can use the `getLogs` method, which returns an array of message logs. 255 | 256 | ```php 257 | $logs = $matches->getLogs(); 258 | ``` 259 | 260 | ### Did you know? 261 | 262 | **All methods are chainable** 263 | 264 | ```php 265 | $scrapher = new Scrapher(); 266 | $scrapher->addUrl($url)->with($regexSelector)->filter(...)->orderBy('title')->skip(1)->take(5)->get(); 267 | ``` 268 | 269 | Only the methods `get`, `first`, `last`, `count` and `getLogs` will cause the chaining to end, as they all return a certain result. 270 | 271 | **You can scrape different data from one page** 272 | 273 | Suppose you're scraping a page, and you want to get all H2 titles, as well as all links on the page. You can do so without having to re-instantiate Scrapher. 
274 | 275 | ```php 276 | $scrapher = new Scrapher($url); 277 | $h2Titles = $scrapher->with($h2RegexSelector)->get(); 278 | $links = $scrapher->with($linksRegexSelector)->get(); 279 | ``` 280 | 281 | About 282 | ----- 283 | ### Author 284 | 285 | Laurent Van Winckel - - 286 | 287 | ### License 288 | 289 | Scrapher is licensed under the MIT License - see the `LICENSE` file for details 290 | 291 | ### Contributing 292 | 293 | Contributions to Laurentvw\Scrapher are always welcome. You make our lives 294 | easier by sending us your contributions through 295 | [GitHub pull requests](http://help.github.com/pull-requests). 296 | 297 | You may also [create an issue](https://github.com/Laurentvw/scrapher/issues) to report bugs or request new features. 298 | -------------------------------------------------------------------------------- /src/Laurentvw/Scrapher/Scrapher.php: -------------------------------------------------------------------------------- 1 | addUrl($content); 70 | } else { 71 | $this->addContent($content); 72 | } 73 | } 74 | } 75 | } 76 | 77 | /** 78 | * Add URL to scrape. 79 | * 80 | * @param string $url 81 | * 82 | * @return Scrapher 83 | */ 84 | public function addUrl($url) 85 | { 86 | $page = new Page($url); 87 | $this->addContent($page->getHTML(), $url); 88 | 89 | return $this; 90 | } 91 | 92 | /** 93 | * Add URLs to scrape. 94 | * 95 | * @param array $urls 96 | * 97 | * @return Scrapher 98 | */ 99 | public function addUrls(array $urls) 100 | { 101 | foreach ($urls as $url) { 102 | $this->addUrl($url); 103 | } 104 | 105 | return $this; 106 | } 107 | 108 | /** 109 | * Add content to scrape. 110 | * 111 | * @param string $content 112 | * @param null $key 113 | * @return Scrapher 114 | */ 115 | public function addContent($content, $key = null) 116 | { 117 | if (!is_null($key)) { 118 | $this->contents[$key] = $content; 119 | } else { 120 | $this->contents[] = $content; 121 | } 122 | 123 | return $this; 124 | } 125 | 126 | /** 127 | * Add contents to scrape. 
128 | * 129 | * @param array $contents each element is added through addContent() without a key 130 | * 131 | * @return Scrapher 132 | */ 133 | public function addContents(array $contents) 134 | { 135 | foreach ($contents as $content) { 136 | $this->addContent($content); 137 | } 138 | 139 | return $this; 140 | } 141 | 142 | /** 143 | * Set the type of selector to use (replaces the current Matcher with a new one). 144 | * 145 | * @param Selector $selector 146 | * 147 | * @return Scrapher 148 | */ 149 | public function with(Selector $selector) 150 | { 151 | $this->matcher = new Matcher($selector); 152 | 153 | return $this; 154 | } 155 | 156 | /** 157 | * Filter the resulting matches. 158 | * 159 | * @param callable $filter forwarded to the Matcher's setFilter(); getMatcher() throws when no selector was set 160 | * 161 | * @return Scrapher 162 | */ 163 | public function filter($filter) 164 | { 165 | $this->getMatcher()->setFilter($filter); 166 | 167 | return $this; 168 | } 169 | 170 | /** 171 | * Reverse the order of the resulting matches. 172 | * 173 | * @return Scrapher 174 | */ 175 | public function reverse() 176 | { 177 | $this->reverse = true; 178 | 179 | return $this; 180 | } 181 | 182 | /** 183 | * Take n-number of matches. 184 | * 185 | * @param int $n 186 | * 187 | * @return Scrapher 188 | */ 189 | public function take($n) 190 | { 191 | $this->take = $n; 192 | 193 | return $this; 194 | } 195 | 196 | /** 197 | * Skip n-number of matches. 198 | * 199 | * @param int $n 200 | * 201 | * @return Scrapher 202 | */ 203 | public function skip($n) 204 | { 205 | $this->skip = $n; 206 | 207 | return $this; 208 | } 209 | 210 | /** 211 | * Order the matches. Each call appends one more sort criterion. 212 | * 213 | * @param string $name name of the match value to sort on 214 | * @param string $order 'desc' (case-insensitive) sorts descending; any other value sorts ascending 215 | * @param callable|null $projection optional callable stored with the criterion (e.g. 'date_create') 216 | * 217 | * @return Scrapher 218 | */ 219 | public function orderBy($name, $order = 'asc', $projection = null) 220 | { 221 | $order = strtolower($order) == 'desc' ? SORT_DESC : SORT_ASC; 222 | $this->orderBy[] = array($name, $order, $projection); 223 | 224 | return $this; 225 | } 226 | 227 | /** 228 | * Get all the matches.
229 | * 230 | * @param array|string $columns column names, as one array or as separate string arguments; '*' keeps every column 231 | * 232 | * @return array 233 | */ 234 | public function get($columns = array('*')) 235 | { 236 | $columns = is_array($columns) ? $columns : func_get_args(); 237 | 238 | return $this->scrape($columns); 239 | } 240 | 241 | /** 242 | * Get the first match. 243 | * 244 | * @param array|string $columns column names, as one array or as separate string arguments; '*' keeps every column 245 | * 246 | * @return array|false the first match, or false when there are no matches (current() on an empty array) 247 | */ 248 | public function first($columns = array('*')) 249 | { 250 | $columns = is_array($columns) ? $columns : func_get_args(); 251 | 252 | $results = $this->scrape($columns); 253 | 254 | return current($results); 255 | } 256 | 257 | /** 258 | * Get the last match. 259 | * 260 | * @param array|string $columns column names, as one array or as separate string arguments; '*' keeps every column 261 | * 262 | * @return array|false the last match, or false when there are no matches (end() on an empty array) 263 | */ 264 | public function last($columns = array('*')) 265 | { 266 | $columns = is_array($columns) ? $columns : func_get_args(); 267 | 268 | $results = $this->scrape($columns); 269 | 270 | return end($results); 271 | } 272 | 273 | /** 274 | * Count the number of matches. 275 | * 276 | * @return int 277 | */ 278 | public function count() 279 | { 280 | $results = $this->scrape(array('*')); 281 | 282 | return count($results); 283 | } 284 | 285 | /** 286 | * Get detailed logs of the scraping. 287 | * 288 | * @return array 289 | */ 290 | public function getLogs() 291 | { 292 | return $this->getMatcher()->getLogs(); 293 | } 294 | 295 | /** 296 | * The matcher. 297 | * 298 | * @throws SelectorNotFoundException when no matcher has been set 299 | * 300 | * @return Matcher 301 | */ 302 | protected function getMatcher() 303 | { 304 | if (!$this->matcher) { 305 | throw new SelectorNotFoundException(); 306 | } 307 | 308 | return $this->matcher; 309 | } 310 | 311 | /** 312 | * The actual scraping.
313 | * 314 | * @param array $columns column names to keep in every match; '*' keeps all of them 315 | * @throws ContentNotFoundException when no content has been added 316 | * 317 | * @return array 318 | */ 319 | protected function scrape($columns) 320 | { 321 | if (!$this->contents) { 322 | throw new ContentNotFoundException(); 323 | } 324 | 325 | $results = array(); 326 | 327 | foreach ($this->contents as $id => $content) { 328 | $results = array_merge($results, $this->getMatcher()->getMatches($content, $id)); 329 | } 330 | 331 | if ($results) { 332 | // Order by (array callable: the 'self::method' string form is deprecated as of PHP 8.2) 333 | if ($this->orderBy) { 334 | usort($results, call_user_func_array(array(__CLASS__, 'makeComparer'), $this->orderBy)); 335 | } 336 | // Skip & Take 337 | if ($this->skip > 0 || $this->take) { 338 | $results = array_slice($results, $this->skip, $this->take); 339 | } 340 | // Select columns 341 | if (!in_array('*', $columns)) { 342 | $results = array_map(function ($v) use($columns) { 343 | return array_intersect_key($v, array_flip($columns)); 344 | }, $results); 345 | } 346 | } 347 | 348 | if ($this->reverse) { 349 | krsort($results); 350 | } 351 | 352 | $this->clear(); 353 | 354 | return $results; 355 | } 356 | 357 | /** 358 | * Clear the scraping configuration. 359 | * Resets take, orderBy, skip and reverse so that none of them leaks into the next query. 360 | * This allows us to scrape the same contents again, but with a different selector 361 | */ 362 | protected function clear() 363 | { 364 | $this->take = null; 365 | $this->orderBy = null; 366 | $this->skip = 0; $this->reverse = false; 367 | } 368 | 369 | /** 370 | * This is a callable component of usort.
371 | * 372 | * For simple ascending sorts (multiple column included): 373 | * usort($rows, make_comparer('column_name'[, 'other_column_name'])); 374 | * 375 | * For setting a descending sort 376 | * usort($rows, make_comparer(array('column_name', SORT_DESC))); 377 | * 378 | * To include a function result on a column 379 | * usort($rows, make_comparer(array('column_name', SORT_ASC, 'function_name'))); 380 | * 381 | * From stackoverflow.com : user - jon 382 | * http://stackoverflow.com/questions/96759/how-do-i-sort-a-multidimensional-array-in-php 383 | * http://stackoverflow.com/users/50079/jon 384 | * 385 | * @return \Closure comparison callback for usort(); it returns -1, 0 or 1 for a pair of rows 386 | */ 387 | private static function makeComparer() 388 | { 389 | // Normalize criteria up front so that the comparer finds everything tidy 390 | $criteria = func_get_args(); 391 | foreach ($criteria as $index => $criterion) { 392 | $criteria[$index] = is_array($criterion) 393 | ? array_pad($criterion, 3, null) 394 | : array($criterion, SORT_ASC, null); 395 | } 396 | 397 | return function ($first, $second) use (&$criteria) { 398 | foreach ($criteria as $criterion) { 399 | // How will we compare this round? 400 | list($column, $sortOrder, $projection) = $criterion; 401 | $sortOrder = $sortOrder === SORT_DESC ? -1 : 1; 402 | // If a projection was defined project the values now 403 | if ($projection) { 404 | $lhs = call_user_func($projection, $first[$column]); 405 | $rhs = call_user_func($projection, $second[$column]); 406 | } else { 407 | $lhs = $first[$column]; 408 | $rhs = $second[$column]; 409 | } 410 | // Do the actual comparison; do not return if equal (fall through to the next criterion) 411 | if ($lhs < $rhs) { 412 | return -1 * $sortOrder; 413 | } elseif ($lhs > $rhs) { 414 | return 1 * $sortOrder; 415 | } 416 | } 417 | 418 | return 0; // tiebreakers exhausted, so $first == $second 419 | }; 420 | } 421 | } 422 | --------------------------------------------------------------------------------