├── Crawler.php
├── README.md
└── example.php


/Crawler.php:
--------------------------------------------------------------------------------
        // NOTE(review): the opening of this file (PHP open tag, class declaration,
        // constants/properties and the start of the constructor) is missing from
        // this extract. The fragment below is the tail of the constructor, kept as found.
        set_user_agent(self::USER_AGENT);
        $this->dom = new simple_html_dom();
    }

    /**
     * Set the user agent sent with outgoing HTTP requests.
     *
     * Writes PHP's "user_agent" ini setting (this class does not use cURL).
     *
     * @param string user agent string
     * @return void
     */
    public function set_user_agent($user_agent){
        ini_set('user_agent', $user_agent);
    }

    /**
     * Check that a URL answers and does not report a 404.
     *
     * @param string URL to check
     * @return boolean True if headers were received and none reports a 404. False otherwise.
     */
    private function check_url($url){
        // Warnings are suppressed on purpose: an unreachable host simply means "not valid".
        $headers = @get_headers($url, 0);
        if(!is_array($headers) || !isset($headers[0])){
            return false;
        }

        // First status line, e.g. "HTTP/1.1 404 Not Found"
        if(strpos($headers[0], '404') !== false){
            return false;
        }

        // Any later header line reporting a 404 also invalidates the URL
        foreach($headers as $header){
            if(strpos($header, '404 Not Found') !== false){
                return false;
            }
        }

        return true;
    }

    /**
     * Reduce a host name to its last two labels (ex. "www.example.com" => "example.com").
     *
     * @param string Host name
     * @return string Last two dot-separated labels of the host
     */
    private function base_domain($host){
        return implode(".", array_slice(explode(".", $host), -2));
    }

    /**
     * Set URL to scrape/crawl.
     *
     * Loads the page into the DOM parser, stores the parsed URL parts and
     * reads the site's robots.txt.
     *
     * @param string URL to crawl ("http://" is prepended when no http/https scheme is given)
     * @return boolean True if URL is valid. False if URL is not valid
     */
    public function set_url($url){
        // Only prepend a scheme when the URL does not already START with one;
        // a plain strpos() would be fooled by "http" appearing later in the string.
        if(!preg_match('#^https?://#i', $url)){
            $url = 'http://' . $url;
        }

        // Store the normalised URL so get_links() can compare it with absolute hrefs
        $this->url = $url;

        // Links cached for a previously loaded page must not be served for this one
        $this->links = array();

        if($this->check_url($url) === false){
            return false;
        }

        if($this->dom->load_file($url) === false){
            return false;
        }

        $url_data = parse_url($url);
        if($url_data === false || empty($url_data['host'])){
            return false;
        }

        if(empty($url_data['scheme'])){
            $url_data['scheme'] = 'http';
        }
        $url_data['domain'] = $this->base_domain($url_data['host']);
        $this->url_data = $url_data;

        // No need to fetch robots.txt when robots.txt itself is the page being loaded
        if(empty($this->url_data['path']) || $this->url_data['path'] != '/robots.txt'){
            $this->get_robots();
        }

        return true;
    }

    /**
     * Retrieve and parse the loaded URL's robots.txt
     *
     * Collects the Disallow rules of every group addressed to "*" or to this
     * crawler's user agent into $this->robots_rules (regex-quoted path prefixes).
     *
     * @return array/boolean Returns array of rules if robots.txt is valid. Otherwise returns True if no rules exist or False if robots.txt is not valid.
     */
    private function get_robots(){
        // Start clean so rules of a previously loaded site never leak into this one
        $this->robots_rules = array();

        if(empty($this->url_data['host'])) return false;

        // robots.txt belongs to the exact scheme/host/port that serves the page
        $scheme = (!empty($this->url_data['scheme'])) ? $this->url_data['scheme'] : 'http';
        $port = (!empty($this->url_data['port'])) ? ':' . $this->url_data['port'] : '';
        $robots_url = $scheme . '://' . $this->url_data['host'] . $port . '/robots.txt';

        if(!$this->check_url($robots_url)){
            return false;
        }

        $robots_text = @file($robots_url);

        if(empty($robots_text)){
            return true;
        }

        // Quote with the delimiter so a "/" in the user agent cannot break the pattern
        $user_agents = implode("|", array(preg_quote('*', '/'), preg_quote(self::USER_AGENT, '/')));

        $group_applies = false;   // does the current group address this crawler?
        $reading_agents = false;  // was the previous line a User-agent line?

        foreach($robots_text as $line){
            // Strip comments and surrounding whitespace
            $line = trim(preg_replace('/#.*$/', '', $line));
            if($line === '') continue;

            if(preg_match('/^User-agent:\s*(.*)$/i', $line, $match)){
                $agent_matches = (preg_match("/($user_agents)/i", $match[1]) === 1);
                // Consecutive User-agent lines share one group: any match is enough
                $group_applies = $reading_agents ? ($group_applies || $agent_matches) : $agent_matches;
                $reading_agents = true;
                continue;
            }
            $reading_agents = false;

            if($group_applies && preg_match('/^Disallow:\s*(.*)$/i', $line, $regs)){
                $rule = trim($regs[1]);
                // an empty rule disallows nothing - skip it
                if($rule === '') continue;
                // add rules that apply to array for testing
                $this->robots_rules[] = preg_quote($rule, '/');
            }
        }

        return $this->robots_rules;
    }

    /**
     * Checks robots.txt to see if a URL can be accessed.
     *
     * @param string URL (or path) to check
     * @return boolean True if URL can be accessed. False if it can't.
     */
    private function check_robots($url){
        if(empty($this->robots_rules)) return true;

        $parsed_url = parse_url($url);
        // A URL without a path refers to the site root
        $path = (is_array($parsed_url) && isset($parsed_url['path'])) ? $parsed_url['path'] : '/';

        foreach($this->robots_rules as $robots_rule){
            // Rules were regex-quoted in get_robots(); they match as path prefixes
            if(preg_match("/^$robots_rule/", $path)) return false;
        }

        return true;
    }

    /**
     * Decodes HTML entities, replaces bytes 0x80-0xFF and runs of whitespace
     * with a single space, then strips HTML tags.
     *
     * @param string Text to be cleaned
     * @return string Cleaned text
     */
    private function clean_text($text){
        $preg_patterns = array(
            "/[\x80-\xFF]/", //remove special characters
            "/ /", // NOTE(review): space-for-space replacement; possibly an entity pattern mangled in this extract - confirm against the original file
            "/\s+/", //remove extra whitespace
        );
        $decoded = html_entity_decode($text, ENT_QUOTES, 'UTF-8');

        return strip_tags(preg_replace($preg_patterns, " ", $decoded));
    }

    /**
     * Get HTML from loaded URL
     *
     * @return string/boolean If DOM is loaded returns its HTML. Otherwise returns False.
     */
    public function get_html(){
        if(empty($this->dom)){
            return false;
        }

        return $this->dom->save();
    }

    /**
     * Get text from loaded URL without HTML tags or special characters
     *
     * @param int Max length (in bytes) of text to return. Ignored unless numeric.
     * @return string
     */
    public function get_text($limit = null){
        $text = $this->clean_text($this->dom->plaintext);

        if(!is_null($limit) && is_numeric($limit)){
            return substr($text, 0, (int) $limit);
        }

        return $text;
    }

    /**
     * Get title tag from loaded URL
     *
     * @return string/boolean Cleaned title text. False if the page has no title tag.
     */
    public function get_title(){
        $page_title = $this->dom->find('head title', 0);
        if(!$page_title){
            return false;
        }

        return $this->clean_text($page_title->innertext);
    }

    /**
     * Get meta description from loaded URL
     *
     * @return string/boolean Cleaned description. False if the page has no meta description.
     */
    public function get_description(){
        $page_description = $this->dom->find('head meta[name=description]', 0);
        if(!$page_description){
            return false;
        }

        return $this->clean_text($page_description->content);
    }

    /**
     * Get meta keywords from loaded URL
     *
     * @return string/boolean Cleaned keywords. False if the page has no meta keywords.
     */
    public function get_keywords(){
        $page_keywords = $this->dom->find('head meta[name=keywords]', 0);
        if(!$page_keywords){
            return false;
        }

        return $this->clean_text($page_keywords->content);
    }

    /**
     * Tell whether any term occurs (case-insensitively) in the anchor text or
     * in the last path segment. Either candidate is only considered when it
     * is shorter than 50 bytes.
     *
     * @param array Terms to look for
     * @param string Cleaned anchor text
     * @param string Last segment of the link path
     * @return boolean True if at least one term was found
     */
    private function matches_any_term($terms, $anchor_text, $last_path_segment){
        foreach($terms as $term){
            // An empty needle would match everything (or raise a warning on old PHP)
            if(!is_string($term) || $term === '') continue;

            if(strlen($anchor_text) < 50 && stripos($anchor_text, $term) !== false){
                return true;
            }
            if(strlen($last_path_segment) < 50 && stripos($last_path_segment, $term) !== false){
                return true;
            }
        }

        return false;
    }

    /**
     * Get all links on loaded URL page
     *
     * Only http/https links on the same base domain as the loaded URL are
     * returned; links to the loaded URL itself, duplicates and paths
     * disallowed by robots.txt are skipped. Query strings and fragments are
     * not part of the returned URLs. The result is cached until set_url() is
     * called again, so the term filters of a later call on the same page are
     * ignored once a non-empty result exists.
     *
     * @param array Links containing these terms will not be returned
     * @param array Only links containing these terms will be returned
     * @return array List of links on page, keyed by full URL. Each entry holds 'raw_href', 'full_href' and 'text'.
     */
    public function get_links($exclude_terms = array(), $include_terms = array()){
        if(!empty($this->links)) return $this->links;

        $this->links = array();

        $page_scheme = (!empty($this->url_data['scheme'])) ? $this->url_data['scheme'] : 'http';
        $page_port = (!empty($this->url_data['port'])) ? ':' . $this->url_data['port'] : '';

        foreach($this->dom->find('a[href]') as $anchor){
            $anchor_url = parse_url($anchor->href);
            if($anchor_url === false) continue;

            // mailto:, javascript:, tel: and friends are not crawlable pages
            if(!empty($anchor_url['scheme']) && !in_array(strtolower($anchor_url['scheme']), array('http', 'https'), true)){
                continue;
            }

            // Normalise the path so it always starts with a slash
            $anchor_path = (isset($anchor_url['path'])) ? $anchor_url['path'] : '';
            if($anchor_path !== '' && substr($anchor_path, 0, 1) != '/'){
                $anchor_path = '/' . $anchor_path;
            }

            if(empty($anchor_url['host'])){
                // Host-less link: needs a path to point anywhere (drops "#fragment"-only hrefs)
                if($anchor_path === '') continue;
                $anchor_href = $page_scheme . '://' . $this->url_data['host'] . $page_port . $anchor_path;
            }
            else{
                // Stay on the same base domain as the loaded page
                if($this->base_domain($anchor_url['host']) != $this->url_data['domain']) continue;

                $anchor_scheme = (!empty($anchor_url['scheme'])) ? $anchor_url['scheme'] : $page_scheme;
                $anchor_port = (!empty($anchor_url['port'])) ? ':' . $anchor_url['port'] : '';
                $anchor_href = $anchor_scheme . '://' . $anchor_url['host'] . $anchor_port . $anchor_path;
            }

            if($anchor_href == $this->url || array_key_exists($anchor_href, $this->links)) continue;

            //TODO
            //Add support for relative links (ex. A link on http://passpack.com/en/home/ with an href of ../about_us should be http://passpack.com/en/about_us
            //does plaintext content exist?

            $anchor_text = $this->clean_text($anchor->innertext);
            $path_segments = explode("/", $anchor_path);
            $last_path_segment = array_pop($path_segments);

            if(!empty($exclude_terms) && is_array($exclude_terms)){
                if($this->matches_any_term($exclude_terms, $anchor_text, $last_path_segment)) continue;
            }

            if($anchor_path !== '' && $this->check_robots($anchor_path) !== true){
                continue;
            }

            // Without include terms every remaining link qualifies
            if(!empty($include_terms) && is_array($include_terms)){
                // Treat "-" and "_" as word separators when matching include terms
                $readable_segment = str_replace(array('-','_'), ' ', $last_path_segment);
                if(!$this->matches_any_term($include_terms, $anchor_text, $readable_segment)) continue;
            }

            $this->links[$anchor_href] = array(
                'raw_href' => $anchor->href,
                'full_href' => $anchor_href,
                'text' => $anchor_text
            );
        }

        return $this->links;
    }

}

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
codeigniter_crawler
===================

A website crawler for the CodeIgniter framework

Usage
-----

1. Copy Crawler.php into the libraries directory of your CodeIgniter application
2. codeigniter_crawler has one requirement, Simple_Html_Dom. Download from http://simplehtmldom.sourceforge.net/ and copy Simple_html_dom.php into the libraries directory of your CodeIgniter application.
3. Load Crawler library from a controller. `$this->load->library('crawler');`
4. Set a URL. `$this->crawler->set_url('http://github.com');`
5. Use any of the public methods to obtain data from the URL you just set. `$this->crawler->get_text();`

Public Methods
--------------

- `get_text()` - Get plain text from the loaded URL.
- `get_title()` - Get the meta title from the loaded URL.
- `get_description()` - Get the meta description from the loaded URL.
- `get_keywords()` - Get the meta keywords from the loaded URL.
- `get_links($excluded_terms, $included_terms)` - Get all links from the loaded URL.

Example
-------

Crawl github (inside a controller):


    function get_site_data($site_url, $max_depth = 1, $current_depth = 0){
        $current_depth++;

        $this->load->library('crawler');

        $site_data = array();

        if($this->crawler->set_url($site_url) !== false){
            $site_data['title'] = $this->crawler->get_title();
            $site_data['description'] = $this->crawler->get_description();
            $site_data['keywords'] = $this->crawler->get_keywords();
            $site_data['text'] = $this->crawler->get_text();
            $site_data['links'] = $this->crawler->get_links();

            if($current_depth <= $max_depth){
                foreach($site_data['links'] as $link_key => &$link){
                    $link['data'] = $this->get_site_data($link['full_href'], $max_depth, $current_depth);
                }
                unset($link);
            }

            return $site_data;
        }
        else{
            return false;
        }
    }

    $site_data = $this->get_site_data("http://github.com", 1, 0);

Licensed under MIT License (https://github.com/jquery/jquery/blob/master/MIT-LICENSE.txt)

--------------------------------------------------------------------------------
/example.php:
--------------------------------------------------------------------------------
        // NOTE(review): the opening of this file (PHP open tag, class declaration and
        // the start of the calling method) is missing from this extract. The fragment
        // below is the tail of that method, kept as found.
        get_site_data($site_url, 1, 0);
    }

    /**
     * Collect title, description, keywords, text and links of a page and,
     * while the depth limit allows, the same data for every linked page.
     *
     * @param string URL of the page to crawl
     * @param int How many levels of links to follow
     * @param int Depth of the calling level (0 for the first call)
     * @return array/boolean Page data, or False if the URL could not be loaded
     */
    private function get_site_data($site_url, $max_depth = 1, $current_depth = 0){
        $current_depth++;

        $this->load->library('crawler');

        $site_data = array();

        if($this->crawler->set_url($site_url) !== false){
            $site_data['title'] = $this->crawler->get_title();
            $site_data['description'] = $this->crawler->get_description();
            $site_data['keywords'] = $this->crawler->get_keywords();
            $site_data['text'] = $this->crawler->get_text();
            $site_data['links'] = $this->crawler->get_links();

            if($current_depth <= $max_depth){
                foreach($site_data['links'] as $link_key => &$link){
                    // Each link is an array; the crawlable URL lives under 'full_href'
                    $link['data'] = $this->get_site_data($link['full_href'], $max_depth, $current_depth);
                }
                // Break the reference so later writes to $link cannot alter the last entry
                unset($link);
            }

            return $site_data;
        }
        else{
            return false;
        }
    }

}
--------------------------------------------------------------------------------