├── Crawler.php
├── README.md
└── example.php


/Crawler.php:
--------------------------------------------------------------------------------
  1 | <?php 
  2 | 
  3 | /*
  4 | Github: https://github.com/Lukeas14/codeigniter_crawler
  5 | Author: Justin Lucas (Lukeas14@gmail.com)
  6 | Copyright (c) 2012 Justin Lucas
  7 | Licensed under MIT License (https://github.com/jquery/jquery/blob/master/MIT-LICENSE.txt)
  8 | */
  9 | 
 10 | require_once(BASE_DIR . '/application/libraries/Simple_html_dom.php');
 11 | 
 12 | class Crawler{
 13 | 	
 14 | 	const USER_AGENT = "bot";
 15 | 	
 16 | 	public $url; //URL to load DOM from
 17 | 	public $url_data; //Parsed loaded URL
 18 | 	public $dom; //DOM structure of loaded URL
 19 | 
 20 | 	private $links;
 21 | 	private $robots_rules;
 22 | 
 23 | 	function __construct(){
 24 | 		$this->set_user_agent(self::USER_AGENT);
 25 | 		$this->dom = new simple_html_dom();
 26 | 	}
 27 | 	
 28 | 	/**
 29 | 	 * Set the user agent to be used for all cURL calls
 30 | 	 *
 31 | 	 * @param 	string	user agent string
 32 | 	 * @return 	void
 33 | 	 */
 34 | 	public function set_user_agent($user_agent){
 35 | 		ini_set('user_agent', $user_agent);
 36 | 	}
 37 | 
 38 | 	/**
 39 | 	 * Check to make sure URL is valid
 40 | 	 *
 41 | 	 * @param 	string	URL to check
 42 | 	 * @return	boolean	True if URL is valid. False is url is not valid.
 43 | 	 */
 44 | 	private function check_url($url){
 45 | 		$headers = @get_headers($url, 0);
 46 | 		if(is_array($headers)){
 47 | 			if(strpos($headers[0], '404')){
 48 | 				return false;
 49 | 			}
 50 | 			
 51 | 			foreach($headers as $header){
 52 | 				if(strpos($header, '404 Not Found')){
 53 | 					return false;
 54 | 				}
 55 | 			}
 56 | 			
 57 | 			return true;
 58 | 		}
 59 | 		else{
 60 | 			return false;
 61 | 		}
 62 | 		
 63 | 	}
 64 | 	
 65 | 	/**
 66 | 	 * Set URL to scrape/crawl.
 67 | 	 *
 68 | 	 * @param 	string 	URL to crawl
 69 | 	 * @return	boolean	True if URL is valid. False is URL is not valid
 70 | 	 */
 71 | 	public function set_url($url){
 72 | 		$this->url = $url;
 73 | 		
 74 | 		if((strpos($url, 'http')) === false) $url = 'http://' . $url;
 75 | 		
 76 | 		if($this->check_url($url) === false){
 77 | 			return false;
 78 | 		}
 79 | 		
 80 | 		if($this->dom->load_file($url) === false){
 81 | 			return false;
 82 | 		}
 83 | 
 84 | 		$this->url_data = parse_url($url);
 85 | 		if(empty($this->url_data['scheme'])){
 86 | 			$this->data['scheme'] == 'http';
 87 | 		}
 88 | 		$this->url_data['domain'] = implode(".", array_slice(explode(".", $this->url_data['host']), -2));
 89 | 		
 90 | 		if(empty($this->url_data['path']) || $this->url_data['path'] != '/robots.txt'){
 91 | 			$this->get_robots();
 92 | 		}
 93 | 		
 94 | 		return true;
 95 | 	}
 96 | 	
 97 | 	/**
 98 | 	 * Retrieve and parse the loaded URL's robots.txt
 99 | 	 *
100 | 	 * @return	array/boolean	Returns array of rules if robots.txt is valid. Otherwise returns True if no rules exist or False if robots.txt is not valid.
101 | 	 */
102 | 	private function get_robots(){
103 | 		if(empty($this->url_data)) return false;
104 | 		
105 | 		$robots_url = 'http://' . $this->url_data['domain'] . '/robots.txt';
106 | 		
107 | 		if(!$this->check_url($robots_url)){
108 | 			return false;
109 | 		}
110 | 		
111 | 		$robots_text = @file($robots_url);
112 | 		
113 | 		if(empty($robots_text)){
114 | 			$this->robots_rules = false;
115 | 			return;
116 | 		}
117 | 		
118 | 		$user_agents = implode("|", array(preg_quote('*'),preg_quote(self::USER_AGENT)));
119 | 		
120 | 		$this->robots_rules = array();
121 | 		
122 | 		foreach($robots_text as $line){
123 | 			if(!$line = trim($line)) continue;
124 | 			
125 | 			if(preg_match('/^\s*User-agent: (.*)/i', $line, $match)) {
126 | 				$ruleApplies = preg_match("/($user_agents)/i", $match[1]);
127 | 			}
128 | 			if(!empty($ruleApplies) && preg_match('/^\s*Disallow:(.*)/i', $line, $regs)) {
129 | 				// an empty rule implies full access - no further tests required
130 | 				if(!$regs[1]) return true;
131 | 				// add rules that apply to array for testing
132 | 				$this->robots_rules[] = preg_quote(trim($regs[1]), '/');
133 | 			}
134 | 		}
135 | 		
136 | 		return $this->robots_rules;
137 | 	}
138 | 	
139 | 	/**
140 | 	 * Checks robots.txt to see if a URL can be accessed.
141 | 	 *
142 | 	 * @param 	string	URL to check
143 | 	 * @return 	boolean	True if URL can be accessed. False if it can't.
144 | 	 */
145 | 	private function check_robots($url){
146 | 		if(empty($this->robots_rules)) return true;
147 | 		
148 | 		$parsed_url = parse_url($url);
149 | 		
150 | 		foreach($this->robots_rules as $robots_rule){
151 | 			if(preg_match("/^$robots_rule/", $parsed_url['path'])) return false;
152 | 		}
153 | 		
154 | 		return true;
155 | 	}
156 | 
157 | 	/**
158 | 	 * Removes all HTML, special characters and extra whitespace from text
159 | 	 *
160 | 	 * @param 	string	Text to be cleaned
161 | 	 * @return 	string	Cleaned text
162 | 	 */
163 | 	private function clean_text($text){
164 | 		$preg_patterns = array(
165 | 			"/[\x80-\xFF]/", //remove special characters
166 | 			"/&nbsp/",
167 | 			"/\s+/", //remove extra whitespace
168 | 		);
169 | 		$text = strip_tags(preg_replace($preg_patterns, " ", html_entity_decode($text, ENT_QUOTES, 'UTF-8')));
170 | 		
171 | 		return $text;
172 | 	}
173 | 
174 | 	/**
175 | 	 * Get HTML from loaded URL
176 | 	 *
177 | 	 * @return 	string/boolean	If DOM is loaded returns its HTML. Otherwise returns False.
178 | 	 */
179 | 	public function get_html(){
180 | 		if(!empty($this->dom)){
181 | 			return $this->dom->save();
182 | 		}
183 | 		else{
184 | 			return false;
185 | 		}
186 | 	}
187 | 	
188 | 	/**
189 | 	 * Get text from loaded URL without HTML tags or special characters
190 | 	 *
191 | 	 * @param 	int 	Max length of text to return
192 | 	 * @return 	string
193 | 	 */
194 | 	public function get_text($limit = null){
195 | 		if(!is_null($limit) && is_numeric($limit)){
196 | 			return substr($this->clean_text($this->dom->plaintext), 0, $limit);
197 | 		}
198 | 		else{
199 | 			return $this->clean_text($this->dom->plaintext);
200 | 		}
201 | 	}
202 | 
203 | 	/**
204 | 	 * Get title tag from loaded URL
205 | 	 *
206 | 	 * @return 	string
207 | 	 */
208 |     public function get_title(){
209 |     	if(!$page_title = $this->dom->find('head title', 0)){
210 |     		return false;
211 |     	}
212 | 
213 |     	return $this->clean_text($page_title->innertext);
214 |     }
215 | 
216 | 	/**
217 | 	 * Get meta description from loaded URL
218 | 	 *
219 | 	 * @return string
220 | 	 */
221 |     public function get_description(){
222 |     	if(!$page_description = $this->dom->find('head meta[name=description]', 0)){
223 |     		return false;
224 |     	}
225 | 
226 |     	return $this->clean_text($page_description->content);
227 |     }
228 | 
229 | 	/**
230 | 	 * Get meta keywords from loaded URL
231 | 	 *
232 | 	 * @return string
233 | 	 */
234 |     public function get_keywords(){
235 |     	if(!$page_keywords = $this->dom->find('head meta[name=keywords]', 0)){
236 |     		return false;
237 |     	}
238 | 
239 |     	return $this->clean_text($page_keywords->content);
240 | 
241 |     }
242 | 
243 | 	/**
244 | 	 * Get all links on loaded URL page
245 | 	 *
246 | 	 * @param 	array 	Links containing these terms will not be returned
247 | 	 * @param 	array 	Only links containing these terms will be returned
248 | 	 * @return 	array 	List of links on page
249 | 	 */
250 | 	public function get_links($exclude_terms = array(), $include_terms = array()){
251 | 		if(!empty($this->links)) return $this->links;
252 | 		
253 | 		$this->links = array();
254 | 		$anchor_tags = $this->dom->find('a[href]');
255 | 		
256 | 		foreach($anchor_tags as $anchor){
257 | 			$anchor_url = parse_url($anchor->href);
258 | 			if($anchor_url === false) continue;
259 | 
260 | 			$anchor_href = '';
261 | 			if(empty($anchor_url['host'])){
262 | 				if(empty($anchor_url['path'])) continue;
263 | 				$anchor_href = $this->url_data['scheme'] . '://' . $this->url_data['host'] . ((!empty($anchor_url['path']) && substr($anchor_url['path'], 0, 1) != '/') ? '/' : '') . $anchor_url['path'];
264 | 			}
265 | 			else{
266 | 				$anchor_domain = implode(".", array_slice(explode(".", $anchor_url['host']), -2));
267 | 				if($anchor_domain != $this->url_data['domain']) continue;
268 | 
269 | 				$anchor_href .= ((!empty($anchor_url['scheme'])) ? $anchor_url['scheme'] : 'http') . '://' . $anchor_url['host'] . ((!empty($anchor_url['path']) && substr($anchor_url['path'], 0, 1) != '/') ? '/' : '') . ((!empty($anchor_url['path'])) ? $anchor_url['path'] : '');
270 | 			}
271 | 
272 | 			if($anchor_href == $this->url || array_key_exists($anchor_href, $this->links)) continue;
273 | 
274 | 			//TODO
275 | 			//Add support for relative links (ex. A link on http://passpack.com/en/home/ with an href of ../about_us should be http://passpack.com/en/about_us
276 | 			//does plaintext content exist?
277 | 
278 | 			if(!empty($exclude_terms) && is_array($exclude_terms)){
279 | 				$exclude_term_found = false;
280 | 				foreach($exclude_terms as $term){
281 | 					if(stripos($this->clean_text($anchor->innertext), $term) !== false && strlen($this->clean_text($anchor->innertext)) < 50){
282 | 						$exclude_term_found = true;
283 | 					}
284 | 					if(!empty($anchor_url['path'])){
285 | 						$path_segments = explode("/", $anchor_url['path']);
286 | 						$last_path_segment = array_pop($path_segments);
287 | 						if(stripos($last_path_segment, $term) !== false && strlen($last_path_segment) < 50){
288 | 							$exclude_term_found = true;
289 | 						}
290 | 					}
291 | 					
292 | 				}
293 | 				if($exclude_term_found) continue;
294 | 			}
295 | 			
296 | 			if(!empty($anchor_url['path']) && $this->check_robots($anchor_url['path']) !== true){
297 | 				continue;
298 | 			}
299 | 			
300 | 			if(!empty($include_terms) && is_array($include_terms)){ 
301 | 				$include_term_found = false;
302 | 				foreach($include_terms as $term){
303 | 					if(stripos($this->clean_text($anchor->innertext), $term) !== false && strlen($this->clean_text($anchor->innertext)) < 50){
304 | 						$include_term_found = true;
305 | 						continue;
306 | 					}
307 | 					if(!empty($anchor_url['path'])){
308 | 						$path_segments = explode("/", $anchor_url['path']);
309 | 						$last_path_segment = str_replace(array('-','_'), ' ', array_pop($path_segments));
310 | 						if(stripos($last_path_segment, $term) !== false && strlen($last_path_segment) < 50){
311 | 							$include_term_found = true;
312 | 							continue;
313 | 						}
314 | 					}
315 | 				}
316 | 			}
317 | 
318 | 			if(isset($include_term_found) && $include_term_found){
319 | 				$this->links[$anchor_href] = array(
320 | 					'raw_href' => $anchor->href,
321 | 					'full_href' => $anchor_href,
322 | 					'text' => $this->clean_text($anchor->innertext)
323 | 				);
324 | 			}
325 | 		   
326 | 		}
327 | 
328 | 		return $this->links;
329 | 	}
330 | 
331 | }
332 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | codeigniter_crawler
 2 | ===================
 3 | 
 4 | A website crawler for the CodeIgniter framework
 5 | 
 6 | Usage
 7 | -----
 8 | 
  9 | 1. Copy Crawler.php into the libraries directory (`application/libraries`) of your CodeIgniter application
10 | 2. codeigniter_crawler has one requirement, Simple_Html_Dom. Download from http://simplehtmldom.sourceforge.net/ and copy Simple_html_dom.php into the libraries directory of your CodeIgniter application.
11 | 3. Load Crawler library from a controller. `$this->load->library('crawler');`
12 | 4. Set a URL. `$this->crawler->set_url('http://github.com');`
13 | 5. Use any of the public methods to obtain data from the URL you just set. `$this->crawler->get_text();`
14 | 
15 | Public Methods
16 | --------------
17 | 
18 | - `get_text()` - Get plain text from the loaded URL.
 19 | - `get_title()` - Get the contents of the `<title>` tag from the loaded URL.
20 | - `get_description()` - Get the meta description from the loaded URL.
21 | - `get_keywords()` - Get the meta keywords from the loaded URL.
22 | - `get_links($excluded_terms, $included_terms)` - Get all links from the loaded URL.
23 | 
24 | Example
25 | -------
26 | 
 27 | Crawl GitHub:
28 | 
29 | 
30 |     function get_site_data($url, $max_depth = 1, $current_depth = 0){
31 |       $current_depth++;
32 |     
33 |       $this->load->library('crawler');
34 |     
35 |     	$site_data = array();
36 |     
37 |     	if($this->crawler->set_url($site_url) !== false){
38 |     		$site_data['title'] = $this->crawler->get_title();
39 |     		$site_data['description'] = $this->crawler->get_description();
40 |     		$site_data['keywords'] = $this->crawler->get_keywords();
41 |     		$site_data['text'] = $this->crawler->get_text();
42 |     		$site_data['links'] = $this->crawler->get_links();
43 |     
44 |     		if($current_depth <= $max_depth){
45 |     			foreach($site_data['links'] as $link_key => &$link){
46 |     				$link['data'] = get_site_data($link, $max_depth, $current_depth);
47 |     				}
48 |     			}
49 |     		}
50 |     
51 |     		return $site_data;
52 |     	}
53 |     	else{
54 |     		return false;
55 |     	}
56 |     }
57 |     
58 |     $site_data = get_site_data("http://github.com", 1, 0);
59 | 
60 | Licensed under MIT License (https://github.com/jquery/jquery/blob/master/MIT-LICENSE.txt)
61 | 


--------------------------------------------------------------------------------
/example.php:
--------------------------------------------------------------------------------
 1 | <?php //defined('BASEPATH') OR exit('No direct script access allowed');
 2 | 
 3 | class Cron extends CI_Controller {
 4 | 
 5 | 	public function __construct(){
 6 | 
 7 | 	}
 8 | 
 9 | 	public function index(){
10 | 		$site_url = "http://github.com";
11 | 		$site_data = $this->get_site_data($site_url, 1, 0);
12 | 	}
13 | 
14 | 	private function get_site_data($site_url, $max_depth = 1, $current_depth = 0){
15 | 		$current_depth++;
16 | 
17 | 		$this->load->library('crawler');
18 | 
19 | 		$site_data = array();
20 | 
21 | 		if($this->crawler->set_url($site_url) !== false){
22 | 			$site_data['title'] = $this->crawler->get_title();
23 | 			$site_data['description'] = $this->crawler->get_description();
24 | 			$site_data['keywords'] = $this->crawler->get_keywords();
25 | 			$site_data['text'] = $this->crawler->get_text();
26 | 			$site_data['links'] = $this->crawler->get_links();
27 | 
28 | 			if($current_depth <= $max_depth){
29 | 				foreach($site_data['links'] as $link_key => &$link){
30 | 					$link['data'] = $this->get_site_data($link, $max_depth, $current_depth);
31 | 				}
32 | 			}
33 | 
34 | 			return $site_data;
35 | 		}
36 | 		else{
37 | 			return false;
38 | 		}
39 | 	}
40 | 
41 | }


--------------------------------------------------------------------------------