├── do_it.sh ├── libs ├── general.php ├── config.php ├── cachemanager.php ├── languages.php ├── urls.php ├── robots.php ├── contentanalyzer.php ├── cronmanager.php ├── html.php ├── providers.php └── db.php ├── README.md ├── crowle.php └── sql └── schema_create.sql /do_it.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | while true 3 | do 4 | php crowle.php& 5 | sleep 5 6 | done 7 | -------------------------------------------------------------------------------- /libs/general.php: -------------------------------------------------------------------------------- 1 | array( 24 | 'connection_string'=>'mysql:host=127.0.0.1;dbname=crowler_db', 25 | 'username'=>'root', 26 | 'password'=>'$%4 my top secret password', 27 | 'port'=>'3306' 28 | ), 29 | 'write'=>array( 30 | 'connection_string'=>'mysql:host=127.0.0.1;dbname=crowler_db', 31 | 'username'=>'root', 32 | 'password'=>'3@3 my top secret password', 33 | 'port'=>'3306' 34 | ), 35 | 'fulltext-write'=>array( 36 | 'connection_string'=>'mysql:host=127.0.0.1;dbname=crowler_db', 37 | 'username'=>'root', 38 | 'password'=>'@#@ my top secret password', 39 | 'port'=>'3306' 40 | ) 41 | ); 42 | } 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | PHP-Crawler 2 | =========== 3 | 4 | PHP crawler and spider. Works with UTF-8, MySQL and random hosts; supports robots.txt and many more surprises 5 | 6 | 7 | Install It 8 | ========== 9 | 0. `sudo apt-get install curl php5-curl php5` - Make sure you have these packages installed 10 | 1. In the /sql folder you will find the `schema_create.sql` file; run it in MySQL 11 | 2. In the /libs folder you will find `config.php`, which you should configure as well 12 | 3. 
Give read + write + delete permissions to the `/writable` folder 13 | 14 | RUN IT 15 | ====== 16 | 17 | There are two ways to run it 18 | To run it with multiple processes (as configured in crowle.php; the default is 4 processes) 19 | 20 | `sh do_it.sh` 21 | 22 | To run a single process 23 | 24 | `php crowle.php` 25 | 26 | To Add a new URL 27 | ============== 28 | Create a PHP file and run it 29 | Method 1 : 30 | 31 | "some url 1 description", 37 | 'http://some-url-2.com/'=>"some url 2 description" 38 | ); 39 | 40 | Providers::insert_url_list($urlArray); 41 | 42 | ?> 43 | 44 | Method 2 : 45 | 46 | 51 | 52 | Stay in contact 53 | =============== 54 | 55 | http://www.korotkin.co.il/ 56 | - or - 57 | info@korotkin.co.il 58 | 59 | Hope it works fine :-) 60 | Give me feedback ! 61 | -------------------------------------------------------------------------------- /crowle.php: -------------------------------------------------------------------------------- 1 | getLinks() ); 49 | 50 | } 51 | } 52 | catch (Exception $ex) 53 | { 54 | _w('WAS ERROR !!! ' . 
$ex->getMessage()); 55 | } 56 | 57 | _w('Done for now'); 58 | unset($urls); 59 | -------------------------------------------------------------------------------- /libs/cachemanager.php: -------------------------------------------------------------------------------- 1 | instance_name = $instance_name; 38 | } 39 | /** 40 | * Cahce object 41 | * @var array 42 | */ 43 | private $_cache = array(); 44 | /** 45 | * Setting cache obje 46 | * @param string $name 47 | * @param object $value 48 | */ 49 | public function set($name,$value) 50 | { 51 | $this->_cache[$name] = $value; 52 | } 53 | /** 54 | * 55 | * @param type $name 56 | * @return type 57 | */ 58 | public function get($name) 59 | { 60 | $nn = (string)$name; 61 | if(!isset($this->_cache[$nn])) 62 | return null; 63 | 64 | return $this->_cache[$nn]; 65 | } 66 | } 67 | 68 | ?> 69 | -------------------------------------------------------------------------------- /sql/schema_create.sql: -------------------------------------------------------------------------------- 1 | delimiter $$ 2 | 3 | CREATE DATABASE `crowler_db` /*!40100 DEFAULT CHARACTER SET utf8 */$$ 4 | 5 | delimiter $$ 6 | 7 | CREATE TABLE `hosts` ( 8 | `id` bigint(20) NOT NULL AUTO_INCREMENT, 9 | `https` tinyint(1) NOT NULL DEFAULT '0', 10 | `host` varchar(255) NOT NULL, 11 | `port` int(6) NOT NULL DEFAULT '80', 12 | PRIMARY KEY (`id`), 13 | UNIQUE KEY `uniq` (`https`,`host`,`port`), 14 | KEY `host` (`host`), 15 | KEY `http_host` (`https`,`host`), 16 | KEY `http_host_port` (`https`,`host`,`port`), 17 | KEY `host_port` (`host`,`port`) 18 | ) ENGINE=MyISAM AUTO_INCREMENT=0 DEFAULT CHARSET=utf8$$ 19 | 20 | 21 | CREATE TABLE `urls` ( 22 | `id` bigint(20) NOT NULL AUTO_INCREMENT, 23 | `host_id` bigint(20) NOT NULL, 24 | `path` varchar(255) NOT NULL, 25 | `get_params` varchar(255) DEFAULT NULL, 26 | `type` enum('lead','blocked','indexed','robots_not_allowed','error_no_data') NOT NULL DEFAULT 'lead', 27 | `created_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, 
28 | `updated_at` datetime DEFAULT NULL, 29 | `priority` int(11) DEFAULT '1000', 30 | PRIMARY KEY (`id`) 31 | ) ENGINE=InnoDB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8$$ 32 | 33 | 34 | 35 | CREATE TABLE `url_data` ( 36 | `id` bigint(20) NOT NULL AUTO_INCREMENT, 37 | `url_id` bigint(20) DEFAULT NULL, 38 | `text` text, 39 | PRIMARY KEY (`id`), 40 | UNIQUE KEY `idx_url` (`url_id`) 41 | ) ENGINE=InnoDB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8$$ 42 | 43 | 44 | CREATE ALGORITHM=UNDEFINED SQL SECURITY DEFINER VIEW `random_leads` AS select 45 | u.id AS url_id, 46 | h.id AS host_id, 47 | h.https AS https, 48 | h.host AS host, 49 | h.port AS port, 50 | u.path AS path, 51 | u.get_params AS get_params 52 | from 53 | (urls u 54 | join hosts h ON ((u.host_id = h.id))) 55 | where 56 | (u.type = 'lead') 57 | order by u.priority , rand() 58 | limit 500$$ 59 | 60 | -------------------------------------------------------------------------------- /libs/languages.php: -------------------------------------------------------------------------------- 1 | element in the page */ 32 | if (!isset($charset)) { 33 | preg_match( '@ element in the page */ 39 | if (!isset($charset)) { 40 | preg_match( '@<\?xml.+encoding="([^\s"]+)@si', $data, $matches ); 41 | if ( isset( $matches[1] ) ) 42 | $charset = $matches[1]; 43 | } 44 | 45 | /* 4: PHP's heuristic detection */ 46 | if (!isset($charset)) { 47 | $encoding = mb_detect_encoding($data); 48 | if ($encoding) 49 | $charset = $encoding; 50 | } 51 | 52 | /* 5: Default for HTML */ 53 | if (!isset($charset)) { 54 | if (strstr($content_type, "text/html") === 0) 55 | $charset = "ISO 8859-1"; 56 | } 57 | 58 | /* Convert it if it is anything but UTF-8 */ 59 | /* You can change "UTF-8" to "UTF-8//IGNORE" to 60 | ignore conversion errors and still output something reasonable */ 61 | if (isset($charset) && strtoupper($charset) != "UTF-8") 62 | $data = iconv($charset, 'UTF-8', $data); 63 | 64 | return $data; 65 | } 66 | } 67 | 68 | ?> 69 | 
-------------------------------------------------------------------------------- /libs/urls.php: -------------------------------------------------------------------------------- 1 | 'some name','url2'=>'some name'); 23 | * to 24 | * real full url list 25 | * @param array $url_list 26 | * @param string $real_url 27 | * @return array 28 | */ 29 | public static function create_full_url_list($url_list,$real_url) 30 | { 31 | $realUrl = parse_url($real_url); 32 | 33 | $newArr = array(); 34 | 35 | foreach ($url_list as $_key=>$_value) 36 | { 37 | $key = HTML::convert_spatial_symbols(HTML::clear_whitespaces($_key)); 38 | $value = HTML::convert_spatial_symbols(HTML::clear_whitespaces($_value)); 39 | 40 | /** 41 | * Skip all hashes 42 | */ 43 | if($key=='#') continue; 44 | /** 45 | * ignore javascript 46 | */ 47 | if(substr(strtolower($key), 0, strlen('javascript:'))=='javascript:') 48 | { 49 | continue; 50 | } 51 | 52 | /** 53 | * Add as-it-is to list 54 | */ 55 | if(substr(strtolower($key), 0, 7)=='http://') 56 | { 57 | $newArr[$key]=$value; 58 | continue; 59 | } 60 | 61 | /** 62 | * Add as-it-is to list 63 | */ 64 | if(substr(strtolower($key), 0, 8)=='https://') 65 | { 66 | $newArr[$key]=$value; 67 | continue; 68 | } 69 | 70 | 71 | /** 72 | * Check is relative url from ROOT 73 | */ 74 | if(substr($key, 0, 1)=='/') 75 | { 76 | $newUrl = $realUrl['scheme'] . '://' .$realUrl['host'] . 77 | (isset($realUrl['port'])?':'.$realUrl['port']:'') . $key; 78 | $newArr[$newUrl]=$value; 79 | continue; 80 | } 81 | 82 | 83 | $xUrl =array(); 84 | if(isset($realUrl['path'])) 85 | { 86 | $xUrl = explode('/', $realUrl['path']); 87 | if(count($xUrl)>0) 88 | unset($xUrl[count($xUrl)-1]); 89 | } 90 | else 91 | { 92 | if(substr($key, 0, 2)=='..') 93 | continue; 94 | } 95 | 96 | $nUrl=$realUrl['scheme'] . '://' .$realUrl['host'] . 
97 | (isset($realUrl['port'])?':'.$realUrl['port']:'').implode('/', $xUrl); 98 | $newArr["$nUrl/$key"]=$value; 99 | } 100 | 101 | return $newArr; 102 | } 103 | } 104 | 105 | ?> 106 | -------------------------------------------------------------------------------- /libs/robots.php: -------------------------------------------------------------------------------- 1 | 1) 94 | { 95 | $rules[] = array( 96 | 'type' => $type, 97 | 'match' => str_replace('\*', '.*',preg_quote(trim($rule[1]), '/')) 98 | ); 99 | } 100 | } 101 | } 102 | 103 | $isAllowed = true; 104 | $cu_st = 0; 105 | 106 | foreach($rules as $rule) 107 | { 108 | 109 | // check if page hits on a rule 110 | if( @preg_match("/^{$rule['match']}/", $parsed['path']) ) 111 | { 112 | 113 | // prefer longer (more specific) rules and Allow trumps Disallow if rules same length 114 | $strength = strlen($rule['match']); 115 | 116 | if($cu_st < $strength) 117 | { 118 | $cu_st = $strength; 119 | $isAllowed = ($rule['type'] == 'allow') ? true : false; 120 | } 121 | elseif($cu_st == $strength && $rule['type'] == 'allow') 122 | { 123 | $cu_st = $strength; 124 | $isAllowed = true; 125 | } 126 | } 127 | } 128 | 129 | return $isAllowed; 130 | } 131 | } 132 | 133 | ?> 134 | -------------------------------------------------------------------------------- /libs/contentanalyzer.php: -------------------------------------------------------------------------------- 1 | _url = $url; 46 | } 47 | /** 48 | * Returns the id of the current URL as looked up through Providers::get_url_by_url() 49 | * @return boolean|int the record's id, or false when no record was found or it carries no id 50 | */ 51 | public function getUrlId() 52 | { 53 | $url_info = Providers::get_url_by_url($this->_url); 54 | return ( ! $url_info || ! isset($url_info['id'])) ? 
false : $url_info['id']; 55 | } 56 | /** 57 | * Getting crowler 58 | * @param string $url 59 | * @return boolean|\Crowler 60 | */ 61 | public static function getAnalyzer($url) 62 | { 63 | 64 | // Check is robots allowed 65 | if(!Robots::robots_allowed($url, Config::$agent_name)) 66 | { 67 | Providers::change_url_status($url, Providers::URLS_TYPE_ROBOTS_NOT_ALLOWED); 68 | _w('Robots not allowed'); 69 | return false; 70 | } 71 | 72 | // Create object 73 | $obj = new ContentAnalyzer($url); 74 | 75 | if(!$obj->getCONTENT_DATA()) 76 | { 77 | Providers::change_url_status($url, Providers::URLS_TYPE_ERROR_NO_DATA); 78 | return false; 79 | } 80 | 81 | return $obj; 82 | } 83 | /** 84 | * Getting all content data 85 | * @return type 86 | */ 87 | protected function getCONTENT_DATA() 88 | { 89 | 90 | $ch = curl_init($this->_url); 91 | 92 | $options = array( 93 | CURLOPT_RETURNTRANSFER => true, 94 | CURLOPT_FOLLOWLOCATION => true, 95 | CURLOPT_ENCODING => 'UTF-8', 96 | CURLOPT_TIMEOUT=> 30, 97 | CURLOPT_CONNECTTIMEOUT=>30 98 | 99 | ); 100 | curl_setopt_array($ch, $options); 101 | 102 | $this->_content = Languages::curl_exec_utf8($ch); 103 | 104 | return trim($this->_content)!=''; 105 | } 106 | /** 107 | * Extracting links 108 | * @return array 109 | */ 110 | public function getLinks() 111 | { 112 | 113 | if(!$this->_all_links) 114 | $this->_all_links = urls::create_full_url_list( 115 | HTML::extract_links($this->_content), 116 | $this->_url); 117 | 118 | return $this->_all_links; 119 | } 120 | /** 121 | * Getting plain text without html tags 122 | * @return string 123 | */ 124 | public function getPlainContent() 125 | { 126 | if($this->_plain_content) 127 | return $this->_plain_content; 128 | 129 | 130 | $this->_plain_content = HTML::clear_whitespaces(HTML::stripTags($this->_content)); 131 | 132 | return $this->_plain_content; 133 | } 134 | /** 135 | * Getting meta tags from content 136 | * @return array 137 | */ 138 | public function getMetaTags() 139 | { 140 | 
if(!$this->meta_tags) 141 | $this->meta_tags = HTML::getMetaTags($this->_content); 142 | 143 | return $this->meta_tags; 144 | } 145 | /** 146 | * Extract Title from html 147 | * @return string 148 | */ 149 | public function getTitle() 150 | { 151 | if(!$this->meta_title) 152 | { 153 | $btgs = HTML::getTextBetweenTags($this->_content,'title'); 154 | $this->meta_title = (is_array($btgs))?implode (' ', $btgs):$btgs; 155 | } 156 | 157 | return $this->meta_title; 158 | } 159 | } 160 | 161 | ?> 162 | -------------------------------------------------------------------------------- /libs/cronmanager.php: -------------------------------------------------------------------------------- 1 | cron_process_index; 38 | } 39 | 40 | /** 41 | * singleton constructor 42 | * @param string $run_id 43 | */ 44 | private function __construct($run_id='cron',$max_proccesses=1) 45 | { 46 | $is_all_processes_used = true; 47 | 48 | for($spid = 1; $spid <= $max_proccesses ; $spid++) 49 | { 50 | $this->RUN_ID = md5("$run_id-$spid"); 51 | 52 | if(!is_file(WRITABLE_PATH . "/{$this->RUN_ID}.run.pid")) 53 | { 54 | $this->cron_process_index = $spid; 55 | $is_all_processes_used = false; 56 | break; 57 | } 58 | 59 | } 60 | 61 | if($is_all_processes_used) 62 | { 63 | _w('Cron alerady running ... Wait for next time'); 64 | exit(2); 65 | } 66 | 67 | 68 | try 69 | { 70 | @file_put_contents(WRITABLE_PATH . "/{$this->RUN_ID}.run.pid", 'x'.getmypid()); 71 | 72 | if(!is_file(WRITABLE_PATH . "/{$this->RUN_ID}.run.pid")) 73 | throw new Exception('Cannot create file'); 74 | 75 | $this->init = TRUE; 76 | } 77 | catch(Exception $ex) 78 | { 79 | _w('ERROR !!! Cannot create pid file ! ' . $ex->getMessage() ); 80 | exit(2); 81 | } 82 | 83 | register_shutdown_function(array($this, 'callRegisteredShutdown')); 84 | } 85 | /** 86 | * Shutdown func. 
87 | * @throws Exception 88 | */ 89 | public function callRegisteredShutdown() 90 | { 91 | if(!$this->init) 92 | { 93 | _w('shutting down - no need'); 94 | return; 95 | } 96 | 97 | try 98 | { 99 | _w('shutting down'); 100 | unlink(WRITABLE_PATH . "/{$this->RUN_ID}.run.pid"); 101 | if(is_file(WRITABLE_PATH . "/{$this->RUN_ID}.run.pid")) 102 | throw new Exception('Cannot REMOVE file'); 103 | } 104 | catch(Exception $ex) 105 | { 106 | _w('ERROR !!! Cannot REMOVE pid file ! ' . $ex->getMessage() ); 107 | } 108 | } 109 | /** 110 | * Geting singleton 111 | * @param string $run_id 112 | * @return CronManager 113 | */ 114 | private static function getSingleton($run_id='cron',$max_processes=1) 115 | { 116 | if(!self::$singleton) 117 | self::$singleton = new CronManager($run_id,$max_processes); 118 | 119 | return self::$singleton; 120 | } 121 | 122 | /** 123 | * Check mandatory configs 124 | */ 125 | public static function check_configs() 126 | { 127 | if(!is_dir(WRITABLE_PATH)) 128 | { 129 | _w('No WRITABLE dir : ' . WRITABLE_PATH); 130 | try 131 | { 132 | @mkdir(WRITABLE_PATH, 0777); 133 | } 134 | catch(Exception $ex) 135 | { 136 | _w('ERROR !!!' . $ex->getMessage()); 137 | } 138 | 139 | if(!is_dir(WRITABLE_PATH)) 140 | { 141 | _w('ERROR !!! Cannot Create dir !'); 142 | exit(2); 143 | } 144 | } 145 | } 146 | 147 | /** 148 | * init function 149 | * @param string $_prefix 150 | * @param int $max_processes 151 | * @return CronManager 152 | */ 153 | public static function init($_prefix='cron',$max_processes=1) 154 | { 155 | CronManager::check_configs(); 156 | return CronManager::getSingleton($_prefix,$max_processes); 157 | 158 | } 159 | } 160 | 161 | ?> 162 | -------------------------------------------------------------------------------- /libs/html.php: -------------------------------------------------------------------------------- 1 | "]*)"?[\s]*' . 
'content="?([^>"]*)"?[\s]*[\/]?[\s]*>/si', $html, $matches); 48 | if(count($matches[0])>0) 49 | return array_combine($matches[1], $matches[2]); 50 | else 51 | return array(); 52 | 53 | } 54 | 55 | public static function getTextBetweenTags($string, $tagname) 56 | { 57 | $pattern = "/<$tagname>(.*?)<\/$tagname>/"; 58 | preg_match($pattern, $string, $matches); 59 | if(isset($matches[1])) 60 | return $matches[1]; 61 | else 62 | return array(); 63 | } 64 | /** 65 | * Getting plain text from html 66 | * @param string $content 67 | * @return string 68 | */ 69 | public static function stripTags($content) 70 | { 71 | //return strip_tags ( $content ); 72 | 73 | $search = array ("']*?>.*?'si", // Strip out javascript 74 | "']*?>.*?'si", 75 | "'<[/!]*?[^<>]*?>'si", // Strip out HTML tags 76 | "'([rn])[s]+'", // Strip out white space 77 | "'&(quot|#34);'i", // Replace HTML entities 78 | "'&(amp|#38);'i", 79 | "'&(lt|#60);'i", 80 | "'&(gt|#62);'i", 81 | "'&(nbsp|#160);'i", 82 | "'&(iexcl|#161);'i", 83 | "'&(cent|#162);'i", 84 | "'&(pound|#163);'i", 85 | "'&(copy|#169);'i", 86 | "'&#(d+);'e"); // evaluate as php 87 | 88 | $replace = array (" ", 89 | " ", 90 | " ", 91 | " ", 92 | " \" ", 93 | " & ", 94 | " ", 95 | " ", 96 | " ", 97 | ' ', 98 | ' ', 99 | ' ', 100 | ' ', 101 | ' '); 102 | return strip_tags ( HTML::clear_whitespaces(preg_replace($search, $replace, $content)) ); 103 | } 104 | /** 105 | * 106 | * @param string $tag 107 | * @param string $key_attribute 108 | * @param string $html 109 | * @return array 110 | */ 111 | public static function extract_tag_values($tag,$key_attribute,$html) 112 | { 113 | $matches = array(); 114 | $reg = "#<$tag.*$key_attribute\s*=\s*(\"|')?([^\"'>]+).*>(.+)#i"; 115 | if (preg_match_all($reg, $html, $matches)) 116 | return array_combine($matches[2], $matches[3]); 117 | 118 | return array(); 119 | } 120 | /** 121 | * getting all