├── do_it.sh
├── libs
├── general.php
├── config.php
├── cachemanager.php
├── languages.php
├── urls.php
├── robots.php
├── contentanalyzer.php
├── cronmanager.php
├── html.php
├── providers.php
└── db.php
├── README.md
├── crowle.php
└── sql
└── schema_create.sql
/do_it.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Launcher loop: starts `php crowle.php` as a background job, waits five
# seconds, then starts another one. Runs until the script itself is killed.
while :; do
    php crowle.php &
    sleep 5
done
7 |
--------------------------------------------------------------------------------
/libs/general.php:
--------------------------------------------------------------------------------
1 | array(
24 | 'connection_string'=>'mysql:host=127.0.0.1;dbname=crowler_db',
25 | 'username'=>'root',
26 | 'password'=>'$%4 my top secret password',
27 | 'port'=>'3306'
28 | ),
29 | 'write'=>array(
30 | 'connection_string'=>'mysql:host=127.0.0.1;dbname=crowler_db',
31 | 'username'=>'root',
32 | 'password'=>'3@3 my top secret password',
33 | 'port'=>'3306'
34 | ),
35 | 'fulltext-write'=>array(
36 | 'connection_string'=>'mysql:host=127.0.0.1;dbname=crowler_db',
37 | 'username'=>'root',
38 | 'password'=>'@#@ my top secret password',
39 | 'port'=>'3306'
40 | )
41 | );
42 | }
43 |
44 |
45 |
46 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | PHP-Crawler
2 | ===========
3 |
4 | PHP crawler and spider. Works with UTF-8 and MySQL, uses random host selection, supports robots.txt, and has many more surprises.
5 |
6 |
7 | Install It
8 | ==========
9 | 0. `sudo apt-get install curl php5-curl php5` - Make sure you have these packages installed
10 | 1. In the /sql folder you will find the `schema_create.sql` file; run it in MySQL
11 | 2. In the /libs folder you will find `config.php`; configure it for your environment
12 | 3. Give read + write + delete permissions to the `/writable` folder
13 |
14 | RUN IT
15 | ======
16 |
17 | There are two ways to run it.
18 | To run it with multiple processes (the number of processes is configured in crowle.php; the default is 4):
19 |
20 | `sh do_it.sh`
21 |
22 | To run a single process:
23 |
24 | `php crowle.php`
25 |
26 | To Add new URL
27 | ==============
28 | Create a PHP file and run it
29 | Method 1 :
30 |
31 | "some url 1 description",
37 | 'http://some-url-2.com/'=>"some url 2 description"
38 | );
39 |
40 | Providers::insert_url_list($urlArray);
41 |
42 | ?>
43 |
44 | Method 2 :
45 |
46 |
51 |
52 | Stay in contact
53 | ===============
54 |
55 | http://www.korotkin.co.il/
56 | - or -
57 | info@korotkin.co.il
58 |
59 | Hope it works well for you :-)
60 | Give me feedback!
61 |
--------------------------------------------------------------------------------
/crowle.php:
--------------------------------------------------------------------------------
1 | getLinks() );
49 |
50 | }
51 | }
52 | catch (Exception $ex)
53 | {
54 | _w('WAS ERROR !!! ' . $ex->getMessage());
55 | }
56 |
57 | _w('Done for now');
58 | unset($urls);
59 |
--------------------------------------------------------------------------------
/libs/cachemanager.php:
--------------------------------------------------------------------------------
1 | instance_name = $instance_name;
38 | }
39 | /**
40 | * Cache object
41 | * @var array
42 | */
43 | private $_cache = array();
/**
 * Store a value in the in-memory cache array.
 *
 * The key is cast to string before use, exactly as get() does on lookup,
 * so a value is always found again under the key it was stored with
 * (this matters for float keys and for objects implementing __toString()).
 *
 * @param string $name  cache key
 * @param mixed  $value value to keep under that key
 */
public function set($name,$value)
{
    $this->_cache[(string)$name] = $value;
}
/**
 * Fetch a value from the in-memory cache array.
 *
 * @param string $name cache key; cast to string before the lookup
 * @return mixed the stored value, or null when the key is not set
 *               (isset() also treats a stored null as "not set")
 */
public function get($name)
{
    $key = (string)$name;

    return isset($this->_cache[$key]) ? $this->_cache[$key] : null;
}
66 | }
67 |
68 | ?>
69 |
--------------------------------------------------------------------------------
/sql/schema_create.sql:
--------------------------------------------------------------------------------
-- Schema for the crawler database (MySQL).
-- Meant to be run through the mysql command-line client: "delimiter" is a
-- client command, and every statement below is terminated with $$.
delimiter $$

CREATE DATABASE `crowler_db` /*!40100 DEFAULT CHARACTER SET utf8 */$$

-- Switch to the new database so that the tables and the view are created
-- inside it instead of in whatever database happens to be selected.
USE `crowler_db`$$

-- One row per origin: https flag + host name + port.
-- NOTE(review): KEY `http_host_port` covers the same columns as UNIQUE KEY
-- `uniq`, and `http_host` is a prefix of it; they are kept in case some query
-- names them explicitly - confirm before dropping.
CREATE TABLE `hosts` (
  `id` bigint(20) NOT NULL AUTO_INCREMENT,
  `https` tinyint(1) NOT NULL DEFAULT '0',
  `host` varchar(255) NOT NULL,
  `port` int(6) NOT NULL DEFAULT '80',
  PRIMARY KEY (`id`),
  UNIQUE KEY `uniq` (`https`,`host`,`port`),
  KEY `host` (`host`),
  KEY `http_host` (`https`,`host`),
  KEY `http_host_port` (`https`,`host`,`port`),
  KEY `host_port` (`host`,`port`)
) ENGINE=MyISAM AUTO_INCREMENT=0 DEFAULT CHARSET=utf8$$

-- One row per URL on a host; `type` records its state (new rows default to 'lead').
CREATE TABLE `urls` (
  `id` bigint(20) NOT NULL AUTO_INCREMENT,
  `host_id` bigint(20) NOT NULL,
  `path` varchar(255) NOT NULL,
  `get_params` varchar(255) DEFAULT NULL,
  `type` enum('lead','blocked','indexed','robots_not_allowed','error_no_data') NOT NULL DEFAULT 'lead',
  `created_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
  `updated_at` datetime DEFAULT NULL,
  `priority` int(11) DEFAULT '1000',
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8$$

-- Text stored per URL; at most one row per url_id (unique key).
CREATE TABLE `url_data` (
  `id` bigint(20) NOT NULL AUTO_INCREMENT,
  `url_id` bigint(20) DEFAULT NULL,
  `text` text,
  PRIMARY KEY (`id`),
  UNIQUE KEY `idx_url` (`url_id`)
) ENGINE=InnoDB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8$$

-- Up to 500 URLs of type 'lead' joined with their host, ordered by priority
-- (ascending) and shuffled randomly among rows of equal priority.
CREATE ALGORITHM=UNDEFINED SQL SECURITY DEFINER VIEW `random_leads` AS select
    u.id AS url_id,
    h.id AS host_id,
    h.https AS https,
    h.host AS host,
    h.port AS port,
    u.path AS path,
    u.get_params AS get_params
from
    (urls u
    join hosts h ON ((u.host_id = h.id)))
where
    (u.type = 'lead')
order by u.priority , rand()
limit 500$$

-- Restore the client's default statement delimiter.
delimiter ;
59 |
60 |
--------------------------------------------------------------------------------
/libs/languages.php:
--------------------------------------------------------------------------------
1 | element in the page */
32 | if (!isset($charset)) {
33 | preg_match( '@ element in the page */
39 | if (!isset($charset)) {
40 | preg_match( '@<\?xml.+encoding="([^\s"]+)@si', $data, $matches );
41 | if ( isset( $matches[1] ) )
42 | $charset = $matches[1];
43 | }
44 |
45 | /* 4: PHP's heuristic detection */
46 | if (!isset($charset)) {
47 | $encoding = mb_detect_encoding($data);
48 | if ($encoding)
49 | $charset = $encoding;
50 | }
51 |
52 | /* 5: Default for HTML */
53 | if (!isset($charset)) {
54 | if (strstr($content_type, "text/html") === 0)
55 | $charset = "ISO 8859-1";
56 | }
57 |
58 | /* Convert it if it is anything but UTF-8 */
59 | /* You can change "UTF-8" to "UTF-8//IGNORE" to
60 | ignore conversion errors and still output something reasonable */
61 | if (isset($charset) && strtoupper($charset) != "UTF-8")
62 | $data = iconv($charset, 'UTF-8', $data);
63 |
64 | return $data;
65 | }
66 | }
67 |
68 | ?>
69 |
--------------------------------------------------------------------------------
/libs/urls.php:
--------------------------------------------------------------------------------
1 | 'some name','url2'=>'some name');
23 | * to
24 | * real full url list
25 | * @param array $url_list
26 | * @param string $real_url
27 | * @return array
28 | */
public static function create_full_url_list($url_list,$real_url)
{
    // Converts the raw links of a page (href => link text) into absolute
    // http(s) URLs, resolving relative links against $real_url, the address
    // of the page they were found on. Links that are not crawlable pages
    // (fragments, javascript:, mailto:, tel:, ...) are dropped.
    $realUrl = parse_url($real_url);

    // "scheme://host[:port]" of the page. Stays null when $real_url is
    // malformed or has no scheme/host; then only absolute links are kept.
    $origin = null;
    if (is_array($realUrl) && isset($realUrl['scheme'], $realUrl['host']))
    {
        $origin = $realUrl['scheme'] . '://' . $realUrl['host'] .
            (isset($realUrl['port']) ? ':' . $realUrl['port'] : '');
    }

    // Directory part of the page path (everything before the last "/"),
    // used as the base for links such as "page2.html".
    $hasPath = is_array($realUrl) && isset($realUrl['path']);
    $baseDir = '';
    if ($hasPath)
    {
        $segments = explode('/', $realUrl['path']);
        array_pop($segments);
        $baseDir = implode('/', $segments);
    }

    $newArr = array();

    foreach ($url_list as $_key=>$_value)
    {
        $key = (string)HTML::convert_spatial_symbols(HTML::clear_whitespaces($_key));
        $value = HTML::convert_spatial_symbols(HTML::clear_whitespaces($_value));

        // Skip empty links and all fragment-only links ("#", "#top", ...):
        // they point back to the page itself
        if ($key === '' || substr($key, 0, 1) == '#')
            continue;

        // Absolute http:// or https:// link: add as-it-is
        if (preg_match('#^https?://#i', $key))
        {
            $newArr[$key] = $value;
            continue;
        }

        // Any other explicit scheme (javascript:, mailto:, tel:, ftp:, data:)
        // is not something to crawl; resolving it as a relative path would
        // only produce a bogus URL
        if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $key))
            continue;

        // Everything below needs the origin of the page
        if ($origin === null)
            continue;

        // Protocol-relative link ("//cdn.example.com/x"): reuse the page scheme
        if (substr($key, 0, 2) == '//')
        {
            $newArr[$realUrl['scheme'] . ':' . $key] = $value;
            continue;
        }

        // Link relative to the ROOT of the host
        if (substr($key, 0, 1) == '/')
        {
            $newArr[$origin . $key] = $value;
            continue;
        }

        // The page has no path, so there is nothing "../" could climb out of
        if (!$hasPath && substr($key, 0, 2) == '..')
            continue;

        // Link relative to the directory of the page
        $newArr[$origin . $baseDir . '/' . $key] = $value;
    }

    return $newArr;
}
103 | }
104 |
105 | ?>
106 |
--------------------------------------------------------------------------------
/libs/robots.php:
--------------------------------------------------------------------------------
1 | 1)
94 | {
95 | $rules[] = array(
96 | 'type' => $type,
97 | 'match' => str_replace('\*', '.*',preg_quote(trim($rule[1]), '/'))
98 | );
99 | }
100 | }
101 | }
102 |
103 | $isAllowed = true;
104 | $cu_st = 0;
105 |
106 | foreach($rules as $rule)
107 | {
108 |
109 | // check if page hits on a rule
110 | if( @preg_match("/^{$rule['match']}/", $parsed['path']) )
111 | {
112 |
113 | // prefer longer (more specific) rules and Allow trumps Disallow if rules same length
114 | $strength = strlen($rule['match']);
115 |
116 | if($cu_st < $strength)
117 | {
118 | $cu_st = $strength;
119 | $isAllowed = ($rule['type'] == 'allow') ? true : false;
120 | }
121 | elseif($cu_st == $strength && $rule['type'] == 'allow')
122 | {
123 | $cu_st = $strength;
124 | $isAllowed = true;
125 | }
126 | }
127 | }
128 |
129 | return $isAllowed;
130 | }
131 | }
132 |
133 | ?>
134 |
--------------------------------------------------------------------------------
/libs/contentanalyzer.php:
--------------------------------------------------------------------------------
1 | _url = $url;
46 | }
/**
 * Look up the id of the URL this analyzer was created for.
 *
 * @return boolean|int the 'id' field of the record returned by
 *                     Providers::get_url_by_url(), or false when no record
 *                     (or no id) is available
 */
public function getUrlId()
{
    $url_info = Providers::get_url_by_url($this->_url);

    // The previous test (!$url_info && isset($url_info['id'])) could never be
    // true, so a missing record ended up indexing a falsy value instead of
    // returning false.
    if (!$url_info || !isset($url_info['id']))
        return false;

    return $url_info['id'];
}
/**
 * Factory: returns an analyzer for $url with its content already fetched.
 *
 * The URL is first checked with Robots::robots_allowed() for the configured
 * agent name. A refused URL is marked URLS_TYPE_ROBOTS_NOT_ALLOWED; a URL
 * whose fetch yields no content is marked URLS_TYPE_ERROR_NO_DATA. In both
 * cases false is returned.
 *
 * @param string $url
 * @return boolean|ContentAnalyzer
 */
public static function getAnalyzer($url)
{
    $robots_ok = Robots::robots_allowed($url, Config::$agent_name);

    if ($robots_ok)
    {
        $analyzer = new ContentAnalyzer($url);

        if ($analyzer->getCONTENT_DATA())
            return $analyzer;

        // Nothing usable came back from the fetch
        Providers::change_url_status($url, Providers::URLS_TYPE_ERROR_NO_DATA);
        return false;
    }

    // robots rules forbid fetching this URL
    Providers::change_url_status($url, Providers::URLS_TYPE_ROBOTS_NOT_ALLOWED);
    _w('Robots not allowed');

    return false;
}
/**
 * Download the page at $this->_url through cURL and store what
 * Languages::curl_exec_utf8() returns in $this->_content.
 *
 * @return boolean true when the stored content is not blank
 */
protected function getCONTENT_DATA()
{
    $ch = curl_init($this->_url);

    // curl_init() returns false when no handle could be created
    if ($ch === false)
        return false;

    $options = array(
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        // CURLOPT_ENCODING fills the Accept-Encoding request header, i.e. it
        // negotiates compression and has nothing to do with the charset.
        // 'UTF-8' is not a valid value there; the empty string advertises
        // every encoding this cURL build can decode.
        CURLOPT_ENCODING => '',
        CURLOPT_TIMEOUT => 30,
        CURLOPT_CONNECTTIMEOUT => 30
    );
    curl_setopt_array($ch, $options);

    $this->_content = Languages::curl_exec_utf8($ch);

    return trim((string)$this->_content) != '';
}
/**
 * Links found in the fetched content, as produced by
 * urls::create_full_url_list() from the output of HTML::extract_links().
 * The result is kept in $this->_all_links and reused while it is non-empty.
 *
 * @return array
 */
public function getLinks()
{
    if ($this->_all_links)
        return $this->_all_links;

    $raw_links = HTML::extract_links($this->_content);
    $this->_all_links = urls::create_full_url_list($raw_links, $this->_url);

    return $this->_all_links;
}
/**
 * Fetched content passed through HTML::stripTags() and then
 * HTML::clear_whitespaces(). The result is kept in $this->_plain_content
 * and reused while it is non-empty.
 *
 * @return string
 */
public function getPlainContent()
{
    if (!$this->_plain_content)
    {
        $without_tags = HTML::stripTags($this->_content);
        $this->_plain_content = HTML::clear_whitespaces($without_tags);
    }

    return $this->_plain_content;
}
/**
 * Meta tags of the fetched content as returned by HTML::getMetaTags().
 * The result is kept in $this->meta_tags and reused while it is non-empty.
 *
 * @return array
 */
public function getMetaTags()
{
    if ($this->meta_tags)
        return $this->meta_tags;

    $this->meta_tags = HTML::getMetaTags($this->_content);

    return $this->meta_tags;
}
/**
 * Title of the fetched content, taken from HTML::getTextBetweenTags() for
 * the 'title' tag. An array result is joined with single spaces. The value
 * is kept in $this->meta_title and reused while it is non-empty.
 *
 * @return string
 */
public function getTitle()
{
    if ($this->meta_title)
        return $this->meta_title;

    $found = HTML::getTextBetweenTags($this->_content,'title');

    if (is_array($found))
        $found = implode (' ', $found);

    $this->meta_title = $found;

    return $this->meta_title;
}
159 | }
160 |
161 | ?>
162 |
--------------------------------------------------------------------------------
/libs/cronmanager.php:
--------------------------------------------------------------------------------
1 | cron_process_index;
38 | }
39 |
/**
 * Singleton constructor: claims the first free process slot.
 *
 * Slots are numbered 1..$max_proccesses. Slot N is represented by the file
 * WRITABLE_PATH/<md5("$run_id-N")>.run.pid; a slot is free when that file
 * does not exist. The file is created with fopen() mode 'x', which fails if
 * the file already exists, so two processes starting at the same moment can
 * never claim the same slot (the previous is_file() + file_put_contents()
 * sequence allowed that).
 *
 * Exits with status 2 when every slot is taken or when the pid file cannot
 * be created. On success a shutdown handler is registered that removes the
 * pid file again.
 *
 * NOTE(review): nothing here detects a pid file left behind by a process
 * that died without running its shutdown handler; such a file keeps its
 * slot occupied until it is deleted by hand.
 *
 * @param string $run_id         prefix the slot file names are derived from
 * @param int    $max_proccesses number of slots
 */
private function __construct($run_id='cron',$max_proccesses=1)
{
    $slot_claimed = false;

    for($spid = 1; $spid <= $max_proccesses ; $spid++)
    {
        $this->RUN_ID = md5("$run_id-$spid");
        $pid_file = WRITABLE_PATH . "/{$this->RUN_ID}.run.pid";

        // Slot already held by another process
        if(is_file($pid_file))
            continue;

        // Atomic "create only if missing"
        $handle = @fopen($pid_file, 'x');

        if($handle === false)
        {
            // Either another process created the file between the check and
            // the fopen() (try the next slot), or the file cannot be created
            clearstatcache();
            if(is_file($pid_file))
                continue;

            _w('ERROR !!! Cannot create pid file ! ' . 'Cannot create file');
            exit(2);
        }

        fwrite($handle, 'x'.getmypid());
        fclose($handle);

        $this->cron_process_index = $spid;
        $slot_claimed = true;
        break;
    }

    if(!$slot_claimed)
    {
        _w('Cron alerady running ... Wait for next time');
        exit(2);
    }

    $this->init = TRUE;

    register_shutdown_function(array($this, 'callRegisteredShutdown'));
}
/**
 * Shutdown handler: removes the pid file of this process.
 *
 * Does nothing except log when the instance never finished initialising.
 * A pid file that is still present after unlink() is reported through _w();
 * the exception used for that is caught here and never reaches the caller.
 */
public function callRegisteredShutdown()
{
    if($this->init)
    {
        try
        {
            _w('shutting down');

            $pid_file = WRITABLE_PATH . "/{$this->RUN_ID}.run.pid";
            unlink($pid_file);

            if(is_file($pid_file))
                throw new Exception('Cannot REMOVE file');
        }
        catch(Exception $ex)
        {
            _w('ERROR !!! Cannot REMOVE pid file ! ' . $ex->getMessage() );
        }

        return;
    }

    _w('shutting down - no need');
}
/**
 * Returns the shared CronManager instance, creating it on first use.
 * The arguments only matter for the call that creates the instance.
 *
 * @param string $run_id
 * @param int $max_processes
 * @return CronManager
 */
private static function getSingleton($run_id='cron',$max_processes=1)
{
    if(self::$singleton)
        return self::$singleton;

    self::$singleton = new CronManager($run_id,$max_processes);

    return self::$singleton;
}
121 |
/**
 * Makes sure the WRITABLE_PATH directory exists.
 *
 * A missing directory is logged and created with mode 0777. If it still
 * does not exist afterwards, the script exits with status 2.
 */
public static function check_configs()
{
    if(is_dir(WRITABLE_PATH))
        return;

    _w('No WRITABLE dir : ' . WRITABLE_PATH);

    try
    {
        @mkdir(WRITABLE_PATH, 0777);
    }
    catch(Exception $ex)
    {
        _w('ERROR !!!' . $ex->getMessage());
    }

    // Creation worked
    if(is_dir(WRITABLE_PATH))
        return;

    _w('ERROR !!! Cannot Create dir !');
    exit(2);
}
146 |
/**
 * Entry point: checks the writable directory, then returns the shared
 * CronManager instance.
 *
 * @param string $_prefix       passed on as the run id
 * @param int    $max_processes passed on as the number of process slots
 * @return CronManager
 */
public static function init($_prefix='cron',$max_processes=1)
{
    self::check_configs();

    $manager = self::getSingleton($_prefix,$max_processes);

    return $manager;
}
160 |
161 | ?>
162 |
--------------------------------------------------------------------------------
/libs/html.php:
--------------------------------------------------------------------------------
1 | "]*)"?[\s]*' . 'content="?([^>"]*)"?[\s]*[\/]?[\s]*>/si', $html, $matches);
48 | if(count($matches[0])>0)
49 | return array_combine($matches[1], $matches[2]);
50 | else
51 | return array();
52 |
53 | }
54 |
/**
 * Returns the inner text of the first <$tagname>...</$tagname> element.
 *
 * The match is case-insensitive, may span several lines and accepts
 * attributes on the opening tag (e.g. <TITLE lang="en">).
 *
 * @param string $string  markup to search
 * @param string $tagname tag name without angle brackets, e.g. 'title'
 * @return string|array the inner text, or an empty array when the tag is
 *                      not found
 */
public static function getTextBetweenTags($string, $tagname)
{
    // Quote the name so that it cannot inject regex syntax
    $tag = preg_quote($tagname, '/');
    $pattern = '/<' . $tag . '(?:\s[^>]*)?>(.*?)<\/' . $tag . '\s*>/is';

    if (preg_match($pattern, $string, $matches))
        return $matches[1];

    return array();
}
/**
 * Getting plain text from html.
 *
 * Removes <script> and <style> blocks together with their content, replaces
 * the remaining tags and a fixed set of HTML entities with spaces (quot and
 * amp become " and &), then passes the result through
 * HTML::clear_whitespaces() and strip_tags().
 *
 * @param string $content
 * @return string
 */
public static function stripTags($content)
{
    // The patterns below no longer use the /e (eval) modifier: it was
    // removed in PHP 7, where a single pattern carrying it makes
    // preg_replace() fail and return NULL for the whole call.
    $search = array (
        '~<script[^>]*?>.*?</script>~si',   // Strip out javascript
        '~<style[^>]*?>.*?</style>~si',     // Strip out style sheets
        '~<[/!]*?[^<>]*?>~si',              // Strip out HTML tags
        '~([\r\n])[\s]+~',                  // Strip out white space
        '~&(quot|#34);~i',                  // Replace HTML entities
        '~&(amp|#38);~i',
        '~&(lt|#60);~i',
        '~&(gt|#62);~i',
        '~&(nbsp|#160);~i',
        '~&(iexcl|#161);~i',
        '~&(cent|#162);~i',
        '~&(pound|#163);~i',
        '~&(copy|#169);~i',
        '~&#(\d+);~'                        // Any other numeric entity
    );

    $replace = array (
        " ",
        " ",
        " ",
        " ",
        " \" ",
        " & ",
        " ",
        " ",
        " ",
        ' ',
        ' ',
        ' ',
        ' ',
        ' '
    );

    $plain = preg_replace($search, $replace, $content);

    // preg_replace() returns NULL on a PCRE error (e.g. backtrack limit);
    // fall back to the raw content so strip_tags() still removes the tags
    if ($plain === null)
        $plain = $content;

    return strip_tags ( HTML::clear_whitespaces($plain) );
}
/**
 * Collects every <$tag ...>inner</$tag> element of $html that carries the
 * attribute $key_attribute.
 *
 * Matching is case-insensitive, works across line breaks and is non-greedy,
 * so several tags on the same line are returned separately. When the same
 * attribute value occurs more than once, the last element wins.
 *
 * @param string $tag           tag name, e.g. 'a'
 * @param string $key_attribute attribute whose value becomes the array key, e.g. 'href'
 * @param string $html          markup to scan
 * @return array attribute value => inner HTML of the element
 */
public static function extract_tag_values($tag,$key_attribute,$html)
{
    $matches = array();

    // Quote both names so that they cannot inject regex syntax
    $tagQ = preg_quote($tag, '#');
    $attrQ = preg_quote($key_attribute, '#');

    // \b keeps "<a" from matching "<abbr"; [^>]* keeps every part of the
    // match inside a single opening tag
    $reg = '#<' . $tagQ . '\b[^>]*?\s' . $attrQ . '\s*=\s*("|\')?([^"\'>]+)[^>]*>(.*?)</' . $tagQ . '\s*>#is';

    if (preg_match_all($reg, $html, $matches))
        return array_combine($matches[2], $matches[3]);

    return array();
}
120 | /**
121 | * getting all