├── do_it.sh
├── libs
    ├── general.php
    ├── config.php
    ├── cachemanager.php
    ├── languages.php
    ├── urls.php
    ├── robots.php
    ├── contentanalyzer.php
    ├── cronmanager.php
    ├── html.php
    ├── providers.php
    └── db.php
├── README.md
├── crowle.php
└── sql
    └── schema_create.sql


/do_it.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | while true 
3 | do 
4 |     php crowle.php& 
5 |     sleep 5
6 | done
7 | 


--------------------------------------------------------------------------------
/libs/general.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | define('ROOT_PATH', dirname(dirname(__FILE__)));
 3 | define('WRITABLE_PATH', ROOT_PATH . '/writable');
 4 | 
 5 | // Auto loader: maps Some_Class to libs/some/class.php; classes without a file are skipped quietly
 6 | spl_autoload_register(function ($class) {
 7 |     $file = ROOT_PATH . '/libs/' . str_replace('_', DIRECTORY_SEPARATOR, strtolower($class)) . '.php';
 8 |     if (is_file($file)) include $file;
 9 | });
10 | 
11 | 
12 | // Write out
13 | function _w($t,$show_styled=true)
14 | {
15 |     // Styled lines carry the worker slot number and a day/time stamp; raw text passes through
16 |     $line = $show_styled
17 |         ? 'CRON#' . CronManager::getCronProcessIndex() . ' [' . date('d H:i:s') . "] \t - $t.\n"
18 |         : $t;
19 |     // STDOUT is defined by the CLI SAPI; elsewhere fall back to echo
20 |     if (!defined('STDOUT')) {
21 |         echo $line;
22 |         return;
23 |     }
24 |     fwrite(STDOUT, $line);
25 | }
26 | 
27 | // Identify the bot on requests made through PHP's own HTTP stream wrapper
28 | ini_set('user_agent', sprintf('%s (%s)', Config::$agent_name, Config::$agent_host));
29 | 


--------------------------------------------------------------------------------
/libs/config.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | /**
 3 |  * Static crawler configuration: the bot's user-agent identity and the DB connections.
 4 |  * NOTE(review): the values below look like placeholders - replace them before running.
 5 |  * @author Yehuda Daniel Korotkin
 6 |  */
 7 | class Config {
 8 |     /**
 9 |      * Agent name; part of the user_agent string and the name checked against robots.txt
10 |      * @var string
11 |      */
12 |     public static $agent_name = 'my_php_bot';
13 |     /**
14 |      * Agent host; appended in parentheses to the user_agent string
15 |      * @var string
16 |      */
17 |     public static $agent_host = 'http://changeme.com';
18 |     /**
19 |      * DB connection settings keyed by role ('read', 'write', 'fulltext-write')
20 |      * @var array each entry holds connection_string, username, password and port
21 |      */
22 |     public static $db_configs = array(
23 |         'read'=>array(
24 |             'connection_string'=>'mysql:host=127.0.0.1;dbname=crowler_db',
25 |             'username'=>'root',
26 |             'password'=>'$%4 my top secret password',
27 |             'port'=>'3306'
28 |         ),
29 |         'write'=>array(
30 |             'connection_string'=>'mysql:host=127.0.0.1;dbname=crowler_db',
31 |             'username'=>'root',
32 |             'password'=>'3@3 my top secret password',
33 |             'port'=>'3306'
34 |         ),
35 |         'fulltext-write'=>array(
36 |             'connection_string'=>'mysql:host=127.0.0.1;dbname=crowler_db',
37 |             'username'=>'root',
38 |             'password'=>'@#@ my top secret password',
39 |             'port'=>'3306'
40 |         )
41 |     );
42 | }
43 | 
44 | 
45 | 
46 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | PHP-Crawler
 2 | ===========
 3 | 
 4 | PHP crawler and spider. Works with UTF-8 and MySQL, crawls hosts in random order, supports robots.txt, and many more surprises
 5 | 
 6 | 
 7 | Install It
 8 | ==========
 9 | 0. `sudo apt-get install curl php5-curl php5` - make sure these packages are installed
10 | 1. In the /sql folder you will find the `schema_create.sql` file; run it against your MySQL server
11 | 2. In the /libs folder you will find `config.php`; configure it as well
12 | 3. Give read + write + delete permissions to the `/writable` folder
13 | 
14 | RUN IT
15 | ======
16 | 
17 | There are two ways to run it.
18 | To run it with multiple processes (the limit is configured in crowle.php; the default is 4 processes)
19 | 
20 | 	`sh do_it.sh` 
21 | 
22 | To run a single process
23 | 	
24 | 	`php crowle.php`
25 | 
26 | To Add new URL
27 | ==============
28 | Create php file and run
29 | Method 1 :
30 | 
31 | 	<?php
32 | 	
33 | 	include 'libs/general.php';
34 | 	
35 | 	$urlArray=array(
36 | 		'http://some-url-1.com/'=>"some url 1 description",
37 | 		'http://some-url-2.com/'=>"some url 2 description"
38 | 	);
39 | 	
40 | 	Providers::insert_url_list($urlArray);
41 | 	
42 | 	?>
43 | 
44 | Method 2 :
45 | 
46 | 	<?php	
47 | 	include 'libs/general.php';
48 | 	$temp = Providers::get_or_create_url_by_url("http://some-url-1.com/");
49 | 	$temp = Providers::get_or_create_url_by_url("http://some-url-2.com/");
50 | 	?>
51 | 
52 | Stay in contact
53 | ===============
54 | 
55 | http://www.korotkin.co.il/
56 | - or - 
57 | info@korotkin.co.il
58 | 
59 | Hope it fine :-)
60 | Give me feedback !
61 | 


--------------------------------------------------------------------------------
/crowle.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | /**
 3 |  * BSD(not a licence) - it's Be Siaat haDishmaya (Translation: With god's help)
 4 |  * 
 5 |  * Created by : Yehuda (Daniel) Korotkin
 6 |  * 
 7 |  * 
 8 |  */
 9 | include 'libs/general.php';
10 | 
11 | // Claim one of at most 4 worker slots; the script exits here when all are taken
12 | CronManager::init(__FILE__, 4);
13 | 
14 | try {
15 |     // Fetch the current batch of lead URLs
16 |     $leads = Providers::get_lead_urls();
17 |     _w('Got ' . count($leads) . ' urls');
18 | 
19 |     foreach ($leads as $lead) {
20 |         // Each lead is passed to urls::create_url() to get its URL string
21 |         $address = urls::create_url($lead);
22 |         _w('Getting url ' . $address);
23 | 
24 |         // getAnalyzer() returns false for URLs it rejected (robots.txt, no content)
25 |         $analyzer = ContentAnalyzer::getAnalyzer($address);
26 |         if (!$analyzer) {
27 |             _w('ignored');
28 |             continue;
29 |         }
30 | 
31 |         _w('Createing general CA data');
32 |         Providers::create_search_item($analyzer);
33 | 
34 |         _w('setting status to indexed');
35 |         Providers::change_url_status($address, Providers::URLS_TYPE_INDEXED);
36 | 
37 |         _w('inserting all other urls');
38 |         Providers::insert_url_list($analyzer->getLinks());
39 |     }
40 | } catch (Exception $ex) {
41 |     // Any exception ends the whole batch; it is reported and the script finishes normally
42 |     _w('WAS ERROR !!! ' . $ex->getMessage());
43 | }
44 | 
45 | _w('Done for now');
46 | unset($leads);
59 |        


--------------------------------------------------------------------------------
/libs/cachemanager.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | /**
 3 |  * Named, in-process key/value caches: each instance name owns a separate value array.
 4 |  * @author danniel
 5 |  */
 6 | class CacheManager {
 7 |     /**
 8 |      * Instances created so far, keyed by instance name
 9 |      * @var array
10 |      */
11 |     private static $_instances = array();
12 |     /**
13 |      * Name this instance was registered under
14 |      * @var string
15 |      */
16 |     private $instance_name = '';
17 |     /**
18 |      * Cached values keyed by the string form of their name
19 |      * @var array
20 |      */
21 |     private $_cache = array();
22 |     /**
23 |      * Singleton: instances are only created through getInstance()
24 |      * @param string $instance_name
25 |      */
26 |     private function __construct($instance_name)
27 |     {
28 |         $this->instance_name = $instance_name;
29 |     }
30 |     /**
31 |      * Singleton: cloning would break the one-instance-per-name rule
32 |      */
33 |     private function __clone() { }
34 |     /**
35 |      * Get (creating it on first use) the cache registered under a name
36 |      * @param string $instance_name
37 |      * @return CacheManager
38 |      */
39 |     public static function getInstance($instance_name = 'default')
40 |     {
41 |         if(!isset(self::$_instances[$instance_name]))
42 |             self::$_instances[$instance_name] = new CacheManager($instance_name);
43 | 
44 |         return self::$_instances[$instance_name];
45 |     }
46 |     /**
47 |      * Store a value. The name is cast to string exactly as get() does, so
48 |      * non-string names (floats, false) are read back under the same key.
49 |      * @param string $name
50 |      * @param mixed $value
51 |      */
52 |     public function set($name,$value)
53 |     {
54 |         $this->_cache[(string)$name] = $value;
55 |     }
56 |     /**
57 |      * Fetch a value
58 |      * @param string $name
59 |      * @return mixed the stored value, or null when nothing (or null) is stored
60 |      */
61 |     public function get($name)
62 |     {
63 |         $key = (string)$name;
64 |         return isset($this->_cache[$key]) ? $this->_cache[$key] : null;
65 |     }
66 | }
67 | 
68 | ?>
69 | 


--------------------------------------------------------------------------------
/sql/schema_create.sql:
--------------------------------------------------------------------------------
 1 | -- Create the schema and select it so the tables below are created inside it
 2 | CREATE DATABASE `crowler_db` /*!40100 DEFAULT CHARACTER SET utf8 */;
 3 | USE `crowler_db`;
 4 | 
 5 | delimiter $$
 6 | 
 7 | -- One row per crawled origin (https flag + host + port)
 8 | CREATE TABLE `hosts` (
 9 |   `id` bigint(20) NOT NULL AUTO_INCREMENT,
10 |   `https` tinyint(1) NOT NULL DEFAULT '0',
11 |   `host` varchar(255) NOT NULL,
12 |   `port` int(6) NOT NULL DEFAULT '80',
13 |   PRIMARY KEY (`id`),
14 |   -- `uniq` also serves lookups on (https), (https,host) and (https,host,port)
15 |   UNIQUE KEY `uniq` (`https`,`host`,`port`),
16 |   -- `host_port` also serves lookups on (host) alone
17 |   KEY `host_port` (`host`,`port`)
18 | ) ENGINE=MyISAM AUTO_INCREMENT=0 DEFAULT CHARSET=utf8$$
19 | 
20 | 
21 | CREATE TABLE `urls` (
22 |   `id` bigint(20) NOT NULL AUTO_INCREMENT,
23 |   `host_id` bigint(20) NOT NULL,
24 |   `path` varchar(255) NOT NULL,
25 |   `get_params` varchar(255) DEFAULT NULL,
26 |   `type` enum('lead','blocked','indexed','robots_not_allowed','error_no_data') NOT NULL DEFAULT 'lead',
27 |   `created_at` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,
28 |   `updated_at` datetime DEFAULT NULL,
29 |   `priority` int(11) DEFAULT '1000',
30 |   PRIMARY KEY (`id`),
31 |   KEY `idx_host` (`host_id`),                 -- join column used by the random_leads view
32 |   KEY `idx_type_priority` (`type`,`priority`) -- random_leads filters on type and orders by priority
33 | ) ENGINE=InnoDB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8$$
34 | 
35 | CREATE TABLE `url_data` (  -- text stored per URL
36 |   `id` bigint(20) NOT NULL AUTO_INCREMENT,
37 |   `url_id` bigint(20) DEFAULT NULL,  -- presumably references urls.id (no FK is declared) - TODO confirm
38 |   `text` text,
39 |   PRIMARY KEY (`id`),
40 |   UNIQUE KEY `idx_url` (`url_id`)  -- at most one row per non-NULL url_id
41 | ) ENGINE=InnoDB AUTO_INCREMENT=0 DEFAULT CHARSET=utf8$$
42 | 
43 | 
44 | CREATE ALGORITHM=UNDEFINED  SQL SECURITY DEFINER VIEW `random_leads` AS select  -- up to 500 'lead' URLs joined with their host parts
45 |     u.id AS url_id,
46 |     h.id AS host_id,
47 |     h.https AS https,
48 |     h.host AS host,
49 |     h.port AS port,
50 |     u.path AS path,
51 |     u.get_params AS get_params
52 | from
53 |     (urls u
54 |     join hosts h ON ((u.host_id = h.id)))
55 | where
56 |     (u.type = 'lead')  -- only URLs still in the 'lead' state
57 | order by u.priority , rand()  -- lowest priority value first, random order among equal priorities
58 | limit 500$$
59 | 
60 | 


--------------------------------------------------------------------------------
/libs/languages.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | /**
 3 |  * Language / character-set manipulation class
 4 |  *
 5 |  * @author Yehuda Daniel Korotkin
 6 |  */
 7 | class Languages {
 8 |     /**
 9 |      * Execute a curl handle and return its body converted to UTF-8.
10 |      *
11 |      * A non-string curl_exec() result (false on failure, true when
12 |      * CURLOPT_RETURNTRANSFER is off) is returned unchanged. The body is also
13 |      * returned as received when no charset can be determined, when it is
14 |      * already UTF-8, or when iconv() cannot convert from the detected charset.
15 |      *
16 |      * @param resource $ch
17 |      * @return string|bool
18 |      */
19 |     public static function curl_exec_utf8($ch)
20 |     {
21 |         $data = curl_exec($ch);
22 | 
23 |         if (!is_string($data)) return $data;
24 | 
25 |         $charset = self::detect_charset((string)curl_getinfo($ch, CURLINFO_CONTENT_TYPE), $data);
26 | 
27 |         /* Convert it if it is anything but UTF-8 */
28 |         if ($charset === null || in_array(strtoupper($charset), array('UTF-8', 'UTF8'), true))
29 |             return $data;
30 | 
31 |         // iconv() returns false on an unknown charset name or undecodable
32 |         // input; keep the raw body instead of discarding the whole page
33 |         $converted = @iconv($charset, 'UTF-8', $data);
34 | 
35 |         return ($converted === false) ? $data : $converted;
36 |     }
37 |     /**
38 |      * Determine the charset of a response. Sources are tried in order:
39 |      * Content-Type header, <meta> declarations, XML prolog, strict
40 |      * mb_detect_encoding(), and finally ISO-8859-1 for text/html.
41 |      *
42 |      * @param string $content_type value of the Content-Type header ('' if none)
43 |      * @param string $data         response body
44 |      * @return string|null charset name, or null when none applies
45 |      */
46 |     private static function detect_charset($content_type, $data)
47 |     {
48 |         $matches = null;
49 | 
50 |         /* 1: HTTP header, 2: <meta http-equiv>, 3: HTML5 <meta charset>, 4: XML prolog */
51 |         $probes = array(
52 |             array('@([\w/+]+)(;\s*charset=(\S+))?@i', $content_type, 3),
53 |             array('@<meta\s+http-equiv="Content-Type"\s+content="([\w/]+)(;\s*charset=([^\s"]+))?@i', $data, 3),
54 |             array('@<meta\s+charset=["\']?([^\s"\'/>]+)@i', $data, 1),
55 |             array('@<\?xml.+encoding="([^\s"]+)@si', $data, 1),
56 |         );
57 |         foreach ($probes as $probe) {
58 |             if (preg_match($probe[0], $probe[1], $matches) && isset($matches[$probe[2]])) {
59 |                 // header values may be quoted or end in ';' - e.g. charset="utf-8";
60 |                 $name = trim($matches[$probe[2]], " \t\"';");
61 |                 if ($name !== '') return $name;
62 |             }
63 |         }
64 | 
65 |         /* 5: PHP's heuristic detection (strict: only report encodings the data is valid in) */
66 |         $encoding = mb_detect_encoding($data, mb_detect_order(), true);
67 |         if ($encoding) return $encoding;
68 | 
69 |         /* 6: Default for HTML */
70 |         if (strpos($content_type, 'text/html') === 0) return 'ISO-8859-1';
71 | 
72 |         return null;
73 |     }
74 | }
67 | 
68 | ?>
69 | 


--------------------------------------------------------------------------------
/libs/urls.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | /**
  3 |  * URL building helpers
  4 |  *
  5 |  * @author Yehuda Daniel Korotkin
  6 |  */
  7 | class urls {
  8 |     /**
  9 |      * Build a URL string from its parts.
 10 |      *
 11 |      * Recognised keys: 'https' (truthy selects the https scheme), 'host',
 12 |      * 'port' (left out of the result when it is 80), 'path' (defaults to '/')
 13 |      * and 'get_params' (query string without the leading '?').
 14 |      *
 15 |      * @param array $array
 16 |      * @return string
 17 |      */
 18 |     public static function create_url($array)
 19 |     {
 20 |         return (isset($array['https'])&&$array['https']==true?'https':'http')
 21 |             .'://' . $array['host'] . (isset($array['port'])&&$array['port']!=80?':'.$array['port']:'')
 22 |             . (isset($array['path'])?$array['path']:'/')
 23 |             . (isset($array['get_params'])&&$array['get_params']?'?'.$array['get_params']:'');
 24 |     }
 25 |     /**
 26 |      * From list as
 27 |      *  array('url'=>'some name','url2'=>'some name');
 28 |      * to
 29 |      *  real full url list
 30 |      *
 31 |      * Keys are hrefs as found on the page $real_url. Absolute http(s) links
 32 |      * are kept as they are; protocol-relative, root-relative and
 33 |      * document-relative links are resolved against $real_url ('..' segments
 34 |      * are not collapsed). Empty hrefs, fragment-only links and links with
 35 |      * any other scheme (javascript:, mailto:, tel:, ...) are dropped.
 36 |      *
 37 |      * @param array $url_list
 38 |      * @param string $real_url
 39 |      * @return array full url => name
 40 |      */
 41 |     public static function create_full_url_list($url_list,$real_url)
 42 |     {
 43 |         $realUrl = parse_url($real_url);
 44 | 
 45 |         // scheme://host[:port] of the page the links were found on
 46 |         $origin = $realUrl['scheme'] . '://' . $realUrl['host']
 47 |                 . (isset($realUrl['port']) ? ':' . $realUrl['port'] : '');
 48 | 
 49 |         // directory part of the page path without trailing slash ('' at the root)
 50 |         $hasPath = isset($realUrl['path']);
 51 |         $dir = '';
 52 |         if($hasPath)
 53 |         {
 54 |             $segments = explode('/', $realUrl['path']);
 55 |             array_pop($segments);
 56 |             $dir = implode('/', $segments);
 57 |         }
 58 | 
 59 |         $newArr = array();
 60 | 
 61 |         foreach ($url_list as $_key=>$_value)
 62 |         {
 63 |             $key =   HTML::convert_spatial_symbols(HTML::clear_whitespaces($_key));
 64 |             $value = HTML::convert_spatial_symbols(HTML::clear_whitespaces($_value));
 65 | 
 66 |             // empty hrefs and fragment-only links point back at the same page
 67 |             if($key === '' || $key[0] === '#') continue;
 68 | 
 69 |             // absolute http(s) links: add as-it-is to list
 70 |             if(preg_match('@^https?://@i', $key))
 71 |             {
 72 |                 $newArr[$key]=$value;
 73 |                 continue;
 74 |             }
 75 | 
 76 |             // any other scheme (javascript:, mailto:, tel:, ftp:, data: ...) is not crawlable
 77 |             if(preg_match('@^[a-z][a-z0-9+.\-]*:@i', $key)) continue;
 78 | 
 79 |             // protocol-relative link: inherit the scheme of the current page
 80 |             if(substr($key, 0, 2)=='//')
 81 |             {
 82 |                 $newArr[$realUrl['scheme'] . ':' . $key]=$value;
 83 |                 continue;
 84 |             }
 85 | 
 86 |             // relative url from ROOT
 87 |             if($key[0]=='/')
 88 |             {
 89 |                 $newArr[$origin . $key]=$value;
 90 |                 continue;
 91 |             }
 92 | 
 93 |             // '..' cannot climb above the root when the page url has no path
 94 |             if(!$hasPath && substr($key, 0, 2)=='..') continue;
 95 | 
 96 |             $newArr["$origin$dir/$key"]=$value;
 97 |         }
 98 | 
 99 |         return $newArr;
100 |     }
101 | }
104 | 
105 | ?>
106 | 


--------------------------------------------------------------------------------
/libs/robots.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | /**
  3 |  * robots.txt fetching and rule evaluation
  4 |  *
  5 |  * @author danniel
  6 |  */
  7 | class Robots {
  8 |     /**
  9 |      * Per-host cache for the current process: host => array of robots.txt
 10 |      * lines, or false when the host has no usable robots.txt
 11 |      * @var array
 12 |      */
 13 |     protected static $robots = array();
 14 |     /**
 15 |      * Check if robots allowed
 16 |      *
 17 |      * The longest matching Allow/Disallow rule wins; on equal length Allow
 18 |      * wins. Hosts without a robots.txt (or a non-200 answer) allow everything.
 19 |      * Only the path component of $url is tested against the rules.
 20 |      *
 21 |      * @see http://www.the-art-of-web.com/php/parse-robots/
 22 |      * @param string $url absolute URL to test
 23 |      * @param string|boolean $useragent agent name honoured in addition to '*'
 24 |      * @return boolean
 25 |      */
 26 |     public static function robots_allowed($url, $useragent=false)
 27 |     {
 28 |         // parse url to retrieve host and path
 29 |         $parsed = parse_url($url);
 30 |         if(!is_array($parsed) || empty($parsed['host'])) return true;
 31 | 
 32 |         // a URL such as http://host has no path component: test the root
 33 |         $path = isset($parsed['path']) ? $parsed['path'] : '/';
 34 | 
 35 |         $robotstxt = self::load_robots($parsed['host']);
 36 | 
 37 |         // if there isn't a robots, then we're allowed in
 38 |         if(empty($robotstxt)) return true;
 39 | 
 40 |         $agents = array(preg_quote('*', '/'));
 41 |         if($useragent) $agents[] = preg_quote($useragent, '/');
 42 | 
 43 |         $isAllowed = true;
 44 |         $cu_st = 0;
 45 | 
 46 |         foreach(self::parse_rules($robotstxt, implode('|', $agents)) as $rule)
 47 |         {
 48 |             // check if page hits on a rule
 49 |             if(!preg_match("/^{$rule['match']}/", $path)) continue;
 50 | 
 51 |             // prefer longer (more specific) rules and Allow trumps Disallow if rules same length
 52 |             $strength = strlen($rule['match']);
 53 | 
 54 |             if($cu_st < $strength)
 55 |             {
 56 |                 $cu_st = $strength;
 57 |                 $isAllowed = ($rule['type'] == 'allow');
 58 |             }
 59 |             elseif($cu_st == $strength && $rule['type'] == 'allow')
 60 |             {
 61 |                 $isAllowed = true;
 62 |             }
 63 |         }
 64 | 
 65 |         return $isAllowed;
 66 |     }
 67 |     /**
 68 |      * Lines of http://<host>/robots.txt, fetched once per host and process.
 69 |      * Negative results are cached too, so a host without robots.txt is not
 70 |      * requested again for every one of its URLs.
 71 |      * @param string $host
 72 |      * @return array|boolean lines, or false when unavailable
 73 |      */
 74 |     private static function load_robots($host)
 75 |     {
 76 |         if(array_key_exists($host, self::$robots))
 77 |             return self::$robots[$host];
 78 | 
 79 |         $robotstxt = false;
 80 | 
 81 |         // only pay attention to robots.txt if the server says it exists
 82 |         if(function_exists('curl_init'))
 83 |         {
 84 |             $handle = curl_init("http://{$host}/robots.txt");
 85 |             curl_setopt_array($handle, array(
 86 |                 CURLOPT_RETURNTRANSFER => TRUE,
 87 |                 CURLOPT_FOLLOWLOCATION => TRUE,   // follow e.g. http -> https redirects
 88 |                 CURLOPT_MAXREDIRS      => 5,
 89 |                 CURLOPT_CONNECTTIMEOUT => 10,     // never let one dead host stall the worker
 90 |                 CURLOPT_TIMEOUT        => 30
 91 |             ));
 92 | 
 93 |             $response = curl_exec($handle);
 94 |             if(is_string($response) && curl_getinfo($handle, CURLINFO_HTTP_CODE) == 200)
 95 |                 $robotstxt = explode("\n", $response);
 96 | 
 97 |             curl_close($handle);
 98 |         }
 99 |         else
100 |         {
101 |             $robotstxt = @file("http://{$host}/robots.txt");
102 |         }
103 | 
104 |         self::$robots[$host] = $robotstxt ? $robotstxt : false;
105 | 
106 |         return self::$robots[$host];
107 |     }
108 |     /**
109 |      * Extract the Allow/Disallow rules of every group whose User-agent matches
110 |      * @param array $lines robots.txt lines
111 |      * @param string $agents regex alternation of quoted agent names
112 |      * @return array list of array('type' => 'allow'|'disallow', 'match' => regex fragment)
113 |      */
114 |     private static function parse_rules($lines, $agents)
115 |     {
116 |         $rules = array();
117 |         $ruleApplies = false;
118 |         $inAgentRun = false;
119 | 
120 |         foreach($lines as $line)
121 |         {
122 |             // strip '#' comments, then skip blank lines
123 |             $hash = strpos($line, '#');
124 |             if($hash !== false) $line = substr($line, 0, $hash);
125 |             $line = trim($line);
126 |             if($line === '') continue;
127 | 
128 |             // following rules only apply if User-agent matches $useragent or '*'
129 |             if(preg_match('/^User-agent\s*:\s*(.*)/i', $line, $found))
130 |             {
131 |                 $hit = (bool)preg_match("/($agents)/i", $found[1]);
132 |                 // consecutive User-agent lines share one group of rules
133 |                 $ruleApplies = $inAgentRun ? ($ruleApplies || $hit) : $hit;
134 |                 $inAgentRun = true;
135 |                 continue;
136 |             }
137 |             $inAgentRun = false;
138 | 
139 |             $rule = explode(':', $line, 2);
140 |             if(!$ruleApplies || count($rule) < 2) continue;
141 | 
142 |             // Sitemap, Crawl-delay etc. are not access rules
143 |             $type = strtolower(trim($rule[0]));
144 |             if($type != 'allow' && $type != 'disallow') continue;
145 | 
146 |             $pattern = str_replace('\*', '.*', preg_quote(trim($rule[1]), '/'));
147 |             // a trailing '$' anchors the rule to the end of the path
148 |             if(substr($pattern, -2) == '\$') $pattern = substr($pattern, 0, -2) . '$';
149 | 
150 |             $rules[] = array('type' => $type, 'match' => $pattern);
151 |         }
152 | 
153 |         return $rules;
154 |     }
155 | }
132 | 
133 | ?>
134 | 


--------------------------------------------------------------------------------
/libs/contentanalyzer.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | /**
  3 |  * Downloads one URL and exposes its links, plain text, meta tags and title
  4 |  *
  5 |  * @author danniel
  6 |  */
  7 | class ContentAnalyzer {
  8 |     /**
  9 |      * Current URL
 10 |      * @var string
 11 |      */
 12 |     protected $_url = '';
 13 |     /**
 14 |      * Full content as returned by Languages::curl_exec_utf8() ('' on failure)
 15 |      * @var string
 16 |      */
 17 |     protected $_content = '';
 18 |     /**
 19 |      * All a links, resolved to full URLs (null until requested)
 20 |      * @see function getLinks()
 21 |      * @var array
 22 |      */
 23 |     private $_all_links = null;
 24 |     /**
 25 |      * Plain text without html tags (null until requested)
 26 |      * @var string
 27 |      */
 28 |     private $_plain_content = null;
 29 |     /**
 30 |      * Metatags container (null until requested)
 31 |      * @var array
 32 |      */
 33 |     private $meta_tags = null;
 34 |     /**
 35 |      * Page title (null until requested)
 36 |      * @var string
 37 |      */
 38 |     private $meta_title = null;
 39 |     /**
 40 |      * Private Constructor - instances come from getAnalyzer()
 41 |      * @param string $url
 42 |      */
 43 |     private function __construct($url)
 44 |     {
 45 |         $this->_url = $url;
 46 |     }
 47 |     /**
 48 |      * Getting Current url id
 49 |      * @return boolean|int false when the lookup yields no record with an 'id'
 50 |      */
 51 |     public function getUrlId()
 52 |     {
 53 |         $url_info = Providers::get_url_by_url($this->_url);
 54 | 
 55 |         return ($url_info && isset($url_info['id'])) ? $url_info['id'] : false;
 56 |     }
 57 |     /**
 58 |      * Build an analyzer for a URL, downloading its content.
 59 |      *
 60 |      * Returns false - after recording the matching status through
 61 |      * Providers::change_url_status() - when robots.txt forbids the URL or
 62 |      * when no content could be downloaded.
 63 |      *
 64 |      * @param string $url
 65 |      * @return boolean|ContentAnalyzer
 66 |      */
 67 |     public static function getAnalyzer($url)
 68 |     {
 69 |         // Check is robots allowed
 70 |         if(!Robots::robots_allowed($url, Config::$agent_name))
 71 |         {
 72 |             Providers::change_url_status($url, Providers::URLS_TYPE_ROBOTS_NOT_ALLOWED);
 73 |             _w('Robots not allowed');
 74 |             return false;
 75 |         }
 76 | 
 77 |         // Create object
 78 |         $obj = new ContentAnalyzer($url);
 79 | 
 80 |         if(!$obj->getCONTENT_DATA())
 81 |         {
 82 |             Providers::change_url_status($url, Providers::URLS_TYPE_ERROR_NO_DATA);
 83 |             return false;
 84 |         }
 85 | 
 86 |         return $obj;
 87 |     }
 88 |     /**
 89 |      * Download the URL into $_content (converted to UTF-8)
 90 |      * @return boolean true when non-blank content was received
 91 |      */
 92 |     protected function getCONTENT_DATA()
 93 |     {
 94 |         $ch = curl_init($this->_url);
 95 | 
 96 |         curl_setopt_array($ch, array(
 97 |             CURLOPT_RETURNTRANSFER => true,
 98 |             CURLOPT_FOLLOWLOCATION => true,
 99 |             // Accept-Encoding, not a charset: '' offers every encoding curl can decode
100 |             CURLOPT_ENCODING => '',
101 |             // the user_agent ini setting is not used by curl, so send it explicitly
102 |             CURLOPT_USERAGENT => Config::$agent_name . ' (' . Config::$agent_host . ')',
103 |             CURLOPT_TIMEOUT=> 30,
104 |             CURLOPT_CONNECTTIMEOUT=>30
105 |         ));
106 | 
107 |         $body = Languages::curl_exec_utf8($ch);
108 |         curl_close($ch);
109 | 
110 |         // curl_exec_utf8() hands back a non-string when the transfer failed
111 |         $this->_content = is_string($body) ? $body : '';
112 | 
113 |         return trim($this->_content)!='';
114 |     }
115 |     /**
116 |      * Extracting links (computed once, also when the page has none)
117 |      * @return array
118 |      */
119 |     public function getLinks()
120 |     {
121 |         if($this->_all_links === null)
122 |             $this->_all_links = urls::create_full_url_list(
123 |                     HTML::extract_links($this->_content),
124 |                     $this->_url);
125 | 
126 |         return $this->_all_links;
127 |     }
128 |     /**
129 |      * Getting plain text without html tags
130 |      * @return string
131 |      */
132 |     public function getPlainContent()
133 |     {
134 |         if($this->_plain_content === null)
135 |             $this->_plain_content = HTML::clear_whitespaces(HTML::stripTags($this->_content));
136 | 
137 |         return $this->_plain_content;
138 |     }
139 |     /**
140 |      * Getting meta tags from content
141 |      * @return array
142 |      */
143 |     public function getMetaTags()
144 |     {
145 |         if($this->meta_tags === null)
146 |             $this->meta_tags = HTML::getMetaTags($this->_content);
147 | 
148 |         return $this->meta_tags;
149 |     }
150 |     /**
151 |      * Extract Title from html
152 |      * @return string '' when the page has no title
153 |      */
154 |     public function getTitle()
155 |     {
156 |         if($this->meta_title === null)
157 |         {
158 |             // getTextBetweenTags() returns an empty array when the tag is missing
159 |             $btgs = HTML::getTextBetweenTags($this->_content,'title');
160 |             $this->meta_title = (is_array($btgs))?implode (' ', $btgs):$btgs;
161 |         }
162 | 
163 |         return $this->meta_title;
164 |     }
165 | }
160 | 
161 | ?>
162 | 


--------------------------------------------------------------------------------
/libs/cronmanager.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | /**
  3 |  * Managing cron service: limits how many worker processes run at once.
  4 |  *
  5 |  * Each worker owns one pid file in WRITABLE_PATH for its lifetime and
  6 |  * removes it from a shutdown handler. A worker that dies without running
  7 |  * its shutdown handlers leaves the file behind, and that slot stays busy
  8 |  * until the file is deleted.
  9 |  *
 10 |  * @author Yehuda Daniel Korotkin
 11 |  */
 12 | class CronManager {
 13 |     /**
 14 |      * singleton object
 15 |      * @var CronManager
 16 |      */
 17 |     private static $singleton = null;
 18 |     /**
 19 |      * md5 of "<run id>-<slot index>"; base name of this worker's pid file
 20 |      * @var string
 21 |      */
 22 |     private $RUN_ID = '';
 23 |     /**
 24 |      * true once this worker holds a pid file
 25 |      * @var boolean
 26 |      */
 27 |     private $init = false;
 28 |     /**
 29 |      * Cron process index (1-based slot number, 0 while no slot is held)
 30 |      * @var int
 31 |      */
 32 |     private $cron_process_index = 0;
 33 |     /**
 34 |      * Cron process index
 35 |      * @return int slot number of this worker, 0 before init()
 36 |      */
 37 |     public static function getCronProcessIndex()
 38 |     {
 39 |         if(!self::$singleton)
 40 |             return 0;
 41 | 
 42 |         return self::$singleton->cron_process_index;
 43 |     }
 44 |     /**
 45 |      * singleton constructor: claims the first free slot, or ends the
 46 |      * process with exit code 2 when all $max_proccesses slots are busy
 47 |      * @param string $run_id
 48 |      * @param int $max_proccesses
 49 |      */
 50 |     private function __construct($run_id='cron',$max_proccesses=1)
 51 |     {
 52 |         for($spid = 1; $spid <= $max_proccesses ; $spid++)
 53 |         {
 54 |             $this->RUN_ID = md5("$run_id-$spid");
 55 | 
 56 |             if($this->claim_slot(WRITABLE_PATH . "/{$this->RUN_ID}.run.pid"))
 57 |             {
 58 |                 $this->cron_process_index = $spid;
 59 |                 $this->init = TRUE;
 60 |                 break;
 61 |             }
 62 |         }
 63 | 
 64 |         if(!$this->init)
 65 |         {
 66 |             _w('Cron alerady running ... Wait for next time');
 67 |             exit(2);
 68 |         }
 69 | 
 70 |         register_shutdown_function(array($this, 'callRegisteredShutdown'));
 71 |     }
 72 |     /**
 73 |      * Try to take one slot by creating its pid file.
 74 |      * Ends the process with exit code 2 when the file cannot be created
 75 |      * for a reason other than the slot being busy.
 76 |      * @param string $pid_file
 77 |      * @return boolean true when the slot now belongs to this process
 78 |      */
 79 |     private function claim_slot($pid_file)
 80 |     {
 81 |         if(is_file($pid_file))
 82 |             return false;
 83 | 
 84 |         // mode 'x' refuses to open an existing file, so testing for a free slot
 85 |         // and taking it is one step and two workers can never share a slot
 86 |         $handle = @fopen($pid_file, 'x');
 87 | 
 88 |         if($handle === false)
 89 |         {
 90 |             clearstatcache();
 91 |             if(is_file($pid_file))
 92 |                 return false;   // another worker took the slot first
 93 | 
 94 |             _w('ERROR !!! Cannot create pid file ! Cannot create file');
 95 |             exit(2);
 96 |         }
 97 | 
 98 |         fwrite($handle, 'x'.getmypid());
 99 |         fclose($handle);
100 | 
101 |         return true;
102 |     }
103 |     /**
104 |      * Shutdown func. - releases the slot by deleting the pid file
105 |      */
106 |     public function callRegisteredShutdown()
107 |     {
108 |         if(!$this->init)
109 |         {
110 |             _w('shutting down - no need');
111 |             return;
112 |         }
113 | 
114 |         _w('shutting down');
115 | 
116 |         $pid_file = WRITABLE_PATH . "/{$this->RUN_ID}.run.pid";
117 |         @unlink($pid_file);
118 |         clearstatcache();
119 | 
120 |         if(is_file($pid_file))
121 |             _w('ERROR !!! Cannot REMOVE pid file ! Cannot REMOVE file');
122 |     }
123 |     /**
124 |      * Geting singleton
125 |      * @param string $run_id
126 |      * @param int $max_processes
127 |      * @return CronManager
128 |      */
129 |     private static function getSingleton($run_id='cron',$max_processes=1)
130 |     {
131 |         if(!self::$singleton)
132 |             self::$singleton = new CronManager($run_id,$max_processes);
133 | 
134 |         return self::$singleton;
135 |     }
136 |     /**
137 |      * Check mandatory configs: makes sure WRITABLE_PATH exists, creating
138 |      * it when missing; ends the process with exit code 2 when it cannot
139 |      */
140 |     public static function check_configs()
141 |     {
142 |         if(is_dir(WRITABLE_PATH))
143 |             return;
144 | 
145 |         _w('No WRITABLE dir : ' . WRITABLE_PATH);
146 | 
147 |         // mkdir() reports failure through its return value, not an exception;
148 |         // the second is_dir() covers another worker creating the dir meanwhile
149 |         if(!@mkdir(WRITABLE_PATH, 0777) && !is_dir(WRITABLE_PATH))
150 |         {
151 |             _w('ERROR !!! Cannot Create dir !');
152 |             exit(2);
153 |         }
154 |     }
155 |     /**
156 |      * init function
157 |      * @param string $_prefix
158 |      * @param int $max_processes
159 |      * @return CronManager
160 |      */
161 |     public static function init($_prefix='cron',$max_processes=1)
162 |     {
163 |         CronManager::check_configs();
164 |         return CronManager::getSingleton($_prefix,$max_processes);
165 |     }
166 | }
160 | 
161 | ?>
162 | 


--------------------------------------------------------------------------------
/libs/html.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | /**
  3 |  * HTML Manipulators
  4 |  *
  5 |  * @author Yehuda Daniel Korotkin
  6 |  */
  7 | class HTML {
  8 |     /**
  9 |      * convert symbols to chars
 10 |      * @param string $string
 11 |      * @return string
 12 |      */
 13 |     public static function convert_spatial_symbols($string)
 14 |     {
 15 |         // entity => literal character; strtr() applies every pair in a single pass
 16 |         static $entity_map = array(
 17 |             '&acute;' => "'",
 18 |             '&quot;'  => '"',
 19 |             '&apos;'  => "'",
 20 |             '&amp;'   => '&',
 21 |         );
 22 | 
 23 |         return strtr($string, $entity_map);
 24 |     }
 30 |     /**
 31 |      * Clear white spaces
 32 |      * @param string  $string
 33 |      * @return string
 34 |      */
 35 |     public static function clear_whitespaces($string)
 36 |     {
 37 |         return trim(preg_replace('/\s+/', ' ', $string));
 38 |     }
 39 |     /**
 40 |      * extract meta tags from html
 41 |      * @param string $html
 42 |      * @return array
 43 |      */
 44 |     public static function getMetaTags($html)
 45 |     {
 46 |         $matches = array();
 47 |         preg_match_all('/<[\s]*meta[\s]*name="?' . '([^>"]*)"?[\s]*' . 'content="?([^>"]*)"?[\s]*[\/]?[\s]*>/si', $html, $matches);        
 48 |         if(count($matches[0])>0)
 49 |             return array_combine($matches[1], $matches[2]);
 50 |         else
 51 |             return array();
 52 |        
 53 |     }
 54 | 
 55 |     public static function getTextBetweenTags($string, $tagname)
 56 |     {
 57 |         $pattern = "/<$tagname>(.*?)<\/$tagname>/";
 58 |         preg_match($pattern, $string, $matches);
 59 |         if(isset($matches[1]))
 60 |             return $matches[1];
 61 |         else
 62 |             return array();
 63 |     }
 64 |     /**
 65 |      * Getting plain text from html
 66 |      * @param string $content
 67 |      * @return string
 68 |      */
 69 |     public static function stripTags($content)
 70 |     {        
 71 |         //return strip_tags ( $content ); 
 72 | 
 73 |         $search = array ("'<script[^>]*?>.*?</script>'si",  // Strip out javascript 
 74 |                  "'<style[^>]*?>.*?</style>'si",
 75 |                  "'<[/!]*?[^<>]*?>'si",          // Strip out HTML tags 
 76 |                  "'([rn])[s]+'",                // Strip out white space 
 77 |                  "'&(quot|#34);'i",                // Replace HTML entities 
 78 |                  "'&(amp|#38);'i", 
 79 |                  "'&(lt|#60);'i", 
 80 |                  "'&(gt|#62);'i", 
 81 |                  "'&(nbsp|#160);'i", 
 82 |                  "'&(iexcl|#161);'i", 
 83 |                  "'&(cent|#162);'i", 
 84 |                  "'&(pound|#163);'i", 
 85 |                  "'&(copy|#169);'i", 
 86 |                  "'&#(d+);'e");                    // evaluate as php 
 87 | 
 88 |         $replace = array (" ", 
 89 |                         " ", 
 90 |                         " ", 
 91 |                         " ", 
 92 |                         " \" ", 
 93 |                         " & ", 
 94 |                         " ", 
 95 |                         " ", 
 96 |                         " ", 
 97 |                         ' ', 
 98 |                         ' ', 
 99 |                         ' ', 
100 |                         ' ', 
101 |                         ' ');                     
102 |         return strip_tags ( HTML::clear_whitespaces(preg_replace($search, $replace, $content)) ); 
103 |     }
104 |     /**
105 |      * 
106 |      * @param string $tag
107 |      * @param string $key_attribute
108 |      * @param string $html
109 |      * @return array
110 |      */
111 |     public static function extract_tag_values($tag,$key_attribute,$html)
112 |     {
113 |         $matches = array();
114 |         $reg = "#<$tag.*$key_attribute\s*=\s*(\"|')?([^\"'>]+).*>(.+)</$tag>#i";
115 |         if (preg_match_all($reg, $html, $matches)) 
116 |             return array_combine($matches[2], $matches[3]);  
117 |         
118 |         return array();        
119 |     }
120 |     /**
121 |      * getting all <IFRAME SRC="{LINK}"></SRC> 
122 |      * @param string $html
123 |      * @return array
124 |      */
125 |     public static function extract_iframe_links($html)
126 |     {
127 |         return HTML::extract_tag_values('iframe', 'src', $html);        
128 |     }
129 |     /**
130 |      * getting all <A HREF="{LINK}">{aa}</A> 
131 |      * @param string $html
132 |      * @return array
133 |      */
134 |     public static function extract_links($html)
135 |     {
136 |         return HTML::extract_tag_values('a', 'href', $html);        
137 |     }
138 | }
139 | 
140 | ?>
141 | 


--------------------------------------------------------------------------------
/libs/providers.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | /**
  3 |  * Description of providers
  4 |  *
  5 |  * @author danniel
  6 |  */
  7 | class Providers {
  8 |     const URLS_TYPE_LEAD = 'lead';
  9 |     const URLS_TYPE_BLOCKED = 'blocked';
 10 |     const URLS_TYPE_INDEXED = 'indexed';
 11 |     const URLS_TYPE_ROBOTS_NOT_ALLOWED = 'robots_not_allowed';
 12 |     const URLS_TYPE_ERROR_NO_DATA = 'error_no_data';
 13 | 
 14 |     /**
 15 |      * Get host by url
 16 |      * @param string $url
 17 |      * @param boolean $force get from database
 18 |      * @return array
 19 |      */
 20 |     public static function get_host_by_url($url,$force=false)
 21 |     {
 22 |         $cache_id = implode('-', db::create_host_params_from_url($url));        
 23 |         if(!CacheManager::getInstance('hosts')->get($cache_id) || $force)
 24 |             CacheManager::getInstance('hosts')
 25 |             ->set ($cache_id, db::get_host_by_url($url));
 26 |         
 27 |         return CacheManager::getInstance('hosts')->get($cache_id);
 28 |     }
 29 |     /**
 30 |      * Get host by url
 31 |      * @param string $url
 32 |      * @param boolean $force get from database
 33 |      * @return array
 34 |      */
 35 |     public static function get_url_by_url($url,$force=false)
 36 |     {
 37 |         $host = self::get_or_create_host_by_url($url);
 38 |         
 39 |         if(!$host || !isset($host['id']) || $host['id'] == 0 )
 40 |             return false;
 41 |         
 42 |         $host_id = intval($host['id']);
 43 |         
 44 |         $cache_id = $host_id.'-'.implode('-', db::create_url_params_from_url($url)); 
 45 |         
 46 |         if(!CacheManager::getInstance('urls')->get($cache_id) || $force)
 47 |             CacheManager::getInstance('urls')
 48 |             ->set ( $cache_id, db::get_url_by_url( $host_id,$url));
 49 |         
 50 |         return CacheManager::getInstance('urls')->get($cache_id);        
 51 |     }    
 52 |     /**
 53 |      * Get or Create Host item by Host Url string
 54 |      * @param string $url
 55 |      * @return array|boolean
 56 |      */
 57 |     public static function get_or_create_host_by_url($url)
 58 |     {
 59 |         $host = self::get_host_by_url($url);        
 60 |         if($host) 
 61 |             return $host;
 62 |         
 63 |         db::create_host($url);        
 64 |         $host = self::get_host_by_url($url,true);
 65 |         
 66 |         if($host) 
 67 |             return $host;        
 68 | 
 69 |         return false;
 70 |     }
 71 |     /**
 72 |      * Get or Create url item by url string
 73 |      * @param string $url
 74 |      * @return array|boolean
 75 |      */
 76 |     public static function get_or_create_url_by_url($url)
 77 |     {
 78 |         if(self::get_url_by_url($url))
 79 |             return self::get_url_by_url($url);
 80 |         
 81 |         
 82 |         
 83 |         $host = self::get_or_create_host_by_url($url);                              
 84 |         db::create_url($host['id'], $url);
 85 |         
 86 |         $ourl = self::get_url_by_url($url,true);        
 87 | 
 88 |         if($ourl) 
 89 |             return $ourl;        
 90 | 
 91 |         return false;
 92 |     }
 93 |    
 94 | 
 95 |     /** 
 96 |      * Getting lead url
 97 |      * @return string
 98 |      */
 99 |     public static function get_lead_urls()
100 |     {
101 |         return db::get_lead_urls();
102 |     }
103 | 
104 |     /**
105 |      * Check is url is on black list
106 |      * @param string $url
107 |      * @return boolean
108 |      */
109 |     public static function isUrlAllowed($url)
110 |     {
111 |         //@TODO: You can do what ever you want
112 |         return true;
113 |     }
114 |     /**
115 |      * @see const URLS_TYPE_*
116 |      * @param string $url
117 |      * @param mixed $status
118 |      */
119 |     public static function change_url_status($url, $status)
120 |     {
121 |         $u = self::get_url_by_url($url);
122 |         
123 |         if(!$u)
124 |             return false;
125 |         
126 |         db::update_url_status($u['id'], $status);
127 |         return self::get_url_by_url($url,true);
128 |     }
129 |     /**
130 |      * 
131 |      * @param array $urls
132 |      */
133 |     public static function insert_url_list($urls)
134 |     {        
135 |         foreach($urls as $url => $desc)
136 |         {            
137 |             // Actualy i dont care about "$desc" :)
138 |             if(self::get_url_by_url($url))
139 |                 continue;
140 |             
141 |             if(Providers::isUrlAllowed($url))
142 |                 self::get_or_create_url_by_url($url);            
143 |         }
144 |     }
145 |     /**
146 |      * Creating search item data
147 |      * @param ContentAnalyzer $ca
148 |      * @return type
149 |      */
150 |     public static function create_search_item(ContentAnalyzer $ca)
151 |     {
152 |         $desc = '';
153 |         $meta =$ca->getMetaTags();
154 |         
155 |         if(isset($meta['description']))
156 |             $desc = (is_array ($meta['description']))?implode (' ', $meta['description']):$meta['description'];
157 |                         
158 |         return db::create_fulltext_item($ca->getUrlId(), $ca->getTitle(), $desc,$ca->getPlainContent());        
159 |     }   
160 | }
161 | 
162 | ?>
163 | 


--------------------------------------------------------------------------------
/libs/db.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | /**
  3 |  * Description of db
  4 |  *
  5 |  * @author danniel
  6 |  */
  7 | class db {
  8 |     /**
  9 |      * Instances
 10 |      * @var array 
 11 |      */
 12 |     private static $_instances = array();
 13 |     /**
 14 |      * Mysqli Driver
 15 |      * @var PDO 
 16 |      */
 17 |     private $_db = null;
 18 |     /**
 19 |      * Configuration name
 20 |      * @var string
 21 |      */
 22 |     private $_config_name='';    
 23 |     /** 
 24 |      * getting instance
 25 |      * @param string $config_name
 26 |      * @return db|null
 27 |      */
 28 |     protected static function getInstance($config_name)
 29 |     {
 30 |         if(isset(db::$_instances[$config_name]))
 31 |             return db::$_instances[$config_name];
 32 |         
 33 |         if(!isset(Config::$db_configs[$config_name]))
 34 |             return null;
 35 |         
 36 |         return db::$_instances[$config_name] = new db($config_name);        
 37 |     }    
 38 |     /**
 39 |      * Singleton
 40 |      * @param string $config_name
 41 |      */
 42 |     private function __construct($config_name) 
 43 |     {
 44 |         $this->_config_name = $config_name;
 45 |     }
 46 |     /** 
 47 |      * Getiing current connection
 48 |      * @return PDO|null
 49 |      */
 50 |     private function getConnection()
 51 |     {
 52 |         if(!$this->_db)
 53 |             if(!$this->load_connection ())
 54 |                 return null;
 55 |         
 56 |         return $this->_db;
 57 |     }
 58 |     /** 
 59 |      * Loading connection
 60 |      * @return boolean
 61 |      */
 62 |     private function load_connection()
 63 |     {
 64 |         if(!isset(Config::$db_configs[$this->_config_name]))
 65 |             return false;
 66 |                 
 67 |         $config = Config::$db_configs[$this->_config_name];
 68 |         
 69 |         $this->_db = new PDO( 
 70 |                         $config['connection_string'], 
 71 |                         $config['username'], 
 72 |                         $config['password'], 
 73 |                         array(PDO::MYSQL_ATTR_INIT_COMMAND => "SET NAMES utf8") 
 74 |         );
 75 |         
 76 |         return true;
 77 |         
 78 | 
 79 |     }
 80 |     /** 
 81 |      * Prepare read sql
 82 |      * @param string $sql
 83 |      * @param array|null $input_parmas
 84 |      * @return PDOStatement
 85 |      */
 86 |     private static function getRead_PrepareSql($sql,$input_parmas=null)
 87 |     {
 88 |         $pdo = self::getInstance('read')->getConnection();        
 89 |         $sth = $pdo->prepare($sql);
 90 |         $sth->execute($input_parmas);
 91 |         return $sth;
 92 |     }
 93 |     /**
 94 |      * Prepare write sql
 95 |      * @param string $sql
 96 |      * @param array|null $input_parmas
 97 |      * @return PDOStatement
 98 |      */
 99 |     private static function getWrite_PrepareSql($sql,$input_parmas=null)
100 |     {
101 |         $pdo = self::getInstance('write')->getConnection();        
102 |         $sth = $pdo->prepare($sql);
103 |         $sth->execute($input_parmas);
104 |         return $sth;
105 |     }
106 |     /**
107 |      * Prepare write fulltext sql
108 |      * @param string $sql
109 |      * @param array|null $input_parmas
110 |      * @return PDOStatement
111 |      */
112 |     private static function getWriteFT_PrepareSql($sql,$input_parmas=null)
113 |     {
114 |         $pdo = self::getInstance('fulltext-write')->getConnection();        
115 |         $sth = $pdo->prepare($sql);
116 |         $sth->execute($input_parmas);
117 |         return $sth;
118 |     }
119 |     /** 
120 |      * Get Leads url
121 |      * @return array
122 |      */
123 |     public static function get_lead_urls()            
124 |     {
125 |         $sql_statement = self::getRead_PrepareSql('SELECT * FROM random_leads');
126 |         return $sql_statement->fetchAll(PDO::FETCH_ASSOC);
127 |     }
128 |     /**
129 |      * get host record by url
130 |      * @param string $url
131 |      * @return array|false
132 |      */
133 |     public static function get_host_by_url($url)
134 |     {
135 |         $params = self::create_host_params_from_url($url);        
136 |         $sql_statement = self::getRead_PrepareSql(
137 |                 'SELECT * 
138 |                     FROM hosts 
139 |                     WHERE 
140 |                         ( https=:https )
141 |                     AND ( port =:port  )
142 |                     AND ( host =:host  )
143 |                  LIMIT 1',$params);
144 |         
145 |         
146 |         return ($sql_statement->fetch(PDO::FETCH_ASSOC));
147 |     }    
148 |     /**
149 |      * Creating params
150 |      * @param string $url
151 |      * @return array
152 |      */
153 |     public static function create_host_params_from_url($url)
154 |     {        
155 |         $parsed_url = parse_url($url);        
156 |         $params = array();
157 |         $params['port']  = (isset($parsed_url['port'])&&$parsed_url['port']!=80) ? $parsed_url['port'] : 80;
158 |         $params['https'] = (isset($parsed_url['scheme'])&&$parsed_url['scheme']=='https')?1:0;
159 |         $params['host']  = $parsed_url['host'];
160 |         return $params;
161 |     }
162 |     /**
163 |      * Creating params
164 |      * @param string $url
165 |      * @return array
166 |      */
167 |     public static function create_url_params_from_url($url)
168 |     {        
169 |         $parsed_url = parse_url($url);        
170 |         $params = array();
171 |         $params['path']  = (isset($parsed_url['path'])) ? $parsed_url['path'] : '';
172 |         $params['query']  = (isset($parsed_url['query'])) ? $parsed_url['query'] : null;
173 |         
174 |         return $params;
175 |     }
176 |     /**
177 |      * 
178 |      * @param int $host_id
179 |      * @param type $url
180 |      */
181 |     public static function get_url_by_url($host_id, $url)
182 |     {
183 |         $params = self::create_url_params_from_url($url);    
184 |         $params['host_id']=$host_id;
185 |         
186 |         $params_EQ = '';
187 |         if(isset($params['query']) && $params['query'] )
188 |         {
189 |             $params_EQ = "AND get_params = :query ";
190 |         }
191 |         else
192 |         {
193 |             $params_EQ = "AND get_params IS NULL ";
194 |             unset($params['query']);
195 |         }
196 |             
197 |         $sql_statement = self::getRead_PrepareSql(
198 |                 "SELECT * 
199 |                     FROM urls 
200 |                     WHERE 
201 |                         ( host_id    = :host_id )
202 |                     AND ( path       = :path  )
203 |                     $params_EQ
204 |                     
205 |                  LIMIT 1",$params);     
206 | 
207 |         return ($sql_statement->fetch(PDO::FETCH_ASSOC));        
208 |     }
209 |     /**
210 |      * creating url
211 |      * @param int $host_id
212 |      * @param string $url
213 |      * @return mixed
214 |      */
215 |     public static function create_url($host_id, $url)
216 |     {
217 |         $params = self::create_url_params_from_url($url);
218 |         $params['host_id']=$host_id;
219 |         self::getWrite_PrepareSql(
220 |                 "INSERT IGNORE urls 
221 |                     (  host_id,  path,  get_params, type  ) 
222 |                  VALUES 
223 |                     ( :host_id, :path, :query ,     'lead'  )",
224 |                 $params);      
225 |         return self::getInstance('write')->getConnection()->lastInsertId();
226 |     }
227 | 
228 |     /**
229 |      * Create host in host  table
230 |      * @param string $url
231 |      */
232 |     public static function create_host($url)
233 |     {
234 |         $params = self::create_host_params_from_url($url);
235 |         
236 |         @self::getWrite_PrepareSql(
237 |                 'INSERT IGNORE hosts 
238 |                     (  https,  host,  port ) 
239 |                  VALUES 
240 |                     ( :https, :host, :port )',
241 |                 $params);      
242 |              
243 |         return self::getInstance('write')->getConnection()->lastInsertId();
244 |     }
245 |     /**
246 |      * update url status
247 |      * @param int $url_id
248 |      * @param string $status
249 |      */
250 |     public static function update_url_status($url_id,$status)
251 |     {
252 |         $params = array('status'=>$status,'url_id'=>$url_id);
253 |         
254 |         @self::getWrite_PrepareSql(
255 |                 'UPDATE urls SET type = :status WHERE id = :url_id',
256 |                 $params);     
257 |         
258 |         return self::getInstance('write')->getConnection()->lastInsertId();
259 |     }
260 | 
261 |     /**
262 |      * create fulltext data
263 |      * @param int $url_id
264 |      * @param string $title
265 |      * @param string $description
266 |      * @param string $text
267 |      * @return mixed
268 |      */
269 |     public static function create_fulltext_item($url_id, $title,$description,$text)
270 |     {
271 |         $params = array(
272 |             'url_id'        =>$url_id,
273 |             'title'         =>$title,
274 |             'description'   =>$description,
275 |             'text'          =>$text
276 |         );
277 |         
278 |         @self::getWriteFT_PrepareSql(
279 |                 'INSERT INTO search_table 
280 |                     ( url_id, title, description, text ) 
281 |                     VALUES 
282 |                     (
283 |                         :url_id , 
284 |                         :title , 
285 |                         :description ,
286 |                         :text 
287 |                     )
288 |                  ON DUPLICATE KEY UPDATE 
289 |                  title = :title, 
290 |                  description = :description,
291 |                  text = :text 
292 |                 ',                
293 |         $params);   
294 | 
295 |         return self::getInstance('fulltext-write')->getConnection()->lastInsertId();
296 |     }    
297 | }
298 | 
299 | ?>
300 | 


--------------------------------------------------------------------------------