├── .gitignore ├── README.md ├── composer.json ├── src ├── Config │ ├── Config.php │ ├── ProductConfig.php │ └── SpiderConfig.php ├── Exception │ └── SpiderException.php ├── Hole │ ├── ConsumeAbstract.php │ ├── ProductAbstract.php │ └── QueueInterface.php ├── ProductResult.php ├── Queue │ ├── FastCacheQueue.php │ ├── RedisPoolQueue.php │ └── RedisQueue.php ├── Spider.php ├── SpiderClient.php └── SpiderServer.php └── test └── test.php /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## EasySwoole Spider 2 | 3 | EasySwoole-Spider 可以方便用户快速搭建分布式多协程爬虫。 4 | 5 | ## 快速使用 6 | 以百度搜索为例,根据搜索关键词找出爬出每个关键词检索结果前几页的特定数据 7 | `纯属教学目的,如有冒犯贵公司还请及时通知,会及时调整` 8 | 9 | #### Product 10 | 11 | ```php 12 | productConfig->getUrl()); 30 | $httpClient->setHeader('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36'); 31 | $body = $httpClient->get()->getBody(); 32 | 33 | // 先将每个搜索结果的a标签内容拿到 34 | $rules = [ 35 | 'search_result' => ['.c-container .t', 'text', 'a'] 36 | ]; 37 | $searchResult = QueryList::rules($rules)->html($body)->query()->getData(); 38 | 39 | $data = []; 40 | foreach ($searchResult as $result) { 41 | $item = [ 42 | 'href' => QueryList::html($result['search_result'])->find('a')->attr('href'), 43 | 'text' => QueryList::html($result['search_result'])->find('a')->text() 44 | ]; 45 | $data[] = $item; 46 | } 47 | 48 | $productJobOtherInfo = $this->productConfig->getOtherInfo(); 49 | 50 | // 下一批任务 51 | $productJobConfigs = []; 52 | if ($productJobOtherInfo['page'] === 1) { 53 | for($i=1;$i<5;$i++) { 54 | $pn = $i*10; 55 | $productJobConfig = [ 56 | 'url' => "https://www.baidu.com/s?wd={$productJobOtherInfo['word']}&pn={$pn}", 57 | 'otherInfo' => [ 58 | 'word' => 
$productJobOtherInfo['word'], 59 | 'page' => $i+1 60 | ] 61 | ]; 62 | $productJobConfigs[] = $productJobConfig; 63 | } 64 | 65 | $word = Cache::getInstance()->deQueue(self::SEARCH_WORDS); 66 | if (!empty($word)) { 67 | $productJobConfigs[] = [ 68 | 'url' => "https://www.baidu.com/s?wd={$word}&pn=0", 69 | 'otherInfo' => [ 70 | 'word' => $word, 71 | 'page' => 1 72 | ] 73 | ]; 74 | } 75 | 76 | } 77 | 78 | $result = new ProductResult(); 79 | $result->setProductJobConfigs($productJobConfigs)->setConsumeData($data); 80 | return $result; 81 | } 82 | 83 | } 84 | ``` 85 | 86 | ### Consume 87 | 88 | 我这里直接存文件了,可按照需求自己定制 89 | 90 | ```php 91 | getJobData(); 104 | 105 | $items = ''; 106 | foreach ($data as $item) { 107 | $items .= implode("\t", $item)."\n"; 108 | } 109 | 110 | file_put_contents('baidu.txt', $items, FILE_APPEND); 111 | } 112 | } 113 | ``` 114 | 115 | ### 注册爬虫组件 116 | 117 | ```php 118 | public static function mainServerCreate(EventRegister $register) 119 | { 120 | $spiderConfig = [ 121 | 'product' => ProductTest::class, // 必须 122 | 'consume' => ConsumeTest::class, // 必须 123 | 'queueType' => SpiderConfig::QUEUE_TYPE_FAST_CACHE, 124 | 'queue' => '自定义队列,如使用组件自带则不需要', 125 | 'queueConfig' => '自定义队列配置,目前只有SpiderConfig::QUEUE_TYPE_REDIS需要', 126 | 'maxCurrency' => 128 // 最大协程并发数 127 | ]; 128 | SpiderServer::getInstance() 129 | ->setSpiderConfig($spiderConfig) 130 | ->attachProcess(ServerManager::getInstance()->getSwooleServer()); 131 | } 132 | ``` 133 | 134 | ### 投递任务 135 | ````php 136 | $words = [ 137 | 'php', 138 | 'java', 139 | 'go' 140 | ]; 141 | 142 | foreach ($words as $word) { 143 | Cache::getInstance()->enQueue('SEARCH_WORDS', $word); 144 | } 145 | 146 | $wd = Cache::getInstance()->deQueue('SEARCH_WORDS'); 147 | 148 | SpiderClient::getInstance()->addJob( 149 | 'https://www.baidu.com/s?wd=php&pn=0', 150 | [ 151 | 'page' => 1, 152 | 'word' => $wd 153 | ] 154 | ); 155 | ```` 156 | -------------------------------------------------------------------------------- 
/composer.json:
--------------------------------------------------------------------------------
1 | {
2 |     "name": "easyswoole/spider",
3 |     "type": "library",
4 |     "description": "An efficient swoole framework",
5 |     "keywords": [
6 |         "swoole",
7 |         "framework",
8 |         "spider",
9 |         "easyswoole"
10 |     ],
11 |     "homepage": "https://www.easyswoole.com/",
12 |     "license": "Apache-2.0",
13 |     "authors": [
14 |         {
15 |             "name": "Huizhang",
16 |             "email": "2788828128@qq.com"
17 |         }
18 |     ],
19 |     "require": {
20 |         "php": ">=7.1.0",
21 |         "easyswoole/component": "^2.0",
22 |         "easyswoole/queue": "^2.0",
23 |         "easyswoole/fast-cache": "^1.0",
24 |         "easyswoole/spl": "^1.1",
25 |         "easyswoole/http-client": "^1.3",
26 |         "jaeger/querylist": "^4.1"
27 |     },
28 |     "require-dev": {
29 |         "easyswoole/phpunit": "^1.0",
30 |         "easyswoole/swoole-ide-helper": "^1.2"
31 |     },
32 |     "autoload": {
33 |         "psr-4": {
34 |             "EasySwoole\\Spider\\": "src/"
35 |         }
36 |     },
37 |     "autoload-dev": {
38 |         "psr-4": {
39 |             "EasySwoole\\Spider\\Test\\": "test/"
40 |         }
41 |     }
42 | }
43 |
--------------------------------------------------------------------------------
/src/Config/Config.php:
--------------------------------------------------------------------------------
1 | <?php
2 | /**
3 |  * @Copyright: copyright(2020) Easyswoole all rights reserved
4 |  * @Description: Spider configuration
5 |  */
6 | namespace EasySwoole\Spider\Config;
7 |
8 | use EasySwoole\Component\Singleton;
9 | use EasySwoole\JobQueue\QueueDriverInterface;
10 | use EasySwoole\Spider\Hole\ConsumeAbstract;
11 | use EasySwoole\Spider\Hole\ProductAbstract;
12 | use EasySwoole\Spider\Hole\QueueInterface;
13 |
14 | /**
15 |  * Singleton holding the settings read by the Spider process.
16 |  *
17 |  * Every setter returns $this so that calls can be chained.
18 |  */
19 | class Config
20 | {
21 |
22 |     use Singleton;
23 |
24 |     // User-defined producer
25 |     protected $product;
26 |
27 |     // User-defined consumer
28 |     protected $consume;
29 |
30 |     // Communication queue type (one of the QUEUE_TYPE_* constants)
31 |     protected $queueType=1;
32 |
33 |     // Queue driver
34 |     protected $queue;
35 |
36 |     // In distributed mode, the machine designated as the starting machine
37 |     protected $mainHost;
38 |
39 |     // Queue configuration
40 |     protected $queueConfig;
41 |
42 |     // Maximum number of tasks running at the same time (produce + consume)
43 |     protected $maxCurrency=128;
44 |
45 |
46 |     public const QUEUE_TYPE_FAST_CACHE = 1;
47 |     public const QUEUE_TYPE_REDIS = 2;
48 |
49 |     /**
50 |      * @param mixed $maxCurrency maximum number of concurrently running tasks
51 |      * @return Config
52 |      */
53 |     public function setMaxCurrency($maxCurrency): Config
54 |     {
55 |         $this->maxCurrency = $maxCurrency;
56 |         // The declared return type is Config; without this return every
57 |         // call ended in a TypeError ("none returned").
58 |         return $this;
59 |     }
60 |
61 |     /**
62 |      * @return mixed
63 |      */
64 |     public function getMaxCurrency()
65 |     {
66 |         return $this->maxCurrency;
67 |     }
68 |
69 |     /**
70 |      * @return ProductAbstract
71 |      */
72 |     public function getProduct():ProductAbstract
73 |     {
74 |         return $this->product;
75 |     }
76 |
77 |     /**
78 |      * @param ProductAbstract $product
79 |      * @return Config
80 |      */
81 |     public function setProduct(ProductAbstract $product): Config
82 |     {
83 |         $this->product = $product;
84 |         return $this;
85 |     }
86 |
87 |     /**
88 |      * @return ConsumeAbstract
89 |      */
90 |     public function getConsume():ConsumeAbstract
91 |     {
92 |         return $this->consume;
93 |     }
94 |
95 |     /**
96 |      * @param ConsumeAbstract $consume
97 |      * @return Config
98 |      */
99 |     public function setConsume(ConsumeAbstract $consume): Config
100 |     {
101 |         $this->consume = $consume;
102 |         return $this;
103 |     }
104 |
105 |     /**
106 |      * @return mixed
107 |      */
108 |     public function getQueueType()
109 |     {
110 |         return $this->queueType;
111 |     }
112 |
113 |     /**
114 |      * @param mixed $queueType
115 |      * @return Config
116 |      */
117 |     public function setQueueType($queueType): Config
118 |     {
119 |         $this->queueType = $queueType;
120 |         return $this;
121 |     }
122 |
123 |     /**
124 |      * NOTE(review): the return type is the EasySwoole\JobQueue driver
125 |      * interface, while SpiderConfig uses EasySwoole\Queue - confirm which
126 |      * package this class is meant to work with.
127 |      *
128 |      * @return QueueDriverInterface
129 |      */
130 |     public function getQueue():QueueDriverInterface
131 |     {
132 |         return $this->queue;
133 |     }
134 |
135 |     /**
136 |      * @param mixed $queue
137 |      * @return Config
138 |      */
139 |     public function setQueue($queue): Config
140 |     {
141 |         $this->queue = $queue;
142 |         return $this;
143 |     }
144 |
145 |     /**
146 |      * @return mixed
147 |      */
148 |     public function getMainHost()
149 |     {
150 |         return $this->mainHost;
151 |     }
152 |
153 |     /**
154 |      * @param mixed $mainHost
155 |      * @return Config
156 |      */
157 |     public function setMainHost($mainHost): Config
158 |     {
159 |         $this->mainHost = $mainHost;
160 |         return $this;
161 |     }
162 |
163 |     /**
164 |      * @return mixed
165 |      */
166 |     public function getQueueConfig()
167 |     {
168 |         return $this->queueConfig;
169 |     }
170 |
171 |     /**
172 |      * @param mixed $queueConfig
173 |      * @return Config
174 |      */
175 |     public function setQueueConfig($queueConfig): Config
176 |     {
177 |         $this->queueConfig = $queueConfig;
178 |         return $this;
179 |     }
180 | }
181 |
--------------------------------------------------------------------------------
/src/Config/ProductConfig.php:
--------------------------------------------------------------------------------
1 | <?php
2 | /**
3 |  * @Copyright: copyright(2020) Easyswoole all rights reserved
4 |  * @Description: Configuration of a single product task
5 |  */
6 | namespace EasySwoole\Spider\Config;
7 |
8 | use EasySwoole\Spl\SplBean;
9 |
10 | /**
11 |  * Per-job settings: the URL to fetch plus caller-supplied extra data.
12 |  *
13 |  * Instances are created from an array with the keys 'url' and 'otherInfo'
14 |  * (see SpiderClient and SpiderServer); presumably SplBean copies those keys
15 |  * onto the matching properties - confirm against SplBean.
16 |  */
17 | class ProductConfig extends SplBean
18 | {
19 |
20 |     protected $url;
21 |
22 |     protected $otherInfo;
23 |
24 |     /**
25 |      * @return mixed the job URL
26 |      */
27 |     public function getUrl()
28 |     {
29 |         return $this->url;
30 |     }
31 |
32 |     /**
33 |      * @return mixed the extra data attached to the job
34 |      */
35 |     public function getOtherInfo()
36 |     {
37 |         return $this->otherInfo;
38 |     }
39 | }
--------------------------------------------------------------------------------
/src/Config/SpiderConfig.php:
--------------------------------------------------------------------------------
1 | <?php
2 | /**
3 |  * @Copyright: copyright(2020) Easyswoole all rights reserved
4 |  * @Description: Spider configuration
5 |  */
6 | namespace EasySwoole\Spider\Config;
7 |
8 | use EasySwoole\Queue\Queue;
9 | use EasySwoole\Queue\QueueDriverInterface;
10 | use EasySwoole\Spider\Hole\ConsumeAbstract;
11 | use EasySwoole\Spl\SplBean;
12 |
13 | /**
14 |  * Settings used by SpiderServer and SpiderClient.
15 |  *
16 |  * SpiderServer builds it from the user's config array; both server and
17 |  * client instantiate the product / consume entries with `new $x()`.
18 |  */
19 | class SpiderConfig extends SplBean
20 | {
21 |
22 |     // User-defined producer
23 |     protected $product;
24 |
25 |     // User-defined consumer
26 |     protected $consume;
27 |
28 |     // Communication queue type (one of the QUEUE_TYPE_* constants)
29 |     protected $queueType=1;
30 |
31 |     // Queue driver
32 |     protected $queue;
33 |
34 |     // Queue configuration
35 |     protected $queueConfig;
36 |
37 |     // jobQueue
38 |     protected $jobQueue;
39 |
40 |     // jobQueue maximum number of concurrent coroutines
41 |     protected $maxCurrency=128;
42 |
43 |     public const QUEUE_TYPE_FAST_CACHE = 1;
44 |     public const QUEUE_TYPE_REDIS = 2;
45 |
46 |     /**
47 |      * @return mixed the configured producer
48 |      */
49 |     public function getProduct()
50 |     {
51 |         return $this->product;
52 |     }
53 |
54 |     /**
55 |      * @param int $maxCurrency
56 |      * @return SpiderConfig
57 |      */
58 |     public function setMaxCurrency(int $maxCurrency): SpiderConfig
59 |     {
60 |         $this->maxCurrency = $maxCurrency;
61 |         return $this;
62 |     }
63 |
64 |     /**
65 |      * @return int
66 |      */
67 |     public function getMaxCurrency():int
68 |     {
69 |         return $this->maxCurrency;
70 |     }
71 |
72 |     /**
73 |      * @param mixed $product
74 |      * @return SpiderConfig
75 |      */
76 |     public function setProduct($product): SpiderConfig
77 |     {
78 |         $this->product = $product;
79 |         return $this;
80 |     }
81 |
82 |     /**
83 |      * @return mixed the configured consumer
84 |      */
85 |     public function getConsume()
86 |     {
87 |         return $this->consume;
88 |     }
89 |
90 |     /**
91 |      * The parameter is intentionally untyped, like setProduct(): the value
92 |      * is used as `new $consume()` by SpiderServer, so a class name must be
93 |      * accepted as well as a ConsumeAbstract instance.
94 |      *
95 |      * @param string|ConsumeAbstract $consume
96 |      * @return SpiderConfig
97 |      */
98 |     public function setConsume($consume): SpiderConfig
99 |     {
100 |         $this->consume = $consume;
101 |         return $this;
102 |     }
103 |
104 |     /**
105 |      * @return mixed
106 |      */
107 |     public function getQueueType()
108 |     {
109 |         return $this->queueType;
110 |     }
111 |
112 |     /**
113 |      * @param mixed $queueType
114 |      * @return SpiderConfig
115 |      */
116 |     public function setQueueType($queueType): SpiderConfig
117 |     {
118 |         $this->queueType = $queueType;
119 |         return $this;
120 |     }
121 |
122 |     /**
123 |      * @return QueueDriverInterface
124 |      */
125 |     public function getQueue():QueueDriverInterface
126 |     {
127 |         return $this->queue;
128 |     }
129 |
130 |     /**
131 |      * @param QueueDriverInterface $queue
132 |      * @return SpiderConfig
133 |      */
134 |     public function setQueue(QueueDriverInterface $queue): SpiderConfig
135 |     {
136 |         $this->queue = $queue;
137 |         return $this;
138 |     }
139 |
140 |     /**
141 |      * @return mixed
142 |      */
143 |     public function getQueueConfig()
144 |     {
145 |         return $this->queueConfig;
146 |     }
147 |
148 |     /**
149 |      * @param mixed $queueConfig
150 |      * @return SpiderConfig
151 |      */
152 |     public function setQueueConfig($queueConfig): SpiderConfig
153 |     {
154 |         $this->queueConfig = $queueConfig;
155 |         return $this;
156 |     }
157 |
158 |     /**
159 |      * @return Queue
160 |      */
161 |     public function getJobQueue() : Queue
162 |     {
163 |         return $this->jobQueue;
164 |     }
165 |
166 |     /**
167 |      * @param Queue $jobQueue
168 |      * @return SpiderConfig
169 |      */
170 |     public function setJobQueue(Queue $jobQueue): SpiderConfig
171 |     {
172 |         $this->jobQueue = $jobQueue;
173 |         return $this;
174 |     }
175 | }
176 |
--------------------------------------------------------------------------------
/src/Exception/SpiderException.php:
--------------------------------------------------------------------------------
1 | <?php
2 | /**
3 |  * @Copyright: copyright(2020) Easyswoole all rights reserved
4 |  * @Description: Spider exception
5 |  */
6 | namespace EasySwoole\Spider\Exception;
7 |
8 | /**
9 |  * Exception type thrown by the spider component itself.
10 |  */
11 | class SpiderException extends \Exception
12 | {
13 |
14 | }
--------------------------------------------------------------------------------
/src/Hole/ConsumeAbstract.php:
--------------------------------------------------------------------------------
1 | <?php
2 | /**
3 |  * @Copyright: copyright(2020) Easyswoole all rights reserved
4 |  * @Description: Consumer interface exposed to user code
5 |  */
6 | namespace EasySwoole\Spider\Hole;
7 |
8 | use EasySwoole\Queue\Job;
9 |
10 | /**
11 |  * Base class for user-defined consumers.
12 |  *
13 |  * SpiderServer calls consume() on every job of this type it takes from the
14 |  * queue.
15 |  */
16 | abstract class ConsumeAbstract extends Job
17 | {
18 |
19 |     abstract public function consume();
20 |
21 | }
22 |
--------------------------------------------------------------------------------
/src/Hole/ProductAbstract.php:
--------------------------------------------------------------------------------
1 | <?php
2 | /**
3 |  * @Copyright: copyright(2020) Easyswoole all rights reserved
4 |  * @Description: Producer interface exposed to user code
5 |  */
6 | namespace EasySwoole\Spider\Hole;
7 |
8 | use EasySwoole\Queue\Job;
9 | use EasySwoole\Spider\Config\ProductConfig;
10 | use EasySwoole\Spider\ProductResult;
11 |
12 | /**
13 |  * Base class for user-defined producers.
14 |  *
15 |  * SpiderServer calls product() on every job of this type it takes from the
16 |  * queue and processes the returned ProductResult.
17 |  */
18 | abstract class ProductAbstract extends Job
19 | {
20 |
21 |     /**
22 |      * Settings of this job; assigned by SpiderClient / SpiderServer before
23 |      * the job is pushed onto the queue.
24 |      *
25 |      * @var $productConfig ProductConfig
26 |      */
27 |     public $productConfig;
28 |
29 |     abstract public function product(): ProductResult;
30 |
31 | }
32 |
--------------------------------------------------------------------------------
/src/Hole/QueueInterface.php:
--------------------------------------------------------------------------------
1 | <?php
2 | /**
3 |  * @Copyright: copyright(2020) Easyswoole all rights reserved
4 |  * @Description: Queue interface exposed to user code
5 |  */
6 | namespace EasySwoole\Spider\Hole;
7 |
8 | /**
9 |  * Minimal key-based queue contract.
10 |  */
11 | interface QueueInterface
12 | {
13 |     public function push($key, $value);
14 |
15 |     public function pop($key);
16 |
17 | }
18 |
--------------------------------------------------------------------------------
/src/ProductResult.php:
--------------------------------------------------------------------------------
1 | <?php
2 | /**
3 |  * @Copyright: copyright(2020) Easyswoole all rights reserved
4 |  * @Description: Result of a product job
5 |  */
6 |
7 | namespace EasySwoole\Spider;
8 |
9 | /**
10 |  * Value returned by ProductAbstract::product(): the configs of the follow-up
11 |  * product jobs plus the data handed to the consumer.
12 |  */
13 | class ProductResult
14 | {
15 |
16 |     private $productJobs;
17 |
18 |     private $consumeData;
19 |
20 |     /**
21 |      * @param array $productJobs one config array per follow-up product job;
22 |      *                           each entry is passed to ProductConfig
23 |      * @return ProductResult
24 |      */
25 |     public function setProductJobConfigs(array $productJobs): ProductResult
26 |     {
27 |         $this->productJobs = $productJobs;
28 |         return $this;
29 |     }
30 |
31 |     /**
32 |      * @return array|null null until setProductJobConfigs() has been called
33 |      */
34 |     public function getProductJobConfigs()
35 |     {
36 |         return $this->productJobs;
37 |     }
38 |
39 |     /**
40 |      * @param mixed $consumeData data handed to the consume job
41 |      * @return ProductResult
42 |      */
43 |     public function setConsumeData($consumeData): ProductResult
44 |     {
45 |         $this->consumeData = $consumeData;
46 |         return $this;
47 |     }
48 |
49 |     /**
50 |      * @return mixed
51 |      */
52 |     public function getConsumeData()
53 |     {
54 |         return $this->consumeData;
55 |     }
56 |
57 | }
58 |
--------------------------------------------------------------------------------
/src/Queue/FastCacheQueue.php:
--------------------------------------------------------------------------------
1 | <?php
2 | /**
3 |  * @Copyright: copyright(2020) Easyswoole all rights reserved
4 |  * @Description: FastCache-backed queue, the spider component's default communication queue
5 |  */
6 | namespace EasySwoole\Spider\Queue;
7 |
8 | use EasySwoole\FastCache\Cache;
9 | use EasySwoole\Queue\QueueDriverInterface;
10 | use EasySwoole\Queue\Job;
11 |
12 | /**
13 |  * Queue driver that stores serialized jobs in a FastCache queue.
14 |  */
15 | class FastCacheQueue implements QueueDriverInterface
16 | {
17 |
18 |     private const FASTCACHE_JOB_QUEUE_KEY='FASTCACHE_JOB_QUEUE_KEY';
19 |
20 |     /**
21 |      * Take the next job from the queue.
22 |      *
23 |      * @param float $timeout not used by this driver
24 |      * @return Job|null null when the queue yields nothing or the payload
25 |      *                  cannot be unserialized
26 |      */
27 |     function pop(float $timeout = 3):?Job
28 |     {
29 |         $job = Cache::getInstance()->deQueue(self::FASTCACHE_JOB_QUEUE_KEY);
30 |         if (empty($job)) {
31 |             return null;
32 |         }
33 |         // NOTE(review): unserialize() can instantiate arbitrary classes;
34 |         // this is only safe while everything in the queue is trusted.
35 |         $job = unserialize($job);
36 |         if (empty($job)) {
37 |             return null;
38 |         }
39 |         return $job;
40 |     }
41 |
42 |     /**
43 |      * Append a job to the queue.
44 |      *
45 |      * @param Job $job
46 |      * @return bool false when enQueue() reports an empty result
47 |      */
48 |     function push(Job $job):bool
49 |     {
50 |         $res = Cache::getInstance()->enQueue(self::FASTCACHE_JOB_QUEUE_KEY, serialize($job));
51 |         if (empty($res)) {
52 |             return false;
53 |         }
54 |         return true;
55 |     }
56 |
57 |     /**
58 |      * Number of jobs currently waiting in the queue.
59 |      *
60 |      * The previous empty body raised a TypeError, because a function
61 |      * declared `?int` has to return a value explicitly.
62 |      *
63 |      * @return int|null null when FastCache does not report a numeric size
64 |      */
65 |     public function size(): ?int
66 |     {
67 |         $size = Cache::getInstance()->queueSize(self::FASTCACHE_JOB_QUEUE_KEY);
68 |         return is_numeric($size) ? (int)$size : null;
69 |     }
70 | }
71 |
--------------------------------------------------------------------------------
/src/Queue/RedisPoolQueue.php:
--------------------------------------------------------------------------------
1 | <?php
2 | /**
3 |  * @Copyright: copyright(2020) Easyswoole all rights reserved
4 |  * @Description: Queue based on redis-pool
5 |  */
6 | namespace EasySwoole\Spider\Queue;
7 |
8 | use EasySwoole\Queue\Job;
9 | use EasySwoole\Queue\QueueDriverInterface;
10 | use EasySwoole\RedisPool\Redis;
11 |
12 | /**
13 |  * Queue driver that stores serialized jobs in a Redis list.
14 |  *
15 |  * REDIS_JOB_QUEUE_KEY is used both as the redis-pool name (registered in
16 |  * SpiderServer::attachProcess) and as the list key.
17 |  */
18 | class RedisPoolQueue implements QueueDriverInterface
19 | {
20 |
21 |     public const REDIS_JOB_QUEUE_KEY='REDIS_JOB_QUEUE_KEY';
22 |
23 |     /**
24 |      * Push a job onto the list.
25 |      *
26 |      * @param Job $job
27 |      * @return bool false when lPush() reports an empty result
28 |      */
29 |     function push(Job $job):bool
30 |     {
31 |         $redis = Redis::defer(self::REDIS_JOB_QUEUE_KEY);
32 |         $res = $redis->lPush(self::REDIS_JOB_QUEUE_KEY, serialize($job));
33 |         if (empty($res)) {
34 |             return false;
35 |         }
36 |         return true;
37 |     }
38 |
39 |     /**
40 |      * Take a job from the list.
41 |      *
42 |      * Jobs are added with lPush() and removed with lPop(), i.e. from the
43 |      * same end of the list.
44 |      *
45 |      * @param float $timeout not used by this driver
46 |      * @return Job|null null when the list yields nothing or the payload
47 |      *                  cannot be unserialized
48 |      */
49 |     function pop(float $timeout = 3):?Job
50 |     {
51 |         $redis = Redis::defer(self::REDIS_JOB_QUEUE_KEY);
52 |         $job = $redis->lPop(self::REDIS_JOB_QUEUE_KEY);
53 |         if (empty($job)) {
54 |             return null;
55 |         }
56 |         // NOTE(review): unserialize() can instantiate arbitrary classes;
57 |         // this is only safe while everything in the list is trusted.
58 |         $job = unserialize($job);
59 |         if (empty($job)) {
60 |             return null;
61 |         }
62 |         return $job;
63 |     }
64 |
65 |     /**
66 |      * Number of jobs currently waiting in the list.
67 |      *
68 |      * The previous empty body raised a TypeError, because a function
69 |      * declared `?int` has to return a value explicitly.
70 |      *
71 |      * @return int|null null when Redis does not report a numeric length
72 |      */
73 |     public function size(): ?int
74 |     {
75 |         $redis = Redis::defer(self::REDIS_JOB_QUEUE_KEY);
76 |         $length = $redis->lLen(self::REDIS_JOB_QUEUE_KEY);
77 |         return is_numeric($length) ? (int)$length : null;
78 |     }
79 | }
80 |
--------------------------------------------------------------------------------
/src/Queue/RedisQueue.php:
--------------------------------------------------------------------------------
1 | <?php
2 | /**
3 |  * @Copyright: copyright(2020) Easyswoole all rights reserved
4 |  * @Description: Queue based on redis-pool
5 |  */
6 | namespace EasySwoole\Spider\Queue;
7 |
8 | use EasySwoole\Component\Singleton;
9 | use EasySwoole\FastCache\Cache;
10 | use EasySwoole\JobQueue\AbstractJob;
11 | use EasySwoole\JobQueue\QueueDriverInterface;
12 | use EasySwoole\RedisPool\Redis;
13 |
14 | /**
15 |  * Redis list queue driver for the EasySwoole\JobQueue interfaces.
16 |  *
17 |  * NOTE(review): composer.json requires easyswoole/queue, not a JobQueue
18 |  * package; this class looks like the predecessor of RedisPoolQueue and is
19 |  * only referenced by Spider - confirm whether it is still needed.
20 |  */
21 | class RedisQueue implements QueueDriverInterface
22 | {
23 |
24 |     use Singleton;
25 |
26 |     public const REDIS_JOB_QUEUE_KEY='REDIS_JOB_QUEUE_KEY';
27 |
28 |     /**
29 |      * Push a job onto the list.
30 |      *
31 |      * @param AbstractJob $job
32 |      * @return bool false when lPush() reports an empty result
33 |      */
34 |     function push(AbstractJob $job):bool
35 |     {
36 |         $redis = Redis::defer(self::REDIS_JOB_QUEUE_KEY);
37 |         $res = $redis->lPush(self::REDIS_JOB_QUEUE_KEY, serialize($job));
38 |         if (empty($res)) {
39 |             return false;
40 |         }
41 |         return true;
42 |     }
43 |
44 |     /**
45 |      * Take a job from the list.
46 |      *
47 |      * @param float $timeout not used by this driver
48 |      * @return AbstractJob|null null when the list yields nothing or the
49 |      *                          payload cannot be unserialized
50 |      */
51 |     function pop(float $timeout = 3):?AbstractJob
52 |     {
53 |         $redis = Redis::defer(self::REDIS_JOB_QUEUE_KEY);
54 |         $job = $redis->lPop(self::REDIS_JOB_QUEUE_KEY);
55 |         if (empty($job)) {
56 |             return null;
57 |         }
58 |         // NOTE(review): unserialize() can instantiate arbitrary classes;
59 |         // this is only safe while everything in the list is trusted.
60 |         $job = unserialize($job);
61 |         if (empty($job)) {
62 |             return null;
63 |         }
64 |         return $job;
65 |     }
66 |
67 | }
68 |
--------------------------------------------------------------------------------
/src/Spider.php:
--------------------------------------------------------------------------------
1 | <?php
2 | /**
3 |  * @Copyright: copyright(2020) Easyswoole all rights reserved
4 |  * @Description: Spider component
5 |  */
6 | namespace EasySwoole\Spider;
7 |
8 | use EasySwoole\Component\Process\AbstractProcess;
9 | use EasySwoole\Component\Singleton;
10 | use EasySwoole\EasySwoole\ServerManager;
11 | use EasySwoole\FastCache\Cache;
12 | use EasySwoole\JobQueue\JobProcess;
13 | use EasySwoole\JobQueue\JobQueue;
14 | use EasySwoole\Redis\Config\RedisConfig;
15 | use EasySwoole\RedisPool\Redis;
16 | use EasySwoole\Spider\Config\Config;
17 | use EasySwoole\Spider\Config\ProductConfig;
18 | use EasySwoole\Spider\Exception\SpiderException;
19 | use EasySwoole\Spider\Queue\FastCacheQueue;
20 | use EasySwoole\Spider\Queue\RedisQueue;
21 |
22 | /**
23 |  * Spider process built on the EasySwoole\JobQueue classes.
24 |  *
25 |  * NOTE(review): composer.json requires easyswoole/queue, not a JobQueue
26 |  * package, and the README only documents SpiderServer / SpiderClient -
27 |  * confirm whether this class is still in use.
28 |  */
29 | class Spider extends AbstractProcess
30 | {
31 |
32 |     use Singleton;
33 |
34 |     /**
35 |      * @var $config Config
36 |      */
37 |     private $config;
38 |
39 |     private const ES_SPIDER_JOB_QUEUE='ES_SPIDER_JOB_QUEUE';
40 |
41 |     /**
42 |      * Set the configuration
43 |      *
44 |      * @param Config $config
45 |      * CreateTime: 2020/2/22 3:46 PM
46 |      * @return Spider
47 |      */
48 |     public function setConfig(Config $config) : Spider
49 |     {
50 |         $this->config = $config;
51 |         return $this;
52 |     }
53 |
54 |     /**
55 |      * Process entry point: builds the first job's ProductConfig and queues
56 |      * the job.
57 |      *
58 |      * NOTE(review): this reads Config::getInstance() while attachProcess()
59 |      * reads the object given to setConfig(); they only agree when the
60 |      * singleton was the object passed in - confirm.
61 |      */
62 |     protected function run($arg)
63 |     {
64 |         go(function (){
65 |
66 |             $config = Config::getInstance();
67 |
68 |             $config->getProduct()->productConfig = new ProductConfig($config->getProduct()->init());
69 |
70 |             $this->setFirstProductJob();
71 |
72 |         });
73 |     }
74 |
75 |     /**
76 |      * Push the first product job onto the queue.
77 |      *
78 |      * Without a configured main host the job is always pushed; with one it
79 |      * is pushed only when this machine's resolved IP equals the main host.
80 |      *
81 |      * @throws SpiderException when the first job has no URL
82 |      */
83 |     private function setFirstProductJob()
84 |     {
85 |         $config = Config::getInstance();
86 |         $mainHost = $config->getMainHost();
87 |         $firstJob = $config->getProduct();
88 |         if (empty($mainHost)) {
89 |             if (!empty($firstJob->productConfig->getUrl())) {
90 |                 $config->getQueue()->push($firstJob);
91 |             } else {
92 |                 throw new SpiderException('FirstJob url error!');
93 |             }
94 |         } else {
95 |             $ip = gethostbyname(gethostname());
96 |             if (!empty($ip) && $config->getMainHost() === $ip) {
97 |                 if (!empty($firstJob->productConfig->getUrl())) {
98 |                     $config->getQueue()->push($firstJob);
99 |                 } else {
100 |                     throw new SpiderException('FirstJob url error!');
101 |                 }
102 |             }
103 |         }
104 |     }
105 |
106 |     /**
107 |      * Bind the process to the swoole server
108 |      *
109 |      * @param null|\swoole_server|\swoole_server_port|\swoole_websocket_server|\swoole_http_server $swooleServer
110 |      * CreateTime: 2020/2/22 2:45 PM
111 |      * @return void
112 |      * @throws \EasySwoole\Component\Process\Exception
113 |      * @throws \EasySwoole\FastCache\Exception\RuntimeError
114 |      */
115 |     public function attachProcess($swooleServer)
116 |     {
117 |
118 |         // Queue communication
119 |         switch ($this->config->getQueueType()) {
120 |             case Config::QUEUE_TYPE_FAST_CACHE:
121 |                 Cache::getInstance()
122 |                     ->setTempDir(EASYSWOOLE_TEMP_DIR)
123 |                     ->attachToServer($swooleServer);
124 |                 $this->config->setQueue(new FastCacheQueue());
125 |                 break;
126 |             case Config::QUEUE_TYPE_REDIS:
127 |                 $queueConfig = $this->config->getQueueConfig();
128 |                 if (empty($queueConfig)) {
129 |                     $queueConfig = new RedisConfig();
130 |                 }
131 |                 Redis::getInstance()->register(RedisQueue::REDIS_JOB_QUEUE_KEY, $queueConfig);
132 |                 $this->config->setQueue(new RedisQueue());
133 |                 break;
134 |             default:
135 |                 // Any other type keeps whatever queue is already configured.
136 |         }
137 |
138 |         // Attach the spider process to the main server
139 |         $swooleServer->addProcess($this->getProcess());
140 |
141 |         $jobQueue = new JobQueue($this->config->getQueue());
142 |         $jobQueue->setMaxCurrency($this->config->getMaxCurrency());
143 |         $jobQueue->attachServer($swooleServer);
144 |
145 |     }
146 |
147 | }
148 |
149 |
--------------------------------------------------------------------------------
/src/SpiderClient.php:
--------------------------------------------------------------------------------
1 | <?php
2 | /**
3 |  * @Copyright: copyright(2020) Easyswoole all rights reserved
4 |  * @Description: Spider client
5 |  */
6 | namespace EasySwoole\Spider;
7 |
8 | use EasySwoole\Component\Singleton;
9 | use EasySwoole\Spider\Config\SpiderConfig;
10 | use EasySwoole\Spider\Config\ProductConfig;
11 | use EasySwoole\Spider\Hole\ProductAbstract;
12 |
13 | /**
14 |  * Entry point for submitting product jobs to the spider's job queue.
15 |  */
16 | class SpiderClient
17 | {
18 |     use Singleton;
19 |
20 |     /**
21 |      * @var SpiderConfig
22 |      */
23 |     private $spiderConfig;
24 |
25 |     /**
26 |      * Reads the configuration from SpiderServer, so
27 |      * SpiderServer::setSpiderConfig() has to be called first.
28 |      */
29 |     public function __construct()
30 |     {
31 |         $this->spiderConfig = SpiderServer::getInstance()->getSpiderConfig();
32 |     }
33 |
34 |     /**
35 |      * Queue a single product job.
36 |      *
37 |      * @param mixed $url       URL stored in the job's ProductConfig
38 |      * @param mixed $otherInfo extra data stored in the job's ProductConfig
39 |      */
40 |     public function addJob($url, $otherInfo)
41 |     {
42 |         $this->pushJob($url, $otherInfo);
43 |     }
44 |
45 |     /**
46 |      * Queue several product jobs.
47 |      *
48 |      * @param array $jobsConfig list of [url, otherInfo] pairs (indexes 0 and 1)
49 |      */
50 |     public function addJobs(array $jobsConfig)
51 |     {
52 |         foreach ($jobsConfig as $jobConfig) {
53 |             [$url, $otherInfo] = $jobConfig;
54 |             $this->pushJob($url, $otherInfo);
55 |         }
56 |     }
57 |
58 |     /**
59 |      * Build a job of the configured product class and push it to the queue.
60 |      */
61 |     private function pushJob($url, $otherInfo)
62 |     {
63 |         $productConfig = new ProductConfig(
64 |             [
65 |                 'url' => $url,
66 |                 'otherInfo' => $otherInfo
67 |             ]
68 |         );
69 |
70 |         $product = $this->spiderConfig->getProduct();
71 |         /** @var $productJob ProductAbstract*/
72 |         $productJob = new $product();
73 |         $productJob->productConfig = $productConfig;
74 |         $this->spiderConfig
75 |             ->getJobQueue()
76 |             ->producer()
77 |             ->push($productJob);
78 |     }
79 |
80 | }
81 |
--------------------------------------------------------------------------------
/src/SpiderServer.php:
--------------------------------------------------------------------------------
1 | <?php
2 | /**
3 |  * @Copyright: copyright(2020) Easyswoole all rights reserved
4 |  * @Description: Spider component
5 |  */
6 | namespace EasySwoole\Spider;
7 |
8 | use EasySwoole\Component\Process\AbstractProcess;
9 | use EasySwoole\Component\Singleton;
10 | use EasySwoole\FastCache\Cache;
11 | use EasySwoole\Queue\Job;
12 | use EasySwoole\Queue\Queue;
13 | use EasySwoole\Redis\Config\RedisConfig;
14 | use EasySwoole\RedisPool\Redis;
15 | use EasySwoole\Spider\Config\ProductConfig;
16 | use EasySwoole\Spider\Config\SpiderConfig;
17 | use EasySwoole\Spider\Exception\SpiderException;
18 | use EasySwoole\Spider\Hole\ConsumeAbstract;
19 | use EasySwoole\Spider\Hole\ProductAbstract;
20 | use EasySwoole\Spider\Queue\FastCacheQueue;
21 | use EasySwoole\Spider\Queue\RedisPoolQueue;
22 | use Swoole\Coroutine;
23 |
24 | /**
25 |  * Custom process that listens on the job queue and runs product / consume
26 |  * jobs.
27 |  */
28 | class SpiderServer extends AbstractProcess
29 | {
30 |
31 |     use Singleton;
32 |
33 |     public function __construct(...$args)
34 |     {
35 |         parent::__construct(...$args);
36 |         // Collect libxml errors internally instead of emitting warnings.
37 |         libxml_use_internal_errors(true);
38 |     }
39 |
40 |     /**
41 |      * @var $spiderConfig SpiderConfig
42 |      */
43 |     protected $spiderConfig;
44 |
45 |     /**
46 |      * @param array $config spider settings (see the README for the keys)
47 |      * @return SpiderServer
48 |      */
49 |     public function setSpiderConfig(array $config) : SpiderServer
50 |     {
51 |         $this->spiderConfig = new SpiderConfig($config);
52 |         return $this;
53 |     }
54 |
55 |     /**
56 |      * @return SpiderConfig
57 |      */
58 |     public function getSpiderConfig():SpiderConfig
59 |     {
60 |         return $this->spiderConfig;
61 |     }
62 |
63 |     /**
64 |      * Process entry point: dispatches every job received from the queue.
65 |      *
66 |      * Product jobs are executed and their result is processed further,
67 |      * consume jobs are executed, anything else raises a SpiderException.
68 |      */
69 |     protected function run($arg)
70 |     {
71 |         Coroutine::create(function (){
72 |             $this->spiderConfig->getJobQueue()->consumer()->listen(function (Job $job){
73 |                 if ($job instanceof ProductAbstract) {
74 |                     $productResult = $job->product();
75 |                     $this->productResultDeal($productResult);
76 |                 } elseif ($job instanceof ConsumeAbstract) {
77 |                     $job->consume();
78 |                 } else {
79 |                     throw new SpiderException('Job type error!');
80 |                 }
81 |             // NOTE(review): 0.01 and 3.0 are presumably the polling interval
82 |             // and wait time of listen() - confirm against easyswoole/queue.
83 |             }, 0.01, 3.0, $this->spiderConfig->getMaxCurrency());
84 |         });
85 |     }
86 |
87 |     /**
88 |      * Turn a product result into follow-up jobs.
89 |      *
90 |      * One coroutine pushes a product job per returned job config; a second
91 |      * one pushes a consume job carrying the consume data. Values that are
92 |      * not a ProductResult are ignored.
93 |      *
94 |      * @param mixed $productResult value returned by ProductAbstract::product()
95 |      */
96 |     private function productResultDeal($productResult)
97 |     {
98 |         if ($productResult instanceof ProductResult) {
99 |
100 |             Coroutine::create(function () use($productResult) {
101 |                 $productJobConfigs = $productResult->getProductJobConfigs();
102 |                 if (!empty($productJobConfigs)) {
103 |                     foreach ($productJobConfigs as $productJobConfig) {
104 |                         $productConfig = new ProductConfig($productJobConfig);
105 |                         $product = $this->spiderConfig->getProduct();
106 |                         /** @var $productJob ProductAbstract*/
107 |                         $productJob = new $product();
108 |                         $productJob->productConfig = $productConfig;
109 |                         $this->spiderConfig
110 |                             ->getJobQueue()
111 |                             ->producer()
112 |                             ->push($productJob);
113 |                     }
114 |                 }
115 |             });
116 |
117 |             Coroutine::create(function () use($productResult) {
118 |                 $consumeData = $productResult->getConsumeData();
119 |                 if (!empty($consumeData)) {
120 |                     $consume = $this->spiderConfig->getConsume();
121 |                     /** @var $consumeJob ConsumeAbstract*/
122 |                     $consumeJob = new $consume();
123 |                     $consumeJob->setJobData($consumeData);
124 |                     $this->spiderConfig
125 |                         ->getJobQueue()
126 |                         ->producer()
127 |                         ->push($consumeJob);
128 |                 }
129 |             });
130 |         }
131 |
132 |     }
133 |
134 |     /**
135 |      * Bind the process to the swoole server
136 |      *
137 |      * Chooses the queue driver from the configured queue type, wraps it in
138 |      * a Queue and registers this process with the server.
139 |      *
140 |      * @param null|\swoole_server|\swoole_server_port|\swoole_websocket_server|\swoole_http_server $swooleServer
141 |      * CreateTime: 2020/2/22 2:45 PM
142 |      * @return void
143 |      * @throws \EasySwoole\Component\Process\Exception
144 |      * @throws \EasySwoole\FastCache\Exception\RuntimeError
145 |      * @throws \EasySwoole\RedisPool\RedisPoolException
146 |      */
147 |     public function attachProcess($swooleServer)
148 |     {
149 |
150 |         switch ($this->spiderConfig->getQueueType()) {
151 |             case SpiderConfig::QUEUE_TYPE_FAST_CACHE:
152 |                 Cache::getInstance()
153 |                     ->setTempDir(EASYSWOOLE_TEMP_DIR)
154 |                     ->attachToServer($swooleServer);
155 |                 $this->spiderConfig->setQueue(new FastCacheQueue());
156 |                 break;
157 |             case SpiderConfig::QUEUE_TYPE_REDIS:
158 |                 $queueConfig = $this->spiderConfig->getQueueConfig();
159 |                 if (empty($queueConfig)) {
160 |                     $queueConfig = new RedisConfig();
161 |                 }
162 |                 Redis::getInstance()->register(RedisPoolQueue::REDIS_JOB_QUEUE_KEY, $queueConfig);
163 |                 $this->spiderConfig->setQueue(new RedisPoolQueue());
164 |                 break;
165 |             default:
166 |                 // Any other type keeps the queue supplied in the config.
167 |         }
168 |
169 |         $this->spiderConfig->setJobQueue(new Queue($this->spiderConfig->getQueue()));
170 |
171 |         $swooleServer->addProcess($this->getProcess());
172 |
173 |     }
174 |
175 | }
176 |
177 |
--------------------------------------------------------------------------------
/test/test.php:
--------------------------------------------------------------------------------
1 | <?php
2 | /**
3 |  * @Copyright: copyright(2020) Easyswoole all rights reserved
4 |  * @Description:
5 |  */
6 |
--------------------------------------------------------------------------------