├── aps
│   ├── log
│   │   └── weather_aps.log
│   ├── constant.py
│   ├── test.py
│   ├── my_logger.py
│   ├── tasks.py
│   ├── mongo_db.py
│   └── province_spider.py
├── WeatherCrawler
│   ├── __init__.py
│   ├── db
│   │   ├── __init__.py
│   │   └── mongo_db.py
│   ├── spiders
│   │   ├── __init__.pyc
│   │   ├── constant.pyc
│   │   ├── province_spider.pyc
│   │   ├── constant.py
│   │   ├── __init__.py
│   │   └── province_spider.py
│   ├── pipelines.py
│   ├── items.py
│   ├── middlewares.py
│   └── settings.py
├── scrapy.cfg
└── README.md

/aps/log/weather_aps.log:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/WeatherCrawler/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/WeatherCrawler/db/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/WeatherCrawler/spiders/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cool-firer/WeatherCrawler/HEAD/WeatherCrawler/spiders/__init__.pyc
--------------------------------------------------------------------------------
/WeatherCrawler/spiders/constant.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cool-firer/WeatherCrawler/HEAD/WeatherCrawler/spiders/constant.pyc
--------------------------------------------------------------------------------
/WeatherCrawler/spiders/province_spider.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cool-firer/WeatherCrawler/HEAD/WeatherCrawler/spiders/province_spider.pyc
--------------------------------------------------------------------------------
/aps/constant.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | 
3 | # direct-administered municipalities
4 | DIRECT_CITY = [u'北京', u'上海', u'天津', u'重庆']
5 | 
6 | # regions skipped by the spider
7 | PIG_ZONE = [u'香港', u'台湾', u'澳门']
8 | 
9 | 
--------------------------------------------------------------------------------
/WeatherCrawler/spiders/constant.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | 
3 | # direct-administered municipalities
4 | DIRECT_CITY = [u'北京', u'上海', u'天津', u'重庆']
5 | 
6 | # regions skipped by the spider
7 | PIG_ZONE = [u'香港', u'台湾', u'澳门']
8 | 
9 | 
--------------------------------------------------------------------------------
/WeatherCrawler/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 | 
6 | [settings]
7 | default = WeatherCrawler.settings
8 | 
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = WeatherCrawler
12 | 
--------------------------------------------------------------------------------
/WeatherCrawler/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | 
8 | 
9 | class WeathercrawlerPipeline(object):
10 |     def process_item(self, item, spider):
11 |         return item
12 | 
--------------------------------------------------------------------------------
/WeatherCrawler/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 | 
8 | import scrapy
9 | 
10 | 
11 | class WeathercrawlerItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     pass
15 | 
--------------------------------------------------------------------------------
/aps/test.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | from apscheduler.schedulers.blocking import BlockingScheduler
3 | 
4 | scheduler = BlockingScheduler()
5 | 
6 | def test():
7 |     print "now is '%s' " % datetime.datetime.now()
8 | 
9 | scheduler.add_job(test, "cron", second="*/3")
10 | 
11 | try:
12 |     scheduler.start()
13 | except (KeyboardInterrupt, SystemExit):
14 |     scheduler.shutdown()
15 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # WeatherCrawler
2 | A spider that crawls weather data; it fetches the seven-day forecast starting from today.
3 | 
4 | Before running, make sure scrapy and mongodb are installed.
5 | 
6 | 
7 | Install scrapy: pip install scrapy
8 | 
9 | # Run the spider
10 | scrapy crawl province_spider
11 | 
12 | Or start it with an explicit log level:
13 | 
14 | scrapy crawl province_spider --loglevel INFO
15 | 
16 | The spider crawls weather data province by province, then city, then county, stores it in the database, and the final documents look like this:
17 | 

18 | weather> db.wea.findOne()
19 | {
20 |         "_id" : ObjectId("598c1a0efb64421dcc437c47"),
21 |         "province" : "湖北",
22 |         "city" : "武汉",
23 |         "data" : [
24 |                 {
25 |                         "wind_force" : "微风",
26 |                         "max_tem" : "35",
27 |                         "min_tem" : "27℃",
28 |                         "wind_direction" : [
29 |                                 "无持续风向",
30 |                                 "无持续风向"
31 |                         ],
32 |                         "day" : "10日(今天)",
33 |                         "desc" : "晴转多云"
34 |                 },
35 |               ...
36 |   }
37 | 
38 | 
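# Query the stored data

Once a crawl has finished, the documents can be read back with pymongo. The snippet below is only an illustrative sketch: it assumes mongod is reachable on localhost:27017 without authentication (matching WeatherCrawler/db/mongo_db.py; the aps/ variant connects with a username/password URI instead), and the script name and the example province/city are placeholders.

    # coding: utf8
    # read_weather.py -- hypothetical helper script, not part of the repository
    from pymongo import MongoClient

    client = MongoClient('localhost', 27017)   # same default connection as WeatherCrawler/db/mongo_db.py
    wea = client['weather']['wea']              # database / collection the spider writes to

    # one document per (province, city, county), each holding a seven-day forecast
    doc = wea.find_one({'province': u'湖北', 'city': u'武汉'})
    if doc:
        for day in doc['data']:
            print('%s %s~%s %s' % (day['day'], day['min_tem'], day['max_tem'], day['desc']))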
39 | 
--------------------------------------------------------------------------------
/aps/my_logger.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | 
3 | import logging
4 | 
5 | class Logger():
6 |     def __init__(self, logger_name='root', log_level=logging.INFO, file_name=None, console=True):
7 |         '''
8 |         @param logger_name: logger name
9 |         @param log_level: log level
10 |         @param file_name: log file path
11 |         @param console: also log to the console
12 |         '''
13 |         self.logger = logging.getLogger(logger_name)
14 |         self.logger.setLevel(log_level)
15 |         formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
16 | 
17 |         if file_name:
18 |             fh = logging.FileHandler(file_name)
19 |             fh.setLevel(log_level)
20 |             fh.setFormatter(formatter)
21 |             self.logger.addHandler(fh)
22 | 
23 |         if console:
24 |             ch = logging.StreamHandler()
25 |             ch.setLevel(log_level)
26 |             ch.setFormatter(formatter)
27 |             self.logger.addHandler(ch)
28 | 
29 |     def getLogger(self):
30 |         return self.logger
31 | 
--------------------------------------------------------------------------------
/aps/tasks.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #-*- coding: utf-8 -*-
3 | 
4 | import datetime
5 | from apscheduler.schedulers.blocking import BlockingScheduler
6 | from scrapy.crawler import CrawlerProcess
7 | from province_spider import ProvinceSpider
8 | from billiard import Process
9 | 
10 | from scrapy.utils.log import configure_logging
11 | configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s', 'LOG_FILE': 'schedule.log'})
12 | 
13 | def _crawl(path=None):
14 |     crawl = CrawlerProcess({
15 |         'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
16 |     })
17 |     crawl.crawl(ProvinceSpider)
18 |     crawl.start()
19 |     crawl.stop()
20 | 
21 | def run_crawl(path=None):
22 |     p = Process(target=_crawl, args=['hahahahha'])
23 |     p.start()
24 |     #p.join()
25 | 
26 | 
27 | scheduler = BlockingScheduler(daemon=True)
28 | scheduler.add_job(run_crawl, "cron", hour=8, minute=30, timezone='Asia/Shanghai')
29 | scheduler.add_job(run_crawl, "cron", hour=12, minute=30, timezone='Asia/Shanghai')
30 | scheduler.add_job(run_crawl, "cron", hour=18, minute=30, timezone='Asia/Shanghai')
31 | 
32 | try:
33 |     scheduler.start()
34 | except (KeyboardInterrupt, SystemExit):
35 |     scheduler.shutdown()
36 | 
37 | 
--------------------------------------------------------------------------------
/WeatherCrawler/db/mongo_db.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | 
3 | import traceback
4 | import threading
5 | from pymongo import MongoClient
6 | 
7 | Lock = threading.Lock()
8 | 
9 | class MongoDB(object):
10 |     '''
11 |     mongodb class
12 |     '''
13 |     __instance = None
14 | 
15 |     __connection = MongoClient('localhost', 27017)
16 | 
17 |     def __init__(self):
18 |         self.client = self.__connection
19 | 
20 |     def __new__(cls, *args, **kwargs):
21 |         if not cls.__instance:
22 |             try:
23 |                 Lock.acquire()
24 |                 if not cls.__instance:
25 |                     cls.__instance = super(MongoDB, cls).__new__(cls, *args, **kwargs)
26 |             finally:
27 |                 Lock.release()
28 |         return cls.__instance
29 | 
30 |     def insert(self, database, collection, data):
31 |         '''
32 |         @param database: database name
33 |         @param collection: collection name
34 |         @param data: document to insert
35 |         '''
36 |         db = self.client[database]
37 |         c = db[collection]
38 |         c.insert_one(data)
39 | 
40 |     def drop(self, database, collection):
41 |         '''
42 |         @param database: database name
43 |         @param collection: collection name
44 |         '''
45 |         db = self.client[database]
46 |         db[collection].drop()
47 | 
48 |     def remove(self, database, collection, condition):
49 |         '''
50 |         @param database: database name
51 |         @param collection: collection name
52 |         @param condition: filter condition
53 |         '''
54 |         db = self.client[database]
55 |         c = db[collection]
56 |         c.remove(condition)
57 | 
58 |     def query(self, sql, params=None):
59 |         pass
60 | 
61 | 
--------------------------------------------------------------------------------
/WeatherCrawler/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 | 
8 | from scrapy import signals
9 | 
10 | 
11 | class WeathercrawlerSpiderMiddleware(object):
12 |     # Not all methods need to be defined. If a method is not defined,
13 |     # scrapy acts as if the spider middleware does not modify the
14 |     # passed objects.
15 | 
16 |     @classmethod
17 |     def from_crawler(cls, crawler):
18 |         # This method is used by Scrapy to create your spiders.
19 |         s = cls()
20 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 |         return s
22 | 
23 |     def process_spider_input(self, response, spider):
24 |         # Called for each response that goes through the spider
25 |         # middleware and into the spider.
26 | 
27 |         # Should return None or raise an exception.
28 |         return None
29 | 
30 |     def process_spider_output(self, response, result, spider):
31 |         # Called with the results returned from the Spider, after
32 |         # it has processed the response.
33 | 
34 |         # Must return an iterable of Request, dict or Item objects.
35 |         for i in result:
36 |             yield i
37 | 
38 |     def process_spider_exception(self, response, exception, spider):
39 |         # Called when a spider or process_spider_input() method
40 |         # (from other spider middleware) raises an exception.
41 | 
42 |         # Should return either None or an iterable of Response, dict
43 |         # or Item objects.
44 |         pass
45 | 
46 |     def process_start_requests(self, start_requests, spider):
47 |         # Called with the start requests of the spider, and works
48 |         # similarly to the process_spider_output() method, except
49 |         # that it doesn’t have a response associated.
50 | 
51 |         # Must return only requests (not items).
52 |         for r in start_requests:
53 |             yield r
54 | 
55 |     def spider_opened(self, spider):
56 |         spider.logger.info('Spider opened: %s' % spider.name)
57 | 
--------------------------------------------------------------------------------
/aps/mongo_db.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | 
3 | import traceback
4 | import threading
5 | from pymongo import MongoClient
6 | 
7 | Lock = threading.Lock()
8 | 
9 | class MongoDB(object):
10 |     '''
11 |     mongodb class
12 |     '''
13 |     __instance = None
14 | 
15 |     #__connection = MongoClient('localhost', 27017)
16 | 
17 |     def __init__(self, *args, **kwargs):
18 |         self.client = self.__connection
19 | 
20 |     def __new__(cls, *args, **kwargs):
21 |         if not cls.__instance:
22 |             try:
23 |                 Lock.acquire()
24 |                 if not cls.__instance:
25 |                     cls.__instance = super(MongoDB, cls).__new__(cls, *args, **kwargs)
26 | 
27 |                     if kwargs.get('auth', False) == False:
28 |                         #uri = 'mongodb://localhost:27017/'
29 |                         host = kwargs.get('host', 'localhost')
30 |                         port = kwargs.get('port', '27017')
31 |                         uri = 'mongodb://{host}:{port}/'.format(**{'host': host, 'port': port})
32 |                     else:
33 |                         uri = "mongodb://{user}:{password}@{host}/{authSource}?authMechanism={authMechanism}".format(**kwargs)
34 |                         #uri = "mongodb://jc:jc@localhost/admin?authMechanism=SCRAM-SHA-1"
35 |                     cls.__connection = MongoClient(uri)
36 |             finally:
37 |                 Lock.release()
38 |         return cls.__instance
39 | 
40 |     def insert(self, database, collection, data):
41 |         '''
42 |         @param database: database name
43 |         @param collection: collection name
44 |         @param data: document to insert
45 |         '''
46 |         db = self.client[database]
47 |         c = db[collection]
48 |         c.insert_one(data)
49 | 
50 |     def drop(self, database, collection):
51 |         '''
52 |         @param database: database name
53 |         @param collection: collection name
54 |         '''
55 |         db = self.client[database]
56 |         db[collection].drop()
57 | 
58 |     def remove(self, database, collection, condition):
59 |         '''
60 |         @param database: database name
61 |         @param collection: collection name
62 |         @param condition: filter condition
63 |         '''
64 |         db = self.client[database]
65 |         c = db[collection]
66 |         c.remove(condition)
67 | 
68 |     def query(self, sql, params=None):
69 |         pass
70 | 
71 | 
--------------------------------------------------------------------------------
/WeatherCrawler/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Scrapy settings for WeatherCrawler project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | #     http://doc.scrapy.org/en/latest/topics/settings.html
9 | #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 | 
12 | BOT_NAME = 'WeatherCrawler'
13 | 
14 | SPIDER_MODULES = ['WeatherCrawler.spiders']
15 | NEWSPIDER_MODULE = 'WeatherCrawler.spiders'
16 | 
17 | 
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'WeatherCrawler (+http://www.yourdomain.com)'
20 | 
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = True
23 | 
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 | 
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 | 
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 | 
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 | 
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | #   'Accept-Language': 'en',
45 | #}
46 | 
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | #    'WeatherCrawler.middlewares.WeathercrawlerSpiderMiddleware': 543,
51 | #}
52 | 
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | #    'WeatherCrawler.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 | 
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | #    'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 | 
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | #ITEM_PIPELINES = {
68 | #    'WeatherCrawler.pipelines.WeathercrawlerPipeline': 300,
69 | #}
70 | 
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 | 
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 | 
--------------------------------------------------------------------------------
/WeatherCrawler/spiders/province_spider.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | 
3 | import scrapy
4 | import WeatherCrawler.spiders.constant as constant
5 | from WeatherCrawler.db.mongo_db import MongoDB
6 | 
7 | class ProvinceSpider(scrapy.Spider):
8 |     '''
9 |     get province
10 |     '''
11 |     name = 'province_spider'
12 |     allowed_domains = ['weather.com.cn']
13 |     start_urls = ['http://www.weather.com.cn/province/']
14 | 
15 |     def __init__(self):
16 |         super(ProvinceSpider, self).__init__()
17 |         self.db = MongoDB()
18 |         self.db.remove('weather', 'wea', {})
19 | 
20 |     def parse(self, response):
21 |         '''
22 |         Parse the province index page.
23 |         '''
24 |         provinces = []
25 |         for li in response.xpath('//div[@class="sheng_rukou"]/ul/li'):
26 |             name = li.xpath('.//text()').extract_first()
27 |             if name not in constant.PIG_ZONE:
28 |                 provinces.append({
29 |                     'url': li.xpath('a/@href').extract_first(),
30 |                     'province': name
31 |                 })
32 |         for p in provinces:
33 |             yield scrapy.Request(p['url'], callback=self.parse_city, meta=p)
34 | 
35 |     def parse_city(self, response):
36 |         '''
37 |         Parse the cities/districts of a province.
38 |         '''
39 |         # parent province / municipality
40 |         province_info = response.meta
41 | 
42 |         cities = []
43 |         for a in response.xpath('//div[@class="navbox"]/span/a'):
44 |             cities.append({
45 |                 'url': response.urljoin(a.xpath('@href').extract_first()),
46 |                 'city': a.xpath('.//text()').extract_first()
47 |             })
48 |         # the Guangdong province page uses a different layout
49 |         if not cities:
50 |             for a in response.xpath('//div[@class="area_Weather"]/ul/li'):
51 |                 cities.append({
52 |                     'url': response.urljoin(a.xpath('./a/@href').extract_first()),
53 |                     'city': a.xpath('./a/text()').extract_first()
54 |                 })
55 |         for c in cities:
56 |             yield scrapy.Request(c['url'], callback=self.parse_county, meta={
57 |                 'province': province_info['province'],
58 |                 'city': c['city']
59 |             })
60 | 
61 | 
62 |     def parse_county(self, response):
63 |         '''
64 |         Parse the counties of a city.
65 |         '''
66 |         city_info = response.meta
67 | 
68 |         # municipalities have no county level, so parse the weather data directly
69 |         if city_info['province'] in constant.DIRECT_CITY:
70 |             self.parse_direct_weather(response, city_info)
71 | 
72 |         else:
73 |             counties = []
74 |             for a in response.xpath('//div[@class="navbox"]/span/a'):
75 |                 counties.append({
76 |                     'url': response.urljoin(a.xpath('@href').extract_first()),
77 |                     'county': a.xpath('.//text()').extract_first()
78 |                 })
79 |             for c in counties:
80 |                 city_info['county'] = c['county']
81 |                 yield scrapy.Request(c['url'], callback=self.parse_county_weather, meta=city_info)
82 | 
83 |     def parse_county_weather(self, response):
84 |         '''
85 |         Parse weather data for a county.
86 |         '''
87 |         meta = response.meta
88 |         self._parse_weather(response, meta)
89 | 
90 | 
91 |     def parse_direct_weather(self, response, meta):
92 |         '''
93 |         Parse weather data for a municipality.
94 |         '''
95 |         #self.logger.info('province:%s, city:%s', meta['province'], meta['city'])
96 |         self._parse_weather(response, meta)
97 | 
98 | 
99 |     def _parse_weather(self, response, meta):
100 |         seven_day_weather = []
101 |         for li in response.xpath('//div[@id="7d"]/ul[@class="t clearfix"]/li'):
102 |             # relative date
103 |             h1 = li.xpath('./h1/text()').extract_first()
104 |             # weather description
105 |             desc = li.xpath('./p[@class="wea"]/text()').extract_first()
106 |             # max / min temperature
107 |             max_tem = li.xpath('./p[@class="tem"]/span/text()').extract_first()
108 |             min_tem = li.xpath('./p[@class="tem"]/i/text()').extract_first()
109 |             # wind direction
110 |             wind_direction = li.xpath('.//em/span/@title').extract()
111 |             # wind force (this extraction may be fragile)
112 |             wf = li.xpath('.//i/text()').extract()
113 |             wind_force = wf[-1] if len(wf) >= 2 else 'unknown'
114 | 
115 |             seven_day_weather.append({
116 |                 'day': h1,
117 |                 'desc': desc,
118 |                 'max_tem': max_tem,
119 |                 'min_tem': min_tem,
120 |                 'wind_direction': wind_direction,
121 |                 'wind_force': wind_force
122 |             })
123 |         self.logger.info("========province:%s=======city:%s========county:%s", meta['province'], meta['city'], meta.get('county', None))
124 | 
125 |         data = {
126 |             'province': meta['province'],
127 |             'city': meta['city'],
128 |             'county': meta.get('county', None),
129 |             'data': seven_day_weather
130 |         }
131 |         self.db.insert('weather', 'wea', data)
132 | 
133 | 
--------------------------------------------------------------------------------
/aps/province_spider.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | 
3 | import scrapy
4 | import constant as constant
5 | from mongo_db import MongoDB
6 | from billiard import Process
7 | from my_logger import Logger
8 | 
9 | import time
10 | 
11 | class ProvinceSpider(scrapy.Spider):
12 |     '''
13 |     get province
14 |     '''
15 |     name = 'province_spider'
16 |     allowed_domains = ['weather.com.cn']
17 |     start_urls = ['http://www.weather.com.cn/province/']
18 | 
19 |     def __init__(self):
20 |         file_name = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time())) + '.log'
21 |         #self.logger = logger#logging.getLogger('weather.spider')
22 |         self.log = Logger('province_spider', console=False, file_name=file_name).getLogger()
23 |         self.db = MongoDB(
24 |             auth=True,
25 |             host='localhost',
26 |             user='jc',
27 |             password='jc',
28 |             authSource='admin',
29 |             authMechanism='SCRAM-SHA-1')
30 | 
31 |         self.db.remove('weather', 'wea', {})
32 |         super(ProvinceSpider, self).__init__()
33 | 
34 |     def parse(self, response):
35 |         '''
36 |         Parse the province index page.
37 |         '''
38 |         provinces = []
39 |         for li in response.xpath('//div[@class="sheng_rukou"]/ul/li'):
40 |             name = li.xpath('.//text()').extract_first()
41 |             if name not in constant.PIG_ZONE:
42 |                 provinces.append({
43 |                     'url': li.xpath('a/@href').extract_first(),
44 |                     'province': name
45 |                 })
46 |         for p in provinces:
47 |             yield scrapy.Request(p['url'], callback=self.parse_city, meta=p)
48 | 
49 |     def parse_city(self, response):
50 |         '''
51 |         Parse the cities/districts of a province.
52 |         '''
53 |         # parent province / municipality
54 |         province_info = response.meta
55 | 
56 |         cities = []
57 |         for a in response.xpath('//div[@class="navbox"]/span/a'):
58 |             cities.append({
59 |                 'url': response.urljoin(a.xpath('@href').extract_first()),
60 |                 'city': a.xpath('.//text()').extract_first()
61 |             })
62 |         # the Guangdong province page uses a different layout
63 |         if not cities:
64 |             for a in response.xpath('//div[@class="area_Weather"]/ul/li'):
65 |                 cities.append({
66 |                     'url': response.urljoin(a.xpath('./a/@href').extract_first()),
67 |                     'city': a.xpath('./a/text()').extract_first()
68 |                 })
69 |         for c in cities:
70 |             yield scrapy.Request(c['url'], callback=self.parse_county, meta={
71 |                 'province': province_info['province'],
72 |                 'city': c['city']
73 |             })
74 | 
75 | 
76 |     def parse_county(self, response):
77 |         '''
78 |         Parse the counties of a city.
79 |         '''
80 |         city_info = response.meta
81 | 
82 |         # municipalities have no county level, so parse the weather data directly
83 |         if city_info['province'] in constant.DIRECT_CITY:
84 |             self.parse_direct_weather(response, city_info)
85 | 
86 |         else:
87 |             counties = []
88 |             for a in response.xpath('//div[@class="navbox"]/span/a'):
89 |                 counties.append({
90 |                     'url': response.urljoin(a.xpath('@href').extract_first()),
91 |                     'county': a.xpath('.//text()').extract_first()
92 |                 })
93 |             for c in counties:
94 |                 city_info['county'] = c['county']
95 |                 yield scrapy.Request(c['url'], callback=self.parse_county_weather, meta=city_info)
96 | 
97 |     def parse_county_weather(self, response):
98 |         '''
99 |         Parse weather data for a county.
100 |         '''
101 |         meta = response.meta
102 |         self._parse_weather(response, meta)
103 | 
104 | 
105 |     def parse_direct_weather(self, response, meta):
106 |         '''
107 |         Parse weather data for a municipality.
108 |         '''
109 |         #self.log.info('province:%s, city:%s', meta['province'], meta['city'])
110 |         self._parse_weather(response, meta)
111 | 
112 | 
113 |     def _parse_weather(self, response, meta):
114 |         seven_day_weather = []
115 |         for li in response.xpath('//div[@id="7d"]/ul[@class="t clearfix"]/li'):
116 |             # relative date
117 |             h1 = li.xpath('./h1/text()').extract_first()
118 |             # weather description
119 |             desc = li.xpath('./p[@class="wea"]/text()').extract_first()
120 |             # max / min temperature
121 |             max_tem = li.xpath('./p[@class="tem"]/span/text()').extract_first()
122 |             min_tem = li.xpath('./p[@class="tem"]/i/text()').extract_first()
123 |             # wind direction
124 |             wind_direction = li.xpath('.//em/span/@title').extract()
125 |             # wind force (this extraction may be fragile)
126 |             wf = li.xpath('.//i/text()').extract()
127 |             wind_force = wf[-1] if len(wf) >= 2 else 'unknown'
128 | 
129 |             seven_day_weather.append({
130 |                 'day': h1,
131 |                 'desc': desc,
132 |                 'max_tem': max_tem,
133 |                 'min_tem': min_tem,
134 |                 'wind_direction': wind_direction,
135 |                 'wind_force': wind_force
136 |             })
137 |         self.log.info("========province:%s=======city:%s========county:%s", meta['province'], meta['city'], meta.get('county', None))
138 | 
139 |         data = {
140 |             'province': meta['province'],
141 |             'city': meta['city'],
142 |             'county': meta.get('county', None),
143 |             'data': seven_day_weather
144 |         }
145 |         self.db.insert('weather', 'wea', data)
146 | 
147 | 
148 | def _crawl(path):
149 |     from scrapy.crawler import CrawlerProcess
150 |     crawl = CrawlerProcess({
151 |         'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
152 |         'LOG_FILE': 'text.log',
153 |         'LOG_LEVEL': 'INFO'
154 |     })
155 |     crawl.crawl(ProvinceSpider)
156 |     crawl.start()
157 |     crawl.stop()
158 | 
159 | def run_crawl(path):
160 |     p = Process(target=_crawl, args=['hahahahha'])
161 |     p.start()
162 |     p.join()
163 | 
164 | if __name__ == '__main__':
165 |     run_crawl('xxx')
166 | 
--------------------------------------------------------------------------------