├── aps
│   ├── log
│   │   └── weather_aps.log
│   ├── constant.py
│   ├── test.py
│   ├── my_logger.py
│   ├── tasks.py
│   ├── mongo_db.py
│   └── province_spider.py
├── WeatherCrawler
│   ├── __init__.py
│   ├── db
│   │   ├── __init__.py
│   │   └── mongo_db.py
│   ├── spiders
│   │   ├── __init__.pyc
│   │   ├── constant.pyc
│   │   ├── province_spider.pyc
│   │   ├── constant.py
│   │   ├── __init__.py
│   │   └── province_spider.py
│   ├── pipelines.py
│   ├── items.py
│   ├── middlewares.py
│   └── settings.py
├── scrapy.cfg
└── README.md

/aps/log/weather_aps.log:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/WeatherCrawler/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/WeatherCrawler/db/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/WeatherCrawler/spiders/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cool-firer/WeatherCrawler/HEAD/WeatherCrawler/spiders/__init__.pyc
--------------------------------------------------------------------------------
/WeatherCrawler/spiders/constant.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cool-firer/WeatherCrawler/HEAD/WeatherCrawler/spiders/constant.pyc
--------------------------------------------------------------------------------
/WeatherCrawler/spiders/province_spider.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cool-firer/WeatherCrawler/HEAD/WeatherCrawler/spiders/province_spider.pyc
--------------------------------------------------------------------------------
/aps/constant.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | 
3 | # direct-administered municipalities
4 | DIRECT_CITY = [u'北京', u'上海', u'天津', u'重庆']
5 | 
6 | # regions skipped by the spider
7 | PIG_ZONE = [u'香港', u'台湾', u'澳门']
8 | 
9 | 
--------------------------------------------------------------------------------
/WeatherCrawler/spiders/constant.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | 
3 | # direct-administered municipalities
4 | DIRECT_CITY = [u'北京', u'上海', u'天津', u'重庆']
5 | 
6 | # regions skipped by the spider
7 | PIG_ZONE = [u'香港', u'台湾', u'澳门']
8 | 
9 | 
--------------------------------------------------------------------------------
/WeatherCrawler/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 | 
6 | [settings]
7 | default = WeatherCrawler.settings
8 | 
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = WeatherCrawler
12 | 
--------------------------------------------------------------------------------
/WeatherCrawler/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | 
8 | 
9 | class WeathercrawlerPipeline(object):
10 |     def process_item(self, item, spider):
11 |         return item
12 | 
--------------------------------------------------------------------------------
/WeatherCrawler/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 | 
8 | import scrapy
9 | 
10 | 
11 | class WeathercrawlerItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     pass
15 | 
--------------------------------------------------------------------------------
/aps/test.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | from apscheduler.schedulers.blocking import BlockingScheduler
3 | 
4 | scheduler = BlockingScheduler()
5 | 
6 | def test():
7 |     print "now is '%s' " % datetime.datetime.now()
8 | 
9 | scheduler.add_job(test, "cron", second="*/3")
10 | 
11 | try:
12 |     scheduler.start()
13 | except (KeyboardInterrupt, SystemExit):
14 |     scheduler.shutdown()
15 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # WeatherCrawler
2 | A spider that crawls weather data; it fetches the seven-day forecast starting from today.
3 | 
4 | Before running, make sure scrapy and mongodb are installed.
5 | 
6 | 
7 | Install scrapy: pip install scrapy
8 | 
9 | # Run the spider
10 | scrapy crawl province_spider
11 | 
12 | Or start it with an explicit log level:
13 | 
14 | scrapy crawl province_spider --loglevel INFO
15 | 
16 | The spider crawls weather data province by province, then city, then county, stores it in the database, and the final documents look like this:
17 | 

18 | weather> db.wea.findOne()
19 | {
20 |         "_id" : ObjectId("598c1a0efb64421dcc437c47"),
21 |         "province" : "湖北",
22 |         "city" : "武汉",
23 |         "data" : [
24 |                 {
25 |                         "wind_force" : "微风",
26 |                         "max_tem" : "35",
27 |                         "min_tem" : "27℃",
28 |                         "wind_direction" : [
29 |                                 "无持续风向",
30 |                                 "无持续风向"
31 |                         ],
32 |                         "day" : "10日(今天)",
33 |                         "desc" : "晴转多云"
34 |                 },
35 |               ...
36 |   }
37 | 
38 | 
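# Query the stored data

Once a crawl has finished, the documents can be read back with pymongo. The snippet below is only an illustrative sketch: it assumes mongod is reachable on localhost:27017 without authentication (matching WeatherCrawler/db/mongo_db.py; the aps/ variant connects with a username/password URI instead), and the script name and the example province/city are placeholders.

    # coding: utf8
    # read_weather.py -- hypothetical helper script, not part of the repository
    from pymongo import MongoClient

    client = MongoClient('localhost', 27017)   # same default connection as WeatherCrawler/db/mongo_db.py
    wea = client['weather']['wea']              # database / collection the spider writes to

    # one document per (province, city, county), each holding a seven-day forecast
    doc = wea.find_one({'province': u'湖北', 'city': u'武汉'})
    if doc:
        for day in doc['data']:
            print('%s %s~%s %s' % (day['day'], day['min_tem'], day['max_tem'], day['desc']))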
39 | 
--------------------------------------------------------------------------------
/aps/my_logger.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | 
3 | import logging
4 | 
5 | class Logger():
6 |     def __init__(self, logger_name='root', log_level=logging.INFO, file_name=None, console=True):
7 |         '''
8 |         @param logger_name: logger name
9 |         @param log_level: log level
10 |         @param file_name: log file path
11 |         @param console: also log to the console
12 |         '''
13 |         self.logger = logging.getLogger(logger_name)
14 |         self.logger.setLevel(log_level)
15 |         formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
16 | 
17 |         if file_name:
18 |             fh = logging.FileHandler(file_name)
19 |             fh.setLevel(log_level)
20 |             fh.setFormatter(formatter)
21 |             self.logger.addHandler(fh)
22 | 
23 |         if console:
24 |             ch = logging.StreamHandler()
25 |             ch.setLevel(log_level)
26 |             ch.setFormatter(formatter)
27 |             self.logger.addHandler(ch)
28 | 
29 |     def getLogger(self):
30 |         return self.logger
31 | 
--------------------------------------------------------------------------------
/aps/tasks.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | #-*- coding: utf-8 -*-
3 | 
4 | import datetime
5 | from apscheduler.schedulers.blocking import BlockingScheduler
6 | from scrapy.crawler import CrawlerProcess
7 | from province_spider import ProvinceSpider
8 | from billiard import Process
9 | 
10 | from scrapy.utils.log import configure_logging
11 | configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s', 'LOG_FILE': 'schedule.log'})
12 | 
13 | def _crawl(path=None):
14 |     crawl = CrawlerProcess({
15 |         'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
16 |     })
17 |     crawl.crawl(ProvinceSpider)
18 |     crawl.start()
19 |     crawl.stop()
20 | 
21 | def run_crawl(path=None):
22 |     p = Process(target=_crawl, args=['hahahahha'])
23 |     p.start()
24 |     #p.join()
25 | 
26 | 
27 | scheduler = BlockingScheduler(daemon=True)
28 | scheduler.add_job(run_crawl, "cron", hour=8, minute=30, timezone='Asia/Shanghai')
29 | scheduler.add_job(run_crawl, "cron", hour=12, minute=30, timezone='Asia/Shanghai')
30 | scheduler.add_job(run_crawl, "cron", hour=18, minute=30, timezone='Asia/Shanghai')
31 | 
32 | try:
33 |     scheduler.start()
34 | except (KeyboardInterrupt, SystemExit):
35 |     scheduler.shutdown()
36 | 
37 | 
--------------------------------------------------------------------------------
/WeatherCrawler/db/mongo_db.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | 
3 | import traceback
4 | import threading
5 | from pymongo import MongoClient
6 | 
7 | Lock = threading.Lock()
8 | 
9 | class MongoDB(object):
10 |     '''
11 |     mongodb class
12 |     '''
13 |     __instance = None
14 | 
15 |     __connection = MongoClient('localhost', 27017)
16 | 
17 |     def __init__(self):
18 |         self.client = self.__connection
19 | 
20 |     def __new__(cls, *args, **kwargs):
21 |         if not cls.__instance:
22 |             try:
23 |                 Lock.acquire()
24 |                 if not cls.__instance:
25 |                     cls.__instance = super(MongoDB, cls).__new__(cls, *args, **kwargs)
26 |             finally:
27 |                 Lock.release()
28 |         return cls.__instance
29 | 
30 |     def insert(self, database, collection, data):
31 |         '''
32 |         @param database: database name
33 |         @param collection: collection name
34 |         @param data: document to insert
35 |         '''
36 |         db = self.client[database]
37 |         c = db[collection]
38 |         c.insert_one(data)
39 | 
40 |     def drop(self, database, collection):
41 |         '''
42 |         @param database: database name
43 |         @param collection: collection name
44 |         '''
45 |         db = self.client[database]
46 |         db[collection].drop()
47 | 
48 |     def remove(self, database, collection, condition):
49 |         '''
50 |         @param database: database name
51 |         @param collection: collection name
52 |         @param condition: filter condition
53 |         '''
54 |         db = self.client[database]
55 |         c = db[collection]
56 |         c.remove(condition)
57 | 
58 |     def query(self, sql, params=None):
59 |         pass
60 | 
61 | 
--------------------------------------------------------------------------------
/WeatherCrawler/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 | 
8 | from scrapy import signals
9 | 
10 | 
11 | class WeathercrawlerSpiderMiddleware(object):
12 |     # Not all methods need to be defined. If a method is not defined,
13 |     # scrapy acts as if the spider middleware does not modify the
14 |     # passed objects.
15 | 
16 |     @classmethod
17 |     def from_crawler(cls, crawler):
18 |         # This method is used by Scrapy to create your spiders.
19 |         s = cls()
20 |         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 |         return s
22 | 
23 |     def process_spider_input(self, response, spider):
24 |         # Called for each response that goes through the spider
25 |         # middleware and into the spider.
26 | 
27 |         # Should return None or raise an exception.
28 |         return None
29 | 
30 |     def process_spider_output(self, response, result, spider):
31 |         # Called with the results returned from the Spider, after
32 |         # it has processed the response.
33 | 
34 |         # Must return an iterable of Request, dict or Item objects.
35 |         for i in result:
36 |             yield i
37 | 
38 |     def process_spider_exception(self, response, exception, spider):
39 |         # Called when a spider or process_spider_input() method
40 |         # (from other spider middleware) raises an exception.
41 | 
42 |         # Should return either None or an iterable of Response, dict
43 |         # or Item objects.
44 |         pass
45 | 
46 |     def process_start_requests(self, start_requests, spider):
47 |         # Called with the start requests of the spider, and works
48 |         # similarly to the process_spider_output() method, except
49 |         # that it doesn’t have a response associated.
50 | 
51 |         # Must return only requests (not items).
52 |         for r in start_requests:
53 |             yield r
54 | 
55 |     def spider_opened(self, spider):
56 |         spider.logger.info('Spider opened: %s' % spider.name)
57 | 
--------------------------------------------------------------------------------
/aps/mongo_db.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | 
3 | import traceback
4 | import threading
5 | from pymongo import MongoClient
6 | 
7 | Lock = threading.Lock()
8 | 
9 | class MongoDB(object):
10 |     '''
11 |     mongodb class
12 |     '''
13 |     __instance = None
14 | 
15 |     #__connection = MongoClient('localhost', 27017)
16 | 
17 |     def __init__(self, *args, **kwargs):
18 |         self.client = self.__connection
19 | 
20 |     def __new__(cls, *args, **kwargs):
21 |         if not cls.__instance:
22 |             try:
23 |                 Lock.acquire()
24 |                 if not cls.__instance:
25 |                     cls.__instance = super(MongoDB, cls).__new__(cls, *args, **kwargs)
26 | 
27 |                     if kwargs.get('auth', False) == False:
28 |                         #uri = 'mongodb://localhost:27017/'
29 |                         host = kwargs.get('host', 'localhost')
30 |                         port = kwargs.get('port', '27017')
31 |                         uri = 'mongodb://{host}:{port}/'.format(**{'host': host, 'port': port})
32 |                     else:
33 |                         uri = "mongodb://{user}:{password}@{host}/{authSource}?authMechanism={authMechanism}".format(**kwargs)
34 |                         #uri = "mongodb://jc:jc@localhost/admin?authMechanism=SCRAM-SHA-1"
35 |                     cls.__connection = MongoClient(uri)
36 |             finally:
37 |                 Lock.release()
38 |         return cls.__instance
39 | 
40 |     def insert(self, database, collection, data):
41 |         '''
42 |         @param database: database name
43 |         @param collection: collection name
44 |         @param data: document to insert
45 |         '''
46 |         db = self.client[database]
47 |         c = db[collection]
48 |         c.insert_one(data)
49 | 
50 |     def drop(self, database, collection):
51 |         '''
52 |         @param database: database name
53 |         @param collection: collection name
54 |         '''
55 |         db = self.client[database]
56 |         db[collection].drop()
57 | 
58 |     def remove(self, database, collection, condition):
59 |         '''
60 |         @param database: database name
61 |         @param collection: collection name
62 |         @param condition: filter condition
63 |         '''
64 |         db = self.client[database]
65 |         c = db[collection]
66 |         c.remove(condition)
67 | 
68 |     def query(self, sql, params=None):
69 |         pass
70 | 
71 | 
--------------------------------------------------------------------------------
/WeatherCrawler/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Scrapy settings for WeatherCrawler project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | #     http://doc.scrapy.org/en/latest/topics/settings.html
9 | #     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | #     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 | 
12 | BOT_NAME = 'WeatherCrawler'
13 | 
14 | SPIDER_MODULES = ['WeatherCrawler.spiders']
15 | NEWSPIDER_MODULE = 'WeatherCrawler.spiders'
16 | 
17 | 
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'WeatherCrawler (+http://www.yourdomain.com)'
20 | 
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = True
23 | 
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 | 
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 | 
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 | 
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 | 
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | #   'Accept-Language': 'en',
45 | #}
46 | 
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | #    'WeatherCrawler.middlewares.WeathercrawlerSpiderMiddleware': 543,
51 | #}
52 | 
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | #    'WeatherCrawler.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 | 
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | #    'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 | 
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | #ITEM_PIPELINES = {
68 | #    'WeatherCrawler.pipelines.WeathercrawlerPipeline': 300,
69 | #}
70 | 
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 | 
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 | 
--------------------------------------------------------------------------------
/WeatherCrawler/spiders/province_spider.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | 
3 | import scrapy
4 | import WeatherCrawler.spiders.constant as constant
5 | from WeatherCrawler.db.mongo_db import MongoDB
6 | 
7 | class ProvinceSpider(scrapy.Spider):
8 |     '''
9 |     get province
10 |     '''
11 |     name = 'province_spider'
12 |     allowed_domains = ['weather.com.cn']
13 |     start_urls = ['http://www.weather.com.cn/province/']
14 | 
15 |     def __init__(self):
16 |         super(ProvinceSpider, self).__init__()
17 |         self.db = MongoDB()
18 |         self.db.remove('weather', 'wea', {})
19 | 
20 |     def parse(self, response):
21 |         '''
22 |         Parse the province index page.
23 |         '''
24 |         provinces = []
25 |         for li in response.xpath('//div[@class="sheng_rukou"]/ul/li'):
26 |             name = li.xpath('.//text()').extract_first()
27 |             if name not in constant.PIG_ZONE:
28 |                 provinces.append({
29 |                     'url': li.xpath('a/@href').extract_first(),
30 |                     'province': name
31 |                 })
32 |         for p in provinces:
33 |             yield scrapy.Request(p['url'], callback=self.parse_city, meta=p)
34 | 
35 |     def parse_city(self, response):
36 |         '''
37 |         Parse the cities/districts of a province.
38 |         '''
39 |         # parent province / municipality
40 |         province_info = response.meta
41 | 
42 |         cities = []
43 |         for a in response.xpath('//div[@class="navbox"]/span/a'):
44 |             cities.append({
45 |                 'url': response.urljoin(a.xpath('@href').extract_first()),
46 |                 'city': a.xpath('.//text()').extract_first()
47 |             })
48 |         # the Guangdong province page uses a different layout
49 |         if not cities:
50 |             for a in response.xpath('//div[@class="area_Weather"]/ul/li'):
51 |                 cities.append({
52 |                     'url': response.urljoin(a.xpath('./a/@href').extract_first()),
53 |                     'city': a.xpath('./a/text()').extract_first()
54 |                 })
55 |         for c in cities:
56 |             yield scrapy.Request(c['url'], callback=self.parse_county, meta={
57 |                 'province': province_info['province'],
58 |                 'city': c['city']
59 |             })
60 | 
61 | 
62 |     def parse_county(self, response):
63 |         '''
64 |         Parse the counties of a city.
65 |         '''
66 |         city_info = response.meta
67 | 
68 |         # municipalities have no county level, so parse the weather data directly
69 |         if city_info['province'] in constant.DIRECT_CITY:
70 |             self.parse_direct_weather(response, city_info)
71 | 
72 |         else:
73 |             counties = []
74 |             for a in response.xpath('//div[@class="navbox"]/span/a'):
75 |                 counties.append({
76 |                     'url': response.urljoin(a.xpath('@href').extract_first()),
77 |                     'county': a.xpath('.//text()').extract_first()
78 |                 })
79 |             for c in counties:
80 |                 city_info['county'] = c['county']
81 |                 yield scrapy.Request(c['url'], callback=self.parse_county_weather, meta=city_info)
82 | 
83 |     def parse_county_weather(self, response):
84 |         '''
85 |         Parse weather data for a county.
86 |         '''
87 |         meta = response.meta
88 |         self._parse_weather(response, meta)
89 | 
90 | 
91 |     def parse_direct_weather(self, response, meta):
92 |         '''
93 |         Parse weather data for a municipality.
94 |         '''
95 |         #self.logger.info('province:%s, city:%s', meta['province'], meta['city'])
96 |         self._parse_weather(response, meta)
97 | 
98 | 
99 |     def _parse_weather(self, response, meta):
100 |         seven_day_weather = []
101 |         for li in response.xpath('//div[@id="7d"]/ul[@class="t clearfix"]/li'):
102 |             # relative date
103 |             h1 = li.xpath('./h1/text()').extract_first()
104 |             # weather description
105 |             desc = li.xpath('./p[@class="wea"]/text()').extract_first()
106 |             # max / min temperature
107 |             max_tem = li.xpath('./p[@class="tem"]/span/text()').extract_first()
108 |             min_tem = li.xpath('./p[@class="tem"]/i/text()').extract_first()
109 |             # wind direction
110 |             wind_direction = li.xpath('.//em/span/@title').extract()
111 |             # wind force (this extraction may be fragile)
112 |             wf = li.xpath('.//i/text()').extract()
113 |             wind_force = wf[-1] if len(wf) >= 2 else 'unknown'
114 | 
115 |             seven_day_weather.append({
116 |                 'day': h1,
117 |                 'desc': desc,
118 |                 'max_tem': max_tem,
119 |                 'min_tem': min_tem,
120 |                 'wind_direction': wind_direction,
121 |                 'wind_force': wind_force
122 |             })
123 |         self.logger.info("========province:%s=======city:%s========county:%s", meta['province'], meta['city'], meta.get('county', None))
124 | 
125 |         data = {
126 |             'province': meta['province'],
127 |             'city': meta['city'],
128 |             'county': meta.get('county', None),
129 |             'data': seven_day_weather
130 |         }
131 |         self.db.insert('weather', 'wea', data)
132 | 
133 | 
--------------------------------------------------------------------------------
/aps/province_spider.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 | 
3 | import scrapy
4 | import constant as constant
5 | from mongo_db import MongoDB
6 | from billiard import Process
7 | from my_logger import Logger
8 | 
9 | import time
10 | 
11 | class ProvinceSpider(scrapy.Spider):
12 |     '''
13 |     get province
14 |     '''
15 |     name = 'province_spider'
16 |     allowed_domains = ['weather.com.cn']
17 |     start_urls = ['http://www.weather.com.cn/province/']
18 | 
19 |     def __init__(self):
20 |         file_name = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time())) + '.log'
21 |         #self.logger = logger#logging.getLogger('weather.spider')
22 |         self.log = Logger('province_spider', console=False, file_name=file_name).getLogger()
23 |         self.db = MongoDB(
24 |             auth=True,
25 |             host='localhost',
26 |             user='jc',
27 |             password='jc',
28 |             authSource='admin',
29 |             authMechanism='SCRAM-SHA-1')
30 | 
31 |         self.db.remove('weather', 'wea', {})
32 |         super(ProvinceSpider, self).__init__()
33 | 
34 |     def parse(self, response):
35 |         '''
36 |         Parse the province index page.
37 |         '''
38 |         provinces = []
39 |         for li in response.xpath('//div[@class="sheng_rukou"]/ul/li'):
40 |             name = li.xpath('.//text()').extract_first()
41 |             if name not in constant.PIG_ZONE:
42 |                 provinces.append({
43 |                     'url': li.xpath('a/@href').extract_first(),
44 |                     'province': name
45 |                 })
46 |         for p in provinces:
47 |             yield scrapy.Request(p['url'], callback=self.parse_city, meta=p)
48 | 
49 |     def parse_city(self, response):
50 |         '''
51 |         Parse the cities/districts of a province.
52 |         '''
53 |         # parent province / municipality
54 |         province_info = response.meta
55 | 
56 |         cities = []
57 |         for a in response.xpath('//div[@class="navbox"]/span/a'):
58 |             cities.append({
59 |                 'url': response.urljoin(a.xpath('@href').extract_first()),
60 |                 'city': a.xpath('.//text()').extract_first()
61 |             })
62 |         # the Guangdong province page uses a different layout
63 |         if not cities:
64 |             for a in response.xpath('//div[@class="area_Weather"]/ul/li'):
65 |                 cities.append({
66 |                     'url': response.urljoin(a.xpath('./a/@href').extract_first()),
67 |                     'city': a.xpath('./a/text()').extract_first()
68 |                 })
69 |         for c in cities:
70 |             yield scrapy.Request(c['url'], callback=self.parse_county, meta={
71 |                 'province': province_info['province'],
72 |                 'city': c['city']
73 |             })
74 | 
75 | 
76 |     def parse_county(self, response):
77 |         '''
78 |         Parse the counties of a city.
79 |         '''
80 |         city_info = response.meta
81 | 
82 |         # municipalities have no county level, so parse the weather data directly
83 |         if city_info['province'] in constant.DIRECT_CITY:
84 |             self.parse_direct_weather(response, city_info)
85 | 
86 |         else:
87 |             counties = []
88 |             for a in response.xpath('//div[@class="navbox"]/span/a'):
89 |                 counties.append({
90 |                     'url': response.urljoin(a.xpath('@href').extract_first()),
91 |                     'county': a.xpath('.//text()').extract_first()
92 |                 })
93 |             for c in counties:
94 |                 city_info['county'] = c['county']
95 |                 yield scrapy.Request(c['url'], callback=self.parse_county_weather, meta=city_info)
96 | 
97 |     def parse_county_weather(self, response):
98 |         '''
99 |         Parse weather data for a county.
100 |         '''
101 |         meta = response.meta
102 |         self._parse_weather(response, meta)
103 | 
104 | 
105 |     def parse_direct_weather(self, response, meta):
106 |         '''
107 |         Parse weather data for a municipality.
108 |         '''
109 |         #self.log.info('province:%s, city:%s', meta['province'], meta['city'])
110 |         self._parse_weather(response, meta)
111 | 
112 | 
113 |     def _parse_weather(self, response, meta):
114 |         seven_day_weather = []
115 |         for li in response.xpath('//div[@id="7d"]/ul[@class="t clearfix"]/li'):
116 |             # relative date
117 |             h1 = li.xpath('./h1/text()').extract_first()
118 |             # weather description
119 |             desc = li.xpath('./p[@class="wea"]/text()').extract_first()
120 |             # max / min temperature
121 |             max_tem = li.xpath('./p[@class="tem"]/span/text()').extract_first()
122 |             min_tem = li.xpath('./p[@class="tem"]/i/text()').extract_first()
123 |             # wind direction
124 |             wind_direction = li.xpath('.//em/span/@title').extract()
125 |             # wind force (this extraction may be fragile)
126 |             wf = li.xpath('.//i/text()').extract()
127 |             wind_force = wf[-1] if len(wf) >= 2 else 'unknown'
128 | 
129 |             seven_day_weather.append({
130 |                 'day': h1,
131 |                 'desc': desc,
132 |                 'max_tem': max_tem,
133 |                 'min_tem': min_tem,
134 |                 'wind_direction': wind_direction,
135 |                 'wind_force': wind_force
136 |             })
137 |         self.log.info("========province:%s=======city:%s========county:%s", meta['province'], meta['city'], meta.get('county', None))
138 | 
139 |         data = {
140 |             'province': meta['province'],
141 |             'city': meta['city'],
142 |             'county': meta.get('county', None),
143 |             'data': seven_day_weather
144 |         }
145 |         self.db.insert('weather', 'wea', data)
146 | 
147 | 
148 | def _crawl(path):
149 |     from scrapy.crawler import CrawlerProcess
150 |     crawl = CrawlerProcess({
151 |         'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
152 |         'LOG_FILE': 'text.log',
153 |         'LOG_LEVEL': 'INFO'
154 |     })
155 |     crawl.crawl(ProvinceSpider)
156 |     crawl.start()
157 |     crawl.stop()
158 | 
159 | def run_crawl(path):
160 |     p = Process(target=_crawl, args=['hahahahha'])
161 |     p.start()
162 |     p.join()
163 | 
164 | if __name__ == '__main__':
165 |     run_crawl('xxx')
166 | 
--------------------------------------------------------------------------------