├── aps
│   ├── log
│   │   └── weather_aps.log
│   ├── constant.py
│   ├── test.py
│   ├── my_logger.py
│   ├── tasks.py
│   ├── mongo_db.py
│   └── province_spider.py
├── WeatherCrawler
│   ├── __init__.py
│   ├── db
│   │   ├── __init__.py
│   │   └── mongo_db.py
│   ├── spiders
│   │   ├── __init__.pyc
│   │   ├── constant.pyc
│   │   ├── province_spider.pyc
│   │   ├── constant.py
│   │   ├── __init__.py
│   │   └── province_spider.py
│   ├── pipelines.py
│   ├── items.py
│   ├── middlewares.py
│   └── settings.py
├── scrapy.cfg
└── README.md
/aps/log/weather_aps.log:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/WeatherCrawler/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/WeatherCrawler/db/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/WeatherCrawler/spiders/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cool-firer/WeatherCrawler/HEAD/WeatherCrawler/spiders/__init__.pyc
--------------------------------------------------------------------------------
/WeatherCrawler/spiders/constant.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cool-firer/WeatherCrawler/HEAD/WeatherCrawler/spiders/constant.pyc
--------------------------------------------------------------------------------
/WeatherCrawler/spiders/province_spider.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cool-firer/WeatherCrawler/HEAD/WeatherCrawler/spiders/province_spider.pyc
--------------------------------------------------------------------------------
/aps/constant.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 |
3 | # Municipalities directly under the central government
4 | DIRECT_CITY = [u'北京', u'上海', u'天津', u'重庆']
5 |
6 | # Regions skipped by the crawler (Hong Kong, Taiwan, Macau)
7 | PIG_ZONE = [u'香港', u'台湾', u'澳门']
8 |
9 |
--------------------------------------------------------------------------------
/WeatherCrawler/spiders/constant.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 |
3 | # Municipalities directly under the central government
4 | DIRECT_CITY = [u'北京', u'上海', u'天津', u'重庆']
5 |
6 | # Regions skipped by the crawler (Hong Kong, Taiwan, Macau)
7 | PIG_ZONE = [u'香港', u'台湾', u'澳门']
8 |
9 |
--------------------------------------------------------------------------------
/WeatherCrawler/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = WeatherCrawler.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = WeatherCrawler
12 |
--------------------------------------------------------------------------------
/WeatherCrawler/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class WeathercrawlerPipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
--------------------------------------------------------------------------------
/WeatherCrawler/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class WeathercrawlerItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | pass
15 |
--------------------------------------------------------------------------------
/aps/test.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | from apscheduler.schedulers.blocking import BlockingScheduler
3 |
4 | scheduler = BlockingScheduler()
5 |
6 | def test():
7 | print "now is '%s' " % datetime.datetime.now()
8 |
9 | scheduler.add_job(test, "cron", second="*/3")
10 |
11 | try:
12 | scheduler.start()
13 | except (KeyboardInterrupt, SystemExit):
14 | scheduler.shutdown()
15 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # WeatherCrawler
2 | A spider that crawls weather data: it fetches the seven-day forecast starting from today.
3 |
4 | Before running, make sure scrapy and MongoDB are installed (the remaining Python dependencies are listed at the end of this README).
5 |
6 |
7 | Install scrapy: pip install scrapy
8 |
9 | # Run the spider
10 | scrapy crawl province_spider
11 |
12 | Or start it with an explicit log level:
13 |
14 | scrapy crawl province_spider --loglevel INFO
15 |
16 | It crawls weather data province by province, then city, then county, stores it in MongoDB, and the resulting documents look like this:
17 |
18 | weather> db.wea.findOne()
19 | {
20 | "_id" : ObjectId("598c1a0efb64421dcc437c47"),
21 | "province" : "湖北",
22 | "city" : "武汉",
23 | "data" : [
24 | {
25 | "wind_force" : "微风",
26 | "max_tem" : "35",
27 | "min_tem" : "27℃",
28 | "wind_direction" : [
29 | "无持续风向",
30 | "无持续风向"
31 | ],
32 | "day" : "10日(今天)",
33 | "desc" : "晴转多云"
34 | },
35 | ...
36 | }
37 |
38 |
39 |
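40 | # Other dependencies
41 | The code also imports pymongo (see WeatherCrawler/db/mongo_db.py), and the standalone scheduler under aps/ uses apscheduler and billiard as well. Assuming pip is available, they can presumably be installed the same way:
42 |
43 | pip install pymongo apscheduler billiard
44 |
45 | # Scheduled crawling
46 | aps/tasks.py registers cron jobs that launch the spider at 08:30, 12:30 and 18:30 (Asia/Shanghai). It imports its sibling modules directly, so it is presumably meant to be run from inside the aps/ directory:
47 |
48 | cd aps && python tasks.py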
--------------------------------------------------------------------------------
/aps/my_logger.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 |
3 | import logging
4 |
5 | class Logger():
6 | def __init__(self, logger_name='root', log_level=logging.INFO, file_name=None, console=True):
7 | '''
8 |         @param logger_name: logger name
9 |         @param log_level: log level
10 |         @param file_name: path of the log file; if None, no file handler is added
11 |         @param console: whether to also log to the console
12 | '''
13 | self.logger = logging.getLogger(logger_name)
14 | self.logger.setLevel(log_level)
15 | formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
16 |
17 | if file_name:
18 | fh = logging.FileHandler(file_name)
19 | fh.setLevel(log_level)
20 | fh.setFormatter(formatter)
21 | self.logger.addHandler(fh)
22 |
23 | if console:
24 | ch = logging.StreamHandler()
25 | ch.setLevel(log_level)
26 | ch.setFormatter(formatter)
27 | self.logger.addHandler(ch)
28 |
29 | def getLogger(self):
30 | return self.logger
31 |
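32 | # Example usage (mirrors aps/province_spider.py, which passes a timestamped file_name; the file name here is illustrative):
33 | #   log = Logger('province_spider', console=False, file_name='weather.log').getLogger()
34 | #   log.info('spider started')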
--------------------------------------------------------------------------------
/aps/tasks.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 |
4 | import datetime
5 | from apscheduler.schedulers.blocking import BlockingScheduler
6 | from scrapy.crawler import CrawlerProcess
7 | from province_spider import ProvinceSpider
8 | from billiard import Process
9 |
10 | from scrapy.utils.log import configure_logging
11 | configure_logging({'LOG_FORMAT': '%(levelname)s: %(message)s', 'LOG_FILE': 'schedule.log'})
12 |
13 | def _crawl(path=None):
14 | crawl = CrawlerProcess({
15 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
16 | })
17 | crawl.crawl(ProvinceSpider)
18 | crawl.start()
19 | crawl.stop()
20 |
21 | def run_crawl(path=None):
22 | p = Process(target=_crawl, args=['hahahahha'])
23 | p.start()
24 | #p.join()
25 |
26 |
27 | scheduler = BlockingScheduler(daemon=True)
28 | scheduler.add_job(run_crawl, "cron", hour=8, minute=30, timezone='Asia/Shanghai')
29 | scheduler.add_job(run_crawl, "cron", hour=12, minute=30, timezone='Asia/Shanghai')
30 | scheduler.add_job(run_crawl, "cron", hour=18, minute=30, timezone='Asia/Shanghai')
31 |
32 | try:
33 | scheduler.start()
34 | except (KeyboardInterrupt, SystemExit):
35 | scheduler.shutdown()
36 |
37 |
--------------------------------------------------------------------------------
/WeatherCrawler/db/mongo_db.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 |
3 | import traceback
4 | import threading
5 | from pymongo import MongoClient
6 |
7 | Lock = threading.Lock()
8 |
9 | class MongoDB(object):
10 | '''
11 |     Singleton wrapper around MongoClient (instance creation guarded by a lock).
12 | '''
13 | __instance = None
14 |
15 | __connection = MongoClient('localhost', 27017)
16 |
17 | def __init__(self):
18 | self.client = self.__connection
19 |
20 | def __new__(cls, *args, **kwargs):
21 | if not cls.__instance:
22 | try:
23 | Lock.acquire()
24 | if not cls.__instance:
25 | cls.__instance = super(MongoDB, cls).__new__(cls, *args, **kwargs)
26 | finally:
27 | Lock.release()
28 | return cls.__instance
29 |
30 | def insert(self, database, collection, data):
31 | '''
32 |         @param database: database name
33 |         @param collection: collection name
34 |         @param data: document to insert
35 | '''
36 | db = self.client[database]
37 | c = db[collection]
38 | c.insert_one(data)
39 |
40 | def drop(self, database, collection):
41 | '''
42 |         @param database: database name
43 |         @param collection: collection name
44 | '''
45 | db = self.client[database]
46 | db[collection].drop()
47 |
48 | def remove(self, database, collection, condition):
49 | '''
50 |         @param database: database name
51 |         @param collection: collection name
52 |         @param condition: filter for the documents to delete
53 | '''
54 | db = self.client[database]
55 | c = db[collection]
56 | c.remove(condition)
57 |
58 | def query(self, sql, params=None):
59 | pass
60 |
61 |
--------------------------------------------------------------------------------
/WeatherCrawler/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class WeathercrawlerSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/aps/mongo_db.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 |
3 | import traceback
4 | import threading
5 | from pymongo import MongoClient
6 |
7 | Lock = threading.Lock()
8 |
9 | class MongoDB(object):
10 | '''
11 |     Singleton wrapper around MongoClient; the connection URI (with optional auth) is built on first instantiation.
12 | '''
13 | __instance = None
14 |
15 | #__connection = MongoClient('localhost', 27017)
16 |
17 | def __init__(self, *args, **kwargs):
18 | self.client = self.__connection
19 |
20 | def __new__(cls, *args, **kwargs):
21 | if not cls.__instance:
22 | try:
23 | Lock.acquire()
24 | if not cls.__instance:
25 | cls.__instance = super(MongoDB, cls).__new__(cls, *args, **kwargs)
26 |
27 | if kwargs.get('auth', False) == False:
28 | #uri = 'mongodb://localhost:27017/'
29 | host = kwargs.get('host', 'localhost')
30 | port = kwargs.get('port', '27017')
31 | uri = 'mongodb://{host}:{port}/'.format(**{'host': host, 'port': port})
32 | else:
33 | uri = "mongodb://{user}:{password}@{host}/{authSource}?authMechanism={authMechanism}".format(**kwargs)
34 | #uri = "mongodb://jc:jc@localhost/admin?authMechanism=SCRAM-SHA-1"
35 | cls.__connection = MongoClient(uri)
36 | finally:
37 | Lock.release()
38 | return cls.__instance
39 |
40 | def insert(self, database, collection, data):
41 | '''
42 |         @param database: database name
43 |         @param collection: collection name
44 |         @param data: document to insert
45 | '''
46 | db = self.client[database]
47 | c = db[collection]
48 | c.insert_one(data)
49 |
50 | def drop(self, database, collection):
51 | '''
52 |         @param database: database name
53 |         @param collection: collection name
54 | '''
55 | db = self.client[database]
56 | db[collection].drop()
57 |
58 | def remove(self, database, collection, condition):
59 | '''
60 |         @param database: database name
61 |         @param collection: collection name
62 |         @param condition: filter for the documents to delete
63 | '''
64 | db = self.client[database]
65 | c = db[collection]
66 | c.remove(condition)
67 |
68 | def query(self, sql, params=None):
69 | pass
70 |
71 |
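72 | # Example (as constructed in aps/province_spider.py; the inserted document is illustrative):
73 | #   db = MongoDB(auth=True, host='localhost', user='jc', password='jc',
74 | #                authSource='admin', authMechanism='SCRAM-SHA-1')
75 | #   db.insert('weather', 'wea', {'province': 'test'})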
--------------------------------------------------------------------------------
/WeatherCrawler/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for WeatherCrawler project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'WeatherCrawler'
13 |
14 | SPIDER_MODULES = ['WeatherCrawler.spiders']
15 | NEWSPIDER_MODULE = 'WeatherCrawler.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'WeatherCrawler (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = True
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'WeatherCrawler.middlewares.WeathercrawlerSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'WeatherCrawler.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | #ITEM_PIPELINES = {
68 | # 'WeatherCrawler.pipelines.WeathercrawlerPipeline': 300,
69 | #}
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | #HTTPCACHE_ENABLED = True
87 | #HTTPCACHE_EXPIRATION_SECS = 0
88 | #HTTPCACHE_DIR = 'httpcache'
89 | #HTTPCACHE_IGNORE_HTTP_CODES = []
90 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/WeatherCrawler/spiders/province_spider.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 |
3 | import scrapy
4 | import WeatherCrawler.spiders.constant as constant
5 | from WeatherCrawler.db.mongo_db import MongoDB
6 |
7 | class ProvinceSpider(scrapy.Spider):
8 | '''
9 |     Crawl 7-day forecasts from weather.com.cn, walking province -> city -> county.
10 | '''
11 | name = 'province_spider'
12 | allowed_domains = ['weather.com.cn']
13 | start_urls = ['http://www.weather.com.cn/province/']
14 |
15 | def __init__(self):
16 | super(ProvinceSpider, self).__init__()
17 | self.db = MongoDB()
18 | self.db.remove('weather', 'wea', {})
19 |
20 | def parse(self, response):
21 | '''
22 |         Parse the province index page.
23 | '''
24 | provinces = []
25 | for li in response.xpath('//div[@class="sheng_rukou"]/ul/li'):
26 | name = li.xpath('.//text()').extract_first()
27 | if name not in constant.PIG_ZONE:
28 | provinces.append({
29 | 'url': li.xpath('a/@href').extract_first(),
30 | 'province': name
31 | })
32 | for p in provinces:
33 | yield scrapy.Request(p['url'], callback=self.parse_city, meta=p)
34 |
35 | def parse_city(self, response):
36 | '''
37 |         Parse the cities/districts of a province.
38 | '''
39 |         # parent province / municipality
40 | province_info = response.meta
41 |
42 | cities = []
43 | for a in response.xpath('//div[@class="navbox"]/span/a'):
44 | cities.append({
45 | 'url': response.urljoin(a.xpath('@href').extract_first()),
46 | 'city': a.xpath('.//text()').extract_first()
47 | })
48 |         # note: Guangdong's province page uses a different layout
49 | if not cities:
50 | for a in response.xpath('//div[@class="area_Weather"]/ul/li'):
51 | cities.append({
52 | 'url': response.urljoin(a.xpath('./a/@href').extract_first()),
53 | 'city': a.xpath('./a/text()').extract_first()
54 | })
55 | for c in cities:
56 | yield scrapy.Request(c['url'], callback=self.parse_county, meta={
57 | 'province': province_info['province'],
58 | 'city': c['city']
59 | })
60 |
61 |
62 | def parse_county(self, response):
63 | '''
64 |         Parse the counties of a city.
65 | '''
66 | city_info = response.meta
67 |
68 |         # Municipalities have no county level; parse the weather data directly
69 | if city_info['province'] in constant.DIRECT_CITY:
70 | self.parse_direct_weather(response, city_info)
71 |
72 | else:
73 | counties = []
74 | for a in response.xpath('//div[@class="navbox"]/span/a'):
75 | counties.append({
76 | 'url': response.urljoin(a.xpath('@href').extract_first()),
77 | 'county': a.xpath('.//text()').extract_first()
78 | })
79 | for c in counties:
80 | city_info['county'] = c['county']
81 | yield scrapy.Request(c['url'], callback=self.parse_county_weather, meta=city_info)
82 |
83 | def parse_county_weather(self, response):
84 | '''
85 |         Parse county-level weather data.
86 | '''
87 | meta = response.meta
88 | self._parse_weather(response, meta)
89 |
90 |
91 | def parse_direct_weather(self, response, meta):
92 | '''
93 |         Parse municipality weather data.
94 | '''
95 |         #self.logger.info('province:%s, city:%s', meta['province'], meta['city'])
96 | self._parse_weather(response, meta)
97 |
98 |
99 | def _parse_weather(self, response, meta):
100 | seven_day_weather = []
101 | for li in response.xpath('//div[@id="7d"]/ul[@class="t clearfix"]/li'):
102 |             # relative date label, e.g. "10日(今天)"
103 |             h1 = li.xpath('./h1/text()').extract_first()
104 |             # weather description
105 |             desc = li.xpath('./p[@class="wea"]/text()').extract_first()
106 |             # max / min temperature
107 |             max_tem = li.xpath('./p[@class="tem"]/span/text()').extract_first()
108 |             min_tem = li.xpath('./p[@class="tem"]/i/text()').extract_first()
109 |             # wind direction
110 |             wind_direction = li.xpath('.//em/span/@title').extract()
111 |             # wind force (fragile: depends on the last <i> text node)
112 |             wf = li.xpath('.//i/text()').extract()
113 |             wind_force = wf[-1] if len(wf) >= 2 else 'unknown'
114 |
115 | seven_day_weather.append({
116 | 'day': h1,
117 | 'desc': desc,
118 | 'max_tem': max_tem,
119 | 'min_tem': min_tem,
120 | 'wind_direction': wind_direction,
121 | 'wind_force': wind_force
122 | })
123 | self.logger.info("========province:%s=======city:%s========county:%s", meta['province'], meta['city'], meta.get('county', None))
124 |
125 | data = {
126 | 'province': meta['province'],
127 | 'city': meta['city'],
128 | 'county': meta.get('county', None),
129 | 'data': seven_day_weather
130 | }
131 | self.db.insert('weather', 'wea', data)
132 |
133 |
--------------------------------------------------------------------------------
/aps/province_spider.py:
--------------------------------------------------------------------------------
1 | # coding: utf8
2 |
3 | import scrapy
4 | import constant as constant
5 | from mongo_db import MongoDB
6 | from billiard import Process
7 | from my_logger import Logger
8 |
9 | import time
10 |
11 | class ProvinceSpider(scrapy.Spider):
12 | '''
13 |     Crawl 7-day forecasts from weather.com.cn, walking province -> city -> county.
14 | '''
15 | name = 'province_spider'
16 | allowed_domains = ['weather.com.cn']
17 | start_urls = ['http://www.weather.com.cn/province/']
18 |
19 | def __init__(self):
20 | file_name = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time())) + '.log'
21 | #self.logger = logger#logging.getLogger('weather.spider')
22 | self.log = Logger('province_spider', console=False, file_name=file_name).getLogger()
23 | self.db = MongoDB(
24 | auth=True,
25 | host='localhost',
26 | user='jc',
27 | password='jc',
28 | authSource='admin',
29 | authMechanism='SCRAM-SHA-1')
30 |
31 | self.db.remove('weather', 'wea', {})
32 | super(ProvinceSpider, self).__init__()
33 |
34 | def parse(self, response):
35 | '''
36 |         Parse the province index page.
37 | '''
38 | provinces = []
39 | for li in response.xpath('//div[@class="sheng_rukou"]/ul/li'):
40 | name = li.xpath('.//text()').extract_first()
41 | if name not in constant.PIG_ZONE:
42 | provinces.append({
43 | 'url': li.xpath('a/@href').extract_first(),
44 | 'province': name
45 | })
46 | for p in provinces:
47 | yield scrapy.Request(p['url'], callback=self.parse_city, meta=p)
48 |
49 | def parse_city(self, response):
50 | '''
51 |         Parse the cities/districts of a province.
52 | '''
53 |         # parent province / municipality
54 | province_info = response.meta
55 |
56 | cities = []
57 | for a in response.xpath('//div[@class="navbox"]/span/a'):
58 | cities.append({
59 | 'url': response.urljoin(a.xpath('@href').extract_first()),
60 | 'city': a.xpath('.//text()').extract_first()
61 | })
62 |         # note: Guangdong's province page uses a different layout
63 | if not cities:
64 | for a in response.xpath('//div[@class="area_Weather"]/ul/li'):
65 | cities.append({
66 | 'url': response.urljoin(a.xpath('./a/@href').extract_first()),
67 | 'city': a.xpath('./a/text()').extract_first()
68 | })
69 | for c in cities:
70 | yield scrapy.Request(c['url'], callback=self.parse_county, meta={
71 | 'province': province_info['province'],
72 | 'city': c['city']
73 | })
74 |
75 |
76 | def parse_county(self, response):
77 | '''
78 |         Parse the counties of a city.
79 | '''
80 | city_info = response.meta
81 |
82 |         # Municipalities have no county level; parse the weather data directly
83 | if city_info['province'] in constant.DIRECT_CITY:
84 | self.parse_direct_weather(response, city_info)
85 |
86 | else:
87 | counties = []
88 | for a in response.xpath('//div[@class="navbox"]/span/a'):
89 | counties.append({
90 | 'url': response.urljoin(a.xpath('@href').extract_first()),
91 | 'county': a.xpath('.//text()').extract_first()
92 | })
93 | for c in counties:
94 | city_info['county'] = c['county']
95 | yield scrapy.Request(c['url'], callback=self.parse_county_weather, meta=city_info)
96 |
97 | def parse_county_weather(self, response):
98 | '''
99 |         Parse county-level weather data.
100 | '''
101 | meta = response.meta
102 | self._parse_weather(response, meta)
103 |
104 |
105 | def parse_direct_weather(self, response, meta):
106 | '''
107 |         Parse municipality weather data.
108 | '''
109 |         #self.log.info('province:%s, city:%s', meta['province'], meta['city'])
110 | self._parse_weather(response, meta)
111 |
112 |
113 | def _parse_weather(self, response, meta):
114 | seven_day_weather = []
115 | for li in response.xpath('//div[@id="7d"]/ul[@class="t clearfix"]/li'):
116 |             # relative date label, e.g. "10日(今天)"
117 |             h1 = li.xpath('./h1/text()').extract_first()
118 |             # weather description
119 |             desc = li.xpath('./p[@class="wea"]/text()').extract_first()
120 |             # max / min temperature
121 |             max_tem = li.xpath('./p[@class="tem"]/span/text()').extract_first()
122 |             min_tem = li.xpath('./p[@class="tem"]/i/text()').extract_first()
123 |             # wind direction
124 |             wind_direction = li.xpath('.//em/span/@title').extract()
125 |             # wind force (fragile: depends on the last <i> text node)
126 |             wf = li.xpath('.//i/text()').extract()
127 |             wind_force = wf[-1] if len(wf) >= 2 else 'unknown'
128 |
129 | seven_day_weather.append({
130 | 'day': h1,
131 | 'desc': desc,
132 | 'max_tem': max_tem,
133 | 'min_tem': min_tem,
134 | 'wind_direction': wind_direction,
135 | 'wind_force': wind_force
136 | })
137 | self.log.info("========province:%s=======city:%s========county:%s", meta['province'], meta['city'], meta.get('county', None))
138 |
139 | data = {
140 | 'province': meta['province'],
141 | 'city': meta['city'],
142 | 'county': meta.get('county', None),
143 | 'data': seven_day_weather
144 | }
145 | self.db.insert('weather', 'wea', data)
146 |
147 |
148 | def _crawl(path):
149 | from scrapy.crawler import CrawlerProcess
150 | crawl = CrawlerProcess({
151 | 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)',
152 | 'LOG_FILE': 'text.log',
153 | 'LOG_LEVEL': 'INFO'
154 | })
155 | crawl.crawl(ProvinceSpider)
156 | crawl.start()
157 | crawl.stop()
158 |
159 | def run_crawl(path):
160 | p = Process(target=_crawl, args=['hahahahha'])
161 | p.start()
162 | p.join()
163 |
164 | if __name__ == '__main__':
165 | run_crawl('xxx')
166 |
--------------------------------------------------------------------------------