├── .gitignore
├── LICENSE
├── README.md
├── demo.png
├── scrapy.cfg
└── stats
    ├── __init__.py
    ├── items.py
    ├── pipelines.py
    ├── settings.py
    └── spiders
        ├── __init__.py
        └── quhua.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
bin/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

# Rope
.ropeproject

# Django stuff:
*.log
*.pot

# Sphinx documentation
docs/_build/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2014 Phyng

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
scrapy-stats
============

A Scrapy project that crawls the administrative division codes published by China's National Bureau of Statistics and visualizes them with D3.js.

## Contents
Crawls the [statistical division codes of the National Bureau of Statistics](http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/) and obtains the division codes for the five years 2009-2013. Taking 2013 as an example:

layer | count(code) | name
------|-------------|------------
1 | 31 | province / municipality / autonomous region
2 | 345 | prefecture-level city
3 | 2856 | county / district
4 | 43854 | township / town / sub-district
5 | 694688 | village / neighborhood committee

## Visualization
Demo: [http://phyng.com/scrapy-stats/](http://phyng.com/scrapy-stats/)

![demo](demo.png)
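
## Running the spider
A minimal sketch of driving the spider from a helper script. The `run_quhua.py` name is illustrative and not part of this repository; it assumes a Scrapy >= 1.0 installation and that the script sits next to `scrapy.cfg`:

```python
# run_quhua.py -- hypothetical helper script, not shipped with this repository
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the project settings (BOT_NAME, SPIDER_MODULES, ...) declared in stats/settings.py
process = CrawlerProcess(get_project_settings())

# 'quhua' is the spider name defined in stats/spiders/quhua.py
process.crawl('quhua')
process.start()  # blocks until the crawl finishes
```

Equivalently, `scrapy crawl quhua -o quhua.json` runs the spider from the command line and writes the scraped items to a JSON feed.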
--------------------------------------------------------------------------------
/demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/phyng/scrapy-stats/e9eacbaba69914d78da6f31b723223be13fae08c/demo.png
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html

[settings]
default = stats.settings

[deploy]
#url = http://localhost:6800/
project = stats
--------------------------------------------------------------------------------
/stats/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/phyng/scrapy-stats/e9eacbaba69914d78da6f31b723223be13fae08c/stats/__init__.py
--------------------------------------------------------------------------------
/stats/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


# One item class per administrative layer; every item carries the year it was
# scraped from, the unit's name and its division code.
class Layer01_Item(scrapy.Item):
    year = scrapy.Field()
    name = scrapy.Field()
    code = scrapy.Field()

class Layer02_Item(scrapy.Item):
    year = scrapy.Field()
    name = scrapy.Field()
    code = scrapy.Field()

class Layer03_Item(scrapy.Item):
    year = scrapy.Field()
    name = scrapy.Field()
    code = scrapy.Field()

class Layer04_Item(scrapy.Item):
    year = scrapy.Field()
    name = scrapy.Field()
    code = scrapy.Field()

class Layer05_Item(scrapy.Item):
    year = scrapy.Field()
    name = scrapy.Field()
    code = scrapy.Field()
    code2 = scrapy.Field()
--------------------------------------------------------------------------------
/stats/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class StatsPipeline(object):
    def process_item(self, item, spider):
        return item
--------------------------------------------------------------------------------
/stats/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for stats project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'stats'

SPIDER_MODULES = ['stats.spiders']
NEWSPIDER_MODULE = 'stats.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'stats (+http://www.yourdomain.com)'
--------------------------------------------------------------------------------
/stats/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/stats/spiders/quhua.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

from __future__ import print_function
import sys
reload(sys)  # Python 2 idiom: re-expose setdefaultencoding so UTF-8 can be forced below
sys.setdefaultencoding('utf-8')

import re

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.http import Request

from stats.items import *


class StatsSpider(CrawlSpider):
    name = 'quhua'
    allowed_domains = ['stats.gov.cn']
    start_urls = ['http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/']
    rules = (
        # Handle the province-level listing for each year (.../20xx/index.html)
        Rule(LinkExtractor(allow=(r'tjyqhdmhcxhfdm/20\d\d/index\.html')), callback='Layer01_Parse'),
    )

    def Layer01_Parse(self, response):

        item = Layer01_Item()
        for i in LinkExtractor(allow=(r'tjyqhdmhcxhfdm/20\d\d/\d\d\.html')).extract_links(response):
            url = i.url
            text = i.text
            item['year'] = url[-12:-8]  # e.g. ".../2013/11.html" -> "2013"
            item['name'] = text
            item['code'] = url[-7:-5]   # two-digit province code, e.g. "11"
            yield item
            yield Request(url, callback=self.Layer02_Parse)

    def Layer02_Parse(self, response):
        text = response.xpath('/html/body/table[2]/tbody/tr[1]/td/table/tbody/tr[2]/td/table/tbody/tr/td/table')\
            [0].extract()
        item = Layer02_Item()
        item['year'] = re.findall(r'dm/20\d\d', response.url)[0][3:]  # "dm/2013" -> "2013"
        for code, name in re.findall(r'href="\d\d/(\d{4})\.html">([^\d]+?)</a>', text):
            item['name'] = name
            item['code'] = code
            yield item
        for i in LinkExtractor(allow=(r'tjyqhdmhcxhfdm/20\d\d/\d\d/\d{4}\.html')).extract_links(response):
            url = i.url
            text = i.text
            yield Request(url, callback=self.Layer03_Parse)

    def Layer03_Parse(self, response):
        text = response.xpath('/html/body/table[2]/tbody/tr[1]/td/table/tbody/tr[2]/td/table/tbody/tr/td/table')\
            [0].extract()
        item = Layer03_Item()
        item['year'] = re.findall(r'dm/20\d\d', response.url)[0][3:]
        for code, name in re.findall(r'href="\d\d/(\d{6})\.html">([^\d]+?)</a>', text):
            item['name'] = name
            item['code'] = code
            yield item
        for i in LinkExtractor(allow=(r'tjyqhdmhcxhfdm/20\d\d/\d\d/\d\d/\d{6}\.html')).extract_links(response):
            url = i.url
            text = i.text
            yield Request(url, callback=self.Layer04_Parse)

    def Layer04_Parse(self, response):
        text = response.xpath('/html/body/table[2]/tbody/tr[1]/td/table/tbody/tr[2]/td/table/tbody/tr/td/table')\
            [0].extract()
        item = Layer04_Item()
        item['year'] = re.findall(r'dm/20\d\d', response.url)[0][3:]
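        # Layer-4 entries are townships/towns/sub-districts: each link target is
        # named after the unit's 9-digit division code and the anchor text is the
        # unit's name, which is what the regex below extracts.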
        for code, name in re.findall(r'href="\d\d/(\d{9}).html">([^\d]+?)</a>', text):
            item['name'] = name
            item['code'] = code
            yield item
        for i in LinkExtractor(allow=(r'tjyqhdmhcxhfdm/20\d\d/\d\d/\d\d/\d\d/\d{9}\.html')).extract_links(response):
            url = i.url
            text = i.text
            yield Request(url, callback=self.Layer05_Parse)

    def Layer05_Parse(self, response):
        text = response.xpath('/html/body/table[2]/tbody/tr[1]/td/table/tbody/tr[2]/td/table/tbody/tr/td/table')\
            [0].extract()
        item = Layer05_Item()
        item['year'] = re.findall(r'dm/20\d\d', response.url)[0][3:]
        for code, code2, name in re.findall(r'