├── .gitignore
├── LICENSE
├── README.md
├── demo.png
├── scrapy.cfg
└── stats
    ├── __init__.py
    ├── items.py
    ├── pipelines.py
    ├── settings.py
    └── spiders
        ├── __init__.py
        └── quhua.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]

# C extensions
*.so

# Distribution / packaging
.Python
env/
bin/
build/
develop-eggs/
dist/
eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.cache
nosetests.xml
coverage.xml

# Translations
*.mo

# Mr Developer
.mr.developer.cfg
.project
.pydevproject

# Rope
.ropeproject

# Django stuff:
*.log
*.pot

# Sphinx documentation
docs/_build/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2014 Phyng

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
scrapy-stats
============

A Scrapy project that crawls the administrative division codes published by China's National Bureau of Statistics and visualizes them with D3.js.

## Contents
Crawls the [statistical division codes of the National Bureau of Statistics](http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/) and obtains the division codes for the five years 2009-2013. Taking 2013 as an example:

layer | count(code) | name
------|-------------|------------
1 | 31 | province / municipality / autonomous region
2 | 345 | prefecture-level city
3 | 2856 | county / district
4 | 43854 | township / town / sub-district
5 | 694688 | village / neighborhood committee

## Visualization
Demo: [http://phyng.com/scrapy-stats/](http://phyng.com/scrapy-stats/)

![demo](demo.png)
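
## Running the spider
A minimal sketch of driving the spider from a helper script. The `run_quhua.py` name is illustrative and not part of this repository; it assumes a Scrapy >= 1.0 installation and that the script sits next to `scrapy.cfg`:

```python
# run_quhua.py -- hypothetical helper script, not shipped with this repository
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Load the project settings (BOT_NAME, SPIDER_MODULES, ...) declared in stats/settings.py
process = CrawlerProcess(get_project_settings())

# 'quhua' is the spider name defined in stats/spiders/quhua.py
process.crawl('quhua')
process.start()  # blocks until the crawl finishes
```

Equivalently, `scrapy crawl quhua -o quhua.json` runs the spider from the command line and writes the scraped items to a JSON feed.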
--------------------------------------------------------------------------------
/demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/phyng/scrapy-stats/e9eacbaba69914d78da6f31b723223be13fae08c/demo.png
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# http://doc.scrapy.org/en/latest/topics/scrapyd.html

[settings]
default = stats.settings

[deploy]
#url = http://localhost:6800/
project = stats
--------------------------------------------------------------------------------
/stats/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/phyng/scrapy-stats/e9eacbaba69914d78da6f31b723223be13fae08c/stats/__init__.py
--------------------------------------------------------------------------------
/stats/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


# One item class per administrative layer; every item carries the year it was
# scraped from, the unit's name and its division code.
class Layer01_Item(scrapy.Item):
    year = scrapy.Field()
    name = scrapy.Field()
    code = scrapy.Field()

class Layer02_Item(scrapy.Item):
    year = scrapy.Field()
    name = scrapy.Field()
    code = scrapy.Field()

class Layer03_Item(scrapy.Item):
    year = scrapy.Field()
    name = scrapy.Field()
    code = scrapy.Field()

class Layer04_Item(scrapy.Item):
    year = scrapy.Field()
    name = scrapy.Field()
    code = scrapy.Field()

class Layer05_Item(scrapy.Item):
    year = scrapy.Field()
    name = scrapy.Field()
    code = scrapy.Field()
    code2 = scrapy.Field()
--------------------------------------------------------------------------------
/stats/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class StatsPipeline(object):
    def process_item(self, item, spider):
        return item
--------------------------------------------------------------------------------
/stats/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for stats project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'stats'

SPIDER_MODULES = ['stats.spiders']
NEWSPIDER_MODULE = 'stats.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'stats (+http://www.yourdomain.com)'
--------------------------------------------------------------------------------
/stats/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/stats/spiders/quhua.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

from __future__ import print_function
import sys
reload(sys)  # Python 2 idiom: re-expose setdefaultencoding so UTF-8 can be forced below
sys.setdefaultencoding('utf-8')

import re

from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors import LinkExtractor
from scrapy.http import Request

from stats.items import *


class StatsSpider(CrawlSpider):
    name = 'quhua'
    allowed_domains = ['stats.gov.cn']
    start_urls = ['http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/']
    rules = (
        # Handle the province-level listing for each year (.../20xx/index.html)
        Rule(LinkExtractor(allow=(r'tjyqhdmhcxhfdm/20\d\d/index\.html')), callback='Layer01_Parse'),
    )

    def Layer01_Parse(self, response):

        item = Layer01_Item()
        for i in LinkExtractor(allow=(r'tjyqhdmhcxhfdm/20\d\d/\d\d\.html')).extract_links(response):
            url = i.url
            text = i.text
            item['year'] = url[-12:-8]  # e.g. ".../2013/11.html" -> "2013"
            item['name'] = text
            item['code'] = url[-7:-5]   # two-digit province code, e.g. "11"
            yield item
            yield Request(url, callback=self.Layer02_Parse)

    def Layer02_Parse(self, response):
        text = response.xpath('/html/body/table[2]/tbody/tr[1]/td/table/tbody/tr[2]/td/table/tbody/tr/td/table')\
            [0].extract()
        item = Layer02_Item()
        item['year'] = re.findall(r'dm/20\d\d', response.url)[0][3:]  # "dm/2013" -> "2013"
        for code, name in re.findall(r'href="\d\d/(\d{4})\.html">([^\d]+?)</a>', text):
            item['name'] = name
            item['code'] = code
            yield item
        for i in LinkExtractor(allow=(r'tjyqhdmhcxhfdm/20\d\d/\d\d/\d{4}\.html')).extract_links(response):
            url = i.url
            text = i.text
            yield Request(url, callback=self.Layer03_Parse)

    def Layer03_Parse(self, response):
        text = response.xpath('/html/body/table[2]/tbody/tr[1]/td/table/tbody/tr[2]/td/table/tbody/tr/td/table')\
            [0].extract()
        item = Layer03_Item()
        item['year'] = re.findall(r'dm/20\d\d', response.url)[0][3:]
        for code, name in re.findall(r'href="\d\d/(\d{6})\.html">([^\d]+?)</a>', text):
            item['name'] = name
            item['code'] = code
            yield item
        for i in LinkExtractor(allow=(r'tjyqhdmhcxhfdm/20\d\d/\d\d/\d\d/\d{6}\.html')).extract_links(response):
            url = i.url
            text = i.text
            yield Request(url, callback=self.Layer04_Parse)

    def Layer04_Parse(self, response):
        text = response.xpath('/html/body/table[2]/tbody/tr[1]/td/table/tbody/tr[2]/td/table/tbody/tr/td/table')\
            [0].extract()
        item = Layer04_Item()
        item['year'] = re.findall(r'dm/20\d\d', response.url)[0][3:]
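        # Layer-4 entries are townships/towns/sub-districts: each link target is
        # named after the unit's 9-digit division code and the anchor text is the
        # unit's name, which is what the regex below extracts.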
        for code, name in re.findall(r'href="\d\d/(\d{9}).html">([^\d]+?)</a>', text):
            item['name'] = name
            item['code'] = code
            yield item
        for i in LinkExtractor(allow=(r'tjyqhdmhcxhfdm/20\d\d/\d\d/\d\d/\d\d/\d{9}\.html')).extract_links(response):
            url = i.url
            text = i.text
            yield Request(url, callback=self.Layer05_Parse)

    def Layer05_Parse(self, response):
        text = response.xpath('/html/body/table[2]/tbody/tr[1]/td/table/tbody/tr[2]/td/table/tbody/tr/td/table')\
            [0].extract()
        item = Layer05_Item()
        item['year'] = re.findall(r'dm/20\d\d', response.url)[0][3:]
        for code, code2, name in re.findall(r'