├── .gitignore
├── LICENSE
├── ProxyPools
│   ├── crawlProxy
│   │   ├── __init__.py
│   │   └── crawlProxy.py
│   ├── flask_api
│   │   ├── __init__.py
│   │   └── flask_api.py
│   ├── manage
│   │   ├── __init__.py
│   │   └── manageProxy.py
│   └── tools
│       ├── __init__.py
│       ├── config.py
│       ├── ext.py
│       ├── tools.py
│       └── useragent.py
├── README.md
├── apkdownload
│   ├── GooglePlayRank_0.txt
│   ├── GooglePlayRank_1.txt
│   ├── GooglePlayRank_2.txt
│   ├── GooglePlayRank_3.txt
│   ├── __init__.py
│   ├── apk
│   │   ├── com.google.android.youtube.apk
│   │   ├── com.hth.docbaotonghop.apk
│   │   ├── com.sports.scores.football.schedule.oakland.radiers.apk
│   │   └── com.tiffany.engagement.apk
│   ├── config.py
│   └── download.py
├── baidutieba
│   └── BDTBwithbs4.py
├── dingdianxiaoshuo
│   └── dingdian
│       ├── dingdian
│       │   ├── __init__.py
│       │   ├── items.py
│       │   ├── mysqlpipelines
│       │   │   ├── __init__.py
│       │   │   ├── models.py
│       │   │   ├── mypipelines.py
│       │   │   └── mysqldb.py
│       │   ├── pipelines.py
│       │   ├── settings.py
│       │   └── spiders
│       │       ├── __init__.py
│       │       └── spider_dingdian.py
│       ├── entrypoint.py
│       └── scrapy.cfg
├── gpcrawler
│   ├── __init__.py
│   ├── entrypoint.py
│   ├── gpcrawler
│   │   ├── __init__.py
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── __init__.py
│   │       └── crawler.py
│   ├── scrapy.cfg
│   └── trans_txt.py
├── gpprivacy
│   ├── GooglePlayRank.txt
│   ├── GooglePlayRank2.txt
│   ├── gp_privacy_crawler.py
│   └── privacy_with_sms.txt
├── huaban
│   └── huaban.py
├── liaoxuefengpdf
│   └── liaoxuefeng_pdf.py
├── meizitu
│   ├── __init__.py
│   ├── config.py
│   ├── crawler_queue.py
│   ├── download.py
│   ├── getAllPageToQueue.py
│   ├── spider_meizitu.py
│   └── spider_meizitu_with_queue.py
├── python爬取微信公众号历史文章链接思路.md
├── tickets
│   ├── crawl_stations.py
│   ├── requirements.txt
│   ├── stations.py
│   └── tickets.py
├── weather
│   ├── local_weather.txt
│   ├── requirements.txt
│   ├── scrapy.cfg
│   ├── wea.json
│   └── weather
│       ├── __init__.py
│       ├── items.py
│       ├── middlewares.py
│       ├── pipelines.py
│       ├── settings.py
│       └── spiders
│           ├── __init__.py
│           └── localweather.py
├── wechat
│   └── crawl_wechat.py
└── zhihu
    ├── entrypoint.py
    ├── scrapy.cfg
    └── zhihu
        ├── __init__.py
        ├── items.py
        ├── middlewares.py
        ├── pipelines.py
        ├── settings.py
        └── spiders
            ├── __init__.py
            └── zhihu.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # dotenv
79 | .env
80 |
81 | # virtualenv
82 | venv/
83 | ENV/
84 |
85 | # Spyder project settings
86 | .spyderproject
87 |
88 | # Rope project settings
89 | .ropeproject
90 | .idea
91 | .idea/
92 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 fst034356
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/ProxyPools/crawlProxy/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
--------------------------------------------------------------------------------
/ProxyPools/crawlProxy/crawlProxy.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # coding=utf-8
3 |
4 | import re
5 | from tools.tools import crawlProxy, getHtmlTree
6 |
7 |
8 | class getProxy(object):
9 |
10 | def __init__(self):
11 | pass
12 |
13 | @staticmethod
14 | @crawlProxy
15 | def getProxyFirst(page=10):
16 | '''
17 | Crawl: Kuaidaili free proxies, http://www.kuaidaili.com/
18 | :param page:
19 | :return:
20 | '''
21 | url_list = (
22 | 'http://www.kuaidaili.com/proxylist/{page}/'.format(
23 | page=page) for page in range(
24 | 1, page + 1))
25 | for url in url_list:
26 | tree = getHtmlTree(url=url)
27 | #proxy_list = tree.xpath('.//div[@id="index_free_list"]//tbody/tr')
28 | proxy_list = tree.xpath(
29 | '//*[@id="index_free_list"]/table/tbody/tr')
30 | print('1')
31 | for proxy in proxy_list:
32 | print('2')
33 | # print(proxy)
34 | yield ':'.join(proxy.xpath('./td/text()')[0:2])
35 |
36 | @staticmethod
37 | @crawlProxy
38 | def getProxySecond(proxy_num=100):
39 | '''
40 | Crawl: 66ip proxies, http://www.66ip.cn/ (66ip provides an API, so proxies can be extracted directly)
41 | :param proxy_num:
42 | :return:
43 | '''
44 | url = "http://m.66ip.cn/mo.php?sxb=&tqsl={}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=".format(
45 | proxy_num)
46 | html = getHtmlTree(url, xpath=False)
47 | proxy_list = re.findall(
48 | r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', str(html))
49 | for proxy in proxy_list:
50 | yield proxy
51 |
52 | @staticmethod
53 | @crawlProxy
54 | def getProxyThird(days=1):
55 | '''
56 | Crawl: Youdaili proxies, http://www.youdaili.net/Daili/http/
57 | :param days:
58 | :return:
59 | '''
60 | url = "http://www.youdaili.net/Daili/http/"
61 | tree = getHtmlTree(url)
62 | page_url_list = tree.xpath(
63 | './/div[@class="chunlist"]/ul/li/p/a/@href')[0:days]
64 | for page_url in page_url_list:
65 | html = getHtmlTree(page_url, xpath=False)
66 | proxy_list = re.findall(
67 | r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', str(html))
68 | for proxy in proxy_list:
69 | yield proxy
70 |
71 | @staticmethod
72 | @crawlProxy
73 | def getProxyForth():
74 | '''
75 | Crawl: Xici proxies, http://www.xicidaili.com/ (high-anonymity and transparent lists)
76 | :return:
77 | '''
78 |
79 | url_list = ['http://www.xicidaili.com/nn', # high-anonymity
80 | 'http://www.xicidaili.com/nt', # transparent
81 | ]
82 | for each_url in url_list:
83 | tree = getHtmlTree(each_url)
84 | proxy_list = tree.xpath('.//table[@id="ip_list"]//tr')
85 | for proxy in proxy_list:
86 | yield ':'.join(proxy.xpath('./td/text()')[0:2])
87 |
88 | @staticmethod
89 | @crawlProxy
90 | def getProxyFifth():
91 | '''
92 | Crawl: Goubanjia proxies, http://www.goubanjia.com/free/gngn/index.shtml
93 | :return:
94 | '''
95 |
96 | url = "http://www.goubanjia.com/free/gngn/index{page}.shtml"
97 | for page in range(1, 10):
98 | page_url = url.format(page=page)
99 | tree = getHtmlTree(page_url)
100 | proxy_list = tree.xpath('//td[@class="ip"]')
101 | for each_proxy in proxy_list:
102 | yield ''.join(each_proxy.xpath('.//text()'))
103 |
104 | '''
105 | if __name__ == '__main__':
106 | gg = getProxy()
107 |
108 | for n in gg.getProxyFifth():
109 | print(n)
110 | '''
--------------------------------------------------------------------------------
/ProxyPools/flask_api/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 |
--------------------------------------------------------------------------------
/ProxyPools/flask_api/flask_api.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # coding=utf-8
3 |
4 | from flask import Flask, jsonify, request
5 |
6 | from manage.manageProxy import Proxymanager
7 |
8 | app = Flask(__name__)
9 |
10 | api_list = {
11 | 'get': u'get a usable proxy',
12 | 'refresh': u'refresh the proxy pool',
13 | 'get_all': u'get all proxies from the proxy pool',
14 | 'delete?proxy=127.0.0.1:8080': u'delete an unusable proxy',
15 | }
16 |
17 | proxymanager = Proxymanager()
18 |
19 |
20 | @app.route('/')
21 | def index():
22 | return jsonify(api_list)
23 |
24 |
25 | @app.route('/get/')
26 | def get():
27 | proxy = proxymanager.getVerifyProxy()
28 | return proxy
29 |
30 |
31 | @app.route('/get_all/')
32 | def getAll():
33 | proxies = proxymanager.getAllVerifyProxy()
34 | return jsonify(list(proxies))
35 |
36 |
37 | @app.route('/refresh/')
38 | def refresh():
39 | proxymanager.refresh()
40 | return 'refresh success'
41 |
42 |
43 | @app.route('/delete/', methods=['GET'])
44 | def delete():
45 | proxy = request.args.get('proxy')
46 | proxymanager.delete_proxy(proxy)
47 | return 'delete success'
48 |
49 | if __name__ == '__main__':
50 | app.run(host='0.0.0.0', port=5000)
51 |
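# Example client usage (a minimal sketch, not part of the original module; it
# assumes the API is running locally on the host/port configured above):
#
#   import requests
#   proxy = requests.get('http://127.0.0.1:5000/get/').text
#   resp = requests.get('http://httpbin.org/ip',
#                       proxies={'http': 'http://' + proxy,
#                                'https': 'https://' + proxy},
#                       timeout=10)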
--------------------------------------------------------------------------------
/ProxyPools/manage/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
--------------------------------------------------------------------------------
/ProxyPools/manage/manageProxy.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import requests
3 | from multiprocessing import Process
4 | from apscheduler.schedulers.blocking import BlockingScheduler
5 |
6 | from crawlProxy.crawlProxy import getProxy
7 | from tools.config import MONGO_TABLE_ALL, MONGO_TABLE_VERIFY
8 | from tools.ext import db
9 | from tools.tools import verifyProxy
10 |
11 |
12 | class Proxymanager(object):
13 |
14 | def __init__(self):
15 | '''
16 | alldb: collection of every crawled proxy
17 | verifydb: collection of proxies that passed validation
18 | '''
19 | self.alldb = db[MONGO_TABLE_ALL]
20 | self.verifydb = db[MONGO_TABLE_VERIFY]
21 |
22 | def refresh(self):
23 | '''
24 | Drop the old data, then re-crawl proxies and store them in the database
25 | :return:
26 | '''
27 | self.alldb.drop()
28 | self.verifydb.drop()
29 |
30 | proxies = getProxy()
31 |
32 | for proxy in proxies.getProxySecond():
33 | if verifyProxy(str(proxy)):
34 | proxy_dict = {'proxy': str(proxy)}
35 | self.alldb.insert(proxy_dict)
36 |
37 | for proxy in proxies.getProxyThird():
38 | if verifyProxy(str(proxy)):
39 | proxy_dict = {'proxy': str(proxy)}
40 | self.alldb.insert(proxy_dict)
41 |
42 | for proxy in proxies.getProxyForth():
43 | if verifyProxy(str(proxy)):
44 | proxy_dict = {'proxy': str(proxy)}
45 | self.alldb.insert(proxy_dict)
46 |
47 | for proxy in proxies.getProxyFifth():
48 | if verifyProxy(str(proxy)):
49 | proxy_dict = {'proxy': str(proxy)}
50 | self.alldb.insert(proxy_dict)
51 |
52 | def getAllProxy(self):
53 | '''
54 | Yield every crawled proxy
55 | :return:
56 | '''
57 | for proxy in self.alldb.find():
58 | yield proxy
59 |
60 | def getVerifyProxy(self):
61 | '''
62 | Return one verified proxy
63 | :return:
64 | '''
65 | return self.verifydb.find_one()['proxy']
66 |
67 | def getAllVerifyProxy(self):
68 | '''
69 | Yield every verified proxy
70 | :return:
71 | '''
72 | for proxy in self.verifydb.find():
73 | yield proxy['proxy']
74 |
75 | def valid_proxy(self):
76 | '''
77 | Validate each crawled proxy; working ones are stored in verifydb
78 | :return:
79 | '''
80 | print('start valid proxy!')
81 | for p in self.getAllProxy():
82 | proxy = {}
83 | proxy['proxy'] = p['proxy']
84 | proxies = {"http": "http://{proxy}".format(proxy=proxy['proxy']),
85 | "https": "https://{proxy}".format(proxy=proxy['proxy'])}
86 |
87 | try:
88 | response = requests.get(
89 | 'https://www.baidu.com',
90 | proxies=proxies,
91 | timeout=30,
92 | verify=False)
93 | if response.status_code == 200:
94 | self.verifydb.insert(proxy)
95 | print('Proxy:%s is useful!' % proxy['proxy'])
96 | except Exception as e:
97 | print("Error: %s" % e)
98 | print('Proxy: %s validation fail' % proxy['proxy'])
99 | print('valid proxy complete!')
100 |
101 | def delete_proxy(self, delproxy):
102 | self.verifydb.remove({'proxy': delproxy})
103 |
104 |
105 | def refresh_pool():
106 | schedules = Proxymanager()
107 | schedules.valid_proxy()
108 |
109 |
110 | def main(process_num=10):
111 | manager = Proxymanager()
112 | manager.refresh()
113 | pool = []
114 | for num in range(process_num):
115 | proc = Process(target=refresh_pool, args=())
116 | pool.append(proc)
117 |
118 | for num in range(process_num):
119 | pool[num].start()
120 |
121 | for num in range(process_num):
122 | pool[num].join()
123 |
124 | if __name__ == '__main__':
125 | main()
126 | schedule = BlockingScheduler()
127 | schedule.add_job(main, 'interval', minutes=10)
128 | schedule.start()
129 |
--------------------------------------------------------------------------------
/ProxyPools/tools/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
--------------------------------------------------------------------------------
/ProxyPools/tools/config.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | MONGO_URL = 'localhost'
4 | MONGO_DB = 'freeproxy'
5 | MONGO_TABLE_ALL = 'allfreeproxy'
6 | MONGO_TABLE_VERIFY = 'verifyfreeproxy'
7 |
--------------------------------------------------------------------------------
/ProxyPools/tools/ext.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import pymongo
4 |
5 | from tools.config import MONGO_URL, MONGO_DB
6 |
7 | client = pymongo.MongoClient(MONGO_URL, connect=False)
8 | db = client[MONGO_DB]
9 |
--------------------------------------------------------------------------------
/ProxyPools/tools/tools.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python3
2 | # coding=utf-8
3 |
4 | import requests
5 | import re
6 | import functools
7 | from lxml import etree as ET
8 | from .useragent import header
9 |
10 |
11 | def crawlProxy(func):
12 | '''
13 | Decorator for the proxy crawlers: logs the error instead of raising.
14 | Note: the wrapped functions are generators, so this only catches errors raised
15 | while the generator object is created, not errors raised during iteration.
16 | '''
17 | @functools.wraps(func)
18 | def wrapper(*args, **kwargs):
19 | try:
20 | return func(*args, **kwargs)
21 | except Exception as e:
22 | print('Failed to crawl proxy IPs: %s' % e)
23 | return wrapper
24 |
25 |
26 | def verifyProxy(proxy):
27 | '''
28 | Check that the proxy string has a valid ip:port format
29 | :param proxy:
30 | :return:
31 | '''
32 | verify_regex = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}"
33 | return True if re.findall(verify_regex, proxy) else False
34 |
35 |
36 | def getHtmlTree(url, xpath=True, **kwargs):
37 | '''
38 | Fetch a free-proxy page; return an lxml HTML tree (or the raw HTML when xpath=False)
39 | :param url:
40 | :param kwargs:
41 | :return:
42 | '''
43 | if xpath:
44 | html = requests.get(url=url, headers=header, timeout=30).content
45 | return ET.HTML(html)
46 | else:
47 | html = requests.get(url=url, headers=header).content
48 | return html
49 |
--------------------------------------------------------------------------------
/ProxyPools/tools/useragent.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import random
3 |
4 | useragents = [
5 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
6 | "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
7 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
8 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
9 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
10 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
11 | "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
12 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
13 | "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
14 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
15 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
16 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
17 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
18 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
19 | "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
20 | "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
21 | "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
22 | "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"]
23 |
24 | agent = random.choice(useragents)
25 |
26 | header = {
27 | 'User-Agent': agent,
28 | }
29 |
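# `agent` above is chosen once at import time, so every request made by a process
# reuses the same User-Agent. A per-request helper is sketched below for callers
# that want a fresh header each time (an illustrative addition, not used elsewhere):
def random_header():
    return {'User-Agent': random.choice(useragents)}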
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # crawler
2 | Assorted code written while learning Python web crawling.
3 |
4 | ## baidutieba
5 | Crawl the content of every floor of a Baidu Tieba thread with urllib2
6 |
7 | ## huaban
8 | Crawl images from Huaban with selenium
9 |
10 | ## liaoxuefengpdf
11 | Crawl the tutorials on Liao Xuefeng's site with requests and convert them to PDF
12 |
13 | ## dingdianxiaoshuo
14 | Crawl every novel on the Dingdian novel site with scrapy
15 |
16 | ## meizitu
17 | Crawl all images from the Meizitu site
18 |
19 | ## weather
20 | Crawl Sina Weather with scrapy
21 |
22 | ## tickets
23 | Fetch 12306 train ticket information
24 |
25 | ## wechat
26 | Crawl the links of every article published by a WeChat official account
27 |
28 | ## zhihu
29 | Distributed crawl of all Zhihu user profiles with scrapy-redis. Scrapy fetches the data through Zhihu's API, and Redis ties the distributed workers together. Starting from one user's following list, the spider recursively crawls everyone that user follows and everyone who follows them, eventually covering every Zhihu user who follows or is followed by at least one other user; accounts with no followees and no followers are not crawled. All crawled profiles are stored in MongoDB.
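
A minimal sketch of the scrapy-redis wiring described above (the `scrapy_redis` class and setting names are its real API; the Redis key, the Zhihu endpoint fields and all other details are illustrative assumptions, not this repo's actual spider in `zhihu/spiders/zhihu.py`):

```python
# settings.py: route scheduling and de-duplication through a shared Redis
#   SCHEDULER = "scrapy_redis.scheduler.Scheduler"
#   DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
#   REDIS_URL = "redis://127.0.0.1:6379"

import json

import scrapy
from scrapy_redis.spiders import RedisSpider


class FolloweeSketchSpider(RedisSpider):
    """Workers pointed at the same Redis pop start URLs from one shared queue."""
    name = 'zhihu_followee_sketch'
    redis_key = 'zhihu:start_urls'  # push a followees API URL into this Redis list

    def parse(self, response):
        data = json.loads(response.text)
        for user in data.get('data', []):
            # Store the profile; a full run would also enqueue this user's own
            # followee/follower lists to walk the whole follow graph.
            yield {'name': user.get('name'), 'url_token': user.get('url_token')}
        paging = data.get('paging', {})
        if not paging.get('is_end') and paging.get('next'):
            yield scrapy.Request(paging['next'], callback=self.parse)
```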
30 |
31 | ## gpcrawler
32 | Crawl app package names from Google Play with scrapy
33 |
--------------------------------------------------------------------------------
/apkdownload/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yyyy777/crawler/98f0c1a129b3b5b77fe88971f4f0c6aae5a8964f/apkdownload/__init__.py
--------------------------------------------------------------------------------
/apkdownload/apk/com.google.android.youtube.apk:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yyyy777/crawler/98f0c1a129b3b5b77fe88971f4f0c6aae5a8964f/apkdownload/apk/com.google.android.youtube.apk
--------------------------------------------------------------------------------
/apkdownload/apk/com.hth.docbaotonghop.apk:
--------------------------------------------------------------------------------
[Failed download: instead of an APK binary, this file contains the saved HTML of Evozi's "APK Downloader" landing/FAQ page.]
--------------------------------------------------------------------------------
/apkdownload/apk/com.sports.scores.football.schedule.oakland.radiers.apk:
--------------------------------------------------------------------------------
[Failed download: instead of an APK binary, this file contains the saved HTML of Evozi's "APK Downloader" landing/FAQ page.]
347 |
--------------------------------------------------------------------------------
/apkdownload/apk/com.tiffany.engagement.apk:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yyyy777/crawler/98f0c1a129b3b5b77fe88971f4f0c6aae5a8964f/apkdownload/apk/com.tiffany.engagement.apk
--------------------------------------------------------------------------------
/apkdownload/config.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yyyy777/crawler/98f0c1a129b3b5b77fe88971f4f0c6aae5a8964f/apkdownload/config.py
--------------------------------------------------------------------------------
/apkdownload/download.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- encoding: utf-8 -*-
3 |
4 | import os
5 | import shutil
6 |
7 | import time
8 |
9 | import requests
10 | from multiprocessing import Process
11 | from selenium.webdriver.chrome.options import Options
12 | from selenium import webdriver
13 | import platform
14 |
15 | WINDOWS = 'Windows'
16 | MAC_OS = 'Darwin'
17 | LINUX = 'Linux'
18 |
19 |
20 | def init_chrome_driver(num):
21 | chrome_options = Options()
22 | download_dir = os.path.join(os.getcwd(), "apk")
23 | prefs = {'download.default_directory': download_dir}
24 | # profile = {"download.default_directory": "NUL", "download.prompt_for_download": False}
25 | chrome_options.add_experimental_option('prefs', prefs)
26 | if platform.system() == WINDOWS:
27 | userdata_path = r'D:\chrome\chromedata{0}'.format(num)
28 | cache_path = r'D:\chrome\cache{0}'.format(num)
29 | chrome_options.binary_location = r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'
30 | driver_path = r'C:\Program Files (x86)\chromedriver_win32\chromedriver'
31 | chrome_options.add_argument('user-data-dir=' + userdata_path)
32 | chrome_options.add_argument('--disk-cache-dir=' + cache_path)
33 | chrome_options.add_argument('--no-sandbox')
34 | preferences_file = os.path.join(
35 | userdata_path, 'Default', 'Preferences')
36 |
37 | elif platform.system() == MAC_OS:
38 | userdata_path = '/Users/lllll/coding/chrome/chromedata{0}'.format(
39 | num)
40 | cache_path = '/Users/lllll/coding/chrome/cache{0}'.format(num)
41 | chrome_options.binary_location = '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'
42 | driver_path = '/usr/local/bin/chromedriver'
43 | chrome_options.add_argument('user-data-dir=' + userdata_path)
44 | chrome_options.add_argument('--disk-cache-dir=' + cache_path)
45 | chrome_options.add_argument('--no-sandbox')
46 | # chrome_options.add_argument('--headless')
47 | # chrome_options.add_argument('--disable-gpu')
48 | chrome_options.add_argument('--window-size=1200,1000')
49 | preferences_file = os.path.join(
50 | userdata_path, 'Default', 'Preferences')
51 | # selenium_log_file= '/Users/lllll/coding/chrome/logs/selenium.log'
52 |
53 | elif platform.system() == LINUX:
54 | userdata_path = '/data/oak/chrome/chromedata{0}'.format(num)
55 | cache_path = '/data/oak/chrome/cache{0}'.format(num)
56 | chrome_options.binary_location = '/usr/bin/google-chrome'
57 | driver_path = '/usr/bin/chromedriver'
58 | chrome_options.add_argument('user-data-dir=' + userdata_path)
59 | chrome_options.add_argument('--disk-cache-dir=' + cache_path)
60 | chrome_options.add_argument('--no-sandbox')
61 | chrome_options.add_argument('--headless')
62 | chrome_options.add_argument('--disable-gpu')
63 | chrome_options.add_argument('--window-size=1200,1000')
64 | preferences_file = os.path.join(
65 | userdata_path, 'Default', 'Preferences')
66 |
67 | else:
68 | print('Unknown OS. Exit')
69 | return None
70 |
71 | # if os.path.exists(preferences_file):
72 | # os.remove(preferences_file)
73 |
74 | # driver = webdriver.Chrome(executable_path=driver_path, chrome_options=chrome_options, service_log_path=selenium_log_file, service_args=["--verbose"])
75 | driver = webdriver.Chrome(
76 | executable_path=driver_path,
77 | chrome_options=chrome_options)
78 | driver.set_page_load_timeout(3 * 60)
79 | return driver
80 |
81 |
82 | def download_evozi(driver, num):
83 | try:
84 | gp_file = "GooglePlayRank_{num}.txt".format(num=num)
85 | gp_file_tmp = "GooglePlayRankTmp_{num}.txt".format(num=num)
86 | with open(gp_file) as f_in:
87 | pkg = f_in.readline().replace('\n', '').strip()
88 | # pkg = "adult.coloring.book.mandala.colorfy.coloring.free"
89 | url = "https://apps.evozi.com/apk-downloader/?id={pkg}"
90 | _url = url.format(pkg=pkg)
91 | driver.maximize_window()
92 | driver.get(_url)
93 | driver.find_element_by_class_name("btn-lg").click()
94 | time.sleep(5)
95 | down_link = driver.find_element_by_class_name("btn-success").get_attribute("href")
96 | if down_link == _url + "#":
97 | download_fail_file = "GooglePlayRank_fail_{num}.txt".format(num=num)
98 | with open(download_fail_file, 'a+', encoding='utf-8') as f:
99 | _pkg = pkg + "\n"
100 | f.write(_pkg)
101 | return
102 | print(down_link)
103 | apk_stream = requests.get(down_link, stream=True)
104 | file_name = pkg + '.apk'
105 | file_path = os.path.join(os.getcwd(), "apk", file_name)
106 | with open(file_path, 'wb') as f:
107 | for chunk in apk_stream.iter_content(chunk_size=512):
108 | if chunk:
109 | f.write(chunk)
110 | download_success_file = "GooglePlayRank_success_{num}.txt".format(num=num)
111 | with open(download_success_file, 'a+', encoding='utf-8') as f:
112 | _pkg = pkg + "\n"
113 | f.write(_pkg)
114 | return
115 | except Exception as e:
116 | print("download:", e)
117 | finally:
118 | with open(gp_file) as f_in:
119 | with open(gp_file_tmp, 'w') as f_tmp:
120 | for line in f_in.readlines():
121 | if pkg not in line:
122 | f_tmp.write(line)
123 | shutil.move(gp_file_tmp, gp_file)
124 | print("download:", pkg)
125 |
126 |
127 | def check_file(path):
128 | for file in os.listdir(path):
129 | if ".crdownload" in file:
130 | return False
131 | return True
132 |
133 |
134 | def download_apkpure(driver, num):
135 | try:
136 | gp_file = "GooglePlayRank_{num}.txt".format(num=num)
137 | gp_file_tmp = "GooglePlayRankTmp_{num}.txt".format(num=num)
138 | with open(gp_file) as f_in:
139 | pkg = f_in.readline().replace('\n', '').strip()
140 | url = "https://m.apkpure.com/cn/search?q={pkg}"
141 | _url = url.format(pkg=pkg)
142 | # _url = "https://m.apkpure.com/cn/search?q=com.whatsapp"
143 | driver.maximize_window()
144 | driver.get(_url)
145 | driver.find_element_by_class_name("dd").click()
146 | driver.find_element_by_class_name("da").click()
147 | down_url = driver.find_element_by_id("download_link").get_attribute("href")
148 | print(down_url)
149 | print("download start")
150 | time.sleep(10)
151 | download_dir = os.path.join(os.getcwd(), "apk")
152 | while True:
153 | if check_file(download_dir):
154 | break
155 | print("downloading")
156 | time.sleep(5)
157 | continue
158 | print("download success")
159 | download_success_file = "GooglePlayRank_success_{num}.txt".format(num=num)
160 | with open(download_success_file, 'a+', encoding='utf-8') as f:
161 | _pkg = pkg + "\n"
162 | f.write(_pkg)
163 | return
164 | except Exception as e:
165 | print("download:", e)
166 | finally:
167 | with open(gp_file) as f_in:
168 | with open(gp_file_tmp, 'w') as f_tmp:
169 | for line in f_in.readlines():
170 | if pkg not in line:
171 | f_tmp.write(line)
172 | shutil.move(gp_file_tmp, gp_file)
173 | print("download:", pkg)
174 |
175 |
176 | def run(num):
177 | driver = init_chrome_driver(num)
178 | try:
179 | if not driver:
180 | return False
181 | return download_apkpure(driver, num)
182 | except Exception as e:
183 | print("down error:", e)
184 | finally:
185 | if driver:
186 | driver.quit()
187 |
188 |
189 | def main(num):
190 | while True:
191 | print("start", num)
192 | run(num)
193 | time.sleep(1)
194 | print("done", num)
195 |
196 |
197 | if __name__ == "__main__":
198 | process_num = 4
199 | for i in range(process_num):
200 | p = Process(target=main, args=(i,))
201 | p.start()
202 | # main(0)
203 |
--------------------------------------------------------------------------------
/baidutieba/BDTBwithbs4.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import urllib2
4 |
5 | from bs4 import BeautifulSoup
6 |
7 |
8 | class BDTB:
9 |
10 | def __init__(self, baseurl, seeLZ, floorTag):
11 |
12 | self.baseurl = baseurl
13 |
14 | self.seeLZ = '?see_lz=' + str(seeLZ)
15 |
16 | self.file = None
17 |
18 | self.floor = 1
19 |
20 | self.floorTag = floorTag
21 |
22 | self.defaultTitle = u"百度贴吧"
23 |
24 | def getpage(self, pagenum):
25 |
26 | try:
27 |
28 | url = self.baseurl + self.seeLZ + '&pn=' + str(pagenum)
29 |
30 | request = urllib2.Request(url)
31 |
32 | response = urllib2.urlopen(request)
33 |
34 | page = BeautifulSoup(response, "html5lib")
35 |
36 | return page
37 |
38 | except urllib2.URLError, e:
39 |
40 | if hasattr(e, 'reason'):
41 |
42 | print u"连接百度贴吧失败,错误原因", e.reason
43 |
44 | return None
45 |
46 | def getTitle(self):
47 |
48 | page = self.getpage(1)
49 |
50 | tag = page.h3
51 |
52 | title = tag['title']
53 |
54 | print title
55 |
56 | return title
57 |
58 | def getPageNum(self):
59 |
60 | page = self.getpage(1)
61 |
62 | num = page.find_all(attrs={"class": "red"})
63 |
64 | pagenum = num[1].string
65 |
66 | return int(pagenum)
67 |
68 | def getcontent(self):
69 |
70 | pagenum = self.getPageNum() + 1
71 |
72 | contents = []
73 |
74 | for num in range(1, pagenum):
75 |
76 | page = self.getpage(num)
77 |
78 | num = page.find_all('cc')
79 |
80 | for item in num:
81 |
82 | content = item.get_text()
83 |
84 | contents.append(content.encode('utf-8'))
85 |
86 | return contents
87 |
88 | def getFileTitle(self):
89 |
90 | title = self.getTitle()
91 |
92 | if title is not None:
93 |
94 | self.file = open(title + ".txt", "w+")
95 |
96 | else:
97 |
98 | self.file = open(self.defaultTitle + ".txt", "w+")
99 |
100 | def writeData(self):
101 |
102 | contents = self.getcontent()
103 |
104 | for item in contents:
105 |
106 | if self.floorTag == '1':
107 |
108 | floorLine = '\n' + \
109 | str(self.floor) + \
110 | u'---------------------------------------------\n'
111 |
112 | self.file.write(floorLine)
113 |
114 | self.file.write(item)
115 |
116 | self.floor += 1
117 |
118 | def start(self):
119 |
120 | self.getFileTitle()
121 |
122 | pagenum = self.getPageNum()
123 |
124 | if pagenum == None:
125 |
126 | print "URL已失效,请重试"
127 |
128 | return
129 |
130 | try:
131 |
132 | print "该帖子共有" + str(pagenum) + "页"
133 |
134 | self.writeData()
135 |
136 | except IOError, e:
137 |
138 | print "写入异常,原因" + e.message
139 |
140 | finally:
141 |
142 | print "写入成功"
143 |
144 |
145 | print u"请输入帖子代号"
146 |
147 | baseurl = 'http://tieba.baidu.com/p/' + \
148 | str(raw_input(u'http://tieba.baidu.com/p/'))
149 |
150 | seeLZ = raw_input("是否只获取楼主发言,是输入1,否输入0\n")
151 |
152 | floorTag = raw_input("是否写入楼层信息,是输入1否输入0\n")
153 |
154 | bdtb = BDTB(baseurl, seeLZ, floorTag)
155 |
156 | bdtb.start()
157 |
--------------------------------------------------------------------------------
/dingdianxiaoshuo/dingdian/dingdian/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yyyy777/crawler/98f0c1a129b3b5b77fe88971f4f0c6aae5a8964f/dingdianxiaoshuo/dingdian/dingdian/__init__.py
--------------------------------------------------------------------------------
/dingdianxiaoshuo/dingdian/dingdian/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class DingdianItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 |
15 | # Novel title
16 | name = scrapy.Field()
17 | # Novel author
18 | author = scrapy.Field()
19 | # Novel URL
20 | novelurl = scrapy.Field()
21 | # Serialization status
22 | serialstatus = scrapy.Field()
23 | # Word count so far
24 | serialnumber = scrapy.Field()
25 | # Novel category
26 | category = scrapy.Field()
27 | # Novel id
28 | novel_id = scrapy.Field()
29 |
30 |
31 |
32 | class ContentItem(scrapy.Item):
33 |
34 | # Id of the novel this chapter belongs to
35 | novel_cont_id = scrapy.Field()
36 | # Chapter content
37 | chaptercontent = scrapy.Field()
38 | # Chapter order, used to keep chapters correctly sorted
39 | num = scrapy.Field()
40 | # Chapter URL
41 | chapterurl = scrapy.Field()
42 | # Chapter title
43 | chaptername = scrapy.Field()
--------------------------------------------------------------------------------
/dingdianxiaoshuo/dingdian/dingdian/mysqlpipelines/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yyyy777/crawler/98f0c1a129b3b5b77fe88971f4f0c6aae5a8964f/dingdianxiaoshuo/dingdian/dingdian/mysqlpipelines/__init__.py
--------------------------------------------------------------------------------
/dingdianxiaoshuo/dingdian/dingdian/mysqlpipelines/models.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | from sqlalchemy import Column, String, Integer,Text
4 | from sqlalchemy.ext.declarative import declarative_base
5 |
6 | Base = declarative_base()
7 |
8 | class Novel(Base):
9 |
10 | __tablename__ = 'novel'
11 |
12 | id = Column(Integer, primary_key=True)
13 | name = Column(String(255))
14 | author = Column(String(255))
15 | category = Column(String(255))
16 | novel_id = Column(Integer)
17 |
18 | class Content(Base):
19 |
20 | __tablename__ = 'content'
21 |
22 | id = Column(Integer, primary_key=True)
23 | chapter_name = Column(String(255))
24 | chapter_content = Column(Text)
25 | content_id = Column(Integer)
26 | num_id = Column(Integer)
27 | url = Column(String(255))
--------------------------------------------------------------------------------
/dingdianxiaoshuo/dingdian/dingdian/mysqlpipelines/mypipelines.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 |
4 |
5 | from ..items import DingdianItem, ContentItem
6 | from .mysqldb import MysqlDB
7 |
8 |
9 | class MyDingdianPipeline(object):
10 |
11 | def process_item(self, item, spider):
12 |
13 | if isinstance(item, DingdianItem):
14 | novel_id = item['novel_id']
15 | name = item['name']
16 | ret = MysqlDB.select_name(novel_id)
17 | if ret:
18 | print('{0}已经存入数据库了'.format(name))
19 | pass
20 | else:
21 | author = item['author']
22 | category = item['category']
23 | MysqlDB.insert_to_db(name, author, category, novel_id)
24 | print('存入数据库:{0}'.format(name))
25 |
26 | if isinstance(item, ContentItem):
27 |
28 | url = item['chapterurl']
29 | content_id = item['novel_cont_id']
30 | num_id = item['num']
31 | chaptername = item['chaptername']
32 | chapter_content = item['chaptercontent']
33 |
34 | ret = MysqlDB.select_chapter(url)
35 | if ret:
36 | print('已经存入数据库了')
37 | else:
38 | MysqlDB.insert_chapter(chaptername, chapter_content, content_id, num_id, url)
39 | print('小说内容存储完毕')
40 |
41 | return item
42 |
--------------------------------------------------------------------------------
/dingdianxiaoshuo/dingdian/dingdian/mysqlpipelines/mysqldb.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | from sqlalchemy import create_engine
4 | from sqlalchemy.orm import sessionmaker
5 | from .models import Base, Novel, Content
6 |
7 |
8 |
9 |
10 | engine = create_engine(
11 | 'mysql+mysqlconnector://root:123456789a@localhost:3306/dingdian')
12 |
13 | # Create the tables declared in models.py if they do not already exist. The Base
14 | # imported from models.py must be used: a locally created declarative_base() would
15 | # carry empty metadata and create nothing. Dropping tables here would wipe crawled data.
16 | Base.metadata.create_all(bind=engine)
17 |
18 |
19 | DBSession = sessionmaker(bind=engine)
20 | session = DBSession()
21 |
22 |
23 | class MysqlDB:
24 |
25 | def __init__(self):
26 | pass
27 |
28 | @classmethod
29 | def insert_to_db(cls, name, author, category, novel_id):
30 |
31 | new_novel = Novel(
32 | name=name,
33 | author=author,
34 | category=category,
35 | novel_id=novel_id)
36 | session.add(new_novel)
37 | session.commit()
38 | session.close()
39 |
40 | @classmethod
41 | def select_name(cls, novel_id):
42 |
43 | isnovel = session.query(Novel).filter(Novel.novel_id == novel_id).first()  # .first() returns None instead of raising when no row matches
44 |
45 | if isnovel:
46 | return True
47 | return False
48 |
49 | @classmethod
50 | def insert_chapter(cls, chaptername, chapter_content, content_id, num_id, url):
51 |
52 | new_content = Content(
53 | chapter_name=chaptername,  # the model column is chapter_name
54 | chapter_content=chapter_content,
55 | content_id=content_id,
56 | num_id=num_id,
57 | url=url)
58 |
59 | session.add(new_content)
60 | session.commit()
61 |
62 | @classmethod
63 | def select_chapter(cls, url):
64 |
65 | iscontent = session.query(Content).filter(Content.url == url).first()
66 |
67 | if iscontent:
68 | return True
69 | return False
70 |
71 |
--------------------------------------------------------------------------------
/dingdianxiaoshuo/dingdian/dingdian/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class DingdianPipeline(object):
10 |
11 | def process_item(self, item, spider):
12 | return item
13 |
--------------------------------------------------------------------------------
/dingdianxiaoshuo/dingdian/dingdian/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for dingdian project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'dingdian'
13 |
14 | SPIDER_MODULES = ['dingdian.spiders']
15 | NEWSPIDER_MODULE = 'dingdian.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'dingdian (+http://www.yourdomain.com)'
20 |
21 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
22 | #CONCURRENT_REQUESTS=32
23 |
24 | # Configure a delay for requests for the same website (default: 0)
25 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
26 | # See also autothrottle settings and docs
27 | #DOWNLOAD_DELAY=3
28 | # The download delay setting will honor only one of:
29 | #CONCURRENT_REQUESTS_PER_DOMAIN=16
30 | #CONCURRENT_REQUESTS_PER_IP=16
31 |
32 | # Disable cookies (enabled by default)
33 | #COOKIES_ENABLED=False
34 |
35 | # Disable Telnet Console (enabled by default)
36 | #TELNETCONSOLE_ENABLED=False
37 |
38 | # Override the default request headers:
39 | #DEFAULT_REQUEST_HEADERS = {
40 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
41 | # 'Accept-Language': 'en',
42 | #}
43 |
44 | # Enable or disable spider middlewares
45 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
46 | #SPIDER_MIDDLEWARES = {
47 | # 'dingdian.middlewares.MyCustomSpiderMiddleware': 543,
48 | #}
49 |
50 | # Enable or disable downloader middlewares
51 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
52 | #DOWNLOADER_MIDDLEWARES = {
53 | # 'dingdian.middlewares.MyCustomDownloaderMiddleware': 543,
54 | #}
55 |
56 | # Enable or disable extensions
57 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
58 | #EXTENSIONS = {
59 | # 'scrapy.telnet.TelnetConsole': None,
60 | #}
61 |
62 | # Configure item pipelines
63 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
64 | ITEM_PIPELINES = {
65 | # 'dingdian.pipelines.SomePipeline': 300,
66 | 'dingdian.mysqlpipelines.mypipelines.MyDingdianPipeline': 1,
67 | }
68 |
69 | # Enable and configure the AutoThrottle extension (disabled by default)
70 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
71 | # NOTE: AutoThrottle will honour the standard settings for concurrency and delay
72 | #AUTOTHROTTLE_ENABLED=True
73 | # The initial download delay
74 | #AUTOTHROTTLE_START_DELAY=5
75 | # The maximum download delay to be set in case of high latencies
76 | #AUTOTHROTTLE_MAX_DELAY=60
77 | # Enable showing throttling stats for every response received:
78 | #AUTOTHROTTLE_DEBUG=False
79 |
80 | # Enable and configure HTTP caching (disabled by default)
81 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
82 | HTTPCACHE_ENABLED=True
83 | HTTPCACHE_EXPIRATION_SECS=0
84 | HTTPCACHE_DIR='httpcache'
85 | HTTPCACHE_IGNORE_HTTP_CODES=[]
86 | HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'
87 |
--------------------------------------------------------------------------------
/dingdianxiaoshuo/dingdian/dingdian/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/dingdianxiaoshuo/dingdian/dingdian/spiders/spider_dingdian.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import re
4 | import scrapy
5 | from bs4 import BeautifulSoup
6 | from scrapy.http import Request
7 | from ..items import DingdianItem, ContentItem
8 |
9 |
10 | class Myspider(scrapy.Spider):
11 |
12 | name = 'dingdian'
13 | allowed_domains = ['www.23us.com']
14 | base_url = 'http://www.23us.com/class/'
15 | end_Url = '.html'
16 |
17 | def start_requests(self):
18 |
19 | for i in range(1, 11):
20 | url = self.base_url + str(i) + '_1' + self.end_Url
21 | yield Request(url, self.parse) # links to each novel-category listing
22 |
23 | yield Request('http://www.23us.com/quanben/1', self.parse) # link to the completed-novels listing
24 |
25 | def parse(self, response):
26 |
27 | max_num = BeautifulSoup(response.text, 'lxml').find(
28 | 'div', class_='pagelink').find_all('a')[-1].get_text()
29 | baseurl = str(response.url)[:27]
30 | for num in range(1, int(max_num) + 1):
31 | if baseurl == 'http://www.23us.com/quanben':
32 | url = baseurl + '/' + str(num)
33 | else:
34 | url = baseurl + '_' + str(num) + self.end_Url
35 | yield Request(url, callback=self.get_name)
36 |
37 | def get_name(self, response):
38 |
39 | tds = BeautifulSoup(
40 | response.text,
41 | 'lxml').find_all(
42 | 'tr',
43 | bgcolor="#FFFFFF")
44 | for td in tds:
45 | novelname = td.find_all('a')[1].get_text()
46 | novelIntroductionUrl = td.find('a')['href']
47 | yield Request(novelIntroductionUrl, callback=self.get_chapterurl, meta={'name': novelname,
48 | 'url': novelIntroductionUrl})
49 |
50 | def get_chapterurl(self, response):
51 |
52 | resp = BeautifulSoup(response.text, 'lxml')
53 | item = DingdianItem()
54 | tds = resp.find('table').find_all('td')
55 |
56 | category = resp.find('table').find('a').get_text()
57 | author = tds[1].get_text()
58 | base_url = resp.find(
59 | 'p', class_='btnlinks').find(
60 | 'a', class_='read')['href']
61 | novel_id = str(base_url)[-6:-1].replace('/', '')
62 | serialstatus = tds[2].get_text()
63 | serialnumber = tds[4].get_text()
64 |
65 | item['name'] = str(response.meta['name']).replace('\xa0', '')
66 | item['novelurl'] = response.meta['url']
67 | item['category'] = str(category).replace('/', '')
68 | item['author'] = str(author).replace('\xa0', '')
69 | item['novel_id'] = novel_id
70 | item['serialstatus'] = str(serialstatus).replace('\xa0', '')
71 | item['serialnumber'] = str(serialnumber).replace('\xa0', '')
72 |
73 | yield item
74 | yield Request(url=base_url, callback=self.get_chapter, meta={'novel_id': novel_id})
75 |
76 | def get_chapter(self, response):
77 |
78 | urls = re.findall( # (relative chapter href, chapter title) pairs from the chapter table
79 | r'<td class="L"><a href="(.*?)">(.*?)</a></td>',
80 | response.text)
81 | num = 0
82 | for url in urls:
83 | num += 1
84 | chapterurl = response.url + url[0]
85 | chaptername = url[1]
86 | yield Request(chapterurl, callback=self.get_chaptercontent, meta={'num': num,
87 | 'novel_id': response.meta['novel_id'],
88 | 'chaptername': chaptername,
89 | 'chapterurl': chapterurl
90 | })
91 |
92 | def get_chaptercontent(self, response):
93 |
94 | item = ContentItem() # chapter fields are defined on ContentItem, not DingdianItem
95 | item['num'] = response.meta['num']
96 | item['novel_cont_id'] = response.meta['novel_id']
97 | item['chapterurl'] = response.meta['chapterurl']
98 | item['chaptername'] = str(
99 | response.meta['chaptername']).replace(
100 | '\xa0', '')
101 | content = BeautifulSoup(response.text, 'lxml').find('dd', id='contents').get_text()
102 | item['chaptercontent'] = str(content).replace('\xa0', '')
103 | return item
104 |
--------------------------------------------------------------------------------
/dingdianxiaoshuo/dingdian/entrypoint.py:
--------------------------------------------------------------------------------
1 | '''
2 | Lets the scrapy spider be run and debugged from PyCharm
3 | '''
4 |
5 | from scrapy.cmdline import execute
6 | execute(['scrapy', 'crawl', 'dingdian'])
7 |
--------------------------------------------------------------------------------
/dingdianxiaoshuo/dingdian/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = dingdian.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = dingdian
12 |
--------------------------------------------------------------------------------
/gpcrawler/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yyyy777/crawler/98f0c1a129b3b5b77fe88971f4f0c6aae5a8964f/gpcrawler/__init__.py
--------------------------------------------------------------------------------
/gpcrawler/entrypoint.py:
--------------------------------------------------------------------------------
1 | from scrapy.cmdline import execute
2 | execute(['scrapy', 'crawl', 'googleplay'])
--------------------------------------------------------------------------------
/gpcrawler/gpcrawler/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yyyy777/crawler/98f0c1a129b3b5b77fe88971f4f0c6aae5a8964f/gpcrawler/gpcrawler/__init__.py
--------------------------------------------------------------------------------
/gpcrawler/gpcrawler/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class GooglePlayItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | pkg = scrapy.Field()
15 | category = scrapy.Field()
16 | # down_min = scrapy.Field()
17 | # down_max = scrapy.Field()
18 |
--------------------------------------------------------------------------------
/gpcrawler/gpcrawler/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class GpcrawlerSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/gpcrawler/gpcrawler/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | import os
9 | file = os.path.join(os.path.dirname(os.getcwd()), 'GooglePlayRank.txt')
10 | import pymongo
11 |
12 |
13 | class GooglePlayPipeline(object):
14 | def process_item(self, item, spider):
15 | if not item:
16 | return
17 | with open(file, 'a+', encoding='utf-8') as f:
18 | pkg = item['pkg']
19 | f.seek(0)  # 'a+' opens positioned at EOF; rewind or the duplicate check reads nothing
20 | for line in f.readlines():
21 | if pkg in line: return item
22 | # catagory = item['catagory']
23 | # down_max = item['down_max']
24 | # down_min = item['down_min']
25 | # line = pkg + ' ' + catagory + ' ' + down_max + ' ' + down_min + '\n'
26 | line = pkg + '\n'
27 | f.write(line)
28 | return item
29 |
30 |
31 | class MongoPipeline(object):
32 | collection_name = 'GooglePlayApp'
33 |
34 | def __init__(self, mongo_uri, mongo_db):
35 | self.mongo_uri = mongo_uri
36 | self.mongo_db = mongo_db
37 |
38 | @classmethod
39 | def from_crawler(cls, crawler):
40 | return cls(
41 | mongo_uri=crawler.settings.get('MONGO_URI'),
42 | mongo_db=crawler.settings.get('MONGO_DATABASE')
43 | )
44 |
45 | def open_spider(self, spider):
46 | self.client = pymongo.MongoClient(self.mongo_uri)
47 | self.db = self.client[self.mongo_db]
48 |
49 | def close_spider(self, spider):
50 | self.client.close()
51 |
52 | def process_item(self, item, spider):
53 | self.db[self.collection_name].update(
54 | {'pkg': item['pkg']}, dict(item), True)
55 | return item
56 |
--------------------------------------------------------------------------------
/gpcrawler/gpcrawler/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for gpcrawler project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'gpcrawler'
13 |
14 | SPIDER_MODULES = ['gpcrawler.spiders']
15 | NEWSPIDER_MODULE = 'gpcrawler.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'gpcrawler (+http://www.yourdomain.com)'
20 |
21 | # Obey robots.txt rules
22 | ROBOTSTXT_OBEY = False
23 |
24 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
25 | #CONCURRENT_REQUESTS = 32
26 |
27 | # Configure a delay for requests for the same website (default: 0)
28 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
29 | # See also autothrottle settings and docs
30 | #DOWNLOAD_DELAY = 3
31 | # The download delay setting will honor only one of:
32 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33 | #CONCURRENT_REQUESTS_PER_IP = 16
34 |
35 | # Disable cookies (enabled by default)
36 | #COOKIES_ENABLED = False
37 |
38 | # Disable Telnet Console (enabled by default)
39 | #TELNETCONSOLE_ENABLED = False
40 |
41 | # Override the default request headers:
42 | #DEFAULT_REQUEST_HEADERS = {
43 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44 | # 'Accept-Language': 'en',
45 | #}
46 |
47 | # Enable or disable spider middlewares
48 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
49 | #SPIDER_MIDDLEWARES = {
50 | # 'gpcrawler.middlewares.GpcrawlerSpiderMiddleware': 543,
51 | #}
52 |
53 | # Enable or disable downloader middlewares
54 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
55 | #DOWNLOADER_MIDDLEWARES = {
56 | # 'gpcrawler.middlewares.MyCustomDownloaderMiddleware': 543,
57 | #}
58 |
59 | # Enable or disable extensions
60 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
61 | #EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | #}
64 |
65 | # Configure item pipelines
66 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'gpcrawler.pipelines.MongoPipeline': 100,
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | #AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | #AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | #AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | #AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | # HTTPCACHE_ENABLED = True
87 | # HTTPCACHE_EXPIRATION_SECS = 0
88 | # HTTPCACHE_DIR = 'httpcache'
89 | # HTTPCACHE_IGNORE_HTTP_CODES = []
90 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
92 | MONGO_URI = 'localhost'
93 | MONGO_DATABASE = 'AppStoreCrawler'
94 |
--------------------------------------------------------------------------------
/gpcrawler/gpcrawler/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/gpcrawler/gpcrawler/spiders/crawler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- encoding: utf-8 -*-
3 | from scrapy import Request, Spider
4 | from scrapy.spiders import CrawlSpider, Rule
5 | from scrapy.linkextractors import LinkExtractor
6 | from ..items import GooglePlayItem
7 |
8 | # https://play\.google\.com/store/apps/details\?id=\S+
9 | # from gpcrawler.gpcrawler.items import GooglePlayItem
10 |
11 |
12 | def parse_pkg(url):
13 | params = url.split('?')[-1]
14 | for item in params.split('&'):
15 | val = item.split('=')
16 | if val[0] == 'id':
17 | return val[1]
18 | return
19 | # return url.split('id=')[-1]
20 |
21 |
22 | class GooglePlaySpider(CrawlSpider):
23 | name = 'googleplay'
24 | allowed_domains = ['play.google.com']
25 | start_urls = ['https://play.google.com/store/apps', 'https://play.google.com/store/apps/details?id=com.ksmobile.launcher']
26 | rules = [
27 |         Rule(LinkExtractor(allow=(r"https://play\.google\.com/store/apps/details",)), callback='parse_item', follow=True),
28 | ]
29 |
30 | # start_urls = ["https://play.google.com/store/apps"]
31 | # rules = (
32 | # Rule(LinkExtractor(allow=('/store/apps',)), follow=True),
33 | # Rule(LinkExtractor(allow=('/store/apps/details\?')), follow=True, callback='parse_item')
34 | # )
35 |
36 | # def parse(self, response):
37 | # '''Parse all categories apps'''
38 | # hrefs = response.css('.child-submenu-link::attr(href)').extract()
39 | # for href in hrefs:
40 | # yield Request(
41 | # response.urljoin(href),
42 | # callback=self.parse_category,
43 | # )
44 | #
45 | # def parse_category(self, response):
46 | # '''Parse specific category apps'''
47 | # hrefs = response.css('.single-title-link > a::attr(href)').extract()
48 | # for href in hrefs:
49 | # yield Request(
50 | # response.urljoin(href),
51 | # callback=self.parse_apps,
52 | # )
53 | #
54 | # def parse_apps(self, response):
55 | # '''Parse a list of apps'''
56 | # hrefs = response.css('a[class="title"]::attr(href)').extract()
57 | # for href in hrefs:
58 | # yield Request(
59 | # response.urljoin(href),
60 | # callback=self.parse_item,
61 | # )
62 |
63 | def parse_item(self, response):
64 | print(response.url)
65 | pkg = parse_pkg(response.url)
66 | if not pkg:
67 | return
68 | # from gpcrawler.gpcrawler.items import GooglePlayItem
69 | item = GooglePlayItem()
70 | item['pkg'] = pkg
71 |         # the category slug is the tail of the breadcrumb link; this absolute XPath is tied to the current Play Store markup
72 |         item['category'] = response.xpath(
73 |             '//*[@id="fcxH9b"]/div[4]/c-wiz/div/div[2]/div/div[1]/div/c-wiz[1]/c-wiz[1]/div/div[2]/div/div[1]/div[1]/div[1]/div[1]/span[2]/a/@href').extract()[0].split('/')[-1].lower()
74 |
75 | if "game" in item["category"]:
76 | return
77 |
78 | # down_num = response.xpath("//div[@itemprop='numDownloads']").xpath("text()").extract()[0].strip().split('-')
79 | # if len(down_num) != 2:
80 | # return
81 | # item['down_min'] = str(down_num[0].strip().replace(',', ''))
82 | # item['down_max'] = str(down_num[1].strip().replace(',', ''))
83 | # if not item['down_min'] or not item['down_max']:
84 | # return
85 | return item
86 |
--------------------------------------------------------------------------------
/gpcrawler/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = gpcrawler.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = gpcrawler
12 |
--------------------------------------------------------------------------------
/gpcrawler/trans_txt.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- encoding: utf-8 -*-
3 |
4 | import pymongo
5 | import os
6 |
7 | MONGO_URI = 'localhost'
8 | MONGO_DATABASE = 'AppStoreCrawler'
9 | COLL_NAME = 'GooglePlayApp'
10 |
11 | client = pymongo.MongoClient(MONGO_URI)
12 | coll = client[MONGO_DATABASE][COLL_NAME]
13 |
14 |
15 | file = os.path.join(os.path.dirname(os.getcwd()), 'GooglePlayRank.txt')
16 |
17 |
18 | def trans():
19 | with open(file, 'a+', encoding='utf-8') as f:
20 | for item in coll.find():
21 | line = item['pkg'] + '\n'
22 | f.write(line)
23 |
24 |
25 | if __name__ == '__main__':
26 | trans()
27 |
--------------------------------------------------------------------------------
/gpprivacy/gp_privacy_crawler.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- encoding: utf-8 -*-
3 |
4 | import platform
5 | import os
6 |
7 | from selenium import webdriver
8 | from selenium.webdriver.common.by import By
9 | from selenium.webdriver.support.ui import WebDriverWait
10 | from selenium.webdriver.support import expected_conditions as EC
11 | from selenium.webdriver.chrome.options import Options
12 |
13 | WINDOWS = 'Windows'
14 | MAC_OS = 'Darwin'
15 | LINUX = 'Linux'
16 |
17 |
18 | def _init_chrome_driver(num):
19 | chrome_options = Options()
20 | if platform.system() == WINDOWS:
21 |         userdata_path = r'D:\chrome\chromedata{0}'.format(num)
22 |         cache_path = r'D:\chrome\cache{0}'.format(num)
23 |         chrome_options.binary_location = r'C:\Program Files (x86)\Google\Chrome\Application\chrome.exe'
24 |         driver_path = r'C:\Program Files (x86)\chromedriver_win32\chromedriver'
25 | chrome_options.add_argument('user-data-dir=' + userdata_path)
26 | chrome_options.add_argument('--disk-cache-dir=' + cache_path)
27 | preferences_file = os.path.join(
28 | userdata_path, 'Default', 'Preferences')
29 |
30 | elif platform.system() == MAC_OS:
31 | userdata_path = '/Users/lllll/coding/chrome/chromedata{0}'.format(num)
32 | cache_path = '/Users/lllll/coding/chrome/cache{0}'.format(num)
33 | chrome_options.binary_location = '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'
34 | driver_path = '/usr/local/bin/chromedriver'
35 | chrome_options.add_argument('user-data-dir=' + userdata_path)
36 | chrome_options.add_argument('--disk-cache-dir=' + cache_path)
37 | # chrome_options.add_argument('--headless')
38 | # chrome_options.add_argument('--disable-gpu')
39 | # chrome_options.add_argument('--start-maximized')
40 | # chrome_options.add_argument('--window-size=1200x1000')
41 | preferences_file = os.path.join(
42 | userdata_path, 'Default', 'Preferences')
43 | #selenium_log_file= '/Users/lllll/coding/chrome/logs/selenium.log'
44 |
45 | elif platform.system() == LINUX:
46 | userdata_path = '/data/oak/chrome/chromedata{0}'.format(num)
47 | cache_path = '/data/oak/chrome/cache{0}'.format(num)
48 | chrome_options.binary_location = '/usr/bin/google-chrome'
49 | driver_path = '/usr/bin/chromedriver'
50 | chrome_options.add_argument('user-data-dir=' + userdata_path)
51 | chrome_options.add_argument('--disk-cache-dir=' + cache_path)
52 | # chrome_options.add_argument('--no-sandbox')
53 | # chrome_options.add_argument('--headless')
54 | # chrome_options.add_argument('--disable-gpu')
55 | # chrome_options.add_argument('--window-size=1200x1000')
56 | preferences_file = os.path.join(
57 | userdata_path, 'Default', 'Preferences')
58 |
59 | else:
60 | print('Unknown OS. Exit')
61 | return None
62 |
63 | drop_content = ['cookies', 'Cookies-journal']
64 |
65 | if os.path.exists(preferences_file):
66 | os.remove(preferences_file)
67 |
68 | for content in drop_content:
69 | cookie_path = os.path.join(userdata_path, 'Default', content)
70 | if os.path.exists(cookie_path):
71 | os.remove(cookie_path)
72 |
73 | # if os.path.exists(cache_path):
74 | # shutil.rmtree(cache_path)
75 | # os.mkdir(cache_path)
76 |
77 | # driver = webdriver.Chrome(executable_path=driver_path, chrome_options=chrome_options, service_log_path=selenium_log_file, service_args=["--verbose"])
78 | driver = webdriver.Chrome(
79 | executable_path=driver_path,
80 | chrome_options=chrome_options)
81 | driver.set_page_load_timeout(3 * 60)
82 | # driver.delete_all_cookies()
83 | return driver
84 |
85 |
86 | def get_privacy(driver, package_name):
87 | try:
88 | url = "https://play.google.com/store/apps/details?id={0}&hl=en".format(package_name)
89 | driver.get(url)
90 | driver.maximize_window()
91 | driver.find_element_by_link_text("View details").click()
92 | tmp = (By.CLASS_NAME, "fnLizd")
93 | WebDriverWait(driver, 20).until(EC.presence_of_element_located(tmp))
94 | page_source = driver.page_source
95 | if "send SMS messages" in page_source:
96 | print("找到含有SMS权限的APP: {0}".format(package_name))
97 | with open("privacy_with_sms.txt", "a+") as f:
98 | f.write(package_name + "\n")
99 | return package_name
100 | return False
101 | except Exception as e:
102 | print(e)
103 | return False
104 |
105 |
106 | def main():
107 | try:
108 | with open("GooglePlayRank2.txt") as f:
109 | lines = f.readlines()
110 | for line in lines:
111 | driver = _init_chrome_driver(0)
112 | get_privacy(driver, line.strip())
113 | driver.quit()
114 | except Exception as e:
115 | print(e)
116 | finally:
117 | try:
118 | driver.quit()
119 | except UnboundLocalError:
120 | pass
121 |
122 |
123 | if __name__ == "__main__":
124 | print("start")
125 | # driver = _init_chrome_driver(0)
126 | # pack = get_privacy(driver, "com.halfbrick.fruitninjafree")
127 | # driver.quit()
128 | # pack = get_privacy(driver, "com.magnet.torrent.cat")
129 | main()
130 | print("done")
131 |
--------------------------------------------------------------------------------
/gpprivacy/privacy_with_sms.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yyyy777/crawler/98f0c1a129b3b5b77fe88971f4f0c6aae5a8964f/gpprivacy/privacy_with_sms.txt
--------------------------------------------------------------------------------
/huaban/huaban.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import time
4 |
5 | from bs4 import BeautifulSoup
6 |
7 | from selenium import webdriver
8 |
9 | from urllib.request import urlretrieve
10 |
11 |
12 | class crawl_huaban:
13 |
14 | def __init__(self,url):
15 |
16 | self.url = url
17 |
18 | def getHtml(self, url):
19 |
20 | driver = webdriver.PhantomJS()
21 |
22 | driver.get(url)
23 |
24 | driver.implicitly_wait(3)
25 |
26 | resp = BeautifulSoup(driver.page_source, 'html5lib')
27 |
28 | driver.quit()
29 |
30 | return resp
31 |
32 | def getPage(self):
33 |
34 | driver = webdriver.PhantomJS()
35 |
36 | driver.get(self.url)
37 |
38 | driver.implicitly_wait(3)
39 |
40 | resp = BeautifulSoup(driver.page_source, 'html5lib')
41 |
42 | driver.quit()
43 |
44 | return resp
45 |
46 | def getImage(self):
47 |
48 | resp = self.getPage()
49 |
50 | pins_ids = []
51 |
52 | pins = resp.find_all("a", class_="img x layer-view loaded")
53 |
54 | for pin in pins:
55 |
56 | pins_ids.append(pin.get('href'))
57 |
58 | pins_ids = pins_ids[2:]
59 |
60 | total = 1
61 |
62 | for pinid in pins_ids:
63 |
64 | print('第{0}张照片'.format(total))
65 |
66 | img_url = 'http://huaban.com%s' %(pinid)
67 |
68 | img_html = self.getHtml(img_url)
69 |
70 | img_hold = img_html.find("div", class_="image-holder")
71 |
72 |
73 | img_src = img_hold.find("img").get("src")
74 |
75 | #print(img_url)
76 | #print(img_hold)
77 | #print(img_src)
78 |
79 | img_src_url = 'http:%s' % img_src
80 |
81 | #print(img_src_url)
82 |
83 | try:
84 |
85 |                 urlretrieve(img_src_url, '%s.jpg' % pinid.strip('/').replace('/', '_'))
86 |
87 | print("获取图片:%s成功!" % img_src_url)
88 |
89 | except:
90 |
91 | print("获取图片:%s失败,跳过,获取下一张!" % img_src_url)
92 |
93 | total += 1
94 |
95 | print("获取图片完毕")
96 |
97 | if __name__ == '__main__':
98 |
99 | #url = 'http://huaban.com/search/?q=%E7%BE%8E%E8%85%BF'
100 |
101 | for i in range(1,11):
102 |
103 | print('第{0}页'.format(i))
104 |
105 | url = 'http://huaban.com/search/?q=%E7%BE%8E%E8%85%BF&izxnwygj&page={0}&per_page=20&wfl=1'.format(i)
106 |
107 | crawler = crawl_huaban(url)
108 |
109 |         start = time.perf_counter()
110 |
111 | crawler.getImage()
112 |
113 |         end = time.perf_counter()
114 |
115 | print('总共用时:%03f seconds\n\n' %(end-start))
116 |
--------------------------------------------------------------------------------
/liaoxuefengpdf/liaoxuefeng_pdf.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import os
3 |
4 | import re
5 |
6 | import time
7 |
8 | import logging
9 |
10 | import pdfkit
11 |
12 | import requests
13 |
14 | from bs4 import BeautifulSoup
15 |
16 |
17 | html_template = """
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 | {content}
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 | """
40 |
41 |
42 | def parse_url_to_html(url, name):
43 |
44 | try:
45 |
46 | response = requests.get(url)
47 |
48 | soup = BeautifulSoup(response.content, 'html.parser')
49 |
50 | body = soup.find_all(class_="x-wiki-content")[0]
51 |
52 | title = soup.find('h4').get_text()
53 |
54 | center_tag = soup.new_tag("center")
55 |
56 | title_tag = soup.new_tag('h1')
57 |
58 | title_tag.string = title
59 |
60 | center_tag.insert(1, title_tag)
61 |
62 | body.insert(1, center_tag)
63 |
64 | html = str(body)
65 |
66 | pattern = "( (.*?) 0:
49 | time.sleep(10)
50 | print(u'获取网页出错,10S后将获取倒数第:', num_retries, u'次')
51 | num_retries -= 1
52 | return self.get(url, timeout, num_retries)
53 | else:
54 | print(u'开始使用代理')
55 | time.sleep(10)
56 | IP = ''.join(str(random.choice(self.iplist)).strip())
57 | proxy = {'http': IP}
58 | return self.get(url, timeout, proxy)
59 | else:
60 | try:
61 | IP = ''.join(str(random.choice(self.iplist)).strip())
62 | print(IP)
63 | proxy = {'http': IP}
64 | print(proxy)
65 | return requests.get(
66 |                     url, headers=headers, proxies=proxy, timeout=timeout)
67 | except:
68 | if num_retries > 0:
69 | time.sleep(10)
70 |
71 | IP = ''.join(str(random.choice(self.iplist)).strip())
72 | proxy = {'http': IP}
73 |
74 | print(u'正在更换代理,10S后将重新获取倒数第', num_retries, u'次')
75 | print(u'当前代理是:', proxy)
76 | return self.get(url, timeout, proxy, num_retries - 1)
77 |
78 | else:
79 | print(u'代理不能使用!取消代理')
80 | return self.get(url, 3)
81 |
--------------------------------------------------------------------------------
/meizitu/getAllPageToQueue.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | from bs4 import BeautifulSoup
4 | from download import download
5 | from crawler_queue import MongoQueue
6 | from config import *
7 |
8 | spider_queue = MongoQueue(MONGO_DB, MONGO_QUEUE_TABLE)
9 | down = download()
10 |
11 | def start(url):
12 | resp = down.get(url, 3)
13 | soup = BeautifulSoup(resp.text, 'lxml')
14 | all_a = soup.find('div', class_='all').find_all('a')
15 | for a in all_a:
16 | title = a.get_text()
17 | url = a['href']
18 | print('写入URL:{0}到队列中'.format(url))
19 | spider_queue.push(url, title)
20 | print('URL已经全部写入队列')
21 |
22 | if __name__ == '__main__':
23 | start('http://www.mzitu.com/all')
24 |
--------------------------------------------------------------------------------
/meizitu/spider_meizitu.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import datetime
4 | import os
5 | from bs4 import BeautifulSoup
6 | from hashlib import md5
7 | from download import download
8 | from pymongo import MongoClient
9 | from config import *
10 |
11 | headers = {
12 | 'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1"}
13 |
14 |
15 | class meizitu(download):
16 |
17 | def __init__(self, url):
18 | super(meizitu, self).__init__()
19 | self.url = url
20 | client = MongoClient(MONGO_URL)
21 | db = client[MONGO_DB]
22 | self.meizitu_collection = db[MONGO_TABLE]
23 | self.meizititle = ''
24 | self.meiziurl = ''
25 | self.meiziimg_urls = []
26 |
27 | def html(self, href):
28 | html = download.get(self, href, 3)
29 | max_span = BeautifulSoup(html.text, 'lxml').find(
30 | 'div', class_='pagenavi').find_all('span')[-2].get_text()
31 | page_num = 0
32 |
33 | for page in range(1, int(max_span) + 1):
34 | page_num += 1
35 | page_url = href + '/' + str(page)
36 | self.img(page_url, max_span, page_num)
37 |
38 | def img(self, page_url, max_span, page_num):
39 | img_html = download.get(self, page_url, 3)
40 | img_url = BeautifulSoup(
41 | img_html.text, 'lxml').find(
42 | 'div', class_='main-image').find('img')['src']
43 | self.meiziimg_urls.append(img_url)
44 | if int(max_span) == page_num:
45 | self.save(img_url)
46 | post = {
47 | '标题': self.meizititle,
48 | '专题地址': self.meiziurl,
49 | '图片地址': self.meiziimg_urls,
50 | '爬取时间': datetime.datetime.now()
51 | }
52 | self.meizitu_collection.save(post)
53 | print(u'插入数据库成功')
54 | else:
55 | self.save(img_url)
56 |
57 | def save(self, img_url):
58 | name = md5(str(img_url).encode(encoding='utf-8')).hexdigest()
59 | img = download.get(self, img_url, 3)
60 | print('正在下载:{0}'.format(img_url))
61 | with open(str(name) + '.jpg', 'ab')as f:
62 | f.write(img.content)
63 |
64 | def mkdir(self, path):
65 | path = path.strip()
66 |         isExists = os.path.exists(os.path.join(cwd_path, path))
67 |         if not isExists:
68 | print(u'新建文件夹:{0}'.format(path))
69 | os.mkdir(os.path.join(cwd_path, path))
70 | os.chdir(os.path.join(cwd_path, path))
71 | return True
72 | else:
73 | print(u'文件夹已经存在了:{0}'.format(path))
74 | os.chdir(os.path.join(cwd_path, path))
75 | return False
76 |
77 | def all_url(self):
78 | html = download.get(self, self.url, 3)
79 | all_a = BeautifulSoup(
80 | html.text, 'lxml').find(
81 | 'div', class_='all').find_all('a')
82 | for a in all_a:
83 | title = a.get_text()
84 | print(u'开始保存:', title)
85 | self.meizititle = title
86 | path = str(title).replace("?", '_')
87 | self.mkdir(path)
88 | href = a['href']
89 | self.meiziurl = href
90 | if self.meizitu_collection.find_one({'专题地址': href}):
91 | print(u'这一专题已经爬取过并保存在数据库中')
92 | else:
93 | self.html(href)
94 |
95 | '''
96 | if __name__ == '__main__':
97 | url = 'http://www.mzitu.com/all'
98 | cwd_path = os.getcwd()
99 | Meizitu = meizitu(url)
100 | Meizitu.all_url()
101 | '''
--------------------------------------------------------------------------------
/meizitu/spider_meizitu_with_queue.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import os
4 | import time
5 | import threading
6 | import multiprocessing
7 | from hashlib import md5
8 | from bs4 import BeautifulSoup
9 | from download import download
10 | from crawler_queue import MongoQueue
11 | from config import *
12 |
13 | SLEEP_TIME = 1
14 | down = download()
15 |
16 |
17 | def meizitu_crawler(max_threads=5):
18 | crawl_queue = MongoQueue(MONGO_DB, MONGO_QUEUE_TABLE)
19 |
20 | def pageurl_crawler():
21 | while True:
22 | try:
23 | print('1')
24 | url = crawl_queue.pop()
25 | print(url)
26 | except KeyError:
27 | print(u'队列中没有数据')
28 | else:
29 | print('2')
30 | img_urls = []
31 | resp = down.get(url, 3).text
32 | title = crawl_queue.pop_title(url)
33 | mkdir(title)
34 | max_span = BeautifulSoup(resp, 'lxml').find(
35 | 'div', class_='pagenavi').find_all('span')[-2].get_text()
36 | for page in range(1, int(max_span) + 1):
37 | page_url = url + '/' + str(page)
38 | img_url = BeautifulSoup(
39 | down.get(
40 | page_url, 3).text, 'lxml').find(
41 | 'div', class_='main-image').find('img')['src']
42 | img_urls.append(img_url)
43 | save(img_url)
44 | crawl_queue.complete(url)
45 |
46 | def save(img_url):
47 | name = md5(str(img_url).encode(encoding='utf-8')).hexdigest()
48 | print('正在下载:{0}'.format(img_url))
49 | img = down.get(img_url, 3)
50 | with open(str(name) + '.jpg', 'ab')as f:
51 | f.write(img.content)
52 |
53 | def mkdir(path):
54 | path = path.strip()
55 |         isExists = os.path.exists(os.path.join(cwd_path, path))
56 |         if not isExists:
57 | print(u'新建文件夹:{0}'.format(path))
58 | os.mkdir(os.path.join(cwd_path, path))
59 | os.chdir(os.path.join(cwd_path, path))
60 | return True
61 | else:
62 | print(u'文件夹已经存在了:{0}'.format(path))
63 | os.chdir(os.path.join(cwd_path, path))
64 | return False
65 |
66 | threads = []
67 | while threads or crawl_queue:
68 | print('6')
69 | for thread in threads:
70 | print('7')
71 | if not thread.is_alive():
72 | threads.remove(thread)
73 |         while len(threads) < max_threads and crawl_queue.peek():
74 | print('5')
75 |             thread = threading.Thread(target=pageurl_crawler)
76 |             thread.daemon = True
77 | thread.start()
78 | threads.append(thread)
79 | time.sleep(SLEEP_TIME)
80 |
81 |
82 | def process_crawler():
83 | process = []
84 | num_cpus = multiprocessing.cpu_count()
85 | print(u'将启动{0}个进程'.format(num_cpus))
86 | for i in range(num_cpus):
87 | p = multiprocessing.Process(target=meizitu_crawler)
88 | p.start()
89 | process.append(p)
90 | for p in process:
91 | p.join()
92 |
93 | if __name__ == '__main__':
94 | cwd_path = os.getcwd()
95 | process_crawler()
96 |
--------------------------------------------------------------------------------
/python爬取微信公众号历史文章链接思路.md:
--------------------------------------------------------------------------------
1 | A friend asked whether I could collect the links to all the historical articles of a WeChat Official Account, so I gave it a try. A search on Baidu and Google shows that people currently crawl Official Accounts in basically two ways:
2 | 1. Search for the account through Sogou's WeChat search and take the links from there.
3 | 2. Capture the links from the WeChat mobile app with Fiddler.
4 |
5 | After weighing the two, I dropped Sogou first, because a quick look showed that a Sogou account page only exposes the links of the ten most recent articles. Below is the approach I used this time.
6 | # Approach
7 | While trying to capture the history links from WeChat on a phone, I noticed by accident that the desktop WeChat client can be captured just as well. This does not change much, since both the phone and the desktop client work; the desktop client is simply a bit more convenient.
8 | 1. First, start Fiddler, find the target Official Account in the desktop WeChat client, and left-click it; a "view all history" entry appears. After clicking it, Fiddler shows a GET request like this: /mp/getmasssendmsg?__biz=MzA3NDk1NjI0OQ==&uin=MjgxMTU0NDM1&key=cdce7679908e443d6f21adcc7236aea6bfd78ef06cb0f784644d5a3d1a7d1ee97b52997a3fdfca401835b9cc962bfa98e2d8f8806cba94b89ccd72c0883df2baaf712b0818727d149cefb3f920257d27&devicetype=Windows+10&version=6203005d&lang=zh_CN&ascene=7&pass_ticket=PMllYHvaLNk2DRePx1zNYuCv71ocxw7m6lOhOnaFfnnDt35P7ybHP3ESUYFoYaDQ . Prepend https://mp.weixin.qq.com to it and open the full URL in a browser, and the account's historical articles show up.
9 | After capturing this link a few more times with Fiddler and switching between a few accounts, it becomes clear that inside the URL, biz identifies the Official Account, uin identifies the WeChat user, and key is produced by a Tencent algorithm. For a given Official Account, only key expires; everything else stays constant. Once the key has expired, opening the link again fails and prompts you to open it in WeChat. I originally assumed that using WeChat's built-in browser would avoid the expiry, so at first I set the User-Agent to WeChat's, but that turned out not to help, so I switched back to a desktop UA. This is the painful part: a single key cannot be reused forever. Fortunately one key lasts long enough when crawling a single account, but it makes the program annoying to run, because every time the key expires a new one has to be captured.
10 |
11 | 2. Inspecting the elements of that page shows the article links are already visible, but there is a catch: the initial page still contains only the 10 most recent articles. You have to scroll down to load the rest. So the program needs selenium + PhantomJS to open this page and keep scrolling until the scrollbar reaches the bottom; inspecting the elements again then shows all of the article links. **Note that the links are hidden in several different kinds of tags, so collect all of them or some will be missed!** Then just save these links.
12 | # Program
13 | A rough outline of the program:
14 | The whole flow: open the URL above with selenium + PhantomJS, parse the page with BeautifulSoup, scroll to the bottom via JavaScript until the "no more messages" marker appears, then collect all the links and output them (remember the links live in several kinds of tags, so find them all). Since my friend only needed this one account, and only key changes for a given account, the key is simply read from the shell while everything else is hard-coded in the script. Maybe that is lazy of me... That is the rough idea, and there is still plenty of room for optimization. A minimal sketch of the scroll-and-extract step is included below.
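15 |
16 | The sketch below shows the scroll-and-extract step under a few assumptions: headless Chrome stands in for PhantomJS (whose Selenium support has since been dropped), the full history URL (with a fresh key already filled in) is passed in from the shell instead of only the key, the "no more messages" condition is approximated by waiting for the page height to stop growing, and the final href filter is only an illustration rather than the exact set of tags WeChat uses.
17 |
18 | ```python
19 | #!/usr/bin/env python3
20 | # -*- coding: utf-8 -*-
21 | # Sketch: scroll the history page to the bottom, then collect every article href.
22 | import sys
23 | import time
24 |
25 | from bs4 import BeautifulSoup
26 | from selenium import webdriver
27 | from selenium.webdriver.chrome.options import Options
28 |
29 |
30 | def fetch_history_links(history_url, pause=2):
31 |     opts = Options()
32 |     opts.add_argument('--headless')
33 |     driver = webdriver.Chrome(options=opts)
34 |     try:
35 |         driver.get(history_url)
36 |         last_height = driver.execute_script('return document.body.scrollHeight')
37 |         while True:
38 |             # scroll to the bottom and give the next batch of articles time to load
39 |             driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')
40 |             time.sleep(pause)
41 |             new_height = driver.execute_script('return document.body.scrollHeight')
42 |             if new_height == last_height:  # nothing new loaded: treat as "no more messages"
43 |                 break
44 |             last_height = new_height
45 |         soup = BeautifulSoup(driver.page_source, 'html.parser')
46 |         # the links live in several kinds of tags, so take every <a href> and filter
47 |         return {a['href'] for a in soup.find_all('a', href=True)
48 |                 if 'mp.weixin.qq.com' in a['href']}
49 |     finally:
50 |         driver.quit()
51 |
52 |
53 | if __name__ == '__main__':
54 |     # the short-lived history URL (containing the fresh key) is read from the shell
55 |     history_url = sys.argv[1]
56 |     for link in sorted(fetch_history_links(history_url)):
57 |         print(link)
58 | ```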
--------------------------------------------------------------------------------
/tickets/crawl_stations.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import re
4 |
5 | import requests
6 |
7 | from pprint import pprint
8 |
9 | url = 'https://kyfw.12306.cn/otn/resources/js/framework/station_name.js?station_version=1.8999'
10 |
11 | resp = requests.get(url, verify=False)
12 |
13 | stations = re.findall(r'([\u4e00-\u9fa5]+)\|([A-Z]+)', resp.text)
14 |
15 | pprint(dict(stations), indent=4)
16 |
17 |
--------------------------------------------------------------------------------
/tickets/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | prettytable
3 | docopt
4 |
--------------------------------------------------------------------------------
/tickets/stations.py:
--------------------------------------------------------------------------------
1 | stations = { '一间堡': 'YJT',
2 | '一面坡': 'YPB',
3 | '一面山': 'YST',
4 | '七台河': 'QTB',
5 | '七甸': 'QDM',
6 | '七营': 'QYJ',
7 | '七里河': 'QLD',
8 | '万乐': 'WEB',
9 | '万发屯': 'WFB',
10 | '万宁': 'WNQ',
11 | '万州': 'WYW',
12 | '万州北': 'WZE',
13 | '万年': 'WWG',
14 | '万源': 'WYY',
15 | '三义井': 'OYD',
16 | '三井子': 'OJT',
17 | '三亚': 'SEQ',
18 | '三关口': 'OKJ',
19 | '三十家': 'SRD',
20 | '三十里堡': 'SST',
21 | '三原': 'SAY',
22 | '三合庄': 'SVP',
23 | '三堂集': 'SDH',
24 | '三家寨': 'SMM',
25 | '三家店': 'ODP',
26 | '三明': 'SMS',
27 | '三明北': 'SHS',
28 | '三水': 'SJQ',
29 | '三水北': 'ARQ',
30 | '三水南': 'RNQ',
31 | '三汇镇': 'OZW',
32 | '三江南': 'SWZ',
33 | '三江县': 'SOZ',
34 | '三江口': 'SKD',
35 | '三河县': 'OXP',
36 | '三源浦': 'SYL',
37 | '三穗': 'QHW',
38 | '三营': 'OEJ',
39 | '三道湖': 'SDL',
40 | '三都县': 'KKW',
41 | '三门县': 'OQH',
42 | '三门峡': 'SMF',
43 | '三门峡南': 'SCF',
44 | '三门峡西': 'SXF',
45 | '三间房': 'SFX',
46 | '三阳川': 'SYJ',
47 | '上万': 'SWP',
48 | '上园': 'SUD',
49 | '上杭': 'JBS',
50 | '上板城': 'SBP',
51 | '上板城南': 'OBP',
52 | '上海': 'SHH',
53 | '上海南': 'SNH',
54 | '上海虹桥': 'AOH',
55 | '上海西': 'SXH',
56 | '上腰墩': 'SPJ',
57 | '上虞': 'BDH',
58 | '上西铺': 'SXM',
59 | '上饶': 'SRG',
60 | '上高镇': 'SVK',
61 | '下台子': 'EIP',
62 | '下城子': 'XCB',
63 | '下板城': 'EBP',
64 | '下社': 'XSV',
65 | '下花园': 'XYP',
66 | '下马塘': 'XAT',
67 | '世博园': 'ZWT',
68 | '东': 'FDC',
69 | '东丰': 'DIL',
70 | '东乡': 'DXG',
71 | '东二道河': 'DRB',
72 | '东京城': 'DJB',
73 | '东光': 'DGP',
74 | '东升': 'DRQ',
75 | '东台': 'DBH',
76 | '东安东': 'DCZ',
77 | '东庄': 'DZV',
78 | '东戌': 'RXP',
79 | '东戴河': 'RDD',
80 | '东方': 'UFQ',
81 | '东方红': 'DFB',
82 | '东明县': 'DNF',
83 | '东明村': 'DMD',
84 | '东来': 'RVD',
85 | '东津': 'DKB',
86 | '东海': 'DHB',
87 | '东海县': 'DQH',
88 | '东淤地': 'DBV',
89 | '东港北': 'RGT',
90 | '东湾': 'DRJ',
91 | '东胜': 'DOC',
92 | '东胜西': 'DYC',
93 | '东至': 'DCH',
94 | '东莞': 'RTQ',
95 | '东莞东': 'DMQ',
96 | '东营': 'DPK',
97 | '东营南': 'DOK',
98 | '东辛庄': 'DXD',
99 | '东边井': 'DBB',
100 | '东通化': 'DTL',
101 | '东镇': 'DNV',
102 | '两家': 'UJT',
103 | '两当': 'LDY',
104 | '中华门': 'VNH',
105 | '中卫': 'ZWJ',
106 | '中和': 'ZHX',
107 | '中宁': 'VNJ',
108 | '中宁东': 'ZDJ',
109 | '中宁南': 'ZNJ',
110 | '中寨': 'ZZM',
111 | '中山': 'ZSQ',
112 | '中山北': 'ZGQ',
113 | '中川机场': 'ZJJ',
114 | '中牟': 'ZGF',
115 | '丰乐镇': 'FZB',
116 | '丰城': 'FCG',
117 | '丰城南': 'FNG',
118 | '丰水村': 'FSJ',
119 | '丰都': 'FUW',
120 | '丰镇': 'FZC',
121 | '丰顺': 'FUQ',
122 | '临城': 'UUP',
123 | '临川': 'LCG',
124 | '临江': 'LQL',
125 | '临汾': 'LFV',
126 | '临汾西': 'LXV',
127 | '临沂': 'LVK',
128 | '临沂北': 'UYK',
129 | '临河': 'LHC',
130 | '临泽': 'LEJ',
131 | '临泽南': 'LDJ',
132 | '临海': 'UFH',
133 | '临清': 'UQK',
134 | '临湘': 'LXQ',
135 | '临澧': 'LWQ',
136 | '临西': 'UEP',
137 | '临邑': 'LUK',
138 | '临颍': 'LNF',
139 | '临高南': 'KGQ',
140 | '丹东': 'DUT',
141 | '丹东西': 'RWT',
142 | '丹凤': 'DGY',
143 | '丹徒': 'RUH',
144 | '丹阳': 'DYH',
145 | '丹阳北': 'EXH',
146 | '丹霞山': 'IRQ',
147 | '丽水': 'USH',
148 | '丽江': 'LHM',
149 | '乃林': 'NLD',
150 | '义乌': 'YWH',
151 | '义县': 'YXD',
152 | '义马': 'YMF',
153 | '乌伊岭': 'WPB',
154 | '乌兰哈达': 'WLC',
155 | '乌兰浩特': 'WWT',
156 | '乌奴耳': 'WRX',
157 | '乌尔旗汗': 'WHX',
158 | '乌拉山': 'WSC',
159 | '乌拉特前旗': 'WQC',
160 | '乌海': 'WVC',
161 | '乌海西': 'WXC',
162 | '乌西': 'WXR',
163 | '乌鲁木齐': 'WAR',
164 | '乌鲁木齐南': 'WMR',
165 | '乌龙泉南': 'WFN',
166 | '乐东': 'UQQ',
167 | '乐善村': 'LUM',
168 | '乐山': 'IVW',
169 | '乐山北': 'UTW',
170 | '乐平市': 'LPG',
171 | '乐昌': 'LCQ',
172 | '乐清': 'UPH',
173 | '乐都': 'LDO',
174 | '乐都南': 'LVO',
175 | '九三': 'SSX',
176 | '九台': 'JTL',
177 | '九台南': 'JNL',
178 | '九江': 'JJG',
179 | '九郎山': 'KJQ',
180 | '乳山': 'ROK',
181 | '乾县': 'QBY',
182 | '乾安': 'QOT',
183 | '二密河': 'RML',
184 | '二营': 'RYJ',
185 | '二连': 'RLC',
186 | '二道沟门': 'RDP',
187 | '二道湾': 'RDX',
188 | '二龙': 'RLD',
189 | '二龙山屯': 'ELA',
190 | '于家堡': 'YKP',
191 | '于都': 'YDG',
192 | '云东海': 'NAQ',
193 | '云居寺': 'AFP',
194 | '云山': 'KZQ',
195 | '云彩岭': 'ACP',
196 | '云梦': 'YMN',
197 | '云浮东': 'IXQ',
198 | '云霄': 'YBS',
199 | '五五': 'WVR',
200 | '五原': 'WYC',
201 | '五叉沟': 'WCT',
202 | '五台山': 'WSV',
203 | '五大连池': 'WRB',
204 | '五女山': 'WET',
205 | '五家': 'WUB',
206 | '五寨': 'WZV',
207 | '五常': 'WCB',
208 | '五府山': 'WFG',
209 | '五棵树': 'WKT',
210 | '五莲': 'WLK',
211 | '五营': 'WWB',
212 | '五道沟': 'WDL',
213 | '五道河': 'WHP',
214 | '五龙背': 'WBT',
215 | '五龙背东': 'WMT',
216 | '井冈山': 'JGG',
217 | '井南': 'JNP',
218 | '井店': 'JFP',
219 | '井陉': 'JJP',
220 | '亚': 'JUQ',
221 | '亚布力': 'YBB',
222 | '亚布力南': 'YWB',
223 | '亚龙湾': 'TWQ',
224 | '交城': 'JNV',
225 | '京山': 'JCN',
226 | '亭亮': 'TIZ',
227 | '亮甲店': 'LRT',
228 | '亳州': 'BZH',
229 | '什里店': 'OMP',
230 | '仁布': 'RUO',
231 | '介休': 'JXV',
232 | '介休东': 'JDV',
233 | '从江': 'KNW',
234 | '仙人桥': 'XRL',
235 | '仙林': 'XPH',
236 | '仙桃西': 'XAN',
237 | '仙游': 'XWS',
238 | '代县': 'DKV',
239 | '仪征': 'UZH',
240 | '仲恺': 'KKQ',
241 | '任丘': 'RQP',
242 | '伊图里河': 'YEX',
243 | '伊宁': 'YMR',
244 | '伊宁东': 'YNR',
245 | '伊尔施': 'YET',
246 | '伊拉哈': 'YLX',
247 | '伊敏': 'YMX',
248 | '伊春': 'YCB',
249 | '伊林': 'YLB',
250 | '会同': 'VTQ',
251 | '会昌北': 'XEG',
252 | '低庄': 'DVQ',
253 | '低窝铺': 'DWJ',
254 | '余姚': 'YYH',
255 | '余姚北': 'CTH',
256 | '余杭': 'EVH',
257 | '余江': 'YHG',
258 | '余粮堡': 'YLD',
259 | '佛山': 'FSQ',
260 | '佳木斯': 'JMB',
261 | '依安': 'YAX',
262 | '侯马': 'HMV',
263 | '侯马西': 'HPV',
264 | '保定': 'BDP',
265 | '保定东': 'BMP',
266 | '保康': 'BKD',
267 | '信丰': 'EFG',
268 | '信宜': 'EEQ',
269 | '信阳': 'XUN',
270 | '信阳东': 'OYN',
271 | '修武': 'XWF',
272 | '修武西': 'EXF',
273 | '倭肯': 'WQB',
274 | '偃师': 'YSF',
275 | '偏岭': 'PNT',
276 | '偏店': 'PRP',
277 | '元宝山': 'YUD',
278 | '元氏': 'YSP',
279 | '元谋': 'YMM',
280 | '先锋': 'NQQ',
281 | '光山': 'GUN',
282 | '光明城': 'IMQ',
283 | '光泽': 'GZS',
284 | '克一河': 'KHX',
285 | '克东': 'KOB',
286 | '克山': 'KSB',
287 | '克拉玛依': 'KHR',
288 | '免渡河': 'MDX',
289 | '兑镇': 'DWV',
290 | '兖州': 'YZK',
291 | '全州南': 'QNZ',
292 | '全椒': 'INH',
293 | '八仙筒': 'VXD',
294 | '八步': 'BBE',
295 | '八角台': 'BTD',
296 | '八达岭': 'ILP',
297 | '八面城': 'BMD',
298 | '八面通': 'BMB',
299 | '公主岭': 'GLT',
300 | '公主岭南': 'GBT',
301 | '公庙子': 'GMC',
302 | '公营子': 'GYD',
303 | '六合': 'KLH',
304 | '六合镇': 'LEX',
305 | '六安': 'UAH',
306 | '六枝': 'LIW',
307 | '六盘山': 'UPJ',
308 | '六盘水': 'UMW',
309 | '六道河子': 'LVP',
310 | '兰岗': 'LNB',
311 | '兰州': 'LZJ',
312 | '兰州东': 'LVJ',
313 | '兰州新区': 'LQJ',
314 | '兰州西': 'LAJ',
315 | '兰棱': 'LLB',
316 | '兰溪': 'LWH',
317 | '兰考': 'LKF',
318 | '兰考南': 'LUF',
319 | '兰陵北': 'COK',
320 | '共青城': 'GAG',
321 | '关岭': 'GLE',
322 | '关林': 'GLF',
323 | '兴业': 'SNZ',
324 | '兴义': 'XRZ',
325 | '兴凯': 'EKB',
326 | '兴和西': 'XEC',
327 | '兴国': 'EUG',
328 | '兴城': 'XCD',
329 | '兴宁': 'ENQ',
330 | '兴安': 'XAZ',
331 | '兴安北': 'XDZ',
332 | '兴平': 'XPY',
333 | '兴泉堡': 'XQJ',
334 | '兴隆县': 'EXP',
335 | '兴隆店': 'XDD',
336 | '兴隆镇': 'XZB',
337 | '内乡': 'NXF',
338 | '内江': 'NJW',
339 | '内江北': 'NKW',
340 | '册亨': 'CHZ',
341 | '冕宁': 'UGW',
342 | '军粮城北': 'JMP',
343 | '农安': 'NAT',
344 | '冠豸山': 'GSS',
345 | '冯屯': 'FTX',
346 | '冷水江东': 'UDQ',
347 | '凌海': 'JID',
348 | '凌源': 'LYD',
349 | '凌源东': 'LDD',
350 | '凤凰城': 'FHT',
351 | '凤凰机场': 'FJQ',
352 | '凤县': 'FXY',
353 | '凤城东': 'FDT',
354 | '凤州': 'FZY',
355 | '凤阳': 'FUH',
356 | '凭祥': 'PXZ',
357 | '凯里': 'KLW',
358 | '凯里南': 'QKW',
359 | '刀尔登': 'DRD',
360 | '分宜': 'FYG',
361 | '刘家店': 'UDT',
362 | '刘家河': 'LVT',
363 | '创业村': 'CEX',
364 | '利川': 'LCN',
365 | '利津南': 'LNK',
366 | '到保': 'RBT',
367 | '前卫': 'QWD',
368 | '前山': 'QXQ',
369 | '前磨头': 'QMP',
370 | '前苇塘': 'QWP',
371 | '前进镇': 'QEB',
372 | '前锋': 'QFB',
373 | '加格达奇': 'JGX',
374 | '勃利': 'BLB',
375 | '勉县': 'MVY',
376 | '包头': 'BTC',
377 | '包头东': 'BDC',
378 | '化州': 'HZZ',
379 | '化德': 'HGC',
380 | '北井子': 'BRT',
381 | '北京': 'BJP',
382 | '北京东': 'BOP',
383 | '北京北': 'VAP',
384 | '北京南': 'VNP',
385 | '北京西': 'BXP',
386 | '北台': 'BTT',
387 | '北宅': 'BVP',
388 | '北安': 'BAB',
389 | '北屯': 'BYP',
390 | '北屯市': 'BXR',
391 | '北戴河': 'BEP',
392 | '北流': 'BOZ',
393 | '北海': 'BHZ',
394 | '北滘': 'IBQ',
395 | '北碚': 'BPW',
396 | '北票南': 'RPD',
397 | '北营': 'BIV',
398 | '北马圈子': 'BRP',
399 | '十堰': 'SNN',
400 | '十家子': 'SJD',
401 | '十渡': 'SEP',
402 | '千河': 'QUY',
403 | '千阳': 'QOY',
404 | '午汲': 'WJP',
405 | '华城': 'VCQ',
406 | '华家': 'HJT',
407 | '华容': 'HRN',
408 | '华容东': 'HPN',
409 | '华容南': 'KRN',
410 | '华山': 'HSY',
411 | '华山北': 'HDY',
412 | '华蓥': 'HUW',
413 | '卓资东': 'ZDC',
414 | '卓资山': 'ZZC',
415 | '南丰': 'NFG',
416 | '南丹': 'NDZ',
417 | '南京': 'NJH',
418 | '南京南': 'NKH',
419 | '南仇': 'NCK',
420 | '南充': 'NCW',
421 | '南充北': 'NCE',
422 | '南关岭': 'NLT',
423 | '南华': 'NHS',
424 | '南博山': 'NBK',
425 | '南口': 'NKP',
426 | '南口前': 'NKT',
427 | '南召': 'NAF',
428 | '南台': 'NTT',
429 | '南城': 'NDG',
430 | '南城司': 'NSP',
431 | '南大庙': 'NMP',
432 | '南头': 'NOQ',
433 | '南宁': 'NNZ',
434 | '南宁东': 'NFZ',
435 | '南宁西': 'NXZ',
436 | '南宫东': 'NFP',
437 | '南岔': 'NCB',
438 | '南峪': 'NUP',
439 | '南平': 'NPS',
440 | '南平北': 'NBS',
441 | '南平南': 'NNS',
442 | '南昌': 'NCG',
443 | '南昌西': 'NXG',
444 | '南曹': 'NEF',
445 | '南朗': 'NNQ',
446 | '南木': 'NMX',
447 | '南杂木': 'NZT',
448 | '南桥': 'NQD',
449 | '南江': 'FIW',
450 | '南江口': 'NDQ',
451 | '南河川': 'NHJ',
452 | '南湖东': 'NDN',
453 | '南湾子': 'NWP',
454 | '南翔北': 'NEH',
455 | '南芬': 'NFT',
456 | '南芬北': 'NUT',
457 | '南观村': 'NGP',
458 | '南通': 'NUH',
459 | '南部': 'NBE',
460 | '南阳': 'NFF',
461 | '南阳寨': 'NYF',
462 | '南陵': 'LLH',
463 | '南雄': 'NCQ',
464 | '南靖': 'NJS',
465 | '博乐': 'BOR',
466 | '博克图': 'BKX',
467 | '博兴': 'BXK',
468 | '博白': 'BBZ',
469 | '博鳌': 'BWQ',
470 | '卢龙': 'UAP',
471 | '卧里屯': 'WLX',
472 | '卫东': 'WVT',
473 | '卫星': 'WVB',
474 | '卫辉': 'WHF',
475 | '即墨北': 'JVK',
476 | '原平': 'YPV',
477 | '厦门': 'XMS',
478 | '厦门北': 'XKS',
479 | '厦门高崎': 'XBS',
480 | '友好': 'YOB',
481 | '双丰': 'OFB',
482 | '双吉': 'SML',
483 | '双城北': 'SBB',
484 | '双城堡': 'SCB',
485 | '双峰北': 'NFQ',
486 | '双河镇': 'SEL',
487 | '双流机场': 'IPW',
488 | '双流西': 'IQW',
489 | '双牌': 'SBZ',
490 | '双辽': 'ZJD',
491 | '双鸭山': 'SSB',
492 | '发耳': 'FEM',
493 | '口东': 'KEQ',
494 | '口前': 'KQL',
495 | '古东': 'GDV',
496 | '古交': 'GJV',
497 | '古城镇': 'GZB',
498 | '古浪': 'GLJ',
499 | '古田': 'GTS',
500 | '古田会址': 'STS',
501 | '古田北': 'GBS',
502 | '古莲': 'GRX',
503 | '古镇': 'GNQ',
504 | '句容西': 'JWH',
505 | '台前': 'TTK',
506 | '台安': 'TID',
507 | '台州': 'TZH',
508 | '叶城': 'YER',
509 | '叶柏寿': 'YBD',
510 | '司家岭': 'OLK',
511 | '合川': 'WKW',
512 | '合浦': 'HVZ',
513 | '合肥': 'HFH',
514 | '合肥北城': 'COH',
515 | '合肥南': 'ENH',
516 | '合肥西': 'HTH',
517 | '合阳': 'HAY',
518 | '合阳北': 'HTY',
519 | '吉安': 'VAG',
520 | '吉文': 'JWX',
521 | '吉林': 'JLL',
522 | '吉舒': 'JSL',
523 | '吉首': 'JIQ',
524 | '同心': 'TXJ',
525 | '同江': 'TJB',
526 | '后湖': 'IHN',
527 | '吐列毛杜': 'TMD',
528 | '吐哈': 'THR',
529 | '吐鲁番': 'TFR',
530 | '吐鲁番北': 'TAR',
531 | '向塘': 'XTG',
532 | '向阳': 'XDB',
533 | '吕梁': 'LHV',
534 | '吴圩': 'WYZ',
535 | '吴堡': 'WUY',
536 | '吴官田': 'WGM',
537 | '吴家屯': 'WJT',
538 | '吴家川': 'WCJ',
539 | '吴桥': 'WUP',
540 | '周口': 'ZKN',
541 | '周家': 'ZOB',
542 | '周家屯': 'ZOD',
543 | '周水子': 'ZIT',
544 | '呼兰': 'HUB',
545 | '呼和浩特': 'HHC',
546 | '呼和浩特东': 'NDC',
547 | '呼鲁斯太': 'VTJ',
548 | '咋子': 'ZAL',
549 | '和什托洛盖': 'VSR',
550 | '和平': 'VAQ',
551 | '和田': 'VTR',
552 | '和硕': 'VUR',
553 | '和静': 'HJR',
554 | '和龙': 'HLL',
555 | '咸宁': 'XNN',
556 | '咸宁东': 'XKN',
557 | '咸宁北': 'XRN',
558 | '咸宁南': 'UNN',
559 | '咸阳': 'XYY',
560 | '咸阳秦都': 'XOY',
561 | '哈密': 'HMR',
562 | '哈尔滨': 'HBB',
563 | '哈尔滨东': 'VBB',
564 | '哈尔滨北': 'HTB',
565 | '哈尔滨西': 'VAB',
566 | '哈拉海': 'HIT',
567 | '哈拉苏': 'HAX',
568 | '哈达铺': 'HDJ',
569 | '哲里木': 'ZLC',
570 | '唐家湾': 'PDQ',
571 | '唐山': 'TSP',
572 | '唐山北': 'FUP',
573 | '唐河': 'THF',
574 | '商丘': 'SQF',
575 | '商丘南': 'SPF',
576 | '商南': 'ONY',
577 | '商城': 'SWN',
578 | '商河': 'SOK',
579 | '商洛': 'OLY',
580 | '商都': 'SXC',
581 | '喀什': 'KSR',
582 | '喀喇其': 'KQX',
583 | '喇嘛甸': 'LMX',
584 | '喜德': 'EDW',
585 | '嘉兴': 'JXH',
586 | '嘉兴南': 'EPH',
587 | '嘉善': 'JSH',
588 | '嘉善南': 'EAH',
589 | '嘉峪关': 'JGJ',
590 | '嘉峪关南': 'JBJ',
591 | '嘉峰': 'JFF',
592 | '嘉祥': 'JUK',
593 | '嘎什甸子': 'GXD',
594 | '四会': 'AHQ',
595 | '四合永': 'OHD',
596 | '四平': 'SPT',
597 | '四平东': 'PPT',
598 | '四方台': 'STB',
599 | '四道湾': 'OUD',
600 | '团结': 'TIX',
601 | '园墩': 'YAJ',
602 | '固原': 'GUJ',
603 | '固始': 'GXN',
604 | '固安': 'GFP',
605 | '固镇': 'GEH',
606 | '图们': 'TML',
607 | '图们北': 'QSL',
608 | '图强': 'TQX',
609 | '图里河': 'TEX',
610 | '土地堂东': 'TTN',
611 | '土桥子': 'TQJ',
612 | '土溪': 'TSW',
613 | '土牧尔台': 'TRC',
614 | '土贵乌拉': 'TGC',
615 | '土门子': 'TCJ',
616 | '坂田': 'BTQ',
617 | '坡底下': 'PXJ',
618 | '坪上': 'PSK',
619 | '坪石': 'PSQ',
620 | '垫江': 'DJE',
621 | '城固': 'CGY',
622 | '城子坦': 'CWT',
623 | '城阳': 'CEK',
624 | '塔哈': 'THX',
625 | '塔尔气': 'TVX',
626 | '塔崖驿': 'TYP',
627 | '塔河': 'TXX',
628 | '塔石嘴': 'TIM',
629 | '塘沽': 'TGP',
630 | '塘豹': 'TBQ',
631 | '墨玉': 'MUR',
632 | '壮志': 'ZUX',
633 | '复盛': 'FAW',
634 | '夏官营': 'XGJ',
635 | '夏石': 'XIZ',
636 | '夏邑县': 'EJH',
637 | '大丰': 'KRQ',
638 | '大余': 'DYG',
639 | '大元': 'DYZ',
640 | '大关': 'RGW',
641 | '大兴': 'DXX',
642 | '大兴沟': 'DXL',
643 | '大其拉哈': 'DQX',
644 | '大冶北': 'DBN',
645 | '大口屯': 'DKP',
646 | '大同': 'DTV',
647 | '大坝': 'DBJ',
648 | '大埔': 'DPI',
649 | '大堡': 'DVT',
650 | '大孤山': 'RMT',
651 | '大安': 'RAT',
652 | '大安北': 'RNT',
653 | '大官屯': 'DTT',
654 | '大屯': 'DNT',
655 | '大巴': 'DBD',
656 | '大平房': 'DPD',
657 | '大庆': 'DZX',
658 | '大庆东': 'LFX',
659 | '大庆西': 'RHX',
660 | '大成': 'DCT',
661 | '大战场': 'DTJ',
662 | '大拟': 'DNZ',
663 | '大方南': 'DNE',
664 | '大旺': 'WWQ',
665 | '大杖子': 'DAP',
666 | '大杨树': 'DUX',
667 | '大板': 'DBC',
668 | '大林': 'DLD',
669 | '大武口': 'DFJ',
670 | '大涧': 'DFP',
671 | '大湾子': 'DFM',
672 | '大灰厂': 'DHP',
673 | '大王滩': 'DZZ',
674 | '大理': 'DKM',
675 | '大田边': 'DBM',
676 | '大盘石': 'RPP',
677 | '大石头': 'DSL',
678 | '大石头南': 'DAL',
679 | '大石寨': 'RZT',
680 | '大石桥': 'DQT',
681 | '大磴沟': 'DKJ',
682 | '大禾塘': 'SOQ',
683 | '大竹园': 'DZY',
684 | '大红旗': 'DQD',
685 | '大英东': 'IAW',
686 | '大苴': 'DIM',
687 | '大荔': 'DNY',
688 | '大营': 'DYV',
689 | '大营子': 'DZD',
690 | '大营镇': 'DJP',
691 | '大虎山': 'DHD',
692 | '大足南': 'FQW',
693 | '大连': 'DLT',
694 | '大连北': 'DFT',
695 | '大通西': 'DTO',
696 | '大陆号': 'DLC',
697 | '大雁': 'DYX',
698 | '大青沟': 'DSD',
699 | '天义': 'TND',
700 | '天岗': 'TGL',
701 | '天柱山': 'QWH',
702 | '天桥岭': 'TQL',
703 | '天水': 'TSJ',
704 | '天河机场': 'TJN',
705 | '天河街': 'TEN',
706 | '天津': 'TJP',
707 | '天津北': 'TBP',
708 | '天津南': 'TIP',
709 | '天津西': 'TXP',
710 | '天祝': 'TZJ',
711 | '天镇': 'TZV',
712 | '天门': 'TMN',
713 | '天门南': 'TNN',
714 | '太原': 'TYV',
715 | '太原东': 'TDV',
716 | '太原北': 'TBV',
717 | '太原南': 'TNV',
718 | '太姥山': 'TLS',
719 | '太平川': 'TIT',
720 | '太平镇': 'TEB',
721 | '太湖': 'TKH',
722 | '太谷': 'TGV',
723 | '太谷西': 'TIV',
724 | '太阳升': 'TQT',
725 | '太阳山': 'TYJ',
726 | '夹心子': 'JXT',
727 | '奇峰塔': 'QVP',
728 | '奈曼': 'NMD',
729 | '奉化': 'FHH',
730 | '奎屯': 'KTR',
731 | '奎山': 'KAB',
732 | '如东': 'RIH',
733 | '如皋': 'RBH',
734 | '始兴': 'IPQ',
735 | '姚千户屯': 'YQT',
736 | '姚安': 'YAC',
737 | '姚家': 'YAT',
738 | '姚渡': 'AOJ',
739 | '姜堰': 'UEH',
740 | '姜家': 'JJB',
741 | '威海': 'WKK',
742 | '威海北': 'WHK',
743 | '威箐': 'WAM',
744 | '威舍': 'WSM',
745 | '威虎岭北': 'WBL',
746 | '娄底': 'LDQ',
747 | '娄底南': 'UOQ',
748 | '娘子关': 'NIP',
749 | '婺源': 'WYG',
750 | '嫩江': 'NGX',
751 | '子洲': 'ZZY',
752 | '子长': 'ZHY',
753 | '孙吴': 'SKB',
754 | '孙家': 'SUB',
755 | '孙镇': 'OZY',
756 | '孝南': 'XNV',
757 | '孝感': 'XGN',
758 | '孝感东': 'GDN',
759 | '孝感北': 'XJN',
760 | '孝西': 'XOV',
761 | '孟家岗': 'MGB',
762 | '孟庄': 'MZF',
763 | '孤家子': 'GKT',
764 | '孤山口': 'GSP',
765 | '宁东': 'NOJ',
766 | '宁东南': 'NDJ',
767 | '宁乡': 'NXQ',
768 | '宁国': 'NNH',
769 | '宁安': 'NAB',
770 | '宁家': 'NVT',
771 | '宁德': 'NES',
772 | '宁明': 'NMZ',
773 | '宁村': 'NCZ',
774 | '宁武': 'NWV',
775 | '宁波': 'NGH',
776 | '宁波东': 'NVH',
777 | '宁海': 'NHH',
778 | '宁陵县': 'NLF',
779 | '安亭北': 'ASH',
780 | '安仁': 'ARG',
781 | '安化': 'PKQ',
782 | '安口窑': 'AYY',
783 | '安图': 'ATL',
784 | '安图西': 'AXL',
785 | '安塘': 'ATV',
786 | '安定': 'ADP',
787 | '安家': 'AJB',
788 | '安平': 'APT',
789 | '安广': 'AGT',
790 | '安庆': 'AQH',
791 | '安庆西': 'APH',
792 | '安康': 'AKY',
793 | '安德': 'ARW',
794 | '安溪': 'AXS',
795 | '安达': 'ADX',
796 | '安阳': 'AYF',
797 | '安阳东': 'ADF',
798 | '安陆': 'ALN',
799 | '安顺': 'ASW',
800 | '安顺西': 'ASE',
801 | '安龙': 'AUZ',
802 | '宋': 'SOB',
803 | '宋城路': 'SFF',
804 | '宏庆': 'HEY',
805 | '官厅': 'GTP',
806 | '官厅西': 'KEP',
807 | '官字井': 'GOT',
808 | '官高': 'GVP',
809 | '定南': 'DNG',
810 | '定州': 'DXP',
811 | '定州东': 'DOP',
812 | '定襄': 'DXV',
813 | '定西': 'DSJ',
814 | '定边': 'DYJ',
815 | '定远': 'EWH',
816 | '定陶': 'DQK',
817 | '宜兴': 'YUH',
818 | '宜城': 'YIN',
819 | '宜宾': 'YBW',
820 | '宜州': 'YSZ',
821 | '宜昌': 'YCN',
822 | '宜昌东': 'HAN',
823 | '宜春': 'YEG',
824 | '宜春西': 'YCG',
825 | '宜耐': 'YVM',
826 | '宜良北': 'YSM',
827 | '宝华山': 'BWH',
828 | '宝坻': 'BPP',
829 | '宝拉格': 'BQC',
830 | '宝林': 'BNB',
831 | '宝泉岭': 'BQB',
832 | '宝清': 'BUB',
833 | '宝鸡': 'BJY',
834 | '宝鸡南': 'BBY',
835 | '宝龙山': 'BND',
836 | '宣化': 'XHP',
837 | '宣和': 'XWJ',
838 | '宣城': 'ECH',
839 | '宣威': 'XWM',
840 | '宣汉': 'XHY',
841 | '容县': 'RXZ',
842 | '容桂': 'RUQ',
843 | '宽甸': 'KDT',
844 | '宾阳': 'UKZ',
845 | '宿州': 'OXH',
846 | '宿州东': 'SRH',
847 | '宿松': 'OAH',
848 | '密云北': 'MUP',
849 | '密山': 'MSB',
850 | '富县': 'FEY',
851 | '富县东': 'FDY',
852 | '富宁': 'FNM',
853 | '富川': 'FDZ',
854 | '富拉尔基': 'FRX',
855 | '富海': 'FHX',
856 | '富源': 'FYM',
857 | '富源北': 'FBM',
858 | '富裕': 'FYX',
859 | '富锦': 'FIB',
860 | '寒岭': 'HAT',
861 | '寒葱沟': 'HKB',
862 | '察素齐': 'CSC',
863 | '对青山': 'DQB',
864 | '寿阳': 'SYV',
865 | '将乐': 'JLS',
866 | '小东': 'XOD',
867 | '小哨': 'XAM',
868 | '小寺沟': 'ESP',
869 | '小岭': 'XLB',
870 | '小市': 'XST',
871 | '小得江': 'EJM',
872 | '小扬气': 'XYX',
873 | '小新街': 'XXM',
874 | '小月旧': 'XFM',
875 | '小村': 'XEM',
876 | '小榄': 'EAQ',
877 | '小河沿': 'XYD',
878 | '小河镇': 'EKY',
879 | '小董': 'XEZ',
880 | '小西庄': 'XXP',
881 | '小金口': 'NKQ',
882 | '小雨谷': 'XHM',
883 | '尖峰': 'PFQ',
884 | '尚家': 'SJB',
885 | '尚志': 'SZB',
886 | '尤溪': 'YXS',
887 | '尹地': 'YDM',
888 | '尼勒克': 'NIR',
889 | '尼木': 'NMO',
890 | '屏边': 'PBM',
891 | '山丹': 'SDJ',
892 | '山坡东': 'SBN',
893 | '山城镇': 'SCL',
894 | '山市': 'SQB',
895 | '山河屯': 'SHL',
896 | '山海关': 'SHD',
897 | '山阴': 'SNV',
898 | '岐山': 'QAY',
899 | '岑溪': 'CNZ',
900 | '岔江': 'CAM',
901 | '岢岚': 'KLV',
902 | '岩会': 'AEP',
903 | '岱岳': 'RYV',
904 | '岳家井': 'YGJ',
905 | '岳池': 'AWW',
906 | '岳阳': 'YYQ',
907 | '岳阳东': 'YIQ',
908 | '岷县': 'MXJ',
909 | '峡江': 'EJG',
910 | '峨眉': 'EMW',
911 | '峨眉山': 'IXW',
912 | '峨边': 'EBW',
913 | '峻德': 'JDB',
914 | '崇仁': 'CRG',
915 | '崇左': 'CZZ',
916 | '崔黄口': 'CHP',
917 | '崖州': 'YUQ',
918 | '嵩明': 'SVM',
919 | '嵯岗': 'CAX',
920 | '巢湖': 'CIH',
921 | '巢湖东': 'GUH',
922 | '工农湖': 'GRT',
923 | '左岭': 'ZSN',
924 | '巨宝': 'JRT',
925 | '巨野': 'JYK',
926 | '巩义': 'GXF',
927 | '巩义南': 'GYF',
928 | '巴东': 'BNN',
929 | '巴中': 'IEW',
930 | '巴中东': 'BDE',
931 | '巴山': 'BAY',
932 | '巴彦高勒': 'BAC',
933 | '巴林': 'BLX',
934 | '巴楚': 'BCR',
935 | '布列开': 'BLR',
936 | '布海': 'BUT',
937 | '师宗': 'SEM',
938 | '师庄': 'SNM',
939 | '带岭': 'DLB',
940 | '常州': 'CZH',
941 | '常州北': 'ESH',
942 | '常平': 'DAQ',
943 | '常平东': 'FQQ',
944 | '常庄': 'CVK',
945 | '常德': 'VGQ',
946 | '帽儿山': 'MRB',
947 | '干塘': 'GNJ',
948 | '干沟': 'GGL',
949 | '平关': 'PGM',
950 | '平凉': 'PIJ',
951 | '平凉南': 'POJ',
952 | '平南南': 'PAZ',
953 | '平原': 'PYK',
954 | '平原堡': 'PPJ',
955 | '平台': 'PVT',
956 | '平坝南': 'PBE',
957 | '平型关': 'PGV',
958 | '平安': 'PAL',
959 | '平安镇': 'PZT',
960 | '平安驿': 'PNO',
961 | '平山': 'PSB',
962 | '平岗': 'PGL',
963 | '平峪': 'PYP',
964 | '平庄': 'PZD',
965 | '平庄南': 'PND',
966 | '平度': 'PAK',
967 | '平房': 'PFB',
968 | '平旺': 'PWV',
969 | '平昌': 'PCE',
970 | '平果': 'PGZ',
971 | '平河口': 'PHM',
972 | '平泉': 'PQP',
973 | '平洋': 'PYX',
974 | '平湖': 'PHQ',
975 | '平田': 'PTM',
976 | '平社': 'PSV',
977 | '平遥': 'PYV',
978 | '平遥古城': 'PDV',
979 | '平邑': 'PIK',
980 | '平顶山': 'PEN',
981 | '平顶山西': 'BFF',
982 | '广元': 'GYW',
983 | '广元南': 'GAW',
984 | '广南卫': 'GNM',
985 | '广南县': 'GXM',
986 | '广宁': 'FBQ',
987 | '广宁寺': 'GQT',
988 | '广宁寺南': 'GNT',
989 | '广安': 'VJW',
990 | '广安南': 'VUW',
991 | '广州': 'GZQ',
992 | '广州东': 'GGQ',
993 | '广州北': 'GBQ',
994 | '广州南': 'IZQ',
995 | '广州西': 'GXQ',
996 | '广德': 'GRH',
997 | '广水': 'GSN',
998 | '广汉': 'GHW',
999 | '广汉北': 'GVW',
1000 | '广通北': 'GPM',
1001 | '庄桥': 'ZQH',
1002 | '庄河北': 'ZUT',
1003 | '庆丰': 'QFT',
1004 | '庆安': 'QAB',
1005 | '庆盛': 'QSQ',
1006 | '庆阳山': 'QSJ',
1007 | '庐山': 'LSG',
1008 | '庐江': 'UJH',
1009 | '库伦': 'KLD',
1010 | '库尔勒': 'KLR',
1011 | '库车': 'KCR',
1012 | '库都尔': 'KDX',
1013 | '应县': 'YZV',
1014 | '应城': 'YHN',
1015 | '庙城': 'MAP',
1016 | '庙山': 'MSN',
1017 | '庙岭': 'MLL',
1018 | '庙庄': 'MZJ',
1019 | '康城': 'KCP',
1020 | '康庄': 'KZP',
1021 | '康熙岭': 'KXZ',
1022 | '康金井': 'KJB',
1023 | '廉江': 'LJZ',
1024 | '廊坊': 'LJP',
1025 | '廊坊北': 'LFP',
1026 | '延吉': 'YJL',
1027 | '延吉西': 'YXL',
1028 | '延安': 'YWY',
1029 | '延庆': 'YNP',
1030 | '建三江': 'JIB',
1031 | '建始': 'JRN',
1032 | '建宁县北': 'JCS',
1033 | '建昌': 'JFD',
1034 | '建水': 'JSM',
1035 | '建湖': 'AJH',
1036 | '建瓯': 'JVS',
1037 | '建瓯西': 'JUS',
1038 | '建设': 'JET',
1039 | '建阳': 'JYS',
1040 | '开原': 'KYT',
1041 | '开原西': 'KXT',
1042 | '开安': 'KAT',
1043 | '开封': 'KFF',
1044 | '开封北': 'KBF',
1045 | '开江': 'KAW',
1046 | '开通': 'KTT',
1047 | '开阳': 'KVW',
1048 | '开鲁': 'KLC',
1049 | '弋江': 'RVH',
1050 | '弋阳': 'YIG',
1051 | '弓棚子': 'GPT',
1052 | '张兰': 'ZLV',
1053 | '张家口': 'ZKP',
1054 | '张家口南': 'ZMP',
1055 | '张家界': 'DIQ',
1056 | '张掖': 'ZYJ',
1057 | '张掖西': 'ZEJ',
1058 | '张桥': 'ZQY',
1059 | '张百湾': 'ZUP',
1060 | '张维屯': 'ZWB',
1061 | '张辛': 'ZIP',
1062 | '弥勒': 'MLM',
1063 | '弥渡': 'MDF',
1064 | '归流河': 'GHT',
1065 | '当涂东': 'OWH',
1066 | '当阳': 'DYN',
1067 | '彝良': 'ALW',
1068 | '彬县': 'BXY',
1069 | '彭山': 'PSW',
1070 | '彭山北': 'PPW',
1071 | '彭州': 'PMW',
1072 | '彭水': 'PHW',
1073 | '彭泽': 'PZG',
1074 | '彭阳': 'PYJ',
1075 | '彰武': 'ZWD',
1076 | '徐家': 'XJB',
1077 | '徐州': 'XCH',
1078 | '徐州东': 'UUH',
1079 | '徐水': 'XSP',
1080 | '徐闻': 'XJQ',
1081 | '得耳布尔': 'DRX',
1082 | '徘徊北': 'PHP',
1083 | '微子镇': 'WQP',
1084 | '德令哈': 'DHO',
1085 | '德伯斯': 'RDT',
1086 | '德保': 'RBZ',
1087 | '德兴': 'DWG',
1088 | '德安': 'DAG',
1089 | '德州': 'DZP',
1090 | '德州东': 'DIP',
1091 | '德惠': 'DHT',
1092 | '德惠西': 'DXT',
1093 | '德昌': 'DVW',
1094 | '德清': 'DRH',
1095 | '德清西': 'MOH',
1096 | '德阳': 'DYW',
1097 | '徽县': 'HYY',
1098 | '忻州': 'XXV',
1099 | '怀仁': 'HRV',
1100 | '怀仁东': 'HFV',
1101 | '怀化': 'HHQ',
1102 | '怀化南': 'KAQ',
1103 | '怀柔': 'HRP',
1104 | '怀柔北': 'HBP',
1105 | '怀集': 'FAQ',
1106 | '恩施': 'ESN',
1107 | '恭城': 'GCZ',
1108 | '息县': 'ENN',
1109 | '息烽': 'XFW',
1110 | '悬钟': 'XRP',
1111 | '惠东': 'KDQ',
1112 | '惠农': 'HMJ',
1113 | '惠安': 'HNS',
1114 | '惠山': 'VCH',
1115 | '惠州': 'HCQ',
1116 | '惠州南': 'KNQ',
1117 | '惠州西': 'VXQ',
1118 | '惠环': 'KHQ',
1119 | '慈利': 'CUQ',
1120 | '成吉思汗': 'CJX',
1121 | '成都': 'CDW',
1122 | '成都东': 'ICW',
1123 | '成都南': 'CNW',
1124 | '成高子': 'CZB',
1125 | '戚墅堰': 'QYH',
1126 | '扎兰屯': 'ZTX',
1127 | '扎赉诺尔西': 'ZXX',
1128 | '扎鲁特': 'ZLD',
1129 | '打柴沟': 'DGJ',
1130 | '扬州': 'YLH',
1131 | '扶余': 'FYT',
1132 | '扶余北': 'FBT',
1133 | '扶绥': 'FSZ',
1134 | '承德': 'CDP',
1135 | '承德东': 'CCP',
1136 | '抚宁': 'FNP',
1137 | '抚州': 'FZG',
1138 | '抚州东': 'FDG',
1139 | '抚州北': 'FBG',
1140 | '抚松': 'FSL',
1141 | '抚远': 'FYB',
1142 | '抚顺': 'FST',
1143 | '抚顺北': 'FET',
1144 | '拉古': 'LGB',
1145 | '拉哈': 'LHX',
1146 | '拉林': 'LAB',
1147 | '拉萨': 'LSO',
1148 | '拉鲊': 'LEM',
1149 | '招柏': 'ZBP',
1150 | '换新天': 'VTB',
1151 | '揭阳': 'JRQ',
1152 | '攀枝花': 'PRW',
1153 | '攸县': 'YOG',
1154 | '攸县南': 'YXG',
1155 | '敖力布告': 'ALD',
1156 | '敦化': 'DHL',
1157 | '敦煌': 'DHJ',
1158 | '文地': 'WNZ',
1159 | '文安': 'WBP',
1160 | '文昌': 'WEQ',
1161 | '文水': 'WEV',
1162 | '文登': 'WBK',
1163 | '文登东': 'WGK',
1164 | '斜河涧': 'EEP',
1165 | '新乐': 'ELP',
1166 | '新乡': 'XXF',
1167 | '新乡东': 'EGF',
1168 | '新会': 'EFQ',
1169 | '新余': 'XUG',
1170 | '新余北': 'XBG',
1171 | '新保安': 'XAP',
1172 | '新兴县': 'XGQ',
1173 | '新化': 'EHQ',
1174 | '新化南': 'EJQ',
1175 | '新华': 'XHB',
1176 | '新华屯': 'XAX',
1177 | '新县': 'XSN',
1178 | '新友谊': 'EYB',
1179 | '新和': 'XIR',
1180 | '新坪田': 'XPM',
1181 | '新城子': 'XCT',
1182 | '新安': 'EAM',
1183 | '新安县': 'XAF',
1184 | '新帐房': 'XZX',
1185 | '新干': 'EGG',
1186 | '新晃': 'XLQ',
1187 | '新晃西': 'EWQ',
1188 | '新李': 'XLJ',
1189 | '新杖子': 'ERP',
1190 | '新松浦': 'XOB',
1191 | '新林': 'XPX',
1192 | '新民': 'XMD',
1193 | '新江': 'XJM',
1194 | '新沂': 'VIH',
1195 | '新津': 'IRW',
1196 | '新津南': 'ITW',
1197 | '新窝铺': 'EPD',
1198 | '新立屯': 'XLD',
1199 | '新立镇': 'XGT',
1200 | '新绛': 'XJV',
1201 | '新绰源': 'XRX',
1202 | '新肇': 'XZT',
1203 | '新邱': 'XQD',
1204 | '新郑机场': 'EZF',
1205 | '新都东': 'EWW',
1206 | '新阳镇': 'XZJ',
1207 | '新青': 'XQB',
1208 | '施家嘴': 'SHM',
1209 | '施秉': 'AQW',
1210 | '旅顺': 'LST',
1211 | '旌德': 'NSH',
1212 | '旗下营': 'QXC',
1213 | '无为': 'IIH',
1214 | '无锡': 'WXH',
1215 | '无锡东': 'WGH',
1216 | '无锡新区': 'IFH',
1217 | '日喀则': 'RKO',
1218 | '日照': 'RZK',
1219 | '旧庄窝': 'JVP',
1220 | '旬阳': 'XUY',
1221 | '旬阳北': 'XBY',
1222 | '旺苍': 'WEW',
1223 | '昂昂溪': 'AAX',
1224 | '昆山': 'KSH',
1225 | '昆山南': 'KNH',
1226 | '昆明': 'KMM',
1227 | '昆明南': 'KOM',
1228 | '昆明西': 'KXM',
1229 | '昆独仑召': 'KDC',
1230 | '昆阳': 'KAM',
1231 | '昌乐': 'CLK',
1232 | '昌图': 'CTT',
1233 | '昌图西': 'CPT',
1234 | '昌平': 'CPP',
1235 | '昌平北': 'VBP',
1236 | '昌黎': 'CLP',
1237 | '明光': 'MGH',
1238 | '明城': 'MCL',
1239 | '明水河': 'MUT',
1240 | '明港': 'MGN',
1241 | '明港东': 'MDN',
1242 | '明珠': 'MFQ',
1243 | '春湾': 'CQQ',
1244 | '春阳': 'CAL',
1245 | '昭化': 'ZHW',
1246 | '昭山': 'KWQ',
1247 | '昭通': 'ZDW',
1248 | '晋中': 'JZV',
1249 | '晋城': 'JCF',
1250 | '晋城北': 'JEF',
1251 | '晋州': 'JXP',
1252 | '晋江': 'JJS',
1253 | '晏城': 'YEK',
1254 | '晨明': 'CMB',
1255 | '普兰店': 'PLT',
1256 | '普宁': 'PEQ',
1257 | '普安': 'PAN',
1258 | '普安县': 'PUE',
1259 | '普定': 'PGW',
1260 | '普湾': 'PWT',
1261 | '普者黑': 'PZM',
1262 | '普雄': 'POW',
1263 | '景德镇': 'JCG',
1264 | '景泰': 'JTJ',
1265 | '暖泉': 'NQJ',
1266 | '暮云': 'KIQ',
1267 | '曲水县': 'QSO',
1268 | '曲江': 'QIM',
1269 | '曲阜': 'QFK',
1270 | '曲阜东': 'QAK',
1271 | '曲靖': 'QJM',
1272 | '曲靖北': 'QBM',
1273 | '曹县': 'CXK',
1274 | '曹子里': 'CFP',
1275 | '曾口': 'ZKE',
1276 | '曾家坪子': 'ZBW',
1277 | '月亮田': 'YUM',
1278 | '月山': 'YBF',
1279 | '朔州': 'SUV',
1280 | '朗乡': 'LXB',
1281 | '望都': 'WDP',
1282 | '朝阳': 'CYD',
1283 | '朝阳地': 'CDD',
1284 | '朝阳川': 'CYL',
1285 | '朝阳镇': 'CZL',
1286 | '木里图': 'MUD',
1287 | '本溪': 'BXT',
1288 | '本溪新城': 'BVT',
1289 | '本溪湖': 'BHT',
1290 | '朱家沟': 'ZUB',
1291 | '朱家窑': 'ZUJ',
1292 | '朱日和': 'ZRC',
1293 | '朱杨溪': 'ZXW',
1294 | '李家': 'LJB',
1295 | '李家坪': 'LIJ',
1296 | '李旺': 'VLJ',
1297 | '李石寨': 'LET',
1298 | '杏树': 'XSB',
1299 | '杏树屯': 'XDT',
1300 | '杜家': 'DJL',
1301 | '来宾': 'UBZ',
1302 | '来宾北': 'UCZ',
1303 | '来舟': 'LZS',
1304 | '杨岗': 'YRB',
1305 | '杨村': 'YBP',
1306 | '杨杖子': 'YZD',
1307 | '杨林': 'YLM',
1308 | '杨柳青': 'YQP',
1309 | '杨树岭': 'YAD',
1310 | '杨陵': 'YSY',
1311 | '杨陵南': 'YEY',
1312 | '杭州': 'HZH',
1313 | '杭州东': 'HGH',
1314 | '杭州南': 'XHH',
1315 | '杭锦后旗': 'HDC',
1316 | '松原': 'VYT',
1317 | '松原北': 'OCT',
1318 | '松树': 'SFT',
1319 | '松树镇': 'SSL',
1320 | '松桃': 'MZQ',
1321 | '松江': 'SAH',
1322 | '松江南': 'IMH',
1323 | '松江河': 'SJL',
1324 | '松江镇': 'OZL',
1325 | '松河': 'SBM',
1326 | '松滋': 'SIN',
1327 | '板城': 'BUP',
1328 | '板塘': 'NGQ',
1329 | '林东': 'LRC',
1330 | '林口': 'LKB',
1331 | '林海': 'LXX',
1332 | '林源': 'LYX',
1333 | '林盛堡': 'LBT',
1334 | '林西': 'LXC',
1335 | '果松': 'GSL',
1336 | '枝城': 'ZCN',
1337 | '枝江北': 'ZIN',
1338 | '枣庄': 'ZEK',
1339 | '枣庄东': 'ZNK',
1340 | '枣庄西': 'ZFK',
1341 | '枣强': 'ZVP',
1342 | '枣林': 'ZIV',
1343 | '枣阳': 'ZYN',
1344 | '柏果': 'BGM',
1345 | '柞水': 'ZSY',
1346 | '查布嘎': 'CBC',
1347 | '柳园': 'DHR',
1348 | '柳园南': 'LNR',
1349 | '柳州': 'LZZ',
1350 | '柳林南': 'LKV',
1351 | '柳树屯': 'LSD',
1352 | '柳江': 'UQZ',
1353 | '柳河': 'LNL',
1354 | '柴岗': 'CGT',
1355 | '柴沟堡': 'CGV',
1356 | '柴河': 'CHB',
1357 | '树木岭': 'FMQ',
1358 | '栟茶': 'FWH',
1359 | '株洲': 'ZZQ',
1360 | '株洲南': 'KVQ',
1361 | '株洲西': 'ZAQ',
1362 | '根河': 'GEX',
1363 | '格尔木': 'GRO',
1364 | '桂平': 'GAZ',
1365 | '桂林': 'GLZ',
1366 | '桂林北': 'GBZ',
1367 | '桂林西': 'GEZ',
1368 | '桃山': 'TAB',
1369 | '桃村': 'TCK',
1370 | '桃村北': 'TOK',
1371 | '桐乡': 'TCH',
1372 | '桐城': 'TTH',
1373 | '桐子林': 'TEW',
1374 | '桐柏': 'TBF',
1375 | '桐梓': 'TZW',
1376 | '桑园子': 'SAJ',
1377 | '桑根达来': 'OGC',
1378 | '桓台': 'VTK',
1379 | '桥头': 'QAT',
1380 | '桥西': 'QXJ',
1381 | '桦南': 'HNB',
1382 | '桦林': 'HIB',
1383 | '梁山': 'LMK',
1384 | '梁平': 'UQW',
1385 | '梁平南': 'LPE',
1386 | '梁底下': 'LDP',
1387 | '梅州': 'MOQ',
1388 | '梅河口': 'MHL',
1389 | '梧州': 'WZZ',
1390 | '梧州南': 'WBZ',
1391 | '梨树镇': 'LSB',
1392 | '棋子湾': 'QZQ',
1393 | '棕溪': 'ZOY',
1394 | '棠海': 'THM',
1395 | '楚山': 'CSB',
1396 | '楚雄': 'COM',
1397 | '榆林': 'ALY',
1398 | '榆树': 'YRT',
1399 | '榆树台': 'YUT',
1400 | '榆树屯': 'YSX',
1401 | '榆树沟': 'YGP',
1402 | '榆次': 'YCV',
1403 | '榆社': 'YSV',
1404 | '榕江': 'RVW',
1405 | '槐荫': 'IYN',
1406 | '樟木头': 'ZOQ',
1407 | '樟木头东': 'ZRQ',
1408 | '樟树': 'ZSG',
1409 | '樟树东': 'ZOG',
1410 | '横峰': 'HFG',
1411 | '横沟桥东': 'HNN',
1412 | '横道河子': 'HDB',
1413 | '歙县': 'OVH',
1414 | '歙县北': 'NPH',
1415 | '正定': 'ZDP',
1416 | '正定机场': 'ZHP',
1417 | '正镶白旗': 'ZXC',
1418 | '武义': 'RYH',
1419 | '武义北': 'WDH',
1420 | '武乡': 'WVV',
1421 | '武功': 'WGY',
1422 | '武夷山': 'WAS',
1423 | '武夷山东': 'WCS',
1424 | '武夷山北': 'WBS',
1425 | '武威': 'WUJ',
1426 | '武威南': 'WWJ',
1427 | '武安': 'WAP',
1428 | '武山': 'WSJ',
1429 | '武当山': 'WRN',
1430 | '武昌': 'WCN',
1431 | '武汉': 'WHN',
1432 | '武清': 'WWP',
1433 | '武穴': 'WXN',
1434 | '武胜': 'WSE',
1435 | '武陟': 'WIF',
1436 | '武隆': 'WLW',
1437 | '歪头山': 'WIT',
1438 | '毛坝': 'MBY',
1439 | '毛坝关': 'MGY',
1440 | '毛陈': 'MHN',
1441 | '民乐': 'MBJ',
1442 | '民和南': 'MNO',
1443 | '民权': 'MQF',
1444 | '民权北': 'MIF',
1445 | '水家湖': 'SQH',
1446 | '水富': 'OTW',
1447 | '水泉': 'SID',
1448 | '水洋': 'OYP',
1449 | '水洞': 'SIL',
1450 | '水源': 'OYJ',
1451 | '永丰营': 'YYM',
1452 | '永乐店': 'YDY',
1453 | '永修': 'ACG',
1454 | '永嘉': 'URH',
1455 | '永城北': 'RGH',
1456 | '永安': 'YAS',
1457 | '永安乡': 'YNB',
1458 | '永定': 'YGS',
1459 | '永寿': 'ASY',
1460 | '永川': 'YCW',
1461 | '永川东': 'WMW',
1462 | '永州': 'AOQ',
1463 | '永康': 'RFH',
1464 | '永康南': 'QUH',
1465 | '永泰': 'YTS',
1466 | '永济': 'YIV',
1467 | '永济北': 'AJV',
1468 | '永登': 'YDJ',
1469 | '永福南': 'YBZ',
1470 | '永郎': 'YLW',
1471 | '汉中': 'HOY',
1472 | '汉口': 'HKN',
1473 | '汉寿': 'VSQ',
1474 | '汉川': 'HCN',
1475 | '汉沽': 'HGP',
1476 | '汉源': 'WHW',
1477 | '汉阴': 'HQY',
1478 | '汐子': 'XZD',
1479 | '汕头': 'OTQ',
1480 | '汕尾': 'OGQ',
1481 | '汝州': 'ROF',
1482 | '汝箕沟': 'RQJ',
1483 | '汝阳': 'RYF',
1484 | '江华': 'JHZ',
1485 | '江宁': 'JJH',
1486 | '江宁西': 'OKH',
1487 | '江山': 'JUH',
1488 | '江所田': 'JOM',
1489 | '江桥': 'JQX',
1490 | '江永': 'JYZ',
1491 | '江油': 'JFW',
1492 | '江津': 'JJW',
1493 | '江源': 'SZL',
1494 | '江边村': 'JBG',
1495 | '江都': 'UDH',
1496 | '江门': 'JWQ',
1497 | '池州': 'IYH',
1498 | '汤原': 'TYB',
1499 | '汤山城': 'TCT',
1500 | '汤旺河': 'THB',
1501 | '汤池': 'TCX',
1502 | '汤逊湖': 'THN',
1503 | '汤阴': 'TYF',
1504 | '汨罗': 'MLQ',
1505 | '汨罗东': 'MQQ',
1506 | '汪清': 'WQL',
1507 | '汾河': 'FEV',
1508 | '汾阳': 'FAV',
1509 | '沁县': 'QVV',
1510 | '沁阳': 'QYF',
1511 | '沂南': 'YNK',
1512 | '沂水': 'YUK',
1513 | '沃皮': 'WPT',
1514 | '沈丘': 'SQN',
1515 | '沈家': 'OJB',
1516 | '沈家河': 'OJJ',
1517 | '沈阳': 'SYT',
1518 | '沈阳东': 'SDT',
1519 | '沈阳北': 'SBT',
1520 | '沈阳南': 'SOT',
1521 | '沐滂': 'MPQ',
1522 | '沙县': 'SAS',
1523 | '沙后所': 'SSD',
1524 | '沙坡头': 'SFJ',
1525 | '沙城': 'SCP',
1526 | '沙岭子': 'SLP',
1527 | '沙桥': 'SQM',
1528 | '沙沱': 'SFM',
1529 | '沙河': 'SHP',
1530 | '沙河口': 'SKT',
1531 | '沙河市': 'VOP',
1532 | '沙海': 'SED',
1533 | '沙湾县': 'SXR',
1534 | '沟帮子': 'GBD',
1535 | '沥林北': 'KBQ',
1536 | '沧州': 'COP',
1537 | '沧州西': 'CBP',
1538 | '沭阳': 'FMH',
1539 | '河口北': 'HBM',
1540 | '河口南': 'HKJ',
1541 | '河唇': 'HCZ',
1542 | '河津': 'HJV',
1543 | '河源': 'VIQ',
1544 | '河边': 'HBV',
1545 | '河间西': 'HXP',
1546 | '治安': 'ZAD',
1547 | '沿河城': 'YHP',
1548 | '泉州': 'QYS',
1549 | '泉州东': 'QRS',
1550 | '泉阳': 'QYL',
1551 | '泊头': 'BZP',
1552 | '法启': 'FQE',
1553 | '泗县': 'GPH',
1554 | '泗水': 'OSK',
1555 | '泗洪': 'GQH',
1556 | '泗阳': 'MPH',
1557 | '泡子': 'POD',
1558 | '泥河子': 'NHD',
1559 | '泰和': 'THG',
1560 | '泰宁': 'TNS',
1561 | '泰安': 'TMK',
1562 | '泰山': 'TAK',
1563 | '泰州': 'UTH',
1564 | '泰康': 'TKX',
1565 | '泰来': 'TLX',
1566 | '泽普': 'ZPR',
1567 | '泽润里': 'ZLM',
1568 | '泾县': 'LOH',
1569 | '泾川': 'JAJ',
1570 | '洋河': 'GTH',
1571 | '洛湾三江': 'KRW',
1572 | '洛门': 'LMJ',
1573 | '洛阳': 'LYF',
1574 | '洛阳东': 'LDF',
1575 | '洛阳龙门': 'LLF',
1576 | '洞井': 'FWQ',
1577 | '洞庙河': 'DEP',
1578 | '洪河': 'HPB',
1579 | '洪洞': 'HDV',
1580 | '洪洞西': 'HTV',
1581 | '洮南': 'TVT',
1582 | '流水沟': 'USP',
1583 | '济南': 'JNK',
1584 | '济南东': 'JAK',
1585 | '济南西': 'JGK',
1586 | '济宁': 'JIK',
1587 | '济源': 'JYF',
1588 | '浑河': 'HHT',
1589 | '浠水': 'XZN',
1590 | '浩良河': 'HHB',
1591 | '浮图峪': 'FYP',
1592 | '海东西': 'HDO',
1593 | '海伦': 'HLB',
1594 | '海北': 'HEB',
1595 | '海口': 'VUQ',
1596 | '海口东': 'HMQ',
1597 | '海坨子': 'HZT',
1598 | '海城': 'HCT',
1599 | '海城西': 'HXT',
1600 | '海宁': 'HNH',
1601 | '海宁西': 'EUH',
1602 | '海安县': 'HIH',
1603 | '海拉尔': 'HRX',
1604 | '海林': 'HRB',
1605 | '海湾': 'RWH',
1606 | '海石湾': 'HSO',
1607 | '海阳': 'HYK',
1608 | '海阳北': 'HEK',
1609 | '海龙': 'HIL',
1610 | '涉县': 'OEP',
1611 | '涞源': 'LYP',
1612 | '涟源': 'LAQ',
1613 | '涡阳': 'GYH',
1614 | '涪陵': 'FLW',
1615 | '涪陵北': 'FEW',
1616 | '涵江': 'HJS',
1617 | '涿州': 'ZXP',
1618 | '涿州东': 'ZAP',
1619 | '淄博': 'ZBK',
1620 | '淮北': 'HRH',
1621 | '淮南': 'HAH',
1622 | '淮南东': 'HOH',
1623 | '淮安': 'AUH',
1624 | '淮安南': 'AMH',
1625 | '淮滨': 'HVN',
1626 | '深井子': 'SWT',
1627 | '深圳': 'SZQ',
1628 | '深圳东': 'BJQ',
1629 | '深圳北': 'IOQ',
1630 | '深圳坪山': 'IFQ',
1631 | '深圳西': 'OSQ',
1632 | '深州': 'OZP',
1633 | '清华园': 'QHP',
1634 | '清原': 'QYT',
1635 | '清徐': 'QUV',
1636 | '清水': 'QUJ',
1637 | '清水北': 'QEJ',
1638 | '清河': 'QIP',
1639 | '清河城': 'QYP',
1640 | '清河门': 'QHD',
1641 | '清涧县': 'QNY',
1642 | '清远': 'QBQ',
1643 | '渑池': 'MCF',
1644 | '渑池南': 'MNF',
1645 | '渠县': 'QRW',
1646 | '渠旧': 'QJZ',
1647 | '渠黎': 'QLZ',
1648 | '温岭': 'VHH',
1649 | '温州': 'RZH',
1650 | '温州南': 'VRH',
1651 | '温春': 'WDB',
1652 | '渭南': 'WNY',
1653 | '渭南北': 'WBY',
1654 | '渭南南': 'WVY',
1655 | '渭南镇': 'WNJ',
1656 | '渭津': 'WJL',
1657 | '渭源': 'WEJ',
1658 | '湖口': 'HKG',
1659 | '湖州': 'VZH',
1660 | '湘乡': 'XXQ',
1661 | '湘府路': 'FVQ',
1662 | '湘潭': 'XTQ',
1663 | '湘潭北': 'EDQ',
1664 | '湛江': 'ZJZ',
1665 | '湛江西': 'ZWQ',
1666 | '湟源': 'HNO',
1667 | '湾沟': 'WGL',
1668 | '溆浦': 'EPQ',
1669 | '溆浦南': 'EMQ',
1670 | '源潭': 'YTQ',
1671 | '源迁': 'AQK',
1672 | '溧水': 'LDH',
1673 | '溧阳': 'LEH',
1674 | '滁州': 'CXH',
1675 | '滁州北': 'CUH',
1676 | '滕州': 'TXK',
1677 | '滕州东': 'TEK',
1678 | '满归': 'MHX',
1679 | '满洲里': 'MLX',
1680 | '滦县': 'UXP',
1681 | '滦平': 'UPP',
1682 | '滦河': 'UDP',
1683 | '滦河沿': 'UNP',
1684 | '滨州': 'BIK',
1685 | '滨江': 'BJB',
1686 | '滨海': 'FHP',
1687 | '滨海北': 'FCP',
1688 | '滴道': 'DDB',
1689 | '漠河': 'MVX',
1690 | '漫水湾': 'MKW',
1691 | '漯河': 'LON',
1692 | '漯河西': 'LBN',
1693 | '漳县': 'ZXJ',
1694 | '漳州': 'ZUS',
1695 | '漳州东': 'GOS',
1696 | '漳平': 'ZPS',
1697 | '漳浦': 'ZCS',
1698 | '潍坊': 'WFK',
1699 | '潘家店': 'PDP',
1700 | '潜江': 'QJN',
1701 | '潞城': 'UTP',
1702 | '潢川': 'KCN',
1703 | '潮州': 'CKQ',
1704 | '潮汕': 'CBQ',
1705 | '潮阳': 'CNQ',
1706 | '潼关': 'TGY',
1707 | '潼南': 'TVW',
1708 | '澄城': 'CUY',
1709 | '澧县': 'LEQ',
1710 | '濑湍': 'LVZ',
1711 | '濮阳': 'PYF',
1712 | '灌水': 'GST',
1713 | '火连寨': 'HLT',
1714 | '灯塔': 'DGT',
1715 | '灵丘': 'LVV',
1716 | '灵宝': 'LBF',
1717 | '灵宝西': 'LPF',
1718 | '灵武': 'LNJ',
1719 | '灵璧': 'GMH',
1720 | '灵石': 'LSV',
1721 | '灵石东': 'UDV',
1722 | '炎陵': 'YAG',
1723 | '烟台': 'YAK',
1724 | '烟台南': 'YLK',
1725 | '烟筒屯': 'YUX',
1726 | '烟筒山': 'YSL',
1727 | '热水': 'RSD',
1728 | '焉耆': 'YSR',
1729 | '焦作': 'JOF',
1730 | '焦作东': 'WEF',
1731 | '照福铺': 'ZFM',
1732 | '熊岳城': 'XYT',
1733 | '燕子砭': 'YZY',
1734 | '燕家庄': 'AZK',
1735 | '燕山': 'AOP',
1736 | '燕岗': 'YGW',
1737 | '燕郊': 'AJP',
1738 | '牙克石': 'YKX',
1739 | '牙屯堡': 'YTZ',
1740 | '牛家': 'NJB',
1741 | '牛心台': 'NXT',
1742 | '牟平': 'MBK',
1743 | '牡丹江': 'MDB',
1744 | '犀浦': 'XIW',
1745 | '犀浦东': 'XAW',
1746 | '独山': 'RWW',
1747 | '独立屯': 'DTX',
1748 | '狮山': 'KSQ',
1749 | '狮山北': 'NSQ',
1750 | '狼尾山': 'LRJ',
1751 | '猛洞河': 'MUQ',
1752 | '玉屏': 'YZW',
1753 | '玉山': 'YNG',
1754 | '玉山南': 'YGG',
1755 | '玉林': 'YLZ',
1756 | '玉泉': 'YQB',
1757 | '玉溪': 'AXM',
1758 | '玉溪西': 'YXM',
1759 | '玉田县': 'ATP',
1760 | '玉石': 'YSJ',
1761 | '玉舍': 'AUM',
1762 | '玉门': 'YXJ',
1763 | '王兆屯': 'WZB',
1764 | '王团庄': 'WZJ',
1765 | '王安镇': 'WVP',
1766 | '王家湾': 'WJJ',
1767 | '王家营西': 'KNM',
1768 | '王岗': 'WGB',
1769 | '王府': 'WUT',
1770 | '王杨': 'WYB',
1771 | '王瞳': 'WTP',
1772 | '玛纳斯': 'MSR',
1773 | '玛纳斯湖': 'MNR',
1774 | '珠海': 'ZHQ',
1775 | '珠海北': 'ZIQ',
1776 | '珠窝': 'ZOP',
1777 | '班猫箐': 'BNM',
1778 | '珲春': 'HUL',
1779 | '琼海': 'QYQ',
1780 | '瑞安': 'RAH',
1781 | '瑞昌': 'RCG',
1782 | '瑞金': 'RJG',
1783 | '璧山': 'FZW',
1784 | '瓜州': 'GZJ',
1785 | '瓢儿屯': 'PRT',
1786 | '瓦屋山': 'WAH',
1787 | '瓦房店': 'WDT',
1788 | '瓦房店西': 'WXT',
1789 | '瓦拉干': 'WVX',
1790 | '瓦窑田': 'WIM',
1791 | '甘旗卡': 'GQD',
1792 | '甘河': 'GAX',
1793 | '甘泉': 'GQY',
1794 | '甘泉北': 'GEY',
1795 | '甘洛': 'VOW',
1796 | '甘草店': 'GDJ',
1797 | '甘谷': 'GGJ',
1798 | '田东': 'TDZ',
1799 | '田东北': 'TBZ',
1800 | '田师府': 'TFT',
1801 | '田心东': 'KQQ',
1802 | '田林': 'TFZ',
1803 | '田阳': 'TRZ',
1804 | '甲山': 'JOP',
1805 | '甸心': 'DXM',
1806 | '界首市': 'JUN',
1807 | '略阳': 'LYY',
1808 | '疏勒': 'SUR',
1809 | '疏勒河': 'SHJ',
1810 | '登沙河': 'DWT',
1811 | '白云鄂博': 'BEC',
1812 | '白城': 'BCT',
1813 | '白壁关': 'BGV',
1814 | '白奎堡': 'BKB',
1815 | '白山市': 'HJL',
1816 | '白旗': 'BQP',
1817 | '白水县': 'BGY',
1818 | '白水江': 'BSY',
1819 | '白水镇': 'BUM',
1820 | '白沙': 'BSW',
1821 | '白沙坡': 'BPM',
1822 | '白沟': 'FEP',
1823 | '白河': 'BEL',
1824 | '白河东': 'BIY',
1825 | '白河县': 'BEY',
1826 | '白泉': 'BQL',
1827 | '白洋淀': 'FWP',
1828 | '白涧': 'BAP',
1829 | '白狼': 'BAT',
1830 | '白石山': 'BAL',
1831 | '白芨沟': 'BJJ',
1832 | '白银市': 'BNJ',
1833 | '白银西': 'BXJ',
1834 | '白音他拉': 'BID',
1835 | '白音华南': 'FNC',
1836 | '白音察干': 'BYC',
1837 | '白音胡硕': 'BCD',
1838 | '白马井': 'BFQ',
1839 | '白鸡坡': 'BBM',
1840 | '百宜': 'FHW',
1841 | '百浪': 'BRZ',
1842 | '百色': 'BIZ',
1843 | '百里峡': 'AAP',
1844 | '皇姑屯': 'HTT',
1845 | '皋兰': 'GEJ',
1846 | '皮口': 'PUT',
1847 | '皮口南': 'PKT',
1848 | '皮山': 'PSR',
1849 | '益阳': 'AEQ',
1850 | '盐城': 'AFH',
1851 | '盐池': 'YKJ',
1852 | '盐津': 'AEW',
1853 | '盖州': 'GXT',
1854 | '盖州西': 'GAT',
1855 | '盘关': 'PAM',
1856 | '盘州': 'PAE',
1857 | '盘锦': 'PVD',
1858 | '盘锦北': 'PBD',
1859 | '盘龙城': 'PNN',
1860 | '眉山': 'MSW',
1861 | '眉山东': 'IUW',
1862 | '石人': 'SRL',
1863 | '石人城': 'SRB',
1864 | '石嘴山': 'QQJ',
1865 | '石坝': 'OBJ',
1866 | '石城': 'SCT',
1867 | '石头': 'OTB',
1868 | '石家庄': 'SJP',
1869 | '石家庄北': 'VVP',
1870 | '石山': 'SAD',
1871 | '石岘': 'SXL',
1872 | '石岭': 'SOL',
1873 | '石峡子': 'SXJ',
1874 | '石景山南': 'SRP',
1875 | '石林': 'SLM',
1876 | '石林南': 'LNM',
1877 | '石林西': 'SYM',
1878 | '石柱县': 'OSW',
1879 | '石桥': 'SQE',
1880 | '石桥子': 'SQT',
1881 | '石梯': 'STE',
1882 | '石河子': 'SZR',
1883 | '石泉县': 'SXY',
1884 | '石磷': 'SPB',
1885 | '石门县': 'OMQ',
1886 | '石门县北': 'VFQ',
1887 | '石龙': 'SLQ',
1888 | '砀山': 'DKH',
1889 | '砀山南': 'PRH',
1890 | '砚川': 'YYY',
1891 | '确山': 'QSN',
1892 | '碧江': 'BLQ',
1893 | '碧鸡关': 'BJM',
1894 | '碾子山': 'NZX',
1895 | '磁县': 'CIP',
1896 | '磁山': 'CSP',
1897 | '磁窑': 'CYK',
1898 | '磁西': 'CRP',
1899 | '磐安镇': 'PAJ',
1900 | '磐石': 'PSL',
1901 | '磨刀石': 'MOB',
1902 | '礼泉': 'LGY',
1903 | '祁东': 'QMQ',
1904 | '祁东北': 'QRQ',
1905 | '祁县': 'QXV',
1906 | '祁县东': 'QGV',
1907 | '祁家堡': 'QBT',
1908 | '祁门': 'QIH',
1909 | '祁阳': 'QWQ',
1910 | '祁阳北': 'QVQ',
1911 | '神头': 'SEV',
1912 | '神州': 'SRQ',
1913 | '神木': 'OMY',
1914 | '神树': 'SWB',
1915 | '神池': 'SMV',
1916 | '祥云': 'EXM',
1917 | '禄丰南': 'LQM',
1918 | '福利区': 'FLJ',
1919 | '福利屯': 'FTB',
1920 | '福安': 'FAS',
1921 | '福山口': 'FKP',
1922 | '福山镇': 'FZQ',
1923 | '福州': 'FZS',
1924 | '福州南': 'FYS',
1925 | '福泉': 'VMW',
1926 | '福海': 'FHR',
1927 | '福清': 'FQS',
1928 | '福田': 'NZQ',
1929 | '福鼎': 'FES',
1930 | '禹城': 'YCK',
1931 | '离堆公园': 'INW',
1932 | '秀山': 'ETW',
1933 | '秦家': 'QJB',
1934 | '秦家庄': 'QZV',
1935 | '秦岭': 'QLY',
1936 | '秦皇岛': 'QTP',
1937 | '秧草地': 'YKM',
1938 | '稷山': 'JVV',
1939 | '穆棱': 'MLB',
1940 | '窑上': 'ASP',
1941 | '立志': 'LZX',
1942 | '章丘': 'ZTK',
1943 | '章党': 'ZHT',
1944 | '章古台': 'ZGD',
1945 | '端州': 'WZQ',
1946 | '竹园坝': 'ZAW',
1947 | '笔架山': 'BSB',
1948 | '简阳': 'JYW',
1949 | '简阳南': 'JOW',
1950 | '米易': 'MMW',
1951 | '米沙子': 'MST',
1952 | '米脂': 'MEY',
1953 | '精河': 'JHR',
1954 | '精河南': 'JIR',
1955 | '索伦': 'SNT',
1956 | '索图罕': 'SHX',
1957 | '紫荆关': 'ZYP',
1958 | '紫阳': 'ZVY',
1959 | '綦江': 'QJW',
1960 | '繁峙': 'FSV',
1961 | '繁昌西': 'PUH',
1962 | '红光镇': 'IGW',
1963 | '红兴隆': 'VHB',
1964 | '红安': 'HWN',
1965 | '红安西': 'VXN',
1966 | '红寺堡': 'HSJ',
1967 | '红山': 'VSB',
1968 | '红岘台': 'HTJ',
1969 | '红彦': 'VIX',
1970 | '红星': 'VXB',
1971 | '红果': 'HEM',
1972 | '红江': 'HFM',
1973 | '红砂岘': 'VSJ',
1974 | '红花沟': 'VHD',
1975 | '纪家沟': 'VJD',
1976 | '纳雍': 'NYE',
1977 | '纸坊东': 'ZMN',
1978 | '绅坊': 'OLH',
1979 | '织金': 'IZW',
1980 | '织金北': 'ZJE',
1981 | '绍兴': 'SOH',
1982 | '绍兴东': 'SSH',
1983 | '绍兴北': 'SLH',
1984 | '经棚': 'JPC',
1985 | '绕阳河': 'RHD',
1986 | '统军庄': 'TZP',
1987 | '绥中': 'SZD',
1988 | '绥中北': 'SND',
1989 | '绥化': 'SHB',
1990 | '绥德': 'ODY',
1991 | '绥棱': 'SIB',
1992 | '绥芬河': 'SFB',
1993 | '绥阳': 'SYB',
1994 | '绩溪北': 'NRH',
1995 | '绩溪县': 'JRH',
1996 | '绵阳': 'MYW',
1997 | '绿化': 'LWJ',
1998 | '绿博园': 'LCF',
1999 | '缙云': 'JYH',
2000 | '缙云西': 'PYH',
2001 | '罗城': 'VCZ',
2002 | '罗山': 'LRN',
2003 | '罗平': 'LPM',
2004 | '罗江': 'LJW',
2005 | '罗江东': 'IKW',
2006 | '罗源': 'LVS',
2007 | '羊场': 'YED',
2008 | '羊堡': 'ABM',
2009 | '羊尾哨': 'YWM',
2010 | '羊者窝': 'AEM',
2011 | '羊臼河': 'YHM',
2012 | '羊草': 'YAB',
2013 | '美兰': 'MHQ',
2014 | '美溪': 'MEB',
2015 | '老城镇': 'ACQ',
2016 | '老府': 'UFD',
2017 | '老莱': 'LAX',
2018 | '老营': 'LXL',
2019 | '老边': 'LLT',
2020 | '耒阳': 'LYQ',
2021 | '耒阳西': 'LPQ',
2022 | '聊城': 'UCK',
2023 | '肃宁': 'SYP',
2024 | '肇东': 'ZDB',
2025 | '肇庆': 'ZVQ',
2026 | '肇庆东': 'FCQ',
2027 | '肥东': 'FIH',
2028 | '背荫河': 'BYB',
2029 | '胜芳': 'SUP',
2030 | '胶州': 'JXK',
2031 | '胶州北': 'JZK',
2032 | '能家': 'NJD',
2033 | '自贡': 'ZGW',
2034 | '舍力虎': 'VLD',
2035 | '舒兰': 'SLL',
2036 | '舒城': 'OCH',
2037 | '良各庄': 'LGP',
2038 | '艾家村': 'AJJ',
2039 | '艾河': 'AHP',
2040 | '芙蓉南': 'KCQ',
2041 | '芜湖': 'WHH',
2042 | '芦台': 'LTP',
2043 | '芦沟': 'LOM',
2044 | '芦溪': 'LUG',
2045 | '芦潮港': 'UCH',
2046 | '芨岭': 'JLJ',
2047 | '花园': 'HUN',
2048 | '花园口': 'HYT',
2049 | '花家庄': 'HJM',
2050 | '花山南': 'KNN',
2051 | '花桥': 'VQH',
2052 | '花棚子': 'HZM',
2053 | '花湖': 'KHN',
2054 | '芷江': 'ZPQ',
2055 | '苇子沟': 'WZL',
2056 | '苇河': 'WHB',
2057 | '苍南': 'CEH',
2058 | '苍溪': 'CXE',
2059 | '苍石': 'CST',
2060 | '苏家屯': 'SXT',
2061 | '苏尼特左旗': 'ONC',
2062 | '苏州': 'SZH',
2063 | '苏州北': 'OHH',
2064 | '苏州园区': 'KAH',
2065 | '苏州新区': 'ITH',
2066 | '英吉沙': 'YIR',
2067 | '英德': 'YDQ',
2068 | '英德西': 'IIQ',
2069 | '茂名': 'MDQ',
2070 | '茂名西': 'MMZ',
2071 | '茂林': 'MLD',
2072 | '茂舍祖': 'MOM',
2073 | '范家屯': 'FTT',
2074 | '范镇': 'VZK',
2075 | '茅岭': 'MLZ',
2076 | '茅草坪': 'KPM',
2077 | '茶卡': 'CVO',
2078 | '茶陵': 'CDG',
2079 | '茶陵南': 'CNG',
2080 | '荆州': 'JBN',
2081 | '荆门': 'JMN',
2082 | '草市': 'CSL',
2083 | '草河口': 'CKT',
2084 | '草海': 'WBW',
2085 | '荣成': 'RCK',
2086 | '荣昌': 'RCW',
2087 | '荣昌北': 'RQW',
2088 | '荷塘': 'KXQ',
2089 | '莆田': 'PTS',
2090 | '莎车': 'SCR',
2091 | '莒南': 'JOK',
2092 | '莒县': 'JKK',
2093 | '莫尔道嘎': 'MRX',
2094 | '莱芜东': 'LWK',
2095 | '莱芜西': 'UXK',
2096 | '莱西': 'LXK',
2097 | '莱西北': 'LBK',
2098 | '莱阳': 'LYK',
2099 | '莲江口': 'LHB',
2100 | '获嘉': 'HJF',
2101 | '菇园': 'GYL',
2102 | '菏泽': 'HIK',
2103 | '萍乡': 'PXG',
2104 | '萍乡北': 'PBG',
2105 | '营口': 'YKT',
2106 | '营口东': 'YGT',
2107 | '营城子': 'YCT',
2108 | '营山': 'NUW',
2109 | '营盘水': 'YZJ',
2110 | '营街': 'YAM',
2111 | '萧县北': 'QSH',
2112 | '萨拉齐': 'SLC',
2113 | '落坡岭': 'LPP',
2114 | '落垡': 'LOP',
2115 | '葛店南': 'GNN',
2116 | '葛根庙': 'GGT',
2117 | '葡萄菁': 'PTW',
2118 | '葫芦岛': 'HLD',
2119 | '葫芦岛北': 'HPD',
2120 | '葵潭': 'KTQ',
2121 | '蒙自': 'MZM',
2122 | '蒙自北': 'MBM',
2123 | '蒲城': 'PCY',
2124 | '蒲城东': 'PEY',
2125 | '蓝村': 'LCK',
2126 | '蓟州': 'JKP',
2127 | '蓬安': 'PAW',
2128 | '蔡家坡': 'CJY',
2129 | '蔡家沟': 'CJT',
2130 | '蔡山': 'CON',
2131 | '蔺家楼': 'ULK',
2132 | '蕲春': 'QRN',
2133 | '藁城': 'GEP',
2134 | '藤县': 'TAZ',
2135 | '虎什哈': 'HHP',
2136 | '虎林': 'VLB',
2137 | '虎石台': 'HUT',
2138 | '虎门': 'IUQ',
2139 | '虞城县': 'IXH',
2140 | '虢镇': 'GZY',
2141 | '蚌埠': 'BBH',
2142 | '蚌埠南': 'BMH',
2143 | '蛟河': 'JHL',
2144 | '蛟河西': 'JOL',
2145 | '融安': 'RAZ',
2146 | '融水': 'RSZ',
2147 | '衡南': 'HNG',
2148 | '衡山': 'HSQ',
2149 | '衡山西': 'HEQ',
2150 | '衡水': 'HSP',
2151 | '衡阳': 'HYQ',
2152 | '衡阳东': 'HVQ',
2153 | '衢州': 'QEH',
2154 | '裴德': 'PDB',
2155 | '褚家湾': 'CWJ',
2156 | '襄垣': 'EIF',
2157 | '襄汾': 'XFV',
2158 | '襄汾西': 'XTV',
2159 | '襄河': 'XXB',
2160 | '襄阳': 'XFN',
2161 | '襄阳东': 'XWN',
2162 | '西丰': 'XFT',
2163 | '西乌旗': 'XWC',
2164 | '西乡': 'XQY',
2165 | '西八里': 'XLP',
2166 | '西哲里木': 'XRD',
2167 | '西固': 'XIJ',
2168 | '西固城': 'XUJ',
2169 | '西大庙': 'XMP',
2170 | '西宁': 'XNO',
2171 | '西安': 'XAY',
2172 | '西安北': 'EAY',
2173 | '西安南': 'CAY',
2174 | '西小召': 'XZC',
2175 | '西岗子': 'NBB',
2176 | '西峡': 'XIF',
2177 | '西平': 'XPN',
2178 | '西昌': 'ECW',
2179 | '西昌南': 'ENW',
2180 | '西林': 'XYB',
2181 | '西柳': 'GCT',
2182 | '西湖东': 'WDQ',
2183 | '西街口': 'EKM',
2184 | '西阳村': 'XQF',
2185 | '西麻山': 'XMB',
2186 | '角美': 'JES',
2187 | '讷河': 'NHX',
2188 | '许三湾': 'XSJ',
2189 | '许家台': 'XTJ',
2190 | '许家屯': 'XJT',
2191 | '许昌': 'XCF',
2192 | '许昌东': 'XVF',
2193 | '诏安': 'ZDS',
2194 | '诸城': 'ZQK',
2195 | '诸暨': 'ZDH',
2196 | '读书铺': 'DPM',
2197 | '谢家镇': 'XMT',
2198 | '谭家井': 'TNJ',
2199 | '谷城': 'GCN',
2200 | '豆庄': 'ROP',
2201 | '豆张庄': 'RZP',
2202 | '豆罗': 'DLV',
2203 | '贲红': 'BVC',
2204 | '贵安': 'GAE',
2205 | '贵定': 'GTW',
2206 | '贵定北': 'FMW',
2207 | '贵定南': 'IDW',
2208 | '贵定县': 'KIW',
2209 | '贵港': 'GGZ',
2210 | '贵溪': 'GXG',
2211 | '贵阳': 'GIW',
2212 | '贵阳北': 'KQW',
2213 | '费县': 'FXK',
2214 | '贺家店': 'HJJ',
2215 | '贺州': 'HXZ',
2216 | '贺胜桥东': 'HLN',
2217 | '贾鲁河': 'JLF',
2218 | '资中': 'ZZW',
2219 | '资中北': 'WZW',
2220 | '资溪': 'ZXS',
2221 | '资阳': 'ZYW',
2222 | '资阳北': 'FYW',
2223 | '赛汗塔拉': 'SHC',
2224 | '赣州': 'GZG',
2225 | '赤壁': 'CBN',
2226 | '赤壁北': 'CIN',
2227 | '赤峰': 'CFD',
2228 | '赤峰西': 'CID',
2229 | '赵光': 'ZGB',
2230 | '赵城': 'ZCV',
2231 | '赶水': 'GSW',
2232 | '超梁沟': 'CYP',
2233 | '越西': 'YHW',
2234 | '路口铺': 'LKQ',
2235 | '车转湾': 'CWM',
2236 | '轩岗': 'XGV',
2237 | '轮台': 'LAR',
2238 | '辛集': 'ENP',
2239 | '辰清': 'CQB',
2240 | '辰溪': 'CXQ',
2241 | '辽中': 'LZD',
2242 | '辽源': 'LYL',
2243 | '辽阳': 'LYT',
2244 | '达家沟': 'DJT',
2245 | '达州': 'RXW',
2246 | '达拉特旗': 'DIC',
2247 | '达拉特西': 'DNC',
2248 | '迁安': 'QQP',
2249 | '迎宾路': 'YFW',
2250 | '迎春': 'YYB',
2251 | '运城': 'YNV',
2252 | '运城北': 'ABV',
2253 | '运粮河': 'YEF',
2254 | '进贤': 'JUG',
2255 | '进贤南': 'JXG',
2256 | '连云港': 'UIH',
2257 | '连云港东': 'UKH',
2258 | '连山关': 'LGT',
2259 | '连江': 'LKS',
2260 | '迤资': 'YQM',
2261 | '通化': 'THL',
2262 | '通化县': 'TXL',
2263 | '通北': 'TBB',
2264 | '通安驿': 'TAJ',
2265 | '通州西': 'TAP',
2266 | '通沟': 'TOL',
2267 | '通海': 'TAM',
2268 | '通辽': 'TLD',
2269 | '通远堡': 'TYT',
2270 | '通远堡西': 'TST',
2271 | '通途': 'TUT',
2272 | '通道': 'TRQ',
2273 | '遂宁': 'NIW',
2274 | '遂平': 'SON',
2275 | '遂溪': 'SXZ',
2276 | '道州': 'DFZ',
2277 | '道清': 'DML',
2278 | '遵义': 'ZIW',
2279 | '邓州': 'DOF',
2280 | '邢台': 'XTP',
2281 | '邢台东': 'EDP',
2282 | '那曲': 'NQO',
2283 | '那罗': 'ULZ',
2284 | '那铺': 'NPZ',
2285 | '邯郸': 'HDP',
2286 | '邯郸东': 'HPP',
2287 | '邳州': 'PJH',
2288 | '邵东': 'FIQ',
2289 | '邵家堂': 'SJJ',
2290 | '邵武': 'SWS',
2291 | '邵阳': 'SYQ',
2292 | '邵阳北': 'OVQ',
2293 | '邹城': 'ZIK',
2294 | '郁南': 'YKQ',
2295 | '郑州': 'ZZF',
2296 | '郑州东': 'ZAF',
2297 | '郑州西': 'XPF',
2298 | '郓城': 'YPK',
2299 | '郫县': 'PWW',
2300 | '郫县西': 'PCW',
2301 | '郭家店': 'GDT',
2302 | '郭磊庄': 'GLP',
2303 | '郯城': 'TZK',
2304 | '郴州': 'CZQ',
2305 | '郴州西': 'ICQ',
2306 | '都匀': 'RYW',
2307 | '都匀东': 'KJW',
2308 | '都格': 'DMM',
2309 | '都江堰': 'DDW',
2310 | '鄂尔多斯': 'EEC',
2311 | '鄂州': 'ECN',
2312 | '鄂州东': 'EFN',
2313 | '鄄城': 'JCK',
2314 | '鄯善': 'SSR',
2315 | '鄯善北': 'SMR',
2316 | '酉阳': 'AFW',
2317 | '酒泉': 'JQJ',
2318 | '酒泉南': 'JNJ',
2319 | '醴陵': 'LLG',
2320 | '醴陵东': 'UKQ',
2321 | '里木店': 'LMB',
2322 | '重庆': 'CQW',
2323 | '重庆北': 'CUW',
2324 | '重庆南': 'CRW',
2325 | '野三坡': 'AIP',
2326 | '金华': 'JBH',
2327 | '金华南': 'RNH',
2328 | '金坑': 'JKT',
2329 | '金城江': 'JJZ',
2330 | '金宝屯': 'JBD',
2331 | '金寨': 'JZH',
2332 | '金山北': 'EGH',
2333 | '金山屯': 'JTB',
2334 | '金州': 'JZT',
2335 | '金昌': 'JCJ',
2336 | '金月湾': 'PYQ',
2337 | '金杖子': 'JYD',
2338 | '金沟屯': 'VGP',
2339 | '金河': 'JHX',
2340 | '金银潭': 'JTN',
2341 | '金马村': 'JMM',
2342 | '钟家村': 'ZJY',
2343 | '钟山': 'ZSZ',
2344 | '钟山西': 'ZAZ',
2345 | '钟祥': 'ZTN',
2346 | '钦州': 'QRZ',
2347 | '钦州东': 'QDZ',
2348 | '铁力': 'TLB',
2349 | '铁厂': 'TCL',
2350 | '铁岭': 'TLT',
2351 | '铁岭西': 'PXT',
2352 | '铜仁': 'RDQ',
2353 | '铜仁南': 'TNW',
2354 | '铜陵': 'TJH',
2355 | '铜陵北': 'KXH',
2356 | '银川': 'YIJ',
2357 | '银浪': 'YJX',
2358 | '银滩': 'CTQ',
2359 | '银瓶': 'KPQ',
2360 | '锡林浩特': 'XTC',
2361 | '锦州': 'JZD',
2362 | '锦州南': 'JOD',
2363 | '锦河': 'JHB',
2364 | '镇城底': 'ZDV',
2365 | '镇安': 'ZEY',
2366 | '镇平': 'ZPF',
2367 | '镇江': 'ZJH',
2368 | '镇江南': 'ZEH',
2369 | '镇西': 'ZVT',
2370 | '镇赉': 'ZLT',
2371 | '镇远': 'ZUW',
2372 | '镜铁山': 'JVJ',
2373 | '长临河': 'FVH',
2374 | '长兴': 'CBH',
2375 | '长兴南': 'CFH',
2376 | '长农': 'CNJ',
2377 | '长冲': 'CCM',
2378 | '长坡岭': 'CPM',
2379 | '长垣': 'CYF',
2380 | '长城': 'CEJ',
2381 | '长寿': 'EFW',
2382 | '长寿北': 'COW',
2383 | '长寿湖': 'CSE',
2384 | '长山屯': 'CVT',
2385 | '长岭子': 'CLT',
2386 | '长庆桥': 'CQJ',
2387 | '长征': 'CZJ',
2388 | '长春': 'CCT',
2389 | '长春南': 'CET',
2390 | '长春西': 'CRT',
2391 | '长武': 'CWY',
2392 | '长汀': 'CES',
2393 | '长汀南': 'CNS',
2394 | '长汀镇': 'CDB',
2395 | '长沙': 'CSQ',
2396 | '长沙南': 'CWQ',
2397 | '长治': 'CZF',
2398 | '长治北': 'CBF',
2399 | '长甸': 'CDT',
2400 | '长葛': 'CEF',
2401 | '长阳': 'CYN',
2402 | '门源': 'MYO',
2403 | '闵集': 'MJN',
2404 | '闻喜': 'WXV',
2405 | '闻喜西': 'WOV',
2406 | '闽清': 'MQS',
2407 | '闽清北': 'MBS',
2408 | '阆中': 'LZE',
2409 | '阎良': 'YNY',
2410 | '阜南': 'FNH',
2411 | '阜宁': 'AKH',
2412 | '阜新南': 'FXD',
2413 | '阜阳': 'FYH',
2414 | '防城港北': 'FBZ',
2415 | '阳信': 'YVK',
2416 | '阳城': 'YNF',
2417 | '阳岔': 'YAL',
2418 | '阳平关': 'YAY',
2419 | '阳新': 'YON',
2420 | '阳明堡': 'YVV',
2421 | '阳春': 'YQQ',
2422 | '阳曲': 'YQV',
2423 | '阳朔': 'YCZ',
2424 | '阳泉': 'AQP',
2425 | '阳泉北': 'YPP',
2426 | '阳泉曲': 'YYV',
2427 | '阳澄湖': 'AIH',
2428 | '阳谷': 'YIK',
2429 | '阳邑': 'ARP',
2430 | '阳高': 'YOV',
2431 | '阿克苏': 'ASR',
2432 | '阿克陶': 'AER',
2433 | '阿南庄': 'AZM',
2434 | '阿图什': 'ATR',
2435 | '阿城': 'ACB',
2436 | '阿尔山': 'ART',
2437 | '阿尔山北': 'ARX',
2438 | '阿巴嘎旗': 'AQC',
2439 | '阿拉山口': 'AKR',
2440 | '阿木尔': 'JTX',
2441 | '阿里河': 'AHX',
2442 | '阿金': 'AJD',
2443 | '阿龙山': 'ASX',
2444 | '陆丰': 'LLQ',
2445 | '陆川': 'LKZ',
2446 | '陆良': 'LRM',
2447 | '陇南': 'INJ',
2448 | '陇县': 'LXY',
2449 | '陇西': 'LXJ',
2450 | '陈官营': 'CAJ',
2451 | '陈相屯': 'CXT',
2452 | '陵城': 'LGK',
2453 | '陵水': 'LIQ',
2454 | '陶家屯': 'TOT',
2455 | '陶赖昭': 'TPT',
2456 | '隆化': 'UHP',
2457 | '隆安东': 'IDZ',
2458 | '隆昌': 'LCW',
2459 | '隆昌北': 'NWW',
2460 | '随州': 'SZN',
2461 | '雁翅': 'YAP',
2462 | '雁荡山': 'YGH',
2463 | '集宁南': 'JAC',
2464 | '集安': 'JAL',
2465 | '雨格': 'VTM',
2466 | '零陵': 'UWZ',
2467 | '雷州': 'UAQ',
2468 | '霍尔果斯': 'HFR',
2469 | '霍州': 'HZV',
2470 | '霍州东': 'HWV',
2471 | '霍林郭勒': 'HWD',
2472 | '霍邱': 'FBH',
2473 | '霞浦': 'XOS',
2474 | '露水河': 'LUL',
2475 | '霸州': 'RMP',
2476 | '霸州西': 'FOP',
2477 | '青县': 'QXP',
2478 | '青城山': 'QSW',
2479 | '青堆': 'QET',
2480 | '青山': 'QSB',
2481 | '青岛': 'QDK',
2482 | '青岛北': 'QHK',
2483 | '青州市': 'QZK',
2484 | '青田': 'QVH',
2485 | '青白江东': 'QFW',
2486 | '青神': 'QVW',
2487 | '青莲': 'QEW',
2488 | '青铜峡': 'QTJ',
2489 | '青龙': 'QIB',
2490 | '青龙山': 'QGH',
2491 | '靖宇': 'JYL',
2492 | '靖州': 'JEQ',
2493 | '靖西': 'JMZ',
2494 | '靖边': 'JIY',
2495 | '靖远': 'JYJ',
2496 | '靖远西': 'JXJ',
2497 | '静海': 'JHP',
2498 | '革居': 'GEM',
2499 | '革镇堡': 'GZT',
2500 | '鞍山': 'AST',
2501 | '鞍山西': 'AXT',
2502 | '韦庄': 'WZY',
2503 | '韩城': 'HCY',
2504 | '韩府湾': 'HXJ',
2505 | '韩麻营': 'HYP',
2506 | '韶关': 'SNQ',
2507 | '韶关东': 'SGQ',
2508 | '韶山': 'SSQ',
2509 | '韶山南': 'INQ',
2510 | '项城': 'ERN',
2511 | '顺义': 'SOP',
2512 | '顺德': 'ORQ',
2513 | '顺德学院': 'OJQ',
2514 | '顺昌': 'SCS',
2515 | '额济纳': 'EJC',
2516 | '风陵渡': 'FLV',
2517 | '饶平': 'RVQ',
2518 | '饶阳': 'RVP',
2519 | '首山': 'SAT',
2520 | '香兰': 'XNB',
2521 | '香坊': 'XFB',
2522 | '香樟路': 'FNQ',
2523 | '马三家': 'MJT',
2524 | '马兰': 'MLR',
2525 | '马林': 'MID',
2526 | '马桥河': 'MQB',
2527 | '马皇': 'MHZ',
2528 | '马莲河': 'MHB',
2529 | '马鞍山': 'MAH',
2530 | '马鞍山东': 'OMH',
2531 | '马龙': 'MGM',
2532 | '驻马店': 'ZDN',
2533 | '驻马店西': 'ZLN',
2534 | '驼腰岭': 'TIL',
2535 | '骆驼巷': 'LTJ',
2536 | '高台': 'GTJ',
2537 | '高台南': 'GAJ',
2538 | '高各庄': 'GGP',
2539 | '高安': 'GCG',
2540 | '高密': 'GMK',
2541 | '高山子': 'GSD',
2542 | '高州': 'GSQ',
2543 | '高平': 'GPF',
2544 | '高村': 'GCV',
2545 | '高桥镇': 'GZD',
2546 | '高楼房': 'GFM',
2547 | '高滩': 'GAY',
2548 | '高碑店': 'GBP',
2549 | '高碑店东': 'GMP',
2550 | '高邑': 'GIP',
2551 | '高邑西': 'GNP',
2552 | '魏善庄': 'WSP',
2553 | '魏杖子': 'WKD',
2554 | '鲁山': 'LAF',
2555 | '鲁番': 'LVM',
2556 | '鲅鱼圈': 'BYT',
2557 | '鲘门': 'KMQ',
2558 | '鳌江': 'ARH',
2559 | '鸡东': 'JOB',
2560 | '鸡冠山': 'JST',
2561 | '鸡西': 'JXB',
2562 | '鸭园': 'YYL',
2563 | '鸳鸯镇': 'YYJ',
2564 | '鹤北': 'HMB',
2565 | '鹤壁': 'HAF',
2566 | '鹤壁东': 'HFF',
2567 | '鹤岗': 'HGB',
2568 | '鹤庆': 'HQM',
2569 | '鹤立': 'HOB',
2570 | '鹰手营子': 'YIP',
2571 | '鹰潭': 'YTG',
2572 | '鹰潭北': 'YKG',
2573 | '鹿寨': 'LIZ',
2574 | '鹿寨北': 'LSZ',
2575 | '鹿道': 'LDL',
2576 | '麦园': 'MYS',
2577 | '麻城': 'MCN',
2578 | '麻城北': 'MBN',
2579 | '麻尾': 'VAW',
2580 | '麻山': 'MAB',
2581 | '麻阳': 'MVQ',
2582 | '黄冈': 'KGN',
2583 | '黄冈东': 'KAN',
2584 | '黄冈西': 'KXN',
2585 | '黄口': 'KOH',
2586 | '黄山': 'HKH',
2587 | '黄山北': 'NYH',
2588 | '黄州': 'VON',
2589 | '黄村': 'HCP',
2590 | '黄松甸': 'HDL',
2591 | '黄柏': 'HBL',
2592 | '黄梅': 'VEH',
2593 | '黄河景区': 'HCF',
2594 | '黄泥河': 'HHL',
2595 | '黄流': 'KLQ',
2596 | '黄瓜园': 'HYM',
2597 | '黄石': 'HSN',
2598 | '黄石东': 'OSN',
2599 | '黄石北': 'KSN',
2600 | '黄羊滩': 'HGJ',
2601 | '黄羊镇': 'HYJ',
2602 | '黄花筒': 'HUD',
2603 | '黄陵': 'ULY',
2604 | '黄陵南': 'VLY',
2605 | '黎城': 'UCP',
2606 | '黎塘': 'LTZ',
2607 | '黑井': 'HIM',
2608 | '黑冲滩': 'HCJ',
2609 | '黑台': 'HQB',
2610 | '黑旺': 'HWK',
2611 | '黑水': 'HOT',
2612 | '黑河': 'HJB',
2613 | '黔江': 'QNW',
2614 | '鼎湖东': 'UWQ',
2615 | '鼎湖山': 'NVQ',
2616 | '齐齐哈尔': 'QHX',
2617 | '齐齐哈尔南': 'QNB',
2618 | '龙丰': 'KFQ',
2619 | '龙井': 'LJL',
2620 | '龙华': 'LHP',
2621 | '龙南': 'UNG',
2622 | '龙嘉': 'UJL',
2623 | '龙塘坝': 'LBM',
2624 | '龙山镇': 'LAS',
2625 | '龙岩': 'LYS',
2626 | '龙川': 'LUQ',
2627 | '龙市': 'LAG',
2628 | '龙江': 'LJX',
2629 | '龙沟': 'LGJ',
2630 | '龙泉寺': 'UQJ',
2631 | '龙洞堡': 'FVW',
2632 | '龙游': 'LMH',
2633 | '龙爪沟': 'LZT',
2634 | '龙里': 'LLW',
2635 | '龙里北': 'KFW',
2636 | '龙镇': 'LZA',
2637 | '龙骨甸': 'LGM'}
2638 |
--------------------------------------------------------------------------------
/tickets/tickets.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | """命令行火车票查看工具
4 |
5 | Usage:
6 | tickets [-gdtkz]
7 |
8 | options:
9 | -h,--help 显示帮助菜单
10 | -g 高铁
11 | -d 动车
12 | -t 特快
13 | -k 快速
14 | -z 直达
15 |
16 | Example:
17 | tickets 北京 重庆 2017-03-26
18 | tickets -dg 成都 重庆 2017-03-26
19 |
20 | """
21 | import requests
22 |
23 | from docopt import docopt
24 |
25 | from prettytable import PrettyTable
26 |
27 | from stations import stations
28 |
29 | # The 12306 API is queried over HTTPS with verify=False, which triggers warnings; suppress them below.
30 | from requests.packages.urllib3.exceptions import InsecureRequestWarning,InsecurePlatformWarning
31 | requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
32 | requests.packages.urllib3.disable_warnings(InsecurePlatformWarning)
33 |
34 | class TrainsResult:
35 |
36 | header = '车次 车站 时间 历时 一等 二等 软卧 硬卧 硬座 无座'.split()
37 |
38 | def __init__(self, available_trains, options):
39 |
40 | self.available_trains = available_trains
41 |
42 | self.options = options
43 |
44 | def _get_duration(self, raw_train):
45 |
46 | duration = raw_train.get('lishi').replace(':', '小时') + '分'
47 |
48 | if duration.startswith('00'):
49 | return duration[4:]
50 |
51 | if duration.startswith('0'):
52 | return duration[1:]
53 |
54 | return duration
55 |
56 | @property
57 | def trains(self):
58 |
59 | for raw_train in self.available_trains:
60 |
61 | raw_train = raw_train['queryLeftNewDTO']
62 |
63 | train_code = raw_train['station_train_code']
64 |
65 | initial = train_code[0].lower()
66 |
67 | if not self.options or initial in self.options:
68 |
69 | train = [
70 | train_code,
71 |
72 | '\n'.join([raw_train['from_station_name'],
73 | raw_train['to_station_name']]),
74 |
75 | '\n'.join([raw_train['start_time'],
76 | raw_train['arrive_time']]),
77 |
78 | self._get_duration(raw_train),
79 |
80 | raw_train['zy_num'],
81 | raw_train['ze_num'],
82 | raw_train['rw_num'],
83 | raw_train['yw_num'],
84 | raw_train['yz_num'],
85 | raw_train['wz_num']
86 | ]
87 |
88 | yield train
89 |
90 | def pretty_print(self):
91 |
92 | pt = PrettyTable()
93 |
94 |         pt.field_names = self.header
95 |
96 | for train in self.trains:
97 | pt.add_row(train)
98 |
99 | print(pt)
100 |
101 | def command():
102 |
103 | arguments = docopt(__doc__)
104 |
105 | #print(arguments)
106 |
107 |     from_sta = stations.get(arguments['<from>'])
108 |
109 |     to_sta = stations.get(arguments['<to>'])
110 |
111 |     date = arguments['<date>']
112 |
113 | url = 'https://kyfw.12306.cn/otn/leftTicket/query?leftTicketDTO.train_date={}&leftTicketDTO.from_station={}&leftTicketDTO.to_station={}&purpose_codes=ADULT'.format(date, from_sta, to_sta)
114 |
115 | resp = requests.get(url, verify=False)
116 |
117 | #print(resp.json())
118 |
119 | options = ''.join([
120 | key for key, value in arguments.items() if value is True
121 | ])
122 |
123 | available_trains = resp.json()['data']
124 |
125 | TrainsResult(available_trains, options).pretty_print()
126 |
127 | if __name__ == '__main__':
128 |
129 | command()
130 |
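131 | # Example invocation (requires the docopt, requests and prettytable packages):
132 | #
133 | #     python tickets.py 北京 重庆 2017-03-26
134 | #     python tickets.py -dg 成都 重庆 2017-03-26
135 | #
136 | # The parsing above assumes the 12306 response looks like
137 | # {'data': [{'queryLeftNewDTO': {...}}, ...]}; the endpoint and field names have
138 | # changed over time, so TrainsResult may need adjusting if the API differs.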
--------------------------------------------------------------------------------
/weather/local_weather.txt:
--------------------------------------------------------------------------------
1 | city:通辽
2 |
3 | data:03-05 day:多云(-2°C ) night:多云( -12°C)
4 |
5 | data:03-06 day:多云(-2°C ) night:多云( -11°C)
6 |
7 | data:03-07 day:晴(2°C ) night:多云( -9°C)
8 |
9 | data:03-08 day:晴(5°C ) night:晴( -6°C)
10 |
11 | data:03-09 day:晴(5°C ) night:晴( -6°C)
12 |
13 | data:03-10 day:多云(9°C ) night:晴( -3°C)
14 |
15 | data:03-11 day:多云(8°C ) night:多云( -3°C)
16 |
17 | data:03-12 day:晴(7°C ) night:晴( -4°C)
18 |
19 | data:03-13 day:少云(8°C ) night:晴( -4°C)
20 |
21 | data:03-14 day:局部多云(8°C ) night:少云( -3°C)
22 |
23 |
--------------------------------------------------------------------------------
/weather/requirements.txt:
--------------------------------------------------------------------------------
1 | scrapy
2 | beautifulsoup4
3 |
--------------------------------------------------------------------------------
/weather/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = weather.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = weather
12 |
--------------------------------------------------------------------------------
/weather/wea.json:
--------------------------------------------------------------------------------
1 | [
2 | {"date": ["03-04", "03-05", "03-06", "03-07", "03-08", "03-09", "03-10", "03-11", "03-12", "03-13"], "city": ["\u676d\u5dde"], "dayDesc": ["\u591a\u4e91", "\u5c0f\u96e8", "\u5c0f\u96e8", "\u9634", "\u9634", "\u591a\u4e91", "\u591a\u4e91", "\u6674", "\u6674", "\u6674", "\u591a\u4e91", "\u591a\u4e91", "\u591a\u4e91", "\u5c0f\u96e8", "\u96f6\u6563\u9635\u96e8", "\u9634\u5929", "\u9634\u5929", "\u9635\u96e8", "\u96e8", "\u9635\u96e8"], "dayTemp": ["18\u00b0C / 10\u00b0C", "12\u00b0C / 7\u00b0C", "12\u00b0C / 4\u00b0C", "12\u00b0C / 4\u00b0C", "17\u00b0C / 4\u00b0C", "18\u00b0C / 7\u00b0C", "20\u00b0C / 10\u00b0C", "16\u00b0C / 10\u00b0C", "17\u00b0C / 9\u00b0C", "15\u00b0C / 8\u00b0C"]}
3 | ]
--------------------------------------------------------------------------------
/weather/weather/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yyyy777/crawler/98f0c1a129b3b5b77fe88971f4f0c6aae5a8964f/weather/weather/__init__.py
--------------------------------------------------------------------------------
/weather/weather/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class WeatherItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 |
15 | city = scrapy.Field()
16 | date = scrapy.Field()
17 | dayDesc = scrapy.Field()
18 | dayTemp = scrapy.Field()
19 |
20 | pass
21 |
22 |
--------------------------------------------------------------------------------
/weather/weather/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class WeatherSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 |     def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 |     def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 |     def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 |     def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/weather/weather/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | class WeatherPipeline(object):
9 |
10 | header = '日期 白天 晚上'.split()
11 |
12 | def __init__(self):
13 |
14 | pass
15 |
16 |     def process_item(self, item, spider):
17 |
18 |         # Write the forecast as plain UTF-8 text, one block per day.
19 |         with open('local_weather.txt', 'w', encoding='utf-8') as f:
20 |
21 |             f.write('city:' + item['city'] + '\n\n')
22 |
23 |             date = item['date']
24 |
25 |             desc = item['dayDesc']
26 |
27 |             # The scraped descriptions alternate, so split them into
28 |             # day and night lists.
29 |             dayDesc = desc[1::2]
30 |
31 |             nightDesc = desc[0::2]
32 |
33 |             dayTemp = item['dayTemp']
34 |
35 |             for d, dd, nd, temp in zip(date, dayDesc, nightDesc, dayTemp):
36 |
37 |                 ta = temp.split('/')
38 |
39 |                 # When crawling at night, the daytime temperature of the
40 |                 # current day is missing, so only one value is present.
41 |                 if len(ta) == 1:
42 |                     dt = 'None'
43 |                     nt = ta[0]
44 |                 else:
45 |                     dt = ta[0]
46 |                     nt = ta[1]
47 |
48 |                 f.write('data:{0}\t\tday:{1}({2})\t\tnight:{3}({4})\n\n'.format(
49 |                     d, dd, dt, nd, nt))
50 |
51 |         return item
52 |
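53 | # Expected item shape (cf. the sample dump in wea.json):
54 | #   city:    '杭州'                          a single city name
55 | #   date:    ['03-04', '03-05', ...]         one entry per forecast day
56 | #   dayDesc: ['多云', '小雨', ...]            day/night descriptions, interleaved
57 | #                                            (see the slicing above)
58 | #   dayTemp: ['18°C / 10°C', ...]            'day / night', or one value at night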
--------------------------------------------------------------------------------
/weather/weather/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for weather project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'weather'
13 |
14 | SPIDER_MODULES = ['weather.spiders']
15 | NEWSPIDER_MODULE = 'weather.spiders'
16 |
17 |
18 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
19 | #USER_AGENT = 'weather (+http://www.yourdomain.com)'
20 | USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
21 |
22 | # Obey robots.txt rules
23 | ROBOTSTXT_OBEY = True
24 |
25 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
26 | #CONCURRENT_REQUESTS = 32
27 |
28 | # Configure a delay for requests for the same website (default: 0)
29 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
30 | # See also autothrottle settings and docs
31 | #DOWNLOAD_DELAY = 3
32 | # The download delay setting will honor only one of:
33 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
34 | #CONCURRENT_REQUESTS_PER_IP = 16
35 |
36 | # Disable cookies (enabled by default)
37 | #COOKIES_ENABLED = False
38 |
39 | # Disable Telnet Console (enabled by default)
40 | #TELNETCONSOLE_ENABLED = False
41 |
42 | # Override the default request headers:
43 | DEFAULT_REQUEST_HEADERS = {
44 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
45 | # 'Accept-Language': 'en',
46 | 'Referer':'http://www.weibo.com'
47 | }
48 |
49 | # Enable or disable spider middlewares
50 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
51 | #SPIDER_MIDDLEWARES = {
52 | # 'weather.middlewares.WeatherSpiderMiddleware': 543,
53 | #}
54 |
55 | # Enable or disable downloader middlewares
56 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
57 | #DOWNLOADER_MIDDLEWARES = {
58 | # 'weather.middlewares.MyCustomDownloaderMiddleware': 543,
59 | #}
60 |
61 | # Enable or disable extensions
62 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
63 | #EXTENSIONS = {
64 | # 'scrapy.extensions.telnet.TelnetConsole': None,
65 | #}
66 |
67 | # Configure item pipelines
68 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
69 | ITEM_PIPELINES = {
70 | 'weather.pipelines.WeatherPipeline': 300,
71 | }
72 |
73 | # Enable and configure the AutoThrottle extension (disabled by default)
74 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
75 | #AUTOTHROTTLE_ENABLED = True
76 | # The initial download delay
77 | #AUTOTHROTTLE_START_DELAY = 5
78 | # The maximum download delay to be set in case of high latencies
79 | #AUTOTHROTTLE_MAX_DELAY = 60
80 | # The average number of requests Scrapy should be sending in parallel to
81 | # each remote server
82 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
83 | # Enable showing throttling stats for every response received:
84 | #AUTOTHROTTLE_DEBUG = False
85 |
86 | # Enable and configure HTTP caching (disabled by default)
87 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
88 | #HTTPCACHE_ENABLED = True
89 | #HTTPCACHE_EXPIRATION_SECS = 0
90 | #HTTPCACHE_DIR = 'httpcache'
91 | #HTTPCACHE_IGNORE_HTTP_CODES = []
92 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
93 |
--------------------------------------------------------------------------------
/weather/weather/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/weather/weather/spiders/localweather.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | import scrapy
4 |
5 | from weather.items import WeatherItem
6 |
7 | from bs4 import BeautifulSoup
8 |
9 | class WeatherSpider(scrapy.Spider):
10 |
11 | name = "localweather"
12 |
13 | #allowed_domains = ["sina.com.cn"]
14 |
15 | start_urls = ['http://weather.sina.com.cn/']
16 |
17 | def parse(self, response):
18 |
19 | item = WeatherItem()
20 |
21 | resp = response.body
22 |
23 | soup = BeautifulSoup(resp, "html5lib")
24 |
25 | itemTemp = {}
26 |
27 | itemTemp['city'] = soup.find(id='slider_ct_name')
28 |
29 | tenDay = soup.find(id='blk_fc_c0_scroll')
30 | #print tenDay
31 |
32 | itemTemp['date'] = tenDay.findAll("p", class_="wt_fc_c0_i_date")
33 |
34 | itemTemp['dayDesc'] = tenDay.findAll("img", class_="icons0_wt")
35 |
36 | itemTemp['dayTemp'] = tenDay.findAll("p", class_="wt_fc_c0_i_temp")
37 |
38 | for att in itemTemp:
39 |
40 | item[att] = []
41 |
42 | if att == 'city':
43 | item[att] = itemTemp.get(att).text
44 | continue
45 |
46 | for obj in itemTemp.get(att):
47 |
48 | if att == 'dayDesc':
49 | item[att].append(obj['title'])
50 |
51 | else:
52 | item[att].append(obj.text)
53 |
54 | return item
55 |
56 |
57 |
58 |
59 |
60 |
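61 | # Parsing assumptions (based on the sina weather page layout at the time of writing):
62 | #   - the element with id 'slider_ct_name' holds the city name
63 | #   - the element with id 'blk_fc_c0_scroll' holds the multi-day forecast
64 | #   - the title attributes of the 'icons0_wt' <img> tags carry the weather descriptions
65 | #   - the 'wt_fc_c0_i_temp' <p> tags carry the temperatures ('day / night')
66 | # If sina changes the page structure, these selectors will need to be updated.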
--------------------------------------------------------------------------------
/wechat/crawl_wechat.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 |
3 | from bs4 import BeautifulSoup
4 |
5 | from selenium import webdriver
6 |
7 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
8 |
9 | import pdfkit
10 |
11 | import os
12 |
13 | import sys
14 |
15 | dcap = dict(DesiredCapabilities.PHANTOMJS)
16 |
17 | dcap["phantomjs.page.settings.userAgent"] = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393")
18 |
19 | js2 = 'window.scrollTo(0, document.body.scrollHeight)'
20 |
21 | class crawl_wechat:
22 |
23 | def __init__(self, url):
24 |
25 | self.url = url
26 |
27 | self.old_scroll_height = 0
28 |
29 | def getList(self):
30 |
31 | driver = webdriver.PhantomJS(desired_capabilities=dcap)
32 |
33 | driver.get(self.url)
34 |
35 |         # Keep scrolling while the "no more content" marker is hidden (at most 10 times) so the full history loads.
36 |         for i in range(10):
37 |             soup = BeautifulSoup(driver.page_source, 'html5lib')
38 |             if soup.find('div', class_="more_wrapper no_more").get("style") == 'display:none':
39 |                 driver.execute_script(js2)
40 |
41 | resp = BeautifulSoup(driver.page_source, 'html5lib')
42 | msg_list = []
43 | msg_cover = resp.find_all("div", class_="msg_cover")
44 |
45 | for href in msg_cover:
46 | if href.get("hrefs") is not None:
47 | msg_list.append(href.get("hrefs"))
48 | else:
49 | msg_cover_redirect = resp.find_all("a",class_="cover_appmsg_link_box redirect")
50 | for tmp in msg_cover_redirect:
51 | msg_list.append(tmp.get("hrefs"))
52 |
53 | sub_msg = resp.find_all("h4", class_="flex_msg_title msg_title")
54 |
55 | for sub_href in sub_msg:
56 | msg_list.append(sub_href.get("hrefs"))
57 |
58 | print(msg_list)
59 |
60 |
61 | if __name__ == '__main__':
62 |
63 | key = sys.argv[1]
64 |
65 | wechat_url = 'https://mp.weixin.qq.com/mp/getmasssendmsg?__biz=MzA3NDk1NjI0OQ==&uin=MjgxMTU0NDM1&key={0}&devicetype=Windows+10&version=6203005d&lang=zh_CN&ascene=7&pass_ticket=vbFYPkG%2FXKNQwJgsf2AF6LH3gE3ceAEvtzrNPxFswjfdlxJ5b5BYLTzxg4iitkHG'.format(key)
66 |
67 | wechat = crawl_wechat(wechat_url)
68 |
69 | wechat.getList()
70 |
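71 | # Caveats:
72 | # - PhantomJS support has been removed from recent Selenium releases; a headless
73 | #   Chrome driver is the usual replacement, roughly:
74 | #       from selenium.webdriver.chrome.options import Options
75 | #       opts = Options()
76 | #       opts.add_argument('--headless')
77 | #       driver = webdriver.Chrome(options=opts)
78 | # - The 'key' and 'pass_ticket' parameters in wechat_url expire quickly, so a fresh
79 | #   value usually has to be captured (e.g. with a packet-capture proxy) before each run.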
--------------------------------------------------------------------------------
/zhihu/entrypoint.py:
--------------------------------------------------------------------------------
1 | '''
2 | Run the 'zhihuuser' spider from this file so it can be launched and debugged from PyCharm.
3 | '''
4 |
5 | from scrapy.cmdline import execute
6 | execute(['scrapy', 'crawl', 'zhihuuser'])
--------------------------------------------------------------------------------
/zhihu/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html
5 |
6 | [settings]
7 | default = zhihu.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = zhihu
12 |
--------------------------------------------------------------------------------
/zhihu/zhihu/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/yyyy777/crawler/98f0c1a129b3b5b77fe88971f4f0c6aae5a8964f/zhihu/zhihu/__init__.py
--------------------------------------------------------------------------------
/zhihu/zhihu/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class ZhihuUserItem(scrapy.Item):
12 | # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 | id = scrapy.Field()
15 | name = scrapy.Field()
16 | avatar_url = scrapy.Field()
17 | headline = scrapy.Field()
18 | description = scrapy.Field()
19 | url = scrapy.Field()
20 | url_token = scrapy.Field()
21 | gender = scrapy.Field()
22 | cover_url = scrapy.Field()
23 | type = scrapy.Field()
24 | badge = scrapy.Field()
25 |
26 | answer_count = scrapy.Field()
27 | articles_count = scrapy.Field()
28 | commercial_question_count = scrapy.Field()
29 | favorite_count = scrapy.Field()
30 | favorited_count = scrapy.Field()
31 | follower_count = scrapy.Field()
32 | following_columns_count = scrapy.Field()
33 | following_count = scrapy.Field()
34 | pins_count = scrapy.Field()
35 | question_count = scrapy.Field()
36 | thank_from_count = scrapy.Field()
37 | thank_to_count = scrapy.Field()
38 | thanked_count = scrapy.Field()
39 | vote_from_count = scrapy.Field()
40 | vote_to_count = scrapy.Field()
41 | voteup_count = scrapy.Field()
42 | following_favlists_count = scrapy.Field()
43 | following_question_count = scrapy.Field()
44 | following_topic_count = scrapy.Field()
45 | marked_answers_count = scrapy.Field()
46 | mutual_followees_count = scrapy.Field()
47 | hosted_live_count = scrapy.Field()
48 | participated_live_count = scrapy.Field()
49 |
50 | locations = scrapy.Field()
51 | educations = scrapy.Field()
52 | employments = scrapy.Field()
53 |
--------------------------------------------------------------------------------
/zhihu/zhihu/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class ZhihuSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 |     def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 |     def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 |     def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 |     def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
--------------------------------------------------------------------------------
/zhihu/zhihu/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | import pymongo
9 |
10 |
11 | class ZhihuPipeline(object):
12 |
13 | def process_item(self, item, spider):
14 | return item
15 |
16 |
17 | class MongoPipeline(object):
18 | collection_name = 'users'
19 |
20 | def __init__(self, mongo_uri, mongo_db):
21 | self.mongo_uri = mongo_uri
22 | self.mongo_db = mongo_db
23 |
24 | @classmethod
25 | def from_crawler(cls, crawler):
26 | return cls(
27 | mongo_uri=crawler.settings.get('MONGO_URI'),
28 | mongo_db=crawler.settings.get('MONGO_DATABASE')
29 | )
30 |
31 | def open_spider(self, spider):
32 | self.client = pymongo.MongoClient(self.mongo_uri)
33 | self.db = self.client[self.mongo_db]
34 |
35 | def close_spider(self, spider):
36 | self.client.close()
37 |
38 | def process_item(self, item, spider):
39 |         self.db[self.collection_name].replace_one(
40 |             {'url_token': item['url_token']}, dict(item), upsert=True)
41 | return item
42 |
--------------------------------------------------------------------------------
/zhihu/zhihu/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for zhihu project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # http://doc.scrapy.org/en/latest/topics/settings.html
9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'zhihu'
13 |
14 | SPIDER_MODULES = ['zhihu.spiders']
15 | NEWSPIDER_MODULE = 'zhihu.spiders'
16 |
17 | SCHEDULER = "scrapy_redis.scheduler.Scheduler"
18 | DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
19 | REDIS_URL = 'redis://root:123456a@127.0.0.1:6379'
20 |
21 | #SCHEDULER_QUEUE_CLASS = "scrapy_redis.queue.SpiderQueue"
22 | #SCHEDULER_PERSIST = True
23 |
24 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
25 | #USER_AGENT = 'zhihu (+http://www.yourdomain.com)'
26 |
27 | # Obey robots.txt rules
28 | ROBOTSTXT_OBEY = False
29 |
30 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
31 | #CONCURRENT_REQUESTS = 32
32 |
33 | # Configure a delay for requests for the same website (default: 0)
34 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
35 | # See also autothrottle settings and docs
36 | #DOWNLOAD_DELAY = 3
37 | # The download delay setting will honor only one of:
38 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16
39 | #CONCURRENT_REQUESTS_PER_IP = 16
40 |
41 | # Disable cookies (enabled by default)
42 | #COOKIES_ENABLED = False
43 |
44 | # Disable Telnet Console (enabled by default)
45 | #TELNETCONSOLE_ENABLED = False
46 |
47 | # Override the default request headers:
48 | DEFAULT_REQUEST_HEADERS = {
49 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
50 | # 'Accept-Language': 'en',
51 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
52 | 'authorization': 'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
53 | }
54 |
55 | # Enable or disable spider middlewares
56 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
57 | #SPIDER_MIDDLEWARES = {
58 | # 'zhihu.middlewares.ZhihuSpiderMiddleware': 543,
59 | #}
60 |
61 | # Enable or disable downloader middlewares
62 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
63 | #DOWNLOADER_MIDDLEWARES = {
64 | # 'zhihu.middlewares.MyCustomDownloaderMiddleware': 543,
65 | #}
66 |
67 | # Enable or disable extensions
68 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
69 | #EXTENSIONS = {
70 | # 'scrapy.extensions.telnet.TelnetConsole': None,
71 | #}
72 |
73 | # Configure item pipelines
74 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
75 | ITEM_PIPELINES = {
76 | 'zhihu.pipelines.MongoPipeline': 300,
77 | 'scrapy_redis.pipelines.RedisPipeline': 400,
78 | }
79 |
80 | # Enable and configure the AutoThrottle extension (disabled by default)
81 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html
82 | #AUTOTHROTTLE_ENABLED = True
83 | # The initial download delay
84 | #AUTOTHROTTLE_START_DELAY = 5
85 | # The maximum download delay to be set in case of high latencies
86 | #AUTOTHROTTLE_MAX_DELAY = 60
87 | # The average number of requests Scrapy should be sending in parallel to
88 | # each remote server
89 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
90 | # Enable showing throttling stats for every response received:
91 | #AUTOTHROTTLE_DEBUG = False
92 |
93 | # Enable and configure HTTP caching (disabled by default)
94 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
95 | HTTPCACHE_ENABLED = True
96 | HTTPCACHE_EXPIRATION_SECS = 0
97 | HTTPCACHE_DIR = 'httpcache'
98 | HTTPCACHE_IGNORE_HTTP_CODES = []
99 | HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
100 |
101 | MONGO_URI = 'localhost'
102 | MONGO_DATABASE = 'zhihu'
103 |
104 | DOWNLOAD_DELAY = 1
--------------------------------------------------------------------------------
/zhihu/zhihu/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/zhihu/zhihu/spiders/zhihu.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import json
3 | from scrapy import Spider, Request
4 | from scrapy_redis.spiders import RedisSpider
5 |
6 | from zhihu.items import ZhihuUserItem
7 |
8 |
9 | class ZhihuUserSpider(RedisSpider):
10 | name = 'zhihuuser'
11 | allowed_domains = ["www.zhihu.com"]
12 | redis_key = "zhihuuser:start_urls"
13 | user_url = 'https://www.zhihu.com/api/v4/members/{user}?include={include}'
14 | follows_url = 'https://www.zhihu.com/api/v4/members/{user}/followees?include={include}&offset={offset}&limit={limit}'
15 | followers_url = 'https://www.zhihu.com/api/v4/members/{user}/followers?include={include}&offset={offset}&limit={limit}'
16 |     start_user = 'excited-vczh'  # "轮子哥", a heavily-followed account used as the crawl seed
17 | user_query = 'locations,employments,gender,educations,business,voteup_count,thanked_Count,follower_count,following_count,cover_url,following_topic_count,following_question_count,following_favlists_count,following_columns_count,answer_count,articles_count,pins_count,question_count,commercial_question_count,favorite_count,favorited_count,logs_count,marked_answers_count,marked_answers_text,message_thread_token,account_status,is_active,is_force_renamed,is_bind_sina,sina_weibo_url,sina_weibo_name,show_sina_weibo,is_blocking,is_blocked,is_following,is_followed,mutual_followees_count,vote_to_count,vote_from_count,thank_to_count,thank_from_count,thanked_count,description,hosted_live_count,participated_live_count,allow_message,industry_category,org_name,org_homepage,badge[?(type=best_answerer)].topics'
18 | follows_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
19 | followers_query = 'data[*].answer_count,articles_count,gender,follower_count,is_followed,is_following,badge[?(type=best_answerer)].topics'
20 |
21 | def start_requests(self):
22 | '''
23 |         Fetch the seed user first, then recursively crawl the followee and
24 |         follower lists of every user encountered.
25 | '''
26 | yield Request(self.user_url.format(user=self.start_user, include=self.user_query), self.parse_user)
27 | yield Request(self.follows_url.format(user=self.start_user, include=self.follows_query, limit=20, offset=0),
28 | self.parse_follows)
29 | yield Request(self.followers_url.format(user=self.start_user, include=self.followers_query, limit=20, offset=0),
30 | self.parse_followers)
31 |
32 | def parse_user(self, response):
33 | result = json.loads(response.text)
34 | item = ZhihuUserItem()
35 |
36 | for field in item.fields:
37 | if field in result.keys():
38 | item[field] = result.get(field)
39 | yield item
40 |
41 | yield Request(
42 | self.follows_url.format(user=result.get('url_token'), include=self.follows_query, limit=20, offset=0),
43 | self.parse_follows)
44 |
45 | yield Request(
46 | self.followers_url.format(user=result.get('url_token'), include=self.followers_query, limit=20, offset=0),
47 | self.parse_followers)
48 |
49 | def parse_follows(self, response):
50 | results = json.loads(response.text)
51 |
52 | if 'data' in results.keys():
53 | for result in results.get('data'):
54 | yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
55 | self.parse_user)
56 |
57 |         if 'paging' in results.keys() and results.get('paging').get('is_end') is False:
58 | next_page = results.get('paging').get('next')
59 | yield Request(next_page,
60 | self.parse_follows)
61 |
62 | def parse_followers(self, response):
63 | results = json.loads(response.text)
64 |
65 | if 'data' in results.keys():
66 | for result in results.get('data'):
67 | yield Request(self.user_url.format(user=result.get('url_token'), include=self.user_query),
68 | self.parse_user)
69 |
70 |         if 'paging' in results.keys() and results.get('paging').get('is_end') is False:
71 | next_page = results.get('paging').get('next')
72 | yield Request(next_page,
73 | self.parse_followers)
74 |
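75 | # Crawl strategy: the seed user is fetched first, then every discovered url_token is
76 | # expanded through its followee/follower lists, paginating via the API's 'paging.next'
77 | # URL until 'is_end' is reached. Request scheduling and de-duplication go through
78 | # scrapy_redis (see SCHEDULER / DUPEFILTER_CLASS in settings.py), and items are
79 | # upserted into MongoDB keyed on url_token by MongoPipeline.
80 | # Note: since start_requests() is defined here, it overrides the RedisSpider default
81 | # that would otherwise pull seed URLs from the 'zhihuuser:start_urls' Redis list.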
--------------------------------------------------------------------------------