├── .gitignore ├── Dockerfile ├── README.md ├── docker-compose.yml ├── docs ├── poi_v2.sql ├── system.png └── urls.sql ├── preview ├── 1.png ├── 10.png ├── 11.png ├── 12.png ├── 13.png ├── 2.png ├── 3.png ├── 4.png ├── 5.png ├── 6.png ├── 7.png ├── 8.png └── 9.png ├── requirements.txt └── src ├── __init__.py ├── common ├── __init__.py ├── cache.py ├── chameleon.py ├── httper.py ├── models.py ├── pyecho.py └── utils.py ├── ctrl_citylist.py ├── ctrl_poidetail.py ├── ctrl_poilist.py ├── ctrl_portal.py ├── main_citylist.py ├── main_poidetail.py ├── main_poilist.py └── main_portal.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Compiled Object files, Static and Dynamic libs (Shared Objects) 2 | *.o 3 | *.a 4 | *.so 5 | 6 | # Folders 7 | _obj 8 | _test 9 | __pycache__ 10 | 11 | # Architecture specific extensions/prefixes 12 | *.[568vq] 13 | [568vq].out 14 | 15 | *.cgo1.go 16 | *.cgo2.c 17 | _cgo_defun.c 18 | _cgo_gotypes.go 19 | _cgo_export.* 20 | 21 | _testmain.go 22 | 23 | *.exe 24 | *.test 25 | *.prof 26 | 27 | .DS_Store 28 | .AppleDouble 29 | .LSOverride 30 | Icon 31 | 32 | 33 | # Thumbnails 34 | ._* 35 | 36 | # Files that might appear on external disk 37 | .Spotlight-V100 38 | .Trashes 39 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.5 2 | MAINTAINER HJK 3 | RUN mkdir /lambda 4 | WORKDIR /lambda 5 | ADD . 
/lambda/ 6 | RUN apt-get install libxml2-dev libxslt-dev 7 | RUN pip install -r requirements.txt 8 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 马蜂窝分布式爬虫系统 2 | 3 | 设计这个爬虫系统的主要目的是,完整快速地获取国内所有旅游目的地和旅游景点,包括名称、介绍、图片、相关攻略等。 4 | 5 | 主要使用到的技术(工具/语言/库)包括:Docker, Redis, MySQL, Python, requests, peewee等。 6 | 7 | 系统架构图: 8 | ![](./docs/system.png) 9 | 10 | MySQL和Redis是两个Docker容器,Redis负责保存系统运行过程中产生的数据,MySQL负责保存需要持久化的数据。ABCDE分别由单个或多个Docker容器组成,是主要的下载节点和逻辑处理节点。 11 | 12 | 1. 容器A获得全国的省份、直辖市和特别行政区信息,将对应ID和信息分别保存到Redis和MySQL 13 | 2. 容器集群B从Redis一直读取省份ID,根据省份ID从马蜂窝获取该省内的旅游目的地,并将目的地ID和信息分别保存到Redis和MySQL 14 | 3. 容器集群C从Redis一直读取目的地ID,根据目的地ID从马蜂窝获取该旅行目的地下所有的景点ID和概要信息,并将其分别保存到Redis和MySQL 15 | 4. 容器集群D从Redis一直读取景点ID,根据景点ID从马蜂窝获取该景点的具体信息,包括名称、介绍、门票、交通、图片等,并且把信息都存储到MySQL,把图片地址保存到Redis 16 | 5. 容器集群E从Redis一直读取图片地址,并将图片保存到本地(TODO) 17 | 18 | 容器A只工作一次,完成信息的获取之后自动停止。集群BCDE会一直工作,一直从Redis队列里读取信息,一旦获取到新的ID,就会开始下载和处理页面/图片,直到队列为空,重新回到等待状态。同一个集群内的容器所做的工作是一样的,容器数量可以根据需求随时扩展或缩小。 19 | 20 | # 使用方式 21 | 22 | 下载程序 23 | ``` 24 | git clone https://github.com/0xHJK/mafengwo-crawlers 25 | ``` 26 | 27 | 修改`src/common/chameleon.py`里的代理列表(因为代理具有时效性),代理获取方式可以参考: 28 | 29 | 30 | 31 | 运行 32 | ``` 33 | docker-compose up 34 | ``` 35 | 36 | 扩展节点数量(根据自己需要扩展) 37 | ``` 38 | #扩展获取城市列表节点为10个 39 | docker-compose scale citylist=10 40 | 41 | #扩展获取景点列表节点为20个 42 | docker-compose scale poilist=20 43 | 44 | #扩展获取景点信息节点为30个 45 | docker-compose scale poidetail=30 46 | ``` 47 | 48 | 重新运行 49 | ``` 50 | docker-compose up 51 | ``` 52 | 53 | # 运行效果 54 | ![](./preview/1.png) 55 | ![](./preview/2.png) 56 | ![](./preview/3.png) 57 | ![](./preview/4.png) 58 | ![](./preview/5.png) 59 | ![](./preview/6.png) 60 | ![](./preview/7.png) 61 | ![](./preview/8.png) 62 | ![](./preview/9.png) 63 | ![](./preview/10.png) 64 | ![](./preview/11.png) 65 | ![](./preview/12.png) 66 | ![](./preview/13.png) 67 | 68 | 69 | 70 | 
-------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | cache: 4 | image: redis 5 | ports: 6 | - '6379:6379' 7 | 8 | db: 9 | image: mysql:5.7 10 | volumes: 11 | - dbdata:/var/lib/mysql 12 | restart: always 13 | command: mysqld --character-set-server=utf8mb4 --collation-server=utf8mb4_unicode_ci --init-connect='SET NAMES utf8mb4;' --innodb-flush-log-at-trx-commit=0 14 | environment: 15 | MYSQL_ROOT_PASSWORD: ayou 16 | MYSQL_DATABASE: ayou 17 | MYSQL_USER: ayou 18 | MYSQL_PASSWORD: ayou 19 | ports: 20 | - '3306:3306' 21 | 22 | portal: 23 | build: . 24 | volumes: 25 | - .:/lambda 26 | command: python3 /lambda/src/main_portal.py 27 | links: 28 | - cache 29 | - db 30 | depends_on: 31 | - cache 32 | - db 33 | 34 | citylist: 35 | build: . 36 | volumes: 37 | - .:/lambda 38 | command: python3 /lambda/src/main_citylist.py 39 | links: 40 | - cache 41 | - db 42 | depends_on: 43 | - cache 44 | - db 45 | 46 | poilist: 47 | build: . 48 | volumes: 49 | - .:/lambda 50 | command: python3 /lambda/src/main_poilist.py 51 | links: 52 | - cache 53 | - db 54 | depends_on: 55 | - cache 56 | - db 57 | 58 | poidetail: 59 | build: . 
60 | volumes: 61 | - .:/lambda 62 | command: python3 /lambda/src/main_poidetail.py 63 | links: 64 | - cache 65 | - db 66 | depends_on: 67 | - cache 68 | - db 69 | 70 | volumes: 71 | dbdata: 72 | 73 | -------------------------------------------------------------------------------- /docs/poi_v2.sql: -------------------------------------------------------------------------------- 1 | -- 马蜂窝爬虫数据库结构 2 | -- By HJK 3 | -- create 2017-01-15 4 | -- update 2017-01-16 5 | -- version 0.2 6 | 7 | -- 目的地 8 | create table if not exists dl_dest( 9 | -- 目的地ID 10 | dest_id varchar(40) primary key, 11 | -- 目的地名称 12 | name varchar(48), 13 | --(行政)市/直辖市(可以和name一样) 14 | city varchar(48), 15 | --(行政)省/州/直辖市(可以和name一样) 16 | province varchar(48), 17 | -- 国家 18 | country varchar(48), 19 | -- 父目的地ID 20 | parent_dest_id varchar(40), 21 | -- 马蜂窝目的地ID 22 | m_dest_id int(8) 23 | ); 24 | 25 | 26 | 27 | -- 景点 28 | create table if not exists dl_poi( 29 | -- 景点ID 30 | poi_id varchar(40) primary key, 31 | -- 景点名 32 | name varchar(48), 33 | -- 景点描述 34 | description text, 35 | -- 景点地址 36 | poi_address text, 37 | -- 景点攻略 38 | guidebook text, 39 | -- 景点电话 40 | tel varchar(20), 41 | -- 景点网址 42 | website varchar(255), 43 | -- 用时参考 44 | expected_time text, 45 | -- 交通 46 | traffic text, 47 | -- 门票 48 | ticket text, 49 | -- 开放时间 50 | business_hours text, 51 | -- 评论数 52 | comment_count int, 53 | -- 好评数 54 | comment_count_a int, 55 | -- 中评数 56 | comment_count_b int, 57 | -- 差评数 58 | comment_count_c int, 59 | -- 父景点ID(如果有) 60 | parent_poi_id varchar(40), 61 | -- 目的地ID 62 | dest_id varchar(40), 63 | -- 马蜂窝景点ID 64 | m_poi_id int(8) 65 | ); 66 | 67 | -- 景点图片 68 | create table if not exists dl_poi_images( 69 | -- 自增ID 70 | idx_id int primary key, 71 | -- 景点ID 72 | poi_id varchar(40), 73 | -- 图片URL 74 | image_url text 75 | ); 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /docs/system.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xHJK/mafengwo-crawlers/cf269ba3d9ab6ed783ac666ecc212386fe721897/docs/system.png -------------------------------------------------------------------------------- /docs/urls.sql: -------------------------------------------------------------------------------- 1 | create table if not exists urls ( 2 | id integer primary key, 3 | url text, 4 | method text, 5 | data text, 6 | dtype text, 7 | rex text, 8 | selector text, 9 | attr text 10 | ); 11 | -------------------------------------------------------------------------------- /preview/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xHJK/mafengwo-crawlers/cf269ba3d9ab6ed783ac666ecc212386fe721897/preview/1.png -------------------------------------------------------------------------------- /preview/10.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xHJK/mafengwo-crawlers/cf269ba3d9ab6ed783ac666ecc212386fe721897/preview/10.png -------------------------------------------------------------------------------- /preview/11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xHJK/mafengwo-crawlers/cf269ba3d9ab6ed783ac666ecc212386fe721897/preview/11.png -------------------------------------------------------------------------------- /preview/12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xHJK/mafengwo-crawlers/cf269ba3d9ab6ed783ac666ecc212386fe721897/preview/12.png -------------------------------------------------------------------------------- /preview/13.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/0xHJK/mafengwo-crawlers/cf269ba3d9ab6ed783ac666ecc212386fe721897/preview/13.png -------------------------------------------------------------------------------- /preview/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xHJK/mafengwo-crawlers/cf269ba3d9ab6ed783ac666ecc212386fe721897/preview/2.png -------------------------------------------------------------------------------- /preview/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xHJK/mafengwo-crawlers/cf269ba3d9ab6ed783ac666ecc212386fe721897/preview/3.png -------------------------------------------------------------------------------- /preview/4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xHJK/mafengwo-crawlers/cf269ba3d9ab6ed783ac666ecc212386fe721897/preview/4.png -------------------------------------------------------------------------------- /preview/5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xHJK/mafengwo-crawlers/cf269ba3d9ab6ed783ac666ecc212386fe721897/preview/5.png -------------------------------------------------------------------------------- /preview/6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xHJK/mafengwo-crawlers/cf269ba3d9ab6ed783ac666ecc212386fe721897/preview/6.png -------------------------------------------------------------------------------- /preview/7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/0xHJK/mafengwo-crawlers/cf269ba3d9ab6ed783ac666ecc212386fe721897/preview/7.png -------------------------------------------------------------------------------- /preview/8.png: 
def mq_push(key, val, is_high=False):
    """Add ``val`` to the Redis list ``key``.

    ``mq_pop`` takes items from the right end of the list, so a
    high-priority value is pushed on the right (served next) and a normal
    value is pushed on the left (served after everything already queued).
    """
    if is_high:
        r.rpush(key, val)
    else:
        r.lpush(key, val)


def mq_pop(key):
    """Pop the right-most item of the Redis list ``key`` and return it as str.

    Uses the blocking ``brpop`` call, whose reply is a ``(list_name, value)``
    pair of bytes; only the value is decoded and returned.
    """
    return str(r.brpop(key)[1], encoding='utf-8')


def push_portal_id(val):
    """Queue a portal id on ``mafengwo:portal_id``."""
    mq_push('mafengwo:portal_id', val)


def pop_portal_id():
    """Return the next portal id from ``mafengwo:portal_id``."""
    return mq_pop('mafengwo:portal_id')


def push_city_id(val):
    """Queue a city id on ``mafengwo:city_id``."""
    mq_push('mafengwo:city_id', val)


def pop_city_id():
    """Return the next city id from ``mafengwo:city_id``."""
    return mq_pop('mafengwo:city_id')


def push_poi_id(val):
    """Queue a POI id on ``mafengwo:poi_id``."""
    mq_push('mafengwo:poi_id', val)


def pop_poi_id():
    """Return the next POI id from ``mafengwo:poi_id``."""
    return mq_pop('mafengwo:poi_id')


def push_image_url(val):
    """Queue an image URL on ``mafengwo:image_url``."""
    mq_push('mafengwo:image_url', val)


def pop_image_url():
    """Return the next image URL from ``mafengwo:image_url``."""
    # The original dropped the popped value (no ``return``), so callers
    # always received None while the item was still removed from the queue.
    return mq_pop('mafengwo:image_url')
Presto/2.10.229 Version/11.62', 29 | 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27', 30 | 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.20.25 (KHTML, like Gecko) Version/5.0.4 Safari/533.20.27', 31 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.87 Safari/537.36', 32 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36', 33 | 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36', 34 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36', 35 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14', 36 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36 Qiyu/2.1.0.0', 37 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.100 Safari/537.36', 38 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.87 Safari/537.36', 39 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36', 40 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36', 41 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36', 42 | 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0', 43 | 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36', 44 | 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.3 Safari/537.36', 45 | 'Mozilla/5.0 
(Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.59 Safari/537.36', 46 | 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36', 47 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36', 48 | 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36 Qiyu/2.1.0.0', 49 | 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36', 50 | 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36', 51 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36' 52 | 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36', 53 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36', 54 | 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.75 Safari/537.36' 55 | ] 56 | 57 | proxy_list = [ 58 | '14.223.23.126:9999', 59 | '175.4.12.214:9999', 60 | '111.160.141.42:63000', 61 | '124.88.67.83:843', 62 | '120.52.73.97:8083', 63 | '222.87.72.105:9999', 64 | '124.118.237.218:9999', 65 | '42.84.199.115:8998', 66 | '123.206.93.108:8081', 67 | '124.88.67.10:83', 68 | '36.73.216.172:80', 69 | '14.105.168.238:9999', 70 | '183.71.192.189:9999', 71 | '58.154.33.12:8080', 72 | '66.65.8.172:9999', 73 | '60.11.35.67:8998', 74 | '106.46.136.64:808', 75 | '124.88.67.34:843', 76 | '94.177.248.71:8080', 77 | '37.59.54.189:3128', 78 | '113.5.234.73:9529', 79 | '220.166.241.51:8118', 80 | '92.222.106.162:9999', 81 | '35.157.121.224:3128', 82 | '120.52.73.98:8092', 83 | '111.85.118.25:8998', 84 | '119.165.11.77:8118', 85 | '46.105.121.118:1080', 86 | 
'114.223.87.112:9999', 87 | '183.71.152.206:9999', 88 | '182.253.161.60:8080', 89 | '116.199.48.245:80', 90 | '14.125.51.82:8888', 91 | '122.212.129.9:80', 92 | '192.99.79.159:8080', 93 | '120.52.73.98:8088', 94 | '124.88.67.31:843', 95 | '111.76.133.172:808', 96 | '14.109.132.182:9999', 97 | '54.153.127.129:8083', 98 | '35.160.136.133:8000', 99 | '80.240.221.38:3128', 100 | '52.34.230.6:8088', 101 | '14.105.177.85:9999', 102 | '110.52.183.208:9999', 103 | '120.52.73.98:99', 104 | '211.75.115.20:80', 105 | '119.86.68.26:9999', 106 | '47.89.41.164:80', 107 | '221.2.230.196:8998', 108 | '222.138.69.226:8998', 109 | '120.52.73.97:8084', 110 | '183.21.135.228:9999', 111 | '106.87.26.144:9999', 112 | '14.105.178.162:9999', 113 | '177.67.84.248:8080', 114 | '96.239.193.244:8080', 115 | '58.216.14.22:808', 116 | '115.231.175.68:8081', 117 | '125.88.74.122:82', 118 | '124.88.67.14:843', 119 | '111.76.133.210:808', 120 | '14.111.250.187:9999', 121 | '110.83.46.89:808', 122 | '121.232.145.33:9000', 123 | '192.99.56.104:8080', 124 | '203.172.223.190:8080', 125 | '218.109.189.16:8888', 126 | '110.154.92.42:8888', 127 | '124.88.67.7:843', 128 | '120.52.73.97:8094', 129 | '182.204.18.65:8118', 130 | '52.59.61.67:8083', 131 | '120.52.73.98:86', 132 | '220.250.12.19:8998', 133 | '115.58.16.31:9999', 134 | '218.86.128.62:8118', 135 | '106.46.136.82:808', 136 | '120.52.73.97:8091', 137 | '36.249.192.128:8118', 138 | '114.228.185.57:8118', 139 | '14.105.177.218:9999', 140 | '219.246.184.153:80', 141 | '120.52.73.97:8087', 142 | '14.199.124.204:80', 143 | '183.67.243.79:9999', 144 | '31.220.58.150:3128', 145 | '106.46.136.3:808', 146 | '125.81.178.86:9999', 147 | '120.52.73.98:84', 148 | '111.76.129.171:808', 149 | '120.52.73.98:8081', 150 | '120.52.73.97:98', 151 | '94.242.59.123:80', 152 | '120.52.73.98:8097', 153 | '112.195.72.143:8118', 154 | '125.112.174.36:8088', 155 | '120.52.73.98:90', 156 | '113.252.236.96:8080', 157 | '89.34.26.14:1080', 158 | '124.88.67.17:83', 159 | 
'158.69.157.32:3128', 160 | '121.40.139.217:80', 161 | '113.206.211.99:8998', 162 | '120.52.73.109:8090', 163 | '124.88.67.30:82', 164 | '120.52.73.97:8097', 165 | '52.59.66.142:8083', 166 | '120.52.73.98:8083', 167 | '163.121.187.180:8080', 168 | '94.177.234.145:80', 169 | '125.122.118.87:808', 170 | '120.52.73.98:97', 171 | '123.57.225.102:8088', 172 | '81.2.252.136:3128', 173 | '120.52.73.98:82', 174 | '47.91.151.193:3128', 175 | '122.195.181.133:8888', 176 | '94.177.252.80:80', 177 | '120.52.73.112:8090', 178 | '108.61.182.117:8118', 179 | '120.52.73.98:8093', 180 | '106.83.93.17:9999', 181 | '177.67.81.26:3128', 182 | '182.92.224.202:8088', 183 | '115.56.188.207:808', 184 | '111.120.117.27:9999', 185 | '124.88.67.81:843', 186 | '120.52.73.98:85', 187 | '124.88.67.23:843', 188 | '164.67.174.43:3128', 189 | '125.166.226.234:80', 190 | '115.236.226.225:8998', 191 | '183.68.180.241:9999', 192 | '117.70.96.116:9999', 193 | '120.52.73.98:8094', 194 | '120.52.73.98:8091', 195 | '217.33.216.114:8080', 196 | '140.224.76.17:808', 197 | '124.88.67.19:843', 198 | '124.88.67.32:83', 199 | '192.99.128.170:80', 200 | '119.86.11.136:9999', 201 | '120.52.73.98:8089', 202 | '192.129.221.89:9001', 203 | '124.88.67.31:81', 204 | '124.88.67.52:843', 205 | '31.220.54.45:80', 206 | '202.73.51.146:8128', 207 | '120.52.73.112:8080', 208 | '110.179.46.233:8888', 209 | '111.76.129.136:808', 210 | '106.46.136.14:808', 211 | '120.52.73.98:8085', 212 | '124.88.67.21:81', 213 | '125.88.74.122:83', 214 | '23.88.102.24:8080', 215 | '218.84.122.211:9999', 216 | '177.84.87.154:80', 217 | '77.73.66.227:8080', 218 | '182.39.153.2:8118', 219 | '124.88.67.17:80', 220 | '183.69.9.201:9999', 221 | '39.74.178.69:8998', 222 | '221.226.82.130:8998', 223 | '120.26.48.77:80', 224 | '124.88.67.32:81', 225 | '125.82.122.219:9999', 226 | '14.155.115.41:8118', 227 | '47.52.2.135:8080', 228 | '119.84.160.165:9999', 229 | '125.34.88.90:8118', 230 | '90.152.38.178:1080', 231 | '183.67.39.49:9999', 232 | 
'124.88.67.31:80', 233 | '5.160.33.92:3128', 234 | '14.110.119.69:9999', 235 | '63.150.152.151:8080', 236 | '14.201.122.140:80', 237 | '124.88.67.20:82', 238 | '158.69.186.112:8080', 239 | '124.88.67.19:82', 240 | '192.99.128.170:3128', 241 | '42.84.67.111:8888', 242 | '120.52.73.98:80', 243 | '124.88.67.10:81', 244 | '158.69.172.98:80', 245 | '124.88.67.20:843', 246 | '170.248.47.58:80', 247 | '94.177.234.145:3128', 248 | '117.91.185.152:9999', 249 | '221.215.162.218:8998', 250 | '138.97.241.160:80', 251 | '120.52.73.98:8080', 252 | '113.244.50.41:9999', 253 | '106.46.136.89:808', 254 | '113.69.165.251:808', 255 | '92.109.100.74:80', 256 | '106.46.136.102:808', 257 | '106.46.136.80:808', 258 | '113.141.119.212:9999', 259 | '125.84.218.87:9999', 260 | '179.185.54.114:8080', 261 | '124.135.154.242:9999', 262 | '124.88.67.34:81', 263 | '63.150.152.151:3128', 264 | '182.48.113.11:8088', 265 | '120.52.73.112:8091', 266 | '27.8.61.225:8888', 267 | '195.138.86.112:3128', 268 | '120.52.73.98:100', 269 | '107.170.54.215:8080', 270 | '94.242.59.141:80', 271 | '113.110.128.252:3128', 272 | '223.244.196.135:8118', 273 | '58.176.46.248:8380', 274 | '144.217.49.233:8080', 275 | '120.52.73.97:8098', 276 | '120.52.73.97:8081', 277 | '94.156.144.87:80', 278 | '124.88.67.39:843', 279 | '106.46.136.127:808', 280 | '185.43.210.238:8080', 281 | '88.159.109.104:80', 282 | '120.52.73.97:80', 283 | '120.52.73.98:8096', 284 | '120.52.73.98:95', 285 | '180.246.14.221:80', 286 | '124.88.67.19:80', 287 | '185.22.172.20:3128', 288 | '219.145.95.193:8998', 289 | '177.67.84.248:3128', 290 | '120.52.73.97:93', 291 | '175.15.183.150:9999', 292 | '182.88.228.251:8123', 293 | '120.52.73.98:98', 294 | '125.120.10.127:808', 295 | '175.7.150.232:9999', 296 | '222.255.236.117:3128', 297 | '94.46.177.99:80', 298 | '123.56.28.89:8080', 299 | '14.109.71.213:9999', 300 | '124.88.67.83:83', 301 | '114.215.29.26:80', 302 | '124.88.67.83:80', 303 | '120.52.73.97:83', 304 | '115.198.34.88:808', 305 | 
'106.46.136.79:808', 306 | '115.214.161.24:8118', 307 | '60.219.11.61:8998', 308 | '222.169.193.162:8099', 309 | '52.59.251.120:8083', 310 | '221.211.49.169:9999', 311 | '222.169.87.80:8998', 312 | '113.133.68.118:9999', 313 | '92.47.195.250:3128', 314 | '219.127.253.43:80', 315 | '94.23.17.132:9999', 316 | '120.77.223.7:28080', 317 | '120.52.73.97:81', 318 | '192.129.231.118:9001', 319 | '184.69.67.122:80', 320 | '183.71.134.87:9999', 321 | '192.99.128.170:8080', 322 | '52.59.65.26:8083', 323 | '119.28.12.218:8888', 324 | '106.88.12.198:9999', 325 | '120.52.73.97:8086', 326 | '180.251.145.175:80', 327 | '222.211.65.138:80', 328 | '124.88.67.39:80', 329 | '111.123.40.104:9999', 330 | '49.86.133.144:9999', 331 | '181.111.175.235:8080', 332 | '125.122.116.132:808', 333 | '183.67.28.86:9999', 334 | '94.156.144.87:8080', 335 | '124.88.67.24:843', 336 | '112.233.111.225:808', 337 | '192.129.225.194:9001', 338 | '14.106.15.47:9999', 339 | '124.234.44.219:9999', 340 | '58.37.57.200:8118', 341 | '183.68.169.171:9999', 342 | '5.2.64.150:1080', 343 | '110.189.223.201:808', 344 | '152.251.245.236:8080', 345 | '14.109.129.161:9999', 346 | '113.244.93.110:9999', 347 | '124.88.67.19:81', 348 | '181.59.255.227:8080', 349 | '219.152.196.111:9999', 350 | '106.46.136.20:808', 351 | '106.46.136.108:808', 352 | '123.189.56.123:9999', 353 | '112.249.41.57:8888', 354 | '117.70.184.252:9999', 355 | '120.52.73.112:80', 356 | '120.52.73.97:88', 357 | '120.52.73.123:100', 358 | '111.13.7.42:83', 359 | '124.88.67.10:843', 360 | '120.52.73.97:92', 361 | '120.52.73.97:97', 362 | '203.90.144.145:80', 363 | '120.52.73.97:8082', 364 | '125.72.106.216:808', 365 | '124.88.67.39:82', 366 | '138.97.241.160:3128', 367 | '124.88.67.83:82', 368 | '124.88.67.30:81', 369 | '178.206.193.107:8080', 370 | '183.165.93.208:8998', 371 | '110.187.12.138:808', 372 | '94.177.252.80:3128', 373 | '120.52.73.97:8089', 374 | '200.229.202.72:80', 375 | '212.46.215.107:8080', 376 | '14.105.179.167:9999', 377 | 
class Httper(object):
    """Fetch one URL through a random proxy and extract data from the reply.

    The request is issued from the constructor.  A failed attempt is retried
    until ``remaining_retries`` reaches zero; after that ``cache_val`` is
    pushed once onto the queue ``cache_key`` and the instance is marked
    ``abandoned``.

    Keyword arguments (all optional):
        url: target URL, used when no positional URL is given.
        method: 'get' or 'post' (case-insensitive).  Default 'get'.
        data: payload passed as ``data=`` to requests.  Default ``{}``.
        rtype: 'text' or 'json' - how the response body is stored.
        rkey: list of keys walked through a JSON result before extraction.
        dtype: 're' (regex) or 'pq' (PyQuery selector) extraction mode.
        rex: regular expression for 're' mode.
        selector, attr: selector and attribute ('text'/'txt' for the text
            content) for 'pq' mode.
        timeout: request timeout in seconds.  Default 7.
        cache_key, cache_val: queue name and value pushed when giving up.
    """

    # Class-level default so the flag exists on partially built instances.
    abandoned = False

    def __init__(self, *args, **kwargs):
        super(Httper, self).__init__()
        # Number of retries still allowed after a failed attempt.
        self.remaining_retries = 20
        # Accept the URL positionally or as url=...; indexing args[0]
        # unconditionally raised IndexError for keyword-only calls.
        self.url = (args[0] if args else '') or kwargs.get('url', '')
        self.method = kwargs.get('method', 'get')
        self.data = kwargs.get('data', {})
        # rtype: response type, text or json
        self.rtype = kwargs.get('rtype', 'text')
        # rkey: path of keys into a JSON response
        self.rkey = kwargs.get('rkey', [''])
        # dtype: extraction mode, re or pq
        self.dtype = kwargs.get('dtype', '')
        self.rex = kwargs.get('rex', '')
        self.selector = kwargs.get('selector', '')
        self.attr = kwargs.get('attr', '')
        # request timeout in seconds (default 7)
        self.timeout = kwargs.get('timeout', 7)
        # Where to record this job when every attempt has failed.
        self.cache_key = kwargs.get('cache_key', 'mafengwo:default')
        self.cache_val = kwargs.get('cache_val', '1234')
        # Defined up front so get_data() works even if no attempt succeeds.
        self.result = ''
        self.abandoned = False

        self.request()

    def request_failed(self):
        """Log a failed attempt, then retry or give up.

        While retries remain, one is consumed and the request is re-issued.
        Otherwise ``cache_val`` is pushed to ``cache_key`` - only once, so a
        job is not queued several times - and ``abandoned`` is set.
        """
        echo.error('Get %s failed' % self.url)
        echo.info('Remaining retries: %d' % self.remaining_retries)
        if self.remaining_retries > 0:
            self.remaining_retries -= 1
            self.request()
        elif not self.abandoned:
            self.abandoned = True
            mq_push(self.cache_key, self.cache_val)

    def request(self):
        """Perform the HTTP request and store the body in ``self.result``.

        Raises:
            ValueError: if ``self.method`` is neither GET nor POST.
        """
        method = str(self.method).lower()
        if method not in ('get', 'post'):
            # Previously this fell through and crashed on an unbound local.
            raise ValueError('Unsupported HTTP method: %r' % (self.method,))
        echo.info('Trying to get %s' % self.url)
        try:
            r = getattr(requests, method)(
                self.url,
                data=self.data,
                headers=chameleon.get_headers(),
                proxies=chameleon.get_proxies(),
                timeout=self.timeout
            )
        except Exception:
            # Any failure is retried on purpose (best effort), but
            # KeyboardInterrupt / SystemExit are no longer swallowed.
            return self.request_failed()
        # Treat a non-200 status or a missing / very short body as a failure.
        if r.status_code != 200 or r.text is None or len(r.text) < 10:
            return self.request_failed()
        if self.rtype == 'text':
            self.result = r.text
        elif self.rtype == 'json':
            try:
                self.result = r.json()
            except ValueError:
                return self.request_failed()
        echo.success('Get %s successfully' % self.url)

    def _walk_keys(self, rkey):
        """Return ``self.result`` narrowed by each non-empty key in ``rkey``."""
        txt = self.result
        for rk in rkey:
            if rk:
                txt = txt[rk]
        return txt

    def get_data(self, **kwargs):
        """Extract data from the stored response.

        Each keyword (``dtype``, ``rex``, ``selector``, ``attr``, ``rkey``)
        overrides the value given to the constructor for this call only.

        Returns:
            A list: regex matches in 're' mode, text or attribute values in
            'pq' mode, ``[]`` when the data cannot be obtained, and ``['']``
            when neither mode is fully configured.
        """
        dtype = kwargs.get('dtype', self.dtype)
        rex = kwargs.get('rex', self.rex)
        selector = kwargs.get('selector', self.selector)
        attr = kwargs.get('attr', self.attr)
        rkey = kwargs.get('rkey', self.rkey)

        # Walk nested JSON levels.  A missing level triggers a retry; once
        # the job has been abandoned we stop instead of recursing forever.
        while True:
            try:
                txt = self._walk_keys(rkey)
                break
            except (KeyError, IndexError, TypeError):
                if self.abandoned:
                    return []
                self.request_failed()

        # Regex mode
        if dtype == 're' and rex != '':
            return re.findall(rex, txt)
        # Selector mode
        if dtype == 'pq' and selector != '' and attr != '':
            try:
                d = pq(txt)
            except Exception:
                self.request_failed()
                # Return a list (not None) so callers can always iterate.
                return []
            elements = d(selector)
            # Wrap a single selection so the loops below always see a list.
            if not isinstance(elements, list):
                elements = [elements]
            if attr == 'text' or attr == 'txt':
                return [d(x).text() for x in elements]
            return [d(x).attr(attr) for x in elements]

        echo.error('are you kidding me?')
        return ['']
class ColorFore(object):
    """ANSI SGR parameter strings that select the foreground (text) colour."""

    black = '30'
    red = '31'
    green = '32'
    yellow = '33'
    blue = '34'
    magenta = '35'
    cyan = '36'
    white = '37'
    # SGR 39 restores the terminal's default foreground colour. 38 only
    # introduces an extended-colour sequence (38;5;n or 38;2;r;g;b) and is
    # malformed when sent on its own.
    default = '39'


class ColorBack(object):
    """ANSI SGR parameter strings that select the background colour."""

    black = '40'
    red = '41'
    green = '42'
    yellow = '43'
    blue = '44'
    magenta = '45'
    cyan = '46'
    white = '47'
    # SGR 49 restores the default background; 48 is the extended-colour
    # introducer, mirroring 38 for the foreground.
    default = '49'


class Style(ColorFore):
    """SGR display attributes bundled with both colour tables.

    The class derives from ``ColorFore`` so that a foreground code can be
    read directly off an instance (``Style().blue``) as well as through the
    explicit ``fg`` / ``bg`` attributes.
    """

    def __init__(self):
        # Colour tables, reachable as ``style.fg.red`` / ``style.bg.red``.
        self.fg = ColorFore()
        self.bg = ColorBack()
        # Display attributes (SGR 0-7; 6 is intentionally not exposed).
        self.reset = '0'
        self.bright = '1'
        self.dim = '2'
        self.italic = '3'
        self.underline = '4'
        self.blink = '5'
        self.revert = '7'
class Pyout(object):
    """Small console printer that wraps messages in ANSI SGR sequences."""

    def __init__(self):
        self.s = Style()
        # Placeholders, in order: SGR code(s), flag prefix, message.
        self.fmt = '\x1b[%sm%s%s\x1b[0m'

    def example(self):
        """Print a swatch of every style / foreground / background triple."""
        cell = '\x1b[%sm %s \x1b[0m'
        for style_code in range(8):
            for fore in range(30, 39):
                codes = (
                    ';'.join((str(style_code), str(fore), str(back)))
                    for back in range(40, 49)
                )
                # One output row per foreground colour.
                print(''.join(cell % (code, code) for code in codes))
            print('\n')

    def log(self, m, s = '0', flag = ' '):
        """Print ``m`` using SGR code(s) ``s``, prefixing each line with ``flag``.

        ``s`` may be a single code string or a list of codes, which is joined
        with ``;``. When ``m`` is a list, every item goes on its own line
        between a ``[`` line and a ``]`` line.
        """
        sgr = ';'.join(s) if isinstance(s, list) else s
        if not isinstance(m, list):
            print(self.fmt % (sgr, flag, m))
            return
        print(self.fmt % (sgr, '', '['))
        for item in m:
            print(self.fmt % (sgr, flag, item))
        print(self.fmt % (sgr, '', ']'))

    def bright(self, m):
        """Print ``m`` with the bright attribute."""
        self.log(m, self.s.bright)

    def italic(self, m):
        """Print ``m`` with the italic attribute."""
        self.log(m, self.s.italic)

    def underline(self, m):
        """Print ``m`` with the underline attribute."""
        self.log(m, self.s.underline)

    def info(self, m):
        """Print ``m`` in blue with the info flag."""
        self.log(m, self.s.blue, 'ℹ️ ')

    def warn(self, m):
        """Print ``m`` in yellow with the warning flag."""
        self.log(m, self.s.yellow, '⚠️ ')

    def debug(self, m):
        """Print ``m`` in magenta with the debug flag."""
        self.log(m, self.s.magenta, '🌀 ')

    def error(self, m):
        """Print ``m`` in red with the error flag."""
        self.log(m, self.s.red, '❌ ')

    def success(self, m):
        """Print ``m`` in green with the success flag."""
        self.log(m, self.s.green, '✅ ')
class CitylistCtrl(object):
    """Crawl the paginated city list of one portal (province-level) id.

    For every page the controller fetches the city ids and names, pushes
    the ids to the cache queue and stores one ``Dest`` row per city.

    Keyword arguments:
        portal_id: site id of the parent destination (default ``''``).
        page: first page to request (default ``1``).
        total: page count assumed until page 1 reports the real one
            (default ``2``).
    """

    def __init__(self, **kwargs):
        super(CitylistCtrl, self).__init__()
        self.portal_id = kwargs.get('portal_id', '')
        self.page = kwargs.get('page', 1)
        self.total = kwargs.get('total', 2)
        self.url = 'http://www.mafengwo.cn/mdd/base/list/pagedata_citylist'
        self.city_id_list = []
        self.city_name_list = []

    @staticmethod
    def _parse_total(raw_total):
        """Return the page count from a ``get_data`` result, or 1 if unusable.

        A portal whose list fits on one page has no ``.pg-last`` element, so
        the result can be empty or hold ``None``; treating that as a single
        page mirrors the fallback used by ``PoilistCtrl``.
        """
        try:
            return int(raw_total[0])
        except (IndexError, TypeError, ValueError):
            return 1

    def set_data(self):
        """Request the current page and refresh the id and name lists."""
        data = {
            'mddid': self.portal_id,
            'page': self.page
        }
        hr = Httper(
            self.url,
            method = 'post',
            data = data,
            rtype = 'json',
            dtype = 'pq'
        )
        # The total page count is only read from the first response.
        if self.page == 1:
            tmp_total = hr.get_data(
                rkey = ['page'],
                selector = '.pg-last',
                attr = 'data-page'
            )
            self.total = self._parse_total(tmp_total)
        # City ids.
        self.city_id_list = hr.get_data(
            rkey = ['list'],
            selector = '.item .img a',
            attr = 'data-id'
        )
        # City names.
        self.city_name_list = hr.get_data(
            rkey = ['list'],
            selector = '.item .title',
            attr = 'text'
        )

    def push_data(self):
        """Queue every city id of the current page for the POI-list workers."""
        for city_id in self.city_id_list or []:
            # An element without a data-id attribute yields no usable id.
            if not city_id:
                continue
            push_city_id(city_id)
            echo.success('city id %s pushed' % city_id)

    def save_data(self):
        """Store one ``Dest`` row per (id, name) pair of the current page."""
        pairs = zip(self.city_id_list or [], self.city_name_list or [])
        for city_id, raw_name in pairs:
            # Only the part of the title before the first space is kept as
            # the name (the title presumably carries a second, translated
            # name after it -- confirm against the page markup).
            name = (raw_name or '').split(' ')[0]
            if not city_id or not name:
                continue
            Dest.create(
                dest_id = uuid.uuid4(),
                name = name,
                m_dest_id = city_id,
                m_parent_dest_id = self.portal_id
            )
            echo.success('city id %s name %s saved' % (city_id, name))

    def entry(self):
        """Walk through every page from ``self.page`` to ``self.total``."""
        while self.page <= self.total:
            echo.info('mddid: %s, page: %s, total: %s' % (self.portal_id, self.page, self.total))
            self.set_data()
            self.push_data()
            self.save_data()
            self.page += 1
''.join(hr.get_data( 40 | selector = '.container > div:nth-child(6) > div:nth-child(2) > dl:nth-child(4) > dd', 41 | attr = 'text' 42 | )) 43 | self.business_hours = ''.join(hr.get_data( 44 | selector = '.container > div:nth-child(6) > div:nth-child(2) > dl:nth-child(5) > dd', 45 | attr = 'text' 46 | )) 47 | comment_count = hr.get_data( 48 | selector = '#poi-navbar > ul > li:nth-child(3) > a > span', 49 | attr = 'text' 50 | ) 51 | if comment_count: 52 | self.comment_count = ''.join(re.findall(r'\d+', str(comment_count))) 53 | else: 54 | self.comment_count = '0' 55 | sub_poi_href = ''.join(hr.get_data( 56 | selector = '.mod-innerScenic li > a', 57 | attr = 'href' 58 | )) 59 | if sub_poi_href: 60 | self.sub_poi_id = ','.join(re.findall('\d+', sub_poi_href)) 61 | else: 62 | self.sub_poi_id = '' 63 | 64 | def push_data(self): 65 | for image_url in self.image_url_list: 66 | if image_url: 67 | image_url = image_url.split('?imageMogr2')[0] 68 | push_image_url(image_url) 69 | echo.info('image_url %s pushed' % image_url) 70 | 71 | def save_data(self): 72 | Poi.create( 73 | poi_id = self.poi_id, 74 | name = self.poi_name, 75 | summary = self.summary, 76 | tel = self.tel, 77 | website = self.website, 78 | expected_time = self.expected_time, 79 | traffic = self.traffic, 80 | ticket = self.ticket, 81 | business_hours = self.business_hours, 82 | comment_count = self.comment_count, 83 | sub_poi_id = self.sub_poi_id, 84 | m_poi_id = self.m_poi_id 85 | ) 86 | for image_url in self.image_url_list: 87 | if image_url: 88 | Pimg.create( 89 | poi_id = self.poi_id, 90 | image_url = image_url.split('?imageMogr2')[0] 91 | ) 92 | echo.info('%s %s saved' % (self.m_poi_id, self.poi_name)) 93 | 94 | def entry(self): 95 | self.set_data() 96 | self.push_data() 97 | self.save_data() 98 | -------------------------------------------------------------------------------- /src/ctrl_poilist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 
class PoilistCtrl(object):
    """Collect the POI ids listed for one city and queue them.

    Keyword arguments:
        city_id: site id of the city whose POIs are listed (default ``''``).
        page: first page to request (default ``1``).
        total: page count assumed until page 1 reports the real one
            (default ``2``).
    """

    def __init__(self, **kwargs):
        super(PoilistCtrl, self).__init__()
        self.url = 'http://www.mafengwo.cn/ajax/router.php'
        self.city_id = kwargs.get('city_id', '')
        self.page = kwargs.get('page', 1)
        self.total = kwargs.get('total', 2)
        # Initialised here so push_data() is safe before the first set_data().
        self.poi_id_list = []

    def set_data(self):
        """Request the current page and store the POI hrefs it contains."""
        data = {
            'sAct': 'KMdd_StructWebAjax|GetPoisByTag',
            'iMddid': self.city_id,
            'iTagId': 0,
            'iPage': self.page
        }
        hr = Httper(
            self.url,
            method = 'post',
            data = data,
            rtype = 'json',
            rkey = ['data', 'list'],
            dtype = 'pq',
            selector = 'li a',
            attr = 'href'
        )
        # The total page count is only read from the first response.
        if self.page == 1:
            tmp_total = hr.get_data(
                rkey = ['data', 'page'],
                selector = '.count span',
                attr = 'text'
            )
            try:
                self.total = int(tmp_total[0])
            except (IndexError, TypeError, ValueError):
                # No pager (or an unparsable one): treat as a single page.
                self.total = 1
        self.poi_id_list = hr.get_data()

    def push_data(self):
        """Extract the numeric id from every href and push it to the queue."""
        for href in self.poi_id_list or []:
            # Anchors without an href attribute yield no usable value.
            if not href:
                continue
            pid = re.findall(r'\d+', href)
            if pid:
                push_poi_id(pid[0])
                echo.info('Poi id %s pushed' % pid[0])

    def entry(self):
        """Process every page from ``self.page`` up to ``self.total``.

        The page count learned on page 1 is honoured, so the whole listing
        is crawled rather than only its first page (same loop as
        ``CitylistCtrl.entry``).
        """
        while self.page <= self.total:
            self.set_data()
            self.push_data()
            self.page += 1
if __name__ == '__main__':
    # Worker loop: take the next portal id from the cache queue and crawl
    # its city list. Whether pop_portal_id() blocks on an empty queue is
    # decided in common.cache, which is not visible here.
    while True:
        portal_id = pop_portal_id()
        print(portal_id)
        CitylistCtrl(portal_id=portal_id).entry()