├── .gitignore
├── .idea
│   ├── misc.xml
│   ├── modules.xml
│   ├── spider_python.iml
│   ├── vcs.xml
│   └── workspace.xml
├── LICENSE
├── ModifyLocation
│   ├── gps_utils.py
│   ├── main.py
│   └── position_utils.py
├── Python调用JAR
│   ├── exec_jar_example.py
│   └── jar
│       ├── com
│       │   └── xingag
│       │       └── common
│       │           └── EncryHelper.class
│       └── encry.jar
├── README.md
├── feapder
│   └── tophub_demo
│       ├── .idea
│       │   ├── inspectionProfiles
│       │   │   └── Project_Default.xml
│       │   ├── misc.xml
│       │   ├── modules.xml
│       │   ├── tophub_demo.iml
│       │   └── workspace.xml
│       ├── items
│       │   ├── __init__.py
│       │   └── topic_item.py
│       ├── main.py
│       ├── setting.py
│       ├── spiders
│       │   ├── __init__.py
│       │   └── tophub_spider.py
│       └── test.py
├── js
│   └── jian_shu.js
├── pic
│   └── 最低气温排行榜.png
├── raw
│   └── qr.jpeg
├── scrapy
│   ├── douban_login
│   │   ├── .idea
│   │   │   ├── douban_login.iml
│   │   │   ├── misc.xml
│   │   │   ├── modules.xml
│   │   │   ├── vcs.xml
│   │   │   └── workspace.xml
│   │   ├── captcha.png
│   │   ├── douban_login
│   │   │   ├── __init__.py
│   │   │   ├── items.py
│   │   │   ├── middlewares.py
│   │   │   ├── pipelines.py
│   │   │   ├── settings.py
│   │   │   └── spiders
│   │   │       ├── __init__.py
│   │   │       └── douban.py
│   │   ├── readme.MD
│   │   ├── scrapy.cfg
│   │   └── start.py
│   ├── huize_spider
│   │   ├── .idea
│   │   │   ├── huize_spider.iml
│   │   │   ├── misc.xml
│   │   │   ├── modules.xml
│   │   │   └── workspace.xml
│   │   ├── .~ana.rtf
│   │   ├── ana.rtf
│   │   ├── datas.json
│   │   ├── huize_spider
│   │   │   ├── __init__.py
│   │   │   ├── items.py
│   │   │   ├── middlewares.py
│   │   │   ├── pipelines.py
│   │   │   ├── settings.py
│   │   │   └── spiders
│   │   │       ├── __init__.py
│   │   │       ├── huize.py
│   │   │       └── string_utils.py
│   │   ├── scrapy.cfg
│   │   └── start.py
│   ├── jianshu_spider
│   │   ├── .idea
│   │   │   ├── jianshu_spider.iml
│   │   │   ├── misc.xml
│   │   │   ├── modules.xml
│   │   │   ├── vcs.xml
│   │   │   └── workspace.xml
│   │   ├── jianshu_spider
│   │   │   ├── __init__.py
│   │   │   ├── items.py
│   │   │   ├── middlewares.py
│   │   │   ├── pipelines.py
│   │   │   ├── settings.py
│   │   │   └── spiders
│   │   │       ├── __init__.py
│   │   │       └── jianshu.py
│   │   ├── raw
│   │   │   ├── article.sql
│   │   │   └── article_table.png
│   │   └── scrapy.cfg
│   ├── qczj
│   │   ├── .idea
│   │   │   ├── misc.xml
│   │   │   ├── modules.xml
│   │   │   ├── qczj.iml
│   │   │   ├── vcs.xml
│   │   │   └── workspace.xml
│   │   ├── qczj
│   │   │   ├── __init__.py
│   │   │   ├── items.py
│   │   │   ├── middlewares.py
│   │   │   ├── pipelines.py
│   │   │   ├── settings.py
│   │   │   └── spiders
│   │   │       ├── __init__.py
│   │   │       └── bmw5.py
│   │   ├── readme.MD
│   │   ├── scrapy.cfg
│   │   └── start.py
│   ├── qsbk
│   │   ├── .idea
│   │   │   ├── misc.xml
│   │   │   ├── modules.xml
│   │   │   ├── qsbk.iml
│   │   │   ├── vcs.xml
│   │   │   └── workspace.xml
│   │   ├── duanzi.json
│   │   ├── qsbk
│   │   │   ├── __init__.py
│   │   │   ├── items.py
│   │   │   ├── middlewares.py
│   │   │   ├── pipelines.py
│   │   │   ├── settings.py
│   │   │   └── spiders
│   │   │       ├── __init__.py
│   │   │       └── spider_qsbk.py
│   │   ├── readme.MD
│   │   └── scrapy.cfg
│   ├── sfw_spider
│   │   ├── .idea
│   │   │   ├── misc.xml
│   │   │   ├── modules.xml
│   │   │   ├── sfw.iml
│   │   │   ├── vcs.xml
│   │   │   └── workspace.xml
│   │   ├── requirements.txt
│   │   ├── scrapy.cfg
│   │   ├── sfw
│   │   │   ├── __init__.py
│   │   │   ├── items.py
│   │   │   ├── middlewares.py
│   │   │   ├── pipelines.py
│   │   │   ├── settings.py
│   │   │   └── spiders
│   │   │       ├── __init__.py
│   │   │       └── sfw_spider.py
│   │   └── start.py
│   └── weixin_community
│       ├── .idea
│       │   ├── misc.xml
│       │   ├── modules.xml
│       │   ├── vcs.xml
│       │   ├── weixin_community.iml
│       │   └── workspace.xml
│       ├── readme.MD
│       ├── scrapy.cfg
│       └── weixin_community
│           ├── __init__.py
│           ├── items.py
│           ├── middlewares.py
│           ├── pipelines.py
│           ├── settings.py
│           └── spiders
│               ├── __init__.py
│               └── wx_spider.py
├── spiders
│   ├── film_xinpianchang
│   │   ├── Film.py
│   │   ├── models.py
│   │   ├── tools_file.py
│   │   └── tools_string.py
│   ├── spider_bai_si_bu_de_jie.py
│   ├── spider_boss.py
│   ├── spider_china_weather.py
│   ├── spider_dou_tu_la.py
│   ├── spider_dytt.py
│   ├── spider_gushiwen.py
│   ├── spider_lagou.py
│   ├── spider_qiu_shi_bai_ke.py
│   ├── spider_tencent_recruit.py
│   ├── 发表情
│   │   ├── auto_send_emoji.py
│   │   └── utils
│   │       ├── chat_utils.py
│   │       └── string_utils.py
│   └── 年终奖
│       ├── comments.txt
│       ├── nzj.py
│       └── output.png
├── verification code
│   └── 注册【中知网】
│       ├── AipOcr.py
│       ├── cnki_demo.py
│       ├── file_tools.py
│       ├── image_code.png
│       └── screen_shot.png
├── 微信聊天记录
│   ├── main.py
│   └── utils
│       ├── dbutils.py
│       └── string_utils.py
└── 获取女友的位置
    ├── .idea
    │   ├── inspectionProfiles
    │   │   └── Project_Default.xml
    │   ├── misc.xml
    │   ├── modules.xml
    │   ├── vcs.xml
    │   ├── workspace.xml
    │   └── 地理位置.iml
    ├── main.py
    ├── picture
    │   └── 11441566648796_.pic_hd.jpg
    └── position_utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | .DS_Store
104 |
105 |
106 | # mypy
107 | .mypy_cache/
108 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/spider_python.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/ModifyLocation/gps_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: gps_utils.py
12 | @time: 2019-11-17 10:34
13 | @description:TODO
14 | """
15 |
16 | import math
17 |
18 |
19 | def gps_to_dms(gps_data):
 20 | """
 21 | Convert a decimal GPS coordinate (float) into (degree, minute, second) rational pairs.
 22 | e.g. 116.397451
 23 | :param gps_data:
 24 | :return:
 25 | """
 26 | # Degrees: floor of the value
 27 | gps_degree = math.floor(gps_data)
 28 |
 29 | gps_data_temp1 = (gps_data - gps_degree) * 60
 30 |
 31 | # Minutes
 32 | gps_minute = math.floor(gps_data_temp1)
 33 |
 34 | gps_data_temp2 = gps_data_temp1 - gps_minute
 35 |
 36 | # Seconds, rounded to 2 decimal places
 37 | gps_second = round(gps_data_temp2 * 60, 2)
 38 |
 39 | # Note: the seconds value must be converted to an integer, stored as the rational (seconds * 100, 100)
 40 | result = ((gps_degree, 1), (gps_minute, 1), (int(gps_second * 100), 100))
41 |
42 | return result
43 |
44 |
45 | def dms_to_gps(dms_data):
46 | """
 47 | Convert (degree, minute, second) rational pairs back to a decimal coordinate (float)
48 | :param dms_data:
49 | :return:
50 | """
51 | data1 = dms_data[0][0] / dms_data[0][1]
52 |
53 | data2 = dms_data[1][0] / dms_data[1][1] / 60
54 |
55 | data3 = dms_data[2][0] / dms_data[2][1] / 3600
56 |
57 | result = round(data1 + data2 + data3,6)
58 |
59 | return result
60 |
--------------------------------------------------------------------------------
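
A quick usage sketch for the two helpers above (not part of the repository); the example coordinate and the printed values are illustrative:

```python
# Round-trip check for gps_to_dms / dms_to_gps (assumes gps_utils.py is importable).
from gps_utils import gps_to_dms, dms_to_gps

lng = 116.397451
dms = gps_to_dms(lng)       # ((116, 1), (23, 1), (5082, 100)) -> 116° 23' 50.82"
print(dms)

restored = dms_to_gps(dms)  # back to a decimal value, rounded to 6 places
print(restored)             # 116.39745 -- small loss from rounding seconds to 2 decimals
```
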
/ModifyLocation/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: main.py
12 | @time: 2019-11-16 10:12
 13 | @description: modify the GPS location stored in an image's EXIF data
14 | """
15 |
16 | import requests
17 | import time
18 | from PIL import Image
19 | import piexif
20 | import json
21 | from gps_utils import *
22 | from position_utils import *
23 |
24 |
25 | # 依赖:pip3 install piexif
26 |
27 | class Exif():
28 | def __init__(self):
29 | self.time = '2019:11:17 14:13:22'
30 |
31 | # 地理编码(地址转为经纬度)
32 | self.url_geo = 'https://restapi.amap.com/v3/geocode/geo'
33 |
34 | # 逆地理编码(经纬度转为地址)
35 | self.url_regeo = 'https://restapi.amap.com/v3/geocode/regeo?parameters'
36 |
37 | # key
38 | self.ak = '你的ak'
39 |
40 | # 数字签名
41 | self.sign = '你的sign'
42 |
43 | def read_image(self, image_path):
44 | """
 45 | Read the image and extract its GPS position
 46 | piexif: reads the image's EXIF attributes
47 | :return:
48 | """
49 | exif_dict = piexif.load(image_path)
50 |
51 | if exif_dict['GPS']:
52 |
53 | # 纬度
54 | gps_lati_pre = exif_dict['GPS'][2]
55 |
56 | gps_lati = dms_to_gps(gps_lati_pre)
57 |
58 | # 经度
59 | gps_long_pre = exif_dict['GPS'][4]
60 | gps_long = dms_to_gps(gps_long_pre)
61 |
62 | # GPS坐标转为高德坐标
63 | lng, lat = wgs84togcj02(gps_long, gps_lati)
64 |
65 | # print(lng, lat)
66 |
67 | print(f"原图地理位置如下\n经度:{lng}\n纬度:{lat}\n")
68 |
69 | return f'{lng}, {lat}'
70 | else:
71 | print(f'抱歉!这张图片不包含地理位置!')
72 |
73 | def current_time(self):
74 | """
75 | 获取当前时间
76 | :return:
77 | """
78 | time_now = time.strftime('%Y:%m:%d %H:%M:%S', time.localtime(time.time()))
79 |
80 | result = bytes(time_now, encoding='utf-8')
81 |
82 | return result
83 |
84 | def str_to_bytes(self, str_content):
85 | """
86 | 字符串转bytes
87 | :return:
88 | """
89 | return bytes(str_content, encoding='utf-8')
90 |
91 | def is_image(self, filename):
92 | """
93 | 判断文件是否是一张图片
94 | :param filename:
95 | :return:
96 | """
97 | file_suffix = filename.split('.')[-1]
98 |
99 | if file_suffix == 'jpg' or file_suffix == 'png':
100 | return True
101 | else:
102 | return False
103 |
104 | def write_image(self, image_path, gps_long, gps_lati):
105 | """
 106 | Write new GPS coordinates into a single image's EXIF data
 107 | :param image_path: path of the image file
108 | :return:
109 | """
110 | # 读取图片
111 | img = Image.open(image_path)
112 |
113 | try:
114 | exif_dict = piexif.load(img.info['exif'])
 115 | except Exception:
116 | print('加载文件地理位置异常!')
117 | return
118 |
119 | # 修改地理位置
120 | # GPS GPSLatitudeRef:N
121 | # GPS GPSLatitude:[22, 32, 189/20]
122 | # GPS GPSLongitudeRef:E
123 | # GPS GPSLongitude:[114, 1, 689/20]
124 | exif_dict['GPS'][2] = gps_to_dms(gps_lati)
125 | exif_dict['GPS'][4] = gps_to_dms(gps_long)
126 |
127 | exif_bytes = piexif.dump(exif_dict)
128 |
129 | # 写入到新的图片中去
130 | img.save(image_path, 'jpeg', exif=exif_bytes)
131 |
132 | def get_address_by_location(self, location):
133 | """
134 | 通过经纬度拿到地理位置
135 | :param location:
136 | :return:
137 | """
138 | params = {
139 | 'key': self.ak,
140 | 'location': location,
141 | 'sig': self.sign
142 | }
143 |
144 | resp = json.loads(requests.get(url=self.url_regeo, params=params).text)
145 |
146 | if resp and resp.get('regeocode') and resp.get('regeocode').get('formatted_address'):
147 | address = resp.get('regeocode').get('formatted_address')
148 | print(f'原图的拍摄地址为:{address}\n')
149 | else:
150 | print('api解析地址出错,请检查ak!\n')
151 |
152 | def get_location_by_address(self, city, address):
153 | """
154 | 通过地理位置到拿到经纬度
155 | 地理编码:https://lbs.amap.com/api/webservice/guide/api/georegeo/
156 | :param address:
157 | :return:
158 | """
159 | params = {
160 | 'key': self.ak,
161 | 'city': city,
162 | 'address': address,
163 | 'sig': self.sign
164 | }
165 |
166 | resp = json.loads(requests.get(url=self.url_geo, params=params).text)
167 |
168 | # 获取坐标地址
169 | if resp and len(resp.get('geocodes')) >= 1 and resp.get('geocodes')[0].get('location'):
170 | location = resp.get('geocodes')[0].get('location')
171 | gps_data = location.split(',')
172 |
173 | # 得到经度和纬度
174 | gps_long = float(gps_data[0])
175 | gps_lati = float(gps_data[1])
176 |
177 | return gps_long, gps_lati
178 | else:
179 | print('api解析地址出错,请检查ak!')
180 | return None
181 |
182 |
183 | if __name__ == '__main__':
184 | exif = Exif()
185 |
186 | image_path = './WechatIMG1439.jpeg'
187 |
188 | # 1、读取原图的属性
189 | location = exif.read_image(image_path)
190 |
191 | if location:
192 | # 2、原图的详细地址
193 | exif.get_address_by_location(location)
194 |
195 | # 3、输入地址(市+目的地,例如:深圳莲花山公园)
196 | city = input('请输入定位城市(例如:深圳):')
197 | address = input('请输入具体的定位地址(例如:莲花山公园):')
198 |
199 | if address:
200 | # 通过地址拿到坐标地址
201 | location = exif.get_location_by_address(city, address)
202 |
203 | if location:
204 | # 4、修改图片属性,写入经度和纬度
205 | exif.write_image(image_path, location[0], location[1])
206 | print('修改图片地理成功!')
207 | else:
208 | print('请先输入具体地址!')
209 |
--------------------------------------------------------------------------------
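
One detail worth noting about `write_image()` above: it rewrites only the `GPSLatitude`/`GPSLongitude` tags (keys 2 and 4) and leaves the hemisphere reference tags untouched, which is fine for the N/E quadrant the demo targets. A minimal sketch (an assumption, not the author's code) that also sets the reference tags via piexif's named constants:

```python
# Sketch: rewrite EXIF GPS tags with piexif, including the hemisphere references.
# write_gps() is a hypothetical helper; gps_to_dms() comes from gps_utils.py above.
import piexif
from gps_utils import gps_to_dms


def write_gps(image_path, lng, lat):
    exif_dict = piexif.load(image_path)
    exif_dict['GPS'][piexif.GPSIFD.GPSLatitudeRef] = b'N' if lat >= 0 else b'S'
    exif_dict['GPS'][piexif.GPSIFD.GPSLatitude] = gps_to_dms(abs(lat))
    exif_dict['GPS'][piexif.GPSIFD.GPSLongitudeRef] = b'E' if lng >= 0 else b'W'
    exif_dict['GPS'][piexif.GPSIFD.GPSLongitude] = gps_to_dms(abs(lng))
    # piexif.insert() writes the new EXIF block back into the JPEG in place.
    piexif.insert(piexif.dump(exif_dict), image_path)
```
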
/ModifyLocation/position_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: position_utils.py
12 | @time: 2019-08-23 17:44
 13 | @description: coordinate system conversion
14 | """
15 |
16 | # -*- coding: utf-8 -*-
17 | import math
18 |
19 | x_pi = 3.14159265358979324 * 3000.0 / 180.0
20 | pi = 3.1415926535897932384626 # π
 21 | a = 6378245.0 # semi-major axis (Krasovsky ellipsoid)
 22 | ee = 0.00669342162296594323 # square of the first eccentricity
23 |
24 |
25 | def wgs84togcj02(lng, lat):
26 | """
 27 | Convert WGS-84 to GCJ-02 (the "Mars" coordinate system used by Chinese map providers)
 28 | :param lng: longitude in WGS-84
 29 | :param lat: latitude in WGS-84
30 | :return:
31 | """
 32 | if out_of_china(lng, lat): # no offset is applied outside mainland China
33 | return lng, lat
34 | dlat = transformlat(lng - 105.0, lat - 35.0)
35 | dlng = transformlng(lng - 105.0, lat - 35.0)
36 | radlat = lat / 180.0 * pi
37 | magic = math.sin(radlat)
38 | magic = 1 - ee * magic * magic
39 | sqrtmagic = math.sqrt(magic)
40 | dlat = (dlat * 180.0) / ((a * (1 - ee)) / (magic * sqrtmagic) * pi)
41 | dlng = (dlng * 180.0) / (a / sqrtmagic * math.cos(radlat) * pi)
42 | mglat = lat + dlat
43 | mglng = lng + dlng
44 | return [mglng, mglat]
45 |
46 |
47 | def gcj02towgs84(lng, lat):
48 | """
 49 | Convert GCJ-02 (Mars coordinates) back to WGS-84
 50 | :param lng: longitude in GCJ-02
 51 | :param lat: latitude in GCJ-02
52 | :return:
53 | """
54 | if out_of_china(lng, lat):
55 | return lng, lat
56 | dlat = transformlat(lng - 105.0, lat - 35.0)
57 | dlng = transformlng(lng - 105.0, lat - 35.0)
58 | radlat = lat / 180.0 * pi
59 | magic = math.sin(radlat)
60 | magic = 1 - ee * magic * magic
61 | sqrtmagic = math.sqrt(magic)
62 | dlat = (dlat * 180.0) / ((a * (1 - ee)) / (magic * sqrtmagic) * pi)
63 | dlng = (dlng * 180.0) / (a / sqrtmagic * math.cos(radlat) * pi)
64 | mglat = lat + dlat
65 | mglng = lng + dlng
66 | return [lng * 2 - mglng, lat * 2 - mglat]
67 |
68 |
69 | def transformlat(lng, lat):
70 | ret = -100.0 + 2.0 * lng + 3.0 * lat + 0.2 * lat * lat + \
71 | 0.1 * lng * lat + 0.2 * math.sqrt(math.fabs(lng))
72 | ret += (20.0 * math.sin(6.0 * lng * pi) + 20.0 *
73 | math.sin(2.0 * lng * pi)) * 2.0 / 3.0
74 | ret += (20.0 * math.sin(lat * pi) + 40.0 *
75 | math.sin(lat / 3.0 * pi)) * 2.0 / 3.0
76 | ret += (160.0 * math.sin(lat / 12.0 * pi) + 320 *
77 | math.sin(lat * pi / 30.0)) * 2.0 / 3.0
78 | return ret
79 |
80 |
81 | def transformlng(lng, lat):
82 | ret = 300.0 + lng + 2.0 * lat + 0.1 * lng * lng + \
83 | 0.1 * lng * lat + 0.1 * math.sqrt(math.fabs(lng))
84 | ret += (20.0 * math.sin(6.0 * lng * pi) + 20.0 *
85 | math.sin(2.0 * lng * pi)) * 2.0 / 3.0
86 | ret += (20.0 * math.sin(lng * pi) + 40.0 *
87 | math.sin(lng / 3.0 * pi)) * 2.0 / 3.0
88 | ret += (150.0 * math.sin(lng / 12.0 * pi) + 300.0 *
89 | math.sin(lng / 30.0 * pi)) * 2.0 / 3.0
90 | return ret
91 |
92 |
93 | def out_of_china(lng, lat):
94 | """
 95 | Check whether the coordinate lies outside mainland China; if so, no offset is applied
96 | :param lng:
97 | :param lat:
98 | :return:
99 | """
100 | if lng < 72.004 or lng > 137.8347:
101 | return True
102 | if lat < 0.8293 or lat > 55.8271:
103 | return True
104 | return False
105 |
--------------------------------------------------------------------------------
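
A short usage sketch for the converters above (illustrative coordinates in Beijing, not taken from the repository):

```python
# WGS-84 -> GCJ-02 and back; the inverse is approximate (metre-level error).
from position_utils import wgs84togcj02, gcj02towgs84, out_of_china

lng, lat = 116.397451, 39.909187            # WGS-84 (raw GPS)
print(out_of_china(lng, lat))               # False -> the offset will be applied
mglng, mglat = wgs84togcj02(lng, lat)       # GCJ-02, what AMap/Tencent maps expect
back_lng, back_lat = gcj02towgs84(mglng, mglat)
print(abs(back_lng - lng) < 1e-4, abs(back_lat - lat) < 1e-4)   # True True
```
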
/Python调用JAR/exec_jar_example.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: exec_jar_example.py
12 | @time: 2021-01-02 12:30
13 | @description:TODO
14 | """
15 |
16 | import jpype
17 | import os
18 |
19 | # 初始化
20 | jar_path = os.path.join(os.path.abspath('.'), 'jar/encry.jar')
21 |
22 | print(jar_path)
23 |
24 | # 启动jvm
25 | jpype.startJVM(jpype.getDefaultJVMPath(), "-ea", "-Djava.class.path=%s" % (jar_path))
26 |
27 |
28 | # 通过包名,实例化JAVA对象
29 | EncryClass = jpype.JClass("com.xingag.common.EncryHelper")
30 | encryClass = EncryClass()
31 |
32 | # 调用JAVA中的加密方法
33 | content_encry = encryClass.encrypt("xag")
34 | print(content_encry)
35 |
36 | # 关闭jvm
37 | jpype.shutdownJVM()
38 |
--------------------------------------------------------------------------------
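
A slightly more defensive variant of the same call sequence (a sketch, not part of the repo): guard against double-starting the JVM and always shut it down, which matters when the snippet runs repeatedly inside one Python process.

```python
import os
import jpype

jar_path = os.path.join(os.path.abspath('.'), 'jar/encry.jar')

# Start the JVM only if it is not already running.
if not jpype.isJVMStarted():
    jpype.startJVM(jpype.getDefaultJVMPath(), "-ea", "-Djava.class.path=%s" % jar_path)

try:
    EncryHelper = jpype.JClass("com.xingag.common.EncryHelper")
    helper = EncryHelper()
    print(helper.encrypt("xag"))
finally:
    # Always release the JVM, even if the Java call raises.
    jpype.shutdownJVM()
```
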
/Python调用JAR/jar/com/xingag/common/EncryHelper.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/Python调用JAR/jar/com/xingag/common/EncryHelper.class
--------------------------------------------------------------------------------
/Python调用JAR/jar/encry.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/Python调用JAR/jar/encry.jar
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # spider_python
 2 |
 3 | ## Preface
 4 |
 5 | For detailed tutorials, follow the WeChat official account: **AirPython**
 6 |
 7 |
 8 |
 9 |
 10 |
 11 | ## Basic crawlers
 12 |
 13 | * [Scrape the latest movie data from Dianying Tiantang (Movie Heaven) - xpath](./spiders/spider_dytt.py)
 14 |
 15 | * [Scrape job data from Tencent Recruitment - xpath](./spiders/spider_tencent_recruit.py)
 16 |
 17 | * [Scrape nationwide weather from the China Weather website and generate a pie chart - bs4](./spiders/spider_china_weather.py)
 18 |
 19 | * [Scrape data from the Gushiwen (classical poetry) website - re](./spiders/spider_gushiwen.py)
 20 |
 21 | * [Scrape jokes from Qiushibaike - re](./spiders/spider_qiu_shi_bai_ke.py)
 22 |
 23 |
 24 |
 25 | ## Multi-threaded crawlers
 26 |
 27 | * [Scrape meme images from Doutula with multiple threads and download them locally - xpath + threading](./spiders/spider_dou_tu_la.py)
 28 | * [Send memes to specific contacts and WeChat groups with itchat](./spiders/发表情/)
 29 | * [Scrape text and images from Baisibudejie with multiple threads and write them to a CSV file](./spiders/spider_bai_si_bu_de_jie.py)
 30 |
 31 |
 32 |
 33 | ## Selenium automation crawlers
 34 |
 35 | * [Scrape job listings from Lagou - selenium + requests + lxml](./spiders/spider_lagou.py)
 36 |
 37 | * [Scrape job listings from Boss Zhipin - selenium + lxml](./spiders/spider_boss.py)
 38 |
 39 |
 40 |
 41 | ## Scrapy framework crawlers
 42 | * [Scrape Qiushibaike jokes and save them to a JSON file](./scrapy/qsbk/readme.MD)
 43 | * [Scrape data from the WeChat Mini Program forum](./scrapy/weixin_community/readme.MD)
 44 | * [Log in to Douban and change the personal signature](./scrapy/douban_login/readme.MD)
 45 | * [Download high-resolution images from Autohome](./scrapy/qczj/readme.MD)
 46 | * [Scrape all article data from Jianshu](./scrapy/jianshu_spider/)
 47 | * [Scrape all housing data from Fang.com, including new and second-hand homes](./scrapy/sfw_spider)
 48 |
 49 |
 50 |
 51 |
 52 |
 53 | ## feapder
 54 |
 55 | * [feapder AirSpider example](./feapder/tophub_demo)
 56 |
 57 |
 58 |
 59 | ## Node.js crawler
 60 |
 61 | * [Scrape Jianshu articles with puppeteer and save them locally](./js/jian_shu.js)
 62 |
 63 |
 64 |
 65 | ## Others
 66 |
 67 | * [Use Python to locate your girlfriend's position](./获取女友的位置)
 68 | * [My girlfriend secretly hid her tracks with Python (rewriting photo GPS data)](./ModifyLocation)
 69 | * [WeChat group chat records](./微信聊天记录)
 70 | * [Call a JAR from Python](./Python调用JAR)
 71 |
 72 |
--------------------------------------------------------------------------------
/feapder/tophub_demo/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
47 |
48 |
49 |
54 |
55 |
56 |
--------------------------------------------------------------------------------
/feapder/tophub_demo/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/feapder/tophub_demo/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/feapder/tophub_demo/.idea/tophub_demo.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/feapder/tophub_demo/items/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = [
2 | "topic_item"
3 | ]
--------------------------------------------------------------------------------
/feapder/tophub_demo/items/topic_item.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on 2021-04-08 12:20:22
4 | ---------
5 | @summary:
6 | ---------
7 | @author: xingag
8 | """
9 |
10 | from feapder import Item
11 |
12 |
13 | class TopicItem(Item):
14 | """
15 | This class was generated by feapder.
16 | command: feapder create -i topic.
17 | """
18 |
19 | def __init__(self, *args, **kwargs):
 20 | # self.id = None
 21 | self.title = None # article title
 22 | self.auth = None # author
 23 | self.like_count = 0 # number of likes
 24 | self.collection = 0 # number of favorites
 25 | self.comment = 0 # number of comments
26 |
--------------------------------------------------------------------------------
/feapder/tophub_demo/main.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on 2021-04-08 11:57:08
4 | ---------
 5 | @summary: spider entry point
6 | ---------
7 | @author: xingag
8 | """
9 |
10 | from feapder import ArgumentParser
11 |
12 | from spiders import *
13 |
14 |
15 | def crawl_xxx():
16 | """
17 | 普通爬虫
18 | """
19 | spider = xxx.XXXSpider(redis_key="xxx:xxx")
20 | spider.start()
21 |
22 |
23 | def crawl_xxx(args):
24 | """
25 | 批次爬虫
26 | @param args: 1 / 2 / init
27 | """
28 | spider = xxx_spider.XXXSpider(
29 | task_table="", # mysql中的任务表
30 | batch_record_table="", # mysql中的批次记录表
31 | batch_name="xxx(周全)", # 批次名字
32 | batch_interval=7, # 批次时间 天为单位 若为小时 可写 1 / 24
33 | task_keys=["id", "xxx"], # 需要获取任务表里的字段名,可添加多个
34 | redis_key="xxx:xxxx", # redis中存放request等信息的根key
35 | task_state="state", # mysql中任务状态字段
36 | )
37 |
38 | if args == 1:
39 | spider.start_monitor_task()
40 | elif args == 2:
41 | spider.start()
42 | elif args == "init":
43 | spider.init_task()
44 |
45 |
46 | if __name__ == "__main__":
47 | parser = ArgumentParser(description="xxx爬虫")
48 |
49 | parser.add_argument(
50 | "--crawl_xxx", action="store_true", help="xxx", function=crawl_xxx
51 | )
52 | parser.add_argument(
53 | "--crawl_xxx", type=int, nargs=1, help="xxx(1|2)", function=crawl_xxx
54 | )
55 |
56 | parser.start()
57 |
--------------------------------------------------------------------------------
/feapder/tophub_demo/setting.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
 2 | """Spider configuration file"""
3 | import os
4 |
5 |
6 | # MYSQL
7 | MYSQL_IP = "localhost"
8 | MYSQL_PORT = 3306
9 | MYSQL_DB = "xag"
10 | MYSQL_USER_NAME = "root"
11 | MYSQL_USER_PASS = "root"
12 |
13 | # REDIS
14 | # IP:PORT
15 | REDISDB_IP_PORTS = "xxx:6379"
16 | REDISDB_USER_PASS = ""
17 | # 默认 0 到 15 共16个数据库
18 | REDISDB_DB = 0
19 | # # 适用于redis哨兵模式
20 | # REDISDB_SERVICE_NAME = None
21 | #
22 | # # 数据入库的pipeline,可自定义,默认MysqlPipeline
23 | # ITEM_PIPELINES = ["feapder.pipelines.mysql_pipeline.MysqlPipeline"]
24 | #
25 | # # 爬虫相关
26 | # # COLLECTOR
27 | # COLLECTOR_SLEEP_TIME = 1 # 从任务队列中获取任务到内存队列的间隔
28 | # COLLECTOR_TASK_COUNT = 100 # 每次获取任务数量
29 | #
30 | # # SPIDER
31 | # SPIDER_THREAD_COUNT = 100 # 爬虫并发数
32 | # SPIDER_SLEEP_TIME = 0 # 下载时间间隔(解析完一个response后休眠时间)
33 | # SPIDER_MAX_RETRY_TIMES = 100 # 每个请求最大重试次数
34 | # WARNING_FAILED_COUNT = 1000 # 任务失败数 超过WARNING_FAILED_COUNT则报警
35 | #
36 | # # 浏览器渲染下载
37 | # WEBDRIVER = dict(
38 | # pool_size=2, # 浏览器的数量
39 | # load_images=False, # 是否加载图片
40 | # user_agent=None, # 字符串 或 无参函数,返回值为user_agent
41 | # proxy=None, # xxx.xxx.xxx.xxx:xxxx 或 无参函数,返回值为代理地址
42 | # headless=False, # 是否为无头浏览器
43 | # driver_type="CHROME", # CHROME 或 PHANTOMJS,
44 | # timeout=30, # 请求超时时间
45 | # window_size=(1024, 800), # 窗口大小
46 | # executable_path=None, # 浏览器路径,默认为默认路径
47 | # )
48 | #
49 | # # 重新尝试失败的requests 当requests重试次数超过允许的最大重试次数算失败
50 | # RETRY_FAILED_REQUESTS = False
51 | # # request 超时时间,超过这个时间重新做(不是网络请求的超时时间)单位秒
52 | # REQUEST_TIME_OUT = 600 # 10分钟
53 | # # 保存失败的request
54 | # SAVE_FAILED_REQUEST = True
55 | #
56 | # # 下载缓存 利用redis缓存,由于内存小,所以仅供测试时使用
57 | # RESPONSE_CACHED_ENABLE = False # 是否启用下载缓存 成本高的数据或容易变需求的数据,建议设置为True
58 | # RESPONSE_CACHED_EXPIRE_TIME = 3600 # 缓存时间 秒
59 | # RESPONSE_CACHED_USED = False # 是否使用缓存 补采数据时可设置为True
60 | #
61 | # # 爬虫是否自动结束,若为False,则会等待新任务下发,进程不退出
62 | # AUTO_STOP_WHEN_SPIDER_DONE = True
63 | #
64 | # # 设置代理
65 | # PROXY_EXTRACT_API = None # 代理提取API ,返回的代理分割符为\r\n
66 | # PROXY_ENABLE = True
67 | #
68 | # # 随机headers
69 | # RANDOM_HEADERS = True
70 | # # requests 使用session
71 | # USE_SESSION = False
72 | #
73 | # # 去重
74 | # ITEM_FILTER_ENABLE = False # item 去重
75 | # REQUEST_FILTER_ENABLE = False # request 去重
76 | #
77 | # # 报警 支持钉钉及邮件,二选一即可
78 | # # 钉钉报警
79 | # DINGDING_WARNING_URL = "" # 钉钉机器人api
80 | # DINGDING_WARNING_PHONE = "" # 报警人 支持列表,可指定多个
81 | # # 邮件报警
82 | # EAMIL_SENDER = "" # 发件人
83 | # EAMIL_PASSWORD = "" # 授权码
84 | # EMAIL_RECEIVER = "" # 收件人 支持列表,可指定多个
85 | # # 报警时间间隔及级别
86 | # WARNING_INTERVAL = 3600 # 相同报警的报警时间间隔,防止刷屏
87 | # WARNING_LEVEL = "DEBUG" # 报警级别, DEBUG / ERROR
88 | #
89 | # LOG_NAME = os.path.basename(os.getcwd())
90 | # LOG_PATH = "log/%s.log" % LOG_NAME # log存储路径
91 | # LOG_LEVEL = "DEBUG"
92 | # LOG_IS_WRITE_TO_FILE = False
93 |
--------------------------------------------------------------------------------
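
The settings above point the demo at a local `xag` database, but the repository does not ship a DDL for the `topic` table that `tophub_spider.py` writes to. A possible schema matching the `TopicItem` fields (the column types are an assumption):

```python
# Sketch: create a `topic` table compatible with TopicItem, using the MySQL settings above.
import pymysql

DDL = """
CREATE TABLE IF NOT EXISTS topic (
    id INT AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(255),
    auth VARCHAR(255),
    like_count INT DEFAULT 0,
    collection INT DEFAULT 0,
    comment INT DEFAULT 0
) DEFAULT CHARSET = utf8mb4;
"""

conn = pymysql.connect(host="localhost", port=3306, user="root", password="root", database="xag")
try:
    with conn.cursor() as cursor:
        cursor.execute(DDL)
    conn.commit()
finally:
    conn.close()
```
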
/feapder/tophub_demo/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = [
2 | "tophub_spider"
3 | ]
--------------------------------------------------------------------------------
/feapder/tophub_demo/spiders/tophub_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on 2021-04-08 12:03:28
4 | ---------
5 | @summary:
6 | ---------
7 | @author: xingag
8 | """
9 |
10 | import re
11 |
12 | import feapder
13 | from fake_useragent import UserAgent
14 | from feapder.db.mysqldb import MysqlDB
15 |
16 |
17 | # 爬取数据并入库
18 |
19 | class TophubSpider(feapder.AirSpider):
20 |
21 | def __init__(self, *args, **kwargs):
22 | super().__init__(*args, **kwargs)
23 | self.db = MysqlDB()
24 |
25 | def start_requests(self):
26 | yield feapder.Request("https://tophub.today/", download_midware=self.download_midware)
27 |
28 | def parse(self, request, response):
29 | # print(response.text)
30 | card_elements = response.xpath('//div[@class="cc-cd"]')
31 |
32 | # 过滤出对应的卡片元素【什么值得买】
33 | buy_good_element = [card_element for card_element in card_elements if
34 | card_element.xpath('.//div[@class="cc-cd-is"]//span/text()').extract_first() == '什么值得买'][0]
35 |
36 | # 获取内部文章标题及地址
37 | a_elements = buy_good_element.xpath('.//div[@class="cc-cd-cb nano"]//a')
38 |
39 | for a_element in a_elements:
40 | # 标题和链接
41 | title = a_element.xpath('.//span[@class="t"]/text()').extract_first()
42 | href = a_element.xpath('.//@href').extract_first()
43 |
44 | # 再次下发新任务,并带上文章标题
45 | yield feapder.Request(href, download_midware=self.download_midware, callback=self.parser_detail_page,
46 | title=title)
47 |
48 | def parser_detail_page(self, request, response):
49 | """
50 | 解析文章详情数据
51 | :param request:
52 | :param response:
53 | :return:
54 | """
55 | title = request.title
56 |
57 | url = request.url
58 |
59 | # 解析文章详情页面,获取点赞、收藏、评论数目及作者名称
60 | author = response.xpath('//a[@class="author-title"]/text()').extract_first().strip()
61 |
62 | print("作者:", author, '文章标题:', title, "地址:", url)
63 |
64 | desc_elements = response.xpath('//span[@class="xilie"]/span')
65 |
66 | print("desc数目:", len(desc_elements))
67 |
 68 | # Likes
 69 | like_count = int(re.findall(r'\d+', desc_elements[1].xpath('./text()').extract_first())[0])
 70 | # Favorites
 71 | collection_count = int(re.findall(r'\d+', desc_elements[2].xpath('./text()').extract_first())[0])
 72 | # Comments
 73 | comment_count = int(re.findall(r'\d+', desc_elements[3].xpath('./text()').extract_first())[0])
74 |
75 | print("点赞:", like_count, "收藏:", collection_count, "评论:", comment_count)
76 |
77 | # 插入数据库
78 | sql = "INSERT INTO topic(title,auth,like_count,collection,comment) values('%s','%s','%s','%d','%d')" % (
79 | title, author, like_count, collection_count, comment_count)
80 |
81 | # 执行
82 | self.db.execute(sql)
83 |
84 | def download_midware(self, request):
85 | # 随机UA
86 | # 依赖:pip3 install fake_useragent
87 | ua = UserAgent().random
88 | request.headers = {'User-Agent': ua}
89 | return request
90 |
91 |
92 | if __name__ == "__main__":
93 | TophubSpider(thread_count=10).start()
94 |
--------------------------------------------------------------------------------
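
The `INSERT` in `parser_detail_page()` is built with `%` string formatting, so a title containing a single quote will break the statement. A safer pattern is driver-side parameter binding; since it is not shown here whether feapder's `MysqlDB.execute` accepts bind parameters, the sketch below uses pymysql directly and a hypothetical helper name:

```python
# Sketch: parameterised insert into the `topic` table (not feapder's API).
import pymysql


def save_topic(conn, title, author, like_count, collection_count, comment_count):
    sql = ("INSERT INTO topic (title, auth, like_count, collection, comment) "
           "VALUES (%s, %s, %s, %s, %s)")
    with conn.cursor() as cursor:
        # The driver escapes every value, so quotes inside the title are handled safely.
        cursor.execute(sql, (title, author, like_count, collection_count, comment_count))
    conn.commit()
```
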
/feapder/tophub_demo/test.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: test.py
12 | @time: 2021/4/8 下午12:26
13 | @description:TODO
14 | """
15 |
16 | from fake_useragent import UserAgent
17 |
18 | ua = UserAgent().random
19 | print(ua)
--------------------------------------------------------------------------------
/js/jian_shu.js:
--------------------------------------------------------------------------------
1 | //简书上的文章保存为pdf保存到本地
2 | const puppeteer = require('puppeteer');
3 |
4 | const mkdirp = require('mkdirp');
5 |
6 | BASE_URL = 'https://www.jianshu.com';
7 |
8 | HOME_URL = `${BASE_URL}/u/f46becd1ed83`;
9 |
10 | //文章目录
11 | const ARTICLE_PATH = './monkey';
12 |
13 | const download_article = async () => {
14 |
15 | const viewport_size = {
16 | width: 0,
17 | height: 0,
18 | };
19 |
20 | const browser = await puppeteer.launch({
21 | headless: true,
22 | });
23 |
24 | const page = await browser.newPage();
25 |
26 | page.setViewport(viewport_size);
27 |
28 | //打开文章主页
29 | await page.goto(HOME_URL);
30 |
31 | console.log('显示文章列表,马上开始滑动')
32 |
33 | //滑动文章列表,使所有文章被加载出来
34 | //参考:https://github.com/GoogleChrome/puppeteer/issues/844
35 | await autoScroll(page);
36 |
37 | console.log('所有文章加载完成');
38 |
39 | const articles = await page.$eval('.note-list', articles_element => {
40 | const article_elements = articles_element.querySelectorAll('li');
41 | const articleElementArray = Array.prototype.slice.call(article_elements);
42 |
43 | return articleElementArray.map(item => {
44 | const a_element = item.querySelector('.title');
45 | return {
46 | href: a_element.getAttribute('href'),
47 | title: a_element.innerHTML.trim(),
48 | };
49 | });
50 | });
51 |
52 | console.log(`大佬一共发布了${articles.length}篇文章`);
53 |
54 |
55 | //新建目录
56 | mkdirp.sync(ARTICLE_PATH);
57 |
58 | for (let article of articles) {
59 | const articlePage = await browser.newPage();
60 | articlePage.setViewport(viewport_size);
 61 | await articlePage.goto(`${BASE_URL}${article.href}`, {
62 | waitUntil: 'networkidle2'
63 | });
64 |
 65 | await articlePage.waitForSelector('.post');
66 | console.log('文章详情页面加载完成');
67 |
68 | //注意:这里必须等待几秒,不然下面的滑动会报错:
69 | // UnhandledPromiseRejectionWarning: Error: Execution context was destroyed, most likely because of a navigation.
70 | await articlePage.waitFor(2000);
71 |
72 | //滑动到最底部,加载出所有的图片
73 | await autoScroll(articlePage);
74 |
75 |
76 | //为了保证页面的整洁干净,屏蔽多余的元素
77 | await articlePage.$eval('body', body => {
78 | body.querySelector('.navbar').style.display = 'none';
79 | body.querySelector('#note-fixed-ad-container').style.display = 'none';
80 | body.querySelector('.note-bottom').style.display = 'none';
81 | body.querySelector('.side-tool').style.display = 'none';
82 | // body.querySelector('.author').style.display = 'none';
83 | body.querySelector('.meta-bottom').style.display = 'none';
84 | body.querySelector('#web-note-ad-1').style.display = 'none';
85 | body.querySelector('#comment-list').style.display = 'none';
86 | body.querySelector('.follow-detail').style.display = 'none';
87 | body.querySelector('.show-foot').style.display = 'none';
88 |
89 | Promise.resolve();
90 | });
91 |
 92 | // File name: replace "/" in the title so it forms a valid path
 93 | const fileName = `${article.title.replace(/\//g, "、")}.pdf`;
 94 | const fileFullPath = `${ARTICLE_PATH}/${fileName}`;
 95 | console.log(`文章保存的完整路径是:${fileFullPath}`);
 96 |
 97 | await articlePage.emulateMedia('screen');
98 | await articlePage.pdf({
99 | path: fileFullPath,
100 | format: 'A4'
101 | });
102 | console.log(`保存成功: ${fileFullPath}`);
103 | articlePage.close();
104 | }
105 |
106 | console.log('下载完成!Enjoy~');
107 | };
108 |
109 | function autoScroll(page) {
110 | return page.evaluate(() => {
111 | return new Promise((resolve, reject) => {
112 | var totalHeight = 0;
113 | var distance = 100;
114 | var timer = setInterval(() => {
115 | console.log('执行间断函数');
116 | var scrollHeight = document.body.scrollHeight;
117 | window.scrollBy(0, distance);
118 | totalHeight += distance;
119 |
120 | if (totalHeight >= scrollHeight) {
121 | console.log('滑动到底');
122 | clearInterval(timer);
123 | resolve();
124 | }
125 | }, 100);
126 | })
127 | });
128 | }
129 |
130 |
131 | module.exports = download_article;
132 |
133 | if (require.main === module) {
134 | download_article()
135 | }
136 |
137 |
138 |
--------------------------------------------------------------------------------
/pic/最低气温排行榜.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/pic/最低气温排行榜.png
--------------------------------------------------------------------------------
/raw/qr.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/raw/qr.jpeg
--------------------------------------------------------------------------------
/scrapy/douban_login/.idea/douban_login.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/scrapy/douban_login/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/scrapy/douban_login/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/scrapy/douban_login/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/scrapy/douban_login/captcha.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/scrapy/douban_login/captcha.png
--------------------------------------------------------------------------------
/scrapy/douban_login/douban_login/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/scrapy/douban_login/douban_login/__init__.py
--------------------------------------------------------------------------------
/scrapy/douban_login/douban_login/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class DoubanLoginItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | pass
15 |
--------------------------------------------------------------------------------
/scrapy/douban_login/douban_login/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class DoubanLoginSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class DoubanLoginDownloaderMiddleware(object):
60 | # Not all methods need to be defined. If a method is not defined,
61 | # scrapy acts as if the downloader middleware does not modify the
62 | # passed objects.
63 |
64 | @classmethod
65 | def from_crawler(cls, crawler):
66 | # This method is used by Scrapy to create your spiders.
67 | s = cls()
68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 | return s
70 |
71 | def process_request(self, request, spider):
72 | # Called for each request that goes through the downloader
73 | # middleware.
74 |
75 | # Must either:
76 | # - return None: continue processing this request
77 | # - or return a Response object
78 | # - or return a Request object
79 | # - or raise IgnoreRequest: process_exception() methods of
80 | # installed downloader middleware will be called
81 | return None
82 |
83 | def process_response(self, request, response, spider):
84 | # Called with the response returned from the downloader.
85 |
86 | # Must either;
87 | # - return a Response object
88 | # - return a Request object
89 | # - or raise IgnoreRequest
90 | return response
91 |
92 | def process_exception(self, request, exception, spider):
93 | # Called when a download handler or a process_request()
94 | # (from other downloader middleware) raises an exception.
95 |
96 | # Must either:
97 | # - return None: continue processing this exception
98 | # - return a Response object: stops process_exception() chain
99 | # - return a Request object: stops process_exception() chain
100 | pass
101 |
102 | def spider_opened(self, spider):
103 | spider.logger.info('Spider opened: %s' % spider.name)
104 |
--------------------------------------------------------------------------------
/scrapy/douban_login/douban_login/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class DoubanLoginPipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
--------------------------------------------------------------------------------
/scrapy/douban_login/douban_login/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for douban_login project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'douban_login'
13 |
14 | SPIDER_MODULES = ['douban_login.spiders']
15 | NEWSPIDER_MODULE = 'douban_login.spiders'
16 |
17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
18 | # USER_AGENT = 'douban_login (+http://www.yourdomain.com)'
19 |
20 | # Obey robots.txt rules
21 | ROBOTSTXT_OBEY = False
22 |
23 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
24 | # CONCURRENT_REQUESTS = 32
25 |
26 | # Configure a delay for requests for the same website (default: 0)
27 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
28 | # See also autothrottle settings and docs
29 | DOWNLOAD_DELAY = 1
 30 | # The download delay setting will honor only one of:
31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16
32 | # CONCURRENT_REQUESTS_PER_IP = 16
33 |
34 | # Disable cookies (enabled by default)
35 | # COOKIES_ENABLED = False
36 |
37 | # Disable Telnet Console (enabled by default)
38 | # TELNETCONSOLE_ENABLED = False
39 |
40 | # Override the default request headers:
41 | DEFAULT_REQUEST_HEADERS = {
42 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
43 | 'Accept-Language': 'en',
44 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
45 | }
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | # SPIDER_MIDDLEWARES = {
50 | # 'douban_login.middlewares.DoubanLoginSpiderMiddleware': 543,
51 | # }
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | # DOWNLOADER_MIDDLEWARES = {
56 | # 'douban_login.middlewares.DoubanLoginDownloaderMiddleware': 543,
57 | # }
58 |
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | # EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | # }
64 |
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'douban_login.pipelines.DoubanLoginPipeline': 300,
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | # AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | # AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | # AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | # AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | # HTTPCACHE_ENABLED = True
87 | # HTTPCACHE_EXPIRATION_SECS = 0
88 | # HTTPCACHE_DIR = 'httpcache'
89 | # HTTPCACHE_IGNORE_HTTP_CODES = []
90 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/scrapy/douban_login/douban_login/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/scrapy/douban_login/douban_login/spiders/douban.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from urllib import request
4 | from PIL import Image
5 | import ssl
6 |
7 |
8 | # 使用Scrapy登录豆瓣网
9 | # 验证码识别可以通过手动输入【PIL】和自动识别
10 |
11 | class DoubanSpider(scrapy.Spider):
12 | name = 'douban'
13 | allowed_domains = ['douban.com']
14 |
15 | # 默认首先请求这个地址【GET】,然后把请求结果返回给parse()函数解析
16 | start_urls = ['https://accounts.douban.com/login']
17 |
18 | # 登录url
19 | login_url = 'https://accounts.douban.com/login'
20 |
21 | # 个人中心url
22 | person_center_url = 'https://www.douban.com/people/165725759/'
23 |
24 | # 编辑签名的请求地址
25 | edit_signature = 'https://www.douban.com/j/people/165725759/edit_signature'
26 |
27 | def parse(self, response):
28 | """
29 | 请求后的解析
30 | 包含两种情况:1.第一次请求start_urls;2.某一次请求不包含callback
31 | :param response:
32 | :return:
33 | """
34 | # 注意:把最后的请求解析过滤掉
35 | # 如果解析到相应地址不是login_url就不做处理
36 | if response.url != self.login_url:
37 | return
38 |
39 | print('调用parse函数,此时的url:%s' % response.url)
40 | form_data = {
41 | 'source': 'index_nav',
42 | 'redir': 'https://www.douban.com/', # 登录后跳转到哪个界面
43 | 'form_email': '18520876423',
44 | 'form_password': 'Hu881025',
45 | # 'captcha-solution': 'chemical', # 验证码【需要识别图片】
46 | # 'captcha-id': 'ysCwMdnnq8YVpDJZdfmzHu1V:en', # 验证码ID 【每次刷新都重新生成一个,放入到input标签的name为captcha-id的value中】
47 | 'remember': 'on',
48 | 'login': '登录'
49 | }
50 |
51 | # 获取id为captcha-id的img标签【css方式,也可以选择用xpath】
52 | # 验证码图片的url
53 | captcha_img = response.css('img#captcha_image::attr(src)').get()
54 |
55 | # 注意:如果存在验证码,就识别验证码;如果没有验证码,不传入以下两个参数直接登录
56 | if captcha_img:
57 | # 手动识别验证码
 58 | captcha = self._recognize_captcha(captcha_img)
59 | form_data['captcha-solution'] = captcha
60 |
61 | # 验证码id【每次刷新都会变化】
62 | captcha_id = response.xpath('//input[@name="captcha-id"]/@value').get()
63 | form_data['captcha-id'] = captcha_id
64 | print('带有验证码的参数已经补充完整,现在开始发送请求')
65 | else:
66 | print('没有验证码,现在开始发送请求')
67 |
68 | # 发送登录请求【POST】
69 | yield scrapy.FormRequest(url=self.login_url, formdata=form_data, callback=self.parse_after_login)
70 |
 71 | def _recognize_captcha(self, image_url):
72 | """
73 | 人工识别验证码【urllib+PIL】
74 | :param image_url:
75 | :return:
76 | """
77 | print('验证码的地址:%s,开始下载图片' % image_url)
78 |
79 | # 下载图片到本地
80 | request.urlretrieve(image_url, 'captcha.png')
81 |
82 | print('下载图片完成,开始显示图片')
83 |
84 | # 显示在控制台,手动输入验证码
85 | # 打开图片
86 | image = Image.open('captcha.png')
87 | # 展示
88 | image.show()
89 |
90 | # 提示输入验证码
91 | captcha = input('请输入验证码:')
92 |
93 | return captcha
94 |
95 | def parse_after_login(self, response):
96 | """
97 | 登录成功之后,请求【个人中心】
98 | :param response:
99 | :return:
100 | """
101 | # 当前url
102 | current_page_url = response.url
103 | print('调用登录接口后,现在的界面是:%s' % current_page_url)
104 | if current_page_url == 'https://www.douban.com/':
105 | print('登录成功')
106 | # 请求个人中心的页面
107 | request = scrapy.Request(url=self.person_center_url, callback=self.parse_person_center)
108 | yield request
109 | else:
110 | print('登录失败')
111 |
112 | def parse_person_center(self, response):
113 | """
114 | 解析个人中心页面
115 | :param response:
116 | :return:
117 | """
118 | if response.url == self.person_center_url:
119 | print('进入到个人中心页面了')
120 | ck = response.xpath('//input[@name="ck"]/@value').get()
121 | print('获取的ck是:%s' % ck)
122 | formdata = {
123 | 'ck': ck,
124 | 'signature': '时光如水,岁月如斯'
125 | }
126 | # 发送post请求来更改签名
127 | yield scrapy.FormRequest(self.edit_signature, formdata=formdata)
128 | else:
129 | print('进入个人中心页面失败')
130 |
--------------------------------------------------------------------------------
/scrapy/douban_login/readme.MD:
--------------------------------------------------------------------------------
 1 | # Log in to Douban with Scrapy
 2 | ### Setup
 3 |
 4 | ```
 5 | scrapy startproject douban_login
 6 | cd douban_login
 7 | scrapy genspider douban "douban.com"
 8 | ```
 9 |
 10 |
 11 |
 12 | ### Configuration
 13 |
 14 | Configure the `settings.py` file.
 15 |
 16 | Write a `start.py` file that uses `cmdline` to launch the spider quickly.
 17 |
 18 |
 19 |
 20 | ### Development
 21 |
 22 | Scenario: use `scrapy` to log in to Douban, open the personal center page, and change the personal signature.
 23 |
 24 | Requests involved: initial request [GET], login request [POST], personal center request [GET], change-signature request [POST].
 25 |
 26 | Notes:
 27 |
 28 | 1. The initial request URL comes from `start_urls`.
 29 | 2. Download the captcha image with `urllib + PIL` and recognize it manually (a paid captcha-recognition API could be used instead).
 30 | 3. The `captcha-id` and `ck` request parameters are both embedded in elements of the page source.
 31 |
 32 |
 33 |
 34 | ### Run
 35 |
 36 | Run `start.py`.
 37 |
 38 |
--------------------------------------------------------------------------------
/scrapy/douban_login/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = douban_login.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = douban_login
12 |
--------------------------------------------------------------------------------
/scrapy/douban_login/start.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: start.py
12 | @time: 11/15/18 21:04
 13 | @description: convenience script to run the spider from a single Python file
14 | """
15 | from scrapy import cmdline
16 |
17 | cmdline.execute('scrapy crawl douban'.split())
18 |
--------------------------------------------------------------------------------
/scrapy/huize_spider/.idea/huize_spider.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/scrapy/huize_spider/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/scrapy/huize_spider/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/scrapy/huize_spider/.~ana.rtf:
--------------------------------------------------------------------------------
1 | xingag x i n g a g
--------------------------------------------------------------------------------
/scrapy/huize_spider/ana.rtf:
--------------------------------------------------------------------------------
1 | {\rtf1\ansi\ansicpg936\cocoartf1671\cocoasubrtf100
2 | {\fonttbl\f0\fswiss\fcharset0 Helvetica;\f1\fnil\fcharset134 PingFangSC-Regular;}
3 | {\colortbl;\red255\green255\blue255;}
4 | {\*\expandedcolortbl;;}
5 | \margl1440\margr1440\vieww10800\viewh13080\viewkind0
6 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
7 |
8 | \f0\fs24 \cf0 http://www.huize.com/\
9 | \
10 |
11 | \f1 \'bd\'a1\'bf\'b5\'b1\'a3\'cf\'d5\
12 | 1.\'d6\'d8\'bc\'b2
13 | \f0 \
14 | http://www.huize.com/product/ins-2059-0-0\
15 | 2.
16 | \f1 \'d7\'a1\'d4\'ba\'d2\'bd\'c1\'c6
17 | \f0 \
18 | http://www.huize.com/product/ins-2058-0-0\
19 | \
20 | \
21 |
22 | \f1 \'c8\'cb\'ca\'d9\'b1\'a3\'cf\'d5
23 | \f0 \
24 | 1.
25 | \f1 \'c8\'cb\'ca\'d9\'b1\'a3\'d5\'cf
26 | \f0 \
27 | http://www.huize.com/product/ins-2060-0-0\
28 | 2.
29 | \f1 \'c4\'ea\'bd\'f0\'b1\'a3\'cf\'d5
30 | \f0 \
31 | http://www.huize.com/product/ins-2101-0-0\
32 | \
33 | \
34 |
35 | \f1 \'b6\'f9\'cd\'af\'b1\'a3\'cf\'d5
36 | \f0 \
37 | 1.
38 | \f1 \'b6\'f9\'cd\'af\'d6\'d8\'bb\'f7
39 | \f0 \
40 | http://www.huize.com/product/ins-2043-0-0\
41 | 2.
42 | \f1 \'b6\'f9\'cd\'af\'d2\'bd\'c1\'c6
43 | \f0 \
44 | http://www.huize.com/product/ins-2044-0-0\
45 | 3.
46 | \f1 \'b6\'f9\'cd\'af\'d2\'e2\'cd\'e2\
47 | http://www.huize.com/product/ins-2042-0-0\
48 | 4.\'bd\'cc\'d3\'fd\'b4\'a2\'d0\'ee\
49 | http://www.huize.com/product/ins-2057-0-0\
50 | \
51 | \
52 | \'d2\'e2\'cd\'e2\'b1\'a3\'cf\'d5\
53 | 1.\'bd\'bb\'cd\'a8\'d2\'e2\'cd\'e2\
54 | http://www.huize.com/product/ins-2082-0-0\
55 | 2.\'d7\'db\'ba\'cf\'d2\'e2\'cd\'e2\
56 | http://www.huize.com/product/ins-2049-0-0\
57 | \
58 | }
--------------------------------------------------------------------------------
/scrapy/huize_spider/huize_spider/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/scrapy/huize_spider/huize_spider/__init__.py
--------------------------------------------------------------------------------
/scrapy/huize_spider/huize_spider/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class HuizeSpiderItem(scrapy.Item):
12 | title = scrapy.Field()
13 | sales = scrapy.Field()
14 | tips = scrapy.Field()
15 | price = scrapy.Field()
16 | url = scrapy.Field()
17 |
--------------------------------------------------------------------------------
/scrapy/huize_spider/huize_spider/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class HuizeSpiderSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class HuizeSpiderDownloaderMiddleware(object):
60 | # Not all methods need to be defined. If a method is not defined,
61 | # scrapy acts as if the downloader middleware does not modify the
62 | # passed objects.
63 |
64 | @classmethod
65 | def from_crawler(cls, crawler):
66 | # This method is used by Scrapy to create your spiders.
67 | s = cls()
68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 | return s
70 |
71 | def process_request(self, request, spider):
72 | # Called for each request that goes through the downloader
73 | # middleware.
74 |
75 | # Must either:
76 | # - return None: continue processing this request
77 | # - or return a Response object
78 | # - or return a Request object
79 | # - or raise IgnoreRequest: process_exception() methods of
80 | # installed downloader middleware will be called
81 | return None
82 |
83 | def process_response(self, request, response, spider):
84 | # Called with the response returned from the downloader.
85 |
86 | # Must either;
87 | # - return a Response object
88 | # - return a Request object
89 | # - or raise IgnoreRequest
90 | return response
91 |
92 | def process_exception(self, request, exception, spider):
93 | # Called when a download handler or a process_request()
94 | # (from other downloader middleware) raises an exception.
95 |
96 | # Must either:
97 | # - return None: continue processing this exception
98 | # - return a Response object: stops process_exception() chain
99 | # - return a Request object: stops process_exception() chain
100 | pass
101 |
102 | def spider_opened(self, spider):
103 | spider.logger.info('Spider opened: %s' % spider.name)
104 |
--------------------------------------------------------------------------------
/scrapy/huize_spider/huize_spider/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | from scrapy.exporters import JsonLinesItemExporter
9 |
10 |
11 | class HuizeSpiderPipeline(object):
12 |
13 | def __init__(self):
14 | self.fp = open('datas.json', 'wb')
15 |
16 | self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False)
17 |
18 | def process_item(self, item, spider):
19 | self.exporter.export_item(item)
20 | return item
21 |
22 | def close_spider(self, spider):
23 | # close the output file
24 | self.fp.close()
25 |
--------------------------------------------------------------------------------
/scrapy/huize_spider/huize_spider/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for huize_spider project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'huize_spider'
13 |
14 | SPIDER_MODULES = ['huize_spider.spiders']
15 | NEWSPIDER_MODULE = 'huize_spider.spiders'
16 |
17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
18 | # USER_AGENT = 'huize_spider (+http://www.yourdomain.com)'
19 |
20 | # Obey robots.txt rules
21 | ROBOTSTXT_OBEY = False
22 |
23 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
24 | # CONCURRENT_REQUESTS = 32
25 |
26 | # Configure a delay for requests for the same website (default: 0)
27 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
28 | # See also autothrottle settings and docs
29 | DOWNLOAD_DELAY = 1
30 | # The download delay setting will honor only one of:
31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16
32 | # CONCURRENT_REQUESTS_PER_IP = 16
33 |
34 | # Disable cookies (enabled by default)
35 | # COOKIES_ENABLED = False
36 |
37 | # Disable Telnet Console (enabled by default)
38 | # TELNETCONSOLE_ENABLED = False
39 |
40 | # Override the default request headers:
41 | DEFAULT_REQUEST_HEADERS = {
42 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
43 | 'Accept-Language': 'en',
44 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
45 | }
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | # SPIDER_MIDDLEWARES = {
50 | # 'huize_spider.middlewares.HuizeSpiderSpiderMiddleware': 543,
51 | # }
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | # DOWNLOADER_MIDDLEWARES = {
56 | # 'huize_spider.middlewares.HuizeSpiderDownloaderMiddleware': 543,
57 | # }
58 |
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | # EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | # }
64 |
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'huize_spider.pipelines.HuizeSpiderPipeline': 300,
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | # AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | # AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | # AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | # AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | # HTTPCACHE_ENABLED = True
87 | # HTTPCACHE_EXPIRATION_SECS = 0
88 | # HTTPCACHE_DIR = 'httpcache'
89 | # HTTPCACHE_IGNORE_HTTP_CODES = []
90 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/scrapy/huize_spider/huize_spider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/scrapy/huize_spider/huize_spider/spiders/huize.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from scrapy.linkextractors import LinkExtractor
4 | from scrapy.spiders import CrawlSpider, Rule
5 | from huize_spider.items import HuizeSpiderItem
6 | from .string_utils import remove_space_words
7 |
8 |
9 | # Crawl product data from the huize.com insurance site with CrawlSpider
10 |
11 | class HuizeSpider(CrawlSpider):
12 | name = 'huize'
13 | allowed_domains = ['huize.com']
14 | start_urls = ['http://huize.com/']
15 |
16 | rules = (
17 | Rule(LinkExtractor(allow=r'.*http://www.huize.com/product/ins-.*'), callback=None, follow=False),
18 | Rule(LinkExtractor(allow=r'.*http://www.huize.com/product/detail-.*'), callback='parse_detail', follow=False),
19 | )
20 |
21 | def parse_detail(self, response):
22 | # product title
23 | title = response.xpath('//h2[@class="product-title f30"]/text()').get().strip()
24 |
25 | # sales volume
26 | sales = response.xpath('//p[@class="count-item fc6"]/text()').get().strip()
27 |
28 | # product highlights
29 | # strip the special whitespace characters
30 | tips = remove_space_words("、".join(response.xpath('//li[@class="ensure-support-item"]/text()').getall()))
31 |
32 | # price
33 | price = response.xpath('//span[@class="product-price"]/i[@class="preminum-result"]/text()').get() + " 元"
34 |
35 | item = HuizeSpiderItem(title=title, sales=sales, tips=tips, price=price, url=response.url)
36 |
37 | yield item
38 |
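The field XPaths in parse_detail() are easiest to verify outside a full crawl with `scrapy shell`. A minimal check, assuming a placeholder detail-page URL (substitute any real `product/detail-...` page on huize.com):

```
scrapy shell "http://www.huize.com/product/detail-XXXX"
>>> response.xpath('//h2[@class="product-title f30"]/text()').get()
>>> response.xpath('//p[@class="count-item fc6"]/text()').get()
>>> response.xpath('//li[@class="ensure-support-item"]/text()').getall()
```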
--------------------------------------------------------------------------------
/scrapy/huize_spider/huize_spider/spiders/string_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: string_utils.py
12 | @time: 12/4/18 19:52
13 | @description:TODO
14 | """
15 |
16 |
17 | def remove_space_words(source):
18 | """
19 | 去掉字符串中的特殊空格,包含\n、\t、\xa0
20 | :param source:
21 | :return:
22 | """
23 | result = "".join(source.split())
24 | return result
25 |
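A quick usage sketch of `remove_space_words`; the input string is made up, but it shows that `str.split()` with no arguments also treats `\xa0` (non-breaking space) as whitespace:

```python
# assumes it is run from the project root so the package is importable
from huize_spider.spiders.string_utils import remove_space_words

raw = "重疾保障 \xa0住院津贴\n\t意外医疗 "   # hypothetical tips text scraped from a product page
print(remove_space_words(raw))               # -> 重疾保障住院津贴意外医疗
```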
--------------------------------------------------------------------------------
/scrapy/huize_spider/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = huize_spider.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = huize_spider
12 |
--------------------------------------------------------------------------------
/scrapy/huize_spider/start.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: start.py
12 | @time: 11/15/18 21:04
13 | @description: helper script to run the spider from a single Python file
14 | """
15 | from scrapy import cmdline
16 |
17 | cmdline.execute('scrapy crawl huize'.split())
18 |
19 |
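`cmdline.execute` hands control to Scrapy's CLI and does not return. If more control is wanted (for example running the crawl from another script), Scrapy's `CrawlerProcess` is an alternative; a sketch, assuming it is started from the project root so `scrapy.cfg` is found:

```python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # loads huize_spider/settings.py
process.crawl('huize')                            # spider name from spiders/huize.py
process.start()                                   # blocks until the crawl finishes
```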
--------------------------------------------------------------------------------
/scrapy/jianshu_spider/.idea/jianshu_spider.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/scrapy/jianshu_spider/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/scrapy/jianshu_spider/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/scrapy/jianshu_spider/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/scrapy/jianshu_spider/jianshu_spider/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/scrapy/jianshu_spider/jianshu_spider/__init__.py
--------------------------------------------------------------------------------
/scrapy/jianshu_spider/jianshu_spider/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | # item for the article detail page
12 | class ArticleItem(scrapy.Item):
13 | title = scrapy.Field()
14 | content = scrapy.Field()
15 | # article id
16 | article_id = scrapy.Field()
17 | # original url
18 | origin_url = scrapy.Field()
19 |
20 | # author
21 | author = scrapy.Field()
22 |
23 | # author avatar
24 | avatar = scrapy.Field()
25 |
26 | # publish time
27 | pubtime = scrapy.Field()
28 |
--------------------------------------------------------------------------------
/scrapy/jianshu_spider/jianshu_spider/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class JianshuSpiderSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class JianshuSpiderDownloaderMiddleware(object):
60 | # Not all methods need to be defined. If a method is not defined,
61 | # scrapy acts as if the downloader middleware does not modify the
62 | # passed objects.
63 |
64 | @classmethod
65 | def from_crawler(cls, crawler):
66 | # This method is used by Scrapy to create your spiders.
67 | s = cls()
68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 | return s
70 |
71 | def process_request(self, request, spider):
72 | # Called for each request that goes through the downloader
73 | # middleware.
74 |
75 | # Must either:
76 | # - return None: continue processing this request
77 | # - or return a Response object
78 | # - or return a Request object
79 | # - or raise IgnoreRequest: process_exception() methods of
80 | # installed downloader middleware will be called
81 | return None
82 |
83 | def process_response(self, request, response, spider):
84 | # Called with the response returned from the downloader.
85 |
86 | # Must either;
87 | # - return a Response object
88 | # - return a Request object
89 | # - or raise IgnoreRequest
90 | return response
91 |
92 | def process_exception(self, request, exception, spider):
93 | # Called when a download handler or a process_request()
94 | # (from other downloader middleware) raises an exception.
95 |
96 | # Must either:
97 | # - return None: continue processing this exception
98 | # - return a Response object: stops process_exception() chain
99 | # - return a Request object: stops process_exception() chain
100 | pass
101 |
102 | def spider_opened(self, spider):
103 | spider.logger.info('Spider opened: %s' % spider.name)
104 |
--------------------------------------------------------------------------------
/scrapy/jianshu_spider/jianshu_spider/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | # After scraping, save the data to a MySQL database
9 |
10 | import pymysql
11 |
12 |
13 | class JianshuSpiderPipeline(object):
14 |
15 | def __init__(self):
16 | db_params = {
17 | 'host': '127.0.0.1',
18 | 'port': 3306,
19 | 'user': 'root',
20 | 'password': 'root',
21 | 'database': 'jianshu',
22 | 'charset': 'utf8'
23 | }
24 |
25 | # database connection object
26 | self.conn = pymysql.connect(**db_params)
27 |
28 | # database cursor object, used to run statements
29 | self.cursor = self.conn.cursor()
30 |
31 | # parameterized insert statement
32 | self._sql = """
33 | insert into article(id,title,content,author,avatar,pubtime,article_id,origin_url)
34 | values(null,%s,%s,%s,%s,%s,%s,%s)
35 | """
36 |
37 | def process_item(self, item, spider):
38 | # run the insert statement
39 | self.cursor.execute(self._sql, (
40 | item['title'], item['content'], item['author'], item['avatar'], item['pubtime'], item['article_id'],
41 | item['origin_url']))
42 |
43 | # commit the insert
44 | self.conn.commit()
45 | return item
46 |
47 | def close_spider(self, spider):
48 | # close the cursor and the connection
49 | self.cursor.close()
50 | self.conn.close()
51 |
52 |
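The insert above assumes every statement succeeds. A hedged sketch of the same `process_item` with basic error handling, so one bad record does not leave the connection stuck in a failed transaction (the log message is illustrative):

```python
def process_item(self, item, spider):
    try:
        self.cursor.execute(self._sql, (
            item['title'], item['content'], item['author'], item['avatar'],
            item['pubtime'], item['article_id'], item['origin_url']))
        self.conn.commit()
    except pymysql.MySQLError:
        # undo the failed statement and keep the pipeline alive
        self.conn.rollback()
        spider.logger.exception('failed to insert article %s', item.get('article_id'))
    return item
```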
--------------------------------------------------------------------------------
/scrapy/jianshu_spider/jianshu_spider/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for jianshu_spider project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'jianshu_spider'
13 |
14 | SPIDER_MODULES = ['jianshu_spider.spiders']
15 | NEWSPIDER_MODULE = 'jianshu_spider.spiders'
16 |
17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
18 | # USER_AGENT = 'jianshu_spider (+http://www.yourdomain.com)'
19 |
20 | # Obey robots.txt rules
21 | ROBOTSTXT_OBEY = False
22 |
23 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
24 | # CONCURRENT_REQUESTS = 32
25 |
26 | # Configure a delay for requests for the same website (default: 0)
27 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
28 | # See also autothrottle settings and docs
29 | DOWNLOAD_DELAY = 3
30 | # The download delay setting will honor only one of:
31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16
32 | # CONCURRENT_REQUESTS_PER_IP = 16
33 |
34 | # Disable cookies (enabled by default)
35 | # COOKIES_ENABLED = False
36 |
37 | # Disable Telnet Console (enabled by default)
38 | # TELNETCONSOLE_ENABLED = False
39 |
40 | # Override the default request headers:
41 | DEFAULT_REQUEST_HEADERS = {
42 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
43 | 'Accept-Language': 'en',
44 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
45 | }
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | # SPIDER_MIDDLEWARES = {
50 | # 'jianshu_spider.middlewares.JianshuSpiderSpiderMiddleware': 543,
51 | # }
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | # DOWNLOADER_MIDDLEWARES = {
56 | # 'jianshu_spider.middlewares.JianshuSpiderDownloaderMiddleware': 543,
57 | # }
58 |
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | # EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | # }
64 |
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'jianshu_spider.pipelines.JianshuSpiderPipeline': 300,
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | # AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | # AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | # AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | # AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | # HTTPCACHE_ENABLED = True
87 | # HTTPCACHE_EXPIRATION_SECS = 0
88 | # HTTPCACHE_DIR = 'httpcache'
89 | # HTTPCACHE_IGNORE_HTTP_CODES = []
90 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
92 |
93 | # Set the log level (and an optional log file) here in settings.py
94 | # LOG_LEVEL = 'DEBUG'
95 | # LOG_FILE = 'log.txt'
96 |
--------------------------------------------------------------------------------
/scrapy/jianshu_spider/jianshu_spider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/scrapy/jianshu_spider/jianshu_spider/spiders/jianshu.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from scrapy.linkextractors import LinkExtractor
4 | from scrapy.spiders import CrawlSpider, Rule
5 | from jianshu_spider.items import ArticleItem
6 |
7 |
8 | class JianshuSpider(CrawlSpider):
9 | name = 'jianshu'
10 | allowed_domains = ['jianshu.com']
11 | start_urls = ['https://www.jianshu.com/']
12 |
13 | HTTPS = "https:"
14 |
15 | rules = (
16 | # the article id consists of 12 lowercase letters or digits
17 | Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True),
18 | )
19 |
20 | # quick test: scrapy shell https://www.jianshu.com/p/8d5ab6d5f258
21 | def parse_detail(self, response):
22 | title = response.xpath('//h1[@class="title"]/text()').get()
23 |
24 | author = response.xpath('//div[@class="info"]/span/a/text()').get()
25 |
26 | avatar = self.HTTPS + response.xpath('//div[@class="author"]/a/img/@src').get()
27 |
28 | pub_time = response.xpath('//span[@class="publish-time"]/text()').get().replace("*", "")
29 |
30 | current_url = response.url
31 | real_url = current_url.split(r"?")[0]
32 |
33 | article_id = real_url.split(r'/')[-1]
34 |
35 | # keep the HTML content with its tags, so the original formatting can be reused later
36 | content = response.xpath('//div[@class="show-content"]').get()
37 |
38 | item = ArticleItem(
39 | title=title,
40 | avatar=avatar,
41 | pubtime=pub_time,
42 | origin_url=current_url,
43 | author=author,
44 | article_id=article_id,
45 | content=content
46 | )
47 |
48 | yield item
49 |
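The Rule above hinges on the regex for the 12-character article id. It can be sanity-checked directly with `re`; the second URL is a made-up non-article link:

```python
import re

pattern = re.compile(r'.*/p/[0-9a-z]{12}.*')

print(bool(pattern.match('https://www.jianshu.com/p/8d5ab6d5f258')))  # True  - article page
print(bool(pattern.match('https://www.jianshu.com/u/abcdef123456')))  # False - not a /p/ link
```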
--------------------------------------------------------------------------------
/scrapy/jianshu_spider/raw/article.sql:
--------------------------------------------------------------------------------
1 | /*
2 | Navicat MySQL Data Transfer
3 |
4 | Source Server : cal
5 | Source Server Type : MySQL
6 | Source Server Version : 50724
7 | Source Host : localhost
8 | Source Database : jianshu
9 |
10 | Target Server Type : MySQL
11 | Target Server Version : 50724
12 | File Encoding : utf-8
13 |
14 | Date: 12/04/2018 23:08:42 PM
15 | */
16 |
17 | SET NAMES utf8;
18 | SET FOREIGN_KEY_CHECKS = 0;
19 |
20 | -- ----------------------------
21 | -- Table structure for `article`
22 | -- ----------------------------
23 | DROP TABLE IF EXISTS `article`;
24 | CREATE TABLE `article` (
25 | `id` int(11) NOT NULL AUTO_INCREMENT,
26 | `title` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL,
27 | `content` longtext CHARACTER SET utf8 COLLATE utf8_bin,
28 | `author` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL,
29 | `avatar` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL,
30 | `pubtime` datetime DEFAULT NULL,
31 | `article_id` varchar(20) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL,
32 | `origin_url` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL,
33 | PRIMARY KEY (`id`)
34 | ) ENGINE=InnoDB AUTO_INCREMENT=725 DEFAULT CHARSET=utf8;
35 |
36 | SET FOREIGN_KEY_CHECKS = 1;
37 |
--------------------------------------------------------------------------------
/scrapy/jianshu_spider/raw/article_table.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/scrapy/jianshu_spider/raw/article_table.png
--------------------------------------------------------------------------------
/scrapy/jianshu_spider/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = jianshu_spider.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = jianshu_spider
12 |
--------------------------------------------------------------------------------
/scrapy/qczj/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/scrapy/qczj/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/scrapy/qczj/.idea/qczj.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/scrapy/qczj/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/scrapy/qczj/qczj/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/scrapy/qczj/qczj/__init__.py
--------------------------------------------------------------------------------
/scrapy/qczj/qczj/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 | # To work with the Images Pipeline, the item must define the image_urls and images fields
11 | class QczjItem(scrapy.Item):
12 | category = scrapy.Field()
13 | image_urls = scrapy.Field()
14 | images = scrapy.Field()
15 |
--------------------------------------------------------------------------------
/scrapy/qczj/qczj/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class QczjSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class QczjDownloaderMiddleware(object):
60 | # Not all methods need to be defined. If a method is not defined,
61 | # scrapy acts as if the downloader middleware does not modify the
62 | # passed objects.
63 |
64 | @classmethod
65 | def from_crawler(cls, crawler):
66 | # This method is used by Scrapy to create your spiders.
67 | s = cls()
68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 | return s
70 |
71 | def process_request(self, request, spider):
72 | # Called for each request that goes through the downloader
73 | # middleware.
74 |
75 | # Must either:
76 | # - return None: continue processing this request
77 | # - or return a Response object
78 | # - or return a Request object
79 | # - or raise IgnoreRequest: process_exception() methods of
80 | # installed downloader middleware will be called
81 | return None
82 |
83 | def process_response(self, request, response, spider):
84 | # Called with the response returned from the downloader.
85 |
86 | # Must either;
87 | # - return a Response object
88 | # - return a Request object
89 | # - or raise IgnoreRequest
90 | return response
91 |
92 | def process_exception(self, request, exception, spider):
93 | # Called when a download handler or a process_request()
94 | # (from other downloader middleware) raises an exception.
95 |
96 | # Must either:
97 | # - return None: continue processing this exception
98 | # - return a Response object: stops process_exception() chain
99 | # - return a Request object: stops process_exception() chain
100 | pass
101 |
102 | def spider_opened(self, spider):
103 | spider.logger.info('Spider opened: %s' % spider.name)
104 |
--------------------------------------------------------------------------------
/scrapy/qczj/qczj/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | # store the downloaded images
9 | import os
10 | from urllib import request
11 | from scrapy.pipelines.images import ImagesPipeline
12 | from qczj import settings
13 |
14 |
15 | # The stock ImagesPipeline cannot choose per-category sub-directories or file names, so it is customized here
16 | class CustomImagesPipline(ImagesPipeline):
17 |
18 | # called before the image download requests are sent
19 | def get_media_requests(self, item, info):
20 | request_objs = super(CustomImagesPipline, self).get_media_requests(item, info)
21 |
22 | for request_obj in request_objs:
23 | request_obj.item = item
24 |
25 | # note: the list of request objects must be returned
26 | return request_objs
27 |
28 | # called just before an image is stored
29 | def file_path(self, request, response=None, info=None):
30 | path = super(CustomImagesPipline, self).file_path(request, response, info)
31 |
32 | # category of the pictures, taken from the item attached in get_media_requests()
33 | category = request.item.get('category')
34 |
35 | # directory the images of this category should end up in
36 | category_path = os.path.join(settings.IMAGES_STORE, category)
37 |
38 | if not os.path.exists(category_path):
39 | os.mkdir(category_path)
40 |
41 | # the parent class names images like full/<name>.jpg; keep only the file name
42 | image_name = path.replace("full/", "")
43 |
44 | # full save path; note it must be relative to settings.IMAGES_STORE (see the path returned by the parent class)
45 | image_full_path = os.path.join(category, image_name)
46 |
47 | return image_full_path
48 |
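A small sketch of what `file_path()` does to the path coming from the parent class; the hash-like file name and the category are made-up values:

```python
import os

path = 'full/0a1b2c3d4e5f.jpg'   # hypothetical name produced by ImagesPipeline.file_path()
category = '车身外观'             # hypothetical item['category']

image_name = path.replace('full/', '')              # drop the default full/ prefix
relative_path = os.path.join(category, image_name)
print(relative_path)                                # 车身外观/0a1b2c3d4e5f.jpg, relative to IMAGES_STORE
```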
--------------------------------------------------------------------------------
/scrapy/qczj/qczj/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for qczj project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | import os
13 |
14 | BOT_NAME = 'qczj'
15 |
16 | SPIDER_MODULES = ['qczj.spiders']
17 | NEWSPIDER_MODULE = 'qczj.spiders'
18 |
19 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
20 | # USER_AGENT = 'qczj (+http://www.yourdomain.com)'
21 |
22 | # Obey robots.txt rules
23 | ROBOTSTXT_OBEY = False
24 |
25 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
26 | # CONCURRENT_REQUESTS = 32
27 |
28 | # Configure a delay for requests for the same website (default: 0)
29 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
30 | # See also autothrottle settings and docs
31 | DOWNLOAD_DELAY = 1
32 | # The download delay setting will honor only one of:
33 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16
34 | # CONCURRENT_REQUESTS_PER_IP = 16
35 |
36 | # Disable cookies (enabled by default)
37 | # COOKIES_ENABLED = False
38 |
39 | # Disable Telnet Console (enabled by default)
40 | # TELNETCONSOLE_ENABLED = False
41 |
42 | # Override the default request headers:
43 | DEFAULT_REQUEST_HEADERS = {
44 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
45 | 'Accept-Language': 'en',
46 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
47 | }
48 |
49 | # Enable or disable spider middlewares
50 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
51 | # SPIDER_MIDDLEWARES = {
52 | # 'qczj.middlewares.QczjSpiderMiddleware': 543,
53 | # }
54 |
55 | # Enable or disable downloader middlewares
56 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
57 | # DOWNLOADER_MIDDLEWARES = {
58 | # 'qczj.middlewares.QczjDownloaderMiddleware': 543,
59 | # }
60 |
61 | # Enable or disable extensions
62 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
63 | # EXTENSIONS = {
64 | # 'scrapy.extensions.telnet.TelnetConsole': None,
65 | # }
66 |
67 | # Configure item pipelines
68 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
69 | ITEM_PIPELINES = {
70 | 'qczj.pipelines.CustomImagesPipline': 1
71 | }
72 |
73 | # Enable and configure the AutoThrottle extension (disabled by default)
74 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
75 | # AUTOTHROTTLE_ENABLED = True
76 | # The initial download delay
77 | # AUTOTHROTTLE_START_DELAY = 5
78 | # The maximum download delay to be set in case of high latencies
79 | # AUTOTHROTTLE_MAX_DELAY = 60
80 | # The average number of requests Scrapy should be sending in parallel to
81 | # each remote server
82 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
83 | # Enable showing throttling stats for every response received:
84 | # AUTOTHROTTLE_DEBUG = False
85 |
86 | # Enable and configure HTTP caching (disabled by default)
87 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
88 | # HTTPCACHE_ENABLED = True
89 | # HTTPCACHE_EXPIRATION_SECS = 0
90 | # HTTPCACHE_DIR = 'httpcache'
91 | # HTTPCACHE_IGNORE_HTTP_CODES = []
92 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
93 |
94 |
95 | # image download directory, used by the Images Pipeline
96 | IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')
97 |
--------------------------------------------------------------------------------
/scrapy/qczj/qczj/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/scrapy/qczj/qczj/spiders/bmw5.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from scrapy.spiders import CrawlSpider, Rule
4 | from scrapy.linkextractors import LinkExtractor
5 |
6 | from qczj.items import QczjItem
7 |
8 |
9 | # Crawl the Autohome BMW 5 Series picture pages and download the original images
10 |
11 | class Bmw5Spider(CrawlSpider):
12 | name = 'bmw5'
13 | allowed_domains = ['car.autohome.com.cn']
14 |
15 | # picture index of the BMW 5 Series (imported)
16 | start_urls = ['https://car.autohome.com.cn/pic/series/202.html']
17 |
18 | rules = (
19 | # follow=True: keep following to the second page, third page, and so on
20 | Rule(LinkExtractor(allow=r'https://car.autohome.com.cn/pic/series/202-.+'), callback="parse_page", follow=True),
21 | )
22 |
23 | def parse_page(self, response):
24 | """
25 | Parse a url matched by the rules (a "more pictures" page), e.g. https://car.autohome.com.cn/pic/series/202-1-p1.html
26 | :param response:
27 | :return:
28 | """
29 | # 1. get the category (can be tested in isolation with scrapy shell, no need to run the whole project)
30 | category = response.xpath('//div[@class="uibox"]/div[1]/text()').get()
31 |
32 | # 2. image thumbnails
33 | # note: xpath contains() syntax (again easy to verify with scrapy shell)
34 | srcs = response.xpath('//div[contains(@class,"uibox-con")]//li//img/@src').getall()
35 |
36 | # 3.1 complete the protocol-relative thumbnail urls
37 | # 3.2 turn the thumbnail urls into the high-resolution image urls
38 | srcs = list(map(lambda x: response.urljoin(x).replace("t_", ""), srcs))
39 |
40 | item = QczjItem(category=category, image_urls=srcs)
41 |
42 | print("finished page: %s, category: %s" % (response.url, category))
43 |
44 | yield item
45 |
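The one-liner that rewrites the thumbnail src values is worth unpacking: `response.urljoin(x)` behaves like `urljoin(response.url, x)`, and stripping `t_` from the path yields the full-size image. A sketch on a made-up src value:

```python
from urllib.parse import urljoin

page_url = 'https://car.autohome.com.cn/pic/series/202-1-p1.html'
src = '//car2.autoimg.cn/cardfs/product/g26/M07/t_sample123.jpg'   # hypothetical protocol-relative thumbnail

full = urljoin(page_url, src).replace('t_', '')
print(full)   # https://car2.autoimg.cn/cardfs/product/g26/M07/sample123.jpg
```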
--------------------------------------------------------------------------------
/scrapy/qczj/readme.MD:
--------------------------------------------------------------------------------
1 | # Scraping BMW 5 Series pictures from Autohome (汽车之家)
2 | ### Create a spider
3 | ```
4 | scrapy genspider bmw5 "car.autohome.com.cn"
5 | ```
6 |
7 |
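To run the spider, either execute `start.py` (which wraps `cmdline.execute('scrapy crawl bmw5'.split())`) or call Scrapy directly from the project root:
```
scrapy crawl bmw5
```
Downloaded images end up under the `images/` directory configured by `IMAGES_STORE` in `settings.py`, grouped by category.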
--------------------------------------------------------------------------------
/scrapy/qczj/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = qczj.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = qczj
12 |
--------------------------------------------------------------------------------
/scrapy/qczj/start.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: start.py
12 | @time: 11/15/18 21:04
13 | @description: helper script to run the spider from a single Python file
14 | """
15 | from scrapy import cmdline
16 |
17 | cmdline.execute('scrapy crawl bmw5'.split())
18 |
--------------------------------------------------------------------------------
/scrapy/qsbk/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/scrapy/qsbk/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/scrapy/qsbk/.idea/qsbk.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/scrapy/qsbk/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/scrapy/qsbk/qsbk/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/scrapy/qsbk/qsbk/__init__.py
--------------------------------------------------------------------------------
/scrapy/qsbk/qsbk/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | # purpose: define the data model
9 |
10 | import scrapy
11 |
12 |
13 | class QsbkItem(scrapy.Item):
14 | # define the fields for your item here like:
15 | # name = scrapy.Field()
16 | """
17 | 定义数据模型
18 | """
19 | # 段子作者
20 | author = scrapy.Field()
21 |
22 | # 段子内容
23 | content = scrapy.Field()
24 |
--------------------------------------------------------------------------------
/scrapy/qsbk/qsbk/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | # purpose: define the middlewares, i.e. the downloader middleware and the spider middleware
9 |
10 | from scrapy import signals
11 |
12 |
13 | class QsbkSpiderMiddleware(object):
14 | # Not all methods need to be defined. If a method is not defined,
15 | # scrapy acts as if the spider middleware does not modify the
16 | # passed objects.
17 |
18 | @classmethod
19 | def from_crawler(cls, crawler):
20 | # This method is used by Scrapy to create your spiders.
21 | s = cls()
22 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
23 | return s
24 |
25 | def process_spider_input(self, response, spider):
26 | # Called for each response that goes through the spider
27 | # middleware and into the spider.
28 |
29 | # Should return None or raise an exception.
30 | return None
31 |
32 | def process_spider_output(self, response, result, spider):
33 | # Called with the results returned from the Spider, after
34 | # it has processed the response.
35 |
36 | # Must return an iterable of Request, dict or Item objects.
37 | for i in result:
38 | yield i
39 |
40 | def process_spider_exception(self, response, exception, spider):
41 | # Called when a spider or process_spider_input() method
42 | # (from other spider middleware) raises an exception.
43 |
44 | # Should return either None or an iterable of Response, dict
45 | # or Item objects.
46 | pass
47 |
48 | def process_start_requests(self, start_requests, spider):
49 | # Called with the start requests of the spider, and works
50 | # similarly to the process_spider_output() method, except
51 | # that it doesn’t have a response associated.
52 |
53 | # Must return only requests (not items).
54 | for r in start_requests:
55 | yield r
56 |
57 | def spider_opened(self, spider):
58 | spider.logger.info('Spider opened: %s' % spider.name)
59 |
60 |
61 | class QsbkDownloaderMiddleware(object):
62 | # Not all methods need to be defined. If a method is not defined,
63 | # scrapy acts as if the downloader middleware does not modify the
64 | # passed objects.
65 |
66 | @classmethod
67 | def from_crawler(cls, crawler):
68 | # This method is used by Scrapy to create your spiders.
69 | s = cls()
70 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
71 | return s
72 |
73 | def process_request(self, request, spider):
74 | # Called for each request that goes through the downloader
75 | # middleware.
76 |
77 | # Must either:
78 | # - return None: continue processing this request
79 | # - or return a Response object
80 | # - or return a Request object
81 | # - or raise IgnoreRequest: process_exception() methods of
82 | # installed downloader middleware will be called
83 | return None
84 |
85 | def process_response(self, request, response, spider):
86 | # Called with the response returned from the downloader.
87 |
88 | # Must either;
89 | # - return a Response object
90 | # - return a Request object
91 | # - or raise IgnoreRequest
92 | return response
93 |
94 | def process_exception(self, request, exception, spider):
95 | # Called when a download handler or a process_request()
96 | # (from other downloader middleware) raises an exception.
97 |
98 | # Must either:
99 | # - return None: continue processing this exception
100 | # - return a Response object: stops process_exception() chain
101 | # - return a Request object: stops process_exception() chain
102 | pass
103 |
104 | def spider_opened(self, spider):
105 | spider.logger.info('Spider opened: %s' % spider.name)
106 |
--------------------------------------------------------------------------------
/scrapy/qsbk/qsbk/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | # purpose: save the data as JSON (XML/CSV work the same way, see the scrapy.exporters classes)
9 |
10 | import json
11 | from .items import QsbkItem
12 |
13 | from scrapy.exporters import JsonLinesItemExporter
14 |
15 | class QsbkPipeline(object):
16 |
17 | def __init__(self):
18 | # JsonLinesItemExporter needs a file opened in binary mode
19 | # note: a file opened in binary mode needs no encoding argument; a file opened in text mode would need one
20 | self.fp = open('duanzi.json', 'wb')
21 |
22 | # create the exporter
23 | self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
24 |
25 | def open_spider(self, spider):
26 | print('spider started...')
27 |
28 | def process_item(self, item, spider):
29 | self.exporter.export_item(item)
30 | return item
31 |
32 | def close_spider(self, spider):
33 | self.fp.close()
34 | print('spider finished.')
35 |
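A standalone sketch of the exporter wiring used above, with made-up item values, to show why the file is opened with `'wb'` (the exporter encodes each item itself and writes bytes):

```python
from scrapy.exporters import JsonLinesItemExporter

with open('demo.json', 'wb') as fp:   # binary mode: the exporter writes encoded bytes
    exporter = JsonLinesItemExporter(fp, ensure_ascii=False, encoding='utf-8')
    exporter.start_exporting()
    exporter.export_item({'author': '某网友', 'content': '一条段子……'})   # hypothetical record
    exporter.finish_exporting()

# demo.json now holds one JSON object per line, e.g. {"author": "某网友", "content": "一条段子……"}
```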
--------------------------------------------------------------------------------
/scrapy/qsbk/qsbk/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for qsbk project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | # purpose: spider configuration file
13 | # e.g. request headers, whether cookies are enabled, download delay
14 |
15 | BOT_NAME = 'qsbk'
16 |
17 | SPIDER_MODULES = ['qsbk.spiders']
18 | NEWSPIDER_MODULE = 'qsbk.spiders'
19 |
20 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
21 | # USER_AGENT = 'qsbk (+http://www.yourdomain.com)'
22 |
23 | # Obey robots.txt rules
24 | ROBOTSTXT_OBEY = False
25 |
26 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
27 | # CONCURRENT_REQUESTS = 32
28 |
29 | # Configure a delay for requests for the same website (default: 0)
30 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
31 | # See also autothrottle settings and docs
32 |
33 | # download delay
34 | # wait 1 second between requests
35 | DOWNLOAD_DELAY = 1
36 | # The download delay setting will honor only one of:
37 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16
38 | # CONCURRENT_REQUESTS_PER_IP = 16
39 |
40 | # Disable cookies (enabled by default)
41 | # COOKIES_ENABLED = False
42 |
43 | # Disable Telnet Console (enabled by default)
44 | # TELNETCONSOLE_ENABLED = False
45 |
46 | # Override the default request headers:
47 | DEFAULT_REQUEST_HEADERS = {
48 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
49 | 'Accept-Language': 'en',
50 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
51 | }
52 |
53 | # Enable or disable spider middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
55 | # SPIDER_MIDDLEWARES = {
56 | # 'qsbk.middlewares.QsbkSpiderMiddleware': 543,
57 | # }
58 |
59 | # Enable or disable downloader middlewares
60 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
61 | # DOWNLOADER_MIDDLEWARES = {
62 | # 'qsbk.middlewares.QsbkDownloaderMiddleware': 543,
63 | # }
64 |
65 | # Enable or disable extensions
66 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
67 | # EXTENSIONS = {
68 | # 'scrapy.extensions.telnet.TelnetConsole': None,
69 | # }
70 |
71 | # Configure item pipelines
72 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
73 | # 'qsbk.pipelines.QsbkPipeline' is the key; 300 is the priority. The smaller the value, the higher the priority.
74 | ITEM_PIPELINES = {
75 | 'qsbk.pipelines.QsbkPipeline': 300,
76 | }
77 |
78 | # Enable and configure the AutoThrottle extension (disabled by default)
79 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
80 | # AUTOTHROTTLE_ENABLED = True
81 | # The initial download delay
82 | # AUTOTHROTTLE_START_DELAY = 5
83 | # The maximum download delay to be set in case of high latencies
84 | # AUTOTHROTTLE_MAX_DELAY = 60
85 | # The average number of requests Scrapy should be sending in parallel to
86 | # each remote server
87 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
88 | # Enable showing throttling stats for every response received:
89 | # AUTOTHROTTLE_DEBUG = False
90 |
91 | # Enable and configure HTTP caching (disabled by default)
92 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
93 | # HTTPCACHE_ENABLED = True
94 | # HTTPCACHE_EXPIRATION_SECS = 0
95 | # HTTPCACHE_DIR = 'httpcache'
96 | # HTTPCACHE_IGNORE_HTTP_CODES = []
97 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
98 |
--------------------------------------------------------------------------------
/scrapy/qsbk/qsbk/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/scrapy/qsbk/qsbk/spiders/spider_qsbk.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from qsbk.items import QsbkItem
4 | from scrapy.http.response.html import HtmlResponse
5 | from scrapy.selector.unified import SelectorList, Selector
6 |
7 |
8 | # Crawl qiushibaike.com (a Chinese joke site) with scrapy
9 |
10 | class SpiderQsbkSpider(scrapy.Spider):
11 | name = 'spider_qsbk'
12 | allowed_domains = ['qiushibaike.com']
13 | start_urls = ['https://www.qiushibaike.com/text/page/1/']
14 | base_domain = "https://www.qiushibaike.com"
15 |
16 | def parse(self, response):
17 | """
18 | Parse the data returned by the downloader
19 | :param response: HtmlResponse
20 | :return:
21 | """
22 |
23 | # 1. grab all joke divs with XPath
24 | duan_zi_divs = response.xpath('//div[@id="content-left"]/div')
25 |
26 | # items = []
27 |
28 | # 2. parse each joke
29 | for duan_zi_div in duan_zi_divs:
30 | # 2.1 author
31 | author = duan_zi_div.xpath(".//h2/text()").get().strip()
32 |
33 | # 2.2 joke text
34 | content_pre = duan_zi_div.xpath(".//div[@class='content']//text()").getall() # list of text nodes
35 | content = "".join(content_pre).strip()
36 |
37 | # 2.3 assemble the data model
38 | item = QsbkItem(author=author, content=content)
39 |
40 | # 2.4 hand it to the pipelines as a generator
41 | yield item
42 |
43 | # look for the link to the next page
44 | next_url = None
45 | try:
46 | next_url = self.base_domain + response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()
47 | except Exception:
48 | pass
49 |
50 | # if there is no next page (i.e. this is the last page), just return
51 | if not next_url:
52 | return
53 | else:
54 | # crawl the next page
55 | yield scrapy.Request(next_url, callback=self.parse)
56 |
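A hedged alternative for the pagination tail of `parse()`: `response.urljoin()` resolves the relative href against the current page, so the manual `base_domain` concatenation and the broad `try/except` are not needed. Shown as a fragment of the method body, not a drop-in file:

```python
# inside parse(), after yielding the items of the current page
next_href = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()
if next_href:
    # urljoin handles both relative and absolute hrefs
    yield scrapy.Request(response.urljoin(next_href), callback=self.parse)
```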
--------------------------------------------------------------------------------
/scrapy/qsbk/readme.MD:
--------------------------------------------------------------------------------
1 | # Scraping qiushibaike.com with `Scrapy`
2 |
3 | 1. Adjust the `settings.py` configuration file
4 |
5 | ```
6 | # 1. set ROBOTSTXT_OBEY to False
7 | ROBOTSTXT_OBEY = False
8 |
9 | # 2. enable the default request headers
10 | DEFAULT_REQUEST_HEADERS = {
11 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
12 | 'Accept-Language': 'en',
13 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
14 | }
15 |
16 | # 3. enable the item pipeline so the data can be saved
17 | # 'qsbk.pipelines.QsbkPipeline' is the key; 300 is the priority; the smaller the value, the higher the priority
18 | ITEM_PIPELINES = {
19 | 'qsbk.pipelines.QsbkPipeline': 300,
20 | }
21 | ```
22 |
23 | 2. Write the spider code - `spiders/spider_xx.py`
24 |
25 | Parse the data returned by the downloader with `xpath`, then pass the items to the pipeline through a generator
26 |
27 | 3. Write the data model
28 |
29 | Define the item model so the fields are easy to manage
30 |
31 | 4. Write the `Pipeline`
32 |
33 | Write the code that saves the data
34 |
35 | Note: the pipeline has to be activated in `settings.py`
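If a `start.py` helper like the one in the other projects of this repository is wanted here as well, a sketch mirroring them would be:

```python
from scrapy import cmdline

cmdline.execute('scrapy crawl spider_qsbk'.split())
```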
--------------------------------------------------------------------------------
/scrapy/qsbk/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = qsbk.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = qsbk
12 |
--------------------------------------------------------------------------------
/scrapy/sfw_spider/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/scrapy/sfw_spider/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/scrapy/sfw_spider/.idea/sfw.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/scrapy/sfw_spider/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/scrapy/sfw_spider/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp==3.4.4
2 | asn1crypto==0.24.0
3 | astroid==2.0.4
4 | async-timeout==3.0.1
5 | attrs==18.2.0
6 | Automat==0.7.0
7 | certifi==2018.10.15
8 | cffi==1.11.5
9 | chardet==3.0.4
10 | constantly==15.1.0
11 | cryptography==2.4.1
12 | cssselect==1.0.3
13 | dateparser==0.7.0
14 | demjson==2.2.4
15 | douyin==0.3.6
16 | hyperlink==18.0.0
17 | idna==2.7
18 | incremental==17.5.0
19 | isort==4.3.4
20 | lazy-object-proxy==1.3.1
21 | lxml==4.2.5
22 | mccabe==0.6.1
23 | motor==2.0.0
24 | multidict==4.5.2
25 | parsel==1.5.1
26 | Pillow==5.3.0
27 | pyasn1==0.4.4
28 | pyasn1-modules==0.2.2
29 | pycparser==2.19
30 | PyDispatcher==2.0.5
31 | PyHamcrest==1.9.0
32 | pylint==2.1.1
33 | pymongo==3.7.2
34 | PyMySQL==0.9.2
35 | pyOpenSSL==18.0.0
36 | python-dateutil==2.7.5
37 | pytz==2018.7
38 | queuelib==1.5.0
39 | regex==2018.11.22
40 | requests==2.19.1
41 | retrying==1.3.3
42 | Scrapy==1.5.1
43 | selenium==3.14.1
44 | service-identity==17.0.0
45 | six==1.11.0
46 | tqdm==4.28.1
47 | Twisted==18.9.0
48 | tzlocal==1.5.1
49 | urllib3==1.23
50 | w3lib==1.19.0
51 | wrapt==1.10.11
52 | yarl==1.2.6
53 | zope.interface==4.6.0
54 |
--------------------------------------------------------------------------------
/scrapy/sfw_spider/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = sfw.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = sfw
12 |
--------------------------------------------------------------------------------
/scrapy/sfw_spider/sfw/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/scrapy/sfw_spider/sfw/__init__.py
--------------------------------------------------------------------------------
/scrapy/sfw_spider/sfw/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class NewHouseItem(scrapy.Item):
12 | """
13 |     新房的数据模型【10个属性:省份、城市、小区名称、价格、几居室、面积、地址、区域、是否在售、详情页面url】
14 | """
15 | # 省份
16 | province = scrapy.Field()
17 | # 城市
18 | city = scrapy.Field()
19 | # 小区名称
20 | name = scrapy.Field()
21 | # 价格
22 | price = scrapy.Field()
23 | # 几居室【列表】【新房可能有多个房型】
24 | rooms = scrapy.Field()
25 | # 面积
26 | area = scrapy.Field()
27 | # 地址
28 | address = scrapy.Field()
29 | # 行政区
30 | district = scrapy.Field()
31 | # 是否在售
32 | sale = scrapy.Field()
33 | # 详情页面url
34 | origin_url = scrapy.Field()
35 |
36 |
37 | class ESFHouseItem(scrapy.Item):
38 | """
39 | 二手房数据模型【12个属性:省份、城市、小区名称、几室几厅、楼层、朝向、年代、地址、建筑面积、总价、单价、详情页面URL】
40 | """
41 | # 省份
42 | province = scrapy.Field()
43 | # 城市
44 | city = scrapy.Field()
45 | # 小区名称
46 | name = scrapy.Field()
47 | # 几室几厅
48 | rooms = scrapy.Field()
49 | # 楼层
50 | floor = scrapy.Field()
51 | # 朝向
52 | toward = scrapy.Field()
53 | # 年代
54 | year = scrapy.Field()
55 | # 地址
56 | address = scrapy.Field()
57 | # 建筑面积
58 | area = scrapy.Field()
59 | # 总价
60 | price = scrapy.Field()
61 | # 单价
62 | unit = scrapy.Field()
63 | # 详情页面url
64 | origin_url = scrapy.Field()
65 |
--------------------------------------------------------------------------------
/scrapy/sfw_spider/sfw/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class SfwSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class SfwDownloaderMiddleware(object):
60 | # Not all methods need to be defined. If a method is not defined,
61 | # scrapy acts as if the downloader middleware does not modify the
62 | # passed objects.
63 |
64 | @classmethod
65 | def from_crawler(cls, crawler):
66 | # This method is used by Scrapy to create your spiders.
67 | s = cls()
68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 | return s
70 |
71 | def process_request(self, request, spider):
72 | # Called for each request that goes through the downloader
73 | # middleware.
74 |
75 | # Must either:
76 | # - return None: continue processing this request
77 | # - or return a Response object
78 | # - or return a Request object
79 | # - or raise IgnoreRequest: process_exception() methods of
80 | # installed downloader middleware will be called
81 | return None
82 |
83 | def process_response(self, request, response, spider):
84 | # Called with the response returned from the downloader.
85 |
86 | # Must either;
87 | # - return a Response object
88 | # - return a Request object
89 | # - or raise IgnoreRequest
90 | return response
91 |
92 | def process_exception(self, request, exception, spider):
93 | # Called when a download handler or a process_request()
94 | # (from other downloader middleware) raises an exception.
95 |
96 | # Must either:
97 | # - return None: continue processing this exception
98 | # - return a Response object: stops process_exception() chain
99 | # - return a Request object: stops process_exception() chain
100 | pass
101 |
102 | def spider_opened(self, spider):
103 | spider.logger.info('Spider opened: %s' % spider.name)
104 |
105 |
106 | # ===================================================================
107 | import random
108 |
109 |
110 | # 随机请求头
111 | # 自定义一个下载器中间件【Download Middlewares】【请求头】
112 | # 所有请求头可以参考:http://www.useragentstring.com/pages/useragentstring.php?typ=Browser
113 | class UserAgentDownloaderMiddleware(object):
114 | USER_AGENTS = [
115 | 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
116 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
117 | 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;',
118 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
119 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv,2.0.1) Gecko/20100101 Firefox/4.0.1',
120 | 'Mozilla/5.0 (Windows NT 6.1; rv,2.0.1) Gecko/20100101 Firefox/4.0.1'
121 | ]
122 |
123 | def process_request(self, request, spider):
124 | # 随机拿到一个请求头
125 | user_agent = random.choice(self.USER_AGENTS)
126 |
127 | # 设置到request
128 | request.headers['User-Agent'] = user_agent
129 |
130 | request.headers['Location'] = None
131 |
--------------------------------------------------------------------------------
/scrapy/sfw_spider/sfw/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | from scrapy.exporters import JsonLinesItemExporter
9 | from .items import NewHouseItem, ESFHouseItem
10 |
11 |
12 | class SfwPipeline(object):
13 |
14 | def __init__(self):
15 | self.fp_new_house = open('new_house.json', 'wb')
16 | self.fp_esf_house = open('esf_house.json', 'wb')
17 |
18 | self.exporter_new_house = JsonLinesItemExporter(self.fp_new_house, ensure_ascii=False)
19 | self.exporter_esf_house = JsonLinesItemExporter(self.fp_esf_house, ensure_ascii=False)
20 |
21 | def process_item(self, item, spider):
22 | if isinstance(item, NewHouseItem):
23 |             print('写入一条新房数据')
24 | self.exporter_new_house.export_item(item)
25 | else:
26 | print('写入一条二手房数据')
27 | self.exporter_esf_house.export_item(item)
28 | return item
29 |
30 | def close_spider(self, spider):
31 | self.fp_new_house.close()
32 | self.fp_esf_house.close()
33 |
--------------------------------------------------------------------------------
/scrapy/sfw_spider/sfw/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for sfw project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'sfw'
13 |
14 | SPIDER_MODULES = ['sfw.spiders']
15 | NEWSPIDER_MODULE = 'sfw.spiders'
16 |
17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
18 | # USER_AGENT = 'sfw (+http://www.yourdomain.com)'
19 |
20 | # Obey robots.txt rules
21 | ROBOTSTXT_OBEY = False
22 |
23 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
24 | # CONCURRENT_REQUESTS = 32
25 |
26 | # Configure a delay for requests for the same website (default: 0)
27 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
28 | # See also autothrottle settings and docs
29 | DOWNLOAD_DELAY = 3
30 | # The download delay setting will honor only one of:
31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16
32 | # CONCURRENT_REQUESTS_PER_IP = 16
33 |
34 | # Disable cookies (enabled by default)
35 | # COOKIES_ENABLED = False
36 |
37 | # Disable Telnet Console (enabled by default)
38 | # TELNETCONSOLE_ENABLED = False
39 |
40 | # Override the default request headers:
41 | DEFAULT_REQUEST_HEADERS = {
42 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
43 | 'Accept-Language': 'en',
44 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
45 | }
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | # SPIDER_MIDDLEWARES = {
50 | # 'sfw.middlewares.SfwSpiderMiddleware': 543,
51 | # }
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | DOWNLOADER_MIDDLEWARES = {
56 | 'sfw.middlewares.SfwDownloaderMiddleware': 543,
57 | 'sfw.middlewares.UserAgentDownloaderMiddleware': 500,
58 | }
59 |
60 | # Enable or disable extensions
61 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
62 | # EXTENSIONS = {
63 | # 'scrapy.extensions.telnet.TelnetConsole': None,
64 | # }
65 |
66 | # Configure item pipelines
67 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
68 | ITEM_PIPELINES = {
69 | 'sfw.pipelines.SfwPipeline': 300,
70 | }
71 |
72 | # Enable and configure the AutoThrottle extension (disabled by default)
73 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
74 | # AUTOTHROTTLE_ENABLED = True
75 | # The initial download delay
76 | # AUTOTHROTTLE_START_DELAY = 5
77 | # The maximum download delay to be set in case of high latencies
78 | # AUTOTHROTTLE_MAX_DELAY = 60
79 | # The average number of requests Scrapy should be sending in parallel to
80 | # each remote server
81 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
82 | # Enable showing throttling stats for every response received:
83 | # AUTOTHROTTLE_DEBUG = False
84 |
85 | # Enable and configure HTTP caching (disabled by default)
86 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
87 | # HTTPCACHE_ENABLED = True
88 | # HTTPCACHE_EXPIRATION_SECS = 0
89 | # HTTPCACHE_DIR = 'httpcache'
90 | # HTTPCACHE_IGNORE_HTTP_CODES = []
91 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
92 |
93 |
--------------------------------------------------------------------------------
/scrapy/sfw_spider/sfw/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/scrapy/sfw_spider/start.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: start.py
12 | @time: 11/15/18 21:04
13 | @description: launch the sfw spider via the scrapy command line
14 | """
15 | from scrapy import cmdline
16 |
17 | cmdline.execute('scrapy crawl sfw_spider'.split())
18 |
19 |
--------------------------------------------------------------------------------
/scrapy/weixin_community/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/scrapy/weixin_community/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/scrapy/weixin_community/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/scrapy/weixin_community/.idea/weixin_community.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/scrapy/weixin_community/readme.MD:
--------------------------------------------------------------------------------
1 | # Crawling the WeChat mini-program forum (微信小程序社区) with `CrawlSpider`
2 | 1. Create a project
3 | 
4 | ```
5 | scrapy startproject weixin_community
6 | ```
7 | 
8 | 2. Create a spider
9 | 
10 | ```
11 | # enter the project folder first
12 | cd weixin_community
13 | 
14 | # generate a CrawlSpider-based spider
15 | scrapy genspider -t crawl wx_spider "wxapp-union.com"
16 | ```
17 | 
18 | 3. Open the project with `PyCharm`
19 | 
20 | 4. Edit the `settings.py` file
21 | 
22 | ```
23 | ROBOTSTXT_OBEY = False
24 | 
25 | DOWNLOAD_DELAY = 1
26 | 
27 | DEFAULT_REQUEST_HEADERS = {
28 |   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
29 |   'Accept-Language': 'en',
30 |   'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
31 | }
32 | 
33 | ITEM_PIPELINES = {
34 |    'weixin_community.pipelines.WeixinCommunityPipeline': 300,
35 | }
36 | ```
37 | 
38 | 5. Write the spider
39 | 
40 | 6. Write the data model
41 | 
42 | 7. Write the `Pipeline`
43 | 
44 | 8. Run and test (see the run sketch at the end of this file)
46 |
47 |
--------------------------------------------------------------------------------
/scrapy/weixin_community/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = weixin_community.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = weixin_community
12 |
--------------------------------------------------------------------------------
/scrapy/weixin_community/weixin_community/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/scrapy/weixin_community/weixin_community/__init__.py
--------------------------------------------------------------------------------
/scrapy/weixin_community/weixin_community/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class WeixinCommunityItem(scrapy.Item):
12 | title = scrapy.Field()
13 | author = scrapy.Field()
14 | pub_time = scrapy.Field()
15 | content = scrapy.Field()
16 |
--------------------------------------------------------------------------------
/scrapy/weixin_community/weixin_community/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class WeixinCommunitySpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class WeixinCommunityDownloaderMiddleware(object):
60 | # Not all methods need to be defined. If a method is not defined,
61 | # scrapy acts as if the downloader middleware does not modify the
62 | # passed objects.
63 |
64 | @classmethod
65 | def from_crawler(cls, crawler):
66 | # This method is used by Scrapy to create your spiders.
67 | s = cls()
68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 | return s
70 |
71 | def process_request(self, request, spider):
72 | # Called for each request that goes through the downloader
73 | # middleware.
74 |
75 | # Must either:
76 | # - return None: continue processing this request
77 | # - or return a Response object
78 | # - or return a Request object
79 | # - or raise IgnoreRequest: process_exception() methods of
80 | # installed downloader middleware will be called
81 | return None
82 |
83 | def process_response(self, request, response, spider):
84 | # Called with the response returned from the downloader.
85 |
86 | # Must either;
87 | # - return a Response object
88 | # - return a Request object
89 | # - or raise IgnoreRequest
90 | return response
91 |
92 | def process_exception(self, request, exception, spider):
93 | # Called when a download handler or a process_request()
94 | # (from other downloader middleware) raises an exception.
95 |
96 | # Must either:
97 | # - return None: continue processing this exception
98 | # - return a Response object: stops process_exception() chain
99 | # - return a Request object: stops process_exception() chain
100 | pass
101 |
102 | def spider_opened(self, spider):
103 | spider.logger.info('Spider opened: %s' % spider.name)
104 |
--------------------------------------------------------------------------------
/scrapy/weixin_community/weixin_community/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | from scrapy.exporters import JsonLinesItemExporter, JsonItemExporter
9 |
10 |
11 | # 由于数据量比较大,这里使用 JsonLinesItemExporter(逐条写入,每条数据占一行)
12 |
13 | class WeixinCommunityPipeline(object):
14 |
15 | def __init__(self):
16 | self.fp = open('wxjc.json', 'wb')
17 | self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
18 |
19 | def process_item(self, item, spider):
20 | # 获取一条item,就写入一条数据到文件中
21 | self.exporter.export_item(item)
22 | return item
23 |
24 | def close_spider(self, spider):
25 | self.fp.close()
26 |
27 |
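For contrast, a sketch of the same pipeline built on `JsonItemExporter` (imported above but unused): it writes all items into a single JSON array, which is only valid JSON once the crawl has finished, whereas the JSON-lines output above can be consumed line by line while the spider is still running. The class and file names below are illustrative only:

```python
from scrapy.exporters import JsonItemExporter


class WeixinCommunityJsonArrayPipeline(object):
    """Illustrative alternative that writes one JSON array instead of JSON lines."""

    def __init__(self):
        self.fp = open('wxjc_array.json', 'wb')
        self.exporter = JsonItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
        # start_exporting() writes the opening bracket of the array
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # finish_exporting() writes the closing bracket of the array
        self.exporter.finish_exporting()
        self.fp.close()
```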
--------------------------------------------------------------------------------
/scrapy/weixin_community/weixin_community/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for weixin_community project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'weixin_community'
13 |
14 | SPIDER_MODULES = ['weixin_community.spiders']
15 | NEWSPIDER_MODULE = 'weixin_community.spiders'
16 |
17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
18 | # USER_AGENT = 'weixin_community (+http://www.yourdomain.com)'
19 |
20 | # Obey robots.txt rules
21 | ROBOTSTXT_OBEY = False
22 |
23 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
24 | # CONCURRENT_REQUESTS = 32
25 |
26 | # Configure a delay for requests for the same website (default: 0)
27 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
28 | # See also autothrottle settings and docs
29 | DOWNLOAD_DELAY = 1
30 | # The download delay setting will honor only one of:
31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16
32 | # CONCURRENT_REQUESTS_PER_IP = 16
33 |
34 | # Disable cookies (enabled by default)
35 | # COOKIES_ENABLED = False
36 |
37 | # Disable Telnet Console (enabled by default)
38 | # TELNETCONSOLE_ENABLED = False
39 |
40 | # Override the default request headers:
41 | DEFAULT_REQUEST_HEADERS = {
42 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
43 | 'Accept-Language': 'en',
44 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
45 | }
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | # SPIDER_MIDDLEWARES = {
50 | # 'weixin_community.middlewares.WeixinCommunitySpiderMiddleware': 543,
51 | # }
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | # DOWNLOADER_MIDDLEWARES = {
56 | # 'weixin_community.middlewares.WeixinCommunityDownloaderMiddleware': 543,
57 | # }
58 |
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | # EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | # }
64 |
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'weixin_community.pipelines.WeixinCommunityPipeline': 300
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | # AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | # AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | # AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | # AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | # HTTPCACHE_ENABLED = True
87 | # HTTPCACHE_EXPIRATION_SECS = 0
88 | # HTTPCACHE_DIR = 'httpcache'
89 | # HTTPCACHE_IGNORE_HTTP_CODES = []
90 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/scrapy/weixin_community/weixin_community/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/scrapy/weixin_community/weixin_community/spiders/wx_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from scrapy.linkextractors import LinkExtractor
4 | from scrapy.spiders import CrawlSpider, Rule
5 | from weixin_community.items import WeixinCommunityItem
6 |
7 |
8 | class WxSpiderSpider(CrawlSpider):
9 | name = 'wx_spider'
10 | allowed_domains = ['wxapp-union.com']
11 | # 起始页从第 1 页开始
12 | start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1']
13 |
14 | # 定义规则
15 | rules = (
16 | # 列表【页面】
17 | Rule(LinkExtractor(allow=r'.+mod=list&catid=2&page=\d'), follow=True),
18 |
19 | # 详情【页面】
20 | Rule(LinkExtractor(allow=r'article-.+\.html'), callback='parse_detail', follow=False)
21 | )
22 |
23 |
24 | def parse_detail(self, response):
25 | # 标题
26 | title = response.xpath('//h1[@class="ph"]/text()').get()
27 |
28 | # p 标签元素
29 | author_element_p = response.xpath('//p[@class="authors"]')
30 |
31 | # 作者
32 | author = author_element_p.xpath('./a/text()').get()
33 |
34 | # 发布时间
35 | pub_time = author_element_p.xpath('./span/text()').get()
36 |
37 | # 内容
38 | content_pre = response.xpath('//td[@id="article_content"]//text()').getall()
39 |
40 | content = "".join(content_pre).strip()
41 |
42 | # 把解析完的数据交个 Pipline 去处理
43 | yield WeixinCommunityItem(title=title, author=author, pub_time=pub_time, content=content)
44 |
--------------------------------------------------------------------------------
/spiders/film_xinpianchang/models.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: models.py
12 | @time: 12/15/18 23:08
13 | @description:数据模型
14 | """
15 |
16 | from datetime import datetime
17 | from mongoengine import StringField, URLField, IntField, Document, connect
18 |
19 | __author__ = 'xag'
20 |
21 | response = connect('admin', host='localhost', port=27017, username='root', password='xag')
22 |
23 |
24 | class FilmModel(Document):
25 | """
26 | 电影【模型】
27 | """
28 | title = StringField() # 电影标题
29 | type = StringField() # 电影类型
30 | play_num = StringField() # 播放量
31 | like_num = StringField() # 喜欢数
32 | img_cover = URLField() # 封面地址
33 | play_address = URLField() # 播放地址
34 | download_address = URLField() # 下载地址
35 |
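A minimal usage sketch for this model, assuming the MongoDB instance configured by `connect()` above is reachable; the field values below are placeholders:

```python
# hypothetical usage of FilmModel; all values are placeholders
film = FilmModel(
    title='Sample Film',
    type='Documentary',
    play_num='12000',
    like_num='300',
    img_cover='https://example.com/cover.jpg',
    play_address='https://example.com/play',
    download_address='https://example.com/download',
)
film.save()  # persists the document into the database configured by connect()

# query it back by title
print(FilmModel.objects(title='Sample Film').first())
```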
--------------------------------------------------------------------------------
/spiders/film_xinpianchang/tools_file.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: tools_file.py
12 | @time: 1/29/19 16:29
13 | @description:文件夹工具类
14 | """
15 | import os
16 |
17 |
18 | def mkdir(path):
19 | """
20 | 新建一个目录
21 | :param path:完整路径
22 | :return:
23 | """
24 | if not os.path.exists(path):
25 | os.makedirs(path)
26 |
27 | return path
28 |
--------------------------------------------------------------------------------
/spiders/film_xinpianchang/tools_string.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: tools_string.py
12 | @time: 1/28/19 23:50
13 | @description:TODO
14 | """
15 |
16 | import random
17 | import string
18 |
19 |
20 | def remove_space(str):
21 | return ''.join(str.split(' ')).replace("\t", '').replace("\n", '')
22 |
23 |
24 | def make_random_string(num):
25 | """
26 | 生成随机字符串
27 | :param num:
28 | :return:
29 | """
30 | return ''.join(random.sample(string.ascii_letters + string.digits, num))
31 |
--------------------------------------------------------------------------------
/spiders/spider_bai_si_bu_de_jie.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: spider_bai_si_bu_de_jie.py
12 | @time: 2018/9/25 19:58
13 | @description:利用多线程爬取【百思不得姐】网站的段子,文字写入 csv 文件,图片下载到本地
14 | """
15 |
16 | import requests
17 | from lxml import etree
18 | import threading
19 | from queue import Queue
20 | import time
21 | import csv
22 | from urllib import request
23 | import fileutils
24 |
25 | HEADERS = {
26 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
27 | 'Referer': 'http://www.budejie.com/hot/1'
28 | }
29 |
30 |
31 | class BSSpider(threading.Thread):
32 | """
33 | 爬取每一页的数据
34 | """
35 |
36 | def __init__(self, page_queue, joke_queue, name, *args, **kwargs):
37 | super(BSSpider, self).__init__(*args, **kwargs)
38 |
39 | # 1.初始化数据
40 | self.page_queue = page_queue
41 | self.joke_queue = joke_queue
42 | self.name = name
43 |
44 | def run(self):
45 | while True:
46 | # 2.如果页面队列为空,就退出循环
47 | if self.page_queue.empty():
48 | print(self.name + '任务完成~')
49 | # while not self.joke_queue.empty():
50 | # print(self.joke_queue.get())
51 | break
52 |
53 | # 3.从队列中获取页面地址
54 | page_url = self.page_queue.get()
55 | self.spider_page(page_url)
56 |
57 | # 6.休眠0.5秒
58 | time.sleep(0.5)
59 |
60 | def spider_page(self, page_url):
61 | """
62 | 爬取一页的数据
63 | :param page_url:页面的url
64 | :return:
65 | """
66 | response = requests.get(page_url, headers=HEADERS)
67 | text_raw = response.text
68 | html_element = etree.HTML(text_raw)
69 |
70 | # 4.利用xpath去解析数据
71 | div_elements = html_element.xpath('//div[@class="j-r-list"]')
72 |
73 | for div_element in div_elements:
74 | duan_zi_elments = div_element.xpath('./ul/li')
75 | for duan_zi_elment in duan_zi_elments:
76 | # 【数据】用户名
77 | username = duan_zi_elment.xpath('.//a[@class="u-user-name"]/text()')[0]
78 |
79 | # 【数据】段子发布时间
80 | pubtime = duan_zi_elment.xpath('.//span/text()')[0]
81 |
82 | desc_element = duan_zi_elment.xpath('.//div[@class="j-r-list-c-desc"]')[0]
83 | # 【数据】段子描述内容
84 | content = desc_element.xpath('./a/text()')[0]
85 |
86 | img_div_element = duan_zi_elment.xpath('.//div[@class="j-r-list-c-img"]')[0]
87 | img = img_div_element.xpath('.//img/@data-original')[0]
88 | alt = img_div_element.xpath('.//img/@alt')[0]
89 |
90 | # 5.把解析后的数据以元组的方式放入到队列中去
91 | self.joke_queue.put((username, content, img, alt, pubtime))
92 |
93 |
94 | class BSWriter(threading.Thread):
95 | """
96 | 下载图片、写入文字数据到csv文件中
97 | """
98 |
99 | def __init__(self, page_queue, joke_queue, writer, gLock, name, *args, **kwargs):
100 | super(BSWriter, self).__init__(*args, **kwargs)
101 |
102 | # 1.初始化
103 | self.page_queue = page_queue
104 | self.joke_queue = joke_queue
105 | self.writer = writer
106 | self.gLock = gLock
107 | self.name = name
108 |
109 | def run(self):
110 | while True:
111 | if self.joke_queue.empty() and self.page_queue.empty():
112 | print(self.name + '任务完成~')
113 | break
114 |
115 | # 2.从joke_queue队列中获取数据
116 | joke_info = self.joke_queue.get(timeout=40)
117 | username, content, img, alt, pubtime = joke_info
118 |
119 | # 3.上锁
120 | self.gLock.acquire()
121 |
122 | # 4.写入数据到csv中
123 | self.writer.writerow((username, content, img, alt, pubtime))
124 |
125 | # 5.下载图片到本地
126 | # file_name = alt + fileutils.get_file_suffix(img)
127 | # request.urlretrieve(img, './imgs/%s' % file_name)
128 |
129 | # 6.释放锁
130 | self.gLock.release()
131 |
132 | print('写入一条数据成功')
133 |
134 |
135 | class BSDownImg(threading.Thread):
136 | """
137 | 下载图片的消费者
138 | """
139 |
140 | def __init__(self, page_queue, joke_queue, gLock, name, *args, **kwargs):
141 | super(BSDownImg, self).__init__(*args, **kwargs)
142 | self.page_queue = page_queue
143 | self.joke_queue = joke_queue
144 | self.gLock = gLock
145 | self.name = name
146 |
147 | def run(self):
148 | while True:
149 | if self.joke_queue.empty() and self.page_queue.empty():
150 | print(self.name + '任务完成~')
151 | break
152 | username, content, img, alt, pubtime = self.joke_queue.get(timeout=40)
153 |
154 | # 上锁并下载图片
155 | self.gLock.acquire()
156 | file_name = alt + fileutils.get_file_suffix(img)
157 | request.urlretrieve(img, './imgs/%s' % file_name)
158 | self.gLock.release()
159 |
160 | print('下载一张图片成功')
161 |
162 |
163 | def spider():
164 | """
165 |     爬取百思不得姐的前10页数据
166 | :return:
167 | """
168 |
169 | # 1.构建队列【生产者、消费者需要上锁的对象】
170 | page_queue = Queue(20)
171 | joke_queue = Queue(200)
172 |
173 | # 2.锁对象
174 | gLock = threading.Lock()
175 |
176 | # 3.写入
177 | fp = open('jokes.csv', 'a', newline='', encoding='utf-8')
178 | writer = csv.writer(fp)
179 |
180 | # 4.写入csv表头信息
181 | writer.writerow(['username', 'content', 'img', 'alt', 'pubtime'])
182 |
183 | # 5.前10页待爬取的地址,放入到队列中
184 | for page_num in range(1, 11):
185 | page_url = 'http://www.budejie.com/hot/%d' % page_num
186 | page_queue.put(page_url)
187 |
188 |     # 6.构建 5 个生产者来进行爬取
189 | for x in range(1, 6):
190 | t = BSSpider(page_queue, joke_queue, name='生产者%d' % x)
191 | t.start()
192 |
193 | # 7.构建 20 个消费者来写入数据到csv文件中
194 | for x in range(1, 21):
195 | t = BSWriter(page_queue, joke_queue, writer, gLock, name='消费者-文字%d' % x)
196 | t.start()
197 |
198 | # 8.构建 50 个消费者来下载图片
199 | for x in range(1, 51):
200 | t = BSDownImg(page_queue, joke_queue, gLock, name='消费者-图片%d' % x)
201 | t.start()
202 |
203 |
204 | if __name__ == '__main__':
205 | spider()
206 |
--------------------------------------------------------------------------------
/spiders/spider_boss.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: spider_boss.py
12 | @time: 2018/10/12 10:17
13 | @description:使用selenium爬取boss直聘网并写入到csv文件中
14 | """
15 |
16 | from selenium import webdriver
17 | import re
18 | from lxml import etree
19 | import requests
20 | import time
21 | import string_utils
22 | import csv
23 |
24 | current_page = 1
25 |
26 |
27 | class BossSpider(object):
28 | driver_path = "/usr/local/bin/chromedriver"
29 |
30 | def __init__(self):
31 | self.driver = webdriver.Chrome(executable_path=BossSpider.driver_path)
32 |
33 | # 网页前缀
34 | self.domain = 'https://www.zhipin.com'
35 |
36 |         # 待爬取的首页
37 | self.url = 'https://www.zhipin.com/job_detail/?query=python&scity=100010000&industry=&position='
38 |
39 | self.positions = []
40 |
41 | # 保存数据到 csv 文件中【追加】
42 | fp = open('positions.csv', 'a', newline='', encoding='utf-8')
43 | self.writer = csv.DictWriter(fp, ['company_name', 'name', 'salary', 'city', 'work_years', 'education', 'desc'])
44 | self.writer.writeheader()
45 |
46 | def run(self):
47 | self.driver.get(self.url)
48 |
49 | global current_page
50 |
51 | while True:
52 |
53 | print('爬取第%d页数据' % current_page)
54 | current_page = current_page + 1
55 |
56 |             # 获取当前页面的源码内容
57 | source = self.driver.page_source
58 |
59 |             # 爬取当前页面的数据
60 | self.parse_current_page(source)
61 |
62 | next_bt = self.driver.find_element_by_xpath("//a[@ka='page-next']")
63 |
64 | if 'disabled' in next_bt.get_attribute("class"):
65 | # 最后一页,爬取完成之后,退出应用
66 | self.driver.quit()
67 | break
68 | else:
69 | next_bt.click()
70 |
71 | time.sleep(1)
72 |
73 | # 由于boss直聘做了反爬【验证码】,这里只爬取一页数据
74 | break
75 |
76 | def parse_current_page(self, source):
77 | """
78 |         解析当前页面的数据,获取详情页面的 url:detail_url
79 | :param source:
80 | :return:
81 | """
82 | html = etree.HTML(source)
83 |
84 |         # 获取每一个职位的详情页地址
85 | detail_urls_pre = html.xpath('//div[@class="info-primary"]//a/@href')
86 | # links = html.xpath("//div[@class='info-primary']//a[position()=1]/@href")
87 |
88 | # 利用lambda + map 对职位详情地址列表加入前缀
89 | detail_urls = list(map(lambda x: self.domain + x, detail_urls_pre))
90 |
91 | # 爬取详情页面的数据
92 | for detail_url in detail_urls:
93 | self.request_detail_page(detail_url)
94 |
95 | time.sleep(1)
96 |
97 | def request_detail_page(self, detail_url):
98 | """
99 | 打开职位详情页面
100 | :param detail_url:
101 | :return:
102 | """
103 |
104 | # 1.切换到详情页面窗口
105 | self.driver.execute_script("window.open('%s')" % (detail_url))
106 | self.driver.switch_to.window(self.driver.window_handles[1])
107 |
108 | # 2.获取详情页面的源码数据
109 | page_source_detail = self.driver.page_source
110 |
111 | # 3.解析详情页面
112 | self.parse_detail_page(page_source_detail)
113 |
114 | # 4.关闭当前窗口并切换回列表
115 | self.driver.close()
116 |
117 | self.driver.switch_to.window(self.driver.window_handles[0])
118 |
119 | def parse_detail_page(self, page_source_detail):
120 | """
121 | 解析职位详情页面
122 | :param page_source_detail:
123 | :return:
124 | """
125 | html = etree.HTML(page_source_detail)
126 |
127 | # 数据 - 名称
128 | name = html.xpath('//h1/text()')[0]
129 |
130 | # 数据 - 公司名称
131 | company_name = html.xpath('//h3[@class="name"]/a[@ka="job-detail-company"]/text()')[0].strip()
132 |
133 | # 数据 - 薪水
134 | salary = html.xpath("//div[@class='name']/span[@class='badge']/text()")[0].strip()
135 |
136 | # 数据 - info
137 | infos = html.xpath("//div[@class='job-primary detail-box']/div[@class='info-primary']/p/text()")
138 |
139 | desc_pre = html.xpath('//div[@class="job-sec"]/div[@class="text"]/text()')
140 |
141 | # 每一项换行,去掉前后空格,最后去掉特殊符号
142 | desc = string_utils.remove_special_word('\n'.join(desc_pre).strip())
143 |
144 | city = infos[0]
145 | work_years = infos[1]
146 | education = infos[2]
147 |
148 | position = {
149 | 'company_name': company_name,
150 | 'name': name,
151 | 'salary': salary,
152 | 'city': city,
153 | 'work_years': work_years,
154 | 'education': education,
155 | 'desc': desc
156 |
157 | }
158 | print('爬取一条数据成功')
159 | print("==" * 40)
160 |
161 | # 写入到csv文件中
162 | self.write_to_csv(position)
163 |
164 | self.positions.append(position)
165 |
166 | def write_to_csv(self, position):
167 | """
168 | 把职位信息写入到 csv 文件中
169 | :param position:
170 | :return:
171 | """
172 | self.writer.writerow(position)
173 |
174 |
175 | if __name__ == '__main__':
176 | # 定义爬虫类
177 | spider = BossSpider()
178 |
179 | # 开始执行爬虫
180 | spider.run()
181 |
182 | # 写入到csv文件中
183 |
184 | # 查看数据
185 | print('恭喜!爬取数据完成~')
186 | print(spider.positions)
187 |
--------------------------------------------------------------------------------
/spiders/spider_china_weather.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: spider_china_weather.py
12 | @time: 2018/9/20 0:04
13 | @description:利用requests + bs4 + html5lib + pyecharts爬取中国天气网的最低气温并可视化
14 | @install:# pip install pyecharts/pyecharts-snapshot
15 | """
16 |
17 | import requests
18 | from bs4 import BeautifulSoup
19 | import time
20 | from pyecharts import Bar
21 |
22 |
23 | # 一共8个区域,包含:华北、东北、华东、华中、华南、西北、西南、港澳台
24 | # 华北
25 | url_hb = 'http://www.weather.com.cn/textFC/hb.shtml'
26 |
27 | # 东北
28 | url_db = 'http://www.weather.com.cn/textFC/db.shtml'
29 |
30 | # 华东
31 | url_hd = 'http://www.weather.com.cn/textFC/hd.shtml'
32 |
33 | # 华中
34 | url_hz = 'http://www.weather.com.cn/textFC/hz.shtml'
35 |
36 | # 华南
37 | url_hn = 'http://www.weather.com.cn/textFC/hn.shtml'
38 |
39 | # 西北
40 | url_xb = 'http://www.weather.com.cn/textFC/xb.shtml'
41 |
42 | # 西南
43 | url_xn = 'http://www.weather.com.cn/textFC/xn.shtml'
44 |
45 | # 港澳台【比较特殊】
46 | url_gat = 'http://www.weather.com.cn/textFC/gat.shtml'
47 |
48 | url_areas = [url_hb, url_db, url_hd, url_hz, url_hn, url_xb, url_xn, url_gat]
49 |
50 | HEADERS = {
51 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
52 | 'Referer': 'http://www.weather.com.cn/textFC/hb.shtml'
53 | }
54 |
55 | # 数据【城市+最低温度】列表
56 | ALL_DATA = []
57 |
58 |
59 | def parse_page(url):
60 | """
61 | 解析一个区域:华北、东北、华东等
62 | :param url:
63 | :return:
64 | """
65 | response = requests.get(url, headers=HEADERS)
66 |
67 | # 1.获取页面的原始html数据
68 | text = response.content.decode('utf-8')
69 |
70 |     # 2.构建 BeautifulSoup 对象
71 | # 注意:港澳台中香港的table标签没有正确的关闭,使用lxml解析器不能正确解析。需要使用html5lib【容错性强】去自动补全代码,然后进行解析
72 | soup = BeautifulSoup(text, 'html5lib')
73 |
74 | div_conMidtab = soup.find('div', class_='conMidtab')
75 |
76 | # 3.获取所有的table子Tag【天气信息都保存在table标签下面】
77 | tables = div_conMidtab.find_all('table')
78 |
79 | # 4.遍历片区下面的省份
80 | for table in tables:
81 | # 4.1过滤掉表头的两个tr数据
82 | trs = table.find_all('tr')[2:]
83 |
84 | # 5.遍历省份下面的市区
85 | for index, tr in enumerate(trs):
86 | tds = tr.find_all('td')
87 |
88 | # 5.1 城市名称【第 1 个td标签】
89 | # 注意:一个省份第一个城市取第 2 个td标签;其余城市取第 1 个td标签
90 | city_td = tds[1] if index == 0 else tds[0]
91 |
92 | city = list(city_td.stripped_strings)[0]
93 |
94 | # 5.2 最低气温【倒数第 2 个td标签】
95 | temp_low_td = tds[-2]
96 |
97 | temp_low = list(temp_low_td.stripped_strings)[0]
98 |
99 | ALL_DATA.append({"city": city, "temp_low": int(temp_low)})
100 |
101 |
102 | def spider():
103 | for index, url in enumerate(url_areas):
104 | print('开始爬取第{}个区域'.format(index + 1))
105 | parse_page(url)
106 | time.sleep(1)
107 |
108 |
109 | def analysis_data():
110 | """
111 | 分析爬下来的数据
112 | :return:
113 | """
114 |
115 | # 1.默认的排序方式是升序【通过最低气温进行排序】
116 | ALL_DATA.sort(key=lambda data: data['temp_low'])
117 |
118 | # 2.获取前面10条数据
119 | top_10 = ALL_DATA[:10]
120 |
121 | return top_10
122 |
123 |
124 | def show_with_chart(top_10):
125 | """
126 |     把最低气温的十个城市和温度生成柱状图
127 | :param top_10:
128 | :return:
129 | """
130 | # 1.获取城市列表
131 | citys = list(map(lambda item: item['city'], top_10))
132 |
133 | # 2.最低温度列表
134 | temp_lows = list(map(lambda item: item['temp_low'], top_10))
135 |
136 |     # 3.生成柱状图并渲染到html文件中
137 | bar = Bar("最低气温排行榜")
138 |
139 | bar.add("最低温度", citys, temp_lows)
140 |
141 | # 渲染
142 | bar.render('temperature.html')
143 |
144 |
145 | if __name__ == '__main__':
146 | # 1.爬取数据
147 | spider()
148 |
149 | # 2.分析数据
150 | top_10 = analysis_data()
151 |
152 |     # 3.使用pyecharts生成柱状图
153 | show_with_chart(top_10)
154 |
--------------------------------------------------------------------------------
/spiders/spider_dou_tu_la.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: spider_dou_tu_la
12 | @time: 2018/9/25 14:40
13 | @description:多线程去爬取斗图啦网站的表情
14 | @spider_to:http://www.doutula.com/
15 | """
16 |
17 | import requests
18 | from lxml import etree
19 | from urllib import request
20 | import re
21 | import os
22 | import threading
23 | from queue import Queue
24 | import time
25 |
26 | # 技术点
27 | # 1.使用request是获取html数据
28 | # 2.使用xpath解析数据
29 | # 3.使用正则表达式sub()函数过滤掉特殊的字符
30 | # 4.使用urllib.request.urlretrieve()下载图片
31 | # 5.生产者和消费者模式分离
32 | # 6.使用queue[线程安全]去保存【每一页的爬取地址】和【表情图片地址】
33 |
34 | HEADERS = {
35 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
36 | }
37 |
38 |
39 | class Procuder(threading.Thread):
40 | """
41 | 生产者
42 | 爬取页面,获取图片地址加入到表情图片队列中
43 | """
44 |
45 | def __init__(self, name, page_queue, img_queue, *args, **kwargs):
46 | super(Procuder, self).__init__(*args, **kwargs)
47 | self.name = name
48 | self.page_queue = page_queue
49 | self.img_queue = img_queue
50 |
51 | def run(self):
52 | while True:
53 | if self.page_queue.empty():
54 | print(self.name + '任务完成~')
55 | break
56 | # 1.获取每一页的url
57 | page_url = self.page_queue.get()
58 |
59 | # 2.爬取页面的数据
60 | self.spider_page(page_url)
61 |
62 | # 3.休眠0.5秒
63 | time.sleep(0.5)
64 |
65 | def spider_page(self, url):
66 | """
67 | 爬取每一页
68 | :param url: 每一页的地址
69 | :return:
70 | """
71 | response = requests.get(url, headers=HEADERS)
72 | text_raw = response.text
73 |
74 | # 1.使用etree
75 | html_raw = etree.HTML(text_raw)
76 |
77 | # 2.使用xpath解析数据
78 | # 注意:过滤掉gif标签图片
79 | imgs = html_raw.xpath('//div[@class="page-content text-center"]//img[@class!="gif"]')
80 |
81 | # 3.获取图片的实际连接并下载到本地
82 | for img in imgs:
83 | # 3.1 图片的实际地址
84 | img_url = img.get('data-original')
85 |
86 | # 3.2 图片名称替换特殊符号
87 | alt = re.sub(r'[\??\.,。!!\*]', '', img.get('alt'))
88 |
89 | # 3.3 提取图片的后缀,组装成文件的名字
90 | img_name = alt + os.path.splitext(img_url)[-1]
91 |
92 | # 3.4 把爬取到的表情【图片地址+图片名称】以【元组】的形式加入到队列图片队列中
93 | self.img_queue.put((img_url, img_name))
94 |
95 |
96 | class Consumer(threading.Thread):
97 | """
98 | 消费者
99 | 获取图片的地址下载到本地
100 | """
101 |
102 | def __init__(self, name, page_queue, img_queue, *args, **kwargs):
103 | super(Consumer, self).__init__(*args, **kwargs)
104 | self.name = name
105 | self.page_queue = page_queue
106 | self.img_queue = img_queue
107 |
108 | def run(self):
109 | while True:
110 |
111 | if self.img_queue.empty() and self.page_queue.empty():
112 | print(self.name + '任务完成~')
113 | break
114 |
115 | # 1.解包,获取图片的地址 + 图片的名称
116 | img_url, img_name = self.img_queue.get()
117 |
118 | # 2.使用urlretrieve()函数下载图片到本地
119 | request.urlretrieve(img_url, './imgs/%s' % img_name)
120 |
121 | print(img_name + "下载完成")
122 |
123 |
124 | def spider():
125 | # 1.页面的队列
126 | page_queue = Queue(100)
127 |
128 | # 2.表情图片的队列
129 | # 注意:队列的大小尽量设置大一些,保证线程减少等待的时间
130 | img_queue = Queue(1000)
131 |
132 | # 3.爬取页面的地址
133 | for x in range(1, 10):
134 | url = 'http://www.doutula.com/photo/list/?page=%d' % x
135 |
136 | # 3.1 存入到页面地址队列中
137 | page_queue.put(url)
138 |
139 |     # 创建 5 个生产者和 5 个消费者
140 | # 生产者:爬取每一页的数据,获取表情图片的url
141 | # 消费者:从表情队列中获取表情图片的实际地址并下载到本地
142 | for x in range(5):
143 | t = Procuder(name='生产线程-%d' % x, page_queue=page_queue, img_queue=img_queue)
144 | t.start()
145 |
146 | for x in range(5):
147 | t = Consumer(name='消费线程-%d' % x, page_queue=page_queue, img_queue=img_queue)
148 | t.start()
149 |
150 |
151 | if __name__ == '__main__':
152 | spider()
153 |
--------------------------------------------------------------------------------
/spiders/spider_dytt.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: spider_dytt.py
12 | @time: 2018/9/16 18:46
13 | @description:爬电影天堂【 lxml + xpath + requests】【2018新片精品,包含更多】
14 | """
15 |
16 | import requests
17 | from lxml import etree
18 | import time
19 |
20 | # url = 'http://www.dytt8.net/html/gndy/dyzz/list_23_1.html'
21 |
22 | # 主页地址
23 | BASE_DOMAIN = 'http://www.dytt8.net'
24 |
25 | HEADERS = {
26 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
27 | }
28 |
29 |
30 | def get_detail_urls(url):
31 | """
32 | 获取电影详情页面的url
33 | :param url: 每一页电影列表的地址url
34 | :return:
35 | """
36 | response = requests.get(url, headers=HEADERS)
37 |
38 | # 注意:右键查看源代码,charset=gb2312" 编码方式【网站编码不规范,解码必须用响应的编码方式进行解码】
39 | # print(response.content.decode('gbk'))
40 |
41 | # html_element = etree.HTML(response.content.decode('gbk'))
42 |
43 | # 注意:电影天堂第3页使用默认的gbk会有乱码,这里使用默认的解码方式【href为英文,解析不会受影响】
44 | html_element = etree.HTML(response.text)
45 |
46 | # 【数据 - 字符串列表】详情页面地址
47 | # 所有class为tbspan的table标签/子孙标签中的a标签的href属性
48 | detail_urls = html_element.xpath('//table[@class="tbspan"]//a/@href')
49 |
50 |     # 复制一份列表数据,以便一边遍历副本,一边从原列表中删除脏数据
51 |     # 过滤掉【综合电影】导致的脏数据
52 |     detail_urls_new = detail_urls[:]
53 | for index, detail_url in enumerate(detail_urls_new):
54 | if detail_url == '/html/gndy/jddy/index.html':
55 | detail_urls.remove(detail_url)
56 |
57 | # print(detail_urls)
58 |
59 | # print(BASE_DOMAIN + detail_url)
60 | # 组装详情页面的地址
61 | detail_urls = map(lambda x: BASE_DOMAIN + x, detail_urls)
62 |
63 | return detail_urls
64 |
65 |
66 | def parse_detail_page(detail_url):
67 | """
68 | 解析电影详情页面
69 | :param detail_url: 详情页面的地址
70 | :return:
71 | """
72 | response = requests.get(detail_url, headers=HEADERS)
73 | text = response.content.decode('gbk')
74 | html_element = etree.HTML(text)
75 |
76 | # 【数据 - 电影标题】
77 | title = html_element.xpath('//div[@class="title_all"]//font[@color="#07519a"]/text()')[0]
78 |
79 | # 获取zoom标签
80 | zoom_element = html_element.xpath('//div[@id="Zoom"]')[0]
81 |
82 | # 【数据 - 电影封面和电影截图】
83 | imgs = zoom_element.xpath(".//img/@src")
84 |
85 | # 注意:为了避免脏数据导致应用挂掉,提前初始化
86 |     year, country, type, rating, duration, director, actors, cover, screen_shot, download_url, desc = '', '', '', '', '', '', '', '', '', '', ''
87 |
88 | if len(imgs) > 0:
89 | cover = imgs[0]
90 |
91 | # 【数据 - 电影截图】
92 | if len(imgs) > 1:
93 | screen_shot = imgs[1]
94 |
95 | # 获取div[@id='zoom']标签下面的所有的文本数据【子孙所有的text文本数据】
96 | infos = zoom_element.xpath('.//text()')
97 |
98 | # 解析具体内容的函数
99 | def parse_info(info, rule):
100 | return info.replace(rule, '').strip()
101 |
102 | # 遍历infos每一项去获取有用的数据
103 | for key, info in enumerate(infos):
104 |
105 | # print('遍历第{}项'.format(key))
106 | # print(info)
107 | # print('结束==================================================')
108 |
109 | if info.startswith('◎年 代'):
110 | # 年代
111 | year = parse_info(info, '◎年 代')
112 | elif info.startswith('◎产 地'):
113 | # 产地
114 | country = parse_info(info, '◎产 地')
115 | elif info.startswith('◎类 别'):
116 | # 类别
117 | type = parse_info(info, '◎类 别')
118 | elif info.startswith('◎豆瓣评分'):
119 | # 豆瓣评分
120 | rating = parse_info(info, '◎豆瓣评分')
121 | elif info.startswith('◎片 长'):
122 | # 片长
123 | duration = parse_info(info, '◎片 长')
124 | elif info.startswith('◎导 演'):
125 | # 导演
126 | director = parse_info(info, '◎导 演')
127 | elif info.startswith('◎主 演'):
128 | # 演员【第一个演员】
129 | actor_first = parse_info(info, '◎主 演')
130 |
131 | actors = [actor_first]
132 |
133 | # 继续往下面遍历
134 | for index in range(key + 1, len(infos)):
135 | item = infos[index].strip()
136 | if item.startswith('◎简 介'):
137 | break
138 | # 获取所有的演员
139 | # print(item)
140 | actors.append(item)
141 | elif info.startswith('◎简 介'):
142 | # desc = parse_info(info, '◎简 介')
143 |
144 | for index in range(key + 1, len(infos)):
145 | item = infos[index].strip()
146 | if item.startswith('【下载地址】'):
147 | break
148 | desc = item
149 |
150 | print(detail_url)
151 |
152 | # 下载地址
153 | if len(html_element.xpath('//td[@bgcolor="#fdfddf"]/a/text()')) > 0:
154 | download_url = html_element.xpath('//td[@bgcolor="#fdfddf"]/a/text()')[0]
155 | elif len(html_element.xpath('//td[@bgcolor="#fdfddf"]/text()')) > 0:
156 | download_url = html_element.xpath('//td[@bgcolor="#fdfddf"]/text()')[0]
157 |
158 | film = {
159 | 'title': title,
160 | 'cover': cover,
161 | 'screen_shot': screen_shot,
162 | 'year': year,
163 | 'country': country,
164 | 'type': type,
165 | 'rating': rating,
166 | 'duration': duration,
167 | 'director': director,
168 | 'actors': actors,
169 | 'desc': desc,
170 | 'download_url': download_url
171 | }
172 |
173 | return film
174 |
175 |
176 | def spider():
177 | """
178 | 爬虫的入口
179 | :return:
180 | """
181 | base_url = 'http://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
182 |
183 | films = []
184 |
185 | # 1.获取第1-10页的数据
186 | for index in range(1, 11):
187 | print('开始爬第{}页'.format(index))
188 |
189 | # 2.电影列表的地址url
190 | url = base_url.format(index)
191 |
192 | # 3.获取当前页面包含的所有电影【详情地址】
193 | detail_urls = get_detail_urls(url)
194 |
195 | # 4.解析每一项电影的详情页面
196 |
197 | for key, detail_url in enumerate(detail_urls):
198 | # print('索引:' + str(key) + ',地址:' + detail_url)
199 | # print('解析详情页面:' + detail_url)
200 | film = parse_detail_page(detail_url)
201 |
202 | films.append(film)
203 |
204 |         # 5.每爬取一页,就休眠1秒钟
205 | time.sleep(1)
206 |
207 | print(films)
208 |
209 |
210 | if __name__ == '__main__':
211 | spider()
212 |
--------------------------------------------------------------------------------
/spiders/spider_gushiwen.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: spider_gushiwen
12 | @time: 2018/9/21 17:34
13 | @description:利用【正则表达式】爬取【古诗文】网
14 | @link:https://www.gushiwen.org/
15 | """
16 |
17 | import requests
18 | import re
19 | import time
20 |
21 | HEADERS = {
22 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
23 | }
24 |
25 |
26 | # 利用正则表达式去爬虫的注意事项
27 | # 1.正则表达式去爬取元素的时候,与 xpath、bs4 不同,没有结构关系,都是当成一个字符串进行匹配处理
28 | # 2.re.DOTALL可以让【.符号】匹配到所有的字符【包含\n】
29 | # 3.正则表达式匹配【任意多字符】一般采用非贪婪方式【.*?】
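# 例如,几个小例子帮助理解第 2、3 点(非贪婪匹配与 re.DOTALL 的效果):
#   re.findall(r'<b>(.*?)</b>', '<b>a</b><b>b</b>')   -> ['a', 'b']       非贪婪,逐个匹配
#   re.findall(r'<b>(.*)</b>', '<b>a</b><b>b</b>')    -> ['a</b><b>b']    贪婪,一直匹配到最后一个 </b>
#   re.findall(r'a.b', 'a\nb')                        -> []               默认 . 不匹配换行符
#   re.findall(r'a.b', 'a\nb', re.DOTALL)             -> ['a\nb']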
30 |
31 |
32 | def spider_page(url):
33 | """
34 | 爬取某一页的数据
35 | :param url:
36 | :return:
37 | """
38 | response = requests.get(url, headers=HEADERS)
39 | text_raw = response.text
40 |
41 | # print(text_raw)
42 |
43 |     # 1.获取所有的标题(注:原正则中的 HTML 标签在转存时丢失,以下按 gushiwen.org 页面结构复原,为近似写法)
44 |     titles = re.findall(r'<div class="cont">.*?<b>(.*?)</b>', text_raw, re.DOTALL)
45 |
46 | # 2.获取所有的朝代
47 |     dynasties = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', text_raw, re.DOTALL)
48 |
49 | # 3.获取作者信息
50 |     authors = re.findall(r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', text_raw, re.DOTALL)
51 |
52 | # 4.获取古诗文内容
53 | # 内容待进一步美化【去掉多余的元素】
54 |     contents_pre = re.findall(r'<div class="contson".*?>(.*?)</div>', text_raw, re.DOTALL)
55 |
56 | contents = []
57 | for content_pre in contents_pre:
58 | # 4.1 利用sub()函数把内容中的【<.*?>或者换行字符】替换为空
59 | content = re.sub(r'<.*?>|\n', "", content_pre)
60 | contents.append(content.strip())
61 |
62 | # 诗词列表数据
63 | poems = []
64 |
65 | # 5. 使用zip()把四个列表组合在一起
66 | for value in zip(titles, dynasties, authors, contents):
67 | # 5.1 自动进行解包放入到变量当中
68 | title, dynastie, author, content = value
69 |
70 | # 5.2 新建dict,并加入到诗词列表数据中
71 | poem = {
72 | 'title': title,
73 | 'dynastie': dynastie,
74 | 'author': author,
75 | 'content': content
76 | }
77 |
78 | poems.append(poem)
79 |
80 | return poems
81 |
82 |
83 | def spider():
84 | # 全部诗词列表数据
85 | poems = []
86 |
87 | # 1.爬取前面10页数据
88 | for page_num in range(10):
89 | url = 'https://www.gushiwen.org/default_{}.aspx'.format(page_num + 1)
90 |
91 | print('开始爬取第{}页诗词数据'.format(page_num + 1))
92 |
93 | poems.extend(spider_page(url))  # 用 extend 把每一页的诗词并入同一个列表,方便下面逐首打印
94 |
95 | time.sleep(1)
96 |
97 | # 2.显示数据
98 | for poem in poems:
99 | print(poem)
100 | print("==" * 40)
101 |
102 | print('恭喜!爬取数据完成!')
103 |
104 |
105 | if __name__ == '__main__':
106 | spider()
107 |
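108 | # A small illustration of notes 2 and 3 above (the HTML snippet here is invented, purely for demonstration):
109 | #   re.findall(r'<b>(.*?)</b>', '<b>静夜思</b>\n<b>春晓</b>', re.DOTALL)  ->  ['静夜思', '春晓']
110 | #   re.findall(r'<b>(.*)</b>', '<b>静夜思</b>\n<b>春晓</b>', re.DOTALL)   ->  ['静夜思</b>\n<b>春晓']
111 | # The non-greedy .*? stops at the nearest closing tag, while the greedy .* swallows everything up to
112 | # the last one; re.DOTALL is what lets . also match the \n between the two tags.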
--------------------------------------------------------------------------------
/spiders/spider_qiu_shi_bai_ke.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: spider_qiu_shi_bai_ke.py
12 | @time: 2018/9/21 23:16
13 | @description:利用正则表达式去爬取【糗事百科】的文字数据
14 | @link:https://www.qiushibaike.com/text/
15 | """
16 |
17 | import re
18 | import requests
19 |
20 | # 待爬取的地址
21 | base_url = 'https://www.qiushibaike.com/text/page/%s/'
22 |
23 | HEADERS = {
24 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
25 | 'Referer': 'https://www.qiushibaike.com/'
26 | }
27 |
28 |
29 | def spider_page(url):
30 | """
31 | 爬取某一页的数据
32 | :param url:
33 | :return:
34 | """
35 | response = requests.get(url, headers=HEADERS)
36 | text_raw = response.text
37 |
38 | # 获取此页的段子数据
39 | # 1.获取作者列表数据
40 | authors_pre = re.findall(r'<h2>(.*?)</h2>', text_raw, re.DOTALL)  # 注:HTML 标签按当时页面结构补回
41 |
42 | # 1.1 对获取的作者信息进一步进行处理【数据中包含\n】
43 | authors = []
44 | for author_pre in authors_pre:
45 | author = re.sub(r'\n', '', author_pre)
46 | authors.append(author)
47 |
48 | # 2.获取段子列表数据
49 | contents_pre = re.findall(r'<div class="content">.*?<span>(.*?)</span>', text_raw, re.S)  # 注:HTML 标签按当时页面结构补回
50 |
51 | # 2.1 对段子数据进一步处理【数据中包含\n和<br/>】
52 | contents = []
53 | for content_pre in contents_pre:
54 | content = re.sub(r'<.*?>|\n', '', content_pre)
55 | contents.append(content)
56 |
57 | # 3.把两个列表数据组装成一个新的列表中
58 | jokes = []
59 | for temp in zip(authors, contents):
60 | author, content = temp
61 | jokes.append({
62 | 'author': author,
63 | 'content': content
64 | })
65 |
66 | # 4.返回当前页面获取的段子数据列表
67 | return jokes
68 |
69 |
70 | def spider():
71 | jokes = []
72 |
73 | for page_num in range(1, 10):
74 | print('开始爬取第%s页数据' % page_num)
75 |
76 | # 爬取某一页的数据
77 | jokes.extend(spider_page(base_url % page_num))  # 用 extend 合并每页的段子,便于下面逐条打印
78 |
79 | # 打印爬取的数据
80 | for joke in jokes:
81 | print(joke)
82 |
83 | print('恭喜!爬取数据完成!')
84 |
85 |
86 | if __name__ == '__main__':
87 | spider()
88 |
--------------------------------------------------------------------------------
/spiders/spider_tencent_recruit.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: spider_tencent_recruit
12 | @time: 2018/9/17 11:22
13 | @description:爬腾讯招聘职位信息
14 | """
15 |
16 | import requests
17 |
18 | from lxml import etree
19 |
20 | import time
21 |
22 | # 每页的职位数
23 | PAGE_SIZE = 10
24 |
25 | BASE_DOMAIN = 'https://hr.tencent.com/'
26 |
27 | HEADERS = {
28 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
29 | 'Referer': 'https://hr.tencent.com/position.php?lid=&tid=&keywords=python&start=10',
30 | 'Cookie': '_ga=GA1.2.1222789966.1535530525; pgv_pvi=8193187840; pgv_si=s2985358336; PHPSESSID=22e3m8aknd19s1gqkh0i9eisk0; Hm_lvt_0bd5902d44e80b78cb1cd01ca0e85f4a=1536726429,1536908218,1537154694,1537166987; Hm_lpvt_0bd5902d44e80b78cb1cd01ca0e85f4a=1537167106'
31 | }
32 |
33 |
34 | def get_jo_detail_urls(page_url):
35 | """
36 | 1.根据当前页面url地址获取每一个职位的详情页面url
37 | :param page_url:当前页面的url
38 | :return:
39 | """
40 | response = requests.get(page_url, headers=HEADERS)
41 |
42 | html_element = etree.HTML(response.text)
43 |
44 | # print(etree.tostring(html_element, encoding='utf-8').decode('utf-8'))
45 |
46 | detail_urls = html_element.xpath('//tr[@class="even" or @class="odd"]//a/@href')
47 |
48 | # 获取所有职位详情页面的url
49 | detail_urls = map(lambda detail_url: BASE_DOMAIN + detail_url, detail_urls)
50 |
51 | return detail_urls
52 |
53 |
54 | def get_detail_msg(detail_url):
55 | """
56 | 2.获取某个职位的详细数据
57 | :param detail_url: 职位详细页面的url
58 | :return: 职位数据
59 | """
60 | # print('请求的详细地址是:' + detail_url)
61 | response = requests.get(detail_url, headers=HEADERS)
62 | html_element = etree.HTML(response.text)
63 |
64 | position = {}
65 |
66 | # 【数据】获取职位标题
67 | title = html_element.xpath('//tr[@class="h"]/td/text()')[0]
68 | position['title'] = title
69 |
70 | # 【数据】工作地点/职位类别
71 | top_infos = html_element.xpath('//tr[@class="c bottomline"]//text()')
72 | position['location'] = top_infos[top_infos.index('工作地点:') + 1]
73 | position['category'] = top_infos[top_infos.index('职位类别:') + 1]
74 |
75 | content_infos = html_element.xpath('//ul[@class="squareli"]')
76 | # 【数据】工作职责
77 | work_do_info = content_infos[0]
78 | position['duty'] = work_do_info.xpath("./li/text()")
79 |
80 | # 【数据】工作要求
81 | work_ask_info = content_infos[1]
82 | position['ask'] = work_ask_info.xpath('./li/text()')
83 |
84 | return position
85 |
86 |
87 | def spider():
88 | # 0.待返回的职位数据
89 | positions = []
90 |
91 | # 1.获取前10页的职位数据
92 | for page_num in range(0, 10):
93 | print('开始爬取第{}页数据'.format(page_num + 1))
94 |
95 | # 2.每一页的地址
96 | url = 'https://hr.tencent.com/position.php?keywords=python&lid=0&tid=0&start={}#a'.format(page_num * PAGE_SIZE)
97 |
98 | # 3.获取【当前页】所有职位的【详情页面的url】
99 | detail_urls = get_jo_detail_urls(url)
100 |
101 | # 4.一个个去解析详情页面的数据
102 | for detail_url in detail_urls:
103 | position = get_detail_msg(detail_url)
104 | positions.append(position)
105 |
106 | time.sleep(1)
107 |
108 | print('爬取完成!')
109 | print(positions)
110 |
111 |
112 | if __name__ == '__main__':
113 | spider()
114 |
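115 | # Note on the parsing trick in get_detail_msg(): the detail table is read as one flat list of
116 | # text nodes, and each value is looked up as "the element right after its label", e.g.
117 | #   infos = ['工作地点:', '深圳', '职位类别:', '技术类']   # invented sample data
118 | #   infos[infos.index('工作地点:') + 1]                    # -> '深圳'
119 | # list.index() raises ValueError if the label is missing, so this assumes the page layout stays stable.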
--------------------------------------------------------------------------------
/spiders/发表情/auto_send_emoji.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: auto_send_emoji.py
12 | @time: 3/14/19 16:22
13 | @description:根据要求选择表情,发给微信上对应的好友或者微信群
14 | """
15 |
16 | import requests
17 | from lxml import etree
18 | import os
19 | import re
20 | from utils.string_utils import *
21 | import time
22 | import random
23 | from urllib import request
24 | import itchat
25 | from utils.chat_utils import *
26 | import matplotlib.pyplot as plt
27 | import matplotlib.image as mpimg
28 | from queue import Queue
29 | import threading
30 |
31 | # pip3 install itchat
32 |
33 | HEADERS = {
34 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
35 | }
36 |
37 | url = 'https://www.doutula.com/search?type=photo&more=1&keyword={}&page={}'
38 |
39 |
40 | class Spider(object):
41 |
42 | def __init__(self, emoji_type, send_to):
43 | self.emoji_type = emoji_type
44 | self.send_to = send_to
45 | self.emojis = []
46 |
47 | # 起始页码
48 | self.start_page = 1
49 |
50 | def get_emojis(self):
51 |
52 | while True:
53 | current_url = url.format(self.emoji_type, self.start_page)
54 | resp = requests.get(current_url, headers=HEADERS)
55 | html_raw = etree.HTML(resp.text)
56 |
57 | # 判断当前是否还有emoji表情
58 | container_element = html_raw.xpath('//div[@class="random_picture"]//img')
59 | if len(container_element) > 0:
60 | self.emojis.extend(self.__get_current_page_emoji(html_raw))
61 | self.start_page += 1
62 | else:
63 | print("当前页面没有表情数据,地址是:%s" % current_url)
64 | break
65 |
66 | time.sleep(0.5)
67 |
68 | def __get_current_page_emoji(self, html_raw):
69 | """
70 | 获取当前页面所有的emoji图片
71 | :param current_url:
72 | :return:
73 | """
74 |
75 | a_elements = html_raw.xpath('//div[@class="pic-content text-center"]/div[@class="random_picture"]/a')
76 |
77 | print("第%d页一共有%d张图片" % (self.start_page, len(a_elements)))
78 |
79 | imgs = []
80 |
81 | for a_element in a_elements:
82 | # 获取img标签【最后一个img】【存储地址】
83 | img_element = a_element.xpath('./img[last()]')[0]
84 |
85 | # 获取p标签【存储名称】
86 | name = a_element.xpath('./p/text()')[0]
87 |
88 | # xpath获取兄弟节点p
89 | # 表情的名称
90 | # name = img_element.xpath('./../p/text()')[0]
91 |
92 | # 表情的下载地址
93 | img_url = img_element.get('data-original')
94 |
95 | # 表情的新名词,不带后缀
96 | # name_new = remove_space(re.sub(r'[\??\.,。!!\*]', '', name))
97 |
98 | # 注意:由于itchat没法发送带中文的文件,这里随机生成一个名称
99 | name_new = make_random_string(6)
100 |
101 | # 表情的名称,加上后缀
102 | # print('==' * 60)
103 | # print(name_new)
104 | # print(img_url)
105 | # print('==' * 60)
106 | img_name = name_new + os.path.splitext(img_url)[-1]
107 |
108 | imgs.append({
109 | 'name': img_name,
110 | 'url': img_url
111 | })
112 |
113 | return imgs
114 |
115 | def download_emojis(self, target_emoji):
116 | """
117 | 下载表情
118 | :param target_emojis:
119 | :return:
120 | """
121 | # 本地保存目录
122 | local_img = './imgs/%s' % target_emoji.get('name')
123 |
124 | request.urlretrieve(target_emoji.get('url'), local_img)
125 |
126 | print('emoji保存本地地址:%s' % local_img)
127 |
128 | return local_img
129 |
130 | def show_image(self, filename):
131 | lena = mpimg.imread(filename)
132 |
133 | plt.imshow(lena) # 显示图片
134 | plt.axis('off') # 不显示坐标轴
135 | plt.show()
136 |
137 |
138 | if __name__ == '__main__':
139 |
140 | # 准备调用itchat发送图片
141 | itchat.auto_login(hotReload=True)
142 |
143 | emoji_type = input('想发哪类表情:')
144 | send_type = input('某个人:0/群聊:1【默认是单聊】')
145 | send_to = input('发给谁呢?')
146 |
147 | if not emoji_type:
148 | emoji_type = '装逼'
149 |
150 | if not send_type:
151 | send_type = 0
152 | else:
153 | send_type = int(send_type)
154 |
155 | if not send_to:
156 | if send_type == 0:
157 | send_to = '指定经常要发送的一个人'
158 | else:
159 | send_to = '指定经常要发送的一个群'
160 |
161 | spider = Spider(emoji_type, send_to)
162 |
163 | # 待发送的表情
164 | local_img = None
165 |
166 | # 获取这种类型的所有表情
167 | spider.get_emojis()
168 |
169 | while True:
170 |
171 | # 从所有emoji表情中选择一张
172 | choose_emoji = random.sample(spider.emojis, 1)
173 |
174 | # 下载到本地
175 | local_img = spider.download_emojis(choose_emoji[0])
176 |
177 | # 显示图片
178 | spider.show_image(local_img)
179 |
180 | ok = input('主人满意吗:')
181 |
182 | if ok:
183 | print('好的,就发送这张表情。')
184 | if send_type == 0:
185 | send_to_person(send_to, local_img)
186 | else:
187 | send_to_group_chat(send_to, local_img)
188 |
189 | # 需要再发一张吗
190 | go_on_send = input('需要再发一张吗?')
191 | if go_on_send:
192 | continue
193 | else:
194 | print('结束了')
195 | break
196 | else:
197 | print('不满意,继续找一张')
198 | continue
199 |
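200 | # Note: itchat.auto_login(hotReload=True) caches the login state locally (an itchat.pkl file by
201 | # default), so re-running the script shortly afterwards should not require scanning the QR code again.
202 | # chat_utils also calls auto_login() when it is imported, so in practice the login is triggered there first.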
--------------------------------------------------------------------------------
/spiders/发表情/utils/chat_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: chat_utils.py
12 | @time: 3/15/19 11:45
13 | @description:利用 itchat 把图片发送给指定好友或群聊
14 | """
15 |
16 | import itchat
17 |
18 |
19 | itchat.auto_login(True)
20 |
21 |
22 | def send_to_person(username, file_names):
23 | """
24 | 发送给某个人
25 | :param username: 发送对象的昵称
26 | :param filename: 文件名
27 | :return:
28 | """
29 | room = itchat.search_friends(name=r'%s' % username)
30 |
31 | userName = room[0]['UserName']
32 |
33 | try:
34 | if isinstance(file_names, list):
35 | # 多个图片
36 | for file_name in file_names:
37 | itchat.send_image(file_name, toUserName=userName)
38 | else:
39 | # 一个图片
40 | itchat.send_image(file_names, toUserName=userName)
41 | print('发送完毕!')
42 | except Exception as e:
43 | print('发送出错!', e)
44 |
45 |
46 | def send_to_group_chat(target_group_chat_name, file_names):
47 | """
48 | 群聊
49 | :param target_group_chat_name:
50 | :param file_name:
51 | :return:
52 | """
53 | rooms = itchat.get_chatrooms(update=True)
54 |
55 | # 目标群聊对象
56 | target_room = None
57 | for room in rooms:
58 | group_chat_name = room.get('NickName')
59 | if target_group_chat_name == group_chat_name:
60 | target_room = room
61 | break
62 |
63 | if target_room:
64 | if isinstance(file_names, list):
65 | for file_name in file_names:
66 | target_room.send_image(file_name)
67 | else:
68 | target_room.send_image(file_names)
69 |
70 | print('发送完毕!')
71 | else:
72 | print('抱歉,不存在这个群聊')
73 |
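74 | # Minimal usage sketch (the nicknames and file paths below are placeholders):
75 | #   send_to_person('小明', './imgs/abc.gif')                        # one image to a friend
76 | #   send_to_group_chat('家庭群', ['./imgs/a.gif', './imgs/b.gif'])  # several images to a group chat
77 | # Both helpers accept either a single file path or a list of paths, as handled above.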
--------------------------------------------------------------------------------
/spiders/发表情/utils/string_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: string_utils.py
12 | @time: 3/15/19 10:36
13 | @description:TODO
14 | """
15 |
16 | import random
17 | import string
18 |
19 |
20 | def remove_space(source):
21 | """
22 | 去除空格
23 | :param source:
24 | :return:
25 | """
26 | return "".join(source.split(' '))
27 |
28 |
29 |
30 | def make_random_string(num):
31 | """
32 | 生成随机字符串
33 | :param num:
34 | :return:
35 | """
36 | return ''.join(random.sample(string.ascii_letters + string.digits, num))
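37 |
38 | # Note: random.sample() draws characters without replacement, so num must be <= 62
39 | # (26 + 26 letters plus 10 digits) and the generated name never repeats a character.
40 | # For the 6-character file names used by auto_send_emoji.py this is more than enough.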
--------------------------------------------------------------------------------
/spiders/年终奖/comments.txt:
--------------------------------------------------------------------------------
1 | 有,一个月工资
2 | 我们没有
3 | 有
4 | 还没发
5 | 没有
6 | 没有
7 | 这个真没有
8 | 没有
9 | 有4.8✖️1.5
10 | 从来没有
11 | 没有
12 | 没有,只有水果[流泪]购物卡也没
13 | 年后发……
14 | 年终奖没有,年会也没有了[捂脸]
15 | 没有了
16 | 说了有,还没发
17 | 从来没有
18 | 我这边小公司提都没提,估计凉了。
19 | 有,但是还不确定发多少
20 | 没有
21 | 没有
22 | 没有了
23 | 有,但是要打折了,具体还未知
24 | 没有过
25 | 要过年才知道
26 | 有,一个半月多一点,每个人不一样,这个看部门老大
27 | 没有
28 | 没有,而且要被裁了[流泪]
29 | 不知道有没有,每年都是8-9月分发,年“中”奖。如果发的话,固定一个月[再见]
30 | 我实习3个月,拿了三分之一月的
31 | 有,0.5个月
32 | 两个月[发呆]
33 | 小公司,有一个月的年终奖!
34 | 最后一天上班 给裁了 年终奖都省了
35 | 有.年底双薪
36 | 没有年终奖,年终奖是神马?
37 | 有年终奖,但是由于入职时间过短,不知道自己会有多少。
38 | 还不知道....😂
39 | 从来没有[流泪]
40 | 没有的飘过
41 | 我们没有
42 | 没有。
43 | 不知道耶。。反正有也就3000块钱。。
44 | 没有
45 | 有,跟去年一样
46 | 我们过年前一天才能知道发不发,才能知道发多少 (没有人知道年终奖的计算法方法,发多少是多少)。其余时间没有一点年终奖的消息
47 | 还不清楚😂
48 | 年会抽奖算不算😂
49 | 有 还没发
50 | 两个月
51 | 有,但不知道多少
52 | 听说有,听说比去年多。
53 | 木有
54 | 还不知道
55 | 老板说我们的叫13薪,多出来的 1 不是额外给的,每个月抽一点出来,最后考核完了看着给,13薪是这个意思吗[撇嘴]
56 | 没有
57 | 有 大概2-3个月的工资
58 | 听说是有。。
59 | 我听说我们公司的一直都是第二年年中之后才发年终奖 1-3个月不等[衰]
60 | 有 但还没发
61 | 一般随过年工资一起发,还没到时间
62 | 还不知道[捂脸] 19号开年会
63 | 去年的没发[撇嘴]
64 | 有,年终奖和去年一样多,不过全年收入上涨幅度可观😄
65 | 没有[难过]
66 | 有,发了个球和一坨毛线,还有一把锤子。
67 | 没有+1
68 | 还不知道,我们得到除夕的前一天才知道。去年也是除夕的前一天
69 | 还不晓得有没有,这个公司加入还没有一个月
70 | 没有
71 | 年终奖是有的。过年前发,一般人1.5应该有的
72 | 没有年终奖,工资都是拖延
73 | 还没发呢 不太清楚啊
74 | 有
75 | 没有
76 | 没有啦,公司都要倒了!!!
77 | 没消息 可能连年会都没有[衰]
78 | 没有
79 | 没有
80 | 没有
81 | 还没通知呢
82 | 工作三年,从来没拿过年终奖
83 | 今年也没有,年会还年后举行[捂脸]
84 | 有 但是还没确定发多少
85 | 有 2月 还没发
86 | 没有。。。
87 | 有,接近两个月工资
88 | 我们年终奖3个月工资
89 | 没年终奖,有季度奖
90 | 减半
91 | 同一个月
92 | 没有
93 | 十一月入职的,有一点
94 | 估计没有,发工资都困难了[发呆]
95 | 外包公司,一直没有
96 | 没有
97 | 有,俩月
98 | 有年会,转正不久,年终奖还不清楚。
99 | 没有+1
100 | 今年还没通知不发,往常是两个月*绩效
101 | 年会都调到年后3月份开了,还说要开拓疆土,扩大规模🌚
102 | 我们这个级别不会有,,,
103 | 今年四月底入职的,不知道有没有
104 | 13薪的1薪算年终奖吗,算的话就有,不算的话就没有
105 | 有,一个月
106 | 真没有[难过]
107 | 据说是有,没说发多少,待定转态,估计凉了
108 | 以前有。今年估计悬了
109 | 我们应该有,但是还不确定
110 | 应该在大公司、国企、事业单位这些影响不大吧。我们还是有的
111 | 有,主要是前面说好的设计的提成,不知道能给到多少
112 | 没有
113 | 没有
114 | 没有年终奖
115 | 应该有吧!没有立马辞职
116 | 没有
117 | 工资拖欠了[流泪]
118 | 没有,而且公司春节前后还不让请假,如果请假,春节的法定假日就不算法定假日了,算成请假了,要扣钱
119 | 没有
120 | 没有
121 | 一直的传统,一个月工资,但平时工资就比同行低好多,综合年收入八万多一点点
122 | 有
123 | 没有
124 | 没有 公司业绩下滑将近8亿
125 | 有,和以前一样。
126 | 不是看老板心情,去年有,今年就不确定了
127 | 没有
128 | 有,一个月还有少量项目分成
129 | 今年现在都还没提过年终奖这件事,感觉凉了[流泪]
130 | 一个月 !
131 | 要倒闭了
132 | 有,大部分是两个月工资
133 | 没有
134 | 没有,还降薪20%
135 | 应该有,看公司利润了,大概率一个月工资。
136 | 不清楚[流泪]
137 | 有个锤子
138 | 没有
139 | 应该有,还没发,估计底薪x2。
140 | 没有+降薪20%
141 | 物流集团旗下成立的新科技公司,大数据部门貌似一直都没有年终奖……
142 | 我们还在评估发多少
143 | 200块红包
144 | 有,听说大概是月工资的1.几倍。ps:发的是17年的奖金,18的奖金还得往后挪。应该是等不到那一天了。
145 | 我们有2个月,不过要19年年中才发
146 | 从来都没有
147 | 正常的按照绩效发
148 | 有,一个月工资
149 | 还没发 往年1.5不到
150 | 2月
151 | 没有
152 | 从来没有过。。
153 | 往年都是4个月,过年前一周发,今年还不知道
154 | 公司已经裁员,剩下的大概率没有年终奖[撇嘴]
155 | 应该是有,
156 | 有,一个月
157 | 我们有,照常两个月奖金。但是是平时周六上班换来的
158 | 一直没有
159 | 从来没有
160 | 年薪百分之15
161 | 还没听吭
162 | 有,一个月,但是不多
163 | 还不知道。。。
164 | 多半没有!
165 | 一直是13薪。不过今年改成了bouns,比13薪率高,大概1.3个月的样子
166 | 应该有
167 | 往年惯例都是一个月,今年公司效益不错,承诺至少两个月以上,这几天公司还组织来巴厘岛度假。额,是不是有点太拉仇恨了呀[调皮]
168 | 我公司,就我部门没有[微笑]
169 | 有,几个月不知道
170 | 我们还没发
171 | 有,项目奖半个月,年终奖,惯例4月份发
172 | 一个月
173 | 有
174 | 妹有
175 | 一个月+1200过年费
176 | 应该是有2个月,
177 | 创业公司,刚开门几个月,没有
178 | 还不清楚,估计没有
179 | 没有
180 | 之前说是有2个月的,年后发
181 | 没有
182 | 还没发,不知道有没有
183 | 3个月
184 | 我们没有
185 | 没有
186 | 据说还有,还没发过[闭嘴]
187 | 还不知道呢。去年是春节放假前发的
188 | 啥都没得
189 | 实习生不说话😂😂
190 | 还没有发,但入职说的14薪
191 | 我们每年都4月底发,还不清楚有没有
192 | 年薪的20%就是年终奖,绩效不好还要扣
193 | 我们是年中奖,刚来半年,不知道年中有没有[微笑]
194 | 有,不过减半了,去年年终是多发两个月的工资,今年好像是只有一个月的
195 | 说有,不知道有没有
196 | 没有
197 | 以前是平均3到4个月工资,去年就没发了,今年估计也悬了
198 | 没有 年会都没了
199 | 没有。
200 | 2+绩效
201 | 没有
202 | 去年就把年终奖取消了,变为项目奖的路过
203 | 18 年第三季度发的 17 年年终奖[微笑]
204 | 发了,大概是四五个月工资了,因为工资低😂
205 | 有,拖到6月份发
206 | 还不知道
207 | 一个月工资,但是公司规定要第二个季度才发,差不多就是67月份
208 | 没有
209 | 老板承诺都有,但现在还没发,不知道会不会兑现
210 | 没有
211 | 年底双薪,年终两月
212 | 没有
213 | 没有,估计年会有红包
214 | 不知道多少个月
215 | 据说有😂
216 | 要不稍微分行业来个投票 清晰些
217 | 从业三年,几乎没感受到年终奖,以至于我都不记得我们有发过所谓的年终奖吗?
218 | 没有……
219 | 有,但不知道怎么发[捂脸]
220 | 有 2月12日
221 | 老大说年终奖年后发。
222 | 一个月
223 | 绩效到现在还不知道。。。
224 | 有,一个月工资
225 | 没有,据说年会都取消了
226 | 没有
227 | 新公司,啥都不确定[晕][晕][晕]
228 | 从来就没有
229 | 一直没有[撇嘴]
230 | 我们没有,昨天发工资,每个人都要延迟发放一部分,普遍30%,个别60%或70%。大家已经怨声载道了
231 | 有6个月[尴尬]
232 | 没有
233 | 应该是一个月工资,下周五年会抽奖保底 400,一等奖 10000 现金,如果我中了就可以每个月给张叔打赏了😏😏
234 | 2
235 | 老板说给我加工资,年终奖照发
236 | 没有
237 | 也没有
238 | 有,两个月
239 | 有,一个月工资
240 | 我们没有
241 | 有
242 | 还没发
243 | 没有
244 | 没有
245 | 这个真没有
246 | 没有
247 | 有4.8✖️1.5
248 | 从来没有
249 | 没有
250 | 没有,只有水果[流泪]购物卡也没
251 | 年后发……
252 | 年终奖没有,年会也没有了[捂脸]
253 | 没有了
254 | 说了有,还没发
255 | 从来没有
256 | 我这边小公司提都没提,估计凉了。
257 | 有,但是还不确定发多少
258 | 没有
259 | 没有
260 | 没有了
261 | 有,但是要打折了,具体还未知
262 | 没有过
263 | 要过年才知道
264 | 有,一个半月多一点,每个人不一样,这个看部门老大
265 | 没有
266 | 没有,而且要被裁了[流泪]
267 | 不知道有没有,每年都是8-9月分发,年“中”奖。如果发的话,固定一个月[再见]
268 | 我实习3个月,拿了三分之一月的
269 |
--------------------------------------------------------------------------------
/spiders/年终奖/nzj.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: nzj.py
12 | @time: 1/11/19 16:00
13 | @description:看看大家今年大家都有年终奖吗?
14 | """
15 |
16 | import json
17 | import jieba
18 | import matplotlib.pyplot as plt
19 | from wordcloud import WordCloud
20 |
21 | # 文件名称
22 | filename = 'comments.txt'
23 |
24 | # 总共的评论数目
25 | comment_count = 0
26 |
27 |
28 | def response(flow):
29 | request = flow.request
30 | response = flow.response
31 |
32 | global comment_count
33 |
34 | # 请求的地址
35 | request_url = request.url
36 |
37 | # 筛选
38 | if 'comments' in request_url and 'zsxq' in request_url:
39 | # 返回的内容
40 | response_content = response.content.decode('utf-8')
41 | print('请求地址:' + request_url)
42 | print('请求方法:' + str(request.method))
43 | print('参数:' + str(request.data))
44 |
45 | obj = json.loads(response_content)
46 |
47 | comments = obj['resp_data']['comments']
48 |
49 | # 最后一页
50 | if len(comments) == 0:
51 | print('一共有%d个球友发表了自己的看法' % comment_count)
52 |
53 | # 生成词云
54 | generate_word_cloud()
55 |
56 | else:
57 | comment_count += len(comments)
58 | for comment in comments:
59 | comment_content = comment['text']
60 | with open(filename, 'a') as f:
61 | f.write(comment_content + '\n')
62 |
63 |
64 | def generate_word_cloud():
65 | """
66 | 生成词云
67 | :return:
68 | """
69 | with open(filename, 'r') as f:
70 | word_content = f.read()
71 |
72 | # 使用jieba去分割
73 | wordlist = jieba.cut(word_content, cut_all=True)
74 |
75 | wl_space_split = " ".join(wordlist)
76 |
77 | font = r'/Users/xingag/Library/Fonts/SimHei.ttf'
78 |
79 | wordcloud = WordCloud(font_path=font, width=1080, height=1920, margin=2).generate(wl_space_split)
80 |
81 | # 显示图片
82 | plt.imshow(wordcloud)
83 | plt.axis("off")
84 |
85 | # 按照设置保存到本地文件夹
86 | wordcloud.to_file("./output.png")
87 |
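88 | # Note: this file is written as a mitmproxy addon -- response(flow) is the hook mitmproxy calls for
89 | # every captured response. A typical way to run it (assuming mitmproxy is installed and the phone's
90 | # proxy points at this machine) is:
91 | #   mitmdump -s nzj.py
92 | # The script then collects the zsxq comment responses into comments.txt and builds the word cloud
93 | # once an empty comments page marks the end.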
--------------------------------------------------------------------------------
/spiders/年终奖/output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/spiders/年终奖/output.png
--------------------------------------------------------------------------------
/verification code/注册【中知网】/AipOcr.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: AipOcr.py
12 | @time: 1/23/19 15:19
13 | @description:AipOcr是OCR的Python SDK客户端,为使用OCR的开发人员提供了一系列的交互方法。
14 | """
15 |
16 | from aip import AipOcr
17 |
18 | """ 你的 APPID AK SK """
19 | APP_ID = '15474**'
20 | API_KEY = 'VBoMZ6XUX119w***'
21 | SECRET_KEY = 'GPvqLVeGIMOR57***'
22 |
23 | client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
24 |
--------------------------------------------------------------------------------
/verification code/注册【中知网】/cnki_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: cnki_demo.py
12 | @time: 1/23/19 15:44
13 | @description:[中国知网注册]
14 | """
15 | from PIL import Image
16 | from selenium import webdriver
17 | from file_tools import *
18 | from AipOcr import *
19 | import requests
20 | import time
21 | import json
22 |
23 |
24 | class Cnki_Spider(object):
25 | driver_path = "/usr/local/bin/chromedriver"
26 |
27 | def __init__(self):
28 | self.driver = webdriver.Chrome(executable_path=Cnki_Spider.driver_path)
29 |
30 | # 包含验证码的页面的截图
31 | self.screen_shot_file_name = "screen_shot.png"
32 |
33 | # 验证码图片
34 | self.code_file_name = "image_code.png"
35 |
36 | # 注册主页面
37 | self.main_url = 'http://my.cnki.net/elibregister/commonRegister.aspx'
38 |
39 | # 待注册的内容
40 | # 昵称
41 | self.username = 'xingag2311'
42 | # 密码
43 | self.password = 'Hu9012782'
44 | # 邮箱地址
45 | self.email = '809900227@qq.com'
46 |
47 | def run(self):
48 | # 1.打开注册页面【包含验证码】
49 | self.driver.get(self.main_url)
50 |
51 | source = self.driver.page_source
52 |
53 | # 2.验证码图片、验证码输入框
54 | code_input_element = self.driver.find_element_by_id('txtOldCheckCode')
55 | code_img_element = self.driver.find_element_by_id('checkcode')
56 |
57 |
58 | # 外面容器
59 | container_element = self.driver.find_element_by_id('form1')
60 |
61 | # 3.获取验证码、填入输入框、点击外面
62 | # 如果没有出现出错的提示tips,就代表输入验证码成功
63 | while True:
64 |
65 | code = self.get_code().strip()
66 |
67 | error_tips_element = self.driver.find_element_by_id('span_oldcheckcode')
68 |
69 | print('验证码为:%s' % code)
70 | code_input_element.clear()
71 | code_input_element.click()
72 | code_input_element.send_keys(code)
73 |
74 | # 点击外围的容器,判断验证码是否输入正确
75 | container_element.click()
76 |
77 | # 显示了错误信息:验证码输入错误
78 | if error_tips_element.text:
79 | time.sleep(2)
80 | print('验证码验证失败,点击验证码图片')
81 |
82 | # 点击验证码图片,重新加载验证码
83 | code_img_element.click()
84 | continue
85 | else:
86 | print('验证码验证成功')
87 | break
88 |
89 | # 4.注册
90 | self.register(code)
91 |
92 | def get_code(self):
93 |
94 | # 1.截图并保存到本地
95 | self.driver.get_screenshot_as_file('./%s' % self.screen_shot_file_name)
96 |
97 | # 2.打开文件
98 | screenshot_image = Image.open('./%s' % self.screen_shot_file_name)
99 |
100 | # 3.设置要裁剪的区域(验证码所在的区域)
101 | code_box = (899, 819, 1048, 883)
102 |
103 | # 4.截图:生成只有验证码的图片
104 | code_image = screenshot_image.crop(code_box)
105 |
106 | # 5.保存到本地
107 | code_image.save("./%s" % self.code_file_name)
108 |
109 | # 6.以byte读取图片
110 | image = get_file_content("./%s" % self.code_file_name)
111 |
112 | # 7.使用百度OCR识别验证码
113 | result = client.basicAccurate(image)
114 |
115 | print(result)
116 |
117 | # 识别的文字内容
118 | word_result = result.get('words_result')[0].get('words')
119 |
120 | return word_result
121 |
122 | def register(self, code):
123 | # 用户名输入框
124 | username_input_element = self.driver.find_element_by_id('username')
125 |
126 | # 密码输入框
127 | password_input_element = self.driver.find_element_by_id('txtPassword')
128 |
129 | # 邮箱输入框
130 | txtEmail_input_element = self.driver.find_element_by_id('txtEmail')
131 |
132 | # 注册按钮
133 | submit_btn_element = self.driver.find_element_by_id('ButtonRegister')
134 |
135 | username_input_element.send_keys(self.username)
136 | password_input_element.send_keys(self.password)
137 | txtEmail_input_element.send_keys(self.email)
138 |
139 | submit_btn_element.click()
140 |
141 |
142 | if __name__ == '__main__':
143 | spider = Cnki_Spider()
144 | spider.run()
145 |
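146 | # Note: get_code() crops the verification code out of a full-page screenshot, so the code_box
147 | # coordinates (899, 819, 1048, 883) are tied to one particular screen resolution, window size and
148 | # scaling factor -- on another machine they will almost certainly need to be re-measured.
149 | # An alternative is to locate the element with id "checkcode" and crop around element.location / element.size.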
--------------------------------------------------------------------------------
/verification code/注册【中知网】/file_tools.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: file_tools.py
12 | @time: 1/23/19 15:41
13 | @description:TODO
14 | """
15 |
16 |
17 | def get_file_content(filePath):
18 | """
19 | 读取文件
20 | :param filePath: 文件路径
21 | :return: byte类型
22 | """
23 | with open(filePath, 'rb') as fp:
24 | return fp.read()
25 |
--------------------------------------------------------------------------------
/verification code/注册【中知网】/image_code.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/verification code/注册【中知网】/image_code.png
--------------------------------------------------------------------------------
/verification code/注册【中知网】/screen_shot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/verification code/注册【中知网】/screen_shot.png
--------------------------------------------------------------------------------
/微信聊天记录/utils/dbutils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: dbutils.py
12 | @time: 2020-04-11 16:57
13 | @description
14 | """
15 |
16 | import sqlite3
17 |
18 |
19 | class DUtil():
20 |
21 | def __init__(self, db_path="./weixin.db"):
22 | """
23 | 数据库初始化
24 | """
25 | self.db = sqlite3.connect(db_path)
26 | self.cursor = self.db.cursor()
27 |
28 | def execute(self, sql, param=None):
29 | """
30 | Sql语句,包含:增、删、改
31 | param:数据,可以为列表、字典,也可以为空
32 | """
33 | try:
34 | if param is None:
35 | self.cursor.execute(sql)
36 | else:
37 | if type(param) is list:
38 | self.cursor.executemany(sql, param)
39 | else:
40 | self.cursor.execute(sql, param)
41 | count = self.cursor.rowcount  # 本次语句影响的行数(total_changes 是整个连接的累计值,不适合判断单次执行)
42 | self.db.commit()
43 | except Exception as e:
44 | print(e)
45 | return False
46 |
47 | # 返回结果:本次执行有行受影响才算成功
48 | return count > 0
49 |
50 | def query(self, sql, param=None):
51 | """
52 | 查询语句
53 | sql:Sql语句
54 | param:参数,可以包含空
55 | retutn:成功返回True
56 | """
57 | if param is None:
58 | self.cursor.execute(sql)
59 | else:
60 | self.cursor.execute(sql, param)
61 | return self.cursor.fetchall()
62 |
63 | def close(self):
64 | """
65 | 数据库关闭
66 | """
67 | self.cursor.close()
68 | self.db.close()
69 |
70 |
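71 | # Minimal usage sketch (table and fields are invented for illustration):
72 | #   db = DUtil('./weixin.db')
73 | #   db.execute('CREATE TABLE IF NOT EXISTS msg (id INTEGER PRIMARY KEY, content TEXT)')
74 | #   db.execute('INSERT INTO msg (content) VALUES (?)', ('hello',))
75 | #   rows = db.query('SELECT content FROM msg WHERE id = ?', (1,))
76 | #   db.close()
77 | # execute() returns True only when the statement changed at least one row, so DDL such as
78 | # CREATE TABLE will report False even though it succeeded.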
--------------------------------------------------------------------------------
/微信聊天记录/utils/string_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: string_utils.py
12 | @time: 2020-04-11 18:39
13 | @description:TODO
14 | """
15 | import re
16 |
17 |
18 | def get_ava_string(text):
19 | """
20 | 去掉特殊符号,保留正常内容
21 | :param text: 原始字符串
22 | :return:
23 | """
24 | return re.sub(u"([^ \u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", text)
25 |
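26 | # Example (invented input): get_ava_string('哈哈😂!!hello-123')  ->  '哈哈hello123'
27 | # The character classes in the pattern keep spaces, CJK ideographs (\u4e00-\u9fa5), digits and
28 | # ASCII letters; everything else (emoji, punctuation) is stripped out.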
--------------------------------------------------------------------------------
/获取女友的位置/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/获取女友的位置/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/获取女友的位置/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/获取女友的位置/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/获取女友的位置/.idea/地理位置.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/获取女友的位置/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: main.py
12 | @time: 2019-08-23 16:23
13 | @description:高德坐标拾取网站:https://lbs.amap.com/console/show/picker
14 | """
15 |
16 | import os
17 | import exifread
18 | from decimal import Decimal
19 | from position_utils import *
20 | import requests
21 | import json
22 | import datetime
23 |
24 |
25 | # pip3 install exifread
26 |
27 |
28 | class Location(object):
29 |
30 | def __init__(self, image_path):
31 | self.img_path = image_path
32 |
33 | self.api_key = "你申请的AK"
34 |
35 | self.url_get_position = 'https://restapi.amap.com/v3/geocode/regeo?key={}&location={}'
36 |
37 | def run(self):
38 | coordinate = self.__get_image_ability()
39 |
40 | print(f'获取到经度、纬度是:{coordinate}')
41 |
42 | if not coordinate:
43 | return
44 |
45 | # 根据经度和纬度,获取到详细地址
46 | address = self.__get_address(coordinate)
47 |
48 | # 检验坐标值
49 | # https://lbs.amap.com/console/show/picker
50 | print(f'你女朋友当前位置在:{address}')
51 |
52 | def __get_address(self, location):
53 | """
54 | 根据坐标得到详细地址
55 | :param location: 经纬度值
56 | :return:
57 | """
58 | resp = requests.get(self.url_get_position.format(self.api_key, location))
59 |
60 | location_data = json.loads(resp.text)
61 |
62 | address = location_data.get('regeocode').get('formatted_address')
63 |
64 | return address
65 |
66 | def __format_lati_long_data(self, data):
67 | """
68 | 对经度和纬度数据做处理,保留6位小数
69 | :param data: 原始经度和纬度值
70 | :return:
71 | """
72 | # 删除左右括号和空格
73 | data_list_tmp = str(data).replace('[', '').replace(']', '').split(',')
74 | data_list = [data.strip() for data in data_list_tmp]
75 |
76 | # 替换秒的值
77 | data_tmp = data_list[-1].split('/')
78 |
79 | # 秒的值
80 | data_sec = int(data_tmp[0]) / int(data_tmp[1]) / 3600
81 |
82 | # 替换分的值
83 | data_tmp = data_list[-2]
84 |
85 | # 分的值
86 | data_minute = int(data_tmp) / 60
87 |
88 | # 度的值
89 | data_degree = int(data_list[0])
90 |
91 | # 由于高德API只能识别到小数点后的6位
92 | # 需要转换为浮点数,并保留为6位小数
93 | result = "%.6f" % (data_degree + data_minute + data_sec)
94 | return float(result)
95 |
96 | def __get_image_ability(self):
97 | """
98 | 获取图片的属性值,包含:经纬度、拍摄时间等
99 | :param picture_name:
100 | :return:
101 | """
102 |
103 | # 利用exifread库,读取图片的属性
104 | img_exif = exifread.process_file(open(self.img_path, 'rb'))
105 |
106 | # 能够读取到属性
107 | if img_exif:
108 | # 纬度数【用 get() 读取,缺少对应的 EXIF 标签时返回 None,而不是抛 KeyError】
109 | latitude_gps = img_exif.get('GPS GPSLatitude')
110 |
111 | # N,S 南北纬方向
112 | latitude_direction = img_exif.get('GPS GPSLatitudeRef')
113 |
114 | # 经度数
115 | longitude_gps = img_exif.get('GPS GPSLongitude')
116 |
117 | # E,W 东西经方向
118 | longitude_direction = img_exif.get('GPS GPSLongitudeRef')
119 |
120 | # 拍摄时间
121 | take_time = img_exif.get('EXIF DateTimeOriginal')
122 |
123 | is_lie = self.judge_time_met(take_time) if take_time else False
124 |
125 | if is_lie:
126 | print('很遗憾的通知你,你的女朋友在撒谎!!!')
127 | return
128 |
129 | # 纬度、经度、拍摄时间
130 | if latitude_gps and longitude_gps and take_time:
131 |
132 | # 对纬度、经度值原始值作进一步的处理
133 | latitude = self.__format_lati_long_data(latitude_gps)
134 | longitude = self.__format_lati_long_data(longitude_gps)
135 |
136 | # print(f'{longitude},{latitude}')
137 |
138 | # 注意:由于gps获取的坐标在国内高德等主流地图上逆编码不够精确,这里需要转换为火星坐标系
139 | location = wgs84togcj02(longitude, latitude)
140 |
141 | return f'{location[0]},{location[1]}'
142 | else:
143 | print(f'获取的图片数据属性不完整')
144 | return ''
145 | else:
146 | print('抱歉,图片不是原图,没法获取到图片属性。')
147 | return ''
148 |
149 | def judge_time_met(self, take_time):
150 | """
151 | 通过拍摄时间判断女朋友是否撒谎
152 | :param take_time:
153 | :return:
154 | """
155 | # 拍摄时间
156 | format_time = str(take_time).split(" ")[0].replace(":", "-")
157 |
158 | # 当天日期
159 | today = str(datetime.date.today())
160 |
161 | if format_time == today:
162 | return False
163 | else:
164 | return True
165 |
166 |
167 | if __name__ == '__main__':
168 | # 女朋友发过来的图片【原图】
169 | location = Location('./picture/11441566648796_.pic_hd.jpg')
170 |
171 | # 找到女朋友的地理位置
172 | location.run()
173 |
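174 | # Worked example for __format_lati_long_data() (values are made up): exifread reports GPS
175 | # coordinates as degree/minute/second ratios, e.g. "[30, 15, 3000/100]" means
176 | # 30° 15' 30.0"  ->  30 + 15/60 + (3000/100)/3600  =  30.258333, which is the decimal value
177 | # handed to the AMap regeo API (after the WGS-84 -> GCJ-02 conversion in position_utils).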
--------------------------------------------------------------------------------
/获取女友的位置/picture/11441566648796_.pic_hd.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/获取女友的位置/picture/11441566648796_.pic_hd.jpg
--------------------------------------------------------------------------------
/获取女友的位置/position_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: position_utils.py
12 | @time: 2019-08-23 17:44
13 | @description:坐标转换
14 | """
15 |
16 | # -*- coding: utf-8 -*-
17 | import json
18 | import math
19 |
20 | x_pi = 3.14159265358979324 * 3000.0 / 180.0
21 | pi = 3.1415926535897932384626 # π
22 | a = 6378245.0 # 长半轴
23 | ee = 0.00669342162296594323 # 第一偏心率的平方
24 |
25 |
26 | def wgs84togcj02(lng, lat):
27 | """
28 | WGS84转GCJ02(火星坐标系)
29 | :param lng:WGS84坐标系的经度
30 | :param lat:WGS84坐标系的纬度
31 | :return:
32 | """
33 | if out_of_china(lng, lat): # 判断是否在国内
34 | return lng, lat
35 | dlat = transformlat(lng - 105.0, lat - 35.0)
36 | dlng = transformlng(lng - 105.0, lat - 35.0)
37 | radlat = lat / 180.0 * pi
38 | magic = math.sin(radlat)
39 | magic = 1 - ee * magic * magic
40 | sqrtmagic = math.sqrt(magic)
41 | dlat = (dlat * 180.0) / ((a * (1 - ee)) / (magic * sqrtmagic) * pi)
42 | dlng = (dlng * 180.0) / (a / sqrtmagic * math.cos(radlat) * pi)
43 | mglat = lat + dlat
44 | mglng = lng + dlng
45 | return [mglng, mglat]
46 |
47 |
48 | def gcj02towgs84(lng, lat):
49 | """
50 | GCJ02(火星坐标系)转GPS84
51 | :param lng:火星坐标系的经度
52 | :param lat:火星坐标系纬度
53 | :return:
54 | """
55 | if out_of_china(lng, lat):
56 | return lng, lat
57 | dlat = transformlat(lng - 105.0, lat - 35.0)
58 | dlng = transformlng(lng - 105.0, lat - 35.0)
59 | radlat = lat / 180.0 * pi
60 | magic = math.sin(radlat)
61 | magic = 1 - ee * magic * magic
62 | sqrtmagic = math.sqrt(magic)
63 | dlat = (dlat * 180.0) / ((a * (1 - ee)) / (magic * sqrtmagic) * pi)
64 | dlng = (dlng * 180.0) / (a / sqrtmagic * math.cos(radlat) * pi)
65 | mglat = lat + dlat
66 | mglng = lng + dlng
67 | return [lng * 2 - mglng, lat * 2 - mglat]
68 |
69 |
70 | def transformlat(lng, lat):
71 | ret = -100.0 + 2.0 * lng + 3.0 * lat + 0.2 * lat * lat + \
72 | 0.1 * lng * lat + 0.2 * math.sqrt(math.fabs(lng))
73 | ret += (20.0 * math.sin(6.0 * lng * pi) + 20.0 *
74 | math.sin(2.0 * lng * pi)) * 2.0 / 3.0
75 | ret += (20.0 * math.sin(lat * pi) + 40.0 *
76 | math.sin(lat / 3.0 * pi)) * 2.0 / 3.0
77 | ret += (160.0 * math.sin(lat / 12.0 * pi) + 320 *
78 | math.sin(lat * pi / 30.0)) * 2.0 / 3.0
79 | return ret
80 |
81 |
82 | def transformlng(lng, lat):
83 | ret = 300.0 + lng + 2.0 * lat + 0.1 * lng * lng + \
84 | 0.1 * lng * lat + 0.1 * math.sqrt(math.fabs(lng))
85 | ret += (20.0 * math.sin(6.0 * lng * pi) + 20.0 *
86 | math.sin(2.0 * lng * pi)) * 2.0 / 3.0
87 | ret += (20.0 * math.sin(lng * pi) + 40.0 *
88 | math.sin(lng / 3.0 * pi)) * 2.0 / 3.0
89 | ret += (150.0 * math.sin(lng / 12.0 * pi) + 300.0 *
90 | math.sin(lng / 30.0 * pi)) * 2.0 / 3.0
91 | return ret
92 |
93 |
94 | def out_of_china(lng, lat):
95 | """
96 | 判断是否在国内,不在国内不做偏移
97 | :param lng:
98 | :param lat:
99 | :return:
100 | """
101 | if lng < 72.004 or lng > 137.8347:
102 | return True
103 | if lat < 0.8293 or lat > 55.8271:
104 | return True
105 | return False
106 |
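107 | # Quick usage sketch (coordinates are invented): convert a GPS (WGS-84) reading to the GCJ-02
108 | # values that AMap expects, and note that points outside China are returned unchanged:
109 | #   wgs84togcj02(116.397, 39.908)   # -> [lng + dlng, lat + dlat], roughly a few hundred metres of offset
110 | #   wgs84togcj02(2.349, 48.853)     # Paris: out_of_china() is True, so (2.349, 48.853) comes back as-is
111 | # gcj02towgs84() is the approximate inverse transform.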
--------------------------------------------------------------------------------