├── .gitignore
├── .idea
│   ├── misc.xml
│   ├── modules.xml
│   ├── spider_python.iml
│   ├── vcs.xml
│   └── workspace.xml
├── LICENSE
├── ModifyLocation
│   ├── gps_utils.py
│   ├── main.py
│   └── position_utils.py
├── Python调用JAR
│   ├── exec_jar_example.py
│   └── jar
│       ├── com
│       │   └── xingag
│       │       └── common
│       │           └── EncryHelper.class
│       └── encry.jar
├── README.md
├── feapder
│   └── tophub_demo
│       ├── .idea
│       │   ├── inspectionProfiles
│       │   │   └── Project_Default.xml
│       │   ├── misc.xml
│       │   ├── modules.xml
│       │   ├── tophub_demo.iml
│       │   └── workspace.xml
│       ├── items
│       │   ├── __init__.py
│       │   └── topic_item.py
│       ├── main.py
│       ├── setting.py
│       ├── spiders
│       │   ├── __init__.py
│       │   └── tophub_spider.py
│       └── test.py
├── js
│   └── jian_shu.js
├── pic
│   └── 最低气温排行榜.png
├── raw
│   └── qr.jpeg
├── scrapy
│   ├── douban_login
│   │   ├── .idea
│   │   │   ├── douban_login.iml
│   │   │   ├── misc.xml
│   │   │   ├── modules.xml
│   │   │   ├── vcs.xml
│   │   │   └── workspace.xml
│   │   ├── captcha.png
│   │   ├── douban_login
│   │   │   ├── __init__.py
│   │   │   ├── items.py
│   │   │   ├── middlewares.py
│   │   │   ├── pipelines.py
│   │   │   ├── settings.py
│   │   │   └── spiders
│   │   │       ├── __init__.py
│   │   │       └── douban.py
│   │   ├── readme.MD
│   │   ├── scrapy.cfg
│   │   └── start.py
│   ├── huize_spider
│   │   ├── .idea
│   │   │   ├── huize_spider.iml
│   │   │   ├── misc.xml
│   │   │   ├── modules.xml
│   │   │   └── workspace.xml
│   │   ├── .~ana.rtf
│   │   ├── ana.rtf
│   │   ├── datas.json
│   │   ├── huize_spider
│   │   │   ├── __init__.py
│   │   │   ├── items.py
│   │   │   ├── middlewares.py
│   │   │   ├── pipelines.py
│   │   │   ├── settings.py
│   │   │   └── spiders
│   │   │       ├── __init__.py
│   │   │       ├── huize.py
│   │   │       └── string_utils.py
│   │   ├── scrapy.cfg
│   │   └── start.py
│   ├── jianshu_spider
│   │   ├── .idea
│   │   │   ├── jianshu_spider.iml
│   │   │   ├── misc.xml
│   │   │   ├── modules.xml
│   │   │   ├── vcs.xml
│   │   │   └── workspace.xml
│   │   ├── jianshu_spider
│   │   │   ├── __init__.py
│   │   │   ├── items.py
│   │   │   ├── middlewares.py
│   │   │   ├── pipelines.py
│   │   │   ├── settings.py
│   │   │   └── spiders
│   │   │       ├── __init__.py
│   │   │       └── jianshu.py
│   │   ├── raw
│   │   │   ├── article.sql
│   │   │   └── article_table.png
│   │   └── scrapy.cfg
│   ├── qczj
│   │   ├── .idea
│   │   │   ├── misc.xml
│   │   │   ├── modules.xml
│   │   │   ├── qczj.iml
│   │   │   ├── vcs.xml
│   │   │   └── workspace.xml
│   │   ├── qczj
│   │   │   ├── __init__.py
│   │   │   ├── items.py
│   │   │   ├── middlewares.py
│   │   │   ├── pipelines.py
│   │   │   ├── settings.py
│   │   │   └── spiders
│   │   │       ├── __init__.py
│   │   │       └── bmw5.py
│   │   ├── readme.MD
│   │   ├── scrapy.cfg
│   │   └── start.py
│   ├── qsbk
│   │   ├── .idea
│   │   │   ├── misc.xml
│   │   │   ├── modules.xml
│   │   │   ├── qsbk.iml
│   │   │   ├── vcs.xml
│   │   │   └── workspace.xml
│   │   ├── duanzi.json
│   │   ├── qsbk
│   │   │   ├── __init__.py
│   │   │   ├── items.py
│   │   │   ├── middlewares.py
│   │   │   ├── pipelines.py
│   │   │   ├── settings.py
│   │   │   └── spiders
│   │   │       ├── __init__.py
│   │   │       └── spider_qsbk.py
│   │   ├── readme.MD
│   │   └── scrapy.cfg
│   ├── sfw_spider
│   │   ├── .idea
│   │   │   ├── misc.xml
│   │   │   ├── modules.xml
│   │   │   ├── sfw.iml
│   │   │   ├── vcs.xml
│   │   │   └── workspace.xml
│   │   ├── requirements.txt
│   │   ├── scrapy.cfg
│   │   ├── sfw
│   │   │   ├── __init__.py
│   │   │   ├── items.py
│   │   │   ├── middlewares.py
│   │   │   ├── pipelines.py
│   │   │   ├── settings.py
│   │   │   └── spiders
│   │   │       ├── __init__.py
│   │   │       └── sfw_spider.py
│   │   └── start.py
│   └── weixin_community
│       ├── .idea
│       │   ├── misc.xml
│       │   ├── modules.xml
│       │   ├── vcs.xml
│       │   ├── weixin_community.iml
│       │   └── workspace.xml
│       ├── readme.MD
│       ├── scrapy.cfg
│       └── weixin_community
│           ├── __init__.py
│           ├── items.py
│           ├── middlewares.py
│           ├── pipelines.py
│           ├── settings.py
│           └── spiders
│               ├── __init__.py
│               └── wx_spider.py
├── spiders
│   ├── film_xinpianchang
│   │   ├── Film.py
│   │   ├── models.py
│   │   ├── tools_file.py
│   │   └── tools_string.py
│   ├── spider_bai_si_bu_de_jie.py
│   ├── spider_boss.py
│   ├── spider_china_weather.py
│   ├── spider_dou_tu_la.py
│   ├── spider_dytt.py
│   ├── spider_gushiwen.py
│   ├── spider_lagou.py
│   ├── spider_qiu_shi_bai_ke.py
│   ├── spider_tencent_recruit.py
│   ├── 发表情
│   │   ├── auto_send_emoji.py
│   │   └── utils
│   │       ├── chat_utils.py
│   │       └── string_utils.py
│   └── 年终奖
│       ├── comments.txt
│       ├── nzj.py
│       └── output.png
├── verification code
│   └── 注册【中知网】
│       ├── AipOcr.py
│       ├── cnki_demo.py
│       ├── file_tools.py
│       ├── image_code.png
│       └── screen_shot.png
├── 微信聊天记录
│   ├── main.py
│   └── utils
│       ├── dbutils.py
│       └── string_utils.py
└── 获取女友的位置
    ├── .idea
    │   ├── inspectionProfiles
    │   │   └── Project_Default.xml
    │   ├── misc.xml
    │   ├── modules.xml
    │   ├── vcs.xml
    │   ├── workspace.xml
    │   └── 地理位置.iml
    ├── main.py
    ├── picture
    │   └── 11441566648796_.pic_hd.jpg
    └── position_utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | .DS_Store
104 |
105 |
106 | # mypy
107 | .mypy_cache/
108 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/spider_python.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/ModifyLocation/gps_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: gps_utils.py
12 | @time: 2019-11-17 10:34
13 | @description:TODO
14 | """
15 |
16 | import math
17 |
18 |
19 | def gps_to_dms(gps_data):
 20 | """
 21 | Convert a decimal GPS coordinate (float) into (degree, minute, second) rational pairs.
 22 | e.g. 116.397451
 23 | :param gps_data:
 24 | :return:
 25 | """
 26 | # Degrees: floor of the value
 27 | gps_degree = math.floor(gps_data)
 28 |
 29 | gps_data_temp1 = (gps_data - gps_degree) * 60
 30 |
 31 | # Minutes
 32 | gps_minute = math.floor(gps_data_temp1)
 33 |
 34 | gps_data_temp2 = gps_data_temp1 - gps_minute
 35 |
 36 | # Seconds, rounded to 2 decimal places
 37 | gps_second = round(gps_data_temp2 * 60, 2)
 38 |
 39 | # Note: the seconds value must be converted to an integer, stored as the rational (seconds * 100, 100)
 40 | result = ((gps_degree, 1), (gps_minute, 1), (int(gps_second * 100), 100))
41 |
42 | return result
43 |
44 |
45 | def dms_to_gps(dms_data):
46 | """
 47 | Convert (degree, minute, second) rational pairs back to a decimal coordinate (float)
48 | :param dms_data:
49 | :return:
50 | """
51 | data1 = dms_data[0][0] / dms_data[0][1]
52 |
53 | data2 = dms_data[1][0] / dms_data[1][1] / 60
54 |
55 | data3 = dms_data[2][0] / dms_data[2][1] / 3600
56 |
57 | result = round(data1 + data2 + data3,6)
58 |
59 | return result
60 |
--------------------------------------------------------------------------------
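
A quick usage sketch for the two helpers above (not part of the repository); the example coordinate and the printed values are illustrative:

```python
# Round-trip check for gps_to_dms / dms_to_gps (assumes gps_utils.py is importable).
from gps_utils import gps_to_dms, dms_to_gps

lng = 116.397451
dms = gps_to_dms(lng)       # ((116, 1), (23, 1), (5082, 100)) -> 116° 23' 50.82"
print(dms)

restored = dms_to_gps(dms)  # back to a decimal value, rounded to 6 places
print(restored)             # 116.39745 -- small loss from rounding seconds to 2 decimals
```
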
/ModifyLocation/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: main.py
12 | @time: 2019-11-16 10:12
 13 | @description: modify the GPS location stored in an image's EXIF data
14 | """
15 |
16 | import requests
17 | import time
18 | from PIL import Image
19 | import piexif
20 | import json
21 | from gps_utils import *
22 | from position_utils import *
23 |
24 |
25 | # 依赖:pip3 install piexif
26 |
27 | class Exif():
28 | def __init__(self):
29 | self.time = '2019:11:17 14:13:22'
30 |
31 | # 地理编码(地址转为经纬度)
32 | self.url_geo = 'https://restapi.amap.com/v3/geocode/geo'
33 |
34 | # 逆地理编码(经纬度转为地址)
35 | self.url_regeo = 'https://restapi.amap.com/v3/geocode/regeo?parameters'
36 |
37 | # key
38 | self.ak = '你的ak'
39 |
40 | # 数字签名
41 | self.sign = '你的sign'
42 |
43 | def read_image(self, image_path):
44 | """
 45 | Read the image and extract its GPS position
 46 | piexif: reads the image's EXIF attributes
47 | :return:
48 | """
49 | exif_dict = piexif.load(image_path)
50 |
51 | if exif_dict['GPS']:
52 |
53 | # 纬度
54 | gps_lati_pre = exif_dict['GPS'][2]
55 |
56 | gps_lati = dms_to_gps(gps_lati_pre)
57 |
58 | # 经度
59 | gps_long_pre = exif_dict['GPS'][4]
60 | gps_long = dms_to_gps(gps_long_pre)
61 |
62 | # GPS坐标转为高德坐标
63 | lng, lat = wgs84togcj02(gps_long, gps_lati)
64 |
65 | # print(lng, lat)
66 |
67 | print(f"原图地理位置如下\n经度:{lng}\n纬度:{lat}\n")
68 |
69 | return f'{lng}, {lat}'
70 | else:
71 | print(f'抱歉!这张图片不包含地理位置!')
72 |
73 | def current_time(self):
74 | """
75 | 获取当前时间
76 | :return:
77 | """
78 | time_now = time.strftime('%Y:%m:%d %H:%M:%S', time.localtime(time.time()))
79 |
80 | result = bytes(time_now, encoding='utf-8')
81 |
82 | return result
83 |
84 | def str_to_bytes(self, str_content):
85 | """
86 | 字符串转bytes
87 | :return:
88 | """
89 | return bytes(str_content, encoding='utf-8')
90 |
91 | def is_image(self, filename):
92 | """
93 | 判断文件是否是一张图片
94 | :param filename:
95 | :return:
96 | """
97 | file_suffix = filename.split('.')[-1]
98 |
99 | if file_suffix == 'jpg' or file_suffix == 'png':
100 | return True
101 | else:
102 | return False
103 |
104 | def write_image(self, image_path, gps_long, gps_lati):
105 | """
 106 | Write new GPS coordinates into a single image's EXIF data
 107 | :param image_path: path of the image file
108 | :return:
109 | """
110 | # 读取图片
111 | img = Image.open(image_path)
112 |
113 | try:
114 | exif_dict = piexif.load(img.info['exif'])
 115 | except Exception:
116 | print('加载文件地理位置异常!')
117 | return
118 |
119 | # 修改地理位置
120 | # GPS GPSLatitudeRef:N
121 | # GPS GPSLatitude:[22, 32, 189/20]
122 | # GPS GPSLongitudeRef:E
123 | # GPS GPSLongitude:[114, 1, 689/20]
124 | exif_dict['GPS'][2] = gps_to_dms(gps_lati)
125 | exif_dict['GPS'][4] = gps_to_dms(gps_long)
126 |
127 | exif_bytes = piexif.dump(exif_dict)
128 |
129 | # 写入到新的图片中去
130 | img.save(image_path, 'jpeg', exif=exif_bytes)
131 |
132 | def get_address_by_location(self, location):
133 | """
134 | 通过经纬度拿到地理位置
135 | :param location:
136 | :return:
137 | """
138 | params = {
139 | 'key': self.ak,
140 | 'location': location,
141 | 'sig': self.sign
142 | }
143 |
144 | resp = json.loads(requests.get(url=self.url_regeo, params=params).text)
145 |
146 | if resp and resp.get('regeocode') and resp.get('regeocode').get('formatted_address'):
147 | address = resp.get('regeocode').get('formatted_address')
148 | print(f'原图的拍摄地址为:{address}\n')
149 | else:
150 | print('api解析地址出错,请检查ak!\n')
151 |
152 | def get_location_by_address(self, city, address):
153 | """
154 | 通过地理位置到拿到经纬度
155 | 地理编码:https://lbs.amap.com/api/webservice/guide/api/georegeo/
156 | :param address:
157 | :return:
158 | """
159 | params = {
160 | 'key': self.ak,
161 | 'city': city,
162 | 'address': address,
163 | 'sig': self.sign
164 | }
165 |
166 | resp = json.loads(requests.get(url=self.url_geo, params=params).text)
167 |
168 | # 获取坐标地址
169 | if resp and len(resp.get('geocodes')) >= 1 and resp.get('geocodes')[0].get('location'):
170 | location = resp.get('geocodes')[0].get('location')
171 | gps_data = location.split(',')
172 |
173 | # 得到经度和纬度
174 | gps_long = float(gps_data[0])
175 | gps_lati = float(gps_data[1])
176 |
177 | return gps_long, gps_lati
178 | else:
179 | print('api解析地址出错,请检查ak!')
180 | return None
181 |
182 |
183 | if __name__ == '__main__':
184 | exif = Exif()
185 |
186 | image_path = './WechatIMG1439.jpeg'
187 |
188 | # 1、读取原图的属性
189 | location = exif.read_image(image_path)
190 |
191 | if location:
192 | # 2、原图的详细地址
193 | exif.get_address_by_location(location)
194 |
195 | # 3、输入地址(市+目的地,例如:深圳莲花山公园)
196 | city = input('请输入定位城市(例如:深圳):')
197 | address = input('请输入具体的定位地址(例如:莲花山公园):')
198 |
199 | if address:
200 | # 通过地址拿到坐标地址
201 | location = exif.get_location_by_address(city, address)
202 |
203 | if location:
204 | # 4、修改图片属性,写入经度和纬度
205 | exif.write_image(image_path, location[0], location[1])
206 | print('修改图片地理成功!')
207 | else:
208 | print('请先输入具体地址!')
209 |
--------------------------------------------------------------------------------
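
One detail worth noting about `write_image()` above: it rewrites only the `GPSLatitude`/`GPSLongitude` tags (keys 2 and 4) and leaves the hemisphere reference tags untouched, which is fine for the N/E quadrant the demo targets. A minimal sketch (an assumption, not the author's code) that also sets the reference tags via piexif's named constants:

```python
# Sketch: rewrite EXIF GPS tags with piexif, including the hemisphere references.
# write_gps() is a hypothetical helper; gps_to_dms() comes from gps_utils.py above.
import piexif
from gps_utils import gps_to_dms


def write_gps(image_path, lng, lat):
    exif_dict = piexif.load(image_path)
    exif_dict['GPS'][piexif.GPSIFD.GPSLatitudeRef] = b'N' if lat >= 0 else b'S'
    exif_dict['GPS'][piexif.GPSIFD.GPSLatitude] = gps_to_dms(abs(lat))
    exif_dict['GPS'][piexif.GPSIFD.GPSLongitudeRef] = b'E' if lng >= 0 else b'W'
    exif_dict['GPS'][piexif.GPSIFD.GPSLongitude] = gps_to_dms(abs(lng))
    # piexif.insert() writes the new EXIF block back into the JPEG in place.
    piexif.insert(piexif.dump(exif_dict), image_path)
```
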
/ModifyLocation/position_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: position_utils.py
12 | @time: 2019-08-23 17:44
 13 | @description: coordinate system conversion
14 | """
15 |
16 | # -*- coding: utf-8 -*-
17 | import math
18 |
19 | x_pi = 3.14159265358979324 * 3000.0 / 180.0
20 | pi = 3.1415926535897932384626 # π
 21 | a = 6378245.0 # semi-major axis (Krasovsky ellipsoid)
 22 | ee = 0.00669342162296594323 # square of the first eccentricity
23 |
24 |
25 | def wgs84togcj02(lng, lat):
26 | """
 27 | Convert WGS-84 to GCJ-02 (the "Mars" coordinate system used by Chinese map providers)
 28 | :param lng: longitude in WGS-84
 29 | :param lat: latitude in WGS-84
30 | :return:
31 | """
 32 | if out_of_china(lng, lat): # no offset is applied outside mainland China
33 | return lng, lat
34 | dlat = transformlat(lng - 105.0, lat - 35.0)
35 | dlng = transformlng(lng - 105.0, lat - 35.0)
36 | radlat = lat / 180.0 * pi
37 | magic = math.sin(radlat)
38 | magic = 1 - ee * magic * magic
39 | sqrtmagic = math.sqrt(magic)
40 | dlat = (dlat * 180.0) / ((a * (1 - ee)) / (magic * sqrtmagic) * pi)
41 | dlng = (dlng * 180.0) / (a / sqrtmagic * math.cos(radlat) * pi)
42 | mglat = lat + dlat
43 | mglng = lng + dlng
44 | return [mglng, mglat]
45 |
46 |
47 | def gcj02towgs84(lng, lat):
48 | """
 49 | Convert GCJ-02 (Mars coordinates) back to WGS-84
 50 | :param lng: longitude in GCJ-02
 51 | :param lat: latitude in GCJ-02
52 | :return:
53 | """
54 | if out_of_china(lng, lat):
55 | return lng, lat
56 | dlat = transformlat(lng - 105.0, lat - 35.0)
57 | dlng = transformlng(lng - 105.0, lat - 35.0)
58 | radlat = lat / 180.0 * pi
59 | magic = math.sin(radlat)
60 | magic = 1 - ee * magic * magic
61 | sqrtmagic = math.sqrt(magic)
62 | dlat = (dlat * 180.0) / ((a * (1 - ee)) / (magic * sqrtmagic) * pi)
63 | dlng = (dlng * 180.0) / (a / sqrtmagic * math.cos(radlat) * pi)
64 | mglat = lat + dlat
65 | mglng = lng + dlng
66 | return [lng * 2 - mglng, lat * 2 - mglat]
67 |
68 |
69 | def transformlat(lng, lat):
70 | ret = -100.0 + 2.0 * lng + 3.0 * lat + 0.2 * lat * lat + \
71 | 0.1 * lng * lat + 0.2 * math.sqrt(math.fabs(lng))
72 | ret += (20.0 * math.sin(6.0 * lng * pi) + 20.0 *
73 | math.sin(2.0 * lng * pi)) * 2.0 / 3.0
74 | ret += (20.0 * math.sin(lat * pi) + 40.0 *
75 | math.sin(lat / 3.0 * pi)) * 2.0 / 3.0
76 | ret += (160.0 * math.sin(lat / 12.0 * pi) + 320 *
77 | math.sin(lat * pi / 30.0)) * 2.0 / 3.0
78 | return ret
79 |
80 |
81 | def transformlng(lng, lat):
82 | ret = 300.0 + lng + 2.0 * lat + 0.1 * lng * lng + \
83 | 0.1 * lng * lat + 0.1 * math.sqrt(math.fabs(lng))
84 | ret += (20.0 * math.sin(6.0 * lng * pi) + 20.0 *
85 | math.sin(2.0 * lng * pi)) * 2.0 / 3.0
86 | ret += (20.0 * math.sin(lng * pi) + 40.0 *
87 | math.sin(lng / 3.0 * pi)) * 2.0 / 3.0
88 | ret += (150.0 * math.sin(lng / 12.0 * pi) + 300.0 *
89 | math.sin(lng / 30.0 * pi)) * 2.0 / 3.0
90 | return ret
91 |
92 |
93 | def out_of_china(lng, lat):
94 | """
 95 | Check whether the coordinate lies outside mainland China; if so, no offset is applied
96 | :param lng:
97 | :param lat:
98 | :return:
99 | """
100 | if lng < 72.004 or lng > 137.8347:
101 | return True
102 | if lat < 0.8293 or lat > 55.8271:
103 | return True
104 | return False
105 |
--------------------------------------------------------------------------------
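
A short usage sketch for the converters above (illustrative coordinates in Beijing, not taken from the repository):

```python
# WGS-84 -> GCJ-02 and back; the inverse is approximate (metre-level error).
from position_utils import wgs84togcj02, gcj02towgs84, out_of_china

lng, lat = 116.397451, 39.909187            # WGS-84 (raw GPS)
print(out_of_china(lng, lat))               # False -> the offset will be applied
mglng, mglat = wgs84togcj02(lng, lat)       # GCJ-02, what AMap/Tencent maps expect
back_lng, back_lat = gcj02towgs84(mglng, mglat)
print(abs(back_lng - lng) < 1e-4, abs(back_lat - lat) < 1e-4)   # True True
```
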
/Python调用JAR/exec_jar_example.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: exec_jar_example.py
12 | @time: 2021-01-02 12:30
13 | @description:TODO
14 | """
15 |
16 | import jpype
17 | import os
18 |
19 | # 初始化
20 | jar_path = os.path.join(os.path.abspath('.'), 'jar/encry.jar')
21 |
22 | print(jar_path)
23 |
24 | # 启动jvm
25 | jpype.startJVM(jpype.getDefaultJVMPath(), "-ea", "-Djava.class.path=%s" % (jar_path))
26 |
27 |
28 | # 通过包名,实例化JAVA对象
29 | EncryClass = jpype.JClass("com.xingag.common.EncryHelper")
30 | encryClass = EncryClass()
31 |
32 | # 调用JAVA中的加密方法
33 | content_encry = encryClass.encrypt("xag")
34 | print(content_encry)
35 |
36 | # 关闭jvm
37 | jpype.shutdownJVM()
38 |
--------------------------------------------------------------------------------
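
A slightly more defensive variant of the same call sequence (a sketch, not part of the repo): guard against double-starting the JVM and always shut it down, which matters when the snippet runs repeatedly inside one Python process.

```python
import os
import jpype

jar_path = os.path.join(os.path.abspath('.'), 'jar/encry.jar')

# Start the JVM only if it is not already running.
if not jpype.isJVMStarted():
    jpype.startJVM(jpype.getDefaultJVMPath(), "-ea", "-Djava.class.path=%s" % jar_path)

try:
    EncryHelper = jpype.JClass("com.xingag.common.EncryHelper")
    helper = EncryHelper()
    print(helper.encrypt("xag"))
finally:
    # Always release the JVM, even if the Java call raises.
    jpype.shutdownJVM()
```
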
/Python调用JAR/jar/com/xingag/common/EncryHelper.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/Python调用JAR/jar/com/xingag/common/EncryHelper.class
--------------------------------------------------------------------------------
/Python调用JAR/jar/encry.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/Python调用JAR/jar/encry.jar
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # spider_python
 2 |
 3 | ## Preface
 4 |
 5 | For detailed tutorials, follow the WeChat official account: **AirPython**
 6 |
 7 |
 8 |
 9 |
 10 |
 11 | ## Basic crawlers
 12 |
 13 | * [Scrape the latest movie data from Dianying Tiantang (Movie Heaven) - xpath](./spiders/spider_dytt.py)
 14 |
 15 | * [Scrape job data from Tencent Recruitment - xpath](./spiders/spider_tencent_recruit.py)
 16 |
 17 | * [Scrape nationwide weather from the China Weather website and generate a pie chart - bs4](./spiders/spider_china_weather.py)
 18 |
 19 | * [Scrape data from the Gushiwen (classical poetry) website - re](./spiders/spider_gushiwen.py)
 20 |
 21 | * [Scrape jokes from Qiushibaike - re](./spiders/spider_qiu_shi_bai_ke.py)
 22 |
 23 |
 24 |
 25 | ## Multi-threaded crawlers
 26 |
 27 | * [Scrape meme images from Doutula with multiple threads and download them locally - xpath + threading](./spiders/spider_dou_tu_la.py)
 28 | * [Send memes to specific contacts and WeChat groups with itchat](./spiders/发表情/)
 29 | * [Scrape text and images from Baisibudejie with multiple threads and write them to a CSV file](./spiders/spider_bai_si_bu_de_jie.py)
 30 |
 31 |
 32 |
 33 | ## Selenium automation crawlers
 34 |
 35 | * [Scrape job listings from Lagou - selenium + requests + lxml](./spiders/spider_lagou.py)
 36 |
 37 | * [Scrape job listings from Boss Zhipin - selenium + lxml](./spiders/spider_boss.py)
 38 |
 39 |
 40 |
 41 | ## Scrapy framework crawlers
 42 | * [Scrape Qiushibaike jokes and save them to a JSON file](./scrapy/qsbk/readme.MD)
 43 | * [Scrape data from the WeChat Mini Program forum](./scrapy/weixin_community/readme.MD)
 44 | * [Log in to Douban and change the personal signature](./scrapy/douban_login/readme.MD)
 45 | * [Download high-resolution images from Autohome](./scrapy/qczj/readme.MD)
 46 | * [Scrape all article data from Jianshu](./scrapy/jianshu_spider/)
 47 | * [Scrape all housing data from Fang.com, including new and second-hand homes](./scrapy/sfw_spider)
 48 |
 49 |
 50 |
 51 |
 52 |
 53 | ## feapder
 54 |
 55 | * [feapder AirSpider example](./feapder/tophub_demo)
 56 |
 57 |
 58 |
 59 | ## Node.js crawler
 60 |
 61 | * [Scrape Jianshu articles with puppeteer and save them locally](./js/jian_shu.js)
 62 |
 63 |
 64 |
 65 | ## Others
 66 |
 67 | * [Use Python to locate your girlfriend's position](./获取女友的位置)
 68 | * [My girlfriend secretly hid her tracks with Python (rewriting photo GPS data)](./ModifyLocation)
 69 | * [WeChat group chat records](./微信聊天记录)
 70 | * [Call a JAR from Python](./Python调用JAR)
 71 |
 72 |
--------------------------------------------------------------------------------
/feapder/tophub_demo/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
47 |
48 |
49 |
54 |
55 |
56 |
--------------------------------------------------------------------------------
/feapder/tophub_demo/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/feapder/tophub_demo/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/feapder/tophub_demo/.idea/tophub_demo.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/feapder/tophub_demo/items/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = [
2 | "topic_item"
3 | ]
--------------------------------------------------------------------------------
/feapder/tophub_demo/items/topic_item.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on 2021-04-08 12:20:22
4 | ---------
5 | @summary:
6 | ---------
7 | @author: xingag
8 | """
9 |
10 | from feapder import Item
11 |
12 |
13 | class TopicItem(Item):
14 | """
15 | This class was generated by feapder.
16 | command: feapder create -i topic.
17 | """
18 |
19 | def __init__(self, *args, **kwargs):
 20 | # self.id = None
 21 | self.title = None # article title
 22 | self.auth = None # author
 23 | self.like_count = 0 # number of likes
 24 | self.collection = 0 # number of favorites
 25 | self.comment = 0 # number of comments
26 |
--------------------------------------------------------------------------------
/feapder/tophub_demo/main.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on 2021-04-08 11:57:08
4 | ---------
 5 | @summary: spider entry point
6 | ---------
7 | @author: xingag
8 | """
9 |
10 | from feapder import ArgumentParser
11 |
12 | from spiders import *
13 |
14 |
15 | def crawl_xxx():
16 | """
17 | 普通爬虫
18 | """
19 | spider = xxx.XXXSpider(redis_key="xxx:xxx")
20 | spider.start()
21 |
22 |
23 | def crawl_xxx(args):
24 | """
25 | 批次爬虫
26 | @param args: 1 / 2 / init
27 | """
28 | spider = xxx_spider.XXXSpider(
29 | task_table="", # mysql中的任务表
30 | batch_record_table="", # mysql中的批次记录表
31 | batch_name="xxx(周全)", # 批次名字
32 | batch_interval=7, # 批次时间 天为单位 若为小时 可写 1 / 24
33 | task_keys=["id", "xxx"], # 需要获取任务表里的字段名,可添加多个
34 | redis_key="xxx:xxxx", # redis中存放request等信息的根key
35 | task_state="state", # mysql中任务状态字段
36 | )
37 |
38 | if args == 1:
39 | spider.start_monitor_task()
40 | elif args == 2:
41 | spider.start()
42 | elif args == "init":
43 | spider.init_task()
44 |
45 |
46 | if __name__ == "__main__":
47 | parser = ArgumentParser(description="xxx爬虫")
48 |
49 | parser.add_argument(
50 | "--crawl_xxx", action="store_true", help="xxx", function=crawl_xxx
51 | )
52 | parser.add_argument(
53 | "--crawl_xxx", type=int, nargs=1, help="xxx(1|2)", function=crawl_xxx
54 | )
55 |
56 | parser.start()
57 |
--------------------------------------------------------------------------------
/feapder/tophub_demo/setting.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
 2 | """Spider configuration file"""
3 | import os
4 |
5 |
6 | # MYSQL
7 | MYSQL_IP = "localhost"
8 | MYSQL_PORT = 3306
9 | MYSQL_DB = "xag"
10 | MYSQL_USER_NAME = "root"
11 | MYSQL_USER_PASS = "root"
12 |
13 | # REDIS
14 | # IP:PORT
15 | REDISDB_IP_PORTS = "xxx:6379"
16 | REDISDB_USER_PASS = ""
17 | # 默认 0 到 15 共16个数据库
18 | REDISDB_DB = 0
19 | # # 适用于redis哨兵模式
20 | # REDISDB_SERVICE_NAME = None
21 | #
22 | # # 数据入库的pipeline,可自定义,默认MysqlPipeline
23 | # ITEM_PIPELINES = ["feapder.pipelines.mysql_pipeline.MysqlPipeline"]
24 | #
25 | # # 爬虫相关
26 | # # COLLECTOR
27 | # COLLECTOR_SLEEP_TIME = 1 # 从任务队列中获取任务到内存队列的间隔
28 | # COLLECTOR_TASK_COUNT = 100 # 每次获取任务数量
29 | #
30 | # # SPIDER
31 | # SPIDER_THREAD_COUNT = 100 # 爬虫并发数
32 | # SPIDER_SLEEP_TIME = 0 # 下载时间间隔(解析完一个response后休眠时间)
33 | # SPIDER_MAX_RETRY_TIMES = 100 # 每个请求最大重试次数
34 | # WARNING_FAILED_COUNT = 1000 # 任务失败数 超过WARNING_FAILED_COUNT则报警
35 | #
36 | # # 浏览器渲染下载
37 | # WEBDRIVER = dict(
38 | # pool_size=2, # 浏览器的数量
39 | # load_images=False, # 是否加载图片
40 | # user_agent=None, # 字符串 或 无参函数,返回值为user_agent
41 | # proxy=None, # xxx.xxx.xxx.xxx:xxxx 或 无参函数,返回值为代理地址
42 | # headless=False, # 是否为无头浏览器
43 | # driver_type="CHROME", # CHROME 或 PHANTOMJS,
44 | # timeout=30, # 请求超时时间
45 | # window_size=(1024, 800), # 窗口大小
46 | # executable_path=None, # 浏览器路径,默认为默认路径
47 | # )
48 | #
49 | # # 重新尝试失败的requests 当requests重试次数超过允许的最大重试次数算失败
50 | # RETRY_FAILED_REQUESTS = False
51 | # # request 超时时间,超过这个时间重新做(不是网络请求的超时时间)单位秒
52 | # REQUEST_TIME_OUT = 600 # 10分钟
53 | # # 保存失败的request
54 | # SAVE_FAILED_REQUEST = True
55 | #
56 | # # 下载缓存 利用redis缓存,由于内存小,所以仅供测试时使用
57 | # RESPONSE_CACHED_ENABLE = False # 是否启用下载缓存 成本高的数据或容易变需求的数据,建议设置为True
58 | # RESPONSE_CACHED_EXPIRE_TIME = 3600 # 缓存时间 秒
59 | # RESPONSE_CACHED_USED = False # 是否使用缓存 补采数据时可设置为True
60 | #
61 | # # 爬虫是否自动结束,若为False,则会等待新任务下发,进程不退出
62 | # AUTO_STOP_WHEN_SPIDER_DONE = True
63 | #
64 | # # 设置代理
65 | # PROXY_EXTRACT_API = None # 代理提取API ,返回的代理分割符为\r\n
66 | # PROXY_ENABLE = True
67 | #
68 | # # 随机headers
69 | # RANDOM_HEADERS = True
70 | # # requests 使用session
71 | # USE_SESSION = False
72 | #
73 | # # 去重
74 | # ITEM_FILTER_ENABLE = False # item 去重
75 | # REQUEST_FILTER_ENABLE = False # request 去重
76 | #
77 | # # 报警 支持钉钉及邮件,二选一即可
78 | # # 钉钉报警
79 | # DINGDING_WARNING_URL = "" # 钉钉机器人api
80 | # DINGDING_WARNING_PHONE = "" # 报警人 支持列表,可指定多个
81 | # # 邮件报警
82 | # EAMIL_SENDER = "" # 发件人
83 | # EAMIL_PASSWORD = "" # 授权码
84 | # EMAIL_RECEIVER = "" # 收件人 支持列表,可指定多个
85 | # # 报警时间间隔及级别
86 | # WARNING_INTERVAL = 3600 # 相同报警的报警时间间隔,防止刷屏
87 | # WARNING_LEVEL = "DEBUG" # 报警级别, DEBUG / ERROR
88 | #
89 | # LOG_NAME = os.path.basename(os.getcwd())
90 | # LOG_PATH = "log/%s.log" % LOG_NAME # log存储路径
91 | # LOG_LEVEL = "DEBUG"
92 | # LOG_IS_WRITE_TO_FILE = False
93 |
--------------------------------------------------------------------------------
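
The settings above point the demo at a local `xag` database, but the repository does not ship a DDL for the `topic` table that `tophub_spider.py` writes to. A possible schema matching the `TopicItem` fields (the column types are an assumption):

```python
# Sketch: create a `topic` table compatible with TopicItem, using the MySQL settings above.
import pymysql

DDL = """
CREATE TABLE IF NOT EXISTS topic (
    id INT AUTO_INCREMENT PRIMARY KEY,
    title VARCHAR(255),
    auth VARCHAR(255),
    like_count INT DEFAULT 0,
    collection INT DEFAULT 0,
    comment INT DEFAULT 0
) DEFAULT CHARSET = utf8mb4;
"""

conn = pymysql.connect(host="localhost", port=3306, user="root", password="root", database="xag")
try:
    with conn.cursor() as cursor:
        cursor.execute(DDL)
    conn.commit()
finally:
    conn.close()
```
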
/feapder/tophub_demo/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = [
2 | "tophub_spider"
3 | ]
--------------------------------------------------------------------------------
/feapder/tophub_demo/spiders/tophub_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on 2021-04-08 12:03:28
4 | ---------
5 | @summary:
6 | ---------
7 | @author: xingag
8 | """
9 |
10 | import re
11 |
12 | import feapder
13 | from fake_useragent import UserAgent
14 | from feapder.db.mysqldb import MysqlDB
15 |
16 |
17 | # 爬取数据并入库
18 |
19 | class TophubSpider(feapder.AirSpider):
20 |
21 | def __init__(self, *args, **kwargs):
22 | super().__init__(*args, **kwargs)
23 | self.db = MysqlDB()
24 |
25 | def start_requests(self):
26 | yield feapder.Request("https://tophub.today/", download_midware=self.download_midware)
27 |
28 | def parse(self, request, response):
29 | # print(response.text)
30 | card_elements = response.xpath('//div[@class="cc-cd"]')
31 |
32 | # 过滤出对应的卡片元素【什么值得买】
33 | buy_good_element = [card_element for card_element in card_elements if
34 | card_element.xpath('.//div[@class="cc-cd-is"]//span/text()').extract_first() == '什么值得买'][0]
35 |
36 | # 获取内部文章标题及地址
37 | a_elements = buy_good_element.xpath('.//div[@class="cc-cd-cb nano"]//a')
38 |
39 | for a_element in a_elements:
40 | # 标题和链接
41 | title = a_element.xpath('.//span[@class="t"]/text()').extract_first()
42 | href = a_element.xpath('.//@href').extract_first()
43 |
44 | # 再次下发新任务,并带上文章标题
45 | yield feapder.Request(href, download_midware=self.download_midware, callback=self.parser_detail_page,
46 | title=title)
47 |
48 | def parser_detail_page(self, request, response):
49 | """
50 | 解析文章详情数据
51 | :param request:
52 | :param response:
53 | :return:
54 | """
55 | title = request.title
56 |
57 | url = request.url
58 |
59 | # 解析文章详情页面,获取点赞、收藏、评论数目及作者名称
60 | author = response.xpath('//a[@class="author-title"]/text()').extract_first().strip()
61 |
62 | print("作者:", author, '文章标题:', title, "地址:", url)
63 |
64 | desc_elements = response.xpath('//span[@class="xilie"]/span')
65 |
66 | print("desc数目:", len(desc_elements))
67 |
 68 | # Likes
 69 | like_count = int(re.findall(r'\d+', desc_elements[1].xpath('./text()').extract_first())[0])
 70 | # Favorites
 71 | collection_count = int(re.findall(r'\d+', desc_elements[2].xpath('./text()').extract_first())[0])
 72 | # Comments
 73 | comment_count = int(re.findall(r'\d+', desc_elements[3].xpath('./text()').extract_first())[0])
74 |
75 | print("点赞:", like_count, "收藏:", collection_count, "评论:", comment_count)
76 |
77 | # 插入数据库
78 | sql = "INSERT INTO topic(title,auth,like_count,collection,comment) values('%s','%s','%s','%d','%d')" % (
79 | title, author, like_count, collection_count, comment_count)
80 |
81 | # 执行
82 | self.db.execute(sql)
83 |
84 | def download_midware(self, request):
85 | # 随机UA
86 | # 依赖:pip3 install fake_useragent
87 | ua = UserAgent().random
88 | request.headers = {'User-Agent': ua}
89 | return request
90 |
91 |
92 | if __name__ == "__main__":
93 | TophubSpider(thread_count=10).start()
94 |
--------------------------------------------------------------------------------
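
The `INSERT` in `parser_detail_page()` is built with `%` string formatting, so a title containing a single quote will break the statement. A safer pattern is driver-side parameter binding; since it is not shown here whether feapder's `MysqlDB.execute` accepts bind parameters, the sketch below uses pymysql directly and a hypothetical helper name:

```python
# Sketch: parameterised insert into the `topic` table (not feapder's API).
import pymysql


def save_topic(conn, title, author, like_count, collection_count, comment_count):
    sql = ("INSERT INTO topic (title, auth, like_count, collection, comment) "
           "VALUES (%s, %s, %s, %s, %s)")
    with conn.cursor() as cursor:
        # The driver escapes every value, so quotes inside the title are handled safely.
        cursor.execute(sql, (title, author, like_count, collection_count, comment_count))
    conn.commit()
```
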
/feapder/tophub_demo/test.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: test.py
12 | @time: 2021/4/8 下午12:26
13 | @description:TODO
14 | """
15 |
16 | from fake_useragent import UserAgent
17 |
18 | ua = UserAgent().random
19 | print(ua)
--------------------------------------------------------------------------------
/js/jian_shu.js:
--------------------------------------------------------------------------------
1 | //简书上的文章保存为pdf保存到本地
2 | const puppeteer = require('puppeteer');
3 |
4 | const mkdirp = require('mkdirp');
5 |
6 | BASE_URL = 'https://www.jianshu.com';
7 |
8 | HOME_URL = `${BASE_URL}/u/f46becd1ed83`;
9 |
10 | //文章目录
11 | const ARTICLE_PATH = './monkey';
12 |
13 | const download_article = async () => {
14 |
15 | const viewport_size = {
16 | width: 0,
17 | height: 0,
18 | };
19 |
20 | const browser = await puppeteer.launch({
21 | headless: true,
22 | });
23 |
24 | const page = await browser.newPage();
25 |
26 | page.setViewport(viewport_size);
27 |
28 | //打开文章主页
29 | await page.goto(HOME_URL);
30 |
31 | console.log('显示文章列表,马上开始滑动')
32 |
33 | //滑动文章列表,使所有文章被加载出来
34 | //参考:https://github.com/GoogleChrome/puppeteer/issues/844
35 | await autoScroll(page);
36 |
37 | console.log('所有文章加载完成');
38 |
39 | const articles = await page.$eval('.note-list', articles_element => {
40 | const article_elements = articles_element.querySelectorAll('li');
41 | const articleElementArray = Array.prototype.slice.call(article_elements);
42 |
43 | return articleElementArray.map(item => {
44 | const a_element = item.querySelector('.title');
45 | return {
46 | href: a_element.getAttribute('href'),
47 | title: a_element.innerHTML.trim(),
48 | };
49 | });
50 | });
51 |
52 | console.log(`大佬一共发布了${articles.length}篇文章`);
53 |
54 |
55 | //新建目录
56 | mkdirp.sync(ARTICLE_PATH);
57 |
58 | for (let article of articles) {
59 | const articlePage = await browser.newPage();
60 | articlePage.setViewport(viewport_size);
 61 | await articlePage.goto(`${BASE_URL}${article.href}`, {
62 | waitUntil: 'networkidle2'
63 | });
64 |
 65 | await articlePage.waitForSelector('.post');
66 | console.log('文章详情页面加载完成');
67 |
68 | //注意:这里必须等待几秒,不然下面的滑动会报错:
69 | // UnhandledPromiseRejectionWarning: Error: Execution context was destroyed, most likely because of a navigation.
70 | await articlePage.waitFor(2000);
71 |
72 | //滑动到最底部,加载出所有的图片
73 | await autoScroll(articlePage);
74 |
75 |
76 | //为了保证页面的整洁干净,屏蔽多余的元素
77 | await articlePage.$eval('body', body => {
78 | body.querySelector('.navbar').style.display = 'none';
79 | body.querySelector('#note-fixed-ad-container').style.display = 'none';
80 | body.querySelector('.note-bottom').style.display = 'none';
81 | body.querySelector('.side-tool').style.display = 'none';
82 | // body.querySelector('.author').style.display = 'none';
83 | body.querySelector('.meta-bottom').style.display = 'none';
84 | body.querySelector('#web-note-ad-1').style.display = 'none';
85 | body.querySelector('#comment-list').style.display = 'none';
86 | body.querySelector('.follow-detail').style.display = 'none';
87 | body.querySelector('.show-foot').style.display = 'none';
88 |
89 | Promise.resolve();
90 | });
91 |
 92 | // File name: replace "/" in the title so it forms a valid path
 93 | const fileName = `${article.title.replace(/\//g, "、")}.pdf`;
 94 | const fileFullPath = `${ARTICLE_PATH}/${fileName}`;
 95 | console.log(`文章保存的完整路径是:${fileFullPath}`);
 96 |
 97 | await articlePage.emulateMedia('screen');
98 | await articlePage.pdf({
99 | path: fileFullPath,
100 | format: 'A4'
101 | });
102 | console.log(`保存成功: ${fileFullPath}`);
103 | articlePage.close();
104 | }
105 |
106 | console.log('下载完成!Enjoy~');
107 | };
108 |
109 | function autoScroll(page) {
110 | return page.evaluate(() => {
111 | return new Promise((resolve, reject) => {
112 | var totalHeight = 0;
113 | var distance = 100;
114 | var timer = setInterval(() => {
115 | console.log('执行间断函数');
116 | var scrollHeight = document.body.scrollHeight;
117 | window.scrollBy(0, distance);
118 | totalHeight += distance;
119 |
120 | if (totalHeight >= scrollHeight) {
121 | console.log('滑动到底');
122 | clearInterval(timer);
123 | resolve();
124 | }
125 | }, 100);
126 | })
127 | });
128 | }
129 |
130 |
131 | module.exports = download_article;
132 |
133 | if (require.main === module) {
134 | download_article()
135 | }
136 |
137 |
138 |
--------------------------------------------------------------------------------
/pic/最低气温排行榜.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/pic/最低气温排行榜.png
--------------------------------------------------------------------------------
/raw/qr.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/raw/qr.jpeg
--------------------------------------------------------------------------------
/scrapy/douban_login/.idea/douban_login.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/scrapy/douban_login/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/scrapy/douban_login/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/scrapy/douban_login/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/scrapy/douban_login/captcha.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/scrapy/douban_login/captcha.png
--------------------------------------------------------------------------------
/scrapy/douban_login/douban_login/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/scrapy/douban_login/douban_login/__init__.py
--------------------------------------------------------------------------------
/scrapy/douban_login/douban_login/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class DoubanLoginItem(scrapy.Item):
12 | # define the fields for your item here like:
13 | # name = scrapy.Field()
14 | pass
15 |
--------------------------------------------------------------------------------
/scrapy/douban_login/douban_login/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class DoubanLoginSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class DoubanLoginDownloaderMiddleware(object):
60 | # Not all methods need to be defined. If a method is not defined,
61 | # scrapy acts as if the downloader middleware does not modify the
62 | # passed objects.
63 |
64 | @classmethod
65 | def from_crawler(cls, crawler):
66 | # This method is used by Scrapy to create your spiders.
67 | s = cls()
68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 | return s
70 |
71 | def process_request(self, request, spider):
72 | # Called for each request that goes through the downloader
73 | # middleware.
74 |
75 | # Must either:
76 | # - return None: continue processing this request
77 | # - or return a Response object
78 | # - or return a Request object
79 | # - or raise IgnoreRequest: process_exception() methods of
80 | # installed downloader middleware will be called
81 | return None
82 |
83 | def process_response(self, request, response, spider):
84 | # Called with the response returned from the downloader.
85 |
86 | # Must either;
87 | # - return a Response object
88 | # - return a Request object
89 | # - or raise IgnoreRequest
90 | return response
91 |
92 | def process_exception(self, request, exception, spider):
93 | # Called when a download handler or a process_request()
94 | # (from other downloader middleware) raises an exception.
95 |
96 | # Must either:
97 | # - return None: continue processing this exception
98 | # - return a Response object: stops process_exception() chain
99 | # - return a Request object: stops process_exception() chain
100 | pass
101 |
102 | def spider_opened(self, spider):
103 | spider.logger.info('Spider opened: %s' % spider.name)
104 |
--------------------------------------------------------------------------------
/scrapy/douban_login/douban_login/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 |
9 | class DoubanLoginPipeline(object):
10 | def process_item(self, item, spider):
11 | return item
12 |
--------------------------------------------------------------------------------
/scrapy/douban_login/douban_login/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for douban_login project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'douban_login'
13 |
14 | SPIDER_MODULES = ['douban_login.spiders']
15 | NEWSPIDER_MODULE = 'douban_login.spiders'
16 |
17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
18 | # USER_AGENT = 'douban_login (+http://www.yourdomain.com)'
19 |
20 | # Obey robots.txt rules
21 | ROBOTSTXT_OBEY = False
22 |
23 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
24 | # CONCURRENT_REQUESTS = 32
25 |
26 | # Configure a delay for requests for the same website (default: 0)
27 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
28 | # See also autothrottle settings and docs
29 | DOWNLOAD_DELAY = 1
 30 | # The download delay setting will honor only one of:
31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16
32 | # CONCURRENT_REQUESTS_PER_IP = 16
33 |
34 | # Disable cookies (enabled by default)
35 | # COOKIES_ENABLED = False
36 |
37 | # Disable Telnet Console (enabled by default)
38 | # TELNETCONSOLE_ENABLED = False
39 |
40 | # Override the default request headers:
41 | DEFAULT_REQUEST_HEADERS = {
42 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
43 | 'Accept-Language': 'en',
44 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
45 | }
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | # SPIDER_MIDDLEWARES = {
50 | # 'douban_login.middlewares.DoubanLoginSpiderMiddleware': 543,
51 | # }
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | # DOWNLOADER_MIDDLEWARES = {
56 | # 'douban_login.middlewares.DoubanLoginDownloaderMiddleware': 543,
57 | # }
58 |
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | # EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | # }
64 |
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'douban_login.pipelines.DoubanLoginPipeline': 300,
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | # AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | # AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | # AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | # AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | # HTTPCACHE_ENABLED = True
87 | # HTTPCACHE_EXPIRATION_SECS = 0
88 | # HTTPCACHE_DIR = 'httpcache'
89 | # HTTPCACHE_IGNORE_HTTP_CODES = []
90 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/scrapy/douban_login/douban_login/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/scrapy/douban_login/douban_login/spiders/douban.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from urllib import request
4 | from PIL import Image
5 | import ssl
6 |
7 |
8 | # 使用Scrapy登录豆瓣网
9 | # 验证码识别可以通过手动输入【PIL】和自动识别
10 |
11 | class DoubanSpider(scrapy.Spider):
12 | name = 'douban'
13 | allowed_domains = ['douban.com']
14 |
15 | # 默认首先请求这个地址【GET】,然后把请求结果返回给parse()函数解析
16 | start_urls = ['https://accounts.douban.com/login']
17 |
18 | # 登录url
19 | login_url = 'https://accounts.douban.com/login'
20 |
21 | # 个人中心url
22 | person_center_url = 'https://www.douban.com/people/165725759/'
23 |
24 | # 编辑签名的请求地址
25 | edit_signature = 'https://www.douban.com/j/people/165725759/edit_signature'
26 |
27 | def parse(self, response):
28 | """
29 | 请求后的解析
30 | 包含两种情况:1.第一次请求start_urls;2.某一次请求不包含callback
31 | :param response:
32 | :return:
33 | """
34 | # 注意:把最后的请求解析过滤掉
35 | # 如果解析到相应地址不是login_url就不做处理
36 | if response.url != self.login_url:
37 | return
38 |
39 | print('调用parse函数,此时的url:%s' % response.url)
40 | form_data = {
41 | 'source': 'index_nav',
42 | 'redir': 'https://www.douban.com/', # 登录后跳转到哪个界面
43 | 'form_email': '18520876423',
44 | 'form_password': 'Hu881025',
45 | # 'captcha-solution': 'chemical', # 验证码【需要识别图片】
46 | # 'captcha-id': 'ysCwMdnnq8YVpDJZdfmzHu1V:en', # 验证码ID 【每次刷新都重新生成一个,放入到input标签的name为captcha-id的value中】
47 | 'remember': 'on',
48 | 'login': '登录'
49 | }
50 |
51 | # 获取id为captcha-id的img标签【css方式,也可以选择用xpath】
52 | # 验证码图片的url
53 | captcha_img = response.css('img#captcha_image::attr(src)').get()
54 |
55 | # 注意:如果存在验证码,就识别验证码;如果没有验证码,不传入以下两个参数直接登录
56 | if captcha_img:
57 | # 手动识别验证码
 58 | captcha = self._recognize_captcha(captcha_img)
59 | form_data['captcha-solution'] = captcha
60 |
61 | # 验证码id【每次刷新都会变化】
62 | captcha_id = response.xpath('//input[@name="captcha-id"]/@value').get()
63 | form_data['captcha-id'] = captcha_id
64 | print('带有验证码的参数已经补充完整,现在开始发送请求')
65 | else:
66 | print('没有验证码,现在开始发送请求')
67 |
68 | # 发送登录请求【POST】
69 | yield scrapy.FormRequest(url=self.login_url, formdata=form_data, callback=self.parse_after_login)
70 |
 71 | def _recognize_captcha(self, image_url):
72 | """
73 | 人工识别验证码【urllib+PIL】
74 | :param image_url:
75 | :return:
76 | """
77 | print('验证码的地址:%s,开始下载图片' % image_url)
78 |
79 | # 下载图片到本地
80 | request.urlretrieve(image_url, 'captcha.png')
81 |
82 | print('下载图片完成,开始显示图片')
83 |
84 | # 显示在控制台,手动输入验证码
85 | # 打开图片
86 | image = Image.open('captcha.png')
87 | # 展示
88 | image.show()
89 |
90 | # 提示输入验证码
91 | captcha = input('请输入验证码:')
92 |
93 | return captcha
94 |
95 | def parse_after_login(self, response):
96 | """
97 | 登录成功之后,请求【个人中心】
98 | :param response:
99 | :return:
100 | """
101 | # 当前url
102 | current_page_url = response.url
103 | print('调用登录接口后,现在的界面是:%s' % current_page_url)
104 | if current_page_url == 'https://www.douban.com/':
105 | print('登录成功')
106 | # 请求个人中心的页面
107 | request = scrapy.Request(url=self.person_center_url, callback=self.parse_person_center)
108 | yield request
109 | else:
110 | print('登录失败')
111 |
112 | def parse_person_center(self, response):
113 | """
114 | 解析个人中心页面
115 | :param response:
116 | :return:
117 | """
118 | if response.url == self.person_center_url:
119 | print('进入到个人中心页面了')
120 | ck = response.xpath('//input[@name="ck"]/@value').get()
121 | print('获取的ck是:%s' % ck)
122 | formdata = {
123 | 'ck': ck,
124 | 'signature': '时光如水,岁月如斯'
125 | }
126 | # 发送post请求来更改签名
127 | yield scrapy.FormRequest(self.edit_signature, formdata=formdata)
128 | else:
129 | print('进入个人中心页面失败')
130 |
--------------------------------------------------------------------------------
/scrapy/douban_login/readme.MD:
--------------------------------------------------------------------------------
 1 | # Log in to Douban with Scrapy
 2 | ### Setup
 3 |
 4 | ```
 5 | scrapy startproject douban_login
 6 | cd douban_login
 7 | scrapy genspider douban "douban.com"
 8 | ```
 9 |
 10 |
 11 |
 12 | ### Configuration
 13 |
 14 | Configure the `settings.py` file.
 15 |
 16 | Write a `start.py` file that uses `cmdline` to launch the spider quickly.
 17 |
 18 |
 19 |
 20 | ### Development
 21 |
 22 | Scenario: use `scrapy` to log in to Douban, open the personal center page, and change the personal signature.
 23 |
 24 | Requests involved: initial request [GET], login request [POST], personal center request [GET], change-signature request [POST].
 25 |
 26 | Notes:
 27 |
 28 | 1. The initial request URL comes from `start_urls`.
 29 | 2. Download the captcha image with `urllib + PIL` and recognize it manually (a paid captcha-recognition API could be used instead).
 30 | 3. The `captcha-id` and `ck` request parameters are both embedded in elements of the page source.
 31 |
 32 |
 33 |
 34 | ### Run
 35 |
 36 | Run `start.py`.
 37 |
 38 |
--------------------------------------------------------------------------------
/scrapy/douban_login/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = douban_login.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = douban_login
12 |
--------------------------------------------------------------------------------
/scrapy/douban_login/start.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: start.py
12 | @time: 11/15/18 21:04
 13 | @description: convenience script to run the spider from a single Python file
14 | """
15 | from scrapy import cmdline
16 |
17 | cmdline.execute('scrapy crawl douban'.split())
18 |
--------------------------------------------------------------------------------
/scrapy/huize_spider/.idea/huize_spider.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/scrapy/huize_spider/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/scrapy/huize_spider/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/scrapy/huize_spider/.~ana.rtf:
--------------------------------------------------------------------------------
1 | xingag x i n g a g
--------------------------------------------------------------------------------
/scrapy/huize_spider/ana.rtf:
--------------------------------------------------------------------------------
1 | {\rtf1\ansi\ansicpg936\cocoartf1671\cocoasubrtf100
2 | {\fonttbl\f0\fswiss\fcharset0 Helvetica;\f1\fnil\fcharset134 PingFangSC-Regular;}
3 | {\colortbl;\red255\green255\blue255;}
4 | {\*\expandedcolortbl;;}
5 | \margl1440\margr1440\vieww10800\viewh13080\viewkind0
6 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0
7 |
8 | \f0\fs24 \cf0 http://www.huize.com/\
9 | \
10 |
11 | \f1 \'bd\'a1\'bf\'b5\'b1\'a3\'cf\'d5\
12 | 1.\'d6\'d8\'bc\'b2
13 | \f0 \
14 | http://www.huize.com/product/ins-2059-0-0\
15 | 2.
16 | \f1 \'d7\'a1\'d4\'ba\'d2\'bd\'c1\'c6
17 | \f0 \
18 | http://www.huize.com/product/ins-2058-0-0\
19 | \
20 | \
21 |
22 | \f1 \'c8\'cb\'ca\'d9\'b1\'a3\'cf\'d5
23 | \f0 \
24 | 1.
25 | \f1 \'c8\'cb\'ca\'d9\'b1\'a3\'d5\'cf
26 | \f0 \
27 | http://www.huize.com/product/ins-2060-0-0\
28 | 2.
29 | \f1 \'c4\'ea\'bd\'f0\'b1\'a3\'cf\'d5
30 | \f0 \
31 | http://www.huize.com/product/ins-2101-0-0\
32 | \
33 | \
34 |
35 | \f1 \'b6\'f9\'cd\'af\'b1\'a3\'cf\'d5
36 | \f0 \
37 | 1.
38 | \f1 \'b6\'f9\'cd\'af\'d6\'d8\'bb\'f7
39 | \f0 \
40 | http://www.huize.com/product/ins-2043-0-0\
41 | 2.
42 | \f1 \'b6\'f9\'cd\'af\'d2\'bd\'c1\'c6
43 | \f0 \
44 | http://www.huize.com/product/ins-2044-0-0\
45 | 3.
46 | \f1 \'b6\'f9\'cd\'af\'d2\'e2\'cd\'e2\
47 | http://www.huize.com/product/ins-2042-0-0\
48 | 4.\'bd\'cc\'d3\'fd\'b4\'a2\'d0\'ee\
49 | http://www.huize.com/product/ins-2057-0-0\
50 | \
51 | \
52 | \'d2\'e2\'cd\'e2\'b1\'a3\'cf\'d5\
53 | 1.\'bd\'bb\'cd\'a8\'d2\'e2\'cd\'e2\
54 | http://www.huize.com/product/ins-2082-0-0\
55 | 2.\'d7\'db\'ba\'cf\'d2\'e2\'cd\'e2\
56 | http://www.huize.com/product/ins-2049-0-0\
57 | \
58 | }
--------------------------------------------------------------------------------
/scrapy/huize_spider/huize_spider/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/scrapy/huize_spider/huize_spider/__init__.py
--------------------------------------------------------------------------------
/scrapy/huize_spider/huize_spider/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class HuizeSpiderItem(scrapy.Item):
12 | title = scrapy.Field()
13 | sales = scrapy.Field()
14 | tips = scrapy.Field()
15 | price = scrapy.Field()
16 | url = scrapy.Field()
17 |
--------------------------------------------------------------------------------
/scrapy/huize_spider/huize_spider/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class HuizeSpiderSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class HuizeSpiderDownloaderMiddleware(object):
60 | # Not all methods need to be defined. If a method is not defined,
61 | # scrapy acts as if the downloader middleware does not modify the
62 | # passed objects.
63 |
64 | @classmethod
65 | def from_crawler(cls, crawler):
66 | # This method is used by Scrapy to create your spiders.
67 | s = cls()
68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 | return s
70 |
71 | def process_request(self, request, spider):
72 | # Called for each request that goes through the downloader
73 | # middleware.
74 |
75 | # Must either:
76 | # - return None: continue processing this request
77 | # - or return a Response object
78 | # - or return a Request object
79 | # - or raise IgnoreRequest: process_exception() methods of
80 | # installed downloader middleware will be called
81 | return None
82 |
83 | def process_response(self, request, response, spider):
84 | # Called with the response returned from the downloader.
85 |
86 | # Must either;
87 | # - return a Response object
88 | # - return a Request object
89 | # - or raise IgnoreRequest
90 | return response
91 |
92 | def process_exception(self, request, exception, spider):
93 | # Called when a download handler or a process_request()
94 | # (from other downloader middleware) raises an exception.
95 |
96 | # Must either:
97 | # - return None: continue processing this exception
98 | # - return a Response object: stops process_exception() chain
99 | # - return a Request object: stops process_exception() chain
100 | pass
101 |
102 | def spider_opened(self, spider):
103 | spider.logger.info('Spider opened: %s' % spider.name)
104 |
--------------------------------------------------------------------------------
/scrapy/huize_spider/huize_spider/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | from scrapy.exporters import JsonLinesItemExporter
9 |
10 |
11 | class HuizeSpiderPipeline(object):
12 |
13 | def __init__(self):
14 | self.fp = open('datas.json', 'wb')
15 |
16 | self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False)
17 |
18 | def process_item(self, item, spider):
19 | self.exporter.export_item(item)
20 | return item
21 |
22 | def close_spider(self, spider):
23 | # close the output file
24 | self.fp.close()
25 |
--------------------------------------------------------------------------------
/scrapy/huize_spider/huize_spider/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for huize_spider project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'huize_spider'
13 |
14 | SPIDER_MODULES = ['huize_spider.spiders']
15 | NEWSPIDER_MODULE = 'huize_spider.spiders'
16 |
17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
18 | # USER_AGENT = 'huize_spider (+http://www.yourdomain.com)'
19 |
20 | # Obey robots.txt rules
21 | ROBOTSTXT_OBEY = False
22 |
23 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
24 | # CONCURRENT_REQUESTS = 32
25 |
26 | # Configure a delay for requests for the same website (default: 0)
27 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
28 | # See also autothrottle settings and docs
29 | DOWNLOAD_DELAY = 1
30 | # The download delay setting will honor only one of:
31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16
32 | # CONCURRENT_REQUESTS_PER_IP = 16
33 |
34 | # Disable cookies (enabled by default)
35 | # COOKIES_ENABLED = False
36 |
37 | # Disable Telnet Console (enabled by default)
38 | # TELNETCONSOLE_ENABLED = False
39 |
40 | # Override the default request headers:
41 | DEFAULT_REQUEST_HEADERS = {
42 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
43 | 'Accept-Language': 'en',
44 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
45 | }
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | # SPIDER_MIDDLEWARES = {
50 | # 'huize_spider.middlewares.HuizeSpiderSpiderMiddleware': 543,
51 | # }
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | # DOWNLOADER_MIDDLEWARES = {
56 | # 'huize_spider.middlewares.HuizeSpiderDownloaderMiddleware': 543,
57 | # }
58 |
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | # EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | # }
64 |
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'huize_spider.pipelines.HuizeSpiderPipeline': 300,
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | # AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | # AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | # AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | # AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | # HTTPCACHE_ENABLED = True
87 | # HTTPCACHE_EXPIRATION_SECS = 0
88 | # HTTPCACHE_DIR = 'httpcache'
89 | # HTTPCACHE_IGNORE_HTTP_CODES = []
90 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/scrapy/huize_spider/huize_spider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/scrapy/huize_spider/huize_spider/spiders/huize.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from scrapy.linkextractors import LinkExtractor
4 | from scrapy.spiders import CrawlSpider, Rule
5 | from huize_spider.items import HuizeSpiderItem
6 | from .string_utils import remove_space_words
7 |
8 |
9 | # Crawl product data from the huize.com insurance site with CrawlSpider
10 |
11 | class HuizeSpider(CrawlSpider):
12 | name = 'huize'
13 | allowed_domains = ['huize.com']
14 | start_urls = ['http://huize.com/']
15 |
16 | rules = (
17 | Rule(LinkExtractor(allow=r'.*http://www.huize.com/product/ins-.*'), callback=None, follow=False),
18 | Rule(LinkExtractor(allow=r'.*http://www.huize.com/product/detail-.*'), callback='parse_detail', follow=False),
19 | )
20 |
21 | def parse_detail(self, response):
22 | # product title
23 | title = response.xpath('//h2[@class="product-title f30"]/text()').get().strip()
24 |
25 | # sales volume
26 | sales = response.xpath('//p[@class="count-item fc6"]/text()').get().strip()
27 |
28 | # product highlights
29 | # strip the special whitespace characters
30 | tips = remove_space_words("、".join(response.xpath('//li[@class="ensure-support-item"]/text()').getall()))
31 |
32 | # price
33 | price = response.xpath('//span[@class="product-price"]/i[@class="preminum-result"]/text()').get() + " 元"
34 |
35 | item = HuizeSpiderItem(title=title, sales=sales, tips=tips, price=price, url=response.url)
36 |
37 | yield item
38 |
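The field XPaths in parse_detail() are easiest to verify outside a full crawl with `scrapy shell`. A minimal check, assuming a placeholder detail-page URL (substitute any real `product/detail-...` page on huize.com):

```
scrapy shell "http://www.huize.com/product/detail-XXXX"
>>> response.xpath('//h2[@class="product-title f30"]/text()').get()
>>> response.xpath('//p[@class="count-item fc6"]/text()').get()
>>> response.xpath('//li[@class="ensure-support-item"]/text()').getall()
```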
--------------------------------------------------------------------------------
/scrapy/huize_spider/huize_spider/spiders/string_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: string_utils.py
12 | @time: 12/4/18 19:52
13 | @description:TODO
14 | """
15 |
16 |
17 | def remove_space_words(source):
18 | """
19 | 去掉字符串中的特殊空格,包含\n、\t、\xa0
20 | :param source:
21 | :return:
22 | """
23 | result = "".join(source.split())
24 | return result
25 |
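A quick usage sketch of `remove_space_words`; the input string is made up, but it shows that `str.split()` with no arguments also treats `\xa0` (non-breaking space) as whitespace:

```python
# assumes it is run from the project root so the package is importable
from huize_spider.spiders.string_utils import remove_space_words

raw = "重疾保障 \xa0住院津贴\n\t意外医疗 "   # hypothetical tips text scraped from a product page
print(remove_space_words(raw))               # -> 重疾保障住院津贴意外医疗
```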
--------------------------------------------------------------------------------
/scrapy/huize_spider/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = huize_spider.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = huize_spider
12 |
--------------------------------------------------------------------------------
/scrapy/huize_spider/start.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: start.py
12 | @time: 11/15/18 21:04
13 | @description: helper script to run the spider from a single Python file
14 | """
15 | from scrapy import cmdline
16 |
17 | cmdline.execute('scrapy crawl huize'.split())
18 |
19 |
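`cmdline.execute` hands control to Scrapy's CLI and does not return. If more control is wanted (for example running the crawl from another script), Scrapy's `CrawlerProcess` is an alternative; a sketch, assuming it is started from the project root so `scrapy.cfg` is found:

```python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())  # loads huize_spider/settings.py
process.crawl('huize')                            # spider name from spiders/huize.py
process.start()                                   # blocks until the crawl finishes
```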
--------------------------------------------------------------------------------
/scrapy/jianshu_spider/.idea/jianshu_spider.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/scrapy/jianshu_spider/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/scrapy/jianshu_spider/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/scrapy/jianshu_spider/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/scrapy/jianshu_spider/jianshu_spider/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/scrapy/jianshu_spider/jianshu_spider/__init__.py
--------------------------------------------------------------------------------
/scrapy/jianshu_spider/jianshu_spider/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | # item for the article detail page
12 | class ArticleItem(scrapy.Item):
13 | title = scrapy.Field()
14 | content = scrapy.Field()
15 | # article id
16 | article_id = scrapy.Field()
17 | # original url
18 | origin_url = scrapy.Field()
19 |
20 | # author
21 | author = scrapy.Field()
22 |
23 | # author avatar
24 | avatar = scrapy.Field()
25 |
26 | # publish time
27 | pubtime = scrapy.Field()
28 |
--------------------------------------------------------------------------------
/scrapy/jianshu_spider/jianshu_spider/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class JianshuSpiderSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class JianshuSpiderDownloaderMiddleware(object):
60 | # Not all methods need to be defined. If a method is not defined,
61 | # scrapy acts as if the downloader middleware does not modify the
62 | # passed objects.
63 |
64 | @classmethod
65 | def from_crawler(cls, crawler):
66 | # This method is used by Scrapy to create your spiders.
67 | s = cls()
68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 | return s
70 |
71 | def process_request(self, request, spider):
72 | # Called for each request that goes through the downloader
73 | # middleware.
74 |
75 | # Must either:
76 | # - return None: continue processing this request
77 | # - or return a Response object
78 | # - or return a Request object
79 | # - or raise IgnoreRequest: process_exception() methods of
80 | # installed downloader middleware will be called
81 | return None
82 |
83 | def process_response(self, request, response, spider):
84 | # Called with the response returned from the downloader.
85 |
86 | # Must either;
87 | # - return a Response object
88 | # - return a Request object
89 | # - or raise IgnoreRequest
90 | return response
91 |
92 | def process_exception(self, request, exception, spider):
93 | # Called when a download handler or a process_request()
94 | # (from other downloader middleware) raises an exception.
95 |
96 | # Must either:
97 | # - return None: continue processing this exception
98 | # - return a Response object: stops process_exception() chain
99 | # - return a Request object: stops process_exception() chain
100 | pass
101 |
102 | def spider_opened(self, spider):
103 | spider.logger.info('Spider opened: %s' % spider.name)
104 |
--------------------------------------------------------------------------------
/scrapy/jianshu_spider/jianshu_spider/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | # After scraping, save the data to a MySQL database
9 |
10 | import pymysql
11 |
12 |
13 | class JianshuSpiderPipeline(object):
14 |
15 | def __init__(self):
16 | db_params = {
17 | 'host': '127.0.0.1',
18 | 'port': 3306,
19 | 'user': 'root',
20 | 'password': 'root',
21 | 'database': 'jianshu',
22 | 'charset': 'utf8'
23 | }
24 |
25 | # database connection object
26 | self.conn = pymysql.connect(**db_params)
27 |
28 | # database cursor object, used to run statements
29 | self.cursor = self.conn.cursor()
30 |
31 | # parameterized insert statement
32 | self._sql = """
33 | insert into article(id,title,content,author,avatar,pubtime,article_id,origin_url)
34 | values(null,%s,%s,%s,%s,%s,%s,%s)
35 | """
36 |
37 | def process_item(self, item, spider):
38 | # run the insert statement
39 | self.cursor.execute(self._sql, (
40 | item['title'], item['content'], item['author'], item['avatar'], item['pubtime'], item['article_id'],
41 | item['origin_url']))
42 |
43 | # commit the insert
44 | self.conn.commit()
45 | return item
46 |
47 | def close_spider(self, spider):
48 | # close the cursor and the connection
49 | self.cursor.close()
50 | self.conn.close()
51 |
52 |
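The insert above assumes every statement succeeds. A hedged sketch of the same `process_item` with basic error handling, so one bad record does not leave the connection stuck in a failed transaction (the log message is illustrative):

```python
def process_item(self, item, spider):
    try:
        self.cursor.execute(self._sql, (
            item['title'], item['content'], item['author'], item['avatar'],
            item['pubtime'], item['article_id'], item['origin_url']))
        self.conn.commit()
    except pymysql.MySQLError:
        # undo the failed statement and keep the pipeline alive
        self.conn.rollback()
        spider.logger.exception('failed to insert article %s', item.get('article_id'))
    return item
```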
--------------------------------------------------------------------------------
/scrapy/jianshu_spider/jianshu_spider/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for jianshu_spider project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'jianshu_spider'
13 |
14 | SPIDER_MODULES = ['jianshu_spider.spiders']
15 | NEWSPIDER_MODULE = 'jianshu_spider.spiders'
16 |
17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
18 | # USER_AGENT = 'jianshu_spider (+http://www.yourdomain.com)'
19 |
20 | # Obey robots.txt rules
21 | ROBOTSTXT_OBEY = False
22 |
23 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
24 | # CONCURRENT_REQUESTS = 32
25 |
26 | # Configure a delay for requests for the same website (default: 0)
27 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
28 | # See also autothrottle settings and docs
29 | DOWNLOAD_DELAY = 3
30 | # The download delay setting will honor only one of:
31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16
32 | # CONCURRENT_REQUESTS_PER_IP = 16
33 |
34 | # Disable cookies (enabled by default)
35 | # COOKIES_ENABLED = False
36 |
37 | # Disable Telnet Console (enabled by default)
38 | # TELNETCONSOLE_ENABLED = False
39 |
40 | # Override the default request headers:
41 | DEFAULT_REQUEST_HEADERS = {
42 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
43 | 'Accept-Language': 'en',
44 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
45 | }
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | # SPIDER_MIDDLEWARES = {
50 | # 'jianshu_spider.middlewares.JianshuSpiderSpiderMiddleware': 543,
51 | # }
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | # DOWNLOADER_MIDDLEWARES = {
56 | # 'jianshu_spider.middlewares.JianshuSpiderDownloaderMiddleware': 543,
57 | # }
58 |
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | # EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | # }
64 |
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'jianshu_spider.pipelines.JianshuSpiderPipeline': 300,
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | # AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | # AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | # AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | # AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | # HTTPCACHE_ENABLED = True
87 | # HTTPCACHE_EXPIRATION_SECS = 0
88 | # HTTPCACHE_DIR = 'httpcache'
89 | # HTTPCACHE_IGNORE_HTTP_CODES = []
90 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
92 |
93 | # Set the log level (and an optional log file) here in settings.py
94 | # LOG_LEVEL = 'DEBUG'
95 | # LOG_FILE = 'log.txt'
96 |
--------------------------------------------------------------------------------
/scrapy/jianshu_spider/jianshu_spider/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/scrapy/jianshu_spider/jianshu_spider/spiders/jianshu.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from scrapy.linkextractors import LinkExtractor
4 | from scrapy.spiders import CrawlSpider, Rule
5 | from jianshu_spider.items import ArticleItem
6 |
7 |
8 | class JianshuSpider(CrawlSpider):
9 | name = 'jianshu'
10 | allowed_domains = ['jianshu.com']
11 | start_urls = ['https://www.jianshu.com/']
12 |
13 | HTTPS = "https:"
14 |
15 | rules = (
16 | # the article id consists of 12 lowercase letters or digits
17 | Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True),
18 | )
19 |
20 | # quick test: scrapy shell https://www.jianshu.com/p/8d5ab6d5f258
21 | def parse_detail(self, response):
22 | title = response.xpath('//h1[@class="title"]/text()').get()
23 |
24 | author = response.xpath('//div[@class="info"]/span/a/text()').get()
25 |
26 | avatar = self.HTTPS + response.xpath('//div[@class="author"]/a/img/@src').get()
27 |
28 | pub_time = response.xpath('//span[@class="publish-time"]/text()').get().replace("*", "")
29 |
30 | current_url = response.url
31 | real_url = current_url.split(r"?")[0]
32 |
33 | article_id = real_url.split(r'/')[-1]
34 |
35 | # keep the HTML content with its tags, so the original formatting can be reused later
36 | content = response.xpath('//div[@class="show-content"]').get()
37 |
38 | item = ArticleItem(
39 | title=title,
40 | avatar=avatar,
41 | pubtime=pub_time,
42 | origin_url=current_url,
43 | author=author,
44 | article_id=article_id,
45 | content=content
46 | )
47 |
48 | yield item
49 |
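The Rule above hinges on the regex for the 12-character article id. It can be sanity-checked directly with `re`; the second URL is a made-up non-article link:

```python
import re

pattern = re.compile(r'.*/p/[0-9a-z]{12}.*')

print(bool(pattern.match('https://www.jianshu.com/p/8d5ab6d5f258')))  # True  - article page
print(bool(pattern.match('https://www.jianshu.com/u/abcdef123456')))  # False - not a /p/ link
```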
--------------------------------------------------------------------------------
/scrapy/jianshu_spider/raw/article.sql:
--------------------------------------------------------------------------------
1 | /*
2 | Navicat MySQL Data Transfer
3 |
4 | Source Server : cal
5 | Source Server Type : MySQL
6 | Source Server Version : 50724
7 | Source Host : localhost
8 | Source Database : jianshu
9 |
10 | Target Server Type : MySQL
11 | Target Server Version : 50724
12 | File Encoding : utf-8
13 |
14 | Date: 12/04/2018 23:08:42 PM
15 | */
16 |
17 | SET NAMES utf8;
18 | SET FOREIGN_KEY_CHECKS = 0;
19 |
20 | -- ----------------------------
21 | -- Table structure for `article`
22 | -- ----------------------------
23 | DROP TABLE IF EXISTS `article`;
24 | CREATE TABLE `article` (
25 | `id` int(11) NOT NULL AUTO_INCREMENT,
26 | `title` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL,
27 | `content` longtext CHARACTER SET utf8 COLLATE utf8_bin,
28 | `author` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL,
29 | `avatar` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL,
30 | `pubtime` datetime DEFAULT NULL,
31 | `article_id` varchar(20) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL,
32 | `origin_url` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL,
33 | PRIMARY KEY (`id`)
34 | ) ENGINE=InnoDB AUTO_INCREMENT=725 DEFAULT CHARSET=utf8;
35 |
36 | SET FOREIGN_KEY_CHECKS = 1;
37 |
--------------------------------------------------------------------------------
/scrapy/jianshu_spider/raw/article_table.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/scrapy/jianshu_spider/raw/article_table.png
--------------------------------------------------------------------------------
/scrapy/jianshu_spider/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = jianshu_spider.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = jianshu_spider
12 |
--------------------------------------------------------------------------------
/scrapy/qczj/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/scrapy/qczj/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/scrapy/qczj/.idea/qczj.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/scrapy/qczj/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/scrapy/qczj/qczj/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/scrapy/qczj/qczj/__init__.py
--------------------------------------------------------------------------------
/scrapy/qczj/qczj/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 | # To work with the Images Pipeline, the item must define the image_urls and images fields
11 | class QczjItem(scrapy.Item):
12 | category = scrapy.Field()
13 | image_urls = scrapy.Field()
14 | images = scrapy.Field()
15 |
--------------------------------------------------------------------------------
/scrapy/qczj/qczj/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class QczjSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class QczjDownloaderMiddleware(object):
60 | # Not all methods need to be defined. If a method is not defined,
61 | # scrapy acts as if the downloader middleware does not modify the
62 | # passed objects.
63 |
64 | @classmethod
65 | def from_crawler(cls, crawler):
66 | # This method is used by Scrapy to create your spiders.
67 | s = cls()
68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 | return s
70 |
71 | def process_request(self, request, spider):
72 | # Called for each request that goes through the downloader
73 | # middleware.
74 |
75 | # Must either:
76 | # - return None: continue processing this request
77 | # - or return a Response object
78 | # - or return a Request object
79 | # - or raise IgnoreRequest: process_exception() methods of
80 | # installed downloader middleware will be called
81 | return None
82 |
83 | def process_response(self, request, response, spider):
84 | # Called with the response returned from the downloader.
85 |
86 | # Must either;
87 | # - return a Response object
88 | # - return a Request object
89 | # - or raise IgnoreRequest
90 | return response
91 |
92 | def process_exception(self, request, exception, spider):
93 | # Called when a download handler or a process_request()
94 | # (from other downloader middleware) raises an exception.
95 |
96 | # Must either:
97 | # - return None: continue processing this exception
98 | # - return a Response object: stops process_exception() chain
99 | # - return a Request object: stops process_exception() chain
100 | pass
101 |
102 | def spider_opened(self, spider):
103 | spider.logger.info('Spider opened: %s' % spider.name)
104 |
--------------------------------------------------------------------------------
/scrapy/qczj/qczj/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | # store the downloaded images
9 | import os
10 | from urllib import request
11 | from scrapy.pipelines.images import ImagesPipeline
12 | from qczj import settings
13 |
14 |
15 | # The stock ImagesPipeline cannot choose per-category sub-directories or file names, so it is customized here
16 | class CustomImagesPipline(ImagesPipeline):
17 |
18 | # called before the image download requests are sent
19 | def get_media_requests(self, item, info):
20 | request_objs = super(CustomImagesPipline, self).get_media_requests(item, info)
21 |
22 | for request_obj in request_objs:
23 | request_obj.item = item
24 |
25 | # note: the list of request objects must be returned
26 | return request_objs
27 |
28 | # called just before an image is stored
29 | def file_path(self, request, response=None, info=None):
30 | path = super(CustomImagesPipline, self).file_path(request, response, info)
31 |
32 | # category of the pictures, taken from the item attached in get_media_requests()
33 | category = request.item.get('category')
34 |
35 | # directory the images of this category should end up in
36 | category_path = os.path.join(settings.IMAGES_STORE, category)
37 |
38 | if not os.path.exists(category_path):
39 | os.mkdir(category_path)
40 |
41 | # the parent class names images like full/<name>.jpg; keep only the file name
42 | image_name = path.replace("full/", "")
43 |
44 | # full save path; note it must be relative to settings.IMAGES_STORE (see the path returned by the parent class)
45 | image_full_path = os.path.join(category, image_name)
46 |
47 | return image_full_path
48 |
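A small sketch of what `file_path()` does to the path coming from the parent class; the hash-like file name and the category are made-up values:

```python
import os

path = 'full/0a1b2c3d4e5f.jpg'   # hypothetical name produced by ImagesPipeline.file_path()
category = '车身外观'             # hypothetical item['category']

image_name = path.replace('full/', '')              # drop the default full/ prefix
relative_path = os.path.join(category, image_name)
print(relative_path)                                # 车身外观/0a1b2c3d4e5f.jpg, relative to IMAGES_STORE
```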
--------------------------------------------------------------------------------
/scrapy/qczj/qczj/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for qczj project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | import os
13 |
14 | BOT_NAME = 'qczj'
15 |
16 | SPIDER_MODULES = ['qczj.spiders']
17 | NEWSPIDER_MODULE = 'qczj.spiders'
18 |
19 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
20 | # USER_AGENT = 'qczj (+http://www.yourdomain.com)'
21 |
22 | # Obey robots.txt rules
23 | ROBOTSTXT_OBEY = False
24 |
25 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
26 | # CONCURRENT_REQUESTS = 32
27 |
28 | # Configure a delay for requests for the same website (default: 0)
29 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
30 | # See also autothrottle settings and docs
31 | DOWNLOAD_DELAY = 1
32 | # The download delay setting will honor only one of:
33 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16
34 | # CONCURRENT_REQUESTS_PER_IP = 16
35 |
36 | # Disable cookies (enabled by default)
37 | # COOKIES_ENABLED = False
38 |
39 | # Disable Telnet Console (enabled by default)
40 | # TELNETCONSOLE_ENABLED = False
41 |
42 | # Override the default request headers:
43 | DEFAULT_REQUEST_HEADERS = {
44 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
45 | 'Accept-Language': 'en',
46 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
47 | }
48 |
49 | # Enable or disable spider middlewares
50 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
51 | # SPIDER_MIDDLEWARES = {
52 | # 'qczj.middlewares.QczjSpiderMiddleware': 543,
53 | # }
54 |
55 | # Enable or disable downloader middlewares
56 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
57 | # DOWNLOADER_MIDDLEWARES = {
58 | # 'qczj.middlewares.QczjDownloaderMiddleware': 543,
59 | # }
60 |
61 | # Enable or disable extensions
62 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
63 | # EXTENSIONS = {
64 | # 'scrapy.extensions.telnet.TelnetConsole': None,
65 | # }
66 |
67 | # Configure item pipelines
68 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
69 | ITEM_PIPELINES = {
70 | 'qczj.pipelines.CustomImagesPipline': 1
71 | }
72 |
73 | # Enable and configure the AutoThrottle extension (disabled by default)
74 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
75 | # AUTOTHROTTLE_ENABLED = True
76 | # The initial download delay
77 | # AUTOTHROTTLE_START_DELAY = 5
78 | # The maximum download delay to be set in case of high latencies
79 | # AUTOTHROTTLE_MAX_DELAY = 60
80 | # The average number of requests Scrapy should be sending in parallel to
81 | # each remote server
82 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
83 | # Enable showing throttling stats for every response received:
84 | # AUTOTHROTTLE_DEBUG = False
85 |
86 | # Enable and configure HTTP caching (disabled by default)
87 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
88 | # HTTPCACHE_ENABLED = True
89 | # HTTPCACHE_EXPIRATION_SECS = 0
90 | # HTTPCACHE_DIR = 'httpcache'
91 | # HTTPCACHE_IGNORE_HTTP_CODES = []
92 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
93 |
94 |
95 | # image download directory, used by the Images Pipeline
96 | IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')
97 |
--------------------------------------------------------------------------------
/scrapy/qczj/qczj/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/scrapy/qczj/qczj/spiders/bmw5.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from scrapy.spiders import CrawlSpider, Rule
4 | from scrapy.linkextractors import LinkExtractor
5 |
6 | from qczj.items import QczjItem
7 |
8 |
9 | # Crawl the Autohome BMW 5 Series picture pages and download the original images
10 |
11 | class Bmw5Spider(CrawlSpider):
12 | name = 'bmw5'
13 | allowed_domains = ['car.autohome.com.cn']
14 |
15 | # picture index of the BMW 5 Series (imported)
16 | start_urls = ['https://car.autohome.com.cn/pic/series/202.html']
17 |
18 | rules = (
19 | # follow=True: keep following to the second page, third page, and so on
20 | Rule(LinkExtractor(allow=r'https://car.autohome.com.cn/pic/series/202-.+'), callback="parse_page", follow=True),
21 | )
22 |
23 | def parse_page(self, response):
24 | """
25 | Parse a url matched by the rules (a "more pictures" page), e.g. https://car.autohome.com.cn/pic/series/202-1-p1.html
26 | :param response:
27 | :return:
28 | """
29 | # 1. get the category (can be tested in isolation with scrapy shell, no need to run the whole project)
30 | category = response.xpath('//div[@class="uibox"]/div[1]/text()').get()
31 |
32 | # 2. image thumbnails
33 | # note: xpath contains() syntax (again easy to verify with scrapy shell)
34 | srcs = response.xpath('//div[contains(@class,"uibox-con")]//li//img/@src').getall()
35 |
36 | # 3.1 complete the protocol-relative thumbnail urls
37 | # 3.2 turn the thumbnail urls into the high-resolution image urls
38 | srcs = list(map(lambda x: response.urljoin(x).replace("t_", ""), srcs))
39 |
40 | item = QczjItem(category=category, image_urls=srcs)
41 |
42 | print("finished page: %s, category: %s" % (response.url, category))
43 |
44 | yield item
45 |
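The one-liner that rewrites the thumbnail src values is worth unpacking: `response.urljoin(x)` behaves like `urljoin(response.url, x)`, and stripping `t_` from the path yields the full-size image. A sketch on a made-up src value:

```python
from urllib.parse import urljoin

page_url = 'https://car.autohome.com.cn/pic/series/202-1-p1.html'
src = '//car2.autoimg.cn/cardfs/product/g26/M07/t_sample123.jpg'   # hypothetical protocol-relative thumbnail

full = urljoin(page_url, src).replace('t_', '')
print(full)   # https://car2.autoimg.cn/cardfs/product/g26/M07/sample123.jpg
```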
--------------------------------------------------------------------------------
/scrapy/qczj/readme.MD:
--------------------------------------------------------------------------------
1 | # Scraping BMW 5 Series pictures from Autohome (汽车之家)
2 | ### Create a spider
3 | ```
4 | scrapy genspider bmw5 "car.autohome.com.cn"
5 | ```
6 |
7 |
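To run the spider, either execute `start.py` (which wraps `cmdline.execute('scrapy crawl bmw5'.split())`) or call Scrapy directly from the project root:
```
scrapy crawl bmw5
```
Downloaded images end up under the `images/` directory configured by `IMAGES_STORE` in `settings.py`, grouped by category.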
--------------------------------------------------------------------------------
/scrapy/qczj/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = qczj.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = qczj
12 |
--------------------------------------------------------------------------------
/scrapy/qczj/start.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: start.py
12 | @time: 11/15/18 21:04
13 | @description: helper script to run the spider from a single Python file
14 | """
15 | from scrapy import cmdline
16 |
17 | cmdline.execute('scrapy crawl bmw5'.split())
18 |
--------------------------------------------------------------------------------
/scrapy/qsbk/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/scrapy/qsbk/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/scrapy/qsbk/.idea/qsbk.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/scrapy/qsbk/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/scrapy/qsbk/qsbk/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/scrapy/qsbk/qsbk/__init__.py
--------------------------------------------------------------------------------
/scrapy/qsbk/qsbk/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | # purpose: define the data model
9 |
10 | import scrapy
11 |
12 |
13 | class QsbkItem(scrapy.Item):
14 | # define the fields for your item here like:
15 | # name = scrapy.Field()
16 | """
17 | 定义数据模型
18 | """
19 | # 段子作者
20 | author = scrapy.Field()
21 |
22 | # 段子内容
23 | content = scrapy.Field()
24 |
--------------------------------------------------------------------------------
/scrapy/qsbk/qsbk/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | # purpose: define the middlewares, i.e. the downloader middleware and the spider middleware
9 |
10 | from scrapy import signals
11 |
12 |
13 | class QsbkSpiderMiddleware(object):
14 | # Not all methods need to be defined. If a method is not defined,
15 | # scrapy acts as if the spider middleware does not modify the
16 | # passed objects.
17 |
18 | @classmethod
19 | def from_crawler(cls, crawler):
20 | # This method is used by Scrapy to create your spiders.
21 | s = cls()
22 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
23 | return s
24 |
25 | def process_spider_input(self, response, spider):
26 | # Called for each response that goes through the spider
27 | # middleware and into the spider.
28 |
29 | # Should return None or raise an exception.
30 | return None
31 |
32 | def process_spider_output(self, response, result, spider):
33 | # Called with the results returned from the Spider, after
34 | # it has processed the response.
35 |
36 | # Must return an iterable of Request, dict or Item objects.
37 | for i in result:
38 | yield i
39 |
40 | def process_spider_exception(self, response, exception, spider):
41 | # Called when a spider or process_spider_input() method
42 | # (from other spider middleware) raises an exception.
43 |
44 | # Should return either None or an iterable of Response, dict
45 | # or Item objects.
46 | pass
47 |
48 | def process_start_requests(self, start_requests, spider):
49 | # Called with the start requests of the spider, and works
50 | # similarly to the process_spider_output() method, except
51 | # that it doesn’t have a response associated.
52 |
53 | # Must return only requests (not items).
54 | for r in start_requests:
55 | yield r
56 |
57 | def spider_opened(self, spider):
58 | spider.logger.info('Spider opened: %s' % spider.name)
59 |
60 |
61 | class QsbkDownloaderMiddleware(object):
62 | # Not all methods need to be defined. If a method is not defined,
63 | # scrapy acts as if the downloader middleware does not modify the
64 | # passed objects.
65 |
66 | @classmethod
67 | def from_crawler(cls, crawler):
68 | # This method is used by Scrapy to create your spiders.
69 | s = cls()
70 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
71 | return s
72 |
73 | def process_request(self, request, spider):
74 | # Called for each request that goes through the downloader
75 | # middleware.
76 |
77 | # Must either:
78 | # - return None: continue processing this request
79 | # - or return a Response object
80 | # - or return a Request object
81 | # - or raise IgnoreRequest: process_exception() methods of
82 | # installed downloader middleware will be called
83 | return None
84 |
85 | def process_response(self, request, response, spider):
86 | # Called with the response returned from the downloader.
87 |
88 | # Must either;
89 | # - return a Response object
90 | # - return a Request object
91 | # - or raise IgnoreRequest
92 | return response
93 |
94 | def process_exception(self, request, exception, spider):
95 | # Called when a download handler or a process_request()
96 | # (from other downloader middleware) raises an exception.
97 |
98 | # Must either:
99 | # - return None: continue processing this exception
100 | # - return a Response object: stops process_exception() chain
101 | # - return a Request object: stops process_exception() chain
102 | pass
103 |
104 | def spider_opened(self, spider):
105 | spider.logger.info('Spider opened: %s' % spider.name)
106 |
--------------------------------------------------------------------------------
/scrapy/qsbk/qsbk/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | # purpose: save the data as JSON (XML/CSV work the same way, see the scrapy.exporters classes)
9 |
10 | import json
11 | from .items import QsbkItem
12 |
13 | from scrapy.exporters import JsonLinesItemExporter
14 |
15 | class QsbkPipeline(object):
16 |
17 | def __init__(self):
18 | # JsonLinesItemExporter needs a file opened in binary mode
19 | # note: a file opened in binary mode needs no encoding argument; a file opened in text mode would need one
20 | self.fp = open('duanzi.json', 'wb')
21 |
22 | # create the exporter
23 | self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
24 |
25 | def open_spider(self, spider):
26 | print('spider started...')
27 |
28 | def process_item(self, item, spider):
29 | self.exporter.export_item(item)
30 | return item
31 |
32 | def close_spider(self, spider):
33 | self.fp.close()
34 | print('spider finished.')
35 |
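A standalone sketch of the exporter wiring used above, with made-up item values, to show why the file is opened with `'wb'` (the exporter encodes each item itself and writes bytes):

```python
from scrapy.exporters import JsonLinesItemExporter

with open('demo.json', 'wb') as fp:   # binary mode: the exporter writes encoded bytes
    exporter = JsonLinesItemExporter(fp, ensure_ascii=False, encoding='utf-8')
    exporter.start_exporting()
    exporter.export_item({'author': '某网友', 'content': '一条段子……'})   # hypothetical record
    exporter.finish_exporting()

# demo.json now holds one JSON object per line, e.g. {"author": "某网友", "content": "一条段子……"}
```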
--------------------------------------------------------------------------------
/scrapy/qsbk/qsbk/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for qsbk project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | # purpose: spider configuration file
13 | # e.g. request headers, whether cookies are enabled, download delay
14 |
15 | BOT_NAME = 'qsbk'
16 |
17 | SPIDER_MODULES = ['qsbk.spiders']
18 | NEWSPIDER_MODULE = 'qsbk.spiders'
19 |
20 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
21 | # USER_AGENT = 'qsbk (+http://www.yourdomain.com)'
22 |
23 | # Obey robots.txt rules
24 | ROBOTSTXT_OBEY = False
25 |
26 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
27 | # CONCURRENT_REQUESTS = 32
28 |
29 | # Configure a delay for requests for the same website (default: 0)
30 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
31 | # See also autothrottle settings and docs
32 |
33 | # download delay
34 | # wait 1 second between requests
35 | DOWNLOAD_DELAY = 1
36 | # The download delay setting will honor only one of:
37 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16
38 | # CONCURRENT_REQUESTS_PER_IP = 16
39 |
40 | # Disable cookies (enabled by default)
41 | # COOKIES_ENABLED = False
42 |
43 | # Disable Telnet Console (enabled by default)
44 | # TELNETCONSOLE_ENABLED = False
45 |
46 | # Override the default request headers:
47 | DEFAULT_REQUEST_HEADERS = {
48 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
49 | 'Accept-Language': 'en',
50 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
51 | }
52 |
53 | # Enable or disable spider middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
55 | # SPIDER_MIDDLEWARES = {
56 | # 'qsbk.middlewares.QsbkSpiderMiddleware': 543,
57 | # }
58 |
59 | # Enable or disable downloader middlewares
60 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
61 | # DOWNLOADER_MIDDLEWARES = {
62 | # 'qsbk.middlewares.QsbkDownloaderMiddleware': 543,
63 | # }
64 |
65 | # Enable or disable extensions
66 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
67 | # EXTENSIONS = {
68 | # 'scrapy.extensions.telnet.TelnetConsole': None,
69 | # }
70 |
71 | # Configure item pipelines
72 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
73 | # 'qsbk.pipelines.QsbkPipeline' is the key; 300 is the priority. The smaller the value, the higher the priority.
74 | ITEM_PIPELINES = {
75 | 'qsbk.pipelines.QsbkPipeline': 300,
76 | }
77 |
78 | # Enable and configure the AutoThrottle extension (disabled by default)
79 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
80 | # AUTOTHROTTLE_ENABLED = True
81 | # The initial download delay
82 | # AUTOTHROTTLE_START_DELAY = 5
83 | # The maximum download delay to be set in case of high latencies
84 | # AUTOTHROTTLE_MAX_DELAY = 60
85 | # The average number of requests Scrapy should be sending in parallel to
86 | # each remote server
87 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
88 | # Enable showing throttling stats for every response received:
89 | # AUTOTHROTTLE_DEBUG = False
90 |
91 | # Enable and configure HTTP caching (disabled by default)
92 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
93 | # HTTPCACHE_ENABLED = True
94 | # HTTPCACHE_EXPIRATION_SECS = 0
95 | # HTTPCACHE_DIR = 'httpcache'
96 | # HTTPCACHE_IGNORE_HTTP_CODES = []
97 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
98 |
--------------------------------------------------------------------------------
/scrapy/qsbk/qsbk/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/scrapy/qsbk/qsbk/spiders/spider_qsbk.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from qsbk.items import QsbkItem
4 | from scrapy.http.response.html import HtmlResponse
5 | from scrapy.selector.unified import SelectorList, Selector
6 |
7 |
8 | # Crawl qiushibaike.com (a Chinese joke site) with scrapy
9 |
10 | class SpiderQsbkSpider(scrapy.Spider):
11 | name = 'spider_qsbk'
12 | allowed_domains = ['qiushibaike.com']
13 | start_urls = ['https://www.qiushibaike.com/text/page/1/']
14 | base_domain = "https://www.qiushibaike.com"
15 |
16 | def parse(self, response):
17 | """
18 | Parse the data returned by the downloader
19 | :param response: HtmlResponse
20 | :return:
21 | """
22 |
23 | # 1. grab all joke divs with XPath
24 | duan_zi_divs = response.xpath('//div[@id="content-left"]/div')
25 |
26 | # items = []
27 |
28 | # 2. parse each joke
29 | for duan_zi_div in duan_zi_divs:
30 | # 2.1 author
31 | author = duan_zi_div.xpath(".//h2/text()").get().strip()
32 |
33 | # 2.2 joke text
34 | content_pre = duan_zi_div.xpath(".//div[@class='content']//text()").getall() # list of text nodes
35 | content = "".join(content_pre).strip()
36 |
37 | # 2.3 assemble the data model
38 | item = QsbkItem(author=author, content=content)
39 |
40 | # 2.4 hand it to the pipelines as a generator
41 | yield item
42 |
43 | # look for the link to the next page
44 | next_url = None
45 | try:
46 | next_url = self.base_domain + response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()
47 | except Exception:
48 | pass
49 |
50 | # if there is no next page (i.e. this is the last page), just return
51 | if not next_url:
52 | return
53 | else:
54 | # crawl the next page
55 | yield scrapy.Request(next_url, callback=self.parse)
56 |
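A hedged alternative for the pagination tail of `parse()`: `response.urljoin()` resolves the relative href against the current page, so the manual `base_domain` concatenation and the broad `try/except` are not needed. Shown as a fragment of the method body, not a drop-in file:

```python
# inside parse(), after yielding the items of the current page
next_href = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()
if next_href:
    # urljoin handles both relative and absolute hrefs
    yield scrapy.Request(response.urljoin(next_href), callback=self.parse)
```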
--------------------------------------------------------------------------------
/scrapy/qsbk/readme.MD:
--------------------------------------------------------------------------------
1 | # Scraping qiushibaike.com with `Scrapy`
2 |
3 | 1. Adjust the `settings.py` configuration file
4 |
5 | ```
6 | # 1. set ROBOTSTXT_OBEY to False
7 | ROBOTSTXT_OBEY = False
8 |
9 | # 2. enable the default request headers
10 | DEFAULT_REQUEST_HEADERS = {
11 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
12 | 'Accept-Language': 'en',
13 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36',
14 | }
15 |
16 | # 3. enable the item pipeline so the data can be saved
17 | # 'qsbk.pipelines.QsbkPipeline' is the key; 300 is the priority; the smaller the value, the higher the priority
18 | ITEM_PIPELINES = {
19 | 'qsbk.pipelines.QsbkPipeline': 300,
20 | }
21 | ```
22 |
23 | 2. Write the spider code - `spiders/spider_xx.py`
24 |
25 | Parse the data returned by the downloader with `xpath`, then pass the items to the pipeline through a generator
26 |
27 | 3. Write the data model
28 |
29 | Define the item model so the fields are easy to manage
30 |
31 | 4. Write the `Pipeline`
32 |
33 | Write the code that saves the data
34 |
35 | Note: the pipeline has to be activated in `settings.py`
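If a `start.py` helper like the one in the other projects of this repository is wanted here as well, a sketch mirroring them would be:

```python
from scrapy import cmdline

cmdline.execute('scrapy crawl spider_qsbk'.split())
```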
--------------------------------------------------------------------------------
/scrapy/qsbk/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = qsbk.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = qsbk
12 |
--------------------------------------------------------------------------------
/scrapy/sfw_spider/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/scrapy/sfw_spider/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/scrapy/sfw_spider/.idea/sfw.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/scrapy/sfw_spider/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/scrapy/sfw_spider/requirements.txt:
--------------------------------------------------------------------------------
1 | aiohttp==3.4.4
2 | asn1crypto==0.24.0
3 | astroid==2.0.4
4 | async-timeout==3.0.1
5 | attrs==18.2.0
6 | Automat==0.7.0
7 | certifi==2018.10.15
8 | cffi==1.11.5
9 | chardet==3.0.4
10 | constantly==15.1.0
11 | cryptography==2.4.1
12 | cssselect==1.0.3
13 | dateparser==0.7.0
14 | demjson==2.2.4
15 | douyin==0.3.6
16 | hyperlink==18.0.0
17 | idna==2.7
18 | incremental==17.5.0
19 | isort==4.3.4
20 | lazy-object-proxy==1.3.1
21 | lxml==4.2.5
22 | mccabe==0.6.1
23 | motor==2.0.0
24 | multidict==4.5.2
25 | parsel==1.5.1
26 | Pillow==5.3.0
27 | pyasn1==0.4.4
28 | pyasn1-modules==0.2.2
29 | pycparser==2.19
30 | PyDispatcher==2.0.5
31 | PyHamcrest==1.9.0
32 | pylint==2.1.1
33 | pymongo==3.7.2
34 | PyMySQL==0.9.2
35 | pyOpenSSL==18.0.0
36 | python-dateutil==2.7.5
37 | pytz==2018.7
38 | queuelib==1.5.0
39 | regex==2018.11.22
40 | requests==2.19.1
41 | retrying==1.3.3
42 | Scrapy==1.5.1
43 | selenium==3.14.1
44 | service-identity==17.0.0
45 | six==1.11.0
46 | tqdm==4.28.1
47 | Twisted==18.9.0
48 | tzlocal==1.5.1
49 | urllib3==1.23
50 | w3lib==1.19.0
51 | wrapt==1.10.11
52 | yarl==1.2.6
53 | zope.interface==4.6.0
54 |
--------------------------------------------------------------------------------
/scrapy/sfw_spider/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = sfw.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = sfw
12 |
--------------------------------------------------------------------------------
/scrapy/sfw_spider/sfw/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/scrapy/sfw_spider/sfw/__init__.py
--------------------------------------------------------------------------------
/scrapy/sfw_spider/sfw/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class NewHouseItem(scrapy.Item):
12 | """
13 |     新房的数据模型【10个属性:省份、城市、小区名称、价格、几居室、面积、地址、区域、是否在售、详情页面url】
14 | """
15 | # 省份
16 | province = scrapy.Field()
17 | # 城市
18 | city = scrapy.Field()
19 | # 小区名称
20 | name = scrapy.Field()
21 | # 价格
22 | price = scrapy.Field()
23 | # 几居室【列表】【新房可能有多个房型】
24 | rooms = scrapy.Field()
25 | # 面积
26 | area = scrapy.Field()
27 | # 地址
28 | address = scrapy.Field()
29 | # 行政区
30 | district = scrapy.Field()
31 | # 是否在售
32 | sale = scrapy.Field()
33 | # 详情页面url
34 | origin_url = scrapy.Field()
35 |
36 |
37 | class ESFHouseItem(scrapy.Item):
38 | """
39 | 二手房数据模型【12个属性:省份、城市、小区名称、几室几厅、楼层、朝向、年代、地址、建筑面积、总价、单价、详情页面URL】
40 | """
41 | # 省份
42 | province = scrapy.Field()
43 | # 城市
44 | city = scrapy.Field()
45 | # 小区名称
46 | name = scrapy.Field()
47 | # 几室几厅
48 | rooms = scrapy.Field()
49 | # 楼层
50 | floor = scrapy.Field()
51 | # 朝向
52 | toward = scrapy.Field()
53 | # 年代
54 | year = scrapy.Field()
55 | # 地址
56 | address = scrapy.Field()
57 | # 建筑面积
58 | area = scrapy.Field()
59 | # 总价
60 | price = scrapy.Field()
61 | # 单价
62 | unit = scrapy.Field()
63 | # 详情页面url
64 | origin_url = scrapy.Field()
65 |
--------------------------------------------------------------------------------
/scrapy/sfw_spider/sfw/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class SfwSpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class SfwDownloaderMiddleware(object):
60 | # Not all methods need to be defined. If a method is not defined,
61 | # scrapy acts as if the downloader middleware does not modify the
62 | # passed objects.
63 |
64 | @classmethod
65 | def from_crawler(cls, crawler):
66 | # This method is used by Scrapy to create your spiders.
67 | s = cls()
68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 | return s
70 |
71 | def process_request(self, request, spider):
72 | # Called for each request that goes through the downloader
73 | # middleware.
74 |
75 | # Must either:
76 | # - return None: continue processing this request
77 | # - or return a Response object
78 | # - or return a Request object
79 | # - or raise IgnoreRequest: process_exception() methods of
80 | # installed downloader middleware will be called
81 | return None
82 |
83 | def process_response(self, request, response, spider):
84 | # Called with the response returned from the downloader.
85 |
86 | # Must either;
87 | # - return a Response object
88 | # - return a Request object
89 | # - or raise IgnoreRequest
90 | return response
91 |
92 | def process_exception(self, request, exception, spider):
93 | # Called when a download handler or a process_request()
94 | # (from other downloader middleware) raises an exception.
95 |
96 | # Must either:
97 | # - return None: continue processing this exception
98 | # - return a Response object: stops process_exception() chain
99 | # - return a Request object: stops process_exception() chain
100 | pass
101 |
102 | def spider_opened(self, spider):
103 | spider.logger.info('Spider opened: %s' % spider.name)
104 |
105 |
106 | # ===================================================================
107 | import random
108 |
109 |
110 | # 随机请求头
111 | # 自定义一个下载器中间件【Download Middlewares】【请求头】
112 | # 所有请求头可以参考:http://www.useragentstring.com/pages/useragentstring.php?typ=Browser
113 | class UserAgentDownloaderMiddleware(object):
114 | USER_AGENTS = [
115 | 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
116 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
117 | 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;',
118 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)',
119 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv,2.0.1) Gecko/20100101 Firefox/4.0.1',
120 | 'Mozilla/5.0 (Windows NT 6.1; rv,2.0.1) Gecko/20100101 Firefox/4.0.1'
121 | ]
122 |
123 | def process_request(self, request, spider):
124 | # 随机拿到一个请求头
125 | user_agent = random.choice(self.USER_AGENTS)
126 |
127 | # 设置到request
128 | request.headers['User-Agent'] = user_agent
129 |
130 | request.headers['Location'] = None
131 |
--------------------------------------------------------------------------------
/scrapy/sfw_spider/sfw/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | from scrapy.exporters import JsonLinesItemExporter
9 | from .items import NewHouseItem, ESFHouseItem
10 |
11 |
12 | class SfwPipeline(object):
13 |
14 | def __init__(self):
15 | self.fp_new_house = open('new_house.json', 'wb')
16 | self.fp_esf_house = open('esf_house.json', 'wb')
17 |
18 | self.exporter_new_house = JsonLinesItemExporter(self.fp_new_house, ensure_ascii=False)
19 | self.exporter_esf_house = JsonLinesItemExporter(self.fp_esf_house, ensure_ascii=False)
20 |
21 | def process_item(self, item, spider):
22 | if isinstance(item, NewHouseItem):
23 |             print('写入一条新房数据')
24 | self.exporter_new_house.export_item(item)
25 | else:
26 | print('写入一条二手房数据')
27 | self.exporter_esf_house.export_item(item)
28 | return item
29 |
30 | def close_spider(self, spider):
31 | self.fp_new_house.close()
32 | self.fp_esf_house.close()
33 |
--------------------------------------------------------------------------------
/scrapy/sfw_spider/sfw/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for sfw project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'sfw'
13 |
14 | SPIDER_MODULES = ['sfw.spiders']
15 | NEWSPIDER_MODULE = 'sfw.spiders'
16 |
17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
18 | # USER_AGENT = 'sfw (+http://www.yourdomain.com)'
19 |
20 | # Obey robots.txt rules
21 | ROBOTSTXT_OBEY = False
22 |
23 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
24 | # CONCURRENT_REQUESTS = 32
25 |
26 | # Configure a delay for requests for the same website (default: 0)
27 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
28 | # See also autothrottle settings and docs
29 | DOWNLOAD_DELAY = 3
30 | # The download delay setting will honor only one of:
31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16
32 | # CONCURRENT_REQUESTS_PER_IP = 16
33 |
34 | # Disable cookies (enabled by default)
35 | # COOKIES_ENABLED = False
36 |
37 | # Disable Telnet Console (enabled by default)
38 | # TELNETCONSOLE_ENABLED = False
39 |
40 | # Override the default request headers:
41 | DEFAULT_REQUEST_HEADERS = {
42 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
43 | 'Accept-Language': 'en',
44 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
45 | }
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | # SPIDER_MIDDLEWARES = {
50 | # 'sfw.middlewares.SfwSpiderMiddleware': 543,
51 | # }
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | DOWNLOADER_MIDDLEWARES = {
56 | 'sfw.middlewares.SfwDownloaderMiddleware': 543,
57 | 'sfw.middlewares.UserAgentDownloaderMiddleware': 500,
58 | }
59 |
60 | # Enable or disable extensions
61 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
62 | # EXTENSIONS = {
63 | # 'scrapy.extensions.telnet.TelnetConsole': None,
64 | # }
65 |
66 | # Configure item pipelines
67 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
68 | ITEM_PIPELINES = {
69 | 'sfw.pipelines.SfwPipeline': 300,
70 | }
71 |
72 | # Enable and configure the AutoThrottle extension (disabled by default)
73 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
74 | # AUTOTHROTTLE_ENABLED = True
75 | # The initial download delay
76 | # AUTOTHROTTLE_START_DELAY = 5
77 | # The maximum download delay to be set in case of high latencies
78 | # AUTOTHROTTLE_MAX_DELAY = 60
79 | # The average number of requests Scrapy should be sending in parallel to
80 | # each remote server
81 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
82 | # Enable showing throttling stats for every response received:
83 | # AUTOTHROTTLE_DEBUG = False
84 |
85 | # Enable and configure HTTP caching (disabled by default)
86 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
87 | # HTTPCACHE_ENABLED = True
88 | # HTTPCACHE_EXPIRATION_SECS = 0
89 | # HTTPCACHE_DIR = 'httpcache'
90 | # HTTPCACHE_IGNORE_HTTP_CODES = []
91 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
92 |
93 |
--------------------------------------------------------------------------------
/scrapy/sfw_spider/sfw/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/scrapy/sfw_spider/start.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: start.py
12 | @time: 11/15/18 21:04
13 | @description: launch the sfw spider via the scrapy command line
14 | """
15 | from scrapy import cmdline
16 |
17 | cmdline.execute('scrapy crawl sfw_spider'.split())
18 |
19 |
--------------------------------------------------------------------------------
/scrapy/weixin_community/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/scrapy/weixin_community/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/scrapy/weixin_community/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/scrapy/weixin_community/.idea/weixin_community.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/scrapy/weixin_community/readme.MD:
--------------------------------------------------------------------------------
1 | # Crawling the WeChat mini-program forum (微信小程序社区) with `CrawlSpider`
2 | 1. Create a project
3 | 
4 | ```
5 | scrapy startproject weixin_community
6 | ```
7 | 
8 | 2. Create a spider
9 | 
10 | ```
11 | # enter the project folder first
12 | cd weixin_community
13 | 
14 | # generate a CrawlSpider-based spider
15 | scrapy genspider -t crawl wx_spider "wxapp-union.com"
16 | ```
17 | 
18 | 3. Open the project with `PyCharm`
19 | 
20 | 4. Edit the `settings.py` file
21 | 
22 | ```
23 | ROBOTSTXT_OBEY = False
24 | 
25 | DOWNLOAD_DELAY = 1
26 | 
27 | DEFAULT_REQUEST_HEADERS = {
28 |   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
29 |   'Accept-Language': 'en',
30 |   'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
31 | }
32 | 
33 | ITEM_PIPELINES = {
34 |    'weixin_community.pipelines.WeixinCommunityPipeline': 300,
35 | }
36 | ```
37 | 
38 | 5. Write the spider
39 | 
40 | 6. Write the data model
41 | 
42 | 7. Write the `Pipeline`
43 | 
44 | 8. Run and test (see the run sketch at the end of this file)
46 |
47 |
--------------------------------------------------------------------------------
/scrapy/weixin_community/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 |
6 | [settings]
7 | default = weixin_community.settings
8 |
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = weixin_community
12 |
--------------------------------------------------------------------------------
/scrapy/weixin_community/weixin_community/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/scrapy/weixin_community/weixin_community/__init__.py
--------------------------------------------------------------------------------
/scrapy/weixin_community/weixin_community/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 |
8 | import scrapy
9 |
10 |
11 | class WeixinCommunityItem(scrapy.Item):
12 | title = scrapy.Field()
13 | author = scrapy.Field()
14 | pub_time = scrapy.Field()
15 | content = scrapy.Field()
16 |
--------------------------------------------------------------------------------
/scrapy/weixin_community/weixin_community/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 |
8 | from scrapy import signals
9 |
10 |
11 | class WeixinCommunitySpiderMiddleware(object):
12 | # Not all methods need to be defined. If a method is not defined,
13 | # scrapy acts as if the spider middleware does not modify the
14 | # passed objects.
15 |
16 | @classmethod
17 | def from_crawler(cls, crawler):
18 | # This method is used by Scrapy to create your spiders.
19 | s = cls()
20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21 | return s
22 |
23 | def process_spider_input(self, response, spider):
24 | # Called for each response that goes through the spider
25 | # middleware and into the spider.
26 |
27 | # Should return None or raise an exception.
28 | return None
29 |
30 | def process_spider_output(self, response, result, spider):
31 | # Called with the results returned from the Spider, after
32 | # it has processed the response.
33 |
34 | # Must return an iterable of Request, dict or Item objects.
35 | for i in result:
36 | yield i
37 |
38 | def process_spider_exception(self, response, exception, spider):
39 | # Called when a spider or process_spider_input() method
40 | # (from other spider middleware) raises an exception.
41 |
42 | # Should return either None or an iterable of Response, dict
43 | # or Item objects.
44 | pass
45 |
46 | def process_start_requests(self, start_requests, spider):
47 | # Called with the start requests of the spider, and works
48 | # similarly to the process_spider_output() method, except
49 | # that it doesn’t have a response associated.
50 |
51 | # Must return only requests (not items).
52 | for r in start_requests:
53 | yield r
54 |
55 | def spider_opened(self, spider):
56 | spider.logger.info('Spider opened: %s' % spider.name)
57 |
58 |
59 | class WeixinCommunityDownloaderMiddleware(object):
60 | # Not all methods need to be defined. If a method is not defined,
61 | # scrapy acts as if the downloader middleware does not modify the
62 | # passed objects.
63 |
64 | @classmethod
65 | def from_crawler(cls, crawler):
66 | # This method is used by Scrapy to create your spiders.
67 | s = cls()
68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69 | return s
70 |
71 | def process_request(self, request, spider):
72 | # Called for each request that goes through the downloader
73 | # middleware.
74 |
75 | # Must either:
76 | # - return None: continue processing this request
77 | # - or return a Response object
78 | # - or return a Request object
79 | # - or raise IgnoreRequest: process_exception() methods of
80 | # installed downloader middleware will be called
81 | return None
82 |
83 | def process_response(self, request, response, spider):
84 | # Called with the response returned from the downloader.
85 |
86 | # Must either;
87 | # - return a Response object
88 | # - return a Request object
89 | # - or raise IgnoreRequest
90 | return response
91 |
92 | def process_exception(self, request, exception, spider):
93 | # Called when a download handler or a process_request()
94 | # (from other downloader middleware) raises an exception.
95 |
96 | # Must either:
97 | # - return None: continue processing this exception
98 | # - return a Response object: stops process_exception() chain
99 | # - return a Request object: stops process_exception() chain
100 | pass
101 |
102 | def spider_opened(self, spider):
103 | spider.logger.info('Spider opened: %s' % spider.name)
104 |
--------------------------------------------------------------------------------
/scrapy/weixin_community/weixin_community/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 |
8 | from scrapy.exporters import JsonLinesItemExporter, JsonItemExporter
9 |
10 |
11 | # 由于数据量比较大,这里使用 JsonLinesItemExporter(逐条写入,每条数据占一行)
12 |
13 | class WeixinCommunityPipeline(object):
14 |
15 | def __init__(self):
16 | self.fp = open('wxjc.json', 'wb')
17 | self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
18 |
19 | def process_item(self, item, spider):
20 | # 获取一条item,就写入一条数据到文件中
21 | self.exporter.export_item(item)
22 | return item
23 |
24 | def close_spider(self, spider):
25 | self.fp.close()
26 |
27 |
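For contrast, a sketch of the same pipeline built on `JsonItemExporter` (imported above but unused): it writes all items into a single JSON array, which is only valid JSON once the crawl has finished, whereas the JSON-lines output above can be consumed line by line while the spider is still running. The class and file names below are illustrative only:

```python
from scrapy.exporters import JsonItemExporter


class WeixinCommunityJsonArrayPipeline(object):
    """Illustrative alternative that writes one JSON array instead of JSON lines."""

    def __init__(self):
        self.fp = open('wxjc_array.json', 'wb')
        self.exporter = JsonItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
        # start_exporting() writes the opening bracket of the array
        self.exporter.start_exporting()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        # finish_exporting() writes the closing bracket of the array
        self.exporter.finish_exporting()
        self.fp.close()
```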
--------------------------------------------------------------------------------
/scrapy/weixin_community/weixin_community/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # Scrapy settings for weixin_community project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used. You can find more settings consulting the documentation:
7 | #
8 | # https://doc.scrapy.org/en/latest/topics/settings.html
9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11 |
12 | BOT_NAME = 'weixin_community'
13 |
14 | SPIDER_MODULES = ['weixin_community.spiders']
15 | NEWSPIDER_MODULE = 'weixin_community.spiders'
16 |
17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent
18 | # USER_AGENT = 'weixin_community (+http://www.yourdomain.com)'
19 |
20 | # Obey robots.txt rules
21 | ROBOTSTXT_OBEY = False
22 |
23 | # Configure maximum concurrent requests performed by Scrapy (default: 16)
24 | # CONCURRENT_REQUESTS = 32
25 |
26 | # Configure a delay for requests for the same website (default: 0)
27 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
28 | # See also autothrottle settings and docs
29 | DOWNLOAD_DELAY = 1
30 | # The download delay setting will honor only one of:
31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16
32 | # CONCURRENT_REQUESTS_PER_IP = 16
33 |
34 | # Disable cookies (enabled by default)
35 | # COOKIES_ENABLED = False
36 |
37 | # Disable Telnet Console (enabled by default)
38 | # TELNETCONSOLE_ENABLED = False
39 |
40 | # Override the default request headers:
41 | DEFAULT_REQUEST_HEADERS = {
42 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
43 | 'Accept-Language': 'en',
44 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
45 | }
46 |
47 | # Enable or disable spider middlewares
48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49 | # SPIDER_MIDDLEWARES = {
50 | # 'weixin_community.middlewares.WeixinCommunitySpiderMiddleware': 543,
51 | # }
52 |
53 | # Enable or disable downloader middlewares
54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55 | # DOWNLOADER_MIDDLEWARES = {
56 | # 'weixin_community.middlewares.WeixinCommunityDownloaderMiddleware': 543,
57 | # }
58 |
59 | # Enable or disable extensions
60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html
61 | # EXTENSIONS = {
62 | # 'scrapy.extensions.telnet.TelnetConsole': None,
63 | # }
64 |
65 | # Configure item pipelines
66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67 | ITEM_PIPELINES = {
68 | 'weixin_community.pipelines.WeixinCommunityPipeline': 300
69 | }
70 |
71 | # Enable and configure the AutoThrottle extension (disabled by default)
72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73 | # AUTOTHROTTLE_ENABLED = True
74 | # The initial download delay
75 | # AUTOTHROTTLE_START_DELAY = 5
76 | # The maximum download delay to be set in case of high latencies
77 | # AUTOTHROTTLE_MAX_DELAY = 60
78 | # The average number of requests Scrapy should be sending in parallel to
79 | # each remote server
80 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81 | # Enable showing throttling stats for every response received:
82 | # AUTOTHROTTLE_DEBUG = False
83 |
84 | # Enable and configure HTTP caching (disabled by default)
85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86 | # HTTPCACHE_ENABLED = True
87 | # HTTPCACHE_EXPIRATION_SECS = 0
88 | # HTTPCACHE_DIR = 'httpcache'
89 | # HTTPCACHE_IGNORE_HTTP_CODES = []
90 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
91 |
--------------------------------------------------------------------------------
/scrapy/weixin_community/weixin_community/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 |
--------------------------------------------------------------------------------
/scrapy/weixin_community/weixin_community/spiders/wx_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | from scrapy.linkextractors import LinkExtractor
4 | from scrapy.spiders import CrawlSpider, Rule
5 | from weixin_community.items import WeixinCommunityItem
6 |
7 |
8 | class WxSpiderSpider(CrawlSpider):
9 | name = 'wx_spider'
10 | allowed_domains = ['wxapp-union.com']
11 | # 起始页从第 1 页开始
12 | start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1']
13 |
14 | # 定义规则
15 | rules = (
16 | # 列表【页面】
17 | Rule(LinkExtractor(allow=r'.+mod=list&catid=2&page=\d'), follow=True),
18 |
19 | # 详情【页面】
20 | Rule(LinkExtractor(allow=r'article-.+\.html'), callback='parse_detail', follow=False)
21 | )
22 |
23 |
24 | def parse_detail(self, response):
25 | # 标题
26 | title = response.xpath('//h1[@class="ph"]/text()').get()
27 |
28 | # p 标签元素
29 | author_element_p = response.xpath('//p[@class="authors"]')
30 |
31 | # 作者
32 | author = author_element_p.xpath('./a/text()').get()
33 |
34 | # 发布时间
35 | pub_time = author_element_p.xpath('./span/text()').get()
36 |
37 | # 内容
38 | content_pre = response.xpath('//td[@id="article_content"]//text()').getall()
39 |
40 | content = "".join(content_pre).strip()
41 |
42 | # 把解析完的数据交个 Pipline 去处理
43 | yield WeixinCommunityItem(title=title, author=author, pub_time=pub_time, content=content)
44 |
--------------------------------------------------------------------------------
/spiders/film_xinpianchang/models.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: models.py
12 | @time: 12/15/18 23:08
13 | @description:数据模型
14 | """
15 |
16 | from datetime import datetime
17 | from mongoengine import StringField, URLField, IntField, Document, connect
18 |
19 | __author__ = 'xag'
20 |
21 | response = connect('admin', host='localhost', port=27017, username='root', password='xag')
22 |
23 |
24 | class FilmModel(Document):
25 | """
26 | 电影【模型】
27 | """
28 | title = StringField() # 电影标题
29 | type = StringField() # 电影类型
30 | play_num = StringField() # 播放量
31 | like_num = StringField() # 喜欢数
32 | img_cover = URLField() # 封面地址
33 | play_address = URLField() # 播放地址
34 | download_address = URLField() # 下载地址
35 |
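A minimal usage sketch for this model, assuming the MongoDB instance configured by `connect()` above is reachable; the field values below are placeholders:

```python
# hypothetical usage of FilmModel; all values are placeholders
film = FilmModel(
    title='Sample Film',
    type='Documentary',
    play_num='12000',
    like_num='300',
    img_cover='https://example.com/cover.jpg',
    play_address='https://example.com/play',
    download_address='https://example.com/download',
)
film.save()  # persists the document into the database configured by connect()

# query it back by title
print(FilmModel.objects(title='Sample Film').first())
```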
--------------------------------------------------------------------------------
/spiders/film_xinpianchang/tools_file.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: tools_file.py
12 | @time: 1/29/19 16:29
13 | @description:文件夹工具类
14 | """
15 | import os
16 |
17 |
18 | def mkdir(path):
19 | """
20 | 新建一个目录
21 | :param path:完整路径
22 | :return:
23 | """
24 | if not os.path.exists(path):
25 | os.makedirs(path)
26 |
27 | return path
28 |
--------------------------------------------------------------------------------
/spiders/film_xinpianchang/tools_string.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: tools_string.py
12 | @time: 1/28/19 23:50
13 | @description:TODO
14 | """
15 |
16 | import random
17 | import string
18 |
19 |
20 | def remove_space(str):
21 | return ''.join(str.split(' ')).replace("\t", '').replace("\n", '')
22 |
23 |
24 | def make_random_string(num):
25 | """
26 | 生成随机字符串
27 | :param num:
28 | :return:
29 | """
30 | return ''.join(random.sample(string.ascii_letters + string.digits, num))
31 |
--------------------------------------------------------------------------------
/spiders/spider_bai_si_bu_de_jie.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: spider_bai_si_bu_de_jie.py
12 | @time: 2018/9/25 19:58
13 | @description:利用多线程爬取【百思不得姐】网站的段子,文字写入 csv 文件,图片下载到本地
14 | """
15 |
16 | import requests
17 | from lxml import etree
18 | import threading
19 | from queue import Queue
20 | import time
21 | import csv
22 | from urllib import request
23 | import fileutils
24 |
25 | HEADERS = {
26 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
27 | 'Referer': 'http://www.budejie.com/hot/1'
28 | }
29 |
30 |
31 | class BSSpider(threading.Thread):
32 | """
33 | 爬取每一页的数据
34 | """
35 |
36 | def __init__(self, page_queue, joke_queue, name, *args, **kwargs):
37 | super(BSSpider, self).__init__(*args, **kwargs)
38 |
39 | # 1.初始化数据
40 | self.page_queue = page_queue
41 | self.joke_queue = joke_queue
42 | self.name = name
43 |
44 | def run(self):
45 | while True:
46 | # 2.如果页面队列为空,就退出循环
47 | if self.page_queue.empty():
48 | print(self.name + '任务完成~')
49 | # while not self.joke_queue.empty():
50 | # print(self.joke_queue.get())
51 | break
52 |
53 | # 3.从队列中获取页面地址
54 | page_url = self.page_queue.get()
55 | self.spider_page(page_url)
56 |
57 | # 6.休眠0.5秒
58 | time.sleep(0.5)
59 |
60 | def spider_page(self, page_url):
61 | """
62 | 爬取一页的数据
63 | :param page_url:页面的url
64 | :return:
65 | """
66 | response = requests.get(page_url, headers=HEADERS)
67 | text_raw = response.text
68 | html_element = etree.HTML(text_raw)
69 |
70 | # 4.利用xpath去解析数据
71 | div_elements = html_element.xpath('//div[@class="j-r-list"]')
72 |
73 | for div_element in div_elements:
74 | duan_zi_elments = div_element.xpath('./ul/li')
75 | for duan_zi_elment in duan_zi_elments:
76 | # 【数据】用户名
77 | username = duan_zi_elment.xpath('.//a[@class="u-user-name"]/text()')[0]
78 |
79 | # 【数据】段子发布时间
80 | pubtime = duan_zi_elment.xpath('.//span/text()')[0]
81 |
82 | desc_element = duan_zi_elment.xpath('.//div[@class="j-r-list-c-desc"]')[0]
83 | # 【数据】段子描述内容
84 | content = desc_element.xpath('./a/text()')[0]
85 |
86 | img_div_element = duan_zi_elment.xpath('.//div[@class="j-r-list-c-img"]')[0]
87 | img = img_div_element.xpath('.//img/@data-original')[0]
88 | alt = img_div_element.xpath('.//img/@alt')[0]
89 |
90 | # 5.把解析后的数据以元组的方式放入到队列中去
91 | self.joke_queue.put((username, content, img, alt, pubtime))
92 |
93 |
94 | class BSWriter(threading.Thread):
95 | """
96 | 下载图片、写入文字数据到csv文件中
97 | """
98 |
99 | def __init__(self, page_queue, joke_queue, writer, gLock, name, *args, **kwargs):
100 | super(BSWriter, self).__init__(*args, **kwargs)
101 |
102 | # 1.初始化
103 | self.page_queue = page_queue
104 | self.joke_queue = joke_queue
105 | self.writer = writer
106 | self.gLock = gLock
107 | self.name = name
108 |
109 | def run(self):
110 | while True:
111 | if self.joke_queue.empty() and self.page_queue.empty():
112 | print(self.name + '任务完成~')
113 | break
114 |
115 | # 2.从joke_queue队列中获取数据
116 | joke_info = self.joke_queue.get(timeout=40)
117 | username, content, img, alt, pubtime = joke_info
118 |
119 | # 3.上锁
120 | self.gLock.acquire()
121 |
122 | # 4.写入数据到csv中
123 | self.writer.writerow((username, content, img, alt, pubtime))
124 |
125 | # 5.下载图片到本地
126 | # file_name = alt + fileutils.get_file_suffix(img)
127 | # request.urlretrieve(img, './imgs/%s' % file_name)
128 |
129 | # 6.释放锁
130 | self.gLock.release()
131 |
132 | print('写入一条数据成功')
133 |
134 |
135 | class BSDownImg(threading.Thread):
136 | """
137 | 下载图片的消费者
138 | """
139 |
140 | def __init__(self, page_queue, joke_queue, gLock, name, *args, **kwargs):
141 | super(BSDownImg, self).__init__(*args, **kwargs)
142 | self.page_queue = page_queue
143 | self.joke_queue = joke_queue
144 | self.gLock = gLock
145 | self.name = name
146 |
147 | def run(self):
148 | while True:
149 | if self.joke_queue.empty() and self.page_queue.empty():
150 | print(self.name + '任务完成~')
151 | break
152 | username, content, img, alt, pubtime = self.joke_queue.get(timeout=40)
153 |
154 | # 上锁并下载图片
155 | self.gLock.acquire()
156 | file_name = alt + fileutils.get_file_suffix(img)
157 | request.urlretrieve(img, './imgs/%s' % file_name)
158 | self.gLock.release()
159 |
160 | print('下载一张图片成功')
161 |
162 |
163 | def spider():
164 | """
165 |     爬取百思不得姐的前10页数据
166 | :return:
167 | """
168 |
169 | # 1.构建队列【生产者、消费者需要上锁的对象】
170 | page_queue = Queue(20)
171 | joke_queue = Queue(200)
172 |
173 | # 2.锁对象
174 | gLock = threading.Lock()
175 |
176 | # 3.写入
177 | fp = open('jokes.csv', 'a', newline='', encoding='utf-8')
178 | writer = csv.writer(fp)
179 |
180 | # 4.写入csv表头信息
181 | writer.writerow(['username', 'content', 'img', 'alt', 'pubtime'])
182 |
183 | # 5.前10页待爬取的地址,放入到队列中
184 | for page_num in range(1, 11):
185 | page_url = 'http://www.budejie.com/hot/%d' % page_num
186 | page_queue.put(page_url)
187 |
188 |     # 6.构建 5 个生产者来进行爬取
189 | for x in range(1, 6):
190 | t = BSSpider(page_queue, joke_queue, name='生产者%d' % x)
191 | t.start()
192 |
193 | # 7.构建 20 个消费者来写入数据到csv文件中
194 | for x in range(1, 21):
195 | t = BSWriter(page_queue, joke_queue, writer, gLock, name='消费者-文字%d' % x)
196 | t.start()
197 |
198 | # 8.构建 50 个消费者来下载图片
199 | for x in range(1, 51):
200 | t = BSDownImg(page_queue, joke_queue, gLock, name='消费者-图片%d' % x)
201 | t.start()
202 |
203 |
204 | if __name__ == '__main__':
205 | spider()
206 |
--------------------------------------------------------------------------------
/spiders/spider_boss.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: spider_boss.py
12 | @time: 2018/10/12 10:17
13 | @description:使用selenium爬取boss直聘网并写入到csv文件中
14 | """
15 |
16 | from selenium import webdriver
17 | import re
18 | from lxml import etree
19 | import requests
20 | import time
21 | import string_utils
22 | import csv
23 |
24 | current_page = 1
25 |
26 |
27 | class BossSpider(object):
28 | driver_path = "/usr/local/bin/chromedriver"
29 |
30 | def __init__(self):
31 | self.driver = webdriver.Chrome(executable_path=BossSpider.driver_path)
32 |
33 | # 网页前缀
34 | self.domain = 'https://www.zhipin.com'
35 |
36 |         # 待爬取的首页
37 | self.url = 'https://www.zhipin.com/job_detail/?query=python&scity=100010000&industry=&position='
38 |
39 | self.positions = []
40 |
41 | # 保存数据到 csv 文件中【追加】
42 | fp = open('positions.csv', 'a', newline='', encoding='utf-8')
43 | self.writer = csv.DictWriter(fp, ['company_name', 'name', 'salary', 'city', 'work_years', 'education', 'desc'])
44 | self.writer.writeheader()
45 |
46 | def run(self):
47 | self.driver.get(self.url)
48 |
49 | global current_page
50 |
51 | while True:
52 |
53 | print('爬取第%d页数据' % current_page)
54 | current_page = current_page + 1
55 |
56 |             # 获取当前页面的源码内容
57 | source = self.driver.page_source
58 |
59 |             # 爬取当前页面的数据
60 | self.parse_current_page(source)
61 |
62 | next_bt = self.driver.find_element_by_xpath("//a[@ka='page-next']")
63 |
64 | if 'disabled' in next_bt.get_attribute("class"):
65 | # 最后一页,爬取完成之后,退出应用
66 | self.driver.quit()
67 | break
68 | else:
69 | next_bt.click()
70 |
71 | time.sleep(1)
72 |
73 | # 由于boss直聘做了反爬【验证码】,这里只爬取一页数据
74 | break
75 |
76 | def parse_current_page(self, source):
77 | """
78 |         解析当前页面的数据,获取详情页面的 url:detail_url
79 | :param source:
80 | :return:
81 | """
82 | html = etree.HTML(source)
83 |
84 |         # 获取每一个职位的详情页地址
85 | detail_urls_pre = html.xpath('//div[@class="info-primary"]//a/@href')
86 | # links = html.xpath("//div[@class='info-primary']//a[position()=1]/@href")
87 |
88 | # 利用lambda + map 对职位详情地址列表加入前缀
89 | detail_urls = list(map(lambda x: self.domain + x, detail_urls_pre))
90 |
91 | # 爬取详情页面的数据
92 | for detail_url in detail_urls:
93 | self.request_detail_page(detail_url)
94 |
95 | time.sleep(1)
96 |
97 | def request_detail_page(self, detail_url):
98 | """
99 | 打开职位详情页面
100 | :param detail_url:
101 | :return:
102 | """
103 |
104 | # 1.切换到详情页面窗口
105 | self.driver.execute_script("window.open('%s')" % (detail_url))
106 | self.driver.switch_to.window(self.driver.window_handles[1])
107 |
108 | # 2.获取详情页面的源码数据
109 | page_source_detail = self.driver.page_source
110 |
111 | # 3.解析详情页面
112 | self.parse_detail_page(page_source_detail)
113 |
114 | # 4.关闭当前窗口并切换回列表
115 | self.driver.close()
116 |
117 | self.driver.switch_to.window(self.driver.window_handles[0])
118 |
119 | def parse_detail_page(self, page_source_detail):
120 | """
121 | 解析职位详情页面
122 | :param page_source_detail:
123 | :return:
124 | """
125 | html = etree.HTML(page_source_detail)
126 |
127 | # 数据 - 名称
128 | name = html.xpath('//h1/text()')[0]
129 |
130 | # 数据 - 公司名称
131 | company_name = html.xpath('//h3[@class="name"]/a[@ka="job-detail-company"]/text()')[0].strip()
132 |
133 | # 数据 - 薪水
134 | salary = html.xpath("//div[@class='name']/span[@class='badge']/text()")[0].strip()
135 |
136 | # 数据 - info
137 | infos = html.xpath("//div[@class='job-primary detail-box']/div[@class='info-primary']/p/text()")
138 |
139 | desc_pre = html.xpath('//div[@class="job-sec"]/div[@class="text"]/text()')
140 |
141 | # 每一项换行,去掉前后空格,最后去掉特殊符号
142 | desc = string_utils.remove_special_word('\n'.join(desc_pre).strip())
143 |
144 | city = infos[0]
145 | work_years = infos[1]
146 | education = infos[2]
147 |
148 | position = {
149 | 'company_name': company_name,
150 | 'name': name,
151 | 'salary': salary,
152 | 'city': city,
153 | 'work_years': work_years,
154 | 'education': education,
155 | 'desc': desc
156 |
157 | }
158 | print('爬取一条数据成功')
159 | print("==" * 40)
160 |
161 | # 写入到csv文件中
162 | self.write_to_csv(position)
163 |
164 | self.positions.append(position)
165 |
166 | def write_to_csv(self, position):
167 | """
168 | 把职位信息写入到 csv 文件中
169 | :param position:
170 | :return:
171 | """
172 | self.writer.writerow(position)
173 |
174 |
175 | if __name__ == '__main__':
176 | # 定义爬虫类
177 | spider = BossSpider()
178 |
179 | # 开始执行爬虫
180 | spider.run()
181 |
182 | # 写入到csv文件中
183 |
184 | # 查看数据
185 | print('恭喜!爬取数据完成~')
186 | print(spider.positions)
187 |
--------------------------------------------------------------------------------
/spiders/spider_china_weather.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: spider_china_weather.py
12 | @time: 2018/9/20 0:04
13 | @description:利用requests + bs4 + html5lib + pyecharts爬取中国天气网的最低气温并可视化
14 | @install:# pip install pyecharts/pyecharts-snapshot
15 | """
16 |
17 | import requests
18 | from bs4 import BeautifulSoup
19 | import time
20 | from pyecharts import Bar
21 |
22 |
23 | # 一共8个区域,包含:华北、东北、华东、华中、华南、西北、西南、港澳台
24 | # 华北
25 | url_hb = 'http://www.weather.com.cn/textFC/hb.shtml'
26 |
27 | # 东北
28 | url_db = 'http://www.weather.com.cn/textFC/db.shtml'
29 |
30 | # 华东
31 | url_hd = 'http://www.weather.com.cn/textFC/hd.shtml'
32 |
33 | # 华中
34 | url_hz = 'http://www.weather.com.cn/textFC/hz.shtml'
35 |
36 | # 华南
37 | url_hn = 'http://www.weather.com.cn/textFC/hn.shtml'
38 |
39 | # 西北
40 | url_xb = 'http://www.weather.com.cn/textFC/xb.shtml'
41 |
42 | # 西南
43 | url_xn = 'http://www.weather.com.cn/textFC/xn.shtml'
44 |
45 | # 港澳台【比较特殊】
46 | url_gat = 'http://www.weather.com.cn/textFC/gat.shtml'
47 |
48 | url_areas = [url_hb, url_db, url_hd, url_hz, url_hn, url_xb, url_xn, url_gat]
49 |
50 | HEADERS = {
51 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
52 | 'Referer': 'http://www.weather.com.cn/textFC/hb.shtml'
53 | }
54 |
55 | # 数据【城市+最低温度】列表
56 | ALL_DATA = []
57 |
58 |
59 | def parse_page(url):
60 | """
61 | 解析一个区域:华北、东北、华东等
62 | :param url:
63 | :return:
64 | """
65 | response = requests.get(url, headers=HEADERS)
66 |
67 | # 1.获取页面的原始html数据
68 | text = response.content.decode('utf-8')
69 |
70 |     # 2.构建 BeautifulSoup 对象
71 | # 注意:港澳台中香港的table标签没有正确的关闭,使用lxml解析器不能正确解析。需要使用html5lib【容错性强】去自动补全代码,然后进行解析
72 | soup = BeautifulSoup(text, 'html5lib')
73 |
74 | div_conMidtab = soup.find('div', class_='conMidtab')
75 |
76 | # 3.获取所有的table子Tag【天气信息都保存在table标签下面】
77 | tables = div_conMidtab.find_all('table')
78 |
79 | # 4.遍历片区下面的省份
80 | for table in tables:
81 | # 4.1过滤掉表头的两个tr数据
82 | trs = table.find_all('tr')[2:]
83 |
84 | # 5.遍历省份下面的市区
85 | for index, tr in enumerate(trs):
86 | tds = tr.find_all('td')
87 |
88 | # 5.1 城市名称【第 1 个td标签】
89 | # 注意:一个省份第一个城市取第 2 个td标签;其余城市取第 1 个td标签
90 | city_td = tds[1] if index == 0 else tds[0]
91 |
92 | city = list(city_td.stripped_strings)[0]
93 |
94 | # 5.2 最低气温【倒数第 2 个td标签】
95 | temp_low_td = tds[-2]
96 |
97 | temp_low = list(temp_low_td.stripped_strings)[0]
98 |
99 | ALL_DATA.append({"city": city, "temp_low": int(temp_low)})
100 |
101 |
102 | def spider():
103 | for index, url in enumerate(url_areas):
104 | print('开始爬取第{}个区域'.format(index + 1))
105 | parse_page(url)
106 | time.sleep(1)
107 |
108 |
109 | def analysis_data():
110 | """
111 | 分析爬下来的数据
112 | :return:
113 | """
114 |
115 | # 1.默认的排序方式是升序【通过最低气温进行排序】
116 | ALL_DATA.sort(key=lambda data: data['temp_low'])
117 |
118 | # 2.获取前面10条数据
119 | top_10 = ALL_DATA[:10]
120 |
121 | return top_10
122 |
123 |
124 | def show_with_chart(top_10):
125 | """
126 |     把最低气温的十个城市和温度生成柱状图
127 | :param top_10:
128 | :return:
129 | """
130 | # 1.获取城市列表
131 | citys = list(map(lambda item: item['city'], top_10))
132 |
133 | # 2.最低温度列表
134 | temp_lows = list(map(lambda item: item['temp_low'], top_10))
135 |
136 |     # 3.生成柱状图并渲染到html文件中
137 | bar = Bar("最低气温排行榜")
138 |
139 | bar.add("最低温度", citys, temp_lows)
140 |
141 | # 渲染
142 | bar.render('temperature.html')
143 |
144 |
145 | if __name__ == '__main__':
146 | # 1.爬取数据
147 | spider()
148 |
149 | # 2.分析数据
150 | top_10 = analysis_data()
151 |
152 |     # 3.使用pyecharts生成柱状图
153 | show_with_chart(top_10)
154 |
--------------------------------------------------------------------------------
/spiders/spider_dou_tu_la.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: spider_dou_tu_la
12 | @time: 2018/9/25 14:40
13 | @description:多线程去爬取斗图啦网站的表情
14 | @spider_to:http://www.doutula.com/
15 | """
16 |
17 | import requests
18 | from lxml import etree
19 | from urllib import request
20 | import re
21 | import os
22 | import threading
23 | from queue import Queue
24 | import time
25 |
26 | # 技术点
27 | # 1.使用request是获取html数据
28 | # 2.使用xpath解析数据
29 | # 3.使用正则表达式sub()函数过滤掉特殊的字符
30 | # 4.使用urllib.request.urlretrieve()下载图片
31 | # 5.生产者和消费者模式分离
32 | # 6.使用queue[线程安全]去保存【每一页的爬取地址】和【表情图片地址】
33 |
34 | HEADERS = {
35 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
36 | }
37 |
38 |
39 | class Procuder(threading.Thread):
40 | """
41 | 生产者
42 | 爬取页面,获取图片地址加入到表情图片队列中
43 | """
44 |
45 | def __init__(self, name, page_queue, img_queue, *args, **kwargs):
46 | super(Procuder, self).__init__(*args, **kwargs)
47 | self.name = name
48 | self.page_queue = page_queue
49 | self.img_queue = img_queue
50 |
51 | def run(self):
52 | while True:
53 | if self.page_queue.empty():
54 | print(self.name + '任务完成~')
55 | break
56 | # 1.获取每一页的url
57 | page_url = self.page_queue.get()
58 |
59 | # 2.爬取页面的数据
60 | self.spider_page(page_url)
61 |
62 | # 3.休眠0.5秒
63 | time.sleep(0.5)
64 |
65 | def spider_page(self, url):
66 | """
67 | 爬取每一页
68 | :param url: 每一页的地址
69 | :return:
70 | """
71 | response = requests.get(url, headers=HEADERS)
72 | text_raw = response.text
73 |
74 | # 1.使用etree
75 | html_raw = etree.HTML(text_raw)
76 |
77 | # 2.使用xpath解析数据
78 | # 注意:过滤掉gif标签图片
79 | imgs = html_raw.xpath('//div[@class="page-content text-center"]//img[@class!="gif"]')
80 |
81 | # 3.获取图片的实际连接并下载到本地
82 | for img in imgs:
83 | # 3.1 图片的实际地址
84 | img_url = img.get('data-original')
85 |
86 | # 3.2 图片名称替换特殊符号
87 | alt = re.sub(r'[\??\.,。!!\*]', '', img.get('alt'))
88 |
89 | # 3.3 提取图片的后缀,组装成文件的名字
90 | img_name = alt + os.path.splitext(img_url)[-1]
91 |
92 | # 3.4 把爬取到的表情【图片地址+图片名称】以【元组】的形式加入到队列图片队列中
93 | self.img_queue.put((img_url, img_name))
94 |
95 |
96 | class Consumer(threading.Thread):
97 | """
98 | 消费者
99 | 获取图片的地址下载到本地
100 | """
101 |
102 | def __init__(self, name, page_queue, img_queue, *args, **kwargs):
103 | super(Consumer, self).__init__(*args, **kwargs)
104 | self.name = name
105 | self.page_queue = page_queue
106 | self.img_queue = img_queue
107 |
108 | def run(self):
109 | while True:
110 |
111 | if self.img_queue.empty() and self.page_queue.empty():
112 | print(self.name + '任务完成~')
113 | break
114 |
115 | # 1.解包,获取图片的地址 + 图片的名称
116 | img_url, img_name = self.img_queue.get()
117 |
118 | # 2.使用urlretrieve()函数下载图片到本地
119 | request.urlretrieve(img_url, './imgs/%s' % img_name)
120 |
121 | print(img_name + "下载完成")
122 |
123 |
124 | def spider():
125 | # 1.页面的队列
126 | page_queue = Queue(100)
127 |
128 | # 2.表情图片的队列
129 | # 注意:队列的大小尽量设置大一些,保证线程减少等待的时间
130 | img_queue = Queue(1000)
131 |
132 | # 3.爬取页面的地址
133 | for x in range(1, 10):
134 | url = 'http://www.doutula.com/photo/list/?page=%d' % x
135 |
136 | # 3.1 存入到页面地址队列中
137 | page_queue.put(url)
138 |
139 |     # 创建 5 个生产者和 5 个消费者
140 | # 生产者:爬取每一页的数据,获取表情图片的url
141 | # 消费者:从表情队列中获取表情图片的实际地址并下载到本地
142 | for x in range(5):
143 | t = Procuder(name='生产线程-%d' % x, page_queue=page_queue, img_queue=img_queue)
144 | t.start()
145 |
146 | for x in range(5):
147 | t = Consumer(name='消费线程-%d' % x, page_queue=page_queue, img_queue=img_queue)
148 | t.start()
149 |
150 |
151 | if __name__ == '__main__':
152 | spider()
153 |
--------------------------------------------------------------------------------
/spiders/spider_dytt.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: spider_dytt.py
12 | @time: 2018/9/16 18:46
13 | @description:爬电影天堂【 lxml + xpath + requests】【2018新片精品,包含更多】
14 | """
15 |
16 | import requests
17 | from lxml import etree
18 | import time
19 |
20 | # url = 'http://www.dytt8.net/html/gndy/dyzz/list_23_1.html'
21 |
22 | # 主页地址
23 | BASE_DOMAIN = 'http://www.dytt8.net'
24 |
25 | HEADERS = {
26 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
27 | }
28 |
29 |
30 | def get_detail_urls(url):
31 | """
32 | 获取电影详情页面的url
33 | :param url: 每一页电影列表的地址url
34 | :return:
35 | """
36 | response = requests.get(url, headers=HEADERS)
37 |
38 | # 注意:右键查看源代码,charset=gb2312" 编码方式【网站编码不规范,解码必须用响应的编码方式进行解码】
39 | # print(response.content.decode('gbk'))
40 |
41 | # html_element = etree.HTML(response.content.decode('gbk'))
42 |
43 | # 注意:电影天堂第3页使用默认的gbk会有乱码,这里使用默认的解码方式【href为英文,解析不会受影响】
44 | html_element = etree.HTML(response.text)
45 |
46 | # 【数据 - 字符串列表】详情页面地址
47 | # 所有class为tbspan的table标签/子孙标签中的a标签的href属性
48 | detail_urls = html_element.xpath('//table[@class="tbspan"]//a/@href')
49 |
50 |     # 复制一份列表数据,以便一边遍历副本,一边从原列表中删除脏数据
51 |     # 过滤掉【综合电影】导致的脏数据
52 |     detail_urls_new = detail_urls[:]
53 | for index, detail_url in enumerate(detail_urls_new):
54 | if detail_url == '/html/gndy/jddy/index.html':
55 | detail_urls.remove(detail_url)
56 |
57 | # print(detail_urls)
58 |
59 | # print(BASE_DOMAIN + detail_url)
60 | # 组装详情页面的地址
61 | detail_urls = map(lambda x: BASE_DOMAIN + x, detail_urls)
62 |
63 | return detail_urls
64 |
65 |
66 | def parse_detail_page(detail_url):
67 | """
68 | 解析电影详情页面
69 | :param detail_url: 详情页面的地址
70 | :return:
71 | """
72 | response = requests.get(detail_url, headers=HEADERS)
73 | text = response.content.decode('gbk')
74 | html_element = etree.HTML(text)
75 |
76 | # 【数据 - 电影标题】
77 | title = html_element.xpath('//div[@class="title_all"]//font[@color="#07519a"]/text()')[0]
78 |
79 | # 获取zoom标签
80 | zoom_element = html_element.xpath('//div[@id="Zoom"]')[0]
81 |
82 | # 【数据 - 电影封面和电影截图】
83 | imgs = zoom_element.xpath(".//img/@src")
84 |
85 | # 注意:为了避免脏数据导致应用挂掉,提前初始化
86 |     year, country, type, rating, duration, director, actors, cover, screen_shot, download_url, desc = '', '', '', '', '', '', '', '', '', '', ''
87 |
88 | if len(imgs) > 0:
89 | cover = imgs[0]
90 |
91 | # 【数据 - 电影截图】
92 | if len(imgs) > 1:
93 | screen_shot = imgs[1]
94 |
95 | # 获取div[@id='zoom']标签下面的所有的文本数据【子孙所有的text文本数据】
96 | infos = zoom_element.xpath('.//text()')
97 |
98 | # 解析具体内容的函数
99 | def parse_info(info, rule):
100 | return info.replace(rule, '').strip()
101 |
102 | # 遍历infos每一项去获取有用的数据
103 | for key, info in enumerate(infos):
104 |
105 | # print('遍历第{}项'.format(key))
106 | # print(info)
107 | # print('结束==================================================')
108 |
109 | if info.startswith('◎年 代'):
110 | # 年代
111 | year = parse_info(info, '◎年 代')
112 | elif info.startswith('◎产 地'):
113 | # 产地
114 | country = parse_info(info, '◎产 地')
115 | elif info.startswith('◎类 别'):
116 | # 类别
117 | type = parse_info(info, '◎类 别')
118 | elif info.startswith('◎豆瓣评分'):
119 | # 豆瓣评分
120 | rating = parse_info(info, '◎豆瓣评分')
121 | elif info.startswith('◎片 长'):
122 | # 片长
123 | duration = parse_info(info, '◎片 长')
124 | elif info.startswith('◎导 演'):
125 | # 导演
126 | director = parse_info(info, '◎导 演')
127 | elif info.startswith('◎主 演'):
128 | # 演员【第一个演员】
129 | actor_first = parse_info(info, '◎主 演')
130 |
131 | actors = [actor_first]
132 |
133 | # 继续往下面遍历
134 | for index in range(key + 1, len(infos)):
135 | item = infos[index].strip()
136 | if item.startswith('◎简 介'):
137 | break
138 | # 获取所有的演员
139 | # print(item)
140 | actors.append(item)
141 | elif info.startswith('◎简 介'):
142 | # desc = parse_info(info, '◎简 介')
143 |
144 | for index in range(key + 1, len(infos)):
145 | item = infos[index].strip()
146 | if item.startswith('【下载地址】'):
147 | break
148 | desc = item
149 |
150 | print(detail_url)
151 |
152 | # 下载地址
153 | if len(html_element.xpath('//td[@bgcolor="#fdfddf"]/a/text()')) > 0:
154 | download_url = html_element.xpath('//td[@bgcolor="#fdfddf"]/a/text()')[0]
155 | elif len(html_element.xpath('//td[@bgcolor="#fdfddf"]/text()')) > 0:
156 | download_url = html_element.xpath('//td[@bgcolor="#fdfddf"]/text()')[0]
157 |
158 | film = {
159 | 'title': title,
160 | 'cover': cover,
161 | 'screen_shot': screen_shot,
162 | 'year': year,
163 | 'country': country,
164 | 'type': type,
165 | 'rating': rating,
166 | 'duration': duration,
167 | 'director': director,
168 | 'actors': actors,
169 | 'desc': desc,
170 | 'download_url': download_url
171 | }
172 |
173 | return film
174 |
175 |
176 | def spider():
177 | """
178 | 爬虫的入口
179 | :return:
180 | """
181 | base_url = 'http://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
182 |
183 | films = []
184 |
185 | # 1.获取第1-10页的数据
186 | for index in range(1, 11):
187 | print('开始爬第{}页'.format(index))
188 |
189 | # 2.电影列表的地址url
190 | url = base_url.format(index)
191 |
192 | # 3.获取当前页面包含的所有电影【详情地址】
193 | detail_urls = get_detail_urls(url)
194 |
195 | # 4.解析每一项电影的详情页面
196 |
197 | for key, detail_url in enumerate(detail_urls):
198 | # print('索引:' + str(key) + ',地址:' + detail_url)
199 | # print('解析详情页面:' + detail_url)
200 | film = parse_detail_page(detail_url)
201 |
202 | films.append(film)
203 |
204 |         # 5.每爬取一页,就休眠1秒钟
205 | time.sleep(1)
206 |
207 | print(films)
208 |
209 |
210 | if __name__ == '__main__':
211 | spider()
212 |
--------------------------------------------------------------------------------
/spiders/spider_gushiwen.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: spider_gushiwen
12 | @time: 2018/9/21 17:34
13 | @description:利用【正则表达式】爬取【古诗文】网
14 | @link:https://www.gushiwen.org/
15 | """
16 |
17 | import requests
18 | import re
19 | import time
20 |
21 | HEADERS = {
22 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
23 | }
24 |
25 |
26 | # 利用正则表达式去爬虫的注意事项
27 | # 1.正则表达式去爬取元素的时候,与 xpath、bs4 不同,没有结构关系,都是当成一个字符串进行匹配处理
28 | # 2.re.DOTALL可以让【.符号】匹配到所有的字符【包含\n】
29 | # 3.正则表达式匹配【任意多字符】一般采用非贪婪方式【.*?】
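# 例如,几个小例子帮助理解第 2、3 点(非贪婪匹配与 re.DOTALL 的效果):
#   re.findall(r'<b>(.*?)</b>', '<b>a</b><b>b</b>')   -> ['a', 'b']       非贪婪,逐个匹配
#   re.findall(r'<b>(.*)</b>', '<b>a</b><b>b</b>')    -> ['a</b><b>b']    贪婪,一直匹配到最后一个 </b>
#   re.findall(r'a.b', 'a\nb')                        -> []               默认 . 不匹配换行符
#   re.findall(r'a.b', 'a\nb', re.DOTALL)             -> ['a\nb']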
30 |
31 |
32 | def spider_page(url):
33 | """
34 | 爬取某一页的数据
35 | :param url:
36 | :return:
37 | """
38 | response = requests.get(url, headers=HEADERS)
39 | text_raw = response.text
40 |
41 | # print(text_raw)
42 |
43 |     # 1.获取所有的标题(注:原正则中的 HTML 标签在转存时丢失,以下按 gushiwen.org 页面结构复原,为近似写法)
44 |     titles = re.findall(r'<div class="cont">.*?<b>(.*?)</b>', text_raw, re.DOTALL)
45 |
46 | # 2.获取所有的朝代
47 |     dynasties = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', text_raw, re.DOTALL)
48 |
49 | # 3.获取作者信息
50 |     authors = re.findall(r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', text_raw, re.DOTALL)
51 |
52 | # 4.获取古诗文内容
53 | # 内容待进一步美化【去掉多余的元素】
54 |     contents_pre = re.findall(r'<div class="contson".*?>(.*?)</div>', text_raw, re.DOTALL)
55 |
56 | contents = []
57 | for content_pre in contents_pre:
58 | # 4.1 利用sub()函数把内容中的【<.*?>或者换行字符】替换为空
59 | content = re.sub(r'<.*?>|\n', "", content_pre)
60 | contents.append(content.strip())
61 |
62 | # 诗词列表数据
63 | poems = []
64 |
65 | # 5. 使用zip()把四个列表组合在一起
66 | for value in zip(titles, dynasties, authors, contents):
67 | # 5.1 自动进行解包放入到变量当中
68 | title, dynastie, author, content = value
69 |
70 | # 5.2 新建dict,并加入到诗词列表数据中
71 | poem = {
72 | 'title': title,
73 | 'dynastie': dynastie,
74 | 'author': author,
75 | 'content': content
76 | }
77 |
78 | poems.append(poem)
79 |
80 | return poems
81 |
82 |
83 | def spider():
84 | # 全部诗词列表数据
85 | poems = []
86 |
87 | # 1.爬取前面10页数据
88 | for page_num in range(10):
89 | url = 'https://www.gushiwen.org/default_{}.aspx'.format(page_num + 1)
90 |
91 | print('开始爬取第{}页诗词数据'.format(page_num + 1))
92 |
93 | poems.extend(spider_page(url))  # 用 extend 把每一页的诗词并入同一个列表,方便下面逐首打印
94 |
95 | time.sleep(1)
96 |
97 | # 2.显示数据
98 | for poem in poems:
99 | print(poem)
100 | print("==" * 40)
101 |
102 | print('恭喜!爬取数据完成!')
103 |
104 |
105 | if __name__ == '__main__':
106 | spider()
107 |
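108 | # A small illustration of notes 2 and 3 above (the HTML snippet here is invented, purely for demonstration):
109 | #   re.findall(r'<b>(.*?)</b>', '<b>静夜思</b>\n<b>春晓</b>', re.DOTALL)  ->  ['静夜思', '春晓']
110 | #   re.findall(r'<b>(.*)</b>', '<b>静夜思</b>\n<b>春晓</b>', re.DOTALL)   ->  ['静夜思</b>\n<b>春晓']
111 | # The non-greedy .*? stops at the nearest closing tag, while the greedy .* swallows everything up to
112 | # the last one; re.DOTALL is what lets . also match the \n between the two tags.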
--------------------------------------------------------------------------------
/spiders/spider_qiu_shi_bai_ke.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: spider_qiu_shi_bai_ke.py
12 | @time: 2018/9/21 23:16
13 | @description:利用正则表达式去爬取【糗事百科】的文字数据
14 | @link:https://www.qiushibaike.com/text/
15 | """
16 |
17 | import re
18 | import requests
19 |
20 | # 待爬取的地址
21 | base_url = 'https://www.qiushibaike.com/text/page/%s/'
22 |
23 | HEADERS = {
24 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
25 | 'Referer': 'https://www.qiushibaike.com/'
26 | }
27 |
28 |
29 | def spider_page(url):
30 | """
31 | 爬取某一页的数据
32 | :param url:
33 | :return:
34 | """
35 | response = requests.get(url, headers=HEADERS)
36 | text_raw = response.text
37 |
38 | # 获取此页的段子数据
39 | # 1.获取作者列表数据
40 | authors_pre = re.findall(r'<h2>(.*?)</h2>', text_raw, re.DOTALL)  # 注:HTML 标签按当时页面结构补回
41 |
42 | # 1.1 对获取的作者信息进一步进行处理【数据中包含\n】
43 | authors = []
44 | for author_pre in authors_pre:
45 | author = re.sub(r'\n', '', author_pre)
46 | authors.append(author)
47 |
48 | # 2.获取段子列表数据
49 | contents_pre = re.findall(r'<div class="content">.*?<span>(.*?)</span>', text_raw, re.S)  # 注:HTML 标签按当时页面结构补回
50 |
51 | # 2.1 对段子数据进一步处理【数据中包含\n和<br/>】
52 | contents = []
53 | for content_pre in contents_pre:
54 | content = re.sub(r'<.*?>|\n', '', content_pre)
55 | contents.append(content)
56 |
57 | # 3.把两个列表数据组装成一个新的列表中
58 | jokes = []
59 | for temp in zip(authors, contents):
60 | author, content = temp
61 | jokes.append({
62 | 'author': author,
63 | 'content': content
64 | })
65 |
66 | # 4.返回当前页面获取的段子数据列表
67 | return jokes
68 |
69 |
70 | def spider():
71 | jokes = []
72 |
73 | for page_num in range(1, 10):
74 | print('开始爬取第%s页数据' % page_num)
75 |
76 | # 爬取某一页的数据
77 | jokes.extend(spider_page(base_url % page_num))  # 用 extend 合并每页的段子,便于下面逐条打印
78 |
79 | # 打印爬取的数据
80 | for joke in jokes:
81 | print(joke)
82 |
83 | print('恭喜!爬取数据完成!')
84 |
85 |
86 | if __name__ == '__main__':
87 | spider()
88 |
--------------------------------------------------------------------------------
/spiders/spider_tencent_recruit.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: spider_tencent_recruit
12 | @time: 2018/9/17 11:22
13 | @description:爬腾讯招聘职位信息
14 | """
15 |
16 | import requests
17 |
18 | from lxml import etree
19 |
20 | import time
21 |
22 | # 每页的职位数
23 | PAGE_SIZE = 10
24 |
25 | BASE_DOMAIN = 'https://hr.tencent.com/'
26 |
27 | HEADERS = {
28 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36',
29 | 'Referer': 'https://hr.tencent.com/position.php?lid=&tid=&keywords=python&start=10',
30 | 'Cookie': '_ga=GA1.2.1222789966.1535530525; pgv_pvi=8193187840; pgv_si=s2985358336; PHPSESSID=22e3m8aknd19s1gqkh0i9eisk0; Hm_lvt_0bd5902d44e80b78cb1cd01ca0e85f4a=1536726429,1536908218,1537154694,1537166987; Hm_lpvt_0bd5902d44e80b78cb1cd01ca0e85f4a=1537167106'
31 | }
32 |
33 |
34 | def get_jo_detail_urls(page_url):
35 | """
36 | 1.根据当前页面url地址获取每一个职位的详情页面url
37 | :param page_url:当前页面的url
38 | :return:
39 | """
40 | response = requests.get(page_url, headers=HEADERS)
41 |
42 | html_element = etree.HTML(response.text)
43 |
44 | # print(etree.tostring(html_element, encoding='utf-8').decode('utf-8'))
45 |
46 | detail_urls = html_element.xpath('//tr[@class="even" or @class="odd"]//a/@href')
47 |
48 | # 获取所有职位详情页面的url
49 | detail_urls = map(lambda detail_url: BASE_DOMAIN + detail_url, detail_urls)
50 |
51 | return detail_urls
52 |
53 |
54 | def get_detail_msg(detail_url):
55 | """
56 | 2.获取某个职位的详细数据
57 | :param detail_url: 职位详细页面的url
58 | :return: 职位数据
59 | """
60 | # print('请求的详细地址是:' + detail_url)
61 | response = requests.get(detail_url, headers=HEADERS)
62 | html_element = etree.HTML(response.text)
63 |
64 | position = {}
65 |
66 | # 【数据】获取职位标题
67 | title = html_element.xpath('//tr[@class="h"]/td/text()')[0]
68 | position['title'] = title
69 |
70 | # 【数据】工作地点/职位类别
71 | top_infos = html_element.xpath('//tr[@class="c bottomline"]//text()')
72 | position['location'] = top_infos[top_infos.index('工作地点:') + 1]
73 | position['category'] = top_infos[top_infos.index('职位类别:') + 1]
74 |
75 | content_infos = html_element.xpath('//ul[@class="squareli"]')
76 | # 【数据】工作职责
77 | work_do_info = content_infos[0]
78 | position['duty'] = work_do_info.xpath("./li/text()")
79 |
80 | # 【数据】工作要求
81 | work_ask_info = content_infos[1]
82 | position['ask'] = work_ask_info.xpath('./li/text()')
83 |
84 | return position
85 |
86 |
87 | def spider():
88 | # 0.待返回的职位数据
89 | positions = []
90 |
91 | # 1.获取前10页的职位数据
92 | for page_num in range(0, 10):
93 | print('开始爬取第{}页数据'.format(page_num + 1))
94 |
95 | # 2.每一页的地址
96 | url = 'https://hr.tencent.com/position.php?keywords=python&lid=0&tid=0&start={}#a'.format(page_num * PAGE_SIZE)
97 |
98 | # 3.获取【当前页】所有职位的【详情页面的url】
99 | detail_urls = get_jo_detail_urls(url)
100 |
101 | # 4.一个个去解析详情页面的数据
102 | for detail_url in detail_urls:
103 | position = get_detail_msg(detail_url)
104 | positions.append(position)
105 |
106 | time.sleep(1)
107 |
108 | print('爬取完成!')
109 | print(positions)
110 |
111 |
112 | if __name__ == '__main__':
113 | spider()
114 |
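115 | # Note on the parsing trick in get_detail_msg(): the detail table is read as one flat list of
116 | # text nodes, and each value is looked up as "the element right after its label", e.g.
117 | #   infos = ['工作地点:', '深圳', '职位类别:', '技术类']   # invented sample data
118 | #   infos[infos.index('工作地点:') + 1]                    # -> '深圳'
119 | # list.index() raises ValueError if the label is missing, so this assumes the page layout stays stable.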
--------------------------------------------------------------------------------
/spiders/发表情/auto_send_emoji.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: auto_send_emoji.py
12 | @time: 3/14/19 16:22
13 | @description:根据要求选择表情,发给微信上对应的好友或者微信群
14 | """
15 |
16 | import requests
17 | from lxml import etree
18 | import os
19 | import re
20 | from utils.string_utils import *
21 | import time
22 | import random
23 | from urllib import request
24 | import itchat
25 | from utils.chat_utils import *
26 | import matplotlib.pyplot as plt
27 | import matplotlib.image as mpimg
28 | from queue import Queue
29 | import threading
30 |
31 | # pip3 install itchat
32 |
33 | HEADERS = {
34 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
35 | }
36 |
37 | url = 'https://www.doutula.com/search?type=photo&more=1&keyword={}&page={}'
38 |
39 |
40 | class Spider(object):
41 |
42 | def __init__(self, emoji_type, send_to):
43 | self.emoji_type = emoji_type
44 | self.send_to = send_to
45 | self.emojis = []
46 |
47 | # 起始页码
48 | self.start_page = 1
49 |
50 | def get_emojis(self):
51 |
52 | while True:
53 | current_url = url.format(self.emoji_type, self.start_page)
54 | resp = requests.get(current_url, headers=HEADERS)
55 | html_raw = etree.HTML(resp.text)
56 |
57 | # 判断当前是否还有emoji表情
58 | container_element = html_raw.xpath('//div[@class="random_picture"]//img')
59 | if len(container_element) > 0:
60 | self.emojis.extend(self.__get_current_page_emoji(html_raw))
61 | self.start_page += 1
62 | else:
63 | print("当前页面没有表情数据,地址是:%s" % current_url)
64 | break
65 |
66 | time.sleep(0.5)
67 |
68 | def __get_current_page_emoji(self, html_raw):
69 | """
70 | 获取当前页面所有的emoji图片
71 | :param current_url:
72 | :return:
73 | """
74 |
75 | a_elements = html_raw.xpath('//div[@class="pic-content text-center"]/div[@class="random_picture"]/a')
76 |
77 | print("第%d页一共有%d张图片" % (self.start_page, len(a_elements)))
78 |
79 | imgs = []
80 |
81 | for a_element in a_elements:
82 | # 获取img标签【最后一个img】【存储地址】
83 | img_element = a_element.xpath('./img[last()]')[0]
84 |
85 | # 获取p标签【存储名称】
86 | name = a_element.xpath('./p/text()')[0]
87 |
88 | # xpath获取兄弟节点p
89 | # 表情的名称
90 | # name = img_element.xpath('./../p/text()')[0]
91 |
92 | # 表情的下载地址
93 | img_url = img_element.get('data-original')
94 |
95 | # 表情的新名词,不带后缀
96 | # name_new = remove_space(re.sub(r'[\??\.,。!!\*]', '', name))
97 |
98 | # 注意:由于itchat没法发送带中文的文件,这里随机生成一个名称
99 | name_new = make_random_string(6)
100 |
101 | # 表情的名称,加上后缀
102 | # print('==' * 60)
103 | # print(name_new)
104 | # print(img_url)
105 | # print('==' * 60)
106 | img_name = name_new + os.path.splitext(img_url)[-1]
107 |
108 | imgs.append({
109 | 'name': img_name,
110 | 'url': img_url
111 | })
112 |
113 | return imgs
114 |
115 | def download_emojis(self, target_emoji):
116 | """
117 | 下载表情
118 | :param target_emojis:
119 | :return:
120 | """
121 | # 本地保存目录
122 | local_img = './imgs/%s' % target_emoji.get('name')
123 |
124 | request.urlretrieve(target_emoji.get('url'), local_img)
125 |
126 | print('emoji保存本地地址:%s' % local_img)
127 |
128 | return local_img
129 |
130 | def show_image(self, filename):
131 | lena = mpimg.imread(filename)
132 |
133 | plt.imshow(lena) # 显示图片
134 | plt.axis('off') # 不显示坐标轴
135 | plt.show()
136 |
137 |
138 | if __name__ == '__main__':
139 |
140 | # 准备调用itchat发送图片
141 | itchat.auto_login(hotReload=True)
142 |
143 | emoji_type = input('想发哪类表情:')
144 | send_type = input('某个人:0/群聊:1【默认是单聊】')
145 | send_to = input('发给谁呢?')
146 |
147 | if not emoji_type:
148 | emoji_type = '装逼'
149 |
150 | if not send_type:
151 | send_type = 0
152 | else:
153 | send_type = int(send_type)
154 |
155 | if not send_to:
156 | if send_type == 0:
157 | send_to = '指定经常要发送的一个人'
158 | else:
159 | send_to = '指定经常要发送的一个群'
160 |
161 | spider = Spider(emoji_type, send_to)
162 |
163 | # 待发送的表情
164 | local_img = None
165 |
166 | # 获取这种类型的所有表情
167 | spider.get_emojis()
168 |
169 | while True:
170 |
171 | # 从所有emoji表情中选择一张
172 | choose_emoji = random.sample(spider.emojis, 1)
173 |
174 | # 下载到本地
175 | local_img = spider.download_emojis(choose_emoji[0])
176 |
177 | # 显示图片
178 | spider.show_image(local_img)
179 |
180 | ok = input('主人满意吗:')
181 |
182 | if ok:
183 | print('好的,就发送这张表情。')
184 | if send_type == 0:
185 | send_to_person(send_to, local_img)
186 | else:
187 | send_to_group_chat(send_to, local_img)
188 |
189 | # 需要再发一张吗
190 | go_on_send = input('需要再发一张吗?')
191 | if go_on_send:
192 | continue
193 | else:
194 | print('结束了')
195 | break
196 | else:
197 | print('不满意,继续找一张')
198 | continue
199 |
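200 | # Note: itchat.auto_login(hotReload=True) caches the login state locally (an itchat.pkl file by
201 | # default), so re-running the script shortly afterwards should not require scanning the QR code again.
202 | # chat_utils also calls auto_login() when it is imported, so in practice the login is triggered there first.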
--------------------------------------------------------------------------------
/spiders/发表情/utils/chat_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: chat_utils.py
12 | @time: 3/15/19 11:45
13 | @description:利用 itchat 把图片发送给指定好友或群聊
14 | """
15 |
16 | import itchat
17 |
18 |
19 | itchat.auto_login(True)
20 |
21 |
22 | def send_to_person(username, file_names):
23 | """
24 | 发送给某个人
25 | :param username: 发送对象的昵称
26 | :param filename: 文件名
27 | :return:
28 | """
29 | room = itchat.search_friends(name=r'%s' % username)
30 |
31 | userName = room[0]['UserName']
32 |
33 | try:
34 | if isinstance(file_names, list):
35 | # 多个图片
36 | for file_name in file_names:
37 | itchat.send_image(file_name, toUserName=userName)
38 | else:
39 | # 一个图片
40 | itchat.send_image(file_names, toUserName=userName)
41 | print('发送完毕!')
42 | except Exception as e:
43 | print('发送出错!', e)
44 |
45 |
46 | def send_to_group_chat(target_group_chat_name, file_names):
47 | """
48 | 群聊
49 | :param target_group_chat_name:
50 | :param file_name:
51 | :return:
52 | """
53 | rooms = itchat.get_chatrooms(update=True)
54 |
55 | # 目标群聊对象
56 | target_room = None
57 | for room in rooms:
58 | group_chat_name = room.get('NickName')
59 | if target_group_chat_name == group_chat_name:
60 | target_room = room
61 | break
62 |
63 | if target_room:
64 | if isinstance(file_names, list):
65 | for file_name in file_names:
66 | target_room.send_image(file_name)
67 | else:
68 | target_room.send_image(file_names)
69 |
70 | print('发送完毕!')
71 | else:
72 | print('抱歉,不存在这个群聊')
73 |
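74 | # Minimal usage sketch (the nicknames and file paths below are placeholders):
75 | #   send_to_person('小明', './imgs/abc.gif')                        # one image to a friend
76 | #   send_to_group_chat('家庭群', ['./imgs/a.gif', './imgs/b.gif'])  # several images to a group chat
77 | # Both helpers accept either a single file path or a list of paths, as handled above.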
--------------------------------------------------------------------------------
/spiders/发表情/utils/string_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: string_utils.py
12 | @time: 3/15/19 10:36
13 | @description:TODO
14 | """
15 |
16 | import random
17 | import string
18 |
19 |
20 | def remove_space(source):
21 | """
22 | 去除空格
23 | :param source:
24 | :return:
25 | """
26 | return "".join(source.split(' '))
27 |
28 |
29 |
30 | def make_random_string(num):
31 | """
32 | 生成随机字符串
33 | :param num:
34 | :return:
35 | """
36 | return ''.join(random.sample(string.ascii_letters + string.digits, num))
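37 |
38 | # Note: random.sample() draws characters without replacement, so num must be <= 62
39 | # (26 + 26 letters plus 10 digits) and the generated name never repeats a character.
40 | # For the 6-character file names used by auto_send_emoji.py this is more than enough.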
--------------------------------------------------------------------------------
/spiders/年终奖/comments.txt:
--------------------------------------------------------------------------------
1 | 有,一个月工资
2 | 我们没有
3 | 有
4 | 还没发
5 | 没有
6 | 没有
7 | 这个真没有
8 | 没有
9 | 有4.8✖️1.5
10 | 从来没有
11 | 没有
12 | 没有,只有水果[流泪]购物卡也没
13 | 年后发……
14 | 年终奖没有,年会也没有了[捂脸]
15 | 没有了
16 | 说了有,还没发
17 | 从来没有
18 | 我这边小公司提都没提,估计凉了。
19 | 有,但是还不确定发多少
20 | 没有
21 | 没有
22 | 没有了
23 | 有,但是要打折了,具体还未知
24 | 没有过
25 | 要过年才知道
26 | 有,一个半月多一点,每个人不一样,这个看部门老大
27 | 没有
28 | 没有,而且要被裁了[流泪]
29 | 不知道有没有,每年都是8-9月分发,年“中”奖。如果发的话,固定一个月[再见]
30 | 我实习3个月,拿了三分之一月的
31 | 有,0.5个月
32 | 两个月[发呆]
33 | 小公司,有一个月的年终奖!
34 | 最后一天上班 给裁了 年终奖都省了
35 | 有.年底双薪
36 | 没有年终奖,年终奖是神马?
37 | 有年终奖,但是由于入职时间过短,不知道自己会有多少。
38 | 还不知道....😂
39 | 从来没有[流泪]
40 | 没有的飘过
41 | 我们没有
42 | 没有。
43 | 不知道耶。。反正有也就3000块钱。。
44 | 没有
45 | 有,跟去年一样
46 | 我们过年前一天才能知道发不发,才能知道发多少 (没有人知道年终奖的计算法方法,发多少是多少)。其余时间没有一点年终奖的消息
47 | 还不清楚😂
48 | 年会抽奖算不算😂
49 | 有 还没发
50 | 两个月
51 | 有,但不知道多少
52 | 听说有,听说比去年多。
53 | 木有
54 | 还不知道
55 | 老板说我们的叫13薪,多出来的 1 不是额外给的,每个月抽一点出来,最后考核完了看着给,13薪是这个意思吗[撇嘴]
56 | 没有
57 | 有 大概2-3个月的工资
58 | 听说是有。。
59 | 我听说我们公司的一直都是第二年年中之后才发年终奖 1-3个月不等[衰]
60 | 有 但还没发
61 | 一般随过年工资一起发,还没到时间
62 | 还不知道[捂脸] 19号开年会
63 | 去年的没发[撇嘴]
64 | 有,年终奖和去年一样多,不过全年收入上涨幅度可观😄
65 | 没有[难过]
66 | 有,发了个球和一坨毛线,还有一把锤子。
67 | 没有+1
68 | 还不知道,我们得到除夕的前一天才知道。去年也是除夕的前一天
69 | 还不晓得有没有,这个公司加入还没有一个月
70 | 没有
71 | 年终奖是有的。过年前发,一般人1.5应该有的
72 | 没有年终奖,工资都是拖延
73 | 还没发呢 不太清楚啊
74 | 有
75 | 没有
76 | 没有啦,公司都要倒了!!!
77 | 没消息 可能连年会都没有[衰]
78 | 没有
79 | 没有
80 | 没有
81 | 还没通知呢
82 | 工作三年,从来没拿过年终奖
83 | 今年也没有,年会还年后举行[捂脸]
84 | 有 但是还没确定发多少
85 | 有 2月 还没发
86 | 没有。。。
87 | 有,接近两个月工资
88 | 我们年终奖3个月工资
89 | 没年终奖,有季度奖
90 | 减半
91 | 同一个月
92 | 没有
93 | 十一月入职的,有一点
94 | 估计没有,发工资都困难了[发呆]
95 | 外包公司,一直没有
96 | 没有
97 | 有,俩月
98 | 有年会,转正不久,年终奖还不清楚。
99 | 没有+1
100 | 今年还没通知不发,往常是两个月*绩效
101 | 年会都调到年后3月份开了,还说要开拓疆土,扩大规模🌚
102 | 我们这个级别不会有,,,
103 | 今年四月底入职的,不知道有没有
104 | 13薪的1薪算年终奖吗,算的话就有,不算的话就没有
105 | 有,一个月
106 | 真没有[难过]
107 | 据说是有,没说发多少,待定转态,估计凉了
108 | 以前有。今年估计悬了
109 | 我们应该有,但是还不确定
110 | 应该在大公司、国企、事业单位这些影响不大吧。我们还是有的
111 | 有,主要是前面说好的设计的提成,不知道能给到多少
112 | 没有
113 | 没有
114 | 没有年终奖
115 | 应该有吧!没有立马辞职
116 | 没有
117 | 工资拖欠了[流泪]
118 | 没有,而且公司春节前后还不让请假,如果请假,春节的法定假日就不算法定假日了,算成请假了,要扣钱
119 | 没有
120 | 没有
121 | 一直的传统,一个月工资,但平时工资就比同行低好多,综合年收入八万多一点点
122 | 有
123 | 没有
124 | 没有 公司业绩下滑将近8亿
125 | 有,和以前一样。
126 | 不是看老板心情,去年有,今年就不确定了
127 | 没有
128 | 有,一个月还有少量项目分成
129 | 今年现在都还没提过年终奖这件事,感觉凉了[流泪]
130 | 一个月 !
131 | 要倒闭了
132 | 有,大部分是两个月工资
133 | 没有
134 | 没有,还降薪20%
135 | 应该有,看公司利润了,大概率一个月工资。
136 | 不清楚[流泪]
137 | 有个锤子
138 | 没有
139 | 应该有,还没发,估计底薪x2。
140 | 没有+降薪20%
141 | 物流集团旗下成立的新科技公司,大数据部门貌似一直都没有年终奖……
142 | 我们还在评估发多少
143 | 200块红包
144 | 有,听说大概是月工资的1.几倍。ps:发的是17年的奖金,18的奖金还得往后挪。应该是等不到那一天了。
145 | 我们有2个月,不过要19年年中才发
146 | 从来都没有
147 | 正常的按照绩效发
148 | 有,一个月工资
149 | 还没发 往年1.5不到
150 | 2月
151 | 没有
152 | 从来没有过。。
153 | 往年都是4个月,过年前一周发,今年还不知道
154 | 公司已经裁员,剩下的大概率没有年终奖[撇嘴]
155 | 应该是有,
156 | 有,一个月
157 | 我们有,照常两个月奖金。但是是平时周六上班换来的
158 | 一直没有
159 | 从来没有
160 | 年薪百分之15
161 | 还没听吭
162 | 有,一个月,但是不多
163 | 还不知道。。。
164 | 多半没有!
165 | 一直是13薪。不过今年改成了bouns,比13薪率高,大概1.3个月的样子
166 | 应该有
167 | 往年惯例都是一个月,今年公司效益不错,承诺至少两个月以上,这几天公司还组织来巴厘岛度假。额,是不是有点太拉仇恨了呀[调皮]
168 | 我公司,就我部门没有[微笑]
169 | 有,几个月不知道
170 | 我们还没发
171 | 有,项目奖半个月,年终奖,惯例4月份发
172 | 一个月
173 | 有
174 | 妹有
175 | 一个月+1200过年费
176 | 应该是有2个月,
177 | 创业公司,刚开门几个月,没有
178 | 还不清楚,估计没有
179 | 没有
180 | 之前说是有2个月的,年后发
181 | 没有
182 | 还没发,不知道有没有
183 | 3个月
184 | 我们没有
185 | 没有
186 | 据说还有,还没发过[闭嘴]
187 | 还不知道呢。去年是春节放假前发的
188 | 啥都没得
189 | 实习生不说话😂😂
190 | 还没有发,但入职说的14薪
191 | 我们每年都4月底发,还不清楚有没有
192 | 年薪的20%就是年终奖,绩效不好还要扣
193 | 我们是年中奖,刚来半年,不知道年中有没有[微笑]
194 | 有,不过减半了,去年年终是多发两个月的工资,今年好像是只有一个月的
195 | 说有,不知道有没有
196 | 没有
197 | 以前是平均3到4个月工资,去年就没发了,今年估计也悬了
198 | 没有 年会都没了
199 | 没有。
200 | 2+绩效
201 | 没有
202 | 去年就把年终奖取消了,变为项目奖的路过
203 | 18 年第三季度发的 17 年年终奖[微笑]
204 | 发了,大概是四五个月工资了,因为工资低😂
205 | 有,拖到6月份发
206 | 还不知道
207 | 一个月工资,但是公司规定要第二个季度才发,差不多就是67月份
208 | 没有
209 | 老板承诺都有,但现在还没发,不知道会不会兑现
210 | 没有
211 | 年底双薪,年终两月
212 | 没有
213 | 没有,估计年会有红包
214 | 不知道多少个月
215 | 据说有😂
216 | 要不稍微分行业来个投票 清晰些
217 | 从业三年,几乎没感受到年终奖,以至于我都不记得我们有发过所谓的年终奖吗?
218 | 没有……
219 | 有,但不知道怎么发[捂脸]
220 | 有 2月12日
221 | 老大说年终奖年后发。
222 | 一个月
223 | 绩效到现在还不知道。。。
224 | 有,一个月工资
225 | 没有,据说年会都取消了
226 | 没有
227 | 新公司,啥都不确定[晕][晕][晕]
228 | 从来就没有
229 | 一直没有[撇嘴]
230 | 我们没有,昨天发工资,每个人都要延迟发放一部分,普遍30%,个别60%或70%。大家已经怨声载道了
231 | 有6个月[尴尬]
232 | 没有
233 | 应该是一个月工资,下周五年会抽奖保底 400,一等奖 10000 现金,如果我中了就可以每个月给张叔打赏了😏😏
234 | 2
235 | 老板说给我加工资,年终奖照发
236 | 没有
237 | 也没有
238 | 有,两个月
239 | 有,一个月工资
240 | 我们没有
241 | 有
242 | 还没发
243 | 没有
244 | 没有
245 | 这个真没有
246 | 没有
247 | 有4.8✖️1.5
248 | 从来没有
249 | 没有
250 | 没有,只有水果[流泪]购物卡也没
251 | 年后发……
252 | 年终奖没有,年会也没有了[捂脸]
253 | 没有了
254 | 说了有,还没发
255 | 从来没有
256 | 我这边小公司提都没提,估计凉了。
257 | 有,但是还不确定发多少
258 | 没有
259 | 没有
260 | 没有了
261 | 有,但是要打折了,具体还未知
262 | 没有过
263 | 要过年才知道
264 | 有,一个半月多一点,每个人不一样,这个看部门老大
265 | 没有
266 | 没有,而且要被裁了[流泪]
267 | 不知道有没有,每年都是8-9月分发,年“中”奖。如果发的话,固定一个月[再见]
268 | 我实习3个月,拿了三分之一月的
269 |
--------------------------------------------------------------------------------
/spiders/年终奖/nzj.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: nzj.py
12 | @time: 1/11/19 16:00
13 | @description:看看大家今年大家都有年终奖吗?
14 | """
15 |
16 | import json
17 | import jieba
18 | import matplotlib.pyplot as plt
19 | from wordcloud import WordCloud
20 |
21 | # 文件名称
22 | filename = 'comments.txt'
23 |
24 | # 总共的评论数目
25 | comment_count = 0
26 |
27 |
28 | def response(flow):
29 | request = flow.request
30 | response = flow.response
31 |
32 | global comment_count
33 |
34 | # 请求的地址
35 | request_url = request.url
36 |
37 | # 筛选
38 | if 'comments' in request_url and 'zsxq' in request_url:
39 | # 返回的内容
40 | response_content = response.content.decode('utf-8')
41 | print('请求地址:' + request_url)
42 | print('请求方法:' + str(request.method))
43 | print('参数:' + str(request.data))
44 |
45 | obj = json.loads(response_content)
46 |
47 | comments = obj['resp_data']['comments']
48 |
49 | # 最后一页
50 | if len(comments) == 0:
51 | print('一共有%d个球友发表了自己的看法' % comment_count)
52 |
53 | # 生成词云
54 | generate_word_cloud()
55 |
56 | else:
57 | comment_count += len(comments)
58 | for comment in comments:
59 | comment_content = comment['text']
60 | with open(filename, 'a') as f:
61 | f.write(comment_content + '\n')
62 |
63 |
64 | def generate_word_cloud():
65 | """
66 | 生成词云
67 | :return:
68 | """
69 | with open(filename, 'r') as f:
70 | word_content = f.read()
71 |
72 | # 使用jieba去分割
73 | wordlist = jieba.cut(word_content, cut_all=True)
74 |
75 | wl_space_split = " ".join(wordlist)
76 |
77 | font = r'/Users/xingag/Library/Fonts/SimHei.ttf'
78 |
79 | wordcloud = WordCloud(font_path=font, width=1080, height=1920, margin=2).generate(wl_space_split)
80 |
81 | # 显示图片
82 | plt.imshow(wordcloud)
83 | plt.axis("off")
84 |
85 | # 按照设置保存到本地文件夹
86 | wordcloud.to_file("./output.png")
87 |
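88 | # Note: this file is written as a mitmproxy addon -- response(flow) is the hook mitmproxy calls for
89 | # every captured response. A typical way to run it (assuming mitmproxy is installed and the phone's
90 | # proxy points at this machine) is:
91 | #   mitmdump -s nzj.py
92 | # The script then collects the zsxq comment responses into comments.txt and builds the word cloud
93 | # once an empty comments page marks the end.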
--------------------------------------------------------------------------------
/spiders/年终奖/output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/spiders/年终奖/output.png
--------------------------------------------------------------------------------
/verification code/注册【中知网】/AipOcr.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: AipOcr.py
12 | @time: 1/23/19 15:19
13 | @description:AipOcr是OCR的Python SDK客户端,为使用OCR的开发人员提供了一系列的交互方法。
14 | """
15 |
16 | from aip import AipOcr
17 |
18 | """ 你的 APPID AK SK """
19 | APP_ID = '15474**'
20 | API_KEY = 'VBoMZ6XUX119w***'
21 | SECRET_KEY = 'GPvqLVeGIMOR57***'
22 |
23 | client = AipOcr(APP_ID, API_KEY, SECRET_KEY)
24 |
--------------------------------------------------------------------------------
/verification code/注册【中知网】/cnki_demo.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: cnki_demo.py
12 | @time: 1/23/19 15:44
13 | @description:[中国知网注册]
14 | """
15 | from PIL import Image
16 | from selenium import webdriver
17 | from file_tools import *
18 | from AipOcr import *
19 | import requests
20 | import time
21 | import json
22 |
23 |
24 | class Cnki_Spider(object):
25 | driver_path = "/usr/local/bin/chromedriver"
26 |
27 | def __init__(self):
28 | self.driver = webdriver.Chrome(executable_path=Cnki_Spider.driver_path)
29 |
30 | # 包含验证码的页面的截图
31 | self.screen_shot_file_name = "screen_shot.png"
32 |
33 | # 验证码图片
34 | self.code_file_name = "image_code.png"
35 |
36 | # 注册主页面
37 | self.main_url = 'http://my.cnki.net/elibregister/commonRegister.aspx'
38 |
39 | # 待注册的内容
40 | # 昵称
41 | self.username = 'xingag2311'
42 | # 密码
43 | self.password = 'Hu9012782'
44 | # 邮箱地址
45 | self.email = '809900227@qq.com'
46 |
47 | def run(self):
48 | # 1.打开注册页面【包含验证码】
49 | self.driver.get(self.main_url)
50 |
51 | source = self.driver.page_source
52 |
53 | # 2.验证码图片、验证码输入框
54 | code_input_element = self.driver.find_element_by_id('txtOldCheckCode')
55 | code_img_element = self.driver.find_element_by_id('checkcode')
56 |
57 |
58 | # 外面容器
59 | container_element = self.driver.find_element_by_id('form1')
60 |
61 | # 3.获取验证码、填入输入框、点击外面
62 | # 如果没有出现出错的提示tips,就代表输入验证码成功
63 | while True:
64 |
65 | code = self.get_code().strip()
66 |
67 | error_tips_element = self.driver.find_element_by_id('span_oldcheckcode')
68 |
69 | print('验证码为:%s' % code)
70 | code_input_element.clear()
71 | code_input_element.click()
72 | code_input_element.send_keys(code)
73 |
74 | # 点击外围的容器,判断验证码是否输入正确
75 | container_element.click()
76 |
77 | # 显示了错误信息:验证码输入错误
78 | if error_tips_element.text:
79 | time.sleep(2)
80 | print('验证码验证失败,点击验证码图片')
81 |
82 | # 点击验证码图片,重新加载验证码
83 | code_img_element.click()
84 | continue
85 | else:
86 | print('验证码验证成功')
87 | break
88 |
89 | # 4.注册
90 | self.register(code)
91 |
92 | def get_code(self):
93 |
94 | # 1.截图并保存到本地
95 | self.driver.get_screenshot_as_file('./%s' % self.screen_shot_file_name)
96 |
97 | # 2.打开文件
98 | screenshot_image = Image.open('./%s' % self.screen_shot_file_name)
99 |
100 | # 3.设置要裁剪的区域(验证码所在的区域)
101 | code_box = (899, 819, 1048, 883)
102 |
103 | # 4.截图:生成只有验证码的图片
104 | code_image = screenshot_image.crop(code_box)
105 |
106 | # 5.保存到本地
107 | code_image.save("./%s" % self.code_file_name)
108 |
109 | # 6.以byte读取图片
110 | image = get_file_content("./%s" % self.code_file_name)
111 |
112 | # 7.使用百度OCR识别验证码
113 | result = client.basicAccurate(image)
114 |
115 | print(result)
116 |
117 | # 识别的文字内容
118 | word_result = result.get('words_result')[0].get('words')
119 |
120 | return word_result
121 |
122 | def register(self, code):
123 | # 用户名输入框
124 | username_input_element = self.driver.find_element_by_id('username')
125 |
126 | # 密码输入框
127 | password_input_element = self.driver.find_element_by_id('txtPassword')
128 |
129 | # 邮箱输入框
130 | txtEmail_input_element = self.driver.find_element_by_id('txtEmail')
131 |
132 | # 注册按钮
133 | submit_btn_element = self.driver.find_element_by_id('ButtonRegister')
134 |
135 | username_input_element.send_keys(self.username)
136 | password_input_element.send_keys(self.password)
137 | txtEmail_input_element.send_keys(self.email)
138 |
139 | submit_btn_element.click()
140 |
141 |
142 | if __name__ == '__main__':
143 | spider = Cnki_Spider()
144 | spider.run()
145 |
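146 | # Note: get_code() crops the verification code out of a full-page screenshot, so the code_box
147 | # coordinates (899, 819, 1048, 883) are tied to one particular screen resolution, window size and
148 | # scaling factor -- on another machine they will almost certainly need to be re-measured.
149 | # An alternative is to locate the element with id "checkcode" and crop around element.location / element.size.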
--------------------------------------------------------------------------------
/verification code/注册【中知网】/file_tools.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: file_tools.py
12 | @time: 1/23/19 15:41
13 | @description:TODO
14 | """
15 |
16 |
17 | def get_file_content(filePath):
18 | """
19 | 读取文件
20 | :param filePath: 文件路径
21 | :return: byte类型
22 | """
23 | with open(filePath, 'rb') as fp:
24 | return fp.read()
25 |
--------------------------------------------------------------------------------
/verification code/注册【中知网】/image_code.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/verification code/注册【中知网】/image_code.png
--------------------------------------------------------------------------------
/verification code/注册【中知网】/screen_shot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/verification code/注册【中知网】/screen_shot.png
--------------------------------------------------------------------------------
/微信聊天记录/utils/dbutils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: dbutils.py
12 | @time: 2020-04-11 16:57
13 | @description
14 | """
15 |
16 | import sqlite3
17 |
18 |
19 | class DUtil():
20 |
21 | def __init__(self, db_path="./weixin.db"):
22 | """
23 | 数据库初始化
24 | """
25 | self.db = sqlite3.connect(db_path)
26 | self.cursor = self.db.cursor()
27 |
28 | def execute(self, sql, param=None):
29 | """
30 | Sql语句,包含:增、删、改
31 | param:数据,可以为列表、字典,也可以为空
32 | """
33 | try:
34 | if param is None:
35 | self.cursor.execute(sql)
36 | else:
37 | if type(param) is list:
38 | self.cursor.executemany(sql, param)
39 | else:
40 | self.cursor.execute(sql, param)
41 | count = self.cursor.rowcount  # 本次语句影响的行数(total_changes 是整个连接的累计值,不适合判断单次执行)
42 | self.db.commit()
43 | except Exception as e:
44 | print(e)
45 | return False
46 |
47 | # 返回结果:本次执行有行受影响才算成功
48 | return count > 0
49 |
50 | def query(self, sql, param=None):
51 | """
52 | 查询语句
53 | sql:Sql语句
54 | param:参数,可以包含空
55 | retutn:成功返回True
56 | """
57 | if param is None:
58 | self.cursor.execute(sql)
59 | else:
60 | self.cursor.execute(sql, param)
61 | return self.cursor.fetchall()
62 |
63 | def close(self):
64 | """
65 | 数据库关闭
66 | """
67 | self.cursor.close()
68 | self.db.close()
69 |
70 |
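71 | # Minimal usage sketch (table and fields are invented for illustration):
72 | #   db = DUtil('./weixin.db')
73 | #   db.execute('CREATE TABLE IF NOT EXISTS msg (id INTEGER PRIMARY KEY, content TEXT)')
74 | #   db.execute('INSERT INTO msg (content) VALUES (?)', ('hello',))
75 | #   rows = db.query('SELECT content FROM msg WHERE id = ?', (1,))
76 | #   db.close()
77 | # execute() returns True only when the statement changed at least one row, so DDL such as
78 | # CREATE TABLE will report False even though it succeeded.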
--------------------------------------------------------------------------------
/微信聊天记录/utils/string_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: string_utils.py
12 | @time: 2020-04-11 18:39
13 | @description:TODO
14 | """
15 | import re
16 |
17 |
18 | def get_ava_string(text):
19 | """
20 | 去掉特殊符号,保留正常内容
21 | :param text: 原始字符串
22 | :return:
23 | """
24 | return re.sub(u"([^ \u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", text)
25 |
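26 | # Example (invented input): get_ava_string('哈哈😂!!hello-123')  ->  '哈哈hello123'
27 | # The character classes in the pattern keep spaces, CJK ideographs (\u4e00-\u9fa5), digits and
28 | # ASCII letters; everything else (emoji, punctuation) is stripped out.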
--------------------------------------------------------------------------------
/获取女友的位置/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/获取女友的位置/.idea/misc.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/获取女友的位置/.idea/modules.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/获取女友的位置/.idea/vcs.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/获取女友的位置/.idea/地理位置.iml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/获取女友的位置/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: main.py
12 | @time: 2019-08-23 16:23
13 | @description:高德坐标拾取网站:https://lbs.amap.com/console/show/picker
14 | """
15 |
16 | import os
17 | import exifread
18 | from decimal import Decimal
19 | from position_utils import *
20 | import requests
21 | import json
22 | import datetime
23 |
24 |
25 | # pip3 install exifread
26 |
27 |
28 | class Location(object):
29 |
30 | def __init__(self, image_path):
31 | self.img_path = image_path
32 |
33 | self.api_key = "你申请的AK"
34 |
35 | self.url_get_position = 'https://restapi.amap.com/v3/geocode/regeo?key={}&location={}'
36 |
37 | def run(self):
38 | coordinate = self.__get_image_ability()
39 |
40 | print(f'获取到经度、纬度是:{coordinate}')
41 |
42 | if not coordinate:
43 | return
44 |
45 | # 根据经度和纬度,获取到详细地址
46 | address = self.__get_address(coordinate)
47 |
48 | # 检验坐标值
49 | # https://lbs.amap.com/console/show/picker
50 | print(f'你女朋友当前位置在:{address}')
51 |
52 | def __get_address(self, location):
53 | """
54 | 根据坐标得到详细地址
55 | :param location: 经纬度值
56 | :return:
57 | """
58 | resp = requests.get(self.url_get_position.format(self.api_key, location))
59 |
60 | location_data = json.loads(resp.text)
61 |
62 | address = location_data.get('regeocode').get('formatted_address')
63 |
64 | return address
65 |
66 | def __format_lati_long_data(self, data):
67 | """
68 | 对经度和纬度数据做处理,保留6位小数
69 | :param data: 原始经度和纬度值
70 | :return:
71 | """
72 | # 删除左右括号和空格
73 | data_list_tmp = str(data).replace('[', '').replace(']', '').split(',')
74 | data_list = [data.strip() for data in data_list_tmp]
75 |
76 | # 替换秒的值
77 | data_tmp = data_list[-1].split('/')
78 |
79 | # 秒的值
80 | data_sec = int(data_tmp[0]) / int(data_tmp[1]) / 3600
81 |
82 | # 替换分的值
83 | data_tmp = data_list[-2]
84 |
85 | # 分的值
86 | data_minute = int(data_tmp) / 60
87 |
88 | # 度的值
89 | data_degree = int(data_list[0])
90 |
91 | # 由于高德API只能识别到小数点后的6位
92 | # 需要转换为浮点数,并保留为6位小数
93 | result = "%.6f" % (data_degree + data_minute + data_sec)
94 | return float(result)
95 |
96 | def __get_image_ability(self):
97 | """
98 | 获取图片的属性值,包含:经纬度、拍摄时间等
99 | :param picture_name:
100 | :return:
101 | """
102 |
103 | # 利用exifread库,读取图片的属性
104 | img_exif = exifread.process_file(open(self.img_path, 'rb'))
105 |
106 | # 能够读取到属性
107 | if img_exif:
108 | # 纬度数【用 get() 读取,缺少对应的 EXIF 标签时返回 None,而不是抛 KeyError】
109 | latitude_gps = img_exif.get('GPS GPSLatitude')
110 |
111 | # N,S 南北纬方向
112 | latitude_direction = img_exif.get('GPS GPSLatitudeRef')
113 |
114 | # 经度数
115 | longitude_gps = img_exif.get('GPS GPSLongitude')
116 |
117 | # E,W 东西经方向
118 | longitude_direction = img_exif.get('GPS GPSLongitudeRef')
119 |
120 | # 拍摄时间
121 | take_time = img_exif.get('EXIF DateTimeOriginal')
122 |
123 | is_lie = self.judge_time_met(take_time) if take_time else False
124 |
125 | if is_lie:
126 | print('很遗憾的通知你,你的女朋友在撒谎!!!')
127 | return
128 |
129 | # 纬度、经度、拍摄时间
130 | if latitude_gps and longitude_gps and take_time:
131 |
132 | # 对纬度、经度值原始值作进一步的处理
133 | latitude = self.__format_lati_long_data(latitude_gps)
134 | longitude = self.__format_lati_long_data(longitude_gps)
135 |
136 | # print(f'{longitude},{latitude}')
137 |
138 | # 注意:由于gps获取的坐标在国内高德等主流地图上逆编码不够精确,这里需要转换为火星坐标系
139 | location = wgs84togcj02(longitude, latitude)
140 |
141 | return f'{location[0]},{location[1]}'
142 | else:
143 | print(f'获取的图片数据属性不完整')
144 | return ''
145 | else:
146 | print('抱歉,图片不是原图,没法获取到图片属性。')
147 | return ''
148 |
149 | def judge_time_met(self, take_time):
150 | """
151 | 通过拍摄时间判断女朋友是否撒谎
152 | :param take_time:
153 | :return:
154 | """
155 | # 拍摄时间
156 | format_time = str(take_time).split(" ")[0].replace(":", "-")
157 |
158 | # 当天日期
159 | today = str(datetime.date.today())
160 |
161 | if format_time == today:
162 | return False
163 | else:
164 | return True
165 |
166 |
167 | if __name__ == '__main__':
168 | # 女朋友发过来的图片【原图】
169 | location = Location('./picture/11441566648796_.pic_hd.jpg')
170 |
171 | # 找到女朋友的地理位置
172 | location.run()
173 |
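174 | # Worked example for __format_lati_long_data() (values are made up): exifread reports GPS
175 | # coordinates as degree/minute/second ratios, e.g. "[30, 15, 3000/100]" means
176 | # 30° 15' 30.0"  ->  30 + 15/60 + (3000/100)/3600  =  30.258333, which is the decimal value
177 | # handed to the AMap regeo API (after the WGS-84 -> GCJ-02 conversion in position_utils).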
--------------------------------------------------------------------------------
/获取女友的位置/picture/11441566648796_.pic_hd.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/获取女友的位置/picture/11441566648796_.pic_hd.jpg
--------------------------------------------------------------------------------
/获取女友的位置/position_utils.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # encoding: utf-8
3 |
4 | """
5 | @version: v1.0
6 | @author: xag
7 | @license: Apache Licence
8 | @contact: xinganguo@gmail.com
9 | @site: http://www.xingag.top
10 | @software: PyCharm
11 | @file: position_utils.py
12 | @time: 2019-08-23 17:44
13 | @description:坐标转换
14 | """
15 |
16 | # -*- coding: utf-8 -*-
17 | import json
18 | import math
19 |
20 | x_pi = 3.14159265358979324 * 3000.0 / 180.0
21 | pi = 3.1415926535897932384626 # π
22 | a = 6378245.0 # 长半轴
23 | ee = 0.00669342162296594323 # 第一偏心率的平方
24 |
25 |
26 | def wgs84togcj02(lng, lat):
27 | """
28 | WGS84转GCJ02(火星坐标系)
29 | :param lng:WGS84坐标系的经度
30 | :param lat:WGS84坐标系的纬度
31 | :return:
32 | """
33 | if out_of_china(lng, lat): # 判断是否在国内
34 | return lng, lat
35 | dlat = transformlat(lng - 105.0, lat - 35.0)
36 | dlng = transformlng(lng - 105.0, lat - 35.0)
37 | radlat = lat / 180.0 * pi
38 | magic = math.sin(radlat)
39 | magic = 1 - ee * magic * magic
40 | sqrtmagic = math.sqrt(magic)
41 | dlat = (dlat * 180.0) / ((a * (1 - ee)) / (magic * sqrtmagic) * pi)
42 | dlng = (dlng * 180.0) / (a / sqrtmagic * math.cos(radlat) * pi)
43 | mglat = lat + dlat
44 | mglng = lng + dlng
45 | return [mglng, mglat]
46 |
47 |
48 | def gcj02towgs84(lng, lat):
49 | """
50 | GCJ02(火星坐标系)转GPS84
51 | :param lng:火星坐标系的经度
52 | :param lat:火星坐标系纬度
53 | :return:
54 | """
55 | if out_of_china(lng, lat):
56 | return lng, lat
57 | dlat = transformlat(lng - 105.0, lat - 35.0)
58 | dlng = transformlng(lng - 105.0, lat - 35.0)
59 | radlat = lat / 180.0 * pi
60 | magic = math.sin(radlat)
61 | magic = 1 - ee * magic * magic
62 | sqrtmagic = math.sqrt(magic)
63 | dlat = (dlat * 180.0) / ((a * (1 - ee)) / (magic * sqrtmagic) * pi)
64 | dlng = (dlng * 180.0) / (a / sqrtmagic * math.cos(radlat) * pi)
65 | mglat = lat + dlat
66 | mglng = lng + dlng
67 | return [lng * 2 - mglng, lat * 2 - mglat]
68 |
69 |
70 | def transformlat(lng, lat):
71 | ret = -100.0 + 2.0 * lng + 3.0 * lat + 0.2 * lat * lat + \
72 | 0.1 * lng * lat + 0.2 * math.sqrt(math.fabs(lng))
73 | ret += (20.0 * math.sin(6.0 * lng * pi) + 20.0 *
74 | math.sin(2.0 * lng * pi)) * 2.0 / 3.0
75 | ret += (20.0 * math.sin(lat * pi) + 40.0 *
76 | math.sin(lat / 3.0 * pi)) * 2.0 / 3.0
77 | ret += (160.0 * math.sin(lat / 12.0 * pi) + 320 *
78 | math.sin(lat * pi / 30.0)) * 2.0 / 3.0
79 | return ret
80 |
81 |
82 | def transformlng(lng, lat):
83 | ret = 300.0 + lng + 2.0 * lat + 0.1 * lng * lng + \
84 | 0.1 * lng * lat + 0.1 * math.sqrt(math.fabs(lng))
85 | ret += (20.0 * math.sin(6.0 * lng * pi) + 20.0 *
86 | math.sin(2.0 * lng * pi)) * 2.0 / 3.0
87 | ret += (20.0 * math.sin(lng * pi) + 40.0 *
88 | math.sin(lng / 3.0 * pi)) * 2.0 / 3.0
89 | ret += (150.0 * math.sin(lng / 12.0 * pi) + 300.0 *
90 | math.sin(lng / 30.0 * pi)) * 2.0 / 3.0
91 | return ret
92 |
93 |
94 | def out_of_china(lng, lat):
95 | """
96 | 判断是否在国内,不在国内不做偏移
97 | :param lng:
98 | :param lat:
99 | :return:
100 | """
101 | if lng < 72.004 or lng > 137.8347:
102 | return True
103 | if lat < 0.8293 or lat > 55.8271:
104 | return True
105 | return False
106 |
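107 | # Quick usage sketch (coordinates are invented): convert a GPS (WGS-84) reading to the GCJ-02
108 | # values that AMap expects, and note that points outside China are returned unchanged:
109 | #   wgs84togcj02(116.397, 39.908)   # -> [lng + dlng, lat + dlat], roughly a few hundred metres of offset
110 | #   wgs84togcj02(2.349, 48.853)     # Paris: out_of_china() is True, so (2.349, 48.853) comes back as-is
111 | # gcj02towgs84() is the approximate inverse transform.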
--------------------------------------------------------------------------------