├── .gitignore ├── .idea ├── misc.xml ├── modules.xml ├── spider_python.iml ├── vcs.xml └── workspace.xml ├── LICENSE ├── ModifyLocation ├── gps_utils.py ├── main.py └── position_utils.py ├── Python调用JAR ├── exec_jar_example.py └── jar │ ├── com │ └── xingag │ │ └── common │ │ └── EncryHelper.class │ └── encry.jar ├── README.md ├── feapder └── tophub_demo │ ├── .idea │ ├── inspectionProfiles │ │ └── Project_Default.xml │ ├── misc.xml │ ├── modules.xml │ ├── tophub_demo.iml │ └── workspace.xml │ ├── items │ ├── __init__.py │ └── topic_item.py │ ├── main.py │ ├── setting.py │ ├── spiders │ ├── __init__.py │ └── tophub_spider.py │ └── test.py ├── js └── jian_shu.js ├── pic └── 最低气温排行榜.png ├── raw └── qr.jpeg ├── scrapy ├── douban_login │ ├── .idea │ │ ├── douban_login.iml │ │ ├── misc.xml │ │ ├── modules.xml │ │ ├── vcs.xml │ │ └── workspace.xml │ ├── captcha.png │ ├── douban_login │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ └── douban.py │ ├── readme.MD │ ├── scrapy.cfg │ └── start.py ├── huize_spider │ ├── .idea │ │ ├── huize_spider.iml │ │ ├── misc.xml │ │ ├── modules.xml │ │ └── workspace.xml │ ├── .~ana.rtf │ ├── ana.rtf │ ├── datas.json │ ├── huize_spider │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── huize.py │ │ │ └── string_utils.py │ ├── scrapy.cfg │ └── start.py ├── jianshu_spider │ ├── .idea │ │ ├── jianshu_spider.iml │ │ ├── misc.xml │ │ ├── modules.xml │ │ ├── vcs.xml │ │ └── workspace.xml │ ├── jianshu_spider │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ └── jianshu.py │ ├── raw │ │ ├── article.sql │ │ └── article_table.png │ └── scrapy.cfg ├── qczj │ ├── .idea │ │ ├── misc.xml │ │ ├── modules.xml │ │ ├── qczj.iml │ │ ├── vcs.xml │ │ └── workspace.xml │ ├── qczj │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ └── bmw5.py │ ├── readme.MD │ ├── scrapy.cfg │ └── start.py ├── qsbk │ ├── .idea │ │ ├── misc.xml │ │ ├── modules.xml │ │ ├── qsbk.iml │ │ ├── vcs.xml │ │ └── workspace.xml │ ├── duanzi.json │ ├── qsbk │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ └── spider_qsbk.py │ ├── readme.MD │ └── scrapy.cfg ├── sfw_spider │ ├── .idea │ │ ├── misc.xml │ │ ├── modules.xml │ │ ├── sfw.iml │ │ ├── vcs.xml │ │ └── workspace.xml │ ├── requirements.txt │ ├── scrapy.cfg │ ├── sfw │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ └── sfw_spider.py │ └── start.py └── weixin_community │ ├── .idea │ ├── misc.xml │ ├── modules.xml │ ├── vcs.xml │ ├── weixin_community.iml │ └── workspace.xml │ ├── readme.MD │ ├── scrapy.cfg │ └── weixin_community │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ ├── __init__.py │ └── wx_spider.py ├── spiders ├── film_xinpianchang │ ├── Film.py │ ├── models.py │ ├── tools_file.py │ └── tools_string.py ├── spider_bai_si_bu_de_jie.py ├── spider_boss.py ├── spider_china_weather.py ├── spider_dou_tu_la.py ├── spider_dytt.py ├── spider_gushiwen.py ├── spider_lagou.py ├── spider_qiu_shi_bai_ke.py ├── spider_tencent_recruit.py ├── 
发表情 │ ├── auto_send_emoji.py │ └── utils │ │ ├── chat_utils.py │ │ └── string_utils.py └── 年终奖 │ ├── comments.txt │ ├── nzj.py │ └── output.png ├── verification code └── 注册【中知网】 │ ├── AipOcr.py │ ├── cnki_demo.py │ ├── file_tools.py │ ├── image_code.png │ └── screen_shot.png ├── 微信聊天记录 ├── main.py └── utils │ ├── dbutils.py │ └── string_utils.py └── 获取女友的位置 ├── .idea ├── inspectionProfiles │ └── Project_Default.xml ├── misc.xml ├── modules.xml ├── vcs.xml ├── workspace.xml └── 地理位置.iml ├── main.py ├── picture └── 11441566648796_.pic_hd.jpg └── position_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | .DS_Store 104 | 105 | 106 | # mypy 107 | .mypy_cache/ 108 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/spider_python.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /ModifyLocation/gps_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache 
Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: gps_utils.py 12 | @time: 2019-11-17 10:34 13 | @description:TODO 14 | """ 15 | 16 | import math 17 | 18 | 19 | def gps_to_dms(gps_data): 20 | """ 21 | 坐标转为度、分、秒(double) 22 | 116.397451 23 | :param gps_data: 24 | :return: 25 | """ 26 | # 度:向下取整 27 | gps_degree = math.floor(gps_data) 28 | 29 | gps_data_temp1 = (gps_data - gps_degree) * 60 30 | 31 | # 分 32 | gps_minute = math.floor(gps_data_temp1) 33 | 34 | gps_data_temp2 = gps_data_temp1 - gps_minute 35 | 36 | # 秒,取小数点后4位 37 | gps_second = round(gps_data_temp2 * 60, 2) 38 | 39 | # 注意:秒必须转换为整形 40 | result = ((gps_degree, 1), (gps_minute, 1), (int(gps_second * 100), 100)) 41 | 42 | return result 43 | 44 | 45 | def dms_to_gps(dms_data): 46 | """ 47 | 度、分、秒转为坐标值(double) 48 | :param dms_data: 49 | :return: 50 | """ 51 | data1 = dms_data[0][0] / dms_data[0][1] 52 | 53 | data2 = dms_data[1][0] / dms_data[1][1] / 60 54 | 55 | data3 = dms_data[2][0] / dms_data[2][1] / 3600 56 | 57 | result = round(data1 + data2 + data3,6) 58 | 59 | return result 60 | -------------------------------------------------------------------------------- /ModifyLocation/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: main.py 12 | @time: 2019-11-16 10:12 13 | @description:修改图片地理位置 14 | """ 15 | 16 | import requests 17 | import time 18 | from PIL import Image 19 | import piexif 20 | import json 21 | from gps_utils import * 22 | from position_utils import * 23 | 24 | 25 | # 依赖:pip3 install piexif 26 | 27 | class Exif(): 28 | def __init__(self): 29 | self.time = '2019:11:17 14:13:22' 30 | 31 | # 地理编码(地址转为经纬度) 32 | self.url_geo = 'https://restapi.amap.com/v3/geocode/geo' 33 | 34 | # 逆地理编码(经纬度转为地址) 35 | self.url_regeo = 'https://restapi.amap.com/v3/geocode/regeo?parameters' 36 | 37 | # key 38 | self.ak = '你的ak' 39 | 40 | # 数字签名 41 | self.sign = '你的sign' 42 | 43 | def read_image(self, image_path): 44 | """ 45 | 开始处理图片 46 | exifread:读取图片属性 47 | :return: 48 | """ 49 | exif_dict = piexif.load(image_path) 50 | 51 | if exif_dict['GPS']: 52 | 53 | # 纬度 54 | gps_lati_pre = exif_dict['GPS'][2] 55 | 56 | gps_lati = dms_to_gps(gps_lati_pre) 57 | 58 | # 经度 59 | gps_long_pre = exif_dict['GPS'][4] 60 | gps_long = dms_to_gps(gps_long_pre) 61 | 62 | # GPS坐标转为高德坐标 63 | lng, lat = wgs84togcj02(gps_long, gps_lati) 64 | 65 | # print(lng, lat) 66 | 67 | print(f"原图地理位置如下\n经度:{lng}\n纬度:{lat}\n") 68 | 69 | return f'{lng}, {lat}' 70 | else: 71 | print(f'抱歉!这张图片不包含地理位置!') 72 | 73 | def current_time(self): 74 | """ 75 | 获取当前时间 76 | :return: 77 | """ 78 | time_now = time.strftime('%Y:%m:%d %H:%M:%S', time.localtime(time.time())) 79 | 80 | result = bytes(time_now, encoding='utf-8') 81 | 82 | return result 83 | 84 | def str_to_bytes(self, str_content): 85 | """ 86 | 字符串转bytes 87 | :return: 88 | """ 89 | return bytes(str_content, encoding='utf-8') 90 | 91 | def is_image(self, filename): 92 | """ 93 | 判断文件是否是一张图片 94 | :param filename: 95 | :return: 96 | """ 97 | file_suffix = filename.split('.')[-1] 98 | 99 | if file_suffix == 'jpg' or file_suffix == 'png': 100 | return True 101 | else: 102 | return False 103 | 104 | def write_image(self, image_path, gps_long, gps_lati): 105 | """ 106 | 修改文件夹下所有文件的属性 107 | :param image_path: 文件夹路径 108 | 
:return: 109 | """ 110 | # 读取图片 111 | img = Image.open(image_path) 112 | 113 | try: 114 | exif_dict = piexif.load(img.info['exif']) 115 | except: 116 | print('加载文件地理位置异常!') 117 | return 118 | 119 | # 修改地理位置 120 | # GPS GPSLatitudeRef:N 121 | # GPS GPSLatitude:[22, 32, 189/20] 122 | # GPS GPSLongitudeRef:E 123 | # GPS GPSLongitude:[114, 1, 689/20] 124 | exif_dict['GPS'][2] = gps_to_dms(gps_lati) 125 | exif_dict['GPS'][4] = gps_to_dms(gps_long) 126 | 127 | exif_bytes = piexif.dump(exif_dict) 128 | 129 | # 写入到新的图片中去 130 | img.save(image_path, 'jpeg', exif=exif_bytes) 131 | 132 | def get_address_by_location(self, location): 133 | """ 134 | 通过经纬度拿到地理位置 135 | :param location: 136 | :return: 137 | """ 138 | params = { 139 | 'key': self.ak, 140 | 'location': location, 141 | 'sig': self.sign 142 | } 143 | 144 | resp = json.loads(requests.get(url=self.url_regeo, params=params).text) 145 | 146 | if resp and resp.get('regeocode') and resp.get('regeocode').get('formatted_address'): 147 | address = resp.get('regeocode').get('formatted_address') 148 | print(f'原图的拍摄地址为:{address}\n') 149 | else: 150 | print('api解析地址出错,请检查ak!\n') 151 | 152 | def get_location_by_address(self, city, address): 153 | """ 154 | 通过地理位置到拿到经纬度 155 | 地理编码:https://lbs.amap.com/api/webservice/guide/api/georegeo/ 156 | :param address: 157 | :return: 158 | """ 159 | params = { 160 | 'key': self.ak, 161 | 'city': city, 162 | 'address': address, 163 | 'sig': self.sign 164 | } 165 | 166 | resp = json.loads(requests.get(url=self.url_geo, params=params).text) 167 | 168 | # 获取坐标地址 169 | if resp and len(resp.get('geocodes')) >= 1 and resp.get('geocodes')[0].get('location'): 170 | location = resp.get('geocodes')[0].get('location') 171 | gps_data = location.split(',') 172 | 173 | # 得到经度和纬度 174 | gps_long = float(gps_data[0]) 175 | gps_lati = float(gps_data[1]) 176 | 177 | return gps_long, gps_lati 178 | else: 179 | print('api解析地址出错,请检查ak!') 180 | return None 181 | 182 | 183 | if __name__ == '__main__': 184 | exif = Exif() 185 | 186 | image_path = './WechatIMG1439.jpeg' 187 | 188 | # 1、读取原图的属性 189 | location = exif.read_image(image_path) 190 | 191 | if location: 192 | # 2、原图的详细地址 193 | exif.get_address_by_location(location) 194 | 195 | # 3、输入地址(市+目的地,例如:深圳莲花山公园) 196 | city = input('请输入定位城市(例如:深圳):') 197 | address = input('请输入具体的定位地址(例如:莲花山公园):') 198 | 199 | if address: 200 | # 通过地址拿到坐标地址 201 | location = exif.get_location_by_address(city, address) 202 | 203 | if location: 204 | # 4、修改图片属性,写入经度和纬度 205 | exif.write_image(image_path, location[0], location[1]) 206 | print('修改图片地理成功!') 207 | else: 208 | print('请先输入具体地址!') 209 | -------------------------------------------------------------------------------- /ModifyLocation/position_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: position_utils.py 12 | @time: 2019-08-23 17:44 13 | @description:坐标转换 14 | """ 15 | 16 | # -*- coding: utf-8 -*- 17 | import math 18 | 19 | x_pi = 3.14159265358979324 * 3000.0 / 180.0 20 | pi = 3.1415926535897932384626 # π 21 | a = 6378245.0 # 长半轴 22 | ee = 0.00669342162296594323 # 扁率 23 | 24 | 25 | def wgs84togcj02(lng, lat): 26 | """ 27 | WGS84转GCJ02(火星坐标系) 28 | :param lng:WGS84坐标系的经度 29 | :param lat:WGS84坐标系的纬度 30 | :return: 31 | """ 32 | if out_of_china(lng, lat): # 判断是否在国内 33 | return lng, lat 34 | dlat = 
transformlat(lng - 105.0, lat - 35.0) 35 | dlng = transformlng(lng - 105.0, lat - 35.0) 36 | radlat = lat / 180.0 * pi 37 | magic = math.sin(radlat) 38 | magic = 1 - ee * magic * magic 39 | sqrtmagic = math.sqrt(magic) 40 | dlat = (dlat * 180.0) / ((a * (1 - ee)) / (magic * sqrtmagic) * pi) 41 | dlng = (dlng * 180.0) / (a / sqrtmagic * math.cos(radlat) * pi) 42 | mglat = lat + dlat 43 | mglng = lng + dlng 44 | return [mglng, mglat] 45 | 46 | 47 | def gcj02towgs84(lng, lat): 48 | """ 49 | GCJ02(火星坐标系)转GPS84 50 | :param lng:火星坐标系的经度 51 | :param lat:火星坐标系纬度 52 | :return: 53 | """ 54 | if out_of_china(lng, lat): 55 | return lng, lat 56 | dlat = transformlat(lng - 105.0, lat - 35.0) 57 | dlng = transformlng(lng - 105.0, lat - 35.0) 58 | radlat = lat / 180.0 * pi 59 | magic = math.sin(radlat) 60 | magic = 1 - ee * magic * magic 61 | sqrtmagic = math.sqrt(magic) 62 | dlat = (dlat * 180.0) / ((a * (1 - ee)) / (magic * sqrtmagic) * pi) 63 | dlng = (dlng * 180.0) / (a / sqrtmagic * math.cos(radlat) * pi) 64 | mglat = lat + dlat 65 | mglng = lng + dlng 66 | return [lng * 2 - mglng, lat * 2 - mglat] 67 | 68 | 69 | def transformlat(lng, lat): 70 | ret = -100.0 + 2.0 * lng + 3.0 * lat + 0.2 * lat * lat + \ 71 | 0.1 * lng * lat + 0.2 * math.sqrt(math.fabs(lng)) 72 | ret += (20.0 * math.sin(6.0 * lng * pi) + 20.0 * 73 | math.sin(2.0 * lng * pi)) * 2.0 / 3.0 74 | ret += (20.0 * math.sin(lat * pi) + 40.0 * 75 | math.sin(lat / 3.0 * pi)) * 2.0 / 3.0 76 | ret += (160.0 * math.sin(lat / 12.0 * pi) + 320 * 77 | math.sin(lat * pi / 30.0)) * 2.0 / 3.0 78 | return ret 79 | 80 | 81 | def transformlng(lng, lat): 82 | ret = 300.0 + lng + 2.0 * lat + 0.1 * lng * lng + \ 83 | 0.1 * lng * lat + 0.1 * math.sqrt(math.fabs(lng)) 84 | ret += (20.0 * math.sin(6.0 * lng * pi) + 20.0 * 85 | math.sin(2.0 * lng * pi)) * 2.0 / 3.0 86 | ret += (20.0 * math.sin(lng * pi) + 40.0 * 87 | math.sin(lng / 3.0 * pi)) * 2.0 / 3.0 88 | ret += (150.0 * math.sin(lng / 12.0 * pi) + 300.0 * 89 | math.sin(lng / 30.0 * pi)) * 2.0 / 3.0 90 | return ret 91 | 92 | 93 | def out_of_china(lng, lat): 94 | """ 95 | 判断是否在国内,不在国内不做偏移 96 | :param lng: 97 | :param lat: 98 | :return: 99 | """ 100 | if lng < 72.004 or lng > 137.8347: 101 | return True 102 | if lat < 0.8293 or lat > 55.8271: 103 | return True 104 | return False 105 | -------------------------------------------------------------------------------- /Python调用JAR/exec_jar_example.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: exec_jar_example.py 12 | @time: 2021-01-02 12:30 13 | @description:TODO 14 | """ 15 | 16 | import jpype 17 | import os 18 | 19 | # 初始化 20 | jar_path = os.path.join(os.path.abspath('.'), 'jar/encry.jar') 21 | 22 | print(jar_path) 23 | 24 | # 启动jvm 25 | jpype.startJVM(jpype.getDefaultJVMPath(), "-ea", "-Djava.class.path=%s" % (jar_path)) 26 | 27 | 28 | # 通过包名,实例化JAVA对象 29 | EncryClass = jpype.JClass("com.xingag.common.EncryHelper") 30 | encryClass = EncryClass() 31 | 32 | # 调用JAVA中的加密方法 33 | content_encry = encryClass.encrypt("xag") 34 | print(content_encry) 35 | 36 | # 关闭jvm 37 | jpype.shutdownJVM() 38 | -------------------------------------------------------------------------------- /Python调用JAR/jar/com/xingag/common/EncryHelper.class: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/Python调用JAR/jar/com/xingag/common/EncryHelper.class -------------------------------------------------------------------------------- /Python调用JAR/jar/encry.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/Python调用JAR/jar/encry.jar -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # spider_python 2 | 3 | ## 前言 4 | 5 | 如果想查看详细的教程,请关注微信公众号:**AirPython** 6 | 7 | ![](./raw/qr.jpeg) 8 | 9 | 10 | 11 | ## 普通的爬虫 12 | 13 | * [爬取电影天堂最新的电影数据 - xpath](./spiders/spider_dytt.py) 14 | 15 | * [爬取腾讯招聘的职位数据 - xpath](./spiders/spider_tencent_recruit.py) 16 | 17 | * [爬取中国天气网全国天气并生成饼状图 - bs4](./spiders/spider_china_weather.py) 18 | 19 | * [爬取古诗词网的数据 - re](./spiders/spider_gushiwen.py) 20 | 21 | * [爬取糗事百科上的段子数据 - re](./spiders/spider_qiu_shi_bai_ke.py) 22 | 23 | 24 | 25 | ## 多线程爬虫 26 | 27 | * [多线程爬取斗图吧的表情图并下载到本地 - xpath + threading](./spiders/spider_dou_tu_la.py) 28 | * [使用 itchat 发送表情到指定的人和微信群](./spiders/发表情/) 29 | * [多线程爬取百思不得姐的文字和图片信息并写入到csv中](./spiders/spider_bai_si_bu_de_jie.py) 30 | 31 | 32 | 33 | ## Selenium 自动化爬虫 34 | 35 | * [爬取拉勾网的职位信息 - selenium + requests + lxml ](./spiders/spider_lagou.py) 36 | 37 | * [爬取 Boss 直聘网的职位信息 - selenium + lxml](./spiders/spider_boss.py) 38 | 39 | 40 | 41 | ## Scrapy 框架爬虫 42 | * [爬取糗事百科的段子保存到 JSON 文件中](./scrapy/qsbk/readme.MD) 43 | * [爬取微信小程序论坛的数据](./scrapy/weixin_community/readme.MD) 44 | * [登录豆瓣网并修改个性签名](./scrapy/douban_login/readme.MD) 45 | * [下载汽车之家的高清图片到本地](./scrapy/qczj/readme.MD) 46 | * [爬取简书网所有文章数据](./scrapy/jianshu_spider/) 47 | * [爬取房天下所有房的数据,包含新房、二手房](./scrapy/sfw_spider) 48 | 49 | 50 | 51 | 52 | 53 | ## feapder 54 | 55 | * [feapder AirSpider实例](./feapder/tophub_demo) 56 | 57 | 58 | 59 | ## Node.js 爬虫 60 | 61 | * [使用 puppeteer 爬取简书文章并保存到本地](./js/jian_shu.js) 62 | 63 | 64 | 65 | ## 其他 66 | 67 | * [使用 Python 定位到女朋友的位置](./获取女友的位置) 68 | * [女朋友背着我,用 Python 偷偷隐藏了她的行踪](./ModifyLocation) 69 | * [微信群聊记录](./微信聊天记录) 70 | * [Python 调用 JAR](./Python调用JAR) 71 | 72 | -------------------------------------------------------------------------------- /feapder/tophub_demo/.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 56 | -------------------------------------------------------------------------------- /feapder/tophub_demo/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /feapder/tophub_demo/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /feapder/tophub_demo/.idea/tophub_demo.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /feapder/tophub_demo/items/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "topic_item" 3 | ] -------------------------------------------------------------------------------- 
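A minimal usage sketch for the feapder demo that follows (not part of the project source): it shows how the TopicItem defined in items/topic_item.py is populated before being handed to feapder's pipeline. The import path assumes the script is run from the tophub_demo project root; the MySQL table `topic` and its columns are inferred from the INSERT statement in spiders/tophub_spider.py.

```python
# Illustrative sketch only -- field names come from items/topic_item.py; the
# target MySQL table `topic` (title, auth, like_count, collection, comment)
# is inferred from the raw INSERT statement in spiders/tophub_spider.py.
from items.topic_item import TopicItem  # assumes tophub_demo/ is the working directory

item = TopicItem()
item.title = "demo title"   # article title
item.auth = "demo author"   # author name
item.like_count = 10        # number of likes
item.collection = 5         # number of bookmarks
item.comment = 2            # number of comments

print(item.title, item.auth, item.like_count, item.collection, item.comment)
```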
/feapder/tophub_demo/items/topic_item.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021-04-08 12:20:22 4 | --------- 5 | @summary: 6 | --------- 7 | @author: xingag 8 | """ 9 | 10 | from feapder import Item 11 | 12 | 13 | class TopicItem(Item): 14 | """ 15 | This class was generated by feapder. 16 | command: feapder create -i topic. 17 | """ 18 | 19 | def __init__(self, *args, **kwargs): 20 | # self.id = None 21 | self.title = None # 文章标题 22 | self.auth = None # 作者 23 | self.like_count = 0 # 喜欢数 24 | self.collection = 0 # 收藏数 25 | self.comment = 0 # 评论数 26 | -------------------------------------------------------------------------------- /feapder/tophub_demo/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021-04-08 11:57:08 4 | --------- 5 | @summary: 爬虫入口 6 | --------- 7 | @author: xingag 8 | """ 9 | 10 | from feapder import ArgumentParser 11 | 12 | from spiders import * 13 | 14 | 15 | def crawl_xxx(): 16 | """ 17 | 普通爬虫 18 | """ 19 | spider = xxx.XXXSpider(redis_key="xxx:xxx") 20 | spider.start() 21 | 22 | 23 | def crawl_xxx(args): 24 | """ 25 | 批次爬虫 26 | @param args: 1 / 2 / init 27 | """ 28 | spider = xxx_spider.XXXSpider( 29 | task_table="", # mysql中的任务表 30 | batch_record_table="", # mysql中的批次记录表 31 | batch_name="xxx(周全)", # 批次名字 32 | batch_interval=7, # 批次时间 天为单位 若为小时 可写 1 / 24 33 | task_keys=["id", "xxx"], # 需要获取任务表里的字段名,可添加多个 34 | redis_key="xxx:xxxx", # redis中存放request等信息的根key 35 | task_state="state", # mysql中任务状态字段 36 | ) 37 | 38 | if args == 1: 39 | spider.start_monitor_task() 40 | elif args == 2: 41 | spider.start() 42 | elif args == "init": 43 | spider.init_task() 44 | 45 | 46 | if __name__ == "__main__": 47 | parser = ArgumentParser(description="xxx爬虫") 48 | 49 | parser.add_argument( 50 | "--crawl_xxx", action="store_true", help="xxx", function=crawl_xxx 51 | ) 52 | parser.add_argument( 53 | "--crawl_xxx", type=int, nargs=1, help="xxx(1|2)", function=crawl_xxx 54 | ) 55 | 56 | parser.start() 57 | -------------------------------------------------------------------------------- /feapder/tophub_demo/setting.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """爬虫配置文件""" 3 | import os 4 | 5 | 6 | # MYSQL 7 | MYSQL_IP = "localhost" 8 | MYSQL_PORT = 3306 9 | MYSQL_DB = "xag" 10 | MYSQL_USER_NAME = "root" 11 | MYSQL_USER_PASS = "root" 12 | 13 | # REDIS 14 | # IP:PORT 15 | REDISDB_IP_PORTS = "xxx:6379" 16 | REDISDB_USER_PASS = "" 17 | # 默认 0 到 15 共16个数据库 18 | REDISDB_DB = 0 19 | # # 适用于redis哨兵模式 20 | # REDISDB_SERVICE_NAME = None 21 | # 22 | # # 数据入库的pipeline,可自定义,默认MysqlPipeline 23 | # ITEM_PIPELINES = ["feapder.pipelines.mysql_pipeline.MysqlPipeline"] 24 | # 25 | # # 爬虫相关 26 | # # COLLECTOR 27 | # COLLECTOR_SLEEP_TIME = 1 # 从任务队列中获取任务到内存队列的间隔 28 | # COLLECTOR_TASK_COUNT = 100 # 每次获取任务数量 29 | # 30 | # # SPIDER 31 | # SPIDER_THREAD_COUNT = 100 # 爬虫并发数 32 | # SPIDER_SLEEP_TIME = 0 # 下载时间间隔(解析完一个response后休眠时间) 33 | # SPIDER_MAX_RETRY_TIMES = 100 # 每个请求最大重试次数 34 | # WARNING_FAILED_COUNT = 1000 # 任务失败数 超过WARNING_FAILED_COUNT则报警 35 | # 36 | # # 浏览器渲染下载 37 | # WEBDRIVER = dict( 38 | # pool_size=2, # 浏览器的数量 39 | # load_images=False, # 是否加载图片 40 | # user_agent=None, # 字符串 或 无参函数,返回值为user_agent 41 | # proxy=None, # xxx.xxx.xxx.xxx:xxxx 或 无参函数,返回值为代理地址 42 | # headless=False, # 是否为无头浏览器 43 | # driver_type="CHROME", # CHROME 或 PHANTOMJS, 44 | # 
timeout=30, # 请求超时时间 45 | # window_size=(1024, 800), # 窗口大小 46 | # executable_path=None, # 浏览器路径,默认为默认路径 47 | # ) 48 | # 49 | # # 重新尝试失败的requests 当requests重试次数超过允许的最大重试次数算失败 50 | # RETRY_FAILED_REQUESTS = False 51 | # # request 超时时间,超过这个时间重新做(不是网络请求的超时时间)单位秒 52 | # REQUEST_TIME_OUT = 600 # 10分钟 53 | # # 保存失败的request 54 | # SAVE_FAILED_REQUEST = True 55 | # 56 | # # 下载缓存 利用redis缓存,由于内存小,所以仅供测试时使用 57 | # RESPONSE_CACHED_ENABLE = False # 是否启用下载缓存 成本高的数据或容易变需求的数据,建议设置为True 58 | # RESPONSE_CACHED_EXPIRE_TIME = 3600 # 缓存时间 秒 59 | # RESPONSE_CACHED_USED = False # 是否使用缓存 补采数据时可设置为True 60 | # 61 | # # 爬虫是否自动结束,若为False,则会等待新任务下发,进程不退出 62 | # AUTO_STOP_WHEN_SPIDER_DONE = True 63 | # 64 | # # 设置代理 65 | # PROXY_EXTRACT_API = None # 代理提取API ,返回的代理分割符为\r\n 66 | # PROXY_ENABLE = True 67 | # 68 | # # 随机headers 69 | # RANDOM_HEADERS = True 70 | # # requests 使用session 71 | # USE_SESSION = False 72 | # 73 | # # 去重 74 | # ITEM_FILTER_ENABLE = False # item 去重 75 | # REQUEST_FILTER_ENABLE = False # request 去重 76 | # 77 | # # 报警 支持钉钉及邮件,二选一即可 78 | # # 钉钉报警 79 | # DINGDING_WARNING_URL = "" # 钉钉机器人api 80 | # DINGDING_WARNING_PHONE = "" # 报警人 支持列表,可指定多个 81 | # # 邮件报警 82 | # EAMIL_SENDER = "" # 发件人 83 | # EAMIL_PASSWORD = "" # 授权码 84 | # EMAIL_RECEIVER = "" # 收件人 支持列表,可指定多个 85 | # # 报警时间间隔及级别 86 | # WARNING_INTERVAL = 3600 # 相同报警的报警时间间隔,防止刷屏 87 | # WARNING_LEVEL = "DEBUG" # 报警级别, DEBUG / ERROR 88 | # 89 | # LOG_NAME = os.path.basename(os.getcwd()) 90 | # LOG_PATH = "log/%s.log" % LOG_NAME # log存储路径 91 | # LOG_LEVEL = "DEBUG" 92 | # LOG_IS_WRITE_TO_FILE = False 93 | -------------------------------------------------------------------------------- /feapder/tophub_demo/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | "tophub_spider" 3 | ] -------------------------------------------------------------------------------- /feapder/tophub_demo/spiders/tophub_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on 2021-04-08 12:03:28 4 | --------- 5 | @summary: 6 | --------- 7 | @author: xingag 8 | """ 9 | 10 | import re 11 | 12 | import feapder 13 | from fake_useragent import UserAgent 14 | from feapder.db.mysqldb import MysqlDB 15 | 16 | 17 | # 爬取数据并入库 18 | 19 | class TophubSpider(feapder.AirSpider): 20 | 21 | def __init__(self, *args, **kwargs): 22 | super().__init__(*args, **kwargs) 23 | self.db = MysqlDB() 24 | 25 | def start_requests(self): 26 | yield feapder.Request("https://tophub.today/", download_midware=self.download_midware) 27 | 28 | def parse(self, request, response): 29 | # print(response.text) 30 | card_elements = response.xpath('//div[@class="cc-cd"]') 31 | 32 | # 过滤出对应的卡片元素【什么值得买】 33 | buy_good_element = [card_element for card_element in card_elements if 34 | card_element.xpath('.//div[@class="cc-cd-is"]//span/text()').extract_first() == '什么值得买'][0] 35 | 36 | # 获取内部文章标题及地址 37 | a_elements = buy_good_element.xpath('.//div[@class="cc-cd-cb nano"]//a') 38 | 39 | for a_element in a_elements: 40 | # 标题和链接 41 | title = a_element.xpath('.//span[@class="t"]/text()').extract_first() 42 | href = a_element.xpath('.//@href').extract_first() 43 | 44 | # 再次下发新任务,并带上文章标题 45 | yield feapder.Request(href, download_midware=self.download_midware, callback=self.parser_detail_page, 46 | title=title) 47 | 48 | def parser_detail_page(self, request, response): 49 | """ 50 | 解析文章详情数据 51 | :param request: 52 | :param response: 53 | :return: 54 | """ 55 | title = request.title 
56 | 57 | url = request.url 58 | 59 | # 解析文章详情页面,获取点赞、收藏、评论数目及作者名称 60 | author = response.xpath('//a[@class="author-title"]/text()').extract_first().strip() 61 | 62 | print("作者:", author, '文章标题:', title, "地址:", url) 63 | 64 | desc_elements = response.xpath('//span[@class="xilie"]/span') 65 | 66 | print("desc数目:", len(desc_elements)) 67 | 68 | # 点赞 69 | like_count = int(re.findall('\d+', desc_elements[1].xpath('./text()').extract_first())[0]) 70 | # 收藏 71 | collection_count = int(re.findall('\d+', desc_elements[2].xpath('./text()').extract_first())[0]) 72 | # 评论 73 | comment_count = int(re.findall('\d+', desc_elements[3].xpath('./text()').extract_first())[0]) 74 | 75 | print("点赞:", like_count, "收藏:", collection_count, "评论:", comment_count) 76 | 77 | # 插入数据库 78 | sql = "INSERT INTO topic(title,auth,like_count,collection,comment) values('%s','%s','%s','%d','%d')" % ( 79 | title, author, like_count, collection_count, comment_count) 80 | 81 | # 执行 82 | self.db.execute(sql) 83 | 84 | def download_midware(self, request): 85 | # 随机UA 86 | # 依赖:pip3 install fake_useragent 87 | ua = UserAgent().random 88 | request.headers = {'User-Agent': ua} 89 | return request 90 | 91 | 92 | if __name__ == "__main__": 93 | TophubSpider(thread_count=10).start() 94 | -------------------------------------------------------------------------------- /feapder/tophub_demo/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: test.py 12 | @time: 2021/4/8 下午12:26 13 | @description:TODO 14 | """ 15 | 16 | from fake_useragent import UserAgent 17 | 18 | ua = UserAgent().random 19 | print(ua) -------------------------------------------------------------------------------- /js/jian_shu.js: -------------------------------------------------------------------------------- 1 | //简书上的文章保存为pdf保存到本地 2 | const puppeteer = require('puppeteer'); 3 | 4 | const mkdirp = require('mkdirp'); 5 | 6 | BASE_URL = 'https://www.jianshu.com'; 7 | 8 | HOME_URL = `${BASE_URL}/u/f46becd1ed83`; 9 | 10 | //文章目录 11 | const ARTICLE_PATH = './monkey'; 12 | 13 | const download_article = async () => { 14 | 15 | const viewport_size = { 16 | width: 0, 17 | height: 0, 18 | }; 19 | 20 | const browser = await puppeteer.launch({ 21 | headless: true, 22 | }); 23 | 24 | const page = await browser.newPage(); 25 | 26 | page.setViewport(viewport_size); 27 | 28 | //打开文章主页 29 | await page.goto(HOME_URL); 30 | 31 | console.log('显示文章列表,马上开始滑动') 32 | 33 | //滑动文章列表,使所有文章被加载出来 34 | //参考:https://github.com/GoogleChrome/puppeteer/issues/844 35 | await autoScroll(page); 36 | 37 | console.log('所有文章加载完成'); 38 | 39 | const articles = await page.$eval('.note-list', articles_element => { 40 | const article_elements = articles_element.querySelectorAll('li'); 41 | const articleElementArray = Array.prototype.slice.call(article_elements); 42 | 43 | return articleElementArray.map(item => { 44 | const a_element = item.querySelector('.title'); 45 | return { 46 | href: a_element.getAttribute('href'), 47 | title: a_element.innerHTML.trim(), 48 | }; 49 | }); 50 | }); 51 | 52 | console.log(`大佬一共发布了${articles.length}篇文章`); 53 | 54 | 55 | //新建目录 56 | mkdirp.sync(ARTICLE_PATH); 57 | 58 | for (let article of articles) { 59 | const articlePage = await browser.newPage(); 60 | articlePage.setViewport(viewport_size); 61 | 
articlePage.goto(`${BASE_URL}${article.href}`, { 62 | waitUntil: 'networkidle2' 63 | }); 64 | 65 | articlePage.waitForSelector('.post'); 66 | console.log('文章详情页面加载完成'); 67 | 68 | //注意:这里必须等待几秒,不然下面的滑动会报错: 69 | // UnhandledPromiseRejectionWarning: Error: Execution context was destroyed, most likely because of a navigation. 70 | await articlePage.waitFor(2000); 71 | 72 | //滑动到最底部,加载出所有的图片 73 | await autoScroll(articlePage); 74 | 75 | 76 | //为了保证页面的整洁干净,屏蔽多余的元素 77 | await articlePage.$eval('body', body => { 78 | body.querySelector('.navbar').style.display = 'none'; 79 | body.querySelector('#note-fixed-ad-container').style.display = 'none'; 80 | body.querySelector('.note-bottom').style.display = 'none'; 81 | body.querySelector('.side-tool').style.display = 'none'; 82 | // body.querySelector('.author').style.display = 'none'; 83 | body.querySelector('.meta-bottom').style.display = 'none'; 84 | body.querySelector('#web-note-ad-1').style.display = 'none'; 85 | body.querySelector('#comment-list').style.display = 'none'; 86 | body.querySelector('.follow-detail').style.display = 'none'; 87 | body.querySelector('.show-foot').style.display = 'none'; 88 | 89 | Promise.resolve(); 90 | }); 91 | 92 | //文章名称 93 | const fileName = `${article.title.replace("/\\//g", "、")}.pdf`; 94 | const fileFullPath = `${ARTICLE_PATH}/${fileName}`; 95 | console.log(`文章保存的完整路径是:${fileFullPath}`); 96 | 97 | await page.emulateMedia('screen'); 98 | await articlePage.pdf({ 99 | path: fileFullPath, 100 | format: 'A4' 101 | }); 102 | console.log(`保存成功: ${fileFullPath}`); 103 | articlePage.close(); 104 | } 105 | 106 | console.log('下载完成!Enjoy~'); 107 | }; 108 | 109 | function autoScroll(page) { 110 | return page.evaluate(() => { 111 | return new Promise((resolve, reject) => { 112 | var totalHeight = 0; 113 | var distance = 100; 114 | var timer = setInterval(() => { 115 | console.log('执行间断函数'); 116 | var scrollHeight = document.body.scrollHeight; 117 | window.scrollBy(0, distance); 118 | totalHeight += distance; 119 | 120 | if (totalHeight >= scrollHeight) { 121 | console.log('滑动到底'); 122 | clearInterval(timer); 123 | resolve(); 124 | } 125 | }, 100); 126 | }) 127 | }); 128 | } 129 | 130 | 131 | module.exports = download_article; 132 | 133 | if (require.main === module) { 134 | download_article() 135 | } 136 | 137 | 138 | -------------------------------------------------------------------------------- /pic/最低气温排行榜.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/pic/最低气温排行榜.png -------------------------------------------------------------------------------- /raw/qr.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/raw/qr.jpeg -------------------------------------------------------------------------------- /scrapy/douban_login/.idea/douban_login.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /scrapy/douban_login/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /scrapy/douban_login/.idea/modules.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /scrapy/douban_login/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /scrapy/douban_login/captcha.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/scrapy/douban_login/captcha.png -------------------------------------------------------------------------------- /scrapy/douban_login/douban_login/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/scrapy/douban_login/douban_login/__init__.py -------------------------------------------------------------------------------- /scrapy/douban_login/douban_login/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class DoubanLoginItem(scrapy.Item): 12 | # define the fields for your item here like: 13 | # name = scrapy.Field() 14 | pass 15 | -------------------------------------------------------------------------------- /scrapy/douban_login/douban_login/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class DoubanLoginSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class DoubanLoginDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /scrapy/douban_login/douban_login/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | 9 | class DoubanLoginPipeline(object): 10 | def process_item(self, item, spider): 11 | return item 12 | -------------------------------------------------------------------------------- /scrapy/douban_login/douban_login/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for douban_login project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'douban_login' 13 | 14 | SPIDER_MODULES = ['douban_login.spiders'] 15 | NEWSPIDER_MODULE = 'douban_login.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | # USER_AGENT = 'douban_login (+http://www.yourdomain.com)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | # CONCURRENT_REQUESTS = 32 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | DOWNLOAD_DELAY = 1 30 | # The download del、ay、 setting will honor only one of: 31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 32 | # CONCURRENT_REQUESTS_PER_IP = 16 33 | 34 | # Disable cookies (enabled by default) 35 | # COOKIES_ENABLED = False 36 | 37 | # Disable Telnet Console (enabled by default) 38 | # TELNETCONSOLE_ENABLED = False 39 | 40 | # Override the default request headers: 41 | DEFAULT_REQUEST_HEADERS = { 42 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 43 | 'Accept-Language': 'en', 44 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' 45 | } 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | # SPIDER_MIDDLEWARES = { 50 | # 'douban_login.middlewares.DoubanLoginSpiderMiddleware': 543, 51 | # } 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | # DOWNLOADER_MIDDLEWARES = { 56 | # 'douban_login.middlewares.DoubanLoginDownloaderMiddleware': 543, 57 | # } 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | # EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | # } 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'douban_login.pipelines.DoubanLoginPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | # AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | # AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | # AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | # AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | # HTTPCACHE_ENABLED = True 87 | # HTTPCACHE_EXPIRATION_SECS = 0 88 | # HTTPCACHE_DIR = 'httpcache' 89 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | 
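# ---------------------------------------------------------------------------
# Illustrative sketch (not part of this project's settings.py): a single
# spider can override the project-wide values defined above -- for example
# DOWNLOAD_DELAY or DEFAULT_REQUEST_HEADERS -- via Scrapy's `custom_settings`
# class attribute. The spider name and URL below are placeholders, kept
# commented out in the same style as the optional settings above.
#
# import scrapy
#
# class ExampleSpider(scrapy.Spider):
#     name = "example"
#     custom_settings = {
#         "DOWNLOAD_DELAY": 3,  # takes precedence over the module-level value
#     }
#     start_urls = ["https://example.com"]
#
#     def parse(self, response):
#         self.logger.info("status: %s", response.status)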
-------------------------------------------------------------------------------- /scrapy/douban_login/douban_login/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /scrapy/douban_login/douban_login/spiders/douban.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from urllib import request 4 | from PIL import Image 5 | import ssl 6 | 7 | 8 | # 使用Scrapy登录豆瓣网 9 | # 验证码识别可以通过手动输入【PIL】和自动识别 10 | 11 | class DoubanSpider(scrapy.Spider): 12 | name = 'douban' 13 | allowed_domains = ['douban.com'] 14 | 15 | # 默认首先请求这个地址【GET】,然后把请求结果返回给parse()函数解析 16 | start_urls = ['https://accounts.douban.com/login'] 17 | 18 | # 登录url 19 | login_url = 'https://accounts.douban.com/login' 20 | 21 | # 个人中心url 22 | person_center_url = 'https://www.douban.com/people/165725759/' 23 | 24 | # 编辑签名的请求地址 25 | edit_signature = 'https://www.douban.com/j/people/165725759/edit_signature' 26 | 27 | def parse(self, response): 28 | """ 29 | 请求后的解析 30 | 包含两种情况:1.第一次请求start_urls;2.某一次请求不包含callback 31 | :param response: 32 | :return: 33 | """ 34 | # 注意:把最后的请求解析过滤掉 35 | # 如果解析到相应地址不是login_url就不做处理 36 | if response.url != self.login_url: 37 | return 38 | 39 | print('调用parse函数,此时的url:%s' % response.url) 40 | form_data = { 41 | 'source': 'index_nav', 42 | 'redir': 'https://www.douban.com/', # 登录后跳转到哪个界面 43 | 'form_email': '18520876423', 44 | 'form_password': 'Hu881025', 45 | # 'captcha-solution': 'chemical', # 验证码【需要识别图片】 46 | # 'captcha-id': 'ysCwMdnnq8YVpDJZdfmzHu1V:en', # 验证码ID 【每次刷新都重新生成一个,放入到input标签的name为captcha-id的value中】 47 | 'remember': 'on', 48 | 'login': '登录' 49 | } 50 | 51 | # 获取id为captcha-id的img标签【css方式,也可以选择用xpath】 52 | # 验证码图片的url 53 | captcha_img = response.css('img#captcha_image::attr(src)').get() 54 | 55 | # 注意:如果存在验证码,就识别验证码;如果没有验证码,不传入以下两个参数直接登录 56 | if captcha_img: 57 | # 手动识别验证码 58 | captcha = self._regonize_captcha(captcha_img) 59 | form_data['captcha-solution'] = captcha 60 | 61 | # 验证码id【每次刷新都会变化】 62 | captcha_id = response.xpath('//input[@name="captcha-id"]/@value').get() 63 | form_data['captcha-id'] = captcha_id 64 | print('带有验证码的参数已经补充完整,现在开始发送请求') 65 | else: 66 | print('没有验证码,现在开始发送请求') 67 | 68 | # 发送登录请求【POST】 69 | yield scrapy.FormRequest(url=self.login_url, formdata=form_data, callback=self.parse_after_login) 70 | 71 | def _regonize_captcha(self, image_url): 72 | """ 73 | 人工识别验证码【urllib+PIL】 74 | :param image_url: 75 | :return: 76 | """ 77 | print('验证码的地址:%s,开始下载图片' % image_url) 78 | 79 | # 下载图片到本地 80 | request.urlretrieve(image_url, 'captcha.png') 81 | 82 | print('下载图片完成,开始显示图片') 83 | 84 | # 显示在控制台,手动输入验证码 85 | # 打开图片 86 | image = Image.open('captcha.png') 87 | # 展示 88 | image.show() 89 | 90 | # 提示输入验证码 91 | captcha = input('请输入验证码:') 92 | 93 | return captcha 94 | 95 | def parse_after_login(self, response): 96 | """ 97 | 登录成功之后,请求【个人中心】 98 | :param response: 99 | :return: 100 | """ 101 | # 当前url 102 | current_page_url = response.url 103 | print('调用登录接口后,现在的界面是:%s' % current_page_url) 104 | if current_page_url == 'https://www.douban.com/': 105 | print('登录成功') 106 | # 请求个人中心的页面 107 | request = scrapy.Request(url=self.person_center_url, callback=self.parse_person_center) 108 | yield request 109 | else: 110 | 
print('登录失败') 111 | 112 | def parse_person_center(self, response): 113 | """ 114 | 解析个人中心页面 115 | :param response: 116 | :return: 117 | """ 118 | if response.url == self.person_center_url: 119 | print('进入到个人中心页面了') 120 | ck = response.xpath('//input[@name="ck"]/@value').get() 121 | print('获取的ck是:%s' % ck) 122 | formdata = { 123 | 'ck': ck, 124 | 'signature': '时光如水,岁月如斯' 125 | } 126 | # 发送post请求来更改签名 127 | yield scrapy.FormRequest(self.edit_signature, formdata=formdata) 128 | else: 129 | print('进入个人中心页面失败') 130 | -------------------------------------------------------------------------------- /scrapy/douban_login/readme.MD: -------------------------------------------------------------------------------- 1 | # 使用scrapy登录豆瓣网 2 | ### 准备 3 | 4 | ``` 5 | scrapy startproject douban_login 6 | cd douban_login 7 | scrapy genspider douban "douban.com" 8 | ``` 9 | 10 | 11 | 12 | ### 配置 13 | 14 | 配置 `settings.py` 文件 15 | 16 | 编写 `start.py` 文件,利用 `cmdline` 快速指定爬虫代码 17 | 18 | 19 | 20 | ### 开发 21 | 22 | 场景:使用 `scrapy` 登录豆瓣网,然后到个人中心页面,修改个性签名 23 | 24 | 请求:初始请求【GET】、登录请求【POST】、个人中心请求【GET】、修改签名请求【POST】 25 | 26 | 注意: 27 | 28 | 1. 初始请求的地址是:`start_urls` 29 | 2. 使用 `urllib + PIL` 下载验证码图片,并人工识别验证码【可以付费调用识别验证码的接口】 30 | 3. `captcha-id` 和 `ck` 两个请求参数都在源码中的某个元素里 31 | 32 | 33 | 34 | ### 运行 35 | 36 | 运行 `start.py` 37 | 38 | -------------------------------------------------------------------------------- /scrapy/douban_login/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = douban_login.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = douban_login 12 | -------------------------------------------------------------------------------- /scrapy/douban_login/start.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: start.py 12 | @time: 11/15/18 21:04 13 | @description:方便执行 Python 文件【执行一个 Python 文件】 14 | """ 15 | from scrapy import cmdline 16 | 17 | cmdline.execute('scrapy crawl douban'.split()) 18 | -------------------------------------------------------------------------------- /scrapy/huize_spider/.idea/huize_spider.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /scrapy/huize_spider/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /scrapy/huize_spider/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /scrapy/huize_spider/.~ana.rtf: -------------------------------------------------------------------------------- 1 | xingagxingag -------------------------------------------------------------------------------- /scrapy/huize_spider/ana.rtf: -------------------------------------------------------------------------------- 1 | 
{\rtf1\ansi\ansicpg936\cocoartf1671\cocoasubrtf100 2 | {\fonttbl\f0\fswiss\fcharset0 Helvetica;\f1\fnil\fcharset134 PingFangSC-Regular;} 3 | {\colortbl;\red255\green255\blue255;} 4 | {\*\expandedcolortbl;;} 5 | \margl1440\margr1440\vieww10800\viewh13080\viewkind0 6 | \pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardirnatural\partightenfactor0 7 | 8 | \f0\fs24 \cf0 http://www.huize.com/\ 9 | \ 10 | 11 | \f1 \'bd\'a1\'bf\'b5\'b1\'a3\'cf\'d5\ 12 | 1.\'d6\'d8\'bc\'b2 13 | \f0 \ 14 | http://www.huize.com/product/ins-2059-0-0\ 15 | 2. 16 | \f1 \'d7\'a1\'d4\'ba\'d2\'bd\'c1\'c6 17 | \f0 \ 18 | http://www.huize.com/product/ins-2058-0-0\ 19 | \ 20 | \ 21 | 22 | \f1 \'c8\'cb\'ca\'d9\'b1\'a3\'cf\'d5 23 | \f0 \ 24 | 1. 25 | \f1 \'c8\'cb\'ca\'d9\'b1\'a3\'d5\'cf 26 | \f0 \ 27 | http://www.huize.com/product/ins-2060-0-0\ 28 | 2. 29 | \f1 \'c4\'ea\'bd\'f0\'b1\'a3\'cf\'d5 30 | \f0 \ 31 | http://www.huize.com/product/ins-2101-0-0\ 32 | \ 33 | \ 34 | 35 | \f1 \'b6\'f9\'cd\'af\'b1\'a3\'cf\'d5 36 | \f0 \ 37 | 1. 38 | \f1 \'b6\'f9\'cd\'af\'d6\'d8\'bb\'f7 39 | \f0 \ 40 | http://www.huize.com/product/ins-2043-0-0\ 41 | 2. 42 | \f1 \'b6\'f9\'cd\'af\'d2\'bd\'c1\'c6 43 | \f0 \ 44 | http://www.huize.com/product/ins-2044-0-0\ 45 | 3. 46 | \f1 \'b6\'f9\'cd\'af\'d2\'e2\'cd\'e2\ 47 | http://www.huize.com/product/ins-2042-0-0\ 48 | 4.\'bd\'cc\'d3\'fd\'b4\'a2\'d0\'ee\ 49 | http://www.huize.com/product/ins-2057-0-0\ 50 | \ 51 | \ 52 | \'d2\'e2\'cd\'e2\'b1\'a3\'cf\'d5\ 53 | 1.\'bd\'bb\'cd\'a8\'d2\'e2\'cd\'e2\ 54 | http://www.huize.com/product/ins-2082-0-0\ 55 | 2.\'d7\'db\'ba\'cf\'d2\'e2\'cd\'e2\ 56 | http://www.huize.com/product/ins-2049-0-0\ 57 | \ 58 | } -------------------------------------------------------------------------------- /scrapy/huize_spider/huize_spider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/scrapy/huize_spider/huize_spider/__init__.py -------------------------------------------------------------------------------- /scrapy/huize_spider/huize_spider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class HuizeSpiderItem(scrapy.Item): 12 | title = scrapy.Field() 13 | sales = scrapy.Field() 14 | tips = scrapy.Field() 15 | price = scrapy.Field() 16 | url = scrapy.Field() 17 | -------------------------------------------------------------------------------- /scrapy/huize_spider/huize_spider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class HuizeSpiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 
19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class HuizeSpiderDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /scrapy/huize_spider/huize_spider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | from scrapy.exporters import JsonLinesItemExporter 9 | 10 | 11 | class HuizeSpiderPipeline(object): 12 | 13 | def __init__(self): 14 | self.fp = open('datas.json', 'wb') 15 | 16 | self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False) 17 | 18 | def process_item(self, item, spider): 19 | self.exporter.export_item(item) 20 | return item 21 | 22 | def close_spider(self, spider): 23 | # 关闭文件 24 | self.fp.close() 25 | -------------------------------------------------------------------------------- /scrapy/huize_spider/huize_spider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for huize_spider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'huize_spider' 13 | 14 | SPIDER_MODULES = ['huize_spider.spiders'] 15 | NEWSPIDER_MODULE = 'huize_spider.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | # USER_AGENT = 'huize_spider (+http://www.yourdomain.com)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | # CONCURRENT_REQUESTS = 32 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | DOWNLOAD_DELAY = 1 30 | # The download delay setting will honor only one of: 31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 32 | # CONCURRENT_REQUESTS_PER_IP = 16 33 | 34 | # Disable cookies (enabled by default) 35 | # COOKIES_ENABLED = False 36 | 37 | # Disable Telnet Console (enabled by default) 38 | # TELNETCONSOLE_ENABLED = False 39 | 40 | # Override the default request headers: 41 | DEFAULT_REQUEST_HEADERS = { 42 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 43 | 'Accept-Language': 'en', 44 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' 45 | } 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | # SPIDER_MIDDLEWARES = { 50 | # 'huize_spider.middlewares.HuizeSpiderSpiderMiddleware': 543, 51 | # } 52 | 53 | # Enable or disable downloader middlewares 54 | # See 
https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | # DOWNLOADER_MIDDLEWARES = { 56 | # 'huize_spider.middlewares.HuizeSpiderDownloaderMiddleware': 543, 57 | # } 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | # EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | # } 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'huize_spider.pipelines.HuizeSpiderPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | # AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | # AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | # AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | # AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | # HTTPCACHE_ENABLED = True 87 | # HTTPCACHE_EXPIRATION_SECS = 0 88 | # HTTPCACHE_DIR = 'httpcache' 89 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | -------------------------------------------------------------------------------- /scrapy/huize_spider/huize_spider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
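Note: HuizeSpiderPipeline above writes one JSON object per line into datas.json (JSON Lines format), and the settings register it under ITEM_PIPELINES. A minimal sketch for reading those records back, not part of the repository, assuming the field names defined in HuizeSpiderItem (title, sales, tips, price, url):

import json

# Each line of datas.json is a standalone JSON object written by JsonLinesItemExporter.
with open('datas.json', 'r', encoding='utf-8') as fp:
    for line in fp:
        line = line.strip()
        if not line:
            continue
        record = json.loads(line)
        # Field names follow HuizeSpiderItem: title, sales, tips, price, url
        print(record.get('title'), record.get('price'), record.get('url'))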
5 | -------------------------------------------------------------------------------- /scrapy/huize_spider/huize_spider/spiders/huize.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.linkextractors import LinkExtractor 4 | from scrapy.spiders import CrawlSpider, Rule 5 | from huize_spider.items import HuizeSpiderItem 6 | from .string_utils import remove_space_words 7 | 8 | 9 | # 使用 CrawlSpider 爬取某保险网的数据 10 | 11 | class HuizeSpider(CrawlSpider): 12 | name = 'huize' 13 | allowed_domains = ['huize.com'] 14 | start_urls = ['http://huize.com/'] 15 | 16 | rules = ( 17 | Rule(LinkExtractor(allow=r'.*http://www.huize.com/product/ins-.*'), callback=None, follow=False), 18 | Rule(LinkExtractor(allow=r'.*http://www.huize.com/product/detail-.*'), callback='parse_detail', follow=False), 19 | ) 20 | 21 | def parse_detail(self, response): 22 | # 标题 23 | title = response.xpath('//h2[@class="product-title f30"]/text()').get().strip() 24 | 25 | # 销量 26 | sales = response.xpath('//p[@class="count-item fc6"]/text()').get().strip() 27 | 28 | # 保险特色 29 | # 去掉特殊空格符号 30 | tips = remove_space_words("、".join(response.xpath('//li[@class="ensure-support-item"]/text()').getall())) 31 | 32 | # 价格 33 | price = response.xpath('//span[@class="product-price"]/i[@class="preminum-result"]/text()').get()+" 元" 34 | 35 | item = HuizeSpiderItem(title=title, sales=sales, tips=tips, price=price, url=response.url) 36 | 37 | yield item 38 | -------------------------------------------------------------------------------- /scrapy/huize_spider/huize_spider/spiders/string_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: string_utils.py 12 | @time: 12/4/18 19:52 13 | @description:TODO 14 | """ 15 | 16 | 17 | def remove_space_words(source): 18 | """ 19 | 去掉字符串中的特殊空格,包含\n、\t、\xa0 20 | :param source: 21 | :return: 22 | """ 23 | result = "".join(source.split()) 24 | return result 25 | -------------------------------------------------------------------------------- /scrapy/huize_spider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = huize_spider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = huize_spider 12 | -------------------------------------------------------------------------------- /scrapy/huize_spider/start.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: start.py 12 | @time: 11/15/18 21:04 13 | @description:方便执行 Python 文件【执行一个 Python 文件】 14 | """ 15 | from scrapy import cmdline 16 | 17 | cmdline.execute('scrapy crawl huize'.split()) 18 | 19 | -------------------------------------------------------------------------------- /scrapy/jianshu_spider/.idea/jianshu_spider.iml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /scrapy/jianshu_spider/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /scrapy/jianshu_spider/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /scrapy/jianshu_spider/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /scrapy/jianshu_spider/jianshu_spider/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/scrapy/jianshu_spider/jianshu_spider/__init__.py -------------------------------------------------------------------------------- /scrapy/jianshu_spider/jianshu_spider/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | # 文章详情Item 12 | class ArticleItem(scrapy.Item): 13 | title = scrapy.Field() 14 | content = scrapy.Field() 15 | # 文章id 16 | article_id = scrapy.Field() 17 | # 原始的url 18 | origin_url = scrapy.Field() 19 | 20 | # 作者 21 | author = scrapy.Field() 22 | 23 | # 头像 24 | avatar = scrapy.Field() 25 | 26 | # 发布时间 27 | pubtime = scrapy.Field() 28 | -------------------------------------------------------------------------------- /scrapy/jianshu_spider/jianshu_spider/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class JianshuSpiderSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 
41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class JianshuSpiderDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /scrapy/jianshu_spider/jianshu_spider/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | # 爬取到数据后,保存到Mysql数据中 9 | 10 | import pymysql 11 | 12 | 13 | class JianshuSpiderPipeline(object): 14 | 15 | def __init__(self): 16 | db_params = { 17 | 'host': '127.0.0.1', 18 | 'port': 3306, 19 | 'user': 'root', 20 | 'password': 'root', 21 | 'database': 'jianshu', 22 | 'charset': 'utf8' 23 | } 24 | 25 | # 数据库【连接对象】 26 | self.conn = pymysql.connect(**db_params) 27 | 28 | # 数据库【游标对象】【操作数据库】 29 | self.cursor = self.conn.cursor() 30 | 31 | # sql语句 32 | self._sql = """ 33 | insert into article(id,title,content,author,avatar,pubtime,article_id,origin_url) 34 | values(null,%s,%s,%s,%s,%s,%s,%s) 35 | """ 36 | 37 | def process_item(self, item, spider): 38 | # 执行sql语句 39 | self.cursor.execute(self._sql, ( 40 | item['title'], item['content'], item['author'], item['avatar'], item['pubtime'], item['article_id'], 41 | item['origin_url'])) 42 | 43 | # 插入到数据库中 44 | self.conn.commit() 45 | return item 46 | 47 | def close_spider(self, spider): 48 | # 关闭游标 49 | self.cursor.close() 50 | 51 | 52 | -------------------------------------------------------------------------------- /scrapy/jianshu_spider/jianshu_spider/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for jianshu_spider project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'jianshu_spider' 13 | 14 | SPIDER_MODULES = ['jianshu_spider.spiders'] 15 | NEWSPIDER_MODULE = 'jianshu_spider.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | # USER_AGENT = 'jianshu_spider (+http://www.yourdomain.com)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | # CONCURRENT_REQUESTS = 32 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | DOWNLOAD_DELAY = 3 30 | # The download delay setting will honor only one of: 31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 32 | # CONCURRENT_REQUESTS_PER_IP = 16 33 | 34 | # Disable cookies (enabled by default) 35 | # COOKIES_ENABLED = False 36 | 37 | # Disable Telnet Console (enabled by default) 38 | # TELNETCONSOLE_ENABLED = False 39 | 40 | # Override the default request headers: 41 | DEFAULT_REQUEST_HEADERS = { 42 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 43 | 'Accept-Language': 'en', 44 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' 45 | } 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | # SPIDER_MIDDLEWARES = { 50 | # 'jianshu_spider.middlewares.JianshuSpiderSpiderMiddleware': 543, 51 | # } 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | # DOWNLOADER_MIDDLEWARES = { 56 | # 'jianshu_spider.middlewares.JianshuSpiderDownloaderMiddleware': 543, 57 | # } 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | # EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | # } 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'jianshu_spider.pipelines.JianshuSpiderPipeline': 300, 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | # AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | # AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | # AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | # AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | # HTTPCACHE_ENABLED = True 87 | # HTTPCACHE_EXPIRATION_SECS = 0 88 | # HTTPCACHE_DIR = 'httpcache' 89 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | 92 | 93 | # 在 setting.py 文件中 设置 日志 
记录等级 94 | # LOG_LEVEL = 'DEBUG' 95 | # LOG_FILE = 'log.txt' 96 | -------------------------------------------------------------------------------- /scrapy/jianshu_spider/jianshu_spider/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /scrapy/jianshu_spider/jianshu_spider/spiders/jianshu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.linkextractors import LinkExtractor 4 | from scrapy.spiders import CrawlSpider, Rule 5 | from jianshu_spider.items import ArticleItem 6 | 7 | 8 | class JianshuSpider(CrawlSpider): 9 | name = 'jianshu' 10 | allowed_domains = ['jianshu.com'] 11 | start_urls = ['https://www.jianshu.com/'] 12 | 13 | HTTPS = "https:" 14 | 15 | rules = ( 16 | # 文章id是有12位小写字母或者数字0-9构成 17 | Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True), 18 | ) 19 | 20 | # 数据测试:scrapy shell https://www.jianshu.com/p/8d5ab6d5f258 21 | def parse_detail(self, response): 22 | title = response.xpath('//h1[@class="title"]/text()').get() 23 | 24 | author = response.xpath('//div[@class="info"]/span/a/text()').get() 25 | 26 | avatar = self.HTTPS + response.xpath('//div[@class="author"]/a/img/@src').get() 27 | 28 | pub_time = response.xpath('//span[@class="publish-time"]/text()').get().replace("*", "") 29 | 30 | current_url = response.url 31 | real_url = current_url.split(r"?")[0] 32 | 33 | article_id = real_url.split(r'/')[-1] 34 | 35 | # 保留标签的H5内容[保留格式,方便后面排版] 36 | content = response.xpath('//div[@class="show-content"]').get() 37 | 38 | item = ArticleItem( 39 | title=title, 40 | avatar=avatar, 41 | pubtime=pub_time, 42 | origin_url=current_url, 43 | author=author, 44 | article_id=article_id, 45 | content=content 46 | ) 47 | 48 | yield item 49 | -------------------------------------------------------------------------------- /scrapy/jianshu_spider/raw/article.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Navicat MySQL Data Transfer 3 | 4 | Source Server : cal 5 | Source Server Type : MySQL 6 | Source Server Version : 50724 7 | Source Host : localhost 8 | Source Database : jianshu 9 | 10 | Target Server Type : MySQL 11 | Target Server Version : 50724 12 | File Encoding : utf-8 13 | 14 | Date: 12/04/2018 23:08:42 PM 15 | */ 16 | 17 | SET NAMES utf8; 18 | SET FOREIGN_KEY_CHECKS = 0; 19 | 20 | -- ---------------------------- 21 | -- Table structure for `article` 22 | -- ---------------------------- 23 | DROP TABLE IF EXISTS `article`; 24 | CREATE TABLE `article` ( 25 | `id` int(11) NOT NULL AUTO_INCREMENT, 26 | `title` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL, 27 | `content` longtext CHARACTER SET utf8 COLLATE utf8_bin, 28 | `author` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL, 29 | `avatar` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL, 30 | `pubtime` datetime DEFAULT NULL, 31 | `article_id` varchar(20) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL, 32 | `origin_url` varchar(255) CHARACTER SET utf8 COLLATE utf8_bin DEFAULT NULL, 33 | PRIMARY KEY (`id`) 34 | ) ENGINE=InnoDB AUTO_INCREMENT=725 DEFAULT CHARSET=utf8; 35 | 36 | SET FOREIGN_KEY_CHECKS = 1; 37 | 
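The article table above is the target of JianshuSpiderPipeline (pipelines.py earlier in this project), which inserts one row per scraped article via pymysql. A minimal verification sketch, not part of the repository, assuming the same local MySQL credentials the pipeline uses:

import pymysql

# Same connection parameters as JianshuSpiderPipeline.
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='root', database='jianshu', charset='utf8')
try:
    with conn.cursor() as cursor:
        # Print the most recently inserted articles.
        cursor.execute(
            "SELECT id, title, author, pubtime, origin_url "
            "FROM article ORDER BY id DESC LIMIT 10"
        )
        for row in cursor.fetchall():
            print(row)
finally:
    conn.close()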
-------------------------------------------------------------------------------- /scrapy/jianshu_spider/raw/article_table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/scrapy/jianshu_spider/raw/article_table.png -------------------------------------------------------------------------------- /scrapy/jianshu_spider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = jianshu_spider.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = jianshu_spider 12 | -------------------------------------------------------------------------------- /scrapy/qczj/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /scrapy/qczj/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /scrapy/qczj/.idea/qczj.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /scrapy/qczj/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /scrapy/qczj/qczj/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/scrapy/qczj/qczj/__init__.py -------------------------------------------------------------------------------- /scrapy/qczj/qczj/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | # 为了方便使用Images Pipline,这里定义image_urls和images两个变量【必须】 11 | class QczjItem(scrapy.Item): 12 | category = scrapy.Field() 13 | image_urls = scrapy.Field() 14 | images = scrapy.Field() 15 | -------------------------------------------------------------------------------- /scrapy/qczj/qczj/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class QczjSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 
19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class QczjDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /scrapy/qczj/qczj/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | # 存储数据 9 | import os 10 | from urllib import request 11 | from scrapy.pipelines.images import ImagesPipeline 12 | from qczj import settings 13 | 14 | 15 | # 场景:由于系统提供的ImagesPipline不能定义子文件件目录和文件名称,这里需要自定义 16 | class CustomImagesPipline(ImagesPipeline): 17 | 18 | # 发送下载图片请求之前调用 19 | def get_media_requests(self, item, info): 20 | request_objs = super(CustomImagesPipline, self).get_media_requests(item, info) 21 | 22 | for request_obj in request_objs: 23 | request_obj.item = item 24 | 25 | # 注意:一定要返回请求对象列表 26 | return request_objs 27 | 28 | # 图片被存储之前才会被执行 29 | def file_path(self, request, response=None, info=None): 30 | path = super(CustomImagesPipline, self).file_path(request, response, info) 31 | 32 | # 获取分类 33 | category = request.item.get('category') 34 | 35 | # 实际要保存的目录下 36 | category_path = os.path.join(settings.IMAGES_STORE, category) 37 | 38 | if not os.path.exists(category_path): 39 | os.mkdir(category_path) 40 | 41 | # 图片的名称 full/%s.jpg 42 | image_name = path.replace("full/", "") 43 | 44 | # 图片要保存的完成路径【注意:这里要写相对路径,相对于:settings.IMAGES_STORE这个目录】【具体查看父类返回的路径】 45 | image_full_path = os.path.join(category, image_name) 46 | 47 | return image_full_path 48 | -------------------------------------------------------------------------------- /scrapy/qczj/qczj/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for qczj project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | import os 13 | 14 | BOT_NAME = 'qczj' 15 | 16 | SPIDER_MODULES = ['qczj.spiders'] 17 | NEWSPIDER_MODULE = 'qczj.spiders' 18 | 19 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 20 | # USER_AGENT = 'qczj (+http://www.yourdomain.com)' 21 | 22 | # Obey robots.txt rules 23 | ROBOTSTXT_OBEY = False 24 | 25 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 26 | # CONCURRENT_REQUESTS = 32 27 | 28 | # Configure a delay for requests for the same website (default: 0) 29 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 30 | # See also autothrottle settings and docs 31 | DOWNLOAD_DELAY = 1 32 | # The download delay setting will honor only one of: 33 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 34 | # CONCURRENT_REQUESTS_PER_IP = 16 35 | 36 | # Disable cookies (enabled by default) 37 | # COOKIES_ENABLED = False 38 | 39 | # Disable Telnet Console (enabled by default) 40 | # TELNETCONSOLE_ENABLED = False 41 | 42 | # Override the default request headers: 43 | DEFAULT_REQUEST_HEADERS = { 44 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 45 | 'Accept-Language': 'en', 46 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' 47 | } 48 | 49 | # Enable or disable spider middlewares 50 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 51 | # SPIDER_MIDDLEWARES = { 52 | # 'qczj.middlewares.QczjSpiderMiddleware': 543, 53 | # } 54 | 55 | # Enable or disable downloader middlewares 56 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 57 | # DOWNLOADER_MIDDLEWARES = { 58 | # 'qczj.middlewares.QczjDownloaderMiddleware': 543, 59 | # } 60 | 61 | # Enable or disable extensions 62 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 63 | # EXTENSIONS = { 64 | # 'scrapy.extensions.telnet.TelnetConsole': None, 65 | # } 66 | 67 | # Configure item pipelines 68 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 69 | ITEM_PIPELINES = { 70 | 'qczj.pipelines.CustomImagesPipline': 1 71 | } 72 | 73 | # Enable and configure the AutoThrottle extension (disabled by default) 74 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 75 | # AUTOTHROTTLE_ENABLED = True 76 | # The initial download delay 77 | # AUTOTHROTTLE_START_DELAY = 5 78 | # The maximum download delay to be set in case of high latencies 79 | # AUTOTHROTTLE_MAX_DELAY = 60 80 | # The average number of requests Scrapy should be sending in parallel to 81 | # each remote server 82 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 83 | # Enable showing throttling stats for every response received: 84 | # AUTOTHROTTLE_DEBUG = False 85 | 86 | # Enable and configure HTTP caching (disabled by default) 87 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 88 | # HTTPCACHE_ENABLED = True 89 | # HTTPCACHE_EXPIRATION_SECS = 0 90 | # HTTPCACHE_DIR = 'httpcache' 91 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 92 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 93 | 94 | 95 | # 图片下载路径,供Images pipline使用 96 | IMAGES_STORE = 
os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images') 97 | -------------------------------------------------------------------------------- /scrapy/qczj/qczj/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /scrapy/qczj/qczj/spiders/bmw5.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.spiders import CrawlSpider, Rule 4 | from scrapy.linkextractors import LinkExtractor 5 | 6 | from qczj.items import QczjItem 7 | 8 | 9 | # 爬取汽车之家宝马5系的数据,下载原图 10 | 11 | class Bmw5Spider(CrawlSpider): 12 | name = 'bmw5' 13 | allowed_domains = ['car.autohome.com.cn'] 14 | 15 | # 宝马5系(进口)汽车系列图地址 16 | start_urls = ['https://car.autohome.com.cn/pic/series/202.html'] 17 | 18 | rules = { 19 | # follow=True:接下来解析第二页、第三页、、、 20 | Rule(LinkExtractor(allow=r'https://car.autohome.com.cn/pic/series/202-.+'), callback="parse_page", follow=True) 21 | } 22 | 23 | def parse_page(self, response): 24 | """ 25 | 解析满足rules的url【更多图片页面】 https://car.autohome.com.cn/pic/series/202-1-p1.html 26 | :param response: 27 | :return: 28 | """ 29 | # 1.获取类别【可以通过scrapy shell url局部测试,不需要运行整个项目】 30 | category = response.xpath('//div[@class="uibox"]/div[1]/text()').get() 31 | 32 | # 2.图片 33 | # 注意:xpath 包含语法【同样可以通过scrapy shell来局部测试正确性】 34 | srcs = response.xpath('//div[contains(@class,"uibox-con")]//li//img/@src').getall() 35 | 36 | # 3.1 对缩略图的地址补全 37 | # 3.2 转换缩略图的url为高清图片的url 38 | srcs = list(map(lambda x: response.urljoin(x).replace("t_", ""), srcs)) 39 | 40 | item = QczjItem(category=category, image_urls=srcs) 41 | 42 | print("爬完页面:%s,类别:%s" % (response.url, category)) 43 | 44 | yield item 45 | -------------------------------------------------------------------------------- /scrapy/qczj/readme.MD: -------------------------------------------------------------------------------- 1 | # 爬取汽车之家的图片【宝马5系车】 2 | ### 创建一个爬虫 3 | ``` 4 | scrapy genspider bmw5 "car.autohome.com.cn" 5 | ``` 6 | 7 | -------------------------------------------------------------------------------- /scrapy/qczj/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = qczj.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = qczj 12 | -------------------------------------------------------------------------------- /scrapy/qczj/start.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: start.py 12 | @time: 11/15/18 21:04 13 | @description:方便执行 Python 文件【执行一个 Python 文件】 14 | """ 15 | from scrapy import cmdline 16 | 17 | cmdline.execute('scrapy crawl bmw5'.split()) 18 | -------------------------------------------------------------------------------- /scrapy/qsbk/.idea/misc.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /scrapy/qsbk/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /scrapy/qsbk/.idea/qsbk.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /scrapy/qsbk/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /scrapy/qsbk/qsbk/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/scrapy/qsbk/qsbk/__init__.py -------------------------------------------------------------------------------- /scrapy/qsbk/qsbk/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | # 作用:定义数据模型 9 | 10 | import scrapy 11 | 12 | 13 | class QsbkItem(scrapy.Item): 14 | # define the fields for your item here like: 15 | # name = scrapy.Field() 16 | """ 17 | 定义数据模型 18 | """ 19 | # 段子作者 20 | author = scrapy.Field() 21 | 22 | # 段子内容 23 | content = scrapy.Field() 24 | -------------------------------------------------------------------------------- /scrapy/qsbk/qsbk/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | # 作用:定义中间件,包含下载器中间件、爬虫中间件 9 | 10 | from scrapy import signals 11 | 12 | 13 | class QsbkSpiderMiddleware(object): 14 | # Not all methods need to be defined. If a method is not defined, 15 | # scrapy acts as if the spider middleware does not modify the 16 | # passed objects. 17 | 18 | @classmethod 19 | def from_crawler(cls, crawler): 20 | # This method is used by Scrapy to create your spiders. 21 | s = cls() 22 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 23 | return s 24 | 25 | def process_spider_input(self, response, spider): 26 | # Called for each response that goes through the spider 27 | # middleware and into the spider. 28 | 29 | # Should return None or raise an exception. 30 | return None 31 | 32 | def process_spider_output(self, response, result, spider): 33 | # Called with the results returned from the Spider, after 34 | # it has processed the response. 35 | 36 | # Must return an iterable of Request, dict or Item objects. 37 | for i in result: 38 | yield i 39 | 40 | def process_spider_exception(self, response, exception, spider): 41 | # Called when a spider or process_spider_input() method 42 | # (from other spider middleware) raises an exception. 43 | 44 | # Should return either None or an iterable of Response, dict 45 | # or Item objects. 
46 | pass 47 | 48 | def process_start_requests(self, start_requests, spider): 49 | # Called with the start requests of the spider, and works 50 | # similarly to the process_spider_output() method, except 51 | # that it doesn’t have a response associated. 52 | 53 | # Must return only requests (not items). 54 | for r in start_requests: 55 | yield r 56 | 57 | def spider_opened(self, spider): 58 | spider.logger.info('Spider opened: %s' % spider.name) 59 | 60 | 61 | class QsbkDownloaderMiddleware(object): 62 | # Not all methods need to be defined. If a method is not defined, 63 | # scrapy acts as if the downloader middleware does not modify the 64 | # passed objects. 65 | 66 | @classmethod 67 | def from_crawler(cls, crawler): 68 | # This method is used by Scrapy to create your spiders. 69 | s = cls() 70 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 71 | return s 72 | 73 | def process_request(self, request, spider): 74 | # Called for each request that goes through the downloader 75 | # middleware. 76 | 77 | # Must either: 78 | # - return None: continue processing this request 79 | # - or return a Response object 80 | # - or return a Request object 81 | # - or raise IgnoreRequest: process_exception() methods of 82 | # installed downloader middleware will be called 83 | return None 84 | 85 | def process_response(self, request, response, spider): 86 | # Called with the response returned from the downloader. 87 | 88 | # Must either; 89 | # - return a Response object 90 | # - return a Request object 91 | # - or raise IgnoreRequest 92 | return response 93 | 94 | def process_exception(self, request, exception, spider): 95 | # Called when a download handler or a process_request() 96 | # (from other downloader middleware) raises an exception. 
97 | 98 | # Must either: 99 | # - return None: continue processing this exception 100 | # - return a Response object: stops process_exception() chain 101 | # - return a Request object: stops process_exception() chain 102 | pass 103 | 104 | def spider_opened(self, spider): 105 | spider.logger.info('Spider opened: %s' % spider.name) 106 | -------------------------------------------------------------------------------- /scrapy/qsbk/qsbk/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | # 作用:保存数据【Json】【Xml、CSV类似,详情查看 exporters 类】 9 | 10 | import json 11 | from .items import QsbkItem 12 | 13 | from scrapy.exporters import JsonLinesItemExporter 14 | 15 | class QsbkPipeline(object): 16 | 17 | def __init__(self): 18 | # JsonLinesItemExporter 必须要以二进制的方式打开 19 | # 注意:以二进制的方式打开写入,不需要指定编码格式;以字符串的形式打开写入,就需要指定编码格式 20 | self.fp = open('duanzi.json', 'wb') 21 | 22 | # 定义一个 exporters 23 | self.exporter = JsonLinesItemExporter(self.fp,ensure_ascii=False,encoding='utf-8') 24 | 25 | def open_spider(self, spider): 26 | print('爬虫开始了...') 27 | 28 | def process_item(self, item, spider): 29 | self.exporter.export_item(item) 30 | return item 31 | 32 | def close_spider(self, spider): 33 | self.fp.close() 34 | print('爬虫结束了。') 35 | -------------------------------------------------------------------------------- /scrapy/qsbk/qsbk/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for qsbk project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | # 作用:爬虫配置文件 13 | # 比如:配置请求头、是否开启 Cookie、下载之前是否延迟 14 | 15 | BOT_NAME = 'qsbk' 16 | 17 | SPIDER_MODULES = ['qsbk.spiders'] 18 | NEWSPIDER_MODULE = 'qsbk.spiders' 19 | 20 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 21 | # USER_AGENT = 'qsbk (+http://www.yourdomain.com)' 22 | 23 | # Obey robots.txt rules 24 | ROBOTSTXT_OBEY = False 25 | 26 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 27 | # CONCURRENT_REQUESTS = 32 28 | 29 | # Configure a delay for requests for the same website (default: 0) 30 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 31 | # See also autothrottle settings and docs 32 | 33 | # 下载延迟 34 | # 1 秒钟停 1 次 35 | DOWNLOAD_DELAY = 1 36 | # The download delay setting will honor only one of: 37 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 38 | # CONCURRENT_REQUESTS_PER_IP = 16 39 | 40 | # Disable cookies (enabled by default) 41 | # COOKIES_ENABLED = False 42 | 43 | # Disable Telnet Console (enabled by default) 44 | # TELNETCONSOLE_ENABLED = False 45 | 46 | # Override the default request headers: 47 | DEFAULT_REQUEST_HEADERS = { 48 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 49 | 'Accept-Language': 'en', 50 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36', 51 | } 52 | 53 | # Enable or disable spider middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 55 | # SPIDER_MIDDLEWARES = { 56 | # 'qsbk.middlewares.QsbkSpiderMiddleware': 543, 57 | # } 58 | 59 | # Enable or disable downloader middlewares 60 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 61 | # DOWNLOADER_MIDDLEWARES = { 62 | # 'qsbk.middlewares.QsbkDownloaderMiddleware': 543, 63 | # } 64 | 65 | # Enable or disable extensions 66 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 67 | # EXTENSIONS = { 68 | # 'scrapy.extensions.telnet.TelnetConsole': None, 69 | # } 70 | 71 | # Configure item pipelines 72 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 73 | # 'qsbk.pipelines.QsbkPipeline':Key;300:优先级。值越小,优先级越高。 74 | ITEM_PIPELINES = { 75 | 'qsbk.pipelines.QsbkPipeline': 300, 76 | } 77 | 78 | # Enable and configure the AutoThrottle extension (disabled by default) 79 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 80 | # AUTOTHROTTLE_ENABLED = True 81 | # The initial download delay 82 | # AUTOTHROTTLE_START_DELAY = 5 83 | # The maximum download delay to be set in case of high latencies 84 | # AUTOTHROTTLE_MAX_DELAY = 60 85 | # The average number of requests Scrapy should be sending in parallel to 86 | # each remote server 87 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 88 | # Enable showing throttling stats for every response received: 89 | # AUTOTHROTTLE_DEBUG = False 90 | 91 | # Enable and configure HTTP caching (disabled by default) 92 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 93 | # HTTPCACHE_ENABLED = True 94 | # HTTPCACHE_EXPIRATION_SECS = 0 95 | # HTTPCACHE_DIR = 'httpcache' 96 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 97 | # HTTPCACHE_STORAGE = 
'scrapy.extensions.httpcache.FilesystemCacheStorage' 98 | -------------------------------------------------------------------------------- /scrapy/qsbk/qsbk/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /scrapy/qsbk/qsbk/spiders/spider_qsbk.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from qsbk.items import QsbkItem 4 | from scrapy.http.response.html import HtmlResponse 5 | from scrapy.selector.unified import SelectorList, Selector 6 | 7 | 8 | # 使用 scrapy 爬取糗事百科 9 | 10 | class SpiderQsbkSpider(scrapy.Spider): 11 | name = 'spider_qsbk' 12 | allowed_domains = ['qiushibaike.com'] 13 | start_urls = ['https://www.qiushibaike.com/text/page/1/'] 14 | base_domain = "https://www.qiushibaike.com" 15 | 16 | def parse(self, response): 17 | """ 18 | 对 Download 下载回来的数据进行解释 19 | :param response: HtmlResponse 20 | :return: 21 | """ 22 | 23 | # 1.利用 Xpath 获取所有的段子【divs】 24 | duan_zi_divs = response.xpath('//div[@id="content-left"]/div') 25 | 26 | # items = [] 27 | 28 | # 2.遍历出段子进行解析 29 | for duan_zi_div in duan_zi_divs: 30 | # 2.1 获取作者 31 | author = duan_zi_div.xpath(".//h2/text()").get().strip() 32 | 33 | # 2.2 获取段子内容 34 | content_pre = duan_zi_div.xpath(".//div[@class='content']//text()").getall() # 列表 35 | content = "".join(content_pre).strip() 36 | 37 | # 2.3 组装成一个数据模型 38 | item = QsbkItem(author=author, content=content) 39 | 40 | # 2.4 以生成器的方式传给 piplines 管道处理 41 | yield item 42 | 43 | # 查找下一页的链接地址 44 | next_url = None 45 | try: 46 | next_url = self.base_domain + response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get() 47 | except: 48 | pass 49 | 50 | # 如果找不到下一页【最后一页】,就直接返回 51 | if not next_url: 52 | return 53 | else: 54 | # 执行下一页 55 | yield scrapy.Request(next_url, callback=self.parse) 56 | -------------------------------------------------------------------------------- /scrapy/qsbk/readme.MD: -------------------------------------------------------------------------------- 1 | # 使用 `Scrapy` 来爬取糗事百科 2 | 3 | 1. 修改 `settings.py` 配置文件 4 | 5 | ``` 6 | # 1.修改 ROBOTSTXT_OBEY 为 False 7 | ROBOTSTXT_OBEY = False 8 | 9 | # 2.放开请求头的设置 10 | DEFAULT_REQUEST_HEADERS = { 11 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 12 | 'Accept-Language': 'en', 13 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36', 14 | } 15 | 16 | # 3.放开 PipLine 便于保存数据 17 | # 'qsbk.pipelines.QsbkPipeline':Key;300:优先级;值越小,优先级越高。 18 | ITEM_PIPELINES = { 19 | 'qsbk.pipelines.QsbkPipeline': 300, 20 | } 21 | ``` 22 | 23 | 2. 编写爬虫代码 - `spiders/spider_xx.py` 24 | 25 | 对 `Download` 下载后的数据,利用 `xpath` 进行解释,然后通过生成器传给 `PipLine` 26 | 27 | 3. 编写数据模型 28 | 29 | 定义数据模型,便于管理 30 | 31 | 4. 
编写 `Pipline` 管道 32 | 33 | 编写保存数据的代码 34 | 35 | 注意:需要在 `settings.py` 文件中激活 `Pipline` 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /scrapy/qsbk/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = qsbk.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = qsbk 12 | -------------------------------------------------------------------------------- /scrapy/sfw_spider/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /scrapy/sfw_spider/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /scrapy/sfw_spider/.idea/sfw.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /scrapy/sfw_spider/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /scrapy/sfw_spider/requirements.txt: -------------------------------------------------------------------------------- 1 | aiohttp==3.4.4 2 | asn1crypto==0.24.0 3 | astroid==2.0.4 4 | async-timeout==3.0.1 5 | attrs==18.2.0 6 | Automat==0.7.0 7 | certifi==2018.10.15 8 | cffi==1.11.5 9 | chardet==3.0.4 10 | constantly==15.1.0 11 | cryptography==2.4.1 12 | cssselect==1.0.3 13 | dateparser==0.7.0 14 | demjson==2.2.4 15 | douyin==0.3.6 16 | hyperlink==18.0.0 17 | idna==2.7 18 | incremental==17.5.0 19 | isort==4.3.4 20 | lazy-object-proxy==1.3.1 21 | lxml==4.2.5 22 | mccabe==0.6.1 23 | motor==2.0.0 24 | multidict==4.5.2 25 | parsel==1.5.1 26 | Pillow==5.3.0 27 | pyasn1==0.4.4 28 | pyasn1-modules==0.2.2 29 | pycparser==2.19 30 | PyDispatcher==2.0.5 31 | PyHamcrest==1.9.0 32 | pylint==2.1.1 33 | pymongo==3.7.2 34 | PyMySQL==0.9.2 35 | pyOpenSSL==18.0.0 36 | python-dateutil==2.7.5 37 | pytz==2018.7 38 | queuelib==1.5.0 39 | regex==2018.11.22 40 | requests==2.19.1 41 | retrying==1.3.3 42 | Scrapy==1.5.1 43 | selenium==3.14.1 44 | service-identity==17.0.0 45 | six==1.11.0 46 | tqdm==4.28.1 47 | Twisted==18.9.0 48 | tzlocal==1.5.1 49 | urllib3==1.23 50 | w3lib==1.19.0 51 | wrapt==1.10.11 52 | yarl==1.2.6 53 | zope.interface==4.6.0 54 | -------------------------------------------------------------------------------- /scrapy/sfw_spider/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = sfw.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = sfw 12 | -------------------------------------------------------------------------------- /scrapy/sfw_spider/sfw/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/scrapy/sfw_spider/sfw/__init__.py -------------------------------------------------------------------------------- /scrapy/sfw_spider/sfw/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class NewHouseItem(scrapy.Item): 12 | """ 13 | 新房的数据模型【10个属性:省份、城市、小区名称、价格、几居室、面积、地址、区域、是否在售、详情页面ull】 14 | """ 15 | # 省份 16 | province = scrapy.Field() 17 | # 城市 18 | city = scrapy.Field() 19 | # 小区名称 20 | name = scrapy.Field() 21 | # 价格 22 | price = scrapy.Field() 23 | # 几居室【列表】【新房可能有多个房型】 24 | rooms = scrapy.Field() 25 | # 面积 26 | area = scrapy.Field() 27 | # 地址 28 | address = scrapy.Field() 29 | # 行政区 30 | district = scrapy.Field() 31 | # 是否在售 32 | sale = scrapy.Field() 33 | # 详情页面url 34 | origin_url = scrapy.Field() 35 | 36 | 37 | class ESFHouseItem(scrapy.Item): 38 | """ 39 | 二手房数据模型【12个属性:省份、城市、小区名称、几室几厅、楼层、朝向、年代、地址、建筑面积、总价、单价、详情页面URL】 40 | """ 41 | # 省份 42 | province = scrapy.Field() 43 | # 城市 44 | city = scrapy.Field() 45 | # 小区名称 46 | name = scrapy.Field() 47 | # 几室几厅 48 | rooms = scrapy.Field() 49 | # 楼层 50 | floor = scrapy.Field() 51 | # 朝向 52 | toward = scrapy.Field() 53 | # 年代 54 | year = scrapy.Field() 55 | # 地址 56 | address = scrapy.Field() 57 | # 建筑面积 58 | area = scrapy.Field() 59 | # 总价 60 | price = scrapy.Field() 61 | # 单价 62 | unit = scrapy.Field() 63 | # 详情页面url 64 | origin_url = scrapy.Field() 65 | -------------------------------------------------------------------------------- /scrapy/sfw_spider/sfw/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class SfwSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 
44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class SfwDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | 105 | 106 | # =================================================================== 107 | import random 108 | 109 | 110 | # 随机请求头 111 | # 自定义一个下载器中间件【Download Middlewares】【请求头】 112 | # 所有请求头可以参考:http://www.useragentstring.com/pages/useragentstring.php?typ=Browser 113 | class UserAgentDownloaderMiddleware(object): 114 | USER_AGENTS = [ 115 | 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50', 116 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36', 117 | 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;', 118 | 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)', 119 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv,2.0.1) Gecko/20100101 Firefox/4.0.1', 120 | 'Mozilla/5.0 (Windows NT 6.1; rv,2.0.1) Gecko/20100101 Firefox/4.0.1' 121 | ] 122 | 123 | def process_request(self, request, spider): 124 | # 随机拿到一个请求头 125 | user_agent = random.choice(self.USER_AGENTS) 126 | 127 | # 设置到request 128 | request.headers['User-Agent'] = user_agent 129 | 130 | request.headers['Location'] = None 131 | -------------------------------------------------------------------------------- /scrapy/sfw_spider/sfw/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | from scrapy.exporters import JsonLinesItemExporter 9 | from .items import NewHouseItem, ESFHouseItem 10 | 11 | 12 | class SfwPipeline(object): 13 | 14 | def __init__(self): 15 | self.fp_new_house = open('new_house.json', 'wb') 16 | self.fp_esf_house = open('esf_house.json', 'wb') 17 | 18 | self.exporter_new_house = JsonLinesItemExporter(self.fp_new_house, ensure_ascii=False) 19 | self.exporter_esf_house = JsonLinesItemExporter(self.fp_esf_house, ensure_ascii=False) 20 | 21 | def process_item(self, item, spider): 22 | if isinstance(item, NewHouseItem): 23 | print('写入一条新手房数据') 24 | self.exporter_new_house.export_item(item) 25 | else: 26 | print('写入一条二手房数据') 27 | self.exporter_esf_house.export_item(item) 28 | return item 29 | 30 | def close_spider(self, spider): 31 | self.fp_new_house.close() 32 | self.fp_esf_house.close() 33 | -------------------------------------------------------------------------------- /scrapy/sfw_spider/sfw/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for sfw project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'sfw' 13 | 14 | SPIDER_MODULES = ['sfw.spiders'] 15 | NEWSPIDER_MODULE = 'sfw.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | # USER_AGENT = 'sfw (+http://www.yourdomain.com)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | # CONCURRENT_REQUESTS = 32 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | DOWNLOAD_DELAY = 3 30 | # The download delay setting will honor only one of: 31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 32 | # CONCURRENT_REQUESTS_PER_IP = 16 33 | 34 | # Disable cookies (enabled by default) 35 | # COOKIES_ENABLED = False 36 | 37 | # Disable Telnet Console (enabled by default) 38 | # TELNETCONSOLE_ENABLED = False 39 | 40 | # Override the default request headers: 41 | DEFAULT_REQUEST_HEADERS = { 42 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 43 | 'Accept-Language': 'en', 44 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' 45 | } 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | # SPIDER_MIDDLEWARES = { 50 | # 'sfw.middlewares.SfwSpiderMiddleware': 543, 51 | # } 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | DOWNLOADER_MIDDLEWARES = { 56 | 'sfw.middlewares.SfwDownloaderMiddleware': 543, 57 | 'sfw.middlewares.UserAgentDownloaderMiddleware': 500, 58 | } 59 | 60 | # Enable or disable extensions 61 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 62 | # EXTENSIONS = { 63 | # 'scrapy.extensions.telnet.TelnetConsole': None, 64 | # } 65 | 66 | # Configure item pipelines 67 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 68 | ITEM_PIPELINES = { 69 | 'sfw.pipelines.SfwPipeline': 300, 70 | } 71 | 72 | # Enable and configure the AutoThrottle extension (disabled by default) 73 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 74 | # AUTOTHROTTLE_ENABLED = True 75 | # The initial download delay 76 | # AUTOTHROTTLE_START_DELAY = 5 77 | # The maximum download delay to be set in case of high latencies 78 | # AUTOTHROTTLE_MAX_DELAY = 60 79 | # The average number of requests Scrapy should be sending in parallel to 80 | # each remote server 81 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 82 | # Enable showing throttling stats for every response received: 83 | # AUTOTHROTTLE_DEBUG = False 84 | 85 | # Enable and configure HTTP caching (disabled by default) 86 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 87 | # HTTPCACHE_ENABLED = True 88 | # HTTPCACHE_EXPIRATION_SECS = 0 89 | # HTTPCACHE_DIR = 'httpcache' 90 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 91 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 92 | 93 | -------------------------------------------------------------------------------- 
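The settings above activate two downloader middlewares for the sfw project: the template `SfwDownloaderMiddleware` (priority 543) and the custom `UserAgentDownloaderMiddleware` (priority 500) from middlewares.py, which picks a random `User-Agent` for every outgoing request. A minimal sketch for exercising that middleware by hand, outside a full crawl (assumptions: it is run from the sfw_spider project root so `sfw` is importable, and the URL is only a placeholder):

```
# Drive UserAgentDownloaderMiddleware directly with a bare Request and check
# that a User-Agent header gets stamped onto it (placeholder URL).
from scrapy.http import Request
from sfw.middlewares import UserAgentDownloaderMiddleware

mw = UserAgentDownloaderMiddleware()
req = Request(url='https://example.com')  # placeholder target
mw.process_request(req, spider=None)      # spider is not used by this middleware
print(req.headers.get('User-Agent'))      # one of the six UA strings, as bytes
```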
/scrapy/sfw_spider/sfw/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /scrapy/sfw_spider/start.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: start.py 12 | @time: 11/15/18 21:04 13 | @description: Traceback 14 | """ 15 | from scrapy import cmdline 16 | 17 | cmdline.execute('scrapy crawl sfw_spider'.split()) 18 | 19 | -------------------------------------------------------------------------------- /scrapy/weixin_community/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | -------------------------------------------------------------------------------- /scrapy/weixin_community/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /scrapy/weixin_community/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /scrapy/weixin_community/.idea/weixin_community.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /scrapy/weixin_community/readme.MD: -------------------------------------------------------------------------------- 1 | # 使用 `CrawlSpider` 爬取微信小程序论坛 2 | 1. 创建一个项目 3 | 4 | ``` 5 | scrapy startproject weixin_community 6 | ``` 7 | 8 | 2. 创建一个爬虫 9 | 10 | ``` 11 | # 先进入文件夹中 12 | cd weixin_community 13 | 14 | # 创建一个爬虫 15 | scrapy genspider -t crawl wx_spider "wxapp-union.com" 16 | ``` 17 | 18 | 3. 使用 `Pycharm` 打开项目 19 | 20 | 4. 设置 `setting.py` 文件 21 | 22 | ``` 23 | ROBOTSTXT_OBEY = False 24 | 25 | DOWNLOAD_DELAY = 1 26 | 27 | DEFAULT_REQUEST_HEADERS = { 28 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 29 | 'Accept-Language': 'en', 30 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' 31 | } 32 | 33 | ITEM_PIPELINES = { 34 | 'weixin_community.pipelines.WeixinCommunityPipeline': 300, 35 | } 36 | ``` 37 | 38 | 5. 编写爬虫 39 | 40 | 6. 编写数据模型 41 | 42 | 7. 编写 `Pipline` 管道 43 | 44 | 8. 
运行测试 45 | 46 | 47 | -------------------------------------------------------------------------------- /scrapy/weixin_community/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = weixin_community.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = weixin_community 12 | -------------------------------------------------------------------------------- /scrapy/weixin_community/weixin_community/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/scrapy/weixin_community/weixin_community/__init__.py -------------------------------------------------------------------------------- /scrapy/weixin_community/weixin_community/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your scraped items 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/items.html 7 | 8 | import scrapy 9 | 10 | 11 | class WeixinCommunityItem(scrapy.Item): 12 | title = scrapy.Field() 13 | author = scrapy.Field() 14 | pub_time = scrapy.Field() 15 | content = scrapy.Field() 16 | -------------------------------------------------------------------------------- /scrapy/weixin_community/weixin_community/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class WeixinCommunitySpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class WeixinCommunityDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /scrapy/weixin_community/weixin_community/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define your item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | from scrapy.exporters import JsonLinesItemExporter,JsonItemExporter 9 | 10 | 11 | # 由于数据量相比比较大,这里使用:JsonLinesItemExporter 12 | 13 | class WeixinCommunityPipeline(object): 14 | 15 | def __init__(self): 16 | self.fp = open('wxjc.json', 'wb') 17 | self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8') 18 | 19 | def process_item(self, item, spider): 20 | # 获取一条item,就写入一条数据到文件中 21 | self.exporter.export_item(item) 22 | return item 23 | 24 | def close_spider(self, spider): 25 | self.fp.close() 26 | 27 | -------------------------------------------------------------------------------- /scrapy/weixin_community/weixin_community/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for weixin_community project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. 
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'weixin_community' 13 | 14 | SPIDER_MODULES = ['weixin_community.spiders'] 15 | NEWSPIDER_MODULE = 'weixin_community.spiders' 16 | 17 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 18 | # USER_AGENT = 'weixin_community (+http://www.yourdomain.com)' 19 | 20 | # Obey robots.txt rules 21 | ROBOTSTXT_OBEY = False 22 | 23 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 24 | # CONCURRENT_REQUESTS = 32 25 | 26 | # Configure a delay for requests for the same website (default: 0) 27 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 28 | # See also autothrottle settings and docs 29 | DOWNLOAD_DELAY = 1 30 | # The download delay setting will honor only one of: 31 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 32 | # CONCURRENT_REQUESTS_PER_IP = 16 33 | 34 | # Disable cookies (enabled by default) 35 | # COOKIES_ENABLED = False 36 | 37 | # Disable Telnet Console (enabled by default) 38 | # TELNETCONSOLE_ENABLED = False 39 | 40 | # Override the default request headers: 41 | DEFAULT_REQUEST_HEADERS = { 42 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 43 | 'Accept-Language': 'en', 44 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' 45 | } 46 | 47 | # Enable or disable spider middlewares 48 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 49 | # SPIDER_MIDDLEWARES = { 50 | # 'weixin_community.middlewares.WeixinCommunitySpiderMiddleware': 543, 51 | # } 52 | 53 | # Enable or disable downloader middlewares 54 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 55 | # DOWNLOADER_MIDDLEWARES = { 56 | # 'weixin_community.middlewares.WeixinCommunityDownloaderMiddleware': 543, 57 | # } 58 | 59 | # Enable or disable extensions 60 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 61 | # EXTENSIONS = { 62 | # 'scrapy.extensions.telnet.TelnetConsole': None, 63 | # } 64 | 65 | # Configure item pipelines 66 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 67 | ITEM_PIPELINES = { 68 | 'weixin_community.pipelines.WeixinCommunityPipeline': 300 69 | } 70 | 71 | # Enable and configure the AutoThrottle extension (disabled by default) 72 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 73 | # AUTOTHROTTLE_ENABLED = True 74 | # The initial download delay 75 | # AUTOTHROTTLE_START_DELAY = 5 76 | # The maximum download delay to be set in case of high latencies 77 | # AUTOTHROTTLE_MAX_DELAY = 60 78 | # The average number of requests Scrapy should be sending in parallel to 79 | # each remote server 80 | # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 81 | # Enable showing throttling stats for every response received: 82 | # AUTOTHROTTLE_DEBUG = False 83 | 84 | # Enable and configure HTTP caching (disabled by default) 85 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 86 | # HTTPCACHE_ENABLED = True 87 | # HTTPCACHE_EXPIRATION_SECS = 0 88 | # HTTPCACHE_DIR = 'httpcache' 89 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 90 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 91 | 
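# How to run it (a sketch; this project ships no start.py of its own): the same
# runner pattern used in scrapy/sfw_spider/start.py also covers step 8 of the
# readme above ("运行测试", i.e. do a test crawl):
#
#   from scrapy import cmdline
#   cmdline.execute('scrapy crawl wx_spider'.split())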
-------------------------------------------------------------------------------- /scrapy/weixin_community/weixin_community/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /scrapy/weixin_community/weixin_community/spiders/wx_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | from scrapy.linkextractors import LinkExtractor 4 | from scrapy.spiders import CrawlSpider, Rule 5 | from weixin_community.items import WeixinCommunityItem 6 | 7 | 8 | class WxSpiderSpider(CrawlSpider): 9 | name = 'wx_spider' 10 | allowed_domains = ['wxapp-union.com'] 11 | # 起始页从第 1 页开始 12 | start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1'] 13 | 14 | # 定义规则 15 | rules = ( 16 | # 列表【页面】 17 | Rule(LinkExtractor(allow=r'.+mod=list&catid=2&page=\d'), follow=True), 18 | 19 | # 详情【页面】 20 | Rule(LinkExtractor(allow=r'article-.+\.html'), callback='parse_detail', follow=False) 21 | ) 22 | 23 | 24 | def parse_detail(self, response): 25 | # 标题 26 | title = response.xpath('//h1[@class="ph"]/text()').get() 27 | 28 | # p 标签元素 29 | author_element_p = response.xpath('//p[@class="authors"]') 30 | 31 | # 作者 32 | author = author_element_p.xpath('./a/text()').get() 33 | 34 | # 发布时间 35 | pub_time = author_element_p.xpath('./span/text()').get() 36 | 37 | # 内容 38 | content_pre = response.xpath('//td[@id="article_content"]//text()').getall() 39 | 40 | content = "".join(content_pre).strip() 41 | 42 | # 把解析完的数据交个 Pipline 去处理 43 | yield WeixinCommunityItem(title=title, author=author, pub_time=pub_time, content=content) 44 | -------------------------------------------------------------------------------- /spiders/film_xinpianchang/models.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: models.py 12 | @time: 12/15/18 23:08 13 | @description:数据模型 14 | """ 15 | 16 | from datetime import datetime 17 | from mongoengine import StringField, URLField, IntField, Document, connect 18 | 19 | __author__ = 'xag' 20 | 21 | response = connect('admin', host='localhost', port=27017, username='root', password='xag') 22 | 23 | 24 | class FilmModel(Document): 25 | """ 26 | 电影【模型】 27 | """ 28 | title = StringField() # 电影标题 29 | type = StringField() # 电影类型 30 | play_num = StringField() # 播放量 31 | like_num = StringField() # 喜欢数 32 | img_cover = URLField() # 封面地址 33 | play_address = URLField() # 播放地址 34 | download_address = URLField() # 下载地址 35 | -------------------------------------------------------------------------------- /spiders/film_xinpianchang/tools_file.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: tools_file.py 12 | @time: 1/29/19 16:29 13 | @description:文件夹工具类 14 | """ 15 | import os 16 | 17 | 18 | def 
mkdir(path): 19 | """ 20 | 新建一个目录 21 | :param path:完整路径 22 | :return: 23 | """ 24 | if not os.path.exists(path): 25 | os.makedirs(path) 26 | 27 | return path 28 | -------------------------------------------------------------------------------- /spiders/film_xinpianchang/tools_string.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: tools_string.py 12 | @time: 1/28/19 23:50 13 | @description:TODO 14 | """ 15 | 16 | import random 17 | import string 18 | 19 | 20 | def remove_space(str): 21 | return ''.join(str.split(' ')).replace("\t", '').replace("\n", '') 22 | 23 | 24 | def make_random_string(num): 25 | """ 26 | 生成随机字符串 27 | :param num: 28 | :return: 29 | """ 30 | return ''.join(random.sample(string.ascii_letters + string.digits, num)) 31 | -------------------------------------------------------------------------------- /spiders/spider_bai_si_bu_de_jie.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: spider_bai_si_bu_de_jie.py 12 | @time: 2018/9/25 19:58 13 | @description:利用多线程爬取【百思不得姐】网站的文字和图片并下载到csv文件中 14 | """ 15 | 16 | import requests 17 | from lxml import etree 18 | import threading 19 | from queue import Queue 20 | import time 21 | import csv 22 | from urllib import request 23 | import fileutils 24 | 25 | HEADERS = { 26 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', 27 | 'Referer': 'http://www.budejie.com/hot/1' 28 | } 29 | 30 | 31 | class BSSpider(threading.Thread): 32 | """ 33 | 爬取每一页的数据 34 | """ 35 | 36 | def __init__(self, page_queue, joke_queue, name, *args, **kwargs): 37 | super(BSSpider, self).__init__(*args, **kwargs) 38 | 39 | # 1.初始化数据 40 | self.page_queue = page_queue 41 | self.joke_queue = joke_queue 42 | self.name = name 43 | 44 | def run(self): 45 | while True: 46 | # 2.如果页面队列为空,就退出循环 47 | if self.page_queue.empty(): 48 | print(self.name + '任务完成~') 49 | # while not self.joke_queue.empty(): 50 | # print(self.joke_queue.get()) 51 | break 52 | 53 | # 3.从队列中获取页面地址 54 | page_url = self.page_queue.get() 55 | self.spider_page(page_url) 56 | 57 | # 6.休眠0.5秒 58 | time.sleep(0.5) 59 | 60 | def spider_page(self, page_url): 61 | """ 62 | 爬取一页的数据 63 | :param page_url:页面的url 64 | :return: 65 | """ 66 | response = requests.get(page_url, headers=HEADERS) 67 | text_raw = response.text 68 | html_element = etree.HTML(text_raw) 69 | 70 | # 4.利用xpath去解析数据 71 | div_elements = html_element.xpath('//div[@class="j-r-list"]') 72 | 73 | for div_element in div_elements: 74 | duan_zi_elments = div_element.xpath('./ul/li') 75 | for duan_zi_elment in duan_zi_elments: 76 | # 【数据】用户名 77 | username = duan_zi_elment.xpath('.//a[@class="u-user-name"]/text()')[0] 78 | 79 | # 【数据】段子发布时间 80 | pubtime = duan_zi_elment.xpath('.//span/text()')[0] 81 | 82 | desc_element = duan_zi_elment.xpath('.//div[@class="j-r-list-c-desc"]')[0] 83 | # 【数据】段子描述内容 84 | content = desc_element.xpath('./a/text()')[0] 85 | 86 | img_div_element = duan_zi_elment.xpath('.//div[@class="j-r-list-c-img"]')[0] 87 | img = 
img_div_element.xpath('.//img/@data-original')[0] 88 | alt = img_div_element.xpath('.//img/@alt')[0] 89 | 90 | # 5.把解析后的数据以元组的方式放入到队列中去 91 | self.joke_queue.put((username, content, img, alt, pubtime)) 92 | 93 | 94 | class BSWriter(threading.Thread): 95 | """ 96 | 下载图片、写入文字数据到csv文件中 97 | """ 98 | 99 | def __init__(self, page_queue, joke_queue, writer, gLock, name, *args, **kwargs): 100 | super(BSWriter, self).__init__(*args, **kwargs) 101 | 102 | # 1.初始化 103 | self.page_queue = page_queue 104 | self.joke_queue = joke_queue 105 | self.writer = writer 106 | self.gLock = gLock 107 | self.name = name 108 | 109 | def run(self): 110 | while True: 111 | if self.joke_queue.empty() and self.page_queue.empty(): 112 | print(self.name + '任务完成~') 113 | break 114 | 115 | # 2.从joke_queue队列中获取数据 116 | joke_info = self.joke_queue.get(timeout=40) 117 | username, content, img, alt, pubtime = joke_info 118 | 119 | # 3.上锁 120 | self.gLock.acquire() 121 | 122 | # 4.写入数据到csv中 123 | self.writer.writerow((username, content, img, alt, pubtime)) 124 | 125 | # 5.下载图片到本地 126 | # file_name = alt + fileutils.get_file_suffix(img) 127 | # request.urlretrieve(img, './imgs/%s' % file_name) 128 | 129 | # 6.释放锁 130 | self.gLock.release() 131 | 132 | print('写入一条数据成功') 133 | 134 | 135 | class BSDownImg(threading.Thread): 136 | """ 137 | 下载图片的消费者 138 | """ 139 | 140 | def __init__(self, page_queue, joke_queue, gLock, name, *args, **kwargs): 141 | super(BSDownImg, self).__init__(*args, **kwargs) 142 | self.page_queue = page_queue 143 | self.joke_queue = joke_queue 144 | self.gLock = gLock 145 | self.name = name 146 | 147 | def run(self): 148 | while True: 149 | if self.joke_queue.empty() and self.page_queue.empty(): 150 | print(self.name + '任务完成~') 151 | break 152 | username, content, img, alt, pubtime = self.joke_queue.get(timeout=40) 153 | 154 | # 上锁并下载图片 155 | self.gLock.acquire() 156 | file_name = alt + fileutils.get_file_suffix(img) 157 | request.urlretrieve(img, './imgs/%s' % file_name) 158 | self.gLock.release() 159 | 160 | print('下载一张图片成功') 161 | 162 | 163 | def spider(): 164 | """ 165 | 爬取百思不得姐的前20页数据 166 | :return: 167 | """ 168 | 169 | # 1.构建队列【生产者、消费者需要上锁的对象】 170 | page_queue = Queue(20) 171 | joke_queue = Queue(200) 172 | 173 | # 2.锁对象 174 | gLock = threading.Lock() 175 | 176 | # 3.写入 177 | fp = open('jokes.csv', 'a', newline='', encoding='utf-8') 178 | writer = csv.writer(fp) 179 | 180 | # 4.写入csv表头信息 181 | writer.writerow(['username', 'content', 'img', 'alt', 'pubtime']) 182 | 183 | # 5.前10页待爬取的地址,放入到队列中 184 | for page_num in range(1, 11): 185 | page_url = 'http://www.budejie.com/hot/%d' % page_num 186 | page_queue.put(page_url) 187 | 188 | # 6.构建10个生成者来进行爬虫 189 | for x in range(1, 6): 190 | t = BSSpider(page_queue, joke_queue, name='生产者%d' % x) 191 | t.start() 192 | 193 | # 7.构建 20 个消费者来写入数据到csv文件中 194 | for x in range(1, 21): 195 | t = BSWriter(page_queue, joke_queue, writer, gLock, name='消费者-文字%d' % x) 196 | t.start() 197 | 198 | # 8.构建 50 个消费者来下载图片 199 | for x in range(1, 51): 200 | t = BSDownImg(page_queue, joke_queue, gLock, name='消费者-图片%d' % x) 201 | t.start() 202 | 203 | 204 | if __name__ == '__main__': 205 | spider() 206 | -------------------------------------------------------------------------------- /spiders/spider_boss.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | 
@software: PyCharm 11 | @file: spider_boss.py 12 | @time: 2018/10/12 10:17 13 | @description:使用selenium爬取boss直聘网并写入到csv文件中 14 | """ 15 | 16 | from selenium import webdriver 17 | import re 18 | from lxml import etree 19 | import requests 20 | import time 21 | import string_utils 22 | import csv 23 | 24 | current_page = 1 25 | 26 | 27 | class BossSpider(object): 28 | driver_path = "/usr/local/bin/chromedriver" 29 | 30 | def __init__(self): 31 | self.driver = webdriver.Chrome(executable_path=BossSpider.driver_path) 32 | 33 | # 网页前缀 34 | self.domain = 'https://www.zhipin.com' 35 | 36 | # 爬取在首页 37 | self.url = 'https://www.zhipin.com/job_detail/?query=python&scity=100010000&industry=&position=' 38 | 39 | self.positions = [] 40 | 41 | # 保存数据到 csv 文件中【追加】 42 | fp = open('positions.csv', 'a', newline='', encoding='utf-8') 43 | self.writer = csv.DictWriter(fp, ['company_name', 'name', 'salary', 'city', 'work_years', 'education', 'desc']) 44 | self.writer.writeheader() 45 | 46 | def run(self): 47 | self.driver.get(self.url) 48 | 49 | global current_page 50 | 51 | while True: 52 | 53 | print('爬取第%d页数据' % current_page) 54 | current_page = current_page + 1 55 | 56 | # 获取首页在源码内容 57 | source = self.driver.page_source 58 | 59 | # 爬去当前页面在数据 60 | self.parse_current_page(source) 61 | 62 | next_bt = self.driver.find_element_by_xpath("//a[@ka='page-next']") 63 | 64 | if 'disabled' in next_bt.get_attribute("class"): 65 | # 最后一页,爬取完成之后,退出应用 66 | self.driver.quit() 67 | break 68 | else: 69 | next_bt.click() 70 | 71 | time.sleep(1) 72 | 73 | # 由于boss直聘做了反爬【验证码】,这里只爬取一页数据 74 | break 75 | 76 | def parse_current_page(self, source): 77 | """ 78 | 解析当前页面在数据获取到详情页面在url:detail_url 79 | :param source: 80 | :return: 81 | """ 82 | html = etree.HTML(source) 83 | 84 | # 获取到每一个职位在详情地址 85 | detail_urls_pre = html.xpath('//div[@class="info-primary"]//a/@href') 86 | # links = html.xpath("//div[@class='info-primary']//a[position()=1]/@href") 87 | 88 | # 利用lambda + map 对职位详情地址列表加入前缀 89 | detail_urls = list(map(lambda x: self.domain + x, detail_urls_pre)) 90 | 91 | # 爬取详情页面的数据 92 | for detail_url in detail_urls: 93 | self.request_detail_page(detail_url) 94 | 95 | time.sleep(1) 96 | 97 | def request_detail_page(self, detail_url): 98 | """ 99 | 打开职位详情页面 100 | :param detail_url: 101 | :return: 102 | """ 103 | 104 | # 1.切换到详情页面窗口 105 | self.driver.execute_script("window.open('%s')" % (detail_url)) 106 | self.driver.switch_to.window(self.driver.window_handles[1]) 107 | 108 | # 2.获取详情页面的源码数据 109 | page_source_detail = self.driver.page_source 110 | 111 | # 3.解析详情页面 112 | self.parse_detail_page(page_source_detail) 113 | 114 | # 4.关闭当前窗口并切换回列表 115 | self.driver.close() 116 | 117 | self.driver.switch_to.window(self.driver.window_handles[0]) 118 | 119 | def parse_detail_page(self, page_source_detail): 120 | """ 121 | 解析职位详情页面 122 | :param page_source_detail: 123 | :return: 124 | """ 125 | html = etree.HTML(page_source_detail) 126 | 127 | # 数据 - 名称 128 | name = html.xpath('//h1/text()')[0] 129 | 130 | # 数据 - 公司名称 131 | company_name = html.xpath('//h3[@class="name"]/a[@ka="job-detail-company"]/text()')[0].strip() 132 | 133 | # 数据 - 薪水 134 | salary = html.xpath("//div[@class='name']/span[@class='badge']/text()")[0].strip() 135 | 136 | # 数据 - info 137 | infos = html.xpath("//div[@class='job-primary detail-box']/div[@class='info-primary']/p/text()") 138 | 139 | desc_pre = html.xpath('//div[@class="job-sec"]/div[@class="text"]/text()') 140 | 141 | # 每一项换行,去掉前后空格,最后去掉特殊符号 142 | desc = string_utils.remove_special_word('\n'.join(desc_pre).strip()) 143 
| 144 | city = infos[0] 145 | work_years = infos[1] 146 | education = infos[2] 147 | 148 | position = { 149 | 'company_name': company_name, 150 | 'name': name, 151 | 'salary': salary, 152 | 'city': city, 153 | 'work_years': work_years, 154 | 'education': education, 155 | 'desc': desc 156 | 157 | } 158 | print('爬取一条数据成功') 159 | print("==" * 40) 160 | 161 | # 写入到csv文件中 162 | self.write_to_csv(position) 163 | 164 | self.positions.append(position) 165 | 166 | def write_to_csv(self, position): 167 | """ 168 | 把职位信息写入到 csv 文件中 169 | :param position: 170 | :return: 171 | """ 172 | self.writer.writerow(position) 173 | 174 | 175 | if __name__ == '__main__': 176 | # 定义爬虫类 177 | spider = BossSpider() 178 | 179 | # 开始执行爬虫 180 | spider.run() 181 | 182 | # 写入到csv文件中 183 | 184 | # 查看数据 185 | print('恭喜!爬取数据完成~') 186 | print(spider.positions) 187 | -------------------------------------------------------------------------------- /spiders/spider_china_weather.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: spider_china_weather.py 12 | @time: 2018/9/20 0:04 13 | @description:利用requests + bs4 + html5lib + pyecharts爬取中国天气网的最低气温并可视化 14 | @install:# pip install pyecharts/pyecharts-snapshot 15 | """ 16 | 17 | import requests 18 | from bs4 import BeautifulSoup 19 | import time 20 | from pyecharts import Bar 21 | 22 | 23 | # 一共8个区域,包含:华北、东北、华东、华中、华南、西北、西南、港澳台 24 | # 华北 25 | url_hb = 'http://www.weather.com.cn/textFC/hb.shtml' 26 | 27 | # 东北 28 | url_db = 'http://www.weather.com.cn/textFC/db.shtml' 29 | 30 | # 华东 31 | url_hd = 'http://www.weather.com.cn/textFC/hd.shtml' 32 | 33 | # 华中 34 | url_hz = 'http://www.weather.com.cn/textFC/hz.shtml' 35 | 36 | # 华南 37 | url_hn = 'http://www.weather.com.cn/textFC/hn.shtml' 38 | 39 | # 西北 40 | url_xb = 'http://www.weather.com.cn/textFC/xb.shtml' 41 | 42 | # 西南 43 | url_xn = 'http://www.weather.com.cn/textFC/xn.shtml' 44 | 45 | # 港澳台【比较特殊】 46 | url_gat = 'http://www.weather.com.cn/textFC/gat.shtml' 47 | 48 | url_areas = [url_hb, url_db, url_hd, url_hz, url_hn, url_xb, url_xn, url_gat] 49 | 50 | HEADERS = { 51 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', 52 | 'Referer': 'http://www.weather.com.cn/textFC/hb.shtml' 53 | } 54 | 55 | # 数据【城市+最低温度】列表 56 | ALL_DATA = [] 57 | 58 | 59 | def parse_page(url): 60 | """ 61 | 解析一个区域:华北、东北、华东等 62 | :param url: 63 | :return: 64 | """ 65 | response = requests.get(url, headers=HEADERS) 66 | 67 | # 1.获取页面的原始html数据 68 | text = response.content.decode('utf-8') 69 | 70 | 71 | # 注意:港澳台中香港的table标签没有正确的关闭,使用lxml解析器不能正确解析。需要使用html5lib【容错性强】去自动补全代码,然后进行解析 72 | soup = BeautifulSoup(text, 'html5lib') 73 | 74 | div_conMidtab = soup.find('div', class_='conMidtab') 75 | 76 | # 3.获取所有的table子Tag【天气信息都保存在table标签下面】 77 | tables = div_conMidtab.find_all('table') 78 | 79 | # 4.遍历片区下面的省份 80 | for table in tables: 81 | # 4.1过滤掉表头的两个tr数据 82 | trs = table.find_all('tr')[2:] 83 | 84 | # 5.遍历省份下面的市区 85 | for index, tr in enumerate(trs): 86 | tds = tr.find_all('td') 87 | 88 | # 5.1 城市名称【第 1 个td标签】 89 | # 注意:一个省份第一个城市取第 2 个td标签;其余城市取第 1 个td标签 90 | city_td = tds[1] if index == 0 else tds[0] 91 | 92 | city = list(city_td.stripped_strings)[0] 93 | 94 | # 5.2 最低气温【倒数第 2 个td标签】 95 | temp_low_td = tds[-2] 96 | 97 | temp_low = 
list(temp_low_td.stripped_strings)[0] 98 | 99 | ALL_DATA.append({"city": city, "temp_low": int(temp_low)}) 100 | 101 | 102 | def spider(): 103 | for index, url in enumerate(url_areas): 104 | print('开始爬取第{}个区域'.format(index + 1)) 105 | parse_page(url) 106 | time.sleep(1) 107 | 108 | 109 | def analysis_data(): 110 | """ 111 | 分析爬下来的数据 112 | :return: 113 | """ 114 | 115 | # 1.默认的排序方式是升序【通过最低气温进行排序】 116 | ALL_DATA.sort(key=lambda data: data['temp_low']) 117 | 118 | # 2.获取前面10条数据 119 | top_10 = ALL_DATA[:10] 120 | 121 | return top_10 122 | 123 | 124 | def show_with_chart(top_10): 125 | """ 126 | 把最低的十个城市和温度生成饼状图 127 | :param top_10: 128 | :return: 129 | """ 130 | # 1.获取城市列表 131 | citys = list(map(lambda item: item['city'], top_10)) 132 | 133 | # 2.最低温度列表 134 | temp_lows = list(map(lambda item: item['temp_low'], top_10)) 135 | 136 | # 3.生成饼状图并写入到html文件中 137 | bar = Bar("最低气温排行榜") 138 | 139 | bar.add("最低温度", citys, temp_lows) 140 | 141 | # 渲染 142 | bar.render('temperature.html') 143 | 144 | 145 | if __name__ == '__main__': 146 | # 1.爬取数据 147 | spider() 148 | 149 | # 2.分析数据 150 | top_10 = analysis_data() 151 | 152 | # 3.使用pyecharts生成饼状图 153 | show_with_chart(top_10) 154 | -------------------------------------------------------------------------------- /spiders/spider_dou_tu_la.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: spider_dou_tu_la 12 | @time: 2018/9/25 14:40 13 | @description:多线程去爬取斗图啦网站的表情 14 | @spider_to:http://www.doutula.com/ 15 | """ 16 | 17 | import requests 18 | from lxml import etree 19 | from urllib import request 20 | import re 21 | import os 22 | import threading 23 | from queue import Queue 24 | import time 25 | 26 | # 技术点 27 | # 1.使用request是获取html数据 28 | # 2.使用xpath解析数据 29 | # 3.使用正则表达式sub()函数过滤掉特殊的字符 30 | # 4.使用urllib.request.urlretrieve()下载图片 31 | # 5.生产者和消费者模式分离 32 | # 6.使用queue[线程安全]去保存【每一页的爬取地址】和【表情图片地址】 33 | 34 | HEADERS = { 35 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36' 36 | } 37 | 38 | 39 | class Procuder(threading.Thread): 40 | """ 41 | 生产者 42 | 爬取页面,获取图片地址加入到表情图片队列中 43 | """ 44 | 45 | def __init__(self, name, page_queue, img_queue, *args, **kwargs): 46 | super(Procuder, self).__init__(*args, **kwargs) 47 | self.name = name 48 | self.page_queue = page_queue 49 | self.img_queue = img_queue 50 | 51 | def run(self): 52 | while True: 53 | if self.page_queue.empty(): 54 | print(self.name + '任务完成~') 55 | break 56 | # 1.获取每一页的url 57 | page_url = self.page_queue.get() 58 | 59 | # 2.爬取页面的数据 60 | self.spider_page(page_url) 61 | 62 | # 3.休眠0.5秒 63 | time.sleep(0.5) 64 | 65 | def spider_page(self, url): 66 | """ 67 | 爬取每一页 68 | :param url: 每一页的地址 69 | :return: 70 | """ 71 | response = requests.get(url, headers=HEADERS) 72 | text_raw = response.text 73 | 74 | # 1.使用etree 75 | html_raw = etree.HTML(text_raw) 76 | 77 | # 2.使用xpath解析数据 78 | # 注意:过滤掉gif标签图片 79 | imgs = html_raw.xpath('//div[@class="page-content text-center"]//img[@class!="gif"]') 80 | 81 | # 3.获取图片的实际连接并下载到本地 82 | for img in imgs: 83 | # 3.1 图片的实际地址 84 | img_url = img.get('data-original') 85 | 86 | # 3.2 图片名称替换特殊符号 87 | alt = re.sub(r'[\??\.,。!!\*]', '', img.get('alt')) 88 | 89 | # 3.3 提取图片的后缀,组装成文件的名字 90 | img_name = alt + os.path.splitext(img_url)[-1] 91 | 92 | # 
3.4 把爬取到的表情【图片地址+图片名称】以【元组】的形式加入到队列图片队列中 93 | self.img_queue.put((img_url, img_name)) 94 | 95 | 96 | class Consumer(threading.Thread): 97 | """ 98 | 消费者 99 | 获取图片的地址下载到本地 100 | """ 101 | 102 | def __init__(self, name, page_queue, img_queue, *args, **kwargs): 103 | super(Consumer, self).__init__(*args, **kwargs) 104 | self.name = name 105 | self.page_queue = page_queue 106 | self.img_queue = img_queue 107 | 108 | def run(self): 109 | while True: 110 | 111 | if self.img_queue.empty() and self.page_queue.empty(): 112 | print(self.name + '任务完成~') 113 | break 114 | 115 | # 1.解包,获取图片的地址 + 图片的名称 116 | img_url, img_name = self.img_queue.get() 117 | 118 | # 2.使用urlretrieve()函数下载图片到本地 119 | request.urlretrieve(img_url, './imgs/%s' % img_name) 120 | 121 | print(img_name + "下载完成") 122 | 123 | 124 | def spider(): 125 | # 1.页面的队列 126 | page_queue = Queue(100) 127 | 128 | # 2.表情图片的队列 129 | # 注意:队列的大小尽量设置大一些,保证线程减少等待的时间 130 | img_queue = Queue(1000) 131 | 132 | # 3.爬取页面的地址 133 | for x in range(1, 10): 134 | url = 'http://www.doutula.com/photo/list/?page=%d' % x 135 | 136 | # 3.1 存入到页面地址队列中 137 | page_queue.put(url) 138 | 139 | # 创建5个生成者和5个消费者 140 | # 生产者:爬取每一页的数据,获取表情图片的url 141 | # 消费者:从表情队列中获取表情图片的实际地址并下载到本地 142 | for x in range(5): 143 | t = Procuder(name='生产线程-%d' % x, page_queue=page_queue, img_queue=img_queue) 144 | t.start() 145 | 146 | for x in range(5): 147 | t = Consumer(name='消费线程-%d' % x, page_queue=page_queue, img_queue=img_queue) 148 | t.start() 149 | 150 | 151 | if __name__ == '__main__': 152 | spider() 153 | -------------------------------------------------------------------------------- /spiders/spider_dytt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: 4.dytt.py 12 | @time: 2018/9/16 18:46 13 | @description:爬电影天堂【 lxml + xpath + requests】【2018新片精品,包含更多】 14 | """ 15 | 16 | import requests 17 | from lxml import etree 18 | import time 19 | 20 | # url = 'http://www.dytt8.net/html/gndy/dyzz/list_23_1.html' 21 | 22 | # 主页地址 23 | BASE_DOMAIN = 'http://www.dytt8.net' 24 | 25 | HEADERS = { 26 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36', 27 | } 28 | 29 | 30 | def get_detail_urls(url): 31 | """ 32 | 获取电影详情页面的url 33 | :param url: 每一页电影列表的地址url 34 | :return: 35 | """ 36 | response = requests.get(url, headers=HEADERS) 37 | 38 | # 注意:右键查看源代码,charset=gb2312" 编码方式【网站编码不规范,解码必须用响应的编码方式进行解码】 39 | # print(response.content.decode('gbk')) 40 | 41 | # html_element = etree.HTML(response.content.decode('gbk')) 42 | 43 | # 注意:电影天堂第3页使用默认的gbk会有乱码,这里使用默认的解码方式【href为英文,解析不会受影响】 44 | html_element = etree.HTML(response.text) 45 | 46 | # 【数据 - 字符串列表】详情页面地址 47 | # 所有class为tbspan的table标签/子孙标签中的a标签的href属性 48 | detail_urls = html_element.xpath('//table[@class="tbspan"]//a/@href') 49 | 50 | # 深拷贝一份列表数据,实现一变遍历列表,一边删除列表数据 51 | # 过滤掉【综合电影】导致的脏数据 52 | detail_urls_new = detail_urls 53 | for index, detail_url in enumerate(detail_urls_new): 54 | if detail_url == '/html/gndy/jddy/index.html': 55 | detail_urls.remove(detail_url) 56 | 57 | # print(detail_urls) 58 | 59 | # print(BASE_DOMAIN + detail_url) 60 | # 组装详情页面的地址 61 | detail_urls = map(lambda x: BASE_DOMAIN + x, detail_urls) 62 | 63 | return detail_urls 64 | 65 | 66 | def parse_detail_page(detail_url): 67 | """ 68 | 解析电影详情页面 69 | 
:param detail_url: 详情页面的地址 70 | :return: 71 | """ 72 | response = requests.get(detail_url, headers=HEADERS) 73 | text = response.content.decode('gbk') 74 | html_element = etree.HTML(text) 75 | 76 | # 【数据 - 电影标题】 77 | title = html_element.xpath('//div[@class="title_all"]//font[@color="#07519a"]/text()')[0] 78 | 79 | # 获取zoom标签 80 | zoom_element = html_element.xpath('//div[@id="Zoom"]')[0] 81 | 82 | # 【数据 - 电影封面和电影截图】 83 | imgs = zoom_element.xpath(".//img/@src") 84 | 85 | # 注意:为了避免脏数据导致应用挂掉,提前初始化 86 | year, country, type, rating, duration, director, actors, cover, screen_shot, download_url = '', '', '', '', '', '', '', '', '', '' 87 | 88 | if len(imgs) > 0: 89 | cover = imgs[0] 90 | 91 | # 【数据 - 电影截图】 92 | if len(imgs) > 1: 93 | screen_shot = imgs[1] 94 | 95 | # 获取div[@id='zoom']标签下面的所有的文本数据【子孙所有的text文本数据】 96 | infos = zoom_element.xpath('.//text()') 97 | 98 | # 解析具体内容的函数 99 | def parse_info(info, rule): 100 | return info.replace(rule, '').strip() 101 | 102 | # 遍历infos每一项去获取有用的数据 103 | for key, info in enumerate(infos): 104 | 105 | # print('遍历第{}项'.format(key)) 106 | # print(info) 107 | # print('结束==================================================') 108 | 109 | if info.startswith('◎年  代'): 110 | # 年代 111 | year = parse_info(info, '◎年  代') 112 | elif info.startswith('◎产  地'): 113 | # 产地 114 | country = parse_info(info, '◎产  地') 115 | elif info.startswith('◎类  别'): 116 | # 类别 117 | type = parse_info(info, '◎类  别') 118 | elif info.startswith('◎豆瓣评分'): 119 | # 豆瓣评分 120 | rating = parse_info(info, '◎豆瓣评分') 121 | elif info.startswith('◎片  长'): 122 | # 片长 123 | duration = parse_info(info, '◎片  长') 124 | elif info.startswith('◎导  演'): 125 | # 导演 126 | director = parse_info(info, '◎导  演') 127 | elif info.startswith('◎主  演'): 128 | # 演员【第一个演员】 129 | actor_first = parse_info(info, '◎主  演') 130 | 131 | actors = [actor_first] 132 | 133 | # 继续往下面遍历 134 | for index in range(key + 1, len(infos)): 135 | item = infos[index].strip() 136 | if item.startswith('◎简  介'): 137 | break 138 | # 获取所有的演员 139 | # print(item) 140 | actors.append(item) 141 | elif info.startswith('◎简  介'): 142 | # desc = parse_info(info, '◎简  介') 143 | 144 | for index in range(key + 1, len(infos)): 145 | item = infos[index].strip() 146 | if item.startswith('【下载地址】'): 147 | break 148 | desc = item 149 | 150 | print(detail_url) 151 | 152 | # 下载地址 153 | if len(html_element.xpath('//td[@bgcolor="#fdfddf"]/a/text()')) > 0: 154 | download_url = html_element.xpath('//td[@bgcolor="#fdfddf"]/a/text()')[0] 155 | elif len(html_element.xpath('//td[@bgcolor="#fdfddf"]/text()')) > 0: 156 | download_url = html_element.xpath('//td[@bgcolor="#fdfddf"]/text()')[0] 157 | 158 | film = { 159 | 'title': title, 160 | 'cover': cover, 161 | 'screen_shot': screen_shot, 162 | 'year': year, 163 | 'country': country, 164 | 'type': type, 165 | 'rating': rating, 166 | 'duration': duration, 167 | 'director': director, 168 | 'actors': actors, 169 | 'desc': desc, 170 | 'download_url': download_url 171 | } 172 | 173 | return film 174 | 175 | 176 | def spider(): 177 | """ 178 | 爬虫的入口 179 | :return: 180 | """ 181 | base_url = 'http://www.dytt8.net/html/gndy/dyzz/list_23_{}.html' 182 | 183 | films = [] 184 | 185 | # 1.获取第1-10页的数据 186 | for index in range(1, 11): 187 | print('开始爬第{}页'.format(index)) 188 | 189 | # 2.电影列表的地址url 190 | url = base_url.format(index) 191 | 192 | # 3.获取当前页面包含的所有电影【详情地址】 193 | detail_urls = get_detail_urls(url) 194 | 195 | # 4.解析每一项电影的详情页面 196 | 197 | for key, detail_url in enumerate(detail_urls): 198 | # print('索引:' + str(key) + ',地址:' + detail_url) 
199 | # print('解析详情页面:' + detail_url) 200 | film = parse_detail_page(detail_url) 201 | 202 | films.append(film) 203 | 204 | # 5.每爬取一页,就休眠2秒钟 205 | time.sleep(1) 206 | 207 | print(films) 208 | 209 | 210 | if __name__ == '__main__': 211 | spider() 212 | -------------------------------------------------------------------------------- /spiders/spider_gushiwen.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: spider_gushiwen 12 | @time: 2018/9/21 17:34 13 | @description:利用【正则表达式】爬取【古诗文】网 14 | @link:https://www.gushiwen.org/ 15 | """ 16 | 17 | import requests 18 | import re 19 | import time 20 | 21 | HEADERS = { 22 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36' 23 | } 24 | 25 | 26 | # 利用正则表达式去爬虫的注意事项 27 | # 1.正则表达式去爬取元素的时候,与 xpath、bs4 不同,没有结构关系,都是当成一个字符串进行匹配处理 28 | # 2.re.DOTALL可以让【.符号】匹配到所有的字符【包含\n】 29 | # 3.正则表达式匹配【任意多字符】一般采用非饥饿型方式【.*?】 30 | 31 | 32 | def spider_page(url): 33 | """ 34 | 爬取某一页的数据 35 | :param url: 36 | :return: 37 | """ 38 | response = requests.get(url, headers=HEADERS) 39 | text_raw = response.text 40 | 41 | # print(text_raw) 42 | 43 | # 1.获取所有的标题 44 | titles = re.findall(r'.*?(.*?)', text_raw, re.DOTALL) 45 | 46 | # 2.获取所有的朝代 47 | dynasties = re.findall(r'.*?(.*?)', text_raw, re.DOTALL) 48 | 49 | # 3.获取作者信息 50 | authors = re.findall(r'.*?.*?(.*?)', text_raw, re.DOTALL) 51 | 52 | # 4.获取古诗文内容 53 | # 内容待进一步美化【去掉多余的元素】 54 | contents_pre = re.findall(r'(.*?)', text_raw, re.DOTALL) 55 | 56 | contents = [] 57 | for content_pre in contents_pre: 58 | # 4.1 利用sub()函数把内容中的【<.*?>或者换行字符】替换为空 59 | content = re.sub(r'<.*?>|\n', "", content_pre) 60 | contents.append(content.strip()) 61 | 62 | # 诗词列表数据 63 | poems = [] 64 | 65 | # 5. 
使用zip()把四个列表组合在一起 66 | for value in zip(titles, dynasties, authors, contents): 67 | # 5.1 自动进行解包放入到变量当中 68 | title, dynastie, author, content = value 69 | 70 | # 5.2 新建dict,并加入到诗词列表数据中 71 | poem = { 72 | 'title': title, 73 | 'dynastie': dynastie, 74 | 'author': author, 75 | 'content': content 76 | } 77 | 78 | poems.append(poem) 79 | 80 | return poems 81 | 82 | 83 | def spider(): 84 | # 全部诗词列表数据 85 | poems = [] 86 | 87 | # 1.爬取前面10页数据 88 | for page_num in range(10): 89 | url = 'https://www.gushiwen.org/default_{}.aspx'.format(page_num + 1) 90 | 91 | print('开始爬取第{}页诗词数据'.format(page_num + 1)) 92 | 93 | poems.append(spider_page(url)) 94 | 95 | time.sleep(1) 96 | 97 | # 2.显示数据 98 | for poem in poems: 99 | print(poem) 100 | print("==" * 40) 101 | 102 | print('恭喜!爬取数据完成!') 103 | 104 | 105 | if __name__ == '__main__': 106 | spider() 107 | -------------------------------------------------------------------------------- /spiders/spider_qiu_shi_bai_ke.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: spider_qiu_shi_bai_ke.py 12 | @time: 2018/9/21 23:16 13 | @description:利用正则表达式去爬取【糗事百科】的文字数据 14 | @link:https://www.qiushibaike.com/text/ 15 | """ 16 | 17 | import re 18 | import requests 19 | 20 | # 待爬取的地址 21 | base_url = 'https://www.qiushibaike.com/text/page/%s/' 22 | 23 | HEADERS = { 24 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', 25 | 'Referer': 'https://www.qiushibaike.com/' 26 | } 27 | 28 | 29 | def spider_page(url): 30 | """ 31 | 爬取某一页的数据 32 | :param url: 33 | :return: 34 | """ 35 | response = requests.get(url, headers=HEADERS) 36 | text_raw = response.text 37 | 38 | # 获取此页的段子数据 39 | # 1.获取作者列表数据 40 | authors_pre = re.findall(r'(.*?)', text_raw, re.DOTALL) 41 | 42 | # 1.1 对获取的作者信息进一步进行处理【数据中包含\n】 43 | authors = [] 44 | for author_pre in authors_pre: 45 | author = re.sub(r'\n', '', author_pre) 46 | authors.append(author) 47 | 48 | # 2.获取段子列表数据 49 | contents_pre = re.findall(r'.*?(.*?)', text_raw, re.S) 50 | 51 | # 2.1 对段子数据进一步处理【数据中包含\n和
】 52 | contents = [] 53 | for content_pre in contents_pre: 54 | content = re.sub(r'<.*?>|\n', '', content_pre) 55 | contents.append(content) 56 | 57 | # 3.把两个列表数据组装成一个新的列表中 58 | jokes = [] 59 | for temp in zip(authors, contents): 60 | author, content = temp 61 | jokes.append({ 62 | 'author': author, 63 | 'content': content 64 | }) 65 | 66 | # 4.返回当前页面获取的段子数据列表 67 | return jokes 68 | 69 | 70 | def spider(): 71 | jokes = [] 72 | 73 | for page_num in range(1, 10): 74 | print('开始爬取第%s页数据' % page_num) 75 | 76 | # 爬取某一页的数据 77 | jokes.append(spider_page(base_url % page_num)) 78 | 79 | # 打印爬取的数据 80 | for joke in jokes: 81 | print(joke) 82 | 83 | print('恭喜!爬取数据完成!') 84 | 85 | 86 | if __name__ == '__main__': 87 | spider() 88 | -------------------------------------------------------------------------------- /spiders/spider_tencent_recruit.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: spider_tencent_recruit 12 | @time: 2018/9/17 11:22 13 | @description:爬腾讯招聘职位信息 14 | """ 15 | 16 | import requests 17 | 18 | from lxml import etree 19 | 20 | import time 21 | 22 | # 每页的职位数 23 | PAGE_SIZE = 10 24 | 25 | BASE_DOMAIN = 'https://hr.tencent.com/' 26 | 27 | HEADERS = { 28 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36', 29 | 'Referer': 'https://hr.tencent.com/position.php?lid=&tid=&keywords=python&start=10', 30 | 'Cookie': '_ga=GA1.2.1222789966.1535530525; pgv_pvi=8193187840; pgv_si=s2985358336; PHPSESSID=22e3m8aknd19s1gqkh0i9eisk0; Hm_lvt_0bd5902d44e80b78cb1cd01ca0e85f4a=1536726429,1536908218,1537154694,1537166987; Hm_lpvt_0bd5902d44e80b78cb1cd01ca0e85f4a=1537167106' 31 | } 32 | 33 | 34 | def get_jo_detail_urls(page_url): 35 | """ 36 | 1.根据当前页面url地址获取每一个职位的详情页面url 37 | :param page_url:当前页面的url 38 | :return: 39 | """ 40 | response = requests.get(page_url, headers=HEADERS) 41 | 42 | html_element = etree.HTML(response.text) 43 | 44 | # print(etree.tostring(html_element, encoding='utf-8').decode('utf-8')) 45 | 46 | detail_urls = html_element.xpath('//tr[@class="even" or @class="odd"]//a/@href') 47 | 48 | # 获取所有职位详情页面的url 49 | detail_urls = map(lambda detail_url: BASE_DOMAIN + detail_url, detail_urls) 50 | 51 | return detail_urls 52 | 53 | 54 | def get_detail_msg(detail_url): 55 | """ 56 | 2.获取某个职位的详细数据 57 | :param detail_url: 职位详细页面的url 58 | :return: 职位数据 59 | """ 60 | # print('请求的详细地址是:' + detail_url) 61 | response = requests.get(detail_url, headers=HEADERS) 62 | html_element = etree.HTML(response.text) 63 | 64 | position = {} 65 | 66 | # 【数据】获取职位标题 67 | title = html_element.xpath('//tr[@class="h"]/td/text()')[0] 68 | position['title'] = title 69 | 70 | # 【数据】工作地点/职位类别 71 | top_infos = html_element.xpath('//tr[@class="c bottomline"]//text()') 72 | position['location'] = top_infos[top_infos.index('工作地点:') + 1] 73 | position['category'] = top_infos[top_infos.index('职位类别:') + 1] 74 | 75 | content_infos = html_element.xpath('//ul[@class="squareli"]') 76 | # 【数据】工作职责 77 | work_do_info = content_infos[0] 78 | position['duty'] = work_do_info.xpath("./li/text()") 79 | 80 | # 【数据】工作要求 81 | work_ask_info = content_infos[1] 82 | position['ask'] = work_ask_info.xpath('./li/text()') 83 | 84 | return position 85 | 86 | 87 | def spider(): 88 | # 0.待返回的职位数据 89 | positions = [] 90 | 
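    # Pagination detail: the list URL's "start" query parameter is an offset,
    # so page_num * PAGE_SIZE maps page 0 -> start=0, page 1 -> start=10, and so on.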
91 | # 1.获取前10页的职位数据 92 | for page_num in range(0, 10): 93 | print('开始爬取第{}页数据'.format(page_num + 1)) 94 | 95 | # 2.每一页的地址 96 | url = 'https://hr.tencent.com/position.php?keywords=python&lid=0&tid=0&start={}#a'.format(page_num * PAGE_SIZE) 97 | 98 | # 3.获取【当前页】所有职位的【详情页面的url】 99 | detail_urls = get_jo_detail_urls(url) 100 | 101 | # 4.一个个去解析详情页面的数据 102 | for detail_url in detail_urls: 103 | position = get_detail_msg(detail_url) 104 | positions.append(position) 105 | 106 | time.sleep(1) 107 | 108 | print('爬取完成!') 109 | print(positions) 110 | 111 | 112 | if __name__ == '__main__': 113 | spider() 114 | -------------------------------------------------------------------------------- /spiders/发表情/auto_send_emoji.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: auto_send_emoji.py 12 | @time: 3/14/19 16:22 13 | @description:根据要求选择表情,发给微信上对应的好友或者微信群 14 | """ 15 | 16 | import requests 17 | from lxml import etree 18 | import os 19 | import re 20 | from utils.string_utils import * 21 | import time 22 | import random 23 | from urllib import request 24 | import itchat 25 | from utils.chat_utils import * 26 | import matplotlib.pyplot as plt 27 | import matplotlib.image as mpimg 28 | from queue import Queue 29 | import threading 30 | 31 | # pip3 install itchat 32 | 33 | HEADERS = { 34 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36' 35 | } 36 | 37 | url = 'https://www.doutula.com/search?type=photo&more=1&keyword={}&page={}' 38 | 39 | 40 | class Spider(object): 41 | 42 | def __init__(self, emoji_type, send_to): 43 | self.emoji_type = emoji_type 44 | self.send_to = send_to 45 | self.emojis = [] 46 | 47 | # 起始页码 48 | self.start_page = 1 49 | 50 | def get_emojis(self): 51 | 52 | while True: 53 | current_url = url.format(self.emoji_type, self.start_page) 54 | resp = requests.get(current_url, headers=HEADERS) 55 | html_raw = etree.HTML(resp.text) 56 | 57 | # 判断当前是否还有emoji表情 58 | container_element = html_raw.xpath('//div[@class="random_picture"]//img') 59 | if len(container_element) > 0: 60 | self.emojis.extend(self.__get_current_page_emoji(html_raw)) 61 | self.start_page += 1 62 | else: 63 | print("当前页面没有表情数据,地址是:%s" % current_url) 64 | break 65 | 66 | time.sleep(0.5) 67 | 68 | def __get_current_page_emoji(self, html_raw): 69 | """ 70 | 获取当前页面所有的emoji图片 71 | :param current_url: 72 | :return: 73 | """ 74 | 75 | a_elements = html_raw.xpath('//div[@class="pic-content text-center"]/div[@class="random_picture"]/a') 76 | 77 | print("第%d页一共有%d张图片" % (self.start_page, len(a_elements))) 78 | 79 | imgs = [] 80 | 81 | for a_element in a_elements: 82 | # 获取img标签【最后一个img】【存储地址】 83 | img_element = a_element.xpath('./img[last()]')[0] 84 | 85 | # 获取p标签【存储名称】 86 | name = a_element.xpath('./p/text()')[0] 87 | 88 | # xpath获取兄弟节点p 89 | # 表情的名称 90 | # name = img_element.xpath('./../p/text()')[0] 91 | 92 | # 表情的下载地址 93 | img_url = img_element.get('data-original') 94 | 95 | # 表情的新名词,不带后缀 96 | # name_new = remove_space(re.sub(r'[\??\.,。!!\*]', '', name)) 97 | 98 | # 注意:由于itchat没法发送带中文的文件,这里随机生成一个名称 99 | name_new = make_random_string(6) 100 | 101 | # 表情的名称,加上后缀 102 | # print('==' * 60) 103 | # print(name_new) 104 | # print(img_url) 105 | # print('==' * 60) 106 | img_name = name_new + 
os.path.splitext(img_url)[-1] 107 | 108 | imgs.append({ 109 | 'name': img_name, 110 | 'url': img_url 111 | }) 112 | 113 | return imgs 114 | 115 | def download_emojis(self, target_emoji): 116 | """ 117 | 下载表情 118 | :param target_emojis: 119 | :return: 120 | """ 121 | # 本地保存目录 122 | local_img = './imgs/%s' % target_emoji.get('name') 123 | 124 | request.urlretrieve(target_emoji.get('url'), local_img) 125 | 126 | print('emoji保存本地地址:%s' % local_img) 127 | 128 | return local_img 129 | 130 | def show_image(self, filename): 131 | lena = mpimg.imread(filename) 132 | 133 | plt.imshow(lena) # 显示图片 134 | plt.axis('off') # 不显示坐标轴 135 | plt.show() 136 | 137 | 138 | if __name__ == '__main__': 139 | 140 | # 准备调用itchat发送图片 141 | itchat.auto_login(hotReload=True) 142 | 143 | emoji_type = input('想发哪类表情:') 144 | send_type = input('某个人:0/群聊:1【默认是单聊】') 145 | send_to = input('发给谁呢?') 146 | 147 | if not emoji_type: 148 | emoji_type = '装逼' 149 | 150 | if not send_type: 151 | send_type = 0 152 | else: 153 | send_type = int(send_type) 154 | 155 | if not send_to: 156 | if send_type == 0: 157 | send_to = '指定经常要发送的一个人' 158 | else: 159 | send_to = '指定经常要发送的一个群' 160 | 161 | spider = Spider(emoji_type, send_to) 162 | 163 | # 带发送的表情 164 | local_img = None 165 | 166 | # 获取这种类型的所有表情 167 | spider.get_emojis() 168 | 169 | while True: 170 | 171 | # 从所有emoji表情中选择一张 172 | choose_emoji = random.sample(spider.emojis, 1) 173 | 174 | # 下载到本地 175 | local_img = spider.download_emojis(choose_emoji[0]) 176 | 177 | # 显示图片 178 | spider.show_image(local_img) 179 | 180 | ok = input('主人满意吗:') 181 | 182 | if ok: 183 | print('好的,就发送这张表情。') 184 | if send_type == 0: 185 | send_to_person(send_to, local_img) 186 | else: 187 | send_to_group_chat(send_to, local_img) 188 | 189 | # 需要再发一张吗 190 | go_on_send = input('需要再发一张吗?') 191 | if go_on_send: 192 | continue 193 | else: 194 | print('结束了') 195 | break 196 | else: 197 | print('不满意,继续找一张') 198 | continue 199 | -------------------------------------------------------------------------------- /spiders/发表情/utils/chat_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: test.py 12 | @time: 3/15/19 11:45 13 | @description:TODO 14 | """ 15 | 16 | import itchat 17 | 18 | 19 | itchat.auto_login(True) 20 | 21 | 22 | def send_to_person(username, file_names): 23 | """ 24 | 发送给某个人 25 | :param username: 发送对象的昵称 26 | :param filename: 文件名 27 | :return: 28 | """ 29 | room = itchat.search_friends(name=r'%s' % username) 30 | 31 | userName = room[0]['UserName'] 32 | 33 | try: 34 | if isinstance(file_names, list): 35 | # 多个图片 36 | for file_name in file_names: 37 | itchat.send_image(file_name, toUserName=userName) 38 | else: 39 | # 一个图片 40 | itchat.send_image(file_names, toUserName=userName) 41 | print('发送完毕!') 42 | except: 43 | print('发送出错!') 44 | 45 | 46 | def send_to_group_chat(target_group_chat_name, file_names): 47 | """ 48 | 群聊 49 | :param target_group_chat_name: 50 | :param file_name: 51 | :return: 52 | """ 53 | rooms = itchat.get_chatrooms(update=True) 54 | 55 | # 目标群聊对象 56 | target_room = None 57 | for room in rooms: 58 | group_chat_name = room.get('NickName') 59 | if target_group_chat_name == group_chat_name: 60 | target_room = room 61 | break 62 | 63 | if target_room: 64 | if isinstance(file_names, list): 65 | for file_name in file_names: 66 | 
target_room.send_image(file_name) 67 | else: 68 | target_room.send_image(file_names) 69 | 70 | print('发送完毕!') 71 | else: 72 | print('抱歉,不存在这个群聊') 73 | -------------------------------------------------------------------------------- /spiders/发表情/utils/string_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: string_utils.py 12 | @time: 3/15/19 10:36 13 | @description:TODO 14 | """ 15 | 16 | import random 17 | import string 18 | 19 | 20 | def remove_space(source): 21 | """ 22 | 去除空格 23 | :param source: 24 | :return: 25 | """ 26 | return "".join(source.split(' ')) 27 | 28 | 29 | 30 | def make_random_string(num): 31 | """ 32 | 生成随机字符串 33 | :param num: 34 | :return: 35 | """ 36 | return ''.join(random.sample(string.ascii_letters + string.digits, num)) -------------------------------------------------------------------------------- /spiders/年终奖/comments.txt: -------------------------------------------------------------------------------- 1 | 有,一个月工资 2 | 我们没有 3 | 有 4 | 还没发 5 | 没有 6 | 没有 7 | 这个真没有 8 | 没有 9 | 有4.8✖️1.5 10 | 从来没有 11 | 没有 12 | 没有,只有水果[流泪]购物卡也没 13 | 年后发…… 14 | 年终奖没有,年会也没有了[捂脸] 15 | 没有了 16 | 说了有,还没发 17 | 从来没有 18 | 我这边小公司提都没提,估计凉了。 19 | 有,但是还不确定发多少 20 | 没有 21 | 没有 22 | 没有了 23 | 有,但是要打折了,具体还未知 24 | 没有过 25 | 要过年才知道 26 | 有,一个半月多一点,每个人不一样,这个看部门老大 27 | 没有 28 | 没有,而且要被裁了[流泪] 29 | 不知道有没有,每年都是8-9月分发,年“中”奖。如果发的话,固定一个月[再见] 30 | 我实习3个月,拿了三分之一月的 31 | 有,0.5个月 32 | 两个月[发呆] 33 | 小公司,有一个月的年终奖! 34 | 最后一天上班 给裁了 年终奖都省了 35 | 有.年底双薪 36 | 没有年终奖,年终奖是神马? 37 | 有年终奖,但是由于入职时间过短,不知道自己会有多少。 38 | 还不知道....😂 39 | 从来没有[流泪] 40 | 没有的飘过 41 | 我们没有 42 | 没有。 43 | 不知道耶。。反正有也就3000块钱。。 44 | 没有 45 | 有,跟去年一样 46 | 我们过年前一天才能知道发不发,才能知道发多少 (没有人知道年终奖的计算法方法,发多少是多少)。其余时间没有一点年终奖的消息 47 | 还不清楚😂 48 | 年会抽奖算不算😂 49 | 有 还没发 50 | 两个月 51 | 有,但不知道多少 52 | 听说有,听说比去年多。 53 | 木有 54 | 还不知道 55 | 老板说我们的叫13薪,多出来的 1 不是额外给的,每个月抽一点出来,最后考核完了看着给,13薪是这个意思吗[撇嘴] 56 | 没有 57 | 有 大概2-3个月的工资 58 | 听说是有。。 59 | 我听说我们公司的一直都是第二年年中之后才发年终奖 1-3个月不等[衰] 60 | 有 但还没发 61 | 一般随过年工资一起发,还没到时间 62 | 还不知道[捂脸] 19号开年会 63 | 去年的没发[撇嘴] 64 | 有,年终奖和去年一样多,不过全年收入上涨幅度可观😄 65 | 没有[难过] 66 | 有,发了个球和一坨毛线,还有一把锤子。 67 | 没有+1 68 | 还不知道,我们得到除夕的前一天才知道。去年也是除夕的前一天 69 | 还不晓得有没有,这个公司加入还没有一个月 70 | 没有 71 | 年终奖是有的。过年前发,一般人1.5应该有的 72 | 没有年终奖,工资都是拖延 73 | 还没发呢 不太清楚啊 74 | 有 75 | 没有 76 | 没有啦,公司都要倒了!!! 77 | 没消息 可能连年会都没有[衰] 78 | 没有 79 | 没有 80 | 没有 81 | 还没通知呢 82 | 工作三年,从来没拿过年终奖 83 | 今年也没有,年会还年后举行[捂脸] 84 | 有 但是还没确定发多少 85 | 有 2月 还没发 86 | 没有。。。 87 | 有,接近两个月工资 88 | 我们年终奖3个月工资 89 | 没年终奖,有季度奖 90 | 减半 91 | 同一个月 92 | 没有 93 | 十一月入职的,有一点 94 | 估计没有,发工资都困难了[发呆] 95 | 外包公司,一直没有 96 | 没有 97 | 有,俩月 98 | 有年会,转正不久,年终奖还不清楚。 99 | 没有+1 100 | 今年还没通知不发,往常是两个月*绩效 101 | 年会都调到年后3月份开了,还说要开拓疆土,扩大规模🌚 102 | 我们这个级别不会有,,, 103 | 今年四月底入职的,不知道有没有 104 | 13薪的1薪算年终奖吗,算的话就有,不算的话就没有 105 | 有,一个月 106 | 真没有[难过] 107 | 据说是有,没说发多少,待定转态,估计凉了 108 | 以前有。今年估计悬了 109 | 我们应该有,但是还不确定 110 | 应该在大公司、国企、事业单位这些影响不大吧。我们还是有的 111 | 有,主要是前面说好的设计的提成,不知道能给到多少 112 | 没有 113 | 没有 114 | 没有年终奖 115 | 应该有吧!没有立马辞职 116 | 没有 117 | 工资拖欠了[流泪] 118 | 没有,而且公司春节前后还不让请假,如果请假,春节的法定假日就不算法定假日了,算成请假了,要扣钱 119 | 没有 120 | 没有 121 | 一直的传统,一个月工资,但平时工资就比同行低好多,综合年收入八万多一点点 122 | 有 123 | 没有 124 | 没有 公司业绩下滑将近8亿 125 | 有,和以前一样。 126 | 不是看老板心情,去年有,今年就不确定了 127 | 没有 128 | 有,一个月还有少量项目分成 129 | 今年现在都还没提过年终奖这件事,感觉凉了[流泪] 130 | 一个月 ! 
131 | 要倒闭了 132 | 有,大部分是两个月工资 133 | 没有 134 | 没有,还降薪20% 135 | 应该有,看公司利润了,大概率一个月工资。 136 | 不清楚[流泪] 137 | 有个锤子 138 | 没有 139 | 应该有,还没发,估计底薪x2。 140 | 没有+降薪20% 141 | 物流集团旗下成立的新科技公司,大数据部门貌似一直都没有年终奖…… 142 | 我们还在评估发多少 143 | 200块红包 144 | 有,听说大概是月工资的1.几倍。ps:发的是17年的奖金,18的奖金还得往后挪。应该是等不到那一天了。 145 | 我们有2个月,不过要19年年中才发 146 | 从来都没有 147 | 正常的按照绩效发 148 | 有,一个月工资 149 | 还没发 往年1.5不到 150 | 2月 151 | 没有 152 | 从来没有过。。 153 | 往年都是4个月,过年前一周发,今年还不知道 154 | 公司已经裁员,剩下的大概率没有年终奖[撇嘴] 155 | 应该是有, 156 | 有,一个月 157 | 我们有,照常两个月奖金。但是是平时周六上班换来的 158 | 一直没有 159 | 从来没有 160 | 年薪百分之15 161 | 还没听吭 162 | 有,一个月,但是不多 163 | 还不知道。。。 164 | 多半没有! 165 | 一直是13薪。不过今年改成了bouns,比13薪率高,大概1.3个月的样子 166 | 应该有 167 | 往年惯例都是一个月,今年公司效益不错,承诺至少两个月以上,这几天公司还组织来巴厘岛度假。额,是不是有点太拉仇恨了呀[调皮] 168 | 我公司,就我部门没有[微笑] 169 | 有,几个月不知道 170 | 我们还没发 171 | 有,项目奖半个月,年终奖,惯例4月份发 172 | 一个月 173 | 有 174 | 妹有 175 | 一个月+1200过年费 176 | 应该是有2个月, 177 | 创业公司,刚开门几个月,没有 178 | 还不清楚,估计没有 179 | 没有 180 | 之前说是有2个月的,年后发 181 | 没有 182 | 还没发,不知道有没有 183 | 3个月 184 | 我们没有 185 | 没有 186 | 据说还有,还没发过[闭嘴] 187 | 还不知道呢。去年是春节放假前发的 188 | 啥都没得 189 | 实习生不说话😂😂 190 | 还没有发,但入职说的14薪 191 | 我们每年都4月底发,还不清楚有没有 192 | 年薪的20%就是年终奖,绩效不好还要扣 193 | 我们是年中奖,刚来半年,不知道年中有没有[微笑] 194 | 有,不过减半了,去年年终是多发两个月的工资,今年好像是只有一个月的 195 | 说有,不知道有没有 196 | 没有 197 | 以前是平均3到4个月工资,去年就没发了,今年估计也悬了 198 | 没有 年会都没了 199 | 没有。 200 | 2+绩效 201 | 没有 202 | 去年就把年终奖取消了,变为项目奖的路过 203 | 18 年第三季度发的 17 年年终奖[微笑] 204 | 发了,大概是四五个月工资了,因为工资低😂 205 | 有,拖到6月份发 206 | 还不知道 207 | 一个月工资,但是公司规定要第二个季度才发,差不多就是67月份 208 | 没有 209 | 老板承诺都有,但现在还没发,不知道会不会兑现 210 | 没有 211 | 年底双薪,年终两月 212 | 没有 213 | 没有,估计年会有红包 214 | 不知道多少个月 215 | 据说有😂 216 | 要不稍微分行业来个投票 清晰些 217 | 从业三年,几乎没感受到年终奖,以至于我都不记得我们有发过所谓的年终奖吗? 218 | 没有…… 219 | 有,但不知道怎么发[捂脸] 220 | 有 2月12日 221 | 老大说年终奖年后发。 222 | 一个月 223 | 绩效到现在还不知道。。。 224 | 有,一个月工资 225 | 没有,据说年会都取消了 226 | 没有 227 | 新公司,啥都不确定[晕][晕][晕] 228 | 从来就没有 229 | 一直没有[撇嘴] 230 | 我们没有,昨天发工资,每个人都要延迟发放一部分,普遍30%,个别60%或70%。大家已经怨声载道了 231 | 有6个月[尴尬] 232 | 没有 233 | 应该是一个月工资,下周五年会抽奖保底 400,一等奖 10000 现金,如果我中了就可以每个月给张叔打赏了😏😏 234 | 2 235 | 老板说给我加工资,年终奖照发 236 | 没有 237 | 也没有 238 | 有,两个月 239 | 有,一个月工资 240 | 我们没有 241 | 有 242 | 还没发 243 | 没有 244 | 没有 245 | 这个真没有 246 | 没有 247 | 有4.8✖️1.5 248 | 从来没有 249 | 没有 250 | 没有,只有水果[流泪]购物卡也没 251 | 年后发…… 252 | 年终奖没有,年会也没有了[捂脸] 253 | 没有了 254 | 说了有,还没发 255 | 从来没有 256 | 我这边小公司提都没提,估计凉了。 257 | 有,但是还不确定发多少 258 | 没有 259 | 没有 260 | 没有了 261 | 有,但是要打折了,具体还未知 262 | 没有过 263 | 要过年才知道 264 | 有,一个半月多一点,每个人不一样,这个看部门老大 265 | 没有 266 | 没有,而且要被裁了[流泪] 267 | 不知道有没有,每年都是8-9月分发,年“中”奖。如果发的话,固定一个月[再见] 268 | 我实习3个月,拿了三分之一月的 269 | -------------------------------------------------------------------------------- /spiders/年终奖/nzj.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: nzj.py 12 | @time: 1/11/19 16:00 13 | @description:看看大家今年大家都有年终奖吗? 
14 | """ 15 | 16 | import json 17 | import jieba 18 | import matplotlib.pyplot as plt 19 | from wordcloud import WordCloud 20 | 21 | # 文件名称 22 | filename = 'comments.txt' 23 | 24 | # 总共的评论数目 25 | comment_count = 0 26 | 27 | 28 | def response(flow): 29 | request = flow.request 30 | response = flow.response 31 | 32 | global comment_count 33 | 34 | # 请求的地址 35 | request_url = request.url 36 | 37 | # 筛选 38 | if 'comments' in request_url and 'zsxq' in request_url: 39 | # 返回的内容 40 | response_content = response.content.decode('utf-8') 41 | print('请求地址:' + request_url) 42 | print('请求方法:' + str(request.method)) 43 | print('参数:' + str(request.data)) 44 | 45 | obj = json.loads(response_content) 46 | 47 | comments = obj['resp_data']['comments'] 48 | 49 | # 最后一页 50 | if len(comments) == 0: 51 | print('一共有%d个球友发表了自己的看法' % comment_count) 52 | 53 | # 生成词云 54 | generate_word_cloud() 55 | 56 | else: 57 | comment_count += len(comments) 58 | for comment in comments: 59 | comment_content = comment['text'] 60 | with open(filename, 'a') as f: 61 | f.write(comment_content + '\n') 62 | 63 | 64 | def generate_word_cloud(): 65 | """ 66 | 生成词云 67 | :return: 68 | """ 69 | with open(filename, 'r') as f: 70 | word_content = f.read() 71 | 72 | # 使用jieba去分割 73 | wordlist = jieba.cut(word_content, cut_all=True) 74 | 75 | wl_space_split = " ".join(wordlist) 76 | 77 | font = r'/Users/xingag/Library/Fonts/SimHei.ttf' 78 | 79 | wordcloud = WordCloud(font_path=font, width=1080, height=1920, margin=2).generate(wl_space_split) 80 | 81 | # 显示图片 82 | plt.imshow(wordcloud) 83 | plt.axis("off") 84 | 85 | # 按照设置保存到本地文件夹 86 | wordcloud.to_file("./output.png") 87 | -------------------------------------------------------------------------------- /spiders/年终奖/output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/spiders/年终奖/output.png -------------------------------------------------------------------------------- /verification code/注册【中知网】/AipOcr.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: AipOcr.py 12 | @time: 1/23/19 15:19 13 | @description:AipOcr是OCR的Python SDK客户端,为使用OCR的开发人员提供了一系列的交互方法。 14 | """ 15 | 16 | from aip import AipOcr 17 | 18 | """ 你的 APPID AK SK """ 19 | APP_ID = '15474**' 20 | API_KEY = 'VBoMZ6XUX119w***' 21 | SECRET_KEY = 'GPvqLVeGIMOR57***' 22 | 23 | client = AipOcr(APP_ID, API_KEY, SECRET_KEY) 24 | -------------------------------------------------------------------------------- /verification code/注册【中知网】/cnki_demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: cnki_demo.py 12 | @time: 1/23/19 15:44 13 | @description:[中国知网注册] 14 | """ 15 | from PIL import Image 16 | from selenium import webdriver 17 | from file_tools import * 18 | from AipOcr import * 19 | import requests 20 | import time 21 | import json 22 | 23 | 24 | class Cnki_Spider(object): 25 | driver_path = "/usr/local/bin/chromedriver" 26 | 27 | def __init__(self): 28 | self.driver = 
webdriver.Chrome(executable_path=Cnki_Spider.driver_path) 29 | 30 | # 包含验证码的页面的截图 31 | self.screen_shot_file_name = "screen_shot.png" 32 | 33 | # 验证码图片 34 | self.code_file_name = "image_code.png" 35 | 36 | # 注册主页面 37 | self.main_url = 'http://my.cnki.net/elibregister/commonRegister.aspx' 38 | 39 | # 待注册的内容 40 | # 昵称 41 | self.username = 'xingag2311' 42 | # 密码 43 | self.password = 'Hu9012782' 44 | # 邮箱地址 45 | self.email = '809900227@qq.com' 46 | 47 | def run(self): 48 | # 1.打开注册页面【包含验证码】 49 | self.driver.get(self.main_url) 50 | 51 | source = self.driver.page_source 52 | 53 | # 2.验证码图片、验证码输入框 54 | code_input_element = self.driver.find_element_by_id('txtOldCheckCode') 55 | code_img_element = self.driver.find_element_by_id('checkcode') 56 | 57 | 58 | # 外面容器 59 | container_element = self.driver.find_element_by_id('form1') 60 | 61 | # 3.获取验证码、填入输入框、点击外面 62 | # 如果没有出现出错的提示tips,就代表输入验证码成功 63 | while True: 64 | 65 | code = self.get_code().strip() 66 | 67 | error_tips_element = self.driver.find_element_by_id('span_oldcheckcode') 68 | 69 | print('验证码为:%s' % code) 70 | code_input_element.clear() 71 | code_input_element.click() 72 | code_input_element.send_keys(code) 73 | 74 | # 点击外围的容器,判断验证码是否输入正确 75 | container_element.click() 76 | 77 | # 显示了错误信息:验证码输入错误 78 | if error_tips_element.text: 79 | time.sleep(2) 80 | print('验证码验证失败,点击验证码图片') 81 | 82 | # 点击验证码图片,重新加载验证码 83 | code_img_element.click() 84 | continue 85 | else: 86 | print('验证码验证成功') 87 | break 88 | 89 | # 3.注册 90 | self.register(code) 91 | 92 | def get_code(self): 93 | 94 | # 1.截图并保存到本地 95 | self.driver.get_screenshot_as_file('./%s' % self.screen_shot_file_name) 96 | 97 | # 2.打开文件 98 | screenshot_image = Image.open('./%s' % self.screen_shot_file_name) 99 | 100 | # 3.设置要裁剪的区域(验证码所在的区域) 101 | code_box = (899, 819, 1048, 883) 102 | 103 | # 4.截图:生成只有验证码的图片 104 | code_image = screenshot_image.crop(code_box) 105 | 106 | # 5.保存到本地 107 | code_image.save("./%s" % self.code_file_name) 108 | 109 | # 6.以byte读取图片 110 | image = get_file_content("./%s" % self.code_file_name) 111 | 112 | # 7.使用百度OCR识别验证码 113 | result = client.basicAccurate(image) 114 | 115 | print(result) 116 | 117 | # 识别的文字内容 118 | word_result = result.get('words_result')[0].get('words') 119 | 120 | return word_result 121 | 122 | def register(self, code): 123 | # 用户名输入框 124 | username_input_element = self.driver.find_element_by_id('username') 125 | 126 | # 密码输入框 127 | password_input_element = self.driver.find_element_by_id('txtPassword') 128 | 129 | # 邮箱输入框 130 | txtEmail_input_element = self.driver.find_element_by_id('txtEmail') 131 | 132 | # 注册按钮 133 | submit_btn_element = self.driver.find_element_by_id('ButtonRegister') 134 | 135 | username_input_element.send_keys(self.username) 136 | password_input_element.send_keys(self.password) 137 | txtEmail_input_element.send_keys(self.email) 138 | 139 | submit_btn_element.click() 140 | 141 | 142 | if __name__ == '__main__': 143 | spider = Cnki_Spider() 144 | spider.run() 145 | -------------------------------------------------------------------------------- /verification code/注册【中知网】/file_tools.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: file_tools.py 12 | @time: 1/23/19 15:41 13 | @description:TODO 14 | """ 15 | 16 | 17 | def get_file_content(filePath): 18 | """ 19 | 读取文件 20 | :param 
filePath: 文件路径 21 | :return: byte类型 22 | """ 23 | with open(filePath, 'rb') as fp: 24 | return fp.read() 25 | -------------------------------------------------------------------------------- /verification code/注册【中知网】/image_code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/verification code/注册【中知网】/image_code.png -------------------------------------------------------------------------------- /verification code/注册【中知网】/screen_shot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/verification code/注册【中知网】/screen_shot.png -------------------------------------------------------------------------------- /微信聊天记录/utils/dbutils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: dbutils.py 12 | @time: 2020-04-11 16:57 13 | @description 14 | """ 15 | 16 | import sqlite3 17 | 18 | 19 | class DUtil(): 20 | 21 | def __init__(self, db_path="./weixin.db"): 22 | """ 23 | 数据库初始化 24 | """ 25 | self.db = sqlite3.connect(db_path) 26 | self.cursor = self.db.cursor() 27 | 28 | def execute(self, sql, param=None): 29 | """ 30 | Sql语句,包含:增、删、改 31 | param:数据,可以为列表、字典,也可以为空 32 | """ 33 | try: 34 | if param is None: 35 | self.cursor.execute(sql) 36 | else: 37 | if type(param) is list: 38 | self.cursor.executemany(sql, param) 39 | else: 40 | self.cursor.execute(sql, param) 41 | count = self.db.total_changes 42 | self.db.commit() 43 | except Exception as e: 44 | print(e) 45 | return False, e 46 | 47 | # 返回结果 48 | return True if count > 0 else False 49 | 50 | def query(self, sql, param=None): 51 | """ 52 | 查询语句 53 | sql:Sql语句 54 | param:参数,可以包含空 55 | retutn:成功返回True 56 | """ 57 | if param is None: 58 | self.cursor.execute(sql) 59 | else: 60 | self.cursor.execute(sql, param) 61 | return self.cursor.fetchall() 62 | 63 | def close(self): 64 | """ 65 | 数据库关闭 66 | """ 67 | self.cursor.close() 68 | self.db.close() 69 | 70 | -------------------------------------------------------------------------------- /微信聊天记录/utils/string_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: StringUtils.py 12 | @time: 2020-04-11 18:39 13 | @description:TODO 14 | """ 15 | import re 16 | 17 | 18 | def get_ava_string(str): 19 | """ 20 | 去掉特殊符号,保留正常内容 21 | :param str: 22 | :return: 23 | """ 24 | return re.sub(u"([^ \u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", str) 25 | -------------------------------------------------------------------------------- /获取女友的位置/.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 12 | -------------------------------------------------------------------------------- /获取女友的位置/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 6 | 7 | 
-------------------------------------------------------------------------------- /获取女友的位置/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /获取女友的位置/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /获取女友的位置/.idea/地理位置.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /获取女友的位置/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: meta_picture.py.py 12 | @time: 2019-08-23 16:23 13 | @description:高德坐标拾取网站:https://lbs.amap.com/console/show/picker 14 | """ 15 | 16 | import os 17 | import exifread 18 | from decimal import Decimal 19 | from position_utils import * 20 | import requests 21 | import json 22 | import datetime 23 | 24 | 25 | # pip3 install exifread 26 | 27 | 28 | class Location(object): 29 | 30 | def __init__(self, image_path): 31 | self.img_path = image_path 32 | 33 | self.api_key = "你申请的AK" 34 | 35 | self.url_get_position = 'https://restapi.amap.com/v3/geocode/regeo?key={}&location={}' 36 | 37 | def run(self): 38 | coordinate = self.__get_image_ability() 39 | 40 | print(f'获取到经度、纬度是:{coordinate}') 41 | 42 | if not coordinate: 43 | return 44 | 45 | # 根据经度和纬度,获取到详细地址 46 | address = self.__get_address(coordinate) 47 | 48 | # 检验坐标值 49 | # https://lbs.amap.com/console/show/picker 50 | print(f'你女朋友当前位置在:{address}') 51 | 52 | def __get_address(self, location): 53 | """ 54 | 根据坐标得到详细地址 55 | :param location: 经纬度值 56 | :return: 57 | """ 58 | resp = requests.get(self.url_get_position.format(self.api_key, location)) 59 | 60 | location_data = json.loads(resp.text) 61 | 62 | address = location_data.get('regeocode').get('formatted_address') 63 | 64 | return address 65 | 66 | def __format_lati_long_data(self, data): 67 | """ 68 | 对经度和纬度数据做处理,保留6位小数 69 | :param data: 原始经度和纬度值 70 | :return: 71 | """ 72 | # 删除左右括号和空格 73 | data_list_tmp = str(data).replace('[', '').replace(']', '').split(',') 74 | data_list = [data.strip() for data in data_list_tmp] 75 | 76 | # 替换秒的值 77 | data_tmp = data_list[-1].split('/') 78 | 79 | # 秒的值 80 | data_sec = int(data_tmp[0]) / int(data_tmp[1]) / 3600 81 | 82 | # 替换分的值 83 | data_tmp = data_list[-2] 84 | 85 | # 分的值 86 | data_minute = int(data_tmp) / 60 87 | 88 | # 度的值 89 | data_degree = int(data_list[0]) 90 | 91 | # 由于高德API只能识别到小数点后的6位 92 | # 需要转换为浮点数,并保留为6位小数 93 | result = "%.6f" % (data_degree + data_minute + data_sec) 94 | return float(result) 95 | 96 | def __get_image_ability(self): 97 | """ 98 | 获取图片的属性值,包含:经纬度、拍摄时间等 99 | :param picture_name: 100 | :return: 101 | """ 102 | 103 | # 利用exifread库,读取图片的属性 104 | img_exif = exifread.process_file(open(self.img_path, 'rb')) 105 | 106 | # 能够读取到属性 107 | if img_exif: 108 | # 纬度数 109 | latitude_gps = img_exif['GPS GPSLatitude'] 110 | 111 | # N,S 南北纬方向 112 | latitude_direction = img_exif['GPS GPSLatitudeRef'] 113 | 114 | # 经度数 115 | longitude_gps = img_exif['GPS GPSLongitude'] 116 | 117 | # E,W 东西经方向 118 | 
longitude_direction = img_exif['GPS GPSLongitudeRef'] 119 | 120 | # 拍摄时间 121 | take_time = img_exif['EXIF DateTimeOriginal'] 122 | 123 | is_lie = self.judge_time_met(take_time) 124 | 125 | if is_lie: 126 | print('很遗憾的通知你,你的女朋友在撒谎!!!') 127 | return 128 | 129 | # 纬度、经度、拍摄时间 130 | if latitude_gps and longitude_gps and take_time: 131 | 132 | # 对纬度、经度值原始值作进一步的处理 133 | latitude = self.__format_lati_long_data(latitude_gps) 134 | longitude = self.__format_lati_long_data(longitude_gps) 135 | 136 | # print(f'{longitude},{latitude}') 137 | 138 | # 注意:由于gps获取的坐标在国内高德等主流地图上逆编码不够精确,这里需要转换为火星坐标系 139 | location = wgs84togcj02(longitude, latitude) 140 | 141 | return f'{location[0]},{location[1]}' 142 | else: 143 | print(f'获取的图片数据属性不完整') 144 | return '' 145 | else: 146 | print('抱歉,图片不是原图,没法获取到图片属性。') 147 | return '' 148 | 149 | def judge_time_met(self, take_time): 150 | """ 151 | 通知拍摄时间判断女朋友是否撒谎 152 | :param take_time: 153 | :return: 154 | """ 155 | # 拍摄时间 156 | format_time = str(take_time).split(" ")[0].replace(":", "-") 157 | 158 | # 当天日期 159 | today = str(datetime.date.today()) 160 | 161 | if format_time == today: 162 | return False 163 | else: 164 | return True 165 | 166 | 167 | if __name__ == '__main__': 168 | # 女朋友发过来的图片【原图】 169 | location = Location('./picture/11441566648796_.pic_hd.jpg') 170 | 171 | # 找到女朋友的地理位置 172 | location.run() 173 | -------------------------------------------------------------------------------- /获取女友的位置/picture/11441566648796_.pic_hd.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xingag/spider_python/80668005f1416dab04c25569b35b679a2a6b2e5d/获取女友的位置/picture/11441566648796_.pic_hd.jpg -------------------------------------------------------------------------------- /获取女友的位置/position_utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding: utf-8 3 | 4 | """ 5 | @version: v1.0 6 | @author: xag 7 | @license: Apache Licence 8 | @contact: xinganguo@gmail.com 9 | @site: http://www.xingag.top 10 | @software: PyCharm 11 | @file: position_utils.py 12 | @time: 2019-08-23 17:44 13 | @description:坐标转换 14 | """ 15 | 16 | # -*- coding: utf-8 -*- 17 | import json 18 | import math 19 | 20 | x_pi = 3.14159265358979324 * 3000.0 / 180.0 21 | pi = 3.1415926535897932384626 # π 22 | a = 6378245.0 # 长半轴 23 | ee = 0.00669342162296594323 # 扁率 24 | 25 | 26 | def wgs84togcj02(lng, lat): 27 | """ 28 | WGS84转GCJ02(火星坐标系) 29 | :param lng:WGS84坐标系的经度 30 | :param lat:WGS84坐标系的纬度 31 | :return: 32 | """ 33 | if out_of_china(lng, lat): # 判断是否在国内 34 | return lng, lat 35 | dlat = transformlat(lng - 105.0, lat - 35.0) 36 | dlng = transformlng(lng - 105.0, lat - 35.0) 37 | radlat = lat / 180.0 * pi 38 | magic = math.sin(radlat) 39 | magic = 1 - ee * magic * magic 40 | sqrtmagic = math.sqrt(magic) 41 | dlat = (dlat * 180.0) / ((a * (1 - ee)) / (magic * sqrtmagic) * pi) 42 | dlng = (dlng * 180.0) / (a / sqrtmagic * math.cos(radlat) * pi) 43 | mglat = lat + dlat 44 | mglng = lng + dlng 45 | return [mglng, mglat] 46 | 47 | 48 | def gcj02towgs84(lng, lat): 49 | """ 50 | GCJ02(火星坐标系)转GPS84 51 | :param lng:火星坐标系的经度 52 | :param lat:火星坐标系纬度 53 | :return: 54 | """ 55 | if out_of_china(lng, lat): 56 | return lng, lat 57 | dlat = transformlat(lng - 105.0, lat - 35.0) 58 | dlng = transformlng(lng - 105.0, lat - 35.0) 59 | radlat = lat / 180.0 * pi 60 | magic = math.sin(radlat) 61 | magic = 1 - ee * magic * magic 62 | sqrtmagic = math.sqrt(magic) 63 | dlat = (dlat * 180.0) / ((a * (1 
- ee)) / (magic * sqrtmagic) * pi) 64 | dlng = (dlng * 180.0) / (a / sqrtmagic * math.cos(radlat) * pi) 65 | mglat = lat + dlat 66 | mglng = lng + dlng 67 | return [lng * 2 - mglng, lat * 2 - mglat] 68 | 69 | 70 | def transformlat(lng, lat): 71 | ret = -100.0 + 2.0 * lng + 3.0 * lat + 0.2 * lat * lat + \ 72 | 0.1 * lng * lat + 0.2 * math.sqrt(math.fabs(lng)) 73 | ret += (20.0 * math.sin(6.0 * lng * pi) + 20.0 * 74 | math.sin(2.0 * lng * pi)) * 2.0 / 3.0 75 | ret += (20.0 * math.sin(lat * pi) + 40.0 * 76 | math.sin(lat / 3.0 * pi)) * 2.0 / 3.0 77 | ret += (160.0 * math.sin(lat / 12.0 * pi) + 320 * 78 | math.sin(lat * pi / 30.0)) * 2.0 / 3.0 79 | return ret 80 | 81 | 82 | def transformlng(lng, lat): 83 | ret = 300.0 + lng + 2.0 * lat + 0.1 * lng * lng + \ 84 | 0.1 * lng * lat + 0.1 * math.sqrt(math.fabs(lng)) 85 | ret += (20.0 * math.sin(6.0 * lng * pi) + 20.0 * 86 | math.sin(2.0 * lng * pi)) * 2.0 / 3.0 87 | ret += (20.0 * math.sin(lng * pi) + 40.0 * 88 | math.sin(lng / 3.0 * pi)) * 2.0 / 3.0 89 | ret += (150.0 * math.sin(lng / 12.0 * pi) + 300.0 * 90 | math.sin(lng / 30.0 * pi)) * 2.0 / 3.0 91 | return ret 92 | 93 | 94 | def out_of_china(lng, lat): 95 | """ 96 | 判断是否在国内,不在国内不做偏移 97 | :param lng: 98 | :param lat: 99 | :return: 100 | """ 101 | if lng < 72.004 or lng > 137.8347: 102 | return True 103 | if lat < 0.8293 or lat > 55.8271: 104 | return True 105 | return False 106 | --------------------------------------------------------------------------------
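A minimal usage sketch for position_utils.py (editor's illustration, not a file from the repo): it converts a WGS-84 GPS fix into the GCJ-02 coordinates that AMap expects, then fills in the same regeo URL template used in 获取女友的位置/main.py. The sample longitude/latitude is made up, and the key is the same placeholder string main.py uses.

# usage_sketch.py - illustrative only; assumes it sits next to position_utils.py
from position_utils import wgs84togcj02

# A made-up WGS-84 longitude/latitude, e.g. decoded from EXIF GPS tags
lng, lat = 114.025974, 22.546054

# Convert to GCJ-02 (火星坐标系) before calling any AMap API, as main.py does
gcj_lng, gcj_lat = wgs84togcj02(lng, lat)

# Same URL template as main.py; AMap only reads 6 decimal places
api_key = '你申请的AK'
location = '{:.6f},{:.6f}'.format(gcj_lng, gcj_lat)
url = 'https://restapi.amap.com/v3/geocode/regeo?key={}&location={}'.format(api_key, location)

print(url)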