├── .gitattributes ├── .gitignore ├── README.md ├── 【51Job】查岗位 └── select_job.py ├── 【bilibili】自动登录 ├── README.md ├── __init__.py └── login.py ├── 【bilibili】视频下载 ├── __init__.py └── video_download.py ├── 【双色球】头奖分布 ├── main.py ├── result.jpg └── 近期记录.xlsx ├── 【壁纸】美女壁纸下载器 └── bg_down.py ├── 【大众点评】字体反爬、坐标反爬 ├── 参数生成 │ ├── encryp.js │ └── uid.py ├── 旧版 │ ├── __init__.py │ ├── parse_address_poi.py │ └── parse_font_css.py └── 最新版7月 │ ├── README.md │ ├── font.json │ └── main.py ├── 【天眼查】字体加密 ├── num.woff └── tyc.py ├── 【抖音】无水印视频解析 ├── README.md ├── __init__.py └── parse.py ├── 【拼多多】登陆参数生成 ├── PinDuoDuo.py ├── README.md ├── __init__.py └── encryp.js ├── 【文书】app查询接口 └── main.py ├── 【淘宝】自动登陆 ├── auto_login_pyppeteer.py └── login_for_sina.py ├── 其他实战 ├── __init__.py ├── 【5173网】自动登录 │ ├── auto_login.py │ ├── encryp.js │ └── logOK.png ├── 【9377网】自动登录 │ ├── 9377login.py │ └── __init__.py ├── 【DNS】自动登录 │ ├── Login.py │ ├── __init__.py │ └── dns.js ├── 【GitHub】自动登录 │ ├── __init__.py │ └── login.py ├── 【Glidedsky】自动登陆 │ └── login.py ├── 【Python加密库】Demo │ ├── __init__.py │ └── encryption.py ├── 【TCL金融】自动登录 │ ├── __init__.py │ ├── auto_login.py │ ├── encryp.js │ └── ok.png ├── 【steam】自动登录 │ ├── execute.js │ └── login.py ├── 【万创帮】自动登录 │ ├── __init__.py │ ├── encryp.js │ ├── login_ok.png │ └── spider_login.py ├── 【中关村】自动登录 │ ├── README.md │ └── login.py ├── 【京东】商品数据爬取 │ ├── __init__.py │ ├── geckodriver │ └── selenium抓取.py ├── 【人人网】自动登录 │ ├── enc.js │ └── login.py ├── 【企业名片】企业查询 │ ├── encryp.js │ └── qi_ming.py ├── 【国鑫所】自动登录 │ ├── Login.py │ ├── __init__.py │ ├── encryp.js │ └── login_ok.png ├── 【天眼查】模拟登录 │ ├── __init__.py │ └── login.py ├── 【天翼】登录 │ ├── login.py │ └── v1.js ├── 【好莱客】参数解析 │ ├── __init__.py │ ├── encryp.js │ ├── holike.py │ └── ok.png ├── 【小牛在线】登录参数生成 │ ├── __init__.py │ ├── encryp.js │ └── make_param.py ├── 【开鑫贷】登陆参数生成 │ ├── KaiXinDai.py │ ├── __init__.py │ └── encryp.js ├── 【微信】登录参数生成 │ ├── __init__.py │ ├── encryp.js │ └── make_pwd.py ├── 【房价】房价获取 │ ├── README.md │ ├── __pycache__ │ │ └── util.cpython-37.pyc │ ├── main.py │ └── util.py ├── 【房天下】自动登录 │ ├── encryp.js │ ├── login.py │ └── ok.png ├── 【新浪微博】密码解密 │ ├── execute.js │ └── main.py ├── 【时光网】登陆参数生成 │ ├── encryp.js │ └── login.py ├── 【易通贷】自动登录 │ ├── __init__.py │ ├── auto_login.py │ └── encryp.js ├── 【汽车之家】参数解密 │ ├── execute.js │ └── main.py ├── 【满级网】自动登录 │ ├── auto_login.py │ └── encryp.js ├── 【百度】wap端sig生成 │ ├── make_sig.py │ └── v3_update.js ├── 【百度】网页找回密码 │ ├── __pycache__ │ │ └── header.cpython-37.pyc │ ├── demo.py │ ├── dv.js │ ├── encryp.js │ ├── header.py │ └── 验证码.png ├── 【百度】翻译 │ ├── __init__.py │ ├── translate.js │ └── translation.py ├── 【百度】自动登录 │ ├── README.md │ ├── encryp.js │ └── login.py ├── 【百度街拍】图片下载 │ └── get_image.py ├── 【移动】登录参数生成 │ ├── MakeParam.py │ ├── __init__.py │ ├── encryp.js │ └── make_params.png ├── 【空中网】自动登录 │ ├── __init__.py │ ├── encryp.js │ └── spider_login.py ├── 【美团】数据解析、token生成 │ ├── README.md │ ├── __init__.py │ ├── create_food_token.py │ ├── get_login_cookies.py │ ├── parse_food_comments.py │ ├── parse_food_info.py │ ├── parse_hotel_comments.py │ ├── parse_hotel_info.py │ ├── parse_play_areas.py │ └── parse_play_info.py ├── 【试客联盟】登录 │ ├── execute.js │ └── login.py ├── 【谷雨】数字解密 │ └── GuYu.py ├── 【豆瓣】自动登录 │ └── DouBan.py ├── 【逗游】自动登录 │ ├── __init__.py │ ├── douyou.py │ └── encryp.js ├── 【金逸电影】自动注册 │ ├── __init__.py │ ├── encryp.js │ ├── register.png │ └── register.py ├── 【青海移动】登陆参数生成 │ ├── __init__.py │ ├── encryp.js │ └── make_param.py └── 【餐饮】查询信息 │ ├── FoodInfo.py │ ├── __init__.py │ └── t.html ├── 
原创爬虫工具 ├── Cookies │ ├── MeiTuan │ │ ├── __init__.py │ │ ├── config.py │ │ ├── db.py │ │ ├── generator.py │ │ └── 账号.txt │ ├── README.md │ └── __init__.py ├── DataMigration │ ├── README.md │ ├── __init__.py │ ├── config.py │ ├── db │ │ ├── MongoDB.py │ │ ├── Mysql.py │ │ └── __init__.py │ └── migration │ │ ├── __init__.py │ │ ├── mongo_to_mysql.py │ │ └── mysql_to_mongo.py ├── Decode │ ├── README.md │ ├── __init__.py │ └── translation.py ├── Jsencrypt │ ├── __init__.py │ └── make_encrypt.py ├── OSS │ ├── __init__.py │ └── push_to_oss.py ├── Proxy │ ├── KDLProxyPool.py │ ├── README.md │ ├── XDLProxyPool.py │ ├── XDLProxyUseDemo.py │ ├── ZhiMaProxyPool.py │ └── ZhiMaProxyUseDemo.py ├── README.md ├── Register │ ├── MessageCode.py │ ├── README.md │ └── __init__.py └── zok │ ├── README.md │ ├── __init__.py │ ├── get_db │ ├── __init__.py │ ├── from_mongodb.py │ └── from_mysql.py │ ├── proxies │ ├── __init__.py │ └── proxies.py │ ├── random_UA │ ├── __init__.py │ ├── fake_useragent.json │ └── ua_random.py │ ├── repetition │ ├── __init__.py │ └── update_cache.py │ ├── save │ ├── __init__.py │ └── to_mysql.py │ └── zok_config.py ├── 滑动验证码 ├── 【w3c】滑块验证 │ ├── __init__.py │ ├── bg.png │ ├── chache.png │ ├── hk.png │ ├── img │ │ ├── 0.png │ │ ├── 1.png │ │ ├── 2.png │ │ └── 3.png │ └── w3c.py └── 【腾讯】滑块验证 │ ├── bg.jpeg │ ├── discriminate.py │ └── sel.py └── 项目 ├── HouseScrapy ├── requirements ├── scrapy.cfg ├── settings.py ├── spiders │ ├── __init__.py │ └── house.py └── toolkits │ ├── __init__.py │ ├── fake_useragent.json │ ├── items.py │ ├── make_ua.py │ ├── middlewares.py │ ├── pipelines.py │ └── proxies.py ├── HouseSpider ├── README.md ├── config.py ├── db │ └── __init__.py ├── main.py └── tool │ ├── __init__.py │ ├── parse.py │ ├── proxy.py │ └── toolkit.py ├── MeiTuanArea ├── MeiTuanArea │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── area_coord.py │ │ └── areas.py ├── README.md ├── __init__.py ├── scrapy.cfg └── 初始化.sql └── README.md /.gitattributes: -------------------------------------------------------------------------------- 1 | *.js linguist-language=python -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | */.DS_Store 3 | /.idea 4 | */.idea 5 | .vscode 6 | /.vscode 7 | */.vscode 8 | /__pycache__ 9 | */__pycache__ 10 | 11 | .README.md -------------------------------------------------------------------------------- /【51Job】查岗位/select_job.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-04-15 Python: 3.7 4 | 5 | import requests 6 | from lxml import etree 7 | 8 | Format_str = 'https://search.51job.com/list/000000,000000,0000,00,9,99,{key},2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=' 9 | Headers = { 10 | 'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 11 | } 12 | 13 | 14 | class GetJob(object): 15 | 16 | def __init__(self, job_name): 17 | self.job = job_name 18 | 19 | self.get_info() 20 | 21 | def get_info(self): 22 | target_url = Format_str.format(key=self.job) 23 | response = 
requests.get(target_url, headers=Headers) 24 | # 编码转换 25 | response.encoding = response.apparent_encoding 26 | root = etree.HTML(response.text) 27 | self.parse(root) 28 | 29 | @staticmethod 30 | def parse(root): 31 | div_list = root.xpath("//div[@class='dw_table']/div[@class='el']") 32 | for div in div_list: 33 | money = div.xpath("span[@class='t4']/text()") 34 | money = money[0] if money else "面议" 35 | # 工作名称不可能为空,所以不用判断 36 | a = div.xpath("p/span/a")[0] 37 | job_name = a.xpath("text()")[0].strip() 38 | job_href = a.xpath("@href")[0] 39 | date_time = div.xpath("span[@class='t5']/text()") 40 | date_time = date_time[0] if date_time else "没有时间" 41 | print(job_name, money, date_time, job_href) 42 | with open('job.csv', 'a', encoding='gb18030') as f: 43 | job_list = [job_name, date_time, money, job_href, '\n'] 44 | f.write(','.join(job_list)) 45 | 46 | 47 | if __name__ == "__main__": 48 | key = input("请输入关键词") 49 | GetJob(key) 50 | -------------------------------------------------------------------------------- /【bilibili】自动登录/README.md: -------------------------------------------------------------------------------- 1 | ## B站自动登录 2 | 3 | 本案例根据 `selenium` 实现。 4 | 5 | ## 效果图 6 | 7 | ![image](https://csrftoken.oss-cn-beijing.aliyuncs.com/github/blibili-login-report.png) 8 | 9 | ## Q&A 10 | 11 | > ChromeDriver - WebDriver for Chrome 12 | 13 | ``` 14 | 因为是模拟点击,所以需要下载插件。 15 | 16 | 点击下方链接即可跳转至下载界面。 17 | ``` 18 | 19 | > 为什么要模拟滑动多次? 20 | 21 | ``` 22 | 因为获取滑块的偏移量,在模拟操作的时候,机器在控制滑动速度的时候比较均匀,可能会被判定为机器。 23 | 24 | 当然了,我们会在今后给予更好的滑动支持~ 25 | ``` 26 | 27 | [下载ChromeDriver](https://chromedriver.chromium.org/downloads) 28 | 29 | ## Support 30 | 31 | ``` 32 | 案例于 2020-04-23 前均可用,如有疑问请联系作者。 33 | ``` 34 | 35 | ## Donate 36 | 37 | Thanks ~ 38 | -------------------------------------------------------------------------------- /【bilibili】自动登录/__init__.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Date: 2020/4/23 4 | -------------------------------------------------------------------------------- /【bilibili】视频下载/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-05-09 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /【双色球】头奖分布/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-11-08 Python: 3.7 4 | import requests 5 | import json 6 | import pandas as pd 7 | import openpyxl 8 | import jieba 9 | import wordcloud 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | class SSQ: 14 | def __init__(self, file, font): 15 | self.header = { 16 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36', 17 | 'Host': 'www.cwl.gov.cn', 18 | 'Referer': 'http://www.cwl.gov.cn/kjxx/ssq/kjgg/' 19 | } 20 | self.file = file 21 | self.font = font 22 | self.get_history_url = 'http://www.cwl.gov.cn/cwl_admin/kjxx/findDrawNotice?name=ssq&issueCount=100' 23 | self.session = requests.session() 24 | 25 | def history(self): 26 | """爬取最近100期""" 27 | _dict = None 28 | try: 29 | self.session.get('http://www.cwl.gov.cn/kjxx/ssq/kjgg/') 30 | _dict = json.loads(self.session.get(self.get_history_url, headers=self.header).text) 31 | except TypeError: 32 | print('获取历史记录失败') 33 | finally: 34 | return _dict 35 | 36 | def clean_data(self, data): 37 | """ 38 | 清洗数据 39 | :return: 40 | """ 41 | columns = [] 42 | 43 | for item in data.get('result'): 44 | columns.append([ 45 | item.get('code'), 46 | item.get('date'), 47 | item.get('week'), 48 | item.get('red').split(','), 49 | item.get('blue'), 50 | item.get('sales'), 51 | item.get('poolmoney'), 52 | item.get('content'), 53 | item.get('prizegrades')[0].get('typemoney'), 54 | item.get('prizegrades')[0].get('typenum'), 55 | item.get('prizegrades')[1].get('typemoney'), 56 | item.get('prizegrades')[1].get('typenum'), 57 | item.get('prizegrades')[2].get('typemoney'), 58 | item.get('prizegrades')[2].get('typenum'), 59 | ]) 60 | 61 | df = pd.DataFrame( 62 | columns, 63 | columns=["期数", "开奖日期", "星期数", "红球", "蓝球", "销售金额", "奖池", "中奖地区", "一等奖金", "一等奖人数", "二等奖金", "二等奖人数", "三等奖金", "三等奖人数"], # 指定列 64 | ) 65 | self.save(df) 66 | self.set_data(df) 67 | 68 | def save(self, df): 69 | """储存 70 | """ 71 | df.to_excel(self.file) 72 | 73 | def set_data(self, df): 74 | """ 75 | 数据预处理 76 | :return: 77 | """ 78 | cut_text = [] 79 | for i in df['中奖地区']: 80 | for addr in i.split(',')[:-1]: 81 | name, num = jieba.cut(addr[:-1]) 82 | for n in range(int(num)): 83 | cut_text.append(name) 84 | print(" ".join(cut_text)) 85 | 86 | w = wordcloud.WordCloud(font_path=self.font, background_color="white", scale=4) 87 | w.generate(" ".join(cut_text)) 88 | plt.imshow(w, interpolation="bilinear") 89 | plt.axis("off") 90 | # plt.show() 91 | # 保存生成的图片 92 | w.to_file('result.jpg') 93 | 94 | def parse_history(self): 95 | """ 96 | pandas 载入数据 97 | :return: 98 | """ 99 | data = self.history() 100 | self.clean_data(data) 101 | 102 | 103 | if __name__ == "__main__": 104 | """ 105 | 请自行准备一个字体文件并导入路径 106 | """ 107 | ssq = SSQ('近期记录.xlsx', '你自己准备的字库路径') 108 | ssq.parse_history() 109 | -------------------------------------------------------------------------------- /【双色球】头奖分布/result.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/【双色球】头奖分布/result.jpg -------------------------------------------------------------------------------- /【双色球】头奖分布/近期记录.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/【双色球】头奖分布/近期记录.xlsx -------------------------------------------------------------------------------- /【壁纸】美女壁纸下载器/bg_down.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-11-06 Python: 3.7 4 | 5 | from requests import get 6 | from filetype import guess 7 | from os import rename 8 | from os import makedirs 9 | from os.path import exists 10 | from json import loads 11 | from contextlib import closing 12 | 13 | 14 | class DownBg: 15 | """ 16 | 超级高清图片下载 17 | """ 18 | def __init__(self): 19 | self.headers = { 20 | "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36" 21 | } 22 | 23 | def down_load(self, file_url, file_full_name, now_photo_count, all_photo_count): 24 | 25 | # 开始下载图片 26 | with closing(get(file_url, headers=self.headers, stream=True)) as response: 27 | chunk_size = 1024 # 单次请求最大值 28 | content_size = int(response.headers['content-length']) # 文件总大小 29 | data_count = 0 # 当前已传输的大小 30 | with open(file_full_name, "wb") as file: 31 | for data in response.iter_content(chunk_size=chunk_size): 32 | file.write(data) 33 | done_block = int((data_count / content_size) * 50) 34 | data_count = data_count + len(data) 35 | now_jd = (data_count / content_size) * 100 36 | print("\r %s:[%s%s] %d%% %d/%d" % ( 37 | file_full_name, done_block * '█', ' ' * (50 - 1 - done_block), now_jd, now_photo_count, 38 | all_photo_count), end=" ") 39 | # 下载完图片后获取图片扩展名,并为其增加扩展名 40 | file_type = guess(file_full_name) 41 | rename(file_full_name, file_full_name + '.' 
+ file_type.extension) 42 | 43 | def crawler_photo(self, type_id, photo_count): 44 | """ 45 | :param type_id: 最新 1, 最热 2, 女生 3, 星空 4 46 | :param photo_count: 下载数量 47 | :return: 48 | """ 49 | type_dict = { 50 | '1': '5c68ffb9463b7fbfe72b0db0', 51 | '2': '5c69251c9b1c011c41bb97be', 52 | '3': '5c81087e6aee28c541eefc26', 53 | '4': '5c81f64c96fad8fe211f5367' 54 | } 55 | 56 | url = 'https://service.paper.meiyuan.in/api/v2/columns/flow/{key}?page=1&per_page='.format( 57 | key=type_dict.get(str(type_id))) + str(photo_count) 58 | 59 | # 获取图片列表数据 60 | respond = get(url, headers=self.headers) 61 | photo_data = loads(respond.content) 62 | 63 | # 已经下载的图片张数 64 | now_photo_count = 1 65 | 66 | # 所有图片张数 67 | all_photo_count = len(photo_data) 68 | 69 | # 开始下载并保存5K分辨率壁纸 70 | for photo in photo_data: 71 | 72 | # 创建一个文件夹存放我们下载的图片 73 | if not exists('./' + str(type_id)): 74 | makedirs('./' + str(type_id)) 75 | 76 | # 准备下载的图片链接 77 | file_url = photo['urls']['raw'] 78 | 79 | # 准备下载的图片名称,不包含扩展名 80 | file_name_only = file_url.split('/') 81 | file_name_only = file_name_only[len(file_name_only) - 1] 82 | 83 | # 准备保存到本地的完整路径 84 | file_full_name = './' + str(type_id) + '/' + file_name_only 85 | 86 | # 开始下载图片 87 | self.down_load(file_url, file_full_name, now_photo_count, all_photo_count) 88 | now_photo_count = now_photo_count + 1 89 | 90 | 91 | if __name__ == '__main__': 92 | dg = DownBg() 93 | 94 | wall_paper_id = 1 95 | wall_paper_count = 10 96 | while True: 97 | wall_paper_id = input("\n\n壁纸类型:最新壁纸 1, 最热壁纸 2, 女生壁纸 3, 星空壁纸 4\n请输入编号以便选择5K超清壁纸类型:") 98 | wall_paper_count = input("请输入要下载的5K超清壁纸的数量:") 99 | 100 | if wall_paper_id not in ['1', '2', '3', '4'] or not wall_paper_count.isdigit(): 101 | print('输入有误') 102 | continue 103 | 104 | print("正在下载5K超清壁纸,请稍等……") 105 | dg.crawler_photo(int(wall_paper_id), int(wall_paper_count)) 106 | print('\n下载5K高清壁纸成功!') 107 | -------------------------------------------------------------------------------- /【大众点评】字体反爬、坐标反爬/参数生成/encryp.js: -------------------------------------------------------------------------------- 1 | function make() { 2 | for (var t = 1 * new Date, n = 0; t === 1 * new Date && n < 200;) n++; 3 | return t.toString(16) + n.toString(16) 4 | } 5 | 6 | function test(love, you, babby) { 7 | var t = (you * babby).toString(16); 8 | return make() + "-" + Math.random().toString(16).replace(".", "") + "-" + function () { 9 | var t = love, 10 | n = void 0, 11 | e = void 0, 12 | i = [], 13 | r = 0; 14 | 15 | function o(t, n) { 16 | var e = void 0, 17 | r = 0; 18 | for (e = 0; e < n.length; e++) r |= i[e] << 8 * e; 19 | return t ^ r 20 | } 21 | 22 | for (n = 0; n < t.length; n++) e = t.charCodeAt(n), i.unshift(255 & e), 4 <= i.length && (r = o(r, i), i = []); 23 | return 0 < i.length && (r = o(r, i)), r.toString(16) 24 | }() + "-" + t + "-" + make() 25 | } 26 | 27 | function now_uu() { 28 | return (65536 * (1 + Math.random()) | 0).toString(16).substring(1) 29 | } 30 | function puid() { 31 | return "owl-" +now_uu() + now_uu() + "-" + now_uu() + "-" + now_uu() + "-" + now_uu() + "-" + now_uu() + now_uu() + now_uu() 32 | } -------------------------------------------------------------------------------- /【大众点评】字体反爬、坐标反爬/参数生成/uid.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-11-15 Python: 3.7 4 | import execjs.runtime_names 5 | import random 6 | import requests 7 | import time 8 | from faker import Faker 9 | 10 | 11 | info = random.choice([[800, 1024], [900, 
1440], [1050, 1680], [1200, 1920], [1200, 1600]]) 12 | 13 | with open("encryp.js", "r", encoding="utf-8") as f: 14 | js = execjs.compile(f.read()) 15 | 16 | print('引擎', execjs.get().name) 17 | uid = js.call('test', Faker().user_agent(), info[0], info[1]) 18 | page_id = js.call('puid') 19 | 20 | 21 | headers = { 22 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36', 23 | 'Host': 'catfront.dianping.com', 24 | 'Referer': 'http://www.dianping.com/shop/97789651', 25 | 'Origin': 'http://www.dianping.com', 26 | } 27 | 28 | headers2 = { 29 | 'Cookie': "_lxsdk_cuid=16e8184bc7cc8-00733806cb0caf-d087704-13c680-16e8184bc7cc8;", 30 | 'Referer': 'http://www.dianping.com/shop/76311084', 31 | 'Host': 'www.dianping.com', 32 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36', 33 | } 34 | sign_url = 'http://catfront.dianping.com/api/pv?v=1&sdk=1.8.13&project=app-pc-main-shop&pageurl=main-shop&pageId={pageId}×tamp={timestamp}®ion=&operator=&network=&container=&os=&unionid={unionid}' 35 | session = requests.session() 36 | session.get('http://www.dianping.com/shop/76311084', headers=headers2) 37 | response = session.post(sign_url.format(pageId=page_id, unionid=uid, timestamp=str(int(round(time.time() * 1000)))), headers=headers) 38 | print(uid, page_id) 39 | print(response) 40 | 41 | 42 | -------------------------------------------------------------------------------- /【大众点评】字体反爬、坐标反爬/旧版/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-12 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /【大众点评】字体反爬、坐标反爬/旧版/parse_address_poi.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-03-27 Python: 3.7 4 | 5 | 6 | def to_base36(value): 7 | """converts a decimal integer to a 36 decimal string""" 8 | if not isinstance(value, int): 9 | raise TypeError("expected int, got %s: %r" % (value.__class__.__name__, value)) 10 | 11 | if value == 0: 12 | return "0" 13 | 14 | if value < 0: 15 | sign = "-" 16 | value = -value 17 | else: 18 | sign = "" 19 | 20 | result = [] 21 | 22 | while value: 23 | value, mod = divmod(value, 36) 24 | result.append("0123456789abcdefghijklmnopqrstuvwxyz"[mod]) 25 | 26 | return sign + "".join(reversed(result)) 27 | 28 | 29 | def decode(C): 30 | """parse poi""" 31 | digi = 16 32 | add = 10 33 | plus = 7 34 | cha = 36 35 | I = -1 36 | H = 0 37 | B = '' 38 | J = len(C) 39 | G = ord(C[-1]) 40 | C = C[:-1] 41 | J -= 1 42 | 43 | for E in range(J): 44 | D = int(C[E], cha) - add 45 | if D >= add: 46 | D = D - plus 47 | B += to_base36(D) 48 | if D > H: 49 | I = E 50 | H = D 51 | 52 | A = int(B[:I], digi) 53 | F = int(B[I + 1:], digi) 54 | L = (A + F - int(G)) / 2 55 | K = float(F - L) / 100000 56 | L = float(L) / 100000 57 | return {'lng': L, 'lat': K} 58 | 59 | 60 | if __name__ == '__main__': 61 | print(decode('HFHSGGZTWSATFG')) 62 | -------------------------------------------------------------------------------- /【大众点评】字体反爬、坐标反爬/最新版7月/README.md: -------------------------------------------------------------------------------- 1 | # 仅限学术交流 2 | # 如有冒犯请立即联系作者删除 3 | 4 | # 安装 5 | **`pip3 install fontTools`** 6 | 7 | **`pip3 install requests`** 
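（补充示例）A minimal, non-authoritative sketch of what `fontTools` is used for in this case: opening a captured anti-crawler `.woff` and dumping its character-to-glyph mapping. The file name `font.woff` is a placeholder, not a file shipped in this folder:

```python
from fontTools.ttLib import TTFont

font = TTFont('font.woff')         # placeholder: any .woff captured from the target site
cmap = font['cmap'].getBestCmap()  # char code -> glyph name, e.g. {0xe7df: 'uniE7DF'}
print(cmap)
```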
8 | 9 | **`pip3 install redis`** 10 | 11 | 12 | # 使用 13 | 1. 需要开启 redis 库 并配置,默认链接的本机 redis 14 | 2. 参考 `main.py` 中的调用代码 15 | 16 | **[参考博客链接](https://www.zhangkunzhi.com/archives/72)** -------------------------------------------------------------------------------- /【天眼查】字体加密/num.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/【天眼查】字体加密/num.woff -------------------------------------------------------------------------------- /【天眼查】字体加密/tyc.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-12-06 Python: 3.7 4 | 5 | """ 6 | 从网页下载一个字体文件获取对应推导式,动态获取请自行拓展 7 | """ 8 | 9 | from fontTools.ttLib import TTFont 10 | import re 11 | 12 | font = TTFont('num.woff') # 打开tyc-num.woff 13 | font.saveXML('tyc-num.xml') # 保存为tyc-num.xml 14 | with open('tyc-num.xml', 'r') as f: 15 | xml = f.read() # 读取tyc-num.xml赋值给xml 16 | GlyphID = re.findall(r'<GlyphID id="(\d+)" name="(\d+)"/>', xml) # 获得对应关系 17 | print(GlyphID) 18 | GlyphIDNameLists = list(set([int(Gname) for Gid, Gname in GlyphID])) # 对应关系数量转换 19 | print(GlyphIDNameLists) 20 | DigitalDicts = {str(i): str(GlyphIDNameLists[i - 2]) for i in range(2, len(GlyphIDNameLists)+2)} # 数字对应关系的字典推导式 21 | print(DigitalDicts) 22 | GlyphIDDicts = {str(Gname): DigitalDicts[Gid] for Gid, Gname in GlyphID} # 通过数字对应关系生成源代码跟页面显示的字典推导式 23 | print('-' * 39 + '数字对应关系的字典推导式' + '-' * 39) 24 | print(DigitalDicts) 25 | print('-' * 27 + '通过数字对应关系生成源代码跟页面显示的字典推导式' + '-' * 27) 26 | print(GlyphIDDicts) 27 | -------------------------------------------------------------------------------- /【抖音】无水印视频解析/README.md: -------------------------------------------------------------------------------- 1 | 这是一份抖音无水印单个视频的解析代码 2 | 3 | -------------------------------------------------------------------------------- /【抖音】无水印视频解析/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | # Auth: Zok Email: 362416272@qq.com 3 | # Date: 2020/3/6 4 | 5 | -------------------------------------------------------------------------------- /【抖音】无水印视频解析/parse.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | # Auth: Zok Email: 362416272@qq.com 3 | # Date: 2020/3/6 4 | 5 | import re 6 | import requests 7 | import json 8 | 9 | 10 | class ParseVideo: 11 | 12 | def __init__(self, share): 13 | path = self.get_url(share) 14 | self.url = 'https://v.douyin.com/' + path + '/' 15 | self.headers = { 16 | 'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1', 17 | } 18 | self.session = requests.session() 19 | self.first_url = None 20 | 21 | @staticmethod 22 | def get_url(share_url): 23 | return re.search(r'https://v\.douyin\.com/(.*?)/', share_url).group(1) 24 | 25 | def go_location(self): 26 | response = self.session.get(self.url, headers=self.headers) 27 | self.first_url = response.url 28 | result = re.search(r'itemId: "(.*?)",[\s\S]*?uid: "(.*?)",[\s\S]*?authorName: "(.*?)",[\s\S]*?dytk: "(.*?)"', 29 | response.text) 30 | return result 31 | 32 | def go_message(self, ret): 33 | url = 'https://www.iesdouyin.com/web/api/v2/aweme/iteminfo/?item_ids=' + ret.group(1) + '&dytk=' + ret.group(4) 34 | response = self.session.get(url, headers=self.headers) 35 | json_data = 
json.loads(response.text) 36 | user_id = ret.group(2) 37 | user_name = ret.group(3).encode('utf-8').decode('unicode_escape') 38 | 39 | if json_data.get('status_code') != 0: 40 | print('解析失败') 41 | exit() 42 | item_list = json_data.get('item_list')[0] 43 | aweme_id = item_list.get('aweme_id') 44 | desc = item_list.get('desc') 45 | comment_count = item_list.get('statistics').get('comment_count') 46 | digg_count = item_list.get('statistics').get('digg_count') 47 | 48 | video = item_list.get('video') 49 | cover = video.get('origin_cover').get('url_list')[0] 50 | play_addr = video.get('play_addr_lowbr').get('url_list')[0] 51 | 52 | play_addr_response = self.session.get(play_addr, headers=self.headers, allow_redirects=False) 53 | msg = """ 54 | 用户id:{user_id} 55 | 用户名:{user_name} 56 | 作品id:{aweme_id} 57 | 标题: {desc} 58 | 评论数: {comment_count} 59 | 点赞数: {digg_count} 60 | 封面地址:{cover} 61 | 无水印视频:{addr} 62 | """.format( 63 | user_id=user_id, 64 | user_name=user_name, 65 | aweme_id=aweme_id, 66 | desc=desc, 67 | comment_count=comment_count, 68 | digg_count=digg_count, 69 | cover=cover, 70 | addr=play_addr_response.headers['location'] 71 | ) 72 | print(msg) 73 | 74 | def start(self): 75 | result = self.go_location() 76 | self.go_message(result) 77 | 78 | 79 | if __name__ == '__main__': 80 | # text = '#在抖音,记录美好生活#要逆天!北京地坛医院证实新冠病毒攻击中枢神经系统 https://v.douyin.com/tW7qrw/ 复制此链接,打开【抖音短视频】,直接观看视频!' 81 | text = input('请输入分享链接>>>') 82 | pv = ParseVideo(text) 83 | pv.start() 84 | -------------------------------------------------------------------------------- /【拼多多】登陆参数生成/PinDuoDuo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-07-23 Python: 3.7 4 | 5 | 6 | import execjs.runtime_names 7 | 8 | """ 9 | pip3 install execjs 10 | npm i jsdom -g 11 | """ 12 | 13 | 14 | class PingDuoDuoSpider(object): 15 | """ 16 | 拼多多加密解析 17 | """ 18 | 19 | def __init__(self, password): 20 | # 初始化 21 | print('引擎', execjs.get().name) 22 | self.password = password 23 | 24 | def make(self): 25 | with open("encryp.js", "r", encoding="utf-8") as f: 26 | ctx = execjs.compile(f.read()) 27 | 28 | ret = ctx.call("test", self.password) 29 | print(ret) 30 | 31 | 32 | if __name__ == '__main__': 33 | key = input("输入字符串") 34 | pdd = PingDuoDuoSpider(key) 35 | pdd.make() 36 | 37 | -------------------------------------------------------------------------------- /【拼多多】登陆参数生成/README.md: -------------------------------------------------------------------------------- 1 | # 解密过程参考博客 2 | 3 | [博客链接](https://www.zhangkunzhi.com/archives/67) -------------------------------------------------------------------------------- /【拼多多】登陆参数生成/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-07-23 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /【淘宝】自动登陆/login_for_sina.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-04-11 Python: 3.7 4 | 5 | from selenium import webdriver 6 | from selenium.webdriver.common.by import By 7 | from selenium.webdriver.support.ui import WebDriverWait 8 | from selenium.webdriver.support import expected_conditions as EC 9 | 10 | 11 | class TB_Spider: 12 | 13 | def __init__(self, username, password): 14 | """初始化参数""" 
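# Notes on the setup below:
# - the "prefs" entry sets profile.managed_default_content_settings.images to 2,
#   which blocks image loading and speeds up page rendering;
# - excluding the 'enable-automation' switch hides Chrome's "is being controlled by
#   automated test software" banner, making the session less obviously Selenium-driven;
# - executable_path='./chromedriver' expects a matching ChromeDriver binary in this directory.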
15 | url = 'https://login.taobao.com/member/login.jhtml' 16 | self.url = url 17 | 18 | options = webdriver.ChromeOptions() 19 | # 不加载图片,加快访问速度 20 | options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2}) 21 | # 设置为开发者模式,避免被识别 22 | options.add_experimental_option('excludeSwitches', 23 | ['enable-automation']) 24 | self.browser = webdriver.Chrome(executable_path='./chromedriver', options=options) 25 | self.wait = WebDriverWait(self.browser, 40) 26 | # 初始化用户名 27 | self.username = username 28 | # 初始化密码 29 | self.password = password 30 | 31 | def run(self): 32 | """登陆接口""" 33 | self.browser.get(self.url) 34 | try: 35 | # 这里设置等待:等待输入框 36 | login_element = self.wait.until( 37 | EC.presence_of_element_located((By.CSS_SELECTOR, '.qrcode-login > .login-links > .forget-pwd'))) 38 | login_element.click() 39 | 40 | sina_login = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.weibo-login'))) 41 | sina_login.click() 42 | 43 | weibo_user = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.username > .W_input'))) 44 | weibo_user.send_keys(self.username) 45 | 46 | sina_password = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.password > .W_input'))) 47 | sina_password.send_keys(self.password) 48 | 49 | submit = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.btn_tip > a > span'))) 50 | submit.click() 51 | 52 | taobao_name = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 53 | '.site-nav-bd > ul.site-nav-bd-l > li#J_SiteNavLogin > div.site-nav-menu-hd > div.site-nav-user > a.site-nav-login-info-nick '))) 54 | # 登陆成功打印提示信息 55 | print("登陆成功:%s" % taobao_name.text) 56 | except Exception: 57 | self.browser.close() 58 | print("登陆失败") 59 | 60 | 61 | if __name__ == "__main__": 62 | name = input("请输入你的微博用户名:") 63 | pas = input("请输入密码:") 64 | spider = TB_Spider(name, pas) 65 | spider.run() 66 | -------------------------------------------------------------------------------- /其他实战/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-11 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【5173网】自动登录/auto_login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-08-26 Python: 3.7 4 | 5 | import re 6 | import requests 7 | import execjs.runtime_names 8 | 9 | 10 | class YX(object): 11 | """ 12 | 易通贷自动登陆 13 | """ 14 | 15 | def __init__(self, user, pwd): 16 | self.user = user 17 | self.pwd = pwd 18 | self.session = requests.session() 19 | self.url = 'https://passport.5173.com/' 20 | self.headers = { 21 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36', 22 | 'Host': 'passport.5173.com', 23 | } 24 | print('引擎', execjs.get().name) 25 | 26 | def make_pwd(self, key): 27 | with open("encryp.js", "r", encoding="utf-8") as f: 28 | ctx = execjs.compile(f.read()) 29 | return ctx.call("make_js", self.pwd, key) 30 | 31 | def make_data(self, token, key): 32 | data = { 33 | 'userName': self.user, 34 | 'password': self.make_pwd(key), 35 | 'mobileNo': '', 36 | 'captcha': '', 37 | 'smsCaptcha': '', 38 | 'category': '', 39 | 'passpod': '', 40 | 'smsLogin': '0', 41 | '__validationToken__': token, 42 | '__validationDna__': 
'', 43 | } 44 | return data 45 | 46 | def login(self): 47 | """start 48 | """ 49 | response = self.session.get(self.url) 50 | info = re.search(r'SecurityToken:"(.*?)",[\s\S]*?PasswordKey:"(.*?)",', response.text) 51 | try: 52 | token = info.group(1) 53 | key = info.group(2) 54 | data = self.make_data(token, key) 55 | result = self.session.post(self.url, data=data, headers=self.headers) 56 | if '5173auth' in str(result.cookies): 57 | print(result.cookies) 58 | print('【登陆成功】') 59 | else: 60 | print('【登陆失败】') 61 | except AttributeError: 62 | print('【获取key失败】') 63 | 64 | 65 | if __name__ == '__main__': 66 | username = input('请输入账号') 67 | password = input('密码') 68 | yx = YX(username, password) 69 | yx.login() 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /其他实战/【5173网】自动登录/logOK.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/其他实战/【5173网】自动登录/logOK.png -------------------------------------------------------------------------------- /其他实战/【9377网】自动登录/9377login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-07-23 Python: 3.7 4 | 5 | import requests 6 | 7 | 8 | class Login9377: 9 | """9377游戏平台自动登陆 10 | """ 11 | 12 | def __init__(self, username, password): 13 | self.headers = { 14 | 'Upgrade-Insecure-Requests': '1', 15 | 'Host': 'wvw.9377.com', 16 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36' 17 | } 18 | if len(password) < 6 or len(username) < 6: 19 | print('请输入正确账号密码!') 20 | exit() 21 | self.username = username 22 | self.password = password 23 | self.login_url = 'http://wvw.9377.com/login.php' 24 | self.host = 'https://www.9377.com/' 25 | self.session = requests.session() 26 | 27 | def login(self): 28 | """登陆 29 | """ 30 | data = { 31 | 'do': 'login', 32 | 'gourl': self.host, 33 | 'login_save': '1', 34 | 'username': self.username, 35 | 'password': self.password 36 | } 37 | self.session.get(self.login_url, headers=self.headers) 38 | result = self.session.post(self.login_url, headers=self.headers, data=data) 39 | self.check(result) 40 | 41 | def check(self, result): 42 | """检测登陆状态 43 | """ 44 | if self.username in str(result.cookies): 45 | print('登陆成功') 46 | else: 47 | print('用户名或密码错误') 48 | 49 | 50 | if __name__ == '__main__': 51 | name = input('输入账号') 52 | word = input('输入密码') 53 | lg = Login9377(name, word) 54 | lg.login() 55 | -------------------------------------------------------------------------------- /其他实战/【9377网】自动登录/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-07-24 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【DNS】自动登录/Login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-08 Python: 3.7 4 | import requests 5 | import re 6 | import execjs.runtime_names 7 | 8 | 9 | class DNS: 10 | def __init__(self, user, pwd): 11 | self.user = user 12 | self.pwd = pwd 13 | self.js = None 14 | self.url = 'https://www.dns.com/login.html' 15 | self.headers = { 16 | 'User-Agent': 
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36', 17 | 'Host': 'www.dns.com' 18 | } 19 | self.read_js() 20 | 21 | def get_token(self): 22 | response = requests.get(self.url, headers=self.headers) 23 | try: 24 | token = re.search(r'name="csrf-token" content="(.*?)">', response.text).group(1) 25 | return token 26 | except AttributeError: 27 | print('token 捕获失败') 28 | 29 | def read_js(self): 30 | with open("dns.js", "r", encoding="utf-8") as f: 31 | self.js = execjs.compile(f.read()) 32 | 33 | def login(self): 34 | data = { 35 | '_token': self.get_token(), 36 | 'password': self.js.call('aes', self.pwd), 37 | 'email': self.js.call('aes', self.user), 38 | 'redirectTo': 'https://www.dns.com/dashboard', 39 | } 40 | response = requests.post(self.url, data=data, headers=self.headers) 41 | print(response) 42 | 43 | 44 | if __name__ == '__main__': 45 | username = input('请输入账号') 46 | password = input('密码') 47 | dns = DNS(username, password) 48 | dns.login() 49 | -------------------------------------------------------------------------------- /其他实战/【DNS】自动登录/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-06 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【GitHub】自动登录/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-06-18 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【GitHub】自动登录/login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-04-11 Python: 3.7 4 | """ 5 | 1. get login html token 6 | 2. 
login 7 | """ 8 | 9 | import requests 10 | from lxml import etree 11 | 12 | 13 | class Login(object): 14 | def __init__(self, username, password): 15 | 16 | self.headers = { 17 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', 18 | 'Referer': 'https://github.com/', 19 | 'Host': 'github.com' 20 | } 21 | 22 | self.login_url = 'https://github.com/login' 23 | self.post_url = 'https://github.com/session' 24 | self.session = requests.Session() 25 | 26 | self.username = username 27 | self.password = password 28 | 29 | def login_GitHub(self): 30 | """ 31 | 模拟登陆 32 | :return: 33 | """ 34 | 35 | post_data = { 36 | 'commit': 'Sign in', 37 | 'utf8': '✓', 38 | 'authenticity_token': self.get_token(), 39 | 'login': self.username, 40 | 'password': self.password 41 | } 42 | 43 | response = self.session.post(self.post_url, data=post_data, headers=self.headers) 44 | 45 | if response.status_code == 200: 46 | html = etree.HTML(response.content.decode()) 47 | if html.xpath('/html/body/div[1]/header/div[7]/details/summary'): 48 | pro_list = html.xpath('//ul[@class="list-style-none"]/li/div/a/span[2]/text()') 49 | print("登录成功!正在拉取你的所有项目..") 50 | print(pro_list) 51 | 52 | else: 53 | print('账号或密码错误') 54 | else: 55 | print("登录失败!") 56 | 57 | def get_token(self): 58 | """ 59 | 获取token 60 | :return: 61 | """ 62 | 63 | response = self.session.get(self.login_url, headers=self.headers) 64 | html = etree.HTML(response.content.decode()) 65 | 66 | token = html.xpath('//input[@name="authenticity_token"]/@value')[0] 67 | 68 | return token 69 | 70 | 71 | if __name__ == '__main__': 72 | user = input('请输入您的账号: ') 73 | key = input('请输入您的密码: ') 74 | 75 | login = Login(user, key) 76 | login.login_GitHub() 77 | -------------------------------------------------------------------------------- /其他实战/【Glidedsky】自动登陆/login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-25 Python: 3.7 4 | 5 | import requests 6 | import re 7 | import json 8 | 9 | 10 | class Gli: 11 | """ 12 | 自动登陆 Glidedsky 13 | http://www.glidedsky.com/login 14 | """ 15 | 16 | def __init__(self, user, pwd): 17 | self.user = user 18 | self.pwd = pwd 19 | self.url = 'http://www.glidedsky.com/login' 20 | self.session = requests.session() 21 | self.headers = { 22 | 'Host': 'www.glidedsky.com', 23 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36', 24 | } 25 | 26 | def get_token(self): 27 | response = self.session.get(self.url, headers=self.headers) 28 | _token = re.search(r'name="csrf-token" content="(.*?)">', response.text).group(1) 29 | return _token 30 | 31 | def login(self): 32 | data = {'_token': self.get_token(), 'email': self.user, 'password': self.pwd} 33 | self.session.post(self.url, data=data) 34 | # print(self.session.cookies) 35 | cookies = requests.utils.dict_from_cookiejar(self.session.cookies) # cookies 输出 36 | with open('toolkit/cookies.json', 'w', encoding='utf-8') as f: 37 | f.write(json.dumps(cookies)) 38 | # print(cookies) 39 | 40 | 41 | if __name__ == '__main__': 42 | username = input('请输入用户名') 43 | password = input('请输入密码') 44 | g = Gli(username, password) 45 | g.login() 46 | -------------------------------------------------------------------------------- /其他实战/【Python加密库】Demo/__init__.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-07-11 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【TCL金融】自动登录/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-06 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【TCL金融】自动登录/auto_login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-10 Python: 3.7 4 | 5 | import requests 6 | import execjs.runtime_names 7 | 8 | 9 | class SpiderLogin: 10 | """ 11 | TCL 个人金融 12 | https://weixin.tjinsuo.com/#login/mine 13 | """ 14 | 15 | def __init__(self, user, pwd): 16 | self.user = user 17 | self.pwd = pwd 18 | self.js = None 19 | self.url = 'https://weixin.tjinsuo.com/service/user/login' 20 | self.load_js() 21 | print('引擎', execjs.get().name) 22 | 23 | def load_js(self): 24 | """js 调用 25 | """ 26 | with open("encryp.js", "r", encoding="utf-8") as f: 27 | self.js = execjs.compile(f.read()) 28 | 29 | def auto_login(self): 30 | """登陆 31 | """ 32 | ret = self.js.call('make', self.pwd) 33 | rand_key, word = ret.split('||') 34 | print(rand_key, word) 35 | headers = { 36 | 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1', 37 | 'Host': 'weixin.tjinsuo.com', 38 | 'terminalType': 'BEST_WX', 39 | 'Accept': 'application/json' 40 | } 41 | data = 'mobile={user}&password={pwd}&cipherkey=&message=&randKey={rand_key}'.format(user=self.user, 42 | pwd=word, 43 | rand_key=rand_key) 44 | 45 | response = requests.post(self.url, headers=headers, data=data) 46 | print(response.text) 47 | print(response) 48 | 49 | 50 | if __name__ == '__main__': 51 | username = input('请输入账号') 52 | password = input('密码') 53 | wcb = SpiderLogin(username, password) 54 | wcb.auto_login() 55 | -------------------------------------------------------------------------------- /其他实战/【TCL金融】自动登录/ok.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/其他实战/【TCL金融】自动登录/ok.png -------------------------------------------------------------------------------- /其他实战/【steam】自动登录/login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-12-11 Python: 3.7 4 | 5 | import execjs 6 | import requests, json, re 7 | 8 | 9 | def Get_parameters(username): 10 | """steam 登录 只处理了密码加密。其他请自行拓展 11 | :return 公钥和一个参数; 12 | """ 13 | import time 14 | try: 15 | url = "https://store.steampowered.com/login/getrsakey/" 16 | 17 | headers = { 18 | 'User-Agent': 'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14', 19 | 'Host': 'store.steampowered.com', 20 | 'Referer': 'https://store.steampowered.com/login/?redir=&redir_ssl=1', 21 | 'Origin': 'https://store.steampowered.com' 22 | } 23 | data = { 24 | 'donotcache': int(round(time.time() * 1000)), 25 | 'username': username, 26 | } 27 | res = requests.post(url=url, headers=headers, data=data) 28 | publickey_mod = 
json.loads(res.text).get('publickey_mod') 29 | publickey_exp = json.loads(res.text).get('publickey_exp') 30 | return publickey_mod, publickey_exp 31 | 32 | except Exception as err: 33 | print('访问失败', err) 34 | 35 | 36 | def main(pwd, publickey_mod, publickey_exp): 37 | """ 38 | :param pwd: 39 | :param publickey_mod: 40 | :param publickey_exp: 41 | :return sign: 42 | """ 43 | with open('execute.js', 'r', encoding='utf-8') as f: 44 | js = execjs.compile(f.read()) 45 | print('引擎', execjs.get().name) 46 | sign = js.call('get_pwd', pwd, publickey_mod, publickey_exp) 47 | return sign 48 | 49 | 50 | if __name__ == '__main__': 51 | username = input('请输入账户:') 52 | pwd = input('请输入密码:') 53 | publickey_mod, publickey_exp = Get_parameters(username) 54 | sign = main(pwd, publickey_mod, publickey_exp) 55 | print(sign) 56 | -------------------------------------------------------------------------------- /其他实战/【万创帮】自动登录/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-06 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【万创帮】自动登录/login_ok.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/其他实战/【万创帮】自动登录/login_ok.png -------------------------------------------------------------------------------- /其他实战/【万创帮】自动登录/spider_login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-03 Python: 3.7 4 | 5 | import json 6 | import requests 7 | import execjs.runtime_names 8 | 9 | 10 | class SpiderLogin: 11 | """ 12 | 万创帮爬虫登陆 13 | """ 14 | 15 | def __init__(self, user, pwd): 16 | self.user = user 17 | self.pwd = pwd 18 | self.url = 'https://m.wcbchina.com/login/other-login.html' 19 | print('引擎', execjs.get().name) 20 | 21 | def use_js(self): 22 | """js 调用 23 | """ 24 | with open("encryp.js", "r", encoding="utf-8") as f: 25 | js = execjs.compile(f.read()) 26 | 27 | try: 28 | sign, t = js.call("make_sigin") 29 | pwd = js.call("make_pwd", self.pwd) 30 | return sign, t, pwd 31 | except Exception: 32 | print('异常数据') 33 | 34 | def auto_login(self): 35 | """登陆 36 | """ 37 | sign, t, pwd = self.use_js() 38 | headers = { 39 | 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1', 40 | 'Referer': 'https://m.wcbchina.com/login/other-login.html' 41 | } 42 | pay_load = { 43 | 'auth': {'sign': sign, 'timestamp': t}, 44 | 'password': self.user, 45 | 'username': pwd 46 | } 47 | 48 | response = requests.post(self.url, headers=headers, data=json.dumps(pay_load)) 49 | print(response.cookies) 50 | print(response) 51 | 52 | 53 | if __name__ == '__main__': 54 | username = input('请输入账号') 55 | password = input('密码') 56 | wcb = SpiderLogin(username, password) 57 | wcb.auto_login() 58 | -------------------------------------------------------------------------------- /其他实战/【中关村】自动登录/README.md: -------------------------------------------------------------------------------- 1 | # 解密过程博客说明 2 | 3 | https://www.zhangkunzhi.com/?p=135 -------------------------------------------------------------------------------- /其他实战/【中关村】自动登录/login.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-07-29 Python: 3.7 4 | 5 | 6 | import requests 7 | import hashlib 8 | import time 9 | import json 10 | 11 | from urllib import parse 12 | 13 | 14 | class ZGC: 15 | """ 16 | 解析过程说明 https://www.zhangkunzhi.com/?p=135 17 | 18 | 1. 用的 CryptoJS md5 加密 19 | 2. 需要带入 cookies 20 | """ 21 | 22 | def __init__(self, username, password): 23 | self.username = username 24 | self.password = password 25 | self.headers = { 26 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36', 27 | } 28 | 29 | def get_cookies(self): 30 | """取cookies 31 | """ 32 | _now = time.time() 33 | t = str(_now)[:7] 34 | _jsonp = int(round(_now * 1000)) 35 | pick = 'http://js.zol.com.cn/pvn/pv.ht?&t={t}&c=&callback=_jsonp{_jsonp}'.format(t=t, _jsonp=_jsonp) 36 | try: 37 | content = requests.get(pick, headers=self.headers).text 38 | ipck = json.loads(content[content.find('(')+1:-1]).get('ipck') 39 | return parse.quote(ipck) 40 | except: 41 | print('cookies 获取失败') 42 | 43 | def login(self, ipck): 44 | """登陆 45 | """ 46 | _str_now = str(int(time.time())) 47 | login_url = 'http://service.zol.com.cn/user/ajax/login2014/login.php' 48 | data = { 49 | 'userid': self.username, 50 | 'pwd': self.make_md5(self.password), 51 | 'is_auto': '1', 52 | 'backUrl': 'http://www.zol.com.cn/' 53 | } 54 | cookies = { 55 | 'Hm_lpvt_ae5edc2bc4fc71370807f6187f0a2dd0': _str_now, 56 | 'Hm_lvt_ae5edc2bc4fc71370807f6187f0a2dd0': _str_now, 57 | 'ip_ck': ipck, 58 | 'vn': '1', 59 | 'lv': _str_now, 60 | 'z_pro_city': 's_provice%3Dzhongqing%26s_city%3Dzhongqing', 61 | 'z_day': 'ixgo20%3D1' 62 | } 63 | 64 | response = requests.post(login_url, headers=self.headers, data=data, cookies=cookies) 65 | msg = json.loads(response.content) 66 | return msg 67 | 68 | @staticmethod 69 | def make_md5(_str): 70 | """md5 生成 71 | """ 72 | # 待加密信息 73 | text = _str + 'zol' 74 | # 创建md5对象 75 | m = hashlib.md5() 76 | m.update(text.encode(encoding='utf-8')) 77 | str_md5 = m.hexdigest() 78 | return str_md5 79 | 80 | def main(self): 81 | ipck = self.get_cookies() 82 | msg = self.login(ipck) 83 | print(msg) 84 | 85 | 86 | if __name__ == '__main__': 87 | user = input('请输入中关村账号') 88 | pwd = input('请输入中关村密码') 89 | zgc = ZGC(user, pwd) 90 | zgc.main() 91 | -------------------------------------------------------------------------------- /其他实战/【京东】商品数据爬取/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-12-10 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【京东】商品数据爬取/geckodriver: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/其他实战/【京东】商品数据爬取/geckodriver -------------------------------------------------------------------------------- /其他实战/【京东】商品数据爬取/selenium抓取.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-04-11 Python: 3.7 4 | 5 | from selenium import webdriver 6 | from selenium.webdriver.common.keys import Keys # 键盘按键操作 7 | import time 8 | 9 | 10 | def get_goods(driver): 11 | try: 12 | goods = 
driver.find_elements_by_class_name('gl-item') 13 | 14 | for good in goods: 15 | detail_url = good.find_element_by_tag_name('a').get_attribute('href') 16 | 17 | p_name = good.find_element_by_css_selector('.p-name em').text.replace('\n', '') 18 | price = good.find_element_by_css_selector('.p-price i').text 19 | p_commit = good.find_element_by_css_selector('.p-commit a').text 20 | 21 | msg = ''' 22 | 商品 : %s 23 | 链接 : %s 24 | 价钱 :%s 25 | 评论 :%s 26 | ''' % (p_name, detail_url, price, p_commit) 27 | 28 | print(msg, end='\n\n') 29 | 30 | button = driver.find_element_by_partial_link_text('下一页') 31 | button.click() 32 | time.sleep(1) 33 | get_goods(driver) 34 | except Exception: 35 | pass 36 | 37 | 38 | def spider(url, keyword): 39 | driver = webdriver.Firefox() 40 | driver.get(url) 41 | driver.implicitly_wait(3) # 使用隐式等待 42 | try: 43 | input_tag = driver.find_element_by_id('key') 44 | input_tag.send_keys(keyword) 45 | input_tag.send_keys(Keys.ENTER) 46 | get_goods(driver) 47 | finally: 48 | driver.close() 49 | 50 | 51 | if __name__ == '__main__': 52 | spider('https://www.jd.com/', keyword='手机') 53 | -------------------------------------------------------------------------------- /其他实战/【人人网】自动登录/login.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import re 4 | import execjs.runtime_names 5 | 6 | 7 | class People: 8 | def __init__(self, user, pwd): 9 | """ 10 | 初始化 11 | :param user: 用户名 12 | :param pwd: 密码 13 | """ 14 | self.username = user 15 | self.pwd = pwd 16 | self.ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36' 17 | self.headers = { 18 | 'User-Agent': self.ua, 19 | 'Host': 'www.renren.com', 20 | } 21 | self.session = requests.session() 22 | self.json_data = '' 23 | 24 | print('【JS引擎】', execjs.get().name) 25 | with open("enc.js", "r", encoding="utf-8") as f: 26 | self.js = execjs.compile(f.read()) 27 | 28 | def to_index(self): 29 | """ 30 | 第一步 - 访问首页 31 | 获取 Cookies 32 | :return: 33 | """ 34 | response = self.session.get('http://www.renren.com/', headers=self.headers) 35 | print('【主页】', response) 36 | 37 | def get_key(self): 38 | """ 39 | 第二步 - 获取加密参数 40 | 获取 rkey 以及 密码加密所需参数 41 | :return: 42 | """ 43 | headers = { 44 | 'Referer': 'http://login.renren.com/ajaxproxy.htm', 45 | 'User-Agent': self.ua, 46 | } 47 | response = self.session.get('http://login.renren.com/ajax/getEncryptKey', headers=headers) 48 | print('【获取key】', response.text) 49 | return response.text 50 | 51 | def login(self, key_info): 52 | """ 53 | 第三步 - 登录账号 54 | :param key_info: 第二步获取的参数 55 | :return: 56 | """ 57 | url = 'http://www.renren.com/ajaxLogin/login?1=1' + self.js.call('getTime') 58 | data = { 59 | 'email': self.username, 60 | 'icode': "", 61 | 'origURL': 'http://www.renren.com/home', 62 | 'domain': 'renren.com', 63 | 'key_id': '1', 64 | 'captcha_type': 'web_login', 65 | 'password': self.get_password(key_info), 66 | 'rkey': json.loads(key_info).get('rkey'), 67 | 'f': '' 68 | } 69 | print('【登录data】', data) 70 | print('【登录URL】', url) 71 | print('【Cookies】', self.session.cookies) 72 | response = self.session.post(url, data=data, headers=self.headers) 73 | print('【返回信息】', response.text) 74 | response = self.session.get('http://www.renren.com/home', headers=self.headers) 75 | print('【登录信息】', re.findall("<title>(.*?)</title>", response.text)) 76 | 77 | def get_password(self, key_info): 78 | """ 79 | 调用 js 代码生成参数 80 | :param key_info: 81 | :return: 82 | """ 83 | return self.js.call('enc', 
key_info, self.pwd) 84 | 85 | def start(self): 86 | """ 87 | 启动 88 | :return: 89 | """ 90 | self.to_index() 91 | self.login(self.get_key()) 92 | 93 | 94 | if __name__ == '__main__': 95 | """ 96 | 启动区域 97 | """ 98 | username = input('用户名>>> ') 99 | password = input('密码>>> ') 100 | pp = People(username, password) 101 | pp.start() 102 | -------------------------------------------------------------------------------- /其他实战/【企业名片】企业查询/qi_ming.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-11-08 Python: 3.7 4 | 5 | import requests 6 | import json 7 | import execjs.runtime_names 8 | 9 | 10 | with open('encryp.js', 'r', encoding='utf-8') as f: 11 | js = execjs.compile(f.read()) 12 | 13 | print('引擎', execjs.get().name) 14 | 15 | data = { 16 | 'time_interval': '', 17 | 'tag': '', 18 | 'tag_type': '', 19 | 'province': '', 20 | 'lunci': '', 21 | 'page': '1', 22 | 'num': '20', 23 | 'unionid': '', 24 | } 25 | 26 | headers = { 27 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36' 28 | } 29 | 30 | response = requests.post('https://vipapi.qimingpian.com/DataList/productListVip', data=data, headers=headers) 31 | 32 | re_data = json.loads(response.text) 33 | 34 | data = js.call('get_info', re_data.get('encrypt_data')) 35 | print(data.encode('utf-8').decode('unicode_escape')) 36 | 37 | -------------------------------------------------------------------------------- /其他实战/【国鑫所】自动登录/Login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-10 Python: 3.7 4 | import execjs.runtime_names 5 | import requests 6 | 7 | 8 | class GuoXin: 9 | """ 10 | 国鑫所 11 | https://wechat.gclfax.com/html/register/login.html 12 | """ 13 | 14 | def __init__(self, user, pwd): 15 | self.user = user 16 | self.pwd = pwd 17 | self.url = 'https://wechat.gclfax.com/client/index.php' 18 | self.js = None 19 | self.init_js() 20 | 21 | def init_js(self): 22 | print('引擎', execjs.get().name) 23 | with open("encryp.js", "r", encoding="utf-8") as f: 24 | self.js = execjs.compile(f.read()) 25 | 26 | def login(self): 27 | headers = { 28 | 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1', 29 | 'Host': 'wechat.gclfax.com', 30 | 'Origin': 'https://wechat.gclfax.com', 31 | 'Referer': 'https://wechat.gclfax.com/html/register/login.html' 32 | } 33 | data = { 34 | 'OPT': '1', 35 | 'name': self.user, 36 | 'pwd': self.js.call('test', self.pwd), 37 | 'randomId': '', 38 | 'code': '', 39 | 'openid': '', 40 | } 41 | response = requests.post(self.url, headers=headers, data=data) 42 | print(response.text) 43 | print(response) 44 | 45 | 46 | if __name__ == '__main__': 47 | username = input('用户名') 48 | password = input('密码') 49 | gxs = GuoXin(username, password) 50 | gxs.login() 51 | -------------------------------------------------------------------------------- /其他实战/【国鑫所】自动登录/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-06 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【国鑫所】自动登录/login_ok.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/其他实战/【国鑫所】自动登录/login_ok.png -------------------------------------------------------------------------------- /其他实战/【天眼查】模拟登录/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-06-18 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【天眼查】模拟登录/login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-04-13 Python: 3.7 4 | import time 5 | 6 | from lxml import etree 7 | from selenium import webdriver 8 | from selenium.webdriver.common.by import By 9 | from selenium.webdriver.support.ui import WebDriverWait 10 | from selenium.webdriver.support import expected_conditions as EC 11 | 12 | 13 | class TYC_Spider: 14 | 15 |     def __init__(self, username, password): 16 |         """初始化参数""" 17 |         url = 'https://www.tianyancha.com/login' 18 |         page_url = 'https://www.tianyancha.com/search/ohp1/p{page}?base=cq' 19 |         self.page_url = page_url 20 |         self.page = 1  # 当前页数 21 |         self.url = url 22 | 23 |         options = webdriver.ChromeOptions() 24 |         # 不加载图片,加快访问速度 25 |         # options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2}) 26 |         # 设置为开发者模式,避免被识别 27 |         options.add_experimental_option('excludeSwitches', 28 |                                         ['enable-automation']) 29 |         self.browser = webdriver.Chrome(executable_path='./chromedriver', options=options) 30 |         self.wait = WebDriverWait(self.browser, 40) 31 |         # 初始化用户名 32 |         self.username = username 33 |         # 初始化密码 34 |         self.password = password 35 | 36 |     def run(self): 37 |         """登陆接口""" 38 |         self.browser.get(self.url) 39 |         try: 40 |             use_pass = self.wait.until( 41 |                 EC.presence_of_element_located((By.XPATH, '//*[@id="web-content"]/div/div[2]/div/div[2]/div/div[3]/div[1]/div[2]'))) 42 |             time.sleep(2) 43 |             use_pass.click() 44 |             username = self.wait.until( 45 |                 EC.presence_of_element_located((By.XPATH, '//*[@id="web-content"]/div/div[2]/div/div[2]/div/div[3]/div[2]/div[2]/input'))) 46 |             password = self.wait.until( 47 |                 EC.presence_of_element_located( 48 |                     (By.XPATH, '//*[@id="web-content"]/div/div[2]/div/div[2]/div/div[3]/div[2]/div[3]/input'))) 49 |             input_to = self.wait.until( 50 |                 EC.presence_of_element_located( 51 |                     (By.XPATH, '//*[@id="web-content"]/div/div[2]/div/div[2]/div/div[3]/div[2]/div[5]'))) 52 |             username.send_keys(self.username) 53 |             password.send_keys(self.password) 54 |             input_to.click() 55 | 56 |             self.wait.until( 57 |                 EC.presence_of_element_located((By.XPATH, '//*[@id="home-main-search"]'))) 58 |             print('登陆成功') 59 |             self.go_page() 60 | 61 |         except Exception: 62 |             self.browser.close() 63 |             print("登陆失败") 64 | 65 |     def go_page(self): 66 |         """进入指定页面""" 67 |         self.browser.get(self.page_url.format(page=str(self.page+1)))  # ohp带电话 68 |         self.get_info(); self.page += 1  # 抓完当前页后页码递增,避免反复抓取同一页 69 |         self.go_page() 70 | 71 |     def get_info(self): 72 |         """获取当前页面,企业名称+电话号码""" 73 |         html = self.browser.page_source 74 |         etr = etree.HTML(html) 75 |         divs = etr.xpath("//div[@class='search-item sv-search-company']") 76 |         for div in divs: 77 |             title = div.xpath('./div/div[3]/div[1]/a/text()') 78 |             phone = div.xpath('./div/div[3]/div[3]/div[1]/script/text()') 79 |             if not phone: 80 |                 phone = div.xpath('./div/div[3]/div[3]/div[1]/span[2]/span/text()') 81 | 82 |                 if
not phone: 83 |                     phone = div.xpath('./div/div[3]/div[4]/div[1]/script/text()') 84 |             print(title, phone) 85 |         time.sleep(2) 86 | 87 | 88 | if __name__ == "__main__": 89 |     name = input("请输入你的天眼查用户名:") 90 |     pas = input("请输入密码:") 91 |     spider = TYC_Spider(name, pas) 92 |     spider.run() 93 | -------------------------------------------------------------------------------- /其他实战/【天翼】登录/login.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | # Auth: Zok  Email: 362416272@qq.com 3 | # Date: 2020/1/23 4 | 5 | 6 | import requests 7 | import re 8 | import execjs 9 | 10 | 11 | session = requests.session() 12 | UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36' 13 | 14 | 15 | def login(username, password): 16 |     with open('v1.js', 'r', encoding='utf-8') as f: 17 |         js = execjs.compile(f.read()) 18 |     username = js.call('make', username) 19 |     password = js.call('make', password) 20 |     url = 'https://e.189.cn/index.do' 21 |     login_url = 'https://open.e.189.cn/api/logbox/oauth2/loginSubmit.do' 22 |     response = session.get(url, headers={"User-Agent": UA}) 23 |     ret = re.search(r'sign=(.*?)&appId=(.*?)&paras=(.*?)&format=(.*?)&clientType=(.*?)&version=(.*?)">', response.text) 24 | 25 |     url = 'https://open.e.189.cn/api/logbox/oauth2/unifyAccountLogin.do?sign=' + ret.group(1) + '&appId=' + ret.group( 26 |         2) + '&paras=' + ret.group(3) + '&format=' + ret.group(4) + '&clientType=' + ret.group( 27 |         5) + '&version=' + ret.group(6) 28 | 29 |     response = session.get(url, headers={"User-Agent": UA}) 30 |     text = response.text 31 | 32 |     captchaToken = re.search(r"captchaToken' value='(.*?)'>", text).group(1) 33 | 34 |     ret = re.search(r"clientType = '(.*?)'[\s\S]*?accountType = '(.*?)'[\s\S]*?appKey = '(.*?)'", text) 35 |     clientType = ret.group(1) 36 |     accountType = ret.group(2) 37 |     appKey = ret.group(3) 38 | 39 |     paramId = re.search(r'paramId = "(.*?)"', text).group(1) 40 |     REQID = re.search(r'reqId = "(.*?)"', text).group(1) 41 |     lt = re.search(r'lt = "(.*?)"', text).group(1) 42 | 43 |     headers = { 44 |         'User-Agent': UA, 45 |         'Host': 'open.e.189.cn', 46 |         'Origin': 'https://open.e.189.cn', 47 |         'Referer': url, 48 |         'REQID': REQID, 49 |         'lt': lt, 50 |     } 51 |     data = { 52 |         'appKey': appKey, 53 |         'accountType': accountType, 54 |         'validateCode': "",  # 验证码 55 |         'captchaToken': captchaToken, 56 |         'returnUrl': 'https://e.189.cn/user/loginMiddle.do?returnUrlMid=https://e.189.cn/user/index.do', 57 |         'mailSuffix': '', 58 |         'dynamicCheck': 'FALSE', 59 |         'clientType': clientType, 60 |         'cb_SaveName': '1', 61 |         'isOauth2': 'false', 62 |         'state': '', 63 |         'paramId': paramId, 64 |         'userName': username, 65 |         'password': password, 66 |     } 67 |     response = session.post(login_url, headers=headers, data=data) 68 | 69 |     # print(data) 70 |     print(response.text) 71 | 72 | 73 | print(execjs.get().name) 74 | if execjs.get().name != 'Node.js (V8)': 75 |     print('请安装V8 引擎') 76 | 77 | if __name__ == '__main__': 78 |     user = input('用户名>>>') 79 |     pwd = input('密码>>>') 80 |     login(user, pwd) 81 | -------------------------------------------------------------------------------- /其他实战/【好莱客】参数解析/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-06 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【好莱客】参数解析/holike.py:
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-07 Python: 3.7 4 | import execjs.runtime_names 5 | import requests 6 | import time 7 | import re 8 | 9 | 10 | class MakeParam: 11 | """ 12 | 好莱客 13 | http://oa.holike.com/login.jsp 14 | """ 15 | 16 | def __init__(self, name, pwd): 17 | self.name = name 18 | self.pwd = pwd 19 | self.js = None 20 | 21 | self.read_js() 22 | 23 | def get_key_vi(self): 24 | url = 'http://oa.holike.com/resource/js/session.jsp?_={t}&s_ajax=true' 25 | headers = { 26 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36' 27 | } 28 | response = requests.get(url.format(t=int(round(time.time() * 1000))), headers=headers) 29 | try: 30 | ret = re.search(r'return "(.*?)";', response.text).group(1) 31 | _key = self.js.call('get_key_iv', ret) 32 | return _key 33 | except AttributeError: 34 | print('获取key失败') 35 | 36 | def read_js(self): 37 | with open('encryp.js', 'r', encoding='utf-8') as f: 38 | self.js = execjs.compile(f.read()) 39 | 40 | def make_params(self): 41 | obj = self.get_key_vi() 42 | j_password = self.js.call("make_j_password", self.pwd, obj.get('security'), obj.get('key'), obj.get('iv')) 43 | 44 | msg = """ 45 | j_username: {user} 46 | j_password: {j_password} 47 | """.format(user=self.name, j_password=j_password) 48 | print(msg) 49 | 50 | 51 | if __name__ == '__main__': 52 | username = input('请输入用户名') 53 | password = input('请输入密码') 54 | hk = MakeParam(username, password) 55 | hk.make_params() 56 | -------------------------------------------------------------------------------- /其他实战/【好莱客】参数解析/ok.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/其他实战/【好莱客】参数解析/ok.png -------------------------------------------------------------------------------- /其他实战/【小牛在线】登录参数生成/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-06 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【小牛在线】登录参数生成/make_param.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-29 Python: 3.7 4 | 5 | import execjs.runtime_names 6 | 7 | """ 8 | 小牛在线,登陆密码参数解密 9 | https://www.xiaoniu88.com/user/login 10 | """ 11 | 12 | 13 | def init_js(): 14 | with open("encryp.js", "r", encoding="utf-8") as f: 15 | return execjs.compile(f.read()) 16 | 17 | 18 | def make_param(password): 19 | js = init_js() 20 | pwd = js.call('get_pwd', password) 21 | print('加密后密码', pwd) 22 | 23 | 24 | if __name__ == '__main__': 25 | password = input('明文密码') 26 | make_param(password) 27 | -------------------------------------------------------------------------------- /其他实战/【开鑫贷】登陆参数生成/KaiXinDai.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-15 Python: 3.7 4 | import requests 5 | import execjs.runtime_names 6 | 7 | 8 | class KaiXinDai: 9 | """ 10 | 开鑫贷登陆参数解密 11 | https://www.gkkxd.com/userAuth/login 12 | """ 13 | def __init__(self, pwd): 14 | self.js = None 15 | 
self.pwd = pwd 16 | self.init_js() 17 | 18 | @staticmethod 19 | def get_dl(): 20 | from lxml import etree 21 | url = 'https://www.kxjf.com/user/login?mainSiteName=kxd' 22 | headers = { 23 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36', 24 | 'Host': 'www.kxjf.com', 25 | 'Referer': 'https://www.gkkxd.com/userAuth/login', 26 | } 27 | response = requests.get(url, headers=headers) 28 | etree = etree.HTML(response.text) 29 | dlmy = etree.xpath('//*[@id="dlmy"]/@value')[0] 30 | return dlmy 31 | 32 | def init_js(self): 33 | with open('encryp.js', 'r', encoding='utf-8') as f: 34 | self.js = execjs.compile(f.read()) 35 | 36 | def make_param(self): 37 | pwd = self.js.call('test', self.get_dl(), self.pwd) 38 | print('pwd生成', pwd) 39 | 40 | 41 | if __name__ == '__main__': 42 | password = input('请输入用户密码') 43 | kxd = KaiXinDai(password) 44 | kxd.make_param() 45 | -------------------------------------------------------------------------------- /其他实战/【开鑫贷】登陆参数生成/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-06 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【微信】登录参数生成/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-10 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【微信】登录参数生成/make_pwd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-08-22 Python: 3.7 4 | import execjs.runtime_names 5 | 6 | 7 | class WeXin(object): 8 | """ 9 | wx 登陆密码解析 10 | """ 11 | 12 | def __init__(self): 13 | self.url = 'https://mp.weixin.qq.com/?token=&lang=zh_CN' 14 | print('引擎', execjs.get().name) 15 | 16 | @staticmethod 17 | def make_pwd(pwd): 18 | with open("encryp.js", "r", encoding="utf-8") as f: 19 | ctx = execjs.compile(f.read()) 20 | 21 | ret = ctx.call("make_pwd", pwd) 22 | print(ret) 23 | 24 | 25 | if __name__ == '__main__': 26 | pdd = WeXin() 27 | pdd.make_pwd('密码') 28 | 29 | -------------------------------------------------------------------------------- /其他实战/【房价】房价获取/README.md: -------------------------------------------------------------------------------- 1 | # 概述 2 | 这不是一个完整的项目,是测试demo,可以获取区域内在售房产单套价格 3 | 4 | 5 | 6 | **代码只是测试了一个最新销售小区中的一栋楼的在售楼房价格** 7 | 8 | 如果需要更完整的,就联系作者 -------------------------------------------------------------------------------- /其他实战/【房价】房价获取/__pycache__/util.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/其他实战/【房价】房价获取/__pycache__/util.cpython-37.pyc -------------------------------------------------------------------------------- /其他实战/【房价】房价获取/util.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | # Auth: Zok Email: 362416272@qq.com 3 | # Date: 2020/2/21 4 | 5 | 6 | from pyDes import * 7 | import base64 8 | 9 | KEY = b'hjkiuy6754edxc32890tfhjkw23xdea'[:24] # 密钥只需要24位 10 | IV = b'jhf5632s' 11 | 12 | 13 | def des3_encrypt(s): 14 | """ 15 | 3DES 加密 16 | :param s: 原始字符串 17 | :return: 
加密后字符串,16进制 18 | """ 19 | k = triple_des(KEY, CBC, IV, pad=None, padmode=PAD_PKCS5) 20 | en = k.encrypt(s, padmode=PAD_PKCS5) 21 | return base64.b64encode(en).decode('utf-8') 22 | 23 | 24 | def des3_decrypt(s): 25 | """ 26 | 3DES 解密 27 | :param s: 加密字符串 28 | :return: 明文 29 | """ 30 | _str = base64.b64decode(s) 31 | k = triple_des(KEY, CBC, IV, pad=None, padmode=PAD_PKCS5) 32 | en = k.decrypt(_str, padmode=PAD_PKCS5).decode('utf-8') 33 | return en 34 | 35 | 36 | def decrypt_str(s): 37 | info = des3_decrypt(s) # 获得 解密后得 base64 38 | content = info[:-6] 39 | hIndex = base64.b64decode(info[-6:].replace("==", "")).decode().split("_") 40 | content2 = content[int(hIndex[0]):] 41 | txt = base64.b64decode( 42 | content2[: len(content2)-int(hIndex[1])][::-1] 43 | ).decode('utf-8').replace("##", "").replace("{@mk7}", "") 44 | return txt 45 | 46 | 47 | def make_str(enB): 48 | """ 49 | 复写字符串算法 50 | 51 | 根据传入文档,转换ascii并计算和 52 | 并复写算法 53 | for (byte item : enB.getBytes("UTF-8")) { 54 | sumResult = Long.valueOf(sumResult.longValue() + ((long) item)); 55 | } 56 | """ 57 | count = 0 58 | for i in enB: 59 | count += ord(i) 60 | # print('合', count) # 每个字符的 Ascii 码的总和 61 | p = count % len(enB) 62 | n = 1 63 | # print('position', p) 64 | while p + n < len(enB) and p - n >= 0: 65 | enB = rep( 66 | rep(enB, p + n, enB[p - n]), 67 | p - n, 68 | enB[p + n] 69 | ) 70 | n += 1 71 | return enB 72 | 73 | 74 | def rep(source, index, rep_str): 75 | """ 76 | 复写的java层字符转换方法 77 | :return: 78 | """ 79 | str1 = source[0: index] 80 | return str1 + rep_str + source[index + 1:] 81 | 82 | 83 | if __name__ == '__main__': 84 | decrypt_str("AaDaKV8GxE77rIScVyq7E0rebiFQjhrkq8PUcmR8A22NHhAW58pQkQ==") 85 | -------------------------------------------------------------------------------- /其他实战/【房天下】自动登录/login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-26 Python: 3.7 4 | 5 | import execjs.runtime_names 6 | import requests 7 | 8 | 9 | class Fang: 10 | """ 11 | 房天下自动登陆 12 | https://passport.fang.com/ 13 | """ 14 | 15 | def __init__(self, user, pwd): 16 | self.user = user 17 | self.pwd = pwd 18 | self.js = None 19 | self.api = 'https://passport.fang.com/login.api' 20 | self.js_init() 21 | 22 | def js_init(self): 23 | print('引擎', execjs.get().name) 24 | with open("encryp.js", "r", encoding="utf-8") as f: 25 | self.js = execjs.compile(f.read()) 26 | 27 | def login(self): 28 | data = { 29 | 'uid': self.user, 30 | 'pwd': self.js.call('getPwd', self.pwd), 31 | 'Service': 'soufun-passport-web', 32 | 'AutoLogin': '1' 33 | } 34 | headers = { 35 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36', 36 | 'Origin': 'https://passport.fang.com', 37 | 'Referer': 'https://passport.fang.com/', 38 | } 39 | response = requests.post(self.api, data=data, headers=headers) 40 | print(response.text) 41 | print(response.cookies) 42 | 43 | 44 | if __name__ == '__main__': 45 | username = input('输入房天下账号') 46 | password = input('输入密码') 47 | f = Fang(username, password) 48 | f.login() 49 | -------------------------------------------------------------------------------- /其他实战/【房天下】自动登录/ok.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/其他实战/【房天下】自动登录/ok.png 
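A hedged usage sketch for the helpers in 其他实战/【房价】房价获取/util.py above; it assumes pyDes is installed and that the snippet runs from that directory, and the input strings are made-up test values rather than real site data:

```python
# Sketch only: exercises util.py's 3DES helpers and the make_str() swap routine.
# Assumptions: `pip install pyDes`, working directory is 【房价】房价获取/.
from util import des3_encrypt, des3_decrypt, make_str

# 3DES-CBC with PKCS5 padding, then Base64; decrypting restores the plaintext.
assert des3_decrypt(des3_encrypt('hello')) == 'hello'

# make_str('abcd'): ASCII sum is 394, p = 394 % 4 = 2, and the loop mirror-swaps
# characters around index 2, so index 1 and index 3 exchange: 'abcd' -> 'adcb'.
assert make_str('abcd') == 'adcb'
```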
-------------------------------------------------------------------------------- /其他实战/【新浪微博】密码解密/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-26 Python: 3.7 4 | 5 | 6 | import execjs 7 | import requests 8 | import json 9 | import re 10 | 11 | 12 | def Get_parameters(): 13 | """微博加密参数有两个 用户名和密码 14 | 用户名为 base64加密 15 | 此处只解决了密码加密问题 其他的请自行拓展 16 | pubkey,time,nonce 17 | :return pubkey,time,nonce 18 | """ 19 | try: 20 | url = "https://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su=MTc3MjM1NzI1OTA%3D&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.19)&_=1574300620782" 21 | 22 | headers = { 23 | 'User-Agent': 'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14', 24 | 'Host': 'login.sina.com.cn', 25 | 'Referer': 'https://www.weibo.com/login.php', 26 | } 27 | 28 | res = requests.get(url=url, headers=headers) 29 | data = re.findall('sinaSSOController.preloginCallBack\((.*?)\)', res.text, re.S)[0] 30 | new_data = json.loads(data) 31 | time = new_data.get('servertime') 32 | nonce = new_data.get('nonce') 33 | pubkey = new_data.get('pubkey') 34 | return pubkey, time, nonce 35 | except Exception as err: 36 | print('访问失败', err) 37 | 38 | 39 | def main(pwd): 40 | """ 41 | :param pwd: 42 | :return: 43 | """ 44 | with open('execute.js', 'r', encoding='utf-8') as f: 45 | js = execjs.compile(f.read()) 46 | 47 | print('引擎', execjs.get().name) 48 | publickey, time, nonce = Get_parameters() 49 | sign = js.call('get_up', pwd, publickey, time, nonce) 50 | return sign 51 | 52 | 53 | if __name__ == '__main__': 54 | pwd = input('请输入密码:') 55 | sign = main(pwd) 56 | print(sign) 57 | -------------------------------------------------------------------------------- /其他实战/【时光网】登陆参数生成/login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-11 Python: 3.7 4 | import execjs.runtime_names 5 | 6 | 7 | class MTime: 8 | """ 9 | 时光网登陆,password 加密解析 10 | https://m.mtime.cn/#!/member/signin 11 | """ 12 | def __init__(self, name, pwd): 13 | self.name = name 14 | self.pwd = pwd 15 | self.url = 'https://m.mtime.cn/Service/callback-comm.mi/user/login.api' 16 | self.js = None 17 | self.init_js() 18 | 19 | def init_js(self): 20 | print('引擎', execjs.get().name) 21 | with open("encryp.js", "r", encoding="utf-8") as f: 22 | self.js = execjs.compile(f.read()) 23 | 24 | def make_pwd(self): 25 | print(self.js.call('get_pwd', self.pwd)) 26 | 27 | 28 | if __name__ == '__main__': 29 | username = input('请输入用户名') 30 | password = input('输入密码') 31 | mt = MTime(username, password) 32 | mt.make_pwd() 33 | -------------------------------------------------------------------------------- /其他实战/【易通贷】自动登录/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-06 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【易通贷】自动登录/auto_login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-08-26 Python: 3.7 4 | 5 | import requests 6 | import execjs.runtime_names 7 | 8 | 9 | class YDT(object): 10 | """ 11 | 易通贷自动登陆 12 | """ 13 | 14 | def __init__(self, user, 
pwd): 15 | self.user = user 16 | self.pwd = pwd 17 | self.url = 'https://app.etongdai.com/login/verifylogin' 18 | print('引擎', execjs.get().name) 19 | 20 | @staticmethod 21 | def make_pwd(pwd): 22 | with open("encryp.js", "r", encoding="utf-8") as f: 23 | ctx = execjs.compile(f.read()) 24 | return ctx.call("make_js", pwd) 25 | 26 | def make_data(self): 27 | data = { 28 | 'loginName': self.user, 29 | 'check': 'on', 30 | 'next': 'null', 31 | 'password': self.make_pwd(self.pwd), 32 | } 33 | 34 | return data 35 | 36 | def login(self): 37 | data = self.make_data() 38 | response = requests.post(self.url, data=data) 39 | data = response.content.decode('utf-8') 40 | print(data) 41 | 42 | 43 | if __name__ == '__main__': 44 | username = input('请输入 易通贷账号') 45 | password = input('请输入 易通贷密码') 46 | ydt = YDT(username, password) 47 | ydt.login() 48 | 49 | 50 | -------------------------------------------------------------------------------- /其他实战/【汽车之家】参数解密/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-08-26 Python: 3.7 4 | 5 | import execjs 6 | 7 | 8 | def main(pwd): 9 | """只解决了pwd的加密,其他请自行拓展 10 | :param pwd: 11 | :return: 12 | """ 13 | with open('execute.js', 'r', encoding='utf-8') as f: 14 | js = execjs.compile(f.read()) 15 | 16 | print('引擎', execjs.get().name) 17 | 18 | sign = js.call('hex_md5', pwd) 19 | return sign 20 | 21 | 22 | if __name__ == '__main__': 23 | pwd = input('请输入你的密码:') 24 | print(main(pwd)) 25 | -------------------------------------------------------------------------------- /其他实战/【满级网】自动登录/auto_login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-08-26 Python: 3.7 4 | 5 | import requests 6 | import base64 7 | from Crypto.Cipher import PKCS1_v1_5 as Cipher_pksc1_v1_5 8 | from Crypto.PublicKey import RSA 9 | 10 | 11 | class YX(object): 12 | """ 13 | 满级网自动登陆 官网 www.manjiwang.com 14 | http://www.manjiwang.com/Logins/BuyerLogin 15 | """ 16 | 17 | def __init__(self, user, pwd): 18 | self.user = user 19 | self.pwd = pwd 20 | self.url = 'http://www.manjiwang.com/Logins/BuyerLogin' 21 | self.headers = { 22 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36', 23 | 'Host': 'www.manjiwang.com', 24 | } 25 | self.public_key = """-----BEGIN PUBLIC KEY----- 26 | MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDC4wHerJc4BSst20Zb07lY9LeZss4OEEhe+SrnLyYy8hGquX/aTQNn+5wnV/+8ierKPgqPGIXPf1ZRww5/6yON+O7dAfJ7BRx85HneIWqwPCZToLck8DN8UXsBuXLMcG7tfMunnnZKenrPsAslN0eKvkYkvz4EPGdvmPwz0NCKXQIDAQAB 27 | -----END PUBLIC KEY----- 28 | """ 29 | 30 | def make_pwd(self): 31 | rsa_key = RSA.importKey(self.public_key) 32 | cipher = Cipher_pksc1_v1_5.new(rsa_key) 33 | cipher_text = base64.b64encode(cipher.encrypt(self.pwd.encode())) 34 | return cipher_text.decode() 35 | 36 | def make_data(self): 37 | data = { 38 | 'account': self.user, 39 | 'password': self.make_pwd(), 40 | 'returnUrl': '/' 41 | } 42 | return data 43 | 44 | def login(self): 45 | """start 46 | """ 47 | data = self.make_data() 48 | response = requests.post(self.url, data=data) 49 | print(response.text) 50 | print(response.cookies) 51 | 52 | 53 | if __name__ == '__main__': 54 | username = input('请输入账号') 55 | password = input('密码') 56 | yx = YX(username, password) 57 | yx.login() 58 | 59 | 60 | 61 | 
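One property of the PKCS#1 v1.5 encryption used just above is worth noting: the padding is randomized, so the same password yields a different Base64 ciphertext on every call, and the server can decrypt any of them with its private key. A minimal sketch, assuming pycryptodome is installed (PUBLIC_KEY is the public key embedded in the script above):

```python
# Sketch only: shows that RSA PKCS#1 v1.5 ciphertexts are non-deterministic.
import base64
from Crypto.Cipher import PKCS1_v1_5
from Crypto.PublicKey import RSA

PUBLIC_KEY = """-----BEGIN PUBLIC KEY-----
MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDC4wHerJc4BSst20Zb07lY9LeZss4OEEhe+SrnLyYy8hGquX/aTQNn+5wnV/+8ierKPgqPGIXPf1ZRww5/6yON+O7dAfJ7BRx85HneIWqwPCZToLck8DN8UXsBuXLMcG7tfMunnnZKenrPsAslN0eKvkYkvz4EPGdvmPwz0NCKXQIDAQAB
-----END PUBLIC KEY-----"""

cipher = PKCS1_v1_5.new(RSA.importKey(PUBLIC_KEY))
c1 = base64.b64encode(cipher.encrypt(b'password')).decode()
c2 = base64.b64encode(cipher.encrypt(b'password')).decode()

print(c1 != c2)                    # True: the random padding differs per call
print(len(base64.b64decode(c1)))   # 128 bytes, ciphertext size of a 1024-bit key
```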
-------------------------------------------------------------------------------- /其他实战/【百度】wap端sig生成/make_sig.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2020-01-13 Python: 3.7 4 | 5 | """ 6 | wap端 sig 参数生成 7 | 应水友需求,帮忙弄的 8 | 需要 V8 引擎! 9 | """ 10 | 11 | import execjs 12 | import os 13 | 14 | print(execjs.get().name) 15 | 16 | 17 | with open(os.path.dirname(__file__) + '/v3_update.js') as f: 18 | js = execjs.compile(f.read()) 19 | 20 | 21 | # dv 可固定, 用了一些随机参数生成的。 22 | dv = 'tk0.48553508531670751578885709447.0@mmy0VdnCHg9mlXM-7ZM-tbvB8YHXK3MIEg9WNa8V3x9Cqa5kqgOXcFOjca5BJWOB7eNIzY5k9j8VNKUk0~9F~~5rOiHXvivmzzHjJFMXubOG~W8VRln6~l9k0g9mlXM-7ZM-tbvB8YHXK3MIEg9WH~9V7x9Cql5kqgOXcFOjca5BJWOB7eNIzY5k9-9CRWUq__dy0ov8Cpy5k9j8S~W8Cpz9SlXM-7ZM-tbH-JSMIYaUktanm~F9VEg9WEj8VRgOXcFOjca5BJWOB7eNIzYUk0~9kHg9C9~5kEF8WqW9mlx-vvLwvB87Tr4hByj9G~F5kHyGynvrg~5Vty8CEW8Cqy8C9l8VH~8WEl8CHynkRz8WqK8kt-5Vq_jy~56JeOrJXLIKYOq__Hyr9m~~5k0K9k9g9WHj5k0K9Vqg9Cqy9m~lnCp~5k0K9Vqg9Cqa9q__' 23 | username = '这是测试' # 用户名 24 | s_code = 'ilvw' # 验证码 25 | verifystring = 'jxOb3456654e9d67a5c02ab155fe9012fb44e5b90ae9b01ca02' # 首页返回的 26 | 27 | result = js.call('v3test', dv, s_code, verifystring) 28 | 29 | print(result) -------------------------------------------------------------------------------- /其他实战/【百度】网页找回密码/__pycache__/header.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/其他实战/【百度】网页找回密码/__pycache__/header.cpython-37.pyc -------------------------------------------------------------------------------- /其他实战/【百度】网页找回密码/header.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-12-23 Python: 3.7 4 | 5 | UA = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36' 6 | REFERER = 'https://passport.baidu.com/?getpassindex' 7 | LANGUAGE = 'zh-CN,zh;q=0.9' 8 | CONNECTION = 'keep-alive' 9 | 10 | headers_get_phone = { 11 | 'Connection': CONNECTION, 12 | 'User-Agent': UA, 13 | 'Accept': '*/*', 14 | 'Sec-Fetch-Site': 'same-origin', 15 | 'Sec-Fetch-Mode': 'no-cors', 16 | 'Referer': REFERER, 17 | 'Accept-Language': LANGUAGE 18 | } 19 | 20 | 21 | headers_token = { 22 | "Connection": CONNECTION, 23 | "Content-Lengt": '999', 24 | "Cache-Control": 'max-age=0', 25 | "Origin": "https://passport.baidu.com", 26 | "Upgrade-Insecure-Requests": '1', 27 | "Content-Type": "application/x-www-form-urlencoded", 28 | "User-Agent": UA, 29 | "Sec-Fetch-User": "?1", 30 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", 31 | "Sec-Fetch-Site": "same-origin", 32 | "Sec-Fetch-Mode": "navigate", 33 | "Referer": REFERER, 34 | "Accept-Language": LANGUAGE, 35 | } 36 | 37 | headers_img = { 38 | 'Connection': CONNECTION, 39 | 'User-Agent': UA, 40 | 'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8', 41 | 'Sec-Fetch-Site': 'same-origin', 42 | 'Sec-Fetch-Mode': 'no-cors', 43 | 'Referer': REFERER, 44 | 'Accept-Language': LANGUAGE, 45 | } 46 | 47 | headers_bds_token = { 48 | 'Connection': CONNECTION, 49 | 'Upgrade-Insecure-Requests': '1', 50 | 'User-Agent': UA, 51 | 'Accept': 
'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', 52 | 'Sec-Fetch-Site': 'none', 53 | 'Sec-Fetch-Mode': 'navigate', 54 | 'Accept-Language': LANGUAGE, 55 | } 56 | 57 | headers_verify_str = { 58 | 'Connection': CONNECTION, 59 | 'User-Agent': UA, 60 | 'Accept': '*/*', 61 | 'Sec-Fetch-Site': 'same-origin', 62 | 'Sec-Fetch-Mode': 'no-cors', 63 | 'Referer': REFERER, 64 | 'Accept-Language': LANGUAGE, 65 | } 66 | -------------------------------------------------------------------------------- /其他实战/【百度】网页找回密码/验证码.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/其他实战/【百度】网页找回密码/验证码.png -------------------------------------------------------------------------------- /其他实战/【百度】翻译/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-11-06 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【百度】翻译/translate.js: -------------------------------------------------------------------------------- 1 | var i = "320305.131321201" 2 | 3 | 4 | function n(r, o) { 5 | for (var t = 0; t < o.length - 2; t += 3) { 6 | var e = o.charAt(t + 2); 7 | e = e >= "a" ? e.charCodeAt(0) - 87 : Number(e), 8 | e = "+" === o.charAt(t + 1) ? r >>> e : r << e, 9 | r = "+" === o.charAt(t) ? r + e & 4294967295 : r ^ e 10 | } 11 | return r 12 | } 13 | 14 | function a(r) { 15 | var t = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g); 16 | if (null === t) { 17 | var a = r.length; 18 | a > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(a / 2) - 5, 10) + r.substr(-10, 10)) 19 | } else { 20 | for (var C = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), h = 0, f = C.length, u = []; f > h; h++) 21 | "" !== C[h] && u.push.apply(u, e(C[h].split(""))), 22 | h !== f - 1 && u.push(t[h]); 23 | var g = u.length; 24 | g > 30 && (r = u.slice(0, 10).join("") + u.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + u.slice(-10).join("")) 25 | } 26 | var l = void 0 27 | , d = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107); 28 | l = null !== i ? i : (i = o.common[d] || "") || ""; 29 | for (var m = l.split("."), S = Number(m[0]) || 0, s = Number(m[1]) || 0, c = [], v = 0, F = 0; F < r.length; F++) { 30 | var p = r.charCodeAt(F); 31 | 128 > p ? c[v++] = p : (2048 > p ? c[v++] = p >> 6 | 192 : (55296 === (64512 & p) && F + 1 < r.length && 56320 === (64512 & r.charCodeAt(F + 1)) ? (p = 65536 + ((1023 & p) << 10) + (1023 & r.charCodeAt(++F)), 32 | c[v++] = p >> 18 | 240, 33 | c[v++] = p >> 12 & 63 | 128) : c[v++] = p >> 12 | 224, 34 | c[v++] = p >> 6 & 63 | 128), 35 | c[v++] = 63 & p | 128) 36 | } 37 | for (var w = S, A = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), b = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), D = 0; D < c.length; D++) 38 | w += c[D], 39 | w = n(w, A); 40 | return w = n(w, b), 41 | w ^= s, 42 | 0 > w && (w = (2147483647 & w) + 2147483648), 43 | w %= 1e6, 44 | w.toString() + "." 
+ (w ^ S) 45 | } -------------------------------------------------------------------------------- /其他实战/【百度】翻译/translation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-04-26 Python: 3.7 4 | 5 | # 本代码参考 github作者:CriseLYJ 6 | 7 | import requests 8 | import js2py 9 | 10 | 11 | class FanYiSpider(object): 12 | """ 13 | 翻译 14 | """ 15 | context = js2py.EvalJs() # python中使用js 16 | 17 | def __init__(self, query): 18 | # 初始化 19 | self.url = "https://fanyi.baidu.com/basetrans" 20 | self.query = query 21 | self.headers = { 22 | "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Mobile Safari/537.36", 23 | "Referer": "https://fanyi.baidu.com/", 24 | "Cookie": "BAIDUID=714BFAAF02DA927F583935C7A354949A:FG=1; BIDUPSID=714BFAAF02DA927F583935C7A354949A; PSTM=1553390486; delPer=0; PSINO=5; H_PS_PSSID=28742_1463_21125_18559_28723_28557_28697_28585_28640_28604_28626_22160; locale=zh; from_lang_often=%5B%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%2C%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%5D; to_lang_often=%5B%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%2C%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%5D; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; Hm_lvt_afd111fa62852d1f37001d1f980b6800=1553658863,1553766321,1553769980,1553770442; Hm_lpvt_afd111fa62852d1f37001d1f980b6800=1553770442; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1553766258,1553766321,1553769980,1553770442; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1553770442", 25 | "Content-Type": "application/x-www-form-urlencoded", 26 | "Origin": "https://fanyi.baidu.com", 27 | "X-Requested-With": "XMLHttpRequest", 28 | } 29 | 30 | def make_sign(self): 31 | with open("translate.js", "r", encoding="utf-8") as f: 32 | self.context.execute(f.read()) 33 | 34 | sign = self.context.a(self.query) 35 | return sign 36 | 37 | def make_data(self, sign): 38 | data = { 39 | "query": self.query, 40 | "from": "en", 41 | "to": "zh", 42 | "token": "6f5c83b84d69ad3633abdf18abcb030d", 43 | "sign": sign 44 | } 45 | return data 46 | 47 | def get_content(self, data): 48 | response = requests.post( 49 | url=self.url, 50 | headers=self.headers, 51 | data=data 52 | ) 53 | return response.json()["trans"][0]["dst"] 54 | 55 | @property 56 | def run(self): 57 | sign = self.make_sign() # 获取sign的值 58 | data = self.make_data(sign) # 构建参数 59 | content = self.get_content(data) # 获取翻译内容 60 | return content 61 | 62 | 63 | if __name__ == '__main__': 64 | key = input("输入翻译内容:") 65 | translate = FanYiSpider(key) 66 | print(translate.run) 67 | -------------------------------------------------------------------------------- /其他实战/【百度】自动登录/README.md: -------------------------------------------------------------------------------- 1 | # 解密过程参考博客 2 | 3 | [博客链接](https://www.zhangkunzhi.com/?p=216) -------------------------------------------------------------------------------- /其他实战/【百度】自动登录/login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-08-05 Python: 3.7 4 | 5 | """ 6 | 百度登陆参数比较多 7 | 8 | 这里是密码加密生成器 9 | """ 10 | 11 | import js2py 12 | 13 | 14 | class PingDuoDuoSpider(object): 15 | """ 16 | 生成百度登陆密码加密结果 17 | """ 18 | context = js2py.EvalJs() # 
python中使用js 19 | 20 | def __init__(self): 21 | # 初始化 22 | with open("encryp.js", "r", encoding="utf-8") as f: 23 | self.context.execute(f.read()) 24 | 25 | def make(self, password): 26 | pwd = self.context.test(password) 27 | print(pwd) # 打印加密之后的密码 28 | 29 | 30 | if __name__ == '__main__': 31 | pdd = PingDuoDuoSpider() 32 | 33 | key = input("输入密码") 34 | pdd.make(key) 35 | -------------------------------------------------------------------------------- /其他实战/【百度街拍】图片下载/get_image.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-08-05 Python: 3.7 4 | 5 | import requests, time 6 | from urllib.parse import urlencode 7 | from urllib.request import urlretrieve 8 | 9 | 10 | def getPage(offset): 11 | '''获取网页信息''' 12 | data = { 13 | 'tn': 'resultjson_com', 14 | 'ipn': 'rj', 15 | 'ct': '201326592', 16 | 'is': '', 17 | 'fp': 'result', 18 | 'queryWord': '街拍', 19 | 'cl': '2', 20 | 'lm': '-1', 21 | 'ie': 'utf - 8', 22 | 'oe': 'utf - 8', 23 | 'adpicid': '', 24 | 'st': '-1', 25 | 'z': '', 26 | 'ic': '0', 27 | 'hd': '', 28 | 'latest': '', 29 | 'copyright': '', 30 | 'word': '街拍', 31 | 's': '', 32 | 'se': '', 33 | 'tab': '', 34 | 'width': '', 35 | 'height': '', 36 | 'face': '0', 37 | 'istype': '2', 38 | 'qc': '', 39 | 'nc': '1', 40 | 'fr': '', 41 | 'expermode': '', 42 | 'force': '', 43 | 'pn': offset, 44 | 'rn': '30', 45 | 'gsm': '1e', 46 | '1551789143500': '', 47 | } 48 | headers = { 49 | 'Accept': 'text/plain, */*; q=0.01', 50 | 'Accept-Encoding': 'deflate, br', 51 | 'Accept-Language': 'Accept-Language', 52 | 'Connection': 'keep-alive', 53 | 'Cookie': 'BDqhfp=%E8%A1%97%E6%8B%8D%26%260-10-1undefined%26%260%26%261; BIDUPSID=7CA5F033CA22949F5FB6110DBC5DC1EE; BAIDUID=6DDE5BAA44763FD6C7CA84401CB19F36:FG=1; indexPageSugList=%5B%22%E8%A1%97%E6%8B%8D%22%5D; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; uploadTime=1551768107224; userFrom=null; BDRCVFR[X_XKQks0S63]=mk3SLVN4HKm; firstShowTip=1; cleanHistoryStatus=0', 54 | 'Host': 'image.baidu.com', 55 | 'Referer': 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E8%A1%97%E6%8B%8D&oq=%E8%A1%97%E6%8B%8D&rsp=-1', 56 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6735.400 QQBrowser/10.2.2328.400', 57 | 'X-Requested-With': 'XMLHttpRequest', 58 | } 59 | url = 'https://image.baidu.com/search/acjson?' 
+ urlencode(data) 60 | try: 61 | res = requests.get(url, data=data, headers=headers) 62 | res.encoding = 'utf-8' # 网页信息编码 63 | if res.status_code == 200: 64 | return res.json() 65 | except requests.ConnectionError: 66 | return None 67 | 68 | 69 | def getImage(json): 70 | '''解析网页数据并爬取所需的信息''' 71 | try: 72 | data = json.get('data') 73 | if data: 74 | for item in data: 75 | yield { 76 | 'image': item.get('hoverURL'), 77 | 'title': item.get('fromPageTitleEnc'), 78 | } 79 | except: 80 | return None 81 | 82 | 83 | def saveImage(item): 84 | '''把获取的图片与标题封装并存储''' 85 | try: 86 | m = item.get('title') 87 | local_image = item.get('image') # 获取图片的url 88 | image_url = local_image 89 | urlretrieve(image_url, './pic/' + str(m) + '.jpg') 90 | # print('p'+str(m) + '.jpg') 91 | except: 92 | return None 93 | 94 | 95 | def main(offset): 96 | '''调度爬取函数和存储''' 97 | json = getPage(offset) 98 | for item in getImage(json): 99 | print(item) 100 | saveImage(item) 101 | 102 | 103 | if __name__ == '__main__': 104 | for i in range(5): # 此处循环遍历五次是不可行的 每次data值中的gsm在变化 105 | main(offset=i * 30) 106 | time.sleep(1) 107 | -------------------------------------------------------------------------------- /其他实战/【移动】登录参数生成/MakeParam.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-05 Python: 3.7 4 | 5 | import execjs.runtime_names 6 | 7 | 8 | class MakeParam: 9 | """ 10 | 移动登陆 11 | 加密参数生成器 12 | 页面 https://mail.10086.cn/ 13 | """ 14 | 15 | def __init__(self, name, pwd): 16 | self.name = name 17 | self.pwd = pwd 18 | self.js = None 19 | self.init_js() 20 | 21 | def init_js(self): 22 | print('引擎', execjs.get().name) 23 | with open("encryp.js", "r", encoding="utf-8") as f: 24 | self.js = execjs.compile(f.read()) 25 | 26 | def mk_params(self): 27 | cguid = self.js.call("customerGetCGUID") 28 | _ = self.js.call('sha1', self.name) 29 | word = self.js.call('calcDigest', self.pwd) 30 | msg = """ 31 | cguid: {cguid} 32 | _: {_} 33 | password: {word} 34 | """ 35 | print(msg.format(cguid=cguid, _=_, word=word)) 36 | 37 | 38 | if __name__ == '__main__': 39 | username = input('输入用户名') 40 | password = input('输入密码') 41 | yd = MakeParam(username, password) 42 | yd.mk_params() 43 | -------------------------------------------------------------------------------- /其他实战/【移动】登录参数生成/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-06 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【移动】登录参数生成/encryp.js: -------------------------------------------------------------------------------- 1 | function sha1(a) { 2 | function b(a, b) { 3 | var c = (a & 65535) + (b & 65535); 4 | return (a >> 16) + (b >> 16) + (c >> 16) << 16 | c & 65535 5 | } 6 | 7 | for (var c = [], d = 0; d < 8 * a.length; d += 8) 8 | c[d >> 5] |= (a.charCodeAt(d / 8) & 255) << 24 - d % 32; 9 | a = 8 * a.length; 10 | c[a >> 5] |= 128 << 24 - a % 32; 11 | c[(a + 64 >> 9 << 4) + 15] = a; 12 | a = Array(80); 13 | for (var d = 1732584193, e = -271733879, f = -1732584194, g = 271733878, k = -1009589776, h = 0; h < c.length; h += 14 | 16) { 15 | for (var l = d, m = e, n = f, p = g, q = k, j = 0; 80 > j; j++) { 16 | a[j] = 16 > j ? 
c[h + j] : (a[j - 3] ^ a[j - 8] ^ a[j - 14] ^ a[j - 16]) << 1 | (a[j - 3] ^ a[j - 8] ^ a[j - 14] ^ 17 | a[j - 16]) >>> 31; 18 | var r = b(b(d << 5 | d >>> 27, 20 > j ? e & f | ~e & g : 40 > j ? e ^ f ^ g : 60 > j ? e & f | e & g | f & g : e ^ 19 | f ^ g), b(b(k, a[j]), 20 > j ? 1518500249 : 40 > j ? 1859775393 : 60 > j ? -1894007588 : -899497514)), 20 | k = g, 21 | g = f, 22 | f = e << 30 | e >>> 2, 23 | e = d, 24 | d = r 25 | } 26 | d = b(d, l); 27 | e = b(e, m); 28 | f = b(f, n); 29 | g = b(g, p); 30 | k = b(k, q) 31 | } 32 | c = [d, e, f, g, k]; 33 | a = ""; 34 | for (d = 0; d < 4 * c.length; d++) 35 | a += "0123456789abcdef".charAt(c[d >> 2] >> 8 * (3 - d % 4) + 4 & 15) + "0123456789abcdef".charAt(c[d >> 2] >> 8 * 36 | (3 - d % 4) & 15); 37 | return a 38 | } 39 | 40 | 41 | function a(a, c) { 42 | var d = (a & 65535) + (c & 65535); 43 | return (a >> 16) + (c >> 16) + (d >> 16) << 16 | d & 65535 44 | } 45 | 46 | calcDigest = function (b) { 47 | for (var c = (b.length + 8 >> 6) + 1, d = Array(16 * c), e = 0; e < 16 * c; e++) 48 | d[e] = 0; 49 | for (e = 0; e < b.length; e++) 50 | d[e >> 2] |= b.charCodeAt(e) << 24 - 8 * (e & 3); 51 | d[e >> 2] |= 128 << 24 - 8 * (e & 3); 52 | d[16 * c - 1] = 8 * b.length; 53 | b = Array(80); 54 | for (var c = 1732584193, e = -271733879, f = -1732584194, g = 271733878, k = -1009589776, h = 0; h < d.length; h += 55 | 16) { 56 | for (var l = c, m = e, n = f, p = g, q = k, j = 0; 80 > j; j++) { 57 | b[j] = 16 > j ? d[h + j] : (b[j - 3] ^ b[j - 8] ^ b[j - 14] ^ b[j - 16]) << 1 | (b[j - 3] ^ b[j - 8] ^ b[j - 14] ^ 58 | b[j - 16]) >>> 31; 59 | var r = a(a(c << 5 | c >>> 27, 20 > j ? e & f | ~e & g : 40 > j ? e ^ f ^ g : 60 > j ? e & f | e & g | f & g : e ^ 60 | f ^ g), a(a(k, b[j]), 20 > j ? 1518500249 : 40 > j ? 1859775393 : 60 > j ? 
-1894007588 : -899497514)), 61 | k = g, 62 | g = f, 63 | f = e << 30 | e >>> 2, 64 | e = c, 65 | c = r 66 | } 67 | c = a(c, l); 68 | e = a(e, m); 69 | f = a(f, n); 70 | g = a(g, p); 71 | k = a(k, q) 72 | } 73 | d = [c, e, f, g, k]; 74 | b = ""; 75 | for (c = 0; c < 4 * d.length; c++) 76 | b += "0123456789abcdef".charAt(d[c >> 2] >> 8 * (3 - c % 4) + 4 & 15) + "0123456789abcdef".charAt(d[c >> 2] >> 8 * 77 | (3 - c % 4) & 15); 78 | return b 79 | } 80 | 81 | 82 | function customerGetCGUID() { 83 | function a(a, b) { 84 | var e = (b || 2) - (1 + Math.floor(Math.log(a | 1) / Math.LN10 + 1E-15)); 85 | return Array(e + 1).join("0") + a 86 | } 87 | 88 | var b = new Date; 89 | return "" + a(b.getHours()) + a(b.getMinutes()) + a(b.getSeconds()) + a(b.getMilliseconds(), 3) + a(Math.ceil(9999 * 90 | Math.random()), 4) 91 | } -------------------------------------------------------------------------------- /其他实战/【移动】登录参数生成/make_params.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/其他实战/【移动】登录参数生成/make_params.png -------------------------------------------------------------------------------- /其他实战/【空中网】自动登录/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-06 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【空中网】自动登录/encryp.js: -------------------------------------------------------------------------------- 1 | function mk_pwd (str, pwd) { 2 | if (pwd == null || pwd.length <= 0) { 3 | return null 4 | } 5 | ;var prand = ""; 6 | for (var i = 0; i < pwd.length; i++) { 7 | prand += pwd.charCodeAt(i).toString() 8 | } 9 | ;var sPos = Math.floor(prand.length / 5); 10 | var mult = parseInt(prand.charAt(sPos) + prand.charAt(sPos * 2) + prand.charAt(sPos * 3) + prand.charAt(sPos * 4) + prand.charAt(sPos * 5)); 11 | var incr = Math.ceil(pwd.length / 2); 12 | var modu = Math.pow(2, 31) - 1; 13 | if (mult < 2) { 14 | return null 15 | } 16 | ;var salt = Math.round(Math.random() * 1000000000) % 100000000; 17 | prand += salt; 18 | while (prand.length > 10) { 19 | var a = prand.substring(0, 1); 20 | var b = prand.substring(10, prand.length); 21 | if (b.length > 10) { 22 | prand = b 23 | } else { 24 | prand = (parseInt(a) + parseInt(b)).toString() 25 | } 26 | } 27 | ;prand = (mult * prand + incr) % modu; 28 | var enc_chr = ""; 29 | var enc_str = ""; 30 | for (var i = 0; i < str.length; i++) { 31 | enc_chr = parseInt(str.charCodeAt(i) ^ Math.floor((prand / modu) * 255)); 32 | if (enc_chr < 16) { 33 | enc_str += "0" + enc_chr.toString(16) 34 | } else 35 | enc_str += enc_chr.toString(16); 36 | prand = (mult * prand + incr) % modu 37 | } 38 | ;salt = salt.toString(16); 39 | while (salt.length < 8) 40 | salt = "0" + salt; 41 | enc_str += salt; 42 | return enc_str 43 | } -------------------------------------------------------------------------------- /其他实战/【空中网】自动登录/spider_login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-03 Python: 3.7 4 | import re 5 | import time 6 | import requests 7 | import execjs.runtime_names 8 | 9 | 10 | class SpiderLogin: 11 | """ 12 | 空中网爬虫登陆 13 | """ 14 | 15 | def __init__(self, user, pwd): 16 | self.session = requests.session() 17 | self.user = 
user 18 | self.pwd = pwd 19 | self.login_time = int(round(time.time() * 1000)) 20 | self.url = 'https://m.wcbchina.com/login/other-login.html' 21 | self.headers = { 22 | 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1', 23 | 'Host': 'sso.kongzhong.com', 24 | 'Referer': 'https://passport.kongzhong.com/login' 25 | } 26 | 27 | def use_js(self, dc): 28 | """js 调用 29 | """ 30 | with open("encryp.js", "r", encoding="utf-8") as f: 31 | js = execjs.compile(f.read()) 32 | try: 33 | pwd = js.call("mk_pwd", self.pwd, dc) 34 | return pwd 35 | except Exception: 36 | print('js 异常') 37 | 38 | def auto_login(self): 39 | """登陆 40 | """ 41 | login_url = 'https://sso.kongzhong.com/ajaxLogin?j=j&&type=1&service=https://passport.kongzhong.com/&username={username}&password={password}&vcode=&toSave=0&_={_time}' 42 | dc = self.get_dc() 43 | en_pwd = self.use_js(dc) 44 | response = self.session.get(login_url.format(username=self.user, password=en_pwd, _time=self.login_time), headers=self.headers) 45 | print(response.cookies) 46 | print(response.text) 47 | print(response) 48 | 49 | def get_dc(self): 50 | """捕获 dc 参数 51 | """ 52 | target = 'https://sso.kongzhong.com/ajaxLogin?j=j&jsonp=j&service=https://passport.kongzhong.com/&_={t}'.format( 53 | t=self.login_time) 54 | response = self.session.get(target, headers=self.headers) 55 | try: 56 | dc = re.search(r'"dc":"(.*?)","kzmsg', response.text).group(1) 57 | return dc 58 | except AttributeError: 59 | print('dc 捕获失败') 60 | 61 | 62 | if __name__ == '__main__': 63 | username = input('请输入账号') 64 | password = input('密码') 65 | kzw = SpiderLogin(username, password) 66 | kzw.auto_login() 67 | -------------------------------------------------------------------------------- /其他实战/【美团】数据解析、token生成/README.md: -------------------------------------------------------------------------------- 1 | ``` 2 | . 
3 | └── MeiTuan // -------美团------- 4 | ├── get_login_cookies.py // 基于pyppeteer登陆并获取cookies 5 | ├── parse_play_areas.py // 三级区域解析器(休闲板块) 6 | ├── parse_play_info.py // 休闲会所商铺数据解析 7 | ├── parse_hotel_info.py // 酒店基础数据解析 8 | ├── parse_hotel_comments.py // 酒店评论解析 9 | ├── create_food_token.py // 餐饮页Token生成器 10 | ├── parse_food_comments.py // 获取用户评论数据 11 |   └── parse_food_info.py // 解析餐馆数据 12 | 13 | ``` 14 | -------------------------------------------------------------------------------- /其他实战/【美团】数据解析、token生成/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-06-18 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【美团】数据解析、token生成/create_food_token.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-04-21 Python: 3.7 4 | 5 | import json, zlib, base64, time 6 | 7 | 8 | class MakeToken(): 9 | """ 10 | 测试2019-4-21日可用 11 | 仅作为学术交流!如有侵权,联系作者删除 12 | 美团【餐馆列表】Token生成 13 | """ 14 | 15 | def __init__(self, areaId, cityName, originUrl, page): 16 | self.areaId = areaId 17 | self.cityName = cityName 18 | self.originUrl = originUrl 19 | self.page = page 20 | self.uuid = 'c6eada3ffd8e444491e9.1555472928.3.0.0' # Demo 21 | 22 | def join_sign(self): 23 | # 参数 24 | sign = 'areaId={areaId}&cateId=0&cityName={cityName}&dinnerCountAttrId=&optimusCode=1&originUrl={originUrl}&page={page}&partner=126&platform=1&riskLevel=1&sort=&userId=&uuid={uuid}' 25 | _str = sign.format(areaId=self.areaId, cityName=self.cityName, originUrl=self.originUrl, page=self.page, 26 | uuid=self.uuid) 27 | sign = base64.b64encode(zlib.compress(bytes(json.dumps(_str, ensure_ascii=False), encoding="utf8"))) 28 | sign = str(sign, encoding="utf8") 29 | return sign 30 | 31 | @property 32 | def join_token(self): 33 | str_json = {} 34 | str_json['rId'] = 100900 35 | str_json['ver'] = '1.0.6' 36 | str_json['ts'] = time.time() 37 | str_json['cts'] = time.time() + 110 38 | str_json['brVD'] = [1920, 315] 39 | str_json['brR'] = [[1920, 1080], [1920, 1057], 24, 24] 40 | str_json['bI'] = [self.originUrl, ""] 41 | str_json['mT'] = [] 42 | str_json['kT'] = [] 43 | str_json['aT'] = [] 44 | str_json['tT'] = [] 45 | str_json['aM'] = '' 46 | str_json['sign'] = self.join_sign() 47 | token_decode = zlib.compress( 48 | bytes(json.dumps(str_json, separators=(',', ':'), ensure_ascii=False), encoding="utf8")) 49 | token = str(base64.b64encode(token_decode), encoding="utf8") 50 | return token 51 | 52 | 53 | if __name__ == '__main__': 54 | # 测试数据 55 | areaId = '4581' 56 | cityName = '重庆' 57 | originUrl = 'http://cq.meituan.com/meishi/b4581/' 58 | page = '1' 59 | 60 | token = MakeToken(areaId, cityName, originUrl, page) 61 | print(token.join_token) 62 | -------------------------------------------------------------------------------- /其他实战/【美团】数据解析、token生成/get_login_cookies.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-05-21 Python: 3.7 4 | import asyncio 5 | import json 6 | 7 | from pyppeteer import launch 8 | 9 | 10 | class MeiTuanCookies(): 11 | def __init__(self, username, password): 12 | self.login_url = 'https://passport.meituan.com/account/unitivelogin' 13 | self.username = username 14 | self.password = password 15 | 16 | async def 
start(self): 17 |         browser = await launch() 18 |         context = await browser.createIncognitoBrowserContext() 19 |         page = await context.newPage() 20 |         await page.evaluateOnNewDocument('() =>{ Object.defineProperties(navigator,' 21 |                                          '{ webdriver:{ get: () => false } }) }')  # 本页刷新后值不变 22 | 23 |         await page.goto(self.login_url) 24 |         await page.type('input#login-email', self.username) 25 |         await page.type('input#login-password', self.password) 26 |         await page.click('input.btn') 27 |         await self.get_cookie(page) 28 | 29 |     async def get_cookie(self, page): 30 |         """ 31 |         获取 cookies 32 |         :param page: 页面 33 |         :return: 34 |         """ 35 |         cookies_list = await page.cookies() 36 |         cookies = '' 37 |         for cookie in cookies_list: 38 |             str_cookie = '{0}={1};' 39 |             str_cookie = str_cookie.format(cookie.get('name'), cookie.get('value')) 40 |             cookies += str_cookie 41 |         print(cookies) 42 | 43 | 44 | if __name__ == '__main__': 45 |     name = input('美团账号') 46 |     pwd = input('密码') 47 |     mt = MeiTuanCookies(name, pwd) 48 |     loop = asyncio.get_event_loop() 49 |     loop.run_until_complete(mt.start()) 50 | -------------------------------------------------------------------------------- /其他实战/【美团】数据解析、token生成/parse_food_comments.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-04-17 Python: 3.7 4 | 5 | import requests 6 | import json 7 | import time 8 | 9 | from urllib import parse 10 | 11 | 12 | class ParseComments(object): 13 |     def __init__(self, shop_id): 14 |         self.shop_id = shop_id 15 | 16 |         self.get_data() 17 | 18 |     def get_data(self): 19 |         url_code = self.get_originUrl() 20 | 21 |         url = 'http://www.meituan.com/meishi/api/poi/getMerchantComment?' 22 |         params = { 23 |             'platform': '1', 24 |             'partner': '126', 25 |             'originUrl': url_code, 26 |             'riskLevel': '1', 27 |             'optimusCode': '1', 28 |             'id': self.shop_id, 29 |             'offset': '0', 30 |             'pageSize': '10', 31 |             'sortType': '1', 32 |         } 33 |         headers = { 34 |             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36', 35 |         } 36 |         response = requests.get(url=url, params=params, headers=headers) 37 |         data = response.text 38 |         self.parse(data) 39 | 40 |     def get_originUrl(self): 41 |         """编码解码 42 |         """ 43 |         return parse.quote_plus('http://www.meituan.com/meishi/' + self.shop_id + '/') 44 | 45 |     def parse(self, data): 46 |         """解析数据 47 |         """ 48 |         data_dict = json.loads(data) 49 |         for item in data_dict.get('data').get('comments'): 50 |             create_time = self.parse_time(item.get('commentTime')) 51 |             print_str = """ 52 |             评论用户:{userName} 53 |             评论时间:{create_time} 54 |             评论详情:{comment} 55 |             评论id:{reviewId} 56 |             """.format(userName=item.get('userName'), comment=item.get('comment'), create_time=create_time, 57 |                        reviewId=item.get('reviewId')) 58 |             print(print_str) 59 | 60 |     @staticmethod 61 |     def parse_time(timeStamp): 62 |         """13位 解码时间 63 |         """ 64 |         time_stamp = float(int(timeStamp) / 1000) 65 |         time_array = time.localtime(time_stamp) 66 |         return time.strftime("%Y-%m-%d %H:%M:%S", time_array) 67 | 68 | 69 | if __name__ == '__main__': 70 |     p_id = input('请输入餐馆id') 71 |     ParseComments(p_id) 72 | -------------------------------------------------------------------------------- /其他实战/【美团】数据解析、token生成/parse_hotel_comments.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-06-10 Python: 3.7 4 | 5 | """ 6 | 解析酒店评论 7
| """ 8 | 9 | import requests 10 | import json 11 | import time 12 | 13 | 14 | class ParseComments(object): 15 | """解析酒店评论 16 | """ 17 | def __init__(self, hotel_id): 18 | self.hotel_id = hotel_id 19 | self.get_data() 20 | 21 | def get_data(self): 22 | 23 | url = 'https://ihotel.meituan.com/group/v1/poi/comment/' + self.hotel_id + '?' 24 | params = { 25 | 'sortType': 'default', 26 | 'noempty': '1', 27 | 'withpic': '0', 28 | 'filter': 'all', 29 | 'limit': '10', 30 | 'offset': '0', 31 | } 32 | headers = { 33 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36', 34 | } 35 | response = requests.get(url=url, params=params, headers=headers) 36 | data = response.text 37 | self.parse(data) 38 | 39 | def parse(self, data): 40 | """解析数据 41 | """ 42 | data_dict = json.loads(data) 43 | for item in data_dict.get('data').get('feedback'): 44 | create_time = self.parse_time(item.get('replytimestamp')) 45 | print_str = """ 46 | 评论用户:{userName} 47 | 评论时间:{create_time} 48 | 评论详情:{comment} 49 | 满意度:{scoretext} 50 | """.format(userName=item.get('username'), comment=item.get('comment'), create_time=create_time, 51 | scoretext=item.get('scoretext')) 52 | print(print_str) 53 | self.parse_pic(item) 54 | 55 | @staticmethod 56 | def parse_time(timeStamp): 57 | """13位 解码时间 58 | """ 59 | time_array = time.localtime(timeStamp) 60 | return time.strftime("%Y-%m-%d %H:%M:%S", time_array) 61 | 62 | def parse_pic(self, item): 63 | pic_list = [i.get('url').replace('w.h', '750.0') for i in item.get('picinfo')] 64 | print(pic_list) 65 | 66 | 67 | if __name__ == '__main__': 68 | p_id = input('请输入酒店id') 69 | ParseComments(p_id) 70 | -------------------------------------------------------------------------------- /其他实战/【美团】数据解析、token生成/parse_hotel_info.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-06-05 Python: 3.7 4 | 5 | """ 6 | 解析 7 | 美团酒店店铺的基础信息 8 | 该板块信息隐藏在get请求后的js中直接用正则匹配出信息再抽取出来 9 | """ 10 | import requests 11 | import re 12 | import json 13 | import time 14 | 15 | 16 | class ParseHotelInfo(object): 17 | headers = { 18 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36', 19 | } 20 | 21 | def __init__(self, p_id): 22 | self.p_id = p_id 23 | 24 | def go_to_hotel(self): 25 | """执行访问 26 | """ 27 | # 拼接日期 28 | now_day = time.strftime('%Y-%m-%d', time.localtime(time.time())) 29 | 30 | # 组合 get 地址 31 | url = 'https://hotel.meituan.com/' + self.p_id + '/?ci=' + now_day + '&co=' + now_day 32 | data = requests.get(url, headers=self.headers).content.decode('utf-8') 33 | 34 | # 提取有效区域 35 | info = re.search(r'window.__INITIAL_STATE__=(.*?)', data, flags=re.DOTALL) 36 | if info: 37 | info_dict = json.loads(info.group(1).strip()[:-1]) 38 | self.parse_html(info_dict) 39 | else: 40 | print('访问失效') 41 | 42 | def parse_html(self, data_dict): 43 | data = data_dict.get('poiData') 44 | print('店名', data.get('name')) 45 | print('店铺id', data.get('poiid')) 46 | print('城市id', data.get('cityId')) 47 | print('地址', data.get('addr')) 48 | print('lng', data.get('lng')) 49 | print('lat', data.get('lat')) 50 | print('封面', data.get('frontImg').replace('w.h', '750.0')) 51 | print('wifi', data.get('wifi')) 52 | print('地区id', data.get('areaId')) 53 | print('地区名', data.get('areaName')) 54 | print('平均消费', data.get('avgPrice')) 55 | print('类别id', data.get('brandId')) 
56 | print('类别名', data.get('brandName')) 57 | print('简介', data.get('introduction')) 58 | print('星级', data.get('highHotelStar')) 59 | print('舒适类型', data.get('hotelStar')) 60 | print('电话', [i.get('phone') for i in data.get('phoneList')]) 61 | print('平均分', data.get('avgScore')) 62 | print('标签', data.get('poiAttrTagList')) 63 | print('城市名', data.get('cityName')) 64 | print('城市拼音', data.get('cityPinyin')) 65 | 66 | poi_data = data_dict.get('poiExt') # 酒店详情 67 | print('服务', [i.get('attrDesc') for i in poi_data.get('serviceIconsInfo').get('serviceIcons')]) 68 | print('酒店介绍', {i.get('attrDesc'): i.get('attrValue') for i in poi_data.get('hotelIntroInfo').get('poiExtendsInfos')}) 69 | 70 | 71 | if __name__ == '__main__': 72 | print("""\033[1;33m请输入酒店ID \033[0m""") 73 | _id = input('(链接末尾数字就是ID)') 74 | # _id = '41823880' # 测试 75 | hotel = ParseHotelInfo(_id) 76 | hotel.go_to_hotel() 77 | -------------------------------------------------------------------------------- /其他实战/【美团】数据解析、token生成/parse_play_areas.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-05-05 Python: 3.7 4 | 5 | import requests 6 | import json 7 | import re 8 | from pypinyin import pinyin 9 | 10 | 11 | class ParseAreas(object): 12 | 13 | def __init__(self, city_name): 14 | self.alphabet = "".join([i[0][0] for i in pinyin(city_name)]) 15 | 16 | self.get_data() 17 | 18 | def get_data(self): 19 | 20 | url = 'https://{city}.meituan.com/xiuxianyule/' 21 | headers = { 22 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36', 23 | } 24 | target_url = url.format(city=self.alphabet) 25 | response = requests.get(target_url, headers=headers) 26 | data = response.text 27 | self.parse(data, target_url) 28 | 29 | @ staticmethod 30 | def parse(data, url): 31 | """解析数据 32 | """ 33 | py_dict = {} 34 | text = re.search(r'"city":{"id":(.*?),"name":"(.*?)","pinyin".*?"area":(.*?),"category":', data) 35 | if text: 36 | py_dict = {'城市': text.group(2), '城市ID': text.group(1)} 37 | dict_info = json.loads(text.group(3)).get('children') # 提取区域信息 38 | py_dict['区'] = [] 39 | 40 | for node in dict_info: 41 | if node.get('name') == '推荐商圈': 42 | continue # 推荐商圈过滤 43 | # 二级区域 44 | district = {'区名': node.get('name'), '区ID': node.get('id'), 45 | '区链接': url + 'b' + str(node.get('id')) + '/'} 46 | if node.get('children'): 47 | district['街道'] = [] 48 | # 三级区域 49 | for i in node.get('children'): 50 | area = {'街道名': i.get('name'), '街道ID': i.get('id'), 51 | '街道链接': url + 'b' + str(i.get('id')) + '/'} 52 | district['街道'].append(area) 53 | 54 | py_dict['区'].append(district) 55 | 56 | print(json.dumps(py_dict, ensure_ascii=False)) 57 | 58 | 59 | if __name__ == '__main__': 60 | print(""" 61 | \033[1;33m娱乐板块区域解析 62 | 请输入城市名例如 北京 63 | 返回json格式\033[0m 64 | """) 65 | chines = input('输入城市名') 66 | ParseAreas(chines) 67 | -------------------------------------------------------------------------------- /其他实战/【美团】数据解析、token生成/parse_play_info.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-05-08 Python: 3.7 4 | 5 | """ 6 | 解析 7 | 美团休闲娱乐商铺信息 8 | 该板块信息隐藏在get请求后的js中直接用正则匹配出信息再抽取出来 9 | """ 10 | import requests 11 | import re 12 | import json 13 | 14 | 15 | class ParsePlayInfo(object): 16 | target_url = 'http://www.meituan.com/xiuxianyule/{p_id}/' 17 | headers = { 
18 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36', 19 | } 20 | 21 | def __init__(self, restaurant_id): 22 | self.restaurant_id = str(restaurant_id) 23 | 24 | self.go_to_restaurant() 25 | 26 | def go_to_restaurant(self): 27 | """执行访问 28 | """ 29 | url = self.target_url.format(p_id=self.restaurant_id) 30 | data = requests.get(url, headers=self.headers).text 31 | 32 | # 提取有效区域 33 | data = re.search(r'"params":{"poiInfo":(.*?)},"fallbackPara', data, flags=re.DOTALL) 34 | if data: 35 | self.parse_html(json.loads(data.group(1))) 36 | else: 37 | print('访问失效') 38 | 39 | def parse_html(self, data): 40 | print('商铺ID', self.restaurant_id) 41 | print('城市ID', data.get('catId')) 42 | print('城市', data.get('cityName')) 43 | print('城市拼音', data.get('cityPy')) 44 | print('店铺', data.get('shopName')) 45 | print('评分', data.get('score')) 46 | print('平均消费', data.get('avgPrice')) 47 | print('地址', data.get('address')) 48 | print('电话', data.get('phone')) 49 | print('营业时间', data.get('openTime')) 50 | print('封面图片', data.get('headIcon')) 51 | print('wifi', data.get('wifi')) # 有=1 无=0 52 | print('停车', data.get('park')) # 如果有例如:免费提供5个停车位。 没有为空 53 | print('经度', data.get('lng')) 54 | print('纬度', data.get('lat')) 55 | print('类型', data.get('breadCrumbNavDTOList')[2].get('title')[len(data.get('cityName')):]) 56 | 57 | albums = [] 58 | images = data.get('albumDTOList') 59 | for node in images: 60 | albums.append(node.get('url')) 61 | print('相册', albums) 62 | 63 | 64 | if __name__ == '__main__': 65 | print(""" 66 | \033[1;33m请输入商铺ID \033[0m 67 | """) 68 | p_id = input('(商铺网址末尾数字就是ID)') 69 | ParsePlayInfo(p_id) 70 | -------------------------------------------------------------------------------- /其他实战/【试客联盟】登录/login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-11-23 Python: 3.7 4 | 5 | 6 | import execjs 7 | import requests, re 8 | 9 | s = requests.Session() 10 | 11 | 12 | def main(pwd): 13 | """res_n 这个参数 是从网页获取的 但调试发现是其实固定的 14 | :param pwd: 15 | :return: 16 | """ 17 | with open('execute.js', 'r', encoding='utf-8') as f: 18 | js = execjs.compile(f.read()) 19 | 20 | print('引擎', execjs.get().name) 21 | sign = js.call('get_pwd', pwd) 22 | return sign 23 | 24 | 25 | def login(sign_pwd, username): 26 | url = "http://login.shikee.com/check/?&_1574394219820" 27 | data = { 28 | "username": username, 29 | "password": sign_pwd, 30 | "vcode": '', 31 | "to": 'http://user.shikee.com/', 32 | } 33 | res = s.post(url=url, data=data) 34 | res.encoding = "utf-8" 35 | print(res.text) 36 | 37 | 38 | def home(): 39 | home_url = "http://user.shikee.com/buyer" 40 | response = s.get(home_url) 41 | html = response.content.decode('utf-8') 42 | data = re.findall( 43 | '
.*?您好!(.*?)您有未读提醒.*?1.*?
', 44 | html, re.S)[0] 45 | print(data) 46 | 47 | 48 | if __name__ == '__main__': 49 | username = input('请输入账户:') 50 | pwd = input('请输入密码:') 51 | sign = main(pwd) 52 | print('正在登录....') 53 | login(sign, username) 54 | home() 55 | -------------------------------------------------------------------------------- /其他实战/【谷雨】数字解密/GuYu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-25 Python: 3.7 4 | 5 | import requests 6 | import os 7 | from fontTools.ttLib import TTFont 8 | 9 | 10 | class Font: 11 | """ 12 | https://guyujiezi.com/ 13 | 谷雨解字的 数字解密 14 | 现在版本的 谷雨字体加的 xml 会有一个移位操作 15 | """ 16 | def __init__(self, uri): 17 | self.url = uri 18 | self.filename = uri.split('/')[-1] 19 | self.font = None 20 | self._list = [] 21 | 22 | def check(self): 23 | """检查目录 24 | """ 25 | if not os.path.isfile(self.filename): 26 | resp = requests.get(self.url) 27 | with open(self.filename, 'wb') as f: 28 | f.write(resp.content) 29 | # TTFont 存为 xml 30 | self.font = TTFont(self.filename) 31 | self.font.saveXML(self.filename.replace(self.filename.split('.')[-1], 'xml')) 32 | 33 | def get_wo(self): 34 | """获取 woff 35 | """ 36 | self.check() 37 | ph = self.font['cmap'] 38 | _dict = ph.tables[0].cmap 39 | # 1. 字典取 value 列表化 40 | # 2. str 取最后 2 位,并转为 int 41 | # 3. 减去 17 并重新组装列表 42 | self._list = [int(i[-2:])-17 for i in list(_dict.values())] 43 | """ 44 | 处理移位 45 | """ 46 | print(list(_dict.values())) 47 | print(self._list) 48 | 49 | def parse(self, number): 50 | _str = '' 51 | for num in number: 52 | _str += str(self._list[int(num)]) 53 | print('最终展示字', int(_str)) 54 | 55 | 56 | if __name__ == '__main__': 57 | ft = Font("https://guyujiezi.com/fonts/2DLw9u/3iZbr8.woff") 58 | ft.get_wo() 59 | # 输入页面数字测试 60 | ft.parse('947') 61 | 62 | 63 | -------------------------------------------------------------------------------- /其他实战/【豆瓣】自动登录/DouBan.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2020-01-08 Python: 3.7 4 | 5 | import requests 6 | import re 7 | 8 | 9 | class DouBan: 10 | def __init__(self, name, pwd): 11 | self.name = name.strip() 12 | self.pwd = pwd.strip() 13 | self.session = requests.session() 14 | self.headers = { 15 | 'Origin': 'https://accounts.douban.com', 16 | 'Host': 'accounts.douban.com', 17 | 'Referer': 'https://accounts.douban.com/passport/login_popup?login_source=anony', 18 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36', 19 | } 20 | self.login_url = 'https://accounts.douban.com/j/mobile/login/basic' 21 | self.index_url = "https://www.douban.com/" 22 | 23 | 24 | def login(self): 25 | data = { 26 | 'ck': '', 27 | 'name': self.name, 28 | 'password': self.pwd, 29 | 'remember': 'false', 30 | 'ticket': '', 31 | } 32 | self.session.post(self.login_url, data=data, headers=self.headers) 33 | 34 | def check(self): 35 | self.headers['Host'] = 'www.douban.com' 36 | response = self.session.get("https://www.douban.com/", headers=self.headers) 37 | try: 38 | title = re.search(r'(.*?)的帐号', response.text).group(1) 39 | print('【登录成功】', title) 40 | except AttributeError:  # 页面中未匹配到用户名,视为登录失败 41 | print('【登录失败】') 42 | 43 | 44 | if __name__ == '__main__': 45 | username = input('豆瓣用户名 >>>') 46 | password = input('密码 >>>') 47 | db = DouBan(username, password) 48 | db.login() 49 | db.check() 50 | 
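# 登录成功后 self.session 已携带登录 cookies,可直接复用它继续请求需要登录的页面。
# 下面两行是一个最小示意(假设:具体要抓的页面与解析逻辑按实际需求替换):
# resp = db.session.get(db.index_url, headers=db.headers)
# print(resp.status_code)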
-------------------------------------------------------------------------------- /其他实战/【逗游】自动登录/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-06 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【逗游】自动登录/douyou.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-08-01 Python: 3.7 4 | 5 | import js2py 6 | import requests 7 | import json 8 | 9 | 10 | class DouYou: 11 | headers = { 12 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36', 13 | 'Referer': 'http://www.doyo.cn/passport/login' 14 | } 15 | 16 | def __init__(self, username, password): 17 | self.context = js2py.EvalJs() # python中使用js 18 | self.username = username 19 | self.password = password 20 | 21 | def make_password(self): 22 | """取加密后的字符串 23 | """ 24 | try: 25 | nonce, ts = self.get_token() 26 | with open("encryp.js", "r", encoding="utf-8") as f: 27 | self.context.execute(f.read()) 28 | pwd_hash = self.context.get_value(self.password, nonce, ts) 29 | return pwd_hash # 打印加密之后的密码 30 | except: 31 | print('获取token失败') 32 | 33 | def get_token(self): 34 | """获取 token 35 | """ 36 | get_token_url = 'http://www.doyo.cn/User/Passport/token?username={user}&random=0.1428378278012199'.format(user=self.username) 37 | result = json.loads(requests.get(get_token_url).text) 38 | if result.get('result'): 39 | nonce = result.get('nonce') 40 | ts = result.get('ts') 41 | return nonce, ts 42 | else: 43 | print('获取token失败') 44 | exit() 45 | 46 | def login(self): 47 | """登陆 48 | """ 49 | # decode('unicode_escape') 50 | login_url = 'http://www.doyo.cn/passport/login' 51 | data = { 52 | 'username': self.username, 53 | 'password': self.make_password(), 54 | 'remberme': '1', 55 | 'next': 'aHR0cCUzQSUyRiUyRnd3dy5kb3lvLmNuJTJG' 56 | } 57 | response = requests.post(login_url, data=data, headers=self.headers) 58 | info = json.loads(response.text) 59 | if info.get('result'): 60 | print('登陆成功 | 用户等级:{level} 用户id:{uid}'.format(level=info.get('level'), uid=info.get('uid'))) 61 | else: 62 | print('登陆失败') 63 | 64 | 65 | if __name__ == '__main__': 66 | user = input('输入逗游账号') 67 | pwd = input('输入密码') 68 | dy = DouYou(user, pwd) 69 | dy.login() 70 | -------------------------------------------------------------------------------- /其他实战/【金逸电影】自动注册/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-06 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【金逸电影】自动注册/register.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/其他实战/【金逸电影】自动注册/register.png -------------------------------------------------------------------------------- /其他实战/【金逸电影】自动注册/register.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-04 Python: 3.7 4 | 5 | import requests 6 | import execjs.runtime_names 7 | 8 | 9 | class JinYiRegister: 10 | """ 11 | 金逸电影注册 12 | 
http://www.jycinema.com/wap/#/register 13 | """ 14 | def __init__(self, phone): 15 | self.headers = { 16 | 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1', 17 | } 18 | self.url = 'http://www.jycinema.com/frontUIWebapp/appserver/photoMessageService/newsSendMessage' 19 | self.phone = phone 20 | 21 | @staticmethod 22 | def js_make(json_data): 23 | with open('encryp.js', 'r', encoding='utf-8') as f: 24 | js = execjs.compile(f.read()) 25 | try: 26 | result = js.call("getEncryption", json_data) 27 | return result 28 | except Exception: 29 | print('js 异常') 30 | 31 | def register(self): 32 | data = '{"mobileNumber": ' + self.phone + ', "channelId": 7, "channelCode": "J0005", "memberId": ""}' 33 | data = { 34 | 'params': self.js_make(data), 35 | 'Origin': 'http://www.jycinema.com', 36 | 'Referer': 'http://www.jycinema.com/wap/', 37 | } 38 | response = requests.post(self.url, data=data, headers=self.headers) 39 | print(response.content.decode('utf-8')) 40 | 41 | 42 | if __name__ == '__main__': 43 | your_phone = input('请输入待注册手机号') 44 | jy = JinYiRegister(your_phone) 45 | jy.register() 46 | -------------------------------------------------------------------------------- /其他实战/【青海移动】登陆参数生成/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-06 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【青海移动】登陆参数生成/make_param.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-12 Python: 3.7 4 | import execjs.runtime_names 5 | 6 | 7 | class QinHaiYiDong: 8 | """ 9 | 青海移动 10 | 参数加密 11 | https://www.iqhmall.cn/shopweb/logon/logon 12 | """ 13 | def __init__(self, user, pwd): 14 | self.js = None 15 | self.user = user 16 | self.pwd = pwd 17 | self.init_js() 18 | 19 | def init_js(self): 20 | print('引擎', execjs.get().name) 21 | with open("encryp.js", "r", encoding="utf-8") as f: 22 | self.js = execjs.compile(f.read()) 23 | 24 | def make_param(self): 25 | print(self.js.call('test', self.pwd)) 26 | 27 | 28 | if __name__ == '__main__': 29 | yd = QinHaiYiDong('17327362817', '123123123') 30 | yd.make_param() 31 | -------------------------------------------------------------------------------- /其他实战/【餐饮】查询信息/FoodInfo.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | # Time : 2020/01/16 3 | # Author : Zok 4 | # Email : 362416272@qq.com 5 | 6 | import requests 7 | import re 8 | import json 9 | from copyheaders import headers_raw_to_dict 10 | 11 | 12 | class Food: 13 | """ 14 | 根据输入美团餐馆名,解析参观基础信息 15 | """ 16 | def __init__(self): 17 | self.headers = headers_raw_to_dict(b""" 18 | Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9 19 | Accept-Encoding: gzip, deflate, br 20 | Accept-Language: zh-CN,zh;q=0.9 21 | Cache-Control: max-age=0 22 | Connection: keep-alive 23 | Cookie: _lxsdk_s=16fb0ce3a0d-4cf-d9e-cf2%7C%7C1 24 | Host: www.meituan.com 25 | Sec-Fetch-Mode: navigate 26 | Sec-Fetch-Site: none 27 | Sec-Fetch-User: ?1 28 | Upgrade-Insecure-Requests: 1 29 | User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 
Safari/537.36 30 | """) 31 | 32 | def get_info(self, url): 33 | response = requests.get(url, headers=self.headers) 34 | data = json.loads(re.search(r'{ Object.defineProperties(navigator,' 23 | '{ webdriver:{ get: () => false } }) }') # 本页刷新后值不变 24 | 25 | await page.goto(self.login_url) 26 | await page.type('input#login-email', username) 27 | await page.type('input#login-password', password) 28 | await page.click('input.btn') 29 | await self.get_cookie(page,username,password) 30 | 31 | async def get_cookie(self, page,username,password): 32 | """ 33 | 获取 cookies 34 | :param page: 页面 35 | :return: 36 | """ 37 | cookies_list = await page.cookies() 38 | cookies = '' 39 | for cookie in cookies_list: 40 | str_cookie = '{0}={1};' 41 | str_cookie = str_cookie.format(cookie.get('name'), cookie.get('value')) 42 | cookies += str_cookie 43 | # 储存cookies 44 | print(cookies) 45 | self.r.set(username, json.dumps({'password': password, 'cookies': cookies})) 46 | 47 | 48 | if __name__ == '__main__': 49 | mt = MeiTuanCookies() 50 | 51 | with open('账号.txt', 'r', encoding='utf-8') as f: 52 | # 账号|密码\n 53 | lines = f.readlines() 54 | 55 | tasks = [] 56 | for line in lines: 57 | username, password = line.strip().split('|') 58 | tasks.append(mt.star(username, password)) 59 | 60 | loop = asyncio.get_event_loop() 61 | loop.run_until_complete(asyncio.wait(tasks)) 62 | 63 | -------------------------------------------------------------------------------- /原创爬虫工具/Cookies/MeiTuan/账号.txt: -------------------------------------------------------------------------------- 1 | 账号1|密码1 2 | 账号2|密码2 3 | 账号3|密码3 -------------------------------------------------------------------------------- /原创爬虫工具/Cookies/README.md: -------------------------------------------------------------------------------- 1 | # 异步批量登陆美团获取cookies 2 | 3 | > pyppeteer 异步批量登陆美团并将cookies储存到redis 的hash表中 -------------------------------------------------------------------------------- /原创爬虫工具/Cookies/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-05-14 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /原创爬虫工具/DataMigration/README.md: -------------------------------------------------------------------------------- 1 | # 工作中经常有这种需求 2 | > 将采集好的mongodb数据转存到mysql中,或者是redis数据转到mongodb,于是打算封装一个组件便于以后调用 3 | 4 | # mysql转存mongo 5 | 1. 在 config 中配置 mongo 与 mysql 连接 6 | 2. 在 `msyql_to_mongo.py` 下方实例化时填入 `需要转换mysql表名`, `mongo库名`, `mongo表名` 7 | 3. 
调用 `mi.easy_to_mongo()` 即可将 mysql 中的数据导入到 mongodb 8 | 9 | > 当然也支持自定义转换,在类中添加即可 -------------------------------------------------------------------------------- /原创爬虫工具/DataMigration/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-05-15 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /原创爬虫工具/DataMigration/config.py: -------------------------------------------------------------------------------- 1 | # mongodb链接 2 | MONGODB_URL = 'mongodb://localhost:27017' 3 | 4 | # Redis数据库地址 5 | REDIS_HOST = '' 6 | 7 | # Redis端口 8 | REDIS_PORT = 6379 9 | 10 | # Redis密码,如无填None 11 | REDIS_PASSWORD = None 12 | 13 | # Mysql地址 14 | MYSQL_HOST = '127.0.0.1' 15 | 16 | # Mysql端口 17 | MYSQL_PORT = 3306 18 | 19 | # Mysql用户名 20 | MYSQL_USER = 'root' 21 | 22 | # Mysql密码 23 | MYSQL_PASSWORD = '' 24 | 25 | # Mysql链接库 26 | MYSQL_DB_NAME = 'travel' 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /原创爬虫工具/DataMigration/db/MongoDB.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-05-15 Python: 3.7 4 | from pymongo import MongoClient 5 | 6 | from DataMigration.config import MONGODB_URL 7 | 8 | 9 | class Mongo(object): 10 | def __init__(self, db_name, collection): 11 | client = MongoClient(MONGODB_URL) 12 | database = client[db_name] 13 | self.collection = database[collection] 14 | 15 | def delete(self, *args, del_one=True): 16 | """ 17 | 删除符合条件的信息 18 | :param args: 过滤条件 19 | :param del_one: 默认删除第一条,否则删除符合条件的所有 20 | :return: 21 | """ 22 | return self.collection.delete_one(*args) if del_one else self.collection.delete_many(*args) 23 | 24 | @property 25 | def all(self): 26 | """ 27 | 返回全部 28 | :return: 整表信息 29 | """ 30 | return self.collection.find({}) 31 | 32 | def find(self, *args): 33 | """ 34 | 指定查找 35 | :param args: 查询条件 36 | :return: 37 | """ 38 | return self.collection.find(*args) 39 | 40 | def update(self, *args, update_one=True): 41 | """ 42 | 修改数据 43 | :param args: 过滤条件与更新内容 44 | :param update_one: 默认修改第一个,否则修改符合条件的所有 45 | :return: 46 | """ 47 | return self.collection.update_one(*args) if update_one else self.collection.update_many(*args) 48 | 49 | def insert(self, *args, insert_one=True): 50 | """ 51 | 插入数据 52 | :param args: 要插入的文档 53 | :param insert_one: 默认插入一个 54 | :return: 55 | """ 56 | return self.collection.insert_one(*args) if insert_one else self.collection.insert_many(*args) 57 | 58 | 59 | if __name__ == '__main__': 60 | # 测试 61 | mg = Mongo('meituan', 'user_info') 62 | # data = mg.all 63 | ret = mg.update({'用户名': '三丰948'}, {'$set': {'用户名': '三三风'}}) 64 | -------------------------------------------------------------------------------- /原创爬虫工具/DataMigration/db/Mysql.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-05-15 Python: 3.7 4 | import pymysql 5 | 6 | from DataMigration.config import MYSQL_HOST, MYSQL_PORT, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB_NAME 7 | 8 | 9 | class Mysql(object): 10 | def __init__(self): 11 | """ 12 | 链接数据库 13 | """ 14 | self.conn = pymysql.Connect( 15 | host=MYSQL_HOST, 16 | port=MYSQL_PORT, 17 | user=MYSQL_USER, 18 | password=MYSQL_PASSWORD, 19 | db=MYSQL_DB_NAME, 20 | ) 21 | 22 | def insert(self, sql): 23 | """ 24 | 
查找 25 | :param sql: sql语句 26 | :return: 27 | """ 28 | # 创建游标对象 29 | cursor = self.conn.cursor() 30 | # 执行并提交 31 | try: 32 | cursor.execute(sql) 33 | self.conn.commit() 34 | except Exception as e: 35 | print('异常回滚') 36 | self.conn.rollback() 37 | finally: 38 | cursor.close() 39 | 40 | def select(self, sql): 41 | """ 42 | 查找 43 | :param sql: sql 语句 44 | :return: 查找结果 45 | """ 46 | cursor = self.conn.cursor() # 创建游标对象 47 | # 提交事务 48 | try: 49 | cursor.execute(sql) 50 | data = cursor.fetchall() 51 | except Exception as e: 52 | print('异常回滚') 53 | data = None 54 | self.conn.rollback() 55 | finally: 56 | cursor.close() 57 | return data 58 | 59 | -------------------------------------------------------------------------------- /原创爬虫工具/DataMigration/db/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-05-15 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /原创爬虫工具/DataMigration/migration/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-05-15 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /原创爬虫工具/DataMigration/migration/mongo_to_mysql.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-05-23 Python: 3.7 4 | 5 | 6 | from DataMigration.db.MongoDB import Mongo 7 | from DataMigration.db.Mysql import Mysql 8 | from DataMigration.config import MYSQL_DB_NAME 9 | 10 | 11 | class Migrate(object): 12 | def __init__(self,mysql_table_name, mongodb_name, mongodb_collection): 13 | self.mongo = Mongo(mongodb_name, mongodb_collection) 14 | self.mysql = Mysql() 15 | self.mysql_name = mysql_table_name 16 | 17 | def easy_to_mongo(self, column_comment=False): 18 | """ 19 | 将输入插入 mongodb 20 | :return: 21 | """ 22 | columns = self.get_column() 23 | nodes = self.all_mysql_data() 24 | data_list = [] 25 | 26 | for node in nodes: 27 | data_dict = {} 28 | for index, column in enumerate(columns): 29 | if column_comment: 30 | data_dict[column[1]] = node[index] 31 | else: 32 | data_dict[column[0]] = node[index] 33 | data_list.append(data_dict) 34 | try: 35 | self.mongo.insert(data_list, insert_one=False) 36 | print('储存成功') 37 | except Exception: 38 | print('转存失败') 39 | 40 | def all_mysql_data(self): 41 | """ 42 | 获取需要转换的数据 43 | :return: 所有 mysql 数据 44 | """ 45 | sql = """SELECT * from {table_name};""".format(table_name=self.mysql_name) 46 | return self.mysql.select(sql) 47 | 48 | def get_column(self): 49 | """ 50 | 取字段名 51 | :return: (字段名,字段描述) 52 | """ 53 | sql = """select COLUMN_NAME,column_comment 54 | from INFORMATION_SCHEMA.Columns 55 | where table_name='{table_name}' and table_schema='{db_name}'""".format( 56 | table_name=self.mysql_name, 57 | db_name=MYSQL_DB_NAME, 58 | ) 59 | return self.mysql.select(sql) 60 | 61 | 62 | if __name__ == '__main__': 63 | mi = Migrate('需要转换mysql表名', 'mongo库名', 'mongo表名') 64 | mi.easy_to_mongo(column_comment=True) # column_comment=True 使用注释的字段名, 默认不使用 65 | -------------------------------------------------------------------------------- /原创爬虫工具/DataMigration/migration/mysql_to_mongo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = 
"zok" 362416272@qq.com 3 | # Date: 2019-05-20 Python: 3.7 4 | 5 | 6 | from DataMigration.db.MongoDB import Mongo 7 | from DataMigration.db.Mysql import Mysql 8 | from DataMigration.config import MYSQL_DB_NAME 9 | 10 | 11 | class Migrate(object): 12 | def __init__(self, mysql_table_name, mongodb_name, mongodb_collection): 13 | self.mongo = Mongo(mongodb_name, mongodb_collection) 14 | self.mysql = Mysql() 15 | self.mysql_name = mysql_table_name 16 | 17 | def easy_to_mongo(self, column_comment=False): 18 | """ 19 | 将输入插入 mongodb 20 | :return: 21 | """ 22 | columns = self.get_column() 23 | nodes = self.all_mysql_data() 24 | data_list = [] 25 | 26 | for node in nodes: 27 | data_dict = {} 28 | for index, column in enumerate(columns): 29 | if column_comment: 30 | data_dict[column[1]] = node[index] 31 | else: 32 | data_dict[column[0]] = node[index] 33 | data_list.append(data_dict) 34 | try: 35 | self.mongo.insert(data_list, insert_one=False) 36 | print('储存成功') 37 | except Exception: 38 | print('转存失败') 39 | 40 | def all_mysql_data(self): 41 | """ 42 | 获取需要转换的数据 43 | :return: 所有 mysql 数据 44 | """ 45 | sql = """SELECT * from {table_name};""".format(table_name=self.mysql_name) 46 | return self.mysql.select(sql) 47 | 48 | def get_column(self): 49 | """ 50 | 取字段名 51 | :return: (字段名,字段描述) 52 | """ 53 | sql = """select COLUMN_NAME,column_comment 54 | from INFORMATION_SCHEMA.Columns 55 | where table_name='{table_name}' and table_schema='{db_name}'""".format( 56 | table_name=self.mysql_name, 57 | db_name=MYSQL_DB_NAME, 58 | ) 59 | return self.mysql.select(sql) 60 | 61 | 62 | if __name__ == '__main__': 63 | mi = Migrate('需要转换mysql表名', 'mongo库名', 'mongo表名') 64 | mi.easy_to_mongo(column_comment=True) # column_comment=True 使用注释的字段名, 默认不使用 65 | -------------------------------------------------------------------------------- /原创爬虫工具/Decode/README.md: -------------------------------------------------------------------------------- 1 | # 可拓展式解密器 2 | > 方便测试可连续转换重制的编码转换器,可灵活拓展解码规则 3 | 4 | 5 | # 说明博客 6 | 7 | [**博客地址**](https://www.zhangkunzhi.com/?p=241) 8 | 9 | 10 | -------------------------------------------------------------------------------- /原创爬虫工具/Decode/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-06-01 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /原创爬虫工具/Decode/translation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-04-28 Python: 3.7 4 | 5 | import base64 6 | import zlib 7 | 8 | COLOR = {'red': 1, 'green': 2, 'yellow': 3, 'blue': 4} 9 | 10 | 11 | class TranslationMetaClass(type): 12 | """Meta 类""" 13 | def __new__(mcs, name, bases, attrs): 14 | count = 0 15 | attrs['__Decode__'] = {} 16 | for k, v in attrs.items(): 17 | if 'decode_' in k: 18 | count += 1 19 | attrs['__Decode__'][str(count)] = k 20 | attrs['__TranslationFuncCount__'] = count 21 | return type.__new__(mcs, name, bases, attrs) 22 | 23 | 24 | class Util(object): 25 | """辅助类""" 26 | 27 | @staticmethod 28 | def _print(color, msg): 29 | """print color control 30 | """ 31 | node = '\033[1;3{id}m{msg}\033[0m' 32 | if COLOR.get(color): 33 | print(node.format(id=COLOR.get(color), msg=msg)) 34 | else: 35 | print(msg) 36 | 37 | def msg(self): 38 | """print decode func 39 | """ 40 | for k in self.__Decode__: 41 | self._print('yellow', str(k) + 
': ' + self.__Decode__[k][7:]) 42 | self._print('yellow', 'r: 【重制】 e:【退出】') 43 | return input('请选择 >>>').lower() 44 | 45 | 46 | class Decode(Util, metaclass=TranslationMetaClass): 47 | """ 48 | 将需要添加的转码类型按下列类似格式添加即可 49 | def decode_自定义名(self): 50 | self._key = 解密过程 51 | """ 52 | def __init__(self, _key): 53 | self._key = _key 54 | self._copy = _key 55 | self.crumbs = '' 56 | 57 | def main(self): 58 | choice = self.msg() 59 | while choice != 'e': 60 | if choice == 'r': # 重制 61 | self._key, self.crumbs = self._copy, '' 62 | self._print('blue', '重制成功: ' + self._key) 63 | choice = self.msg() 64 | elif choice in self.__Decode__: # 选择是否在现有函数选项中 65 | try: 66 | eval("self.{}()".format(self.__Decode__[choice])) # 字符串转函数运行 67 | self._print('blue', self._key) 68 | self.crumbs += self.__Decode__[choice][7:] + ' > ' 69 | self._print('green', self.crumbs) 70 | choice = self.msg() 71 | except Exception: 72 | choice = input('解码失败,换一种 >>>') 73 | 74 | self._print('red', '调试结束') 75 | 76 | def decode_base64(self): 77 | """解base64""" 78 | self._key = base64.b64decode(self._key) 79 | 80 | def decode_zlib(self): 81 | """解压串""" 82 | self._key = zlib.decompress(self._key) 83 | 84 | def decode_str(self): 85 | """转字符串""" 86 | self._key = str(self._key, encoding="utf-8") 87 | 88 | def decode_hex(self): 89 | """转到16进制""" 90 | self._key = self._key.hex() 91 | 92 | 93 | if __name__ == '__main__': 94 | # _key = 'eJyrVnqxZdnT/u1KVgpKpcWpRUo6CkpP17c9X9AIEilILC4uzy9KUaoFAGxTEMo=' # 测试 95 | _key = input('\033[1;31m输入解码内容>>> \033[0m') 96 | ts = Decode(_key) 97 | ts.main() 98 | -------------------------------------------------------------------------------- /原创爬虫工具/Jsencrypt/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-06-29 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /原创爬虫工具/Jsencrypt/make_encrypt.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-06-28 Python: 3.7 4 | 5 | import base64 6 | 7 | from Crypto.Cipher import PKCS1_v1_5 as Cipher_pkcs1_v1_5 8 | from Crypto.PublicKey import RSA 9 | 10 | 11 | public_key = """ 12 | -----BEGIN PUBLIC KEY----- 13 | Your PUBLIC KEY 14 | -----END PUBLIC KEY----- 15 | """ 16 | 17 | 18 | def make_message(pwd): 19 | rsakey = RSA.importKey(public_key) 20 | cipher = Cipher_pkcs1_v1_5.new(rsakey) 21 | cipher_text = base64.b64encode(cipher.encrypt(pwd.encode(encoding="utf-8"))) 22 | return cipher_text.decode('utf8') 23 | 24 | 25 | if __name__ == '__main__': 26 | print(make_message('hellow')) 27 | -------------------------------------------------------------------------------- /原创爬虫工具/OSS/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-06-24 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /原创爬虫工具/OSS/push_to_oss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-06-24 Python: 3.7 4 | 5 | """ 6 | 将图redis中储存的网络图片链接,并发直传到 OSS 上 7 | """ 8 | 9 | import oss2 10 | import redis 11 | import requests 12 | 13 | from concurrent.futures import ThreadPoolExecutor # 线程池模块 14 | 15 | KEY = '' 16 | 
KEYSECRET = '' 17 | BUCKETNAME = '' 18 | ENDPOINT = 'http://oss-cn-hangzhou.aliyuncs.com' 19 | 20 | REDIS_HOST = "localhost" 21 | REDIS_USER = "root" 22 | REDIS_PASSWORD = "" 23 | REDIS_DB_NAME = 1 24 | REDIS_PORT = 6379 25 | 26 | list_name = 'restaurant' # 列队名 27 | 28 | # oss 29 | auth = oss2.Auth(KEY, KEYSECRET) 30 | bucket = oss2.Bucket(auth, ENDPOINT, BUCKETNAME) 31 | 32 | # redis 池 33 | pool = redis.ConnectionPool(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB_NAME, password=REDIS_PASSWORD, 34 | decode_responses=True) 35 | r = redis.Redis(connection_pool=pool) 36 | 37 | 38 | def put_img(): 39 | """上传逻辑,根据项目需求修改即可""" 40 | url = r.rpop(list_name) 41 | resp = requests.get(url) 42 | if resp.status_code == 200: 43 | file_name = url # this is file name 44 | obj = bucket.put_object(file_name, resp) 45 | if obj.status == 200: 46 | print('OK', file_name) 47 | else: 48 | r.lpush(list_name, url) # 下载失败时把链接推回队列 49 | 50 | 51 | def get_len(): 52 | return r.llen(list_name) 53 | 54 | 55 | if __name__ == '__main__': 56 | list_len = get_len() 57 | print('专辑总图数量', list_len) 58 | pool = ThreadPoolExecutor() # 设置线程池大小,默认等于cpu核数 59 | for i in range(list_len): 60 | pool.submit(put_img) 61 | 62 | pool.shutdown(wait=True) 63 | print('主进程') 64 | -------------------------------------------------------------------------------- /原创爬虫工具/Proxy/KDLProxyPool.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-06-17 Python: 3.7 4 | 5 | """ 6 | 快代理IP池 7 | https://www.kuaidaili.com/ 放代理API 8 | """ 9 | 10 | import redis 11 | import requests 12 | import json 13 | 14 | from apscheduler.schedulers.blocking import BlockingScheduler 15 | 16 | 17 | class KDLProxyPool(object): 18 | """ 19 | 快代理IP池 20 | 用的快代理开放代理API 21 | """ 22 | 23 | def __init__(self, key, count): 24 | try: 25 | self.key = key # 订单号 26 | self.count = count # 代理池代理数量 27 | """redis数据库配置区""" 28 | pool = redis.ConnectionPool(decode_responses=True) 29 | self.r = redis.Redis(connection_pool=pool) 30 | except: 31 | print('请填入正确的API链接') 32 | 33 | def check_ip(self): 34 | """ 35 | 监控 IP 分数、个数,对其进行增删 36 | """ 37 | # 检查分数 38 | nodes = self.r.zrevrange('KDLProxy', 0, -1, withscores=True) 39 | for i in nodes: 40 | node = list(i) 41 | score = int(node[1]) 42 | if score <= 0: 43 | print('\033[1;33m分数过低剔除\033[0m') 44 | self.r.zrem('KDLProxy', node[0]) 45 | 46 | # 检查个数 47 | _sum = self.r.zcard('KDLProxy') 48 | if _sum < self.count: 49 | self.add_ip(self.count - _sum) 50 | 51 | def add_ip(self, num): 52 | """ 53 | 提取IP 54 | """ 55 | get_url = 'http://svip.kdlapi.com/api/getproxy/?orderid={key}&num={num}&protocol=2&method=2&an_ha=1&sp1=1&quality=2&format=json&sep=1'.format( 56 | key=self.key, num=num) 57 | 58 | # 返回的文本进行解析 59 | response = requests.get(get_url) 60 | if response.status_code == 200: 61 | ret = json.loads(response.text) 62 | if ret.get('code') == 0: 63 | self.parse(ret.get('data').get('proxy_list')) 64 | else: 65 | print(ret.get('msg')) 66 | else: 67 | print('提取失败') 68 | 69 | def parse(self, proxy_list): 70 | """ 71 | 解析返回数据 72 | """ 73 | for node in proxy_list: 74 | self.save_to_redis(node, 10) # 默认10分 75 | 76 | def save_to_redis(self, proxy, expire): 77 | """ 78 | 推送到redis集合中 79 | """ 80 | print('代理 %s 推入redis集合' % proxy) 81 | self.r.zadd('KDLProxy', {proxy: expire}) 82 | 83 | 84 | def aps_run(): 85 | """ 86 | 监控 87 | """ 88 | kdl.check_ip() 89 | 90 | 91 | kdl = KDLProxyPool('填写开放代理订单号', 20) 92 | 93 | # 循环监控 94 | scheduler = BlockingScheduler() 95 | 
scheduler.add_job(aps_run, 'cron', second='*/1') # 这里设置检测评论,推荐2s一次(默认) 96 | scheduler.start() 97 | -------------------------------------------------------------------------------- /原创爬虫工具/Proxy/README.md: -------------------------------------------------------------------------------- 1 | [TOC] 2 | 3 | # 安装模块 4 | 5 | ```bush 6 | pip3 install redis 7 | pip3 install apscheduler 8 | pip3 install reuqest 9 | pip3 install python-dateutil 10 | ``` 11 | 12 | # 讯代理池使用 13 | 1. 登陆讯代理 进入API页码将下面下方生成的API复制 14 | ![讯代理API](https://www.zhangkunzhi.com/images/xdl3.png) 15 | 16 | 2. 将链接复制到项目该位置 17 | ![讯代理API](https://www.zhangkunzhi.com/images/xdl4.png) 18 | 19 | 3. 配置redis, 默认是本机 20 | ![讯代理API](https://www.zhangkunzhi.com/images/xdl5.png) 21 | 22 | 4. 启动程序,大功告成,只需要在调用ip的时候对其进行增减分操作即可 23 | ![讯代理API](https://www.zhangkunzhi.com/images/xdl1.png) 24 | ![讯代理API](https://www.zhangkunzhi.com/images/xdl2.png) 25 | 26 | # 芝麻代理池使用 27 | 28 | 1. 首先登陆你的芝麻代理后台管理,找到自己的key如图 29 | ![key位置](https://www.zhangkunzhi.com/images/芝麻1.png) 30 | 31 | 1. 在代码下方配置key 32 | ![key位置](https://www.zhangkunzhi.com/images/填入芝麻key.png) 33 | 34 | 1. 在代码中配置 redis库连接 **默认链接的本地** 35 | ![key位置](https://www.zhangkunzhi.com/images/代理模块.png) 36 | 37 | 1. 启动程序 38 | > 如果在服务端可以使用后台运行命令 39 | `nohup python3 ProxyPool.py >my.log &` 40 | 41 | 1. 第一次启动芝麻代理会绑定你的ip白名单,稍等片刻就会开始提取 42 | 43 | ![key位置](https://www.zhangkunzhi.com/images/提取ip.png) 44 | 45 | 1. 链接redis可以看到ip池了,大功告成 46 | ![key位置](https://www.zhangkunzhi.com/images/20个ip.png) 47 | 48 | 1. 后续在使用代理ip时,根据访问结果对代理ip积分增减即可,后续会更新这个Demo继续关注Github即可。[**传送门**](https://github.com/wkunzhi/SpiderUtilPackage) 49 | 50 | 51 | # 额外配置 52 | - 可以自由配置,代理池上线值(默认20),实例化时配置即可 53 | ```python 54 | zm = ZhiMaPool('key', ip_sum=100) 55 | ``` 56 | - 可以自由配置,只取可用时间xx以上的ip(默认1号套餐下的1000秒以上),实例化时配置即可 57 | ```python 58 | zm = ZhiMaPool('key', ttl=1000) 59 | ``` 60 | - 还可以配置 每次提取数、提取套餐类型、提取ip HTTP或者HTTPS或者Sockets 61 | -------------------------------------------------------------------------------- /原创爬虫工具/Proxy/XDLProxyPool.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-05-09 Python: 3.7 4 | import redis 5 | import requests 6 | import re 7 | import time 8 | import json 9 | 10 | from apscheduler.schedulers.blocking import BlockingScheduler 11 | 12 | """ 13 | 可自型拓展其他的代理ip产品,只需修改调用接口即可 14 | """ 15 | 16 | 17 | class XDLProxyPool(object): 18 | """ 19 | 迅代理IP池 20 | """ 21 | 22 | def __init__(self, api_url): 23 | try: 24 | """redis数据库配置区""" 25 | pool = redis.ConnectionPool(decode_responses=True) 26 | self.r = redis.Redis(connection_pool=pool) 27 | 28 | """白名单初始化""" 29 | ret = re.search(r'spiderId=(.*?)&orderno=(.*?)&returnType=\d+&count=(\d+)', api_url) 30 | self.spiderId, self.orderno, self.count = ret.group(1), ret.group(2), int(ret.group(3)) 31 | self.init_proxy() 32 | except: 33 | print('请填入正确的API链接') 34 | 35 | def init_proxy(self): 36 | """ 37 | 初始化代理 38 | """ 39 | print('\033[1;35m初始化中...\033[0m') 40 | 41 | # 取出当前IP地址 42 | response = requests.get('http://pv.sohu.com/cityjson?ie=utf-8') 43 | address = re.search(r'"cip": "(.*?)", "cid', response.text).group(1) 44 | 45 | # 加入白名单 46 | url = 'http://www.xdaili.cn/ipagent/newWhilteList/updateByOrder?orderno={orderno}&ip={ip}&spiderId={spiderId}'.format( 47 | orderno=self.orderno, ip=address, spiderId=self.spiderId) 48 | status = requests.get(url=url).status_code 49 | if status == 200: 50 | print('\033[1;35m初始化成功,启动中稍等..\033[0m') 51 | time.sleep(2) 52 | print('监控已开启') 53 
| else: 54 | print('初始化白名单失败') 55 | 56 | def check_ip(self): 57 | """ 58 | 监控 IP 分数、个数,对其进行增删 59 | """ 60 | 61 | # 检查分数 62 | nodes = self.r.zrevrange('XDLProxy', 0, -1, withscores=True) 63 | for i in nodes: 64 | node = list(i) 65 | score = int(node[1]) 66 | if score <= 0: 67 | print('\033[1;33m分数过低剔除\033[0m') 68 | self.r.zrem('XDLProxy', node[0]) 69 | 70 | # 检查个数 71 | _sum = self.r.zcard('XDLProxy') 72 | if _sum < self.count: 73 | self.add_ip(self.count - _sum) 74 | 75 | def add_ip(self, count): 76 | """ 77 | 提取IP 78 | """ 79 | get_url = 'http://api.xdaili.cn/xdaili-api//greatRecharge/getGreatIp?spiderId={spiderId}&orderno={orderno}&returnType=2&count={count}'.format( 80 | spiderId=self.spiderId, orderno=self.orderno, count=str(count)) 81 | 82 | # 返回的文本进行解析 83 | response = requests.get(get_url) 84 | if response.status_code == 200: 85 | ret = json.loads(response.text) 86 | if ret.get('ERRORCODE') in ['10036', '10038', '10055']: 87 | print('提取速度过快5秒钟提取一次') 88 | elif ret.get('ERRORCODE') == '10032': 89 | print('余额不足或今日已到提取上线') 90 | else: 91 | self.parse(ret) 92 | else: 93 | print('提取失败') 94 | 95 | def parse(self, data): 96 | """ 97 | 解析返回数据 98 | """ 99 | proxy_list = data.get('RESULT') 100 | for node in proxy_list: 101 | proxy = node.get('ip') + ':' + node.get('port') 102 | self.save_to_redis(proxy, 10) # 默认10分 103 | 104 | def save_to_redis(self, proxy, expire): 105 | """ 106 | 推送到redis集合中 107 | """ 108 | print('代理 %s 推入redis集合' % proxy) 109 | self.r.zadd('XDLProxy', {proxy: expire}) 110 | 111 | 112 | def aps_run(): 113 | """ 114 | 监控 115 | """ 116 | xdl.check_ip() 117 | 118 | 119 | # 填入提取链接 120 | xdl = XDLProxyPool('填写讯代理api链接') 121 | 122 | # 循环监控 123 | scheduler = BlockingScheduler() 124 | scheduler.add_job(aps_run, 'cron', second='*/1') # 这里设置检测评论,推荐2s一次(默认) 125 | scheduler.start() 126 | -------------------------------------------------------------------------------- /原创爬虫工具/Proxy/XDLProxyUseDemo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-05-11 Python: 3.7 4 | 5 | import redis 6 | import random 7 | 8 | # 在scrapy中使用 代理池的demo 9 | 10 | 11 | """ 12 | middleware中代码如下 13 | """ 14 | 15 | pool = redis.ConnectionPool(decode_responses=True) # redis 池 16 | r = redis.Redis(connection_pool=pool) 17 | 18 | 19 | 20 | 21 | """ 22 | middleware中配置代理中间键 23 | 注意,根据爬取网址是http 还是https 来设置 24 | """ 25 | 26 | class MyProxy(object): 27 | """代理IP设置""" 28 | def process_request(self, request, spider): 29 | # 此处对接redis 30 | data = r.zrangebyscore('XDLProxy', 1, 100, withscores=True) 31 | ip, score = random.choice(data) 32 | request.meta['proxy'] = 'http://'+ip # 根据自己情况填写 33 | 34 | 35 | 36 | 37 | """ 38 | 拦截中间键中配置如下,写入计分器,满分20分 39 | """ 40 | 41 | class DownloaderMiddleware(object): 42 | def process_response(self, request, response, spider): 43 | # 对代理ip进行清洗 44 | proxy = request._meta.get('proxy') 45 | if not response.status == 200: 46 | print('IP访问失败') 47 | if proxy: 48 | proxy = proxy[proxy.find('/')+2:] # 提取当此访问proxy 49 | r.zincrby('XDLProxy', -1, proxy) # redis 命令修改 50 | else: 51 | if proxy: 52 | proxy = proxy[proxy.find('/') + 2:] # 提取当此访问proxy 53 | score = r.zscore('XDLProxy', proxy) # 取出分数 54 | if score < 20: 55 | r.zincrby('XDLProxy', 1, proxy) # redis 新版本命令更改这样了 56 | return response 57 | 58 | def process_exception(self, request, exception, spider): # 可能由于IP质量问题无法访问超时 59 | print('超时异常') 60 | proxy = request._meta.get('proxy') 61 | if proxy: 62 | proxy = proxy[proxy.find('/') + 2:] 63 | 
r.zincrby('XDLProxy', -1, proxy) # redis 新版本命令更改这样了 64 | return request 65 | 66 | 67 | """ 68 | setting中配置 69 | """ 70 | DOWNLOAD_TIMEOUT = 5 # 有的时候代理ip失效,会导致一直卡在那里 ,也有可能是用http 访问https 71 | DOWNLOADER_MIDDLEWARES = { 72 | 'middlewares.MyProxy': 543, # 自定义代理IP 73 | 'middlewares.spiderDownloaderMiddleware': 600, # 拦截301、302等跳转 必须设置到600 74 | } -------------------------------------------------------------------------------- /原创爬虫工具/Proxy/ZhiMaProxyUseDemo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-05-07 Python: 3.7 4 | import redis 5 | import random 6 | 7 | # 在scrapy中使用 代理池的demo 8 | 9 | 10 | """ 11 | scrapy 中 middleware中代码如下 12 | """ 13 | 14 | pool = redis.ConnectionPool(decode_responses=True) 15 | r = redis.Redis(connection_pool=pool) 16 | 17 | 18 | 19 | 20 | """ 21 | middleware中配置代理中间键 22 | 注意,根据爬取网址是http 还是https 来设置 23 | """ 24 | 25 | class MyProxy(object): 26 | """代理IP设置""" 27 | def process_request(self, request, spider): 28 | # 此处对接redis 29 | data = r.zrange('ZhiMaProxy', 0, -1, withscores=True) 30 | ip, score = random.choice(data) 31 | request.meta['proxy'] = 'http://'+ip 32 | 33 | 34 | 35 | 36 | """ 37 | 拦截中间键中配置如下,写入计分器,满分20分 38 | """ 39 | 40 | class DownloaderMiddleware(object): 41 | def process_response(self, request, response, spider): 42 | # 对代理ip进行清洗 43 | proxy = request._meta.get('proxy') 44 | if response.status == 302: 45 | print('IP访问失败') 46 | if proxy: 47 | proxy = proxy[proxy.find('/')+2:] 48 | r.zincrby('ZhiMaProxy', -10000000000, proxy) # redis 命令修改 49 | elif response.status == 200: 50 | if proxy: 51 | proxy = proxy[proxy.find('/') + 2:] 52 | score = r.zscore('ZhiMaProxy', proxy) 53 | if score < 200000000000: 54 | r.zincrby('ZhiMaProxy', 10000000000, proxy) # redis 新版本命令更改这样了 55 | return response 56 | 57 | def process_exception(self, request, exception, spider): # 可能由于IP质量问题无法访问超时,必须在这里捕获然后扣分 58 | print('超时异常') 59 | proxy = request._meta.get('proxy') 60 | if proxy: 61 | proxy = proxy[proxy.find('/') + 2:] 62 | r.zincrby('ZhiMaProxy', -10000000000, proxy) # redis 新版本命令更改这样了 63 | return request 64 | 65 | 66 | """ 67 | setting中配置 68 | """ 69 | DOWNLOAD_TIMEOUT = 5 # 有的时候代理ip失效,会导致一直卡在那里 ,也有可能是用http 访问https 70 | DOWNLOADER_MIDDLEWARES = { 71 | 'middlewares.MyProxy': 543, # 自定义代理IP 72 | 'middlewares.spiderDownloaderMiddleware': 600, # 拦截301、302等跳转 73 | } -------------------------------------------------------------------------------- /原创爬虫工具/README.md: -------------------------------------------------------------------------------- 1 | ## 工具表 2 | - [x] [解密工具-可拓展式解密器](https://github.com/wkunzhi/SpiderUtilPackage/tree/master/Decode) 3 | - [x] [自动注册-验证短信接收器](https://github.com/wkunzhi/SpiderUtilPackage/tree/master/Register) 4 | - [x] [代理IP-芝麻代理池监控器](https://github.com/wkunzhi/SpiderUtilPackage/tree/master/Proxy) 5 | - [x] [代理IP-芝麻代理池客户端Demo](https://github.com/wkunzhi/SpiderUtilPackage/tree/master/Proxy) 6 | - [x] [代理IP-讯代理池监控器](https://github.com/wkunzhi/SpiderUtilPackage/tree/master/Proxy) 7 | - [x] [代理IP-讯代理池客户端Demo](https://github.com/wkunzhi/SpiderUtilPackage/tree/master/Proxy) 8 | - [x] [代理IP-快代理池监控器](https://github.com/wkunzhi/SpiderUtilPackage/tree/master/Proxy) 9 | - [x] [cookies获取-pyppeteer获取美团登陆cookies](https://github.com/wkunzhi/SpiderUtilPackage/tree/master/Cookies) 10 | - [x] [跨数据库迁移器-开发中](https://github.com/wkunzhi/SpiderUtilPackage/tree/master/DataMigration) 11 | - [x] 
[网络图片并发直传OSS](https://github.com/wkunzhi/SpiderUtilPackage/tree/master/OSS) 12 | - [x] [生成encrypt加密参数器](https://github.com/wkunzhi/SpiderUtilPackage/tree/master/Jsencrypt) 13 | 14 |
15 | 16 | 17 | 18 | # 可拓展式解密器 19 | 20 | [**博客传送门**](https://blog.zhangkunzhi.com/2019/06/02/%E5%8E%9F%E5%88%9B%E5%B7%A5%E5%85%B7%E4%B9%8B%E5%8F%AF%E6%8B%93%E5%B1%95%E8%A7%A3%E7%A0%81%E5%99%A8/index.html) 21 | 22 | > 方便测试可连续转换重制的编码转换器,可灵活拓展解码规则 23 | 24 | ![](https://zok-blog.oss-cn-hangzhou.aliyuncs.com/images/特殊.gif) 25 | 26 |
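上面提到"可灵活拓展解码规则":做法是往 `translation.py` 的 `Decode` 类里按 `decode_前缀` 的命名格式加方法,元类会自动把它登记进选择菜单。下面是一个最小示意(假设:规则名 `decode_unquote` 为自定义示例,仓库中并不自带):

```python
from urllib import parse

# 把该方法粘贴进 Decode 类体内即可被元类自动登记
def decode_unquote(self):
    """URL 反转义"""
    self._key = parse.unquote(self._key)
```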
26 | 27 | 28 | 29 | 30 | 31 | # 代理池清洗工具 32 | 33 | [**博客传送门**](https://blog.zhangkunzhi.com/2019/05/02/%E6%90%AD%E5%BB%BA%E4%B8%80%E4%B8%AA%E8%B6%85%E7%AE%80%E5%8D%95%E7%9A%84%E5%AE%9E%E7%94%A8%E7%9A%84%E9%AB%98%E5%8F%AF%E7%94%A8%E4%BB%98%E8%B4%B9IP%E6%B1%A0/index.html) 34 | 35 | > 爬虫经常会用到代理 ip,其中有很多收费 ip,但如何在 scrapy 中高效使用这些 ip 是一件比较麻烦的事情。这里基于[芝麻代理ip](http://h.zhimaruanjian.com/pay/)做一个代理池监控器:先整理我们的需求,再对代理质量进行管理,从而保持高效的 IP 使用率 36 | 37 | ![key位置](https://www.zhangkunzhi.com/images/提取ip.png) 38 | 39 | 40 | 
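消费端取用代理也很简单:从 redis 的有序集合里按分数过滤后随机取一个即可。下面是一个最小示意(假设:redis 在本机、集合名为 XDLProxy,与仓库客户端 Demo 保持一致):

```python
import random
import redis

r = redis.Redis(decode_responses=True)

def get_proxy():
    # 只取分数大于 0 的代理,分数即可用性评分
    candidates = r.zrangebyscore('XDLProxy', 1, 100, withscores=True)
    if not candidates:
        return None
    ip, _score = random.choice(candidates)
    return 'http://' + ip

print(get_proxy())
```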
41 | 42 | # 验证码短信接收器 43 | 44 | > 基于短信接收平台的异步短信接收器,最大并发上限 20,Python3.5+。 45 | 启动后会根据设置的异步并发数获取手机号码并监听短信接收情况(60秒),超过 60 秒仍未收到短信的手机号会被拉入黑名单并释放。 46 | 47 | 若要配置具体某个网站使用,还需开发对应的账号注册器,配合调用本短信接收器来达到自动注册账号的功能 48 | 49 | 
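其中"最大并发上限 20"可以用 asyncio 的信号量来限流,下面是一个思路示意(假设:`fetch_and_listen` 为占位协程,真实的取号、收短信接口以 Register 目录内的实现为准):

```python
import asyncio

sem = asyncio.Semaphore(20)  # 并发上限 20

async def fetch_and_listen(task_id):
    async with sem:
        # 占位:此处应调用平台接口取号,并在 60 秒内轮询短信,
        # 超时则把号码拉入黑名单并释放
        await asyncio.sleep(1)
        print('完成', task_id)

loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(*[fetch_and_listen(i) for i in range(50)]))
```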
50 | 51 | # cookies获取Demo 52 | 53 | > 基于Pyppeteer 并发获取站点cookies 54 | - 美团登陆cookies 55 | ![](https://www.zhangkunzhi.com/images/异步获取cookies.png) 56 | 57 | 58 | # 跨数据库迁移器 59 | **工作中经常有这种需求** 60 | > 将采集好的mongodb数据转存到mysql中,或者是redis数据转到mongodb,于是打算封装一个组件便于以后调用 61 | 62 | - [x] mysql 数据迁移 mongodb 63 | ![](https://www.zhangkunzhi.com/images/to_mongo1.png) 64 | ![](https://www.zhangkunzhi.com/images/to_mongo2.png) -------------------------------------------------------------------------------- /原创爬虫工具/Register/README.md: -------------------------------------------------------------------------------- 1 | # 注册短信并发异步接收器 2 | 3 | > 基于短信接收平台的异步短信接收器,最大并发上限20,Python3.5+ 4 | 5 | `pip3 install asyncio` 6 | `pip3 install aiohttp` 7 | 8 | [平台网址](http://www.51ym.me/User/Default.aspx) 9 | 10 | ## 使用步骤 11 | 1. 实例化对象时填入平台 token 12 | 2. 实例化对象时填入后台查询的项目 id 13 | 3. 实例化对象时填入手机短信并发上限(最大20并发) 14 | 15 | > 启动后会根据设置的异步并发数进行获取手机号码并监听短信接收情况(60秒) 超过60秒后会将未收到短信的手机号拉入黑名单,并是释放。 16 | 17 | 若要配置具体某个网站使用,还需开发对应的账号注册器,配合调用本短信接收器来达到自动注册账号的功能 -------------------------------------------------------------------------------- /原创爬虫工具/Register/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-05-13 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /原创爬虫工具/zok/README.md: -------------------------------------------------------------------------------- 1 | # Zok 组件使用说明 2 | > by: 362416272@qq.com 自用 3 | 4 | ### 目录 5 | - repetition 内容更新处理 6 | - save 通用持久化存储组件 7 | - random_UA 随机UA 8 | - proxies 阿布云代理组件 9 | 10 | 11 | 12 | **mysql储存** 13 | 1. 必须在zok_config中配置要持久化的数据库账户密码 14 | 2. 在爬虫项目文件pipelines管道中,引入并使用 15 | ```python 16 | from zok.save.to_mysql import SaveToMysqlBase 17 | 18 | class CityLandmarkListPipeline(SaveToMysqlBase): 19 | member = 'city' # redis集合名 如果是分布式无需设置 20 | 21 | @staticmethod 22 | def get_sql(item): 23 | sql = """INSERT INTO base_city_landmark(city, county, landmark) VALUES ("{city}","{county}","{landmark}") """.format( 24 | city=item['city'], 25 | county=item['county'], 26 | landmark=item['landmark'], 27 | ) 28 | return sql 29 | 30 | '''必须调用 def_sql(item)方法,并返回sql语句即可''' 31 | ``` 32 | 33 | **随机UA** 34 | ```python 35 | # setting.py中 加入即可 36 | DOWNLOADER_MIDDLEWARES = { 37 | 'zok.random_UA.ua_random.RandomUserAgentMiddleware': 20, 38 | } 39 | ``` 40 | 41 | **代理ip设置** 42 | ```python 43 | # 在setting中配置即可 44 | DOWNLOADER_MIDDLEWARES = { 45 | 'zok.proxies.proxies.ProxyMiddleware': 15, # 自定义的中间件 46 | } 47 | ``` 48 | 49 | **基于redis内容去重更新** 50 | > 原理: 在储存数据之前取到hash数据值,并加以对比,如果有值就跳过不储存,无值就set(md5, id) 51 | 1. 开启redis服务 52 | 2. 在 zok_config中配置 redis配置 53 | 3. 
应用储存组件 mysql 就会自动启用去重增量更新功能 54 | -------------------------------------------------------------------------------- /原创爬虫工具/zok/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 3 | # Date: 2019/3/5 Python: 3.7 4 | -------------------------------------------------------------------------------- /原创爬虫工具/zok/get_db/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 3 | # Date: 2019/3/16 Python: 3.7 4 | -------------------------------------------------------------------------------- /原创爬虫工具/zok/get_db/from_mongodb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-04-23 Python: 3.7 4 | from pymongo import MongoClient 5 | 6 | from zok.zok_config import MONGODB_URL 7 | 8 | client = MongoClient(MONGODB_URL) 9 | 10 | database = client.meituan_db # 链接数据库 11 | collection = database.href_coolections # 链接结合 12 | 13 | data = collection.find({},{'_id': 0}) 14 | -------------------------------------------------------------------------------- /原创爬虫工具/zok/get_db/from_mysql.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 3 | # Date: 2019/3/7 Python: 3.7 4 | 5 | import pymysql 6 | 7 | from zok.zok_config import * 8 | 9 | 10 | def get_data(sql): 11 | conn = pymysql.Connect( 12 | host=MYSQL_HOST, 13 | port=MYSQL_PORT, 14 | user=MYSQL_USER, 15 | password=MYSQL_PASSWORD, 16 | db=MYSQL_DB_NAME, 17 | ) 18 | # 创建游标对象 19 | cursor = conn.cursor() 20 | # 提交事务 21 | try: 22 | cursor.execute(sql) 23 | data = cursor.fetchall() 24 | cursor.close() 25 | conn.close() 26 | return data 27 | except Exception as e: 28 | print(e) 29 | print('异常回滚') 30 | conn.rollback() 31 | cursor.close() 32 | conn.close() 33 | return None 34 | 35 | -------------------------------------------------------------------------------- /原创爬虫工具/zok/proxies/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 3 | # Date: 2019/3/7 Python: 3.7 4 | -------------------------------------------------------------------------------- /原创爬虫工具/zok/proxies/proxies.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 3 | # Date: 2019/3/5 Python: 3.7 4 | 5 | import base64 6 | from zok.zok_config import * 7 | 8 | # 代理服务器 9 | proxyServer = "http://http-dyn.abuyun.com:9020" 10 | 11 | 12 | proxyAuth = "Basic " + base64.urlsafe_b64encode(bytes((PROXY_USER + ":" + PROXY_PASS), "ascii")).decode("utf8") 13 | 14 | 15 | class ProxyMiddleware(object): 16 | """自定义中间件代理IP""" 17 | def process_request(self, request, spider): 18 | request.meta["proxy"] = proxyServer 19 | request.headers["Proxy-Authorization"] = proxyAuth 20 | 21 | -------------------------------------------------------------------------------- /原创爬虫工具/zok/random_UA/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 3 | # Date: 2019/3/7 Python: 3.7 4 | -------------------------------------------------------------------------------- /原创爬虫工具/zok/random_UA/ua_random.py: -------------------------------------------------------------------------------- 1 | # 
-*- coding: utf-8 -*- 2 | # __author__ = "zok" 3 | # Date: 2019/3/7 Python: 3.7 4 | import os 5 | 6 | from fake_useragent import UserAgent 7 | 8 | 9 | class RandomUserAgentMiddleware(object): 10 | """ 11 | first to use location because it is the fastest 12 | """ 13 | 14 | def __init__(self): 15 | location = os.getcwd() + '/zok/random_UA/fake_useragent.json' 16 | self.agent = UserAgent(path=location) # 调用本地 ua池 17 | # self.agent = UserAgent(verify_ssl=False) 18 | # self.agent = UserAgent(use_cache_server=False) 19 | 20 | @classmethod 21 | def from_crawler(cls, crawler): 22 | return cls() 23 | 24 | def process_request(self, request, spider): 25 | request.headers.setdefault('User-Agent', self.agent.random) 26 | 27 | -------------------------------------------------------------------------------- /原创爬虫工具/zok/repetition/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 3 | # Date: 2019/3/5 Python: 3.7 4 | -------------------------------------------------------------------------------- /原创爬虫工具/zok/repetition/update_cache.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 3 | # Date: 2019/3/7 Python: 3.7 4 | 5 | import redis 6 | import hashlib 7 | 8 | from zok.zok_config import REDIS_PORT, REDIS_DB_NAME, REDIS_HOST, REDIS_USER, REDIS_PASSWORD 9 | 10 | 11 | class CacheRedis(object): 12 | 13 | pool = redis.ConnectionPool(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB_NAME, password=REDIS_PASSWORD, decode_responses=True) 14 | r = redis.Redis(connection_pool=pool) 15 | # 加上decode_responses=True,写入的键值对中的value为str类型,不加这个参数写入的则为字节类型。 16 | 17 | # 1. 根据储存数据取值判断是否存在 18 | # 3. 不存在-已有数据: 需要更新 19 | # 4. 不存在-无数据: 需要插入 20 | # 5. 
存在 直接跳过储存 21 | 22 | # BUG 在redis数据库丢失的情况下【会全体重新录入】 23 | 24 | @staticmethod 25 | def get_md5(data): 26 | md5 = hashlib.md5(data.encode('utf-8')).hexdigest() 27 | return md5 28 | 29 | def redis_exists(self, member, md5): 30 | """ 31 | 验证 md5 是否已存在于集合中, 有返回True,没有返回False 32 | :param member: 验证区域集合Key 33 | :param md5: 要储存的数据 34 | :return: True or False 35 | """ 36 | 37 | if self.r.sismember(member, md5): 38 | return True 39 | else: 40 | return False 41 | 42 | def save_redis(self, member, md5): 43 | self.r.sadd(member, md5) 44 | 45 | -------------------------------------------------------------------------------- /原创爬虫工具/zok/save/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 3 | # Date: 2019/3/7 Python: 3.7 4 | -------------------------------------------------------------------------------- /原创爬虫工具/zok/save/to_mysql.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 3 | # Date: 2019/3/7 Python: 3.7 4 | 5 | import pymysql 6 | 7 | from zok.zok_config import * 8 | from zok.repetition.update_cache import CacheRedis 9 | 10 | 11 | class SaveToMysqlBase(object): 12 | """ 13 | mysql储存基类 14 | 新增语法 INSERT INTO 表名(city, county, district) VALUES ("%s","%s","%s") 15 | 更新语法 UPDATE 表名 SET mail = "playstation.com" WHERE user_name = "Peter" 16 | """ 17 | member = None # 不设置默认不开启 redis去重校验 18 | conn = None 19 | cursor = None # 游标对象 20 | redis = CacheRedis() 21 | 22 | def open_spider(self, spider): 23 | print('开始爬虫,连接数据库') 24 | self.conn = pymysql.Connect( 25 | host=MYSQL_HOST, 26 | port=MYSQL_PORT, 27 | user=MYSQL_USER, 28 | password=MYSQL_PASSWORD, 29 | db=MYSQL_DB_NAME, 30 | ) 31 | 32 | def process_item(self, item, spider): 33 | # 写sql语句 插数据,没有表的话要先在数据库创建 34 | sql = self.get_sql(item) 35 | if self.member: 36 | sql_md5 = self.redis.get_md5(sql) 37 | if not self.redis.redis_exists(self.member, sql_md5): 38 | # 创建游标对象 39 | self.cursor = self.conn.cursor() 40 | # 提交事务 41 | try: 42 | self.cursor.execute(sql) 43 | self.conn.commit() 44 | self.redis.save_redis(self.member, sql_md5) 45 | # int(conn.insert_id()) # 最新插入行的主键ID,conn.insert_id()一定要在conn.commit()之前,否则会返回0 46 | except Exception as e: 47 | print(e) 48 | print('异常回滚') 49 | self.conn.rollback() 50 | 51 | self.cursor.close() 52 | return item 53 | else: 54 | print('已有相同数据无需插入') 55 | else: 56 | # 创建游标对象 57 | self.cursor = self.conn.cursor() 58 | # 提交事务 59 | try: 60 | self.cursor.execute(sql) 61 | self.conn.commit() 62 | except Exception as e: 63 | print(e) 64 | print('异常回滚') 65 | self.conn.rollback() 66 | self.cursor.close() 67 | return item 68 | 69 | def close_spider(self, spider): 70 | print('爬虫结束, 关闭连接') 71 | self.conn.close() 72 | -------------------------------------------------------------------------------- /原创爬虫工具/zok/zok_config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 3 | # Date: 2019/3/5 Python: 3.7 4 | from urllib import parse 5 | 6 | 7 | MONGODB_URL = 'mongodb://localhost:27017' 8 | 9 | 10 | REDIS_HOST = "localhost" 11 | REDIS_USER = "root" 12 | REDIS_PASSWORD = "" 13 | REDIS_DB_NAME = 0 14 | REDIS_PORT = 6379 15 | 16 | 17 | # MySQL 配置(get_db/from_mysql.py 与 save/to_mysql.py 通过 import * 引用,按实际环境填写) 18 | MYSQL_HOST = '127.0.0.1' 19 | MYSQL_PORT = 3306 20 | MYSQL_USER = 'root' 21 | MYSQL_PASSWORD = '' 22 | MYSQL_DB_NAME = '' 23 | 24 | # 阿布云代理账号(proxies/proxies.py 引用) 25 | PROXY_USER = '' 26 | PROXY_PASS = '' 27 | -------------------------------------------------------------------------------- /滑动验证码/【w3c】滑块验证/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 
362416272@qq.com 3 | # Date: 2019-10-10 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /滑动验证码/【w3c】滑块验证/bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/滑动验证码/【w3c】滑块验证/bg.png -------------------------------------------------------------------------------- /滑动验证码/【w3c】滑块验证/chache.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/滑动验证码/【w3c】滑块验证/chache.png -------------------------------------------------------------------------------- /滑动验证码/【w3c】滑块验证/hk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/滑动验证码/【w3c】滑块验证/hk.png -------------------------------------------------------------------------------- /滑动验证码/【w3c】滑块验证/img/0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/滑动验证码/【w3c】滑块验证/img/0.png -------------------------------------------------------------------------------- /滑动验证码/【w3c】滑块验证/img/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/滑动验证码/【w3c】滑块验证/img/1.png -------------------------------------------------------------------------------- /滑动验证码/【w3c】滑块验证/img/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/滑动验证码/【w3c】滑块验证/img/2.png -------------------------------------------------------------------------------- /滑动验证码/【w3c】滑块验证/img/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/滑动验证码/【w3c】滑块验证/img/3.png -------------------------------------------------------------------------------- /滑动验证码/【腾讯】滑块验证/bg.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/滑动验证码/【腾讯】滑块验证/bg.jpeg -------------------------------------------------------------------------------- /滑动验证码/【腾讯】滑块验证/discriminate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-11 Python: 3.7 4 | 5 | 6 | """ 7 | pip3 install opencv-python 8 | """ 9 | 10 | import cv2 as cv 11 | 12 | 13 | def get_pos(image): 14 | """ 15 | 缺口轮廓检测 16 | 对付腾讯滑块够用 17 | 该方法识别率 95% 左右 18 | """ 19 | blurred = cv.GaussianBlur(image, (5, 5), 0) 20 | canny = cv.Canny(blurred, 200, 400) 21 | contours, hierarchy = cv.findContours(canny, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE) 22 | for i, contour in enumerate(contours): 23 | m = cv.moments(contour) 24 | if m['m00'] == 0: 25 | cx = cy = 0 26 | else: 27 | cx, cy = m['m10'] / m['m00'], m['m01'] / m['m00'] 28 | if 6000 < cv.contourArea(contour) < 8000 and 370 < cv.arcLength(contour, True) < 390: 29 | if cx < 400: 30 | continue 31 | x, y, w, h = 
cv.boundingRect(contour) # 外接矩形 32 | cv.rectangle(image, (x, y), (x + w, y + h), (0, 0, 255), 2) 33 | cv.imshow('image', image) 34 | return x 35 | return 0 36 | 37 | 38 | if __name__ == '__main__': 39 | """ 40 | 这里是滑块缺口识别 41 | 识别到后 42 | 1。可以通过自动化工具去拖动滑块 43 | 2。可以通过参数解析的形式生成参数提交通过验证 44 | """ 45 | img0 = cv.imread('bg.jpeg') 46 | get_pos(img0) 47 | cv.waitKey(0) 48 | cv.destroyAllWindows() 49 | -------------------------------------------------------------------------------- /项目/HouseScrapy/requirements: -------------------------------------------------------------------------------- 1 | scrapy 2 | scrapy-redis 3 | pymysql 4 | redis>=3.2.1 5 | pymongo -------------------------------------------------------------------------------- /项目/HouseScrapy/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = HouseScrapy 12 | -------------------------------------------------------------------------------- /项目/HouseScrapy/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # 阿布云代理 IP 4 | PROXY_USER = '' 5 | PROXY_PASS = '' 6 | 7 | BOT_NAME = 'HouseScrapy' 8 | 9 | SPIDER_MODULES = ['spiders'] 10 | NEWSPIDER_MODULE = 'spiders' 11 | 12 | # 否认协议 13 | ROBOTSTXT_OBEY = False 14 | 15 | # 随机延迟 16 | RANDOMIZE_DOWNLOAD_DELAY = True 17 | 18 | # 重试处理 19 | DOWNLOAD_FAIL_ON_DATALOSS = False 20 | 21 | # 设置超时时间 22 | DOWNLOAD_TIMEOUT = 5 23 | 24 | # MongoDB 25 | MONGODB_URL = 'mongodb://localhost:27017' 26 | MONGODB_DB = '房产' 27 | MONGODB_COLL = '地产数据' 28 | 29 | 30 | # Redis 31 | REDIS_HOST = '127.0.0.1' # 本机 32 | REDIS_WORD = None 33 | REDIS_PORT = 6379 34 | 35 | # 限流 秒/次 36 | DOWNLOAD_DELAY = 1 / 10 37 | 38 | # 禁止301 39 | # HTTPERROR_ALLOWED_CODES = [301] 40 | 41 | # 日志配置 42 | # LOG_LEVEL = 'WARNING' 43 | # LOG_FILE = 'log/error_log.txt' 44 | 45 | 46 | # Headers 47 | DEFAULT_REQUEST_HEADERS = { 48 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36', 49 | 'Host': 'www.funi.com' 50 | } 51 | 52 | 53 | """项目独立配置区""" 54 | 55 | # HOST 56 | HOST = 'http://www.funi.com' 57 | 58 | 59 | """===== 分布式配置区 =====""" 60 | 61 | # # 去重,利用set指纹去重 62 | # DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter' 63 | # 64 | # # 调度器 65 | # SCHEDULER = 'scrapy_redis.scheduler.Scheduler' 66 | # 67 | # # 去重指纹的set 68 | # SCHEDULER_PERSIST = True 69 | # 70 | # # 配置密码 71 | # REDIS_PARAMS = { 72 | # 'password': REDIS_WORD, 73 | # } 74 | # 75 | -------------------------------------------------------------------------------- /项目/HouseScrapy/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-07-15 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /项目/HouseScrapy/toolkits/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-07-15 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- 
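Note on the HouseScrapy settings above: the project ships UA and proxy middleware classes under `toolkits/` and defines the proxy credentials, but settings.py never registers the middlewares, so Scrapy would not invoke them. A minimal, hedged sketch of the missing wiring, assuming the flat module layout used here (`toolkits.make_ua` and `toolkits.proxies` importable from the project root):

```python
# Sketch only (not part of the original settings.py): register the custom
# downloader middlewares so requests actually pass through them.
DOWNLOADER_MIDDLEWARES = {
    # disable Scrapy's built-in UA middleware so the random UA is not overwritten
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'toolkits.make_ua.RandomUserAgentMiddleware': 400,
    'toolkits.proxies.ProxyMiddleware': 410,
}
```

The value 400 matches the slot of the default UserAgentMiddleware it replaces; the proxy middleware is placed right after it so the Proxy-Authorization header is set on the same request object.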
/项目/HouseScrapy/toolkits/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | 5 | 6 | class HousesItem(scrapy.Item): 7 | data = scrapy.Field() 8 | -------------------------------------------------------------------------------- /项目/HouseScrapy/toolkits/make_ua.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 3 | # Date: 2019/3/7 Python: 3.7 4 | import os 5 | 6 | from fake_useragent import UserAgent 7 | 8 | 9 | class RandomUserAgentMiddleware(object): 10 | """ 11 | Load the UA pool from the local JSON file first, because it is the fastest option. 12 | """ 13 | 14 | def __init__(self): 15 | location = os.getcwd() + '/toolkits/fake_useragent.json' 16 | self.agent = UserAgent(path=location) 17 | # self.agent = UserAgent(verify_ssl=False) 18 | # self.agent = UserAgent(use_cache_server=False) 19 | 20 | @classmethod 21 | def from_crawler(cls, crawler): 22 | return cls() 23 | 24 | def process_request(self, request, spider): 25 | request.headers.setdefault('User-Agent', self.agent.random) 26 | -------------------------------------------------------------------------------- /项目/HouseScrapy/toolkits/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class HousescrapySpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class HousescrapyDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 
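# NOTE: like any downloader middleware, this generated template only takes
# effect once it is listed in DOWNLOADER_MIDDLEWARES (see the sketch above).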
63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /项目/HouseScrapy/toolkits/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from pymongo import MongoClient 4 | 5 | from settings import MONGODB_URL, MONGODB_DB, MONGODB_COLL 6 | 7 | 8 | class HousePipeline(object): 9 | """地产基础数据 10 | """ 11 | 12 | def __init__(self): 13 | client = MongoClient(MONGODB_URL) 14 | self.coll = client[MONGODB_DB][MONGODB_COLL] # 地产数据集合(连接) 15 | 16 | def process_item(self, item, spider): 17 | self.coll.insert_one(item['data']) 18 | return item # 返回 item,便于后续管道继续处理 19 | -------------------------------------------------------------------------------- /项目/HouseScrapy/toolkits/proxies.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 3 | # Date: 2019/3/5 Python: 3.7 4 | 5 | import base64 6 | from settings import PROXY_USER, PROXY_PASS 7 | 8 | # 代理服务器 9 | proxyServer = "http://http-dyn.abuyun.com:9020" 10 | 11 | 12 | proxyAuth = "Basic " + base64.urlsafe_b64encode(bytes((PROXY_USER + ":" + PROXY_PASS), "ascii")).decode("utf8") 13 | 14 | 15 | class ProxyMiddleware(object): 16 | """自定义中间件代理IP""" 17 | def process_request(self, request, spider): 18 | request.meta["proxy"] = proxyServer 19 | request.headers["Proxy-Authorization"] = proxyAuth 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /项目/HouseSpider/README.md: -------------------------------------------------------------------------------- 1 | # 目前项目还在抽空更新中 2 | > 慢慢填坑 3 | 4 | # 概述 5 | > 对 `www.funi.com` 网站进行数据爬取 -------------------------------------------------------------------------------- /项目/HouseSpider/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-07-12 Python: 3.7 4 | 5 | # Redis 6 | REDIS_HOST = '127.0.0.1' 7 | REDIS_PORT = '6379' 8 | REDIS_PASSWORD = None 9 | 10 | # 
MongoDB 11 | MONGO_CLEAN = 'mongodb://localhost:27017' 12 | 13 | # TargetUrl 14 | TARGET_URL = "http://www.funi.com/loupan/region_0_0_0_0_{page}" 15 | 16 | # ProxyIP 17 | PROXY_USER = "" 18 | PROXY_PASS = "" 19 | 20 | # HOST 21 | HOST = 'http://www.funi.com' 22 | -------------------------------------------------------------------------------- /项目/HouseSpider/db/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-07-12 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /项目/HouseSpider/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-07-12 Python: 3.7 4 | 5 | import asyncio 6 | 7 | from tool.parse import * 8 | from tool.toolkit import * 9 | 10 | 11 | async def get_max_page(): 12 | """获取总页数 13 | """ 14 | url = TARGET_URL.format(page=1) 15 | result = await get(url) 16 | return await parse_total_page(result) 17 | 18 | 19 | async def get_house_url(page): 20 | """获取地产链接 21 | """ 22 | url = TARGET_URL.format(page=page) 23 | result = await get(url) 24 | await parse_house_url(result, page) 25 | 26 | 27 | @count_time 28 | def main(): 29 | loop = asyncio.get_event_loop() 30 | 31 | # 1. 获取总页数 32 | task = loop.create_task(get_max_page()) 33 | total_page = loop.run_until_complete(task) 34 | 35 | # 2. 获取链接 36 | house_url_func = [asyncio.ensure_future(get_house_url(_)) for _ in range(1, int(total_page) + 1)] # +1 才能包含最后一页 37 | loop.run_until_complete(asyncio.wait(house_url_func)) 38 | 39 | # 3. 楼盘详情(待实现) 40 | 41 | 42 | if __name__ == '__main__': 43 | main() 44 | -------------------------------------------------------------------------------- /项目/HouseSpider/tool/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-07-12 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /项目/HouseSpider/tool/parse.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-07-12 Python: 3.7 4 | 5 | from pyquery import PyQuery as pq 6 | from config import * 7 | 8 | 9 | async def parse_total_page(result): 10 | """解析总页数 11 | """ 12 | doc = pq(result) 13 | max_page = doc('.pages a').eq(-2).text() 14 | print('数据共 {total} 页'.format(total=max_page)) 15 | return max_page 16 | 17 | 18 | async def parse_house_url(result, page): 19 | """页面解析链接 20 | """ 21 | doc = pq(result) 22 | dls = doc('.fleft div').eq(-2)('dl') 23 | n = 0 24 | for dl in dls: 25 | href = pq(dl)('dt a').attr('href') 26 | href = HOST + href.split(';')[0] # 清洗链接,去掉 ; 及之后的会话参数 27 | print(href) 28 | n += 1 29 | if not n: 30 | print('第 {page} 页抽取链接失败'.format(page=page)) 31 | 32 | 33 | -------------------------------------------------------------------------------- /项目/HouseSpider/tool/proxy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-07-12 Python: 3.7 4 | 5 | 6 | from config import PROXY_PASS, PROXY_USER 7 | 8 | # 代理服务器 9 | proxyHost = "http-dyn.abuyun.com" 10 | proxyPort = "9020" 11 | 12 | 13 | proxyServer = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % { 14 | "host": proxyHost, 15 | 
"port": proxyPort, 16 | "user": PROXY_USER, 17 | "pass": PROXY_PASS, 18 | } 19 | 20 | if not PROXY_USER or not PROXY_PASS: 21 | msg = """ 22 | 请先在 config.py 配置文件内填入代理IP账号 23 | 阿布云代理IP:https://www.abuyun.com/http-proxy/products.html 24 | """ 25 | print(msg) 26 | exit() 27 | -------------------------------------------------------------------------------- /项目/HouseSpider/tool/toolkit.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-07-13 Python: 3.7 4 | import datetime 5 | import aiohttp 6 | 7 | from tool.proxy import proxyServer 8 | 9 | 10 | async def get(url): 11 | """请求页面 12 | """ 13 | headers = { 14 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36', 15 | 'Host': 'www.funi.com' 16 | } 17 | 18 | try: 19 | """conn = aiohttp.TCPConnector(verify_ssl=False) connector=conn""" 20 | async with aiohttp.ClientSession(headers=headers) as session: 21 | async with session.get(url, proxy=proxyServer) as response: 22 | return await response.text("utf-8") 23 | except TimeoutError as te: 24 | print('超时', te) 25 | 26 | 27 | def count_time(func): 28 | """取运行时间 29 | """ 30 | def int_time(*args, **kwargs): 31 | start_time = datetime.datetime.now() # 程序开始时间 32 | func() 33 | over_time = datetime.datetime.now() # 程序结束时间 34 | total_time = (over_time-start_time).total_seconds() 35 | print('程序耗时: %s 秒' % total_time) 36 | return int_time 37 | -------------------------------------------------------------------------------- /项目/MeiTuanArea/MeiTuanArea/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/项目/MeiTuanArea/MeiTuanArea/__init__.py -------------------------------------------------------------------------------- /项目/MeiTuanArea/MeiTuanArea/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | 5 | 6 | class AreaItem(scrapy.Item): 7 | """地区""" 8 | type = scrapy.Field() 9 | id = scrapy.Field() 10 | pid = scrapy.Field() 11 | name = scrapy.Field() 12 | pinyin = scrapy.Field() 13 | first = scrapy.Field() 14 | haschild = scrapy.Field() 15 | 16 | 17 | class CoordItem(scrapy.Item): 18 | """坐标录入""" 19 | type = scrapy.Field() 20 | id = scrapy.Field() 21 | lng = scrapy.Field() 22 | lat = scrapy.Field() 23 | -------------------------------------------------------------------------------- /项目/MeiTuanArea/MeiTuanArea/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class MeituanareaSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 
19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class MeituanareaDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /项目/MeiTuanArea/MeiTuanArea/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | BOT_NAME = 'MeiTuanArea' # 爬虫项目名 5 | 6 | SPIDER_MODULES = ['MeiTuanArea.spiders'] # 爬虫目录设定 7 | NEWSPIDER_MODULE = 'MeiTuanArea.spiders' # 爬虫生成目录 8 | 9 | ROBOTSTXT_OBEY = False # 否认协议 10 | 11 | RANDOMIZE_DOWNLOAD_DELAY = True # 开启随机增加毫秒级延迟,增加访问成功率 12 | 13 | DOWNLOAD_FAIL_ON_DATALOSS = False # 重试处理 14 | 15 | DOWNLOAD_TIMEOUT = 5 # 设置超时时间,避免ip失效等待时间过长 16 | 17 | # HTTPERROR_ALLOWED_CODES = [301] # 禁止301 18 | 19 | # 指定终端输出日志、日志位置 20 | # LOG_LEVEL = 'WARNING' 21 | # LOG_FILE = 'error_log.txt' 22 | 23 | HTTPERROR_ALLOWED_CODES = [403] 24 | 25 | # UA 26 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' 27 | 28 | # mysql 29 | MYSQL_HOST = '127.0.0.1' 30 | MYSQL_PORT = 3306 31 | MYSQL_USER = 'root' 32 | MYSQL_PASSWORD = 'mysql 密码' 33 | MYSQL_DB_NAME = 'mysql库' 34 | 35 | # API 百度地图坐标获取API,申请后填写即可 36 | API_AK = '百度地图 api ak' 37 | 38 | 39 | -------------------------------------------------------------------------------- /项目/MeiTuanArea/MeiTuanArea/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /项目/MeiTuanArea/MeiTuanArea/spiders/area_coord.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import pymysql 4 | import json 5 | 6 | from MeiTuanArea.settings import API_AK 7 | from MeiTuanArea.settings import MYSQL_DB_NAME, MYSQL_HOST, MYSQL_PASSWORD, MYSQL_PORT, MYSQL_USER 8 | from MeiTuanArea.items import CoordItem 9 | 10 | 11 | class GetLngSpider(scrapy.Spider): 12 | name = 'area_coord' 13 | 14 | # 独立配置 15 | custom_settings = { 16 | 'ITEM_PIPELINES': { 17 | 'MeiTuanArea.pipelines.CoordPipeline': 300, 18 | }, 19 | } 20 | 21 | # mysql 配置 22 | conn = pymysql.Connect( 23 | host=MYSQL_HOST, 24 | port=MYSQL_PORT, 25 | user=MYSQL_USER, 26 | password=MYSQL_PASSWORD, 27 | db=MYSQL_DB_NAME, 28 | ) 29 | 30 | url = 'http://api.map.baidu.com/geocoder/v2/?address={address}&output=json&ak={ak}' 31 | 32 | def start_requests(self): 33 | 34 | # 一级区域 省市 35 | provinces = self.get_db("""SELECT id,`name` from province""") 36 | for _id, name in provinces: 37 | target_url = self.url.format(address=name, ak=API_AK) 38 | yield scrapy.Request(target_url, meta={'type': 'province', '_id': _id}) 39 | 40 | # 二级区域 城市 41 | city = self.get_db("""SELECT id,`name` from city""") 42 | for _id, name in city: 43 | target_url = self.url.format(address=name, ak=API_AK) 44 | yield scrapy.Request(target_url, meta={'type': 'city', '_id': _id}) 45 | 46 | # 三级区域 区域 47 | area = self.get_db("""select area.id, city.name, area.name from city LEFT JOIN area on city.id=area.pid""") 48 | for _id, name, address_name in area: 49 | address = str(name)+str(address_name) 50 | target_url = self.url.format(address=address, ak=API_AK) 51 | yield scrapy.Request(target_url, meta={'type': 'area', '_id': _id}) 52 | 53 | # 四级区域 街道 54 | address = self.get_db("""select address.id,area.name, address.name from area LEFT JOIN address on address.pid=area.id""") 55 | for _id, name, address_name in address: 56 | target_url = self.url.format(address=str(name)+str(address_name), ak=API_AK) 57 | yield scrapy.Request(target_url, meta={'type': 'address', '_id': _id}) 58 | 59 | def get_db(self, sql): 60 | """数据库查询""" 61 | # 创建游标对象 62 | cursor = self.conn.cursor() 63 | # 执行查询,异常时回滚 64 | try: 65 | cursor.execute(sql) 66 | data = cursor.fetchall() 67 | cursor.close() 68 | # 注意: 连接要在多次查询间复用,这里不能关闭 69 | return data 70 | except Exception as e: 71 | print(e, '异常回滚') 72 | self.conn.rollback() 73 | cursor.close() 74 | # 同上,连接留给后续查询使用 75 | return None 76 | 77 | def parse(self, response): 78 | """清洗数据""" 79 | item = CoordItem() 80 | data = json.loads(response.text) 81 | # status 为 0 表示地址解析成功 82 | if data.get('status') == 0: 83 | # 坐标 84 | item['lng'] = data.get('result').get('location').get('lng') 85 | item['lat'] = data.get('result').get('location').get('lat') 86 | item['id'] = response.meta.get('_id') 87 | item['type'] = response.meta.get('type') 88 | yield item 89 | 90 | -------------------------------------------------------------------------------- /项目/MeiTuanArea/MeiTuanArea/spiders/areas.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import json 4 | import re 5 | 6 | from pypinyin import pinyin, lazy_pinyin 7 | from MeiTuanArea.items import AreaItem 8 | 9 | 10 | class GetAreaSpider(scrapy.Spider): 11 | name = 'areas' 12 | 13 | # 独立配置 14 | custom_settings = { 15 | 'ITEM_PIPELINES': { 16 | 'MeiTuanArea.pipelines.AreaPipeline': 300, 
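# 300 是 Scrapy 管道优先级(取值 0-1000,数值越小越先执行);
# custom_settings 只对当前爬虫生效,所以 AreaPipeline 与 CoordPipeline 互不干扰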
17 | }, 18 | 'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36', 19 | 'DOWNLOAD_DELAY': 0.5, # 限流 下载同一个网站下一个页面前需要等待的时间 20 | } 21 | 22 | def start_requests(self): 23 | start_url = 'https://www.meituan.com/ptapi/getprovincecityinfo/' 24 | yield scrapy.Request(start_url, callback=self.parse_province) 25 | 26 | def parse_province(self, response): 27 | """省市+市 1、2 级区域采集""" 28 | target_url = 'http://{acronym}.meituan.com/meishi/' 29 | 30 | item = AreaItem() 31 | data = json.loads(response.text) 32 | for node in data: 33 | name = node.get('provinceName') 34 | item['type'] = 'province' 35 | item['haschild'] = 1 36 | item['id'] = node.get('provinceCode') 37 | item['pid'] = 0 38 | item['name'] = name 39 | item['pinyin'] = ''.join(lazy_pinyin(name)) 40 | item['first'] = self.get_acronym(name) 41 | yield item # 一级省市 42 | 43 | for i in node.get('cityInfoList'): 44 | item['type'] = 'city' 45 | item['id'] = i.get('id') 46 | item['pid'] = node.get('provinceCode') 47 | item['name'] = i.get('name') 48 | item['pinyin'] = i.get('pinyin') 49 | item['first'] = i.get('acronym') 50 | yield item # 二级市 51 | 52 | url = target_url.format(acronym=i.get('acronym')) 53 | yield scrapy.Request(url, callback=self.parse_area, meta={'pid': i.get('id')}) 54 | 55 | def parse_area(self, response): 56 | """区域+街道 3、4 级区域采集""" 57 | info, areas = re.search(r',"areas":(.*?),"dinnerCountsAttr', response.text), None 58 | if info: 59 | areas = json.loads(info.group(1)) 60 | if areas: 61 | city_id = response.meta.get('pid') 62 | item = AreaItem() 63 | 64 | # 解析区域 3 级 65 | for area in areas: 66 | item['type'] = 'area' 67 | item['id'] = area.get('id') 68 | item['pid'] = city_id 69 | item['name'] = area.get('name') 70 | item['pinyin'] = ''.join(lazy_pinyin(area.get('name'))) 71 | item['first'] = self.get_acronym(area.get('name')) 72 | 73 | subs = area.get('subAreas') 74 | # 判断是否有下级,有的区域没有下级了 75 | if len(subs) > 1: 76 | item['haschild'] = 1 77 | else: 78 | item['haschild'] = 0 79 | 80 | yield item 81 | 82 | # 解析 4 级 83 | if len(subs) > 1: 84 | for sub in subs: 85 | if not sub.get('name') == '全部': 86 | item['haschild'] = 0 87 | item['type'] = 'address' 88 | item['id'] = sub.get('id') 89 | item['pid'] = area.get('id') 90 | item['name'] = sub.get('name') 91 | item['pinyin'] = ''.join(lazy_pinyin(sub.get('name'))) 92 | item['first'] = self.get_acronym(sub.get('name')) 93 | yield item 94 | 95 | else: 96 | print('区域读取失败') 97 | 98 | @staticmethod 99 | def get_acronym(str_data): 100 | """ 101 | 获取字符串的首字母 102 | :param str_data: 字符串 103 | :return: 字符串 104 | """ 105 | return "".join([i[0][0] for i in pinyin(str_data)]) 106 | 107 | -------------------------------------------------------------------------------- /项目/MeiTuanArea/README.md: -------------------------------------------------------------------------------- 1 | # 美团城市采集 2 | > 因为全站爬取需要用到区域基础数据,这里单独抽离出来。 3 | 4 | ## 配置 5 | 在 settings 内配置 mysql 与 百度api_ak 即可 6 | 7 | ## 数据库设计 8 | > 因为最终数据将会存放在 MySQL 上,区域一共有4个层级,分别是省市、市、区域、街道,这里按照业务需求拆分到4张表中。 9 | 10 | ![](https://zok-blog.oss-cn-hangzhou.aliyuncs.com/images/区域表.png) 11 | 12 | ## 坐标拾取 13 | > 根据地址调用百度 API,获取坐标并存入库中 14 | 15 | ## 效果 16 | ![](https://zok-blog.oss-cn-hangzhou.aliyuncs.com/images/区域坐标.png) -------------------------------------------------------------------------------- /项目/MeiTuanArea/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 
2019-06-18 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /项目/MeiTuanArea/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = MeiTuanArea.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = MeiTuanArea 12 | -------------------------------------------------------------------------------- /项目/MeiTuanArea/初始化.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Navicat Premium Data Transfer 3 | 4 | Source Server : LocalhostMysql 5 | Source Server Type : MySQL 6 | Source Server Version : 50725 7 | Source Host : localhost:3306 8 | Source Schema : nujiang 9 | 10 | Target Server Type : MySQL 11 | Target Server Version : 50725 12 | File Encoding : 65001 13 | 14 | Date: 23/05/2019 16:32:56 15 | */ 16 | 17 | SET NAMES utf8mb4; 18 | SET FOREIGN_KEY_CHECKS = 0; 19 | 20 | -- ---------------------------- 21 | -- Table structure for address 22 | -- ---------------------------- 23 | DROP TABLE IF EXISTS `address`; 24 | CREATE TABLE `address` ( 25 | `id` bigint(10) NOT NULL AUTO_INCREMENT COMMENT 'ID', 26 | `pid` bigint(10) DEFAULT NULL COMMENT '父id', 27 | `name` varchar(100) DEFAULT NULL COMMENT '名称', 28 | `pinyin` varchar(100) DEFAULT NULL COMMENT '拼音', 29 | `code` varchar(100) DEFAULT NULL COMMENT '长途区号', 30 | `zip` varchar(100) DEFAULT NULL COMMENT '邮编', 31 | `first` varchar(50) DEFAULT NULL COMMENT '首字母', 32 | `lng` varchar(100) DEFAULT NULL COMMENT '经度', 33 | `lat` varchar(100) DEFAULT NULL COMMENT '纬度', 34 | PRIMARY KEY (`id`) USING BTREE, 35 | KEY `pid` (`pid`) USING BTREE 36 | ) ENGINE=InnoDB AUTO_INCREMENT=3749 DEFAULT CHARSET=utf8 ROW_FORMAT=COMPACT COMMENT='四级区域 地址'; 37 | 38 | -- ---------------------------- 39 | -- Table structure for area 40 | -- ---------------------------- 41 | DROP TABLE IF EXISTS `area`; 42 | CREATE TABLE `area` ( 43 | `id` bigint(10) NOT NULL AUTO_INCREMENT COMMENT 'ID', 44 | `pid` bigint(10) DEFAULT NULL COMMENT '父id', 45 | `name` varchar(100) DEFAULT NULL COMMENT '名称', 46 | `pinyin` varchar(100) DEFAULT NULL COMMENT '拼音', 47 | `code` varchar(100) DEFAULT NULL COMMENT '长途区号', 48 | `zip` varchar(100) DEFAULT NULL COMMENT '邮编', 49 | `first` varchar(50) DEFAULT NULL COMMENT '首字母', 50 | `lng` varchar(100) DEFAULT NULL COMMENT '经度', 51 | `lat` varchar(100) DEFAULT NULL COMMENT '纬度', 52 | `haschild` int(1) DEFAULT NULL COMMENT '是否有下级', 53 | PRIMARY KEY (`id`) USING BTREE, 54 | KEY `pid` (`pid`) USING BTREE 55 | ) ENGINE=InnoDB AUTO_INCREMENT=39793 DEFAULT CHARSET=utf8 ROW_FORMAT=COMPACT COMMENT='三级区域 区域'; 56 | 57 | -- ---------------------------- 58 | -- Table structure for city 59 | -- ---------------------------- 60 | DROP TABLE IF EXISTS `city`; 61 | CREATE TABLE `city` ( 62 | `id` bigint(10) NOT NULL AUTO_INCREMENT COMMENT 'ID', 63 | `pid` bigint(10) DEFAULT NULL COMMENT '父id', 64 | `name` varchar(100) DEFAULT NULL COMMENT '名称', 65 | `pinyin` varchar(100) DEFAULT NULL COMMENT '拼音', 66 | `code` varchar(100) DEFAULT NULL COMMENT '长途区号', 67 | `zip` varchar(100) DEFAULT NULL COMMENT '邮编', 68 | `first` varchar(50) DEFAULT NULL COMMENT '首字母', 69 | `lng` varchar(100) DEFAULT NULL COMMENT '经度', 70 | `lat` varchar(100) DEFAULT NULL COMMENT '纬度', 71 | PRIMARY KEY (`id`) USING BTREE, 72 | KEY `pid` (`pid`) USING BTREE 
73 | ) ENGINE=InnoDB AUTO_INCREMENT=8002 DEFAULT CHARSET=utf8 ROW_FORMAT=COMPACT COMMENT='二级区域 城市'; 74 | 75 | -- ---------------------------- 76 | -- Table structure for province 77 | -- ---------------------------- 78 | DROP TABLE IF EXISTS `province`; 79 | CREATE TABLE `province` ( 80 | `id` bigint(10) NOT NULL AUTO_INCREMENT COMMENT 'ID', 81 | `name` varchar(100) DEFAULT NULL COMMENT '名称', 82 | `pinyin` varchar(100) DEFAULT NULL COMMENT '拼音', 83 | `code` varchar(100) DEFAULT NULL COMMENT '长途区号', 84 | `zip` varchar(100) DEFAULT NULL COMMENT '邮编', 85 | `first` varchar(50) DEFAULT NULL COMMENT '首字母', 86 | `lng` varchar(100) DEFAULT NULL COMMENT '经度', 87 | `lat` varchar(100) DEFAULT NULL COMMENT '纬度', 88 | PRIMARY KEY (`id`) USING BTREE 89 | ) ENGINE=InnoDB AUTO_INCREMENT=820001 DEFAULT CHARSET=utf8 ROW_FORMAT=COMPACT COMMENT='一级区域 省市'; 90 | 91 | SET FOREIGN_KEY_CHECKS = 1; 92 | -------------------------------------------------------------------------------- /项目/README.md: -------------------------------------------------------------------------------- 1 | # 该板块不定期更新 2 | > 因为工作中会经常开发重型的爬虫,并且也属于公司的资源,所以并不会将代码放到网上。尽量以一些实战demo形式发布一些个人小项目。 3 | 4 | ## MeiTuanArea 5 | 美团区域 Scrapy 爬虫 6 | 7 | ## HouseSpider 8 | 房地产 aiohttp 爬虫 --------------------------------------------------------------------------------
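The coordinate spider above ultimately boils down to one Baidu geocoder call per region name. A standalone, hedged sketch of that call for quick testing, assuming the `requests` package is available (the project itself issues the request through Scrapy), with the same placeholder AK as settings.py:

```python
# Hedged sketch: the same geocoder v2 request and response handling that
# area_coord.py performs inside its parse() method.
import requests  # assumption: not a project dependency, used here for brevity

API_AK = '百度地图 api ak'  # placeholder, as in MeiTuanArea/settings.py
URL = 'http://api.map.baidu.com/geocoder/v2/?address={address}&output=json&ak={ak}'


def geocode(address):
    """Return (lng, lat) for an address, or None when Baidu reports an error."""
    data = requests.get(URL.format(address=address, ak=API_AK)).json()
    if data.get('status') == 0:  # status 0 means success, mirroring the spider
        location = data['result']['location']
        return location['lng'], location['lat']
    return None


if __name__ == '__main__':
    print(geocode('北京市朝阳区'))  # prints a (lng, lat) tuple with a valid AK
```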