├── .gitattributes ├── .gitignore ├── README.md ├── 【51Job】查岗位 └── select_job.py ├── 【bilibili】自动登录 ├── README.md ├── __init__.py └── login.py ├── 【bilibili】视频下载 ├── __init__.py └── video_download.py ├── 【双色球】头奖分布 ├── main.py ├── result.jpg └── 近期记录.xlsx ├── 【壁纸】美女壁纸下载器 └── bg_down.py ├── 【大众点评】字体反爬、坐标反爬 ├── 参数生成 │ ├── encryp.js │ └── uid.py ├── 旧版 │ ├── __init__.py │ ├── parse_address_poi.py │ └── parse_font_css.py └── 最新版7月 │ ├── README.md │ ├── font.json │ └── main.py ├── 【天眼查】字体加密 ├── num.woff └── tyc.py ├── 【抖音】无水印视频解析 ├── README.md ├── __init__.py └── parse.py ├── 【拼多多】登陆参数生成 ├── PinDuoDuo.py ├── README.md ├── __init__.py └── encryp.js ├── 【文书】app查询接口 └── main.py ├── 【淘宝】自动登陆 ├── auto_login_pyppeteer.py └── login_for_sina.py ├── 其他实战 ├── __init__.py ├── 【5173网】自动登录 │ ├── auto_login.py │ ├── encryp.js │ └── logOK.png ├── 【9377网】自动登录 │ ├── 9377login.py │ └── __init__.py ├── 【DNS】自动登录 │ ├── Login.py │ ├── __init__.py │ └── dns.js ├── 【GitHub】自动登录 │ ├── __init__.py │ └── login.py ├── 【Glidedsky】自动登陆 │ └── login.py ├── 【Python加密库】Demo │ ├── __init__.py │ └── encryption.py ├── 【TCL金融】自动登录 │ ├── __init__.py │ ├── auto_login.py │ ├── encryp.js │ └── ok.png ├── 【steam】自动登录 │ ├── execute.js │ └── login.py ├── 【万创帮】自动登录 │ ├── __init__.py │ ├── encryp.js │ ├── login_ok.png │ └── spider_login.py ├── 【中关村】自动登录 │ ├── README.md │ └── login.py ├── 【京东】商品数据爬取 │ ├── __init__.py │ ├── geckodriver │ └── selenium抓取.py ├── 【人人网】自动登录 │ ├── enc.js │ └── login.py ├── 【企业名片】企业查询 │ ├── encryp.js │ └── qi_ming.py ├── 【国鑫所】自动登录 │ ├── Login.py │ ├── __init__.py │ ├── encryp.js │ └── login_ok.png ├── 【天眼查】模拟登录 │ ├── __init__.py │ └── login.py ├── 【天翼】登录 │ ├── login.py │ └── v1.js ├── 【好莱客】参数解析 │ ├── __init__.py │ ├── encryp.js │ ├── holike.py │ └── ok.png ├── 【小牛在线】登录参数生成 │ ├── __init__.py │ ├── encryp.js │ └── make_param.py ├── 【开鑫贷】登陆参数生成 │ ├── KaiXinDai.py │ ├── __init__.py │ └── encryp.js ├── 【微信】登录参数生成 │ ├── __init__.py │ ├── encryp.js │ └── make_pwd.py ├── 【房价】房价获取 │ ├── README.md │ ├── __pycache__ │ │ └── util.cpython-37.pyc │ ├── main.py │ └── util.py ├── 【房天下】自动登录 │ ├── encryp.js │ ├── login.py │ └── ok.png ├── 【新浪微博】密码解密 │ ├── execute.js │ └── main.py ├── 【时光网】登陆参数生成 │ ├── encryp.js │ └── login.py ├── 【易通贷】自动登录 │ ├── __init__.py │ ├── auto_login.py │ └── encryp.js ├── 【汽车之家】参数解密 │ ├── execute.js │ └── main.py ├── 【满级网】自动登录 │ ├── auto_login.py │ └── encryp.js ├── 【百度】wap端sig生成 │ ├── make_sig.py │ └── v3_update.js ├── 【百度】网页找回密码 │ ├── __pycache__ │ │ └── header.cpython-37.pyc │ ├── demo.py │ ├── dv.js │ ├── encryp.js │ ├── header.py │ └── 验证码.png ├── 【百度】翻译 │ ├── __init__.py │ ├── translate.js │ └── translation.py ├── 【百度】自动登录 │ ├── README.md │ ├── encryp.js │ └── login.py ├── 【百度街拍】图片下载 │ └── get_image.py ├── 【移动】登录参数生成 │ ├── MakeParam.py │ ├── __init__.py │ ├── encryp.js │ └── make_params.png ├── 【空中网】自动登录 │ ├── __init__.py │ ├── encryp.js │ └── spider_login.py ├── 【美团】数据解析、token生成 │ ├── README.md │ ├── __init__.py │ ├── create_food_token.py │ ├── get_login_cookies.py │ ├── parse_food_comments.py │ ├── parse_food_info.py │ ├── parse_hotel_comments.py │ ├── parse_hotel_info.py │ ├── parse_play_areas.py │ └── parse_play_info.py ├── 【试客联盟】登录 │ ├── execute.js │ └── login.py ├── 【谷雨】数字解密 │ └── GuYu.py ├── 【豆瓣】自动登录 │ └── DouBan.py ├── 【逗游】自动登录 │ ├── __init__.py │ ├── douyou.py │ └── encryp.js ├── 【金逸电影】自动注册 │ ├── __init__.py │ ├── encryp.js │ ├── register.png │ └── register.py ├── 【青海移动】登陆参数生成 │ ├── __init__.py │ ├── encryp.js │ └── make_param.py └── 【餐饮】查询信息 │ ├── FoodInfo.py │ ├── __init__.py │ └── t.html ├── 
原创爬虫工具 ├── Cookies │ ├── MeiTuan │ │ ├── __init__.py │ │ ├── config.py │ │ ├── db.py │ │ ├── generator.py │ │ └── 账号.txt │ ├── README.md │ └── __init__.py ├── DataMigration │ ├── README.md │ ├── __init__.py │ ├── config.py │ ├── db │ │ ├── MongoDB.py │ │ ├── Mysql.py │ │ └── __init__.py │ └── migration │ │ ├── __init__.py │ │ ├── mongo_to_mysql.py │ │ └── mysql_to_mongo.py ├── Decode │ ├── README.md │ ├── __init__.py │ └── translation.py ├── Jsencrypt │ ├── __init__.py │ └── make_encrypt.py ├── OSS │ ├── __init__.py │ └── push_to_oss.py ├── Proxy │ ├── KDLProxyPool.py │ ├── README.md │ ├── XDLProxyPool.py │ ├── XDLProxyUseDemo.py │ ├── ZhiMaProxyPool.py │ └── ZhiMaProxyUseDemo.py ├── README.md ├── Register │ ├── MessageCode.py │ ├── README.md │ └── __init__.py └── zok │ ├── README.md │ ├── __init__.py │ ├── get_db │ ├── __init__.py │ ├── from_mongodb.py │ └── from_mysql.py │ ├── proxies │ ├── __init__.py │ └── proxies.py │ ├── random_UA │ ├── __init__.py │ ├── fake_useragent.json │ └── ua_random.py │ ├── repetition │ ├── __init__.py │ └── update_cache.py │ ├── save │ ├── __init__.py │ └── to_mysql.py │ └── zok_config.py ├── 滑动验证码 ├── 【w3c】滑块验证 │ ├── __init__.py │ ├── bg.png │ ├── chache.png │ ├── hk.png │ ├── img │ │ ├── 0.png │ │ ├── 1.png │ │ ├── 2.png │ │ └── 3.png │ └── w3c.py └── 【腾讯】滑块验证 │ ├── bg.jpeg │ ├── discriminate.py │ └── sel.py └── 项目 ├── HouseScrapy ├── requirements ├── scrapy.cfg ├── settings.py ├── spiders │ ├── __init__.py │ └── house.py └── toolkits │ ├── __init__.py │ ├── fake_useragent.json │ ├── items.py │ ├── make_ua.py │ ├── middlewares.py │ ├── pipelines.py │ └── proxies.py ├── HouseSpider ├── README.md ├── config.py ├── db │ └── __init__.py ├── main.py └── tool │ ├── __init__.py │ ├── parse.py │ ├── proxy.py │ └── toolkit.py ├── MeiTuanArea ├── MeiTuanArea │ ├── __init__.py │ ├── items.py │ ├── middlewares.py │ ├── pipelines.py │ ├── settings.py │ └── spiders │ │ ├── __init__.py │ │ ├── area_coord.py │ │ └── areas.py ├── README.md ├── __init__.py ├── scrapy.cfg └── 初始化.sql └── README.md /.gitattributes: -------------------------------------------------------------------------------- 1 | *.js linguist-language=python -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | */.DS_Store 3 | /.idea 4 | */.idea 5 | .vscode 6 | /.vscode 7 | */.vscode 8 | /__pycache__ 9 | */__pycache__ 10 | 11 | .README.md -------------------------------------------------------------------------------- /【51Job】查岗位/select_job.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-04-15 Python: 3.7 4 | 5 | import requests 6 | from lxml import etree 7 | 8 | Format_str = 'https://search.51job.com/list/000000,000000,0000,00,9,99,{key},2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99°reefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=' 9 | Headers = { 10 | 'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36" 11 | } 12 | 13 | 14 | class GetJob(object): 15 | 16 | def __init__(self, job_name): 17 | self.job = job_name 18 | 19 | self.get_info() 20 | 21 | def get_info(self): 22 | target_url = Format_str.format(key=self.job) 23 | response = 
requests.get(target_url, headers=Headers) 24 | # 编码转换 25 | response.encoding = response.apparent_encoding 26 | root = etree.HTML(response.text) 27 | self.parse(root) 28 | 29 | @staticmethod 30 | def parse(root): 31 | div_list = root.xpath("//div[@class='dw_table']/div[@class='el']") 32 | for div in div_list: 33 | money = div.xpath("span[@class='t4']/text()") 34 | money = money[0] if money else "面议" 35 | # 工作名称不可能为空,所以不用判断 36 | a = div.xpath("p/span/a")[0] 37 | job_name = a.xpath("text()")[0].strip() 38 | job_href = a.xpath("@href")[0] 39 | date_time = div.xpath("span[@class='t5']/text()") 40 | date_time = date_time[0] if date_time else "没有时间" 41 | print(job_name, money, date_time, job_href) 42 | with open('job.csv', 'a', encoding='gb18030') as f: 43 | job_list = [job_name, date_time, money, job_href, '\n'] 44 | f.write(','.join(job_list)) 45 | 46 | 47 | if __name__ == "__main__": 48 | key = input("请输入关键词") 49 | GetJob(key) 50 | -------------------------------------------------------------------------------- /【bilibili】自动登录/README.md: -------------------------------------------------------------------------------- 1 | ## B站自动登录 2 | 3 | 本案例根据 `selenium` 实现。 4 | 5 | ## 效果图 6 | 7 | ![image](https://csrftoken.oss-cn-beijing.aliyuncs.com/github/blibili-login-report.png) 8 | 9 | ## Q&A 10 | 11 | > ChromeDriver - WebDriver for Chrome 12 | 13 | ``` 14 | 因为是模拟点击,所以需要下载插件。 15 | 16 | 点击下方链接即可跳转至下载界面。 17 | ``` 18 | 19 | > 为什么要模拟滑动多次? 20 | 21 | ``` 22 | 因为获取滑块的偏移量,在模拟操作的时候,机器在控制滑动速度的时候比较均匀,可能会被判定为机器。 23 | 24 | 当然了,我们会在今后给予更好的滑动支持~ 25 | ``` 26 | 27 | [下载ChromeDriver](https://chromedriver.chromium.org/downloads) 28 | 29 | ## Support 30 | 31 | ``` 32 | 案例于 2020-04-23 前均可用,如有疑问请联系作者。 33 | ``` 34 | 35 | ## Donate 36 | 37 | Thanks ~ 38 | -------------------------------------------------------------------------------- /【bilibili】自动登录/__init__.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Date: 2020/4/23 4 | -------------------------------------------------------------------------------- /【bilibili】视频下载/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-05-09 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /【双色球】头奖分布/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-11-08 Python: 3.7 4 | import requests 5 | import json 6 | import pandas as pd 7 | import openpyxl 8 | import jieba 9 | import wordcloud 10 | import matplotlib.pyplot as plt 11 | 12 | 13 | class SSQ: 14 | def __init__(self, file, font): 15 | self.header = { 16 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36', 17 | 'Host': 'www.cwl.gov.cn', 18 | 'Referer': 'http://www.cwl.gov.cn/kjxx/ssq/kjgg/' 19 | } 20 | self.file = file 21 | self.font = font 22 | self.get_history_url = 'http://www.cwl.gov.cn/cwl_admin/kjxx/findDrawNotice?name=ssq&issueCount=100' 23 | self.session = requests.session() 24 | 25 | def history(self): 26 | """爬取最近100期""" 27 | _dict = None 28 | try: 29 | self.session.get('http://www.cwl.gov.cn/kjxx/ssq/kjgg/') 30 | _dict = json.loads(self.session.get(self.get_history_url, headers=self.header).text) 31 | except TypeError: 32 | print('获取历史记录失败') 33 | finally: 34 | return _dict 35 | 36 | def clean_data(self, data): 37 | """ 38 | 清洗数据 39 | :return: 40 | """ 41 | columns = [] 42 | 43 | for item in data.get('result'): 44 | columns.append([ 45 | item.get('code'), 46 | item.get('date'), 47 | item.get('week'), 48 | item.get('red').split(','), 49 | item.get('blue'), 50 | item.get('sales'), 51 | item.get('poolmoney'), 52 | item.get('content'), 53 | item.get('prizegrades')[0].get('typemoney'), 54 | item.get('prizegrades')[0].get('typenum'), 55 | item.get('prizegrades')[1].get('typemoney'), 56 | item.get('prizegrades')[1].get('typenum'), 57 | item.get('prizegrades')[2].get('typemoney'), 58 | item.get('prizegrades')[2].get('typenum'), 59 | ]) 60 | 61 | df = pd.DataFrame( 62 | columns, 63 | columns=["期数", "开奖日期", "星期数", "红球", "蓝球", "销售金额", "奖池", "中奖地区", "一等奖金", "一等奖人数", "二等奖金", "二等奖人数", "三等奖金", "三等奖人数"], # 指定列 64 | ) 65 | self.save(df) 66 | self.set_data(df) 67 | 68 | def save(self, df): 69 | """储存 70 | """ 71 | df.to_excel(self.file) 72 | 73 | def set_data(self, df): 74 | """ 75 | 数据预处理 76 | :return: 77 | """ 78 | cut_text = [] 79 | for i in df['中奖地区']: 80 | for addr in i.split(',')[:-1]: 81 | name, num = jieba.cut(addr[:-1]) 82 | for n in range(int(num)): 83 | cut_text.append(name) 84 | print(" ".join(cut_text)) 85 | 86 | w = wordcloud.WordCloud(font_path=self.font, background_color="white", scale=4) 87 | w.generate(" ".join(cut_text)) 88 | plt.imshow(w, interpolation="bilinear") 89 | plt.axis("off") 90 | # plt.show() 91 | # 保存生成的图片 92 | w.to_file('result.jpg') 93 | 94 | def parse_history(self): 95 | """ 96 | pandas 载入数据 97 | :return: 98 | """ 99 | data = self.history() 100 | self.clean_data(data) 101 | 102 | 103 | if __name__ == "__main__": 104 | """ 105 | 请自行准备一个字体文件并导入路径 106 | """ 107 | ssq = SSQ('近期记录.xlsx', '你自己准备的字库路径') 108 | ssq.parse_history() 109 | -------------------------------------------------------------------------------- /【双色球】头奖分布/result.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/【双色球】头奖分布/result.jpg -------------------------------------------------------------------------------- /【双色球】头奖分布/近期记录.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/【双色球】头奖分布/近期记录.xlsx -------------------------------------------------------------------------------- /【壁纸】美女壁纸下载器/bg_down.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-11-06 Python: 3.7 4 | 5 | from requests import get 6 | from filetype import guess 7 | from os import rename 8 | from os import makedirs 9 | from os.path import exists 10 | from json import loads 11 | from contextlib import closing 12 | 13 | 14 | class DownBg: 15 | """ 16 | 超级高清图片下载 17 | """ 18 | def __init__(self): 19 | self.headers = { 20 | "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36" 21 | } 22 | 23 | def down_load(self, file_url, file_full_name, now_photo_count, all_photo_count): 24 | 25 | # 开始下载图片 26 | with closing(get(file_url, headers=self.headers, stream=True)) as response: 27 | chunk_size = 1024 # 单次请求最大值 28 | content_size = int(response.headers['content-length']) # 文件总大小 29 | data_count = 0 # 当前已传输的大小 30 | with open(file_full_name, "wb") as file: 31 | for data in response.iter_content(chunk_size=chunk_size): 32 | file.write(data) 33 | done_block = int((data_count / content_size) * 50) 34 | data_count = data_count + len(data) 35 | now_jd = (data_count / content_size) * 100 36 | print("\r %s:[%s%s] %d%% %d/%d" % ( 37 | file_full_name, done_block * '█', ' ' * (50 - 1 - done_block), now_jd, now_photo_count, 38 | all_photo_count), end=" ") 39 | # 下载完图片后获取图片扩展名,并为其增加扩展名 40 | file_type = guess(file_full_name) 41 | rename(file_full_name, file_full_name + '.' 
+ file_type.extension) 42 | 43 | def crawler_photo(self, type_id, photo_count): 44 | """ 45 | :param type_id: 最新 1, 最热 2, 女生 3, 星空 4 46 | :param photo_count: 下载数量 47 | :return: 48 | """ 49 | type_dict = { 50 | '1': '5c68ffb9463b7fbfe72b0db0', 51 | '2': '5c69251c9b1c011c41bb97be', 52 | '3': '5c81087e6aee28c541eefc26', 53 | '4': '5c81f64c96fad8fe211f5367' 54 | } 55 | 56 | url = 'https://service.paper.meiyuan.in/api/v2/columns/flow/{key}?page=1&per_page='.format( 57 | key=type_dict.get(str(type_id))) + str(photo_count) 58 | 59 | # 获取图片列表数据 60 | respond = get(url, headers=self.headers) 61 | photo_data = loads(respond.content) 62 | 63 | # 已经下载的图片张数 64 | now_photo_count = 1 65 | 66 | # 所有图片张数 67 | all_photo_count = len(photo_data) 68 | 69 | # 开始下载并保存5K分辨率壁纸 70 | for photo in photo_data: 71 | 72 | # 创建一个文件夹存放我们下载的图片 73 | if not exists('./' + str(type_id)): 74 | makedirs('./' + str(type_id)) 75 | 76 | # 准备下载的图片链接 77 | file_url = photo['urls']['raw'] 78 | 79 | # 准备下载的图片名称,不包含扩展名 80 | file_name_only = file_url.split('/') 81 | file_name_only = file_name_only[len(file_name_only) - 1] 82 | 83 | # 准备保存到本地的完整路径 84 | file_full_name = './' + str(type_id) + '/' + file_name_only 85 | 86 | # 开始下载图片 87 | self.down_load(file_url, file_full_name, now_photo_count, all_photo_count) 88 | now_photo_count = now_photo_count + 1 89 | 90 | 91 | if __name__ == '__main__': 92 | dg = DownBg() 93 | 94 | wall_paper_id = 1 95 | wall_paper_count = 10 96 | while True: 97 | wall_paper_id = input("\n\n壁纸类型:最新壁纸 1, 最热壁纸 2, 女生壁纸 3, 星空壁纸 4\n请输入编号以便选择5K超清壁纸类型:") 98 | wall_paper_count = input("请输入要下载的5K超清壁纸的数量:") 99 | 100 | if wall_paper_id not in ['1', '2', '3', '4'] or not wall_paper_count.isdigit(): 101 | print('输入有误') 102 | continue 103 | 104 | print("正在下载5K超清壁纸,请稍等……") 105 | dg.crawler_photo(int(wall_paper_id), int(wall_paper_count)) 106 | print('\n下载5K高清壁纸成功!') 107 | -------------------------------------------------------------------------------- /【大众点评】字体反爬、坐标反爬/参数生成/encryp.js: -------------------------------------------------------------------------------- 1 | function make() { 2 | for (var t = 1 * new Date, n = 0; t === 1 * new Date && n < 200;) n++; 3 | return t.toString(16) + n.toString(16) 4 | } 5 | 6 | function test(love, you, babby) { 7 | var t = (you * babby).toString(16); 8 | return make() + "-" + Math.random().toString(16).replace(".", "") + "-" + function () { 9 | var t = love, 10 | n = void 0, 11 | e = void 0, 12 | i = [], 13 | r = 0; 14 | 15 | function o(t, n) { 16 | var e = void 0, 17 | r = 0; 18 | for (e = 0; e < n.length; e++) r |= i[e] << 8 * e; 19 | return t ^ r 20 | } 21 | 22 | for (n = 0; n < t.length; n++) e = t.charCodeAt(n), i.unshift(255 & e), 4 <= i.length && (r = o(r, i), i = []); 23 | return 0 < i.length && (r = o(r, i)), r.toString(16) 24 | }() + "-" + t + "-" + make() 25 | } 26 | 27 | function now_uu() { 28 | return (65536 * (1 + Math.random()) | 0).toString(16).substring(1) 29 | } 30 | function puid() { 31 | return "owl-" +now_uu() + now_uu() + "-" + now_uu() + "-" + now_uu() + "-" + now_uu() + "-" + now_uu() + now_uu() + now_uu() 32 | } -------------------------------------------------------------------------------- /【大众点评】字体反爬、坐标反爬/参数生成/uid.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-11-15 Python: 3.7 4 | import execjs.runtime_names 5 | import random 6 | import requests 7 | import time 8 | from faker import Faker 9 | 10 | 11 | info = random.choice([[800, 1024], [900, 
1440], [1050, 1680], [1200, 1920], [1200, 1600]]) 12 | 13 | with open("encryp.js", "r", encoding="utf-8") as f: 14 | js = execjs.compile(f.read()) 15 | 16 | print('引擎', execjs.get().name) 17 | uid = js.call('test', Faker().user_agent(), info[0], info[1]) 18 | page_id = js.call('puid') 19 | 20 | 21 | headers = { 22 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36', 23 | 'Host': 'catfront.dianping.com', 24 | 'Referer': 'http://www.dianping.com/shop/97789651', 25 | 'Origin': 'http://www.dianping.com', 26 | } 27 | 28 | headers2 = { 29 | 'Cookie': "_lxsdk_cuid=16e8184bc7cc8-00733806cb0caf-d087704-13c680-16e8184bc7cc8;", 30 | 'Referer': 'http://www.dianping.com/shop/76311084', 31 | 'Host': 'www.dianping.com', 32 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36', 33 | } 34 | sign_url = 'http://catfront.dianping.com/api/pv?v=1&sdk=1.8.13&project=app-pc-main-shop&pageurl=main-shop&pageId={pageId}×tamp={timestamp}®ion=&operator=&network=&container=&os=&unionid={unionid}' 35 | session = requests.session() 36 | session.get('http://www.dianping.com/shop/76311084', headers=headers2) 37 | response = session.post(sign_url.format(pageId=page_id, unionid=uid, timestamp=str(int(round(time.time() * 1000)))), headers=headers) 38 | print(uid, page_id) 39 | print(response) 40 | 41 | 42 | -------------------------------------------------------------------------------- /【大众点评】字体反爬、坐标反爬/旧版/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-12 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /【大众点评】字体反爬、坐标反爬/旧版/parse_address_poi.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-03-27 Python: 3.7 4 | 5 | 6 | def to_base36(value): 7 | """converts a decimal integer to a 36 decimal string""" 8 | if not isinstance(value, int): 9 | raise TypeError("expected int, got %s: %r" % (value.__class__.__name__, value)) 10 | 11 | if value == 0: 12 | return "0" 13 | 14 | if value < 0: 15 | sign = "-" 16 | value = -value 17 | else: 18 | sign = "" 19 | 20 | result = [] 21 | 22 | while value: 23 | value, mod = divmod(value, 36) 24 | result.append("0123456789abcdefghijklmnopqrstuvwxyz"[mod]) 25 | 26 | return sign + "".join(reversed(result)) 27 | 28 | 29 | def decode(C): 30 | """parse poi""" 31 | digi = 16 32 | add = 10 33 | plus = 7 34 | cha = 36 35 | I = -1 36 | H = 0 37 | B = '' 38 | J = len(C) 39 | G = ord(C[-1]) 40 | C = C[:-1] 41 | J -= 1 42 | 43 | for E in range(J): 44 | D = int(C[E], cha) - add 45 | if D >= add: 46 | D = D - plus 47 | B += to_base36(D) 48 | if D > H: 49 | I = E 50 | H = D 51 | 52 | A = int(B[:I], digi) 53 | F = int(B[I + 1:], digi) 54 | L = (A + F - int(G)) / 2 55 | K = float(F - L) / 100000 56 | L = float(L) / 100000 57 | return {'lng': L, 'lat': K} 58 | 59 | 60 | if __name__ == '__main__': 61 | print(decode('HFHSGGZTWSATFG')) 62 | -------------------------------------------------------------------------------- /【大众点评】字体反爬、坐标反爬/最新版7月/README.md: -------------------------------------------------------------------------------- 1 | # 仅限学术交流 2 | # 如有冒犯请立即联系作者删除 3 | 4 | # 安装 5 | **`pip3 install fontTools`** 6 | 7 | **`pip3 install requests`** 
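（补充示例）A minimal, non-authoritative sketch of what `fontTools` is used for in this case: opening a captured anti-crawler `.woff` and dumping its character-to-glyph mapping. The file name `font.woff` is a placeholder, not a file shipped in this folder:

```python
from fontTools.ttLib import TTFont

font = TTFont('font.woff')         # placeholder: any .woff captured from the target site
cmap = font['cmap'].getBestCmap()  # char code -> glyph name, e.g. {0xe7df: 'uniE7DF'}
print(cmap)
```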
8 | 9 | **`pip3 install redis`** 10 | 11 | 12 | # 使用 13 | 1. 需要开启 redis 库 并配置,默认链接的本机 redis 14 | 2. 参考 `main.py` 中的调用代码 15 | 16 | **[参考博客链接](https://www.zhangkunzhi.com/archives/72)** -------------------------------------------------------------------------------- /【天眼查】字体加密/num.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/【天眼查】字体加密/num.woff -------------------------------------------------------------------------------- /【天眼查】字体加密/tyc.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-12-06 Python: 3.7 4 | 5 | """ 6 | 从网页下载一个字体文件获取对应推导式,动态获取请自行拓展 7 | """ 8 | 9 | from fontTools.ttLib import TTFont 10 | import re 11 | 12 | font = TTFont('num.woff') # 打开tyc-num.woff 13 | font.saveXML('tyc-num.xml') # 保存为tyc-num.xml 14 | with open('tyc-num.xml', 'r') as f: 15 | xml = f.read() # 读取tyc-num.xml赋值给xml 16 | GlyphID = re.findall(r'<GlyphID id="(\d+)" name="(\d+)"/>', xml) # 获得对应关系 17 | print(GlyphID) 18 | GlyphIDNameLists = list(set([int(Gname) for Gid, Gname in GlyphID])) # 对应关系数量转换 19 | print(GlyphIDNameLists) 20 | DigitalDicts = {str(i): str(GlyphIDNameLists[i - 2]) for i in range(2, len(GlyphIDNameLists)+2)} # 数字对应关系的字典推导式 21 | print(DigitalDicts) 22 | GlyphIDDicts = {str(Gname): DigitalDicts[Gid] for Gid, Gname in GlyphID} # 通过数字对应关系生成源代码跟页面显示的字典推导式 23 | print('-' * 39 + '数字对应关系的字典推导式' + '-' * 39) 24 | print(DigitalDicts) 25 | print('-' * 27 + '通过数字对应关系生成源代码跟页面显示的字典推导式' + '-' * 27) 26 | print(GlyphIDDicts) 27 | -------------------------------------------------------------------------------- /【抖音】无水印视频解析/README.md: -------------------------------------------------------------------------------- 1 | 这是一份抖音无水印单个视频的解析代码 2 | 3 | -------------------------------------------------------------------------------- /【抖音】无水印视频解析/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | # Auth: Zok Email: 362416272@qq.com 3 | # Date: 2020/3/6 4 | 5 | -------------------------------------------------------------------------------- /【抖音】无水印视频解析/parse.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | # Auth: Zok Email: 362416272@qq.com 3 | # Date: 2020/3/6 4 | 5 | import re 6 | import requests 7 | import json 8 | 9 | 10 | class ParseVideo: 11 | 12 | def __init__(self, share): 13 | path = self.get_url(share) 14 | self.url = 'https://v.douyin.com/' + path + '/' 15 | self.headers = { 16 | 'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1', 17 | } 18 | self.session = requests.session() 19 | self.first_url = None 20 | 21 | @staticmethod 22 | def get_url(share_url): 23 | return re.search(r'https://v\.douyin\.com/(.*?)/', share_url).group(1) 24 | 25 | def go_location(self): 26 | response = self.session.get(self.url, headers=self.headers) 27 | self.first_url = response.url 28 | result = re.search(r'itemId: "(.*?)",[\s\S]*?uid: "(.*?)",[\s\S]*?authorName: "(.*?)",[\s\S]*?dytk: "(.*?)"', 29 | response.text) 30 | return result 31 | 32 | def go_message(self, ret): 33 | url = 'https://www.iesdouyin.com/web/api/v2/aweme/iteminfo/?item_ids=' + ret.group(1) + '&dytk=' + ret.group(4) 34 | response = self.session.get(url, headers=self.headers) 35 | json_data = 
json.loads(response.text) 36 | user_id = ret.group(2) 37 | user_name = ret.group(3).encode('utf-8').decode('unicode_escape') 38 | 39 | if json_data.get('status_code') != 0: 40 | print('解析失败') 41 | exit() 42 | item_list = json_data.get('item_list')[0] 43 | aweme_id = item_list.get('aweme_id') 44 | desc = item_list.get('desc') 45 | comment_count = item_list.get('statistics').get('comment_count') 46 | digg_count = item_list.get('statistics').get('digg_count') 47 | 48 | video = item_list.get('video') 49 | cover = video.get('origin_cover').get('url_list')[0] 50 | play_addr = video.get('play_addr_lowbr').get('url_list')[0] 51 | 52 | play_addr_response = self.session.get(play_addr, headers=self.headers, allow_redirects=False) 53 | msg = """ 54 | 用户id:{user_id} 55 | 用户名:{user_name} 56 | 作品id:{aweme_id} 57 | 标题: {desc} 58 | 评论数: {comment_count} 59 | 点赞数: {digg_count} 60 | 封面地址:{cover} 61 | 无水印视频:{addr} 62 | """.format( 63 | user_id=user_id, 64 | user_name=user_name, 65 | aweme_id=aweme_id, 66 | desc=desc, 67 | comment_count=comment_count, 68 | digg_count=digg_count, 69 | cover=cover, 70 | addr=play_addr_response.headers['location'] 71 | ) 72 | print(msg) 73 | 74 | def start(self): 75 | result = self.go_location() 76 | self.go_message(result) 77 | 78 | 79 | if __name__ == '__main__': 80 | # text = '#在抖音,记录美好生活#要逆天!北京地坛医院证实新冠病毒攻击中枢神经系统 https://v.douyin.com/tW7qrw/ 复制此链接,打开【抖音短视频】,直接观看视频!' 81 | text = input('请输入分享链接>>>') 82 | pv = ParseVideo(text) 83 | pv.start() 84 | -------------------------------------------------------------------------------- /【拼多多】登陆参数生成/PinDuoDuo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-07-23 Python: 3.7 4 | 5 | 6 | import execjs.runtime_names 7 | 8 | """ 9 | pip3 install execjs 10 | npm i jsdom -g 11 | """ 12 | 13 | 14 | class PingDuoDuoSpider(object): 15 | """ 16 | 拼多多加密解析 17 | """ 18 | 19 | def __init__(self, password): 20 | # 初始化 21 | print('引擎', execjs.get().name) 22 | self.password = password 23 | 24 | def make(self): 25 | with open("encryp.js", "r", encoding="utf-8") as f: 26 | ctx = execjs.compile(f.read()) 27 | 28 | ret = ctx.call("test", self.password) 29 | print(ret) 30 | 31 | 32 | if __name__ == '__main__': 33 | key = input("输入字符串") 34 | pdd = PingDuoDuoSpider(key) 35 | pdd.make() 36 | 37 | -------------------------------------------------------------------------------- /【拼多多】登陆参数生成/README.md: -------------------------------------------------------------------------------- 1 | # 解密过程参考博客 2 | 3 | [博客链接](https://www.zhangkunzhi.com/archives/67) -------------------------------------------------------------------------------- /【拼多多】登陆参数生成/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-07-23 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /【淘宝】自动登陆/login_for_sina.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-04-11 Python: 3.7 4 | 5 | from selenium import webdriver 6 | from selenium.webdriver.common.by import By 7 | from selenium.webdriver.support.ui import WebDriverWait 8 | from selenium.webdriver.support import expected_conditions as EC 9 | 10 | 11 | class TB_Spider: 12 | 13 | def __init__(self, username, password): 14 | """初始化参数""" 
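# Notes on the setup below:
# - the "prefs" entry sets profile.managed_default_content_settings.images to 2,
#   which blocks image loading and speeds up page rendering;
# - excluding the 'enable-automation' switch hides Chrome's "is being controlled by
#   automated test software" banner, making the session less obviously Selenium-driven;
# - executable_path='./chromedriver' expects a matching ChromeDriver binary in this directory.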
15 | url = 'https://login.taobao.com/member/login.jhtml' 16 | self.url = url 17 | 18 | options = webdriver.ChromeOptions() 19 | # 不加载图片,加快访问速度 20 | options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2}) 21 | # 设置为开发者模式,避免被识别 22 | options.add_experimental_option('excludeSwitches', 23 | ['enable-automation']) 24 | self.browser = webdriver.Chrome(executable_path='./chromedriver', options=options) 25 | self.wait = WebDriverWait(self.browser, 40) 26 | # 初始化用户名 27 | self.username = username 28 | # 初始化密码 29 | self.password = password 30 | 31 | def run(self): 32 | """登陆接口""" 33 | self.browser.get(self.url) 34 | try: 35 | # 这里设置等待:等待输入框 36 | login_element = self.wait.until( 37 | EC.presence_of_element_located((By.CSS_SELECTOR, '.qrcode-login > .login-links > .forget-pwd'))) 38 | login_element.click() 39 | 40 | sina_login = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.weibo-login'))) 41 | sina_login.click() 42 | 43 | weibo_user = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.username > .W_input'))) 44 | weibo_user.send_keys(self.username) 45 | 46 | sina_password = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.password > .W_input'))) 47 | sina_password.send_keys(self.password) 48 | 49 | submit = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '.btn_tip > a > span'))) 50 | submit.click() 51 | 52 | taobao_name = self.wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 53 | '.site-nav-bd > ul.site-nav-bd-l > li#J_SiteNavLogin > div.site-nav-menu-hd > div.site-nav-user > a.site-nav-login-info-nick '))) 54 | # 登陆成功打印提示信息 55 | print("登陆成功:%s" % taobao_name.text) 56 | except Exception: 57 | self.browser.close() 58 | print("登陆失败") 59 | 60 | 61 | if __name__ == "__main__": 62 | name = input("请输入你的微博用户名:") 63 | pas = input("请输入密码:") 64 | spider = TB_Spider(name, pas) 65 | spider.run() 66 | -------------------------------------------------------------------------------- /其他实战/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-11 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【5173网】自动登录/auto_login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-08-26 Python: 3.7 4 | 5 | import re 6 | import requests 7 | import execjs.runtime_names 8 | 9 | 10 | class YX(object): 11 | """ 12 | 易通贷自动登陆 13 | """ 14 | 15 | def __init__(self, user, pwd): 16 | self.user = user 17 | self.pwd = pwd 18 | self.session = requests.session() 19 | self.url = 'https://passport.5173.com/' 20 | self.headers = { 21 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36', 22 | 'Host': 'passport.5173.com', 23 | } 24 | print('引擎', execjs.get().name) 25 | 26 | def make_pwd(self, key): 27 | with open("encryp.js", "r", encoding="utf-8") as f: 28 | ctx = execjs.compile(f.read()) 29 | return ctx.call("make_js", self.pwd, key) 30 | 31 | def make_data(self, token, key): 32 | data = { 33 | 'userName': self.user, 34 | 'password': self.make_pwd(key), 35 | 'mobileNo': '', 36 | 'captcha': '', 37 | 'smsCaptcha': '', 38 | 'category': '', 39 | 'passpod': '', 40 | 'smsLogin': '0', 41 | '__validationToken__': token, 42 | '__validationDna__': 
'', 43 | } 44 | return data 45 | 46 | def login(self): 47 | """start 48 | """ 49 | response = self.session.get(self.url) 50 | info = re.search(r'SecurityToken:"(.*?)",[\s\S]*?PasswordKey:"(.*?)",', response.text) 51 | try: 52 | token = info.group(1) 53 | key = info.group(2) 54 | data = self.make_data(token, key) 55 | result = self.session.post(self.url, data=data, headers=self.headers) 56 | if '5173auth' in str(result.cookies): 57 | print(result.cookies) 58 | print('【登陆成功】') 59 | else: 60 | print('【登陆失败】') 61 | except AttributeError: 62 | print('【获取key失败】') 63 | 64 | 65 | if __name__ == '__main__': 66 | username = input('请输入账号') 67 | password = input('密码') 68 | yx = YX(username, password) 69 | yx.login() 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /其他实战/【5173网】自动登录/logOK.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/其他实战/【5173网】自动登录/logOK.png -------------------------------------------------------------------------------- /其他实战/【9377网】自动登录/9377login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-07-23 Python: 3.7 4 | 5 | import requests 6 | 7 | 8 | class Login9377: 9 | """9377游戏平台自动登陆 10 | """ 11 | 12 | def __init__(self, username, password): 13 | self.headers = { 14 | 'Upgrade-Insecure-Requests': '1', 15 | 'Host': 'wvw.9377.com', 16 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36' 17 | } 18 | if len(password) < 6 or len(username) < 6: 19 | print('请输入正确账号密码!') 20 | exit() 21 | self.username = username 22 | self.password = password 23 | self.login_url = 'http://wvw.9377.com/login.php' 24 | self.host = 'https://www.9377.com/' 25 | self.session = requests.session() 26 | 27 | def login(self): 28 | """登陆 29 | """ 30 | data = { 31 | 'do': 'login', 32 | 'gourl': self.host, 33 | 'login_save': '1', 34 | 'username': self.username, 35 | 'password': self.password 36 | } 37 | self.session.get(self.login_url, headers=self.headers) 38 | result = self.session.post(self.login_url, headers=self.headers, data=data) 39 | self.check(result) 40 | 41 | def check(self, result): 42 | """检测登陆状态 43 | """ 44 | if self.username in str(result.cookies): 45 | print('登陆成功') 46 | else: 47 | print('用户名或密码错误') 48 | 49 | 50 | if __name__ == '__main__': 51 | name = input('输入账号') 52 | word = input('输入密码') 53 | lg = Login9377(name, word) 54 | lg.login() 55 | -------------------------------------------------------------------------------- /其他实战/【9377网】自动登录/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-07-24 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【DNS】自动登录/Login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-08 Python: 3.7 4 | import requests 5 | import re 6 | import execjs.runtime_names 7 | 8 | 9 | class DNS: 10 | def __init__(self, user, pwd): 11 | self.user = user 12 | self.pwd = pwd 13 | self.js = None 14 | self.url = 'https://www.dns.com/login.html' 15 | self.headers = { 16 | 'User-Agent': 
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36', 17 | 'Host': 'www.dns.com' 18 | } 19 | self.read_js() 20 | 21 | def get_token(self): 22 | response = requests.get(self.url, headers=self.headers) 23 | try: 24 | token = re.search(r'name="csrf-token" content="(.*?)">', response.text).group(1) 25 | return token 26 | except AttributeError: 27 | print('token 捕获失败') 28 | 29 | def read_js(self): 30 | with open("dns.js", "r", encoding="utf-8") as f: 31 | self.js = execjs.compile(f.read()) 32 | 33 | def login(self): 34 | data = { 35 | '_token': self.get_token(), 36 | 'password': self.js.call('aes', self.pwd), 37 | 'email': self.js.call('aes', self.user), 38 | 'redirectTo': 'https://www.dns.com/dashboard', 39 | } 40 | response = requests.post(self.url, data=data, headers=self.headers) 41 | print(response) 42 | 43 | 44 | if __name__ == '__main__': 45 | username = input('请输入账号') 46 | password = input('密码') 47 | dns = DNS(username, password) 48 | dns.login() 49 | -------------------------------------------------------------------------------- /其他实战/【DNS】自动登录/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-06 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【GitHub】自动登录/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-06-18 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【GitHub】自动登录/login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-04-11 Python: 3.7 4 | """ 5 | 1. get login html token 6 | 2. 
login 7 | """ 8 | 9 | import requests 10 | from lxml import etree 11 | 12 | 13 | class Login(object): 14 | def __init__(self, username, password): 15 | 16 | self.headers = { 17 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36', 18 | 'Referer': 'https://github.com/', 19 | 'Host': 'github.com' 20 | } 21 | 22 | self.login_url = 'https://github.com/login' 23 | self.post_url = 'https://github.com/session' 24 | self.session = requests.Session() 25 | 26 | self.username = username 27 | self.password = password 28 | 29 | def login_GitHub(self): 30 | """ 31 | 模拟登陆 32 | :return: 33 | """ 34 | 35 | post_data = { 36 | 'commit': 'Sign in', 37 | 'utf8': '✓', 38 | 'authenticity_token': self.get_token(), 39 | 'login': self.username, 40 | 'password': self.password 41 | } 42 | 43 | response = self.session.post(self.post_url, data=post_data, headers=self.headers) 44 | 45 | if response.status_code == 200: 46 | html = etree.HTML(response.content.decode()) 47 | if html.xpath('/html/body/div[1]/header/div[7]/details/summary'): 48 | pro_list = html.xpath('//ul[@class="list-style-none"]/li/div/a/span[2]/text()') 49 | print("登录成功!正在拉取你的所有项目..") 50 | print(pro_list) 51 | 52 | else: 53 | print('账号或密码错误') 54 | else: 55 | print("登录失败!") 56 | 57 | def get_token(self): 58 | """ 59 | 获取token 60 | :return: 61 | """ 62 | 63 | response = self.session.get(self.login_url, headers=self.headers) 64 | html = etree.HTML(response.content.decode()) 65 | 66 | token = html.xpath('//input[@name="authenticity_token"]/@value')[0] 67 | 68 | return token 69 | 70 | 71 | if __name__ == '__main__': 72 | user = input('请输入您的账号: ') 73 | key = input('请输入您的密码: ') 74 | 75 | login = Login(user, key) 76 | login.login_GitHub() 77 | -------------------------------------------------------------------------------- /其他实战/【Glidedsky】自动登陆/login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-25 Python: 3.7 4 | 5 | import requests 6 | import re 7 | import json 8 | 9 | 10 | class Gli: 11 | """ 12 | 自动登陆 Glidedsky 13 | http://www.glidedsky.com/login 14 | """ 15 | 16 | def __init__(self, user, pwd): 17 | self.user = user 18 | self.pwd = pwd 19 | self.url = 'http://www.glidedsky.com/login' 20 | self.session = requests.session() 21 | self.headers = { 22 | 'Host': 'www.glidedsky.com', 23 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36', 24 | } 25 | 26 | def get_token(self): 27 | response = self.session.get(self.url, headers=self.headers) 28 | _token = re.search(r'name="csrf-token" content="(.*?)">', response.text).group(1) 29 | return _token 30 | 31 | def login(self): 32 | data = {'_token': self.get_token(), 'email': self.user, 'password': self.pwd} 33 | self.session.post(self.url, data=data) 34 | # print(self.session.cookies) 35 | cookies = requests.utils.dict_from_cookiejar(self.session.cookies) # cookies 输出 36 | with open('toolkit/cookies.json', 'w', encoding='utf-8') as f: 37 | f.write(json.dumps(cookies)) 38 | # print(cookies) 39 | 40 | 41 | if __name__ == '__main__': 42 | username = input('请输入用户名') 43 | password = input('请输入密码') 44 | g = Gli(username, password) 45 | g.login() 46 | -------------------------------------------------------------------------------- /其他实战/【Python加密库】Demo/__init__.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-07-11 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【TCL金融】自动登录/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-06 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【TCL金融】自动登录/auto_login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-10 Python: 3.7 4 | 5 | import requests 6 | import execjs.runtime_names 7 | 8 | 9 | class SpiderLogin: 10 | """ 11 | TCL 个人金融 12 | https://weixin.tjinsuo.com/#login/mine 13 | """ 14 | 15 | def __init__(self, user, pwd): 16 | self.user = user 17 | self.pwd = pwd 18 | self.js = None 19 | self.url = 'https://weixin.tjinsuo.com/service/user/login' 20 | self.load_js() 21 | print('引擎', execjs.get().name) 22 | 23 | def load_js(self): 24 | """js 调用 25 | """ 26 | with open("encryp.js", "r", encoding="utf-8") as f: 27 | self.js = execjs.compile(f.read()) 28 | 29 | def auto_login(self): 30 | """登陆 31 | """ 32 | ret = self.js.call('make', self.pwd) 33 | rand_key, word = ret.split('||') 34 | print(rand_key, word) 35 | headers = { 36 | 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1', 37 | 'Host': 'weixin.tjinsuo.com', 38 | 'terminalType': 'BEST_WX', 39 | 'Accept': 'application/json' 40 | } 41 | data = 'mobile={user}&password={pwd}&cipherkey=&message=&randKey={rand_key}'.format(user=self.user, 42 | pwd=word, 43 | rand_key=rand_key) 44 | 45 | response = requests.post(self.url, headers=headers, data=data) 46 | print(response.text) 47 | print(response) 48 | 49 | 50 | if __name__ == '__main__': 51 | username = input('请输入账号') 52 | password = input('密码') 53 | wcb = SpiderLogin(username, password) 54 | wcb.auto_login() 55 | -------------------------------------------------------------------------------- /其他实战/【TCL金融】自动登录/ok.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/其他实战/【TCL金融】自动登录/ok.png -------------------------------------------------------------------------------- /其他实战/【steam】自动登录/login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-12-11 Python: 3.7 4 | 5 | import execjs 6 | import requests, json, re 7 | 8 | 9 | def Get_parameters(username): 10 | """steam 登录 只处理了密码加密。其他请自行拓展 11 | :return 公钥和一个参数; 12 | """ 13 | import time 14 | try: 15 | url = "https://store.steampowered.com/login/getrsakey/" 16 | 17 | headers = { 18 | 'User-Agent': 'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14', 19 | 'Host': 'store.steampowered.com', 20 | 'Referer': 'https://store.steampowered.com/login/?redir=&redir_ssl=1', 21 | 'Origin': 'https://store.steampowered.com' 22 | } 23 | data = { 24 | 'donotcache': int(round(time.time() * 1000)), 25 | 'username': username, 26 | } 27 | res = requests.post(url=url, headers=headers, data=data) 28 | publickey_mod = 
json.loads(res.text).get('publickey_mod') 29 | publickey_exp = json.loads(res.text).get('publickey_exp') 30 | return publickey_mod, publickey_exp 31 | 32 | except Exception as err: 33 | print('访问失败', err) 34 | 35 | 36 | def main(pwd, publickey_mod, publickey_exp): 37 | """ 38 | :param pwd: 39 | :param publickey_mod: 40 | :param publickey_exp: 41 | :return sign: 42 | """ 43 | with open('execute.js', 'r', encoding='utf-8') as f: 44 | js = execjs.compile(f.read()) 45 | print('引擎', execjs.get().name) 46 | sign = js.call('get_pwd', pwd, publickey_mod, publickey_exp) 47 | return sign 48 | 49 | 50 | if __name__ == '__main__': 51 | username = input('请输入账户:') 52 | pwd = input('请输入密码:') 53 | publickey_mod, publickey_exp = Get_parameters(username) 54 | sign = main(pwd, publickey_mod, publickey_exp) 55 | print(sign) 56 | -------------------------------------------------------------------------------- /其他实战/【万创帮】自动登录/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-06 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【万创帮】自动登录/login_ok.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/其他实战/【万创帮】自动登录/login_ok.png -------------------------------------------------------------------------------- /其他实战/【万创帮】自动登录/spider_login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-03 Python: 3.7 4 | 5 | import json 6 | import requests 7 | import execjs.runtime_names 8 | 9 | 10 | class SpiderLogin: 11 | """ 12 | 万创帮爬虫登陆 13 | """ 14 | 15 | def __init__(self, user, pwd): 16 | self.user = user 17 | self.pwd = pwd 18 | self.url = 'https://m.wcbchina.com/login/other-login.html' 19 | print('引擎', execjs.get().name) 20 | 21 | def use_js(self): 22 | """js 调用 23 | """ 24 | with open("encryp.js", "r", encoding="utf-8") as f: 25 | js = execjs.compile(f.read()) 26 | 27 | try: 28 | sign, t = js.call("make_sigin") 29 | pwd = js.call("make_pwd", self.pwd) 30 | return sign, t, pwd 31 | except Exception: 32 | print('异常数据') 33 | 34 | def auto_login(self): 35 | """登陆 36 | """ 37 | sign, t, pwd = self.use_js() 38 | headers = { 39 | 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1', 40 | 'Referer': 'https://m.wcbchina.com/login/other-login.html' 41 | } 42 | pay_load = { 43 | 'auth': {'sign': sign, 'timestamp': t}, 44 | 'password': self.user, 45 | 'username': pwd 46 | } 47 | 48 | response = requests.post(self.url, headers=headers, data=json.dumps(pay_load)) 49 | print(response.cookies) 50 | print(response) 51 | 52 | 53 | if __name__ == '__main__': 54 | username = input('请输入账号') 55 | password = input('密码') 56 | wcb = SpiderLogin(username, password) 57 | wcb.auto_login() 58 | -------------------------------------------------------------------------------- /其他实战/【中关村】自动登录/README.md: -------------------------------------------------------------------------------- 1 | # 解密过程博客说明 2 | 3 | https://www.zhangkunzhi.com/?p=135 -------------------------------------------------------------------------------- /其他实战/【中关村】自动登录/login.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-07-29 Python: 3.7 4 | 5 | 6 | import requests 7 | import hashlib 8 | import time 9 | import json 10 | 11 | from urllib import parse 12 | 13 | 14 | class ZGC: 15 | """ 16 | 解析过程说明 https://www.zhangkunzhi.com/?p=135 17 | 18 | 1. 用的 CryptoJS md5 加密 19 | 2. 需要带入 cookies 20 | """ 21 | 22 | def __init__(self, username, password): 23 | self.username = username 24 | self.password = password 25 | self.headers = { 26 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36', 27 | } 28 | 29 | def get_cookies(self): 30 | """取cookies 31 | """ 32 | _now = time.time() 33 | t = str(_now)[:7] 34 | _jsonp = int(round(_now * 1000)) 35 | pick = 'http://js.zol.com.cn/pvn/pv.ht?&t={t}&c=&callback=_jsonp{_jsonp}'.format(t=t, _jsonp=_jsonp) 36 | try: 37 | content = requests.get(pick, headers=self.headers).text 38 | ipck = json.loads(content[content.find('(')+1:-1]).get('ipck') 39 | return parse.quote(ipck) 40 | except: 41 | print('cookies 获取失败') 42 | 43 | def login(self, ipck): 44 | """登陆 45 | """ 46 | _str_now = str(int(time.time())) 47 | login_url = 'http://service.zol.com.cn/user/ajax/login2014/login.php' 48 | data = { 49 | 'userid': self.username, 50 | 'pwd': self.make_md5(self.password), 51 | 'is_auto': '1', 52 | 'backUrl': 'http://www.zol.com.cn/' 53 | } 54 | cookies = { 55 | 'Hm_lpvt_ae5edc2bc4fc71370807f6187f0a2dd0': _str_now, 56 | 'Hm_lvt_ae5edc2bc4fc71370807f6187f0a2dd0': _str_now, 57 | 'ip_ck': ipck, 58 | 'vn': '1', 59 | 'lv': _str_now, 60 | 'z_pro_city': 's_provice%3Dzhongqing%26s_city%3Dzhongqing', 61 | 'z_day': 'ixgo20%3D1' 62 | } 63 | 64 | response = requests.post(login_url, headers=self.headers, data=data, cookies=cookies) 65 | msg = json.loads(response.content) 66 | return msg 67 | 68 | @staticmethod 69 | def make_md5(_str): 70 | """md5 生成 71 | """ 72 | # 待加密信息 73 | text = _str + 'zol' 74 | # 创建md5对象 75 | m = hashlib.md5() 76 | m.update(text.encode(encoding='utf-8')) 77 | str_md5 = m.hexdigest() 78 | return str_md5 79 | 80 | def main(self): 81 | ipck = self.get_cookies() 82 | msg = self.login(ipck) 83 | print(msg) 84 | 85 | 86 | if __name__ == '__main__': 87 | user = input('请输入中关村账号') 88 | pwd = input('请输入中关村密码') 89 | zgc = ZGC(user, pwd) 90 | zgc.main() 91 | -------------------------------------------------------------------------------- /其他实战/【京东】商品数据爬取/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-12-10 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【京东】商品数据爬取/geckodriver: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/其他实战/【京东】商品数据爬取/geckodriver -------------------------------------------------------------------------------- /其他实战/【京东】商品数据爬取/selenium抓取.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-04-11 Python: 3.7 4 | 5 | from selenium import webdriver 6 | from selenium.webdriver.common.keys import Keys # 键盘按键操作 7 | import time 8 | 9 | 10 | def get_goods(driver): 11 | try: 12 | goods = 
driver.find_elements_by_class_name('gl-item') 13 | 14 | for good in goods: 15 | detail_url = good.find_element_by_tag_name('a').get_attribute('href') 16 | 17 | p_name = good.find_element_by_css_selector('.p-name em').text.replace('\n', '') 18 | price = good.find_element_by_css_selector('.p-price i').text 19 | p_commit = good.find_element_by_css_selector('.p-commit a').text 20 | 21 | msg = ''' 22 | 商品 : %s 23 | 链接 : %s 24 | 价钱 :%s 25 | 评论 :%s 26 | ''' % (p_name, detail_url, price, p_commit) 27 | 28 | print(msg, end='\n\n') 29 | 30 | button = driver.find_element_by_partial_link_text('下一页') 31 | button.click() 32 | time.sleep(1) 33 | get_goods(driver) 34 | except Exception: 35 | pass 36 | 37 | 38 | def spider(url, keyword): 39 | driver = webdriver.Firefox() 40 | driver.get(url) 41 | driver.implicitly_wait(3) # 使用隐式等待 42 | try: 43 | input_tag = driver.find_element_by_id('key') 44 | input_tag.send_keys(keyword) 45 | input_tag.send_keys(Keys.ENTER) 46 | get_goods(driver) 47 | finally: 48 | driver.close() 49 | 50 | 51 | if __name__ == '__main__': 52 | spider('https://www.jd.com/', keyword='手机') 53 | -------------------------------------------------------------------------------- /其他实战/【人人网】自动登录/login.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | import re 4 | import execjs.runtime_names 5 | 6 | 7 | class People: 8 | def __init__(self, user, pwd): 9 | """ 10 | 初始化 11 | :param user: 用户名 12 | :param pwd: 密码 13 | """ 14 | self.username = user 15 | self.pwd = pwd 16 | self.ua = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36' 17 | self.headers = { 18 | 'User-Agent': self.ua, 19 | 'Host': 'www.renren.com', 20 | } 21 | self.session = requests.session() 22 | self.json_data = '' 23 | 24 | print('【JS引擎】', execjs.get().name) 25 | with open("enc.js", "r", encoding="utf-8") as f: 26 | self.js = execjs.compile(f.read()) 27 | 28 | def to_index(self): 29 | """ 30 | 第一步 - 访问首页 31 | 获取 Cookies 32 | :return: 33 | """ 34 | response = self.session.get('http://www.renren.com/', headers=self.headers) 35 | print('【主页】', response) 36 | 37 | def get_key(self): 38 | """ 39 | 第二步 - 获取加密参数 40 | 获取 rkey 以及 密码加密所需参数 41 | :return: 42 | """ 43 | headers = { 44 | 'Referer': 'http://login.renren.com/ajaxproxy.htm', 45 | 'User-Agent': self.ua, 46 | } 47 | response = self.session.get('http://login.renren.com/ajax/getEncryptKey', headers=headers) 48 | print('【获取key】', response.text) 49 | return response.text 50 | 51 | def login(self, key_info): 52 | """ 53 | 第三步 - 登录账号 54 | :param key_info: 第二步获取的参数 55 | :return: 56 | """ 57 | url = 'http://www.renren.com/ajaxLogin/login?1=1' + self.js.call('getTime') 58 | data = { 59 | 'email': self.username, 60 | 'icode': "", 61 | 'origURL': 'http://www.renren.com/home', 62 | 'domain': 'renren.com', 63 | 'key_id': '1', 64 | 'captcha_type': 'web_login', 65 | 'password': self.get_password(key_info), 66 | 'rkey': json.loads(key_info).get('rkey'), 67 | 'f': '' 68 | } 69 | print('【登录data】', data) 70 | print('【登录URL】', url) 71 | print('【Cookies】', self.session.cookies) 72 | response = self.session.post(url, data=data, headers=self.headers) 73 | print('【返回信息】', response.text) 74 | response = self.session.get('http://www.renren.com/home', headers=self.headers) 75 | print('【登录信息】', re.findall("<title>(.*?)</title>", response.text)) 76 | 77 | def get_password(self, key_info): 78 | """ 79 | 调用 js 代码生成参数 80 | :param key_info: 81 | :return: 82 | """ 83 | return self.js.call('enc', 
key_info, self.pwd) 84 | 85 | def start(self): 86 | """ 87 | 启动 88 | :return: 89 | """ 90 | self.to_index() 91 | self.login(self.get_key()) 92 | 93 | 94 | if __name__ == '__main__': 95 | """ 96 | 启动区域 97 | """ 98 | username = input('用户名>>> ') 99 | password = input('密码>>> ') 100 | pp = People(username, password) 101 | pp.start() 102 | -------------------------------------------------------------------------------- /其他实战/【企业名片】企业查询/qi_ming.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-11-08 Python: 3.7 4 | 5 | import requests 6 | import json 7 | import execjs.runtime_names 8 | 9 | 10 | with open('encryp.js', 'r', encoding='utf-8') as f: 11 | js = execjs.compile(f.read()) 12 | 13 | print('引擎', execjs.get().name) 14 | 15 | data = { 16 | 'time_interval': '', 17 | 'tag': '', 18 | 'tag_type': '', 19 | 'province': '', 20 | 'lunci': '', 21 | 'page': '1', 22 | 'num': '20', 23 | 'unionid': '', 24 | } 25 | 26 | headers = { 27 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.97 Safari/537.36' 28 | } 29 | 30 | response = requests.post('https://vipapi.qimingpian.com/DataList/productListVip', data=data, headers=headers) 31 | 32 | re_data = json.loads(response.text) 33 | 34 | data = js.call('get_info', re_data.get('encrypt_data')) 35 | print(data.encode('utf-8').decode('unicode_escape')) 36 | 37 | -------------------------------------------------------------------------------- /其他实战/【国鑫所】自动登录/Login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-10 Python: 3.7 4 | import execjs.runtime_names 5 | import requests 6 | 7 | 8 | class GuoXin: 9 | """ 10 | 国鑫所 11 | https://wechat.gclfax.com/html/register/login.html 12 | """ 13 | 14 | def __init__(self, user, pwd): 15 | self.user = user 16 | self.pwd = pwd 17 | self.url = 'https://wechat.gclfax.com/client/index.php' 18 | self.js = None 19 | self.init_js() 20 | 21 | def init_js(self): 22 | print('引擎', execjs.get().name) 23 | with open("encryp.js", "r", encoding="utf-8") as f: 24 | self.js = execjs.compile(f.read()) 25 | 26 | def login(self): 27 | headers = { 28 | 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1', 29 | 'Host': 'wechat.gclfax.com', 30 | 'Origin': 'https://wechat.gclfax.com', 31 | 'Referer': 'https://wechat.gclfax.com/html/register/login.html' 32 | } 33 | data = { 34 | 'OPT': '1', 35 | 'name': self.user, 36 | 'pwd': self.js.call('test', self.pwd), 37 | 'randomId': '', 38 | 'code': '', 39 | 'openid': '', 40 | } 41 | response = requests.post(self.url, headers=headers, data=data) 42 | print(response.text) 43 | print(response) 44 | 45 | 46 | if __name__ == '__main__': 47 | username = input('用户名') 48 | password = input('密码') 49 | gxs = GuoXin(username, password) 50 | gxs.login() 51 | -------------------------------------------------------------------------------- /其他实战/【国鑫所】自动登录/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-06 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【国鑫所】自动登录/login_ok.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/其他实战/【国鑫所】自动登录/login_ok.png -------------------------------------------------------------------------------- /其他实战/【天眼查】模拟登录/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-06-18 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【天眼查】模拟登录/login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-04-13 Python: 3.7 4 | import time 5 | 6 | from lxml import etree 7 | from selenium import webdriver 8 | from selenium.webdriver.common.by import By 9 | from selenium.webdriver.support.ui import WebDriverWait 10 | from selenium.webdriver.support import expected_conditions as EC 11 | 12 | 13 | class TYC_Spider: 14 | 15 |     def __init__(self, username, password): 16 |         """初始化参数""" 17 |         url = 'https://www.tianyancha.com/login' 18 |         page_url = 'https://www.tianyancha.com/search/ohp1/p{page}?base=cq' 19 |         self.page_url = page_url 20 |         self.page = 1  # 当前页数 21 |         self.url = url 22 | 23 |         options = webdriver.ChromeOptions() 24 |         # 不加载图片,加快访问速度 25 |         # options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2}) 26 |         # 设置为开发者模式,避免被识别 27 |         options.add_experimental_option('excludeSwitches', 28 |                                         ['enable-automation']) 29 |         self.browser = webdriver.Chrome(executable_path='./chromedriver', options=options) 30 |         self.wait = WebDriverWait(self.browser, 40) 31 |         # 初始化用户名 32 |         self.username = username 33 |         # 初始化密码 34 |         self.password = password 35 | 36 |     def run(self): 37 |         """登陆接口""" 38 |         self.browser.get(self.url) 39 |         try: 40 |             use_pass = self.wait.until( 41 |                 EC.presence_of_element_located((By.XPATH, '//*[@id="web-content"]/div/div[2]/div/div[2]/div/div[3]/div[1]/div[2]'))) 42 |             time.sleep(2) 43 |             use_pass.click() 44 |             username = self.wait.until( 45 |                 EC.presence_of_element_located((By.XPATH, '//*[@id="web-content"]/div/div[2]/div/div[2]/div/div[3]/div[2]/div[2]/input'))) 46 |             password = self.wait.until( 47 |                 EC.presence_of_element_located( 48 |                     (By.XPATH, '//*[@id="web-content"]/div/div[2]/div/div[2]/div/div[3]/div[2]/div[3]/input'))) 49 |             input_to = self.wait.until( 50 |                 EC.presence_of_element_located( 51 |                     (By.XPATH, '//*[@id="web-content"]/div/div[2]/div/div[2]/div/div[3]/div[2]/div[5]'))) 52 |             username.send_keys(self.username) 53 |             password.send_keys(self.password) 54 |             input_to.click() 55 | 56 |             self.wait.until( 57 |                 EC.presence_of_element_located((By.XPATH, '//*[@id="home-main-search"]'))) 58 |             print('登陆成功') 59 |             self.go_page() 60 | 61 |         except Exception: 62 |             self.browser.close() 63 |             print("登陆失败") 64 | 65 |     def go_page(self): 66 |         """进入指定页面""" 67 |         self.browser.get(self.page_url.format(page=str(self.page+1)))  # ohp带电话 68 |         self.get_info(); self.page += 1  # 抓完当前页后页码递增,避免反复抓取同一页 69 |         self.go_page() 70 | 71 |     def get_info(self): 72 |         """获取当前页面,企业名称+电话号码""" 73 |         html = self.browser.page_source 74 |         etr = etree.HTML(html) 75 |         divs = etr.xpath("//div[@class='search-item sv-search-company']") 76 |         for div in divs: 77 |             title = div.xpath('./div/div[3]/div[1]/a/text()') 78 |             phone = div.xpath('./div/div[3]/div[3]/div[1]/script/text()') 79 |             if not phone: 80 |                 phone = div.xpath('./div/div[3]/div[3]/div[1]/span[2]/span/text()') 81 | 82 |                 if
not phone: 83 |                     phone = div.xpath('./div/div[3]/div[4]/div[1]/script/text()') 84 |             print(title, phone) 85 |         time.sleep(2) 86 | 87 | 88 | if __name__ == "__main__": 89 |     name = input("请输入你的天眼查用户名:") 90 |     pas = input("请输入密码:") 91 |     spider = TYC_Spider(name, pas) 92 |     spider.run() 93 | -------------------------------------------------------------------------------- /其他实战/【天翼】登录/login.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | # Auth: Zok  Email: 362416272@qq.com 3 | # Date: 2020/1/23 4 | 5 | 6 | import requests 7 | import re 8 | import execjs 9 | 10 | 11 | session = requests.session() 12 | UA = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36' 13 | 14 | 15 | def login(username, password): 16 |     with open('v1.js', 'r', encoding='utf-8') as f: 17 |         js = execjs.compile(f.read()) 18 |     username = js.call('make', username) 19 |     password = js.call('make', password) 20 |     url = 'https://e.189.cn/index.do' 21 |     login_url = 'https://open.e.189.cn/api/logbox/oauth2/loginSubmit.do' 22 |     response = session.get(url, headers={"User-Agent": UA}) 23 |     ret = re.search(r'sign=(.*?)&appId=(.*?)&paras=(.*?)&format=(.*?)&clientType=(.*?)&version=(.*?)">', response.text) 24 | 25 |     url = 'https://open.e.189.cn/api/logbox/oauth2/unifyAccountLogin.do?sign=' + ret.group(1) + '&appId=' + ret.group( 26 |         2) + '&paras=' + ret.group(3) + '&format=' + ret.group(4) + '&clientType=' + ret.group( 27 |         5) + '&version=' + ret.group(6) 28 | 29 |     response = session.get(url, headers={"User-Agent": UA}) 30 |     text = response.text 31 | 32 |     captchaToken = re.search(r"captchaToken' value='(.*?)'>", text).group(1) 33 | 34 |     ret = re.search(r"clientType = '(.*?)'[\s\S]*?accountType = '(.*?)'[\s\S]*?appKey = '(.*?)'", text) 35 |     clientType = ret.group(1) 36 |     accountType = ret.group(2) 37 |     appKey = ret.group(3) 38 | 39 |     paramId = re.search(r'paramId = "(.*?)"', text).group(1) 40 |     REQID = re.search(r'reqId = "(.*?)"', text).group(1) 41 |     lt = re.search(r'lt = "(.*?)"', text).group(1) 42 | 43 |     headers = { 44 |         'User-Agent': UA, 45 |         'Host': 'open.e.189.cn', 46 |         'Origin': 'https://open.e.189.cn', 47 |         'Referer': url, 48 |         'REQID': REQID, 49 |         'lt': lt, 50 |     } 51 |     data = { 52 |         'appKey': appKey, 53 |         'accountType': accountType, 54 |         'validateCode': "",  # 验证码 55 |         'captchaToken': captchaToken, 56 |         'returnUrl': 'https://e.189.cn/user/loginMiddle.do?returnUrlMid=https://e.189.cn/user/index.do', 57 |         'mailSuffix': '', 58 |         'dynamicCheck': 'FALSE', 59 |         'clientType': clientType, 60 |         'cb_SaveName': '1', 61 |         'isOauth2': 'false', 62 |         'state': '', 63 |         'paramId': paramId, 64 |         'userName': username, 65 |         'password': password, 66 |     } 67 |     response = session.post(login_url, headers=headers, data=data) 68 | 69 |     # print(data) 70 |     print(response.text) 71 | 72 | 73 | print(execjs.get().name) 74 | if execjs.get().name != 'Node.js (V8)': 75 |     print('请安装V8 引擎') 76 | 77 | if __name__ == '__main__': 78 |     user = input('用户名>>>') 79 |     pwd = input('密码>>>') 80 |     login(user, pwd) 81 | -------------------------------------------------------------------------------- /其他实战/【好莱客】参数解析/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-06 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【好莱客】参数解析/holike.py:
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-07 Python: 3.7 4 | import execjs.runtime_names 5 | import requests 6 | import time 7 | import re 8 | 9 | 10 | class MakeParam: 11 | """ 12 | 好莱客 13 | http://oa.holike.com/login.jsp 14 | """ 15 | 16 | def __init__(self, name, pwd): 17 | self.name = name 18 | self.pwd = pwd 19 | self.js = None 20 | 21 | self.read_js() 22 | 23 | def get_key_vi(self): 24 | url = 'http://oa.holike.com/resource/js/session.jsp?_={t}&s_ajax=true' 25 | headers = { 26 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36' 27 | } 28 | response = requests.get(url.format(t=int(round(time.time() * 1000))), headers=headers) 29 | try: 30 | ret = re.search(r'return "(.*?)";', response.text).group(1) 31 | _key = self.js.call('get_key_iv', ret) 32 | return _key 33 | except AttributeError: 34 | print('获取key失败') 35 | 36 | def read_js(self): 37 | with open('encryp.js', 'r', encoding='utf-8') as f: 38 | self.js = execjs.compile(f.read()) 39 | 40 | def make_params(self): 41 | obj = self.get_key_vi() 42 | j_password = self.js.call("make_j_password", self.pwd, obj.get('security'), obj.get('key'), obj.get('iv')) 43 | 44 | msg = """ 45 | j_username: {user} 46 | j_password: {j_password} 47 | """.format(user=self.name, j_password=j_password) 48 | print(msg) 49 | 50 | 51 | if __name__ == '__main__': 52 | username = input('请输入用户名') 53 | password = input('请输入密码') 54 | hk = MakeParam(username, password) 55 | hk.make_params() 56 | -------------------------------------------------------------------------------- /其他实战/【好莱客】参数解析/ok.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/其他实战/【好莱客】参数解析/ok.png -------------------------------------------------------------------------------- /其他实战/【小牛在线】登录参数生成/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-06 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【小牛在线】登录参数生成/make_param.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-29 Python: 3.7 4 | 5 | import execjs.runtime_names 6 | 7 | """ 8 | 小牛在线,登陆密码参数解密 9 | https://www.xiaoniu88.com/user/login 10 | """ 11 | 12 | 13 | def init_js(): 14 | with open("encryp.js", "r", encoding="utf-8") as f: 15 | return execjs.compile(f.read()) 16 | 17 | 18 | def make_param(password): 19 | js = init_js() 20 | pwd = js.call('get_pwd', password) 21 | print('加密后密码', pwd) 22 | 23 | 24 | if __name__ == '__main__': 25 | password = input('明文密码') 26 | make_param(password) 27 | -------------------------------------------------------------------------------- /其他实战/【开鑫贷】登陆参数生成/KaiXinDai.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-15 Python: 3.7 4 | import requests 5 | import execjs.runtime_names 6 | 7 | 8 | class KaiXinDai: 9 | """ 10 | 开鑫贷登陆参数解密 11 | https://www.gkkxd.com/userAuth/login 12 | """ 13 | def __init__(self, pwd): 14 | self.js = None 15 | 
self.pwd = pwd 16 | self.init_js() 17 | 18 | @staticmethod 19 | def get_dl(): 20 | from lxml import etree 21 | url = 'https://www.kxjf.com/user/login?mainSiteName=kxd' 22 | headers = { 23 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36', 24 | 'Host': 'www.kxjf.com', 25 | 'Referer': 'https://www.gkkxd.com/userAuth/login', 26 | } 27 | response = requests.get(url, headers=headers) 28 | etree = etree.HTML(response.text) 29 | dlmy = etree.xpath('//*[@id="dlmy"]/@value')[0] 30 | return dlmy 31 | 32 | def init_js(self): 33 | with open('encryp.js', 'r', encoding='utf-8') as f: 34 | self.js = execjs.compile(f.read()) 35 | 36 | def make_param(self): 37 | pwd = self.js.call('test', self.get_dl(), self.pwd) 38 | print('pwd生成', pwd) 39 | 40 | 41 | if __name__ == '__main__': 42 | password = input('请输入用户密码') 43 | kxd = KaiXinDai(password) 44 | kxd.make_param() 45 | -------------------------------------------------------------------------------- /其他实战/【开鑫贷】登陆参数生成/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-06 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【微信】登录参数生成/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-10 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【微信】登录参数生成/make_pwd.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-08-22 Python: 3.7 4 | import execjs.runtime_names 5 | 6 | 7 | class WeXin(object): 8 | """ 9 | wx 登陆密码解析 10 | """ 11 | 12 | def __init__(self): 13 | self.url = 'https://mp.weixin.qq.com/?token=&lang=zh_CN' 14 | print('引擎', execjs.get().name) 15 | 16 | @staticmethod 17 | def make_pwd(pwd): 18 | with open("encryp.js", "r", encoding="utf-8") as f: 19 | ctx = execjs.compile(f.read()) 20 | 21 | ret = ctx.call("make_pwd", pwd) 22 | print(ret) 23 | 24 | 25 | if __name__ == '__main__': 26 | pdd = WeXin() 27 | pdd.make_pwd('密码') 28 | 29 | -------------------------------------------------------------------------------- /其他实战/【房价】房价获取/README.md: -------------------------------------------------------------------------------- 1 | # 概述 2 | 这不是一个完整的项目,是测试demo,可以获取区域内在售房产单套价格 3 | 4 | 5 | 6 | **代码只是测试了一个最新销售小区中的一栋楼的在售楼房价格** 7 | 8 | 如果需要更完整的,就联系作者 -------------------------------------------------------------------------------- /其他实战/【房价】房价获取/__pycache__/util.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/其他实战/【房价】房价获取/__pycache__/util.cpython-37.pyc -------------------------------------------------------------------------------- /其他实战/【房价】房价获取/util.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | # Auth: Zok Email: 362416272@qq.com 3 | # Date: 2020/2/21 4 | 5 | 6 | from pyDes import * 7 | import base64 8 | 9 | KEY = b'hjkiuy6754edxc32890tfhjkw23xdea'[:24] # 密钥只需要24位 10 | IV = b'jhf5632s' 11 | 12 | 13 | def des3_encrypt(s): 14 | """ 15 | 3DES 加密 16 | :param s: 原始字符串 17 | :return: 
加密后字符串,16进制 18 | """ 19 | k = triple_des(KEY, CBC, IV, pad=None, padmode=PAD_PKCS5) 20 | en = k.encrypt(s, padmode=PAD_PKCS5) 21 | return base64.b64encode(en).decode('utf-8') 22 | 23 | 24 | def des3_decrypt(s): 25 | """ 26 | 3DES 解密 27 | :param s: 加密字符串 28 | :return: 明文 29 | """ 30 | _str = base64.b64decode(s) 31 | k = triple_des(KEY, CBC, IV, pad=None, padmode=PAD_PKCS5) 32 | en = k.decrypt(_str, padmode=PAD_PKCS5).decode('utf-8') 33 | return en 34 | 35 | 36 | def decrypt_str(s): 37 | info = des3_decrypt(s) # 获得 解密后得 base64 38 | content = info[:-6] 39 | hIndex = base64.b64decode(info[-6:].replace("==", "")).decode().split("_") 40 | content2 = content[int(hIndex[0]):] 41 | txt = base64.b64decode( 42 | content2[: len(content2)-int(hIndex[1])][::-1] 43 | ).decode('utf-8').replace("##", "").replace("{@mk7}", "") 44 | return txt 45 | 46 | 47 | def make_str(enB): 48 | """ 49 | 复写字符串算法 50 | 51 | 根据传入文档,转换ascii并计算和 52 | 并复写算法 53 | for (byte item : enB.getBytes("UTF-8")) { 54 | sumResult = Long.valueOf(sumResult.longValue() + ((long) item)); 55 | } 56 | """ 57 | count = 0 58 | for i in enB: 59 | count += ord(i) 60 | # print('合', count) # 每个字符的 Ascii 码的总和 61 | p = count % len(enB) 62 | n = 1 63 | # print('position', p) 64 | while p + n < len(enB) and p - n >= 0: 65 | enB = rep( 66 | rep(enB, p + n, enB[p - n]), 67 | p - n, 68 | enB[p + n] 69 | ) 70 | n += 1 71 | return enB 72 | 73 | 74 | def rep(source, index, rep_str): 75 | """ 76 | 复写的java层字符转换方法 77 | :return: 78 | """ 79 | str1 = source[0: index] 80 | return str1 + rep_str + source[index + 1:] 81 | 82 | 83 | if __name__ == '__main__': 84 | decrypt_str("AaDaKV8GxE77rIScVyq7E0rebiFQjhrkq8PUcmR8A22NHhAW58pQkQ==") 85 | -------------------------------------------------------------------------------- /其他实战/【房天下】自动登录/login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-26 Python: 3.7 4 | 5 | import execjs.runtime_names 6 | import requests 7 | 8 | 9 | class Fang: 10 | """ 11 | 房天下自动登陆 12 | https://passport.fang.com/ 13 | """ 14 | 15 | def __init__(self, user, pwd): 16 | self.user = user 17 | self.pwd = pwd 18 | self.js = None 19 | self.api = 'https://passport.fang.com/login.api' 20 | self.js_init() 21 | 22 | def js_init(self): 23 | print('引擎', execjs.get().name) 24 | with open("encryp.js", "r", encoding="utf-8") as f: 25 | self.js = execjs.compile(f.read()) 26 | 27 | def login(self): 28 | data = { 29 | 'uid': self.user, 30 | 'pwd': self.js.call('getPwd', self.pwd), 31 | 'Service': 'soufun-passport-web', 32 | 'AutoLogin': '1' 33 | } 34 | headers = { 35 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36', 36 | 'Origin': 'https://passport.fang.com', 37 | 'Referer': 'https://passport.fang.com/', 38 | } 39 | response = requests.post(self.api, data=data, headers=headers) 40 | print(response.text) 41 | print(response.cookies) 42 | 43 | 44 | if __name__ == '__main__': 45 | username = input('输入房天下账号') 46 | password = input('输入密码') 47 | f = Fang(username, password) 48 | f.login() 49 | -------------------------------------------------------------------------------- /其他实战/【房天下】自动登录/ok.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/其他实战/【房天下】自动登录/ok.png 
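A hedged usage sketch for the helpers in 其他实战/【房价】房价获取/util.py above; it assumes pyDes is installed and that the snippet runs from that directory, and the input strings are made-up test values rather than real site data:

```python
# Sketch only: exercises util.py's 3DES helpers and the make_str() swap routine.
# Assumptions: `pip install pyDes`, working directory is 【房价】房价获取/.
from util import des3_encrypt, des3_decrypt, make_str

# 3DES-CBC with PKCS5 padding, then Base64; decrypting restores the plaintext.
assert des3_decrypt(des3_encrypt('hello')) == 'hello'

# make_str('abcd'): ASCII sum is 394, p = 394 % 4 = 2, and the loop mirror-swaps
# characters around index 2, so index 1 and index 3 exchange: 'abcd' -> 'adcb'.
assert make_str('abcd') == 'adcb'
```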
-------------------------------------------------------------------------------- /其他实战/【新浪微博】密码解密/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-26 Python: 3.7 4 | 5 | 6 | import execjs 7 | import requests 8 | import json 9 | import re 10 | 11 | 12 | def Get_parameters(): 13 | """微博加密参数有两个 用户名和密码 14 | 用户名为 base64加密 15 | 此处只解决了密码加密问题 其他的请自行拓展 16 | pubkey,time,nonce 17 | :return pubkey,time,nonce 18 | """ 19 | try: 20 | url = "https://login.sina.com.cn/sso/prelogin.php?entry=weibo&callback=sinaSSOController.preloginCallBack&su=MTc3MjM1NzI1OTA%3D&rsakt=mod&checkpin=1&client=ssologin.js(v1.4.19)&_=1574300620782" 21 | 22 | headers = { 23 | 'User-Agent': 'Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.14', 24 | 'Host': 'login.sina.com.cn', 25 | 'Referer': 'https://www.weibo.com/login.php', 26 | } 27 | 28 | res = requests.get(url=url, headers=headers) 29 | data = re.findall('sinaSSOController.preloginCallBack\((.*?)\)', res.text, re.S)[0] 30 | new_data = json.loads(data) 31 | time = new_data.get('servertime') 32 | nonce = new_data.get('nonce') 33 | pubkey = new_data.get('pubkey') 34 | return pubkey, time, nonce 35 | except Exception as err: 36 | print('访问失败', err) 37 | 38 | 39 | def main(pwd): 40 | """ 41 | :param pwd: 42 | :return: 43 | """ 44 | with open('execute.js', 'r', encoding='utf-8') as f: 45 | js = execjs.compile(f.read()) 46 | 47 | print('引擎', execjs.get().name) 48 | publickey, time, nonce = Get_parameters() 49 | sign = js.call('get_up', pwd, publickey, time, nonce) 50 | return sign 51 | 52 | 53 | if __name__ == '__main__': 54 | pwd = input('请输入密码:') 55 | sign = main(pwd) 56 | print(sign) 57 | -------------------------------------------------------------------------------- /其他实战/【时光网】登陆参数生成/login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-11 Python: 3.7 4 | import execjs.runtime_names 5 | 6 | 7 | class MTime: 8 | """ 9 | 时光网登陆,password 加密解析 10 | https://m.mtime.cn/#!/member/signin 11 | """ 12 | def __init__(self, name, pwd): 13 | self.name = name 14 | self.pwd = pwd 15 | self.url = 'https://m.mtime.cn/Service/callback-comm.mi/user/login.api' 16 | self.js = None 17 | self.init_js() 18 | 19 | def init_js(self): 20 | print('引擎', execjs.get().name) 21 | with open("encryp.js", "r", encoding="utf-8") as f: 22 | self.js = execjs.compile(f.read()) 23 | 24 | def make_pwd(self): 25 | print(self.js.call('get_pwd', self.pwd)) 26 | 27 | 28 | if __name__ == '__main__': 29 | username = input('请输入用户名') 30 | password = input('输入密码') 31 | mt = MTime(username, password) 32 | mt.make_pwd() 33 | -------------------------------------------------------------------------------- /其他实战/【易通贷】自动登录/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-06 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【易通贷】自动登录/auto_login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-08-26 Python: 3.7 4 | 5 | import requests 6 | import execjs.runtime_names 7 | 8 | 9 | class YDT(object): 10 | """ 11 | 易通贷自动登陆 12 | """ 13 | 14 | def __init__(self, user, 
pwd): 15 | self.user = user 16 | self.pwd = pwd 17 | self.url = 'https://app.etongdai.com/login/verifylogin' 18 | print('引擎', execjs.get().name) 19 | 20 | @staticmethod 21 | def make_pwd(pwd): 22 | with open("encryp.js", "r", encoding="utf-8") as f: 23 | ctx = execjs.compile(f.read()) 24 | return ctx.call("make_js", pwd) 25 | 26 | def make_data(self): 27 | data = { 28 | 'loginName': self.user, 29 | 'check': 'on', 30 | 'next': 'null', 31 | 'password': self.make_pwd(self.pwd), 32 | } 33 | 34 | return data 35 | 36 | def login(self): 37 | data = self.make_data() 38 | response = requests.post(self.url, data=data) 39 | data = response.content.decode('utf-8') 40 | print(data) 41 | 42 | 43 | if __name__ == '__main__': 44 | username = input('请输入 易通贷账号') 45 | password = input('请输入 易通贷密码') 46 | ydt = YDT(username, password) 47 | ydt.login() 48 | 49 | 50 | -------------------------------------------------------------------------------- /其他实战/【汽车之家】参数解密/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-08-26 Python: 3.7 4 | 5 | import execjs 6 | 7 | 8 | def main(pwd): 9 | """只解决了pwd的加密,其他请自行拓展 10 | :param pwd: 11 | :return: 12 | """ 13 | with open('execute.js', 'r', encoding='utf-8') as f: 14 | js = execjs.compile(f.read()) 15 | 16 | print('引擎', execjs.get().name) 17 | 18 | sign = js.call('hex_md5', pwd) 19 | return sign 20 | 21 | 22 | if __name__ == '__main__': 23 | pwd = input('请输入你的密码:') 24 | print(main(pwd)) 25 | -------------------------------------------------------------------------------- /其他实战/【满级网】自动登录/auto_login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-08-26 Python: 3.7 4 | 5 | import requests 6 | import base64 7 | from Crypto.Cipher import PKCS1_v1_5 as Cipher_pksc1_v1_5 8 | from Crypto.PublicKey import RSA 9 | 10 | 11 | class YX(object): 12 | """ 13 | 满级网自动登陆 官网 www.manjiwang.com 14 | http://www.manjiwang.com/Logins/BuyerLogin 15 | """ 16 | 17 | def __init__(self, user, pwd): 18 | self.user = user 19 | self.pwd = pwd 20 | self.url = 'http://www.manjiwang.com/Logins/BuyerLogin' 21 | self.headers = { 22 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36', 23 | 'Host': 'www.manjiwang.com', 24 | } 25 | self.public_key = """-----BEGIN PUBLIC KEY----- 26 | MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDC4wHerJc4BSst20Zb07lY9LeZss4OEEhe+SrnLyYy8hGquX/aTQNn+5wnV/+8ierKPgqPGIXPf1ZRww5/6yON+O7dAfJ7BRx85HneIWqwPCZToLck8DN8UXsBuXLMcG7tfMunnnZKenrPsAslN0eKvkYkvz4EPGdvmPwz0NCKXQIDAQAB 27 | -----END PUBLIC KEY----- 28 | """ 29 | 30 | def make_pwd(self): 31 | rsa_key = RSA.importKey(self.public_key) 32 | cipher = Cipher_pksc1_v1_5.new(rsa_key) 33 | cipher_text = base64.b64encode(cipher.encrypt(self.pwd.encode())) 34 | return cipher_text.decode() 35 | 36 | def make_data(self): 37 | data = { 38 | 'account': self.user, 39 | 'password': self.make_pwd(), 40 | 'returnUrl': '/' 41 | } 42 | return data 43 | 44 | def login(self): 45 | """start 46 | """ 47 | data = self.make_data() 48 | response = requests.post(self.url, data=data) 49 | print(response.text) 50 | print(response.cookies) 51 | 52 | 53 | if __name__ == '__main__': 54 | username = input('请输入账号') 55 | password = input('密码') 56 | yx = YX(username, password) 57 | yx.login() 58 | 59 | 60 | 61 | 
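One property of the PKCS#1 v1.5 encryption used just above is worth noting: the padding is randomized, so the same password yields a different Base64 ciphertext on every call, and the server can decrypt any of them with its private key. A minimal sketch, assuming pycryptodome is installed (PUBLIC_KEY is the public key embedded in the script above):

```python
# Sketch only: shows that RSA PKCS#1 v1.5 ciphertexts are non-deterministic.
import base64
from Crypto.Cipher import PKCS1_v1_5
from Crypto.PublicKey import RSA

PUBLIC_KEY = """-----BEGIN PUBLIC KEY-----
MIGfMA0GCSqGSIb3DQEBAQUAA4GNADCBiQKBgQDC4wHerJc4BSst20Zb07lY9LeZss4OEEhe+SrnLyYy8hGquX/aTQNn+5wnV/+8ierKPgqPGIXPf1ZRww5/6yON+O7dAfJ7BRx85HneIWqwPCZToLck8DN8UXsBuXLMcG7tfMunnnZKenrPsAslN0eKvkYkvz4EPGdvmPwz0NCKXQIDAQAB
-----END PUBLIC KEY-----"""

cipher = PKCS1_v1_5.new(RSA.importKey(PUBLIC_KEY))
c1 = base64.b64encode(cipher.encrypt(b'password')).decode()
c2 = base64.b64encode(cipher.encrypt(b'password')).decode()

print(c1 != c2)                    # True: the random padding differs per call
print(len(base64.b64decode(c1)))   # 128 bytes, ciphertext size of a 1024-bit key
```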
-------------------------------------------------------------------------------- /其他实战/【百度】wap端sig生成/make_sig.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2020-01-13 Python: 3.7 4 | 5 | """ 6 | wap端 sig 参数生成 7 | 应水友需求,帮忙弄的 8 | 需要 V8 引擎! 9 | """ 10 | 11 | import execjs 12 | import os 13 | 14 | print(execjs.get().name) 15 | 16 | 17 | with open(os.path.dirname(__file__) + '/v3_update.js') as f: 18 | js = execjs.compile(f.read()) 19 | 20 | 21 | # dv 可固定, 用了一些随机参数生成的。 22 | dv = 'tk0.48553508531670751578885709447.0@mmy0VdnCHg9mlXM-7ZM-tbvB8YHXK3MIEg9WNa8V3x9Cqa5kqgOXcFOjca5BJWOB7eNIzY5k9j8VNKUk0~9F~~5rOiHXvivmzzHjJFMXubOG~W8VRln6~l9k0g9mlXM-7ZM-tbvB8YHXK3MIEg9WH~9V7x9Cql5kqgOXcFOjca5BJWOB7eNIzY5k9-9CRWUq__dy0ov8Cpy5k9j8S~W8Cpz9SlXM-7ZM-tbH-JSMIYaUktanm~F9VEg9WEj8VRgOXcFOjca5BJWOB7eNIzYUk0~9kHg9C9~5kEF8WqW9mlx-vvLwvB87Tr4hByj9G~F5kHyGynvrg~5Vty8CEW8Cqy8C9l8VH~8WEl8CHynkRz8WqK8kt-5Vq_jy~56JeOrJXLIKYOq__Hyr9m~~5k0K9k9g9WHj5k0K9Vqg9Cqy9m~lnCp~5k0K9Vqg9Cqa9q__' 23 | username = '这是测试' # 用户名 24 | s_code = 'ilvw' # 验证码 25 | verifystring = 'jxOb3456654e9d67a5c02ab155fe9012fb44e5b90ae9b01ca02' # 首页返回的 26 | 27 | result = js.call('v3test', dv, s_code, verifystring) 28 | 29 | print(result) -------------------------------------------------------------------------------- /其他实战/【百度】网页找回密码/__pycache__/header.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/其他实战/【百度】网页找回密码/__pycache__/header.cpython-37.pyc -------------------------------------------------------------------------------- /其他实战/【百度】网页找回密码/header.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-12-23 Python: 3.7 4 | 5 | UA = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36' 6 | REFERER = 'https://passport.baidu.com/?getpassindex' 7 | LANGUAGE = 'zh-CN,zh;q=0.9' 8 | CONNECTION = 'keep-alive' 9 | 10 | headers_get_phone = { 11 | 'Connection': CONNECTION, 12 | 'User-Agent': UA, 13 | 'Accept': '*/*', 14 | 'Sec-Fetch-Site': 'same-origin', 15 | 'Sec-Fetch-Mode': 'no-cors', 16 | 'Referer': REFERER, 17 | 'Accept-Language': LANGUAGE 18 | } 19 | 20 | 21 | headers_token = { 22 | "Connection": CONNECTION, 23 | "Content-Lengt": '999', 24 | "Cache-Control": 'max-age=0', 25 | "Origin": "https://passport.baidu.com", 26 | "Upgrade-Insecure-Requests": '1', 27 | "Content-Type": "application/x-www-form-urlencoded", 28 | "User-Agent": UA, 29 | "Sec-Fetch-User": "?1", 30 | "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", 31 | "Sec-Fetch-Site": "same-origin", 32 | "Sec-Fetch-Mode": "navigate", 33 | "Referer": REFERER, 34 | "Accept-Language": LANGUAGE, 35 | } 36 | 37 | headers_img = { 38 | 'Connection': CONNECTION, 39 | 'User-Agent': UA, 40 | 'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8', 41 | 'Sec-Fetch-Site': 'same-origin', 42 | 'Sec-Fetch-Mode': 'no-cors', 43 | 'Referer': REFERER, 44 | 'Accept-Language': LANGUAGE, 45 | } 46 | 47 | headers_bds_token = { 48 | 'Connection': CONNECTION, 49 | 'Upgrade-Insecure-Requests': '1', 50 | 'User-Agent': UA, 51 | 'Accept': 
'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', 52 | 'Sec-Fetch-Site': 'none', 53 | 'Sec-Fetch-Mode': 'navigate', 54 | 'Accept-Language': LANGUAGE, 55 | } 56 | 57 | headers_verify_str = { 58 | 'Connection': CONNECTION, 59 | 'User-Agent': UA, 60 | 'Accept': '*/*', 61 | 'Sec-Fetch-Site': 'same-origin', 62 | 'Sec-Fetch-Mode': 'no-cors', 63 | 'Referer': REFERER, 64 | 'Accept-Language': LANGUAGE, 65 | } 66 | -------------------------------------------------------------------------------- /其他实战/【百度】网页找回密码/验证码.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/其他实战/【百度】网页找回密码/验证码.png -------------------------------------------------------------------------------- /其他实战/【百度】翻译/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-11-06 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【百度】翻译/translate.js: -------------------------------------------------------------------------------- 1 | var i = "320305.131321201" 2 | 3 | 4 | function n(r, o) { 5 | for (var t = 0; t < o.length - 2; t += 3) { 6 | var e = o.charAt(t + 2); 7 | e = e >= "a" ? e.charCodeAt(0) - 87 : Number(e), 8 | e = "+" === o.charAt(t + 1) ? r >>> e : r << e, 9 | r = "+" === o.charAt(t) ? r + e & 4294967295 : r ^ e 10 | } 11 | return r 12 | } 13 | 14 | function a(r) { 15 | var t = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g); 16 | if (null === t) { 17 | var a = r.length; 18 | a > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(a / 2) - 5, 10) + r.substr(-10, 10)) 19 | } else { 20 | for (var C = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), h = 0, f = C.length, u = []; f > h; h++) 21 | "" !== C[h] && u.push.apply(u, e(C[h].split(""))), 22 | h !== f - 1 && u.push(t[h]); 23 | var g = u.length; 24 | g > 30 && (r = u.slice(0, 10).join("") + u.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + u.slice(-10).join("")) 25 | } 26 | var l = void 0 27 | , d = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107); 28 | l = null !== i ? i : (i = o.common[d] || "") || ""; 29 | for (var m = l.split("."), S = Number(m[0]) || 0, s = Number(m[1]) || 0, c = [], v = 0, F = 0; F < r.length; F++) { 30 | var p = r.charCodeAt(F); 31 | 128 > p ? c[v++] = p : (2048 > p ? c[v++] = p >> 6 | 192 : (55296 === (64512 & p) && F + 1 < r.length && 56320 === (64512 & r.charCodeAt(F + 1)) ? (p = 65536 + ((1023 & p) << 10) + (1023 & r.charCodeAt(++F)), 32 | c[v++] = p >> 18 | 240, 33 | c[v++] = p >> 12 & 63 | 128) : c[v++] = p >> 12 | 224, 34 | c[v++] = p >> 6 & 63 | 128), 35 | c[v++] = 63 & p | 128) 36 | } 37 | for (var w = S, A = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), b = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), D = 0; D < c.length; D++) 38 | w += c[D], 39 | w = n(w, A); 40 | return w = n(w, b), 41 | w ^= s, 42 | 0 > w && (w = (2147483647 & w) + 2147483648), 43 | w %= 1e6, 44 | w.toString() + "." 
+ (w ^ S) 45 | } -------------------------------------------------------------------------------- /其他实战/【百度】翻译/translation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-04-26 Python: 3.7 4 | 5 | # 本代码参考 github作者:CriseLYJ 6 | 7 | import requests 8 | import js2py 9 | 10 | 11 | class FanYiSpider(object): 12 | """ 13 | 翻译 14 | """ 15 | context = js2py.EvalJs() # python中使用js 16 | 17 | def __init__(self, query): 18 | # 初始化 19 | self.url = "https://fanyi.baidu.com/basetrans" 20 | self.query = query 21 | self.headers = { 22 | "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Mobile Safari/537.36", 23 | "Referer": "https://fanyi.baidu.com/", 24 | "Cookie": "BAIDUID=714BFAAF02DA927F583935C7A354949A:FG=1; BIDUPSID=714BFAAF02DA927F583935C7A354949A; PSTM=1553390486; delPer=0; PSINO=5; H_PS_PSSID=28742_1463_21125_18559_28723_28557_28697_28585_28640_28604_28626_22160; locale=zh; from_lang_often=%5B%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%2C%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%5D; to_lang_often=%5B%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%2C%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%5D; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; Hm_lvt_afd111fa62852d1f37001d1f980b6800=1553658863,1553766321,1553769980,1553770442; Hm_lpvt_afd111fa62852d1f37001d1f980b6800=1553770442; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1553766258,1553766321,1553769980,1553770442; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1553770442", 25 | "Content-Type": "application/x-www-form-urlencoded", 26 | "Origin": "https://fanyi.baidu.com", 27 | "X-Requested-With": "XMLHttpRequest", 28 | } 29 | 30 | def make_sign(self): 31 | with open("translate.js", "r", encoding="utf-8") as f: 32 | self.context.execute(f.read()) 33 | 34 | sign = self.context.a(self.query) 35 | return sign 36 | 37 | def make_data(self, sign): 38 | data = { 39 | "query": self.query, 40 | "from": "en", 41 | "to": "zh", 42 | "token": "6f5c83b84d69ad3633abdf18abcb030d", 43 | "sign": sign 44 | } 45 | return data 46 | 47 | def get_content(self, data): 48 | response = requests.post( 49 | url=self.url, 50 | headers=self.headers, 51 | data=data 52 | ) 53 | return response.json()["trans"][0]["dst"] 54 | 55 | @property 56 | def run(self): 57 | sign = self.make_sign() # 获取sign的值 58 | data = self.make_data(sign) # 构建参数 59 | content = self.get_content(data) # 获取翻译内容 60 | return content 61 | 62 | 63 | if __name__ == '__main__': 64 | key = input("输入翻译内容:") 65 | translate = FanYiSpider(key) 66 | print(translate.run) 67 | -------------------------------------------------------------------------------- /其他实战/【百度】自动登录/README.md: -------------------------------------------------------------------------------- 1 | # 解密过程参考博客 2 | 3 | [博客链接](https://www.zhangkunzhi.com/?p=216) -------------------------------------------------------------------------------- /其他实战/【百度】自动登录/login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-08-05 Python: 3.7 4 | 5 | """ 6 | 百度登陆参数比较多 7 | 8 | 这里是密码加密生成器 9 | """ 10 | 11 | import js2py 12 | 13 | 14 | class PingDuoDuoSpider(object): 15 | """ 16 | 生成百度登陆密码加密结果 17 | """ 18 | context = js2py.EvalJs() # 
python中使用js 19 | 20 | def __init__(self): 21 | # 初始化 22 | with open("encryp.js", "r", encoding="utf-8") as f: 23 | self.context.execute(f.read()) 24 | 25 | def make(self, password): 26 | pwd = self.context.test(password) 27 | print(pwd) # 打印加密之后的密码 28 | 29 | 30 | if __name__ == '__main__': 31 | pdd = PingDuoDuoSpider() 32 | 33 | key = input("输入密码") 34 | pdd.make(key) 35 | -------------------------------------------------------------------------------- /其他实战/【百度街拍】图片下载/get_image.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-08-05 Python: 3.7 4 | 5 | import requests, time 6 | from urllib.parse import urlencode 7 | from urllib.request import urlretrieve 8 | 9 | 10 | def getPage(offset): 11 | '''获取网页信息''' 12 | data = { 13 | 'tn': 'resultjson_com', 14 | 'ipn': 'rj', 15 | 'ct': '201326592', 16 | 'is': '', 17 | 'fp': 'result', 18 | 'queryWord': '街拍', 19 | 'cl': '2', 20 | 'lm': '-1', 21 | 'ie': 'utf - 8', 22 | 'oe': 'utf - 8', 23 | 'adpicid': '', 24 | 'st': '-1', 25 | 'z': '', 26 | 'ic': '0', 27 | 'hd': '', 28 | 'latest': '', 29 | 'copyright': '', 30 | 'word': '街拍', 31 | 's': '', 32 | 'se': '', 33 | 'tab': '', 34 | 'width': '', 35 | 'height': '', 36 | 'face': '0', 37 | 'istype': '2', 38 | 'qc': '', 39 | 'nc': '1', 40 | 'fr': '', 41 | 'expermode': '', 42 | 'force': '', 43 | 'pn': offset, 44 | 'rn': '30', 45 | 'gsm': '1e', 46 | '1551789143500': '', 47 | } 48 | headers = { 49 | 'Accept': 'text/plain, */*; q=0.01', 50 | 'Accept-Encoding': 'deflate, br', 51 | 'Accept-Language': 'Accept-Language', 52 | 'Connection': 'keep-alive', 53 | 'Cookie': 'BDqhfp=%E8%A1%97%E6%8B%8D%26%260-10-1undefined%26%260%26%261; BIDUPSID=7CA5F033CA22949F5FB6110DBC5DC1EE; BAIDUID=6DDE5BAA44763FD6C7CA84401CB19F36:FG=1; indexPageSugList=%5B%22%E8%A1%97%E6%8B%8D%22%5D; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; uploadTime=1551768107224; userFrom=null; BDRCVFR[X_XKQks0S63]=mk3SLVN4HKm; firstShowTip=1; cleanHistoryStatus=0', 54 | 'Host': 'image.baidu.com', 55 | 'Referer': 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E8%A1%97%E6%8B%8D&oq=%E8%A1%97%E6%8B%8D&rsp=-1', 56 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6735.400 QQBrowser/10.2.2328.400', 57 | 'X-Requested-With': 'XMLHttpRequest', 58 | } 59 | url = 'https://image.baidu.com/search/acjson?' 
+ urlencode(data) 60 | try: 61 | res = requests.get(url, data=data, headers=headers) 62 | res.encoding = 'utf-8' # 网页信息编码 63 | if res.status_code == 200: 64 | return res.json() 65 | except requests.ConnectionError: 66 | return None 67 | 68 | 69 | def getImage(json): 70 | '''解析网页数据并爬取所需的信息''' 71 | try: 72 | data = json.get('data') 73 | if data: 74 | for item in data: 75 | yield { 76 | 'image': item.get('hoverURL'), 77 | 'title': item.get('fromPageTitleEnc'), 78 | } 79 | except: 80 | return None 81 | 82 | 83 | def saveImage(item): 84 | '''把获取的图片与标题封装并存储''' 85 | try: 86 | m = item.get('title') 87 | local_image = item.get('image') # 获取图片的url 88 | image_url = local_image 89 | urlretrieve(image_url, './pic/' + str(m) + '.jpg') 90 | # print('p'+str(m) + '.jpg') 91 | except: 92 | return None 93 | 94 | 95 | def main(offset): 96 | '''调度爬取函数和存储''' 97 | json = getPage(offset) 98 | for item in getImage(json): 99 | print(item) 100 | saveImage(item) 101 | 102 | 103 | if __name__ == '__main__': 104 | for i in range(5): # 此处循环遍历五次是不可行的 每次data值中的gsm在变化 105 | main(offset=i * 30) 106 | time.sleep(1) 107 | -------------------------------------------------------------------------------- /其他实战/【移动】登录参数生成/MakeParam.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-05 Python: 3.7 4 | 5 | import execjs.runtime_names 6 | 7 | 8 | class MakeParam: 9 | """ 10 | 移动登陆 11 | 加密参数生成器 12 | 页面 https://mail.10086.cn/ 13 | """ 14 | 15 | def __init__(self, name, pwd): 16 | self.name = name 17 | self.pwd = pwd 18 | self.js = None 19 | self.init_js() 20 | 21 | def init_js(self): 22 | print('引擎', execjs.get().name) 23 | with open("encryp.js", "r", encoding="utf-8") as f: 24 | self.js = execjs.compile(f.read()) 25 | 26 | def mk_params(self): 27 | cguid = self.js.call("customerGetCGUID") 28 | _ = self.js.call('sha1', self.name) 29 | word = self.js.call('calcDigest', self.pwd) 30 | msg = """ 31 | cguid: {cguid} 32 | _: {_} 33 | password: {word} 34 | """ 35 | print(msg.format(cguid=cguid, _=_, word=word)) 36 | 37 | 38 | if __name__ == '__main__': 39 | username = input('输入用户名') 40 | password = input('输入密码') 41 | yd = MakeParam(username, password) 42 | yd.mk_params() 43 | -------------------------------------------------------------------------------- /其他实战/【移动】登录参数生成/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-06 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【移动】登录参数生成/encryp.js: -------------------------------------------------------------------------------- 1 | function sha1(a) { 2 | function b(a, b) { 3 | var c = (a & 65535) + (b & 65535); 4 | return (a >> 16) + (b >> 16) + (c >> 16) << 16 | c & 65535 5 | } 6 | 7 | for (var c = [], d = 0; d < 8 * a.length; d += 8) 8 | c[d >> 5] |= (a.charCodeAt(d / 8) & 255) << 24 - d % 32; 9 | a = 8 * a.length; 10 | c[a >> 5] |= 128 << 24 - a % 32; 11 | c[(a + 64 >> 9 << 4) + 15] = a; 12 | a = Array(80); 13 | for (var d = 1732584193, e = -271733879, f = -1732584194, g = 271733878, k = -1009589776, h = 0; h < c.length; h += 14 | 16) { 15 | for (var l = d, m = e, n = f, p = g, q = k, j = 0; 80 > j; j++) { 16 | a[j] = 16 > j ? 
c[h + j] : (a[j - 3] ^ a[j - 8] ^ a[j - 14] ^ a[j - 16]) << 1 | (a[j - 3] ^ a[j - 8] ^ a[j - 14] ^ 17 | a[j - 16]) >>> 31; 18 | var r = b(b(d << 5 | d >>> 27, 20 > j ? e & f | ~e & g : 40 > j ? e ^ f ^ g : 60 > j ? e & f | e & g | f & g : e ^ 19 | f ^ g), b(b(k, a[j]), 20 > j ? 1518500249 : 40 > j ? 1859775393 : 60 > j ? -1894007588 : -899497514)), 20 | k = g, 21 | g = f, 22 | f = e << 30 | e >>> 2, 23 | e = d, 24 | d = r 25 | } 26 | d = b(d, l); 27 | e = b(e, m); 28 | f = b(f, n); 29 | g = b(g, p); 30 | k = b(k, q) 31 | } 32 | c = [d, e, f, g, k]; 33 | a = ""; 34 | for (d = 0; d < 4 * c.length; d++) 35 | a += "0123456789abcdef".charAt(c[d >> 2] >> 8 * (3 - d % 4) + 4 & 15) + "0123456789abcdef".charAt(c[d >> 2] >> 8 * 36 | (3 - d % 4) & 15); 37 | return a 38 | } 39 | 40 | 41 | function a(a, c) { 42 | var d = (a & 65535) + (c & 65535); 43 | return (a >> 16) + (c >> 16) + (d >> 16) << 16 | d & 65535 44 | } 45 | 46 | calcDigest = function (b) { 47 | for (var c = (b.length + 8 >> 6) + 1, d = Array(16 * c), e = 0; e < 16 * c; e++) 48 | d[e] = 0; 49 | for (e = 0; e < b.length; e++) 50 | d[e >> 2] |= b.charCodeAt(e) << 24 - 8 * (e & 3); 51 | d[e >> 2] |= 128 << 24 - 8 * (e & 3); 52 | d[16 * c - 1] = 8 * b.length; 53 | b = Array(80); 54 | for (var c = 1732584193, e = -271733879, f = -1732584194, g = 271733878, k = -1009589776, h = 0; h < d.length; h += 55 | 16) { 56 | for (var l = c, m = e, n = f, p = g, q = k, j = 0; 80 > j; j++) { 57 | b[j] = 16 > j ? d[h + j] : (b[j - 3] ^ b[j - 8] ^ b[j - 14] ^ b[j - 16]) << 1 | (b[j - 3] ^ b[j - 8] ^ b[j - 14] ^ 58 | b[j - 16]) >>> 31; 59 | var r = a(a(c << 5 | c >>> 27, 20 > j ? e & f | ~e & g : 40 > j ? e ^ f ^ g : 60 > j ? e & f | e & g | f & g : e ^ 60 | f ^ g), a(a(k, b[j]), 20 > j ? 1518500249 : 40 > j ? 1859775393 : 60 > j ? 
-1894007588 : -899497514)), 61 | k = g, 62 | g = f, 63 | f = e << 30 | e >>> 2, 64 | e = c, 65 | c = r 66 | } 67 | c = a(c, l); 68 | e = a(e, m); 69 | f = a(f, n); 70 | g = a(g, p); 71 | k = a(k, q) 72 | } 73 | d = [c, e, f, g, k]; 74 | b = ""; 75 | for (c = 0; c < 4 * d.length; c++) 76 | b += "0123456789abcdef".charAt(d[c >> 2] >> 8 * (3 - c % 4) + 4 & 15) + "0123456789abcdef".charAt(d[c >> 2] >> 8 * 77 | (3 - c % 4) & 15); 78 | return b 79 | } 80 | 81 | 82 | function customerGetCGUID() { 83 | function a(a, b) { 84 | var e = (b || 2) - (1 + Math.floor(Math.log(a | 1) / Math.LN10 + 1E-15)); 85 | return Array(e + 1).join("0") + a 86 | } 87 | 88 | var b = new Date; 89 | return "" + a(b.getHours()) + a(b.getMinutes()) + a(b.getSeconds()) + a(b.getMilliseconds(), 3) + a(Math.ceil(9999 * 90 | Math.random()), 4) 91 | } -------------------------------------------------------------------------------- /其他实战/【移动】登录参数生成/make_params.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/其他实战/【移动】登录参数生成/make_params.png -------------------------------------------------------------------------------- /其他实战/【空中网】自动登录/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-06 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【空中网】自动登录/encryp.js: -------------------------------------------------------------------------------- 1 | function mk_pwd (str, pwd) { 2 | if (pwd == null || pwd.length <= 0) { 3 | return null 4 | } 5 | ;var prand = ""; 6 | for (var i = 0; i < pwd.length; i++) { 7 | prand += pwd.charCodeAt(i).toString() 8 | } 9 | ;var sPos = Math.floor(prand.length / 5); 10 | var mult = parseInt(prand.charAt(sPos) + prand.charAt(sPos * 2) + prand.charAt(sPos * 3) + prand.charAt(sPos * 4) + prand.charAt(sPos * 5)); 11 | var incr = Math.ceil(pwd.length / 2); 12 | var modu = Math.pow(2, 31) - 1; 13 | if (mult < 2) { 14 | return null 15 | } 16 | ;var salt = Math.round(Math.random() * 1000000000) % 100000000; 17 | prand += salt; 18 | while (prand.length > 10) { 19 | var a = prand.substring(0, 1); 20 | var b = prand.substring(10, prand.length); 21 | if (b.length > 10) { 22 | prand = b 23 | } else { 24 | prand = (parseInt(a) + parseInt(b)).toString() 25 | } 26 | } 27 | ;prand = (mult * prand + incr) % modu; 28 | var enc_chr = ""; 29 | var enc_str = ""; 30 | for (var i = 0; i < str.length; i++) { 31 | enc_chr = parseInt(str.charCodeAt(i) ^ Math.floor((prand / modu) * 255)); 32 | if (enc_chr < 16) { 33 | enc_str += "0" + enc_chr.toString(16) 34 | } else 35 | enc_str += enc_chr.toString(16); 36 | prand = (mult * prand + incr) % modu 37 | } 38 | ;salt = salt.toString(16); 39 | while (salt.length < 8) 40 | salt = "0" + salt; 41 | enc_str += salt; 42 | return enc_str 43 | } -------------------------------------------------------------------------------- /其他实战/【空中网】自动登录/spider_login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-03 Python: 3.7 4 | import re 5 | import time 6 | import requests 7 | import execjs.runtime_names 8 | 9 | 10 | class SpiderLogin: 11 | """ 12 | 空中网爬虫登陆 13 | """ 14 | 15 | def __init__(self, user, pwd): 16 | self.session = requests.session() 17 | self.user = 
user 18 | self.pwd = pwd 19 | self.login_time = int(round(time.time() * 1000)) 20 | self.url = 'https://m.wcbchina.com/login/other-login.html' 21 | self.headers = { 22 | 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1', 23 | 'Host': 'sso.kongzhong.com', 24 | 'Referer': 'https://passport.kongzhong.com/login' 25 | } 26 | 27 | def use_js(self, dc): 28 | """js 调用 29 | """ 30 | with open("encryp.js", "r", encoding="utf-8") as f: 31 | js = execjs.compile(f.read()) 32 | try: 33 | pwd = js.call("mk_pwd", self.pwd, dc) 34 | return pwd 35 | except Exception: 36 | print('js 异常') 37 | 38 | def auto_login(self): 39 | """登陆 40 | """ 41 | login_url = 'https://sso.kongzhong.com/ajaxLogin?j=j&&type=1&service=https://passport.kongzhong.com/&username={username}&password={password}&vcode=&toSave=0&_={_time}' 42 | dc = self.get_dc() 43 | en_pwd = self.use_js(dc) 44 | response = self.session.get(login_url.format(username=self.user, password=en_pwd, _time=self.login_time), headers=self.headers) 45 | print(response.cookies) 46 | print(response.text) 47 | print(response) 48 | 49 | def get_dc(self): 50 | """捕获 dc 参数 51 | """ 52 | target = 'https://sso.kongzhong.com/ajaxLogin?j=j&jsonp=j&service=https://passport.kongzhong.com/&_={t}'.format( 53 | t=self.login_time) 54 | response = self.session.get(target, headers=self.headers) 55 | try: 56 | dc = re.search(r'"dc":"(.*?)","kzmsg', response.text).group(1) 57 | return dc 58 | except AttributeError: 59 | print('dc 捕获失败') 60 | 61 | 62 | if __name__ == '__main__': 63 | username = input('请输入账号') 64 | password = input('密码') 65 | kzw = SpiderLogin(username, password) 66 | kzw.auto_login() 67 | -------------------------------------------------------------------------------- /其他实战/【美团】数据解析、token生成/README.md: -------------------------------------------------------------------------------- 1 | ``` 2 | . 
3 | └── MeiTuan // -------美团------- 4 | ├── get_login_cookies.py // 基于pyppeteer登陆并获取cookies 5 | ├── parse_play_areas.py // 三级区域解析器(休闲板块) 6 | ├── parse_play_info.py // 休闲会所商铺数据解析 7 | ├── parse_hotel_info.py // 酒店基础数据解析 8 | ├── parse_hotel_comments.py // 酒店评论解析 9 | ├── create_food_token.py // 餐饮页Token生成器 10 | ├── parse_food_comments.py // 获取用户评论数据 11 |   └── parse_food_info.py // 解析餐馆数据 12 | 13 | ``` 14 | -------------------------------------------------------------------------------- /其他实战/【美团】数据解析、token生成/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-06-18 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【美团】数据解析、token生成/create_food_token.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-04-21 Python: 3.7 4 | 5 | import json, zlib, base64, time 6 | 7 | 8 | class MakeToken(): 9 | """ 10 | 测试2019-4-21日可用 11 | 仅作为学术交流!如有侵权,联系作者删除 12 | 美团【餐馆列表】Token生成 13 | """ 14 | 15 | def __init__(self, areaId, cityName, originUrl, page): 16 | self.areaId = areaId 17 | self.cityName = cityName 18 | self.originUrl = originUrl 19 | self.page = page 20 | self.uuid = 'c6eada3ffd8e444491e9.1555472928.3.0.0' # Demo 21 | 22 | def join_sign(self): 23 | # 参数 24 | sign = 'areaId={areaId}&cateId=0&cityName={cityName}&dinnerCountAttrId=&optimusCode=1&originUrl={originUrl}&page={page}&partner=126&platform=1&riskLevel=1&sort=&userId=&uuid={uuid}' 25 | _str = sign.format(areaId=self.areaId, cityName=self.cityName, originUrl=self.originUrl, page=self.page, 26 | uuid=self.uuid) 27 | sign = base64.b64encode(zlib.compress(bytes(json.dumps(_str, ensure_ascii=False), encoding="utf8"))) 28 | sign = str(sign, encoding="utf8") 29 | return sign 30 | 31 | @property 32 | def join_token(self): 33 | str_json = {} 34 | str_json['rId'] = 100900 35 | str_json['ver'] = '1.0.6' 36 | str_json['ts'] = time.time() 37 | str_json['cts'] = time.time() + 110 38 | str_json['brVD'] = [1920, 315] 39 | str_json['brR'] = [[1920, 1080], [1920, 1057], 24, 24] 40 | str_json['bI'] = [self.originUrl, ""] 41 | str_json['mT'] = [] 42 | str_json['kT'] = [] 43 | str_json['aT'] = [] 44 | str_json['tT'] = [] 45 | str_json['aM'] = '' 46 | str_json['sign'] = self.join_sign() 47 | token_decode = zlib.compress( 48 | bytes(json.dumps(str_json, separators=(',', ':'), ensure_ascii=False), encoding="utf8")) 49 | token = str(base64.b64encode(token_decode), encoding="utf8") 50 | return token 51 | 52 | 53 | if __name__ == '__main__': 54 | # 测试数据 55 | areaId = '4581' 56 | cityName = '重庆' 57 | originUrl = 'http://cq.meituan.com/meishi/b4581/' 58 | page = '1' 59 | 60 | token = MakeToken(areaId, cityName, originUrl, page) 61 | print(token.join_token) 62 | -------------------------------------------------------------------------------- /其他实战/【美团】数据解析、token生成/get_login_cookies.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-05-21 Python: 3.7 4 | import asyncio 5 | import json 6 | 7 | from pyppeteer import launch 8 | 9 | 10 | class MeiTuanCookies(): 11 | def __init__(self, username, password): 12 | self.login_url = 'https://passport.meituan.com/account/unitivelogin' 13 | self.username = username 14 | self.password = password 15 | 16 | async def 
start(self): 17 |         browser = await launch() 18 |         context = await browser.createIncognitoBrowserContext() 19 |         page = await context.newPage() 20 |         await page.evaluateOnNewDocument('() =>{ Object.defineProperties(navigator,' 21 |                                          '{ webdriver:{ get: () => false } }) }')  # 本页刷新后值不变 22 | 23 |         await page.goto(self.login_url) 24 |         await page.type('input#login-email', self.username) 25 |         await page.type('input#login-password', self.password) 26 |         await page.click('input.btn') 27 |         await self.get_cookie(page) 28 | 29 |     async def get_cookie(self, page): 30 |         """ 31 |         获取 cookies 32 |         :param page: 页面 33 |         :return: 34 |         """ 35 |         cookies_list = await page.cookies() 36 |         cookies = '' 37 |         for cookie in cookies_list: 38 |             str_cookie = '{0}={1};' 39 |             str_cookie = str_cookie.format(cookie.get('name'), cookie.get('value')) 40 |             cookies += str_cookie 41 |         print(cookies) 42 | 43 | 44 | if __name__ == '__main__': 45 |     name = input('美团账号') 46 |     pwd = input('密码') 47 |     mt = MeiTuanCookies(name, pwd) 48 |     loop = asyncio.get_event_loop() 49 |     loop.run_until_complete(mt.start()) 50 | -------------------------------------------------------------------------------- /其他实战/【美团】数据解析、token生成/parse_food_comments.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-04-17 Python: 3.7 4 | 5 | import requests 6 | import json 7 | import time 8 | 9 | from urllib import parse 10 | 11 | 12 | class ParseComments(object): 13 |     def __init__(self, shop_id): 14 |         self.shop_id = shop_id 15 | 16 |         self.get_data() 17 | 18 |     def get_data(self): 19 |         url_code = self.get_originUrl() 20 | 21 |         url = 'http://www.meituan.com/meishi/api/poi/getMerchantComment?' 22 |         params = { 23 |             'platform': '1', 24 |             'partner': '126', 25 |             'originUrl': url_code, 26 |             'riskLevel': '1', 27 |             'optimusCode': '1', 28 |             'id': self.shop_id, 29 |             'offset': '0', 30 |             'pageSize': '10', 31 |             'sortType': '1', 32 |         } 33 |         headers = { 34 |             'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36', 35 |         } 36 |         response = requests.get(url=url, params=params, headers=headers) 37 |         data = response.text 38 |         self.parse(data) 39 | 40 |     def get_originUrl(self): 41 |         """编码解码 42 |         """ 43 |         return parse.quote_plus('http://www.meituan.com/meishi/' + self.shop_id + '/') 44 | 45 |     def parse(self, data): 46 |         """解析数据 47 |         """ 48 |         data_dict = json.loads(data) 49 |         for item in data_dict.get('data').get('comments'): 50 |             create_time = self.parse_time(item.get('commentTime')) 51 |             print_str = """ 52 |             评论用户:{userName} 53 |             评论时间:{create_time} 54 |             评论详情:{comment} 55 |             评论id:{reviewId} 56 |             """.format(userName=item.get('userName'), comment=item.get('comment'), create_time=create_time, 57 |                        reviewId=item.get('reviewId')) 58 |             print(print_str) 59 | 60 |     @staticmethod 61 |     def parse_time(timeStamp): 62 |         """13位 解码时间 63 |         """ 64 |         time_stamp = float(int(timeStamp) / 1000) 65 |         time_array = time.localtime(time_stamp) 66 |         return time.strftime("%Y-%m-%d %H:%M:%S", time_array) 67 | 68 | 69 | if __name__ == '__main__': 70 |     p_id = input('请输入餐馆id') 71 |     ParseComments(p_id) 72 | -------------------------------------------------------------------------------- /其他实战/【美团】数据解析、token生成/parse_hotel_comments.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-06-10 Python: 3.7 4 | 5 | """ 6 | 解析酒店评论 7
| """ 8 | 9 | import requests 10 | import json 11 | import time 12 | 13 | 14 | class ParseComments(object): 15 | """解析酒店评论 16 | """ 17 | def __init__(self, hotel_id): 18 | self.hotel_id = hotel_id 19 | self.get_data() 20 | 21 | def get_data(self): 22 | 23 | url = 'https://ihotel.meituan.com/group/v1/poi/comment/' + self.hotel_id + '?' 24 | params = { 25 | 'sortType': 'default', 26 | 'noempty': '1', 27 | 'withpic': '0', 28 | 'filter': 'all', 29 | 'limit': '10', 30 | 'offset': '0', 31 | } 32 | headers = { 33 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36', 34 | } 35 | response = requests.get(url=url, params=params, headers=headers) 36 | data = response.text 37 | self.parse(data) 38 | 39 | def parse(self, data): 40 | """解析数据 41 | """ 42 | data_dict = json.loads(data) 43 | for item in data_dict.get('data').get('feedback'): 44 | create_time = self.parse_time(item.get('replytimestamp')) 45 | print_str = """ 46 | 评论用户:{userName} 47 | 评论时间:{create_time} 48 | 评论详情:{comment} 49 | 满意度:{scoretext} 50 | """.format(userName=item.get('username'), comment=item.get('comment'), create_time=create_time, 51 | scoretext=item.get('scoretext')) 52 | print(print_str) 53 | self.parse_pic(item) 54 | 55 | @staticmethod 56 | def parse_time(timeStamp): 57 | """13位 解码时间 58 | """ 59 | time_array = time.localtime(timeStamp) 60 | return time.strftime("%Y-%m-%d %H:%M:%S", time_array) 61 | 62 | def parse_pic(self, item): 63 | pic_list = [i.get('url').replace('w.h', '750.0') for i in item.get('picinfo')] 64 | print(pic_list) 65 | 66 | 67 | if __name__ == '__main__': 68 | p_id = input('请输入酒店id') 69 | ParseComments(p_id) 70 | -------------------------------------------------------------------------------- /其他实战/【美团】数据解析、token生成/parse_hotel_info.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-06-05 Python: 3.7 4 | 5 | """ 6 | 解析 7 | 美团酒店店铺的基础信息 8 | 该板块信息隐藏在get请求后的js中直接用正则匹配出信息再抽取出来 9 | """ 10 | import requests 11 | import re 12 | import json 13 | import time 14 | 15 | 16 | class ParseHotelInfo(object): 17 | headers = { 18 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36', 19 | } 20 | 21 | def __init__(self, p_id): 22 | self.p_id = p_id 23 | 24 | def go_to_hotel(self): 25 | """执行访问 26 | """ 27 | # 拼接日期 28 | now_day = time.strftime('%Y-%m-%d', time.localtime(time.time())) 29 | 30 | # 组合 get 地址 31 | url = 'https://hotel.meituan.com/' + self.p_id + '/?ci=' + now_day + '&co=' + now_day 32 | data = requests.get(url, headers=self.headers).content.decode('utf-8') 33 | 34 | # 提取有效区域 35 | info = re.search(r'window.__INITIAL_STATE__=(.*?)', data, flags=re.DOTALL) 36 | if info: 37 | info_dict = json.loads(info.group(1).strip()[:-1]) 38 | self.parse_html(info_dict) 39 | else: 40 | print('访问失效') 41 | 42 | def parse_html(self, data_dict): 43 | data = data_dict.get('poiData') 44 | print('店名', data.get('name')) 45 | print('店铺id', data.get('poiid')) 46 | print('城市id', data.get('cityId')) 47 | print('地址', data.get('addr')) 48 | print('lng', data.get('lng')) 49 | print('lat', data.get('lat')) 50 | print('封面', data.get('frontImg').replace('w.h', '750.0')) 51 | print('wifi', data.get('wifi')) 52 | print('地区id', data.get('areaId')) 53 | print('地区名', data.get('areaName')) 54 | print('平均消费', data.get('avgPrice')) 55 | print('类别id', data.get('brandId')) 
56 | print('类别名', data.get('brandName')) 57 | print('简介', data.get('introduction')) 58 | print('星级', data.get('highHotelStar')) 59 | print('舒适类型', data.get('hotelStar')) 60 | print('电话', [i.get('phone') for i in data.get('phoneList')]) 61 | print('平均分', data.get('avgScore')) 62 | print('标签', data.get('poiAttrTagList')) 63 | print('城市名', data.get('cityName')) 64 | print('城市拼音', data.get('cityPinyin')) 65 | 66 | poi_data = data_dict.get('poiExt') # 酒店详情 67 | print('服务', [i.get('attrDesc') for i in poi_data.get('serviceIconsInfo').get('serviceIcons')]) 68 | print('酒店介绍', {i.get('attrDesc'): i.get('attrValue') for i in poi_data.get('hotelIntroInfo').get('poiExtendsInfos')}) 69 | 70 | 71 | if __name__ == '__main__': 72 | print("""\033[1;33m请输入酒店ID \033[0m""") 73 | _id = input('(链接末尾数字就是ID)') 74 | # _id = '41823880' # 测试 75 | hotel = ParseHotelInfo(_id) 76 | hotel.go_to_hotel() 77 | -------------------------------------------------------------------------------- /其他实战/【美团】数据解析、token生成/parse_play_areas.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-05-05 Python: 3.7 4 | 5 | import requests 6 | import json 7 | import re 8 | from pypinyin import pinyin 9 | 10 | 11 | class ParseAreas(object): 12 | 13 | def __init__(self, city_name): 14 | self.alphabet = "".join([i[0][0] for i in pinyin(city_name)]) 15 | 16 | self.get_data() 17 | 18 | def get_data(self): 19 | 20 | url = 'https://{city}.meituan.com/xiuxianyule/' 21 | headers = { 22 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36', 23 | } 24 | target_url = url.format(city=self.alphabet) 25 | response = requests.get(target_url, headers=headers) 26 | data = response.text 27 | self.parse(data, target_url) 28 | 29 | @ staticmethod 30 | def parse(data, url): 31 | """解析数据 32 | """ 33 | py_dict = {} 34 | text = re.search(r'"city":{"id":(.*?),"name":"(.*?)","pinyin".*?"area":(.*?),"category":', data) 35 | if text: 36 | py_dict = {'城市': text.group(2), '城市ID': text.group(1)} 37 | dict_info = json.loads(text.group(3)).get('children') # 提取区域信息 38 | py_dict['区'] = [] 39 | 40 | for node in dict_info: 41 | if node.get('name') == '推荐商圈': 42 | continue # 推荐商圈过滤 43 | # 二级区域 44 | district = {'区名': node.get('name'), '区ID': node.get('id'), 45 | '区链接': url + 'b' + str(node.get('id')) + '/'} 46 | if node.get('children'): 47 | district['街道'] = [] 48 | # 三级区域 49 | for i in node.get('children'): 50 | area = {'街道名': i.get('name'), '街道ID': i.get('id'), 51 | '街道链接': url + 'b' + str(i.get('id')) + '/'} 52 | district['街道'].append(area) 53 | 54 | py_dict['区'].append(district) 55 | 56 | print(json.dumps(py_dict, ensure_ascii=False)) 57 | 58 | 59 | if __name__ == '__main__': 60 | print(""" 61 | \033[1;33m娱乐板块区域解析 62 | 请输入城市名例如 北京 63 | 返回json格式\033[0m 64 | """) 65 | chines = input('输入城市名') 66 | ParseAreas(chines) 67 | -------------------------------------------------------------------------------- /其他实战/【美团】数据解析、token生成/parse_play_info.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-05-08 Python: 3.7 4 | 5 | """ 6 | 解析 7 | 美团休闲娱乐商铺信息 8 | 该板块信息隐藏在get请求后的js中直接用正则匹配出信息再抽取出来 9 | """ 10 | import requests 11 | import re 12 | import json 13 | 14 | 15 | class ParsePlayInfo(object): 16 | target_url = 'http://www.meituan.com/xiuxianyule/{p_id}/' 17 | headers = { 
18 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36', 19 | } 20 | 21 | def __init__(self, restaurant_id): 22 | self.restaurant_id = str(restaurant_id) 23 | 24 | self.go_to_restaurant() 25 | 26 | def go_to_restaurant(self): 27 | """执行访问 28 | """ 29 | url = self.target_url.format(p_id=self.restaurant_id) 30 | data = requests.get(url, headers=self.headers).text 31 | 32 | # 提取有效区域 33 | data = re.search(r'"params":{"poiInfo":(.*?)},"fallbackPara', data, flags=re.DOTALL) 34 | if data: 35 | self.parse_html(json.loads(data.group(1))) 36 | else: 37 | print('访问失效') 38 | 39 | def parse_html(self, data): 40 | print('商铺ID', self.restaurant_id) 41 | print('城市ID', data.get('catId')) 42 | print('城市', data.get('cityName')) 43 | print('城市拼音', data.get('cityPy')) 44 | print('店铺', data.get('shopName')) 45 | print('评分', data.get('score')) 46 | print('平均消费', data.get('avgPrice')) 47 | print('地址', data.get('address')) 48 | print('电话', data.get('phone')) 49 | print('营业时间', data.get('openTime')) 50 | print('封面图片', data.get('headIcon')) 51 | print('wifi', data.get('wifi')) # 有=1 无=0 52 | print('停车', data.get('park')) # 如果有例如:免费提供5个停车位。 没有为空 53 | print('经度', data.get('lng')) 54 | print('纬度', data.get('lat')) 55 | print('类型', data.get('breadCrumbNavDTOList')[2].get('title')[len(data.get('cityName')):]) 56 | 57 | albums = [] 58 | images = data.get('albumDTOList') 59 | for node in images: 60 | albums.append(node.get('url')) 61 | print('相册', albums) 62 | 63 | 64 | if __name__ == '__main__': 65 | print(""" 66 | \033[1;33m请输入商铺ID \033[0m 67 | """) 68 | p_id = input('(商铺网址末尾数字就是ID)') 69 | ParsePlayInfo(p_id) 70 | -------------------------------------------------------------------------------- /其他实战/【试客联盟】登录/login.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-11-23 Python: 3.7 4 | 5 | 6 | import execjs 7 | import requests, re 8 | 9 | s = requests.Session() 10 | 11 | 12 | def main(pwd): 13 | """res_n 这个参数 是从网页获取的 但调试发现是其实固定的 14 | :param pwd: 15 | :return: 16 | """ 17 | with open('execute.js', 'r', encoding='utf-8') as f: 18 | js = execjs.compile(f.read()) 19 | 20 | print('引擎', execjs.get().name) 21 | sign = js.call('get_pwd', pwd) 22 | return sign 23 | 24 | 25 | def login(sign_pwd, username): 26 | url = "http://login.shikee.com/check/?&_1574394219820" 27 | data = { 28 | "username": username, 29 | "password": sign_pwd, 30 | "vcode": '', 31 | "to": 'http://user.shikee.com/', 32 | } 33 | res = s.post(url=url, data=data) 34 | res.encoding = "utf-8" 35 | print(res.text) 36 | 37 | 38 | def home(): 39 | home_url = "http://user.shikee.com/buyer" 40 | response = s.get(home_url) 41 | html = response.content.decode('utf-8') 42 | data = re.findall( 43 | '
.*?您好!(.*?)您有未读提醒.*?1.*?
', 44 | html, re.S)[0] 45 | print(data) 46 | 47 | 48 | if __name__ == '__main__': 49 | username = input('请输入账户:') 50 | pwd = input('请输入密码:') 51 | sign = main(pwd) 52 | print('正在登录....') 53 | login(sign, username) 54 | home() 55 | -------------------------------------------------------------------------------- /其他实战/【谷雨】数字解密/GuYu.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-25 Python: 3.7 4 | 5 | import requests 6 | import os 7 | from fontTools.ttLib import TTFont 8 | 9 | 10 | class Font: 11 | """ 12 | https://guyujiezi.com/ 13 | 谷雨解字的 数字解密 14 | 现在版本的 谷雨字体加的 xml 会有一个移位操作 15 | """ 16 | def __init__(self, uri): 17 | self.url = uri 18 | self.filename = uri.split('/')[-1] 19 | self.font = None 20 | self._list = [] 21 | 22 | def check(self): 23 | """检查目录 24 | """ 25 | if not os.path.isfile(self.filename): 26 | resp = requests.get(self.url) 27 | with open(self.filename, 'wb') as f: 28 | f.write(resp.content) 29 | # TTFont 存为 xml 30 | self.font = TTFont(self.filename) 31 | self.font.saveXML(self.filename.replace(self.filename.split('.')[-1], 'xml')) 32 | 33 | def get_wo(self): 34 | """获取 woff 35 | """ 36 | self.check() 37 | ph = self.font['cmap'] 38 | _dict = ph.tables[0].cmap 39 | # 1. 字典取 value 列表化 40 | # 2. str 取最后 2 位,并转为 int 41 | # 3. 减去 17 并重新组装列表 42 | self._list = [int(i[-2:])-17 for i in list(_dict.values())] 43 | """ 44 | 处理移位 45 | """ 46 | print(list(_dict.values())) 47 | print(self._list) 48 | 49 | def parse(self, number): 50 | _str = '' 51 | for num in number: 52 | _str += str(self._list[int(num)]) 53 | print('最终展示字', int(_str)) 54 | 55 | 56 | if __name__ == '__main__': 57 | ft = Font("https://guyujiezi.com/fonts/2DLw9u/3iZbr8.woff") 58 | ft.get_wo() 59 | # 输入页面数字测试 60 | ft.parse('947') 61 | 62 | 63 | -------------------------------------------------------------------------------- /其他实战/【豆瓣】自动登录/DouBan.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2020-01-08 Python: 3.7 4 | 5 | import requests 6 | import re 7 | 8 | 9 | class DouBan: 10 | def __init__(self, name, pwd): 11 | self.name = name.strip() 12 | self.pwd = pwd.strip() 13 | self.session = requests.session() 14 | self.headers = { 15 | 'Origin': 'https://accounts.douban.com', 16 | 'Host': 'accounts.douban.com', 17 | 'Referer': 'https://accounts.douban.com/passport/login_popup?login_source=anony', 18 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36', 19 | } 20 | self.login_url = 'https://accounts.douban.com/j/mobile/login/basic' 21 | self.index_url = "https://www.douban.com/" 22 | 23 | 24 | def login(self): 25 | data = { 26 | 'ck': '', 27 | 'name': self.name, 28 | 'password': self.pwd, 29 | 'remember': 'false', 30 | 'ticket': '', 31 | } 32 | self.session.post(self.login_url, data=data, headers=self.headers) 33 | 34 | def check(self): 35 | self.headers['Host'] = 'www.douban.com' 36 | response = self.session.get("https://www.douban.com/", headers=self.headers) 37 | try: 38 | title = re.search(r'(.*?)的帐号', response.text).group(1) 39 | print('【登录成功】', title) 40 | except AttributeError:  # 页面中未匹配到用户名,视为登录失败 41 | print('【登录失败】') 42 | 43 | 44 | if __name__ == '__main__': 45 | username = input('豆瓣用户名 >>>') 46 | password = input('密码 >>>') 47 | db = DouBan(username, password) 48 | db.login() 49 | db.check() 50 | 
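# 登录成功后 self.session 已携带登录 cookies,可直接复用它继续请求需要登录的页面。
# 下面两行是一个最小示意(假设:具体要抓的页面与解析逻辑按实际需求替换):
# resp = db.session.get(db.index_url, headers=db.headers)
# print(resp.status_code)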
-------------------------------------------------------------------------------- /其他实战/【逗游】自动登录/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-06 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【逗游】自动登录/douyou.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-08-01 Python: 3.7 4 | 5 | import js2py 6 | import requests 7 | import json 8 | 9 | 10 | class DouYou: 11 | headers = { 12 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36', 13 | 'Referer': 'http://www.doyo.cn/passport/login' 14 | } 15 | 16 | def __init__(self, username, password): 17 | self.context = js2py.EvalJs() # python中使用js 18 | self.username = username 19 | self.password = password 20 | 21 | def make_password(self): 22 | """取加密后的字符串 23 | """ 24 | try: 25 | nonce, ts = self.get_token() 26 | with open("encryp.js", "r", encoding="utf-8") as f: 27 | self.context.execute(f.read()) 28 | pwd_hash = self.context.get_value(self.password, nonce, ts) 29 | return pwd_hash # 打印加密之后的密码 30 | except: 31 | print('获取token失败') 32 | 33 | def get_token(self): 34 | """获取 token 35 | """ 36 | get_token_url = 'http://www.doyo.cn/User/Passport/token?username={user}&random=0.1428378278012199'.format(user=self.username) 37 | result = json.loads(requests.get(get_token_url).text) 38 | if result.get('result'): 39 | nonce = result.get('nonce') 40 | ts = result.get('ts') 41 | return nonce, ts 42 | else: 43 | print('获取token失败') 44 | exit() 45 | 46 | def login(self): 47 | """登陆 48 | """ 49 | # decode('unicode_escape') 50 | login_url = 'http://www.doyo.cn/passport/login' 51 | data = { 52 | 'username': self.username, 53 | 'password': self.make_password(), 54 | 'remberme': '1', 55 | 'next': 'aHR0cCUzQSUyRiUyRnd3dy5kb3lvLmNuJTJG' 56 | } 57 | response = requests.post(login_url, data=data, headers=self.headers) 58 | info = json.loads(response.text) 59 | if info.get('result'): 60 | print('登陆成功 | 用户等级:{level} 用户id:{uid}'.format(level=info.get('level'), uid=info.get('uid'))) 61 | else: 62 | print('登陆失败') 63 | 64 | 65 | if __name__ == '__main__': 66 | user = input('输入逗游账号') 67 | pwd = input('输入密码') 68 | dy = DouYou(user, pwd) 69 | dy.login() 70 | -------------------------------------------------------------------------------- /其他实战/【金逸电影】自动注册/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-06 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【金逸电影】自动注册/register.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/其他实战/【金逸电影】自动注册/register.png -------------------------------------------------------------------------------- /其他实战/【金逸电影】自动注册/register.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-04 Python: 3.7 4 | 5 | import requests 6 | import execjs.runtime_names 7 | 8 | 9 | class JinYiRegister: 10 | """ 11 | 金逸电影注册 12 | 
http://www.jycinema.com/wap/#/register 13 | """ 14 | def __init__(self, phone): 15 | self.headers = { 16 | 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1', 17 | } 18 | self.url = 'http://www.jycinema.com/frontUIWebapp/appserver/photoMessageService/newsSendMessage' 19 | self.phone = phone 20 | 21 | @staticmethod 22 | def js_make(json_data): 23 | with open('encryp.js', 'r', encoding='utf-8') as f: 24 | js = execjs.compile(f.read()) 25 | try: 26 | result = js.call("getEncryption", json_data) 27 | return result 28 | except Exception: 29 | print('js 异常') 30 | 31 | def register(self): 32 | data = '{"mobileNumber": ' + self.phone + ', "channelId": 7, "channelCode": "J0005", "memberId": ""}' 33 | data = { 34 | 'params': self.js_make(data), 35 | 'Origin': 'http://www.jycinema.com', 36 | 'Referer': 'http://www.jycinema.com/wap/', 37 | } 38 | response = requests.post(self.url, data=data, headers=self.headers) 39 | print(response.content.decode('utf-8')) 40 | 41 | 42 | if __name__ == '__main__': 43 | your_phone = input('请输入待注册手机号') 44 | jy = JinYiRegister(your_phone) 45 | jy.register() 46 | -------------------------------------------------------------------------------- /其他实战/【青海移动】登陆参数生成/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-06 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /其他实战/【青海移动】登陆参数生成/make_param.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-09-12 Python: 3.7 4 | import execjs.runtime_names 5 | 6 | 7 | class QinHaiYiDong: 8 | """ 9 | 青海移动 10 | 参数加密 11 | https://www.iqhmall.cn/shopweb/logon/logon 12 | """ 13 | def __init__(self, user, pwd): 14 | self.js = None 15 | self.user = user 16 | self.pwd = pwd 17 | self.init_js() 18 | 19 | def init_js(self): 20 | print('引擎', execjs.get().name) 21 | with open("encryp.js", "r", encoding="utf-8") as f: 22 | self.js = execjs.compile(f.read()) 23 | 24 | def make_param(self): 25 | print(self.js.call('test', self.pwd)) 26 | 27 | 28 | if __name__ == '__main__': 29 | yd = QinHaiYiDong('17327362817', '123123123') 30 | yd.make_param() 31 | -------------------------------------------------------------------------------- /其他实战/【餐饮】查询信息/FoodInfo.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf-8 -*- 2 | # Time : 2020/01/16 3 | # Author : Zok 4 | # Email : 362416272@qq.com 5 | 6 | import requests 7 | import re 8 | import json 9 | from copyheaders import headers_raw_to_dict 10 | 11 | 12 | class Food: 13 | """ 14 | 根据输入美团餐馆名,解析参观基础信息 15 | """ 16 | def __init__(self): 17 | self.headers = headers_raw_to_dict(b""" 18 | Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9 19 | Accept-Encoding: gzip, deflate, br 20 | Accept-Language: zh-CN,zh;q=0.9 21 | Cache-Control: max-age=0 22 | Connection: keep-alive 23 | Cookie: _lxsdk_s=16fb0ce3a0d-4cf-d9e-cf2%7C%7C1 24 | Host: www.meituan.com 25 | Sec-Fetch-Mode: navigate 26 | Sec-Fetch-Site: none 27 | Sec-Fetch-User: ?1 28 | Upgrade-Insecure-Requests: 1 29 | User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 
Safari/537.36 30 | """) 31 | 32 | def get_info(self, url): 33 | response = requests.get(url, headers=self.headers) 34 | data = json.loads(re.search(r'{ Object.defineProperties(navigator,' 23 | '{ webdriver:{ get: () => false } }) }') # 本页刷新后值不变 24 | 25 | await page.goto(self.login_url) 26 | await page.type('input#login-email', username) 27 | await page.type('input#login-password', password) 28 | await page.click('input.btn') 29 | await self.get_cookie(page,username,password) 30 | 31 | async def get_cookie(self, page,username,password): 32 | """ 33 | 获取 cookies 34 | :param page: 页面 35 | :return: 36 | """ 37 | cookies_list = await page.cookies() 38 | cookies = '' 39 | for cookie in cookies_list: 40 | str_cookie = '{0}={1};' 41 | str_cookie = str_cookie.format(cookie.get('name'), cookie.get('value')) 42 | cookies += str_cookie 43 | # 储存cookies 44 | print(cookies) 45 | self.r.set(username, json.dumps({'password': password, 'cookies': cookies})) 46 | 47 | 48 | if __name__ == '__main__': 49 | mt = MeiTuanCookies() 50 | 51 | with open('账号.txt', 'r', encoding='utf-8') as f: 52 | # 账号|密码\n 53 | lines = f.readlines() 54 | 55 | tasks = [] 56 | for line in lines: 57 | username, password = line.strip().split('|') 58 | tasks.append(mt.star(username, password)) 59 | 60 | loop = asyncio.get_event_loop() 61 | loop.run_until_complete(asyncio.wait(tasks)) 62 | 63 | -------------------------------------------------------------------------------- /原创爬虫工具/Cookies/MeiTuan/账号.txt: -------------------------------------------------------------------------------- 1 | 账号1|密码1 2 | 账号2|密码2 3 | 账号3|密码3 -------------------------------------------------------------------------------- /原创爬虫工具/Cookies/README.md: -------------------------------------------------------------------------------- 1 | # 异步批量登陆美团获取cookies 2 | 3 | > pyppeteer 异步批量登陆美团并将cookies储存到redis 的hash表中 -------------------------------------------------------------------------------- /原创爬虫工具/Cookies/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-05-14 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /原创爬虫工具/DataMigration/README.md: -------------------------------------------------------------------------------- 1 | # 工作中经常有这种需求 2 | > 将采集好的mongodb数据转存到mysql中,或者是redis数据转到mongodb,于是打算封装一个组件便于以后调用 3 | 4 | # mysql转存mongo 5 | 1. 在 config 中配置 mongo 与 mysql 连接 6 | 2. 在 `msyql_to_mongo.py` 下方实例化时填入 `需要转换mysql表名`, `mongo库名`, `mongo表名` 7 | 3. 
调用 `mi.easy_to_mongo()` 即可将 mysql 中的数据导入到 mongodb 8 | 9 | > 当然也支持自定义转换,在类中添加即可 -------------------------------------------------------------------------------- /原创爬虫工具/DataMigration/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-05-15 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /原创爬虫工具/DataMigration/config.py: -------------------------------------------------------------------------------- 1 | # mongodb链接 2 | MONGODB_URL = 'mongodb://localhost:27017' 3 | 4 | # Redis数据库地址 5 | REDIS_HOST = '' 6 | 7 | # Redis端口 8 | REDIS_PORT = 6379 9 | 10 | # Redis密码,如无填None 11 | REDIS_PASSWORD = None 12 | 13 | # Mysql地址 14 | MYSQL_HOST = '127.0.0.1' 15 | 16 | # Mysql端口 17 | MYSQL_PORT = 3306 18 | 19 | # Mysql用户名 20 | MYSQL_USER = 'root' 21 | 22 | # Mysql密码 23 | MYSQL_PASSWORD = '' 24 | 25 | # Mysql链接库 26 | MYSQL_DB_NAME = 'travel' 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /原创爬虫工具/DataMigration/db/MongoDB.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-05-15 Python: 3.7 4 | from pymongo import MongoClient 5 | 6 | from DataMigration.config import MONGODB_URL 7 | 8 | 9 | class Mongo(object): 10 | def __init__(self, db_name, collection): 11 | client = MongoClient(MONGODB_URL) 12 | database = client[db_name] 13 | self.collection = database[collection] 14 | 15 | def delete(self, *args, del_one=True): 16 | """ 17 | 删除符合条件的信息 18 | :param args: 过滤条件 19 | :param del_one: 默认删除第一条,否则删除符合条件的所有 20 | :return: 21 | """ 22 | return self.collection.delete_one(*args) if del_one else self.collection.delete_many(*args) 23 | 24 | @property 25 | def all(self): 26 | """ 27 | 返回全部 28 | :return: 整表信息 29 | """ 30 | return self.collection.find({}) 31 | 32 | def find(self, *args): 33 | """ 34 | 指定查找 35 | :param args: 查询条件 36 | :return: 37 | """ 38 | return self.collection.find(*args) 39 | 40 | def update(self, *args, update_one=True): 41 | """ 42 | 修改数据 43 | :param args: 过滤条件与更新内容 44 | :param update_one: 默认修改第一个,否则修改符合条件的所有 45 | :return: 46 | """ 47 | return self.collection.update_one(*args) if update_one else self.collection.update_many(*args) 48 | 49 | def insert(self, *args, insert_one=True): 50 | """ 51 | 插入数据 52 | :param args: 要插入的文档 53 | :param insert_one: 默认插入一个 54 | :return: 55 | """ 56 | return self.collection.insert_one(*args) if insert_one else self.collection.insert_many(*args) 57 | 58 | 59 | if __name__ == '__main__': 60 | # 测试 61 | mg = Mongo('meituan', 'user_info') 62 | # data = mg.all 63 | ret = mg.update({'用户名': '三丰948'}, {'$set': {'用户名': '三三风'}}) 64 | -------------------------------------------------------------------------------- /原创爬虫工具/DataMigration/db/Mysql.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-05-15 Python: 3.7 4 | import pymysql 5 | 6 | from DataMigration.config import MYSQL_HOST, MYSQL_PORT, MYSQL_USER, MYSQL_PASSWORD, MYSQL_DB_NAME 7 | 8 | 9 | class Mysql(object): 10 | def __init__(self): 11 | """ 12 | 链接数据库 13 | """ 14 | self.conn = pymysql.Connect( 15 | host=MYSQL_HOST, 16 | port=MYSQL_PORT, 17 | user=MYSQL_USER, 18 | password=MYSQL_PASSWORD, 19 | db=MYSQL_DB_NAME, 20 | ) 21 | 22 | def insert(self, sql): 23 | """ 24 | 
查找 25 | :param sql: sql语句 26 | :return: 27 | """ 28 | # 创建游标对象 29 | cursor = self.conn.cursor() 30 | # 执行并提交 31 | try: 32 | cursor.execute(sql) 33 | self.conn.commit() 34 | except Exception as e: 35 | print('异常回滚') 36 | self.conn.rollback() 37 | finally: 38 | cursor.close() 39 | 40 | def select(self, sql): 41 | """ 42 | 查找 43 | :param sql: sql 语句 44 | :return: 查找结果 45 | """ 46 | cursor = self.conn.cursor() # 创建游标对象 47 | # 提交事务 48 | try: 49 | cursor.execute(sql) 50 | data = cursor.fetchall() 51 | except Exception as e: 52 | print('异常回滚') 53 | data = None 54 | self.conn.rollback() 55 | finally: 56 | cursor.close() 57 | return data 58 | 59 | -------------------------------------------------------------------------------- /原创爬虫工具/DataMigration/db/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-05-15 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /原创爬虫工具/DataMigration/migration/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-05-15 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /原创爬虫工具/DataMigration/migration/mongo_to_mysql.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-05-23 Python: 3.7 4 | 5 | 6 | from DataMigration.db.MongoDB import Mongo 7 | from DataMigration.db.Mysql import Mysql 8 | from DataMigration.config import MYSQL_DB_NAME 9 | 10 | 11 | class Migrate(object): 12 | def __init__(self,mysql_table_name, mongodb_name, mongodb_collection): 13 | self.mongo = Mongo(mongodb_name, mongodb_collection) 14 | self.mysql = Mysql() 15 | self.mysql_name = mysql_table_name 16 | 17 | def easy_to_mongo(self, column_comment=False): 18 | """ 19 | 将输入插入 mongodb 20 | :return: 21 | """ 22 | columns = self.get_column() 23 | nodes = self.all_mysql_data() 24 | data_list = [] 25 | 26 | for node in nodes: 27 | data_dict = {} 28 | for index, column in enumerate(columns): 29 | if column_comment: 30 | data_dict[column[1]] = node[index] 31 | else: 32 | data_dict[column[0]] = node[index] 33 | data_list.append(data_dict) 34 | try: 35 | self.mongo.insert(data_list, insert_one=False) 36 | print('储存成功') 37 | except Exception: 38 | print('转存失败') 39 | 40 | def all_mysql_data(self): 41 | """ 42 | 获取需要转换的数据 43 | :return: 所有 mysql 数据 44 | """ 45 | sql = """SELECT * from {table_name};""".format(table_name=self.mysql_name) 46 | return self.mysql.select(sql) 47 | 48 | def get_column(self): 49 | """ 50 | 取字段名 51 | :return: (字段名,字段描述) 52 | """ 53 | sql = """select COLUMN_NAME,column_comment 54 | from INFORMATION_SCHEMA.Columns 55 | where table_name='{table_name}' and table_schema='{db_name}'""".format( 56 | table_name=self.mysql_name, 57 | db_name=MYSQL_DB_NAME, 58 | ) 59 | return self.mysql.select(sql) 60 | 61 | 62 | if __name__ == '__main__': 63 | mi = Migrate('需要转换mysql表名', 'mongo库名', 'mongo表名') 64 | mi.easy_to_mongo(column_comment=True) # column_comment=True 使用注释的字段名, 默认不使用 65 | -------------------------------------------------------------------------------- /原创爬虫工具/DataMigration/migration/mysql_to_mongo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = 
"zok" 362416272@qq.com 3 | # Date: 2019-05-20 Python: 3.7 4 | 5 | 6 | from DataMigration.db.MongoDB import Mongo 7 | from DataMigration.db.Mysql import Mysql 8 | from DataMigration.config import MYSQL_DB_NAME 9 | 10 | 11 | class Migrate(object): 12 | def __init__(self, mysql_table_name, mongodb_name, mongodb_collection): 13 | self.mongo = Mongo(mongodb_name, mongodb_collection) 14 | self.mysql = Mysql() 15 | self.mysql_name = mysql_table_name 16 | 17 | def easy_to_mongo(self, column_comment=False): 18 | """ 19 | 将输入插入 mongodb 20 | :return: 21 | """ 22 | columns = self.get_column() 23 | nodes = self.all_mysql_data() 24 | data_list = [] 25 | 26 | for node in nodes: 27 | data_dict = {} 28 | for index, column in enumerate(columns): 29 | if column_comment: 30 | data_dict[column[1]] = node[index] 31 | else: 32 | data_dict[column[0]] = node[index] 33 | data_list.append(data_dict) 34 | try: 35 | self.mongo.insert(data_list, insert_one=False) 36 | print('储存成功') 37 | except Exception: 38 | print('转存失败') 39 | 40 | def all_mysql_data(self): 41 | """ 42 | 获取需要转换的数据 43 | :return: 所有 mysql 数据 44 | """ 45 | sql = """SELECT * from {table_name};""".format(table_name=self.mysql_name) 46 | return self.mysql.select(sql) 47 | 48 | def get_column(self): 49 | """ 50 | 取字段名 51 | :return: (字段名,字段描述) 52 | """ 53 | sql = """select COLUMN_NAME,column_comment 54 | from INFORMATION_SCHEMA.Columns 55 | where table_name='{table_name}' and table_schema='{db_name}'""".format( 56 | table_name=self.mysql_name, 57 | db_name=MYSQL_DB_NAME, 58 | ) 59 | return self.mysql.select(sql) 60 | 61 | 62 | if __name__ == '__main__': 63 | mi = Migrate('需要转换mysql表名', 'mongo库名', 'mongo表名') 64 | mi.easy_to_mongo(column_comment=True) # column_comment=True 使用注释的字段名, 默认不使用 65 | -------------------------------------------------------------------------------- /原创爬虫工具/Decode/README.md: -------------------------------------------------------------------------------- 1 | # 可拓展式解密器 2 | > 方便测试可连续转换重制的编码转换器,可灵活拓展解码规则 3 | 4 | 5 | # 说明博客 6 | 7 | [**博客地址**](https://www.zhangkunzhi.com/?p=241) 8 | 9 | 10 | -------------------------------------------------------------------------------- /原创爬虫工具/Decode/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-06-01 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /原创爬虫工具/Decode/translation.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-04-28 Python: 3.7 4 | 5 | import base64 6 | import zlib 7 | 8 | COLOR = {'red': 1, 'green': 2, 'yellow': 3, 'blue': 4} 9 | 10 | 11 | class TranslationMetaClass(type): 12 | """Meta 类""" 13 | def __new__(mcs, name, bases, attrs): 14 | count = 0 15 | attrs['__Decode__'] = {} 16 | for k, v in attrs.items(): 17 | if 'decode_' in k: 18 | count += 1 19 | attrs['__Decode__'][str(count)] = k 20 | attrs['__TranslationFuncCount__'] = count 21 | return type.__new__(mcs, name, bases, attrs) 22 | 23 | 24 | class Util(object): 25 | """辅助类""" 26 | 27 | @staticmethod 28 | def _print(color, msg): 29 | """print color control 30 | """ 31 | node = '\033[1;3{id}m{msg}\033[0m' 32 | if COLOR.get(color): 33 | print(node.format(id=COLOR.get(color), msg=msg)) 34 | else: 35 | print(msg) 36 | 37 | def msg(self): 38 | """print decode func 39 | """ 40 | for k in self.__Decode__: 41 | self._print('yellow', str(k) + 
': ' + self.__Decode__[k][7:]) 42 | self._print('yellow', 'r: 【重制】 e:【退出】') 43 | return input('请选择 >>>').lower() 44 | 45 | 46 | class Decode(Util, metaclass=TranslationMetaClass): 47 | """ 48 | 将需要添加的转码类型按下列类似格式添加即可 49 | def decode_自定义名(self): 50 | self._key = 解密过程 51 | """ 52 | def __init__(self, _key): 53 | self._key = _key 54 | self._copy = _key 55 | self.crumbs = '' 56 | 57 | def main(self): 58 | choice = self.msg() 59 | while choice != 'e': 60 | if choice == 'r': # 重制 61 | self._key, self.crumbs = self._copy, '' 62 | self._print('blue', '重制成功: ' + self._key) 63 | choice = self.msg() 64 | elif choice in self.__Decode__: # 选择是否在现有函数选项中 65 | try: 66 | eval("self.{}()".format(self.__Decode__[choice])) # 字符串转函数运行 67 | self._print('blue', self._key) 68 | self.crumbs += self.__Decode__[choice][7:] + ' > ' 69 | self._print('green', self.crumbs) 70 | choice = self.msg() 71 | except Exception: 72 | choice = input('解码失败,换一种 >>>') 73 | 74 | self._print('red', '调试结束') 75 | 76 | def decode_base64(self): 77 | """解base64""" 78 | self._key = base64.b64decode(self._key) 79 | 80 | def decode_zlib(self): 81 | """解压串""" 82 | self._key = zlib.decompress(self._key) 83 | 84 | def decode_str(self): 85 | """转字符串""" 86 | self._key = str(self._key, encoding="utf-8") 87 | 88 | def decode_hex(self): 89 | """转到16进制""" 90 | self._key = self._key.hex() 91 | 92 | 93 | if __name__ == '__main__': 94 | # _key = 'eJyrVnqxZdnT/u1KVgpKpcWpRUo6CkpP17c9X9AIEilILC4uzy9KUaoFAGxTEMo=' # 测试 95 | _key = input('\033[1;31m输入解码内容>>> \033[0m') 96 | ts = Decode(_key) 97 | ts.main() 98 | -------------------------------------------------------------------------------- /原创爬虫工具/Jsencrypt/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-06-29 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /原创爬虫工具/Jsencrypt/make_encrypt.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-06-28 Python: 3.7 4 | 5 | import base64 6 | 7 | from Crypto.Cipher import PKCS1_v1_5 as Cipher_pkcs1_v1_5 8 | from Crypto.PublicKey import RSA 9 | 10 | 11 | public_key = """ 12 | -----BEGIN PUBLIC KEY----- 13 | Your PUBLIC KEY 14 | -----END PUBLIC KEY----- 15 | """ 16 | 17 | 18 | def make_message(pwd): 19 | rsakey = RSA.importKey(public_key) 20 | cipher = Cipher_pkcs1_v1_5.new(rsakey) 21 | cipher_text = base64.b64encode(cipher.encrypt(pwd.encode(encoding="utf-8"))) 22 | return cipher_text.decode('utf8') 23 | 24 | 25 | if __name__ == '__main__': 26 | print(make_message('hellow')) 27 | -------------------------------------------------------------------------------- /原创爬虫工具/OSS/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-06-24 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /原创爬虫工具/OSS/push_to_oss.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-06-24 Python: 3.7 4 | 5 | """ 6 | 将图redis中储存的网络图片链接,并发直传到 OSS 上 7 | """ 8 | 9 | import oss2 10 | import redis 11 | import requests 12 | 13 | from concurrent.futures import ThreadPoolExecutor # 线程池模块 14 | 15 | KEY = '' 16 | 
KEYSECRET = '' 17 | BUCKETNAME = '' 18 | ENDPOINT = 'http://oss-cn-hangzhou.aliyuncs.com' 19 | 20 | REDIS_HOST = "localhost" 21 | REDIS_USER = "root" 22 | REDIS_PASSWORD = "" 23 | REDIS_DB_NAME = 1 24 | REDIS_PORT = 6379 25 | 26 | list_name = 'restaurant' # 列队名 27 | 28 | # oss 29 | auth = oss2.Auth(KEY, KEYSECRET) 30 | bucket = oss2.Bucket(auth, ENDPOINT, BUCKETNAME) 31 | 32 | # redis 池 33 | pool = redis.ConnectionPool(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB_NAME, password=REDIS_PASSWORD, 34 | decode_responses=True) 35 | r = redis.Redis(connection_pool=pool) 36 | 37 | 38 | def put_img(): 39 | """上传逻辑,根据项目需求修改即可""" 40 | url = r.rpop(list_name) 41 | resp = requests.get(url) 42 | if resp.status_code == 200: 43 | file_name = url # this is file name 44 | obj = bucket.put_object(file_name, resp) 45 | if obj.status == 200: 46 | print('OK', file_name) 47 | else: 48 | r.lpush(list_name, url) # 下载失败时把链接推回队列 49 | 50 | 51 | def get_len(): 52 | return r.llen(list_name) 53 | 54 | 55 | if __name__ == '__main__': 56 | list_len = get_len() 57 | print('专辑总图数量', list_len) 58 | pool = ThreadPoolExecutor() # 设置线程池大小,默认等于cpu核数 59 | for i in range(list_len): 60 | pool.submit(put_img) 61 | 62 | pool.shutdown(wait=True) 63 | print('主进程') 64 | -------------------------------------------------------------------------------- /原创爬虫工具/Proxy/KDLProxyPool.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-06-17 Python: 3.7 4 | 5 | """ 6 | 快代理IP池 7 | https://www.kuaidaili.com/ 放代理API 8 | """ 9 | 10 | import redis 11 | import requests 12 | import json 13 | 14 | from apscheduler.schedulers.blocking import BlockingScheduler 15 | 16 | 17 | class KDLProxyPool(object): 18 | """ 19 | 快代理IP池 20 | 用的快代理开放代理API 21 | """ 22 | 23 | def __init__(self, key, count): 24 | try: 25 | self.key = key # 订单号 26 | self.count = count # 代理池代理数量 27 | """redis数据库配置区""" 28 | pool = redis.ConnectionPool(decode_responses=True) 29 | self.r = redis.Redis(connection_pool=pool) 30 | except: 31 | print('请填入正确的API链接') 32 | 33 | def check_ip(self): 34 | """ 35 | 监控 IP 分数、个数,对其进行增删 36 | """ 37 | # 检查分数 38 | nodes = self.r.zrevrange('KDLProxy', 0, -1, withscores=True) 39 | for i in nodes: 40 | node = list(i) 41 | score = int(node[1]) 42 | if score <= 0: 43 | print('\033[1;33m分数过低剔除\033[0m') 44 | self.r.zrem('KDLProxy', node[0]) 45 | 46 | # 检查个数 47 | _sum = self.r.zcard('KDLProxy') 48 | if _sum < self.count: 49 | self.add_ip(self.count - _sum) 50 | 51 | def add_ip(self, num): 52 | """ 53 | 提取IP 54 | """ 55 | get_url = 'http://svip.kdlapi.com/api/getproxy/?orderid={key}&num={num}&protocol=2&method=2&an_ha=1&sp1=1&quality=2&format=json&sep=1'.format( 56 | key=self.key, num=num) 57 | 58 | # 返回的文本进行解析 59 | response = requests.get(get_url) 60 | if response.status_code == 200: 61 | ret = json.loads(response.text) 62 | if ret.get('code') == 0: 63 | self.parse(ret.get('data').get('proxy_list')) 64 | else: 65 | print(ret.get('msg')) 66 | else: 67 | print('提取失败') 68 | 69 | def parse(self, proxy_list): 70 | """ 71 | 解析返回数据 72 | """ 73 | for node in proxy_list: 74 | self.save_to_redis(node, 10) # 默认10分 75 | 76 | def save_to_redis(self, proxy, expire): 77 | """ 78 | 推送到redis集合中 79 | """ 80 | print('代理 %s 推入redis集合' % proxy) 81 | self.r.zadd('KDLProxy', {proxy: expire}) 82 | 83 | 84 | def aps_run(): 85 | """ 86 | 监控 87 | """ 88 | kdl.check_ip() 89 | 90 | 91 | kdl = KDLProxyPool('填写开放代理订单号', 20) 92 | 93 | # 循环监控 94 | scheduler = BlockingScheduler() 95 | 
scheduler.add_job(aps_run, 'cron', second='*/1') # 这里设置检测评论,推荐2s一次(默认) 96 | scheduler.start() 97 | -------------------------------------------------------------------------------- /原创爬虫工具/Proxy/README.md: -------------------------------------------------------------------------------- 1 | [TOC] 2 | 3 | # 安装模块 4 | 5 | ```bush 6 | pip3 install redis 7 | pip3 install apscheduler 8 | pip3 install reuqest 9 | pip3 install python-dateutil 10 | ``` 11 | 12 | # 讯代理池使用 13 | 1. 登陆讯代理 进入API页码将下面下方生成的API复制 14 | ![讯代理API](https://www.zhangkunzhi.com/images/xdl3.png) 15 | 16 | 2. 将链接复制到项目该位置 17 | ![讯代理API](https://www.zhangkunzhi.com/images/xdl4.png) 18 | 19 | 3. 配置redis, 默认是本机 20 | ![讯代理API](https://www.zhangkunzhi.com/images/xdl5.png) 21 | 22 | 4. 启动程序,大功告成,只需要在调用ip的时候对其进行增减分操作即可 23 | ![讯代理API](https://www.zhangkunzhi.com/images/xdl1.png) 24 | ![讯代理API](https://www.zhangkunzhi.com/images/xdl2.png) 25 | 26 | # 芝麻代理池使用 27 | 28 | 1. 首先登陆你的芝麻代理后台管理,找到自己的key如图 29 | ![key位置](https://www.zhangkunzhi.com/images/芝麻1.png) 30 | 31 | 1. 在代码下方配置key 32 | ![key位置](https://www.zhangkunzhi.com/images/填入芝麻key.png) 33 | 34 | 1. 在代码中配置 redis库连接 **默认链接的本地** 35 | ![key位置](https://www.zhangkunzhi.com/images/代理模块.png) 36 | 37 | 1. 启动程序 38 | > 如果在服务端可以使用后台运行命令 39 | `nohup python3 ProxyPool.py >my.log &` 40 | 41 | 1. 第一次启动芝麻代理会绑定你的ip白名单,稍等片刻就会开始提取 42 | 43 | ![key位置](https://www.zhangkunzhi.com/images/提取ip.png) 44 | 45 | 1. 链接redis可以看到ip池了,大功告成 46 | ![key位置](https://www.zhangkunzhi.com/images/20个ip.png) 47 | 48 | 1. 后续在使用代理ip时,根据访问结果对代理ip积分增减即可,后续会更新这个Demo继续关注Github即可。[**传送门**](https://github.com/wkunzhi/SpiderUtilPackage) 49 | 50 | 51 | # 额外配置 52 | - 可以自由配置,代理池上线值(默认20),实例化时配置即可 53 | ```python 54 | zm = ZhiMaPool('key', ip_sum=100) 55 | ``` 56 | - 可以自由配置,只取可用时间xx以上的ip(默认1号套餐下的1000秒以上),实例化时配置即可 57 | ```python 58 | zm = ZhiMaPool('key', ttl=1000) 59 | ``` 60 | - 还可以配置 每次提取数、提取套餐类型、提取ip HTTP或者HTTPS或者Sockets 61 | -------------------------------------------------------------------------------- /原创爬虫工具/Proxy/XDLProxyPool.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-05-09 Python: 3.7 4 | import redis 5 | import requests 6 | import re 7 | import time 8 | import json 9 | 10 | from apscheduler.schedulers.blocking import BlockingScheduler 11 | 12 | """ 13 | 可自型拓展其他的代理ip产品,只需修改调用接口即可 14 | """ 15 | 16 | 17 | class XDLProxyPool(object): 18 | """ 19 | 迅代理IP池 20 | """ 21 | 22 | def __init__(self, api_url): 23 | try: 24 | """redis数据库配置区""" 25 | pool = redis.ConnectionPool(decode_responses=True) 26 | self.r = redis.Redis(connection_pool=pool) 27 | 28 | """白名单初始化""" 29 | ret = re.search(r'spiderId=(.*?)&orderno=(.*?)&returnType=\d+&count=(\d+)', api_url) 30 | self.spiderId, self.orderno, self.count = ret.group(1), ret.group(2), int(ret.group(3)) 31 | self.init_proxy() 32 | except: 33 | print('请填入正确的API链接') 34 | 35 | def init_proxy(self): 36 | """ 37 | 初始化代理 38 | """ 39 | print('\033[1;35m初始化中...\033[0m') 40 | 41 | # 取出当前IP地址 42 | response = requests.get('http://pv.sohu.com/cityjson?ie=utf-8') 43 | address = re.search(r'"cip": "(.*?)", "cid', response.text).group(1) 44 | 45 | # 加入白名单 46 | url = 'http://www.xdaili.cn/ipagent/newWhilteList/updateByOrder?orderno={orderno}&ip={ip}&spiderId={spiderId}'.format( 47 | orderno=self.orderno, ip=address, spiderId=self.spiderId) 48 | status = requests.get(url=url).status_code 49 | if status == 200: 50 | print('\033[1;35m初始化成功,启动中稍等..\033[0m') 51 | time.sleep(2) 52 | print('监控已开启') 53 
| else: 54 | print('初始化白名单失败') 55 | 56 | def check_ip(self): 57 | """ 58 | 监控 IP 分数、个数,对其进行增删 59 | """ 60 | 61 | # 检查分数 62 | nodes = self.r.zrevrange('XDLProxy', 0, -1, withscores=True) 63 | for i in nodes: 64 | node = list(i) 65 | score = int(node[1]) 66 | if score <= 0: 67 | print('\033[1;33m分数过低剔除\033[0m') 68 | self.r.zrem('XDLProxy', node[0]) 69 | 70 | # 检查个数 71 | _sum = self.r.zcard('XDLProxy') 72 | if _sum < self.count: 73 | self.add_ip(self.count - _sum) 74 | 75 | def add_ip(self, count): 76 | """ 77 | 提取IP 78 | """ 79 | get_url = 'http://api.xdaili.cn/xdaili-api//greatRecharge/getGreatIp?spiderId={spiderId}&orderno={orderno}&returnType=2&count={count}'.format( 80 | spiderId=self.spiderId, orderno=self.orderno, count=str(count)) 81 | 82 | # 返回的文本进行解析 83 | response = requests.get(get_url) 84 | if response.status_code == 200: 85 | ret = json.loads(response.text) 86 | if ret.get('ERRORCODE') in ['10036', '10038', '10055']: 87 | print('提取速度过快5秒钟提取一次') 88 | elif ret.get('ERRORCODE') == '10032': 89 | print('余额不足或今日已到提取上线') 90 | else: 91 | self.parse(ret) 92 | else: 93 | print('提取失败') 94 | 95 | def parse(self, data): 96 | """ 97 | 解析返回数据 98 | """ 99 | proxy_list = data.get('RESULT') 100 | for node in proxy_list: 101 | proxy = node.get('ip') + ':' + node.get('port') 102 | self.save_to_redis(proxy, 10) # 默认10分 103 | 104 | def save_to_redis(self, proxy, expire): 105 | """ 106 | 推送到redis集合中 107 | """ 108 | print('代理 %s 推入redis集合' % proxy) 109 | self.r.zadd('XDLProxy', {proxy: expire}) 110 | 111 | 112 | def aps_run(): 113 | """ 114 | 监控 115 | """ 116 | xdl.check_ip() 117 | 118 | 119 | # 填入提取链接 120 | xdl = XDLProxyPool('填写讯代理api链接') 121 | 122 | # 循环监控 123 | scheduler = BlockingScheduler() 124 | scheduler.add_job(aps_run, 'cron', second='*/1') # 这里设置检测评论,推荐2s一次(默认) 125 | scheduler.start() 126 | -------------------------------------------------------------------------------- /原创爬虫工具/Proxy/XDLProxyUseDemo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-05-11 Python: 3.7 4 | 5 | import redis 6 | import random 7 | 8 | # 在scrapy中使用 代理池的demo 9 | 10 | 11 | """ 12 | middleware中代码如下 13 | """ 14 | 15 | pool = redis.ConnectionPool(decode_responses=True) # redis 池 16 | r = redis.Redis(connection_pool=pool) 17 | 18 | 19 | 20 | 21 | """ 22 | middleware中配置代理中间键 23 | 注意,根据爬取网址是http 还是https 来设置 24 | """ 25 | 26 | class MyProxy(object): 27 | """代理IP设置""" 28 | def process_request(self, request, spider): 29 | # 此处对接redis 30 | data = r.zrangebyscore('XDLProxy', 1, 100, withscores=True) 31 | ip, score = random.choice(data) 32 | request.meta['proxy'] = 'http://'+ip # 根据自己情况填写 33 | 34 | 35 | 36 | 37 | """ 38 | 拦截中间键中配置如下,写入计分器,满分20分 39 | """ 40 | 41 | class DownloaderMiddleware(object): 42 | def process_response(self, request, response, spider): 43 | # 对代理ip进行清洗 44 | proxy = request._meta.get('proxy') 45 | if not response.status == 200: 46 | print('IP访问失败') 47 | if proxy: 48 | proxy = proxy[proxy.find('/')+2:] # 提取当此访问proxy 49 | r.zincrby('XDLProxy', -1, proxy) # redis 命令修改 50 | else: 51 | if proxy: 52 | proxy = proxy[proxy.find('/') + 2:] # 提取当此访问proxy 53 | score = r.zscore('XDLProxy', proxy) # 取出分数 54 | if score < 20: 55 | r.zincrby('XDLProxy', 1, proxy) # redis 新版本命令更改这样了 56 | return response 57 | 58 | def process_exception(self, request, exception, spider): # 可能由于IP质量问题无法访问超时 59 | print('超时异常') 60 | proxy = request._meta.get('proxy') 61 | if proxy: 62 | proxy = proxy[proxy.find('/') + 2:] 63 | 
r.zincrby('XDLProxy', -1, proxy) # redis 新版本命令更改这样了 64 | return request 65 | 66 | 67 | """ 68 | setting中配置 69 | """ 70 | DOWNLOAD_TIMEOUT = 5 # 有的时候代理ip失效,会导致一直卡在那里 ,也有可能是用http 访问https 71 | DOWNLOADER_MIDDLEWARES = { 72 | 'middlewares.MyProxy': 543, # 自定义代理IP 73 | 'middlewares.spiderDownloaderMiddleware': 600, # 拦截301、302等跳转 必须设置到600 74 | } -------------------------------------------------------------------------------- /原创爬虫工具/Proxy/ZhiMaProxyUseDemo.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-05-07 Python: 3.7 4 | import redis 5 | import random 6 | 7 | # 在scrapy中使用 代理池的demo 8 | 9 | 10 | """ 11 | scrapy 中 middleware中代码如下 12 | """ 13 | 14 | pool = redis.ConnectionPool(decode_responses=True) 15 | r = redis.Redis(connection_pool=pool) 16 | 17 | 18 | 19 | 20 | """ 21 | middleware中配置代理中间键 22 | 注意,根据爬取网址是http 还是https 来设置 23 | """ 24 | 25 | class MyProxy(object): 26 | """代理IP设置""" 27 | def process_request(self, request, spider): 28 | # 此处对接redis 29 | data = r.zrange('ZhiMaProxy', 0, -1, withscores=True) 30 | ip, score = random.choice(data) 31 | request.meta['proxy'] = 'http://'+ip 32 | 33 | 34 | 35 | 36 | """ 37 | 拦截中间键中配置如下,写入计分器,满分20分 38 | """ 39 | 40 | class DownloaderMiddleware(object): 41 | def process_response(self, request, response, spider): 42 | # 对代理ip进行清洗 43 | proxy = request._meta.get('proxy') 44 | if response.status == 302: 45 | print('IP访问失败') 46 | if proxy: 47 | proxy = proxy[proxy.find('/')+2:] 48 | r.zincrby('ZhiMaProxy', -10000000000, proxy) # redis 命令修改 49 | elif response.status == 200: 50 | if proxy: 51 | proxy = proxy[proxy.find('/') + 2:] 52 | score = r.zscore('ZhiMaProxy', proxy) 53 | if score < 200000000000: 54 | r.zincrby('ZhiMaProxy', 10000000000, proxy) # redis 新版本命令更改这样了 55 | return response 56 | 57 | def process_exception(self, request, exception, spider): # 可能由于IP质量问题无法访问超时,必须在这里捕获然后扣分 58 | print('超时异常') 59 | proxy = request._meta.get('proxy') 60 | if proxy: 61 | proxy = proxy[proxy.find('/') + 2:] 62 | r.zincrby('ZhiMaProxy', -10000000000, proxy) # redis 新版本命令更改这样了 63 | return request 64 | 65 | 66 | """ 67 | setting中配置 68 | """ 69 | DOWNLOAD_TIMEOUT = 5 # 有的时候代理ip失效,会导致一直卡在那里 ,也有可能是用http 访问https 70 | DOWNLOADER_MIDDLEWARES = { 71 | 'middlewares.MyProxy': 543, # 自定义代理IP 72 | 'middlewares.spiderDownloaderMiddleware': 600, # 拦截301、302等跳转 73 | } -------------------------------------------------------------------------------- /原创爬虫工具/README.md: -------------------------------------------------------------------------------- 1 | ## 工具表 2 | - [x] [解密工具-可拓展式解密器](https://github.com/wkunzhi/SpiderUtilPackage/tree/master/Decode) 3 | - [x] [自动注册-验证短信接收器](https://github.com/wkunzhi/SpiderUtilPackage/tree/master/Register) 4 | - [x] [代理IP-芝麻代理池监控器](https://github.com/wkunzhi/SpiderUtilPackage/tree/master/Proxy) 5 | - [x] [代理IP-芝麻代理池客户端Demo](https://github.com/wkunzhi/SpiderUtilPackage/tree/master/Proxy) 6 | - [x] [代理IP-讯代理池监控器](https://github.com/wkunzhi/SpiderUtilPackage/tree/master/Proxy) 7 | - [x] [代理IP-讯代理池客户端Demo](https://github.com/wkunzhi/SpiderUtilPackage/tree/master/Proxy) 8 | - [x] [代理IP-快代理池监控器](https://github.com/wkunzhi/SpiderUtilPackage/tree/master/Proxy) 9 | - [x] [cookies获取-pyppeteer获取美团登陆cookies](https://github.com/wkunzhi/SpiderUtilPackage/tree/master/Cookies) 10 | - [x] [跨数据库迁移器-开发中](https://github.com/wkunzhi/SpiderUtilPackage/tree/master/DataMigration) 11 | - [x] 
[网络图片并发直传OSS](https://github.com/wkunzhi/SpiderUtilPackage/tree/master/OSS) 12 | - [x] [生成encrypt加密参数器](https://github.com/wkunzhi/SpiderUtilPackage/tree/master/Jsencrypt) 13 | 14 |
15 | 16 | 17 | 18 | # 可拓展式解密器 19 | 20 | [**博客传送门**](https://blog.zhangkunzhi.com/2019/06/02/%E5%8E%9F%E5%88%9B%E5%B7%A5%E5%85%B7%E4%B9%8B%E5%8F%AF%E6%8B%93%E5%B1%95%E8%A7%A3%E7%A0%81%E5%99%A8/index.html) 21 | 22 | > 方便测试可连续转换重制的编码转换器,可灵活拓展解码规则 23 | 24 | ![](https://zok-blog.oss-cn-hangzhou.aliyuncs.com/images/特殊.gif) 25 | 26 |
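上面提到"可灵活拓展解码规则":做法是往 `translation.py` 的 `Decode` 类里按 `decode_前缀` 的命名格式加方法,元类会自动把它登记进选择菜单。下面是一个最小示意(假设:规则名 `decode_unquote` 为自定义示例,仓库中并不自带):

```python
from urllib import parse

# 把该方法粘贴进 Decode 类体内即可被元类自动登记
def decode_unquote(self):
    """URL 反转义"""
    self._key = parse.unquote(self._key)
```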
26 | 27 | 28 | 29 | 30 | 31 | # 代理池清洗工具 32 | 33 | [**博客传送门**](https://blog.zhangkunzhi.com/2019/05/02/%E6%90%AD%E5%BB%BA%E4%B8%80%E4%B8%AA%E8%B6%85%E7%AE%80%E5%8D%95%E7%9A%84%E5%AE%9E%E7%94%A8%E7%9A%84%E9%AB%98%E5%8F%AF%E7%94%A8%E4%BB%98%E8%B4%B9IP%E6%B1%A0/index.html) 34 | 35 | > 爬虫经常会用到代理 ip,其中有很多收费 ip,但如何在 scrapy 中高效使用这些 ip 是一件比较麻烦的事情。这里基于[芝麻代理ip](http://h.zhimaruanjian.com/pay/)做一个代理池监控器:先整理我们的需求,再对代理质量进行管理,从而保持高效的 IP 使用率 36 | 37 | ![key位置](https://www.zhangkunzhi.com/images/提取ip.png) 38 | 39 | 40 | 
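消费端取用代理也很简单:从 redis 的有序集合里按分数过滤后随机取一个即可。下面是一个最小示意(假设:redis 在本机、集合名为 XDLProxy,与仓库客户端 Demo 保持一致):

```python
import random
import redis

r = redis.Redis(decode_responses=True)

def get_proxy():
    # 只取分数大于 0 的代理,分数即可用性评分
    candidates = r.zrangebyscore('XDLProxy', 1, 100, withscores=True)
    if not candidates:
        return None
    ip, _score = random.choice(candidates)
    return 'http://' + ip

print(get_proxy())
```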
41 | 42 | # 验证码短信接收器 43 | 44 | > 基于短信接收平台的异步短信接收器,最大并发上限 20,Python3.5+。 45 | 启动后会根据设置的异步并发数获取手机号码并监听短信接收情况(60秒),超过 60 秒仍未收到短信的手机号会被拉入黑名单并释放。 46 | 47 | 若要配置具体某个网站使用,还需开发对应的账号注册器,配合调用本短信接收器来达到自动注册账号的功能 48 | 49 | 
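其中"最大并发上限 20"可以用 asyncio 的信号量来限流,下面是一个思路示意(假设:`fetch_and_listen` 为占位协程,真实的取号、收短信接口以 Register 目录内的实现为准):

```python
import asyncio

sem = asyncio.Semaphore(20)  # 并发上限 20

async def fetch_and_listen(task_id):
    async with sem:
        # 占位:此处应调用平台接口取号,并在 60 秒内轮询短信,
        # 超时则把号码拉入黑名单并释放
        await asyncio.sleep(1)
        print('完成', task_id)

loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(*[fetch_and_listen(i) for i in range(50)]))
```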
50 | 51 | # cookies获取Demo 52 | 53 | > 基于Pyppeteer 并发获取站点cookies 54 | - 美团登陆cookies 55 | ![](https://www.zhangkunzhi.com/images/异步获取cookies.png) 56 | 57 | 58 | # 跨数据库迁移器 59 | **工作中经常有这种需求** 60 | > 将采集好的mongodb数据转存到mysql中,或者是redis数据转到mongodb,于是打算封装一个组件便于以后调用 61 | 62 | - [x] mysql 数据迁移 mongodb 63 | ![](https://www.zhangkunzhi.com/images/to_mongo1.png) 64 | ![](https://www.zhangkunzhi.com/images/to_mongo2.png) -------------------------------------------------------------------------------- /原创爬虫工具/Register/README.md: -------------------------------------------------------------------------------- 1 | # 注册短信并发异步接收器 2 | 3 | > 基于短信接收平台的异步短信接收器,最大并发上限20,Python3.5+ 4 | 5 | `pip3 install asyncio` 6 | `pip3 install aiohttp` 7 | 8 | [平台网址](http://www.51ym.me/User/Default.aspx) 9 | 10 | ## 使用步骤 11 | 1. 实例化对象时填入平台 token 12 | 2. 实例化对象时填入后台查询的项目 id 13 | 3. 实例化对象时填入手机短信并发上限(最大20并发) 14 | 15 | > 启动后会根据设置的异步并发数进行获取手机号码并监听短信接收情况(60秒) 超过60秒后会将未收到短信的手机号拉入黑名单,并是释放。 16 | 17 | 若要配置具体某个网站使用,还需开发对应的账号注册器,配合调用本短信接收器来达到自动注册账号的功能 -------------------------------------------------------------------------------- /原创爬虫工具/Register/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-05-13 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /原创爬虫工具/zok/README.md: -------------------------------------------------------------------------------- 1 | # Zok 组件使用说明 2 | > by: 362416272@qq.com 自用 3 | 4 | ### 目录 5 | - repetition 内容更新处理 6 | - save 通用持久化存储组件 7 | - random_UA 随机UA 8 | - proxies 阿布云代理组件 9 | 10 | 11 | 12 | **mysql储存** 13 | 1. 必须在zok_config中配置要持久化的数据库账户密码 14 | 2. 在爬虫项目文件pipelines管道中,引入并使用 15 | ```python 16 | from zok.save.to_mysql import SaveToMysqlBase 17 | 18 | class CityLandmarkListPipeline(SaveToMysqlBase): 19 | member = 'city' # redis集合名 如果是分布式无需设置 20 | 21 | @staticmethod 22 | def get_sql(item): 23 | sql = """INSERT INTO base_city_landmark(city, county, landmark) VALUES ("{city}","{county}","{landmark}") """.format( 24 | city=item['city'], 25 | county=item['county'], 26 | landmark=item['landmark'], 27 | ) 28 | return sql 29 | 30 | '''必须调用 def_sql(item)方法,并返回sql语句即可''' 31 | ``` 32 | 33 | **随机UA** 34 | ```python 35 | # setting.py中 加入即可 36 | DOWNLOADER_MIDDLEWARES = { 37 | 'zok.random_UA.ua_random.RandomUserAgentMiddleware': 20, 38 | } 39 | ``` 40 | 41 | **代理ip设置** 42 | ```python 43 | # 在setting中配置即可 44 | DOWNLOADER_MIDDLEWARES = { 45 | 'zok.proxies.proxies.ProxyMiddleware': 15, # 自定义的中间件 46 | } 47 | ``` 48 | 49 | **基于redis内容去重更新** 50 | > 原理: 在储存数据之前取到hash数据值,并加以对比,如果有值就跳过不储存,无值就set(md5, id) 51 | 1. 开启redis服务 52 | 2. 在 zok_config中配置 redis配置 53 | 3. 
应用储存组件 mysql 就会自动启用去重增量更新功能 54 | -------------------------------------------------------------------------------- /原创爬虫工具/zok/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 3 | # Date: 2019/3/5 Python: 3.7 4 | -------------------------------------------------------------------------------- /原创爬虫工具/zok/get_db/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 3 | # Date: 2019/3/16 Python: 3.7 4 | -------------------------------------------------------------------------------- /原创爬虫工具/zok/get_db/from_mongodb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-04-23 Python: 3.7 4 | from pymongo import MongoClient 5 | 6 | from zok.zok_config import MONGODB_URL 7 | 8 | client = MongoClient(MONGODB_URL) 9 | 10 | database = client.meituan_db # 链接数据库 11 | collection = database.href_coolections # 链接结合 12 | 13 | data = collection.find({},{'_id': 0}) 14 | -------------------------------------------------------------------------------- /原创爬虫工具/zok/get_db/from_mysql.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 3 | # Date: 2019/3/7 Python: 3.7 4 | 5 | import pymysql 6 | 7 | from zok.zok_config import * 8 | 9 | 10 | def get_data(sql): 11 | conn = pymysql.Connect( 12 | host=MYSQL_HOST, 13 | port=MYSQL_PORT, 14 | user=MYSQL_USER, 15 | password=MYSQL_PASSWORD, 16 | db=MYSQL_DB_NAME, 17 | ) 18 | # 创建游标对象 19 | cursor = conn.cursor() 20 | # 提交事务 21 | try: 22 | cursor.execute(sql) 23 | data = cursor.fetchall() 24 | cursor.close() 25 | conn.close() 26 | return data 27 | except Exception as e: 28 | print(e) 29 | print('异常回滚') 30 | conn.rollback() 31 | cursor.close() 32 | conn.close() 33 | return None 34 | 35 | -------------------------------------------------------------------------------- /原创爬虫工具/zok/proxies/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 3 | # Date: 2019/3/7 Python: 3.7 4 | -------------------------------------------------------------------------------- /原创爬虫工具/zok/proxies/proxies.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 3 | # Date: 2019/3/5 Python: 3.7 4 | 5 | import base64 6 | from zok.zok_config import * 7 | 8 | # 代理服务器 9 | proxyServer = "http://http-dyn.abuyun.com:9020" 10 | 11 | 12 | proxyAuth = "Basic " + base64.urlsafe_b64encode(bytes((PROXY_USER + ":" + PROXY_PASS), "ascii")).decode("utf8") 13 | 14 | 15 | class ProxyMiddleware(object): 16 | """自定义中间件代理IP""" 17 | def process_request(self, request, spider): 18 | request.meta["proxy"] = proxyServer 19 | request.headers["Proxy-Authorization"] = proxyAuth 20 | 21 | -------------------------------------------------------------------------------- /原创爬虫工具/zok/random_UA/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 3 | # Date: 2019/3/7 Python: 3.7 4 | -------------------------------------------------------------------------------- /原创爬虫工具/zok/random_UA/ua_random.py: -------------------------------------------------------------------------------- 1 | # 
-*- coding: utf-8 -*- 2 | # __author__ = "zok" 3 | # Date: 2019/3/7 Python: 3.7 4 | import os 5 | 6 | from fake_useragent import UserAgent 7 | 8 | 9 | class RandomUserAgentMiddleware(object): 10 | """ 11 | first to use location because it is the fastest 12 | """ 13 | 14 | def __init__(self): 15 | location = os.getcwd() + '/zok/random_UA/fake_useragent.json' 16 | self.agent = UserAgent(path=location) # 调用本地 ua池 17 | # self.agent = UserAgent(verify_ssl=False) 18 | # self.agent = UserAgent(use_cache_server=False) 19 | 20 | @classmethod 21 | def from_crawler(cls, crawler): 22 | return cls() 23 | 24 | def process_request(self, request, spider): 25 | request.headers.setdefault('User-Agent', self.agent.random) 26 | 27 | -------------------------------------------------------------------------------- /原创爬虫工具/zok/repetition/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 3 | # Date: 2019/3/5 Python: 3.7 4 | -------------------------------------------------------------------------------- /原创爬虫工具/zok/repetition/update_cache.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 3 | # Date: 2019/3/7 Python: 3.7 4 | 5 | import redis 6 | import hashlib 7 | 8 | from zok.zok_config import REDIS_PORT, REDIS_DB_NAME, REDIS_HOST, REDIS_USER, REDIS_PASSWORD 9 | 10 | 11 | class CacheRedis(object): 12 | 13 | pool = redis.ConnectionPool(host=REDIS_HOST, port=REDIS_PORT, db=REDIS_DB_NAME, password=REDIS_PASSWORD, decode_responses=True) 14 | r = redis.Redis(connection_pool=pool) 15 | # 加上decode_responses=True,写入的键值对中的value为str类型,不加这个参数写入的则为字节类型。 16 | 17 | # 1. 根据储存数据取值判断是否存在 18 | # 3. 不存在-已有数据: 需要更新 19 | # 4. 不存在-无数据: 需要插入 20 | # 5. 
存在 直接跳过储存 21 | 22 | # BUG 在redis数据库丢失的情况下【会全体重新录入】 23 | 24 | @staticmethod 25 | def get_md5(data): 26 | md5 = hashlib.md5(data.encode('utf-8')).hexdigest() 27 | return md5 28 | 29 | def redis_exists(self, member, md5): 30 | """ 31 | 验证 md5 是否已存在于集合中, 有返回True,没有返回False 32 | :param member: 验证区域集合Key 33 | :param md5: 要储存的数据 34 | :return: True or False 35 | """ 36 | 37 | if self.r.sismember(member, md5): 38 | return True 39 | else: 40 | return False 41 | 42 | def save_redis(self, member, md5): 43 | self.r.sadd(member, md5) 44 | 45 | -------------------------------------------------------------------------------- /原创爬虫工具/zok/save/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 3 | # Date: 2019/3/7 Python: 3.7 4 | -------------------------------------------------------------------------------- /原创爬虫工具/zok/save/to_mysql.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 3 | # Date: 2019/3/7 Python: 3.7 4 | 5 | import pymysql 6 | 7 | from zok.zok_config import * 8 | from zok.repetition.update_cache import CacheRedis 9 | 10 | 11 | class SaveToMysqlBase(object): 12 | """ 13 | mysql储存基类 14 | 新增语法 INSERT INTO 表名(city, county, district) VALUES ("%s","%s","%s") 15 | 更新语法 UPDATE 表名 SET mail = "playstation.com" WHERE user_name = "Peter" 16 | """ 17 | member = None # 不设置默认不开启 redis去重校验 18 | conn = None 19 | cursor = None # 游标对象 20 | redis = CacheRedis() 21 | 22 | def open_spider(self, spider): 23 | print('开始爬虫,连接数据库') 24 | self.conn = pymysql.Connect( 25 | host=MYSQL_HOST, 26 | port=MYSQL_PORT, 27 | user=MYSQL_USER, 28 | password=MYSQL_PASSWORD, 29 | db=MYSQL_DB_NAME, 30 | ) 31 | 32 | def process_item(self, item, spider): 33 | # 写sql语句 插数据,没有表的话要先在数据库创建 34 | sql = self.get_sql(item) 35 | if self.member: 36 | sql_md5 = self.redis.get_md5(sql) 37 | if not self.redis.redis_exists(self.member, sql_md5): 38 | # 创建游标对象 39 | self.cursor = self.conn.cursor() 40 | # 提交事务 41 | try: 42 | self.cursor.execute(sql) 43 | self.conn.commit() 44 | self.redis.save_redis(self.member, sql_md5) 45 | # int(conn.insert_id()) # 最新插入行的主键ID,conn.insert_id()一定要在conn.commit()之前,否则会返回0 46 | except Exception as e: 47 | print(e) 48 | print('异常回滚') 49 | self.conn.rollback() 50 | 51 | self.cursor.close() 52 | return item 53 | else: 54 | print('已有相同数据无需插入') 55 | else: 56 | # 创建游标对象 57 | self.cursor = self.conn.cursor() 58 | # 提交事务 59 | try: 60 | self.cursor.execute(sql) 61 | self.conn.commit() 62 | except Exception as e: 63 | print(e) 64 | print('异常回滚') 65 | self.conn.rollback() 66 | self.cursor.close() 67 | return item 68 | 69 | def close_spider(self, spider): 70 | print('爬虫结束, 关闭连接') 71 | self.conn.close() 72 | -------------------------------------------------------------------------------- /原创爬虫工具/zok/zok_config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 3 | # Date: 2019/3/5 Python: 3.7 4 | from urllib import parse 5 | 6 | 7 | MONGODB_URL = 'mongodb://localhost:27017' 8 | 9 | 10 | REDIS_HOST = "localhost" 11 | REDIS_USER = "root" 12 | REDIS_PASSWORD = "" 13 | REDIS_DB_NAME = 0 14 | REDIS_PORT = 6379 15 | 16 | 17 | # MySQL 配置(get_db/from_mysql.py 与 save/to_mysql.py 通过 import * 引用,按实际环境填写) 18 | MYSQL_HOST = '127.0.0.1' 19 | MYSQL_PORT = 3306 20 | MYSQL_USER = 'root' 21 | MYSQL_PASSWORD = '' 22 | MYSQL_DB_NAME = '' 23 | 24 | # 阿布云代理账号(proxies/proxies.py 引用) 25 | PROXY_USER = '' 26 | PROXY_PASS = '' 27 | -------------------------------------------------------------------------------- /滑动验证码/【w3c】滑块验证/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 
362416272@qq.com 3 | # Date: 2019-10-10 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /滑动验证码/【w3c】滑块验证/bg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/滑动验证码/【w3c】滑块验证/bg.png -------------------------------------------------------------------------------- /滑动验证码/【w3c】滑块验证/chache.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/滑动验证码/【w3c】滑块验证/chache.png -------------------------------------------------------------------------------- /滑动验证码/【w3c】滑块验证/hk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/滑动验证码/【w3c】滑块验证/hk.png -------------------------------------------------------------------------------- /滑动验证码/【w3c】滑块验证/img/0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/滑动验证码/【w3c】滑块验证/img/0.png -------------------------------------------------------------------------------- /滑动验证码/【w3c】滑块验证/img/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/滑动验证码/【w3c】滑块验证/img/1.png -------------------------------------------------------------------------------- /滑动验证码/【w3c】滑块验证/img/2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/滑动验证码/【w3c】滑块验证/img/2.png -------------------------------------------------------------------------------- /滑动验证码/【w3c】滑块验证/img/3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/滑动验证码/【w3c】滑块验证/img/3.png -------------------------------------------------------------------------------- /滑动验证码/【腾讯】滑块验证/bg.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/滑动验证码/【腾讯】滑块验证/bg.jpeg -------------------------------------------------------------------------------- /滑动验证码/【腾讯】滑块验证/discriminate.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-10-11 Python: 3.7 4 | 5 | 6 | """ 7 | pip3 install opencv-python 8 | """ 9 | 10 | import cv2 as cv 11 | 12 | 13 | def get_pos(image): 14 | """ 15 | 缺口轮廓检测 16 | 对付腾讯滑块够用 17 | 该方法识别率 95% 左右 18 | """ 19 | blurred = cv.GaussianBlur(image, (5, 5), 0) 20 | canny = cv.Canny(blurred, 200, 400) 21 | contours, hierarchy = cv.findContours(canny, cv.RETR_EXTERNAL, cv.CHAIN_APPROX_SIMPLE) 22 | for i, contour in enumerate(contours): 23 | m = cv.moments(contour) 24 | if m['m00'] == 0: 25 | cx = cy = 0 26 | else: 27 | cx, cy = m['m10'] / m['m00'], m['m01'] / m['m00'] 28 | if 6000 < cv.contourArea(contour) < 8000 and 370 < cv.arcLength(contour, True) < 390: 29 | if cx < 400: 30 | continue 31 | x, y, w, h = 
cv.boundingRect(contour) # 外接矩形 32 | cv.rectangle(image, (x, y), (x + w, y + h), (0, 0, 255), 2) 33 | cv.imshow('image', image) 34 | return x 35 | return 0 36 | 37 | 38 | if __name__ == '__main__': 39 | """ 40 | 这里是滑块缺口识别 41 | 识别到后 42 | 1。可以通过自动化工具去拖动滑块 43 | 2。可以通过参数解析的形式生成参数提交通过验证 44 | """ 45 | img0 = cv.imread('bg.jpeg') 46 | get_pos(img0) 47 | cv.waitKey(0) 48 | cv.destroyAllWindows() 49 | -------------------------------------------------------------------------------- /项目/HouseScrapy/requirements: -------------------------------------------------------------------------------- 1 | scrapy 2 | scrapy-redis 3 | pymysql 4 | redis>=3.2.1 5 | pymongo -------------------------------------------------------------------------------- /项目/HouseScrapy/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = HouseScrapy 12 | -------------------------------------------------------------------------------- /项目/HouseScrapy/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # 阿布云代理 IP 4 | PROXY_USER = '' 5 | PROXY_PASS = '' 6 | 7 | BOT_NAME = 'HouseScrapy' 8 | 9 | SPIDER_MODULES = ['spiders'] 10 | NEWSPIDER_MODULE = 'spiders' 11 | 12 | # 否认协议 13 | ROBOTSTXT_OBEY = False 14 | 15 | # 随机延迟 16 | RANDOMIZE_DOWNLOAD_DELAY = True 17 | 18 | # 重试处理 19 | DOWNLOAD_FAIL_ON_DATALOSS = False 20 | 21 | # 设置超时时间 22 | DOWNLOAD_TIMEOUT = 5 23 | 24 | # MongoDB 25 | MONGODB_URL = 'mongodb://localhost:27017' 26 | MONGODB_DB = '房产' 27 | MONGODB_COLL = '地产数据' 28 | 29 | 30 | # Redis 31 | REDIS_HOST = '127.0.0.1' # 本机 32 | REDIS_WORD = None 33 | REDIS_PORT = 6379 34 | 35 | # 限流 秒/次 36 | DOWNLOAD_DELAY = 1 / 10 37 | 38 | # 禁止301 39 | # HTTPERROR_ALLOWED_CODES = [301] 40 | 41 | # 日志配置 42 | # LOG_LEVEL = 'WARNING' 43 | # LOG_FILE = 'log/error_log.txt' 44 | 45 | 46 | # Headers 47 | DEFAULT_REQUEST_HEADERS = { 48 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36', 49 | 'Host': 'www.funi.com' 50 | } 51 | 52 | 53 | """项目独立配置区""" 54 | 55 | # HOST 56 | HOST = 'http://www.funi.com' 57 | 58 | 59 | """===== 分布式配置区 =====""" 60 | 61 | # # 去重,利用set指纹去重 62 | # DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter' 63 | # 64 | # # 调度器 65 | # SCHEDULER = 'scrapy_redis.scheduler.Scheduler' 66 | # 67 | # # 去重指纹的set 68 | # SCHEDULER_PERSIST = True 69 | # 70 | # # 配置密码 71 | # REDIS_PARAMS = { 72 | # 'password': REDIS_WORD, 73 | # } 74 | # 75 | -------------------------------------------------------------------------------- /项目/HouseScrapy/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-07-15 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /项目/HouseScrapy/toolkits/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-07-15 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- 
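Note on the HouseScrapy settings above: the project ships UA and proxy middleware classes under `toolkits/` and defines the proxy credentials, but settings.py never registers the middlewares, so Scrapy would not invoke them. A minimal, hedged sketch of the missing wiring, assuming the flat module layout used here (`toolkits.make_ua` and `toolkits.proxies` importable from the project root):

```python
# Sketch only (not part of the original settings.py): register the custom
# downloader middlewares so requests actually pass through them.
DOWNLOADER_MIDDLEWARES = {
    # disable Scrapy's built-in UA middleware so the random UA is not overwritten
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
    'toolkits.make_ua.RandomUserAgentMiddleware': 400,
    'toolkits.proxies.ProxyMiddleware': 410,
}
```

The value 400 matches the slot of the default UserAgentMiddleware it replaces; the proxy middleware is placed right after it so the Proxy-Authorization header is set on the same request object.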
/项目/HouseScrapy/toolkits/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | 5 | 6 | class HousesItem(scrapy.Item): 7 | data = scrapy.Field() 8 | -------------------------------------------------------------------------------- /项目/HouseScrapy/toolkits/make_ua.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 3 | # Date: 2019/3/7 Python: 3.7 4 | import os 5 | 6 | from fake_useragent import UserAgent 7 | 8 | 9 | class RandomUserAgentMiddleware(object): 10 | """ 11 | Load the UA pool from the local JSON file first, because it is the fastest option. 12 | """ 13 | 14 | def __init__(self): 15 | location = os.getcwd() + '/toolkits/fake_useragent.json' 16 | self.agent = UserAgent(path=location) 17 | # self.agent = UserAgent(verify_ssl=False) 18 | # self.agent = UserAgent(use_cache_server=False) 19 | 20 | @classmethod 21 | def from_crawler(cls, crawler): 22 | return cls() 23 | 24 | def process_request(self, request, spider): 25 | request.headers.setdefault('User-Agent', self.agent.random) 26 | -------------------------------------------------------------------------------- /项目/HouseScrapy/toolkits/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class HousescrapySpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class HousescrapyDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 
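# NOTE: like any downloader middleware, this generated template only takes
# effect once it is listed in DOWNLOADER_MIDDLEWARES (see the sketch above).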
63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /项目/HouseScrapy/toolkits/pipelines.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from pymongo import MongoClient 4 | 5 | from settings import MONGODB_URL, MONGODB_DB, MONGODB_COLL 6 | 7 | 8 | class HousePipeline(object): 9 | """地产基础数据 10 | """ 11 | 12 | def __init__(self): 13 | client = MongoClient(MONGODB_URL) 14 | self.coll = client[MONGODB_DB][MONGODB_COLL] # 地产数据集合(连接) 15 | 16 | def process_item(self, item, spider): 17 | self.coll.insert_one(item['data']) 18 | return item # 返回 item,便于后续管道继续处理 19 | -------------------------------------------------------------------------------- /项目/HouseScrapy/toolkits/proxies.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 3 | # Date: 2019/3/5 Python: 3.7 4 | 5 | import base64 6 | from settings import PROXY_USER, PROXY_PASS 7 | 8 | # 代理服务器 9 | proxyServer = "http://http-dyn.abuyun.com:9020" 10 | 11 | 12 | proxyAuth = "Basic " + base64.urlsafe_b64encode(bytes((PROXY_USER + ":" + PROXY_PASS), "ascii")).decode("utf8") 13 | 14 | 15 | class ProxyMiddleware(object): 16 | """自定义中间件代理IP""" 17 | def process_request(self, request, spider): 18 | request.meta["proxy"] = proxyServer 19 | request.headers["Proxy-Authorization"] = proxyAuth 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /项目/HouseSpider/README.md: -------------------------------------------------------------------------------- 1 | # 目前项目还在抽空更新中 2 | > 慢慢填坑 3 | 4 | # 概述 5 | > 对 `www.funi.com` 网站进行数据爬取 -------------------------------------------------------------------------------- /项目/HouseSpider/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-07-12 Python: 3.7 4 | 5 | # Redis 6 | REDIS_HOST = '127.0.0.1' 7 | REDIS_PORT = '6379' 8 | REDIS_PASSWORD = None 9 | 10 | # 
MongoDB 11 | MONGO_CLEAN = 'mongodb://localhost:27017' 12 | 13 | # TargetUrl 14 | TARGET_URL = "http://www.funi.com/loupan/region_0_0_0_0_{page}" 15 | 16 | # ProxyIP 17 | PROXY_USER = "" 18 | PROXY_PASS = "" 19 | 20 | # HOST 21 | HOST = 'http://www.funi.com' 22 | -------------------------------------------------------------------------------- /项目/HouseSpider/db/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-07-12 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /项目/HouseSpider/main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-07-12 Python: 3.7 4 | 5 | import asyncio 6 | 7 | from tool.parse import * 8 | from tool.toolkit import * 9 | 10 | 11 | async def get_max_page(): 12 | """获取总页数 13 | """ 14 | url = TARGET_URL.format(page=1) 15 | result = await get(url) 16 | return await parse_total_page(result) 17 | 18 | 19 | async def get_house_url(page): 20 | """获取地产链接 21 | """ 22 | url = TARGET_URL.format(page=page) 23 | result = await get(url) 24 | await parse_house_url(result, page) 25 | 26 | 27 | @count_time 28 | def main(): 29 | loop = asyncio.get_event_loop() 30 | 31 | # 1. 获取总页数 32 | task = loop.create_task(get_max_page()) 33 | total_page = loop.run_until_complete(task) 34 | 35 | # 2. 获取链接 36 | house_url_func = [asyncio.ensure_future(get_house_url(_)) for _ in range(1, int(total_page) + 1)] # +1 才能包含最后一页 37 | loop.run_until_complete(asyncio.wait(house_url_func)) 38 | 39 | # 3. 楼盘详情(待实现) 40 | 41 | 42 | if __name__ == '__main__': 43 | main() 44 | -------------------------------------------------------------------------------- /项目/HouseSpider/tool/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-07-12 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /项目/HouseSpider/tool/parse.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-07-12 Python: 3.7 4 | 5 | from pyquery import PyQuery as pq 6 | from config import * 7 | 8 | 9 | async def parse_total_page(result): 10 | """解析总页数 11 | """ 12 | doc = pq(result) 13 | max_page = doc('.pages a').eq(-2).text() 14 | print('数据共 {total} 页'.format(total=max_page)) 15 | return max_page 16 | 17 | 18 | async def parse_house_url(result, page): 19 | """页面解析链接 20 | """ 21 | doc = pq(result) 22 | dls = doc('.fleft div').eq(-2)('dl') 23 | n = 0 24 | for dl in dls: 25 | href = pq(dl)('dt a').attr('href') 26 | href = HOST + href.split(';')[0] # 清洗链接,去掉 ; 及之后的会话参数 27 | print(href) 28 | n += 1 29 | if not n: 30 | print('第 {page} 页抽取链接失败'.format(page=page)) 31 | 32 | 33 | -------------------------------------------------------------------------------- /项目/HouseSpider/tool/proxy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-07-12 Python: 3.7 4 | 5 | 6 | from config import PROXY_PASS, PROXY_USER 7 | 8 | # 代理服务器 9 | proxyHost = "http-dyn.abuyun.com" 10 | proxyPort = "9020" 11 | 12 | 13 | proxyServer = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % { 14 | "host": proxyHost, 15 | 
"port": proxyPort, 16 | "user": PROXY_USER, 17 | "pass": PROXY_PASS, 18 | } 19 | 20 | if not PROXY_USER or not PROXY_PASS: 21 | msg = """ 22 | 请先在 config.py 配置文件内填入代理IP账号 23 | 阿布云代理IP:https://www.abuyun.com/http-proxy/products.html 24 | """ 25 | print(msg) 26 | exit() 27 | -------------------------------------------------------------------------------- /项目/HouseSpider/tool/toolkit.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 2019-07-13 Python: 3.7 4 | import datetime 5 | import aiohttp 6 | 7 | from tool.proxy import proxyServer 8 | 9 | 10 | async def get(url): 11 | """请求页面 12 | """ 13 | headers = { 14 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36', 15 | 'Host': 'www.funi.com' 16 | } 17 | 18 | try: 19 | """conn = aiohttp.TCPConnector(verify_ssl=False) connector=conn""" 20 | async with aiohttp.ClientSession(headers=headers) as session: 21 | async with session.get(url, proxy=proxyServer) as response: 22 | return await response.text("utf-8") 23 | except TimeoutError as te: 24 | print('超时', te) 25 | 26 | 27 | def count_time(func): 28 | """取运行时间 29 | """ 30 | def int_time(*args, **kwargs): 31 | start_time = datetime.datetime.now() # 程序开始时间 32 | func() 33 | over_time = datetime.datetime.now() # 程序结束时间 34 | total_time = (over_time-start_time).total_seconds() 35 | print('程序耗时: %s 秒' % total_time) 36 | return int_time 37 | -------------------------------------------------------------------------------- /项目/MeiTuanArea/MeiTuanArea/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wkunzhi/Python3-Spider/5188ca4056bb94d956df9ddbeb42c765ebe9819a/项目/MeiTuanArea/MeiTuanArea/__init__.py -------------------------------------------------------------------------------- /项目/MeiTuanArea/MeiTuanArea/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import scrapy 4 | 5 | 6 | class AreaItem(scrapy.Item): 7 | """地区""" 8 | type = scrapy.Field() 9 | id = scrapy.Field() 10 | pid = scrapy.Field() 11 | name = scrapy.Field() 12 | pinyin = scrapy.Field() 13 | first = scrapy.Field() 14 | haschild = scrapy.Field() 15 | 16 | 17 | class CoordItem(scrapy.Item): 18 | """坐标录入""" 19 | type = scrapy.Field() 20 | id = scrapy.Field() 21 | lng = scrapy.Field() 22 | lat = scrapy.Field() 23 | -------------------------------------------------------------------------------- /项目/MeiTuanArea/MeiTuanArea/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class MeituanareaSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 
19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class MeituanareaDownloaderMiddleware(object): 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 
95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /项目/MeiTuanArea/MeiTuanArea/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | 4 | BOT_NAME = 'MeiTuanArea' # 爬虫项目名 5 | 6 | SPIDER_MODULES = ['MeiTuanArea.spiders'] # 爬虫目录设定 7 | NEWSPIDER_MODULE = 'MeiTuanArea.spiders' # 爬虫生成目录 8 | 9 | ROBOTSTXT_OBEY = False # 否认协议 10 | 11 | RANDOMIZE_DOWNLOAD_DELAY = True # 开启随机增加毫秒级延迟,增加访问成功率 12 | 13 | DOWNLOAD_FAIL_ON_DATALOSS = False # 重试处理 14 | 15 | DOWNLOAD_TIMEOUT = 5 # 设置超时时间,避免ip失效等待时间过长 16 | 17 | # HTTPERROR_ALLOWED_CODES = [301] # 禁止301 18 | 19 | # 指定终端输出日志、日志位置 20 | # LOG_LEVEL = 'WARNING' 21 | # LOG_FILE = 'error_log.txt' 22 | 23 | HTTPERROR_ALLOWED_CODES = [403] 24 | 25 | # UA 26 | USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36' 27 | 28 | # mysql 29 | MYSQL_HOST = '127.0.0.1' 30 | MYSQL_PORT = 3306 31 | MYSQL_USER = 'root' 32 | MYSQL_PASSWORD = 'mysql 密码' 33 | MYSQL_DB_NAME = 'mysql库' 34 | 35 | # API 百度地图坐标获取API,申请后填写即可 36 | API_AK = '百度地图 api ak' 37 | 38 | 39 | -------------------------------------------------------------------------------- /项目/MeiTuanArea/MeiTuanArea/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
5 | -------------------------------------------------------------------------------- /项目/MeiTuanArea/MeiTuanArea/spiders/area_coord.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import pymysql 4 | import json 5 | 6 | from MeiTuanArea.settings import API_AK 7 | from MeiTuanArea.settings import MYSQL_DB_NAME, MYSQL_HOST, MYSQL_PASSWORD, MYSQL_PORT, MYSQL_USER 8 | from MeiTuanArea.items import CoordItem 9 | 10 | 11 | class GetLngSpider(scrapy.Spider): 12 | name = 'area_coord' 13 | 14 | # 独立配置 15 | custom_settings = { 16 | 'ITEM_PIPELINES': { 17 | 'MeiTuanArea.pipelines.CoordPipeline': 300, 18 | }, 19 | } 20 | 21 | # mysql 配置 22 | conn = pymysql.Connect( 23 | host=MYSQL_HOST, 24 | port=MYSQL_PORT, 25 | user=MYSQL_USER, 26 | password=MYSQL_PASSWORD, 27 | db=MYSQL_DB_NAME, 28 | ) 29 | 30 | url = 'http://api.map.baidu.com/geocoder/v2/?address={address}&output=json&ak={ak}' 31 | 32 | def start_requests(self): 33 | 34 | # 一级区域 省市 35 | provinces = self.get_db("""SELECT id,`name` from province""") 36 | for _id, name in provinces: 37 | target_url = self.url.format(address=name, ak=API_AK) 38 | yield scrapy.Request(target_url, meta={'type': 'province', '_id': _id}) 39 | 40 | # 二级区域 城市 41 | city = self.get_db("""SELECT id,`name` from city""") 42 | for _id, name in city: 43 | target_url = self.url.format(address=name, ak=API_AK) 44 | yield scrapy.Request(target_url, meta={'type': 'city', '_id': _id}) 45 | 46 | # 三级区域 区域 47 | area = self.get_db("""select area.id, city.name, area.name from city LEFT JOIN area on city.id=area.pid""") 48 | for _id, name, address_name in area: 49 | address = str(name)+str(address_name) 50 | target_url = self.url.format(address=address, ak=API_AK) 51 | yield scrapy.Request(target_url, meta={'type': 'area', '_id': _id}) 52 | 53 | # 四级区域 街道 54 | address = self.get_db("""select address.id,area.name, address.name from area LEFT JOIN address on address.pid=area.id""") 55 | for _id, name, address_name in address: 56 | target_url = self.url.format(address=str(name)+str(address_name), ak=API_AK) 57 | yield scrapy.Request(target_url, meta={'type': 'address', '_id': _id}) 58 | 59 | def get_db(self, sql): 60 | """数据库查询""" 61 | # 创建游标对象 62 | cursor = self.conn.cursor() 63 | # 执行查询,异常时回滚 64 | try: 65 | cursor.execute(sql) 66 | data = cursor.fetchall() 67 | cursor.close() 68 | # 注意: 连接要在多次查询间复用,这里不能关闭 69 | return data 70 | except Exception as e: 71 | print(e, '异常回滚') 72 | self.conn.rollback() 73 | cursor.close() 74 | # 同上,连接留给后续查询使用 75 | return None 76 | 77 | def parse(self, response): 78 | """清洗数据""" 79 | item = CoordItem() 80 | data = json.loads(response.text) 81 | # status 为 0 表示地址解析成功 82 | if data.get('status') == 0: 83 | # 坐标 84 | item['lng'] = data.get('result').get('location').get('lng') 85 | item['lat'] = data.get('result').get('location').get('lat') 86 | item['id'] = response.meta.get('_id') 87 | item['type'] = response.meta.get('type') 88 | yield item 89 | 90 | -------------------------------------------------------------------------------- /项目/MeiTuanArea/MeiTuanArea/spiders/areas.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import json 4 | import re 5 | 6 | from pypinyin import pinyin, lazy_pinyin 7 | from MeiTuanArea.items import AreaItem 8 | 9 | 10 | class GetAreaSpider(scrapy.Spider): 11 | name = 'areas' 12 | 13 | # 独立配置 14 | custom_settings = { 15 | 'ITEM_PIPELINES': { 16 | 'MeiTuanArea.pipelines.AreaPipeline': 300, 
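# 300 是 Scrapy 管道优先级(取值 0-1000,数值越小越先执行);
# custom_settings 只对当前爬虫生效,所以 AreaPipeline 与 CoordPipeline 互不干扰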
17 | }, 18 | 'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36', 19 | 'DOWNLOAD_DELAY': 0.5, # 限流 下载同一个网站下一个页面前需要等待的时间 20 | } 21 | 22 | def start_requests(self): 23 | start_url = 'https://www.meituan.com/ptapi/getprovincecityinfo/' 24 | yield scrapy.Request(start_url, callback=self.parse_province) 25 | 26 | def parse_province(self, response): 27 | """省市+市 1、2 级区域采集""" 28 | target_url = 'http://{acronym}.meituan.com/meishi/' 29 | 30 | item = AreaItem() 31 | data = json.loads(response.text) 32 | for node in data: 33 | name = node.get('provinceName') 34 | item['type'] = 'province' 35 | item['haschild'] = 1 36 | item['id'] = node.get('provinceCode') 37 | item['pid'] = 0 38 | item['name'] = name 39 | item['pinyin'] = ''.join(lazy_pinyin(name)) 40 | item['first'] = self.get_acronym(name) 41 | yield item # 一级省市 42 | 43 | for i in node.get('cityInfoList'): 44 | item['type'] = 'city' 45 | item['id'] = i.get('id') 46 | item['pid'] = node.get('provinceCode') 47 | item['name'] = i.get('name') 48 | item['pinyin'] = i.get('pinyin') 49 | item['first'] = i.get('acronym') 50 | yield item # 二级市 51 | 52 | url = target_url.format(acronym=i.get('acronym')) 53 | yield scrapy.Request(url, callback=self.parse_area, meta={'pid': i.get('id')}) 54 | 55 | def parse_area(self, response): 56 | """区域+街道 3、4 级区域采集""" 57 | info, areas = re.search(r',"areas":(.*?),"dinnerCountsAttr', response.text), None 58 | if info: 59 | areas = json.loads(info.group(1)) 60 | if areas: 61 | city_id = response.meta.get('pid') 62 | item = AreaItem() 63 | 64 | # 解析区域 3 级 65 | for area in areas: 66 | item['type'] = 'area' 67 | item['id'] = area.get('id') 68 | item['pid'] = city_id 69 | item['name'] = area.get('name') 70 | item['pinyin'] = ''.join(lazy_pinyin(area.get('name'))) 71 | item['first'] = self.get_acronym(area.get('name')) 72 | 73 | subs = area.get('subAreas') 74 | # 判断是否有下级,有的区域没有下级了 75 | if len(subs) > 1: 76 | item['haschild'] = 1 77 | else: 78 | item['haschild'] = 0 79 | 80 | yield item 81 | 82 | # 解析 4 级 83 | if len(subs) > 1: 84 | for sub in subs: 85 | if not sub.get('name') == '全部': 86 | item['haschild'] = 0 87 | item['type'] = 'address' 88 | item['id'] = sub.get('id') 89 | item['pid'] = area.get('id') 90 | item['name'] = sub.get('name') 91 | item['pinyin'] = ''.join(lazy_pinyin(sub.get('name'))) 92 | item['first'] = self.get_acronym(sub.get('name')) 93 | yield item 94 | 95 | else: 96 | print('区域读取失败') 97 | 98 | @staticmethod 99 | def get_acronym(str_data): 100 | """ 101 | 获取字符串的首字母 102 | :param str_data: 字符串 103 | :return: 字符串 104 | """ 105 | return "".join([i[0][0] for i in pinyin(str_data)]) 106 | 107 | -------------------------------------------------------------------------------- /项目/MeiTuanArea/README.md: -------------------------------------------------------------------------------- 1 | # 美团城市采集 2 | > 因为全站爬取需要用到区域基础数据,这里单独抽离出来。 3 | 4 | ## 配置 5 | 在 settings 内配置 mysql 与 百度api_ak 即可 6 | 7 | ## 数据库设计 8 | > 因为最终数据将会存放在 MySQL 上,区域一共有4个层级,分别是省市、市、区域、街道,这里按照业务需求拆分到4张表中。 9 | 10 | ![](https://zok-blog.oss-cn-hangzhou.aliyuncs.com/images/区域表.png) 11 | 12 | ## 坐标拾取 13 | > 根据地址调用百度 API,获取坐标并存入库中 14 | 15 | ## 效果 16 | ![](https://zok-blog.oss-cn-hangzhou.aliyuncs.com/images/区域坐标.png) -------------------------------------------------------------------------------- /项目/MeiTuanArea/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # __author__ = "zok" 362416272@qq.com 3 | # Date: 
2019-06-18 Python: 3.7 4 | 5 | -------------------------------------------------------------------------------- /项目/MeiTuanArea/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = MeiTuanArea.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = MeiTuanArea 12 | -------------------------------------------------------------------------------- /项目/MeiTuanArea/初始化.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Navicat Premium Data Transfer 3 | 4 | Source Server : LocalhostMysql 5 | Source Server Type : MySQL 6 | Source Server Version : 50725 7 | Source Host : localhost:3306 8 | Source Schema : nujiang 9 | 10 | Target Server Type : MySQL 11 | Target Server Version : 50725 12 | File Encoding : 65001 13 | 14 | Date: 23/05/2019 16:32:56 15 | */ 16 | 17 | SET NAMES utf8mb4; 18 | SET FOREIGN_KEY_CHECKS = 0; 19 | 20 | -- ---------------------------- 21 | -- Table structure for address 22 | -- ---------------------------- 23 | DROP TABLE IF EXISTS `address`; 24 | CREATE TABLE `address` ( 25 | `id` bigint(10) NOT NULL AUTO_INCREMENT COMMENT 'ID', 26 | `pid` bigint(10) DEFAULT NULL COMMENT '父id', 27 | `name` varchar(100) DEFAULT NULL COMMENT '名称', 28 | `pinyin` varchar(100) DEFAULT NULL COMMENT '拼音', 29 | `code` varchar(100) DEFAULT NULL COMMENT '长途区号', 30 | `zip` varchar(100) DEFAULT NULL COMMENT '邮编', 31 | `first` varchar(50) DEFAULT NULL COMMENT '首字母', 32 | `lng` varchar(100) DEFAULT NULL COMMENT '经度', 33 | `lat` varchar(100) DEFAULT NULL COMMENT '纬度', 34 | PRIMARY KEY (`id`) USING BTREE, 35 | KEY `pid` (`pid`) USING BTREE 36 | ) ENGINE=InnoDB AUTO_INCREMENT=3749 DEFAULT CHARSET=utf8 ROW_FORMAT=COMPACT COMMENT='四级区域 地址'; 37 | 38 | -- ---------------------------- 39 | -- Table structure for area 40 | -- ---------------------------- 41 | DROP TABLE IF EXISTS `area`; 42 | CREATE TABLE `area` ( 43 | `id` bigint(10) NOT NULL AUTO_INCREMENT COMMENT 'ID', 44 | `pid` bigint(10) DEFAULT NULL COMMENT '父id', 45 | `name` varchar(100) DEFAULT NULL COMMENT '名称', 46 | `pinyin` varchar(100) DEFAULT NULL COMMENT '拼音', 47 | `code` varchar(100) DEFAULT NULL COMMENT '长途区号', 48 | `zip` varchar(100) DEFAULT NULL COMMENT '邮编', 49 | `first` varchar(50) DEFAULT NULL COMMENT '首字母', 50 | `lng` varchar(100) DEFAULT NULL COMMENT '经度', 51 | `lat` varchar(100) DEFAULT NULL COMMENT '纬度', 52 | `haschild` int(1) DEFAULT NULL COMMENT '是否有下级', 53 | PRIMARY KEY (`id`) USING BTREE, 54 | KEY `pid` (`pid`) USING BTREE 55 | ) ENGINE=InnoDB AUTO_INCREMENT=39793 DEFAULT CHARSET=utf8 ROW_FORMAT=COMPACT COMMENT='三级区域 区域'; 56 | 57 | -- ---------------------------- 58 | -- Table structure for city 59 | -- ---------------------------- 60 | DROP TABLE IF EXISTS `city`; 61 | CREATE TABLE `city` ( 62 | `id` bigint(10) NOT NULL AUTO_INCREMENT COMMENT 'ID', 63 | `pid` bigint(10) DEFAULT NULL COMMENT '父id', 64 | `name` varchar(100) DEFAULT NULL COMMENT '名称', 65 | `pinyin` varchar(100) DEFAULT NULL COMMENT '拼音', 66 | `code` varchar(100) DEFAULT NULL COMMENT '长途区号', 67 | `zip` varchar(100) DEFAULT NULL COMMENT '邮编', 68 | `first` varchar(50) DEFAULT NULL COMMENT '首字母', 69 | `lng` varchar(100) DEFAULT NULL COMMENT '经度', 70 | `lat` varchar(100) DEFAULT NULL COMMENT '纬度', 71 | PRIMARY KEY (`id`) USING BTREE, 72 | KEY `pid` (`pid`) USING BTREE 
73 | ) ENGINE=InnoDB AUTO_INCREMENT=8002 DEFAULT CHARSET=utf8 ROW_FORMAT=COMPACT COMMENT='二级区域 城市'; 74 | 75 | -- ---------------------------- 76 | -- Table structure for province 77 | -- ---------------------------- 78 | DROP TABLE IF EXISTS `province`; 79 | CREATE TABLE `province` ( 80 | `id` bigint(10) NOT NULL AUTO_INCREMENT COMMENT 'ID', 81 | `name` varchar(100) DEFAULT NULL COMMENT '名称', 82 | `pinyin` varchar(100) DEFAULT NULL COMMENT '拼音', 83 | `code` varchar(100) DEFAULT NULL COMMENT '长途区号', 84 | `zip` varchar(100) DEFAULT NULL COMMENT '邮编', 85 | `first` varchar(50) DEFAULT NULL COMMENT '首字母', 86 | `lng` varchar(100) DEFAULT NULL COMMENT '经度', 87 | `lat` varchar(100) DEFAULT NULL COMMENT '纬度', 88 | PRIMARY KEY (`id`) USING BTREE 89 | ) ENGINE=InnoDB AUTO_INCREMENT=820001 DEFAULT CHARSET=utf8 ROW_FORMAT=COMPACT COMMENT='一级区域 省市'; 90 | 91 | SET FOREIGN_KEY_CHECKS = 1; 92 | -------------------------------------------------------------------------------- /项目/README.md: -------------------------------------------------------------------------------- 1 | # 该板块不定期更新 2 | > 因为工作中会经常开发重型的爬虫,并且也属于公司的资源,所以并不会将代码放到网上。尽量以一些实战demo形式发布一些个人小项目。 3 | 4 | ## MeiTuanArea 5 | 美团区域 Scrapy 爬虫 6 | 7 | ## HouseSpider 8 | 房地产 aiohttp 爬虫 --------------------------------------------------------------------------------
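The coordinate spider above ultimately boils down to one Baidu geocoder call per region name. A standalone, hedged sketch of that call for quick testing, assuming the `requests` package is available (the project itself issues the request through Scrapy), with the same placeholder AK as settings.py:

```python
# Hedged sketch: the same geocoder v2 request and response handling that
# area_coord.py performs inside its parse() method.
import requests  # assumption: not a project dependency, used here for brevity

API_AK = '百度地图 api ak'  # placeholder, as in MeiTuanArea/settings.py
URL = 'http://api.map.baidu.com/geocoder/v2/?address={address}&output=json&ak={ak}'


def geocode(address):
    """Return (lng, lat) for an address, or None when Baidu reports an error."""
    data = requests.get(URL.format(address=address, ak=API_AK)).json()
    if data.get('status') == 0:  # status 0 means success, mirroring the spider
        location = data['result']['location']
        return location['lng'], location['lat']
    return None


if __name__ == '__main__':
    print(geocode('北京市朝阳区'))  # prints a (lng, lat) tuple with a valid AK
```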