├── .gitignore
├── CpuToInfluxdb.py
├── ModifyFilename.py
├── Python 黑魔法
    ├── Python 远程开机.py
    └── README.MD
├── README.md
├── biyingSpider.py
├── countFile.py
├── countPm.py
├── douban_book.py
├── douban_movie.py
├── excelToDatabase.py
├── image_recognition_zhihu.py
├── lagouSpider.py
├── login_zhihu.py
├── qiubai_crawer.py
├── qrcode.jpg
├── readExcel.py
├── wechat
    ├── README.MD
    ├── connect.py
    ├── face_id.py
    ├── faces
    │   ├── oWFPW1ay1L-73ZFrNkCnq4siIgtQ-2018_06_1618_37_43.jpg
    │   └── oWFPW1ay1L-73ZFrNkCnq4siIgtQ-2018_06_1619_33_56.jpg
    ├── images
    │   ├── oWFPW1ay1L-73ZFrNkCnq4siIgtQ-2018_06_1618_37_43.jpg
    │   └── oWFPW1ay1L-73ZFrNkCnq4siIgtQ-2018_06_1619_33_56.jpg
    ├── requirements.txt
    ├── utils.py
    └── yahei.ttf
└── 爬虫集合
    ├── README.MD
    ├── lagou.py
    ├── meizitu.py
    └── qiubai_crawer.py


/.gitignore:
--------------------------------------------------------------------------------
1 | /no_use
2 | *.xlsx


--------------------------------------------------------------------------------
/CpuToInfluxdb.py:
--------------------------------------------------------------------------------
 1 | import psutil
 2 | import os
 3 | from influxdb import InfluxDBClient
 4 | import time,math,random
 5 | 
 6 | 
 7 | #获取当前运行的pid
 8 | p1=psutil.Process(os.getpid()) 
 9 | 
10 | 
11 | from influxdb import InfluxDBClient
12 | import time,math,random
13 | while True:
14 |     a = psutil.virtual_memory().percent  #内存占用率
15 | 
16 |     b = psutil.cpu_percent(interval=1.0) #cpu占用率
17 | 
18 |     json_body = [
19 |         {
20 |             "measurement": "cpu_load_short",
21 |             "tags": {
22 |                 "host": "server01",
23 |                 "region": "us-west"
24 |             },
25 |             #"time": "2009-11-10T23:00:00Z",
26 |             "fields": {
27 |                 "cpu": b,
28 |                 "mem": a
29 |             }
30 |         }
31 |     ]
32 |     client = InfluxDBClient('localhost', 8086, 'root', 'root', 'xxyyxx')
33 |     client.create_database('xxyyxx',if_not_exists=False)
34 |     client.write_points(json_body)
35 |     #result = client.query('select value from cpu_load_short;')
36 |     #print("Result: {0}".format(result))
37 |     time.sleep(2)


--------------------------------------------------------------------------------
/ModifyFilename.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | dir = os.getcwd()
 3 | subdir = os.listdir(dir)
 4 | for i in subdir:
 5 |     path = os.path.join(dir, i)
 6 |     if os.path.isdir(path):
 7 |         end_dir = os.listdir(path)
 8 |         for i in range(len(end_dir)):
 9 |             newname = end_dir[i][0:50]
10 |             os.rename(os.path.join(path, end_dir[
11 |                       i]), os.path.join(path, newname))
12 | 


--------------------------------------------------------------------------------
/Python 黑魔法/Python 远程开机.py:
--------------------------------------------------------------------------------
 1 | def wake_up(request, mac='DC-4A-3E-78-3E-0A'):
 2 |     MAC = mac
 3 |     BROADCAST = "192.168.0.255"
 4 |     if len(MAC) != 17:
 5 |         raise ValueError("MAC address should be set as form 'XX-XX-XX-XX-XX-XX'")
 6 |     mac_address = MAC.replace("-", '')
 7 |     data = ''.join(['FFFFFFFFFFFF', mac_address * 20])  # 构造原始数据格式
 8 |     send_data = b''
 9 | 
10 |     # 把原始数据转换为16进制字节数组，
11 |     for i in range(0, len(data), 2):
12 |         send_data = b''.join([send_data, struct.pack('B', int(data[i: i + 2], 16))])
13 |     print(send_data)
14 | 
15 |     # 通过socket广播出去，为避免失败，间隔广播三次
16 |     try:
17 |         sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
18 |         sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1)
19 |         sock.sendto(send_data, (BROADCAST, 7))
20 |         time.sleep(1)
21 |         sock.sendto(send_data, (BROADCAST, 7))
22 |         time.sleep(1)
23 |         sock.sendto(send_data, (BROADCAST, 7))
24 |         return HttpResponse()
25 |         print("Done")
26 |     except Exception as e:
27 |         return HttpResponse()
28 |         print(e)


--------------------------------------------------------------------------------
/Python 黑魔法/README.MD:
--------------------------------------------------------------------------------
1 | # 代码详细说明请看文章
2 | 
3 | [Python 远程关机](https://mp.weixin.qq.com/s/RSod4XWxyzL32eNcrXLjUQ)
4 | 
5 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # 欢迎关注我的微信公众号【智能制造社区】
 3 | 
 4 | ## 左手代码，右手制造，分享智能制造相关技术和业务，包括 Python, C#, 数据库，工业大数据、物联网技术及MES/ERP/SAP等系统。
 5 | 
 6 | ## 可以通过微信公众号加我好友
 7 | 
 8 | ![二维码](qrcode.jpg)
 9 | 
10 | # 内容列表
11 | 
12 | ## [Python微信公众号开发](https://github.com/injetlee/Python/tree/master/wechat)
13 | 
14 | - ### Python 微信公众号开发—小白篇(一)
15 | 
16 | - ### Python 公众号开发—颜值检测
17 | 
18 | ## [Python 爬虫入门合集](https://github.com/injetlee/Python/tree/master/%E7%88%AC%E8%99%AB%E9%9B%86%E5%90%88)
19 | 
20 | - ### Python 爬虫入门(一)——爬取糗事百科
21 | 
22 | - ### Python 爬虫入门(二)——爬取妹子图
23 | 
24 | - ### Python 爬虫——Python 岗位分析报告
25 | 
26 | - ### Python 爬虫利器——Selenium介绍
27 | 
28 | - ### Python 爬虫—— 抖音 App 视频抓包爬取
29 | 
30 | ## [Python 黑魔法](https://github.com/injetlee/Python/tree/master/Python%20%E9%BB%91%E9%AD%94%E6%B3%95)
31 | 
32 | - ### Python 远程关机
33 | 
34 | ## SQL 数据库
35 | 
36 | - [1 小时 SQL 极速入门（一）](https://mp.weixin.qq.com/s/Lx4B349OlD49ihJPnB6YiA)
37 | - [1 小时 SQL 极速入门（二）](https://mp.weixin.qq.com/s/D-CEtGYomne5kV_Ji4lodA)
38 | - [1 小时 SQL 极速入门（三）](https://mp.weixin.qq.com/s/7aJqrhCNcvnt2gO3p5P50Q)
39 | - [SQL 高级查询——（层次化查询，递归）](https://mp.weixin.qq.com/s/R9Yldd-5AK4ObRA9Lfbz-Q)
40 | - [GROUP BY高级查询,ROLLUP，CUBE，GROUPPING详解](https://mp.weixin.qq.com/s/_OK6dtHGhp7ukC2pe1ginQ)
41 | - [SQL 行转列，列转行](https://mp.weixin.qq.com/s/xOFIg42FQhNpyg94ajhtqQ)
42 | 
43 | ## 其他
44 | 
45 | - 1.[获取当前CPU状态，存储到Influxdb](https://github.com/injetlee/demo/blob/master/CpuToInfluxdb.py)
46 | 
47 | - 2.[模拟登录知乎](https://github.com/injetlee/demo/blob/master/login_zhihu.py)
48 | 
49 | - 3.[对目录下所有文件计数](https://github.com/injetlee/demo/blob/master/countFile.py)
50 | 
51 | - 4.[爬取豆瓣电影top250](https://github.com/injetlee/demo/blob/master/douban_movie.py)
52 | 
53 | - 5.[Excel文件读入数据库](https://github.com/injetlee/demo/blob/master/excelToDatabase.py)
54 | 
55 | - 6.[爬取拉勾网职位信息](https://github.com/injetlee/demo/blob/master/lagouSpider.py)
56 | 
57 | - 7.[批量修改文件名](https://github.com/injetlee/demo/blob/master/ModifyFilename.py)
58 | 
59 | - 8.[读写excel](https://github.com/injetlee/demo/blob/master/readExcel.py)
60 | 
61 | - 9.[下载必应首页图片,只下载当天的，一张。](https://github.com/injetlee/Python/blob/master/biyingSpider.py)
62 | 


--------------------------------------------------------------------------------
/biyingSpider.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import re
 3 | import time
 4 | local = time.strftime("%Y.%m.%d")
 5 | url = 'http://cn.bing.com/'
 6 | con = requests.get(url)
 7 | content = con.text
 8 | reg = r"(az/hprichbg/rb/.*?.jpg)"
 9 | a = re.findall(reg, content, re.S)[0]
10 | print(a)
11 | picUrl = url + a
12 | read = requests.get(picUrl)
13 | f = open('%s.jpg' % local, 'wb')
14 | f.write(read.content)
15 | f.close()
16 | 


--------------------------------------------------------------------------------
/countFile.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | result = []
 3 | def get_all(cwd):
 4 |     get_dir = os.listdir(cwd)  #遍历当前目录，获取文件列表
 5 |     for i in get_dir:          
 6 |         sub_dir = os.path.join(cwd,i)  # 把第一步获取的文件加入路径
 7 |         if os.path.isdir(sub_dir):     #如果当前仍然是文件夹，递归调用
 8 |             get_all(sub_dir)
 9 |         else:
10 |             ax = os.path.basename(sub_dir)  #如果当前路径不是文件夹，则把文件名放入列表
11 |             result.append(ax)
12 |             print(len(result))   #对列表计数
13 |             
14 | if __name__ == "__main__": 
15 |     cur_path = os.getcwd()   #当前目录
16 |     get_all(cur_path)


--------------------------------------------------------------------------------
/countPm.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | def count_pm(*args):
 3 |     alist = list([round(i*2-8,2) for i in args])  #计算三种颗粒浓度
 4 |     result = []
 5 |     for pm in alist:
 6 |     	pm_abs = abs(pm)
 7 |     	result.append(generate_iso_code(pm_abs))
 8 |     print (result)
 9 |     return result
10 |     	
11 | def generate_iso_code(x):
12 | 	pm_value = [0.01,0.02,0.04,0.08,0.16,0.32,0.64,1.3,2.5,5,10,20,40,80]  #颗粒浓度
13 | 	iso = list(range(1,25))   #iso级别，共24级
14 | 	for i in range(len(pm_value)):           #for循环得到某个浓度范围的iso4006级别
15 | 		if pm_value[i] < x <= pm_value[i+1]:
16 | 			iso_code = iso[i]
17 | 			break
18 | 	return iso_code
19 | 			
20 | if __name__ == '__main__':
21 |     count_pm(7.95,5.85,3.98)		
22 |     count_pm(7.918,5.949,5.456)	
23 |     count_pm(6.916,3.956,3.956)		
24 | 


--------------------------------------------------------------------------------
/douban_book.py:
--------------------------------------------------------------------------------
 1 | from bs4 import BeautifulSoup
 2 | import requests
 3 | from openpyxl import Workbook
 4 | excel_name = "书籍.xlsx"
 5 | wb = Workbook()
 6 | ws1 = wb.active
 7 | ws1.title='书籍'
 8 | 
 9 | 
10 | def get_html(url):
11 |     header = {
12 |         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0'}
13 |     html = requests.get(url, headers=header).content
14 |     return html
15 | 
16 | 
17 | def get_con(html):
18 |     soup = BeautifulSoup(html,'html.parser')
19 |     book_list = soup.find('div', attrs={'class': 'article'})
20 |     page = soup.find('div', attrs={'class': 'paginator'})
21 |     next_page = page.find('span', attrs={'class': 'next'}).find('a')
22 |     name = []
23 |     for i in book_list.find_all('table'):
24 |         book_name = i.find('div', attrs={'class': 'pl2'})
25 |         m = list(book_name.find('a').stripped_strings)
26 |         if len(m)>1:
27 |             x = m[0]+m[1]
28 |         else:
29 |             x = m[0]
30 |         #print(x)
31 |         name.append(x)
32 |     if next_page:
33 |         return name, next_page.get('href')
34 |     else:
35 |         return name, None
36 | 
37 | 
38 | def main():
39 |     url = 'https://book.douban.com/top250'
40 |     name_list=[]
41 |     while url:
42 |         html = get_html(url)
43 |         name, url = get_con(html)
44 |         name_list = name_list + name
45 |     for i in name_list:
46 |         location = 'A%s'%(name_list.index(i)+1)
47 |         print(i)
48 |         print(location)
49 |         ws1[location]=i
50 |     wb.save(filename=excel_name)
51 | 
52 | 
53 | if __name__ == '__main__':
54 |     main()
55 | 
56 | 


--------------------------------------------------------------------------------
/douban_movie.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # encoding=utf-8
 3 | import requests
 4 | import re
 5 | import codecs
 6 | from bs4 import BeautifulSoup
 7 | from openpyxl import Workbook
 8 | wb = Workbook()
 9 | dest_filename = '电影.xlsx'
10 | ws1 = wb.active
11 | ws1.title = "电影top250"
12 | 
13 | DOWNLOAD_URL = 'http://movie.douban.com/top250/'
14 | 
15 | 
16 | def download_page(url):
17 |     """获取url地址页面内容"""
18 |     headers = {
19 |         'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 Safari/537.36'
20 |     }
21 |     data = requests.get(url, headers=headers).content
22 |     return data
23 | 
24 | 
25 | def get_li(doc):
26 |     soup = BeautifulSoup(doc, 'html.parser')
27 |     ol = soup.find('ol', class_='grid_view')
28 |     name = []  # 名字
29 |     star_con = []  # 评价人数
30 |     score = []  # 评分
31 |     info_list = []  # 短评
32 |     for i in ol.find_all('li'):
33 |         detail = i.find('div', attrs={'class': 'hd'})
34 |         movie_name = detail.find(
35 |             'span', attrs={'class': 'title'}).get_text()  # 电影名字
36 |         level_star = i.find(
37 |             'span', attrs={'class': 'rating_num'}).get_text()  # 评分
38 |         star = i.find('div', attrs={'class': 'star'})
39 |         star_num = star.find(text=re.compile('评价'))  # 评价
40 | 
41 |         info = i.find('span', attrs={'class': 'inq'})  # 短评
42 |         if info:  # 判断是否有短评
43 |             info_list.append(info.get_text())
44 |         else:
45 |             info_list.append('无')
46 |         score.append(level_star)
47 | 
48 |         name.append(movie_name)
49 |         star_con.append(star_num)
50 |     page = soup.find('span', attrs={'class': 'next'}).find('a')  # 获取下一页
51 |     if page:
52 |         return name, star_con, score, info_list, DOWNLOAD_URL + page['href']
53 |     return name, star_con, score, info_list, None
54 | 
55 | 
56 | def main():
57 |     url = DOWNLOAD_URL
58 |     name = []
59 |     star_con = []
60 |     score = []
61 |     info = []
62 |     while url:
63 |         doc = download_page(url)
64 |         movie, star, level_num, info_list, url = get_li(doc)
65 |         name = name + movie
66 |         star_con = star_con + star
67 |         score = score + level_num
68 |         info = info + info_list
69 |     for (i, m, o, p) in zip(name, star_con, score, info):
70 |         col_A = 'A%s' % (name.index(i) + 1)
71 |         col_B = 'B%s' % (name.index(i) + 1)
72 |         col_C = 'C%s' % (name.index(i) + 1)
73 |         col_D = 'D%s' % (name.index(i) + 1)
74 |         ws1[col_A] = i
75 |         ws1[col_B] = m
76 |         ws1[col_C] = o
77 |         ws1[col_D] = p
78 |     wb.save(filename=dest_filename)
79 | 
80 | 
81 | if __name__ == '__main__':
82 |     main()
83 | 


--------------------------------------------------------------------------------
/excelToDatabase.py:
--------------------------------------------------------------------------------
 1 | from openpyxl import load_workbook
 2 | import pymysql
 3 | config = {
 4 | 	'host': '127.0.0.1',
 5 | 	'port':3306,
 6 | 	'user': 'root',
 7 | 	'password': 'root',
 8 | 	'charset': 'utf8mb4',
 9 | 	#'cursorclass': pymysql.cursors.DictCursor
10 | 
11 | }
12 | conn = pymysql.connect(**config)
13 | conn.autocommit(1)
14 | cursor = conn.cursor()
15 | name = 'lyexcel'
16 | cursor.execute('create database if not exists %s' %name)
17 | conn.select_db(name)
18 | table_name = 'info'
19 | cursor.execute('create table if not exists %s(id MEDIUMINT NOT NULL AUTO_INCREMENT,name varchar(30),tel varchar(30),primary key (id))'%table_name)
20 | 
21 | wb2 = load_workbook('hpu.xlsx')
22 | ws=wb2.get_sheet_names()
23 | for row in wb2:
24 | 	print("1")
25 | 	for cell in row:
26 | 		value1=(cell[0].value,cell[4].value)
27 | 		cursor.execute('insert into info (name,tel) values(%s,%s)',value1)
28 | 
29 | print("overing...")
30 | # for row in A:
31 | # 	print(row)
32 | #print (wb2.get_sheet_names())
33 | 


--------------------------------------------------------------------------------
/image_recognition_zhihu.py:
--------------------------------------------------------------------------------
  1 | # -*- coding:UTF-8 -*-
  2 | 
  3 | import  requests , time ,random
  4 | import  hmac ,json ,base64
  5 | from bs4 import BeautifulSoup
  6 | from hashlib import sha1
  7 | import TencentYoutuyun
  8 | from PIL import Image
  9 | import uuid
 10 | 
 11 | 
 12 |     
 13 | def recognition_captcha(data):
 14 |     ''' 识别验证码 '''
 15 | 
 16 |     file_id = str(uuid.uuid1())
 17 |     filename = 'captcha_'+ file_id +'.gif'
 18 |     filename_png =  'captcha_'+ file_id +'.png'
 19 | 
 20 |     if(data is None):
 21 |         return 
 22 |     data = base64.b64decode(data.encode('utf-8'))
 23 |     with open( filename ,'wb') as fb:
 24 |         fb.write( data )    
 25 |     
 26 |     appid = 'appid' # 接入优图服务，注册账号获取 
 27 |     secret_id = 'secret_id'  
 28 |     secret_key = 'secret_key'  
 29 |     userid= 'userid' 
 30 |     end_point = TencentYoutuyun.conf.API_YOUTU_END_POINT   
 31 | 
 32 |     youtu = TencentYoutuyun.YouTu(appid, secret_id, secret_key, userid, end_point) # 初始化
 33 | 
 34 |     # 拿到的是gif格式，而优图只支持 JPG PNG BMP 其中之一，这时我们需要 pip install Pillow 来转换格式
 35 |     im = Image.open( filename)
 36 |     im.save( filename_png ,"png")
 37 |     im.close()
 38 |     
 39 |     result = youtu.generalocr( filename_png , data_type = 0 , seq = '')  #  0代表本地路径，1代表url
 40 | 
 41 |     return result
 42 | 
 43 | 
 44 | def get_captcha(sessiona,headers):
 45 |     ''' 获取验证码 '''
 46 |     
 47 |     need_cap = False
 48 | 
 49 |     while( need_cap is not True):
 50 |         try:
 51 |             sessiona.get('https://www.zhihu.com/signin',headers=headers)  # 拿cookie:_xsrf
 52 |             resp2 = sessiona.get('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',headers=headers)  # 拿cookie:capsion_ticket 
 53 |             need_cap = json.loads(resp2.text)["show_captcha"]  # {"show_captcha":false} 表示不用验证码
 54 |             time.sleep( 0.5 + random.randint(1,9)/10 )
 55 |         except Exception:
 56 |             continue
 57 | 
 58 |     try:
 59 |         resp3 = sessiona.put('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',headers=headers) # 拿到验证码数据，注意是put
 60 |         img_data = json.loads(resp3.text)["img_base64"]
 61 |     except Exception:
 62 |         return     
 63 |     
 64 | 
 65 |     return img_data
 66 | 
 67 | def create_point( point_data, confidence ):
 68 |     ''' 获得点阵 '''
 69 | 
 70 |     # 实际操作下，套路不深，x间隔25，y相同，共7个点 ，先模拟意思一下
 71 |     points = {1:[ 20.5,25.1875],2:[ 45.5,25.1875],3:[ 70.5,25.1875],4:[ 95.5,25.1875],5:[120.5,25.1875],6:[145.5,25.1875],7:[170.5,25.1875]}
 72 |     wi = 0
 73 |     input_points = []
 74 |     
 75 |     for word in ( point_data['items'][0]['words'] ):
 76 |         wi = wi+1
 77 |         if( word['confidence'] < confidence ):
 78 |             try:
 79 |                 input_points.append(points[wi]) # 倒置的中文，优图识别不出来，置信度会低于0.5
 80 |             except KeyError:
 81 |                 continue
 82 |         
 83 |     if( len(input_points) > 2 or len(input_points) == 0 ):
 84 |         return []  # 7个字中只有2个倒置中文的成功率高
 85 |     
 86 |     result = {}
 87 |     result['img_size']=[200,44]
 88 |     result['input_points']=input_points
 89 |     result = json.dumps(result)
 90 |     print(result)
 91 |     return result
 92 | 
 93 | def bolting(k_low,k_hi,k3_confidence):
 94 |     ''' 筛选把握大的进行验证 '''
 95 | 
 96 |     start = time.time()
 97 |     
 98 |     is_success = False
 99 |     while(is_success is not True):
100 |     
101 |         points_len = 1
102 |         angle = -20
103 |         img_ko = []
104 | 
105 |         while(points_len != 21  or  angle < k_low  or angle > k_hi ):  
106 |             img_data = get_captcha(sessiona,headers)
107 |             img_ko = recognition_captcha(img_data)
108 |      
109 |             ## json.dumps 序列化时对中文默认使用的ascii编码.想输出真正的中文需要指定ensure_ascii=False
110 |             # img_ko_json = json.dumps(img_ko , indent =2 ,ensure_ascii=False ) 
111 |             # img_ko_json = img_ko_json.encode('raw_unicode_escape') ## 因为python3的原因，也因为优图自身的原因，此处要特殊处理
112 |         
113 |             # with open( "json.txt" ,'wb') as fb:
114 |             #     fb.write( img_ko_json )  
115 |     
116 |             try:
117 |                 points_len = len(img_ko['items'][0]['itemstring'])
118 |                 angle = img_ko['angle']
119 |             except Exception:
120 |                 points_len = 1
121 |                 angle = -20
122 |                 continue
123 | 
124 |         # print(img_ko_json.decode('utf8')) ## stdout用的是utf8，需转码才能正常显示
125 |         # print('-'*50)
126 |         
127 |         input_text = create_point( img_ko ,k3_confidence )
128 |         if(type(input_text) == type([])):
129 |             continue
130 |         
131 |         data = {
132 |             "input_text":input_text   
133 |             }
134 | 
135 |         # 提交过快会被拒绝，{"code":120005,"name":"ERR_VERIFY_CAPTCHA_TOO_QUICK"} ，假装思考5秒钟
136 |         time.sleep( 4 + random.randint(1,9)/10 )
137 |         try:    
138 |             resp5 = sessiona.post('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',data,headers=headers)
139 |         except Exception:
140 |             continue
141 |         
142 |         print("angle: "+ str(angle) )
143 |         print(BeautifulSoup(resp5.content ,'html.parser')) # 如果验证成功，会回应{"success":true}，开心
144 |         print('-'*50)
145 |         try:
146 |             is_success = json.loads(resp5.text)["success"]
147 |         except KeyError:
148 |             continue
149 | 
150 |     end = time.time()
151 | 
152 |     return end-start
153 | 
154 | 
155 | if __name__ == "__main__":
156 |     
157 |     sessiona = requests.Session()
158 |     headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0','authorization':'oauth c3cef7c66a1843f8b3a9e6a1e3160e20'}
159 | 
160 |     k3_confidence = 0.71
161 |     
162 |     '''
163 |     # 可视化数据会被保存在云端供浏览
164 |     # https://plot.ly/~weldon2010/4
165 |     # 纯属学习，并未看出"角度"范围扩大对图像识别的影响，大部分时候60s内能搞定，说明优图还是很强悍的，识别速度也非常快
166 |     '''
167 |     runtime_list_x = []
168 |     runtime_list_y = []
169 |     nn = range(1,11) # 愿意的话搞多线程，1百万次更有意思
170 |     
171 |     # 成功尝试100次，形成2维数据以热力图的方式展示
172 |     for y in nn :
173 |         for x in  nn :
174 |             runtime_list_x.append( bolting(-3,3,k3_confidence) )
175 |             print( "y: " + str(runtime_list_y) )
176 |             print( "x: " + str(runtime_list_x) )
177 |         runtime_list_y.append(runtime_list_x.copy())
178 |         runtime_list_x = []
179 | 
180 |     print ("-"*30)    
181 |     print( runtime_list_y )
182 |     print ("-"*30)
183 | 
184 |     # pip install plotly 数据可视化
185 |     import plotly
186 |     import plotly.graph_objs as go
187 |     plotly.tools.set_credentials_file(username='username', api_key='username') # 设置账号，去官网注册
188 |     trace = go.Heatmap(z = runtime_list_y , x = [n for n in nn ] ,y =[n for n in nn ])
189 |     data=[trace]
190 |     plotly.plotly.plot(data, filename='weldon-time2-heatmap')    
191 |    
192 |     # 尝试后发现一个特点，基本都是1~2个倒置中文，这样我们可以借此提速
193 |     # 角度范围放大，仅当识别出倒置中文为1~2个时才提交验证否则放弃继续寻找
194 | 
195 | ### chcp 65001 (win下改变cmd字符集)
196 | ### python  c:\python34\image_recognition_zhihu.py
197 | 
198 | 
199 | 
200 | 
201 | 
202 | 
203 | 


--------------------------------------------------------------------------------
/lagouSpider.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | from openpyxl import Workbook
 3 | 
 4 | def get_json(url, page, lang_name):
 5 |     data = {'first': 'true', 'pn': page, 'kd': lang_name}
 6 |     json = requests.post(url, data).json()
 7 |     list_con = json['content']['positionResult']['result']
 8 |     info_list = []
 9 |     for i in list_con:
10 |         info = []
11 |         info.append(i['companyShortName'])
12 |         info.append(i['companyName'])
13 |         info.append(i['salary'])
14 |         info.append(i['city'])
15 |         info.append(i['education'])
16 |         info_list.append(info)
17 |     return info_list
18 | 
19 | 
20 | def main():
21 |     lang_name = input('职位名：')
22 |     page = 1
23 |     url = 'http://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
24 |     info_result = []
25 |     while page < 31:
26 |         info = get_json(url, page, lang_name)
27 |         info_result = info_result + info
28 |         page += 1
29 |     wb = Workbook()
30 |     ws1 = wb.active
31 |     ws1.title = lang_name
32 |     for row in info_result:
33 |         ws1.append(row)
34 |     wb.save('职位信息.xlsx')
35 | 
36 | if __name__ == '__main__':
37 |     main()
38 | 


--------------------------------------------------------------------------------
/login_zhihu.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:UTF-8 -*-
 2 | 
 3 | import  requests , time
 4 | import  hmac ,json
 5 | from bs4 import BeautifulSoup
 6 | from hashlib import sha1
 7 | 
 8 | 
 9 | def get_captcha(data,need_cap):
10 |     ''' 处理验证码 '''
11 |     if need_cap is False:
12 |         return
13 |     with open('captcha.gif','wb') as fb:
14 |         fb.write(data)
15 |     return input('captcha:')
16 |     
17 | def get_signature(grantType,clientId,source,timestamp):
18 |     ''' 处理签名 '''
19 | 	
20 |     hm = hmac.new(b'd1b964811afb40118a12068ff74a12f4',None,sha1)
21 |     hm.update(str.encode(grantType))
22 |     hm.update(str.encode(clientId))
23 |     hm.update(str.encode(source))
24 |     hm.update(str.encode(timestamp))
25 | 
26 |     return  str(hm.hexdigest())
27 | 
28 | 
29 | 
30 | def login(username,password,oncaptcha,sessiona,headers):
31 |     ''' 处理登录 '''
32 |     
33 |     resp1 = sessiona.get('https://www.zhihu.com/signin',headers=headers)  # 拿cookie:_xsrf
34 |     resp2 = sessiona.get('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',headers=headers)  # 拿cookie:capsion_ticket 
35 |     need_cap = json.loads(resp2.text)["show_captcha"]  # {"show_captcha":false} 表示不用验证码
36 | 
37 |     grantType = 'password'
38 |     clientId = 'c3cef7c66a1843f8b3a9e6a1e3160e20'
39 |     source ='com.zhihu.web'
40 |     timestamp = str((time.time()*1000)).split('.')[0]  # 签名只按这个时间戳变化
41 |        
42 |     captcha_content = sessiona.get('https://www.zhihu.com/captcha.gif?r=%d&type=login'%(time.time()*1000),headers=headers).content
43 |     
44 |     data = {
45 |         "client_id":clientId,
46 |         "grant_type":grantType,
47 |         "timestamp":timestamp,
48 |         "source":source,
49 |         "signature": get_signature(grantType,clientId,source,timestamp), # 获取签名
50 |         "username":username,
51 |         "password":password,
52 |         "lang":"cn",
53 |         "captcha":oncaptcha(captcha_content,need_cap), # 获取图片验证码
54 |         "ref_source":"other_",
55 |         "utm_source":""
56 |     }
57 |     
58 |     print("**2**: "+str(data))
59 |     print("-"*50)
60 |     resp = sessiona.post('https://www.zhihu.com/api/v3/oauth/sign_in',data,headers=headers).content
61 |     print(BeautifulSoup(resp,'html.parser'))
62 |     
63 |     print("-"*50)
64 |     return resp 
65 | 
66 | if __name__ == "__main__":
67 |     sessiona = requests.Session()
68 |     headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0','authorization':'oauth c3cef7c66a1843f8b3a9e6a1e3160e20'}
69 | 
70 |     login('12345678@qq.com','12345678',get_captcha,sessiona,headers) # 用户名密码换自己的就好了
71 |     resp = sessiona.get('https://www.zhihu.com/inbox',headers=headers)  # 登录进去了，可以看私信了
72 |     print(BeautifulSoup(resp.content ,'html.parser'))
73 |     
74 |     
75 |     
76 |     
77 | ### chcp 65001 (win下改变cmd字符集)
78 | ### python  c:\python34\login_zhihu.py
79 | ### 有非常无语的事情发生，还以为代码没生效
80 | 
81 | 
82 | 
83 | 
84 | 
85 | 


--------------------------------------------------------------------------------
/qiubai_crawer.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | from bs4 import BeautifulSoup
 3 | 
 4 | 
 5 | def download_page(url):
 6 |     headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
 7 |     r = requests.get(url, headers=headers)
 8 |     return r.text
 9 | 
10 | 
11 | def get_content(html, page):
12 |     output = """第{}页 作者：{} 性别：{} 年龄：{} 点赞：{} 评论：{}\n{}\n------------\n"""
13 |     soup = BeautifulSoup(html, 'html.parser')
14 |     con = soup.find(id='content-left')
15 |     con_list = con.find_all('div', class_="article")
16 |     for i in con_list:
17 |         author = i.find('h2').string  # 获取作者名字
18 |         content = i.find('div', class_='content').find('span').get_text()  # 获取内容
19 |         stats = i.find('div', class_='stats')
20 |         vote = stats.find('span', class_='stats-vote').find('i', class_='number').string
21 |         comment = stats.find('span', class_='stats-comments').find('i', class_='number').string
22 |         author_info = i.find('div', class_='articleGender')  # 获取作者 年龄，性别
23 |         if author_info is not None:  # 非匿名用户
24 |             class_list = author_info['class']
25 |             if "womenIcon" in class_list:
26 |                 gender = '女'
27 |             elif "manIcon" in class_list:
28 |                 gender = '男'
29 |             else:
30 |                 gender = ''
31 |             age = author_info.string   # 获取年龄
32 |         else:  # 匿名用户
33 |             gender = ''
34 |             age = ''
35 | 
36 |         save_txt(output.format(page, author, gender, age, vote, comment, content))
37 | 
38 | 
39 | def save_txt(*args):
40 |     for i in args:
41 |         with open('qiubai.txt', 'a', encoding='utf-8') as f:
42 |             f.write(i)
43 | 
44 | 
45 | def main():
46 |     # 我们点击下面链接，在页面下方可以看到共有13页，可以构造如下 url，
47 |     # 当然我们最好是用 Beautiful Soup找到页面底部有多少页。
48 |     for i in range(1, 14):
49 |         url = 'https://qiushibaike.com/text/page/{}'.format(i)
50 |         html = download_page(url)
51 |         get_content(html, i)
52 | 
53 | if __name__ == '__main__':
54 |     main()
55 | 


--------------------------------------------------------------------------------
/qrcode.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/injetlee/Python/94faec41b8a74cde03ec79c2140da4b8839b224a/qrcode.jpg


--------------------------------------------------------------------------------
/readExcel.py:
--------------------------------------------------------------------------------
 1 | from openpyxl import Workbook
 2 | from openpyxl.compat import range
 3 | from openpyxl.cell import get_column_letter
 4 | wb = Workbook()
 5 | dest_filename = 'empty_book2.xlsx'
 6 | ws1 = wb.active  # 第一个表
 7 | ws1.title = "range names"  # 第一个表命名
 8 | # 遍历第一个表的1到40行，赋值一个600内的随机数
 9 | for row in range(1, 40):
10 |     ws1.append(range(60))
11 | ws2 = wb.create_sheet(title="Pi")
12 | ws2['F5'] = 3.14
13 | ws3 = wb.create_sheet(title="Data")
14 | for row in range(10, 20):
15 |     for col in range(27, 54):
16 |         _ = ws3.cell(column=col, row=row, value="%s" % get_column_letter(col))
17 | wb.save(filename=dest_filename)
18 | 


--------------------------------------------------------------------------------
/wechat/README.MD:
--------------------------------------------------------------------------------
1 | # 详细使用请看文章
2 | 
3 | [Python微信公众号开发—小白篇(一)](https://mp.weixin.qq.com/s/iMPUC0yxI-zuf4AjtyAu6g)
4 | 
5 | [Python公众号开发—颜值检测](https://mp.weixin.qq.com/s/I0DxhIHkeqhc2LeQ2ICHeA)


--------------------------------------------------------------------------------
/wechat/connect.py:
--------------------------------------------------------------------------------
 1 | # -*-coding:utf-8 -*-
 2 | import falcon
 3 | from falcon import uri
 4 | from wechatpy.utils import check_signature
 5 | from wechatpy.exceptions import InvalidSignatureException
 6 | from wechatpy import parse_message
 7 | from wechatpy.replies import TextReply, ImageReply
 8 | 
 9 | from utils import img_download, img_upload
10 | from face_id import access_api
11 | 
12 | 
13 | class Connect(object):
14 | 
15 |     def on_get(self, req, resp):
16 |         query_string = req.query_string
17 |         query_list = query_string.split('&')
18 |         b = {}
19 |         for i in query_list:
20 |             b[i.split('=')[0]] = i.split('=')[1]
21 | 
22 |         try:
23 |             check_signature(token='lengxiao', signature=b['signature'], timestamp=b['timestamp'], nonce=b['nonce'])
24 |             resp.body = (b['echostr'])
25 |         except InvalidSignatureException:
26 |             pass
27 |         resp.status = falcon.HTTP_200
28 | 
29 |     def on_post(self, req, resp):
30 |         xml = req.stream.read()
31 |         msg = parse_message(xml)
32 |         if msg.type == 'text':
33 |             reply = TextReply(content=msg.content, message=msg)
34 |             xml = reply.render()
35 |             resp.body = (xml)
36 |             resp.status = falcon.HTTP_200
37 |         elif msg.type == 'image':
38 |             name = img_download(msg.image, msg.source)  
39 |             print(name)
40 |             r = access_api('images/' + name)
41 |             if r == 'success':
42 |                 media_id = img_upload('image', 'faces/' + name)
43 |                 reply = ImageReply(media_id=media_id, message=msg)
44 |             else:
45 |                 reply = TextReply(content='人脸检测失败，请上传1M以下人脸清晰的照片', message=msg)
46 |             xml = reply.render()
47 |             resp.body = (xml)
48 |             resp.status = falcon.HTTP_200
49 | 
50 | app = falcon.API()
51 | connect = Connect()
52 | app.add_route('/connect', connect)
53 | 


--------------------------------------------------------------------------------
/wechat/face_id.py:
--------------------------------------------------------------------------------
  1 | # -*-coding:utf-8 -*-
  2 | import time
  3 | import random
  4 | import base64
  5 | import hashlib
  6 | import requests
  7 | from urllib.parse import urlencode
  8 | import cv2
  9 | import numpy as np
 10 | from PIL import Image, ImageDraw, ImageFont
 11 | import os
 12 | 
 13 | 
 14 | # 一.计算接口鉴权，构造请求参数
 15 | 
 16 | def random_str():
 17 |     '''得到随机字符串nonce_str'''
 18 |     str = 'abcdefghijklmnopqrstuvwxyz'
 19 |     r = ''
 20 |     for i in range(15):
 21 |         index = random.randint(0,25)
 22 |         r += str[index]
 23 |     return r
 24 | 
 25 | 
 26 | def image(name):
 27 |     with open(name, 'rb') as f:
 28 |         content = f.read()
 29 |     return base64.b64encode(content)
 30 | 
 31 | 
 32 | def get_params(img):
 33 |     '''组织接口请求的参数形式，并且计算sign接口鉴权信息，
 34 |     最终返回接口请求所需要的参数字典'''
 35 |     params = {
 36 |         'app_id': '1106860829',
 37 |         'time_stamp': str(int(time.time())),
 38 |         'nonce_str': random_str(),
 39 |         'image': img,
 40 |         'mode': '0'
 41 | 
 42 |     }
 43 | 
 44 |     sort_dict = sorted(params.items(), key=lambda item: item[0], reverse=False)  # 排序
 45 |     sort_dict.append(('app_key', 'P8Gt8nxi6k8vLKbS'))  # 添加app_key
 46 |     rawtext = urlencode(sort_dict).encode()  # URL编码
 47 |     sha = hashlib.md5()
 48 |     sha.update(rawtext)
 49 |     md5text = sha.hexdigest().upper()  # 计算出sign，接口鉴权
 50 |     params['sign'] = md5text  # 添加到请求参数列表中
 51 |     return params
 52 | 
 53 | # 二.请求接口URL
 54 | 
 55 | 
 56 | def access_api(img):
 57 |     frame = cv2.imread(img)
 58 |     nparry_encode = cv2.imencode('.jpg', frame)[1]
 59 |     data_encode = np.array(nparry_encode)
 60 |     img_encode = base64.b64encode(data_encode)  # 图片转为base64编码格式
 61 |     url = 'https://api.ai.qq.com/fcgi-bin/face/face_detectface' 
 62 |     res = requests.post(url, get_params(img_encode)).json()  # 请求URL,得到json信息
 63 |     # 把信息显示到图片上
 64 |     if res['ret'] == 0:  # 0代表请求成功
 65 |         pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))  # 把opencv格式转换为PIL格式，方便写汉字
 66 |         draw = ImageDraw.Draw(pil_img)
 67 |         for obj in res['data']['face_list']:
 68 |             img_width = res['data']['image_width']  # 图像宽度
 69 |             img_height = res['data']['image_height']  # 图像高度
 70 |             # print(obj)
 71 |             x = obj['x']  # 人脸框左上角x坐标
 72 |             y = obj['y']  # 人脸框左上角y坐标
 73 |             w = obj['width']  # 人脸框宽度
 74 |             h = obj['height']  # 人脸框高度
 75 |             # 根据返回的值，自定义一下显示的文字内容
 76 |             if obj['glass'] == 1:  # 眼镜
 77 |                 glass = '有'
 78 |             else:
 79 |                 glass = '无'
 80 |             if obj['gender'] >= 70:  # 性别值从0-100表示从女性到男性
 81 |                 gender = '男'
 82 |             elif 50 <= obj['gender'] < 70:
 83 |                 gender = "娘"
 84 |             elif obj['gender'] < 30:
 85 |                 gender = '女'
 86 |             else:
 87 |                 gender = '女汉子'
 88 |             if 90 < obj['expression'] <= 100:  # 表情从0-100，表示笑的程度
 89 |                 expression = '一笑倾城'
 90 |             elif 80 < obj['expression'] <= 90:
 91 |                 expression = '心花怒放'
 92 |             elif 70 < obj['expression'] <= 80:
 93 |                 expression = '兴高采烈'
 94 |             elif 60 < obj['expression'] <= 70:
 95 |                 expression = '眉开眼笑'
 96 |             elif 50 < obj['expression'] <= 60:
 97 |                 expression = '喜上眉梢'
 98 |             elif 40 < obj['expression'] <= 50:
 99 |                 expression = '喜气洋洋'
100 |             elif 30 < obj['expression'] <= 40:
101 |                 expression = '笑逐颜开'
102 |             elif 20 < obj['expression'] <= 30:
103 |                 expression = '似笑非笑'
104 |             elif 10 < obj['expression'] <= 20:
105 |                 expression = '半嗔半喜'
106 |             elif 0 <= obj['expression'] <= 10:
107 |                 expression = '黯然伤神'
108 |             delt = h // 5  # 确定文字垂直距离
109 |             # 写入图片
110 |             if len(res['data']['face_list']) > 1:  # 检测到多个人脸，就把信息写入人脸框内
111 |                 font = ImageFont.truetype('yahei.ttf', w // 8, encoding='utf-8')  # 提前把字体文件下载好
112 |                 draw.text((x + 10, y + 10), '性别 :' + gender, (76, 176, 80), font=font)
113 |                 draw.text((x + 10, y + 10 + delt * 1), '年龄 :' + str(obj['age']), (76, 176, 80), font=font)
114 |                 draw.text((x + 10, y + 10 + delt * 2), '表情 :' + expression, (76, 176, 80), font=font)
115 |                 draw.text((x + 10, y + 10 + delt * 3), '魅力 :' + str(obj['beauty']), (76, 176, 80), font=font)
116 |                 draw.text((x + 10, y + 10 + delt * 4), '眼镜 :' + glass, (76, 176, 80), font=font)
117 |             elif img_width - x - w < 170:  # 避免图片太窄，导致文字显示不完全
118 |                 font = ImageFont.truetype('yahei.ttf', w // 8, encoding='utf-8')
119 |                 draw.text((x + 10, y + 10), '性别 :' + gender, (76, 176, 80), font=font)
120 |                 draw.text((x + 10, y + 10 + delt * 1), '年龄 :' + str(obj['age']), (76, 176, 80), font=font)
121 |                 draw.text((x + 10, y + 10 + delt * 2), '表情 :' + expression, (76, 176, 80), font=font)
122 |                 draw.text((x + 10, y + 10 + delt * 3), '魅力 :' + str(obj['beauty']), (76, 176, 80), font=font)
123 |                 draw.text((x + 10, y + 10 + delt * 4), '眼镜 :' + glass, (76, 176, 80), font=font)
124 |             else:
125 |                 font = ImageFont.truetype('yahei.ttf', 20, encoding='utf-8')
126 |                 draw.text((x + w + 10, y + 10), '性别 :' + gender, (76, 176, 80), font=font)
127 |                 draw.text((x + w + 10, y + 10 + delt * 1), '年龄 :' + str(obj['age']), (76, 176, 80), font=font)
128 |                 draw.text((x + w + 10, y + 10 + delt * 2), '表情 :' + expression, (76, 176, 80), font=font)
129 |                 draw.text((x + w + 10, y + 10 + delt * 3), '魅力 :' + str(obj['beauty']), (76, 176, 80), font=font)
130 |                 draw.text((x + w + 10, y + 10 + delt * 4), '眼镜 :' + glass, (76, 176, 80), font=font)
131 | 
132 |             draw.rectangle((x, y, x + w, y + h), outline="#4CB050")  # 画出人脸方框
133 |             cv2img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)  # 把 pil 格式转换为 cv
134 |             cv2.imwrite('faces/{}'.format(os.path.basename(img)), cv2img)  # 保存图片到 face 文件夹下
135 |         return 'success'
136 |     else:
137 |         return 'fail'


--------------------------------------------------------------------------------
/wechat/faces/oWFPW1ay1L-73ZFrNkCnq4siIgtQ-2018_06_1618_37_43.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/injetlee/Python/94faec41b8a74cde03ec79c2140da4b8839b224a/wechat/faces/oWFPW1ay1L-73ZFrNkCnq4siIgtQ-2018_06_1618_37_43.jpg


--------------------------------------------------------------------------------
/wechat/faces/oWFPW1ay1L-73ZFrNkCnq4siIgtQ-2018_06_1619_33_56.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/injetlee/Python/94faec41b8a74cde03ec79c2140da4b8839b224a/wechat/faces/oWFPW1ay1L-73ZFrNkCnq4siIgtQ-2018_06_1619_33_56.jpg


--------------------------------------------------------------------------------
/wechat/images/oWFPW1ay1L-73ZFrNkCnq4siIgtQ-2018_06_1618_37_43.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/injetlee/Python/94faec41b8a74cde03ec79c2140da4b8839b224a/wechat/images/oWFPW1ay1L-73ZFrNkCnq4siIgtQ-2018_06_1618_37_43.jpg


--------------------------------------------------------------------------------
/wechat/images/oWFPW1ay1L-73ZFrNkCnq4siIgtQ-2018_06_1619_33_56.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/injetlee/Python/94faec41b8a74cde03ec79c2140da4b8839b224a/wechat/images/oWFPW1ay1L-73ZFrNkCnq4siIgtQ-2018_06_1619_33_56.jpg


--------------------------------------------------------------------------------
/wechat/requirements.txt:
--------------------------------------------------------------------------------
 1 | certifi==2018.4.16
 2 | chardet==3.0.4
 3 | falcon==1.4.1
 4 | idna==2.6
 5 | numpy==1.14.5
 6 | opencv-python==3.4.1.15
 7 | optionaldict==0.1.1
 8 | Pillow==5.1.0
 9 | pycrypto==2.6.1
10 | python-dateutil==2.7.3
11 | python-mimeparse==1.6.0
12 | requests==2.18.4
13 | six==1.11.0
14 | urllib3==1.22
15 | waitress==1.1.0
16 | wechatpy==1.7.0
17 | xmltodict==0.11.0
18 | 


--------------------------------------------------------------------------------
/wechat/utils.py:
--------------------------------------------------------------------------------
 1 | # -*-coding:utf-8 -*-
 2 | import requests
 3 | import json
 4 | import threading
 5 | import time
 6 | import os
 7 | 
 8 | token = ''
 9 | app_id = '开发者ID(AppID)'
10 | secret = '开发者密码(AppSecret)'
11 | 
12 | 
13 | def img_download(url, name):
14 |     r = requests.get(url)
15 |     with open('images/{}-{}.jpg'.format(name, time.strftime("%Y_%m_%d%H_%M_%S", time.localtime())), 'wb') as fd:
16 |         fd.write(r.content)
17 |     if os.path.getsize(fd.name) >= 1048576:
18 |         return 'large'
19 |     # print('namename', os.path.basename(fd.name))
20 |     return os.path.basename(fd.name)
21 | 
22 | 
23 | def get_access_token(appid, secret):
24 |     '''获取access_token,100分钟刷新一次'''
25 | 
26 |     url = 'https://api.weixin.qq.com/cgi-bin/token?grant_type=client_credential&appid={}&secret={}'.format(appid, secret)
27 |     r = requests.get(url)
28 |     parse_json = json.loads(r.text)
29 |     global token
30 |     token = parse_json['access_token']
31 |     global timer
32 |     timer = threading.Timer(6000, get_access_token)
33 |     timer.start()
34 | 
35 | 
36 | def img_upload(mediaType, name):
37 |     global token
38 |     url = "https://api.weixin.qq.com/cgi-bin/media/upload?access_token=%s&type=%s" % (token, mediaType)
39 |     files = {'media': open('{}'.format(name), 'rb')}
40 |     r = requests.post(url, files=files)
41 |     parse_json = json.loads(r.text)
42 |     return parse_json['media_id']
43 | 
44 | get_access_token(app_id, secret)


--------------------------------------------------------------------------------
/wechat/yahei.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/injetlee/Python/94faec41b8a74cde03ec79c2140da4b8839b224a/wechat/yahei.ttf


--------------------------------------------------------------------------------
/爬虫集合/README.MD:
--------------------------------------------------------------------------------
 1 | # 代码详细说明请看文章
 2 | 
 3 | [Python 爬虫入门(一)——爬取糗事百科](https://mp.weixin.qq.com/s/ApnEy6NWS2f-DqIIrhHzGw)
 4 | 
 5 | [Python 爬虫入门(二)——爬取妹子图](https://mp.weixin.qq.com/s/4TZHgoE_yqeDha17f3Tbew)
 6 | 
 7 | [Python 爬虫——Python 岗位分析报告](https://mp.weixin.qq.com/s/8wAHBPnQMbcrP9La7WZiJA)
 8 | 
 9 | [Python 爬虫利器——Selenium介绍](https://mp.weixin.qq.com/s/YJGjZkUejEos_yJ1ukp5kw)
10 | 
11 | [Python 爬虫——抖音App视频抓包](https://mp.weixin.qq.com/s/a8Tky_u1u0A4vbssnAK2_g)


--------------------------------------------------------------------------------
/爬虫集合/lagou.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | import time
 3 | 
 4 | import requests
 5 | from openpyxl import Workbook
 6 | import pymysql.cursors
 7 | 
 8 | 
 9 | def get_conn():
10 |     '''建立数据库连接'''
11 |     conn = pymysql.connect(host='localhost',
12 |                                 user='root',
13 |                                 password='root',
14 |                                 db='python',
15 |                                 charset='utf8mb4',
16 |                                 cursorclass=pymysql.cursors.DictCursor)
17 |     return conn
18 | 
19 | 
20 | def insert(conn, info):
21 |     '''数据写入数据库'''
22 |     with conn.cursor() as cursor:
23 |         sql = "INSERT INTO `python` (`shortname`, `fullname`, `industryfield`, `companySize`, `salary`, `city`, `education`) VALUES (%s, %s, %s, %s, %s, %s, %s)"
24 |         cursor.execute(sql, info)
25 |     conn.commit()
26 | 
27 | 
28 | def get_json(url, page, lang_name):
29 |     '''返回当前页面的信息列表'''
30 |     headers = {
31 |         'Host': 'www.lagou.com',
32 |         'Connection': 'keep-alive',
33 |         'Content-Length': '23',
34 |         'Origin': 'https://www.lagou.com',
35 |         'X-Anit-Forge-Code': '0',
36 |         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0',
37 |         'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
38 |         'Accept': 'application/json, text/javascript, */*; q=0.01',
39 |         'X-Requested-With': 'XMLHttpRequest',
40 |         'X-Anit-Forge-Token': 'None',
41 |         'Referer': 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=',
42 |         'Accept-Encoding': 'gzip, deflate, br',
43 |         'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7'
44 |     }
45 |     data = {'first': 'false', 'pn': page, 'kd': lang_name}
46 |     json = requests.post(url, data, headers=headers).json()
47 |     list_con = json['content']['positionResult']['result']
48 |     info_list = []
49 |     for i in list_con:
50 |         info = []
51 |         info.append(i.get('companyShortName', '无'))
52 |         info.append(i.get('companyFullName', '无'))
53 |         info.append(i.get('industryField', '无'))
54 |         info.append(i.get('companySize', '无'))
55 |         info.append(i.get('salary', '无'))
56 |         info.append(i.get('city', '无'))
57 |         info.append(i.get('education', '无'))
58 |         info_list.append(info)
59 |     return info_list
60 | 
61 | 
62 | def main():
63 |     lang_name = 'python'
64 |     wb = Workbook()  # 打开 excel 工作簿
65 |     conn = get_conn()  # 建立数据库连接  不存数据库 注释此行
66 |     for i in ['北京', '上海', '广州', '深圳', '杭州']:   # 五个城市
67 |         page = 1
68 |         ws1 = wb.active
69 |         ws1.title = lang_name
70 |         url = 'https://www.lagou.com/jobs/positionAjax.json?city={}&needAddtionalResult=false'.format(i)
71 |         while page < 31:   # 每个城市30页信息
72 |             info = get_json(url, page, lang_name)
73 |             page += 1
74 |             print(i, 'page', page)
75 |             time.sleep(random.randint(10, 20))
76 |             for row in info:
77 |                 insert(conn, tuple(row))  # 插入数据库，若不想存入 注释此行
78 |                 ws1.append(row)
79 |     conn.close()  # 关闭数据库连接，不存数据库 注释此行
80 |     wb.save('{}职位信息.xlsx'.format(lang_name))
81 | 
82 | if __name__ == '__main__':
83 |     main()


--------------------------------------------------------------------------------
/爬虫集合/meizitu.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | import os
 3 | import time
 4 | import threading
 5 | from bs4 import BeautifulSoup
 6 | 
 7 | 
 8 | def download_page(url):
 9 |     '''
10 |     用于下载页面
11 |     '''
12 |     headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
13 |     r = requests.get(url, headers=headers)
14 |     r.encoding = 'gb2312'
15 |     return r.text
16 | 
17 | 
18 | def get_pic_list(html):
19 |     '''
20 |     获取每个页面的套图列表,之后循环调用get_pic函数获取图片
21 |     '''
22 |     soup = BeautifulSoup(html, 'html.parser')
23 |     pic_list = soup.find_all('li', class_='wp-item')
24 |     for i in pic_list:
25 |         a_tag = i.find('h3', class_='tit').find('a')
26 |         link = a_tag.get('href')
27 |         text = a_tag.get_text()
28 |         get_pic(link, text)
29 | 
30 | 
31 | def get_pic(link, text):
32 |     '''
33 |     获取当前页面的图片,并保存
34 |     '''
35 |     html = download_page(link)  # 下载界面
36 |     soup = BeautifulSoup(html, 'html.parser')
37 |     pic_list = soup.find('div', id="picture").find_all('img')  # 找到界面所有图片
38 |     headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
39 |     create_dir('pic/{}'.format(text))
40 |     for i in pic_list:
41 |         pic_link = i.get('src')  # 拿到图片的具体 url
42 |         r = requests.get(pic_link, headers=headers)  # 下载图片，之后保存到文件
43 |         with open('pic/{}/{}'.format(text, pic_link.split('/')[-1]), 'wb') as f:
44 |             f.write(r.content)
45 |             time.sleep(1)   # 休息一下，不要给网站太大压力，避免被封
46 | 
47 | 
48 | def create_dir(name):
49 |     if not os.path.exists(name):
50 |         os.makedirs(name)
51 | 
52 | 
53 | def execute(url):
54 |     page_html = download_page(url)
55 |     get_pic_list(page_html)
56 | 
57 | 
58 | def main():
59 |     create_dir('pic')
60 |     queue = [i for i in range(1, 72)]   # 构造 url 链接 页码。
61 |     threads = []
62 |     while len(queue) > 0:
63 |         for thread in threads:
64 |             if not thread.is_alive():
65 |                 threads.remove(thread)
66 |         while len(threads) < 5 and len(queue) > 0:   # 最大线程数设置为 5
67 |             cur_page = queue.pop(0)
68 |             url = 'http://meizitu.com/a/more_{}.html'.format(cur_page)
69 |             thread = threading.Thread(target=execute, args=(url,))
70 |             thread.setDaemon(True)
71 |             thread.start()
72 |             print('{}正在下载{}页'.format(threading.current_thread().name, cur_page))
73 |             threads.append(thread)
74 | 
75 | 
76 | if __name__ == '__main__':
77 |     main()
78 | 


--------------------------------------------------------------------------------
/爬虫集合/qiubai_crawer.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | from bs4 import BeautifulSoup
 3 | 
 4 | 
 5 | def download_page(url):
 6 |     headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"}
 7 |     r = requests.get(url, headers=headers)
 8 |     return r.text
 9 | 
10 | 
11 | def get_content(html, page):
12 |     output = """第{}页 作者：{} 性别：{} 年龄：{} 点赞：{} 评论：{}\n{}\n------------\n"""
13 |     soup = BeautifulSoup(html, 'html.parser')
14 |     con = soup.find(id='content-left')
15 |     con_list = con.find_all('div', class_="article")
16 |     for i in con_list:
17 |         author = i.find('h2').string  # 获取作者名字
18 |         content = i.find('div', class_='content').find('span').get_text()  # 获取内容
19 |         stats = i.find('div', class_='stats')
20 |         vote = stats.find('span', class_='stats-vote').find('i', class_='number').string
21 |         comment = stats.find('span', class_='stats-comments').find('i', class_='number').string
22 |         author_info = i.find('div', class_='articleGender')  # 获取作者 年龄，性别
23 |         if author_info is not None:  # 非匿名用户
24 |             class_list = author_info['class']
25 |             if "womenIcon" in class_list:
26 |                 gender = '女'
27 |             elif "manIcon" in class_list:
28 |                 gender = '男'
29 |             else:
30 |                 gender = ''
31 |             age = author_info.string   # 获取年龄
32 |         else:  # 匿名用户
33 |             gender = ''
34 |             age = ''
35 | 
36 |         save_txt(output.format(page, author, gender, age, vote, comment, content))
37 | 
38 | 
39 | def save_txt(*args):
40 |     for i in args:
41 |         with open('qiubai.txt', 'a', encoding='utf-8') as f:
42 |             f.write(i)
43 | 
44 | 
45 | def main():
46 |     # 我们点击下面链接，在页面下方可以看到共有13页，可以构造如下 url，
47 |     # 当然我们最好是用 Beautiful Soup找到页面底部有多少页。
48 |     for i in range(1, 14):
49 |         url = 'https://qiushibaike.com/text/page/{}'.format(i)
50 |         html = download_page(url)
51 |         get_content(html, i)
52 | 
53 | if __name__ == '__main__':
54 |     main()
55 | 


--------------------------------------------------------------------------------