├── .gitignore ├── CpuToInfluxdb.py ├── ModifyFilename.py ├── Python 黑魔法 ├── Python 远程开机.py └── README.MD ├── README.md ├── biyingSpider.py ├── countFile.py ├── countPm.py ├── douban_book.py ├── douban_movie.py ├── excelToDatabase.py ├── image_recognition_zhihu.py ├── lagouSpider.py ├── login_zhihu.py ├── qiubai_crawer.py ├── qrcode.jpg ├── readExcel.py ├── wechat ├── README.MD ├── connect.py ├── face_id.py ├── faces │ ├── oWFPW1ay1L-73ZFrNkCnq4siIgtQ-2018_06_1618_37_43.jpg │ └── oWFPW1ay1L-73ZFrNkCnq4siIgtQ-2018_06_1619_33_56.jpg ├── images │ ├── oWFPW1ay1L-73ZFrNkCnq4siIgtQ-2018_06_1618_37_43.jpg │ └── oWFPW1ay1L-73ZFrNkCnq4siIgtQ-2018_06_1619_33_56.jpg ├── requirements.txt ├── utils.py └── yahei.ttf └── 爬虫集合 ├── README.MD ├── lagou.py ├── meizitu.py └── qiubai_crawer.py /.gitignore: -------------------------------------------------------------------------------- 1 | /no_use 2 | *.xlsx -------------------------------------------------------------------------------- /CpuToInfluxdb.py: -------------------------------------------------------------------------------- 1 | import psutil 2 | import os 3 | from influxdb import InfluxDBClient 4 | import time,math,random 5 | 6 | 7 | #获取当前运行的pid 8 | p1=psutil.Process(os.getpid()) 9 | 10 | 11 | from influxdb import InfluxDBClient 12 | import time,math,random 13 | while True: 14 | a = psutil.virtual_memory().percent #内存占用率 15 | 16 | b = psutil.cpu_percent(interval=1.0) #cpu占用率 17 | 18 | json_body = [ 19 | { 20 | "measurement": "cpu_load_short", 21 | "tags": { 22 | "host": "server01", 23 | "region": "us-west" 24 | }, 25 | #"time": "2009-11-10T23:00:00Z", 26 | "fields": { 27 | "cpu": b, 28 | "mem": a 29 | } 30 | } 31 | ] 32 | client = InfluxDBClient('localhost', 8086, 'root', 'root', 'xxyyxx') 33 | client.create_database('xxyyxx',if_not_exists=False) 34 | client.write_points(json_body) 35 | #result = client.query('select value from cpu_load_short;') 36 | #print("Result: {0}".format(result)) 37 | time.sleep(2) -------------------------------------------------------------------------------- /ModifyFilename.py: -------------------------------------------------------------------------------- 1 | import os 2 | dir = os.getcwd() 3 | subdir = os.listdir(dir) 4 | for i in subdir: 5 | path = os.path.join(dir, i) 6 | if os.path.isdir(path): 7 | end_dir = os.listdir(path) 8 | for i in range(len(end_dir)): 9 | newname = end_dir[i][0:50] 10 | os.rename(os.path.join(path, end_dir[ 11 | i]), os.path.join(path, newname)) 12 | -------------------------------------------------------------------------------- /Python 黑魔法/Python 远程开机.py: -------------------------------------------------------------------------------- 1 | def wake_up(request, mac='DC-4A-3E-78-3E-0A'): 2 | MAC = mac 3 | BROADCAST = "192.168.0.255" 4 | if len(MAC) != 17: 5 | raise ValueError("MAC address should be set as form 'XX-XX-XX-XX-XX-XX'") 6 | mac_address = MAC.replace("-", '') 7 | data = ''.join(['FFFFFFFFFFFF', mac_address * 20]) # 构造原始数据格式 8 | send_data = b'' 9 | 10 | # 把原始数据转换为16进制字节数组, 11 | for i in range(0, len(data), 2): 12 | send_data = b''.join([send_data, struct.pack('B', int(data[i: i + 2], 16))]) 13 | print(send_data) 14 | 15 | # 通过socket广播出去,为避免失败,间隔广播三次 16 | try: 17 | sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 18 | sock.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1) 19 | sock.sendto(send_data, (BROADCAST, 7)) 20 | time.sleep(1) 21 | sock.sendto(send_data, (BROADCAST, 7)) 22 | time.sleep(1) 23 | sock.sendto(send_data, (BROADCAST, 7)) 24 | return HttpResponse() 25 | 
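# Note: statements placed after a return never execute, so the print("Done")
# below and the print(e) in the except branch are effectively dead code.
# The function also assumes `socket`, `struct`, `time` and Django's
# `HttpResponse` are imported at module level. A standard WOL magic packet is
# 6 bytes of 0xFF followed by 16 copies of the MAC, so the 20 repetitions
# built above are more than enough and are generally accepted by network cards.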
print("Done") 26 | except Exception as e: 27 | return HttpResponse() 28 | print(e) -------------------------------------------------------------------------------- /Python 黑魔法/README.MD: -------------------------------------------------------------------------------- 1 | # 代码详细说明请看文章 2 | 3 | [Python 远程关机](https://mp.weixin.qq.com/s/RSod4XWxyzL32eNcrXLjUQ) 4 | 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # 欢迎关注我的微信公众号【智能制造社区】 3 | 4 | ## 左手代码,右手制造,分享智能制造相关技术和业务,包括 Python, C#, 数据库,工业大数据、物联网技术及MES/ERP/SAP等系统。 5 | 6 | ## 可以通过微信公众号加我好友 7 | 8 | ![二维码](qrcode.jpg) 9 | 10 | # 内容列表 11 | 12 | ## [Python微信公众号开发](https://github.com/injetlee/Python/tree/master/wechat) 13 | 14 | - ### Python 微信公众号开发—小白篇(一) 15 | 16 | - ### Python 公众号开发—颜值检测 17 | 18 | ## [Python 爬虫入门合集](https://github.com/injetlee/Python/tree/master/%E7%88%AC%E8%99%AB%E9%9B%86%E5%90%88) 19 | 20 | - ### Python 爬虫入门(一)——爬取糗事百科 21 | 22 | - ### Python 爬虫入门(二)——爬取妹子图 23 | 24 | - ### Python 爬虫——Python 岗位分析报告 25 | 26 | - ### Python 爬虫利器——Selenium介绍 27 | 28 | - ### Python 爬虫—— 抖音 App 视频抓包爬取 29 | 30 | ## [Python 黑魔法](https://github.com/injetlee/Python/tree/master/Python%20%E9%BB%91%E9%AD%94%E6%B3%95) 31 | 32 | - ### Python 远程关机 33 | 34 | ## SQL 数据库 35 | 36 | - [1 小时 SQL 极速入门(一)](https://mp.weixin.qq.com/s/Lx4B349OlD49ihJPnB6YiA) 37 | - [1 小时 SQL 极速入门(二)](https://mp.weixin.qq.com/s/D-CEtGYomne5kV_Ji4lodA) 38 | - [1 小时 SQL 极速入门(三)](https://mp.weixin.qq.com/s/7aJqrhCNcvnt2gO3p5P50Q) 39 | - [SQL 高级查询——(层次化查询,递归)](https://mp.weixin.qq.com/s/R9Yldd-5AK4ObRA9Lfbz-Q) 40 | - [GROUP BY高级查询,ROLLUP,CUBE,GROUPPING详解](https://mp.weixin.qq.com/s/_OK6dtHGhp7ukC2pe1ginQ) 41 | - [SQL 行转列,列转行](https://mp.weixin.qq.com/s/xOFIg42FQhNpyg94ajhtqQ) 42 | 43 | ## 其他 44 | 45 | - 1.[获取当前CPU状态,存储到Influxdb](https://github.com/injetlee/demo/blob/master/CpuToInfluxdb.py) 46 | 47 | - 2.[模拟登录知乎](https://github.com/injetlee/demo/blob/master/login_zhihu.py) 48 | 49 | - 3.[对目录下所有文件计数](https://github.com/injetlee/demo/blob/master/countFile.py) 50 | 51 | - 4.[爬取豆瓣电影top250](https://github.com/injetlee/demo/blob/master/douban_movie.py) 52 | 53 | - 5.[Excel文件读入数据库](https://github.com/injetlee/demo/blob/master/excelToDatabase.py) 54 | 55 | - 6.[爬取拉勾网职位信息](https://github.com/injetlee/demo/blob/master/lagouSpider.py) 56 | 57 | - 7.[批量修改文件名](https://github.com/injetlee/demo/blob/master/ModifyFilename.py) 58 | 59 | - 8.[读写excel](https://github.com/injetlee/demo/blob/master/readExcel.py) 60 | 61 | - 9.[下载必应首页图片,只下载当天的,一张。](https://github.com/injetlee/Python/blob/master/biyingSpider.py) 62 | -------------------------------------------------------------------------------- /biyingSpider.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re 3 | import time 4 | local = time.strftime("%Y.%m.%d") 5 | url = 'http://cn.bing.com/' 6 | con = requests.get(url) 7 | content = con.text 8 | reg = r"(az/hprichbg/rb/.*?.jpg)" 9 | a = re.findall(reg, content, re.S)[0] 10 | print(a) 11 | picUrl = url + a 12 | read = requests.get(picUrl) 13 | f = open('%s.jpg' % local, 'wb') 14 | f.write(read.content) 15 | f.close() 16 | -------------------------------------------------------------------------------- /countFile.py: -------------------------------------------------------------------------------- 1 | import os 2 | result = [] 3 | def get_all(cwd): 4 | get_dir = os.listdir(cwd) #遍历当前目录,获取文件列表 5 | for i in get_dir: 6 
| sub_dir = os.path.join(cwd,i) # 把第一步获取的文件加入路径 7 | if os.path.isdir(sub_dir): #如果当前仍然是文件夹,递归调用 8 | get_all(sub_dir) 9 | else: 10 | ax = os.path.basename(sub_dir) #如果当前路径不是文件夹,则把文件名放入列表 11 | result.append(ax) 12 | print(len(result)) #对列表计数 13 | 14 | if __name__ == "__main__": 15 | cur_path = os.getcwd() #当前目录 16 | get_all(cur_path) -------------------------------------------------------------------------------- /countPm.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | def count_pm(*args): 3 | alist = list([round(i*2-8,2) for i in args]) #计算三种颗粒浓度 4 | result = [] 5 | for pm in alist: 6 | pm_abs = abs(pm) 7 | result.append(generate_iso_code(pm_abs)) 8 | print (result) 9 | return result 10 | 11 | def generate_iso_code(x): 12 | pm_value = [0.01,0.02,0.04,0.08,0.16,0.32,0.64,1.3,2.5,5,10,20,40,80] #颗粒浓度 13 | iso = list(range(1,25)) #iso级别,共24级 14 | for i in range(len(pm_value)): #for循环得到某个浓度范围的iso4006级别 15 | if pm_value[i] < x <= pm_value[i+1]: 16 | iso_code = iso[i] 17 | break 18 | return iso_code 19 | 20 | if __name__ == '__main__': 21 | count_pm(7.95,5.85,3.98) 22 | count_pm(7.918,5.949,5.456) 23 | count_pm(6.916,3.956,3.956) 24 | -------------------------------------------------------------------------------- /douban_book.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import requests 3 | from openpyxl import Workbook 4 | excel_name = "书籍.xlsx" 5 | wb = Workbook() 6 | ws1 = wb.active 7 | ws1.title='书籍' 8 | 9 | 10 | def get_html(url): 11 | header = { 12 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0'} 13 | html = requests.get(url, headers=header).content 14 | return html 15 | 16 | 17 | def get_con(html): 18 | soup = BeautifulSoup(html,'html.parser') 19 | book_list = soup.find('div', attrs={'class': 'article'}) 20 | page = soup.find('div', attrs={'class': 'paginator'}) 21 | next_page = page.find('span', attrs={'class': 'next'}).find('a') 22 | name = [] 23 | for i in book_list.find_all('table'): 24 | book_name = i.find('div', attrs={'class': 'pl2'}) 25 | m = list(book_name.find('a').stripped_strings) 26 | if len(m)>1: 27 | x = m[0]+m[1] 28 | else: 29 | x = m[0] 30 | #print(x) 31 | name.append(x) 32 | if next_page: 33 | return name, next_page.get('href') 34 | else: 35 | return name, None 36 | 37 | 38 | def main(): 39 | url = 'https://book.douban.com/top250' 40 | name_list=[] 41 | while url: 42 | html = get_html(url) 43 | name, url = get_con(html) 44 | name_list = name_list + name 45 | for i in name_list: 46 | location = 'A%s'%(name_list.index(i)+1) 47 | print(i) 48 | print(location) 49 | ws1[location]=i 50 | wb.save(filename=excel_name) 51 | 52 | 53 | if __name__ == '__main__': 54 | main() 55 | 56 | -------------------------------------------------------------------------------- /douban_movie.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encoding=utf-8 3 | import requests 4 | import re 5 | import codecs 6 | from bs4 import BeautifulSoup 7 | from openpyxl import Workbook 8 | wb = Workbook() 9 | dest_filename = '电影.xlsx' 10 | ws1 = wb.active 11 | ws1.title = "电影top250" 12 | 13 | DOWNLOAD_URL = 'http://movie.douban.com/top250/' 14 | 15 | 16 | def download_page(url): 17 | """获取url地址页面内容""" 18 | headers = { 19 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.80 
Safari/537.36' 20 | } 21 | data = requests.get(url, headers=headers).content 22 | return data 23 | 24 | 25 | def get_li(doc): 26 | soup = BeautifulSoup(doc, 'html.parser') 27 | ol = soup.find('ol', class_='grid_view') 28 | name = [] # 名字 29 | star_con = [] # 评价人数 30 | score = [] # 评分 31 | info_list = [] # 短评 32 | for i in ol.find_all('li'): 33 | detail = i.find('div', attrs={'class': 'hd'}) 34 | movie_name = detail.find( 35 | 'span', attrs={'class': 'title'}).get_text() # 电影名字 36 | level_star = i.find( 37 | 'span', attrs={'class': 'rating_num'}).get_text() # 评分 38 | star = i.find('div', attrs={'class': 'star'}) 39 | star_num = star.find(text=re.compile('评价')) # 评价 40 | 41 | info = i.find('span', attrs={'class': 'inq'}) # 短评 42 | if info: # 判断是否有短评 43 | info_list.append(info.get_text()) 44 | else: 45 | info_list.append('无') 46 | score.append(level_star) 47 | 48 | name.append(movie_name) 49 | star_con.append(star_num) 50 | page = soup.find('span', attrs={'class': 'next'}).find('a') # 获取下一页 51 | if page: 52 | return name, star_con, score, info_list, DOWNLOAD_URL + page['href'] 53 | return name, star_con, score, info_list, None 54 | 55 | 56 | def main(): 57 | url = DOWNLOAD_URL 58 | name = [] 59 | star_con = [] 60 | score = [] 61 | info = [] 62 | while url: 63 | doc = download_page(url) 64 | movie, star, level_num, info_list, url = get_li(doc) 65 | name = name + movie 66 | star_con = star_con + star 67 | score = score + level_num 68 | info = info + info_list 69 | for (i, m, o, p) in zip(name, star_con, score, info): 70 | col_A = 'A%s' % (name.index(i) + 1) 71 | col_B = 'B%s' % (name.index(i) + 1) 72 | col_C = 'C%s' % (name.index(i) + 1) 73 | col_D = 'D%s' % (name.index(i) + 1) 74 | ws1[col_A] = i 75 | ws1[col_B] = m 76 | ws1[col_C] = o 77 | ws1[col_D] = p 78 | wb.save(filename=dest_filename) 79 | 80 | 81 | if __name__ == '__main__': 82 | main() 83 | -------------------------------------------------------------------------------- /excelToDatabase.py: -------------------------------------------------------------------------------- 1 | from openpyxl import load_workbook 2 | import pymysql 3 | config = { 4 | 'host': '127.0.0.1', 5 | 'port':3306, 6 | 'user': 'root', 7 | 'password': 'root', 8 | 'charset': 'utf8mb4', 9 | #'cursorclass': pymysql.cursors.DictCursor 10 | 11 | } 12 | conn = pymysql.connect(**config) 13 | conn.autocommit(1) 14 | cursor = conn.cursor() 15 | name = 'lyexcel' 16 | cursor.execute('create database if not exists %s' %name) 17 | conn.select_db(name) 18 | table_name = 'info' 19 | cursor.execute('create table if not exists %s(id MEDIUMINT NOT NULL AUTO_INCREMENT,name varchar(30),tel varchar(30),primary key (id))'%table_name) 20 | 21 | wb2 = load_workbook('hpu.xlsx') 22 | ws=wb2.get_sheet_names() 23 | for row in wb2: 24 | print("1") 25 | for cell in row: 26 | value1=(cell[0].value,cell[4].value) 27 | cursor.execute('insert into info (name,tel) values(%s,%s)',value1) 28 | 29 | print("overing...") 30 | # for row in A: 31 | # print(row) 32 | #print (wb2.get_sheet_names()) 33 | -------------------------------------------------------------------------------- /image_recognition_zhihu.py: -------------------------------------------------------------------------------- 1 | # -*- coding:UTF-8 -*- 2 | 3 | import requests , time ,random 4 | import hmac ,json ,base64 5 | from bs4 import BeautifulSoup 6 | from hashlib import sha1 7 | import TencentYoutuyun 8 | from PIL import Image 9 | import uuid 10 | 11 | 12 | 13 | def recognition_captcha(data): 14 | ''' 识别验证码 ''' 15 | 16 | file_id = 
str(uuid.uuid1()) 17 | filename = 'captcha_'+ file_id +'.gif' 18 | filename_png = 'captcha_'+ file_id +'.png' 19 | 20 | if(data is None): 21 | return 22 | data = base64.b64decode(data.encode('utf-8')) 23 | with open( filename ,'wb') as fb: 24 | fb.write( data ) 25 | 26 | appid = 'appid' # 接入优图服务,注册账号获取 27 | secret_id = 'secret_id' 28 | secret_key = 'secret_key' 29 | userid= 'userid' 30 | end_point = TencentYoutuyun.conf.API_YOUTU_END_POINT 31 | 32 | youtu = TencentYoutuyun.YouTu(appid, secret_id, secret_key, userid, end_point) # 初始化 33 | 34 | # 拿到的是gif格式,而优图只支持 JPG PNG BMP 其中之一,这时我们需要 pip install Pillow 来转换格式 35 | im = Image.open( filename) 36 | im.save( filename_png ,"png") 37 | im.close() 38 | 39 | result = youtu.generalocr( filename_png , data_type = 0 , seq = '') # 0代表本地路径,1代表url 40 | 41 | return result 42 | 43 | 44 | def get_captcha(sessiona,headers): 45 | ''' 获取验证码 ''' 46 | 47 | need_cap = False 48 | 49 | while( need_cap is not True): 50 | try: 51 | sessiona.get('https://www.zhihu.com/signin',headers=headers) # 拿cookie:_xsrf 52 | resp2 = sessiona.get('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',headers=headers) # 拿cookie:capsion_ticket 53 | need_cap = json.loads(resp2.text)["show_captcha"] # {"show_captcha":false} 表示不用验证码 54 | time.sleep( 0.5 + random.randint(1,9)/10 ) 55 | except Exception: 56 | continue 57 | 58 | try: 59 | resp3 = sessiona.put('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',headers=headers) # 拿到验证码数据,注意是put 60 | img_data = json.loads(resp3.text)["img_base64"] 61 | except Exception: 62 | return 63 | 64 | 65 | return img_data 66 | 67 | def create_point( point_data, confidence ): 68 | ''' 获得点阵 ''' 69 | 70 | # 实际操作下,套路不深,x间隔25,y相同,共7个点 ,先模拟意思一下 71 | points = {1:[ 20.5,25.1875],2:[ 45.5,25.1875],3:[ 70.5,25.1875],4:[ 95.5,25.1875],5:[120.5,25.1875],6:[145.5,25.1875],7:[170.5,25.1875]} 72 | wi = 0 73 | input_points = [] 74 | 75 | for word in ( point_data['items'][0]['words'] ): 76 | wi = wi+1 77 | if( word['confidence'] < confidence ): 78 | try: 79 | input_points.append(points[wi]) # 倒置的中文,优图识别不出来,置信度会低于0.5 80 | except KeyError: 81 | continue 82 | 83 | if( len(input_points) > 2 or len(input_points) == 0 ): 84 | return [] # 7个字中只有2个倒置中文的成功率高 85 | 86 | result = {} 87 | result['img_size']=[200,44] 88 | result['input_points']=input_points 89 | result = json.dumps(result) 90 | print(result) 91 | return result 92 | 93 | def bolting(k_low,k_hi,k3_confidence): 94 | ''' 筛选把握大的进行验证 ''' 95 | 96 | start = time.time() 97 | 98 | is_success = False 99 | while(is_success is not True): 100 | 101 | points_len = 1 102 | angle = -20 103 | img_ko = [] 104 | 105 | while(points_len != 21 or angle < k_low or angle > k_hi ): 106 | img_data = get_captcha(sessiona,headers) 107 | img_ko = recognition_captcha(img_data) 108 | 109 | ## json.dumps 序列化时对中文默认使用的ascii编码.想输出真正的中文需要指定ensure_ascii=False 110 | # img_ko_json = json.dumps(img_ko , indent =2 ,ensure_ascii=False ) 111 | # img_ko_json = img_ko_json.encode('raw_unicode_escape') ## 因为python3的原因,也因为优图自身的原因,此处要特殊处理 112 | 113 | # with open( "json.txt" ,'wb') as fb: 114 | # fb.write( img_ko_json ) 115 | 116 | try: 117 | points_len = len(img_ko['items'][0]['itemstring']) 118 | angle = img_ko['angle'] 119 | except Exception: 120 | points_len = 1 121 | angle = -20 122 | continue 123 | 124 | # print(img_ko_json.decode('utf8')) ## stdout用的是utf8,需转码才能正常显示 125 | # print('-'*50) 126 | 127 | input_text = create_point( img_ko ,k3_confidence ) 128 | if(type(input_text) == type([])): 129 | continue 130 | 131 | data = { 132 | "input_text":input_text 133 
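# input_text holds the JSON string produced by create_point() above:
# the image size plus the guessed click coordinates of the inverted characters.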
| } 134 | 135 | # 提交过快会被拒绝,{"code":120005,"name":"ERR_VERIFY_CAPTCHA_TOO_QUICK"} ,假装思考5秒钟 136 | time.sleep( 4 + random.randint(1,9)/10 ) 137 | try: 138 | resp5 = sessiona.post('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',data,headers=headers) 139 | except Exception: 140 | continue 141 | 142 | print("angle: "+ str(angle) ) 143 | print(BeautifulSoup(resp5.content ,'html.parser')) # 如果验证成功,会回应{"success":true},开心 144 | print('-'*50) 145 | try: 146 | is_success = json.loads(resp5.text)["success"] 147 | except KeyError: 148 | continue 149 | 150 | end = time.time() 151 | 152 | return end-start 153 | 154 | 155 | if __name__ == "__main__": 156 | 157 | sessiona = requests.Session() 158 | headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0','authorization':'oauth c3cef7c66a1843f8b3a9e6a1e3160e20'} 159 | 160 | k3_confidence = 0.71 161 | 162 | ''' 163 | # 可视化数据会被保存在云端供浏览 164 | # https://plot.ly/~weldon2010/4 165 | # 纯属学习,并未看出"角度"范围扩大对图像识别的影响,大部分时候60s内能搞定,说明优图还是很强悍的,识别速度也非常快 166 | ''' 167 | runtime_list_x = [] 168 | runtime_list_y = [] 169 | nn = range(1,11) # 愿意的话搞多线程,1百万次更有意思 170 | 171 | # 成功尝试100次,形成2维数据以热力图的方式展示 172 | for y in nn : 173 | for x in nn : 174 | runtime_list_x.append( bolting(-3,3,k3_confidence) ) 175 | print( "y: " + str(runtime_list_y) ) 176 | print( "x: " + str(runtime_list_x) ) 177 | runtime_list_y.append(runtime_list_x.copy()) 178 | runtime_list_x = [] 179 | 180 | print ("-"*30) 181 | print( runtime_list_y ) 182 | print ("-"*30) 183 | 184 | # pip install plotly 数据可视化 185 | import plotly 186 | import plotly.graph_objs as go 187 | plotly.tools.set_credentials_file(username='username', api_key='username') # 设置账号,去官网注册 188 | trace = go.Heatmap(z = runtime_list_y , x = [n for n in nn ] ,y =[n for n in nn ]) 189 | data=[trace] 190 | plotly.plotly.plot(data, filename='weldon-time2-heatmap') 191 | 192 | # 尝试后发现一个特点,基本都是1~2个倒置中文,这样我们可以借此提速 193 | # 角度范围放大,仅当识别出倒置中文为1~2个时才提交验证否则放弃继续寻找 194 | 195 | ### chcp 65001 (win下改变cmd字符集) 196 | ### python c:\python34\image_recognition_zhihu.py 197 | 198 | 199 | 200 | 201 | 202 | 203 | -------------------------------------------------------------------------------- /lagouSpider.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from openpyxl import Workbook 3 | 4 | def get_json(url, page, lang_name): 5 | data = {'first': 'true', 'pn': page, 'kd': lang_name} 6 | json = requests.post(url, data).json() 7 | list_con = json['content']['positionResult']['result'] 8 | info_list = [] 9 | for i in list_con: 10 | info = [] 11 | info.append(i['companyShortName']) 12 | info.append(i['companyName']) 13 | info.append(i['salary']) 14 | info.append(i['city']) 15 | info.append(i['education']) 16 | info_list.append(info) 17 | return info_list 18 | 19 | 20 | def main(): 21 | lang_name = input('职位名:') 22 | page = 1 23 | url = 'http://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false' 24 | info_result = [] 25 | while page < 31: 26 | info = get_json(url, page, lang_name) 27 | info_result = info_result + info 28 | page += 1 29 | wb = Workbook() 30 | ws1 = wb.active 31 | ws1.title = lang_name 32 | for row in info_result: 33 | ws1.append(row) 34 | wb.save('职位信息.xlsx') 35 | 36 | if __name__ == '__main__': 37 | main() 38 | -------------------------------------------------------------------------------- /login_zhihu.py: -------------------------------------------------------------------------------- 1 | # -*- coding:UTF-8 -*- 2 | 3 | import requests , time 
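# The login flow below: fetch the _xsrf and capsion_ticket cookies, compute an
# HMAC-SHA1 signature over grant_type + client_id + source + timestamp, and
# POST the assembled form (plus the captcha, if required) to /api/v3/oauth/sign_in.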
4 | import hmac ,json 5 | from bs4 import BeautifulSoup 6 | from hashlib import sha1 7 | 8 | 9 | def get_captcha(data,need_cap): 10 | ''' 处理验证码 ''' 11 | if need_cap is False: 12 | return 13 | with open('captcha.gif','wb') as fb: 14 | fb.write(data) 15 | return input('captcha:') 16 | 17 | def get_signature(grantType,clientId,source,timestamp): 18 | ''' 处理签名 ''' 19 | 20 | hm = hmac.new(b'd1b964811afb40118a12068ff74a12f4',None,sha1) 21 | hm.update(str.encode(grantType)) 22 | hm.update(str.encode(clientId)) 23 | hm.update(str.encode(source)) 24 | hm.update(str.encode(timestamp)) 25 | 26 | return str(hm.hexdigest()) 27 | 28 | 29 | 30 | def login(username,password,oncaptcha,sessiona,headers): 31 | ''' 处理登录 ''' 32 | 33 | resp1 = sessiona.get('https://www.zhihu.com/signin',headers=headers) # 拿cookie:_xsrf 34 | resp2 = sessiona.get('https://www.zhihu.com/api/v3/oauth/captcha?lang=cn',headers=headers) # 拿cookie:capsion_ticket 35 | need_cap = json.loads(resp2.text)["show_captcha"] # {"show_captcha":false} 表示不用验证码 36 | 37 | grantType = 'password' 38 | clientId = 'c3cef7c66a1843f8b3a9e6a1e3160e20' 39 | source ='com.zhihu.web' 40 | timestamp = str((time.time()*1000)).split('.')[0] # 签名只按这个时间戳变化 41 | 42 | captcha_content = sessiona.get('https://www.zhihu.com/captcha.gif?r=%d&type=login'%(time.time()*1000),headers=headers).content 43 | 44 | data = { 45 | "client_id":clientId, 46 | "grant_type":grantType, 47 | "timestamp":timestamp, 48 | "source":source, 49 | "signature": get_signature(grantType,clientId,source,timestamp), # 获取签名 50 | "username":username, 51 | "password":password, 52 | "lang":"cn", 53 | "captcha":oncaptcha(captcha_content,need_cap), # 获取图片验证码 54 | "ref_source":"other_", 55 | "utm_source":"" 56 | } 57 | 58 | print("**2**: "+str(data)) 59 | print("-"*50) 60 | resp = sessiona.post('https://www.zhihu.com/api/v3/oauth/sign_in',data,headers=headers).content 61 | print(BeautifulSoup(resp,'html.parser')) 62 | 63 | print("-"*50) 64 | return resp 65 | 66 | if __name__ == "__main__": 67 | sessiona = requests.Session() 68 | headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0','authorization':'oauth c3cef7c66a1843f8b3a9e6a1e3160e20'} 69 | 70 | login('12345678@qq.com','12345678',get_captcha,sessiona,headers) # 用户名密码换自己的就好了 71 | resp = sessiona.get('https://www.zhihu.com/inbox',headers=headers) # 登录进去了,可以看私信了 72 | print(BeautifulSoup(resp.content ,'html.parser')) 73 | 74 | 75 | 76 | 77 | ### chcp 65001 (win下改变cmd字符集) 78 | ### python c:\python34\login_zhihu.py 79 | ### 有非常无语的事情发生,还以为代码没生效 80 | 81 | 82 | 83 | 84 | 85 | -------------------------------------------------------------------------------- /qiubai_crawer.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | 4 | 5 | def download_page(url): 6 | headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"} 7 | r = requests.get(url, headers=headers) 8 | return r.text 9 | 10 | 11 | def get_content(html, page): 12 | output = """第{}页 作者:{} 性别:{} 年龄:{} 点赞:{} 评论:{}\n{}\n------------\n""" 13 | soup = BeautifulSoup(html, 'html.parser') 14 | con = soup.find(id='content-left') 15 | con_list = con.find_all('div', class_="article") 16 | for i in con_list: 17 | author = i.find('h2').string # 获取作者名字 18 | content = i.find('div', class_='content').find('span').get_text() # 获取内容 19 | stats = i.find('div', class_='stats') 20 | vote = stats.find('span', class_='stats-vote').find('i', 
class_='number').string 21 | comment = stats.find('span', class_='stats-comments').find('i', class_='number').string 22 | author_info = i.find('div', class_='articleGender') # 获取作者 年龄,性别 23 | if author_info is not None: # 非匿名用户 24 | class_list = author_info['class'] 25 | if "womenIcon" in class_list: 26 | gender = '女' 27 | elif "manIcon" in class_list: 28 | gender = '男' 29 | else: 30 | gender = '' 31 | age = author_info.string # 获取年龄 32 | else: # 匿名用户 33 | gender = '' 34 | age = '' 35 | 36 | save_txt(output.format(page, author, gender, age, vote, comment, content)) 37 | 38 | 39 | def save_txt(*args): 40 | for i in args: 41 | with open('qiubai.txt', 'a', encoding='utf-8') as f: 42 | f.write(i) 43 | 44 | 45 | def main(): 46 | # 我们点击下面链接,在页面下方可以看到共有13页,可以构造如下 url, 47 | # 当然我们最好是用 Beautiful Soup找到页面底部有多少页。 48 | for i in range(1, 14): 49 | url = 'https://qiushibaike.com/text/page/{}'.format(i) 50 | html = download_page(url) 51 | get_content(html, i) 52 | 53 | if __name__ == '__main__': 54 | main() 55 | -------------------------------------------------------------------------------- /qrcode.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/injetlee/Python/94faec41b8a74cde03ec79c2140da4b8839b224a/qrcode.jpg -------------------------------------------------------------------------------- /readExcel.py: -------------------------------------------------------------------------------- 1 | from openpyxl import Workbook 2 | from openpyxl.compat import range 3 | from openpyxl.cell import get_column_letter 4 | wb = Workbook() 5 | dest_filename = 'empty_book2.xlsx' 6 | ws1 = wb.active # 第一个表 7 | ws1.title = "range names" # 第一个表命名 8 | # 遍历第一个表的1到40行,赋值一个600内的随机数 9 | for row in range(1, 40): 10 | ws1.append(range(60)) 11 | ws2 = wb.create_sheet(title="Pi") 12 | ws2['F5'] = 3.14 13 | ws3 = wb.create_sheet(title="Data") 14 | for row in range(10, 20): 15 | for col in range(27, 54): 16 | _ = ws3.cell(column=col, row=row, value="%s" % get_column_letter(col)) 17 | wb.save(filename=dest_filename) 18 | -------------------------------------------------------------------------------- /wechat/README.MD: -------------------------------------------------------------------------------- 1 | # 详细使用请看文章 2 | 3 | [Python微信公众号开发—小白篇(一)](https://mp.weixin.qq.com/s/iMPUC0yxI-zuf4AjtyAu6g) 4 | 5 | [Python公众号开发—颜值检测](https://mp.weixin.qq.com/s/I0DxhIHkeqhc2LeQ2ICHeA) -------------------------------------------------------------------------------- /wechat/connect.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8 -*- 2 | import falcon 3 | from falcon import uri 4 | from wechatpy.utils import check_signature 5 | from wechatpy.exceptions import InvalidSignatureException 6 | from wechatpy import parse_message 7 | from wechatpy.replies import TextReply, ImageReply 8 | 9 | from utils import img_download, img_upload 10 | from face_id import access_api 11 | 12 | 13 | class Connect(object): 14 | 15 | def on_get(self, req, resp): 16 | query_string = req.query_string 17 | query_list = query_string.split('&') 18 | b = {} 19 | for i in query_list: 20 | b[i.split('=')[0]] = i.split('=')[1] 21 | 22 | try: 23 | check_signature(token='lengxiao', signature=b['signature'], timestamp=b['timestamp'], nonce=b['nonce']) 24 | resp.body = (b['echostr']) 25 | except InvalidSignatureException: 26 | pass 27 | resp.status = falcon.HTTP_200 28 | 29 | def on_post(self, req, resp): 30 | xml = req.stream.read() 31 | msg = 
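# wechatpy's parse_message() converts the raw XML pushed by the WeChat server
# into a message object; msg.type is then used below to branch on text vs. image.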
parse_message(xml) 32 | if msg.type == 'text': 33 | reply = TextReply(content=msg.content, message=msg) 34 | xml = reply.render() 35 | resp.body = (xml) 36 | resp.status = falcon.HTTP_200 37 | elif msg.type == 'image': 38 | name = img_download(msg.image, msg.source) 39 | print(name) 40 | r = access_api('images/' + name) 41 | if r == 'success': 42 | media_id = img_upload('image', 'faces/' + name) 43 | reply = ImageReply(media_id=media_id, message=msg) 44 | else: 45 | reply = TextReply(content='人脸检测失败,请上传1M以下人脸清晰的照片', message=msg) 46 | xml = reply.render() 47 | resp.body = (xml) 48 | resp.status = falcon.HTTP_200 49 | 50 | app = falcon.API() 51 | connect = Connect() 52 | app.add_route('/connect', connect) 53 | -------------------------------------------------------------------------------- /wechat/face_id.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8 -*- 2 | import time 3 | import random 4 | import base64 5 | import hashlib 6 | import requests 7 | from urllib.parse import urlencode 8 | import cv2 9 | import numpy as np 10 | from PIL import Image, ImageDraw, ImageFont 11 | import os 12 | 13 | 14 | # 一.计算接口鉴权,构造请求参数 15 | 16 | def random_str(): 17 | '''得到随机字符串nonce_str''' 18 | str = 'abcdefghijklmnopqrstuvwxyz' 19 | r = '' 20 | for i in range(15): 21 | index = random.randint(0,25) 22 | r += str[index] 23 | return r 24 | 25 | 26 | def image(name): 27 | with open(name, 'rb') as f: 28 | content = f.read() 29 | return base64.b64encode(content) 30 | 31 | 32 | def get_params(img): 33 | '''组织接口请求的参数形式,并且计算sign接口鉴权信息, 34 | 最终返回接口请求所需要的参数字典''' 35 | params = { 36 | 'app_id': '1106860829', 37 | 'time_stamp': str(int(time.time())), 38 | 'nonce_str': random_str(), 39 | 'image': img, 40 | 'mode': '0' 41 | 42 | } 43 | 44 | sort_dict = sorted(params.items(), key=lambda item: item[0], reverse=False) # 排序 45 | sort_dict.append(('app_key', 'P8Gt8nxi6k8vLKbS')) # 添加app_key 46 | rawtext = urlencode(sort_dict).encode() # URL编码 47 | sha = hashlib.md5() 48 | sha.update(rawtext) 49 | md5text = sha.hexdigest().upper() # 计算出sign,接口鉴权 50 | params['sign'] = md5text # 添加到请求参数列表中 51 | return params 52 | 53 | # 二.请求接口URL 54 | 55 | 56 | def access_api(img): 57 | frame = cv2.imread(img) 58 | nparry_encode = cv2.imencode('.jpg', frame)[1] 59 | data_encode = np.array(nparry_encode) 60 | img_encode = base64.b64encode(data_encode) # 图片转为base64编码格式 61 | url = 'https://api.ai.qq.com/fcgi-bin/face/face_detectface' 62 | res = requests.post(url, get_params(img_encode)).json() # 请求URL,得到json信息 63 | # 把信息显示到图片上 64 | if res['ret'] == 0: # 0代表请求成功 65 | pil_img = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) # 把opencv格式转换为PIL格式,方便写汉字 66 | draw = ImageDraw.Draw(pil_img) 67 | for obj in res['data']['face_list']: 68 | img_width = res['data']['image_width'] # 图像宽度 69 | img_height = res['data']['image_height'] # 图像高度 70 | # print(obj) 71 | x = obj['x'] # 人脸框左上角x坐标 72 | y = obj['y'] # 人脸框左上角y坐标 73 | w = obj['width'] # 人脸框宽度 74 | h = obj['height'] # 人脸框高度 75 | # 根据返回的值,自定义一下显示的文字内容 76 | if obj['glass'] == 1: # 眼镜 77 | glass = '有' 78 | else: 79 | glass = '无' 80 | if obj['gender'] >= 70: # 性别值从0-100表示从女性到男性 81 | gender = '男' 82 | elif 50 <= obj['gender'] < 70: 83 | gender = "娘" 84 | elif obj['gender'] < 30: 85 | gender = '女' 86 | else: 87 | gender = '女汉子' 88 | if 90 < obj['expression'] <= 100: # 表情从0-100,表示笑的程度 89 | expression = '一笑倾城' 90 | elif 80 < obj['expression'] <= 90: 91 | expression = '心花怒放' 92 | elif 70 < obj['expression'] <= 80: 93 | expression = '兴高采烈' 94 | elif 60 < 
obj['expression'] <= 70: 95 | expression = '眉开眼笑' 96 | elif 50 < obj['expression'] <= 60: 97 | expression = '喜上眉梢' 98 | elif 40 < obj['expression'] <= 50: 99 | expression = '喜气洋洋' 100 | elif 30 < obj['expression'] <= 40: 101 | expression = '笑逐颜开' 102 | elif 20 < obj['expression'] <= 30: 103 | expression = '似笑非笑' 104 | elif 10 < obj['expression'] <= 20: 105 | expression = '半嗔半喜' 106 | elif 0 <= obj['expression'] <= 10: 107 | expression = '黯然伤神' 108 | delt = h // 5 # 确定文字垂直距离 109 | # 写入图片 110 | if len(res['data']['face_list']) > 1: # 检测到多个人脸,就把信息写入人脸框内 111 | font = ImageFont.truetype('yahei.ttf', w // 8, encoding='utf-8') # 提前把字体文件下载好 112 | draw.text((x + 10, y + 10), '性别 :' + gender, (76, 176, 80), font=font) 113 | draw.text((x + 10, y + 10 + delt * 1), '年龄 :' + str(obj['age']), (76, 176, 80), font=font) 114 | draw.text((x + 10, y + 10 + delt * 2), '表情 :' + expression, (76, 176, 80), font=font) 115 | draw.text((x + 10, y + 10 + delt * 3), '魅力 :' + str(obj['beauty']), (76, 176, 80), font=font) 116 | draw.text((x + 10, y + 10 + delt * 4), '眼镜 :' + glass, (76, 176, 80), font=font) 117 | elif img_width - x - w < 170: # 避免图片太窄,导致文字显示不完全 118 | font = ImageFont.truetype('yahei.ttf', w // 8, encoding='utf-8') 119 | draw.text((x + 10, y + 10), '性别 :' + gender, (76, 176, 80), font=font) 120 | draw.text((x + 10, y + 10 + delt * 1), '年龄 :' + str(obj['age']), (76, 176, 80), font=font) 121 | draw.text((x + 10, y + 10 + delt * 2), '表情 :' + expression, (76, 176, 80), font=font) 122 | draw.text((x + 10, y + 10 + delt * 3), '魅力 :' + str(obj['beauty']), (76, 176, 80), font=font) 123 | draw.text((x + 10, y + 10 + delt * 4), '眼镜 :' + glass, (76, 176, 80), font=font) 124 | else: 125 | font = ImageFont.truetype('yahei.ttf', 20, encoding='utf-8') 126 | draw.text((x + w + 10, y + 10), '性别 :' + gender, (76, 176, 80), font=font) 127 | draw.text((x + w + 10, y + 10 + delt * 1), '年龄 :' + str(obj['age']), (76, 176, 80), font=font) 128 | draw.text((x + w + 10, y + 10 + delt * 2), '表情 :' + expression, (76, 176, 80), font=font) 129 | draw.text((x + w + 10, y + 10 + delt * 3), '魅力 :' + str(obj['beauty']), (76, 176, 80), font=font) 130 | draw.text((x + w + 10, y + 10 + delt * 4), '眼镜 :' + glass, (76, 176, 80), font=font) 131 | 132 | draw.rectangle((x, y, x + w, y + h), outline="#4CB050") # 画出人脸方框 133 | cv2img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR) # 把 pil 格式转换为 cv 134 | cv2.imwrite('faces/{}'.format(os.path.basename(img)), cv2img) # 保存图片到 face 文件夹下 135 | return 'success' 136 | else: 137 | return 'fail' -------------------------------------------------------------------------------- /wechat/faces/oWFPW1ay1L-73ZFrNkCnq4siIgtQ-2018_06_1618_37_43.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/injetlee/Python/94faec41b8a74cde03ec79c2140da4b8839b224a/wechat/faces/oWFPW1ay1L-73ZFrNkCnq4siIgtQ-2018_06_1618_37_43.jpg -------------------------------------------------------------------------------- /wechat/faces/oWFPW1ay1L-73ZFrNkCnq4siIgtQ-2018_06_1619_33_56.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/injetlee/Python/94faec41b8a74cde03ec79c2140da4b8839b224a/wechat/faces/oWFPW1ay1L-73ZFrNkCnq4siIgtQ-2018_06_1619_33_56.jpg -------------------------------------------------------------------------------- /wechat/images/oWFPW1ay1L-73ZFrNkCnq4siIgtQ-2018_06_1618_37_43.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/injetlee/Python/94faec41b8a74cde03ec79c2140da4b8839b224a/wechat/images/oWFPW1ay1L-73ZFrNkCnq4siIgtQ-2018_06_1618_37_43.jpg -------------------------------------------------------------------------------- /wechat/images/oWFPW1ay1L-73ZFrNkCnq4siIgtQ-2018_06_1619_33_56.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/injetlee/Python/94faec41b8a74cde03ec79c2140da4b8839b224a/wechat/images/oWFPW1ay1L-73ZFrNkCnq4siIgtQ-2018_06_1619_33_56.jpg -------------------------------------------------------------------------------- /wechat/requirements.txt: -------------------------------------------------------------------------------- 1 | certifi==2018.4.16 2 | chardet==3.0.4 3 | falcon==1.4.1 4 | idna==2.6 5 | numpy==1.14.5 6 | opencv-python==3.4.1.15 7 | optionaldict==0.1.1 8 | Pillow==5.1.0 9 | pycrypto==2.6.1 10 | python-dateutil==2.7.3 11 | python-mimeparse==1.6.0 12 | requests==2.18.4 13 | six==1.11.0 14 | urllib3==1.22 15 | waitress==1.1.0 16 | wechatpy==1.7.0 17 | xmltodict==0.11.0 18 | -------------------------------------------------------------------------------- /wechat/utils.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8 -*- 2 | import requests 3 | import json 4 | import threading 5 | import time 6 | import os 7 | 8 | token = '' 9 | app_id = '开发者ID(AppID)' 10 | secret = '开发者密码(AppSecret)' 11 | 12 | 13 | def img_download(url, name): 14 | r = requests.get(url) 15 | with open('images/{}-{}.jpg'.format(name, time.strftime("%Y_%m_%d%H_%M_%S", time.localtime())), 'wb') as fd: 16 | fd.write(r.content) 17 | if os.path.getsize(fd.name) >= 1048576: 18 | return 'large' 19 | # print('namename', os.path.basename(fd.name)) 20 | return os.path.basename(fd.name) 21 | 22 | 23 | def get_access_token(appid, secret): 24 | '''获取access_token,100分钟刷新一次''' 25 | 26 | url = 'https://api.weixin.qq.com/cgi-bin/token?grant_type=client_credential&appid={}&secret={}'.format(appid, secret) 27 | r = requests.get(url) 28 | parse_json = json.loads(r.text) 29 | global token 30 | token = parse_json['access_token'] 31 | global timer 32 | timer = threading.Timer(6000, get_access_token) 33 | timer.start() 34 | 35 | 36 | def img_upload(mediaType, name): 37 | global token 38 | url = "https://api.weixin.qq.com/cgi-bin/media/upload?access_token=%s&type=%s" % (token, mediaType) 39 | files = {'media': open('{}'.format(name), 'rb')} 40 | r = requests.post(url, files=files) 41 | parse_json = json.loads(r.text) 42 | return parse_json['media_id'] 43 | 44 | get_access_token(app_id, secret) -------------------------------------------------------------------------------- /wechat/yahei.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/injetlee/Python/94faec41b8a74cde03ec79c2140da4b8839b224a/wechat/yahei.ttf -------------------------------------------------------------------------------- /爬虫集合/README.MD: -------------------------------------------------------------------------------- 1 | # 代码详细说明请看文章 2 | 3 | [Python 爬虫入门(一)——爬取糗事百科](https://mp.weixin.qq.com/s/ApnEy6NWS2f-DqIIrhHzGw) 4 | 5 | [Python 爬虫入门(二)——爬取妹子图](https://mp.weixin.qq.com/s/4TZHgoE_yqeDha17f3Tbew) 6 | 7 | [Python 爬虫——Python 岗位分析报告](https://mp.weixin.qq.com/s/8wAHBPnQMbcrP9La7WZiJA) 8 | 9 | [Python 爬虫利器——Selenium介绍](https://mp.weixin.qq.com/s/YJGjZkUejEos_yJ1ukp5kw) 10 | 11 | [Python 爬虫——抖音App视频抓包](https://mp.weixin.qq.com/s/a8Tky_u1u0A4vbssnAK2_g) 
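A note on wechat/utils.py above: its refresh timer is created as threading.Timer(6000, get_access_token) with no arguments, so the delayed call would raise a TypeError once it fires. Below is a minimal sketch of the same helper with the credentials passed through; it keeps the endpoint and the module-level token from the original file, and the daemon flag is our addition rather than part of the repo's code.

import json
import threading
import requests

token = ''

def get_access_token(appid, secret):
    '''Fetch the access_token and re-schedule the fetch (WeChat tokens live 7200 s).'''
    global token
    url = ('https://api.weixin.qq.com/cgi-bin/token'
           '?grant_type=client_credential&appid={}&secret={}').format(appid, secret)
    token = json.loads(requests.get(url).text)['access_token']
    timer = threading.Timer(6000, get_access_token, args=(appid, secret))
    timer.daemon = True  # do not keep the process alive just for the refresh timer
    timer.start()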
-------------------------------------------------------------------------------- /爬虫集合/lagou.py: -------------------------------------------------------------------------------- 1 | import random 2 | import time 3 | 4 | import requests 5 | from openpyxl import Workbook 6 | import pymysql.cursors 7 | 8 | 9 | def get_conn(): 10 | '''建立数据库连接''' 11 | conn = pymysql.connect(host='localhost', 12 | user='root', 13 | password='root', 14 | db='python', 15 | charset='utf8mb4', 16 | cursorclass=pymysql.cursors.DictCursor) 17 | return conn 18 | 19 | 20 | def insert(conn, info): 21 | '''数据写入数据库''' 22 | with conn.cursor() as cursor: 23 | sql = "INSERT INTO `python` (`shortname`, `fullname`, `industryfield`, `companySize`, `salary`, `city`, `education`) VALUES (%s, %s, %s, %s, %s, %s, %s)" 24 | cursor.execute(sql, info) 25 | conn.commit() 26 | 27 | 28 | def get_json(url, page, lang_name): 29 | '''返回当前页面的信息列表''' 30 | headers = { 31 | 'Host': 'www.lagou.com', 32 | 'Connection': 'keep-alive', 33 | 'Content-Length': '23', 34 | 'Origin': 'https://www.lagou.com', 35 | 'X-Anit-Forge-Code': '0', 36 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0', 37 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 38 | 'Accept': 'application/json, text/javascript, */*; q=0.01', 39 | 'X-Requested-With': 'XMLHttpRequest', 40 | 'X-Anit-Forge-Token': 'None', 41 | 'Referer': 'https://www.lagou.com/jobs/list_python?city=%E5%85%A8%E5%9B%BD&cl=false&fromSearch=true&labelWords=&suginput=', 42 | 'Accept-Encoding': 'gzip, deflate, br', 43 | 'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7' 44 | } 45 | data = {'first': 'false', 'pn': page, 'kd': lang_name} 46 | json = requests.post(url, data, headers=headers).json() 47 | list_con = json['content']['positionResult']['result'] 48 | info_list = [] 49 | for i in list_con: 50 | info = [] 51 | info.append(i.get('companyShortName', '无')) 52 | info.append(i.get('companyFullName', '无')) 53 | info.append(i.get('industryField', '无')) 54 | info.append(i.get('companySize', '无')) 55 | info.append(i.get('salary', '无')) 56 | info.append(i.get('city', '无')) 57 | info.append(i.get('education', '无')) 58 | info_list.append(info) 59 | return info_list 60 | 61 | 62 | def main(): 63 | lang_name = 'python' 64 | wb = Workbook() # 打开 excel 工作簿 65 | conn = get_conn() # 建立数据库连接 不存数据库 注释此行 66 | for i in ['北京', '上海', '广州', '深圳', '杭州']: # 五个城市 67 | page = 1 68 | ws1 = wb.active 69 | ws1.title = lang_name 70 | url = 'https://www.lagou.com/jobs/positionAjax.json?city={}&needAddtionalResult=false'.format(i) 71 | while page < 31: # 每个城市30页信息 72 | info = get_json(url, page, lang_name) 73 | page += 1 74 | print(i, 'page', page) 75 | time.sleep(random.randint(10, 20)) 76 | for row in info: 77 | insert(conn, tuple(row)) # 插入数据库,若不想存入 注释此行 78 | ws1.append(row) 79 | conn.close() # 关闭数据库连接,不存数据库 注释此行 80 | wb.save('{}职位信息.xlsx'.format(lang_name)) 81 | 82 | if __name__ == '__main__': 83 | main() -------------------------------------------------------------------------------- /爬虫集合/meizitu.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import os 3 | import time 4 | import threading 5 | from bs4 import BeautifulSoup 6 | 7 | 8 | def download_page(url): 9 | ''' 10 | 用于下载页面 11 | ''' 12 | headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"} 13 | r = requests.get(url, headers=headers) 14 | r.encoding = 'gb2312' 15 | return r.text 16 | 17 | 18 | def 
get_pic_list(html): 19 | ''' 20 | 获取每个页面的套图列表,之后循环调用get_pic函数获取图片 21 | ''' 22 | soup = BeautifulSoup(html, 'html.parser') 23 | pic_list = soup.find_all('li', class_='wp-item') 24 | for i in pic_list: 25 | a_tag = i.find('h3', class_='tit').find('a') 26 | link = a_tag.get('href') 27 | text = a_tag.get_text() 28 | get_pic(link, text) 29 | 30 | 31 | def get_pic(link, text): 32 | ''' 33 | 获取当前页面的图片,并保存 34 | ''' 35 | html = download_page(link) # 下载界面 36 | soup = BeautifulSoup(html, 'html.parser') 37 | pic_list = soup.find('div', id="picture").find_all('img') # 找到界面所有图片 38 | headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"} 39 | create_dir('pic/{}'.format(text)) 40 | for i in pic_list: 41 | pic_link = i.get('src') # 拿到图片的具体 url 42 | r = requests.get(pic_link, headers=headers) # 下载图片,之后保存到文件 43 | with open('pic/{}/{}'.format(text, pic_link.split('/')[-1]), 'wb') as f: 44 | f.write(r.content) 45 | time.sleep(1) # 休息一下,不要给网站太大压力,避免被封 46 | 47 | 48 | def create_dir(name): 49 | if not os.path.exists(name): 50 | os.makedirs(name) 51 | 52 | 53 | def execute(url): 54 | page_html = download_page(url) 55 | get_pic_list(page_html) 56 | 57 | 58 | def main(): 59 | create_dir('pic') 60 | queue = [i for i in range(1, 72)] # 构造 url 链接 页码。 61 | threads = [] 62 | while len(queue) > 0: 63 | for thread in threads: 64 | if not thread.is_alive(): 65 | threads.remove(thread) 66 | while len(threads) < 5 and len(queue) > 0: # 最大线程数设置为 5 67 | cur_page = queue.pop(0) 68 | url = 'http://meizitu.com/a/more_{}.html'.format(cur_page) 69 | thread = threading.Thread(target=execute, args=(url,)) 70 | thread.setDaemon(True) 71 | thread.start() 72 | print('{}正在下载{}页'.format(threading.current_thread().name, cur_page)) 73 | threads.append(thread) 74 | 75 | 76 | if __name__ == '__main__': 77 | main() 78 | -------------------------------------------------------------------------------- /爬虫集合/qiubai_crawer.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | 4 | 5 | def download_page(url): 6 | headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0"} 7 | r = requests.get(url, headers=headers) 8 | return r.text 9 | 10 | 11 | def get_content(html, page): 12 | output = """第{}页 作者:{} 性别:{} 年龄:{} 点赞:{} 评论:{}\n{}\n------------\n""" 13 | soup = BeautifulSoup(html, 'html.parser') 14 | con = soup.find(id='content-left') 15 | con_list = con.find_all('div', class_="article") 16 | for i in con_list: 17 | author = i.find('h2').string # 获取作者名字 18 | content = i.find('div', class_='content').find('span').get_text() # 获取内容 19 | stats = i.find('div', class_='stats') 20 | vote = stats.find('span', class_='stats-vote').find('i', class_='number').string 21 | comment = stats.find('span', class_='stats-comments').find('i', class_='number').string 22 | author_info = i.find('div', class_='articleGender') # 获取作者 年龄,性别 23 | if author_info is not None: # 非匿名用户 24 | class_list = author_info['class'] 25 | if "womenIcon" in class_list: 26 | gender = '女' 27 | elif "manIcon" in class_list: 28 | gender = '男' 29 | else: 30 | gender = '' 31 | age = author_info.string # 获取年龄 32 | else: # 匿名用户 33 | gender = '' 34 | age = '' 35 | 36 | save_txt(output.format(page, author, gender, age, vote, comment, content)) 37 | 38 | 39 | def save_txt(*args): 40 | for i in args: 41 | with open('qiubai.txt', 'a', encoding='utf-8') as f: 42 | f.write(i) 43 | 44 | 45 | def main(): 46 | # 
我们点击下面链接,在页面下方可以看到共有13页,可以构造如下 url, 47 | # 当然我们最好是用 Beautiful Soup找到页面底部有多少页。 48 | for i in range(1, 14): 49 | url = 'https://qiushibaike.com/text/page/{}'.format(i) 50 | html = download_page(url) 51 | get_content(html, i) 52 | 53 | if __name__ == '__main__': 54 | main() 55 | --------------------------------------------------------------------------------
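The comment in main() above notes that it would be better to read the page count from the pager at the bottom of the page instead of hard-coding 13 pages. A rough sketch of that idea follows; the 'pagination' and 'page-numbers' selectors are assumptions about the site's markup and would need to be checked against the live page.

from bs4 import BeautifulSoup

def get_page_count(html):
    '''Return the largest page number shown in the pager, falling back to 1.'''
    soup = BeautifulSoup(html, 'html.parser')
    pager = soup.find('ul', class_='pagination')  # selector is an assumption
    if pager is None:
        return 1
    numbers = [int(s.string) for s in pager.find_all('span', class_='page-numbers')
               if s.string and s.string.strip().isdigit()]
    return max(numbers) if numbers else 1

main() could then call get_page_count(download_page(url)) on the first page and loop over range(1, count + 1) instead of the fixed range(1, 14).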