├── 10-Requests.py ├── 10.1busPath_Crawler.py ├── 11.1pytesser.py ├── 11.2jTessBoxEditor-tesseract.py ├── 11verification_code.py ├── 12video.py ├── 13.1thread_ood.py ├── 13.2thread_queue.py ├── 13.3Mthread_crawler.py ├── 13multiThread.py ├── 1urllib_base.py ├── 2ajax.py ├── 4handler.py ├── 5.1正则爬取糗.py ├── 5.2正则爬取励志网并建立文章集合页面.py ├── 58crawler ├── 58.ttf └── 58decode.py ├── 6.1read_list.py ├── 6xpath.py ├── 7pictureLoad.py ├── 8jsonpath.py ├── 9.1Chrome-headless.py ├── 9selenium.py ├── README.md ├── chineseUniversityRankCrawler └── RankofNuni.py ├── exe_file ├── 10 │ ├── baidu.html │ ├── bus_line.json │ ├── chinaunix_login.html │ ├── renren.html │ └── set_proxy.html ├── 11 │ ├── code.png │ ├── code1.png │ ├── code_2.png │ ├── gushi.html │ ├── gushiwen_code │ │ ├── gu.bat │ │ ├── gu.traineddata │ │ ├── train_toBox.bat │ │ ├── 第一轮训练.rar │ │ ├── 第三轮训练.rar │ │ ├── 第二轮训练.rar │ │ └── 第四轮训练.rar │ ├── test │ │ └── 0-9A-Z训练字典 │ │ │ └── gu.traineddata │ └── verify_code │ │ └── verify_code.rar ├── 12 │ └── download │ │ └── test.txt ├── 13 │ └── bus_line.json ├── baidu.png ├── book.json ├── chrome-driver │ └── chromedriver.exe ├── douban.html ├── douban.png ├── douban_d.png ├── hello.txt ├── meinv.png ├── python_postion.csv ├── show.png ├── szchina_page_1.html ├── szchina_page_2.html ├── xinggan │ ├── hpic408_s.jpg │ ├── zzpic12973_s.jpg │ ├── zzpic13004_s.jpg │ ├── zzpic13068_s.jpg │ ├── zzpic13087_s.jpg │ ├── zzpic13131_s.jpg │ ├── zzpic13242_s.jpg │ ├── zzpic13256_s.jpg │ ├── zzpic13424_s.jpg │ ├── zzpic13487_s.jpg │ ├── zzpic13589_s.jpg │ ├── zzpic13628_s.jpg │ ├── zzpic13668_s.jpg │ ├── zzpic13710_s.jpg │ ├── zzpic13772_s.jpg │ ├── zzpic13941_s.jpg │ ├── zzpic14042_s.jpg │ ├── zzpic14131_s.jpg │ ├── zzpic14178_s.jpg │ ├── zzpic14185_s.jpg │ ├── zzpic14298_s.jpg │ ├── zzpic14358_s.jpg │ ├── zzpic14425_s.jpg │ ├── zzpic14458_s.jpg │ ├── zzpic14479_s.jpg │ ├── zzpic14568_s.jpg │ ├── zzpic14603_s.jpg │ ├── zzpic14638_s.jpg │ ├── zzpic14802_s.jpg │ ├── zzpic14872_s.jpg │ ├── zzpic14965_s.jpg │ ├── zzpic15059_s.jpg │ ├── zzpic15084_s.jpg │ ├── zzpic15247_s.jpg │ ├── zzpic15324_s.jpg │ ├── zzpic15420_s.jpg │ ├── zzpic15469_s.jpg │ ├── zzpic15567_s.jpg │ ├── zzpic15608_s.jpg │ ├── zzpic15786_s.jpg │ ├── zzpic15891_s.jpg │ ├── zzpic15920_s.jpg │ ├── zzpic16049_s.jpg │ ├── zzpic16135_s.jpg │ ├── zzpic16191_s.jpg │ ├── zzpic16240_s.jpg │ ├── zzpic16394_s.jpg │ ├── zzpic16406_s.jpg │ ├── zzpic16566_s.jpg │ ├── zzpic16638_s.jpg │ ├── zzpic16686_s.jpg │ ├── zzpic16786_s.jpg │ ├── zzpic16807_s.jpg │ ├── zzpic16817_s.jpg │ ├── zzpic16857_s.jpg │ ├── zzpic16889_s.jpg │ ├── zzpic16921_s.jpg │ ├── zzpic16949_s.jpg │ ├── zzpic17052_s.jpg │ ├── zzpic17175_s.jpg │ ├── zzpic17202_s.jpg │ ├── zzpic17322_s.jpg │ ├── zzpic17359_s.jpg │ ├── zzpic17378_s.jpg │ ├── zzpic17442_s.jpg │ ├── zzpic17558_s.jpg │ ├── zzpic17615_s.jpg │ ├── zzpic17727_s.jpg │ ├── zzpic17778_s.jpg │ ├── zzpic17797_s.jpg │ ├── zzpic17879_s.jpg │ ├── zzpic17946_s.jpg │ ├── zzpic18038_s.jpg │ ├── zzpic18089_s.jpg │ ├── zzpic18110_s.jpg │ ├── zzpic18144_s.jpg │ ├── zzpic18308_s.jpg │ ├── zzpic18433_s.jpg │ ├── zzpic18631_s.jpg │ └── zzpic18883_s.jpg └── xpath.html ├── fillder.py ├── meizhuo_crawler.py ├── pictureCrawler ├── PictureDown.py ├── depthPicCrawler.py ├── informationMark.py └── multiPicDown.py ├── tesseract训练模型 ├── 0-9A-Z.png ├── README.md ├── combine.png └── oriCode.png └── zhilianCrawler.py /10-Requests.py: -------------------------------------------------------------------------------- 1 | """ 2 | Requests库:安装 pip install requests 3 | 官方文档: 4 
| http://cn.python-requests.org/zh_CN/latest/ 5 | what to do? 6 | 与urllib功能相似 7 | get 请求 8 | 定制头部 -requests.get(url=url,headers=headers,params=data) 9 | 响应对象 10 | r.text 字符串形式查看响应 11 | r.content 字符类型查看响应 12 | r.encoding 查看或者设置编码类型 13 | r.status_code 查看响应状态 14 | r.headers 查看响应头部 15 | r.url 查看请求url 16 | r.json 查看json数据 17 | 18 | post 请求 19 | 必应翻译 20 | requests.post(url=url,headers=headers,data=data) 21 | ajax、get、post 22 | 和上面是一样的 23 | 代理 24 | requests.get(url=url,headers=headers,proxies=proxy) 25 | cookie 26 | 实现人人登陆 27 | 留坑: 28 | 教程中的chinaunix改版并且难以登陆操作,在此跳过 29 | 如有解决方法,请联系我 30 | """ 31 | 32 | import requests 33 | 34 | 35 | # 带头部的Requests应用 36 | url = 'http://www.baidu.com/' 37 | headers = { 38 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' 39 | 'AppleWebKit/537.36 (KHTML, like Gecko) ' 40 | 'Chrome/75.0.3770.142 Safari/537.36', 41 | } 42 | request = requests.get(url=url,headers=headers) 43 | 44 | request.encoding = 'utf-8' 45 | # print(request.text) 46 | 47 | # 带参数的get 48 | # https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=中国 49 | ''' 50 | 坑:一开始url用 'http://www.baidu.com/' 这个, 51 | 结果在构造搜索请求时返回的一直是百度首页,尬住,请用下面这个 52 | ''' 53 | url = 'http://www.baidu.com/s?' 54 | data = { 55 | 'ie':'utf-8', 56 | 'wd':'中国' 57 | } 58 | request = requests.get(url=url,headers=headers,params=data) 59 | request.encoding = 'utf-8' 60 | 61 | with open('exe_file/10/baidu.html','wb') as fp: 62 | fp.write(request.content) 63 | 64 | 65 | 66 | 67 | # post请求:必应翻译实战 68 | url = 'https://cn.bing.com/tlookupv3?isVertical=1&&' \ 69 | 'IG=B25CDCC5FE9D4B2EA382D628AFEAFDCD&IID=translator.5028.5' 70 | # 构造表单 71 | data = { 72 | 'from': 'zh-Hans', 73 | 'to': 'en', 74 | 'text': 'compute', 75 | } 76 | """ 77 | request = requests.post(url=url,headers=headers,data=data) 78 | # request.encoding = 'utf-8' 79 | print(request.json()) 80 | 81 | 82 | # 代理的使用 83 | url = 'https://www.baidu.com/s?ie=utf-8&f=8&' \ 84 | 'rsv_bp=1&rsv_idx=1&tn=baidu&wd=ip' 85 | proxy = { 86 | 'http':'http://113.54.153.217:1080' 87 | } 88 | request = requests.get(url=url,headers=headers,proxies=proxy) 89 | request.encoding = 'utf-8' 90 | with open('exe_file/10/set_proxy.html','wb') as fp: 91 | fp.write(request.content) 92 | """ 93 | 94 | """ 95 | # 带cookie登陆 96 | # 创建一个会话session,用于保存cookie信息,后续的请求利用session来发送 97 | session = requests.Session() 98 | url = 'http://www.renren.com/ajaxLogin/login?1=1' 99 | formdata = { 100 | 'email':'15625266605', 101 | 'icode' :'', 102 | 'origURL':'http://www.renren.com/home', 103 | 'domain':'renren.com', 104 | 'key_id':'1', 105 | 'captcha_type': 'web_login', 106 | 'password': '1162c49a98a09a374364c99e2ad203b82211bc9cfdf8411e3b47d3ae268ec869', 107 | 'rkey': '54fa0fe478cb62a6ae1184e8e15c9dbb', 108 | 'f':'http%3A%2F%2Fwww.renren.com%2F969920379', 109 | } 110 | 111 | request = session.post(url=url,headers=headers,data=formdata) 112 | # print(request.text) 113 | # >>>{"code":true,"homeUrl":"http://www.renren.com/home"} 114 | 115 | # 登陆后访问主页 116 | home_url = 'http://www.renren.com/home' 117 | home_page = session.get(url=home_url,headers=headers) 118 | home_page.encoding = 'utf-8' 119 | with open('exe_file/10/renren.html','wb') as fp: 120 | fp.write(home_page.content) 121 | 122 | """ 123 | 124 | -------------------------------------------------------------------------------- /10.1busPath_Crawler.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from lxml import etree 4 | import time 5 | 6 | # 获取当前时间 7 | localtime = time.asctime( 
time.localtime(time.time()) ) 8 | 9 | url = 'https://shenzhen.8684.cn' 10 | headers = { 11 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' 12 | 'AppleWebKit/537.36 (KHTML, like Gecko) ' 13 | 'Chrome/75.0.3770.142 Safari/537.36', 14 | } 15 | result = [] 16 | # 请求指定url的内容 17 | def handle_request(request_url): 18 | try: 19 | request = requests.get(url=request_url,headers=headers) 20 | request.raise_for_status() 21 | request.encoding = request.apparent_encoding 22 | return request.text 23 | except: 24 | print(request_url + ' get failed') 25 | return 'NULL' 26 | 27 | # 首页导航 28 | def parse_navigation(): 29 | content = handle_request(request_url=url) 30 | tree = etree.HTML(content) 31 | 32 | # 获取以数字开头的连接 33 | number_href_list = tree.xpath('//div[@class="bus_kt_r1"]/a/@href') 34 | # 获取以字母开头的连接 35 | char_href_list = tree.xpath('//div[@class="bus_kt_r2"]/a/@href') 36 | # 将爬取的导航链接列表返回 37 | return number_href_list + char_href_list 38 | 39 | # 爬取以1(数字或字符)开头的某条线路的所有公交线 40 | def parse_singlePath(navi_list): 41 | # 遍历上面的列表,依次发送请求,解析内容,获取每一个页面 42 | for navi in navi_list: 43 | path_url = url + navi 44 | print(path_url) 45 | content = handle_request(request_url=path_url) 46 | # 解析内容,获取每一路公交具体的url 47 | parse_specialroute(content) 48 | 49 | pass 50 | 51 | # 获取每一条具体公交线的链接尾缀及名称 52 | def parse_specialroute(content): 53 | tree = etree.HTML(content) 54 | route_infos = tree.xpath('//div[@class="stie_list"]/a') 55 | # print(len(route_infos)) 56 | for route_info in route_infos: 57 | # 该线路的url后缀 58 | route_suffix = route_info.xpath('.//@href')[0] 59 | # 名称 60 | route_name = route_info.xpath('.//@title')[0] 61 | # print(route_suffix,route_name) 62 | # 获取每一条具体公交线路的具体信息 63 | get_specialroute(route_suffix,route_name) 64 | 65 | #获取每一条具体公交线路的具体信息 66 | def get_specialroute(route_suffix,route_name): 67 | # 请求页面 68 | content = handle_request(url+route_suffix) 69 | tree = etree.HTML(content) 70 | # 公交信息的标签位置 71 | bus_basic_infos = tree.xpath('//div[@class="bus_i_content"]')[0] 72 | 73 | # 获取线路名称、运营时间、票价 74 | bus_name = bus_basic_infos.xpath('./div[@class="bus_i_t1"]/h1/text()')[0]\ 75 | .replace(' ','') # 替换掉特殊编码 76 | bus_runtime = bus_basic_infos.xpath('./p[1]/text()')[0].replace('运行时间:','') 77 | bus_fares = bus_basic_infos.xpath('./p[2]/text()')[0].replace('票价信息:','') 78 | bus_company = bus_basic_infos.xpath('./p[3]/a/text()')[0] 79 | bus_update = bus_basic_infos.xpath('./p[4]/text()')[0].replace('最后更新:','') 80 | # print(bus_name) 81 | # print(bus_runtime) 82 | # print(bus_fares) 83 | # print(bus_company) 84 | # print(bus_update) 85 | 86 | # 获取线路站点 87 | ''' 88 | 坑:原本思路是找到//div[@class="bus_line_site"][1](第一个,也就是起点到终点的单程站集) 89 | 下的--div[@class="bus_site_layer"],但是一直找不到,所以最后直接找后者,这时得到的站集 90 | 是来回的,取列表的1/2,可以得到单程站集 91 | 填坑:实际上是"bus_line_site ",得再加一个空格 92 | ''' 93 | bus_line = tree.xpath('//div[@class="bus_site_layer"]') 94 | length = len(bus_line) 95 | bus_line = bus_line[:int(length/2)] 96 | sites = [] 97 | for line in bus_line: 98 | for site in line.xpath('./div'): 99 | sites.append(site.xpath('./a/text()')[0]) 100 | # print(sites) 101 | 102 | bus_data = { 103 | '线路名称' : bus_name, 104 | '运行时间' : bus_runtime, 105 | '票价信息' : bus_fares, 106 | '运营公司' : bus_company, 107 | '更新时间' : bus_update, 108 | '经过站点' : sites, 109 | } 110 | 111 | # 公交线路放入结果中 112 | result.append(bus_data) 113 | 114 | 115 | 116 | def main(): 117 | # 获取导航页全部的线路(数字字母)开头的url 118 | navi_list = parse_navigation() 119 | 120 | # 爬取以某个(数字或字符)开头的某条线路的所有公交线 121 | parse_singlePath(navi_list) 122 | 123 | # 将bus_data 存入一个result列表,构造<"result":result>键值对并存入一个新字典 124 
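# 示意:最终写出的 bus_line.json 大致形如下面的结构(字段与下方构造的字典一致,具体取值以实际爬取结果为准):
# {
#     "json_name": "深圳公交线路汇总",
#     "updatetime": "Sat Jul 20 ...",
#     "results": [
#         {"线路名称": "...", "运行时间": "...", "票价信息": "...",
#          "运营公司": "...", "更新时间": "...", "经过站点": ["...", "..."]},
#         ...
#     ]
# }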
| # 将字典转成json格式并存入json文件 125 | shenzhen_busLine = { 126 | 'json_name' : '深圳公交线路汇总', 127 | 'updatetime' : localtime, 128 | 'results' : result 129 | } 130 | file = open('exe_file/10/bus_line.json','w',encoding='utf-8') 131 | 132 | """ 133 | json.dump() 134 | 把字典转成json串,并自动写入文件中 135 | dump参数是(字典,文件句柄,indent)。indent用于缩进美化json串的 136 | ensure_ascii=False用于写文件时有unicode时用,正常显示出中文来 137 | """ 138 | json.dump(shenzhen_busLine,file,indent=4,ensure_ascii=False) 139 | if __name__ == '__main__': 140 | main() -------------------------------------------------------------------------------- /11.1pytesser.py: -------------------------------------------------------------------------------- 1 | import pytesseract 2 | from PIL import Image 3 | from PIL import ImageEnhance 4 | """ 5 | tesseract 安装及使用 6 | OCR,即Optical Character Recognition,光学字符识别,是指通过扫描字符,然后通过其形状将其翻译成电子文本的过程。 7 | 对于图形验证码来说,它们都是一些不规则的字符,这些字符确实是由字符稍加扭曲变换得到的内容。 8 | 参考: 9 | Windows安装Tesseract-OCR 4.00并配置环境变量:https://segmentfault.com/a/1190000014086067 10 | 图像文字识别(三):Tesseract4.0训练字库,提高正确识别率:https://blog.csdn.net/a745233700/article/details/80175883 11 | PIL可以做很多和图像处理相关的事情: 12 | 图像归档(Image Archives): 13 | PIL非常适合于图像归档以及图像的批处理任务。你可以使用PIL创建缩略图,转换图像格式,打印图像等等。 14 | 图像展示(Image Display): 15 | PIL较新的版本支持包括Tk PhotoImage,BitmapImage还有Windows DIB等接口。PIL支持众多的GUI框架接口,可以用于图像展示。 16 | 图像处理(Image Processing): 17 | PIL包括了基础的图像处理函数,包括对点的处理,使用众多的卷积核(convolution kernels)做过滤(filter),还有颜色空间的转换。 18 | PIL库同样支持图像的大小转换,图像旋转,以及任意的仿射变换。PIL还有一些直方图的方法,允许你展示图像的一些统计特性。 19 | 这个可以用来实现图像的自动对比度增强,还有全局的统计分析等。 20 | 具体参考: 21 | PIL介绍:https://www.cnblogs.com/lyrichu/p/9124504.html 22 | Python图像处理库PIL的ImageEnhance模块介绍:https://blog.csdn.net/icamera0/article/details/50753705 23 | 24 | ***python+tesseract 训练和破解验证码:https://zhuanlan.zhihu.com/p/40178190 25 | ***介绍了命令行的操作形式:超级详细的Tesseract-OCR样本训练方法https://blog.csdn.net/sylsjane/article/details/83751297 26 | ***tesseract v4.0.0 帮助文档解读:https://blog.csdn.net/qq_32674197/article/details/80744783 27 | ****tesseract_ocr训练字库、合并字库:https://www.imooc.com/article/32331 28 | """ 29 | img = Image.open('exe_file/11/code1.png') 30 | print(img) 31 | 32 | img= img.convert('RGB') 33 | # 颜色调到最暗 34 | enhancer = ImageEnhance.Color(img) 35 | enhancer = enhancer.enhance(0) 36 | # 增加亮度 37 | enhancer = ImageEnhance.Brightness(enhancer) 38 | enhancer = enhancer.enhance(4) 39 | # 增加对比度 40 | enhancer = ImageEnhance.Contrast(enhancer) 41 | enhancer = enhancer.enhance(15) 42 | # 增加图片锐度 43 | enhancer = ImageEnhance.Sharpness(enhancer) 44 | img = enhancer.enhance(25) 45 | # img.show() 46 | 47 | # 转成灰度图片 48 | img = img.convert('L') 49 | # img.show() 50 | #二值化处理 51 | threshold = 140 52 | table=[] 53 | for i in range(256): 54 | if i < threshold: 55 | table.append(0) 56 | else: 57 | table.append(1) 58 | out = img.point(table,'1') 59 | out.show() 60 | # img = img.convert('RGB') 61 | # out.save('exe_file/11/gushiwen_code/35.png','png') 62 | 63 | print(pytesseract.image_to_string(out,lang='gu',config='--psm 7')) -------------------------------------------------------------------------------- /11.2jTessBoxEditor-tesseract.py: -------------------------------------------------------------------------------- 1 | """ 2 | 验证码训练脚本 3 | Author:caixiaoxin 4 | date:2019/7/23 5 | """ 6 | from PIL import ImageEnhance 7 | from PIL import Image 8 | import pytesseract 9 | from bs4 import BeautifulSoup 10 | import os 11 | import requests 12 | 13 | headers = { 14 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' 15 | 'AppleWebKit/537.36 (KHTML, like Gecko) ' 16 | 'Chrome/75.0.3770.142 
Safari/537.36', 17 | } 18 | # 根据训练字库识别验证码 19 | def get_varifyCode()->str: 20 | img = Image.open('exe_file/11/code.png') 21 | # print(img) 22 | img = img.convert('RGB') 23 | # 颜色调到最暗 24 | enhancer = ImageEnhance.Color(img) 25 | enhancer = enhancer.enhance(0) 26 | # 增加亮度 27 | enhancer = ImageEnhance.Brightness(enhancer) 28 | enhancer = enhancer.enhance(2) 29 | # 增加对比度 30 | enhancer = ImageEnhance.Contrast(enhancer) 31 | enhancer = enhancer.enhance(8) 32 | # 增加图片锐度 33 | enhancer = ImageEnhance.Sharpness(enhancer) 34 | img = enhancer.enhance(20) 35 | # img.show() 36 | 37 | # 转成灰度图片 38 | img = img.convert('L') 39 | # img.show() 40 | # 二值化处理 41 | threshold = 140 42 | table = [] 43 | for i in range(256): 44 | if i < threshold: 45 | table.append(0) 46 | else: 47 | table.append(1) 48 | out = img.point(table, '1') 49 | # out.show() 50 | # img = img.convert('RGB') 51 | out.save('exe_file/11/code.png','png') 52 | code = pytesseract.image_to_string(out,lang='gu',config='--psm 7') 53 | code = code.replace(' ','') # 除去空格 54 | return code 55 | 56 | # 下载验证码 57 | def download_code(session): 58 | url = 'https://so.gushiwen.org/user/login.aspx?' \ 59 | 'from=http://so.gushiwen.org/user/collect.aspx' 60 | request = session.get(url=url, headers=headers) 61 | soup = BeautifulSoup(request.text,'lxml') 62 | 63 | ''' 64 | 问题:url相同,为什么每次获取的验证码不同 65 | 同个url下,通过cookie随机生成验证码 66 | 所以需要在获取验证码,登陆这个过程需要建立会话 67 | ''' 68 | img_src = 'https://so.gushiwen.org' + \ 69 | soup.find('img',id='imgCode')['src'] 70 | # print(img_src) 71 | img = session.get(url=img_src,headers=headers) 72 | with open('exe_file/11/code.png','wb') as fp: 73 | fp.write(img.content) 74 | 75 | # 查找表单需要的两个参数 76 | __VIEWSTATE = soup.find('input', id='__VIEWSTATE')['value'] 77 | __VIEWSTATEGENERATOR = soup.find('input', id='__VIEWSTATEGENERATOR')['value'] 78 | 79 | # 识别验证码 80 | code = get_varifyCode() 81 | 82 | return __VIEWSTATE, __VIEWSTATEGENERATOR, code 83 | 84 | # post登陆 85 | def login(__VIEWSTATE, __VIEWSTATEGENERATOR, code, session)->bool: 86 | post_url = 'https://so.gushiwen.org/user/login.aspx?' 
\ 87 | 'from=http%3a%2f%2fso.gushiwen.org%2fuser%2fcollect.aspx' 88 | data = { 89 | '__VIEWSTATE' : __VIEWSTATE, 90 | '__VIEWSTATEGENERATOR' : __VIEWSTATEGENERATOR, 91 | 'from' : 'http://so.gushiwen.org/user/collect.aspx', 92 | 'email' : '15625266605', 93 | 'pwd' : '123456', 94 | 'code' : code, 95 | 'denglu': '登录', 96 | } 97 | # 登陆 98 | request = session.post(url=post_url,headers=headers,data=data) 99 | # print(len(request.text)) 100 | if len(request.text)==35822: 101 | return False 102 | else: 103 | return True 104 | # 实现模拟登陆,如果验证码识别错误,将有误验证码存入 105 | def test_login()->bool: 106 | # 创建会话 107 | session = requests.Session() 108 | # 下载验证码到本地 109 | __VIEWSTATE, __VIEWSTATEGENERATOR, code = download_code(session) 110 | 111 | status = login(__VIEWSTATE, __VIEWSTATEGENERATOR, code ,session) 112 | 113 | if status is not True: 114 | try: 115 | img = Image.open('exe_file/11/code.png') 116 | img.save('exe_file/11/verify_code/{}.png'.format(code), 'png') 117 | except OSError: 118 | pass 119 | return False 120 | else: return True 121 | 122 | # 批量处理验证码图片 123 | def deal_img(): 124 | root = 'exe_file/11/gushiwen_code/' 125 | ind = 0 126 | # 从100张图片中提取出字符样本 127 | for image in os.listdir(root): 128 | img = Image.open(root + image) 129 | img = img.convert('RGB') 130 | # 颜色调到最暗 131 | enhancer = ImageEnhance.Color(img) 132 | enhancer = enhancer.enhance(0) 133 | # 增加亮度 134 | enhancer = ImageEnhance.Brightness(enhancer) 135 | enhancer = enhancer.enhance(2) 136 | # 增加对比度 137 | enhancer = ImageEnhance.Contrast(enhancer) 138 | enhancer = enhancer.enhance(8) 139 | # 增加图片锐度 140 | enhancer = ImageEnhance.Sharpness(enhancer) 141 | img = enhancer.enhance(20) 142 | # img.show() 143 | 144 | # 转成灰度图片 145 | img = img.convert('L') 146 | # img.show() 147 | # 二值化处理 148 | threshold = 140 149 | table = [] 150 | for i in range(256): 151 | if i < threshold: 152 | table.append(0) 153 | else: 154 | table.append(1) 155 | out = img.point(table, '1') 156 | out.save(root+'{}.png'.format(ind),'png') 157 | ind = ind + 1 158 | 159 | if __name__ == '__main__': 160 | # 测试识别准确率 161 | test_num = 200 162 | correct_num = 0 163 | for i in range(test_num): 164 | if test_login() is True: 165 | correct_num += 1 166 | print("准确率{}%".format(correct_num*100/test_num)) 167 | # deal_img() 168 | 169 | 170 | 171 | -------------------------------------------------------------------------------- /11verification_code.py: -------------------------------------------------------------------------------- 1 | """ 2 | 验证码 3 | 登陆古诗文网 4 | 将验证码下载到本地 5 | 在登陆页面中获取表单的两个重要参数 6 | 整个过程在会话状态下进行 7 | """ 8 | 9 | import requests 10 | from bs4 import BeautifulSoup 11 | 12 | headers = { 13 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' 14 | 'AppleWebKit/537.36 (KHTML, like Gecko) ' 15 | 'Chrome/75.0.3770.142 Safari/537.36', 16 | } 17 | def download_code(session): 18 | url = 'https://so.gushiwen.org/user/login.aspx?' 
\ 19 | 'from=http://so.gushiwen.org/user/collect.aspx' 20 | request = session.get(url=url, headers=headers) 21 | soup = BeautifulSoup(request.text,'lxml') 22 | 23 | ''' 24 | 问题:url相同,为什么每次获取的验证码不同 25 | 同个url下,通过cookie随机生成验证码 26 | 所以需要在获取验证码,登陆这个过程需要建立会话 27 | ''' 28 | img_src = 'https://so.gushiwen.org' + \ 29 | soup.find('img',id='imgCode')['src'] 30 | # print(img_src) 31 | img = session.get(url=img_src,headers=headers) 32 | with open('exe_file/11/code.png','wb') as fp: 33 | fp.write(img.content) 34 | 35 | # 查找表单需要的两个参数 36 | __VIEWSTATE = soup.find('input', id='__VIEWSTATE')['value'] 37 | __VIEWSTATEGENERATOR = soup.find('input', id='__VIEWSTATEGENERATOR')['value'] 38 | 39 | return __VIEWSTATE, __VIEWSTATEGENERATOR 40 | 41 | def login(__VIEWSTATE, __VIEWSTATEGENERATOR,session): 42 | post_url = 'https://so.gushiwen.org/user/login.aspx?' \ 43 | 'from=http%3a%2f%2fso.gushiwen.org%2fuser%2fcollect.aspx' 44 | # 提示用户输入验证码 45 | code = input('input verification code:') 46 | data = { 47 | '__VIEWSTATE' : __VIEWSTATE, 48 | '__VIEWSTATEGENERATOR' : __VIEWSTATEGENERATOR, 49 | 'from' : 'http://so.gushiwen.org/user/collect.aspx', 50 | 'email' : '15625266605', 51 | 'pwd' : '123456', 52 | 'code' : code, 53 | 'denglu': '登录', 54 | } 55 | # 登陆并且将页面写入文件 56 | request = session.post(url=post_url,headers=headers,data=data) 57 | print(len(request.text)) 58 | with open('exe_file/11/gushi_error.html','w',encoding='utf-8') as file: 59 | file.write(request.text) 60 | def main(): 61 | # 创建会话 62 | session = requests.Session() 63 | # 下载验证码到本地 64 | __VIEWSTATE, __VIEWSTATEGENERATOR = download_code(session) 65 | 66 | login(__VIEWSTATE, __VIEWSTATEGENERATOR,session) 67 | if __name__ == '__main__': 68 | main() 69 | -------------------------------------------------------------------------------- /12video.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | headers = { 4 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' 5 | 'AppleWebKit/537.36 (KHTML, like Gecko) ' 6 | 'Chrome/75.0.3770.142 Safari/537.36', 7 | } 8 | 9 | 10 | # e:下载视频 11 | """ 12 | tip:视频播放窗口是无法右键F12的,正确的做法是在暂停、倍数的功能栏进入开发者模式,就可以简单获取视频的url 13 | """ 14 | url = 'http://v1-default.ixigua.com/0675cf76b8a56330683ebbae99e4986e/5d3bbed0/video/m/' \ 15 | '220ed97da2708af47afa4bb16d59e4eba1f116131fb7000082d6359fa977/?rc=amd1NDY0dTtpajM' \ 16 | 'zPDczM0ApQHRAbzw7NTs6MzgzMzUzNDUzNDVvQGg2dilAZzN3KUBmM3UpZHNyZ3lrdXJneXJseHdmOzpAa' \ 17 | 'C1wNGtqMG9rXy0tLS0vc3MtbyNvIy8uMy0wMy4uMC4tNDQ2LTojbyM6YS1vIzpgLXAjOmB2aVxiZitgXmJmK15xbDojMy5e' 18 | 19 | r = requests.get(url=url,headers=headers) 20 | 21 | with open('exe_file/12/1.mp4','wb') as file: 22 | file.write(r.content) 23 | 24 | 25 | ''' 26 | 首先向365yg.com发送请求 27 | 获取响应,解析响应,将里面所有的标题链接获取到 28 | 依次向每个标题链接发送请求 29 | 获取响应,解析响应,获取video标签的src属性 30 | 向src属性发送请求,获取响应,将内容保存到本地 31 | ''' 32 | 33 | # 爬取主页的推荐视频 34 | from lxml import etree 35 | import json 36 | from selenium import webdriver 37 | from selenium.webdriver.chrome.options import Options 38 | import time 39 | 40 | # 请求指定url的内容 41 | def handle_request(request_url): 42 | try: 43 | request = requests.get(url=request_url,headers=headers) 44 | request.raise_for_status() 45 | request.encoding = request.apparent_encoding 46 | return request 47 | except: 48 | print(request_url + ' get failed') 49 | return 'NULL' 50 | 51 | # 解析视频页,获取视频的url 52 | def handle_href(a_href)->str: 53 | # 通过chrome-headless解决 54 | path = r'exe_file/chromedriver.exe' 55 | chrome_options = Options() 56 | chrome_options.add_argument('--headless') 57 
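# 可选(示意,按需添加):无头模式下部分页面会按小窗口布局渲染,必要时可以再指定窗口大小,例如
# chrome_options.add_argument('--window-size=1920,1080')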
| chrome_options.add_argument('--disable-gpu') # 上面三行代码就是为了将Chrome不弹出界面,实现无界面爬取 58 | browser = webdriver.Chrome(path, options=chrome_options) 59 | browser.get(a_href) 60 | time.sleep(3) 61 | # 获取源码,生成tree对象,然后查找video里面的src属性 62 | ''' 63 | code:tree = etree.HTML(browser.page_source,'lxml) 64 | TypeError: Argument 'parser' has incorrect type (expected lxml.etree._BaseParser, got str) 65 | 去掉lxml完美解决 66 | ''' 67 | # 利用xpath获取视频的url 68 | tree = etree.HTML(browser.page_source) 69 | video_src = tree.xpath('//video/@src')[0] 70 | browser.close() 71 | return video_src 72 | 73 | # 获取主页的视频信息 74 | def handle_title(widen:int): 75 | # json内容会根据widen属性变化 76 | basic_url = 'http://365yg.com/api/pc/feed/?max_behot_time=1564196117&category=video_new&utm_source=toutiao' \ 77 | '&widen={}&tadrequire=true&as=A125ED93CB0BDA9&cp=5D3B3BBDAA498E1&_signature=.sLedBAXpAP3jqRhTQlB7.7C3m' 78 | # 获取请求 79 | request = handle_request(basic_url.format(widen)) 80 | # 解析json数据 81 | json_obj = json.loads(request.text) 82 | # 取出与视频相关的数据,data是一个字典元素的列表,每个元素都是一个视频的所有信息 83 | data = json_obj['data'] 84 | # 循环data列表,依次取出每一个视频信息 85 | for video_data in data: 86 | title = video_data['title'] 87 | a_href = 'http://365yg.com' + video_data['source_url'] 88 | print('downloading~...' + title) 89 | video_src = handle_href(a_href) 90 | # print(video_src) 91 | ''' 92 | 调用写好的函数,下载速度会慢很多 93 | request = handle_request(video_src) 94 | with open('exe_file/12/download/{}.mp4'.format(title), 'wb') as file: 95 | file.write(request.content) 96 | ''' 97 | r = requests.get(url=url, headers=headers) 98 | with open('exe_file/12/download/{}.mp4'.format(title), 'wb') as file: 99 | file.write(r.content) 100 | print('finish') 101 | def main(): 102 | handle_title(1) 103 | if __name__ == '__main__': 104 | main() -------------------------------------------------------------------------------- /13.1thread_ood.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import time 3 | # 写一个类,继承自threading.Thread 4 | class Singthread(threading.Thread): 5 | def __init__(self, name, a): 6 | super().__init__() 7 | self.name = name 8 | self.a = a 9 | def run(self): 10 | for x in range(1, 6): 11 | print('I am sing') 12 | time.sleep(1) 13 | 14 | class Dancethread(threading.Thread): 15 | def __init__(self, name, a): 16 | super().__init__() 17 | self.name = name 18 | self.a = a 19 | def run(self): 20 | for x in range(1, 6): 21 | print('I am dancing') 22 | time.sleep(1) 23 | 24 | def main(): 25 | # create thread 26 | tsing = Singthread('sing', 'cai') 27 | tdance = Dancethread('dance', 'crayon') 28 | 29 | # start thread 30 | tsing.start() 31 | tdance.start() 32 | 33 | 34 | # waiting thread end 35 | tsing.join() 36 | tdance.join() 37 | 38 | 39 | print('I am Main') 40 | 41 | 42 | if __name__ == '__main__': 43 | main() 44 | -------------------------------------------------------------------------------- /13.2thread_queue.py: -------------------------------------------------------------------------------- 1 | from queue import Queue 2 | 3 | # create queue 4 | q = Queue(5) 5 | # store data 6 | q.put('c') 7 | q.put('c++') 8 | q.put('python') 9 | q.put('java') 10 | q.put('matlab') 11 | # q.put('markdown', True, 3) 12 | # q.put('markdown', False) 13 | # q.put('markdown') 14 | 15 | # get data 16 | # 先进先出 17 | print(q.get()) 18 | print(q.get()) 19 | print(q.get()) 20 | print(q.get()) 21 | print(q.get()) 22 | 23 | print(q.get()) # 队空阻塞 24 | -------------------------------------------------------------------------------- 
/13.3Mthread_crawler.py: -------------------------------------------------------------------------------- 1 | """ 2 | 多线程爬虫 3 | 分析 4 | 两类线程:下载、解析 5 | 内容队列:下载线程往队列中put数据,解析线程从队列get数据 6 | 数据 7 | url队列:下载线程从url队列中get数据 8 | 写数据:上锁 9 | 10 | """ 11 | import threading 12 | import requests 13 | import json 14 | from lxml import etree 15 | from queue import Queue 16 | import time 17 | import timeit 18 | 19 | # 队空退出标志 20 | navi_EXIT = False 21 | line_EXIT = False 22 | route_EXIT = False 23 | data_EXIT = False 24 | 25 | # 获取当前时间 26 | localtime = time.asctime( time.localtime(time.time()) ) 27 | 28 | url = 'https://shenzhen.8684.cn' 29 | headers = { 30 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' 31 | 'AppleWebKit/537.36 (KHTML, like Gecko) ' 32 | 'Chrome/75.0.3770.142 Safari/537.36', 33 | } 34 | # 请求指定url的内容 35 | def handle_request(request_url): 36 | try: 37 | request = requests.get(url=request_url,headers=headers) 38 | request.raise_for_status() 39 | request.encoding = request.apparent_encoding 40 | return request.text 41 | except: 42 | print(request_url + ' get failed') 43 | return 'NULL' 44 | 45 | # 首页导航 46 | def parse_navigation(): 47 | content = handle_request(request_url=url) 48 | tree = etree.HTML(content) 49 | 50 | # 获取以数字开头的连接 51 | number_href_list = tree.xpath('//div[@class="bus_kt_r1"]/a/@href') 52 | # 获取以字母开头的连接 53 | char_href_list = tree.xpath('//div[@class="bus_kt_r2"]/a/@href') 54 | # 将爬取的导航链接列表返回 55 | return number_href_list + char_href_list 56 | 57 | # 获取页面线程类-获取关键字页面 58 | class crawlerThread_getLine(threading.Thread): 59 | def __init__(self, threadName, naviQueue, lineQueue): 60 | super(crawlerThread_getLine, self).__init__() 61 | 62 | self.threadName = threadName 63 | self.naviQueue = naviQueue 64 | self.lineQueue = lineQueue 65 | def run(self): 66 | # 需保证主线程中队列空才能退出 67 | while not navi_EXIT: 68 | try: 69 | navi = self.naviQueue.get(False) # 设置False是为了避免队空阻塞死循环现象 70 | content = handle_request(url + navi) 71 | self.lineQueue.put(content) 72 | except: 73 | pass 74 | 75 | # 解析线程-将关键字页面中对应的线路url及名称解析出来 76 | class parseThread_getline(threading.Thread): 77 | def __init__(self, threadName, lineQueue, routeQueue): 78 | super(parseThread_getline, self).__init__() 79 | self.threadName = threadName 80 | self.lineQueue = lineQueue 81 | self.routeQueue = routeQueue 82 | def parse(self, content): 83 | tree = etree.HTML(content) 84 | route_infos = tree.xpath('//div[@class="stie_list"]/a') 85 | # print(len(route_infos)) 86 | for route_info in route_infos: 87 | # 该线路的url后缀 88 | route_suffix = route_info.xpath('.//@href')[0] 89 | # 名称 90 | route_name = route_info.xpath('.//@title')[0] 91 | # print(route_suffix,route_name) 92 | self.routeQueue.put((route_suffix, route_name)) 93 | 94 | def run(self): 95 | while not line_EXIT: 96 | try: 97 | content = self.lineQueue.get(False) 98 | self.parse(content) 99 | except: 100 | pass 101 | 102 | # 获取页面线程-获取具体线路信息页 103 | class crawlerThread_getRoute(threading.Thread): 104 | def __init__(self, threadName, routeQueue, dataQueue): 105 | super(crawlerThread_getRoute, self).__init__() 106 | 107 | self.threadName = threadName 108 | self.routeQueue = routeQueue 109 | self.dataQueue = dataQueue 110 | 111 | def run(self): 112 | while not route_EXIT: 113 | try: 114 | route_suffix, route_name = self.routeQueue.get(False) 115 | content = handle_request(url + route_suffix) 116 | # print(content) 117 | self.dataQueue.put(content) 118 | except: 119 | pass 120 | 121 | # 解析线程-解析具体线路的信息 122 | class parseThread_getRoute(threading.Thread): 123 | def __init__(self, 
threadName, dataQueue, result, lock): 124 | super(parseThread_getRoute, self).__init__() 125 | self.threadName = threadName 126 | self.dataQueue = dataQueue 127 | self.result = result 128 | self.lock = lock 129 | def parse(self, content): 130 | tree = etree.HTML(content) 131 | # 公交信息的标签位置 132 | bus_basic_infos = tree.xpath('//div[@class="bus_i_content"]')[0] 133 | 134 | # 获取线路名称、运营时间、票价 135 | bus_name = bus_basic_infos.xpath('./div[@class="bus_i_t1"]/h1/text()')[0] \ 136 | .replace(' ', '') # 替换掉特殊编码 137 | bus_runtime = bus_basic_infos.xpath('./p[1]/text()')[0].replace('运行时间:', '') 138 | bus_fares = bus_basic_infos.xpath('./p[2]/text()')[0].replace('票价信息:', '') 139 | bus_company = bus_basic_infos.xpath('./p[3]/a/text()')[0] 140 | bus_update = bus_basic_infos.xpath('./p[4]/text()')[0].replace('最后更新:', '') 141 | # print(bus_name) 142 | # print(bus_runtime) 143 | # print(bus_fares) 144 | # print(bus_company) 145 | # print(bus_update) 146 | 147 | # 获取线路站点 148 | ''' 149 | 坑:原本思路是找到//div[@class="bus_line_site"][1](第一个,也就是起点到终点的单程站集) 150 | 下的--div[@class="bus_site_layer"],但是一直找不到,所以最后直接找后者,这时得到的站集 151 | 是来回的,取列表的1/2,可以得到单程站集 152 | 填坑:实际上是"bus_line_site ",得再加一个空格 153 | ''' 154 | bus_line = tree.xpath('//div[@class="bus_site_layer"]') 155 | length = len(bus_line) 156 | bus_line = bus_line[:int(length / 2)] 157 | sites = [] 158 | for line in bus_line: 159 | for site in line.xpath('./div'): 160 | sites.append(site.xpath('./a/text()')[0]) 161 | # print(sites) 162 | 163 | bus_data = { 164 | '线路名称': bus_name, 165 | '运行时间': bus_runtime, 166 | '票价信息': bus_fares, 167 | '运营公司': bus_company, 168 | '更新时间': bus_update, 169 | '经过站点': sites, 170 | } 171 | # print(bus_data) 172 | with self.lock: 173 | # 公交线路放入结果中 174 | self.result.append(bus_data) 175 | print('\r' + self.threadName + '-当前已爬取线路数量:' + str(len(self.result)),end=' ') 176 | def run(self): 177 | while not data_EXIT: 178 | try: 179 | content = self.dataQueue.get(False) 180 | self.parse(content) 181 | except: 182 | pass 183 | 184 | def main(): 185 | 186 | # 初始化队列 187 | naviQueue = Queue() 188 | lineQueue = Queue() 189 | routeQueue = Queue() 190 | dataQueue = Queue() 191 | 192 | result = [] 193 | # 设置锁 194 | # 但好像没必要,因为list本来就是线程安全??? 
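# 补充:在 CPython 中受 GIL 影响,单次 list.append() 本身是原子的;
# 但这里加锁把“append 结果 + 打印进度”作为一个整体保护,避免多个解析线程交错输出,
# 也不依赖具体解释器实现,保留锁更稳妥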
195 | lock = threading.Lock() 196 | 197 | # 获取导航页面 198 | navi_list = parse_navigation() 199 | for navi in navi_list: 200 | naviQueue.put(navi) 201 | 202 | # ------------------------------------------------------------------------------------------- 203 | # 开启获取线程 204 | craw_getLine = ['craw-getLine' + str(i) for i in range(16)] 205 | # craw_getLine = ['craw-getLine1', 'craw-getLine2', 'craw-getLine3', 'craw-getLine4'] 206 | craw_getLine_Threads = [] 207 | for threadName in craw_getLine: 208 | thread = crawlerThread_getLine(threadName, naviQueue, lineQueue) 209 | thread.start() 210 | craw_getLine_Threads.append(thread) 211 | 212 | #------------------------------------------------------------------------------------------- 213 | # 开启解析线程-获取以某个关键字开头的所有线路概要信息 214 | parse_getLine = ['parse-getLine' + str(i) for i in range(16)] 215 | # parse_getLine = ['parse-getLine1', 'parse-getLine2', 'parse-getLine3', 'parse-getLine4'] 216 | parse_getLine_Threads = [] 217 | for threadName in parse_getLine: 218 | thread = parseThread_getline(threadName, lineQueue, routeQueue) 219 | thread.start() 220 | parse_getLine_Threads.append(thread) 221 | 222 | # ------------------------------------------------------------------------------------------- 223 | # 开启获取线程-获取具体的线路信息页 224 | craw_getRoute = ['craw-getRoute' + str(i) for i in range(16)] 225 | # craw_getRoute = ['craw-getRoute1', 'craw-getRoute2', 'craw-getRoute3', 'craw-getRoute4'] 226 | craw_getRoute_Threads = [] 227 | for threadName in craw_getRoute: 228 | thread = crawlerThread_getRoute(threadName, routeQueue, dataQueue) 229 | thread.start() 230 | craw_getRoute_Threads.append(thread) 231 | 232 | # ------------------------------------------------------------------------------------------- 233 | parse_getRoute = ['parse-getRoute' + str(i) for i in range(16)] 234 | # parse_getRoute = ['parse-getRoute1', 'parse-getRoute2', 'parse-getRoute3', 'parse-getRoute4'] 235 | parse_getRoute_Threads = [] 236 | for threadName in parse_getRoute: 237 | thread = parseThread_getRoute(threadName, dataQueue, result, lock) 238 | thread.start() 239 | parse_getRoute_Threads.append(thread) 240 | 241 | """ 242 | while......... 243 | for......... 
244 | .join() 245 | 以上结构在下面一共设置四个,起到阻塞作用 246 | 主线程队空才是真正的队空情况,防止子线程在暂时队空的状态下退出 247 | """ 248 | #----------------------------------------------------------------------------------------------- 249 | while not naviQueue.empty(): 250 | pass 251 | 252 | global navi_EXIT 253 | navi_EXIT = True 254 | print('\rnaviQueue empty!',end='') 255 | 256 | for thread in craw_getLine_Threads: 257 | thread.join() 258 | #------------------------------------ 259 | 260 | while not lineQueue.empty(): 261 | pass 262 | 263 | global line_EXIT 264 | line_EXIT = True 265 | print('\rlineQueue empty!',end='') 266 | 267 | for thread in parse_getLine_Threads: 268 | thread.join() 269 | #------------------------------------ 270 | 271 | while not routeQueue.empty(): 272 | pass 273 | 274 | global route_EXIT 275 | route_EXIT = True 276 | print('\rrouteQueue empty!',end='') 277 | 278 | for thread in craw_getRoute_Threads: 279 | thread.join() 280 | #----------------------------------- 281 | 282 | while not dataQueue.empty(): 283 | pass 284 | 285 | global data_EXIT 286 | data_EXIT = True 287 | print('\rdataQueue empty!',end='') 288 | 289 | for thread in parse_getRoute_Threads: 290 | thread.join() 291 | #----------------------------------- 292 | 293 | 294 | # 将bus_data 存入一个result列表,构造<"result":result>键值对并存入一个新字典 295 | # 将字典转成json格式并存入json文件 296 | shenzhen_busLine = { 297 | 'json_name': '深圳公交线路汇总', 298 | 'updatetime': localtime, 299 | 'results': result 300 | } 301 | file = open('exe_file/13/bus_line.json', 'w', encoding='utf-8') 302 | 303 | """ 304 | json.dump() 305 | 把字典转成json串,并自动写入文件中 306 | dump参数是(字典,文件句柄,indent)。indent用于缩进美化json串的 307 | ensure_ascii=False用于写文件时有unicode时用,正常显示出中文来 308 | """ 309 | json.dump(shenzhen_busLine, file, indent=4, ensure_ascii=False) 310 | file.close() 311 | 312 | if __name__ == '__main__': 313 | # 比单线程的程序快5倍左右 314 | start = timeit.default_timer() 315 | main() 316 | print('\ntime:' + str(timeit.default_timer() - start)) 317 | -------------------------------------------------------------------------------- /13multiThread.py: -------------------------------------------------------------------------------- 1 | """ 2 | 多线程 3 | 面向过程 4 | t = threading.Thread(target=xxx(函数),name=xxx,args=(xx,xx)) 5 | target :线程启动之后要执行的函数 6 | name:线程的名字 7 | 获取线程名字:threading.current_thread().name 8 | args:主线程向子线程传递参数 9 | t.start():启动线程 10 | t.join():让主线程等待子线程结束 11 | 面向对象 12 | 定义一个类,继承自threading.Thread,重写一个方法run(), 13 | 需要线程名字、传递参数,重写构造方法,在重写构造方法的时候,主动调用父类的构造方法 14 | 线程同步问题 15 | 线程之间共享全局变量,很容易发生数据紊乱现象 16 | 使用线程锁解决 17 | 抢锁,谁抢到,谁先上锁,谁就先使用 18 | 创建锁 19 | suo = threading.Lock() 20 | 上锁 21 | suo = acquire() 22 | 释放锁 23 | suo.release() 24 | 25 | 队列(queue) 26 | 下载线程 27 | 解析线程,通过队列进行交互 28 | q = Queue(size) 29 | q.put('xxx')-如果队列满,程序卡在这里等待 30 | q.put(xxx,False)-如果队列满,程序直接报错 31 | q.put(xxx,True,3)-如果队列满,等待三秒再报错 32 | 33 | 获取数据 34 | q.get() 35 | q.get(False) 队空取元素直接报错 36 | q.get(True, 3) 队列空,程序等待3s报错 37 | 38 | q.empty() 判断队列是否满 39 | q.full() 判断队列是否已满 40 | q.qsize() 获取队列长度 41 | """ 42 | import threading 43 | import time 44 | # 一个主线程,一个唱歌,一个跳舞线程 45 | 46 | # TypeError: sing() takes 0 positional arguments but 1 was given 47 | # 需要接收参数a 48 | def sing(a): 49 | print(threading.current_thread().name, a) 50 | for x in range(1, 6): 51 | print('I am sing') 52 | time.sleep(1) 53 | def dance(a): 54 | print(threading.current_thread().name, a) 55 | for x in range(1, 6): 56 | print('I am dancing') 57 | time.sleep(1) 58 | def main(): 59 | a = 'superman' 60 | # 创建唱歌线程 61 | tsing = threading.Thread(target=sing, name="sing", args=(a,)) 62 | # 创建跳舞线程 63 | 
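# 补充:args 需要传入元组(或序列),只有一个参数时要写成 (a,) 并带上逗号;
# 写成 (a) 只是加了括号的变量本身,run 时按 *args 展开会因参数个数不符而报 TypeError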
tdance = threading.Thread(target=dance, name="dance", args=(a,)) 64 | # 启动线程 65 | tsing.start() 66 | tdance.start() 67 | # 让主线程等待子线程结束之后在结束 68 | tsing.join() 69 | tsing.join() 70 | """ 71 | 先让子线程停 72 | 再让主线程停止 73 | """ 74 | print("I am Main") 75 | 76 | if __name__ == '__main__': 77 | main() -------------------------------------------------------------------------------- /1urllib_base.py: -------------------------------------------------------------------------------- 1 | """ 2 | code by python 3.7.2 3 | utf-8 4 | caixiaoxin 5 | index: 1 6 | """ 7 | 8 | 9 | """ 10 | urllib.request: 11 | urlopen:打开url 12 | urlretrieve(url,file_name):打开并保存url内容 13 | urllib.parse: 14 | quote(): url编码函数,将中文进行转化为%xxx 15 | unquote():url解码函数,将%xxx转化为指定字符 16 | urlencode():非法字符转码 17 | response: 18 | read() 读取字节类型 19 | geturl() 获取请求url 20 | getheaders() 21 | getcode() 22 | readlines() 23 | """ 24 | """ 25 | 字符串->二进制:encode() 26 | 二进制->字符串:decode() 27 | 默认utf8 28 | """ 29 | 30 | 31 | 32 | 33 | import urllib.request 34 | url = 'http://www.baidu.com/' 35 | response = urllib.request.urlopen(url=url) 36 | """ 37 | print(response) 38 | print(response.geturl()) 39 | print(response.getheaders()) 40 | print(response.getcode()) 41 | """ 42 | # print(response.read().decode()) 43 | # 读取的url内容存储 44 | with open('baidu.html','w',encoding='utf8') as file: 45 | file.write(response.read().decode()) 46 | """ 47 | 等同上方 48 | 只不过上述用utf8写入 49 | 在此用二进制写入 50 | 图片用这个! 51 | with open('baidu_1.html','wb') as flie: 52 | file.write(response.read()) 53 | """ 54 | 55 | 56 | 57 | 58 | 59 | 60 | # urlretrieve(url,file_name) 61 | picurl = "https://timgsa.baidu.com/timg?image&quality=80&" \ 62 | "size=b9999_10000&sec=1551421909555&di=9f9d69abb9fe596f493f9c6e3e52f08e&imgtype=0&" \ 63 | "src=http%3A%2F%2Fgss0.baidu.com%2F9vo3dSag _xI4khGko9WTAnF6hhy%2Fzhidao%2Fpic%2Fitem%" \ 64 | "2Fb151f8198618367a039b78422c738bd4b31ce51b.jpg" 65 | 66 | """ 67 | # 创建写入文件一条龙服务 68 | # urllib.request.urlretrieve(picurl,'ironMan.jpg') 69 | """ 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | import urllib.parse 78 | 79 | # url中若出现 $ 空格 中文等,就要对其进行编码 80 | url = 'http://www.baidu/index.html?name=钢铁侠&pwd=123456' 81 | ret = urllib.parse.quote(url) 82 | re = urllib.parse.unquote(url) 83 | re_1 = urllib.parse.unquote(picurl) 84 | print(ret) 85 | print(re) 86 | print(re_1) 87 | 88 | """ 89 | urllib.parse.urlencode 的应用! 90 | """ 91 | url = 'http://www.baidu.com/index.html' 92 | # 构造 http://www.baidu.co/index.html?name=goudan&age=18&sex=nv&height=180 93 | name = '钢铁侠' 94 | age = 18 95 | sex = 'nv' 96 | height = "180" 97 | 98 | data = { 99 | 'name' : name, 100 | 'age' : age, 101 | 'sex' : sex, 102 | 'height' : height, 103 | 'weight' : 180, 104 | } 105 | # 具有非法字符的自动转换功能 106 | construct_url = urllib.parse.urlencode(data) 107 | print(construct_url) 108 | construct_url = url + '?' + construct_url 109 | print(construct_url) 110 | 111 | # example:植入搜索关键字 112 | import urllib.parse 113 | baidu = 'http://www.baidu.com/s?' 114 | word = input('input the key you want:') 115 | _data = { 116 | 'ie' : 'utf-8', 117 | 'wd' : word, 118 | } 119 | # 非法字符转码 120 | query_string = urllib.parse.urlencode(_data) 121 | baidu += query_string 122 | response = urllib.request.urlopen(baidu) 123 | filename = word + '.html' 124 | with open(filename,'wb') as file: 125 | file.write(response.read()) 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | # 伪装UA 134 | # 构建请求对象:urllib.request.Request(self,url,data=None,headers={},...) 
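# 示意:除了在构造 Request 时传入 headers,也可以先创建对象再用 add_header 补充请求头,两种写法等价:
#     request = urllib.request.Request(url)
#     request.add_header('User-Agent', 'Mozilla/5.0 ...')
# 下面采用构造时直接传 headers 的写法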
135 | url = 'http://www.baidu.com/' 136 | headers = { 137 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36', 138 | } 139 | request = urllib.request.Request(url = url, headers=headers) 140 | 141 | response = urllib.request.urlopen(request) 142 | -------------------------------------------------------------------------------- /2ajax.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | author:caixiaoxin 4 | Ajax 即“Asynchronous Javascript And XML”(异步 JavaScript 和 XML),是指一种创建交互式网页应用的网页开发技术。 5 | Ajax = 异步 JavaScript 和 XML 或者是 HTML(标准通用标记语言的子集)。 6 | Ajax 是一种用于创建快速动态网页的技术。 7 | Ajax 是一种在无需重新加载整个网页的情况下,能够更新部分网页的技术。 8 | 通过在后台与服务器进行少量数据交换,Ajax 可以使网页实现异步更新。这意味着可以在不重新加载整个网页的情况下,对网页的某部分进行更新 9 | """ 10 | 11 | 12 | import urllib.parse 13 | import urllib.request 14 | 15 | 16 | 17 | 18 | 19 | """ 20 | 豆瓣爬取 21 | """ 22 | #如果url链接中出现我们要调配的参数并赋值常数,需要删除相关参数部分 23 | url = "https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&" 24 | page = 1 25 | limit = 1 26 | # 构建post表单 27 | data = { 28 | 'start':(page-1)*limit, 29 | 'limit':limit, 30 | } 31 | headers = { 32 | 'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36", 33 | } 34 | 35 | """ 36 | request = urllib.request.Request(url=url,headers=headers) 37 | data = urllib.parse.urlencode(data).encode() 38 | response = urllib.request.urlopen(request,data = data) 39 | """ 40 | 41 | # 等价于三部曲 42 | query_string = urllib.parse.urlencode(data) 43 | url += query_string 44 | 45 | request = urllib.request.Request(url = url,headers=headers) 46 | response = urllib.request.urlopen(request) 47 | 48 | print(response.read().decode()) 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | """ 65 | kfc爬取 66 | """ 67 | post_url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=cname' 68 | form_data = { 69 | 'cname': '深圳', 70 | 'pid': '', 71 | 'pageIndex': '2', 72 | 'pageSize': '10', 73 | } 74 | 75 | request = urllib.request.Request(url = post_url,headers=headers) 76 | form_data = urllib.parse.urlencode(form_data).encode() 77 | response = urllib.request.urlopen(request,data = form_data) 78 | print(response.read().decode()) 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | """ 95 | 贴吧爬取 96 | """ 97 | 98 | # tieba_url = 'http://tieba.baidu.com/f?ie=utf-8&kw=python&red_tag=d3356873073' 99 | tieba_url = 'http://tieba.baidu.com/f?ie=utf-8' 100 | # 页码变化参数 pn 101 | # pn按50递增 102 | 103 | 104 | for page in range(0,10): 105 | form_data = { 106 | 'kw': 'python', 107 | 'pn': (page-1)*50 108 | } 109 | form_data = urllib.parse.urlencode(form_data).encode() 110 | request = urllib.request.Request(url = tieba_url,headers = headers) 111 | response = urllib.request.urlopen(request,data = form_data) 112 | 113 | with open('tiebaPage//'+str(page)+'.html','wb') as file: 114 | file.write(response.read()) 115 | 116 | 117 | 118 | 119 | 120 | """ 121 | 异常处理 122 | """ 123 | import urllib.error 124 | url = 'http://www.maodan.com/' 125 | 126 | #URLerror 127 | try: 128 | response = urllib.request.urlopen(url) 129 | except urllib.error.URLError as e: 130 | print(e) ## 131 | 132 | #HTTPerror 133 | 134 | -------------------------------------------------------------------------------- /4handler.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | Handler处理器,自定义Opener 4 | urlopen() 
给一个url,发送请求,获取响应 5 | Request() 定制请求头,创建请求对象 6 | 高级功能:使用代理,cookie 7 | 8 | 代理: 9 | 配置:浏览器配置:高级-代理设置 10 | 代码配置:就此可以在xici上爬取ip代理,然后随机分配给爬虫,来突破爬取网站所给的爬取频率 11 | 同时也防止自身ip被封 12 | cookie 13 | #服务器端访问网站所留下的识别信息# 14 | 模拟登陆:抓包获取cookie 15 | 通过cookieJar保存模拟登陆所得到的cookie 16 | """ 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | """利用handler和opener获取页面的基本操作""" 25 | import urllib.request 26 | import urllib.parse 27 | 28 | url = 'http://baidu.com/' 29 | 30 | headers = { 31 | 'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36", 32 | } 33 | 34 | # 创建 35 | handler = urllib.request.HTTPHandler() 36 | 37 | # 通过hander创建一个opener,使用opener中的方法发送请求 38 | opener = urllib.request.build_opener(handler) 39 | 40 | # 构建请求对象 41 | request = urllib.request.Request(url,headers=headers) 42 | 43 | # 发送请求 44 | response = opener.open(request) 45 | # print(response.read().decode()) 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | """ 57 | 利用ProxyHandler进行ip伪装 58 | ip伪装会出现问题,留坑 59 | 在西刺网获取ip进行实践成功率非常低 60 | 响应时间过长及ip伪装失败(成功响应但是ip仍为本机) 61 | """ 62 | 63 | url = 'https://baidu.com/s?' 64 | headers = { 65 | 'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36", 66 | } 67 | data = { 68 | 'ie' : 'utf-8', 69 | 'wd' : 'ip', 70 | } 71 | # 构造表单请求链接 72 | query_string = urllib.parse.urlencode(data) 73 | url += query_string 74 | 75 | # 伪装ip格式 76 | handler = urllib.request.ProxyHandler({'http':'121.232.148.73:9000'}) 77 | opener = urllib.request.build_opener(handler) 78 | 79 | request = urllib.request.Request(url,headers=headers) 80 | # 很神奇,三部曲失效,所以只能构造url了 81 | # data = urllib.parse.urlencode(data).encode() 82 | response = opener.open(request) 83 | 84 | with open("ip1.html",'wb') as file: 85 | file.write(response.read()) 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | """ 94 | 利用fiddler抓取cookie实现人人网主页的获取 95 | 并没有抓取登陆时的json因为文件特别特别多 96 | 抓取的json来源于进入主页所发送的表单请求 97 | 98 | 问题:cookie是实时的,所以该方法捕获的cookie需要实时抓包,过期失效 99 | 100 | 该问题引出下一个实例,登陆人人网并进入个人主页 101 | """ 102 | 103 | renren_url = 'http://www.renren.com/969920379/profile' 104 | 105 | headers = { 106 | 'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36", 107 | 'Cookie': 'anonymid=jsrj643rtki53d; depovince=GUZ; _r01_=1; ln_uact=15625266605; ' 108 | 'ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; ick_login=c8e7f87f-75da-4f2f-b840-104909404637; ' 109 | 'first_login_flag=1; JSESSIONID=abcJbn-wkjg1eh_WTZiLw; jebecookies=7817aaaf-eb35-4f05-b071-f3e5f75f07c2|||||;' 110 | ' _de=E00E5A467C4B17304268C536701AF72D; p=c14a2da80879c2daa73cc1a3853720609; t=1e2cbc27606389927d404e61f48774c19; ' 111 | 'societyguester=1e2cbc27606389927d404e61f48774c19; id=969920379; xnsid=b72aafa1; ver=7.0; loginfrom=null; ' 112 | 'wp_fold=0log=[{"hostId":"969920379","targetTag":"name_click","sendUserId":"969920379"}]&requestToken=-1644252112&_rtk=69bfb28a', 113 | } 114 | 115 | request = urllib.request.Request(url = renren_url,headers = headers) 116 | response = urllib.request.urlopen(request) 117 | 118 | with open('renren.html','wb') as file: 119 | file.write(response.read()) 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | """ 133 | http.cookiejar的应用,保存cookie,通过保存的cookie访问主页 134 | 密码是加密过的,所以并不能构造表单实现登陆 135 | 表单的获取需要手动登陆得到post请求 136 | 登陆后,由于cookie提前保存,所以能够登陆该账号的其他页面 137 | """ 138 | 139 | 140 | import http.cookiejar 141 | #模拟真实浏览器,发送完post请求猴,将cookie保存到代码中 142 | 143 | # 
创建一个cookie对象 144 | cj = http.cookiejar.CookieJar() 145 | # 通过cookie创建一个handler 146 | handler = urllib.request.HTTPCookieProcessor(cj) 147 | # 根据handler创建一个opener 148 | opener = urllib.request.build_opener(handler) 149 | 150 | url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2019212154558' 151 | formdata = { 152 | 'email':'15625266605', 153 | 'icode' :'', 154 | 'origURL':'http://www.renren.com/home', 155 | 'domain':'renren.com', 156 | 'key_id':'1', 157 | 'captcha_type': 'web_login', 158 | 'password': '1162c49a98a09a374364c99e2ad203b82211bc9cfdf8411e3b47d3ae268ec869', 159 | 'rkey': '54fa0fe478cb62a6ae1184e8e15c9dbb', 160 | 'f':'http%3A%2F%2Fwww.renren.com%2F969920379', 161 | } 162 | headers = { 163 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' 164 | 'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36' 165 | } 166 | 167 | request = urllib.request.Request(url = url,headers = headers) 168 | formdata = urllib.parse.urlencode(formdata).encode() 169 | response = opener.open(request,data = formdata) 170 | 171 | 172 | 173 | get_url = 'http://www.renren.com/969920379/profile' 174 | 175 | # 进入自己的主页 176 | request = urllib.request.Request(url=get_url,headers = headers) 177 | response = opener.open(request) 178 | 179 | with open('renren.html','wb') as file: 180 | file.write(response.read()) 181 | 182 | 183 | 184 | -------------------------------------------------------------------------------- /5.1正则爬取糗.py: -------------------------------------------------------------------------------- 1 | """ 2 | 5.1 正则表达式-爬取糗图 3 | code utf8 4 | date 03/08/2019 5 | author caixiaoxin 6 | """ 7 | 8 | import urllib.parse 9 | import urllib.request 10 | import re 11 | def download_image(content,file): 12 | # pattern = re.compile(r'
.*?(.*?).*?
',re.S) 13 | """ 14 | re.S 可以使 . 具有匹配换行的功能 15 | 16 | 正则中加入括号,表示匹配的目标字段,也就是想要获取的信息 17 | 18 | pattern 为得到的图片的url 19 | _pattern 为得到图片相应的段子 20 | """ 21 | pattern = re.compile(r'
.*?',re.S) 22 | _pattern = re.compile(r'div class="content".*?(.*?).*?
',re.S) 23 | 24 | """ 25 | findall:返回匹配目标字段的列表 26 | """ 27 | image_urls = pattern.findall(content) 28 | image_titles = _pattern.findall(content) 29 | 30 | 31 | # 将爬取得到的图片url及段子写入文件 32 | for index in range(len(image_urls)): 33 | image_urls[index] = 'http:' + image_urls[index] 34 | try: 35 | file.writelines(image_urls[index] + ':\n' + 36 | image_titles[index] + '\n\n') 37 | except: 38 | pass 39 | 40 | def main(): 41 | url = 'https://www.qiushibaike.com/pic/page/{}/' 42 | headers = { 43 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36', 44 | } 45 | file = open('qiutu.html', 'w') 46 | 47 | start_page = 1 48 | end_page = 5 49 | for page in range(start_page,end_page+1): 50 | request = urllib.request.Request(url = url.format(page),headers = headers) 51 | content = urllib.request.urlopen(request).read().decode() 52 | download_image(content,file) 53 | 54 | if __name__ == '__main__': 55 | main() 56 | -------------------------------------------------------------------------------- /5.2正则爬取励志网并建立文章集合页面.py: -------------------------------------------------------------------------------- 1 | """ 2 | 5.2 正则表达式-爬取励志网 3 | code utf8 4 | date 03/08/2019 5 | author caixiaoxin 6 | """ 7 | 8 | 9 | import urllib.parse 10 | import urllib.request 11 | import re 12 | import os 13 | 14 | 15 | headers = { 16 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36', 17 | } 18 | # html文件头-不加会乱码 19 | html_head = """ 20 | 21 | 22 | 5.2正则:爬取励志网 23 | 24 | """ 25 | 26 | file_address = 'lizhi.html' 27 | 28 | # 文件存在则先删除原有文件 29 | if os.path.exists(file_address): 30 | os.remove(file_address) 31 | else: 32 | os.mkdir(file_address) 33 | 34 | # 写入文件头 35 | file = open(file_address,'a',encoding='utf8') 36 | file.write(html_head) 37 | 38 | 39 | # 获取url请求 40 | def get_request(url): 41 | request = urllib.request.Request(url = url,headers = headers) 42 | return request 43 | 44 | # 提取每篇文章的标题和链接 45 | def parse_content(content): 46 | """ 47 | # 努力,奋斗,坚持,不抛弃,不放弃,一切皆有可能 48 | #

我不知道年少轻狂,我只知道胜者为王——追梦赤子心

49 | 查出标签有两个版本,一个带b标签,一个没有 50 | 决定暂时保留b标签,过后单独处理 51 | """ 52 | 53 | pattern = re.compile(r'

(.*?)

') 54 | 55 | articleList = pattern.findall(content) 56 | # print(len(articleList)) 57 | for article in articleList: 58 | # 可能出现带b标签的标题,清除 59 | # article[1].replace('','').replace('','') 60 | get_text(url = 'http://www.yikexun.cn'+article[0],title = article[1].replace('','').replace('','')) 61 | 62 | 63 | # 提取文章内容 64 | def get_text(url,title): 65 | request = urllib.request.Request(url = url,headers = headers) 66 | content = urllib.request.urlopen(request).read().decode() 67 | 68 | # 提取文章内容 69 | pattern = re.compile(r'
(.*?)
',re.S) 70 | article = pattern.findall(content)[0].strip() 71 | 72 | 73 | """ 74 | bug: 75 | 写入html后打开,会出现文章渐进的排版错误 76 | 77 | 源:因为有些文章结尾不明多出<p>,缺失结束标签 78 | 79 | 修复:去除内容结尾空格,检查尾缀是否为<p>,将<p>替换成结束标签 80 | """ 81 | title = title.strip() 82 | if article[-3:] == "<p>": 83 | article = article[:-3] + "</p>" 84 | 85 | 86 | 87 | 
# 美化:去除所有无法加载(其实也就是全部)的图片 88 | # v2-fbdde028d48d572b2425965acf058add_hd.png 89 | image_pattern = re.compile(r'<img.*?>') 90 | 91 | """ 92 | 这个替换挺简便的 93 | """ 94 | article = image_pattern.sub('',article) 95 | parse_html(title = title,article = article) 96 | 97 | 98 | 99 | 
# 文章写入html文件 100 | def parse_html(title,article): 101 | 102 | #标题加上h1标签,设置每篇文章排版 103 | complete_arc = '

<h1>%s</h1>

%s\n\n'%(title,article) 104 | 105 | file.write(complete_arc) 106 | 107 | 108 | 109 | def main(): 110 | url = 'http://www.yikexun.cn/lizhi/qianming/list_50_{}.html' 111 | start_page = 1 112 | end_page = 10 113 | 114 | for page in range(start_page,end_page+1): 115 | request = get_request(url.format(page)) 116 | 117 | #预览页(主页内容) 118 | content = urllib.request.urlopen(request).read().decode() 119 | parse_content(content) 120 | 121 | 122 | 123 | 124 | 125 | 126 | if __name__ == '__main__': 127 | main() 128 | 129 | -------------------------------------------------------------------------------- /58crawler/58.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/58crawler/58.ttf -------------------------------------------------------------------------------- /58crawler/58decode.py: -------------------------------------------------------------------------------- 1 | import re 2 | import lxml.html 3 | import base64 4 | from fontTools.ttLib import TTFont 5 | import requests 6 | import random 7 | import sqlite3 8 | import time 9 | 10 | db = sqlite3.connect("58.db") 11 | cursor = db.cursor() 12 | 13 | UA = [ 14 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 15 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 16 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 17 | "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1" 18 | ] 19 | # https://sz.58.com/shixia/chuzu/pn68/?PGTID=0d3090a7-017c-74d8-4e30-b085460b77a1&ClickID=2 20 | # https://sz.58.com/shixia/chuzu/pn66/?PGTID=0d3090a7-017c-74e7-0457-6697c22f0410&ClickID=2 21 | headers = { 22 | "User-Agent": random.choice(UA) 23 | } 24 | 25 | 26 | def resp(i): 27 | 28 | base_url = "https://sz.58.com/chuzu/pn{}/?key=大望&PGTID=0d3090a7-017c-74d8-4e30-b085460b77a1&ClickID=2" 29 | response = requests.get(base_url.format(i), headers=headers) 30 | print("正在下载:", response.url) 31 | return response 32 | 33 | 34 | def get_base64_str(response): 35 | base_font = re.compile("base64,(.*?)\'") 36 | base64_str = re.search(base_font, response.text).group().split(',')[1].split('\'')[0] 37 | return base64_str 38 | 39 | 40 | def make_font_file(base64_str): 41 | b = base64.b64decode(base64_str) 42 | with open("58.ttf", "wb") as f: 43 | f.write(b) 44 | 45 | 46 | def make_dict(): 47 | font = TTFont('58.ttf') 48 | b = font['cmap'].tables[2].ttFont.getReverseGlyphMap() # 编码对应的数字 49 | c = font['cmap'].tables[2].ttFont.tables['cmap'].tables[1].cmap # 页面的十六进制数对应的编码 50 | return b, c 51 | 52 | 53 | def parse_title(text): 54 | s = "" 55 | title_re = re.compile("\s") 56 | html = lxml.html.fromstring(text) 57 | title = html.xpath('//a[@class="strongbox"]//text()')[0] 58 | title = re.sub(title_re, '', title) 59 | for i in title: 60 | encode_str = str(i.encode("unicode-escape")).split(r'\\u')[-1].replace('\'', '').replace(r'b(', '').strip() 61 | try: 62 | num, code = make_dict() 63 | if len(encode_str) != 4: 64 | i = i 65 | elif int(encode_str, 16) not in code: 66 | i = i 67 | else: 68 | i = str(num[code[int(encode_str, 16)]] - 1) 69 | s += i 70 | except: 71 | s = "None" 72 | return s 73 | 74 | 75 | def parse_price(text): 76 | s = "" 77 | html = lxml.html.fromstring(text) 78 | price_code = html.xpath('//div[@class="money"]/b/text()')[0] 79 | price_code = 
price_code.strip().replace('\r\n', '').replace(' ', '') 80 | price_encode_str = str(price_code.encode("unicode-escape")).split('\'')[1].split('-') 81 | if len(price_encode_str) > 1: 82 | s1 = "" 83 | s2 = "" 84 | encode_list1 = price_encode_str[0].split(r"\\u")[1:] 85 | encode_list2 = price_encode_str[1].split(r"\\u")[1:] 86 | for i in encode_list1: 87 | price = int(i, 16) 88 | num, code = make_dict() 89 | s1 += str(num[code[price]] - 1) 90 | for i in encode_list2: 91 | price = int(i, 16) 92 | num, code = make_dict() 93 | s2 += str(num[code[price]] - 1) 94 | s = s1 + '-' + s2 95 | 96 | else: 97 | str_list = price_encode_str[0].split(r'\\u')[1:] 98 | for i in str_list: 99 | price = int(i, 16) 100 | try: 101 | num, code = make_dict() 102 | s += str(num[code[price]] - 1) 103 | except: 104 | s = "None" 105 | 106 | return s 107 | 108 | 109 | def parse_room(text): 110 | s = "" 111 | html = lxml.html.fromstring(text) 112 | p_rooms = html.xpath('//p[@class="room strongbox"]/text()')[0] 113 | room_re = re.compile('[\s]') 114 | room_re1 = re.compile(r'[m²]') 115 | room_re2 = re.compile(r'/') 116 | rooms = re.sub(room_re, '', p_rooms) 117 | rooms = re.sub(room_re1, "平米", rooms) 118 | rooms = re.sub(room_re2, "至", rooms) 119 | for i in rooms: 120 | # print(i.encode("unicode-escape")) 121 | encode_str = str(i.encode("unicode-escape")).split(r'\\u')[-1].replace('\'', '').replace(r'b/', '').strip() 122 | # print(encode_str) 123 | try: 124 | num, code = make_dict() 125 | if len(encode_str) != 4: 126 | i = i 127 | elif int(encode_str, 16) not in code: 128 | i = i 129 | else: 130 | i = str(num[code[int(encode_str, 16)]] - 1) 131 | s += i 132 | except: 133 | s = "None" 134 | return s 135 | 136 | #debug 137 | def parse_dist(text): 138 | s = "" 139 | html = lxml.html.fromstring(text) 140 | p_dist_re = re.compile('\skm') 141 | try: 142 | p_dist = html.xpath('//p[@class="add"]/text()')[3] 143 | p_dist = ''.join(p_dist).replace(' ', '') 144 | p_dist = re.sub(p_dist_re, '千米', p_dist) 145 | for i in p_dist: 146 | encode_str = str(i.encode("unicode-escape")).split(r'\\u')[-1].replace('\'', '').replace(r'\\r', 147 | '').replace(r'\\n', 148 | '').replace( 149 | r'b.', '').strip() 150 | num, code = make_dict() 151 | if len(encode_str) != 4: 152 | i = i 153 | elif int(encode_str, 16) not in code: 154 | i = i 155 | else: 156 | i = str(num[code[int(encode_str, 16)]] - 1) 157 | s += i 158 | dist = s 159 | except: 160 | dist = "暂无" 161 | return dist 162 | 163 | 164 | def short_rent(text): 165 | html = lxml.html.fromstring(text) 166 | try: 167 | rent = html.xpath('//p[@class="room"]/b/text()')[0] 168 | except: 169 | rent = "不可短租" 170 | return rent 171 | 172 | 173 | def parse_li(response): 174 | li_re = re.compile('
  • ') 175 | li_list = re.findall(li_re, response.text) 176 | return li_list 177 | 178 | 179 | def parse_target(text): 180 | html = lxml.html.fromstring(text) 181 | try: 182 | target = html.xpath('//p[@class="spec"]/span/text()') 183 | target = ','.join(target) 184 | except: 185 | target = "暂无" 186 | return target 187 | 188 | 189 | if __name__ == '__main__': 190 | for i in range(1, 71): 191 | response = resp(i) 192 | time.sleep(5) 193 | base64_str = get_base64_str(response) 194 | make_font_file(base64_str) 195 | make_dict() 196 | li_list = parse_li(response) 197 | for i in li_list: 198 | # print(i) 199 | title = parse_title(i) 200 | price = parse_price(i) 201 | room = parse_room(i) 202 | dist = parse_dist(i) 203 | rent = short_rent(i) 204 | target = parse_target(i) 205 | city = "深圳" 206 | print(title,price,room,dist,rent,target) 207 | # cursor.execute("insert into home(title, price, room, dist, rent,target, city) values (?,?,?,?,?,?,?)", 208 | # [title, price, room, dist, rent, target, city]) 209 | db.commit() 210 | -------------------------------------------------------------------------------- /6.1read_list.py: -------------------------------------------------------------------------------- 1 | 2 | file = open('exe_file/hello.txt','r',encoding='utf8') 3 | string = file.read() 4 | file.close() 5 | 6 | lt = eval(string) 7 | print(lt[0]['name']) 8 | 9 | # out: 10 | # 宫本武藏 11 | # 小田纯一郎 -------------------------------------------------------------------------------- /6xpath.py: -------------------------------------------------------------------------------- 1 | """ 2 | TODO:xpath学习 3 | TEST:段子网爬取 4 | Author:caixiaoxin 5 | Date:2019/7/10 6 | """ 7 | """ 8 | xpath? 9 | xml是用来存储和传输数据的 10 | 和html的不同点: 11 | 1 html是用来显示数据的,xml是用来传输的 12 | 2 html是固定的,xml标签是自定义的 13 | 14 | 15 | Harry Potter 16 | K.Rowling 17 | 2005 18 | 29.99 19 | 20 | 21 | 22 | 文档节点 23 | K.Rowling 元素节点 24 | lang="en" 属性节点 25 | 26 | // 不考虑位置的查找 27 | ./ 从当前节点开始往下查找 28 | .. 
从当前节点的父结点查找 29 | @ 选取属性 30 | 31 | e: 32 | /bookstore/book 选取根节点bookstore下面所有直接子节点的book 33 | //book 选取所有的book 34 | bookstore//book 查找bookstore 下面所有的book,不管所在位置 35 | /bookstore/book[1] bookstore 里面的第一个book 36 | /bookstore/book[last()] bookstore里面的最后一个book 37 | /bookstore/book[position()<3] 前两个book 38 | //title[@lang] 所有带有lang属性的title 39 | //title[@lang='eng'] 所有的lang属性为eng的title节点 40 | * 任何元素节点 41 | 42 | 43 | 属性定位 44 | //input[@id="kw"] 45 | //input[@class="g s_ btn"] 46 | 层级定位 47 | //div[@id="head"]/div/div[2]/a[@class="toindex"] --索引从1开始 48 | //div[@id="head"]//a[@class="toindex"] --双斜杠表示下面的所有a节点,不管位置 49 | 逻辑运算 50 | //input[@class="s_ipt" and @name="wd] 51 | 模糊匹配 52 | contains://input[contains(@class,"s_i")]---所有input,有class属性,并且属性中带s_i节点 53 | //input[contains(text(),"爱")] 54 | starts-with://input[starts-with(@class,"s")]---所有的input,有class属性,并且属性以s开头 55 | 取文本 56 | //div[@id="ul"]/a[5]/text() 57 | 所有文本 58 | //div[@id="n1"]//text() div下所有的文本 59 | 60 | 取属性 61 | //div[@id="ul"]/a[5]/@href 62 | 63 | 代码中应用xpath 64 | from lxml import etree 65 | 将html文档变成一个对象,然后调用对象的方法去查找指定的节点 66 | 1 本地文件 67 | tree=etree.parse 68 | 2 网络文件 69 | tree=etree.HTML(网页字符串) 70 | 71 | """ 72 | 73 | # xpath测试 74 | 75 | from lxml import etree 76 | # 使用lxml.etree.parse()解析html文件,该方法默认使用的是“XML”解析器,所以如果碰到不规范的html文件时就会解析错误 77 | # lxml.etree.XMLSyntaxError: Opening and ending tag mismatch: meta line 3 and head, line 3, column 87 78 | # 创建html解析器,增加parser参数 79 | parser = etree.HTMLParser(encoding="utf-8") 80 | tree = etree.parse('exe_file/xpath.html', parser=parser) 81 | # print(tree) 82 | 83 | ret = tree.xpath('//div[@class="tang"]/ul/li[1]/text()') #取文本 84 | print(ret) #out:['\r\n 停车坐爱枫林晚,霜叶红于二月花\r\n '] 85 | 86 | ret1 = tree.xpath('//div[@class="tang"]/ul/li[last()]/a/@href') #取属性 87 | print(ret1) #out:['http://www.baidu.com/'] 88 | 89 | ret2 = tree.xpath('//div[@class="tang"]/ul/li[@class="love"]') #层次定位 90 | print(ret2) #out:[, ] 91 | 92 | ret3 = tree.xpath('//div[@class="tang"]/ul/li[@class="love" and @name="yang"]') #逻辑定位 93 | print(ret3) #out:[] 94 | 95 | ret4 = tree.xpath('//li[contains(@class,"l")]') #模糊搜索 96 | print(ret4) #out:[, , , , ] 97 | 98 | ret5 = tree.xpath('//li[contains(text(),"爱")]/text()') #模糊文本搜索 99 | print(ret5) #['\r\n 停车坐爱枫林晚,霜叶红于二月花\r\n ', '爱就一个字,我只说一次', '爱情36计,我要立刻美丽'] 100 | 101 | 102 | ret6 = tree.xpath('//li[starts-with(@class,"li")]/text()') #模糊匹配 103 | print(ret6) #['\r\n 乍暖还寒时候,最难将息\r\n ', '\r\n 三杯两盏淡酒\r\n '] 104 | 105 | ret7 = tree.xpath('//div[@class="song"]//text()') 106 | print(ret7) # ['\r\n 火药\r\n ', '指南针', '\r\n ', '印刷术', '\r\n 造纸术\r\n '] 107 | 108 | 109 | # 不建议采用,因为编码原因难以转换 110 | ret8 = tree.xpath('//div[@class="song"]') #提取拼接文本 111 | str = ret8[0].xpath('string(.)') 112 | print(str) 113 | # 火药 114 | # 指南针 115 | # 印刷术 116 | # 造纸术 117 | 118 | 119 | 120 | """ 121 | 爬取段子网 122 | """ 123 | import urllib.request 124 | import urllib.parse 125 | from lxml import etree 126 | 127 | # 构造url,返回请求内容 128 | def handle_request(url,page): 129 | # 构造头部信息 130 | headers = { 131 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', 132 | 'Accept-Encoding': 'gzip, deflate', 133 | 'Accept-Language': 'zh-CN,zh;q=0.9', 134 | 'Cache-Control': 'max-age=0', 135 | 'Connection': 'keep-alive', 136 | 'Host': 'duanziwang.com', 137 | 'Upgrade-Insecure-Requests': 1, 138 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36' 139 | } 140 | # 构造相应页面的url 141 | url = 
url.format(page) 142 | print(url) 143 | request = urllib.request.Request(url, headers=headers) 144 | return request 145 | 146 | # html内容解析 147 | def parse_content(content): 148 | 149 | # 构造对象 150 | tree = etree.HTML(content) 151 | # 筛选本页面的文章概要列表 152 | article_list = tree.xpath('//article[@id and @class="post"]') 153 | # print(len(article_list)) 154 | 155 | # 概要中提取信息 156 | for article in article_list: 157 | title = article.xpath('.//div[@class="post-head"]/h1/a/text()') [0] #获取标题 158 | # print(title) 159 | text = article.xpath('.//div[@class="post-content"]//text()') #获取文本 160 | content_text = '' 161 | for word in text: 162 | word = word.strip() 163 | content_text += word.replace('\n','').replace('\r','') 164 | # 空文本进行信息补充 165 | if len(content_text) == 0: 166 | content_text = "这个标题有点长" 167 | 168 | # 提取时间 169 | time = article.xpath('.//div[@class="post-meta"]/time[@class="post-date" and @datetime]/text()')[0] 170 | # print(time) 171 | 172 | # 提取点赞数 173 | like_num = article.xpath('.//div[@class="post-meta"]/time[@class="post-date"]/a/span/text()')[0] 174 | # print(like_num) 175 | 176 | print("title:" + title) 177 | print("time:" + time) 178 | print("like:" + like_num) 179 | print("text:" + content_text) 180 | print("------------------------------") 181 | 182 | 183 | def main(): 184 | # start_page = int(input('begin:')) 185 | # end_page = int(input('end:')) 186 | 187 | start_page = 1 188 | end_page = 100 189 | 190 | url = 'http://duanziwang.com/page/{}/' 191 | for page in range(start_page,end_page+1): 192 | request = handle_request(url, page) 193 | content = urllib.request.urlopen(request).read().decode() 194 | # print(content) 195 | 196 | parse_content(content) 197 | 198 | if __name__ == '__main__': 199 | main() 200 | 201 | -------------------------------------------------------------------------------- /7pictureLoad.py: -------------------------------------------------------------------------------- 1 | # http://sc.chinaz.com/tupian/xingganmeinvtupian.html 2 | """ 3 | 懒加载:只显示可视区的图片 4 | 实现方式 5 | 6 | 监视 -> 7 | 特点:找不到src 8 | """ 9 | import urllib.request 10 | import urllib.parse 11 | from lxml import etree 12 | import os 13 | 14 | # 构造请求 15 | def handle_request(url, page): 16 | if page == 1: 17 | url = url.format('') 18 | else: 19 | url = url.format('_' + str(page)) 20 | #print(url) 21 | # 构造头部信息 22 | headers = { 23 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', 24 | 'Accept-Language': 'zh-CN,zh;q=0.9', 25 | 'Cache-Control': 'max-age=0', 26 | 'Connection': 'keep-alive', 27 | 'Upgrade-Insecure-Requests': 1, 28 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36' 29 | } 30 | request = urllib.request.Request(url=url,headers=headers) 31 | return request 32 | 33 | # 页面信息解析提取 34 | def parse_content(content): 35 | tree = etree.HTML(content) 36 | # //div[@id="container"]/div/div/a/img/@src 37 | image_list = tree.xpath('//div[@id="container"]/div/div/a/img/@src2') 38 | # 懒加载 39 | # print(image_list) 40 | for image_url in image_list: 41 | download_image(image_url) 42 | 43 | # 下载图片 44 | def download_image(image_url): 45 | # 下载目录 46 | dirpath = 'exe_file/xinggan' 47 | # 不存在目录即创建 48 | if not os.path.exists(dirpath): 49 | os.mkdir(dirpath) 50 | # 生成文件名 51 | filename = os.path.basename(image_url) 52 | # 加入文件 53 | filepath = os.path.join(dirpath, filename) 54 | 55 | # 构造头部信息 56 | headers = { 57 | 'Accept': 
'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', 58 | 'Accept-Language': 'zh-CN,zh;q=0.9', 59 | 'Cache-Control': 'max-age=0', 60 | 'Connection': 'keep-alive', 61 | 'Upgrade-Insecure-Requests': 1, 62 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36' 63 | } 64 | # 保存图片 65 | request = urllib.request.Request(url=image_url, headers=headers) 66 | response = urllib.request.urlopen(request) 67 | with open(filepath, 'wb') as fp: 68 | fp.write(response.read()) 69 | 70 | 71 | def main(): 72 | url = 'http://sc.chinaz.com/tupian/xingganmeinvtupian{}.html' 73 | start_page = 1 74 | end_page = 2 75 | for page in range(start_page,end_page+1): 76 | request = handle_request(url, page) 77 | # UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start by 78 | # slove:headers有一句'Accept-Encoding': 'gzip, deflate',删掉就好了 79 | content = urllib.request.urlopen(request).read().decode() 80 | parse_content(content) 81 | 82 | 83 | if __name__ == '__main__': 84 | main() -------------------------------------------------------------------------------- /8jsonpath.py: -------------------------------------------------------------------------------- 1 | """ 2 | jsonpath--用来解析json数据 3 | python处理json用到的函数 4 | import json 5 | json.dumps()--将字典获列表转化为json的字符串 6 | json.loads()--将json转化为python对象 7 | json.dump()---将字典/列表转化为json格式字符串并写入文件中 8 | json.load()---从文件中读取json格式字符串,转化为python对象 9 | 前端处理 10 | 将json格式字符串转化为js对象 11 | JSON.parse('json格式字符串') 12 | eval('(' + json格式字符串 + ')') 13 | 安装: 14 | pip install jsonpath 15 | https://blog.csdn.net/luxideyao/article/details/77802389 16 | 17 | 与xpath的区别 18 | / $ 表示根元素 19 | . @ 当前元素 20 | / . or [] 子元素 21 | .. n/a 父元素 22 | // .. 
递归下降,JSONPath是从E4X借鉴的。 23 | * * 通配符,表示所有的元素 24 | xpath下标从1开始,jsonpath从0开始 25 | --------------------------------------- 26 | @ n/a 属性访问字符 27 | [] [] 子元素操作符 28 | | [,] 连接操作符在XPath 结果合并其它结点集合。JSONP允许name或者数组索引。 29 | n/a [start:end:step] 数组分割操作从ES4借鉴。 30 | [] ?() 应用过滤表示式 31 | n/a () 脚本表达式,使用在脚本引擎下面。 32 | () n/a Xpath分组 33 | """ 34 | 35 | 36 | import json 37 | 38 | lt = [ 39 | {'name': '王宝强', 'age': 30}, 40 | {'name': 'pgone', 'age': 30}, 41 | {'name': '马蓉', 'age': 30}, 42 | {'name': '宋吉', 'age': 30}, 43 | {'name': '李小璐', 'age': 30}, 44 | ] 45 | # 将字典获列表转化为json的字符串 46 | string = json.dumps(lt) 47 | print(string) 48 | # out:[{"name": "\u738b\u5b9d\u5f3a", "age": 30}, 49 | # {"name": "pgone", "age": 30}, {"name": "\u9a6c\u84c9", "age": 30}, 50 | # {"name": "\u5b8b\u5409", "age": 30}, {"name": "\u674e\u5c0f\u7490", "age": 30}] 51 | 52 | import jsonpath 53 | 54 | # 将json格式文件转成python对象 55 | obj = json.load(open('exe_file/book.json','r',encoding='utf-8')) 56 | print(obj) 57 | 58 | # 书单所有书的作者 59 | ret = jsonpath.jsonpath(obj,'$.store.book[*].author') 60 | print(ret) 61 | # solve2 62 | ret1 = jsonpath.jsonpath(obj,'$..author') 63 | print(ret1) 64 | 65 | # 查找store下面所有的节点 66 | ret2 = jsonpath.jsonpath(obj,'$.store.*') 67 | print(ret2) 68 | 69 | # 查找store下面所有的price 70 | ret3 = jsonpath.jsonpath(obj,'$.store..price') 71 | print(ret3) 72 | 73 | # 查找第三个book 74 | ret4 = jsonpath.jsonpath(obj,'$..book[2]') 75 | print(ret4) 76 | 77 | # 查找最后一个book 78 | ret5 = jsonpath.jsonpath(obj,'$..book[(@.length-1)]') 79 | print(ret5) 80 | 81 | # 查找前两本书 82 | ret6 = jsonpath.jsonpath(obj,'$..book[0,1]') 83 | # ret6 = jsonpath.jsonpath(obj,'$..book[:2]') 84 | # ret6 = jsonpath.jsonpath(obj,'$..book')[:2] 85 | print(ret6) 86 | 87 | # 查找含有isbn键的book 88 | ret7 = jsonpath.jsonpath(obj,'$..book[?(@.isbn)]') 89 | print(ret7) 90 | 91 | #查找所有price键对应的值小于10的所有book 92 | ret8 = jsonpath.jsonpath(obj,'$..book[?(@.price<10)]') 93 | print(ret8) 94 | 95 | 96 | 97 | import urllib.request 98 | import urllib.response 99 | import jsonpath 100 | import csv 101 | """ 102 | https://fe-api.zhaopin.com/c/i/sou?start=180&pageSize=90& 103 | cityId=765&workExperience=-1&education=-1&companyType=-1& 104 | employmentType=-1&jobWelfareTag=-1&kw=python&kt=3 105 | """ 106 | 107 | def main(): 108 | 109 | # 创建csv文件 110 | csv_url = 'exe_file/python_postion.csv' 111 | fp = open(csv_url, 'wt', newline='', encoding='utf-8-sig') 112 | writer = csv.writer(fp) 113 | writer.writerow(('岗位', '企业名称', '企业规模', '企业类别', '企业主页', '工作地点', '薪酬', '学历要求', '工作经验', '岗位招聘主页')) 114 | 115 | # 智联招聘网址 116 | # kw表示职位关键字,cityId是城市代号 117 | # start和pageSize控制翻页 118 | url = 'https://fe-api.zhaopin.com/c/i/sou?start=90&pageSize=90&' \ 119 | 'cityId=765&workExperience=-1&education=-1&companyType=-1&' \ 120 | 'employmentType=-1&jobWelfareTag=-1&kw=python&kt=3' 121 | # 请求头 122 | headers = { 123 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3', 124 | 'Accept-Language': 'zh-CN,zh;q=0.9', 125 | 'Cache-Control': 'max-age=0', 126 | 'Connection': 'keep-alive', 127 | 'Upgrade-Insecure-Requests': 1, 128 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36' 129 | } 130 | # 请求 131 | request = urllib.request.Request(url=url, headers=headers) 132 | json_text = urllib.request.urlopen(request).read().decode() 133 | 134 | # 将请求到的json转为python对象 135 | json_obj = json.loads(json_text) 136 | # print(json_text) 137 | 138 | # 筛选出招聘职位信息集合 139 | pos_infos = 
jsonpath.jsonpath(json_obj,'$.data.results[*]') 140 | 141 | for info in pos_infos: 142 | # 基于jsonpath的信息查找 143 | job_name = jsonpath.jsonpath(info,'$.jobName')[0] 144 | company_name = jsonpath.jsonpath(info,'$.company.name')[0] 145 | company_size = jsonpath.jsonpath(info,'$.company.size.name')[0] 146 | company_type = jsonpath.jsonpath(info,'$.company.type.name')[0] 147 | company_url = jsonpath.jsonpath(info,'$.company.url')[0] 148 | city = jsonpath.jsonpath(info,'$..city.display')[0] 149 | salary = jsonpath.jsonpath(info,'$.salary')[0] 150 | edu_level = jsonpath.jsonpath(info,'$.eduLevel.name')[0] 151 | working_exp = jsonpath.jsonpath(info,'$.workingExp.name')[0] 152 | position_url = jsonpath.jsonpath(info,'$.positionURL')[0] 153 | 154 | writer.writerow((job_name, company_name, company_size, company_type, 155 | company_url, city, salary, edu_level, working_exp, position_url)) 156 | 157 | 158 | 159 | 160 | if __name__ == '__main__': 161 | main() 162 | -------------------------------------------------------------------------------- /9.1Chrome-headless.py: -------------------------------------------------------------------------------- 1 | """ 2 | PhantomJS 无界面浏览器 3 | selenium+phantomjs 爬虫解决方案 4 | 下拉滚动条到底部 5 | 豆瓣电影下拉 6 | 图片加载 7 | 图片懒加载问题 8 | 在下拉到底部后,对比获取的page1 和 page2 9 | 可以发现 src2全部变为src1 10 | 11 | """ 12 | from selenium import webdriver 13 | 14 | from selenium.webdriver.chrome.options import Options 15 | import time 16 | """ 17 | 使用pip show selenium显示默认安装的是3.1.3版本 18 | 目前使用新版selenium调用PhantomJS是会报这样的错: UserWarning: Selenium support for PhantomJS has been deprecated, 19 | please use headless versions of Chrome or Firefox instead warnings.warn('Selenium support for PhantomJS has been deprecated, please use headless 20 | 如果还想继续用PhantomJS的话只能使用旧版的selenium,卸载之后重新pip install selenium==2.48.0安装成功。 21 | 但其实更好的选择,我们可以使用firefox或chrome的headlesss模式,无需重装selenium 22 | 只需要添加以下代码: 23 | """ 24 | path = r'E:\Program Files\chrome-driver\chromedriver.exe' 25 | 26 | chrome_options = Options() 27 | chrome_options.add_argument('--headless') 28 | chrome_options.add_argument('--disable-gpu')#上面三行代码就是为了将Chrome不弹出界面,实现无界面爬取 29 | browser = webdriver.Chrome(path,options=chrome_options) 30 | 31 | """ 32 | url = 'http://www.baidu.com/' 33 | browser.get(url) 34 | time.sleep(2) 35 | browser.save_screenshot(r'exe_file/baidu.png') 36 | 37 | my_input = browser.find_element_by_id('kw') 38 | my_input.send_keys('美女') 39 | time.sleep(2) 40 | browser.save_screenshot('exe_file/meinv.png') 41 | 42 | button = browser.find_elements_by_class_name('s_btn')[0] 43 | button.click() 44 | time.sleep(2) 45 | browser.save_screenshot('exe_file/show.png') 46 | time.sleep(2) 47 | browser.quit() 48 | """ 49 | 50 | """ 51 | url = 'https://movie.douban.com/typerank?type_name=%E7%88%B1%E6%83%85%E7%89%87&type=13&interval_id=100:90&action=' 52 | 53 | browser.get(url) 54 | time.sleep(3) 55 | browser.save_screenshot('exe_file/douban.png') 56 | # 模拟滚动条滚动到底部 57 | # 不同,教程是用body,改用documentElement 58 | js = 'document.documentElement.scrollTop=10000' 59 | browser.execute_script(js) 60 | time.sleep(3) 61 | browser.save_screenshot('exe_file/douban_d.png') 62 | 63 | # 获取网页代码 64 | html = browser.page_source 65 | 66 | # 保存在文件中 67 | with open(r'exe_file/douban.html','w',encoding='utf-8') as f: 68 | f.write(html) 69 | ''' 70 | 豆瓣的数据是js动态加载的 71 | 两个方法可以获取数据: 72 | 1 直接获取请求接口 -推荐 73 | 2 利用浏览器驱动模拟真正浏览器获取数据,不过这个比较慢 74 | '''' 75 | browser.quit() 76 | 77 | """ 78 | 79 | url = 'http://sc.chinaz.com/tupian/' 80 | browser.get(url) 81 | time.sleep(5) 82 | with 
open(r'exe_file/szchina_page_1.html','w',encoding='utf-8') as fp: 83 | fp.write(browser.page_source) 84 | 85 | # 下拉到底部后再获取图片 86 | js = 'document.documentElement.scrollTop=10000' 87 | browser.execute_script(js) 88 | time.sleep(5) 89 | 90 | with open(r'exe_file/szchina_page_2.html','w',encoding='utf-8') as fp: 91 | fp.write(browser.page_source) 92 | browser.quit() -------------------------------------------------------------------------------- /9selenium.py: -------------------------------------------------------------------------------- 1 | """ 2 | 浏览器自动化测试框架 3 | 是一个python的第三方库,对外提供接口可以操作浏览器 4 | 让浏览器完成自动化操作 5 | 使用selenium 6 | 1 安装 pip install selenium 7 | 2 如何操作谷歌浏览器,首先必须有谷歌浏览器的一个驱动 8 | 3 9 | 驱动与谷歌浏览器的版本映射关系 10 | https://blog.csdn.net/fox990152806/article/details/91881361 11 | 谷歌驱动下载 12 | http://npm.taobao.org/mirrors/chromedriver/ 13 | 4 代码操作 14 | find_element_by_id id 15 | find_element_by_name name 16 | find_element_by_xpath xpath 17 | find_element_by_tag_name 标签名 18 | find_element_by_class_name class名称 19 | find_element_by_css_selector 选择器查找 20 | find_element_by_link_text 根据链接内容 21 | 22 | get\set_keys\click 23 | 24 | """ 25 | 26 | # 简单selenium操作 27 | from selenium import webdriver 28 | import time 29 | 30 | # 模拟创建浏览器对象,通过对象操作浏览器 31 | path = r'E:\Program Files\chrome-driver\chromedriver.exe' 32 | browser = webdriver.Chrome(executable_path=path) 33 | # print(browser) 34 | 35 | # 打开百度 36 | url = 'http://www.baidu.com/' 37 | browser.get(url) 38 | 39 | # 中间有内容请求,发送响应的过程,需要停顿 40 | time.sleep(2) 41 | 42 | # 向百度搜索框中填入关键字 43 | my_input = browser.find_element_by_id('kw') # 对应百度搜索框的id 44 | my_input.send_keys('美女') 45 | 46 | time.sleep(2) 47 | # 查找搜索按钮s 48 | # ..s:返回一个列表 49 | # bg s_btn 不行 50 | button = browser.find_elements_by_class_name('s_btn')[0] 51 | button.click() # 点击 52 | # 页面停顿 53 | time.sleep(2) 54 | 55 | 56 | # 坑:百度已将该图片链接设置为动态,故无法点击 57 | page_url = browser.find_elements_by_class_name('op-img-address-hover')[0] 58 | page_url.click() 59 | 60 | time.sleep(5) 61 | 62 | # 关闭浏览器 63 | browser.quit() 64 | 65 | 66 | from selenium import webdriver 67 | import time 68 | 69 | # 模拟创建浏览器对象,通过对象操作浏览器 70 | path = r'E:\Program Files\chrome-driver\chromedriver.exe' 71 | browser = webdriver.Chrome(executable_path=path) 72 | # print(browser) 73 | 74 | # 打开百度 75 | url = 'http://www.baidu.com/' 76 | browser.get(url) 77 | 78 | browser.find_elements_by_link_text('设置')[0].click() 79 | time.sleep(3) 80 | 81 | browser.find_elements_by_link_text(r'搜索设置')[0].click() 82 | time.sleep(2) 83 | 84 | m = browser.find_element_by_id('nr') 85 | time.sleep(2) 86 | 87 | # 每页搜索50条 88 | m.find_element_by_xpath('//*[@id="nr"]/option[3]').click() 89 | time.sleep(2) 90 | 91 | # 确认更改 92 | browser.find_elements_by_class_name("prefpanelgo")[0].click() 93 | time.sleep(2) 94 | 95 | # 处理弹窗 96 | browser.switch_to_alert().accept() 97 | time.sleep(2) 98 | 99 | # 进行搜索 100 | browser.find_element_by_id('kw').send_keys('美女') 101 | time.sleep(2) 102 | 103 | # 确认 104 | browser.find_element_by_id('su').click() 105 | time.sleep(2) 106 | 107 | # 进入该搜索项 108 | browser.find_elements_by_link_text('美女_百度图片')[0].click() 109 | time.sleep(3) 110 | 111 | 112 | # 关闭浏览器 113 | browser.quit() 114 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pythonCrawler 2 | [![HitCount](https://hits.b3log.org/ZhuoZhuoCrayon/pythonCrawler.svg)](https://github.com/ZhuoZhuoCrayon/pythonCrawler/) 3 | >## Notice 4 | 1. 
exe_file 是本程序爬取的附录,全部测试、实战读写路径全部指向exe\_file 5 | 2. 本爬虫笔记基于b站 [Python爬虫从入门到高级实战【92集】千锋Python高级教程](https://www.bilibili.comvideo/av37027372) 6 | 3. 在该教程的基础上对教程中的思路进行实践,对教程出现的错误进行修正,并且另外扩展,**并非教程源码照搬** 7 | 4. 由于时间有限,笔记与代码都位于.py文件中,以注释及代码形式存在,对学习过程中会出现的bug以及难点进行分析 8 | 5. 由于作者能力有限以及爬虫技术迭代速度快,代码可能会存在bug,如有此情况,欢迎联系我更正或者pull request 9 | 6. **更新日志的正确打开方式:** 10 | - 数字代表每一章,每个数字的第一个py文件是基础知识讲解及简单实践 11 | - x.x形式的py文件一般是实战内容 12 | - 例如6.基于xpath...是基础知识,那么6.1就是项目实战内容 13 | - **所有的py文件都会有思路、踩坑以及知识点的介绍** 14 | - **人性化设置,md文件的更新日志附属笔记的超链接跳转** 15 | 7. 如果笔记对您有用,麻烦Star谢谢 16 | - - - 17 | >## Update log 18 | 1. __2019/03-2019/03/12__ 19 | - [1.urllib基础](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/1urllib_base.py) 20 | - [2.利用ajax的特点构建post请求,及对url异常的处理实例:豆瓣,kfc餐厅,百度贴吧的页面爬取](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/2ajax.py) 21 | - [3.以百度翻译为例介绍fiddler中json包的解析](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/fillder.py) 22 | - [4.Handler处理器的应用:设置ip及cookieJar,人人网模拟登陆](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/4handler.py) 23 | - [5.1.利用正则表达式提取糗图网页面信息](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/5.1%E6%AD%A3%E5%88%99%E7%88%AC%E5%8F%96%E7%B3%97.py) 24 | - [5.2.正则爬取励志网并建立文章集合页面](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/5.2%E6%AD%A3%E5%88%99%E7%88%AC%E5%8F%96%E5%8A%B1%E5%BF%97%E7%BD%91%E5%B9%B6%E5%BB%BA%E7%AB%8B%E6%96%87%E7%AB%A0%E9%9B%86%E5%90%88%E9%A1%B5%E9%9D%A2.py) 25 | 2. __2019/04-__ 26 | - 项目实战:[智联招聘爬虫-通用版:目前已爬取2019年第一季度IT领域招聘信息数据集](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/zhilianCrawler.py) 27 | + urllib, BeautifulSoup, 正则表达式, 多线程爬取, json获取, csv文件读写 28 | 3. __2019/07/10__ 29 | - [6.基于xpath的html页面信息提取](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/6xpath.py) 30 | + 实例:段子网爬取 31 | 4. __2019/07/11__ 32 | - [6.1.读取文件中的列表格式](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/6.1read_list.py) 33 | + 实例:文本文件中对象的读取 34 | - [7.基于图片懒加载技术的图片下载](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/7pictureLoad.py) 35 | 5. __2019/07/15__ 36 | - [8.基于jsonpath的json文件解析方法](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/8jsonpath.py) 37 | + 实例:智联招聘,填补之前智联爬虫采用正则表达式解析json文件的繁琐方法 38 | + b站教程以爬取淘宝评论为例,但现淘宝系统过于难爬,**此处留坑** 39 | 6. __2019/07/16__ 40 | - 谷歌浏览器驱动,适配谷歌75版本---在exeFile目录下 41 | 7. __2019/07/17__ 42 | - [9.基于selenium的浏览器控制访问](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/9selenium.py) 43 | + 实例:百度关键字搜索 44 | 8. __2019/07/19__ 45 | - [9.1.基于Chrome无界面模式浏览,图片懒加载的特点,异步加载的解决方法](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/9.1Chrome-headless.py) 46 | + 实例1:豆瓣电影下拉滚动条,懒加载变化解析 47 | + 实例2:百度图片搜索,无界面模式实践 48 | 9. __2019/07/20__ 49 | - **告知:** 50 | + 为方便实例的各种测试文件的查找,在第10章包括以后,每章的测试文件保存在exe\_file/x/下 51 | + **x为对应章节,例如第10章,则位于exe\_file/10/** 52 | - [10.Requests库的基本用法](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/10-Requests.py) 53 | + 实例:百度搜索,必应翻译,登陆人人网为例介绍post、cookie、get的用法 54 | + 代理使用 55 | - [10.1.Requests库实战](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/10.1busPath_Crawler.py) 56 | + 实例:爬取深圳所有公交路线 57 | + 运用:json文件读写、Requests库及xpath解析 58 | + 数据集:[深圳公交线路json文件](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/exe_file/10/bus_line.json) 59 | - [11.验证码登陆方式](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/11verification_code.py) 60 | + 实例:利用返回验证码到本地的方法登陆古诗文网 61 | + 运用:Requests库(创建会话用于支持cookie),美味汤(beautifulSoup) 62 | 10. 
__2019/07/21-2019/07/26__ 63 | - [11.1pytesser介绍](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/11.1pytesser.py) 64 | + 介绍了pytesser库以及PIL库的基本使用 65 | - [11.2jTessBoxEditor-tesseract字库训练模式](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/11.2jTessBoxEditor-tesseract.py) 66 | + 验证码测试脚本 67 | - **[重点:tesseract训练字库详解](https://github.com/ZhuoZhuoCrayon/pythonCrawler/tree/master/tesseract%E8%AE%AD%E7%BB%83%E6%A8%A1%E5%9E%8B)** 68 | + 通过建立特征字符库,逐层加入识别错误的验证码进行补充训练,可以在三次扩充样本训练后达到90%以上识别率 69 | 11. __2019/07/28__ 70 | - [12.视频爬取](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/12video.py) 71 | + 基于xpath, json, chromeDrive-headless的视频爬取方案 72 | 12. __2019/07/29-2019/07/31__ 73 | - [13.多线程基础汇总](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/13multiThread.py) 74 | - [13.1多线程的面向对象构造形式](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/13.1thread_ood.py) 75 | - [13.2队列的基本Queue的基本操作](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/13.2thread_queue.py) 76 | - [13.3多线程爬取深圳公交线路](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/13.3Mthread_crawler.py) 77 | + 基于10.1的程序进行多线程重构 78 | + 多线程爬取速度提升至500% 79 | 13. __2019/03-2019/05__ 80 | - [实战:58同城租房价格爬取](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/58crawler/58decode.py) 81 | + 涉及反爬策略,关于编码转化的技巧 82 | - [实战:中国大学排名爬取](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/chineseUniversityRankCrawler/RankofNuni.py) 83 | + 美味汤、requests库的使用 84 | - [实战:美桌网图片爬取实例4则](https://github.com/ZhuoZhuoCrayon/pythonCrawler/tree/master/pictureCrawler) 85 | + 入门级别 86 | + 实践多线程、美味汤、requests库 87 | --- 88 | >## Contributing 89 | >如果你对这个项目感兴趣,非常乐意你可以将.py文件的笔记和代码进行格式加工 90 | >>[版权声明]笔记内容是我原创并且开源到github上的,所有内容仅限于学习,不作商用,欢迎star/download/fork,但务必遵守相关开源协议进行使用,原创不易,请勿copy。在实践时遵守爬虫协议,目的只是为了更好的掌握爬虫知识,如果有所影响,请联系我删除,谢谢! 
91 | 92 | -------------------------------------------------------------------------------- /chineseUniversityRankCrawler/RankofNuni.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import bs4 4 | 5 | def getHTMLText(url): 6 | try: 7 | r = requests.get(url,timeout = 30) 8 | r.raise_for_status() 9 | r.encoding = r.apparent_encoding 10 | return r.text 11 | except: 12 | print("getError") 13 | return "" 14 | return "" 15 | 16 | def fillUnivList(ulist,html): 17 | soup = BeautifulSoup(html,"html.parser") 18 | """ 19 | 结构分析 20 | 排名位于tbody标签下 21 | 每个tr标签是一个学校的信息 22 | tr标签下有多个td标签,保存有该学校的各类指标 23 | """ 24 | # 遍历tr标签 tr标签是tbody标签的孩子 25 | for tr in soup.find('tbody').children: 26 | print(tr) 27 | if isinstance(tr,bs4.element.Tag): 28 | # 获取tr标签下的td标签 29 | tds = tr('td') 30 | # 获取相关指标 只需要第 0 1 3 个相关td标签,分别是学校名称,排名,分数 31 | ulist.append([tds[0].string,tds[1].string,tds[3].string]) 32 | 33 | # 打印前20的榜单 34 | def printUnivList(ulist,num): 35 | """ 36 | print("{:^10}\t{:^6}\t{:^10}".format("排名","学校名称","分数")) 37 | for i in range(num): 38 | u = ulist[i] 39 | print("{:^10}\t{:^6}\t{:^10}".format(u[0],u[1],u[2])) 40 | """ 41 | 42 | # 优化,解决中文不对齐问题 43 | #^num num代表占位 44 | tplt = "{0:^10}\t{1:{3}^10}\t{2:^10}" 45 | # chr(12288)是中文空白符 46 | print(tplt.format("排名","学校名称","分数",chr(12288))) 47 | for i in range(num): 48 | u = ulist[i] 49 | print(tplt.format(u[0],u[1],u[2],chr(12288))) 50 | def main(): 51 | unifo = [] 52 | url = "http://www.zuihaodaxue.cn/zuihaodaxuepaiming2019.html" 53 | html = getHTMLText(url) 54 | fillUnivList(unifo,html) 55 | printUnivList(unifo,20) 56 | 57 | main() 58 | -------------------------------------------------------------------------------- /exe_file/10/chinaunix_login.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 52 | 53 | 54 |
    55 |
    56 |
    57 |     Whoops, looks like something went wrong.
    58 |
    59 |
    60 |
    61 |
    62 | 63 |
    64 | 65 |
    66 | 67 | -------------------------------------------------------------------------------- /exe_file/11/code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/11/code.png -------------------------------------------------------------------------------- /exe_file/11/code1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/11/code1.png -------------------------------------------------------------------------------- /exe_file/11/code_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/11/code_2.png -------------------------------------------------------------------------------- /exe_file/11/gushiwen_code/gu.bat: -------------------------------------------------------------------------------- 1 | cd C:\Users\crayon\OneDrive\Pycode\Crawler\crawler_Basic\exe_file\11\gushiwen_code 2 | 3 | echo font 0 0 0 0 0>font_properties 4 | 5 | echo Run Tesseract for Training.. 6 | tesseract.exe --psm 10 gu.font.exp0.tif gu.font.exp0 nobatch box.train 7 | 8 | 9 | 10 | echo Compute the Character Set.. 11 | unicharset_extractor.exe gu.font.exp0.box 12 | 13 | 14 | mftraining -F font_properties -U unicharset -O gu.unicharset gu.font.exp0.tr 15 | 16 | echo Clustering.. 17 | cntraining.exe gu.font.exp0.tr 18 | 19 | echo Rename Files.. 20 | 21 | 22 | rename normproto gu.normproto 23 | 24 | rename inttemp gu.inttemp 25 | 26 | 27 | rename pffmtable gu.pffmtable 28 | 29 | 30 | rename shapetable gu.shapetable 31 | 32 | 33 | 34 | echo Create Tessdata.. 35 | 36 | 37 | combine_tessdata.exe gu. 
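REM ---- 训练流程概括(注释补充,按 Tesseract 3.x 常规训练步骤理解,供阅读参考)----
REM  font_properties            : 声明字体属性,"font 0 0 0 0 0" 表示斜体/粗体等标志全部为 0
REM  tesseract ... box.train    : 根据 tif 样本图与 box 标注文件生成特征文件 gu.font.exp0.tr
REM  unicharset_extractor       : 从 box 文件提取字符集 unicharset
REM  mftraining                 : 结合 font_properties 与 unicharset 生成 inttemp / pffmtable / shapetable
REM  cntraining                 : 生成 normproto(字符归一化特征)
REM  rename + combine_tessdata  : 统一加上 gu. 前缀后合并为字库 gu.traineddata,识别时以 -l gu 调用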
38 | -------------------------------------------------------------------------------- /exe_file/11/gushiwen_code/gu.traineddata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/11/gushiwen_code/gu.traineddata -------------------------------------------------------------------------------- /exe_file/11/gushiwen_code/train_toBox.bat: -------------------------------------------------------------------------------- 1 | cd C:\Users\crayon\OneDrive\Pycode\Crawler\crawler_Basic\exe_file\11\gushiwen_code 2 | 3 | tesseract gu.font.exp0.tif gu.font.exp0 -l gu --psm 7 batch.nochop makebox -------------------------------------------------------------------------------- /exe_file/11/gushiwen_code/第一轮训练.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/11/gushiwen_code/第一轮训练.rar -------------------------------------------------------------------------------- /exe_file/11/gushiwen_code/第三轮训练.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/11/gushiwen_code/第三轮训练.rar -------------------------------------------------------------------------------- /exe_file/11/gushiwen_code/第二轮训练.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/11/gushiwen_code/第二轮训练.rar -------------------------------------------------------------------------------- /exe_file/11/gushiwen_code/第四轮训练.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/11/gushiwen_code/第四轮训练.rar -------------------------------------------------------------------------------- /exe_file/11/test/0-9A-Z训练字典/gu.traineddata: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/11/test/0-9A-Z训练字典/gu.traineddata -------------------------------------------------------------------------------- /exe_file/11/verify_code/verify_code.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/11/verify_code/verify_code.rar -------------------------------------------------------------------------------- /exe_file/12/download/test.txt: -------------------------------------------------------------------------------- 1 | test file -------------------------------------------------------------------------------- /exe_file/baidu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/baidu.png -------------------------------------------------------------------------------- /exe_file/book.json: -------------------------------------------------------------------------------- 1 | { "store": { 2 | "book": [ 3 | { "category": "文学", 4 | 
"author": "路遥", 5 | "title": "平凡的世界", 6 | "price": 8.95 7 | }, 8 | { "category": "文学", 9 | "author": "席慕容", 10 | "title": "穆斯林的赞礼", 11 | "price": 12.99 12 | }, 13 | { "category": "历史", 14 | "author": "二月河", 15 | "title": "康熙大帝", 16 | "isbn": "0-553-21311-3", 17 | "price": 8.99 18 | }, 19 | { "category": "言情", 20 | "author": "琼瑶", 21 | "title": "The Lord of the Rings", 22 | "isbn": "0-395-19395-8", 23 | "price": 22.99 24 | } 25 | ], 26 | "bicycle": { 27 | "color": "red", 28 | "price": 19.95 29 | } 30 | } 31 | } -------------------------------------------------------------------------------- /exe_file/chrome-driver/chromedriver.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/chrome-driver/chromedriver.exe -------------------------------------------------------------------------------- /exe_file/douban.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/douban.png -------------------------------------------------------------------------------- /exe_file/douban_d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/douban_d.png -------------------------------------------------------------------------------- /exe_file/hello.txt: -------------------------------------------------------------------------------- 1 | [{'name':'宫本武藏\n小田纯一郎'}] -------------------------------------------------------------------------------- /exe_file/meinv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/meinv.png -------------------------------------------------------------------------------- /exe_file/python_postion.csv: -------------------------------------------------------------------------------- 1 | 岗位,企业名称,企业规模,企业类别,企业主页,工作地点,薪酬,学历要求,工作经验,岗位招聘主页 2 | python工程师,深圳市博奥特科技有限公司,100-499人,民营,https://company.zhaopin.com/CZ298487210.htm,深圳-罗湖区,15K-20K,本科,3-5年,https://jobs.zhaopin.com/CC298487217J00237350002.htm 3 | 急聘:Python开发助理/包住/项目奖金,深圳市宇达计算机有限公司,100-499人,外商独资,https://company.zhaopin.com/CZ644104880.htm,深圳,3.5K-7K,学历不限,不限,https://jobs.zhaopin.com/CC644104880J00368828601.htm 4 | 高级后端研发工程师(node/python),深圳市中联创新自控系统有限公司,100-499人,民营,https://company.zhaopin.com/CZ157447310.htm,深圳-龙岗区,18K-30K,本科,3-5年,https://jobs.zhaopin.com/CC157447311J00258918704.htm 5 | Python开发,深圳市德科信息技术有限公司,1000-9999人,股份制企业,https://company.zhaopin.com/CZ589380620.htm,深圳-福田区,10K-15K,本科,3-5年,https://jobs.zhaopin.com/CC589380625J00315155807.htm 6 | Python后端开发,深圳市金鑫云端科技有限公司,100-499人,民营,https://company.zhaopin.com/CZ816720960.htm,深圳,15K-20K,大专,3-5年,https://jobs.zhaopin.com/CC816720960J00396160401.htm 7 | STEAM教育/Python编程老师,深圳格趣创新科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ609854230.htm,深圳,7K-10K,大专,无经验,https://jobs.zhaopin.com/CC609854230J00326990907.htm 8 | 急招Python开发助理/实习生,深圳市十七大道科技有限公司,100-499人,民营,https://company.zhaopin.com/CZ644079680.htm,深圳-南山区,3K-5K,大专,无经验,https://jobs.zhaopin.com/CC644079685J00215102201.htm 9 | python,深圳市乐易网络股份有限公司,100-499人,民营,https://company.zhaopin.com/CZ495056820.htm,深圳,15K-25K,本科,3-5年,https://jobs.zhaopin.com/CC495056821J00261237003.htm 10 | 
python,深圳德科共赢创投合伙企业(有限合伙),不限,民营,https://company.zhaopin.com/CZ868439670.htm,深圳-南山区,10K-15K,大专,3-5年,https://jobs.zhaopin.com/CC868439670J00255308908.htm 11 | python工程师,叠云(北京)科技股份有限公司,20-99人,上市公司,https://company.zhaopin.com/CZ394261830.htm,深圳,10K-15K,本科,3-5年,https://jobs.zhaopin.com/CC394261836J00117878715.htm 12 | python开发工程师,深圳市乐易网络股份有限公司,100-499人,民营,https://company.zhaopin.com/CZ495056820.htm,深圳,10K-20K,本科,1-3年,https://jobs.zhaopin.com/CC495056821J00261241003.htm 13 | python开发工程师,深圳市芒柠科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ466965980.htm,深圳,10K-15K,大专,1-3年,https://jobs.zhaopin.com/CC466965980J00235518307.htm 14 | python,深圳国开创新科技有限公司,500-999人,民营,https://company.zhaopin.com/CZ690553130.htm,深圳-南山区,9K-15K,大专,3-5年,https://jobs.zhaopin.com/CC690553137J00209624102.htm 15 | python后台开发工程师,深圳市悦动天下科技有限公司,100-499人,民营,https://company.zhaopin.com/CZ644924620.htm,深圳,15K-30K,本科,3-5年,https://jobs.zhaopin.com/644924620250125.htm 16 | python工程师,深圳市纽尔科技有限公司,100-499人,民营,https://company.zhaopin.com/CZ571842520.htm,深圳-罗湖区,15K-30K,本科,3-5年,https://jobs.zhaopin.com/CC571842520J00304504205.htm 17 | Python开发工程师,深圳市欧恩德技术有限公司,100-499人,民营,https://company.zhaopin.com/CZ672784720.htm,深圳-南山区,8K-10K,大专,1-3年,https://jobs.zhaopin.com/CC672784724J00396219401.htm 18 | Python开发工程师(深圳),深圳市前海谷雨网络科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ544024680.htm,深圳-南山区,8K-15K,大专,1-3年,https://jobs.zhaopin.com/544024686250018.htm 19 | python高级开发工程师,华南城集团有限公司,1000-9999人,外商独资,https://company.zhaopin.com/CZ502416520.htm,深圳-龙岗区,15K-25K,本科,5-10年,https://jobs.zhaopin.com/CC502416523J00382948305.htm 20 | Python开发(后海),深圳市德科信息技术有限公司,1000-9999人,股份制企业,https://company.zhaopin.com/CZ589380620.htm,深圳,10K-15K,大专,3-5年,https://jobs.zhaopin.com/CC589380625J00318725207.htm 21 | Python开发工程师(深圳),深圳市八斗才数据有限公司,20-99人,民营,https://company.zhaopin.com/CZ562550030.htm,深圳-南山区,10K-15K,大专,1-3年,https://jobs.zhaopin.com/562550032250031.htm 22 | python开发工程师(深圳),深圳市源极光科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ562114530.htm,深圳-南山区,10K-15K,大专,1-3年,https://jobs.zhaopin.com/562114535250038.htm 23 | 高级Python软件工程师,August Robotics Limited,20人以下,外商独资,https://company.zhaopin.com/CZ716976740.htm,深圳,30K-50K,本科,3-5年,https://jobs.zhaopin.com/CZ716976740J00350193203.htm 24 | python,深圳德科共赢创投合伙企业(有限合伙),不限,民营,https://company.zhaopin.com/CZ868439670.htm,深圳-南山区,10K-15K,大专,3-5年,https://jobs.zhaopin.com/CC868439670J00257258908.htm 25 | python开发工程师,赞同科技股份有限公司,1000-9999人,民营,https://company.zhaopin.com/CZ539946580.htm,深圳-福田区,6K-12K,本科,不限,https://jobs.zhaopin.com/CC539946581J00290691103.htm 26 | 软件开发工程师(Python),德硕管理咨询(深圳)有限公司,20-99人,外商独资,https://company.zhaopin.com/CZ154580510.htm,深圳-福田区,8K-15K,大专,1-3年,https://jobs.zhaopin.com/CC154580515J00240303004.htm 27 | Python数据挖掘,深圳前海招文天下金融服务有限公司,不限,民营,https://company.zhaopin.com/CZ815392880.htm,深圳-南山区,11K-18K,本科,3-5年,https://jobs.zhaopin.com/CC815392880J00406708501.htm 28 | 高级Python后端工程师,深圳德聚企业管理咨询有限公司,100-499人,民营,https://company.zhaopin.com/CZ413479080.htm,深圳-南山区,20K-40K,本科,3-5年,https://jobs.zhaopin.com/413479081250105.htm 29 | Python开发工程师,深圳市芒柠科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ466965980.htm,深圳-宝安区,10K-15K,大专,1-3年,https://jobs.zhaopin.com/466965980250155.htm 30 | Python工程师,深圳市凹凸微科科技有限公司,100-499人,民营,https://company.zhaopin.com/CZ238749130.htm,深圳,15K-20K,本科,1-3年,https://jobs.zhaopin.com/CC238749137J00417101501.htm 31 | Python软件开发(应届生),德硕管理咨询(深圳)有限公司,20-99人,外商独资,https://company.zhaopin.com/CZ154580510.htm,深圳-福田区,6K-8K,本科,无经验,https://jobs.zhaopin.com/CC154580515J00254012304.htm 32 | 
python开发工程师,深圳市大富科技股份有限公司,1000-9999人,上市公司,https://company.zhaopin.com/CZ133833090.htm,深圳-福田区,10K-15K,本科,1-3年,https://jobs.zhaopin.com/CC133833090J00215320006.htm 33 | 日语Java/Python开发工程师,Uniqsys优尼卡日本株式会社,20-99人,外商独资,https://company.zhaopin.com/CZ842910190.htm,深圳,15K-30K,本科,不限,https://jobs.zhaopin.com/CC842910190J00176413904.htm 34 | python开发工程师,广州市欢雀科技有限公司,100-499人,民营,http://special.zhaopin.com/pagepublish/41297738/index.html,深圳-南山区,12K-16K,本科,3-5年,https://jobs.zhaopin.com/CC412977388J00340584403.htm 35 | Python开发工程师,深圳市掌世界网络科技有限公司,100-499人,民营,https://company.zhaopin.com/CZ253935310.htm,深圳-罗湖区,8K-15K,本科,3-5年,https://jobs.zhaopin.com/253935313250030.htm 36 | Python后端开发工程师,北京国双科技有限公司,1000-9999人,上市公司,https://company.zhaopin.com/CZ147278820.htm,深圳-福田区,10K-20K,本科,1-3年,https://jobs.zhaopin.com/CC147278824J00166336110.htm 37 | python工程师(区块链),北京天盛京享科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ839864870.htm,深圳,15K-30K,本科,3-5年,https://jobs.zhaopin.com/CC839864870J00177422813.htm 38 | 日语Java/Python开发工程师-初级,Uniqsys优尼卡日本株式会社,20-99人,外商独资,https://company.zhaopin.com/CZ842910190.htm,深圳,5K-10K,本科,不限,https://jobs.zhaopin.com/CC842910190J00180734404.htm 39 | Python后台开发工程师,广州宏鸿钥环境科技有限公司,100-499人,民营,https://company.zhaopin.com/CZ644239030.htm,深圳-宝安区,25K-30K,本科,3-5年,https://jobs.zhaopin.com/CZ644239030J00261748208.htm 40 | Python开发,深圳德聚企业管理咨询有限公司,100-499人,民营,https://company.zhaopin.com/CZ413479080.htm,深圳,15K-30K,大专,3-5年,https://jobs.zhaopin.com/413479081250149.htm 41 | 高级python,深圳德聚企业管理咨询有限公司,100-499人,民营,https://company.zhaopin.com/CZ413479080.htm,深圳-南山区,20K-40K,本科,不限,https://jobs.zhaopin.com/413479081250103.htm 42 | Python运维开发中级讲师,北京千锋互联科技有限公司深圳分公司,1000-9999人,民营,https://company.zhaopin.com/CZ556809120.htm,深圳-宝安区,18K-30K,本科,3-5年,https://jobs.zhaopin.com/CC556809127J00373127101.htm 43 | python工程师,北京嘉连勤科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ475451320.htm,深圳-南山区,8K-15K,学历不限,1-3年,https://jobs.zhaopin.com/CC475451320J00131518714.htm 44 | PHP/Python软件工程师,深圳市蜂窝网络有限公司,20-99人,民营,https://company.zhaopin.com/CZ517946720.htm,深圳,10K-15K,本科,1-3年,https://jobs.zhaopin.com/CC517946724J00320176301.htm 45 | 急聘Python开发工程师 ,深圳神州讯盟软件有限公司,20人以下,民营,https://company.zhaopin.com/CZ248856010.htm,深圳-龙华新区,8K-10K,本科,1-3年,https://jobs.zhaopin.com/CC248856017J00292840102.htm 46 | "日语Java,C,Python,Sap,DB软件开发工程师",Uniqsys优尼卡日本株式会社,20-99人,外商独资,https://company.zhaopin.com/CZ842910190.htm,深圳,20K-30K,本科,不限,https://jobs.zhaopin.com/CC842910190J00173666004.htm 47 | python高级工程师,深圳市美铁科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ711331820.htm,深圳,15K-20K,本科,1-3年,https://jobs.zhaopin.com/CC711331825J00332456307.htm 48 | Python开发,前海泰坦科技(深圳)有限公司,20-99人,民营,https://company.zhaopin.com/CZ739677640.htm,深圳-福田区,10K-15K,本科,3-5年,https://jobs.zhaopin.com/CZ739677640J00416464201.htm 49 | python中级开发工程师,深圳新河通创科技有限公司,500-999人,其它,https://company.zhaopin.com/CZ897911160.htm,深圳-福田区,10K-15K,大专,3-5年,https://jobs.zhaopin.com/CZ897911160J00205978405.htm 50 | python web工程师,深圳市南伽科技信息有限公司,100-499人,民营,https://company.zhaopin.com/CZ616068220.htm,深圳,8K-16K,本科,1-3年,https://jobs.zhaopin.com/616068226250060.htm 51 | python高级开发工程师,深圳市南伽科技信息有限公司,100-499人,民营,https://company.zhaopin.com/CZ616068220.htm,深圳,9K-15K,本科,3-5年,https://jobs.zhaopin.com/CC616068226J00057649707.htm 52 | python工程师,杭州同帆科技有限公司,100-499人,民营,https://company.zhaopin.com/CZ512025020.htm,深圳-南山区,10K-15K,本科,1-3年,https://jobs.zhaopin.com/CC512025027J00287262707.htm 53 | python初级开发工程师,深圳市神州动力数码有限公司,100-499人,民营,https://company.zhaopin.com/CZ145341820.htm,深圳,4K-8K,大专,不限,https://jobs.zhaopin.com/CC145341829J00267045107.htm 
54 | 协议分析师(python爬虫方向),尼尔林技术咨询(深圳)有限公司,20-99人,外商独资,https://company.zhaopin.com/CZ328908510.htm,深圳-南山区,15K-25K,本科,不限,https://jobs.zhaopin.com/CC328908511J00229100104.htm 55 | python开发,深圳市南伽科技信息有限公司,100-499人,民营,https://company.zhaopin.com/CZ616068220.htm,深圳,7K-14K,本科,1-3年,https://jobs.zhaopin.com/CC616068226J00127137107.htm 56 | Python工程师,深圳市南伽科技信息有限公司,100-499人,民营,https://company.zhaopin.com/CZ616068220.htm,深圳,10K-20K,学历不限,3-5年,https://jobs.zhaopin.com/616068226250044.htm 57 | Python工程师,深圳市神经云网络科技有限公司,100-499人,外商独资,https://company.zhaopin.com/CZ578097080.htm,深圳,15K-30K,大专,1-3年,https://jobs.zhaopin.com/CC578097081J00198102905.htm 58 | python开发工程师,深圳市瑞驰信息技术有限公司,100-499人,民营,https://company.zhaopin.com/CZ298519280.htm,深圳-南山区,10K-20K,大专,3-5年,https://jobs.zhaopin.com/298519281250039.htm 59 | python后台开发工程师,深圳市美鸿电子有限公司,20-99人,民营,https://company.zhaopin.com/CZ718489740.htm,深圳,16K-22K,本科,3-5年,https://jobs.zhaopin.com/CZ718489740J00373815407.htm 60 | python数据分析,深圳市美鸿电子有限公司,20-99人,民营,https://company.zhaopin.com/CZ718489740.htm,深圳,15K-20K,本科,3-5年,https://jobs.zhaopin.com/CZ718489740J00373817807.htm 61 | 服务器软件开发工程师(Python方向),深圳市数联信息科技有限公司,20-99人,不限,https://company.zhaopin.com/CZ823276440.htm,深圳-南山区,15K-20K,本科,3-5年,https://jobs.zhaopin.com/CZ823276440J00393859505.htm 62 | python开发工程师,深圳市比一比网络科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ572165720.htm,深圳,10K-20K,大专,3-5年,https://jobs.zhaopin.com/CC572165726J00394238905.htm 63 | python工程师,深圳市比一比网络科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ572165720.htm,深圳,10K-20K,大专,3-5年,https://jobs.zhaopin.com/CC572165726J00394237005.htm 64 | python&Flask 微信公众号软件开发兼职,深圳市薄荷阅读科技有限公司,20人以下,民营,https://company.zhaopin.com/CZ632943280.htm,深圳,4K-6K,学历不限,不限,https://jobs.zhaopin.com/CZ632943280J00174588601.htm 65 | python量化开发工程师,深圳矩心科技有限公司,20人以下,股份制企业,https://company.zhaopin.com/CZ874879190.htm,深圳,8K-13K,大专,3-5年,https://jobs.zhaopin.com/CC874879190J00280523208.htm 66 | 视觉及深度学习算法工程师,深圳飞科机器人有限公司,20-99人,不限,https://company.zhaopin.com/CZ701390880.htm,深圳,15K-25K,本科,3-5年,https://jobs.zhaopin.com/CZ701390880J00421223601.htm 67 | 乐高研发老师,深圳百智教育科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ841629950.htm,深圳,8K-15K,大专,1-3年,https://jobs.zhaopin.com/CC841629950J00283614304.htm 68 | 数据库管理处数据库管理员(J10017),富德保险控股股份有限公司,10000人以上,股份制企业,https://company.zhaopin.com/CZ298522580.htm,深圳,8K-15K,本科,不限,https://jobs.zhaopin.com/CC298522587J00276619903.htm 69 | 软件测试岗,富德保险控股股份有限公司,10000人以上,股份制企业,https://company.zhaopin.com/CZ298522580.htm,深圳,10K-15K,本科,3-5年,https://jobs.zhaopin.com/298522587250081.htm 70 | 工程师助理 视频录制剪辑 文档工程师,深圳市微雪电子有限公司,20-99人,民营,http://special.zhaopin.com/2018/shz/11test/szsw090518,深圳,5K-8K,大专,不限,https://jobs.zhaopin.com/CC401347719J00266136404.htm 71 | 运维工程师,北京魅动力教育咨询有限公司,100-499人,民营,https://company.zhaopin.com/CZ487000820.htm,深圳-南山区,12K-20K,本科,3-5年,https://jobs.zhaopin.com/CC487000829J00172325513.htm 72 | 大数据应用部资深数据应用研发岗,富德保险控股股份有限公司,10000人以上,股份制企业,https://company.zhaopin.com/CZ298522580.htm,深圳,9K-18K,本科,3-5年,https://jobs.zhaopin.com/298522587250124.htm 73 | 数据分析专员,深圳市赛益莱特科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ814100400.htm,深圳-南山区,5K-8K,大专,不限,https://jobs.zhaopin.com/CC814100400J00201589901.htm 74 | 服务器管理员岗,富德保险控股股份有限公司,10000人以上,股份制企业,https://company.zhaopin.com/CZ298522580.htm,深圳,9K-15K,本科,3-5年,https://jobs.zhaopin.com/CC298522587J00300066803.htm 75 | 软件测试工程师(招商金科),深圳市博悦科创科技有限公司,500-999人,民营,https://company.zhaopin.com/CZ435485810.htm,深圳-福田区,10K-15K,本科,1-3年,https://jobs.zhaopin.com/CC435485814J00249600106.htm 76 | 
RD2-测试工程师,深圳银澎云计算有限公司,500-999人,股份制企业,https://company.zhaopin.com/CZ609532620.htm,深圳,10K-15K,本科,3-5年,https://jobs.zhaopin.com/CC609532620J00242978907.htm 77 | RD2-测试工程师(服务器),深圳银澎云计算有限公司,500-999人,股份制企业,https://company.zhaopin.com/CZ609532620.htm,深圳,10K-15K,本科,3-5年,https://jobs.zhaopin.com/CC609532620J00246531807.htm 78 | c++开发工程师,深圳市新类型科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ826945830.htm,深圳,15K-30K,本科,5-10年,https://jobs.zhaopin.com/CZ826945830J00295046108.htm 79 | 测试工程师中级,小欧科技(珠海市)有限责任公司,20-99人,民营,https://company.zhaopin.com/CZ880922430.htm,深圳,10K-15K,大专,3-5年,https://jobs.zhaopin.com/CC880922430J00391273205.htm 80 | RD3-测试工程师(音视频测试),深圳银澎云计算有限公司,500-999人,股份制企业,https://company.zhaopin.com/CZ609532620.htm,深圳,10K-18K,本科,3-5年,https://jobs.zhaopin.com/CC609532620J00356739707.htm 81 | 网络及安全工程师,恒大人寿保险有限公司,1000-9999人,合资,https://company.zhaopin.com/CZ121064920.htm,深圳-南山区,15K-20K,本科,3-5年,https://jobs.zhaopin.com/CC121064921J00381589505.htm 82 | devops工程师,深圳市新类型科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ826945830.htm,深圳,15K-30K,本科,5-10年,https://jobs.zhaopin.com/CZ826945830J00295050808.htm 83 | 测试工程师,北京魅动力教育咨询有限公司,100-499人,民营,https://company.zhaopin.com/CZ487000820.htm,深圳-南山区,12K-18K,本科,3-5年,https://jobs.zhaopin.com/CC487000829J00172326313.htm 84 | 软件测试工程师,深圳睿世达信息科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ483573220.htm,深圳-福田区,8K-10K,大专,3-5年,https://jobs.zhaopin.com/483573223250154.htm 85 | java高级开发工程师,春禾(深圳)自动化技术有限公司,100-499人,民营,https://company.zhaopin.com/CZ548753130.htm,深圳-光明新区,10K-20K,本科,1-3年,https://jobs.zhaopin.com/CZ548753130J00216868602.htm 86 | 算法研究生(计算机视觉方向),深圳市方直科技股份有限公司,100-499人,民营,https://company.zhaopin.com/CZ143270300.htm,深圳-南山区,15K-22K,硕士,不限,https://jobs.zhaopin.com/CC143270306J00372626607.htm 87 | 前端/小程序开发工程师,深圳市大律科技有限公司,20-99人,民营,https://company.zhaopin.com/CZ422692680.htm,深圳,10K-15K,本科,3-5年,https://jobs.zhaopin.com/CZ422692680J00378070905.htm 88 | 算法工程师,深圳市铁越电气有限公司,100-499人,民营,https://company.zhaopin.com/CZ145536890.htm,深圳,11K-19K,本科,1-3年,https://jobs.zhaopin.com/CC145536893J00125372602.htm 89 | 运维工程师,深圳市房一族网络科技有限公司,100-499人,民营,https://company.zhaopin.com/CZ815431260.htm,深圳,13K-16K,本科,3-5年,https://jobs.zhaopin.com/CC815431260J00321222307.htm 90 | 生物信息工程师,广东美格基因科技有限公司,20-99人,其它,http://special.zhaopin.com/pagepublish/47718393/index.html,深圳-龙岗区,6K-12K,本科,不限,https://jobs.zhaopin.com/CC477183933J00293271902.htm 91 | 强化学习,真玫智能科技(深圳)有限公司,100-499人,民营,https://company.zhaopin.com/CZ330043480.htm,深圳-南山区,15K-30K,本科,3-5年,https://jobs.zhaopin.com/CC330043489J00293303305.htm 92 | -------------------------------------------------------------------------------- /exe_file/show.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/show.png -------------------------------------------------------------------------------- /exe_file/xinggan/hpic408_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/hpic408_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic12973_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic12973_s.jpg 
-------------------------------------------------------------------------------- /exe_file/xinggan/zzpic13004_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic13004_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic13068_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic13068_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic13087_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic13087_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic13131_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic13131_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic13242_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic13242_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic13256_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic13256_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic13424_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic13424_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic13487_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic13487_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic13589_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic13589_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic13628_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic13628_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic13668_s.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic13668_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic13710_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic13710_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic13772_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic13772_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic13941_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic13941_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic14042_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic14042_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic14131_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic14131_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic14178_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic14178_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic14185_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic14185_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic14298_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic14298_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic14358_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic14358_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic14425_s.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic14425_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic14458_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic14458_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic14479_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic14479_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic14568_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic14568_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic14603_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic14603_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic14638_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic14638_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic14802_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic14802_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic14872_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic14872_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic14965_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic14965_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic15059_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic15059_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic15084_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic15084_s.jpg 
-------------------------------------------------------------------------------- /exe_file/xinggan/zzpic15247_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic15247_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic15324_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic15324_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic15420_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic15420_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic15469_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic15469_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic15567_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic15567_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic15608_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic15608_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic15786_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic15786_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic15891_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic15891_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic15920_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic15920_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16049_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16049_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16135_s.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16135_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16191_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16191_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16240_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16240_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16394_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16394_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16406_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16406_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16566_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16566_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16638_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16638_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16686_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16686_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16786_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16786_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16807_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16807_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16817_s.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16817_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16857_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16857_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16889_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16889_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16921_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16921_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic16949_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic16949_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic17052_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic17052_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic17175_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic17175_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic17202_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic17202_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic17322_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic17322_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic17359_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic17359_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic17378_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic17378_s.jpg 
-------------------------------------------------------------------------------- /exe_file/xinggan/zzpic17442_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic17442_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic17558_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic17558_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic17615_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic17615_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic17727_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic17727_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic17778_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic17778_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic17797_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic17797_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic17879_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic17879_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic17946_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic17946_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic18038_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic18038_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic18089_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic18089_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic18110_s.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic18110_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic18144_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic18144_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic18308_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic18308_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic18433_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic18433_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic18631_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic18631_s.jpg -------------------------------------------------------------------------------- /exe_file/xinggan/zzpic18883_s.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/exe_file/xinggan/zzpic18883_s.jpg -------------------------------------------------------------------------------- /exe_file/xpath.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | xpath Test 6 | 7 | 8 |
    [xpath.html lines 9-49: the tag markup was not preserved in this dump; the recoverable text content of the test fixture is listed below]
    火药 指南针 印刷术 造纸术
    • 停车坐爱枫林晚,霜叶红于二月花
    • 商女不知亡国恨,隔江犹唱后庭花
    • 一骑红尘妃子笑,无人知是荔枝来
    • 葡萄美酒夜光杯,欲饮琵琶马上催
    • 百度一下
    1. 寻寻觅觅,冷冷清清,凄凄惨惨戚戚
    2. 乍暖还寒时候,最难将息
    3. 三杯两盏淡酒
    4. 怎敌他晚来风
    5. 雁过也,正伤心,却是旧时相识
    6. 爱就一个字,我只说一次
    7. 爱情36计,我要立刻美丽
    50 | 51 | 52 | -------------------------------------------------------------------------------- /fillder.py: -------------------------------------------------------------------------------- 1 | """ 2 | code by python3.7,utf8 3 | author:caixiaoxin_ 4 | """ 5 | """ 6 | post: 7 | 表单数据的处理:form_data = urllib.parse.urlencode(form_data).encode() 8 | fillder抓包,带箭头小本表示post 9 | 10 | fillder对json的常见查看 11 | 请求部分: 12 | WebForms:查看post请求表单,用于构造post清单 13 | Raw:查看头部信息,构造headers 14 | response部分: 15 | headers-Content-Encoding:查看response的编码----先查就不会有下面的错误了 16 | JSON:查看JSON解析 17 | """ 18 | 19 | 20 | import urllib.request 21 | import urllib.parse 22 | 23 | post_url = 'https://fanyi.baidu.com/sug' 24 | word = 'baby' 25 | 26 | # 构建post表单数据 27 | form_data = { 28 | 'kw':word, 29 | } 30 | 31 | """ 32 | form_data = urllib.parse.urlencode(form_data) 33 | 报错:POST data should be bytes 34 | urlencode结果是utf8,但是post数据规定为字节类型 35 | 需要再使用encode转成字节型 36 | """ 37 | 38 | form_data = urllib.parse.urlencode(form_data).encode() 39 | 40 | #发送请求 41 | headers = { 42 | 'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36", 43 | } 44 | 45 | request = urllib.request.Request(url=post_url,headers=headers) 46 | response = urllib.request.urlopen(request,data = form_data) 47 | print(response.read().decode()) 48 | 49 | 50 | 51 | 52 | 53 | # 获得完整的百度翻译json 54 | post_url = 'https://fanyi.baidu.com/v2transapi' 55 | form_data = { 56 | 'from': 'en', 57 | 'to': 'zh', 58 | 'query': 'wolf', 59 | 'transtype': 'realtime', 60 | 'simple_means_flag': '3', 61 | 'sign': '275695.55262', 62 | 'token': 'd7627f387f6d0d573368943337783227', 63 | } 64 | headers = { 65 | 'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36", 66 | 'Host': 'fanyi.baidu.com', 67 | 'Connection': 'keep-alive', 68 | # 'Content-Length': '120', 69 | 'Accept': '*/*', 70 | 'Origin': 'https://fanyi.baidu.com', 71 | 'X-Requested-With': 'XMLHttpRequest', 72 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36', 73 | 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', 74 | 'Referer': 'https://fanyi.baidu.com/', 75 | # 'Accept-Encoding': 'gzip, deflate, br', 76 | 'Accept-Language': 'zh-CN,zh;q=0.9', 77 | 'Cookie': 'BAIDUID=C609024C7FB6D201F3FDA13AB612DCCD:FG=1; ' 78 | 'BIDUPSID=C609024C7FB6D201F3FDA13AB612DCCD; PSTM=1548991277; ' 79 | 'BDUSS=WdmNU96am9Hc3hNc2J5Mn5DZWFsS3hEYmV4c0lGYWJIM1VEekFPdWxpY' 80 | 'mI1WDFjQVFBQUFBJCQAAAAAAAAAAAEAAAAA0Kows~bK28j8tvu6xTg4AAAAAAAA' 81 | 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAANtYVlzbWFZcS; ' 82 | 'delPer=0; PSINO=6; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm; BDRCVFR[tox4WRQ4-Km]=mk3SLVN4HKm; ' 83 | 'BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; ' 84 | 'H_PS_PSSID=1445_21103_18560_28585_28557_28519_20718; locale=zh; REALTIME_TRANS_SWITCH=1; ' 85 | 'FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; Hm_lvt_64ecd82404c51e03d' 86 | 'c91cb9e8c025574=1551442945,1551442984; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1551442984; to_lang_often=%5B%7B%22value' 87 | '%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%2C%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%5D; ' 88 | 'from_lang_often=%5B%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%2C%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%5D', 89 | 90 | 
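    # 'Accept-Encoding' is deliberately left commented out above: if the server is
    # allowed to gzip the body, response.read().decode() below raises the
    # UnicodeDecodeError described in the note after this dict.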
91 | } 92 | """ 93 | 2 94 | UnicodeDecodeError: 'utf-8' codec can't decode byte 0x8b in position 1: invalid start byte 95 | 完成1后编码错误 96 | 97 | 'Accept-Encoding': 'gzip, deflate, br',注意此条,zip是压缩格式 98 | 屏蔽该头部信息就能得到json,浏览器和fidder自动解json,但是在py中只能借助json在线解析 99 | https://www.json.cn/ 100 | """ 101 | 102 | #提交post三部曲 103 | request = urllib.request.Request(url = post_url,headers=headers) #构建请求对象:也就是伪装headers 104 | form_data = urllib.parse.urlencode(form_data).encode() #构建post清单 105 | response = urllib.request.urlopen(request,data=form_data) #获取url链接 106 | print(response.read().decode()) 107 | 108 | """ 109 | 1 110 | {"error":997,"from":"en","to":"zh","query":"wolf"} 111 | 暴力构造post失败,正常情况下的error是0 112 | 解决方法:将fidder中RAW的全部信息拷贝到headers,如上 113 | """ 114 | -------------------------------------------------------------------------------- /meizhuo_crawler.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import random 3 | import os 4 | from bs4 import BeautifulSoup 5 | import threading 6 | 7 | class crawler_pic(threading.Thread): 8 | begin_index = 0 # 起始页面 9 | end_index = 0 # 终止页 10 | grads = 20 # 爬取梯度:每个线程爬虫需要执行的爬取页数 11 | # 链接 12 | base_url = "http://www.win4000.com/wallpaper_big_154{}.html" 13 | # 图片保存根目录 14 | file_root = "D://pics_multi//" 15 | # 伪装浏览器 16 | UA = [ 17 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 18 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 19 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 20 | "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1" 21 | ] 22 | # 随机构造头部信息 23 | headers = { 24 | "User-Agent": random.choice(UA) 25 | } 26 | def __init__(self, name, begin): 27 | threading.Thread.__init__(self) 28 | self.name = name 29 | self.begin_index = begin 30 | self.end_index = begin + self.grads 31 | # 获取 32 | def get_html(self, url): 33 | try: 34 | HTML = requests.get(url,headers=self.headers) 35 | HTML.raise_for_status() 36 | HTML.encoding = HTML.apparent_encoding 37 | return HTML.text 38 | except: 39 | print("In "+self.name+":ERROR Load "+url) 40 | return "NULL" 41 | # 将获取的图片存储至根目录下 42 | def store_pics(self,pic_urls): 43 | fileName = pic_urls[0]+"//" 44 | for picurl in pic_urls[1:]: 45 | # 构造图片存储地址 46 | path = self.file_root + fileName + picurl.split('/')[-1] 47 | print(path) 48 | 49 | try: 50 | # 需要逐层创建目录 51 | if not os.path.exists(self.file_root): 52 | os.mkdir(self.file_root) 53 | # 如无该目录,先行构建 54 | if not os.path.exists(self.file_root+fileName): 55 | os.mkdir(self.file_root+fileName) 56 | # 图片存在,不重复保存 57 | # 不存在,创建 58 | if not os.path.exists(path): 59 | pic = requests.get(picurl) 60 | with open(path, 'wb') as f: 61 | f.write(pic.content) 62 | f.close() 63 | print("图片:" + picurl + " 成功下载") 64 | else: 65 | print("图片已存在") 66 | except: 67 | print("爬取失败") 68 | return 1 69 | 70 | # 在html页面中获取图片链接,返回链接列表 71 | def get_pic_urls(self, HTML): 72 | 73 | pic_urls = ["filename"] 74 | soup = BeautifulSoup(HTML, "html.parser") 75 | """ 76 | 页面分析: 77 | 图片链接位于标签
    <div id="picBox" class="picBox"> --
  • -- [href:pic_url] 78 | 获取最上层:div 全部子孙标签 选取a 获取a的属性信息 79 | """ 80 | for tag in soup.find("div", attrs={"id": "picBox", "class": "picBox"}).descendants: 81 | if tag.name == 'img': 82 | pic_urls.append(tag.attrs['src']) 83 | pic_urls[0] = tag.attrs['title'] 84 | """ 85 | for a_tag in soup.find("div", attrs={"id": "picBox", "class": "picBox"}).findAll("a"): 86 | pic_urls.append(a_tag.attrs['href']) 87 | """ 88 | # 全局,记录图片数量 89 | global pic_num 90 | pic_num += len(pic_urls) - 1 91 | return pic_urls 92 | 93 | # 线程方法 94 | def run(self): 95 | # 爬取一遍分配的页面 96 | for i in range(self.begin_index,self.end_index): 97 | html = self.get_html(self.base_url.format(i)) 98 | # 页面爬取成功的情况下获取图片链接 99 | if html != "NULL": 100 | pic_urls = self.get_pic_urls(html) 101 | self.store_pics(pic_urls) 102 | """ 103 | for pic in pic_urls: 104 | print("in "+self.name+":"+pic) 105 | """ 106 | 107 | 108 | if __name__ == '__main__': 109 | 110 | threads = [] 111 | count = 0 112 | pic_num = 0 113 | # 构造爬虫 114 | for begin in range(700,900,20): 115 | threads.append(crawler_pic("Thread-begin:"+str(begin),begin)) 116 | 117 | # 开始爬取 118 | for thread in threads: 119 | thread.start() 120 | 121 | for thread in threads: 122 | thread.join() 123 | 124 | 125 | print(pic_num) 126 | -------------------------------------------------------------------------------- /pictureCrawler/PictureDown.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re 3 | import os 4 | import time 5 | url_root = 'http://www.win4000.com/wallpaper_big_154' 6 | # http://www.win4000.com/wallpaper_big_154(3bits).html 7 | user = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"} 8 | pattern = re.compile(r'http://pic1.win4000.com/wallpaper/[\w|-]+/[\w]+.jpg') 9 | 10 | def get_picture_url(suffix): 11 | try: 12 | url = url_root + str(suffix) + ".html" 13 | print(url) 14 | r = requests.get(url,headers = user) 15 | r.raise_for_status() 16 | validpart = r.text.split('当前位置')[-1] 17 | validpart = validpart.split('listBox')[0] 18 | picurl_list = pattern.findall(validpart) 19 | return picurl_list 20 | except: 21 | print("ERROR") 22 | return ["NULL"] 23 | 24 | def store_pic(picurl_list): 25 | 26 | if "NULL" in picurl_list: 27 | return 0 28 | file_root = "D://pics//" 29 | 30 | for picurl in picurl_list: 31 | path = file_root + picurl.split('/')[-1] 32 | try: 33 | if not os.path.exists(file_root): 34 | os.mkdir(file_root) 35 | if not os.path.exists(path): 36 | pic = requests.get(picurl) 37 | with open(path,'wb') as f: 38 | f.write(pic.content) 39 | f.close() 40 | print("图片:"+picurl+" 成功下载") 41 | else: 42 | print("图片已存在") 43 | except: 44 | print("爬取失败") 45 | return 1 46 | 47 | if __name__ == '__main__': 48 | for suffix in range(800,900): 49 | store_pic(get_picture_url(suffix)) 50 | time.sleep(5) -------------------------------------------------------------------------------- /pictureCrawler/depthPicCrawler.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import random 3 | import _thread 4 | import threading 5 | import re 6 | from bs4 import BeautifulSoup 7 | UA = [ 8 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 9 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 10 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) 
Gecko/20100101 Firefox/4.0.1", 11 | "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1" 12 | ] 13 | # 随机构造头部信息 14 | headers = { 15 | "User-Agent": random.choice(UA) 16 | } 17 | global thread_max_num 18 | thread_max_num = 20 19 | init_links = ['http://www.win4000.com/wallpaper_193_0_0_1.html', 'http://www.win4000.com/wallpaper_0_0_0_1.html', 20 | 'http://www.win4000.com/hj/haolanzhuan.html', 'http://www.win4000.com/wallpaper_192_0_0_1.html', 21 | 'http://www.win4000.com/wallpaper.html', 'http://www.win4000.com/wallpaper_detail_155224.html', 'http://www.win4000.com/mt/index.html', 22 | 'http://www.win4000.com/wallpaper_201_0_0_1.html', 'http://www.win4000.com/meitu.html', 23 | 'http://www.win4000.com/wallpaper_197_0_0_1.html', 'http://www.win4000.com/wallpaper_195_0_0_1.html', 24 | 'http://www.win4000.com/mobile.html', 'http://www.win4000.com/retu.html','http://www.win4000.com' 25 | 'http://www.win4000.com/wallpaper_194_0_0_1.html', 'http://www.win4000.com/zt/index.html', 26 | 'http://www.win4000.com/hj/index.html', 'http://www.win4000.com/wallpaper_191_0_0_1.html', 27 | 'http://www.win4000.com/wallpaper_196_0_0_1.html', 'http://www.win4000.com/mt/star.html'] 28 | pages = set() 29 | class myThread(threading.Thread): 30 | def __init__(self,name,url): 31 | threading.Thread.__init__(self) 32 | self.name = name 33 | self.url = url 34 | def run(self): 35 | crawler(self.name,self.url,1) 36 | 37 | def get_html(url): 38 | try: 39 | HTML = requests.get(url, headers=headers) 40 | HTML.raise_for_status() 41 | HTML.encoding=HTML.apparent_encoding 42 | return HTML.text 43 | except: 44 | # print("ERROR:"+url) 45 | return "NULL" 46 | 47 | 48 | def crawler(thread_name,url,depth): 49 | if depth > 20: 50 | return 51 | demo = get_html(url) 52 | try: 53 | soup = BeautifulSoup(demo,"html.parser") 54 | get_pic_url(soup) 55 | for link in soup.findAll("a",href=re.compile("http://www.win4000.com/[\S]*.html")): 56 | if "href" in link.attrs: 57 | if link.attrs['href'] not in pages: 58 | newpage = link.attrs['href'] 59 | pages.add(newpage) 60 | crawler(thread_name,newpage,depth+1) 61 | 62 | 63 | except: 64 | print("e!") 65 | pass 66 | #
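# The per-wallpaper download link sits in <div class="paper-down"><a href="...">;
# get_pic_url() below extracts that href plus the <h1> title for every page crawled.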
    67 | # 下载图片 68 | pic_urls = set() 69 | def get_pic_url(soup): 70 | try: 71 | a_tag = soup.find("div",attrs={"class":"paper-down"}).a 72 | if "href" in a_tag.attrs: 73 | pic_url = a_tag['href'] 74 | if pic_url not in pic_urls: 75 | title = soup.find("h1").string 76 | pic_urls.add(pic_url) 77 | print("NO."+str(len(pic_urls))+"-"+title+":"+pic_url) 78 | except: 79 | pass 80 | 81 | 82 | threads = [] 83 | counter = 0 84 | for link in init_links: 85 | threads.append(myThread(str(counter),link)) 86 | counter += 1 87 | for thread in threads: 88 | thread.start() 89 | for thread in threads: 90 | thread.join() 91 | 92 | """ 93 | url = "http://www.win4000.com/wallpaper_detail_40709.html" 94 | demo = get_html(url) 95 | soup = BeautifulSoup(demo,"html.parser") 96 | get_pic_url(soup) 97 | """ -------------------------------------------------------------------------------- /pictureCrawler/informationMark.py: -------------------------------------------------------------------------------- 1 | # XML 2 | """ 3 | ... 4 | 缩写 5 | 注释 6 | """ 7 | #JSON 有类型的键值对 8 | """ 9 | "key" : "value" 10 | "key" : ["value1","value2"] 11 | "key" : {"subkey" : "subvalue"} 嵌套键值对采用花括号 12 | """ 13 | 14 | # YAML 缩进体现所属关系 15 | """ 16 | 1: "|"表示整块可跨行信息 17 | 18 | key : value 19 | key : #comment 20 | - value1 21 | - value2 22 | key : 23 | subkey : subvalue 24 | """ 25 | 26 | from bs4 import BeautifulSoup 27 | import requests 28 | import re 29 | 30 | r = requests.get("https://st.58.com/chuzu/?PGTID=0d100000-0030-f99b-60c3-61bb358828a0&ClickID=3") 31 | demo = r.text 32 | soup = BeautifulSoup(demo,"html.parser") 33 | # print(soup.prettify()) 34 | """ 35 | for link in soup.find_all('a'): 36 | print(link.get('href')) 37 | """ 38 | allTag=[] 39 | for tag in soup.find_all("div","des"): 40 | a_soup = BeautifulSoup(tag.text,"html.parser") 41 | for a_tag in a_soup("a",tongji_label="listclick", 42 | onclick="clickLog('from=fcpc_zflist_gzcount');", 43 | target="_blank",rel="nofollow"): 44 | print(str(a_tag.string).strip()) 45 | for a_tag in soup.find_all("p","room strongbox"): 46 | print(str(a_tag.string).strip()) 47 | 48 | 49 | # 利用正则搜索 50 | allTag = [] 51 | for tag in soup.find_all(re.compile('h')): 52 | if tag.name not in allTag: 53 | allTag.append(tag.name) 54 | # print(tag.name) 55 | else: 56 | pass 57 | 58 | # 重点!!! 
59 | # 规定标签"img" 及标签属性 alt="孟子义写真图片高清桌面壁纸" 可准确找到所查找信息,"孟子义写真图片高清桌面壁纸"是准确匹配 60 | # 模糊匹配用正则 61 | for tag in soup.find_all("img",alt="孟子义写真图片高清桌面壁纸"): 62 | print(tag.get('src')) -------------------------------------------------------------------------------- /pictureCrawler/multiPicDown.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import random 3 | import os 4 | from bs4 import BeautifulSoup 5 | import threading 6 | 7 | class crawler_pic(threading.Thread): 8 | begin_index = 0 # 起始页面 9 | end_index = 0 # 终止页 10 | grads = 20 # 爬取梯度:每个线程爬虫需要执行的爬取页数 11 | # 链接 12 | base_url = "http://www.win4000.com/wallpaper_big_154{}.html" 13 | # 图片保存根目录 14 | file_root = "D://pics_multi//" 15 | # 伪装浏览器 16 | UA = [ 17 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 18 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 19 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 20 | "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1" 21 | ] 22 | # 随机构造头部信息 23 | headers = { 24 | "User-Agent": random.choice(UA) 25 | } 26 | def __init__(self, name, begin): 27 | threading.Thread.__init__(self) 28 | self.name = name 29 | self.begin_index = begin 30 | self.end_index = begin + self.grads 31 | # 获取 32 | def get_html(self, url): 33 | try: 34 | HTML = requests.get(url,headers=self.headers) 35 | HTML.raise_for_status() 36 | HTML.encoding = HTML.apparent_encoding 37 | return HTML.text 38 | except: 39 | print("In "+self.name+":ERROR Load "+url) 40 | return "NULL" 41 | # 将获取的图片存储至根目录下 42 | def store_pics(self,pic_urls): 43 | fileName = pic_urls[0]+"//" 44 | for picurl in pic_urls[1:]: 45 | # 构造图片存储地址 46 | path = self.file_root + fileName + picurl.split('/')[-1] 47 | print(path) 48 | 49 | try: 50 | # 需要逐层创建目录 51 | if not os.path.exists(self.file_root): 52 | os.mkdir(self.file_root) 53 | # 如无该目录,先行构建 54 | if not os.path.exists(self.file_root+fileName): 55 | os.mkdir(self.file_root+fileName) 56 | # 图片存在,不重复保存 57 | # 不存在,创建 58 | if not os.path.exists(path): 59 | # request获取图片内容 60 | pic = requests.get(picurl) 61 | with open(path, 'wb') as f: 62 | f.write(pic.content) 63 | f.close() 64 | print("图片:" + picurl + " 成功下载") 65 | else: 66 | print("图片已存在") 67 | except: 68 | print("爬取失败") 69 | return 1 70 | 71 | # 在html页面中获取图片链接,返回链接列表 72 | def get_pic_urls(self, HTML): 73 | 74 | pic_urls = ["filename"] 75 | soup = BeautifulSoup(HTML, "html.parser") 76 | """ 77 | 页面分析: 78 | 图片链接位于标签
    <div id="picBox" class="picBox"> --
  • -- [href:pic_url] 79 | 获取最上层:div 全部子孙标签 选取a 获取a的属性信息 80 | """ 81 | for tag in soup.find("div", attrs={"id": "picBox", "class": "picBox"}).descendants: 82 | if tag.name == 'img': 83 | pic_urls.append(tag.attrs['src']) 84 | pic_urls[0] = tag.attrs['title'] 85 | """ 86 | for a_tag in soup.find("div", attrs={"id": "picBox", "class": "picBox"}).findAll("a"): 87 | pic_urls.append(a_tag.attrs['href']) 88 | """ 89 | # 全局,记录图片数量 90 | global pic_num 91 | pic_num += len(pic_urls) - 1 92 | return pic_urls 93 | 94 | # 线程方法 95 | def run(self): 96 | # 爬取一遍分配的页面 97 | for i in range(self.begin_index,self.end_index): 98 | html = self.get_html(self.base_url.format(i)) 99 | # 页面爬取成功的情况下获取图片链接 100 | if html != "NULL": 101 | pic_urls = self.get_pic_urls(html) 102 | self.store_pics(pic_urls) 103 | """ 104 | for pic in pic_urls: 105 | print("in "+self.name+":"+pic) 106 | """ 107 | 108 | 109 | if __name__ == '__main__': 110 | 111 | threads = [] 112 | count = 0 113 | pic_num = 0 114 | # 构造爬虫 115 | for begin in range(700,900,20): 116 | threads.append(crawler_pic("Thread-begin:"+str(begin),begin)) 117 | 118 | # 开始爬取 119 | for thread in threads: 120 | thread.start() 121 | 122 | for thread in threads: 123 | thread.join() 124 | 125 | 126 | print(pic_num) -------------------------------------------------------------------------------- /tesseract训练模型/0-9A-Z.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/tesseract训练模型/0-9A-Z.png -------------------------------------------------------------------------------- /tesseract训练模型/README.md: -------------------------------------------------------------------------------- 1 | # 验证码识别-训练字库 2 | ## Purpose 3 | >验证码的存在使得在模拟登陆时会遇到障碍,通过训练通用验证码识别库,可以通过加入几行代码轻松登陆,对网页进行测试\ 4 | >通过建立特征字符库,逐层加入识别错误的验证码进行补充训练,可以在三次扩充样本训练后达到90%以上识别率\ 5 | >![验证码样例](oriCode.png) 6 | ## Tools 7 | >**Tesseract** 8 | >>**基于OCR原理**:Optical Character Recognition,光学字符识别,是指通过扫描字符,然后通过其形状将其翻译成电子文本的过程。\ 9 | 对于图形验证码来说,它们都是一些不规则的字符,这些字符确实是由字符稍加扭曲变换得到的内容。\ 10 | >>[Windows安装Tesseract-OCR 4.00并配置环境变量](https://segmentfault.com/a/1190000014086067):现在最新版本为5.00,安装及配置方法一致\ 11 | >>[tesseract v4.0.0 帮助文档解读](https://blog.csdn.net/qq_32674197/article/details/80744783) 12 | 13 | >**jTessBoxEditor** 14 | >>安装后记得把ocr目录下文件全部替换成Tesseract的OCR目录下的文件 15 | 16 | >**pytesseract** 17 | >>安装:**pip install pytesseract**\ 18 | >>简介:基于Tessract的图片光学字符识别库\ 19 | >>[官方文档](https://pypi.org/project/pytesseract/) 20 | 21 | 22 | >**PIL** 23 | >>类似与OpenCV的图像处理库,用于验证码的预处理\ 24 | >>[PIL介绍](https://www.cnblogs.com/lyrichu/p/9124504.html)\ 25 | >>[Python图像处理库PIL的ImageEnhance模块介绍](https://blog.csdn.net/icamera0/article/details/50753705) 26 | 27 | ## Training 28 | >**预处理验证码图片** 29 | >>目的:原始验证码图片存在背景纹路等干扰,通过PIL库的ImageEnhance模块进行预处理,突出文本\ 30 | >>下面代码介绍了一种参数配置方法,可以很有效突出识别文本特征 31 | >>```python 32 | >>from PIL import Image 33 | >>from PIL import ImageEnhance 34 | >>img = Image.open('exe_file/11/code1.png') 35 | >>print(img) 36 | >>img= img.convert('RGB') 37 | >># 颜色调到最暗 38 | >>enhancer = ImageEnhance.Color(img) 39 | >>enhancer = enhancer.enhance(0) 40 | >># 增加亮度 41 | >>enhancer = ImageEnhance.Brightness(enhancer) 42 | >>enhancer = enhancer.enhance(4) 43 | >># 增加对比度 44 | >>enhancer = ImageEnhance.Contrast(enhancer) 45 | >>enhancer = enhancer.enhance(15) 46 | >># 增加图片锐度 47 | >>enhancer = ImageEnhance.Sharpness(enhancer) 48 | >>img = enhancer.enhance(25) 49 | >># img.show() 50 | >># 转成灰度图片 51 | >>img = img.convert('L') 52 | >># 
img.show() 53 | >>#二值化处理 54 | >>threshold = 140 55 | >>table=[] 56 | >>for i in range(256): 57 | >> if i < threshold: 58 | >> table.append(0) 59 | >> else: 60 | >> table.append(1) 61 | >>out = img.point(table,'1') 62 | >>out.show() 63 | >>``` 64 | >**第一次训练:建立单字符字库** 65 | >>对于该网站,验证码只存在0-9A-Z 36个字符,单独提取36个字符的图片,采用默认字库进行训练,利用jTessBoxEditor进行矫正后生成新字库\ 66 | >>![sigleChar](0-9A-Z.png) 67 | 68 | >**第二次训练:扩充字库** 69 | >>python脚本获取识别错误的验证码(脚本最后会给出传送门)\ 70 | >>利用jTessBoxEditor将错误识别的验证码集成tif文件与第一次字库的训练样本(tif文件)合并\ 71 | >>![combine](combine.png)\ 72 | >>利用第一次训练的字库将tif文件进行测试,生成box文件,进行人工纠错\ 73 | >>**操作** 74 | >>>将文件路径改为tif文件所在目录\ 75 | >>>tif文件命名规则 [语言名].[font].[exp0].tif\ 76 | >>>gu是第一轮训练的字库名 77 | >>```commandline 78 | >>cd C:\Users\crayon\OneDrive\Pycode\Crawler\crawler_Basic\exe_file\11\gushiwen_code 79 | >> 80 | >>tesseract gu.font.exp0.tif gu.font.exp0 -l gu --psm 7 batch.nochop makebox 81 | >>``` 82 | >>更新字库 83 | >>```commandlinecd 84 | >>C:\Users\crayon\OneDrive\Pycode\Crawler\crawler_Basic\exe_file\11\gushiwen_code 85 | >>echo font 0 0 0 0 0>font_properties 86 | >> 87 | >>echo Run Tesseract for Training.. 88 | >>tesseract.exe --psm 10 gu.font.exp0.tif gu.font.exp0 nobatch box.train 89 | >> 90 | >>echo Compute the Character Set.. 91 | >>unicharset_extractor.exe gu.font.exp0.box 92 | >>mftraining -F font_properties -U unicharset -O gu.unicharset gu.font.exp0.tr 93 | >> 94 | >>echo Clustering.. 95 | >>cntraining.exe gu.font.exp0.tr 96 | >> 97 | >>echo Rename Files.. 98 | >>rename normproto gu.normproto 99 | >>rename inttemp gu.inttemp 100 | >>rename pffmtable gu.pffmtable 101 | >>rename shapetable gu.shapetable 102 | >> 103 | >>echo Create Tessdata.. 104 | >>combine_tessdata.exe gu. 105 | >>``` 106 | 107 | >第n次训练 108 | >>同第二次训练步骤\ 109 | >>遇到的问题:Empty Page 一般是没有指定训练模式造成的,通过设置psm可以解决 110 | 111 | ## references 112 | >[python+tesseract 训练和破解验证码](https://zhuanlan.zhihu.com/p/40178190) 113 | >>非常详细地介绍了训练字库的步骤,比较有特色的是该作者利用字符色彩不同进行单字符提取,但是这仅对单个字符同色位有效\ 114 | >>对于本文的验证码,这种方法就无效了,还是需要利用截图工具 115 | 116 | >[Tesseract-OCR样本训练方法](https://blog.csdn.net/sylsjane/article/details/83751297) 117 | >>介绍了指令文件运行,写成bat文件后就不用重复劳动 118 | 119 | >[字库合并](https://www.imooc.com/article/32331) 120 | >>本文提出的方法有个缺陷,每次都需要对图片集人工矫正一遍,但实际前一次训练图片集一般是不需要校对的\ 121 | >>这篇文章介绍了box文件合并的技巧,每次只需对新加入的样本进行矫正,从而可以减少训练字库的工作量 122 | 123 | ## Resource 124 | >[python验证码测试脚本](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/11.2jTessBoxEditor-tesseract.py) 125 | >>[功能1]图片预处理,突出文本\ 126 | >>[功能2]验证码测试,利用训练的字库测试验证码,将无法正确识别的验证码保存 127 | 128 | >[训练字库](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/exe_file/11/gushiwen_code/gu.traineddata) 129 | >>该字库在第三轮训练中产生,可以达到90%以上的准确率 130 | 131 | >训练命令行指令文件 132 | >>[生成box文件](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/exe_file/11/gushiwen_code/train_toBox.bat)\ 133 | >>[生成字库](https://github.com/ZhuoZhuoCrayon/pythonCrawler/blob/master/exe_file/11/gushiwen_code/gu.bat) 134 | 135 | >[训练文件集合](https://github.com/ZhuoZhuoCrayon/pythonCrawler/tree/master/exe_file/11/gushiwen_code) 136 | >>保存了每次训练的图片集合、tif、box、训练字库等文件 137 | -------------------------------------------------------------------------------- /tesseract训练模型/combine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/tesseract训练模型/combine.png -------------------------------------------------------------------------------- /tesseract训练模型/oriCode.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/ZhuoZhuoCrayon/pythonCrawler/dd1ab2670e2f7e05cde7fe8db183f493c7987495/tesseract训练模型/oriCode.png -------------------------------------------------------------------------------- /zhilianCrawler.py: -------------------------------------------------------------------------------- 1 | import urllib.parse 2 | import urllib.request 3 | import re 4 | import csv 5 | import time 6 | import json 7 | import gzip 8 | from io import StringIO 9 | from bs4 import BeautifulSoup 10 | 11 | class crawler: 12 | date = '20190320' 13 | 14 | # 城市编码 15 | cityIds = { 16 | '北京': '530', 17 | '上海': '538', 18 | '广州': '763', 19 | '深圳': '765', 20 | '杭州': '653', 21 | '天津': '531', 22 | '武汉': '736', 23 | '重庆': '551', 24 | '苏州': '639', 25 | '南京': '635', 26 | '长沙': '749', 27 | } 28 | UA = [ 29 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 30 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 31 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 32 | "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1" 33 | ] 34 | # 随机构造头部信息 35 | headers = { 36 | "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36', 37 | 'Host': 'fe-api.zhaopin.com', 38 | 'Upgrade-Insecure-Requests': '1', 39 | 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 40 | 'Accept-Language': 'zh-CN,zh;q =0.9', 41 | 'Cache-Control': 'max-age=0', 42 | 'Connection': 'keep-alive' 43 | } 44 | 45 | zhilian_url = 'https://fe-api.zhaopin.com/c/i/sou?' 
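    # Search API endpoint: it returns JSON and is paginated through the start/pageSize
    # parameters assembled in handle_request(). Responses may arrive gzip-compressed;
    # note that GzipFile needs a bytes buffer (io.BytesIO) in Python 3, while
    # getPosition() and run() below wrap content.read() in StringIO.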
46 | start_page = 1 47 | end_page = 100 48 | pos_infor = { 49 | 'local':'NULL', 50 | 'name' :'NULL', 51 | 'size':'NULL', 52 | 'type' :'NULL', 53 | 'position' :'NULL', 54 | 'education':'NULL', 55 | 'experience' :'NULL', 56 | 'need' :'NULL', 57 | 'salary' :'NULL', 58 | 'welfare' :'NULL', 59 | 'num' :'NULL', 60 | 'workplace' :'NULL', 61 | 'companyPage' :'NULL', 62 | } 63 | 64 | def __init__(self,city,keyword): 65 | self.kw = keyword 66 | self.cityId = self.cityIds[city] 67 | csv_url = 'csvfile//智联招聘-'+city+'-'+keyword+'-'+self.date+'.csv' 68 | self.fp = open(csv_url,'wt',newline='',encoding='utf-8-sig') 69 | self.writer = csv.writer(self.fp) 70 | self.writer.writerow(('地区', '企业名称', '企业规模', '企业类别', '招聘岗位', '学历', '工作经验', '工作职责/要求', '薪酬', '福利', '招聘人数', '工作地点', '公司主页')) 71 | 72 | def textDecoration(self,text): 73 | delspace = re.compile(r'\s+') 74 | text = delspace.sub('',text) 75 | set_newline = re.compile(r'[;|:|。|!]') 76 | text = set_newline.sub('\n',text) 77 | text = text.replace("展开",'').strip() 78 | return text 79 | 80 | def getPosition(self,url): 81 | request = urllib.request.Request(url=url,headers=self.headers) 82 | content = urllib.request.urlopen(request) 83 | if content.info().get('Content-Encoding') == 'gzip': 84 | buf = StringIO(content.read()) 85 | f = gzip.GzipFile(fileobj=buf) 86 | content = f.read() 87 | else: 88 | content = content.read() 89 | soup = BeautifulSoup(content,'lxml') 90 | main = soup.find('div',class_ = 'main') 91 | 92 | 93 | lis = main.find('div',class_ = 'main1 cl main1-stat').find_all('li') 94 | 95 | self.pos_infor['salary'] = lis[0].strong.text; 96 | self.pos_infor['position'] = lis[0].h1.text; 97 | 98 | self.pos_infor['name'] = lis[1].find('div',class_ = 'company l').a.text; 99 | _companyPage = lis[1].find('div',class_ = 'company l').a.attrs['href'] 100 | spans = lis[1].find('div',class_ = 'info-three l').find_all('span') 101 | 102 | self.pos_infor['local'] = spans[0].a.text 103 | self.pos_infor['experience'] = spans[1].text 104 | self.pos_infor['education'] = spans[2].text 105 | self.pos_infor['num'] = spans[3].text[1:-1] 106 | 107 | # 福利信息异步加载,改为在json中提取 108 | """ 109 | # pos_info_in = main.find('div',class_ = 'l pos-info-in') 110 | # print(pos_info_in) 111 | welfareSpans = main.find('div',class_ = 'l pos-info-in').find_all('div',class_ = 'pos-info-tit') 112 | print(welfareSpans) 113 | # self.welfare = '' 114 | for span in welfareSpans: 115 | print(span.text) 116 | # self.welfare += str(span.text())+'\n' 117 | """ 118 | 119 | companyAttrs = main.find('ul',class_ = 'promulgator-ul cl') 120 | lis1 = companyAttrs.find_all('li') 121 | 122 | self.pos_infor['type'] = lis1[1].strong.text 123 | self.pos_infor['size'] = lis1[2].strong.text 124 | self.pos_infor['companyPage'] = lis1[3].strong.a['href'] 125 | if self.pos_infor['companyPage'] == '' or self.pos_infor['companyPage'] =='NULL': 126 | self.pos_infor['companyPage'] = _companyPage 127 | 128 | self.pos_infor['workplace'] = lis1[4].strong.text 129 | self.pos_infor['need'] = self.textDecoration(main.find('div',class_ = 'responsibility pos-common').get_text()) 130 | 131 | self.writer.writerow((self.pos_infor['local'],self.pos_infor['name'],self.pos_infor['size'],self.pos_infor['type'],self.pos_infor['position'] , 132 | self.pos_infor['education'],self.pos_infor['experience'],self.pos_infor['need'] ,self.pos_infor['salary'], 133 | self.pos_infor['welfare'],self.pos_infor['num'],self.pos_infor['workplace'],self.pos_infor['companyPage'])) 134 | """ 135 | 
print(self.pos_infor['local'],self.pos_infor['name'],self.pos_infor['size'],self.pos_infor['type'],self.pos_infor['position'] , 136 | self.pos_infor['education'],self.pos_infor['experience'],self.pos_infor['need'] ,self.pos_infor['salary'], 137 | self.pos_infor['welfare'],self.pos_infor['num'],self.pos_infor['workplace'],self.pos_infor['companyPage']) 138 | """ 139 | 140 | def handle_request(self,page): 141 | data = { 142 | 'start':90*(page-1), 143 | 'pageSize':'90', 144 | 'cityId':self.cityId, 145 | 'workExperience':'-1', 146 | 'education':'-1', 147 | 'companyType':'-1', 148 | 'employmentType':'-1', 149 | 'jobWelfareTag':'-1', 150 | 'kw': self.kw, 151 | 'kt':'3', 152 | '_v':'0.70987222', 153 | 'x-zp-page-request-id':'5c93296b093c49febba0d63d812d38d6-1553071553649-676137', 154 | } 155 | 156 | url = self.zhilian_url + urllib.parse.urlencode(data) 157 | print(url) 158 | request = urllib.request.Request(url = url, headers = self.headers) 159 | return request 160 | 161 | # requests.get(url, headers=headers) 162 | 163 | def parse_content(self, content): 164 | selector = json.loads(content) 165 | # print(selector) 166 | data = selector['data']['results'] 167 | 168 | if len(data) == 0: 169 | return 'crawler all' 170 | 171 | for position in data: 172 | # print(position['positionURL']) 173 | self.pos_infor['welfare'] ='' 174 | for _welfare in position['welfare']: 175 | self.pos_infor['welfare'] += _welfare + '\n' 176 | try: 177 | self.getPosition(position['positionURL']) 178 | except: 179 | pass 180 | time.sleep(0.1) 181 | 182 | 183 | return 'next page' 184 | 185 | def run(self): 186 | for page in range(self.start_page,self.end_page+1): 187 | 188 | request = self.handle_request(page) 189 | content = urllib.request.urlopen(request) 190 | 191 | """ 192 | html = content.read() 193 | print(html) 194 | buff = BytesIO(html) 195 | f = gzip.GzipFile(fileobj=buff) 196 | content = f.read().decode() 197 | print(content) 198 | """ 199 | if content.info().get('Content-Encoding')=='gzip': 200 | buf = StringIO(content.read()) 201 | f = gzip.GzipFile(fileobj=buf) 202 | content = f.read() 203 | else: 204 | content = content.read() 205 | 206 | status = self.parse_content(content) 207 | if status == 'crawler all': 208 | print('crawler all') 209 | break 210 | else: 211 | print('crawler end in Page.'+str(page)) 212 | time.sleep(0.5) 213 | 214 | import threading 215 | class crawlerThread(threading.Thread): 216 | 217 | def __init__(self,name,city,keyword): 218 | threading.Thread.__init__(self) 219 | self.name = name 220 | self.city = city 221 | self.keyword = keyword 222 | def run(self): 223 | print(self.name) 224 | test = crawler(self.city,self.keyword) 225 | test.run() 226 | print(self.name+"-----------------get all now!") 227 | 228 | 229 | 230 | if __name__ == '__main__': 231 | cities = [ 232 | '北京', 233 | '上海', 234 | '广州', 235 | '深圳', 236 | '杭州', 237 | '天津', 238 | '武汉', 239 | '重庆', 240 | '苏州', 241 | '南京', 242 | '长沙',] 243 | positions_IT = ['Java开发', 244 | 'UI设计师', 245 | 'Web前端', 246 | 'PHP', 247 | 'Python', 248 | 'Android', 249 | '深度学习', 250 | '算法工程师', 251 | 'hadoop', 252 | 'Node.js', 253 | '数据开发', 254 | '数据分析师', 255 | '数据架构', 256 | '人工智能' 257 | '区块链' 258 | ] 259 | positions_Finance = [ 260 | '投资经理', 261 | '风控', 262 | '催收', 263 | '银行柜员', 264 | '银行销售', 265 | '信审', 266 | '信用卡', 267 | '贷款', 268 | '金融产品', 269 | '汽车金融', 270 | '金融研究', 271 | '证券交易员', 272 | '投资经理', 273 | '期货', 274 | '操盘手', 275 | '基金', 276 | '股票', 277 | '投资顾问', 278 | '信托', 279 | '典当', 280 | '担保', 281 | '信贷', 282 | '权证', 283 | '保险', 284 | '理赔', 285 | '精算师', 
286 | '理财', 287 | '顾问', 288 | '查勘定损', 289 | '车险' 290 | ] 291 | 292 | 293 | for city in cities: 294 | threads = [] 295 | for position in positions_Finance: 296 | thread = crawlerThread(city+'-'+position,city,position) 297 | # thread.start() 298 | # thread.join() 299 | threads.append(thread) 300 | for thread in threads: 301 | thread.start() 302 | for thread in threads: 303 | thread.join() 304 | 305 | --------------------------------------------------------------------------------