├── simplescreenrecorder-2020-05-04_23.54.22.mkv
├── 有声阅读(百度接口).py
├── CSDN爬虫.py
├── 百度贴吧爬虫.py
├── 表格删除操作.html
├── 抖音爬虫.py
└── tencentcomic
/simplescreenrecorder-2020-05-04_23.54.22.mkv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jjjjmhao/Sprider/HEAD/simplescreenrecorder-2020-05-04_23.54.22.mkv
--------------------------------------------------------------------------------
/有声阅读(百度接口).py:
--------------------------------------------------------------------------------
import re
from aip import AipSpeech

app_id = '14975947'
api_key = 'X9f3qewZCohppMHxlunznUbi'
secret_key = 'LupWgIIFzZ9kTVNZSH5G0guNGZIqqTom'

client = AipSpeech(app_id, api_key, secret_key)

with open('read.txt', 'r') as a:
    text = a.readlines()

for cut in text:
    # Split the line into 1000-character chunks
    text_cut = re.findall('.{1000}', cut)
    # Keep the trailing remainder that is shorter than 1000 characters
    text_cut.append(cut[(len(text_cut) * 1000):])
    # Join the chunks with "---" as a separator
    text_final = '---'.join(text_cut)
    # Count how many "---" separators the text contains
    times = text_final.count('---')
    for n in range(0, times + 1):
        name = text_final.split('---')[n]
        result = client.synthesis(name, 'zh', '1',
                                  {"vol": 9,   # volume
                                   "spd": 4,   # speed
                                   "pit": 9,   # pitch
                                   "per": 3,   # voice
                                   })
        # synthesis() returns the MP3 bytes on success and an error dict on failure
        if not isinstance(result, dict):
            with open('test/' + str(n + 1) + '.mp3', "wb") as d:
                print('Generating speech segment ' + str(n + 1) + '......')
                d.write(result)
--------------------------------------------------------------------------------
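
A side note on the chunking in 有声阅读(百度接口).py above: the findall-plus-append step splits each line into 1000-character pieces plus a shorter remainder. For the single-line strings read from read.txt this is roughly equivalent to plain slicing, sketched below with a hypothetical helper name chunk_text:

def chunk_text(cut, size=1000):
    # Slice the string into consecutive pieces of at most `size` characters
    return [cut[i:i + size] for i in range(0, len(cut), size)]

# chunk_text('x' * 2500) -> three pieces of lengths 1000, 1000 and 500
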
/CSDN爬虫.py:
--------------------------------------------------------------------------------
import requests
from lxml import etree
from selenium import webdriver
import time

path = r'/home/jmhao/chromedriver'
driver = webdriver.Chrome(path)
# A plain request cannot retrieve the article links, so use a simulated browser instead
driver.get(url='https://blog.csdn.net/tuoshao123')
time.sleep(2)
# Save the page source into the variable response
response = driver.page_source
driver.quit()
html = etree.HTML(response)
# Extract all article URLs
tuoeg_urls = html.xpath("//div[@class='article-item-box csdn-tracking-statistics']/h4/a/@href")
# Iterate over the extracted list
for i in range(0, len(tuoeg_urls)):
    article = requests.get(tuoeg_urls[i]).text
    html_article = etree.HTML(article)
    # Extract the article title
    tuo_title = '\n'.join(html_article.xpath("//h1[@class='title-article']/text()"))
    # Extract the article body text
    tuo_article = html_article.xpath("string(//div[@id='content_views'])")
    # Extract the image links in the article
    tuo_src = '\n'.join(html_article.xpath("//div[@id='content_views']/p/img/@src"))
    with open('tuoge/' + tuo_title + '.txt', 'w') as f:
        print('Downloading ' + tuo_title + ' ......')
        f.write(tuo_article)
        f.write(tuo_src)
--------------------------------------------------------------------------------
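
Not part of CSDN爬虫.py itself: article titles can contain '/' or other characters that are not valid in file names, which would make the open('tuoge/' + tuo_title + '.txt', 'w') call fail. A minimal sanitizer sketch (the helper name safe_filename is hypothetical):

import re

def safe_filename(title):
    # Replace characters that are illegal in file names with underscores
    return re.sub(r'[\\/:*?"<>|\n]', '_', title).strip()

# Usage inside the download loop (illustrative):
# with open('tuoge/' + safe_filename(tuo_title) + '.txt', 'w') as f:
#     ...
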
/百度贴吧爬虫.py:
--------------------------------------------------------------------------------
import requests
from lxml import etree
import re
import os

class TieBa:
    def __init__(self, a):
        self.base_url = "https://tieba.baidu.com/f"
        self.para = {'kw': a}

    def send_request(self, url, para={}):
        response = requests.get(url, params=para)
        return response.content

    def write_file(self, data, name):
        print(name)
        image_path = 'img/' + name
        with open(image_path, 'wb') as f:
            f.write(data)

    def parse_data(self, data, rule):
        element = etree.HTML(data)
        result = element.xpath(rule)
        return result

    # def change_title(title):
    #     pattern = re.compile(r"[\/\\\:\*\?\"\<\>\|]")
    #     new_title = re.sub(pattern,"_",title)
    #     return new_title

    def run(self):
        # Fetch the forum's list page
        list_data = self.send_request(self.base_url, self.para)
        # Links to each thread's detail page
        detail_rule = '//div[@class="t_con cleafix"]/div/div/div/a/@href'
        detail_url_list = self.parse_data(list_data, detail_rule)
        for detail in detail_url_list:
            detail_url = "https://tieba.baidu.com/" + detail
            detail_data = self.send_request(detail_url)
            # Image URLs inside the thread
            image_rule = '//img[@class="BDE_Image"]/@src'
            image_url_list = self.parse_data(detail_data, image_rule)

            for image_url in image_url_list:
                image_data = self.send_request(image_url)
                image_name = image_url[-15:]
                self.write_file(image_data, image_name)


if __name__ == '__main__':
    a = input('Enter the name of the tieba forum: ')
    tieba = TieBa(a)
    tieba.run()
--------------------------------------------------------------------------------
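
Two small robustness tweaks that may help 百度贴吧爬虫.py, sketched as assumptions rather than part of the original: the img/ directory has to exist before write_file() can save into it, and slicing image_url[-15:] can yield colliding or extension-less names, so the file name could instead be taken from the URL path:

import os
from urllib.parse import urlparse

os.makedirs('img', exist_ok=True)  # make sure the output directory exists

def name_from_url(image_url):
    # '.../forum/pic/item/abc123.jpg' -> 'abc123.jpg'
    return os.path.basename(urlparse(image_url).path)
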
/表格删除操作.html:
--------------------------------------------------------------------------------
<!-- Only fragments of this file survived the repository flattening: the HTML tags and the
     page's scripts are missing. What remains is a 购物车 (shopping cart) page with a table
     whose header row is: (empty cell) | 商品名称 (product name) | 数量 (quantity) |
     价格 (price) | 修改 (modify). -->
--------------------------------------------------------------------------------
/抖音爬虫.py:
--------------------------------------------------------------------------------
from selenium import webdriver
from lxml import etree
from selenium.webdriver.chrome.options import Options
import requests
import json
import time

class Douyin:
    def page_num(self, max_cursor):
        # The random parameter at the end of the URL (I really could not work out its pattern)
        # Set up headless Chrome
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        # chromedriver location
        path = r'/home/jmhao/chromedriver'
        # Random signature field
        random_field = ''
        # Main part of the URL
        url = '' + str(max_cursor) + '&aid=1128&_signature=' + random_field
        # Request headers
        headers = {
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36',
        }
        response = requests.get(url, headers=headers).text
        # Parse the JSON data
        resp = json.loads(response)
        # Extract max_cursor for the next page
        max_cursor = resp['max_cursor']
        # Iterate over the videos on this page
        for data in resp["aweme_list"]:
            # Video id
            video_id = data['aweme_id']
            # Video description
            video_title = data['desc']
            # Template for the video's share-page URL
            video_url = 'https://www.iesdouyin.com/share/video/{}/?mid=1'
            # Fill in the id
            video_douyin = video_url.format(video_id)
            driver = webdriver.Chrome(executable_path=path, options=chrome_options)
            # Open the video page
            driver.get(video_douyin)
            # Click the play button
            driver.find_element_by_class_name('play-btn').click()
            time.sleep(2)
            # Save the page source into a variable
            information = driver.page_source
            # Close the browser
            driver.quit()
            html = etree.HTML(information)
            # Extract the video address
            video_adress = html.xpath("//video[@class='player']/@src")
            for i in video_adress:
                # Request the video itself
                video = requests.get(i, headers=headers).content
                with open('douyin/' + video_title, 'wb') as f:
                    print('Downloading:', video_title)
                    f.write(video)

        # Decide whether to stop building new page URLs
        if max_cursor == 0:
            return 1
        else:
            self.page_num(max_cursor)
            return url

if __name__ == '__main__':
    douyin = Douyin()
    douyin.page_num(max_cursor=0)
--------------------------------------------------------------------------------
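
Not in the original 抖音爬虫.py: requests.get(i, headers=headers).content holds the whole video in memory before anything is written. A streamed variant of the final download step, assuming the files are MP4s and that video_title is safe to use as a file name:

import requests

def save_video(src_url, title, headers):
    # Stream the response so the video is written to disk in chunks
    with requests.get(src_url, headers=headers, stream=True) as r:
        r.raise_for_status()
        with open('douyin/' + title + '.mp4', 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
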
/tencentcomic:
--------------------------------------------------------------------------------
import requests
from lxml import etree
from selenium import webdriver
from time import sleep
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
import os

# Open the Tencent Comics home page
url = 'https://ac.qq.com/'
# Request the page
data = requests.get(url).text
# Convert the page into a type that xpath can work with
html = etree.HTML(data)
# Extract the catalogue-page address of every comic
comic_list = html.xpath('//a[@class="in-rank-name"]/@href')
#print(comic_list)
# Iterate over the extracted links
for comic in comic_list:
    # Build the full URL of the comic's catalogue page
    comic_url = url + str(comic)
    # Request the catalogue page
    url_data = requests.get(comic_url).text
    # Prepare to extract information with xpath
    data_comic = etree.HTML(url_data)
    # Extract the comic's name -- text() extracts the text content
    name_comic = data_comic.xpath("//h2[@class='works-intro-title ui-left']/strong/text()")
    # Extract the address of every chapter of this comic
    item_list = data_comic.xpath("//span[@class='works-chapter-item']/a/@href")
    #print(name_comic)
    #print(item_list)
    # Create a folder named after the comic
    os.makedirs('comic/' + str(name_comic), exist_ok=True)
    # Iterate over every chapter address of this comic
    for item in item_list:
        # Build the full URL of the chapter
        item_url = url + str(item)
        #print(item_url)
        # Request the chapter page
        page_mes = requests.get(item_url).text
        # Prepare to extract information with xpath
        page_ming = etree.HTML(page_mes)
        # Extract the chapter name
        page_name = page_ming.xpath('//span[@class="title-comicHeading"]/text()')
        #print(page_name)
        # Create a sub-folder named after the chapter
        os.makedirs('comic/' + str(name_comic) + '/' + str(page_name), exist_ok=True)

        # The main part of the code starts here

        # Set up headless Chrome
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        # webdriver location
        path = r'/home/jmhao/chromedriver'
        # Create the browser with the options above
        browser = webdriver.Chrome(executable_path=path, options=chrome_options)
        # Open the chapter page
        browser.get(item_url)
        # Short delay so the page can settle
        sleep(2)
        #browser.get_screenshot_as_file(str(page_name) + ".png")
        # Try to run the code below
        try:
            # Scroll the page down automatically
            for i in range(1, 100):
                # How far to scroll on each step
                js = 'var q=document.getElementById("mainView").scrollTop = ' + str(i * 1000)
                # Execute the scroll
                browser.execute_script(js)
                # Delay so the images have time to load
                sleep(2)
            sleep(2)
            # Save a screenshot to prove the headless browser really opened the page
            browser.get_screenshot_as_file(str(page_name) + ".png")
            # Get the source of the fully scrolled page
            data = browser.page_source
            # Create an html file in the current folder and write the page source into it
            fh = open("dongman.html", "w", encoding="utf-8")
            # Write the source
            fh.write(data)
            # Close the file
            fh.close()

            # The code below reopens the saved html file, extracts the image links and saves the images

            # Open the local file with BeautifulSoup
            html_new = BeautifulSoup(open('dongman.html', encoding='utf-8'), features='html.parser')
            # Extract the main body of the html file
            soup = html_new.find(id="mainView")
            # Counter i is used to name the saved images
            i = 0
            # Extract the img tags in the main body (the image addresses live in img tags)
            for items in soup.find_all("img"):
                # Extract the image address
                item = items.get("src")
                # Request the image
                comic_pic = requests.get(item).content
                #print(comic_pic)
                # Try to save the image; skip it on error
                try:
                    # Open the target folder and store the image
                    with open('comic/' + str(name_comic) + '/' + str(page_name) + '/' + str(i + 1) + '.jpg', 'wb') as f:
                        #print('Downloading image ', (i + 1))
                        print('Downloading', str(name_comic), '-', str(page_name), '- image', (i + 1))
                        # Write the image
                        f.write(comic_pic)
                    # Bump the counter so the next image does not overwrite this one
                    i += 1
                # If the code above raises, run this instead
                except Exception as err:
                    # Skip the failing image
                    pass
        # If the code above raises (most likely a paid comic), run this instead
        except Exception as err:
            # Skip this chapter
            pass
--------------------------------------------------------------------------------
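
A quirk in tencentcomic worth noting: name_comic and page_name are xpath result lists, so str(name_comic) yields folder names like ['某某漫画']. If plain folder names are preferred, a helper along these lines could be dropped in (first_text is a hypothetical name, not part of the original script):

import re

def first_text(xpath_result, default='unknown'):
    # Take the first xpath match, fall back to a default when nothing matched,
    # and replace characters that are unsafe in file or folder names
    text = xpath_result[0].strip() if xpath_result else default
    return re.sub(r'[\\/:*?"<>|]', '_', text)

# Usage (illustrative):
# os.makedirs('comic/' + first_text(name_comic), exist_ok=True)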