├── simplescreenrecorder-2020-05-04_23.54.22.mkv
├── 有声阅读(百度接口).py
├── CSDN爬虫.py
├── 百度贴吧爬虫.py
├── 表格删除操作.html
├── 抖音爬虫.py
└── tencentcomic

/simplescreenrecorder-2020-05-04_23.54.22.mkv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jjjjmhao/Sprider/HEAD/simplescreenrecorder-2020-05-04_23.54.22.mkv
--------------------------------------------------------------------------------

/有声阅读(百度接口).py:
--------------------------------------------------------------------------------
import re
from aip import AipSpeech

app_id = '14975947'
api_key = 'X9f3qewZCohppMHxlunznUbi'
secret_key = 'LupWgIIFzZ9kTVNZSH5G0guNGZIqqTom'

client = AipSpeech(app_id, api_key, secret_key)

with open('read.txt', 'r') as a:
    text = a.readlines()

for cut in text:
    # Split the line into 1000-character chunks
    text_cut = re.findall('.{1000}', cut)
    # Append whatever is left after the last full chunk
    text_cut.append(cut[(len(text_cut) * 1000):])
    # Join the chunks with "---" as a separator
    text_final = '---'.join(text_cut)
    # Count how many "---" markers the text now contains
    times = text_final.count('---')
    for n in range(0, times + 1):
        name = text_final.split('---')[n]
        result = client.synthesis(name, 'zh', '1',
                                  {"vol": 9,
                                   "spd": 4,
                                   "pit": 9,
                                   "per": 3,
                                   })

        # Note: the 'test' directory must already exist, and the counter restarts
        # at 1 for every line of read.txt, so later lines overwrite earlier files
        with open('test/' + str(n + 1) + '.mp3', "wb") as d:
            print('正在生成第' + str(n + 1) + '段语音......')
            d.write(result)
--------------------------------------------------------------------------------
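One caveat with the synthesis loop above: as far as I know, the baidu-aip SDK returns raw MP3 bytes on success but an error dict on failure, so writing the result without checking can leave corrupt .mp3 files. A minimal, hedged sketch of a chunk-and-synthesize helper with that check (the helper name and chunk size are illustrative; the voice options mirror the script above):

import os
from aip import AipSpeech

def synthesize_chunks(client: AipSpeech, text: str, out_dir: str = 'test', chunk_size: int = 1000):
    """Split text into fixed-size chunks and save one MP3 per chunk."""
    os.makedirs(out_dir, exist_ok=True)
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    for n, chunk in enumerate(chunks, start=1):
        result = client.synthesis(chunk, 'zh', 1, {"vol": 9, "spd": 4, "pit": 9, "per": 3})
        # On failure the SDK is assumed to return a dict such as {'err_no': ..., 'err_msg': ...}
        if isinstance(result, dict):
            print('合成失败:', result)
            continue
        with open('{}/{}.mp3'.format(out_dir, n), 'wb') as f:
            f.write(result)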
/CSDN爬虫.py:
--------------------------------------------------------------------------------
import requests
from lxml import etree
from selenium import webdriver
import time

path = r'/home/jmhao/chromedriver'
driver = webdriver.Chrome(path)
# A plain requests fetch cannot see the article links, so an automated browser is used instead
driver.get(url='https://blog.csdn.net/tuoshao123')
time.sleep(2)
# Save the rendered page source into the variable response
response = driver.page_source
driver.quit()
html = etree.HTML(response)
# Extract the URL of every article on the page
tuoeg_urls = html.xpath("//div[@class='article-item-box csdn-tracking-statistics']/h4/a/@href")
# Walk through the list of article URLs
for i in range(0, len(tuoeg_urls)):
    article = requests.get(tuoeg_urls[i]).text
    html_article = etree.HTML(article)
    # Extract the article title
    tuo_title = '\n'.join(html_article.xpath("//h1[@class='title-article']/text()"))
    # Extract the article body text
    tuo_article = html_article.xpath("string(//div[@id='content_views'])")
    # Extract the image links inside the article
    tuo_src = '\n'.join(html_article.xpath("//div[@id='content_views']/p/img/@src"))
    # Note: the 'tuoge' directory must already exist, and a title containing '/' will break the path
    with open('tuoge/' + tuo_title + '.txt', 'w') as f:
        print('正在下载 ' + tuo_title + ' ......')
        f.write(tuo_article)
        f.write(tuo_src)
--------------------------------------------------------------------------------

/百度贴吧爬虫.py:
--------------------------------------------------------------------------------
import requests
from lxml import etree
import re
import os

class TieBa:
    def __init__(self, a):
        self.base_url = "https://tieba.baidu.com/f"
        self.para = {'kw': a}
        # Make sure the output directory for images exists
        os.makedirs('img', exist_ok=True)

    def send_request(self, url, para={}):
        response = requests.get(url, params=para)
        return response.content

    def write_file(self, data, name):
        print(name)
        # Write the image bytes to img/<name>
        with open('img/' + name, 'wb') as f:
            f.write(data)

    def parse_data(self, data, rule):
        element = etree.HTML(data)
        result = element.xpath(rule)
        return result

    # def change_title(title):
    #     pattern = re.compile(r"[\/\\\:\*\?\"\<\>\|]")
    #     new_title = re.sub(pattern, "_", title)
    #     return new_title

    def run(self):
        # Fetch the forum's list page
        list_data = self.send_request(self.base_url, self.para)
        detail_rule = '//div[@class="t_con cleafix"]/div/div/div/a/@href'
        detail_url_list = self.parse_data(list_data, detail_rule)
        for detail in detail_url_list:
            detail_url = "https://tieba.baidu.com/" + detail
            detail_data = self.send_request(detail_url)
            image_rule = '//img[@class="BDE_Image"]/@src'
            image_url_list = self.parse_data(detail_data, image_rule)

            for image_url in image_url_list:
                image_data = self.send_request(image_url)
                # Use the last 15 characters of the URL as the file name
                image_name = image_url[-15:]
                self.write_file(image_data, image_name)


if __name__ == '__main__':
    a = input('请输入吧名:')
    tieba = TieBa(a)
    tieba.run()
--------------------------------------------------------------------------------
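The commented-out change_title above hints at filename sanitisation that the class never actually applies, and image_url[-15:] can still contain characters such as '?' or '/'. A minimal standalone sketch of that idea (the function name is illustrative; the regex is the one from the commented-out code):

import re

def sanitize_name(title: str) -> str:
    """Replace characters that are not allowed in file names with '_'."""
    pattern = re.compile(r'[\/\\\:\*\?\"\<\>\|]')
    return re.sub(pattern, '_', title)

# Example: sanitize_name('a/b:c?.jpg') -> 'a_b_c_.jpg'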
/表格删除操作.html:
--------------------------------------------------------------------------------
(Most of this file's markup was lost during extraction; only the following is recoverable.)
Page title: 购物车 (shopping cart)
Table header row: 商品名称 (product name) | 数量 (quantity) | 价格 (price) | 修改 (edit)
--------------------------------------------------------------------------------
/抖音爬虫.py:
--------------------------------------------------------------------------------
from selenium import webdriver
from lxml import etree
from selenium.webdriver.chrome.options import Options
import requests
import json
import time

class Douyin:
    def page_num(self, max_cursor):
        # Set up a headless Chrome browser
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        # Path to chromedriver
        path = r'/home/jmhao/chromedriver'
        # The random signature parameter at the end of the URL (I could not work out its pattern)
        random_field = ''
        # The main part of the list URL
        url = '' + str(max_cursor) + '&aid=1128&_signature=' + random_field
        # Request headers
        headers = {
            'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36',
        }
        response = requests.get(url, headers=headers).text
        # Parse the response as JSON
        resp = json.loads(response)
        # Pull out max_cursor for the next page
        max_cursor = resp['max_cursor']
        # Walk through the video list
        for data in resp["aweme_list"]:
            # Video id
            video_id = data['aweme_id']
            # Video description
            video_title = data['desc']
            # Template for the share page of a single video
            video_url = 'https://www.iesdouyin.com/share/video/{}/?mid=1'
            # Fill in the video id
            video_douyin = video_url.format(video_id)
            driver = webdriver.Chrome(executable_path=path, options=chrome_options)
            # Open the video page
            driver.get(video_douyin)
            # Click the play button
            driver.find_element_by_class_name('play-btn').click()
            time.sleep(2)
            # Save the page source
            information = driver.page_source
            # Quit the browser
            driver.quit()
            html = etree.HTML(information)
            # Extract the direct video address
            video_adress = html.xpath("//video[@class='player']/@src")
            for i in video_adress:
                # Download the video
                video = requests.get(i, headers=headers).content
                with open('douyin/' + video_title, 'wb') as f:
                    print('正在下载:', video_title)
                    f.write(video)

        # Stop building new URLs once max_cursor comes back as 0
        if max_cursor == 0:
            return 1
        else:
            self.page_num(max_cursor)
            return url

if __name__ == '__main__':
    douyin = Douyin()
    douyin.page_num(max_cursor=0)
--------------------------------------------------------------------------------
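One fragile spot above is the output path: the video description is used verbatim as the file name, with no extension and no character filtering. A hedged sketch of a safer save step (the helper name and extension choice are illustrative, not part of the original script):

import os
import re
import requests

def save_video(src_url: str, title: str, headers: dict, out_dir: str = 'douyin') -> str:
    """Download a video from a direct URL and save it under a sanitized name with an .mp4 extension."""
    os.makedirs(out_dir, exist_ok=True)
    # Replace path-breaking characters and whitespace in the description
    safe_title = re.sub(r'[\\/:*?"<>|\s]+', '_', title) or 'untitled'
    path = os.path.join(out_dir, safe_title + '.mp4')
    resp = requests.get(src_url, headers=headers, stream=True)
    with open(path, 'wb') as f:
        for chunk in resp.iter_content(chunk_size=1024 * 64):
            f.write(chunk)
    return path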
/tencentcomic:
--------------------------------------------------------------------------------
import requests
from lxml import etree
from selenium import webdriver
from time import sleep
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
import os

# Tencent Comics home page
url = 'https://ac.qq.com/'
# Request the home page
data = requests.get(url).text
# Turn the page into something xpath can work with
html = etree.HTML(data)
# Extract the catalogue-page address of every comic in the ranking list
comic_list = html.xpath('//a[@class="in-rank-name"]/@href')
#print(comic_list)
# Walk through the extracted addresses
for comic in comic_list:
    # Build the full URL of the comic's catalogue page
    comic_url = url + str(comic)
    # Fetch the catalogue page
    url_data = requests.get(comic_url).text
    # Prepare to extract information with xpath
    data_comic = etree.HTML(url_data)
    # Extract the comic's name (text() extracts the text content)
    name_comic = data_comic.xpath("//h2[@class='works-intro-title ui-left']/strong/text()")
    # Extract the address of every chapter of this comic
    item_list = data_comic.xpath("//span[@class='works-chapter-item']/a/@href")
    #print(name_comic)
    #print(item_list)
    # Create a folder named after the comic
    os.makedirs('comic/' + str(name_comic))
    # Walk through every chapter address of this comic
    for item in item_list:
        # Build the full URL of the chapter
        item_url = url + str(item)
        #print(item_url)
        # Fetch the chapter page
        page_mes = requests.get(item_url).text
        # Prepare to extract information with xpath
        page_ming = etree.HTML(page_mes)
        # Extract the chapter name
        page_name = page_ming.xpath('//span[@class="title-comicHeading"]/text()')
        #print(page_name)
        # Create a sub-folder named after the chapter
        os.makedirs('comic/' + str(name_comic) + '/' + str(page_name))

        # The main part of the code starts here

        # Set up a headless Chrome browser
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        chrome_options.add_argument('--disable-gpu')
        # Path to chromedriver
        path = r'/home/jmhao/chromedriver'
        # Start the browser with these options
        browser = webdriver.Chrome(executable_path=path, options=chrome_options)
        # Open the chapter page
        browser.get(item_url)
        # Short pause to let the page settle
        sleep(2)
        #browser.get_screenshot_as_file(str(page_name) + ".png")
        # Try to run the block below
        try:
            # Scroll the page down automatically so the images get a chance to load
            for i in range(1, 100):
                # Scroll distance for this step
                js = 'var q=document.getElementById("mainView").scrollTop = ' + str(i * 1000)
                # Execute the scroll
                browser.execute_script(js)
                # Pause so the images have time to load
                sleep(2)
            sleep(2)
            # Save a screenshot as proof that the headless browser really opened the page
            browser.get_screenshot_as_file(str(page_name) + ".png")
            # Grab the current page source
            data = browser.page_source
            # Close the headless browser
            browser.quit()
            # Create an html file in the current folder and write the page source into it
            fh = open("dongman.html", "w", encoding="utf-8")
            # Write the source
            fh.write(data)
            # Close the file
            fh.close()

            # The code below reopens the saved html file, extracts the image links and saves the images

            # Open the local file with BeautifulSoup
            html_new = BeautifulSoup(open('dongman.html', encoding='utf-8'), features='html.parser')
            # Extract the main body of the html file
            soup = html_new.find(id="mainView")
            # Counter used to name the saved images
            i = 0
            # Extract the img tags inside the main body (the image addresses live in img tags)
            for items in soup.find_all("img"):
                # Extract the image address
                item = items.get("src")
                # Request the image
                comic_pic = requests.get(item).content
                #print(comic_pic)
                # Try to save the image; skip it if anything goes wrong
                try:
                    # Open the chapter folder and write the image into it
                    with open('comic/' + str(name_comic) + '/' + str(page_name) + '/' + str(i + 1) + '.jpg', 'wb') as f:
                        #print('正在下载第 ', (i + 1), ' 张图片中')
                        print('正在下载', str(name_comic), '-', str(page_name), '- 第', (i + 1), '张图片')
                        # Write the image bytes
                        f.write(comic_pic)
                    # Bump the counter so the next image does not overwrite this one
                    i += 1
                # If the block above fails, run this part instead
                except Exception as err:
                    # Skip the broken image
                    pass
        # If the block above fails (most likely a paid comic), run this part instead
        except Exception as err:
            # Skip this chapter
            pass
--------------------------------------------------------------------------------
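The scroll loop above is what defeats lazy loading: the reader view appears to only fill in an image's real src once the page has been scrolled near it, which is why the script nudges scrollTop in fixed steps with pauses in between. A hedged, reusable sketch of that idea (the element id and step size are taken from the script; the function name is illustrative):

from time import sleep
from selenium import webdriver

def scroll_until_loaded(browser: webdriver.Chrome, element_id: str = 'mainView',
                        step: int = 1000, max_steps: int = 100, pause: float = 2.0) -> None:
    """Scroll a container element downwards in fixed steps so lazy-loaded images get real src values."""
    for i in range(1, max_steps):
        js = 'document.getElementById("{}").scrollTop = {}'.format(element_id, i * step)
        browser.execute_script(js)
        # Give the newly revealed images time to load before the next step
        sleep(pause)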