├── manhwa_v5.1.py ├── manhwa_v1.0.py ├── manhwa_v2.0.py ├── manhwa_v3.0.py ├── manhwa_v5.0.py ├── manhwa_v4.0.py └── README.md /manhwa_v5.1.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.system(r"python D:\manhua\整站爬取www.manhwa.cc\manhua4.py") 4 | 5 | os.system(r"F:\CloudMusic\是萝莉控真是太好了.mp3") 6 | -------------------------------------------------------------------------------- /manhwa_v1.0.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import re 4 | import os 5 | 6 | #1-1030 7 | for num1 in range(1,1031): 8 | circle = requests.get('https://www.manhwa.cc/book/'+str(num1)) 9 | # 将获取的图片地址依次放入count中 10 | count = [] 11 | # 将获取的网页内容放入BeautifulSoup 12 | soup = BeautifulSoup(circle.text, 'lxml') 13 | # 根据谷歌SelectGadGet这个插件,获取html标签,比如获取:#gallery-list 14 | 15 | for item_book in soup.select('.d_bg_t'): 16 | for book_name in item_book.find_all('a'): 17 | if(book_name.string!='韩国'and book_name.string!='男性'): 18 | book_name_clean=book_name.string 19 | print(num1, book_name_clean) 20 | 21 | os.makedirs('D://manhua//整站爬取www.manhwa.cc//整站漫画爬取//' + str(num1) +'.'+ book_name_clean ) 22 | 23 | #menu_path_num = [] 24 | 25 | for item in soup.select('.d_menu>ul>li'): 26 | # 用bs4中的find_all获取 #gallery-list 中是否存在 img这个标签 27 | for a in item.find_all('a'): 28 | #print('a', a) 29 | # m 是 img标签中存在的属性 30 | menu_path = 'https://www.manhwa.cc/' + a.get('href') 31 | #count.append(menu_path) 32 | #menu_path_num.append(re.findall(r"\d+\.?\d*", menu_path)) 33 | menu_path_num=re.findall(r"\d+\.?\d*", menu_path) 34 | 35 | #当前一部书爬取循环,从上面得到每一章地址后,遍历这么多“章”次 36 | 37 | #for num in menu_path_num: 38 | print('book_url:',menu_path) 39 | circle = requests.get(menu_path) 40 | # 将获取的图片地址依次放入count中 41 | count = [] 42 | # 将获取的网页内容放入BeautifulSoup 43 | soup = BeautifulSoup(circle.text, 'lxml') 44 | # 根据谷歌SelectGadGet这个插件,获取html标签,比如获取:#gallery-list 45 | 46 | 47 
| for title in soup.select('div.fl.r_tab_l'): 48 | for title in title.find_all('span'): 49 | print('title:', title.text) 50 | title=title.text 51 | 52 | for item in soup.select('.r_img'): 53 | # 用bs4中的find_all获取 #gallery-list 中是否存在 img这个标签 54 | for img in item.find_all('img'): 55 | print('img_url:', img) 56 | # m 是 img标签中存在的属性 57 | img_path = img.get('data-original') 58 | count.append(img_path) 59 | # 用enumerate依次取出count中的图片地址 放入v中 60 | os.makedirs('D://manhua//整站爬取www.manhwa.cc//整站漫画爬取//' + book_name_clean + '//' + str(title) + '//') 61 | for i, v in enumerate(count): 62 | # 将获取的v值再次放入request中进行与网站相应 63 | image = requests.get(v) 64 | # 存取图片过程中,出现不能存储 int 类型,故而,我们对他进行类型转换 str()。w:读写方式打开,b:二进制进行读写。图片一般用到的都是二进制。 65 | with open('D://manhua//整站爬取www.manhwa.cc//整站漫画爬取//' + book_name_clean + '//'+ str(title) + '//' +str(i) + '.jpg', 'wb') as file: 66 | #with open('C://Users//50159//Desktop//manhua//test//' + str(num1) + '_' + str(i) + '.jpg', 'wb') as file: 67 | # content:图片转换成二进制,进行保存。 68 | file.write(image.content) 69 | print(i) 70 | 71 | -------------------------------------------------------------------------------- /manhwa_v2.0.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import re 4 | import os 5 | 6 | #1-1030 7 | for num1 in range(1,1030): 8 | #716字符问题无法生成文件夹 9 | 10 | import urllib.request # url包 11 | 12 | def openUrl(circle): 13 | headers = { 14 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36', 15 | 'Host': 'jandan.net' 16 | } 17 | req = urllib.request.Request(circle, headers=headers) 18 | response = urllib.request.urlopen(req) # 请求 19 | html = response.read() # 获取 20 | html = html.decode("utf-8") # 解码 21 | print(html) # 打印 22 | 23 | if __name__ == "__main__": 24 | circle = requests.get('https://www.manhwa.cc/book/' + str(num1)) 25 | 26 | # circle = 
requests.get('https://www.manhwa.cc/book/'+str(num1)) 27 | # 将获取的图片地址依次放入count中 28 | count = [] 29 | # 将获取的网页内容放入BeautifulSoup 30 | soup = BeautifulSoup(circle.text, 'lxml') 31 | # 根据谷歌SelectGadGet这个插件,获取html标签,比如获取:#gallery-list 32 | 33 | for item_book in soup.select('.d_bg_t'): 34 | for book_name in item_book.select('a')[0]: 35 | book_name_clean = book_name.string 36 | print(num1, book_name_clean) 37 | 38 | 39 | #menu_path_num = [] 40 | 41 | for item in soup.select('.d_menu>ul>li'): 42 | # 用bs4中的find_all获取 #gallery-list 中是否存在 img这个标签 43 | for a in item.find_all('a'): 44 | #print('a', a) 45 | # m 是 img标签中存在的属性 46 | menu_path = 'https://www.manhwa.cc/' + a.get('href') 47 | #count.append(menu_path) 48 | #menu_path_num.append(re.findall(r"\d+\.?\d*", menu_path)) 49 | menu_path_num=re.findall(r"\d+\.?\d*", menu_path) 50 | 51 | #当前一部书爬取循环,从上面得到每一章地址后,遍历这么多“章”次 52 | 53 | #for num in menu_path_num: 54 | print('book_url:',menu_path) 55 | 56 | circle = requests.get(menu_path) 57 | # 将获取的图片地址依次放入count中 58 | count = [] 59 | # 将获取的网页内容放入BeautifulSoup 60 | soup = BeautifulSoup(circle.text, 'lxml') 61 | 62 | for title in soup.select('div.fl.r_tab_l'): 63 | for title in title.find_all('span'): 64 | print('title:', title.text) 65 | title=title.text 66 | 67 | for item in soup.select('.r_img'): 68 | # 用bs4中的find_all获取 #gallery-list 中是否存在 img这个标签 69 | for img in item.find_all('img'): 70 | print('img_url:', img) 71 | # m 是 img标签中存在的属性 72 | img_path = img.get('data-original') 73 | count.append(img_path) 74 | 75 | # 用enumerate依次取出count中的图片地址 放入v中 76 | os.makedirs('D://manhua//整站爬取www.manhwa.cc//整站漫画爬取//' + book_name_clean + '//' + str(title) + '//') 77 | for i, v in enumerate(count): 78 | # 将获取的v值再次放入request中进行与网站相应 79 | image = requests.get(v) 80 | # 存取图片过程中,出现不能存储 int 类型,故而,我们对他进行类型转换 str()。w:读写方式打开,b:二进制进行读写。图片一般用到的都是二进制。 81 | with open('D://manhua//整站爬取www.manhwa.cc//整站漫画爬取//' + book_name_clean + '//'+ str(title) + '//' +str(i) + '.jpg', 'wb') as file: 82 | #with 
open('C://Users//50159//Desktop//manhua//test//' + str(num1) + '_' + str(i) + '.jpg', 'wb') as file: 83 | # content:图片转换成二进制,进行保存。 84 | file.write(image.content) 85 | print(i) 86 | -------------------------------------------------------------------------------- /manhwa_v3.0.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import re 4 | import os 5 | 6 | # 1-1030 7 | 8 | from manhua3_ui import A 9 | 10 | A().aa() 11 | 12 | ''' 13 | for num1 in range(first, last): 14 | # 716字符问题无法生成文件夹 15 | 16 | import urllib.request # url包 17 | 18 | def openUrl(circle): 19 | headers = { 20 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36', 21 | 'Host': 'jandan.net' 22 | } 23 | req = urllib.request.Request(circle, headers=headers) 24 | response = urllib.request.urlopen(req) # 请求 25 | html = response.read() # 获取 26 | html = html.decode("utf-8") # 解码 27 | print(html) # 打印 28 | 29 | 30 | if __name__ == "__main__": 31 | circle = requests.get('https://www.manhwa.cc/book/' + str(num1)) 32 | 33 | # circle = requests.get('https://www.manhwa.cc/book/'+str(num1)) 34 | # 将获取的图片地址依次放入count中 35 | count = [] 36 | # 将获取的网页内容放入BeautifulSoup 37 | soup = BeautifulSoup(circle.text, 'lxml') 38 | # 根据谷歌SelectGadGet这个插件,获取html标签,比如获取:#gallery-list 39 | 40 | for item_book in soup.select('.d_bg_t'): 41 | for book_name in item_book.select('a')[0]: 42 | book_name_clean = book_name.string 43 | print(num1, book_name_clean) 44 | 45 | for item in soup.select('.d_menu>ul>li'): 46 | # 用bs4中的find_all获取 #gallery-list 中是否存在 img这个标签 47 | for a in item.find_all('a'): 48 | # print('a', a) 49 | # m 是 img标签中存在的属性 50 | menu_path = 'https://www.manhwa.cc/' + a.get('href') 51 | # count.append(menu_path) 52 | # menu_path_num.append(re.findall(r"\d+\.?\d*", menu_path)) 53 | menu_path_num = re.findall(r"\d+\.?\d*", menu_path) 54 | 55 | # 
当前一部书爬取循环,从上面得到每一章地址后,遍历这么多“章”次 56 | 57 | # for num in menu_path_num: 58 | print('book_url:', menu_path) 59 | 60 | circle = requests.get(menu_path) 61 | # 将获取的图片地址依次放入count中 62 | count = [] 63 | # 将获取的网页内容放入BeautifulSoup 64 | soup = BeautifulSoup(circle.text, 'lxml') 65 | 66 | for title in soup.select('div.fl.r_tab_l'): 67 | for title in title.find_all('span'): 68 | print('title:', title.text) 69 | title = title.text 70 | 71 | for item in soup.select('.r_img'): 72 | # 用bs4中的find_all获取 #gallery-list 中是否存在 img这个标签 73 | for img in item.find_all('img'): 74 | print('img_url:', img) 75 | # m 是 img标签中存在的属性 76 | img_path = img.get('data-original') 77 | count.append(img_path) 78 | 79 | # 用enumerate依次取出count中的图片地址 放入v中 80 | os.makedirs('D://manhua//整站爬取www.manhwa.cc//整站漫画爬取//' + book_name_clean + '//' + str(title) + '//') 81 | for i, v in enumerate(count): 82 | # 将获取的v值再次放入request中进行与网站相应 83 | image = requests.get(v) 84 | # 存取图片过程中,出现不能存储 int 类型,故而,我们对他进行类型转换 str()。w:读写方式打开,b:二进制进行读写。图片一般用到的都是二进制。 85 | with open('D://manhua//整站爬取www.manhwa.cc//整站漫画爬取//' + book_name_clean + '//' + str(title) + '//' + str( 86 | i) + '.jpg', 'wb') as file: 87 | # with open('C://Users//50159//Desktop//manhua//test//' + str(num1) + '_' + str(i) + '.jpg', 'wb') as file: 88 | # content:图片转换成二进制,进行保存。 89 | file.write(image.content) 90 | ''' #print(i) 91 | 92 | -------------------------------------------------------------------------------- /manhwa_v5.0.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import re 4 | import os 5 | 6 | from PyQt5.QtWidgets import QApplication, QWidget, QLineEdit, QInputDialog, QGridLayout, QLabel, QPushButton, QFrame, QProgressBar 7 | 8 | first=16 9 | 10 | class InputDialog(QWidget): 11 | 12 | def __init__(self): 13 | super(InputDialog,self).__init__() 14 | self.initUi() 15 | 16 | def initUi(self): 17 | self.setWindowTitle("漫画爬取") 18 | self.setGeometry(50,50,1200,600) 19 | 20 | 
label1=QLabel("第一本:") 21 | label2=QLabel("最后一本:") 22 | 23 | self.nameLable = QLabel("1")#1 24 | self.first=int(self.nameLable.text()) 25 | self.nameLable.setText(str(self.first)) 26 | self.nameLable.setFrameStyle(QFrame.Panel|QFrame.Sunken) 27 | self.styleLable = QLabel("1")#1030 28 | self.last=self.styleLable.text() 29 | self.styleLable.setText(str(self.last)) 30 | self.styleLable.setFrameStyle(QFrame.Panel|QFrame.Sunken) 31 | 32 | # 设置进度条(弃用) 33 | 34 | nameButton=QPushButton("更改") 35 | nameButton.clicked.connect(self.selectName) 36 | styleButton=QPushButton("更改") 37 | styleButton.clicked.connect(self.selectStyle) 38 | okButton = QPushButton("OK") 39 | okButton.clicked.connect(self.selectOk) 40 | 41 | mainLayout=QGridLayout() 42 | mainLayout.addWidget(label1,0,0) 43 | mainLayout.addWidget(self.nameLable,0,1) 44 | mainLayout.addWidget(nameButton,0,2) 45 | mainLayout.addWidget(label2,1,0) 46 | mainLayout.addWidget(self.styleLable,1,1) 47 | mainLayout.addWidget(styleButton,1,2) 48 | mainLayout.addWidget(okButton,2,1) 49 | 50 | self.setLayout(mainLayout) 51 | 52 | #爬取代码 53 | def ManHua(self): 54 | 55 | for num1 in range(first,1030): 56 | import urllib.request # url包 57 | 58 | def openUrl(circle): 59 | headers = { 60 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36', 61 | 'Host': 'jandan.net' 62 | } 63 | req = urllib.request.Request(circle, headers=headers) 64 | response = urllib.request.urlopen(req) # 请求 65 | html = response.read() # 获取 66 | html = html.decode("utf-8") # 解码 67 | print(html) # 打印 68 | 69 | if __name__ == "__main__": 70 | circle = requests.get('https://www.manhwa.cc/book/' + str(num1)) 71 | 72 | count = [] 73 | soup = BeautifulSoup(circle.text, 'lxml') 74 | 75 | for item_book in soup.select('.d_bg_t'): 76 | for book_name in item_book.select('a')[0]: 77 | book_name_clean = book_name.string 78 | print('') 79 | print("正在下载:",num1, book_name_clean) 80 | aa=0 81 | #print(aa,num1) 
82 | if num1>aa: 83 | aa=num1 84 | #print(aa) 85 | for i in range(int(num1*(100/1030))+1): 86 | print('\r'+'总进度:' + '▇' * (i // 2) + str(i) + '%', end='') 87 | print('') 88 | 89 | for item in soup.select('.d_menu>ul>li'): 90 | for a in item.find_all('a'): 91 | menu_path = 'https://www.manhwa.cc/' + a.get('href') 92 | # count.append(menu_path) 93 | # menu_path_num.append(re.findall(r"\d+\.?\d*", menu_path)) 94 | menu_path_num = re.findall(r"\d+\.?\d*", menu_path) 95 | 96 | # 当前一部书爬取循环,从上面得到每一章地址后,遍历这么多“章”次 97 | 98 | # for num in menu_path_num: 99 | #print('book_url:', menu_path) 100 | 101 | 102 | circle = requests.get(menu_path) 103 | # 将获取的图片地址依次放入count中 104 | count = [] 105 | # 将获取的网页内容放入BeautifulSoup 106 | soup = BeautifulSoup(circle.text, 'lxml') 107 | #print(menu_path) 108 | print('.', end='') 109 | 110 | for title in soup.select('div.fl.r_tab_l'): 111 | for title in title.find_all('span'): 112 | #print('title:', title.text) 113 | title = title.text 114 | 115 | for item in soup.select('.r_img'): 116 | # 用bs4中的find_all获取 #gallery-list 中是否存在 img这个标签 117 | for img in item.find_all('img'): 118 | #print('img_url:', img) 119 | # m 是 img标签中存在的属性 120 | img_path = img.get('data-original') 121 | count.append(img_path) 122 | 123 | #自动识别'文件夹+文件'重复后跳过下载如何continue 124 | if(os.path.exists('D:/manhua/manhuatest/' + book_name_clean + '/' + str(title) + '/')): 125 | continue 126 | else: 127 | os.makedirs('D:/manhua/manhuatest/' + book_name_clean + '/' + str(title) + '/') 128 | 129 | for i, v in enumerate(count): 130 | image = requests.get(v) 131 | if (os.path.exists('D:/manhua/manhuatest/' + book_name_clean + '/' + str(title) + '/' + str(i) + '.jpg')): 132 | continue 133 | else: 134 | with open('D:/manhua/manhuatest/' + book_name_clean + '/' + str(title) + '/' + str(i) + '.jpg', 'wb') as file: 135 | file.write(image.content) 136 | #print(i) 137 | continue 138 | continue 139 | 140 | 141 | def selectName(self): 142 | name,ok = QInputDialog.getText(self,"第一本","第一本序号:", 143 | 
QLineEdit.Normal,self.nameLable.text()) 144 | if ok and (len(name)!=0): 145 | self.nameLable.setText(name) 146 | def selectStyle(self): 147 | style, ok = QInputDialog.getText(self, "最后一本", "最后一本序号:", 148 | QLineEdit.Normal, self.nameLable.text()) 149 | if ok and (len(style)!=0): 150 | self.styleLable.setText(style) 151 | def selectOk(self): 152 | self.ManHua() 153 | os.system(r"F:\CloudMusic\是萝莉控真是太好了.mp3") 154 | #self.first=int(self.nameLable.text()) 155 | #self.last=self.styleLable.text() 156 | #print(self.first, self.last) 157 | #os.system(r"python D:\manhua\整站爬取www.manhwa.cc\manhua3.py") 158 | 159 | 160 | 161 | if __name__=="__main__": 162 | import sys 163 | app=QApplication(sys.argv) 164 | myshow=InputDialog() 165 | myshow.show() 166 | #InputDialog().ManHua() 167 | sys.exit(app.exec_()) 168 | 169 | 170 | 171 | 172 | 173 | -------------------------------------------------------------------------------- /manhwa_v4.0.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import re 4 | import os 5 | 6 | from PyQt5.QtWidgets import QApplication, QWidget, QLineEdit, QInputDialog, QGridLayout, QLabel, QPushButton, QFrame 7 | 8 | class InputDialog(QWidget): 9 | 10 | def __init__(self): 11 | super(InputDialog,self).__init__() 12 | self.initUi() 13 | 14 | def initUi(self): 15 | self.setWindowTitle("漫画爬取") 16 | self.setGeometry(400,400,300,260) 17 | 18 | label1=QLabel("第一本:") 19 | label2=QLabel("最后一本:") 20 | 21 | self.nameLable = QLabel("2")#1 22 | self.first=int(self.nameLable.text()) 23 | self.nameLable.setText(str(self.first)) 24 | self.nameLable.setFrameStyle(QFrame.Panel|QFrame.Sunken) 25 | self.styleLable = QLabel("2")#1030 26 | self.last=self.styleLable.text() 27 | self.styleLable.setText(str(self.last)) 28 | self.styleLable.setFrameStyle(QFrame.Panel|QFrame.Sunken) 29 | 30 | nameButton=QPushButton("更改") 31 | nameButton.clicked.connect(self.selectName) 32 | 
styleButton=QPushButton("更改") 33 | styleButton.clicked.connect(self.selectStyle) 34 | okButton = QPushButton("OK") 35 | okButton.clicked.connect(self.selectOk) 36 | 37 | mainLayout=QGridLayout() 38 | mainLayout.addWidget(label1,0,0) 39 | mainLayout.addWidget(self.nameLable,0,1) 40 | mainLayout.addWidget(nameButton,0,2) 41 | mainLayout.addWidget(label2,1,0) 42 | mainLayout.addWidget(self.styleLable,1,1) 43 | mainLayout.addWidget(styleButton,1,2) 44 | mainLayout.addWidget(okButton,2,1) 45 | 46 | self.setLayout(mainLayout) 47 | 48 | #爬取代码 49 | 50 | def ManHua(self): 51 | #2/3/4/ 52 | #多线程同时几个py def传值出去++? 53 | num=8 54 | for num1 in range(num,num+1): 55 | 56 | #循环过大(大于2都不行) 会直接返回Process finished with exit code -1073740791 (0xC0000409) 57 | #716字符问题无法生成文件夹 58 | #循环不行 只能采用一次次一本本下载完提醒 按多一次 59 | #后来发现不是循环不行 是这个 60 | # self.nameLable.setText(num1-1) 61 | # self.styleLable.setText(num1) 62 | #循环也有点关系 63 | 64 | import urllib.request # url包 65 | 66 | def openUrl(circle): 67 | headers = { 68 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36', 69 | 'Host': 'jandan.net' 70 | } 71 | req = urllib.request.Request(circle, headers=headers) 72 | response = urllib.request.urlopen(req) # 请求 73 | html = response.read() # 获取 74 | html = html.decode("utf-8") # 解码 75 | print(html) # 打印 76 | 77 | if __name__ == "__main__": 78 | circle = requests.get('https://www.manhwa.cc/book/' + str(num1)) 79 | 80 | # circle = requests.get('https://www.manhwa.cc/book/'+str(num1)) 81 | # 将获取的图片地址依次放入count中 82 | count = [] 83 | # 将获取的网页内容放入BeautifulSoup 84 | soup = BeautifulSoup(circle.text, 'lxml') 85 | # 根据谷歌SelectGadGet这个插件,获取html标签,比如获取:#gallery-list 86 | 87 | for item_book in soup.select('.d_bg_t'): 88 | for book_name in item_book.select('a')[0]: 89 | book_name_clean = book_name.string 90 | print(num1, book_name_clean) 91 | 92 | for item in soup.select('.d_menu>ul>li'): 93 | # 用bs4中的find_all获取 #gallery-list 中是否存在 img这个标签 94 | 
for a in item.find_all('a'): 95 | # print('a', a) 96 | # m 是 img标签中存在的属性 97 | menu_path = 'https://www.manhwa.cc/' + a.get('href') 98 | # count.append(menu_path) 99 | # menu_path_num.append(re.findall(r"\d+\.?\d*", menu_path)) 100 | menu_path_num = re.findall(r"\d+\.?\d*", menu_path) 101 | 102 | # 当前一部书爬取循环,从上面得到每一章地址后,遍历这么多“章”次 103 | 104 | # for num in menu_path_num: 105 | print('book_url:', menu_path) 106 | 107 | circle = requests.get(menu_path) 108 | # 将获取的图片地址依次放入count中 109 | count = [] 110 | # 将获取的网页内容放入BeautifulSoup 111 | soup = BeautifulSoup(circle.text, 'lxml') 112 | 113 | for title in soup.select('div.fl.r_tab_l'): 114 | for title in title.find_all('span'): 115 | print('title:', title.text) 116 | title = title.text 117 | 118 | for item in soup.select('.r_img'): 119 | # 用bs4中的find_all获取 #gallery-list 中是否存在 img这个标签 120 | for img in item.find_all('img'): 121 | print('img_url:', img) 122 | # m 是 img标签中存在的属性 123 | img_path = img.get('data-original') 124 | count.append(img_path) 125 | 126 | # 用enumerate依次取出count中的图片地址 放入v中 127 | os.makedirs('D://manhua//整站爬取www.manhwa.cc//整站漫画爬取//' + book_name_clean + '//' + str(title) + '//') 128 | for i, v in enumerate(count): 129 | # 将获取的v值再次放入request中进行与网站相应 130 | image = requests.get(v) 131 | # 存取图片过程中,出现不能存储 int 类型,故而,我们对他进行类型转换 str()。w:读写方式打开,b:二进制进行读写。图片一般用到的都是二进制。 132 | with open('D://manhua//整站爬取www.manhwa.cc//整站漫画爬取//' + book_name_clean + '//' + str( 133 | title) + '//' + str( 134 | i) + '.jpg', 'wb') as file: 135 | # with open('C://Users//50159//Desktop//manhua//test//' + str(num1) + '_' + str(i) + '.jpg', 'wb') as file: 136 | # content:图片转换成二进制,进行保存。 137 | file.write(image.content) 138 | print(i) 139 | 140 | 141 | 142 | #爬取代码 143 | 144 | def selectName(self): 145 | name,ok = QInputDialog.getText(self,"第一本","第一本序号:", 146 | QLineEdit.Normal,self.nameLable.text()) 147 | if ok and (len(name)!=0): 148 | self.nameLable.setText(name) 149 | def selectStyle(self): 150 | style, ok = QInputDialog.getText(self, "最后一本", 
"最后一本序号:", 151 | QLineEdit.Normal, self.nameLable.text()) 152 | if ok and (len(style)!=0): 153 | self.styleLable.setText(style) 154 | def selectOk(self): 155 | self.ManHua() 156 | #self.first=int(self.nameLable.text()) 157 | #self.last=self.styleLable.text() 158 | #print(self.first, self.last) 159 | #os.system(r"python D:\manhua\整站爬取www.manhwa.cc\manhua3.py") 160 | 161 | 162 | if __name__=="__main__": 163 | import sys 164 | app=QApplication(sys.argv) 165 | myshow=InputDialog() 166 | myshow.show() 167 | sys.exit(app.exec_()) 168 | os.system(r"F:\CloudMusic\是萝莉控真是太好了.mp3") 169 | 170 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 第一代版本: 2 | 3 |
4 |
5 | 6 | # 目的 7 | 8 |
9 | 10 | 1.目标网站:https://www.这里是网址.cc/ 11 | 12 | ![1565345268608](https://littlebuzi.github.io/post-images/1565345268608.png) 13 | 14 | 2.目标结果:获取全部漫画图片文件,并分好文件夹 15 | 16 | 17 |
18 |
19 | 20 | # 实现过程 21 | 22 | 23 |
24 | 25 | 26 | ## 基本逻辑 27 | 28 | ![1565360279264](https://littlebuzi.github.io/post-images/1565360279264.png) 29 | 30 | ## 代码实现 31 | 32 |
33 | 34 | ``` 35 | 36 | import requests 37 | from bs4 import BeautifulSoup 38 | import re 39 | import os 40 | 41 | #1-1030 42 | for num1 in range(1,1031): 43 | circle = requests.get('https://这里是网址/book/'+str(num1)) 44 | # 将获取的图片地址依次放入count中 45 | count = [] 46 | # 将获取的网页内容放入BeautifulSoup 47 | soup = BeautifulSoup(circle.text, 'lxml') 48 | # 根据谷歌SelectGadGet这个插件,获取html标签,比如获取:#gallery-list 49 | 50 | for item_book in soup.select('.d_bg_t'): 51 | for book_name in item_book.find_all('a'): 52 | if(book_name.string!='韩国'and book_name.string!='男性'): 53 | book_name_clean=book_name.string 54 | print(num1, book_name_clean) 55 | 56 | os.makedirs('D://manhua//整站漫画爬取//' + str(num1) +'.'+ book_name_clean ) 57 | 58 | #menu_path_num = [] 59 | 60 | for item in soup.select('.d_menu>ul>li'): 61 | # 用bs4中的find_all获取 #gallery-list 中是否存在 img这个标签 62 | for a in item.find_all('a'): 63 | #print('a', a) 64 | # m 是 img标签中存在的属性 65 | menu_path = 'https://www.manhwa.cc/' + a.get('href') 66 | #count.append(menu_path) 67 | #menu_path_num.append(re.findall(r"\d+\.?\d*", menu_path)) 68 | menu_path_num=re.findall(r"\d+\.?\d*", menu_path) 69 | 70 | #当前一部书爬取循环,从上面得到每一章地址后,遍历这么多“章”次 71 | 72 | #for num in menu_path_num: 73 | print('book_url:',menu_path) 74 | circle = requests.get(menu_path) 75 | # 将获取的图片地址依次放入count中 76 | count = [] 77 | # 将获取的网页内容放入BeautifulSoup 78 | soup = BeautifulSoup(circle.text, 'lxml') 79 | # 根据谷歌SelectGadGet这个插件,获取html标签,比如获取:#gallery-list 80 | 81 | for title in soup.select('div.fl.r_tab_l'): 82 | for title in title.find_all('span'): 83 | print('title:', title.text) 84 | title=title.text 85 | 86 | for item in soup.select('.r_img'): 87 | # 用bs4中的find_all获取 #gallery-list 中是否存在 img这个标签 88 | for img in item.find_all('img'): 89 | print('img_url:', img) 90 | # m 是 img标签中存在的属性 91 | img_path = img.get('data-original') 92 | count.append(img_path) 93 | # 用enumerate依次取出count中的图片地址 放入v中 94 | os.makedirs('D://manhua//整站漫画爬取//' + book_name_clean + '//' + str(title) + '//') 95 | for i, v in 
enumerate(count): 96 | # 将获取的v值再次放入request中进行与网站相应 97 | image = requests.get(v) 98 | # 存取图片过程中,出现不能存储 int 类型,故而,我们对他进行类型转换 str()。w:读写方式打开,b:二进制进行读写。图片一般用到的都是二进制。 99 | with open('D://manhua//整站漫画爬取//' + book_name_clean + '//'+ str(title) + '//' +str(i) + '.jpg', 'wb') as file: 100 | #with open('C://Users//50159//Desktop//manhua//test//' + str(num1) + '_' + str(i) + '.jpg', 'wb') as file: 101 | # content:图片转换成二进制,进行保存。 102 | file.write(image.content) 103 | print(i) 104 | ``` 105 | 106 | 到这基本工作已完成,进入测试阶段,出现以下 107 | 108 |
109 |
110 | 111 | # 测试问题 112 | 113 | 114 | 1.第250本左右,书名字开始出现异常,爬取书名有其他文字并出现混乱,因为之前是通过最前面几本书的情况,通过抛弃字样,来筛选出书名,而后1030本里标签发生变动,所以之后通过只取第一个出现的标签代替现在的筛选。 115 | 116 | 2.文件夹命名及生成文件夹出错,由于整理时出现混乱,代码写重复了。而后修改。 117 | 118 | 3.中途停止,可能是网站识别到了这是爬虫,而后添加伪浏览器头部head,还是会停,基本是connect超时。 119 | 120 | 针对上面问题,修改成了2.0版本: 121 | 122 | ``` 123 | 124 | import requests 125 | from bs4 import BeautifulSoup 126 | import re 127 | import os 128 | 129 | #1-1030 130 | for num1 in range(2,1031): 131 | #716字符问题无法生成文件夹 132 | import urllib.request # url包 133 | 134 | def openUrl(circle): 135 | headers = { 136 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36', 137 | 'Host': 'jandan.net' 138 | } 139 | req = urllib.request.Request(circle, headers=headers) 140 | response = urllib.request.urlopen(req) # 请求 141 | html = response.read() # 获取 142 | html = html.decode("utf-8") # 解码 143 | print(html) # 打印 144 | 145 | if __name__ == "__main__": 146 | circle = requests.get('https://这里是网址/book/' + str(num1)) 147 | 148 | # 将获取的图片地址依次放入count中 149 | count = [] 150 | # 将获取的网页内容放入BeautifulSoup 151 | soup = BeautifulSoup(circle.text, 'lxml') 152 | # 根据谷歌SelectGadGet这个插件,获取html标签,比如获取:#gallery-list 153 | 154 | for item_book in soup.select('.d_bg_t'): 155 | for book_name in item_book.select('a')[0]: 156 | book_name_clean = book_name.string 157 | print(num1, book_name_clean) 158 | 159 | #os.makedirs('D://manhua//整站漫画爬取//' + str(num1) +'.'+ book_name_clean ) 160 | 161 | for item_book in soup.select('.d_bg_t'): 162 | for book_name in item_book.find_all('a'): 163 | if(book_name.string!='韩国'and book_name.string!='男性'): 164 | book_name_clean=book_name.string 165 | print(num1, book_name_clean) 166 | 167 | #menu_path_num = [] 168 | 169 | for item in soup.select('.d_menu>ul>li'): 170 | # 用bs4中的find_all获取 #gallery-list 中是否存在 img这个标签 171 | for a in item.find_all('a'): 172 | #print('a', a) 173 | # m 是 img标签中存在的属性 174 | menu_path = 'https://www.manhwa.cc/' + 
a.get('href') 175 | #count.append(menu_path) 176 | #menu_path_num.append(re.findall(r"\d+\.?\d*", menu_path)) 177 | menu_path_num=re.findall(r"\d+\.?\d*", menu_path) 178 | 179 | #当前一部书爬取循环,从上面得到每一章地址后,遍历这么多“章”次 180 | 181 | #for num in menu_path_num: 182 | print('book_url:',menu_path) 183 | 184 | circle = requests.get(menu_path) 185 | # 将获取的图片地址依次放入count中 186 | count = [] 187 | # 将获取的网页内容放入BeautifulSoup 188 | soup = BeautifulSoup(circle.text, 'lxml') 189 | 190 | for title in soup.select('div.fl.r_tab_l'): 191 | for title in title.find_all('span'): 192 | print('title:', title.text) 193 | title=title.text 194 | 195 | for item in soup.select('.r_img'): 196 | # 用bs4中的find_all获取 #gallery-list 中是否存在 img这个标签 197 | for img in item.find_all('img'): 198 | print('img_url:', img) 199 | # m 是 img标签中存在的属性 200 | img_path = img.get('data-original') 201 | count.append(img_path) 202 | 203 | # 用enumerate依次取出count中的图片地址 放入v中 204 | os.makedirs('D://manhua//整站漫画爬取//' + book_name_clean + '//' + str(title) + '//') 205 | for i, v in enumerate(count): 206 | # 将获取的v值再次放入request中进行与网站相应 207 | image = requests.get(v) 208 | # 存取图片过程中,出现不能存储 int 类型,故而,我们对他进行类型转换 str()。w:读写方式打开,b:二进制进行读写。图片一般用到的都是二进制。 209 | with open('D://manhua//整站漫画爬取//' + book_name_clean + '//'+ str(title) + '//' +str(i) + '.jpg', 'wb') as file: 210 | #with open('C://Users//50159//Desktop//manhua//test//' + str(num1) + '_' + str(i) + '.jpg', 'wb') as file: 211 | # content:图片转换成二进制,进行保存。 212 | file.write(image.content) 213 | print(i) 214 | 215 | ``` 216 | 217 |
218 |
219 | 220 | # 爬取过程: 221 | 222 |
223 |
224 | 225 | ![1565361188393](https://littlebuzi.github.io/post-images/1565361188393.png) 226 | 227 | ![1565361235369](https://littlebuzi.github.io/post-images/1565361235369.png) 228 | 229 | ![1565361244280](https://littlebuzi.github.io/post-images/1565361244280.png) 230 | 231 | 基本可行,最高纪录是爬取四本后停止。 232 | 233 | 真的太多了,平均每本大小约 150 MB。 234 | 235 |
236 |
237 | 238 | # 总结: 239 | 240 |
241 |
242 | 243 | 爬取整本漫画 ✅ 244 | 245 | 整站漫画半自动化爬取(停止后需手动重新启动一次)✅ 246 | 247 | 全自动下载网站漫画 (会被网站截停)❌ 248 | 249 |
250 | 251 |
252 | 253 | # 2.0优化版 254 | 255 |
256 | 257 |
258 | 259 | ------ 260 | 261 |
262 | 263 | # 特点 264 | 265 |
266 | 267 | 添加 UI 界面 ✅ 268 | 269 | 网站截停后播放音乐提醒,接近半自动重启 ✅ 270 | 271 | 各个爬取细节优化,程序自身基本无报错 ✅ 272 | 273 | cmd 输出界面优化 ✅ 274 | 275 | 计时器检测添加中(待) 276 | 277 | 全自动重启(待) 278 | 279 |
280 | 281 | ------ 282 | 283 |
284 | 285 | # 逻辑 286 | 287 |
288 | 289 | ![1565871675669](https://littlebuzi.github.io/post-images/1565871675669.png) 290 | 291 |
292 | 293 | ------ 294 | 295 |
296 | 297 | # 代码 298 | 299 |
300 | 301 | ## 启动代码: 302 | 303 | ``` 304 | 305 | import os 306 | 307 | os.system(r"python D:\manhua\这里是网址\manhua4.py") 308 | 309 | os.system(r"F:\CloudMusic\是萝莉控真是太好了.mp3") 310 | 311 | ``` 312 | 313 | ## 爬取代码: 314 | 315 | ``` 316 | 317 | import requests 318 | from bs4 import BeautifulSoup 319 | import re 320 | import os 321 | 322 | from PyQt5.QtWidgets import QApplication, QWidget, QLineEdit, QInputDialog, QGridLayout, QLabel, QPushButton, QFrame, QProgressBar 323 | 324 | first=1 325 | 326 | class InputDialog(QWidget): 327 | 328 | def __init__(self): 329 | super(InputDialog,self).__init__() 330 | self.initUi() 331 | 332 | def initUi(self): 333 | self.setWindowTitle("漫画爬取") 334 | self.setGeometry(50,50,1200,600) 335 | 336 | label1=QLabel("第一本:") 337 | label2=QLabel("最后一本:") 338 | 339 | self.nameLable = QLabel("1")#1 340 | self.first=int(self.nameLable.text()) 341 | self.nameLable.setText(str(self.first)) 342 | self.nameLable.setFrameStyle(QFrame.Panel|QFrame.Sunken) 343 | self.styleLable = QLabel("1")#1030 344 | self.last=self.styleLable.text() 345 | self.styleLable.setText(str(self.last)) 346 | self.styleLable.setFrameStyle(QFrame.Panel|QFrame.Sunken) 347 | 348 | # 设置进度条(弃用) 349 | 350 | nameButton=QPushButton("更改") 351 | nameButton.clicked.connect(self.selectName) 352 | styleButton=QPushButton("更改") 353 | styleButton.clicked.connect(self.selectStyle) 354 | okButton = QPushButton("OK") 355 | okButton.clicked.connect(self.selectOk) 356 | 357 | mainLayout=QGridLayout() 358 | mainLayout.addWidget(label1,0,0) 359 | mainLayout.addWidget(self.nameLable,0,1) 360 | mainLayout.addWidget(nameButton,0,2) 361 | mainLayout.addWidget(label2,1,0) 362 | mainLayout.addWidget(self.styleLable,1,1) 363 | mainLayout.addWidget(styleButton,1,2) 364 | mainLayout.addWidget(okButton,2,1) 365 | 366 | self.setLayout(mainLayout) 367 | 368 | #爬取代码 369 | 370 | def ManHua(self): 371 | 372 | for num1 in range(first,1030): 373 | import urllib.request # url包 374 | 375 | def openUrl(circle): 376 | 
headers = { 377 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36', 378 | 'Host': 'jandan.net' 379 | } 380 | req = urllib.request.Request(circle, headers=headers) 381 | response = urllib.request.urlopen(req) # 请求 382 | html = response.read() # 获取 383 | html = html.decode("utf-8") # 解码 384 | print(html) # 打印 385 | 386 | if __name__ == "__main__": 387 | circle = requests.get('https://这里是网址/book/' + str(num1)) 388 | 389 | count = [] 390 | soup = BeautifulSoup(circle.text, 'lxml') 391 | 392 | for item_book in soup.select('.d_bg_t'): 393 | for book_name in item_book.select('a')[0]: 394 | book_name_clean = book_name.string 395 | print('') 396 | print("正在下载:",num1, book_name_clean) 397 | aa=0 398 | #print(aa,num1) 399 | if num1>aa: 400 | aa=num1 401 | #print(aa) 402 | for i in range(int(num1*(100/1030))+1): 403 | print('\r'+'总进度:' + '▇' * (i // 2) + str(i) + '%', end='') 404 | print('') 405 | 406 | for item in soup.select('.d_menu>ul>li'): 407 | for a in item.find_all('a'): 408 | menu_path = 'https://这里是网址/' + a.get('href') 409 | # count.append(menu_path) 410 | # menu_path_num.append(re.findall(r"\d+\.?\d*", menu_path)) 411 | menu_path_num = re.findall(r"\d+\.?\d*", menu_path) 412 | 413 | # 当前一部书爬取循环,从上面得到每一章地址后,遍历这么多“章”次 414 | 415 | # for num in menu_path_num: 416 | #print('book_url:', menu_path) 417 | 418 | 419 | circle = requests.get(menu_path) 420 | count = [] 421 | soup = BeautifulSoup(circle.text, 'lxml') 422 | #print(menu_path) 423 | print('.', end='') 424 | 425 | for title in soup.select('div.fl.r_tab_l'): 426 | for title in title.find_all('span'): 427 | #print('title:', title.text) 428 | title = title.text 429 | 430 | for item in soup.select('.r_img'): 431 | for img in item.find_all('img'): 432 | #print('img_url:', img) 433 | img_path = img.get('data-original') 434 | count.append(img_path) 435 | 436 | #自动识别'文件夹+文件'重复后跳过下载如何continue 437 | if(os.path.exists('D:/manhua/manhuatest/' + 
book_name_clean + '/' + str(title) + '/')): 438 | continue 439 | else: 440 | os.makedirs('D:/manhua/manhuatest/' + book_name_clean + '/' + str(title) + '/') 441 | 442 | for i, v in enumerate(count): 443 | image = requests.get(v) 444 | if (os.path.exists('D:/manhua/manhuatest/' + book_name_clean + '/' + str(title) + '/' + str(i) + '.jpg')): 445 | continue 446 | else: 447 | with open('D:/manhua/manhuatest/' + book_name_clean + '/' + str(title) + '/' + str(i) + '.jpg', 'wb') as file: 448 | file.write(image.content) 449 | #print(i) 450 | continue 451 | continue 452 | 453 | 454 | def selectName(self): 455 | name,ok = QInputDialog.getText(self,"第一本","第一本序号:", 456 | QLineEdit.Normal,self.nameLable.text()) 457 | if ok and (len(name)!=0): 458 | self.nameLable.setText(name) 459 | def selectStyle(self): 460 | style, ok = QInputDialog.getText(self, "最后一本", "最后一本序号:", 461 | QLineEdit.Normal, self.nameLable.text()) 462 | if ok and (len(style)!=0): 463 | self.styleLable.setText(style) 464 | def selectOk(self): 465 | self.ManHua() 466 | 467 | if __name__=="__main__": 468 | import sys 469 | app=QApplication(sys.argv) 470 | myshow=InputDialog() 471 | myshow.show() 472 | sys.exit(app.exec_()) 473 | 474 | 475 | ``` 476 | 477 |
478 | 479 | ------ 480 | 481 |
482 | 483 | # 过程 484 | 485 |
486 | 487 | ![1565763309680](https://littlebuzi.github.io/post-images/1565763309680.png) 488 | 489 | ![1565763331643](https://littlebuzi.github.io/post-images/1565763331643.png) 490 | 491 | ![1565763348226](https://littlebuzi.github.io/post-images/1565763348226.png) 492 | 493 |
494 | 495 | ------ 496 | 497 |
498 | 499 | # 总结 500 | 501 |
502 | 503 | 整站漫画全自动化爬取✅ 504 | 505 | 不能自动重启❌ 506 | 507 | 基本实现了 95% 的功能,项目可宣布成功完成!✅ 508 | 509 |
510 | --------------------------------------------------------------------------------