├── manhwa_v5.1.py
├── manhwa_v1.0.py
├── manhwa_v2.0.py
├── manhwa_v3.0.py
├── manhwa_v5.0.py
├── manhwa_v4.0.py
└── README.md


/manhwa_v5.1.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
def main():
    """Run the crawler script, then open the notification sound.

    Both paths are machine-specific Windows paths.
    """
    import subprocess
    import sys

    # Use the interpreter that is running this launcher rather than whatever
    # ``python`` happens to resolve to on PATH, and pass the script path as a
    # single argument so no shell has to re-parse it.
    result = subprocess.run(
        [sys.executable, r"D:\manhua\整站爬取www.manhwa.cc\manhua4.py"]
    )
    if result.returncode != 0:
        print('crawler exited with code', result.returncode)

    # Notification sound: the shell is asked to open the file.
    os.system(r"F:\CloudMusic\是萝莉控真是太好了.mp3")


if __name__ == "__main__":
    main()
6 | 


--------------------------------------------------------------------------------
/manhwa_v1.0.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | from bs4 import BeautifulSoup
 3 | import re
 4 | import os
 5 | 
 6 | #1-1030
 7 | for num1 in range(1,1031):
 8 |     circle = requests.get('https://www.manhwa.cc/book/'+str(num1))
 9 |     # 将获取的图片地址依次放入count中
10 |     count = []
11 |     # 将获取的网页内容放入BeautifulSoup
12 |     soup = BeautifulSoup(circle.text, 'lxml')
13 |     # 根据谷歌SelectGadGet这个插件，获取html标签，比如获取：#gallery-list
14 | 
15 |     for item_book in soup.select('.d_bg_t'):
16 |         for book_name in item_book.find_all('a'):
17 |             if(book_name.string!='韩国'and book_name.string!='男性'):
18 |                 book_name_clean=book_name.string
19 |                 print(num1, book_name_clean)
20 | 
21 |     os.makedirs('D://manhua//整站爬取www.manhwa.cc//整站漫画爬取//' + str(num1) +'.'+ book_name_clean )
22 | 
23 |     #menu_path_num = []
24 | 
25 |     for item in soup.select('.d_menu>ul>li'):
26 |         # 用bs4中的find_all获取 #gallery-list 中是否存在 img这个标签
27 |         for a in item.find_all('a'):
28 |             #print('a', a)
29 |             # m 是 img标签中存在的属性
30 |             menu_path = 'https://www.manhwa.cc/' + a.get('href')
31 |             #count.append(menu_path)
32 |             #menu_path_num.append(re.findall(r"\d+\.?\d*", menu_path))
33 |             menu_path_num=re.findall(r"\d+\.?\d*", menu_path)
34 | 
35 |             #当前一部书爬取循环，从上面得到每一章地址后，遍历这么多“章”次
36 | 
37 |             #for num in menu_path_num:
38 |             print('book_url:',menu_path)
39 |             circle = requests.get(menu_path)
40 |             # 将获取的图片地址依次放入count中
41 |             count = []
42 |             # 将获取的网页内容放入BeautifulSoup
43 |             soup = BeautifulSoup(circle.text, 'lxml')
44 |             # 根据谷歌SelectGadGet这个插件，获取html标签，比如获取：#gallery-list
45 | 
46 | 
47 |             for title in soup.select('div.fl.r_tab_l'):
48 |                 for title in title.find_all('span'):
49 |                     print('title:', title.text)
50 |                     title=title.text
51 | 
52 |             for item in soup.select('.r_img'):
53 |                 # 用bs4中的find_all获取 #gallery-list 中是否存在 img这个标签
54 |                 for img in item.find_all('img'):
55 |                     print('img_url:', img)
56 |                     # m 是 img标签中存在的属性
57 |                     img_path = img.get('data-original')
58 |                     count.append(img_path)
59 |             # 用enumerate依次取出count中的图片地址 放入v中
60 |             os.makedirs('D://manhua//整站爬取www.manhwa.cc//整站漫画爬取//' +  book_name_clean + '//' + str(title) + '//')
61 |             for i, v in enumerate(count):
62 |                 # 将获取的v值再次放入request中进行与网站相应
63 |                 image = requests.get(v)
64 |                 # 存取图片过程中，出现不能存储 int 类型，故而，我们对他进行类型转换 str()。w:读写方式打开，b：二进制进行读写。图片一般用到的都是二进制。
65 |                 with open('D://manhua//整站爬取www.manhwa.cc//整站漫画爬取//' + book_name_clean + '//'+ str(title) + '//' +str(i) + '.jpg', 'wb') as file:
66 |                 #with open('C://Users//50159//Desktop//manhua//test//' + str(num1) + '_' + str(i) + '.jpg', 'wb') as file:
67 |                     # content：图片转换成二进制，进行保存。
68 |                     file.write(image.content)
69 |                 print(i)
70 | 
71 | 


--------------------------------------------------------------------------------
/manhwa_v2.0.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | from bs4 import BeautifulSoup
 3 | import re
 4 | import os
 5 | 
 6 | #1-1030
 7 | for num1 in range(1,1030):
 8 |     #716字符问题无法生成文件夹
 9 | 
10 |     import urllib.request  # url包
11 | 
12 |     def openUrl(circle):
13 |         headers = {
14 |             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
15 |             'Host': 'jandan.net'
16 |         }
17 |         req = urllib.request.Request(circle, headers=headers)
18 |         response = urllib.request.urlopen(req)  # 请求
19 |         html = response.read()  # 获取
20 |         html = html.decode("utf-8")  # 解码
21 |         print(html)  # 打印
22 | 
23 |     if __name__ == "__main__":
24 |         circle = requests.get('https://www.manhwa.cc/book/' + str(num1))
25 | 
26 |    # circle = requests.get('https://www.manhwa.cc/book/'+str(num1))
27 |     # 将获取的图片地址依次放入count中
28 |     count = []
29 |     # 将获取的网页内容放入BeautifulSoup
30 |     soup = BeautifulSoup(circle.text, 'lxml')
31 |     # 根据谷歌SelectGadGet这个插件，获取html标签，比如获取：#gallery-list
32 | 
33 |     for item_book in soup.select('.d_bg_t'):
34 |         for book_name in item_book.select('a')[0]:
35 |             book_name_clean = book_name.string
36 |             print(num1, book_name_clean)
37 | 
38 | 
39 |     #menu_path_num = []
40 | 
41 |     for item in soup.select('.d_menu>ul>li'):
42 |         # 用bs4中的find_all获取 #gallery-list 中是否存在 img这个标签
43 |         for a in item.find_all('a'):
44 |             #print('a', a)
45 |             # m 是 img标签中存在的属性
46 |             menu_path = 'https://www.manhwa.cc/' + a.get('href')
47 |             #count.append(menu_path)
48 |             #menu_path_num.append(re.findall(r"\d+\.?\d*", menu_path))
49 |             menu_path_num=re.findall(r"\d+\.?\d*", menu_path)
50 | 
51 |             #当前一部书爬取循环，从上面得到每一章地址后，遍历这么多“章”次
52 | 
53 |             #for num in menu_path_num:
54 |             print('book_url:',menu_path)
55 |             
56 |             circle = requests.get(menu_path)
57 |             # 将获取的图片地址依次放入count中
58 |             count = []
59 |             # 将获取的网页内容放入BeautifulSoup
60 |             soup = BeautifulSoup(circle.text, 'lxml')
61 | 
62 |             for title in soup.select('div.fl.r_tab_l'):
63 |                 for title in title.find_all('span'):
64 |                     print('title:', title.text)
65 |                     title=title.text
66 | 
67 |             for item in soup.select('.r_img'):
68 |                 # 用bs4中的find_all获取 #gallery-list 中是否存在 img这个标签
69 |                 for img in item.find_all('img'):
70 |                     print('img_url:', img)
71 |                     # m 是 img标签中存在的属性
72 |                     img_path = img.get('data-original')
73 |                     count.append(img_path)
74 |                     
75 |             # 用enumerate依次取出count中的图片地址 放入v中
76 |             os.makedirs('D://manhua//整站爬取www.manhwa.cc//整站漫画爬取//' +  book_name_clean + '//' + str(title) + '//')
77 |             for i, v in enumerate(count):
78 |                 # 将获取的v值再次放入request中进行与网站相应
79 |                 image = requests.get(v)
80 |                 # 存取图片过程中，出现不能存储 int 类型，故而，我们对他进行类型转换 str()。w:读写方式打开，b：二进制进行读写。图片一般用到的都是二进制。
81 |                 with open('D://manhua//整站爬取www.manhwa.cc//整站漫画爬取//' + book_name_clean + '//'+ str(title) + '//' +str(i) + '.jpg', 'wb') as file:
82 |                 #with open('C://Users//50159//Desktop//manhua//test//' + str(num1) + '_' + str(i) + '.jpg', 'wb') as file:
83 |                     # content：图片转换成二进制，进行保存。
84 |                     file.write(image.content)
85 |                 print(i)
86 | 


--------------------------------------------------------------------------------
/manhwa_v3.0.py:
--------------------------------------------------------------------------------
 1 | import requests
 2 | from bs4 import BeautifulSoup
 3 | import re
 4 | import os
 5 | 
 6 | # 1-1030
 7 | 
 8 | from manhua3_ui import A
 9 | 
10 | A().aa()
11 | 
12 | '''
13 | for num1 in range(first, last):
14 |     # 716字符问题无法生成文件夹
15 | 
16 |     import urllib.request  # url包
17 | 
18 |     def openUrl(circle):
19 |         headers = {
20 |             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
21 |             'Host': 'jandan.net'
22 |         }
23 |         req = urllib.request.Request(circle, headers=headers)
24 |         response = urllib.request.urlopen(req)  # 请求
25 |         html = response.read()  # 获取
26 |         html = html.decode("utf-8")  # 解码
27 |         print(html)  # 打印
28 | 
29 | 
30 |     if __name__ == "__main__":
31 |         circle = requests.get('https://www.manhwa.cc/book/' + str(num1))
32 | 
33 |     # circle = requests.get('https://www.manhwa.cc/book/'+str(num1))
34 |     # 将获取的图片地址依次放入count中
35 |     count = []
36 |     # 将获取的网页内容放入BeautifulSoup
37 |     soup = BeautifulSoup(circle.text, 'lxml')
38 |     # 根据谷歌SelectGadGet这个插件，获取html标签，比如获取：#gallery-list
39 | 
40 |     for item_book in soup.select('.d_bg_t'):
41 |         for book_name in item_book.select('a')[0]:
42 |             book_name_clean = book_name.string
43 |             print(num1, book_name_clean)
44 | 
45 |     for item in soup.select('.d_menu>ul>li'):
46 |         # 用bs4中的find_all获取 #gallery-list 中是否存在 img这个标签
47 |         for a in item.find_all('a'):
48 |             # print('a', a)
49 |             # m 是 img标签中存在的属性
50 |             menu_path = 'https://www.manhwa.cc/' + a.get('href')
51 |             # count.append(menu_path)
52 |             # menu_path_num.append(re.findall(r"\d+\.?\d*", menu_path))
53 |             menu_path_num = re.findall(r"\d+\.?\d*", menu_path)
54 | 
55 |             # 当前一部书爬取循环，从上面得到每一章地址后，遍历这么多“章”次
56 | 
57 |             # for num in menu_path_num:
58 |             print('book_url:', menu_path)
59 | 
60 |             circle = requests.get(menu_path)
61 |             # 将获取的图片地址依次放入count中
62 |             count = []
63 |             # 将获取的网页内容放入BeautifulSoup
64 |             soup = BeautifulSoup(circle.text, 'lxml')
65 | 
66 |             for title in soup.select('div.fl.r_tab_l'):
67 |                 for title in title.find_all('span'):
68 |                     print('title:', title.text)
69 |                     title = title.text
70 | 
71 |             for item in soup.select('.r_img'):
72 |                 # 用bs4中的find_all获取 #gallery-list 中是否存在 img这个标签
73 |                 for img in item.find_all('img'):
74 |                     print('img_url:', img)
75 |                     # m 是 img标签中存在的属性
76 |                     img_path = img.get('data-original')
77 |                     count.append(img_path)
78 | 
79 |             # 用enumerate依次取出count中的图片地址 放入v中
80 |             os.makedirs('D://manhua//整站爬取www.manhwa.cc//整站漫画爬取//' + book_name_clean + '//' + str(title) + '//')
81 |             for i, v in enumerate(count):
82 |                 # 将获取的v值再次放入request中进行与网站相应
83 |                 image = requests.get(v)
84 |                 # 存取图片过程中，出现不能存储 int 类型，故而，我们对他进行类型转换 str()。w:读写方式打开，b：二进制进行读写。图片一般用到的都是二进制。
85 |                 with open('D://manhua//整站爬取www.manhwa.cc//整站漫画爬取//' + book_name_clean + '//' + str(title) + '//' + str(
86 |                         i) + '.jpg', 'wb') as file:
87 |                     # with open('C://Users//50159//Desktop//manhua//test//' + str(num1) + '_' + str(i) + '.jpg', 'wb') as file:
88 |                     # content：图片转换成二进制，进行保存。
89 |                     file.write(image.content)
90 | '''                #print(i)
91 | 
92 | 


--------------------------------------------------------------------------------
/manhwa_v5.0.py:
--------------------------------------------------------------------------------
  1 | import requests
  2 | from bs4 import BeautifulSoup
  3 | import re
  4 | import os
  5 | 
  6 | from PyQt5.QtWidgets import QApplication, QWidget, QLineEdit, QInputDialog, QGridLayout, QLabel, QPushButton, QFrame, QProgressBar
  7 | 
  8 | first=16
  9 | 
 10 | class InputDialog(QWidget):
 11 | 
 12 |     def __init__(self):
 13 |         super(InputDialog,self).__init__()
 14 |         self.initUi()
 15 | 
 16 |     def initUi(self):
 17 |         self.setWindowTitle("漫画爬取")
 18 |         self.setGeometry(50,50,1200,600)
 19 | 
 20 |         label1=QLabel("第一本:")
 21 |         label2=QLabel("最后一本:")
 22 | 
 23 |         self.nameLable = QLabel("1")#1
 24 |         self.first=int(self.nameLable.text())
 25 |         self.nameLable.setText(str(self.first))
 26 |         self.nameLable.setFrameStyle(QFrame.Panel|QFrame.Sunken)
 27 |         self.styleLable = QLabel("1")#1030
 28 |         self.last=self.styleLable.text()
 29 |         self.styleLable.setText(str(self.last))
 30 |         self.styleLable.setFrameStyle(QFrame.Panel|QFrame.Sunken)
 31 | 
 32 |         # 设置进度条(弃用)
 33 | 
 34 |         nameButton=QPushButton("更改")
 35 |         nameButton.clicked.connect(self.selectName)
 36 |         styleButton=QPushButton("更改")
 37 |         styleButton.clicked.connect(self.selectStyle)
 38 |         okButton = QPushButton("OK")
 39 |         okButton.clicked.connect(self.selectOk)
 40 | 
 41 |         mainLayout=QGridLayout()
 42 |         mainLayout.addWidget(label1,0,0)
 43 |         mainLayout.addWidget(self.nameLable,0,1)
 44 |         mainLayout.addWidget(nameButton,0,2)
 45 |         mainLayout.addWidget(label2,1,0)
 46 |         mainLayout.addWidget(self.styleLable,1,1)
 47 |         mainLayout.addWidget(styleButton,1,2)
 48 |         mainLayout.addWidget(okButton,2,1)
 49 | 
 50 |         self.setLayout(mainLayout)
 51 | 
 52 |     #爬取代码
 53 |     def ManHua(self):
 54 | 
 55 |         for num1 in range(first,1030):
 56 |             import urllib.request  # url包
 57 | 
 58 |             def openUrl(circle):
 59 |                 headers = {
 60 |                     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
 61 |                     'Host': 'jandan.net'
 62 |                 }
 63 |                 req = urllib.request.Request(circle, headers=headers)
 64 |                 response = urllib.request.urlopen(req)  # 请求
 65 |                 html = response.read()  # 获取
 66 |                 html = html.decode("utf-8")  # 解码
 67 |                 print(html)  # 打印
 68 | 
 69 |             if __name__ == "__main__":
 70 |                 circle = requests.get('https://www.manhwa.cc/book/' + str(num1))
 71 | 
 72 |             count = []
 73 |             soup = BeautifulSoup(circle.text, 'lxml')
 74 | 
 75 |             for item_book in soup.select('.d_bg_t'):
 76 |                 for book_name in item_book.select('a')[0]:
 77 |                     book_name_clean = book_name.string
 78 |                     print('')
 79 |                     print("正在下载：",num1, book_name_clean)
 80 |                     aa=0
 81 |                     #print(aa,num1)
 82 |                     if num1>aa:
 83 |                         aa=num1
 84 |                         #print(aa)
 85 |                         for i in range(int(num1*(100/1030))+1):
 86 |                             print('\r'+'总进度：' + '▇' * (i // 2) + str(i) + '%', end='')
 87 |                             print('')
 88 | 
 89 |             for item in soup.select('.d_menu>ul>li'):
 90 |                 for a in item.find_all('a'):
 91 |                     menu_path = 'https://www.manhwa.cc/' + a.get('href')
 92 |                     # count.append(menu_path)
 93 |                     # menu_path_num.append(re.findall(r"\d+\.?\d*", menu_path))
 94 |                     menu_path_num = re.findall(r"\d+\.?\d*", menu_path)
 95 | 
 96 |                     # 当前一部书爬取循环，从上面得到每一章地址后，遍历这么多“章”次
 97 | 
 98 |                     # for num in menu_path_num:
 99 |                     #print('book_url:', menu_path)
100 | 
101 | 
102 |                     circle = requests.get(menu_path)
103 |                     # 将获取的图片地址依次放入count中
104 |                     count = []
105 |                     # 将获取的网页内容放入BeautifulSoup
106 |                     soup = BeautifulSoup(circle.text, 'lxml')
107 |                     #print(menu_path)
108 |                     print('.', end='')
109 | 
110 |                     for title in soup.select('div.fl.r_tab_l'):
111 |                         for title in title.find_all('span'):
112 |                             #print('title:', title.text)
113 |                             title = title.text
114 | 
115 |                     for item in soup.select('.r_img'):
116 |                         # 用bs4中的find_all获取 #gallery-list 中是否存在 img这个标签
117 |                         for img in item.find_all('img'):
118 |                             #print('img_url:', img)
119 |                             # m 是 img标签中存在的属性
120 |                             img_path = img.get('data-original')
121 |                             count.append(img_path)
122 | 
123 |                     #自动识别'文件夹+文件'重复后跳过下载如何continue
124 |                     if(os.path.exists('D:/manhua/manhuatest/' + book_name_clean + '/' + str(title) + '/')):
125 |                         continue
126 |                     else:
127 |                         os.makedirs('D:/manhua/manhuatest/' + book_name_clean + '/' + str(title) + '/')
128 | 
129 |                         for i, v in enumerate(count):
130 |                             image = requests.get(v)
131 |                             if (os.path.exists('D:/manhua/manhuatest/' + book_name_clean + '/' + str(title) + '/' + str(i) + '.jpg')):
132 |                                 continue
133 |                             else:
134 |                                 with open('D:/manhua/manhuatest/' + book_name_clean + '/' + str(title) + '/' + str(i) + '.jpg', 'wb') as file:
135 |                                     file.write(image.content)
136 |                                 #print(i)
137 |                                 continue
138 |                         continue
139 | 
140 | 
141 |     def selectName(self):
142 |         name,ok = QInputDialog.getText(self,"第一本","第一本序号:",
143 |                                        QLineEdit.Normal,self.nameLable.text())
144 |         if ok and (len(name)!=0):
145 |             self.nameLable.setText(name)
146 |     def selectStyle(self):
147 |         style, ok = QInputDialog.getText(self, "最后一本", "最后一本序号:",
148 |                                         QLineEdit.Normal, self.nameLable.text())
149 |         if ok and (len(style)!=0):
150 |             self.styleLable.setText(style)
151 |     def selectOk(self):
152 |         self.ManHua()
153 |         os.system(r"F:\CloudMusic\是萝莉控真是太好了.mp3")
154 |         #self.first=int(self.nameLable.text())
155 |         #self.last=self.styleLable.text()
156 |         #print(self.first, self.last)
157 |         #os.system(r"python D:\manhua\整站爬取www.manhwa.cc\manhua3.py")
158 | 
159 | 
160 | 
if __name__ == "__main__":
    import sys

    # Create the Qt application, show the dialog, then hand control to the
    # event loop; its return value becomes the process exit status.
    app = QApplication(sys.argv)
    myshow = InputDialog()
    myshow.show()
    exit_status = app.exec_()
    sys.exit(exit_status)
168 | 
169 | 
170 | 
171 | 
172 | 
173 | 


--------------------------------------------------------------------------------
/manhwa_v4.0.py:
--------------------------------------------------------------------------------
  1 | import requests
  2 | from bs4 import BeautifulSoup
  3 | import re
  4 | import os
  5 | 
  6 | from PyQt5.QtWidgets import QApplication, QWidget, QLineEdit, QInputDialog, QGridLayout, QLabel, QPushButton, QFrame
  7 | 
  8 | class InputDialog(QWidget):
  9 | 
 10 |     def __init__(self):
 11 |         super(InputDialog,self).__init__()
 12 |         self.initUi()
 13 | 
 14 |     def initUi(self):
 15 |         self.setWindowTitle("漫画爬取")
 16 |         self.setGeometry(400,400,300,260)
 17 | 
 18 |         label1=QLabel("第一本:")
 19 |         label2=QLabel("最后一本:")
 20 | 
 21 |         self.nameLable = QLabel("2")#1
 22 |         self.first=int(self.nameLable.text())
 23 |         self.nameLable.setText(str(self.first))
 24 |         self.nameLable.setFrameStyle(QFrame.Panel|QFrame.Sunken)
 25 |         self.styleLable = QLabel("2")#1030
 26 |         self.last=self.styleLable.text()
 27 |         self.styleLable.setText(str(self.last))
 28 |         self.styleLable.setFrameStyle(QFrame.Panel|QFrame.Sunken)
 29 | 
 30 |         nameButton=QPushButton("更改")
 31 |         nameButton.clicked.connect(self.selectName)
 32 |         styleButton=QPushButton("更改")
 33 |         styleButton.clicked.connect(self.selectStyle)
 34 |         okButton = QPushButton("OK")
 35 |         okButton.clicked.connect(self.selectOk)
 36 | 
 37 |         mainLayout=QGridLayout()
 38 |         mainLayout.addWidget(label1,0,0)
 39 |         mainLayout.addWidget(self.nameLable,0,1)
 40 |         mainLayout.addWidget(nameButton,0,2)
 41 |         mainLayout.addWidget(label2,1,0)
 42 |         mainLayout.addWidget(self.styleLable,1,1)
 43 |         mainLayout.addWidget(styleButton,1,2)
 44 |         mainLayout.addWidget(okButton,2,1)
 45 | 
 46 |         self.setLayout(mainLayout)
 47 | 
 48 |         #爬取代码
 49 | 
 50 |     def ManHua(self):
 51 |         #2/3/4/
 52 |         #多线程同时几个py def传值出去++？
 53 |         num=8
 54 |         for num1 in range(num,num+1):
 55 | 
 56 |             #循环过大（大于2都不行） 会直接返回Process finished with exit code -1073740791 (0xC0000409)
 57 |             #716字符问题无法生成文件夹
 58 |             #循环不行 只能采用一次次一本本下载完提醒 按多一次
 59 |             #后来发现不是循环不行 是这个
 60 |             # self.nameLable.setText(num1-1)
 61 |             # self.styleLable.setText(num1)
 62 |             #循环也有点关系
 63 | 
 64 |             import urllib.request  # url包
 65 | 
 66 |             def openUrl(circle):
 67 |                 headers = {
 68 |                     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
 69 |                     'Host': 'jandan.net'
 70 |                 }
 71 |                 req = urllib.request.Request(circle, headers=headers)
 72 |                 response = urllib.request.urlopen(req)  # 请求
 73 |                 html = response.read()  # 获取
 74 |                 html = html.decode("utf-8")  # 解码
 75 |                 print(html)  # 打印
 76 | 
 77 |             if __name__ == "__main__":
 78 |                 circle = requests.get('https://www.manhwa.cc/book/' + str(num1))
 79 | 
 80 |             # circle = requests.get('https://www.manhwa.cc/book/'+str(num1))
 81 |             # 将获取的图片地址依次放入count中
 82 |             count = []
 83 |             # 将获取的网页内容放入BeautifulSoup
 84 |             soup = BeautifulSoup(circle.text, 'lxml')
 85 |             # 根据谷歌SelectGadGet这个插件，获取html标签，比如获取：#gallery-list
 86 | 
 87 |             for item_book in soup.select('.d_bg_t'):
 88 |                 for book_name in item_book.select('a')[0]:
 89 |                     book_name_clean = book_name.string
 90 |                     print(num1, book_name_clean)
 91 | 
 92 |             for item in soup.select('.d_menu>ul>li'):
 93 |                 # 用bs4中的find_all获取 #gallery-list 中是否存在 img这个标签
 94 |                 for a in item.find_all('a'):
 95 |                     # print('a', a)
 96 |                     # m 是 img标签中存在的属性
 97 |                     menu_path = 'https://www.manhwa.cc/' + a.get('href')
 98 |                     # count.append(menu_path)
 99 |                     # menu_path_num.append(re.findall(r"\d+\.?\d*", menu_path))
100 |                     menu_path_num = re.findall(r"\d+\.?\d*", menu_path)
101 | 
102 |                     # 当前一部书爬取循环，从上面得到每一章地址后，遍历这么多“章”次
103 | 
104 |                     # for num in menu_path_num:
105 |                     print('book_url:', menu_path)
106 | 
107 |                     circle = requests.get(menu_path)
108 |                     # 将获取的图片地址依次放入count中
109 |                     count = []
110 |                     # 将获取的网页内容放入BeautifulSoup
111 |                     soup = BeautifulSoup(circle.text, 'lxml')
112 | 
113 |                     for title in soup.select('div.fl.r_tab_l'):
114 |                         for title in title.find_all('span'):
115 |                             print('title:', title.text)
116 |                             title = title.text
117 | 
118 |                     for item in soup.select('.r_img'):
119 |                         # 用bs4中的find_all获取 #gallery-list 中是否存在 img这个标签
120 |                         for img in item.find_all('img'):
121 |                             print('img_url:', img)
122 |                             # m 是 img标签中存在的属性
123 |                             img_path = img.get('data-original')
124 |                             count.append(img_path)
125 | 
126 |                     # 用enumerate依次取出count中的图片地址 放入v中
127 |                     os.makedirs('D://manhua//整站爬取www.manhwa.cc//整站漫画爬取//' + book_name_clean + '//' + str(title) + '//')
128 |                     for i, v in enumerate(count):
129 |                         # 将获取的v值再次放入request中进行与网站相应
130 |                         image = requests.get(v)
131 |                         # 存取图片过程中，出现不能存储 int 类型，故而，我们对他进行类型转换 str()。w:读写方式打开，b：二进制进行读写。图片一般用到的都是二进制。
132 |                         with open('D://manhua//整站爬取www.manhwa.cc//整站漫画爬取//' + book_name_clean + '//' + str(
133 |                                 title) + '//' + str(
134 |                                 i) + '.jpg', 'wb') as file:
135 |                             # with open('C://Users//50159//Desktop//manhua//test//' + str(num1) + '_' + str(i) + '.jpg', 'wb') as file:
136 |                             # content：图片转换成二进制，进行保存。
137 |                             file.write(image.content)
138 |                         print(i)
139 | 
140 | 
141 | 
142 |             #爬取代码
143 | 
144 |     def selectName(self):
145 |         name,ok = QInputDialog.getText(self,"第一本","第一本序号:",
146 |                                        QLineEdit.Normal,self.nameLable.text())
147 |         if ok and (len(name)!=0):
148 |             self.nameLable.setText(name)
149 |     def selectStyle(self):
150 |         style, ok = QInputDialog.getText(self, "最后一本", "最后一本序号:",
151 |                                         QLineEdit.Normal, self.nameLable.text())
152 |         if ok and (len(style)!=0):
153 |             self.styleLable.setText(style)
154 |     def selectOk(self):
155 |         self.ManHua()
156 |         #self.first=int(self.nameLable.text())
157 |         #self.last=self.styleLable.text()
158 |         #print(self.first, self.last)
159 |         #os.system(r"python D:\manhua\整站爬取www.manhwa.cc\manhua3.py")
160 | 
161 | 
if __name__ == "__main__":
    import sys

    app = QApplication(sys.argv)
    myshow = InputDialog()
    myshow.show()
    # Keep the event loop's exit code and open the notification sound before
    # exiting: sys.exit() raises SystemExit, so a statement placed after it
    # never runs.
    exit_code = app.exec_()
    os.system(r"F:\CloudMusic\是萝莉控真是太好了.mp3")
    sys.exit(exit_code)
169 | 
170 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # 第一代版本：
  2 | 
  3 | <hr>
  4 | <br>
  5 | 
  6 | # 目的
  7 | 
  8 | <br>
  9 | 
 10 | 1.目标网站：https://www.这里是网址.cc/
 11 | 
 12 | ![1565345268608](https://littlebuzi.github.io/post-images/1565345268608.png)
 13 | 
 14 | 2.目标结果：获取全部漫画图片文件，并分好文件夹
 15 | 
 16 | 
 17 | <hr>
 18 | <br>
 19 | 
 20 | # 实现过程
 21 | 
 22 | 
 23 | <br>
 24 | 
 25 | 
 26 | ## 基本逻辑
 27 | 
 28 | ![1565360279264](https://littlebuzi.github.io/post-images/1565360279264.png)
 29 | 
 30 | ## 代码实现
 31 | 
 32 | <br>
 33 | 
 34 | ```
 35 | 
 36 | import requests
 37 | from bs4 import BeautifulSoup
 38 | import re
 39 | import os
 40 | 
 41 | #1-1030
 42 | for num1 in range(1,1031):
 43 |     circle = requests.get('https://这里是网址/book/'+str(num1))
 44 |     # 将获取的图片地址依次放入count中
 45 |     count = []
 46 |     # 将获取的网页内容放入BeautifulSoup
 47 |     soup = BeautifulSoup(circle.text, 'lxml')
 48 |     # 根据谷歌SelectGadGet这个插件，获取html标签，比如获取：#gallery-list
 49 | 
 50 | for item_book in soup.select('.d_bg_t'):
 51 |     for book_name in item_book.find_all('a'):
 52 |         if(book_name.string!='韩国'and book_name.string!='男性'):
 53 |             book_name_clean=book_name.string
 54 |             print(num1, book_name_clean)
 55 | 
 56 | os.makedirs('D://manhua//整站漫画爬取//' + str(num1) +'.'+ book_name_clean )
 57 | 
 58 | #menu_path_num = []
 59 | 
 60 | for item in soup.select('.d_menu>ul>li'):
 61 |     # 用bs4中的find_all获取 #gallery-list 中是否存在 img这个标签
 62 |     for a in item.find_all('a'):
 63 |         #print('a', a)
 64 |         # m 是 img标签中存在的属性
 65 |         menu_path = 'https://www.manhwa.cc/' + a.get('href')
 66 |         #count.append(menu_path)
 67 |         #menu_path_num.append(re.findall(r"\d+\.?\d*", menu_path))
 68 |         menu_path_num=re.findall(r"\d+\.?\d*", menu_path)
 69 | 
 70 |         #当前一部书爬取循环，从上面得到每一章地址后，遍历这么多“章”次
 71 | 
 72 |         #for num in menu_path_num:
 73 |         print('book_url:',menu_path)
 74 |         circle = requests.get(menu_path)
 75 |         # 将获取的图片地址依次放入count中
 76 |         count = []
 77 |         # 将获取的网页内容放入BeautifulSoup
 78 |         soup = BeautifulSoup(circle.text, 'lxml')
 79 |         # 根据谷歌SelectGadGet这个插件，获取html标签，比如获取：#gallery-list
 80 | 				
 81 |         for title in soup.select('div.fl.r_tab_l'):
 82 |             for title in title.find_all('span'):
 83 |                 print('title:', title.text)
 84 |                 title=title.text
 85 | 
 86 |         for item in soup.select('.r_img'):
 87 |             # 用bs4中的find_all获取 #gallery-list 中是否存在 img这个标签
 88 |             for img in item.find_all('img'):
 89 |                 print('img_url:', img)
 90 |                 # m 是 img标签中存在的属性
 91 |                 img_path = img.get('data-original')
 92 |                 count.append(img_path)
 93 |         # 用enumerate依次取出count中的图片地址 放入v中
 94 |         os.makedirs('D://manhua//整站漫画爬取//' +  book_name_clean + '//' + str(title) + '//')
 95 |         for i, v in enumerate(count):
 96 |             # 将获取的v值再次放入request中进行与网站相应
 97 |             image = requests.get(v)
 98 |             # 存取图片过程中，出现不能存储 int 类型，故而，我们对他进行类型转换 str()。w:读写方式打开，b：二进制进行读写。图片一般用到的都是二进制。
 99 |             with open('D://manhua//整站漫画爬取//' + book_name_clean + '//'+ str(title) + '//' +str(i) + '.jpg', 'wb') as file:
100 |             #with open('C://Users//50159//Desktop//manhua//test//' + str(num1) + '_' + str(i) + '.jpg', 'wb') as file:
101 |                 # content：图片转换成二进制，进行保存。
102 |                 file.write(image.content)
103 |             print(i)
104 | ```
105 | 
106 | 到这里基本工作已完成，进入测试阶段后出现了以下问题。
107 | 
108 | <hr>
109 | <br>
110 | 
111 | # 测试问题
112 | 
113 | 
114 | 1.从第250本左右开始，爬取到的书名出现异常，混入了其他文字。原因是最初只参考了最前面几本书的页面，通过排除“韩国”“男性”等标签字样来筛选书名，而全部1030本书中后面的标签发生了变动。因此改为只取第一个出现的标签，代替原来的筛选方式。
115 | 
116 | 2.文件夹的命名和创建出错：整理代码时出现混乱，创建文件夹的代码写重复了，之后已修改。
117 | 
118 | 3.爬取会中途停止，可能是网站识别出了这是爬虫。添加伪装浏览器的请求头（headers）之后仍然会停止，报错基本都是连接（connect）超时。
119 | 
120 | 针对上面问题，修改成了2.0版本:
121 | 
122 | ```
123 | 
124 | import requests
125 | from bs4 import BeautifulSoup
126 | import re
127 | import os
128 | 
129 | #1-1030
130 | for num1 in range(2,1031):
131 |     #716字符问题无法生成文件夹
132 | import urllib.request  # url包
133 | 
134 | def openUrl(circle):
135 |     headers = {
136 |         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
137 |         'Host': 'jandan.net'
138 |     }
139 |     req = urllib.request.Request(circle, headers=headers)
140 |     response = urllib.request.urlopen(req)  # 请求
141 |     html = response.read()  # 获取
142 |     html = html.decode("utf-8")  # 解码
143 |     print(html)  # 打印
144 | 
145 | if __name__ == "__main__":
146 |     circle = requests.get('https://这里是网址/book/' + str(num1))
147 | 
148 | # 将获取的图片地址依次放入count中
149 | count = []
150 | # 将获取的网页内容放入BeautifulSoup
151 | soup = BeautifulSoup(circle.text, 'lxml')
152 | # 根据谷歌SelectGadGet这个插件，获取html标签，比如获取：#gallery-list
153 | 
154 | for item_book in soup.select('.d_bg_t'):
155 |     for book_name in item_book.select('a')[0]:
156 |         book_name_clean = book_name.string
157 |         print(num1, book_name_clean)
158 | 
159 | #os.makedirs('D://manhua//整站漫画爬取//' + str(num1) +'.'+ book_name_clean )
160 | 
161 | for item_book in soup.select('.d_bg_t'):
162 |     for book_name in item_book.find_all('a'):
163 |         if(book_name.string!='韩国'and book_name.string!='男性'):
164 |             book_name_clean=book_name.string
165 |             print(num1, book_name_clean)
166 | 
167 | #menu_path_num = []
168 | 
169 | for item in soup.select('.d_menu>ul>li'):
170 |     # 用bs4中的find_all获取 #gallery-list 中是否存在 img这个标签
171 |     for a in item.find_all('a'):
172 |         #print('a', a)
173 |         # m 是 img标签中存在的属性
174 |         menu_path = 'https://www.manhwa.cc/' + a.get('href')
175 |         #count.append(menu_path)
176 |         #menu_path_num.append(re.findall(r"\d+\.?\d*", menu_path))
177 |         menu_path_num=re.findall(r"\d+\.?\d*", menu_path)
178 | 
179 |         #当前一部书爬取循环，从上面得到每一章地址后，遍历这么多“章”次
180 | 
181 |         #for num in menu_path_num:
182 |         print('book_url:',menu_path)
183 |         
184 |         circle = requests.get(menu_path)
185 |         # 将获取的图片地址依次放入count中
186 |         count = []
187 |         # 将获取的网页内容放入BeautifulSoup
188 |         soup = BeautifulSoup(circle.text, 'lxml')
189 | 
190 |         for title in soup.select('div.fl.r_tab_l'):
191 |             for title in title.find_all('span'):
192 |                 print('title:', title.text)
193 |                 title=title.text
194 | 
195 |         for item in soup.select('.r_img'):
196 |             # 用bs4中的find_all获取 #gallery-list 中是否存在 img这个标签
197 |             for img in item.find_all('img'):
198 |                 print('img_url:', img)
199 |                 # m 是 img标签中存在的属性
200 |                 img_path = img.get('data-original')
201 |                 count.append(img_path)
202 |                 
203 |         # 用enumerate依次取出count中的图片地址 放入v中
204 |         os.makedirs('D://manhua//整站漫画爬取//' +  book_name_clean + '//' + str(title) + '//')
205 |         for i, v in enumerate(count):
206 |             # 将获取的v值再次放入request中进行与网站相应
207 |             image = requests.get(v)
208 |             # 存取图片过程中，出现不能存储 int 类型，故而，我们对他进行类型转换 str()。w:读写方式打开，b：二进制进行读写。图片一般用到的都是二进制。
209 |             with open('D://manhua//整站漫画爬取//' + book_name_clean + '//'+ str(title) + '//' +str(i) + '.jpg', 'wb') as file:
210 |             #with open('C://Users//50159//Desktop//manhua//test//' + str(num1) + '_' + str(i) + '.jpg', 'wb') as file:
211 |                 # content：图片转换成二进制，进行保存。
212 |                 file.write(image.content)
213 |             print(i)
214 | 						
215 | ```
216 | 
217 | <hr>
218 | <br>
219 | 
220 | # 爬取过程：
221 | 
222 | <hr>
223 | <br>
224 | 
225 | ![1565361188393](https://littlebuzi.github.io/post-images/1565361188393.png)
226 | 
227 | ![1565361235369](https://littlebuzi.github.io/post-images/1565361235369.png)
228 | 
229 | ![1565361244280](https://littlebuzi.github.io/post-images/1565361244280.png)
230 | 
231 | 基本可行，最高纪录是连续爬取四本后停止。
232 | 
233 | 真的太多了，一本平均大小在150MB左右。
234 | 
235 | <hr>
236 | <br>
237 | 
238 | # 总结：
239 | 
240 | <hr>
241 | <br>
242 | 
243 | 爬取整本漫画 ✅
244 | 
245 | 整站漫画半自动化爬取（停止需手动启动一次）✅
246 | 
247 | 全自动下载网站漫画 （会被网站截停）❌
248 | 
249 | <br>
250 | 
251 | <center>
252 | 	
253 | # 2.0优化版
254 | 
255 | </center>
256 | 
257 | <br>
258 | 
259 | ------
260 | 
261 | <br>
262 | 
263 | # 特点
264 | 
265 | <br>
266 | 
267 | ui界面添加✅
268 | 
269 | 网站截停后播放音乐提醒，接近半自动重启 ✅
270 | 
271 | 各个爬取细节优化，程序自身接近无报错 ✅
272 | 
273 | cmd输出界面优化✅
274 | 
275 | 计时器检测添加中（待）
276 | 
277 | 全自动重启（待）
278 | 
279 | <br>
280 | 
281 | ------
282 | 
283 | <br>
284 | 
285 | # 逻辑
286 | 
287 | <br>
288 | 
289 | ![1565871675669](https://littlebuzi.github.io/post-images/1565871675669.png)
290 | 
291 | <br>
292 | 
293 | ------
294 | 
295 | <br>
296 | 
297 | # 代码
298 | 
299 | <br>
300 | 
301 | ## 启动代码：
302 | 
303 | ```
304 | 
305 | import os
306 | 
307 | os.system(r"python D:\manhua\这里是网址\manhua4.py")
308 | 
309 | os.system(r"F:\CloudMusic\是萝莉控真是太好了.mp3")
310 | 
311 | ```
312 | 
313 | ## 爬取代码：
314 | 
315 | ```
316 | 
317 | import requests
318 | from bs4 import BeautifulSoup
319 | import re
320 | import os
321 | 
322 | from PyQt5.QtWidgets import QApplication, QWidget, QLineEdit, QInputDialog, QGridLayout, QLabel, QPushButton, QFrame, QProgressBar
323 | 
324 | first=1
325 | 
326 | class InputDialog(QWidget):
327 | 
328 |     def __init__(self):
329 |         super(InputDialog,self).__init__()
330 |         self.initUi()
331 | 
332 |     def initUi(self):
333 |         self.setWindowTitle("漫画爬取")
334 |         self.setGeometry(50,50,1200,600)
335 | 
336 |         label1=QLabel("第一本:")
337 |         label2=QLabel("最后一本:")
338 | 
339 |         self.nameLable = QLabel("1")#1
340 |         self.first=int(self.nameLable.text())
341 |         self.nameLable.setText(str(self.first))
342 |         self.nameLable.setFrameStyle(QFrame.Panel|QFrame.Sunken)
343 |         self.styleLable = QLabel("1")#1030
344 |         self.last=self.styleLable.text()
345 |         self.styleLable.setText(str(self.last))
346 |         self.styleLable.setFrameStyle(QFrame.Panel|QFrame.Sunken)
347 | 
348 |         # 设置进度条(弃用)
349 | 
350 |         nameButton=QPushButton("更改")
351 |         nameButton.clicked.connect(self.selectName)
352 |         styleButton=QPushButton("更改")
353 |         styleButton.clicked.connect(self.selectStyle)
354 |         okButton = QPushButton("OK")
355 |         okButton.clicked.connect(self.selectOk)
356 | 
357 |         mainLayout=QGridLayout()
358 |         mainLayout.addWidget(label1,0,0)
359 |         mainLayout.addWidget(self.nameLable,0,1)
360 |         mainLayout.addWidget(nameButton,0,2)
361 |         mainLayout.addWidget(label2,1,0)
362 |         mainLayout.addWidget(self.styleLable,1,1)
363 |         mainLayout.addWidget(styleButton,1,2)
364 |         mainLayout.addWidget(okButton,2,1)
365 | 
366 |         self.setLayout(mainLayout)
367 | 
368 |     #爬取代码
369 | 		
370 |     def ManHua(self):
371 | 
372 |         for num1 in range(first,1030):
373 |             import urllib.request  # url包
374 | 
375 |             def openUrl(circle):
376 |                 headers = {
377 |                     'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
378 |                     'Host': 'jandan.net'
379 |                 }
380 |                 req = urllib.request.Request(circle, headers=headers)
381 |                 response = urllib.request.urlopen(req)  # 请求
382 |                 html = response.read()  # 获取
383 |                 html = html.decode("utf-8")  # 解码
384 |                 print(html)  # 打印
385 | 
386 |             if __name__ == "__main__":
387 |                 circle = requests.get('https://这里是网址/book/' + str(num1))
388 | 
389 |             count = []
390 |             soup = BeautifulSoup(circle.text, 'lxml')
391 | 
392 |             for item_book in soup.select('.d_bg_t'):
393 |                 for book_name in item_book.select('a')[0]:
394 |                     book_name_clean = book_name.string
395 |                     print('')
396 |                     print("正在下载：",num1, book_name_clean)
397 |                     aa=0
398 |                     #print(aa,num1)
399 |                     if num1>aa:
400 |                         aa=num1
401 |                         #print(aa)
402 |                         for i in range(int(num1*(100/1030))+1):
403 |                             print('\r'+'总进度：' + '▇' * (i // 2) + str(i) + '%', end='')
404 |                             print('')
405 | 
406 |             for item in soup.select('.d_menu>ul>li'):
407 |                 for a in item.find_all('a'):
408 |                     menu_path = 'https://这里是网址/' + a.get('href')
409 |                     # count.append(menu_path)
410 |                     # menu_path_num.append(re.findall(r"\d+\.?\d*", menu_path))
411 |                     menu_path_num = re.findall(r"\d+\.?\d*", menu_path)
412 | 
413 |                     # 当前一部书爬取循环，从上面得到每一章地址后，遍历这么多“章”次
414 | 
415 |                     # for num in menu_path_num:
416 |                     #print('book_url:', menu_path)
417 | 
418 | 
419 |                     circle = requests.get(menu_path)
420 |                     count = []
421 |                     soup = BeautifulSoup(circle.text, 'lxml')
422 |                     #print(menu_path)
423 |                     print('.', end='')
424 | 
425 |                     for title in soup.select('div.fl.r_tab_l'):
426 |                         for title in title.find_all('span'):
427 |                             #print('title:', title.text)
428 |                             title = title.text
429 | 
430 |                     for item in soup.select('.r_img'):
431 |                         for img in item.find_all('img'):
432 |                             #print('img_url:', img)
433 |                             img_path = img.get('data-original')
434 |                             count.append(img_path)
435 | 
436 |                     #自动识别'文件夹+文件'重复后跳过下载如何continue
437 |                     if(os.path.exists('D:/manhua/manhuatest/' + book_name_clean + '/' + str(title) + '/')):
438 |                         continue
439 |                     else:
440 |                         os.makedirs('D:/manhua/manhuatest/' + book_name_clean + '/' + str(title) + '/')
441 | 
442 |                         for i, v in enumerate(count):
443 |                             image = requests.get(v)
444 |                             if (os.path.exists('D:/manhua/manhuatest/' + book_name_clean + '/' + str(title) + '/' + str(i) + '.jpg')):
445 |                                 continue
446 |                             else:
447 |                                 with open('D:/manhua/manhuatest/' + book_name_clean + '/' + str(title) + '/' + str(i) + '.jpg', 'wb') as file:
448 |                                     file.write(image.content)
449 |                                 #print(i)
450 |                                 continue
451 |                         continue
452 | 
453 | 
454 |     def selectName(self):
455 |         name,ok = QInputDialog.getText(self,"第一本","第一本序号:",
456 |                                        QLineEdit.Normal,self.nameLable.text())
457 |         if ok and (len(name)!=0):
458 |             self.nameLable.setText(name)
459 |     def selectStyle(self):
460 |         style, ok = QInputDialog.getText(self, "最后一本", "最后一本序号:",
461 |                                         QLineEdit.Normal, self.nameLable.text())
462 |         if ok and (len(style)!=0):
463 |             self.styleLable.setText(style)
464 |     def selectOk(self):
465 |         self.ManHua()
466 | 
467 | if __name__=="__main__":
468 |     import sys
469 |     app=QApplication(sys.argv)
470 |     myshow=InputDialog()
471 |     myshow.show()
472 |     sys.exit(app.exec_())
473 | 
474 | 
475 | ```
476 | 
477 | <br>
478 | 
479 | ------
480 | 
481 | <br>
482 | 
483 | # 过程
484 | 
485 | <br>
486 | 
487 | ![1565763309680](https://littlebuzi.github.io/post-images/1565763309680.png)
488 | 
489 | ![1565763331643](https://littlebuzi.github.io/post-images/1565763331643.png)
490 | 
491 | ![1565763348226](https://littlebuzi.github.io/post-images/1565763348226.png)
492 | 
493 | <br>
494 | 
495 | ------
496 | 
497 | <br>
498 | 
499 | # 总结
500 | 
501 | <br>
502 | 
503 | 整站漫画全自动化爬取✅
504 | 
505 | 不能自动重启❌
506 | 
507 | 基本实现了95%的功能，项目可宣布成功完成！✅
508 | 
509 | <br>
510 | 


--------------------------------------------------------------------------------