├── CsdnBlogSpider.py
├── README.md
└── SpiderGui.py
/CsdnBlogSpider.py:
--------------------------------------------------------------------------------
1 | # encoding:utf-8
2 | __author__ = 'Sun'
3 |
4 | import re
5 | import urllib.request
6 | import urllib
7 | import queue
8 | import threading
9 | import os
10 |
11 |
# State shared by init() and every worker thread in this module.

# Running page count; init() resets it and prints it when a crawl ends.
cnt = 0
# URLs that have already been put on the work queue.
visited = set()
# Work queue of URLs.  NOTE: this rebinds the name of the imported ``queue``
# module, so the module itself is no longer reachable below this line.
queue = queue.Queue()
15 | class CsdnBlogSpider(threading.Thread):
16 |
17 | def __init__(self, queue, opener, blog_name):
18 | threading.Thread.__init__(self)
19 | self.queue = queue
20 | self.opener = opener
21 | self.blog_name = blog_name
22 | self.lock = threading.Lock()
23 |
def save_data(self, data, filename):
    """Write ``data`` to ``./blog/<filename>.html``.

    data     -- bytes to store (the file is opened in binary mode)
    filename -- file name without the ``.html`` extension

    The ``blog`` directory is created when missing.  I/O errors are
    printed rather than raised.
    """
    try:
        # exist_ok avoids the check-then-create race when several worker
        # threads save their first page at the same moment.
        os.makedirs('blog', exist_ok=True)
        # The with-block closes the handle even if write() fails.
        with open('./blog/' + filename + '.html', 'wb') as fout:
            fout.write(data)
    except IOError as e:
        print(e)
35 |
36 | def find_title(self,data):
37 | data = data.decode('utf-8')
38 | begin = data.find(r'
' + url)
51 | self.lock.release()
52 | try:
53 | res = self.opener.open(url, timeout=1000)
54 | except Exception as e:
55 | if hasattr(e, 'reason'):
56 | print('reason:', e.reason)
57 | elif hasattr(e, 'code'):
58 | print('error code:', e.code)
59 | cnt -= 1
60 | self.queue.task_done()
61 | continue
62 | else:
63 | data = res.read()
64 | title = self.find_title(data)
65 | self.save_data(data,title)
66 |
67 | data = data.decode('utf-8')
68 | blog_urls = re.compile('/' + self.blog_name + '/article/details/' + '\d*')
69 | for url in blog_urls.findall(data):
70 | url = 'http://blog.csdn.net' + url
71 | if url not in visited:
72 | self.queue.put(url)
73 | visited |= {url}
74 | # print('加入队列---》' + url)
75 | self.queue.task_done()
76 |
def init(name, number=10):
    """Seed the crawl with a blog's front page and wait until the queue drains.

    name   -- blog name; lower-cased before it is placed in the URL
    number -- how many worker threads to start (anything ``int()`` accepts)

    Raises ValueError when ``number`` is smaller than 1.
    Resets the module-level ``cnt`` and ``visited`` for the new run and
    prints the final count once every queued URL has been processed.
    """
    global cnt
    blog_name = name.lower()
    th_num = int(number)
    if th_num < 1:
        # With no workers nothing ever calls task_done(), so the
        # queue.join() below would block forever.
        raise ValueError('number of threads must be at least 1')

    url = 'http://blog.csdn.net/' + blog_name + '/'
    opener = urllib.request.build_opener(urllib.request.HTTPHandler)
    opener.addheaders = [
        ('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko'),
    ]
    urllib.request.install_opener(opener)

    # Start every run from a clean slate.  Without clearing ``visited`` a
    # second crawl of the same blog would skip every article as "seen".
    # Cleared in place so code holding a reference to the set still sees it.
    visited.clear()
    visited.add(url)
    cnt = 0
    queue.put(url)

    for _ in range(th_num):
        t = CsdnBlogSpider(queue, opener, blog_name)
        # Daemon workers do not keep the process alive after the crawl.
        # (Thread.setDaemon() is deprecated in favour of this attribute.)
        t.daemon = True
        t.start()
    queue.join()
    print('--------end!!!-----')
    print('共抓取:' + str(cnt))
103 |
if __name__ == '__main__':
    # init() needs a blog name (and optionally a thread count), so prompt
    # for both; an empty thread-count answer falls back to 10.
    init(input('输入博客名称:'), input('输入启动线程数:') or 10)
106 |
107 |
108 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 基于Python的Csdn博客爬虫(多线程)
2 | =====
3 | ### 说明
4 | 爬取Csdn某个博主的所有博文,并下载到本地(暂时只支持保存html格式),加入了多线程,爬取更快速。
5 | * SpiderGui.py:爬虫 GUI 界面
6 | * CsdnBlogSpider.py:爬虫主程序
7 | ### 环境
8 | Python3.4
9 | ### 运行
10 | * 确保安装Python解释器。
11 | * 运行 SpiderGui.py(例如在命令行执行 `python SpiderGui.py`)
12 | * 提示输入要爬取的博客名和开启线程数,根据自己需求配置
13 | * 开始爬取,存储在当前目录的blog文件夹下
14 | ### 存在问题
15 | 本爬虫还不够完善,欢迎大家指导
16 |
--------------------------------------------------------------------------------
/SpiderGui.py:
--------------------------------------------------------------------------------
1 | # encoding:utf-8
2 | __author__ = 'Sun'
3 |
4 | import CsdnBlogSpider
5 | from tkinter import *
6 | import tkinter as tk
7 | import tkinter.messagebox as messagebox
8 | import tkinter.font
9 | import threading
10 | import queue
11 |
# Blog names submitted by Application.download(), consumed by Application.run().
gui_que = queue.Queue()
13 |
class Application(threading.Thread):
    '''Tk front end for the crawler that doubles as its background worker.

    The widgets are built on construction.  Once the thread is started,
    ``run`` waits for blog names on ``gui_que`` and crawls them one by one.
    '''

    def __init__(self, root):
        '''Build the widgets inside ``root``; the thread is not started here.'''
        threading.Thread.__init__(self)
        self.progress = ''
        self.root = root
        # Status label; created on the first accepted download, then reused.
        self.frm_bottom_label = None
        self.createFrame()
        self.createFrameTop()

    def createFrameTop(self):
        '''Place the bold title label in the top row of the window.'''
        self.frm_top_label = tk.Label(self.root, text='Csdn_Blog_Download_Tool',
                                      font=('Courier New', 15, tk.font.BOLD))
        self.frm_top_label.grid(row=0, column=0, padx=10, pady=10)

    def createFrame(self):
        '''Build the form: blog name, thread count, Cancel and Download.'''
        self.frm = tk.LabelFrame(self.root)
        self.frm.grid(row=1, column=0, padx=8, pady=20)

        self.frm_label_name = tk.Label(self.frm, text='BlogName:', font=('Courier New', 11))
        self.frm_label_name.grid(row=0, column=0, padx=5, pady=10)

        self.frm_entry_name = tk.Entry(self.frm)
        self.frm_entry_name.grid(row=0, column=1, padx=5, pady=10)

        self.frm_label_num = tk.Label(self.frm, text='ThreadNum:', font=('Courier New', 11))
        self.frm_label_num.grid(row=1, column=0, padx=5, pady=10)

        # Kept on self so the variable is not garbage-collected while the
        # entry widget is still bound to it.
        self.frm_num_var = tk.StringVar()
        self.frm_num_var.set('10')
        self.frm_entry_num = tk.Entry(self.frm, textvariable=self.frm_num_var)
        self.frm_entry_num.grid(row=1, column=1, padx=5, pady=10)

        self.frm_button_cancel = tk.Button(self.frm, text=' Cancel ', command=self.root.quit)
        self.frm_button_cancel.grid(row=2, column=0, padx=25, pady=10)

        self.frm_button_download = tk.Button(self.frm, text='Download', command=self.download)
        self.frm_button_download.grid(row=2, column=1, padx=5, pady=10)

    def createFrameBottom(self):
        '''Show ``self.progress`` in the status label, creating it only once.'''
        if self.frm_bottom_label is None:
            self.frm_bottom_label = tk.Label(self.root, text=self.progress)
            self.frm_bottom_label.grid(row=2, column=0)
        else:
            # Reuse the widget instead of stacking a new Label per click.
            self.frm_bottom_label.config(text=self.progress)

    def download(self):
        '''Validate the form and, when it is valid, queue the blog name.'''
        name = self.frm_entry_name.get()
        num = self.frm_entry_num.get()
        if name == '':
            messagebox.showwarning('Warning', 'Blog name can not be empty')
            return
        if not num.isdigit():
            messagebox.showwarning('Warning', 'Thread num is invalid')
            return
        if int(num) == 0:
            messagebox.showwarning('Warning', 'Thread num can not be 0')
            return

        # Commit the values only after validation: run() calls
        # int(self.num), so a rejected entry must never be stored there.
        self.name = name
        self.num = num
        self.progress = 'Downloading, please wait...'
        # The label must exist before the job is queued because run()
        # updates it when the crawl finishes.
        self.createFrameBottom()
        gui_que.put(self.name)

    def run(self):
        '''Crawl each queued blog name in turn and report the outcome.'''
        while True:
            name = gui_que.get()
            # NOTE(review): the thread count is whatever the most recent
            # accepted click stored, not necessarily the one queued with
            # this name.
            CsdnBlogSpider.init(name, int(self.num))
            if CsdnBlogSpider.queue.unfinished_tasks == 0:
                self.progress += "done!!!"
                self.frm_bottom_label.config(text=self.progress)
                if CsdnBlogSpider.cnt == 0:
                    messagebox.showerror('Error', 'Can not download!!Please check name or internet is correct!!')
                else:
                    messagebox.showinfo('Download Success',
                                        'Download ' + str(CsdnBlogSpider.cnt) + ' blogs' + ',saved in ./blog directory!')
            gui_que.task_done()
87 | '''
88 | make window center
89 | '''
90 |
91 |
def center_window(w=300, h=220):
    '''Resize the module-level ``root`` window to w x h and centre it on screen.'''
    screen_w = root.winfo_screenwidth()
    screen_h = root.winfo_screenheight()
    # Top-left corner that leaves equal margins on opposite sides.
    left = screen_w / 2 - w / 2
    top = screen_h / 2 - h / 2
    root.geometry('%dx%d+%d+%d' % (w, h, left, top))
100 |
101 |
if __name__ == '__main__':
    root = tk.Tk()
    root.title('Csdn_Blog_Download_Tool')
    center_window()
    # Application is both the widget builder and the background worker; as
    # a daemon thread it does not keep the process alive after the main
    # loop returns.
    t = Application(root)
    t.daemon = True  # attribute form; Thread.setDaemon() is deprecated
    t.start()
    root.mainloop()
110 |
--------------------------------------------------------------------------------