├── CsdnBlogSpider.py ├── README.md └── SpiderGui.py /CsdnBlogSpider.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | __author__ = 'Sun' 3 | 4 | import re 5 | import urllib.request 6 | import urllib 7 | import queue 8 | import threading 9 | import os 10 | 11 | 12 | queue = queue.Queue() 13 | visited = set() 14 | cnt = 0 15 | class CsdnBlogSpider(threading.Thread): 16 | 17 | def __init__(self, queue, opener, blog_name): 18 | threading.Thread.__init__(self) 19 | self.queue = queue 20 | self.opener = opener 21 | self.blog_name = blog_name 22 | self.lock = threading.Lock() 23 | 24 | def save_data(self, data, filename): 25 | if not os.path.exists('blog'): 26 | blog_path = os.path.join(os.path.abspath('.'),'blog') 27 | os.mkdir(blog_path) 28 | try: 29 | fout = open('./blog/' + filename + '.html', 'wb') 30 | fout.write(data) 31 | except IOError as e: 32 | print(e) 33 | # finally: 34 | # fout.close() 35 | 36 | def find_title(self,data): 37 | data = data.decode('utf-8') 38 | begin = data.find(r'' + url) 51 | self.lock.release() 52 | try: 53 | res = self.opener.open(url, timeout=1000) 54 | except Exception as e: 55 | if hasattr(e, 'reason'): 56 | print('reason:', e.reason) 57 | elif hasattr(e, 'code'): 58 | print('error code:', e.code) 59 | cnt -= 1 60 | self.queue.task_done() 61 | continue 62 | else: 63 | data = res.read() 64 | title = self.find_title(data) 65 | self.save_data(data,title) 66 | 67 | data = data.decode('utf-8') 68 | blog_urls = re.compile('/' + self.blog_name + '/article/details/' + '\d*') 69 | for url in blog_urls.findall(data): 70 | url = 'http://blog.csdn.net' + url 71 | if url not in visited: 72 | self.queue.put(url) 73 | visited |= {url} 74 | # print('加入队列---》' + url) 75 | self.queue.task_done() 76 | 77 | def init(name, number=10): 78 | global cnt 79 | global visited 80 | # blog_name = input('输入博客名称:') 81 | # thread_num = input('输入启动线程数:') 82 | blog_name = name.lower() 83 | th_num = int(number) 84 | url = 'http://blog.csdn.net/' + blog_name + '/' 85 | opener = urllib.request.build_opener(urllib.request.HTTPHandler) 86 | headers = [ 87 | ('User-Agent', 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko') 88 | ] 89 | urllib.request.install_opener(opener) 90 | opener.addheaders = headers 91 | 92 | queue.put(url) 93 | visited |= {url} 94 | cnt = 0 95 | 96 | for i in range(th_num): 97 | t = CsdnBlogSpider(queue,opener,blog_name) 98 | t.setDaemon(True) 99 | t.start() 100 | queue.join() 101 | print('--------end!!!-----') 102 | print('共抓取:' + str(cnt)) 103 | 104 | if __name__ == '__main__': 105 | init() 106 | 107 | 108 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 基于Python的Csdn博客爬虫(多线程) 2 | ===== 3 | ### 说明 4 | 爬取Csdn某个博主的所有博文,并下载到本地(暂时只支持保存html格式),加入了多线程,爬取更快速。 5 | SpiderGui----------------->爬虫gui界面 6 | CsdnBlogSpider------------>爬虫主程序 7 | ### 环境 8 | Python3.4 9 | ### 运行 10 | * 确保安装Python解释器。 11 | * SpiderGui.py 12 | * 提示输入要爬取的博客名和开启线程数,根据自己需求配置 13 | * 开始爬取,存储在当前目录的blog文件夹下 14 | ### 存在问题 15 | 本爬虫还不够完善,欢迎大家指导 16 | -------------------------------------------------------------------------------- /SpiderGui.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | __author__ = 'Sun' 3 | 4 | import CsdnBlogSpider 5 | from tkinter import * 6 | import tkinter as tk 7 | import tkinter.messagebox as messagebox 8 | import tkinter.font 9 | import threading 10 | import queue 11 | 12 | gui_que = queue.Queue() 13 | 14 | class Application(threading.Thread): 15 | def __init__(self, root): 16 | '''Init frame 17 | ''' 18 | threading.Thread.__init__(self) 19 | self.progress = '' 20 | self.root = root 21 | self.createFrame() 22 | self.createFrameTop() 23 | 24 | def createFrameTop(self): 25 | self.frm_top_label = tk.Label(self.root, text='Csdn_Blog_Download_Tool', font=('Courier New', 15, tk.font.BOLD)) 26 | self.frm_top_label.grid(row=0, column=0, padx=10, pady=10) 27 | 28 | def createFrame(self): 29 | ''' Create Frame 30 | ''' 31 | self.frm = tk.LabelFrame(self.root) 32 | self.frm.grid(row=1, column=0, padx=8, pady=20) 33 | 34 | self.frm_label_name = tk.Label(self.frm, text='BlogName:', font=('Courier New', 11)) 35 | self.frm_label_name.grid(row=0, column=0, padx=5, pady=10) 36 | 37 | self.frm_entry_name = tk.Entry(self.frm) 38 | self.frm_entry_name.grid(row=0, column=1, padx=5, pady=10) 39 | 40 | self.frm_label_num = tk.Label(self.frm, text='ThreadNum:', font=('Courier New', 11)) 41 | self.frm_label_num.grid(row=1, column=0, padx=5, pady=10) 42 | 43 | default_value = StringVar() 44 | default_value.set('10') 45 | self.frm_entry_num = tk.Entry(self.frm, textvariable=default_value) 46 | self.frm_entry_num.grid(row=1, column=1, padx=5, pady=10) 47 | 48 | self.frm_button_cancel = tk.Button(self.frm, text=' Cancel ', command=self.root.quit) 49 | self.frm_button_cancel.grid(row=2, column=0, padx=25, pady=10) 50 | 51 | self.frm_button_download = tk.Button(self.frm, text='Download', command=self.download) 52 | self.frm_button_download.grid(row=2, column=1, padx=5, pady=10) 53 | 54 | def createFrameBottom(self): 55 | self.frm_bottom_label = tk.Label(self.root, text=self.progress) 56 | self.frm_bottom_label.grid(row=2, column=0) 57 | 58 | def download(self): 59 | self.name = self.frm_entry_name.get() 60 | self.num = self.frm_entry_num.get() 61 | self.createFrameBottom() 62 | self.progress = 'Downloading, ' 63 | if self.name == '': 64 | messagebox.showwarning('Warning', 'Blog name can not be empty') 65 | elif not self.num.isdigit(): 66 | messagebox.showwarning('Warning', 'Thread num is invalid') 67 | elif int(self.num) == 0: 68 | messagebox.showwarning('Warning', 'Thread num can not be 0') 69 | else: 70 | gui_que.put(self.name) 71 | self.progress += 'please wait...' 72 | self.frm_bottom_label.config(text=self.progress) 73 | def run(self): 74 | while True: 75 | name = gui_que.get() 76 | CsdnBlogSpider.init(name, int(self.num)) 77 | tasks = CsdnBlogSpider.queue.unfinished_tasks 78 | if tasks == 0: 79 | self.progress += "done!!!" 80 | self.frm_bottom_label.config(text=self.progress) 81 | if CsdnBlogSpider.cnt == 0: 82 | messagebox.showerror('Error', 'Can not download!!Please check name or internet is correct!!') 83 | else: 84 | messagebox.showinfo('Download Success', 85 | 'Download ' + str(CsdnBlogSpider.cnt) + ' blogs' + ',saved in ./blog directory!') 86 | gui_que.task_done() 87 | ''' 88 | make window center 89 | ''' 90 | 91 | 92 | def center_window(w=300, h=220): 93 | # get screen width and height 94 | ws = root.winfo_screenwidth() 95 | hs = root.winfo_screenheight() 96 | # calculate position x, y 97 | x = (ws / 2) - (w / 2) 98 | y = (hs / 2) - (h / 2) 99 | root.geometry('%dx%d+%d+%d' % (w, h, x, y)) 100 | 101 | 102 | if __name__ == '__main__': 103 | root = tk.Tk() 104 | root.title('Csdn_Blog_Download_Tool') 105 | center_window() 106 | t = Application(root) 107 | t.setDaemon(True) 108 | t.start() 109 | root.mainloop() 110 | --------------------------------------------------------------------------------