├── .gitignore
├── README.md
├── main.py
└── utils
    ├── __init__.py
    ├── parser.py
    ├── pool.py
    ├── progress.py
    ├── save.py
    └── spider.py

/.gitignore:
--------------------------------------------------------------------------------
*.py[cod]
*.log
*.db

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# python_spider

A web crawler written in Python.

### Features

A website crawler written in Python, supporting the following options:

spider.py -u url -d deep -f logfile -l loglevel(1-5) --testself --thread number --dbfile filepath --key="HTML5"

Options:

-u          start URL of the crawl
-d          crawl depth; the start URL is level 0
--thread    size of the thread pool used to download pages concurrently; optional, default 10
--dbfile    sqlite database file in which the results are stored
--key       keyword a page must contain to be kept; optional, by default all pages are kept
-f          log file name; optional, default spider.log
-l          log verbosity (1-5); the higher the number, the more detail is logged; optional, default 5
--testself  run the program's self-tests; optional

### Requirements

1. Crawl the given site to the given depth and store the content of pages containing the given keyword in a sqlite3 database file.
2. Print progress information to the screen every 10 seconds.
3. Use a thread-pool mechanism to crawl pages concurrently.
4. Comment the code thoroughly and understand every technique the program uses.
5. Implement the thread pool yourself.

### Usage
# start at sina.com.cn, crawl 2 levels (depth 1), use 10 threads, save results to sina.db
python main.py -u http://www.sina.com.cn -d 1 --thread 10 --dbfile sina.db

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import logging
import urllib2
import sqlite3
import time
from utils.parser import get_args
from utils.pool import ThreadPool
from utils.spider import spider


# Test the network connection
def test_network(url):
    """
    Test whether the network is reachable; 200 means the test passed.
    >>> test_network("http://www.baidu.com")
    200
    """

    try:
        response = urllib2.urlopen(url)
    except urllib2.HTTPError as e:
        return e.code
    except Exception as e:
        return str(e)
    else:
        return response.getcode()


# Test the sqlite connection
def test_sqlite(dbfile):
    """
    Test whether a sqlite database file can be created and opened; True means the test passed.
    >>> test_sqlite("test.db")
    True
    """

    try:
        conn = sqlite3.connect(dbfile)
    except Exception as e:
        return str(e)
    else:
        conn.close()
        return True

if __name__ == "__main__":

    # Parse the command-line arguments
    args = get_args()

    if args.testself:
        # Self-test with doctest
        import doctest
        doctest.testmod(verbose=True)
    else:
        start = time.time()

        # Initialise logging: set the log file name and level
        LEVELS = {
            1: logging.CRITICAL,
            2: logging.ERROR,
            3: logging.WARNING,
            4: logging.INFO,
            5: logging.DEBUG
        }
        level = LEVELS[args.loglevel]
        logging.basicConfig(filename=args.logfile, level=level)

        if args.url != '':
            # Initialise the thread pool and start working
            thread_pool = ThreadPool(args.thread, args)
            thread_pool.add_task(spider, args.url, 0)
            thread_pool.start_task()
            thread_pool.wait_all_complete()

            # Print a summary when finished
            progress_info = thread_pool.get_progress_info()
            print "Total tasks:", progress_info['tasks_number']
            print "Downloaded:", progress_info['success']
            print "Failed:", progress_info['failure']
            print "Elapsed time: {0} s".format(time.time() - start)
        else:
            logging.critical("No initial url")
            print "Please specify the start url with the -u option"
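
Once a crawl has finished, the downloaded pages end up in the `data` table created by `utils/save.py`. As a quick usage sketch (assuming the `sina.db` file produced by the README example; the snippet itself is not part of the repository), the results can be inspected like this:

# Hypothetical sketch: inspect the pages stored by a finished crawl.
import sqlite3

conn = sqlite3.connect("sina.db")                       # database written via --dbfile
for url, html in conn.execute("select url, html from data"):
    print url, len(html)                                # one row per saved page
conn.close()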
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dqw/python_spider/a854bdb0a9a2a04095630fa01262be7bc6b43a46/utils/__init__.py

--------------------------------------------------------------------------------
/utils/parser.py:
--------------------------------------------------------------------------------
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import argparse


def get_args():
    # Parse the command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-u', dest="url", default="", help="start URL of the crawl")
    parser.add_argument("-d", type=int, dest="deep", default=0, help="crawl depth; the start URL is level 0")
    parser.add_argument("--thread", type=int, dest="thread", default=10, help="number of threads used by the crawler")
    parser.add_argument('--dbfile', dest="dbfile", default="spider.db", help="sqlite database file in which the results are stored")
    parser.add_argument('-f', dest="logfile", default="spider.log", help="log file name")
    parser.add_argument('-l', dest="loglevel", default=5, type=int, help="log verbosity (1-5); the higher the number, the more detail is logged")
    parser.add_argument('--key', dest="key", default="", help="keyword a page must contain to be kept; when omitted, all pages are kept")
    parser.add_argument('--encoding', dest="encoding", default=None, help="page encoding; detected automatically when omitted")
    parser.add_argument('--testself', action="store_true", dest="testself", default=False, help="run the program's self-tests")
    args = parser.parse_args()
    args.key = args.key.decode("utf-8")

    return args
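
To see what `get_args` produces for the command line from the README, a small hypothetical check (run from the repository root under Python 2; the overridden `sys.argv` is only for illustration) might look like this:

# Hypothetical check of the argument parser.
import sys
from utils.parser import get_args

sys.argv = ['main.py', '-u', 'http://www.sina.com.cn', '-d', '1',
            '--thread', '10', '--dbfile', 'sina.db', '--key', 'HTML5']
args = get_args()
print args.url, args.deep, args.thread, args.dbfile, args.key
# expected output: http://www.sina.com.cn 1 10 sina.db HTML5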
--------------------------------------------------------------------------------
/utils/pool.py:
--------------------------------------------------------------------------------
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import threading
import Queue
import md5
import time
import logging
from utils.progress import PrintProgress
from utils.save import SaveToSqlite


class ThreadPool(object):
    def __init__(self, thread_num, args):

        self.args = args
        self.work_queue = Queue.Queue()
        self.save_queue = Queue.Queue()
        self.threads = []
        self.running = 0
        self.failure = 0
        self.success = 0
        self.tasks = {}
        self.thread_name = threading.current_thread().getName()
        self.__init_thread_pool(thread_num)

    # Initialise the thread pool
    def __init_thread_pool(self, thread_num):
        # Download threads
        for i in range(thread_num):
            self.threads.append(WorkThread(self))
        # Progress-printing thread
        self.threads.append(PrintProgress(self))
        # Saving thread
        self.threads.append(SaveToSqlite(self, self.args.dbfile))

    # Add a download task
    def add_task(self, func, url, deep):
        # Record the task so that each URL is downloaded only once
        url_hash = md5.new(url.encode("utf8")).hexdigest()
        if url_hash not in self.tasks:
            self.tasks[url_hash] = url
            self.work_queue.put((func, url, deep))
            logging.info("{0} add task {1}".format(self.thread_name, url.encode("utf8")))

    # Get a download task
    def get_task(self):
        task = self.work_queue.get(block=False)

        return task

    def task_done(self):
        self.work_queue.task_done()

    # Start all threads
    def start_task(self):
        for item in self.threads:
            item.start()

        logging.debug("Work start")

    def increase_success(self):
        self.success += 1

    def increase_failure(self):
        self.failure += 1

    def increase_running(self):
        self.running += 1

    def decrease_running(self):
        self.running -= 1

    def get_running(self):
        return self.running

    def get_progress_info(self):
        progress_info = {}
        progress_info['work_queue_number'] = self.work_queue.qsize()
        progress_info['tasks_number'] = len(self.tasks)
        progress_info['save_queue_number'] = self.save_queue.qsize()
        progress_info['success'] = self.success
        progress_info['failure'] = self.failure

        return progress_info

    def add_save_task(self, url, html):
        self.save_queue.put((url, html))

    def get_save_task(self):
        save_task = self.save_queue.get(block=False)

        return save_task

    def wait_all_complete(self):
        for item in self.threads:
            if item.isAlive():
                item.join()


class WorkThread(threading.Thread):
    def __init__(self, thread_pool):
        threading.Thread.__init__(self)
        self.thread_pool = thread_pool

    def run(self):
        while True:
            try:
                do, url, deep = self.thread_pool.get_task()
                self.thread_pool.increase_running()

                # Only collect new links while the task is above the maximum depth
                flag_get_new_link = True
                if deep >= self.thread_pool.args.deep:
                    flag_get_new_link = False

                html, new_link = do(url, self.thread_pool.args, flag_get_new_link)

                if html == '':
                    self.thread_pool.increase_failure()
                else:
                    self.thread_pool.increase_success()
                    # Queue the html for saving
                    self.thread_pool.add_save_task(url, html)

                # Add the new tasks
                if new_link:
                    for url in new_link:
                        self.thread_pool.add_task(do, url, deep + 1)

                self.thread_pool.decrease_running()
                self.thread_pool.task_done()
            except Queue.Empty:
                # Exit only when no downloads are running and nothing is queued;
                # otherwise wait briefly instead of busy-looping
                if self.thread_pool.get_running() <= 0 and self.thread_pool.work_queue.empty():
                    break
                time.sleep(0.1)
            except Exception as e:
                print str(e)
                break
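
To illustrate the contract the pool expects from a task, namely a callable `func(url, args, flag_get_new_link)` that returns `(html, new_links)`, here is a hypothetical driver that replaces `spider` with a stub. The stub, the `Namespace` values and the `stub.db`/`stub.log` file names are assumptions, not part of the project; run it from the repository root under Python 2, and expect it to take roughly ten seconds because the progress thread sleeps for 10 s between reports:

# Hypothetical driver that exercises ThreadPool with a stub task instead of utils.spider.spider.
import logging
from argparse import Namespace
from utils.pool import ThreadPool


def stub_task(url, args, flag_get_new_link):
    # Pretend the page was downloaded and report no outgoing links.
    return "<html>stub page for %s</html>" % url, []

logging.basicConfig(filename="stub.log", level=logging.DEBUG)
args = Namespace(deep=0, thread=2, dbfile="stub.db", key=u"", encoding=None)
pool = ThreadPool(args.thread, args)
pool.add_task(stub_task, u"http://example.com/", 0)
pool.start_task()
pool.wait_all_complete()
print pool.get_progress_info()   # expect success == 1 and an empty work queue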
--------------------------------------------------------------------------------
/utils/progress.py:
--------------------------------------------------------------------------------
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import threading
import time
import logging


# Print progress information
class PrintProgress(threading.Thread):
    def __init__(self, thread_pool):
        threading.Thread.__init__(self)
        self.thread_pool = thread_pool

    def run(self):
        while True:
            thread_number = self.thread_pool.get_running()
            progress_info = self.thread_pool.get_progress_info()

            # Stop once nothing is running, waiting to be downloaded or waiting to be saved
            if thread_number <= 0 and progress_info['work_queue_number'] <= 0 \
                    and progress_info['save_queue_number'] <= 0:
                break

            print 'Total tasks:', progress_info['tasks_number']
            print 'Working threads:', thread_number
            print 'To download:', progress_info['work_queue_number']
            print 'To save:', progress_info['save_queue_number']
            print '---------------------------------------'

            logging.debug("Print progress")

            time.sleep(10)

--------------------------------------------------------------------------------
/utils/save.py:
--------------------------------------------------------------------------------
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import threading
import Queue
import sqlite3
import time
import logging


# Save html to sqlite
class SaveToSqlite(threading.Thread):
    def __init__(self, thread_pool, dbfile):
        threading.Thread.__init__(self)
        self.thread_pool = thread_pool
        self.conn = sqlite3.connect(dbfile, check_same_thread=False)
        # Store text as raw byte strings so that Chinese pages can be saved
        self.conn.text_factory = str
        self.cmd = self.conn.cursor()
        self.cmd.execute('''
            create table if not exists data(
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url text,
                html text
            )
        ''')
        self.conn.commit()

    def run(self):
        while True:
            try:
                url, html = self.thread_pool.get_save_task()
                try:
                    self.cmd.execute("insert into data (url, html) values (?,?)", (url, html))
                    self.conn.commit()
                except Exception as e:
                    logging.error("Save error:{0}".format(str(e)))
            except Queue.Empty:
                # Exit only when no downloads are running and nothing is left to download;
                # otherwise wait briefly for more pages to arrive
                progress_info = self.thread_pool.get_progress_info()
                if self.thread_pool.get_running() <= 0 and progress_info['work_queue_number'] <= 0:
                    self.conn.close()
                    break
                time.sleep(0.1)
            except Exception as e:
                print str(e)
                break

--------------------------------------------------------------------------------
/utils/spider.py:
--------------------------------------------------------------------------------
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import threading
import urllib2
import re
import gzip
import chardet
import logging
from StringIO import StringIO
from BeautifulSoup import BeautifulSoup


def spider(url, args, flag_get_new_link):

    thread_name = threading.current_thread().getName()

    # Parse the page and collect its links
    def get_link(html):
        new_link = []

        soup = BeautifulSoup(html)
        for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
            href = link.get('href')
            new_link.append(href)

        return new_link

    # Download the page
    def get_html(url, args, flag_get_new_link):
        html = ''
        new_link = []

        try:
            response = urllib2.urlopen(url, timeout=20)
            if response.info().get('Content-Encoding') == 'gzip':
                buf = StringIO(response.read())
                f = gzip.GzipFile(fileobj=buf)
                html = f.read()
            else:
                html = response.read()
        except urllib2.HTTPError as e:
            # HTTPError is a subclass of URLError, so it has to be caught first
            logging.warning("{0} HTTP error: {1}".format(url.encode("utf8"), e.code))
        except urllib2.URLError as e:
            logging.warning("{0} URL error: {1}".format(url.encode("utf8"), e.reason))
        except Exception as e:
            logging.warning("{0} Unexpected: {1}".format(url.encode("utf8"), str(e)))
        else:
            logging.info("{0} downloaded {1}".format(thread_name, url.encode("utf8")))

            if args.key == "":
                if flag_get_new_link:
                    new_link = get_link(html)
            else:
                # Work out the page encoding, falling back to utf-8 if detection fails
                encoding = args.encoding
                if not encoding:
                    encoding = chardet.detect(html)['encoding'] or "utf-8"

                # Only follow links from pages that match the keyword
                match = re.search(args.key, html.decode(encoding, "ignore"))
                if match:
                    logging.info("{0} {1} match key".format(thread_name, url.encode("utf8")))
                    if flag_get_new_link:
                        new_link = get_link(html)
                else:
                    logging.info("{0} {1} not match key".format(thread_name, url.encode("utf8")))

        return html, new_link

    return get_html(url, args, flag_get_new_link)

--------------------------------------------------------------------------------
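
For reference, a small standalone illustration of the link filter used by `get_link` above (a hypothetical snippet, not part of the repository; it runs under Python 2 with BeautifulSoup 3 installed). The `^http://` pattern keeps only absolute http:// links, so relative and https:// links are not followed:

# Hypothetical illustration of the link extraction used in utils/spider.py.
import re
from BeautifulSoup import BeautifulSoup

html = '<a href="http://example.com/a">A</a> <a href="/b">B</a> <a href="https://example.com/c">C</a>'
soup = BeautifulSoup(html)
links = [a.get('href') for a in soup.findAll('a', attrs={'href': re.compile("^http://")})]
print links   # ['http://example.com/a']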