├── .gitignore
├── README.md
├── main.py
└── utils
    ├── __init__.py
    ├── parser.py
    ├── pool.py
    ├── progress.py
    ├── save.py
    └── spider.py

/.gitignore:
--------------------------------------------------------------------------------
*.py[cod]
*.log
*.db

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# python_spider

A web crawler written in Python.

### Features

A website crawler written in Python, supporting the following options:

spider.py -u url -d deep -f logfile -l loglevel(1-5) --testself --thread number --dbfile filepath --key="HTML5"

Options:

-u          start URL of the crawl
-d          crawl depth; the start URL is level 0
--thread    size of the thread pool used to download pages concurrently; optional, default 10
--dbfile    sqlite database file in which the results are stored
--key       keyword a page must contain to be kept; optional, by default all pages are kept
-f          log file name; optional, default spider.log
-l          log verbosity (1-5); the higher the number, the more detail is logged; optional, default 5
--testself  run the program's self-tests; optional

### Requirements

1. Crawl the given site to the given depth and store the content of pages containing the given keyword in a sqlite3 database file.
2. Print progress information to the screen every 10 seconds.
3. Use a thread-pool mechanism to crawl pages concurrently.
4. Comment the code thoroughly and understand every technique the program uses.
5. Implement the thread pool yourself.

### Usage
# start at sina.com.cn, crawl 2 levels (depth 1), use 10 threads, save results to sina.db
python main.py -u http://www.sina.com.cn -d 1 --thread 10 --dbfile sina.db

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import logging
import urllib2
import sqlite3
import time
from utils.parser import get_args
from utils.pool import ThreadPool
from utils.spider import spider


# Test the network connection
def test_network(url):
    """
    Test whether the network is reachable; 200 means the test passed.
    >>> test_network("http://www.baidu.com")
    200
    """

    try:
        response = urllib2.urlopen(url)
    except urllib2.HTTPError as e:
        return e.code
    except Exception as e:
        return str(e)
    else:
        return response.getcode()


# Test the sqlite connection
def test_sqlite(dbfile):
    """
    Test whether a sqlite database file can be created and opened; True means the test passed.
    >>> test_sqlite("test.db")
    True
    """

    try:
        conn = sqlite3.connect(dbfile)
    except Exception as e:
        return str(e)
    else:
        conn.close()
        return True

if __name__ == "__main__":

    # Parse the command-line arguments
    args = get_args()

    if args.testself:
        # Self-test with doctest
        import doctest
        doctest.testmod(verbose=True)
    else:
        start = time.time()

        # Initialise logging: set the log file name and level
        LEVELS = {
            1: logging.CRITICAL,
            2: logging.ERROR,
            3: logging.WARNING,
            4: logging.INFO,
            5: logging.DEBUG
        }
        level = LEVELS[args.loglevel]
        logging.basicConfig(filename=args.logfile, level=level)

        if args.url != '':
            # Initialise the thread pool and start working
            thread_pool = ThreadPool(args.thread, args)
            thread_pool.add_task(spider, args.url, 0)
            thread_pool.start_task()
            thread_pool.wait_all_complete()

            # Print a summary when finished
            progress_info = thread_pool.get_progress_info()
            print "Total tasks:", progress_info['tasks_number']
            print "Downloaded:", progress_info['success']
            print "Failed:", progress_info['failure']
            print "Elapsed time: {0} s".format(time.time() - start)
        else:
            logging.critical("No initial url")
            print "Please specify the start url with the -u option"
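
Once a crawl has finished, the downloaded pages end up in the `data` table created by `utils/save.py`. As a quick usage sketch (assuming the `sina.db` file produced by the README example; the snippet itself is not part of the repository), the results can be inspected like this:

# Hypothetical sketch: inspect the pages stored by a finished crawl.
import sqlite3

conn = sqlite3.connect("sina.db")                       # database written via --dbfile
for url, html in conn.execute("select url, html from data"):
    print url, len(html)                                # one row per saved page
conn.close()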
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dqw/python_spider/a854bdb0a9a2a04095630fa01262be7bc6b43a46/utils/__init__.py

--------------------------------------------------------------------------------
/utils/parser.py:
--------------------------------------------------------------------------------
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import argparse


def get_args():
    # Parse the command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('-u', dest="url", default="", help="start URL of the crawl")
    parser.add_argument("-d", type=int, dest="deep", default=0, help="crawl depth; the start URL is level 0")
    parser.add_argument("--thread", type=int, dest="thread", default=10, help="number of threads used by the crawler")
    parser.add_argument('--dbfile', dest="dbfile", default="spider.db", help="sqlite database file in which the results are stored")
    parser.add_argument('-f', dest="logfile", default="spider.log", help="log file name")
    parser.add_argument('-l', dest="loglevel", default=5, type=int, help="log verbosity (1-5); the higher the number, the more detail is logged")
    parser.add_argument('--key', dest="key", default="", help="keyword a page must contain to be kept; when omitted, all pages are kept")
    parser.add_argument('--encoding', dest="encoding", default=None, help="page encoding; detected automatically when omitted")
    parser.add_argument('--testself', action="store_true", dest="testself", default=False, help="run the program's self-tests")
    args = parser.parse_args()
    args.key = args.key.decode("utf-8")

    return args
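
To see what `get_args` produces for the command line from the README, a small hypothetical check (run from the repository root under Python 2; the overridden `sys.argv` is only for illustration) might look like this:

# Hypothetical check of the argument parser.
import sys
from utils.parser import get_args

sys.argv = ['main.py', '-u', 'http://www.sina.com.cn', '-d', '1',
            '--thread', '10', '--dbfile', 'sina.db', '--key', 'HTML5']
args = get_args()
print args.url, args.deep, args.thread, args.dbfile, args.key
# expected output: http://www.sina.com.cn 1 10 sina.db HTML5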
--------------------------------------------------------------------------------
/utils/pool.py:
--------------------------------------------------------------------------------
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import threading
import Queue
import md5
import time
import logging
from utils.progress import PrintProgress
from utils.save import SaveToSqlite


class ThreadPool(object):
    def __init__(self, thread_num, args):

        self.args = args
        self.work_queue = Queue.Queue()
        self.save_queue = Queue.Queue()
        self.threads = []
        self.running = 0
        self.failure = 0
        self.success = 0
        self.tasks = {}
        self.thread_name = threading.current_thread().getName()
        self.__init_thread_pool(thread_num)

    # Initialise the thread pool
    def __init_thread_pool(self, thread_num):
        # Download threads
        for i in range(thread_num):
            self.threads.append(WorkThread(self))
        # Progress-printing thread
        self.threads.append(PrintProgress(self))
        # Saving thread
        self.threads.append(SaveToSqlite(self, self.args.dbfile))

    # Add a download task
    def add_task(self, func, url, deep):
        # Record the task so that each URL is downloaded only once
        url_hash = md5.new(url.encode("utf8")).hexdigest()
        if url_hash not in self.tasks:
            self.tasks[url_hash] = url
            self.work_queue.put((func, url, deep))
            logging.info("{0} add task {1}".format(self.thread_name, url.encode("utf8")))

    # Get a download task
    def get_task(self):
        task = self.work_queue.get(block=False)

        return task

    def task_done(self):
        self.work_queue.task_done()

    # Start all threads
    def start_task(self):
        for item in self.threads:
            item.start()

        logging.debug("Work start")

    def increase_success(self):
        self.success += 1

    def increase_failure(self):
        self.failure += 1

    def increase_running(self):
        self.running += 1

    def decrease_running(self):
        self.running -= 1

    def get_running(self):
        return self.running

    def get_progress_info(self):
        progress_info = {}
        progress_info['work_queue_number'] = self.work_queue.qsize()
        progress_info['tasks_number'] = len(self.tasks)
        progress_info['save_queue_number'] = self.save_queue.qsize()
        progress_info['success'] = self.success
        progress_info['failure'] = self.failure

        return progress_info

    def add_save_task(self, url, html):
        self.save_queue.put((url, html))

    def get_save_task(self):
        save_task = self.save_queue.get(block=False)

        return save_task

    def wait_all_complete(self):
        for item in self.threads:
            if item.isAlive():
                item.join()


class WorkThread(threading.Thread):
    def __init__(self, thread_pool):
        threading.Thread.__init__(self)
        self.thread_pool = thread_pool

    def run(self):
        while True:
            try:
                do, url, deep = self.thread_pool.get_task()
                self.thread_pool.increase_running()

                # Only collect new links while the task is above the maximum depth
                flag_get_new_link = True
                if deep >= self.thread_pool.args.deep:
                    flag_get_new_link = False

                html, new_link = do(url, self.thread_pool.args, flag_get_new_link)

                if html == '':
                    self.thread_pool.increase_failure()
                else:
                    self.thread_pool.increase_success()
                    # Queue the html for saving
                    self.thread_pool.add_save_task(url, html)

                # Add the new tasks
                if new_link:
                    for url in new_link:
                        self.thread_pool.add_task(do, url, deep + 1)

                self.thread_pool.decrease_running()
                self.thread_pool.task_done()
            except Queue.Empty:
                # Exit only when no downloads are running and nothing is queued;
                # otherwise wait briefly instead of busy-looping
                if self.thread_pool.get_running() <= 0 and self.thread_pool.work_queue.empty():
                    break
                time.sleep(0.1)
            except Exception as e:
                print str(e)
                break
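
To illustrate the contract the pool expects from a task, namely a callable `func(url, args, flag_get_new_link)` that returns `(html, new_links)`, here is a hypothetical driver that replaces `spider` with a stub. The stub, the `Namespace` values and the `stub.db`/`stub.log` file names are assumptions, not part of the project; run it from the repository root under Python 2, and expect it to take roughly ten seconds because the progress thread sleeps for 10 s between reports:

# Hypothetical driver that exercises ThreadPool with a stub task instead of utils.spider.spider.
import logging
from argparse import Namespace
from utils.pool import ThreadPool


def stub_task(url, args, flag_get_new_link):
    # Pretend the page was downloaded and report no outgoing links.
    return "<html>stub page for %s</html>" % url, []

logging.basicConfig(filename="stub.log", level=logging.DEBUG)
args = Namespace(deep=0, thread=2, dbfile="stub.db", key=u"", encoding=None)
pool = ThreadPool(args.thread, args)
pool.add_task(stub_task, u"http://example.com/", 0)
pool.start_task()
pool.wait_all_complete()
print pool.get_progress_info()   # expect success == 1 and an empty work queue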
--------------------------------------------------------------------------------
/utils/progress.py:
--------------------------------------------------------------------------------
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import threading
import time
import logging


# Print progress information
class PrintProgress(threading.Thread):
    def __init__(self, thread_pool):
        threading.Thread.__init__(self)
        self.thread_pool = thread_pool

    def run(self):
        while True:
            thread_number = self.thread_pool.get_running()
            progress_info = self.thread_pool.get_progress_info()

            # Stop once nothing is running, waiting to be downloaded or waiting to be saved
            if thread_number <= 0 and progress_info['work_queue_number'] <= 0 \
                    and progress_info['save_queue_number'] <= 0:
                break

            print 'Total tasks:', progress_info['tasks_number']
            print 'Working threads:', thread_number
            print 'To download:', progress_info['work_queue_number']
            print 'To save:', progress_info['save_queue_number']
            print '---------------------------------------'

            logging.debug("Print progress")

            time.sleep(10)

--------------------------------------------------------------------------------
/utils/save.py:
--------------------------------------------------------------------------------
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import threading
import Queue
import sqlite3
import time
import logging


# Save html to sqlite
class SaveToSqlite(threading.Thread):
    def __init__(self, thread_pool, dbfile):
        threading.Thread.__init__(self)
        self.thread_pool = thread_pool
        self.conn = sqlite3.connect(dbfile, check_same_thread=False)
        # Store text as raw byte strings so that Chinese pages can be saved
        self.conn.text_factory = str
        self.cmd = self.conn.cursor()
        self.cmd.execute('''
            create table if not exists data(
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url text,
                html text
            )
        ''')
        self.conn.commit()

    def run(self):
        while True:
            try:
                url, html = self.thread_pool.get_save_task()
                try:
                    self.cmd.execute("insert into data (url, html) values (?,?)", (url, html))
                    self.conn.commit()
                except Exception as e:
                    logging.error("Save error:{0}".format(str(e)))
            except Queue.Empty:
                # Exit only when no downloads are running and nothing is left to download;
                # otherwise wait briefly for more pages to arrive
                progress_info = self.thread_pool.get_progress_info()
                if self.thread_pool.get_running() <= 0 and progress_info['work_queue_number'] <= 0:
                    self.conn.close()
                    break
                time.sleep(0.1)
            except Exception as e:
                print str(e)
                break

--------------------------------------------------------------------------------
/utils/spider.py:
--------------------------------------------------------------------------------
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import threading
import urllib2
import re
import gzip
import chardet
import logging
from StringIO import StringIO
from BeautifulSoup import BeautifulSoup


def spider(url, args, flag_get_new_link):

    thread_name = threading.current_thread().getName()

    # Parse the page and collect its links
    def get_link(html):
        new_link = []

        soup = BeautifulSoup(html)
        for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
            href = link.get('href')
            new_link.append(href)

        return new_link

    # Download the page
    def get_html(url, args, flag_get_new_link):
        html = ''
        new_link = []

        try:
            response = urllib2.urlopen(url, timeout=20)
            if response.info().get('Content-Encoding') == 'gzip':
                buf = StringIO(response.read())
                f = gzip.GzipFile(fileobj=buf)
                html = f.read()
            else:
                html = response.read()
        except urllib2.HTTPError as e:
            # HTTPError is a subclass of URLError, so it has to be caught first
            logging.warning("{0} HTTP error: {1}".format(url.encode("utf8"), e.code))
        except urllib2.URLError as e:
            logging.warning("{0} URL error: {1}".format(url.encode("utf8"), e.reason))
        except Exception as e:
            logging.warning("{0} Unexpected: {1}".format(url.encode("utf8"), str(e)))
        else:
            logging.info("{0} downloaded {1}".format(thread_name, url.encode("utf8")))

            if args.key == "":
                if flag_get_new_link:
                    new_link = get_link(html)
            else:
                # Work out the page encoding, falling back to utf-8 if detection fails
                encoding = args.encoding
                if not encoding:
                    encoding = chardet.detect(html)['encoding'] or "utf-8"

                # Only follow links from pages that match the keyword
                match = re.search(args.key, html.decode(encoding, "ignore"))
                if match:
                    logging.info("{0} {1} match key".format(thread_name, url.encode("utf8")))
                    if flag_get_new_link:
                        new_link = get_link(html)
                else:
                    logging.info("{0} {1} not match key".format(thread_name, url.encode("utf8")))

        return html, new_link

    return get_html(url, args, flag_get_new_link)

--------------------------------------------------------------------------------
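
For reference, a small standalone illustration of the link filter used by `get_link` above (a hypothetical snippet, not part of the repository; it runs under Python 2 with BeautifulSoup 3 installed). The `^http://` pattern keeps only absolute http:// links, so relative and https:// links are not followed:

# Hypothetical illustration of the link extraction used in utils/spider.py.
import re
from BeautifulSoup import BeautifulSoup

html = '<a href="http://example.com/a">A</a> <a href="/b">B</a> <a href="https://example.com/c">C</a>'
soup = BeautifulSoup(html)
links = [a.get('href') for a in soup.findAll('a', attrs={'href': re.compile("^http://")})]
print links   # ['http://example.com/a']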