├── .gitignore
├── README.md
├── img
│   ├── progress.jpg
│   └── start.jpg
├── spider.py
└── test.py

/.gitignore:
--------------------------------------------------------------------------------
*.db
*.log
test/*
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
*Author: littlethunder* *mail:*

## Requirements

Write a web crawler in Python that supports the following arguments:

python3 spider.py -u url -d deep -f logfile -l loglevel(1-5) --testself --thread number --dbfile filepath --key="HTML5"

#### Arguments

-u  start URL of the crawl

-d  crawl depth

--thread  thread pool size for concurrent crawling; optional, default 10

--dbfile  sqlite database file that receives the results

--key  keyword to look for inside pages; only matching pages are kept; optional, by default every page is kept

-f  log file; optional, default spider.log

-l  log verbosity, 1-5; the higher the number, the more detailed the log; optional

--testself  run the self-test; optional

#### Features

1. Crawl the given site down to the given depth and store every page that contains the keyword in a sqlite3 database file.
2. Print progress information on the screen.
3. Crawl pages concurrently through a thread pool.
4. The code must be thoroughly commented.
5. The thread pool has to be implemented by hand.

## Implementation notes

1. The code follows PEP 8.
2. A thread pool is used: all worker threads are created when the pool is initialised and are never destroyed while the program runs; a task is assigned to a thread taken from the pool, the thread goes back into the pool when the task is done, and all threads are destroyed once the main program finishes (a minimal sketch of this pattern follows this list).
3. The main thread avoids fetching the same link twice and uses file MD5 hashes to detect files that share a name but differ in content.
4. Pages are parsed with the lxml module.
5. The main process displays its progress with curses.
6. The self-test module test.py checks whether any crawled HTML page is duplicated: with a keyword it looks in the database, without one it looks in the local directory; its progress bar uses [progressbar-python3](https://github.com/coagulant/progressbar-python3).
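The pool pattern from point 2 comes down to a fixed number of worker threads sharing one job queue. The sketch below only illustrates the idea; the `Worker` and `Pool` names are invented for the example, while the real implementation lives in the `spider` and `threadPool` classes of spider.py:

```python
import queue
import threading


class Worker(threading.Thread):
    def __init__(self, jobs):
        threading.Thread.__init__(self)
        self.jobs = jobs
        self.daemon = True            # workers die together with the main thread
        self.start()

    def run(self):
        while True:
            try:
                job = self.jobs.get(timeout=2)   # wait briefly, then retry
            except queue.Empty:
                continue
            try:
                print('processing', job)         # the real work would go here
            finally:
                self.jobs.task_done()            # mark the job as finished


class Pool:
    def __init__(self, num):
        self.jobs = queue.Queue()
        self.workers = [Worker(self.jobs) for _ in range(num)]

    def put(self, job):
        self.jobs.put(job)

    def wait(self):
        self.jobs.join()              # returns once every queued job is done


if __name__ == '__main__':
    pool = Pool(4)
    for i in range(10):
        pool.put('http://example.com/page%d' % i)
    pool.wait()
```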
## Test statistics

The test was run with: python3 spider.py -u http://www.sina.com.cn -d 3 -l 4 --key 科技

Monitoring tools: nethogs, htop.

By default the following files are created in the working directory:

* spider.db: database file holding the pages that contain the keyword "科技"
* spider.log: log file at level 4, very detailed.

#### 158238 URLs in total, on the order of 150 thousand:

* level 1: 1
* level 2: 1610
* level 3: 156627

#### Network, CPU and memory usage:

* Bandwidth: 13.8 Mbps at peak, 185.6 bps at minimum, 6 Mbps on average. The campus network fluctuates considerably depending on how many people are using it at different times of day.
* CPU: 0%. Most of the time the threads are waiting for responses to their requests, so they consume almost no CPU time.
* Memory: with the default 10 threads each thread uses about 8% of memory, roughly 80% in total; leaving aside Chrome and other memory-hungry programs, spider.py effectively fills the memory.

## Screenshots

![](img/start.jpg)

![](img/progress.jpg)

## v0.4 changes

* Cleaner formatting
* Better exception handling
* urls (visited URLs, prevents downloading the same URL twice) and fileMD5 (file MD5 hashes, prevents re-downloading identical content under a different name) were changed from lists to sets, so membership lookups are now O(1) on average (see the sketch below)
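That last change is the heart of the duplicate detection. A small illustration of the idea, using made-up page data rather than the real crawler code:

```python
import hashlib

urls = set()      # URLs that have already been scheduled
fileMD5 = set()   # MD5 digests of content that has already been saved


def should_fetch(url):
    # schedule a URL only the first time it is seen; set lookup is O(1) on average
    if url in urls:
        return False
    urls.add(url)
    return True


def should_save(data):
    # identical content reached through a different URL is caught by its digest
    digest = hashlib.md5(data).hexdigest()
    if digest in fileMD5:
        return False
    fileMD5.add(digest)
    return True


print(should_fetch('http://example.com'))        # True
print(should_fetch('http://example.com'))        # False: already seen
print(should_save(b'<html>same body</html>'))    # True
print(should_save(b'<html>same body</html>'))    # False: duplicate content
```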
--------------------------------------------------------------------------------
/img/progress.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leitro/knowsecSpider2/945d64d99f7266bcf8a52047f74f0ed26e5eda0b/img/progress.jpg
--------------------------------------------------------------------------------
/img/start.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/leitro/knowsecSpider2/945d64d99f7266bcf8a52047f74f0ed26e5eda0b/img/start.jpg
--------------------------------------------------------------------------------
/spider.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
'''
Version: 0.4
Author: littlethunder
Mail: kingthunder2004@aliyun.com
'''
import urllib.request
import urllib.parse
import time
import sys
import getopt
import threading
import os
import hashlib
import queue
import sqlite3
import curses
import logging
import lxml.html


class SameFileError(Exception):
    pass


class NoneTypeError(Exception):
    pass


# Initialise the database: create the file and its table if they do not exist yet.
def _initDB(dbFile):
    exist = False
    ls = os.listdir('.')
    if dbFile in ls:
        exist = True
    db = sqlite3.connect(dbFile, check_same_thread=False)
    c = db.cursor()
    if not exist:
        try:
            c.execute('create table spider(id integer primary key,'
                      'url text,key text,content text)')
            db.commit()
        except sqlite3.OperationalError:
            logging.critical(dbFile + ' failed to create table')
    return db, c


# Insert a URL, the keyword and the full page content into the database.
def _insert(url, key, content):
    try:
        content = content.decode('gbk')
    except UnicodeDecodeError:
        content = content.decode('utf8', 'ignore')
        logging.debug(url + ' is UTF-8 encoded')
    content = urllib.parse.quote(content)
    try:
        # Parameterised query, so quotes inside the values cannot break the SQL.
        c.execute('insert into spider(url,key,content) values(?,?,?)',
                  (url, key, content))
        db.commit()
    except sqlite3.OperationalError:
        logging.critical('failed to insert data for ' + url)


# Request a URL; return the scheme, the host and the raw page bytes.
def _requestData(url):
    headers = {
        'Connection': 'keep-alive',
        'Accept': 'text/html,application/xhtml+xml,'
                  'application/xml;q=0.9,image/webp,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (X11; Linux i686) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/35.0.1916.153 Safari/537.36',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    }
    req = urllib.request.Request(url, headers=headers)
    try:
        res = urllib.request.urlopen(req, timeout=5).read()
    except Exception:
        logging.error('failed to open ' + url)
        return req.type, req.host, None
    return req.type, req.host, res


# Pick a new name when a file with the same name but different content already exists.
def _dealSameFileName(name):
    try:
        files = os.listdir('.')
    except OSError:
        logging.error('cannot list the files in the current directory')
        exit()
    count = 1
    while True:
        if name in files:
            name = '.'.join([name, str(count)])
            count += 1
        else:
            return name


# Display progress information in the terminal.
class showProgress(threading.Thread):
    def __init__(self, QLinks, deep, event):
        threading.Thread.__init__(self)
        self.QLinks = QLinks
        self.deep = deep
        self.event = event
        self.start()

    def run(self):
        if self.deep == 0:
            print('level 1 :', 1, '/', 1)
            return
        screen = curses.initscr()  # initialise the curses output window
        maxFile = [0] * (self.deep + 1)
        while True:
            links = list(self.QLinks.__dict__['queue'])
            # Remaining depth of every URL currently waiting in the queue.
            deeps = [x[1] for x in links]
            # Each element of keys is [depth, count]; deep=0 is the innermost
            # level, deep=n-1 the top level.
            keys = [[x, 0] for x in range(self.deep + 1)]
            n = len(keys)
            for d in deeps:
                keys[d][1] += 1
            screen.clear()  # clear the screen before redrawing
            count = 0
            for d in range(1, n + 1):
                count += 1
                if keys[n - d][1] > maxFile[d - 1]:
                    maxFile[d - 1] = keys[n - d][1]
                screen.addstr(count, 0, 'level ' + str(d) + ' : ' +
                              str(keys[n - d][1]) + ' / ' + str(maxFile[d - 1]))
            screen.refresh()  # push the update to the screen
            time.sleep(0.2)
            if self.event.is_set():
                curses.endwin()
                logging.info('Done at ' + time.ctime())
                break


class spider(threading.Thread):
    def __init__(self, QLinks, key, rlock):
        threading.Thread.__init__(self)
        self.queue = QLinks
        self.keyList = key
        self.rlock = rlock
        self.link = None
        self.deep = None
        self.key = None
        self.setDaemon(True)  # the workers end together with the main thread
        self.start()

    def run(self):
        while True:
            try:
                self.link, self.deep = self.queue.get(timeout=2)
                self.key = self.keyList[0]
            except queue.Empty:
                continue
            if self.deep > 0:
                self.deep -= 1
                links = self.getLinks()
                if links:
                    global urls
                    for i in links:
                        if i not in urls:
                            urls.add(i)
                            self.queue.put((i, self.deep))
                # Re-queue the current link with depth 0 so it gets downloaded.
                self.queue.put((self.link, 0))
            else:
                if not self.key:
                    self.download2File()
                else:
                    self.download2DB()
            logging.info(self.link + ' [' + str(self.deep) + ']')
            self.queue.task_done()

    # No keyword given: save the page into the current directory.
    def download2File(self):
        name = urllib.parse.quote(self.link)
        name = name.replace('/', '_')
        name = _dealSameFileName(name)
        try:
            data = _requestData(self.link)[2]
            if not data:
                return
            md5 = hashlib.md5(data).hexdigest()
            global fileMD5
            if md5 in fileMD5:
                raise SameFileError
            else:
                fileMD5.add(md5)
                with open(name, 'wb') as f:
                    f.write(data)
        except SameFileError:
            logging.info(self.link + ' has identical content, discarded')

    # Keyword given: store pages that contain it in the database.
    def download2DB(self):
        data = _requestData(self.link)[2]
        if not data:
            return
        try:
            html = data.decode('gbk')
        except UnicodeDecodeError:
            html = data.decode('utf8', 'ignore')
        if self.key in html:  # the keyword appears in the page, store it
            self.rlock.acquire()
            _insert(self.link, self.key, data)
            self.rlock.release()

    # Find all unique, valid child URLs of a page.
    def getLinks(self):
        try:
            resType, resHost, resData = _requestData(self.link)
            if not resData:
                raise NoneTypeError
        except NoneTypeError:
            return None
        try:
            data = resData.decode('gbk')
        except UnicodeDecodeError:
            data = resData.decode('utf8', 'ignore')
        host = resType + '://' + resHost
        doc = lxml.html.document_fromstring(data)
        tags = ['a', 'iframe', 'frame']
        doc.make_links_absolute(host)
        links = doc.iterlinks()
        trueLinks = []
        for l in links:
            if l[0].tag in tags:
                trueLinks.append(l[2])
        return trueLinks  # every link is an absolute URL
class threadPool:
    def __init__(self, num, event):
        self.num = num
        self.event = event
        self.threads = []
        self.queue = queue.Queue()
        self.key = [None]
        self.createThread()

    def createThread(self):
        for i in range(self.num):
            self.threads.append(spider(self.queue, self.key, rlock))

    def putJob(self, job, key=None):  # job is a (link, deep) tuple
        self.queue.put(job)
        self.key[0] = key

    def getQueue(self):
        return self.queue

    def wait(self):
        self.queue.join()
        self.event.set()  # tell the progress display that the crawl is finished


# Main controller.
def mainHandler(threadNum, link, deep, key, test):
    event = threading.Event()
    event.clear()
    pool = threadPool(threadNum, event)
    showProgress(pool.getQueue(), deep, event)
    pool.putJob((link, deep), key)
    pool.wait()
    if test:  # the self-test module was requested
        import test
        test.test(key, dbFile)


# Print usage information.
def _usage():
    print('''spider v0.4 --littlethunder
Usage: python3 spider.py -u [URL] -d [Deep] -f [Log File] -l [Level] \
--thread [Thread Number] --dbfile [Database File Name] --key [Key Word]

-h                          show this help and exit
-u [URL]                    (required) start URL, must begin with http or https
-d [Deep]                   crawl depth, default 1 (only the start page)
-f [Log File]               log file, default spider.log in the current directory
-l [Level]                  log verbosity, 1-5, the higher the more detailed, default 4
--thread [Thread Number]    thread pool size, default 10
--dbfile [Database File Name]
                            database file for the results, default spider.db;
                            ignored unless --key is given
--key [Key Word]            keyword to look for; matching pages are stored in the
                            database, otherwise pages are downloaded to the current directory
testself                    run the self-test, optional''')


if __name__ == '__main__':
    rlock = threading.RLock()
    url = None  # defaults start here
    deep = '1'
    logFile = 'spider.log'
    level = '4'
    threadNum = '10'
    dbFile = 'spider.db'
    key = None  # defaults end here

    optlist, args = getopt.getopt(
        sys.argv[1:],
        'u:d:f:l:h',
        ['thread=', 'dbfile=', 'key='])
    for k, v in optlist:
        if k == '-u':
            url = v
        elif k == '-d':
            deep = v
        elif k == '-f':
            logFile = v
        elif k == '-l':
            level = v
        elif k == '--thread':
            threadNum = v
        elif k == '--dbfile':
            dbFile = v
        elif k == '--key':
            key = v
        elif k == '-h':
            _usage()
            exit()

    deep = int(deep)
    level = int(level)
    threadNum = int(threadNum)
    if level < 1 or level > 5 or deep < 1 or threadNum < 1 or not url:
        _usage()
        exit()
    if key:
        db, c = _initDB(dbFile)
    logLevel = {  # log levels: 1 is the least verbose, 5 the most verbose
        1: logging.CRITICAL,
        2: logging.ERROR,
        3: logging.WARNING,
        4: logging.INFO,
        5: logging.DEBUG,
    }
    logging.basicConfig(filename=logFile, level=logLevel[level])
    deep -= 1
    urls = set()  # visited URLs, prevents crawling the same URL twice
    fileMD5 = set()  # MD5 digests of saved files, prevents saving identical content twice
    mainHandler(threadNum, url, deep, key, 'testself' in args)
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
'''
Version: 0.3
Author: gitsocket
Mail: kingthunder2004@aliyun.com
Phone: 18756091592
'''
import threading
import sqlite3
import hashlib
import os
from progressbar import *


# Build the list of MD5 digests of every HTML page stored in the database.
class testSameDB(threading.Thread):
    def __init__(self, cursor, md5, progress):
        threading.Thread.__init__(self)
        self.c = cursor
        self.count = 0
        self.md5 = md5  # list.append is thread-safe, so the list can be shared
        self.progress = progress
        self.start()

    def run(self):
        while True:
            # LIMIT offset, row_count: fetch the next batch of 10000 rows.
            self.c.execute('select content from spider limit %s,%s'
                           % (self.count, 10000))
            self.count += 10000
            contents = self.c.fetchall()
            if len(contents) == 0:
                break
            for c in contents:
                res = hashlib.md5(c[0].encode('utf8'))
                self.md5.append(res.hexdigest())
            self.progress[0] = self.count


# Build the list of MD5 digests of every HTML page saved in the local directory.
class testSameFile(threading.Thread):
    def __init__(self, files, md5, progress):
        threading.Thread.__init__(self)
        self.count = 0
        self.files = files
        self.md5 = md5  # list.append is thread-safe, so the list can be shared
        self.progress = progress
        self.start()

    def run(self):
        while len(self.files):
            try:
                fileName = self.files.pop()
            except IndexError:  # another thread emptied the list first
                break
            with open(fileName, 'rb') as f:
                data = f.read()
                res = hashlib.md5(data)
                self.md5.append(res.hexdigest())
            self.progress[0] += 1
# Self-test entry point: check whether any crawled HTML page is duplicated.
def test(key, dbFile):
    md5 = []
    progress = [0]
    if key:  # a keyword was given, check the database
        db = sqlite3.connect(dbFile, check_same_thread=False)
        c = db.cursor()
        c.execute('select count(*) from spider')
        totalNum = c.fetchall()[0][0]
        t = testSameDB(c, md5, progress)
        t.join()
    else:  # no keyword was given, check the files in the local directory
        files = os.listdir('.')
        if dbFile in files:
            files.remove(dbFile)
        totalNum = len(files)
        threads = [testSameFile(files, md5, progress)
                   for i in range(100)]
        for t in threads:
            t.join()
    pBar = ProgressBar(widgets=[Percentage(), Bar()],
                       maxval=totalNum).start()
    while progress[0] < totalNum:
        pBar.update(progress[0] + 1)
    pBar.finish()
    if len(md5) == totalNum:
        print('No duplicate HTML pages were crawled; the program behaved as required.')
    else:
        print('Duplicate HTML pages were crawled, please fix the code!')


if __name__ == '__main__':
    test('科学', 'spider.db')
--------------------------------------------------------------------------------