├── README.md ├── 公开课代码.rar ├── 公开课代码 ├── autohome.py ├── baidu_home_news.py └── iterator.py ├── 第一讲 互联网、互联网架构方面介绍,网站基本原理及扫盲.pdf ├── 第一课代码.rar ├── 第一课代码 ├── crawl_bsf.py ├── iterator.py ├── iterator_bsf.java ├── iterator_dsf.java └── mfw_url_feed.py ├── 第三讲 分布式爬虫.pdf ├── 第三讲代码.zip ├── 第三课代码.zip ├── 第三课代码 ├── mongo_redis_mgr.py ├── mongomgr.py ├── mysqlmanager.py ├── process_crawl.py ├── readme.txt ├── spider_process_mongo.py └── spider_process_mr.py ├── 第二讲 爬虫基本原理、搭建第一个爬虫.pdf ├── 第二讲代码.zip ├── 第二讲代码 ├── dbmanager.py ├── jd.com_2131674.html ├── lxml_test.py ├── multi_thread_mfw.py └── process_crawl.py ├── 第五讲 PageRank、动态重拍、避开网站反爬的技术.pdf ├── 第四讲 爬虫与反爬虫的对抗.pdf └── 第四讲代码.zip /README.md: -------------------------------------------------------------------------------- 1 | # distributed_crawler 2 | 分布式爬虫 3 | -------------------------------------------------------------------------------- /公开课代码.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangguo7/distributed_crawler/0557783408cffa22d197a439baa0dc5a3efe724f/公开课代码.rar -------------------------------------------------------------------------------- /公开课代码/autohome.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | import json 3 | from bs4 import BeautifulSoup 4 | 5 | url_format = 'http://www.autohome.com.cn/grade/carhtml/%s.html'; 6 | 7 | request_headers = { 8 | 'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 9 | 'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6", 10 | 'cache-control': "no-cache", 11 | 'connection': "keep-alive", 12 | 'cookie': "ASP.NET_SessionId=x4skm3eeyrup1xhm4g3v3c5j; cookieCityId=110100; fvlid=1485090119298xz0qs5oQ; sessionip=124.205.188.242; sessionid=D0E06CDF-C45B-4B6A-A3B3-B0E70EE1C87D%7C%7C2017-01-22+21%3A02%3A01.307%7C%7C0; sessionuid=D0E06CDF-C45B-4B6A-A3B3-B0E70EE1C87D||2017-01-22+21%3A02%3A01.307||0; ahpvno=4; __utma=1.944097921.1485090121.1485090121.1485090121.1; __utmb=1.0.10.1485090121; __utmc=1; __utmz=1.1485090121.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); ref=0%7C0%7C0%7C0%7C2017-01-22+21%3A15%3A16.728%7C2017-01-22+21%3A02%3A01.307; sessionvid=0686F4A6-50B3-4997-AFE0-2F5D28420D34; area=110199", 13 | 'host': "www.autohome.com.cn", 14 | 'if-modified-since': "Sun, 22 Jan 2017 13:00:08 GMT", 15 | 'upgrade-insecure-requests': "1", 16 | 'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36", 17 | 'postman-token': "8f6a0417-5aba-4b5b-cce3-f41eb134a5bd" 18 | } 19 | 20 | try: 21 | fo = open('autohome1.html', 'r') 22 | except IOError: 23 | html_doc = '' 24 | start_char = 'A' 25 | 26 | for i in range(ord('A'), ord('Z')): 27 | req = urllib2.Request(url_format % (chr(i)),headers=request_headers) 28 | response = urllib2.urlopen(req) 29 | page = response.read() 30 | html_doc += page; 31 | fo = open('autohome1.html', 'wb+') 32 | fo.write('\ 33 | \ 34 | \ 35 | \ 36 | \ 37 | \ 38 | \ 39 | \ 40 | Autohome\ 41 | \ 42 | \ 43 | ') 44 | fo.write(html_doc); 45 | fo.write('') 46 | 47 | soup = BeautifulSoup(fo, "html.parser") 48 | 49 | models_file = open("models.txt", "wb") 50 | 51 | for model in soup.find_all("h4"): 52 | try: 53 | if model.string is not None: 54 | models_file.write("%s\r\n" % (model.string.encode('utf-8'))) 55 | except ValueError: 56 | continue 57 | 58 | fo.close() 59 | models_file.close() -------------------------------------------------------------------------------- 
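The script above (`公开课代码/autohome.py`) fetches the letter-indexed brand pages of autohome.com.cn, concatenates them into one local HTML file, and pulls the model names out of the `<h4>` tags with BeautifulSoup. Two details are easy to trip over when running it: `range(ord('A'), ord('Z'))` stops at 'Y', so the 'Z' page is never fetched, and `BeautifulSoup(fo, ...)` receives a file handle that sits at end-of-file right after the writes, so it parses an empty document unless the handle is rewound. Below is a minimal sketch of the same pattern with those two points handled, in the repo's Python 2 / urllib2 / bs4 style; the header set is trimmed and the file names `autohome_sketch.html` / `models_sketch.txt` are made up for illustration.

```python
# -*- coding: utf-8 -*-
# Minimal sketch: fetch the A-Z brand pages and collect the <h4> model names.
import urllib2
from bs4 import BeautifulSoup

url_format = 'http://www.autohome.com.cn/grade/carhtml/%s.html'
request_headers = {
    'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
}

html_doc = ''
for i in range(ord('A'), ord('Z') + 1):   # + 1 keeps the 'Z' page
    req = urllib2.Request(url_format % chr(i), headers=request_headers)
    html_doc += urllib2.urlopen(req).read()

fo = open('autohome_sketch.html', 'wb+')
fo.write(html_doc)
fo.seek(0)                                # rewind before parsing

soup = BeautifulSoup(fo, 'html.parser')
with open('models_sketch.txt', 'wb') as models_file:
    for model in soup.find_all('h4'):
        if model.string is not None:
            models_file.write('%s\r\n' % model.string.encode('utf-8'))
fo.close()
```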
/公开课代码/baidu_home_news.py: -------------------------------------------------------------------------------- 1 | import urllib 2 | import urllib2 3 | import json 4 | 5 | url_format = 'https://www.baidu.com/home/pcweb/data/mancardwater?id=2&offset=%d&sessionId=14832978921842&p_params=31415927&newsNum=3&indextype=manht&_req_seqid=0xf7e28ac600008a71&asyn=1&t=1483297904932&sid=1445_21093_20691_21554_21592' 6 | 7 | user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)' 8 | values = {'name': 'Michael Foord', 9 | 'location': 'Northampton', 10 | 'language': 'Python' } 11 | request_headers = { 12 | 'upgrade-insecure-requests': "1", 13 | 'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36", 14 | 'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 15 | 'accept-encoding': "gzip, deflate, sdch, br", 16 | 'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6", 17 | 'cookie': "BAIDUID=967E3B223D6EF159BEC8EDB441C8CA3E:FG=1; BIDUPSID=967E3B223D6EF159BEC8EDB441C8CA3E; PSTM=1484982290; BDUSS=nVvaXZmbFhTZlBNUDhPNXlYZlVYYU5OTm10N3UtMnk0bnJEV09yd2V3RVRKS3hZSVFBQUFBJCQAAAAAAAAAAAEAAAC4Qr0zzve5z7rNv9bB-gAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABOXhFgTl4RYR; pgv_pvi=6289754112; BD_HOME=1; H_PS_PSSID=1457_21098_20691_20719; BD_UPN=12314753", 18 | 'cache-control': "no-cache", 19 | 'postman-token': "ffd3e32c-3099-ffcb-576e-c77b9d9a83ab" 20 | } 21 | 22 | data = urllib.urlencode(values) 23 | html_doc = '' 24 | for i in range(1,6): 25 | req = urllib2.Request(url_format % (i),headers=request_headers) 26 | response = urllib2.urlopen(req) 27 | page = response.read() 28 | page = page.replace('\\x22','Xx22').replace('\\', '').replace('Xx22', '\\"') 29 | response_obj = json.loads(page) 30 | html_doc += response_obj['html'].replace('\\"', '"').encode('utf-8') 31 | 32 | fo = open('baidu.html', 'wb') 33 | fo.write('') 34 | fo.write(html_doc); 35 | fo.write('') 36 | fo.close() -------------------------------------------------------------------------------- /公开课代码/iterator.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | import json 3 | from lxml import etree 4 | import hashlib 5 | 6 | request_headers = { 7 | 'host': "www.mafengwo.cn", 8 | 'connection': "keep-alive", 9 | 'cache-control': "no-cache", 10 | 'upgrade-insecure-requests': "1", 11 | 'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36", 12 | 'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 13 | 'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6", 14 | 'cookie': "mfw_uuid=5879e298-7d17-50bf-100a-30898d44da2d; oad_n=a%3A3%3A%7Bs%3A3%3A%22oid%22%3Bi%3A1029%3Bs%3A2%3A%22dm%22%3Bs%3A15%3A%22www.mafengwo.cn%22%3Bs%3A2%3A%22ft%22%3Bs%3A19%3A%222017-01-14+16%3A34%3A32%22%3B%7D; __mfwurd=a%3A3%3A%7Bs%3A6%3A%22f_time%22%3Bi%3A1484382875%3Bs%3A9%3A%22f_rdomain%22%3Bs%3A0%3A%22%22%3Bs%3A6%3A%22f_host%22%3Bs%3A3%3A%22www%22%3B%7D; __mfwuuid=5879e298-7d17-50bf-100a-30898d44da2d; PHPSESSID=v17pef8jrto99pvsgsppo748j0; __mfwlv=1484402143; __mfwvn=2; __mfwlt=1484402151; uva=a%3A4%3A%7Bs%3A2%3A%22lt%22%3Bi%3A1484402148%3Bs%3A10%3A%22last_refer%22%3Bs%3A6%3A%22direct%22%3Bs%3A5%3A%22rhost%22%3Bs%3A0%3A%22%22%3Bs%3A4%3A%22step%22%3Bi%3A9%3B%7D; CNZZDATA30065558=cnzz_eid%3D55928032-1484382591-%26ntime%3D1484397604", 15 | 'postman-token': "0d7a1e08-f8d5-ec1f-ab2e-879ab9a00d34" 16 | } 
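# The remainder of this script is a depth-first crawl of www.mafengwo.cn:
# getpagecontent() saves each page under iterate/, records the URL's md5 in
# download.txt and in an in-memory list so the same page is not fetched twice,
# then recursively follows at most iter_width (3) links per page, up to
# max_level (5) levels deep. The handlers below also catch httplib.BadStatusLine,
# so httplib has to be imported together with the modules above for that except
# clause to resolve.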
17 | 18 | root_url = 'http://www.mafengwo.cn' 19 | max_level = 5 20 | dir_name = 'iterate/' 21 | iter_width = 3 22 | 23 | downloaded_url_file_name = dir_name + 'download.txt' 24 | 25 | du_file = open(downloaded_url_file_name, 'a+') 26 | downloaded_urls = du_file.readlines() 27 | 28 | 29 | def getpagecontent(cur_url, cur_level): 30 | print "downloading %s at level %d" % (cur_url, cur_level) 31 | try: 32 | req = urllib2.Request(cur_url, headers=request_headers) 33 | response = urllib2.urlopen(req) 34 | html_page = response.read() 35 | filename = cur_url[7:].replace('/', '_') 36 | fo = open("%s%s.html" % (dir_name, filename), 'wb+') 37 | fo.write(html_page) 38 | fo.close() 39 | except urllib2.HTTPError: 40 | print 'HTTP Error at ' + cur_url 41 | return 42 | except httplib.BadStatusLine: 43 | print 'BadStatusLine' 44 | return 45 | except IOError: 46 | print 'IO Error at ' + filename 47 | return 48 | except Exception: 49 | print 'Unhandled Exception' 50 | return 51 | # print 'add ' + hashlib.md5(cur_url).hexdigest() + ' to list' 52 | downloaded_urls.append(hashlib.md5(cur_url).hexdigest()) 53 | du_file.write(hashlib.md5(cur_url).hexdigest() + '\r\n') 54 | 55 | html = etree.HTML(html_page.lower().decode('utf-8')) 56 | hrefs = html.xpath(u"//a") 57 | 58 | if cur_level == max_level: 59 | return 60 | 61 | page_index = 0 62 | for href in hrefs: 63 | try: 64 | if 'href' in href.attrib: 65 | val = href.attrib['href'] 66 | if val.find('javascript:') != -1: 67 | continue 68 | if val.startswith('http://') is False: 69 | if val.startswith('/'): 70 | val = 'http://www.mafengwo.cn' + val 71 | else: 72 | continue 73 | if val[-1] == '/': 74 | val = val[0:-1] 75 | if hashlib.md5(val).hexdigest() not in downloaded_urls: 76 | getpagecontent(val, cur_level + 1) 77 | page_index += 1 78 | if page_index == iter_width: 79 | break 80 | else: 81 | print val + ' is skipped' 82 | except ValueError: 83 | continue 84 | 85 | getpagecontent(root_url, 0) 86 | 87 | du_file.close() 88 | -------------------------------------------------------------------------------- /第一讲 互联网、互联网架构方面介绍,网站基本原理及扫盲.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangguo7/distributed_crawler/0557783408cffa22d197a439baa0dc5a3efe724f/第一讲 互联网、互联网架构方面介绍,网站基本原理及扫盲.pdf -------------------------------------------------------------------------------- /第一课代码.rar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangguo7/distributed_crawler/0557783408cffa22d197a439baa0dc5a3efe724f/第一课代码.rar -------------------------------------------------------------------------------- /第一课代码/crawl_bsf.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | from collections import deque 3 | import json 4 | from lxml import etree 5 | import httplib 6 | import hashlib 7 | from pybloomfilter import BloomFilter 8 | 9 | class CrawlBSF: 10 | request_headers = { 11 | 'host': "www.mafengwo.cn", 12 | 'connection': "keep-alive", 13 | 'cache-control': "no-cache", 14 | 'upgrade-insecure-requests': "1", 15 | 'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36", 16 | 'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 17 | 'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6" 18 | } 19 | 20 | cur_level = 0 21 | max_level = 5 22 | dir_name = 'iterate/' 23 | iter_width = 50 24 | 
downloaded_urls = [] 25 | 26 | du_md5_file_name = dir_name + 'download.txt' 27 | du_url_file_name = dir_name + 'urls.txt' 28 | 29 | download_bf = BloomFilter(1024*1024*16, 0.01) 30 | 31 | cur_queue = deque() 32 | child_queue = deque() 33 | 34 | def __init__(self, url): 35 | self.root_url = url 36 | self.cur_queue.append(url) 37 | self.du_file = open(self.du_url_file_name, 'a+') 38 | try: 39 | self.dumd5_file = open(self.du_md5_file_name, 'r') 40 | self.downloaded_urls = self.dumd5_file.readlines() 41 | self.dumd5_file.close() 42 | for urlmd5 in self.downloaded_urls: 43 | self.download_bf.add(urlmd5[:-2]) 44 | except IOError: 45 | print "File not found" 46 | finally: 47 | self.dumd5_file = open(self.du_md5_file_name, 'a+') 48 | 49 | def enqueueUrl(self, url): 50 | self.child_queue.append(url) 51 | 52 | def dequeuUrl(self): 53 | try: 54 | url = self.cur_queue.popleft() 55 | return url 56 | except IndexError: 57 | self.cur_level += 1 58 | if self.cur_level == self.max_level: 59 | return None 60 | if len(self.child_queue) == 0: 61 | return None 62 | self.cur_queue = self.child_queue 63 | self.child_queue = deque() 64 | return self.dequeuUrl() 65 | 66 | def getpagecontent(self, cur_url): 67 | print "downloading %s at level %d" % (cur_url, self.cur_level) 68 | try: 69 | req = urllib2.Request(cur_url, headers=self.request_headers) 70 | response = urllib2.urlopen(req) 71 | html_page = response.read() 72 | filename = cur_url[7:].replace('/', '_') 73 | fo = open("%s%s.html" % (self.dir_name, filename), 'wb+') 74 | fo.write(html_page) 75 | fo.close() 76 | except urllib2.HTTPError, Arguments: 77 | print Arguments 78 | return 79 | except httplib.BadStatusLine: 80 | print 'BadStatusLine' 81 | return 82 | except IOError: 83 | print 'IO Error at ' + filename 84 | return 85 | except Exception, Arguments: 86 | print Arguments 87 | return 88 | # print 'add ' + hashlib.md5(cur_url).hexdigest() + ' to list' 89 | dumd5 = hashlib.md5(cur_url).hexdigest() 90 | self.downloaded_urls.append(dumd5) 91 | self.dumd5_file.write(dumd5 + '\r\n') 92 | self.du_file.write(cur_url + '\r\n') 93 | self.download_bf.add(dumd5) 94 | 95 | html = etree.HTML(html_page.lower().decode('utf-8')) 96 | hrefs = html.xpath(u"//a") 97 | 98 | for href in hrefs: 99 | try: 100 | if 'href' in href.attrib: 101 | val = href.attrib['href'] 102 | if val.find('javascript:') != -1: 103 | continue 104 | if val.startswith('http://') is False: 105 | if val.startswith('/'): 106 | val = 'http://www.mafengwo.cn' + val 107 | else: 108 | continue 109 | if val[-1] == '/': 110 | val = val[0:-1] 111 | # if hashlib.md5(val).hexdigest() not in self.downloaded_urls: 112 | if hashlib.md5(val).hexdigest() not in self.download_bf: 113 | self.enqueueUrl(val) 114 | else: 115 | print 'Skip %s' % (val) 116 | except ValueError: 117 | continue 118 | 119 | def start_crawl(self): 120 | while True: 121 | url = self.dequeuUrl() 122 | if url is None: 123 | break 124 | self.getpagecontent(url) 125 | self.dumd5_file.close() 126 | self.du_file.close() 127 | 128 | crawler = CrawlBSF("http://www.mafengwo.cn") 129 | crawler.start_crawl() -------------------------------------------------------------------------------- /第一课代码/iterator.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | import json 3 | from lxml import etree 4 | import hashlib 5 | 6 | request_headers = { 7 | 'host': "www.mafengwo.cn", 8 | 'connection': "keep-alive", 9 | 'cache-control': "no-cache", 10 | 'upgrade-insecure-requests': "1", 11 | 'user-agent': 
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36", 12 | 'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 13 | 'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6", 14 | 'cookie': "mfw_uuid=5879e298-7d17-50bf-100a-30898d44da2d; oad_n=a%3A3%3A%7Bs%3A3%3A%22oid%22%3Bi%3A1029%3Bs%3A2%3A%22dm%22%3Bs%3A15%3A%22www.mafengwo.cn%22%3Bs%3A2%3A%22ft%22%3Bs%3A19%3A%222017-01-14+16%3A34%3A32%22%3B%7D; __mfwurd=a%3A3%3A%7Bs%3A6%3A%22f_time%22%3Bi%3A1484382875%3Bs%3A9%3A%22f_rdomain%22%3Bs%3A0%3A%22%22%3Bs%3A6%3A%22f_host%22%3Bs%3A3%3A%22www%22%3B%7D; __mfwuuid=5879e298-7d17-50bf-100a-30898d44da2d; PHPSESSID=v17pef8jrto99pvsgsppo748j0; __mfwlv=1484402143; __mfwvn=2; __mfwlt=1484402151; uva=a%3A4%3A%7Bs%3A2%3A%22lt%22%3Bi%3A1484402148%3Bs%3A10%3A%22last_refer%22%3Bs%3A6%3A%22direct%22%3Bs%3A5%3A%22rhost%22%3Bs%3A0%3A%22%22%3Bs%3A4%3A%22step%22%3Bi%3A9%3B%7D; CNZZDATA30065558=cnzz_eid%3D55928032-1484382591-%26ntime%3D1484397604", 15 | 'postman-token': "0d7a1e08-f8d5-ec1f-ab2e-879ab9a00d34" 16 | } 17 | 18 | root_url = 'http://www.mafengwo.cn' 19 | max_level = 5 20 | dir_name = 'iterate/' 21 | iter_width = 3 22 | 23 | downloaded_url_file_name = dir_name + 'download.txt' 24 | 25 | du_file = open(downloaded_url_file_name, 'a+') 26 | downloaded_urls = du_file.readlines() 27 | 28 | 29 | def getpagecontent(cur_url, cur_level): 30 | print "downloading %s at level %d" % (cur_url, cur_level) 31 | try: 32 | req = urllib2.Request(cur_url, headers=request_headers) 33 | response = urllib2.urlopen(req) 34 | html_page = response.read() 35 | filename = cur_url[7:].replace('/', '_') 36 | fo = open("%s%s.html" % (dir_name, filename), 'wb+') 37 | fo.write(html_page) 38 | fo.close() 39 | except urllib2.HTTPError: 40 | print 'HTTP Error at ' + cur_url 41 | return 42 | except httplib.BadStatusLine: 43 | print 'BadStatusLine' 44 | return 45 | except IOError: 46 | print 'IO Error at ' + filename 47 | return 48 | except Exception: 49 | print 'Unhandled Exception' 50 | return 51 | # print 'add ' + hashlib.md5(cur_url).hexdigest() + ' to list' 52 | downloaded_urls.append(hashlib.md5(cur_url).hexdigest()) 53 | du_file.write(hashlib.md5(cur_url).hexdigest() + '\r\n') 54 | 55 | html = etree.HTML(html_page.lower().decode('utf-8')) 56 | hrefs = html.xpath(u"//a") 57 | 58 | if cur_level == max_level: 59 | return 60 | 61 | page_index = 0 62 | for href in hrefs: 63 | try: 64 | if 'href' in href.attrib: 65 | val = href.attrib['href'] 66 | if val.find('javascript:') != -1: 67 | continue 68 | if val.startswith('http://') is False: 69 | if val.startswith('/'): 70 | val = 'http://www.mafengwo.cn' + val 71 | else: 72 | continue 73 | if val[-1] == '/': 74 | val = val[0:-1] 75 | if hashlib.md5(val).hexdigest() not in downloaded_urls: 76 | getpagecontent(val, cur_level + 1) 77 | page_index += 1 78 | if page_index == iter_width: 79 | break 80 | else: 81 | print val + ' is skipped' 82 | except ValueError: 83 | continue 84 | 85 | getpagecontent(root_url, 0) 86 | 87 | du_file.close() 88 | -------------------------------------------------------------------------------- /第一课代码/iterator_bsf.java: -------------------------------------------------------------------------------- 1 | package cn.marble; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.Queue; 6 | import java.util.concurrent.ConcurrentLinkedQueue; 7 | 8 | public class Main { 9 | 10 | Queue currentQueue = new ConcurrentLinkedQueue(); 11 | 12 | Queue 
childQueue = new ConcurrentLinkedQueue(); 13 | 14 | int currentLevel = 0; 15 | 16 | void appendDownloadedUrl(String url){ 17 | 18 | } 19 | 20 | void appendErrorUrl(String url){ 21 | 22 | } 23 | 24 | boolean isDownloaded(String url){ 25 | 26 | } 27 | 28 | String getPageContent(String url) throws CrawlException, IOException { 29 | return ""; 30 | } 31 | 32 | String savePageContent(String pageContent){ 33 | return ""; 34 | } 35 | 36 | void enqueueUrls(ConcurrentLinkedQueue urls){ 37 | childQueue.addAll(urls); 38 | } 39 | 40 | void enqueueUrlsFromPageSrc(String pageContent){ 41 | 42 | } 43 | 44 | void enqueueUrl(String url){ 45 | childQueue.add(url); 46 | } 47 | 48 | String dequeueUrl(){ 49 | String url = currentQueue.poll(); 50 | if ( url == null ){ 51 | currentLevel ++; 52 | if ( currentLevel == HEIGHT ) 53 | return null; 54 | currentQueue = childQueue; 55 | childQueue = new ConcurrentLinkedQueue(); 56 | url = currentQueue.poll(); 57 | } 58 | return url; 59 | } 60 | 61 | String rootNode; 62 | 63 | final static int WIDTH = 50; 64 | final static int HEIGHT = 5; 65 | 66 | void crawl(String url){ 67 | String pageContent; 68 | try{ 69 | pageContent = getPageContent(url); 70 | savePageContent(pageContent); 71 | } catch( Exception e ){ 72 | appendErrorUrl(url); 73 | return; 74 | } 75 | 76 | enqueueUrlsFromPageSrc(pageContent); 77 | } 78 | 79 | void start(){ 80 | 81 | int curLevel = 0; 82 | enqueueUrl("http://www.mafengwo.cn"); 83 | 84 | while( true ){ 85 | String url = dequeueUrl(); 86 | if ( url == null ){ 87 | if ( url == null ) 88 | break; 89 | } 90 | crawl(url); 91 | 92 | } 93 | } 94 | 95 | public static void main(String[] args) { 96 | // write your code here 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /第一课代码/iterator_dsf.java: -------------------------------------------------------------------------------- 1 | public class Crawler{ 2 | 3 | String[] getAllUrls(String htmlContent); 4 | 5 | void appendDownloadedUrl(String url); 6 | 7 | void appendErrorUrl(String url); 8 | 9 | boolean isDownloaded(String url); 10 | 11 | String getPageContent(String url) throws CrawlException, IOException; 12 | 13 | String savePageContent(String pageContent); 14 | 15 | String rootNode; 16 | 17 | final static int WIDTH = 50; 18 | final static int HEIGHT = 5; 19 | 20 | 21 | void crawl(String url, level){ 22 | try{ 23 | String pageContent = getPageContent(url); 24 | } catch( Exception e ){ 25 | appendErrorUrl(url); 26 | return; 27 | } 28 | 29 | savePageContent(pageContent); 30 | 31 | if ( level == HEIGHT ) 32 | return; 33 | 34 | String[] urls = getAllUrls(pageContent); 35 | 36 | for( int i = 0; i < urls.length; i ++ ){ 37 | 38 | if ( i == WIDTH ) 39 | return; 40 | if ( isDownloaded(urls[i])) 41 | continue; 42 | crawl( urls[i], level + 1 ); 43 | } 44 | } 45 | 46 | start(){ 47 | crawl("http://www.mafengwo.com.cn", 0 ); 48 | } 49 | 50 | } -------------------------------------------------------------------------------- /第一课代码/mfw_url_feed.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | import httplib 3 | import re 4 | from pybloomfilter import BloomFilter 5 | import os 6 | 7 | request_headers = { 8 | 'host': "www.mafengwo.cn", 9 | 'connection': "keep-alive", 10 | 'cache-control': "no-cache", 11 | 'upgrade-insecure-requests': "1", 12 | 'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36", 13 | 'accept': 
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 14 | 'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6" 15 | } 16 | 17 | city_home_pages = [] 18 | city_ids = [] 19 | dirname = 'mafengwo_notes/' 20 | 21 | # 创建 Bloom Filter 22 | download_bf = BloomFilter(1024 * 1024 * 16, 0.01) 23 | 24 | 25 | def download_city_notes(id): 26 | for i in range(1, 999): 27 | url = 'http://www.mafengwo.cn/yj/%s/1-0-%d.html' % (id, i) 28 | if url in download_bf: 29 | continue 30 | print 'open url %s' % (url) 31 | download_bf.add(url) 32 | req = urllib2.Request(url, headers=request_headers) 33 | response = urllib2.urlopen(req) 34 | htmlcontent = response.read() 35 | city_notes = re.findall('href="/i/\d{7}.html', htmlcontent) 36 | 37 | # 如果导航页错误,该页的游记数为0,则意味着 1-0-xxx.html 已经遍历完,结束这个城市 38 | if len(city_notes) == 0: 39 | return 40 | for city_note in city_notes: 41 | try: 42 | city_url = 'http://www.mafengwo.cn%s' % (city_note[6:]) 43 | if city_url in download_bf: 44 | continue 45 | print 'download %s' % (city_url) 46 | req = urllib2.Request(city_url, headers=request_headers) 47 | response = urllib2.urlopen(req) 48 | html = response.read() 49 | filename = city_url[7:].replace('/', '_') 50 | fo = open("%s%s" % (dirname, filename), 'wb+') 51 | fo.write(html) 52 | fo.close() 53 | download_bf.add(city_url) 54 | except Exception, Arguments: 55 | print Arguments 56 | continue 57 | 58 | # 检查用于存储网页文件夹是否存在,不存在则创建 59 | if not os.path.exits(dirname): 60 | os.makedirs(dirname) 61 | 62 | try: 63 | # 下载目的地的首页 64 | req = urllib2.Request('http://www.mafengwo.cn/mdd/', headers=request_headers) 65 | response = urllib2.urlopen(req) 66 | htmlcontent = response.read() 67 | 68 | # 利用正则表达式,找出所有的城市主页 69 | city_home_pages = re.findall('/travel-scenic-spot/mafengwo/\d{5}.html', htmlcontent) 70 | 71 | # 通过循环,依次下载每个城市下的所有游记 72 | for city in city_home_pages: 73 | city_ids.append(city[29:34]) 74 | download_city_notes(city[29:34]) 75 | except urllib2.HTTPError, Arguments: 76 | print Arguments 77 | except httplib.BadStatusLine: 78 | print 'BadStatusLine' 79 | except Exception, Arguments: 80 | print Arguments 81 | -------------------------------------------------------------------------------- /第三讲 分布式爬虫.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangguo7/distributed_crawler/0557783408cffa22d197a439baa0dc5a3efe724f/第三讲 分布式爬虫.pdf -------------------------------------------------------------------------------- /第三讲代码.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangguo7/distributed_crawler/0557783408cffa22d197a439baa0dc5a3efe724f/第三讲代码.zip -------------------------------------------------------------------------------- /第三课代码.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangguo7/distributed_crawler/0557783408cffa22d197a439baa0dc5a3efe724f/第三课代码.zip -------------------------------------------------------------------------------- /第三课代码/mongo_redis_mgr.py: -------------------------------------------------------------------------------- 1 | import mysql.connector 2 | import httplib 3 | import hashlib 4 | import time 5 | from datetime import datetime 6 | from datetime import timedelta 7 | 8 | import redis 9 | from pymongo import MongoClient 10 | from pymongo import IndexModel, ASCENDING, DESCENDING 11 | 12 | 13 | class MongoRedisUrlManager: 14 | 15 | def __init__(self, sever_ip='localhost', client=None, 
expires=timedelta(days=30)): 16 | """ 17 | client: mongo database client 18 | expires: timedelta of amount of time before a cache entry is considered expired 19 | """ 20 | # if a client object is not passed 21 | # then try connecting to mongodb at the default localhost port 22 | self.client = MongoClient(sever_ip, 27017) if client is None else client 23 | self.redis_client = redis.StrictRedis(host=sever_ip, port=6379, db=0) 24 | #create collection to store cached webpages, 25 | # which is the equivalent of a table in a relational database 26 | self.db = self.client.spider 27 | 28 | # create index if db is empty 29 | if self.db.mfw.count() is 0: 30 | self.db.mfw.create_index('status') 31 | 32 | def dequeueUrl(self): 33 | record = self.db.mfw.find_one_and_update( 34 | { 'status': 'new'}, 35 | { '$set': { 'status' : 'downloading'} }, 36 | { 'upsert':False, 'returnNewDocument' : False} 37 | ) 38 | if record: 39 | return record 40 | else: 41 | return None 42 | 43 | def enqueueUrl(self, url, status, depth): 44 | num = self.redis_client.get(url) 45 | if num is not None: 46 | self.redis_client.set(url, int(num) + 1 ) 47 | return 48 | self.redis_client.set(url, 1) 49 | self.db.mfw.insert({ 50 | '_id': hashlib.md5(url).hexdigest(), 51 | 'url': url, 52 | 'status': status, 53 | 'queue_time': datetime.utcnow(), 54 | 'depth': depth 55 | }) 56 | 57 | def finishUrl(self, url): 58 | record = {'status': 'done', 'done_time': datetime.utcnow()} 59 | self.db.mfw.update({'_id': hashlib.md5(url).hexdigest()}, {'$set': record}, upsert=False) 60 | 61 | def clear(self): 62 | self.redis_client.flushall() 63 | self.db.mfw.drop() -------------------------------------------------------------------------------- /第三课代码/mongomgr.py: -------------------------------------------------------------------------------- 1 | import mysql.connector 2 | import httplib 3 | import hashlib 4 | import time 5 | import datetime 6 | from datetime import datetime 7 | from datetime import timedelta 8 | from pymongo import MongoClient 9 | 10 | 11 | class MongoUrlManager: 12 | def __init__(self, SERVER_IP = 'localhost', port=27017, client=None): 13 | # if a client object is not passed 14 | # then try connecting to mongodb at the default localhost port 15 | self.client = MongoClient(SERVER_IP, port) if client is None else client 16 | #create collection to store cached webpages, 17 | # which is the equivalent of a table in a relational database 18 | self.db = self.client.spider 19 | 20 | def dequeueUrl(self): 21 | record = self.db.mfw.find_one_and_update( 22 | {'status': 'new'}, 23 | { '$set': { 'status' : 'downloading'} }, 24 | { 'upsert':False, 'returnNewDocument' : False} 25 | ) 26 | if record: 27 | return record 28 | else: 29 | return None 30 | 31 | def enqueueUrl(self, url, status, depth): 32 | try: 33 | self.db.mfw.insert({'_id': url, 'status': status, 'queue_time': datetime.utcnow(), 'depth': depth}) 34 | except Exception, Arguments: 35 | pass 36 | 37 | def finishUrl(self, url, status='done'): 38 | record = {'status': status, 'done_time': datetime.utcnow()} 39 | self.db.mfw.update({'_id': url}, {'$set': record}, upsert=False) 40 | 41 | def clear(self): 42 | self.db.mfw.drop() -------------------------------------------------------------------------------- /第三课代码/mysqlmanager.py: -------------------------------------------------------------------------------- 1 | import mysql.connector 2 | from mysql.connector import errorcode 3 | from mysql.connector import pooling 4 | import httplib 5 | import hashlib 6 | import time 7 | 8 | class 
CrawlDatabaseManager: 9 | 10 | DB_NAME = 'mfw_pro_crawl' 11 | 12 | SERVER_IP = '127.0.0.1' 13 | 14 | TABLES = {} 15 | TABLES['urls'] = ( 16 | "CREATE TABLE `urls` (" 17 | " `index` int(11) NOT NULL AUTO_INCREMENT," 18 | " `url` varchar(512) NOT NULL," 19 | " `md5` varchar(32) NOT NULL," 20 | " `status` varchar(11) NOT NULL DEFAULT 'new'," 21 | " `depth` int(11) NOT NULL," 22 | " `queue_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP," 23 | " `done_time` timestamp NOT NULL DEFAULT 0," 24 | " PRIMARY KEY (`index`)," 25 | " UNIQUE KEY `md5` (`md5`)" 26 | ") ENGINE=InnoDB") 27 | 28 | 29 | def __init__(self, max_num_thread): 30 | try: 31 | cnx = mysql.connector.connect(host=self.SERVER_IP, user='root') 32 | except mysql.connector.Error as err: 33 | if err.errno == errorcode.ER_ACCESS_DENIED_ERROR: 34 | print("Something is wrong with your user name or password") 35 | elif err.errno == errorcode.ER_BAD_DB_ERROR: 36 | print("Database does not exist") 37 | else: 38 | print 'Create Error ' + err.msg 39 | exit(1) 40 | 41 | cursor = cnx.cursor() 42 | 43 | try: 44 | cnx.database = self.DB_NAME 45 | except mysql.connector.Error as err: 46 | if err.errno == errorcode.ER_BAD_DB_ERROR: 47 | self.create_database(cursor) 48 | cnx.database = self.DB_NAME 49 | self.create_tables(cursor) 50 | else: 51 | print(err) 52 | exit(1) 53 | finally: 54 | cursor.close() 55 | cnx.close() 56 | 57 | dbconfig = { 58 | "database": self.DB_NAME, 59 | "user": "root", 60 | "host": self.SERVER_IP, 61 | } 62 | self.cnxpool = mysql.connector.pooling.MySQLConnectionPool(pool_name = "mypool", 63 | pool_size = max_num_thread, 64 | **dbconfig) 65 | 66 | 67 | def create_database(self, cursor): 68 | try: 69 | cursor.execute( 70 | "CREATE DATABASE {} DEFAULT CHARACTER SET 'utf8'".format(self.DB_NAME)) 71 | except mysql.connector.Error as err: 72 | print("Failed creating database: {}".format(err)) 73 | exit(1) 74 | 75 | def create_tables(self, cursor): 76 | for name, ddl in self.TABLES.iteritems(): 77 | try: 78 | cursor.execute(ddl) 79 | except mysql.connector.Error as err: 80 | if err.errno == errorcode.ER_TABLE_EXISTS_ERROR: 81 | print 'create tables error ALREADY EXISTS' 82 | else: 83 | print 'create tables error ' + err.msg 84 | else: 85 | print 'Tables created' 86 | 87 | 88 | def enqueueUrl(self, url, depth): 89 | con = self.cnxpool.get_connection() 90 | cursor = con.cursor() 91 | try: 92 | add_url = ("INSERT INTO urls (url, md5, depth) VALUES (%s, %s, %s)") 93 | data_url = (url, hashlib.md5(url).hexdigest(), depth) 94 | cursor.execute(add_url, data_url) 95 | con.commit() 96 | except mysql.connector.Error as err: 97 | # print 'enqueueUrl() ' + err.msg 98 | return 99 | finally: 100 | cursor.close() 101 | con.close() 102 | 103 | 104 | def dequeueUrl(self): 105 | con = self.cnxpool.get_connection() 106 | cursor = con.cursor(dictionary=True) 107 | try: 108 | query = ("SELECT `index`, `url`, `depth` FROM urls WHERE status='new' ORDER BY `index` ASC LIMIT 1 FOR UPDATE") 109 | cursor.execute(query) 110 | if cursor.rowcount is 0: 111 | return None 112 | row = cursor.fetchone() 113 | update_query = ("UPDATE urls SET `status`='downloading' WHERE `index`=%d") % (row['index']) 114 | cursor.execute(update_query) 115 | con.commit() 116 | return row 117 | except mysql.connector.Error as err: 118 | # print 'dequeueUrl() ' + err.msg 119 | return None 120 | finally: 121 | cursor.close() 122 | con.close() 123 | 124 | def finishUrl(self, index): 125 | con = self.cnxpool.get_connection() 126 | cursor = con.cursor() 127 | try: 128 | # we don't need to 
update done_time using time.strftime('%Y-%m-%d %H:%M:%S') as it's auto updated 129 | update_query = ("UPDATE urls SET `status`='done', `done_time`=%s WHERE `index`=%d") % (time.strftime('%Y-%m-%d %H:%M:%S'), index) 130 | cursor.execute(update_query) 131 | con.commit() 132 | except mysql.connector.Error as err: 133 | # print 'finishUrl() ' + err.msg 134 | return 135 | finally: 136 | cursor.close() 137 | con.close() 138 | -------------------------------------------------------------------------------- /第三课代码/process_crawl.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | from collections import deque 3 | import json 4 | from lxml import etree 5 | import httplib 6 | import hashlib 7 | import thread 8 | import threading 9 | import time 10 | from mysqlmanager import CrawlDatabaseManager 11 | 12 | from hdfs import * 13 | from hdfs.util import HdfsError 14 | 15 | from mysql.connector import errorcode 16 | import mysql.connector 17 | 18 | request_headers = { 19 | 'host': "www.mafengwo.cn", 20 | 'connection': "keep-alive", 21 | 'cache-control': "no-cache", 22 | 'upgrade-insecure-requests': "1", 23 | 'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36", 24 | 'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 25 | 'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6" 26 | } 27 | 28 | def get_page_content(cur_url, index, depth): 29 | print "downloading %s at level %d" % (cur_url, depth) 30 | try: 31 | req = urllib2.Request(cur_url, headers=request_headers) 32 | response = urllib2.urlopen(req) 33 | html_page = response.read() 34 | filename = cur_url[7:].replace('/', '_') 35 | 36 | Write page to local files system 37 | fo = open("%s%s.html" % (dir_name, filename), 'wb+') 38 | fo.write(html_page) 39 | fo.close() 40 | 41 | # Write to HDFS 42 | # with hdfs_client.write('/htmls/mfw/%s.html' % (filename)) as writer: 43 | # writer.write(html_page) 44 | 45 | dbmanager.finishUrl(index) 46 | except urllib2.HTTPError, Arguments: 47 | print Arguments 48 | return 49 | except httplib.BadStatusLine, Arguments: 50 | print Arguments 51 | return 52 | except IOError, Arguments: 53 | print Arguments 54 | return 55 | except HdfsError, Arguments: 56 | print Arguments 57 | except Exception, Arguments: 58 | print Arguments 59 | return 60 | # print 'add ' + hashlib.md5(cur_url).hexdigest() + ' to list' 61 | 62 | html = etree.HTML(html_page.lower().decode('utf-8')) 63 | hrefs = html.xpath(u"//a") 64 | 65 | for href in hrefs: 66 | try: 67 | if 'href' in href.attrib: 68 | val = href.attrib['href'] 69 | if val.find('javascript:') != -1: 70 | continue 71 | if val.startswith('http://') is False: 72 | if val.startswith('/'): 73 | val = 'http://www.mafengwo.cn' + val 74 | else: 75 | continue 76 | if val[-1] == '/': 77 | val = val[0:-1] 78 | dbmanager.enqueueUrl(val, 'new', depth + 1) 79 | 80 | except ValueError: 81 | continue 82 | 83 | 84 | max_num_thread = 5 85 | dbmanager = CrawlDatabaseManager(max_num_thread) 86 | 87 | dir_name = 'dir_process/' 88 | 89 | dbmanager.enqueueUrl("http://www.mafengwo.cn", 0) 90 | start_time = time.time() 91 | is_root_page = True 92 | threads = [] 93 | 94 | CRAWL_DELAY = 0.6 95 | 96 | # hdfs_client = InsecureClient('http://54.223.92.169:50070', user='ec2-user') 97 | 98 | while True: 99 | curtask = dbmanager.dequeueUrl() 100 | # Go on next level, before that, needs to wait all current level crawling done 101 | if curtask is None: 102 | 
for t in threads: 103 | t.join() 104 | break 105 | 106 | # looking for an empty thread from pool to crawl 107 | 108 | if is_root_page is True: 109 | get_page_content(curtask['url'], curtask['index'], curtask['depth']) 110 | is_root_page = False 111 | else: 112 | while True: 113 | # first remove all finished running threads 114 | for t in threads: 115 | if not t.is_alive(): 116 | threads.remove(t) 117 | if len(threads) >= max_num_thread: 118 | time.sleep(CRAWL_DELAY) 119 | continue 120 | try: 121 | t = threading.Thread(target=get_page_content, name=None, args=(curtask['url'], curtask['index'], curtask['depth'])) 122 | threads.append(t) 123 | # set daemon so main thread can exit when receives ctrl-c 124 | t.setDaemon(True) 125 | t.start() 126 | time.sleep(CRAWL_DELAY) 127 | break 128 | except Exception: 129 | print "Error: unable to start thread" 130 | 131 | cursor.close() 132 | cnx.close() 133 | -------------------------------------------------------------------------------- /第三课代码/readme.txt: -------------------------------------------------------------------------------- 1 | python spider_process_mongo.py 2 | 仅使用了 mongo db 3 | 需要在本机启动mongo,使用默认端口,可以在创建 MongoUrlManager 的时候设置Mongo服务器和端口 4 | 5 | python spider_process_mr.py 6 | 同时用了 mongo 与 reids 7 | 需要在本机启动mongo及redis,使用默认端口,可以在创建 MongoRedisUrlManager 的时候设置Mongo服务器和端口 -------------------------------------------------------------------------------- /第三课代码/spider_process_mongo.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | import httplib 3 | from lxml import etree 4 | import thread 5 | import threading 6 | import os 7 | import time 8 | from mongomgr import MongoUrlManager 9 | 10 | from hdfs import * 11 | from hdfs.util import HdfsError 12 | 13 | request_headers = { 14 | 'host': "www.mafengwo.cn", 15 | 'connection': "keep-alive", 16 | 'cache-control': "no-cache", 17 | 'upgrade-insecure-requests': "1", 18 | 'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36", 19 | 'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 20 | 'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6" 21 | } 22 | 23 | dir_path = './mafengwo' 24 | 25 | def save_page_content(html, filename): 26 | if os.path.exists(dir_path) is False: 27 | os.mkdir(dir_path) 28 | fo = open("%s/%s.html" % (dir_path, filename), 'wb+') 29 | fo.write(html) 30 | fo.close() 31 | 32 | def get_page_content(cur_url, depth): 33 | print "downloading %s at level %d" % (cur_url, depth) 34 | try: 35 | req = urllib2.Request(cur_url, headers=request_headers) 36 | response = urllib2.urlopen(req) 37 | html_page = response.read() 38 | filename = cur_url[7:].replace('/', '_') 39 | 40 | #Write page to local files system 41 | save_page_content(html_page, filename) 42 | 43 | # Write to HDFS 44 | # with hdfs_client.write('/htmls/mfw/%s.html' % (filename)) as writer: 45 | # writer.write(html_page) 46 | 47 | dbmanager.finishUrl(cur_url) 48 | except urllib2.HTTPError, Arguments: 49 | print Arguments 50 | return 51 | except httplib.BadStatusLine, Arguments: 52 | print Arguments 53 | return 54 | except IOError, Arguments: 55 | print Arguments 56 | return 57 | except HdfsError, Arguments: 58 | print Arguments 59 | except Exception, Arguments: 60 | print Arguments 61 | return 62 | # print 'add ' + hashlib.md5(cur_url).hexdigest() + ' to list' 63 | 64 | html = etree.HTML(html_page.lower().decode('utf-8')) 65 | hrefs = html.xpath(u"//a") 66 | 67 | for 
href in hrefs: 68 | try: 69 | if 'href' in href.attrib: 70 | val = href.attrib['href'] 71 | if val.find('javascript:') != -1: 72 | continue 73 | if val.startswith('http://') is False: 74 | if val.startswith('/'): 75 | val = 'http://www.mafengwo.cn' + val 76 | else: 77 | continue 78 | if val[-1] == '/': 79 | val = val[0:-1] 80 | dbmanager.enqueueUrl(val, 'new', depth + 1) 81 | except ValueError: 82 | continue 83 | 84 | 85 | max_num_thread = 5 86 | dbmanager = MongoUrlManager() 87 | 88 | dbmanager.enqueueUrl("http://www.mafengwo.cn", 'new', 0) 89 | 90 | start_time = time.time() 91 | is_root_page = True 92 | threads = [] 93 | 94 | CRAWL_DELAY = 0.6 95 | 96 | # use hdfs to save pages 97 | # hdfs_client = InsecureClient('http://54.223.92.169:50070', user='ec2-user') 98 | 99 | while True: 100 | curtask = dbmanager.dequeueUrl() 101 | # Go on next level, before that, needs to wait all current level crawling done 102 | if curtask is None: 103 | print 'No task available!' 104 | for t in threads: 105 | t.join() 106 | break 107 | 108 | # looking for an empty thread from pool to crawl 109 | 110 | if is_root_page is True: 111 | get_page_content(curtask['_id'], curtask['depth']) 112 | is_root_page = False 113 | else: 114 | while True: 115 | # first remove all finished running threads 116 | for t in threads: 117 | if not t.is_alive(): 118 | threads.remove(t) 119 | if len(threads) >= max_num_thread: 120 | time.sleep(CRAWL_DELAY) 121 | continue 122 | try: 123 | t = threading.Thread(target=get_page_content, name=None, args=(curtask['_id'], curtask['depth'])) 124 | threads.append(t) 125 | # set daemon so main thread can exit when receives ctrl-c 126 | t.setDaemon(True) 127 | t.start() 128 | time.sleep(CRAWL_DELAY) 129 | break 130 | except Exception as error: 131 | print 'Unable to start thread: ' + error.message 132 | break -------------------------------------------------------------------------------- /第三课代码/spider_process_mr.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | import httplib 3 | from lxml import etree 4 | import thread 5 | import threading 6 | import os 7 | import time 8 | from mongo_redis_mgr import MongoRedisUrlManager 9 | 10 | from hdfs import * 11 | from hdfs.util import HdfsError 12 | 13 | request_headers = { 14 | 'host': "www.mafengwo.cn", 15 | 'connection': "keep-alive", 16 | 'cache-control': "no-cache", 17 | 'upgrade-insecure-requests': "1", 18 | 'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36", 19 | 'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 20 | 'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6" 21 | } 22 | 23 | dir_path = 'mafengwo' 24 | 25 | def save_page_content(html, filename): 26 | if os.path.exists(dir_path) is False: 27 | os.mkdir(dir_path) 28 | fo = open("%s/%s.html" % (dir_path, filename), 'wb+') 29 | fo.write(html) 30 | fo.close() 31 | 32 | def get_page_content(cur_url, depth): 33 | print "downloading %s at level %d" % (cur_url, depth) 34 | try: 35 | req = urllib2.Request(cur_url, headers=request_headers) 36 | response = urllib2.urlopen(req) 37 | html_page = response.read() 38 | filename = cur_url[7:].replace('/', '_') 39 | 40 | #Write page to local files system 41 | save_page_content(html_page, filename) 42 | 43 | # Write to HDFS 44 | # with hdfs_client.write('/htmls/mfw/%s.html' % (filename)) as writer: 45 | # writer.write(html_page) 46 | 47 | dbmanager.finishUrl(cur_url) 48 | except 
urllib2.HTTPError, Arguments: 49 | print Arguments 50 | return 51 | except httplib.BadStatusLine, Arguments: 52 | print Arguments 53 | return 54 | except IOError, Arguments: 55 | print Arguments 56 | return 57 | except HdfsError, Arguments: 58 | print Arguments 59 | except Exception, Arguments: 60 | print Arguments 61 | return 62 | # print 'add ' + hashlib.md5(cur_url).hexdigest() + ' to list' 63 | 64 | html = etree.HTML(html_page.lower().decode('utf-8')) 65 | hrefs = html.xpath(u"//a") 66 | 67 | for href in hrefs: 68 | try: 69 | if 'href' in href.attrib: 70 | val = href.attrib['href'] 71 | if val.find('javascript:') != -1: 72 | continue 73 | if val.startswith('http://') is False: 74 | if val.startswith('/'): 75 | val = 'http://www.mafengwo.cn' + val 76 | else: 77 | continue 78 | if val[-1] == '/': 79 | val = val[0:-1] 80 | dbmanager.enqueueUrl(val, 'new', depth + 1) 81 | except ValueError: 82 | continue 83 | 84 | 85 | max_num_thread = 5 86 | dbmanager = MongoRedisUrlManager() 87 | 88 | dbmanager.enqueueUrl("http://www.mafengwo.cn", 'new', 0) 89 | 90 | start_time = time.time() 91 | is_root_page = True 92 | threads = [] 93 | 94 | CRAWL_DELAY = 0.6 95 | 96 | # use hdfs to save pages 97 | # hdfs_client = InsecureClient('http://54.223.92.169:50070', user='ec2-user') 98 | 99 | while True: 100 | curtask = dbmanager.dequeueUrl() 101 | print curtask 102 | # Go on next level, before that, needs to wait all current level crawling done 103 | if curtask is None: 104 | print 'No task available!' 105 | for t in threads: 106 | t.join() 107 | break 108 | 109 | # looking for an empty thread from pool to crawl 110 | 111 | if is_root_page is True: 112 | get_page_content(curtask['url'], curtask['depth']) 113 | is_root_page = False 114 | else: 115 | while True: 116 | # first remove all finished running threads 117 | for t in threads: 118 | if not t.is_alive(): 119 | threads.remove(t) 120 | if len(threads) >= max_num_thread: 121 | time.sleep(CRAWL_DELAY) 122 | continue 123 | try: 124 | t = threading.Thread(target=get_page_content, name=None, args=(curtask['url'], curtask['depth'])) 125 | threads.append(t) 126 | # set daemon so main thread can exit when receives ctrl-c 127 | t.setDaemon(True) 128 | t.start() 129 | time.sleep(CRAWL_DELAY) 130 | break 131 | except Exception, Arguments: 132 | print 'Unable to start thread: ' + error.message 133 | break 134 | 135 | # clear redis and mongo 136 | # dbmanager.clear() -------------------------------------------------------------------------------- /第二讲 爬虫基本原理、搭建第一个爬虫.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangguo7/distributed_crawler/0557783408cffa22d197a439baa0dc5a3efe724f/第二讲 爬虫基本原理、搭建第一个爬虫.pdf -------------------------------------------------------------------------------- /第二讲代码.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhangguo7/distributed_crawler/0557783408cffa22d197a439baa0dc5a3efe724f/第二讲代码.zip -------------------------------------------------------------------------------- /第二讲代码/dbmanager.py: -------------------------------------------------------------------------------- 1 | import mysql.connector 2 | import hashlib 3 | from mysql.connector import errorcode 4 | 5 | 6 | class CrawlDatabaseManager: 7 | 8 | DB_NAME = 'mfw_pro_crawl' 9 | 10 | SERVER_IP = 'localhost' 11 | 12 | TABLES = {} 13 | # create new table, using sql 14 | TABLES['urls'] = ( 15 | "CREATE TABLE `urls` (" 16 | " `index` int(11) NOT NULL 
AUTO_INCREMENT," # index of queue 17 | " `url` varchar(512) NOT NULL," 18 | " `md5` varchar(16) NOT NULL," 19 | " `status` varchar(11) NOT NULL DEFAULT 'new'," # could be new, downloading and finish 20 | " `depth` int(11) NOT NULL," 21 | " `queue_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP," 22 | " `done_time` timestamp NOT NULL DEFAULT 0 ON UPDATE CURRENT_TIMESTAMP," 23 | " PRIMARY KEY (`index`)," 24 | " UNIQUE KEY `md5` (`md5`)" 25 | ") ENGINE=InnoDB") 26 | 27 | 28 | def __init__(self, max_num_thread): 29 | # connect mysql server 30 | try: 31 | cnx = mysql.connector.connect(host=self.SERVER_IP, user='root') 32 | except mysql.connector.Error as err: 33 | if err.errno == errorcode.ER_ACCESS_DENIED_ERROR: 34 | print "Something is wrong with your user name or password" 35 | elif err.errno == errorcode.ER_BAD_DB_ERROR: 36 | print "Database does not exist" 37 | else: 38 | print 'Create Error ' + err.msg 39 | exit(1) 40 | 41 | cursor = cnx.cursor() 42 | 43 | # use database, create it if not exist 44 | try: 45 | cnx.database = self.DB_NAME 46 | except mysql.connector.Error as err: 47 | if err.errno == errorcode.ER_BAD_DB_ERROR: 48 | # create database and table 49 | self.create_database(cursor) 50 | cnx.database = self.DB_NAME 51 | self.create_tables(cursor) 52 | else: 53 | print err 54 | exit(1) 55 | finally: 56 | cursor.close() 57 | cnx.close() 58 | 59 | dbconfig = { 60 | "database": self.DB_NAME, 61 | "user": "root", 62 | "host": self.SERVER_IP, 63 | } 64 | self.cnxpool = mysql.connector.pooling.MySQLConnectionPool(pool_name="mypool", 65 | pool_size=max_num_thread, 66 | **dbconfig) 67 | 68 | 69 | # create databse 70 | def create_database(self, cursor): 71 | try: 72 | cursor.execute( 73 | "CREATE DATABASE {} DEFAULT CHARACTER SET 'utf8'".format(self.DB_NAME)) 74 | except mysql.connector.Error as err: 75 | print "Failed creating database: {}".format(err) 76 | exit(1) 77 | 78 | def create_tables(self, cursor): 79 | for name, ddl in self.TABLES.iteritems(): 80 | try: 81 | cursor.execute(ddl) 82 | except mysql.connector.Error as err: 83 | if err.errno == errorcode.ER_TABLE_EXISTS_ERROR: 84 | print 'create tables error ALREADY EXISTS' 85 | else: 86 | print 'create tables error ' + err.msg 87 | else: 88 | print 'Tables created' 89 | 90 | 91 | # put an url into queue 92 | def enqueueUrl(self, url, depth): 93 | con = self.cnxpool.get_connection() 94 | cursor = con.cursor() 95 | try: 96 | add_url = ("INSERT INTO urls (url, md5, depth) VALUES (%s, %s, %s)") 97 | data_url = (url, hashlib.md5(url).hexdigest(), depth) 98 | cursor.execute(add_url, data_url) 99 | # commit this transaction, please refer to "mysql transaction" for more info 100 | con.commit() 101 | except mysql.connector.Error as err: 102 | # print 'enqueueUrl() ' + err.msg 103 | return 104 | finally: 105 | cursor.close() 106 | con.close() 107 | 108 | 109 | # get an url from queue 110 | def dequeueUrl(self): 111 | con = self.cnxpool.get_connection() 112 | cursor = con.cursor(dictionary=True) 113 | try: 114 | # use select * for update to lock the rows for read 115 | query = ("SELECT `index`, `url`, `depth` FROM urls WHERE status='new' ORDER BY `index` ASC LIMIT 1 FOR UPDATE") 116 | cursor.execute(query) 117 | if cursor.rowcount is 0: 118 | return None 119 | row = cursor.fetchone() 120 | update_query = ("UPDATE urls SET `status`='downloading' WHERE `index`=%d") % (row['index']) 121 | cursor.execute(update_query) 122 | con.commit() 123 | return row 124 | except mysql.connector.Error as err: 125 | # print 'dequeueUrl() ' + err.msg 126 | 
return None 127 | finally: 128 | cursor.close() 129 | con.close() 130 | 131 | def finishUrl(self, index): 132 | con = self.cnxpool.get_connection() 133 | cursor = con.cursor() 134 | try: 135 | # we don't need to update done_time using time.strftime('%Y-%m-%d %H:%M:%S') as it's auto updated 136 | update_query = ("UPDATE urls SET `status`='done' WHERE `index`=%d") % (index) 137 | cursor.execute(update_query) 138 | con.commit() 139 | except mysql.connector.Error as err: 140 | # print 'finishUrl() ' + err.msg 141 | return 142 | finally: 143 | cursor.close() 144 | con.close() 145 | -------------------------------------------------------------------------------- /第二讲代码/lxml_test.py: -------------------------------------------------------------------------------- 1 | import lxml 2 | from lxml import html 3 | from lxml import etree 4 | 5 | from bs4 import BeautifulSoup 6 | 7 | f = open('jd.com_2131674.html', 'r') 8 | content = f.read() 9 | 10 | tree = etree.HTML(content.decode('utf-8')) 11 | 12 | print '--------------------------------------------' 13 | print '# different quote //*[@class="p-price J-p-2131674"' 14 | print '--------------------------------------------' 15 | print tree.xpath(u"//*[@class='p-price J-p-2131674']") 16 | print '' 17 | 18 | print '--------------------------------------------' 19 | print '# partial match ' + "//*[@class='J-p-2131674']" 20 | print '--------------------------------------------' 21 | print tree.xpath(u"//*[@class='J-p-2131674']") 22 | print '' 23 | 24 | print '--------------------------------------------' 25 | print '# exactly match class string ' + '//*[@class="p-price J-p-2131674"]' 26 | print '--------------------------------------------' 27 | print tree.xpath(u'//*[@class="p-price J-p-2131674"]') 28 | print '' 29 | 30 | print '--------------------------------------------' 31 | print '# use contain ' + "//*[contains(@class, 'J-p-2131674')]" 32 | print '--------------------------------------------' 33 | print tree.xpath(u"//*[contains(@class, 'J-p-2131674')]") 34 | print '' 35 | 36 | 37 | print '--------------------------------------------' 38 | print '# specify tag name ' + "//strong[contains(@class, 'J-p-2131674')]" 39 | print '--------------------------------------------' 40 | print tree.xpath(u"//strong[contains(@class, 'J-p-2131674')]") 41 | print '' 42 | 43 | print '--------------------------------------------' 44 | print '# css selector with tag' + "cssselect('strong.J-p-2131674')" 45 | print '--------------------------------------------' 46 | htree = lxml.html.fromstring(content) 47 | print htree.cssselect('strong.J-p-2131674') 48 | print '' 49 | 50 | print '--------------------------------------------' 51 | print '# css selector without tag, partial match' + "cssselect('.J-p-2131674')" 52 | print '--------------------------------------------' 53 | htree = lxml.html.fromstring(content) 54 | elements = htree.cssselect('.J-p-2131674') 55 | print elements 56 | print '' 57 | 58 | print '--------------------------------------------' 59 | print '# attrib and text' 60 | print '--------------------------------------------' 61 | for element in tree.xpath(u"//strong[contains(@class, 'J-p-2131674')]"): 62 | print element.text 63 | print element.attrib 64 | print '' 65 | 66 | print '--------------------------------------------' 67 | print '########## use BeautifulSoup ##############' 68 | print '--------------------------------------------' 69 | print '# loading content to BeautifulSoup' 70 | soup = BeautifulSoup(content, 'html.parser') 71 | print '# loaded, 
show result' 72 | print soup.find(attrs={'class':'J-p-2131674'}).text 73 | 74 | f.close() 75 | -------------------------------------------------------------------------------- /第二讲代码/multi_thread_mfw.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | from collections import deque 3 | import json 4 | from lxml import etree 5 | import httplib 6 | import hashlib 7 | from pybloomfilter import BloomFilter 8 | import thread 9 | import threading 10 | import time 11 | 12 | 13 | class CrawlBSF: 14 | request_headers = { 15 | 'host': "www.mafengwo.cn", 16 | 'connection': "keep-alive", 17 | 'cache-control': "no-cache", 18 | 'upgrade-insecure-requests': "1", 19 | 'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36", 20 | 'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 21 | 'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6" 22 | } 23 | 24 | cur_level = 0 25 | max_level = 5 26 | dir_name = 'iterate/' 27 | iter_width = 50 28 | downloaded_urls = [] 29 | 30 | du_md5_file_name = dir_name + 'download.txt' 31 | du_url_file_name = dir_name + 'urls.txt' 32 | 33 | bloom_downloaded_urls = BloomFilter(1024 * 1024 * 16, 0.01) 34 | bloom_url_queue = BloomFilter(1024 * 1024 * 16, 0.01) 35 | 36 | cur_queue = deque() 37 | child_queue = deque() 38 | 39 | def __init__(self, url): 40 | self.root_url = url 41 | self.cur_queue.append(url) 42 | self.du_file = open(self.du_url_file_name, 'a+') 43 | try: 44 | self.dumd5_file = open(self.du_md5_file_name, 'r') 45 | self.downloaded_urls = self.dumd5_file.readlines() 46 | self.dumd5_file.close() 47 | for urlmd5 in self.downloaded_urls: 48 | self.bloom_downloaded_urls.add(urlmd5[:-2]) 49 | except IOError: 50 | print "File not found" 51 | finally: 52 | self.dumd5_file = open(self.du_md5_file_name, 'a+') 53 | 54 | def enqueueUrl(self, url): 55 | if url not in self.bloom_url_queue and hashlib.md5(url).hexdigest() not in crawler.bloom_downloaded_urls: 56 | self.child_queue.append(url) 57 | self.bloom_url_queue.add(url) 58 | 59 | def dequeuUrl(self): 60 | try: 61 | url = self.cur_queue.popleft() 62 | return url 63 | except IndexError: 64 | return None 65 | 66 | def close(self): 67 | self.dumd5_file.close() 68 | self.du_file.close() 69 | 70 | 71 | num_downloaded_pages = 0 72 | 73 | 74 | #download the page content 75 | def get_page_content(cur_url): 76 | global num_downloaded_pages 77 | print "downloading %s at level %d" % (cur_url, crawler.cur_level) 78 | try: 79 | req = urllib2.Request(cur_url, headers=crawler.request_headers) 80 | response = urllib2.urlopen(req) 81 | html_page = response.read() 82 | filename = cur_url[7:].replace('/', '_') 83 | fo = open("%s%s.html" % (crawler.dir_name, filename), 'wb+') 84 | fo.write(html_page) 85 | fo.close() 86 | except urllib2.HTTPError, Arguments: 87 | print Arguments 88 | return 89 | except httplib.BadStatusLine, Arguments: 90 | print Arguments 91 | return 92 | except IOError, Arguments: 93 | print Arguments 94 | return 95 | except Exception, Arguments: 96 | print Arguments 97 | return 98 | # print 'add ' + hashlib.md5(cur_url).hexdigest() + ' to list' 99 | 100 | # save page and set bloomfilter 101 | dumd5 = hashlib.md5(cur_url).hexdigest() 102 | crawler.downloaded_urls.append(dumd5) 103 | crawler.dumd5_file.write(dumd5 + '\r\n') 104 | crawler.du_file.write(cur_url + '\r\n') 105 | crawler.bloom_downloaded_urls.add(dumd5) 106 | num_downloaded_pages += 1 107 | 108 | html = 
etree.HTML(html_page.lower().decode('utf-8')) 109 | hrefs = html.xpath(u"//a") 110 | 111 | for href in hrefs: 112 | try: 113 | if 'href' in href.attrib: 114 | val = href.attrib['href'] 115 | if val.find('javascript:') != -1: 116 | continue 117 | if val.startswith('http://') is False: 118 | if val.startswith('/'): 119 | val = 'http://www.mafengwo.cn' + val 120 | else: 121 | continue 122 | if val[-1] == '/': 123 | val = val[0:-1] 124 | # if hashlib.md5(val).hexdigest() not in self.downloaded_urls: 125 | crawler.enqueueUrl(val) 126 | # else: 127 | # print 'Skip %s' % (val) 128 | except ValueError: 129 | continue 130 | 131 | 132 | crawler = CrawlBSF("http://www.mafengwo.cn") 133 | start_time = time.time() 134 | 135 | # if it's the first page (start url), if true, crawl it in main thread in sync(blocking) mode 136 | # 如果是第一个抓取页面的话,在主线程用同步(阻塞)的模式下载,后续的页面会通过创建子线程的方式异步爬取 137 | is_root_page = True 138 | threads = [] 139 | max_threads = 10 140 | 141 | CRAWL_DELAY = 0.6 142 | 143 | while True: 144 | url = crawler.dequeuUrl() 145 | # Go on next level, before that, needs to wait all current level crawling done 146 | if url is None: 147 | crawler.cur_level += 1 148 | for t in threads: 149 | t.join() 150 | if crawler.cur_level == crawler.max_level: 151 | break 152 | if len(crawler.child_queue) == 0: 153 | break 154 | crawler.cur_queue = crawler.child_queue 155 | crawler.child_queue = deque() 156 | continue 157 | 158 | 159 | # looking for an empty thread from pool to crawl 160 | 161 | if is_root_page is True: 162 | get_page_content(url) 163 | is_root_page = False 164 | else: 165 | while True: 166 | # first remove all finished running threads 167 | for t in threads: 168 | if not t.is_alive(): 169 | threads.remove(t) 170 | if len(threads) >= max_threads: 171 | time.sleep(CRAWL_DELAY) 172 | continue 173 | try: 174 | t = threading.Thread(target=get_page_content, name=None, args=(url,)) 175 | threads.append(t) 176 | # set daemon so main thread can exit when receives ctrl-c 177 | t.setDaemon(True) 178 | t.start() 179 | time.sleep(CRAWL_DELAY) 180 | break 181 | except Exception: 182 | print "Error: unable to start thread" 183 | 184 | print '%d pages downloaded, time cost %0.2f seconds' % (num_downloaded_pages, time.time()-start_time) 185 | -------------------------------------------------------------------------------- /第二讲代码/process_crawl.py: -------------------------------------------------------------------------------- 1 | import urllib2 2 | from collections import deque 3 | import json 4 | from lxml import etree 5 | import httplib 6 | import hashlib 7 | from pybloomfilter import BloomFilter 8 | import thread 9 | import threading 10 | import time 11 | from dbmanager import CrawlDatabaseManager 12 | 13 | from mysql.connector import errorcode 14 | import mysql.connector 15 | 16 | request_headers = { 17 | 'host': "www.mafengwo.cn", 18 | 'connection': "keep-alive", 19 | 'cache-control': "no-cache", 20 | 'upgrade-insecure-requests': "1", 21 | 'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36", 22 | 'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", 23 | 'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6" 24 | } 25 | 26 | def get_page_content(cur_url, index, depth): 27 | print "downloading %s at level %d" % (cur_url, depth) 28 | try: 29 | req = urllib2.Request(cur_url, headers=request_headers) 30 | response = urllib2.urlopen(req) 31 | html_page = response.read() 32 | filename = 
33 |         fo = open("%s%s.html" % (dir_name, filename), 'wb+')
34 |         fo.write(html_page)
35 |         fo.close()
36 |         dbmanager.finishUrl(index)
37 |     except urllib2.HTTPError, Arguments:
38 |         print Arguments
39 |         return
40 |     except httplib.BadStatusLine, Arguments:
41 |         print Arguments
42 |         return
43 |     except IOError, Arguments:
44 |         print Arguments
45 |         return
46 |     except Exception, Arguments:
47 |         print Arguments
48 |         return
49 |     # print 'add ' + hashlib.md5(cur_url).hexdigest() + ' to list'
50 | 
51 |     html = etree.HTML(html_page.lower().decode('utf-8'))
52 |     hrefs = html.xpath(u"//a")
53 | 
54 |     for href in hrefs:
55 |         try:
56 |             if 'href' in href.attrib:
57 |                 val = href.attrib['href']
58 |                 if val.find('javascript:') != -1:
59 |                     continue
60 |                 if val.startswith('http://') is False:
61 |                     if val.startswith('/'):
62 |                         val = 'http://www.mafengwo.cn' + val
63 |                     else:
64 |                         continue
65 |                 if val[-1] == '/':
66 |                     val = val[0:-1]
67 |                 dbmanager.enqueueUrl(val, depth + 1)
68 | 
69 |         except ValueError:
70 |             continue
71 | 
72 | 
73 | max_num_thread = 5
74 | 
75 | # create an instance of the MySQL database manager, which is used as the crawling queue
76 | dbmanager = CrawlDatabaseManager(max_num_thread)
77 | 
78 | # dir for saving HTML files (must exist before running)
79 | dir_name = 'dir_process/'
80 | 
81 | # put the first page into the queue
82 | dbmanager.enqueueUrl("http://www.mafengwo.cn", 0)
83 | start_time = time.time()
84 | is_root_page = True
85 | threads = []
86 | 
87 | # time delay before a new crawling thread is created;
88 | # use it to control the crawl rate and avoid visiting the
89 | # target website too frequently
90 | CRAWL_DELAY = 0.6
91 | 
92 | 
93 | while True:
94 |     curtask = dbmanager.dequeueUrl()
95 |     # queue is exhausted: wait for all running threads to finish, then stop
96 |     if curtask is None:
97 |         for t in threads:
98 |             t.join()
99 |         break
100 | 
101 |     # wait for a free slot in the thread pool, then crawl this url in a new thread
102 | 
103 |     if is_root_page is True:
104 |         get_page_content(curtask['url'], curtask['index'], curtask['depth'])
105 |         is_root_page = False
106 |     else:
107 |         while True:
108 |             # first drop all finished threads; rebuild the list instead of
109 |             # calling remove() while iterating, which would skip elements
110 |             threads = [t for t in threads if t.is_alive()]
111 | 
112 |             if len(threads) >= max_num_thread:
113 |                 time.sleep(CRAWL_DELAY)
114 |                 continue
115 |             try:
116 |                 t = threading.Thread(target=get_page_content, name=None, args=(curtask['url'], curtask['index'], curtask['depth']))
117 |                 threads.append(t)
118 |                 # set daemon so the main thread can exit when it receives ctrl-c
119 |                 t.setDaemon(True)
120 |                 t.start()
121 |                 time.sleep(CRAWL_DELAY)
122 |                 break
123 |             except Exception:
124 |                 print "Error: unable to start thread"
125 | 
126 | # the database connection is owned by dbmanager, so there is nothing else to close here
127 | print 'crawl finished, time cost %0.2f seconds' % (time.time() - start_time)
128 | 
--------------------------------------------------------------------------------
/第五讲 PageRank、动态重拍、避开网站反爬的技术.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangguo7/distributed_crawler/0557783408cffa22d197a439baa0dc5a3efe724f/第五讲 PageRank、动态重拍、避开网站反爬的技术.pdf
--------------------------------------------------------------------------------
/第四讲 爬虫与反爬虫的对抗.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangguo7/distributed_crawler/0557783408cffa22d197a439baa0dc5a3efe724f/第四讲 爬虫与反爬虫的对抗.pdf
--------------------------------------------------------------------------------
/第四讲代码.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangguo7/distributed_crawler/0557783408cffa22d197a439baa0dc5a3efe724f/第四讲代码.zip
--------------------------------------------------------------------------------
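
Note on process_crawl.py: it relies on CrawlDatabaseManager from 第二讲代码/dbmanager.py (enqueueUrl, dequeueUrl, finishUrl), whose source is not reproduced in this listing. The sketch below is only a rough illustration of that interface using a MySQL table as the shared URL queue; the table name, column names, connection settings and the single-connection-plus-lock design are assumptions for illustration, not the repo's actual implementation. A shared database queue of this kind is what would let several crawler processes, potentially on different machines, work from one crawl frontier.

# NOTE: hypothetical sketch only -- not the repo's dbmanager.py
import hashlib
import threading

import mysql.connector


class CrawlDatabaseManager:

    def __init__(self, max_num_thread):
        # a single shared connection guarded by a lock keeps this sketch simple;
        # the real dbmanager.py may instead size a connection pool by max_num_thread
        self.max_num_thread = max_num_thread
        self.lock = threading.Lock()
        # connection settings below are placeholders
        self.cnx = mysql.connector.connect(host='127.0.0.1', user='crawler',
                                           password='crawler', database='crawler')
        cursor = self.cnx.cursor()
        # one row per url; status: 0 = pending, 1 = being crawled, 2 = finished
        cursor.execute(
            "CREATE TABLE IF NOT EXISTS url_queue ("
            " idx INT AUTO_INCREMENT PRIMARY KEY,"
            " url VARCHAR(1024) NOT NULL,"
            " md5 CHAR(32) NOT NULL UNIQUE,"
            " depth INT NOT NULL,"
            " status INT NOT NULL DEFAULT 0)")
        self.cnx.commit()
        cursor.close()

    def enqueueUrl(self, url, depth):
        # INSERT IGNORE plus the UNIQUE md5 column dedupes urls that were already
        # queued or crawled, mirroring the bloom-filter check in multi_thread_mfw.py
        with self.lock:
            cursor = self.cnx.cursor()
            cursor.execute(
                "INSERT IGNORE INTO url_queue (url, md5, depth) VALUES (%s, %s, %s)",
                (url, hashlib.md5(url).hexdigest(), depth))
            self.cnx.commit()
            cursor.close()

    def dequeueUrl(self):
        # pop one pending url, mark it as being crawled, and return it in the
        # dict shape process_crawl.py expects, or None when the queue is empty
        with self.lock:
            cursor = self.cnx.cursor(dictionary=True)
            cursor.execute("SELECT idx, url, depth FROM url_queue WHERE status = 0 LIMIT 1")
            row = cursor.fetchone()
            if row is None:
                cursor.close()
                return None
            cursor.execute("UPDATE url_queue SET status = 1 WHERE idx = %s", (row['idx'],))
            self.cnx.commit()
            cursor.close()
            return {'index': row['idx'], 'url': row['url'], 'depth': row['depth']}

    def finishUrl(self, index):
        # called by the crawler after a page has been downloaded and saved
        with self.lock:
            cursor = self.cnx.cursor()
            cursor.execute("UPDATE url_queue SET status = 2 WHERE idx = %s", (index,))
            self.cnx.commit()
            cursor.close()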