├── README.md
├── 公开课代码.rar
├── 公开课代码
│   ├── autohome.py
│   ├── baidu_home_news.py
│   └── iterator.py
├── 第一讲 互联网、互联网架构方面介绍,网站基本原理及扫盲.pdf
├── 第一课代码.rar
├── 第一课代码
│   ├── crawl_bsf.py
│   ├── iterator.py
│   ├── iterator_bsf.java
│   ├── iterator_dsf.java
│   └── mfw_url_feed.py
├── 第三讲 分布式爬虫.pdf
├── 第三讲代码.zip
├── 第三课代码.zip
├── 第三课代码
│   ├── mongo_redis_mgr.py
│   ├── mongomgr.py
│   ├── mysqlmanager.py
│   ├── process_crawl.py
│   ├── readme.txt
│   ├── spider_process_mongo.py
│   └── spider_process_mr.py
├── 第二讲 爬虫基本原理、搭建第一个爬虫.pdf
├── 第二讲代码.zip
├── 第二讲代码
│   ├── dbmanager.py
│   ├── jd.com_2131674.html
│   ├── lxml_test.py
│   ├── multi_thread_mfw.py
│   └── process_crawl.py
├── 第五讲 PageRank、动态重拍、避开网站反爬的技术.pdf
├── 第四讲 爬虫与反爬虫的对抗.pdf
└── 第四讲代码.zip
/README.md:
--------------------------------------------------------------------------------
1 | # distributed_crawler
2 | Distributed web crawler (分布式爬虫)
3 |
--------------------------------------------------------------------------------
/公开课代码.rar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangguo7/distributed_crawler/0557783408cffa22d197a439baa0dc5a3efe724f/公开课代码.rar
--------------------------------------------------------------------------------
/公开课代码/autohome.py:
--------------------------------------------------------------------------------
1 | import urllib2
2 | import json
3 | from bs4 import BeautifulSoup
4 |
5 | url_format = 'http://www.autohome.com.cn/grade/carhtml/%s.html';
6 |
7 | request_headers = {
8 | 'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
9 | 'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6",
10 | 'cache-control': "no-cache",
11 | 'connection': "keep-alive",
12 | 'cookie': "ASP.NET_SessionId=x4skm3eeyrup1xhm4g3v3c5j; cookieCityId=110100; fvlid=1485090119298xz0qs5oQ; sessionip=124.205.188.242; sessionid=D0E06CDF-C45B-4B6A-A3B3-B0E70EE1C87D%7C%7C2017-01-22+21%3A02%3A01.307%7C%7C0; sessionuid=D0E06CDF-C45B-4B6A-A3B3-B0E70EE1C87D||2017-01-22+21%3A02%3A01.307||0; ahpvno=4; __utma=1.944097921.1485090121.1485090121.1485090121.1; __utmb=1.0.10.1485090121; __utmc=1; __utmz=1.1485090121.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); ref=0%7C0%7C0%7C0%7C2017-01-22+21%3A15%3A16.728%7C2017-01-22+21%3A02%3A01.307; sessionvid=0686F4A6-50B3-4997-AFE0-2F5D28420D34; area=110199",
13 | 'host': "www.autohome.com.cn",
14 | 'if-modified-since': "Sun, 22 Jan 2017 13:00:08 GMT",
15 | 'upgrade-insecure-requests': "1",
16 | 'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
17 | 'postman-token': "8f6a0417-5aba-4b5b-cce3-f41eb134a5bd"
18 | }
19 |
20 | try:
21 | fo = open('autohome1.html', 'r')
22 | except IOError:
23 | html_doc = ''
24 | start_char = 'A'
25 |
26 | for i in range(ord('A'), ord('Z') + 1):
27 | req = urllib2.Request(url_format % (chr(i)),headers=request_headers)
28 | response = urllib2.urlopen(req)
29 | page = response.read()
30 | html_doc += page;
31 | fo = open('autohome1.html', 'wb+')
32 | fo.write('<!DOCTYPE html>\
33 | <html>\
34 | <head>\
35 | <title>\
36 | Autohome\
37 | </title>\
38 | </head>\
39 | <body>\
40 | ')
41 | fo.write(html_doc)
42 | fo.write('</body></html>')
43 | # rewind so BeautifulSoup below reads the page that was just written
44 | fo.seek(0)
45 |
46 |
47 | soup = BeautifulSoup(fo, "html.parser")
48 |
49 | models_file = open("models.txt", "wb")
50 |
51 | for model in soup.find_all("h4"):
52 | try:
53 | if model.string is not None:
54 | models_file.write("%s\r\n" % (model.string.encode('utf-8')))
55 | except ValueError:
56 | continue
57 |
58 | fo.close()
59 | models_file.close()
--------------------------------------------------------------------------------
/公开课代码/baidu_home_news.py:
--------------------------------------------------------------------------------
1 | import urllib
2 | import urllib2
3 | import json
4 |
5 | url_format = 'https://www.baidu.com/home/pcweb/data/mancardwater?id=2&offset=%d&sessionId=14832978921842&p_params=31415927&newsNum=3&indextype=manht&_req_seqid=0xf7e28ac600008a71&asyn=1&t=1483297904932&sid=1445_21093_20691_21554_21592'
6 |
7 | user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
8 | values = {'name': 'Michael Foord',
9 | 'location': 'Northampton',
10 | 'language': 'Python' }
11 | request_headers = {
12 | 'upgrade-insecure-requests': "1",
13 | 'user-agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
14 | 'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
15 | 'accept-encoding': "gzip, deflate, sdch, br",
16 | 'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6",
17 | 'cookie': "BAIDUID=967E3B223D6EF159BEC8EDB441C8CA3E:FG=1; BIDUPSID=967E3B223D6EF159BEC8EDB441C8CA3E; PSTM=1484982290; BDUSS=nVvaXZmbFhTZlBNUDhPNXlYZlVYYU5OTm10N3UtMnk0bnJEV09yd2V3RVRKS3hZSVFBQUFBJCQAAAAAAAAAAAEAAAC4Qr0zzve5z7rNv9bB-gAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABOXhFgTl4RYR; pgv_pvi=6289754112; BD_HOME=1; H_PS_PSSID=1457_21098_20691_20719; BD_UPN=12314753",
18 | 'cache-control': "no-cache",
19 | 'postman-token': "ffd3e32c-3099-ffcb-576e-c77b9d9a83ab"
20 | }
21 |
22 | data = urllib.urlencode(values)
23 | html_doc = ''
24 | for i in range(1,6):
25 | req = urllib2.Request(url_format % (i),headers=request_headers)
26 | response = urllib2.urlopen(req)
27 | page = response.read()
28 | page = page.replace('\\x22','Xx22').replace('\\', '').replace('Xx22', '\\"')
29 | response_obj = json.loads(page)
30 | html_doc += response_obj['html'].replace('\\"', '"').encode('utf-8')
31 |
32 | fo = open('baidu.html', 'wb')
33 | fo.write('<html><body>')
34 | fo.write(html_doc)
35 | fo.write('</body></html>')
36 | fo.close()
--------------------------------------------------------------------------------
/公开课代码/iterator.py:
--------------------------------------------------------------------------------
1 | import urllib2
2 | import httplib
3 | from lxml import etree
4 | import hashlib
5 |
6 | request_headers = {
7 | 'host': "www.mafengwo.cn",
8 | 'connection': "keep-alive",
9 | 'cache-control': "no-cache",
10 | 'upgrade-insecure-requests': "1",
11 | 'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
12 | 'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
13 | 'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6",
14 | 'cookie': "mfw_uuid=5879e298-7d17-50bf-100a-30898d44da2d; oad_n=a%3A3%3A%7Bs%3A3%3A%22oid%22%3Bi%3A1029%3Bs%3A2%3A%22dm%22%3Bs%3A15%3A%22www.mafengwo.cn%22%3Bs%3A2%3A%22ft%22%3Bs%3A19%3A%222017-01-14+16%3A34%3A32%22%3B%7D; __mfwurd=a%3A3%3A%7Bs%3A6%3A%22f_time%22%3Bi%3A1484382875%3Bs%3A9%3A%22f_rdomain%22%3Bs%3A0%3A%22%22%3Bs%3A6%3A%22f_host%22%3Bs%3A3%3A%22www%22%3B%7D; __mfwuuid=5879e298-7d17-50bf-100a-30898d44da2d; PHPSESSID=v17pef8jrto99pvsgsppo748j0; __mfwlv=1484402143; __mfwvn=2; __mfwlt=1484402151; uva=a%3A4%3A%7Bs%3A2%3A%22lt%22%3Bi%3A1484402148%3Bs%3A10%3A%22last_refer%22%3Bs%3A6%3A%22direct%22%3Bs%3A5%3A%22rhost%22%3Bs%3A0%3A%22%22%3Bs%3A4%3A%22step%22%3Bi%3A9%3B%7D; CNZZDATA30065558=cnzz_eid%3D55928032-1484382591-%26ntime%3D1484397604",
15 | 'postman-token': "0d7a1e08-f8d5-ec1f-ab2e-879ab9a00d34"
16 | }
17 |
18 | root_url = 'http://www.mafengwo.cn'
19 | max_level = 5
20 | dir_name = 'iterate/'
21 | iter_width = 3
22 |
23 | downloaded_url_file_name = dir_name + 'download.txt'
24 |
25 | du_file = open(downloaded_url_file_name, 'a+')
26 | downloaded_urls = du_file.readlines()
27 |
28 |
29 | def getpagecontent(cur_url, cur_level):
30 | print "downloading %s at level %d" % (cur_url, cur_level)
31 | try:
32 | req = urllib2.Request(cur_url, headers=request_headers)
33 | response = urllib2.urlopen(req)
34 | html_page = response.read()
35 | filename = cur_url[7:].replace('/', '_')
36 | fo = open("%s%s.html" % (dir_name, filename), 'wb+')
37 | fo.write(html_page)
38 | fo.close()
39 | except urllib2.HTTPError:
40 | print 'HTTP Error at ' + cur_url
41 | return
42 | except httplib.BadStatusLine:
43 | print 'BadStatusLine'
44 | return
45 | except IOError:
46 | print 'IO Error at ' + filename
47 | return
48 | except Exception:
49 | print 'Unhandled Exception'
50 | return
51 | # print 'add ' + hashlib.md5(cur_url).hexdigest() + ' to list'
52 | downloaded_urls.append(hashlib.md5(cur_url).hexdigest())
53 | du_file.write(hashlib.md5(cur_url).hexdigest() + '\r\n')
54 |
55 | html = etree.HTML(html_page.lower().decode('utf-8'))
56 | hrefs = html.xpath(u"//a")
57 |
58 | if cur_level == max_level:
59 | return
60 |
61 | page_index = 0
62 | for href in hrefs:
63 | try:
64 | if 'href' in href.attrib:
65 | val = href.attrib['href']
66 | if val.find('javascript:') != -1:
67 | continue
68 | if val.startswith('http://') is False:
69 | if val.startswith('/'):
70 | val = 'http://www.mafengwo.cn' + val
71 | else:
72 | continue
73 | if val[-1] == '/':
74 | val = val[0:-1]
75 | if hashlib.md5(val).hexdigest() not in downloaded_urls:
76 | getpagecontent(val, cur_level + 1)
77 | page_index += 1
78 | if page_index == iter_width:
79 | break
80 | else:
81 | print val + ' is skipped'
82 | except ValueError:
83 | continue
84 |
85 | getpagecontent(root_url, 0)
86 |
87 | du_file.close()
88 |
--------------------------------------------------------------------------------
/第一讲 互联网、互联网架构方面介绍,网站基本原理及扫盲.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangguo7/distributed_crawler/0557783408cffa22d197a439baa0dc5a3efe724f/第一讲 互联网、互联网架构方面介绍,网站基本原理及扫盲.pdf
--------------------------------------------------------------------------------
/第一课代码.rar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangguo7/distributed_crawler/0557783408cffa22d197a439baa0dc5a3efe724f/第一课代码.rar
--------------------------------------------------------------------------------
/第一课代码/crawl_bsf.py:
--------------------------------------------------------------------------------
1 | import urllib2
2 | from collections import deque
3 | import json
4 | from lxml import etree
5 | import httplib
6 | import hashlib
7 | from pybloomfilter import BloomFilter
8 |
9 | class CrawlBSF:
10 | request_headers = {
11 | 'host': "www.mafengwo.cn",
12 | 'connection': "keep-alive",
13 | 'cache-control': "no-cache",
14 | 'upgrade-insecure-requests': "1",
15 | 'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
16 | 'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
17 | 'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6"
18 | }
19 |
20 | cur_level = 0
21 | max_level = 5
22 | dir_name = 'iterate/'
23 | iter_width = 50
24 | downloaded_urls = []
25 |
26 | du_md5_file_name = dir_name + 'download.txt'
27 | du_url_file_name = dir_name + 'urls.txt'
28 |
29 | download_bf = BloomFilter(1024*1024*16, 0.01)
30 |
31 | cur_queue = deque()
32 | child_queue = deque()
33 |
34 | def __init__(self, url):
35 | self.root_url = url
36 | self.cur_queue.append(url)
37 | self.du_file = open(self.du_url_file_name, 'a+')
38 | try:
39 | self.dumd5_file = open(self.du_md5_file_name, 'r')
40 | self.downloaded_urls = self.dumd5_file.readlines()
41 | self.dumd5_file.close()
42 | for urlmd5 in self.downloaded_urls:
43 | self.download_bf.add(urlmd5[:-2])
44 | except IOError:
45 | print "File not found"
46 | finally:
47 | self.dumd5_file = open(self.du_md5_file_name, 'a+')
48 |
49 | def enqueueUrl(self, url):
50 | self.child_queue.append(url)
51 |
52 | def dequeuUrl(self):
53 | try:
54 | url = self.cur_queue.popleft()
55 | return url
56 | except IndexError:
57 | self.cur_level += 1
58 | if self.cur_level == self.max_level:
59 | return None
60 | if len(self.child_queue) == 0:
61 | return None
62 | self.cur_queue = self.child_queue
63 | self.child_queue = deque()
64 | return self.dequeuUrl()
65 |
66 | def getpagecontent(self, cur_url):
67 | print "downloading %s at level %d" % (cur_url, self.cur_level)
68 | try:
69 | req = urllib2.Request(cur_url, headers=self.request_headers)
70 | response = urllib2.urlopen(req)
71 | html_page = response.read()
72 | filename = cur_url[7:].replace('/', '_')
73 | fo = open("%s%s.html" % (self.dir_name, filename), 'wb+')
74 | fo.write(html_page)
75 | fo.close()
76 | except urllib2.HTTPError, Arguments:
77 | print Arguments
78 | return
79 | except httplib.BadStatusLine:
80 | print 'BadStatusLine'
81 | return
82 | except IOError:
83 | print 'IO Error at ' + filename
84 | return
85 | except Exception, Arguments:
86 | print Arguments
87 | return
88 | # print 'add ' + hashlib.md5(cur_url).hexdigest() + ' to list'
89 | dumd5 = hashlib.md5(cur_url).hexdigest()
90 | self.downloaded_urls.append(dumd5)
91 | self.dumd5_file.write(dumd5 + '\r\n')
92 | self.du_file.write(cur_url + '\r\n')
93 | self.download_bf.add(dumd5)
94 |
95 | html = etree.HTML(html_page.lower().decode('utf-8'))
96 | hrefs = html.xpath(u"//a")
97 |
98 | for href in hrefs:
99 | try:
100 | if 'href' in href.attrib:
101 | val = href.attrib['href']
102 | if val.find('javascript:') != -1:
103 | continue
104 | if val.startswith('http://') is False:
105 | if val.startswith('/'):
106 | val = 'http://www.mafengwo.cn' + val
107 | else:
108 | continue
109 | if val[-1] == '/':
110 | val = val[0:-1]
111 | # if hashlib.md5(val).hexdigest() not in self.downloaded_urls:
112 | if hashlib.md5(val).hexdigest() not in self.download_bf:
113 | self.enqueueUrl(val)
114 | else:
115 | print 'Skip %s' % (val)
116 | except ValueError:
117 | continue
118 |
119 | def start_crawl(self):
120 | while True:
121 | url = self.dequeuUrl()
122 | if url is None:
123 | break
124 | self.getpagecontent(url)
125 | self.dumd5_file.close()
126 | self.du_file.close()
127 |
128 | crawler = CrawlBSF("http://www.mafengwo.cn")
129 | crawler.start_crawl()
--------------------------------------------------------------------------------
/第一课代码/iterator.py:
--------------------------------------------------------------------------------
1 | import urllib2
2 | import httplib
3 | from lxml import etree
4 | import hashlib
5 |
6 | request_headers = {
7 | 'host': "www.mafengwo.cn",
8 | 'connection': "keep-alive",
9 | 'cache-control': "no-cache",
10 | 'upgrade-insecure-requests': "1",
11 | 'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
12 | 'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
13 | 'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6",
14 | 'cookie': "mfw_uuid=5879e298-7d17-50bf-100a-30898d44da2d; oad_n=a%3A3%3A%7Bs%3A3%3A%22oid%22%3Bi%3A1029%3Bs%3A2%3A%22dm%22%3Bs%3A15%3A%22www.mafengwo.cn%22%3Bs%3A2%3A%22ft%22%3Bs%3A19%3A%222017-01-14+16%3A34%3A32%22%3B%7D; __mfwurd=a%3A3%3A%7Bs%3A6%3A%22f_time%22%3Bi%3A1484382875%3Bs%3A9%3A%22f_rdomain%22%3Bs%3A0%3A%22%22%3Bs%3A6%3A%22f_host%22%3Bs%3A3%3A%22www%22%3B%7D; __mfwuuid=5879e298-7d17-50bf-100a-30898d44da2d; PHPSESSID=v17pef8jrto99pvsgsppo748j0; __mfwlv=1484402143; __mfwvn=2; __mfwlt=1484402151; uva=a%3A4%3A%7Bs%3A2%3A%22lt%22%3Bi%3A1484402148%3Bs%3A10%3A%22last_refer%22%3Bs%3A6%3A%22direct%22%3Bs%3A5%3A%22rhost%22%3Bs%3A0%3A%22%22%3Bs%3A4%3A%22step%22%3Bi%3A9%3B%7D; CNZZDATA30065558=cnzz_eid%3D55928032-1484382591-%26ntime%3D1484397604",
15 | 'postman-token': "0d7a1e08-f8d5-ec1f-ab2e-879ab9a00d34"
16 | }
17 |
18 | root_url = 'http://www.mafengwo.cn'
19 | max_level = 5
20 | dir_name = 'iterate/'
21 | iter_width = 3
22 |
23 | downloaded_url_file_name = dir_name + 'download.txt'
24 |
25 | du_file = open(downloaded_url_file_name, 'a+')
26 | downloaded_urls = du_file.readlines()
27 |
28 |
29 | def getpagecontent(cur_url, cur_level):
30 | print "downloading %s at level %d" % (cur_url, cur_level)
31 | try:
32 | req = urllib2.Request(cur_url, headers=request_headers)
33 | response = urllib2.urlopen(req)
34 | html_page = response.read()
35 | filename = cur_url[7:].replace('/', '_')
36 | fo = open("%s%s.html" % (dir_name, filename), 'wb+')
37 | fo.write(html_page)
38 | fo.close()
39 | except urllib2.HTTPError:
40 | print 'HTTP Error at ' + cur_url
41 | return
42 | except httplib.BadStatusLine:
43 | print 'BadStatusLine'
44 | return
45 | except IOError:
46 | print 'IO Error at ' + filename
47 | return
48 | except Exception:
49 | print 'Unhandled Exception'
50 | return
51 | # print 'add ' + hashlib.md5(cur_url).hexdigest() + ' to list'
52 | downloaded_urls.append(hashlib.md5(cur_url).hexdigest())
53 | du_file.write(hashlib.md5(cur_url).hexdigest() + '\r\n')
54 |
55 | html = etree.HTML(html_page.lower().decode('utf-8'))
56 | hrefs = html.xpath(u"//a")
57 |
58 | if cur_level == max_level:
59 | return
60 |
61 | page_index = 0
62 | for href in hrefs:
63 | try:
64 | if 'href' in href.attrib:
65 | val = href.attrib['href']
66 | if val.find('javascript:') != -1:
67 | continue
68 | if val.startswith('http://') is False:
69 | if val.startswith('/'):
70 | val = 'http://www.mafengwo.cn' + val
71 | else:
72 | continue
73 | if val[-1] == '/':
74 | val = val[0:-1]
75 | if hashlib.md5(val).hexdigest() not in downloaded_urls:
76 | getpagecontent(val, cur_level + 1)
77 | page_index += 1
78 | if page_index == iter_width:
79 | break
80 | else:
81 | print val + ' is skipped'
82 | except ValueError:
83 | continue
84 |
85 | getpagecontent(root_url, 0)
86 |
87 | du_file.close()
88 |
--------------------------------------------------------------------------------
/第一课代码/iterator_bsf.java:
--------------------------------------------------------------------------------
1 | package cn.marble;
2 |
3 | import java.io.IOException;
4 | import java.util.ArrayList;
5 | import java.util.Queue;
6 | import java.util.concurrent.ConcurrentLinkedQueue;
7 |
8 | public class Main {
9 |
10 | Queue currentQueue = new ConcurrentLinkedQueue();
11 |
12 | Queue childQueue = new ConcurrentLinkedQueue();
13 |
14 | int currentLevel = 0;
15 |
16 | void appendDownloadedUrl(String url){
17 |
18 | }
19 |
20 | void appendErrorUrl(String url){
21 |
22 | }
23 |
24 | boolean isDownloaded(String url){
25 | return false; // stub: a real implementation would check a store of downloaded URLs
26 | }
27 |
28 | String getPageContent(String url) throws CrawlException, IOException {
29 | return "";
30 | }
31 |
32 | String savePageContent(String pageContent){
33 | return "";
34 | }
35 |
36 | void enqueueUrls(ConcurrentLinkedQueue urls){
37 | childQueue.addAll(urls);
38 | }
39 |
40 | void enqueueUrlsFromPageSrc(String pageContent){
41 |
42 | }
43 |
44 | void enqueueUrl(String url){
45 | childQueue.add(url);
46 | }
47 |
48 | String dequeueUrl(){
49 | String url = currentQueue.poll();
50 | if ( url == null ){
51 | currentLevel ++;
52 | if ( currentLevel == HEIGHT )
53 | return null;
54 | currentQueue = childQueue;
55 | childQueue = new ConcurrentLinkedQueue();
56 | url = currentQueue.poll();
57 | }
58 | return url;
59 | }
60 |
61 | String rootNode;
62 |
63 | final static int WIDTH = 50;
64 | final static int HEIGHT = 5;
65 |
66 | void crawl(String url){
67 | String pageContent;
68 | try{
69 | pageContent = getPageContent(url);
70 | savePageContent(pageContent);
71 | } catch( Exception e ){
72 | appendErrorUrl(url);
73 | return;
74 | }
75 |
76 | enqueueUrlsFromPageSrc(pageContent);
77 | }
78 |
79 | void start(){
80 |
81 | int curLevel = 0;
82 | enqueueUrl("http://www.mafengwo.cn");
83 |
84 | while( true ){
85 | String url = dequeueUrl();
86 | if ( url == null ){
87 | // both queues are exhausted or the depth limit was reached
88 | break;
89 | }
90 | crawl(url);
91 |
92 | }
93 | }
94 |
95 | public static void main(String[] args) {
96 | // write your code here
97 | }
98 | }
99 |
--------------------------------------------------------------------------------
/第一课代码/iterator_dsf.java:
--------------------------------------------------------------------------------
1 | public class Crawler{
2 |
3 | String[] getAllUrls(String htmlContent);
4 |
5 | void appendDownloadedUrl(String url);
6 |
7 | void appendErrorUrl(String url);
8 |
9 | boolean isDownloaded(String url);
10 |
11 | String getPageContent(String url) throws CrawlException, IOException;
12 |
13 | String savePageContent(String pageContent);
14 |
15 | String rootNode;
16 |
17 | final static int WIDTH = 50;
18 | final static int HEIGHT = 5;
19 |
20 |
21 | void crawl(String url, int level){
22 | String pageContent = null;
23 | try{ pageContent = getPageContent(url); }
24 | catch( Exception e ){
25 | appendErrorUrl(url);
26 | return;
27 | }
28 |
29 | savePageContent(pageContent);
30 |
31 | if ( level == HEIGHT )
32 | return;
33 |
34 | String[] urls = getAllUrls(pageContent);
35 |
36 | for( int i = 0; i < urls.length; i ++ ){
37 |
38 | if ( i == WIDTH )
39 | return;
40 | if ( isDownloaded(urls[i]))
41 | continue;
42 | crawl( urls[i], level + 1 );
43 | }
44 | }
45 |
46 | void start(){
47 | crawl("http://www.mafengwo.com.cn", 0 );
48 | }
49 |
50 | }
--------------------------------------------------------------------------------
/第一课代码/mfw_url_feed.py:
--------------------------------------------------------------------------------
1 | import urllib2
2 | import httplib
3 | import re
4 | from pybloomfilter import BloomFilter
5 | import os
6 |
7 | request_headers = {
8 | 'host': "www.mafengwo.cn",
9 | 'connection': "keep-alive",
10 | 'cache-control': "no-cache",
11 | 'upgrade-insecure-requests': "1",
12 | 'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
13 | 'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
14 | 'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6"
15 | }
16 |
17 | city_home_pages = []
18 | city_ids = []
19 | dirname = 'mafengwo_notes/'
20 |
21 | # create a Bloom filter for URL de-duplication
22 | download_bf = BloomFilter(1024 * 1024 * 16, 0.01)
23 |
24 |
25 | def download_city_notes(id):
26 | for i in range(1, 999):
27 | url = 'http://www.mafengwo.cn/yj/%s/1-0-%d.html' % (id, i)
28 | if url in download_bf:
29 | continue
30 | print 'open url %s' % (url)
31 | download_bf.add(url)
32 | req = urllib2.Request(url, headers=request_headers)
33 | response = urllib2.urlopen(req)
34 | htmlcontent = response.read()
35 | city_notes = re.findall('href="/i/\d{7}.html', htmlcontent)
36 |
37 | # if the pagination page is invalid and contains zero travel notes, all 1-0-xxx.html pages for this city have been traversed; finish this city
38 | if len(city_notes) == 0:
39 | return
40 | for city_note in city_notes:
41 | try:
42 | city_url = 'http://www.mafengwo.cn%s' % (city_note[6:])
43 | if city_url in download_bf:
44 | continue
45 | print 'download %s' % (city_url)
46 | req = urllib2.Request(city_url, headers=request_headers)
47 | response = urllib2.urlopen(req)
48 | html = response.read()
49 | filename = city_url[7:].replace('/', '_')
50 | fo = open("%s%s" % (dirname, filename), 'wb+')
51 | fo.write(html)
52 | fo.close()
53 | download_bf.add(city_url)
54 | except Exception, Arguments:
55 | print Arguments
56 | continue
57 |
58 | # make sure the folder used to store downloaded pages exists; create it if missing
59 | if not os.path.exists(dirname):
60 | os.makedirs(dirname)
61 |
62 | try:
63 | # download the destinations index page
64 | req = urllib2.Request('http://www.mafengwo.cn/mdd/', headers=request_headers)
65 | response = urllib2.urlopen(req)
66 | htmlcontent = response.read()
67 |
68 | # use a regular expression to find all city home pages
69 | city_home_pages = re.findall('/travel-scenic-spot/mafengwo/\d{5}.html', htmlcontent)
70 |
71 | # loop over the cities and download every travel note under each one
72 | for city in city_home_pages:
73 | city_ids.append(city[29:34])
74 | download_city_notes(city[29:34])
75 | except urllib2.HTTPError, Arguments:
76 | print Arguments
77 | except httplib.BadStatusLine:
78 | print 'BadStatusLine'
79 | except Exception, Arguments:
80 | print Arguments
81 |
--------------------------------------------------------------------------------
/第三讲 分布式爬虫.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangguo7/distributed_crawler/0557783408cffa22d197a439baa0dc5a3efe724f/第三讲 分布式爬虫.pdf
--------------------------------------------------------------------------------
/第三讲代码.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangguo7/distributed_crawler/0557783408cffa22d197a439baa0dc5a3efe724f/第三讲代码.zip
--------------------------------------------------------------------------------
/第三课代码.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangguo7/distributed_crawler/0557783408cffa22d197a439baa0dc5a3efe724f/第三课代码.zip
--------------------------------------------------------------------------------
/第三课代码/mongo_redis_mgr.py:
--------------------------------------------------------------------------------
1 | import mysql.connector
2 | import httplib
3 | import hashlib
4 | import time
5 | from datetime import datetime
6 | from datetime import timedelta
7 |
8 | import redis
9 | from pymongo import MongoClient
10 | from pymongo import IndexModel, ASCENDING, DESCENDING, ReturnDocument
11 |
12 |
13 | class MongoRedisUrlManager:
14 |
15 | def __init__(self, server_ip='localhost', client=None, expires=timedelta(days=30)):
16 | """
17 | client: mongo database client
18 | expires: timedelta of amount of time before a cache entry is considered expired
19 | """
20 | # if a client object is not passed
21 | # then try connecting to mongodb at the default localhost port
22 | self.client = MongoClient(server_ip, 27017) if client is None else client
23 | self.redis_client = redis.StrictRedis(host=server_ip, port=6379, db=0)
24 | #create collection to store cached webpages,
25 | # which is the equivalent of a table in a relational database
26 | self.db = self.client.spider
27 |
28 | # create index if db is empty
29 | if self.db.mfw.count() == 0:
30 | self.db.mfw.create_index('status')
31 |
32 | def dequeueUrl(self):
33 | record = self.db.mfw.find_one_and_update(
34 | { 'status': 'new'},
35 | { '$set': { 'status' : 'downloading'} },
36 | upsert=False, return_document=ReturnDocument.BEFORE
37 | )
38 | if record:
39 | return record
40 | else:
41 | return None
42 |
43 | def enqueueUrl(self, url, status, depth):
44 | num = self.redis_client.get(url)
45 | if num is not None:
46 | self.redis_client.set(url, int(num) + 1 )
47 | return
48 | self.redis_client.set(url, 1)
49 | self.db.mfw.insert({
50 | '_id': hashlib.md5(url).hexdigest(),
51 | 'url': url,
52 | 'status': status,
53 | 'queue_time': datetime.utcnow(),
54 | 'depth': depth
55 | })
56 |
57 | def finishUrl(self, url):
58 | record = {'status': 'done', 'done_time': datetime.utcnow()}
59 | self.db.mfw.update({'_id': hashlib.md5(url).hexdigest()}, {'$set': record}, upsert=False)
60 |
61 | def clear(self):
62 | self.redis_client.flushall()
63 | self.db.mfw.drop()
--------------------------------------------------------------------------------
/第三课代码/mongomgr.py:
--------------------------------------------------------------------------------
1 | import mysql.connector
2 | import httplib
3 | import hashlib
4 | import time
5 | import datetime
6 | from datetime import datetime
7 | from datetime import timedelta
8 | from pymongo import MongoClient, ReturnDocument
9 |
10 |
11 | class MongoUrlManager:
12 | def __init__(self, SERVER_IP = 'localhost', port=27017, client=None):
13 | # if a client object is not passed
14 | # then try connecting to mongodb at the default localhost port
15 | self.client = MongoClient(SERVER_IP, port) if client is None else client
16 | #create collection to store cached webpages,
17 | # which is the equivalent of a table in a relational database
18 | self.db = self.client.spider
19 |
20 | def dequeueUrl(self):
21 | record = self.db.mfw.find_one_and_update(
22 | {'status': 'new'},
23 | { '$set': { 'status' : 'downloading'} },
24 | upsert=False, return_document=ReturnDocument.BEFORE
25 | )
26 | if record:
27 | return record
28 | else:
29 | return None
30 |
31 | def enqueueUrl(self, url, status, depth):
32 | try:
33 | self.db.mfw.insert({'_id': url, 'status': status, 'queue_time': datetime.utcnow(), 'depth': depth})
34 | except Exception, Arguments:
35 | pass
36 |
37 | def finishUrl(self, url, status='done'):
38 | record = {'status': status, 'done_time': datetime.utcnow()}
39 | self.db.mfw.update({'_id': url}, {'$set': record}, upsert=False)
40 |
41 | def clear(self):
42 | self.db.mfw.drop()
--------------------------------------------------------------------------------
/第三课代码/mysqlmanager.py:
--------------------------------------------------------------------------------
1 | import mysql.connector
2 | from mysql.connector import errorcode
3 | from mysql.connector import pooling
4 | import httplib
5 | import hashlib
6 | import time
7 |
8 | class CrawlDatabaseManager:
9 |
10 | DB_NAME = 'mfw_pro_crawl'
11 |
12 | SERVER_IP = '127.0.0.1'
13 |
14 | TABLES = {}
15 | TABLES['urls'] = (
16 | "CREATE TABLE `urls` ("
17 | " `index` int(11) NOT NULL AUTO_INCREMENT,"
18 | " `url` varchar(512) NOT NULL,"
19 | " `md5` varchar(32) NOT NULL,"
20 | " `status` varchar(11) NOT NULL DEFAULT 'new',"
21 | " `depth` int(11) NOT NULL,"
22 | " `queue_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,"
23 | " `done_time` timestamp NOT NULL DEFAULT 0,"
24 | " PRIMARY KEY (`index`),"
25 | " UNIQUE KEY `md5` (`md5`)"
26 | ") ENGINE=InnoDB")
27 |
28 |
29 | def __init__(self, max_num_thread):
30 | try:
31 | cnx = mysql.connector.connect(host=self.SERVER_IP, user='root')
32 | except mysql.connector.Error as err:
33 | if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
34 | print("Something is wrong with your user name or password")
35 | elif err.errno == errorcode.ER_BAD_DB_ERROR:
36 | print("Database does not exist")
37 | else:
38 | print 'Create Error ' + err.msg
39 | exit(1)
40 |
41 | cursor = cnx.cursor()
42 |
43 | try:
44 | cnx.database = self.DB_NAME
45 | except mysql.connector.Error as err:
46 | if err.errno == errorcode.ER_BAD_DB_ERROR:
47 | self.create_database(cursor)
48 | cnx.database = self.DB_NAME
49 | self.create_tables(cursor)
50 | else:
51 | print(err)
52 | exit(1)
53 | finally:
54 | cursor.close()
55 | cnx.close()
56 |
57 | dbconfig = {
58 | "database": self.DB_NAME,
59 | "user": "root",
60 | "host": self.SERVER_IP,
61 | }
62 | self.cnxpool = mysql.connector.pooling.MySQLConnectionPool(pool_name = "mypool",
63 | pool_size = max_num_thread,
64 | **dbconfig)
65 |
66 |
67 | def create_database(self, cursor):
68 | try:
69 | cursor.execute(
70 | "CREATE DATABASE {} DEFAULT CHARACTER SET 'utf8'".format(self.DB_NAME))
71 | except mysql.connector.Error as err:
72 | print("Failed creating database: {}".format(err))
73 | exit(1)
74 |
75 | def create_tables(self, cursor):
76 | for name, ddl in self.TABLES.iteritems():
77 | try:
78 | cursor.execute(ddl)
79 | except mysql.connector.Error as err:
80 | if err.errno == errorcode.ER_TABLE_EXISTS_ERROR:
81 | print 'create tables error ALREADY EXISTS'
82 | else:
83 | print 'create tables error ' + err.msg
84 | else:
85 | print 'Tables created'
86 |
87 |
88 | def enqueueUrl(self, url, depth):
89 | con = self.cnxpool.get_connection()
90 | cursor = con.cursor()
91 | try:
92 | add_url = ("INSERT INTO urls (url, md5, depth) VALUES (%s, %s, %s)")
93 | data_url = (url, hashlib.md5(url).hexdigest(), depth)
94 | cursor.execute(add_url, data_url)
95 | con.commit()
96 | except mysql.connector.Error as err:
97 | # print 'enqueueUrl() ' + err.msg
98 | return
99 | finally:
100 | cursor.close()
101 | con.close()
102 |
103 |
104 | def dequeueUrl(self):
105 | con = self.cnxpool.get_connection()
106 | cursor = con.cursor(dictionary=True)
107 | try:
108 | query = ("SELECT `index`, `url`, `depth` FROM urls WHERE status='new' ORDER BY `index` ASC LIMIT 1 FOR UPDATE")
109 | cursor.execute(query)
110 | if cursor.rowcount == 0:
111 | return None
112 | row = cursor.fetchone()
113 | update_query = ("UPDATE urls SET `status`='downloading' WHERE `index`=%d") % (row['index'])
114 | cursor.execute(update_query)
115 | con.commit()
116 | return row
117 | except mysql.connector.Error as err:
118 | # print 'dequeueUrl() ' + err.msg
119 | return None
120 | finally:
121 | cursor.close()
122 | con.close()
123 |
124 | def finishUrl(self, index):
125 | con = self.cnxpool.get_connection()
126 | cursor = con.cursor()
127 | try:
128 | # pass done_time as a bind parameter so the datetime string is quoted correctly
129 | update_query = "UPDATE urls SET `status`='done', `done_time`=%s WHERE `index`=%s"
130 | cursor.execute(update_query, (time.strftime('%Y-%m-%d %H:%M:%S'), index))
131 | con.commit()
132 | except mysql.connector.Error as err:
133 | # print 'finishUrl() ' + err.msg
134 | return
135 | finally:
136 | cursor.close()
137 | con.close()
138 |
--------------------------------------------------------------------------------
/第三课代码/process_crawl.py:
--------------------------------------------------------------------------------
1 | import urllib2
2 | from collections import deque
3 | import json
4 | from lxml import etree
5 | import httplib
6 | import hashlib
7 | import thread
8 | import threading
9 | import time
10 | from mysqlmanager import CrawlDatabaseManager
11 |
12 | from hdfs import *
13 | from hdfs.util import HdfsError
14 |
15 | from mysql.connector import errorcode
16 | import mysql.connector
17 |
18 | request_headers = {
19 | 'host': "www.mafengwo.cn",
20 | 'connection': "keep-alive",
21 | 'cache-control': "no-cache",
22 | 'upgrade-insecure-requests': "1",
23 | 'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
24 | 'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
25 | 'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6"
26 | }
27 |
28 | def get_page_content(cur_url, index, depth):
29 | print "downloading %s at level %d" % (cur_url, depth)
30 | try:
31 | req = urllib2.Request(cur_url, headers=request_headers)
32 | response = urllib2.urlopen(req)
33 | html_page = response.read()
34 | filename = cur_url[7:].replace('/', '_')
35 |
36 | # Write page to local file system
37 | fo = open("%s%s.html" % (dir_name, filename), 'wb+')
38 | fo.write(html_page)
39 | fo.close()
40 |
41 | # Write to HDFS
42 | # with hdfs_client.write('/htmls/mfw/%s.html' % (filename)) as writer:
43 | # writer.write(html_page)
44 |
45 | dbmanager.finishUrl(index)
46 | except urllib2.HTTPError, Arguments:
47 | print Arguments
48 | return
49 | except httplib.BadStatusLine, Arguments:
50 | print Arguments
51 | return
52 | except IOError, Arguments:
53 | print Arguments
54 | return
55 | except HdfsError, Arguments:
56 | print Arguments
57 | except Exception, Arguments:
58 | print Arguments
59 | return
60 | # print 'add ' + hashlib.md5(cur_url).hexdigest() + ' to list'
61 |
62 | html = etree.HTML(html_page.lower().decode('utf-8'))
63 | hrefs = html.xpath(u"//a")
64 |
65 | for href in hrefs:
66 | try:
67 | if 'href' in href.attrib:
68 | val = href.attrib['href']
69 | if val.find('javascript:') != -1:
70 | continue
71 | if val.startswith('http://') is False:
72 | if val.startswith('/'):
73 | val = 'http://www.mafengwo.cn' + val
74 | else:
75 | continue
76 | if val[-1] == '/':
77 | val = val[0:-1]
78 | dbmanager.enqueueUrl(val, depth + 1)
79 |
80 | except ValueError:
81 | continue
82 |
83 |
84 | max_num_thread = 5
85 | dbmanager = CrawlDatabaseManager(max_num_thread)
86 |
87 | dir_name = 'dir_process/'
88 |
89 | dbmanager.enqueueUrl("http://www.mafengwo.cn", 0)
90 | start_time = time.time()
91 | is_root_page = True
92 | threads = []
93 |
94 | CRAWL_DELAY = 0.6
95 |
96 | # hdfs_client = InsecureClient('http://54.223.92.169:50070', user='ec2-user')
97 |
98 | while True:
99 | curtask = dbmanager.dequeueUrl()
100 | # Go on next level, before that, needs to wait all current level crawling done
101 | if curtask is None:
102 | for t in threads:
103 | t.join()
104 | break
105 |
106 | # looking for an empty thread from pool to crawl
107 |
108 | if is_root_page is True:
109 | get_page_content(curtask['url'], curtask['index'], curtask['depth'])
110 | is_root_page = False
111 | else:
112 | while True:
113 | # first remove all finished running threads
114 | for t in threads:
115 | if not t.is_alive():
116 | threads.remove(t)
117 | if len(threads) >= max_num_thread:
118 | time.sleep(CRAWL_DELAY)
119 | continue
120 | try:
121 | t = threading.Thread(target=get_page_content, name=None, args=(curtask['url'], curtask['index'], curtask['depth']))
122 | threads.append(t)
123 | # set daemon so main thread can exit when receives ctrl-c
124 | t.setDaemon(True)
125 | t.start()
126 | time.sleep(CRAWL_DELAY)
127 | break
128 | except Exception:
129 | print "Error: unable to start thread"
130 |
131 | # nothing left to clean up here: worker threads have been joined and the
132 | # pooled MySQL connections are closed inside CrawlDatabaseManager
133 |
--------------------------------------------------------------------------------
/第三课代码/readme.txt:
--------------------------------------------------------------------------------
1 | python spider_process_mongo.py
2 | Uses MongoDB only.
3 | MongoDB must be running locally on its default port; the Mongo server and port can be set when constructing MongoUrlManager.
4 |
5 | python spider_process_mr.py
6 | Uses both MongoDB and Redis.
7 | MongoDB and Redis must both be running locally on their default ports; the server and port can be set when constructing MongoRedisUrlManager.
--------------------------------------------------------------------------------
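Note: as the readme above says, the database location is configurable through the manager constructors. A minimal sketch of pointing the two URL-queue managers at a non-default host, assuming the constructor signatures shown in mongomgr.py and mongo_redis_mgr.py above; the host 192.168.1.10 is only a placeholder:

    # sketch: point the URL-queue managers at a remote MongoDB/Redis host
    from mongomgr import MongoUrlManager
    from mongo_redis_mgr import MongoRedisUrlManager

    # MongoDB on a non-default server (the port can be passed as well)
    mongo_mgr = MongoUrlManager('192.168.1.10', 27017)

    # MongoDB plus Redis on the same host; this manager connects to the
    # default ports 27017 (Mongo) and 6379 (Redis) internally
    mr_mgr = MongoRedisUrlManager('192.168.1.10')

    # both managers expose the same queue interface used by the spiders
    mr_mgr.enqueueUrl('http://www.mafengwo.cn', 'new', 0)
    task = mr_mgr.dequeueUrl()
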
/第三课代码/spider_process_mongo.py:
--------------------------------------------------------------------------------
1 | import urllib2
2 | import httplib
3 | from lxml import etree
4 | import thread
5 | import threading
6 | import os
7 | import time
8 | from mongomgr import MongoUrlManager
9 |
10 | from hdfs import *
11 | from hdfs.util import HdfsError
12 |
13 | request_headers = {
14 | 'host': "www.mafengwo.cn",
15 | 'connection': "keep-alive",
16 | 'cache-control': "no-cache",
17 | 'upgrade-insecure-requests': "1",
18 | 'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
19 | 'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
20 | 'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6"
21 | }
22 |
23 | dir_path = './mafengwo'
24 |
25 | def save_page_content(html, filename):
26 | if os.path.exists(dir_path) is False:
27 | os.mkdir(dir_path)
28 | fo = open("%s/%s.html" % (dir_path, filename), 'wb+')
29 | fo.write(html)
30 | fo.close()
31 |
32 | def get_page_content(cur_url, depth):
33 | print "downloading %s at level %d" % (cur_url, depth)
34 | try:
35 | req = urllib2.Request(cur_url, headers=request_headers)
36 | response = urllib2.urlopen(req)
37 | html_page = response.read()
38 | filename = cur_url[7:].replace('/', '_')
39 |
40 | #Write page to local files system
41 | save_page_content(html_page, filename)
42 |
43 | # Write to HDFS
44 | # with hdfs_client.write('/htmls/mfw/%s.html' % (filename)) as writer:
45 | # writer.write(html_page)
46 |
47 | dbmanager.finishUrl(cur_url)
48 | except urllib2.HTTPError, Arguments:
49 | print Arguments
50 | return
51 | except httplib.BadStatusLine, Arguments:
52 | print Arguments
53 | return
54 | except IOError, Arguments:
55 | print Arguments
56 | return
57 | except HdfsError, Arguments:
58 | print Arguments
59 | except Exception, Arguments:
60 | print Arguments
61 | return
62 | # print 'add ' + hashlib.md5(cur_url).hexdigest() + ' to list'
63 |
64 | html = etree.HTML(html_page.lower().decode('utf-8'))
65 | hrefs = html.xpath(u"//a")
66 |
67 | for href in hrefs:
68 | try:
69 | if 'href' in href.attrib:
70 | val = href.attrib['href']
71 | if val.find('javascript:') != -1:
72 | continue
73 | if val.startswith('http://') is False:
74 | if val.startswith('/'):
75 | val = 'http://www.mafengwo.cn' + val
76 | else:
77 | continue
78 | if val[-1] == '/':
79 | val = val[0:-1]
80 | dbmanager.enqueueUrl(val, 'new', depth + 1)
81 | except ValueError:
82 | continue
83 |
84 |
85 | max_num_thread = 5
86 | dbmanager = MongoUrlManager()
87 |
88 | dbmanager.enqueueUrl("http://www.mafengwo.cn", 'new', 0)
89 |
90 | start_time = time.time()
91 | is_root_page = True
92 | threads = []
93 |
94 | CRAWL_DELAY = 0.6
95 |
96 | # use hdfs to save pages
97 | # hdfs_client = InsecureClient('http://54.223.92.169:50070', user='ec2-user')
98 |
99 | while True:
100 | curtask = dbmanager.dequeueUrl()
101 | # Go on next level, before that, needs to wait all current level crawling done
102 | if curtask is None:
103 | print 'No task available!'
104 | for t in threads:
105 | t.join()
106 | break
107 |
108 | # looking for an empty thread from pool to crawl
109 |
110 | if is_root_page is True:
111 | get_page_content(curtask['_id'], curtask['depth'])
112 | is_root_page = False
113 | else:
114 | while True:
115 | # first remove all finished running threads
116 | for t in threads:
117 | if not t.is_alive():
118 | threads.remove(t)
119 | if len(threads) >= max_num_thread:
120 | time.sleep(CRAWL_DELAY)
121 | continue
122 | try:
123 | t = threading.Thread(target=get_page_content, name=None, args=(curtask['_id'], curtask['depth']))
124 | threads.append(t)
125 | # set daemon so main thread can exit when receives ctrl-c
126 | t.setDaemon(True)
127 | t.start()
128 | time.sleep(CRAWL_DELAY)
129 | break
130 | except Exception as error:
131 | print 'Unable to start thread: ' + error.message
132 | break
--------------------------------------------------------------------------------
/第三课代码/spider_process_mr.py:
--------------------------------------------------------------------------------
1 | import urllib2
2 | import httplib
3 | from lxml import etree
4 | import thread
5 | import threading
6 | import os
7 | import time
8 | from mongo_redis_mgr import MongoRedisUrlManager
9 |
10 | from hdfs import *
11 | from hdfs.util import HdfsError
12 |
13 | request_headers = {
14 | 'host': "www.mafengwo.cn",
15 | 'connection': "keep-alive",
16 | 'cache-control': "no-cache",
17 | 'upgrade-insecure-requests': "1",
18 | 'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
19 | 'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
20 | 'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6"
21 | }
22 |
23 | dir_path = 'mafengwo'
24 |
25 | def save_page_content(html, filename):
26 | if os.path.exists(dir_path) is False:
27 | os.mkdir(dir_path)
28 | fo = open("%s/%s.html" % (dir_path, filename), 'wb+')
29 | fo.write(html)
30 | fo.close()
31 |
32 | def get_page_content(cur_url, depth):
33 | print "downloading %s at level %d" % (cur_url, depth)
34 | try:
35 | req = urllib2.Request(cur_url, headers=request_headers)
36 | response = urllib2.urlopen(req)
37 | html_page = response.read()
38 | filename = cur_url[7:].replace('/', '_')
39 |
40 | #Write page to local files system
41 | save_page_content(html_page, filename)
42 |
43 | # Write to HDFS
44 | # with hdfs_client.write('/htmls/mfw/%s.html' % (filename)) as writer:
45 | # writer.write(html_page)
46 |
47 | dbmanager.finishUrl(cur_url)
48 | except urllib2.HTTPError, Arguments:
49 | print Arguments
50 | return
51 | except httplib.BadStatusLine, Arguments:
52 | print Arguments
53 | return
54 | except IOError, Arguments:
55 | print Arguments
56 | return
57 | except HdfsError, Arguments:
58 | print Arguments
59 | except Exception, Arguments:
60 | print Arguments
61 | return
62 | # print 'add ' + hashlib.md5(cur_url).hexdigest() + ' to list'
63 |
64 | html = etree.HTML(html_page.lower().decode('utf-8'))
65 | hrefs = html.xpath(u"//a")
66 |
67 | for href in hrefs:
68 | try:
69 | if 'href' in href.attrib:
70 | val = href.attrib['href']
71 | if val.find('javascript:') != -1:
72 | continue
73 | if val.startswith('http://') is False:
74 | if val.startswith('/'):
75 | val = 'http://www.mafengwo.cn' + val
76 | else:
77 | continue
78 | if val[-1] == '/':
79 | val = val[0:-1]
80 | dbmanager.enqueueUrl(val, 'new', depth + 1)
81 | except ValueError:
82 | continue
83 |
84 |
85 | max_num_thread = 5
86 | dbmanager = MongoRedisUrlManager()
87 |
88 | dbmanager.enqueueUrl("http://www.mafengwo.cn", 'new', 0)
89 |
90 | start_time = time.time()
91 | is_root_page = True
92 | threads = []
93 |
94 | CRAWL_DELAY = 0.6
95 |
96 | # use hdfs to save pages
97 | # hdfs_client = InsecureClient('http://54.223.92.169:50070', user='ec2-user')
98 |
99 | while True:
100 | curtask = dbmanager.dequeueUrl()
101 | print curtask
102 | # Go on next level, before that, needs to wait all current level crawling done
103 | if curtask is None:
104 | print 'No task available!'
105 | for t in threads:
106 | t.join()
107 | break
108 |
109 | # looking for an empty thread from pool to crawl
110 |
111 | if is_root_page is True:
112 | get_page_content(curtask['url'], curtask['depth'])
113 | is_root_page = False
114 | else:
115 | while True:
116 | # first remove all finished running threads
117 | for t in threads:
118 | if not t.is_alive():
119 | threads.remove(t)
120 | if len(threads) >= max_num_thread:
121 | time.sleep(CRAWL_DELAY)
122 | continue
123 | try:
124 | t = threading.Thread(target=get_page_content, name=None, args=(curtask['url'], curtask['depth']))
125 | threads.append(t)
126 | # set daemon so main thread can exit when receives ctrl-c
127 | t.setDaemon(True)
128 | t.start()
129 | time.sleep(CRAWL_DELAY)
130 | break
131 | except Exception, Arguments:
132 | print 'Unable to start thread: ' + str(Arguments)
133 | break
134 |
135 | # clear redis and mongo
136 | # dbmanager.clear()
--------------------------------------------------------------------------------
/第二讲 爬虫基本原理、搭建第一个爬虫.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangguo7/distributed_crawler/0557783408cffa22d197a439baa0dc5a3efe724f/第二讲 爬虫基本原理、搭建第一个爬虫.pdf
--------------------------------------------------------------------------------
/第二讲代码.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangguo7/distributed_crawler/0557783408cffa22d197a439baa0dc5a3efe724f/第二讲代码.zip
--------------------------------------------------------------------------------
/第二讲代码/dbmanager.py:
--------------------------------------------------------------------------------
1 | import mysql.connector
2 | import hashlib
3 | from mysql.connector import errorcode, pooling
4 |
5 |
6 | class CrawlDatabaseManager:
7 |
8 | DB_NAME = 'mfw_pro_crawl'
9 |
10 | SERVER_IP = 'localhost'
11 |
12 | TABLES = {}
13 | # create new table, using sql
14 | TABLES['urls'] = (
15 | "CREATE TABLE `urls` ("
16 | " `index` int(11) NOT NULL AUTO_INCREMENT," # index of queue
17 | " `url` varchar(512) NOT NULL,"
18 | " `md5` varchar(16) NOT NULL,"
19 | " `status` varchar(11) NOT NULL DEFAULT 'new'," # could be new, downloading and finish
20 | " `depth` int(11) NOT NULL,"
21 | " `queue_time` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP,"
22 | " `done_time` timestamp NOT NULL DEFAULT 0 ON UPDATE CURRENT_TIMESTAMP,"
23 | " PRIMARY KEY (`index`),"
24 | " UNIQUE KEY `md5` (`md5`)"
25 | ") ENGINE=InnoDB")
26 |
27 |
28 | def __init__(self, max_num_thread):
29 | # connect mysql server
30 | try:
31 | cnx = mysql.connector.connect(host=self.SERVER_IP, user='root')
32 | except mysql.connector.Error as err:
33 | if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
34 | print "Something is wrong with your user name or password"
35 | elif err.errno == errorcode.ER_BAD_DB_ERROR:
36 | print "Database does not exist"
37 | else:
38 | print 'Create Error ' + err.msg
39 | exit(1)
40 |
41 | cursor = cnx.cursor()
42 |
43 | # use database, create it if not exist
44 | try:
45 | cnx.database = self.DB_NAME
46 | except mysql.connector.Error as err:
47 | if err.errno == errorcode.ER_BAD_DB_ERROR:
48 | # create database and table
49 | self.create_database(cursor)
50 | cnx.database = self.DB_NAME
51 | self.create_tables(cursor)
52 | else:
53 | print err
54 | exit(1)
55 | finally:
56 | cursor.close()
57 | cnx.close()
58 |
59 | dbconfig = {
60 | "database": self.DB_NAME,
61 | "user": "root",
62 | "host": self.SERVER_IP,
63 | }
64 | self.cnxpool = mysql.connector.pooling.MySQLConnectionPool(pool_name="mypool",
65 | pool_size=max_num_thread,
66 | **dbconfig)
67 |
68 |
69 | # create databse
70 | def create_database(self, cursor):
71 | try:
72 | cursor.execute(
73 | "CREATE DATABASE {} DEFAULT CHARACTER SET 'utf8'".format(self.DB_NAME))
74 | except mysql.connector.Error as err:
75 | print "Failed creating database: {}".format(err)
76 | exit(1)
77 |
78 | def create_tables(self, cursor):
79 | for name, ddl in self.TABLES.iteritems():
80 | try:
81 | cursor.execute(ddl)
82 | except mysql.connector.Error as err:
83 | if err.errno == errorcode.ER_TABLE_EXISTS_ERROR:
84 | print 'create tables error ALREADY EXISTS'
85 | else:
86 | print 'create tables error ' + err.msg
87 | else:
88 | print 'Tables created'
89 |
90 |
91 | # put an url into queue
92 | def enqueueUrl(self, url, depth):
93 | con = self.cnxpool.get_connection()
94 | cursor = con.cursor()
95 | try:
96 | add_url = ("INSERT INTO urls (url, md5, depth) VALUES (%s, %s, %s)")
97 | data_url = (url, hashlib.md5(url).hexdigest(), depth)
98 | cursor.execute(add_url, data_url)
99 | # commit this transaction, please refer to "mysql transaction" for more info
100 | con.commit()
101 | except mysql.connector.Error as err:
102 | # print 'enqueueUrl() ' + err.msg
103 | return
104 | finally:
105 | cursor.close()
106 | con.close()
107 |
108 |
109 | # get an url from queue
110 | def dequeueUrl(self):
111 | con = self.cnxpool.get_connection()
112 | cursor = con.cursor(dictionary=True)
113 | try:
114 | # use select * for update to lock the rows for read
115 | query = ("SELECT `index`, `url`, `depth` FROM urls WHERE status='new' ORDER BY `index` ASC LIMIT 1 FOR UPDATE")
116 | cursor.execute(query)
117 | if cursor.rowcount == 0:
118 | return None
119 | row = cursor.fetchone()
120 | update_query = ("UPDATE urls SET `status`='downloading' WHERE `index`=%d") % (row['index'])
121 | cursor.execute(update_query)
122 | con.commit()
123 | return row
124 | except mysql.connector.Error as err:
125 | # print 'dequeueUrl() ' + err.msg
126 | return None
127 | finally:
128 | cursor.close()
129 | con.close()
130 |
131 | def finishUrl(self, index):
132 | con = self.cnxpool.get_connection()
133 | cursor = con.cursor()
134 | try:
135 | # we don't need to update done_time using time.strftime('%Y-%m-%d %H:%M:%S') as it's auto updated
136 | update_query = ("UPDATE urls SET `status`='done' WHERE `index`=%d") % (index)
137 | cursor.execute(update_query)
138 | con.commit()
139 | except mysql.connector.Error as err:
140 | # print 'finishUrl() ' + err.msg
141 | return
142 | finally:
143 | cursor.close()
144 | con.close()
145 |
--------------------------------------------------------------------------------
/第二讲代码/lxml_test.py:
--------------------------------------------------------------------------------
1 | import lxml
2 | from lxml import html
3 | from lxml import etree
4 |
5 | from bs4 import BeautifulSoup
6 |
7 | f = open('jd.com_2131674.html', 'r')
8 | content = f.read()
9 |
10 | tree = etree.HTML(content.decode('utf-8'))
11 |
12 | print '--------------------------------------------'
13 | print '# different quote //*[@class="p-price J-p-2131674"]'
14 | print '--------------------------------------------'
15 | print tree.xpath(u"//*[@class='p-price J-p-2131674']")
16 | print ''
17 |
18 | print '--------------------------------------------'
19 | print '# partial match ' + "//*[@class='J-p-2131674']"
20 | print '--------------------------------------------'
21 | print tree.xpath(u"//*[@class='J-p-2131674']")
22 | print ''
23 |
24 | print '--------------------------------------------'
25 | print '# exactly match class string ' + '//*[@class="p-price J-p-2131674"]'
26 | print '--------------------------------------------'
27 | print tree.xpath(u'//*[@class="p-price J-p-2131674"]')
28 | print ''
29 |
30 | print '--------------------------------------------'
31 | print '# use contain ' + "//*[contains(@class, 'J-p-2131674')]"
32 | print '--------------------------------------------'
33 | print tree.xpath(u"//*[contains(@class, 'J-p-2131674')]")
34 | print ''
35 |
36 |
37 | print '--------------------------------------------'
38 | print '# specify tag name ' + "//strong[contains(@class, 'J-p-2131674')]"
39 | print '--------------------------------------------'
40 | print tree.xpath(u"//strong[contains(@class, 'J-p-2131674')]")
41 | print ''
42 |
43 | print '--------------------------------------------'
44 | print '# css selector with tag' + "cssselect('strong.J-p-2131674')"
45 | print '--------------------------------------------'
46 | htree = lxml.html.fromstring(content)
47 | print htree.cssselect('strong.J-p-2131674')
48 | print ''
49 |
50 | print '--------------------------------------------'
51 | print '# css selector without tag, partial match' + "cssselect('.J-p-2131674')"
52 | print '--------------------------------------------'
53 | htree = lxml.html.fromstring(content)
54 | elements = htree.cssselect('.J-p-2131674')
55 | print elements
56 | print ''
57 |
58 | print '--------------------------------------------'
59 | print '# attrib and text'
60 | print '--------------------------------------------'
61 | for element in tree.xpath(u"//strong[contains(@class, 'J-p-2131674')]"):
62 | print element.text
63 | print element.attrib
64 | print ''
65 |
66 | print '--------------------------------------------'
67 | print '########## use BeautifulSoup ##############'
68 | print '--------------------------------------------'
69 | print '# loading content to BeautifulSoup'
70 | soup = BeautifulSoup(content, 'html.parser')
71 | print '# loaded, show result'
72 | print soup.find(attrs={'class':'J-p-2131674'}).text
73 |
74 | f.close()
75 |
--------------------------------------------------------------------------------
/第二讲代码/multi_thread_mfw.py:
--------------------------------------------------------------------------------
1 | import urllib2
2 | from collections import deque
3 | import json
4 | from lxml import etree
5 | import httplib
6 | import hashlib
7 | from pybloomfilter import BloomFilter
8 | import thread
9 | import threading
10 | import time
11 |
12 |
13 | class CrawlBSF:
14 | request_headers = {
15 | 'host': "www.mafengwo.cn",
16 | 'connection': "keep-alive",
17 | 'cache-control': "no-cache",
18 | 'upgrade-insecure-requests': "1",
19 | 'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
20 | 'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
21 | 'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6"
22 | }
23 |
24 | cur_level = 0
25 | max_level = 5
26 | dir_name = 'iterate/'
27 | iter_width = 50
28 | downloaded_urls = []
29 |
30 | du_md5_file_name = dir_name + 'download.txt'
31 | du_url_file_name = dir_name + 'urls.txt'
32 |
33 | bloom_downloaded_urls = BloomFilter(1024 * 1024 * 16, 0.01)
34 | bloom_url_queue = BloomFilter(1024 * 1024 * 16, 0.01)
35 |
36 | cur_queue = deque()
37 | child_queue = deque()
38 |
39 | def __init__(self, url):
40 | self.root_url = url
41 | self.cur_queue.append(url)
42 | self.du_file = open(self.du_url_file_name, 'a+')
43 | try:
44 | self.dumd5_file = open(self.du_md5_file_name, 'r')
45 | self.downloaded_urls = self.dumd5_file.readlines()
46 | self.dumd5_file.close()
47 | for urlmd5 in self.downloaded_urls:
48 | self.bloom_downloaded_urls.add(urlmd5[:-2])
49 | except IOError:
50 | print "File not found"
51 | finally:
52 | self.dumd5_file = open(self.du_md5_file_name, 'a+')
53 |
54 | def enqueueUrl(self, url):
55 | if url not in self.bloom_url_queue and hashlib.md5(url).hexdigest() not in self.bloom_downloaded_urls:
56 | self.child_queue.append(url)
57 | self.bloom_url_queue.add(url)
58 |
59 | def dequeuUrl(self):
60 | try:
61 | url = self.cur_queue.popleft()
62 | return url
63 | except IndexError:
64 | return None
65 |
66 | def close(self):
67 | self.dumd5_file.close()
68 | self.du_file.close()
69 |
70 |
71 | num_downloaded_pages = 0
72 |
73 |
74 | #download the page content
75 | def get_page_content(cur_url):
76 | global num_downloaded_pages
77 | print "downloading %s at level %d" % (cur_url, crawler.cur_level)
78 | try:
79 | req = urllib2.Request(cur_url, headers=crawler.request_headers)
80 | response = urllib2.urlopen(req)
81 | html_page = response.read()
82 | filename = cur_url[7:].replace('/', '_')
83 | fo = open("%s%s.html" % (crawler.dir_name, filename), 'wb+')
84 | fo.write(html_page)
85 | fo.close()
86 | except urllib2.HTTPError, Arguments:
87 | print Arguments
88 | return
89 | except httplib.BadStatusLine, Arguments:
90 | print Arguments
91 | return
92 | except IOError, Arguments:
93 | print Arguments
94 | return
95 | except Exception, Arguments:
96 | print Arguments
97 | return
98 | # print 'add ' + hashlib.md5(cur_url).hexdigest() + ' to list'
99 |
100 | # save page and set bloomfilter
101 | dumd5 = hashlib.md5(cur_url).hexdigest()
102 | crawler.downloaded_urls.append(dumd5)
103 | crawler.dumd5_file.write(dumd5 + '\r\n')
104 | crawler.du_file.write(cur_url + '\r\n')
105 | crawler.bloom_downloaded_urls.add(dumd5)
106 | num_downloaded_pages += 1
107 |
108 | html = etree.HTML(html_page.lower().decode('utf-8'))
109 | hrefs = html.xpath(u"//a")
110 |
111 | for href in hrefs:
112 | try:
113 | if 'href' in href.attrib:
114 | val = href.attrib['href']
115 | if val.find('javascript:') != -1:
116 | continue
117 |                 if not val.startswith('http://'):
118 | if val.startswith('/'):
119 | val = 'http://www.mafengwo.cn' + val
120 | else:
121 | continue
122 | if val[-1] == '/':
123 | val = val[0:-1]
124 | # if hashlib.md5(val).hexdigest() not in self.downloaded_urls:
125 | crawler.enqueueUrl(val)
126 | # else:
127 | # print 'Skip %s' % (val)
128 | except ValueError:
129 | continue
130 |
131 |
132 | crawler = CrawlBSF("http://www.mafengwo.cn")
133 | start_time = time.time()
134 |
135 | # the first page (start url) is crawled synchronously (blocking) in the main thread;
136 | # subsequent pages are fetched asynchronously by spawning worker threads
137 | is_root_page = True
138 | threads = []
139 | max_threads = 10
140 |
141 | CRAWL_DELAY = 0.6
142 |
143 | while True:
144 |     url = crawler.dequeueUrl()
145 |     # before moving on to the next level, wait for all crawling threads of the current level to finish
146 | if url is None:
147 | crawler.cur_level += 1
148 | for t in threads:
149 | t.join()
150 | if crawler.cur_level == crawler.max_level:
151 | break
152 | if len(crawler.child_queue) == 0:
153 | break
154 | crawler.cur_queue = crawler.child_queue
155 | crawler.child_queue = deque()
156 | continue
157 |
158 |
159 |     # wait for a free slot in the thread pool, then hand the URL to a worker thread
160 |
161 |     if is_root_page:
162 | get_page_content(url)
163 | is_root_page = False
164 | else:
165 | while True:
166 |             # first, prune threads that have finished running
167 | for t in threads:
168 | if not t.is_alive():
169 | threads.remove(t)
170 | if len(threads) >= max_threads:
171 | time.sleep(CRAWL_DELAY)
172 | continue
173 | try:
174 | t = threading.Thread(target=get_page_content, name=None, args=(url,))
175 | threads.append(t)
176 |                 # mark the thread as daemon so the main thread can exit when it receives Ctrl-C
177 | t.setDaemon(True)
178 | t.start()
179 | time.sleep(CRAWL_DELAY)
180 | break
181 | except Exception:
182 | print "Error: unable to start thread"
183 |
184 | print '%d pages downloaded, time cost %0.2f seconds' % (num_downloaded_pages, time.time()-start_time)
185 | crawler.close()  # flush and close the url/md5 bookkeeping files
--------------------------------------------------------------------------------
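multi_thread_mfw.py deduplicates work with two Bloom filters: one holds URLs that have already been queued, the other holds md5 digests of pages that have already been saved to disk. The sketch below isolates just that idea, reusing the BloomFilter(capacity, error_rate) constructor call from the script above; the names seen_urls, finished_md5, should_crawl and mark_finished are illustrative and do not appear in the original code.

import hashlib

from pybloomfilter import BloomFilter

# one filter for URLs that have already been queued,
# one for md5 digests of pages that have already been saved to disk
seen_urls = BloomFilter(1024 * 1024 * 16, 0.01)
finished_md5 = BloomFilter(1024 * 1024 * 16, 0.01)


def should_crawl(url):
    # Bloom filters can return rare false positives (a URL wrongly reported as
    # seen, so it is skipped), but never false negatives, so no URL is fetched twice
    if url in seen_urls:
        return False
    if hashlib.md5(url).hexdigest() in finished_md5:
        return False
    seen_urls.add(url)
    return True


def mark_finished(url):
    # call this after the page has been written to disk
    finished_md5.add(hashlib.md5(url).hexdigest())

The trade-off is memory for accuracy: a filter sized for roughly 16 million entries at a 1% error rate is far cheaper than keeping every visited URL in a set, at the cost of occasionally skipping a URL that was never actually crawled.

--------------------------------------------------------------------------------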
/第二讲代码/process_crawl.py:
--------------------------------------------------------------------------------
1 | import urllib2
2 | from collections import deque
3 | import json
4 | from lxml import etree
5 | import httplib
6 | import hashlib
7 | from pybloomfilter import BloomFilter
8 | import thread
9 | import threading
10 | import time
11 | from dbmanager import CrawlDatabaseManager
12 |
13 | from mysql.connector import errorcode
14 | import mysql.connector
15 |
16 | request_headers = {
17 | 'host': "www.mafengwo.cn",
18 | 'connection': "keep-alive",
19 | 'cache-control': "no-cache",
20 | 'upgrade-insecure-requests': "1",
21 | 'user-agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
22 | 'accept': "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
23 | 'accept-language': "zh-CN,en-US;q=0.8,en;q=0.6"
24 | }
25 |
26 | def get_page_content(cur_url, index, depth):
27 | print "downloading %s at level %d" % (cur_url, depth)
28 | try:
29 | req = urllib2.Request(cur_url, headers=request_headers)
30 | response = urllib2.urlopen(req)
31 | html_page = response.read()
32 | filename = cur_url[7:].replace('/', '_')
33 | fo = open("%s%s.html" % (dir_name, filename), 'wb+')
34 | fo.write(html_page)
35 | fo.close()
36 | dbmanager.finishUrl(index)
37 | except urllib2.HTTPError, Arguments:
38 | print Arguments
39 | return
40 | except httplib.BadStatusLine, Arguments:
41 | print Arguments
42 | return
43 | except IOError, Arguments:
44 | print Arguments
45 | return
46 | except Exception, Arguments:
47 | print Arguments
48 | return
49 | # print 'add ' + hashlib.md5(cur_url).hexdigest() + ' to list'
50 |
51 |     html = etree.HTML(html_page.decode('utf-8', 'ignore').lower())
52 | hrefs = html.xpath(u"//a")
53 |
54 | for href in hrefs:
55 | try:
56 | if 'href' in href.attrib:
57 | val = href.attrib['href']
58 | if val.find('javascript:') != -1:
59 | continue
60 |                 if not val.startswith('http://'):
61 | if val.startswith('/'):
62 | val = 'http://www.mafengwo.cn' + val
63 | else:
64 | continue
65 | if val[-1] == '/':
66 | val = val[0:-1]
67 | dbmanager.enqueueUrl(val, depth + 1)
68 |
69 | except ValueError:
70 | continue
71 |
72 |
73 | max_num_thread = 5
74 |
75 | # create an instance of the MySQL-backed database manager, which serves as the crawl queue
76 | dbmanager = CrawlDatabaseManager(max_num_thread)
77 |
78 | # dir for saving HTML files
79 | dir_name = 'dir_process/'
80 |
81 | # put first page into queue
82 | dbmanager.enqueueUrl("http://www.mafengwo.cn", 0)
83 | start_time = time.time()
84 | is_root_page = True
85 | threads = []
86 |
87 | # time delay before a new crawling thread is created;
88 | # the delay throttles the download rate so that the target website
89 | # is not visited too frequently
90 | CRAWL_DELAY = 0.6
91 |
92 |
93 | while True:
94 | curtask = dbmanager.dequeueUrl()
95 |     # the queue is drained: wait for all running crawling threads to finish, then exit
96 | if curtask is None:
97 | for t in threads:
98 | t.join()
99 | break
100 |
101 |     # wait for a free slot in the thread pool, then hand the task to a worker thread
102 |
103 |     if is_root_page:
104 | get_page_content(curtask['url'], curtask['index'], curtask['depth'])
105 | is_root_page = False
106 | else:
107 | while True:
108 |             # first, prune threads that have finished running
109 | for t in threads:
110 | if not t.is_alive():
111 | threads.remove(t)
112 | if len(threads) >= max_num_thread:
113 | time.sleep(CRAWL_DELAY)
114 | continue
115 | try:
116 | t = threading.Thread(target=get_page_content, name=None, args=(curtask['url'], curtask['index'], curtask['depth']))
117 | threads.append(t)
118 |                 # mark the thread as daemon so the main thread can exit when it receives Ctrl-C
119 | t.setDaemon(True)
120 | t.start()
121 | time.sleep(CRAWL_DELAY)
122 | break
123 | except Exception:
124 | print "Error: unable to start thread"
125 |
126 | # database connection handling lives in CrawlDatabaseManager (dbmanager.py);
127 | # this script has no cursor or connection object of its own to close
128 |
--------------------------------------------------------------------------------
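process_crawl.py treats CrawlDatabaseManager purely as a work queue: enqueueUrl(url, depth) adds a link, dequeueUrl() returns a pending task as a dict with 'url', 'index' and 'depth' keys (or None once the queue is drained), and finishUrl(index) marks a page as downloaded. Below is a minimal in-memory stand-in for that interface, inferred only from the calls made in process_crawl.py; it is not the MySQL-backed implementation in dbmanager.py, and the class name InMemoryCrawlQueue is invented for illustration.

from collections import deque


class InMemoryCrawlQueue(object):
    """Illustrative stand-in for CrawlDatabaseManager; not backed by MySQL."""

    def __init__(self):
        self.tasks = deque()
        self.seen = set()
        self.next_index = 0

    def enqueueUrl(self, url, depth):
        # skip URLs that were queued before; the MySQL-backed manager would
        # presumably perform this deduplication in the database instead
        if url in self.seen:
            return
        self.seen.add(url)
        self.tasks.append({'url': url, 'index': self.next_index, 'depth': depth})
        self.next_index += 1

    def dequeueUrl(self):
        # return the next pending task, or None once the queue is drained
        try:
            return self.tasks.popleft()
        except IndexError:
            return None

    def finishUrl(self, index):
        # the real manager presumably marks the row with this index as downloaded;
        # nothing needs to be recorded in memory
        pass

Because the interface is this small, the same crawling loop can run against an in-memory queue for quick tests and against the MySQL-backed manager when several crawler processes need to share one queue.

--------------------------------------------------------------------------------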
/第五讲 PageRank、动态重拍、避开网站反爬的技术.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangguo7/distributed_crawler/0557783408cffa22d197a439baa0dc5a3efe724f/第五讲 PageRank、动态重拍、避开网站反爬的技术.pdf
--------------------------------------------------------------------------------
/第四讲 爬虫与反爬虫的对抗.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangguo7/distributed_crawler/0557783408cffa22d197a439baa0dc5a3efe724f/第四讲 爬虫与反爬虫的对抗.pdf
--------------------------------------------------------------------------------
/第四讲代码.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhangguo7/distributed_crawler/0557783408cffa22d197a439baa0dc5a3efe724f/第四讲代码.zip
--------------------------------------------------------------------------------