5]
50 | # print ip_port_type_list, len(ip_port_type_list)
51 | matched_ip_port_list = [item for item in ip_port_type_list if item[-1] != 'HTTPS']
52 | # print matched_ip_port_list, len(matched_ip_port_list)
53 | com_str_list = [":".join((item[0], item[1])) for item in matched_ip_port_list]
54 | http_proxy_list.extend(com_str_list)
55 |
56 | #过滤已被百度查封的IP
57 | com_str_list = [item for item in http_proxy_list if request_in_baidu(item)]
58 | filename = os.path.join(PATH, 'sys', 'xici_proxy')
59 | with codecs.open(filename, mode='wb', encoding='utf-8') as wf:
60 | com_str_list = [item+'\n' for item in com_str_list]
61 | wf.writelines(com_str_list)
62 | gen_proxy()
63 |
64 |
65 |
66 |
--------------------------------------------------------------------------------
/jd/varify_proxy.py:
--------------------------------------------------------------------------------
1 | __author__ = 'huafeng'
2 | #coding:utf-8
3 | import urllib2
4 | import os
5 | import re
6 | import codecs
7 | import time
8 | import random
9 | from bs4 import BeautifulSoup
10 | SLEEP_INTERVAL = random.randint(5,10)
11 |
12 | PATH = os.path.dirname(os.path.abspath(__file__))
13 |
def varify(ip_port):
    """Check that *ip_port* ('ip:port') works as an HTTP proxy against JD.

    Installs the proxy globally via urllib2, fetches a fixed product page
    and looks for the product-detail div.  Returns True only when the page
    parses AND the request finished in under one second; returns None for
    a dead/blocked proxy, a slow proxy, or a timeout.
    """
    url = 'http://item.jd.com/1013330.html'
    # NOTE: install_opener mutates urllib2's global state for later callers.
    proxy_handler = urllib2.ProxyHandler({'http': 'http://%s' % ip_port})
    opener = urllib2.build_opener(proxy_handler)
    urllib2.install_opener(opener)
    try:
        start_time = time.time()
        html = urllib2.urlopen(url, timeout=10).read()
        soup = BeautifulSoup(html)
        content = soup.find('div', id='product-detail-1')
        time_consume = time.time() - start_time
        if not content:
            # Page fetched but the expected div is missing: the proxy is
            # probably serving a block/interstitial page.
            print('invalid ip_port:%s' % ip_port)
            return
        if time_consume < 1:  # was 'elif content and ...' -- 'content' is redundant here
            print('success ip_port:%s' % ip_port)
            print(time_consume)
            return True
        # a working-but-slow proxy falls through and is rejected (returns None)
    except Exception:  # narrowed from a bare except; any failure counts as invalid
        print('timed item_id...in ip_port:%s' % ip_port)
        return
36 | # varify('')
37 |
def write_proxy_into_file(http_proxy_list):
    """Append every proxy from *http_proxy_list* that passes varify()
    to the local 'xici_proxy' file, one 'ip:port' per line."""
    verified_lines = []
    for proxy in http_proxy_list:
        if varify(proxy):
            verified_lines.append(proxy + '\n')
    out_path = os.path.join(PATH, 'xici_proxy')
    with codecs.open(out_path, mode='a', encoding='utf-8') as out_file:
        out_file.writelines(verified_lines)
43 |
def gen_proxy():
    """Scrape pages 1-5 of xici's high-anonymity proxy list and hand every
    harvested non-HTTPS 'ip:port' string to write_proxy_into_file()."""
    url_pattern = "http://www.xici.net.co/nn/%s"
    url_list = [url_pattern % str(i) for i in range(1, 6)]
    http_proxy_list = []
    for url in url_list:
        try:
            html = urllib2.urlopen(url, timeout=15).read()
        except Exception:  # narrowed from bare except
            # back off once, then retry; skip the page on a second failure
            time.sleep(60)
            try:
                html = urllib2.urlopen(url, timeout=15).read()
            except Exception:
                continue
        soup = BeautifulSoup(html)
        rows = soup.find_all('tr')
        cells_per_row = [row.find_all('td') for row in rows]
        # xici table columns: 1 = ip, 2 = port, 5 = protocol type
        ip_port_type_list = [(cells[1].text, cells[2].text, cells[5].text)
                             for cells in cells_per_row if len(cells) > 5]
        http_only = [rec for rec in ip_port_type_list if rec[-1] != 'HTTPS']
        http_proxy_list.extend(":".join((rec[0], rec[1])) for rec in http_only)
    write_proxy_into_file(http_proxy_list)

if __name__ == "__main__":
    gen_proxy()
67 |
68 |
--------------------------------------------------------------------------------
/proxy/xici.py:
--------------------------------------------------------------------------------
1 | __author__ = 'huafeng'
2 | #coding:utf-8
3 | import urllib2
4 | import os
5 | import re
6 | import codecs
7 | import time
8 | import random
9 | from bs4 import BeautifulSoup
10 | SLEEP_INTERVAL = random.randint(5,10)
11 |
12 | PATH = os.path.dirname(os.path.abspath(__file__))
13 |
def request_in_douban(ip_port):
    """Return True if *ip_port* ('ip:port') works as an HTTP proxy for douban.

    Fetches a fixed movie page through the proxy and checks that the
    movie-title span is present (it is absent on douban's block page).
    Returns None when the proxy is blocked, broken, or times out.
    """
    url = 'http://movie.douban.com/subject/6786002/'
    http_proxy = 'http://%s' % ip_port
    proxy_handler = urllib2.ProxyHandler({'http': http_proxy})
    opener = urllib2.build_opener(proxy_handler)
    # NOTE: install_opener mutates urllib2's global state for later callers.
    urllib2.install_opener(opener)
    try:
        html = urllib2.urlopen(url, timeout=15)
        soup = BeautifulSoup(html)
        title = soup.find('span', property='v:itemreviewed')
        if not title:
            # page fetched but no title span: douban has banned this IP
            print('invalid ip_port:%s' % ip_port)
            return
        print('success ip_port:%s' % ip_port)
        return True
    except Exception:  # narrowed from a bare except; any failure counts as invalid
        print('timed item_id...in ip_port:%s' % ip_port)
        return
36 |
def write_proxy_into_file(http_proxy_list):
    """Keep only the proxies that still work against douban and overwrite
    sys/xici_proxy with them, one 'ip:port' per line."""
    # filter out IPs douban has already banned
    usable = []
    for proxy in http_proxy_list:
        if request_in_douban(proxy):
            usable.append(proxy + '\n')
    target = os.path.join(PATH, 'sys', 'xici_proxy')
    with codecs.open(target, mode='wb', encoding='utf-8') as wf:
        wf.writelines(usable)
43 |
def gen_proxy():
    """Scrape pages 1-3 of xici's high-anonymity proxy list and hand every
    harvested non-HTTPS 'ip:port' string to write_proxy_into_file()."""
    url_pattern = "http://www.xici.net.co/nn/%s"
    url_list = [url_pattern % str(i) for i in range(1, 4)]
    http_proxy_list = []
    for url in url_list:
        try:
            html = urllib2.urlopen(url, timeout=15).read()
        except Exception:  # narrowed from bare except
            # back off once, then retry; skip the page on a second failure
            time.sleep(60)
            try:
                html = urllib2.urlopen(url, timeout=15).read()
            except Exception:
                continue
        soup = BeautifulSoup(html)
        rows = soup.find_all('tr')
        cells_per_row = [row.find_all('td') for row in rows]
        # xici table columns: 1 = ip, 2 = port, 5 = protocol type
        ip_port_type_list = [(cells[1].text, cells[2].text, cells[5].text)
                             for cells in cells_per_row if len(cells) > 5]
        http_only = [rec for rec in ip_port_type_list if rec[-1] != 'HTTPS']
        http_proxy_list.extend(":".join((rec[0], rec[1])) for rec in http_only)

    write_proxy_into_file(http_proxy_list)

# runs at import time (kept for compatibility with existing usage)
gen_proxy()
71 |
72 |
--------------------------------------------------------------------------------
/douban/xici_proxy.py:
--------------------------------------------------------------------------------
1 | __author__ = 'huafeng'
2 | #coding:utf-8
3 | import urllib2
4 | import os
5 | import re
6 | import codecs
7 | import time
8 | import random
9 | from bs4 import BeautifulSoup
10 | SLEEP_INTERVAL = random.randint(5,10)
11 |
12 | PATH = os.path.dirname(os.path.abspath(__file__))
13 |
def request_in_douban(ip_port):
    """Return True if *ip_port* ('ip:port') works as an HTTP proxy for douban.

    Fetches a fixed movie page through the proxy and checks that the
    movie-title span is present (it is absent on douban's block page).
    Returns None when the proxy is blocked, broken, or times out.
    """
    url = 'http://movie.douban.com/subject/6786002/'
    http_proxy = 'http://%s' % ip_port
    proxy_handler = urllib2.ProxyHandler({'http': http_proxy})
    opener = urllib2.build_opener(proxy_handler)
    # NOTE: install_opener mutates urllib2's global state for later callers.
    urllib2.install_opener(opener)
    try:
        html = urllib2.urlopen(url, timeout=10)
        soup = BeautifulSoup(html)
        title = soup.find('span', property='v:itemreviewed')
        if not title:
            # page fetched but no title span: douban has banned this IP
            print('invalid ip_port:%s' % ip_port)
            return
        print('success ip_port:%s' % ip_port)
        return True
    except Exception:  # narrowed from a bare except; any failure counts as invalid
        print('timed item_id...in ip_port:%s' % ip_port)
        return
36 |
def write_proxy_into_file(http_proxy_list):
    """Keep only the proxies that still work against douban and overwrite
    sys/xici_proxy with them, one 'ip:port' per line."""
    # filter out IPs douban has already banned
    survivors = []
    for candidate in http_proxy_list:
        if request_in_douban(candidate):
            survivors.append(candidate + '\n')
    out_path = os.path.join(PATH, 'sys', 'xici_proxy')
    with codecs.open(out_path, mode='wb', encoding='utf-8') as out_f:
        out_f.writelines(survivors)
43 |
def gen_proxy():
    """Scrape pages 1-5 of xici's high-anonymity proxy list and hand every
    harvested non-HTTPS 'ip:port' string to write_proxy_into_file()."""
    url_pattern = "http://www.xici.net.co/nn/%s"
    url_list = [url_pattern % str(i) for i in range(1, 6)]
    http_proxy_list = []
    for url in url_list:
        try:
            html = urllib2.urlopen(url, timeout=15).read()
        except Exception:  # narrowed from bare except
            # back off once, then retry; skip the page on a second failure
            time.sleep(60)
            try:
                html = urllib2.urlopen(url, timeout=15).read()
            except Exception:
                continue
        soup = BeautifulSoup(html)
        rows = soup.find_all('tr')
        cells_per_row = [row.find_all('td') for row in rows]
        # xici table columns: 1 = ip, 2 = port, 5 = protocol type
        ip_port_type_list = [(cells[1].text, cells[2].text, cells[5].text)
                             for cells in cells_per_row if len(cells) > 5]
        http_only = [rec for rec in ip_port_type_list if rec[-1] != 'HTTPS']
        http_proxy_list.extend(":".join((rec[0], rec[1])) for rec in http_only)

    write_proxy_into_file(http_proxy_list)

# runs at import time (kept for compatibility with existing usage)
gen_proxy()
71 |
72 |
--------------------------------------------------------------------------------
/dangdang/jewellery_glass_watch/jewellery/sys/jewellery_item_id:
--------------------------------------------------------------------------------
1 | http://product.dangdang.com/60545118.html
2 | http://product.dangdang.com/60329284.html
3 | http://product.dangdang.com/60561956.html
4 | http://product.dangdang.com/1135936507.html
5 | http://product.dangdang.com/60326136.html
6 | http://product.dangdang.com/60217143.html
7 | http://product.dangdang.com/60275012.html
8 | http://product.dangdang.com/60095332.html
9 | http://product.dangdang.com/60296404.html
10 | http://product.dangdang.com/60312091.html
11 | http://product.dangdang.com/1102856607.html
12 | http://product.dangdang.com/1103019307.html
13 | http://product.dangdang.com/60543004.html
14 | http://product.dangdang.com/60561747.html
15 | http://product.dangdang.com/60307526.html
16 | http://product.dangdang.com/60556879.html
17 | http://product.dangdang.com/60079298.html
18 | http://product.dangdang.com/60324171.html
19 | http://product.dangdang.com/60328975.html
20 | http://product.dangdang.com/60563407.html
21 | http://product.dangdang.com/1226033608.html
22 | http://product.dangdang.com/1103073107.html
23 | http://product.dangdang.com/1205883106.html
24 | http://product.dangdang.com/60543166.html
25 | http://product.dangdang.com/60545362.html
26 | http://product.dangdang.com/1219193022.html
27 | http://product.dangdang.com/60329283.html
28 | http://product.dangdang.com/60293398.html
29 | http://product.dangdang.com/60311152.html
30 | http://product.dangdang.com/60542979.html
31 | http://product.dangdang.com/60559721.html
32 | http://product.dangdang.com/1103111307.html
33 | http://product.dangdang.com/60554268.html
34 | http://product.dangdang.com/1104669412.html
35 | http://product.dangdang.com/60313678.html
36 | http://product.dangdang.com/60564171.html
37 | http://product.dangdang.com/1175804808.html
38 | http://product.dangdang.com/60304263.html
39 | http://product.dangdang.com/60542966.html
40 | http://product.dangdang.com/1062967608.html
41 | http://product.dangdang.com/60556746.html
42 | http://product.dangdang.com/60542968.html
43 | http://product.dangdang.com/1137365805.html
44 | http://product.dangdang.com/1025107506.html
45 | http://product.dangdang.com/60571706.html
46 | http://product.dangdang.com/1062940608.html
47 | http://product.dangdang.com/1365805202.html
48 | http://product.dangdang.com/1159089022.html
49 | http://product.dangdang.com/1080116522.html
50 | http://product.dangdang.com/1108646412.html
51 | http://product.dangdang.com/1066589907.html
52 | http://product.dangdang.com/1173284507.html
53 | http://product.dangdang.com/60323221.html
54 | http://product.dangdang.com/1112093507.html
55 | http://product.dangdang.com/60020047.html
56 | http://product.dangdang.com/60079299.html
57 | http://product.dangdang.com/60567912.html
58 | http://product.dangdang.com/60547504.html
59 |
--------------------------------------------------------------------------------
/jd/read_config.py:
--------------------------------------------------------------------------------
1 | __author__ = 'huafeng'
2 | import re
3 | import urllib2
4 | import urllib
5 | import time
6 | import cookielib
7 | import ConfigParser
8 | from bs4 import BeautifulSoup
9 |
def read_config():
    """Load config.ini from the working directory and print the list of
    files actually read plus the [computer] section's crawled_id_filename."""
    parser = ConfigParser.ConfigParser()
    loaded_files = parser.read('config.ini')
    print(loaded_files)
    spiders_name = parser.sections()  # section names; currently unused
    print(parser.get('computer', 'crawled_id_filename'))
18 | # read_config()
19 | def read_one_item_id():
20 | url = 'http://list.jd.com/6233-6236-6254-0-0-0-0-0-0-0-1-1-625-1-1-72-4137-33.html'
21 | ip_port = '218.207.195.206:80'
22 | enable_proxy = False
23 | start_time = time.time()
24 | http_hanlder = urllib2.ProxyHandler({'http':'http://%s'%ip_port})
25 | null_http_hanlder = urllib2.ProxyHandler({})
26 | if enable_proxy:
27 | opener = urllib2.build_opener(http_hanlder)
28 | else:
29 | opener = urllib2.build_opener(null_http_hanlder)
30 | urllib2.install_opener(opener)
31 | html = urllib2.urlopen(url).read()
32 | item_id_list = re.findall(r"sku='(\d+)'>
5000:
80 | self.write_content_into_file()
81 | self.movie_info_list[:] = []
82 | time.sleep(SLEEP_INTERVAL)
83 |
84 | if self.timeout_url_list:
85 | filename = os.path.join(PATH, 'log', 'timeout_item_urls')
86 | with codecs.open(filename, mode='wb', encoding='utf-8') as wf:
87 | temp_list = ["".join((item, '\n')) for item in self.timeout_url_list]
88 | wf.writelines(temp_list)
89 |
90 | douban = DoubanHistory()
91 | douban.main()
--------------------------------------------------------------------------------
/baidu_word_freq/get_word_freq.py:
--------------------------------------------------------------------------------
1 | __author__ = 'huafeng'
2 | #coding:utf-8
3 | import urllib2
4 | import os
5 | import re
6 | import codecs
7 | import time
8 | from bs4 import BeautifulSoup
9 |
10 | PATH = os.path.dirname(os.path.abspath(__file__))
class CrawlFreq:
    """Query baidu for each word and report the advertised result count."""

    def __init__(self):
        # words to look up, loaded from sys/wrod_without_freq.txt
        self.words_list = []
        self._gen_words_list()

    def _gen_words_list(self):
        """Fill self.words_list from the word file, one stripped word per line."""
        source = os.path.join(PATH, 'sys', 'wrod_without_freq.txt')
        with codecs.open(source) as f:
            self.words_list.extend(line.strip() for line in f.readlines())

    def read_item_url(self):
        """Search baidu for every word and print '<word><TAB><count>' pairs.

        Failed URLs are retried once, then collected in timeout_url_list.
        Writing to the output file is currently disabled (commented out).
        """
        timeout_url_list = []
        url_pattern = 'http://www.baidu.com/s?wd="%s"&fr=wenku'
        num_pattern = re.compile(r'\d+')
        filename = os.path.join(PATH, 'out', 'words_freq_0528')
        with codecs.open(filename, mode='a')as f:
            # debug override: replaces the file-loaded list with a fixed batch of words
            self.words_list = [u'\u67cf\u79c0\u6f2b\u753b', u'\u5e2e\u4ec0\u4e48\u5fd9\u6bcf\u4e2a', u'\u5305\u5c71\u8001\u5988', u'\u523a\u9752\u6709\u8138', u'\u5361\u9e7f\u519c\u5e84', u'\u8ba9\u6cea\u5316\u4f5c\u76f8\u601d\u5440', u'\u5916\u59d3\u5144\u5f1f', u'\u665a\u4e0a\u4e0d\u7761', u'\u559c\u6b22\u505a\u7684', u'\u559c\u9a6c\u62c9\u96c5\u5c71', u'\u76f8\u8c8c\u6b66\u50e7', u'\u6cbf\u6c5f\u5927\u9053', u'\u4e00\u6839\u5934\u53d1', u'\u4e00\u89c9\u7761\u9192', u'\u4e00\u8def\u6b4c\u5531', u'\u4e00\u76f4\u4e0b\u53bb', u'\u4e00\u5b57\u4e00\u952e\u7ae0\u9c7c\u8f93\u5165', u'\u610f\u601d\u4e00\u4e0b', u'\u4f18\u8c08\u5b9d\u5b9d\u4e0d\u9519\u7684', u'\u6709\u8bdd\u597d\u597d\u8bf4', u'\u53c8\u4e0d\u597d\u5403', u'\u53c8\u4e0d\u80fd\u5403', u'\u4e0e\u4e0d\u7b11\u732b\u60f3\u53bb', u'\u5728\u7ed9\u6211\u8bf4', u'\u5728\u5bb6\u5462\u554a', u'\u5728\u54ea\u91cc\u4f4f', u'\u7cdf\u8001\u5934\u5b50', u'\u600e\u4e48\u4e0d\u56de\u6211', u'\u600e\u4e48\u8fc7\u53bb', u'\u600e\u4e48\u4e00\u4e0b', u'\u627e\u5230\u6ca1\u6709', u'\u627e\u5230\u4f60\u4e86', u'\u8fd9\u4e48\u660e\u663e', u'\u81ea\u5c0a\u5b59\u5b59', u'\u6700\u6f6e\u6728\u4e43\u4f0a', u'\u6700\u8001\u60c5\u503a', u'\u6700\u5f3a\u723d\u7ea6']
            for word in self.words_list:
                url = url_pattern % word.encode('gbk')
                try:
                    html = urllib2.urlopen(url, timeout=15).read()
                except:
                    # one retry; remember the URL and move on if it fails again
                    try:
                        html = urllib2.urlopen(url, timeout=15).read()
                    except:
                        timeout_url_list.append(url)
                        continue
                soup = BeautifulSoup(html)
                nums_spans = soup.find_all('span', class_='nums')
                if nums_spans:
                    # result-count text like '...8,080,000...'; join the digit runs
                    digits = "".join(num_pattern.findall(nums_spans[0].get_text()))
                    com_str = "\t".join((word, str(digits))) + '\n'
                else:
                    com_str = "\t".join((word, "0")) + '\n'
                print(com_str.strip())
                # f.write(com_str.encode('gbk'))
                # time.sleep(2)

        # NOTE: persisting timeout_url_list is currently disabled as well.

if __name__ == "__main__":
    freq_crawler = CrawlFreq()
    freq_crawler.read_item_url()
62 |
63 |
--------------------------------------------------------------------------------
/jd/book_jd/book_name.py:
--------------------------------------------------------------------------------
1 | __author__ = 'huafeng'
2 | #coding:utf-8
3 | import os
4 | import re
5 | import time
6 | import random
7 | import codecs
8 | import urllib2
9 | import xici_proxy
10 | from bs4 import BeautifulSoup
11 |
12 | PATH = os.path.dirname(os.path.abspath(__file__))
13 | def parse_topic_url():
14 | book_url = "http://book.jd.com/booksort.html"
15 | response = urllib2.urlopen(book_url)
16 | html = response.read()
17 | soup = BeautifulSoup(html)
18 | div_level_str = soup.find('div', id='booksort')
19 | em_level_list = div_level_str.find_all('em')
20 | topic_url_list = [item.a['href'] for item in em_level_list]
21 | print topic_url_list[32:-2],len(set(topic_url_list))
22 | return topic_url_list[32:-2]
23 | # parse_topic_url()
24 | def gen_whole_page_url():
25 | topic_url_list = parse_topic_url()
26 | book_page_url_filename = os.path.join(PATH, 'sys', 'book_name_whole_page_url')
27 | redirect_url_filename = os.path.join(PATH, 'log', 'redirect_page_url')
28 | timeout_url_filename = os.path.join(PATH, 'log', 'timeout_topic_url')
29 | with codecs.open(book_page_url_filename, mode='wb', encoding='utf-8') as wf_page_url,\
30 | codecs.open(redirect_url_filename, mode='wb', encoding='utf-8') as wf_redirect_url,\
31 | codecs.open(timeout_url_filename, mode='wb', encoding='utf-8')as wf_timeout_url:
32 | count = 0
33 | for topic_url in topic_url_list:
34 | count += 1
35 | page_url_list = []
36 | try:
37 | response = urllib2.urlopen(topic_url, timeout=10)
38 | if response.geturl() != topic_url:
39 | print 'redirect page hrer in url:%s'%topic_url
40 | except:
41 | wf_timeout_url.write(topic_url+'\n')
42 | print 'timed out item_id in url:%s'%topic_url
43 | continue
44 | html = response.read()
45 | soup = BeautifulSoup(html)
46 | max_page_str = soup.find('div', class_='pagin pagin-m')
47 | if not max_page_str:
48 | print 'max_page_str do not match regular expression in url:%s'%topic_url
49 | continue
50 | page_size = max_page_str.span.text.split('/')[-1]
51 | print count, page_size
52 | end_url_pattern = '?s=15&t=1&p=%s'
53 | for page_num in range(1, int(page_size)+1):
54 | url = ''.join((topic_url,end_url_pattern%page_num))
55 | page_url_list.append(url+'\n')
56 | wf_page_url.writelines(page_url_list)
57 | # gen_whole_page_url()
def read_topic_page_url_to_get_pagesize():
    """Debug helper: for one hard-coded listing URL, print its pager total
    and each per-page URL derived from it."""
    page_url = 'http://list.jd.com/1713-3265-3429.html'
    page = urllib2.urlopen(page_url).read()
    soup = BeautifulSoup(page)
    plist_div = soup.find('div', id='plist')
    anchors = plist_div.find_all('a', href=re.compile('http://item'), class_=None, title=True, target='_blank')
    item_url_list = set([a['href'] for a in anchors])  # unique item URLs (collected, not used below)
    pager_div = soup.find('div', class_='pagin pagin-m')
    if not pager_div:
        print('max_page_str is null')
        return
    # pager text looks like 'current/total'; take the total
    page_size = pager_div.span.text.split('/')[-1]
    print(page_size)
    end_url_pattern = '?s=15&t=1&p=%s'
    for page_num in range(1, int(page_size)+1):
        print(''.join((page_url, end_url_pattern%page_num)))
79 | # read_topic_page_url_to_get_pagesize()
--------------------------------------------------------------------------------
/jd/book_jd/xici_proxy.py:
--------------------------------------------------------------------------------
1 | __author__ = 'huafeng'
2 | #coding:utf-8
3 | import os
4 | import re
5 | import time
6 | import codecs
7 | import urllib2
8 | from bs4 import BeautifulSoup
9 |
10 | PATH = os.path.dirname(os.path.abspath(__file__))
11 |
def Varify_proxy(ip_port):
    '''Return True if *ip_port* ('ip:port') works as an HTTP proxy for a JD
    listing page, i.e. the fetched page still contains product items.

    Returns None for blocked/broken proxies or timeouts.
    '''
    # robustness: callers pass lines read from files that may end in '\n'
    ip_port = ip_port.strip()
    url = 'http://list.jd.com/1713-3260-3338-0-0-0-0-0-0-0-1-1-2.html'
    proxy_handler = urllib2.ProxyHandler({'http': 'http://%s' % ip_port})
    opener = urllib2.build_opener(proxy_handler)
    # NOTE: install_opener mutates urllib2's global state for later callers.
    urllib2.install_opener(opener)
    try:
        response = urllib2.urlopen(url, timeout=8).read().decode('gbk')
    except urllib2.URLError as e:  # 'as' form: valid on py2.6+ and py3
        if e.reason:
            print(e.reason)
        return
    except Exception:  # narrowed from a bare except
        print('timed item_id...in ip_port:%s' % ip_port)
        return

    soup = BeautifulSoup(response)
    plist_div = soup.find('div', id='plist')
    item_divs = plist_div.find_all('div', class_='item')
    item_id_list = [div['sku'] for div in item_divs]

    if item_id_list:
        print('success ip_port:%s' % ip_port)
        return True
    return
43 |
def write_proxy_into_file(http_proxy_list):
    '''Filter out banned/broken proxies and overwrite sys/xici_proxy with
    the working ones, one 'ip:port' per line.'''
    working = []
    for proxy in http_proxy_list:
        if Varify_proxy(proxy):
            working.append(proxy + '\n')
    target = os.path.join(PATH, 'sys', 'xici_proxy')
    with codecs.open(target, mode='wb', encoding='utf-8') as wf:
        wf.writelines(working)
50 |
def check_exists_proxy_file():
    '''Re-verify every proxy in sys/xici_proxy and rewrite the file with
    only the ones that still work; prints how many survived.'''
    proxy_filename = os.path.join(PATH, 'sys', 'xici_proxy')
    new_proxy_filename = os.path.join(PATH, 'sys', 'xici_proxy')  # same file, rewritten in place
    with codecs.open(proxy_filename, encoding='utf-8') as proxy_filename_f:
        # BUG FIX: readlines() keeps the trailing '\n'; strip it before
        # verification or the proxy URL is malformed.  The original line
        # (newline included) is still what gets written back.
        new_proxy_list = [item for item in proxy_filename_f.readlines() if Varify_proxy(item.strip())]
    with codecs.open(new_proxy_filename, mode='wb', encoding='utf-8') as new_proxy_filename_wf:
        new_proxy_filename_wf.writelines(new_proxy_list)
    print(len(new_proxy_list))
60 |
def get_valid_proxy(proxy_list):
    '''Return the subset of *proxy_list* that passes Varify_proxy.'''
    valid = []
    for candidate in proxy_list:
        if Varify_proxy(candidate):
            valid.append(candidate)
    return valid
65 |
def gen_proxy():
    """Scrape xici proxy-list pages 1-3, verify each non-HTTPS proxy, and
    stream the working 'ip:port' lines into sys/xici_proxy (overwritten)."""
    url_pattern = "http://www.xici.net.co/nn/%s"
    url_list = [url_pattern % str(i) for i in range(1, 4)]

    filename = os.path.join(PATH, 'sys', 'xici_proxy')
    with codecs.open(filename, mode='wb', encoding='utf-8') as wf:
        for url in url_list:
            try:
                html = urllib2.urlopen(url, timeout=15).read()
            except Exception:  # narrowed from bare except
                # back off once, then retry; give up on this page on a second failure
                time.sleep(60)
                try:
                    html = urllib2.urlopen(url, timeout=15).read()
                except Exception:
                    continue
            soup = BeautifulSoup(html)
            rows = soup.find_all('tr')
            cells_per_row = [row.find_all('td') for row in rows]
            # xici table columns: 1 = ip, 2 = port, 5 = protocol type
            ip_port_type_list = [(cells[1].text, cells[2].text, cells[5].text)
                                 for cells in cells_per_row if len(cells) > 5]
            http_only = [rec for rec in ip_port_type_list if rec[-1] != 'HTTPS']
            com_str_list = [":".join((rec[0], rec[1])) for rec in http_only]
            temp_proxy_list = [item + '\n' for item in com_str_list if Varify_proxy(item)]
            wf.writelines(temp_proxy_list)
89 |
if __name__ == "__main__":
    # gen_proxy()  # full scrape + verify; disabled
    check_exists_proxy_file()
--------------------------------------------------------------------------------
/baidu_word_freq/tst_freq.py:
--------------------------------------------------------------------------------
1 | __author__ = 'huafeng'
2 | #coding:utf-8
3 | import urllib2
4 | import os
5 | import re
6 | import codecs
7 | import time
8 | from bs4 import BeautifulSoup
9 |
10 | PATH = os.path.dirname(os.path.abspath(__file__))
class CrawlFreq:
    """Look up each word on baidu and store '<word><TAB><count>' results."""

    def __init__(self):
        # word list loaded from sys/wrod_without_freq.txt
        self.words_list = []
        self._gen_words_list()

    def _gen_words_list(self):
        """Load the words file, one stripped word per line."""
        filename = os.path.join(PATH, 'sys', 'wrod_without_freq.txt')
        with codecs.open(filename) as f:
            self.words_list.extend(line.strip() for line in f.readlines())

    def read_item_url(self):
        """Search baidu for every word, writing '<word><TAB><count>' lines
        to out/words_freq_0507; unreachable URLs go to a 'timeout_url' file."""
        timeout_url_list = []
        url_pattern = 'http://www.baidu.com/s?wd="%s"&fr=wenku'
        num_pattern = re.compile(r'\d+')
        filename = os.path.join(PATH, 'out', 'words_freq_0507')
        with codecs.open(filename, mode='wb')as f:
            for word in self.words_list:
                url = url_pattern % word
                try:
                    html = urllib2.urlopen(url, timeout=15).read()
                except:
                    # one retry; remember the URL and move on if it fails again
                    try:
                        html = urllib2.urlopen(url, timeout=15).read()
                    except:
                        timeout_url_list.append(url)
                        continue
                soup = BeautifulSoup(html)
                nums_spans = soup.find_all('span', class_='nums')
                if nums_spans:
                    # result-count text like '...8,080,000...'; join the digit runs
                    digits = "".join(num_pattern.findall(nums_spans[0].get_text()))
                    com_str = "\t".join((word, str(digits))) + '\n'
                else:
                    com_str = "\t".join((word, "0")) + '\n'
                f.write(com_str)
                time.sleep(2)  # stay polite to baidu

        if timeout_url_list:
            timeout_filename = os.path.join(PATH, 'timeout_url')
            with codecs.open(timeout_filename, mode='wb') as wf:
                wf.writelines(line + '\n' for line in timeout_url_list)
56 | # freq_crawler = CrawlFreq()
57 | # freq_crawler.read_item_url()
58 |
def use_num_pattern():
    """Demo: extract and join all digit runs from a sample baidu result line."""
    sample = "百度为您找到相关结果约8,080,000个"
    joined_digits = "".join(re.findall(r'\d+', sample))
    print(joined_digits)
def read_unformal_page():
    """Debug helper: dump the 'nums' spans for one quoted-phrase search."""
    url = 'http://www.baidu.com/s?wd="一字一键章鱼输入"&fr=wenku'
    page = urllib2.urlopen(url).read()
    spans = BeautifulSoup(page).find_all('span', class_='nums')
    print(spans)
70 | # read_unformal_page()
def get_omit_word():
    """Print (gbk-decoded) every word that is in the source word list but
    missing from the out/words_freq_0528 results file."""
    source_filename = os.path.join(PATH, 'sys', 'word_without_freq_0528.txt')
    with codecs.open(source_filename) as f:
        word_set = set(line.strip() for line in f.readlines())

    des_filename = os.path.join(PATH, 'out', 'words_freq_0528')
    with codecs.open(des_filename) as rf:
        # first tab-separated field of each result line is the word
        done_words = set(line.split('\t')[0] for line in rf.readlines())

    for word in word_set:
        if word not in done_words:
            print(word.decode('gbk'))
89 | # get_omit_word()
90 | def check_zero_num():
91 | filename = os.path.join(PATH, 'out', 'words_freq_0528')
92 | with open(filename) as f:
93 | zero_freq_list = []
94 | for line in f.readlines():
95 | splited_line = line.split('\t')
96 | word = splited_line[0]
97 | freq = splited_line[1].strip()
98 | if freq == '0':
99 | print line.decode('gbk')
100 | zero_freq_list.append(word.decode('gbk'))
101 | print zero_freq_list, len(zero_freq_list)
102 | # check_zero_num()
--------------------------------------------------------------------------------
/jd/electronic_jd/log/elec_failed_url:
--------------------------------------------------------------------------------
1 | div do not match pattern in url;http://item.jd.com/1125519418.html
2 | div do not match pattern in url;http://item.jd.com/1096499132.html
3 | div do not match pattern in url;http://item.jd.com/1014261178.html
4 | div do not match pattern in url;http://item.jd.com/1052779534.html
5 | div do not match pattern in url;http://item.jd.com/1051365121.html
6 | div do not match pattern in url;http://item.jd.com/1052636010.html
7 | div do not match pattern in url;http://item.jd.com/1052714871.html
8 | div do not match pattern in url;http://item.jd.com/1052747289.html
9 | div do not match pattern in url;http://item.jd.com/1052785449.html
10 | div do not match pattern in url;http://item.jd.com/1053007633.html
11 | div do not match pattern in url;http://item.jd.com/1053011719.html
12 | div do not match pattern in url;http://item.jd.com/1053021423.html
13 | div do not match pattern in url;http://item.jd.com/1053050579.html
14 | div do not match pattern in url;http://item.jd.com/1056009851.html
15 | div do not match pattern in url;http://item.jd.com/1062536835.html
16 | div do not match pattern in url;http://item.jd.com/1064935912.html
17 | div do not match pattern in url;http://item.jd.com/1066380089.html
18 | div do not match pattern in url;http://item.jd.com/1073344556.html
19 | div do not match pattern in url;http://item.jd.com/1082487043.html
20 | div do not match pattern in url;http://item.jd.com/646259.html
21 | div do not match pattern in url;http://item.jd.com/968372.html
22 | div do not match pattern in url;http://item.jd.com/968371.html
23 | div do not match pattern in url;http://item.jd.com/968373.html
24 | div do not match pattern in url;http://item.jd.com/1049616669.html
25 | div do not match pattern in url;http://item.jd.com/1049604169.html
26 | div do not match pattern in url;http://item.jd.com/1049713716.html
27 | div do not match pattern in url;http://item.jd.com/1049901825.html
28 | div do not match pattern in url;http://item.jd.com/1049864281.html
29 | div do not match pattern in url;http://item.jd.com/1049998153.html
30 | div do not match pattern in url;http://item.jd.com/1067645985.html
31 | div do not match pattern in url;http://item.jd.com/1067645986.html
32 | div do not match pattern in url;http://item.jd.com/1025080.html
33 | div do not match pattern in url;http://item.jd.com/1000297153.html
34 | div do not match pattern in url;http://item.jd.com/1010819419.html
35 | div do not match pattern in url;http://item.jd.com/1132494073.html
36 | div do not match pattern in url;http://item.jd.com/600316.html
37 | div do not match pattern in url;http://item.jd.com/600318.html
38 | div do not match pattern in url;http://item.jd.com/854174.html
39 | div do not match pattern in url;http://item.jd.com/1065790112.html
40 | div do not match pattern in url;http://item.jd.com/1068210793.html
41 | div do not match pattern in url;http://item.jd.com/1068210794.html
42 | div do not match pattern in url;http://item.jd.com/1068210795.html
43 | div do not match pattern in url;http://item.jd.com/1010471.html
44 | div do not match pattern in url;http://item.jd.com/271696.html
45 | div do not match pattern in url;http://item.jd.com/481325.html
46 | div do not match pattern in url;http://item.jd.com/1003116056.html
47 | div do not match pattern in url;http://item.jd.com/1041219087.html
48 | div do not match pattern in url;http://item.jd.com/1062607890.html
49 | div do not match pattern in url;http://item.jd.com/1091726881.html
50 | div do not match pattern in url;http://item.jd.com/1091729610.html
51 | div do not match pattern in url;http://item.jd.com/1031478255.html
52 | div do not match pattern in url;http://item.jd.com/1052374448.html
53 | div do not match pattern in url;http://item.jd.com/1059701191.html
54 | div do not match pattern in url;http://item.jd.com/904326.html
55 | div do not match pattern in url;http://item.jd.com/848334.html
56 | div do not match pattern in url;http://item.jd.com/518467.html
57 | div do not match pattern in url;http://item.jd.com/904327.html
58 | div do not match pattern in url;http://item.jd.com/1031373.html
59 | div do not match pattern in url;http://item.jd.com/925068.html
60 | div do not match pattern in url;http://item.jd.com/842360.html
61 | div do not match pattern in url;http://item.jd.com/988689.html
62 | div do not match pattern in url;http://item.jd.com/881212.html
63 | div do not match pattern in url;http://item.jd.com/881190.html
64 |
--------------------------------------------------------------------------------
/baidu/out/50:
--------------------------------------------------------------------------------
1 | {"url": "http://baike.baidu.com/view/50.htm", "header": "\u955c\u50cf\u7ad9\u70b9", "content": "\u955c\u50cf\uff0c\u539f\u610f\u662f\u5149\u5b66\u91cc\u6307\u7684\u7269\u4f53\u5728\u955c\u9762\u4e2d\u6240\u6210\u4e4b\u50cf\u3002\u5f15\u7528\u5230\u8ba1\u7b97\u673a\u7f51\u7edc\u4e0a\uff0c\u4e00\u4e2a\u955c\u50cf\u7ad9\u70b9\uff08\u6216\u79f0\u955c\u50cf\uff09\u662f\u6307\u53e6\u4e00\u4e2a\u7ad9\u70b9\u5185\u5bb9\u7684\u62f7\u8d1d\u3002\u955c\u50cf\u901a\u5e38\u7528\u4e8e\u4e3a\u76f8\u540c\u4fe1\u606f\u5185\u5bb9\u63d0\u4f9b\u4e0d\u540c\u7684\u6e90\uff0c\u7279\u522b\u662f\u5728\u4e0b\u8f7d\u91cf\u5927\u7684\u65f6\u5019\u63d0\u4f9b\u4e86\u4e00\u79cd\u53ef\u9760\u7684\u7f51\u7edc\u8fde\u63a5\u3002\u5236\u4f5c\u955c\u50cf\u662f\u4e00\u79cd\u6587\u4ef6\u540c\u6b65\u7684\u8fc7\u7a0b\u3002\u521b\u5efa\u955c\u50cf\u7684\u76ee\u7684\u901a\u5e38\u6709\u4ee5\u4e0b\u51e0\u4e2a\uff1a\u4fdd\u5b58\u7f51\u9875\u4fe1\u606f\uff0c\u7279\u522b\u662f\u5728\u4e00\u4e2a\u7f51\u7ad9\u9762\u4e34\u5173\u7ad9\u7684\u65f6\u5019\u3002\u63d0\u9ad8\u7528\u6237\u5728\u67d0\u4e2a\u5730\u533a\u7684\u4e0b\u8f7d\u901f\u5ea6\u3002\u8b6c\u5982\u4e00\u4e2a\u7f8e\u56fd\u7f51\u7ad9\u7684\u4e2d\u56fd\u955c\u50cf\u53ef\u4ee5\u4f7f\u6765\u81ea\u4e2d\u56fd\u7684\u7528\u6237\u76f4\u63a5\u4ece\u8fd9\u4e2a\u4e2d\u56fd\u7684\u955c\u50cf\u8bbf\u95ee\uff0c\u4ece\u800c\u52a0\u5feb\u4e86\u901f\u5ea6\u3002\u8fd9\u53ef\u4ee5\u770b\u4f5c\u662f\u4e00\u79cd\u5168\u7403\u8303\u56f4\u7684\u7f13\u5b58\u3002\u5bf9\u4e0d\u53ef\u7528\u7684\u4fe1\u606f\u63d0\u4f9b\u8fde\u63a5\u9014\u5f84\u3002\u4f8b\u5982\uff0c2002\u5e74\u7684\u65f6\u5019\u4e2d\u56fd\u5bf9Google\u5c01\u9501\u7684\u65f6\u5019\uff0c\u955c\u50cf\u7ad9\u70b9google\u6210\u4e3a\u4e86\u6709\u6548\u7684\u7ed5\u8fc7\u5c01\u9501\u7684\u9014\u5f84\u3002\u4fdd\u5b58\u5386\u53f2\u6027\u7684\u6570\u636e\u3001\u4fe1\u606f\uff0c\u9632\u6b62\u4e22\u5931\u3002\u5e73\u8861\u7f51\u7ad9\u7684\u6d41\u91cf\u8d1f\u8f7d\u3002\u4f8b\u5982\uff0c\u4e00\u4e2aLinux\u7684\u53d1\u884c\u7248\
u7684ISO\u955c\u50cf\u6587\u4ef6\u7684\u5927\u91cf\u4e0b\u8f7d\u53ef\u80fd\u4f1a\u5bfc\u81f4\u4e3b\u7ad9\u8fc7\u8377\u800c\u4e0b\u7ebf\uff0c\u800c\u955c\u50cf\u7ad9\u70b9\u5219\u53ef\u4ee5\u5728\u591a\u4e2a\u670d\u52a1\u5668\u5206\u62c5\u6d41\u91cf\uff0c\u4ece\u800c\u4fdd\u8bc1\u7ad9\u70b9\u7684\u6301\u7eed\u5728\u7ebf\u3002\u5bf9\u56e0\u6d41\u91cf\u6fc0\u589e\u800c\u4e0b\u7ebf\u7684\u7ad9\u70b9\u63d0\u4f9b\u4e34\u65f6\u7684\u8bbf\u95ee\u3002\u4fbf\u4e8e\u4e0d\u540c\u5730\u533a\u7684\u4eba\u83b7\u5f97\u66f4\u5feb\u7684\u8bbf\u95ee\u901f\u5ea6\u3002\u907f\u5f00\u5bf9\u4e8e\u4e3b\u7f51\u7ad9\u7684\u5ba1\u67e5\u548c\u5c4f\u853d\u3002\u521b\u5efa\u955c\u50cf\u7ad9\u70b9\u901a\u5e38\u662f\u7531\u4e8e\u7f51\u7edc\u5e26\u5bbd\u9650\u5236\u3001\u5c01\u9501\u6216\u662f\u5176\u4ed6\u539f\u56e0\uff0c\u5bfc\u81f4\u65e0\u6cd5\u5b9e\u73b0\u5bf9\u4e3b\u7ad9\u70b9\u7684\u6b63\u5e38\u8bbf\u95ee\u3002\u8fd9\u65f6\u901a\u8fc7\u5c06\u4e3b\u7ad9\u70b9\u7684\u4fe1\u606f\u8d44\u6e90\u79fb\u690d\u8f6c\u79fb\u5230\u76f8\u5bf9\u5bb9\u6613\u8bbf\u95ee\u7684\u672c\u5730\u670d\u52a1\u5668\uff0c\u4ee5\u63d0\u9ad8\u7528\u6237\u7684\u8bbf\u95ee\u6548\u7387\u3002\u5e38\u89c1\u4e8e\u5185\u90e8\u7f51\u3001\u6821\u56ed\u7f51\u6216\u662f\u5176\u4ed6\u6709\u8f83\u5927\u5b58\u50a8\u8bbe\u5907\u7684\u670d\u52a1\u5668\u3002\u8f83\u5c0f\u7684\u7f51\u7ad9\u6709\u65f6\u4e5f\u4ee5\u955c\u50cf\u6765\u5e94\u5bf9\u5c01\u9501\u3001\u5c4f\u853d\uff08\u53c2\u89c1\u9632\u706b\u957f\u57ce\uff09\u3002\u901a\u8fc7\u955c\u50cf\u7ad9\u70b9\uff0c\u4e5f\u53ef\u4ee5\u5b9e\u73b0\u591a\u76ee\u6807\u591a\u7ebf\u7a0b\u7684\u5feb\u901f\u4e0b\u8f7d\u3002\u4f8b\u5982GetSmart2\u548cGetRight\u6b63\u662f\u4f7f\u7528\u4e86\u8fd9\u79cd\u9ad8\u7ea7\u7684\u591a\u7ebf\u7a0b\u4e0b\u8f7d\u6280\u672f\u3002\u00a0\u00a0\u955c\u50cf\u7ad9\u70b9\u955c\u50cf\u7ad9\u70b9\u662f\u901a\u8fc7\u4e3b\u670d\u52a1\u5668\u589e\u52a0\u8f6c\u79fb\u5b58\u50a8\u5730\u5740\u6765\u5b9e\u73b0\u4fe1\u606f\u7684\u5f02\u5730\u5907\u4efd\u3002\u901a\u5e38\u4e00\u4e2a\
u955c\u50cf\u4f1a\u5b9a\u671f\u8bbf\u95ee\u4e3b\u7f51\u7ad9\uff0c\u4ee5\u66f4\u65b0\u5176\u5185\u5bb9\u3002\u955c\u50cf\u4e5f\u6709\u5206\u4e00\u7ea7\u3001\u4e8c\u7ea7\u7b49\u7b49\u3002\u4e8c\u7ea7\u955c\u50cf\u662f\u6307\u90a3\u4e9b\u901a\u8fc7\u8bbf\u95ee\u4e00\u7ea7\u955c\u50cf\u7f51\u7ad9\u6765\u66f4\u65b0\u5185\u5bb9\u7684\u7f51\u7ad9\uff0c\u901a\u5e38\u66f4\u65b0\u901f\u5ea6\u4e0d\u5982\u4e00\u7ea7\u955c\u50cf\uff0c\u4f46\u4e0d\u4f1a\u7ed9\u4e3b\u7f51\u7ad9\u589e\u52a0\u8d1f\u62c5\u3002\u00a0\u00a0\u955c\u50cf\u7ad9\u70b9\u66f4\u65b0\u955c\u50cf\u6700\u5e38\u7528\u7684\u8f6f\u4ef6\u662frsync\uff0c\u6709\u65f6\u5019\u4e5f\u76f4\u63a5\u7528http\u955c\u50cf\u5de5\u5177\u3002"}
--------------------------------------------------------------------------------
/douban/movie_actors.py:
--------------------------------------------------------------------------------
1 | __author__ = 'huafeng'
2 |
3 | import os
4 | import re
5 | import time
6 | import codecs
7 | import logging
8 | import urllib2
9 | import random
10 | import gevent
11 | import gevent.monkey
12 | from math import ceil
13 | from bs4 import BeautifulSoup
14 | gevent.monkey.patch_all()
15 |
16 | PATH = os.path.dirname(os.path.abspath(__file__))
17 | SLEEP_INTERVAL = random.randint(2,5)
18 |
class MovieActor:
    """Crawl Douban celebrity pages with gevent and dump actor biographies.

    URLs are fanned out in batches of ``thread_count`` greenlets; collected
    biography texts and timed-out URLs are buffered on the instance and
    flushed to disk after each batch.
    """

    def __init__(self):
        self.proxy_list = []               # "ip:port" strings from sys/xici_proxy
        self._gen_proxy()
        self.actor_content_text_list = []  # biography texts pending write
        self.timeout_url_list = []         # URLs that failed both fetch attempts

    def _gen_proxy(self):
        # Load one proxy per line from the xici proxy file.
        filename = os.path.join(PATH, 'sys', 'xici_proxy')
        with codecs.open(filename, encoding='utf-8') as f:
            self.proxy_list.extend([item.strip() for item in f.readlines()])

    def parse_actor_content_url(self, url, ip_port):
        """Fetch one celebrity page and buffer its biography text.

        ``ip_port`` is currently unused: the proxy-installation code was
        commented out, so requests go direct.
        """
        try:
            html = urllib2.urlopen(url, timeout=15).read()
        except:
            # One retry, then record the URL for a later pass.
            try:
                html = urllib2.urlopen(url, timeout=15).read()
            except:
                self.timeout_url_list.append(url)
                return
        soup = BeautifulSoup(html)
        div_level_str = soup.find('div', id='content')
        if not div_level_str:
            return
        # Prefer the full (hidden) biography span when the page has one.
        actor_summary = div_level_str.find('div', class_='bd') if not div_level_str.find('span', class_='all hidden') else div_level_str.find('span', class_='all hidden')
        if not actor_summary:
            # BUGFIX: previously fell through and raised AttributeError on
            # actor_summary.text when neither node matched.
            return
        actor_content_text = actor_summary.text.strip()
        if actor_content_text:
            self.actor_content_text_list.append(actor_content_text)

    def write_content_into_file(self):
        """Flush buffered biographies to out/actor_con_out/<timestamp>.txt."""
        timestamp = time.strftime('%Y_%m_%d_%H%M%S.txt')
        filename = os.path.join(PATH, 'out', 'actor_con_out', timestamp)
        actor_content_text_list = ["".join((item, '\n')) for item in self.actor_content_text_list]
        with codecs.open(filename, mode='a', encoding='utf-8') as wf:
            wf.writelines(actor_content_text_list)

    def write_timeout_url(self):
        """Append buffered timed-out URLs to log/actor_info_timeout_url."""
        filename = os.path.join(PATH, 'log', 'actor_info_timeout_url')
        timeout_url_list = [item+'\n' for item in self.timeout_url_list]
        with codecs.open(filename, mode='a', encoding='utf-8') as f:
            f.writelines(timeout_url_list)

    def main(self):
        """Crawl celebrity ids 1000000..1339959 in batches of 60 greenlets."""
        proxy_count = len(self.proxy_list)
        thread_count = 60
        range_from = 1000000
        range_to = 1339959
        url_range_start = range_from/thread_count  # Py2 integer division: batch indices
        url_range_stop = range_to/thread_count

        threads_per_proxy = int(ceil(thread_count/float(proxy_count)))
        url_pattern = 'http://movie.douban.com/celebrity/%s/'
        for post_time_count in range(url_range_start, url_range_stop):
            # BUGFIX: the greenlet list is now re-created per batch; it used
            # to accumulate across every batch, growing without bound and
            # re-joining all previously finished greenlets each iteration.
            threads = []
            for url_point in range(post_time_count*thread_count, (post_time_count+1)*thread_count):
                url = url_pattern%str(url_point)
                # Distribute the batch's URLs evenly over the proxies.
                proxy_point = (url_point - post_time_count*thread_count)/threads_per_proxy
                ip_port = self.proxy_list[proxy_point]
                threads.append(gevent.spawn(self.parse_actor_content_url, url, ip_port))
            gevent.joinall(threads)
            self.write_content_into_file()
            self.actor_content_text_list[:] = []
            if self.timeout_url_list:
                self.write_timeout_url()
                self.timeout_url_list[:] = []
            time.sleep(SLEEP_INTERVAL)
        # Remainder that does not fill a whole batch, walked from the top id down.
        left_threads = []
        url_left_count = range_to % thread_count
        for url_point in range(url_left_count):
            url = url_pattern%str(range_to-url_point)
            proxy_point = url_point/threads_per_proxy
            ip_port = self.proxy_list[proxy_point]
            left_threads.append(gevent.spawn(self.parse_actor_content_url, url, ip_port))
        gevent.joinall(left_threads)
        self.write_content_into_file()
        self.actor_content_text_list[:] = []
if __name__ == '__main__':
    # Entry point: crawl the full celebrity id range and write results to disk.
    MovieActor().main()
--------------------------------------------------------------------------------
/dangdang/mobile_digital/mobile/sys/mobile_item_id:
--------------------------------------------------------------------------------
1 | http://product.dangdang.com/1049524121.html
2 | http://product.dangdang.com/1091210612.html
3 | http://product.dangdang.com/1214373005.html
4 | http://product.dangdang.com/1055559322.html
5 | http://product.dangdang.com/1055308007.html
6 | http://product.dangdang.com/1004961108.html
7 | http://product.dangdang.com/1270509408.html
8 | http://product.dangdang.com/1070416207.html
9 | http://product.dangdang.com/1011294508.html
10 | http://product.dangdang.com/1099246612.html
11 | http://product.dangdang.com/1222979005.html
12 | http://product.dangdang.com/1005093405.html
13 | http://product.dangdang.com/1006944708.html
14 | http://product.dangdang.com/400724832.html
15 | http://product.dangdang.com/1118130212.html
16 | http://product.dangdang.com/60321917.html
17 | http://product.dangdang.com/1379982106.html
18 | http://product.dangdang.com/400939011.html
19 | http://product.dangdang.com/400630754.html
20 | http://product.dangdang.com/1280568908.html
21 | http://product.dangdang.com/1146304608.html
22 | http://product.dangdang.com/1095921111.html
23 | http://product.dangdang.com/1055569222.html
24 | http://product.dangdang.com/1395168506.html
25 | http://product.dangdang.com/1009467807.html
26 | http://product.dangdang.com/1022694312.html
27 | http://product.dangdang.com/1115351422.html
28 | http://product.dangdang.com/1436105208.html
29 | http://product.dangdang.com/1023120306.html
30 | http://product.dangdang.com/1270633008.html
31 | http://product.dangdang.com/1121937507.html
32 | http://product.dangdang.com/60320918.html
33 | http://product.dangdang.com/1299690605.html
34 | http://product.dangdang.com/1034395805.html
35 | http://product.dangdang.com/1224226022.html
36 | http://product.dangdang.com/60556058.html
37 | http://product.dangdang.com/1057377505.html
38 | http://product.dangdang.com/400635387.html
39 | http://product.dangdang.com/1036615308.html
40 | http://product.dangdang.com/60557518.html
41 | http://product.dangdang.com/1005324107.html
42 | http://product.dangdang.com/1019822907.html
43 | http://product.dangdang.com/1264778205.html
44 | http://product.dangdang.com/1135040012.html
45 | http://product.dangdang.com/60548424.html
46 | http://product.dangdang.com/400639260.html
47 | http://product.dangdang.com/1226466722.html
48 | http://product.dangdang.com/1047734008.html
49 | http://product.dangdang.com/1143364612.html
50 | http://product.dangdang.com/1023101006.html
51 | http://product.dangdang.com/1141474912.html
52 | http://product.dangdang.com/1102204721.html
53 | http://product.dangdang.com/1124284321.html
54 | http://product.dangdang.com/1444889401.html
55 | http://product.dangdang.com/1224105622.html
56 | http://product.dangdang.com/1216049505.html
57 | http://product.dangdang.com/1224198302.html
58 | http://product.dangdang.com/1229077905.html
59 | http://product.dangdang.com/1003481108.html
60 | http://product.dangdang.com/1227067722.html
61 | http://product.dangdang.com/1227051822.html
62 | http://product.dangdang.com/60327476.html
63 | http://product.dangdang.com/1009859702.html
64 | http://product.dangdang.com/1014556112.html
65 | http://product.dangdang.com/1270563208.html
66 | http://product.dangdang.com/1010554222.html
67 | http://product.dangdang.com/1073193122.html
68 | http://product.dangdang.com/1474689501.html
69 | http://product.dangdang.com/1227084022.html
70 | http://product.dangdang.com/1020516307.html
71 | http://product.dangdang.com/60556072.html
72 | http://product.dangdang.com/1008639908.html
73 | http://product.dangdang.com/1244660206.html
74 | http://product.dangdang.com/1039487908.html
75 | http://product.dangdang.com/1084207222.html
76 | http://product.dangdang.com/1167984407.html
77 | http://product.dangdang.com/1134409012.html
78 | http://product.dangdang.com/1044489722.html
79 | http://product.dangdang.com/1211221011.html
80 | http://product.dangdang.com/1065724312.html
81 | http://product.dangdang.com/1140880905.html
82 | http://product.dangdang.com/1024874906.html
83 | http://product.dangdang.com/1024955006.html
84 | http://product.dangdang.com/1089685721.html
85 | http://product.dangdang.com/1025820307.html
86 | http://product.dangdang.com/1009458508.html
87 | http://product.dangdang.com/1009074807.html
88 | http://product.dangdang.com/1101452422.html
89 | http://product.dangdang.com/1384422106.html
90 | http://product.dangdang.com/1187447108.html
91 | http://product.dangdang.com/1059197007.html
92 | http://product.dangdang.com/60320920.html
93 | http://product.dangdang.com/1238420705.html
94 | http://product.dangdang.com/1185192311.html
95 | http://product.dangdang.com/1212772205.html
96 | http://product.dangdang.com/1208068222.html
97 |
--------------------------------------------------------------------------------
/dangdang/appliance/sys/appliance_item_id:
--------------------------------------------------------------------------------
1 | http://product.dangdang.com/1280649708.html
2 | http://product.dangdang.com/1105676205.html
3 | http://product.dangdang.com/1203684808.html
4 | http://product.dangdang.com/1446526808.html
5 | http://product.dangdang.com/1254945508.html
6 | http://product.dangdang.com/1226355406.html
7 | http://product.dangdang.com/1203655708.html
8 | http://product.dangdang.com/1211710508.html
9 | http://product.dangdang.com/1214771306.html
10 | http://product.dangdang.com/1203673808.html
11 | http://product.dangdang.com/1274290606.html
12 | http://product.dangdang.com/1261458801.html
13 | http://product.dangdang.com/1214794906.html
14 | http://product.dangdang.com/1215095806.html
15 | http://product.dangdang.com/1317165807.html
16 | http://product.dangdang.com/1211700908.html
17 | http://product.dangdang.com/1286296907.html
18 | http://product.dangdang.com/1203325207.html
19 | http://product.dangdang.com/1058890906.html
20 | http://product.dangdang.com/1292496705.html
21 | http://product.dangdang.com/1197298806.html
22 | http://product.dangdang.com/1198256106.html
23 | http://product.dangdang.com/1203295907.html
24 | http://product.dangdang.com/1301703507.html
25 | http://product.dangdang.com/1120774705.html
26 | http://product.dangdang.com/1221631706.html
27 | http://product.dangdang.com/400966486.html
28 | http://product.dangdang.com/1126679905.html
29 | http://product.dangdang.com/1211674908.html
30 | http://product.dangdang.com/1211712308.html
31 | http://product.dangdang.com/1215095606.html
32 | http://product.dangdang.com/1027167006.html
33 | http://product.dangdang.com/1283333507.html
34 | http://product.dangdang.com/1032008522.html
35 | http://product.dangdang.com/1283283707.html
36 | http://product.dangdang.com/400967439.html
37 | http://product.dangdang.com/1216709307.html
38 | http://product.dangdang.com/1058922906.html
39 | http://product.dangdang.com/1274303306.html
40 | http://product.dangdang.com/1414981308.html
41 | http://product.dangdang.com/400966577.html
42 | http://product.dangdang.com/1058912006.html
43 | http://product.dangdang.com/1283283607.html
44 | http://product.dangdang.com/400921188.html
45 | http://product.dangdang.com/1027136606.html
46 | http://product.dangdang.com/1283309707.html
47 | http://product.dangdang.com/1064502405.html
48 | http://product.dangdang.com/1428125408.html
49 | http://product.dangdang.com/1203734508.html
50 | http://product.dangdang.com/1175929607.html
51 | http://product.dangdang.com/1282144307.html
52 | http://product.dangdang.com/1286273207.html
53 | http://product.dangdang.com/1287529107.html
54 | http://product.dangdang.com/1203702708.html
55 | http://product.dangdang.com/1203669408.html
56 | http://product.dangdang.com/1233154808.html
57 | http://product.dangdang.com/1203692108.html
58 | http://product.dangdang.com/1211666708.html
59 | http://product.dangdang.com/1211662008.html
60 | http://product.dangdang.com/1105958005.html
61 | http://product.dangdang.com/1292501105.html
62 | http://product.dangdang.com/1332343708.html
63 | http://product.dangdang.com/1215814106.html
64 | http://product.dangdang.com/1272541506.html
65 | http://product.dangdang.com/1115126205.html
66 | http://product.dangdang.com/1475195801.html
67 | http://product.dangdang.com/1261422601.html
68 | http://product.dangdang.com/1272537406.html
69 | http://product.dangdang.com/1163241806.html
70 | http://product.dangdang.com/1215813606.html
71 | http://product.dangdang.com/1042311408.html
72 | http://product.dangdang.com/1047781108.html
73 | http://product.dangdang.com/1283284307.html
74 | http://product.dangdang.com/1172570305.html
75 | http://product.dangdang.com/400963930.html
76 | http://product.dangdang.com/400966534.html
77 | http://product.dangdang.com/1105739305.html
78 | http://product.dangdang.com/1274315306.html
79 | http://product.dangdang.com/1180661806.html
80 | http://product.dangdang.com/1226359806.html
81 | http://product.dangdang.com/1064498305.html
82 | http://product.dangdang.com/1226332006.html
83 | http://product.dangdang.com/1254952608.html
84 | http://product.dangdang.com/1047362508.html
85 | http://product.dangdang.com/1198418106.html
86 | http://product.dangdang.com/1320565507.html
87 | http://product.dangdang.com/1230938505.html
88 | http://product.dangdang.com/1172568905.html
89 | http://product.dangdang.com/1317176607.html
90 | http://product.dangdang.com/1283288207.html
91 | http://product.dangdang.com/400921475.html
92 | http://product.dangdang.com/1047787708.html
93 | http://product.dangdang.com/400921293.html
94 | http://product.dangdang.com/1274483206.html
95 | http://product.dangdang.com/400921326.html
96 | http://product.dangdang.com/1203732107.html
97 |
--------------------------------------------------------------------------------
/dangdang/decoration_health/health/sys/health_item_id:
--------------------------------------------------------------------------------
1 | http://product.dangdang.com/1300766708.html
2 | http://product.dangdang.com/1262439205.html
3 | http://product.dangdang.com/1300811708.html
4 | http://product.dangdang.com/1217809806.html
5 | http://product.dangdang.com/1164369811.html
6 | http://product.dangdang.com/1300774508.html
7 | http://product.dangdang.com/1300729908.html
8 | http://product.dangdang.com/1043063508.html
9 | http://product.dangdang.com/1134177621.html
10 | http://product.dangdang.com/1471728301.html
11 | http://product.dangdang.com/1147670301.html
12 | http://product.dangdang.com/1043091008.html
13 | http://product.dangdang.com/1035360906.html
14 | http://product.dangdang.com/1131230621.html
15 | http://product.dangdang.com/1088187406.html
16 | http://product.dangdang.com/1332967606.html
17 | http://product.dangdang.com/1043154012.html
18 | http://product.dangdang.com/1165649811.html
19 | http://product.dangdang.com/1164370111.html
20 | http://product.dangdang.com/1300784108.html
21 | http://product.dangdang.com/1066033806.html
22 | http://product.dangdang.com/1389907102.html
23 | http://product.dangdang.com/1241614202.html
24 | http://product.dangdang.com/1132865612.html
25 | http://product.dangdang.com/1049550412.html
26 | http://product.dangdang.com/1066034506.html
27 | http://product.dangdang.com/1070724512.html
28 | http://product.dangdang.com/1262439005.html
29 | http://product.dangdang.com/1122839921.html
30 | http://product.dangdang.com/1053568112.html
31 | http://product.dangdang.com/1283924001.html
32 | http://product.dangdang.com/1090847112.html
33 | http://product.dangdang.com/1300798308.html
34 | http://product.dangdang.com/1066034406.html
35 | http://product.dangdang.com/1148181605.html
36 | http://product.dangdang.com/1132865412.html
37 | http://product.dangdang.com/1164371811.html
38 | http://product.dangdang.com/1046218812.html
39 | http://product.dangdang.com/1053104112.html
40 | http://product.dangdang.com/1016472512.html
41 | http://product.dangdang.com/1300816108.html
42 | http://product.dangdang.com/1300797008.html
43 | http://product.dangdang.com/1262439105.html
44 | http://product.dangdang.com/1245770008.html
45 | http://product.dangdang.com/1168229201.html
46 | http://product.dangdang.com/1061937522.html
47 | http://product.dangdang.com/1283974301.html
48 | http://product.dangdang.com/1142358807.html
49 | http://product.dangdang.com/1153334406.html
50 | http://product.dangdang.com/1051734212.html
51 | http://product.dangdang.com/1010583612.html
52 | http://product.dangdang.com/1164418711.html
53 | http://product.dangdang.com/1240227008.html
54 | http://product.dangdang.com/1240131308.html
55 | http://product.dangdang.com/1296490308.html
56 | http://product.dangdang.com/1296432008.html
57 | http://product.dangdang.com/1011428512.html
58 | http://product.dangdang.com/1066407808.html
59 | http://product.dangdang.com/1334221301.html
60 | http://product.dangdang.com/1363996301.html
61 | http://product.dangdang.com/1130211521.html
62 | http://product.dangdang.com/1025738621.html
63 | http://product.dangdang.com/1282620501.html
64 | http://product.dangdang.com/1091273121.html
65 | http://product.dangdang.com/1039520021.html
66 | http://product.dangdang.com/1240176408.html
67 | http://product.dangdang.com/1299184101.html
68 | http://product.dangdang.com/1299180601.html
69 | http://product.dangdang.com/1299163201.html
70 | http://product.dangdang.com/1283958201.html
71 | http://product.dangdang.com/1283940001.html
72 | http://product.dangdang.com/1283936201.html
73 | http://product.dangdang.com/1122840421.html
74 | http://product.dangdang.com/1100714521.html
75 | http://product.dangdang.com/1100714421.html
76 | http://product.dangdang.com/1094173521.html
77 | http://product.dangdang.com/1090059321.html
78 | http://product.dangdang.com/1039473621.html
79 | http://product.dangdang.com/1039459821.html
80 | http://product.dangdang.com/1240301308.html
81 | http://product.dangdang.com/1240279408.html
82 | http://product.dangdang.com/1240239108.html
83 | http://product.dangdang.com/1240210408.html
84 | http://product.dangdang.com/1039473721.html
85 | http://product.dangdang.com/1389951602.html
86 | http://product.dangdang.com/1090599812.html
87 | http://product.dangdang.com/1241580302.html
88 | http://product.dangdang.com/1240087308.html
89 | http://product.dangdang.com/1016472612.html
90 | http://product.dangdang.com/1296476408.html
91 | http://product.dangdang.com/1051661812.html
92 | http://product.dangdang.com/1381633102.html
93 | http://product.dangdang.com/1059486712.html
94 | http://product.dangdang.com/1012202912.html
95 | http://product.dangdang.com/1053065408.html
96 | http://product.dangdang.com/1053056008.html
97 |
--------------------------------------------------------------------------------
/jd/book_jd/book_jd.py:
--------------------------------------------------------------------------------
1 | __author__ = 'huafeng'
2 | #coding:utf-8
3 | import os
4 | import re
5 | import time
6 | import codecs
7 | import urllib2
8 | import xici_proxy
9 | from bs4 import BeautifulSoup
10 |
11 | PATH = os.path.dirname(os.path.abspath(__file__))
12 |
def read_proxy_file():
    """Return the list of "ip:port" proxy strings stored in sys/xici_proxy."""
    proxy_path = os.path.join(PATH, 'sys', 'xici_proxy')
    with codecs.open(proxy_path, encoding='utf-8')as f:
        return [line.strip() for line in f]
def gen_whole_item_id():
    """Re-crawl JD book list pages that previously failed and collect item SKUs.

    Reads page URLs prefixed with ``no_item_div:`` / ``no_plist_div:`` from
    log/0518_timeout_page_url, fetches each through a rotating xici proxy,
    and appends the SKU ids found under div#plist to sys/book_item_ids.
    Failures go to a fresh timestamped timeout log; successes to a
    crawled-URL log.
    """
    timeout_timestamp = time.strftime('%m%d_timeout_page_url')
    crawled_timestamp = time.strftime('%m%d_crawled_page_url')
    proxy_list = read_proxy_file()
    page_url_proxy_count = 0  # requests served by the current proxy
    if not proxy_list:
        # No cached proxies: scrape a fresh list from the xici site.
        xici_proxy.gen_proxy()
        proxy_list = read_proxy_file()
    ip_port = proxy_list.pop()
    handle_no_div_pattern = re.compile('no_(item|plist)_div:')
    whole_page_url_filename = os.path.join(PATH, 'log', '0518_timeout_page_url')
    timeout_page_url_filename = os.path.join(PATH, 'log', timeout_timestamp)
    item_id_filename = os.path.join(PATH, 'sys', 'book_item_ids')
    page_url_crawled_filename = os.path.join(PATH, 'log', crawled_timestamp)
    with codecs.open(whole_page_url_filename, encoding='utf-8')as whole_page_url_f,\
        codecs.open(item_id_filename, mode='a', encoding='utf-8')as item_id_wf,\
        codecs.open(timeout_page_url_filename, mode='wb', encoding='utf-8') as timeout_url_wf,\
        codecs.open(page_url_crawled_filename, mode='wb', encoding='utf-8')as crawled_url_wf:
        # Only lines flagged 'no_...' are retried; the flag prefix is stripped.
        for page_url in [handle_no_div_pattern.sub('', item.strip()) for item in whole_page_url_f.readlines() if item.startswith('no_')]:
            page_url_proxy_count += 1
            try:
                # Rotate to a fresh proxy every 2000 requests.
                if page_url_proxy_count > 2000:
                    if not proxy_list:
                        re_read_proxy_list = read_proxy_file()
                        proxy_list = xici_proxy.get_valid_proxy(re_read_proxy_list)
                        if not proxy_list:
                            xici_proxy.gen_proxy()
                            timeout_url_wf.write('get new proxy in xici network!\n')
                            proxy_list = read_proxy_file()
                    ip_port = proxy_list.pop()
                    page_url_proxy_count = 0
                http_hanlder = urllib2.ProxyHandler({'http':'http://%s'%ip_port})
                opener = urllib2.build_opener(http_hanlder)
                html = opener.open(page_url, timeout=15)
            except urllib2.HTTPError, e:
                # 403 means the proxy was banned: swap it and retry once inline.
                if e.getcode() == 403:
                    timeout_url_wf.write('403 error:request forbiddon!!!\n')
                    if not proxy_list:
                        re_read_proxy_list = read_proxy_file()
                        proxy_list = xici_proxy.get_valid_proxy(re_read_proxy_list)
                        if not proxy_list:
                            xici_proxy.gen_proxy()
                            timeout_url_wf.write('get new proxy in xici network!\n')
                            proxy_list = read_proxy_file()
                    ip_port = proxy_list.pop()
                    http_hanlder = urllib2.ProxyHandler({'http':'http://%s'%ip_port})
                    opener = urllib2.build_opener(http_hanlder)
                    html = opener.open(page_url, timeout=15).read().decode('gbk')
                else:
                    continue
            except:
                # Any other failure (timeout, connection reset) is logged and skipped.
                timeout_info = ''.join(('request_timeout:', page_url, '\n'))
                timeout_url_wf.write(timeout_info)
                continue
            soup = BeautifulSoup(html)
            div_level_str = soup.find('div', id='plist')
            if not div_level_str:
                error_match_info = ''.join(('no_plist_div:', page_url, '\n'))
                timeout_url_wf.write(error_match_info)
                continue
            div_item_list = div_level_str.find_all('div', class_='item')
            if not div_item_list:
                error_match_info = ''.join(('no_item_div:', page_url, '\n'))
                timeout_url_wf.write(error_match_info)
                continue
            # Each item div carries its SKU id as an attribute.
            item_id_list = [item['sku']+'\n' for item in div_item_list]
            item_id_wf.writelines(item_id_list)
            crawled_url_wf.write(page_url+'\n')
            # time.sleep(3)
gen_whole_item_id()
89 |
--------------------------------------------------------------------------------
/douban/movie_actors_single_thread.py:
--------------------------------------------------------------------------------
1 | __author__ = 'huafeng'
2 |
3 | import os
4 | import re
5 | import time
6 | import codecs
7 | import logging
8 | import urllib2
9 | import random
10 | from math import ceil
11 | from bs4 import BeautifulSoup
12 |
13 | PATH = os.path.dirname(os.path.abspath(__file__))
14 | SLEEP_INTERVAL = random.randint(2, 5)
15 |
class MovieActor:
    """Single-threaded Douban celebrity crawler.

    Fetches biography pages sequentially with a sleep between requests and
    flushes the collected texts to a new timestamped file every 2000 URLs.
    """

    def __init__(self):
        self.proxy_list = []
        self._gen_proxy()
        # BUGFIX: _gen_log was defined but never called, so every use of
        # self.logger raised AttributeError.
        self._gen_log()
        self.actor_content_text_list = []  # biography texts pending write
        self.timeout_url_list = []         # URLs that failed both attempts

    def _gen_log(self):
        """Attach a DEBUG-level file handler writing to log/douban_actor_cralwer.log."""
        logfile = os.path.join(PATH, 'log', 'douban_actor_cralwer.log')
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.DEBUG)
        log_file = logging.FileHandler(logfile)
        log_file.setLevel(logging.DEBUG)
        formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        log_file.setFormatter(formatter)
        self.logger.addHandler(log_file)

    def _gen_proxy(self):
        # Load one "ip:port" proxy per line from the xici proxy file.
        filename = os.path.join(PATH, 'sys', 'xici_proxy')
        with codecs.open(filename, encoding='utf-8') as f:
            self.proxy_list.extend([item.strip() for item in f.readlines()])

    def parse_actor_content_url(self, url):
        """Fetch one celebrity page (one retry) and buffer its biography text."""
        try:
            html = urllib2.urlopen(url, timeout=15).read()
        except:
            try:
                html = urllib2.urlopen(url, timeout=15).read()
            except:
                self.timeout_url_list.append(url)
                self.logger.debug('request timeout in item_url:%s'%url)
                return
        soup = BeautifulSoup(html)
        div_level_str = soup.find('div', id='content')
        if not div_level_str:
            self.logger.error('div_level do not match regular expression in url:%s'%url)
            return
        # Prefer the full (hidden) biography span when the page has one.
        actor_summary = div_level_str.find('div', class_='bd') if not div_level_str.find('span', class_='all hidden') else div_level_str.find('span', class_='all hidden')
        if not actor_summary:
            self.logger.debug('actor_summary do not match re in item_url:%s'%url)
            # BUGFIX: previously fell through and crashed on actor_summary.text.
            return
        actor_content_text = actor_summary.text.strip()
        if actor_content_text:
            self.actor_content_text_list.append(actor_content_text)

    def write_content_into_file(self):
        """Flush buffered biographies to out/actor_con_out/<timestamp>.txt."""
        timestamp = time.strftime('%Y_%m_%d_%H%M%S.txt')
        filename = os.path.join(PATH, 'out', 'actor_con_out', timestamp)
        actor_content_text_list = ["".join((item, '\n')) for item in self.actor_content_text_list]
        with codecs.open(filename, mode='a', encoding='utf-8') as wf:
            wf.writelines(actor_content_text_list)

    def write_timeout_url(self):
        """Append buffered timed-out URLs to log/actor_info_timeout_url."""
        filename = os.path.join(PATH, 'log', 'actor_info_timeout_url')
        timeout_url_list = [item+'\n' for item in self.timeout_url_list]
        with codecs.open(filename, mode='a', encoding='utf-8') as f:
            f.writelines(timeout_url_list)

    def main(self):
        """Crawl celebrity ids 1000000..1339959, flushing every 2000 URLs."""
        range_from = 1000000
        range_to = 1339959
        url_pattern = 'http://movie.douban.com/celebrity/%d/'
        url_list = [url_pattern%item for item in range(range_from, range_to+1)]
        url_count = range_to - range_from
        con_count_write_in_file = 2000
        file_count = url_count/2000  # Py2 integer division: number of full chunks
        for file_point in range(file_count):
            for url_point in range(file_point*con_count_write_in_file, (file_point+1)*con_count_write_in_file):
                url = url_list[url_point]
                self.parse_actor_content_url(url)
                time.sleep(SLEEP_INTERVAL)
            self.write_content_into_file()
            self.actor_content_text_list[:] = []
            if self.timeout_url_list:
                self.write_timeout_url()
                # BUGFIX: clear the buffer so later chunks do not re-log every
                # earlier timeout (the log file is opened in append mode);
                # matches the behavior of the gevent variant of this crawler.
                self.timeout_url_list[:] = []

        # Remainder that does not fill a whole chunk, walked from the end.
        url_left = url_count - file_count*con_count_write_in_file
        for i in range(1,url_left+1):
            url = url_list[-i]
            self.parse_actor_content_url(url)
            time.sleep(SLEEP_INTERVAL)
        self.write_content_into_file()
        self.actor_content_text_list[:] = []
        if self.timeout_url_list:
            self.write_timeout_url()
            self.timeout_url_list[:] = []
101 |
if __name__ == '__main__':
    # Entry point: run the single-threaded crawl over the full id range.
    MovieActor().main()
--------------------------------------------------------------------------------
/jd/electronic_jd/electronic_name.py:
--------------------------------------------------------------------------------
1 | __author__ = 'huafeng'
2 | #coding:utf-8
3 | import os
4 | import re
5 | import time
6 | import random
7 | import codecs
8 | import urllib2
9 | from bs4 import BeautifulSoup
10 |
11 | PATH = os.path.dirname(os.path.abspath(__file__))
12 |
13 | def read_item_id_to_get_title():
14 | url_pattern = 'http://item.jd.com/%s.html'
15 | elec_filename = os.path.join(PATH, 'out', 'elec_name')
16 | failed_url = os.path.join(PATH, 'log', 'elec_failed_url')
17 | count = 0
18 | with codecs.open('./sys/electronic_item_id', encoding='utf-8') as item_id_f,\
19 | codecs.open(failed_url, mode='wb', encoding='utf-8') as failed_url_wf,\
20 | codecs.open(elec_filename, mode='a', encoding='utf-8') as con_to_write_wf:
21 | for item_id in [item.strip() for item in item_id_f.readlines()]:
22 | count += 1
23 | item_url = url_pattern%item_id
24 | try:
25 | html = urllib2.urlopen(item_url).read()
26 | except:
27 | try:
28 | html = urllib2.urlopen(item_url).read()
29 | except:
30 | try:
31 | html = urllib2.urlopen(item_url).read()
32 | except:
33 | print 'timed out in url;%s'%item_url
34 | failed_url_wf.write('timed out in url;%s\n'%item_url)
35 | continue
36 | soup = BeautifulSoup(html)
37 | try:
38 | div_level_str = soup.find('div', id='name')
39 | elec_title = div_level_str.text.strip()
40 | except:
41 | print 'div do not match pattern in url;%s'%item_url
42 | failed_url_wf.write('div do not match pattern in url;%s\n'%item_url)
43 | continue
44 | con_to_write_wf.write(elec_title+'\n')
45 | print count
46 |
47 | def read_failed_item_url():
48 | failed_url_filename = os.path.join(PATH, 'log', 'elec_failed_url')
49 | content_to_write_filename = os.path.join(PATH, 'out', 'elec_name')
50 | with codecs.open(failed_url_filename, encoding='utf-8') as item_id_f,\
51 | codecs.open(content_to_write_filename, mode='a', encoding='utf-8') as content_to_write_af:
52 | count = 0
53 | for url in [item.split(';')[1].strip() for item in item_id_f.readlines()]:
54 | count += 1
55 | try:
56 | html = urllib2.urlopen(url).read()
57 | except:
58 | try:
59 | html = urllib2.urlopen(url).read()
60 | except:
61 | print 'timed out in url:%s'%url
62 | continue
63 |
64 | soup = BeautifulSoup(html)
65 | try:
66 | div_level_str = soup.find('div', id='name')
67 | elec_title = div_level_str.text.strip()
68 | except:
69 | print 'div do not match pattern in url;%s'%url
70 | continue
71 | content_to_write_af.write(elec_title+'\n')
72 | print count
73 | # read_failed_item_url()
74 |
75 | def extract_goods_name():
76 | electronic_name_str_filename = os.path.join(PATH, 'out', 'elec_name')
77 | content_for_write_filename =os.path.join(PATH, 'out', 'electronic_name.txt')
78 | pattern = re.compile(ur"([\u4E00-\u9FA5]+)", re.U)
79 | count = 0
80 | with codecs.open(electronic_name_str_filename, encoding='utf-8') as f,\
81 | codecs.open(content_for_write_filename, mode='a', encoding='utf-8') as electronic_name_af:
82 | for line in f.readlines():
83 | count += 1
84 | temp_list_for_write = []
85 | splited_lien = pattern.split(line)
86 | for param in splited_lien:
87 | if len(param) <= 1:
88 | continue
89 | if pattern.match(param):
90 | temp_list_for_write.append(param+'\n')
91 | print count
92 | electronic_name_af.writelines(temp_list_for_write)
93 |
94 | def chose_len_between_1_8_param():
95 | content_for_write_filename =os.path.join(PATH, 'out', 'electronic_name.txt')
96 | with codecs.open(content_for_write_filename, encoding='utf-8') as f:
97 | electronic_name_list = f.readlines()
98 | print len(electronic_name_list)
99 | remove_length_one_param_list = [item for item in electronic_name_list if 1 1000:
102 | del self.crawled_url_list[:200]
103 | self.re_write_crawled_url_file()
104 |
105 | if __name__ == "__main__":
106 | musicer = BaiduMusic()
107 | musicer.main()
108 |
109 |
110 |
111 |
--------------------------------------------------------------------------------
/douban/sys/douban_crawled_urls:
--------------------------------------------------------------------------------
1 | http://movie.douban.com/subject/10543682/
2 | http://movie.douban.com/subject/24736526/
3 | http://movie.douban.com/subject/7003297/
4 | http://movie.douban.com/subject/10487568/
5 | http://movie.douban.com/subject/21776863/
6 | http://movie.douban.com/subject/19997896/
7 | http://movie.douban.com/subject/25853104/
8 | http://movie.douban.com/subject/21352814/
9 | http://movie.douban.com/subject/19962587/
10 | http://movie.douban.com/subject/20284939/
11 | http://movie.douban.com/subject/25828563/
12 | http://movie.douban.com/subject/25713540/
13 | http://movie.douban.com/subject/25841341/
14 | http://movie.douban.com/subject/6890751/
15 | http://movie.douban.com/subject/25851768/
16 | http://movie.douban.com/subject/10485647/
17 | http://movie.douban.com/subject/24695967/
18 | http://movie.douban.com/subject/24707368/
19 | http://movie.douban.com/subject/10726941/
20 | http://movie.douban.com/subject/6878457/
21 | http://movie.douban.com/subject/25844367/
22 | http://movie.douban.com/subject/25804446/
23 | http://movie.douban.com/subject/25820625/
24 | http://movie.douban.com/subject/25869685/
25 | http://movie.douban.com/subject/25844581/
26 | http://movie.douban.com/subject/25863020/
27 | http://movie.douban.com/subject/24873473/
28 | http://movie.douban.com/subject/4746257/
29 | http://movie.douban.com/subject/4922789/
30 | http://movie.douban.com/subject/5421797/
31 | http://movie.douban.com/subject/25798808/
32 | http://movie.douban.com/subject/25758654/
33 | http://movie.douban.com/subject/2063914/
34 | http://movie.douban.com/subject/25862407/
35 | http://movie.douban.com/subject/20451334/
36 | http://movie.douban.com/subject/6873819/
37 | http://movie.douban.com/subject/7054604/
38 | http://movie.douban.com/subject/24879858/
39 | http://movie.douban.com/subject/6529847/
40 | http://movie.douban.com/subject/25755645/
41 | http://movie.douban.com/subject/25792690/
42 | http://movie.douban.com/subject/24851526/
43 | http://movie.douban.com/subject/20513060/
44 | http://movie.douban.com/subject/25792684/
45 | http://movie.douban.com/subject/25845586/
46 | http://movie.douban.com/subject/24847340/
47 | http://movie.douban.com/subject/24859034/
48 | http://movie.douban.com/subject/20513061/
49 | http://movie.douban.com/subject/25777330/
50 | http://movie.douban.com/subject/11443316/
51 | http://movie.douban.com/subject/4881607/
52 | http://movie.douban.com/subject/10807916/
53 | http://movie.douban.com/subject/6721670/
54 | http://movie.douban.com/subject/11610281/
55 | http://movie.douban.com/subject/7564989/
56 | http://movie.douban.com/subject/10604893/
57 | http://movie.douban.com/subject/20270795/
58 | http://movie.douban.com/subject/24695277/
59 | http://movie.douban.com/subject/24743712/
60 | http://movie.douban.com/subject/25717233/
61 | http://movie.douban.com/subject/24404677/
62 | http://movie.douban.com/subject/23048775/
63 | http://movie.douban.com/subject/25778491/
64 | http://movie.douban.com/subject/10545939/
65 | http://movie.douban.com/subject/25300674/
66 | http://movie.douban.com/subject/25798222/
67 | http://movie.douban.com/subject/25778488/
68 | http://movie.douban.com/subject/25746414/
69 | http://movie.douban.com/subject/6126442/
70 | http://movie.douban.com/subject/3078390/
71 | http://movie.douban.com/subject/3993588/
72 | http://movie.douban.com/subject/24879839/
73 | http://movie.douban.com/subject/10807909/
74 | http://movie.douban.com/subject/24298770/
75 | http://movie.douban.com/subject/25713420/
76 | http://movie.douban.com/subject/25823833/
77 | http://movie.douban.com/subject/25827963/
78 | http://movie.douban.com/subject/6082518/?from=playing_poster
79 | http://movie.douban.com/subject/24743711/?from=playing_poster
80 | http://movie.douban.com/subject/24843198/?from=playing_poster
81 | http://movie.douban.com/subject/11443314/?from=playing_poster
82 | http://movie.douban.com/subject/6973460/?from=playing_poster
83 | http://movie.douban.com/subject/7057975/?from=playing_poster
84 | http://movie.douban.com/subject/10810745/?from=playing_poster
85 | http://movie.douban.com/subject/20515977/?from=playing_poster
86 | http://movie.douban.com/subject/24163542/?from=playing_poster
87 | http://movie.douban.com/subject/3927791/?from=playing_poster
88 | http://movie.douban.com/subject/25804697/?from=playing_poster
89 | http://movie.douban.com/subject/6390823/?from=playing_poster
90 | http://movie.douban.com/subject/3731581/?from=playing_poster
91 | http://movie.douban.com/subject/22300822/?from=playing_poster
92 | http://movie.douban.com/subject/4919019/?from=playing_poster
93 | http://movie.douban.com/subject/21941804/?from=playing_poster
94 | http://movie.douban.com/subject/1437313/?from=playing_poster
95 | http://movie.douban.com/subject/2129132/?from=playing_poster
96 | http://movie.douban.com/subject/3273700/?from=playing_poster
97 | http://movie.douban.com/subject/21327518/?from=playing_poster
98 | http://movie.douban.com/subject/1300689/?from=playing_poster
99 |
--------------------------------------------------------------------------------
/sina/sina_news.py:
--------------------------------------------------------------------------------
1 | __author__ = 'huafeng'
2 | #coding:utf-8
3 | import re
4 | import os
5 | import time
6 | import codecs
7 | import requests
8 | import datetime
9 | from bs4 import BeautifulSoup
10 |
11 | PATH = os.path.dirname(os.path.abspath(__file__))
12 | TIMESTAMP = time.strftime('%Y%m%d')
13 | # yesterday = (datetime.datetime.now().date()+datetime.timedelta(days=-1)).strftime('%Y_%m_%d')
14 |
def read_item_url_file():
    """Download every article URL collected for today, archive the raw
    HTML under html/<date>/, and append title + body paragraphs to a
    timestamped output file.  Failures are logged to sina_news_log and
    skipped.
    """
    whole_item_url_filename = os.path.join(PATH, 'sys', 'whole_item_url_%s'%TIMESTAMP)
    timestamp_filename = time.strftime('%Y_%m_%d_%H%M00_sina_news')
    data_directory = os.path.join(PATH, 'html', time.strftime('%Y_%m_%d'))
    if not os.path.exists(data_directory):
        # os.makedirs is portable and safe with spaces in PATH, unlike
        # shelling out to `mkdir` through os.system.
        os.makedirs(data_directory)
    output_filename = os.path.join(PATH, 'out', timestamp_filename)
    failed_url_filename = os.path.join(PATH, 'log','sina_news_log')
    with codecs.open(whole_item_url_filename, encoding='utf-8') as f,\
        codecs.open(output_filename, mode='a', encoding='utf-8') as wf,\
        codecs.open(failed_url_filename, mode='a', encoding='utf-8')as log_f:
        url_list = [item.strip() for item in f.readlines()]
        for url in url_list:
            item_url_info_list = []
            try:
                # Re-encode with the charset requests decoded from the
                # headers — presumably to round-trip the original bytes
                # to disk; confirm against the site's actual encoding.
                html = requests.get(url).text.encode('ISO-8859-1')
                splited_url = url.split('/')
                html_filename = '+'.join(splited_url[-3:])
                # data_directory is already absolute; re-joining PATH was
                # redundant (os.path.join discards segments before an
                # absolute one).
                filename = os.path.join(data_directory, html_filename)
                with open(filename, mode='wb') as htmlwf:
                    htmlwf.write(html)
            except Exception:
                # Exception (not BaseException) so Ctrl-C still aborts.
                log_f.write('timed out in item_url;%s\n'%url)
                continue
            soup = BeautifulSoup(html, 'html5lib')
            try:
                title = soup.find('h1', id='artibodyTitle').text.strip()
                item_url_info_list.append(title+'\n')
                div_lelvel_str = soup.find('div', id='artibody')
                p_level_list = div_lelvel_str.find_all('p')
                content_list = [item.text.strip()+'\n' for item in p_level_list]
                item_url_info_list.extend(content_list)
            except Exception:
                log_f.write('div do not match pattern in item_url;%s\n'%url)
                continue
            if item_url_info_list:
                wf.writelines(item_url_info_list)
def write_item_url_into_file(item_url_list):
    """Append each URL in *item_url_list*, one per line, to today's
    whole_item_url file under sys/."""
    target = os.path.join(PATH, 'sys', 'whole_item_url_%s'%TIMESTAMP)
    with codecs.open(target, mode='a', encoding='utf-8') as out_f:
        for url in item_url_list:
            out_f.write(url+'\n')
def get_realtime_news():
    """Walk Sina's rolling-news listing pages and collect article URLs
    dated today (TIMESTAMP).

    Pages are scanned in order; when a page contains a URL from an
    earlier day, only that page's same-day URLs are kept, everything is
    flushed to disk, and the article crawler is kicked off.
    """
    url_pattern = 'http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php?col=89&spec=&type=&ch=01&k=&offset_page=0&offset_num=0&num=80&asc=&page=%s&r=0.30903104213777677'
    start_page_num = 1
    valid_timestamp_url_list = []
    failed_url_filename = os.path.join(PATH, 'log','sina_news_log')
    with codecs.open(failed_url_filename, mode='a', encoding='utf-8')as af:
        for page_num in range(start_page_num, start_page_num+100):
            url = url_pattern%page_num
            # Up to three download attempts via a loop, replacing the
            # original triply nested try/except.  `Exception` (not
            # BaseException) keeps Ctrl-C able to abort.
            html = None
            for _attempt in range(3):
                try:
                    res = requests.get(url)
                    html = res.text.encode('ISO-8859-1')
                    break
                except Exception:
                    continue
            if html is None:
                af.write('timed out in page_url;%s\n'%url)
                continue
            try:
                page_url_list = re.findall(r'url : "(http://.*\.shtml)"', html)
                # Date folder of the oldest article on the page,
                # e.g. '2014-01-01' -> '20140101'; raises IndexError on
                # an empty match list, which we log below.
                url_list_timestamp = page_url_list[-1].split('/')[-2].replace('-', '')
            except Exception:
                af.write('div not pattern in page_url;%s\n'%url)
                continue
            if url_list_timestamp != TIMESTAMP:
                # Page straddles midnight: keep only today's URLs, flush
                # the collected list, crawl the articles, and stop.
                for url in page_url_list:
                    url_list_timestamp = url.split('/')[-2].replace('-','')
                    if url_list_timestamp == TIMESTAMP:
                        valid_timestamp_url_list.append(url)
                write_item_url_into_file(valid_timestamp_url_list)
                read_item_url_file()
                break
            else:
                write_item_url_into_file(page_url_list)
96 |
if __name__ == '__main__':
    # Script entry point: start the rolling-news crawl for today.
    get_realtime_news()
99 |
--------------------------------------------------------------------------------
/dangdang/appliance/sys/whole_page_url:
--------------------------------------------------------------------------------
1 | http://category.dangdang.com/cid4009643-pg1.html
2 | http://category.dangdang.com/cid4009643-pg2.html
3 | http://category.dangdang.com/cid4009643-pg3.html
4 | http://category.dangdang.com/cid4009643-pg4.html
5 | http://category.dangdang.com/cid4009643-pg5.html
6 | http://category.dangdang.com/cid4009643-pg6.html
7 | http://category.dangdang.com/cid4009643-pg7.html
8 | http://category.dangdang.com/cid4009643-pg8.html
9 | http://category.dangdang.com/cid4009643-pg9.html
10 | http://category.dangdang.com/cid4009643-pg10.html
11 | http://category.dangdang.com/cid4009642-pg1.html
12 | http://category.dangdang.com/cid4009642-pg2.html
13 | http://category.dangdang.com/cid4009642-pg3.html
14 | http://category.dangdang.com/cid4009642-pg4.html
15 | http://category.dangdang.com/cid4009642-pg5.html
16 | http://category.dangdang.com/cid4009642-pg6.html
17 | http://category.dangdang.com/cid4009642-pg7.html
18 | http://category.dangdang.com/cid4009642-pg8.html
19 | http://category.dangdang.com/cid4009642-pg9.html
20 | http://category.dangdang.com/cid4009642-pg10.html
21 | http://category.dangdang.com/cid4009642-pg11.html
22 | http://category.dangdang.com/cid4009635-pg1.html
23 | http://category.dangdang.com/cid4009635-pg2.html
24 | http://category.dangdang.com/cid4009635-pg3.html
25 | http://category.dangdang.com/cid4009635-pg4.html
26 | http://category.dangdang.com/cid4009635-pg5.html
27 | http://category.dangdang.com/cid4009635-pg6.html
28 | http://category.dangdang.com/cid4009635-pg7.html
29 | http://category.dangdang.com/cid4009635-pg8.html
30 | http://category.dangdang.com/cid4009635-pg9.html
31 | http://category.dangdang.com/cid4009635-pg10.html
32 | http://category.dangdang.com/cid4009635-pg11.html
33 | http://category.dangdang.com/cid4009635-pg12.html
34 | http://category.dangdang.com/cid4009635-pg13.html
35 | http://category.dangdang.com/cid4009635-pg14.html
36 | http://category.dangdang.com/cid4009635-pg15.html
37 | http://category.dangdang.com/cid4009635-pg16.html
38 | http://category.dangdang.com/cid4009635-pg17.html
39 | http://category.dangdang.com/cid4009636-pg1.html
40 | http://category.dangdang.com/cid4009636-pg2.html
41 | http://category.dangdang.com/cid4009636-pg3.html
42 | http://category.dangdang.com/cid4009636-pg4.html
43 | http://category.dangdang.com/cid4009636-pg5.html
44 | http://category.dangdang.com/cid4009636-pg6.html
45 | http://category.dangdang.com/cid4009636-pg7.html
46 | http://category.dangdang.com/cid4009636-pg8.html
47 | http://category.dangdang.com/cid4009636-pg9.html
48 | http://category.dangdang.com/cid4009636-pg10.html
49 | http://category.dangdang.com/cid4009636-pg11.html
50 | http://category.dangdang.com/cid4009636-pg12.html
51 | http://category.dangdang.com/cid4009636-pg13.html
52 | http://category.dangdang.com/cid4009636-pg14.html
53 | http://category.dangdang.com/cid4009637-pg1.html
54 | http://category.dangdang.com/cid4009637-pg2.html
55 | http://category.dangdang.com/cid4009637-pg3.html
56 | http://category.dangdang.com/cid4009637-pg4.html
57 | http://category.dangdang.com/cid4009637-pg5.html
58 | http://category.dangdang.com/cid4009637-pg6.html
59 | http://category.dangdang.com/cid4009637-pg7.html
60 | http://category.dangdang.com/cid4009637-pg8.html
61 | http://category.dangdang.com/cid4009637-pg9.html
62 | http://category.dangdang.com/cid4009638-pg1.html
63 | http://category.dangdang.com/cid4009638-pg2.html
64 | http://category.dangdang.com/cid4009638-pg3.html
65 | http://category.dangdang.com/cid4009638-pg4.html
66 | http://category.dangdang.com/cid4009638-pg5.html
67 | http://category.dangdang.com/cid4009638-pg6.html
68 | http://category.dangdang.com/cid4009638-pg7.html
69 | http://category.dangdang.com/cid4009638-pg8.html
70 | http://category.dangdang.com/cid4003236-pg1.html
71 | http://category.dangdang.com/cid4003236-pg2.html
72 | http://category.dangdang.com/cid4003236-pg3.html
73 | http://category.dangdang.com/cid4003236-pg4.html
74 | http://category.dangdang.com/cid4009645-pg1.html
75 | http://category.dangdang.com/cid4009645-pg2.html
76 | http://category.dangdang.com/cid4009645-pg3.html
77 | http://category.dangdang.com/cid4009645-pg4.html
78 | http://category.dangdang.com/cid4009645-pg5.html
79 | http://category.dangdang.com/cid4009645-pg6.html
80 | http://category.dangdang.com/cid4009640-pg1.html
81 | http://category.dangdang.com/cid4009640-pg2.html
82 | http://category.dangdang.com/cid4009640-pg3.html
83 | http://category.dangdang.com/cid4009639-pg1.html
84 | http://category.dangdang.com/cid4009639-pg2.html
85 | http://category.dangdang.com/cid4009639-pg3.html
86 | http://category.dangdang.com/cid4009647-pg1.html
87 | http://category.dangdang.com/cid4009647-pg2.html
88 | http://category.dangdang.com/cid4009647-pg3.html
89 | http://category.dangdang.com/cid4009641-pg1.html
90 | http://category.dangdang.com/cid4009641-pg2.html
91 | http://category.dangdang.com/cid4009646-pg1.html
92 | http://category.dangdang.com/cid4009646-pg2.html
93 | http://category.dangdang.com/cid4009646-pg3.html
94 | http://category.dangdang.com/cid4010054-pg1.html
95 |
--------------------------------------------------------------------------------
/baidu/out/52:
--------------------------------------------------------------------------------
1 | {"url": "http://baike.baidu.com/view/52.htm", "header": "\u673a\u5668\u7801", "content": "\u673a\u5668\u7801\u6307\u7684\u662f\u5c06\u786c\u4ef6\u5e8f\u5217\u53f7\u7ecf\u8fc7\u4e00\u7cfb\u5217\u52a0\u5bc6\u3001\u6563\u5217\u5f62\u6210\u7684\u4e00\u4e32\u5e8f\u5217\u53f7\u3002\u786c\u76d8\uff0cCPU\u90fd\u6709\u4e00\u4e2a\u65e0\u6cd5\u4fee\u6539\u7684\u8bc6\u522b\u7801\u3002\u7f51\u5361\u7684MAC\u5176\u5b9e\u4e5f\u7b97\u4e00\u79cd\uff0c\u4f46\u5b83\u662f\u53ef\u4ee5\u4eba\u4e3a\u4fee\u6539\u7684\u3002\u8f6f\u4ef6\u4e3a\u4e86\u9632\u6b62\u76d7\u7248\uff0c\u91c7\u53d6\u4e86\u4e00\u5b9a\u7684\u4fdd\u62a4\u63aa\u65bd\u3002\u5728\u7528\u6237\u6ce8\u518c\u7684\u65f6\u5019\u4f1a\u6839\u636e\u7528\u6237\u8f6f\u4ef6\u6240\u5b89\u88c5\u7684\u8ba1\u7b97\u673a\u8f6f\u786c\u4ef6\u4fe1\u606f\u751f\u6210\u552f\u4e00\u7684\u8bc6\u522b\u7801\uff0c\u4e00\u822c\u79f0\u4f5c\u673a\u5668\u7801\uff0c\u4e5f\u53eb\u5e8f\u5217\u53f7\u3001\u8ba4\u8bc1\u7801\u3001\u6ce8\u518c\u7533\u8bf7\u7801\u7b49\u3002\u673a\u5668\u7801\u4e00\u822c\u7528\u4f5c\u8f6f\u4ef6\u80fd\u591f\u552f\u4e00\u8bc6\u522b\u7684\u673a\u5668\uff0c\u6ce8\u518c\u8f6f\u4ef6\u65f6\u4f1a\u81ea\u52a8\u6839\u636e\u786c\u4ef6\u914d\u7f6e\u4ea7\u751f\u4e00\u4e32\u5e8f\u53f7\uff0c\u8fd9\u4e32\u5e8f\u53f7\u53eb\u673a\u5668\u7801\uff0c\u8f6f\u4ef6\u63d0\u4f9b\u5546\u4e00\u822c\u6839\u636e\u7528\u6237\u6240\u63d0\u4f9b\u7684\u673a\u5668\u7801\u6765\u4ea7\u751f\u552f\u4e00\u7684\u6ce8\u518c\u7801\uff0c\u8fd9\u6837\u6240\u4f7f\u7528\u7684\u8f6f\u4ef6\u5c31\u53ef\u4ee5\u6b63\u5e38\u5de5\u4f5c\u4e86\u3002\u4e0d\u8fc7\u6709\u4e9b\u9ed1\u5ba2\u4eec\u5229\u7528\u673a\u5668\u7801\u548c\u83b7\u5f97\u7684\u6ce8\u518c\u7801\u4e4b\u95f4\u7684\u5173\u7cfb\uff0c\u7814\u7a76\u51fa\u6ce8\u518c\u7801\u8ba1\u7b97\u5668\uff0c\u628a\u673a\u5668\u7801\u8f93\u5165\u8fdb\u53bb\uff0c\u7ecf\u8fc7\u76f8\u5e94\u7684\u7a0b\u5e8f\u8ba1\u7b97\u5c31\u80fd\u5f97\u5230\u6ce8\u518c\u7801\u3002\u673a\u5668\u7801\u662f\u7531\u60a8\u7684\u7535\u8111\u786c\u4ef6\u4fe1\u60
6f\u4ea7\u751f\uff0c\u4e0d\u4f1a\u6539\u53d8\uff0c\u5982\u679c\u60a8\u683c\u5f0f\u5316\u786c\u76d8\u6216\u91cd\u88c5\u7cfb\u7edf\u5219\u53ef\u80fd\u4f1a\u6539\u53d8\u3002\u5982\u679c\u4e2d\u4e86\u75c5\u6bd2\uff0c\u5219\u4e5f\u53ef\u80fd\u4f1a\u6539\u53d8\u3002\u00a0\u00a0\u7535\u8111\u4e0a\u7684\u673a\u5668\u7801\u4ec0\u4e48\u662f\u673a\u5668\u7801\uff1f\u673a\u5668\u7801\u662f\u8ba1\u7b97\u673a\u7684\u552f\u4e00\u7f16\u53f7\uff0c\u4e00\u822c\u53d6\u81ea\u8ba1\u7b97\u673a\u786c\u4ef6\u7684\u5e8f\u5217\u53f7\u3002\u6211\u4eec\u8f6f\u4ef6\u7684\u6ce8\u518c\u7cfb\u7edf\u4f9d\u8d56\u4e8e\u6b64\u673a\u5668\u7801\u533a\u5206\u4e0d\u540c\u7684\u8ba1\u7b97\u673a\u3002\u7531\u4e8e\u673a\u5668\u7801\u6e90\u81ea\u67d0\u4e9b\u786c\u4ef6\uff0c\u6240\u4ee5\u5f53\u60a8\u8ba1\u7b97\u673a\u7684\u786c\u4ef6\u6539\u53d8\u65f6\u53ef\u80fd\u5bfc\u81f4\u673a\u5668\u7801\u6539\u53d8\uff0c\u800c\u8fd9\u65f6\u6211\u4eec\u7684\u6ce8\u518c\u7cfb\u7edf\u4f1a\u8ba4\u4e3a\u662f\u4e00\u53f0\u65b0\u7684\u8ba1\u7b97\u673a\uff0c\u60a8\u53ea\u9700\u8981\u91cd\u65b0\u6ce8\u518c\u5b83\uff0c\u4f46\u91cd\u65b0\u6ce8\u518c\u53ef\u80fd\u51fa\u73b0\u201c\u5df2\u8d85\u8fc7\u6700\u5927\u6388\u6743\u6570\u201d\u7684\u9519\u8bef\uff0c\u8fd9\u662f\u56e0\u4e3a\u8001\u7684\u786c\u4ef6\u5df2\u88ab\u4f5c\u4e3a\u4e00\u53f0\u7535\u8111\u6ce8\u518c\u7684\u7f18\u6545\uff0c\u60a8\u53ea\u9700\u8981\u6309\u63d0\u793a\u7533\u8bf7\u64a4\u9500\u4e0a\u4e00\u53f0\u5c31\u53ef\u4ee5\u4e86\u3002\u8fd9\u662f\u5b8c\u5168\u6b63\u5e38\u7684\uff0c\u8bf7\u4e0d\u7528\u62c5\u5fc3\uff0c\u6ce8\u518c\u7801\u4e0d\u4f1a\u5931\u6548\uff0c\u8fd9\u79cd\u60c5\u51b5\u6211\u4eec\u4f1a\u4fdd\u969c\u6ce8\u518c\u7801\u53ef\u7528\u3002\u6211\u4eec\u53d6\u7684\u673a\u5668\u7801\u662f\u6765\u81ea\u786c\u76d8\u3001\u4e3b\u677f\u6216CPU\uff0c\u5f53\u8fd9\u4e09\u79cd\u786c\u4ef6\u6539\u53d8\u65f6\u53ef\u80fd\u4f1a\u81f4\u4f7f\u673a\u5668\u7801\u6539\u53d8\u800c\u9700\u8981\u91cd\u65b0\u6ce8\u518c\uff0c\u5176\u5b83\u786c\u4ef6\u6539\u53d8\u5219\u6ca1\u6709\u5
f71\u54cd\u3002\u5176\u4e2d\u6700\u4e3b\u8981\u7684\u662f\u786c\u76d8\uff0c\u4e00\u822c\u60c5\u51b5\u4e0b\u786c\u76d8\u7684\u6539\u53d8\u90fd\u9700\u8981\u91cd\u65b0\u6ce8\u518c\u3002\u4e3b\u677f\u548cCPU\u53ea\u6709\u6781\u4e2a\u522b\u60c5\u51b5\u4e0b\u624d\u4f7f\u7528\u3002\u4ec0\u4e48\u65f6\u5019\u4f7f\u7528\u673a\u5668\u7801\uff1f\u4e00\u822c\u60c5\u51b5\u4e0b\uff0c\u60a8\u4e0d\u9700\u8981\u4f7f\u7528\u4e5f\u6ca1\u6709\u5fc5\u8981\u77e5\u9053\u6b64\u673a\u5668\u7801\uff0c\u56e0\u4e3a\u6240\u6709\u7684\u64cd\u4f5c\u90fd\u662f\u81ea\u52a8\u5b8c\u6210\u00a0\u00a0\u6ce8\u518c\u5e94\u7528\u7684\u673a\u5668\u7801\u7684\uff0c\u6ca1\u6709\u5fc5\u8981\u77e5\u9053\u5b83\u3002\u53ea\u6709\u5728\u6280\u672f\u652f\u6301\u65f6\uff0c\u5982\u679c\u6211\u4eec\u9700\u8981\u60a8\u63d0\u4f9b\u673a\u5668\u7801\uff0c\u8fd9\u65f6\u60a8\u624d\u80fd\u7528\u5230\u5b83\u3002\u5373\u4f7f\u5728\u5f53\u60a8\u65e0\u6cd5\u5b8c\u6210\u5728\u7ebf\u6ce8\u518c\u65f6\uff0c\u4e00\u822c\u4e5f\u4e0d\u9700\u8981\u624b\u5de5\u63d0\u4f9b\u673a\u5668\u7801\uff0c\u7a0b\u5e8f\u81ea\u52a8\u751f\u6210\u7684\u90ae\u4ef6\u4e2d\u5c31\u5305\u62ec\u4e86\u8fd9\u4e9b\u4fe1\u606f\uff0c\u4e07\u4e00\u60a8\u9700\u8981\u624b\u5de5\u4e66\u5199\u7533\u8bf7\u6ce8\u518c\u3001\u7533\u8bf7\u64a4\u9500\u3001\u7533\u8bf7\u7eed\u8ba2\u3001\u540c\u6b65\u7eed\u8ba2\u671f\u9650\u7b49\u90ae\u4ef6\u65f6\u624d\u9700\u8981\u624b\u5de5\u63d0\u4f9b\u8fd9\u4e9b\u673a\u5668\u7801\u4fe1\u606f\u3002"}
--------------------------------------------------------------------------------