119 | if html is None:
120 | return
121 | soup = BeautifulSoup(html, 'html.parser', )
122 | # today = str(datetime.date.today())
123 | # post_date = soup.find('em', id='post-date' )
124 | # if post_date == today:
125 | # get_text() returns unicode
126 | try:
127 | title = soup.find('h2', class_='rich_media_title').get_text().strip(' \n').encode('utf-8')
128 | wname = soup.find('a', id='post-user').get_text().encode('utf-8')
129 | date = soup.find('em', id='post-date').get_text().encode('utf-8')
130 | content = soup.find('div', class_='rich_media_content ').get_text().strip('\n').encode('utf-8')  # article content
131 | readNum = None
132 | praise_num = None
133 | discuss_content = None
134 | discuss_praise = None
135 | except Exception as e:
136 | return None
137 | try:
138 | readNum = soup.find('span', id='sg_readNum3').get_text().encode('utf-8')
139 | praise_num = soup.find('span', id='sg_likeNum3').get_text().encode('utf-8')
140 | discuss_list = soup.find_all('li', class_='discuss_item')
141 | discuss_content = [a.find('div', class_='discuss_message_content').get_text().strip().encode('utf-8') for a in discuss_list]
142 | discuss_praise = [a.find('span', class_='praise_num').get_text().encode('utf-8') for a in discuss_list]
143 | except Exception as e:
144 | pass
145 | # print(e)
146 |
147 | return title, wname, date, content, readNum, praise_num, discuss_content, discuss_praise
148 |
149 | def parse_wechat(self, page_source):
150 | if page_source is None:
151 | return
152 | soup = BeautifulSoup(page_source, 'html.parser',).find('li', id='sogou_vr_11002301_box_0')
153 | account_name = soup.find('a', uigs='account_name_0').get_text().encode('utf-8')
154 | info = soup.find('p', class_='info')
155 | weixinhao = info.find('label').get_text().encode('utf-8')
156 | information = [text for text in info.stripped_strings]
157 | # Sogou update: the average read count was removed from the listing
158 | # if len(information) == 3:
159 | # num1, num2 = re.findall(u'[\\d]+', information[-1])
160 | # else:
161 | # num1 = 'null'
162 | # num2 = 'null'
163 | if len(information) == 3:
164 | num1 = re.findall(u'[\\d]+', information[-1])
165 | else:
166 | num1 = 'null'
167 |
168 | introduction = soup.find_all('dl')
169 | function = introduction[0].find('dd').get_text()
170 | identify = 'null'
171 | if len(introduction) > 1:
172 | if introduction[1].find('dt').get_text().find(u'认证') != -1:  # u'认证' means "verified"
173 | identify = introduction[1].find('dd').get_text()
174 | return account_name, weixinhao, num1, function, identify
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
--------------------------------------------------------------------------------
/spider_main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | # -*- coding: UTF-8 -*-
3 |
4 | # URL manager
5 | """
6 | Add a new URL to the to-crawl set
7 | Check whether a URL to be added is already in the container
8 | Get a URL to crawl
9 | Check whether there are still URLs left to crawl
10 | Move a URL from the to-crawl set to the crawled set
11 | """
12 |
13 | # Page downloader
14 | import urllib2
15 |
16 | """
17 | urllib2
18 | requests
19 | """
20 |
21 | # Page parser
22 | """
23 | Regular expressions
24 | html.parser
25 | BeautifulSoup
26 | lxml
27 | """
28 | import httplib
29 | import url_manager, html_downloader, html_outputer, html_parser
30 | import os
31 | import sys
32 | import codecs
33 | import datetime
34 | import logging
35 | import threadpool
36 | from apscheduler.schedulers.blocking import BlockingScheduler
37 | from multiprocessing.dummy import Pool as ThreadPool
38 | from multiprocessing import Pool
39 |
40 |
41 | class SpiderMain(object):
42 | def __init__(self):
43 | # self.urls = url_manager.UrlManager()
44 | self.downloader = html_downloader.HtmlDownloader()
45 | self.parser = html_parser.HtmlParser()
46 | self.outputer = html_outputer.HtmlOutputer()
47 |
48 | def craw(self, root_url, full_path, name):
49 | '''
50 | :param root_url: Sogou WeChat search URL for the account
51 | :param full_path: directory where the results are stored
52 | :param name: name of the WeChat official account
53 | :return:
54 | '''
55 | new_url = root_url
56 | # html = None
57 | # try:
58 | # html = self.downloader.download_list_ph(new_url, name)
59 | # except httplib.IncompleteRead as e:
60 | # with open(r'list_error.txt', 'a') as f:
61 | # f.write(name.encode('utf-8'))
62 | # f.write('\n')
63 | # if html == None:
64 | # return
65 | # wechat_url, html_cont = html
66 | # acticle_links = self.parser.parse_list(wechat_url, html_cont)
67 | # if acticle_links == None:
68 | # return
69 | html = None
70 | html_list = None
71 | try:
72 | html = self.downloader.download_list_ph(new_url, name)
73 | except httplib.IncompleteRead as e:
74 | with open(r'list_error.txt', 'a') as f:
75 | f.write(name.encode('utf-8'))
76 | f.write('\n')
77 | if html is None:
78 | return
79 | link, page_source = html
80 | # data = self.parser.parse_wechat(page_source)
81 | # self.outputer.wechat_info(data)
82 | try:
83 | html_list = self.downloader.download_list_ph_2(name, link)
84 | except httplib.IncompleteRead as e:
85 | with open(r'list_error.txt', 'a') as f:
86 | f.write(name.encode('utf-8'))
87 | f.write('\n')
88 |
89 | if html_list is None:
90 | return
91 |
92 | article_links = self.parser.parse_list(link, html_list)
93 | if article_links is None:
94 | with open(r'list_error.txt', 'a') as f:
95 | f.write(name.encode('utf-8'))
96 | f.write('\n')
97 | return
98 | 
99 | for link in article_links:
100 | html = self.downloader.download_articles_ph(link)
101 | data = self.parser.parse_article(html)  # parse out the article fields
102 | if data is None:
103 | continue
104 | (title, wname, date, content, readNum, praise_num, discuss_content, discuss_praise) = data
105 | self.outputer.output_mongodb(name, data)
106 | f = open('category1.csv', 'a')
107 | def task(self, link):
108 | data = None
109 | while data is None:
110 | html = self.downloader.download_articles_ph(link)
111 | data = self.parser.parse_article(html)
112 | self.f.write(data[1] + '#' + data[0] + '#' + data[2] + '#' + data[3])
113 | self.f.write('\n')
114 | self.f.flush()
115 |
116 | def craw4key(self, key):
117 | f = open('category1.csv', 'a')
118 | cookie = self.downloader.get_cookies()
119 | for i in range(11, 20):
120 | print('the page is %d' % i)
121 | root_url = u"http://weixin.sogou.com/weixin?type=2&page=%d&ie=utf8&s_from=hotnews&query=%s" % (i, key)
122 | html_list = self.downloader.download_list4key(root_url, cookie)
123 | # pool = ThreadPool(6)
124 | # pool.map(self.downloader.download_articles_ph, html_list)
125 | # pool.close()
126 | # pool.join()
127 | for link in html_list:
128 | data = None
129 | while data is None:
130 | html = self.downloader.download_articles_ph(link)
131 | data = self.parser.parse_article(html)  # parse out the article fields
132 | f.write(data[1] + '#' + data[0] + '#' + data[2] + '#' + data[3])
133 | f.write('\n')
134 | f.flush()
135 | f.close()
136 |
137 | def schedule(self, name):
138 | if name == '':
139 | return 0
140 | root_url = "http://weixin.sogou.com/weixin?type=%d&query=%s" % (1, name)
141 | full_path = None
142 | # full_path = new_path(name)  # storage directory
143 | # type: search type  query: official account name  i: result page number
144 | # oneday = datetime.timedelta(days=1)
145 | # today = str(datetime.date.today())
146 | # file_name = full_path+r'\%s.csv' % today
147 | # if os.path.exists(file_name):
148 | # return 0
149 | try:
150 | self.craw(root_url, full_path, name)
151 | except urllib2.URLError as e:
152 | print(datetime.datetime.now())
153 | print(e)
154 | with open(r'list_error.txt', 'a') as f:
155 | f.write(name.encode('utf-8'))
156 | f.write('\n')
157 |
158 | return 1
159 |
160 | def list_multiprocess(self, filename):
161 | name_list = []
162 | with open(filename) as fout:
163 | for name in fout:
164 | if name[:3] == codecs.BOM_UTF8:
165 | name = name[3:]
166 | named = name.strip('.\n').decode('utf-8')
167 | # print named
168 | name_list.append(named)
169 |
170 | pool = ThreadPool(6)
171 | pool.map(self.schedule, name_list)
172 | pool.close()
173 | pool.join()
174 | self.error_handle()
175 |
176 | def single_job(self, filename):
177 | with open(filename) as fout:
178 | for name in fout:
179 | if name[:3] == codecs.BOM_UTF8:
180 | name = name[3:]
181 | named = name.strip('.\n').decode('utf-8')
182 | print(named)
183 | self.schedule(named)
184 | self.error_handle()
185 | os.remove('list_error.txt')
186 |
187 | # Format preprocessing for the multi-threaded run
188 | def list_handle(self, filename):
189 | name_list = []
190 | with open(filename) as fout:
191 | for name in fout:
192 | if name[:3] == codecs.BOM_UTF8:
193 | name = name[3:]
194 | named = name.strip('.\n').decode('utf-8')
195 | print(named)
196 | name_list.append(named)
197 | pool = threadpool.ThreadPool(4)
198 | requests = threadpool.makeRequests(self.schedule, name_list)
199 | [pool.putRequest(req) for req in requests]
200 | pool.wait()
201 | print('destroy all threads')
202 | pool.dismissWorkers(4, True)
203 |
204 | def error_handle(self):
205 | number = 0
206 | while os.path.exists('list_error.txt'):
207 | number = number + 1
208 | print('error-handling pass %d' % number)
209 | print('start list_error download')
210 | print(datetime.datetime.now())
211 | with open('list_error.txt', ) as f:
212 | names = f.readlines()
213 | for i, name in enumerate(names):
214 | names[i] = name.strip('\n')
215 | os.remove('list_error.txt')
216 | print(names)
217 | pool1 = ThreadPool(3)
218 | try:
219 | pool1.map(self.schedule, names)
220 | pool1.close()
221 | pool1.join()
222 | except:
223 | pass
224 | print(datetime.datetime.now())
225 |
226 |
227 | path = u'd:\\wechat_data1'
228 |
229 |
230 | def mk_dir(full_path):
231 | full_path = full_path.strip()
232 | full_path = full_path.rstrip("\\")
233 | # check whether the path exists
234 | is_exists = os.path.exists(full_path)
235 | if not is_exists:
236 | # create the directory if it does not exist
237 | # directory creation helper
238 | os.makedirs(full_path)
239 | return True
240 | else:
241 | pass
242 | # if the directory already exists, do not create it again
243 |
244 |
245 | def new_path(name):
246 | full_path = path + r'\%s' % name
247 | mk_dir(full_path)
248 | return full_path
249 |
250 |
251 | def job_period():
252 | # ip_pool.ip_collect()  # collect proxy IPs
253 | obj_spider = SpiderMain()
254 | # obj_spider.single_job('D:\\WechatList.txt')
255 | obj_spider.list_multiprocess('D:\\WechatList.txt')
256 |
257 | os.remove('wechat.txt')
258 |
259 |
260 | if __name__ == "__main__":
261 | # logging.basicConfig(filename='log.txt')
262 | # sched = BlockingScheduler()
263 | # sched.add_job(job_period, 'cron', start_date='2017-01-01', hour=1, minute=0, second=0, end_date='2017-12-30')
264 | # a = sched.get_jobs()
265 | # print(a)
266 | # sched.start()
267 |
268 | # job_period()
269 | spider = SpiderMain()
270 | # spider.single_job('D:\\WechatList.txt')
271 | spider.craw4key(u'中兴跳楼')
272 |
--------------------------------------------------------------------------------
/html_downloader.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | # -*- coding: UTF-8 -*-
3 | import requests
4 | import html_parser
5 | import urllib2
6 | from ruokuaicode import RClient
7 | import signal
8 | import exceptions
9 | from PIL import Image
10 | from selenium import webdriver
11 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
12 | from selenium.webdriver.common.by import By
13 | from selenium.webdriver.support.ui import WebDriverWait
14 | from selenium.webdriver.support import expected_conditions as EC
15 | import filecache
16 | import time
17 | import os
18 | import base64
19 | import random
20 | import datetime
21 | import config
22 | import string
23 | import zipfile
24 | import socket
25 | import sys
26 | import logging
27 | from bs4 import BeautifulSoup
28 |
29 | try:
30 | import StringIO
31 |
32 |
33 | def readimg(content):
34 | return Image.open(StringIO.StringIO(content))
35 | except ImportError:
36 | import tempfile
37 |
38 |
39 | def readimg(content):
40 | f = tempfile.TemporaryFile()
41 | f.write(content)
42 | return Image.open(f)
43 |
44 | UA = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
45 | PROXY = "123.56.238.200:8123"
46 | PSIPHON = '127.0.0.1:54552'
47 |
48 | # Proxy server
49 | proxyHost = "proxy.abuyun.com"
50 | proxyPort = "9020"
51 | proxyServer = "http://proxy.abuyun.com:9020"
52 | # Proxy tunnel authentication credentials
53 | proxyUser = ""
54 | proxyPass = ""
55 |
56 | # proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
57 | # "host" : proxyHost,
58 | # "port" : proxyPort,
59 | # "user" : proxyUser,
60 | # "pass" : proxyPass,
61 | # }
62 | # proxyAuth = "Basic " + base64.b64encode(proxyUser + ":" + proxyPass)
63 | # proxy_handler = urllib2.ProxyHandler({
64 | # "http": proxyMeta,
65 | # "https": proxyMeta,
66 | # })
67 | # opener = urllib2.build_opener(proxy_handler)
68 | service_args = [
69 | "--proxy-type=http",
70 | "--proxy=%(host)s:%(port)s" % {
71 | "host": proxyHost,
72 | "port": proxyPort,
73 | },
74 | "--proxy-auth=%(user)s:%(pass)s" % {
75 | "user": proxyUser,
76 | "pass": proxyPass,
77 | },
78 | ]
79 |
80 |
81 | def create_proxy_auth_extension(proxy_host, proxy_port,
82 | proxy_username, proxy_password,
83 | scheme='http', plugin_path=None):
84 | if plugin_path is None:
85 | plugin_path = r'D:/{}_{}@http-dyn.abuyun.com_9020.zip'.format(proxy_username, proxy_password)
86 |
87 | manifest_json = """
88 | {
89 | "version": "1.0.0",
90 | "manifest_version": 2,
91 | "name": "Abuyun Proxy",
92 | "permissions": [
93 | "proxy",
94 | "tabs",
95 | "unlimitedStorage",
96 | "storage",
97 | "
",
98 | "webRequest",
99 | "webRequestBlocking"
100 | ],
101 | "background": {
102 | "scripts": ["background.js"]
103 | },
104 | "minimum_chrome_version":"22.0.0"
105 | }
106 | """
107 |
108 | background_js = string.Template(
109 | """
110 | var config = {
111 | mode: "fixed_servers",
112 | rules: {
113 | singleProxy: {
114 | scheme: "${scheme}",
115 | host: "${host}",
116 | port: parseInt(${port})
117 | },
118 | bypassList: ["foobar.com"]
119 | }
120 | };
121 |
122 | chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});
123 |
124 | function callbackFn(details) {
125 | return {
126 | authCredentials: {
127 | username: "${username}",
128 | password: "${password}"
129 | }
130 | };
131 | }
132 |
133 | chrome.webRequest.onAuthRequired.addListener(
134 | callbackFn,
135 | {urls: [""]},
136 | ['blocking']
137 | );
138 | """
139 | ).substitute(
140 | host=proxy_host,
141 | port=proxy_port,
142 | username=proxy_username,
143 | password=proxy_password,
144 | scheme=scheme,
145 | )
146 |
147 | with zipfile.ZipFile(plugin_path, 'w') as zp:
148 | zp.writestr("manifest.json", manifest_json)
149 | zp.writestr("background.js", background_js)
150 |
151 | return plugin_path
152 |
153 |
154 | proxy_auth_plugin_path = create_proxy_auth_extension(
155 | proxy_host=proxyHost,
156 | proxy_port=proxyPort,
157 | proxy_username=proxyUser,
158 | proxy_password=proxyPass)
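# The zip built above is loaded into Chrome as an extension so the headless
# browser can authenticate against the Abuyun proxy without an interactive
# prompt. A minimal usage sketch (the same pattern is used in down_list1 and
# down_list2 below; the chromedriver path is the one this project assumes):
#
#   opts = webdriver.ChromeOptions()
#   opts.add_extension(proxy_auth_plugin_path)
#   driver = webdriver.Chrome(r'C:\Python27\chromedriver', chrome_options=opts)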
159 |
160 |
161 | def test():
162 | profile_dir = r"D:\MyChrome\Default"
163 | # set request headers
164 | # "Referer": "http://weixin.sogou.com"
165 | chrome_options = webdriver.ChromeOptions()
166 | chrome_options.add_argument("--user-data-dir=" + os.path.abspath(profile_dir))
167 | PROXY = "123.56.238.200:8123"
168 | # j = random.randint(0, len(proxys)-1)
169 | # proxy = proxys[j]
170 | chrome_options.add_argument('--proxy-server=%s' % PROXY)
171 | # chrome_options.add_extension('')  # add a crx extension
172 | # service_args = ['--proxy=localhost:9050', '--proxy-type=socks5', ]
173 | driver = webdriver.Chrome(r'C:\Python27\chromedriver', chrome_options=chrome_options)
174 | driver.get('http://icanhazip.com')
175 | driver.refresh()
176 | print(driver.page_source)
177 | driver.quit()
178 |
179 |
180 | class HtmlDownloader(object):
181 | def __init__(self):
182 | self._ocr = RClient(config.dama_name, config.dama_pswd, config.dama_soft_id, config.dama_soft_key)
183 | self._cache = filecache.WechatCache(config.cache_dir, 60 * 60)
184 | self._session = self._cache.get(config.cache_session_name) if self._cache.get(
185 | config.cache_session_name) else requests.session()
186 | # self.cookie = self.maintain_cookies_ph()
187 | self.agents = [
188 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
189 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
190 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
191 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
192 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
193 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
194 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
195 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
196 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
197 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
198 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
199 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
200 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
201 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
202 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
203 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
204 | ]
205 |
206 | def ocr4wechat(self, url):
207 | # logger.debug('vcode appear, using _ocr_for_get_gzh_article_by_url_text')
208 | timestr = str(time.time()).replace('.', '')
209 | timever = timestr[0:13] + '.' + timestr[13:17]
210 | codeurl = 'http://mp.weixin.qq.com/mp/verifycode?cert=' + timever
211 | coder = self._session.get(codeurl)
212 | if hasattr(self, '_ocr'):
213 | result = self._ocr.create(coder.content, 2040)
214 | img_code = result['Result']
215 | print(img_code)
216 | else:
217 | im = readimg(coder.content)
218 | im.show()
219 | img_code = raw_input("please input code: ")
220 | post_url = 'http://mp.weixin.qq.com/mp/verifycode'
221 | post_data = {
222 | 'cert': timever,
223 | 'input': img_code
224 | }
225 | headers = {
226 | "User-Agent": random.choice(self.agents),
227 | 'Host': 'mp.weixin.qq.com',
228 | 'Referer': url
229 | }
230 | rr = self._session.post(post_url, post_data, headers=headers)
231 | print(rr.text)
232 | # remsg = eval(rr.text)
233 | # if remsg['ret'] != 0:
234 | # logger.error('cannot verify get_gzh_article because ' + remsg['errmsg'])
235 | # raise exceptions.WechatSogouVcodeException('cannot verify wechat_code because ' + remsg['errmsg'])
236 | self._cache.set(config.cache_session_name, self._session)
237 | # logger.debug('ocr ', remsg['errmsg'])
238 |
239 | def download_list(self, url, name):
240 | '''
241 | Use urllib2 to fetch the article-list URL of a WeChat official account
242 | :param url:
243 | :param name:
244 | :return:
245 | '''
246 | urllib2.install_opener(opener)
247 | headers = {
248 | "User-Agent": random.choice(self.agents),
249 | "Referer": 'http://weixin.sogou.com/',
250 | 'Host': 'weixin.sogou.com',
251 | 'Cookie': random.choice(self.cookie)
252 | }
253 | req = urllib2.Request(url, headers=headers)
254 | # req.set_proxy(PROXY, 'http')
255 | try:
256 | response = urllib2.urlopen(req)
257 | time.sleep(2)
258 | except urllib2.URLError as e:
259 | if hasattr(e, 'reason'):
260 | # both HTTPError and URLError have a reason attribute
261 | print('We failed to reach a server.')
262 | print('Reason: %s' % e.reason)
263 | elif hasattr(e, 'code'):
264 | # only HTTPError has a code attribute
265 | print('The server couldn\'t fulfill the request.')
266 | print('Error code: %s' % e.code)
267 | with open(r'list_error.txt', 'a') as f:
268 | f.write(name.encode('utf-8'))
269 | f.write('\n')
270 | return
271 |
272 | try:
273 | a = html_parser.HtmlParser.parse_list_url(response, name)
274 | except AttributeError:
275 | with open(r'list_error.txt', 'a') as f:
276 | f.write(name.encode('utf-8'))
277 | f.write('\n')
278 | return
279 | if a is not None:
280 | time.sleep(1)
281 | return self.download(a, name, url)
282 |
283 | # headers_weixin = {
284 | # "User-Agent": random.choice(self.agents),
285 | # "Referer": 'http://weixin.sogou.com/',
286 | # 'Host': 'mp.weixin.qq.com',
287 | # }
288 | # req1 = urllib2.Request(a, headers=headers_weixin)
289 | # response1 = urllib2.urlopen(req1)
290 | # with open('c:\\a.html', 'a') as f:
291 | # f.write(response1.read())
292 |
293 | def download(self, link, name, url):
294 | """
295 | Download the article list page of the given official account
296 | :param link:
297 | :param name:
298 | :param url:
299 | :return:
300 | """
301 | dcap = dict(DesiredCapabilities.PHANTOMJS)
302 | dcap["phantomjs.page.settings.userAgent"] = (
303 | random.choice(self.agents)
304 | )
305 | dcap["takesScreenshot"] = False
306 | dcap["phantomjs.page.customHeaders.Cookie"] = random.choice(self.cookie)
307 | dcap["phantomjs.page.customHeaders.Proxy-Authorization"] = proxyAuth
308 | # dcap["phantomjs.page.settings.resourceTimeout"] = ("1000")
309 | try:
310 | driver1 = webdriver.PhantomJS(desired_capabilities=dcap, service_args=['--load-images=no',
311 | '--proxy=http://proxy.abuyun.com:9020'])
312 | except Exception as e:
313 | with open(r'list_error.txt', 'a') as f:
314 | f.write(name.encode('utf-8'))
315 | f.write('\n')
316 | print(datetime.datetime.now())
317 | print(url)
318 | print(e)
319 | else:
320 | try:
321 | driver1.set_page_load_timeout(20)
322 | driver1.get(link)
323 | # driver1.get('http://ip.chinaz.com/getip.aspx')
324 | # a = driver1.page_source
325 | b = True
326 | try:
327 | driver1.find_element_by_class_name('page_verify')
328 | except:
329 | b = False
330 |
331 | if b is True:
332 | print('page needs verify, stop the program')
333 | print('the last weixinNUM is %s\n' % name)
334 | # self.ocr4wechat(link)
335 | with open(r'list_error.txt', 'a') as f:
336 | f.write(name.encode('utf-8'))
337 | f.write('\n')
338 | # time.sleep(80)
339 | else:
340 | html = driver1.page_source
341 | return link, html
342 | except Exception as e:
343 | with open(r'list_error.txt', 'a') as f:
344 | f.write(name.encode('utf-8'))
345 | f.write('\n')
346 | print(url)
347 | print(datetime.datetime.now())
348 | print(e)
349 |
350 | finally:
351 | driver1.quit()
352 |
353 | def down_list1(self, url, name):
354 | if url is None:
355 | return None
356 | profile_dir = r"D:\MyChrome\Default"
357 | chrome_options = webdriver.ChromeOptions()
358 | chrome_options.add_argument("--user-data-dir=" + os.path.abspath(profile_dir))
359 | chrome_options.add_argument("--window-size=1920,1080")
360 | chrome_options.add_argument("--headless")
361 | chrome_options.add_extension(proxy_auth_plugin_path)
362 | try:
363 | driver = webdriver.Chrome(r'C:\Python27\chromedriver', chrome_options=chrome_options)
364 | except Exception as e:
365 | with open(r'list_error.txt', 'a') as f:
366 | f.write(name.encode('utf-8'))
367 | f.write('\n')
368 | print(datetime.datetime.now())
369 | print(url)
370 | print(e)
371 | else:
372 | driver.set_page_load_timeout(20)
373 | try:
374 | driver.get(url)
375 | except:
376 | time.sleep(2)
377 | driver.refresh()
378 | try:
379 | driver.find_element_by_id("noresult_part1_container")
380 | a = True
381 | except:
382 | a = False
383 | if a is True:
384 | with open(r'no_wechat.txt', 'a') as f:
385 | f.write(name.encode('utf-8'))
386 | f.write('\n')
387 | # the official account exists
388 | elif a is False:
389 | try:
390 | time.sleep(5)
391 | # driver.save_screenshot('pic1.png')  # screenshot of the account search page
392 | # too many proxy connections can cause this to fail
393 | button = driver.find_element_by_css_selector('a[uigs =\'account_name_0\']')
394 | link = button.get_attribute('href')
395 | return link, driver.page_source
396 | except Exception as e:
397 | with open(r'list_error.txt', 'a') as f:
398 | f.write(name.encode('utf-8'))
399 | f.write('\n')
400 | print(datetime.datetime.now())
401 | print(url)
402 | print(e)
403 | finally:
404 | try:
405 | driver.quit()
406 | except Exception:
407 | pass
408 |
409 | def down_list2(self, name, link):
410 | if link is not None:
411 | profile_dir = r"D:\MyChrome\Default"
412 | chrome_options = webdriver.ChromeOptions()
413 | chrome_options.add_argument("--user-data-dir=" + os.path.abspath(profile_dir))
414 | chrome_options.add_argument("--window-size=1920,1080")
415 | chrome_options.add_argument("--headless")
416 | chrome_options.add_extension(proxy_auth_plugin_path)
417 | try:
418 | driver1 = webdriver.Chrome(r'C:\Python27\chromedriver', chrome_options=chrome_options)
419 | except Exception as e:
420 | with open(r'list_error.txt', 'a') as f:
421 | f.write(name.encode('utf-8'))
422 | f.write('\n')
423 | print(datetime.datetime.now())
424 | print(name)
425 | print(e)
426 | else:
427 | try:
428 | driver1.set_page_load_timeout(20)
429 | driver1.get(link)
430 | time.sleep(5)
431 | driver1.save_screenshot('pic2.png')  # screenshot of the article list
432 | b = True
433 | try:
434 | driver1.find_element_by_class_name('page_verify')
435 | except:
436 | b = False
437 | if b is True:
438 | print('page needs verify, stop the program')
439 | print('the last weixinNUM is %s\n' % name)
440 | # self.ocr4wechat(link)
441 | with open(r'list_error.txt', 'a') as f:
442 | f.write(name.encode('utf-8'))
443 | f.write('\n')
444 | # time.sleep(100)
445 | # os.system('pause')
446 | else:
447 | html = driver1.page_source
448 | with open(r'wechat.txt', 'a') as f:
449 | f.write(name.encode('utf-8') + '\n')
450 | return html
451 | except Exception as e:
452 | with open(r'list_error.txt', 'a') as f:
453 | f.write(name.encode('utf-8'))
454 | f.write('\n')
455 | print(name)
456 | print(datetime.datetime.now())
457 | print(e)
458 |
459 | finally:
460 | try:
461 | driver1.quit()
462 | except Exception:
463 | pass
464 |
465 | def download_list_ph(self, url, name):
466 | '''
467 | Use PhantomJS to download the WeChat official account's article list
468 | :param url:
469 | :param name:
470 | :return:
471 | '''
472 | if url is None:
473 | return None
474 |
475 | dcap = dict(DesiredCapabilities.PHANTOMJS)
476 | dcap["phantomjs.page.settings.userAgent"] = (
477 | random.choice(self.agents)
478 | )
479 | # dcap["takesScreenshot"] = False
480 | # dcap["phantomjs.page.customHeaders.Cookie"] = random.choice(self.cookie)
481 | # dcap["phantomjs.page.customHeaders.Proxy-Authorization"] = proxyAuth
482 | dcap["phantomjs.page.settings.loadImages"] = False
483 | # dcap["phantomjs.page.settings.resourceTimeout"] = ("1000")
484 | try:
485 | driver = webdriver.PhantomJS(desired_capabilities=dcap, service_args=service_args)
486 | except Exception as e:
487 | with open(r'list_error.txt', 'a') as f:
488 | f.write(name.encode('utf-8'))
489 | f.write('\n')
490 | print(datetime.datetime.now())
491 | print(url)
492 | print(e)
493 |
494 | else:
495 | driver.set_page_load_timeout(20)
496 | try:
497 | driver.get(url)
498 | except:
499 | time.sleep(2)
500 | driver.refresh()
501 | try:
502 | driver.find_element_by_id("noresult_part1_container")
503 | a = True
504 | except:
505 | a = False
506 | if a is True:
507 | with open(r'no_wechat.txt', 'a') as f:
508 | f.write(name.encode('utf-8'))
509 | f.write('\n')
510 | # the official account exists
511 | elif a is False:
512 | try:
513 | time.sleep(5)
514 | # driver.save_screenshot('pic1.png')  # screenshot of the account search page
515 | # too many proxy connections can cause this to fail
516 | button = driver.find_element_by_css_selector('a[uigs =\'account_name_0\']')
517 | link = button.get_attribute('href')
518 | return link, driver.page_source
519 | except Exception as e:
520 | with open(r'list_error.txt', 'a') as f:
521 | f.write(name.encode('utf-8'))
522 | f.write('\n')
523 | print(datetime.datetime.now())
524 | print(url)
525 | print(e)
526 | finally:
527 | try:
528 | driver.quit()
529 | except Exception:
530 | pass
531 |
532 |
533 | # Fetch the official account's article list
534 |
535 | def download_list_ph_2(self, name, link):
536 | if link is not None:
537 | dcap = dict(DesiredCapabilities.PHANTOMJS)
538 | dcap["phantomjs.page.settings.userAgent"] = (
539 | random.choice(self.agents)
540 | )
541 | # dcap["takesScreenshot"] = False
542 | # dcap["phantomjs.page.customHeaders.Cookie"] = random.choice(self.cookie)
543 | # dcap["phantomjs.page.customHeaders.Proxy-Authorization"] = proxyAuth
544 | try:
545 | driver1 = webdriver.PhantomJS(desired_capabilities=dcap, service_args=service_args)
546 | except Exception as e:
547 | with open(r'list_error.txt', 'a') as f:
548 | f.write(name.encode('utf-8'))
549 | f.write('\n')
550 | print(datetime.datetime.now())
551 | print(name)
552 | print(e)
553 | else:
554 | try:
555 | driver1.set_page_load_timeout(20)
556 | driver1.get(link)
557 | time.sleep(5)
558 | driver1.save_screenshot('pic2.png')  # screenshot of the article list
559 | b = True
560 | # while b is True:
561 | # try:
562 | # driver1.find_element_by_class_name('page_verify')
563 | # except:
564 | # b = False
565 | # if b is True:
566 | # driver1.refresh()
567 | # time.sleep(2)
568 | try:
569 | driver1.find_element_by_class_name('page_verify')
570 | except:
571 | b = False
572 | if b is True:
573 | print('page needs verify, stop the program')
574 | print('the last weixinNUM is %s\n' % name)
575 | # self.ocr4wechat(link)
576 | with open(r'list_error.txt', 'a') as f:
577 | f.write(name.encode('utf-8'))
578 | f.write('\n')
579 | # time.sleep(100)
580 | # os.system('pause')
581 | else:
582 | html = driver1.page_source
583 | with open(r'wechat.txt', 'a') as f:
584 | f.write(name.encode('utf-8') + '\n')
585 | return html
586 | except Exception as e:
587 | with open(r'list_error.txt', 'a') as f:
588 | f.write(name.encode('utf-8'))
589 | f.write('\n')
590 | print(name)
591 | print(datetime.datetime.now())
592 | print(e)
593 |
594 | finally:
595 | try:
596 | driver1.quit()
597 | except Exception:
598 | pass
599 |
600 | def download_list4key(self, link, cookie):
601 | links = []
602 | dcap = dict(DesiredCapabilities.CHROME)
603 | if link is not None:
604 | try:
605 | driver1 = webdriver.Chrome()
606 | driver1.delete_all_cookies()
607 | driver1.add_cookie(cookie)
608 | except Exception as e:
609 | print(datetime.datetime.now())
610 | print(e)
611 | driver1.set_page_load_timeout(20)
612 | driver1.get(link)
613 | time.sleep(5)
614 | html = driver1.page_source
615 | soup = BeautifulSoup(html, 'html.parser', )
616 | articles = soup.find_all('h3', )
617 | for article in articles:
618 | links.append(article.find('a').get('href'))
619 | driver1.close()
620 | return links
621 | def demo(self):
622 | links = []
623 | driver = webdriver.Chrome()
624 | driver.get("http://weixin.sogou.com/")
625 | time.sleep(5)
626 | driver.find_element_by_xpath('//*[@id="loginBtn"]').click()
627 | time.sleep(10)
628 | driver.find_element_by_class_name('query').send_keys(u'中兴跳楼')
629 | driver.find_element_by_class_name('swz').click()
630 | c = 0
631 | while True:
632 | if c == 40:
633 | break
634 | time.sleep(3)
635 | html = driver.page_source
636 | soup = BeautifulSoup(html, 'html.parser', )
637 | articles = soup.find_all('h3', )
638 | for article in articles:
639 | links.append(article.find('a').get('href'))
640 | # for link in links:
641 | # data = None
642 | # while data is None:
643 | # html = self.download_articles_ph(link)
644 | # data = parser.parse_article(html)  # parse out the article fields
645 | # f.write(data[1] + '#' + data[0] + '#' + data[2] + '#' + data[3])
646 | # f.write('\n')
647 | # f.flush()
648 | driver.find_element_by_class_name('np').click()
649 | c += 1
650 | return links
651 |
652 |
653 |
654 | def download_list_chrome(self, url, name):
655 | if url is None:
656 | return None
657 | profile_dir = r"D:\MyChrome\Default"
658 | # "Referer": "http://weixin.sogou.com"
659 | chrome_options = webdriver.ChromeOptions()
660 | chrome_options.add_argument("--user-data-dir=" + os.path.abspath(profile_dir))
661 | chrome_options.add_argument('--proxy-server=%s' % PROXY)
662 | # chrome_options.add_extension('')  # add a crx extension
663 | # service_args = ['--proxy=localhost:9050', '--proxy-type=socks5', ]
664 | try:
665 | driver = webdriver.Chrome(r'C:\Python27\chromedriver', chrome_options=chrome_options)
666 | except Exception as e:
667 | with open(r'list_error.txt', 'a') as f:
668 | f.write(name.encode('utf-8'))
669 | f.write('\n')
670 | print(datetime.datetime.now())
671 | print(url)
672 | print(e)
673 | else:
674 | try:
675 | driver.set_page_load_timeout(20)
676 | try:
677 | driver.get('http://weixin.sogou.com/')
678 | except:
679 | time.sleep(3)
680 | driver.refresh()
681 | # driver.implicitly_wait(5)
682 | # can trigger "too many requests"
683 | driver.delete_all_cookies()
684 | i = random.randint(0, 4)
685 | for cookie in self.cookie[i]:
686 | driver.add_cookie(cookie)
687 | time.sleep(1)
688 | try:
689 | driver.get(url)
690 | except:
691 | time.sleep(2)
692 | driver.refresh()
693 | time.sleep(2)
694 | # check whether this official account exists
695 | try:
696 | driver.find_element_by_id("noresult_part1_container")
697 | a = True
698 | except:
699 | a = False
700 | if a is True:
701 | with open(r'no_wechat.txt', 'a') as f:
702 | f.write(name.encode('utf-8'))
703 | f.write('\n')
704 | elif a is False:
705 | # deal with "too many connections"
706 | try:
707 | WebDriverWait(driver, 5).until(
708 | EC.presence_of_element_located((By.ID, "sogou_vr_11002301_box_0"))
709 | )
710 | except:
711 | time.sleep(2)
712 | driver.refresh()
713 | now_handle = driver.current_window_handle
714 | driver.find_element_by_id('sogou_vr_11002301_box_0').click()
715 | # a verification page may appear here
716 | time.sleep(2)
717 | all_handles = driver.window_handles
718 | for handle in all_handles:
719 | if handle != now_handle:
720 | driver.switch_to.window(handle)  # switch to the new window
721 | # check whether the page is a verification page
722 | # b = True
723 | # while b is True:
724 | # try:
725 | # driver.find_element_by_class_name("page_verify")
726 | # b = True
727 | # driver.refresh()
728 | # time.sleep(2)
729 | # except:
730 | # b = False
731 | #
732 | # # wait for the list to appear
733 | # try:
734 | # WebDriverWait(driver, 5).until(
735 | # EC.presence_of_element_located((By.CLASS_NAME, "weui_msg_card_hd"))
736 | # )
737 | # except:
738 | # driver.refresh()
739 | # time.sleep(2)
740 | # html = driver.page_source  # page source after dynamic loading
741 | wechat_url = driver.current_url
742 | i = random.randint(0, 4)
743 | dcap = dict(DesiredCapabilities.PHANTOMJS)
744 | dcap["phantomjs.page.settings.userAgent"] = (
745 | UA
746 | )
747 | dcap["takesScreenshot"] = (False)
748 | dcap["phantomjs.page.customHeaders.Cookie"] = self.cookie[i]
749 | try:
750 | driver1 = webdriver.PhantomJS(desired_capabilities=dcap, service_args=['--load-images=no'])
751 | except Exception as e:
752 | with open(r'list_error.txt', 'a') as f:
753 | f.write(name.encode('utf-8'))
754 | f.write('\n')
755 | print(datetime.datetime.now())
756 | print(url)
757 | print(e)
758 | else:
759 | try:
760 | driver1.set_page_load_timeout(20)
761 | driver1.get(wechat_url)
762 | html = driver1.page_source
763 | return wechat_url, html
764 | # except Exception as e:
765 | # with open(r'list_error.txt', 'a') as f:
766 | # f.write(name.encode('utf-8'))
767 | # f.write('\n')
768 | # print(datetime.datetime.now())
769 | # print(url)
770 | # print(e)
771 | finally:
772 | driver1.quit()
773 | # return wechat_url, html
774 | except Exception as e:
775 | with open(r'list_error.txt', 'a') as f:
776 | f.write(name.encode('utf-8'))
777 | f.write('\n')
778 | print(url)
779 | print(datetime.datetime.now())
780 | print(e)
781 | finally:
782 | driver.quit()
783 | # if a is False:
784 | # i = random.randint(0, 4)
785 | # dcap = dict(DesiredCapabilities.PHANTOMJS)
786 | # dcap["phantomjs.page.settings.userAgent"] = (
787 | # UA
788 | # )
789 | # dcap["takesScreenshot"] = (False)
790 | # dcap["phantomjs.page.customHeaders.Cookie"] = self.cookie[i]
791 | # try:
792 | # driver1 = webdriver.PhantomJS(desired_capabilities=dcap, service_args=['--load-images=no'])
793 | # except Exception as e:
794 | # print(datetime.datetime.now())
795 | # print(url)
796 | # print(e)
797 | # else:
798 | # try:
799 | # driver1.set_page_load_timeout(20)
800 | # driver1.get(wechat_url)
801 | # html = driver1.page_source
802 | # return wechat_url, html
803 | # except Exception as e:
804 | # print(datetime.datetime.now())
805 | # print(url)
806 | # print(e)
807 | # finally:
808 | # driver1.quit()
809 |
810 |
811 |
812 | # response = urllib2.urlopen(url)
813 | # if response.getcode() != 200:
814 | # return None
815 | # return response.read()
816 |
817 | def download_articles_ph(self, url):
818 | '''
819 | Use PhantomJS to download an article
820 | :param url: article URL
821 | :return:
822 | '''
823 | if url is None:
824 | return None
825 | dcap = dict(DesiredCapabilities.PHANTOMJS)
826 | dcap["phantomjs.page.settings.userAgent"] = (
827 | UA
828 | )
829 | dcap["takesScreenshot"] = (False)
830 | try:
831 | driver = webdriver.PhantomJS(desired_capabilities=dcap,
832 | service_args=['--load-images=no', ])
833 | except Exception as e:
834 | print(datetime.datetime.now())
835 | print(url)
836 | print(e)
837 | else:
838 | try:
839 | driver.set_page_load_timeout(30)
840 | driver.get(url)
841 | time.sleep(5)
842 | # driver.implicitly_wait(2)
843 | html = driver.page_source
844 | return html
845 | except:
846 | print(datetime.datetime.now())
847 | print(url)
848 | finally:
849 | try:
850 | driver.quit()
851 | except Exception:
852 | pass
853 |
854 | def download_articles_chrome(self, url):
855 | # service_args = ['--load-images=no', ]
856 | profile_dir = r"D:\MyChrome\Default"
857 | chrome_options = webdriver.ChromeOptions()
858 | chrome_options.add_argument("--user-data-dir=" + os.path.abspath(profile_dir))
859 | # PROXY = "123.56.238.200:8123"
860 | # # j = random.randint(0, len(proxys)-1)
861 | # # proxy = proxys[j]
862 | # chrome_options.add_argument('--proxy-server=%s' % PROXY)
863 | # chrome_options.add_extension('')  # add a crx extension
864 | # service_args = ['--proxy=localhost:9050', '--proxy-type=socks5', '--load-images=no', ]
865 | try:
866 | driver = webdriver.Chrome(r'C:\Python27\chromedriver', chrome_options=chrome_options)
867 | except Exception as e:
868 | print(datetime.datetime.now())
869 | print(url)
870 | print(e)
871 | else:
872 |
873 | try:
874 | driver.set_page_load_timeout(30)
875 | driver.get(url)
876 | driver.implicitly_wait(2)
877 | html = driver.page_source
878 | return html
879 | except:
880 | print(datetime.datetime.now())
881 | print(url)
882 | # selenium.common.exceptions.TimeoutException:
883 | # return self.download_acticles(url)
884 | return None
885 | finally:
886 | driver.quit()
887 |
888 | def maintain_cookies(self):
889 | cookie = []
890 | # collect 5 sets of cookies
891 | for i in range(5):
892 | driver = webdriver.Chrome(r'C:\Python27\chromedriver')
893 | driver.get("http://weixin.sogou.com/")
894 | # grab the cookie info
895 | cookie.append(driver.get_cookies())
896 | print(driver.get_cookies())
897 | driver.quit()
898 |
899 | return cookie
900 |
901 | def maintain_cookies_ph(self):
902 | dcap = dict(DesiredCapabilities.PHANTOMJS)
903 | dcap["phantomjs.page.settings.userAgent"] = UA
904 | cookie = []
905 | # collect 10 sets of cookies
906 | for i in range(10):
907 | driver = webdriver.PhantomJS(desired_capabilities=dcap,
908 | service_args=['--load-images=no', ])
909 | driver.get("http://weixin.sogou.com/")
910 | # grab the cookie info
911 | cookie.append(driver.get_cookies())
912 | # print(driver.get_cookies())
913 | driver.quit()
914 | return cookie
915 |
916 | def get_cookies(self):
917 | driver = webdriver.Chrome()
918 | driver.get("http://weixin.sogou.com/")
919 | time.sleep(5)
920 | driver.find_element_by_xpath('//*[@id="loginBtn"]').click()
921 | time.sleep(10)
922 |
923 | cookies = driver.get_cookies()
924 | cookie = {}
925 | for items in cookies:
926 | cookie[items.get('name')] = items.get('value')
927 | return cookie
928 | b = html_parser.HtmlParser()  # shared parser used by task()
929 | f = open('category2.csv', 'a')  # shared output handle used by task()
930 | def task(self, link):
931 | data = None
932 | while data is None:
933 | html = self.download_articles_ph(link)
934 | data = self.b.parse_article(html)
935 | self.f.write(data[1] + '#' + data[0] + '#' + data[2] + '#' + data[3])
936 | self.f.write('\n')
937 |
938 |
939 |
940 | if __name__ == "__main__":
941 |
942 | a = HtmlDownloader()
943 | b = html_parser.HtmlParser()
944 | links = a.demo()
945 | from multiprocessing.dummy import Pool as ThreadPool
946 | pool = ThreadPool(6)
947 | pool.map(a.task, links)
948 | pool.close()
949 | pool.join()
950 | # # a.ocr4wechat('http://mp.weixin.qq.com/s?timestamp=1478687270&src=3&ver=1&signature=5RtOXxZ16P0x8hvN7sARkESooWCRi1F-'
951 | # 'AcdjyV1phiMF7EC8fCYB1STlGWMUeoUQtSoEFQC26jd-X-*3GiGa-ZwBJQBld54xrGpEc81g*kjGncNNXLgRkpw5WIoCO5T-KbO'
952 | # 'xjsRjYFvrvDaynu1I7vvIE9itjIEzCa77YZuMMyM=')
953 | # a.download_list_chrome("http://weixin.sogou.com/weixin?type=%d&query=%s" % (1, 'renmin'), u'renmin')
954 |
--------------------------------------------------------------------------------