119 | if html is None:
120 | return
121 | soup = BeautifulSoup(html, 'html.parser', )
122 | # today = str(datetime.date.today())
123 | # post_date = soup.find('em', id='post-date' )
124 | # if post_date == today:
125 | # get_text() returns unicode
126 | try:
127 | title = soup.find('h2', class_='rich_media_title').get_text().strip(' \n').encode('utf-8')
128 | wname = soup.find('a', id='post-user').get_text().encode('utf-8')
129 | date = soup.find('em', id='post-date').get_text().encode('utf-8')
130 | content = soup.find('div', class_='rich_media_content ').get_text().strip('\n').encode('utf-8')  # article content
131 | readNum = None
132 | praise_num = None
133 | discuss_content = None
134 | discuss_praise = None
135 | except Exception as e:
136 | return None
137 | try:
138 | readNum = soup.find('span', id='sg_readNum3').get_text().encode('utf-8')
139 | praise_num = soup.find('span', id='sg_likeNum3').get_text().encode('utf-8')
140 | discuss_list = soup.find_all('li', class_='discuss_item')
141 | discuss_content = [a.find('div', class_='discuss_message_content').get_text().strip().encode('utf-8') for a in discuss_list]
142 | discuss_praise = [a.find('span', class_='praise_num').get_text().encode('utf-8') for a in discuss_list]
143 | except Exception as e:
144 | pass
145 | # print(e)
146 |
147 | return title, wname, date, content, readNum, praise_num, discuss_content, discuss_praise
148 |
149 | def parse_wechat(self, page_source):
150 | if page_source is None:
151 | return
152 | soup = BeautifulSoup(page_source, 'html.parser',).find('li', id='sogou_vr_11002301_box_0')
153 | account_name = soup.find('a', uigs='account_name_0').get_text().encode('utf-8')
154 | info = soup.find('p', class_='info')
155 | weixinhao = info.find('label').get_text().encode('utf-8')
156 | information = [text for text in info.stripped_strings]
157 | # Sogou update: the average read count was removed from the listing
158 | # if len(information) == 3:
159 | # num1, num2 = re.findall(u'[\\d]+', information[-1])
160 | # else:
161 | # num1 = 'null'
162 | # num2 = 'null'
163 | if len(information) == 3:
164 | num1 = re.findall(u'[\\d]+', information[-1])
165 | else:
166 | num1 = 'null'
167 |
168 | introduction = soup.find_all('dl')
169 | function = introduction[0].find('dd').get_text()
170 | identify = 'null'
171 | if len(introduction) > 1:
172 | if introduction[1].find('dt').get_text().find(u'认证') != -1:  # u'认证' means "verified"
173 | identify = introduction[1].find('dd').get_text()
174 | return account_name, weixinhao, num1, function, identify
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
--------------------------------------------------------------------------------
/spider_main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | # -*- coding: UTF-8 -*-
3 |
4 | # URL manager
5 | """
6 | Add a new URL to the to-crawl set
7 | Check whether a URL to be added is already in the container
8 | Get a URL to crawl
9 | Check whether there are still URLs left to crawl
10 | Move a URL from the to-crawl set to the crawled set
11 | """
12 |
13 | # Page downloader
14 | import urllib2
15 |
16 | """
17 | urllib2
18 | requests
19 | """
20 |
21 | # Page parser
22 | """
23 | Regular expressions
24 | html.parser
25 | BeautifulSoup
26 | lxml
27 | """
28 | import httplib
29 | import url_manager, html_downloader, html_outputer, html_parser
30 | import os
31 | import sys
32 | import codecs
33 | import datetime
34 | import logging
35 | import threadpool
36 | from apscheduler.schedulers.blocking import BlockingScheduler
37 | from multiprocessing.dummy import Pool as ThreadPool
38 | from multiprocessing import Pool
39 |
40 |
41 | class SpiderMain(object):
42 | def __init__(self):
43 | # self.urls = url_manager.UrlManager()
44 | self.downloader = html_downloader.HtmlDownloader()
45 | self.parser = html_parser.HtmlParser()
46 | self.outputer = html_outputer.HtmlOutputer()
47 |
48 | def craw(self, root_url, full_path, name):
49 | '''
50 | :param root_url: Sogou WeChat search URL for the account
51 | :param full_path: directory where the results are stored
52 | :param name: name of the WeChat official account
53 | :return:
54 | '''
55 | new_url = root_url
56 | # html = None
57 | # try:
58 | # html = self.downloader.download_list_ph(new_url, name)
59 | # except httplib.IncompleteRead as e:
60 | # with open(r'list_error.txt', 'a') as f:
61 | # f.write(name.encode('utf-8'))
62 | # f.write('\n')
63 | # if html == None:
64 | # return
65 | # wechat_url, html_cont = html
66 | # acticle_links = self.parser.parse_list(wechat_url, html_cont)
67 | # if acticle_links == None:
68 | # return
69 | html = None
70 | html_list = None
71 | try:
72 | html = self.downloader.download_list_ph(new_url, name)
73 | except httplib.IncompleteRead as e:
74 | with open(r'list_error.txt', 'a') as f:
75 | f.write(name.encode('utf-8'))
76 | f.write('\n')
77 | if html is None:
78 | return
79 | link, page_source = html
80 | # data = self.parser.parse_wechat(page_source)
81 | # self.outputer.wechat_info(data)
82 | try:
83 | html_list = self.downloader.download_list_ph_2(name, link)
84 | except httplib.IncompleteRead as e:
85 | with open(r'list_error.txt', 'a') as f:
86 | f.write(name.encode('utf-8'))
87 | f.write('\n')
88 |
89 | if html_list is None:
90 | return
91 |
92 | article_links = self.parser.parse_list(link, html_list)
93 | if article_links is None:
94 | with open(r'list_error.txt', 'a') as f:
95 | f.write(name.encode('utf-8'))
96 | f.write('\n')
97 | return
98 | 
99 | for link in article_links:
100 | html = self.downloader.download_articles_ph(link)
101 | data = self.parser.parse_article(html)  # parse out the article fields
102 | if data is None:
103 | continue
104 | (title, wname, date, content, readNum, praise_num, discuss_content, discuss_praise) = data
105 | self.outputer.output_mongodb(name, data)
106 | f = open('category1.csv', 'a')
107 | def task(self, link):
108 | data = None
109 | while data is None:
110 | html = self.downloader.download_articles_ph(link)
111 | data = self.parser.parse_article(html)
112 | self.f.write(data[1] + '#' + data[0] + '#' + data[2] + '#' + data[3])
113 | self.f.write('\n')
114 | self.f.flush()
115 |
116 | def craw4key(self, key):
117 | f = open('category1.csv', 'a')
118 | cookie = self.downloader.get_cookies()
119 | for i in range(11, 20):
120 | print('the page is %d' % i)
121 | root_url = u"http://weixin.sogou.com/weixin?type=2&page=%d&ie=utf8&s_from=hotnews&query=%s" % (i, key)
122 | html_list = self.downloader.download_list4key(root_url, cookie)
123 | # pool = ThreadPool(6)
124 | # pool.map(self.downloader.download_articles_ph, html_list)
125 | # pool.close()
126 | # pool.join()
127 | for link in html_list:
128 | data = None
129 | while data is None:
130 | html = self.downloader.download_articles_ph(link)
131 | data = self.parser.parse_article(html)  # parse out the article fields
132 | f.write(data[1] + '#' + data[0] + '#' + data[2] + '#' + data[3])
133 | f.write('\n')
134 | f.flush()
135 | f.close()
136 |
137 | def schedule(self, name):
138 | if name == '':
139 | return 0
140 | root_url = "http://weixin.sogou.com/weixin?type=%d&query=%s" % (1, name)
141 | full_path = None
142 | # full_path = new_path(name)  # storage directory
143 | # type: search type  query: official account name  i: result page number
144 | # oneday = datetime.timedelta(days=1)
145 | # today = str(datetime.date.today())
146 | # file_name = full_path+r'\%s.csv' % today
147 | # if os.path.exists(file_name):
148 | # return 0
149 | try:
150 | self.craw(root_url, full_path, name)
151 | except urllib2.URLError as e:
152 | print(datetime.datetime.now())
153 | print(e)
154 | with open(r'list_error.txt', 'a') as f:
155 | f.write(name.encode('utf-8'))
156 | f.write('\n')
157 |
158 | return 1
159 |
160 | def list_multiprocess(self, filename):
161 | name_list = []
162 | with open(filename) as fout:
163 | for name in fout:
164 | if name[:3] == codecs.BOM_UTF8:
165 | name = name[3:]
166 | named = name.strip('.\n').decode('utf-8')
167 | # print named
168 | name_list.append(named)
169 |
170 | pool = ThreadPool(6)
171 | pool.map(self.schedule, name_list)
172 | pool.close()
173 | pool.join()
174 | self.error_handle()
175 |
176 | def single_job(self, filename):
177 | with open(filename) as fout:
178 | for name in fout:
179 | if name[:3] == codecs.BOM_UTF8:
180 | name = name[3:]
181 | named = name.strip('.\n').decode('utf-8')
182 | print(named)
183 | self.schedule(named)
184 | self.error_handle()
185 | os.remove('list_error.txt')
186 |
187 | # Format preprocessing for the multi-threaded run
188 | def list_handle(self, filename):
189 | name_list = []
190 | with open(filename) as fout:
191 | for name in fout:
192 | if name[:3] == codecs.BOM_UTF8:
193 | name = name[3:]
194 | named = name.strip('.\n').decode('utf-8')
195 | print(named)
196 | name_list.append(named)
197 | pool = threadpool.ThreadPool(4)
198 | requests = threadpool.makeRequests(self.schedule, name_list)
199 | [pool.putRequest(req) for req in requests]
200 | pool.wait()
201 | print('destroy all threads')
202 | pool.dismissWorkers(4, True)
203 |
204 | def error_handle(self):
205 | number = 0
206 | while os.path.exists('list_error.txt'):
207 | number = number + 1
208 | print('error-handling pass %d' % number)
209 | print('start list_error download')
210 | print(datetime.datetime.now())
211 | with open('list_error.txt', ) as f:
212 | names = f.readlines()
213 | for i, name in enumerate(names):
214 | names[i] = name.strip('\n')
215 | os.remove('list_error.txt')
216 | print(names)
217 | pool1 = ThreadPool(3)
218 | try:
219 | pool1.map(self.schedule, names)
220 | pool1.close()
221 | pool1.join()
222 | except:
223 | pass
224 | print(datetime.datetime.now())
225 |
226 |
227 | path = u'd:\\wechat_data1'
228 |
229 |
230 | def mk_dir(full_path):
231 | full_path = full_path.strip()
232 | full_path = full_path.rstrip("\\")
233 | # check whether the path exists
234 | is_exists = os.path.exists(full_path)
235 | if not is_exists:
236 | # create the directory if it does not exist
237 | # directory creation helper
238 | os.makedirs(full_path)
239 | return True
240 | else:
241 | pass
242 | # if the directory already exists, do not create it again
243 |
244 |
245 | def new_path(name):
246 | full_path = path + r'\%s' % name
247 | mk_dir(full_path)
248 | return full_path
249 |
250 |
251 | def job_period():
252 | # ip_pool.ip_collect()  # collect proxy IPs
253 | obj_spider = SpiderMain()
254 | # obj_spider.single_job('D:\\WechatList.txt')
255 | obj_spider.list_multiprocess('D:\\WechatList.txt')
256 |
257 | os.remove('wechat.txt')
258 |
259 |
260 | if __name__ == "__main__":
261 | # logging.basicConfig(filename='log.txt')
262 | # sched = BlockingScheduler()
263 | # sched.add_job(job_period, 'cron', start_date='2017-01-01', hour=1, minute=0, second=0, end_date='2017-12-30')
264 | # a = sched.get_jobs()
265 | # print(a)
266 | # sched.start()
267 |
268 | # job_period()
269 | spider = SpiderMain()
270 | # spider.single_job('D:\\WechatList.txt')
271 | spider.craw4key(u'中兴跳楼')
272 |
--------------------------------------------------------------------------------
/html_downloader.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python2
2 | # -*- coding: UTF-8 -*-
3 | import requests
4 | import html_parser
5 | import urllib2
6 | from ruokuaicode import RClient
7 | import signal
8 | import exceptions
9 | from PIL import Image
10 | from selenium import webdriver
11 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
12 | from selenium.webdriver.common.by import By
13 | from selenium.webdriver.support.ui import WebDriverWait
14 | from selenium.webdriver.support import expected_conditions as EC
15 | import filecache
16 | import time
17 | import os
18 | import base64
19 | import random
20 | import datetime
21 | import config
22 | import string
23 | import zipfile
24 | import socket
25 | import sys
26 | import logging
27 | from bs4 import BeautifulSoup
28 |
29 | try:
30 | import StringIO
31 |
32 |
33 | def readimg(content):
34 | return Image.open(StringIO.StringIO(content))
35 | except ImportError:
36 | import tempfile
37 |
38 |
39 | def readimg(content):
40 | f = tempfile.TemporaryFile()
41 | f.write(content)
42 | return Image.open(f)
43 |
44 | UA = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"
45 | PROXY = "123.56.238.200:8123"
46 | PSIPHON = '127.0.0.1:54552'
47 |
48 | # Proxy server
49 | proxyHost = "proxy.abuyun.com"
50 | proxyPort = "9020"
51 | proxyServer = "http://proxy.abuyun.com:9020"
52 | # Proxy tunnel authentication credentials
53 | proxyUser = ""
54 | proxyPass = ""
55 |
56 | # proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
57 | # "host" : proxyHost,
58 | # "port" : proxyPort,
59 | # "user" : proxyUser,
60 | # "pass" : proxyPass,
61 | # }
62 | # proxyAuth = "Basic " + base64.b64encode(proxyUser + ":" + proxyPass)
63 | # proxy_handler = urllib2.ProxyHandler({
64 | # "http": proxyMeta,
65 | # "https": proxyMeta,
66 | # })
67 | # opener = urllib2.build_opener(proxy_handler)
68 | service_args = [
69 | "--proxy-type=http",
70 | "--proxy=%(host)s:%(port)s" % {
71 | "host": proxyHost,
72 | "port": proxyPort,
73 | },
74 | "--proxy-auth=%(user)s:%(pass)s" % {
75 | "user": proxyUser,
76 | "pass": proxyPass,
77 | },
78 | ]
79 |
80 |
81 | def create_proxy_auth_extension(proxy_host, proxy_port,
82 | proxy_username, proxy_password,
83 | scheme='http', plugin_path=None):
84 | if plugin_path is None:
85 | plugin_path = r'D:/{}_{}@http-dyn.abuyun.com_9020.zip'.format(proxy_username, proxy_password)
86 |
87 | manifest_json = """
88 | {
89 | "version": "1.0.0",
90 | "manifest_version": 2,
91 | "name": "Abuyun Proxy",
92 | "permissions": [
93 | "proxy",
94 | "tabs",
95 | "unlimitedStorage",
96 | "storage",
97 | "
",
98 | "webRequest",
99 | "webRequestBlocking"
100 | ],
101 | "background": {
102 | "scripts": ["background.js"]
103 | },
104 | "minimum_chrome_version":"22.0.0"
105 | }
106 | """
107 |
108 | background_js = string.Template(
109 | """
110 | var config = {
111 | mode: "fixed_servers",
112 | rules: {
113 | singleProxy: {
114 | scheme: "${scheme}",
115 | host: "${host}",
116 | port: parseInt(${port})
117 | },
118 | bypassList: ["foobar.com"]
119 | }
120 | };
121 |
122 | chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});
123 |
124 | function callbackFn(details) {
125 | return {
126 | authCredentials: {
127 | username: "${username}",
128 | password: "${password}"
129 | }
130 | };
131 | }
132 |
133 | chrome.webRequest.onAuthRequired.addListener(
134 | callbackFn,
135 | {urls: [""]},
136 | ['blocking']
137 | );
138 | """
139 | ).substitute(
140 | host=proxy_host,
141 | port=proxy_port,
142 | username=proxy_username,
143 | password=proxy_password,
144 | scheme=scheme,
145 | )
146 |
147 | with zipfile.ZipFile(plugin_path, 'w') as zp:
148 | zp.writestr("manifest.json", manifest_json)
149 | zp.writestr("background.js", background_js)
150 |
151 | return plugin_path
152 |
153 |
154 | proxy_auth_plugin_path = create_proxy_auth_extension(
155 | proxy_host=proxyHost,
156 | proxy_port=proxyPort,
157 | proxy_username=proxyUser,
158 | proxy_password=proxyPass)
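# The zip built above is loaded into Chrome as an extension so the headless
# browser can authenticate against the Abuyun proxy without an interactive
# prompt. A minimal usage sketch (the same pattern is used in down_list1 and
# down_list2 below; the chromedriver path is the one this project assumes):
#
#   opts = webdriver.ChromeOptions()
#   opts.add_extension(proxy_auth_plugin_path)
#   driver = webdriver.Chrome(r'C:\Python27\chromedriver', chrome_options=opts)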
159 |
160 |
161 | def test():
162 | profile_dir = r"D:\MyChrome\Default"
163 | # set request headers
164 | # "Referer": "http://weixin.sogou.com"
165 | chrome_options = webdriver.ChromeOptions()
166 | chrome_options.add_argument("--user-data-dir=" + os.path.abspath(profile_dir))
167 | PROXY = "123.56.238.200:8123"
168 | # j = random.randint(0, len(proxys)-1)
169 | # proxy = proxys[j]
170 | chrome_options.add_argument('--proxy-server=%s' % PROXY)
171 | # chrome_options.add_extension('')  # add a crx extension
172 | # service_args = ['--proxy=localhost:9050', '--proxy-type=socks5', ]
173 | driver = webdriver.Chrome(r'C:\Python27\chromedriver', chrome_options=chrome_options)
174 | driver.get('http://icanhazip.com')
175 | driver.refresh()
176 | print(driver.page_source)
177 | driver.quit()
178 |
179 |
180 | class HtmlDownloader(object):
181 | def __init__(self):
182 | self._ocr = RClient(config.dama_name, config.dama_pswd, config.dama_soft_id, config.dama_soft_key)
183 | self._cache = filecache.WechatCache(config.cache_dir, 60 * 60)
184 | self._session = self._cache.get(config.cache_session_name) if self._cache.get(
185 | config.cache_session_name) else requests.session()
186 | # self.cookie = self.maintain_cookies_ph()
187 | self.agents = [
188 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
189 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
190 | "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
191 | "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
192 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
193 | "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
194 | "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
195 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
196 | "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
197 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
198 | "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
199 | "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
200 | "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
201 | "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
202 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
203 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
204 | ]
205 |
206 | def ocr4wechat(self, url):
207 | # logger.debug('vcode appear, using _ocr_for_get_gzh_article_by_url_text')
208 | timestr = str(time.time()).replace('.', '')
209 | timever = timestr[0:13] + '.' + timestr[13:17]
210 | codeurl = 'http://mp.weixin.qq.com/mp/verifycode?cert=' + timever
211 | coder = self._session.get(codeurl)
212 | if hasattr(self, '_ocr'):
213 | result = self._ocr.create(coder.content, 2040)
214 | img_code = result['Result']
215 | print(img_code)
216 | else:
217 | im = readimg(coder.content)
218 | im.show()
219 | img_code = raw_input("please input code: ")
220 | post_url = 'http://mp.weixin.qq.com/mp/verifycode'
221 | post_data = {
222 | 'cert': timever,
223 | 'input': img_code
224 | }
225 | headers = {
226 | "User-Agent": random.choice(self.agents),
227 | 'Host': 'mp.weixin.qq.com',
228 | 'Referer': url
229 | }
230 | rr = self._session.post(post_url, post_data, headers=headers)
231 | print(rr.text)
232 | # remsg = eval(rr.text)
233 | # if remsg['ret'] != 0:
234 | # logger.error('cannot verify get_gzh_article because ' + remsg['errmsg'])
235 | # raise exceptions.WechatSogouVcodeException('cannot verify wechat_code because ' + remsg['errmsg'])
236 | self._cache.set(config.cache_session_name, self._session)
237 | # logger.debug('ocr ', remsg['errmsg'])
238 |
239 | def download_list(self, url, name):
240 | '''
241 | Use urllib2 to fetch the article-list URL of a WeChat official account
242 | :param url:
243 | :param name:
244 | :return:
245 | '''
246 | urllib2.install_opener(opener)
247 | headers = {
248 | "User-Agent": random.choice(self.agents),
249 | "Referer": 'http://weixin.sogou.com/',
250 | 'Host': 'weixin.sogou.com',
251 | 'Cookie': random.choice(self.cookie)
252 | }
253 | req = urllib2.Request(url, headers=headers)
254 | # req.set_proxy(PROXY, 'http')
255 | try:
256 | response = urllib2.urlopen(req)
257 | time.sleep(2)
258 | except urllib2.URLError as e:
259 | if hasattr(e, 'reason'):
260 | # both HTTPError and URLError have a reason attribute
261 | print('We failed to reach a server.')
262 | print('Reason: %s' % e.reason)
263 | elif hasattr(e, 'code'):
264 | # only HTTPError has a code attribute
265 | print('The server couldn\'t fulfill the request.')
266 | print('Error code: %s' % e.code)
267 | with open(r'list_error.txt', 'a') as f:
268 | f.write(name.encode('utf-8'))
269 | f.write('\n')
270 | return
271 |
272 | try:
273 | a = html_parser.HtmlParser.parse_list_url(response, name)
274 | except AttributeError:
275 | with open(r'list_error.txt', 'a') as f:
276 | f.write(name.encode('utf-8'))
277 | f.write('\n')
278 | return
279 | if a is not None:
280 | time.sleep(1)
281 | return self.download(a, name, url)
282 |
283 | # headers_weixin = {
284 | # "User-Agent": random.choice(self.agents),
285 | # "Referer": 'http://weixin.sogou.com/',
286 | # 'Host': 'mp.weixin.qq.com',
287 | # }
288 | # req1 = urllib2.Request(a, headers=headers_weixin)
289 | # response1 = urllib2.urlopen(req1)
290 | # with open('c:\\a.html', 'a') as f:
291 | # f.write(response1.read())
292 |
293 | def download(self, link, name, url):
294 | """
295 | Download the article list page of the given official account
296 | :param link:
297 | :param name:
298 | :param url:
299 | :return:
300 | """
301 | dcap = dict(DesiredCapabilities.PHANTOMJS)
302 | dcap["phantomjs.page.settings.userAgent"] = (
303 | random.choice(self.agents)
304 | )
305 | dcap["takesScreenshot"] = False
306 | dcap["phantomjs.page.customHeaders.Cookie"] = random.choice(self.cookie)
307 | dcap["phantomjs.page.customHeaders.Proxy-Authorization"] = proxyAuth
308 | # dcap["phantomjs.page.settings.resourceTimeout"] = ("1000")
309 | try:
310 | driver1 = webdriver.PhantomJS(desired_capabilities=dcap, service_args=['--load-images=no',
311 | '--proxy=http://proxy.abuyun.com:9020'])
312 | except Exception as e:
313 | with open(r'list_error.txt', 'a') as f:
314 | f.write(name.encode('utf-8'))
315 | f.write('\n')
316 | print(datetime.datetime.now())
317 | print(url)
318 | print(e)
319 | else:
320 | try:
321 | driver1.set_page_load_timeout(20)
322 | driver1.get(link)
323 | # driver1.get('http://ip.chinaz.com/getip.aspx')
324 | # a = driver1.page_source
325 | b = True
326 | try:
327 | driver1.find_element_by_class_name('page_verify')
328 | except:
329 | b = False
330 |
331 | if b is True:
332 | print('page needs verify, stop the program')
333 | print('the last weixinNUM is %s\n' % name)
334 | # self.ocr4wechat(link)
335 | with open(r'list_error.txt', 'a') as f:
336 | f.write(name.encode('utf-8'))
337 | f.write('\n')
338 | # time.sleep(80)
339 | else:
340 | html = driver1.page_source
341 | return link, html
342 | except Exception as e:
343 | with open(r'list_error.txt', 'a') as f:
344 | f.write(name.encode('utf-8'))
345 | f.write('\n')
346 | print(url)
347 | print(datetime.datetime.now())
348 | print(e)
349 |
350 | finally:
351 | driver1.quit()
352 |
353 | def down_list1(self, url, name):
354 | if url is None:
355 | return None
356 | profile_dir = r"D:\MyChrome\Default"
357 | chrome_options = webdriver.ChromeOptions()
358 | chrome_options.add_argument("--user-data-dir=" + os.path.abspath(profile_dir))
359 | chrome_options.add_argument("--window-size=1920,1080")
360 | chrome_options.add_argument("--headless")
361 | chrome_options.add_extension(proxy_auth_plugin_path)
362 | try:
363 | driver = webdriver.Chrome(r'C:\Python27\chromedriver', chrome_options=chrome_options)
364 | except Exception as e:
365 | with open(r'list_error.txt', 'a') as f:
366 | f.write(name.encode('utf-8'))
367 | f.write('\n')
368 | print(datetime.datetime.now())
369 | print(url)
370 | print(e)
371 | else:
372 | driver.set_page_load_timeout(20)
373 | try:
374 | driver.get(url)
375 | except:
376 | time.sleep(2)
377 | driver.refresh()
378 | try:
379 | driver.find_element_by_id("noresult_part1_container")
380 | a = True
381 | except:
382 | a = False
383 | if a is True:
384 | with open(r'no_wechat.txt', 'a') as f:
385 | f.write(name.encode('utf-8'))
386 | f.write('\n')
387 | # the official account exists
388 | elif a is False:
389 | try:
390 | time.sleep(5)
391 | # driver.save_screenshot('pic1.png')  # screenshot of the account search page
392 | # too many proxy connections can cause this to fail
393 | button = driver.find_element_by_css_selector('a[uigs =\'account_name_0\']')
394 | link = button.get_attribute('href')
395 | return link, driver.page_source
396 | except Exception as e:
397 | with open(r'list_error.txt', 'a') as f:
398 | f.write(name.encode('utf-8'))
399 | f.write('\n')
400 | print(datetime.datetime.now())
401 | print(url)
402 | print(e)
403 | finally:
404 | try:
405 | driver.quit()
406 | except Exception:
407 | pass
408 |
409 | def down_list2(self, name, link):
410 | if link is not None:
411 | profile_dir = r"D:\MyChrome\Default"
412 | chrome_options = webdriver.ChromeOptions()
413 | chrome_options.add_argument("--user-data-dir=" + os.path.abspath(profile_dir))
414 | chrome_options.add_argument("--window-size=1920,1080")
415 | chrome_options.add_argument("--headless")
416 | chrome_options.add_extension(proxy_auth_plugin_path)
417 | try:
418 | driver1 = webdriver.Chrome(r'C:\Python27\chromedriver', chrome_options=chrome_options)
419 | except Exception as e:
420 | with open(r'list_error.txt', 'a') as f:
421 | f.write(name.encode('utf-8'))
422 | f.write('\n')
423 | print(datetime.datetime.now())
424 | print(name)
425 | print(e)
426 | else:
427 | try:
428 | driver1.set_page_load_timeout(20)
429 | driver1.get(link)
430 | time.sleep(5)
431 | driver1.save_screenshot('pic2.png')  # screenshot of the article list
432 | b = True
433 | try:
434 | driver1.find_element_by_class_name('page_verify')
435 | except:
436 | b = False
437 | if b is True:
438 | print('page needs verify, stop the program')
439 | print('the last weixinNUM is %s\n' % name)
440 | # self.ocr4wechat(link)
441 | with open(r'list_error.txt', 'a') as f:
442 | f.write(name.encode('utf-8'))
443 | f.write('\n')
444 | # time.sleep(100)
445 | # os.system('pause')
446 | else:
447 | html = driver1.page_source
448 | with open(r'wechat.txt', 'a') as f:
449 | f.write(name.encode('utf-8') + '\n')
450 | return html
451 | except Exception as e:
452 | with open(r'list_error.txt', 'a') as f:
453 | f.write(name.encode('utf-8'))
454 | f.write('\n')
455 | print(name)
456 | print(datetime.datetime.now())
457 | print(e)
458 |
459 | finally:
460 | try:
461 | driver1.quit()
462 | except Exception:
463 | pass
464 |
465 | def download_list_ph(self, url, name):
466 | '''
467 | Use PhantomJS to download the WeChat official account's article list
468 | :param url:
469 | :param name:
470 | :return:
471 | '''
472 | if url is None:
473 | return None
474 |
475 | dcap = dict(DesiredCapabilities.PHANTOMJS)
476 | dcap["phantomjs.page.settings.userAgent"] = (
477 | random.choice(self.agents)
478 | )
479 | # dcap["takesScreenshot"] = False
480 | # dcap["phantomjs.page.customHeaders.Cookie"] = random.choice(self.cookie)
481 | # dcap["phantomjs.page.customHeaders.Proxy-Authorization"] = proxyAuth
482 | dcap["phantomjs.page.settings.loadImages"] = False
483 | # dcap["phantomjs.page.settings.resourceTimeout"] = ("1000")
484 | try:
485 | driver = webdriver.PhantomJS(desired_capabilities=dcap, service_args=service_args)
486 | except Exception as e:
487 | with open(r'list_error.txt', 'a') as f:
488 | f.write(name.encode('utf-8'))
489 | f.write('\n')
490 | print(datetime.datetime.now())
491 | print(url)
492 | print(e)
493 |
494 | else:
495 | driver.set_page_load_timeout(20)
496 | try:
497 | driver.get(url)
498 | except:
499 | time.sleep(2)
500 | driver.refresh()
501 | try:
502 | driver.find_element_by_id("noresult_part1_container")
503 | a = True
504 | except:
505 | a = False
506 | if a is True:
507 | with open(r'no_wechat.txt', 'a') as f:
508 | f.write(name.encode('utf-8'))
509 | f.write('\n')
510 | # the official account exists
511 | elif a is False:
512 | try:
513 | time.sleep(5)
514 | # driver.save_screenshot('pic1.png')  # screenshot of the account search page
515 | # too many proxy connections can cause this to fail
516 | button = driver.find_element_by_css_selector('a[uigs =\'account_name_0\']')
517 | link = button.get_attribute('href')
518 | return link, driver.page_source
519 | except Exception as e:
520 | with open(r'list_error.txt', 'a') as f:
521 | f.write(name.encode('utf-8'))
522 | f.write('\n')
523 | print(datetime.datetime.now())
524 | print(url)
525 | print(e)
526 | finally:
527 | try:
528 | driver.quit()
529 | except Exception:
530 | pass
531 |
532 |
533 | # Fetch the official account's article list
534 |
535 | def download_list_ph_2(self, name, link):
536 | if link is not None:
537 | dcap = dict(DesiredCapabilities.PHANTOMJS)
538 | dcap["phantomjs.page.settings.userAgent"] = (
539 | random.choice(self.agents)
540 | )
541 | # dcap["takesScreenshot"] = False
542 | # dcap["phantomjs.page.customHeaders.Cookie"] = random.choice(self.cookie)
543 | # dcap["phantomjs.page.customHeaders.Proxy-Authorization"] = proxyAuth
544 | try:
545 | driver1 = webdriver.PhantomJS(desired_capabilities=dcap, service_args=service_args)
546 | except Exception as e:
547 | with open(r'list_error.txt', 'a') as f:
548 | f.write(name.encode('utf-8'))
549 | f.write('\n')
550 | print(datetime.datetime.now())
551 | print(name)
552 | print(e)
553 | else:
554 | try:
555 | driver1.set_page_load_timeout(20)
556 | driver1.get(link)
557 | time.sleep(5)
558 | driver1.save_screenshot('pic2.png')  # screenshot of the article list
559 | b = True
560 | # while b is True:
561 | # try:
562 | # driver1.find_element_by_class_name('page_verify')
563 | # except:
564 | # b = False
565 | # if b is True:
566 | # driver1.refresh()
567 | # time.sleep(2)
568 | try:
569 | driver1.find_element_by_class_name('page_verify')
570 | except:
571 | b = False
572 | if b is True:
573 | print('page needs verify, stop the program')
574 | print('the last weixinNUM is %s\n' % name)
575 | # self.ocr4wechat(link)
576 | with open(r'list_error.txt', 'a') as f:
577 | f.write(name.encode('utf-8'))
578 | f.write('\n')
579 | # time.sleep(100)
580 | # os.system('pause')
581 | else:
582 | html = driver1.page_source
583 | with open(r'wechat.txt', 'a') as f:
584 | f.write(name.encode('utf-8') + '\n')
585 | return html
586 | except Exception as e:
587 | with open(r'list_error.txt', 'a') as f:
588 | f.write(name.encode('utf-8'))
589 | f.write('\n')
590 | print(name)
591 | print(datetime.datetime.now())
592 | print(e)
593 |
594 | finally:
595 | try:
596 | driver1.quit()
597 | except Exception:
598 | pass
599 |
600 | def download_list4key(self, link, cookie):
601 | links = []
602 | dcap = dict(DesiredCapabilities.CHROME)
603 | if link is not None:
604 | try:
605 | driver1 = webdriver.Chrome()
606 | driver1.delete_all_cookies()
607 | driver1.add_cookie(cookie)
608 | except Exception as e:
609 | print(datetime.datetime.now())
610 | print(e)
611 | driver1.set_page_load_timeout(20)
612 | driver1.get(link)
613 | time.sleep(5)
614 | html = driver1.page_source
615 | soup = BeautifulSoup(html, 'html.parser', )
616 | articles = soup.find_all('h3', )
617 | for article in articles:
618 | links.append(article.find('a').get('href'))
619 | driver1.close()
620 | return links
621 | def demo(self):
622 | links = []
623 | driver = webdriver.Chrome()
624 | driver.get("http://weixin.sogou.com/")
625 | time.sleep(5)
626 | driver.find_element_by_xpath('//*[@id="loginBtn"]').click()
627 | time.sleep(10)
628 | driver.find_element_by_class_name('query').send_keys(u'中兴跳楼')
629 | driver.find_element_by_class_name('swz').click()
630 | c = 0
631 | while True:
632 | if c == 40:
633 | break
634 | time.sleep(3)
635 | html = driver.page_source
636 | soup = BeautifulSoup(html, 'html.parser', )
637 | articles = soup.find_all('h3', )
638 | for article in articles:
639 | links.append(article.find('a').get('href'))
640 | # for link in links:
641 | # data = None
642 | # while data is None:
643 | # html = self.download_articles_ph(link)
644 | # data = parser.parse_article(html)  # parse out the article fields
645 | # f.write(data[1] + '#' + data[0] + '#' + data[2] + '#' + data[3])
646 | # f.write('\n')
647 | # f.flush()
648 | driver.find_element_by_class_name('np').click()
649 | c += 1
650 | return links
651 |
652 |
653 |
654 | def download_list_chrome(self, url, name):
655 | if url is None:
656 | return None
657 | profile_dir = r"D:\MyChrome\Default"
658 | # "Referer": "http://weixin.sogou.com"
659 | chrome_options = webdriver.ChromeOptions()
660 | chrome_options.add_argument("--user-data-dir=" + os.path.abspath(profile_dir))
661 | chrome_options.add_argument('--proxy-server=%s' % PROXY)
662 | # chrome_options.add_extension('')  # add a crx extension
663 | # service_args = ['--proxy=localhost:9050', '--proxy-type=socks5', ]
664 | try:
665 | driver = webdriver.Chrome(r'C:\Python27\chromedriver', chrome_options=chrome_options)
666 | except Exception as e:
667 | with open(r'list_error.txt', 'a') as f:
668 | f.write(name.encode('utf-8'))
669 | f.write('\n')
670 | print(datetime.datetime.now())
671 | print(url)
672 | print(e)
673 | else:
674 | try:
675 | driver.set_page_load_timeout(20)
676 | try:
677 | driver.get('http://weixin.sogou.com/')
678 | except:
679 | time.sleep(3)
680 | driver.refresh()
681 | # driver.implicitly_wait(5)
682 | # can trigger "too many requests"
683 | driver.delete_all_cookies()
684 | i = random.randint(0, 4)
685 | for cookie in self.cookie[i]:
686 | driver.add_cookie(cookie)
687 | time.sleep(1)
688 | try:
689 | driver.get(url)
690 | except:
691 | time.sleep(2)
692 | driver.refresh()
693 | time.sleep(2)
694 | # check whether this official account exists
695 | try:
696 | driver.find_element_by_id("noresult_part1_container")
697 | a = True
698 | except:
699 | a = False
700 | if a is True:
701 | with open(r'no_wechat.txt', 'a') as f:
702 | f.write(name.encode('utf-8'))
703 | f.write('\n')
704 | elif a is False:
705 | # deal with "too many connections"
706 | try:
707 | WebDriverWait(driver, 5).until(
708 | EC.presence_of_element_located((By.ID, "sogou_vr_11002301_box_0"))
709 | )
710 | except:
711 | time.sleep(2)
712 | driver.refresh()
713 | now_handle = driver.current_window_handle
714 | driver.find_element_by_id('sogou_vr_11002301_box_0').click()
715 | # a verification page may appear here
716 | time.sleep(2)
717 | all_handles = driver.window_handles
718 | for handle in all_handles:
719 | if handle != now_handle:
720 | driver.switch_to.window(handle)  # switch to the new window
721 | # check whether the page is a verification page
722 | # b = True
723 | # while b is True:
724 | # try:
725 | # driver.find_element_by_class_name("page_verify")
726 | # b = True
727 | # driver.refresh()
728 | # time.sleep(2)
729 | # except:
730 | # b = False
731 | #
732 | # # wait for the list to appear
733 | # try:
734 | # WebDriverWait(driver, 5).until(
735 | # EC.presence_of_element_located((By.CLASS_NAME, "weui_msg_card_hd"))
736 | # )
737 | # except:
738 | # driver.refresh()
739 | # time.sleep(2)
740 | # html = driver.page_source  # page source after dynamic loading
741 | wechat_url = driver.current_url
742 | i = random.randint(0, 4)
743 | dcap = dict(DesiredCapabilities.PHANTOMJS)
744 | dcap["phantomjs.page.settings.userAgent"] = (
745 | UA
746 | )
747 | dcap["takesScreenshot"] = (False)
748 | dcap["phantomjs.page.customHeaders.Cookie"] = self.cookie[i]
749 | try:
750 | driver1 = webdriver.PhantomJS(desired_capabilities=dcap, service_args=['--load-images=no'])
751 | except Exception as e:
752 | with open(r'list_error.txt', 'a') as f:
753 | f.write(name.encode('utf-8'))
754 | f.write('\n')
755 | print(datetime.datetime.now())
756 | print(url)
757 | print(e)
758 | else:
759 | try:
760 | driver1.set_page_load_timeout(20)
761 | driver1.get(wechat_url)
762 | html = driver1.page_source
763 | return wechat_url, html
764 | # except Exception as e:
765 | # with open(r'list_error.txt', 'a') as f:
766 | # f.write(name.encode('utf-8'))
767 | # f.write('\n')
768 | # print(datetime.datetime.now())
769 | # print(url)
770 | # print(e)
771 | finally:
772 | driver1.quit()
773 | # return wechat_url, html
774 | except Exception as e:
775 | with open(r'list_error.txt', 'a') as f:
776 | f.write(name.encode('utf-8'))
777 | f.write('\n')
778 | print(url)
779 | print(datetime.datetime.now())
780 | print(e)
781 | finally:
782 | driver.quit()
783 | # if a is False:
784 | # i = random.randint(0, 4)
785 | # dcap = dict(DesiredCapabilities.PHANTOMJS)
786 | # dcap["phantomjs.page.settings.userAgent"] = (
787 | # UA
788 | # )
789 | # dcap["takesScreenshot"] = (False)
790 | # dcap["phantomjs.page.customHeaders.Cookie"] = self.cookie[i]
791 | # try:
792 | # driver1 = webdriver.PhantomJS(desired_capabilities=dcap, service_args=['--load-images=no'])
793 | # except Exception as e:
794 | # print(datetime.datetime.now())
795 | # print(url)
796 | # print(e)
797 | # else:
798 | # try:
799 | # driver1.set_page_load_timeout(20)
800 | # driver1.get(wechat_url)
801 | # html = driver1.page_source
802 | # return wechat_url, html
803 | # except Exception as e:
804 | # print(datetime.datetime.now())
805 | # print(url)
806 | # print(e)
807 | # finally:
808 | # driver1.quit()
809 |
810 |
811 |
812 | # response = urllib2.urlopen(url)
813 | # if response.getcode() != 200:
814 | # return None
815 | # return response.read()
816 |
817 | def download_articles_ph(self, url):
818 | '''
819 | Use PhantomJS to download an article
820 | :param url: article URL
821 | :return:
822 | '''
823 | if url is None:
824 | return None
825 | dcap = dict(DesiredCapabilities.PHANTOMJS)
826 | dcap["phantomjs.page.settings.userAgent"] = (
827 | UA
828 | )
829 | dcap["takesScreenshot"] = (False)
830 | try:
831 | driver = webdriver.PhantomJS(desired_capabilities=dcap,
832 | service_args=['--load-images=no', ])
833 | except Exception as e:
834 | print(datetime.datetime.now())
835 | print(url)
836 | print(e)
837 | else:
838 | try:
839 | driver.set_page_load_timeout(30)
840 | driver.get(url)
841 | time.sleep(5)
842 | # driver.implicitly_wait(2)
843 | html = driver.page_source
844 | return html
845 | except:
846 | print(datetime.datetime.now())
847 | print(url)
848 | finally:
849 | try:
850 | driver.quit()
851 | except Exception:
852 | pass
853 |
854 | def download_articles_chrome(self, url):
855 | # service_args = ['--load-images=no', ]
856 | profile_dir = r"D:\MyChrome\Default"
857 | chrome_options = webdriver.ChromeOptions()
858 | chrome_options.add_argument("--user-data-dir=" + os.path.abspath(profile_dir))
859 | # PROXY = "123.56.238.200:8123"
860 | # # j = random.randint(0, len(proxys)-1)
861 | # # proxy = proxys[j]
862 | # chrome_options.add_argument('--proxy-server=%s' % PROXY)
863 | # chrome_options.add_extension('')  # add a crx extension
864 | # service_args = ['--proxy=localhost:9050', '--proxy-type=socks5', '--load-images=no', ]
865 | try:
866 | driver = webdriver.Chrome(r'C:\Python27\chromedriver', chrome_options=chrome_options)
867 | except Exception as e:
868 | print(datetime.datetime.now())
869 | print(url)
870 | print(e)
871 | else:
872 |
873 | try:
874 | driver.set_page_load_timeout(30)
875 | driver.get(url)
876 | driver.implicitly_wait(2)
877 | html = driver.page_source
878 | return html
879 | except:
880 | print(datetime.datetime.now())
881 | print(url)
882 | # selenium.common.exceptions.TimeoutException:
883 | # return self.download_acticles(url)
884 | return None
885 | finally:
886 | driver.quit()
887 |
888 | def maintain_cookies(self):
889 | cookie = []
890 | # collect 5 sets of cookies
891 | for i in range(5):
892 | driver = webdriver.Chrome(r'C:\Python27\chromedriver')
893 | driver.get("http://weixin.sogou.com/")
894 | # grab the cookie info
895 | cookie.append(driver.get_cookies())
896 | print(driver.get_cookies())
897 | driver.quit()
898 |
899 | return cookie
900 |
901 | def maintain_cookies_ph(self):
902 | dcap = dict(DesiredCapabilities.PHANTOMJS)
903 | dcap["phantomjs.page.settings.userAgent"] = UA
904 | cookie = []
905 | # collect 10 sets of cookies
906 | for i in range(10):
907 | driver = webdriver.PhantomJS(desired_capabilities=dcap,
908 | service_args=['--load-images=no', ])
909 | driver.get("http://weixin.sogou.com/")
910 | # grab the cookie info
911 | cookie.append(driver.get_cookies())
912 | # print(driver.get_cookies())
913 | driver.quit()
914 | return cookie
915 |
916 | def get_cookies(self):
917 | driver = webdriver.Chrome()
918 | driver.get("http://weixin.sogou.com/")
919 | time.sleep(5)
920 | driver.find_element_by_xpath('//*[@id="loginBtn"]').click()
921 | time.sleep(10)
922 |
923 | cookies = driver.get_cookies()
924 | cookie = {}
925 | for items in cookies:
926 | cookie[items.get('name')] = items.get('value')
927 | return cookie
928 | b = html_parser.HtmlParser()  # shared parser used by task()
929 | f = open('category2.csv', 'a')  # shared output handle used by task()
930 | def task(self, link):
931 | data = None
932 | while data is None:
933 | html = self.download_articles_ph(link)
934 | data = self.b.parse_article(html)
935 | self.f.write(data[1] + '#' + data[0] + '#' + data[2] + '#' + data[3])
936 | self.f.write('\n')
937 |
938 |
939 |
940 | if __name__ == "__main__":
941 |
942 | a = HtmlDownloader()
943 | b = html_parser.HtmlParser()
944 | links = a.demo()
945 | from multiprocessing.dummy import Pool as ThreadPool
946 | pool = ThreadPool(6)
947 | pool.map(a.task, links)
948 | pool.close()
949 | pool.join()
950 | # # a.ocr4wechat('http://mp.weixin.qq.com/s?timestamp=1478687270&src=3&ver=1&signature=5RtOXxZ16P0x8hvN7sARkESooWCRi1F-'
951 | # 'AcdjyV1phiMF7EC8fCYB1STlGWMUeoUQtSoEFQC26jd-X-*3GiGa-ZwBJQBld54xrGpEc81g*kjGncNNXLgRkpw5WIoCO5T-KbO'
952 | # 'xjsRjYFvrvDaynu1I7vvIE9itjIEzCa77YZuMMyM=')
953 | # a.download_list_chrome("http://weixin.sogou.com/weixin?type=%d&query=%s" % (1, 'renmin'), u'renmin')
954 |
--------------------------------------------------------------------------------