├── keywords_data.csv ├── tesseract_ocr ├── 0_0.png ├── 0_1.png ├── __init__.py └── tesseract_ocr_module.py ├── baidu_index ├── __init__.pyc ├── __init__.py └── baidu_web_driver │ ├── __init__.py │ ├── baidu_web_driver_pool.py │ ├── baidu_login_module.py │ └── baidu_source.py ├── base ├── __init__.py └── send_email.py ├── dama ├── __init__.py └── damatu │ ├── __init__.py │ └── damatuWeb.py ├── hbase ├── __init__.py └── hbase_module.py ├── kafka └── __init__.py ├── model ├── __init__.py ├── ali │ ├── __init__.py │ └── ali_index_model.py └── baidu │ ├── __init__.py │ └── baidu_index_model.py ├── ali_index ├── __init__.py └── ali_web_driver │ ├── __init__.py │ ├── ali_login.py │ ├── ali_web_driver_pool.py │ └── ali_source.py ├── setting.py └── main.py /keywords_data.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/da2vin/Spider_index/HEAD/keywords_data.csv -------------------------------------------------------------------------------- /tesseract_ocr/0_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/da2vin/Spider_index/HEAD/tesseract_ocr/0_0.png -------------------------------------------------------------------------------- /tesseract_ocr/0_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/da2vin/Spider_index/HEAD/tesseract_ocr/0_1.png -------------------------------------------------------------------------------- /baidu_index/__init__.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/da2vin/Spider_index/HEAD/baidu_index/__init__.pyc -------------------------------------------------------------------------------- /base/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 
| reload(sys) 6 | sys.setdefaultencoding('utf-8') -------------------------------------------------------------------------------- /dama/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | reload(sys) 6 | sys.setdefaultencoding('utf-8') -------------------------------------------------------------------------------- /hbase/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | reload(sys) 6 | sys.setdefaultencoding('utf-8') -------------------------------------------------------------------------------- /kafka/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | reload(sys) 6 | sys.setdefaultencoding('utf-8') -------------------------------------------------------------------------------- /model/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | reload(sys) 6 | sys.setdefaultencoding('utf-8') -------------------------------------------------------------------------------- /ali_index/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | reload(sys) 6 | sys.setdefaultencoding('utf-8') -------------------------------------------------------------------------------- /model/ali/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | reload(sys) 6 | sys.setdefaultencoding('utf-8') -------------------------------------------------------------------------------- 
/baidu_index/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | reload(sys) 6 | sys.setdefaultencoding('utf-8') -------------------------------------------------------------------------------- /dama/damatu/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | reload(sys) 6 | sys.setdefaultencoding('utf-8') -------------------------------------------------------------------------------- /model/baidu/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | reload(sys) 6 | sys.setdefaultencoding('utf-8') -------------------------------------------------------------------------------- /tesseract_ocr/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | reload(sys) 6 | sys.setdefaultencoding('utf-8') -------------------------------------------------------------------------------- /ali_index/ali_web_driver/__init__.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | reload(sys) 6 | sys.setdefaultencoding('utf-8') -------------------------------------------------------------------------------- /ali_index/ali_web_driver/ali_login.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | 5 | reload(sys) 6 | sys.setdefaultencoding('utf-8') -------------------------------------------------------------------------------- /baidu_index/baidu_web_driver/__init__.py: 
def _format_addr(s):
    """Encode a ``Display Name <address>`` string for use in a mail header.

    The display name is RFC 2047 encoded as UTF-8 via ``Header``; the
    address part is passed through, encoded to UTF-8 bytes first when it
    is a Python 2 ``unicode`` object.
    """
    name, addr = parseaddr(s)
    if isinstance(addr, unicode):
        addr = addr.encode('utf-8')
    return formataddr((Header(name, 'utf-8').encode(), addr))


def sendEmail(text):
    """Send ``text`` as a plain-text UTF-8 alert mail.

    Sender, recipient, password and SMTP host are read from ``setting``
    (``EMAIL_ADDR``, ``EMAIL_TO_ADDR``, ``EMAIL_PASSWORD``,
    ``EMAIL_SMTP_SERVER``); the connection uses port 25.

    This is deliberately best-effort: any ordinary failure is printed with
    its traceback and not re-raised. Unlike a bare ``except:``,
    KeyboardInterrupt and SystemExit are allowed to propagate.
    """
    try:
        msg = MIMEText(text, 'plain', 'utf-8')
        msg['From'] = _format_addr(u'指数爬虫 <%s>' % setting.EMAIL_ADDR)
        msg['To'] = _format_addr(u'DK <%s>' % setting.EMAIL_TO_ADDR)
        msg['Subject'] = Header(u'指数爬虫异常', 'utf-8').encode()

        server = smtplib.SMTP(setting.EMAIL_SMTP_SERVER, 25)
        try:
            server.set_debuglevel(1)
            server.login(setting.EMAIL_ADDR, setting.EMAIL_PASSWORD)
            server.sendmail(setting.EMAIL_ADDR, [setting.EMAIL_TO_ADDR], msg.as_string())
            server.quit()
        except Exception:
            # Drop the socket without sending QUIT so the original error is
            # the one that gets reported below, then let it propagate.
            server.close()
            raise
    except Exception:
        print('发送邮件失败:' + traceback.format_exc())
def _get_base_ali_driver():
    """Start one PhantomJS web driver for the Ali index pages.

    The executable path comes from ``setting.PHANTOMJS_PATH`` and the
    capabilities from the module-level ``dcap``; ``service_args`` is only
    passed when ``setting.PHANTOMJS_SERVICE`` is set.
    """
    options = dict(executable_path=setting.PHANTOMJS_PATH,
                   desired_capabilities=dcap)
    if setting.PHANTOMJS_SERVICE:
        options['service_args'] = setting.PHANTOMJS_SERVICE
    return webdriver.PhantomJS(**options)


def get_ali_web_driver_pool(num):
    """Return a ``Queue.Queue`` filled with ``num`` freshly started drivers."""
    driver_queue = Queue.Queue()
    for _ in range(num):
        driver_queue.put(_get_base_ali_driver())
    return driver_queue
def _get_base_baidu_driver(cookies):
    """Start one PhantomJS web driver and add every cookie in ``cookies``.

    The executable path comes from ``setting.PHANTOMJS_PATH`` and the
    capabilities from the module-level ``dcap``; ``service_args`` is only
    passed when ``setting.PHANTOMJS_SERVICE`` is set.
    """
    options = dict(executable_path=setting.PHANTOMJS_PATH,
                   desired_capabilities=dcap)
    if setting.PHANTOMJS_SERVICE:
        options['service_args'] = setting.PHANTOMJS_SERVICE
    web = webdriver.PhantomJS(**options)
    for cookie in cookies:
        web.add_cookie(cookie)
    return web


def get_baidu_web_driver_pool(num, username, password):
    """Build a pool of ``num`` web drivers that carry the Baidu login cookies.

    Cookies saved in ``cookies.pkl`` are reused when present; otherwise a
    login is performed with ``username``/``password`` and the resulting
    cookies are saved for the next run.

    Returns:
        A ``Queue.Queue`` of drivers, or ``None`` when no cookies could be
        obtained.
    """
    cookies = get_cookie()
    if not cookies:
        cookies = baidu_login_module.login_baidu(username, password)
        if not cookies:
            # Login failed: do not cache the empty result and do not start
            # any driver.
            return None
        save_cookie(cookies)

    driver_queue = Queue.Queue()
    for _ in range(num):
        driver_queue.put(_get_base_baidu_driver(cookies))
    return driver_queue


def save_cookie(cookies):
    """Pickle ``cookies`` into ``cookies.pkl`` in the current directory."""
    with open('cookies.pkl', 'wb') as output:
        pickle.dump(cookies, output)


def get_cookie():
    """Load the cookies saved by ``save_cookie``.

    Returns:
        The unpickled object, or ``None`` when ``cookies.pkl`` does not
        exist or cannot be unpickled (for example a truncated file), so
        that the caller falls back to a fresh login.
    """
    if not os.path.exists('cookies.pkl'):
        return None
    with open('cookies.pkl', 'rb') as cookie_file:
        try:
            # NOTE: pickle can execute arbitrary code while loading; this
            # file must only ever be written by save_cookie().
            return pickle.load(cookie_file)
        except (EOFError, pickle.UnpicklingError):
            return None
connection.close() 42 | 43 | 44 | def write_csv(table_name, file_name, key_filter='2016'): 45 | connection = happybase.Connection(setting.HBASE_HOST, port=setting.HBASE_PORT) 46 | table = connection.table(table_name) 47 | items = table.scan(filter="RowFilter(=,\'substring:%s\')" % (key_filter,)) 48 | 49 | with open(file_name, 'wb') as csvfile: 50 | csvfile.write(codecs.BOM_UTF8) 51 | spamwriter = csv.writer(csvfile, dialect='excel') 52 | i = 0 53 | for item in items: 54 | if i == 0: 55 | temp_dict = dict(item[1]).keys() 56 | temp_dict.append('key') 57 | spamwriter.writerow(temp_dict) 58 | temp_dict = dict(item[1]).values() 59 | temp_dict.append(item[0]) 60 | spamwriter.writerow(temp_dict) 61 | i += 1 62 | 63 | connection.close() 64 | 65 | # write_csv('index', 'data.csv', '201610') 66 | 67 | # scan('index') 68 | 69 | # drop_table('index') 70 | # create_table('index', {'fam_exponent_info': dict(max_versions=31), 71 | # 'fam_baidu': dict(max_versions=31), 72 | # 'fam_ali': dict(max_versions=31), 73 | # }) 74 | -------------------------------------------------------------------------------- /tesseract_ocr/tesseract_ocr_module.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from PIL import Image 4 | import pytesseract 5 | import threading 6 | 7 | mutex = threading.Lock() 8 | 9 | 10 | def get_vcode(path): 11 | with Image.open(path) as image: 12 | mutex.acquire(1) 13 | vcode = pytesseract.image_to_string(image, lang='numfont') 14 | mutex.release() 15 | return vcode.replace(',', '').replace('\n', '') 16 | 17 | 18 | def get_vcode_by_img_0(img): 19 | mutex.acquire(1) 20 | vcode = pytesseract.image_to_string(img, lang='numfont') 21 | if vcode == '': 22 | img = merge_thumb_0(img) 23 | vcode = pytesseract.image_to_string(img, lang='numfont') 24 | if vcode == '00': 25 | vcode = '0' 26 | else: 27 | vcode = vcode.strip('0') 28 | mutex.release() 29 | return vcode.replace(',', 
'').replace('\n', '') 30 | 31 | 32 | def get_vcode_by_img_1(img): 33 | mutex.acquire(1) 34 | vcode = pytesseract.image_to_string(img, lang='numfont') 35 | if vcode == '': 36 | img = merge_thumb_1(img) 37 | vcode = pytesseract.image_to_string(img, lang='numfont') 38 | if vcode == '00': 39 | vcode = '0' 40 | else: 41 | vcode = vcode.strip('0') 42 | mutex.release() 43 | return vcode.replace(',', '').replace('\n', '') 44 | 45 | ''' 46 | 个位数图片无法识别,另外融合一个图片来识别,此为黑底图片 47 | ''' 48 | def merge_thumb_0(image_need_merge): 49 | image_0 = Image.open('tesseract_ocr/0_0.png') 50 | size_need_merge = image_need_merge.size 51 | size_0 = image_0.size 52 | 53 | merge_image = Image.new('RGBA', (size_need_merge[0] + size_0[0], size_need_merge[1])) 54 | merge_image.paste(image_0, (0, 0)) 55 | merge_image.paste(image_need_merge, (size_0[0], 0)) 56 | 57 | # merge_image.save('pic_temp/merged.png') 58 | return merge_image 59 | 60 | ''' 61 | 个位数图片无法识别,另外融合一个图片来识别,此为白底图片 62 | ''' 63 | def merge_thumb_1(image_need_merge): 64 | image_0 = Image.open('tesseract_ocr/0_1.png') 65 | size_need_merge = image_need_merge.size 66 | size_0 = image_0.size 67 | 68 | merge_image = Image.new('RGBA', (size_need_merge[0] + size_0[0], size_need_merge[1])) 69 | merge_image.paste(image_need_merge, (0, 0)) 70 | merge_image.paste(image_0, (size_need_merge[0], 0)) 71 | 72 | # merge_image.save('pic_temp/merged.png') 73 | return merge_image 74 | 75 | # import cv 76 | # def white_and_black(pic_name): 77 | # image = cv.LoadImage(pic_name, 0) 78 | # size = (image.width, image.height) 79 | # iTmp = cv.CreateImage(size, image.depth, image.nChannels) 80 | # for i in range(image.height): 81 | # for j in range(image.width): 82 | # if image[i, j] < 100: 83 | # iTmp[i, j] = 255 84 | # else: 85 | # iTmp[i, j] = 0 86 | # 87 | # cv.SaveImage(pic_name, iTmp) 88 | 89 | # get_vcode('merged.png') 90 | # merge_thumb('111.png') 91 | -------------------------------------------------------------------------------- 
/baidu_index/baidu_web_driver/baidu_login_module.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | from selenium import webdriver 4 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities 5 | import time 6 | from PIL import Image 7 | from dama.damatu.damatuWeb import dmt 8 | import setting 9 | from selenium.webdriver.common.keys import Keys 10 | 11 | dcap = dict(DesiredCapabilities.PHANTOMJS) 12 | dcap["phantomjs.page.settings.resourceTimeout"] = 10 13 | dcap["phantomjs.page.settings.loadImages"] = True 14 | dcap["phantomjs.page.settings.userAgent"] = (setting.USER_AGENT) 15 | 16 | 17 | def login_baidu(username, password): 18 | if setting.PHANTOMJS_SERVICE: 19 | web = webdriver.PhantomJS(service_args=setting.PHANTOMJS_SERVICE, executable_path=setting.PHANTOMJS_PATH, 20 | desired_capabilities=dcap) 21 | else: 22 | web = webdriver.PhantomJS(executable_path=setting.PHANTOMJS_PATH, desired_capabilities=dcap) 23 | 24 | web.get('https://passport.baidu.com/v2/?login') 25 | 26 | cookies = [] 27 | 28 | element = web.find_element_by_id('TANGRAM__PSP_3__userName') 29 | element.clear() 30 | element.send_keys(username) 31 | 32 | element = web.find_element_by_id('TANGRAM__PSP_3__password') 33 | element.clear() 34 | element.send_keys(password) 35 | 36 | element = web.find_element_by_id('TANGRAM__PSP_3__submit') 37 | element.click() 38 | time.sleep(3) 39 | 40 | while True: 41 | if '帐号设置' in web.find_element_by_css_selector('title').get_attribute('innerText'): 42 | print '登录成功' 43 | cookies = web.get_cookies() 44 | break 45 | errorMsg = web.find_element_by_id('TANGRAM__PSP_3__error').get_attribute('innerText') 46 | if errorMsg == '请输入验证码': 47 | print errorMsg 48 | authcode = _get_authcode(web) 49 | element = web.find_element_by_id('TANGRAM__PSP_3__verifyCode') 50 | element.clear() 51 | element.send_keys(authcode) 52 | 53 | element = 
web.find_element_by_id('TANGRAM__PSP_3__submit') 54 | element.click() 55 | time.sleep(3) 56 | elif errorMsg == '您输入的验证码有误': 57 | print errorMsg 58 | element = web.find_element_by_id('TANGRAM__PSP_3__verifyCodeImg') 59 | element.click() 60 | time.sleep(1) 61 | authcode = _get_authcode(web) 62 | element = web.find_element_by_id('TANGRAM__PSP_3__verifyCode') 63 | element.clear() 64 | element.send_keys(authcode) 65 | element.send_keys(Keys.ENTER) 66 | 67 | # element = web.find_element_by_id('TANGRAM__PSP_3__submit') 68 | # element.click() 69 | time.sleep(3) 70 | # web.save_screenshot('screen_baidu.png') 71 | else: 72 | print errorMsg 73 | cookies = None 74 | break 75 | 76 | web.close() 77 | return cookies 78 | 79 | 80 | def _get_authcode(web): 81 | web.save_screenshot('authcode_baidu.png') 82 | element = web.find_element_by_id('TANGRAM__PSP_3__verifyCodeImgParent') 83 | 84 | left = 800 85 | top = 352 86 | right = left + element.size['width'] 87 | bottom = top + element.size['height'] 88 | 89 | im = Image.open('authcode_baidu.png') 90 | im = im.crop((left, top, right, bottom)) 91 | im.save('authcode_baidu.png') 92 | 93 | data = _get_bytes('authcode_baidu.png') 94 | result = dmt.decode(data, 71) 95 | return result 96 | 97 | 98 | def _get_bytes(path): 99 | list_data = [] 100 | f = open(path, 'rb') 101 | f.seek(0, 0) 102 | while True: 103 | t_byte = f.read(1) 104 | if len(t_byte) == 0: 105 | break 106 | else: 107 | list_data.append(ord(t_byte)) 108 | list_data = bytearray(list_data) 109 | return list_data 110 | -------------------------------------------------------------------------------- /model/ali/ali_index_model.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import sys 4 | import setting 5 | 6 | reload(sys) 7 | sys.setdefaultencoding('utf-8') 8 | 9 | 10 | def get_ali_model_by_area(area_code, base_model): 11 | ali_model = dict() 12 | ali_model[setting.HBASE_ALI_FAM + ':' + 
area_code + '_' + 'ali_1688buy_index'] = base_model['ali_1688buy_index'] 13 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_taobaobuy_index'] = base_model['ali_taobaobuy_index'] 14 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_1688sup_index'] = base_model['ali_1688sup_index'] 15 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_taobao_needsfore'] = base_model['ali_taobao_needsfore'] 16 | 17 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_newbuyer'] = base_model['ali_newbuyer'] 18 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_buyer_puchaseQty'] = base_model['ali_buyer_puchaseQty'] 19 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_buyer_taobao'] = base_model['ali_buyer_taobao'] 20 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_buyer_taobao_grade'] = base_model['ali_buyer_taobao_grade'] 21 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_buyer_unitprice'] = base_model['ali_buyer_unitprice'] 22 | return ali_model 23 | 24 | def get_ali_model(base_model): 25 | ali_model = dict() 26 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_1688buy_index'] = base_model['ali_1688buy_index'] 27 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_taobaobuy_index'] = base_model['ali_taobaobuy_index'] 28 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_1688sup_index'] = base_model['ali_1688sup_index'] 29 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_taobao_needsfore'] = base_model['ali_taobao_needsfore'] 30 | 31 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_newbuyer'] = base_model['ali_newbuyer'] 32 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_buyer_puchaseQty'] = base_model['ali_buyer_puchaseQty'] 33 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_buyer_taobao'] = base_model['ali_buyer_taobao'] 34 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_buyer_taobao_grade'] = base_model['ali_buyer_taobao_grade'] 35 | ali_model[setting.HBASE_ALI_FAM + ':' + 
'ali_buyer_unitprice'] = base_model['ali_buyer_unitprice'] 36 | return ali_model 37 | 38 | def get_null_ali_model_by_area(area_code): 39 | ali_model = dict() 40 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_1688buy_index'] = '' 41 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_taobaobuy_index'] = '' 42 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_1688sup_index'] = '' 43 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_taobao_needsfore'] = '' 44 | 45 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_newbuyer'] = '' 46 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_buyer_puchaseQty'] = '' 47 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_buyer_taobao'] = '' 48 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_buyer_taobao_grade'] = '' 49 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_buyer_unitprice'] = '' 50 | return ali_model 51 | 52 | def get_null_ali_model(): 53 | ali_model = dict() 54 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_1688buy_index'] = '' 55 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_taobaobuy_index'] = '' 56 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_1688sup_index'] = '' 57 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_taobao_needsfore'] = '' 58 | 59 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_newbuyer'] = '' 60 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_buyer_puchaseQty'] = '' 61 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_buyer_taobao'] = '' 62 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_buyer_taobao_grade'] = '' 63 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_buyer_unitprice'] = '' 64 | return ali_model 65 | -------------------------------------------------------------------------------- /dama/damatu/damatuWeb.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import hashlib 4 | import urllib 5 | 
def md5str(str):
    """Return the hex MD5 digest of a text string, encoded as UTF-8 first."""
    m = hashlib.md5(str.encode(encoding="utf-8"))
    return m.hexdigest()


def md5(byte):
    """Return the hex MD5 digest of a bytes-like object."""
    return hashlib.md5(byte).hexdigest()


class DamatuApi(object):
    """Client for the dama2 captcha-solving HTTP API.

    Every request carries the application id, the user name, a password
    hash (``getPwd``) and a signature (``getSign``); all of them are
    computed from this instance's own credentials.
    """

    ID = '43311'
    KEY = 'd191c0fd4d6f1957067350f171409441'
    HOST = 'http://api.dama2.com:7766/app/'

    def __init__(self, username, password):
        self.username = username
        self.password = password

    def getSign(self, param=b''):
        """Return the first 8 hex chars of md5(KEY + username + ``param``)."""
        return (md5(bytearray(self.KEY, encoding="utf8") + bytearray(self.username, encoding="utf8") + param))[:8]

    def getPwd(self):
        """Return the password hash: md5(KEY + md5(md5(username) + md5(password)))."""
        return md5str(self.KEY + md5str(md5str(self.username) + md5str(self.password)))

    def post(self, path, params=None):
        """POST ``params`` url-encoded to ``HOST + path`` and return the raw body."""
        data = urllib.urlencode(params or {}).encode('utf-8')
        url = self.HOST + path
        req = urllib2.Request(url, data)
        return urllib2.urlopen(req).read()

    def _call(self, path, data):
        """POST ``data`` to ``path`` and return the decoded JSON object."""
        res = self.post(path, data)
        if isinstance(res, bytes):
            res = res.decode('utf-8')
        return json.loads(res)

    def getBalance(self):
        """Query the account balance.

        Returns ``jres['balance']`` when ``ret`` is 0, otherwise the
        ``ret`` error code.
        """
        data = {'appID': self.ID,
                'user': self.username,
                'pwd': self.getPwd(),
                'sign': self.getSign()
                }
        jres = self._call('d2Balance', data)
        if jres['ret'] == 0:
            return jres['balance']
        else:
            return jres['ret']

    def decode(self, fdata, type):
        """Upload captcha image bytes ``fdata`` for solving.

        ``type`` is the captcha type code (see
        http://wiki.dama2.com/index.php?n=ApiDoc.Pricedesc). Returns
        ``jres['result']`` when ``ret`` is 0, otherwise the ``ret`` error
        code. The response also carries ``id`` and ``cookie``.
        """
        filedata = base64.b64encode(fdata)
        data = {'appID': self.ID,
                'user': self.username,
                'pwd': self.getPwd(),
                'type': type,
                'fileDataBase64': filedata,
                'sign': self.getSign(fdata)
                }
        jres = self._call('d2File', data)
        if jres['ret'] == 0:
            return (jres['result'])
        else:
            return jres['ret']

    def decodeUrl(self, url, type):
        """Ask the service to solve the captcha found at ``url``.

        ``type`` is the captcha type code. Returns ``jres['result']`` when
        ``ret`` is 0, otherwise the ``ret`` error code.
        """
        data = {'appID': self.ID,
                'user': self.username,
                'pwd': self.getPwd(),
                'type': type,
                # urllib.quote is the Python 2 spelling; this module already
                # depends on urllib2.
                'url': urllib.quote(url),
                'sign': self.getSign(url.encode(encoding="utf-8"))
                }
        jres = self._call('d2Url', data)
        if jres['ret'] == 0:
            return (jres['result'])
        else:
            return jres['ret']

    def reportError(self, id):
        """Report a wrong answer for the upload with string id ``id``; return ``ret``."""
        data = {'appID': self.ID,
                'user': self.username,
                'pwd': self.getPwd(),
                'id': id,
                'sign': self.getSign(id.encode(encoding="utf-8"))
                }
        jres = self._call('d2ReportError', data)
        return jres['ret']


# Shared client instance used by the rest of the project.
# NOTE(review): the account name and password are hard-coded here; they
# should be moved to configuration and the password rotated.
dmt = DamatuApi("iamDW", "maosu1989")
def init_pool():
    """Create the Baidu and Ali web-driver pools.

    Both pools are sized by ``setting.DRIVER_POOL_SIZE``; the Baidu pool
    is created with ``setting.BAIDU_USERNAME``/``setting.BAIDU_PASSWORD``.

    Returns:
        ``(baidu_driver_pool, ali_driver_pool)`` as returned by the two
        pool factories.
    """
    print('正在初始化百度指数爬取模块')
    # Pool of web drivers used for the Baidu index searches.
    baidu_driver_pool = baidu_web_driver_pool.get_baidu_web_driver_pool(setting.DRIVER_POOL_SIZE,
                                                                        setting.BAIDU_USERNAME,
                                                                        setting.BAIDU_PASSWORD)
    print('正在初始化阿里指数爬取模块')
    # Pool of web drivers used for the Ali index searches.
    ali_driver_pool = ali_web_driver_pool.get_ali_web_driver_pool(setting.DRIVER_POOL_SIZE)

    return baidu_driver_pool, ali_driver_pool


def keywords():
    """Yield each row of ``keywords_data.csv``; the file is closed when iteration ends."""
    with open('keywords_data.csv', 'rb') as csv_file:
        for line in csv.reader(csv_file):
            yield line


def get_index(baidu_driver_pool_temp, ali_driver_pool_temp, keyword_line):
    """Crawl the Baidu and Ali indexes for one keyword row and store them.

    ``keyword_line`` is one CSV row: column 0 is used in the row key,
    columns 1 and 2 are the GBK-encoded industry names and column 3 is the
    GBK-encoded keyword. The Baidu index is fetched once per area in
    ``setting.AREA_LOCATION``, the Ali index once per keyword, and the
    merged columns are written to HBase with ``insert``.

    Returns:
        The dict of columns that was stored, or ``None`` when anything
        failed (the traceback is printed).
    """
    try:
        keyword = keyword_line[3].decode('gbk').strip()
        final_result = dict()
        top_key = setting.HBASE_BAIDU_FAM + ':' + 'baidu_prov_1'
        for area_code in setting.AREA_LOCATION:
            area_name = setting.AREA_REFLECTION[area_code]
            print(keyword + ';' + area_name + ':百度指数开始搜索')

            # The area ranking only needs to be searched once per keyword.
            need_search_top = top_key not in final_result
            result_baidu = get_baidu_source(baidu_driver_pool_temp, keyword, area_code,
                                            need_search_top)
            if result_baidu is None:
                final_result.update(get_null_baidu_model(area_code, need_search_top))
            else:
                final_result.update(get_baidu_model(area_code, result_baidu, need_search_top))

            print(keyword + ';' + area_name + ':百度指数搜索完成')

        print(keyword + ':阿里指数开始搜索')

        result_ali = get_ali_source(ali_driver_pool_temp, keyword)
        if result_ali is None:
            final_result.update(get_null_ali_model())
        else:
            final_result.update(get_ali_model(result_ali))

        print(keyword + ':阿里指数搜索完成')

        final_result[setting.HBASE_INDEX_BASE_FAM + ':' + 'crawl_key'] = keyword
        final_result[setting.HBASE_INDEX_BASE_FAM + ':' + 'industry_name'] = keyword_line[2].decode('gbk')
        final_result[setting.HBASE_INDEX_BASE_FAM + ':' + 'industry_name_big'] = keyword_line[1].decode('gbk')

        # Read the clock once so the row key and the timestamp always
        # describe the same day.
        now = time.localtime()
        datakey = keyword_line[0] + '_' + time.strftime('%Y%m', now)
        timestamp = int(time.strftime('%Y%m%d', now))
        insert(setting.HBASE_INDEX_TABLE_NAME, datakey, final_result, timestamp)

        result = json.dumps(final_result).decode('unicode-escape')
        print(result)

        return final_result
    except Exception:
        print('-------------------------')
        print(traceback.format_exc())
        print('--------------------------')


def _quit_drivers(driver_pool):
    """Quit every web driver currently held in ``driver_pool`` (which may be None)."""
    if driver_pool is None:
        return
    while not driver_pool.empty():
        driver_pool.get().quit()


def main():
    """Crawl every keyword with a thread pool, then shut the web drivers down."""
    baidu_driver_pool = None
    ali_driver_pool = None
    try:
        baidu_driver_pool, ali_driver_pool = init_pool()
        thread_pool = ThreadPool(setting.DRIVER_POOL_SIZE)
        for keyword in keywords():
            thread_pool.apply_async(get_index, (baidu_driver_pool, ali_driver_pool, keyword))
        thread_pool.close()
        thread_pool.join()
    except Exception:
        print(traceback.format_exc())
    finally:
        # Drivers left in the pools would otherwise keep their PhantomJS
        # processes alive after the script ends.
        for driver_pool in (baidu_driver_pool, ali_driver_pool):
            try:
                _quit_drivers(driver_pool)
            except Exception:
                print(traceback.format_exc())
    print('--------end--------')


if __name__ == '__main__':
    main()
def get_ali_index(web, keyword):
    """Scrape the 1688 (Ali) index pages for ``keyword``.

    Args:
        web: Selenium web driver used for the search.
        keyword: the search keyword.

    Returns:
        A dict with the ``ali_*`` fields, or ``None`` when scraping failed
        (the traceback is printed). Fields whose page section reports too
        little data are stored as ``''``.
    """
    try:
        web.get('https://index.1688.com/alizs/keyword.htm')
        time.sleep(setting.SLEEP_TIME)

        item = dict()

        search_box = web.find_element_by_id('alizs-input')
        search_box.clear()
        search_box.send_keys(keyword.decode('utf-8'))
        search_box.send_keys(Keys.ENTER)
        time.sleep(setting.SLEEP_TIME)

        web.find_elements_by_css_selector('ul.page-list li a')[0].click()
        time.sleep(setting.SLEEP_TIME)

        summary = web.find_element_by_css_selector(
            'div[class=\'selected-list fd-clr list\']')
        numeric_fields = (
            ('ali_taobaobuy_index', 'div.col-tb-purchase p'),
            ('ali_1688buy_index', 'div.col-1688-purchase p'),
            ('ali_1688sup_index', 'div.col-supply p'),
        )
        for field, selector in numeric_fields:
            # Thousands separators are stripped from the displayed number.
            item[field] = summary.find_element_by_css_selector(
                selector).get_attribute('innerText').replace(',', '')
        item['ali_taobao_needsfore'] = summary.find_element_by_css_selector(
            'p.col-forecast').get_attribute('innerText')

        web.find_elements_by_css_selector('ul.page-list li a')[2].click()
        time.sleep(setting.SLEEP_TIME)

        # Each section shows a fixed notice instead of figures when there is
        # too little data; the notice text decides which fields can be read.
        identity_html = web.find_element_by_css_selector(
            'div.mod-identity').get_attribute('outerHTML')
        price_html = web.find_element_by_css_selector(
            'div.mod-price').get_attribute('outerHTML')
        has_first_rect = '最近30天,在您所选行业老采购商人数过少,暂不提供新/老采购商身份分布' not in identity_html
        has_second_rect = '最近30天,在您所选行业淘宝店主采购商人数过少,暂不提供采购商的非淘宝/淘宝店主身份分布' not in identity_html
        has_third_rect = '最近30天,在您所选行业线上交易的供应商人数过少,暂不提供采购客单价分布。' not in price_html

        item['ali_newbuyer'] = ''
        item['ali_buyer_puchaseQty'] = ''
        item['ali_buyer_taobao'] = ''
        item['ali_buyer_taobao_grade'] = ''
        if has_first_rect or has_second_rect:
            highlights = web.find_elements_by_css_selector(
                'div[class=\'content detail\'] span.highlight-red')
            # The highlighted spans of the second section follow those of
            # the first one, when the first one is shown.
            offset = 0
            if has_first_rect:
                item['ali_newbuyer'] = highlights[0].get_attribute('innerText')
                item['ali_buyer_puchaseQty'] = highlights[1].get_attribute(
                    'innerText').replace('次以上', '')
                offset = 2
            if has_second_rect:
                item['ali_buyer_taobao'] = highlights[offset].get_attribute('innerText')
                item['ali_buyer_taobao_grade'] = highlights[offset + 1].get_attribute('innerText')

        if has_third_rect:
            price = web.find_element_by_css_selector(
                'div[class=\'obj-right obj-analyse\'] span.highlight-red')
            item['ali_buyer_unitprice'] = price.get_attribute('innerText')
        else:
            item['ali_buyer_unitprice'] = ''

        return item
    except Exception:
        # Scraping is best effort: report and signal failure with None.
        # Unlike the former ``finally: return``, this no longer swallows
        # KeyboardInterrupt / SystemExit.
        print(traceback.format_exc())
        return None


def get_ali_source(ali_driver_pool, keyword):
    """Borrow a driver from ``ali_driver_pool`` and scrape ``keyword``.

    Returns ``None`` when no pool is given or when scraping failed. The
    driver is always handed back to the pool, even if scraping raises.
    """
    if not ali_driver_pool:
        return None
    web = ali_driver_pool.get()
    try:
        return get_ali_index(web, keyword)
    finally:
        ali_driver_pool.put(web)
# --- ali_index/ali_web_driver/ali_source.py ---

def get_page_source(web):
    """Return the current document's ``outerHTML`` as reported by the driver."""
    return web.execute_script("return document.documentElement.outerHTML")


# --- model/baidu/baidu_index_model.py ---

# Fields stored once per area; their column name carries the area code.
_AREA_FIELDS = (
    'baidu_overa_index',
    'baidu_mbl_index',
    'baidu_overa_yearbase',
    'baidu_overa_chain',
    'baidu_mbl_yearbase',
    'baidu_mbl_chain',
    'baidu_avg_7',
    'baidu_avg_30',
    'baidu_avg_90',
    'baidu_avg_180',
    'baidu_avg_all',
)

# Ranking fields stored once per keyword (no area code in the column name):
# 10 provinces, 7 areas and 10 cities.
_TOP_FIELDS = tuple(
    ['baidu_prov_%d' % rank for rank in range(1, 11)] +
    ['baidu_area_%d' % rank for rank in range(1, 8)] +
    ['baidu_city_%d' % rank for rank in range(1, 11)]
)


def _baidu_columns(area_code, need_top):
    """Return ``(column_name, field_name)`` pairs for one area.

    Keeping the field list in one place guarantees that the filled and the
    empty model always expose exactly the same columns.
    """
    family = setting.HBASE_BAIDU_FAM + ':'
    columns = [(family + area_code + '_' + field, field)
               for field in _AREA_FIELDS]
    if need_top:
        columns.extend((family + field, field) for field in _TOP_FIELDS)
    return columns


def get_baidu_model(area_code, base_model, need_top):
    """Map scraped Baidu index values onto their storage column names.

    Args:
        area_code: area identifier embedded in the per-area column names.
        base_model: dict of scraped values keyed by field name; a missing
            field raises ``KeyError``.
        need_top: when true, the province/area/city ranking columns are
            included as well.

    Returns:
        A dict of ``'<family>:<column>'`` to value.
    """
    return dict((column, base_model[field])
                for column, field in _baidu_columns(area_code, need_top))


def get_null_baidu_model(area_code, need_top):
    """Return the same columns as ``get_baidu_model``, all set to ``''``.

    Used when no Baidu data could be scraped for ``area_code``.
    """
    return dict((column, '')
                for column, _ in _baidu_columns(area_code, need_top))
# Ranking lists shown on the region page: (field prefix, number of rows).
_TOP_RANKINGS = (('baidu_prov', 10), ('baidu_area', 7), ('baidu_city', 10))


def _exit_if_captcha(web, keyword):
    """Terminate the whole process when Baidu asks for a captcha.

    A screenshot is saved and a notification e-mail is sent first.
    ``os._exit`` is used, so nothing after this call runs in that case.
    """
    if '请输入验证码' not in get_page_source(web):
        return
    print('需要验证码,保存截图并结束搜索')
    web.save_screenshot('pic_temp/need_authcode_%s.png' % (str(time.time()),))
    sendEmail('%s;-1' % (keyword,))
    os._exit(0)


def _select_area(web, area):
    """Pick ``area[0]`` then ``area[1]`` in the two-level region drop-down."""
    web.find_element_by_css_selector(
        'div#compOtharea div.comCtl span.holdBox').click()
    time.sleep(setting.SLEEP_TIME * 0.6)

    for level in (0, 1):
        panel = web.find_elements_by_css_selector(
            'div#compOtharea div.sltOpt')[level]
        # NOTE(review): an XPath starting with '//' is evaluated against the
        # whole document even when called on an element; './/a[...]' would
        # restrict the search to this panel. Left unchanged because the page
        # structure cannot be verified from here - confirm before changing.
        option = panel.find_element_by_xpath(
            '//a[contains(.,\'%s\')]' % (area[level],))
        option.click()
        time.sleep(setting.SLEEP_TIME * 0.6)


def _read_average(web, rel, hover):
    """Read the average shown for the time range ``rel`` from a screenshot.

    Clicks the ``a[rel='<rel>']`` range link (optionally hovering over it
    afterwards), clicks the first ``rect[x='45']`` element, screenshots the
    page, crops the ``div.contentWord`` element (minus a 2px border) and
    passes the crop to ``get_vcode_by_img_0``. The screenshot file is kept
    when the recognised value is ``''`` so the failure can be inspected.
    """
    range_link = web.find_element_by_css_selector('a[rel=\'%s\']' % (rel,))
    range_link.click()
    time.sleep(setting.SLEEP_TIME)

    if hover:
        ActionChains(web).move_to_element(range_link).perform()
        time.sleep(setting.SLEEP_TIME)

    web.find_elements_by_css_selector('rect[x=\'45\']')[0].click()
    time.sleep(setting.BAIDU_AVG_SLEEP_TIME)

    tooltip = web.find_element_by_css_selector('div.contentWord')
    location = copy.copy(tooltip.location)
    size = copy.copy(tooltip.size)
    pic_name = 'pic_temp/element_%s.png' % (str(time.time()),)
    web.save_screenshot(pic_name)

    left = location['x'] + 2
    top = location['y'] + 2
    right = left + size['width'] - 4
    bottom = top + size['height'] - 4

    with Image.open(pic_name) as screenshot:
        cropped = screenshot.crop((left, top, right, bottom))
        cropped.save(pic_name)
        value = get_vcode_by_img_0(cropped)
    if os.path.exists(pic_name) and value != '':
        os.remove(pic_name)
    return value


def _read_ranking(web, prefix, count):
    """Return ``{'<prefix>_<rank>': name}`` for the first ``count`` rows."""
    rows = web.find_elements_by_css_selector('div.items')
    ranking = dict()
    for rank in range(1, count + 1):
        cell = rows[rank - 1].find_element_by_css_selector('td.scName')
        ranking['%s_%d' % (prefix, rank)] = cell.get_attribute('innerText')
    return ranking


def get_baidu_index(web, keyword, area_code, need_search_top_area):
    """Scrape the Baidu index pages for ``keyword`` in one area.

    Args:
        web: Selenium web driver used for the search.
        keyword: the search keyword.
        area_code: key into ``setting.AREA_REFLECTION``; ``'qg_qg'`` skips
            the area selection.
        need_search_top_area: when true, also read the province/area/city
            rankings.

    Returns:
        A dict of ``baidu_*`` fields, or ``None`` when the keyword is not
        indexed by Baidu or when scraping failed. On failure a screenshot is
        saved and an e-mail is sent. If the page asks for a captcha the
        whole process is terminated.
    """
    try:
        web.get('http://index.baidu.com')
        search_box = web.find_element_by_id('schword')
        search_box.clear()
        search_box.send_keys(keyword.decode('utf-8'))
        search_box.send_keys(Keys.ENTER)
        time.sleep(setting.SLEEP_TIME * 3)

        # Automatic captcha solving (see _get_authcode) is disabled; the
        # run is stopped instead.
        _exit_if_captcha(web, keyword)

        if '未被收录,如要查看相关数据,您需要购买创建新词的权限' in get_page_source(web):
            print('关键字:' + keyword + '未被百度收录')
            return None

        if area_code != 'qg_qg':
            _select_area(web, setting.AREA_REFLECTION[area_code].split('_'))

        web.find_element_by_css_selector('a.gColor1').click()
        time.sleep(setting.SLEEP_TIME * 2)
        item = dict()

        spans = web.find_elements_by_css_selector('span.ftlwhf')

        item['baidu_overa_index'] = _get_value_by_ocr(web, spans[6])
        item['baidu_mbl_index'] = _get_value_by_ocr(web, spans[7])

        item['baidu_overa_yearbase'] = get_value(spans[8])
        item['baidu_overa_chain'] = get_value(spans[9])

        item['baidu_mbl_yearbase'] = get_value(spans[10])
        item['baidu_mbl_chain'] = get_value(spans[11])

        web.find_element_by_css_selector('label#trend-meanline').click()
        time.sleep(setting.SLEEP_TIME)

        # The 7-day range is read without the extra hover step; every other
        # range hovers over its link before the chart is clicked.
        for rel in ('7', '30', '90', '180', 'all'):
            item['baidu_avg_' + rel] = _read_average(web, rel, hover=(rel != '7'))

        if need_search_top_area:
            web.find_elements_by_css_selector(
                'table#subNav td')[3].find_element_by_css_selector('a').click()
            time.sleep(setting.SLEEP_TIME * 3.5)

            notice = web.find_element_by_css_selector(
                'div.grpArea svg text[text-anchor=\'middle\']')
            if 'display: none;' not in notice.get_attribute('style'):
                # The notice text is visible: store every ranking as empty.
                for prefix, count in _TOP_RANKINGS:
                    for rank in range(1, count + 1):
                        item['%s_%d' % (prefix, rank)] = ''
            else:
                for tab_index, (prefix, count) in enumerate(_TOP_RANKINGS):
                    if tab_index:
                        # The first list is already displayed; the others
                        # need their tab selected first.
                        web.find_elements_by_css_selector(
                            'ul.scTab li')[tab_index].click()
                        time.sleep(setting.SLEEP_TIME)
                    item.update(_read_ranking(web, prefix, count))

        return item
    except Exception:
        details = traceback.format_exc()
        try:
            _exit_if_captcha(web, keyword)

            print('---------------')
            print(details)
            print('---------------')
            error_pic_name = 'pic_temp/error_%s.png' % (str(time.time()),)
            web.save_screenshot(error_pic_name)
            sendEmail('关键字:%s\r\n%s\r\n%s' % (keyword, details, error_pic_name))
        except Exception:
            # Reporting is best effort; a failure here must not hide the
            # "no data" result from the caller.
            print(traceback.format_exc())
        return None


def get_value(span):
    """Return the span's text followed by the mapped value of each ``<i>`` child.

    Every child ``<i>`` element's ``outerHTML`` is looked up in
    ``setting.REFLECTION`` and the result is appended to the span's
    ``innerText``.
    """
    str_value = span.get_attribute('innerText')

    for icon in span.find_elements_by_css_selector('i'):
        str_value += setting.REFLECTION[icon.get_attribute('outerHTML')]
    return str_value


def get_baidu_source(baidu_driver_pool, keyword, area_code, need_search_top_area):
    """Borrow a driver from ``baidu_driver_pool`` and scrape ``keyword``.

    Returns ``None`` when no pool is given or when scraping produced no
    data. The driver is always handed back to the pool, even if scraping
    raises.
    """
    if not baidu_driver_pool:
        return None
    web = baidu_driver_pool.get()
    try:
        return get_baidu_index(web, keyword, area_code, need_search_top_area)
    finally:
        baidu_driver_pool.put(web)


def get_page_source(web):
    """Return the current document's ``outerHTML`` as reported by the driver."""
    return web.execute_script("return document.documentElement.outerHTML")


def _get_value_by_ocr(web, element):
    """Screenshot the page, crop ``element`` and recognise its content.

    The crop is passed to ``get_vcode_by_img_1``; the temporary screenshot
    file is removed afterwards.
    """
    pic_name = 'pic_temp/element_%s.png' % (str(time.time()),)
    web.save_screenshot(pic_name)

    left = element.location['x']
    top = element.location['y']
    right = left + element.size['width']
    bottom = top + element.size['height']

    with Image.open(pic_name) as screenshot:
        cropped = screenshot.crop((left, top, right, bottom))
        cropped.save(pic_name)
        vcode = get_vcode_by_img_1(cropped)
    if os.path.exists(pic_name):
        os.remove(pic_name)
    return vcode


def _get_authcode(web):
    """Crop the captcha image from a screenshot and decode it via ``dmt``.

    Returns whatever ``dmt.decode`` returns for the image bytes. Not called
    by ``get_baidu_index`` at the moment.
    """
    print('指数页面打码1次')
    pic_name = 'pic_temp/authcode_baidu_index.png'
    web.save_screenshot(pic_name)
    element = web.find_element_by_css_selector('img.verifyImg')

    left = element.location['x']
    top = element.location['y']
    right = left + element.size['width']
    bottom = top + element.size['height']

    with Image.open(pic_name) as screenshot:
        cropped = screenshot.crop((left, top, right, bottom))
        cropped.save(pic_name)

    data = _get_bytes(pic_name)
    result = dmt.decode(data, 42)
    print(' 指数页面打码结果:' + result)
    return result
def _get_bytes(path):
    """Return the full contents of the file at ``path`` as a ``bytearray``.

    The file is read in one call inside a ``with`` block, so the handle is
    always closed (it used to be leaked) and the data is no longer read one
    byte at a time.
    """
    with open(path, 'rb') as binary_file:
        return bytearray(binary_file.read())