├── keywords_data.csv
├── tesseract_ocr
│   ├── 0_0.png
│   ├── 0_1.png
│   ├── __init__.py
│   └── tesseract_ocr_module.py
├── baidu_index
│   ├── __init__.pyc
│   ├── __init__.py
│   └── baidu_web_driver
│       ├── __init__.py
│       ├── baidu_web_driver_pool.py
│       ├── baidu_login_module.py
│       └── baidu_source.py
├── base
│   ├── __init__.py
│   └── send_email.py
├── dama
│   ├── __init__.py
│   └── damatu
│       ├── __init__.py
│       └── damatuWeb.py
├── hbase
│   ├── __init__.py
│   └── hbase_module.py
├── kafka
│   └── __init__.py
├── model
│   ├── __init__.py
│   ├── ali
│   │   ├── __init__.py
│   │   └── ali_index_model.py
│   └── baidu
│       ├── __init__.py
│       └── baidu_index_model.py
├── ali_index
│   ├── __init__.py
│   └── ali_web_driver
│       ├── __init__.py
│       ├── ali_login.py
│       ├── ali_web_driver_pool.py
│       └── ali_source.py
├── setting.py
└── main.py
/keywords_data.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/da2vin/Spider_index/HEAD/keywords_data.csv
--------------------------------------------------------------------------------
/tesseract_ocr/0_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/da2vin/Spider_index/HEAD/tesseract_ocr/0_0.png
--------------------------------------------------------------------------------
/tesseract_ocr/0_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/da2vin/Spider_index/HEAD/tesseract_ocr/0_1.png
--------------------------------------------------------------------------------
/baidu_index/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/da2vin/Spider_index/HEAD/baidu_index/__init__.pyc
--------------------------------------------------------------------------------
/base/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | reload(sys)
6 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/dama/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | reload(sys)
6 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/hbase/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | reload(sys)
6 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/kafka/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | reload(sys)
6 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/model/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | reload(sys)
6 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/ali_index/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | reload(sys)
6 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/model/ali/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | reload(sys)
6 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/baidu_index/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | reload(sys)
6 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/dama/damatu/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | reload(sys)
6 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/model/baidu/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | reload(sys)
6 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/tesseract_ocr/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | reload(sys)
6 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/ali_index/ali_web_driver/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | reload(sys)
6 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/ali_index/ali_web_driver/ali_login.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | reload(sys)
6 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/baidu_index/baidu_web_driver/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | reload(sys)
6 | sys.setdefaultencoding('utf-8')
--------------------------------------------------------------------------------
/base/send_email.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | from email.header import Header
4 | from email.mime.text import MIMEText
5 | from email.utils import parseaddr, formataddr
6 | import smtplib
7 | import sys
8 | import setting
9 | import traceback
10 |
11 | reload(sys)
12 | sys.setdefaultencoding('utf-8')
13 |
14 |
15 | def _format_addr(s):
16 | name, addr = parseaddr(s)
17 |     return formataddr((
18 |         Header(name, 'utf-8').encode(),
19 |         addr.encode('utf-8') if isinstance(addr, unicode) else addr))
20 |
21 |
22 | def sendEmail(text):
23 | try:
24 | msg = MIMEText(text, 'plain', 'utf-8')
25 |         msg['From'] = _format_addr(u'Index crawler <%s>' % setting.EMAIL_ADDR)
26 |         msg['To'] = _format_addr(u'DK <%s>' % setting.EMAIL_TO_ADDR)
27 |         msg['Subject'] = Header(u'Index crawler exception', 'utf-8').encode()
28 |
29 | server = smtplib.SMTP(setting.EMAIL_SMTP_SERVER, 25)
30 | server.set_debuglevel(1)
31 | server.login(setting.EMAIL_ADDR, setting.EMAIL_PASSWORD)
32 | server.sendmail(setting.EMAIL_ADDR, [setting.EMAIL_TO_ADDR], msg.as_string())
33 | server.quit()
34 |     except Exception:
35 |         print 'Failed to send email: ' + traceback.format_exc()
36 |
37 |
--------------------------------------------------------------------------------
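
sendEmail is the crawler's alarm channel: baidu_source calls it with the failing keyword and a traceback whenever a search breaks down. A minimal usage sketch (the message text is illustrative):

    from base.send_email import sendEmail

    # any plain-text body works; subject and addresses come from setting.py
    sendEmail('keyword: test\nsomething went wrong')
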
/ali_index/ali_web_driver/ali_web_driver_pool.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import Queue
4 | import sys
5 | from selenium import webdriver
6 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
7 | import setting
8 |
9 | reload(sys)
10 | sys.setdefaultencoding('utf-8')
11 |
12 | dcap = dict(DesiredCapabilities.PHANTOMJS)
13 | dcap["phantomjs.page.settings.resourceTimeout"] = 10
14 | dcap["phantomjs.page.settings.loadImages"] = True
15 | dcap["phantomjs.page.settings.userAgent"] = (setting.USER_AGENT)
16 |
17 |
18 | def _get_base_ali_driver():
19 |     if setting.PHANTOMJS_SERVICE:
20 |         web = webdriver.PhantomJS(service_args=setting.PHANTOMJS_SERVICE, executable_path=setting.PHANTOMJS_PATH,
21 |                                   desired_capabilities=dcap)
22 |     else:
23 |         web = webdriver.PhantomJS(executable_path=setting.PHANTOMJS_PATH,
24 |                                   desired_capabilities=dcap)
25 | return web
26 |
27 |
28 | def get_ali_web_driver_pool(num):
29 |     driver_queue = Queue.Queue()
30 |     # build num drivers up front and park them in the queue
31 |     for _ in range(num):
32 |         driver_queue.put(_get_base_ali_driver())
33 |     return driver_queue
34 |
--------------------------------------------------------------------------------
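
The pool is a plain Queue.Queue of ready PhantomJS drivers; callers borrow one with get() and hand it back with put(), which is exactly how get_ali_source and get_baidu_source use their pools. A borrowing sketch (the URL is the one ali_source visits):

    from ali_index.ali_web_driver import ali_web_driver_pool

    pool = ali_web_driver_pool.get_ali_web_driver_pool(1)
    web = pool.get()   # blocks until a driver is free
    try:
        web.get('https://index.1688.com/alizs/keyword.htm')
    finally:
        pool.put(web)  # always return the driver, even on error
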
/setting.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 |
5 | reload(sys)
6 | sys.setdefaultencoding('utf-8')
7 |
8 | USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
9 | # PHANTOMJS_PATH = '/root/phantomjs/phantomjs'
10 | PHANTOMJS_PATH = 'C:/Python27/phantomjs.exe'
11 | # PHANTOMJS_SERVICE = [
12 | # '--proxy=localhost:8888',
13 | # '--proxy-type=http',
14 | # # '--proxy-auth=username:password'
15 | # ]
16 | PHANTOMJS_SERVICE = None
17 | DRIVER_POOL_SIZE = 1  # PhantomJS instances per pool; also the thread-pool size in main.py
18 |
19 | BAIDU_USERNAME = 'your Baidu username'
20 | BAIDU_PASSWORD = 'your Baidu password'
21 |
22 | HBASE_HOST = '192.168.2.240'
23 | HBASE_PORT = 9090
24 |
25 | REFLECTION = {  # outerHTML of Baidu's obfuscated <i> digit glyphs -> digit value; used by baidu_source.get_value
26 | '': '0',
27 | '': '1',
28 | '': '2',
29 | '': '3',
30 | '': '4',
31 | '': '5',
32 | '': '6',
33 | '': '7',
34 | '': '8',
35 | '': '9',
36 | '': '%',
37 | }
38 |
39 | HBASE_BAIDU_FAM = 'fam_baidu'
40 | HBASE_ALI_FAM = 'fam_ali'
41 | HBASE_INDEX_BASE_FAM = 'fam_exponent_info'
42 | HBASE_INDEX_TABLE_NAME = 'index'
43 |
44 | AREA_REFLECTION = {  # area code -> 'province_city' labels clicked in Baidu's region picker
45 | 'sc_cd': '四川_成都',
46 | 'sc_my': '四川_绵阳',
47 | 'gd_gz': '广东_广州',
48 | 'qg_qg': '全国_全国',
49 | }
50 |
51 | AREA_LOCATION = {  # regions actually crawled; keys must also exist in AREA_REFLECTION
52 | # 'sc_cd': '四川_成都',
53 | # 'sc_my': '四川_绵阳',
54 | # 'gd_gz': '广东_广州',
55 | 'qg_qg': '全国_全国',
56 | }
57 |
58 | SLEEP_TIME = 1  # base delay in seconds between page interactions
59 |
60 | EMAIL_ADDR = 'sender email address'
61 | EMAIL_PASSWORD = 'sender email password'
62 | EMAIL_SMTP_SERVER = 'smtp.163.com'
63 | EMAIL_TO_ADDR = 'recipient email address'
64 |
65 | BAIDU_AVG_SLEEP_TIME = 2  # extra wait for Baidu's average-line tooltip to render
66 |
--------------------------------------------------------------------------------
/baidu_index/baidu_web_driver/baidu_web_driver_pool.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import Queue
4 | import sys
5 | import baidu_login_module
6 | from selenium import webdriver
7 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
8 | import setting
9 | import pickle
10 | import os
11 |
12 | reload(sys)
13 | sys.setdefaultencoding('utf-8')
14 |
15 | dcap = dict(DesiredCapabilities.PHANTOMJS)
16 | dcap["phantomjs.page.settings.resourceTimeout"] = 10
17 | dcap["phantomjs.page.settings.loadImages"] = True
18 | dcap["phantomjs.page.settings.userAgent"] = (setting.USER_AGENT)
19 |
20 |
21 | def _get_base_baidu_driver(cookies):
22 |     if setting.PHANTOMJS_SERVICE:
23 |         web = webdriver.PhantomJS(service_args=setting.PHANTOMJS_SERVICE, executable_path=setting.PHANTOMJS_PATH,
24 |                                   desired_capabilities=dcap)
25 |     else:
26 |         web = webdriver.PhantomJS(executable_path=setting.PHANTOMJS_PATH,
27 |                                   desired_capabilities=dcap)
28 |     web.get('http://index.baidu.com')  # WebDriver only accepts cookies for the domain currently loaded
29 |     for cookie in cookies:
30 |         web.add_cookie(cookie)
31 |     return web
32 |
33 | '''
34 | Get a driver pool: reuse the saved cookies when present, otherwise log in and save them.
35 | '''
36 | def get_baidu_web_driver_pool(num, username, password):
37 | cookies = get_cookie()
38 | if not cookies:
39 | cookies = baidu_login_module.login_baidu(username, password)
40 | save_cookie(cookies)
41 | driver_queue = Queue.Queue()
42 | if not cookies:
43 | return None
44 | else:
45 |         # build num drivers, each preloaded with the login cookies
46 |         for _ in range(num):
47 |             web = _get_base_baidu_driver(cookies)
48 |             driver_queue.put(web)
49 |
50 | return driver_queue
51 |
52 |
53 | def save_cookie(cookies):
54 | with open('cookies.pkl', 'wb') as output:
55 | pickle.dump(cookies, output)
56 |
57 |
58 | def get_cookie():
59 | if os.path.exists('cookies.pkl'):
60 | with open('cookies.pkl', 'rb') as cookie_file:
61 | cookies = pickle.load(cookie_file)
62 | return cookies
63 | else:
64 | return None
65 |
--------------------------------------------------------------------------------
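
Cookies persist in cookies.pkl in the working directory, so a stale or expired session survives restarts. Deleting the file forces get_baidu_web_driver_pool to log in again; a sketch:

    import os

    if os.path.exists('cookies.pkl'):
        os.remove('cookies.pkl')  # next pool init logs in afresh and re-saves cookies
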
/hbase/hbase_module.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | import happybase
5 | import setting
6 | import json
7 | import csv
8 | import codecs
9 |
10 | reload(sys)
11 | sys.setdefaultencoding('utf-8')
12 |
13 |
14 | def insert(table_name, key, data, timestamp):
15 | connection = happybase.Connection(setting.HBASE_HOST, port=setting.HBASE_PORT)
16 | table = connection.table(table_name)
17 | table.put(key, data, timestamp)
18 | connection.close()
19 |
20 |
21 | def drop_table(table_name):
22 | connection = happybase.Connection(setting.HBASE_HOST, port=setting.HBASE_PORT)
23 | connection.disable_table(table_name)
24 | connection.delete_table(table_name)
25 | connection.close()
26 |
27 |
28 | def create_table(table_name, table_families):
29 | connection = happybase.Connection(setting.HBASE_HOST, port=setting.HBASE_PORT)
30 | connection.create_table(table_name, families=table_families)
31 | connection.close()
32 |
33 |
34 | def scan(table_name):
35 | connection = happybase.Connection(setting.HBASE_HOST, port=setting.HBASE_PORT)
36 | table = connection.table(table_name)
37 |     items = list(table.scan())  # materialize: the scan generator can only be walked once
38 |     for item in items:
39 |         print json.dumps(dict(item[1])).decode('unicode-escape')
40 |     print len(items)
41 | connection.close()
42 |
43 |
44 | def write_csv(table_name, file_name, key_filter='2016'):
45 | connection = happybase.Connection(setting.HBASE_HOST, port=setting.HBASE_PORT)
46 | table = connection.table(table_name)
47 | items = table.scan(filter="RowFilter(=,\'substring:%s\')" % (key_filter,))
48 |
49 | with open(file_name, 'wb') as csvfile:
50 | csvfile.write(codecs.BOM_UTF8)
51 | spamwriter = csv.writer(csvfile, dialect='excel')
52 |         first = True
53 |         for item in items:
54 |             # build the dict once so keys() and values() stay aligned
55 |             row = dict(item[1])
56 |             if first:
57 |                 spamwriter.writerow(row.keys() + ['key'])
58 |                 first = False
59 |             values = row.values()
60 |             values.append(item[0])  # the HBase row key goes in the last column
61 |             spamwriter.writerow(values)
62 |
63 | connection.close()
64 |
65 | # write_csv('index', 'data.csv', '201610')
66 |
67 | # scan('index')
68 |
69 | # drop_table('index')
70 | # create_table('index', {'fam_exponent_info': dict(max_versions=31),
71 | # 'fam_baidu': dict(max_versions=31),
72 | # 'fam_ali': dict(max_versions=31),
73 | # })
74 |
--------------------------------------------------------------------------------
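
insert() writes one row per keyword and month, with the calendar day as the cell version. Reading a row back is symmetric; a sketch with a hypothetical row key:

    import happybase
    import setting

    connection = happybase.Connection(setting.HBASE_HOST, port=setting.HBASE_PORT)
    table = connection.table(setting.HBASE_INDEX_TABLE_NAME)
    row = table.row('12345_201610')  # '<csv id>_<YYYYMM>', as built in main.get_index
    print row.get(setting.HBASE_INDEX_BASE_FAM + ':crawl_key')
    connection.close()
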
/tesseract_ocr/tesseract_ocr_module.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | from PIL import Image
4 | import pytesseract
5 | import threading
6 |
7 | mutex = threading.Lock()
8 |
9 |
10 | def get_vcode(path):
11 | with Image.open(path) as image:
12 |         with mutex:  # the lock is released even if OCR raises
13 |             vcode = pytesseract.image_to_string(image, lang='numfont')
14 |
15 |     return vcode.replace(',', '').replace('\n', '')
16 |
17 |
18 | def get_vcode_by_img_0(img):
19 |     with mutex:  # the lock is released even if OCR raises
20 |         vcode = pytesseract.image_to_string(img, lang='numfont')
21 |         if vcode == '':
22 |             # single digits OCR poorly: prepend the known '0' glyph, then strip it
23 |             img = merge_thumb_0(img)
24 |             vcode = pytesseract.image_to_string(img, lang='numfont')
25 |             if vcode == '00':
26 |                 vcode = '0'
27 |             else:
28 |                 vcode = vcode.strip('0')
29 |     return vcode.replace(',', '').replace('\n', '')
30 |
31 |
32 | def get_vcode_by_img_1(img):
33 |     with mutex:  # the lock is released even if OCR raises
34 |         vcode = pytesseract.image_to_string(img, lang='numfont')
35 |         if vcode == '':
36 |             # single digits OCR poorly: append the known '0' glyph, then strip it
37 |             img = merge_thumb_1(img)
38 |             vcode = pytesseract.image_to_string(img, lang='numfont')
39 |             if vcode == '00':
40 |                 vcode = '0'
41 |             else:
42 |                 vcode = vcode.strip('0')
43 |     return vcode.replace(',', '').replace('\n', '')
44 |
45 | '''
46 | Single-digit images cannot be recognized on their own, so an extra glyph image is merged in; this is the dark-background variant.
47 | '''
48 | def merge_thumb_0(image_need_merge):
49 | image_0 = Image.open('tesseract_ocr/0_0.png')
50 | size_need_merge = image_need_merge.size
51 | size_0 = image_0.size
52 |
53 | merge_image = Image.new('RGBA', (size_need_merge[0] + size_0[0], size_need_merge[1]))
54 | merge_image.paste(image_0, (0, 0))
55 | merge_image.paste(image_need_merge, (size_0[0], 0))
56 |
57 | # merge_image.save('pic_temp/merged.png')
58 | return merge_image
59 |
60 | '''
61 | Single-digit images cannot be recognized on their own, so an extra glyph image is merged in; this is the white-background variant.
62 | '''
63 | def merge_thumb_1(image_need_merge):
64 | image_0 = Image.open('tesseract_ocr/0_1.png')
65 | size_need_merge = image_need_merge.size
66 | size_0 = image_0.size
67 |
68 | merge_image = Image.new('RGBA', (size_need_merge[0] + size_0[0], size_need_merge[1]))
69 | merge_image.paste(image_need_merge, (0, 0))
70 | merge_image.paste(image_0, (size_need_merge[0], 0))
71 |
72 | # merge_image.save('pic_temp/merged.png')
73 | return merge_image
74 |
75 | # import cv
76 | # def white_and_black(pic_name):
77 | # image = cv.LoadImage(pic_name, 0)
78 | # size = (image.width, image.height)
79 | # iTmp = cv.CreateImage(size, image.depth, image.nChannels)
80 | # for i in range(image.height):
81 | # for j in range(image.width):
82 | # if image[i, j] < 100:
83 | # iTmp[i, j] = 255
84 | # else:
85 | # iTmp[i, j] = 0
86 | #
87 | # cv.SaveImage(pic_name, iTmp)
88 |
89 | # get_vcode('merged.png')
90 | # merge_thumb('111.png')
91 |
--------------------------------------------------------------------------------
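
Every call passes lang='numfont', a custom Tesseract traineddata rather than a stock language pack, so it has to be installed under Tesseract's tessdata directory first. A smoke test against the bundled glyph, assuming the traineddata is in place:

    from PIL import Image
    import pytesseract

    with Image.open('tesseract_ocr/0_0.png') as img:
        print pytesseract.image_to_string(img, lang='numfont')  # the zero glyph should read back as '0'
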
/baidu_index/baidu_web_driver/baidu_login_module.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | from selenium import webdriver
4 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
5 | import time
6 | from PIL import Image
7 | from dama.damatu.damatuWeb import dmt
8 | import setting
9 | from selenium.webdriver.common.keys import Keys
10 |
11 | dcap = dict(DesiredCapabilities.PHANTOMJS)
12 | dcap["phantomjs.page.settings.resourceTimeout"] = 10
13 | dcap["phantomjs.page.settings.loadImages"] = True
14 | dcap["phantomjs.page.settings.userAgent"] = (setting.USER_AGENT)
15 |
16 |
17 | def login_baidu(username, password):
18 | if setting.PHANTOMJS_SERVICE:
19 | web = webdriver.PhantomJS(service_args=setting.PHANTOMJS_SERVICE, executable_path=setting.PHANTOMJS_PATH,
20 | desired_capabilities=dcap)
21 | else:
22 | web = webdriver.PhantomJS(executable_path=setting.PHANTOMJS_PATH, desired_capabilities=dcap)
23 |
24 | web.get('https://passport.baidu.com/v2/?login')
25 |
26 | cookies = []
27 |
28 | element = web.find_element_by_id('TANGRAM__PSP_3__userName')
29 | element.clear()
30 | element.send_keys(username)
31 |
32 | element = web.find_element_by_id('TANGRAM__PSP_3__password')
33 | element.clear()
34 | element.send_keys(password)
35 |
36 | element = web.find_element_by_id('TANGRAM__PSP_3__submit')
37 | element.click()
38 | time.sleep(3)
39 |
40 | while True:
41 | if '帐号设置' in web.find_element_by_css_selector('title').get_attribute('innerText'):
42 |             print 'Login succeeded'
43 | cookies = web.get_cookies()
44 | break
45 | errorMsg = web.find_element_by_id('TANGRAM__PSP_3__error').get_attribute('innerText')
46 | if errorMsg == '请输入验证码':
47 | print errorMsg
48 | authcode = _get_authcode(web)
49 | element = web.find_element_by_id('TANGRAM__PSP_3__verifyCode')
50 | element.clear()
51 | element.send_keys(authcode)
52 |
53 | element = web.find_element_by_id('TANGRAM__PSP_3__submit')
54 | element.click()
55 | time.sleep(3)
56 | elif errorMsg == '您输入的验证码有误':
57 | print errorMsg
58 | element = web.find_element_by_id('TANGRAM__PSP_3__verifyCodeImg')
59 | element.click()
60 | time.sleep(1)
61 | authcode = _get_authcode(web)
62 | element = web.find_element_by_id('TANGRAM__PSP_3__verifyCode')
63 | element.clear()
64 | element.send_keys(authcode)
65 | element.send_keys(Keys.ENTER)
66 |
67 | # element = web.find_element_by_id('TANGRAM__PSP_3__submit')
68 | # element.click()
69 | time.sleep(3)
70 | # web.save_screenshot('screen_baidu.png')
71 | else:
72 | print errorMsg
73 | cookies = None
74 | break
75 |
76 | web.close()
77 | return cookies
78 |
79 |
80 | def _get_authcode(web):
81 | web.save_screenshot('authcode_baidu.png')
82 | element = web.find_element_by_id('TANGRAM__PSP_3__verifyCodeImgParent')
83 |
84 |     left = 800  # crop offsets are hard-coded for the default PhantomJS viewport
85 |     top = 352
86 | right = left + element.size['width']
87 | bottom = top + element.size['height']
88 |
89 | im = Image.open('authcode_baidu.png')
90 | im = im.crop((left, top, right, bottom))
91 | im.save('authcode_baidu.png')
92 |
93 | data = _get_bytes('authcode_baidu.png')
94 | result = dmt.decode(data, 71)
95 | return result
96 |
97 |
98 | def _get_bytes(path):
99 |     # read the whole image into a byte array for the captcha API
100 |     with open(path, 'rb') as f:
101 |         return bytearray(f.read())
102 |
--------------------------------------------------------------------------------
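
The captcha crop above uses fixed (800, 352) offsets tied to one viewport; _get_authcode in baidu_source derives the crop box from the element itself, which is more robust. The same idea applied here (a sketch; _crop_box is a hypothetical helper):

    def _crop_box(element):
        # derive the crop rectangle from the element's own position and size
        left = element.location['x']
        top = element.location['y']
        return (left, top, left + element.size['width'], top + element.size['height'])
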
/model/ali/ali_index_model.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | import setting
5 |
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
8 |
9 |
10 | def get_ali_model_by_area(area_code, base_model):
11 | ali_model = dict()
12 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_1688buy_index'] = base_model['ali_1688buy_index']
13 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_taobaobuy_index'] = base_model['ali_taobaobuy_index']
14 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_1688sup_index'] = base_model['ali_1688sup_index']
15 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_taobao_needsfore'] = base_model['ali_taobao_needsfore']
16 |
17 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_newbuyer'] = base_model['ali_newbuyer']
18 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_buyer_puchaseQty'] = base_model['ali_buyer_puchaseQty']
19 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_buyer_taobao'] = base_model['ali_buyer_taobao']
20 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_buyer_taobao_grade'] = base_model['ali_buyer_taobao_grade']
21 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_buyer_unitprice'] = base_model['ali_buyer_unitprice']
22 | return ali_model
23 |
24 | def get_ali_model(base_model):
25 | ali_model = dict()
26 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_1688buy_index'] = base_model['ali_1688buy_index']
27 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_taobaobuy_index'] = base_model['ali_taobaobuy_index']
28 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_1688sup_index'] = base_model['ali_1688sup_index']
29 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_taobao_needsfore'] = base_model['ali_taobao_needsfore']
30 |
31 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_newbuyer'] = base_model['ali_newbuyer']
32 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_buyer_puchaseQty'] = base_model['ali_buyer_puchaseQty']
33 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_buyer_taobao'] = base_model['ali_buyer_taobao']
34 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_buyer_taobao_grade'] = base_model['ali_buyer_taobao_grade']
35 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_buyer_unitprice'] = base_model['ali_buyer_unitprice']
36 | return ali_model
37 |
38 | def get_null_ali_model_by_area(area_code):
39 | ali_model = dict()
40 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_1688buy_index'] = ''
41 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_taobaobuy_index'] = ''
42 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_1688sup_index'] = ''
43 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_taobao_needsfore'] = ''
44 |
45 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_newbuyer'] = ''
46 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_buyer_puchaseQty'] = ''
47 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_buyer_taobao'] = ''
48 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_buyer_taobao_grade'] = ''
49 | ali_model[setting.HBASE_ALI_FAM + ':' + area_code + '_' + 'ali_buyer_unitprice'] = ''
50 | return ali_model
51 |
52 | def get_null_ali_model():
53 | ali_model = dict()
54 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_1688buy_index'] = ''
55 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_taobaobuy_index'] = ''
56 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_1688sup_index'] = ''
57 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_taobao_needsfore'] = ''
58 |
59 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_newbuyer'] = ''
60 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_buyer_puchaseQty'] = ''
61 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_buyer_taobao'] = ''
62 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_buyer_taobao_grade'] = ''
63 | ali_model[setting.HBASE_ALI_FAM + ':' + 'ali_buyer_unitprice'] = ''
64 | return ali_model
65 |
--------------------------------------------------------------------------------
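
The four builders repeat the same nine column names. A compact equivalent that produces the same key layout (build_ali_columns is a hypothetical helper, not part of the module):

    ALI_FIELDS = ('ali_1688buy_index', 'ali_taobaobuy_index', 'ali_1688sup_index',
                  'ali_taobao_needsfore', 'ali_newbuyer', 'ali_buyer_puchaseQty',
                  'ali_buyer_taobao', 'ali_buyer_taobao_grade', 'ali_buyer_unitprice')

    def build_ali_columns(base_model=None, area_code=None):
        # area_code=None gives the plain 'fam_ali:<field>' keys; base_model=None gives the null variants
        prefix = setting.HBASE_ALI_FAM + ':' + (area_code + '_' if area_code else '')
        return dict((prefix + f, base_model[f] if base_model is not None else '') for f in ALI_FIELDS)
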
/dama/damatu/damatuWeb.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | import hashlib
4 | import urllib
5 | import json
6 | import base64
7 | import urllib2
8 |
9 |
10 | def md5str(s):  # md5-hash a string
11 |     m = hashlib.md5(s.encode("utf-8"))
12 |     return m.hexdigest()
13 |
14 |
15 | def md5(byte):  # md5-hash raw bytes
16 | return hashlib.md5(byte).hexdigest()
17 |
18 |
19 | class DamatuApi():
20 | ID = '43311'
21 | KEY = 'd191c0fd4d6f1957067350f171409441'
22 | HOST = 'http://api.dama2.com:7766/app/'
23 |
24 | def __init__(self, username, password):
25 | self.username = username
26 | self.password = password
27 |
28 | def getSign(self, param=b''):
29 |         return md5(bytearray(self.KEY, "utf8") + bytearray(self.username, "utf8") + param)[:8]
30 |
31 | def getPwd(self):
32 | return md5str(self.KEY + md5str(md5str(self.username) + md5str(self.password)))
33 |
34 | def post(self, path, params={}):
35 | data = urllib.urlencode(params).encode('utf-8')
36 | url = self.HOST + path
37 | req = urllib2.Request(url, data)
38 | return urllib2.urlopen(req).read()
39 |
40 |     # Query balance: a non-negative return value is the balance; a negative one is an error code
41 |     def getBalance(self):
42 |         data = {'appID': self.ID,
43 |                 'user': self.username,
44 |                 'pwd': self.getPwd(),
45 |                 'sign': self.getSign()
46 |                 }
47 |         res = self.post('d2Balance', data)
48 |         # urlopen().read() already returns a utf-8 byte string under Python 2
49 |         jres = json.loads(res)
50 |         if jres['ret'] == 0:
51 |             return jres['balance']
52 |         else:
53 |             return jres['ret']
54 |
55 |     # Upload a captcha. fdata: raw image bytes; type: captcha type, see http://wiki.dama2.com/index.php?n=ApiDoc.Pricedesc. Returns the answer on success or a negative error code.
56 |     def decode(self, fdata, type):
57 |         filedata = base64.b64encode(fdata)
58 |         data = {'appID': self.ID,
59 |                 'user': self.username,
60 |                 'pwd': self.getPwd(),
61 |                 'type': type,
62 |                 'fileDataBase64': filedata,
63 |                 'sign': self.getSign(fdata)
64 |                 }
65 |         res = self.post('d2File', data)
66 |         # res is already a utf-8 byte string under Python 2
67 |         jres = json.loads(res)
68 |         if jres['ret'] == 0:
69 |             # the JSON also carries ret, id, result and cookie; take what you need
70 |             return jres['result']
71 |         else:
72 |             return jres['ret']
73 |
74 |     # Decode a captcha by URL. url: image address; type: captcha type (see http://wiki.dama2.com/index.php?n=ApiDoc.Pricedesc). Returns the answer on success or a negative error code.
75 |     def decodeUrl(self, url, type):
76 |         data = {'appID': self.ID,
77 |                 'user': self.username,
78 |                 'pwd': self.getPwd(),
79 |                 'type': type,
80 |                 'url': urllib.quote(url),
81 |                 'sign': self.getSign(url.encode("utf-8"))
82 |                 }
83 |         res = self.post('d2Url', data)
84 |         # res is already a utf-8 byte string under Python 2
85 |         jres = json.loads(res)
86 |         if jres['ret'] == 0:
87 |             # the JSON also carries ret, id, result and cookie; take what you need
88 |             return jres['result']
89 |         else:
90 |             return jres['ret']
91 |
92 |     # Report a wrong answer. id (string) comes from an earlier decode result. Returns 0 on success, otherwise an error code.
93 |     def reportError(self, id):
94 |         data = {'appID': self.ID,
95 |                 'user': self.username,
96 |                 'pwd': self.getPwd(),
97 |                 'id': id,
98 |                 'sign': self.getSign(id.encode("utf-8"))
99 |                 }
100 |         res = self.post('d2ReportError', data)
101 |         # res is already a utf-8 byte string under Python 2
102 |         jres = json.loads(res)
103 |         return jres['ret']
104 |
105 |
106 | # Usage:
107 | # 1. Instantiate with your dama2 account name and password
108 | dmt = DamatuApi("iamDW", "maosu1989")
109 | # 2. Call methods:
110 | # print(dmt.getBalance())  # query the balance
111 | # print(dmt.decode(open('0349.bmp', 'rb').read(), 200))  # upload a captcha; decode takes raw bytes
112 | # print(dmt.decodeUrl('http://captcha.qq.com/getimage?aid=549000912&r=0.7257105156128585&uin=3056517021', 200))  # decode by URL
113 | # print(dmt.reportError('894657096'))  # report a wrong answer
114 |
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | from baidu_index.baidu_web_driver import baidu_web_driver_pool
5 | from ali_index.ali_web_driver import ali_web_driver_pool
6 | import setting
7 | from multiprocessing.dummy import Pool as ThreadPool
8 | import traceback
9 | from baidu_index.baidu_web_driver.baidu_source import get_baidu_source
10 | from ali_index.ali_web_driver.ali_source import get_ali_source
11 | import json
12 | from model.baidu.baidu_index_model import get_baidu_model, get_null_baidu_model
13 | from model.ali.ali_index_model import get_ali_model, get_null_ali_model
14 | import csv
15 | from hbase.hbase_module import insert
16 | import time
17 |
18 | reload(sys)
19 | sys.setdefaultencoding('utf-8')
20 |
21 |
22 | def init_pool():
23 |     print 'Initializing the Baidu index crawler module'
24 |     '''
25 |     Build the webdriver pool used for Baidu index searches
26 |     '''
27 |     baidu_driver_pool = baidu_web_driver_pool.get_baidu_web_driver_pool(setting.DRIVER_POOL_SIZE,
28 |                                                                         setting.BAIDU_USERNAME,
29 |                                                                         setting.BAIDU_PASSWORD)
30 |     print 'Initializing the Ali index crawler module'
31 |     '''
32 |     Build the webdriver pool used for Ali index searches
33 |     '''
34 |     ali_driver_pool = ali_web_driver_pool.get_ali_web_driver_pool(setting.DRIVER_POOL_SIZE)
35 |
36 |     return baidu_driver_pool, ali_driver_pool
37 |
38 |
39 | '''
40 | Read the keyword rows that need to be crawled
41 | '''
42 | def keywords():
43 |     reader = csv.reader(open('keywords_data.csv', 'rb'))
44 | for line in reader:
45 | yield line
46 |
47 |
48 | '''
49 | Fetch the index details for one keyword row
50 | '''
51 | def get_index(baidu_driver_pool_temp, ali_driver_pool_temp, keyword_line):
52 | try:
53 | keyword = keyword_line[3].decode('gbk').strip()
54 | final_result = dict()
55 | for area_code in setting.AREA_LOCATION.iterkeys():
56 |
57 |             print keyword + ';' + setting.AREA_REFLECTION[area_code] + ': Baidu index search started'
58 |
59 |             need_search_top = setting.HBASE_BAIDU_FAM + ':' + 'baidu_prov_1' not in final_result
60 |             result_baidu = get_baidu_source(baidu_driver_pool_temp, keyword, area_code,
61 |                                             need_search_top)  # the flag avoids re-scraping the region rankings for every area
62 | if result_baidu is None:
63 | final_result = dict(final_result, **get_null_baidu_model(area_code, need_search_top))
64 | else:
65 | final_result = dict(final_result,
66 | **get_baidu_model(area_code, result_baidu, need_search_top))
67 |
68 |             print keyword + ';' + setting.AREA_REFLECTION[area_code] + ': Baidu index search finished'
69 |
70 |         print keyword + ': Ali index search started'
71 |
72 | result_ali = get_ali_source(ali_driver_pool_temp, keyword)
73 | if result_ali is None:
74 | final_result = dict(final_result, **get_null_ali_model())
75 | else:
76 | final_result = dict(final_result, **get_ali_model(result_ali))
77 |
78 |         print keyword + ': Ali index search finished'
79 |
80 | final_result[setting.HBASE_INDEX_BASE_FAM + ':' + 'crawl_key'] = keyword
81 | final_result[setting.HBASE_INDEX_BASE_FAM + ':' + 'industry_name'] = keyword_line[2].decode('gbk')
82 | final_result[setting.HBASE_INDEX_BASE_FAM + ':' + 'industry_name_big'] = keyword_line[1].decode('gbk')
83 |
84 | datakey = keyword_line[0] + '_' + time.strftime('%Y%m', time.localtime(time.time()))
85 | timestamp = int(time.strftime('%Y%m%d', time.localtime(time.time())))
86 | insert(setting.HBASE_INDEX_TABLE_NAME, datakey, final_result, timestamp)
87 |
88 | # result = json.dumps(final_result, sort_keys=True, indent=4).decode('unicode-escape')
89 | result = json.dumps(final_result).decode('unicode-escape')
90 | print result
91 |
92 | return final_result
93 | except:
94 | print '-------------------------'
95 | print traceback.format_exc()
96 | print '--------------------------'
97 |
98 |
99 | def main():
100 | try:
101 | baidu_driver_pool, ali_driver_pool = init_pool()
102 | thread_pool = ThreadPool(setting.DRIVER_POOL_SIZE)
103 | for keyword in keywords():
104 | thread_pool.apply_async(get_index, (baidu_driver_pool, ali_driver_pool, keyword))
105 | thread_pool.close()
106 | thread_pool.join()
107 | except:
108 | print traceback.format_exc()
109 | print '--------end--------'
110 |
111 |
112 | if __name__ == '__main__':
113 | main()
114 |
--------------------------------------------------------------------------------
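
A worked example of the storage convention in get_index: for a CSV row with id '12345' crawled on 2016-10-19 (a hypothetical date), the row key and HBase timestamp come out as:

    import time

    datakey = '12345' + '_' + time.strftime('%Y%m', time.localtime(time.time()))   # -> '12345_201610'
    timestamp = int(time.strftime('%Y%m%d', time.localtime(time.time())))          # -> 20161019

One row per keyword per month and one cell version per day; note the timestamp is the calendar date as an integer rather than epoch milliseconds, which is what the max_versions=31 column families in hbase_module's commented create_table call are sized for.
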
/ali_index/ali_web_driver/ali_source.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | import traceback
5 | import time
6 | from selenium.webdriver.common.keys import Keys
7 | import setting
8 |
9 | reload(sys)
10 | sys.setdefaultencoding('utf-8')
11 |
12 |
13 | def get_ali_index(web, keyword):
14 | result = None
15 | try:
16 | web.get('https://index.1688.com/alizs/keyword.htm')
17 | time.sleep(setting.SLEEP_TIME)
18 |
19 | item = dict()
20 |
21 | element = web.find_element_by_id('alizs-input')
22 | element.clear()
23 |         element.send_keys(keyword.decode('utf-8'))  # keyword is already unicode; the decode is a no-op under the setdefaultencoding hack
24 | element.send_keys(Keys.ENTER)
25 | time.sleep(setting.SLEEP_TIME)
26 |
27 | element = web.find_elements_by_css_selector('ul.page-list li a')[0]
28 | element.click()
29 | time.sleep(setting.SLEEP_TIME)
30 |
31 | # item['ali_1688buy_index'] = web.find_element_by_css_selector('p.right-detail span.highlight-red').get_attribute(
32 | # 'innerText').replace('第', '')
33 |
34 | element = web.find_element_by_css_selector('div[class=\'selected-list fd-clr list\']')
35 | item['ali_taobaobuy_index'] = element.find_element_by_css_selector('div.col-tb-purchase p').get_attribute(
36 | 'innerText').replace(',', '')
37 | item['ali_1688buy_index'] = element.find_element_by_css_selector('div.col-1688-purchase p').get_attribute(
38 | 'innerText').replace(',', '')
39 | item['ali_1688sup_index'] = element.find_element_by_css_selector('div.col-supply p').get_attribute(
40 | 'innerText').replace(',', '')
41 | item['ali_taobao_needsfore'] = element.find_element_by_css_selector('p.col-forecast').get_attribute('innerText')
42 |
43 | element = web.find_elements_by_css_selector('ul.page-list li a')[2]
44 | element.click()
45 | time.sleep(setting.SLEEP_TIME)
46 |
47 | has_first_rect = '最近30天,在您所选行业老采购商人数过少,暂不提供新/老采购商身份分布' not in web.find_element_by_css_selector(
48 | 'div.mod-identity').get_attribute('outerHTML')
49 | has_second_rect = '最近30天,在您所选行业淘宝店主采购商人数过少,暂不提供采购商的非淘宝/淘宝店主身份分布' not in web.find_element_by_css_selector(
50 | 'div.mod-identity').get_attribute('outerHTML')
51 | has_third_rect = '最近30天,在您所选行业线上交易的供应商人数过少,暂不提供采购客单价分布。' not in web.find_element_by_css_selector(
52 | 'div.mod-price').get_attribute('outerHTML')
53 |
54 | if has_first_rect and has_second_rect:
55 | elements = web.find_elements_by_css_selector('div[class=\'content detail\'] span.highlight-red')
56 | item['ali_newbuyer'] = elements[0].get_attribute('innerText')
57 | item['ali_buyer_puchaseQty'] = elements[1].get_attribute('innerText').replace('次以上', '')
58 | item['ali_buyer_taobao'] = elements[2].get_attribute('innerText')
59 | item['ali_buyer_taobao_grade'] = elements[3].get_attribute('innerText')
60 | elif has_first_rect and not has_second_rect:
61 | elements = web.find_elements_by_css_selector('div[class=\'content detail\'] span.highlight-red')
62 | item['ali_newbuyer'] = elements[0].get_attribute('innerText')
63 | item['ali_buyer_puchaseQty'] = elements[1].get_attribute('innerText').replace('次以上', '')
64 | item['ali_buyer_taobao'] = ''
65 | item['ali_buyer_taobao_grade'] = ''
66 | elif not has_first_rect and has_second_rect:
67 | elements = web.find_elements_by_css_selector('div[class=\'content detail\'] span.highlight-red')
68 | item['ali_newbuyer'] = ''
69 | item['ali_buyer_puchaseQty'] = ''
70 | item['ali_buyer_taobao'] = elements[0].get_attribute('innerText')
71 | item['ali_buyer_taobao_grade'] = elements[1].get_attribute('innerText')
72 | else:
73 | item['ali_newbuyer'] = ''
74 | item['ali_buyer_puchaseQty'] = ''
75 | item['ali_buyer_taobao'] = ''
76 | item['ali_buyer_taobao_grade'] = ''
77 |
78 | if has_third_rect:
79 | element = web.find_element_by_css_selector('div[class=\'obj-right obj-analyse\'] span.highlight-red')
80 | item['ali_buyer_unitprice'] = element.get_attribute('innerText')
81 | else:
82 | item['ali_buyer_unitprice'] = ''
83 |
84 | result = item
85 | except:
86 | print traceback.format_exc()
87 | # web.save_screenshot('pic_temp/screen_ali.png')
88 | finally:
89 | return result
90 |
91 |
92 | def get_ali_source(ali_driver_pool, keyword):
93 | if not ali_driver_pool:
94 | return None
95 | web = ali_driver_pool.get()
96 | source = get_ali_index(web, keyword)
97 | ali_driver_pool.put(web)
98 | return source
99 |
100 |
101 | def get_page_source(web):
102 | return web.execute_script("return document.documentElement.outerHTML")
103 |
--------------------------------------------------------------------------------
/model/baidu/baidu_index_model.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | import setting
5 |
6 | reload(sys)
7 | sys.setdefaultencoding('utf-8')
8 |
9 |
10 | def get_baidu_model(area_code, base_model, need_top):
11 | baidu_model = dict()
12 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + area_code + '_' + 'baidu_overa_index'] = base_model['baidu_overa_index']
13 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + area_code + '_' + 'baidu_mbl_index'] = base_model['baidu_mbl_index']
14 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + area_code + '_' + 'baidu_overa_yearbase'] = base_model['baidu_overa_yearbase']
15 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + area_code + '_' + 'baidu_overa_chain'] = base_model['baidu_overa_chain']
16 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + area_code + '_' + 'baidu_mbl_yearbase'] = base_model['baidu_mbl_yearbase']
17 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + area_code + '_' + 'baidu_mbl_chain'] = base_model['baidu_mbl_chain']
18 |
19 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + area_code + '_' + 'baidu_avg_7'] = base_model['baidu_avg_7']
20 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + area_code + '_' + 'baidu_avg_30'] = base_model['baidu_avg_30']
21 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + area_code + '_' + 'baidu_avg_90'] = base_model['baidu_avg_90']
22 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + area_code + '_' + 'baidu_avg_180'] = base_model['baidu_avg_180']
23 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + area_code + '_' + 'baidu_avg_all'] = base_model['baidu_avg_all']
24 |
25 | if need_top:
26 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_prov_1'] = base_model['baidu_prov_1']
27 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_prov_2'] = base_model['baidu_prov_2']
28 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_prov_3'] = base_model['baidu_prov_3']
29 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_prov_4'] = base_model['baidu_prov_4']
30 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_prov_5'] = base_model['baidu_prov_5']
31 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_prov_6'] = base_model['baidu_prov_6']
32 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_prov_7'] = base_model['baidu_prov_7']
33 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_prov_8'] = base_model['baidu_prov_8']
34 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_prov_9'] = base_model['baidu_prov_9']
35 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_prov_10'] = base_model['baidu_prov_10']
36 |
37 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_area_1'] = base_model['baidu_area_1']
38 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_area_2'] = base_model['baidu_area_2']
39 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_area_3'] = base_model['baidu_area_3']
40 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_area_4'] = base_model['baidu_area_4']
41 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_area_5'] = base_model['baidu_area_5']
42 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_area_6'] = base_model['baidu_area_6']
43 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_area_7'] = base_model['baidu_area_7']
44 |
45 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_city_1'] = base_model['baidu_city_1']
46 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_city_2'] = base_model['baidu_city_2']
47 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_city_3'] = base_model['baidu_city_3']
48 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_city_4'] = base_model['baidu_city_4']
49 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_city_5'] = base_model['baidu_city_5']
50 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_city_6'] = base_model['baidu_city_6']
51 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_city_7'] = base_model['baidu_city_7']
52 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_city_8'] = base_model['baidu_city_8']
53 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_city_9'] = base_model['baidu_city_9']
54 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_city_10'] = base_model['baidu_city_10']
55 | return baidu_model
56 |
57 | def get_null_baidu_model(area_code, need_top):
58 | baidu_model = dict()
59 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + area_code + '_' + 'baidu_overa_index'] = ''
60 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + area_code + '_' + 'baidu_mbl_index'] = ''
61 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + area_code + '_' + 'baidu_overa_yearbase'] = ''
62 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + area_code + '_' + 'baidu_overa_chain'] = ''
63 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + area_code + '_' + 'baidu_mbl_yearbase'] = ''
64 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + area_code + '_' + 'baidu_mbl_chain'] = ''
65 |
66 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + area_code + '_' + 'baidu_avg_7'] = ''
67 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + area_code + '_' + 'baidu_avg_30'] = ''
68 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + area_code + '_' + 'baidu_avg_90'] = ''
69 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + area_code + '_' + 'baidu_avg_180'] = ''
70 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + area_code + '_' + 'baidu_avg_all'] = ''
71 |
72 | if need_top:
73 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_prov_1'] = ''
74 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_prov_2'] = ''
75 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_prov_3'] = ''
76 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_prov_4'] = ''
77 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_prov_5'] = ''
78 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_prov_6'] = ''
79 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_prov_7'] = ''
80 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_prov_8'] = ''
81 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_prov_9'] = ''
82 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_prov_10'] = ''
83 |
84 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_area_1'] = ''
85 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_area_2'] = ''
86 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_area_3'] = ''
87 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_area_4'] = ''
88 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_area_5'] = ''
89 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_area_6'] = ''
90 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_area_7'] = ''
91 |
92 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_city_1'] = ''
93 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_city_2'] = ''
94 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_city_3'] = ''
95 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_city_4'] = ''
96 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_city_5'] = ''
97 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_city_6'] = ''
98 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_city_7'] = ''
99 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_city_8'] = ''
100 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_city_9'] = ''
101 | baidu_model[setting.HBASE_BAIDU_FAM + ':' + 'baidu_city_10'] = ''
102 | return baidu_model
103 |
--------------------------------------------------------------------------------
/baidu_index/baidu_web_driver/baidu_source.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import sys
4 | from selenium.webdriver.common.keys import Keys
5 | import time
6 | import traceback
7 | import setting
8 | from dama.damatu.damatuWeb import dmt
9 | from PIL import Image
10 | from tesseract_ocr.tesseract_ocr_module import get_vcode_by_img_0, get_vcode_by_img_1
11 | import os
12 | from selenium.webdriver.common.action_chains import ActionChains
13 | import copy
14 | from base.send_email import sendEmail
15 |
16 | reload(sys)
17 | sys.setdefaultencoding('utf-8')
18 |
19 |
20 | def get_baidu_index(web, keyword, area_code, need_search_top_area):
21 | result = None
22 | try:
23 | web.get('http://index.baidu.com')
24 | # web.save_screenshot('pic_temp/screen_baidu.png')
25 | element = web.find_element_by_id('schword')
26 | element.clear()
27 |         element.send_keys(keyword.decode('utf-8'))  # keyword is already unicode; the decode is a no-op under the setdefaultencoding hack
28 | element.send_keys(Keys.ENTER)
29 | time.sleep(setting.SLEEP_TIME * 3)
30 |
31 |         if '请输入验证码' in get_page_source(web):
32 |             print 'Captcha required: saving a screenshot and stopping this run'
33 |             web.save_screenshot('pic_temp/need_authcode_%s.png' % (str(time.time()),))
34 |             sendEmail('%s;-1' % (keyword,))
35 |             os._exit(0)  # the session is burned once a captcha appears; stop the whole process
36 |
37 |
38 |         # region captcha solving, temporarily disabled
39 | '''
40 | element = web.find_element_by_css_selector('input.verifyInput')
41 | element.clear()
42 |
43 | authcode = _get_authcode(web)
44 | if 'ERROR' in authcode:
45 | continue
46 |
47 | element.send_keys(authcode)
48 |
49 | element = web.find_element_by_css_selector('a.tang-dialog-button')
50 | element.click()
51 |
52 | time.sleep(setting.SLEEP_TIME)
53 | '''
54 | # endregion
55 | if '未被收录,如要查看相关数据,您需要购买创建新词的权限' in get_page_source(web):
56 |             print 'Keyword ' + keyword + ' is not indexed by Baidu'
57 | return result
58 |
59 |         # region select the query region
60 | if area_code != 'qg_qg':
61 | area = setting.AREA_REFLECTION[area_code].split('_')
62 | element = web.find_element_by_css_selector('div#compOtharea div.comCtl span.holdBox')
63 | element.click()
64 | time.sleep(setting.SLEEP_TIME * 0.6)
65 |
66 | element = web.find_elements_by_css_selector('div#compOtharea div.sltOpt')[0].find_element_by_xpath(
67 | '//a[contains(.,\'%s\')]' % (area[0],))
68 | element.click()
69 | time.sleep(setting.SLEEP_TIME * 0.6)
70 |
71 | element = web.find_elements_by_css_selector('div#compOtharea div.sltOpt')[1].find_element_by_xpath(
72 | '//a[contains(.,\'%s\')]' % (area[1],))
73 | element.click()
74 | time.sleep(setting.SLEEP_TIME * 0.6)
75 |
76 | # web.save_screenshot('pic_temp/screen_baidu.png')
77 | # endregion
78 |
79 | element = web.find_element_by_css_selector('a.gColor1')
80 | element.click()
81 |         time.sleep(setting.SLEEP_TIME * 2)  # give the index page time to render
82 |
83 | item = dict()
84 |
85 | elements = web.find_elements_by_css_selector('span.ftlwhf')
86 |
87 | item['baidu_overa_index'] = _get_value_by_ocr(web, elements[6])
88 | item['baidu_mbl_index'] = _get_value_by_ocr(web, elements[7])
89 |
90 | item['baidu_overa_yearbase'] = get_value(elements[8])
91 | item['baidu_overa_chain'] = get_value(elements[9])
92 |
93 | item['baidu_mbl_yearbase'] = get_value(elements[10])
94 | item['baidu_mbl_chain'] = get_value(elements[11])
95 |
96 | element = web.find_element_by_css_selector('label#trend-meanline')
97 | element.click()
98 | time.sleep(setting.SLEEP_TIME)
99 |
100 | # region avg_7 ---------------------------------
101 | element = web.find_element_by_css_selector('a[rel=\'7\']')
102 | element.click()
103 | time.sleep(setting.SLEEP_TIME)
104 |
105 | element = web.find_elements_by_css_selector('rect[x=\'45\']')[0]
106 | element.click()
107 | time.sleep(setting.BAIDU_AVG_SLEEP_TIME)
108 |
109 | element = web.find_element_by_css_selector('div.contentWord')
110 | location = copy.copy(element.location)
111 | size = copy.copy(element.size)
112 | pic_name = 'pic_temp/element_%s.png' % (str(time.time()),)
113 | web.save_screenshot(pic_name)
114 |
115 | left = location['x'] + 2
116 | top = location['y'] + 2
117 | right = left + size['width'] - 4
118 | bottom = top + size['height'] - 4
119 |
120 | with Image.open(pic_name) as im:
121 | im = im.crop((left, top, right, bottom))
122 | im.save(pic_name)
123 | item['baidu_avg_7'] = get_vcode_by_img_0(im)
124 | if os.path.exists(pic_name) and item['baidu_avg_7'] != '':
125 | os.remove(pic_name)
126 | # endregion ---------------------------------
127 |
128 | # region avg_30 ---------------------------------
129 | element = web.find_element_by_css_selector('a[rel=\'30\']')
130 | element.click()
131 | time.sleep(setting.SLEEP_TIME)
132 |
133 | ActionChains(web).move_to_element(element).perform()
134 | time.sleep(setting.SLEEP_TIME)
135 |
136 | element = web.find_elements_by_css_selector('rect[x=\'45\']')[0]
137 | element.click()
138 | time.sleep(setting.BAIDU_AVG_SLEEP_TIME)
139 |
140 | element = web.find_element_by_css_selector('div.contentWord')
141 | location = copy.copy(element.location)
142 | size = copy.copy(element.size)
143 | pic_name = 'pic_temp/element_%s.png' % (str(time.time()),)
144 | web.save_screenshot(pic_name)
145 |
146 | left = location['x'] + 2
147 | top = location['y'] + 2
148 | right = left + size['width'] - 4
149 | bottom = top + size['height'] - 4
150 |
151 | with Image.open(pic_name) as im:
152 | im = im.crop((left, top, right, bottom))
153 | im.save(pic_name)
154 | item['baidu_avg_30'] = get_vcode_by_img_0(im)
155 | if os.path.exists(pic_name) and item['baidu_avg_30'] != '':
156 | os.remove(pic_name)
157 | # endregion ---------------------------------
158 |
159 | # region avg_90 ---------------------------------
160 | element = web.find_element_by_css_selector('a[rel=\'90\']')
161 | element.click()
162 | time.sleep(setting.SLEEP_TIME)
163 |
164 | ActionChains(web).move_to_element(element).perform()
165 | time.sleep(setting.SLEEP_TIME)
166 |
167 | element = web.find_elements_by_css_selector('rect[x=\'45\']')[0]
168 | element.click()
169 | time.sleep(setting.BAIDU_AVG_SLEEP_TIME)
170 |
171 | element = web.find_element_by_css_selector('div.contentWord')
172 | location = copy.copy(element.location)
173 | size = copy.copy(element.size)
174 | pic_name = 'pic_temp/element_%s.png' % (str(time.time()),)
175 | web.save_screenshot(pic_name)
176 |
177 | left = location['x'] + 2
178 | top = location['y'] + 2
179 | right = left + size['width'] - 4
180 | bottom = top + size['height'] - 4
181 |
182 | with Image.open(pic_name) as im:
183 | im = im.crop((left, top, right, bottom))
184 | im.save(pic_name)
185 | item['baidu_avg_90'] = get_vcode_by_img_0(im)
186 | if os.path.exists(pic_name) and item['baidu_avg_90'] != '':
187 | os.remove(pic_name)
188 | # endregion ---------------------------------
189 |
190 | # region avg_180 ---------------------------------
191 | element = web.find_element_by_css_selector('a[rel=\'180\']')
192 | element.click()
193 | time.sleep(setting.SLEEP_TIME)
194 |
195 | ActionChains(web).move_to_element(element).perform()
196 | time.sleep(setting.SLEEP_TIME)
197 |
198 | element = web.find_elements_by_css_selector('rect[x=\'45\']')[0]
199 | element.click()
200 | time.sleep(setting.BAIDU_AVG_SLEEP_TIME)
201 |
202 | element = web.find_element_by_css_selector('div.contentWord')
203 | location = copy.copy(element.location)
204 | size = copy.copy(element.size)
205 | pic_name = 'pic_temp/element_%s.png' % (str(time.time()),)
206 | web.save_screenshot(pic_name)
207 |
208 | left = location['x'] + 2
209 | top = location['y'] + 2
210 | right = left + size['width'] - 4
211 | bottom = top + size['height'] - 4
212 |
213 | with Image.open(pic_name) as im:
214 | im = im.crop((left, top, right, bottom))
215 | im.save(pic_name)
216 | item['baidu_avg_180'] = get_vcode_by_img_0(im)
217 | if os.path.exists(pic_name) and item['baidu_avg_180'] != '':
218 | os.remove(pic_name)
219 | # endregion ---------------------------------
220 |
221 | # region avg_all ---------------------------------
222 | element = web.find_element_by_css_selector('a[rel=\'all\']')
223 | element.click()
224 | time.sleep(setting.SLEEP_TIME)
225 |
226 | ActionChains(web).move_to_element(element).perform()
227 | time.sleep(setting.SLEEP_TIME)
228 |
229 | element = web.find_elements_by_css_selector('rect[x=\'45\']')[0]
230 | element.click()
231 | time.sleep(setting.BAIDU_AVG_SLEEP_TIME)
232 |
233 | element = web.find_element_by_css_selector('div.contentWord')
234 | location = copy.copy(element.location)
235 | size = copy.copy(element.size)
236 | pic_name = 'pic_temp/element_%s.png' % (str(time.time()),)
237 | web.save_screenshot(pic_name)
238 |
239 | left = location['x'] + 2
240 | top = location['y'] + 2
241 | right = left + size['width'] - 4
242 | bottom = top + size['height'] - 4
243 |
244 | with Image.open(pic_name) as im:
245 | im = im.crop((left, top, right, bottom))
246 | im.save(pic_name)
247 | item['baidu_avg_all'] = get_vcode_by_img_0(im)
248 | if os.path.exists(pic_name) and item['baidu_avg_all'] != '':
249 | os.remove(pic_name)
250 | # endregion ---------------------------------
251 |
252 | if need_search_top_area:
253 | element = web.find_elements_by_css_selector('table#subNav td')[3].find_element_by_css_selector('a')
254 | element.click()
255 | time.sleep(setting.SLEEP_TIME * 3.5)
256 |
257 | element = web.find_element_by_css_selector('div.grpArea svg text[text-anchor=\'middle\']')
258 | if 'display: none;' not in element.get_attribute('style'):
259 | item['baidu_prov_1'] = ''
260 | item['baidu_prov_2'] = ''
261 | item['baidu_prov_3'] = ''
262 | item['baidu_prov_4'] = ''
263 | item['baidu_prov_5'] = ''
264 | item['baidu_prov_6'] = ''
265 | item['baidu_prov_7'] = ''
266 | item['baidu_prov_8'] = ''
267 | item['baidu_prov_9'] = ''
268 | item['baidu_prov_10'] = ''
269 |
270 | item['baidu_area_1'] = ''
271 | item['baidu_area_2'] = ''
272 | item['baidu_area_3'] = ''
273 | item['baidu_area_4'] = ''
274 | item['baidu_area_5'] = ''
275 | item['baidu_area_6'] = ''
276 | item['baidu_area_7'] = ''
277 |
278 | item['baidu_city_1'] = ''
279 | item['baidu_city_2'] = ''
280 | item['baidu_city_3'] = ''
281 | item['baidu_city_4'] = ''
282 | item['baidu_city_5'] = ''
283 | item['baidu_city_6'] = ''
284 | item['baidu_city_7'] = ''
285 | item['baidu_city_8'] = ''
286 | item['baidu_city_9'] = ''
287 | item['baidu_city_10'] = ''
288 | else:
289 | elements = web.find_elements_by_css_selector('div.items')
290 | item['baidu_prov_1'] = elements[0].find_element_by_css_selector('td.scName').get_attribute('innerText')
291 | item['baidu_prov_2'] = elements[1].find_element_by_css_selector('td.scName').get_attribute('innerText')
292 | item['baidu_prov_3'] = elements[2].find_element_by_css_selector('td.scName').get_attribute('innerText')
293 | item['baidu_prov_4'] = elements[3].find_element_by_css_selector('td.scName').get_attribute('innerText')
294 | item['baidu_prov_5'] = elements[4].find_element_by_css_selector('td.scName').get_attribute('innerText')
295 | item['baidu_prov_6'] = elements[5].find_element_by_css_selector('td.scName').get_attribute('innerText')
296 | item['baidu_prov_7'] = elements[6].find_element_by_css_selector('td.scName').get_attribute('innerText')
297 | item['baidu_prov_8'] = elements[7].find_element_by_css_selector('td.scName').get_attribute('innerText')
298 | item['baidu_prov_9'] = elements[8].find_element_by_css_selector('td.scName').get_attribute('innerText')
299 | item['baidu_prov_10'] = elements[9].find_element_by_css_selector('td.scName').get_attribute('innerText')
300 |
301 | element = web.find_elements_by_css_selector('ul.scTab li')[1]
302 | element.click()
303 | time.sleep(setting.SLEEP_TIME)
304 | elements = web.find_elements_by_css_selector('div.items')
305 | item['baidu_area_1'] = elements[0].find_element_by_css_selector('td.scName').get_attribute('innerText')
306 | item['baidu_area_2'] = elements[1].find_element_by_css_selector('td.scName').get_attribute('innerText')
307 | item['baidu_area_3'] = elements[2].find_element_by_css_selector('td.scName').get_attribute('innerText')
308 | item['baidu_area_4'] = elements[3].find_element_by_css_selector('td.scName').get_attribute('innerText')
309 | item['baidu_area_5'] = elements[4].find_element_by_css_selector('td.scName').get_attribute('innerText')
310 | item['baidu_area_6'] = elements[5].find_element_by_css_selector('td.scName').get_attribute('innerText')
311 | item['baidu_area_7'] = elements[6].find_element_by_css_selector('td.scName').get_attribute('innerText')
312 |
313 | element = web.find_elements_by_css_selector('ul.scTab li')[2]
314 | element.click()
315 | time.sleep(setting.SLEEP_TIME)
316 | elements = web.find_elements_by_css_selector('div.items')
317 | item['baidu_city_1'] = elements[0].find_element_by_css_selector('td.scName').get_attribute('innerText')
318 | item['baidu_city_2'] = elements[1].find_element_by_css_selector('td.scName').get_attribute('innerText')
319 | item['baidu_city_3'] = elements[2].find_element_by_css_selector('td.scName').get_attribute('innerText')
320 | item['baidu_city_4'] = elements[3].find_element_by_css_selector('td.scName').get_attribute('innerText')
321 | item['baidu_city_5'] = elements[4].find_element_by_css_selector('td.scName').get_attribute('innerText')
322 | item['baidu_city_6'] = elements[5].find_element_by_css_selector('td.scName').get_attribute('innerText')
323 | item['baidu_city_7'] = elements[6].find_element_by_css_selector('td.scName').get_attribute('innerText')
324 | item['baidu_city_8'] = elements[7].find_element_by_css_selector('td.scName').get_attribute('innerText')
325 | item['baidu_city_9'] = elements[8].find_element_by_css_selector('td.scName').get_attribute('innerText')
326 | item['baidu_city_10'] = elements[9].find_element_by_css_selector('td.scName').get_attribute('innerText')
327 |
328 | result = item
329 | except:
330 | if '请输入验证码' in get_page_source(web):
331 |             print 'Captcha required: saving a screenshot and stopping this run'
332 | web.save_screenshot('pic_temp/need_authcode_%s.png' % (str(time.time()),))
333 | sendEmail('%s;-1' % (keyword,))
334 | os._exit(0)
335 |
336 | print '---------------'
337 | print traceback.format_exc()
338 | print '---------------'
339 | error_pic_name = 'pic_temp/error_%s.png' % (str(time.time()),)
340 | web.save_screenshot(error_pic_name)
341 |         sendEmail('Keyword: %s\r\n%s\r\n%s' % (keyword, traceback.format_exc(), error_pic_name))
342 | finally:
343 | return result
344 |
345 |
346 | def get_value(span):
347 | str_value = span.get_attribute('innerText')
348 |
349 | i_s = span.find_elements_by_css_selector('i')
350 | for i in i_s:
351 | str_value += setting.REFLECTION[i.get_attribute('outerHTML')]
352 | return str_value
353 |
354 |
355 | def get_baidu_source(baidu_driver_pool, keyword, area_code, need_search_top_area):
356 | if not baidu_driver_pool:
357 | return None
358 | web = baidu_driver_pool.get()
359 | source = get_baidu_index(web, keyword, area_code, need_search_top_area)
360 | baidu_driver_pool.put(web)
361 | return source
362 |
363 |
364 | def get_page_source(web):
365 | return web.execute_script("return document.documentElement.outerHTML")
366 |
367 |
368 | def _get_value_by_ocr(web, element):
369 | # print element.get_attribute('outerHTML')
370 | pic_name = 'pic_temp/element_%s.png' % (str(time.time()),)
371 | web.save_screenshot(pic_name)
372 |
373 | left = element.location['x']
374 | top = element.location['y']
375 | right = left + element.size['width']
376 | bottom = top + element.size['height']
377 |
378 | with Image.open(pic_name) as im:
379 | im = im.crop((left, top, right, bottom))
380 | im.save(pic_name)
381 | vcode = get_vcode_by_img_1(im)
382 | if os.path.exists(pic_name):
383 | os.remove(pic_name)
384 | return vcode
385 |
386 |
387 | def _get_authcode(web):
388 |     print 'Solving the index-page captcha (one attempt)'
389 | web.save_screenshot('pic_temp/authcode_baidu_index.png')
390 | element = web.find_element_by_css_selector('img.verifyImg')
391 |
392 | left = element.location['x']
393 | top = element.location['y']
394 | right = left + element.size['width']
395 | bottom = top + element.size['height']
396 |
397 | with Image.open('pic_temp/authcode_baidu_index.png') as im:
398 | im = im.crop((left, top, right, bottom))
399 | im.save('pic_temp/authcode_baidu_index.png')
400 |
401 | data = _get_bytes('pic_temp/authcode_baidu_index.png')
402 | result = dmt.decode(data, 42)
403 |     print 'Index-page captcha result: ' + result
404 | return result
405 |
406 |
407 | def _get_bytes(path):
408 |     # read the whole image into a byte array for the captcha API
409 |     with open(path, 'rb') as f:
410 |         return bytearray(f.read())
411 |
--------------------------------------------------------------------------------
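
The five avg_* regions repeat the same click, screenshot, crop and OCR sequence; only the rel attribute and the result key differ (the first region simply omits the hover step, which is harmless to include). A factored sketch (_read_avg is a hypothetical helper with the same behavior):

    def _read_avg(web, rel):
        # pick the 7/30/90/180/all tab, click the first bar, then crop the tooltip and OCR it
        element = web.find_element_by_css_selector("a[rel='%s']" % rel)
        element.click()
        time.sleep(setting.SLEEP_TIME)
        ActionChains(web).move_to_element(element).perform()
        time.sleep(setting.SLEEP_TIME)
        web.find_elements_by_css_selector("rect[x='45']")[0].click()
        time.sleep(setting.BAIDU_AVG_SLEEP_TIME)
        element = web.find_element_by_css_selector('div.contentWord')
        pic_name = 'pic_temp/element_%s.png' % (str(time.time()),)
        web.save_screenshot(pic_name)
        left = element.location['x'] + 2
        top = element.location['y'] + 2
        box = (left, top, left + element.size['width'] - 4, top + element.size['height'] - 4)
        with Image.open(pic_name) as im:
            im = im.crop(box)
            im.save(pic_name)
            vcode = get_vcode_by_img_0(im)
        if os.path.exists(pic_name) and vcode != '':
            os.remove(pic_name)
        return vcode

    # inside get_baidu_index, the five regions then collapse to:
    # for rel in ('7', '30', '90', '180', 'all'):
    #     item['baidu_avg_%s' % rel] = _read_avg(web, rel)
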