# -*- coding: utf-8 -*-
# pa3.py, taken from a dump of the "python_spider" repository. The dump also
# listed:
#   README.md - title "python_spider", described as a Baidu keyword
#               ranking booster
#   sql.sql   - https://raw.githubusercontent.com/phgczm/python_spider/HEAD/sql.sql
from selenium import webdriver
import time
import requests
import random
import os
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import traceback
import urllib.request
import pymysql
import socket
#import win32api  # pip install pypiwin32

#DesiredCapabilities.INTERNETEXPLORER['ignoreProtectedModeSettings'] = True


def connect():
    """Dial the broadband connection described by ``g_adsl_account``.

    Builds ``rasdial <name> <username> <password>`` from the module-level
    ``g_adsl_account`` dict, runs it through ``os.system`` and then sleeps
    for 5 seconds.
    """
    cmd_str = "rasdial %s %s %s" % (
        g_adsl_account['name'],
        g_adsl_account['username'],
        g_adsl_account['password'],
    )
    os.system(cmd_str)
    time.sleep(5)


def disconnect():
    """Hang up the broadband connection named in ``g_adsl_account``.

    Runs ``rasdial <name> /disconnect`` through ``os.system`` and then
    sleeps for 5 seconds.
    """
    cmd_str = "rasdial %s /disconnect" % g_adsl_account['name']
    os.system(cmd_str)
    time.sleep(5)


def get_ip():
    """Fetch the current public IP and its location from ip.chinaz.com.

    Returns:
        A two-item list ``[ip, address]`` of strings sliced out of the
        response body.
    """
    # The context manager closes the response even if read()/decode() raises.
    with urllib.request.urlopen("http://ip.chinaz.com/getip.aspx") as fp:
        # urlopen() yields bytes under Python 3; decode to text before parsing.
        mystr = fp.read().decode("utf8")

    # NOTE(review): the fixed offsets below presumably expect a body shaped
    # like {ip:'1.2.3.4',address:'...'} -- confirm against the live service.
    # A body without these markers yields meaningless slices, not an error.
    ip_pos = mystr.find("ip")
    add_pos = mystr.find("address")
    ip = mystr[ip_pos + 4:add_pos - 2]
    address = mystr[add_pos + 9:-2]
    return [ip, address]


# Insert the IP record into the database
def insert_db(ipdate):
    """Insert one run record into table ``ip_log`` of database ``zongzong``.

    Args:
        ipdate: Sequence of seven values in column order
            ``ip, address, keyword, url, error, page, rank``. The current
            local time is added as ``created_at``. The caller's sequence is
            copied, not modified.
    """
    # NOTE(review): connection settings are hard-coded (local MySQL, user
    # root, empty password). The table presumably comes from sql.sql -- confirm.
    iso_time_format = '%Y-%m-%d %X'
    row = list(ipdate)
    row.append(time.strftime(iso_time_format, time.localtime()))

    conn = pymysql.connect(host='localhost', user='root', passwd='', port=3306, charset='utf8')
    try:
        cur = conn.cursor()
        try:
            cur.execute("USE zongzong")
            # `rank` is backtick-quoted: RANK is a reserved word in MySQL 8.0+.
            cur.execute(
                "INSERT INTO ip_log(ip,address,keyword,url,error,page,`rank`,created_at) VALUES(%s, %s, %s, %s, %s, %s, %s, %s)",
                row,
            )
        finally:
            cur.close()
        conn.commit()
    finally:
        # Always release the connection, even when a statement fails.
        conn.close()


def get_search_url(driver):
    """Collect the clickable landing pages on the current result page.

    Every ``<a class="c-showurl">`` inside ``div#content_left`` is resolved
    with an HTTP HEAD request. Links whose resolved URL contains any entry
    of ``out_urls`` are skipped.

    Args:
        driver: Selenium WebDriver positioned on a search result page.

    Returns:
        ``[real_url, click_link]``: two parallel lists holding the resolved
        landing URLs and the matching ``<a>`` elements.
    """
    real_url = []
    click_link = []
    content = driver.find_element_by_css_selector("div[id=\"content_left\"]")
    for link in content.find_elements_by_tag_name("a"):
        if link.get_attribute('class') != "c-showurl":
            continue
        url = link.get_attribute('href')

        # The href is a redirect link; the real address is in the Location
        # header. Fall back to the href itself when there is no redirect
        # instead of raising KeyError.
        landing = requests.head(url).headers.get('location', url)
        if any(out_url in landing for out_url in out_urls):
            continue

        real_url.append(landing)
        click_link.append(link)

    return [real_url, click_link]


def get_urlIndex(tagurl, urls):
    """Return the index of the first item of ``urls`` containing ``tagurl``.

    Args:
        tagurl: Substring to look for.
        urls: Iterable of URL strings.

    Returns:
        Zero-based position of the first match, or ``-1`` if none matches.
    """
    for position, url in enumerate(urls):
        if tagurl in url:
            return position
    return -1


def click_nextBtn(driver):
    """Click the "next page" link in the pager below the search results.

    Args:
        driver: Selenium WebDriver positioned on a search result page.

    Returns:
        The same ``driver`` object.
    """
    pager = driver.find_element_by_css_selector("div[id=\"page\"]")
    for item in pager.find_elements_by_tag_name("a"):
        print(item.text)
        if item.text == "下一页>":
            item.click()
            # The click navigates away; stop before touching stale elements.
            break

    return driver


def click_search_url(driver, items):
    """Click selected results, dwell on each, then close the opened windows.

    Args:
        driver: Selenium WebDriver positioned on a search result page.
        items: Collection of zero-based positions, counted over the
            ``c-showurl`` links of the page, that should be clicked.
    """
    content = driver.find_element_by_css_selector("div[id=\"content_left\"]")
    links = content.find_elements_by_tag_name("a")
    # Remember the result window so we can come back to it.
    nowhandle = driver.current_window_handle

    # NOTE(review): indentation was lost in the source dump. The counter is
    # assumed to advance once per c-showurl link, matching the indices built
    # by get_search_url -- confirm against the original file.
    i = 0
    for link in links:
        if link.get_attribute('class') != "c-showurl":
            continue

        if i in items:
            print("随机点击item:", i)
            print(link.get_attribute('href'), link.text)
            link.click()
            # Stay on the clicked page for a while.
            time.sleep(random.randint(5, 10))

            # Close every window other than the original result window.
            for handle in driver.window_handles:
                if handle != nowhandle:
                    print("切换到当前窗口")
                    driver.switch_to_window(handle)
                    print("title:", driver.title)
                    driver.close()

            # Return to the original window.
            print("切换到原来的窗口")
            driver.switch_to_window(nowhandle)
            print("title:", driver.title)
            print("本次随机点击完毕!")

        i = i + 1


def get_random_index(index, len):
    """Choose which result positions to click before the target.

    Args:
        index: Position of the target in the result list; negative when the
            target is not on the page.
        len: Number of results on the page. The name shadows the builtin
            but is kept for interface compatibility.

    Returns:
        A list of zero-based positions:
        * ``index >= 8``: one position in 0..4 and one in 5..8
        * ``4 <= index < 8``: one position in 0..3 and one in 3..index
        * ``0 <= index < 4``: ``[index]``
        * target absent: one position inside the valid range ``0..len-1``
          (from 5 upwards when there are more than 5 results), or ``[]``
          when the page has no results.
    """
    if index >= 8:
        return [random.randint(0, 4), random.randint(5, 8)]
    if index >= 4:
        return [random.randint(0, 3), random.randint(3, index)]
    if index >= 0:
        return [index]

    # Target not found. randint() is inclusive, so the upper bound must be
    # len - 1 to stay inside the result list.
    if len <= 0:
        return []
    if len <= 5:
        return [random.randint(0, len - 1)]
    return [random.randint(5, len - 1)]


def getUA():
    """Return one user-agent string picked at random from the built-in list."""
    uaList = [
        # 360 browser
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36",
        # chrome
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36",
        #"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36",

        # firefox
        #"Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0",
        "Mozilla/5.0 (Windows NT 6.3; WOW64; rv:36.0) Gecko/20100101 Firefox/36.0",

        # ie11
        #"Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
        # ie8
        #"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; 4399Box.1357; 4399Box.1253; 4399Box.1357)",

        # 2345 Explorer
        #"Chrome/39.0.2171.99 Safari/537.36 2345Explorer/6.5.0.11018",

        # Sogou
        #"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 Safari/537.36 SE 2.X MetaSr 1.0",
        # opera
        "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
    ]
    return random.choice(uaList)


def getWindowSize():
    """Return a random browser window size as ``[width, height]``."""
    wind_size = [
        [1920, 1080],
        [1600, 900],
        [1280, 720],
    ]
    return random.choice(wind_size)


def setDisplay():
    """Switch the Windows display to a randomly chosen resolution."""
    # Imported locally: the file-level import is commented out, so the
    # original body raised NameError. This also keeps the module importable
    # on machines without pywin32.
    import win32api

    display_size = [
        [1920, 1080],
        [1680, 1050],
        [1600, 900],
        [1440, 900],
        [1400, 1050],
    ]
    d_size = random.choice(display_size)

    dm = win32api.EnumDisplaySettings(None, 0)
    dm.PelsWidth = d_size[0]
    dm.PelsHeight = d_size[1]
    dm.BitsPerPel = 32
    dm.DisplayFixedOutput = 0
    win32api.ChangeDisplaySettings(dm, 0)


# Dial-up account used by connect()/disconnect().
# NOTE(review): credentials are hard-coded; consider loading them from the
# environment or a config file kept out of version control.
g_adsl_account = {
    "name": "宽带连接",
    "username": "19ab68",
    "password": "643534",
}


# Addresses that must never be clicked (competitors).
out_urls = [
    'www.ef43.com.cn/zhuanti/2257/',
    'www.ef43.com.cn/brands/mdm/',
    'http://money.163.com/15/0416/11/ANANRECC00253B0H.html',
]


# Targets: [landing-URL fragment, search keyword, optional decoy keywords...]
targetURL = [
    #['www.beilaikang.com','产前65天'],
    #['www.beilaikang.com','孕产妇多用防风巾'],
    #['www.beilaikang.com','产妇专用弹力网眼内裤'],
    #['www.beilaikang.com','孕产妇保暖护腹内裤'],
    ##['www.beilaikang.com','卡萨图儿童安全座椅'],
    ##['www.beilaikang.com','卡萨图安全座椅'],
    #['www.hzalbl.com','杭州品牌折扣女装加盟'],
    #['www.hzalbl.com','杭州品牌女装折扣店'],
    #['www.hzalbl.com','杭州品牌折扣女装'],
    #['www.hzalbl.com','杭州品牌女装折扣加盟'],
    #['www.hzalbl.com','杭州时尚品牌女装加盟'],
    #['www.hzalbl.com','杭州时尚精品女装'],
    #['ssjj.qq.com','腾讯生死狙击'],
    #['ssjj.qq.com','生死狙击腾讯'],
    #['ssjj.qq.com','生死狙击OL'],

    ['http://www.hkuws.com', '注册离岸公司'],
    ['zs.efu.com.cn/mornfeeit/', '梦菲雪'],
    ['zs.efu.com.cn/chengshijiaren/', '城市佳人'],
    ['www.kidsnet.cn/exposition', '童装展会'],
    #['top.kidsnet.cn/','童装加盟排行榜'],
    #['www.nynet.com.cn/','内衣网'],
    #['www.nzw.cn/','女装网'],
    ['zs.efu.com.cn/ks/', '卡索'],
    ['zs.efu.com.cn/distin-kidny/', '迪斯廷凯'],
    ['zs.efu.com.cn/fuzhuang/luyidigao/', '路易迪高童装代理'],
    ['brand.efu.com.cn/brandshow-1221090.html', '凯帝龙驰'],
    ['zs.efu.com.cn/rabbitjero/', '兔子杰罗'],
    ['zs.efu.com.cn/wmprince/', '西瓜王子'],
    ['zs.efu.com.cn/betu', '百图'],
    ['zs.efu.com.cn/pepco/', '小猪班纳'],

    #['http://news.ifeng.com/a/20160518/48795120_0.shtml','华夏信财'],
    ['http://weibo.com/huaxiafinance', '华夏信财'],
    ['http://p2p.hexun.com/2016-04-26/183531215.html', '华夏信财'],
    #['http://news.xinhuanet.com/fortune/2016-04/26/c_128932834.htm','华夏信财'],
    ['http://www.xcf.cn/gdyw/201605/t20160526_772682.htm', '华夏信财'],
    ['http://www.huaxiaoxia.com/', '华夏信财'],
    #['https://lc.huaxiafinance.com/','华夏信财'],

    ['so.tedu.cn', '网络营销培训机构'],
    ['www.cosatto.net.cn', '个性安全座椅'],
    ['www.kaihuata.com/', '开化旅游'],
    #['www.kaihuata.com/','开化'],

    #['http://hotel.elong.com/beijing/chain53.html','7天连锁酒店'],
    #['http://hotel.elong.com/beijing/','北京宾馆住宿'],
    #['http://hotel.elong.com/beijing/60101567/','北京招待所'],
    #['http://hotel.elong.com/beijing/','北京住宿价格'],
    #['http://www.elong.com/','酒店预订'],
    #['http://hotel.elong.com/shanghai/','上海订宾馆'],
    #['http://hotel.elong.com/shanghai/','上海订房'],
    #['http://hotel.elong.com/shanghai/','上海订房网'],
    #['http://hotel.elong.com/shanghai/40201958/','上海商务酒店'],
    #['http://hotel.elong.com/shanghai/30201012/','上海市宾馆'],
    #['http://hotel.elong.com/wuxi/90933644/','书香府邸酒店'],
    #['http://hotel.elong.com/hongkong/53201178/','四季酒店'],
    #['http://hotel.elong.com/wuhan/01801320/','武汉枫尚酒店公寓'],
    #['http://hotel.elong.com/wuhan/01801473/','武汉友发商务酒店'],
]


def _redial_until_online():
    """Redial the broadband link and repeat until DNS resolution works."""
    disconnect()
    connect()
    while True:
        try:
            socket.gethostbyname("baidu.com")
            return
        except OSError:
            # socket.gaierror is an OSError: the link is not up yet.
            disconnect()
            connect()


def _start_browser():
    """Launch PhantomJS with a random user agent and image loading disabled."""
    dcap = dict(DesiredCapabilities.PHANTOMJS)
    user_agent = getUA()
    print(user_agent)
    dcap["phantomjs.page.settings.userAgent"] = user_agent
    dcap["phantomjs.page.settings.loadImages"] = False
    return webdriver.PhantomJS(desired_capabilities=dcap, service_args=['--load-images=no'])


def _quit_quietly(driver):
    """Quit ``driver`` if it exists, reporting but not propagating errors."""
    if driver is None:
        return
    try:
        driver.quit()
    except Exception:
        traceback.print_exc()


def _record_run(keyword, target, status, page=-1, rank=-1):
    """Store the outcome of one run, with the current public IP, in the DB."""
    data = get_ip()
    data.extend([keyword, target, status, page, rank])
    insert_db(data)


def run_target(target_info):
    """Run one search session for a single ``targetURL`` entry and log it.

    Args:
        target_info: ``[target, keyword, *decoy_keywords]``. ``target`` is a
            substring of the landing URL to find; ``keyword`` is the query.
            If decoy keywords are present, one is searched first.
    """
    # Bound before the try block so the failure handler can always log them.
    target = target_info[0]
    keyword = target_info[1]
    driver = None
    try:
        # Get a new IP address.
        _redial_until_online()

        driver = _start_browser()
        driver.implicitly_wait(30)
        driver.delete_all_cookies()

        # Open Baidu, then pick a random window size.
        driver.get("http://www.baidu.com/")
        window_size = getWindowSize()
        driver.set_window_size(window_size[0], window_size[1])

        print('打开百度成功', driver.title)
        decoys = target_info[2:]
        error_keyword = random.choice(decoys) if decoys else None
        print(">>>>>>>>>>>>>>>点击的关键词:", keyword, "--->目标地址:", target, ">>>>>>>>>>>>>>>>>>>>")

        if error_keyword is not None:
            # Search a decoy keyword first.
            print("点击错误关键词:", error_keyword)
            driver.find_element_by_id("kw").send_keys(error_keyword)
            time.sleep(2)
            driver.find_element_by_id("su").click()
            time.sleep(5)
            driver.find_element_by_id("kw").clear()
            time.sleep(2)
            print("错误关键词点击完毕")

        driver.find_element_by_id("kw").send_keys(keyword)

        # Press the search button.
        print("...开始点击搜索按钮..")
        driver.find_element_by_id("su").click()
        print("...点击完毕..")
        time.sleep(2)

        # urls_res[0]: landing URLs, urls_res[1]: matching link elements.
        urls_res = get_search_url(driver)
        real_urls = urls_res[0]
        print("搜索出来的可点击着陆页个数:", len(real_urls))
        print(real_urls)
        index = get_urlIndex(target, real_urls)
        print("目标index:", index)

        page = 1
        while index == -1 and page <= 4:
            if page == 1:
                # Click a few results on the first page before moving on.
                items = get_random_index(index, len(real_urls))
                print(items)
                click_search_url(driver, items)

            # Next page.
            driver = click_nextBtn(driver)
            time.sleep(3)
            urls_res = get_search_url(driver)
            real_urls = urls_res[0]
            print(real_urls)
            index = get_urlIndex(target, real_urls)
            page = page + 1

        if index > 4 and page == 1:
            # Target is on the first page: click one or two other results.
            if random.randint(1, 2) == 2:
                items = get_random_index(index, len(real_urls))
            else:
                items = [1]
            print(items)
            click_search_url(driver, items)

        # Decide on the search result itself. The original tested page >= 5,
        # which also discarded a target found on page 5.
        if index == -1:
            print("没有找到目标地址,放弃搜索...")
            print("关闭浏览器")
            driver.quit()
            driver = None
            time.sleep(5)
            _record_run(keyword, target, "no_find")
            return

        print("目标在page", page, "当前排名:", index, real_urls[index])
        print("反问最后的目标页...")
        urls_res[1][index].click()
        time.sleep(5)

        nowhandle = driver.current_window_handle
        # Visit the window opened by the click, dwell on it, then close it.
        for handle in driver.window_handles:
            if handle != nowhandle:
                print("切换到当前窗口")
                driver.switch_to_window(handle)
                stime = random.randint(15, 25)
                print("目标页title:", driver.title, "停留-->", stime)
                time.sleep(stime)
                driver.close()

        # Return to the original window.
        print("切换到原来的窗口")
        driver.switch_to_window(nowhandle)
        print("title:", driver.title)

        # Print the cookies, clear them, then print again.
        print("打印cookie")
        print(driver.get_cookies())
        print("清除cookie")
        driver.delete_all_cookies()
        print("打印cookie:")
        print(driver.get_cookies())

        # Close the browser.
        print("关闭浏览器")
        time.sleep(5)
        driver.quit()
        driver = None

        # Record the run in the database.
        _record_run(keyword, target, "success", page, index)

    except Exception:
        # Report the failure, release the browser process, log the run.
        traceback.print_exc()
        _quit_quietly(driver)
        _record_run(keyword, target, "faild")


def main():
    """Process every entry of ``targetURL`` in order."""
    for target_info in targetURL:
        run_target(target_info)


if __name__ == "__main__":
    main()