├── cookie └── __init__.py ├── spider ├── __init__.py ├── test.py ├── set_cookie.py ├── step_content.py ├── study.py └── step_vedio.py ├── 2019-04-09_202248.png ├── requirments.txt └── README.md /cookie/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /spider/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /2019-04-09_202248.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/needones/study_score/HEAD/2019-04-09_202248.png -------------------------------------------------------------------------------- /requirments.txt: -------------------------------------------------------------------------------- 1 | certifi==2019.3.9 2 | selenium==3.141.0 3 | urllib3==1.24.1 4 | wincertstore==0.2 5 | -------------------------------------------------------------------------------- /spider/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | # file_name = os.path.dirname(os.path.realpath(__file__)) 4 | # file_name = os.path.dirname(os.path.realpath(file_name)) 5 | # print(file_name) 6 | 7 | -------------------------------------------------------------------------------- /spider/set_cookie.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from time import sleep 4 | 5 | from selenium import webdriver 6 | 7 | driver = webdriver.Chrome() 8 | driver.get('https://www.xuexi.cn') 9 | 10 | sleep(30) 11 | # 60s时间内登录成功,等待程序关闭即可 12 | '''cookie有效期很短,基本上每天都要重新登录,运行这个文件''' 13 | 14 | windows = driver.window_handles 15 | driver.switch_to.window(windows[1]) 16 | sleep(30) 17 | a = driver.get_cookies() 18 | file_name = os.path.dirname(os.path.realpath(__file__)) 19 | file_name = os.path.dirname(os.path.realpath(file_name)) + '\cookie' 20 | os.makedirs(file_name, exist_ok=True) 21 | dict_cookie = {} 22 | dict_cookie['data'] = a 23 | data = json.dumps(dict_cookie) 24 | with open(file_name + '\cookie.json', 'w', encoding='utf-8')as f: 25 | f.write(data) 26 | print('cookie get success') 27 | driver.quit() 28 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 谷歌浏览器+driver+python安装包(Windows版本) 2 | 3 | 链接:https://pan.baidu.com/s/1VswwG0x4jdZ07hHeiSVohg 4 | 提取码:glj8 5 | 复制这段内容后打开百度网盘手机App,操作更方便哦 6 | 7 | 8 | ## 1.配置selenium的谷歌浏览器(上面的链接有资源) 9 | 到http://npm.taobao.org/mirrors/chromedriver/ 下载对应自己谷歌浏览器版本的chromedriver( 10 | 我使用的谷歌浏览器版本号和chromedriver已上传,仅供参考 11 | 鄙人使用谷歌浏览器版本号win32_71.0.3578.80 12 | #### 下载好的chromedriver.exe复制到谷歌浏览器的根目录 13 | 并把所在的目录加入到环境变量中,win7需要重启才能生效 14 | #### 再复制一份到Python的根目录 15 | 16 | ## 2.环境安装 17 | 安装Python环境,到Python官网下载Python版本,安装加入环境变量(请查看根目录的图片) 18 | 在项目目录下,pip install -r requirments.txt 即可 19 | 20 | ## 3.运行spider目录下的set_cookie.py文件 21 | 22 | 手动点击登陆,60s内完成扫码登陆,等待程序关闭,cookie获取成功(保存登录数据) 23 | 24 | ## 4.运行(建议使用分开模块) 25 | 26 | study.py为总程序,包含文章和视频(容易报错) 27 | 28 | 建议使用分模块,step_content.py和step_vedio.py。两个程序可以同时运行,互不干扰 29 | -------------------------------------------------------------------------------- /spider/step_content.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import random 4 | from datetime import datetime 5 | from time import sleep 6 | 7 | from selenium import webdriver 8 | 9 | # driver = webdriver.Chrome() 10 | from selenium.webdriver import ActionChains 11 | from selenium.webdriver.common.keys import Keys 12 | 13 | ops = webdriver.ChromeOptions() 14 | ops.add_argument('--headless') # 无头 15 | ops.add_argument('--disable-gpu') # 禁用GPU 16 | ops.add_argument('--disable-infobars') # 关闭浏览器上方自动测试提示 17 | driver = webdriver.Chrome(options=ops) 18 | 19 | driver.get('https://www.xuexi.cn') 20 | sleep(10) 21 | # 隐式等待100s 22 | driver.implicitly_wait(20) 23 | # windows = driver.window_handles 24 | # driver.switch_to.window(windows[1]) 25 | file_name = os.path.dirname(os.path.realpath(__file__)) 26 | file_name = os.path.dirname(os.path.realpath(file_name)) + '\cookie\cookie.json' 27 | driver.delete_all_cookies() 28 | cookie = json.load(open(file_name, encoding='utf-8')) 29 | data = cookie['data'] 30 | # print(data) 31 | for i in data: 32 | driver.add_cookie(i) 33 | 34 | sleep(1) 35 | # driver.refresh() 36 | sleep(10) 37 | 38 | ''' 39 | 独立文章模块 40 | ''' 41 | # 学习理论(阅读文章12分,预计20分钟) 42 | # driver.find_element_by_xpath('//div[@class="father-nav"]/ul[3]/li[1]/a').click() 43 | driver.find_element_by_xpath('//div[@class="menu-list"]/div[1]/a[3]').click() 44 | sleep(1) 45 | windows = driver.window_handles 46 | driver.switch_to.window(windows[1]) 47 | sleep(2) 48 | # 7篇 49 | # list_content = [] 50 | for i in list(range(7)): 51 | sleep(10) 52 | # while True: 53 | # 54 | # kk = random.randint(7, 20) 55 | # if kk not in list_content: 56 | # list_content.append(kk) 57 | # break 58 | 59 | # 当时间在17点之前,新内容没有上架,点击昨日的后半部分的新文章 60 | time1 = int(datetime.now().strftime('%H')) 61 | if time1 < 16: 62 | kk = i + 7 63 | else: 64 | kk = i 65 | try: 66 | driver.find_elements_by_xpath('//div[@class="_3wnLIRcEni99IWb4rSpguK"]/div/div[1]')[kk].click() 67 | except: 68 | driver.refresh() 69 | sleep(15) 70 | driver.find_elements_by_xpath('//*[@id="root"]/div/section/div/div/div/div/section/div/div/div[1]/div/section/div/div/div/div/section/div/div/div/div[3]/section/div/div/div/div/section/div/div/div[1]/div/div/div/div[1]/span')[kk].click() 71 | 72 | sleep(3) 73 | windows = driver.window_handles 74 | driver.switch_to.window(windows[2]) 75 | key = random.randint(0, 1) 76 | for j in list(range(9)): 77 | sleep(random.randint(5, 6)) 78 | if key == 1: 79 | num = random.randint(200, 350) 80 | driver.execute_script("var q=document.documentElement.scrollTop={}".format((j + 3) * num)) 81 | 82 | sleep(random.randint(9, 13)) 83 | else: 84 | if j == 1: 85 | sleep(random.randint(10, 13)) 86 | else: 87 | ActionChains(driver).key_down(Keys.PAGE_DOWN).perform() 88 | sleep(random.randint(9, 13)) 89 | print('---第{}篇---{}0%---'.format((i + 1), (j + 1))) 90 | # 滑动到底部 91 | driver.execute_script("var q=document.documentElement.scrollTop=100000") 92 | sleep(2) 93 | ActionChains(driver).key_down(Keys.PAGE_UP).perform() 94 | sleep(5) 95 | driver.close() 96 | sleep(2) 97 | windows = driver.window_handles 98 | driver.switch_to.window(windows[1]) 99 | print('===============第%d篇结束===============' % (i + 1)) 100 | driver.close() 101 | sleep(2) 102 | # 换回首页 103 | windows = driver.window_handles 104 | driver.switch_to.window(windows[0]) 105 | 106 | # 写入最新的cookie 107 | # dict_cookie = {} 108 | # a = driver.get_cookies() 109 | # dict_cookie['data'] = a 110 | # data = json.dumps(dict_cookie) 111 | # with open(file_name, 'w', encoding='utf-8')as f: 112 | # f.write(data) 113 | print('---------恭喜你---12分到手--------') 114 | 115 | sleep(2) 116 | ActionChains(driver).key_down(Keys.PAGE_DOWN).perform() 117 | sleep(5) 118 | driver.quit() 119 | -------------------------------------------------------------------------------- /spider/study.py: -------------------------------------------------------------------------------- 1 | '''''' 2 | import json 3 | import os 4 | import random 5 | from time import sleep 6 | 7 | from selenium import webdriver 8 | 9 | # driver = webdriver.Chrome() 10 | from selenium.webdriver import ActionChains 11 | from selenium.webdriver.common.keys import Keys 12 | 13 | ops = webdriver.ChromeOptions() 14 | ops.add_argument('--headless') # 无头 15 | ops.add_argument('--disable-gpu') # 禁用GPU 16 | ops.add_argument('--disable-infobars') # 关闭浏览器上方自动测试提示 17 | driver = webdriver.Chrome(chrome_options=ops) 18 | 19 | driver.get('https://www.xuexi.cn') 20 | sleep(2) 21 | # windows = driver.window_handles 22 | # driver.switch_to.window(windows[1]) 23 | file_name = os.path.dirname(os.path.realpath(__file__)) 24 | file_name = os.path.dirname(os.path.realpath(file_name)) + '\cookie\cookie.json' 25 | driver.delete_all_cookies() 26 | cookie = json.load(open(file_name, encoding='utf-8')) 27 | data = cookie['data'] 28 | # print(data) 29 | for i in data: 30 | driver.add_cookie(i) 31 | 32 | sleep(1) 33 | driver.refresh() 34 | sleep(3) 35 | 36 | # 学习理论(阅读文章12分,预计15分钟) 37 | driver.find_element_by_xpath('//div[@class="father-nav"]/ul[3]/li[1]/a').click() 38 | sleep(1) 39 | windows = driver.window_handles 40 | driver.switch_to.window(windows[1]) 41 | sleep(2) 42 | for i in list(range(5)): 43 | sleep(2) 44 | driver.find_elements_by_xpath('//div[@class="_3wnLIRcEni99IWb4rSpguK"]/div/div[1]')[i].click() 45 | sleep(3) 46 | windows = driver.window_handles 47 | driver.switch_to.window(windows[2]) 48 | for j in list(range(6)): 49 | sleep(10) 50 | num = random.randint(200, 350) 51 | driver.execute_script("var q=document.documentElement.scrollTop={}".format((j + 1) * num)) 52 | print('第%d次滑动' % (j + 1)) 53 | sleep(20) 54 | driver.close() 55 | sleep(2) 56 | windows = driver.window_handles 57 | driver.switch_to.window(windows[1]) 58 | print('---------------第%d篇刷完-----------------' % (i + 1)) 59 | driver.close() 60 | sleep(2) 61 | 62 | # 换回首页 63 | windows = driver.window_handles 64 | driver.switch_to.window(windows[0]) 65 | 66 | # 学习电视台(观看视频12分,预计40分钟) 67 | driver.find_element_by_xpath('//div[@class="father-nav"]/ul[2]/li[2]/a').click() 68 | sleep(5) 69 | windows = driver.window_handles 70 | driver.switch_to.window(windows[1]) 71 | driver.execute_script("var q=document.documentElement.scrollTop=1000") 72 | sleep(2) 73 | 74 | # 视频区 75 | # driver.find_element_by_xpath('//div[@id="Chwgg53wi10o00"]/div/div[1]').click() 76 | try: 77 | driver.find_element_by_xpath('//*[@id="5586"]/div/div/div/div/div/section/div/div/div[1]/div[1]').click() 78 | except: 79 | driver.find_element_by_xpath('//div[@id="Chwgg53wi10o00"]/div/div[1]').click() 80 | sleep(1) 81 | windows = driver.window_handles 82 | driver.switch_to.window(windows[2]) 83 | sleep(2) 84 | list_vedio = [] 85 | for i in list(range(7)): 86 | print('第%d次视频' % (i + 1)) 87 | if i < 2: 88 | pass 89 | elif i < 4: 90 | try: 91 | driver.find_element_by_xpath('//div[@class="_1KFAyh5wHi8boHp83TMkv-"]/div/div[3]').click() 92 | sleep(2) 93 | except: 94 | try: 95 | ActionChains(driver).key_down(Keys.PAGE_UP) 96 | sleep(2) 97 | driver.find_element_by_xpath('//div[@class="_1KFAyh5wHi8boHp83TMkv-"]/div/div[3]').click() 98 | sleep(2) 99 | except: 100 | try: 101 | driver.find_element_by_xpath('//div[@class="radio_2p2eqv4lwtk00"]/div[2]').click() 102 | sleep(5) 103 | except: 104 | pass 105 | else: 106 | try: 107 | driver.find_element_by_xpath('//div[@class="_1KFAyh5wHi8boHp83TMkv-"]/div/div[4]').click() 108 | sleep(2) 109 | except: 110 | try: 111 | ActionChains(driver).key_down(Keys.PAGE_UP) 112 | sleep(2) 113 | driver.find_element_by_xpath('//div[@class="_1KFAyh5wHi8boHp83TMkv-"]/div/div[4]').click() 114 | sleep(2) 115 | except: 116 | try: 117 | driver.find_element_by_xpath('//div[@class="radio_2p2eqv4lwtk00"]/div[3]').click() 118 | sleep(5) 119 | except: 120 | pass 121 | # 随机更改视频 122 | while True: 123 | k = random.randint(0, 16) 124 | 125 | if k not in list_vedio: 126 | list_vedio.append(k) 127 | break 128 | try: 129 | driver.find_elements_by_xpath( 130 | '//div[@class="Iuu474S1L6y5p7yalKQbW grid-gr"]//div[@class="_252R0WxMJIuJyNty2pZiaL thePic"]')[k].click() 131 | except: 132 | try: 133 | driver.find_elements_by_xpath('//div[@class="_252R0WxMJIuJyNty2pZiaL thePic"]')[k].click() 134 | except: 135 | try: 136 | driver.find_elements_by_xpath('//div[@id="Cd5zymfz1fzs0"]/div/div/div[1]')[k].click() 137 | except: 138 | print('第%d次视频获取失败!' % (i + 1)) 139 | sleep(2) 140 | windows = driver.window_handles 141 | driver.switch_to.window(windows[3]) 142 | sleep(2) 143 | driver.execute_script("var q=document.documentElement.scrollTop=400") 144 | sleep(4) 145 | # 视频睡眠 146 | time_s = random.randint(200, 240) 147 | # driver.execute_script("var q=document.documentElement.scrollTop=1500") 148 | sleep(time_s) 149 | driver.close() 150 | sleep(2) 151 | windows = driver.window_handles 152 | driver.switch_to.window(windows[2]) 153 | sleep(2) 154 | 155 | dict_cookie = {} 156 | a = driver.get_cookies() 157 | dict_cookie['data'] = a 158 | data = json.dumps(dict_cookie) 159 | with open(file_name, 'w', encoding='utf-8')as f: 160 | f.write(data) 161 | 162 | driver.quit() 163 | -------------------------------------------------------------------------------- /spider/step_vedio.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import random 4 | from time import sleep 5 | 6 | import sys 7 | from selenium import webdriver 8 | from selenium.webdriver import ActionChains 9 | from selenium.webdriver.common.keys import Keys 10 | 11 | ''' 12 | 说明:视频独立模块 13 | 修改找不到视频bug 14 | 15 | ''' 16 | # 浏览器配置 17 | ops = webdriver.ChromeOptions() 18 | ops.add_argument('--headless') # 无头 19 | ops.add_argument('--disable-gpu') # 禁用GPU 20 | ops.add_argument('--disable-infobars') # 关闭浏览器上方自动测试提示 21 | driver = webdriver.Chrome(options=ops) 22 | driver.get('https://www.xuexi.cn') 23 | sleep(10) 24 | # 隐式等待100s 25 | # driver.implicitly_wait(200) 26 | # windows = driver.window_handles 27 | # driver.switch_to.window(windows[1]) 28 | file_name = os.path.dirname(os.path.realpath(__file__)) 29 | file_name = os.path.dirname(os.path.realpath(file_name)) + '\cookie\cookie.json' 30 | driver.delete_all_cookies() 31 | cookie = json.load(open(file_name, encoding='utf-8')) 32 | data = cookie['data'] 33 | # print(data) 34 | for i in data: 35 | driver.add_cookie(i) 36 | 37 | sleep(1) 38 | driver.refresh() 39 | sleep(10) 40 | 41 | # 学习电视台(观看视频12分,预计40分钟) 42 | # driver.find_element_by_xpath('//div[@class="father-nav"]/ul[2]/li[2]/a').click() 43 | try: 44 | driver.find_element_by_xpath('//div[@class="menu-list"]/div[2]/a[2]').click() 45 | except: 46 | try: 47 | driver.refresh() 48 | sleep(15) 49 | driver.find_element_by_xpath('//*[@id="root"]/div/header/div[2]/div[1]/div[2]/a[2]').click() 50 | except: 51 | print('学习电视台xpath路径出错') 52 | sys.exit(0) 53 | 54 | sleep(5) 55 | windows = driver.window_handles 56 | driver.switch_to.window(windows[1]) 57 | driver.execute_script("var q=document.documentElement.scrollTop=1000") 58 | sleep(10) 59 | 60 | # 视频区 61 | # driver.find_element_by_xpath('//div[@id="Chwgg53wi10o00"]/div/div[1]').click() 62 | try: 63 | # driver.find_element_by_xpath('//*[@id="5586"]/div/div/div/div/div/section/div/div/div[1]/div[1]').click() 64 | driver.find_element_by_xpath('//*[@id="495f"]/div/div/div/div/section/div/div/div[1]/div[1]/div/div/span').click() 65 | except: 66 | try: 67 | driver.find_element_by_xpath('//div[@class="Pic"]"]').click() 68 | except: 69 | try: 70 | driver.find_element_by_xpath('//div[@class="Iuu474S1L6y5p7yalKQbW grid-cell"]').click() 71 | except: 72 | driver.find_element_by_xpath( 73 | '//*[@id="1novbsbi47k-5"]/div/div/div/div/div/section/div[3]/section/div/div/div/div/section/div/div/div/div[1]').click() 74 | sleep(1) 75 | windows = driver.window_handles 76 | driver.switch_to.window(windows[2]) 77 | sleep(2) 78 | list_vedio = [] 79 | for i in list(range(7)): 80 | print('第%d次视频开始' % (i + 1)) 81 | sleep(2) 82 | if i < 1: 83 | pass 84 | elif i < 5: 85 | try: 86 | driver.find_element_by_xpath('//div[@class="_1KFAyh5wHi8boHp83TMkv-"]/div/div[3]').click() 87 | sleep(2) 88 | except: 89 | try: 90 | ActionChains(driver).key_down(Keys.PAGE_UP) 91 | sleep(2) 92 | driver.find_element_by_xpath('//div[@class="_1KFAyh5wHi8boHp83TMkv-"]/div/div[3]').click() 93 | sleep(2) 94 | except: 95 | try: 96 | driver.find_element_by_xpath('//div[@class="radio_2p2eqv4lwtk00"]/div[2]').click() 97 | sleep(5) 98 | except: 99 | pass 100 | else: 101 | try: 102 | driver.find_element_by_xpath('//div[@class="_1KFAyh5wHi8boHp83TMkv-"]/div/div[4]').click() 103 | sleep(2) 104 | except: 105 | try: 106 | ActionChains(driver).key_down(Keys.PAGE_UP) 107 | sleep(2) 108 | driver.find_element_by_xpath('//div[@class="_1KFAyh5wHi8boHp83TMkv-"]/div/div[4]').click() 109 | sleep(2) 110 | except: 111 | try: 112 | driver.find_element_by_xpath('//div[@class="radio_2p2eqv4lwtk00"]/div[3]').click() 113 | sleep(5) 114 | except: 115 | pass 116 | # 随机点击视频 117 | while True: 118 | k = random.randint(0, 16) 119 | 120 | if k not in list_vedio: 121 | list_vedio.append(k) 122 | break 123 | try: 124 | sleep(5) 125 | driver.find_elements_by_xpath( 126 | '//div[@class="Iuu474S1L6y5p7yalKQbW grid-gr"]//div[@class="_252R0WxMJIuJyNty2pZiaL thePic"]')[k].click() 127 | except: 128 | try: 129 | ActionChains(driver).key_down(Keys.PAGE_DOWN).perform() 130 | driver.find_elements_by_xpath('//div[@class="_252R0WxMJIuJyNty2pZiaL thePic"]')[k].click() 131 | except: 132 | try: 133 | # driver.find_elements_by_xpath('//div[@id="Cd5zymfz1fzs0"]/div/div/div[1]')[k].click() 134 | driver.find_elements_by_xpath( 135 | '//*[@id="1novbsbi47k-5"]/div/div/div/div/div/section/div[3]/section/div/div/div/div')[k].click() 136 | except: 137 | driver.find_elements_by_xpath('//div[@id="Cd5zymfz1fzs0"]/div/div/div[1]')[k].click() 138 | print('第%d次视频获取失败!' % (i + 1)) 139 | sleep(2) 140 | windows = driver.window_handles 141 | driver.switch_to.window(windows[3]) 142 | sleep(2) 143 | driver.execute_script("var q=document.documentElement.scrollTop=400") 144 | sleep(4) 145 | # 视频睡眠 146 | time_s = random.randint(200, 350) 147 | driver.execute_script("var q=document.documentElement.scrollTop=800") 148 | sleep(time_s) 149 | ActionChains(driver).key_down(Keys.PAGE_DOWN).perform() 150 | sleep(5) 151 | ActionChains(driver).key_down(Keys.PAGE_DOWN).perform() 152 | sleep(5) 153 | driver.close() 154 | sleep(2) 155 | windows = driver.window_handles 156 | driver.switch_to.window(windows[2]) 157 | sleep(2) 158 | 159 | # 写入最新的cookie 160 | # dict_cookie = {} 161 | # a = driver.get_cookies() 162 | # dict_cookie['data'] = a 163 | # data = json.dumps(dict_cookie) 164 | # with open(file_name, 'w', encoding='utf-8')as f: 165 | # f.write(data) 166 | 167 | print('---------恭喜你---12分到手--------') 168 | driver.quit() 169 | --------------------------------------------------------------------------------