├── LICENSE ├── README.md ├── spiderAPI ├── __init__.py ├── baidumap.py ├── dianping.py ├── github.py ├── lagou.py └── proxyip.py └── spiderFile ├── ECUT_get_grade.py ├── ECUT_pos_html.py ├── JD_spider.py ├── baidu_sy_img.py ├── baidu_wm_img.py ├── fuckCTF.py ├── get_baike.py ├── get_history_weather.py ├── get_photos.py ├── get_tj_accident_info.py ├── get_top_sec_com.py ├── get_web_all_img.py ├── github_hot.py ├── kantuSpider.py ├── lagou_position_spider.py ├── one_img.py ├── one_update.py ├── search_useful_camera_ip_address.py ├── student_img.py └── xz_picture_spider.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 yhf 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ```shell 2 | ( 3 | )\ ) ) ) ( ( 4 | (()/( ( ( /( ( /( )\ ( ) ( ( )\ ( ( 5 | /(_)))\ ) )\()))\()) ( ( (((_) )( ( /( )\))( ((_) ))\ )( 6 | (_)) (()/( (_))/((_)\ )\ )\ ) )\___ (()\ )(_))((_)()\ _ /((_)(()\ 7 | | _ \ )(_))| |_ | |(_) ((_) _(_/(((/ __| ((_)((_)_ _(()((_)| |(_)) ((_) 8 | | _/| || || _|| ' \ / _ \| ' \))| (__ | '_|/ _` |\ V V /| |/ -_) | '_| 9 | |_| \_, | \__||_||_|\___/|_||_| \___||_| \__,_| \_/\_/ |_|\___| |_| 10 | |__/ 11 | —————— by yanghangfeng 12 | ``` 13 | #
PythonCrawler: a collection of web crawler projects written in Python :bug: (the code in this repository is for learning crawling techniques only; learners must comply with the laws of the People's Republic of China!)</h1> 
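Most of the scripts under `spiderFile/` repeat the same fetch-parse-save loop: download a page with `requests`, pull the interesting fragments out with a regular expression or BeautifulSoup, then write the results to disk. Before the individual files, here is a minimal sketch of that pattern in the spirit of `spiderFile/get_photos.py`; the output directory, the timeout, and the "absolute image URLs only" filter are illustrative choices, not taken verbatim from any script in this repository.

```python
# Minimal fetch-parse-save sketch (output path and filtering are illustrative).
import os

import requests
from bs4 import BeautifulSoup


def download_images(page_url, out_dir="./images"):
    """Download every absolute <img src="..."> found on page_url into out_dir."""
    os.makedirs(out_dir, exist_ok=True)
    html = requests.get(page_url, timeout=10).text              # fetch
    soup = BeautifulSoup(html, "html.parser")                   # parse
    img_urls = [img.get("src") for img in soup.find_all("img")
                if img.get("src") and img.get("src").startswith("http")]
    for i, img_url in enumerate(img_urls):                      # save
        data = requests.get(img_url, timeout=10).content
        with open(os.path.join(out_dir, "{}.jpg".format(i)), "wb") as fp:
            fp.write(data)


if __name__ == "__main__":
    # Same demo page used by get_photos.py; crawl only pages you are allowed to.
    download_images("http://tieba.baidu.com/p/4178314700")
```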
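A few of the later scripts (`get_tj_accident_info.py`, `get_top_sec_com.py`) fetch many pages concurrently with `asyncio` and `aiohttp` instead of looping over blocking `requests` calls. A condensed sketch of that pattern is shown below; the URL list is a placeholder, and the real scripts add request headers and regular-expression parsing on top of this skeleton.

```python
# Concurrent fetching with asyncio + aiohttp (placeholder URL list).
import asyncio

import aiohttp


async def fetch(session, url):
    # One GET request; returns the decoded response body.
    async with session.get(url) as response:
        return await response.text()


async def fetch_all(urls):
    # Share a single client session and run all requests concurrently.
    async with aiohttp.ClientSession() as session:
        tasks = [fetch(session, url) for url in urls]
        return await asyncio.gather(*tasks)


if __name__ == "__main__":
    urls = ["http://example.com/page/{}".format(i) for i in range(1, 6)]
    pages = asyncio.run(fetch_all(urls))
    print(len(pages), "pages fetched")
```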
(.*?)
') 72 | grade = re.findall(reg_, _)[0] 73 | grade_list.append(grade) 74 | 75 | grade_data_ = pd.DataFrame() 76 | grade_data = np.array(grade_list).reshape(-1, 6) 77 | for i, j in zip(cloumn_name, range(6)): 78 | grade_data_[i] = grade_data[:, j] 79 | 80 | print('1:打印最新的五门成绩') 81 | print('2:保存所有的成绩到本地文件夹') 82 | print('3:打印学位课成绩并计算平均学分绩') 83 | print('\n') 84 | select = input('请输入你的请求:') 85 | if select == '1': 86 | print(grade_data_[-5:]) 87 | elif select == '2': 88 | grade_data_.to_csv('./grade_data.csv', index=False) 89 | print('成绩已保存在运行此程序的文件夹') 90 | elif select == '3': 91 | xw_grade = grade_data_[(grade_data_['课程名'] == '*数学分析(I)') | (grade_data_['课程名'] == '高等代数(I)') | 92 | (grade_data_['课程名'] == 'C语言程序设计基础') | (grade_data_['课程名'] == '大学英语(II)') | 93 | (grade_data_['课程名'] == '*常微分方程') | (grade_data_['课程名'] == '*概率论') | 94 | (grade_data_['课程名'] == '数据结构')] 95 | print(xw_grade) 96 | print('\n') 97 | avg_grade = np.sum((xw_grade.学分.astype(float) * xw_grade.成绩.astype(float))) / \ 98 | np.sum(xw_grade.学分.astype(float)) 99 | print('平均学分绩={0}'.format(avg_grade)) 100 | input('按任意键结束') 101 | -------------------------------------------------------------------------------- /spiderFile/ECUT_pos_html.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re from bs4 3 | import BeautifulSoup as bs 4 | 5 | 6 | def crawl_all_main_url(page=10): 7 | # 默认抓取官网前十页招聘信息的url 8 | all_url_list = [] 9 | for _ in range(1, page+1): 10 | url = 'http://zjc.ecit.edu.cn/jy/app/newslist.php?BigClassName=%D5%D0%C6%B8%D0%C5%CF%A2&Page={0}'.format(_) 11 | page_html = requests.get(url).text 12 | x_url_reg = re.compile('(.*?)') 20 | explain_text = re.findall(explain_text_reg, html)[0] 21 | if ('时间' and '地点') in explain_text: 22 | return True 23 | else: pass 24 | def save_html(): 25 | all_url_list = crawl_all_main_url() 26 | for son_url in all_url_list: 27 | if get_title(son_url): 28 | text_html = requests.get(son_url).content.decode('gbk') 29 | domain_url = 'http://zjc.ecit.edu.cn/jy' 30 | img_url_reg = re.compile('border=0 src="\.\.(.*?)"') 31 | child_url = re.findall(img_url_reg, text_html) 32 | if child_url != []: 33 | img_url = domain_url + child_url[0] 34 | re_url = 'src="..{0}"'.format(child_url[0]) 35 | end_url = 'src="{0}"'.format(img_url) 36 | end_html = text_html.replace(re_url, end_url) 37 | soup = bs(end_html, 'lxml') 38 | text_div = soup.find_all('div', id='main')[0] 39 | with open('./{0}.html'.format(son_url[-11:]), 'wb') as file: 40 | text_html = 'U职网提供数据咨询服务 {0} '.format(text_div) file.write(text_html.encode('utf-8')) 41 | else: 42 | with open('./{0}.html'.format(son_url[-11:]), 'wb') as file: 43 | html = requests.get(son_url).content.decode('gbk') 44 | soup = bs(text_html, 'lxml') 45 | text_div = soup.find_all('div', id='main')[0] 46 | text_html = 'U职网提供数据咨询服务 {0} '.format(text_div) 47 | file.write(text_html.encode('utf-8')) 48 | else: continue 49 | if __name__ == '__main__': 50 | save_html() 51 | -------------------------------------------------------------------------------- /spiderFile/JD_spider.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re 3 | import pandas as pd 4 | 5 | def get_data(): 6 | jj_url1 = 'http://search.jd.com/s_new.php?keyword=%E5%AE%B6%E5%B1%85%E7%94%A8%E5%93%81&enc=utf-8&qrst=1&rt=1&stop=1&pt=1&vt=2&sttr=1&offset=6&page=' 7 | jj_url2 = '&s=53&click=0' 8 | bt_ = [] 9 | _id = [] 10 | url_list = [] 11 | for i in range(1, 10, 2): 12 | jj_url = jj_url1 + str(i) + 
jj_url2 13 | url_list.append(jj_url) 14 | html = requests.get(jj_url).content.decode('utf-8') 15 | reg1 = re.compile('') 17 | bt = re.findall(reg1, html) 18 | id_ = re.findall(reg2, html) 19 | bt_.extend(bt) 20 | _id.extend(id_) 21 | return bt_, _id 22 | 23 | def split_str(_id): 24 | zid = [] 25 | for _ in _id: 26 | zid.append(_.split('_')[2]) 27 | return zid 28 | 29 | def save_data(zid, bt_): 30 | data = pd.DataFrame({ 31 | '标题': bt_, 32 | 'ID': zid 33 | }) 34 | data.to_excel('./家居用品.xlsx', index=False) 35 | 36 | def start_main(): 37 | bt_, _id = get_data() 38 | zid = split_str(_id) 39 | save_data(zid, bt_) 40 | 41 | if __name__ == '__main__': 42 | start_main() 43 | -------------------------------------------------------------------------------- /spiderFile/baidu_sy_img.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re 3 | 4 | url = 'http://image.baidu.com/search/index' 5 | headers = { 6 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0', 7 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 8 | 'Accept-Encoding': 'gzip, deflate', 9 | 'Referer': 'http://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&fm=detail&lm=-1&st=-1&sf=2&fmq=&pv=&ic=0&nc=1&z=&se=&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E9%AB%98%E6%B8%85%E6%91%84%E5%BD%B1&oq=%E9%AB%98%E6%B8%85%E6%91%84%E5%BD%B1&rsp=-1', 10 | 'Cookie': 'HOSUPPORT=1; UBI=fi_PncwhpxZ%7ETaMMzY0i9qXJ9ATcu3rvxFIc-a7KI9byBcYk%7EjBVmPGIbL3LTKKJ2D17mh5VfJ5yjlCncAb2yhPI5sZM51Qo7tpCemygM0VNUzuTBJwYF8OYmi3nsCCzbpo5U9tLSzkZfcQ1rxUcJSzaipThg__; HISTORY=fec845b215cd8e8be424cf320de232722d0050; PTOKEN=ff58b208cc3c16596889e0a20833991d; STOKEN=1b1f4b028b5a4415aa1dd9794ff061d312ad2a822d52418f3f1ffabbc0ac6142; SAVEUSERID=0868a2b4c9d166dc85e605f0dfd153; USERNAMETYPE=3; PSTM=1454309602; BAIDUID=E5493FD55CFE5424BA25B1996943B3B6:FG=1; BIDUPSID=B7D6D9EFA208B7B8C7CB6EF8F827BD4E; BDUSS=VSeFB6UXBmRWc3UEdFeXhKOFRvQm4ySmVmTkVEN2N0bldnM2o5RHdyaE54ZDlXQVFBQUFBJCQAAAAAAAAAAAEAAABzhCtU3Mbj5cfl0e8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAE04uFZNOLhWZW; H_PS_PSSID=1447_18282_17946_18205_18559_17001_17073_15479_12166_18086_10634; BDRCVFR[feWj1Vr5u3D]=I67x6TjHwwYf0; BDRCVFR[X_XKQks0S63]=mk3SLVN4HKm; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm', 11 | } 12 | 13 | 14 | def get_html(url, headers): 15 | data = { 16 | 'cl': '2', 17 | 'ct': '201326592', 18 | 'face': '0', 19 | 'fp': 'result', 20 | 'gsm': '200001e', 21 | 'ic': '0', 22 | 'ie': 'utf-8', 23 | 'ipn': 'rj', 24 | 'istype': '2', 25 | 'lm': '-1', 26 | 'nc': '1', 27 | 'oe': 'utf-8', 28 | 'pn': '30', 29 | 'queryword': '高清摄影', 30 | 'rn': '30', 31 | 'st': '-1', 32 | 'tn': 'resultjson_com', 33 | 'word': '高清摄影' 34 | } 35 | 36 | page = requests.get(url, data, headers=headers).text 37 | return page 38 | 39 | 40 | def get_img(page, headers): 41 | # img_url_list = [] 42 | reg = re.compile('http://.*?\.jpg') 43 | imglist1 = re.findall(reg, page) 44 | imglist2 = imglist1[0: len(imglist1): 3] 45 | # [img_url_list.append(i) for i in imglist if not i in img_url_list] 46 | x = 0 47 | for imgurl in imglist2: 48 | bin = requests.get(imgurl, headers=headers).content 49 | with open('./%s.jpg' % x, 'wb') as file: 50 | file.write(bin) 51 | x += 1 52 | 53 | if __name__ == '__main__': 54 | page = get_html(url, headers) 55 | get_img(page, headers) 56 | -------------------------------------------------------------------------------- /spiderFile/baidu_wm_img.py: 
-------------------------------------------------------------------------------- 1 | import requests 2 | import re 3 | 4 | url = 'http://image.baidu.com/search/index' 5 | date = { 6 | 'cl': '2', 7 | 'ct': '201326592', 8 | 'fp': 'result', 9 | 'gsm': '1e', 10 | 'ie': 'utf-8', 11 | 'ipn': 'rj', 12 | 'istype': '2', 13 | 'lm': '-1', 14 | 'nc': '1', 15 | 'oe': 'utf-8', 16 | 'pn': '30', 17 | 'queryword': '唯美意境图片', 18 | 'rn': '30', 19 | 'st': '-1', 20 | 'tn': 'resultjson_com', 21 | 'word': '唯美意境图片' 22 | } 23 | headers = { 24 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:44.0) Gecko/20100101 Firefox/44.0', 25 | 'Accept': 'text/plain, */*; q=0.01', 26 | 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 27 | 'Accept-Encoding': 'gzip, deflate', 28 | 'X-Requested-With': 'XMLHttpRequest', 29 | 'Referer': 'http://image.baidu.com/search/index?ct=201326592&cl=2&st=-1&lm=-1&nc=1&ie=utf-8&tn=baiduimage&ipn=r&rps=1&pv=&fm=rs3&word=%E5%94%AF%E7%BE%8E%E6%84%8F%E5%A2%83%E5%9B%BE%E7%89%87&ofr=%E9%AB%98%E6%B8%85%E6%91%84%E5%BD%B1', 30 | 'Cookie': 'BDqhfp=%E5%94%AF%E7%BE%8E%E6%84%8F%E5%A2%83%E5%9B%BE%E7%89%87%26%26NaN-1undefined-1undefined%26%260%26%261; Hm_lvt_737dbb498415dd39d8abf5bc2404b290=1455016371,1455712809,1455769605,1455772886; PSTM=1454309602; BAIDUID=E5493FD55CFE5424BA25B1996943B3B6:FG=1; BIDUPSID=B7D6D9EFA208B7B8C7CB6EF8F827BD4E; BDUSS=VSeFB6UXBmRWc3UEdFeXhKOFRvQm4ySmVmTkVEN2N0bldnM2o5RHdyaE54ZDlXQVFBQUFBJCQAAAAAAAAAAAEAAABzhCtU3Mbj5cfl0e8AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAE04uFZNOLhWZW; H_PS_PSSID=1447_18282_17946_15479_12166_18086_10634; Hm_lpvt_737dbb498415dd39d8abf5bc2404b290=1455788775; firstShowTip=1; BDRCVFR[-pGxjrCMryR]=mk3SLVN4HKm; BDRCVFR[dG2JNJb_ajR]=mk3SLVN4HKm', 31 | 'Connection': 'keep-alive' 32 | } 33 | 34 | 35 | def get_page(url, date, headers): 36 | page = requests.get(url, date, headers=headers).text 37 | return page 38 | 39 | 40 | def get_img(page, headers): 41 | reg = re.compile('http://.*?\.jpg') 42 | imglist = re.findall(reg, page)[::3] 43 | x = 0 44 | for imgurl in imglist: 45 | with open('E:/Pic/%s.jpg' % x, 'wb') as file: 46 | file.write(requests.get(imgurl, headers=headers).content) 47 | x += 1 48 | 49 | if __name__ == '__main__': 50 | page = get_page(url, date, headers) 51 | get_img(page, headers) 52 | -------------------------------------------------------------------------------- /spiderFile/fuckCTF.py: -------------------------------------------------------------------------------- 1 | 2 | """ 3 | author: 杨航锋 4 | date : 2018.8.19 5 | mood : 嗯,比较无聊,甚至还有点想吃黄焖鸡米饭😋 6 | """ 7 | 8 | 9 | import os 10 | import random 11 | import functools 12 | 13 | from PIL import Image 14 | from selenium import webdriver 15 | 16 | 17 | class fuckCTF: 18 | 19 | def __init__(self, username, old_password): 20 | self.url = "http://hetianlab.com/" 21 | self.login_url = "http://hetianlab.com/loginLab.do" 22 | self.username = username 23 | self.old_password = old_password 24 | self.new_password = (yield_new_password(), "******")[0] 25 | self.options = webdriver.FirefoxOptions() 26 | self.options.add_argument("-headless") 27 | self.browser = webdriver.Firefox(options=self.options) 28 | print("init ok") 29 | 30 | def login_hetian(self): 31 | self.browser.get(self.login_url) 32 | self.browser.find_element_by_id("userEmail").clear() 33 | self.browser.find_element_by_id("userEmail").send_keys(self.username) 34 | self.browser.find_element_by_id("passwordIn").clear() 35 | self.browser.find_element_by_id("passwordIn").send_keys(self.old_password) 36 | 
self.browser.get_screenshot_as_file(self.username + '/' + "login.png") 37 | self.browser.find_element_by_id("registButIn").click() 38 | self.browser.get(self.url) 39 | print("login_hetian running ok!") 40 | 41 | def get_personl_information_page(self): 42 | grzx_btn = self.browser.find_element_by_xpath("/html/body/div[1]/div[1]/div/div/div[2]/ul/li[2]/a") 43 | self.browser.execute_script("$(arguments[0]).click()", grzx_btn) 44 | self.browser.get("http://hetianlab.com/getUserInfo.do") 45 | print("get_personl_information_page running ok!") 46 | 47 | def get_password_setting_page(self): 48 | mmsz_btn = self.browser.find_element_by_xpath("/html/body/div[2]/div/div[1]/ul/ul[3]/li[2]") 49 | self.browser.execute_script("$(arguments[0]).click()", mmsz_btn) 50 | self.browser.find_element_by_id("person").click() 51 | self.browser.find_element_by_class_name("check") 52 | print("get_password_setting_page running ok!") 53 | 54 | def setting_password(self): 55 | self.browser.find_element_by_id("oldpwd").clear() 56 | self.browser.find_element_by_id("oldpwd").send_keys(self.old_password) 57 | self.browser.find_element_by_id("newpwd").clear() 58 | self.browser.find_element_by_id("newpwd").send_keys(self.new_password) 59 | self.browser.find_element_by_id("quepwd").clear() 60 | self.browser.find_element_by_id("quepwd").send_keys(self.new_password) 61 | print("setting_password running ok!") 62 | 63 | def get_v_code(self): 64 | status = self.browser.get_screenshot_as_file(self.username + '/' + "v_code.png") 65 | if status: 66 | img = Image.open(self.username + '/' + "v_code.png") 67 | img.show() 68 | self.v_code = input("请输入验证码: ") 69 | self.browser.find_element_by_class_name("code").send_keys(self.v_code) 70 | else: 71 | raise("截屏失败!") 72 | print("get_v_code running ok!") 73 | 74 | def submit_data(self): 75 | self.browser.find_element_by_id("submitbtn").click() 76 | self.browser.get_screenshot_as_file(self.username + '/' + "result.png") 77 | self.browser.quit() 78 | print("submit_data running ok!") 79 | 80 | def make_portfolio(self): 81 | if not os.path.exists(self.username): 82 | os.makedirs(self.username) 83 | print("make_portfolio running ok!") 84 | 85 | def save_success_data(self): 86 | with open("./username_and_password_data_successed.log", "a+") as fp: 87 | fp.write( 88 | "username" + ": {}".format(self.username) + "\t" 89 | "password" + ": {}".format(self.new_password) + 90 | "\n" 91 | ) 92 | print("save_success_data running ok!") 93 | 94 | def save_failed_data(self): 95 | with open("./username_and_password_data_failed.log", "a+") as fp: 96 | fp.write( 97 | "username" + ": {}".format(self.username) + "\n" 98 | ) 99 | print("save_failed_data running ok!") 100 | 101 | def main(self): 102 | try: 103 | self.make_portfolio() 104 | self.login_hetian() 105 | self.get_personl_information_page() 106 | self.get_password_setting_page() 107 | self.setting_password() 108 | self.get_v_code() 109 | self.submit_data() 110 | self.save_success_data() 111 | except: 112 | self.save_failed_data() 113 | 114 | 115 | def gen_decorator(gen): 116 | @functools.wraps(gen) 117 | def inner(*args, **kwargs): 118 | return next(gen(*args, **kwargs)) 119 | return inner 120 | 121 | 122 | @gen_decorator 123 | def yield_new_password(): 124 | strings = list("abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()") 125 | yield "".join(random.choices(strings, k=6)) 126 | 127 | 128 | def yield_usernames(n): 129 | prefix = "ctf2018_gzhu" 130 | postfix = "@dh.com" 131 | for num in range(n): 132 | if num < 10: 133 | infix = '0' + str(num) 134 | else: 135 
| infix = str(num) 136 | yield prefix + infix + postfix 137 | 138 | 139 | if __name__ == "__main__": 140 | for username in yield_usernames(100): 141 | ctfer = fuckCTF(username, "******") 142 | ctfer.main() 143 | -------------------------------------------------------------------------------- /spiderFile/get_baike.py: -------------------------------------------------------------------------------- 1 | import re 2 | import requests as rq 3 | 4 | def get_baidubaike(): 5 | 6 | keyword = input('please input wordkey:') 7 | url = 'http://baike.baidu.com/item/{}'.format(keyword) 8 | html = rq.get(url).content.decode('utf-8') 9 | 10 | regex = re.compile('content="(.*?)">') 11 | words = re.findall(regex, html)[0] 12 | return words 13 | 14 | if __name__ == '__main__': 15 | words = get_baidubaike() 16 | print(words) 17 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /spiderFile/get_history_weather.py: -------------------------------------------------------------------------------- 1 | import re 2 | import pandas as pd 3 | import requests as rq 4 | from bs4 import BeautifulSoup 5 | 6 | 7 | def get_data(url): 8 | html = rq.get(url).content.decode("gbk") 9 | soup = BeautifulSoup(html, "html.parser") 10 | tr_list = soup.find_all("tr") 11 | dates, conditions, temperatures = [], [], [] 12 | for data in tr_list[1:]: 13 | sub_data = data.text.split() 14 | dates.append(sub_data[0]) 15 | conditions.append("".join(sub_data[1:3])) 16 | temperatures.append("".join(sub_data[3:6])) 17 | _data = pd.DataFrame() 18 | _data["日期"] = dates 19 | _data["天气状况"] = conditions 20 | _data["气温"] = temperatures 21 | return _data 22 | 23 | # 获取广州市2019年第一季度天气状况 24 | data_1_month = get_data("http://www.tianqihoubao.com/lishi/guangzhou/month/201901.html") 25 | data_2_month = get_data("http://www.tianqihoubao.com/lishi/guangzhou/month/201902.html") 26 | data_3_month = get_data("http://www.tianqihoubao.com/lishi/guangzhou/month/201903.html") 27 | 28 | 29 | data = pd.concat([data_1_month, data_2_month, data_3_month]).reset_index(drop=True) 30 | 31 | data.to_csv("guangzhou_history_weather_data.csv", index=False, encoding="utf-8") 32 | -------------------------------------------------------------------------------- /spiderFile/get_photos.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | 4 | url = 'http://tieba.baidu.com/p/4178314700' 5 | 6 | 7 | def GetHtml(url): 8 | html = requests.get(url).text 9 | return html 10 | 11 | 12 | def GetImg(html): 13 | soup = BeautifulSoup(html, 'html.parser') 14 | imglist = [] 15 | for photourl in soup.find_all('img'): 16 | imglist.append(photourl.get('src')) 17 | x = 0 18 | for imgurl in imglist: 19 | with open('E:/Pic/%s.jpg' % x, 'wb') as file: 20 | file.write(requests.get(imgurl).content) 21 | x += 1 22 | 23 | if __name__ == '__main__': 24 | html = GetHtml(url) 25 | GetImg(html) 26 | -------------------------------------------------------------------------------- /spiderFile/get_tj_accident_info.py: -------------------------------------------------------------------------------- 1 | import re 2 | import joblib 3 | import asyncio 4 | import aiohttp 5 | import requests as rq 6 | from bs4 import BeautifulSoup 7 | 8 | def yield_all_page_url(root_url, page=51): 9 | """生成所有的页面url 10 | @param root_url: 首页url 11 | type root_url: str 12 | @param page: 爬取的页面个数 13 | type page: int 14 | """ 15 | # 观察网站翻页结构可知 16 | page_url_list = [f"{root_url}index_{i}.html" for i in 
range(1, page)] 17 | # 添加首页url 18 | page_url_list.insert(0, root_url) 19 | return page_url_list 20 | 21 | async def get_info_page_url(url, session): 22 | regex = re.compile("') 43 | html = rq.get(url, headers=HEADERS).content.decode("utf-8") 44 | soup = BeautifulSoup(html) 45 | title = re.search(title_regex, html) 46 | content_1 = soup.find("div", class_="TRS_UEDITOR TRS_WEB") 47 | content_2 = soup.find("div", class_="view TRS_UEDITOR trs_paper_default trs_word") 48 | content_3 = soup.find("div", class_="view TRS_UEDITOR trs_paper_default trs_web") 49 | if content_1: 50 | content = content_1.text 51 | elif content_2: 52 | content = content_2.text 53 | elif content_3: 54 | content = content_3.text 55 | else: 56 | content = "" 57 | return {"title": title.groups()[0], "content": content} 58 | 59 | def get_all_data(all_info_page_url_list): 60 | all_data = [] 61 | for i, url in enumerate(all_info_page_url_list): 62 | all_data.append(get_data(url)) 63 | print(i, url, all_data[-1]) 64 | joblib.dump(all_data, "all_data.joblib") 65 | 66 | 67 | if __name__ == "__main__": 68 | root_url = "http://yjgl.tj.gov.cn/ZWGK6939/SGXX3106/" 69 | agent_part_1 = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " 70 | agent_part_2 = "(KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36" 71 | HEADERS = {"Host": "yjgl.tj.gov.cn", 72 | "Connection": "keep-alive", 73 | "User-Agent": agent_part_1 + agent_part_2, 74 | "Referer": "http://static.bshare.cn/"} 75 | page_url_list = yield_all_page_url(root_url, page=51) 76 | all_info_page_url_list = asyncio.run(get_all_info_page_url(root_url, page_url_list)) 77 | joblib.dump("all_info_page_url_list", all_info_page_url_list) 78 | -------------------------------------------------------------------------------- /spiderFile/get_top_sec_com.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import time 4 | import joblib 5 | import asyncio 6 | import aiohttp 7 | import requests as rq 8 | 9 | import pandas as pd 10 | import matplotlib.pyplot as plt 11 | # import nest_asyncio 12 | # nest_asyncio.apply() 13 | 14 | class getTopSecCom: 15 | def __init__(self, top=None): 16 | self.headers = {"Referer": "http://quote.eastmoney.com/", 17 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"} 18 | self.bk_url = "http://71.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124034348162124675374_1612595298605&pn=1&pz=85&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&fid=f62&fs=b:BK0655&fields=f12,f14&_=1612595298611" 19 | self.shares_api = "https://xueqiu.com/S/" 20 | self.top = top 21 | if not os.path.exists("./useful_sec_com_list"): 22 | self.useful_sec_com_list = self.get_sec_com_code() 23 | else: 24 | with open("./useful_sec_com_list", "rb") as fp: 25 | self.useful_sec_com_list = joblib.load(fp) 26 | 27 | def get_sec_com_code(self): 28 | html = rq.get(self.bk_url, headers=self.headers).content.decode("utf-8") 29 | sec_com_list = eval(re.findall("\[(.*?)\]", html)[0]) 30 | useful_sec_com_list = [[i["f12"], i["f14"]] for i in sec_com_list if "ST" not in i["f14"]] 31 | 32 | # 0和3开头的为深证上市股票前缀为sz,6开头的为上证上市股票前缀为sh 33 | for sec_com in useful_sec_com_list: 34 | if sec_com[0][0] == "6": 35 | sec_com[0] = "sh" + sec_com[0] 36 | else: 37 | sec_com[0] = "sz" + sec_com[0] 38 | with open("useful_sec_com_list", "wb") as fp: 39 | joblib.dump(useful_sec_com_list, fp) 40 | return useful_sec_com_list 41 | 42 | async def 
async_get_shares_details(self, sec_com, url): 43 | async with aiohttp.ClientSession() as session: 44 | async with session.get(url, headers=self.headers) as response: 45 | html = await response.text() 46 | market_value = re.search("\s*(.*?)\s*
') 14 | summary_text = re.findall(url_abstract_reg, html) 15 | hotDF = pd.DataFrame() 16 | hotDF['项目简介'] = summary_text 17 | hotDF['项目地址'] = hot_url 18 | hotDF.to_csv('./github_hot.csv', index=False) 19 | 20 | if __name__ == '__main__': 21 | keyword = input('请输入查找的热门语言:') 22 | hot_github(keyword) 23 | -------------------------------------------------------------------------------- /spiderFile/kantuSpider.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os 3 | import time 4 | 5 | import requests as rq 6 | 7 | 8 | def get_all_page(page): 9 | url = 'http://52kantu.cn/?page={}'.format(page) 10 | html = rq.get(url).text 11 | 12 | return html 13 | 14 | 15 | def get_img_url(html): 16 | regex = re.compile('