├── 12306_query_spider ├── README.md ├── data │ └── city_data.json ├── run_spider.py └── save_city_list.py ├── 163music_comment_spider ├── README.md ├── data │ └── 孤勇者_热评.json └── run_spider.py ├── README.md ├── autohome_wom_spider ├── README.md └── run_spider.py ├── baidu_fanyi_spider ├── README.md ├── demo.js └── run_spider.py ├── bilibili_vedio_spider ├── README.md └── run_spider.py ├── csdn_hot_search_spider ├── README.md ├── data │ └── c_c++.json ├── get_data.py ├── run_spider.py ├── save_data.py └── ua_pool.py ├── dianping_comment_spider ├── README.md ├── decrypt_fonts.py └── run_spider.py ├── dianping_font_decrypt ├── README.md ├── decrypt_fonts.py ├── download_fonts.py ├── run_spider.py └── woff │ └── fonts_dic.json ├── douban_annual_list_spider ├── README.md ├── run_spider.py ├── ua_ip_pool.py └── 榜单电影链接.xlsx ├── douyin_video_spider ├── README.md └── run_spider.py ├── fangtianxia_spider ├── README.md ├── data │ └── 郑州楼盘_数据.json ├── run_spider.py └── ua_pool.py ├── gupiao_rank_spider ├── README.md ├── data │ ├── A股市场_人气榜.xlsx │ ├── 港股市场_人气榜.xlsx │ └── 美股市场_人气榜.xlsx ├── decryption_AES.py ├── get_message.py ├── run_spider.py └── 获取密钥和偏移量.html ├── huya_all_types_spider ├── README.md ├── data │ ├── Apex英雄_直播用户信息.xlsx │ └── all_types_msg.json ├── get_proxyz.py ├── get_types_user_msg.py ├── get_ua.py └── run_spider.py ├── lagou_jobs_spider ├── README.md ├── data │ ├── Python爬虫.csv │ └── lagou.png └── run_spider.py ├── lol_hero_message_spider ├── README.md ├── heroes_data.json └── run_spider.py ├── lol_skins_spider ├── README.md ├── lol_skins_data └── run_spider.py ├── maoyan_data_spider ├── README.md ├── data │ └── 猫眼实时数据.xlsx ├── get_url_data.py ├── run_spider.py └── save_data.py ├── meituan_foods_spider ├── README.md ├── ip_pool.py └── run_spider.py ├── simple_ip_proxy_pool ├── README.md ├── all_ip_agent │ ├── get_66ip.py │ ├── get_89ip.py │ ├── get_ihuan.py │ ├── get_ip3366.py │ ├── get_jiangxianli.py │ ├── get_kuaidaili.py │ ├── test_save.py │ └── user_agent.py ├── ip_pool.json └── ip_pool_run.py ├── taobao_commodity_spider ├── README.md ├── data │ └── 光遇_商品信息.xlsx └── run_spider.py ├── umeitu_dongman_spider ├── README.md ├── all_images │ ├── AIR神尾观铃双马尾高清卡通图片.jpg │ ├── 樱花庄的宠物女孩椎名真白高清卡通图片.jpg │ ├── 软萌系列动漫头像高清卡通图片.jpg │ └── 黄昏之大地的炼金术士高清卡通图片.jpg └── run_spider.py ├── ximalaya_audio_spider ├── README.md └── run_spider.py ├── yibu_book_spider ├── README.md ├── get_proxyz.py ├── get_ua.py └── run_spider.py ├── yiqing_data_spider ├── README.md ├── city_list.json └── run_spider.py ├── youdao_fanyi_spider ├── README.md └── run_spider.py └── ziroom_message_spider ├── README.md ├── ocr_img ├── bg_image.png ├── black_img.png └── text.png ├── run_spider.py └── 自如网租房房源信息.csv /12306_query_spider/README.md: -------------------------------------------------------------------------------- 1 | ##
✨查询12306火车票信息✨
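下文提到「可使用 `selenium` 自动获取 `cookie`,这里则不再演示」,这里先补一个最小草图(假设性示例,需本地已配置 Chrome 及驱动,地址取自 run_spider.py 打印的直达链接域名),拿到的字符串可直接替换 run_spider.py 中 `headers` 的 `Cookie` 值:

```python
# 假设性草图:用 selenium 打开 12306 查询页,把返回的 cookie 拼成请求头字符串
import time
from selenium import webdriver


def fetch_cookie_string():
    driver = webdriver.Chrome()                      # 需本地已配置 Chrome 驱动
    driver.get("https://kyfw.12306.cn/otn/leftTicket/init")
    time.sleep(5)                                    # 留出时间让页面脚本写入所需 cookie
    cookies = driver.get_cookies()                   # [{'name': ..., 'value': ...}, ...]
    driver.quit()
    return "; ".join(f"{c['name']}={c['value']}" for c in cookies)


if __name__ == '__main__':
    # 打印结果可直接粘贴到 run_spider.py 的 headers["Cookie"]
    print(fetch_cookie_string())
```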
2 | - 12306官网:[https://www.12306.cn/index/](https://www.12306.cn/index/) 3 | 4 | - 输入日期、出发地、目的地获取火车票信息,包含: 5 | - 车次 6 | - 出发时间 7 | - 到达时间 8 | - 历时 9 | - 商务座 10 | - 一等座 11 | - 二等座 12 | - 软卧 13 | - 硬卧 14 | - 硬座 15 | - 无座 16 | - 备注 17 | 18 | - 因12306有反爬机制,所以当查询失败时,请更换`cookie`的值后重试 19 | - 当然也可使用`selenium`自动获取`cookie`,这里则不再演示 20 | 21 | - 该爬虫使用到的模块: 22 | - requests 23 | - json 24 | - openpyxl 25 | - prettytable 26 | -------------------------------------------------------------------------------- /12306_query_spider/data/city_data.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/12306_query_spider/data/city_data.json -------------------------------------------------------------------------------- /12306_query_spider/run_spider.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/5/25 13:15 3 | # @Author : Torres-圣君 4 | # @File : download_fonts.py 5 | # @Sofaware : PyCharm 6 | import requests 7 | import json 8 | from openpyxl import Workbook 9 | from prettytable import PrettyTable 10 | from save_city_list import get_city_data 11 | 12 | 13 | class GetTrains: 14 | def __init__(self, date, begin_id, end_id): 15 | self.url = "https://kyfw.12306.cn/otn/leftTicket/query" 16 | # 构建请求头 17 | self.headers = { 18 | # 失效时,需要更新cookie 19 | "Cookie": "JSESSIONID=5BCD4997EB7387D6F2F26CF860144AE6; RAIL_EXPIRATION=1653658158853; RAIL_DEVICEID=OYdRuCkXuonxJIyWihWNwMa5x-JAFt30BYWuZd9lAzHOtXh1TezSjz0oQm9n0TYq3InM3pJKfGexQCQEFpOqkTJq5XqXQ_taNYf1hTlQ6YWdWKWrJosRmvmDdUmt9omgZ2sDBAmcohSg662SJ-55JM97DtJQ0sfA; guidesStatus=off; highContrastMode=defaltMode; cursorStatus=off; BIGipServerotn=384827914.50210.0000; BIGipServerpool_passport=31719946.50215.0000; route=c5c62a339e7744272a54643b3be5bf64; _jc_save_toDate=2022-05-25; _jc_save_wfdc_flag=dc; _jc_save_fromStation=%u5546%u4E18%2CSQF; _jc_save_toStation=%u90D1%u5DDE%2CZZF; _jc_save_fromDate=2022-05-26", 20 | # "Referer": referer, 21 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.53" 22 | } 23 | # 构建请求所需参数 24 | self.params = { 25 | "leftTicketDTO.train_date": date, 26 | "leftTicketDTO.from_station": begin_id, 27 | "leftTicketDTO.to_station": end_id, 28 | "purpose_codes": "ADULT" 29 | } 30 | # 实例化美化表格对象 31 | self.pt = PrettyTable() 32 | 33 | def run(self): 34 | # 对目标网址发送请求 35 | res = requests.get(self.url, headers=self.headers, params=self.params).json() 36 | data_list = res['data']['result'] 37 | # 构造表格的表头,用于展示和保存 38 | header_list = [ 39 | ['车次', '出发时间', '到达时间', '历时', '商务座', '一等座', '二等座', '软卧', '硬卧', '硬座', '无座', '备注'] 40 | ] 41 | # 将表头信息添加进展示表格的表头 42 | self.pt.field_names = header_list[0] 43 | for data in data_list: 44 | # 格式化添加表数据 45 | trains_msg = self.format_data(data) 46 | # 将数据添加进列表,用于保存 47 | header_list.append(trains_msg) 48 | # 打印表格 49 | print(self.pt) 50 | # 返回车次信息列表 51 | return header_list 52 | 53 | def format_data(self, data): 54 | # 将返回的数据以'|'进行分隔 55 | all_data_list = data.split('|') 56 | # 提取车次的信息 57 | trains_msg = [ 58 | all_data_list[3], 59 | all_data_list[8], 60 | all_data_list[9], 61 | all_data_list[10], 62 | all_data_list[32] if all_data_list[32] != "" else "--", 63 | all_data_list[31] if all_data_list[31] != "" else "--", 64 | all_data_list[30] if all_data_list[30] != "" else "--", 65 | all_data_list[23] if all_data_list[23] != "" else "--", 66 | 
all_data_list[28] if all_data_list[28] != "" else "--", 67 | all_data_list[29] if all_data_list[29] != "" else "--", 68 | all_data_list[26] if all_data_list[26] != "" else "--", 69 | all_data_list[1] if all_data_list[1] != "" else "--" 70 | ] 71 | # 增添表内容 72 | self.pt.add_row(trains_msg) 73 | # 将提取的信息返回,用于保存 74 | return trains_msg 75 | 76 | def save_data(self, trains_data_list, date, begin, end): 77 | num = input("如果展示不清晰,需要保存时请扣1:") 78 | if num == "1": 79 | wb = Workbook() 80 | sheet = wb.create_sheet("车次信息", -1) 81 | # 遍历表格索引,写入数据 82 | for x in range(len(trains_data_list)): 83 | for y in range(len(trains_data_list[x])): 84 | sheet.cell(x + 1, y + 1).value = trains_data_list[x][y] 85 | wb.save(f"./data/{date}_{begin}_{end}.xlsx") 86 | print("数据保存完成!") 87 | 88 | 89 | if __name__ == '__main__': 90 | # 更新城市对应的英文代码,需要时再启用 91 | # get_city_data() 92 | date = input("请输入出发日期(YYYY-MM-DD):") 93 | begin = input("请输入出发地:") 94 | end = input("请输入目的地:") 95 | # 读取生成的json文件 96 | city_list = json.load(open('./data/city_data.json', 'r')) 97 | # 获取城市对应的英文代码 98 | begin_id = city_list[begin] 99 | end_id = city_list[end] 100 | gt = GetTrains(date, begin_id, end_id) 101 | trains_data_list = gt.run() 102 | # 是否需要保存数据 103 | gt.save_data(trains_data_list, date, begin, end) 104 | print( 105 | "12306直达链接(复制到浏览器打开):", 106 | "https://kyfw.12306.cn/otn/leftTicket/init?" 107 | "linktypeid=dc&" 108 | f"fs={begin},{begin_id}&" 109 | f"ts={end},{end_id}&" 110 | f"date={date}&" 111 | "flag=N,N,Y" 112 | ) 113 | -------------------------------------------------------------------------------- /12306_query_spider/save_city_list.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/5/25 12:48 3 | # @Author : Torres-圣君 4 | # @File : save_city_list.py 5 | # @Sofaware : PyCharm 6 | import requests 7 | import json 8 | 9 | 10 | def get_city_data(): 11 | url = "https://kyfw.12306.cn/otn/resources/js/framework/station_name.js?station_version=1.9053" 12 | print("正在获取数据。") 13 | # 发送请求,获取返回的数据 14 | res = requests.get(url) 15 | data = str(res.content, encoding="utf8") 16 | # 格式化返回的数据 17 | response_format(data) 18 | 19 | 20 | def response_format(data): 21 | dict_data = dict() 22 | # 根据'|'分隔数据 23 | list_data = data.split('|') 24 | # 从下标'1'开始, 每间隔5个为字典key 25 | result_x = list_data[1:len(list_data):5] 26 | # 从下标'2'开始, 每间隔5个为字典value 27 | result_y = list_data[2:len(list_data):5] 28 | # 循环将数据写入字典 29 | for i in range(len(result_x)): 30 | dict_data[result_x[i].replace(" ", "")] = result_y[i] 31 | # 保存数据 32 | save_data(dict_data) 33 | 34 | 35 | def save_data(dict_data): 36 | json_data = json.dumps(dict_data, indent=1, ensure_ascii=False) 37 | with open("./data/city_data.json", 'w') as w: 38 | w.write(json_data) 39 | print("数据保存完成!") 40 | -------------------------------------------------------------------------------- /163music_comment_spider/README.md: -------------------------------------------------------------------------------- 1 | ##
✨获取网易云歌曲热门评论✨
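下文说明「输入为歌曲名时:自动获取该歌曲名对应的第一首歌曲ID」,这一步对应 run_spider.py 里的 `get_music_id`,单独抽出来是下面这个最小草图(搜索接口参数为站点约定,返回结构以实际为准):

```python
# 假设性草图:用搜索接口把歌曲名解析成歌曲 ID,再拼出歌曲页链接
import requests


def resolve_song_id(music_name: str) -> int:
    url = f"http://music.163.com/api/search/get/?s={music_name}&limit=1&type=1"
    headers = {"User-Agent": "Mozilla/5.0"}
    res = requests.get(url, headers=headers).json()
    # 取搜索结果中的第一首歌
    return res["result"]["songs"][0]["id"]


if __name__ == '__main__':
    song_id = resolve_song_id("孤勇者")
    print(song_id, f"https://music.163.com/#/song?id={song_id}")
```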
2 | - 网易云音乐官网:[https://music.163.com](https://music.163.com) 3 | 4 | - 输入歌曲名或歌曲ID,自动获取该歌曲的所有热评 5 | - 输入为歌曲名时:自动获取该歌曲名对应的第一首歌曲ID,并获取其热评 6 | - 输入为歌曲ID时:精准定位歌曲,获取其热评 7 | - 爬取的数据存储方式: 8 | - 通过a追加模式,将爬取的数据存储到`data/`文件夹下的json文件 9 | - 该爬虫使用到的模块: 10 | - requests 11 | - selenium 12 | - json -------------------------------------------------------------------------------- /163music_comment_spider/data/孤勇者_热评.json: -------------------------------------------------------------------------------- 1 | { 2 | "用户名": "肖星星H", 3 | "评论日期": "2021年12月8日", 4 | "评论赞数": "34.1万", 5 | "评论内容": "“谁说站在光里的才算英雄” ——敬缉毒警" 6 | }{ 7 | "用户名": "惠子Are-ONE", 8 | "评论日期": "2021年12月10日", 9 | "评论赞数": "24.3万", 10 | "评论内容": "我发小,缉毒警,前些年谈的女朋友第二个月他就接到任务了,找我们几个要好的朋友演戏逼他女朋友恨他然后分手,六年了只留下一封信要等他女朋友结婚之后才让交给她,到今天六年多了没有半点我发小的消息,他曾说如果你有一天遇见了一块无名碑,可能就是他" 11 | }{ 12 | "用户名": "取盛", 13 | "评论日期": "2021年12月9日", 14 | "评论赞数": "18.4万", 15 | "评论内容": "这首歌背后还有个真实故事,作词人唐恬,1983年出生于湖南,中国内地作词人。" 16 | }{ 17 | "用户名": "江拯", 18 | "评论日期": "2021年12月8日", 19 | "评论赞数": "18万", 20 | "评论内容": "“爸爸 乞丐是怎么回事?” “你要好好学习,未来能让他们不必要饭。”" 21 | }{ 22 | "用户名": "罗德兰的苍渊", 23 | "评论日期": "2021年12月30日", 24 | "评论赞数": "13.5万", 25 | "评论内容": "这首歌已经和游戏没啥关系了" 26 | }{ 27 | "用户名": "777eleve", 28 | "评论日期": "2021年12月8日", 29 | "评论赞数": "13.3万", 30 | "评论内容": "总有一天,全台湾省的人,都要考全国卷!" 31 | }{ 32 | "用户名": "啊啊啊Gem", 33 | "评论日期": "2021年12月8日", 34 | "评论赞数": "95259", 35 | "评论内容": "大家都要做考研路上的孤勇者" 36 | }{ 37 | "用户名": "0706天天", 38 | "评论日期": "2021年12月8日", 39 | "评论赞数": "87706", 40 | "评论内容": "爱你又菜又爱装, 爱你漏刀的模样, 爱你对线被打穿,不肯出肉装。 爱你最烂的装备,压最凶的线。 爱你张狂又嚣张,结局都一样。 去吗?去啊!以最垃的出装! 团吗!团啊!以五换零收场! 致你上单时的无能与狂怒, 谁说0-20的不配叫做英雄。" 41 | }{ 42 | "用户名": "马可波罗堡-", 43 | "评论日期": "2021年12月16日", 44 | "评论赞数": "75480", 45 | "评论内容": "EDG!YYDS!战到最后脚踩全球冠军杯回来了🔥" 46 | }{ 47 | "用户名": "你是得不到的白月光", 48 | "评论日期": "1月24日 19:23", 49 | "评论赞数": "61819", 50 | "评论内容": "我是蜜雪员工 每一杯珍珠奶茶我都有多加半勺珍珠 谁说站在光里的才算英雄" 51 | }{ 52 | "用户名": "嘬起一根棒棒糖", 53 | "评论日期": "2021年12月22日", 54 | "评论赞数": "52748", 55 | "评论内容": " 1.笔记 周笔畅 2.追光者 岑宁儿 3.昨日青空 尤长靖 4.体面 于文文 5.如果爱忘了 戚薇 6.朋友请听好 何炅&谢娜&易烊千玺 7.爱无反顾 姚贝娜 8. 你一定要幸福 何洁 9.少年游 魏晨 10.下一站永远 至上励合 11.给眼泪一点时间 黄雅莉 12.荆棘王冠 迪玛希 13.无罪说 何炅&撒贝宁&吴映洁&魏大勋" 56 | }{ 57 | "用户名": "hhuiuiu", 58 | "评论日期": "1月14日 03:35", 59 | "评论赞数": "52035", 60 | "评论内容": "中奖了兄弟们 肺癌 人生这趟车我就先下了 你们接着奏乐接着舞 哈哈其实想想也没什么 不是么..." 
61 | }{ 62 | "用户名": "九星NineStar", 63 | "评论日期": "2月13日 21:43", 64 | "评论赞数": "43445", 65 | "评论内容": "“谁说站在光里的才算英雄”,建筑工人,环卫工,缉毒警,为国争光的电竞选手,还有千千万万默默在暗处发光发热的英雄们啊~我爱你们" 66 | }{ 67 | "用户名": "温柔神_", 68 | "评论日期": "2021年12月20日", 69 | "评论赞数": "41295", 70 | "评论内容": "对于双城之战最好的评价就是卡蜜尔的那句话,世界既不白也不黑,而是一道精致的灰,没有人是真正的反派,或者说,没有谁一开始就是反派,每一个角色都是某个角度的光,他们都是站在影子里的英雄" 71 | }{ 72 | "用户名": "如果不爱了就别勉为其难_lucky", 73 | "评论日期": "2021年12月13日", 74 | "评论赞数": "36430", 75 | "评论内容": "我是一名消防员,火场、废墟、洪水…有多少次出生入死,我也害怕会牺牲。我还没结婚我才20几岁,每一次出警都告诫自己,要小心,不要去冒风险。可每当身临其境,我发现我顾不了这么多,义无反顾的冲进那随时会把你吞噬的火场,废墟,洪水...我用希望扩展希望,用生命激活生命,只为不负人民那期盼眼神" 76 | } -------------------------------------------------------------------------------- /163music_comment_spider/run_spider.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/4/18 23:21 3 | # @Author : Torres-圣君 4 | # @File : get_page_data.py 5 | # @Sofaware : PyCharm 6 | from selenium import webdriver 7 | import requests 8 | import json 9 | 10 | 11 | def get_data(music_id): 12 | web = webdriver.Edge() 13 | web.get(f"https://music.163.com/#/song?id={music_id}") 14 | # 等待网页加载完成,不是死等;加载完成即可 15 | web.implicitly_wait(10) 16 | # 定位iframe 17 | iframe = web.find_element_by_css_selector('.g-iframe') 18 | # 先进入到iframe 19 | web.switch_to.frame(iframe) 20 | # 获取歌名 21 | title = web.find_element_by_css_selector('.tit em').text 22 | # 获取评论列表 23 | div_list = web.find_elements_by_css_selector('.itm') 24 | for i in range(0, len(div_list)-20): 25 | item = {} 26 | item["用户名"] = div_list[i].find_element_by_css_selector('.s-fc7').text 27 | item["评论日期"] = div_list[i].find_element_by_css_selector('.time.s-fc4').text 28 | item["评论赞数"] = div_list[i].find_element_by_css_selector('.rp').text.split("(")[-1].split(")")[0] 29 | item["评论内容"] = div_list[i].find_element_by_css_selector('.cnt.f-brk').text.split(":")[-1].replace("\n", " ") 30 | save_data(title, item) 31 | web.close() 32 | print("该歌曲热评已保存完毕!") 33 | 34 | 35 | def save_data(title, item): 36 | data = json.dumps(item, indent=1, ensure_ascii=False) 37 | with open(f"data/{title}_热评.json", "a", encoding="utf-8") as w: 38 | w.write(data + ",") 39 | 40 | 41 | def get_music_id(music_name): 42 | url = f"http://music.163.com/api/search/get/?s={music_name}&limit=1&type=1" 43 | headers = { 44 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36 Edg/100.0.1185.44" 45 | } 46 | res = requests.get(url, headers=headers, ) 47 | music_id = res.json() 48 | return music_id["result"]["songs"][0]["id"] 49 | 50 | 51 | if __name__ == '__main__': 52 | music = input("请输入歌曲ID或名称:") 53 | if (len(music) == 10) and music.isdigit(): 54 | get_data(music) 55 | else: 56 | music = get_music_id(music) 57 | print("该歌曲的ID为:", music) 58 | get_data(music) 59 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 |
3 | 4 | # ✨个人爬虫练习案例✨ 5 | 6 | --- 7 | 8 | ## 入门级爬虫案例 9 | 10 | | 案例名称 | 快速定位 | 爬虫所需模块 | 11 | | :------------------------: | :--: | :------------------------: | 12 | | 获取城市实时疫情数据信息 | [yiqing_data_spider](https://github.com/cjladmin/spider_cases/tree/main/yiqing_data_spider) | re、requests | 13 | | 保存喜马拉雅免费音频 | [ximalaya_audio_spider](https://github.com/cjladmin/spider_cases/tree/main/ximalaya_audio_spider) | requests | 14 | | 获取异步社区所有图书信息 | [yibu_book_spider](https://github.com/cjladmin/spider_cases/tree/main/yibu_book_spider) | requests、pymongo | 15 | | 爬取房天下全部的楼盘数据 | [fangtianxia_spider](https://github.com/cjladmin/spider_cases/tree/main/fangtianxia_spider) | requests、time、json、lxml、re | 16 | | 获取LOL道聚城皮肤信息 | [lol_skins_spider](https://github.com/cjladmin/spider_cases/tree/main/lol_skins_spider) | requests、re、time、json | 17 | | 获取LOL全英雄的资料信息 | [lol_hero_message_spider](https://github.com/cjladmin/spider_cases/tree/main/lol_hero_message_spider) | requests、json、time | 18 | | 获取CSDN分类的热榜文章 | [csdn_hot_search_spider](https://github.com/cjladmin/spider_cases/tree/main/csdn_hot_search_spider) | requests、time、json | 19 | 20 | 21 | ## 爬虫进阶案例 22 | 23 | | 案例名称 | 快速定位 | 爬虫所需模块 | 24 | | :------------------------: | :--: | :------------------------: | 25 | | 简易的免费IP代理池 | [simple_ip_proxy_pool](https://github.com/cjladmin/spider_cases/tree/main/simple_ip_proxy_pool) | lxml、request、json、random、threading、asyncio、aiohttp | 26 | | 网易云歌曲热门评论 | [163music_comment_spider](https://github.com/cjladmin/spider_cases/tree/main/163music_comment_spider) | requests、selenium、json | 27 | | 美团美食的店铺信息 | [meituan_foods_spider](https://github.com/cjladmin/spider_cases/tree/main/meituan_foods_spider) | requests、re、time、json、pymongo | 28 | | 优美图库的动漫图片 | [umeitu_dongman_spider](https://github.com/cjladmin/spider_cases/tree/main/umeitu_dongman_spider) | requests、aiohttp、asyncio、lxml | 29 | | 下载抖音作者视频 | [douyin_video_spider](https://github.com/cjladmin/spider_cases/tree/main/douyin_video_spider) | re、os、time、requests、selenium | 30 | | 下载B站非会员视频 | [bilibili_vedio_spider](https://github.com/cjladmin/spider_cases/tree/main/bilibili_vedio_spider) | requests、time、json、subprocess、os | 31 | | 猫眼电影实时数据 | [maoyan_data_spider](https://github.com/cjladmin/spider_cases/tree/main/maoyan_data_spider) | requests、lxml、openpyxl | 32 | | 虎牙在播的所有用户 | [huya_all_types_spider](https://github.com/cjladmin/spider_cases/tree/main/huya_all_types_spider) | threading、requests、json、time、lxml、openpyxl、random | 33 | | 豆瓣年度电影榜单 | [douban_annual_list_spider](https://github.com/cjladmin/spider_cases/tree/main/douban_annual_list_spider) | re、time、requests、openpyxl、selenium | 34 | | 拉钩网的招聘信息 | [lagou_jobs_spider](https://github.com/cjladmin/spider_cases/tree/main/lagou_jobs_spider) | os、csv、playwright | 35 | | 12306的车次信息 | [12306_query_spider](https://github.com/cjladmin/spider_cases/tree/main/12306_query_spider) | requests、json、openpyxl、prettytable | 36 | | 淘宝搜索页商品数据 | [taobao_commodity_spider](https://github.com/cjladmin/spider_cases/tree/main/taobao_commodity_spider) | re、time、random、selenium、openpyxl | 37 | 38 | ## JS逆向案例 39 | 40 | | 案例名称 | 快速定位 | 爬虫所需模块 | 41 | | :------------------------: | :--: | :------------------------: | 42 | | 百度在线翻译接口 | [baidu_fanyi_spider](https://github.com/cjladmin/spider_cases/tree/main/baidu_fanyi_spider) | requests、execjs | 43 | | 有道在线翻译接口 | [youdao_fanyi_spider](https://github.com/cjladmin/spider_cases/tree/main/youdao_fanyi_spider) | requests、hashlib、time | 44 | | 东方财富个股人气榜 | 
[gupiao_rank_spider](https://github.com/cjladmin/spider_cases/tree/main/gupiao_rank_spider) | requests、time、json、openpyxl、Crypto、base64 | 45 | 46 | ## 字体反爬案例 47 | 48 | | 案例名称 | 快速定位 | 爬虫所需模块 | 49 | | :------------------------: | :--: | :------------------------: | 50 | | 大众点评商铺信息 | [dianping_font_decrypt](https://github.com/cjladmin/spider_cases/tree/main/dianping_font_decrypt) | requests、fontTools、json、lxml、re | 51 | | 大众点评评论信息 | [dianping_comment_spider](https://github.com/cjladmin/spider_cases/tree/main/dianping_comment_spider) | requests、lxml、re | 52 | | 汽车之家口碑信息 | [autohome_wom_spider](https://github.com/cjladmin/spider_cases/tree/main/autohome_wom_spider) | requests、lxml、re、fontTools | 53 | | 自如网房价信息 | [ziroom_message_spider](https://github.com/cjladmin/spider_cases/tree/main/ziroom_message_spider) | requests、lxml、re、time、pytesseract、PIL | 54 | 55 |
56 | 57 | -------------------------------------------------------------------------------- /autohome_wom_spider/README.md: -------------------------------------------------------------------------------- 1 | ##
✨获取汽车之家口碑信息✨
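下方「字体反爬大体思路」的第 2、3 步(用 `fontTools` 把字形编号和文字建立映射,并把 `uni` 换成源码中的 `&#x` 形式)可以浓缩成这个草图;其中 `words` 需与目标字体的字形顺序一一对应,属于站点约定,处理方式与本目录 run_spider.py 一致:

```python
# 假设性草图:把字体文件的字形编号映射成真实文字,键的形式与页面源码中的 &#xXXXX; 对齐
from fontTools.ttLib import TTFont


def build_font_map(font_path: str, words: str) -> dict:
    font = TTFont(font_path)
    glyph_names = font.getGlyphOrder()[1:]      # 第 0 项通常不是汉字,跳过
    return {
        name.replace("uni", "&#x").lower() + ";": word
        for name, word in zip(glyph_names, words)
    }


# 用法:fonts_dic = build_font_map("font.ttf", words),再逐个替换页面源码中的键即可
```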
2 | - 随便拿一条口碑信息做个测试:[https://k.autohome.com.cn/detail/view_01g5ryk7f66gt34d9p6wvg0000.html](https://k.autohome.com.cn/detail/view_01g5ryk7f66gt34d9p6wvg0000.html) 3 | 4 | ```python 5 | 字体反爬大体思路: 6 | 1. 通过口碑页面的源码中,提取'.ttf'字体链接,并保存该字体 7 | 2. 使用'fontTools'模块提取字体中所有的文字编号,后将这些编号和文字建立映射 8 | 3. 建立映射时,需把字体编号的'uni'替换为和源码中相同的'&#x'形式 9 | 4. 最后将页面源码中的加密文字进行替换即可 10 | ``` 11 | 12 | - 该爬虫使用到的模块: 13 | - requests 14 | - re 15 | - lxml 16 | - fontTools 17 | -------------------------------------------------------------------------------- /autohome_wom_spider/run_spider.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/6/29 21:47 3 | # @Author : Torres-圣君 4 | # @File : run_spider.py 5 | # @Software : PyCharm 6 | import requests 7 | import re 8 | from lxml import etree 9 | from fontTools.ttLib import TTFont 10 | 11 | url = "https://k.autohome.com.cn/detail/view_01g5ryk7f66gt34d9p6wvg0000.html" 12 | headers = { 13 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36 Edg/103.0.1264.37" 14 | } 15 | 16 | 17 | def run(): 18 | res = decrypt_font() 19 | html = etree.HTML(res) 20 | # 随便提取一段内容,只做测试 21 | content = ''.join(html.xpath('//div[@class="kb-con"]/div[1]/p//text()')) 22 | print(content) 23 | 24 | 25 | def decrypt_font(): 26 | page_res = download_font() 27 | words = "不是门味机控量启六低多排性二灯近光雨问过十无耗油和短级远得右比真中硬八加来三音着孩实好七内更有长四身坐保下地冷外养软高响呢的电很自盘一开小副左里九五档当路手泥公动上只了少空皮大矮坏" 28 | font = TTFont('font.ttf') 29 | font_list = font.getGlyphOrder()[1:] 30 | fonts_dic = {} 31 | for i, v in enumerate(words): 32 | num_char = font_list[i].replace("uni", "&#x").lower() + ';' 33 | fonts_dic[num_char] = v 34 | for i in fonts_dic: 35 | if str(i) in page_res: 36 | page_res = page_res.replace(str(i), fonts_dic[i].replace(';', '')) 37 | return page_res 38 | 39 | 40 | def download_font(): 41 | # 获取口碑字体链接 42 | res = requests.get(url, headers=headers).text 43 | font_url = re.findall('href="(.*?)\.ttf"', res)[0] + ".ttf" 44 | print("字体链接:", font_url) 45 | # 保存该字体 46 | font_data = requests.get(font_url, headers=headers).content 47 | with open('font.ttf', 'wb') as w: 48 | w.write(font_data) 49 | print("字体保存完成!") 50 | return res 51 | 52 | 53 | if __name__ == '__main__': 54 | run() 55 | -------------------------------------------------------------------------------- /baidu_fanyi_spider/README.md: -------------------------------------------------------------------------------- 1 | ##
✨百度在线翻译接口✨
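下文分析得到的动态参数 `sign` 来自 js 函数 `e`,调用方式与本目录 run_spider.py 一致,可简化成下面的草图(需要本地有可用的 JS 运行环境,例如 Node):

```python
# 假设性草图:用 execjs 加载本目录的 demo.js 并调用函数 e,得到 sign 参数
import execjs


def get_sign(query: str) -> str:
    with open("demo.js", "r") as f:
        ctx = execjs.compile(f.read())
    # 返回形如 "123456.654321" 的字符串,随 query 变化
    return ctx.call("e", query)


# 用法:data["sign"] = get_sign("hello")
```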
2 | - 百度在线翻译:[https://fanyi.baidu.com/translate](https://fanyi.baidu.com/translate) 3 | 4 | ```python 5 | 通过抓包获取到接口后,查看其携带的参数信息 6 | 通过对参数的分析得出: 7 | 'from': 初始内容的语言 8 | 'to': 需要翻译成那种语言 9 | 'query': 需要翻译的内容 10 | 'sign':动态加密参数,通过断点调试发现,该参数是通过一个名为'e'的函数返回的,找到该函数后直接copy函数源码到本地文件,调试时发现缺少'i'和'n',在对源码进行分析,得出'i'为固定值,而'n'则直接复制其上面的函数'n'即可 11 | 除此之外,其余的参数则都为固定值 12 | 参数都解决完成后,携带这些参数对接口发送请求即可 13 | ``` 14 | 15 | - 该爬虫使用到的模块: 16 | - requests 17 | - execjs 18 | -------------------------------------------------------------------------------- /baidu_fanyi_spider/demo.js: -------------------------------------------------------------------------------- 1 | function n(r, o) { 2 | for (var t = 0; t < o.length - 2; t += 3) { 3 | var a = o.charAt(t + 2); 4 | a = a >= "a" ? a.charCodeAt(0) - 87 : Number(a), 5 | a = "+" === o.charAt(t + 1) ? r >>> a : r << a, 6 | r = "+" === o.charAt(t) ? r + a & 4294967295 : r ^ a 7 | } 8 | return r 9 | } 10 | 11 | function e(r) { 12 | var i = '320305.131321201' 13 | var o = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g); 14 | if (null === o) { 15 | var t = r.length; 16 | t > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(t / 2) - 5, 10) + r.substr(-10, 10)) 17 | } else { 18 | for (var e = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), C = 0, h = e.length, f = []; h > C; C++) 19 | "" !== e[C] && f.push.apply(f, a(e[C].split(""))), 20 | C !== h - 1 && f.push(o[C]); 21 | var g = f.length; 22 | g > 30 && (r = f.slice(0, 10).join("") + f.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + f.slice(-10).join("")) 23 | } 24 | var u = void 0 25 | , l = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107); 26 | u = null !== i ? i : (i = window[l] || "") || ""; 27 | for (var d = u.split("."), m = Number(d[0]) || 0, s = Number(d[1]) || 0, S = [], c = 0, v = 0; v < r.length; v++) { 28 | var A = r.charCodeAt(v); 29 | 128 > A ? S[c++] = A : (2048 > A ? S[c++] = A >> 6 | 192 : (55296 === (64512 & A) && v + 1 < r.length && 56320 === (64512 & r.charCodeAt(v + 1)) ? (A = 65536 + ((1023 & A) << 10) + (1023 & r.charCodeAt(++v)), 30 | S[c++] = A >> 18 | 240, 31 | S[c++] = A >> 12 & 63 | 128) : S[c++] = A >> 12 | 224, 32 | S[c++] = A >> 6 & 63 | 128), 33 | S[c++] = 63 & A | 128) 34 | } 35 | for (var p = m, F = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), D = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), b = 0; b < S.length; b++) 36 | p += S[b], 37 | p = n(p, F); 38 | return p = n(p, D), 39 | p ^= s, 40 | 0 > p && (p = (2147483647 & p) + 2147483648), 41 | p %= 1e6, 42 | p.toString() + "." 
+ (p ^ m) 43 | } 44 | -------------------------------------------------------------------------------- /baidu_fanyi_spider/run_spider.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/7/2 22:51 3 | # @Author : Torres-圣君 4 | import requests 5 | import execjs 6 | 7 | 8 | def run(text): 9 | url = "https://fanyi.baidu.com/v2transapi?from=en&to=zh" 10 | headers = { 11 | "Cookie": "BIDUPSID=3A2B984F3D5B346B085AD5B5865CD243; PSTM=1620650820; __yjs_duid=1_90335d2ecbd02d6fabe6c844263de3b31620818982804; REALTIME_TRANS_SWITCH=1; HISTORY_SWITCH=1; FANYI_WORD_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; MCITY=-%3A; BAIDUID=9E5B8CC930FAAF06F999A3B216168E7A:FG=1; APPGUIDE_10_0_2=1; BDUSS=JSbjBTaDAzVkRNYzJnbVJPY35mZmZEQTNxN0JCb1lBSXNvd2lieVJ5ZnZkOUZpRVFBQUFBJCQAAAAAAAAAAAEAAADkEyraVG9ycmVzyqW-~QAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAO~qqWLv6qliQW; BDUSS_BFESS=JSbjBTaDAzVkRNYzJnbVJPY35mZmZEQTNxN0JCb1lBSXNvd2lieVJ5ZnZkOUZpRVFBQUFBJCQAAAAAAAAAAAEAAADkEyraVG9ycmVzyqW-~QAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAO~qqWLv6qliQW; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1655206845,1655616330; ZFY=XmZ2OfobadeEG7DawlXrwhekRikKh5VFZ62dA6HB2JY:C; BAIDUID_BFESS=C1F2EB92F00DFE8F8EC8F5B5023D2982:FG=1; BA_HECTOR=8k8h8l8l8la02025041hbvh2q14; ab_sr=1.0.1_Y2RmZWRhZDVjZGQ2NTc3MzY4ZWVkZmU1NzkzNjFkMWI2MjM4MjRhMDllMmY2MTVjOTcxYWZiYjY2NDFmNWNiYmYxZjEzN2JlN2EyZjRiYzNiMjY5NWZiYWI5ZTljNmYzNjlmMjgxMzEzMzJjNjViNDdmYzViNjA4YzFkZDZhMWYyZDEyYmI5MzYyOTc3MTU2NjNhODE2ZGRjMWI1MTRkZjlkNzI4OTFjY2U4OTBiYTRkYjUzN2Y5NjRkMjA1NmRl", 12 | "Referer": "https://fanyi.baidu.com/translate", 13 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.66 Safari/537.36 Edg/103.0.1264.44" 14 | } 15 | res = requests.post(url, headers=headers, data=get_data(text)) 16 | data = res.json() 17 | print(f"{'-'*100}\n", data['trans_result']['data'][0]['dst']) 18 | 19 | 20 | def get_data(text): 21 | x1, x2 = is_chinese(text) 22 | data = { 23 | "from": x1, 24 | "to": x2, 25 | "query": text, 26 | "transtype": "realtime", 27 | "simple_means_flag": "3", 28 | "sign": decrypt_params(text), 29 | "token": "8f8d536955d93b439ca12bb1977f5195", 30 | "domain": "common", 31 | } 32 | # print(data) 33 | return data 34 | 35 | 36 | def is_chinese(check_str): 37 | # 判断字符串中是否含有中文 38 | for ch in check_str: 39 | if u'⼀' <= ch <= u'󰀀': 40 | return "zh", "en" 41 | else: 42 | return "en", "zh" 43 | 44 | 45 | def decrypt_params(text): 46 | with open("demo.js", 'r') as r: 47 | # 读取js文件,使用compile加载js代码并执行 48 | js = r.read() 49 | js_func = execjs.compile(js) 50 | js_data = js_func.call("e", text) 51 | # print(js_data) 52 | return js_data 53 | 54 | 55 | if __name__ == '__main__': 56 | content = input("请输入需要翻译的内容:") 57 | run(content) 58 | -------------------------------------------------------------------------------- /bilibili_vedio_spider/README.md: -------------------------------------------------------------------------------- 1 | ##
✨下载B站非会员专享的视频✨
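B 站的音频流和视频流是分开下载的,最后一步要靠 `ffmpeg` 合并;下面是与 run_spider.py 中拼接的命令等价的最小草图(假设 `ffmpeg` 已加入环境变量):

```python
# 假设性草图:调用 ffmpeg 把分开下载的视频流与音频流合并成一个 mp4
import subprocess


def merge_av(video_path: str, audio_path: str, out_path: str):
    cmd = [
        "ffmpeg", "-y",
        "-i", video_path,          # 视频流
        "-i", audio_path,          # 音频流
        "-c:v", "copy",            # 视频不重新编码,直接复制
        "-c:a", "aac",             # 音频转 AAC,与 run_spider.py 的参数一致
        out_path,
    ]
    subprocess.run(cmd, check=True)
```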
2 | - bilibili官网:[https://www.bilibili.com/](https://www.bilibili.com/) 3 | 4 | - 该爬虫需要用到`ffmpeg` -> 下载链接:[http://ffmpeg.org/download.html](http://ffmpeg.org/download.html) 5 | - 下载后,需要将其`bin`目录添加进环境变量 6 | 7 | - 支持一次性下载多个视频,只需输入视频的链接或BV即可 8 | 9 | - 该爬虫使用到的模块: 10 | - requests 11 | - time 12 | - json 13 | - subprocess 14 | - os 15 | -------------------------------------------------------------------------------- /bilibili_vedio_spider/run_spider.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/5/9 12:16 3 | # @Author : Torres-圣君 4 | # @File : download_fonts.py 5 | # @Sofaware : PyCharm 6 | import requests 7 | import re 8 | import json 9 | import subprocess 10 | import os 11 | 12 | 13 | class DownloadVideo: 14 | def __init__(self, url_list: list): 15 | self.task_url = url_list 16 | self.headers = { 17 | "Referer": "https://www.bilibili.com/", 18 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39", 19 | } 20 | 21 | def run(self): 22 | for task in self.task_url: 23 | if "https://www.bilibili.com/video/" in task: 24 | url = task 25 | else: 26 | url = "https://www.bilibili.com/video/" + task 27 | res = requests.get(url, headers=self.headers) 28 | print(f"正在提取{url}...") 29 | self.format_data(res.text) 30 | 31 | def format_data(self, text_data): 32 | print("正在提取视频数据 > > >") 33 | video_title = re.findall(r'
', text_data)[0] 34 | mp4_url_data = re.findall(r"", text_data)[0] 35 | json_data = json.loads(mp4_url_data) 36 | audio_url = json_data["data"]["dash"]["audio"][0]["baseUrl"] 37 | video_url = json_data["data"]["dash"]["video"][0]["baseUrl"] 38 | print("数据提取完毕...") 39 | self.save_data(video_title, audio_url, video_url) 40 | 41 | def save_data(self, video_title, audio_url, video_url): 42 | try: 43 | os.mkdir('./data') 44 | finally: 45 | # 保存音频数据 46 | print("正在保存音频数据...") 47 | audio_data = requests.get(audio_url, headers=self.headers).content 48 | with open(f"./data/{video_title}.mp3", "wb") as w: 49 | w.write(audio_data) 50 | print("音频数据保存完毕...") 51 | 52 | # 保存视频数据 53 | print("正在保存视频数据...") 54 | video_data = requests.get(video_url, headers=self.headers).content 55 | with open(f"./data/{video_title}.mp4", "wb") as w: 56 | w.write(video_data) 57 | print("视频数据保存完毕...") 58 | 59 | # 合并音频和视频 60 | self.combined_data(video_title) 61 | 62 | def combined_data(self, video_title): 63 | # 需要将ffmpeg配置到环境变量 64 | final_data = f'ffmpeg -i data/{video_title}.mp4 -i data/{video_title}.mp3 -c:v copy -c:a aac -strict experimental data/_{video_title}.mp4' 65 | subprocess.run(final_data, shell=True) 66 | # os.system(final_data) 67 | self.move_other(video_title) 68 | 69 | def move_other(self, video_title): 70 | os.remove(f'./data/{video_title}.mp3') 71 | os.remove(f'./data/{video_title}.mp4') 72 | 73 | 74 | if __name__ == '__main__': 75 | task = [] 76 | name = input("请输入视频链接:") 77 | while name != 'q': 78 | task.append(name) 79 | name = input("继续输入视频链接(输入q结束):") 80 | spider = DownloadVideo(task) 81 | spider.run() 82 | -------------------------------------------------------------------------------- /csdn_hot_search_spider/README.md: -------------------------------------------------------------------------------- 1 | ##
✨获取CSDN博客所有分类的热榜文章✨
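热榜数据来自一个分页接口,单次请求的最小草图如下(参数含义与下文 get_data.py 一致,接口与字段均为站点约定,以实际返回为准):

```python
# 假设性草图:请求某个分类下单页(25 条)热榜文章
import requests


def fetch_hot_rank(channel: str, page: int = 0) -> list:
    url = "https://blog.csdn.net/phoenix/web/blog/hot-rank"
    params = {"page": page, "pageSize": 25, "child_channel": channel, "type": None}
    headers = {"user-agent": "Mozilla/5.0"}
    data = requests.get(url, headers=headers, params=params).json()
    return [
        {"标题": d["articleTitle"], "作者": d["nickName"], "链接": d["articleDetailUrl"]}
        for d in (data.get("data") or [])
    ]
```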
2 | - CSDN博客官网:[https://www.csdn.net/](https://www.csdn.net/) 3 | 4 | - 爬取CSDN所有分类的热榜文章信息,包含: 5 | - 文章标题 6 | - 文章作者 7 | - 文章链接 8 | - 文章浏览数 9 | - 文章收藏数 10 | - 文章评论数 11 | - 爬取的数据存储方式: 12 | - 通过w写模式,将爬取的数据存储到`data/`文件夹下的json文件 13 | - 该爬虫使用到的模块: 14 | - requests 15 | - time 16 | - json 17 | -------------------------------------------------------------------------------- /csdn_hot_search_spider/get_data.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/4/16 19:32 3 | # @Author : Torres-圣君 4 | # @File : get_data.py 5 | # @Sofaware : PyCharm 6 | import requests 7 | from .ua_pool import * 8 | 9 | 10 | class CsdnHot: 11 | def __init__(self, page_, type_): 12 | # 页面数据起始的URL 13 | self.url = f"https://blog.csdn.net/phoenix/web/blog/hot-rank" 14 | # 自定义请求头 15 | self.headers = { 16 | "user-agent": get_user_agent(), 17 | } 18 | # 发送请求时需要携带的参数 19 | self.params = { 20 | "page": page_, 21 | "pageSize": 25, 22 | "child_channel": type_, 23 | "type": None 24 | } 25 | 26 | def get_data(self): 27 | # 创建一个空列表,用于保存所有字典数据 28 | list_data = [] 29 | # 模拟请求 30 | res = requests.get(self.url, headers=self.headers, params=self.params) 31 | # 获取页面返回的数据,并转换为json 32 | data = res.json() 33 | for i in range(0, 25): 34 | # 创建空字典 35 | item = {} 36 | # 文章标题 37 | item["标题"] = data["data"][i]["articleTitle"] 38 | # 文章作者 39 | item["作者"] = data["data"][i]["nickName"] 40 | # 文章链接 41 | item["链接"] = data["data"][i]["articleDetailUrl"] 42 | # 文章浏览数 43 | item["浏览数"] = data["data"][i]["viewCount"] 44 | # 文章收藏数 45 | item["收藏数"] = data["data"][i]["commentCount"] 46 | # 文章评论数 47 | item["评论数"] = data["data"][i]["favorCount"] 48 | # 向列表内追加数据 49 | list_data.append(item) 50 | # 返回列表数据 51 | return list_data 52 | -------------------------------------------------------------------------------- /csdn_hot_search_spider/run_spider.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/4/16 19:30 3 | # @Author : Torres-圣君 4 | # @File : get_page_data.py 5 | # @Sofaware : PyCharm 6 | from .get_data import * 7 | from .save_data import * 8 | import time 9 | 10 | 11 | # 获取所有文章分类 12 | def get_type(): 13 | all_type_url = 'https://img-home.csdnimg.cn/data_json/jsconfig/rank_nav_list.json' 14 | headers = { 15 | "user-agent": get_user_agent() 16 | } 17 | res = requests.get(all_type_url, headers=headers) 18 | type_json = res.json() 19 | return type_json["list"] 20 | 21 | 22 | def run(): 23 | # 调用get_type方法,获取所有文章分类 24 | all_type_list = get_type() 25 | # 实例化SavaData类 26 | b = SavaData() 27 | for i in range(0, len(all_type_list)): 28 | # 获取具体分类的名称 29 | type_ = all_type_list[i]["type"] 30 | print(f"开始获取<{type_}>的热榜文章!") 31 | # 创建文件并打开 32 | b.open(type_) 33 | for j in range(0, 8): 34 | # 实例化CsdnHot类 35 | d = CsdnHot(j, type_) 36 | # 调用get_data方法请求目标网址 37 | data = d.get_data() 38 | # 调用save_data方法保存返回的数据 39 | b.sava_data(j, data) 40 | # break 41 | # 关闭文件 42 | b.close() 43 | print(f"<{type_}>的热榜抓取完毕!") 44 | # 每切换一种分类就休息2秒,防止访问频率过快被封IP 45 | time.sleep(2) 46 | # break 47 | 48 | 49 | if __name__ == '__main__': 50 | run() 51 | -------------------------------------------------------------------------------- /csdn_hot_search_spider/save_data.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/4/16 19:49 3 | # @Author : Torres-圣君 4 | # @File : save_data.py 5 | # @Sofaware : PyCharm 6 | import json 7 | import time 8 | 9 | 10 | class SavaData: 11 | def open(self, 
type_): 12 | self.w = open(rf"data/{type_.replace('/','_')}.json", "w", encoding="utf-8") 13 | 14 | def sava_data(self, page, data): 15 | # 获取当前日期 16 | t = time.localtime() 17 | now = time.strftime("%Y-%m-%d %H:%M", t) 18 | # 将日期和数据合成字典 19 | item = { 20 | f"<第{page+1}页> | {now}": data 21 | } 22 | # 将字典转为JSON格式 23 | data = json.dumps(item, indent=1, ensure_ascii=False) 24 | # 写入数据 25 | self.w.write(data) 26 | 27 | def close(self): 28 | self.w.close() 29 | -------------------------------------------------------------------------------- /csdn_hot_search_spider/ua_pool.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/4/16 21:13 3 | # @Author : Torres-圣君 4 | # @File : ua_pool.py 5 | # @Sofaware : PyCharm 6 | import random 7 | 8 | 9 | def get_user_agent(): 10 | # UA池 11 | user_agent_list = [ 12 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 13 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 14 | "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0", 15 | "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko", 16 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)", 17 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)", 18 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)", 19 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)", 20 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 21 | "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 22 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11", 23 | "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11", 24 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", 25 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)", 26 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)", 27 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)", 28 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)", 29 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)", 30 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)", 31 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)", 32 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)", 33 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", 34 | "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", 35 | "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", 36 | "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 37 | "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 38 | "Opera/9.80 (Android 2.3.4; Linux; Opera 
Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10", 39 | "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13", 40 | "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+", 41 | "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0", 42 | "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124", 43 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)", 44 | "UCWEB7.0.2.37/28/999", 45 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.39", 46 | "NOKIA5700/ UCWEB7.0.2.37/28/999", 47 | "Openwave/ UCWEB7.0.2.37/28/999", 48 | "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999", 49 | "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25", 50 | ] 51 | # 设置UA伪装 52 | return random.choice(user_agent_list) -------------------------------------------------------------------------------- /dianping_comment_spider/README.md: -------------------------------------------------------------------------------- 1 | ##
✨获取大众点评店铺评论信息✨
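下方思路中「建立映射」的第 1 步(文字 css 样式的 class 值 → 文字坐标)本质上是解析 css 里的 background 偏移,单独抽出来是这个草图(css 规则的具体形式以实际抓到的样式文件为准):

```python
# 假设性草图:从 css 文本中解析「class 名 -> (x 偏移, y 偏移)」的映射
import re


def parse_css_offsets(css_text: str) -> dict:
    # 匹配形如 .xxxx{background:-14.0px -2256.0px;} 的规则
    pattern = re.compile(r'\.(\w+)\{background:(-?\d+)\.0px (-?\d+)\.0px;\}')
    return {name: (int(x), int(y)) for name, x, y in pattern.findall(css_text)}
```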
2 | - 随便拿一家店铺的评论做个测试:[https://www.dianping.com/shop/H1XZuxIfuHl8meAJ/review_all](https://www.dianping.com/shop/H1XZuxIfuHl8meAJ/review_all) 3 | 4 | - 获取大众点评店铺评论信息,包含: 5 | - 用户昵称 6 | - 发布日期 7 | - 评分 8 | - 内容 9 | 10 | ```python 11 | 运行该程序前,需先将cookie补充完整 12 | 字体反爬大体思路: 13 | 1. 通过店铺页面的源码中,提取字体所在的css文件链接 14 | 2. 在css文件中找到所需字体的链接,通过re正则提取SVG的链接 15 | 3. 建立映射: 16 | 1. 根据css样式源码,建立'文字css样式的class值'和'文字的坐标值'的映射 17 | 2. 根据SVG的`path`标签内'id'和'd'属性,建立行号和行高的映射 18 | 3. 根据SVG的`textPath`标签内'href'和'textLength'属性,建立文字坐标和对应的文字的映射 19 | 4. 建立映射完成后,替换页面源码中的所有加密文字,最后提取数据即可 20 | 注:与店铺信息的字体反爬不同,该字体反爬只适用于'用户评论中字体的解密' 21 | ``` 22 | 23 | - 该爬虫使用到的模块: 24 | - requests 25 | - lxml 26 | - re 27 | -------------------------------------------------------------------------------- /dianping_comment_spider/decrypt_fonts.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/6/27 16:56 3 | # @Author : Torres-圣君 4 | # @File : decrypt_fonts.py 5 | # @Software : PyCharm 6 | import requests 7 | import re 8 | 9 | 10 | def decrypt_font(html): 11 | print("正在获取css链接!") 12 | css_link = get_css_link(html) 13 | print("正在获取svg链接!") 14 | svg_res, css_res = get_svg_link(css_link) 15 | 16 | # 创建空字典,用于存放'文字css样式的class值'和'文字的坐标值' 17 | css_dict = {} 18 | xy_list = re.findall('.(.*?){background:(.*?).0px (.*?).0px;}', css_res) 19 | for css in xy_list: 20 | css_dict[css[0]] = (int(css[1]), int(css[2])) 21 | 22 | # 创建空字典,用于存放'id对应的值'和'文字所在的y坐标' 23 | text_height_dict = {} 24 | # 提取所有id和y坐标信息 25 | defs_list = re.findall('', svg_res) 26 | for height in defs_list: 27 | text_height_dict[height[0]] = height[1] 28 | 29 | # 创建空字典,用于存放'文字坐标'和'对应的文字' 30 | word_dict = {} 31 | # 提取所有行号、y坐标、行文字 32 | text_list = re.findall('(.*?)', svg_res) 33 | for row in text_list: 34 | for word in row[2]: 35 | # 使用线性回归得出相应公式,从而计算出文字的坐标信息 36 | word_dict[((row[2].index(word) + 1) * -14 + 14, int(text_height_dict[row[0]]) * -1 + 23)] = word 37 | 38 | # 提取页面源码中加密的文字 39 | fonts_dic = {f'': word_dict.get(css_dict[i], '*') for i in css_dict} 40 | # 替换页面源码所有加密文字 41 | for key in fonts_dic: 42 | html = html.replace(key, fonts_dic[key]) 43 | return html 44 | 45 | 46 | def get_css_link(html): 47 | # 使用re提取css样式链接 48 | query_css_link = re.findall('href="//s3plus.meituan.net/v1/(.*?)"', html)[0] 49 | css_link = 'https://s3plus.meituan.net/v1/' + query_css_link 50 | print(css_link) 51 | return css_link 52 | 53 | 54 | def get_svg_link(css_link): 55 | # 对css链接发送请求,并使用正则提取SVG的链接 56 | css_res = requests.get(css_link).text 57 | svg_link = 'https:' + re.findall(r'class\^="qxu".*url\((.*?)\);', css_res)[0] 58 | print(svg_link) 59 | # 对SVG链接发送请求,获取其中的文字 60 | svg_res = requests.get(svg_link).text 61 | return svg_res, css_res 62 | -------------------------------------------------------------------------------- /dianping_comment_spider/run_spider.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/6/27 10:05 3 | # @Author : Torres-圣君 4 | # @File : run_spider.py 5 | # @Software : PyCharm 6 | import requests 7 | from lxml import etree 8 | from decrypt_fonts import decrypt_font 9 | 10 | 11 | class DetailedData: 12 | def __init__(self, url): 13 | self.url = url 14 | self.headers = { 15 | 'Cookie': '', 16 | 'Referer': url, 17 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.44' 18 | } 19 | 20 | def run(self): 21 | res = requests.get(url=self.url, 
headers=self.headers).text 22 | # 开始解密页面源码 23 | html = etree.HTML(decrypt_font(res)) 24 | li_list = html.xpath('//div[@class="reviews-items"]/ul/li') 25 | for li in li_list: 26 | # 用户昵称 27 | name = li.xpath('.//a[@class="name"]/text()')[0].strip() 28 | # 发布日期 29 | date = li.xpath('.//span[@class="time"]/text()')[0].strip() 30 | # 评分 31 | score = '.'.join(li.xpath('.//div[@class="review-rank"]/span[1]/@class')[0].split()[1][-2:]) 32 | # 内容 33 | comment = ''.join(li.xpath('.//div[contains(@class,"review-words")]/text()')).replace('\n', '').strip() 34 | item = [name, date, score, comment] 35 | print(item) 36 | 37 | 38 | if __name__ == '__main__': 39 | # 随便拿一家店铺的评论做个测试 40 | url = 'https://www.dianping.com/shop/H1XZuxIfuHl8meAJ/review_all' 41 | dd = DetailedData(url) 42 | dd.run() 43 | -------------------------------------------------------------------------------- /dianping_font_decrypt/README.md: -------------------------------------------------------------------------------- 1 | ##
✨获取大众点评商铺详细信息✨
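建立映射前,可以先用 `fontTools` 看一眼 woff 的字形编号顺序,确认能和字符表 `words` 对上(本目录 decrypt_fonts.py 中用 `[2:]` 跳过了前两个非汉字字形),草图如下:

```python
# 假设性草图:查看已下载 woff 字体的字形编号,核对与字符表的对应关系
from fontTools.ttLib import TTFont

font = TTFont('./woff/num.woff')         # 路径与 download_fonts.py 的保存位置一致
glyph_names = font.getGlyphOrder()
print(len(glyph_names))                  # 总字形数
print(glyph_names[:10])                  # 前两项通常不是汉字,之后为 uniXXXX 形式
```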
2 | - 随便拿一家店铺做个测试:[https://www.dianping.com/shop/H1XZuxIfuHl8meAJ](https://www.dianping.com/shop/H1XZuxIfuHl8meAJ) 3 | 4 | - 获取大众点评商铺详细信息,包含: 5 | - 店名 6 | - 评分 7 | - 评价数 8 | - 人均消费 9 | - 口味评分 10 | - 环境评分 11 | - 服务评分 12 | - 地址 13 | - 电话 14 | 15 | ```python 16 | 运行该程序前,需先将程序入口的cookie补充完整 17 | 字体反爬大体思路: 18 | 1. 通过店铺页面的源码中,提取字体所在的css文件链接 19 | 2. 在css文件中找到所需字体的链接,通过re正则提取链接,并使用requests将其下载到本地 20 | 3. 在店铺页面源码中查看自己所需数据对应的字体样式,看引入的是那种字体,找到后将其建立映射 21 | 4. 建立映射完成后,就可以复原字体的原内容了,这里可以先将页面源码还原再提取所需的数据,也可先提取所需的数据再将其数据字体还原 22 | 注:该方法只适用于获取店铺的详细信息。如果要获取不同分类下的店铺数据,需要修改`download_fonts.py`下的字体列表`tags = ['tagName', 'reviewTag', 'address', 'shopNum']`,并重复步骤3的操作即可 23 | ``` 24 | 25 | - 该爬虫使用到的模块: 26 | - requests 27 | - fontTools 28 | - json 29 | - lxml 30 | - re 31 | -------------------------------------------------------------------------------- /dianping_font_decrypt/decrypt_fonts.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/6/25 15:36 3 | # @Author : Torres-圣君 4 | # @File : decrypt_fonts.py 5 | # @Software : PyCharm 6 | import json 7 | from fontTools.ttLib import TTFont 8 | 9 | 10 | def decrypt_font(page_res): 11 | # 读取解密字典 12 | with open('./woff/fonts_dic.json', 'r', encoding='utf-8') as r: 13 | fonts_dic = json.loads(r.read()) 14 | # 替换网页源码中的所有加密文字,当然也可以在提取到数据后再解密 15 | for i in fonts_dic: 16 | if str(i) in page_res: 17 | page_res = page_res.replace(str(i), fonts_dic[i].replace(';', '')) 18 | # 返回解密后的网页源码 19 | return page_res 20 | 21 | 22 | def save_fonts_dic(): 23 | words = '1234567890店中美家馆小车大市公酒行国品发电金心业商司超生装园场食有新限天面工服海华水房饰城乐汽香部利子老艺花专东肉菜学福饭人百餐茶务通味所山区门药银农龙停尚安广鑫一容动南具源兴鲜记时机烤文康信果阳理锅宝达地儿衣特产西批坊州牛佳化五米修爱北养卖建材三会鸡室红站德王光名丽油院堂烧江社合星货型村自科快便日民营和活童明器烟育宾精屋经居庄石顺林尔县手厅销用好客火雅盛体旅之鞋辣作粉包楼校鱼平彩上吧保永万物教吃设医正造丰健点汤网庆技斯洗料配汇木缘加麻联卫川泰色世方寓风幼羊烫来高厂兰阿贝皮全女拉成云维贸道术运都口博河瑞宏京际路祥青镇厨培力惠连马鸿钢训影甲助窗布富牌头四多妆吉苑沙恒隆春干饼氏里二管诚制售嘉长轩杂副清计黄讯太鸭号街交与叉附近层旁对巷栋环省桥湖段乡厦府铺内侧元购前幢滨处向座下臬凤港开关景泉塘放昌线湾政步宁解白田町溪十八古双胜本单同九迎第台玉锦底后七斜期武岭松角纪朝峰六振珠局岗洲横边济井办汉代临弄团外塔杨铁浦字年岛陵原梅进荣友虹央桂沿事津凯莲丁秀柳集紫旗张谷的是不了很还个也这我就在以可到错没去过感次要比觉看得说常真们但最喜哈么别位能较境非为欢然他挺着价那意种想出员两推做排实分间甜度起满给热完格荐喝等其再几只现朋候样直而买于般豆量选奶打每评少算又因情找些份置适什蛋师气你姐棒试总定啊足级整带虾如态且尝主话强当更板知己无酸让入啦式笑赞片酱差像提队走嫩才刚午接重串回晚微周值费性桌拍跟块调糕' 24 | # 这里咱们想要获取的数据,在 num 和 address 中都有涉及 25 | font_num = TTFont('./woff/num.woff') 26 | font_address = TTFont('./woff/address.woff') 27 | # 提取字体库的编码 28 | font_num_list = font_num.getGlyphOrder()[2:] 29 | font_address_list = font_address.getGlyphOrder()[2:] 30 | # 用于存放加密字体的键值对 31 | fonts_dic = {} 32 | for i, v in enumerate(words): 33 | num_char = font_num_list[i].replace("uni", "&#x").lower() + ';' 34 | fonts_dic[num_char] = v 35 | address_char = font_address_list[i].replace("uni", "&#x").lower() + ';' 36 | if address_char in fonts_dic: 37 | continue 38 | fonts_dic[address_char] = v 39 | with open('./woff/fonts_dic.json', 'w', encoding='utf-8') as w: 40 | json_data = json.dumps(fonts_dic, indent=1, ensure_ascii=False) 41 | w.write(json_data) 42 | -------------------------------------------------------------------------------- /dianping_font_decrypt/download_fonts.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/6/25 14:12 3 | # @Author : Torres-圣君 4 | # @File : download_fonts.py 5 | # @Software : PyCharm 6 | import requests 7 | import re 8 | 9 | 10 | class GetFont: 11 | def __init__(self, url, headers): 12 | self.url = url 13 | self.headers = headers 14 | 15 | def run(self): 16 | print("正在获取css链接!") 17 | css_link = self.get_css_link() 18 | print(css_link) 19 | print("正在获取字体链接!") 20 | 
font_link_list = self.get_font_link(css_link) 21 | print(font_link_list) 22 | self.save_font(font_link_list) 23 | 24 | def get_css_link(self): 25 | res = requests.get(self.url, headers=self.headers).text 26 | # 使用正则提取css样式链接 27 | query_css_link = re.findall('href="//s3plus.meituan.net/v1/(.*?)"', res)[0] 28 | css_link = "https://s3plus.meituan.net/v1/" + query_css_link 29 | return css_link 30 | 31 | def get_font_link(self, css_link): 32 | res = requests.get(css_link).text 33 | # 使用正则提取字体库的链接 34 | font_link_list = [ 35 | f"https:{i}" for i in re.findall(r'//s3plus.meituan.net/v1/mss_\w{32}/font/\w{8}.woff', res) 36 | ] 37 | return font_link_list 38 | 39 | def save_font(self, font_link_list): 40 | # 六种不同的字体库,实则有三种是一样的 41 | tags = ['review', 'hours', 'dishname', 'num', 'address', 'shopdesc'] 42 | for num, link in enumerate(font_link_list): 43 | woff_data = requests.get(link).content 44 | # 二进制写入文件,保存字体 45 | with open(f"./woff/{tags[num]}.woff", 'wb') as w: 46 | w.write(woff_data) 47 | print(f"{tags[num]} 字体保存完成!") 48 | -------------------------------------------------------------------------------- /dianping_font_decrypt/run_spider.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/6/25 16:36 3 | # @Author : Torres-圣君 4 | # @File : run_spider.py 5 | # @Software : PyCharm 6 | import requests 7 | from lxml import etree 8 | from decrypt_fonts import decrypt_font, save_fonts_dic 9 | from download_fonts import GetFont 10 | 11 | 12 | class DetailedData: 13 | def __init__(self, url, headers): 14 | self.url = url 15 | self.headers = headers 16 | 17 | def run(self): 18 | res = requests.get(self.url, headers=self.headers).text 19 | # 解密页面的字体加密数据 20 | page_res = decrypt_font(res) 21 | html = etree.HTML(page_res) 22 | # 店名 23 | title = ''.join(html.xpath('//*[@id="body"]/div/div[1]/span/text()')) 24 | # 评分 25 | score = ''.join(html.xpath('//div[@class="brief-info"]/span[1]/@title')) 26 | # 评价数 27 | comment = ''.join(html.xpath('//span[@id="reviewCount"]//text()')).replace('条评价 ', '').strip() 28 | # 人均消费 29 | price = ''.join(html.xpath('//span[@id="avgPriceTitle"]//text()')).replace('人均:', '').strip() 30 | # 口味评分 31 | taste = ''.join(html.xpath('//span[@id="comment_score"]/span[1]//text()')).replace('口味:', '').strip() 32 | # 环境评分 33 | environment = ''.join(html.xpath('//span[@id="comment_score"]/span[2]//text()')).replace('环境:', '').strip() 34 | # 服务评分 35 | service = ''.join(html.xpath('//span[@id="comment_score"]/span[3]//text()')).replace('服务:', '').strip() 36 | # 地址 37 | address = ''.join(html.xpath('//span[@id="address"]//text()')).strip() 38 | # 电话 39 | phone = ''.join(html.xpath('//p[@class="expand-info tel"]//text()')).replace('电话:', '').strip() 40 | # 这里只做演示,直接把提取过程写在列表内返回更好 41 | item = [title, score, comment, price, taste, environment, service, address, phone] 42 | print(item) 43 | 44 | 45 | if __name__ == '__main__': 46 | # 随便拿一家店铺做个测试 47 | url = 'https://www.dianping.com/shop/H1XZuxIfuHl8meAJ' 48 | # 填写自己的cookie值 49 | headers = { 50 | 'Cookie': '填写自己的cookie值', 51 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.44', 52 | } 53 | # 下载字体 54 | GetFont(url, headers).run() 55 | # 用json格式生成字体映射 56 | save_fonts_dic() 57 | # 提取页面数据 58 | dd = DetailedData(url, headers) 59 | dd.run() 60 | -------------------------------------------------------------------------------- /douban_annual_list_spider/README.md: 
-------------------------------------------------------------------------------- 1 | ##
✨获取豆瓣年度电影榜单✨
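榜单页面需要 selenium 渲染,下文 run_spider.py 用无头 Chrome 并隐藏了 webdriver 指纹,这部分初始化可以单独抽成如下草图(与其做法一致,仅作演示):

```python
# 假设性草图:初始化无头 Chrome,并通过 CDP 注入脚本隐藏 navigator.webdriver
from selenium import webdriver


def make_driver():
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    options.add_experimental_option('useAutomationExtension', False)
    driver = webdriver.Chrome(options=options)
    driver.execute_cdp_cmd(
        'Page.addScriptToEvaluateOnNewDocument',
        {'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})'}
    )
    return driver
```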
2 | - 豆瓣2021电影榜单:[https://movie.douban.com/annual/2021?source=navigation](https://movie.douban.com/annual/2021?source=navigation) 3 | 4 | - 获取2021-2015豆瓣年度电影榜单,包含: 5 | - 电影标题 6 | - 电影链接 7 | - 电影评分 8 | - 电影影长 9 | - 电影类型 10 | - 制片地区 11 | - 该爬虫爬取的数据,全部通过`re正则`来解析的 12 | - 爬取的数据存储方式: 13 | - 使用`openpyxl`模块,将数据写入`榜单电影链接.xlsx`表格中 14 | - 该爬虫使用到的模块: 15 | - re 16 | - time 17 | - requests 18 | - openpyxl 19 | - selenium 20 | -------------------------------------------------------------------------------- /douban_annual_list_spider/run_spider.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/6/14 16:16 3 | # @Author : Torres-圣君 4 | # @File : download_fonts.py 5 | # @Sofaware : PyCharm 6 | import re 7 | import time 8 | import requests 9 | from openpyxl import Workbook 10 | from openpyxl import load_workbook 11 | from openpyxl.styles import Alignment 12 | from selenium import webdriver 13 | import ua_ip_pool 14 | 15 | 16 | class DoubanMovies: 17 | def __init__(self, links: list): 18 | options = webdriver.ChromeOptions() 19 | # 无头模式 20 | options.add_argument('--headless') 21 | options.add_argument('--disable-gpu') 22 | # 移除指纹 23 | options.add_experimental_option('excludeSwitches', ['enable-automation']) 24 | options.add_experimental_option('useAutomationExtension', False) 25 | self.driver = webdriver.Chrome(options=options) 26 | self.driver.execute_cdp_cmd( 27 | 'Page.addScriptToEvaluateOnNewDocument', 28 | { 29 | 'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})' 30 | } 31 | ) 32 | self.links = links 33 | self.headers = { 34 | "User-Agent": ua_ip_pool.get_ua() 35 | } 36 | self.proxies = ua_ip_pool.get_proxies() 37 | self.align = Alignment(horizontal='center', vertical='center', wrap_text=True) 38 | # 计数器 39 | self.count = 1 40 | 41 | def run(self): 42 | # 存放所有电影榜单信息 43 | for url in self.links: 44 | print("正在访问:", url) 45 | # 获取数据 46 | movie_data = self.get_move_links(url) 47 | # 展示数据 48 | print(movie_data) 49 | # 保存数据 50 | self.save_data(movie_data) 51 | # 关闭浏览器 52 | self.driver.close() 53 | 54 | def get_move_links(self, url): 55 | self.driver.get(url) 56 | # 等待页面加载动画 57 | time.sleep(10) 58 | # 返回加载后的源码 59 | page_source = self.driver.page_source 60 | # 提取热榜电影名称 61 | title_list = re.findall(r'title="(.*?)"', page_source) 62 | # 提取热榜电影链接 63 | url_list = re.findall(r'href="https://movie.douban.com/subject/(.*?)/', page_source) 64 | if len(url_list) == 0: 65 | url_list = re.findall(r'href="https://m.douban.com/movie/subject/(.*?)/', page_source) 66 | if ('2018' in url) or ('2017' in url): 67 | # 提取热榜电影名称 68 | special_title = re.findall('target="_blank">(.*?)', page_source) 69 | # 获取前18个名称,后将特殊字段插入列表 70 | titles = title_list[:18] 71 | titles.insert(0, special_title[0]) 72 | titles.insert(10, special_title[2]) 73 | movie_title = titles 74 | # 不打乱去重 75 | new_url_list = sorted(set(url_list), key=url_list.index) 76 | movie_link = [f'https://movie.douban.com/subject/{i}' for i in new_url_list] 77 | elif '2016' in url: 78 | movie_title = title_list[:20] 79 | # 不打乱去重 80 | new_url_list = sorted(set(url_list), key=url_list.index) 81 | movie_link = [f'https://movie.douban.com/subject/{i}' for i in new_url_list] 82 | elif '2015' in url: 83 | movie_title = title_list[:20] 84 | # 不打乱去重 85 | new_url_list = sorted(set(url_list[:23]), key=url_list[:23].index) 86 | movie_link = [f'https://movie.douban.com/subject/{i}' for i in new_url_list] 87 | else: 88 | movie_title = title_list[:20] 89 | movie_link = 
[f'https://movie.douban.com/subject/{i}' for i in url_list[:20]] 90 | # 获取影片详细信息 91 | pingfen_list, yingchang_list, leixing_list, diqu_list = self.send_request(movie_link) 92 | # 返回获取的数据 93 | links_data = [ 94 | movie_title, movie_link, pingfen_list, yingchang_list, leixing_list, diqu_list 95 | ] 96 | return links_data 97 | 98 | def send_request(self, movie_link): 99 | pingfen_list = [] 100 | yingchang_list = [] 101 | leixing_list = [] 102 | diqu_list = [] 103 | for url in movie_link: 104 | print(f"正在获取:'{url}'") 105 | while True: 106 | try: 107 | res = requests.get(url, headers=self.headers, proxies=self.proxies, timeout=2) 108 | if res.status_code == 200: 109 | res = res.text 110 | break 111 | except: 112 | print("代理超时,正在更换代理!") 113 | # 评分 114 | pingfen_list.append(re.findall('property="v:average">(.*?)<.strong>', res)[0]) 115 | # 影长 116 | yingchang_list.append(re.findall('property="v:runtime" content="(.*?)"', res)[0]) 117 | # 类型 118 | leixing_list.append('/'.join(re.findall('property="v:genre">(.*?)', res))) 119 | # 制片区 120 | diqu_list.append(re.findall('制片国家/地区:(.*?)
', res)[0].strip(' ')) 121 | return pingfen_list, yingchang_list, leixing_list, diqu_list 122 | 123 | def save_data(self, movie_data): 124 | # 首次写入时,创建表格并添加表头 125 | if self.count == 1: 126 | # 创建新的excel表格 127 | wb = Workbook() 128 | sheet = wb.create_sheet("sheet1", -1) 129 | # 设置列宽 130 | sheet.column_dimensions['A'].width = 30 131 | sheet.column_dimensions['B'].width = 50 132 | sheet.column_dimensions['C'].width = 10 133 | sheet.column_dimensions['D'].width = 10 134 | sheet.column_dimensions['E'].width = 20 135 | sheet.column_dimensions['F'].width = 25 136 | excel_title = ['电影名', '电影链接', '电影评分', '电影影长', '电影类型', '制片地区'] 137 | for x in range(len(excel_title)): 138 | sheet.cell(1, x+1).value = excel_title[x] 139 | # 居中对齐 140 | sheet.cell(1, 1).alignment = self.align 141 | self.count += 1 142 | # 后则读取并追加 143 | else: 144 | wb = load_workbook("榜单电影链接.xlsx") 145 | sheet = wb["sheet1"] 146 | for x in range(len(movie_data)): 147 | for y in range(len(movie_data[x])): 148 | sheet.cell(y + self.count, x + 1).value = movie_data[x][y] 149 | # 居中对齐 150 | sheet.cell(y + self.count, x + 1).alignment = self.align 151 | # 增加计数器 152 | self.count += 20 153 | # 保存该Excel表格 154 | wb.save("榜单电影链接.xlsx") 155 | 156 | 157 | if __name__ == '__main__': 158 | dm = DoubanMovies( 159 | [f"https://movie.douban.com/annual/20{i}?source=navigation" for i in range(21, 14, -1)] 160 | ) 161 | dm.run() 162 | -------------------------------------------------------------------------------- /douban_annual_list_spider/ua_ip_pool.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/4/23 11:59 3 | # @Author : Torres-圣君 4 | # @File : ua_ip_pool.py 5 | # @Sofaware : PyCharm 6 | import random 7 | 8 | 9 | def get_proxies(): 10 | proxies_list = [ 11 | { 12 | "https": "https://103.156.17.60:8888" 13 | } 14 | ] 15 | return random.choice(proxies_list) 16 | 17 | 18 | def get_ua(): 19 | user_agent_list = [ 20 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 21 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 22 | "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0", 23 | "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko", 24 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)", 25 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)", 26 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)", 27 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)", 28 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 29 | "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 30 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11", 31 | "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11", 32 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", 33 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)", 34 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)", 35 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)", 36 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)", 37 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 
2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)", 38 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)", 39 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)", 40 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)", 41 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", 42 | "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", 43 | "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", 44 | "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 45 | "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 46 | "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10", 47 | "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13", 48 | "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+", 49 | "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0", 50 | "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124", 51 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)", 52 | "UCWEB7.0.2.37/28/999", 53 | "NOKIA5700/ UCWEB7.0.2.37/28/999", 54 | "Openwave/ UCWEB7.0.2.37/28/999", 55 | "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999", 56 | "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25", 57 | ] 58 | # 设置UA伪装 59 | return random.choice(user_agent_list) 60 | -------------------------------------------------------------------------------- /douban_annual_list_spider/榜单电影链接.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/douban_annual_list_spider/榜单电影链接.xlsx -------------------------------------------------------------------------------- /douyin_video_spider/README.md: -------------------------------------------------------------------------------- 1 | ##
✨下载抖音作者发布的视频✨
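> The bullets below describe what the spider saves. As a generic, hypothetical illustration (the project's own `save_video` simply writes `requests.get(...).content` in one go), a large video file can also be streamed to disk chunk by chunk so it never has to fit in memory:

```python
import requests

def stream_video(video_url, out_path, headers=None):
    # Stream the response body instead of buffering the whole file in memory
    with requests.get(video_url, headers=headers, stream=True, timeout=30) as resp:
        resp.raise_for_status()
        with open(out_path, "wb") as fp:
            for chunk in resp.iter_content(chunk_size=64 * 1024):
                if chunk:
                    fp.write(chunk)
```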
2 | - 随便拿一位抖音作者当案例:[https://www.douyin.com/user/MS4wLjABAAAAkvysSgdqmkgtgucxkirpMWFHbTeZgVOW7zcdUjU3jM4](https://www.douyin.com/user/MS4wLjABAAAAkvysSgdqmkgtgucxkirpMWFHbTeZgVOW7zcdUjU3jM4) 3 | 4 | - 输入抖音的作者主页链接,即可自动下载保存其所有发布的视频 5 | - 保存形式为:以作者昵称为文件夹名,以视频标题为视频名称保存 6 | 7 | - 其中保存视频的同时,还会保存视频的相关信息(csv格式): 8 | - 视频链接 9 | - 视频标题 10 | - 视频点赞数 11 | - 视频评论数 12 | - 视频收藏数 13 | - 视频发布日期 14 | 15 | - 该爬虫使用到的模块: 16 | - re 17 | - os 18 | - time 19 | - requests 20 | - selenium 21 | -------------------------------------------------------------------------------- /douyin_video_spider/run_spider.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/6/28 14:50 3 | # @Author : Torres-圣君 4 | # @File : run_spider.py 5 | # @Software : PyCharm 6 | import re 7 | import os 8 | import time 9 | import requests 10 | from selenium import webdriver 11 | 12 | 13 | class DownloadVideo: 14 | def __init__(self, url): 15 | self.url = url 16 | self.headers = { 17 | # 记得补充一下自己的cookie值 18 | "cookie": "", 19 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36 Edg/103.0.1264.37" 20 | } 21 | # 使用无头模式 22 | options = webdriver.ChromeOptions() 23 | options.add_argument('--headless') 24 | options.add_argument('-–disable-gpu') 25 | self.driver = webdriver.Chrome(options=options) 26 | 27 | def run(self): 28 | self.driver.get(self.url) 29 | # 将滚轮滑到最底部,从而加载所有视频 30 | # self.move_pulley() 31 | # 作者昵称 32 | author_name = self.driver.find_elements_by_xpath('//span[@class="Nu66P_ba"]')[0].text 33 | print(author_name) 34 | # 以作者名称创建文件夹 35 | try: 36 | os.mkdir(author_name) 37 | except FileExistsError: 38 | print(f"{author_name} 文件夹已存在!") 39 | li_list = self.driver.find_elements_by_xpath('//li[@class="ECMy_Zdt"]') 40 | for li in li_list: 41 | video_link = li.find_element_by_xpath('./a').get_attribute('href') 42 | print("正在保存 --- ", video_link) 43 | item, video_url = self.get_video_url(video_link) 44 | self.save_video(item, video_url, author_name) 45 | time.sleep(3) 46 | break 47 | 48 | def move_pulley(self): 49 | temp_height = 0 50 | while True: 51 | # 循环将滚动条下拉 52 | self.driver.execute_script("window.scrollBy(0,500)") 53 | # sleep一下让滚动条反应一下 54 | time.sleep(1) 55 | # 获取当前滚动条距离顶部的距离 56 | check_height = self.driver.execute_script( 57 | "return document.documentElement.scrollTop || window.pageYOffset || document.body.scrollTop;") 58 | # 如果两者相等说明到底了 59 | if check_height == temp_height: 60 | break 61 | temp_height = check_height 62 | 63 | def get_video_url(self, video_link): 64 | res = requests.get(video_link, headers=self.headers).text 65 | # 对返回的源码进行url解码 66 | unquote_res = requests.utils.unquote(res) 67 | # 原视频链接,用于下载保存 68 | video_url = "https:" + re.findall('"src":"(.*?)"},', unquote_res)[0] 69 | # 获取视频其他信息 70 | item = self.get_other_data(unquote_res, video_link) 71 | return item, video_url 72 | 73 | def get_other_data(self, unquote_res, video_link): 74 | other_data = re.findall('(.*?)', unquote_res) 75 | item = [ 76 | # 视频链接 77 | video_link, 78 | # 视频标题 79 | re.findall('(.*?)', unquote_res)[0], 80 | # 视频点赞数 81 | other_data[0], 82 | # 视频评论数 83 | other_data[1], 84 | # 视频收藏数 85 | other_data[2], 86 | # 视频的发布日期 87 | re.findall('(.*?)', unquote_res)[0].split('>')[-1] 88 | ] 89 | return item 90 | 91 | def save_video(self, item, video_url, author_name): 92 | with open(f'{author_name}/{item[1]}.mp4', 'wb') as w: 93 | res = requests.get(video_url).content 94 | w.write(res) 95 | print(item[1], " --- 保存完成!") 
96 | with open(f'{author_name}/{author_name}_所有视频信息.csv', 'a+') as a: 97 | if a.read() == '': 98 | a.write(f'视频链接,视频标题,点赞数,评论数,收藏数,发布日期\n') 99 | a.write(','.join(item) + '\n') 100 | 101 | 102 | if __name__ == '__main__': 103 | dv = DownloadVideo('https://www.douyin.com/user/MS4wLjABAAAAkvysSgdqmkgtgucxkirpMWFHbTeZgVOW7zcdUjU3jM4') 104 | dv.run() 105 | -------------------------------------------------------------------------------- /fangtianxia_spider/README.md: -------------------------------------------------------------------------------- 1 | ##
✨爬取房天下全部的楼盘数据✨
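> As a quick reference for the storage format listed below, this condensed sketch mirrors what `save_data` in `run_spider.py` does: each scraped batch is appended to the JSON file under a timestamp key (the real code derives the city name in the file path from the page itself; the path here is just an example):

```python
import json
import time

def append_batch(records, path="./data/郑州楼盘_数据.json"):
    # Append this batch under a "YYYY-MM-DD HH:MM" key, matching the existing file layout
    stamp = time.strftime("%Y-%m-%d %H:%M", time.localtime())
    with open(path, "a", encoding="utf-8") as fp:
        fp.write(f'"{stamp}": ' + json.dumps(records, indent=1, ensure_ascii=False) + ",\n")
```

> Note that the resulting file is a sequence of such timestamped blocks rather than a single valid JSON document, so it needs light post-processing before it can be loaded with `json.load`.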
2 | - 郑州房天下官网:[https://zz.newhouse.fang.com/house/s/](https://zz.newhouse.fang.com/house/s/) 3 | 4 | - 爬取房天下全部的楼盘数据,包含: 5 | - 楼盘标签 6 | - 楼盘面积 7 | - 楼盘价格(平方米) 8 | - 楼盘的网页链接 9 | - 楼盘所在地址 10 | - 楼盘评论数 11 | - 爬取的数据存储方式: 12 | - 通过a追加内容模式,将爬取的数据存储到`data/`文件夹下的json文件 13 | - 该爬虫使用到的模块: 14 | - requests 15 | - time 16 | - json 17 | - lxml 18 | - re 19 | -------------------------------------------------------------------------------- /fangtianxia_spider/data/郑州楼盘_数据.json: -------------------------------------------------------------------------------- 1 | "2022-04-26 22:15": [ 2 | { 3 | "title": "万科·民安 理想星光", 4 | "area": "—76~100平米", 5 | "price": "14000元/㎡", 6 | "link": "http://zz.newhouse.fang.com/loupan/2510152991.htm", 7 | "address": "惠济北三环南阳路交会处向西约300米", 8 | "comment": "44条评论" 9 | }, 10 | { 11 | "title": "大溪地", 12 | "area": "—79~700平米", 13 | "price": "8200元/㎡", 14 | "link": "http://zz.newhouse.fang.com/loupan/2510665175.htm", 15 | "address": "[四至五环]荥阳中原西路与商隐路交汇处", 16 | "comment": "647条评论" 17 | }, 18 | { 19 | "title": "华瑞紫韵城", 20 | "area": "—101~143平米", 21 | "price": "13500元/㎡", 22 | "link": "http://zz.newhouse.fang.com/loupan/2510819101.htm", 23 | "address": "[三至四环]中原建设西路与长椿路交叉口南北两侧", 24 | "comment": "303条评论" 25 | }, 26 | { 27 | "title": "旭辉·一江雲著", 28 | "area": "—98~143平米", 29 | "price": "16000元/㎡", 30 | "link": "http://zz.newhouse.fang.com/loupan/2510152771.htm", 31 | "address": "惠济滨河路与清华园路交叉口向东200米", 32 | "comment": "45条评论" 33 | }, 34 | { 35 | "title": "中建·澜溪苑", 36 | "area": "—79~118平米", 37 | "price": "10000元/㎡", 38 | "link": "http://zz.newhouse.fang.com/loupan/2510149437.htm", 39 | "address": "经开前程大道与浔江东路交会处向东200米", 40 | "comment": "109条评论" 41 | }, 42 | { 43 | "title": "万科·新田 湖与城", 44 | "area": "—78~144平米", 45 | "price": "7200元/㎡", 46 | "link": "http://zz.newhouse.fang.com/loupan/2510148935.htm", 47 | "address": "[四至五环]荥阳五龙路与博学路交会处·未来生活体验场", 48 | "comment": "133条评论" 49 | }, 50 | { 51 | "title": "保利·天汇", 52 | "area": "—98~141平米", 53 | "price": "19500元/㎡", 54 | "link": "http://zz.newhouse.fang.com/loupan/2510149275.htm", 55 | "address": "[二至三环]经开中州大道·航海路·中原福塔北300米", 56 | "comment": "165条评论" 57 | }, 58 | { 59 | "title": "美盛教育港湾", 60 | "area": "—89~144平米", 61 | "price": "19500元/㎡", 62 | "link": "http://zz.newhouse.fang.com/loupan/2510148595.htm", 63 | "address": "[三至四环]金水文化路与国基路交汇处", 64 | "comment": "167条评论" 65 | }, 66 | { 67 | "title": "富田城·九鼎公馆", 68 | "area": "—76~142平米", 69 | "price": "14300元/㎡起", 70 | "link": "http://zz.newhouse.fang.com/loupan/2510148083.htm", 71 | "address": "[三至四环]管城南三环金岱路(郑尉路)交会处", 72 | "comment": "194条评论" 73 | }, 74 | { 75 | "title": "金沙湖高尔夫观邸", 76 | "area": "—70~398平米", 77 | "price": "15500元/㎡起", 78 | "link": "http://zz.newhouse.fang.com/loupan/2510726519.htm", 79 | "address": "[三至四环]经开南三环与第五大街下桥口南500米", 80 | "comment": "533条评论" 81 | }, 82 | { 83 | "title": "美盛·金水印", 84 | "area": "—108~165平米", 85 | "price": "23500元/㎡", 86 | "link": "http://zz.newhouse.fang.com/loupan/2510152929.htm", 87 | "address": "[三至四环]金水渠东路与北三环交汇处", 88 | "comment": "37条评论" 89 | }, 90 | { 91 | "title": "保利和光屿湖", 92 | "area": "—90~140平米", 93 | "price": "12500元/㎡", 94 | "link": "http://zz.newhouse.fang.com/loupan/2510149583.htm", 95 | "address": "高新双湖科技城创新大道与青梅街交叉口向东100米路北", 96 | "comment": "70条评论" 97 | }, 98 | { 99 | "title": "新城时光印象", 100 | "area": "—98~143平米", 101 | "price": "15800元/㎡", 102 | "link": "http://zz.newhouse.fang.com/loupan/2510152741.htm", 103 | "address": "[三至四环]管城南三环文治路南500米", 104 | "comment": "89条评论" 105 | }, 106 | { 107 | "title": "远洋沁园", 108 | "area": "—82~138平米", 109 | "price": 
"7200元/㎡", 110 | "link": "http://zz.newhouse.fang.com/loupan/2510149009.htm", 111 | "address": "[五环以外]荥阳洞林湖与五龙路交汇处", 112 | "comment": "90条评论" 113 | }, 114 | { 115 | "title": "碧桂园天玺湾", 116 | "area": "—76~143平米", 117 | "price": "17000元/㎡", 118 | "link": "http://zz.newhouse.fang.com/loupan/2510152717.htm", 119 | "address": "金水杨金路与博学路东南角", 120 | "comment": "82条评论" 121 | }, 122 | { 123 | "title": "绿都·东澜岸", 124 | "area": "—89~140平米", 125 | "price": "9800元/㎡", 126 | "link": "http://zz.newhouse.fang.com/loupan/2510148633.htm", 127 | "address": "[四至五环]经开南三环与龙飞北街交会处向北500米", 128 | "comment": "261条评论" 129 | }, 130 | { 131 | "title": "锦艺四季城", 132 | "area": "—92~96平米", 133 | "price": "5300元/㎡", 134 | "link": "http://zz.newhouse.fang.com/loupan/2510815251.htm", 135 | "address": "[三至四环]惠济京广快速路与天河路交接处北500米路东", 136 | "comment": "251条评论" 137 | }, 138 | { 139 | "title": "兴港永威南樾", 140 | "area": "—87~141平米", 141 | "price": "8500元/㎡起", 142 | "link": "http://zz.newhouse.fang.com/loupan/2510785327.htm", 143 | "address": "[五环以外]航空港区桥航路与凌空街交会处东南角", 144 | "comment": "844条评论" 145 | }, 146 | { 147 | "title": "康桥山海云图", 148 | "area": "—89~121平米", 149 | "price": "11000元/㎡起", 150 | "link": "http://zz.newhouse.fang.com/loupan/2510152725.htm", 151 | "address": "高新长椿路开元路·河工大北侧", 152 | "comment": "139条评论" 153 | } 154 | ], 155 | "2022-04-26 22:15": [ 156 | { 157 | "title": "雅宝龙湖·天钜", 158 | "area": "—89~149平米", 159 | "price": "18000元/㎡起", 160 | "link": "http://zz.newhouse.fang.com/loupan/2510149563.htm", 161 | "address": "[一至二环]管城航海路与城东南路交汇处向北800米", 162 | "comment": "108条评论" 163 | }, 164 | { 165 | "title": "腾威城", 166 | "area": "—86~154平米", 167 | "price": "15500元/㎡起", 168 | "link": "http://zz.newhouse.fang.com/loupan/2510148421.htm", 169 | "address": "[一至二环]金水郑汴路与英协路交叉口向南300米", 170 | "comment": "98条评论" 171 | }, 172 | { 173 | "title": "郑地·美景 紫华城", 174 | "area": "—98~128平米", 175 | "price": "12000元/㎡", 176 | "link": "http://zz.newhouse.fang.com/loupan/2510149621.htm", 177 | "address": "郑东新区中原大道与凤栖街交汇处南/地铁八号线龙王庙站南800米", 178 | "comment": "47条评论" 179 | }, 180 | { 181 | "title": "华润置地新时代广场", 182 | "area": "—29~49平米", 183 | "price": "12500元/㎡", 184 | "link": "http://zz.newhouse.fang.com/loupan/2510148343.htm", 185 | "address": "[三至四环]郑东新区商鼎路与博学路交汇处", 186 | "comment": "100条评论" 187 | }, 188 | { 189 | "title": "保利云上", 190 | "area": "—96~129平米", 191 | "price": "11000元/㎡起", 192 | "link": "http://zz.newhouse.fang.com/loupan/2510152767.htm", 193 | "address": "二七郑密路双铁路交会处向西1300米路南", 194 | "comment": "108条评论" 195 | }, 196 | { 197 | "title": "金地西湖春晓", 198 | "area": "—94~122平米", 199 | "price": "11800元/㎡", 200 | "link": "http://zz.newhouse.fang.com/loupan/2510149497.htm", 201 | "address": "[四至五环]中原中原路与杭州路交会处西南", 202 | "comment": "150条评论" 203 | }, 204 | { 205 | "title": "中建·滨水苑", 206 | "area": "—94~141平米", 207 | "price": "11000元/㎡", 208 | "link": "http://zz.newhouse.fang.com/loupan/2510148653.htm", 209 | "address": "[五环以外]航空港区桥航路与凌空街交汇处向北100米", 210 | "comment": "105条评论" 211 | }, 212 | { 213 | "title": "美盛象湖100", 214 | "area": "—27~33平米", 215 | "price": "8700元/㎡起", 216 | "link": "http://zz.newhouse.fang.com/loupan/2510148389.htm", 217 | "address": "郑东新区金水东路与凤栖街交汇处", 218 | "comment": "94条评论" 219 | }, 220 | { 221 | "title": "蓝城·凤起梧桐", 222 | "area": "—137~155平米", 223 | "price": "19000元/㎡", 224 | "link": "http://zz.newhouse.fang.com/loupan/2510148187.htm", 225 | "address": "[四至五环]金水中州大道与杨金路交汇处东360米", 226 | "comment": "109条评论" 227 | }, 228 | { 229 | "title": "融创空港宸院", 230 | "area": "—89~138平米", 231 | "price": "9500元/㎡起", 232 | "link": 
"http://zz.newhouse.fang.com/loupan/2510148561.htm", 233 | "address": "[五环以外]航空港区长安路鄱阳湖路交汇处", 234 | "comment": "164条评论" 235 | }, 236 | { 237 | "title": "兴港和昌·凌云筑", 238 | "area": "—95~115平米", 239 | "price": "10500元/㎡", 240 | "link": "http://zz.newhouse.fang.com/loupan/2510149227.htm", 241 | "address": "[四至五环]经开朗星路以南、龙善街以西、美辰路以北、龙真街以东", 242 | "comment": "186条评论" 243 | }, 244 | { 245 | "title": "远洋臻园", 246 | "area": "—89~119平米", 247 | "price": "11800元/㎡起", 248 | "link": "http://zz.newhouse.fang.com/loupan/2510152775.htm", 249 | "address": "[四至五环]二七大学南路与芦庄路交汇处向西600米", 250 | "comment": "54条评论" 251 | }, 252 | { 253 | "title": "新城郡望府", 254 | "area": "—89~125平米", 255 | "price": "6200元/㎡", 256 | "link": "http://zz.newhouse.fang.com/loupan/2510148183.htm", 257 | "address": "[四至五环]荥阳郑上路与飞龙路交汇处东南侧", 258 | "comment": "128条评论" 259 | }, 260 | { 261 | "title": "正商美誉铭筑", 262 | "area": "—36~132平米", 263 | "price": "7600元/㎡", 264 | "link": "http://zz.newhouse.fang.com/loupan/2510780543.htm", 265 | "address": "[三至四环]管城南三环郑新快速路往南300米路东", 266 | "comment": "368条评论" 267 | }, 268 | { 269 | "title": "融侨悦城", 270 | "area": "—89~138平米", 271 | "price": "10400元/㎡", 272 | "link": "http://zz.newhouse.fang.com/loupan/2510148379.htm", 273 | "address": "[三至四环]中原航海西路(郑少高速连接线)与西四环交会处", 274 | "comment": "86条评论" 275 | }, 276 | { 277 | "title": "世茂振兴璀璨熙湖", 278 | "area": "—106~142平米", 279 | "price": "14000元/㎡起", 280 | "link": "http://zz.newhouse.fang.com/loupan/2510148381.htm", 281 | "address": "[三至四环]中原陇海西路与汇智路交叉口往南800米", 282 | "comment": "105条评论" 283 | }, 284 | { 285 | "title": "保利金茂时光悦", 286 | "area": "—71~130平米", 287 | "price": "10500元/㎡", 288 | "link": "http://zz.newhouse.fang.com/loupan/2510149119.htm", 289 | "address": "[四至五环]经开浔江东路与蓝湖街交叉口向北300米", 290 | "comment": "189条评论" 291 | }, 292 | { 293 | "title": "坤达江山筑", 294 | "area": "—89~147平米", 295 | "price": "7300元/㎡", 296 | "link": "http://zz.newhouse.fang.com/loupan/2510148583.htm", 297 | "address": "[四至五环]新郑紫荆山南路和新老107连接线交汇处", 298 | "comment": "149条评论" 299 | }, 300 | { 301 | "title": "融创森屿墅", 302 | "area": "—143~165平米", 303 | "price": "8000元/㎡起", 304 | "link": "http://zz.newhouse.fang.com/loupan/2510148961.htm", 305 | "address": "[五环以外]荥阳滨河路与陇海快速路交汇处", 306 | "comment": "71条评论" 307 | }, 308 | { 309 | "title": "奥园·悦城", 310 | "area": "—89~112平米", 311 | "price": "5800元/㎡", 312 | "link": "http://zz.newhouse.fang.com/loupan/2510149491.htm", 313 | "address": "新郑大学南路与g107连接线东北角", 314 | "comment": "51条评论" 315 | } 316 | ], 317 | -------------------------------------------------------------------------------- /fangtianxia_spider/run_spider.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/4/26 18:38 3 | # @Author : Torres-圣君 4 | # @File : douban_run_spider.py 5 | # @Sofaware : PyCharm 6 | # https://zz.newhouse.fang.com/house/s/ 7 | import requests 8 | from lxml import etree 9 | import re 10 | import json 11 | import time 12 | from user_agent import get_ua 13 | 14 | 15 | def run(): 16 | # 获取总页数 17 | page_number = get_page_number(first_url) 18 | for i in range(2, page_number+1): 19 | time.sleep(1) 20 | url = f"{first_url}b9{str(i)}/" 21 | parse_page(url) 22 | print(f"第<{i}>页数据保存完毕!") 23 | 24 | 25 | def get_page_number(url): 26 | global city_name 27 | res = requests.get(url, headers=headers) 28 | html = etree.HTML(res.text) 29 | # 城市名称 30 | city_name = html.xpath('//ul[@class="tf f12"]/li[2]/a/text()')[0] 31 | # 提取页码 32 | page_number = html.xpath('//div[@class="otherpage"]/span[2]/text()')[0] 33 | return 
int(re.findall(r"(\d+)", page_number)[0]) 34 | 35 | 36 | def parse_page(url): 37 | res = requests.get(url, headers=headers) 38 | html = etree.HTML(res.text) 39 | div_list = html.xpath('//*[@id="newhouse_loupan_list"]/ul/li/div[1]/div[2]') 40 | # print(len(div_list)) 41 | all_data_list = [] 42 | for div in div_list: 43 | try: 44 | item = dict() 45 | item["title"] = div.xpath('./div[1]/div[1]/a/text()')[0].strip(" \t\n") 46 | item["area"] = div.xpath('./div[2]//text()')[-1].strip(" \t\n") 47 | item["price"] = div.xpath('./div[5]/span/text()')[0].strip(" \t\n") 48 | try: 49 | item["price"] = item["price"] + div.xpath('./div[5]/em/text()')[0].strip(" \t\n") 50 | except IndexError: 51 | pass 52 | item["link"] = div.xpath('./div[1]/div[1]/a/@href')[0].strip(" \t\n") 53 | item["address"] = div.xpath('./div[3]/div/a/@title')[0].strip(" \t\n") 54 | item["comment"] = div.xpath('./div[1]/div[2]/a/span/text()')[0].strip(" ()\t\n") 55 | # 展示数据 56 | print(item) 57 | all_data_list.append(item) 58 | except IndexError: 59 | pass 60 | save_data(all_data_list) 61 | 62 | 63 | def save_data(item): 64 | if len(item) != 0: 65 | date = time.localtime() 66 | now_date = time.strftime("%Y-%m-%d %H:%M", date) 67 | data = json.dumps(item, indent=1, ensure_ascii=False) 68 | with open(f"./data/{city_name}_数据.json", "a", encoding="utf-8") as w: 69 | w.write(f'"{now_date}": ' + data + ",\n") 70 | 71 | 72 | if __name__ == '__main__': 73 | # 房价首页链接 74 | first_url = "https://zz.newhouse.fang.com/house/s/" 75 | # 城市名称 76 | city_name = "" 77 | headers = { 78 | "user-agent": get_ua(), 79 | } 80 | run() 81 | -------------------------------------------------------------------------------- /fangtianxia_spider/ua_pool.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/4/16 21:13 3 | # @Author : Torres-圣君 4 | # @File : ua_pool.py 5 | # @Sofaware : PyCharm 6 | import random 7 | 8 | 9 | def get_user_agent(): 10 | # UA池 11 | user_agent_list = [ 12 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 13 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 14 | "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0", 15 | "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko", 16 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)", 17 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)", 18 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)", 19 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)", 20 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 21 | "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 22 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11", 23 | "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11", 24 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", 25 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)", 26 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)", 27 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)", 28 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)", 29 | "Mozilla/4.0 (compatible; MSIE 7.0; 
Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)", 30 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)", 31 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)", 32 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)", 33 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", 34 | "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", 35 | "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", 36 | "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 37 | "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 38 | "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10", 39 | "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13", 40 | "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+", 41 | "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0", 42 | "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124", 43 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)", 44 | "UCWEB7.0.2.37/28/999", 45 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36 Edg/100.0.1185.39", 46 | "NOKIA5700/ UCWEB7.0.2.37/28/999", 47 | "Openwave/ UCWEB7.0.2.37/28/999", 48 | "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999", 49 | "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25", 50 | ] 51 | # 设置UA伪装 52 | return random.choice(user_agent_list) -------------------------------------------------------------------------------- /gupiao_rank_spider/README.md: -------------------------------------------------------------------------------- 1 | ##
✨获取东方财富个股人气榜✨
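> Before the notes below, here is a small, self-contained round-trip check (assuming `pycryptodome` is installed) that uses the key and IV hard-coded in `run_spider.py` together with the project's `AES_Decrypt` helper; the sample payload is made up, the point is only to illustrate the base64 + AES-CBC flow the API data goes through:

```python
import base64
from Crypto.Cipher import AES
from decryption_AES import AES_Decrypt, pad

key = "ae13e0ad97cdd6e12408ac5063d88721"   # key recovered from the site's JS
vi = "getClassFromFile"                    # CBC initialisation vector

# Encrypt a sample payload the same way the server would, then decrypt it back.
sample = '{"demo": "data"}'
cipher = AES.new(key.encode("utf8"), AES.MODE_CBC, vi.encode("utf8"))
token = base64.encodebytes(cipher.encrypt(pad(sample).encode("utf8")))

print(AES_Decrypt(key, vi, token.decode("utf8")))  # -> {"demo": "data"}
```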
2 | - 东方财富个股人气榜官网:[http://guba.eastmoney.com/rank/](http://guba.eastmoney.com/rank/) 3 | 4 | - 输入日期、出发地、目的地获取火车票信息,包含: 5 | - 当前排名 6 | - 排名较昨日变动 7 | - 股票代码 8 | - 股票名称 9 | - 最新价 10 | - 涨跌额 11 | - 涨跌幅 12 | - 最高价 13 | - 最低价 14 | 15 | ```python 16 | 通过抓包获取到接口后,发现接口数据为aes加密数据 17 | 这里通过拿到关键参数,利用js还原加密的密钥和偏移量 18 | 19 | 使用Python的第三方模块'Crypto',对AES的CBC模式进行解密 20 | 通过解密后的数据,获取每个不同股票对应的代码 21 | 再通过对不同代码进行分析和修改,最终构建完整的params 22 | 最后携带上params参数对链接发送请求,后提取关键数据,将其存储到data目录下 23 | ``` 24 | 25 | - 该爬虫使用到的模块: 26 | - requests 27 | - time 28 | - json 29 | - openpyxl 30 | - Crypto 31 | - base64 32 | -------------------------------------------------------------------------------- /gupiao_rank_spider/data/A股市场_人气榜.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/gupiao_rank_spider/data/A股市场_人气榜.xlsx -------------------------------------------------------------------------------- /gupiao_rank_spider/data/港股市场_人气榜.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/gupiao_rank_spider/data/港股市场_人气榜.xlsx -------------------------------------------------------------------------------- /gupiao_rank_spider/data/美股市场_人气榜.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/gupiao_rank_spider/data/美股市场_人气榜.xlsx -------------------------------------------------------------------------------- /gupiao_rank_spider/decryption_AES.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/6/23 9:58 3 | # @Author : Torres-圣君 4 | # @File : decryption_AES.py 5 | # @Software : PyCharm 6 | from Crypto.Cipher import AES 7 | import base64 8 | 9 | BLOCK_SIZE = 16 # Bytes 10 | pad = lambda s: s + (BLOCK_SIZE - len(s) % BLOCK_SIZE) * chr(BLOCK_SIZE - len(s) % BLOCK_SIZE) 11 | unpad = lambda s: s[:-ord(s[len(s) - 1:])] 12 | 13 | 14 | # 密钥(key), 偏移量(iv) CBC模式加密 15 | 16 | def AES_Decrypt(key, vi, data): 17 | data = data.encode('utf8') 18 | encodebytes = base64.decodebytes(data) 19 | # 将加密数据转换位bytes类型数据 20 | cipher = AES.new(key.encode('utf8'), AES.MODE_CBC, vi.encode('utf8')) 21 | text_decrypted = cipher.decrypt(encodebytes) 22 | # 去补位 23 | text_decrypted = unpad(text_decrypted) 24 | text_decrypted = text_decrypted.decode('utf8') 25 | return text_decrypted 26 | -------------------------------------------------------------------------------- /gupiao_rank_spider/get_message.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/6/23 15:19 3 | # @Author : Torres-圣君 4 | # @File : get_message.py 5 | # @Software : PyCharm 6 | import requests 7 | 8 | 9 | class OtherData: 10 | def __init__(self, headers): 11 | self.url = "https://push2.eastmoney.com/api/qt/ulist.np/get" 12 | self.headers = headers 13 | 14 | def join_type1_params(self, page_data): 15 | secids_list = [] 16 | for i in page_data: 17 | # HK_开头的为港股市场,代码前加116. 18 | if i[2].startswith('HK_'): 19 | i[2] = i[2].replace("HK_", "") 20 | secids_list.append(f'116.{i[2]}') 21 | # NASDAQ_开头的为美股市场,代码前加105. 22 | elif i[2].startswith('NASDAQ_'): 23 | i[2] = i[2].replace("NASDAQ_", "") 24 | secids_list.append(f'105.{i[2]}') 25 | # NYSE_开头的为美股市场,代码前加106. 
26 | elif i[2].startswith('NYSE_'): 27 | i[2] = i[2].replace("NYSE_", "") 28 | secids_list.append(f'106.{i[2]}') 29 | # AMEX_开头的为美股市场,代码前加107. 30 | elif i[2].startswith('AMEX_'): 31 | i[2] = i[2].replace("AMEX_", "") 32 | secids_list.append(f'107.{i[2]}') 33 | # 数字6开头的为A股市场,代码前加1. 34 | elif i[2].startswith('6'): 35 | secids_list.append(f'1.{i[2]}') 36 | else: 37 | secids_list.append(f'0.{i[2]}') 38 | params = { 39 | "fltt": 2, 40 | "np": 3, 41 | "ut": "a79f54e3d4c8d44e494efb8f748db291", 42 | "invt": 2, 43 | "secids": ",".join(secids_list), 44 | "fields": "f1,f2,f3,f4,f12,f13,f14,f152,f15,f16", 45 | } 46 | print(params) 47 | return params 48 | 49 | def get_response(self, page_data): 50 | params = self.join_type1_params(page_data) 51 | res = requests.get(self.url, headers=self.headers, params=params).json() 52 | page_other_data = [] 53 | for data in res['data']['diff']: 54 | item = [ 55 | data['f14'], 56 | data['f2'], 57 | data['f4'], 58 | str(data['f3'])+'%', 59 | data['f15'], 60 | data['f16'] 61 | ] 62 | page_other_data.append(item) 63 | print(item) 64 | # 拼接完整的股票数据并返回 65 | page_all_data = [page_data[i]+page_other_data[i] for i in range(len(page_data))] 66 | return page_all_data 67 | -------------------------------------------------------------------------------- /gupiao_rank_spider/run_spider.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/6/23 9:53 3 | # @Author : Torres-圣君 4 | # @File : download_fonts.py 5 | # @Software : PyCharm 6 | import time 7 | import requests 8 | import json 9 | from decryption_AES import AES_Decrypt 10 | from get_message import OtherData 11 | from openpyxl import Workbook 12 | from openpyxl import load_workbook 13 | 14 | 15 | class GetAESData: 16 | def __init__(self): 17 | self.url = 'http://gbcdn.dfcfw.com/rank/popularityList.js' 18 | self.headers = { 19 | "Referer": "http://guba.eastmoney.com/", 20 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.44" 21 | } 22 | self.gt = OtherData(self.headers) 23 | self.count = 1 24 | 25 | def run(self): 26 | types_list = ['A股市场', '港股市场', '美股市场'] 27 | for types in range(0, 3): 28 | # 循环获取前五页top100 29 | for page in range(1, 6): 30 | time.sleep(1) 31 | print(f"正在获取第{page}页数据!") 32 | # 构建请求参数 33 | params = self.build_params(types, page) 34 | # 解密数据 35 | decrypt_data = self.get_response(params) 36 | # 获取页面数据 37 | page_all_data = self.format_data(decrypt_data) 38 | # 保存页面数据 39 | self.save_data(types_list[types], page_all_data) 40 | print(f"第{page}页数据保存完成!") 41 | # 计数器归1 42 | self.count = 1 43 | 44 | def build_params(self, types, page): 45 | """ 46 | type: 0 47 | sort: 0 48 | page: 1 49 | v: 2022_6_23_9_56 50 | """ 51 | t = time.localtime() 52 | time_list = time.strftime("%Y_%m_%d_%H_%M", t).split('_') 53 | now = '_'.join([i[-1] if i.startswith('0') else i for i in time_list]) 54 | params = { 55 | "type": types, 56 | "sort": 0, 57 | "page": page, 58 | "v": now 59 | } 60 | print(params) 61 | return params 62 | 63 | def get_response(self, params): 64 | res = requests.get(self.url, headers=self.headers, params=params).text 65 | # 加密数据 66 | aes_data = res.split("'")[1] 67 | # 密钥 68 | key = 'ae13e0ad97cdd6e12408ac5063d88721' 69 | # 偏移量 70 | vi = 'getClassFromFile' 71 | # 使用AES解密 72 | decrypt_data = AES_Decrypt(key, vi, aes_data) 73 | return decrypt_data 74 | 75 | def format_data(self, decrypt_data): 76 | json_data = json.loads(decrypt_data) 77 | page_data = [] 
78 | for everyone in json_data: 79 | item = [ 80 | everyone['rankNumber'], 81 | everyone['changeNumber'], 82 | everyone['code'] 83 | ] 84 | page_data.append(item) 85 | print(item) 86 | page_all_data = self.gt.get_response(page_data) 87 | return page_all_data 88 | 89 | def save_data(self, title, page_all_data): 90 | # 首次保存需创建表格,并写入表头信息 91 | if self.count == 1: 92 | wb = Workbook() 93 | # 创建新的工作蒲 94 | sheet = wb.create_sheet('sheet1', -1) 95 | # 添加表头信息 96 | data_header = ['当前排名', '排名较昨日变动', '股票代码', '股票名称', '最新价', '涨跌额', '涨跌幅', '最高价', '最低价'] 97 | page_all_data.insert(0, data_header) 98 | else: 99 | # 读取已有的工作蒲 100 | wb = load_workbook(f'./data/{title}_人气榜.xlsx') 101 | sheet = wb["sheet1"] 102 | for x in range(len(page_all_data)): 103 | for y in range(len(page_all_data[x])): 104 | sheet.cell(x + self.count, y + 1).value = page_all_data[x][y] 105 | # 保存表格并追加计数 106 | wb.save(f'./data/{title}_人气榜.xlsx') 107 | self.count += len(page_all_data) 108 | 109 | 110 | if __name__ == '__main__': 111 | aes = GetAESData() 112 | aes.run() 113 | -------------------------------------------------------------------------------- /gupiao_rank_spider/获取密钥和偏移量.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Title 6 | 7 | 8 | 9 | 10 | 16 | 49 | -------------------------------------------------------------------------------- /huya_all_types_spider/README.md: -------------------------------------------------------------------------------- 1 | ##
✨获取虎牙直播平台在播用户✨
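> As a minimal illustration of the interface described below, this sketch requests a single page from the same JSONP endpoint that `get_types_user_msg.py` uses and strips the callback wrapper. The `gameId` value here is only an example; real IDs come from `data/all_types_msg.json`, and in practice a User-Agent/Referer header like the spider's is advisable:

```python
import json
import re
import requests

params = {
    "m": "LiveList",
    "do": "getLiveListByPage",
    "gameId": 1,          # example category id
    "tagAll": 0,
    "callback": "getLiveListJsonpCallback",
    "page": 1,
}
res = requests.get("https://www.huya.com/cache.php", params=params, timeout=10).text
# The response is JSONP wrapped as getLiveListJsonpCallback({...}); strip the wrapper first
payload = json.loads(re.findall(r"getLiveListJsonpCallback\((.*)\)", res)[0])
for room in payload["data"]["datas"]:
    print(room["nick"], room["profileRoom"], room["roomName"])
```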
2 | - 虎牙直播全分类网站:[https://www.huya.com/g](https://www.huya.com/g) 3 | 4 | - 输入指定岗位,抓取该岗位的所有招聘信息,包含: 5 | - 主播头像链接 6 | - 主播昵称 7 | - 房间ID号 8 | - 房间标题 9 | - 房间标签 10 | - 直播链接 11 | - 爬取的数据存储方式: 12 | - 文件`all_types_msg.json`存放了虎牙平台实时分类的信息,包含分类的名称、分类的链接、分类的ID 13 | - 通过使用`openpyxl`模块,将爬取的数据存储到`data`文件夹下的`xxx_直播用户信息.xlsx`表格 14 | - 关于主播头像,本打算直接在Excel中插入图片,但考虑速度问题,暂以链接形式填充 15 | - 该爬虫使用到的模块: 16 | - threading 17 | - requests 18 | - json 19 | - time 20 | - lxml 21 | - openpyxl 22 | - random 23 | -------------------------------------------------------------------------------- /huya_all_types_spider/data/Apex英雄_直播用户信息.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/huya_all_types_spider/data/Apex英雄_直播用户信息.xlsx -------------------------------------------------------------------------------- /huya_all_types_spider/data/all_types_msg.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/huya_all_types_spider/data/all_types_msg.json -------------------------------------------------------------------------------- /huya_all_types_spider/get_proxyz.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/4/1 17:00 3 | # @Author : Torres-圣君 4 | # @File : get_proxyz.py 5 | # @Sofaware : PyCharm 6 | import random 7 | 8 | 9 | def get_proxies(): 10 | proxies_list = [ 11 | { 12 | "https": "https://58.220.95.42:10174" 13 | }, { 14 | "https": "https://118.163.13.200:8080" 15 | }, { 16 | "http": "http://223.96.90.216:8085" 17 | }, { 18 | "http": "http://165.225.202.95:10605" 19 | }, { 20 | "https": "https://139.198.157.59:7890" 21 | }, { 22 | "http": "http://120.220.220.95:8085" 23 | }, { 24 | "http": "http://182.61.201.201:80" 25 | }, { 26 | "http": "http://165.225.206.106:10605" 27 | }, { 28 | "https": "https://117.26.40.251:3712" 29 | }, { 30 | "http": "http://39.130.150.43:80" 31 | }, { 32 | "https": "https://103.38.80.138:3128" 33 | }, { 34 | "http": "http://39.130.150.42:80" 35 | }, { 36 | "http": "http://113.96.62.246:8081" 37 | }, { 38 | "http": "http://39.130.150.44:80" 39 | }, { 40 | "http": "http://112.6.117.135:8085" 41 | }, { 42 | "http": "http://39.130.150.44:80" 43 | }, { 44 | "http": "http://165.225.76.175:10605" 45 | }, { 46 | "https": "https://223.112.99.150:80" 47 | }, { 48 | "http": "http://39.130.150.44:80" 49 | }, { 50 | "https": "https://40.83.102.86:80" 51 | }, { 52 | "https": "https://113.21.237.83:443" 53 | }, { 54 | "http": "http://112.6.117.178:8085" 55 | }, { 56 | "http": "http://218.59.139.238:80" 57 | }, { 58 | "https": "https://210.5.10.87:53281" 59 | }, { 60 | "http": "http://183.247.199.153:30001" 61 | }, { 62 | "http": "http://112.6.117.178:8085" 63 | }, { 64 | "http": "http://47.113.90.161:83" 65 | }, { 66 | "https": "https://222.69.240.130:8001" 67 | }, { 68 | "https": "https://14.20.235.19:45770" 69 | }, { 70 | "http": "http://165.225.204.12:10605" 71 | }, { 72 | "http": "http://103.148.72.192:80" 73 | }, { 74 | "http": "http://165.225.76.165:10605" 75 | }, { 76 | "http": "http://120.220.220.95:8085" 77 | }, { 78 | "http": "http://103.37.141.69:80" 79 | }, { 80 | "https": "https://103.133.177.141:443" 81 | }, { 82 | "http": "http://223.96.90.216:8085" 83 | }, { 84 | "http": "http://120.220.220.95:8085" 85 | }, { 86 | "http": "http://221.122.91.60:80" 87 | 
}, { 88 | "https": "https://47.93.48.155:8888" 89 | }, { 90 | "http": "http://103.148.72.192:80" 91 | }, { 92 | "http": "http://120.220.220.95:8085" 93 | }, { 94 | "https": "https://42.193.253.152:8089" 95 | }, 96 | ] 97 | return random.choice(proxies_list) 98 | -------------------------------------------------------------------------------- /huya_all_types_spider/get_types_user_msg.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/5/18 18:13 3 | # @Author : Torres-圣君 4 | # @File : get_types_user_msg.py 5 | # @Sofaware : PyCharm 6 | # https://www.huya.com/cache.php?m=LiveList&do=getLiveListByPage&gameId=1&tagAll=0&callback=getLiveListJsonpCallback&page=1 7 | import requests 8 | import json 9 | import re 10 | import threading 11 | from openpyxl import Workbook 12 | from get_proxyz import get_proxies 13 | from get_ua import get_ua 14 | 15 | 16 | class NowLiveUsers: 17 | def __init__(self, key, url, gameId): 18 | self.lock = threading.Lock() 19 | self.key = key 20 | self.url = "https://www.huya.com/cache.php" 21 | self.headers = { 22 | "Host": "www.huya.com", 23 | "Referer": url, 24 | "User-Agent": get_ua() 25 | } 26 | self.params = { 27 | "m": "LiveList", 28 | "do": "getLiveListByPage", 29 | "gameId": gameId, 30 | "tagAll": 0, 31 | "callback": "getLiveListJsonpCallback", 32 | "page": 1 33 | } 34 | 35 | def get_page_msg(self): 36 | # 创建一个用于汇总页面数据的列表 37 | all_users_data = [ 38 | ["主播头像", "主播昵称", "房间ID号", "房间标题", "房间标签", "直播链接"] 39 | ] 40 | count = 0 41 | # 循环请求不同的页面 42 | while True: 43 | # 启用线程锁,防止数据穿线 44 | with self.lock: 45 | count += 1 46 | # 设置请求参数的页码值 47 | self.params['page'] = count 48 | # 对页面发送请求 49 | res = requests.get(self.url, headers=self.headers, params=self.params, proxies=get_proxies()).text 50 | # 使用re提取数据 51 | dict_data = re.findall(r'getLiveListJsonpCallback\((.*)\)', res)[0] 52 | json_data = json.loads(dict_data) 53 | data_list = json_data["data"]["datas"] 54 | # 如果页面返回为空时,跳出循环 55 | if len(data_list) != 0: 56 | for data in data_list: 57 | user_data = [ 58 | data['avatar180'], 59 | data['nick'], 60 | data['profileRoom'], 61 | data['roomName'], 62 | data['recommendTagName'], 63 | "https://www.huya.com/" + data['profileRoom'] 64 | ] 65 | # 将数据添加进页面汇总列表 66 | all_users_data.append(user_data) 67 | # 展示数据 68 | print(all_users_data) 69 | else: 70 | # 保存数据到Excel表格 71 | self.save_data(all_users_data) 72 | break 73 | 74 | def save_data(self, all_users_data_list): 75 | # 创建新的excel表格 76 | wb = Workbook() 77 | # 创建新的工作蒲 78 | sheet = wb.create_sheet(self.key, -1) 79 | # 遍历表格索引,写入数据 80 | for x in range(len(all_users_data_list)): 81 | for y in range(len(all_users_data_list[x])): 82 | sheet.cell(x+1, y+1).value = all_users_data_list[x][y] 83 | # 保存该文件 84 | wb.save(f"./data/{self.key}_直播用户信息.xlsx") 85 | -------------------------------------------------------------------------------- /huya_all_types_spider/get_ua.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/4/1 17:02 3 | # @Author : Torres-圣君 4 | # @File : get_ua.py 5 | # @Sofaware : PyCharm 6 | import random 7 | 8 | 9 | def get_ua(): 10 | user_agent_list = [ 11 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 12 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 13 | "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0", 14 | 
"Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko", 15 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)", 16 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)", 17 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)", 18 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)", 19 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 20 | "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 21 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11", 22 | "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11", 23 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", 24 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)", 25 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)", 26 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)", 27 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)", 28 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)", 29 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)", 30 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)", 31 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)", 32 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", 33 | "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", 34 | "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", 35 | "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 36 | "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 37 | "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10", 38 | "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13", 39 | "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+", 40 | "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0", 41 | "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124", 42 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)", 43 | "UCWEB7.0.2.37/28/999", 44 | "NOKIA5700/ UCWEB7.0.2.37/28/999", 45 | "Openwave/ UCWEB7.0.2.37/28/999", 46 | "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999", 47 | "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25", 48 | ] 49 | # 设置UA伪装 50 | return random.choice(user_agent_list) 51 | 
-------------------------------------------------------------------------------- /huya_all_types_spider/run_spider.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/5/18 17:36 3 | # @Author : Torres-圣君 4 | # @File : download_fonts.py 5 | # @Sofaware : PyCharm 6 | # https://www.huya.com/g 全部分类链接 7 | import threading 8 | import requests 9 | import json 10 | import time 11 | from lxml import etree 12 | from get_types_user_msg import NowLiveUsers 13 | 14 | 15 | class HuyaAllTypes: 16 | def __init__(self): 17 | self.url = "https://www.huya.com/g" 18 | self.headers = { 19 | "Host": "www.huya.com", 20 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.47" 21 | } 22 | 23 | def get_types_url(self): 24 | # 请求目标网站,并将GBK的'\xa0'转为对应的空格 25 | res = requests.get(self.url, headers=self.headers).text.replace(u'\xa0', u' ') 26 | html = etree.HTML(res) 27 | all_links_list = html.xpath('//*[@id="js-game-list"]/li') 28 | print("共发现%d种分类" % len(all_links_list)) 29 | # 创建字典,用于存放所有数据 30 | all_types_msg = dict() 31 | # 循环获取所有分类信息 32 | for all_links in all_links_list: 33 | # 字典的存放格式 --> {分类的名称 :[分类的链接, 分类的gameId]} 34 | all_types_msg[all_links.xpath('./a/p/text()')[0]] = [ 35 | all_links.xpath('./a/@href')[0], 36 | all_links.xpath('./a/img/@src')[0].split('/')[-1].split('-')[0] 37 | ] 38 | # 将分类信息保存到本地 39 | self.save_all_types(all_types_msg) 40 | return all_types_msg 41 | 42 | def save_all_types(self, all_types_msg): 43 | json_data = json.dumps(all_types_msg, indent=1, ensure_ascii=False) 44 | # 将分类信息写入JSON文件 45 | with open('./data/all_types_msg.json', 'w') as w: 46 | w.write(json_data) 47 | print("\n全部分类信息保存完毕!") 48 | 49 | 50 | if __name__ == '__main__': 51 | # 获取所有分类的链接 52 | huya = HuyaAllTypes() 53 | all_types_dict_msg = huya.get_types_url() 54 | # 获取每个分类下的所有直播用户 55 | tasks = [] 56 | for key, val in all_types_dict_msg.items(): 57 | users_msg = NowLiveUsers(key, val[0], val[1]) 58 | tasks.append( 59 | threading.Thread(target=users_msg.get_page_msg) 60 | ) 61 | # users_msg = NowLiveUsers(key, val[0], val[1]) 62 | # users_msg.get_page_msg() 63 | for task in tasks: 64 | # 还是设置间隔1秒比较好点 65 | time.sleep(1) 66 | task.start() 67 | for task in tasks: 68 | task.join() 69 | -------------------------------------------------------------------------------- /lagou_jobs_spider/README.md: -------------------------------------------------------------------------------- 1 | ##
✨获取拉钩网岗位的招聘信息✨
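> The spider below drives the site with playwright's synchronous API. As a minimal sketch of that pattern (same URL and selector as `run_spider.py`, which may break if the page layout changes), this opens the search page and reads the total page count:

```python
from playwright.sync_api import sync_playwright

with sync_playwright() as pw:
    browser = pw.chromium.launch()        # headless by default
    page = browser.new_page()
    page.goto("https://www.lagou.com/jobs/list_Python爬虫")
    total = page.locator('xpath=//*[@id="order"]/li/div[4]/div[3]/span[2]').text_content()
    print(f"共找到<{total}>页相关数据")
    browser.close()
```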
2 | - 拉钩招聘网:[https://www.lagou.com/](https://www.lagou.com/) 3 | 4 | - 输入指定岗位,抓取该岗位的所有招聘信息,包含: 5 | - 工作标题 6 | - 工作链接 7 | - 公司名称 8 | - 薪资范围 9 | - 投递要求 10 | - 公司地址 11 | - 爬取的数据存储方式: 12 | - 通过a追加模式,将爬取的数据存储到`data`文件夹下的`xxx.csv`文件 13 | - 该爬虫使用到的模块: 14 | - os 15 | - csv 16 | - playwright 17 | - `playwright`:新一代自动化工具,相比selenium速度更快,书写更佳 18 | - playwright使用教程:[点击链接进入](https://blog.csdn.net/qq_44091819/article/details/124656846) -------------------------------------------------------------------------------- /lagou_jobs_spider/data/Python爬虫.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/lagou_jobs_spider/data/Python爬虫.csv -------------------------------------------------------------------------------- /lagou_jobs_spider/data/lagou.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/lagou_jobs_spider/data/lagou.png -------------------------------------------------------------------------------- /lagou_jobs_spider/run_spider.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/5/10 19:18 3 | # @Author : Torres-圣君 4 | # @File : run_spider.py 5 | # @Software : PyCharm 6 | import os 7 | import csv 8 | from playwright.sync_api import sync_playwright 9 | 10 | 11 | class LagouJbos: 12 | def __init__(self, job_name): 13 | self.job_name = job_name 14 | self.url = "https://www.lagou.com/jobs/list_" + job_name 15 | self.flag = True 16 | 17 | def get_page_data(self): 18 | with sync_playwright() as pw: 19 | browser = pw.chromium.launch() 20 | page = browser.new_page() 21 | page.goto(self.url) 22 | # 获取总页数 23 | page_max = page.locator('xpath=//*[@id="order"]/li/div[4]/div[3]/span[2]').text_content() 24 | print(f"共找到<{page_max}>页相关数据") 25 | self.is_file() 26 | for i in range(0, int(page_max)): 27 | print(f"正在获取第<{i+1}>页") 28 | self.get_jobs_data(page) 29 | print(f"第<{i+1}>页数据写入完毕,正在进入下一页...") 30 | page.click('xpath=//*[@id="order"]/li/div[4]/div[2]') 31 | page.screenshot(path=f"./data/lagou.png") 32 | browser.close() 33 | 34 | def get_jobs_data(self, page): 35 | try: 36 | jobs_data_list = page.query_selector_all('//*[@id="s_position_list"]/ul/li') 37 | # print(len(jobs_data_list)) 38 | for jobs_data in jobs_data_list: 39 | item = dict() 40 | # 工作标题 41 | item['job_title'] = jobs_data.query_selector('xpath=./div[1]/div[1]/div[1]/a/h3').text_content() 42 | # 工作链接 43 | item['job_link'] = jobs_data.query_selector('xpath=./div[1]/div[1]/div[1]/a').get_attribute('href') 44 | # 公司名称 45 | item['job_company'] = jobs_data.query_selector('xpath=./div[1]/div[2]/div[1]/a').text_content().strip('\n') 46 | # 薪资范围 47 | item['job_price'] = jobs_data.query_selector('xpath=./div[1]/div[1]/div[2]/div/span').text_content() 48 | # 投递要求 49 | item['job_demand'] = jobs_data.query_selector('xpath=./div[1]/div[1]/div[2]/div').text_content().strip(' \n').split('\n')[-1] 50 | # 公司地址 51 | item['job_address'] = jobs_data.query_selector('xpath=./div[1]/div[1]/div[1]/a/span/em').text_content() 52 | # 将数据保存为csv格式 53 | self.save_data(item) 54 | except: 55 | pass 56 | 57 | def save_data(self, item): 58 | # 写入的数据为字典类型 59 | with open(f'./data/{self.job_name}.csv', 'a', newline='') as w: 60 | # 创建一个csv的DictWriter对象 61 | w_csv = csv.DictWriter(w, ['job_title', 'job_link', 'job_company', 'job_price', 'job_demand', 
'job_address']) 62 | if self.flag: 63 | # 写入一行当表头,即字典键名 64 | w_csv.writeheader() 65 | self.flag = False 66 | # 写入对行数据,即字典的所有值 67 | w_csv.writerow(item) 68 | 69 | def is_file(self): 70 | try: 71 | # 检测文件是否存在,用于相同工作二次执行 72 | os.remove(f'./data/{self.job_name}.csv') 73 | except: 74 | pass 75 | 76 | 77 | if __name__ == '__main__': 78 | job_name = input("请输入职位名称:") 79 | lagou = LagouJbos(job_name) 80 | lagou.get_page_data() 81 | -------------------------------------------------------------------------------- /lol_hero_message_spider/README.md: -------------------------------------------------------------------------------- 1 | ##
✨获取LOL全英雄的资料信息✨
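> The data comes from two public JSON endpoints: the hero index, then one detail file per hero slug. A minimal sketch of that flow, using the same URLs and fields as `run_spider.py`:

```python
import requests

index_url = "https://yz.lol.qq.com/v1/zh_cn/champion-browse/index.json"
champions = requests.get(index_url, timeout=10).json()["champions"]

# Fetch the detail file for the first hero in the index
slug = champions[0]["slug"]
detail_url = f"https://yz.lol.qq.com/v1/zh_cn/champions/{slug}/index.json"
detail = requests.get(detail_url, timeout=10).json()["champion"]
print(detail["title"] + "·" + detail["name"], detail["image"]["uri"])
```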
2 | - LOL全英雄资料信息网站:[https://yz.lol.qq.com/zh_CN/champions/](https://yz.lol.qq.com/zh_CN/champions/) 3 | 4 | - 获取LOL全英雄的资料信息,包含: 5 | - 上线日期 6 | - 英文名称 7 | - 英雄名称 8 | - 英雄定位 9 | - 英雄台词 10 | - 英雄链接 11 | - 原画链接 12 | - 故事简述 13 | - 背景故事 14 | - 爬取的数据存储方式: 15 | - 通过w写入模式,将爬取的数据存储到`heroes_data.json`文件 16 | - 该爬虫使用到的模块: 17 | - requests 18 | - json 19 | - time -------------------------------------------------------------------------------- /lol_hero_message_spider/heroes_data.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "上线日期": "2013-06-13", 4 | "英文名称": "aatrox", 5 | "英雄名称": "暗裔剑魔·亚托克斯", 6 | "英雄定位": "战士, 坦克", 7 | "英雄台词": "我必须连同希望一起毁坏……", 8 | "英雄链接": "https://yz.lol.qq.com/zh_CN/champion/aatrox", 9 | "原画链接": "https://game.gtimg.cn/images/lol/universe/v1/assets/images/champion/splash/Aatrox_0.jpg", 10 | "故事简述": "亚托克斯曾是恕瑞玛抗击虚空时的伟大战士。但是,他和他的同胞却有可能变成符文之地更大的威胁。最终,他们败给了凡人的狡诈巫术,自身的精魂被锁在了武器之内。数百年的监禁之后,亚托克斯头一个挣脱出来,腐蚀并转化那些胆敢染指的蠢人。现在,他将夺来的血肉模仿着自己曾经的形象粗暴地重塑,渴望着迟来许久的末世复仇。", 11 | "背景故事": "许多传说都曾提到过暗裔魔剑,有的描述他是天神,也有的说他是恶魔。但很少有人知道他的真名,以及他是如何败落的。上古时代,远在黄沙吞噬帝国之前,一位伟大的恕瑞玛英雄被带到了太阳圆盘面前,成为一个如今无人记得的星间信念的化身。当他被重塑为飞升者之后,他的双翼彷如黎明时分的金光,盔甲闪亮,如同深空巨帷背后引人遥望的星座。亚托克斯就是他的真名。他在每一场高贵的战斗中都冲锋在前。他真诚待人、领兵有方,其他天神战士总是聚在他的麾下,身后则跟随着一万名恕瑞玛的凡人士兵。当飞升武后瑟塔卡因为艾卡西亚的叛乱而寻求他的帮助时,亚托克斯毫不犹豫地应允了。但是,没人能预料到当地的叛军后来竟然释放出了如此恐怖的力量。虚空转瞬间反客为主,吞噬了艾卡西亚,之后便开始毁灭一切所遭遇的生灵。经过多年苦战,亚托克斯和他的同胞终于遏制住了虚空狂乱的扩张,并将最大的裂口烧熔封铸了起来。但是,活下来的飞升者——他们自称为太阳血脉,却被他们的敌人永远地改变了。虽然恕瑞玛得胜了,但他们全都失去了一些东西……高贵的亚托克斯也不例外。时光流逝,恕瑞玛也陨落了。正如所有帝国的命运。没有了誓死守卫的王权,虚空的威胁也不再迫切,亚托克斯和太阳血脉开始互相争斗,最终演变成了一场战争,毁灭了他们的世界。侥幸逃脱的凡人给了他们一个新的名字,也是一个蔑称:暗裔。正如虚空的侵袭一样,因为担忧堕落的飞升者们也会危及符文之地的生存,巨神族便出手干涉了。据说,暮光星灵传授给了凡人禁锢暗裔的手段,而新近重生的战争星灵联合起了大军对抗他们。亚托克斯和他的军队何曾畏惧,早已蓄势待发。但是,等到他发觉自己中计的时候已经太晚了。一股比上千颗死去的恒星更强大的引力将他拖入了他手中随他出征无数次的巨剑,把他不朽的精魂永远地锁闭在内。这把武器是一座监狱,将他的意识封禁在密不透风的永恒黑暗里,甚至剥夺了他自我了断的能力。他与这地狱般的桎梏拉扯了数百年,直到某个愚蠢透顶的无名氏再次抓起这把巨剑。亚托克斯把握住机会,强行将意志注入到宿主体内,并模仿自己原本的形象重塑了宿主的躯体,同时也夺去了宿主的生命。此后数年间,亚托克斯侵占了许多宿主——无论男女,只要是生机勃勃,或是刚毅非凡。虽然他所掌握的魔法不算精深,但他却能在转瞬间便夺取凡人的身体。而且在战斗中,他发觉死去的人也能为他所用,把自己变得更加健硕强壮。亚托克斯在大地上巡游,不停地竭力寻找能够让他重回飞升之身的办法,但这把剑身上的谜团最终也无法解开,并且他也意识到自己永远也不能获得自由。强夺而来并残忍重塑的血肉愈发地像是一种嘲弄,嘲弄着他曾经的荣光——而那也不过是另一个比巨剑稍微大一些的牢笼罢了。绝望与羞愧在他心中滋长。他曾经所代表的神力,和他所有的记忆统统都被抹去了。不公的命运令他出离地愤怒了。而他最终想到的办法,完全是一个囚犯刻骨的绝望。如果他不能摧毁这把剑,也不能解脱自己,那他就拥抱湮灭好了。现在,亚托克斯怀抱着这无情的决心,沿途散布战争和死亡。他心中只剩下一个盲目的期望:如果他可以把一切造物都拖进一场最终的末日之战——一切都会因此毁灭——那么也许他和这把剑也会永远地不复存在。" 12 | }, 13 | { 14 | "上线日期": "2011-12-14", 15 | "英文名称": "ahri", 16 | "英雄名称": "九尾妖狐·阿狸", 17 | "英雄定位": "法师, 刺客", 18 | "英雄台词": "人心善变,甚于最深奥的魔法。", 19 | "英雄链接": "https://yz.lol.qq.com/zh_CN/champion/ahri", 20 | "原画链接": "https://game.gtimg.cn/images/lol/universe/v1/assets/images/champion/splash/Ahri_0.jpg", 21 | "故事简述": "天生就与精神领域的魔法存在连接的阿狸,是一名狐狸模样的瓦斯塔亚,在世界上寻找着自己所属的位置。进入凡人社会以后,她成为了一名充满悔意和同情心的掠食者,她喜欢操纵猎物的情绪,然后再吸食他们的生命精魄——每吞噬一个灵魂,都伴着他们生前的记忆片段与领悟洞见。", 22 | "背景故事": "阿狸的身世是个迷,甚至她自己也不清楚。
\n\n她找不到自己瓦斯塔亚部族的历史,也不知道自己这一族在其他瓦斯塔亚中的地位。留给她的线索只有她此生一直佩戴的双生宝石。事实上,她最早的记忆,是在尚赞北部与一群冰原狐共同奔跑。虽然她知道自己不是它们的一员,但它们却将她视为相同的灵魂,将她接纳为狐群的一员。\n\n在那段狂野狩猎的日子里,阿狸始终感受到自己与周围的森林存在着更深层次的连接。不多久,她便领悟到,这就是流淌在她体内的瓦斯塔亚魔法,与彼端的精神领域产生着共鸣。虽然没有人教她,但她却以自己的方式学会使用这股力量——最常用的方式是强化自己的反应速度,便于追逐猎物。而如果她小心靠近,甚至还能安抚一只受惊的小鹿,即使被她的利齿刺入血肉,也一直保持安宁冷静。\n\n凡人的世界对于阿狸和冰原狐都很遥远、嘈杂,但她却因为某种说不清的理由感到一种吸引力。人类是一种尤其粗鲁、生硬的生物……一天,有一群猎人在附近扎营,阿狸从远处看着他们进行可怕的工作。\n\n当其中一人被弓箭误伤的时候,阿狸感受到他渐渐流失的生命。生而为掠食者的她,在仅有的本趋势下,品尝了潺潺流出的精魄,与此同时她获得了这位猎人零散的记忆——在战斗中殒命的爱人、留在南方家中的孩子们。阿狸轻轻把他的情绪从恐惧推到悲伤再推到快乐,用温馨的景象安慰他,让他的濒死记忆停留在一片暖融融的草地上。\n\n后来,她发现自己能轻易理解人类的词汇,他们的语言就像一场模糊的梦,于是阿狸知道,是时候该离开自己的狐群了。\n\n她游离在人类社会的边缘,从未感到如此充满活力。她依然保留着掠食者本性,但却陷入了许许多多新体验、情绪和艾欧尼亚的传统之中。看起来,凡人也同样会被她迷得神魂颠倒——她经常利用这一点,吸取他们的精魄,同时让他们陷入优美的回忆、渴望的幻象和痛彻心扉的忧伤梦境之中。\n\n那些不属于她的记忆令她沉醉,而结束他人性命则让她感到精神焕发,只不过她也能感受到自己给受害者带去的悲伤和痛苦。记忆的闪回让她体验到短暂而又美妙的心碎与欣喜,让她欲罢不能。她在记忆的画面中看到一群来自铁与石之地的残忍入侵者,并为猎物落泪。这种感觉让她无所适从,但每当她试图远离人类,就会感到自己的力量开始消散,于是忍不住一次次进食……也一次次因此而痛苦。\n\n通过无数个偷来的记忆,阿狸开始瞥见更多关于瓦斯塔亚的信息。看来她并不是孤身一人,现在有许多部族都与凡人存在某种紧张关系。最后,她得知了一场反叛运动,目标是要让瓦斯塔亚诸族恢复往日的荣光。\n\n或许,这将带她找到那段缺失回忆的过去。\n\n
她手中紧握着那对双生宝石,出发寻找自己的同类。她将不再依赖那些借来的回忆和陌生的梦境——如果她的部族依然在符文之地上留存着痕迹,那她就一定要找到它。" 23 | }, 24 | { 25 | "上线日期": "2010-05-11", 26 | "英文名称": "akali", 27 | "英雄名称": "离群之刺·阿卡丽", 28 | "英雄定位": "刺客", 29 | "英雄台词": "如果你看上去凶神恶煞,你最好真的是凶神恶煞。", 30 | "英雄链接": "https://yz.lol.qq.com/zh_CN/champion/akali", 31 | "原画链接": "https://game.gtimg.cn/images/lol/universe/v1/assets/images/champion/splash/Akali_0.jpg", 32 | "故事简述": "无论是均衡教派还是暗影之拳的称号,都已被阿卡丽抛弃,如今的阿卡丽独来独往,随时可以成为她的人民所需要的夺命武器。虽然她牢牢铭记着她从宗师慎身上学来的一切,但她效忠保护艾欧尼亚并铲除敌人,每次一条命。或许阿卡丽的出击悄然无声,但她传达的信息将响亮无比:不听命于任何人的刺客最为可怕。", 33 | "背景故事": "艾欧尼亚一直以来都是个充满狂野魔法的地方,形形色色的人类和强大的灵体力求和谐共处……但有的时候这宁静的平衡并不能轻易得来。有的时候平衡也需要维护。均衡教派就是一个以维护艾欧尼亚神圣平衡为己任的团体。教派的信徒们行走于精神与物质两个世界之中,协调两界之间的冲突,而且有必要的时候,也会使用强制力介入。阿卡丽天生就是其中一员,她的母亲是梅目•约曼•特曦,身居暗影之拳的位置,她和她的伴侣塔诺决定让女儿从小就在均衡教派内成长,接受教派的宗师——暮光之眼苦说大师的精悉引导。每当阿卡丽的父母受派外出,教派的其他成员就会担当阿卡丽的代理家长,狂暴之心凯南就曾和这个小姑娘共同度过了许多时光,传授她手里剑的技法,教她利用速度和敏捷而非蛮力。阿卡丽拥有超越同龄人的心智,像吸水的海棉一样吸收学识。所有人都看得出,她将沿着自己父母的道路成长——她将伴着宗师的儿子和既定的继任者慎一起,引领新一代信徒,致力于保护艾欧尼亚的平衡。但平衡往往转瞬即逝,教派内部发生了分裂。一个名叫劫的信徒修行归来,刚愎自用的他与苦说大师激烈交锋,通过一场血腥的哗变让教派内部的权力动荡。阿卡丽逃到了东边的山林之中,一起逃走的还有梅目、慎、凯南和其他几名信徒。令人惋惜的是,塔诺不在其中。劫已经几乎把均衡教派变成了绝情的影流。但作为新一任暮光之眼的慎想要重建那些逝去的东西。他们要回归均衡的三大基本理念:以观星寻找纯粹的公正,用逐日执行正义的审判,并通过修枝根除一切不平衡。虽然他们势单力薄,但他们会训练新的信徒,光复并再次振兴他们的教派。当阿卡丽长大到十四岁的时候,她正式开始了均衡教派的训练,决心要继承母亲的位置,成为新的暗影之拳。阿卡丽是个练武奇才,精通钩镰和苦无——也就是单手镰和飞刀。虽然她不像其他信徒那样掌握魔法能力,但她依然用实力证明自己配得上这个称号,不久后,她的母亲就得以退居后方,担任年轻一代信徒的导师。但阿卡丽的灵魂坐立难安,她始终睁眼看着世界。在诺克萨斯入侵艾欧尼亚的战争余波中,虽然均衡和影流之间达成了脆弱的一致,但她却看到自己的家乡继续遭受着痛苦。她质疑他们的教派是否真正履行着职责。所谓修枝,就应该根除一切威胁到神圣平衡的人……然而慎却总是敦促克制。他总是在限制她。所有那些颂文和冥想都能让她的灵魂获得安宁,但这些陈词滥调却不能击败他们的敌人。她的热血冲动和超前成熟变成了不加掩饰的叛逆。她与慎大声争论,她公然反抗他,她用自己的方式解决艾欧尼亚的敌人。当着整个教派的面,她公然宣称均衡教派都是无能之辈,所有关于精神平衡和耐心的说教都毫无成效。艾欧尼亚人正在物质领域垂死挣扎,所以这也是阿卡丽要保护的领域。她接受了刺客的训练,所以她要刺客该做的事。她已经不再需要这个教派了。慎没有挽留也没有阻止,他知道这条路必须由阿卡丽自己走下去。或许有朝一日她会沿着那条路走回来,但那一切都将由她自己决定。" 34 | } 35 | ] -------------------------------------------------------------------------------- /lol_hero_message_spider/run_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | # @Time : 2021/11/15 14:06 3 | # @Author : Torres-圣君 4 | # @File : run_spider.py 5 | # @Software : PyCharm 6 | import time 7 | import json 8 | import requests 9 | 10 | 11 | class HeroMessage: 12 | def __init__(self): 13 | self.headers = { 14 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.64 Safari/537.36 Edg/101.0.1210.53", 15 | } 16 | self.heroes_message_list = [] 17 | self.error_list = [] 18 | 19 | # 获取英雄对应的链接 20 | def get_heroes_link(self): 21 | url = "https://yz.lol.qq.com/v1/zh_cn/champion-browse/index.json" 22 | res = requests.get(url, headers=self.headers).json() 23 | heroes_list = res["champions"] 24 | for heroes in heroes_list: 25 | item = dict() 26 | # 英雄上线日期 27 | item['上线日期'] = heroes["release-date"][:10] 28 | # 英雄英文名称 29 | item['英文名称'] = heroes["slug"] 30 | # 英雄信息的链接 31 | heroes_slug_link = f"https://yz.lol.qq.com/v1/zh_cn/champions/{item['英文名称']}/index.json" 32 | # 获取英雄详细信息 33 | try: 34 | self.heroes_message_list.append( 35 | self.get_heroes_msg(heroes_slug_link, item) 36 | ) 37 | except: 38 | print(heroes_slug_link, "获取信息失败!") 39 | self.error_list.append(heroes_slug_link) 40 | time.sleep(0.5) 41 | # 保存英雄全部数据 42 | self.save_data(self.heroes_message_list) 43 | # 采集失败的链接 44 | print("采集失败的链接", self.error_list) 45 | 46 | # 获取英雄别名->英雄名全称 47 | def get_heroes_msg(self, heroes_slug_link, item): 48 | # 显示正在请求的链接 49 | print("正在获取:", heroes_slug_link) 50 | # 对链接发送请求 51 | res = requests.get(heroes_slug_link, self.headers).json() 52 | # 英雄中文名称 53 | item['英雄名称'] = 
res["champion"]["title"] + "·" + res["champion"]["name"] 54 | # 英雄定位 55 | item['英雄定位'] = ", ".join([roles["name"] for roles in res["champion"]["roles"]]) 56 | # 英雄台词 57 | item['英雄台词'] = res["champion"]["biography"]["quote"].strip("“”").replace("", "") 58 | # 英雄链接 59 | item['英雄链接'] = "https://yz.lol.qq.com/zh_CN/champion/" + item['英文名称'] 60 | # 英雄原画 61 | item['原画链接'] = res["champion"]["image"]["uri"] 62 | # 英雄精简故事 63 | item['故事简述'] = res["champion"]["biography"]["short"].strip("

") 64 | # 英雄完整故事 65 | item['背景故事'] = res["champion"]["biography"]["full"].strip("

").replace("

", "").replace("", "").replace(r"\n", "") 66 | # print(item) 67 | return item 68 | 69 | # 保存英雄数据 70 | def save_data(self, dict_data): 71 | data = json.dumps(dict_data, indent=1, ensure_ascii=False) 72 | with open("heroes_data.json", "w", encoding='utf-8') as w: 73 | w.write(data) 74 | print("英雄信息写入完成...") 75 | 76 | 77 | if __name__ == '__main__': 78 | hero = HeroMessage() 79 | hero.get_heroes_link() 80 | -------------------------------------------------------------------------------- /lol_skins_spider/README.md: -------------------------------------------------------------------------------- 1 | ##
✨获取LOL道聚城所有皮肤信息✨
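补充说明本节 `run_spider.py` 中用到的一个小技巧:接口返回的 JSONP 文本把中文转义成了 `\uXXXX` 形式,所以下文代码对 `propName` 做了 `.encode('utf8').decode('unicode_escape')` 处理。下面是一个独立的小示例(示例字符串为虚构数据,仅作演示,并非接口真实返回):

```python
# 演示 run_spider.py 中 unicode_escape 解码的作用:
# 把 \uXXXX 转义序列还原成中文
raw = r'"propName":"\u7687\u5bb6\u72ee\u5fc3"'  # 虚构的示例数据
decoded = raw.encode("utf8").decode("unicode_escape")
print(decoded)  # 输出:"propName":"皇家狮心"
```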
2 | - LOL道聚城所有皮肤:[https://daoju.qq.com/lol/list/17-0-0-0-0-0-0-0-0-0-0-00-0-0-1-1.shtml](https://daoju.qq.com/lol/list/17-0-0-0-0-0-0-0-0-0-0-00-0-0-1-1.shtml) 3 | 4 | - 爬取LOL道聚城所有皮肤,包含: 5 | - 皮肤名称 6 | - 皮肤价格(点券) 7 | - 皮肤上架日期 8 | - 爬取的数据存储方式: 9 | - 通过w写入模式,将爬取的数据存储到`lol_skins_data.json`文件 10 | - 该爬虫使用到的模块: 11 | - requests 12 | - re 13 | - time 14 | - json -------------------------------------------------------------------------------- /lol_skins_spider/run_spider.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/5/6 12:03 3 | # @Author : Torres-圣君 4 | # @File : run_spider.py 5 | # @Software : PyCharm 6 | import requests 7 | import time 8 | import re 9 | import json 10 | 11 | 12 | class LolSkins: 13 | def __init__(self): 14 | self.url = "https://apps.game.qq.com/daoju/v3/api/hx/goods/app/v71/GoodsListApp.php?" 15 | self.headers = { 16 | "referer": "https://daoju.qq.com/", 17 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.41 Safari/537.36 Edg/101.0.1210.32" 18 | } 19 | self.params = { 20 | "view": "biz_cate", 21 | "page": int, 22 | "pageSize": 16, 23 | "orderby": "dtShowBegin", 24 | "ordertype": "desc", 25 | "cate": 17, 26 | "appSource": "pc", 27 | "plat": 1, 28 | "output_format": "jsonp", 29 | "biz": "lol", 30 | "_": int(time.time() * 1000) 31 | } 32 | 33 | def get_data(self): 34 | # 初始化字典 35 | all_skins_data = dict() 36 | # 循环请求页面 37 | for i in range(1, 51): 38 | # 请求间隔 39 | time.sleep(1) 40 | # 参数页码 41 | self.params['page'] = i 42 | # 发送请求 43 | res = requests.get(self.url, headers=self.headers, params=self.params) 44 | # 提取数据 45 | skins_list = self.data_format(res.text) 46 | # 添加进字典 47 | all_skins_data[f"lol道具城第<{i}>页"] = skins_list 48 | # 保存数据 49 | self.save_data(all_skins_data) 50 | 51 | def data_format(self, data): 52 | # 皮肤名称 53 | skin_name_list = re.findall(r'"propName":"(.*?)"', data) 54 | # 皮肤价格 55 | skin_price_list = re.findall(r'"iDqPrice":"(\d+)"', data) 56 | # 上架日期 57 | skin_date_list = re.findall(r'"dtBegin":"(.*?)"', data) 58 | 59 | skins_list = [] 60 | 61 | for i in range(0, len(skin_name_list)): 62 | item = dict() 63 | item["skin_name"] = str(skin_name_list[i]).encode('utf8').decode('unicode_escape').replace("\\", "") 64 | item["skin_price"] = skin_price_list[i] 65 | item["skin_date"] = skin_date_list[i] 66 | skins_list.append(item) 67 | # 展示数据 68 | print(item) 69 | 70 | return skins_list 71 | 72 | def save_data(self, all_skins_data): 73 | # JSON序列化 74 | json_data = json.dumps(all_skins_data, indent=1, ensure_ascii=False) 75 | with open("lol_skins_data.json", "w", encoding="utf-8") as w: 76 | w.write(json_data) 77 | 78 | 79 | if __name__ == '__main__': 80 | lol = LolSkins() 81 | lol.get_data() -------------------------------------------------------------------------------- /maoyan_data_spider/README.md: -------------------------------------------------------------------------------- 1 | ##
✨获取猫眼电影实时数据✨
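补充一点使用提示:下文的 `save_data.py` 通过 `load_workbook("./data/猫眼实时数据.xlsx")` 打开已有表格,文件不存在会抛出 `FileNotFoundError`,因此首次运行前需要先有这个文件。下面是一个示意性的小函数(`ensure_workbook` 为假设的名字,并非仓库中的代码),可在运行爬虫前创建空表格:

```python
# 示意代码:首次运行前确保 data/猫眼实时数据.xlsx 已存在
from pathlib import Path
from openpyxl import Workbook


def ensure_workbook(path="./data/猫眼实时数据.xlsx"):
    p = Path(path)
    if not p.exists():
        p.parent.mkdir(parents=True, exist_ok=True)  # 确保 data 目录存在
        Workbook().save(p)                           # 创建一个空的工作簿
```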
2 | - 猫眼专业版实时数据:[https://piaofang.maoyan.com/](https://piaofang.maoyan.com/) 3 | - 一键爬取猫眼实时数据,包含: 4 | - 电影实时票房 5 | - 电影当日排片 6 | - 网络影视热度榜 7 | - 电视收视节目排行 8 | - 爬取的数据存储方式: 9 | - 通过使用`openpyxl`模块,将爬取的数据存储到`data`文件夹下的`猫眼实时数据.xlsx`表格 10 | - 该爬虫使用到的模块: 11 | - requests 12 | - lxml 13 | - openpyxl 14 | -------------------------------------------------------------------------------- /maoyan_data_spider/data/猫眼实时数据.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/maoyan_data_spider/data/猫眼实时数据.xlsx -------------------------------------------------------------------------------- /maoyan_data_spider/get_url_data.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/5/11 18:33 3 | # @Author : Torres-圣君 4 | # @File : get_url_data.py 5 | # @Sofaware : PyCharm 6 | import requests 7 | from lxml import etree 8 | from save_data import * 9 | 10 | 11 | class ExtractData: 12 | def __init__(self, url): 13 | self.url = url 14 | # 需要携带的请求头 15 | self.headers = { 16 | "Referer": "https://piaofang.maoyan.com", 17 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.54 Safari/537.36 Edg/101.0.1210.39" 18 | } 19 | self.html = etree.HTML(requests.get(self.url, headers=self.headers).text) 20 | 21 | def who_owns(self): 22 | # 判断进来的链接,根据不同的链接选用不同的解析方式 23 | if "box-office" in self.url: 24 | self.box_office() 25 | elif "session" in self.url: 26 | self.session() 27 | elif "web-heat" in self.url: 28 | self.web_heat() 29 | elif "getTVList" in self.url: 30 | self.getTVList() 31 | 32 | def box_office(self): 33 | data_list = [] 34 | header_list = ["影片", "票房(万)", "票房占比", "排片占比", "排座占比"] 35 | data_list.append(header_list) 36 | for i in range(1, 11): 37 | body_list = self.html.xpath(f'//*[@class="table-body"]/tr[{i}]') 38 | for body in body_list: 39 | item = [ 40 | body.xpath('./td[1]/div/p[1]/text()')[0], 41 | body.xpath('./td[2]/div/text()')[0], 42 | body.xpath('./td[3]/div/text()')[0], 43 | body.xpath('./td[4]/div/text()')[0], 44 | body.xpath('./td[5]/div/text()')[0] 45 | ] 46 | data_list.append(item) 47 | save_data(data_list, "实时票房") 48 | 49 | def session(self): 50 | data_list = [] 51 | header_list = ["片名", "场次占比", "场次"] 52 | data_list.append(header_list) 53 | for i in range(1, 12): 54 | body_list = self.html.xpath(f'//table//tr[{i}]') 55 | for body in body_list: 56 | item = [ 57 | body.xpath('./td[1]/div/div/span/text()')[0], 58 | body.xpath('./td[2]/div/text()')[0], 59 | body.xpath('./td[3]/div/text()')[0] 60 | ] 61 | data_list.append(item) 62 | save_data(data_list, "电影排片") 63 | 64 | def web_heat(self): 65 | data_list = [] 66 | header_list = ["节目", "平台", "上线时长", "实时热度"] 67 | data_list.append(header_list) 68 | for i in range(1, 11): 69 | body_list = self.html.xpath(f'//*[@class="table-body"]/tr[{i}]') 70 | for body in body_list: 71 | item = [ 72 | body.xpath('./td[1]/div/div[2]/p[1]/text()')[0], 73 | body.xpath('./td[1]/div/div[2]/p[2]/text()')[0], 74 | body.xpath('./td[1]/div/div[2]/p[2]/span/text()')[0], 75 | body.xpath('./td[2]/div/div[1]/div[1]/text()')[0] 76 | ] 77 | data_list.append(item) 78 | save_data(data_list, f"影视热度榜") 79 | 80 | def getTVList(self): 81 | data_list = [] 82 | title = "央视频道" if "0" in self.url else "卫视频道" 83 | header_list = ["节目", "频道", "实时关注度", "市占率"] 84 | data_list.append(header_list) 85 | # 获取返回的JSON数据 86 | json_data = 
requests.get(self.url, headers=self.headers).json() 87 | body_list = json_data["tvList"]["data"]["list"] 88 | for i in range(0, len(body_list)): 89 | item = [ 90 | body_list[i]["programmeName"], 91 | body_list[i]["channelName"], 92 | body_list[i]["attentionRateDesc"], 93 | body_list[i]["marketRateDesc"] 94 | ] 95 | data_list.append(item) 96 | save_data(data_list, title) 97 | 98 | -------------------------------------------------------------------------------- /maoyan_data_spider/run_spider.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/5/11 10:52 3 | # @Author : Torres-圣君 4 | # @File : run_spider.py 5 | # @Sofaware : PyCharm 6 | import time 7 | from get_url_data import * 8 | 9 | 10 | class MaoyanData: 11 | def __init__(self, son_url_list: list): 12 | self.son_url_list = son_url_list 13 | 14 | def get_data(self): 15 | for i in self.son_url_list: 16 | for j in i: 17 | url = "https://piaofang.maoyan.com/" + j 18 | print(f"正在获取<{url}>") 19 | ExtractData(url).who_owns() 20 | time.sleep(1) 21 | 22 | 23 | if __name__ == '__main__': 24 | maoyan = MaoyanData( 25 | [ 26 | ["box-office?ver=normal"], 27 | ["session"], 28 | ["web-heat"], 29 | ["getTVList?showDate=2&type=" + str(i) for i in range(2)] 30 | ] 31 | ) 32 | maoyan.get_data() 33 | -------------------------------------------------------------------------------- /maoyan_data_spider/save_data.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/5/11 19:36 3 | # @Author : Torres-圣君 4 | # @File : save_data.py 5 | # @Sofaware : PyCharm 6 | from openpyxl import load_workbook 7 | 8 | 9 | def save_data(data_list, title): 10 | # 创建Excel表对象 11 | wb = load_workbook("./data/猫眼实时数据.xlsx") 12 | # 创建新的sheet 13 | sheet = wb.create_sheet(title, -1) 14 | for i in range(0, len(data_list)): 15 | for j in range(0, len(data_list[i])): 16 | # 写入数据到单元格 17 | sheet.cell(row=i+1, column=j+1).value = data_list[i][j] 18 | # 保存并关闭文件 19 | wb.save("./data/猫眼实时数据.xlsx") 20 | print(f"{title}_写入成功...") 21 | wb.close() 22 | -------------------------------------------------------------------------------- /meituan_foods_spider/README.md: -------------------------------------------------------------------------------- 1 | ##
✨获取美团美食的店铺信息✨
2 | - 美团北京美食所有店铺:[https://bj.meituan.com/meishi/](https://bj.meituan.com/meishi/) 3 | 4 | ```python 5 | 修改 'self.start_url' :修改为想要抓取的城市url 6 | 修改 'self.headers' 下的 'Cookie'和'User-Agent':修改为自己账号登录后的值 7 | 修改 'self.mongo_address' :修改为自己的MongoDB数据库地址 8 | 更换 'ip_pool_run.py':修改其文件下的IP代理地址 9 | ``` 10 | 11 | - 爬取美团北京美食店铺的信息,包含: 12 | - 店铺链接 13 | - 店铺名称 14 | - 店铺地址 15 | - 店铺评论数 16 | - 店铺评分 17 | - 爬取的数据存储方式: 18 | - 通过连接MongoDB数据库,将其存入数据库 19 | - 该爬虫使用到的模块: 20 | - requests 21 | - re 22 | - time 23 | - json 24 | - pymongo -------------------------------------------------------------------------------- /meituan_foods_spider/ip_pool.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/4/22 16:18 3 | # @Author : Torres-圣君 4 | # @File : ip_pool_run.py 5 | # @Sofaware : PyCharm 6 | import random 7 | 8 | 9 | def get_ip(): 10 | proxys = [ 11 | { 12 | "http": "http://211.103.138.117:8000" 13 | }, { 14 | "http": "http://183.247.215.218:30001" 15 | }, { 16 | "http": "http://221.7.197.248:8000" 17 | }, { 18 | "http": "http://39.175.85.225:30001" 19 | }, { 20 | "http": "http://39.175.85.225:30001" 21 | }, { 22 | "http": "http://123.57.246.163:8118" 23 | }, { 24 | "http": "http://120.76.244.188:8080" 25 | }, { 26 | "http": "http://58.20.232.245:9091" 27 | }, { 28 | "http": "http://203.222.25.57:80" 29 | }, { 30 | "http": "http://223.96.90.216:8085" 31 | }, { 32 | "http": "http://221.7.197.248:8000" 33 | }, { 34 | "http": "http://218.64.84.117:8060" 35 | }, { 36 | "http": "http://120.220.220.95:8085" 37 | }, 38 | ] 39 | return random.choice(proxys) 40 | -------------------------------------------------------------------------------- /meituan_foods_spider/run_spider.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/4/22 14:59 3 | # @Author : Torres-圣君 4 | # @File : run_spider.py 5 | # @Software : PyCharm 6 | import json 7 | import re 8 | import time 9 | from pymongo import MongoClient 10 | import requests 11 | from .ip_pool import get_ip 12 | 13 | 14 | class MeituanSpider: 15 | def __init__(self): 16 | # 目标网址,修改为想要抓取的城市url 17 | self.start_url = 'https://bj.meituan.com/meishi/' 18 | # 首先需要登录自己的账号上 获取登录后的Cookie信息和User-Agent来构造响应头 19 | self.headers = { 20 | # 修改成自己的cookie 21 | "Cookie": "uuid=fc13f93e2548beaced.1650610445.1.0.0; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=180500c9c6694-0455eeafa4c762-6b3e555b-144000-180500c9c67c8; ci=1; rvct=1; __mta=147677064.1650610453704.1650610453704.1650610453704.1; client-id=19be5210-8e89-4598-a6e2-0decd5081934; mtcdn=K; userTicket=FZodTvVmqRNBtcSIIGENmCRtUCXxFFlvvDUNbDQC; _yoda_verify_resp=lw%2FRe7KjJCSrXlzZjUUHoMC6cv33iCr7LluQL36vp7W%2FSWLD%2FcLgW2NnaEO1MT8u%2Fy0OGm3szpTRomNQj%2BLkD7AlVDDto75c16MkwWz2LQd39H2TWG5%2Fl6%2Bm5UU7W6F23%2BKoK3jYjHETueVKU67hIe%2Boztzp5vFoGPn3Ygs27T9M9Zf6Pd4zsLPyeFy9452ATZNT%2FFQkbqNOM1BLiHC4CdOT4QhO0DAhJU%2BIGJvnXZrRtPnlhlUulQoUsSJBtGPYwAQFJHOyRRM8CD0GXrMddMsXQiS%2FB8kx6aQFCxZPfFy04QHF26N2ztzmTL30e9Uy4Pqk3hS9w2oMRBsdH0wtTV8Mw1p9eqMAIpjTbuIcedfEt6fr2iQusiMwjUCCWTtt; _yoda_verify_rid=150ee0f22540000c; u=2988513400; n=Torres%E5%9C%A3%E5%90%9B; lt=Oj3P9g2z0stfWgMheCCf9Mw0CLUAAAAAfhEAALOaHf0lOvfBhE0OvVWmFtRqPsSY-1C5Fe7PsvPzZYt-ZYb_cDgiVVNJOFOhMF1fZQ; mt_c_token=Oj3P9g2z0stfWgMheCCf9Mw0CLUAAAAAfhEAALOaHf0lOvfBhE0OvVWmFtRqPsSY-1C5Fe7PsvPzZYt-ZYb_cDgiVVNJOFOhMF1fZQ; token=Oj3P9g2z0stfWgMheCCf9Mw0CLUAAAAAfhEAALOaHf0lOvfBhE0OvVWmFtRqPsSY-1C5Fe7PsvPzZYt-ZYb_cDgiVVNJOFOhMF1fZQ; 
token2=Oj3P9g2z0stfWgMheCCf9Mw0CLUAAAAAfhEAALOaHf0lOvfBhE0OvVWmFtRqPsSY-1C5Fe7PsvPzZYt-ZYb_cDgiVVNJOFOhMF1fZQ; unc=Torres%E5%9C%A3%E5%90%9B; _lxsdk=180500c9c6694-0455eeafa4c762-6b3e555b-144000-180500c9c67c8; _hc.v=89028ea2-f5ad-36f8-2732-5d938ae5b422.1650611594; lat=39.983375; lng=116.410765; firstTime=1650612012131; _lxsdk_s=180500c9c67-e43-f4b-d35%7C%7C77", 22 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36", 23 | } 24 | # 初始化MongoDB数据库并创建数据库连接 25 | self.mongo_address = '127.0.0.1' 26 | self.client = MongoClient(self.mongo_address, 27017) 27 | self.db = self.client['meituan'] 28 | self.col = self.db['bj_foods'] 29 | 30 | # 获取需要爬取的url列表 31 | def get_url_list(self, url, total_nums): 32 | url_temp = url + 'pn{}/' 33 | # 每一页显示显示15个美食 通过获取到每个分类下的总美食数来求出总页数 34 | pages = total_nums // 15 + 1 if total_nums % 15 != 0 else total_nums // 15 35 | url_list = [url_temp.format(i) for i in range(1, pages + 1)] 36 | return url_list 37 | 38 | # 对url进行请求并返回处理后的响应信息 39 | def parse_url(self, url): 40 | # self.headers['Cookie'] = random.choice(self.cookies) 41 | time.sleep(1) 42 | rest = requests.get(url, headers=self.headers, proxies=get_ip()) 43 | html_str = re.findall(r'window._appState = (.*?);', rest.content.decode())[0] 44 | return html_str 45 | 46 | # 访问店家详细页面,获取地址和电话 47 | def get_son_msg(self, url): 48 | time.sleep(1) 49 | res = requests.get(url, headers=self.headers, proxies=get_ip()) 50 | # 地址 51 | address = re.findall(r'"address":"(.*?)",', res.text)[0] 52 | # 电话 53 | phone_number = re.findall(r'"phone":"(.*?)",', res.text)[0] 54 | return address, phone_number 55 | 56 | # 创建item并进行存储 57 | def get_content_list(self, html_str): 58 | json_html = json.loads(html_str) 59 | foods = json_html['poiLists']['poiInfos'] 60 | for i in foods: 61 | item = {} 62 | # 获取子链接 63 | food_id = i['poiId'] 64 | item['链接'] = "https://www.meituan.com/meishi/{}/".format(food_id) 65 | item['店名'] = i['title'] 66 | item['地址'], item["电话"] = self.get_son_msg(item['链接']) 67 | item['评论数'] = i['allCommentNum'] 68 | item['评分'] = i['avgScore'] 69 | # item['价格'] = i['avgPrice'] 70 | self.save(item) 71 | 72 | # 保存数据到mongodb数据库中 73 | def save(self, item): 74 | # 转换为字典 75 | data = dict(item) 76 | # 展示数据 77 | print(data) 78 | # 写入数据 79 | self.col.insert_one(data) 80 | 81 | # 主方法 82 | def run(self): 83 | # 首先请求入口url来获取每一个美食分类的url地址 84 | html_str = requests.get(self.start_url, headers=self.headers, proxies=get_ip()) 85 | str_html = re.findall(r'window._appState = (.*?);', html_str.content.decode())[0] 86 | json_html = json.loads(str_html) 87 | # 获取分类链接列表 88 | cate_list = json_html['filters']['cates'][1:] 89 | print(cate_list) 90 | item_list = [] 91 | 92 | # 对每一个分类进行分组分别获取美食的分类名和美食的分类的url 93 | for i in cate_list: 94 | item = {} 95 | # 分类的url进行反爬处理,将http替换成https 96 | # cate_url= i.xpath('./a/@href')[0] 97 | cate_url = i['url'] 98 | item['cate_url'] = cate_url.replace('http', 'https') 99 | # item['cate_name'] = i.xpath('./a/text()')[0] 100 | item['name'] = i['name'] 101 | item_list.append(item) 102 | 103 | # 对每一个美食分类的分类名和分类url地址进行遍历并分别进行处理 104 | for i in item_list: 105 | time.sleep(3) 106 | # https请求 107 | rest = requests.get(i['cate_url'], headers=self.headers, proxies=get_ip()) 108 | # http替换成https后的全部分类链接 109 | str_html = re.findall(r'window._appState = (.*?);', rest.content.decode())[0] 110 | json_html = json.loads(str_html) 111 | total_nums = json_html['poiLists']['totalCounts'] 112 | # 获取每一页的链接 113 | url_list = self.get_url_list(i['cate_url'], 
total_nums) 114 | for url in url_list: 115 | time.sleep(2) 116 | list_html = self.parse_url(url) 117 | self.get_content_list(list_html) 118 | 119 | 120 | if __name__ == '__main__': 121 | meituan = MeituanSpider() 122 | meituan.run() 123 | 124 | -------------------------------------------------------------------------------- /simple_ip_proxy_pool/README.md: -------------------------------------------------------------------------------- 1 | ##
✨搭建一个简易的免费IP代理池✨
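针对下文 README 中提到的「尚未实现ip代理池去重功能」,这里给出一个示意性的去重草稿(`dedupe_ip_pool` 为假设的名字,并非仓库中的实现)。由 `test_save.py` 可知,`ip_pool.json` 是以 `json.dumps(...) + ","` 追加写入的,文件内容是一串以逗号分隔的对象而不是标准 JSON 文档,去重前需要先包上中括号并去掉末尾逗号:

```python
# 示意代码:对 ip_pool.json 中追加写入的代理做去重
import json


def dedupe_ip_pool(path="ip_pool.json"):
    # 文件内容形如 {...},{...},  → 包上中括号、去掉末尾逗号后再解析
    with open(path, encoding="utf-8") as f:
        raw = f.read().strip().rstrip(",")
    proxies = json.loads("[" + raw + "]")
    seen, unique = set(), []
    for proxy in proxies:
        key = tuple(sorted(proxy.items()))  # 例如 (("http", "http://1.2.3.4:80"),)
        if key not in seen:
            seen.add(key)
            unique.append(proxy)
    # 按 test_save.py 原有的「对象,对象,」格式写回
    with open(path, "w", encoding="utf-8") as f:
        f.write(",".join(json.dumps(p, indent=1) for p in unique) + ",")
    return unique
```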
2 | - 爬取的免费IP代理的网站包含: 3 | - [http://www.66ip.cn/index.html](http://www.66ip.cn/index.html) 4 | - [https://www.89ip.cn/index_1.html](https://www.89ip.cn/index_1.html) 5 | - [https://ip.ihuan.me/address/5Lit5Zu9.html](https://ip.ihuan.me/address/5Lit5Zu9.html) 6 | - [https://proxy.ip3366.net/free/?action=china&page=1](https://proxy.ip3366.net/free/?action=china&page=1) 7 | - [https://ip.jiangxianli.com/blog.html?page=1](https://ip.jiangxianli.com/blog.html?page=1) 8 | - [https://www.kuaidaili.com/free/inha/1/](https://www.kuaidaili.com/free/inha/1/) 9 | 10 | ```python 11 | # 运行主方法:ip_pool_run.py 即可启动爬虫 12 | 13 | # 该爬虫使用到了 多线程和协程 (没有做到极致,可自行后续优化),同时对这些网站进行ip代理抓取 14 | # 将所有网站抓取到的ip添加到test_ip方法进行测试,如果代理可用则将其保存至ip_pool.json 15 | 16 | # 尚未实现ip代理池去重功能 17 | ``` 18 | 19 | - 该程序使用到的模块包含: 20 | - lxml 21 | - request 22 | - json 23 | - random 24 | - threading 25 | - asyncio 26 | - aiohttp 27 | -------------------------------------------------------------------------------- /simple_ip_proxy_pool/all_ip_agent/get_66ip.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/4/23 10:11 3 | # @Author : Torres-圣君 4 | # @File : get_66ip.py 5 | # @Sofaware : PyCharm 6 | # http://www.66ip.cn/index.html 7 | import aiohttp 8 | import asyncio 9 | from user_agent import get_ua 10 | from test_save import test_ip 11 | from lxml import etree 12 | 13 | 14 | def get_data(num): 15 | loop_ = asyncio.new_event_loop() 16 | asyncio.set_event_loop(loop_) 17 | loop = asyncio.get_event_loop() 18 | urls = [f"http://www.66ip.cn/{str(i)}.html" for i in range(1, num+1)] 19 | tasks = [loop.create_task(parse(url)) for url in urls] 20 | loop.run_until_complete(asyncio.wait(tasks)) 21 | 22 | 23 | async def parse(url): 24 | try: 25 | headers = { 26 | "User-Agent": get_ua() 27 | } 28 | timeout = aiohttp.ClientTimeout(total=1000) 29 | async with aiohttp.ClientSession(timeout=timeout) as session: 30 | async with session.get(url, headers=headers) as res: 31 | page = etree.HTML(await res.text()) 32 | ip_list = page.xpath('//*[@id="main"]/div[1]/div[2]/div[1]/table//tr') 33 | del ip_list[0] 34 | # print(len(ip_list)) 35 | for i in range(1, len(ip_list)): 36 | # 提取ip地址 37 | ip_address = ip_list[i].xpath(f'./td[1]/text()')[0] 38 | # 提取ip端口 39 | ip_port = ip_list[i].xpath(f'./td[2]/text()')[0] 40 | # 去除无用字符,并拼接为ip可用格式 41 | ip_msg = "http://" + ip_address.strip(" \t\n") + ":" + ip_port.strip(" \t\n") 42 | poxyz = { 43 | "http": ip_msg 44 | } 45 | test_ip(poxyz) 46 | except IndexError: 47 | pass 48 | -------------------------------------------------------------------------------- /simple_ip_proxy_pool/all_ip_agent/get_89ip.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/4/23 10:14 3 | # @Author : Torres-圣君 4 | # @File : get_89ip.py 5 | # @Sofaware : PyCharm 6 | # https://www.89ip.cn/index_1.html 7 | import aiohttp 8 | import asyncio 9 | from user_agent import get_ua 10 | from test_save import test_ip 11 | from lxml import etree 12 | 13 | 14 | def get_data(num): 15 | loop_ = asyncio.new_event_loop() 16 | asyncio.set_event_loop(loop_) 17 | loop = asyncio.get_event_loop() 18 | urls = [f"https://www.89ip.cn/index_{str(i)}.html" for i in range(1, num+1)] 19 | tasks = [loop.create_task(parse(url)) for url in urls] 20 | loop.run_until_complete(asyncio.wait(tasks)) 21 | 22 | 23 | async def parse(url): 24 | try: 25 | headers = { 26 | "User-Agent": get_ua() 27 | } 28 | timeout = aiohttp.ClientTimeout(total=1000) 
29 | async with aiohttp.ClientSession(timeout=timeout) as session: 30 | async with session.get(url, headers=headers) as res: 31 | page = etree.HTML(await res.text()) 32 | ip_list = page.xpath('//table//tr') 33 | del ip_list[0] 34 | # print(len(ip_list)) 35 | for i in range(1, len(ip_list)): 36 | # 提取ip地址 37 | ip_address = ip_list[i].xpath(f'./td[1]/text()')[0] 38 | # 提取ip端口 39 | ip_port = ip_list[i].xpath(f'./td[2]/text()')[0] 40 | # 去除无用字符,并拼接为ip可用格式 41 | ip_msg = "http://" + ip_address.strip(" \t\n") + ":" + ip_port.strip(" \t\n") 42 | poxyz = { 43 | "http": ip_msg 44 | } 45 | test_ip(poxyz) 46 | except IndexError: 47 | pass 48 | -------------------------------------------------------------------------------- /simple_ip_proxy_pool/all_ip_agent/get_ihuan.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/4/23 10:14 3 | # @Author : Torres-圣君 4 | # @File : get_ihuan.py 5 | # @Sofaware : PyCharm 6 | # https://ip.ihuan.me/address/5Lit5Zu9.html 7 | from user_agent import get_ua 8 | from test_save import test_ip 9 | import time 10 | import requests 11 | from lxml import etree 12 | 13 | main_url = "https://ip.ihuan.me/address/5Lit5Zu9.html/" 14 | next_url = "" 15 | headers = { 16 | "User-Agent": get_ua() 17 | } 18 | 19 | 20 | def get_data(num): 21 | global next_url 22 | next_url = parse(main_url) 23 | for i in range(1, num+1): 24 | time.sleep(1) 25 | parse(next_url) 26 | 27 | 28 | def parse(url): 29 | try: 30 | global next_url 31 | res = requests.get(url, headers=headers) 32 | page = etree.HTML(res.text) 33 | ip_list = page.xpath('//table//tr') 34 | # print(len(ip_list)) 35 | for i in range(1, len(ip_list)): 36 | # 提取ip地址 37 | ip_address = ip_list[i].xpath(f'./td[1]/a/text()')[0] 38 | # 提取ip端口 39 | ip_port = ip_list[i].xpath(f'./td[2]/text()')[0] 40 | # 提取ip类型 41 | ip_type = ip_list[i].xpath(f'./td[5]/text()')[0] 42 | if ip_type == "支持": 43 | # 去除无用字符,并拼接为ip可用格式 44 | ip_msg = "https://" + ip_address.strip(" \t\n") + ":" + ip_port.strip(" \t\n") 45 | poxyz = { 46 | "https": ip_msg 47 | } 48 | else: 49 | # 去除无用字符,并拼接为ip可用格式 50 | ip_msg = "http://" + ip_address.strip(" \t\n") + ":" + ip_port.strip(" \t\n") 51 | poxyz = { 52 | "http": ip_msg 53 | } 54 | test_ip(poxyz) 55 | next_url = main_url + page.xpath('//ul[@class="pagination"]/li[3]/a/@href')[0] 56 | return next_url 57 | except IndexError: 58 | pass 59 | -------------------------------------------------------------------------------- /simple_ip_proxy_pool/all_ip_agent/get_ip3366.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/4/23 10:12 3 | # @Author : Torres-圣君 4 | # @File : get_ip3366.py 5 | # @Sofaware : PyCharm 6 | # https://proxy.ip3366.net/free/?action=china&page=1 7 | import asyncio 8 | import aiohttp 9 | from user_agent import get_ua 10 | from test_save import test_ip 11 | from lxml import etree 12 | 13 | 14 | def get_data(num): 15 | loop_ = asyncio.new_event_loop() 16 | asyncio.set_event_loop(loop_) 17 | loop = asyncio.get_event_loop() 18 | urls = [f"https://proxy.ip3366.net/free/?action=china&page={str(i)}" for i in range(1, num+1)] 19 | tasks = [loop.create_task(parse(url)) for url in urls] 20 | loop.run_until_complete(asyncio.wait(tasks)) 21 | 22 | 23 | async def parse(url): 24 | try: 25 | headers = { 26 | "User-Agent": get_ua() 27 | } 28 | timeout = aiohttp.ClientTimeout(total=1000) 29 | async with aiohttp.ClientSession(timeout=timeout) as session: 30 | async with session.get(url, 
headers=headers) as res: 31 | page = etree.HTML(await res.text()) 32 | ip_list = page.xpath('//*[@id="content"]/section/div[2]/table//tr') 33 | del ip_list[0] 34 | # print(len(ip_list)) 35 | for i in range(1, len(ip_list)): 36 | # 提取ip地址 37 | ip_address = ip_list[i].xpath(f'./td[1]/text()')[0] 38 | # 提取ip端口 39 | ip_port = ip_list[i].xpath(f'./td[2]/text()')[0] 40 | # 提取ip类型 41 | ip_type = ip_list[i].xpath(f'./td[4]/text()')[0] 42 | if ip_type == "HTTPS": 43 | # 去除无用字符,并拼接为ip可用格式 44 | ip_msg = "https://" + ip_address.strip(" \t\n") + ":" + ip_port.strip(" \t\n") 45 | poxyz = { 46 | "https": ip_msg 47 | } 48 | else: 49 | # 去除无用字符,并拼接为ip可用格式 50 | ip_msg = "http://" + ip_address.strip(" \t\n") + ":" + ip_port.strip(" \t\n") 51 | poxyz = { 52 | "http": ip_msg 53 | } 54 | test_ip(poxyz) 55 | except IndexError: 56 | pass 57 | -------------------------------------------------------------------------------- /simple_ip_proxy_pool/all_ip_agent/get_jiangxianli.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/4/23 16:33 3 | # @Author : Torres-圣君 4 | # @File : get_jiangxianli.py 5 | # @Sofaware : PyCharm 6 | # https://ip.jiangxianli.com/blog.html?page=1 7 | import asyncio 8 | import aiohttp 9 | from user_agent import get_ua 10 | from test_save import test_ip 11 | from lxml import etree 12 | 13 | 14 | headers = { 15 | "User-Agent": get_ua() 16 | } 17 | 18 | 19 | def get_data(num): 20 | loop_ = asyncio.new_event_loop() 21 | asyncio.set_event_loop(loop_) 22 | loop = asyncio.get_event_loop() 23 | urls = [f"https://ip.jiangxianli.com/blog.html?page={str(int(i/5)+1)}" for i in range(1, num+1)] 24 | tasks = [loop.create_task(parse(url)) for url in urls] 25 | loop.run_until_complete(asyncio.wait(tasks)) 26 | 27 | 28 | async def get_page(url): 29 | try: 30 | timeout = aiohttp.ClientTimeout(total=1000) 31 | async with aiohttp.ClientSession(timeout=timeout) as session: 32 | async with session.get(url, headers=headers, timeout=2) as res: 33 | page = etree.HTML(await res.text()) 34 | div_list = page.xpath('//div[@class="contar-wrap"]/div') 35 | for div in div_list: 36 | son_url = div.xpath('./div/h3/a/@href')[0] 37 | await parse(son_url) 38 | except IndexError: 39 | pass 40 | 41 | 42 | async def parse(son_url): 43 | try: 44 | timeout = aiohttp.ClientTimeout(total=1000) 45 | async with aiohttp.ClientSession(timeout=timeout) as session: 46 | async with session.get(son_url, headers=headers) as res: 47 | page = etree.HTML(await res.text()) 48 | ip_list = page.xpath('//div[@class="item"]/div/p/text()') 49 | for i in range(0, len(ip_list)): 50 | # 去除无用字符,并拼接为ip可用格式 51 | ip_msg = "http://" + ip_list[i].split("@")[0].strip(" \t\n") 52 | poxyz = { 53 | "http": ip_msg 54 | } 55 | test_ip(poxyz) 56 | except IndexError: 57 | pass 58 | -------------------------------------------------------------------------------- /simple_ip_proxy_pool/all_ip_agent/get_kuaidaili.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/4/23 16:45 3 | # @Author : Torres-圣君 4 | # @File : get_kuaidaili.py 5 | # @Sofaware : PyCharm 6 | # https://www.kuaidaili.com/free/inha/1/ 7 | import asyncio 8 | import aiohttp 9 | from user_agent import get_ua 10 | from test_save import test_ip 11 | from lxml import etree 12 | 13 | 14 | def get_data(num): 15 | loop_ = asyncio.new_event_loop() 16 | asyncio.set_event_loop(loop_) 17 | loop = asyncio.get_event_loop() 18 | urls = 
[f"https://www.kuaidaili.com/free/inha/{str(i)}/" for i in range(1, num+1)] 19 | tasks = [loop.create_task(parse(url)) for url in urls] 20 | loop.run_until_complete(asyncio.wait(tasks)) 21 | 22 | 23 | async def parse(url): 24 | try: 25 | headers = { 26 | "User-Agent": get_ua() 27 | } 28 | timeout = aiohttp.ClientTimeout(total=1000) 29 | async with aiohttp.ClientSession(timeout=timeout) as session: 30 | async with session.get(url, headers=headers) as res: 31 | page = etree.HTML(await res.text()) 32 | ip_list = page.xpath('//table//tr') 33 | del ip_list[0] 34 | # print(len(ip_list)) 35 | for i in range(1, len(ip_list)): 36 | # 提取ip地址 37 | ip_address = ip_list[i].xpath(f'./td[1]/text()')[0] 38 | # 提取ip端口 39 | ip_port = ip_list[i].xpath(f'./td[2]/text()')[0] 40 | # 去除无用字符,并拼接为ip可用格式 41 | ip_msg = "http://" + ip_address.strip(" \t\n") + ":" + ip_port.strip(" \t\n") 42 | poxyz = { 43 | "http": ip_msg 44 | } 45 | test_ip(poxyz) 46 | except IndexError: 47 | pass 48 | -------------------------------------------------------------------------------- /simple_ip_proxy_pool/all_ip_agent/test_save.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/4/23 12:09 3 | # @Author : Torres-圣君 4 | # @File : test_save.py 5 | # @Sofaware : PyCharm 6 | import requests 7 | import json 8 | from user_agent import get_ua 9 | 10 | 11 | # 测试ip代理是否可用 12 | def test_ip(poxyz): 13 | url = "http://www.baidu.com" 14 | headers = { 15 | "User-Agent": get_ua() 16 | } 17 | try: 18 | res = requests.get(url=url, headers=headers, proxies=poxyz, timeout=1) 19 | if res.status_code == 200: 20 | save_ip(poxyz) 21 | except Exception: 22 | pass 23 | 24 | 25 | # 将可用的代理进行保存 26 | def save_ip(poxyz): 27 | data = json.dumps(poxyz, indent=1) 28 | with open("./ip_pool.json", "a") as w: 29 | w.write(data+",") 30 | print(f"<{poxyz}>已写入IP代理池...") 31 | 32 | -------------------------------------------------------------------------------- /simple_ip_proxy_pool/all_ip_agent/user_agent.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/4/23 11:59 3 | # @Author : Torres-圣君 4 | # @File : user_agent.py 5 | # @Sofaware : PyCharm 6 | import random 7 | 8 | 9 | def get_ua(): 10 | user_agent_list = [ 11 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 12 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 13 | "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0", 14 | "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko", 15 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)", 16 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)", 17 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)", 18 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)", 19 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 20 | "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 21 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11", 22 | "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11", 23 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", 24 | "Mozilla/4.0 
(compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)", 25 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)", 26 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)", 27 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)", 28 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)", 29 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)", 30 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)", 31 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)", 32 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", 33 | "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", 34 | "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", 35 | "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 36 | "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 37 | "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10", 38 | "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13", 39 | "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+", 40 | "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0", 41 | "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124", 42 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)", 43 | "UCWEB7.0.2.37/28/999", 44 | "NOKIA5700/ UCWEB7.0.2.37/28/999", 45 | "Openwave/ UCWEB7.0.2.37/28/999", 46 | "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999", 47 | "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25", 48 | ] 49 | # 设置UA伪装 50 | return random.choice(user_agent_list) 51 | -------------------------------------------------------------------------------- /simple_ip_proxy_pool/ip_pool.json: -------------------------------------------------------------------------------- 1 | { 2 | "https": "https://58.220.95.42:10174" 3 | },{ 4 | "https": "https://118.163.13.200:8080" 5 | },{ 6 | "http": "http://223.96.90.216:8085" 7 | },{ 8 | "http": "http://165.225.202.95:10605" 9 | },{ 10 | "https": "https://139.198.157.59:7890" 11 | },{ 12 | "http": "http://120.220.220.95:8085" 13 | },{ 14 | "http": "http://182.61.201.201:80" 15 | },{ 16 | "http": "http://165.225.206.106:10605" 17 | },{ 18 | "https": "https://117.26.40.251:3712" 19 | },{ 20 | "http": "http://39.130.150.43:80" 21 | },{ 22 | "https": "https://103.38.80.138:3128" 23 | },{ 24 | "http": "http://39.130.150.42:80" 25 | },{ 26 | "http": "http://113.96.62.246:8081" 27 | },{ 28 | "http": "http://39.130.150.44:80" 29 | },{ 30 | "http": "http://112.6.117.135:8085" 31 | },{ 32 | "http": 
"http://39.130.150.44:80" 33 | },{ 34 | "http": "http://165.225.76.175:10605" 35 | },{ 36 | "https": "https://223.112.99.150:80" 37 | },{ 38 | "http": "http://39.130.150.44:80" 39 | },{ 40 | "https": "https://40.83.102.86:80" 41 | },{ 42 | "https": "https://113.21.237.83:443" 43 | },{ 44 | "http": "http://112.6.117.178:8085" 45 | },{ 46 | "http": "http://218.59.139.238:80" 47 | },{ 48 | "https": "https://210.5.10.87:53281" 49 | },{ 50 | "http": "http://183.247.199.153:30001" 51 | },{ 52 | "http": "http://112.6.117.178:8085" 53 | },{ 54 | "http": "http://47.113.90.161:83" 55 | },{ 56 | "https": "https://222.69.240.130:8001" 57 | },{ 58 | "https": "https://14.20.235.19:45770" 59 | },{ 60 | "http": "http://165.225.204.12:10605" 61 | },{ 62 | "http": "http://103.148.72.192:80" 63 | },{ 64 | "http": "http://165.225.76.165:10605" 65 | },{ 66 | "http": "http://120.220.220.95:8085" 67 | },{ 68 | "http": "http://103.37.141.69:80" 69 | },{ 70 | "https": "https://103.133.177.141:443" 71 | },{ 72 | "http": "http://223.96.90.216:8085" 73 | },{ 74 | "http": "http://120.220.220.95:8085" 75 | },{ 76 | "http": "http://221.122.91.60:80" 77 | },{ 78 | "https": "https://47.93.48.155:8888" 79 | },{ 80 | "http": "http://103.148.72.192:80" 81 | },{ 82 | "http": "http://120.220.220.95:8085" 83 | },{ 84 | "https": "https://42.193.253.152:8089" 85 | }, -------------------------------------------------------------------------------- /simple_ip_proxy_pool/ip_pool_run.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/4/23 10:05 3 | # @Author : Torres-圣君 4 | # @File : ip_poop_run.py 5 | # @Sofaware : PyCharm 6 | import asyncio 7 | from all_ip_agent import get_66ip, get_89ip, get_ip3366, get_ihuan, get_kuaidaili, get_jiangxianli 8 | import threading 9 | import os 10 | 11 | 12 | def thread_run(num): 13 | threads = [ 14 | threading.Thread(target=get_66ip.get_data, args=(num,)), 15 | threading.Thread(target=get_89ip.get_data, args=(num,)), 16 | threading.Thread(target=get_ip3366.get_data, args=(num,)), 17 | threading.Thread(target=get_ihuan.get_data, args=(num,)), 18 | threading.Thread(target=get_kuaidaili.get_data, args=(num,)), 19 | threading.Thread(target=get_jiangxianli.get_data, args=(num,)), 20 | ] 21 | for thread in threads: 22 | thread.start() 23 | for thread in threads: 24 | thread.join() 25 | 26 | 27 | if __name__ == '__main__': 28 | try: 29 | os.remove("ip_pool.json") 30 | except: 31 | pass 32 | finally: 33 | # 爬取所有网站前10页可用的IP代理 34 | thread_run(5) 35 | print("爬取完毕!") 36 | -------------------------------------------------------------------------------- /taobao_commodity_spider/README.md: -------------------------------------------------------------------------------- 1 | ##
✨获取淘宝商品相关信息✨
2 | - 淘宝官网:[https://www.taobao.com/](https://www.taobao.com/) 3 | 4 | - 输入需要搜索的商品名,自动获取搜索结果的商品信息,包含: 5 | - 商品名称 6 | - 商品链接 7 | - 商品价格 8 | - 商品销量 9 | - 店铺名称 10 | 11 | - 新增功能: 12 | 13 | ```text 14 | 1. 采集前先获取商品总页数后,再进入循环采集每一页的数据 15 | 2. 滑块验证,使用鼠标动作链实现自动拖拉滑块,当反复尝试无果后,会提示需人工手动滑动,待人工滑动完成后程序将继续采集数据 16 | 3. 使用openpyxl将采集的数据保存至Excel表格中,通过采用一页保存一次的方法,防止因某页数据获取失败影响前者采取到的数据 17 | 4. 设置Excel表格的样式,比如:居中、行高、列宽等,更人性化的展现数据信息 18 | ``` 19 | 20 | - 该爬虫使用到的模块: 21 | - re 22 | - time 23 | - random 24 | - selenium 25 | - openpyxl -------------------------------------------------------------------------------- /taobao_commodity_spider/data/光遇_商品信息.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/taobao_commodity_spider/data/光遇_商品信息.xlsx -------------------------------------------------------------------------------- /taobao_commodity_spider/run_spider.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/5/24 14:57 3 | # @Author : Torres-圣君 4 | # @File : run_spider.py 5 | # @Software : PyCharm 6 | import random 7 | import time 8 | import re 9 | from selenium import webdriver 10 | from selenium.webdriver.common.keys import Keys 11 | from selenium.webdriver import ActionChains as ac 12 | from openpyxl import Workbook 13 | from openpyxl import load_workbook 14 | from openpyxl.styles import Alignment 15 | 16 | 17 | class SaveTaobaoData: 18 | def __init__(self, search_content): 19 | # 搜索内容 20 | self.search_content = search_content 21 | # 数据计数器 22 | self.count = 1 23 | # 表格内容居中 24 | self.align = Alignment(horizontal='center', vertical='center', wrap_text=True) 25 | self.options = webdriver.ChromeOptions() 26 | self.options.add_experimental_option('excludeSwitches', ['enable-automation']) 27 | self.options.add_experimental_option('useAutomationExtension', False) 28 | self.driver = webdriver.Chrome(options=self.options) 29 | self.driver.execute_cdp_cmd( 30 | 'Page.addScriptToEvaluateOnNewDocument', 31 | { 32 | 'source': 'Object.defineProperty(navigator, "webdriver", {get: () => undefined})' 33 | } 34 | ) 35 | 36 | def get_page(self): 37 | # 访问淘宝网址 38 | self.driver.get('https://www.taobao.com/') 39 | time.sleep(3) # 停一会防止出意外 40 | # 向搜索框中添加内容,并按下回车进行搜索 41 | self.driver.find_element_by_xpath("//input[@aria-label='请输入搜索文字']").send_keys(self.search_content, Keys.ENTER) 42 | # 扫码登陆 43 | self.driver.find_element_by_xpath('//*[@id="login"]/div[1]/i').click() 44 | # 给20秒时间登陆自己的账号,根据自己的速度来 45 | time.sleep(20) 46 | # 进入循环获取每页数据信息 47 | self.get_next_page() 48 | 49 | def get_page_data(self): 50 | # 判断是否出现验证码 51 | self.driver = self.validation() 52 | # 模拟真人操作,拖动滚动条 53 | for x in range(1, 11, 2): 54 | time.sleep(0.5) 55 | j = x / 10 56 | js = 'document.documentElement.scrollTop = document.documentElement.scrollHeight * %f' % j 57 | self.driver.execute_script(js) 58 | # 页面存放的所有商品 59 | div_list = self.driver.find_elements_by_xpath('//*[@id="mainsrp-itemlist"]/div/div/div[1]/div') 60 | print("当前页面的总商品数:", len(div_list)) 61 | # 首次数据添加表头 62 | if self.count == 1: 63 | data_list = [ 64 | ['商品标题', '商品价格', '商品销量', '店铺名称', '商品链接'] 65 | ] 66 | else: 67 | data_list = [] 68 | for div in div_list: 69 | try: 70 | item = [ 71 | # 商品标题 72 | div.find_element_by_xpath('./div[2]/div[2]/a').text.strip(" \t\n"), 73 | # 商品价格 74 | float(div.find_element_by_xpath('./div[2]/div[1]/div[1]/strong').text), 75 | # 商品销量 76 | 
div.find_element_by_xpath('./div[2]/div[1]/div[2]').text, 77 | # 店铺名称 78 | div.find_element_by_xpath('./div[2]/div[3]/div[1]/a/span[2]').text, 79 | # 商品链接 80 | div.find_element_by_xpath('./div/div/div[1]/a').get_attribute('href').strip(" \t\n") 81 | ] 82 | # 展示爬取到的数据 83 | print(item) 84 | # 追加进列表 85 | data_list.append(item) 86 | except: 87 | pass 88 | # 保存数据 89 | self.save_data(data_list) 90 | 91 | def get_next_page(self): 92 | # 判断是否出现验证码 93 | self.driver = self.validation() 94 | # 获取关键字商品的总页数 95 | get_page_number = self.driver.find_element_by_xpath('//*[@id="mainsrp-pager"]/div/div/div/div[1]').text 96 | page_number = int(re.findall(r'(\d+)', get_page_number)[0]) 97 | print(f"共获取到数据:{page_number}页") 98 | # 循环访问所有页面 99 | for i in range(0, page_number*44, 44): 100 | # 构造每页的链接 101 | self.driver.get(f"https://s.taobao.com/search?q={self.search_content}&s={i}") 102 | # 隐式等待 103 | self.driver.implicitly_wait(10) 104 | # 解析页面数据 105 | self.get_page_data() 106 | print(f"第{int(i/44+1)}页数据写入完成!") 107 | 108 | def validation(self): 109 | content = self.driver.page_source 110 | if "亲,请拖动下方滑块完成验证" in content: 111 | con = self.hua_kuai() 112 | count = 1 113 | while "亲,请拖动下方滑块完成验证" in con and count <= 3: 114 | con = self.hua_kuai() 115 | count += 1 116 | if count == 3: 117 | print("已尽力尝试自动滑动验证码,但抱歉没能通过,请手动滑一下吧~\n") 118 | input("手动滑动后,请等待页面“加载完成”,扣1并按回车键继续采集:") 119 | con = self.driver.page_source 120 | return self.driver 121 | 122 | def hua_kuai(self): 123 | ele = self.driver.find_element_by_xpath('//*[@id="nc_1_n1z"]') 124 | # 按住滑块元素不放 125 | ac(self.driver).click_and_hold(ele).perform() 126 | # 拖动滑块,xxx需要滑动的大小 127 | ac(self.driver).move_by_offset(300, random.randint(-5, 5)).perform() 128 | # 松开鼠标 129 | ac(self.driver).release().perform() 130 | # 加载页面 131 | time.sleep(2) 132 | try: 133 | # 点击重新滑动按钮 134 | self.driver.find_element_by_xpath('//*[@id="`nc_1_refresh1`"]').click() 135 | except: 136 | pass 137 | return self.driver.page_source 138 | 139 | def save_data(self, data_list): 140 | # 第一次写入需创建表格,后者追加内容 141 | if self.count == 1: 142 | # 创建新的excel表格 143 | wb = Workbook() 144 | sheet = wb.create_sheet("sheet1", -1) 145 | # 设置列宽 146 | sheet.column_dimensions['A'].width = 70 147 | sheet.column_dimensions['B'].width = 10 148 | sheet.column_dimensions['C'].width = 15 149 | sheet.column_dimensions['D'].width = 25 150 | sheet.column_dimensions['E'].width = 80 151 | else: 152 | wb = load_workbook(f"./data/{self.search_content}_商品信息.xlsx") 153 | sheet = wb["sheet1"] 154 | # 遍历表格索引,写入商品数据 155 | for x in range(len(data_list)): 156 | # 设置行高 157 | sheet.row_dimensions[x].height = 15 158 | for y in range(len(data_list[x])): 159 | sheet.cell(x + self.count, y + 1).value = data_list[x][y] 160 | # 居中显示 161 | sheet.cell(x + self.count, y + 1).alignment = self.align 162 | # 保存该Excel表格 163 | wb.save(f"./data/{self.search_content}_商品信息.xlsx") 164 | # 累加计数器,用于追加表格内容 165 | self.count += len(data_list) 166 | 167 | 168 | if __name__ == '__main__': 169 | text = input("请输入需要搜索的关键字:") 170 | run_spider = SaveTaobaoData(text) 171 | run_spider.get_page() 172 | -------------------------------------------------------------------------------- /umeitu_dongman_spider/README.md: -------------------------------------------------------------------------------- 1 | ##
✨异步爬取优美图库动漫图片✨
2 | - 优美图库官网:[https://www.umeitu.com/katongdongman/dongmantupian/](https://www.umeitu.com/katongdongman/dongmantupian/) 3 | 4 | - 输入指定页数后,异步下载页面上的所有图片 5 | - 下载的图片都保存在:`all_images/` 6 | - 该爬虫使用到的模块: 7 | - requests 8 | - aiohttp 9 | - asyncio 10 | - lxml -------------------------------------------------------------------------------- /umeitu_dongman_spider/all_images/AIR神尾观铃双马尾高清卡通图片.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/umeitu_dongman_spider/all_images/AIR神尾观铃双马尾高清卡通图片.jpg -------------------------------------------------------------------------------- /umeitu_dongman_spider/all_images/樱花庄的宠物女孩椎名真白高清卡通图片.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/umeitu_dongman_spider/all_images/樱花庄的宠物女孩椎名真白高清卡通图片.jpg -------------------------------------------------------------------------------- /umeitu_dongman_spider/all_images/软萌系列动漫头像高清卡通图片.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/umeitu_dongman_spider/all_images/软萌系列动漫头像高清卡通图片.jpg -------------------------------------------------------------------------------- /umeitu_dongman_spider/all_images/黄昏之大地的炼金术士高清卡通图片.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/umeitu_dongman_spider/all_images/黄昏之大地的炼金术士高清卡通图片.jpg -------------------------------------------------------------------------------- /umeitu_dongman_spider/run_spider.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2021/12/5 15:37 3 | # @Author : Torres-圣君 4 | # @File : get_page_data.py 5 | # @Sofaware : PyCharm 6 | import requests 7 | import aiohttp 8 | import asyncio 9 | from lxml import etree 10 | 11 | 12 | class uMeitu: 13 | def __init__(self): 14 | self.url = "https://www.umeitu.com/e/action/get_img_a.php" 15 | self.headers = { 16 | 'referer': 'https://www.umeitu.com/katongdongman/dongmantupian/', 17 | 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36 Edg/100.0.1185.44', 18 | } 19 | 20 | def get_img_data(self, i: int): 21 | data = { 22 | "next": i, 23 | "table": "news", 24 | "action": "getmorenews", 25 | "limit": 10, 26 | "small_length": 120, 27 | "classid": 48 28 | } 29 | res = requests.post(self.url, headers=self.headers, data=data) 30 | page_data = etree.HTML(res.text) 31 | imgs_list = page_data.xpath('//ul/li/a') 32 | # 存放图片名称的列表 33 | task_name = [] 34 | # 存放图片链接的列表 35 | task_link = [] 36 | for img in imgs_list: 37 | # 图片名称 38 | img_name = img.xpath('./span/text()')[0] 39 | # 图片链接 40 | img_link = img.xpath('./img/@src')[0].replace("small", "") 41 | task_name.append(img_name) 42 | task_link.append(img_link) 43 | self.async_spider(task_name, task_link) 44 | 45 | async def download_imgs(self, img_name, img_link): 46 | try: 47 | async with aiohttp.ClientSession() as session: 48 | async with session.get(img_link, headers=self.headers) as res: 49 | with open(f'all_images/{img_name}.jpg', "wb") as w: 50 | w.write(await res.content.read()) 51 | print(f"<{img_name}>下载完成") 52 | except 
Exception: 53 | pass 54 | 55 | def async_spider(self, task_name, task_link): 56 | # 获取事件循环 57 | loop = asyncio.get_event_loop() 58 | # 创建task列表 59 | tasks = [ 60 | loop.create_task(self.download_imgs(task_name[i], task_link[i])) for i in range(0, len(task_name)) 61 | ] 62 | # 执行爬虫事件列表 63 | loop.run_until_complete(asyncio.wait(tasks)) 64 | 65 | def run(self): 66 | num = int(input("请输入要下载的图片页数:")) 67 | for i in range(1, num+1): 68 | self.get_img_data(i) 69 | 70 | 71 | if __name__ == '__main__': 72 | u = uMeitu() 73 | u.run() 74 | -------------------------------------------------------------------------------- /ximalaya_audio_spider/README.md: -------------------------------------------------------------------------------- 1 | ##
✨保存喜马拉雅免费音频✨
2 | - 喜马拉雅官网:[https://www.ximalaya.com](https://www.ximalaya.com) 3 | 4 | - 输入作者ID后,下载该ID下所有免费的有声书 5 | 6 | - 该爬虫使用到的模块: 7 | - requests 8 | -------------------------------------------------------------------------------- /ximalaya_audio_spider/run_spider.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/6/29 10:20 3 | # @Author : Torres-圣君 4 | # @File : run_spider.py 5 | # @Software : PyCharm 6 | import requests 7 | 8 | headers = { 9 | "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36 Edg/103.0.1264.37" 10 | } 11 | 12 | 13 | def run(author_id): 14 | count = 0 15 | while True: 16 | count += 1 17 | url = f"https://www.ximalaya.com/revision/album/v1/getTracksList?albumId={author_id}&pageNum={count}&sort=0" 18 | res = requests.get(url, headers=headers) 19 | audio_link_lisk = res.json()['data']['tracks'] 20 | if len(audio_link_lisk) == 0: 21 | print("所有音频爬取完毕!") 22 | break 23 | else: 24 | for audio_link in audio_link_lisk: 25 | audio_title = audio_link['title'] 26 | audio_id = audio_link['trackId'] 27 | audio_url = f"https://www.ximalaya.com/revision/play/v1/audio?id={audio_id}&ptype=1" 28 | print("正在保存:", audio_title) 29 | save_audio(audio_title, audio_url) 30 | 31 | 32 | def save_audio(audio_title, audio_url): 33 | audio_res = requests.get(audio_url, headers=headers).json()['data']['src'] 34 | audio_data = requests.get(audio_res, headers=headers).content 35 | with open(f'{audio_title}.mp3', 'wb') as w: 36 | w.write(audio_data) 37 | print(audio_title, "保存完成!") 38 | 39 | 40 | if __name__ == '__main__': 41 | # 作者ID 42 | author_id = 10092072 43 | run(author_id) 44 | -------------------------------------------------------------------------------- /yibu_book_spider/README.md: -------------------------------------------------------------------------------- 1 | ##
✨获取异步社区所有图书信息✨
2 | - 异步社区图书官网:[https://www.epubit.com/books](https://www.epubit.com/books) 3 | 4 | - 爬取异步社区所有图书信息,包含: 5 | - 书名 6 | - 书的作者 7 | - 书的价格 8 | - 书的标签 9 | - 书的链接 10 | - 爬取的数据存储方式: 11 | - 通过连接MongoDB数据库,将其存入数据库 12 | - 该爬虫使用到的模块: 13 | - requests 14 | - pymongo -------------------------------------------------------------------------------- /yibu_book_spider/get_proxyz.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/4/1 17:00 3 | # @Author : Torres-圣君 4 | # @File : get_proxyz.py 5 | # @Sofaware : PyCharm 6 | import random 7 | 8 | 9 | def get_proxies(): 10 | proxies_list = [ 11 | { 12 | "ip_address": "http://39.175.67.28:30001" 13 | }, { 14 | "ip_address": "http://101.133.138.238:8118" 15 | }, { 16 | "ip_address": "http://58.246.58.150:9002" 17 | }, { 18 | "ip_address": "http://112.6.117.178:8085" 19 | }, { 20 | "ip_address": "http://221.122.91.74:9401" 21 | }, { 22 | "ip_address": "http://58.220.95.116:10122" 23 | }, { 24 | "ip_address": "http://58.220.95.32:10174" 25 | }, { 26 | "ip_address": "http://220.168.132.43:9015" 27 | }, { 28 | "ip_address": "http://112.6.117.135:8085" 29 | }, { 30 | "ip_address": "http://183.131.85.16:7302" 31 | }, { 32 | "ip_address": "http://223.96.90.216:8085" 33 | }, { 34 | "ip_address": "http://120.133.231.92:8000" 35 | }, { 36 | "ip_address": "http://58.220.95.35:10174" 37 | }, { 38 | "ip_address": "http://47.97.191.179:8018" 39 | }, { 40 | "ip_address": "http://58.220.95.116:10122" 41 | }, { 42 | "ip_address": "http://221.122.91.64:9401" 43 | }, { 44 | "ip_address": "http://123.57.246.163:8118" 45 | }, 46 | ] 47 | return random.choice(proxies_list) 48 | -------------------------------------------------------------------------------- /yibu_book_spider/get_ua.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/4/1 17:02 3 | # @Author : Torres-圣君 4 | # @File : get_ua.py 5 | # @Sofaware : PyCharm 6 | import random 7 | 8 | 9 | def get_ua(): 10 | user_agent_list = [ 11 | "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 12 | "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50", 13 | "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:38.0) Gecko/20100101 Firefox/38.0", 14 | "Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; .NET4.0C; .NET4.0E; .NET CLR 2.0.50727; .NET CLR 3.0.30729; .NET CLR 3.5.30729; InfoPath.3; rv:11.0) like Gecko", 15 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)", 16 | "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)", 17 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)", 18 | "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)", 19 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 20 | "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1", 21 | "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11", 22 | "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11", 23 | "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", 24 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)", 25 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)", 26 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)", 27 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 
5.1; The World)", 28 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)", 29 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)", 30 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)", 31 | "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)", 32 | "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", 33 | "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", 34 | "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5", 35 | "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 36 | "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1", 37 | "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10", 38 | "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13", 39 | "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+", 40 | "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0", 41 | "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124", 42 | "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)", 43 | "UCWEB7.0.2.37/28/999", 44 | "NOKIA5700/ UCWEB7.0.2.37/28/999", 45 | "Openwave/ UCWEB7.0.2.37/28/999", 46 | "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999", 47 | "Mozilla/6.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/536.26 (KHTML, like Gecko) Version/8.0 Mobile/10A5376e Safari/8536.25", 48 | ] 49 | # 设置UA伪装 50 | return random.choice(user_agent_list) 51 | -------------------------------------------------------------------------------- /yibu_book_spider/run_spider.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/3/30 21:08 3 | # @Author : Torres-圣君 4 | # @File : mian.py 5 | # @Sofaware : PyCharm 6 | import requests 7 | from .get_proxyz import get_proxies 8 | from .get_ua import get_ua 9 | from pymongo import MongoClient 10 | 11 | 12 | class CatchYibuBook: 13 | def __init__(self): 14 | self.url = f'https://www.epubit.com/pubcloud/content/front/portal/getUbookList' 15 | # 初始化MongoDB数据库并创建数据库连接 16 | self.mongo_address = '127.0.0.1' 17 | self.client = MongoClient(self.mongo_address, 27017) 18 | self.db = self.client['book'] 19 | self.col = self.db['yibutushu'] 20 | 21 | def get_data(self, i): 22 | headers = { 23 | 'Origin-Domain': 'www.epubit.com', 24 | 'User-Agent': get_ua() 25 | } 26 | params = { 27 | 'page': i, 28 | 'row': 20, 29 | 'startPrice': None, 30 | 'endPrice': None, 31 | 'tagId': None, 32 | } 33 | 34 | res = requests.get(self.url, headers=headers, params=params, proxies=get_proxies()) 35 | data = res.json() 36 | for i in range(0, 20): 37 | item = {} 38 | item['book_name'] = 
data['data']['records'][i]['name'] 39 | item['book_author'] = data['data']['records'][i]['authors'] 40 | item['book_price'] = data['data']['records'][i]['price'] 41 | item['book_tagNames'] = data['data']['records'][i]['tagNames'] 42 | item['book_link'] = "https://www.epubit.com/bookDetails?id=" + data['data']['records'][i]['code'] 43 | self.col.insert_one(item) 44 | print(item) 45 | 46 | def run(self, page): 47 | for i in range(1, page+1): 48 | # 设置抓取数据的页数 49 | self.get_data(i) 50 | # 断开连接mongo 51 | self.client.close() 52 | 53 | 54 | if __name__ == '__main__': 55 | num = int(input("请输入需要爬取的页数:")) 56 | # 实例化对象 57 | catch_msg = CatchYibuBook() 58 | catch_msg.run(num) 59 | -------------------------------------------------------------------------------- /yiqing_data_spider/README.md: -------------------------------------------------------------------------------- 1 | ##
✨获取城市实时疫情数据信息✨
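- 正式说明之前,先给出一个查询该接口的最小示意(仅为简化草图:省份名称与 UA 均为示例值,字段结构参考下方 `run_spider.py` 的解析逻辑,以接口实际响应为准):

```python
# 最小示意:从腾讯疫情接口按省份名称取汇总数据(简化草图,非本目录 run_spider.py 的完整实现)
import requests

url = "https://api.inews.qq.com/newsqa/v1/query/inner/publish/modules/list?modules=diseaseh5Shelf"
headers = {"User-Agent": "Mozilla/5.0"}  # 简化的UA,实际可参考 run_spider.py 中的请求头

shelf = requests.get(url, headers=headers).json()["data"]["diseaseh5Shelf"]
print("最近更新日期:", shelf["lastUpdateTime"])

# areaTree[0] 为全国数据,children 为各省份,省份下的 children 为城市
for province in shelf["areaTree"][0]["children"]:
    if province["name"] == "河南":  # 示例省份,可替换为其他省份名称
        print("新增确诊:", province["today"]["confirm"])
        print("目前确诊:", province["total"]["nowConfirm"])
        print("累计确诊:", province["total"]["confirm"])
        print("累计治愈:", province["total"]["heal"])
        print("累计死亡:", province["total"]["dead"])
        break
```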
2 | - 腾讯疫情数据API:[https://api.inews.qq.com/newsqa/v1/query/inner/publish/modules/list?modules=diseaseh5Shelf](https://api.inews.qq.com/newsqa/v1/query/inner/publish/modules/list?modules=diseaseh5Shelf) 3 | 4 | - 输入城市名称获取疫情数据,包含: 5 | - 最近更新日期 6 | - 新增确诊人数 7 | - 目前确诊人数 8 | - 累计确诊人数 9 | - 累计治愈人数 10 | - 累计死亡人数 11 | - 该爬虫使用到的模块: 12 | - json 13 | - requests 14 | -------------------------------------------------------------------------------- /yiqing_data_spider/city_list.json: -------------------------------------------------------------------------------- 1 | { 2 | "省": [ 3 | "台湾", 4 | "香港", 5 | "澳门", 6 | "天津", 7 | "安徽", 8 | "吉林", 9 | "广东", 10 | "上海", 11 | "福建", 12 | "内蒙古", 13 | "山东", 14 | "江苏", 15 | "北京", 16 | "重庆", 17 | "四川", 18 | "陕西", 19 | "云南", 20 | "浙江", 21 | "江西", 22 | "湖北", 23 | "辽宁", 24 | "湖南", 25 | "河北", 26 | "河南", 27 | "甘肃", 28 | "黑龙江", 29 | "新疆", 30 | "宁夏", 31 | "西藏", 32 | "海南", 33 | "广西", 34 | "山西", 35 | "贵州", 36 | "青海" 37 | ], 38 | "市": [ 39 | "地区待确认", 40 | "地区待确认", 41 | "地区待确认", 42 | "待确认", 43 | "境外输入", 44 | "河北区", 45 | "北辰区", 46 | "和平区", 47 | "河西区", 48 | "南开区", 49 | "东丽区", 50 | "西青区", 51 | "津南区", 52 | "滨海新区", 53 | "红桥区", 54 | "河东区", 55 | "蓟州区", 56 | "宁河区", 57 | "武清区", 58 | "宝坻区", 59 | "静海区", 60 | "外地来津", 61 | "宿州", 62 | "境外输入", 63 | "六安", 64 | "宣城", 65 | "滁州", 66 | "安庆", 67 | "淮北", 68 | "蚌埠", 69 | "黄山", 70 | "合肥", 71 | "淮南", 72 | "池州", 73 | "马鞍山", 74 | "阜阳", 75 | "亳州", 76 | "芜湖", 77 | "铜陵", 78 | "长春", 79 | "境外输入", 80 | "吉林市", 81 | "四平", 82 | "通化", 83 | "延边", 84 | "白城", 85 | "梅河口市", 86 | "长白山管委会", 87 | "松原", 88 | "辽源", 89 | "白山", 90 | "待确认", 91 | "深圳", 92 | "广州", 93 | "湛江", 94 | "境外输入", 95 | "珠海", 96 | "中山", 97 | "惠州", 98 | "肇庆", 99 | "茂名", 100 | "云浮", 101 | "江门", 102 | "佛山", 103 | "河源", 104 | "汕尾", 105 | "韶关", 106 | "阳江", 107 | "梅州", 108 | "汕头", 109 | "潮州", 110 | "揭阳", 111 | "清远", 112 | "东莞", 113 | "地区待确认", 114 | "黄浦", 115 | "浦东", 116 | "杨浦", 117 | "徐汇", 118 | "虹口", 119 | "静安", 120 | "闵行", 121 | "宝山", 122 | "长宁", 123 | "普陀", 124 | "嘉定", 125 | "崇明", 126 | "奉贤", 127 | "松江", 128 | "青浦", 129 | "境外输入", 130 | "金山", 131 | "外地来沪", 132 | "境外来沪", 133 | "地区待确认", 134 | "境外输入", 135 | "宁德", 136 | "莆田", 137 | "厦门", 138 | "漳州", 139 | "泉州", 140 | "南平", 141 | "福州", 142 | "三明", 143 | "龙岩", 144 | "地区待确认", 145 | "锡林郭勒", 146 | "境外输入", 147 | "赤峰", 148 | "呼和浩特", 149 | "鄂尔多斯", 150 | "巴彦淖尔", 151 | "乌海", 152 | "乌兰察布", 153 | "兴安盟", 154 | "通辽", 155 | "阿拉善盟", 156 | "包头", 157 | "呼伦贝尔", 158 | "境外输入", 159 | "青岛", 160 | "临沂", 161 | "淄博", 162 | "德州", 163 | "日照", 164 | "滨州", 165 | "枣庄", 166 | "威海", 167 | "泰安", 168 | "聊城", 169 | "济宁", 170 | "东营", 171 | "潍坊", 172 | "菏泽", 173 | "烟台", 174 | "济南", 175 | "地区待确认", 176 | "徐州", 177 | "南京", 178 | "盐城", 179 | "常州", 180 | "苏州", 181 | "无锡", 182 | "宿迁", 183 | "镇江", 184 | "泰州", 185 | "境外输入", 186 | "淮安", 187 | "连云港", 188 | "扬州", 189 | "南通", 190 | "地区待确认", 191 | "朝阳", 192 | "丰台", 193 | "海淀", 194 | "房山", 195 | "境外输入", 196 | "西城", 197 | "通州", 198 | "东城", 199 | "昌平", 200 | "大兴", 201 | "顺义", 202 | "石景山", 203 | "外地来京", 204 | "门头沟", 205 | "经济开发区", 206 | "涉奥闭环人员", 207 | "密云", 208 | "延庆", 209 | "怀柔", 210 | "平谷区", 211 | "地区待确认", 212 | "境外输入", 213 | "南岸区", 214 | "沙坪坝区", 215 | "綦江区", 216 | "荣昌区", 217 | "潼南区", 218 | "涪陵区", 219 | "长寿区", 220 | "奉节县", 221 | "大渡口区", 222 | "合川区", 223 | "万州区", 224 | "渝中区", 225 | "丰都县", 226 | "垫江县", 227 | "城口县", 228 | "石柱县", 229 | "铜梁区", 230 | "酉阳县", 231 | "秀山县", 232 | "璧山区", 233 | "巫溪县", 234 | "两江新区", 235 | "高新区", 236 | "大足区", 237 | "梁平区", 238 | "黔江区", 239 | "南川区", 240 | "开州区", 241 | "北碚区", 242 | "万盛经开区", 243 | "江北区", 244 | "江津区", 245 | "巫山县", 246 | "云阳县", 247 | 
"渝北区", 248 | "永川区", 249 | "武隆区", 250 | "巴南区", 251 | "忠县", 252 | "九龙坡区", 253 | "彭水县", 254 | "广安", 255 | "境外输入", 256 | "成都", 257 | "巴中", 258 | "乐山", 259 | "达州", 260 | "德阳", 261 | "广元", 262 | "遂宁", 263 | "资阳", 264 | "宜宾", 265 | "泸州", 266 | "雅安", 267 | "阿坝", 268 | "自贡", 269 | "南充", 270 | "凉山", 271 | "攀枝花", 272 | "绵阳", 273 | "眉山", 274 | "甘孜", 275 | "内江", 276 | "地区待确认", 277 | "境外输入", 278 | "西安", 279 | "咸阳", 280 | "延安", 281 | "汉中", 282 | "榆林", 283 | "铜川", 284 | "渭南", 285 | "杨凌", 286 | "宝鸡", 287 | "商洛", 288 | "安康", 289 | "地区待确认", 290 | "红河", 291 | "境外输入", 292 | "临沧", 293 | "普洱", 294 | "文山州", 295 | "昆明", 296 | "西双版纳州", 297 | "曲靖", 298 | "保山市", 299 | "昭通市", 300 | "怒江州", 301 | "德宏州", 302 | "大理", 303 | "楚雄州", 304 | "丽江市", 305 | "迪庆州", 306 | "玉溪", 307 | "地区待确认", 308 | "境外输入", 309 | "杭州", 310 | "嘉兴", 311 | "衢州", 312 | "金华", 313 | "宁波", 314 | "湖州", 315 | "绍兴", 316 | "舟山", 317 | "温州", 318 | "丽水", 319 | "台州", 320 | "省十里丰监狱", 321 | "地区待确认", 322 | "境外输入", 323 | "上饶", 324 | "抚州", 325 | "新余", 326 | "吉安", 327 | "宜春", 328 | "赣江新区", 329 | "景德镇", 330 | "鹰潭", 331 | "萍乡", 332 | "赣州", 333 | "南昌", 334 | "九江", 335 | "地区待确认", 336 | "境外输入", 337 | "鄂州", 338 | "恩施州", 339 | "神农架", 340 | "宜昌", 341 | "荆门", 342 | "天门", 343 | "黄石", 344 | "孝感", 345 | "十堰", 346 | "襄阳", 347 | "仙桃", 348 | "咸宁", 349 | "潜江", 350 | "黄冈", 351 | "随州", 352 | "武汉", 353 | "荆州", 354 | "沈阳", 355 | "营口", 356 | "丹东", 357 | "葫芦岛", 358 | "大连", 359 | "鞍山", 360 | "铁岭", 361 | "阜新", 362 | "境外输入", 363 | "本溪", 364 | "锦州", 365 | "抚顺", 366 | "朝阳市", 367 | "盘锦", 368 | "辽阳", 369 | "地区待确认", 370 | "邵阳", 371 | "境外输入", 372 | "长沙", 373 | "湘西自治州", 374 | "湘潭", 375 | "永州", 376 | "郴州", 377 | "岳阳", 378 | "怀化", 379 | "常德", 380 | "衡阳", 381 | "益阳", 382 | "张家界", 383 | "株洲", 384 | "娄底", 385 | "地区待确认", 386 | "廊坊", 387 | "沧州", 388 | "邯郸", 389 | "唐山", 390 | "保定", 391 | "秦皇岛", 392 | "定州", 393 | "雄安新区", 394 | "承德", 395 | "衡水", 396 | "石家庄", 397 | "张家口", 398 | "邢台", 399 | "境外输入", 400 | "辛集市", 401 | "地区待确认", 402 | "许昌", 403 | "郑州", 404 | "周口", 405 | "安阳", 406 | "平顶山", 407 | "信阳", 408 | "濮阳", 409 | "漯河", 410 | "开封", 411 | "洛阳", 412 | "商丘", 413 | "境外输入", 414 | "南阳", 415 | "三门峡", 416 | "济源示范区", 417 | "驻马店", 418 | "新乡", 419 | "鹤壁", 420 | "焦作", 421 | "地区待确认", 422 | "境外输入", 423 | "金昌", 424 | "地区待确认", 425 | "临夏", 426 | "平凉", 427 | "庆阳", 428 | "甘南州", 429 | "定西", 430 | "嘉峪关", 431 | "张掖", 432 | "天水", 433 | "酒泉", 434 | "兰州", 435 | "陇南", 436 | "白银", 437 | "武威", 438 | "境外输入", 439 | "哈尔滨", 440 | "牡丹江", 441 | "大庆", 442 | "鸡西", 443 | "地区待确认", 444 | "齐齐哈尔", 445 | "佳木斯", 446 | "双鸭山", 447 | "伊春", 448 | "绥化", 449 | "大兴安岭", 450 | "鹤岗", 451 | "黑河", 452 | "七台河", 453 | "兵团第十一师", 454 | "兵团第九师", 455 | "喀什", 456 | "地区待确认", 457 | "兵团第十二师", 458 | "第七师", 459 | "第八师石河子", 460 | "兵团第四师", 461 | "伊犁哈萨克自治州", 462 | "六师五家渠", 463 | "克孜州", 464 | "哈密", 465 | "阿克苏", 466 | "昌吉州", 467 | "博尔塔拉州", 468 | "吐鲁番", 469 | "阿勒泰", 470 | "和田", 471 | "巴音郭楞州", 472 | "塔城", 473 | "克拉玛依", 474 | "乌鲁木齐", 475 | "境外输入", 476 | "中卫", 477 | "地区待确认", 478 | "石嘴山", 479 | "固原", 480 | "银川", 481 | "吴忠", 482 | "宁东管委会", 483 | "那曲", 484 | "山南", 485 | "林芝", 486 | "阿里地区", 487 | "拉萨", 488 | "日喀则", 489 | "昌都", 490 | "三亚", 491 | "海口", 492 | "陵水县", 493 | "琼海", 494 | "儋州", 495 | "万宁", 496 | "昌江县", 497 | "定安县", 498 | "临高县", 499 | "保亭", 500 | "澄迈县", 501 | "琼中县", 502 | "三沙", 503 | "境外输入", 504 | "文昌", 505 | "东方", 506 | "乐东", 507 | "地区待确认", 508 | "防城港", 509 | "河池", 510 | "玉林", 511 | "钦州", 512 | "桂林", 513 | "贵港", 514 | "贺州", 515 | "梧州", 516 | "柳州", 517 | "来宾", 518 | "南宁", 519 | "百色", 520 | "北海", 521 | "崇左", 522 | "地区待确认", 523 | "境外输入", 524 | "境外输入", 525 | "阳泉", 526 | "长治", 527 | 
"晋中", 528 | "忻州", 529 | "吕梁", 530 | "大同", 531 | "临汾", 532 | "晋城", 533 | "太原", 534 | "运城", 535 | "朔州", 536 | "遵义", 537 | "铜仁", 538 | "毕节", 539 | "贵阳", 540 | "黔南州", 541 | "六盘水", 542 | "黔西南州", 543 | "境外输入", 544 | "安顺", 545 | "黔东南州", 546 | "地区待确认", 547 | "西宁", 548 | "海东", 549 | "玉树州", 550 | "海西州", 551 | "果洛州", 552 | "海南州", 553 | "海北州", 554 | "黄南州", 555 | "地区待确认" 556 | ] 557 | } -------------------------------------------------------------------------------- /yiqing_data_spider/run_spider.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/5/3 23:23 3 | # @Author : Torres-圣君 4 | # @File : run_spider.py 5 | # @Software : PyCharm 6 | import json 7 | import requests 8 | 9 | url = "https://api.inews.qq.com/newsqa/v1/query/inner/publish/modules/list?modules=diseaseh5Shelf" 10 | headers = { 11 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.66 Safari/537.36 Edg/103.0.1264.44" 12 | } 13 | 14 | 15 | def run(city): 16 | res = requests.get(url, headers=headers).json() 17 | all_data = res['data']['diseaseh5Shelf'] 18 | last_update_time = all_data["lastUpdateTime"] 19 | # 保存城市列表 20 | save_city_list(all_data) 21 | # 读取城市名称列表 22 | city_list = json.loads(open("city_list.json", encoding='utf-8').read()) 23 | city = city.strip("省市") 24 | if city == "中国": 25 | data_ = all_data["areaTree"][0] 26 | else: 27 | try: 28 | if city in city_list["省"]: 29 | # 提取当前省份的所有数据 30 | data_ = [x for x in all_data["areaTree"][0]["children"] if x["name"] == city][0] 31 | elif city in city_list["市"]: 32 | # 提取当前城市的所有数据 33 | data_ = [y for x in all_data["areaTree"][0]["children"] for y in x["children"] if y["name"] == city][0] 34 | else: 35 | return f"没有查询到{city}的疫情数据~" 36 | except IndexError: 37 | return "疫情接口出现异常,请稍后重试~" 38 | confirm = data_["total"]["confirm"] # 累计确诊 39 | heal = data_["total"]["heal"] # 累计治愈 40 | dead = data_["total"]["dead"] # 累计死亡 41 | now_confirm = data_["total"]["nowConfirm"] # 目前确诊 42 | add_confirm = data_["today"]["confirm"] # 新增确诊 43 | return f"{city}疫情更新日期:\n" \ 44 | f"{last_update_time}\n" \ 45 | f"————————————————————————\n" \ 46 | f"该地区疫情数据如下:\n" \ 47 | f"新增确诊:{add_confirm}\n" \ 48 | f"目前确诊:{now_confirm}\n" \ 49 | f"累计确诊:{confirm}\n" \ 50 | f"累计治愈:{heal}\n" \ 51 | f"累计死亡:{dead}" 52 | 53 | 54 | def save_city_list(all_data): 55 | with open("city_list.json", 'w', encoding='utf-8') as w: 56 | # 保存所有省份和城市名称 57 | sheng_list = [] 58 | shi_list = [] 59 | for i in all_data["areaTree"][0]["children"]: 60 | sheng_list.append(i["name"]) 61 | sheng_list.append(i["name"]+"省") 62 | for j in i["children"]: 63 | shi_list.append(j["name"]) 64 | shi_list.append(j["name"]+"市") 65 | dict_city = { 66 | "省": sheng_list, 67 | "市": shi_list 68 | } 69 | w.write(json.dumps(dict_city, indent=1, ensure_ascii=False)) 70 | print("城市列表保存完成!") 71 | 72 | 73 | city_name = input("请输入要查的城市名:") 74 | print(run(city_name)) 75 | -------------------------------------------------------------------------------- /youdao_fanyi_spider/README.md: -------------------------------------------------------------------------------- 1 | ##
✨有道在线翻译接口✨
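- 下文参数分析中提到的 `salt`、`lts`、`sign` 三个参数,可先参考这个简化示意(仅为草图:固定盐值与时间戳位数取自本目录 `run_spider.py`,接口是否接受以实际抓包为准):

```python
# 简化示意:构造有道翻译接口所需的 salt / lts / sign 参数(与 run_spider.py 中 get_fromdata 的逻辑一致)
import hashlib
import time

word = "hello"                        # 需要翻译的文本,示例值
lts = str(int(time.time() * 1000))    # 13位时间戳
salt = str(int(time.time() * 10000))  # 14位时间戳,相当于在毫秒时间戳后再补一位
raw = "fanyideskweb" + word + salt + "Ygy_4c=r#e#4EX^NUGUc5"  # md5加盐:固定前缀 + 文本 + salt + 固定盐值
sign = hashlib.md5(raw.encode()).hexdigest()
print(salt, lts, sign)
```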
2 | - 有道在线翻译:[https://fanyi.youdao.com/](https://fanyi.youdao.com/) 3 | 4 | ```python 5 | 通过抓包获取到接口后,查看其携带的参数信息 6 | 通过对参数的分析得出: 7 | 'i':需要翻译的文本 8 | 'salt':14位的时间戳 9 | 'sign':使用的是md5密码加盐方式,对需要翻译的文本加盐后进行加密 10 | 'lts':13位的时间戳 11 | 除此之外,其余的参数则都为固定值 12 | 参数都解决完成后,携带这些参数对接口发送请求即可 13 | ``` 14 | 15 | - 该爬虫使用到的模块: 16 | - requests 17 | - hashlib 18 | - time 19 | -------------------------------------------------------------------------------- /youdao_fanyi_spider/run_spider.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/6/14 21:00 3 | # @Author : Torres-圣君 4 | # @File : run_spider.py 5 | # @Software : PyCharm 6 | import requests 7 | import hashlib 8 | import time 9 | 10 | 11 | class YouDao(object): 12 | def __init__(self, word): 13 | self.word = word 14 | self.headers = { 15 | 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/' 16 | '99.0.4844.51 Safari/537.36 Edg/99.0.1150.39', 17 | 'Cookie': 'OUTFOX_SEARCH_USER_ID=1277855906@10.108.160.101; OUTFOX_SEARCH_USER_ID_NCOO=1759159210.6581216; ___rl__test__cookies=1656644180767; fanyi-ad-id=307488; fanyi-ad-closed=0', 18 | 'Referer': 'https://fanyi.youdao.com/' 19 | } 20 | 21 | def run(self): 22 | url = 'https://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule' 23 | res = requests.post(url, headers=self.headers, data=self.get_fromdata()) 24 | data = res.json() 25 | print(f"{'-'*100}\n", data['translateResult'][0][0]['tgt']) 26 | 27 | def get_fromdata(self): 28 | """ 29 | ts: "" + (new Date).getTime(), 30 | salt: ts + parseInt(10 * Math.random(), 10);, 31 | sign: n.md5("fanyideskweb" + e + i + "Ygy_4c=r#e#4EX^NUGUc5") 32 | """ 33 | salt = str(int(time.time()*10000)) # 14位 34 | lts = str(int(time.time() * 1000)) # 13位 35 | 36 | # MD5加密 37 | data = "fanyideskweb" + self.word + salt + "Ygy_4c=r#e#4EX^NUGUc5" 38 | md5 = hashlib.md5() 39 | md5.update(data.encode()) 40 | sign = md5.hexdigest() 41 | 42 | fromdata = { 43 | "i": self.word, 44 | "from": "AUTO", 45 | "to": "AUTO", 46 | "smartresult": "dict", 47 | "client": "fanyideskweb", 48 | "salt": salt, 49 | "sign": sign, 50 | "lts": lts, 51 | "bv": "8c5b4ecb9f7fdfe6b2997ab984775a98", 52 | "doctype": "json", 53 | "version": "2.1", 54 | "keyfrom": "fanyi.web", 55 | "action": "FY_BY_REALTlME" 56 | } 57 | return fromdata 58 | 59 | 60 | if __name__ == '__main__': 61 | content = input("请输入需要翻译的内容:") 62 | youdao = YouDao(content) 63 | youdao.run() 64 | -------------------------------------------------------------------------------- /ziroom_message_spider/README.md: -------------------------------------------------------------------------------- 1 | ##
✨获取自如网房源信息✨
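- 下方思路第 4 步的“位置→数字”映射替换,可先参考这个简化示意(仅为草图:坐标列表取自 `run_spider.py`,OCR 识别结果与 style 取值均为假设的示例数据):

```python
# 简化示意:用固定坐标与 OCR 识别出的数字建立映射,再把每个 span 的 background-position 还原成数字
position_list = ['-0px', '-21.4px', '-42.8px', '-64.2px', '-85.6px',
                 '-107px', '-128.4px', '-149.8px', '-171.2px', '-192.6px']
ocr_nums = "0123456789"  # 假设这是 pytesseract 的识别结果,实际每次请求都会变化
fonts_dic = dict(zip(position_list, ocr_nums))

# 假设这是页面中某个价格的数字 span 依次携带的 style 值
styles = ['background-position: -107px', 'background-position: -21.4px',
          'background-position: -0px', 'background-position: -0px']
price = ''.join(fonts_dic[s.split(" ")[-1]] for s in styles) + "元/月"
print(price)  # 按上面假设的映射,输出:5100元/月
```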
2 | - 自如网官网:[https://www.ziroom.com/z/](https://www.ziroom.com/z/) 3 | 4 | ```python 5 | 字体反爬大体思路: 6 | 1. 通过自如网页面的源码中,提取房价数字的背景图片链接,并保存图片 7 | 2. 使用'PIL'的'Image'将数字图片和纯黑色图片合并(因为保存的图片背景为透明,pytesseract无法识别) 8 | 3. 合并后会生成'text.png'图片,再使用'pytesseract'进行识别提取数字 9 | 4. 将提取的数字和坐标值(固定的)建立映射,再将数字'position'对应的坐标替换对应的数字即可 10 | ``` 11 | 12 | - 该爬虫使用到的模块: 13 | - requests 14 | - re 15 | - time 16 | - lxml 17 | - pytesseract 18 | - PIL 19 | -------------------------------------------------------------------------------- /ziroom_message_spider/ocr_img/bg_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/ziroom_message_spider/ocr_img/bg_image.png -------------------------------------------------------------------------------- /ziroom_message_spider/ocr_img/black_img.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/ziroom_message_spider/ocr_img/black_img.png -------------------------------------------------------------------------------- /ziroom_message_spider/ocr_img/text.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/ziroom_message_spider/ocr_img/text.png -------------------------------------------------------------------------------- /ziroom_message_spider/run_spider.py: -------------------------------------------------------------------------------- 1 | # encoding: utf-8 2 | # @Time : 2022/6/30 14:55 3 | # @Author : Torres-圣君 4 | # @File : run_spider.py 5 | # @Software : PyCharm 6 | import requests 7 | import re 8 | import time 9 | from lxml import etree 10 | import pytesseract 11 | from PIL import Image 12 | 13 | 14 | class DetailedData: 15 | def __init__(self, page_num): 16 | self.urls = [f'https://www.ziroom.com/z/p{num + 1}/' for num in range(page_num)] 17 | self.headers = { 18 | "Cookie": "CURRENT_CITY_CODE=110000; CURRENT_CITY_NAME=%E5%8C%97%E4%BA%AC; _csrf=yjfN8G-kzNnGvj1iEvjH6O1x3TNy89d0; __jsluid_s=4174712fab682cd6df16575532ddfe6b; sajssdk_2015_cross_new_user=1; gr_user_id=383b4bb6-6a6b-4901-a057-9c620a3e2e26; __jsluid_h=1df213e6b4954e733185c9409be2a2e7; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22181b36097ea1b3-0f16d08755ce17-4f617f5b-1327104-181b36097eb316%22%2C%22%24device_id%22%3A%22181b36097ea1b3-0f16d08755ce17-4f617f5b-1327104-181b36097eb316%22%2C%22props%22%3A%7B%22%24latest_referrer%22%3A%22%22%2C%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%7D%7D", 19 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36 Edg/103.0.1264.37" 20 | } 21 | # 固定的数字位置 22 | self.position_list = ['-0px', '-21.4px', '-42.8px', '-64.2px', '-85.6px', '-107px', '-128.4px', '-149.8px', '-171.2px', '-192.6px'] 23 | 24 | def run(self): 25 | for url in self.urls: 26 | self.page_data(url) 27 | time.sleep(5) 28 | 29 | def page_data(self, url): 30 | res = requests.get(url, headers=self.headers).text 31 | # 保存数字背景图片 32 | self.download_img(res) 33 | # 使用ocr识别图片 34 | fonts_dic = self.ocr_fonts() 35 | print(fonts_dic) 36 | html = etree.HTML(res) 37 | div_list = 
html.xpath('//div[@class="Z_list-box"]/div') 38 | for div in div_list: 39 | room_link = "https:" + div.xpath('./div[2]/h5/a/@href')[0] 40 | title = div.xpath('./div[2]/h5/a/text()')[0] 41 | area = div.xpath('./div[2]/div[1]/div[1]/text()')[0] 42 | address = div.xpath('./div[2]/div[1]/div[2]/text()')[0].strip() 43 | bg_link = div.xpath('.//span[@class="num"]/@style') 44 | price = self.decrypt_font(bg_link, fonts_dic) 45 | item = [room_link, title, area, address, price] 46 | print(item) 47 | self.save_data(item) 48 | 49 | def download_img(self, res): 50 | # 在页面源码中提取图片链接 51 | img = re.findall(r'//static8.ziroom.com/phoenix/pc/images/price/new-list/(.*?)\);', res)[0] 52 | img_url = "https://static8.ziroom.com/phoenix/pc/images/price/new-list/" + img 53 | # 以二进制写入文件保存图片 54 | img_data = requests.get(img_url, headers=self.headers).content 55 | with open('ocr_img/bg_image.png', 'wb') as w: 56 | w.write(img_data) 57 | 58 | def ocr_fonts(self): 59 | # 纯黑色背景图(数字图片背景透明,需先垫黑底才能被识别) 60 | black_img = Image.open('ocr_img/black_img.png') 61 | # 数字背景图 62 | bg_img = Image.open('ocr_img/bg_image.png') 63 | # 改变图像尺寸 64 | img1 = black_img.resize((600, 100)) 65 | img2 = bg_img.resize((560, 60)) 66 | # 合并两个图像,bg_img 放到黑色背景图上并指定坐标(不能完全重叠) 67 | img1.paste(img2, (30, 20)) 68 | # 保存合并后的图片 69 | img1.save("ocr_img/text.png") 70 | # 使用合并后的图 71 | image = Image.open('ocr_img/text.png') 72 | # 转为灰度图,便于ocr识别 73 | Img = image.convert('L') 74 | # 识别提取图片中的内容 75 | text = pytesseract.image_to_string(Img) 76 | # 过滤空格和换行,只保留识别出的数字 77 | nums = [num for num in text if num.strip()] 78 | fonts_dic = {} 79 | # 把位置和数字存放为字典 80 | for k, v in zip(self.position_list, nums): 81 | fonts_dic[k] = v 82 | return fonts_dic 83 | 84 | def decrypt_font(self, bg_link, fonts_dic): 85 | price_list = [] 86 | # 替换价格的每个数字 87 | for bg in bg_link: 88 | position = bg.split(" ")[-1] 89 | num = fonts_dic[position] 90 | price_list.append(num) 91 | # 拼接成完整的价格 92 | price = ''.join(price_list) + "元/月" 93 | return price 94 | 95 | def save_data(self, item): 96 | with open('自如网租房房源信息.csv', 'a+', encoding='utf-8') as w: 97 | w.seek(0) 98 | flag = w.read() == "" 99 | if flag: 100 | w.write("链接,标题,面积,地址,房价\n") 101 | w.write(','.join(item) + "\n") 102 | 103 | 104 | if __name__ == '__main__': 105 | page_num = int(input("请输入需要获取的页码:")) 106 | dd = DetailedData(page_num) 107 | dd.run() 108 | -------------------------------------------------------------------------------- /ziroom_message_spider/自如网租房房源信息.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cjladmin/spider_cases/68fe8763738066b07aa4ebafb94c3efd9e0fae96/ziroom_message_spider/自如网租房房源信息.csv --------------------------------------------------------------------------------