├── README.MD ├── api.py ├── auto.sh ├── download.py ├── log.txt ├── push.py ├── requirements.txt ├── sky.py └── skyDate 2021-11-15.csv /README.MD: -------------------------------------------------------------------------------- 1 | 7 | # 光遇官网——光遇博物馆爬虫脚本 8 | ## 前言 9 | ![献祭完的图片](https://i.loli.net/2021/02/02/lsPKiMowSvn94Ic.jpg) 10 | ![禁阁终点](https://i.loli.net/2021/02/02/qRJpxn52VYBuAKf.png) 11 | > 感恩季 追光季 归属季 凛冬季 魔法季 圣岛季 预言季 很荣幸能陪光遇走到现在,还有那些温柔的人 这一年多 有笑 有泪 不管怎样,我永远不会忘记我在这里度过的时光,开完好友树的,只开了对话的,只加了好友的,甚至是点亮过的小黑,很高兴认识你们,这一路承蒙照顾 真的很爱光遇,祝光遇越来越好,陈老师加油! 12 | ## 运行 13 | 1. 一定要在 `python3.X` 环境下运行本项目 14 | 2. 运行前请安装项目依赖的模块: 15 | `pip install -r requriements.txt ` 16 | 3. Linux运行:`sudo python3 sky.py` 17 | 4. 18 | ```shell 19 | sudo python3 sky.py # 获取CSV文件 20 | sudo python3 download.py #下载 21 | sudo python3 api.py # api模块 22 | ``` 23 | ## 爬取部分 24 | - 1. **爬取页面--光遇博物馆** 25 | > https://game.163.com/star/sky/index.html 26 | - 2. 在脚本目录生成 **skyData {time}.csv** 文件 27 | - 文件的结构: 28 | ```csv 29 | title,text,tags,name,picList,time 30 | 标题,文字内容(一些没有),标签,作者名,图片列表,发表时间 31 | ``` 32 | - 3. 本脚本 **可以一直爬取到`2019/6/27 2:06:24`的第一条内容** ,标题好像是: 33 | > 与温暖的灵魂相遇 34 | - 4. 在我的win10上用Excel读取CSV文件已**不存在中文乱码的问题** 35 | - 关于Python CSV中文乱码问题的解决: 36 | ```python 37 | import csv 38 | import codecs # 处理csv乱码 39 | with codecs.open("XXX.csv".format(new_time), "w", encoding="utf_8_sig") as cvs_file: 40 | writer = csv.DictWriter(cvs_file, headers) 41 | writer.writeheader() #写表头v 42 | writer.writerows(Data) #写入多行字典数据 43 | ``` 44 | >### 如果还是存在中文乱码问题的话请提交`issues` 45 | ## 下载部分 46 | **2021.2.5** ***已初步实现下载功能*** [详细请查看download.py](/download.py) 47 | > ### 运行前记得改路径哈! 48 | - 下载结构 49 | ```dat 50 | ├─Skydownload 51 | ├─pic //图片文件夹 52 | ├─ 图片标签名文件夹 53 | ├─XXX.jpg 54 | ├─txt //文字文件夹 55 | ├─标题_作者.txt 56 | ├─video //视频文件夹 57 | ├─标题.mp4 58 | ``` 59 | - **实践** 2020.2.4 自己下载了大约1小时 大概 ***10GB*** 左右叭 60 | ![下载完后的大小](https://i.loli.net/2021/02/05/y1XlvcwS6qZfnd7.png) 61 | ## API部分 62 | - 运行:`python3 api.py` 63 | - 获取json信息: **GET POST** `/sky/json/` 64 | 返回: 65 | ```json 66 | { 67 | "status": 200, # 正常返回200 错误返回500 68 | "data": { 69 | "title": "积云中藏有秘密", # 标题 70 | "text": "nan", # 文字 71 | "tag": "游戏截图", # 标签 72 | "time": "2019-07-15 14:06:44", # 发布日期 73 | "pic_url": "https://kol-fans.fp.ps.netease.com/file/ 5d2c88747f9d2abb9e34d4d5qBIAwTQd02", # 图片链接 74 | "content": "积云中藏有秘密|游戏截图" # 整理后的文字 75 | } 76 | ``` 77 | **异常返回:** 78 | ```json 79 | { 80 | "status": 500, 81 | "data": { 82 | "content": "出现了其他异常 {异常信息}", 83 | "pic_url": "https://i.loli.net/2021/02/18/w36CqS2FPkdvcV9.jpg", # 异常图片 84 | } 85 | } 86 | ``` 87 | - 直接重定向到图片地址: **GET POST** `/sky/pic/` 88 | ## 自动提交shell部分 89 | > 参见 **[auto.sh](./auto.sh)** 90 | - Linux 下使用crontab实现 **自动运行** 91 | > 安利一个很好用的crontab [表达式生成工具](https://qqe2.com/cron) 92 | 93 | ```shell 94 | sudo apt-get install crontabs 95 | # 用普通用户执行 96 | crontab -l # 查看 97 | crontab -e # 编辑 98 | # 添加 99 | 30 15 0 * * ? * pi /home/pi/skyData/auto.sh 100 | ``` 101 | ## TODO 102 | - [X] ~~添加`crantab`自动运行和自动提交`Git`的`shell` **脚本** 每天自动更新提交到`GitHub`~~(2020.2.19基本完成 **要开学了 嘤嘤嘤**) 103 | - [X] ~~添加 **下载列表中的图片功能** ,并把图片按文件夹**归类**~~ (2021.2.5已完成) 104 | - [ ] 爬取更多有关`Sky·光遇`的东西 (打算爬取 **光遇手游weibo**) 105 | - [X] ~~用Python Flask模块开发个 **光遇随机图api**~~(2021.2.18已完成) 106 | ## 后 107 | - 此项目由一个**初中生**开发,仅供**学习用途** 108 | - 如果有什么好的建议也欢迎提出 **Issues** ,也欢迎各位大佬 **Pull resquests** 这个项目. 109 | - 诶呀! 点个 **Star** 再走叭!! mua~ /可爱 110 | 111 | -------------------------------------------------------------------------------- /api.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: whalefall 3 | Date: 2021-02-17 06:56:41 4 | LastEditTime: 2021-02-18 18:51:11 5 | Description: Sky光遇随机图api 6 | ''' 7 | import csv 8 | from flask import * 9 | import json 10 | import os 11 | import sys 12 | import pandas as pd 13 | import random 14 | from ast import literal_eval # 将字符串列表转化为列表 15 | 16 | # 功能函数部分 17 | # 获取目录下的第一个CSV文件 18 | 19 | 20 | def getCsvPath(): 21 | result = os.listdir(os.getcwd()) 22 | 23 | for csvFileName in result: 24 | if ".csv" in csvFileName: 25 | csv_path = os.path.join(os.getcwd(), csvFileName) 26 | return csv_path 27 | else: 28 | pass 29 | 30 | print("[Error]目录下无法找到CSV文件!即将调用 sky.py 生成") 31 | os.system("python {}".format(os.path.join(os.getcwd(), "sky.py"))) 32 | sys.exit() 33 | 34 | # 读取CSV文件 35 | 36 | 37 | def readCSV(path): 38 | # print(path) 39 | try: 40 | df = pd.read_csv(path) 41 | except Exception as e: 42 | print("[Error]读取CSV文件时出现错误 %s" % (e)) 43 | # index_num = df.shape[0] # 取出总行数 44 | index_num = len(df.index) 45 | rand = random.randint(1, index_num) 46 | # 取出的数据标签不能为视频 并 图片列表不为空! 47 | df = df[(df["tags"] != "视频") & ( 48 | df["picList"] != "[]")] 49 | 50 | data = df.loc[rand:rand] # 取出随机某一列 51 | 52 | # 取出数据 53 | title = data["title"].values[0] 54 | text = data["text"].values[0] 55 | tags = data["tags"].values[0] 56 | # Empty DataFrame 57 | pic_url = random.choice(literal_eval(data["picList"].values[0])) 58 | time = data["time"].values[0] 59 | 60 | return str(title), str(text), str(tags), str(pic_url), str(time) 61 | 62 | 63 | # readCSV(getCsvPath()) 64 | 65 | 66 | # Flask接口部分 67 | app = Flask(__name__) 68 | 69 | # 返回json信息 70 | 71 | 72 | @app.route("/sky/json/", methods=["GET", "POST"]) 73 | def skyJson(): 74 | try: 75 | try: 76 | title, text, tags, pic_url, time = readCSV(getCsvPath()) 77 | except IndexError: 78 | print("[Error]可能出现了Empty DataFrame错误 我也不知道怎么解决了唉 重新生成试试") 79 | title, text, tags, pic_url, time = readCSV(getCsvPath()) 80 | 81 | if text == "NaN" or text == "nan": 82 | content = title + "|" + tags 83 | else: 84 | content = title + "|" + text 85 | 86 | dictData = { 87 | "status": 200, 88 | "data": { 89 | "title": title, 90 | "text": text, 91 | "tag": tags, 92 | "time": time, 93 | "pic_url": pic_url, 94 | "content": content, 95 | } 96 | } 97 | 98 | except IndexError: 99 | dictData = { 100 | "status": 500, 101 | "data": { 102 | "content": "出现了极低概率的错误(两次Empty DataFrame),请重新发送请求叭", 103 | "pic_url": "https://i.loli.net/2021/02/18/w36CqS2FPkdvcV9.jpg", 104 | } 105 | } 106 | except Exception as e: 107 | dictData = { 108 | "status": 500, 109 | "data": { 110 | "content": "出现了其他异常 %s" % (e), 111 | "pic_url": "https://i.loli.net/2021/02/18/w36CqS2FPkdvcV9.jpg", 112 | } 113 | } 114 | return json.dumps(dictData, ensure_ascii=False) 115 | 116 | # 直接重定向到图片 117 | 118 | 119 | @app.route("/sky/pic/", methods=["GET", "POST"]) 120 | def skyPic(): 121 | try: 122 | try: 123 | title, text, tags, pic_url, time = readCSV(getCsvPath()) 124 | except IndexError: 125 | print("[Error]可能出现了Empty DataFrame错误 我也不知道怎么解决了唉 重新生成试试") 126 | title, text, tags, pic_url, time = readCSV(getCsvPath()) 127 | except Exception as e: 128 | print("[Error]出现其他异常%s" % e) 129 | return redirect("https://i.loli.net/2021/02/18/w36CqS2FPkdvcV9.jpg") 130 | else: 131 | return redirect(pic_url) 132 | 133 | 134 | if __name__ == "__main__": 135 | 136 | app.run(host="0.0.0.0", port=5000, debug=True, threaded=True) 137 | -------------------------------------------------------------------------------- /auto.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | path_shell=$(dirname $(readlink -f "$0")) 3 | echo "脚本目录:"$path_shell"" 4 | 5 | cd $path_shell 6 | py_log=$(python3 sky.py) 7 | echo "###################获取完成####################" 8 | 9 | if [ "$?" != "0" ];then 10 | echo "运行 sky.py 出现错误!" 11 | time=$(date "+%Y-%m-%d %H:%M:%S") 12 | python3 push.py "False" "${time}" "${py_log}" 13 | exit 1 14 | fi 15 | 16 | echo "##########初始化Git远程仓库#############" 17 | git config --global user.name "adminwhalefall" 18 | git config --global user.email "2734184475@qq.com" 19 | git remote rm github 20 | git remote add github git@github.com:AdminWhaleFall/skyData.git 21 | echo "##########提交GitHub#############" 22 | git add . 23 | time=$(date "+%Y-%m-%d %H:%M:%S") 24 | git commit -m "${time} 自动更新提交Sky_CSV文件" 25 | 26 | t=0 27 | while (( $t <= 10 )) 28 | do 29 | git_log=$(git push -u github master 2>&1) 30 | # git_log=$(cat /sassass/sas 2>&1) 31 | if [ "$?" != "0" ];then 32 | echo "#############尝试第"$i"次##################" 33 | echo $git_log 34 | # exit 1 35 | else 36 | echo $git_log 37 | python3 push.py "True" "${time}" "${git_log}" 38 | exit 1 39 | fi 40 | let "t++" 41 | done 42 | 43 | python3 push.py "False" "${time}" "${git_log}" 44 | 45 | 46 | # 强制同步远程仓库 47 | # git fetch --all 48 | # git reset --hard origin/master 49 | # git fetch 50 | 51 | # 请使用 /bin/bash 运行!这是一个极大的坑点 52 | 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /download.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Author: whalefall 3 | Date: 2021-02-03 12:12:36 4 | LastEditors: Please set LastEditors 5 | LastEditTime: 2021-02-18 14:33:14 6 | Description: 读取生成的CVS文件 最好是多线程下载里边的图片 并自动归类到一个文件夹里边 7 | ''' 8 | import codecs 9 | import csv 10 | import os 11 | import random 12 | import string 13 | import threading 14 | import time 15 | from ast import literal_eval 16 | import re 17 | 18 | import requests 19 | from fake_useragent import UserAgent 20 | 21 | # 创建数据下载文件夹 22 | try: 23 | # os.mkdir("D://skyDownload") 24 | dirList = ["txt", "pic", "video"] # 需要创建的下级目录 25 | for d in dirList: 26 | # 嵌套一个try防止有些憨憨手残删掉了一个文件夹导致新建失误 27 | try: 28 | os.makedirs("D://skyDownload//{}".format(d)) 29 | except FileExistsError: 30 | continue 31 | # except FileExistsError: 32 | # print("文件夹已存在!") 33 | except Exception as e: 34 | # 通常是权限之类的错误叭 35 | print("创建文件夹时出现未知错误", e) 36 | 37 | # 生成随机数 38 | 39 | 40 | def ran(): 41 | charlist = [random.choice(string.ascii_uppercase) for i in range(6)] 42 | chars = ''.join(charlist) 43 | return chars 44 | 45 | # 去除不合规则的文件名 46 | 47 | 48 | def validateTitle(title): 49 | rstr = r"[\/\\\:\*\?\"\<\>\|]" # '/ \ : * ? " < > |' 50 | new_title = re.sub(rstr, "_", title) # 替换为下划线 51 | return new_title 52 | 53 | # 下载保存函数 传入:类型(pic,txt,video) 作者 标题 标签 链接(文字) 没有的传0即可 54 | 55 | 56 | def download(contentType, name, title, tags, url): 57 | if contentType == "txt": 58 | name = validateTitle(name) 59 | title = validateTitle(title) 60 | 61 | with open(r"D://skyDownload//txt//{}_{}.txt".format(title, name), "w",encoding="utf8") as txt: 62 | txt.write(str(url)) 63 | print("[Txt]../txt/{}_{}.txt 下载成功".format(title, name)) 64 | 65 | elif contentType == "pic": # 这里要新建标签文件夹 如果存在异常处理直接写入 66 | 67 | resp = requests.get( 68 | url, headers={"User-Agent": UserAgent().random}).content 69 | 70 | try: 71 | os.makedirs(r"D://skyDownload//pic//{}".format(tags)) 72 | time_s = ran() 73 | with open(r"D://skyDownload//pic//{}//{}.jpg".format(tags, time_s), "wb") as pic: 74 | pic.write(resp) 75 | print("[Pic]../pic/{}/{}.jpg 下载成功".format(tags, time_s)) 76 | except FileExistsError: # 文件夹已存在时 直接由时间戳命名算了 随机字母 77 | time_s = ran() 78 | with open(r"D://skyDownload//pic//{}//{}.jpg".format(tags, time_s), "wb") as pic: 79 | pic.write(resp) 80 | print("[Pic]../pic/{}/{}.jpg 下载成功".format(tags, time_s)) 81 | except Exception as e: 82 | print("下载图片出现错误!", e) 83 | 84 | elif contentType == "video": 85 | resp = requests.get( 86 | url, headers={"User-Agent": UserAgent().random}).content 87 | 88 | title = validateTitle(title) 89 | with open("D://skyDownload//video//{}.mp4".format(title), "wb") as video: 90 | video.write(resp) 91 | print("[Video]../Video/{}.mp4 下载成功".format(title)) 92 | else: 93 | raise TypeError("传入的类型参数错误!") 94 | 95 | 96 | # download("txt", "hyy", "sas", "sa", "http://baidu.com") 97 | 98 | # 列出脚本目录下的文件 并匹配csv文件 99 | result = os.listdir(os.getcwd()) 100 | 101 | for csvFileName in result: 102 | if ".csv" in csvFileName: 103 | # 匹配到一个就退出了 104 | csv_path = "{}\{}".format(os.getcwd(), csvFileName) 105 | print("在脚本目录找到的.CSV文件:{}".format(csv_path)) 106 | break 107 | 108 | with codecs.open("{}".format(csv_path), "r", encoding="utf_8_sig") as f: 109 | reader = csv.DictReader(f) 110 | for row in reader: 111 | tags = row["tags"] 112 | text = row["text"] 113 | picList = literal_eval(row["picList"]) 114 | # print(picList) 115 | name = row["name"] 116 | title = row["title"] 117 | # 这里可能会出现诡异错误 118 | try: 119 | # print(tags,text,picList,name,title) 120 | # - 判断文字为空的情况 121 | # 为空-->判断tags是否为视频 122 | # -->否 下载图片到 ../pic/图片标签文件夹/.jpg 123 | # -->是 下载视频到 ../video/视频标题.mp4 124 | # - 不为空-->把文字下载到 ../txt/标题——作者.txt 125 | 126 | if text == "": 127 | if tags == "视频": 128 | if picList == "[]": 129 | pass 130 | for url in picList: 131 | download("video", name, title, tags, url) 132 | else: 133 | if picList == "[]": 134 | pass 135 | for url in picList: 136 | # print(url) 137 | download("pic", name, title, tags, url) 138 | else: 139 | download("txt", name, title, tags, text) 140 | 141 | except ConnectionResetError: 142 | 143 | print("爬取太快!服务器可能拒绝!") 144 | time.sleep(10) 145 | 146 | except Exception as e: 147 | print("出现诡异错误",e) 148 | time.sleep(4) 149 | finally: 150 | continue 151 | -------------------------------------------------------------------------------- /push.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python3 2 | # 酷推推送模块 支持传入参数 3 | import requests 4 | import sys 5 | # 酷推token 6 | token = "92f83d0596c7b553ea1df9f242e4fc46" 7 | 8 | status = sys.argv[1] 9 | time = sys.argv[2] 10 | log = sys.argv[3] 11 | 12 | # print(status,time,log) 13 | # sys.exit() 14 | 15 | url = "https://push.xuthus.cc/send/%s" % (token) 16 | 17 | if status == "True": 18 | content = "[SkyData] \n%s 自动提交skyCSV成功!\nGit信息:%s" % (time,log) 19 | elif status == "False": 20 | content = "[SkyData] \n%s 自动提交skyCSV失败!错误信息:\n%s" % (time, log) 21 | else: 22 | raise TypeError("参数有误") 23 | 24 | data = "%s" % (content) 25 | res = requests.post(url=url, data=data.encode('utf-8')) 26 | print(res.text) 27 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fake_useragent==0.1.11 2 | pandas==1.2.2 3 | requests==2.24.0 4 | Flask==1.1.2 5 | -------------------------------------------------------------------------------- /sky.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/python3 2 | ''' 3 | Author: WhaleFall 4 | Date: 2021-02-01 10:57:20 5 | LastEditTime: 2021-02-17 15:28:26 6 | Description: sky光遇官网爬虫 7 | Url:https://game.163.com/star/sky/index.html 光遇博物馆 8 | ''' 9 | import requests 10 | import csv 11 | import codecs # 处理csv乱码 12 | import re 13 | import time # 处理时间 14 | import sys # 获取脚本目录 15 | import os 16 | 17 | path = sys.path[0] 18 | 19 | # 获取当前时间 2021-01-23 20 | new_time = time.strftime("%Y-%m-%d") 21 | 22 | # print(new_time) 23 | 24 | 25 | # 获取 传入获取页数 60 120 180这样递增 但是每次只返回60条数据 26 | def getContent(page): 27 | url = "https://kol.tongren.163.com/article/" 28 | data = { 29 | "sort": "new", 30 | "game": "光遇", 31 | "random_hot_value": "10", 32 | "span": "60", # 每次获取的条数 33 | "start": str(page), 34 | } 35 | header = { 36 | "Accept": "application/json, text/javascript, */*; q=0.01", 37 | "User-Agent": 38 | "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.104 Safari/537.36", 39 | "Host": "kol.tongren.163.com", 40 | "Origin": "http://game.163.com", 41 | "Referer": "http://game.163.com/star/sky/index.html", 42 | } 43 | 44 | try: 45 | response = requests.get(url, params=data, headers=header).json() 46 | # print(response) 47 | articles = response["data"]["articles"] 48 | if articles == []: 49 | print("全部采集完毕!") 50 | sys.exit() 51 | return "全部采集完毕!" 52 | except Exception as e: 53 | print("[Eroor]响应信息:", response) 54 | raise 55 | 56 | DataAll = [] # 所有数据 57 | # 取一个用户的主体数据 58 | for arts in articles: 59 | # print(arts) 60 | name = arts["author"]["nickname"] # 作者名字 61 | 62 | # 遍历取用户主体数据 63 | datas = arts['body'] # 用户主体数据里面的主体内容 64 | picList = [] # 储存图片链接 65 | for data in datas: 66 | 67 | content = data.get('fp_data') 68 | if content == None: 69 | text_content = data.get('text_content') 70 | else: 71 | text_content = None 72 | pic_url = content.get("url") 73 | picList.append(pic_url) # 添加到图片列表 74 | 75 | tags = arts['tags'][0] # 标签 76 | title = arts['title'] # 标题 77 | times = arts["publish_time"] # 时间 78 | # print(times) 79 | # 正则写的好菜 处理时间 处理后:2021-01-23 12:53:43 80 | pat = re.compile(r"(.*?)T\d\d:\d\d:\d\d") 81 | pat_check = re.compile(r"(.*?)T") 82 | 83 | time_s = pat.search(times).group().replace("T", " ") 84 | 85 | times_check = pat_check.findall(times)[0] 86 | # print(times_check) 87 | 88 | # 判断日期 89 | # if times_check != new_time: 90 | # print(times_check,new_time) 91 | # return "stop" 92 | # print(title, tags, name, picList, times) 93 | 94 | dictData = { 95 | "title": title, 96 | "text": text_content, 97 | "tags": tags, 98 | "name": name, 99 | "picList": picList, 100 | "time": time_s 101 | } 102 | # print(dictData) 103 | DataAll.append(dictData) 104 | 105 | # print(DataAll) 106 | return DataAll 107 | 108 | 109 | # getContent(999999) 110 | 111 | # 先删除目录下的所有CVS文件 112 | # 列出脚本目录下的文件 并匹配csv文件 113 | result = os.listdir(os.getcwd()) 114 | csvList=[] 115 | for csvFileName in result: 116 | if ".csv" in csvFileName: 117 | csv_path = os.path.join(os.getcwd(), csvFileName) 118 | csvList.append(csv_path) 119 | 120 | print("在脚本目录找到的.CSV文件:{}".format(csvList)) 121 | 122 | if csvList==[]: 123 | print("[Error]目录下暂无.CVS文件") 124 | else: 125 | for csvName in csvList: 126 | os.remove(csvName) 127 | print("[Suc]删除目录下的CSV文件成功!") 128 | 129 | 130 | page = 0 131 | # 初始化csv 132 | with codecs.open("{}//skyDate {}.csv".format(path, new_time), "w", encoding="utf_8_sig") as cvs_file: 133 | 134 | headers = ["title", "text", "tags", "name", "picList", "time"] # 表头 135 | writer = csv.DictWriter(cvs_file, headers) 136 | writer.writeheader() # 写表头 137 | 138 | while True: 139 | print("[suc]获取第{}条数据".format(page)) 140 | Data = getContent(page) # 字典数据 141 | # if Data == "stop": 142 | # print("[stop]获取到指定日期停止") 143 | # break 144 | with codecs.open("{}//skyDate {}.csv".format(path, new_time), "a", encoding="utf_8_sig") as cvs_file: 145 | writer = csv.DictWriter(cvs_file, headers) 146 | writer.writerows(Data) # 写入多行 147 | page = page + 60 148 | --------------------------------------------------------------------------------