├── config_file ├── filter_config.py ├── detect_config.py └── config.py ├── readme.md ├── main.py ├── cmd_tool.py ├── filter.py ├── reader.py ├── detector.py ├── tool.py └── contents.py /config_file/filter_config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | AUTHOR: MIAN 4 | DATE: 2020/11/7 5 | DESCRIBE: 配置 6 | """ 7 | # 添加新的filter应在此处配置 8 | enable = { 9 | "max_chap_len": True, 10 | "max_vol_len": True, 11 | "strict_end": True 12 | } 13 | -------------------------------------------------------------------------------- /config_file/detect_config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | AUTHOR: MIAN 4 | DATE: 2020/11/7 5 | DESCRIBE: 配置 6 | """ 7 | # 添加新的detectors应在此处配置 8 | enable = { 9 | "Chap1": True, 10 | "Chap2": True, 11 | "Chap3": True, 12 | "Vol1": True, 13 | "Vol2": True, 14 | } 15 | -------------------------------------------------------------------------------- /config_file/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | AUTHOR: MIAN 4 | DATE: 2020/10/15 5 | DESCRIBE: 配置 6 | """ 7 | from enum import Enum, auto 8 | 9 | 10 | class System(Enum): 11 | LINUX = auto() 12 | WIN = auto() 13 | 14 | 15 | env = System.WIN # 根据系统环境选 废弃 16 | para_space = 0 # 添加几个英文空格在段首 17 | para_chi_space = 2 # 添加几个中文空格在段首 18 | delete_enter = True # 删除所有空行?(不包括后面的 19 | chapter_enter = 1 # 章节名后添加几个空行 20 | volume_enter = 1 # 卷后添加几个空行 21 | text_enter = 1 # 段后额外添加几个空行 22 | over = False # 是否覆盖原文件 23 | read_code = "" # 源文件编码 置空时自动识别 24 | auto_detect_bytes = 100000 # 仅通过前几个字节判断格式,为0则全部读取。(2M的文件需要好几分钟。。。) 25 | write_code = "utf-8" # reform后编码 26 | warning_chap_len = 1500 # 卷字数小于此长度报警 27 | 28 | max_chap_len = 50 # 最大章节名长度,超过会被当成普通文本,主要还是为了避免有些特殊情况 29 | max_vol_len = 50 # TODO 移动到filter 30 | debug = True 31 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # 小说文本格式化脚本 2 | 3 | ## 作用 4 | 5 | 网络上下载地盗版小说往往排版一塌糊涂,经常会出现以下问题。 6 | 7 | - 章卷标题格式混乱 8 | - 段前空格飘忽不定 9 | - 段间距、标题空行差强人意 10 | - 段落顺序混乱、出现重复章节 11 | - 编码混乱 12 | 13 | 所以我花了几个小时写了这个脚本,主要有以下作用 14 | 15 | - 可自由配置的自动识别章卷标题,并删除没用的空格重新格式化 16 | - 对文件总体进行扫描,显示章卷结构与明显错误 17 | - 对于隐式章卷结构,或不规范的章卷结构进行优化 18 | - 删除所有段前段后空格,并按照我们的想法进行段前空格调整 19 | - 统一调整、添加空行 20 | - 对所有卷、章进行排序后删除重复的 21 | - 可以进行编码转换 22 | 23 | ## 适用范围 24 | 25 | 任何detector与filter可以识别出章卷结构的小说 26 | 27 | 可在`config_file/detect_config.py`和`config_file/filter_config.py`中自由配置过滤器和嗅探器 28 | 29 | ## 如何使用 30 | 31 | 根据`config_file`文件夹里面`config.py`的注释更改为你想要的配置。 32 | 33 | 直接运行`python main.py` 34 | 35 | ~或者下载[exe版本](https://github.com/intmian/txt_reform/releases/tag/v1.2.1),并双击使用~(此版本配置文件被打包编译无法更改,下版本会解耦到json) 36 | 37 | ## 注意 38 | 39 | 有些作者在更新小说的时候会犯迷糊,更新小说却不更新章节,导致会出现两个章节号一样的章节,内容章节名都不一样,这就很麻烦 40 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | AUTHOR: MIAN 4 | DATE: 2020/10/20 5 | DESCRIBE: 驱动程序 6 | """ 7 | import tool, contents 8 | from config_file import config 9 | import os 10 | 11 | 12 | def main(): 13 | # 各个阶段也加入打表信息,并与debug分离 14 | # 读入 15 | tool.ready("正在读入文件") 16 | addr = tool.get_file() 17 | if addr == "": 18 | return 19 | con = contents.Contents(addr) 20 | # 处理 21 | tool.ready("文本结构展示") 22 | tool.analyse_list(con.head) 23 | tool.done() 24 | con.reform() 25 | tool.ready("重整为单一文本") 26 | s = con.output() 27 | tool.done() 28 | # 输出 29 | tool.ready("导出") 30 | (filepath, temp_filename) = os.path.split(addr) 31 | (filename, extension) = os.path.splitext(temp_filename) 32 | if not config.over: 33 | addr = filepath + "/" + filename + "_修改后" + extension 34 | with open(addr, "w", encoding=config.write_code) as file: 35 | file.write(s) 36 | tool.done() 37 | 38 | 39 | if __name__ == '__main__': 40 | main() 41 | -------------------------------------------------------------------------------- /cmd_tool.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | AUTHOR: MIAN 4 | DATE: 2021/2/10 5 | DESCRIBE: 一些解耦出来的小模块 6 | """ 7 | import contents 8 | import tool 9 | from filter import FILTERS 10 | from detector import DETECTORS 11 | import sys 12 | 13 | 14 | def judge_line(s: str): 15 | """ 16 | 输出嗅探器与探测器对于 17 | :param s: 18 | """ 19 | sig = FILTERS.filt(s) 20 | ty = DETECTORS.detect(s) 21 | print(sig) 22 | print(ty[0], ty[1], ty[2]) 23 | 24 | 25 | def detect_struct(): 26 | tool.ready("正在读入文件") 27 | addr = tool.get_file() 28 | if addr == "": 29 | return 30 | con = contents.Contents(addr) 31 | # 处理 32 | tool.ready("文本结构展示") 33 | tool.analyse_list(con.head) 34 | 35 | 36 | if __name__ == '__main__': 37 | if len(sys.argv) < 2: 38 | print("参数不足") 39 | exit(0) 40 | mode = sys.argv[1] 41 | if mode == "df": 42 | s = sys.argv[2] 43 | judge_line(s) 44 | elif mode == "show": 45 | detect_struct() 46 | else: 47 | exit(0) 48 | -------------------------------------------------------------------------------- /filter.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | AUTHOR: MIAN 4 | DATE: 2020/11/7 5 | DESCRIBE: 适配过滤器 6 | """ 7 | from abc import * 8 | from typing import * 9 | import enum 10 | from config_file.config import * 11 | 12 | 13 | class SIGNAL(enum.Enum): 14 | # 由filters返回的信号 15 | REJECT_ALL = 1 # 拒绝所有 16 | REJECT_CHAP = 2 # 拒绝章 17 | REJECT_VOL = 3 # 拒绝卷 18 | REJECT_CV = 4 # 拒绝章卷 19 | REJECT_TEXT = 5 # 拒绝正文 20 | REJECT_ENTER = 6 # 拒绝空行 21 | REJECT_TE = 7 # 拒绝正文空行 22 | OK = 8 23 | 24 | 25 | class Filter(ABC): 26 | @abstractmethod 27 | def filt(self, s: str) -> SIGNAL: 28 | pass 29 | 30 | 31 | class MaxChapLen(Filter): 32 | # 超过章限长 33 | def filt(self, s: str) -> SIGNAL: 34 | if len(s) > max_chap_len: 35 | return SIGNAL.REJECT_CHAP 36 | else: 37 | return SIGNAL.OK 38 | 39 | 40 | class MaxVolLen(Filter): 41 | # 超过卷限长 42 | def filt(self, s: str) -> SIGNAL: 43 | if len(s) > max_vol_len: 44 | return SIGNAL.REJECT_VOL 45 | else: 46 | return SIGNAL.OK 47 | 48 | 49 | class StrictEnd(Filter): 50 | # 末尾不能为。 51 | def filt(self, s: str) -> SIGNAL: 52 | if s == "": 53 | return SIGNAL.OK 54 | if s[-1] in ["。"]: 55 | return SIGNAL.REJECT_CV 56 | return SIGNAL.OK 57 | 58 | 59 | class Filters: 60 | def __init__(self): 61 | from config_file.filter_config import enable 62 | t = [] 63 | # 新的filter这里也要配好 64 | if enable["max_chap_len"]: 65 | t.append(MaxChapLen()) 66 | if enable["max_vol_len"]: 67 | t.append(MaxVolLen()) 68 | if enable["strict_end"]: 69 | t.append(StrictEnd()) 70 | self.filters = t 71 | 72 | def filt(self, s: str) -> []: 73 | t = [] 74 | for f in self.filters: 75 | r = f.filt(s) 76 | if r not in t: 77 | t.append(r) 78 | return t 79 | 80 | 81 | FILTERS = Filters() 82 | -------------------------------------------------------------------------------- /reader.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | AUTHOR: MIAN 4 | DATE: 2020/10/19 5 | DESCRIBE: 一个用来处理输入流的类 6 | """ 7 | import chardet 8 | 9 | import contents 10 | from config_file import config 11 | import tool 12 | import re 13 | import filter, detector 14 | 15 | 16 | # 记录一个非常奇怪的现象我写from contents import * 程序就报错说类未定义,令人不解 17 | # 可能是因为contents还没完整跑完一遍就跳到这了,其实我也不清楚 18 | 19 | class Reader: 20 | # 从输入流中读入 21 | def __init__(self, file_addr: str): 22 | """ 23 | :param file_addr: 需要被读入的地址 24 | """ 25 | self.exhausted = False # 因为new里面有个迭代器可以调用,当一轮跑完时重新跑就返回None 26 | self.addr = file_addr 27 | 28 | if config.read_code != "": 29 | self.f = open(file_addr, 'r', encoding=config.read_code) 30 | else: 31 | tool.ready("识别格式") 32 | # 检验格式 33 | f3 = open(file=file_addr, mode='rb') # 以二进制模式读取文件 34 | data = None 35 | if config.auto_detect_bytes == 0: 36 | data = f3.read() # 获取文件内容 37 | else: 38 | data = f3.read(config.auto_detect_bytes) 39 | f3.close() # 关闭文件 40 | result = chardet.detect(data) 41 | code_type = result["encoding"] 42 | print(" 格式为:", code_type) 43 | print(" 置信度:", result["confidence"]) 44 | if code_type == "GB2312": 45 | # 很奇怪的是识别为GB2312但是并无法完全解码。。。。可能置信度不足100% 46 | code_type = "GBK" 47 | print(" GB2312默认转为GBK") 48 | self.f = open(file_addr, 'r', encoding=code_type) 49 | tool.done() 50 | tool.done() 51 | 52 | def __del__(self): 53 | self.f.close() 54 | 55 | def gene(self): 56 | """ 57 | 返回Content迭代器,如果为空则返回None 58 | """ 59 | text = self.f.read() 60 | temp = "" 61 | for c in text: 62 | # 注意:用python的r模式读文件时根据系统的区别会用\n代替\n\r 63 | if c == "\n": 64 | temp += c 65 | # 将有效内容拿出 66 | t = "" 67 | if config.env == config.System.WIN: 68 | if temp[-1:] == tool.newline(): 69 | t = temp[:-1] 70 | else: 71 | continue 72 | elif config.env == config.System.LINUX: 73 | if temp[-1:] == tool.newline(): 74 | t = temp[:-1] 75 | else: 76 | continue 77 | else: 78 | pass 79 | # todo:报错,出现了不一样的换行符 80 | t = t.strip() # 去除首尾空格 81 | signals = filter.FILTERS.filt(t) 82 | ty, num, name = detector.DETECTORS.detect(t) 83 | if ty == detector.TYPE.CHAPTER and \ 84 | filter.SIGNAL.REJECT_CHAP not in signals and \ 85 | filter.SIGNAL.REJECT_ALL not in signals and \ 86 | filter.SIGNAL.REJECT_CV not in signals: 87 | # 章 88 | yield contents.Chapter(int(num), name) 89 | elif ty == detector.TYPE.VOLUME and \ 90 | filter.SIGNAL.REJECT_VOL not in signals and \ 91 | filter.SIGNAL.REJECT_ALL not in signals and \ 92 | filter.SIGNAL.REJECT_CV not in signals: 93 | # 卷 94 | yield contents.Volume(int(num), name) 95 | elif ty == detector.TYPE.ENTER: 96 | # 空行 97 | yield contents.Enter() 98 | elif ty == detector.TYPE.TEXT: 99 | # 段 100 | yield contents.Text(name) 101 | else: 102 | yield contents.Text(name) 103 | temp = "" 104 | continue 105 | else: 106 | temp += c 107 | yield None 108 | 109 | 110 | if __name__ == '__main__': 111 | addr = tool.get_file() 112 | r = Reader(addr) 113 | a = [] 114 | for r in r.gene(): 115 | a.append(r) 116 | 117 | print(a) 118 | -------------------------------------------------------------------------------- /detector.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | AUTHOR: MIAN 4 | DATE: 2020/10/28 5 | DESCRIBE: 章卷嗅探器 6 | """ 7 | import re 8 | from abc import * 9 | from typing import * 10 | import enum 11 | 12 | from cn2an import cn2an 13 | 14 | import tool 15 | 16 | 17 | class TYPE(enum.Enum): 18 | CHAPTER = 1 19 | VOLUME = 2 20 | # 合理情况下以下的不会被访问到 21 | TEXT = 3 22 | ENTER = 4 23 | 24 | 25 | class Detector(ABC): 26 | @abstractmethod 27 | def detect(self, s) -> (bool, TYPE): 28 | pass 29 | 30 | @abstractmethod 31 | def num(self, s) -> str: 32 | pass 33 | 34 | @abstractmethod 35 | def name(self, s) -> str: 36 | pass 37 | 38 | 39 | def uni_get_num(t: str) -> str: 40 | """ 41 | 从t中取出第一个中文或阿拉伯数字 42 | :param t: 来源 43 | :return: 取出的数字 44 | """ 45 | num = re.findall("[0-90-9零一二三四五六七八九十百千万]+", t)[0] 46 | return str(cn2an(num, "smart")) 47 | # # 转换为阿拉伯 48 | # if num == "十": 49 | # num = "10" 50 | # elif num[0] == "十": 51 | # num = "1" + num 52 | # elif num[-1] == "十": 53 | # num += "0" 54 | # elif num[-1] == "百": 55 | # num += "00" 56 | # elif num[-1] == "千": 57 | # num += "000" 58 | # elif num[-1] == "万": 59 | # num += "0000" 60 | # s = "" 61 | # n = {"1": "1", "2": "2", "3": "3", "4": "4", "5": "5", "6": "6", "7": "7", "8": "8", "9": "9", 62 | # "0": "0", 63 | # "一": "1", "二": "2", "三": "3", "四": "4", "五": "5", "六": "6", "七": "7", "八": "8", "九": "9", 64 | # "零": "0", 65 | # "十": "", "百": "", "千": "", "万": ""} 66 | # for c in num: 67 | # if c in n: 68 | # s = s + n[c] 69 | # else: 70 | # s += c 71 | # return s 72 | 73 | 74 | # todo: 看一看有没有比match更好的,其实不需要返回值 75 | class Chap1(Detector): 76 | # 第*章 *** 77 | def detect(self, s) -> (bool, TYPE): 78 | return re.match(" *(第)? *0*[0-90-9零一二三四五六七八九十百千万]+ *章 .*", s), TYPE.CHAPTER 79 | 80 | def num(self, s) -> str: 81 | return uni_get_num(s) 82 | 83 | def name(self, s) -> str: 84 | name = s[s.find("章") + 1:].strip() # 章节名,如果没有就是空字符串构造函数里面有处理 85 | if name == "章": # 这个和上面的做法合并才能搞出正确的结果 86 | name = "" 87 | return name 88 | 89 | 90 | class Chap2(Detector): 91 | # *、 *** 92 | def detect(self, s) -> (bool, TYPE): 93 | return re.match(" *[0-90-9零一二三四五六七八九十百千万]+、【.*】*", s), TYPE.CHAPTER 94 | 95 | def num(self, s) -> str: 96 | return uni_get_num(s) 97 | 98 | def name(self, s) -> str: 99 | name = s[s.find("、") + 1:].strip() # 章节名,如果没有就是空字符串构造函数里面有处理 100 | if name == "、": # 这个和上面的做法合并才能搞出正确的结果 101 | name = "" 102 | return name 103 | 104 | 105 | class Chap3(Detector): 106 | # 第*章 107 | def detect(self, s) -> (bool, TYPE): 108 | return re.match(" *(第)?[0-90-9零一二三四五六七八九十百千万]+章", s), TYPE.CHAPTER 109 | 110 | def num(self, s) -> str: 111 | return uni_get_num(s) 112 | 113 | def name(self, s) -> str: 114 | return "" 115 | 116 | 117 | class Vol1(Detector): 118 | # 第*卷 aaa 119 | def detect(self, s) -> (bool, TYPE): 120 | return re.match(" *第 *0*[0-90-9零一二三四五六七八九十百千万]+ *卷 *.*", s), TYPE.VOLUME 121 | 122 | def num(self, s) -> str: 123 | return uni_get_num(s) 124 | 125 | def name(self, s) -> str: 126 | name = s[s.find("卷") + 1:].strip() # 章节名,如果没有就是空字符串构造函数里面有处理 127 | if name == "卷": # 这个和上面的做法合并才能搞出正确的结果 128 | name = "" 129 | return name 130 | 131 | 132 | class Vol2(Detector): 133 | # 第*卷 134 | def detect(self, s) -> (bool, TYPE): 135 | return re.match(" *第[0-90-9零一二三四五六七八九十百千万]+卷", s), TYPE.VOLUME 136 | 137 | def num(self, s) -> str: 138 | return uni_get_num(s) 139 | 140 | def name(self, s) -> str: 141 | name = s[s.find("卷") + 1:].strip() # 章节名,如果没有就是空字符串构造函数里面有处理 142 | if name == "卷": # 这个和上面的做法合并才能搞出正确的结果 143 | name = "" 144 | return name 145 | 146 | 147 | class Detectors: 148 | def __init__(self): 149 | # 添加新的detectors应在此处配置 150 | t = [] 151 | from config_file.detect_config import enable 152 | if enable["Chap1"]: 153 | t.append(Chap1()) 154 | if enable["Chap2"]: 155 | t.append(Chap2()) 156 | if enable["Chap3"]: 157 | t.append(Chap3()) 158 | if enable["Vol1"]: 159 | t.append(Vol1()) 160 | if enable["Vol2"]: 161 | t.append(Vol2()) 162 | 163 | self.detectors = t 164 | 165 | def detect(self, s: str) -> (TYPE, str, str): 166 | """ 167 | 将文本解析为具体的content类型,并提取内容 168 | :param s: 文本 169 | :return : (类型,数字(仅对于章卷),内容) 170 | """ 171 | if s == "": 172 | return TYPE.ENTER, "", "" 173 | for d in self.detectors: 174 | b, t = d.detect(s) 175 | if b: 176 | return t, d.num(s), d.name(s) 177 | return TYPE.TEXT, "", s 178 | 179 | 180 | DETECTORS = Detectors() 181 | -------------------------------------------------------------------------------- /tool.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | AUTHOR: MIAN 4 | DATE: 2020/10/15 5 | DESCRIBE: 基础设备 6 | """ 7 | 8 | from config_file import config 9 | import contents 10 | 11 | 12 | def get_file() -> str: 13 | """根据不同的系统,进行不同操作,返回路径 14 | :return: 返回选中的文件路径 15 | """ 16 | if config.env == config.System.WIN: 17 | import win32ui 18 | import win32con 19 | dlg = win32ui.CreateFileDialog(1, None, None, win32con.OFN_OVERWRITEPROMPT, 20 | "Text Files (*.txt)|*.txt||") # 1表示打开文件对话框 21 | dlg.SetOFNInitialDir("") # 设置打开文件对话框中的初始显示目录 22 | dlg.DoModal() 23 | return dlg.GetPathName() # 获取选择的文件名称 24 | if config.env == config.System.LINUX: 25 | return input("请输入地址:") 26 | 27 | 28 | def newline() -> str: 29 | """生成不同系统对应的换行符 30 | :return: 返回换行符 31 | """ 32 | return {config.System.WIN: "\n", config.System.LINUX: "\n"}[config.env] 33 | 34 | 35 | def space_para() -> str: 36 | """ 37 | :return: 返回段首空格 38 | """ 39 | # 所谓中文空格就是全角空格 40 | return config.para_space * " " + config.para_chi_space * " " 41 | 42 | 43 | def debug_list(head): 44 | """ 45 | 打印链表 46 | """ 47 | p = head 48 | last_cap = -1 49 | 50 | while p is not None: 51 | if type(p) is contents.Chapter: 52 | # 省略中间连续的,更快找出症结 53 | n = p.num 54 | if n == last_cap + 1: 55 | pass 56 | elif n == 1: 57 | print(" 章 ", 1) 58 | else: 59 | print(" ...") 60 | print(" 章 ", last_cap) 61 | err = "" 62 | if last_cap == n: 63 | err = "[与上一章重复]" 64 | if last_cap < n - 1: 65 | err = "[与上一章不连续,中间缺章]" 66 | if last_cap > n: 67 | # 此处不严谨,应该结合下个章节再判断 68 | err = "[与上一章不连续,错章]" 69 | print(" 章 ", n, err) 70 | last_cap = n 71 | elif type(p) is contents.Volume: 72 | if last_cap != -1: 73 | print(" ...") 74 | print(" 章 ", last_cap) 75 | print("卷 ", p.num) 76 | else: 77 | print("卷 ", p.num) 78 | elif type(p) is contents.Text: 79 | pass 80 | elif type(p) is contents.Enter: 81 | pass 82 | p = p.next 83 | print(" ...") 84 | print(" 章 ", last_cap) 85 | 86 | 87 | def analyse_list(head): 88 | """ 89 | 打印链表,并分析错误 90 | """ 91 | node = head 92 | nodes = [] 93 | while node is not None: 94 | if type(node) is contents.Chapter: 95 | nodes.append((1, node.num)) 96 | elif type(node) is contents.Volume: 97 | nodes.append((2, node.num)) 98 | node = node.next 99 | 100 | volumes = [] 101 | p = 0 102 | chaps = [] # 卷前章 103 | v_nums = [] 104 | while p != len(nodes): 105 | n = nodes[p] 106 | if n[0] == 1: 107 | chaps.append(n[1]) 108 | p += 1 109 | if n[0] == 2: 110 | v_nums.append(n[1]) 111 | if len(chaps) != 0 and len(volumes) == 0: 112 | volumes.append(chaps) 113 | # 卷前章 114 | p2 = p + 1 115 | chaps = [] 116 | while p2 != len(nodes): 117 | t1, n1 = nodes[p2] 118 | if t1 == 2: 119 | break 120 | if t1 == 1: 121 | chaps.append(n1) 122 | p2 += 1 123 | volumes.append(chaps) 124 | p = p2 125 | if len(volumes) == 0: 126 | volumes.append(chaps) # 当没有章卷结构时 127 | for v in volumes: 128 | min_i = 0 129 | max_i = len(v) - 1 130 | for i in range(len(v)): 131 | if min_i < i < max_i: 132 | # 上面这个是什么玩意。。。 133 | if (v[i - 1] + 1 == v[i] or v[i-1] == -1) and v[i] + 1 == v[i + 1]: 134 | v[i] = -1 135 | for vol_index in range(len(volumes)): 136 | v = volumes[vol_index] 137 | print("卷", v_nums[vol_index]) 138 | skip = False 139 | for i in range(len(v)): 140 | err = "" 141 | chap = v[i] 142 | # 末章单独处理 143 | if i == len(v) - 1: 144 | print(" 章", chap) 145 | continue 146 | # 第一章 147 | if i == 0: 148 | if chap != 1 and chap != 0: 149 | err = "[本卷不以第一章开头]" 150 | print(" 章", chap, err) 151 | continue 152 | else: 153 | print(" 章", chap, err) 154 | continue 155 | pre = v[i - 1] 156 | next_n = v[i + 1] 157 | # 选择性打印省略号 158 | if chap == -1: 159 | if not skip: 160 | print(" ...") 161 | skip = True 162 | continue 163 | else: 164 | continue 165 | else: 166 | skip = False 167 | if pre == -1: 168 | print(" 章", chap, err) 169 | continue 170 | if v[i - 2] + 1 == chap: 171 | continue 172 | 173 | if pre + 2 == next_n: 174 | err = "[乱章]" 175 | elif pre < chap: 176 | err = "[缺章]" 177 | elif pre > chap: 178 | err = "[错章]" 179 | elif pre == chap: 180 | err = "[重复]" 181 | print(" 章", chap, err) 182 | 183 | 184 | def all_do_list(head, func): 185 | """ 186 | 对于每一个链表中的单元做func,注意:如果对于p点附近的节点做更改可能导致p失效 187 | :param head: 头节点 188 | :param func: 需要做的事func(p) 189 | """ 190 | p = head 191 | while p is not None: 192 | func(p) 193 | p = p.next 194 | 195 | 196 | def ready(s): 197 | print(s + "... ") 198 | 199 | 200 | def done(): 201 | print("done") 202 | 203 | 204 | if __name__ == '__main__': 205 | print(get_file()) 206 | -------------------------------------------------------------------------------- /contents.py: -------------------------------------------------------------------------------- 1 | # -*- coding: UTF-8 -*- 2 | """ 3 | AUTHOR: MIAN 4 | DATE: 2020/10/15 5 | DESCRIBE: 用来存放数据的contents 6 | """ 7 | from abc import * 8 | from typing import * 9 | import tool 10 | import reader 11 | from config_file import config 12 | 13 | 14 | class Content(ABC): 15 | # 接口,为方便使用以链表形式组织 16 | Num = 0 17 | Head = None # 头节点 18 | 19 | @abstractmethod 20 | def __init__(self): 21 | Content.Num += 1 22 | self.last = None 23 | self.next = None 24 | self.reformed = False 25 | self.text = None 26 | if Content.Head is not None: 27 | Content.Head = self 28 | 29 | def inject(self, last: object): 30 | """ 31 | 将此节点插入到某点之后 32 | :param last: 希望被插入的节点 33 | """ 34 | if last is None: 35 | # 由程序结构决定只有这种可能 36 | Content.Head = self 37 | return 38 | t = last.next 39 | last.next = self 40 | self.last = last 41 | self.next = t 42 | if t is not None: 43 | t.last = self 44 | 45 | def delete(self): 46 | """将自己从链表中脱链 47 | """ 48 | Content.Num -= 1 49 | if self.last is None: 50 | Content.head = None 51 | # 头节点,函数链需特别注意 52 | elif self.next is None: 53 | self.last.next = None 54 | # self.last = None 不必进行这一步,不清空可以让迭代器不失效 55 | # 尾节点 56 | else: 57 | self.last.next = self.next 58 | self.next.last = self.last 59 | # self.next = None 60 | # self.last = None 61 | 62 | def swap(self, node): 63 | """与某点 64 | :param node: 65 | """ 66 | if self == node: 67 | return 68 | if node.next == self: 69 | # 减少需要处理的情况 70 | node.swap(self) 71 | return 72 | 73 | if self == Content.Head: 74 | Content.Head = node 75 | elif node == Content.Head: 76 | Content.Head = self 77 | 78 | sl = self.last 79 | sn = self.next 80 | nl = node.last 81 | nn = node.next 82 | 83 | if self.next == node: 84 | # 相邻 85 | if sl is not None: 86 | sl.next = node 87 | node.next = self 88 | self.next = nn 89 | if nn is not None: 90 | nn.last = self 91 | self.last = node 92 | node.last = sl 93 | return 94 | 95 | node.next = sn 96 | node.last = sl 97 | self.last = nl 98 | self.next = nn 99 | # 交换本身 100 | if sl is not None: 101 | sl.next = node 102 | if sn is not None: 103 | sn.last = node 104 | if nl is not None: 105 | nl.next = self 106 | if nn is not None: 107 | nn.last = self 108 | 109 | @abstractmethod 110 | def reform(self): 111 | """将自身格式化 112 | """ 113 | self.reformed = True 114 | 115 | @abstractmethod 116 | def output(self) -> str: 117 | """输出内容 118 | """ 119 | return self.text 120 | 121 | 122 | class Text(Content): 123 | # 单行文本 124 | def __init__(self, text: str): 125 | super().__init__() 126 | self.text = text 127 | 128 | def reform(self): 129 | super().reform() 130 | # 去除首位的空格 131 | self.text = self.text.strip() 132 | # 补空格 133 | self.text = tool.space_para() + self.text + tool.newline() 134 | for i in range(config.text_enter): 135 | self.text += tool.newline() 136 | 137 | def output(self) -> str: 138 | return self.text 139 | 140 | 141 | class Enter(Content): 142 | # 空行 143 | def __init__(self): 144 | super().__init__() 145 | self.text = tool.newline() 146 | 147 | def reform(self): 148 | super().reform() 149 | 150 | def output(self) -> str: 151 | return self.text 152 | 153 | 154 | class Chapter(Content): 155 | # 第?章 156 | def __init__(self, n: int, name: str): 157 | super().__init__() 158 | self.text = "第{}章".format(n) 159 | self.num = n # 用来作比较的,留着 160 | if name != "": 161 | self.text += " " + name 162 | self.text += tool.newline() 163 | self.child = [] 164 | for i in range(config.chapter_enter): 165 | self.child.append(Enter()) 166 | # 预先添加 167 | 168 | def reform(self): 169 | """ 170 | 将后面的正文节点全部折叠进此章节点内 171 | """ 172 | # 将所有的子成员脱链后放进child数组 173 | super().reform() 174 | p = self.next 175 | while p is not None: 176 | if type(p) is Chapter: 177 | break 178 | elif type(p) is Volume: 179 | break 180 | elif type(p) is Text: 181 | p.reform() 182 | self.child.append(p) 183 | p.delete() 184 | elif type(p) is Enter: 185 | if config.delete_enter: 186 | p.delete() # 重整空行 187 | else: 188 | self.child.append(p) 189 | p.reform() 190 | p = p.next 191 | 192 | def output(self) -> str: 193 | r = self.text 194 | for c in self.child: 195 | r += c.output() 196 | return r 197 | 198 | 199 | class Volume(Content): 200 | # 第?卷 201 | def __init__(self, n: int, name: str): 202 | super().__init__() 203 | self.text = "第{}卷".format(n) 204 | self.num = n # 用来作比较的,留着 205 | if name != "": 206 | self.text += " " + name 207 | self.text += tool.newline() 208 | self.child = [] 209 | for i in range(config.volume_enter): 210 | self.child.append(Enter()) 211 | # 预先添加 212 | self.chap_num = 0 # TODO:增加更加泛用的统计 213 | 214 | def reform(self): 215 | # 将所有的子成员脱链后放进child数组 216 | super().reform() 217 | p = self.next 218 | while p is not None: 219 | if type(p) is Chapter: 220 | p.reform() 221 | self.child.append(p) 222 | self.chap_num += 1 223 | elif type(p) is Volume: 224 | break 225 | elif type(p) is Text: # 仅文字空行被收入列表 226 | if self.chap_num == 0: 227 | # TODO: 此处以及其他地方可以确认是否插入卷语与书语作为隐式章节 228 | 229 | # 卷前语,仅在本卷第一章可能存在 230 | if len(self.child) == config.volume_enter: 231 | # 章节仅存在卷前回车 232 | if config.debug: 233 | print("第", self.num, "卷插入卷前语") 234 | c = Chapter(0, "卷前语") 235 | c.reform() 236 | self.inject(c) 237 | self.child.append(c) 238 | 239 | p.reform() 240 | self.child.append(p) 241 | p.delete() 242 | else: 243 | print("第", self.num, "卷,出现格式化错误,请检查程序") 244 | elif type(p) is Enter: 245 | if config.delete_enter: 246 | p.delete() # 重整空行 247 | else: 248 | self.child.append(p) 249 | p.reform() 250 | p = p.next 251 | 252 | def output(self) -> str: 253 | r = self.text 254 | if len(self.child) <= 10 and self.text != "第0卷 书前语\n": 255 | # 不用考虑吧不在卷内的章的情况 256 | print("第", self.num, "卷章节过少,请进行检查") 257 | for c in self.child: 258 | t = c.output() 259 | if type(c) is Chapter: 260 | if len(t) < 2000 and c.text != "第0章 卷前语\n": 261 | # 因为 章一定在卷内所以字数判断放这里 262 | print("第", self.num, "卷 第", c.num, "章字数过少,请进行检查") 263 | # 这里会出现一个黄色提示,是因为pycharm只识别到我在v里面加了enter,而识别不到reform环节加的其他 264 | r += t 265 | return r 266 | 267 | 268 | class Contents: 269 | # 容纳所有content 270 | def __init__(self, addr: str): 271 | """ 272 | :param addr:需要被格式化的字符串地址 273 | """ 274 | # 首尾指针 275 | self.head = Content.Head 276 | self.last = Content.Head 277 | self.reader = reader.Reader(addr) 278 | self.child = [] 279 | no_chap = True 280 | no_volume = True 281 | tool.ready("生成文本链") 282 | for a in self.reader.gene(): 283 | if a is not None: 284 | if self.head is None: 285 | self.head = a 286 | a.inject(self.last) 287 | self.last = a 288 | if type(a) is Chapter: 289 | no_chap = False 290 | if type(a) is Volume: 291 | no_volume = False 292 | tool.done() 293 | 294 | tool.ready("插入隐式章卷") 295 | if no_chap: 296 | # 就是一段话没有章节划分 297 | c = Chapter(1, "总章") 298 | c.inject(self.head) 299 | c.swap(self.head) # 插在最前 300 | self.head = c 301 | if no_volume: 302 | # 不分卷 303 | no = None 304 | c = None 305 | if type(self.head) is not Chapter: 306 | # 需要插入卷前语的时候 307 | 308 | no = 0 309 | c = Volume(no, "书前语") 310 | if config.debug: 311 | print("插入隐式书前语") 312 | else: 313 | no = 1 314 | c = Volume(no, "自动生成卷") 315 | if config.debug: 316 | print("插入隐式卷", no) 317 | 318 | c.inject(self.head) 319 | c.swap(self.head) # 插在最前 320 | self.head = c 321 | last_chap = -1 # 上一章章节号 322 | 323 | # 扫描以生成隐式张卷 324 | p = self.head 325 | while p is not None: 326 | if type(p) is Chapter: 327 | if p.num == 1 and (last_chap == -1 or last_chap > 5): 328 | # 为了避免重复的第一章或短范围乱章被当成新的卷 329 | no += 1 330 | v = Volume(no, "自动生成卷") 331 | v.inject(p.last) 332 | if config.debug: 333 | print("插入隐式卷", no) 334 | last_chap = p.num 335 | p = p.next 336 | else: 337 | # 有卷的时候也需要检查是否需要插入书首语 338 | if type(self.head) is not Volume: 339 | # 需要插入书前语的时候 340 | if config.debug: 341 | print("插入隐式书前语") 342 | c = Volume(0, "书前语") # 对于存在0为卷号的情况会触发问题进行修复(我决定不修复了 TODO:修不修?) 343 | c.inject(self.head) 344 | c.swap(self.head) # 插在最前 345 | 346 | self.head = c 347 | tool.done() 348 | 349 | def reform(self): 350 | """进行重整 351 | """ 352 | p = self.head 353 | if p is None: 354 | p = Text("空文本") 355 | tool.ready("对各内容节点进行格式化并归并为连续结构") 356 | while p is not None: 357 | if type(p) is Chapter: 358 | p.reform() # todo:这个情况是不可能的,应该抛个错误 359 | elif type(p) is Volume: 360 | p.reform() 361 | self.child.append(p) 362 | elif type(p) is Text: 363 | p.reform() 364 | self.child.append(p) 365 | elif type(p) is Enter: 366 | if config.delete_enter: 367 | p.delete() # 重整空行 368 | else: 369 | p.reform() 370 | self.child.append(p) 371 | p = p.next 372 | tool.done() 373 | self.cv_sort() 374 | self.delete_reduplicate() 375 | 376 | def delete_reduplicate(self): 377 | """ 378 | 递归删除child的重复章卷 379 | """ 380 | tool.ready("删除重复章卷") 381 | # 删除重复卷 382 | new_child = [] 383 | last_num = -1 # 上一个章节号 384 | for c in self.child: 385 | if type(c) is Text: 386 | new_child.append(c) 387 | if type(c) is Enter: 388 | new_child.append(c) 389 | if type(c) is Volume: 390 | if c.num != last_num: 391 | new_child.append(c) 392 | last_num = c.num 393 | # todo: 优化重复章删除,优先删除空章,而非删除除第一个的 394 | # 删除重复章 395 | new_child2 = [] 396 | last_num_2 = -1 # 上一个章节号 397 | for cc in c.child: 398 | if type(cc) is Text: 399 | new_child2.append(cc) 400 | if type(cc) is Enter: 401 | new_child2.append(cc) 402 | if type(cc) is Chapter: 403 | if cc.num != last_num_2: 404 | new_child2.append(cc) 405 | last_num_2 = cc.num 406 | else: 407 | if config.debug: 408 | print("第", c.num, "卷 第", cc.num, "章重复已被删除") 409 | c.child = new_child2 410 | else: 411 | if config.debug: 412 | print("第", c.num, "卷重复已被删除") 413 | self.child = new_child 414 | tool.done() 415 | 416 | def cv_sort(self): 417 | """ 418 | 递归排序卷和章 419 | """ 420 | 421 | # 排序必须是稳定的,不然卷前语和书前语顺序可能改变 422 | def key(a): 423 | # 专门为内部排序写的,只考虑可能的情况.所有非段卷只可能在最前面,保持不变 424 | if type(a) is Text or type(a) is Enter: 425 | return 0 426 | else: 427 | return a.num 428 | 429 | tool.ready("章卷排序") 430 | for c in self.child: 431 | if type(c) is Volume: 432 | # 每一卷进行卷内排序 433 | c.child.sort(key=key) 434 | # python3 删除了自定义的比较函数,所以只能这样写... 435 | self.child.sort(key=key) 436 | tool.done() 437 | 438 | def output(self) -> str: 439 | r = "" 440 | for c in self.child: 441 | r += c.output() 442 | return r 443 | --------------------------------------------------------------------------------