├── .editorconfig ├── .eslintignore ├── .eslintrc.yml ├── .gitignore ├── .husky └── pre-commit ├── .mocharc.yml ├── .npmrc ├── .vscode └── launch.json ├── CHANGELOG.md ├── Jakefile.ts ├── LICENSE ├── NOTES.md ├── README.md ├── assets └── templates │ ├── cover.html │ ├── default.css │ ├── default.html │ └── epub │ ├── META-INF │ ├── com.apple.ibooks.display-options.xml │ └── container.xml │ └── OEBPS │ ├── content.opf │ ├── content.xhtml │ ├── cover.xhtml │ ├── nav.xhtml │ └── toc.ncx ├── bak ├── index.main.js └── index.worker.js ├── bin ├── .dev └── weread-spy.js ├── dev-notes └── 2023-10-21.md ├── docs └── epub.md ├── package.json ├── pnpm-lock.yaml ├── pre-generated ├── link.txt └── 红楼梦(全集).epub ├── prettier.config.cjs ├── scripts └── build.ts ├── src ├── bin.ts ├── commands │ ├── check.ts │ ├── download.ts │ ├── gen.ts │ ├── info.ts │ ├── launch.ts │ └── one.ts ├── common │ ├── books-map.ts │ └── index.ts ├── typings │ └── prettier-config.js └── utils │ ├── Book.ts │ ├── EpubModel │ └── index.ts │ ├── epub-img.ts │ ├── epub.ts │ ├── epubcheck.ts │ ├── pptr-anti-spider │ ├── bak.txt │ └── index.ts │ ├── pptr.ts │ └── processContent │ ├── example-start-info.json │ ├── index.ts │ └── worker │ ├── index.main.ts │ └── index.worker.ts ├── test ├── .gitkeep └── mocha.opts ├── tsconfig.json └── tsup.config.ts /.editorconfig: -------------------------------------------------------------------------------- 1 | root = true 2 | 3 | [*] 4 | indent_style = space 5 | indent_size = 2 6 | charset = utf-8 7 | trim_trailing_whitespace = true 8 | insert_final_newline = true 9 | 10 | [*.md] 11 | trim_trailing_whitespace = false 12 | -------------------------------------------------------------------------------- /.eslintignore: -------------------------------------------------------------------------------- 1 | /lib 2 | /dist 3 | -------------------------------------------------------------------------------- /.eslintrc.yml: 
-------------------------------------------------------------------------------- 1 | # root 2 | root: true 3 | 4 | parser: '@typescript-eslint/parser' 5 | 6 | plugins: 7 | - '@typescript-eslint' 8 | 9 | extends: 10 | - '@magicdawn' 11 | - 'plugin:@typescript-eslint/recommended' 12 | - prettier 13 | 14 | rules: 15 | '@typescript-eslint/ban-ts-comment': off 16 | '@typescript-eslint/no-extra-semi': off 17 | '@typescript-eslint/no-unused-vars': off 18 | '@typescript-eslint/explicit-module-boundary-types': off 19 | '@typescript-eslint/ban-types': off 20 | '@typescript-eslint/no-explicit-any': off 21 | prefer-const: warn 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *-debug.log 2 | *-error.log 3 | *.log 4 | .DS_Store 5 | .nyc_output 6 | /.nyc_output 7 | /dist 8 | /lib 9 | /package-lock.json 10 | /tmp 11 | coverage 12 | coverage.* 13 | data/ 14 | logs 15 | node_modules 16 | npm-debug.log* 17 | yarn.lock 18 | 19 | *.tsbuildinfo 20 | 21 | # yarn2 22 | .yarn/* 23 | !.yarn/patches 24 | !.yarn/releases 25 | !.yarn/plugins 26 | !.yarn/sdks 27 | !.yarn/versions 28 | .pnp.* 29 | 30 | # ts 31 | **/*.tsbuildinfo 32 | 33 | # lib 34 | /lib-test 35 | 36 | # for dev 37 | /example 38 | -------------------------------------------------------------------------------- /.husky/pre-commit: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | . 
"$(dirname "$0")/_/husky.sh" 3 | 4 | pnpm lint-staged 5 | -------------------------------------------------------------------------------- /.mocharc.yml: -------------------------------------------------------------------------------- 1 | require: [should] 2 | reporter: spec 3 | timeout: 5000 4 | recursive: true 5 | -------------------------------------------------------------------------------- /.npmrc: -------------------------------------------------------------------------------- 1 | registry=https://registry.npmmirror.com/ 2 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | "configurations": [ 3 | { 4 | "type": "node", 5 | "request": "launch", 6 | "name": "bin.ts", 7 | "runtimeArgs": ["-r", "ts-node/register"], 8 | "args": ["${workspaceFolder}/src/bin.ts"], 9 | "autoAttachChildProcesses": false 10 | }, 11 | { 12 | "type": "node", 13 | "request": "launch", 14 | "name": "gen-epub", 15 | "runtimeArgs": ["-r", "ts-node/register"], 16 | "args": ["${workspaceFolder}/src/bin.ts", "gen-epub", "910419"], 17 | "autoAttachChildProcesses": false 18 | } 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # CHANGELOG 2 | 3 | ## v0.7.6 4 | 5 | - fix not existing issue when success 6 | 7 | ## v0.7.5 2023-10-21 8 | 9 | - update deps 10 | - use `process.env` to fix `__vue__` exposure 11 | 12 | ## v0.7.4 2023-08-26 13 | 14 | - update deps 15 | 16 | ## v0.7.3 2023-07-21 17 | 18 | - chore: update puppeteer, proxy-agent related 19 | 20 | ## v0.7.2 2023-07-19 21 | 22 | - fix: use store.subscribe, cause pptr modify fails. 
23 | 24 | ## v0.7.1 2023-07-19 25 | 26 | - feat: use epubcheck-assets 27 | - feat: rm execa, use plain `child_process.execSync` 28 | 29 | ## v0.7.0 2023-07-19 30 | 31 | - fix `__vue__` exposure 32 | 33 | ## v0.6.0 2023-06-20 34 | 35 | - 支持多页 36 | 37 | ### v0.5.2 2023-04-26 38 | 39 | - 升级 w3c/epubcheck 40 | 41 | ### v0.5.1 2023-04-26 42 | 43 | - 清理更多没有使用的依赖 44 | 45 | ### v0.5.0 2023-04-26 46 | 47 | - 添加 `weread-spy info` 命令 48 | - 添加 `DEBUG_PROCESS_CONTENT=1` 支持, 不开启 workers process content 49 | - 添加 `weread-spy gen -D/--debug/--decompress `, `-D` 解压缩 `.ePub` 文件, 方便 debug 50 | - 修复 cheerio, xml + cjk + pre/code 的处理, see https://github.com/cheeriojs/cheerio/issues/1198 51 | - 移除 gulp, 移除 globby, 直接用 fast-glob 更好 52 | 53 | ### v0.4.0 2023-03-15 54 | 55 | - 修复 htchapterContentHtml 抓取 56 | 57 | ### v0.3.0 2022-12-25 58 | 59 | - 强制打开微信读书 `__vue__` 属性的使用 60 | 61 | ### v0.2.0 2022-09-03 62 | 63 | - `one` / `dl` 命令新增 `--interval <毫秒数>` 切换章节间隔 64 | 65 | ### v0.1.1 2022-07-09 66 | 67 | - map.json 结构调整, 数据文件夹命名调整 68 | 69 | ### v0.1.0 2022-07-09 70 | 71 | - first publish on npm 72 | 73 | ### v0.0.1 2020-09-12 74 | 75 | - first release 76 | -------------------------------------------------------------------------------- /Jakefile.ts: -------------------------------------------------------------------------------- 1 | import { execSync } from 'child_process' 2 | import fse from 'fs-extra' 3 | import path from 'path' 4 | import { version } from './package.json' 5 | 6 | const exec = (cmd: string) => { 7 | console.log('[exec]: %s', cmd) 8 | execSync(cmd, { stdio: 'inherit' }) 9 | } 10 | 11 | desc('show available tasks') 12 | task('default', () => { 13 | exec('jake -t') 14 | }) 15 | 16 | namespace('build', () => { 17 | desc('build executable via pkg') 18 | task('pkg', () => { 19 | const dir = path.join(__dirname, 'dist', 'v' + version) 20 | fse.emptyDirSync(dir) 21 | 22 | // other 23 | fse.copySync(__dirname + '/node_modules/puppeteer/.local-chromium/', dir + '/puppeteer') 24 | 25 | 
// %1: node_modules/sharp/build/Release 26 | // %2: path-to-executable/sharp/build/Release 27 | fse.copySync(__dirname + '/node_modules/sharp/build/Release', dir + '/sharp/build/Release') 28 | 29 | // %1: node_modules/sharp/vendor//lib 30 | // %2: path-to-executable/sharp/vendor//lib 31 | fse.copySync(__dirname + '/node_modules/sharp/vendor', dir + '/sharp/vendor') 32 | 33 | // /node_modules/nunjucks/node_modules/chokidar/node_modules/fsevents/fsevents.node 34 | // fsevents.node 35 | // fse.copySync( 36 | // __dirname + 37 | // '/node_modules/nunjucks/node_modules/chokidar/node_modules/fsevents/fsevents.node', 38 | // dir + '/fsevents.node' 39 | // ) 40 | 41 | // epubcheck 42 | fse.copySync(__dirname + '/assets/lib', dir + '/assets/lib') 43 | fse.copySync(__dirname + '/assets/epubcheck.jar', dir + '/assets/epubcheck.jar') 44 | 45 | // build ts 46 | exec('pnpm build') 47 | 48 | // pkg 49 | exec(`pnpm dlx pkg -t node16-mac --out-path ${dir} .`) 50 | }) 51 | }) 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2018 Magicdawn(magicdawn@qq.com) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /NOTES.md: -------------------------------------------------------------------------------- 1 | ## Todos 2 | 3 | - [ ] map.json 结构 4 | - [ ] download id.json 重命名 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # weread-spy 2 | 3 | > 使用微信读书 Web 版生成 ePub 电子书 (需要无限卡权限, 或已购买的书) 4 | 5 | [![npm version](https://img.shields.io/npm/v/weread-spy.svg?style=flat-square)](https://www.npmjs.com/package/weread-spy) 6 | [![npm downloads](https://img.shields.io/npm/dm/weread-spy.svg?style=flat-square)](https://www.npmjs.com/package/weread-spy) 7 | [![npm license](https://img.shields.io/npm/l/weread-spy.svg?style=flat-square)](http://magicdawn.mit-license.org) 8 | 9 | > [!CAUTION] 10 | > 项目未维护, 可能会 [导致封号](https://github.com/magicdawn/weread-spy/issues/44#issuecomment-1810076252) 11 | 12 | ## 声明 13 | 14 | 本项目仅供技术研究使用, 请勿用于商业用途!
15 | 本项目仅供技术研究使用, 请勿用于商业用途!
16 | 本项目仅供技术研究使用, 请勿用于商业用途!
17 | 18 | ## 安装 19 | 20 | ```sh 21 | $ pnpm add weread-spy -g 22 | ``` 23 | 24 | ### 或者使用源码 25 | 26 | - `git clone` 此项目 27 | - `pnpm i` 28 | - `pnpm link --global` 29 | 30 | 这样就可以使用 `weread-spy` 命令了 31 | 32 | ## epub 规范 & 阅读器 33 | 34 | - 本项目使用 ePub v3 规范, 且使用 epubcheck lint, 如果有 lint 报错的地方, 请添加 issue 35 | - ePub 阅读器推荐 `Koodo Reader` or `Apple Books` 36 | 37 | ## 一站式操作 `weread-spy one` 38 | 39 | - 运行此命令, 会自动打开 puppeteer 浏览器 40 | - 扫码登录 41 | - 浏览自己想下载的书, 返回命令行. 监控到 url 像是一本书, 输入 `y` 开始生成 42 | 43 | ### 注意事项 44 | 45 | - 需要安装 `Java`, epub check 依赖 java, 可以认为是 ePub 文件的 lint 工具 46 | - 数据文件在 `~/Library/Application Support/weread-spy/` 目录下 47 | - 生成 epub 文件在当前目录下, 或者使用 `weread-spy one -d some-dir` 指定输出目录 48 | 49 | #### Options 50 | 51 | | flag | desc | default | 52 | | ------------ | ---------------------- | ------- | 53 | | `-d,--dir` | 最终 ePub 文件输出目录 | pwd | 54 | | `--interval` | 切换章节间隔, 毫秒 | 0 | 55 | 56 | ## 其他分步的命令 57 | 58 | - `weread-spy dl -u ` 下载电子书信息 59 | - `weread-spy gen -u ` 根据下载的信息, 生成电子书 60 | - `weread-spy check` 跑 epub check 61 | 62 | ## 更新日志 63 | 64 | see CHANGELOG.md 65 | 66 | ## License 67 | 68 | the MIT License http://magicdawn.mit-license.org 69 | -------------------------------------------------------------------------------- /assets/templates/cover.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 8 | 9 | 10 | 
11 |
12 |

{{ title }}

13 |

14 | 17 |

18 |
19 |
20 | 21 | 22 | -------------------------------------------------------------------------------- /assets/templates/default.css: -------------------------------------------------------------------------------- 1 | :root { 2 | --main-font: Palatino, 'Palatino Linotype', 'Times New Roman', 'Droid Serif', 3 | Times, 'Source Serif Pro', serif, 'Apple Color Emoji', 'Segoe UI Emoji', 4 | 'Segoe UI Symbol'; 5 | --alt-font: 'helvetica neue', ubuntu, roboto, noto, 'segoe ui', arial, 6 | sans-serif, 'Apple Color Emoji', 'Segoe UI Emoji', 'Segoe UI Symbol'; 7 | --code-font: Menlo, Consolas, monospace; 8 | --accent-color: black; 9 | } 10 | 11 | @page { 12 | size: A5 portrait; 13 | margin: 1cm 1cm 2cm; 14 | } 15 | 16 | html { 17 | font-size: 12pt; 18 | line-height: 1.3; 19 | font-family: var(--main-font); 20 | -webkit-print-color-adjust: exact; 21 | } 22 | 23 | h1, 24 | h2, 25 | h3, 26 | h4, 27 | h5, 28 | h6 { 29 | margin-bottom: 0.5em; 30 | font-family: var(--alt-font); 31 | font-weight: bold; 32 | page-break-after: avoid; 33 | } 34 | 35 | a { 36 | color: inherit; 37 | text-decoration: underline; 38 | } 39 | 40 | /* 41 | Going on a limb here, 42 | but a.anchor in heading elements 43 | is most likely a '#' or '§' anchor 44 | we don't want to display in the PDF. 
45 | */ 46 | h1 a.anchor, 47 | h2 a.anchor, 48 | h3 a.anchor, 49 | h4 a.anchor, 50 | h5 a.anchor, 51 | h6 a.anchor { 52 | visibility: hidden; 53 | position: absolute; 54 | } 55 | 56 | th { 57 | font-family: var(--alt-font); 58 | } 59 | 60 | code, 61 | pre { 62 | font-size: 0.85em; 63 | } 64 | 65 | pre code { 66 | font-size: 1em; 67 | } 68 | 69 | /* 70 | Don't display hidden elements 71 | */ 72 | [hidden], 73 | [aria-hidden] { 74 | display: none; 75 | } 76 | 77 | /* 78 | Table of Contents page 79 | ---------------------------------------------------- 80 | */ 81 | 82 | .toc { 83 | page-break-before: always; 84 | page-break-after: always; 85 | } 86 | 87 | /* 88 | Article formatting 89 | ---------------------------------------------------- 90 | */ 91 | 92 | article { 93 | font-size: 1em; 94 | } 95 | 96 | article:not(:last-of-type) { 97 | page-break-after: always; 98 | } 99 | 100 | /* 101 | Article Header 102 | -------------- 103 | */ 104 | 105 | .article__header { 106 | margin: 0 0 1.3em; 107 | } 108 | 109 | .article__title { 110 | font-size: 2.4em; 111 | margin: 0 0 0.25em; 112 | letter-spacing: -0.03em; 113 | line-height: 1.1; 114 | } 115 | 116 | .article__url { 117 | font-style: italic; 118 | font-size: 0.9em; 119 | } 120 | 121 | /* 122 | Article Content 123 | --------------- 124 | */ 125 | 126 | .article__content img { 127 | max-width: 100%; 128 | display: block; 129 | margin: 0 auto; 130 | } 131 | 132 | .article__content figure { 133 | display: block; 134 | margin: 1.5em 0; 135 | padding: 0; 136 | text-align: center; 137 | } 138 | 139 | .article__content figcaption { 140 | font-size: 0.8em; 141 | font-family: var(--alt-font); 142 | margin: 0.81em 0; 143 | line-height: 1.625; 144 | } 145 | 146 | .article__content figure blockquote, 147 | .article__content figure pre { 148 | text-align: left; 149 | } 150 | 151 | .article__content table, 152 | .article__content figure { 153 | page-break-inside: avoid; 154 | } 155 | 156 | .article__content pre, 157 | .article__content 
code { 158 | font-family: var(--code-font); 159 | } 160 | 161 | .article__content pre { 162 | border: 0.25pt solid #000; 163 | padding: 0.75em; 164 | font-size: 0.9em; 165 | white-space: pre-wrap; 166 | word-wrap: break-word; 167 | } 168 | 169 | .article__content kbd, 170 | .article__content var, 171 | .article__content samp { 172 | padding: 0 0.5em; 173 | box-shadow: 2pt 2pt 0 #ccc; 174 | border: 0.5pt solid #000; 175 | border-radius: 0.25em; 176 | font-size: 0.9em; 177 | } 178 | 179 | .article__content p { 180 | margin: 0; 181 | orphans: 3; 182 | widows: 3; 183 | } 184 | 185 | /* 186 | Indent all subsequent paragraphs. 187 | */ 188 | .article__content p + p { 189 | text-indent: 2em; 190 | } 191 | 192 | /* 193 | Fixes the text indent for images 194 | that get wrapped in a

tag 195 | by Readability. 196 | 197 | Reference: 198 | https://github.com/danburzo/percollate/issues/48 199 | */ 200 | .article__content p + p > img:only-child { 201 | margin-left: -2em; 202 | } 203 | 204 | .article__content hr { 205 | border: none; 206 | height: 0.5pt; 207 | margin: 1.3em 0; 208 | background: #000; 209 | } 210 | 211 | .article__content blockquote { 212 | font-size: 0.9em; 213 | line-height: 1.44; 214 | padding-left: 2em; 215 | border-left: 3pt solid #000; 216 | margin-left: 0; 217 | } 218 | 219 | .article__content table { 220 | width: 100%; 221 | border-collapse: collapse; 222 | page-break-inside: auto; 223 | font-size: 0.9em; 224 | line-height: 1.44; 225 | margin: 1.44em 0; 226 | } 227 | 228 | .article__content th { 229 | } 230 | 231 | .article__content th, 232 | .article__content td { 233 | text-align: left; 234 | vertical-align: top; 235 | padding: 0.36em 1em 0.36em 0; 236 | } 237 | 238 | .article__content tr { 239 | border-bottom: 0.25pt solid #000; 240 | page-break-inside: avoid; 241 | page-break-after: auto; 242 | } 243 | 244 | .article__content dt { 245 | font-weight: bold; 246 | } 247 | 248 | .article__content ol, 249 | .article__content ul { 250 | padding-left: 2em; 251 | list-style-position: outside; 252 | margin: 0.65em 0; 253 | } 254 | 255 | .article__content aside { 256 | font-family: var(--alt-font); 257 | font-size: 0.9em; 258 | line-height: 1.44; 259 | padding-left: 2em; 260 | } 261 | 262 | .article__content details { 263 | margin: 0.65em 0; 264 | } 265 | 266 | .article__content details > summary { 267 | font-weight: bold; 268 | font-size: 0.9em; 269 | font-family: var(--alt-font); 270 | } 271 | 272 | /* 273 | Page header / footer 274 | -------------------- 275 | 276 | These are extracted when generating the PDF 277 | and are not subject to the page's CSS cascade. 
278 | 279 | They're just placed here for easier style coordination 280 | */ 281 | 282 | .header-template { 283 | } 284 | 285 | .footer-template { 286 | font-size: 10pt; 287 | font-weight: bold; 288 | } 289 | 290 | /* 291 | Cover page 292 | ---------- 293 | */ 294 | 295 | .cover { 296 | color: var(--accent-color); 297 | border: 0.5em solid; 298 | font-family: var(--cover-font, var(--alt-font)); 299 | padding: 2em; 300 | } 301 | 302 | .cover__title { 303 | font-size: 2.4em; 304 | margin: 0; 305 | line-height: 1.1; 306 | } 307 | 308 | .cover__subtitle { 309 | margin: 1em 0; 310 | } 311 | 312 | .cover__date { 313 | font-weight: bold; 314 | } 315 | 316 | /* 317 | Filetype specific 318 | ----------------- 319 | */ 320 | 321 | .type--pdf body { 322 | margin: 0; 323 | padding: 0; 324 | } 325 | 326 | .type--pdf a:not(.no-href):after { 327 | content: ' → ' attr(href) ''; 328 | font-size: 0.8em; 329 | word-break: break-all; 330 | word-wrap: break-word; 331 | font-family: var(--alt-font); 332 | } 333 | 334 | .type--pdf .cover, 335 | .type--epub .cover { 336 | position: absolute; 337 | overflow: hidden; 338 | } 339 | 340 | .type--pdf .cover { 341 | top: 0; 342 | left: 0; 343 | right: 0; 344 | bottom: 0; 345 | } 346 | 347 | .type--epub .cover { 348 | top: 2em; 349 | left: 2em; 350 | right: 2em; 351 | bottom: 2em; 352 | } 353 | 354 | .type--pdf .cover__content, 355 | .type--epub .cover__content { 356 | position: absolute; 357 | top: 30%; 358 | left: 2em; 359 | right: 2em; 360 | transform: translate(0, -50%); 361 | } 362 | 363 | .type--pdf .cover__sentinel { 364 | page-break-after: always; 365 | } 366 | -------------------------------------------------------------------------------- /assets/templates/default.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {{ title }} 6 | 7 | 10 | 11 | 12 | {% if options.use_cover %} 13 |

14 |
15 |

{{ title }}

16 |

17 | 20 |

21 |
22 |
23 |
 
24 | {% endif %} {% if options.use_toc %} 25 | 37 | {% endif %} {% for item in items %} 38 |
39 |
40 |

41 | {{ item.title }} 42 |

43 | {% if item.byline %} 44 | 45 | {% endif %} 46 |

47 | Source: 48 | {{item.url}} 49 |

50 |
51 | 52 |
53 | {{ item.content }} 54 |
55 |
56 | {% endfor %} 57 | 58 | 59 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /assets/templates/epub/META-INF/com.apple.ibooks.display-options.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /assets/templates/epub/META-INF/container.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 8 | 9 | -------------------------------------------------------------------------------- /assets/templates/epub/OEBPS/content.opf: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | {{ bookId }} 5 | {{ title | e }} 6 | {{ date }} 7 | {{ date }} 8 | {{ creator }} 9 | {{ publisher }} 10 | {{ description }} 11 | {{ lang | default('en-US')}} 12 | {% if cover -%} 13 | 14 | {%- endif %} 15 | 16 | 17 | 18 | {% for item in manifest -%} 19 | 25 | {% endfor %} 26 | 27 | 28 | 29 | {% for item in spine -%} 30 | 31 | {% endfor %} 32 | 33 | 34 | 35 | 36 | 37 | 38 | -------------------------------------------------------------------------------- /assets/templates/epub/OEBPS/content.xhtml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {{ item.title | e }} 6 | 7 | 8 | 9 |
10 |

{{ item.title | e }}

11 | {% if item.byline %} 12 | 13 | {% endif %} 14 |

15 | Source: 16 | {{item.url}} 17 |

18 |
19 |
20 | {{ item.content }} 21 |
22 | 23 | 24 | -------------------------------------------------------------------------------- /assets/templates/epub/OEBPS/cover.xhtml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 封面 5 | 6 | 7 |
8 | 9 |
10 | 11 | 12 | -------------------------------------------------------------------------------- /assets/templates/epub/OEBPS/nav.xhtml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | {{ title | e }} 7 | 8 | 9 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /assets/templates/epub/OEBPS/toc.ncx: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | {{ title }} 11 | 12 | 13 | {% for item in navItems %} 14 | 15 | 16 | {{item.title}} 17 | 18 | 19 | {% for subitem in item.children %} 20 | 21 | 22 | {{subitem.title}} 23 | 24 | 25 | 26 | {%- endfor %} 27 | 28 | {%- endfor %} 29 | 30 | 31 | -------------------------------------------------------------------------------- /bak/index.main.js: -------------------------------------------------------------------------------- 1 | /** 2 | * 如果使用 ts-node 3 | * 开发使用 __dirname + /index.worker.js 4 | * 内容是 require ts-node/register + require index.worker.ts 5 | * 6 | * ts 编译后, 7 | * ./index.worker.ts -> dist/index.worker.js 8 | * ./index.worker.js 不参与 ts 编译 9 | * 10 | * esbuild bundle 后 11 | */ 12 | 13 | // const workerFile = process.env.ESBUILD_BUNDLE 14 | // ? 
__dirname + '/processContent.worker.js' 15 | // : __dirname + '/index.worker.js' 16 | -------------------------------------------------------------------------------- /bak/index.worker.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable @typescript-eslint/no-var-requires */ 2 | const path = require('path') 3 | require('ts-node').register({ 4 | project: path.join(__dirname, '/../../../tsconfig.json'), 5 | }) 6 | require(__dirname + '/index.worker.ts') 7 | -------------------------------------------------------------------------------- /bin/.dev: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/magicdawn/weread-spy/13e70510a0b7343689fe1bbf76c5f83096396bc3/bin/.dev -------------------------------------------------------------------------------- /bin/weread-spy.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | /* eslint-disable @typescript-eslint/no-var-requires */ 4 | 5 | let isDev = require('fs').existsSync(__dirname + '/.dev') 6 | 7 | // force use dist 8 | // FIXME: comment this before publish 9 | // isDev = false 10 | 11 | if (isDev) { 12 | require('ts-node').register({ 13 | project: __dirname + '/../tsconfig.json', 14 | }) 15 | require('../src/bin') 16 | } else { 17 | require('../dist/bin') 18 | } 19 | -------------------------------------------------------------------------------- /dev-notes/2023-10-21.md: -------------------------------------------------------------------------------- 1 | # 2023-10-21 16:22:26 2 | 3 | prev version 4 | 5 | - 修改 js, 暴露 `__vue__` 6 | - `$store.subscribe` 获取 html 7 | -------------------------------------------------------------------------------- /docs/epub.md: -------------------------------------------------------------------------------- 1 | # ePub 2 | 3 | ## doc 4 | 5 | - http://www.theheratik.net/books/tech-epub/ 6 | 7 | ## toc 8 | 9 | eEpub 2 
&3 10 | 11 | ### 3 12 | 13 | - 3 使用 nav.xhtml 14 | - 2 使用 toc.ncx 15 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "weread-spy", 3 | "description": "weread spy", 4 | "version": "0.7.6", 5 | "author": "magicdawn", 6 | "type": "module", 7 | "main": "dist/bin.js", 8 | "bin": { 9 | "weread-spy": "dist/bin.js" 10 | }, 11 | "files": [ 12 | "dist", 13 | "assets", 14 | "CHANGELOG.md" 15 | ], 16 | "bugs": { 17 | "url": "https://github.com/magicdawn/weread-spy/issues" 18 | }, 19 | "engines": { 20 | "node": ">=18" 21 | }, 22 | "homepage": "https://github.com/magicdawn/weread-spy#readme", 23 | "keywords": [ 24 | "download", 25 | "weread" 26 | ], 27 | "scripts": { 28 | "prepare": "husky install", 29 | "_dev": "tsc -w --incremental", 30 | "_build:tsc": "rm -rf lib; rm tsconfig.tsbuildinfo; tsc", 31 | "_build": "./scripts/build.ts", 32 | "dev": "NODE_ENV=development tsup --watch", 33 | "build": "NODE_ENV=production tsup", 34 | "typecheck": "tsc --noEmit", 35 | "test": "mocha", 36 | "test-cover": "nyc --reporter=lcov --reporter=text mocha", 37 | "prepublishOnly": "pnpm build" 38 | }, 39 | "repository": { 40 | "type": "git", 41 | "url": "git+ssh://git@github.com/magicdawn/weread-spy.git" 42 | }, 43 | "dependencies": { 44 | "@magicdawn/prettier-config": "^0.0.2", 45 | "adm-zip": "^0.5.10", 46 | "cheerio": "1.0.0-rc.12", 47 | "clipanion": "^3.2.1", 48 | "comlink": "^4.4.1", 49 | "debug": "^4.3.4", 50 | "delay": "^6.0.0", 51 | "dl-vampire": "^2.0.0", 52 | "env-paths": "^3.0.0", 53 | "epubcheck-assets": "^5.1.0", 54 | "escape-string-regexp": "^5.0.0", 55 | "esm-utils": "^4.1.2", 56 | "fast-glob": "^3.3.1", 57 | "filenamify": "^6.0.0", 58 | "fs-extra": "^11.1.1", 59 | "inquirer": "9", 60 | "jszip": "^3.10.1", 61 | "lodash-es": "^4.17.21", 62 | "mime": "^3.0.0", 63 | "mimetype": "^0.0.8", 64 | "moment": "^2.29.4", 65 | "ms": "^2.1.3", 66 | 
"nunjucks": "^3.2.4", 67 | "pkg-up": "^4.0.0", 68 | "prettier": "^3.0.3", 69 | "promise.map": "^0.5.0", 70 | "puppeteer": "^21.4.0", 71 | "puppeteer-intercept-and-modify-requests": "^1.2.2", 72 | "sharp": "^0.32.6", 73 | "tslib": "^2.6.2", 74 | "type-fest": "^4.5.0", 75 | "urijs": "^1.19.11" 76 | }, 77 | "devDependencies": { 78 | "@magicdawn/eslint-config": "^0.1.0", 79 | "@types/adm-zip": "^0.5.3", 80 | "@types/debug": "^4.1.10", 81 | "@types/fs-extra": "^11.0.3", 82 | "@types/inquirer": "^9.0.6", 83 | "@types/jake": "^0.0.35", 84 | "@types/lodash-es": "^4.17.10", 85 | "@types/mime": "^3.0.3", 86 | "@types/mocha": "^10.0.3", 87 | "@types/ms": "^0.7.33", 88 | "@types/node": "^20.8.7", 89 | "@types/nunjucks": "^3.2.5", 90 | "@types/prettier": "^2.7.3", 91 | "@types/sharp": "^0.31.1", 92 | "@types/urijs": "^1.19.22", 93 | "@typescript-eslint/eslint-plugin": "^6.8.0", 94 | "@typescript-eslint/parser": "^6.8.0", 95 | "esbuild": "^0.19.5", 96 | "eslint": "^8.52.0", 97 | "eslint-config-prettier": "^9.0.0", 98 | "husky": "^8.0.3", 99 | "lint-staged": "^15.0.2", 100 | "mocha": "^10.2.0", 101 | "should": "^13.2.3", 102 | "tsup": "^7.2.0", 103 | "typescript": "^5.2.2", 104 | "why-is-node-running": "^2.2.2" 105 | }, 106 | "pkg": { 107 | "assets": [ 108 | "assets/templates/" 109 | ], 110 | "scripts": [ 111 | "lib/**/*.worker.js" 112 | ] 113 | }, 114 | "lint-staged": { 115 | "*.{js,jsx,ts,tsx,less,md}": [ 116 | "prettier --write" 117 | ] 118 | }, 119 | "publishConfig": { 120 | "registry": "https://registry.npmjs.org" 121 | }, 122 | "packageManager": "pnpm@9.14.4+sha512.c8180b3fbe4e4bca02c94234717896b5529740a6cbadf19fa78254270403ea2f27d4e1d46a08a0f56c89b63dc8ebfd3ee53326da720273794e6200fcf0d184ab" 123 | } 124 | -------------------------------------------------------------------------------- /pre-generated/link.txt: -------------------------------------------------------------------------------- 1 | 红楼梦(全集) 2 | 
https://weread.qq.com/web/reader/41432f705de453414ca0b4akc81322c012c81e728d9d180 3 | -------------------------------------------------------------------------------- /pre-generated/红楼梦(全集).epub: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/magicdawn/weread-spy/13e70510a0b7343689fe1bbf76c5f83096396bc3/pre-generated/红楼梦(全集).epub -------------------------------------------------------------------------------- /prettier.config.cjs: -------------------------------------------------------------------------------- 1 | // https://github.com/prettier/prettier/issues/12701 2 | module.exports = require('@magicdawn/prettier-config') 3 | -------------------------------------------------------------------------------- /scripts/build.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ts-node 2 | 3 | import esbuild from 'esbuild' 4 | import path from 'path' 5 | 6 | const projectHome = path.join(__dirname, '..') 7 | 8 | esbuild.buildSync({ 9 | entryPoints: { 10 | 'bin': path.join(projectHome, 'src/bin.ts'), 11 | 'processContent.worker': path.join(projectHome, 'src/utils/processContent/index.worker.ts'), 12 | }, 13 | bundle: true, 14 | outdir: path.join(projectHome, 'dist'), 15 | platform: 'node', 16 | target: ['node18'], 17 | packages: 'external', 18 | external: [path.join(projectHome, 'package.json')], 19 | minify: true, 20 | define: { 21 | 'process.env.ESBUILD_BUNDLE': 'true', 22 | }, 23 | }) 24 | 25 | console.log('[bundle]: success') 26 | -------------------------------------------------------------------------------- /src/bin.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | import { Builtins, Cli } from 'clipanion' 4 | import debugFactory from 'debug' 5 | import { CheckCommand } from './commands/check.js' 6 | import { DownloadCommand } from './commands/download.js' 7 | import { GenCommand } 
const { require } = $esm(import.meta)

// Enable debug output by default; an explicit DEBUG env var is left untouched.
if (!process.env.DEBUG) {
  const namespaces = [`weread-spy:*`]
  // the detail namespace is only switched on outside production
  if (process.env.NODE_ENV !== 'production') {
    namespaces.push('weread-spy-detail:*')
  }
  debugFactory.enable(namespaces.join(','))
}

// @ts-ignore
// eslint-disable-next-line @typescript-eslint/no-var-requires
const pkg = require('../package.json')

const cli = new Cli({
  binaryLabel: '微信读书下载器',
  binaryName: 'weread-spy',
  binaryVersion: pkg.version,
})

// Built-in help/version commands first, then the project commands,
// registered in the same order as before.
const commandClasses = [
  Builtins.HelpCommand,
  Builtins.VersionCommand,
  OneCommand,
  DownloadCommand,
  GenCommand,
  LaunchCommand,
  CheckCommand,
  InfoCommand,
]
for (const commandClass of commandClasses) {
  cli.register(commandClass)
}

// Run with the CLI arguments (node binary and script path stripped).
const cliArgs = process.argv.slice(2)
cli.runExit(cliArgs, {
  ...Cli.defaultContext,
})
/**
 * `dl` / `download` command: collects the chapter data of one book by driving
 * the reader page through `main()`.
 *
 * `--url` accepts three forms:
 * - a full url, used as-is
 * - an all-digit book id, resolved to a url via the local books map
 * - any other word-only token, appended to the `bookDetail` url prefix
 */
export class DownloadCommand extends Command {
  static usage = Command.Usage({
    description: `下载 epub`,
  })

  static paths = [['dl'], ['download']]

  url: string = Option.String('-u,--url', {
    description: 'book url, e.g(https://weread.qq.com/web/reader/9f232de07184869c9f2cc73)',
    required: true,
  })

  interval?: string = Option.String('--interval', {
    description: '数字, 切换章节间隔, 单位毫秒',
  })

  async execute() {
    const { interval } = this
    let url = this.url

    // `\w` never matches `/`, so a word-only value cannot be a url
    if (/^\w+$/.test(url)) {
      if (/^\d+$/.test(url)) {
        // numeric => book id; keep it in its own variable so the error
        // message below can still report it after `url` is reassigned
        const id = url
        url = (await queryBook({ id }))?.url || ''
        if (!url) {
          console.error('url not found for id = %s', id)
          process.exit(1)
        }
      } else {
        url = `https://weread.qq.com/web/bookDetail/${url}`
      }
    }

    // await so that failures surface through the command instead of a floating promise
    await main(url, { interval })
  }
}
!options.browser) { 58 | Object.assign(options, await getBrowser()) 59 | } 60 | const browser = options.browser! 61 | const page = options.page! 62 | 63 | await page.goto(bookReadUrl) 64 | 65 | /** 66 | * Engine start 67 | */ 68 | 69 | await waitReaderReady(page) 70 | await subscribeToVuexMutaion(page) 71 | 72 | // save map 73 | const startInfo = await getInfoFromPage(page) 74 | await addBook({ id: startInfo.bookId, title: startInfo.bookInfo.title, url: bookReadUrl }) 75 | 76 | let usingInterval: number | undefined = undefined 77 | if (options.interval) { 78 | if (typeof options.interval === 'number') { 79 | usingInterval = options.interval 80 | } 81 | if (typeof options.interval === 'string') { 82 | usingInterval = Number(options.interval) 83 | if (isNaN(usingInterval)) { 84 | throw new Error('expect a number for --interval') 85 | } 86 | } 87 | } 88 | if (usingInterval) { 89 | debug('切换章节间隔 %s ms', usingInterval) 90 | } 91 | 92 | // 先切到 index = 1, 后面会切到 index = 0, 触发 mutation 93 | await changeChapter(page, startInfo.chapterInfos[1].chapterUid) 94 | 95 | const infos: any[] = [] 96 | for (const [index, c] of startInfo.chapterInfos.entries()) { 97 | const { chapterUid } = c 98 | 99 | // delay before change chapter 100 | if (index > 0 && usingInterval) { 101 | await delay(usingInterval) 102 | } 103 | await changeChapter(page, chapterUid) 104 | 105 | const info = await getInfoFromPage(page) 106 | infos.push(info) 107 | debug('已收集章节 id=%s', chapterUid) 108 | } 109 | 110 | // 书籍信息 111 | const json = { 112 | startInfo, 113 | infos, 114 | } 115 | 116 | const { 117 | bookId, 118 | bookInfo: { title }, 119 | } = startInfo 120 | const bookJsonFile = path.join(BOOKS_DIR, filenamify(`${bookId}-${title}.json`)) 121 | await fse.outputJson(bookJsonFile, json, { 122 | spaces: 2, 123 | }) 124 | 125 | debug('book id = %s url = %s', bookId, bookReadUrl) 126 | debug('downloaded to %s', bookJsonFile) 127 | 128 | await browser.close() 129 | } 130 | 131 | /** 132 | * pptr Actions 133 | */ 
/**
 * Wait (polling every 100 ms) until the reader store reports
 * `chapterContentState === 'DONE'`.
 */
export async function waitReaderReady(page: pptr.Page) {
  // runs inside the page; must not capture anything from this module
  const isChapterContentDone = () => {
    const reader = globalThis.app?.__vue__?.$store?.state?.reader
    return reader?.chapterContentState === 'DONE'
  }
  return page.waitForFunction(isChapterContentDone, { polling: 100 })
}

/**
 * Subscribe to the page's Vuex store and keep the payload of the latest
 * `updateReaderContentHtml` mutation on `globalThis.__chapterContentHtmlArray__`.
 */
export async function subscribeToVuexMutaion(page: pptr.Page) {
  await page.evaluate(() => {
    const store = globalThis.app.__vue__.$store
    store.subscribe(({ type, payload }) => {
      console.log('VUEX mutation type=%s', type, payload)
      if (type !== 'updateReaderContentHtml') return
      globalThis.__chapterContentHtmlArray__ = payload
    })
  })
}

/**
 * Ask the reader component to switch to chapter `uid`, then wait until the
 * store shows that chapter as current with its content state `DONE`.
 */
export async function changeChapter(page: pptr.Page, uid: number) {
  // trigger the switch through the vue instance attached to #routerView
  await page.$eval(
    '#routerView',
    (routerViewEl, chapterUid) => {
      ;(routerViewEl as any).__vue__.changeChapter({ chapterUid })
    },
    uid
  )

  // wait until loading finished ...
  await waitReaderReady(page)

  // ... and the loaded chapter is really the requested one
  await page.waitForFunction(
    (expectedUid) => {
      const reader = globalThis.app.__vue__.$store.state?.reader
      const activeUid = reader?.currentChapter?.chapterUid
      const contentState = reader?.chapterContentState
      console.log({ currentChapterId: activeUid, currentState: contentState, id: expectedUid })
      return activeUid === expectedUid && contentState === 'DONE'
    },
    { polling: 100 },
    uid
  )
}
/**
 * Generate the epub for an already downloaded book and run epubcheck on it.
 *
 * @param url book url; only used to look up the id in the in-memory books map when `id` is absent
 * @param clean forwarded to `genEpubFor`
 * @param id book id; takes precedence over `url`
 * @param dir output directory, defaults to the current working directory
 * @param decompress forwarded to `genEpubFor`
 * @returns the generated file as returned by `genEpubFor`, or `undefined` when no book id could be determined
 */
export async function genCommandMain({
  url,
  clean,
  id,
  dir,
  decompress = false,
}: {
  url?: string
  clean: boolean
  id?: string
  dir?: string
  decompress?: boolean
}) {
  // explicit id wins, otherwise map url => id
  let bookId: string | undefined
  if (id) {
    bookId = id
  } else if (url) {
    bookId = currentBooks.find((x) => x.url === url)?.id
  }

  if (!bookId) {
    console.error('can not find id !!!')
    return
  }

  // normalize
  dir = path.resolve(dir || process.cwd())
  // if run in project root, gen to `example/` subdir
  if (dir === path.join(homedir(), 'projects/weread-spy-private')) {
    dir = path.join(dir, 'example')
  }

  const file = await genEpubFor(bookId, dir, clean, decompress)
  epubcheck(file)

  // The former debugging `setTimeout` was removed: its callback body was
  // entirely commented out, so it only scheduled a no-op timer.
  return file
}
/**
 * `one` command: launches the browser, watches frame navigations and, whenever
 * a `/web/reader/` page is visited, asks on the console whether to download it.
 * After confirmation the navigation listener is removed and `decideDownload`
 * is started.
 */
export class OneCommand extends Command {
  static usage = Command.Usage({
    description: '一站式操作, 启动浏览器, 浏览阅读网页, 回到控制台输入 y 开始生成',
  })

  static paths = [['one']]

  dir = Option.String('-d,--dir', {
    description: 'epub 文件输出目录, 默认当前文件夹',
  })

  interval?: string = Option.String('--interval', {
    description: '数字, 切换章节间隔, 单位毫秒',
  })

  async execute() {
    const { browser, page } = await getBrowser()

    // the currently pending inquirer prompt (if any), so a newer navigation can close it
    let prompt: any

    // invoked (debounced) for every `framenavigated` event of the page
    const handler = async (e: pptr.Frame) => {
      const pageUrl = e.url()
      const uri = URI(pageUrl)
      const path = uri.pathname()
      // if (path.startsWith('/web/bookDetail/')) {
      if (path.startsWith('/web/reader/')) {
        // https://github.com/SBoudrias/Inquirer.js/issues/491#issuecomment-277595658
        // close the previous prompt before showing a new one
        if (prompt) {
          ;(prompt as any).ui.close()
          console.log('')
        }

        const title = await page.title()
        console.log('')
        console.log('当前浏览链接像是一本书:')
        console.log(' [url]: %s', pageUrl)
        console.log(' [title]: %s', title)

        // prompt
        prompt = inquirer.prompt([
          {
            type: 'confirm',
            name: 'confirm',
            message: `是否下载: `,
          },
        ])

        // wait for the answer
        const { confirm } = await prompt
        if (!confirm) return

        // remove the listener
        page.off('framenavigated', handlerDebounced)

        // download confirmed
        // NOTE(review): the returned promise is not awaited or caught here
        decideDownload(page, browser, this.dir, this.interval)
      }
    }

    // debounce by 1000 ms so a burst of navigation events results in one handler call
    const handlerDebounced = _.debounce(handler, 1000)
    page.on('framenavigated', handlerDebounced)

    // FIXME: only for dev-test
    // await page.goto('https://weread.qq.com/web/reader/e1932d70813ab82e7g014f5b')
    // await page.goto('https://weread.qq.com/web/reader/f1132f80813ab821eg018540')
  }
}
/**
 * Add (or replace) a book in the books map and persist the map to disk.
 *
 * Existing entries with the same `id` or the same `url` are dropped first, so
 * the new item becomes the only entry for that book and ends up last in the list.
 *
 * @param item the book to store
 * @returns resolves once the map file has been written; rejects if writing fails
 */
export async function addBook(item: BookItem) {
  await loadBooks()

  const { id, url } = item
  // build a new array instead of mutating the currently exported one
  const list = currentBooks.filter((x) => x.id !== id && x.url !== url)
  list.push(item)
  currentBooks = list

  // await the write so callers really get "saved" semantics and see write errors
  await saveBooks()
}
/** Base debug logger (`weread-spy` namespace). */
export const baseDebug = d('weread-spy')
/** Debug logger for the `weread-spy-detail` namespace. */
export const baseDebugDetail = d('weread-spy-detail')

/** Shape of a downloaded book: the initial page info plus one info per chapter. */
export interface Data {
  startInfo: Info
  infos: Info[]
}

/**
 * Directory of the closest package.json found upwards from this module.
 * Throws at import time when none is found.
 */
function resolveProjectRoot(): string {
  const nearestPkgJson = pkgUpSync({ cwd: __dirname })
  if (!nearestPkgJson) {
    throw new Error('package.json not found')
  }
  return path.dirname(nearestPkgJson)
}
export const PROJECT_ROOT = resolveProjectRoot()

/**
 * Shared data directory
 * ApplicationSupport/weread-spy
 */
const appPaths = envPaths('weread-spy', { suffix: '' })
export const APP_SUP_DIR = appPaths.data
export const BOOKS_DIR = path.join(APP_SUP_DIR, 'books')
export const PPTR_DATA_DIR = path.join(APP_SUP_DIR, 'pptr-data')
/** One entry of the EPUB navigation tree rendered into nav.xhtml / toc.ncx. */
export type NavItem = {
  id: string
  filename: string
  title: string
  playOrder: number
  children?: NavItem[]
}

/** Maps each JSZip input type name to the content type accepted for it. */
interface InputByType {
  base64: string
  string: string
  text: string
  binarystring: string
  array: number[]
  uint8array: Uint8Array
  arraybuffer: ArrayBuffer
  blob: Blob
  stream: NodeJS.ReadableStream
}

/**
 * In-memory model of one book being packed into an EPUB.
 *
 * Holds the downloaded book data, the files that go into the OPF manifest and
 * spine, and the JSZip archive the final `.epub` is generated from.
 */
export default class Book {
  data: Data

  // regular manifest files (css, images, toc.ncx, ...)
  manifestFiles: FileItem[] = []

  // cover page (cover.xhtml); stays unset until the generator assigns it
  coverPageFile: FileItem

  // navigation page (nav.xhtml); stays unset until the generator assigns it
  navPageFile: FileItem

  // chapter documents, in reading order
  textFiles: FileItem[] = []

  constructor(data: Data) {
    this.data = data
  }

  /**
   * getters
   */

  get bookId() {
    return this.data.startInfo.bookId
  }

  get bookTitle() {
    return this.data.startInfo.bookInfo.title
  }

  /** Per-book working directory: `<BOOKS_DIR>/<bookId>-<title>` (made filename-safe). */
  get bookDir() {
    return path.join(BOOKS_DIR, filenamify(this.bookId + '-' + this.bookTitle))
  }

  /**
   * Cover image URL, with the `s_` file-name prefix swapped for `t9_`.
   *
   * e.g.
   * https://wfqqreader-1252317822.image.myqcloud.com/cover/723/26224723/s_26224723.jpg
   * https://wfqqreader-1252317822.image.myqcloud.com/cover/723/26224723/t9_26224723.jpg
   */
  get coverUrl(): string {
    let imgUrl = this.data.startInfo.bookInfo.cover
    if (/(s)_\d+\.\w+$/.test(imgUrl)) {
      imgUrl = imgUrl.replace(/s_(\d+\.\w+)$/, 't9_$1')
    }
    return imgUrl
  }

  /** Registers a manifest-only file; accepts a ready `FileItem` or its fields. Chainable. */
  addFile = (f: FileItemFields | FileItem) => {
    this.manifestFiles.push(f instanceof FileItem ? f : new FileItem(f))
    return this
  }

  /** Registers a chapter document (listed in both manifest and spine). Chainable. */
  addTextFile = (options: FileItemFields) => {
    this.textFiles.push(new FileItem(options))
    return this
  }

  /** Every file listed in the OPF manifest; unset cover/nav pages are dropped. */
  getManifest() {
    return [this.coverPageFile, this.navPageFile, ...this.textFiles, ...this.manifestFiles].filter(
      Boolean
    )
  }

  /** Files in reading order for the OPF spine; unset cover/nav pages are dropped. */
  getSpine() {
    return [this.coverPageFile, this.navPageFile, ...this.textFiles].filter(Boolean)
  }

  /**
   * Builds the navigation tree from `chapterInfos` and their `level` values.
   *
   * A chapter of level N is attached under the most recently added item of
   * level N-1. The walk descends through the last item of each level in turn;
   * the previous implementation always restarted from the last top-level
   * item, so level 3+ chapters ended up at depth 2. When no parent exists at
   * some depth (e.g. the very first chapter has level 2 and there is no cover
   * or nav page), the chapter is attached at the deepest level that does
   * exist instead of throwing.
   *
   * @returns the tree and the deepest nesting level actually produced
   */
  getNavInfo() {
    const navItems: NavItem[] = []
    let maxNavDepth = 1
    let playOrder = 1

    const pushPage = (file: FileItem | undefined, title: string) => {
      if (!file) return
      navItems.push({ id: file.id, filename: file.filename, title, playOrder: playOrder++ })
    }

    // cover page, then table-of-contents page
    pushPage(this.coverPageFile, '封面')
    pushPage(this.navPageFile, '目录')

    // chapters
    this.data.startInfo.chapterInfos.forEach((chapter, index) => {
      let siblings = navItems
      let depth = 1
      while (depth < chapter.level) {
        const parent = siblings[siblings.length - 1]
        if (!parent) break // no ancestor at this depth: attach here
        parent.children ??= []
        siblings = parent.children
        depth++
      }
      maxNavDepth = Math.max(maxNavDepth, depth)

      const { id, filename } = this.textFiles[index]
      siblings.push({ id, filename, title: chapter.title, playOrder: playOrder++ })
    })

    return { navItems, maxNavDepth }
  }

  /**
   * zip related
   */

  zip: JSZip = new JSZip()
  zipFolders: [string, string][] = []

  /** Adds one entry to the archive; `content` must match one of the JSZip input types. */
  addZipFile<T extends InputType>(
    path: string,
    content: InputByType[T],
    options?: JSZipFileOptions
  ) {
    this.zip.file(path, content, options)
  }

  /**
   * Adds every file matching `**\/*.*` under `localFolder` to the archive
   * below `name`. Files without a dot in their name are not matched.
   */
  async addZipFolder(name: string, localFolder: string) {
    const files = await fg('**/*.*', { cwd: localFolder })
    const prefix = trimEnd(name, '/') + '/'
    for (const f of files) {
      this.addZipFile(prefix + f, fse.createReadStream(path.join(localFolder, f)))
    }
  }
}
const debug = debugFactory('weread-spy:utils:epub-img')
const md5 = (s: string) => createHash('md5').update(s, 'utf8').digest('hex')

/** Per-image bookkeeping, keyed by the original remote `src`. */
export interface ImgSrcInfo {
  [key: string]: { contentType: string; ext: string; localFile: string; properties?: string }
}

/**
 * Downloads every image referenced by the book's chapters (plus the cover)
 * into `<bookDir>/imgs` and returns a map from remote src to local file info.
 *
 * The result is cached in `<bookDir>/imgs.json`; a non-empty cache is
 * returned as-is unless `clean` is set, in which case the cache file and the
 * `imgs` directory are removed first.
 *
 * @param book  book whose `data` and `bookDir` are used
 * @param clean drop the cache and the downloaded images before starting
 * @returns map of remote src to `{ contentType, ext, localFile, properties? }`;
 *          sources that answer 404 are left out
 * @throws any download error other than a 404, and any file-system error
 */
export default async function getImgSrcInfo(book: Book, clean: boolean): Promise<ImgSrcInfo> {
  let imgSrcInfo: ImgSrcInfo = {}

  const { data, bookDir } = book
  const cacheFile = path.join(bookDir, 'imgs.json')

  if (clean) {
    debug('cleaning: remove imgs.json %s', cacheFile)
    await fse.remove(cacheFile)

    const imgsDir = path.join(bookDir, 'imgs')
    debug('cleaning: remove imgs dir %s', imgsDir)
    await fse.remove(imgsDir)
  } else if (await fse.pathExists(cacheFile)) {
    // use cache (async read: the sync variant was being awaited and blocked the event loop)
    imgSrcInfo = await fse.readJson(cacheFile)
    if (Object.keys(imgSrcInfo).length) {
      return imgSrcInfo
    }
  }

  const { chapterInfos } = data.startInfo

  // collect srcs chapter by chapter; the Set keeps first-seen order and drops duplicates
  const uniqueSrcs = new Set<string>()
  let totalCount = 0
  for (let i = 0; i < chapterInfos.length; i++) {
    const html = getBookHtml(data.infos[i])
    for (const src of getImgSrcs(html)) {
      totalCount++
      uniqueSrcs.add(src)
    }
  }
  const imgSrcs = [...uniqueSrcs]
  debug('imgSrcs collected, length = %s, unique length = %s', totalCount, imgSrcs.length)

  // The remote content type is not reliable, so the flow is:
  // 1. download to an extension-less file
  // 2. detect the real format, then rename with the proper extension
  for (const src of imgSrcs) {
    // e.g. https://res.weread.qq.com/wrepub/epub_25462428_587
    const match = /^https?:\/\/res\.weread\.qq\.com\/wrepub\/(epub_[\d\w_-]+)$/.exec(src)
    const localFile = match ? `imgs/${match[1]}` : `imgs/${md5(src)}`
    imgSrcInfo[src] = { contentType: '', ext: '', localFile }
  }

  /**
   * cover
   */

  const coverUrl = book.coverUrl
  if (coverUrl) {
    debug('add cover url = %s', coverUrl)
    // Queue the cover only once: a second queue entry for the same url would
    // download to the same file twice and race on the rename below.
    if (!uniqueSrcs.has(coverUrl)) {
      imgSrcs.push(coverUrl)
    }
    imgSrcInfo[coverUrl] = {
      contentType: '',
      ext: '',
      localFile: 'imgs/cover', // extension is appended after format detection
      properties: 'cover-image',
    }
  }

  await pmap(
    imgSrcs,
    async (src) => {
      const { localFile } = imgSrcInfo[src]
      const file = path.join(bookDir, localFile)

      // download
      try {
        await dl({
          url: src,
          file,
          // retry 3 times, 40s timeout each
          retry: {
            timeout: ms('40s'),
            times: 3,
          },
        })
      } catch (e) {
        // @example
        // https://res.weread.qq.com/wrepub/web/855825/copyright.jpg
        // https://res.weread.qq.com/wrepub/CB_3300070708_83%28The_Earth_Through_Time%29.png
        if (is404Error(e)) {
          delete imgSrcInfo[src] // treat a missing image as if it was never referenced
          return
        }

        throw e
      }

      // detect the real image format from the file content
      const buf = await fse.readFile(file)
      const meta = await sharp(buf).metadata()
      const ext = meta.format
      // eslint-disable-next-line @typescript-eslint/no-non-null-assertion
      const contentType = mime.getType(ext!)

      // attach
      const localFileNew = localFile + '.' + ext
      Object.assign(imgSrcInfo[src], {
        ext,
        contentType,
        localFile: localFileNew,
      })

      // rename
      await fse.rename(file, path.join(bookDir, localFileNew))
    },
    10
  )
  debug('download img complete')

  // save cache
  await fse.outputJson(cacheFile, imgSrcInfo, { spaces: 2 })

  return imgSrcInfo
}
40 | debug('epubgen %s -> %s', data.startInfo.bookId, epubFile) 41 | const templateBase = path.join(PROJECT_ROOT, 'assets/templates/epub/') 42 | 43 | const book = new Book(data) 44 | const { bookDir, addFile, addTextFile } = book 45 | 46 | // mimetype file must be first 47 | book.addZipFile('mimetype', 'application/epub+zip', { compression: 'STORE' }) 48 | 49 | // static files from META-INF 50 | await book.addZipFolder('META-INF', path.join(templateBase, 'META-INF')) 51 | 52 | const [navTemplate, tocTemplate, opfTemplate, coverTemplate] = await Promise.all([ 53 | fse.readFile(path.join(templateBase, 'OEBPS/nav.xhtml'), 'utf8'), 54 | fse.readFile(path.join(templateBase, 'OEBPS/toc.ncx'), 'utf8'), 55 | fse.readFile(path.join(templateBase, 'OEBPS/content.opf'), 'utf8'), 56 | fse.readFile(path.join(templateBase, 'OEBPS/cover.xhtml'), 'utf8'), 57 | ]) 58 | 59 | // 章节 html 60 | const { chapterInfos, bookInfo, bookId } = data.startInfo 61 | 62 | // 图片 63 | const imgSrcInfo = await getImgSrcInfo(book, clean) 64 | 65 | /** 66 | * cover 67 | */ 68 | 69 | const coverUrl = book.coverUrl 70 | let coverFileItem: FileItem | undefined // save for manifest.meta.cover 71 | let coverPageFileItem: FileItem | undefined 72 | 73 | if (book.coverUrl) { 74 | const { localFile } = imgSrcInfo[coverUrl] 75 | delete imgSrcInfo[coverUrl] 76 | 77 | // cover img 78 | coverFileItem = new FileItem({ filename: localFile }) // 内容随 imgs 打包 79 | addFile(coverFileItem) 80 | 81 | // cover xhtml 82 | coverPageFileItem = new FileItem({ 83 | filename: 'cover.xhtml', 84 | content: nunjucks.renderString(coverTemplate, { cover: coverFileItem }), 85 | }) 86 | book.coverPageFile = coverPageFileItem 87 | } 88 | 89 | // extra css 90 | const extraCss: string[] = [] 91 | const customCssFile = path.join(bookDir, 'custom.css') 92 | if (await fse.pathExists(customCssFile)) { 93 | extraCss.push('custom.css') 94 | addFile({ filename: 'custom.css', filepath: customCssFile }) 95 | } 96 | 97 | const DEBUG_PROCESS_CONTENT = 
!!process.env.DEBUG_PROCESS_CONTENT 98 | const processContentStart = performance.now() 99 | let processResults: Awaited>[] = [] 100 | 101 | // 102 | // processContent in this thread 103 | // 104 | if (DEBUG_PROCESS_CONTENT) { 105 | processResults = await pmap( 106 | chapterInfos, 107 | async (chapterInfo, i, arr) => { 108 | const c = chapterInfos[i] 109 | const { chapterUid } = c 110 | const cssFilenames = [`css/chapter-${chapterUid}.css`, ...extraCss] 111 | return processContent(data.infos[i], { 112 | cssFilenames, 113 | imgSrcInfo, 114 | }) 115 | }, 116 | 5 117 | ) 118 | } 119 | // 120 | // processContent in multiple threads, via workers 121 | // 122 | else { 123 | const workers = createWorkers() 124 | processResults = await pmapWorker( 125 | chapterInfos, 126 | async (chapterInfo, i, arr, worker) => { 127 | const c = chapterInfos[i] 128 | const { chapterUid } = c 129 | const cssFilenames = [`css/chapter-${chapterUid}.css`, ...extraCss] 130 | return await worker.api.processContent(data.infos[i], { 131 | cssFilenames, 132 | imgSrcInfo, 133 | }) 134 | }, 135 | workers 136 | ) 137 | workers.forEach((w) => w.nodeWorker.unref()) 138 | await new Promise((resolve) => setTimeout(resolve)) 139 | } 140 | debug('processContent cost %s ms', (performance.now() - processContentStart).toFixed()) 141 | 142 | for (let i = 0; i < chapterInfos.length; i++) { 143 | const c = chapterInfos[i] 144 | const { chapterUid } = c 145 | const { xhtml, style } = processResults[i] 146 | 147 | // xhtml 148 | { 149 | const filename = `chapter-${chapterUid}.xhtml` 150 | addTextFile({ filename, content: xhtml }) 151 | } 152 | 153 | // css 154 | { 155 | const filename = `css/chapter-${chapterUid}.css` 156 | addFile({ filename, content: style }) 157 | } 158 | } 159 | 160 | /** 161 | * img assets (cover removed) 162 | */ 163 | 164 | for (const src of Object.keys(imgSrcInfo)) { 165 | const { contentType, localFile, properties } = imgSrcInfo[src] 166 | addFile({ filename: localFile, properties }) // 
content will be imgs dir 167 | } 168 | 169 | const baseRenderData = { 170 | bookId, 171 | e: '', 172 | title: bookInfo.title, 173 | date: new Date(bookInfo.updateTime * 1000).toISOString().replace(/\.\d+Z$/, 'Z'), 174 | lang: 'zh-CN', 175 | creator: bookInfo.author, 176 | publisher: bookInfo.publisher, 177 | description: bookInfo.intro, 178 | category: bookInfo.category, 179 | 180 | // cover 181 | cover: coverFileItem, 182 | coverPage: coverPageFileItem, 183 | } 184 | 185 | /** 186 | * nav 187 | */ 188 | 189 | // add nav.xhtml first 190 | book.navPageFile = new FileItem({ filename: 'nav.xhtml', properties: 'nav' }) // 内容手动写入 191 | const { navItems, maxNavDepth } = book.getNavInfo() 192 | 193 | { 194 | const renderData = { ...baseRenderData, navItems, maxNavDepth } 195 | 196 | const nav = nunjucks.renderString(navTemplate, renderData) 197 | book.addZipFile('OEBPS/nav.xhtml', nav) 198 | 199 | const toc = nunjucks.renderString(tocTemplate, renderData) 200 | addFile({ filename: 'toc.ncx', content: toc, id: 'ncx' }) 201 | } 202 | 203 | const manifest = book.getManifest() 204 | const spine = book.getSpine() 205 | { 206 | // content.opf 207 | const renderData = { ...baseRenderData, manifest, spine } 208 | const opf = nunjucks.renderString(opfTemplate, renderData) 209 | book.addZipFile('OEBPS/content.opf', opf) 210 | } 211 | 212 | // 添加文件 213 | for (const f of manifest) { 214 | let content: string | Buffer 215 | 216 | // f.content = '' 也需要写入 217 | if (typeof f.content !== 'undefined' && f.content !== null) { 218 | content = f.content 219 | } else if (f.filepath) { 220 | content = fse.readFileSync(f.filepath) 221 | } else { 222 | continue 223 | } 224 | 225 | book.addZipFile(`OEBPS/${f.filename}`, content) 226 | } 227 | 228 | // 添加图片 229 | await book.addZipFolder('OEBPS/imgs', path.join(bookDir, 'imgs')) 230 | 231 | // write .epub file 232 | const stream = book.zip.generateNodeStream({ 233 | streamFiles: true, 234 | compression: 'DEFLATE', 235 | compressionOptions: { level: 
/**
 * Resolves the stored book data and the output file path for a book id.
 *
 * Reads `<BOOKS_DIR>/<id>-<title>.json`, where the title comes from
 * `queryBook` and is made filename-safe.
 */
async function getInfo(id: string, dir: string) {
  const { title = '' } = (await queryBook({ id })) || {}
  const titleAsFilename = filenamify(title)

  const data = await fse.readJson(path.join(BOOKS_DIR, `${id}-${titleAsFilename}.json`))

  // Normalise full-width parentheses to ASCII ones in the output file name.
  // NOTE(review): the reviewed text had an unescaped ASCII "(" / ")" as the
  // pattern, which is not a valid regex; full-width parentheses
  // (U+FF08 / U+FF09) are the presumed intent - confirm against the repository.
  const filename = `${titleAsFilename}.epub`.replace(/\uFF08/g, '(').replace(/\uFF09/g, ')')
  const file = path.join(dir, filename)

  return { data, file, titleAsFilename }
}

/**
 * Generates the `.epub` for a downloaded book.
 *
 * @param id         book id, used to look up title and stored data
 * @param dir        output directory (created when missing)
 * @param clean      forwarded to `gen`
 * @param decompress also extract the result into `<dir>/<title>.epub.d`
 * @returns path of the generated `.epub` file
 */
export async function genEpubFor(
  id: string,
  dir: string,
  clean: boolean,
  decompress = false
): Promise<string> {
  const { data, file, titleAsFilename } = await getInfo(id, dir)

  await fse.ensureDir(dir)
  await gen({ epubFile: file, data, clean })

  if (decompress) {
    const epubUnzipDir = path.join(dir, titleAsFilename + '.epub.d')
    debug('decompress: to %s', epubUnzipDir)
    await fse.ensureDir(epubUnzipDir)
    new AdmZip(file).extractAllTo(epubUnzipDir, true)
  }

  debug('epub created: %s', file)
  return file
}
/** Location of the epubcheck jar, as exported by the `epubcheck-assets` package. */
function getJarPath() {
  // return path.join(PROJECT_ROOT, 'assets/epubcheck-5.0.0/epubcheck.jar')
  return epubcheckJarPath
}

/**
 * Wraps a value in single quotes for a POSIX shell, escaping embedded single
 * quotes, so the value always reaches the command as one literal argument.
 */
function shellQuote(arg: string): string {
  return `'${arg.replace(/'/g, `'\\''`)}'`
}

/**
 * Runs `java -jar epubcheck.jar <file>` with inherited stdio.
 *
 * Best effort: a failing check (or a missing `java`) is printed to stderr and
 * does not throw. The process exits with code 1 when the jar path is empty.
 *
 * @param file path of the `.epub` to validate
 */
export default function epubcheck(file: string) {
  const epubcheckJar = getJarPath()
  if (!epubcheckJar) {
    console.error('can not find epubcheck.jar')
    process.exit(1)
  }

  // Both paths are quoted: a title such as "Alice's ..." would otherwise end
  // the quoted argument early and the remainder would be parsed by the shell.
  const cmd = `java -jar ${shellQuote(epubcheckJar)} ${shellQuote(file)}`
  debug('[exec]: %s', cmd)
  try {
    execSync(cmd, { stdio: 'inherit' })
    debug('success')
  } catch (e) {
    console.error(e instanceof Error ? e.stack || e : e)
  }
}
htmlContentStorageKey: string) => { 47 | const original = el.__vue__.$store.commit 48 | el.__vue__.$store.commit = function (...args: any[]) { 49 | // action, payload, 第三个参数不知道 50 | const [action, payload, thirdArg] = args 51 | console.log('injected vuex.commit: %s %s', action, payload) 52 | 53 | if (action === 'updateReaderContentHtml') { 54 | globalThis[htmlContentStorageKey] = payload[0] 55 | } 56 | return original(...args) 57 | } 58 | }, 59 | HTML_CONTENT_STORAGE_KEY 60 | ) 61 | } 62 | -------------------------------------------------------------------------------- /src/utils/pptr-anti-spider/index.ts: -------------------------------------------------------------------------------- 1 | import { baseDebugDetail } from '$common/index' 2 | 3 | const debug = baseDebugDetail.extend('pptr:anti-spider') 4 | 5 | export function processAppJs(js: string | undefined, fileBasename: string) { 6 | debug('modifying %s', fileBasename) 7 | js ||= '' 8 | 9 | // debugger 10 | js = removeDebuggerLimit(js) || js 11 | 12 | // expose `__vue__` 13 | // https://github.com/vuejs/vue/blob/49b6bd4264c25ea41408f066a1835f38bf6fe9f1/src/core/instance/lifecycle.ts#L78 14 | // 在 Vue.prototype._update 实现 15 | { 16 | // 设置环境变量 17 | // _0x44ebd7['env'] = {}, 18 | // _0x44ebd7['argv'] = [], 19 | js = js.replace(/(_0x\w+)\['env'\]=\{\},(_0x\w+)\['argv'\]=\[\],/, (match, var1, var2) => { 20 | return `${var1}.env = { VUE_DISMISS_DEVTOOLS: 'yes' }, ${var2}.argv = [],` 21 | }) 22 | 23 | // 'yes'===_0x16452a['env']['VUE_DISMISS_DEVTOOLS'] && _0x1be68e && (_0x1be68e['__vue__'] = null), 24 | // 'yes'===_0x16452a['env'][_0x3744('0x22b')] && _0x5ad1f7['$el'] && (console['log']('__vue__'), 25 | js = js.replace(/'yes'===([_\w]+\['env'\])/g, `'yes' !== $1`) 26 | 27 | // 'yes'===_0x1372e5[_0x3db9('0x5ba')]['VUE_DISMISS_DEVTOOLS'] && _0x243be5 && (_0x243be5[_0x3db9('0xcce')] = null), 28 | // 'yes'===_0x1372e5[_0x3db9('0x5ba')][_0x3db9('0xf2')] && _0x45b52d['$el'] && (console['log'](_0x3db9('0xcce')), 29 | js = 
/**
 * Finds the index of the bracket that closes the opening bracket at `fi`.
 *
 * Only the bracket kind found at `fi` is counted (`(`, `{` or `[`); other
 * bracket kinds, string literals and comments are not interpreted, so a
 * bracket character inside a string literal is counted like any other.
 *
 * @param input text to scan
 * @param fi    index of the opening bracket
 * @returns index of the matching closing bracket, or -1 when `input[fi]` is
 *          not an opening bracket or no match exists
 */
export function findMatchingIndex(input: string, fi: number): number {
  // A Map keeps the lookup typed and free of prototype-chain hits.
  const closers = new Map<string, string>([
    ['(', ')'],
    ['{', '}'],
    ['[', ']'],
  ])

  const left = input[fi]
  const right = left === undefined ? undefined : closers.get(left)
  if (!right) {
    return -1
  }

  let depth = 1 // input[fi] itself is the first open bracket
  for (let i = fi + 1; i < input.length; i++) {
    const cur = input[i]
    if (cur === right) {
      depth--
      if (depth === 0) {
        return i
      }
    } else if (cur === left) {
      depth++
    }
  }

  return -1 // not found
}
/**
 * Neutralises the anti-debugging self-check of the obfuscated bundle.
 *
 * The check is located through one of its RegExp constructions, in either the
 * literal or the string-table form:
 *
 *   var _0x2fc99b = new RegExp('function\x20*\x5c(\x20*\x5c)');
 *   var _0x2ca847 = new RegExp('\x5c+\x5c+\x20*(?:[a-zA-Z_$][0-9a-zA-Z_$]*)','i');
 *   const _0x2fdb16 = new RegExp(_0x7031('0x945'));
 *   const _0x48182a = new RegExp(_0x7031('0x403'),'i');
 *
 * The innermost `{ ... }` body that starts before the marker is replaced by
 * an empty `{}`.
 *
 * @param js source of the script
 * @returns the patched source, or `undefined` when nothing was patched: no
 *          marker found, no `{` before the marker, or the brace is unbalanced.
 *          The last two cases previously hung the scan or produced a
 *          corrupted script with duplicated content.
 */
function removeDebuggerLimit(js: string): string | undefined {
  const literalMarkers = [
    String.raw`=new RegExp('function\x20*\x5c(\x20*\x5c)')`,
    String.raw`=new RegExp('\x5c+\x5c+\x20*(?:[a-zA-Z_$][0-9a-zA-Z_$]*)','i')`,
  ]

  let index = -1
  for (const marker of literalMarkers) {
    index = js.indexOf(marker)
    if (index !== -1) break
  }

  // string-table variant, see the last two sample lines above
  if (index === -1) {
    const match =
      /const _0x\w+=new RegExp\(_0x\w+\('0x\w+'\)\);const _0x\w+=new RegExp\(_0x\w+\('0x\w+'\),'i'\);/.exec(
        js
      )
    if (match) {
      index = match.index
    }
  }

  if (index === -1) {
    return undefined
  }

  // nearest opening brace at or before the marker
  const openBraceIndex = js.lastIndexOf('{', index)
  if (openBraceIndex === -1) {
    return undefined
  }

  const closeBraceIndex = findMatchingIndex(js, openBraceIndex)
  if (closeBraceIndex === -1) {
    return undefined
  }

  // turn the enclosing body into an empty one
  return js.slice(0, openBraceIndex) + `{}` + js.slice(closeBraceIndex + 1)
}
const debug = baseDebug.extend('pptr')

/**
 * File name of a script URL without query string or fragment, so that
 * `https://host/app.abc.js?v=1` yields `app.abc.js`. Falls back to a plain
 * basename when the value is not a parseable absolute URL.
 */
function scriptBasename(url: string): string {
  try {
    return path.basename(new URL(url).pathname)
  } catch {
    return path.basename(url)
  }
}

/**
 * Launches a visible browser with the persistent profile in `PPTR_DATA_DIR`,
 * installs the script-patching interception, opens weread.qq.com and waits
 * (without timeout) until the user is logged in.
 *
 * @returns the browser and the page that has the interception installed
 */
export async function getBrowser() {
  const browser = await pptr.launch({
    headless: false,
    devtools: false,
    userDataDir: PPTR_DATA_DIR,
    defaultViewport: null,
    ignoreDefaultArgs: ['--enable-automation'],
  })

  // Close the pages that existed at launch. The close is deferred with
  // nextTick so it is issued after newPage() below has been requested.
  {
    const stalePages = await browser.pages()
    process.nextTick(() => {
      for (const p of stalePages) {
        // a page may already be gone; do not let that surface as an unhandled rejection
        p.close().catch((e: unknown) => debug('close page failed: %o', e))
      }
    })
  }

  const page = await browser.newPage()

  // disable cache
  await page.setCacheEnabled(false)

  // intercept
  const client = await page.target().createCDPSession()
  // @ts-ignore
  const interceptManager = new RequestInterceptionManager(client)
  await interceptManager.intercept({
    urlPattern: `*/*.*.js`,
    // urlPattern: `*/app.*.js`,
    resourceType: 'Script',
    modifyResponse({ body, event }) {
      const basename = scriptBasename(event.request.url)

      // 1.xxx.js
      // app.xxx.js
      // utils.xxx.js
      if (!/^\w+\.\w+\.js$/.test(basename)) {
        return
      }

      body = processAppJs(body, basename)
      return { body }
    },
  })

  await page.goto('https://weread.qq.com/')

  const loginBtn = '.navBar_link_Login'
  const logined = await page.$$eval(loginBtn, (els) => els.length === 0)
  if (!logined) {
    // click "login"
    await page.click(loginBtn)

    // the user scans the QR code here

    // wait until the avatar shows up, i.e. login succeeded
    await page.waitForSelector('.wr_avatar.navBar_avatar', {
      timeout: 0,
    })
    console.log('登录完成')
  }

  const ua = await browser.userAgent()
  console.log('ua = %s', ua)

  return { browser, page }
}
-------------------------------------------------------------------------------- /src/utils/processContent/example-start-info.json: -------------------------------------------------------------------------------- 1 | { 2 | "bookId": "25462428", 3 | "bookInfo": { 4 | "bookId": "25462428", 5 | "title": "深入浅出WebAssembly", 6 | "author": "于航", 7 | "cover": "https://wfqqreader-1252317822.image.myqcloud.com/cover/428/25462428/s_25462428.jpg", 8 | "version": 1619444478, 9 | "format": "epub", 10 | "type": 0, 11 | "price": 89.6, 12 | "originalPrice": 0, 13 | "soldout": 0, 14 | "bookStatus": 1, 15 | "payType": 4097, 16 | "finished": 1, 17 | "maxFreeChapter": 9, 18 | "free": 0, 19 | "mcardDiscount": 0, 20 | "ispub": 1, 21 | "cpid": 2000000440, 22 | "centPrice": 8960, 23 | "category": "科学科技榜-计算机", 24 | "source": "所有书籍均已获得正版授权", 25 | "hasLecture": 0, 26 | "intro": "blabla", 27 | "lastChapterIdx": 40, 28 | "paperBook": { 29 | "skuId": "12474852" 30 | }, 31 | "chapterSize": 40, 32 | "updateTime": 1598794618, 33 | "onTime": 1562642264, 34 | "unitPrice": 0.05, 35 | "marketType": 0, 36 | "isbn": "9787121352171", 37 | "publisher": "电子工业出版社", 38 | "publishTime": "2018-11-01 00:00:00", 39 | "totalWords": 0, 40 | "publishPrice": 128, 41 | "bookSize": 1207117, 42 | "recommended": 0, 43 | "lectureRecommended": 0, 44 | "follow": 0, 45 | "secret": 1, 46 | "offline": 0, 47 | "lectureOffline": 0, 48 | "finishReading": 0, 49 | "isAutoPay": 0, 50 | "availables": 0, 51 | "paid": 0, 52 | "isChapterPaid": 0, 53 | "showLectureButton": 1, 54 | "wxtts": 1, 55 | "star": 77, 56 | "ratingCount": 24, 57 | "ratingDetail": { 58 | "one": 3, 59 | "two": 0, 60 | "three": 2, 61 | "four": 1, 62 | "five": 18, 63 | "recent": 3 64 | }, 65 | "copyrightInfo": { 66 | "id": 2000000440, 67 | "name": "电子工业出版社", 68 | "userVid": 0 69 | } 70 | }, 71 | "chapterInfos": [ 72 | { 73 | "chapterUid": 2, 74 | "chapterIdx": 2, 75 | "title": "版权信息", 76 | "paid": 0, 77 | "price": 0, 78 | "level": 1, 79 | "updateTime": 0, 80 | 
"wordCount": 129, 81 | "anchors": [] 82 | } 83 | ], 84 | "chapterContentHtml": "
html here
", 85 | "chapterContentHtmlArray": ["
html here
"], 86 | "chapterContentStyles": ".readerChapterContent{ color: red; }", 87 | "currentChapterId": 13 88 | } 89 | -------------------------------------------------------------------------------- /src/utils/processContent/index.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable @typescript-eslint/no-var-requires */ 2 | 3 | import { $esm, Info, getBookHtml } from '$common' 4 | import type { AnyNode as $AnyNode, Element as $Element, Cheerio, CheerioAPI } from 'cheerio' 5 | import { load as $load } from 'cheerio' 6 | import debugFactory from 'debug' 7 | import * as _ from 'lodash-es' 8 | import njk from 'nunjucks' 9 | import prettier from 'prettier' 10 | import { ImgSrcInfo } from '../epub-img.js' 11 | 12 | const debug = debugFactory('weread-spy:utils:processContent') 13 | const { require } = $esm(import.meta) 14 | const prettierConfig = require('@magicdawn/prettier-config') as prettier.Options 15 | 16 | type TransformImgSrc = (src: string) => string 17 | interface ProcessContentOptions { 18 | cssFilenames: string[] 19 | imgSrcInfo: ImgSrcInfo 20 | } 21 | 22 | const DATA_ATTR_WHITELIST = ['data-src', 'data-bg-img'] 23 | 24 | export default async function processContent(info: Info, options: ProcessContentOptions) { 25 | const { chapterContentHtml, chapterContentStyles, currentChapterId } = info 26 | const { cssFilenames, imgSrcInfo } = options 27 | debug('processContent for title=%s chapterUid=%s', info.bookInfo.title, currentChapterId) 28 | 29 | let html = getBookHtml(info) 30 | 31 | // apply templates 32 | html = applyTemplate({ style: chapterContentStyles, content: html, cssFilenames }) 33 | 34 | // new $ 35 | const $ = $load(html, { 36 | // @ts-ignore 37 | _useHtmlParser2: true, 38 | decodeEntities: false, 39 | lowerCaseTags: true, 40 | }) 41 | // debug('cheerio loaded') 42 | 43 | // remove all data-xxx 44 | traverse($.root()[0], $, removeDataAttr) 45 | // debug('removeDataAttr complete') 46 | // 
debug($.xml().trim()) 47 | 48 | // combine span 49 | traverse($.root()[0], $, combineTextSpan) 50 | // debug('removeUnusedSpan complete') 51 | // debug($.xml().trim()) 52 | 53 | /** 54 | * special cases 55 | */ 56 | 57 | //

我的第一本书《练习的心态》中 58 | // 显示效果差 59 | $('.fDropContent > .ftext').removeClass('ftext').data('removed-class', 'ftext') 60 | 61 | // 图片 62 | const transformImgSrc = (src: string) => imgSrcInfo[src]?.localFile 63 | const ctx: { transformImgSrc: TransformImgSrc; imgs: Array<{ src: string; newSrc: string }> } = { 64 | transformImgSrc, 65 | imgs: [], 66 | } 67 | traverse($.root()[0], $, fixImgSrc, ctx) 68 | // debug('fixImgSrc complete') 69 | 70 | // get xhtml 71 | html = $.xml().trim() 72 | 73 | // format 74 | try { 75 | // html = prettier.format(html, {...prettierConfig, parser: 'html'}) 76 | } catch (e) { 77 | console.warn('[prettier] format met error: currentChapterId = %s', currentChapterId) 78 | console.warn(e.stack || e) 79 | } 80 | 81 | // replace 82 | html = html.replace(/ /g, ' ') 83 | 84 | let style = chapterContentStyles 85 | try { 86 | style = await prettier.format(style, { ...prettierConfig, parser: 'css' }) 87 | } catch (e) { 88 | console.warn('[prettier] format met error: currentChapterId = %s', currentChapterId) 89 | console.error(e.stack || e) 90 | } 91 | 92 | return { 93 | xhtml: html, 94 | style, 95 | imgs: ctx.imgs, 96 | } 97 | } 98 | 99 | /** 100 | * get all img srcs 101 | */ 102 | 103 | export function getImgSrcs(html: string) { 104 | // new $ 105 | const $ = $load(html, { decodeEntities: false, xmlMode: true, lowerCaseTags: true }) 106 | 107 | // collect 108 | const srcs: string[] = [] 109 | traverse($.root()[0], $, collectImgSrc, srcs) 110 | 111 | return srcs 112 | } 113 | 114 | // 117 | function applyTemplate({ 118 | style, 119 | content, 120 | cssFilenames, 121 | }: { 122 | style: string 123 | content: string 124 | cssFilenames: string[] 125 | }) { 126 | const tpl = ` 127 | 128 | 129 | 130 | 131 | Document 132 | {%- for css in cssFilenames -%} 133 | 134 | {%- endfor %} 135 | 136 | 137 |

138 | {{ content | safe }} 139 |
140 | 141 | 142 | ` 143 | 144 | const str = njk 145 | .renderString(tpl, { 146 | style, 147 | content, 148 | cssFilenames, 149 | }) 150 | .trim() 151 | 152 | return str 153 | } 154 | // Visitor result: returning { traverseChildren: false } makes traverse() skip the node's children. 155 | type OnNodeResult = { traverseChildren?: boolean } | undefined | void 156 | type OnNode = (el: $AnyNode, $: CheerioAPI, extraData?: any) => OnNodeResult 157 | // Walks the tree depth-first: calls onNode on el, then recurses into the non-text children of tag/root nodes. 158 | function traverse(el: $AnyNode, $: CheerioAPI, onNode: OnNode, extraData?: any) { 159 | // self 160 | const { traverseChildren = true } = onNode(el, $, extraData) || {} 161 | 162 | // children 163 | if (traverseChildren && (el.type === 'tag' || el.type === 'root')) { 164 | el.childNodes.forEach((c) => { 165 | if (c.type === 'text') return 166 | traverse(c, $, onNode, extraData) 167 | }) 168 | } 169 | } 170 | // Visitor: removes every data-* attribute from a tag, except the names listed in DATA_ATTR_WHITELIST. 171 | function removeDataAttr(el: $Element, $: CheerioAPI): OnNodeResult { 172 | const $el = $(el) 173 | if (el.type === 'tag') { 174 | Object.keys(el.attribs || {}) 175 | .filter((k) => { 176 | return k.startsWith('data-') && !DATA_ATTR_WHITELIST.includes(k) 177 | }) 178 | .forEach((attr) => { 179 | $el.removeAttr(attr) 180 | }) 181 | } 182 | } 183 | // Visitor: collapses attribute-less span children of a tag into their combined text. 184 | function combineTextSpan(el: $Element, $: CheerioAPI): OnNodeResult { 185 | if (el.type !== 'tag') return 186 | if (!el.childNodes?.length) { 187 | return 188 | } 189 | // A simple text span is a span tag that carries no attributes at all. 190 | const isSimpleTextSpan = (c: $AnyNode) => 191 | c.type === 'tag' && 192 | c.tagName?.toLowerCase() === 'span' && 193 | Object.keys((c as $Element).attribs || {}).length === 0 194 | // el itself is a simple span: leave it alone and do not descend into it. 195 | if (isSimpleTextSpan(el)) { 196 | return { traverseChildren: false } 197 | } 198 | // Every child is a simple span: replace all children with their combined text and stop descending. 199 | const $el = $(el) 200 | const shouldCombine = el.childNodes.every(isSimpleTextSpan) 201 | if (shouldCombine) { 202 | const text = $el.text() 203 | $el.empty() 204 | $el.append(`${text}`) 205 | return { traverseChildren: false } 206 | } 207 | // Fewer than 10% of the children are something else: merge each run of adjacent simple spans. 208 | const rate = el.childNodes.filter((c) => !isSimpleTextSpan(c)).length / el.childNodes.length 209 | if (rate < 1 / 10) { 210 | const arr: Cheerio<$AnyNode>[] = [] 211 | let 
lastIsSimpleTextSpan = true 212 | // Group the children: adjacent simple spans accumulate into one Cheerio set, any other node gets its own entry. 213 | for (const c of el.childNodes) { 214 | if (isSimpleTextSpan(c)) { 215 | const cur$ = _.last(arr) 216 | if (cur$ && lastIsSimpleTextSpan) { 217 | arr[arr.length - 1] = cur$.add(c) 218 | } else { 219 | arr.push($(c)) 220 | } 221 | } else { 222 | arr.push($(c)) 223 | } 224 | lastIsSimpleTextSpan = isSimpleTextSpan(c) 225 | } 226 | // Rebuild the children from the groups: span runs become their combined text, other nodes are re-appended unchanged. 227 | $el.empty() 228 | 229 | for (const cur$ of arr) { 230 | if (cur$.toArray().every(isSimpleTextSpan)) { 231 | $el.append(`${cur$.text()}`) 232 | } else { 233 | $el.append(cur$) 234 | } 235 | } 236 | 237 | return { traverseChildren: true } 238 | } 239 | 240 | return { traverseChildren: true } 241 | } 242 | 243 | /** 244 | * Collect img src 245 | */ 246 | 247 | const CLASS_FOOTNOTE = 'qqreader-footnote' 248 | // Visitor: pushes the data-src (falling back to src) of each non-footnote img into ctx; inline background-image urls are recorded on the element as data-bg-img. 249 | function collectImgSrc(el: $Element, $: CheerioAPI, ctx: string[]): OnNodeResult { 250 | if (el.type === 'tag' && el.tagName?.toLowerCase?.() === 'img') { 251 | const $el = $(el) 252 | 253 | // Footnote images like the following are not processed 254 | // \"敲西瓜。日本的小孩在夏天常玩的游戏。小孩蒙着眼,手拿棍子,比赛谁先可以把西瓜敲碎。\" 261 | if ($el.hasClass(CLASS_FOOTNOTE)) { 262 | return 263 | } 264 | 265 | const src = ($el.data('src') as string | undefined) || $el.attr('src') 266 | if (src) { 267 | ctx.push(src) 268 | } 269 | } 270 | 271 | // style="background-image:url(https://res.weread.qq.com/wrepub/web/910419/copyright.jpg);" 272 | const style = el.type === 'tag' ? el.attribs?.style : '' 273 | if (style?.includes('background-image:')) { 274 | const m = /(?:^|; *?)background-image *?: *?url\(([\S]+?)\)/.exec(style) 275 | if (m?.[1]) { 276 | const src = m[1] 277 | $(el).attr('data-bg-img', src) // mark, has no effect, the result html will be abandoned 278 | 279 | // TODO: cannot handle this case 280 | //
312 | // ERROR(RSC-005): ./CSS新世界.epub/OEBPS/chapter-6.xhtml(20,181): Error while parsing file: value of attribute "width" is invalid; must be an integer 313 | const width = $(el).attr('width') 314 | const height = $(el).attr('height') 315 | if (width && isNaN(Number(width))) { 316 | $(el).css('width', width).removeAttr('width') 317 | } 318 | if (height && isNaN(Number(height))) { 319 | $(el).css('height', height).removeAttr('height') 320 | } 321 | 322 | return 323 | } 324 | 325 | // style="background-image:url(https://res.weread.qq.com/wrepub/web/910419/copyright.jpg);" 326 | const style: string = el.attribs?.style 327 | if (style?.includes('background-image:')) { 328 | const m = /(?:^|; *?)background-image *?: *?url\(([\S]+?)\)/.exec(style) 329 | if (m?.[1]) { 330 | const $el = $(el) 331 | const src = m[1] 332 | 333 | // transform 334 | const newSrc = ctx.transformImgSrc(src) 335 | 336 | if (newSrc) { 337 | ctx.imgs.push({ 338 | src, 339 | newSrc, 340 | }) 341 | 342 | // replace style 343 | const newStyle = style.replace(src, newSrc) 344 | $el.attr('style', newStyle) 345 | } 346 | 347 | // 当 src 404 时, 丢弃 style 348 | else { 349 | debug('fixImgSrc: transformImgSrc return empty for %s', src) 350 | 351 | const newStyle = style 352 | .split(';') 353 | .map((s) => s.trim()) 354 | .filter(Boolean) 355 | .filter((oneStyle) => !oneStyle.startsWith('background-image:')) 356 | .join(';') 357 | 358 | if (newStyle) { 359 | $el.attr('style', newStyle) 360 | debug('fixImgSrc: style=%s -> style=%s', style, newStyle) 361 | } else { 362 | $el.removeAttr('style') 363 | debug('fixImgSrc: removeAttr style=%s', style) 364 | } 365 | } 366 | } 367 | } 368 | } 369 | -------------------------------------------------------------------------------- /src/utils/processContent/worker/index.main.ts: -------------------------------------------------------------------------------- 1 | import { $esm } from '$common/index' 2 | import * as Comlink from 'comlink/dist/esm/comlink.mjs' 3 | import 
nodeEndpoint from 'comlink/dist/esm/node-adapter.mjs' // NOTE: node-adapter has no .js build 4 | import os from 'os' 5 | import { Worker } from 'worker_threads' 6 | 7 | import type processContent from '../index.js' 8 | type ProcessContent = typeof processContent 9 | 10 | const { __dirname } = $esm(import.meta) 11 | // Starts one worker thread from processContent.worker.js in __dirname and returns it together with its Comlink proxy. 12 | export function createWorker() { 13 | const workerFile = __dirname + '/processContent.worker.js' 14 | const worker = new Worker(workerFile) 15 | // @ts-ignore 16 | const api = Comlink.wrap(nodeEndpoint(worker)) as Comlink.Remote<{ 17 | processContent: ProcessContent 18 | }> 19 | return { api, nodeWorker: worker } 20 | } 21 | // Starts one worker per CPU core minus one, and never fewer than one. 22 | export function createWorkers() { 23 | const cpuCores = Math.max(1, os.cpus().length - 1) // leave one core free; clamp because os.cpus() can report 0 or 1 cores, which gave new Array(-1) or no workers 24 | return new Array(cpuCores).fill(0).map(() => { 25 | return createWorker() 26 | }) 27 | } 28 | -------------------------------------------------------------------------------- /src/utils/processContent/worker/index.worker.ts: -------------------------------------------------------------------------------- 1 | import * as Comlink from 'comlink/dist/esm/comlink.mjs' 2 | import nodeEndpoint from 'comlink/dist/esm/node-adapter.mjs' 3 | import { parentPort } from 'worker_threads' 4 | import processContent, { getImgSrcs } from '../index.js' 5 | 6 | const api = { 7 | processContent, 8 | getImgSrcs, 9 | } 10 | 11 | // eslint-disable-next-line @typescript-eslint/no-non-null-assertion 12 | // @ts-ignore 13 | Comlink.expose(api, nodeEndpoint(parentPort!)) 14 | -------------------------------------------------------------------------------- /test/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/magicdawn/weread-spy/13e70510a0b7343689fe1bbf76c5f83096396bc3/test/.gitkeep -------------------------------------------------------------------------------- /test/mocha.opts: -------------------------------------------------------------------------------- 1 | --require ts-node/register 2 | 
--watch-extensions ts 3 | --recursive 4 | --reporter spec 5 | --timeout 5000 6 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "include": ["src/**/*"], 3 | "compilerOptions": { 4 | "rootDir": "src", 5 | "outDir": "lib", 6 | "baseUrl": "./", 7 | "paths": { 8 | "$*": ["src/*.js", "src/*/index.js"] 9 | }, 10 | "allowSyntheticDefaultImports": true, 11 | "esModuleInterop": true, 12 | "resolveJsonModule": true, 13 | "target": "ES2020", 14 | "module": "Node16", 15 | "strict": false, 16 | "strictNullChecks": true, 17 | "skipLibCheck": true, 18 | "experimentalDecorators": true 19 | }, 20 | "ts-node": { 21 | "transpileOnly": true, 22 | "swc": true, 23 | "esm": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /tsup.config.ts: -------------------------------------------------------------------------------- 1 | import $esm from 'esm-utils' 2 | import { defineConfig } from 'tsup' 3 | 4 | const { __dirname } = $esm(import.meta) 5 | 6 | process.env.NODE_ENV ||= 'development' 7 | const prod = process.env.NODE_ENV === 'production' 8 | // tsup build config: two ESM entries for node (the CLI bin and the processContent worker), minified only when NODE_ENV is production. 9 | export default defineConfig({ 10 | entry: { 11 | 'bin': 'src/bin.ts', 12 | 'processContent.worker': 'src/utils/processContent/worker/index.worker.ts', 13 | }, 14 | format: 'esm', 15 | platform: 'node', 16 | target: 'node16', 17 | clean: true, 18 | minify: prod, 19 | env: { 20 | NODE_ENV: process.env.NODE_ENV, 21 | }, 22 | 23 | // NOTE: the esm build of the puppeteer-intercept-and-modify-requests package is broken 24 | noExternal: ['puppeteer-intercept-and-modify-requests'], 25 | external: ['why-is-node-running'], 26 | 27 | esbuildOptions(options, context) { 28 | // init 29 | options.external ||= [] 30 | 31 | options.external.push(__dirname + '/package.json') 32 | 33 | // use ascii in prod 34 | options.charset = prod ? 
undefined : 'utf8' 35 | }, 36 | }) 37 | --------------------------------------------------------------------------------