├── content.select
├── remove.select
├── stub.html
├── .gitignore
├── tools
├── package.json
└── unzimmer.js
├── LICENSE
├── stub.css
├── package.json
├── README.md
├── wikizimmer.js
└── zimmer.js
/content.select:
--------------------------------------------------------------------------------
1 | #bodyContent,
2 | article
3 |
--------------------------------------------------------------------------------
/remove.select:
--------------------------------------------------------------------------------
1 | script,
2 | #toc,
3 | .article-status,
4 | .thumbcaption .magnify,
5 | a[href*="action=edit"],
6 | .editsection,
7 | .mw-editsection,
8 | .thumbinner a[data-lon]
--------------------------------------------------------------------------------
/stub.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
10 |
11 |
12 |
19 |
20 |
21 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Logs
2 | logs
3 | *.log
4 | npm-debug.log*
5 |
6 | # Runtime data
7 | pids
8 | *.pid
9 | *.seed
10 |
11 | # Directory for instrumented libs generated by jscoverage/JSCover
12 | lib-cov
13 |
14 | # Coverage directory used by tools like istanbul
15 | coverage
16 |
17 | # nyc test coverage
18 | .nyc_output
19 |
20 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
21 | .grunt
22 |
23 | # node-waf configuration
24 | .lock-wscript
25 |
26 | # Compiled binary addons (http://nodejs.org/api/addons.html)
27 | build/Release
28 |
29 | # Dependency directories
30 | node_modules
31 | jspm_packages
32 |
33 | # Optional npm cache directory
34 | .npm
35 |
36 | # Optional REPL history
37 | .node_repl_history
38 |
--------------------------------------------------------------------------------
/tools/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "unzimmer",
3 | "version": "",
4 | "description": "",
5 | "main": "unzimmer.js",
6 | "scripts": {
7 | "test": "echo no tests"
8 | },
9 | "private": true,
10 | "keywords": [
11 | "mediawiki",
12 | "zim",
13 | "unpack"
14 | ],
15 | "dependencies": {
16 | "xz": "^1.3.0",
17 | "expand-home-dir": "*",
18 | "fs-extra": "^3.0.1",
19 | "mime-db": "*",
20 | "mime-types": "*",
21 | "generic-pool": "^3.1.7",
22 | "promised-read": "^2.0.1",
23 | "cheerio": "*",
24 | "commander": "^2.11.0",
25 | "csv-stringify": "^4.3.1",
26 | "moment": "^2.22.2",
27 | "moment-duration-format": "^2.2.2"
28 | },
29 | "engines": {
30 | "node": ">=8.0.0"
31 | },
32 | "bin": {
33 | "unzimmer": "./unzimmer.js"
34 | },
35 |   "author": "Vadim Shlyakhov",
36 | "license": "ISC"
37 | }
38 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2016 Vadim Shlyakhov
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/stub.css:
--------------------------------------------------------------------------------
1 | /*
2 | *
3 | * stub.css
4 | *
5 | */
6 | body#zim {
7 | font-size: 100%;
8 | width: inherit;
9 | padding: inherit;
10 | position: inherit;
11 | margin: inherit;
12 | /* background-image: none;*/
13 | }
14 | #zim #content {
15 | margin: 0px;
16 | padding: 2px;
17 | }
18 | #zim .mw-body-content {
19 | line-height: 1.6;
20 | font-size: 0.875em;
21 | }
22 | #zim .mw-body-content p {
23 | line-height: inherit;
24 | margin: 0.5em 0;
25 | }
26 | /*
27 | #zim #toc,
28 | #zim .article-status,
29 | #zim .thumbcaption .magnify,
30 | #zim .editsection,
31 | #zim .mw-editsection
32 | {
33 | display: none;
34 | }
35 | */
36 | #zim table.wikitable, table.nicetable {
37 | margin-right: 0;
38 | margin-left: 0;
39 | }
40 |
41 | /* for en.wikivoyage.org */
42 | /* suppress Kartographer chartlet */
43 | /*
44 | .thumbinner a[data-lon] {
45 | display: none;
46 | }
47 | */
48 | #zim #geoCoord {
49 | top: -20px;
50 | }
51 | #geoCoord img {
52 | display: none;
53 | }
54 | #zim #mw-customtoggle-mapToggle {
55 | display: none;
56 | }
57 |
58 | /* Wikia */
59 | body#zim .WikiaMainContent {
60 | width: inherit;
61 | }
62 | body#zim .WikiaPageBackground {
63 | width: inherit;
64 | height: inherit;
65 | position: inherit;
66 | left: inherit;
67 | top: inherit;
68 |
69 | }
70 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "zimmer",
3 | "version": "0.3.0-test",
4 | "description": "Zim file packer",
5 | "main": "zimmer.js",
6 | "scripts": {
7 | "test": "echo no tests"
8 | },
9 | "private": true,
10 | "keywords": [
11 | "mediawiki",
12 | "zim",
13 | "pack"
14 | ],
15 | "dependencies": {
16 | "animated-gif-detector": "^1.2.0",
17 | "cheerio": "*",
18 | "child-process": "*",
19 | "commander": "^2.20.3",
20 | "csv-parse": "*",
21 | "encodeurl": "^1.0.1",
22 | "expand-home-dir": "*",
23 | "fs-extra": "^3.0.1",
24 | "generic-pool": "^3.1.7",
25 | "html-minifier": "^3.5.21",
26 | "iconv-lite": "^0.4.17",
27 | "langs": "^2.0.0",
28 | "lzma-native": "^5.0.0",
29 | "mime-db": "*",
30 | "mime-types": "*",
31 | "mmmagic": "*",
32 | "moment": "^2.22.2",
33 | "moment-duration-format": "^2.2.2",
34 | "mozjpeg": "*",
35 | "mz": "^2.6.0",
36 | "quick-lru": "^1.0.0",
37 | "request": "*",
38 | "request-promise-native": "^1.0.5",
39 | "sharp": "^0.22.1",
40 | "sqlite": "^2.8.0",
41 | "sqlite3": "*",
42 | "uuid": "^3.4.0"
43 | },
44 | "engines": {
45 | "node": ">=10.4.0"
46 | },
47 | "bin": {
48 | "zimmer": "./zimmer.js",
49 | "wikizimmer": "./wikizimmer.js"
50 | },
51 |   "author": "Vadim Shlyakhov",
52 | "license": "ISC"
53 | }
54 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ***zimmer*** is a package for creating [ZIM](http://www.openzim.org/wiki/OpenZIM) files from Mediawiki-powered wikis.
2 |
3 | The package consists of 2 scripts:
4 |
5 | - wikizimmer.js — dumps the wiki's articles into a collection of static HTML files.
6 |
7 | - zimmer.js — builds a ZIM file from a collection of static HTML files. Historically, zimmer.js is mostly a drop-in replacement for [zimwriterfs](https://github.com/wikimedia/openzim/tree/master/zimwriterfs) with a notable exception: it doesn't support the *withFullTextIndex* option (index format is [not documented](http://www.openzim.org/wiki/ZIM_Index_Format)).
8 |
9 | `wikizimmer.js`, unlike [mwoffliner](https://github.com/openzim/mwoffliner), does not depend on [Parsoid](https://www.mediawiki.org/wiki/Parsoid) or [Redis](https://redis.io/), and `zimmer.js`, unlike [zimwriterfs](https://github.com/wikimedia/openzim/tree/master/zimwriterfs), doesn't depend on [zimlib](http://www.openzim.org/wiki/Zimlib).
10 |
11 | The package is relatively easy to install and it can even process some wikis running rather old versions of the Mediawiki engine.
12 |
13 | ## Installation
14 | Requirement: `node` version >= 10.4.0
15 |
16 | ### With npm globally
17 |
18 | ```
19 | npm i -g git+https://github.com/vadp/zimmer
20 | ```
21 |
22 | or
23 |
24 | ### Manually
25 |
26 | * Clone *zimmer* from Github or download ZIP
27 | * Install dependencies: `npm install`
28 | * Make `wikizimmer.js` and `zimmer.js` executable
29 | * Optionally symlink both scripts into some directory available in your $PATH:
30 |
31 | ```
32 | ln -s /full/path/to/wikizimmer.js ~/bin/wikizimmer
33 | ln -s /full/path/to/zimmer.js ~/bin/zimmer
34 | ```
35 |
36 | ## Usage
37 |
38 | Run either of scripts with '--help' switch to see the list of all options available.
39 |
40 | The process of creating a ZIM file from a wiki consists of 2 parts.
41 |
42 | Example:
43 |
44 | * Dumping a wiki to a local collection of static HTML files:
45 |
46 | `wikizimmer https://en.wikivoyage.org/wiki/Pisa`
47 |
48 | will dump all articles from the main name space (aka 0 or '') at the `https://en.wikivoyage.org` to the directory `en.wikivoyage.org`. The URL to a particular page is quite important in this case as this page's styling is used as a template for all other pages in the dump, so wikivoyage listings, for example, are rendered correctly at the static page of the dump.
49 |
50 | * Building a ZIM file:
51 |
52 | `zimmer --optimg en.wikivoyage.org`
53 |
54 | will pack the content of the `en.wikivoyage.org` into the `en.wikivoyage.org.zim`. zimmer.js with `--optimg` option will recompress the images in the dump to save some space.
55 |
56 | **Notes**:
57 | * wikizimmer.js requires public access both to the normal web interface and to the wiki's API interface.
58 | * To dump an HTTPS server with a self-signed certificate you need to set an environment variable: `NODE_TLS_REJECT_UNAUTHORIZED=0`
59 | * Most options of zimmer.js are optional, as it fetches the relevant metadata from the dump created by wikizimmer.js. Perhaps only the `--optimg` option is rather important if you want to save some space.
60 |
--------------------------------------------------------------------------------
/tools/unzimmer.js:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | ":" //# -*- mode: js -*-; exec /usr/bin/env node --max-old-space-size=9000 --stack-size=42000 "$0" "$@"
3 |
4 | "use strict";
5 |
6 | /************************************/
7 | /* MODULE VARIABLE SECTION **********/
8 | /************************************/
9 |
10 | const os = require('os')
11 | const osProcess = require('process')
12 | const osPath = require( 'path' )
13 |
14 | const expandHomeDir = require( 'expand-home-dir' )
15 | const fs = require( 'fs-extra' )
16 | const mimeDb = require( 'mime-db' )
17 | const mime = require( 'mime-types' )
18 |
19 | const packageInfo = require('./package.json')
20 | const genericPool = require( 'generic-pool' )
21 | const asyncRead = require('promised-read').read
22 | const cheerio = require('cheerio')
23 | const command = require('commander')
24 |
25 | const csvOutput = require('csv-stringify')
26 |
27 | const moment = require("moment")
28 | require("moment-duration-format")
29 |
30 | const startTime = Date.now()
31 |
// Format the span between two timestamps as d[d]hh:mm:ss.SSS for log prefixes.
function elapsedStr( from , to = Date.now()) {
    const span = moment.duration( to - from )
    return span.format( 'd[d]hh:mm:ss.SSS', { stopTrim: "h" } )
}
35 |
// Log with the elapsed run time prefixed; always prints (no verbosity gate).
function log ( ...args ) {
    console.log( elapsedStr( startTime ), ... args )
}

// Alias of log(); a separate name so call sites read as warnings.
function warning ( ...args ) {
    log( ...args )
}

// Log the message and terminate the process with a non-zero exit code.
function fatal ( ...args ) {
    log( ...args )
    osProcess.exit( 1 )
}
48 |
//~ var lzma = require('lzma-native')
// xz is a hard runtime dependency (cluster decompression); fail fast with a
// platform-specific hint when it cannot be loaded.
try {
    var lzma = require('xz')
} catch (er) {
    if ( os.type() == 'Windows_NT' ) {
        fatal( 'Module "xz" is not available on Windows' )
    } else {
        fatal( 'Module "xz" is required' )
    }
}
//~ var lzma = require('node-liblzma')
60 |
var srcPath;    // path to the source ZIM file (first CLI argument)
var outPath;    // output directory (second CLI argument, or derived from srcPath)
var src; // input file reader

var articles = null;    // directory entries, indexed by URL-pointer order
var metadata = [];      // [name, value] pairs from the 'M' namespace, written to metadata.csv
67 |
// Read a little-endian 64-bit unsigned integer as a JS number.
// NOTE: exact only for values below Number.MAX_SAFE_INTEGER (2^53).
function readUInt64LE(buf, offset) {
    const lo = buf.readUInt32LE( offset )
    const hi = buf.readUInt32LE( offset + 4 )
    return hi * 0x100000000 + lo
}
73 |
// Path of the temporary file holding blob `blobIdx` of cluster `clusterIdx`.
function blobPath(clusterIdx, blobIdx) {
    const fileName = `${clusterIdx}-${blobIdx}-blob`
    return osPath.join( outPath, fileName )
}
77 |
// Final on-disk path of an article: its (namespace-prefixed) url under outPath.
function articlePath(article) {
    return osPath.join( outPath, article.url )
}
81 |
82 | //
83 | // class Reader
84 | //
// Positioned reader over the source ZIM file.
// NOTE(review): the generic-pool token queue (default pool size) gates entry
// into read(), serialising access to the shared position cursor — confirm the
// default max pool size is 1, otherwise concurrent reads could interleave.
class Reader {
    constructor ( path ) {
        this.path = path;
        this.position = 0;  // implicit cursor, used when read() gets no position
        this.file = fs.open( path, 'r' )    // promise resolving to the fd

        // Pool of dummy tokens; acquiring one admits a caller into read().
        this.queue = genericPool.createPool(
            {
                async create () { return Symbol() },
                async destroy ( resource ) { },
            },
            {}
        )
    }

    // Read `length` bytes at `position` (or at the current cursor when omitted)
    // and return them as a Buffer. Advances the cursor past the read.
    // NOTE(review): the byte count returned by fs.read is ignored, so a short
    // read near EOF would silently yield a partially zero-filled buffer.
    async read ( length, position ) {

        const token = await this.queue.acquire()
        const fd = await this.file

        if (typeof position !== 'number')
            position = this.position
        this.position = position + length

        const data = Buffer.alloc(length)
        const bytes = await fs.read( fd, data, 0, length, position )
        this.queue.release( token )
        return data
    }

    // Drain pending reads, then close the underlying file descriptor.
    async close () {
        await this.queue.drain()
        const fd = await this.file
        await fs.close( fd )
    }

    // Offset at which the next cursor-based read would start.
    tell () {
        return this.position
    }
}
125 |
var headerLength = 80;  // fixed size of the ZIM header, in bytes

// ZIM header template: constant fields pre-filled, the rest overwritten by
// readHeader() from the actual file.
var header = {
    magicNumber: 72173914, // integer 0 4 Magic number to recognise the file format, must be 72173914
    version: 5, // integer 4 4 ZIM=5, bytes 1-2: major, bytes 3-4: minor version of the ZIM file format
    uuid: 0, // integer 8 16 unique id of this zim file
    articleCount: 0, // integer 24 4 total number of articles
    clusterCount: 0, // integer 28 4 total number of clusters
    urlPtrPos: 0, // integer 32 8 position of the directory pointerlist ordered by URL
    titlePtrPos: 0, // integer 40 8 position of the directory pointerlist ordered by Title
    clusterPtrPos: 0, // integer 48 8 position of the cluster pointer list
    mimeListPos: headerLength, // integer 56 8 position of the MIME type list (also header size)
    mainPage: 0xffffffff, // integer 64 4 main page or 0xffffffff if no main page
    layoutPage: 0xffffffff, // integer 68 4 layout page or 0xffffffff if no layout page
    checksumPos: 0, // integer 72 8 pointer to the md5checksum of this file without the checksum itself. This points always 16 bytes before the end of the file.
    geoIndexPos: 0, // integer 80 8 pointer to the geo index (optional). Present if mimeListPos is at least 80.
};
143 |
// Populate the mutable fields of `header` from the first 80 bytes of the file.
async function readHeader ( ) {
    log('reading header')
    const buf = await src.read( headerLength, 0 )

    // [ field, byte offset, width ] — width 4 reads UInt32LE, 8 reads UInt64LE
    const layout = [
        [ 'articleCount',  24, 4 ],
        [ 'clusterCount',  28, 4 ],
        [ 'urlPtrPos',     32, 8 ],
        [ 'titlePtrPos',   40, 8 ],
        [ 'clusterPtrPos', 48, 8 ],
        [ 'mimeListPos',   56, 8 ],
        [ 'mainPage',      64, 4 ],
        [ 'layoutPage',    68, 4 ],
    ]
    for ( const [ field, offset, width ] of layout ) {
        header[ field ] = width === 8
            ? readUInt64LE( buf, offset )
            : buf.readUInt32LE( offset )
    }

    log('header', header);
}
161 |
// Read the cluster pointer list and extract every cluster, sequentially.
async function processClusterList ( ) {
    log('reading ClusterPointers')
    const buf = await src.read( header.clusterCount * 8, header.clusterPtrPos )

    try {
        for ( let idx = 0; idx < header.clusterCount; idx++ )
            await processCluster( buf, idx )
    } catch ( err ) {
        fatal( 'processClusterList', err )
    }
}
174 |
// Extract every blob of cluster `clusterIdx` into its own file.
// `buf` holds the raw cluster pointer list; the cluster's file offset is read
// from it. A cluster is: 1 compression byte, a table of 4-byte blob offsets
// (relative to the table start), then the blob data.
// Fix: corrected the 'prematire' typo in the premature-end error message.
async function processCluster( buf, clusterIdx ) {
    var eof = false;

    const clusterOfs = readUInt64LE( buf, clusterIdx * 8 )

    // Read the compression byte that prefixes the cluster; bit 2 set means xz.
    async function readCompression () {
        const buf = await src.read( 1, clusterOfs )

        return buf.readUInt8(0) & 4; // xz compressed
    }

    // Open a stream positioned just past the compression byte, piped through
    // an xz decompressor when the cluster is compressed.
    async function getSource( isCompressed ) {
        var slice = fs.createReadStream(
            src.path,
            {
                start: clusterOfs + 1,
                // autoClose: false,
            }
        );

        slice.on('error', function (err) {
            console.error('processCluster', clusterIdx, 'input error', err);
        });

        slice.on('end', function () {
            log('processCluster', clusterIdx, 'input end');
            eof = true; // reaching EOF here is the normal way the loop stops
        });

        slice.on('close', function () {
            log('processCluster', clusterIdx, 'input closed');
            eof = true;
        });

        slice.on('open', function (fd) {
            log('processCluster', clusterIdx, 'input open', fd);
        });

        if ( isCompressed ) { // xz compressed
            const decompressed = new lzma.Decompressor()
            slice.pipe( decompressed )
            return decompressed
        }
        return slice
    }

    // Read the blob offset table. The first offset equals the table length in
    // bytes, so the expected number of offsets is firstOffset / 4.
    async function readOffsets ( input ) {
        const offsets = []
        let noffsets
        for ( var buf; buf = await asyncRead( input, 4 );) {
            var ofs = buf.readUInt32LE( 0 )
            if ( offsets.length == 0 ) {
                noffsets = ofs / 4
            }
            offsets.push(ofs)

            if ( offsets.length == noffsets ) {
                return offsets
            }
        }
        fatal( 'readOffsets premature stream end' )
    }

    // Write each blob (bytes offsets[i] .. offsets[i+1]) to its own file.
    async function dumpBlobs ( input, offsets ) {
        for ( let i=0; i < offsets.length-1; i++ ) {

            const blobLen = offsets[ i + 1 ] - offsets[ i ]
            const blob = blobLen === 0 ?
                Buffer.alloc(0)
                : await asyncRead( input, blobLen )
            await fs.outputFile( blobPath( clusterIdx, i ), blob )
        }
    }

    let input

    try {
        const isCompressed = await readCompression()
        log('processCluster', clusterIdx, header.clusterCount, isCompressed);

        input = await getSource( isCompressed )
        const offsets = await readOffsets( input )
        await dumpBlobs( input, offsets )
    } catch ( err ) {
        if (!eof) {
            input && input.destroy()
        }
        fatal( 'processCluster error', clusterIdx, header.clusterCount, err )
    }
}
275 |
// Read and parse the directory entry at article.offset. Starts with a
// 512-byte read and doubles until the NUL-terminated url+title strings fit.
// Fills article.mimeIdx, nameSpace, url, title and either redirectIndex or
// clusterIdx/blobIdx, and registers the parsed entry in `articles`.
// Fixes: error message referred to a nonexistent 'processdirEntry'; removed
// a redundant inner `var strOfs` re-declaration (same function scope).
async function getDirEntry ( article ) {
    let chunkLen = 512;
    let dirEntry

    // true  -> entry fully parsed (or needs no parsing);
    // false -> buffer too short, caller must re-read with a larger chunk.
    function parseDirEntry () {
        article.mimeIdx = dirEntry.readUInt16LE(0);
        article.nameSpace = dirEntry.toString('utf8', 3, 4);

        var strOfs = 16;
        if (article.mimeIdx == 0xfffe || article.mimeIdx == 0xfffd) {
            // linktarget or deleted entry
            return true // noop
        } else if (article.mimeIdx == 0xffff ) { //redirect
            strOfs = 12;
            article.redirectIndex = dirEntry.readUInt32LE(8);
        } else {
            article.clusterIdx = dirEntry.readUInt32LE(8);
            article.blobIdx = dirEntry.readUInt32LE(12);
        }

        // read url and title: two consecutive NUL-terminated strings
        var end = dirEntry.indexOf(0, strOfs);
        if (end != -1) {
            article.url = dirEntry.toString('utf8', strOfs, end);

            strOfs = end + 1;
            end = dirEntry.indexOf(0, strOfs);
            if (end != -1) {
                article.title = dirEntry.toString('utf8', strOfs, end);
            }
        }

        if (end == -1) // short buffer -- read more
            return false

        log('parseDirEntry', article.index, header.articleCount, '\n', article);

        articles[article.index] = article

        return true
    }

    try {
        while ( true ) {
            dirEntry = await src.read( chunkLen, article.offset )
            if ( parseDirEntry() )
                return article
            chunkLen *= 2
        }
    } catch ( err ) {
        fatal( 'getDirEntry read error', article.index, header.articleCount, err )
    }
}
329 |
// Move the extracted blob of `article` into place: 'M' (metadata) entries are
// collected into `metadata` and their blob deleted; everything else is moved
// to its url-derived path under outPath.
async function renameBlob( article ) {

    var bpath = blobPath(article.clusterIdx, article.blobIdx)

    if (article.nameSpace == 'M') { // metadata
        const data = await fs.readFile ( bpath, 'utf8' )
        metadata.push([article.url.toLowerCase(), data])
        return fs.unlink( bpath )
    }
    const apath = articlePath( article )

    log('renameBlob', article.index, header.articleCount, bpath, '->', apath )

    // NOTE(review): fs-extra >= 3 calls this option `overwrite`; `clobber` is
    // the legacy alias — confirm the pinned fs-extra version still honours it.
    return fs.move( bpath, apath, { clobber: true })
}
345 |
// Parse an article's HTML into a cheerio DOM. Non-'A'-namespace entries and
// unparseable content yield null.
async function loadArticle( article ) {
    if (article.nameSpace != 'A')
        return null

    const data = await fs.readFile( articlePath( article ))
    try {
        return cheerio.load( data )
    } catch ( err ) {
        log( 'cheerio.load error', err, data )
        return null
    }
}
359 |
// ZIM namespace letters recognised when stripping '../X/' prefixes from links.
var nameSpaces = ['-', 'A', 'B', 'I', 'J', 'M', 'U', 'W', 'X'];
361 |
// Rewrite intra-ZIM links in the parsed article: strip the leading
// absolute or '../<namespace>' prefix from src/href attributes so they
// resolve inside the unpacked directory tree.
// Bug fix: this file never required the 'url' module, so the first call threw
// a ReferenceError; require it locally to keep the fix self-contained.
function alterLinks( article, dom ) {
    const url = require( 'url' )

    var nameSpaceLink = function (elem, attr) {
        let link
        try {
            link = url.parse(elem.attribs[attr], true, true)
        } catch (err) {
            console.error('alterLinks', err.message, elem.attribs[attr], 'at', article.path)
            return
        }
        // Only local http(s)/schemeless links with a path component qualify.
        if ( (link.protocol && link.protocol != 'http:' && link.protocol != 'https:')
            || link.host || ! link.pathname)
            return

        var chunks = link.pathname.split('/')

        // Absolute path, or '../X/...' where X is a known namespace letter:
        // drop the first two segments to make the path namespace-relative.
        if ( chunks[0] == '' // abs path
            || chunks[0] == '..'
            && nameSpaces.indexOf(chunks[1]) != -1) {
            chunks.shift();
            chunks.shift();
            link.pathname = chunks.join('/');
            elem.attribs[attr] = url.format(link);
            return // OK
        }
        return
    }

    dom( '[src]' ).each( (i, elem) => nameSpaceLink( elem, 'src' ))
    dom( '[href]' ).each( (i, elem) => nameSpaceLink( elem, 'href' ))
}
394 |
// Process the directory entry at `articleIndex`: parse it, then either record
// a redirect, or move the entry's blob into place and rewrite links of HTML
// articles before saving them back.
// Bug fix: fs-extra's move()/unlink() promises resolve to undefined, so the
// old `if (! moved) return null` guard always fired and the link-rewriting
// path below was dead code; move failures already surface as rejections.
async function processArticle ( articleIndex ) {
    if ( articles[ articleIndex ] != null )
        return true // already processed

    const article = {
        index: articleIndex,
        offset: readUInt64LE( rawDirectory, articleIndex * 8 )
    }

    await getDirEntry( article )

    if ( article.mimeIdx == 0xfffe || article.mimeIdx == 0xfffd ) {
        // linktarget or deleted entry
        return true // noop
    }
    if ( article.mimeIdx == 0xffff ) { //redirect
        return storeRedirect( article )
    }

    await renameBlob( article )

    // Only 'A'-namespace HTML gets parsed; everything else is done now.
    const dom = await loadArticle( article )
    if (! dom )
        return null
    alterLinks( article, dom )
    return fs.outputFile( articlePath( article ), Buffer.from( dom.html() ))
}
423 |
var rawDirectory // raw buffer of the URL-ordered directory pointer list (8 bytes per entry)
425 |
// Read the URL-ordered pointer list and process every directory entry
// sequentially, then flush the redirects CSV stream if one was opened.
async function processArticleList () {
    log('reading ArticleList')
    articles = Array( header.articleCount )
    rawDirectory = await src.read(header.articleCount * 8, header.urlPtrPos )

    //~ log( 'articleOffsets', articleOffsets);

    for ( let i=0; i < header.articleCount; i++ ) {
        await processArticle( i )
    }
    log( '*** articles' )
    // forEach skips holes, so linktarget/deleted entries (never stored) are safe
    articles.forEach( (val, i ) => log( i, val.nameSpace, val.url ))

    if ( redirectOut )
        return new Promise( ( resolve, reject ) => {
            redirectOut.end( resolve )
        })
}
444 |
// Dump the title-ordered index: each 4-byte entry is an index into `articles`.
async function processTitleList () {
    log('reading Title List')
    const titleDirectory = await src.read( header.articleCount * 4, header.titlePtrPos )

    log( '*** titles' )
    for ( let i = 0; i < header.articleCount; i++ ) {
        const idx = titleDirectory.readUInt32LE( i * 4 )
        const entry = articles[ idx ]
        log( i, idx, entry.nameSpace, entry.title, '>', entry.url )
    }
}
457 |
var redirectOut = null  // lazily-created csv-stringify stream for redirects.csv

// Record a redirect into redirects.csv (tab-separated). If the redirect target
// has not been parsed yet, process it first and retry. The synthetic '-'
// namespace favicon/mainPage entries are skipped.
function storeRedirect ( article ) {
    log('storeRedirect', article)

    if (article.nameSpace == '-' && (article.url == 'favicon' || article.url == 'mainPage'))
        return

    if (! redirectOut) {
        redirectOut = csvOutput({delimiter: '\t'})
        redirectOut.pipe(fs.createWriteStream(osPath.join(outPath, '..', 'redirects.csv')))
    }

    var target = articles[ article.redirectIndex ]
    if (! target) { // the target article isn't yet processed
        return processArticle( article.redirectIndex )
        .then(() => storeRedirect( article ))
    }

    var item = [ article.nameSpace, article.url, article.title, target.url ]

    log('storeRedirect', item)

    // Respect back-pressure: when write() returns false, resume on 'drain'.
    return new Promise(( resolve, reject ) => {
        var write = function () {
            try {
                if (! redirectOut.write(item))
                    return redirectOut.once('drain', write)
                resolve( false )
            } catch ( err ) {
                reject( err )
            }
        }
        write()
    })
}
494 |
// Write the collected [name, value] metadata pairs to metadata.csv, one
// directory above outPath.
// Bug fix: the original wrapped the row-writing loop in an outer function
// that merely *defined* the real `write` and never invoked it, so no rows
// were ever written and the returned promise never settled.
function storeMetadata () {
    log('storeMetadata');
    if ( metadata.length == 0 )
        return

    var csv = csvOutput({ delimiter: ' ' })
    csv.pipe( fs.createWriteStream( osPath.join( outPath, '..', 'metadata.csv' )))

    return new Promise(( resolve, reject ) => {
        var i = 0;
        // Push rows until back-pressure (write() returns false), then resume
        // on 'drain'; resolve once the stream is ended.
        var write = function () {
            try {
                while (true) {
                    if ( i == metadata.length ) {
                        log('storeMetadata finished');
                        return csv.end( resolve );
                    }
                    var item = metadata[i];
                    log('storeMetadata', metadata.length, i, item);
                    if (! csv.write( item ))
                        break;
                    i++
                }
                csv.once( 'drain', write )
            } catch ( err ) {
                reject( err )
            }
        }
        write()
    })
}
528 |
// Top-level pipeline: open the ZIM file, read the header, extract clusters,
// then directory entries, dump the title list, store metadata, and close.
// The steps are order-dependent (e.g. articles need extracted cluster blobs).
async function core () {
    src = new Reader(srcPath)

    await readHeader( )
    await processClusterList()
    await processArticleList()
    await processTitleList()
    await storeMetadata()

    await src.close()
}
540 |
// CLI entry point: parse arguments, derive the output directory from the ZIM
// file name when not given, and run the pipeline.
// Fixes: core() was a floating promise (rejections were silently lost);
// dropped the malformed `.option( '-h -help' )` — commander already provides
// -h/--help; restored the argument placeholders in .arguments(), which were
// empty (NOTE(review): original spec text appears lost — confirm wording).
function main () {
    command
        .version( packageInfo.version )
        .arguments( '<zim-file> [output-dir]' )
        .description( 'Dumps a ZIM file' )
        .parse( process.argv )

    log( command.opts() )

    if (! command.args[0] )
        fatal( 'Source ZIM file argument is required' )

    srcPath = expandHomeDir( command.args[0] )
    outPath = expandHomeDir( command.args[1] )
    if (! outPath ) {
        var parsed = osPath.parse(srcPath)
        outPath = parsed.name   // default: ZIM file base name, extension dropped
    }

    core().catch( fatal )
}

main ()
562 |
--------------------------------------------------------------------------------
/wikizimmer.js:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | ":" //# -*- mode: js -*-; exec /usr/bin/env TMPDIR=/tmp node --max-old-space-size=2000 --stack-size=42000 "$0" "$@"
3 |
4 | // node --inspect-brk
5 |
6 | "use strict"
7 |
8 | /*
9 |
10 | MIT License
11 |
12 | Copyright (c) 2017 Vadim Shlyakhov
13 |
14 | Permission is hereby granted, free of charge, to any person obtaining a copy
15 | of this software and associated documentation files (the "Software"), to deal
16 | in the Software without restriction, including without limitation the rights
17 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
18 | copies of the Software, and to permit persons to whom the Software is
19 | furnished to do so, subject to the following conditions:
20 |
21 | The above copyright notice and this permission notice shall be included in all
22 | copies or substantial portions of the Software.
23 |
24 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
29 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 | SOFTWARE.
31 |
32 | */
33 |
34 | const packageInfo = require('./package.json');
35 | const os = require('os')
36 | const osProcess = require('process')
37 | const osPath = require( 'path' )
38 | const urlconv = require('url')
39 | const crypto = require("crypto")
40 |
41 | const command = require('commander')
42 | const fs = require('fs-extra')
43 | const requestPromise = require('request-promise-native')
44 | const sqlite = require( 'sqlite' )
45 | const cheerio = require('cheerio')
46 | const minify = require('html-minifier').minify
47 |
48 | const langs = require('langs')
49 | const encodeurl = require('encodeurl')
50 | const iconv = require('iconv-lite')
51 | const lru = require('quick-lru')
52 |
53 | const mimeTypes = require( 'mime-types' )
54 | const mmmagic = require( 'mmmagic' )
55 | const mimeMagic = new mmmagic.Magic( mmmagic.MAGIC_MIME_TYPE )
56 |
57 | const moment = require("moment")
58 | require("moment-duration-format")
59 |
60 | const cpuCount = os.cpus().length
61 |
62 | const startTime = Date.now()
63 |
// Format the span between two timestamps as d[d]hh:mm:ss.SSS for log prefixes.
function elapsedStr( from , to = Date.now()) {
    const span = moment.duration( to - from )
    return span.format( 'd[d]hh:mm:ss.SSS', { stopTrim: "h" } )
}
67 |
// Echo arguments to stdout unconditionally (log() below honours --quiet).
function print ( ...args ) {
    console.log.apply( console, args )
}
71 |
// Minimal progress indicator: emits one '.' per `slow` (= 100) invocations.
const tick = (( slow ) => {
    let calls = 0
    return () => {
        if ( calls++ % slow == 0 )
            osProcess.stdout.write( '.' )
    }
})( 100 )
79 |
// Verbosity-aware logging: silent with --quiet, timestamped lines with
// --verbose, otherwise just a progress tick.
function log ( ...args ) {
    if ( command.quiet )
        return
    if ( command.verbose ) {
        console.log( elapsedStr( startTime ), ...args )
        return
    }
    tick()
}
88 |
// Log a warning; routed through log(), so it is suppressed by --quiet.
// NOTE(review): log() prepends the elapsed time again in --verbose mode, so
// warnings get a doubled timestamp — looks unintentional, confirm.
function warning ( ...args ) {
    log( elapsedStr( startTime ), ...args )
}

// Print a stack trace for the message and abort with a non-zero exit code.
function fatal ( ...args ) {
    console.trace( elapsedStr( startTime ), ... args )
    osProcess.exit( 1 )
}
97 |
// Shared run state; populated by code outside this chunk.
const mimeIds = []

let articleCount = 0
let redirectCount = 0
let http // http request
103 |
// https://msdn.microsoft.com/en-us/library/windows/desktop/aa365247
// just in case https://www.mediawiki.org/wiki/Manual:Page_title
// Matches runs of characters invalid in Windows file names, malformed
// %-escapes, and a trailing dot or space (also invalid on Windows).
let sanitizeRE = /(?:[\x00-\x1F<>:"~\\\?\*]|%(?:[^0-9A-Fa-f]|[0-9A-Fa-f][^0-9A-Fa-f])|(?:[. ]$))+/g
107 |
// Make `name` usable as a file name on Windows: percent-encode forbidden
// characters, then replace '%' with '~'. Other platforms pass names through.
function sanitizeFN ( name ) { // after https://github.com/pillarjs/encodeurl
    if ( os.type() != 'Windows_NT' )
        return name
    const encoded = String( name ).replace( sanitizeRE, encodeURIComponent )
    return encoded.replace( /%/g, '~' )
}
115 |
// Detect the MIME type of a data buffer via libmagic, promisified.
function mimeFromData ( data ) {
    return new Promise(( resolve, reject ) => {
        mimeMagic.detect( data, ( error, mimeType ) =>
            error ? reject( error ) : resolve( mimeType )
        )
    })
}
125 |
// Default User-Agent sent with requests.
let UserAgent = `wikizimmer/${packageInfo.version} (https://github.com/vss-devel/zimmer)`
// NOTE(review): alternative browser-like UA; where it is chosen over the
// default is outside this chunk — confirm before relying on it.
const UserAgentFirefox = 'Mozilla/5.0 (X11; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/12.0'
128 |
/**
 * Build a rate-limited, per-host HTTP request function.
 *
 * Each target host gets its own Queue that keeps at most `maxTokens`
 * requests in flight and waits at least `interval` ms between request
 * starts. Requests that fail with a known transient network error code or
 * HTTP status are re-queued with a growing suppression delay.
 *
 * @param request       promise-returning HTTP client (request-promise style)
 * @param referenceUri  base URI: relative URLs are resolved against it and
 *                      it is sent as the Referer header
 * @param maxTokens     max concurrent requests per queue (default 1)
 * @param interval      minimum delay between request starts, ms (default 10)
 * @returns function ( query, queueId ) -> Promise of the full response
 */
function pooledRequest( request, referenceUri, maxTokens = 1, interval = 10 ) {
    const retryErrorCodes = [ 'EPROTO', 'ECONNRESET', 'ESOCKETTIMEDOUT' ]
    const retryStatusCodes = [ 408, 420, 423, 429, 500, 503, 504, 509, 524 ]
    const retryLimit = 10
    // --retry-external limits retries for off-site URLs; defaults to retryLimit
    const retryExternal = command.retryExternal == null ? retryLimit : command.retryExternal
    const requestTimeout = 5 * 60 * 1000 // 5 minutes per request
    const refHost = urlconv.parse( referenceUri ).host
    const hostQueues = {} // queueId (usually host name) -> Queue

    class Queue {
        constructor () {
            this.queue = []            // pending queries (FIFO; priority unshifts)
            this.timer = null          // inter-request pacing timer
            this.supressTimer = null   // back-off timer set while paused after a retry
            this.supressTimeout = 60 * 1000
            this.tokenCounter = 0      // requests currently in flight
            this.interval = interval   // current pacing interval; doubled under heavy retrying
        }

        // Schedule the next run() after `interval` ms, unless the queue is
        // currently suppressed by a retry back-off. (sic: "reschedule")
        reshedule () {
            if ( this.supressTimer )
                return
            this.timer = setTimeout(
                () => ( this.timer = null, this.run() ),
                this.interval
            )
        }

        // Suppress the queue for retries * supressTimeout ms after a failed
        // request, then resume scheduling.
        pause ( query ) {
            clearTimeout( this.timer )
            this.timer = null

            clearTimeout( this.supressTimer )
            this.supressTimer = setTimeout(
                // NOTE(review): cleared to `false` here but `null` elsewhere;
                // harmless since only truthiness is ever checked
                () => ( this.supressTimer = false, this.reshedule()),
                query.retries * this.supressTimeout
            )
        }

        // Decide whether the error is retryable; if so, re-queue the query,
        // pause the queue and return true. Doubles the pacing interval once
        // more than half the retry budget is spent.
        retry ( query, error ) {
            const retryCause = retryStatusCodes.includes( error.statusCode ) ? error.statusCode :
                error.cause && retryErrorCodes.includes( error.cause.code ) ? error.cause.code : false
            const maxRetries = query.external ? retryExternal : retryLimit
            if ( ! retryCause || query.retries > maxRetries)
                return false

            if ( query.retries > maxRetries / 2 ) {
                this.interval = this.interval * 2
            }
            query.retries ++

            log( 'retry request', query.retries, this.interval, error.name, retryCause, error.options.uri || error.options.url ) // , query )
            this.queue.push( query )
            this.pause( query )
            return true
        }

        // Execute one query; settle its promise on success or non-retryable
        // failure, otherwise leave it re-queued by retry().
        async submit ( query ) {
            this.tokenCounter ++
            try {
                const reply = await request( query )
                this.tokenCounter --
                if ( reply )
                    query.resolve( reply )
                else
                    query.reject( )
                this.reshedule()
            } catch ( error ) {
                this.tokenCounter --
                if ( ! this.retry( query, error )) {
                    warning( 'HTTP error', error.cause && error.cause.code || error.statusCode, error.options.uri || error.options.url )
                    query.reject( error )
                    this.reshedule()
                    return
                }
            }
        }

        // Start the next queued query if the queue is idle, not suppressed,
        // and under the concurrency limit.
        run () {
            if ( this.timer || this.supressTimer || this.tokenCounter >= maxTokens )
                return
            const query = this.queue.shift()
            if ( query ) {
                //~ if ( query.retries > 0 )
                //~ debugger
                this.submit( query )
                this.reshedule()
            }
        }

        // Enqueue a query (priority queries jump the line) and return a
        // promise settled when the request finally succeeds or gives up.
        append ( query ) {
            return new Promise(( resolve, reject ) => {
                query.resolve = resolve
                query.reject = reject
                query.retries = 0

                if ( query.priority )
                    this.queue.unshift( query )
                else
                    this.queue.push( query )

                this.run()
            })
        }
    }

    // Normalize the query in place: resolve the URL against referenceUri,
    // mark it external when it targets another host, and set the standard
    // headers/timeout options.
    // NOTE(review): for a string or URL-object argument a fresh options
    // object is built here but the caller below keeps using the original
    // value — callers in this file appear to always pass option objects;
    // confirm before relying on string inputs.
    function processOptions ( query ) {
        let url
        if ( typeof query === 'string' || query.href !== undefined ) {
            // string or URL object
            url = query
            query = {}
        } else {
            url = query.uri || query.url
            delete query.uri
        }
        query.url = urlconv.resolve( referenceUri, url )
        query.host = urlconv.parse( query.url ).host
        query.external = query.host != refHost

        if ( ! query.headers )
            query.headers = {}
        query.headers[ 'User-Agent' ] = UserAgent
        query.headers[ 'Referer' ] = referenceUri
        query.resolveWithFullResponse = true
        query.timeout = requestTimeout
        query.forever = true

        log( '^', decodeURI( query.url ), query.qs || '' )

        return query
    }

    // The pooled request function: route the query to its host's queue
    // (creating the queue on first use) and return the settled promise.
    return function ( query, queueId ) {
        processOptions( query )
        if ( ! queueId )
            queueId = query.host
        let queue = hostQueues[ queueId ]
        if ( ! queue ) {
            queue = new Queue
            hostQueues[ queueId ] = queue
        }
        return queue.append( query )
    }
}
274 |
/**
 * Call the MediaWiki web API and return the parsed JSON reply.
 * For form POSTs the format flag goes into the form body, otherwise into
 * the query-string params. Rejects with the API's error/warning payload
 * when one is present.
 * NOTE(review): MediaWiki reports warnings under `warnings` (plural), so
 * the `res.warning` check below may never match — confirm.
 */
async function api ( params, options = {} ) {
    const postingForm = options.method == 'POST' && options.form
    if ( postingForm )
        options.form.format = 'json'
    else
        params.format = 'json'
    options.url = wiki.apiUrl
    options.qs = params
    const reply = await http( options )
    const res = JSON.parse( reply.body )
    const problem = res.error || res.warning
    return problem ? Promise.reject( problem ) : res
}
288 |
// POST variant of api(): the parameters travel as a form body rather than
// as a query string.
function apiPost( params ) {
    const options = {
        method: 'POST',
        form: params,
    }
    return api( null, options )
}
295 |
/**
 * Tracks which MediaWiki namespaces are scheduled for download.
 *
 * Builds a lookup of namespace info objects keyed by numeric id, local
 * name ('*'), canonical name and any aliases, and keeps a FIFO of the
 * namespace ids scheduled so far (drained via the iterator).
 */
class NameSpaceSet {
    // SiteInfo: parsed result of the siteinfo API query, providing
    // `namespaces` (id -> info) and optionally `namespacealiases`.
    constructor ( SiteInfo ) {
        this.nameSpaces = {}
        this.queue = []
        this.scheduled = new Set
        for ( const ns of Object.keys( SiteInfo.namespaces )) {
            const nsInfo = SiteInfo.namespaces[ ns ]
            // index the same info object under every name it answers to
            this.nameSpaces[ ns ] = nsInfo
            if ( nsInfo[ '*' ] !== undefined )
                this.nameSpaces[ nsInfo[ '*' ]] = nsInfo
            if ( nsInfo.canonical !== undefined )
                this.nameSpaces[ nsInfo.canonical ] = nsInfo
        }
        if ( SiteInfo.namespacealiases ) {
            SiteInfo.namespacealiases.forEach( aliasInfo =>
                this.nameSpaces[ aliasInfo[ '*' ]] = this.nameSpaces[ aliasInfo.id ]
            )
        }
    }

    // True when the namespace id has been scheduled for download.
    isScheduled ( nsId ) {
        return this.scheduled.has( nsId )
    }

    // Decide from a page title's prefix whether the page should be fetched.
    // Titles without a colon, or with a prefix that is not a known
    // namespace, are treated as main-space content and always fetched.
    toBeDownloaded ( title ) {
        const colIndex = title.indexOf( ':' )
        if ( colIndex == -1 )
            return true
        const prefix = title.slice( 0, colIndex )
        const ns = this.nameSpaces[ prefix ]
        if ( ns !== undefined ) {
            return this.isScheduled( ns.id )
        }
        return true
    }

    // Schedule a comma-separated list of namespace ids (default: '0', the
    // main article namespace).
    toDownload ( nsList = '0' ) {
        // forEach, not map: the iteration is purely for its side effects
        nsList.split( ',' ).forEach( nsId => this.schedule( nsId ))
    }

    // Schedule a single namespace (by id, name or alias); aborts the run
    // via fatal() when the wiki has no such namespace. Duplicates are
    // ignored so each id is queued at most once.
    schedule ( nsId ) {
        const ns = this.nameSpaces[ nsId ]
        if ( ! ns ) {
            fatal( 'This wiki does not have name space', nsId )
            return
        }
        if ( ! this.isScheduled( ns.id )) {
            this.scheduled.add( ns.id )
            this.queue.push( ns.id )
        }
    }

    // Drain the scheduled namespace ids in FIFO order.
    * [Symbol.iterator] () {
        while ( this.queue.length != 0 ) {
            yield this.queue.shift()
        }
    }
}
354 |
// Shared run state, filled in during start-up and read throughout:
//   outPath    - output directory the downloaded tree is written under
//   apiUrl     - the wiki's api.php endpoint (used by api())
//   metadata   - ZIM metadata key/value pairs
//   nameSpaces - a NameSpaceSet instance
// Further fields (db, articleUriPrefix, articlePath, contentSelector,
// pageTemplate, ...) are attached at runtime — see their uses in
// WikiItem/ArticleStub/Article.
const wiki = {
    outPath: null,
    apiUrl: null,
    metadata: {},
    nameSpaces: null,
}
361 |
/**
 * Base class for every item stored into the ZIM tree: fetches its URL,
 * derives MIME type / text encoding / revision, writes the payload under
 * wiki.outPath and records a row in the articles table.
 */
class WikiItem {
    // zimNameSpace: single-letter ZIM namespace (e.g. 'A' for articles);
    // url: source URL; title: display title (may be undefined).
    constructor ( zimNameSpace, url, title ) {
        this.encoding = null      // charset for text payloads, set by load()
        this.revision = 0         // 0 = derive from Last-Modified in load()
        this.id = null            // articles-table row id, set by storeMetadata()
        this.loadPriority = false // priority flag passed to the request pool
        Object.assign( this, { zimNameSpace, url, title })
    }

    // Fetch the payload once — the load() promise is cached in this.data —
    // and pass it through preProcess() on every call.
    async getData () {
        let data = await ( this.data !== undefined ? this.data : ( this.data = this.load( )))
        return this.preProcess( data )
    }

    // Subclass hook (e.g. Article rewrites HTML); base is the identity.
    preProcess ( data ) {
        return data
    }

    // Apply the --url-replace [pattern, replacement] pairs to this.url;
    // returns the URL unchanged when the option is not set.
    urlReplacements () {
        if ( typeof command.urlReplace != 'object' ) {
            return this.url
        } else {
            return command.urlReplace.reduce(
                ( acc, [ patt, repl ]) => acc.replace( patt, repl ),
                this.url
            )
        }
    }

    // True when this.url contains any of the --url-blacklist substrings.
    blackListed () {
        if ( typeof command.urlBlacklist != 'object' ) {
            return false
        }
        return command.urlBlacklist.some( patt => this.url.includes( patt ))
    }

    // Fetch the resource. Updates this.url (after redirects), this.headers,
    // this.mimeType and this.encoding, and derives a fallback revision from
    // the Last-Modified header. Returns the body: decoded to a string for
    // text/* types, otherwise the raw Buffer.
    async load () {
        let resp
        try {
            resp = await http({
                url: this.urlReplacements(),
                encoding: null,
                priority: this.loadPriority
            })
        } catch ( error ) {
            // Without --download-errors (or for external URLs and plain
            // 404/400) the error propagates to the caller; otherwise any
            // other failure on a local URL aborts the whole run.
            if ( ! command.downloadErrors || error.options.external || error.statusCode == 404 || error.statusCode == 400 ) {
                throw error
            }
            fatal( 'Fatal load error' ) // exits the process
            //~ return Promise.reject( new Error( 'Load error' ))
        }
        let data = resp.body

        this.url = resp.request.href // possibly redirected
        this.headers = resp.headers
        if ( ! this.revision ) {
            // fallback revision: whole seconds elapsed since 2000-01-01
            const modified = this.headers[ 'last-modified' ] // "Tue, 27 Jun 2017 14:37:49 GMT"
            const dateBasedRevision = Math.round(( Date.parse( modified ) - Date.parse( '2000-01-01' )) / 1000 ) || 0
            this.revision = dateBasedRevision
        }

        // Split "type/subtype; charset=..." into MIME type and parameters.
        const contentType = resp.headers[ "content-type" ]
        let csplit = contentType.split( ';' )
        this.mimeType = csplit[ 0 ]

        if ( this.mimeType.split( '/' )[ 0 ] == 'text' ) {
            this.encoding = 'utf-8'
            if ( csplit.length > 1 && csplit[ 1 ].includes( 'charset=' )) {
                this.encoding = csplit[ 1 ].split( '=' )[ 1 ]
            }
        }

        // Some servers fall back to this generic type: sniff the real type
        // from the payload bytes instead, returning the raw data.
        if ( this.mimeType == 'application/x-www-form-urlencoded' ) {
            try {
                const mimeType = await mimeFromData( data )
                this.mimeType = mimeType
                return data
            } catch ( err ) {
                // NOTE(review): sniffing errors are deliberately swallowed
                // and the declared MIME type is kept
            }
        }

        // Decode text payloads from their declared charset to a JS string.
        if ( Buffer.isBuffer( data ) && this.encoding != null ) {
            data = iconv.decode( data, this.encoding )
        }

        return data
    }

    // File name part of the URL path, decoded and sanitized for the FS.
    basePath () {
        const purl = urlconv.parse( this.url )
        const pathp = osPath.parse( purl.pathname )
        return sanitizeFN( decodeURIComponent( pathp.base ))
    }

    // Path inside the output tree: "<zim namespace>/<base name>".
    localPath () {
        return this.zimNameSpace + '/' + this.basePath()
    }

    // Prefix `path` with enough '../' to climb back to the tree root from
    // this item's depth (or './' when it already sits at the root).
    relativePath ( path ) {
        const toTop = '../'.repeat( this.basePath().split( '/' ).length - 1 )
        return ( toTop.length > 0 ? toTop : './' ) + path
    }

    // Key in the ZIM URL index: namespace letter + base path.
    urlKey () {
        return this.zimNameSpace + this.basePath()
    }

    // Key in the ZIM title index; falls back to the URL key when the item
    // has no title.
    titleKey () {
        return this.title ? this.zimNameSpace + this.title : this.urlKey()
    }

    // Numeric id of this.mimeType in the global mimeIds registry, appending
    // the type on first sight. Aborts when load() never set a MIME type.
    mimeId () {
        if ( this.mimeType == null )
            fatal( 'this.mimeType == null', this )
        let id = mimeIds.indexOf( this.mimeType )
        if ( id == -1 ) {
            id = mimeIds.length
            mimeIds.push( this.mimeType )
        }
        return id
    }

    // Write the payload under wiki.outPath; a null/undefined payload is a
    // no-op (returns undefined instead of a promise).
    storeData ( data ) {
        if ( data == null )
            return

        const savePath = osPath.join( wiki.outPath, this.localPath())
        log( '+', savePath )

        return fs.outputFile( savePath, data )
    }

    // Insert this item's row into the articles table and remember its id.
    // A SQLITE_CONSTRAINT error (duplicate key) resolves to null; any other
    // DB error aborts the run.
    async storeMetadata ( ) {
        const row = [
            this.urlKey(),
            this.titleKey(),
            this.revision,
            this.mimeId(),
        ]
        try {
            const res = await wiki.db.run(
                'INSERT INTO articles ( urlKey, titleKey, revision, mimeId ) VALUES ( ?,?,?,? )',
                row
            )
            //~ log( 'storeMetadata res', row, res )
            this.id = res.stmt.lastID
            ++ articleCount
            return this.id
        } catch ( err ) {
            if ( err.code == "SQLITE_CONSTRAINT" )
                return null
            fatal( 'storeMetadata error', err )
        }

    }

    // Fetch, write and register the item. Returns its local path, or ''
    // when the URL is blacklisted or any step fails (failures are logged,
    // not rethrown).
    async save () {
        if ( this.blackListed() )
            return ''
        try {
            const data = await this.getData()
            await this.storeData( data )
            await this.storeMetadata()
            return this.localPath()
        } catch ( err ) {
            warning( 'Save error', err.name, this.url, '->', this.localPath())
            return ''
        }
    }
}
532 |
533 | // {
534 | // "pageid": 10,
535 | // "ns": 0,
536 | // "title": "Baltic Sea",
537 | // "touched": "2017-06-27T14:37:49Z",
538 | // "lastrevid": 168879,
539 | // "counter": 62340,
540 | // "length": 9324,
541 | // "fullurl": "http:\/\/www.cruiserswiki.org\/wiki\/Baltic_Sea",
542 | // "editurl": "http:\/\/www.cruiserswiki.org\/index.php?title=Baltic_Sea&action=edit"
543 | // }
544 | // {
545 | // "ns": 0,
546 | // "title": "Anchorages of Lesvos Island",
547 | // "missing": "",
548 | // "fullurl": "http:\/\/www.cruiserswiki.org\/wiki\/Anchorages_of_Lesvos_Island",
549 | // "editurl": "http:\/\/www.cruiserswiki.org\/index.php?title=Anchorages_of_Lesvos_Island&action=edit"
550 | // }
// A lightweight article record built from a MediaWiki page-info object
// (pageid, title, fullurl, lastrevid, ...); lives in the 'A' namespace and
// maps the page URL back to a title and an on-disk path.
class ArticleStub extends WikiItem {
    constructor ( pageInfo ) {
        super( 'A', urlconv.resolve( wiki.articleUriPrefix, pageInfo.fullurl ), pageInfo.title )
        this.info = pageInfo
        this.mwId = pageInfo.pageid
        this.revision = pageInfo.lastrevid
    }

    // Title of the article: the explicit title when present, otherwise
    // recovered from the URL ('title' query parameter or the path after
    // the article prefix, underscores mapped to spaces). Null for URLs
    // outside this wiki.
    getTitle () {
        if ( this.title )
            return this.title
        const isLocal = this.url && this.url.startsWith( wiki.articleUriPrefix )
        if ( ! isLocal )
            return null // not a local article
        const parsed = urlconv.parse( this.url, true )
        const rawTitle = parsed.query[ 'title' ] || parsed.pathname.replace( wiki.articlePath, '' )
        return decodeURIComponent( rawTitle.replace( /_/g, ' ' ))
    }

    // On-disk file name: the URL-derived page name, decoded, sanitized and
    // suffixed with '.html'. Null for URLs outside this wiki.
    basePath () {
        const isLocal = this.url && this.url.startsWith( wiki.articleUriPrefix )
        if ( ! isLocal )
            return null // not a local article
        const parsed = urlconv.parse( this.url, true )
        const rawName = parsed.query[ 'title' ] || parsed.pathname.replace( wiki.articlePath, '' )
        return sanitizeFN( decodeURIComponent( rawName )) + '.html'
    }
}
579 |
580 | class Article extends ArticleStub {
581 | constructor ( pageInfo ) {
582 | super( pageInfo )
583 | }
584 |
585 | async preProcess( data ) {
586 | let src
587 | let out
588 | try {
589 | src = cheerio.load( data )
590 | } catch ( e ) {
591 | log( 'cheerio.load error', e, data )
592 | return data
593 | }
594 | try {
595 | const content = src( wiki.contentSelector )
596 | if ( content.length == 0 ) {
597 | fatal( "Article.preProcess -- fatal error: Can't find article's content:", this.title )
598 | }
599 |
600 | const dom = cheerio.load( wiki.pageTemplate )
601 | dom( 'title' ).text( this.title )
602 |
603 | dom( '#bodyContent' ).replaceWith( content[ 0 ] )
604 |
605 | // display content inside