├── .gitignore
├── LICENSE
├── README.md
├── exportMediaWiki2Html.py
└── requirements.txt


/.gitignore:
--------------------------------------------------------------------------------
1 | .venv
2 | .env
3 | venv
4 | env
5 | 
6 | 
7 | export
8 | *.html


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 SolidCharity
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | Export MediaWiki to HTML
 2 | ========================
 3 | 
 4 | The task is to export a MediaWiki to HTML.
 5 | 
 6 | There is the extension DumpHTML, but it is unmaintained: https://www.mediawiki.org/wiki/Extension%3aDumpHTML
 7 | 
 8 | This Python script supports the following features:
 9 | 
10 | * links between the pages
11 | * links to anchors
12 | * links to non-existing pages
13 | * directly embedded images
14 | * thumbnails
15 | * supports authentication for dumping a protected wiki
16 | * export all (currently up to 500) pages, or export a single page
17 | 
18 | You need to use a bot password, to make the script work, see [[Special:BotPasswords]] / https://www.mediawiki.org/wiki/Manual:Bot_passwords
19 | 
20 | Install
21 | =======
22 | 
23 |     git clone https://github.com/SolidCharity/exportMediaWiki2HTML.git
24 |     cd exportMediaWiki2HTML
25 |     python3 -m venv .venv
26 |     source .venv/bin/activate
27 |     pip install -r requirements.txt
28 | 
29 | Usage
30 | =====
31 | 
32 | For all commands, you need to activate the virtual environment first:
33 | 
34 |     cd exportMediaWiki2HTML
35 |     source .venv/bin/activate
36 | 
37 | Please pass the url of the wiki
38 | 
39 |     python3 exportMediaWiki2Html.py --url https://mywiki.example.org
40 | 
41 | Optionally pass the page id of the page you want to download, eg. for debugging:
42 | 
43 |     python3 exportMediaWiki2Html.py --url https://mywiki.example.org --page 180
44 | 
45 | Optionally pass the name of a Bot and the Bot password (create a Bot at https://wiki.example.org/index.php?title=Spezial:BotPasswords):
46 | 
47 |     python3 exportMediaWiki2Html.py --url https://mywiki.example.org --user "myuser@botname" --password "botpwd" [pageid]
48 | 
49 | 
50 | You can use `--help` to see all options.
51 | 
52 | Contribute
53 | ==========
54 | 
55 | Feel free to file any issues, and Pull Requests are welcome as well!
56 | 


--------------------------------------------------------------------------------
/exportMediaWiki2Html.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python3
  2 | 
  3 | # Author: Timotheus Pokorra <timotheus.pokorra@solidcharity.com>
  4 | # source hosted at https://github.com/SolidCharity/exportMediaWiki2HTML
  5 | # licensed under the MIT license
  6 | # Copyright 2020-2021 Timotheus Pokorra
  7 | 
  8 | import os
  9 | from urllib import parse
 10 | import requests
 11 | import json
 12 | import re
 13 | from pathlib import Path
 14 | import argparse
 15 | 
 16 | description = """
 17 | Export MediaWiki pages to HTML
 18 | Call like this:
 19 |    ./exportMediaWiki2Html.py --url=https://mywiki.example.org
 20 | 
 21 |    Optionally pass the page id of the page you want to download, eg. for debugging:
 22 |    ./exportMediaWiki2Html.py --url=https://mywiki.example.org --page=180
 23 | 
 24 |    Optionally pass the page id of the category, all pages with that category will be exported:
 25 |    ./exportMediaWiki2Html.py --url=https://mywiki.example.org --category=22
 26 | 
 27 |    Optionally pass the namespace id, only pages in that namespace will be exported:
 28 |    ./exportMediaWiki2Html.py --url=https://mywiki.example.org --namespace=0
 29 | 
 30 |    Optionally pass the username and password:
 31 |    ./exportMediaWiki2Html.py --url=https://mywiki.example.org --username="myusername@botname" --password=botsecret
 32 | 
 33 |    Optionally pass the directory to dump the export to:
 34 |    ./exportMediaWiki2Html.py --url=https://mywiki.example.org --outputDir=export
 35 | """
 36 | parser = argparse.ArgumentParser(description=description, formatter_class=argparse.RawDescriptionHelpFormatter)
 37 | 
 38 | parser.add_argument('-l','--url', help='The url of the wiki',required=True)
 39 | parser.add_argument('-u','--username', help='Your username and bot name, eg. "myuser@botname"',required=False)
 40 | parser.add_argument('-p','--password', help='Your bot password',required=False)
 41 | parser.add_argument('-c','--category', help='The category to export',required=False)
 42 | parser.add_argument('-g','--page', help='The page to export',required=False)
 43 | parser.add_argument('-s', '--namespace', help='The namespace to export', required=False)
 44 | parser.add_argument('-n', '--numberOfPages', help='The number of pages to export, or max', required=False, default=500)
 45 | parser.add_argument('-o', '--outputDir', help='The destination directory for the export', type=Path, required=False, default="export")
 46 | parser.add_argument('--shortUrl', help='Custom short url path for the wiki', required=False, default='wiki/')
 47 | parser.add_argument('--listPages', help='List available pages', required=False, default=False, action='store_true')
 48 | parser.add_argument('--dontOverwrite', help='Skip already downloaded files', required=False, default=False, action='store_true')
 49 | try:
 50 |     parser.add_argument('--ssl', help='Enable SSL redirection', required=False, default=True, action=argparse.BooleanOptionalAction)
 51 | except AttributeError:
 52 |     # BooleanOptionalAction was introduced in Python 3.9
 53 |     parser.add_argument('--ssl', help='Enable SSL redirection', required=False, default=True)
 54 | args = parser.parse_args()
 55 | 
 56 | if args.numberOfPages != "max":
 57 |   try:
 58 |     int(args.numberOfPages)
 59 |     numberOfPages = str(args.numberOfPages)
 60 |   except ValueError:
 61 |       print("Provided number of pages is invalid")
 62 |       exit(-1)
 63 | else:
 64 |   numberOfPages = "max"
 65 | 
 66 | url = args.url
 67 | if not url.endswith('/'):
 68 |   url = url + '/'
 69 | # get the subpath of the url, eg. https://www.example.org/wiki/ => wiki/, or empty for no subpath
 70 | subpath = url[url.index("://") + 3:]
 71 | subpath = subpath[subpath.index("/")+1:]
 72 | 
 73 | pageOnly = -1
 74 | categoryOnly = -1
 75 | namespace = args.namespace
 76 | if args.category is not None:
 77 |   categoryOnly = int(args.category)
 78 |   if namespace is None:
 79 |     namespace = "*" # all namespaces
 80 | else:
 81 |   if namespace is None:
 82 |     namespace = 0
 83 |   # the allpages API only supports integer IDs
 84 |   namespace = str(int(namespace))
 85 | if args.page is not None:
 86 |   pageOnly = int(args.page)
 87 | 
 88 | (args.outputDir / "img").mkdir(parents=True, exist_ok=True)
 89 | 
 90 | if not args.shortUrl.endswith('/'):
 91 |   args.shortUrl = args.shortUrl + '/'
 92 | shortUrl = args.shortUrl 
 93 | 
 94 | S = requests.Session()
 95 | 
 96 | if args.username is not None and args.password is not None:
 97 |   LgUser = args.username
 98 |   LgPassword = args.password
 99 | 
100 |   # Retrieve login token first
101 |   PARAMS_0 = {
102 |       'action':"query",
103 |       'meta':"tokens",
104 |       'type':"login",
105 |       'format':"json"
106 |   }
107 |   R = S.get(url=url + "/api.php", params=PARAMS_0)
108 |   DATA = R.json()
109 |   LOGIN_TOKEN = DATA['query']['tokens']['logintoken']
110 | 
111 |   # Main-account login via "action=login" is deprecated and may stop working without warning. To continue login with "action=login", see [[Special:BotPasswords]]
112 |   PARAMS_1 = {
113 |       'action':"login",
114 |       'lgname':LgUser,
115 |       'lgpassword':LgPassword,
116 |       'lgtoken':LOGIN_TOKEN,
117 |       'format':"json"
118 |   }
119 | 
120 |   R = S.post(url + "/api.php", data=PARAMS_1)
121 |   try:
122 |     DATA = R.json()
123 |   except:
124 |     print("cannot parse json from action:login")
125 |     print(R.content)
126 |     exit(-1)
127 |   if "error" in DATA:
128 |     print(DATA)
129 |     exit(-1)
130 | 
131 | if categoryOnly != -1:
132 |   params_all_pages = {
133 |     'action': 'query',
134 |     'list': 'categorymembers',
135 |     'format': 'json',
136 |     'cmpageid': categoryOnly,
137 |     'cmnamespace': namespace,
138 |     'cmlimit': numberOfPages
139 |   }
140 | else:
141 |   params_all_pages = {
142 |     'action': 'query',
143 |     'list': 'allpages',
144 |     'format': 'json',
145 |     'apnamespace': namespace,
146 |     'aplimit': numberOfPages
147 |   }
148 | 
149 | response = S.get(url + "api.php", params=params_all_pages)
150 | data = response.json()
151 | 
152 | if "error" in data:
153 |   print(data)
154 |   if data['error']['code'] == "readapidenied":
155 |     print()
156 |     print("get login token here: " + url + "/api.php?action=query&meta=tokens&type=login")
157 |     print("and then call this script with parameters: myuser topsecret mytoken")
158 |     exit(-1)
159 | if categoryOnly != -1:
160 |   pages = data['query']['categorymembers']
161 | else:
162 |   pages = data['query']['allpages']
163 | 
164 | # user may want to download a single page, but needs to know the page number
165 | if args.listPages:
166 |     for page in pages:
167 |         print(f'{page["pageid"]}: {page["title"]}')
168 |     exit(0)
169 | 
170 | while 'continue' in data and (numberOfPages == 'max' or len(pages) < int(numberOfPages)):
171 |   if categoryOnly != -1:
172 |     params_all_pages['cmcontinue'] = data['continue']['cmcontinue']
173 |   else:
174 |     params_all_pages['apcontinue'] = data['continue']['apcontinue']
175 | 
176 |   response = S.get(url + "api.php", params=params_all_pages)
177 | 
178 |   data = response.json()
179 | 
180 |   if "error" in data:
181 |     print(data)
182 |     if data['error']['code'] == "readapidenied":
183 |       print()
184 |       print(f'get login token here: {url}/api.php?action=query&meta=tokens&type=login')
185 |       print("and then call this script with parameters: myuser topsecret mytoken")
186 |       exit(-1)
187 | 
188 |   if categoryOnly != -1:
189 |     pages.extend(data['query']['categorymembers'])
190 |   else:
191 |     pages.extend(data['query']['allpages'])
192 | 
193 | def quote_title(title):
194 |   return parse.quote(page['title'].replace(' ', '_'))
195 | 
196 | downloadedimages = []
197 | def DownloadImage(filename, urlimg, ignorethumb=True):
198 |   fileOut = f'{args.outputDir}/img/{filename}'
199 |   if not filename in downloadedimages:
200 |     if ignorethumb and '/thumb/' in urlimg:
201 |       urlimg = urlimg.replace('/thumb/', '/')
202 |       urlimg = urlimg[:urlimg.rindex('/')]
203 |     if not urlimg.startswith("http"):
204 |         urlimg = url + urlimg[1:]
205 |     print(f"Downloading {urlimg}")
206 |     response = S.get(urlimg)
207 |     if response.status_code == 404:
208 |       raise Exception("404: cannot download " + urlimg)
209 |     content = response.content
210 |     f = open(fileOut, "wb")
211 |     f.write(content)
212 |     f.close()
213 |     downloadedimages.append(filename)
214 | 
215 | def DownloadFile(filename, urlfilepage):
216 |   fileOut = f'{args.outputDir}/img/{filename}'
217 |   if args.dontOverwrite and os.path.exists(fileOut):
218 |       print(f'Ignoring {filename} (already downloaded)')
219 |       downloadedimages.append(filename)
220 |       return
221 |   if not filename in downloadedimages:
222 |     # get the file page
223 |     response = S.get(urlfilepage)
224 |     content = response.text
225 |     filepos = content.find('href="/' + subpath + 'images/')
226 |     if filepos == -1:
227 |       return
228 |     fileendquote = content.find('"', filepos + len('href="'))
229 |     urlfile = content[filepos+len('href="') + len(subpath):fileendquote]
230 |     DownloadImage(filename, urlfile)
231 | 
232 | def PageTitleToFilename(title):
233 |     temp = re.sub('[^A-Za-z0-9\u0400-\u0500\u4E00-\u9FFF]+', '_', title);
234 |     return temp.replace("(","_").replace(")","_").replace("__", "_")
235 | 
236 | for page in pages:
237 |     if (pageOnly > -1) and (page['pageid'] != pageOnly):
238 |         continue
239 |     print(page)
240 |     quoted_pagename = quote_title(page['title'])
241 |     url_page = url + "index.php?title=" + quoted_pagename + "&action=render"
242 |     response = S.get(url_page)
243 |     content = response.text
244 |     url_title = url + "index.php?title="
245 |     if (url_title not in content) and args.ssl:
246 |         url_title = url_title.replace("http://", "https://")
247 | 
248 |     # in case we have links like a href="//wiki.example.org/index.php..."
249 |     if url_title not in content:
250 |         protocol = url_title[:url_title.index(":")]
251 |         url_title_without_protocol = url_title[url_title.index('/'):]
252 |         content = content.replace(f'a href="{url_title_without_protocol}', f'a href="{protocol}:{url_title_without_protocol}')
253 | 
254 |     # in case we have links like a href="//wiki.example.org/wiki/..."
255 |     if url_title not in content:
256 |         url_title_without_indexphp = url_title.replace("index.php?title=", shortUrl)
257 |         content = content.replace(f'a href="{url_title_without_indexphp}', f'a href="{url_title}')
258 | 
259 |     pos = 0
260 |     while url_title in content:
261 |         pos = content.find(url_title)
262 |         posendquote = content.find('"', pos)
263 |         file_url = content[pos:posendquote]
264 |         linkedpage = file_url
265 |         linkedpage = linkedpage[linkedpage.find('=') + 1:]
266 |         linkedpage = linkedpage.replace('%27', '_')
267 |         if linkedpage.startswith('File:') or linkedpage.startswith('Datei:') or linkedpage.startswith('Image:'):
268 |           if linkedpage.startswith('File:'):
269 |               linkType = "File"
270 |           elif linkedpage.startswith('Datei:'):
271 |               linkType = "Datei"
272 |           elif linkedpage.startswith('Image:'):
273 |               linkType = "Image"
274 |           origlinkedpage = linkedpage[linkedpage.find(':')+1:]
275 |           linkedpage = parse.unquote(origlinkedpage)
276 | 
277 |           if linkType == "File" or linkType == "Datei":
278 |             DownloadFile(linkedpage, file_url)
279 | 
280 |           # images are only downloaded for "img src="
281 |           # we just replace the link here
282 |           content = content.replace(url_title+linkType+":"+origlinkedpage, "img/"+origlinkedpage)
283 | 
284 |         elif "&amp;action=edit&amp;redlink=1" in linkedpage:
285 |           content = content[:pos] + "page_not_existing.html\" style='color:red'" + content[posendquote+1:]
286 |         elif "#" in linkedpage:
287 |           linkWithoutAnchor = linkedpage[0:linkedpage.find('#')]
288 |           linkWithoutAnchor = PageTitleToFilename(linkWithoutAnchor)
289 |           content = content[:pos] + linkWithoutAnchor + ".html#" + linkedpage[linkedpage.find('#')+1:] + content[posendquote:]
290 |         else:
291 |           linkedpage = PageTitleToFilename(parse.unquote(linkedpage))
292 |           content = content[:pos] + linkedpage + ".html" + content[posendquote:]
293 | 
294 |     # replace all <a href="<url>/<subpath>/images"
295 |     imgpos = 0
296 |     while imgpos > -1:
297 |         imgpos = content.find('href="' + url + 'images/', imgpos)
298 |         if imgpos > -1:
299 |           imgendquote = content.find('"', imgpos + len('href="'))
300 |           imgpath = content[imgpos+len('href="'):imgendquote]
301 |           filename = imgpath[imgpath.rindex("/")+1:]
302 |           DownloadImage(filename, imgpath, ignorethumb=False)
303 |           content = content.replace(content[imgpos + len('href="'):imgendquote], "img/"+filename)
304 | 
305 | 
306 |     # replace all <img src="/<subpath>/images"
307 |     imgpos = 0
308 |     while imgpos > -1:
309 |         imgpos = content.find('src="/' + subpath + 'images/', imgpos)
310 |         if imgpos > -1:
311 |           imgendquote = content.find('"', imgpos + len('src="'))
312 |           imgpath = content[imgpos+len('src="') + len(subpath):imgendquote]
313 |           filename = imgpath[imgpath.rindex("/")+1:]
314 |           DownloadImage(filename, imgpath, ignorethumb=False)
315 |           content = content.replace("/"+subpath+imgpath[1:], "img/"+filename)
316 | 
317 |     # replace all srcset="/<subpath>/images..., /<subpath>/images...""
318 |     imgpos = 0
319 |     while imgpos > -1:
320 |         imgpos = content.find('srcset="/' + subpath + 'images/', imgpos)
321 |         if imgpos > -1:
322 |           imgendquote = content.find('"', imgpos + len('srcset="'))
323 |           srcsetval = content[imgpos+len('srcset="'):imgendquote]
324 |           for srcsetitem in srcsetval.split(','):
325 |             imgpath = srcsetitem.strip().split()[0][len(subpath):]
326 |             filename = imgpath[imgpath.rindex("/")+1:]
327 |             DownloadImage(filename, imgpath, ignorethumb=False)
328 |             content = content.replace("/"+subpath+imgpath[1:], "img/"+filename)
329 | 
330 |     #content = content.replace('<div class="mw-parser-output">'.encode("utf8"), ''.encode("utf8"))
331 |     content = re.sub("(<!--).*?(-->)", '', content, flags=re.DOTALL)
332 | 
333 |     f = open(args.outputDir / (PageTitleToFilename(page['title']) + ".html"), "wb")
334 |     f.write(("<html>\n<head><title>" + page['title'] + "</title></head>\n<body>\n").encode("utf8"))
335 |     f.write(("<h1>" + page['title'] + "</h1>").encode("utf8"))
336 |     f.write(content.encode('utf8'))
337 |     f.write("</body></html>".encode("utf8"))
338 |     f.close()
339 | 
340 | f = open(args.outputDir / "page_not_existing.html", "wb")
341 | f.write(("<html>\n<head><title>This page does not exist yet</title></head>\n<body>\n").encode("utf8"))
342 | f.write(("<h1>This page does not exist yet</h1>").encode("utf8"))
343 | f.write("</body></html>".encode("utf8"))
344 | f.close()
345 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | 


--------------------------------------------------------------------------------