├── .gitignore ├── LICENSE ├── README.md ├── exportMediaWiki2Html.py └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | .venv 2 | .env 3 | venv 4 | env 5 | 6 | 7 | export 8 | *.html -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 SolidCharity 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Export MediaWiki to HTML 2 | ======================== 3 | 4 | The task is to export a MediaWiki to HTML. 5 | 6 | There is the extension DumpHTML, but it is unmaintained: https://www.mediawiki.org/wiki/Extension%3aDumpHTML 7 | 8 | This Python script supports the following features: 9 | 10 | * links between the pages 11 | * links to anchors 12 | * links to non-existing pages 13 | * directly embedded images 14 | * thumbnails 15 | * supports authentication for dumping a protected wiki 16 | * export all (currently up to 500) pages, or export a single page 17 | 18 | You need to use a bot password, to make the script work, see [[Special:BotPasswords]] / https://www.mediawiki.org/wiki/Manual:Bot_passwords 19 | 20 | Install 21 | ======= 22 | 23 | git clone https://github.com/SolidCharity/exportMediaWiki2HTML.git 24 | cd exportMediaWiki2HTML 25 | python3 -m venv .venv 26 | source .venv/bin/activate 27 | pip install -r requirements.txt 28 | 29 | Usage 30 | ===== 31 | 32 | For all commands, you need to activate the virtual environment first: 33 | 34 | cd exportMediaWiki2HTML 35 | source .venv/bin/activate 36 | 37 | Please pass the url of the wiki 38 | 39 | python3 exportMediaWiki2Html.py --url https://mywiki.example.org 40 | 41 | Optionally pass the page id of the page you want to download, eg. for debugging: 42 | 43 | python3 exportMediaWiki2Html.py --url https://mywiki.example.org --page 180 44 | 45 | Optionally pass the name of a Bot and the Bot password (create a Bot at https://wiki.example.org/index.php?title=Spezial:BotPasswords): 46 | 47 | python3 exportMediaWiki2Html.py --url https://mywiki.example.org --user "myuser@botname" --password "botpwd" [pageid] 48 | 49 | 50 | You can use `--help` to see all options. 51 | 52 | Contribute 53 | ========== 54 | 55 | Feel free to file any issues, and Pull Requests are welcome as well! 56 | -------------------------------------------------------------------------------- /exportMediaWiki2Html.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | # Author: Timotheus Pokorra 4 | # source hosted at https://github.com/SolidCharity/exportMediaWiki2HTML 5 | # licensed under the MIT license 6 | # Copyright 2020-2021 Timotheus Pokorra 7 | 8 | import os 9 | from urllib import parse 10 | import requests 11 | import json 12 | import re 13 | from pathlib import Path 14 | import argparse 15 | 16 | description = """ 17 | Export MediaWiki pages to HTML 18 | Call like this: 19 | ./exportMediaWiki2Html.py --url=https://mywiki.example.org 20 | 21 | Optionally pass the page id of the page you want to download, eg. for debugging: 22 | ./exportMediaWiki2Html.py --url=https://mywiki.example.org --page=180 23 | 24 | Optionally pass the page id of the category, all pages with that category will be exported: 25 | ./exportMediaWiki2Html.py --url=https://mywiki.example.org --category=22 26 | 27 | Optionally pass the namespace id, only pages in that namespace will be exported: 28 | ./exportMediaWiki2Html.py --url=https://mywiki.example.org --namespace=0 29 | 30 | Optionally pass the username and password: 31 | ./exportMediaWiki2Html.py --url=https://mywiki.example.org --username="myusername@botname" --password=botsecret 32 | 33 | Optionally pass the directory to dump the export to: 34 | ./exportMediaWiki2Html.py --url=https://mywiki.example.org --outputDir=export 35 | """ 36 | parser = argparse.ArgumentParser(description=description, formatter_class=argparse.RawDescriptionHelpFormatter) 37 | 38 | parser.add_argument('-l','--url', help='The url of the wiki',required=True) 39 | parser.add_argument('-u','--username', help='Your username and bot name, eg. "myuser@botname"',required=False) 40 | parser.add_argument('-p','--password', help='Your bot password',required=False) 41 | parser.add_argument('-c','--category', help='The category to export',required=False) 42 | parser.add_argument('-g','--page', help='The page to export',required=False) 43 | parser.add_argument('-s', '--namespace', help='The namespace to export', required=False) 44 | parser.add_argument('-n', '--numberOfPages', help='The number of pages to export, or max', required=False, default=500) 45 | parser.add_argument('-o', '--outputDir', help='The destination directory for the export', type=Path, required=False, default="export") 46 | parser.add_argument('--shortUrl', help='Custom short url path for the wiki', required=False, default='wiki/') 47 | parser.add_argument('--listPages', help='List available pages', required=False, default=False, action='store_true') 48 | parser.add_argument('--dontOverwrite', help='Skip already downloaded files', required=False, default=False, action='store_true') 49 | try: 50 | parser.add_argument('--ssl', help='Enable SSL redirection', required=False, default=True, action=argparse.BooleanOptionalAction) 51 | except AttributeError: 52 | # BooleanOptionalAction was introduced in Python 3.9 53 | parser.add_argument('--ssl', help='Enable SSL redirection', required=False, default=True) 54 | args = parser.parse_args() 55 | 56 | if args.numberOfPages != "max": 57 | try: 58 | int(args.numberOfPages) 59 | numberOfPages = str(args.numberOfPages) 60 | except ValueError: 61 | print("Provided number of pages is invalid") 62 | exit(-1) 63 | else: 64 | numberOfPages = "max" 65 | 66 | url = args.url 67 | if not url.endswith('/'): 68 | url = url + '/' 69 | # get the subpath of the url, eg. https://www.example.org/wiki/ => wiki/, or empty for no subpath 70 | subpath = url[url.index("://") + 3:] 71 | subpath = subpath[subpath.index("/")+1:] 72 | 73 | pageOnly = -1 74 | categoryOnly = -1 75 | namespace = args.namespace 76 | if args.category is not None: 77 | categoryOnly = int(args.category) 78 | if namespace is None: 79 | namespace = "*" # all namespaces 80 | else: 81 | if namespace is None: 82 | namespace = 0 83 | # the allpages API only supports integer IDs 84 | namespace = str(int(namespace)) 85 | if args.page is not None: 86 | pageOnly = int(args.page) 87 | 88 | (args.outputDir / "img").mkdir(parents=True, exist_ok=True) 89 | 90 | if not args.shortUrl.endswith('/'): 91 | args.shortUrl = args.shortUrl + '/' 92 | shortUrl = args.shortUrl 93 | 94 | S = requests.Session() 95 | 96 | if args.username is not None and args.password is not None: 97 | LgUser = args.username 98 | LgPassword = args.password 99 | 100 | # Retrieve login token first 101 | PARAMS_0 = { 102 | 'action':"query", 103 | 'meta':"tokens", 104 | 'type':"login", 105 | 'format':"json" 106 | } 107 | R = S.get(url=url + "/api.php", params=PARAMS_0) 108 | DATA = R.json() 109 | LOGIN_TOKEN = DATA['query']['tokens']['logintoken'] 110 | 111 | # Main-account login via "action=login" is deprecated and may stop working without warning. To continue login with "action=login", see [[Special:BotPasswords]] 112 | PARAMS_1 = { 113 | 'action':"login", 114 | 'lgname':LgUser, 115 | 'lgpassword':LgPassword, 116 | 'lgtoken':LOGIN_TOKEN, 117 | 'format':"json" 118 | } 119 | 120 | R = S.post(url + "/api.php", data=PARAMS_1) 121 | try: 122 | DATA = R.json() 123 | except: 124 | print("cannot parse json from action:login") 125 | print(R.content) 126 | exit(-1) 127 | if "error" in DATA: 128 | print(DATA) 129 | exit(-1) 130 | 131 | if categoryOnly != -1: 132 | params_all_pages = { 133 | 'action': 'query', 134 | 'list': 'categorymembers', 135 | 'format': 'json', 136 | 'cmpageid': categoryOnly, 137 | 'cmnamespace': namespace, 138 | 'cmlimit': numberOfPages 139 | } 140 | else: 141 | params_all_pages = { 142 | 'action': 'query', 143 | 'list': 'allpages', 144 | 'format': 'json', 145 | 'apnamespace': namespace, 146 | 'aplimit': numberOfPages 147 | } 148 | 149 | response = S.get(url + "api.php", params=params_all_pages) 150 | data = response.json() 151 | 152 | if "error" in data: 153 | print(data) 154 | if data['error']['code'] == "readapidenied": 155 | print() 156 | print("get login token here: " + url + "/api.php?action=query&meta=tokens&type=login") 157 | print("and then call this script with parameters: myuser topsecret mytoken") 158 | exit(-1) 159 | if categoryOnly != -1: 160 | pages = data['query']['categorymembers'] 161 | else: 162 | pages = data['query']['allpages'] 163 | 164 | # user may want to download a single page, but needs to know the page number 165 | if args.listPages: 166 | for page in pages: 167 | print(f'{page["pageid"]}: {page["title"]}') 168 | exit(0) 169 | 170 | while 'continue' in data and (numberOfPages == 'max' or len(pages) < int(numberOfPages)): 171 | if categoryOnly != -1: 172 | params_all_pages['cmcontinue'] = data['continue']['cmcontinue'] 173 | else: 174 | params_all_pages['apcontinue'] = data['continue']['apcontinue'] 175 | 176 | response = S.get(url + "api.php", params=params_all_pages) 177 | 178 | data = response.json() 179 | 180 | if "error" in data: 181 | print(data) 182 | if data['error']['code'] == "readapidenied": 183 | print() 184 | print(f'get login token here: {url}/api.php?action=query&meta=tokens&type=login') 185 | print("and then call this script with parameters: myuser topsecret mytoken") 186 | exit(-1) 187 | 188 | if categoryOnly != -1: 189 | pages.extend(data['query']['categorymembers']) 190 | else: 191 | pages.extend(data['query']['allpages']) 192 | 193 | def quote_title(title): 194 | return parse.quote(page['title'].replace(' ', '_')) 195 | 196 | downloadedimages = [] 197 | def DownloadImage(filename, urlimg, ignorethumb=True): 198 | fileOut = f'{args.outputDir}/img/{filename}' 199 | if not filename in downloadedimages: 200 | if ignorethumb and '/thumb/' in urlimg: 201 | urlimg = urlimg.replace('/thumb/', '/') 202 | urlimg = urlimg[:urlimg.rindex('/')] 203 | if not urlimg.startswith("http"): 204 | urlimg = url + urlimg[1:] 205 | print(f"Downloading {urlimg}") 206 | response = S.get(urlimg) 207 | if response.status_code == 404: 208 | raise Exception("404: cannot download " + urlimg) 209 | content = response.content 210 | f = open(fileOut, "wb") 211 | f.write(content) 212 | f.close() 213 | downloadedimages.append(filename) 214 | 215 | def DownloadFile(filename, urlfilepage): 216 | fileOut = f'{args.outputDir}/img/{filename}' 217 | if args.dontOverwrite and os.path.exists(fileOut): 218 | print(f'Ignoring {filename} (already downloaded)') 219 | downloadedimages.append(filename) 220 | return 221 | if not filename in downloadedimages: 222 | # get the file page 223 | response = S.get(urlfilepage) 224 | content = response.text 225 | filepos = content.find('href="/' + subpath + 'images/') 226 | if filepos == -1: 227 | return 228 | fileendquote = content.find('"', filepos + len('href="')) 229 | urlfile = content[filepos+len('href="') + len(subpath):fileendquote] 230 | DownloadImage(filename, urlfile) 231 | 232 | def PageTitleToFilename(title): 233 | temp = re.sub('[^A-Za-z0-9\u0400-\u0500\u4E00-\u9FFF]+', '_', title); 234 | return temp.replace("(","_").replace(")","_").replace("__", "_") 235 | 236 | for page in pages: 237 | if (pageOnly > -1) and (page['pageid'] != pageOnly): 238 | continue 239 | print(page) 240 | quoted_pagename = quote_title(page['title']) 241 | url_page = url + "index.php?title=" + quoted_pagename + "&action=render" 242 | response = S.get(url_page) 243 | content = response.text 244 | url_title = url + "index.php?title=" 245 | if (url_title not in content) and args.ssl: 246 | url_title = url_title.replace("http://", "https://") 247 | 248 | # in case we have links like a href="//wiki.example.org/index.php..." 249 | if url_title not in content: 250 | protocol = url_title[:url_title.index(":")] 251 | url_title_without_protocol = url_title[url_title.index('/'):] 252 | content = content.replace(f'a href="{url_title_without_protocol}', f'a href="{protocol}:{url_title_without_protocol}') 253 | 254 | # in case we have links like a href="//wiki.example.org/wiki/..." 255 | if url_title not in content: 256 | url_title_without_indexphp = url_title.replace("index.php?title=", shortUrl) 257 | content = content.replace(f'a href="{url_title_without_indexphp}', f'a href="{url_title}') 258 | 259 | pos = 0 260 | while url_title in content: 261 | pos = content.find(url_title) 262 | posendquote = content.find('"', pos) 263 | file_url = content[pos:posendquote] 264 | linkedpage = file_url 265 | linkedpage = linkedpage[linkedpage.find('=') + 1:] 266 | linkedpage = linkedpage.replace('%27', '_') 267 | if linkedpage.startswith('File:') or linkedpage.startswith('Datei:') or linkedpage.startswith('Image:'): 268 | if linkedpage.startswith('File:'): 269 | linkType = "File" 270 | elif linkedpage.startswith('Datei:'): 271 | linkType = "Datei" 272 | elif linkedpage.startswith('Image:'): 273 | linkType = "Image" 274 | origlinkedpage = linkedpage[linkedpage.find(':')+1:] 275 | linkedpage = parse.unquote(origlinkedpage) 276 | 277 | if linkType == "File" or linkType == "Datei": 278 | DownloadFile(linkedpage, file_url) 279 | 280 | # images are only downloaded for "img src=" 281 | # we just replace the link here 282 | content = content.replace(url_title+linkType+":"+origlinkedpage, "img/"+origlinkedpage) 283 | 284 | elif "&action=edit&redlink=1" in linkedpage: 285 | content = content[:pos] + "page_not_existing.html\" style='color:red'" + content[posendquote+1:] 286 | elif "#" in linkedpage: 287 | linkWithoutAnchor = linkedpage[0:linkedpage.find('#')] 288 | linkWithoutAnchor = PageTitleToFilename(linkWithoutAnchor) 289 | content = content[:pos] + linkWithoutAnchor + ".html#" + linkedpage[linkedpage.find('#')+1:] + content[posendquote:] 290 | else: 291 | linkedpage = PageTitleToFilename(parse.unquote(linkedpage)) 292 | content = content[:pos] + linkedpage + ".html" + content[posendquote:] 293 | 294 | # replace all -1: 297 | imgpos = content.find('href="' + url + 'images/', imgpos) 298 | if imgpos > -1: 299 | imgendquote = content.find('"', imgpos + len('href="')) 300 | imgpath = content[imgpos+len('href="'):imgendquote] 301 | filename = imgpath[imgpath.rindex("/")+1:] 302 | DownloadImage(filename, imgpath, ignorethumb=False) 303 | content = content.replace(content[imgpos + len('href="'):imgendquote], "img/"+filename) 304 | 305 | 306 | # replace all -1: 309 | imgpos = content.find('src="/' + subpath + 'images/', imgpos) 310 | if imgpos > -1: 311 | imgendquote = content.find('"', imgpos + len('src="')) 312 | imgpath = content[imgpos+len('src="') + len(subpath):imgendquote] 313 | filename = imgpath[imgpath.rindex("/")+1:] 314 | DownloadImage(filename, imgpath, ignorethumb=False) 315 | content = content.replace("/"+subpath+imgpath[1:], "img/"+filename) 316 | 317 | # replace all srcset="//images..., //images..."" 318 | imgpos = 0 319 | while imgpos > -1: 320 | imgpos = content.find('srcset="/' + subpath + 'images/', imgpos) 321 | if imgpos > -1: 322 | imgendquote = content.find('"', imgpos + len('srcset="')) 323 | srcsetval = content[imgpos+len('srcset="'):imgendquote] 324 | for srcsetitem in srcsetval.split(','): 325 | imgpath = srcsetitem.strip().split()[0][len(subpath):] 326 | filename = imgpath[imgpath.rindex("/")+1:] 327 | DownloadImage(filename, imgpath, ignorethumb=False) 328 | content = content.replace("/"+subpath+imgpath[1:], "img/"+filename) 329 | 330 | #content = content.replace('
'.encode("utf8"), ''.encode("utf8")) 331 | content = re.sub("()", '', content, flags=re.DOTALL) 332 | 333 | f = open(args.outputDir / (PageTitleToFilename(page['title']) + ".html"), "wb") 334 | f.write(("\n" + page['title'] + "\n\n").encode("utf8")) 335 | f.write(("

" + page['title'] + "

").encode("utf8")) 336 | f.write(content.encode('utf8')) 337 | f.write("".encode("utf8")) 338 | f.close() 339 | 340 | f = open(args.outputDir / "page_not_existing.html", "wb") 341 | f.write(("\nThis page does not exist yet\n\n").encode("utf8")) 342 | f.write(("

This page does not exist yet

").encode("utf8")) 343 | f.write("".encode("utf8")) 344 | f.close() 345 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | --------------------------------------------------------------------------------