├── wikiteam3 ├── __init__.py ├── utils │ ├── xmlutil.py │ ├── __init__.py │ ├── uprint.py │ ├── user_agent.py │ ├── domain.py │ ├── wiki_avoid.py │ ├── monkey_patch.py │ ├── login │ │ ├── __init__.py │ │ ├── index.py │ │ └── api.py │ └── util.py ├── dumpgenerator │ ├── test │ │ ├── __init__.py │ │ ├── test_config.py │ │ └── data │ │ │ └── html_regexs │ │ │ ├── group0.mediawiki.demo.save-web.org_mediawiki-1.16.5-20230701.html │ │ │ ├── group0.mediawiki.demo.save-web.org_mediawiki-1.23.17-20230701.html │ │ │ └── group1.mediawiki.demo.save-web.org_mediawiki-1.27.7-20230701.html │ ├── dump │ │ ├── image │ │ │ ├── __init__.py │ │ │ ├── html_regexs.py │ │ │ └── html_regexs_test.py │ │ ├── misc │ │ │ ├── __init__.py │ │ │ ├── site_info_test.py │ │ │ ├── index_php.py │ │ │ ├── special_version.py │ │ │ ├── special_logs.py │ │ │ └── site_info.py │ │ ├── page │ │ │ ├── __init__.py │ │ │ ├── xmlrev │ │ │ │ ├── __init__.py │ │ │ │ └── xml_revisions_page.py │ │ │ └── xmlexport │ │ │ │ ├── __init__.py │ │ │ │ ├── page_xml.py │ │ │ │ ├── page_xml_export.py │ │ │ │ └── page_xml_api.py │ │ ├── xmldump │ │ │ ├── __init__.py │ │ │ ├── xml_integrity.py │ │ │ ├── xml_truncate.py │ │ │ ├── xml_header.py │ │ │ └── xml_dump.py │ │ ├── __init__.py │ │ └── generator.py │ ├── log │ │ ├── __init__.py │ │ └── log_error.py │ ├── cli │ │ ├── __init__.py │ │ ├── delay.py │ │ └── greeter.py │ ├── __main__.py │ ├── version.py │ ├── api │ │ ├── __init__.py │ │ ├── get_json.py │ │ ├── handle_status_code.py │ │ ├── index_check.py │ │ ├── namespaces.py │ │ ├── api.py │ │ ├── wiki_check.py │ │ └── page_titles.py │ ├── exceptions.py │ ├── __init__.py │ └── config.py └── launcher.py ├── .gitattributes ├── .travis.yml ├── .gitignore ├── .pymarkdown.json ├── .markdownlint.jsonc ├── .github ├── ISSUE_TEMPLATE │ ├── config.yml │ └── bug_report.md └── workflows │ └── test-dumpgenerator.yml ├── .pre-commit-config.yaml ├── PUBLISHING.md ├── pyproject.toml ├── README.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── USAGE.md └── INSTALLATION.md /wikiteam3/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /wikiteam3/utils/xmlutil.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/dump/image/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/dump/misc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/dump/page/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/dump/xmldump/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/dump/page/xmlrev/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/dump/page/xmlexport/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/log/__init__.py: -------------------------------------------------------------------------------- 1 | from .log_error import logerror 2 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/dump/__init__.py: -------------------------------------------------------------------------------- 1 | from .generator import DumpGenerator 2 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/cli/__init__.py: -------------------------------------------------------------------------------- 1 | from .cli import getParameters 2 | from .delay import Delay 3 | from .greeter import bye, welcome 4 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/__main__.py: -------------------------------------------------------------------------------- 1 | if __name__ == "__main__": 2 | import sys 3 | 4 | from .__init__ import main 5 | 6 | sys.exit(main()) 7 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/version.py: -------------------------------------------------------------------------------- 1 | __VERSION__ = "0.4.0-alpha" # major, minor, micro: semver.org 2 | 3 | 4 | def getVersion(): 5 | return __VERSION__ 6 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.com linguist-vendored 2 | *.org linguist-vendored 3 | 4 | *.py text=auto 5 | *.sh text=auto 6 | *.json text=auto 7 | *.txt text=auto 8 | *.md text=auto 9 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3.8 3 | install: 4 | - pip install poetry 5 | - poetry install 6 | script: 7 | - poetry run pytest --verbose -s 8 | notifications: 9 | email: false 10 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/api/__init__.py: -------------------------------------------------------------------------------- 1 | from .api import checkAPI, checkRetryAPI, mwGetAPIAndIndex 2 | from .get_json import getJSON 3 | from .handle_status_code import handleStatusCode 4 | from .wiki_check import getWikiEngine 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .pytest_cache 3 | keys.txt 4 | batchdownload/keys.txt 5 | batchdownload/dumpgenerator.py 6 | batchdownload/uploader.py 7 | __pycache__ 8 | tests/tmp 9 | dist/ 10 | .DS_Store 11 | desktop.ini 12 | 13 | .venv 14 | .vscode 15 | .idea 16 | -------------------------------------------------------------------------------- /.pymarkdown.json: -------------------------------------------------------------------------------- 1 | { 2 | "plugins": { 3 | "line-length": { 4 | "enabled": false 5 | }, 6 | "no-inline-html": { 7 | 
"allowed_elements": "details,summary,code,!--" 8 | }, 9 | "first-line-heading": { 10 | "enabled": false, 11 | "front_matter_title" : "name" 12 | } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/dump/xmldump/xml_integrity.py: -------------------------------------------------------------------------------- 1 | from typing import * 2 | 3 | from wikiteam3.dumpgenerator.config import Config 4 | 5 | 6 | def checkXMLIntegrity( 7 | config: Config = None, titles: Iterable[str] = None, session=None 8 | ): 9 | """Check XML dump integrity, to detect broken XML chunks""" 10 | # TODO: Fix XML Integrity Check 11 | return 12 | -------------------------------------------------------------------------------- /wikiteam3/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .domain import domain2prefix 2 | from .login import botLogin, clientLogin, fetchLoginToken, indexLogin, uniLogin 3 | from .monkey_patch import mod_requests_text 4 | from .uprint import uprint 5 | from .user_agent import getUserAgent 6 | from .util import cleanHTML, cleanXML, removeIP, sha1File, undoHTMLEntities 7 | from .wiki_avoid import avoidWikimediaProjects 8 | -------------------------------------------------------------------------------- /wikiteam3/utils/uprint.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def uprint(*objects, sep=" ", end="\n", file=sys.stdout): 5 | enc = file.encoding 6 | if enc == "UTF-8": 7 | print(*objects, sep=sep, end=end, file=file) 8 | else: 9 | f = lambda obj: str(obj).encode(enc, errors="backslashreplace").decode(enc) 10 | print(*map(f, objects), sep=sep, end=end, file=file) 11 | -------------------------------------------------------------------------------- /.markdownlint.jsonc: -------------------------------------------------------------------------------- 1 | // If you change any options here, 2 | // please change them in .pymarkdown.jsonc 3 | // as well! 4 | { 5 | "line-length": false, 6 | "no-inline-html": { 7 | "allowed_elements": [ 8 | "details", 9 | "summary", 10 | "code" 11 | ] 12 | }, 13 | "first-line-heading": { 14 | "front_matter_title" : "name" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/api/get_json.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | 4 | def getJSON(request: requests.Response): 5 | """Strip Unicode BOM""" 6 | if request.text.startswith("\ufeff"): 7 | request.encoding = "utf-8-sig" 8 | # request.encoding = request.apparent_encoding 9 | try: 10 | return request.json() 11 | except: 12 | # Maybe an older API version which did not return correct JSON 13 | return {} 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: Get help using MediaWiki Dump Generator 4 | url: https://github.com/orgs/mediawiki-client-tools/discussions/categories/q-a 5 | about: If you need help (other than reporting a bug), you can reach out on our Discussions Q&A. 6 | - name: Anything else 7 | url: https://github.com/orgs/mediawiki-client-tools/discussions 8 | about: You can read and post in our GitHub Discussions. 
9 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/log/log_error.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from wikiteam3.dumpgenerator.config import Config 4 | 5 | 6 | def logerror(config: Config = None, to_stdout=False, text="") -> None: 7 | """Log error in errors.log""" 8 | if text: 9 | with open(f"{config.path}/errors.log", "a", encoding="utf-8") as outfile: 10 | output = ( 11 | f'{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}: {text}\n' 12 | ) 13 | outfile.write(output) 14 | if to_stdout: 15 | print(text) 16 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/dump/misc/site_info_test.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | import requests 5 | 6 | from wikiteam3.dumpgenerator.test.test_config import get_config 7 | 8 | from .site_info import saveSiteInfo 9 | 10 | 11 | def test_mediawiki_version_match(): 12 | with get_config("1.39.7") as config: 13 | sess = requests.Session() 14 | saveSiteInfo(config, sess) 15 | with open(f"{config.path}/siteinfo.json") as f: 16 | siteInfoJson = json.load(f) 17 | assert siteInfoJson["query"]["general"]["generator"] == "MediaWiki 1.39.7" 18 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml.py: -------------------------------------------------------------------------------- 1 | from wikiteam3.dumpgenerator.config import Config 2 | 3 | from .page_xml_api import getXMLPageWithApi 4 | from .page_xml_export import getXMLPageWithExport 5 | 6 | 7 | def getXMLPage(config: Config = None, title="", verbose=True, session=None): 8 | if config.xmlapiexport: 9 | return getXMLPageWithApi( 10 | config=config, title=title, verbose=verbose, session=session 11 | ) 12 | else: 13 | return getXMLPageWithExport( 14 | config=config, title=title, verbose=verbose, session=session 15 | ) 16 | -------------------------------------------------------------------------------- /wikiteam3/utils/user_agent.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | # Return a cool user-agent to hide Python user-agent 4 | 5 | 6 | def getUserAgent(): 7 | useragent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36" 8 | return useragent 9 | 10 | 11 | def setupUserAgent(session: requests.Session): 12 | session._orirequest = session.request 13 | 14 | def newrequest(*args, **kwargs): 15 | session.headers.update({"User-Agent": getUserAgent()}) 16 | return session._orirequest(*args, **kwargs) 17 | 18 | session.request = newrequest 19 | -------------------------------------------------------------------------------- /wikiteam3/utils/domain.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from wikiteam3.dumpgenerator.config import Config 4 | 5 | 6 | def domain2prefix(config: Config = None, session=None): 7 | """Convert domain name to a valid prefix filename.""" 8 | 9 | # At this point, both api and index are supposed to be defined 10 | domain = "" 11 | if config.api: 12 | domain = config.api 13 | elif config.index: 14 | domain = config.index 15 | 16 | domain = domain.lower() 17 | domain = re.sub(r"(https?://|www\.|/index\.php.*|/api\.php.*)", "", domain) 18 | 
domain = domain.rstrip("/") 19 | domain = re.sub(r"/", "_", domain) 20 | domain = re.sub(r"\.", "", domain) 21 | domain = re.sub(r"[^A-Za-z0-9]", "_", domain) 22 | 23 | return domain 24 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/dump/misc/index_php.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from wikiteam3.dumpgenerator.cli import Delay 4 | from wikiteam3.dumpgenerator.config import Config 5 | from wikiteam3.utils import removeIP 6 | 7 | 8 | def saveIndexPHP(config: Config = None, session=None): 9 | """Save index.php as .html, to preserve license details available at the botom of the page""" 10 | 11 | if os.path.exists(f"{config.path}/index.html"): 12 | print("index.html exists, do not overwrite") 13 | else: 14 | print("Downloading index.php (Main Page) as index.html") 15 | r = session.post(url=config.index, params=None, timeout=10) 16 | raw = str(r.text) 17 | Delay(config=config, session=session) 18 | raw = removeIP(raw=raw) 19 | with open(f"{config.path}/index.html", "w", encoding="utf-8") as outfile: 20 | outfile.write(raw) 21 | -------------------------------------------------------------------------------- /wikiteam3/utils/wiki_avoid.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | from typing import * 4 | 5 | from wikiteam3.dumpgenerator.config import Config 6 | 7 | 8 | def avoidWikimediaProjects(config: Config = None, other: Dict = None): 9 | """Skip Wikimedia projects and redirect to the dumps website""" 10 | 11 | # notice about wikipedia dumps 12 | url = "" 13 | if config.api: 14 | url += config.api 15 | if config.index: 16 | url = url + config.index 17 | if re.findall( 18 | r"(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org", 19 | url, 20 | ): 21 | print("PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!") 22 | print("Download the dumps from http://dumps.wikimedia.org") 23 | if not other["force"]: 24 | print("Thanks!") 25 | sys.exit() 26 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/exceptions.py: -------------------------------------------------------------------------------- 1 | class PageMissingError(Exception): 2 | def __init__(self, title, xml): 3 | self.title = title 4 | self.xml = xml 5 | 6 | def __str__(self): 7 | return f"page '{self.title}' not found" 8 | 9 | 10 | class ExportAbortedError(Exception): 11 | def __init__(self, index): 12 | self.index = index 13 | 14 | def __str__(self): 15 | return f"Export from '{self.index}' did not return anything." 16 | 17 | 18 | class FileSizeError(Exception): 19 | def __init__(self, file, size): 20 | self.file = file 21 | self.size = size 22 | 23 | def __str__(self): 24 | return f"File '{self.file}' size is not match '{self.size}'." 25 | 26 | 27 | class FileSha1Error(Exception): 28 | def __init__(self, file, sha1): 29 | self.file = file 30 | self.sha1 = sha1 31 | 32 | def __str__(self): 33 | return f"File '{self.file}' sha1 is not match '{self.sha1}'." 
34 | 
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/dump/misc/special_version.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from wikiteam3.dumpgenerator.cli import Delay
 4 | from wikiteam3.dumpgenerator.config import Config
 5 | from wikiteam3.utils import removeIP
 6 | 
 7 | 
 8 | def saveSpecialVersion(config: Config = None, session=None):
 9 |     """Save Special:Version as .html, to preserve extensions details"""
10 | 
11 |     if os.path.exists(f"{config.path}/SpecialVersion.html"):
12 |         print("SpecialVersion.html exists, do not overwrite")
13 |     else:
14 |         print("Downloading Special:Version with extensions and other related info")
15 |         r = session.post(
16 |             url=config.index, params={"title": "Special:Version"}, timeout=10
17 |         )
18 |         raw = str(r.text)
19 |         Delay(config=config, session=session)
20 |         raw = str(removeIP(raw=raw))
21 |         with open(
22 |             f"{config.path}/SpecialVersion.html", "w", encoding="utf-8"
23 |         ) as outfile:
24 |             outfile.write(raw)
25 | 
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/__init__.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # DumpGenerator A generator of dumps for wikis
 4 | # Copyright (C) 2011-2018 WikiTeam developers
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | # To learn more, read the documentation:
19 | #     https://github.com/WikiTeam/wikiteam/wiki
20 | 
21 | 
22 | from wikiteam3.dumpgenerator.dump import DumpGenerator
23 | 
24 | 
25 | def main():
26 |     DumpGenerator()
27 | 
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/dump/misc/special_logs.py:
--------------------------------------------------------------------------------
 1 | from wikiteam3.dumpgenerator.cli import Delay
 2 | from wikiteam3.dumpgenerator.config import Config
 3 | 
 4 | 
 5 | def saveLogs(config: Config = None, session=None):
 6 |     """Save Special:Log"""
 7 |     # get all logs from Special:Log
 8 |     """parse
 9 |     <select name='type'>
10 |     <option value="block">Bloqueos de usuarios</option>
11 |     <option value="rights">Cambios de perfil de usuario</option>
12 |     <option value="protect" selected="selected">Protecciones de páginas</option>
13 |     <option value="delete">Registro de borrados</option>
14 |     <option value="newusers">Registro de usuarios nuevos</option>
15 |     <option value="merge">Registro de fusiones</option>
16 |     <option value="import">Registro de importaciones</option>
17 |     <option value="patrol">Registro de revisiones</option>
18 |     <option value="move">Registro de traslados</option>
19 |     <option value="upload">Subidas de archivos</option>
20 |     <option value="">Todos los registros</option>
21 |     </select>
22 |     """
23 |     Delay(config=config, session=session)
24 | 
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/test/test_config.py:
--------------------------------------------------------------------------------
 1 | import copy
 2 | import tempfile
 3 | from contextlib import contextmanager
 4 | 
 5 | from wikiteam3.dumpgenerator.cli import getParameters
 6 | from wikiteam3.dumpgenerator.config import newConfig
 7 | 
 8 | CONFIG_CACHE = {}
 9 | 
10 | 
11 | @contextmanager
12 | def _new_config_from_parameter(params):
13 |     _params = tuple(params)
14 |     # A `return` before the first `yield` would break this context manager;
15 |     # cache the parsed config and always fall through to the yield.
16 |     if _params not in CONFIG_CACHE:
17 |         config, _ = getParameters(["--path=.", "--xml"] + list(params))
18 |         CONFIG_CACHE[_params] = config
19 |     config = CONFIG_CACHE[_params]
20 |     _config = newConfig(copy.deepcopy(config.asdict()))
21 |     with tempfile.TemporaryDirectory(prefix="wikiteam3test_") as tmpdir:
22 |         _config.path = tmpdir
23 |         yield _config
24 | 
25 | 
26 | def get_config(mediawiki_ver, api=True):
27 |     assert api is True
28 |     if mediawiki_ver == "1.39.7":
29 |         return _new_config_from_parameter(
30 |             [
31 |                 "--api",
32 |                 "https://testw.fandom.com/api.php",
33 |             ]
34 |         )
35 | 
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/api/handle_status_code.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | 
 4 | def handleStatusCode(response):
 5 |     statuscode = response.status_code
 6 |     if statuscode >= 200 and statuscode < 300:
 7 |         return
 8 | 
 9 |     print("HTTP Error %d." % statuscode)
10 |     if statuscode >= 300 and statuscode < 400:
11 |         print("Redirect should happen automatically: please report this as a bug.")
12 |         print(response.url)
13 | 
14 |     elif statuscode == 400:
15 |         print("Bad Request: The wiki may be malfunctioning.")
16 |         print("Please try again later.")
17 |         print(response.url)
18 |         sys.exit(1)
19 | 
20 |     elif statuscode in [401, 403]:
21 |         print("Authentication required.")
22 |         print("Please use --user and --pass.")
23 |         print(response.url)
24 | 
25 |     elif statuscode == 404:
26 |         print("Not found.
Is Special:Export enabled for this wiki?") 27 | print(response.url) 28 | sys.exit(1) 29 | 30 | elif statuscode == 429 or (statuscode >= 500 and statuscode < 600): 31 | print("Server error, max retries exceeded.") 32 | print("Please resume the dump later.") 33 | print(response.url) 34 | sys.exit(1) 35 | -------------------------------------------------------------------------------- /wikiteam3/utils/monkey_patch.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | from wikiteam3.dumpgenerator.cli.delay import Delay 4 | 5 | 6 | def mod_requests_text(requests: requests): 7 | """Monkey patch `requests.Response.text` to remove BOM""" 8 | 9 | def new_text(self): 10 | return self.content.lstrip(b"\xef\xbb\xbf").decode(self.encoding) 11 | 12 | requests.Response.text = property(new_text) 13 | 14 | 15 | class DelaySession: 16 | """Monkey patch `requests.Session.send` to add delay""" 17 | 18 | def __init__(self, session, msg=None, delay=None, config=None): 19 | self.session = session 20 | self.msg = msg 21 | self.delay = delay 22 | self.old_send = None 23 | self.config = config 24 | 25 | def hijack(self): 26 | """Don't forget to call `release()`""" 27 | 28 | def new_send(request, **kwargs): 29 | Delay(msg=self.msg, delay=self.delay, config=self.config) 30 | return self.old_send(request, **kwargs) 31 | 32 | self.old_send = self.session.send 33 | self.session.send = new_send 34 | 35 | def release(self): 36 | """Undo monkey patch""" 37 | self.session.send = self.old_send 38 | del self 39 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/api/index_check.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import requests 4 | 5 | 6 | def checkIndex(index="", cookies="", session: requests.Session = None): 7 | """Checking index.php availability""" 8 | r = session.post(url=index, data={"title": "Special:Version"}, timeout=30) 9 | if r.status_code >= 400: 10 | print(f"ERROR: The wiki returned status code HTTP {r.status_code}") 11 | return False 12 | raw = r.text 13 | print("Checking index.php...", index) 14 | # Workaround for issue 71 15 | if ( 16 | re.search( 17 | '(Special:Badtitle|class="permissions-errors"|"wgCanonicalSpecialPageName":"Badtitle"|Login Required)', 18 | raw, 19 | ) 20 | and not cookies 21 | ): 22 | print("ERROR: This wiki requires login and we are not authenticated") 23 | return False 24 | if re.search( 25 | '(page-Index_php|"wgPageName":"Index.php"|"firstHeading">Index.php)', 26 | raw, 27 | ): 28 | print("Looks like the page called Index.php, not index.php itself") 29 | return False 30 | return bool( 31 | re.search( 32 | '(This wiki is powered by|
<h2 id="mw-version-license">
|meta name="generator" content="MediaWiki|class="mediawiki)', 33 | raw, 34 | ) 35 | ) 36 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/cli/delay.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import sys 3 | import threading 4 | import time 5 | 6 | from wikiteam3.dumpgenerator.config import Config 7 | 8 | 9 | class Delay: 10 | done: bool = False 11 | lock: threading.Lock = threading.Lock() 12 | 13 | def animate(self): 14 | while True: 15 | with self.lock: 16 | if self.done: 17 | return 18 | 19 | print("\r" + self.ellipses, end="") 20 | self.ellipses += "." 21 | 22 | time.sleep(0.3) 23 | 24 | def __init__(self, config: Config = None, session=None, msg=None, delay=None): 25 | """Add a delay if configured for that""" 26 | self.ellipses: str = "." 27 | 28 | if delay is None: 29 | delay = config.delay 30 | if delay <= 0: 31 | return 32 | 33 | if msg: 34 | self.ellipses = f"Delay {delay:.1f}s: {msg} {self.ellipses}" 35 | else: 36 | self.ellipses = ("Delay %.1fs " % (delay)) + self.ellipses 37 | 38 | ellipses_animation = threading.Thread(target=self.animate) 39 | ellipses_animation.daemon = True 40 | ellipses_animation.start() 41 | 42 | time.sleep(delay) 43 | 44 | with self.lock: 45 | self.done = True 46 | print("\r" + " " * len(self.ellipses) + "\r", end="") 47 | -------------------------------------------------------------------------------- /.github/workflows/test-dumpgenerator.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: dumpgenerator test 5 | 6 | on: 7 | push: 8 | branches: [ "python3" ] 9 | pull_request: 10 | branches: [ "python3" ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["3.8", "3.9", "3.10", "3.11"] 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v4 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install flake8 pytest poetry 31 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 32 | - name: Lint with flake8 33 | run: | 34 | # exit if there are Python syntax errors or undefined names 35 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 36 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 37 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
38 |       - name: run dumpgenerator
39 |         run: |
40 |           python -m wikiteam3.dumpgenerator -h
41 |       - name: Test with pytest
42 |         run: |
43 |           cd wikiteam3/dumpgenerator && pytest && cd ../../
44 | 
--------------------------------------------------------------------------------
/wikiteam3/utils/login/__init__.py:
--------------------------------------------------------------------------------
 1 | """ Provide login functions """
 2 | 
 3 | import time
 4 | 
 5 | import requests
 6 | 
 7 | from wikiteam3.utils.login.api import botLogin, clientLogin, fetchLoginToken
 8 | from wikiteam3.utils.login.index import indexLogin
 9 | 
10 | 
11 | def uniLogin(
12 |     api: str = "",
13 |     index: str = "",
14 |     session: requests.Session = requests.Session(),
15 |     username: str = "",
16 |     password: str = "",
17 | ):
18 |     """Try to log in to a wiki using various methods.\n
19 |     Return `session` on success, else return `None`.\n
20 |     Try: `client login (api) => bot login (api) => index login (index)`"""
21 | 
22 |     if (not api and not index) or (not username or not password):
23 |         print("uniLogin: api or index or username or password is empty")
24 |         return None
25 | 
26 |     if api:
27 |         print("Trying to log in to the wiki using clientLogin... (MW 1.27+)")
28 |         if _session := clientLogin(
29 |             api=api, session=session, username=username, password=password
30 |         ):
31 |             return _session
32 |         time.sleep(5)
33 | 
34 |         print("Trying to log in to the wiki using botLogin... (MW 1.27+)")
35 |         if _session := botLogin(
36 |             api=api, session=session, username=username, password=password
37 |         ):
38 |             return _session
39 |         time.sleep(5)
40 | 
41 |     if index:
42 |         print("Trying to log in to the wiki using indexLogin... (generic)")
43 |         if _session := indexLogin(
44 |             index=index, session=session, username=username, password=password
45 |         ):
46 |             return _session
47 | 
48 |     return None
49 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug Report
 3 | about: Create a report to help us improve
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | 
11 | 
12 | 
14 | 
15 | ## Describe the Bug
16 | 
17 | 
18 | 
19 | ### Expected Behavior
20 | 
21 | 
22 | 
23 | ### Actual Behavior
24 | 
25 | 
26 | 
27 | ## Command for Reproducing the Bug
28 | 
29 | 
31 | 
32 | ```bash
33 | 
34 | ```
35 | 
36 | ## Output
37 | 
38 | <details>
39 | <summary>stdout</summary>
40 | 
41 | 
43 | 
44 | ```bash
45 | 
46 | ```
47 | 
48 | </details>
49 | 
50 | <details>
51 | <summary>errors.log</summary>
52 | 
53 | 
55 | 
56 | ```text
57 | 
58 | ```
59 | 
60 | </details>
61 | 
62 | ## Platform Details
63 | 
64 | 
66 | 
67 | ### Desktop
68 | 
69 | - OS and version:
70 | - File system:
71 | - Python version:
72 | - Command line shell:
73 | - `dumpgenerator` version:
74 | 
75 | ### Smartphone or Tablet
76 | 
77 | - OS:
78 | - Python version:
79 | - Command line shell:
80 | - Terminal application used:
81 | - `dumpgenerator` version:
82 | 
83 | ## Additional Context
84 | 
85 | 
86 | 
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | # Apply to all files without committing:
 2 | #   pre-commit run --all-files
 3 | # Update this file:
 4 | #   pre-commit autoupdate
 5 | default_language_version:
 6 |   python: python3.8
 7 | repos:
 8 |   - repo: https://github.com/python-poetry/poetry
 9 |     rev: 1.6.0
10 |     hooks:
11 |       - id: poetry-check
12 |       # - id: poetry-lock
13 |       - id: poetry-export
14 |         args: ["-f", "requirements.txt", "-o", "requirements.txt"]
15 |   - repo: https://github.com/pre-commit/pre-commit-hooks
16 |     rev: v4.4.0
17 |     hooks:
18 |       - id: check-ast
19 |       - id: fix-byte-order-marker
20 |       - id: check-case-conflict
21 |       - id: check-docstring-first
22 |       - id: check-executables-have-shebangs
23 |       - id: check-json
24 |       - id: check-yaml
25 |       - id: debug-statements
26 |       # - id: detect-aws-credentials
27 |       # - id: detect-private-key
28 |       - id: end-of-file-fixer
29 |       - id: trailing-whitespace
30 |       - id: mixed-line-ending
31 |   # - repo: https://github.com/pre-commit/mirrors-mypy
32 |   #   rev: v0.942
33 |   #   hooks:
34 |   #     - id: mypy
35 |   #       args: [--ignore-missing-imports]
36 |   - repo: https://github.com/PyCQA/isort
37 |     rev: 5.12.0
38 |     hooks:
39 |       - id: isort
40 |         args: ["--profile", "black", "--filter-files"]
41 |   - repo: https://github.com/psf/black
42 |     rev: 23.7.0
43 |     hooks:
44 |       - id: black
45 |   - repo: https://github.com/asottile/pyupgrade
46 |     rev: v3.10.1
47 |     hooks:
48 |       - id: pyupgrade
49 |         args: [--py38-plus]
50 |   - repo: https://github.com/asottile/blacken-docs
51 |     rev: 1.16.0
52 |     hooks:
53 |       - id: blacken-docs
54 |       # additional_dependencies: [black==20.8b1]
55 |   ### Needs argument for disabling line_length
56 |   ### https://github.com/jackdewinter/pymarkdown/blob/main/docs/rules/rule_md013.md
57 |   - repo: https://github.com/jackdewinter/pymarkdown
58 |     rev: v0.9.12
59 |     hooks:
60 |       - id: pymarkdown
61 |         args:
62 |           - --config=.pymarkdown.json
63 |           # - --disable-rules
64 |           # - line-length,no-inline-html
65 |           - scan
66 | 
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/dump/misc/site_info.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | 
 4 | from wikiteam3.dumpgenerator.api import getJSON
 5 | from wikiteam3.dumpgenerator.cli import Delay
 6 | from wikiteam3.dumpgenerator.config import Config
 7 | 
 8 | 
 9 | def saveSiteInfo(config: Config = None, session=None):
10 |     """Save a file with site info"""
11 | 
12 |     if not config.api:
13 |         return
14 |     if os.path.exists(f"{config.path}/siteinfo.json"):
15 |         print("siteinfo.json exists, do not overwrite")
16 |     else:
17 |         print("Downloading site info as siteinfo.json")
18 | 
19 |         # MediaWiki 1.13+
20 |         r = session.get(
21 |             url=config.api,
22 |             params={
23 |                 "action": "query",
24 |                 "meta": "siteinfo",
25 |                 "siprop": "general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo",
"sinumberingroup": 1, 27 | "format": "json", 28 | }, 29 | timeout=10, 30 | ) 31 | # MediaWiki 1.11-1.12 32 | if "query" not in getJSON(r): 33 | r = session.get( 34 | url=config.api, 35 | params={ 36 | "action": "query", 37 | "meta": "siteinfo", 38 | "siprop": "general|namespaces|statistics|dbrepllag|interwikimap", 39 | "format": "json", 40 | }, 41 | timeout=10, 42 | ) 43 | # MediaWiki 1.8-1.10 44 | if "query" not in getJSON(r): 45 | r = session.get( 46 | url=config.api, 47 | params={ 48 | "action": "query", 49 | "meta": "siteinfo", 50 | "siprop": "general|namespaces", 51 | "format": "json", 52 | }, 53 | timeout=10, 54 | ) 55 | result = getJSON(r) 56 | Delay(config=config, session=session) 57 | with open(f"{config.path}/siteinfo.json", "w", encoding="utf-8") as outfile: 58 | outfile.write(json.dumps(result, indent=4, sort_keys=True)) 59 | -------------------------------------------------------------------------------- /PUBLISHING.md: -------------------------------------------------------------------------------- 1 | # Publishing the dump 2 | 3 | Publishing your dumps to the [Internet Archive's wikiteam collection](https://archive.org/details/wikiteam) is easily done. First [sign up](https://archive.org/account/signup) or [login](http://archive.org/account/login.php). 4 | 5 | ## Launcher and uploader 6 | 7 | Instructions on using the scripts `launcher` and `uploader` are in the file [Usage](./USAGE.md). 8 | 9 | ## Automatic publishing 10 | 11 | Just use `uploader` (especially if you have multiple wikis): the script takes the filename of a list of wikis as argument and uploads their dumps to archive.org. You only need to: 12 | 13 | - Check the 7z compressed dumps are in the same directory as `listfile`. The file `listfile` contains a list of the api.php URLs of the wikis to upload, one per line. 14 | - [Retrieve your S3 keys](http://www.archive.org/account/s3.php), save them one per line (in the order provided) in a keys.txt file in same directory as `uploader`. 15 | - Run the script `uploader listfile`. 16 | 17 | ## Manual publishing 18 | 19 | - After running dumpgenerator, in each dump folder, select all files, right-click on the selection, click 7-Zip, click `Add to archive...` and click OK. 20 | - At Archive.org, for each wiki [create a new item](http://archive.org/create/). 21 | - Click `Upload files`. Then either drag and drop the 7-Zip archive onto the box or click `Choose files` and select the 7-Zip archive. 22 | - `Page Title` and `Page URL` will be filled in by the uploader. 23 | - Add a short `Description`, such as a descriptive name fopr the wiki. 24 | - Add `Subject Tags`, separated by commas, these are the keywords that will help the archive to show up in a Internet Archive search, e.g. wikiteam,wiki,subjects of the wiki, and so on. 25 | - `Creator`, can be left blank. 26 | - `Date`, can be left blank. 27 | - `Collection`, select `Community texts`. 28 | - `Language`, select the language of the wiki. 29 | - `License`, click to expand and select Creative Commons, Allow Remixing, Require Share-Alike for a CC-BY-SA licence. 30 | - Click `Upload and Create Your Item`. 31 | 32 | With the subject tag of wikiteam and collection of community texts, your uploads should appear in a search for [subject:"wikiteam" AND collection:opensource](https://archive.org/search?query=subject%3A%22wikiteam%22+AND+collection%3Aopensource). 33 | 34 | ## Info for developers 35 | 36 | - [Internet Archive’s S3 like server API](https://archive.org/developers/ias3.html). 
37 | 
--------------------------------------------------------------------------------
/wikiteam3/utils/login/index.py:
--------------------------------------------------------------------------------
 1 | """ Always available login methods (MW 1.16-1.39).
 2 | Even older versions of MW may work, but have not been tested. """
 3 | 
 4 | from typing import *
 5 | 
 6 | import lxml.html
 7 | import requests
 8 | 
 9 | 
10 | def indexLogin(
11 |     index: str, session: requests.Session, username: str, password: str
12 | ) -> Optional[requests.Session]:
13 |     """Try to log in to a wiki using username and password through `Special:UserLogin`.
14 |     (tested on MW 1.16...1.39)"""
15 |     wpEditToken = None
16 |     wpLoginToken = None
17 | 
18 |     params = {
19 |         "title": "Special:UserLogin",
20 |     }
21 |     r = session.get(index, allow_redirects=True, params=params)
22 | 
23 |     # Sample r.text:
24 |     # MW 1.16:
25 |     # MW 1.39:
26 |     html = lxml.html.fromstring(r.text)
27 |     if "wpLoginToken" in r.text:
28 |         wpLoginToken = html.xpath('//input[@name="wpLoginToken"]/@value')[0]
29 | 
30 |     # Sample r.text:
31 |     # MW 1.16: None
32 |     # MW 1.39:
33 |     if "wpEditToken" in r.text:
34 |         wpEditToken = html.xpath('//input[@name="wpEditToken"]/@value')[0]
35 |         print("index login: wpEditToken found.")
36 | 
37 |     data = {
38 |         "wpName": username,  # required
39 |         "wpPassword": password,  # required
40 |         "wpLoginattempt": "Log in",  # required
41 |         "wpLoginToken": wpLoginToken,  # required
42 |         "wpRemember": "1",  # 0: not remember, 1: remember
43 |         "wpEditToken": wpEditToken,  # introduced before MW 1.27, not sure whether it's required.
44 |         "authAction": "login",  # introduced before MW 1.39.
45 |         "title": "Special:UserLogin",  # introduced before MW 1.39.
46 |         "force": "",  # introduced before MW 1.39, empty string is OK.
47 |     }
48 |     r = session.post(index, allow_redirects=False, params=params, data=data)
49 |     if r.status_code == 302:
50 |         print("index login: Success! Welcome, ", username, "!")
51 |         return session
52 |     else:
53 |         print(
54 |             "index login: Oops! 
Something went wrong -- ", 55 | r.status_code, 56 | "wpLoginToken: ", 57 | wpLoginToken, 58 | "wpEditToken: ", 59 | wpEditToken, 60 | ) 61 | return None 62 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/dump/xmldump/xml_truncate.py: -------------------------------------------------------------------------------- 1 | import os 2 | from io import StringIO 3 | from typing import * 4 | 5 | import lxml.etree 6 | from file_read_backwards import FileReadBackwards 7 | 8 | 9 | def endsWithNewlines(filename: str) -> int: 10 | """Returns the number of newlines at the end of file""" 11 | 12 | with FileReadBackwards(filename, encoding="utf-8") as frb: 13 | newlines = 0 14 | while frb.readline() == "": 15 | newlines += 1 16 | return newlines 17 | 18 | 19 | def addNewline(filename: str) -> None: 20 | """Adds a newline to the end of file""" 21 | 22 | print(f"Adding newline to end of {filename}") 23 | with open(filename, "a", encoding="utf-8") as f: 24 | f.write("\n") 25 | 26 | 27 | def truncateXMLDump(filename: str) -> str: 28 | """Removes incomplete elements from the end of XML dump files""" 29 | 30 | with FileReadBackwards(filename, encoding="utf-8") as frb: 31 | incomplete_segment: str = "" 32 | xml_line: str = frb.readline() 33 | while xml_line and "" not in xml_line: 34 | incomplete_segment = xml_line + incomplete_segment 35 | xml_line = frb.readline() 36 | while xml_line and "" not in xml_line: 37 | incomplete_segment = xml_line + incomplete_segment 38 | xml_line = frb.readline() 39 | incomplete_segment_size = len(incomplete_segment.encode("utf-8")) 40 | file_size = os.path.getsize(filename) 41 | if file_size > incomplete_segment_size: 42 | with open(filename, "r+", encoding="utf-8") as fh: 43 | fh.truncate(file_size - incomplete_segment_size) 44 | else: 45 | print( 46 | 'len(incomplete_segment.encode("utf-8")) returned ' 47 | + str(incomplete_segment_size) 48 | + ", while os.path.getsize(filename) returned " 49 | + str(file_size) 50 | + ", so fh.truncate() would be fh.truncate(" 51 | + str(file_size - incomplete_segment_size) 52 | + "), which would be illegal. Something is seriously wrong here!" 
53 | ) 54 | 55 | # add newline to prevent ` ` in one line 56 | if endsWithNewlines(filename) == 0: 57 | addNewline(filename) 58 | elif endsWithNewlines(filename) > 1: 59 | print(f"WARNING: {filename} has {endsWithNewlines(filename)} newlines") 60 | return incomplete_segment 61 | 62 | 63 | def parseLastPageChunk(chunk) -> Optional[lxml.etree._ElementTree]: 64 | try: 65 | parser = lxml.etree.XMLParser(recover=True) 66 | tree = lxml.etree.parse(StringIO(chunk), parser) 67 | return tree.getroot() 68 | except lxml.etree.LxmlError: 69 | return None 70 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/dump/image/html_regexs.py: -------------------------------------------------------------------------------- 1 | R_NEXT = r"(?\d+)&" 2 | 3 | REGEX_CANDIDATES = [ 4 | # [0] 5 | # archiveteam 1.15.1 Yahoovideo.jpg (file) 6 | # wikanda 1.15.5 Fernandocg 10 | r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+\s*]+>(?P[^<]+)' 11 | # [1] 12 | # wikijuegos 1.9.5 13 | # http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old 14 | # mediawiki version 15 | , 16 | r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+\s*[^<]+\s*[^<]+\s*]+>(?P[^<]+)' 17 | # [2] 18 | # gentoowiki 1.18 19 | , 20 | r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+]+>]+>[^<]+]+>(?P[^<]+)' 21 | # [3] 22 | # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch= 23 | # (desc) 109 0923.JPG . . 885,713 bytes . . Bfalconer . . 18:44, 17 November 2005
24 | , 25 | '(?ism)]+ title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+]+>(?P[^<]+)' 26 | # [4] 27 | , 28 | ( 29 | r'(?im)\s*]*?>(?P[^>]+)[^<]*?[^<]*?[^<]*?\s*' 30 | r'[^\n\r]*?\s*' 31 | r'[^<]*?\s*' 32 | r'\s*(?:)?(?:)?(?P[^<]+?)(?:)?(?:)?\s*(?:(?:(?!)(?!).)*?)?' 33 | ), 34 | ] 35 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | config = { 3 | "curonly": args.curonly, 4 | "date": datetime.datetime.now().strftime("%Y%m%d"), 5 | "api": api, 6 | "failfast": args.failfast, 7 | "http_method": "POST", 8 | "index": index, 9 | "images": args.images, 10 | "logs": False, 11 | "xml": args.xml, 12 | "xmlrevisions": args.xmlrevisions, 13 | "namespaces": namespaces, 14 | "exnamespaces": exnamespaces, 15 | "path": args.path and os.path.normpath(args.path) or "", 16 | "cookies": args.cookies or "", 17 | "delay": args.delay, 18 | "retries": int(args.retries), 19 | } 20 | """ 21 | 22 | import dataclasses 23 | import json 24 | import sys 25 | from typing import * 26 | 27 | 28 | def _dataclass_from_dict(klass_or_obj, d): 29 | ret = klass_or_obj() if isinstance(klass_or_obj, type) else klass_or_obj 30 | for k, v in d.items(): 31 | if hasattr(ret, k): 32 | setattr(ret, k, v) 33 | return ret 34 | 35 | 36 | @dataclasses.dataclass 37 | class Config: 38 | def asdict(self): 39 | return dataclasses.asdict(self) 40 | 41 | # General params 42 | delay: float = 0.0 43 | retries: int = 0 44 | path: str = "" 45 | logs: bool = False 46 | date: str = False 47 | 48 | # URL params 49 | index: str = "" 50 | api: str = "" 51 | 52 | # Download params 53 | xml: bool = False 54 | curonly: bool = False 55 | xmlapiexport: bool = False 56 | xmlrevisions: bool = False 57 | xmlrevisions_page: bool = False 58 | images: bool = False 59 | namespaces: List[int] = None 60 | exnamespaces: List[int] = None 61 | 62 | api_chunksize: int = 0 # arvlimit, ailimit, etc 63 | export: str = "" # Special:Export page name 64 | http_method: str = "" 65 | 66 | # Meta info params 67 | failfast: bool = False 68 | 69 | templates: bool = False 70 | 71 | 72 | def newConfig(configDict) -> Config: 73 | return _dataclass_from_dict(Config, configDict) 74 | 75 | 76 | def loadConfig(config: Config = None, configfilename=""): 77 | """Load config file""" 78 | 79 | configDict = dataclasses.asdict(config) 80 | 81 | if config.path: 82 | try: 83 | with open(f"{config.path}/{configfilename}", encoding="utf-8") as infile: 84 | configDict.update(json.load(infile)) 85 | return newConfig(configDict) 86 | except: 87 | pass 88 | 89 | print("There is no config file. we can't resume. Start a new dump.") 90 | sys.exit() 91 | 92 | 93 | def saveConfig(config: Config = None, configfilename=""): 94 | """Save config file""" 95 | 96 | with open(f"{config.path}/{configfilename}", "w", encoding="utf-8") as outfile: 97 | json.dump(dataclasses.asdict(config), outfile) 98 | -------------------------------------------------------------------------------- /wikiteam3/utils/login/api.py: -------------------------------------------------------------------------------- 1 | """ Available since MediaWiki 1.27. 
login to a wiki using username and password (API) """ 2 | 3 | from typing import * 4 | 5 | import requests 6 | 7 | 8 | def fetchLoginToken(session: requests.Session, api: str) -> Optional[str]: 9 | """fetch login token by API .(MediaWiki 1.27+)""" 10 | 11 | response = session.get( 12 | url=api, 13 | params={"action": "query", "meta": "tokens", "type": "login", "format": "json"}, 14 | ) 15 | data = response.json() 16 | try: 17 | token = data["query"]["tokens"]["logintoken"] 18 | if type(token) is str: 19 | return token 20 | except KeyError: 21 | print("fetch login token: Oops! Something went wrong -- ", data) 22 | return None 23 | 24 | 25 | def clientLogin( 26 | api: str, session: requests.Session, username: str, password: str 27 | ) -> Optional[requests.Session]: 28 | """login to a wiki using username and password. (MediaWiki 1.27+)""" 29 | 30 | login_token = fetchLoginToken(session=session, api=api) 31 | if not login_token: 32 | return None 33 | 34 | response = session.post( 35 | url=api, 36 | data={ 37 | "action": "clientlogin", 38 | "username": username, 39 | "password": password, 40 | "loginreturnurl": "http://127.0.0.1:5000/", 41 | "logintoken": login_token, 42 | "format": "json", 43 | }, 44 | ) 45 | 46 | data = response.json() 47 | 48 | try: 49 | if data["clientlogin"]["status"] == "PASS": 50 | print( 51 | "client login: Success! Welcome, " 52 | + data["clientlogin"]["username"] 53 | + "!" 54 | ) 55 | except KeyError: 56 | print("client login: Oops! Something went wrong -- ", data) 57 | return None 58 | 59 | return session 60 | 61 | 62 | def botLogin( 63 | api: str, session: requests.Session, username: str, password: str 64 | ) -> Optional[requests.Session]: 65 | """login to a wiki using BOT's name and password. (MediaWiki 1.27+)""" 66 | 67 | login_token = fetchLoginToken(session=session, api=api) 68 | if not login_token: 69 | return None 70 | 71 | response = session.post( 72 | url=api, 73 | data={ 74 | "action": "login", 75 | "lgname": username, 76 | "lgpassword": password, 77 | "lgtoken": login_token, 78 | "format": "json", 79 | }, 80 | ) 81 | 82 | data = response.json() 83 | 84 | try: 85 | if data["login"]["result"] == "Success": 86 | print("bot login: Success! Welcome, " + data["login"]["lgusername"] + "!") 87 | except KeyError: 88 | print(f"bot login: Oops! 
Something went wrong -- {data}") 89 | return None 90 | 91 | return session 92 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/cli/greeter.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from wikiteam3.dumpgenerator.version import getVersion 4 | 5 | 6 | def welcome(): 7 | message = "" 8 | """Opening message""" 9 | message += "#" * 73 10 | message += "\n" 11 | welcome_string = f"# Welcome to DumpGenerator {getVersion()} by WikiTeam (GPL v3)" 12 | welcome_string += " " * (73 - len(welcome_string) - 1) + "#" 13 | message += welcome_string 14 | message += "\n" 15 | message += ( 16 | "# More info at: https://github.com/elsiehupp/wikiteam3 #" 17 | ) 18 | message += "\n" 19 | message += "#" * 73 20 | message += "\n" 21 | message += "" 22 | message += "\n" 23 | message += "#" * 73 24 | message += "\n" 25 | message += ( 26 | "# Copyright (C) 2011-%d WikiTeam developers #\n" 27 | % (datetime.datetime.now().year) 28 | ) 29 | message += """# # 30 | # This program is free software: you can redistribute it and/or modify # 31 | # it under the terms of the GNU General Public License as published by # 32 | # the Free Software Foundation, either version 3 of the License, or # 33 | # (at your option) any later version. # 34 | # # 35 | # This program is distributed in the hope that it will be useful, # 36 | # but WITHOUT ANY WARRANTY; without even the implied warranty of # 37 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # 38 | # GNU General Public License for more details. # 39 | # # 40 | # You should have received a copy of the GNU General Public License # 41 | # along with this program. If not, see . #""" 42 | message += "\n" 43 | message += "#" * 73 44 | message += "\n" 45 | message += "" 46 | 47 | return message 48 | 49 | 50 | def bye(): 51 | """Closing message""" 52 | print("") 53 | print("---> Congratulations! Your dump is complete <---") 54 | print("") 55 | print("If you encountered a bug, you can report it on GitHub Issues:") 56 | print(" https://github.com/mediawiki-client-tools/mediawiki-dump-generator/issues") 57 | print("") 58 | print("If you need any other help, you can reach out on GitHub Discussions:") 59 | print(" https://github.com/orgs/mediawiki-client-tools/discussions") 60 | print("") 61 | print("If this is a public wiki, please, consider publishing this dump.") 62 | print("Do it yourself as explained in:") 63 | print(" https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump") 64 | print("") 65 | print("Good luck! Bye!") 66 | print("") 67 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "wikiteam3" 3 | version = "3.0.0" 4 | description = "Tools for downloading and preserving wikis. We archive wikis, from Wikipedia to tiniest wikis. As of 2020, WikiTeam has preserved more than 250,000 wikis." 
5 | license = "GPL-3.0-or-later" 6 | authors = ["WikiTeam Contributors "] 7 | maintainers = [ 8 | "Federico Leva ", 9 | "Elsie Hupp " 10 | ] 11 | readme = "README.md" 12 | homepage = "https://wiki.archiveteam.org/index.php/WikiTeam" 13 | repository = "https://github.com/WikiTeam/wikiteam" 14 | documentation = "https://wikiteam.readthedocs.io" 15 | keywords = [ 16 | "archiveteam", 17 | "mediawiki", 18 | "preservation", 19 | "wiki", 20 | "wikipedia" 21 | ] 22 | classifiers = [ 23 | "Development Status :: 3 - Alpha", 24 | "Environment :: Console", 25 | "Intended Audience :: Education", 26 | "Intended Audience :: End Users/Desktop", 27 | "Intended Audience :: Information Technology", 28 | "Intended Audience :: Legal Industry", 29 | "Intended Audience :: Science/Research", 30 | "Intended Audience :: System Administrators", 31 | "Natural Language :: English", 32 | "Operating System :: OS Independent", 33 | "Topic :: Communications", 34 | "Topic :: Internet", 35 | "Topic :: Internet :: WWW/HTTP :: Dynamic Content :: Wiki", 36 | "Topic :: Scientific/Engineering :: Information Analysis", 37 | "Topic :: Sociology :: History", 38 | "Topic :: System :: Archiving", 39 | "Topic :: System :: Archiving :: Backup", 40 | "Topic :: Utilities" 41 | ] 42 | packages = [ 43 | { include = "wikiteam3/**/*"}, 44 | ] 45 | exclude = ["wikiteam3/dumpgenerator/test/*"] 46 | 47 | [tool.poetry.scripts] 48 | dumpgenerator = "wikiteam3.dumpgenerator:main" 49 | # gui = "wikiteam3.gui:main" 50 | launcher = "wikiteam3.launcher:main" 51 | # not-archived = "wikiteam3.not-archived:main" 52 | uploader = "wikiteam3.uploader:main" 53 | # wikiadownloader = "wikiteam3.wikiadownloader:main" 54 | # wikipediadownloader = "wikiteam3.wikipediadownloader:main" 55 | # wikispaces = "wikiteam3.wikispaces:main" 56 | 57 | [tool.poetry.dependencies] 58 | python = "^3.8" 59 | requests = "^2.32.0" 60 | internetarchive = "^3.1.0" 61 | lxml = "^5.0.0" 62 | mwclient = "^0.10.1" 63 | PyMySQL = "^1.1.1" 64 | pywikibot = "^6.6.1" 65 | urllib3 = "^1.26.18" 66 | wikitools3 = "^3.0.0" 67 | pymysql = "*" 68 | file_read_backwards = "^2.0.0" 69 | pre-commit-poetry-export = "^0.1.2" 70 | 71 | [tool.isort] 72 | profile = "black" 73 | 74 | [tool.poetry.dev-dependencies] 75 | pytest = "^6.2.5" 76 | requests = "^2.32.0" 77 | flake8 = "^3.9.2" 78 | pre-commit = "^2.17.0" 79 | pymarkdown = "^0.1.4" 80 | 81 | [build-system] 82 | requires = ["poetry-core>=1.0.0"] 83 | build-backend = "poetry.core.masonry.api" 84 | 85 | [tool.pymarkdown] 86 | disable-rules = "line-length,no-inline-html" 87 | -------------------------------------------------------------------------------- /wikiteam3/utils/util.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import re 3 | import sys 4 | 5 | 6 | def cleanHTML(raw: str = "") -> str: 7 | """Extract only the real wiki content and remove rubbish 8 | This function is ONLY used to retrieve page titles 9 | and file names when no API is available 10 | DO NOT use this function to extract page content""" 11 | # different "tags" used by different MediaWiki versions to mark where 12 | # starts and ends content 13 | if re.search("", raw): 14 | raw = raw.split("")[1].split("")[0] 15 | elif re.search("", raw): 16 | raw = raw.split("")[1].split("")[0] 17 | elif re.search("", raw): 18 | raw = raw.split("")[1].split( 19 | "" 20 | )[0] 21 | elif re.search("", raw): 22 | raw = raw.split("")[1].split("")[0] 23 | elif re.search(r'
24 |         raw = raw.split('<article id="WikiaMainContent" class="WikiaMainContent">')[
25 |             1
26 |         ].split("</article>")[0]
")[0] 27 | elif re.search("')[0] 29 | else: 30 | print(raw[:250]) 31 | print("This wiki doesn't use marks to split content") 32 | sys.exit() 33 | return raw 34 | 35 | 36 | def undoHTMLEntities(text: str = "") -> str: 37 | """Undo some HTML codes""" 38 | 39 | # i guess only < > & " ' need conversion 40 | # http://www.w3schools.com/html/html_entities.asp 41 | text = re.sub("<", "<", text) 42 | text = re.sub(">", ">", text) 43 | text = re.sub("&", "&", text) 44 | text = re.sub(""", '"', text) 45 | text = re.sub("'", "'", text) 46 | 47 | return text 48 | 49 | 50 | def removeIP(raw: str = "") -> str: 51 | """Remove IP from HTML comments """ 52 | 53 | raw = re.sub(r"\d+\.\d+\.\d+\.\d+", "0.0.0.0", raw) 54 | # http://www.juniper.net/techpubs/software/erx/erx50x/swconfig-routing-vol1/html/ipv6-config5.html 55 | # weird cases as :: are not included 56 | raw = re.sub( 57 | r"(?i)[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}", 58 | "0:0:0:0:0:0:0:0", 59 | raw, 60 | ) 61 | 62 | return raw 63 | 64 | 65 | def cleanXML(xml: str = "") -> str: 66 | """Trim redundant info from the XML however it comes""" 67 | # do not touch XML codification, leave AS IS 68 | # EDIT 2022: we are making this explicitly Unicode 69 | # for Windows compatibility. 70 | # If the encoding has to stay as is, we'll have 71 | # to change all the file encodings, as well. 72 | 73 | if re.search(r"\n", xml): 74 | xml = xml.split("\n")[1] 75 | if re.search(r"", xml): 76 | xml = xml.split("")[0] 77 | return xml 78 | 79 | 80 | def sha1File(filename: str = "") -> str: 81 | """Return the SHA1 hash of a file""" 82 | 83 | sha1 = hashlib.sha1() 84 | with open(filename, "rb") as f: 85 | while True: 86 | if data := f.read(65536): 87 | sha1.update(data) 88 | else: 89 | break 90 | return sha1.hexdigest() 91 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # `MediaWiki Dump Generator` 2 | 3 | **MediaWiki Dump Generator can archive wikis from the largest to the tiniest.** 4 | 5 | `MediaWiki Dump Generator` is a project to port the legacy [`wikiteam`](https://github.com/WikiTeam/wikiteam) toolset to Python 3 and PyPI to make it more accessible for today's archivers. 6 | 7 | Most of the focus has been on the core `dumpgenerator` tool. Python 3 versions of the other `wikiteam` tools may be added over time. 8 | 9 | The project is currently mostly in maintenance mode. We will do our best to prevent the project from breaking entirely. Issues and pull requests are welcomed but may not be reviewed promptly. 10 | 11 | ## MediaWiki Dump Generator Toolset 12 | 13 | MediaWiki Dump Generator is a set of tools for archiving wikis. The main general-purpose module of MediaWiki Dump Generator is dumpgenerator, which can download XML dumps of MediaWiki sites that can then be parsed or redeployed elsewhere. 14 | 15 | Wikipedia is far too large to manage the dump easily and [dumps are already freely available](https://en.wikipedia.org/wiki/Wikipedia:Database_download#Where_do_I_get_the_dumps?). 16 | 17 | ## Installing the tools 18 | 19 | For prerequisites and installation see [Installation](./INSTALLATION.md) 20 | 21 | ## Using the tools 22 | 23 | For usage see [Usage](./USAGE.md) 24 | 25 | ## Publishing the dump 26 | 27 | Please consider publishing your wiki dump(s). You can do it yourself as explained in [Publishing](./PUBLISHING.md). 
31 | ## Publishing the dump
32 | 
33 | Please consider publishing your wiki dump(s). You can do it yourself as explained in [Publishing](./PUBLISHING.md).
34 | 
35 | ## Getting help
36 | 
37 | * You can read and post in MediaWiki Client Tools' [GitHub Discussions](https://github.com/orgs/mediawiki-client-tools/discussions).
38 | * If you need help (other than reporting a bug), you can reach out on MediaWiki Client Tools' [Discussions/Q&A](https://github.com/orgs/mediawiki-client-tools/discussions/categories/q-a).
39 | 
40 | ## Contributing
41 | 
42 | For information on reporting bugs and proposing changes, please see the [Contributing](./CONTRIBUTING.md) guide.
43 | 
44 | ## Code of Conduct
45 | 
46 | `mediawiki-client-tools` has a [Code of Conduct](./CODE_OF_CONDUCT.md).
47 | 
48 | At the moment the only person responsible for reviewing CoC reports is the repository administrator, Janet Cobb, reachable at [git@randomcat.org](mailto:git@randomcat.org). Please state up front if your message concerns the Code of Conduct, as these messages are kept confidential.
49 | 
50 | In case of emergency (i.e. if Janet is not reachable or if such an issue involves her), you can contact Elsie Hupp, who also retains privileges over this repository, directly via email at [mediawiki-client-tools@elsiehupp.com](mailto:mediawiki-client-tools@elsiehupp.com) or on Matrix at [@elsiehupp:beeper.com](https://matrix.to/#/@elsiehupp:beeper.com).
51 | 
52 | ## Contributors
53 | 
54 | **WikiTeam** is the [Archive Team](http://www.archiveteam.org) [[GitHub](https://github.com/ArchiveTeam)] subcommittee on wikis.
55 | It was founded and originally developed by [Emilio J. Rodríguez-Posada](https://github.com/emijrp), a Wikipedia veteran editor and amateur archivist. Thanks to everyone who has helped, especially: [Federico Leva](https://github.com/nemobis), [Alex Buie](https://github.com/ab2525), [Scott Boyd](http://www.sdboyd56.com), [Hydriz](https://github.com/Hydriz), Platonides, Ian McEwen, [Mike Dupont](https://github.com/h4ck3rm1k3), [balr0g](https://github.com/balr0g) and [PiRSquared17](https://github.com/PiRSquared17).
56 | 
57 | **MediaWiki Dump Generator**
58 | The Python 3 initiative was started and originally maintained by [Elsie Hupp](https://github.com/elsiehupp); it is currently primarily maintained by [Janet Cobb](https://github.com/randomnetcat). We are also grateful for contributions from [Victor Gambier](https://github.com/vgambier), [Thomas Karcher](https://github.com/t-karcher), [yzqzss](https://github.com/yzqzss), [NyaMisty](https://github.com/NyaMisty) and [Rob Kam](https://github.com/robkam).
59 | 
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/api/namespaces.py:
--------------------------------------------------------------------------------
1 | import re
2 | 
3 | from wikiteam3.dumpgenerator.api import getJSON
4 | from wikiteam3.dumpgenerator.cli import Delay
5 | from wikiteam3.dumpgenerator.config import Config
6 | 
7 | 
8 | def getNamespacesScraper(config: Config = None, session=None):
9 |     """Hackishly get the list of namespace names and ids from the dropdown
10 |     in the HTML of Special:AllPages; used when no API is available."""
11 |     namespaces = config.namespaces
12 |     namespacenames = {0: ""}  # main is 0, no prefix
13 |     if namespaces:
14 |         r = session.post(
15 |             url=config.index, params={"title": "Special:Allpages"}, timeout=30
16 |         )
17 |         raw = r.text
18 |         Delay(config=config, session=session)
19 | 
20 |         # [^>]*? to include selected="selected"
21 |         m = re.compile(
22 |             r'<option [^>]*?value="(?P<namespaceid>\d+)"[^>]*?>(?P<namespacename>[^<]+)</option>'
23 |         ).finditer(raw)
24 |         if "all" in namespaces:
25 |             namespaces = []
26 |             for i in m:
27 |                 namespaces.append(int(i.group("namespaceid")))
28 |                 namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
29 |         else:
30 |             # check if those namespaces really exist in this wiki
31 |             namespaces2 = []
32 |             for i in m:
33 |                 if int(i.group("namespaceid")) in namespaces:
34 |                     namespaces2.append(int(i.group("namespaceid")))
35 |                     namespacenames[int(i.group("namespaceid"))] = i.group(
36 |                         "namespacename"
37 |                     )
38 |             namespaces = namespaces2
39 |     else:
40 |         namespaces = [0]
41 | 
42 |     namespaces = list(set(namespaces))  # uniques
43 |     print("%d namespaces found" % (len(namespaces)))
44 |     return namespaces, namespacenames
45 | 
46 | 
47 | def getNamespacesAPI(config: Config = None, session=None):
48 |     """Uses the API to get the list of namespace names and ids"""
49 |     namespaces = config.namespaces
50 |     namespacenames = {0: ""}  # main is 0, no prefix
51 |     if namespaces:
52 |         r = session.get(
53 |             url=config.api,
54 |             params={
55 |                 "action": "query",
56 |                 "meta": "siteinfo",
57 |                 "siprop": "namespaces",
58 |                 "format": "json",
59 |             },
60 |             timeout=30,
61 |         )
62 |         result = getJSON(r)
63 |         Delay(config=config, session=session)
64 |         try:
65 |             nsquery = result["query"]["namespaces"]
66 |         except KeyError:
67 |             print("Error: could not get namespaces from the API request.")
68 |             print("HTTP %d" % r.status_code)
69 |             print(r.text)
70 |             return None
71 | 
72 |         if "all" in namespaces:
73 |             namespaces = []
74 |             for i in nsquery.keys():
75 |                 if int(i) < 0:  # -1: Special, -2: Media, excluding
76 |                     continue
77 |                 namespaces.append(int(i))
78 |                 namespacenames[int(i)] = nsquery[i]["*"]
79 |         else:
80 |             # check if those namespaces really exist in this wiki
81 |             namespaces2 = []
82 |             for i in nsquery.keys():
83 |                 bi = i
84 |                 i = int(i)
85 |                 if i < 0:  # -1: Special, -2: Media, excluding
86 |                     continue
87 |                 if i in namespaces:
88 |                     namespaces2.append(i)
89 |                     namespacenames[i] = nsquery[bi]["*"]
90 |             namespaces = namespaces2
91 |     else:
92 |         namespaces = [0]
93 | 
94 |     namespaces = list(set(namespaces))  # uniques
95 |     print("%d namespaces found" % (len(namespaces)))
96 |     return namespaces, namespacenames
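97 | 
98 | # Editor's note (illustration, not in the original file): both helpers return a
99 | # (namespaces, namespacenames) pair shaped like ([0, 1, 2, ...],
100 | # {0: "", 1: "Talk", 2: "User", ...}). Beware that getNamespacesAPI returns a
101 | # bare None when the API reply lacks the expected keys, so callers must be
102 | # prepared to handle both shapes.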
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/dump/image/html_regexs_test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import warnings
4 | from pathlib import Path
5 | from typing import Dict
6 | 
7 | import pytest
8 | import requests
9 | 
10 | from wikiteam3.dumpgenerator.dump.image.html_regexs import REGEX_CANDIDATES
11 | 
12 | ONLINE = True
13 | 
14 | HTML_DIR = Path("test/data/html_regexs")
15 | os.makedirs(HTML_DIR, exist_ok=True)
16 | 
17 | 
18 | def prepare_raws_from_urls(urls: Dict[str, str]):
19 |     sess = requests.Session()
20 |     raws: Dict[str, str] = {}
21 |     for site, url in urls.items():
22 |         try:
23 |             resp = sess.get(url, timeout=10, allow_redirects=True)
24 |         except Exception as e:
25 |             # warn instead of failing: one unreachable site should not kill the run
26 |             warnings.warn(f"Could not fetch {url}: {e}")
27 |             continue
28 | 
29 |         if resp.status_code == 200:
30 |             raws[url] = resp.text
31 |             if not os.path.exists(HTML_DIR / f"{site}.html"):
32 |                 with open(HTML_DIR / f"{site}.html", "w", encoding="utf-8") as f:
33 |                     f.write(resp.text)
34 |         else:
35 |             warnings.warn(f"Could not fetch {url}: status_code: {resp.status_code}")
36 | 
37 |     return raws
38 | 
39 | 
40 | class TestRegexs:
41 |     class TestRegexsOnline:
42 |         listFiles_urls = {
43 |             # site-date: url; `limit=` sets the expected number of matches
44 |             "archiveteam.org-20230701": "https://wiki.archiveteam.org/index.php?title=Special:ListFiles&sort=byname&limit=7",
45 |             "wiki.othing.xyz-20230701": "https://wiki.othing.xyz/index.php?title=Special:ListFiles&sort=byname",
46 |             "mediawiki.org-20230701": "https://www.mediawiki.org/w/index.php?title=Special:ListFiles&sort=byname&limit=7",
47 |             "asoiaf.fandom.com-20230701": "https://asoiaf.fandom.com/zh/wiki/Special:文件列表?sort=byname&limit=7",
48 |             # only for local testing:
49 |             # "commons.moegirl.org.cn-20230701": "https://commons.moegirl.org.cn/index.php?title=Special:ListFiles&sort=byname&limit=7",
50 |             # # login required:
51 |             # "group0.mediawiki.demo.save-web.org_mediawiki-1.23.17-20230701": "http://group0.mediawiki.demo.save-web.org/mediawiki-1.23.17/index.php?title=Special:文件列表&limit=1",
52 |             # "group1.mediawiki.demo.save-web.org_mediawiki-1.27.7-20230701": "http://group1.mediawiki.demo.save-web.org/mediawiki-1.27.7/index.php?title=Special:ListFiles&limit=2",
53 |         }
54 |         raws: Dict[str, str] = {}
55 | 
56 |         def test_online(self):
57 |             if not ONLINE:
58 |                 pytest.skip("Online test skipped")
59 |             self.raws = prepare_raws_from_urls(self.listFiles_urls)
60 |             assert len(self.raws) != 0, "Could not fetch any of the URLs"
61 |             for url, raw in self.raws.items():
62 |                 best_matched = 0
63 |                 regexp_best = None
64 | 
65 |                 for regexp in REGEX_CANDIDATES:
66 |                     _count = len(re.findall(regexp, raw))
67 |                     if _count > best_matched:
68 |                         best_matched = _count
69 |                         regexp_best = regexp
70 | 
71 |                 assert (
72 |                     regexp_best is not None
73 |                 ), f"Could not find a proper regexp to parse the HTML for {url} (online)"
74 | 
75 |                 if "limit=" in url:
76 |                     limit = int(url.split("limit=")[-1])
77 |                     assert (
78 |                         len(re.findall(regexp_best, raw)) == limit
79 |                     ), f"Could not find {limit} matches for {url} (online)"
80 | 
81 |     class TestRegexsOffline:
82 |         html_files = os.listdir(HTML_DIR)
83 |         raws: Dict[str, str] = {}
84 |         for html_file in html_files:
85 |             with open(HTML_DIR / html_file, encoding="utf-8") as f:
86 |                 raws[html_file] = f.read()
87 |         assert len(raws) != 0, f"Could not find any HTML files in {HTML_DIR}"
88 | 
89 |         def test_offline(self):
90 |             assert len(self.raws) != 0, "Could not read any cached HTML files"
91 |             for site, raw in self.raws.items():
92 |                 best_matched = 0
93 |                 regexp_best = None
94 | 
95 |                 for regexp in REGEX_CANDIDATES:
96 |                     _count = len(re.findall(regexp, raw))
97 |                     if _count > best_matched:
98 |                         best_matched = _count
99 |                         regexp_best = regexp
100 | 
101 |                 assert (
102 |                     regexp_best is not None
103 |                 ), f"Could not find a proper regexp to parse the HTML for {site} (local)"
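104 | 
105 | # Editor's note (commands are illustrative): run just this module with
106 | #   pytest wikiteam3/dumpgenerator/dump/image/html_regexs_test.py -s
107 | # and flip ONLINE to False above to skip the network-dependent class.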
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/dump/xmldump/xml_header.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 | import sys
4 | from typing import *
5 | 
6 | import requests
7 | 
8 | from wikiteam3.dumpgenerator.config import Config
9 | from wikiteam3.dumpgenerator.dump.page.xmlexport.page_xml import getXMLPage
10 | from wikiteam3.dumpgenerator.exceptions import ExportAbortedError, PageMissingError
11 | from wikiteam3.dumpgenerator.log import logerror
12 | 
13 | 
14 | def getXMLHeader(config: Config = None, session=None) -> Tuple[str, Config]:
15 |     """Retrieve a random page to extract XML headers (namespace info, etc)"""
16 |     print(config.api)
17 |     xml = ""
18 |     disableSpecialExport = config.xmlrevisions or config.xmlapiexport
19 |     randomtitle = "Main_Page"
20 |     if disableSpecialExport and config.api and config.api.endswith("api.php"):
21 |         try:
22 |             print("Getting the XML header from the API")
23 |             # Export and exportnowrap exist from MediaWiki 1.15, allpages from 1.8
24 |             r = session.get(
25 |                 f"{config.api}?action=query&export=1&exportnowrap=1&list=allpages&aplimit=1",
26 |                 timeout=10,
27 |             )
28 |             xml: str = r.text
29 |             # Otherwise try without exportnowrap, e.g. Wikia returns a blank page on 1.19
30 |             if not re.match(r"\s*<mediawiki", xml):
31 |                 r = session.get(
32 |                     f"{config.api}?action=query&export=1&list=allpages&aplimit=1&format=json",
33 |                     timeout=10,
34 |                 )
35 |                 try:
36 |                     xml = r.json()["query"]["export"]["*"]
37 |                 except KeyError:
38 |                     pass
39 |             # Do without a generator, use the usual trick of a random page title
40 |             if not re.match(r"\s*<mediawiki", xml):
41 |                 r = session.get(
42 |                     f"{config.api}?action=query&export=1&exportnowrap=1&titles={randomtitle}",
43 |                     timeout=10,
44 |                 )
45 |                 xml = r.text
46 |             # Again try without exportnowrap
47 |             if not re.match(r"\s*<mediawiki", xml):
48 |                 r = session.get(
49 |                     f"{config.api}?action=query&export=1&format=json&titles={randomtitle}",
50 |                     timeout=10,
51 |                 )
52 |                 try:
53 |                     xml = r.json()["query"]["export"]["*"]
54 |                 except KeyError:
55 |                     pass
56 |         except requests.exceptions.RetryError:
57 |             pass
58 | 
59 |     else:
60 |         try:
61 |             xml = "".join(
62 |                 list(
63 |                     getXMLPage(
64 |                         config=config, title=randomtitle, verbose=False, session=session
65 |                     )
66 |                 )
67 |             )
68 |         # Issue 26: Account for missing "Special" namespace.
69 |         # Hope the canonical special name has not been removed.
70 |         # http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases
71 |         except PageMissingError as pme:
72 |             # The <page> does not exist. Not a problem, if we get the <siteinfo>.
73 |             xml = pme.xml
74 |         except ExportAbortedError:
75 |             try:
76 |                 if config.api:
77 |                     print("Trying the local name for the Special namespace instead")
78 |                     r = session.get(
79 |                         url=config.api,
80 |                         params={
81 |                             "action": "query",
82 |                             "meta": "siteinfo",
83 |                             "siprop": "namespaces",
84 |                             "format": "json",
85 |                         },
86 |                         timeout=120,
87 |                     )
88 |                     config.export = (
89 |                         json.loads(r.text)["query"]["namespaces"]["-1"]["*"] + ":Export"
90 |                     )
91 |                     xml = "".join(
92 |                         list(
93 |                             getXMLPage(
94 |                                 config=config,
95 |                                 title=randomtitle,
96 |                                 verbose=False,
97 |                                 session=session,
98 |                             )
99 |                         )
100 |                     )
101 |             except PageMissingError as pme:
102 |                 xml = pme.xml
103 |             except ExportAbortedError:
104 |                 pass
105 | 
106 |     header = xml.split("</mediawiki>")[0]
107 |     if not re.match(r"\s*<mediawiki", xml):
108 |         if config.xmlrevisions:
109 |             # Try again the old way
110 |             print(
111 |                 "Export test via the API failed. Wiki too old? Trying without xmlrevisions."
112 |             )
113 |             config.xmlrevisions = False
114 |             header, config = getXMLHeader(config=config, session=session)
115 |         else:
116 |             print(xml)
117 |             print("XML export on this wiki is broken, quitting.")
118 |             logerror(config=config, to_stdout=True, text="XML export on this wiki is broken, quitting.")
119 |             sys.exit()
120 |     return header, config
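121 | 
122 | # Editor's note (illustration, not in the original file): on success `header`
123 | # holds everything before </mediawiki>, i.e. the opening <mediawiki ...> tag
124 | # plus the <siteinfo>...</siteinfo> block, ready to be prepended to the
125 | # per-page <page> elements of the combined dump.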
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/api/api.py:
--------------------------------------------------------------------------------
1 | import re
2 | from urllib.parse import urljoin, urlparse
3 | 
4 | import mwclient
5 | import requests
6 | 
7 | from wikiteam3.dumpgenerator.api.get_json import getJSON
8 | from wikiteam3.utils import getUserAgent
9 | 
10 | 
11 | def checkAPI(api="", session: requests.Session = None):
12 |     """Checking API availability"""
13 |     # handle redirects, giving up after a few hops
14 |     for i in range(5):
15 |         print("Checking API...", api)
16 |         r = session.get(
17 |             url=api,
18 |             params={"action": "query", "meta": "siteinfo", "format": "json"},
19 |             timeout=30,
20 |         )
21 |         if i >= 4:
22 |             break
23 |         if r.status_code == 200:
24 |             break
25 |         elif r.status_code < 400:
26 |             api = r.url
27 |         elif r.status_code > 400:
28 |             print(
29 |                 "MediaWiki API URL not found or giving error: HTTP %d" % r.status_code
30 |             )
31 |             return None
32 |     if "MediaWiki API is not enabled for this site." in r.text:
33 |         return None
34 |     try:
35 |         result = getJSON(r)
36 |         index = None
37 |         if result:
38 |             try:
39 |                 index = (
40 |                     result["query"]["general"]["server"]
41 |                     + result["query"]["general"]["script"]
42 |                 )
43 |                 return (True, index, api)
44 |             except KeyError:
45 |                 print("MediaWiki API seems to work but returned no index URL")
46 |                 return (True, None, api)
47 |     except ValueError:
48 |         print(repr(r.text))
49 |         print("MediaWiki API returned data we could not parse")
50 |         return None
51 |     return None
52 | 
53 | 
54 | def mwGetAPIAndIndex(url="", session: requests.Session = None):
55 |     """Returns the MediaWiki API and Index.php"""
56 | 
57 |     api = ""
58 |     index = ""
59 |     if not session:
60 |         session = requests.Session()  # Create a new session
61 |         session.headers.update({"User-Agent": getUserAgent()})
62 |     r = session.post(url=url, timeout=120)
63 |     result = r.text
64 | 
65 |     # Search for the API endpoint advertised in the page head (EditURI link)
66 |     if m := re.findall(
67 |         r'(?im)<\s*link\s*rel="EditURI"\s*type="application/rsd\+xml"\s*href="([^>]+?)\?action=rsd"\s*/\s*>',
68 |         result,
69 |     ):
70 |         api = m[0]
71 |         if api.startswith("//"):  # gentoo wiki
72 |             api = url.split("//")[0] + api
73 | 
74 |     # Search for index.php via the view-source / history tab links
75 |     if m := re.findall(
76 |         r'<li id="ca-viewsource"[^>]*?>\s*(?:<span>)?\s*<a href="([^\?]+?)\?',
77 |         result,
78 |     ):
79 |         index = m[0]
80 |     elif m := re.findall(
81 |         r'<li id="ca-history"[^>]*?>\s*(?:<span>)?\s*<a href="([^\?]+?)\?',
82 |         result,
83 |     ):
84 |         index = m[0]
85 | 
86 |     if index:
87 |         if index.startswith("/"):
88 |             # a relative link: resolve it against the API's directory
89 |             index = "/".join(api.split("/")[:-1]) + "/" + index.split("/")[-1]
90 |     else:
91 |         if api:
92 |             # no tab link found: guess index.php (or index.php5) next to api.php
93 |             if len(re.findall(r"/index\.php5\?", result)) > len(
94 |                 re.findall(r"/index\.php\?", result)
95 |             ):
96 |                 index = "/".join(api.split("/")[:-1]) + "/index.php5"
97 |             else:
98 |                 index = "/".join(api.split("/")[:-1]) + "/index.php"
99 | 
100 |     if not api and index:
101 |         api = urljoin(index, "api.php")
102 | 
103 |     return api, index
104 | 
105 | 
106 | def checkRetryAPI(api="", apiclient=False, session: requests.Session = None):
107 |     """Call checkAPI and mwclient if necessary"""
108 |     check = None
109 |     try:
110 |         check = checkAPI(api, session=session)
111 |     except requests.exceptions.ConnectionError as e:
112 |         print(f"Connection error: {str(e)}")
113 | 
114 |     if check and apiclient:
115 |         apiurl = urlparse(api)
116 |         try:
117 |             # constructing the Site is itself the connectivity test
118 |             site = mwclient.Site(
119 |                 apiurl.netloc,
120 |                 apiurl.path.replace("api.php", ""),
121 |                 scheme=apiurl.scheme,
122 |                 pool=session,
123 |             )
124 |         except KeyError:
125 |             # Probably KeyError: 'query'
126 |             if apiurl.scheme == "https":
127 |                 newscheme = "http"
128 |                 api = api.replace("https://", "http://")
129 |             else:
130 |                 newscheme = "https"
131 |                 api = api.replace("http://", "https://")
132 |             print(
133 |                 f"WARNING: The provided API URL did not work with mwclient. Switched protocol to: {newscheme}"
134 |             )
135 | 
136 |             try:
137 |                 site = mwclient.Site(
138 |                     apiurl.netloc,
139 |                     apiurl.path.replace("api.php", ""),
140 |                     scheme=newscheme,
141 |                     pool=session,
142 |                 )
143 |             except KeyError:
144 |                 check = False
145 | 
146 |     return check, api
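147 | 
148 | 
149 | if __name__ == "__main__":
150 |     # Editor's sketch (not in the original file): probe a hypothetical wiki.
151 |     sess = requests.Session()
152 |     sess.headers.update({"User-Agent": getUserAgent()})
153 |     print(mwGetAPIAndIndex("https://wiki.example.org", session=sess))
154 |     # Expected shape (values are illustrative):
155 |     # ('https://wiki.example.org/api.php', 'https://wiki.example.org/index.php')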
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions_page.py:
--------------------------------------------------------------------------------
1 | from lxml import etree
2 | from lxml.builder import E
3 | 
4 | from wikiteam3.dumpgenerator.exceptions import PageMissingError
5 | 
6 | 
7 | def makeXmlPageFromRaw(xml, arvcontinue) -> str:
8 |     """Discard the metadata around a <page> element in a <mediawiki> string"""
9 |     root = etree.XML(xml)
10 |     find = etree.XPath("//*[local-name() = 'page']")
11 |     page = find(root)[0]
12 |     if arvcontinue is not None:
13 |         page.attrib["arvcontinue"] = arvcontinue
14 |     # The tag will inherit the namespace, like:
15 |     # <page xmlns="http://www.mediawiki.org/xml/export-0.10/">
16 |     # FIXME: pretty_print doesn't seem to work, only adds a newline
17 |     return etree.tostring(page, pretty_print=True, encoding="unicode")
18 | 
19 | 
20 | def makeXmlFromPage(page: dict, arvcontinue) -> str:
21 |     """Output an XML document as a string from a page as in the API JSON"""
22 |     try:
23 |         p = E.page(
24 |             E.title(str(page["title"])),
25 |             E.ns(str(page["ns"])),
26 |             E.id(str(page["pageid"])),
27 |         )
28 |         if arvcontinue is not None:
29 |             p.attrib["arvcontinue"] = arvcontinue
30 |         for rev in page["revisions"]:
31 |             # Older releases like MediaWiki 1.16 do not return all fields.
32 |             userid = rev["userid"] if "userid" in rev else 0
33 |             size = rev["size"] if "size" in rev else 0
34 |             # Build the <revision> children first; they are reordered below
35 |             revision = [
36 |                 E.id(str(rev["revid"])),
37 |                 E.timestamp(rev["timestamp"]),
38 |             ]
39 | 
40 |             # The text, user, comment, sha1 may be deleted/suppressed
41 |             if ("texthidden" in rev) or ("textmissing" in rev):
42 |                 print(
43 |                     "Warning: text missing/hidden in pageid %d revid %d"
44 |                     % (page["pageid"], rev["revid"])
45 |                 )
46 |                 revision.append(
47 |                     E.text(
48 |                         **{
49 |                             "bytes": str(size),
50 |                             "deleted": "deleted",
51 |                         }
52 |                     )
53 |                 )
54 |             else:
55 |                 text = str(rev["*"])
56 |                 revision.append(
57 |                     E.text(
58 |                         text,
59 |                         **{
60 |                             "bytes": str(size),
61 |                             "{http://www.w3.org/XML/1998/namespace}space": "preserve",
62 |                         }
63 |                     )
64 |                 )
65 | 
66 |             if "user" not in rev:
67 |                 if "userhidden" not in rev:
68 |                     print(
69 |                         "Warning: user not hidden but missing user in pageid %d revid %d"
70 |                         % (page["pageid"], rev["revid"])
71 |                     )
72 |                 revision.append(E.contributor(deleted="deleted"))
73 |             else:
74 |                 revision.append(
75 |                     E.contributor(
76 |                         E.username(str(rev["user"])),
77 |                         E.id(str(userid)),
78 |                     )
79 |                 )
80 | 
81 |             if "sha1" in rev:
82 |                 revision.append(E.sha1(rev["sha1"]))
83 |             elif "sha1hidden" in rev:
84 |                 revision.append(E.sha1())  # stub
85 | 
86 |             if "commenthidden" in rev:
87 |                 revision.append(E.comment(deleted="deleted"))
88 |             elif "comment" in rev and rev["comment"]:
89 |                 revision.append(E.comment(str(rev["comment"])))
90 | 
91 |             if "contentmodel" in rev:
92 |                 revision.append(E.model(rev["contentmodel"]))
93 |             if "contentformat" in rev:
94 |                 revision.append(E.format(rev["contentformat"]))
95 |             # Sometimes a missing parentid is not replaced with a 0 as it should.
96 |             if "parentid" in rev:
97 |                 revision.append(E.parentid(str(rev["parentid"])))
98 | 
99 |             if "minor" in rev:
100 |                 revision.append(E.minor())
101 | 
102 |             # mwcli's dump.xml order
103 |             revisionTags = [
104 |                 "id",
105 |                 "parentid",
106 |                 "timestamp",
107 |                 "contributor",
108 |                 "minor",
109 |                 "comment",
110 |                 "origin",
111 |                 "model",
112 |                 "format",
113 |                 "text",
114 |                 "sha1",
115 |             ]
116 |             revisionElementsDict = {elem.tag: elem for elem in revision}
117 |             _revision = E.revision()
118 |             for tag in revisionTags:
119 |                 if tag in revisionElementsDict:
120 |                     _revision.append(revisionElementsDict.pop(tag))
121 |             for elem in revisionElementsDict.values():
122 |                 _revision.append(elem)
123 |             p.append(_revision)
124 |     except KeyError as e:
125 |         print(e)
126 |         raise PageMissingError(page["title"], e)
127 |     return etree.tostring(p, pretty_print=True, encoding="unicode")
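128 | 
129 | 
130 | if __name__ == "__main__":
131 |     # Editor's sketch (not in the original file): feed a minimal, hypothetical
132 |     # API-style page dict through makeXmlFromPage to see the element layout.
133 |     demo_page = {
134 |         "title": "Main Page",
135 |         "ns": 0,
136 |         "pageid": 1,
137 |         "revisions": [
138 |             {
139 |                 "revid": 10,
140 |                 "parentid": 0,
141 |                 "timestamp": "2023-07-01T00:00:00Z",
142 |                 "user": "Admin",
143 |                 "userid": 1,
144 |                 "comment": "init",
145 |                 "size": 5,
146 |                 "*": "Hello",
147 |             }
148 |         ],
149 |     }
150 |     print(makeXmlFromPage(demo_page, arvcontinue=None))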
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/api/wiki_check.py:
--------------------------------------------------------------------------------
1 | import re
2 | 
3 | import requests
4 | 
5 | from wikiteam3.utils import getUserAgent
6 | 
7 | 
8 | def getWikiEngine(url="", session: requests.Session = None) -> str:
9 |     """Returns the wiki engine of a URL, if known"""
10 | 
11 |     if not session:
12 |         session = requests.Session()  # Create a new session
13 |         session.headers.update({"User-Agent": getUserAgent()})
14 |     r = session.post(url=url, timeout=30)
15 |     if r.status_code == 405 or not r.text:
16 |         r = session.get(url=url, timeout=120)
17 |     result = r.text
18 | 
19 |     wikiengine = "Unknown"
20 |     if re.search(
21 |         '(?im)(MoinMoin Powered|