├── wikiteam3
│   ├── __init__.py
│   ├── utils
│   │   ├── xmlutil.py
│   │   ├── __init__.py
│   │   ├── uprint.py
│   │   ├── user_agent.py
│   │   ├── domain.py
│   │   ├── wiki_avoid.py
│   │   ├── monkey_patch.py
│   │   ├── login
│   │   │   ├── __init__.py
│   │   │   ├── index.py
│   │   │   └── api.py
│   │   └── util.py
│   ├── dumpgenerator
│   │   ├── test
│   │   │   ├── __init__.py
│   │   │   ├── test_config.py
│   │   │   └── data
│   │   │       └── html_regexs
│   │   │           ├── group0.mediawiki.demo.save-web.org_mediawiki-1.16.5-20230701.html
│   │   │           ├── group0.mediawiki.demo.save-web.org_mediawiki-1.23.17-20230701.html
│   │   │           └── group1.mediawiki.demo.save-web.org_mediawiki-1.27.7-20230701.html
│   │   ├── dump
│   │   │   ├── image
│   │   │   │   ├── __init__.py
│   │   │   │   ├── html_regexs.py
│   │   │   │   └── html_regexs_test.py
│   │   │   ├── misc
│   │   │   │   ├── __init__.py
│   │   │   │   ├── site_info_test.py
│   │   │   │   ├── index_php.py
│   │   │   │   ├── special_version.py
│   │   │   │   ├── special_logs.py
│   │   │   │   └── site_info.py
│   │   │   ├── page
│   │   │   │   ├── __init__.py
│   │   │   │   ├── xmlrev
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── xml_revisions_page.py
│   │   │   │   └── xmlexport
│   │   │   │       ├── __init__.py
│   │   │   │       ├── page_xml.py
│   │   │   │       ├── page_xml_export.py
│   │   │   │       └── page_xml_api.py
│   │   │   ├── xmldump
│   │   │   │   ├── __init__.py
│   │   │   │   ├── xml_integrity.py
│   │   │   │   ├── xml_truncate.py
│   │   │   │   ├── xml_header.py
│   │   │   │   └── xml_dump.py
│   │   │   ├── __init__.py
│   │   │   └── generator.py
│   │   ├── log
│   │   │   ├── __init__.py
│   │   │   └── log_error.py
│   │   ├── cli
│   │   │   ├── __init__.py
│   │   │   ├── delay.py
│   │   │   └── greeter.py
│   │   ├── __main__.py
│   │   ├── version.py
│   │   ├── api
│   │   │   ├── __init__.py
│   │   │   ├── get_json.py
│   │   │   ├── handle_status_code.py
│   │   │   ├── index_check.py
│   │   │   ├── namespaces.py
│   │   │   ├── api.py
│   │   │   ├── wiki_check.py
│   │   │   └── page_titles.py
│   │   ├── exceptions.py
│   │   ├── __init__.py
│   │   └── config.py
│   └── launcher.py
├── .gitattributes
├── .travis.yml
├── .gitignore
├── .pymarkdown.json
├── .markdownlint.jsonc
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── config.yml
│   │   └── bug_report.md
│   └── workflows
│       └── test-dumpgenerator.yml
├── .pre-commit-config.yaml
├── PUBLISHING.md
├── pyproject.toml
├── README.md
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── USAGE.md
└── INSTALLATION.md
/wikiteam3/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/wikiteam3/utils/xmlutil.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/dump/image/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/dump/misc/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/dump/page/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/dump/xmldump/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/dump/page/xmlrev/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/dump/page/xmlexport/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/log/__init__.py:
--------------------------------------------------------------------------------
1 | from .log_error import logerror
2 |
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/dump/__init__.py:
--------------------------------------------------------------------------------
1 | from .generator import DumpGenerator
2 |
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/cli/__init__.py:
--------------------------------------------------------------------------------
1 | from .cli import getParameters
2 | from .delay import Delay
3 | from .greeter import bye, welcome
4 |
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/__main__.py:
--------------------------------------------------------------------------------
1 | if __name__ == "__main__":
2 | import sys
3 |
4 | from .__init__ import main
5 |
6 | sys.exit(main())
7 |
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/version.py:
--------------------------------------------------------------------------------
1 | __VERSION__ = "0.4.0-alpha" # major, minor, micro: semver.org
2 |
3 |
4 | def getVersion():
5 | return __VERSION__
6 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.com linguist-vendored
2 | *.org linguist-vendored
3 |
4 | *.py text=auto
5 | *.sh text=auto
6 | *.json text=auto
7 | *.txt text=auto
8 | *.md text=auto
9 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python: 3.8
3 | install:
4 | - pip install poetry
5 | - poetry install
6 | script:
7 | - poetry run pytest --verbose -s
8 | notifications:
9 | email: false
10 |
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/api/__init__.py:
--------------------------------------------------------------------------------
1 | from .api import checkAPI, checkRetryAPI, mwGetAPIAndIndex
2 | from .get_json import getJSON
3 | from .handle_status_code import handleStatusCode
4 | from .wiki_check import getWikiEngine
5 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | .pytest_cache
3 | keys.txt
4 | batchdownload/keys.txt
5 | batchdownload/dumpgenerator.py
6 | batchdownload/uploader.py
7 | __pycache__
8 | tests/tmp
9 | dist/
10 | .DS_Store
11 | desktop.ini
12 |
13 | .venv
14 | .vscode
15 | .idea
16 |
--------------------------------------------------------------------------------
/.pymarkdown.json:
--------------------------------------------------------------------------------
1 | {
2 | "plugins": {
3 | "line-length": {
4 | "enabled": false
5 | },
6 | "no-inline-html": {
7 | "allowed_elements": "details,summary,code,!--"
8 | },
9 | "first-line-heading": {
10 | "enabled": false,
11 | "front_matter_title" : "name"
12 | }
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/dump/xmldump/xml_integrity.py:
--------------------------------------------------------------------------------
1 | from typing import *
2 |
3 | from wikiteam3.dumpgenerator.config import Config
4 |
5 |
6 | def checkXMLIntegrity(
7 | config: Config = None, titles: Iterable[str] = None, session=None
8 | ):
9 | """Check XML dump integrity, to detect broken XML chunks"""
10 | # TODO: Fix XML Integrity Check
11 | return
12 |
--------------------------------------------------------------------------------
/wikiteam3/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .domain import domain2prefix
2 | from .login import botLogin, clientLogin, fetchLoginToken, indexLogin, uniLogin
3 | from .monkey_patch import mod_requests_text
4 | from .uprint import uprint
5 | from .user_agent import getUserAgent
6 | from .util import cleanHTML, cleanXML, removeIP, sha1File, undoHTMLEntities
7 | from .wiki_avoid import avoidWikimediaProjects
8 |
--------------------------------------------------------------------------------
/wikiteam3/utils/uprint.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
4 | def uprint(*objects, sep=" ", end="\n", file=sys.stdout):
5 | enc = file.encoding
6 | if enc == "UTF-8":
7 | print(*objects, sep=sep, end=end, file=file)
8 | else:
9 | f = lambda obj: str(obj).encode(enc, errors="backslashreplace").decode(enc)
10 | print(*map(f, objects), sep=sep, end=end, file=file)
11 |
--------------------------------------------------------------------------------
/.markdownlint.jsonc:
--------------------------------------------------------------------------------
1 | // If you change any options here,
2 | // please change them in .pymarkdown.jsonc
3 | // as well!
4 | {
5 | "line-length": false,
6 | "no-inline-html": {
7 | "allowed_elements": [
8 | "details",
9 | "summary",
10 | "code"
11 | ]
12 | },
13 | "first-line-heading": {
14 | "front_matter_title" : "name"
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/api/get_json.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 |
4 | def getJSON(request: requests.Response):
5 | """Strip Unicode BOM"""
6 | if request.text.startswith("\ufeff"):
7 | request.encoding = "utf-8-sig"
8 | # request.encoding = request.apparent_encoding
9 | try:
10 | return request.json()
 11 |     except ValueError:
12 | # Maybe an older API version which did not return correct JSON
13 | return {}
14 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: false
2 | contact_links:
3 | - name: Get help using MediaWiki Dump Generator
4 | url: https://github.com/orgs/mediawiki-client-tools/discussions/categories/q-a
5 | about: If you need help (other than reporting a bug), you can reach out on our Discussions Q&A.
6 | - name: Anything else
7 | url: https://github.com/orgs/mediawiki-client-tools/discussions
8 | about: You can read and post in our GitHub Discussions.
9 |
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/log/log_error.py:
--------------------------------------------------------------------------------
1 | import datetime
2 |
3 | from wikiteam3.dumpgenerator.config import Config
4 |
5 |
6 | def logerror(config: Config = None, to_stdout=False, text="") -> None:
7 | """Log error in errors.log"""
8 | if text:
9 | with open(f"{config.path}/errors.log", "a", encoding="utf-8") as outfile:
10 | output = (
11 | f'{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}: {text}\n'
12 | )
13 | outfile.write(output)
14 | if to_stdout:
15 | print(text)
16 |
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/dump/misc/site_info_test.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | import pytest
4 | import requests
5 |
6 | from wikiteam3.dumpgenerator.test.test_config import get_config
7 |
8 | from .site_info import saveSiteInfo
9 |
10 |
11 | def test_mediawiki_version_match():
12 | with get_config("1.39.7") as config:
13 | sess = requests.Session()
14 | saveSiteInfo(config, sess)
15 | with open(f"{config.path}/siteinfo.json") as f:
16 | siteInfoJson = json.load(f)
17 | assert siteInfoJson["query"]["general"]["generator"] == "MediaWiki 1.39.7"
18 |
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml.py:
--------------------------------------------------------------------------------
1 | from wikiteam3.dumpgenerator.config import Config
2 |
3 | from .page_xml_api import getXMLPageWithApi
4 | from .page_xml_export import getXMLPageWithExport
5 |
6 |
7 | def getXMLPage(config: Config = None, title="", verbose=True, session=None):
8 | if config.xmlapiexport:
9 | return getXMLPageWithApi(
10 | config=config, title=title, verbose=verbose, session=session
11 | )
12 | else:
13 | return getXMLPageWithExport(
14 | config=config, title=title, verbose=verbose, session=session
15 | )
16 |
--------------------------------------------------------------------------------
/wikiteam3/utils/user_agent.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 | # Return a cool user-agent to hide Python user-agent
4 |
5 |
6 | def getUserAgent():
7 | useragent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36"
8 | return useragent
9 |
10 |
11 | def setupUserAgent(session: requests.Session):
12 | session._orirequest = session.request
13 |
14 | def newrequest(*args, **kwargs):
15 | session.headers.update({"User-Agent": getUserAgent()})
16 | return session._orirequest(*args, **kwargs)
17 |
18 | session.request = newrequest
19 |
--------------------------------------------------------------------------------
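A minimal sketch of how `setupUserAgent` is meant to be wired up: it wraps `session.request` so every call made through the session carries the spoofed User-Agent from `getUserAgent`. The wiki URL is a placeholder.

```python
import requests

from wikiteam3.utils.user_agent import setupUserAgent

session = requests.Session()
setupUserAgent(session)

# Every request on this session now sends the Chrome User-Agent string,
# even if other code later replaces the session headers.
response = session.get("https://wiki.example.org/api.php")  # placeholder URL
print(response.request.headers["User-Agent"])
```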
/wikiteam3/utils/domain.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from wikiteam3.dumpgenerator.config import Config
4 |
5 |
6 | def domain2prefix(config: Config = None, session=None):
7 | """Convert domain name to a valid prefix filename."""
8 |
9 | # At this point, both api and index are supposed to be defined
10 | domain = ""
11 | if config.api:
12 | domain = config.api
13 | elif config.index:
14 | domain = config.index
15 |
16 | domain = domain.lower()
17 | domain = re.sub(r"(https?://|www\.|/index\.php.*|/api\.php.*)", "", domain)
18 | domain = domain.rstrip("/")
19 | domain = re.sub(r"/", "_", domain)
20 | domain = re.sub(r"\.", "", domain)
21 | domain = re.sub(r"[^A-Za-z0-9]", "_", domain)
22 |
23 | return domain
24 |
--------------------------------------------------------------------------------
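Worked examples of the `domain2prefix` transformation above, useful for predicting the filename prefix a dump will get. Constructing `Config` with only `api` or `index` set is an assumption based on the function body; the URLs are placeholders.

```python
from wikiteam3.dumpgenerator.config import Config
from wikiteam3.utils.domain import domain2prefix

# The scheme and script name are stripped, dots are dropped, and the
# remaining slashes become underscores.
print(domain2prefix(Config(api="https://wiki.example.org/w/api.php")))
# -> wikiexampleorg_w

print(domain2prefix(Config(index="http://example.org/wiki/index.php?title=Foo")))
# -> exampleorg_wiki
```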
/wikiteam3/dumpgenerator/dump/misc/index_php.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from wikiteam3.dumpgenerator.cli import Delay
4 | from wikiteam3.dumpgenerator.config import Config
5 | from wikiteam3.utils import removeIP
6 |
7 |
8 | def saveIndexPHP(config: Config = None, session=None):
 9 |     """Save index.php as .html, to preserve license details available at the bottom of the page"""
10 |
11 | if os.path.exists(f"{config.path}/index.html"):
12 | print("index.html exists, do not overwrite")
13 | else:
14 | print("Downloading index.php (Main Page) as index.html")
15 | r = session.post(url=config.index, params=None, timeout=10)
16 | raw = str(r.text)
17 | Delay(config=config, session=session)
18 | raw = removeIP(raw=raw)
19 | with open(f"{config.path}/index.html", "w", encoding="utf-8") as outfile:
20 | outfile.write(raw)
21 |
--------------------------------------------------------------------------------
/wikiteam3/utils/wiki_avoid.py:
--------------------------------------------------------------------------------
1 | import re
2 | import sys
3 | from typing import *
4 |
5 | from wikiteam3.dumpgenerator.config import Config
6 |
7 |
8 | def avoidWikimediaProjects(config: Config = None, other: Dict = None):
9 | """Skip Wikimedia projects and redirect to the dumps website"""
10 |
11 | # notice about wikipedia dumps
12 | url = ""
13 | if config.api:
14 | url += config.api
15 | if config.index:
16 | url = url + config.index
17 | if re.findall(
18 | r"(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org",
19 | url,
20 | ):
21 | print("PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!")
22 | print("Download the dumps from http://dumps.wikimedia.org")
23 | if not other["force"]:
24 | print("Thanks!")
25 | sys.exit()
26 |
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/exceptions.py:
--------------------------------------------------------------------------------
1 | class PageMissingError(Exception):
2 | def __init__(self, title, xml):
3 | self.title = title
4 | self.xml = xml
5 |
6 | def __str__(self):
7 | return f"page '{self.title}' not found"
8 |
9 |
10 | class ExportAbortedError(Exception):
11 | def __init__(self, index):
12 | self.index = index
13 |
14 | def __str__(self):
15 | return f"Export from '{self.index}' did not return anything."
16 |
17 |
18 | class FileSizeError(Exception):
19 | def __init__(self, file, size):
20 | self.file = file
21 | self.size = size
22 |
23 | def __str__(self):
 24 |         return f"File '{self.file}' size does not match '{self.size}'."
25 |
26 |
27 | class FileSha1Error(Exception):
28 | def __init__(self, file, sha1):
29 | self.file = file
30 | self.sha1 = sha1
31 |
32 | def __str__(self):
 33 |         return f"File '{self.file}' sha1 does not match '{self.sha1}'."
34 |
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/dump/misc/special_version.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from wikiteam3.dumpgenerator.cli import Delay
4 | from wikiteam3.dumpgenerator.config import Config
5 | from wikiteam3.utils import removeIP
6 |
7 |
8 | def saveSpecialVersion(config: Config = None, session=None):
9 | """Save Special:Version as .html, to preserve extensions details"""
10 |
11 | if os.path.exists(f"{config.path}/SpecialVersion.html"):
12 | print("SpecialVersion.html exists, do not overwrite")
13 | else:
14 | print("Downloading Special:Version with extensions and other related info")
15 | r = session.post(
16 | url=config.index, params={"title": "Special:Version"}, timeout=10
17 | )
18 | raw = str(r.text)
19 | Delay(config=config, session=session)
20 | raw = str(removeIP(raw=raw))
21 | with open(
22 | f"{config.path}/SpecialVersion.html", "w", encoding="utf-8"
23 | ) as outfile:
24 | outfile.write(raw)
25 |
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # DumpGenerator A generator of dumps for wikis
4 | # Copyright (C) 2011-2018 WikiTeam developers
5 | # This program is free software: you can redistribute it and/or modify
6 | # it under the terms of the GNU General Public License as published by
7 | # the Free Software Foundation, either version 3 of the License, or
8 | # (at your option) any later version.
9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
 16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 |
18 | # To learn more, read the documentation:
19 | # https://github.com/WikiTeam/wikiteam/wiki
20 |
21 |
22 | from wikiteam3.dumpgenerator.dump import DumpGenerator
23 |
24 |
25 | def main():
26 | DumpGenerator()
27 |
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/dump/misc/special_logs.py:
--------------------------------------------------------------------------------
1 | from wikiteam3.dumpgenerator.cli import Delay
2 | from wikiteam3.dumpgenerator.config import Config
3 |
4 |
5 | def saveLogs(config: Config = None, session=None):
6 | """Save Special:Log"""
7 | # get all logs from Special:Log
8 | """parse
9 |
22 | """
23 | Delay(config=config, session=session)
24 |
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/test/test_config.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import tempfile
3 | from contextlib import contextmanager
4 |
5 | from wikiteam3.dumpgenerator.cli import getParameters
6 | from wikiteam3.dumpgenerator.config import newConfig
7 |
8 | CONFIG_CACHE = {}
9 |
10 |
 11 | @contextmanager
 12 | def _new_config_from_parameter(params):
 13 |     _params = tuple(params)
 14 |     if _params in CONFIG_CACHE:
 15 |         config = CONFIG_CACHE[_params]
 16 |     else:
 17 |         config, _ = getParameters(["--path=.", "--xml"] + list(params))
 18 |         CONFIG_CACHE[_params] = config
 19 |     _config = newConfig(copy.deepcopy(config.asdict()))
 20 |     # give each use its own temporary directory, so tests never share state
 21 |     with tempfile.TemporaryDirectory(prefix="wikiteam3test_") as tmpdir:
 22 |         _config.path = tmpdir
 23 |         yield _config
 24 |
25 |
26 |
27 | def get_config(mediawiki_ver, api=True):
 28 |     assert api is True
29 | if mediawiki_ver == "1.39.7":
30 | return _new_config_from_parameter(
31 | [
32 | "--api",
33 | "https://testw.fandom.com/api.php",
34 | ]
35 | )
36 |
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/api/handle_status_code.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 |
4 | def handleStatusCode(response):
5 | statuscode = response.status_code
6 | if statuscode >= 200 and statuscode < 300:
7 | return
8 |
9 | print("HTTP Error %d." % statuscode)
10 | if statuscode >= 300 and statuscode < 400:
11 | print("Redirect should happen automatically: please report this as a bug.")
12 | print(response.url)
13 |
14 | elif statuscode == 400:
15 | print("Bad Request: The wiki may be malfunctioning.")
16 | print("Please try again later.")
17 | print(response.url)
18 | sys.exit(1)
19 |
20 | elif statuscode in [401, 403]:
21 | print("Authentication required.")
22 | print("Please use --user and --pass.")
23 | print(response.url)
24 |
25 | elif statuscode == 404:
26 | print("Not found. Is Special:Export enabled for this wiki?")
27 | print(response.url)
28 | sys.exit(1)
29 |
30 | elif statuscode == 429 or (statuscode >= 500 and statuscode < 600):
31 | print("Server error, max retries exceeded.")
32 | print("Please resume the dump later.")
33 | print(response.url)
34 | sys.exit(1)
35 |
--------------------------------------------------------------------------------
/wikiteam3/utils/monkey_patch.py:
--------------------------------------------------------------------------------
1 | import requests
2 |
3 | from wikiteam3.dumpgenerator.cli.delay import Delay
4 |
5 |
6 | def mod_requests_text(requests: requests):
7 | """Monkey patch `requests.Response.text` to remove BOM"""
8 |
9 | def new_text(self):
10 | return self.content.lstrip(b"\xef\xbb\xbf").decode(self.encoding)
11 |
12 | requests.Response.text = property(new_text)
13 |
14 |
15 | class DelaySession:
16 | """Monkey patch `requests.Session.send` to add delay"""
17 |
18 | def __init__(self, session, msg=None, delay=None, config=None):
19 | self.session = session
20 | self.msg = msg
21 | self.delay = delay
22 | self.old_send = None
23 | self.config = config
24 |
25 | def hijack(self):
26 | """Don't forget to call `release()`"""
27 |
28 | def new_send(request, **kwargs):
29 | Delay(msg=self.msg, delay=self.delay, config=self.config)
30 | return self.old_send(request, **kwargs)
31 |
32 | self.old_send = self.session.send
33 | self.session.send = new_send
34 |
35 | def release(self):
36 | """Undo monkey patch"""
37 | self.session.send = self.old_send
38 | del self
39 |
--------------------------------------------------------------------------------
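A sketch of the `DelaySession` hijack/release lifecycle, assuming a `Config` where only `delay` matters here; the URL is a placeholder.

```python
import requests

from wikiteam3.dumpgenerator.config import Config
from wikiteam3.utils.monkey_patch import DelaySession

session = requests.Session()
config = Config(delay=1.5)

delay_session = DelaySession(
    session=session, msg="listing images", delay=1.5, config=config
)
delay_session.hijack()
try:
    # Every send() on this session is now preceded by a Delay(...).
    session.get("https://wiki.example.org/api.php")  # placeholder URL
finally:
    delay_session.release()  # restore the original session.send
```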
/wikiteam3/dumpgenerator/api/index_check.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import requests
4 |
5 |
6 | def checkIndex(index="", cookies="", session: requests.Session = None):
7 | """Checking index.php availability"""
8 | r = session.post(url=index, data={"title": "Special:Version"}, timeout=30)
9 | if r.status_code >= 400:
10 | print(f"ERROR: The wiki returned status code HTTP {r.status_code}")
11 | return False
12 | raw = r.text
13 | print("Checking index.php...", index)
14 | # Workaround for issue 71
15 | if (
16 | re.search(
17 | '(Special:Badtitle|class="permissions-errors"|"wgCanonicalSpecialPageName":"Badtitle"|Login Required)',
18 | raw,
19 | )
20 | and not cookies
21 | ):
22 | print("ERROR: This wiki requires login and we are not authenticated")
23 | return False
24 | if re.search(
25 | '(page-Index_php|"wgPageName":"Index.php"|"firstHeading">Index.php)',
26 | raw,
27 | ):
 28 |         print("Looks like this is a wiki page titled Index.php, not the index.php script itself")
29 | return False
30 | return bool(
31 | re.search(
 32 |             '(This wiki is powered by|<h2 id="mw-version-license">|meta name="generator" content="MediaWiki|class="mediawiki)',
33 | raw,
34 | )
35 | )
36 |
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/cli/delay.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | import sys
3 | import threading
4 | import time
5 |
6 | from wikiteam3.dumpgenerator.config import Config
7 |
8 |
9 | class Delay:
10 | done: bool = False
11 | lock: threading.Lock = threading.Lock()
12 |
13 | def animate(self):
14 | while True:
15 | with self.lock:
16 | if self.done:
17 | return
18 |
19 | print("\r" + self.ellipses, end="")
20 | self.ellipses += "."
21 |
22 | time.sleep(0.3)
23 |
24 | def __init__(self, config: Config = None, session=None, msg=None, delay=None):
25 | """Add a delay if configured for that"""
26 | self.ellipses: str = "."
27 |
28 | if delay is None:
29 | delay = config.delay
30 | if delay <= 0:
31 | return
32 |
33 | if msg:
34 | self.ellipses = f"Delay {delay:.1f}s: {msg} {self.ellipses}"
35 | else:
36 | self.ellipses = ("Delay %.1fs " % (delay)) + self.ellipses
37 |
38 | ellipses_animation = threading.Thread(target=self.animate)
39 | ellipses_animation.daemon = True
40 | ellipses_animation.start()
41 |
42 | time.sleep(delay)
43 |
44 | with self.lock:
45 | self.done = True
46 | print("\r" + " " * len(self.ellipses) + "\r", end="")
47 |
--------------------------------------------------------------------------------
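Constructing a `Delay` is the whole API: it blocks the calling thread for `config.delay` seconds (or an explicit `delay`) while a daemon thread animates the ellipsis. A short sketch:

```python
from wikiteam3.dumpgenerator.cli import Delay
from wikiteam3.dumpgenerator.config import Config

config = Config(delay=2.0)

# Sleeps 2.0s, printing "Delay 2.0s: between page requests ..." meanwhile.
Delay(config=config, msg="between page requests")

# An explicit delay overrides config.delay; zero or less returns immediately.
Delay(config=config, delay=0)
```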
/.github/workflows/test-dumpgenerator.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
3 |
4 | name: dumpgenerator test
5 |
6 | on:
7 | push:
8 | branches: [ "python3" ]
9 | pull_request:
10 | branches: [ "python3" ]
11 |
12 | jobs:
13 | build:
14 |
15 | runs-on: ubuntu-latest
16 | strategy:
17 | fail-fast: false
18 | matrix:
19 | python-version: ["3.8", "3.9", "3.10", "3.11"]
20 |
21 | steps:
22 | - uses: actions/checkout@v3
23 | - name: Set up Python ${{ matrix.python-version }}
24 | uses: actions/setup-python@v4
25 | with:
26 | python-version: ${{ matrix.python-version }}
27 | - name: Install dependencies
28 | run: |
29 | python -m pip install --upgrade pip
30 | python -m pip install flake8 pytest poetry
31 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
32 | - name: Lint with flake8
33 | run: |
34 | # exit if there are Python syntax errors or undefined names
35 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
36 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
37 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
38 | - name: run dumpgenerator
39 | run: |
40 | python -m wikiteam3.dumpgenerator -h
41 | - name: Test with pytest
42 | run: |
43 | cd wikiteam3/dumpgenerator && pytest && cd ../../
44 |
--------------------------------------------------------------------------------
/wikiteam3/utils/login/__init__.py:
--------------------------------------------------------------------------------
1 | """ Provide login functions """
2 |
3 | import time
4 |
5 | import requests
6 |
7 | from wikiteam3.utils.login.api import botLogin, clientLogin, fetchLoginToken
8 | from wikiteam3.utils.login.index import indexLogin
9 |
10 |
11 | def uniLogin(
12 | api: str = "",
13 | index: str = "",
14 | session: requests.Session = requests.Session(),
15 | username: str = "",
16 | password: str = "",
17 | ):
18 | """Try to login to a wiki using various methods.\n
19 | Return `session` if success, else return `None`.\n
 20 |     Try: `client login (api) => bot login (api) => index login (index)`"""
21 |
22 | if (not api and not index) or (not username or not password):
23 | print("uniLogin: api or index or username or password is empty")
24 | return None
25 |
26 | if api:
27 | print("Trying to log in to the wiki using clientLogin... (MW 1.27+)")
28 | if _session := clientLogin(
29 | api=api, session=session, username=username, password=password
30 | ):
31 | return _session
32 | time.sleep(5)
33 |
34 | print("Trying to log in to the wiki using botLogin... (MW 1.27+)")
35 | if _session := botLogin(
36 | api=api, session=session, username=username, password=password
37 | ):
38 | return _session
39 | time.sleep(5)
40 |
41 | if index:
42 | print("Trying to log in to the wiki using indexLogin... (generic)")
43 | if _session := indexLogin(
44 | index=index, session=session, username=username, password=password
45 | ):
46 | return _session
47 |
48 | return None
49 |
--------------------------------------------------------------------------------
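A hedged usage sketch of the `uniLogin` fallback chain (clientLogin, then botLogin, then the Special:UserLogin form); every value below is a placeholder.

```python
import requests

from wikiteam3.utils import uniLogin

session = requests.Session()
logged_in = uniLogin(
    api="https://wiki.example.org/w/api.php",      # placeholder
    index="https://wiki.example.org/w/index.php",  # placeholder
    session=session,
    username="ExampleUser",
    password="example-password",
)
if logged_in is None:
    print("all login methods failed")
```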
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug Report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 |
11 |
12 |
14 |
15 | ## Describe the Bug
16 |
17 |
18 |
19 | ### Expected Behavior
20 |
21 |
22 |
23 | ### Actual Behavior
24 |
25 |
26 |
27 | ## Command for Reproducing the Bug
28 |
29 |
31 |
32 | ```bash
33 |
34 | ```
35 |
36 | ## Output
37 |
38 |
39 | stdout
40 |
41 |
43 |
44 | ```bash
45 |
46 | ```
47 |
48 |
49 |
50 |
51 | errors.log
52 |
53 |
55 |
56 | ```text
57 |
58 | ```
59 |
60 |
61 |
62 | ## Platform Details
63 |
64 |
66 |
67 | ### Desktop
68 |
69 | - OS and version:
70 | - File system:
71 | - Python version:
72 | - Command line shell:
73 | - `dumpgenerator` version:
74 |
75 | ### Smartphone or Tablet
76 |
77 | - OS:
78 | - Python version:
79 | - Command line shell:
80 | - Terminal application used:
81 | - `dumpgenerator` version:
82 |
83 | ## Additional Context
84 |
85 |
86 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | # Apply to all files without committing:
2 | # pre-commit run --all-files
3 | # Update this file:
4 | # pre-commit autoupdate
5 | default_language_version:
6 | python: python3.8
7 | repos:
8 | - repo: https://github.com/python-poetry/poetry
9 | rev: 1.6.0
10 | hooks:
11 | - id: poetry-check
12 | # - id: poetry-lock
13 | - id: poetry-export
14 | args: ["-f", "requirements.txt", "-o", "requirements.txt"]
15 | - repo: https://github.com/pre-commit/pre-commit-hooks
16 | rev: v4.4.0
17 | hooks:
18 | - id: check-ast
19 | - id: fix-byte-order-marker
20 | - id: check-case-conflict
21 | - id: check-docstring-first
22 | - id: check-executables-have-shebangs
23 | - id: check-json
24 | - id: check-yaml
25 | - id: debug-statements
26 | # - id: detect-aws-credentials
27 | # - id: detect-private-key
28 | - id: end-of-file-fixer
29 | - id: trailing-whitespace
30 | - id: mixed-line-ending
31 | # - repo: https://github.com/pre-commit/mirrors-mypy
32 | # rev: v0.942
33 | # hooks:
34 | # - id: mypy
35 | # args: [--ignore-missing-imports]
36 | - repo: https://github.com/PyCQA/isort
37 | rev: 5.12.0
38 | hooks:
39 | - id: isort
40 | args: ["--profile", "black", "--filter-files"]
41 | - repo: https://github.com/psf/black
42 | rev: 23.7.0
43 | hooks:
44 | - id: black
45 | - repo: https://github.com/asottile/pyupgrade
46 | rev: v3.10.1
47 | hooks:
48 | - id: pyupgrade
49 | args: [--py38-plus]
50 | - repo: https://github.com/asottile/blacken-docs
51 | rev: 1.16.0
52 | hooks:
53 | - id: blacken-docs
54 | # additional_dependencies: [black==20.8b1]
 55 | ### Needs argument for disabling line_length
56 | ### https://github.com/jackdewinter/pymarkdown/blob/main/docs/rules/rule_md013.md
57 | - repo: https://github.com/jackdewinter/pymarkdown
58 | rev: v0.9.12
59 | hooks:
60 | - id: pymarkdown
61 | args:
62 | - --config=.pymarkdown.json
63 | # - --disable-rules
64 | # - line-length,no-inline-html
65 | - scan
66 |
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/dump/misc/site_info.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 |
4 | from wikiteam3.dumpgenerator.api import getJSON
5 | from wikiteam3.dumpgenerator.cli import Delay
6 | from wikiteam3.dumpgenerator.config import Config
7 |
8 |
9 | def saveSiteInfo(config: Config = None, session=None):
10 | """Save a file with site info"""
11 |
12 | if not config.api:
13 | return
14 | if os.path.exists(f"{config.path}/siteinfo.json"):
15 | print("siteinfo.json exists, do not overwrite")
16 | else:
17 | print("Downloading site info as siteinfo.json")
18 |
19 | # MediaWiki 1.13+
20 | r = session.get(
21 | url=config.api,
22 | params={
23 | "action": "query",
24 | "meta": "siteinfo",
25 | "siprop": "general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo",
26 | "sinumberingroup": 1,
27 | "format": "json",
28 | },
29 | timeout=10,
30 | )
31 | # MediaWiki 1.11-1.12
32 | if "query" not in getJSON(r):
33 | r = session.get(
34 | url=config.api,
35 | params={
36 | "action": "query",
37 | "meta": "siteinfo",
38 | "siprop": "general|namespaces|statistics|dbrepllag|interwikimap",
39 | "format": "json",
40 | },
41 | timeout=10,
42 | )
43 | # MediaWiki 1.8-1.10
44 | if "query" not in getJSON(r):
45 | r = session.get(
46 | url=config.api,
47 | params={
48 | "action": "query",
49 | "meta": "siteinfo",
50 | "siprop": "general|namespaces",
51 | "format": "json",
52 | },
53 | timeout=10,
54 | )
55 | result = getJSON(r)
56 | Delay(config=config, session=session)
57 | with open(f"{config.path}/siteinfo.json", "w", encoding="utf-8") as outfile:
58 | outfile.write(json.dumps(result, indent=4, sort_keys=True))
59 |
--------------------------------------------------------------------------------
/PUBLISHING.md:
--------------------------------------------------------------------------------
1 | # Publishing the dump
2 |
3 | Publishing your dumps to the [Internet Archive's wikiteam collection](https://archive.org/details/wikiteam) is easily done. First [sign up](https://archive.org/account/signup) or [login](http://archive.org/account/login.php).
4 |
5 | ## Launcher and uploader
6 |
7 | Instructions on using the scripts `launcher` and `uploader` are in the file [Usage](./USAGE.md).
8 |
9 | ## Automatic publishing
10 |
11 | Just use `uploader` (especially if you have multiple wikis): the script takes the filename of a list of wikis as argument and uploads their dumps to archive.org. You only need to:
12 |
 13 | - Check that the 7z-compressed dumps are in the same directory as `listfile`. The file `listfile` contains a list of the api.php URLs of the wikis to upload, one per line.
 14 | - [Retrieve your S3 keys](http://www.archive.org/account/s3.php) and save them, one per line (in the order provided), in a `keys.txt` file in the same directory as `uploader` (see the example layout below).
15 | - Run the script `uploader listfile`.
16 |
17 | ## Manual publishing
18 |
19 | - After running dumpgenerator, in each dump folder, select all files, right-click on the selection, click 7-Zip, click `Add to archive...` and click OK.
20 | - At Archive.org, for each wiki [create a new item](http://archive.org/create/).
21 | - Click `Upload files`. Then either drag and drop the 7-Zip archive onto the box or click `Choose files` and select the 7-Zip archive.
22 | - `Page Title` and `Page URL` will be filled in by the uploader.
 23 | - Add a short `Description`, such as a descriptive name for the wiki.
 24 | - Add `Subject Tags`, separated by commas; these are the keywords that help the archive show up in an Internet Archive search, e.g. wikiteam, wiki, the subjects of the wiki, and so on.
25 | - `Creator`, can be left blank.
26 | - `Date`, can be left blank.
27 | - `Collection`, select `Community texts`.
28 | - `Language`, select the language of the wiki.
 29 | - `License`, click to expand and select Creative Commons, Allow Remixing, Require Share-Alike for a CC-BY-SA license.
30 | - Click `Upload and Create Your Item`.
31 |
32 | With the subject tag of wikiteam and collection of community texts, your uploads should appear in a search for [subject:"wikiteam" AND collection:opensource](https://archive.org/search?query=subject%3A%22wikiteam%22+AND+collection%3Aopensource).
33 |
34 | ## Info for developers
35 |
36 | - [Internet Archive’s S3 like server API](https://archive.org/developers/ias3.html).
37 |
--------------------------------------------------------------------------------
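The "Automatic publishing" steps above assume a working directory shaped roughly like this; the filenames are illustrative, not prescribed:

```text
uploads/
├── keys.txt       # IA S3 access key and secret key, one per line, in that order
├── listfile       # one api.php URL per line, one for each dump below
└── wikiexampleorg_w-20230701-wikidump.7z
```

Running `uploader listfile` from inside that directory then matches each URL in `listfile` to its 7z dump and uploads it.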
/wikiteam3/utils/login/index.py:
--------------------------------------------------------------------------------
 1 | """ Always available login methods. (MW 1.16-1.39)
 2 | Even older versions of MW may work, but are untested. """
3 |
4 | from typing import *
5 |
6 | import lxml.html
7 | import requests
8 |
9 |
10 | def indexLogin(
11 | index: str, session: requests.Session, username: str, password: str
12 | ) -> Optional[requests.Session]:
13 | """Try to login to a wiki using username and password through `Special:UserLogin`.
14 | (tested on MW 1.16...1.39)"""
15 | wpEditToken = None
16 | wpLoginToken = None
17 |
18 | params = {
19 | "title": "Special:UserLogin",
20 | }
21 | r = session.get(index, allow_redirects=True, params=params)
22 |
23 | # Sample r.text:
24 | # MW 1.16:
25 | # MW 1.39:
26 | html = lxml.html.fromstring(r.text)
27 | if "wpLoginToken" in r.text:
28 | wpLoginToken = html.xpath('//input[@name="wpLoginToken"]/@value')[0]
29 |
30 | # Sample r.text:
31 | # MW 1.16: None
32 | # MW 1.39:
33 | if "wpEditToken" in r.text:
34 | wpEditToken = html.xpath('//input[@name="wpEditToken"]/@value')[0]
35 | print("index login: wpEditToken found.")
36 |
37 | data = {
38 | "wpName": username, # required
39 | "wpPassword": password, # required
40 | "wpLoginattempt": "Log in", # required
41 | "wpLoginToken": wpLoginToken, # required
42 | "wpRemember": "1", # 0: not remember, 1: remember
43 | "wpEditToken": wpEditToken, # introduced before MW 1.27, not sure whether it's required.
44 | "authAction": "login", # introduced before MW 1.39.
45 | "title": "Special:UserLogin", # introduced before MW 1.39.
46 | "force": "", # introduced before MW 1.39, empty string is OK.
47 | }
48 | r = session.post(index, allow_redirects=False, params=params, data=data)
49 | if r.status_code == 302:
50 | print("index login: Success! Welcome, ", username, "!")
51 | return session
52 | else:
53 | print(
54 | "index login: Oops! Something went wrong -- ",
55 | r.status_code,
56 | "wpLoginToken: ",
57 | wpLoginToken,
58 | "wpEditToken: ",
59 | wpEditToken,
60 | )
61 | return None
62 |
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/dump/xmldump/xml_truncate.py:
--------------------------------------------------------------------------------
1 | import os
2 | from io import StringIO
3 | from typing import *
4 |
5 | import lxml.etree
6 | from file_read_backwards import FileReadBackwards
7 |
8 |
9 | def endsWithNewlines(filename: str) -> int:
10 | """Returns the number of newlines at the end of file"""
11 |
12 | with FileReadBackwards(filename, encoding="utf-8") as frb:
13 | newlines = 0
14 | while frb.readline() == "":
15 | newlines += 1
16 | return newlines
17 |
18 |
19 | def addNewline(filename: str) -> None:
20 | """Adds a newline to the end of file"""
21 |
22 | print(f"Adding newline to end of {filename}")
23 | with open(filename, "a", encoding="utf-8") as f:
24 | f.write("\n")
25 |
26 |
27 | def truncateXMLDump(filename: str) -> str:
 28 |     """Removes incomplete <page> elements from the end of XML dump files"""
29 |
30 | with FileReadBackwards(filename, encoding="utf-8") as frb:
31 | incomplete_segment: str = ""
32 | xml_line: str = frb.readline()
 33 |         while xml_line and "</page>" not in xml_line:
34 | incomplete_segment = xml_line + incomplete_segment
35 | xml_line = frb.readline()
 36 |         while xml_line and "</page>" not in xml_line:
37 | incomplete_segment = xml_line + incomplete_segment
38 | xml_line = frb.readline()
39 | incomplete_segment_size = len(incomplete_segment.encode("utf-8"))
40 | file_size = os.path.getsize(filename)
41 | if file_size > incomplete_segment_size:
42 | with open(filename, "r+", encoding="utf-8") as fh:
43 | fh.truncate(file_size - incomplete_segment_size)
44 | else:
45 | print(
46 | 'len(incomplete_segment.encode("utf-8")) returned '
47 | + str(incomplete_segment_size)
48 | + ", while os.path.getsize(filename) returned "
49 | + str(file_size)
50 | + ", so fh.truncate() would be fh.truncate("
51 | + str(file_size - incomplete_segment_size)
52 | + "), which would be illegal. Something is seriously wrong here!"
53 | )
54 |
 55 |     # add newline to prevent `</page></mediawiki>` in one line
56 | if endsWithNewlines(filename) == 0:
57 | addNewline(filename)
58 | elif endsWithNewlines(filename) > 1:
59 | print(f"WARNING: {filename} has {endsWithNewlines(filename)} newlines")
60 | return incomplete_segment
61 |
62 |
63 | def parseLastPageChunk(chunk) -> Optional[lxml.etree._ElementTree]:
64 | try:
65 | parser = lxml.etree.XMLParser(recover=True)
66 | tree = lxml.etree.parse(StringIO(chunk), parser)
67 | return tree.getroot()
68 | except lxml.etree.LxmlError:
69 | return None
70 |
--------------------------------------------------------------------------------
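A sketch of the intended resume flow for the helpers above: truncate the possibly broken tail of a dump, then parse the removed chunk to see where the dump stopped. The dump path is a placeholder.

```python
from wikiteam3.dumpgenerator.dump.xmldump.xml_truncate import (
    parseLastPageChunk,
    truncateXMLDump,
)

dump_path = "wikiexampleorg_w-20230701-history.xml"  # placeholder path

# Everything after the last complete page element is cut from the file
# and returned, so the caller can inspect the partial tail.
tail = truncateXMLDump(dump_path)
if tail:
    root = parseLastPageChunk(tail)  # recovering parser; None if hopeless
    if root is not None:
        print("dump was truncated inside element:", root.tag)
```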
/wikiteam3/dumpgenerator/dump/image/html_regexs.py:
--------------------------------------------------------------------------------
 1 | R_NEXT = r"(?<!&amp;dir=prev)&amp;offset=(?P<offset>\d+)&amp;"
2 |
3 | REGEX_CANDIDATES = [
4 | # [0]
5 | # archiveteam 1.15.1
6 | # wikanda 1.15.5
10 | r'(?im)\s*'
11 | # [1]
12 | # wikijuegos 1.9.5
13 | # http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old
14 | # mediawiki version
15 | ,
16 | r'(?im)\s*\s*\s*'
17 | # [2]
18 | # gentoowiki 1.18
19 | ,
20 | r'(?im)'
21 | # [3]
22 | # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch=
23 | # (desc) 109 0923.JPG . . 885,713 bytes . . Bfalconer . . 18:44, 17 November 2005
24 | ,
 25 |     '(?ism)<a href=[^>]+ title="[^:>]+:(?P<filename>[^>]+)">[^<]+</a>[^<]+<a href="(?P<url>[^>]+)">[^<]+</a>[^<]+<a[^>]+>(?P<uploader>[^<]+)</a>'
26 | # [4]
27 | ,
28 | (
29 | r'(?im)\s*'
30 | r'\s*'
31 | r'\s*'
32 | r').)*?)?'
33 | ),
34 | ]
35 |
--------------------------------------------------------------------------------
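The regex literals above lost their HTML tag text when this listing was extracted, but the named groups (`filename`, `url`, `uploader`) and the candidate-list design still suggest the calling convention: try each pattern against a `Special:ListFiles` page and keep the one that matches the most rows. This sketch of that convention is an assumption, not the repository's actual scraper.

```python
import re

from wikiteam3.dumpgenerator.dump.image.html_regexs import REGEX_CANDIDATES


def best_candidate(raw_html: str) -> str:
    """Pick the candidate regex with the most matches (assumed convention)."""
    counts = [len(re.findall(regex, raw_html)) for regex in REGEX_CANDIDATES]
    return REGEX_CANDIDATES[counts.index(max(counts))]
```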
/wikiteam3/dumpgenerator/config.py:
--------------------------------------------------------------------------------
1 | """
2 | config = {
3 | "curonly": args.curonly,
4 | "date": datetime.datetime.now().strftime("%Y%m%d"),
5 | "api": api,
6 | "failfast": args.failfast,
7 | "http_method": "POST",
8 | "index": index,
9 | "images": args.images,
10 | "logs": False,
11 | "xml": args.xml,
12 | "xmlrevisions": args.xmlrevisions,
13 | "namespaces": namespaces,
14 | "exnamespaces": exnamespaces,
15 | "path": args.path and os.path.normpath(args.path) or "",
16 | "cookies": args.cookies or "",
17 | "delay": args.delay,
18 | "retries": int(args.retries),
19 | }
20 | """
21 |
22 | import dataclasses
23 | import json
24 | import sys
25 | from typing import *
26 |
27 |
28 | def _dataclass_from_dict(klass_or_obj, d):
29 | ret = klass_or_obj() if isinstance(klass_or_obj, type) else klass_or_obj
30 | for k, v in d.items():
31 | if hasattr(ret, k):
32 | setattr(ret, k, v)
33 | return ret
34 |
35 |
36 | @dataclasses.dataclass
37 | class Config:
38 | def asdict(self):
39 | return dataclasses.asdict(self)
40 |
41 | # General params
42 | delay: float = 0.0
43 | retries: int = 0
44 | path: str = ""
45 | logs: bool = False
 46 |     date: str = ""
47 |
48 | # URL params
49 | index: str = ""
50 | api: str = ""
51 |
52 | # Download params
53 | xml: bool = False
54 | curonly: bool = False
55 | xmlapiexport: bool = False
56 | xmlrevisions: bool = False
57 | xmlrevisions_page: bool = False
58 | images: bool = False
 59 |     namespaces: Optional[List[int]] = None
 60 |     exnamespaces: Optional[List[int]] = None
61 |
62 | api_chunksize: int = 0 # arvlimit, ailimit, etc
63 | export: str = "" # Special:Export page name
64 | http_method: str = ""
65 |
66 | # Meta info params
67 | failfast: bool = False
68 |
69 | templates: bool = False
70 |
71 |
72 | def newConfig(configDict) -> Config:
73 | return _dataclass_from_dict(Config, configDict)
74 |
75 |
76 | def loadConfig(config: Config = None, configfilename=""):
77 | """Load config file"""
78 |
79 | configDict = dataclasses.asdict(config)
80 |
81 | if config.path:
82 | try:
83 | with open(f"{config.path}/{configfilename}", encoding="utf-8") as infile:
84 | configDict.update(json.load(infile))
85 | return newConfig(configDict)
 86 |         except (OSError, json.JSONDecodeError):
 87 |             pass
88 |
 89 |     print("There is no config file, so we can't resume. Please start a new dump.")
90 | sys.exit()
91 |
92 |
93 | def saveConfig(config: Config = None, configfilename=""):
94 | """Save config file"""
95 |
96 | with open(f"{config.path}/{configfilename}", "w", encoding="utf-8") as outfile:
97 | json.dump(dataclasses.asdict(config), outfile)
98 |
--------------------------------------------------------------------------------
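A round-trip sketch for the helpers above: build a `Config` from a plain dict, save it, and load it back. The API URL is a placeholder.

```python
import tempfile

from wikiteam3.dumpgenerator.config import loadConfig, newConfig, saveConfig

with tempfile.TemporaryDirectory() as tmpdir:
    config = newConfig(
        {
            "api": "https://wiki.example.org/w/api.php",  # placeholder
            "xml": True,
            "delay": 1.0,
            "path": tmpdir,
        }
    )
    saveConfig(config, configfilename="config.json")

    # Keys in the JSON that Config does not define are simply ignored
    # by _dataclass_from_dict.
    resumed = loadConfig(config, configfilename="config.json")
    assert resumed.api == config.api
```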
/wikiteam3/utils/login/api.py:
--------------------------------------------------------------------------------
 1 | """ Login to a wiki using username and password via the API. (Available since MediaWiki 1.27) """
2 |
3 | from typing import *
4 |
5 | import requests
6 |
7 |
8 | def fetchLoginToken(session: requests.Session, api: str) -> Optional[str]:
 9 |     """Fetch a login token via the API. (MediaWiki 1.27+)"""
10 |
11 | response = session.get(
12 | url=api,
13 | params={"action": "query", "meta": "tokens", "type": "login", "format": "json"},
14 | )
15 | data = response.json()
16 | try:
17 | token = data["query"]["tokens"]["logintoken"]
18 | if type(token) is str:
19 | return token
20 | except KeyError:
21 | print("fetch login token: Oops! Something went wrong -- ", data)
22 | return None
23 |
24 |
25 | def clientLogin(
26 | api: str, session: requests.Session, username: str, password: str
27 | ) -> Optional[requests.Session]:
28 | """login to a wiki using username and password. (MediaWiki 1.27+)"""
29 |
30 | login_token = fetchLoginToken(session=session, api=api)
31 | if not login_token:
32 | return None
33 |
34 | response = session.post(
35 | url=api,
36 | data={
37 | "action": "clientlogin",
38 | "username": username,
39 | "password": password,
40 | "loginreturnurl": "http://127.0.0.1:5000/",
41 | "logintoken": login_token,
42 | "format": "json",
43 | },
44 | )
45 |
46 | data = response.json()
47 |
 48 |     try:
 49 |         if data["clientlogin"]["status"] != "PASS":
 50 |             print("client login: Oops! Something went wrong -- ", data)
 51 |             return None
 52 |         print(
 53 |             "client login: Success! Welcome, " + data["clientlogin"]["username"] + "!"
 54 |         )
 55 |     except KeyError:
 56 |         print("client login: Oops! Something went wrong -- ", data)
 57 |         return None
 58 |
 59 |     return session
60 |
61 |
62 | def botLogin(
63 | api: str, session: requests.Session, username: str, password: str
64 | ) -> Optional[requests.Session]:
65 | """login to a wiki using BOT's name and password. (MediaWiki 1.27+)"""
66 |
67 | login_token = fetchLoginToken(session=session, api=api)
68 | if not login_token:
69 | return None
70 |
71 | response = session.post(
72 | url=api,
73 | data={
74 | "action": "login",
75 | "lgname": username,
76 | "lgpassword": password,
77 | "lgtoken": login_token,
78 | "format": "json",
79 | },
80 | )
81 |
82 | data = response.json()
83 |
 84 |     try:
 85 |         if data["login"]["result"] != "Success":
 86 |             print(f"bot login: Oops! Something went wrong -- {data}")
 87 |             return None
 88 |         print("bot login: Success! Welcome, " + data["login"]["lgusername"] + "!")
 89 |     except KeyError:
 90 |         print(f"bot login: Oops! Something went wrong -- {data}")
 91 |         return None
 92 |
 93 |     return session
 94 |
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/cli/greeter.py:
--------------------------------------------------------------------------------
1 | import datetime
2 |
3 | from wikiteam3.dumpgenerator.version import getVersion
4 |
5 |
 6 | def welcome():
 7 |     """Opening message"""
 8 |     message = ""
9 | message += "#" * 73
10 | message += "\n"
11 | welcome_string = f"# Welcome to DumpGenerator {getVersion()} by WikiTeam (GPL v3)"
12 | welcome_string += " " * (73 - len(welcome_string) - 1) + "#"
13 | message += welcome_string
14 | message += "\n"
15 | message += (
16 | "# More info at: https://github.com/elsiehupp/wikiteam3 #"
17 | )
18 | message += "\n"
19 | message += "#" * 73
20 | message += "\n"
21 | message += ""
22 | message += "\n"
23 | message += "#" * 73
24 | message += "\n"
25 | message += (
26 | "# Copyright (C) 2011-%d WikiTeam developers #\n"
27 | % (datetime.datetime.now().year)
28 | )
29 | message += """# #
30 | # This program is free software: you can redistribute it and/or modify #
31 | # it under the terms of the GNU General Public License as published by #
32 | # the Free Software Foundation, either version 3 of the License, or #
33 | # (at your option) any later version. #
34 | # #
35 | # This program is distributed in the hope that it will be useful, #
36 | # but WITHOUT ANY WARRANTY; without even the implied warranty of #
37 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the #
38 | # GNU General Public License for more details. #
39 | # #
40 | # You should have received a copy of the GNU General Public License #
 41 | # along with this program. If not, see <https://www.gnu.org/licenses/>. #"""
42 | message += "\n"
43 | message += "#" * 73
44 | message += "\n"
45 | message += ""
46 |
47 | return message
48 |
49 |
50 | def bye():
51 | """Closing message"""
52 | print("")
53 | print("---> Congratulations! Your dump is complete <---")
54 | print("")
55 | print("If you encountered a bug, you can report it on GitHub Issues:")
56 | print(" https://github.com/mediawiki-client-tools/mediawiki-dump-generator/issues")
57 | print("")
58 | print("If you need any other help, you can reach out on GitHub Discussions:")
59 | print(" https://github.com/orgs/mediawiki-client-tools/discussions")
60 | print("")
61 | print("If this is a public wiki, please, consider publishing this dump.")
62 | print("Do it yourself as explained in:")
63 | print(" https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump")
64 | print("")
65 | print("Good luck! Bye!")
66 | print("")
67 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "wikiteam3"
3 | version = "3.0.0"
 4 | description = "Tools for downloading and preserving wikis. We archive wikis, from Wikipedia to the tiniest wikis. As of 2020, WikiTeam has preserved more than 250,000 wikis."
5 | license = "GPL-3.0-or-later"
6 | authors = ["WikiTeam Contributors "]
7 | maintainers = [
8 | "Federico Leva ",
9 | "Elsie Hupp "
10 | ]
11 | readme = "README.md"
12 | homepage = "https://wiki.archiveteam.org/index.php/WikiTeam"
13 | repository = "https://github.com/WikiTeam/wikiteam"
14 | documentation = "https://wikiteam.readthedocs.io"
15 | keywords = [
16 | "archiveteam",
17 | "mediawiki",
18 | "preservation",
19 | "wiki",
20 | "wikipedia"
21 | ]
22 | classifiers = [
23 | "Development Status :: 3 - Alpha",
24 | "Environment :: Console",
25 | "Intended Audience :: Education",
26 | "Intended Audience :: End Users/Desktop",
27 | "Intended Audience :: Information Technology",
28 | "Intended Audience :: Legal Industry",
29 | "Intended Audience :: Science/Research",
30 | "Intended Audience :: System Administrators",
31 | "Natural Language :: English",
32 | "Operating System :: OS Independent",
33 | "Topic :: Communications",
34 | "Topic :: Internet",
35 | "Topic :: Internet :: WWW/HTTP :: Dynamic Content :: Wiki",
36 | "Topic :: Scientific/Engineering :: Information Analysis",
37 | "Topic :: Sociology :: History",
38 | "Topic :: System :: Archiving",
39 | "Topic :: System :: Archiving :: Backup",
40 | "Topic :: Utilities"
41 | ]
42 | packages = [
43 | { include = "wikiteam3/**/*"},
44 | ]
45 | exclude = ["wikiteam3/dumpgenerator/test/*"]
46 |
47 | [tool.poetry.scripts]
48 | dumpgenerator = "wikiteam3.dumpgenerator:main"
49 | # gui = "wikiteam3.gui:main"
50 | launcher = "wikiteam3.launcher:main"
51 | # not-archived = "wikiteam3.not-archived:main"
52 | uploader = "wikiteam3.uploader:main"
53 | # wikiadownloader = "wikiteam3.wikiadownloader:main"
54 | # wikipediadownloader = "wikiteam3.wikipediadownloader:main"
55 | # wikispaces = "wikiteam3.wikispaces:main"
56 |
57 | [tool.poetry.dependencies]
58 | python = "^3.8"
59 | requests = "^2.32.0"
60 | internetarchive = "^3.1.0"
61 | lxml = "^5.0.0"
62 | mwclient = "^0.10.1"
63 | PyMySQL = "^1.1.1"
64 | pywikibot = "^6.6.1"
65 | urllib3 = "^1.26.18"
66 | wikitools3 = "^3.0.0"
67 | pymysql = "*"
68 | file_read_backwards = "^2.0.0"
69 | pre-commit-poetry-export = "^0.1.2"
70 |
71 | [tool.isort]
72 | profile = "black"
73 |
74 | [tool.poetry.dev-dependencies]
75 | pytest = "^6.2.5"
76 | requests = "^2.32.0"
77 | flake8 = "^3.9.2"
78 | pre-commit = "^2.17.0"
79 | pymarkdown = "^0.1.4"
80 |
81 | [build-system]
82 | requires = ["poetry-core>=1.0.0"]
83 | build-backend = "poetry.core.masonry.api"
84 |
85 | [tool.pymarkdown]
86 | disable-rules = "line-length,no-inline-html"
87 |
--------------------------------------------------------------------------------
/wikiteam3/utils/util.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import re
3 | import sys
4 |
5 |
6 | def cleanHTML(raw: str = "") -> str:
7 | """Extract only the real wiki content and remove rubbish
8 | This function is ONLY used to retrieve page titles
9 | and file names when no API is available
10 | DO NOT use this function to extract page content"""
11 | # different "tags" used by different MediaWiki versions to mark where
12 | # starts and ends content
 13 |     if re.search("<!-- bodytext -->", raw):
 14 |         raw = raw.split("<!-- bodytext -->")[1].split("<!-- /bodytext -->")[0]
 15 |     elif re.search("<!-- start content -->", raw):
 16 |         raw = raw.split("<!-- start content -->")[1].split("<!-- end content -->")[0]
 17 |     elif re.search("<!-- Begin Content Area -->", raw):
 18 |         raw = raw.split("<!-- Begin Content Area -->")[1].split(
 19 |             "<!-- End Content Area -->"
 20 |         )[0]
 21 |     elif re.search("<!-- content -->", raw):
 22 |         raw = raw.split("<!-- content -->")[1].split("<!-- mw_content -->")[0]
 23 |     elif re.search(r'<article id="WikiaMainContent" class="WikiaMainContent">', raw):
 24 |         raw = raw.split('<article id="WikiaMainContent" class="WikiaMainContent">')[
 25 |             1
 26 |         ].split("</article>")[0]
 27 |     elif re.search("<body class=", raw):
 28 |         raw = raw.split("<body class=")[1].split("</body>")[0]
29 | else:
30 | print(raw[:250])
31 | print("This wiki doesn't use marks to split content")
32 | sys.exit()
33 | return raw
34 |
35 |
36 | def undoHTMLEntities(text: str = "") -> str:
37 | """Undo some HTML codes"""
38 |
 39 |     # i guess only &lt; &gt; &amp; &quot; &#039; need conversion
 40 |     # http://www.w3schools.com/html/html_entities.asp
 41 |     text = re.sub("&lt;", "<", text)
 42 |     text = re.sub("&gt;", ">", text)
 43 |     text = re.sub("&amp;", "&", text)
 44 |     text = re.sub("&quot;", '"', text)
 45 |     text = re.sub("&#039;", "'", text)
46 |
47 | return text
48 |
49 |
50 | def removeIP(raw: str = "") -> str:
 51 |     """Remove IP from HTML comments <!-- -->"""
52 |
53 | raw = re.sub(r"\d+\.\d+\.\d+\.\d+", "0.0.0.0", raw)
54 | # http://www.juniper.net/techpubs/software/erx/erx50x/swconfig-routing-vol1/html/ipv6-config5.html
55 | # weird cases as :: are not included
56 | raw = re.sub(
57 | r"(?i)[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}",
58 | "0:0:0:0:0:0:0:0",
59 | raw,
60 | )
61 |
62 | return raw
63 |
64 |
65 | def cleanXML(xml: str = "") -> str:
66 | """Trim redundant info from the XML however it comes"""
67 | # do not touch XML codification, leave AS IS
68 | # EDIT 2022: we are making this explicitly Unicode
69 | # for Windows compatibility.
70 | # If the encoding has to stay as is, we'll have
71 | # to change all the file encodings, as well.
72 |
 73 |     if re.search(r"</siteinfo>\n", xml):
 74 |         xml = xml.split("</siteinfo>\n")[1]
 75 |     if re.search(r"</mediawiki>", xml):
 76 |         xml = xml.split("</mediawiki>")[0]
77 | return xml
78 |
79 |
80 | def sha1File(filename: str = "") -> str:
81 | """Return the SHA1 hash of a file"""
82 |
83 | sha1 = hashlib.sha1()
84 | with open(filename, "rb") as f:
85 | while True:
86 | if data := f.read(65536):
87 | sha1.update(data)
88 | else:
89 | break
90 | return sha1.hexdigest()
91 |
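92 | 
93 | if __name__ == "__main__":
94 |     # Illustrative usage (an editor's sketch, not part of the original
95 |     # module); the sample strings below are made up.
96 |     sample = "&lt;span&gt;Tom &amp; Jerry&#039;s page&lt;/span&gt;"
97 |     print(undoHTMLEntities(text=sample))
98 |     # -> <span>Tom & Jerry's page</span>
99 |     print(removeIP(raw="Edited by 203.0.113.42 and 2001:db8:0:0:0:0:0:1"))
100 |     # -> Edited by 0.0.0.0 and 0:0:0:0:0:0:0:0
101 |     print(sha1File(filename=__file__))  # SHA1 of this very file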
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # `MediaWiki Dump Generator`
2 |
3 | **MediaWiki Dump Generator can archive wikis from the largest to the tiniest.**
4 |
5 | `MediaWiki Dump Generator` is a project to port the legacy [`wikiteam`](https://github.com/WikiTeam/wikiteam) toolset to Python 3 and PyPI to make it more accessible for today's archivers.
6 |
7 | Most of the focus has been on the core `dumpgenerator` tool. Python 3 versions of the other `wikiteam` tools may be added over time.
8 |
9 | The project is currently mostly in maintenance mode. We will do our best to prevent the project from breaking entirely. Issues and pull requests are welcomed but may not be reviewed promptly.
10 |
11 | ## MediaWiki Dump Generator Toolset
12 |
13 | MediaWiki Dump Generator is a set of tools for archiving wikis. The main general-purpose module of MediaWiki Dump Generator is dumpgenerator, which can download XML dumps of MediaWiki sites that can then be parsed or redeployed elsewhere.
14 |
15 | Wikipedia itself is far too large for `dumpgenerator` to handle, and [dumps are already freely available](https://en.wikipedia.org/wiki/Wikipedia:Database_download#Where_do_I_get_the_dumps?).
16 |
17 | ## Installing the tools
18 |
19 | For prerequisites and installation, see [Installation](./INSTALLATION.md).
20 |
21 | ## Using the tools
22 |
23 | For usage, see [Usage](./USAGE.md).
24 |
25 | ## Publishing the dump
26 |
27 | Please consider publishing your wiki dump(s). You can do it yourself as explained in [Publishing](./PUBLISHING.md).
28 |
29 | ## Getting help
30 |
31 | * You can read and post in MediaWiki Client Tools' [GitHub Discussions](https://github.com/orgs/mediawiki-client-tools/discussions).
32 | * If you need help (other than reporting a bug), you can reach out on MediaWiki Client Tools' [Discussions/Q&A](https://github.com/orgs/mediawiki-client-tools/discussions/categories/q-a).
33 |
34 | ## Contributing
35 |
36 | For information on reporting bugs and proposing changes, please see the [Contributing](./CONTRIBUTING.md) guide.
37 |
38 | ## Code of Conduct
39 |
40 | `mediawiki-client-tools` has a [Code of Conduct](./CODE_OF_CONDUCT.md).
41 |
42 | At the moment the only person responsible for reviewing CoC reports is the repository administrator, Janet Cobb, reachable at [git@randomcat.org](mailto:git@randomcat.org). Please state up front if your message concerns the Code of Conduct, as these messages are confidential.
43 |
44 | In case of emergency (i.e. if Janet is not reachable or if such an issue involves her), you can contact Elsie Hupp, who also retains privileges over this repository, directly via email at [mediawiki-client-tools@elsiehupp.com](mailto:mediawiki-client-tools@elsiehupp.com) or on Matrix at [@elsiehupp:beeper.com](https://matrix.to/#/@elsiehupp:beeper.com).
45 |
46 | ## Contributors
47 |
48 | **WikiTeam** is the [Archive Team](http://www.archiveteam.org) [[GitHub](https://github.com/ArchiveTeam)] subcommittee on wikis.
49 | It was founded and originally developed by [Emilio J. Rodríguez-Posada](https://github.com/emijrp), a Wikipedia veteran editor and amateur archivist. Thanks to people who have helped, especially to: [Federico Leva](https://github.com/nemobis), [Alex Buie](https://github.com/ab2525), [Scott Boyd](http://www.sdboyd56.com), [Hydriz](https://github.com/Hydriz), Platonides, Ian McEwen, [Mike Dupont](https://github.com/h4ck3rm1k3), [balr0g](https://github.com/balr0g) and [PiRSquared17](https://github.com/PiRSquared17).
50 |
51 | **MediaWiki Dump Generator**
52 | The Python 3 initiative was started and originally maintained by [Elsie Hupp](https://github.com/elsiehupp); it is currently primarily maintained by [Janet Cobb](https://github.com/randomnetcat). We are also grateful to have contributions from [Victor Gambier](https://github.com/vgambier), [Thomas Karcher](https://github.com/t-karcher), [yzqzss](https://github.com/yzqzss), [NyaMisty](https://github.com/NyaMisty) and [Rob Kam](https://github.com/robkam).
53 |
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/api/namespaces.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | from wikiteam3.dumpgenerator.api import getJSON
4 | from wikiteam3.dumpgenerator.cli import Delay
5 | from wikiteam3.dumpgenerator.config import Config
6 |
7 |
8 | def getNamespacesScraper(config: Config = None, session=None):
9 |     """Hackishly get the list of namespace names and ids from the dropdown
10 |     in the HTML of Special:AllPages; called only when no API is available"""
11 | namespaces = config.namespaces
12 | namespacenames = {0: ""} # main is 0, no prefix
13 | if namespaces:
14 | r = session.post(
15 | url=config.index, params={"title": "Special:Allpages"}, timeout=30
16 | )
17 | raw = r.text
18 | Delay(config=config, session=session)
19 |
20 | # [^>]*? to include selected="selected"
21 | m = re.compile(
22 |             r'<option [^>]*?value="(?P<namespaceid>\d+)"[^>]*?>(?P<namespacename>[^<]+)</option>'
23 | ).finditer(raw)
24 | if "all" in namespaces:
25 | namespaces = []
26 | for i in m:
27 | namespaces.append(int(i.group("namespaceid")))
28 | namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
29 | else:
30 | # check if those namespaces really exist in this wiki
31 | namespaces2 = []
32 | for i in m:
33 | if int(i.group("namespaceid")) in namespaces:
34 | namespaces2.append(int(i.group("namespaceid")))
35 | namespacenames[int(i.group("namespaceid"))] = i.group(
36 | "namespacename"
37 | )
38 | namespaces = namespaces2
39 | else:
40 | namespaces = [0]
41 |
42 | namespaces = list(set(namespaces)) # uniques
43 | print("%d namespaces found" % (len(namespaces)))
44 | return namespaces, namespacenames
45 |
46 |
47 | def getNamespacesAPI(config: Config = None, session=None):
48 | """Uses the API to get the list of namespaces names and ids"""
49 | namespaces = config.namespaces
50 | namespacenames = {0: ""} # main is 0, no prefix
51 | if namespaces:
52 | r = session.get(
53 | url=config.api,
54 | params={
55 | "action": "query",
56 | "meta": "siteinfo",
57 | "siprop": "namespaces",
58 | "format": "json",
59 | },
60 | timeout=30,
61 | )
62 | result = getJSON(r)
63 | Delay(config=config, session=session)
64 | try:
65 | nsquery = result["query"]["namespaces"]
66 | except KeyError:
67 | print("Error: could not get namespaces from the API request.")
68 | print("HTTP %d" % r.status_code)
69 | print(r.text)
70 | return None
71 |
72 | if "all" in namespaces:
73 | namespaces = []
74 | for i in nsquery.keys():
75 | if int(i) < 0: # -1: Special, -2: Media, excluding
76 | continue
77 | namespaces.append(int(i))
78 | namespacenames[int(i)] = nsquery[i]["*"]
79 | else:
80 | # check if those namespaces really exist in this wiki
81 | namespaces2 = []
82 | for i in nsquery.keys():
83 | bi = i
84 | i = int(i)
85 | if i < 0: # -1: Special, -2: Media, excluding
86 | continue
87 | if i in namespaces:
88 | namespaces2.append(i)
89 | namespacenames[i] = nsquery[bi]["*"]
90 | namespaces = namespaces2
91 | else:
92 | namespaces = [0]
93 |
94 | namespaces = list(set(namespaces)) # uniques
95 | print("%d namespaces found" % (len(namespaces)))
96 | return namespaces, namespacenames
97 |
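98 | 
99 | if __name__ == "__main__":
100 |     # Illustrative sketch (an editor's addition, not part of the original
101 |     # module): fetch every namespace of a wiki via the API. The URL is a
102 |     # placeholder, and Config is assumed to be constructible with defaults
103 |     # and writable attributes.
104 |     import requests
105 | 
106 |     session = requests.Session()
107 |     config = Config()
108 |     config.api = "https://wiki.example.org/w/api.php"
109 |     config.namespaces = ["all"]
110 |     result = getNamespacesAPI(config=config, session=session)
111 |     if result:
112 |         namespaces, namespacenames = result
113 |         for ns in sorted(namespaces):
114 |             print(ns, namespacenames[ns])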
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/dump/image/html_regexs_test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | from pathlib import Path
4 | from typing import Dict
5 |
6 | import pytest
7 | import requests
8 |
9 | from wikiteam3.dumpgenerator.dump.image.html_regexs import REGEX_CANDIDATES
10 |
11 | ONLINE = True
12 |
13 | HTML_DIR = Path(__file__).parents[2] / "test" / "data" / "html_regexs"
14 | os.makedirs(HTML_DIR, exist_ok=True)
15 |
16 |
17 | def prepare_raws_from_urls(urls: Dict[str, str]):
18 | sess = requests.Session()
19 | raws: Dict[str, str] = {}
20 | for site, url in urls.items():
21 | try:
22 | resp = sess.get(url, timeout=10, allow_redirects=True)
23 | except Exception as e:
24 |             print(f"WARNING: Could not fetch {url}: {e}")
25 | continue
26 |
27 | if resp.status_code == 200:
28 | raws[url] = resp.text
29 | if not os.path.exists(HTML_DIR / f"{site}.html"):
30 | with open(HTML_DIR / f"{site}.html", "w", encoding="utf-8") as f:
31 | f.write(resp.text)
32 | else:
33 |             print(
34 |                 f"WARNING: Could not fetch {url}: "
35 |                 f"status_code: {resp.status_code}"
36 |             )
37 |
38 | return raws
39 |
40 |
41 | class TestRegexs:
42 | class TestRegexsOnline:
43 | listFiles_urls = {
44 | # site-date: url , `limit=` for counting the number of matches
45 | "archiveteam.org-20230701": "https://wiki.archiveteam.org/index.php?title=Special:ListFiles&sort=byname&limit=7",
46 | "wiki.othing.xyz-20230701": "https://wiki.othing.xyz/index.php?title=Special:ListFiles&sort=byname",
47 | "mediawiki.org-20230701": "https://www.mediawiki.org/w/index.php?title=Special:ListFiles&sort=byname&limit=7",
48 | "asoiaf.fandom.com-20230701": "https://asoiaf.fandom.com/zh/wiki/Special:文件列表?sort=byname&limit=7",
49 | # only for local testing:
50 | # "commons.moegirl.org.cn-20230701": "https://commons.moegirl.org.cn/index.php?title=Special:ListFiles&sort=byname&limit=7",
51 | # # login required:
52 | # "group0.mediawiki.demo.save-web.org_mediawiki-1.23.17-20230701": "http://group0.mediawiki.demo.save-web.org/mediawiki-1.23.17/index.php?title=Special:文件列表&limit=1",
53 | # "group1.mediawiki.demo.save-web.org_mediawiki-1.27.7-20230701": "http://group1.mediawiki.demo.save-web.org/mediawiki-1.27.7/index.php?title=Special:ListFiles&limit=2",
54 | }
55 | raws: Dict[str, str] = {}
56 |
57 | def test_online(self):
58 | if not ONLINE:
59 | pytest.skip("Online test skipped")
60 | self.raws = prepare_raws_from_urls(self.listFiles_urls)
61 | assert len(self.raws) != 0, "Could not fetch any of the URLs"
62 | for url, raw in self.raws.items():
63 | best_matched = 0
64 | regexp_best = None
65 |
66 | for regexp in REGEX_CANDIDATES:
67 | _count = len(re.findall(regexp, raw))
68 | if _count > best_matched:
69 | best_matched = _count
70 | regexp_best = regexp
71 |
72 | assert (
73 | regexp_best is not None
74 | ), f"Could not find a proper regexp to parse the HTML for {url} (online)"
75 |
76 | if "limit=" in url:
77 | limit = int(url.split("limit=")[-1])
78 | assert (
79 | len(re.findall(regexp_best, raw)) == limit
80 | ), f"Could not find {limit} matches for {url} (online)"
81 |
82 | class TestRegexsOffline:
83 | html_files = os.listdir(HTML_DIR)
84 | raws: Dict[str, str] = {}
85 | for html_file in html_files:
86 | with open(HTML_DIR / html_file, encoding="utf-8") as f:
87 | raws[html_file] = f.read()
88 | assert len(raws) != 0, f"Could not find any HTML files in {HTML_DIR}"
89 |
90 | def test_offline(self):
91 | assert len(self.raws) != 0, "Could not fetch any of the URLs"
92 | for site, raw in self.raws.items():
93 | best_matched = 0
94 | regexp_best = None
95 |
96 | for regexp in REGEX_CANDIDATES:
97 | _count = len(re.findall(regexp, raw))
98 | if _count > best_matched:
99 | best_matched = _count
100 | regexp_best = regexp
101 |
102 | assert (
103 | regexp_best is not None
104 | ), f"Could not find a proper regexp to parse the HTML for {site} (local)"
105 |
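106 | 
107 | if __name__ == "__main__":
108 |     # Illustrative only (an editor's addition): score every candidate
109 |     # regex against the locally saved Special:ListFiles pages and report
110 |     # the best match count, mirroring what the tests above do.
111 |     for html_file in os.listdir(HTML_DIR):
112 |         raw = (HTML_DIR / html_file).read_text(encoding="utf-8")
113 |         scores = {rx: len(re.findall(rx, raw)) for rx in REGEX_CANDIDATES}
114 |         best = max(scores, key=scores.get)
115 |         print(html_file, "->", scores[best], "matches")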
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/dump/xmldump/xml_header.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 | import sys
4 | from typing import Tuple
5 |
6 | import requests
7 |
8 | from wikiteam3.dumpgenerator.config import Config
9 | from wikiteam3.dumpgenerator.dump.page.xmlexport.page_xml import getXMLPage
10 | from wikiteam3.dumpgenerator.exceptions import ExportAbortedError, PageMissingError
11 | from wikiteam3.dumpgenerator.log import logerror
12 |
13 |
14 | def getXMLHeader(config: Config = None, session=None) -> Tuple[str, Config]:
15 | """Retrieve a random page to extract XML headers (namespace info, etc)"""
16 | print(config.api)
17 | xml = ""
18 | disableSpecialExport = config.xmlrevisions or config.xmlapiexport
19 | randomtitle = "Main_Page"
20 | if disableSpecialExport and config.api and config.api.endswith("api.php"):
21 | try:
22 | print("Getting the XML header from the API")
23 | # Export and exportnowrap exist from MediaWiki 1.15, allpages from 1.8
24 | r = session.get(
25 | f"{config.api}?action=query&export=1&exportnowrap=1&list=allpages&aplimit=1",
26 | timeout=10,
27 | )
28 | xml: str = r.text
29 | # Otherwise try without exportnowrap, e.g. Wikia returns a blank page on 1.19
30 |             if not re.match(r"\s*<mediawiki", xml):
31 |                 r = session.get(
32 |                     f"{config.api}?action=query&export=1&list=allpages&aplimit=1&format=json",
33 |                     timeout=10,
34 |                 )
35 |                 try:
36 |                     xml = r.json()["query"]["export"]["*"]
37 |                 except KeyError:
38 |                     pass
39 |             if not re.match(r"\s*<mediawiki", xml):
40 |                 # Do without a generator, use our usual trick of a random page title
41 |                 r = session.get(
42 |                     f"{config.api}?action=query&export=1&exportnowrap=1&titles={randomtitle}",
43 |                     timeout=10,
44 |                 )
45 |                 xml = r.text
46 |             # Again try without exportnowrap
47 |             if not re.match(r"\s*<mediawiki", xml):
48 |                 r = session.get(
49 |                     f"{config.api}?action=query&export=1&format=json&titles={randomtitle}",
50 |                     timeout=10,
51 |                 )
52 |                 try:
53 |                     xml = r.json()["query"]["export"]["*"]
54 |                 except KeyError:
55 |                     pass
56 |         except requests.exceptions.RetryError:
57 |             pass
58 | 
59 |     else:
60 |         try:
61 |             xml = "".join(
62 |                 list(
63 |                     getXMLPage(
64 |                         config=config,
65 |                         title=randomtitle,
66 |                         verbose=False,
67 |                         session=session,
68 |                     )
69 |                 )
70 |             )
71 |         except PageMissingError as pme:
72 |             # The <page> does not exist. Not a problem, if we get the <siteinfo>.
73 | xml = pme.xml
74 | except ExportAbortedError:
75 | try:
76 | if config.api:
77 | print("Trying the local name for the Special namespace instead")
78 | r = session.get(
79 | url=config.api,
80 | params={
81 | "action": "query",
82 | "meta": "siteinfo",
83 | "siprop": "namespaces",
84 | "format": "json",
85 | },
86 | timeout=120,
87 | )
88 | config.export = (
89 | json.loads(r.text)["query"]["namespaces"]["-1"]["*"] + ":Export"
90 | )
91 | xml = "".join(
92 | list(
93 | getXMLPage(
94 | config=config,
95 | title=randomtitle,
96 | verbose=False,
97 | session=session,
98 | )
99 | )
100 | )
101 | except PageMissingError as pme:
102 | xml = pme.xml
103 | except ExportAbortedError:
104 | pass
105 |
106 |     header = xml.split("</mediawiki>")[0]
107 |     if not re.match(r"\s*<mediawiki", xml):
108 |         print("XML export on this wiki is broken, quitting.")
109 |         logerror(config=config, text="XML export on this wiki is broken, quitting.")
110 |         sys.exit()
111 |     return header, config
112 | 
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/api/api.py:
--------------------------------------------------------------------------------
1 | import re
2 | from urllib.parse import urljoin, urlparse
3 | 
4 | import mwclient
5 | import requests
6 | 
7 | from wikiteam3.dumpgenerator.api.get_json import getJSON
8 | from wikiteam3.utils import getUserAgent
9 | 
10 | 
11 | def checkAPI(api="", session: requests.Session = None):
12 |     """Checking API availability"""
13 |     # handle redirects: some wikis redirect api.php to the
14 |     # canonical URL, so follow along (the elif below updates
15 |     # `api`), but give up after a few attempts.
16 |     for i in range(4):
17 |         print("Checking API...", api)
18 |         r = session.get(
19 |             url=api,
20 |             params={"action": "query", "meta": "siteinfo", "format": "json"},
21 |             timeout=30,
22 |         )
23 |         # 200: the API answered. 3xx: follow the redirect and
24 |         # try the new URL. 4xx/5xx: report the error and abort.
25 |         # Never loop more than a handful of times:
26 |         if i >= 4:
27 | break
28 | if r.status_code == 200:
29 | break
30 | elif r.status_code < 400:
31 | api = r.url
32 | elif r.status_code > 400:
33 | print(
34 | "MediaWiki API URL not found or giving error: HTTP %d" % r.status_code
35 | )
36 | return None
37 | if "MediaWiki API is not enabled for this site." in r.text:
38 | return None
39 | try:
40 | result = getJSON(r)
41 | index = None
42 | if result:
43 | try:
44 | index = (
45 | result["query"]["general"]["server"]
46 | + result["query"]["general"]["script"]
47 | )
48 | return (True, index, api)
49 | except KeyError:
50 | print("MediaWiki API seems to work but returned no index URL")
51 | return (True, None, api)
52 | except ValueError:
53 | print(repr(r.text))
54 | print("MediaWiki API returned data we could not parse")
55 | return None
56 | return None
57 |
58 |
59 | def mwGetAPIAndIndex(url="", session: requests.Session = None):
60 | """Returns the MediaWiki API and Index.php"""
61 |
62 | api = ""
63 | index = ""
64 | if not session:
65 | session = requests.Session() # Create a new session
66 | session.headers.update({"User-Agent": getUserAgent()})
67 | r = session.post(url=url, timeout=120)
68 | result = r.text
69 |
70 | if m := re.findall(
71 | r'(?im)<\s*link\s*rel="EditURI"\s*type="application/rsd\+xml"\s*href="([^>]+?)\?action=rsd"\s*/\s*>',
72 | result,
73 | ):
74 | api = m[0]
75 | if api.startswith("//"): # gentoo wiki
76 | api = url.split("//")[0] + api
77 | if m := re.findall(
78 |         r'<li id="ca-viewsource"[^>]*?>\s*(?:<span>)?\s*<a href="([^\?]+?)\?',
79 |         result,
80 |     ):
81 |         index = m[0]
82 |     else:
83 |         if m := re.findall(
84 |             r'<li id="ca-history"[^>]*?>\s*(?:<span>)?\s*<a href="([^\?]+?)\?',
85 |             result,
86 |         ):
87 |             index = m[0]
88 | 
89 |     if index:
90 |         if index.startswith("/"):
91 |             # make a relative index URL absolute, based on the API URL
92 |             index = "/".join(api.split("/")[:-1]) + "/" + index.split("/")[-1]
93 |     elif api:
94 |         # no "view source" or "history" link found in the HTML:
95 |         # guess the index URL from the API URL instead;
96 |         # index.php5 was used by some very old PHP 5 installations
97 |         # (prefer whichever form the HTML references more often)
98 |         if len(re.findall(r"/index\.php5\?", result)) > len(
99 | re.findall(r"/index\.php\?", result)
100 | ):
101 | index = "/".join(api.split("/")[:-1]) + "/index.php5"
102 | else:
103 | index = "/".join(api.split("/")[:-1]) + "/index.php"
104 |
105 | if not api and index:
106 | api = urljoin(index, "api.php")
107 |
108 | return api, index
109 |
110 |
111 | def checkRetryAPI(api="", apiclient=False, session: requests.Session = None):
112 | """Call checkAPI and mwclient if necessary"""
113 | check = None
114 | try:
115 | check = checkAPI(api, session=session)
116 | except requests.exceptions.ConnectionError as e:
117 | print(f"Connection error: {str(e)}")
118 |
119 | if check and apiclient:
120 | apiurl = urlparse(api)
121 | try:
122 | site = mwclient.Site(
123 | apiurl.netloc,
124 | apiurl.path.replace("api.php", ""),
125 | scheme=apiurl.scheme,
126 | pool=session,
127 | )
128 | except KeyError:
129 | # Probably KeyError: 'query'
130 | if apiurl.scheme == "https":
131 | newscheme = "http"
132 | api = api.replace("https://", "http://")
133 | else:
134 | newscheme = "https"
135 | api = api.replace("http://", "https://")
136 | print(
137 | f"WARNING: The provided API URL did not work with mwclient. Switched protocol to: {newscheme}"
138 | )
139 |
140 | try:
141 | site = mwclient.Site(
142 | apiurl.netloc,
143 | apiurl.path.replace("api.php", ""),
144 | scheme=newscheme,
145 | pool=session,
146 | )
147 | except KeyError:
148 | check = False
149 |
150 | return check, api
151 |
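152 | 
153 | if __name__ == "__main__":
154 |     # Illustrative sketch (an editor's addition, not part of the original
155 |     # module): discover a wiki's api.php and index.php from its homepage,
156 |     # then verify that the API answers. The URL is a placeholder.
157 |     sess = requests.Session()
158 |     sess.headers.update({"User-Agent": getUserAgent()})
159 |     api, index = mwGetAPIAndIndex("https://wiki.example.org/", session=sess)
160 |     print("api.php:", api)
161 |     print("index.php:", index)
162 |     if api:
163 |         print("checkAPI:", checkRetryAPI(api=api, session=sess))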
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions_page.py:
--------------------------------------------------------------------------------
1 | from lxml import etree
2 | from lxml.builder import E
3 |
4 | from wikiteam3.dumpgenerator.exceptions import PageMissingError
5 |
6 |
7 | def makeXmlPageFromRaw(xml, arvcontinue) -> str:
8 |     """Discard the metadata around a <page> element in a <mediawiki> string"""
9 | root = etree.XML(xml)
10 | find = etree.XPath("//*[local-name() = 'page']")
11 | page = find(root)[0]
12 | if arvcontinue is not None:
13 | page.attrib["arvcontinue"] = arvcontinue
14 |     # The <page> tag will inherit the namespace, like:
15 |     # <page xmlns="http://www.mediawiki.org/xml/export-0.10/">
16 | # FIXME: pretty_print doesn't seem to work, only adds a newline
17 | return etree.tostring(page, pretty_print=True, encoding="unicode")
18 |
19 |
20 | def makeXmlFromPage(page: dict, arvcontinue) -> str:
21 | """Output an XML document as a string from a page as in the API JSON"""
22 | try:
23 | p = E.page(
24 | E.title(str(page["title"])),
25 | E.ns(str(page["ns"])),
26 | E.id(str(page["pageid"])),
27 | )
28 | if arvcontinue is not None:
29 | p.attrib["arvcontinue"] = arvcontinue
30 | for rev in page["revisions"]:
31 | # Older releases like MediaWiki 1.16 do not return all fields.
32 | userid = rev["userid"] if "userid" in rev else 0
33 | size = rev["size"] if "size" in rev else 0
34 | # Create rev object
35 | revision = [
36 | E.id(str(rev["revid"])),
37 | E.timestamp(rev["timestamp"]),
38 | ]
39 |
40 | # The text, user, comment, sha1 may be deleted/suppressed
41 | if ("texthidden" in rev) or ("textmissing" in rev):
42 | print(
43 | "Warning: text missing/hidden in pageid %d revid %d"
44 | % (page["pageid"], rev["revid"])
45 | )
46 | revision.append(
47 | E.text(
48 | **{
49 | "bytes": str(size),
50 | "deleted": "deleted",
51 | }
52 | )
53 | )
54 | else:
55 | text = str(rev["*"])
56 | revision.append(
57 | E.text(
58 | text,
59 | **{
60 | "bytes": str(size),
61 | "{http://www.w3.org/XML/1998/namespace}space": "preserve",
62 | }
63 | )
64 | )
65 |
66 | if "user" not in rev:
67 | if "userhidden" not in rev:
68 | print(
69 | "Warning: user not hidden but missing user in pageid %d revid %d"
70 | % (page["pageid"], rev["revid"])
71 | )
72 | revision.append(E.contributor(deleted="deleted"))
73 | else:
74 | revision.append(
75 | E.contributor(
76 | E.username(str(rev["user"])),
77 | E.id(str(userid)),
78 | )
79 | )
80 |
81 | if "sha1" in rev:
82 | revision.append(E.sha1(rev["sha1"]))
83 |
84 | elif "sha1hidden" in rev:
85 | revision.append(E.sha1()) # stub
86 | if "commenthidden" in rev:
87 | revision.append(E.comment(deleted="deleted"))
88 | elif "comment" in rev and rev["comment"]:
89 | revision.append(E.comment(str(rev["comment"])))
90 |
91 | if "contentmodel" in rev:
92 | revision.append(E.model(rev["contentmodel"]))
93 | if "contentformat" in rev:
94 | revision.append(E.format(rev["contentformat"]))
95 | # Sometimes a missing parentid is not replaced with a 0 as it should.
96 | if "parentid" in rev:
97 | revision.append(E.parentid(str(rev["parentid"])))
98 |
99 | if "minor" in rev:
100 | revision.append(E.minor())
101 |
102 | # mwcli's dump.xml order
103 | revisionTags = [
104 | "id",
105 | "parentid",
106 | "timestamp",
107 | "contributor",
108 | "minor",
109 | "comment",
110 | "origin",
111 | "model",
112 | "format",
113 | "text",
114 | "sha1",
115 | ]
116 | revisionElementsDict = {elem.tag: elem for elem in revision}
117 | _revision = E.revision()
118 | for tag in revisionTags:
119 | if tag in revisionElementsDict:
120 | _revision.append(revisionElementsDict.pop(tag))
121 | for elem in revisionElementsDict.values():
122 | _revision.append(elem)
123 | p.append(_revision)
124 | except KeyError as e:
125 | print(e)
126 | raise PageMissingError(page["title"], e)
127 | return etree.tostring(p, pretty_print=True, encoding="unicode")
128 |
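129 | 
130 | if __name__ == "__main__":
131 |     # Illustrative only (an editor's addition, not part of the original
132 |     # module): a minimal page dict in the shape the MediaWiki API returns
133 |     # for action=query&prop=revisions, to show the <page> XML it yields.
134 |     demo_page = {
135 |         "title": "Main Page",
136 |         "ns": 0,
137 |         "pageid": 1,
138 |         "revisions": [
139 |             {
140 |                 "revid": 42,
141 |                 "parentid": 0,
142 |                 "timestamp": "2023-07-01T00:00:00Z",
143 |                 "user": "Example",
144 |                 "userid": 7,
145 |                 "comment": "demo edit",
146 |                 "size": 11,
147 |                 "*": "Hello wiki!",
148 |             }
149 |         ],
150 |     }
151 |     print(makeXmlFromPage(demo_page, None))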
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/api/wiki_check.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | import requests
4 |
5 | from wikiteam3.utils import getUserAgent
6 |
7 |
8 | def getWikiEngine(url="", session: requests.Session = None) -> str:
9 | """Returns the wiki engine of a URL, if known"""
10 |
11 | if not session:
12 | session = requests.Session() # Create a new session
13 | session.headers.update({"User-Agent": getUserAgent()})
14 | r = session.post(url=url, timeout=30)
15 | if r.status_code == 405 or not r.text:
16 | r = session.get(url=url, timeout=120)
17 | result = r.text
18 |
19 | wikiengine = "Unknown"
20 | if re.search(
21 | '(?im)(MoinMoin Powered|