├── wikiteam3 ├── __init__.py ├── utils │ ├── xmlutil.py │ ├── __init__.py │ ├── uprint.py │ ├── user_agent.py │ ├── domain.py │ ├── wiki_avoid.py │ ├── monkey_patch.py │ ├── login │ │ ├── __init__.py │ │ ├── index.py │ │ └── api.py │ └── util.py ├── dumpgenerator │ ├── test │ │ ├── __init__.py │ │ ├── test_config.py │ │ └── data │ │ │ └── html_regexs │ │ │ ├── group0.mediawiki.demo.save-web.org_mediawiki-1.16.5-20230701.html │ │ │ ├── group0.mediawiki.demo.save-web.org_mediawiki-1.23.17-20230701.html │ │ │ └── group1.mediawiki.demo.save-web.org_mediawiki-1.27.7-20230701.html │ ├── dump │ │ ├── image │ │ │ ├── __init__.py │ │ │ ├── html_regexs.py │ │ │ └── html_regexs_test.py │ │ ├── misc │ │ │ ├── __init__.py │ │ │ ├── site_info_test.py │ │ │ ├── index_php.py │ │ │ ├── special_version.py │ │ │ ├── special_logs.py │ │ │ └── site_info.py │ │ ├── page │ │ │ ├── __init__.py │ │ │ ├── xmlrev │ │ │ │ ├── __init__.py │ │ │ │ └── xml_revisions_page.py │ │ │ └── xmlexport │ │ │ │ ├── __init__.py │ │ │ │ ├── page_xml.py │ │ │ │ ├── page_xml_export.py │ │ │ │ └── page_xml_api.py │ │ ├── xmldump │ │ │ ├── __init__.py │ │ │ ├── xml_integrity.py │ │ │ ├── xml_truncate.py │ │ │ ├── xml_header.py │ │ │ └── xml_dump.py │ │ ├── __init__.py │ │ └── generator.py │ ├── log │ │ ├── __init__.py │ │ └── log_error.py │ ├── cli │ │ ├── __init__.py │ │ ├── delay.py │ │ └── greeter.py │ ├── __main__.py │ ├── version.py │ ├── api │ │ ├── __init__.py │ │ ├── get_json.py │ │ ├── handle_status_code.py │ │ ├── index_check.py │ │ ├── namespaces.py │ │ ├── api.py │ │ ├── wiki_check.py │ │ └── page_titles.py │ ├── exceptions.py │ ├── __init__.py │ └── config.py └── launcher.py ├── .gitattributes ├── .travis.yml ├── .gitignore ├── .pymarkdown.json ├── .markdownlint.jsonc ├── .github ├── ISSUE_TEMPLATE │ ├── config.yml │ └── bug_report.md └── workflows │ └── test-dumpgenerator.yml ├── .pre-commit-config.yaml ├── PUBLISHING.md ├── pyproject.toml ├── README.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── USAGE.md └── INSTALLATION.md /wikiteam3/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /wikiteam3/utils/xmlutil.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/dump/image/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/dump/misc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/dump/page/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/dump/xmldump/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/dump/page/xmlrev/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/dump/page/xmlexport/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/log/__init__.py: -------------------------------------------------------------------------------- 1 | from .log_error import logerror 2 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/dump/__init__.py: -------------------------------------------------------------------------------- 1 | from .generator import DumpGenerator 2 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/cli/__init__.py: -------------------------------------------------------------------------------- 1 | from .cli import getParameters 2 | from .delay import Delay 3 | from .greeter import bye, welcome 4 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/__main__.py: -------------------------------------------------------------------------------- 1 | if __name__ == "__main__": 2 | import sys 3 | 4 | from .__init__ import main 5 | 6 | sys.exit(main()) 7 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/version.py: -------------------------------------------------------------------------------- 1 | __VERSION__ = "0.4.0-alpha" # major, minor, micro: semver.org 2 | 3 | 4 | def getVersion(): 5 | return __VERSION__ 6 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.com linguist-vendored 2 | *.org linguist-vendored 3 | 4 | *.py text=auto 5 | *.sh text=auto 6 | *.json text=auto 7 | *.txt text=auto 8 | *.md text=auto 9 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3.8 3 | install: 4 | - pip install poetry 5 | - poetry install 6 | script: 7 | - poetry run pytest --verbose -s 8 | notifications: 9 | email: false 10 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/api/__init__.py: -------------------------------------------------------------------------------- 1 | from .api import checkAPI, checkRetryAPI, mwGetAPIAndIndex 2 | from .get_json import getJSON 3 | from .handle_status_code import handleStatusCode 4 | from .wiki_check import getWikiEngine 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | .pytest_cache 3 | keys.txt 4 | batchdownload/keys.txt 5 | batchdownload/dumpgenerator.py 6 | batchdownload/uploader.py 7 | __pycache__ 8 | tests/tmp 9 | dist/ 10 | .DS_Store 11 | desktop.ini 12 | 13 | .venv 14 | .vscode 15 | .idea 16 | -------------------------------------------------------------------------------- /.pymarkdown.json: -------------------------------------------------------------------------------- 1 | { 2 | "plugins": { 3 | "line-length": { 4 | "enabled": false 5 | }, 6 | "no-inline-html": { 7 | 
"allowed_elements": "details,summary,code,!--" 8 | }, 9 | "first-line-heading": { 10 | "enabled": false, 11 | "front_matter_title" : "name" 12 | } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/dump/xmldump/xml_integrity.py: -------------------------------------------------------------------------------- 1 | from typing import * 2 | 3 | from wikiteam3.dumpgenerator.config import Config 4 | 5 | 6 | def checkXMLIntegrity( 7 | config: Config = None, titles: Iterable[str] = None, session=None 8 | ): 9 | """Check XML dump integrity, to detect broken XML chunks""" 10 | # TODO: Fix XML Integrity Check 11 | return 12 | -------------------------------------------------------------------------------- /wikiteam3/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .domain import domain2prefix 2 | from .login import botLogin, clientLogin, fetchLoginToken, indexLogin, uniLogin 3 | from .monkey_patch import mod_requests_text 4 | from .uprint import uprint 5 | from .user_agent import getUserAgent 6 | from .util import cleanHTML, cleanXML, removeIP, sha1File, undoHTMLEntities 7 | from .wiki_avoid import avoidWikimediaProjects 8 | -------------------------------------------------------------------------------- /wikiteam3/utils/uprint.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def uprint(*objects, sep=" ", end="\n", file=sys.stdout): 5 | enc = file.encoding 6 | if enc == "UTF-8": 7 | print(*objects, sep=sep, end=end, file=file) 8 | else: 9 | f = lambda obj: str(obj).encode(enc, errors="backslashreplace").decode(enc) 10 | print(*map(f, objects), sep=sep, end=end, file=file) 11 | -------------------------------------------------------------------------------- /.markdownlint.jsonc: -------------------------------------------------------------------------------- 1 | // If you change any options here, 2 | // please change them in .pymarkdown.jsonc 3 | // as well! 4 | { 5 | "line-length": false, 6 | "no-inline-html": { 7 | "allowed_elements": [ 8 | "details", 9 | "summary", 10 | "code" 11 | ] 12 | }, 13 | "first-line-heading": { 14 | "front_matter_title" : "name" 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/api/get_json.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | 4 | def getJSON(request: requests.Response): 5 | """Strip Unicode BOM""" 6 | if request.text.startswith("\ufeff"): 7 | request.encoding = "utf-8-sig" 8 | # request.encoding = request.apparent_encoding 9 | try: 10 | return request.json() 11 | except: 12 | # Maybe an older API version which did not return correct JSON 13 | return {} 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | contact_links: 3 | - name: Get help using MediaWiki Dump Generator 4 | url: https://github.com/orgs/mediawiki-client-tools/discussions/categories/q-a 5 | about: If you need help (other than reporting a bug), you can reach out on our Discussions Q&A. 6 | - name: Anything else 7 | url: https://github.com/orgs/mediawiki-client-tools/discussions 8 | about: You can read and post in our GitHub Discussions. 
9 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/log/log_error.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from wikiteam3.dumpgenerator.config import Config 4 | 5 | 6 | def logerror(config: Config = None, to_stdout=False, text="") -> None: 7 | """Log error in errors.log""" 8 | if text: 9 | with open(f"{config.path}/errors.log", "a", encoding="utf-8") as outfile: 10 | output = ( 11 | f'{datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")}: {text}\n' 12 | ) 13 | outfile.write(output) 14 | if to_stdout: 15 | print(text) 16 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/dump/misc/site_info_test.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pytest 4 | import requests 5 | 6 | from wikiteam3.dumpgenerator.test.test_config import get_config 7 | 8 | from .site_info import saveSiteInfo 9 | 10 | 11 | def test_mediawiki_version_match(): 12 | with get_config("1.39.7") as config: 13 | sess = requests.Session() 14 | saveSiteInfo(config, sess) 15 | with open(f"{config.path}/siteinfo.json") as f: 16 | siteInfoJson = json.load(f) 17 | assert siteInfoJson["query"]["general"]["generator"] == "MediaWiki 1.39.7" 18 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml.py: -------------------------------------------------------------------------------- 1 | from wikiteam3.dumpgenerator.config import Config 2 | 3 | from .page_xml_api import getXMLPageWithApi 4 | from .page_xml_export import getXMLPageWithExport 5 | 6 | 7 | def getXMLPage(config: Config = None, title="", verbose=True, session=None): 8 | if config.xmlapiexport: 9 | return getXMLPageWithApi( 10 | config=config, title=title, verbose=verbose, session=session 11 | ) 12 | else: 13 | return getXMLPageWithExport( 14 | config=config, title=title, verbose=verbose, session=session 15 | ) 16 | -------------------------------------------------------------------------------- /wikiteam3/utils/user_agent.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | # Return a cool user-agent to hide Python user-agent 4 | 5 | 6 | def getUserAgent(): 7 | useragent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36" 8 | return useragent 9 | 10 | 11 | def setupUserAgent(session: requests.Session): 12 | session._orirequest = session.request 13 | 14 | def newrequest(*args, **kwargs): 15 | session.headers.update({"User-Agent": getUserAgent()}) 16 | return session._orirequest(*args, **kwargs) 17 | 18 | session.request = newrequest 19 | -------------------------------------------------------------------------------- /wikiteam3/utils/domain.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from wikiteam3.dumpgenerator.config import Config 4 | 5 | 6 | def domain2prefix(config: Config = None, session=None): 7 | """Convert domain name to a valid prefix filename.""" 8 | 9 | # At this point, both api and index are supposed to be defined 10 | domain = "" 11 | if config.api: 12 | domain = config.api 13 | elif config.index: 14 | domain = config.index 15 | 16 | domain = domain.lower() 17 | domain = re.sub(r"(https?://|www\.|/index\.php.*|/api\.php.*)", "", domain) 18 | 
domain = domain.rstrip("/") 19 | domain = re.sub(r"/", "_", domain) 20 | domain = re.sub(r"\.", "", domain) 21 | domain = re.sub(r"[^A-Za-z0-9]", "_", domain) 22 | 23 | return domain 24 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/dump/misc/index_php.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from wikiteam3.dumpgenerator.cli import Delay 4 | from wikiteam3.dumpgenerator.config import Config 5 | from wikiteam3.utils import removeIP 6 | 7 | 8 | def saveIndexPHP(config: Config = None, session=None): 9 | """Save index.php as .html, to preserve license details available at the botom of the page""" 10 | 11 | if os.path.exists(f"{config.path}/index.html"): 12 | print("index.html exists, do not overwrite") 13 | else: 14 | print("Downloading index.php (Main Page) as index.html") 15 | r = session.post(url=config.index, params=None, timeout=10) 16 | raw = str(r.text) 17 | Delay(config=config, session=session) 18 | raw = removeIP(raw=raw) 19 | with open(f"{config.path}/index.html", "w", encoding="utf-8") as outfile: 20 | outfile.write(raw) 21 | -------------------------------------------------------------------------------- /wikiteam3/utils/wiki_avoid.py: -------------------------------------------------------------------------------- 1 | import re 2 | import sys 3 | from typing import * 4 | 5 | from wikiteam3.dumpgenerator.config import Config 6 | 7 | 8 | def avoidWikimediaProjects(config: Config = None, other: Dict = None): 9 | """Skip Wikimedia projects and redirect to the dumps website""" 10 | 11 | # notice about wikipedia dumps 12 | url = "" 13 | if config.api: 14 | url += config.api 15 | if config.index: 16 | url = url + config.index 17 | if re.findall( 18 | r"(?i)(wikipedia|wikisource|wiktionary|wikibooks|wikiversity|wikimedia|wikispecies|wikiquote|wikinews|wikidata|wikivoyage)\.org", 19 | url, 20 | ): 21 | print("PLEASE, DO NOT USE THIS SCRIPT TO DOWNLOAD WIKIMEDIA PROJECTS!") 22 | print("Download the dumps from http://dumps.wikimedia.org") 23 | if not other["force"]: 24 | print("Thanks!") 25 | sys.exit() 26 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/exceptions.py: -------------------------------------------------------------------------------- 1 | class PageMissingError(Exception): 2 | def __init__(self, title, xml): 3 | self.title = title 4 | self.xml = xml 5 | 6 | def __str__(self): 7 | return f"page '{self.title}' not found" 8 | 9 | 10 | class ExportAbortedError(Exception): 11 | def __init__(self, index): 12 | self.index = index 13 | 14 | def __str__(self): 15 | return f"Export from '{self.index}' did not return anything." 16 | 17 | 18 | class FileSizeError(Exception): 19 | def __init__(self, file, size): 20 | self.file = file 21 | self.size = size 22 | 23 | def __str__(self): 24 | return f"File '{self.file}' size is not match '{self.size}'." 25 | 26 | 27 | class FileSha1Error(Exception): 28 | def __init__(self, file, sha1): 29 | self.file = file 30 | self.sha1 = sha1 31 | 32 | def __str__(self): 33 | return f"File '{self.file}' sha1 is not match '{self.sha1}'." 
34 | 
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/dump/misc/special_version.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from wikiteam3.dumpgenerator.cli import Delay
 4 | from wikiteam3.dumpgenerator.config import Config
 5 | from wikiteam3.utils import removeIP
 6 | 
 7 | 
 8 | def saveSpecialVersion(config: Config = None, session=None):
 9 |     """Save Special:Version as .html, to preserve extensions details"""
10 | 
11 |     if os.path.exists(f"{config.path}/SpecialVersion.html"):
12 |         print("SpecialVersion.html exists, do not overwrite")
13 |     else:
14 |         print("Downloading Special:Version with extensions and other related info")
15 |         r = session.post(
16 |             url=config.index, params={"title": "Special:Version"}, timeout=10
17 |         )
18 |         raw = str(r.text)
19 |         Delay(config=config, session=session)
20 |         raw = str(removeIP(raw=raw))
21 |         with open(
22 |             f"{config.path}/SpecialVersion.html", "w", encoding="utf-8"
23 |         ) as outfile:
24 |             outfile.write(raw)
25 | 
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/__init__.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # DumpGenerator A generator of dumps for wikis
 4 | # Copyright (C) 2011-2018 WikiTeam developers
 5 | # This program is free software: you can redistribute it and/or modify
 6 | # it under the terms of the GNU General Public License as published by
 7 | # the Free Software Foundation, either version 3 of the License, or
 8 | # (at your option) any later version.
 9 | #
10 | # This program is distributed in the hope that it will be useful,
11 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | # GNU General Public License for more details.
14 | #
15 | # You should have received a copy of the GNU General Public License
16 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
17 | 
18 | # To learn more, read the documentation:
19 | #     https://github.com/WikiTeam/wikiteam/wiki
20 | 
21 | 
22 | from wikiteam3.dumpgenerator.dump import DumpGenerator
23 | 
24 | 
25 | def main():
26 |     DumpGenerator()
27 | 
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/dump/misc/special_logs.py:
--------------------------------------------------------------------------------
 1 | from wikiteam3.dumpgenerator.cli import Delay
 2 | from wikiteam3.dumpgenerator.config import Config
 3 | 
 4 | 
 5 | def saveLogs(config: Config = None, session=None):
 6 |     """Save Special:Log"""
 7 |     # get all logs from Special:Log
 8 |     """parse
 9 |     <select name='type'>
10 |     <option value="block">Bloqueos de usuarios</option>
11 |     <option value="rights">Cambios de perfil de usuario</option>
12 |     <option value="protect" selected="selected">Protecciones de páginas</option>
13 |     <option value="delete">Registro de borrados</option>
14 |     <option value="newusers">Registro de usuarios nuevos</option>
15 |     <option value="merge">Registro de fusiones</option>
16 |     <option value="import">Registro de importaciones</option>
17 |     <option value="patrol">Registro de revisiones</option>
18 |     <option value="move">Registro de traslados</option>
19 |     <option value="upload">Subidas de archivos</option>
20 |     <option value="">Todos los registros</option>
21 |     </select>
22 |     """
23 |     Delay(config=config, session=session)
24 | 
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/test/test_config.py:
--------------------------------------------------------------------------------
 1 | import copy
 2 | import tempfile
 3 | from contextlib import contextmanager
 4 | 
 5 | from wikiteam3.dumpgenerator.cli import getParameters
 6 | from wikiteam3.dumpgenerator.config import newConfig
 7 | 
 8 | CONFIG_CACHE = {}
 9 | 
10 | 
11 | @contextmanager
12 | def _new_config_from_parameter(params):
13 |     _params = tuple(params)
14 |     # A `return` before the first `yield` would break this context manager;
15 |     # cache the parsed config and always fall through to the yield.
16 |     if _params not in CONFIG_CACHE:
17 |         config, _ = getParameters(["--path=.", "--xml"] + list(params))
18 |         CONFIG_CACHE[_params] = config
19 |     config = CONFIG_CACHE[_params]
20 |     _config = newConfig(copy.deepcopy(config.asdict()))
21 |     with tempfile.TemporaryDirectory(prefix="wikiteam3test_") as tmpdir:
22 |         _config.path = tmpdir
23 |         yield _config
24 | 
25 | 
26 | def get_config(mediawiki_ver, api=True):
27 |     assert api is True
28 |     if mediawiki_ver == "1.39.7":
29 |         return _new_config_from_parameter(
30 |             [
31 |                 "--api",
32 |                 "https://testw.fandom.com/api.php",
33 |             ]
34 |         )
35 | 
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/api/handle_status_code.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | 
 4 | def handleStatusCode(response):
 5 |     statuscode = response.status_code
 6 |     if statuscode >= 200 and statuscode < 300:
 7 |         return
 8 | 
 9 |     print("HTTP Error %d." % statuscode)
10 |     if statuscode >= 300 and statuscode < 400:
11 |         print("Redirect should happen automatically: please report this as a bug.")
12 |         print(response.url)
13 | 
14 |     elif statuscode == 400:
15 |         print("Bad Request: The wiki may be malfunctioning.")
16 |         print("Please try again later.")
17 |         print(response.url)
18 |         sys.exit(1)
19 | 
20 |     elif statuscode in [401, 403]:
21 |         print("Authentication required.")
22 |         print("Please use --user and --pass.")
23 |         print(response.url)
24 | 
25 |     elif statuscode == 404:
26 |         print("Not found.
Is Special:Export enabled for this wiki?") 27 | print(response.url) 28 | sys.exit(1) 29 | 30 | elif statuscode == 429 or (statuscode >= 500 and statuscode < 600): 31 | print("Server error, max retries exceeded.") 32 | print("Please resume the dump later.") 33 | print(response.url) 34 | sys.exit(1) 35 | -------------------------------------------------------------------------------- /wikiteam3/utils/monkey_patch.py: -------------------------------------------------------------------------------- 1 | import requests 2 | 3 | from wikiteam3.dumpgenerator.cli.delay import Delay 4 | 5 | 6 | def mod_requests_text(requests: requests): 7 | """Monkey patch `requests.Response.text` to remove BOM""" 8 | 9 | def new_text(self): 10 | return self.content.lstrip(b"\xef\xbb\xbf").decode(self.encoding) 11 | 12 | requests.Response.text = property(new_text) 13 | 14 | 15 | class DelaySession: 16 | """Monkey patch `requests.Session.send` to add delay""" 17 | 18 | def __init__(self, session, msg=None, delay=None, config=None): 19 | self.session = session 20 | self.msg = msg 21 | self.delay = delay 22 | self.old_send = None 23 | self.config = config 24 | 25 | def hijack(self): 26 | """Don't forget to call `release()`""" 27 | 28 | def new_send(request, **kwargs): 29 | Delay(msg=self.msg, delay=self.delay, config=self.config) 30 | return self.old_send(request, **kwargs) 31 | 32 | self.old_send = self.session.send 33 | self.session.send = new_send 34 | 35 | def release(self): 36 | """Undo monkey patch""" 37 | self.session.send = self.old_send 38 | del self 39 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/api/index_check.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import requests 4 | 5 | 6 | def checkIndex(index="", cookies="", session: requests.Session = None): 7 | """Checking index.php availability""" 8 | r = session.post(url=index, data={"title": "Special:Version"}, timeout=30) 9 | if r.status_code >= 400: 10 | print(f"ERROR: The wiki returned status code HTTP {r.status_code}") 11 | return False 12 | raw = r.text 13 | print("Checking index.php...", index) 14 | # Workaround for issue 71 15 | if ( 16 | re.search( 17 | '(Special:Badtitle|class="permissions-errors"|"wgCanonicalSpecialPageName":"Badtitle"|Login Required)', 18 | raw, 19 | ) 20 | and not cookies 21 | ): 22 | print("ERROR: This wiki requires login and we are not authenticated") 23 | return False 24 | if re.search( 25 | '(page-Index_php|"wgPageName":"Index.php"|"firstHeading">Index.php)', 26 | raw, 27 | ): 28 | print("Looks like the page called Index.php, not index.php itself") 29 | return False 30 | return bool( 31 | re.search( 32 | '(This wiki is powered by|
<h2 id="mw-version-license">
|meta name="generator" content="MediaWiki|class="mediawiki)', 33 | raw, 34 | ) 35 | ) 36 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/cli/delay.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import sys 3 | import threading 4 | import time 5 | 6 | from wikiteam3.dumpgenerator.config import Config 7 | 8 | 9 | class Delay: 10 | done: bool = False 11 | lock: threading.Lock = threading.Lock() 12 | 13 | def animate(self): 14 | while True: 15 | with self.lock: 16 | if self.done: 17 | return 18 | 19 | print("\r" + self.ellipses, end="") 20 | self.ellipses += "." 21 | 22 | time.sleep(0.3) 23 | 24 | def __init__(self, config: Config = None, session=None, msg=None, delay=None): 25 | """Add a delay if configured for that""" 26 | self.ellipses: str = "." 27 | 28 | if delay is None: 29 | delay = config.delay 30 | if delay <= 0: 31 | return 32 | 33 | if msg: 34 | self.ellipses = f"Delay {delay:.1f}s: {msg} {self.ellipses}" 35 | else: 36 | self.ellipses = ("Delay %.1fs " % (delay)) + self.ellipses 37 | 38 | ellipses_animation = threading.Thread(target=self.animate) 39 | ellipses_animation.daemon = True 40 | ellipses_animation.start() 41 | 42 | time.sleep(delay) 43 | 44 | with self.lock: 45 | self.done = True 46 | print("\r" + " " * len(self.ellipses) + "\r", end="") 47 | -------------------------------------------------------------------------------- /.github/workflows/test-dumpgenerator.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions 2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python 3 | 4 | name: dumpgenerator test 5 | 6 | on: 7 | push: 8 | branches: [ "python3" ] 9 | pull_request: 10 | branches: [ "python3" ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["3.8", "3.9", "3.10", "3.11"] 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Set up Python ${{ matrix.python-version }} 24 | uses: actions/setup-python@v4 25 | with: 26 | python-version: ${{ matrix.python-version }} 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | python -m pip install flake8 pytest poetry 31 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 32 | - name: Lint with flake8 33 | run: | 34 | # exit if there are Python syntax errors or undefined names 35 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 36 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 37 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
38 |       - name: run dumpgenerator
39 |         run: |
40 |           python -m wikiteam3.dumpgenerator -h
41 |       - name: Test with pytest
42 |         run: |
43 |           cd wikiteam3/dumpgenerator && pytest && cd ../../
44 | 
--------------------------------------------------------------------------------
/wikiteam3/utils/login/__init__.py:
--------------------------------------------------------------------------------
 1 | """ Provide login functions """
 2 | 
 3 | import time
 4 | 
 5 | import requests
 6 | 
 7 | from wikiteam3.utils.login.api import botLogin, clientLogin, fetchLoginToken
 8 | from wikiteam3.utils.login.index import indexLogin
 9 | 
10 | 
11 | def uniLogin(
12 |     api: str = "",
13 |     index: str = "",
14 |     session: requests.Session = requests.Session(),
15 |     username: str = "",
16 |     password: str = "",
17 | ):
18 |     """Try to log in to a wiki using various methods.\n
19 |     Return `session` on success, else return `None`.\n
20 |     Try: `client login (api) => bot login (api) => index login (index)`"""
21 | 
22 |     if (not api and not index) or (not username or not password):
23 |         print("uniLogin: api or index or username or password is empty")
24 |         return None
25 | 
26 |     if api:
27 |         print("Trying to log in to the wiki using clientLogin... (MW 1.27+)")
28 |         if _session := clientLogin(
29 |             api=api, session=session, username=username, password=password
30 |         ):
31 |             return _session
32 |         time.sleep(5)
33 | 
34 |         print("Trying to log in to the wiki using botLogin... (MW 1.27+)")
35 |         if _session := botLogin(
36 |             api=api, session=session, username=username, password=password
37 |         ):
38 |             return _session
39 |         time.sleep(5)
40 | 
41 |     if index:
42 |         print("Trying to log in to the wiki using indexLogin... (generic)")
43 |         if _session := indexLogin(
44 |             index=index, session=session, username=username, password=password
45 |         ):
46 |             return _session
47 | 
48 |     return None
49 | 
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug Report
 3 | about: Create a report to help us improve
 4 | title: ''
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | 
11 | 
12 | 
14 | 
15 | ## Describe the Bug
16 | 
17 | 
18 | 
19 | ### Expected Behavior
20 | 
21 | 
22 | 
23 | ### Actual Behavior
24 | 
25 | 
26 | 
27 | ## Command for Reproducing the Bug
28 | 
29 | 
31 | 
32 | ```bash
33 | 
34 | ```
35 | 
36 | ## Output
37 | 
38 | <details>
39 | <summary>stdout</summary>
40 | 
41 | 
43 | 
44 | ```bash
45 | 
46 | ```
47 | 
48 | </details>
49 | 
50 | <details>
51 | <summary>errors.log</summary>
52 | 
53 | 
55 | 
56 | ```text
57 | 
58 | ```
59 | 
60 | </details>
61 | 
62 | ## Platform Details
63 | 
64 | 
66 | 
67 | ### Desktop
68 | 
69 | - OS and version:
70 | - File system:
71 | - Python version:
72 | - Command line shell:
73 | - `dumpgenerator` version:
74 | 
75 | ### Smartphone or Tablet
76 | 
77 | - OS:
78 | - Python version:
79 | - Command line shell:
80 | - Terminal application used:
81 | - `dumpgenerator` version:
82 | 
83 | ## Additional Context
84 | 
85 | 
86 | 
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | # Apply to all files without committing:
 2 | #   pre-commit run --all-files
 3 | # Update this file:
 4 | #   pre-commit autoupdate
 5 | default_language_version:
 6 |   python: python3.8
 7 | repos:
 8 |   - repo: https://github.com/python-poetry/poetry
 9 |     rev: 1.6.0
10 |     hooks:
11 |       - id: poetry-check
12 |       # - id: poetry-lock
13 |       - id: poetry-export
14 |         args: ["-f", "requirements.txt", "-o", "requirements.txt"]
15 |   - repo: https://github.com/pre-commit/pre-commit-hooks
16 |     rev: v4.4.0
17 |     hooks:
18 |       - id: check-ast
19 |       - id: fix-byte-order-marker
20 |       - id: check-case-conflict
21 |       - id: check-docstring-first
22 |       - id: check-executables-have-shebangs
23 |       - id: check-json
24 |       - id: check-yaml
25 |       - id: debug-statements
26 |       # - id: detect-aws-credentials
27 |       # - id: detect-private-key
28 |       - id: end-of-file-fixer
29 |       - id: trailing-whitespace
30 |       - id: mixed-line-ending
31 |   # - repo: https://github.com/pre-commit/mirrors-mypy
32 |   #   rev: v0.942
33 |   #   hooks:
34 |   #     - id: mypy
35 |   #       args: [--ignore-missing-imports]
36 |   - repo: https://github.com/PyCQA/isort
37 |     rev: 5.12.0
38 |     hooks:
39 |       - id: isort
40 |         args: ["--profile", "black", "--filter-files"]
41 |   - repo: https://github.com/psf/black
42 |     rev: 23.7.0
43 |     hooks:
44 |       - id: black
45 |   - repo: https://github.com/asottile/pyupgrade
46 |     rev: v3.10.1
47 |     hooks:
48 |       - id: pyupgrade
49 |         args: [--py38-plus]
50 |   - repo: https://github.com/asottile/blacken-docs
51 |     rev: 1.16.0
52 |     hooks:
53 |       - id: blacken-docs
54 |       # additional_dependencies: [black==20.8b1]
55 |   ### Needs argument for disabling line_length
56 |   ### https://github.com/jackdewinter/pymarkdown/blob/main/docs/rules/rule_md013.md
57 |   - repo: https://github.com/jackdewinter/pymarkdown
58 |     rev: v0.9.12
59 |     hooks:
60 |       - id: pymarkdown
61 |         args:
62 |           - --config=.pymarkdown.json
63 |           # - --disable-rules
64 |           # - line-length,no-inline-html
65 |           - scan
66 | 
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/dump/misc/site_info.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | 
 4 | from wikiteam3.dumpgenerator.api import getJSON
 5 | from wikiteam3.dumpgenerator.cli import Delay
 6 | from wikiteam3.dumpgenerator.config import Config
 7 | 
 8 | 
 9 | def saveSiteInfo(config: Config = None, session=None):
10 |     """Save a file with site info"""
11 | 
12 |     if not config.api:
13 |         return
14 |     if os.path.exists(f"{config.path}/siteinfo.json"):
15 |         print("siteinfo.json exists, do not overwrite")
16 |     else:
17 |         print("Downloading site info as siteinfo.json")
18 | 
19 |         # MediaWiki 1.13+
20 |         r = session.get(
21 |             url=config.api,
22 |             params={
23 |                 "action": "query",
24 |                 "meta": "siteinfo",
25 |                 "siprop": "general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo",
"sinumberingroup": 1, 27 | "format": "json", 28 | }, 29 | timeout=10, 30 | ) 31 | # MediaWiki 1.11-1.12 32 | if "query" not in getJSON(r): 33 | r = session.get( 34 | url=config.api, 35 | params={ 36 | "action": "query", 37 | "meta": "siteinfo", 38 | "siprop": "general|namespaces|statistics|dbrepllag|interwikimap", 39 | "format": "json", 40 | }, 41 | timeout=10, 42 | ) 43 | # MediaWiki 1.8-1.10 44 | if "query" not in getJSON(r): 45 | r = session.get( 46 | url=config.api, 47 | params={ 48 | "action": "query", 49 | "meta": "siteinfo", 50 | "siprop": "general|namespaces", 51 | "format": "json", 52 | }, 53 | timeout=10, 54 | ) 55 | result = getJSON(r) 56 | Delay(config=config, session=session) 57 | with open(f"{config.path}/siteinfo.json", "w", encoding="utf-8") as outfile: 58 | outfile.write(json.dumps(result, indent=4, sort_keys=True)) 59 | -------------------------------------------------------------------------------- /PUBLISHING.md: -------------------------------------------------------------------------------- 1 | # Publishing the dump 2 | 3 | Publishing your dumps to the [Internet Archive's wikiteam collection](https://archive.org/details/wikiteam) is easily done. First [sign up](https://archive.org/account/signup) or [login](http://archive.org/account/login.php). 4 | 5 | ## Launcher and uploader 6 | 7 | Instructions on using the scripts `launcher` and `uploader` are in the file [Usage](./USAGE.md). 8 | 9 | ## Automatic publishing 10 | 11 | Just use `uploader` (especially if you have multiple wikis): the script takes the filename of a list of wikis as argument and uploads their dumps to archive.org. You only need to: 12 | 13 | - Check the 7z compressed dumps are in the same directory as `listfile`. The file `listfile` contains a list of the api.php URLs of the wikis to upload, one per line. 14 | - [Retrieve your S3 keys](http://www.archive.org/account/s3.php), save them one per line (in the order provided) in a keys.txt file in same directory as `uploader`. 15 | - Run the script `uploader listfile`. 16 | 17 | ## Manual publishing 18 | 19 | - After running dumpgenerator, in each dump folder, select all files, right-click on the selection, click 7-Zip, click `Add to archive...` and click OK. 20 | - At Archive.org, for each wiki [create a new item](http://archive.org/create/). 21 | - Click `Upload files`. Then either drag and drop the 7-Zip archive onto the box or click `Choose files` and select the 7-Zip archive. 22 | - `Page Title` and `Page URL` will be filled in by the uploader. 23 | - Add a short `Description`, such as a descriptive name fopr the wiki. 24 | - Add `Subject Tags`, separated by commas, these are the keywords that will help the archive to show up in a Internet Archive search, e.g. wikiteam,wiki,subjects of the wiki, and so on. 25 | - `Creator`, can be left blank. 26 | - `Date`, can be left blank. 27 | - `Collection`, select `Community texts`. 28 | - `Language`, select the language of the wiki. 29 | - `License`, click to expand and select Creative Commons, Allow Remixing, Require Share-Alike for a CC-BY-SA licence. 30 | - Click `Upload and Create Your Item`. 31 | 32 | With the subject tag of wikiteam and collection of community texts, your uploads should appear in a search for [subject:"wikiteam" AND collection:opensource](https://archive.org/search?query=subject%3A%22wikiteam%22+AND+collection%3Aopensource). 33 | 34 | ## Info for developers 35 | 36 | - [Internet Archive’s S3 like server API](https://archive.org/developers/ias3.html). 
37 | 
--------------------------------------------------------------------------------
/wikiteam3/utils/login/index.py:
--------------------------------------------------------------------------------
 1 | """ Always available login methods (MW 1.16-1.39).
 2 | Even older versions of MW may work, but have not been tested. """
 3 | 
 4 | from typing import *
 5 | 
 6 | import lxml.html
 7 | import requests
 8 | 
 9 | 
10 | def indexLogin(
11 |     index: str, session: requests.Session, username: str, password: str
12 | ) -> Optional[requests.Session]:
13 |     """Try to log in to a wiki using username and password through `Special:UserLogin`.
14 |     (tested on MW 1.16...1.39)"""
15 |     wpEditToken = None
16 |     wpLoginToken = None
17 | 
18 |     params = {
19 |         "title": "Special:UserLogin",
20 |     }
21 |     r = session.get(index, allow_redirects=True, params=params)
22 | 
23 |     # Sample r.text:
24 |     # MW 1.16:
25 |     # MW 1.39:
26 |     html = lxml.html.fromstring(r.text)
27 |     if "wpLoginToken" in r.text:
28 |         wpLoginToken = html.xpath('//input[@name="wpLoginToken"]/@value')[0]
29 | 
30 |     # Sample r.text:
31 |     # MW 1.16: None
32 |     # MW 1.39:
33 |     if "wpEditToken" in r.text:
34 |         wpEditToken = html.xpath('//input[@name="wpEditToken"]/@value')[0]
35 |         print("index login: wpEditToken found.")
36 | 
37 |     data = {
38 |         "wpName": username,  # required
39 |         "wpPassword": password,  # required
40 |         "wpLoginattempt": "Log in",  # required
41 |         "wpLoginToken": wpLoginToken,  # required
42 |         "wpRemember": "1",  # 0: not remember, 1: remember
43 |         "wpEditToken": wpEditToken,  # introduced before MW 1.27, not sure whether it's required.
44 |         "authAction": "login",  # introduced before MW 1.39.
45 |         "title": "Special:UserLogin",  # introduced before MW 1.39.
46 |         "force": "",  # introduced before MW 1.39, empty string is OK.
47 |     }
48 |     r = session.post(index, allow_redirects=False, params=params, data=data)
49 |     if r.status_code == 302:
50 |         print("index login: Success! Welcome, ", username, "!")
51 |         return session
52 |     else:
53 |         print(
54 |             "index login: Oops! 
Something went wrong -- ", 55 | r.status_code, 56 | "wpLoginToken: ", 57 | wpLoginToken, 58 | "wpEditToken: ", 59 | wpEditToken, 60 | ) 61 | return None 62 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/dump/xmldump/xml_truncate.py: -------------------------------------------------------------------------------- 1 | import os 2 | from io import StringIO 3 | from typing import * 4 | 5 | import lxml.etree 6 | from file_read_backwards import FileReadBackwards 7 | 8 | 9 | def endsWithNewlines(filename: str) -> int: 10 | """Returns the number of newlines at the end of file""" 11 | 12 | with FileReadBackwards(filename, encoding="utf-8") as frb: 13 | newlines = 0 14 | while frb.readline() == "": 15 | newlines += 1 16 | return newlines 17 | 18 | 19 | def addNewline(filename: str) -> None: 20 | """Adds a newline to the end of file""" 21 | 22 | print(f"Adding newline to end of {filename}") 23 | with open(filename, "a", encoding="utf-8") as f: 24 | f.write("\n") 25 | 26 | 27 | def truncateXMLDump(filename: str) -> str: 28 | """Removes incomplete elements from the end of XML dump files""" 29 | 30 | with FileReadBackwards(filename, encoding="utf-8") as frb: 31 | incomplete_segment: str = "" 32 | xml_line: str = frb.readline() 33 | while xml_line and "" not in xml_line: 34 | incomplete_segment = xml_line + incomplete_segment 35 | xml_line = frb.readline() 36 | while xml_line and "" not in xml_line: 37 | incomplete_segment = xml_line + incomplete_segment 38 | xml_line = frb.readline() 39 | incomplete_segment_size = len(incomplete_segment.encode("utf-8")) 40 | file_size = os.path.getsize(filename) 41 | if file_size > incomplete_segment_size: 42 | with open(filename, "r+", encoding="utf-8") as fh: 43 | fh.truncate(file_size - incomplete_segment_size) 44 | else: 45 | print( 46 | 'len(incomplete_segment.encode("utf-8")) returned ' 47 | + str(incomplete_segment_size) 48 | + ", while os.path.getsize(filename) returned " 49 | + str(file_size) 50 | + ", so fh.truncate() would be fh.truncate(" 51 | + str(file_size - incomplete_segment_size) 52 | + "), which would be illegal. Something is seriously wrong here!" 
53 | ) 54 | 55 | # add newline to prevent ` ` in one line 56 | if endsWithNewlines(filename) == 0: 57 | addNewline(filename) 58 | elif endsWithNewlines(filename) > 1: 59 | print(f"WARNING: {filename} has {endsWithNewlines(filename)} newlines") 60 | return incomplete_segment 61 | 62 | 63 | def parseLastPageChunk(chunk) -> Optional[lxml.etree._ElementTree]: 64 | try: 65 | parser = lxml.etree.XMLParser(recover=True) 66 | tree = lxml.etree.parse(StringIO(chunk), parser) 67 | return tree.getroot() 68 | except lxml.etree.LxmlError: 69 | return None 70 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/dump/image/html_regexs.py: -------------------------------------------------------------------------------- 1 | R_NEXT = r"(?\d+)&" 2 | 3 | REGEX_CANDIDATES = [ 4 | # [0] 5 | # archiveteam 1.15.1 Yahoovideo.jpg (file) 6 | # wikanda 1.15.5 Fernandocg 10 | r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+\s*]+>(?P[^<]+)' 11 | # [1] 12 | # wikijuegos 1.9.5 13 | # http://softwarelibre.uca.es/wikijuegos/Especial:Imagelist old 14 | # mediawiki version 15 | , 16 | r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+\s*[^<]+\s*[^<]+\s*]+>(?P[^<]+)' 17 | # [2] 18 | # gentoowiki 1.18 19 | , 20 | r'(?im)]+title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+]+>]+>[^<]+]+>(?P[^<]+)' 21 | # [3] 22 | # http://www.memoryarchive.org/en/index.php?title=Special:Imagelist&sort=byname&limit=50&wpIlMatch= 23 | # (desc) 109 0923.JPG . . 885,713 bytes . . Bfalconer . . 18:44, 17 November 2005
24 | , 25 | '(?ism)]+ title="[^:>]+:(?P[^>]+)">[^<]+[^<]+[^<]+[^<]+]+>(?P[^<]+)' 26 | # [4] 27 | , 28 | ( 29 | r'(?im)\s*]*?>(?P[^>]+)[^<]*?[^<]*?[^<]*?\s*' 30 | r'[^\n\r]*?\s*' 31 | r'[^<]*?\s*' 32 | r'\s*(?:)?(?:)?(?P[^<]+?)(?:)?(?:)?\s*(?:(?:(?!)(?!).)*?)?' 33 | ), 34 | ] 35 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | config = { 3 | "curonly": args.curonly, 4 | "date": datetime.datetime.now().strftime("%Y%m%d"), 5 | "api": api, 6 | "failfast": args.failfast, 7 | "http_method": "POST", 8 | "index": index, 9 | "images": args.images, 10 | "logs": False, 11 | "xml": args.xml, 12 | "xmlrevisions": args.xmlrevisions, 13 | "namespaces": namespaces, 14 | "exnamespaces": exnamespaces, 15 | "path": args.path and os.path.normpath(args.path) or "", 16 | "cookies": args.cookies or "", 17 | "delay": args.delay, 18 | "retries": int(args.retries), 19 | } 20 | """ 21 | 22 | import dataclasses 23 | import json 24 | import sys 25 | from typing import * 26 | 27 | 28 | def _dataclass_from_dict(klass_or_obj, d): 29 | ret = klass_or_obj() if isinstance(klass_or_obj, type) else klass_or_obj 30 | for k, v in d.items(): 31 | if hasattr(ret, k): 32 | setattr(ret, k, v) 33 | return ret 34 | 35 | 36 | @dataclasses.dataclass 37 | class Config: 38 | def asdict(self): 39 | return dataclasses.asdict(self) 40 | 41 | # General params 42 | delay: float = 0.0 43 | retries: int = 0 44 | path: str = "" 45 | logs: bool = False 46 | date: str = False 47 | 48 | # URL params 49 | index: str = "" 50 | api: str = "" 51 | 52 | # Download params 53 | xml: bool = False 54 | curonly: bool = False 55 | xmlapiexport: bool = False 56 | xmlrevisions: bool = False 57 | xmlrevisions_page: bool = False 58 | images: bool = False 59 | namespaces: List[int] = None 60 | exnamespaces: List[int] = None 61 | 62 | api_chunksize: int = 0 # arvlimit, ailimit, etc 63 | export: str = "" # Special:Export page name 64 | http_method: str = "" 65 | 66 | # Meta info params 67 | failfast: bool = False 68 | 69 | templates: bool = False 70 | 71 | 72 | def newConfig(configDict) -> Config: 73 | return _dataclass_from_dict(Config, configDict) 74 | 75 | 76 | def loadConfig(config: Config = None, configfilename=""): 77 | """Load config file""" 78 | 79 | configDict = dataclasses.asdict(config) 80 | 81 | if config.path: 82 | try: 83 | with open(f"{config.path}/{configfilename}", encoding="utf-8") as infile: 84 | configDict.update(json.load(infile)) 85 | return newConfig(configDict) 86 | except: 87 | pass 88 | 89 | print("There is no config file. we can't resume. Start a new dump.") 90 | sys.exit() 91 | 92 | 93 | def saveConfig(config: Config = None, configfilename=""): 94 | """Save config file""" 95 | 96 | with open(f"{config.path}/{configfilename}", "w", encoding="utf-8") as outfile: 97 | json.dump(dataclasses.asdict(config), outfile) 98 | -------------------------------------------------------------------------------- /wikiteam3/utils/login/api.py: -------------------------------------------------------------------------------- 1 | """ Available since MediaWiki 1.27. 
login to a wiki using username and password (API) """ 2 | 3 | from typing import * 4 | 5 | import requests 6 | 7 | 8 | def fetchLoginToken(session: requests.Session, api: str) -> Optional[str]: 9 | """fetch login token by API .(MediaWiki 1.27+)""" 10 | 11 | response = session.get( 12 | url=api, 13 | params={"action": "query", "meta": "tokens", "type": "login", "format": "json"}, 14 | ) 15 | data = response.json() 16 | try: 17 | token = data["query"]["tokens"]["logintoken"] 18 | if type(token) is str: 19 | return token 20 | except KeyError: 21 | print("fetch login token: Oops! Something went wrong -- ", data) 22 | return None 23 | 24 | 25 | def clientLogin( 26 | api: str, session: requests.Session, username: str, password: str 27 | ) -> Optional[requests.Session]: 28 | """login to a wiki using username and password. (MediaWiki 1.27+)""" 29 | 30 | login_token = fetchLoginToken(session=session, api=api) 31 | if not login_token: 32 | return None 33 | 34 | response = session.post( 35 | url=api, 36 | data={ 37 | "action": "clientlogin", 38 | "username": username, 39 | "password": password, 40 | "loginreturnurl": "http://127.0.0.1:5000/", 41 | "logintoken": login_token, 42 | "format": "json", 43 | }, 44 | ) 45 | 46 | data = response.json() 47 | 48 | try: 49 | if data["clientlogin"]["status"] == "PASS": 50 | print( 51 | "client login: Success! Welcome, " 52 | + data["clientlogin"]["username"] 53 | + "!" 54 | ) 55 | except KeyError: 56 | print("client login: Oops! Something went wrong -- ", data) 57 | return None 58 | 59 | return session 60 | 61 | 62 | def botLogin( 63 | api: str, session: requests.Session, username: str, password: str 64 | ) -> Optional[requests.Session]: 65 | """login to a wiki using BOT's name and password. (MediaWiki 1.27+)""" 66 | 67 | login_token = fetchLoginToken(session=session, api=api) 68 | if not login_token: 69 | return None 70 | 71 | response = session.post( 72 | url=api, 73 | data={ 74 | "action": "login", 75 | "lgname": username, 76 | "lgpassword": password, 77 | "lgtoken": login_token, 78 | "format": "json", 79 | }, 80 | ) 81 | 82 | data = response.json() 83 | 84 | try: 85 | if data["login"]["result"] == "Success": 86 | print("bot login: Success! Welcome, " + data["login"]["lgusername"] + "!") 87 | except KeyError: 88 | print(f"bot login: Oops! 
Something went wrong -- {data}") 89 | return None 90 | 91 | return session 92 | -------------------------------------------------------------------------------- /wikiteam3/dumpgenerator/cli/greeter.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from wikiteam3.dumpgenerator.version import getVersion 4 | 5 | 6 | def welcome(): 7 | message = "" 8 | """Opening message""" 9 | message += "#" * 73 10 | message += "\n" 11 | welcome_string = f"# Welcome to DumpGenerator {getVersion()} by WikiTeam (GPL v3)" 12 | welcome_string += " " * (73 - len(welcome_string) - 1) + "#" 13 | message += welcome_string 14 | message += "\n" 15 | message += ( 16 | "# More info at: https://github.com/elsiehupp/wikiteam3 #" 17 | ) 18 | message += "\n" 19 | message += "#" * 73 20 | message += "\n" 21 | message += "" 22 | message += "\n" 23 | message += "#" * 73 24 | message += "\n" 25 | message += ( 26 | "# Copyright (C) 2011-%d WikiTeam developers #\n" 27 | % (datetime.datetime.now().year) 28 | ) 29 | message += """# # 30 | # This program is free software: you can redistribute it and/or modify # 31 | # it under the terms of the GNU General Public License as published by # 32 | # the Free Software Foundation, either version 3 of the License, or # 33 | # (at your option) any later version. # 34 | # # 35 | # This program is distributed in the hope that it will be useful, # 36 | # but WITHOUT ANY WARRANTY; without even the implied warranty of # 37 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # 38 | # GNU General Public License for more details. # 39 | # # 40 | # You should have received a copy of the GNU General Public License # 41 | # along with this program. If not, see . #""" 42 | message += "\n" 43 | message += "#" * 73 44 | message += "\n" 45 | message += "" 46 | 47 | return message 48 | 49 | 50 | def bye(): 51 | """Closing message""" 52 | print("") 53 | print("---> Congratulations! Your dump is complete <---") 54 | print("") 55 | print("If you encountered a bug, you can report it on GitHub Issues:") 56 | print(" https://github.com/mediawiki-client-tools/mediawiki-dump-generator/issues") 57 | print("") 58 | print("If you need any other help, you can reach out on GitHub Discussions:") 59 | print(" https://github.com/orgs/mediawiki-client-tools/discussions") 60 | print("") 61 | print("If this is a public wiki, please, consider publishing this dump.") 62 | print("Do it yourself as explained in:") 63 | print(" https://github.com/WikiTeam/wikiteam/wiki/Tutorial#Publishing_the_dump") 64 | print("") 65 | print("Good luck! Bye!") 66 | print("") 67 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "wikiteam3" 3 | version = "3.0.0" 4 | description = "Tools for downloading and preserving wikis. We archive wikis, from Wikipedia to tiniest wikis. As of 2020, WikiTeam has preserved more than 250,000 wikis." 
5 | license = "GPL-3.0-or-later" 6 | authors = ["WikiTeam Contributors "] 7 | maintainers = [ 8 | "Federico Leva ", 9 | "Elsie Hupp " 10 | ] 11 | readme = "README.md" 12 | homepage = "https://wiki.archiveteam.org/index.php/WikiTeam" 13 | repository = "https://github.com/WikiTeam/wikiteam" 14 | documentation = "https://wikiteam.readthedocs.io" 15 | keywords = [ 16 | "archiveteam", 17 | "mediawiki", 18 | "preservation", 19 | "wiki", 20 | "wikipedia" 21 | ] 22 | classifiers = [ 23 | "Development Status :: 3 - Alpha", 24 | "Environment :: Console", 25 | "Intended Audience :: Education", 26 | "Intended Audience :: End Users/Desktop", 27 | "Intended Audience :: Information Technology", 28 | "Intended Audience :: Legal Industry", 29 | "Intended Audience :: Science/Research", 30 | "Intended Audience :: System Administrators", 31 | "Natural Language :: English", 32 | "Operating System :: OS Independent", 33 | "Topic :: Communications", 34 | "Topic :: Internet", 35 | "Topic :: Internet :: WWW/HTTP :: Dynamic Content :: Wiki", 36 | "Topic :: Scientific/Engineering :: Information Analysis", 37 | "Topic :: Sociology :: History", 38 | "Topic :: System :: Archiving", 39 | "Topic :: System :: Archiving :: Backup", 40 | "Topic :: Utilities" 41 | ] 42 | packages = [ 43 | { include = "wikiteam3/**/*"}, 44 | ] 45 | exclude = ["wikiteam3/dumpgenerator/test/*"] 46 | 47 | [tool.poetry.scripts] 48 | dumpgenerator = "wikiteam3.dumpgenerator:main" 49 | # gui = "wikiteam3.gui:main" 50 | launcher = "wikiteam3.launcher:main" 51 | # not-archived = "wikiteam3.not-archived:main" 52 | uploader = "wikiteam3.uploader:main" 53 | # wikiadownloader = "wikiteam3.wikiadownloader:main" 54 | # wikipediadownloader = "wikiteam3.wikipediadownloader:main" 55 | # wikispaces = "wikiteam3.wikispaces:main" 56 | 57 | [tool.poetry.dependencies] 58 | python = "^3.8" 59 | requests = "^2.32.0" 60 | internetarchive = "^3.1.0" 61 | lxml = "^5.0.0" 62 | mwclient = "^0.10.1" 63 | PyMySQL = "^1.1.1" 64 | pywikibot = "^6.6.1" 65 | urllib3 = "^1.26.18" 66 | wikitools3 = "^3.0.0" 67 | pymysql = "*" 68 | file_read_backwards = "^2.0.0" 69 | pre-commit-poetry-export = "^0.1.2" 70 | 71 | [tool.isort] 72 | profile = "black" 73 | 74 | [tool.poetry.dev-dependencies] 75 | pytest = "^6.2.5" 76 | requests = "^2.32.0" 77 | flake8 = "^3.9.2" 78 | pre-commit = "^2.17.0" 79 | pymarkdown = "^0.1.4" 80 | 81 | [build-system] 82 | requires = ["poetry-core>=1.0.0"] 83 | build-backend = "poetry.core.masonry.api" 84 | 85 | [tool.pymarkdown] 86 | disable-rules = "line-length,no-inline-html" 87 | -------------------------------------------------------------------------------- /wikiteam3/utils/util.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import re 3 | import sys 4 | 5 | 6 | def cleanHTML(raw: str = "") -> str: 7 | """Extract only the real wiki content and remove rubbish 8 | This function is ONLY used to retrieve page titles 9 | and file names when no API is available 10 | DO NOT use this function to extract page content""" 11 | # different "tags" used by different MediaWiki versions to mark where 12 | # starts and ends content 13 | if re.search("", raw): 14 | raw = raw.split("")[1].split("")[0] 15 | elif re.search("", raw): 16 | raw = raw.split("")[1].split("")[0] 17 | elif re.search("", raw): 18 | raw = raw.split("")[1].split( 19 | "" 20 | )[0] 21 | elif re.search("", raw): 22 | raw = raw.split("")[1].split("")[0] 23 | elif re.search(r'
24 |         raw = raw.split('<article id="WikiaMainContent" class="WikiaMainContent">')[
25 |             1
26 |         ].split("</article>")[0]
")[0] 27 | elif re.search("')[0] 29 | else: 30 | print(raw[:250]) 31 | print("This wiki doesn't use marks to split content") 32 | sys.exit() 33 | return raw 34 | 35 | 36 | def undoHTMLEntities(text: str = "") -> str: 37 | """Undo some HTML codes""" 38 | 39 | # i guess only < > & " ' need conversion 40 | # http://www.w3schools.com/html/html_entities.asp 41 | text = re.sub("<", "<", text) 42 | text = re.sub(">", ">", text) 43 | text = re.sub("&", "&", text) 44 | text = re.sub(""", '"', text) 45 | text = re.sub("'", "'", text) 46 | 47 | return text 48 | 49 | 50 | def removeIP(raw: str = "") -> str: 51 | """Remove IP from HTML comments """ 52 | 53 | raw = re.sub(r"\d+\.\d+\.\d+\.\d+", "0.0.0.0", raw) 54 | # http://www.juniper.net/techpubs/software/erx/erx50x/swconfig-routing-vol1/html/ipv6-config5.html 55 | # weird cases as :: are not included 56 | raw = re.sub( 57 | r"(?i)[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}:[\da-f]{0,4}", 58 | "0:0:0:0:0:0:0:0", 59 | raw, 60 | ) 61 | 62 | return raw 63 | 64 | 65 | def cleanXML(xml: str = "") -> str: 66 | """Trim redundant info from the XML however it comes""" 67 | # do not touch XML codification, leave AS IS 68 | # EDIT 2022: we are making this explicitly Unicode 69 | # for Windows compatibility. 70 | # If the encoding has to stay as is, we'll have 71 | # to change all the file encodings, as well. 72 | 73 | if re.search(r"\n", xml): 74 | xml = xml.split("\n")[1] 75 | if re.search(r"", xml): 76 | xml = xml.split("")[0] 77 | return xml 78 | 79 | 80 | def sha1File(filename: str = "") -> str: 81 | """Return the SHA1 hash of a file""" 82 | 83 | sha1 = hashlib.sha1() 84 | with open(filename, "rb") as f: 85 | while True: 86 | if data := f.read(65536): 87 | sha1.update(data) 88 | else: 89 | break 90 | return sha1.hexdigest() 91 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # `MediaWiki Dump Generator` 2 | 3 | **MediaWiki Dump Generator can archive wikis from the largest to the tiniest.** 4 | 5 | `MediaWiki Dump Generator` is a project to port the legacy [`wikiteam`](https://github.com/WikiTeam/wikiteam) toolset to Python 3 and PyPI to make it more accessible for today's archivers. 6 | 7 | Most of the focus has been on the core `dumpgenerator` tool. Python 3 versions of the other `wikiteam` tools may be added over time. 8 | 9 | The project is currently mostly in maintenance mode. We will do our best to prevent the project from breaking entirely. Issues and pull requests are welcomed but may not be reviewed promptly. 10 | 11 | ## MediaWiki Dump Generator Toolset 12 | 13 | MediaWiki Dump Generator is a set of tools for archiving wikis. The main general-purpose module of MediaWiki Dump Generator is dumpgenerator, which can download XML dumps of MediaWiki sites that can then be parsed or redeployed elsewhere. 14 | 15 | Wikipedia is far too large to manage the dump easily and [dumps are already freely available](https://en.wikipedia.org/wiki/Wikipedia:Database_download#Where_do_I_get_the_dumps?). 16 | 17 | ## Installing the tools 18 | 19 | For prerequisites and installation see [Installation](./INSTALLATION.md) 20 | 21 | ## Using the tools 22 | 23 | For usage see [Usage](./USAGE.md) 24 | 25 | ## Publishing the dump 26 | 27 | Please consider publishing your wiki dump(s). You can do it yourself as explained in [Publishing](./PUBLISHING.md). 
31 | ## Publishing the dump
32 | 
33 | Please consider publishing your wiki dump(s). You can do it yourself as explained in [Publishing](./PUBLISHING.md).
34 | 
35 | ## Getting help
36 | 
37 | * You can read and post in MediaWiki Client Tools' [GitHub Discussions](https://github.com/orgs/mediawiki-client-tools/discussions).
38 | * If you need help (other than reporting a bug), you can reach out on MediaWiki Client Tools' [Discussions/Q&A](https://github.com/orgs/mediawiki-client-tools/discussions/categories/q-a).
39 | 
40 | ## Contributing
41 | 
42 | For information on reporting bugs and proposing changes, please see the [Contributing](./CONTRIBUTING.md) guide.
43 | 
44 | ## Code of Conduct
45 | 
46 | `mediawiki-client-tools` has a [Code of Conduct](./CODE_OF_CONDUCT.md).
47 | 
48 | At the moment the only person responsible for reviewing CoC reports is the repository administrator, Janet Cobb, reachable at [git@randomcat.org](mailto:git@randomcat.org). Please state up front if your message concerns the Code of Conduct, as these messages are kept confidential.
49 | 
50 | In case of emergency (i.e. if Janet is not reachable or if such an issue involves her), you can contact Elsie Hupp, who also retains privileges over this repository, directly via email at [mediawiki-client-tools@elsiehupp.com](mailto:mediawiki-client-tools@elsiehupp.com) or on Matrix at [@elsiehupp:beeper.com](https://matrix.to/#/@elsiehupp:beeper.com).
51 | 
52 | ## Contributors
53 | 
54 | **WikiTeam** is the [Archive Team](http://www.archiveteam.org) [[GitHub](https://github.com/ArchiveTeam)] subcommittee on wikis.
55 | It was founded and originally developed by [Emilio J. Rodríguez-Posada](https://github.com/emijrp), a Wikipedia veteran editor and amateur archivist. Thanks to everyone who has helped, especially: [Federico Leva](https://github.com/nemobis), [Alex Buie](https://github.com/ab2525), [Scott Boyd](http://www.sdboyd56.com), [Hydriz](https://github.com/Hydriz), Platonides, Ian McEwen, [Mike Dupont](https://github.com/h4ck3rm1k3), [balr0g](https://github.com/balr0g) and [PiRSquared17](https://github.com/PiRSquared17).
56 | 
57 | **MediaWiki Dump Generator**
58 | The Python 3 initiative was started and originally maintained by [Elsie Hupp](https://github.com/elsiehupp); it is currently primarily maintained by [Janet Cobb](https://github.com/randomnetcat). We are also grateful for contributions from [Victor Gambier](https://github.com/vgambier), [Thomas Karcher](https://github.com/t-karcher), [yzqzss](https://github.com/yzqzss), [NyaMisty](https://github.com/NyaMisty) and [Rob Kam](https://github.com/robkam).
59 | 
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/api/namespaces.py:
--------------------------------------------------------------------------------
1 | import re
2 | 
3 | from wikiteam3.dumpgenerator.api import getJSON
4 | from wikiteam3.dumpgenerator.cli import Delay
5 | from wikiteam3.dumpgenerator.config import Config
6 | 
7 | 
8 | def getNamespacesScraper(config: Config = None, session=None):
9 |     """Hackishly get the list of namespace names and ids from the dropdown
10 |     in the HTML of Special:AllPages; used when no API is available."""
11 |     namespaces = config.namespaces
12 |     namespacenames = {0: ""}  # main is 0, no prefix
13 |     if namespaces:
14 |         r = session.post(
15 |             url=config.index, params={"title": "Special:Allpages"}, timeout=30
16 |         )
17 |         raw = r.text
18 |         Delay(config=config, session=session)
19 | 
20 |         # [^>]*? to include selected="selected"
21 |         m = re.compile(
22 |             r'<option [^>]*?value="(?P<namespaceid>\d+)"[^>]*?>(?P<namespacename>[^<]+)</option>'
23 |         ).finditer(raw)
24 |         if "all" in namespaces:
25 |             namespaces = []
26 |             for i in m:
27 |                 namespaces.append(int(i.group("namespaceid")))
28 |                 namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
29 |         else:
30 |             # check if those namespaces really exist in this wiki
31 |             namespaces2 = []
32 |             for i in m:
33 |                 if int(i.group("namespaceid")) in namespaces:
34 |                     namespaces2.append(int(i.group("namespaceid")))
35 |                     namespacenames[int(i.group("namespaceid"))] = i.group(
36 |                         "namespacename"
37 |                     )
38 |             namespaces = namespaces2
39 |     else:
40 |         namespaces = [0]
41 | 
42 |     namespaces = list(set(namespaces))  # uniques
43 |     print("%d namespaces found" % (len(namespaces)))
44 |     return namespaces, namespacenames
45 | 
46 | 
47 | def getNamespacesAPI(config: Config = None, session=None):
48 |     """Uses the API to get the list of namespace names and ids"""
49 |     namespaces = config.namespaces
50 |     namespacenames = {0: ""}  # main is 0, no prefix
51 |     if namespaces:
52 |         r = session.get(
53 |             url=config.api,
54 |             params={
55 |                 "action": "query",
56 |                 "meta": "siteinfo",
57 |                 "siprop": "namespaces",
58 |                 "format": "json",
59 |             },
60 |             timeout=30,
61 |         )
62 |         result = getJSON(r)
63 |         Delay(config=config, session=session)
64 |         try:
65 |             nsquery = result["query"]["namespaces"]
66 |         except KeyError:
67 |             print("Error: could not get namespaces from the API request.")
68 |             print("HTTP %d" % r.status_code)
69 |             print(r.text)
70 |             return None
71 | 
72 |         if "all" in namespaces:
73 |             namespaces = []
74 |             for i in nsquery.keys():
75 |                 if int(i) < 0:  # -1: Special, -2: Media, excluding
76 |                     continue
77 |                 namespaces.append(int(i))
78 |                 namespacenames[int(i)] = nsquery[i]["*"]
79 |         else:
80 |             # check if those namespaces really exist in this wiki
81 |             namespaces2 = []
82 |             for i in nsquery.keys():
83 |                 bi = i
84 |                 i = int(i)
85 |                 if i < 0:  # -1: Special, -2: Media, excluding
86 |                     continue
87 |                 if i in namespaces:
88 |                     namespaces2.append(i)
89 |                     namespacenames[i] = nsquery[bi]["*"]
90 |             namespaces = namespaces2
91 |     else:
92 |         namespaces = [0]
93 | 
94 |     namespaces = list(set(namespaces))  # uniques
95 |     print("%d namespaces found" % (len(namespaces)))
96 |     return namespaces, namespacenames
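97 | 
98 | # Editor's note (illustration, not in the original file): both helpers return a
99 | # (namespaces, namespacenames) pair shaped like ([0, 1, 2, ...],
100 | # {0: "", 1: "Talk", 2: "User", ...}). Beware that getNamespacesAPI returns a
101 | # bare None when the API reply lacks the expected keys, so callers must be
102 | # prepared to handle both shapes.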
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/dump/image/html_regexs_test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import warnings
4 | from pathlib import Path
5 | from typing import Dict
6 | 
7 | import pytest
8 | import requests
9 | 
10 | from wikiteam3.dumpgenerator.dump.image.html_regexs import REGEX_CANDIDATES
11 | 
12 | ONLINE = True
13 | 
14 | HTML_DIR = Path("test/data/html_regexs")
15 | os.makedirs(HTML_DIR, exist_ok=True)
16 | 
17 | 
18 | def prepare_raws_from_urls(urls: Dict[str, str]):
19 |     sess = requests.Session()
20 |     raws: Dict[str, str] = {}
21 |     for site, url in urls.items():
22 |         try:
23 |             resp = sess.get(url, timeout=10, allow_redirects=True)
24 |         except Exception as e:
25 |             # warn instead of failing: one unreachable site should not kill the run
26 |             warnings.warn(f"Could not fetch {url}: {e}")
27 |             continue
28 | 
29 |         if resp.status_code == 200:
30 |             raws[url] = resp.text
31 |             if not os.path.exists(HTML_DIR / f"{site}.html"):
32 |                 with open(HTML_DIR / f"{site}.html", "w", encoding="utf-8") as f:
33 |                     f.write(resp.text)
34 |         else:
35 |             warnings.warn(f"Could not fetch {url}: status_code: {resp.status_code}")
36 | 
37 |     return raws
38 | 
39 | 
40 | class TestRegexs:
41 |     class TestRegexsOnline:
42 |         listFiles_urls = {
43 |             # site-date: url; `limit=` sets the expected number of matches
44 |             "archiveteam.org-20230701": "https://wiki.archiveteam.org/index.php?title=Special:ListFiles&sort=byname&limit=7",
45 |             "wiki.othing.xyz-20230701": "https://wiki.othing.xyz/index.php?title=Special:ListFiles&sort=byname",
46 |             "mediawiki.org-20230701": "https://www.mediawiki.org/w/index.php?title=Special:ListFiles&sort=byname&limit=7",
47 |             "asoiaf.fandom.com-20230701": "https://asoiaf.fandom.com/zh/wiki/Special:文件列表?sort=byname&limit=7",
48 |             # only for local testing:
49 |             # "commons.moegirl.org.cn-20230701": "https://commons.moegirl.org.cn/index.php?title=Special:ListFiles&sort=byname&limit=7",
50 |             # # login required:
51 |             # "group0.mediawiki.demo.save-web.org_mediawiki-1.23.17-20230701": "http://group0.mediawiki.demo.save-web.org/mediawiki-1.23.17/index.php?title=Special:文件列表&limit=1",
52 |             # "group1.mediawiki.demo.save-web.org_mediawiki-1.27.7-20230701": "http://group1.mediawiki.demo.save-web.org/mediawiki-1.27.7/index.php?title=Special:ListFiles&limit=2",
53 |         }
54 |         raws: Dict[str, str] = {}
55 | 
56 |         def test_online(self):
57 |             if not ONLINE:
58 |                 pytest.skip("Online test skipped")
59 |             self.raws = prepare_raws_from_urls(self.listFiles_urls)
60 |             assert len(self.raws) != 0, "Could not fetch any of the URLs"
61 |             for url, raw in self.raws.items():
62 |                 best_matched = 0
63 |                 regexp_best = None
64 | 
65 |                 for regexp in REGEX_CANDIDATES:
66 |                     _count = len(re.findall(regexp, raw))
67 |                     if _count > best_matched:
68 |                         best_matched = _count
69 |                         regexp_best = regexp
70 | 
71 |                 assert (
72 |                     regexp_best is not None
73 |                 ), f"Could not find a proper regexp to parse the HTML for {url} (online)"
74 | 
75 |                 if "limit=" in url:
76 |                     limit = int(url.split("limit=")[-1])
77 |                     assert (
78 |                         len(re.findall(regexp_best, raw)) == limit
79 |                     ), f"Could not find {limit} matches for {url} (online)"
80 | 
81 |     class TestRegexsOffline:
82 |         html_files = os.listdir(HTML_DIR)
83 |         raws: Dict[str, str] = {}
84 |         for html_file in html_files:
85 |             with open(HTML_DIR / html_file, encoding="utf-8") as f:
86 |                 raws[html_file] = f.read()
87 |         assert len(raws) != 0, f"Could not find any HTML files in {HTML_DIR}"
88 | 
89 |         def test_offline(self):
90 |             assert len(self.raws) != 0, "Could not read any cached HTML files"
91 |             for site, raw in self.raws.items():
92 |                 best_matched = 0
93 |                 regexp_best = None
94 | 
95 |                 for regexp in REGEX_CANDIDATES:
96 |                     _count = len(re.findall(regexp, raw))
97 |                     if _count > best_matched:
98 |                         best_matched = _count
99 |                         regexp_best = regexp
100 | 
101 |                 assert (
102 |                     regexp_best is not None
103 |                 ), f"Could not find a proper regexp to parse the HTML for {site} (local)"
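104 | 
105 | # Editor's note (commands are illustrative): run just this module with
106 | #   pytest wikiteam3/dumpgenerator/dump/image/html_regexs_test.py -s
107 | # and flip ONLINE to False above to skip the network-dependent class.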
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/dump/xmldump/xml_header.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 | import sys
4 | from typing import *
5 | 
6 | import requests
7 | 
8 | from wikiteam3.dumpgenerator.config import Config
9 | from wikiteam3.dumpgenerator.dump.page.xmlexport.page_xml import getXMLPage
10 | from wikiteam3.dumpgenerator.exceptions import ExportAbortedError, PageMissingError
11 | from wikiteam3.dumpgenerator.log import logerror
12 | 
13 | 
14 | def getXMLHeader(config: Config = None, session=None) -> Tuple[str, Config]:
15 |     """Retrieve a random page to extract XML headers (namespace info, etc)"""
16 |     print(config.api)
17 |     xml = ""
18 |     disableSpecialExport = config.xmlrevisions or config.xmlapiexport
19 |     randomtitle = "Main_Page"
20 |     if disableSpecialExport and config.api and config.api.endswith("api.php"):
21 |         try:
22 |             print("Getting the XML header from the API")
23 |             # Export and exportnowrap exist from MediaWiki 1.15, allpages from 1.8
24 |             r = session.get(
25 |                 f"{config.api}?action=query&export=1&exportnowrap=1&list=allpages&aplimit=1",
26 |                 timeout=10,
27 |             )
28 |             xml: str = r.text
29 |             # Otherwise try without exportnowrap, e.g. Wikia returns a blank page on 1.19
30 |             if not re.match(r"\s*<mediawiki", xml):
31 |                 r = session.get(
32 |                     f"{config.api}?action=query&export=1&list=allpages&aplimit=1&format=json",
33 |                     timeout=10,
34 |                 )
35 |                 try:
36 |                     xml = r.json()["query"]["export"]["*"]
37 |                 except KeyError:
38 |                     pass
39 |             # Do without a generator, use the usual trick of a random page title
40 |             if not re.match(r"\s*<mediawiki", xml):
41 |                 r = session.get(
42 |                     f"{config.api}?action=query&export=1&exportnowrap=1&titles={randomtitle}",
43 |                     timeout=10,
44 |                 )
45 |                 xml = r.text
46 |             # Again try without exportnowrap
47 |             if not re.match(r"\s*<mediawiki", xml):
48 |                 r = session.get(
49 |                     f"{config.api}?action=query&export=1&format=json&titles={randomtitle}",
50 |                     timeout=10,
51 |                 )
52 |                 try:
53 |                     xml = r.json()["query"]["export"]["*"]
54 |                 except KeyError:
55 |                     pass
56 |         except requests.exceptions.RetryError:
57 |             pass
58 | 
59 |     else:
60 |         try:
61 |             xml = "".join(
62 |                 list(
63 |                     getXMLPage(
64 |                         config=config, title=randomtitle, verbose=False, session=session
65 |                     )
66 |                 )
67 |             )
68 |         # Issue 26: Account for missing "Special" namespace.
69 |         # Hope the canonical special name has not been removed.
70 |         # http://albens73.fr/wiki/api.php?action=query&meta=siteinfo&siprop=namespacealiases
71 |         except PageMissingError as pme:
72 |             # The <page> does not exist. Not a problem, if we get the <siteinfo>.
73 |             xml = pme.xml
74 |         except ExportAbortedError:
75 |             try:
76 |                 if config.api:
77 |                     print("Trying the local name for the Special namespace instead")
78 |                     r = session.get(
79 |                         url=config.api,
80 |                         params={
81 |                             "action": "query",
82 |                             "meta": "siteinfo",
83 |                             "siprop": "namespaces",
84 |                             "format": "json",
85 |                         },
86 |                         timeout=120,
87 |                     )
88 |                     config.export = (
89 |                         json.loads(r.text)["query"]["namespaces"]["-1"]["*"] + ":Export"
90 |                     )
91 |                     xml = "".join(
92 |                         list(
93 |                             getXMLPage(
94 |                                 config=config,
95 |                                 title=randomtitle,
96 |                                 verbose=False,
97 |                                 session=session,
98 |                             )
99 |                         )
100 |                     )
101 |             except PageMissingError as pme:
102 |                 xml = pme.xml
103 |             except ExportAbortedError:
104 |                 pass
105 | 
106 |     header = xml.split("</mediawiki>")[0]
107 |     if not re.match(r"\s*<mediawiki", xml):
108 |         if config.xmlrevisions:
109 |             # Try again the old way
110 |             print(
111 |                 "Export test via the API failed. Wiki too old? Trying without xmlrevisions."
112 |             )
113 |             config.xmlrevisions = False
114 |             header, config = getXMLHeader(config=config, session=session)
115 |         else:
116 |             print(xml)
117 |             print("XML export on this wiki is broken, quitting.")
118 |             logerror(config=config, to_stdout=True, text="XML export on this wiki is broken, quitting.")
119 |             sys.exit()
120 |     return header, config
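121 | 
122 | # Editor's note (illustration, not in the original file): on success `header`
123 | # holds everything before </mediawiki>, i.e. the opening <mediawiki ...> tag
124 | # plus the <siteinfo>...</siteinfo> block, ready to be prepended to the
125 | # per-page <page> elements of the combined dump.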
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/api/api.py:
--------------------------------------------------------------------------------
1 | import re
2 | from urllib.parse import urljoin, urlparse
3 | 
4 | import mwclient
5 | import requests
6 | 
7 | from wikiteam3.dumpgenerator.api.get_json import getJSON
8 | from wikiteam3.utils import getUserAgent
9 | 
10 | 
11 | def checkAPI(api="", session: requests.Session = None):
12 |     """Checking API availability"""
13 |     # handle redirects, giving up after a few hops
14 |     for i in range(5):
15 |         print("Checking API...", api)
16 |         r = session.get(
17 |             url=api,
18 |             params={"action": "query", "meta": "siteinfo", "format": "json"},
19 |             timeout=30,
20 |         )
21 |         if i >= 4:
22 |             break
23 |         if r.status_code == 200:
24 |             break
25 |         elif r.status_code < 400:
26 |             api = r.url
27 |         elif r.status_code > 400:
28 |             print(
29 |                 "MediaWiki API URL not found or giving error: HTTP %d" % r.status_code
30 |             )
31 |             return None
32 |     if "MediaWiki API is not enabled for this site." in r.text:
33 |         return None
34 |     try:
35 |         result = getJSON(r)
36 |         index = None
37 |         if result:
38 |             try:
39 |                 index = (
40 |                     result["query"]["general"]["server"]
41 |                     + result["query"]["general"]["script"]
42 |                 )
43 |                 return (True, index, api)
44 |             except KeyError:
45 |                 print("MediaWiki API seems to work but returned no index URL")
46 |                 return (True, None, api)
47 |     except ValueError:
48 |         print(repr(r.text))
49 |         print("MediaWiki API returned data we could not parse")
50 |         return None
51 |     return None
52 | 
53 | 
54 | def mwGetAPIAndIndex(url="", session: requests.Session = None):
55 |     """Returns the MediaWiki API and Index.php"""
56 | 
57 |     api = ""
58 |     index = ""
59 |     if not session:
60 |         session = requests.Session()  # Create a new session
61 |         session.headers.update({"User-Agent": getUserAgent()})
62 |     r = session.post(url=url, timeout=120)
63 |     result = r.text
64 | 
65 |     # Search for the API endpoint advertised in the page head (EditURI link)
66 |     if m := re.findall(
67 |         r'(?im)<\s*link\s*rel="EditURI"\s*type="application/rsd\+xml"\s*href="([^>]+?)\?action=rsd"\s*/\s*>',
68 |         result,
69 |     ):
70 |         api = m[0]
71 |         if api.startswith("//"):  # gentoo wiki
72 |             api = url.split("//")[0] + api
73 | 
74 |     # Search for index.php via the view-source / history tab links
75 |     if m := re.findall(
76 |         r'<li id="ca-viewsource"[^>]*?>\s*(?:<span>)?\s*<a href="([^\?]+?)\?',
77 |         result,
78 |     ):
79 |         index = m[0]
80 |     elif m := re.findall(
81 |         r'<li id="ca-history"[^>]*?>\s*(?:<span>)?\s*<a href="([^\?]+?)\?',
82 |         result,
83 |     ):
84 |         index = m[0]
85 | 
86 |     if index:
87 |         if index.startswith("/"):
88 |             # a relative link: resolve it against the API's directory
89 |             index = "/".join(api.split("/")[:-1]) + "/" + index.split("/")[-1]
90 |     else:
91 |         if api:
92 |             # no tab link found: guess index.php (or index.php5) next to api.php
93 |             if len(re.findall(r"/index\.php5\?", result)) > len(
94 |                 re.findall(r"/index\.php\?", result)
95 |             ):
96 |                 index = "/".join(api.split("/")[:-1]) + "/index.php5"
97 |             else:
98 |                 index = "/".join(api.split("/")[:-1]) + "/index.php"
99 | 
100 |     if not api and index:
101 |         api = urljoin(index, "api.php")
102 | 
103 |     return api, index
104 | 
105 | 
106 | def checkRetryAPI(api="", apiclient=False, session: requests.Session = None):
107 |     """Call checkAPI and mwclient if necessary"""
108 |     check = None
109 |     try:
110 |         check = checkAPI(api, session=session)
111 |     except requests.exceptions.ConnectionError as e:
112 |         print(f"Connection error: {str(e)}")
113 | 
114 |     if check and apiclient:
115 |         apiurl = urlparse(api)
116 |         try:
117 |             # constructing the Site is itself the connectivity test
118 |             site = mwclient.Site(
119 |                 apiurl.netloc,
120 |                 apiurl.path.replace("api.php", ""),
121 |                 scheme=apiurl.scheme,
122 |                 pool=session,
123 |             )
124 |         except KeyError:
125 |             # Probably KeyError: 'query'
126 |             if apiurl.scheme == "https":
127 |                 newscheme = "http"
128 |                 api = api.replace("https://", "http://")
129 |             else:
130 |                 newscheme = "https"
131 |                 api = api.replace("http://", "https://")
132 |             print(
133 |                 f"WARNING: The provided API URL did not work with mwclient. Switched protocol to: {newscheme}"
134 |             )
135 | 
136 |             try:
137 |                 site = mwclient.Site(
138 |                     apiurl.netloc,
139 |                     apiurl.path.replace("api.php", ""),
140 |                     scheme=newscheme,
141 |                     pool=session,
142 |                 )
143 |             except KeyError:
144 |                 check = False
145 | 
146 |     return check, api
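147 | 
148 | 
149 | if __name__ == "__main__":
150 |     # Editor's sketch (not in the original file): probe a hypothetical wiki.
151 |     sess = requests.Session()
152 |     sess.headers.update({"User-Agent": getUserAgent()})
153 |     print(mwGetAPIAndIndex("https://wiki.example.org", session=sess))
154 |     # Expected shape (values are illustrative):
155 |     # ('https://wiki.example.org/api.php', 'https://wiki.example.org/index.php')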
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions_page.py:
--------------------------------------------------------------------------------
1 | from lxml import etree
2 | from lxml.builder import E
3 | 
4 | from wikiteam3.dumpgenerator.exceptions import PageMissingError
5 | 
6 | 
7 | def makeXmlPageFromRaw(xml, arvcontinue) -> str:
8 |     """Discard the metadata around a <page> element in a <mediawiki> string"""
9 |     root = etree.XML(xml)
10 |     find = etree.XPath("//*[local-name() = 'page']")
11 |     page = find(root)[0]
12 |     if arvcontinue is not None:
13 |         page.attrib["arvcontinue"] = arvcontinue
14 |     # The tag will inherit the namespace, like:
15 |     # <page xmlns="http://www.mediawiki.org/xml/export-0.10/">
16 |     # FIXME: pretty_print doesn't seem to work, only adds a newline
17 |     return etree.tostring(page, pretty_print=True, encoding="unicode")
18 | 
19 | 
20 | def makeXmlFromPage(page: dict, arvcontinue) -> str:
21 |     """Output an XML document as a string from a page as in the API JSON"""
22 |     try:
23 |         p = E.page(
24 |             E.title(str(page["title"])),
25 |             E.ns(str(page["ns"])),
26 |             E.id(str(page["pageid"])),
27 |         )
28 |         if arvcontinue is not None:
29 |             p.attrib["arvcontinue"] = arvcontinue
30 |         for rev in page["revisions"]:
31 |             # Older releases like MediaWiki 1.16 do not return all fields.
32 |             userid = rev["userid"] if "userid" in rev else 0
33 |             size = rev["size"] if "size" in rev else 0
34 |             # Build the <revision> children first; they are reordered below
35 |             revision = [
36 |                 E.id(str(rev["revid"])),
37 |                 E.timestamp(rev["timestamp"]),
38 |             ]
39 | 
40 |             # The text, user, comment, sha1 may be deleted/suppressed
41 |             if ("texthidden" in rev) or ("textmissing" in rev):
42 |                 print(
43 |                     "Warning: text missing/hidden in pageid %d revid %d"
44 |                     % (page["pageid"], rev["revid"])
45 |                 )
46 |                 revision.append(
47 |                     E.text(
48 |                         **{
49 |                             "bytes": str(size),
50 |                             "deleted": "deleted",
51 |                         }
52 |                     )
53 |                 )
54 |             else:
55 |                 text = str(rev["*"])
56 |                 revision.append(
57 |                     E.text(
58 |                         text,
59 |                         **{
60 |                             "bytes": str(size),
61 |                             "{http://www.w3.org/XML/1998/namespace}space": "preserve",
62 |                         }
63 |                     )
64 |                 )
65 | 
66 |             if "user" not in rev:
67 |                 if "userhidden" not in rev:
68 |                     print(
69 |                         "Warning: user not hidden but missing user in pageid %d revid %d"
70 |                         % (page["pageid"], rev["revid"])
71 |                     )
72 |                 revision.append(E.contributor(deleted="deleted"))
73 |             else:
74 |                 revision.append(
75 |                     E.contributor(
76 |                         E.username(str(rev["user"])),
77 |                         E.id(str(userid)),
78 |                     )
79 |                 )
80 | 
81 |             if "sha1" in rev:
82 |                 revision.append(E.sha1(rev["sha1"]))
83 |             elif "sha1hidden" in rev:
84 |                 revision.append(E.sha1())  # stub
85 | 
86 |             if "commenthidden" in rev:
87 |                 revision.append(E.comment(deleted="deleted"))
88 |             elif "comment" in rev and rev["comment"]:
89 |                 revision.append(E.comment(str(rev["comment"])))
90 | 
91 |             if "contentmodel" in rev:
92 |                 revision.append(E.model(rev["contentmodel"]))
93 |             if "contentformat" in rev:
94 |                 revision.append(E.format(rev["contentformat"]))
95 |             # Sometimes a missing parentid is not replaced with a 0 as it should.
96 |             if "parentid" in rev:
97 |                 revision.append(E.parentid(str(rev["parentid"])))
98 | 
99 |             if "minor" in rev:
100 |                 revision.append(E.minor())
101 | 
102 |             # mwcli's dump.xml order
103 |             revisionTags = [
104 |                 "id",
105 |                 "parentid",
106 |                 "timestamp",
107 |                 "contributor",
108 |                 "minor",
109 |                 "comment",
110 |                 "origin",
111 |                 "model",
112 |                 "format",
113 |                 "text",
114 |                 "sha1",
115 |             ]
116 |             revisionElementsDict = {elem.tag: elem for elem in revision}
117 |             _revision = E.revision()
118 |             for tag in revisionTags:
119 |                 if tag in revisionElementsDict:
120 |                     _revision.append(revisionElementsDict.pop(tag))
121 |             for elem in revisionElementsDict.values():
122 |                 _revision.append(elem)
123 |             p.append(_revision)
124 |     except KeyError as e:
125 |         print(e)
126 |         raise PageMissingError(page["title"], e)
127 |     return etree.tostring(p, pretty_print=True, encoding="unicode")
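128 | 
129 | 
130 | if __name__ == "__main__":
131 |     # Editor's sketch (not in the original file): feed a minimal, hypothetical
132 |     # API-style page dict through makeXmlFromPage to see the element layout.
133 |     demo_page = {
134 |         "title": "Main Page",
135 |         "ns": 0,
136 |         "pageid": 1,
137 |         "revisions": [
138 |             {
139 |                 "revid": 10,
140 |                 "parentid": 0,
141 |                 "timestamp": "2023-07-01T00:00:00Z",
142 |                 "user": "Admin",
143 |                 "userid": 1,
144 |                 "comment": "init",
145 |                 "size": 5,
146 |                 "*": "Hello",
147 |             }
148 |         ],
149 |     }
150 |     print(makeXmlFromPage(demo_page, arvcontinue=None))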
--------------------------------------------------------------------------------
/wikiteam3/dumpgenerator/api/wiki_check.py:
--------------------------------------------------------------------------------
1 | import re
2 | 
3 | import requests
4 | 
5 | from wikiteam3.utils import getUserAgent
6 | 
7 | 
8 | def getWikiEngine(url="", session: requests.Session = None) -> str:
9 |     """Returns the wiki engine of a URL, if known"""
10 | 
11 |     if not session:
12 |         session = requests.Session()  # Create a new session
13 |         session.headers.update({"User-Agent": getUserAgent()})
14 |     r = session.post(url=url, timeout=30)
15 |     if r.status_code == 405 or not r.text:
16 |         r = session.get(url=url, timeout=120)
17 |     result = r.text
18 | 
19 |     wikiengine = "Unknown"
20 |     if re.search(
21 |         '(?im)(MoinMoin Powered|