├── wp2git ├── __init__.py ├── version.py └── wp2git.py ├── version.py ├── requirements.txt ├── .gitignore ├── AUTHORS ├── pyproject.toml ├── LICENSE ├── README.md └── setup.py /wp2git/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /version.py: -------------------------------------------------------------------------------- 1 | wp2git/version.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | mwclient>=0.10.1 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.exe 2 | *.py[co] 3 | *~ 4 | -------------------------------------------------------------------------------- /wp2git/version.py: -------------------------------------------------------------------------------- 1 | # Do not edit this file, wp2git versioning is governed by git tags 2 | __version__="2.0" 3 | 4 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | This project contains code written by: 2 | 3 | Daniel Lenski 5 | Robin Green 6 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "wp2git" 3 | version = "2.0" 4 | description = "Downloads and imports Wikipedia page histories to a git repository" 5 | authors = ["Daniel Lenski "] 6 | readme = "README.md" 7 | license = "GPLv3 or later" 8 | 9 | [project.urls] 10 | Homepage = "https://github.com/dlenski/wp2git" 11 | 12 | [tool.poetry.dependencies] 13 | python = "^3.8" 14 | mwclient = "^0.11.0" 15 | 16 | [tool.poetry.scripts] 17 | wp2git = 'wp2git.wp2git:main' 18 | 19 | [build-system] 20 | requires = ["poetry-core"] 21 | build-backend = "poetry.core.masonry.api" 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010 Daniel Lenski and contributors (see the file AUTHORS 2 | for a complete list) 3 | 4 | This software is provided 'as-is', without any express or implied 5 | warranty. In no event will the authors be held liable for any damages 6 | arising from the use of this software. 7 | 8 | Permission is granted to anyone to use this software for any purpose, 9 | including commercial applications, and to alter it and redistribute it 10 | freely, subject to the following restrictions: 11 | 12 | 1. The origin of this software must not be misrepresented; you must not 13 | claim that you wrote the original software. If you use this software 14 | in a product, an acknowledgment in the product documentation would be 15 | appreciated but is not required. 16 | 17 | 2. Altered source versions must be plainly marked as such, and must not be 18 | misrepresented as being the original software. 19 | 20 | 3. This notice may not be removed or altered from any source 21 | distribution. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | wp2git 2 | ====== 3 | 4 | This program allows you to download and convert any Wikipedia article's history to a `git` repository, for easy browsing, [annotation](https://git-scm.com/docs/git-annotate), 5 | and [bisecting](https://git-scm.com/docs/git-annotate) (etc.) of older revisions. 6 | 7 | ### Requirements 8 | 9 | Requires Python 3.x and `git` accessible in your `PATH`, and the [`mwclient` package](https://github.com/mwclient/mwclient) 10 | (which will be auto-installed by `pip`). 11 | 12 | ### Quick installation 13 | 14 | For the latest release, install with: 15 | 16 | ``` 17 | pip3 install https://github.com/dlenski/wp2git/archive/v2.0.zip 18 | ``` 19 | 20 | For the latest development build, install with 21 | 22 | ``` 23 | pip3 install https://github.com/dlenski/wp2git/archive/master.zip 24 | ``` 25 | 26 | ### Usage 27 | 28 | ``` 29 | $ wp2git [--lang XY] article_name 30 | ``` 31 | 32 | `wp2git` will create a directory, in which a new `git` repository will be created. 33 | The repository will contain a single file named `article_name.mw`, along with the entire edit history 34 | of that article on `XY.wikipedia.org`. (If unspecified, the default language is guessed according to 35 | your locale.) 36 | 37 | Use `wp2git --help` to show more options. 38 | 39 | ### Entirely based on 40 | 41 | [CyberShadow's version](https://github.com/CyberShadow/wp2git) written in the D language. 42 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import sys, os, re, subprocess as sp 3 | 4 | try: 5 | from setuptools import setup 6 | except ImportError: 7 | from distutils.core import setup 8 | 9 | if sys.version_info < (3,): 10 | sys.exit("Python 2.x is not supported; Python 3.x is required.") 11 | 12 | ######################################## 13 | 14 | # Based on this recipe, adapted for Python 3, Git 2.8.x, and PEP-440 version identifiers 15 | # http://blogs.nopcode.org/brainstorm/2013/05/20/pragmatic-python-versioning-via-setuptools-and-git-tags/ 16 | # https://www.python.org/dev/peps/pep-0440/#version-scheme 17 | 18 | # Fetch version from git tags, and write to version.py. 19 | # Also, when git is not available (PyPI package), use stored version.py. 20 | version_py = os.path.join(os.path.dirname(__file__), 'version.py') 21 | 22 | try: 23 | version_git = sp.check_output(["git", "describe", "--tags"]).strip().decode('ascii') 24 | final, dev, blob = re.match(r'v?((?:\d+\.)*\d+)(?:-(\d+)-(g[a-z0-9]+))?', version_git).groups() 25 | version_pep = final+('.dev%s+%s'%(dev,blob) if dev else '') 26 | except (sp.CalledProcessError, OSError): 27 | with open(version_py, 'r') as fh: 28 | version_pep = open(version_py).read().strip().split('=')[-1][1:-1] 29 | else: 30 | with open(version_py, 'w') as fh: 31 | print("# Do not edit this file, wp2git versioning is governed by git tags", file=fh) 32 | print('__version__="%s"\n' % version_pep, file=fh) 33 | 34 | ######################################## 35 | 36 | setup(name="wp2git", 37 | version=version_pep, 38 | description=("Downloads and imports Wikipedia page histories to a git repository"), 39 | long_description=open('README.md').read(), 40 | author=open('AUTHORS').read(), 41 | author_email="dlenski@gmail.com", 42 | install_requires=open('requirements.txt').readlines(), 43 | python_requires='3.8', 44 | license=open('LICENSE').read(), 45 | url="https://github.com/dlenski/wp2git", 46 | packages=["wp2git"], 47 | entry_points={ 'console_scripts': [ 'wp2git=wp2git.wp2git:main' ] } 48 | ) 49 | -------------------------------------------------------------------------------- /wp2git/wp2git.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from sys import stderr, stdout, version_info 3 | from itertools import chain, count 4 | from pathlib import Path 5 | import argparse 6 | import subprocess as sp 7 | import urllib.parse as urlparse 8 | import os, locale, time 9 | import re 10 | from datetime import datetime 11 | 12 | import mwclient 13 | 14 | from .version import __version__ 15 | 16 | lang, enc = locale.getlocale() 17 | if lang == 'C': 18 | lang = None 19 | elif lang is not None: 20 | lang = lang.split('_')[0] 21 | 22 | def sanitize(s): 23 | forbidden = r'?*<>|:\/"' 24 | for c in forbidden: 25 | s = s.replace(c, '_') 26 | return s 27 | 28 | def timestamp_num_or_iso(s): 29 | if s == 'now': 30 | return s 31 | try: 32 | return int(s, 10) 33 | except ValueError: 34 | try: 35 | return float(s) 36 | except ValueError: 37 | try: 38 | if version_info < (3, 11) and s.endswith('Z'): 39 | return datetime.fromisoformat(s[:-1] + '+00:00') 40 | else: 41 | return datetime.fromisoformat(s) 42 | except ValueError: 43 | return argparse.ArgumentError(f'Could not parse {s!r} as Unix epoch seconds, ISO8601 timestamp, or "now"') 44 | 45 | 46 | def shortgit(git): 47 | return next(git[:ii] for ii in range(6, len(git)) if not git[:ii].isdigit()) 48 | 49 | def parse_args(): 50 | p = argparse.ArgumentParser(description='Create a git repository with the history of one or more specified Wikipedia articles.') 51 | p.add_argument('--version', action='version', version=__version__) 52 | p.add_argument('article_name', nargs='+') 53 | g = p.add_argument_group('Git output options') 54 | g.add_argument('-n', '--no-import', dest='doimport', default=True, action='store_false', 55 | help="Don't invoke git fast-import; only generate fast-import data stream") 56 | g.add_argument('-b', '--bare', action='store_true', help="Import to a bare repository (no working tree)") 57 | g.add_argument('-o', '--out', type=Path, help='Output directory (default is "wp2git") or fast-import stream file (defaults is stdout)') 58 | g = p.add_argument_group('Output cleanup') 59 | g.add_argument('-g', '--git-refs', action='store_true', help="Replace references to earlier revisions with their Git hashes") 60 | g.add_argument('-D', '--denoise', action='store_true', help='Simplify common "noisy" wikitext in comments') 61 | g = p.add_argument_group('MediaWiki site selection') 62 | x=g.add_mutually_exclusive_group() 63 | x.add_argument('--lang', default=lang, help='Wikipedia language code (default %(default)s)') 64 | x.add_argument('--site', help='Alternate MediaWiki site (e.g. https://commons.wikimedia.org[/w/])') 65 | g = p.add_argument_group('Time range restriction (accepted formats are Unix epoch seconds, ISO8601 timestamps, or "now")') 66 | g.add_argument('--not-before', '-B', type=timestamp_num_or_iso) 67 | g.add_argument('--not-after', '-A', type=timestamp_num_or_iso) 68 | 69 | args = p.parse_args() 70 | if args.doimport: 71 | if args.out is None: 72 | args.out = next(pp for n in chain(('',), count(2)) if not (pp := Path(f'wp2git{n}')).exists()) 73 | if args.out.exists(): 74 | p.error(f'path {args.out} exists') 75 | args.out.mkdir(parents=True) 76 | else: 77 | if args.bare or args.git_refs: 78 | p.error('--no-import cannot be combined with --bare or --git-refs') 79 | 80 | if args.out is None: 81 | args.out = stdout.buffer 82 | else: 83 | try: 84 | args.out = args.out.open('xb') 85 | except OSError as e: 86 | p.error(e.args[0]) 87 | 88 | return p, args 89 | 90 | def main(): 91 | p, args = parse_args() 92 | 93 | # Connect to site with mwclient 94 | if args.site is not None: 95 | scheme, host, path = urlparse.urlparse(args.site, scheme='https')[:3] 96 | if path=='': 97 | path = '/w/' 98 | elif not path.endswith('/'): 99 | path += '/' 100 | elif args.lang is not None: 101 | scheme, host, path = 'https', f'{args.lang}.wikipedia.org', '/w/' 102 | else: 103 | scheme, host, path = 'https', 'wikipedia.org', '/w/' 104 | site = mwclient.Site(host, path=path, scheme=scheme) 105 | print(f'Connected to {scheme}://{host}{path}', file=stderr) 106 | 107 | # Find the page(s) 108 | fns = [] 109 | rev_iters = [] 110 | next_revs = [] 111 | for an in args.article_name: 112 | page = site.pages[an] 113 | if not page.exists: 114 | p.error(f'Page {an} does not exist') 115 | fns.append(sanitize(an)) 116 | 117 | revit = iter(page.revisions(dir='newer', prop='ids|timestamp|flags|comment|user|userid|content|tags', 118 | start=args.not_before, end=args.not_after)) 119 | rev_iters.append(revit) 120 | next_revs.append(next(revit, None)) 121 | 122 | if args.doimport: 123 | # Pipe to git fast-import 124 | sp.check_call(['git', 'init'] + (['--bare'] if args.bare else []), cwd=args.out) 125 | with open(args.out / ('.' if args.bare else '.git') / 'HEAD', 'rb') as f: 126 | head = f.read().removeprefix(b'ref: ').strip() 127 | pipe = sp.Popen(['git', 'fast-import', '--quiet', '--done'], stdin=sp.PIPE, stdout=sp.PIPE, cwd=args.out) 128 | fid = pipe.stdin 129 | else: 130 | fid = args.out 131 | head = b'refs/heads/master' 132 | 133 | # Output fast-import data stream to file or git pipe 134 | with fid: 135 | fid.write(b'reset %s\n' % head) 136 | id2git = {} 137 | 138 | # Round robin through all the pages' revisions, ordering by timestamp 139 | while any(next_revs): 140 | # Pick which of the pages' revisions has the lowest timestamp 141 | min_ts = (1<<63) 142 | ii = -1 143 | for ii, rev in enumerate(next_revs): 144 | if rev and time.mktime(rev['timestamp']) < min_ts: 145 | min_ii, min_ts = ii, time.mktime(rev['timestamp']) 146 | else: 147 | rev = next_revs[min_ii] 148 | fn = fns[min_ii] 149 | 150 | id = rev['revid'] 151 | id2git[id] = None 152 | text = rev.get('*','') 153 | user = rev.get('user','') 154 | user_ = user.replace(' ','_') 155 | comment = rev.get('comment','') 156 | userid = rev['userid'] or None # this is zero for anon/IP users 157 | tags = (['minor'] if 'minor' in rev else []) + rev['tags'] 158 | ts = time.mktime(rev['timestamp']) 159 | 160 | userlink = f'{scheme}://{host}{path}index.php?title=' + (f'Special:Redirect/user/{userid}' if userid else f"User:{urlparse.quote(user_)}") 161 | committer = f"{user.replace('<',' ').replace('>',' ')} <>" # I don't think Wikipedia allows this, but other Mediawiki sites do 162 | 163 | print(f"{time.ctime(ts)} >> {'Minor ' if 'minor' in rev else ' '}Revision {id}" 164 | f"{' of ' + args.article_name[min_ii] if len(args.article_name) > 1 else ''} by {user}: {comment}", file=stderr) 165 | 166 | # TODO: get and use 'parsedcomment' which HTML-ifies the comment? 167 | # May make identification of links to revisions, users, etc. much easier 168 | refs = set() 169 | if args.doimport: 170 | for num in map(lambda n: int(n, 10), re.findall(r'\b\d+\b', comment)): 171 | if num in id2git: 172 | if id2git[num] is None: 173 | fid.write(b'get-mark :%d\n' % num) 174 | fid.flush() 175 | id2git[num] = pipe.stdout.readline().strip().decode() 176 | refs.add(num) 177 | 178 | if args.git_refs: 179 | for num in refs: 180 | comment = re.sub(r'\[\[(?::?%s:)?Special\:Diff/%d\s*(?:\|[^]]*)?\]\]' % (args.lang, num), shortgit(id2git[num]), comment, flags=re.IGNORECASE) 181 | comment = re.sub(r'\b%d\b' % num, shortgit(id2git[num]), comment) 182 | 183 | section_frag = '' 184 | if m := re.search(r'^\s*/\*\s*(.*?)\s*\*/\s*', comment): 185 | section = m.group(1) 186 | section_frag = f'#{urlparse.quote(section.replace(" ", "_"))}' 187 | if args.denoise: 188 | if m.group(0) == comment: 189 | comment = f'Edited section "{section}"' 190 | else: 191 | comment = comment.replace(m.group(0), '', 1) 192 | 193 | if args.denoise: 194 | comment = re.sub(r'\[\[(?::?%s:)?Special\:Contrib(?:ution)?s/([^]|]+)\s*(?:\|[^]]*)?\]\](?:\s* \(\[\[User talk\:[^]]+\]\]\))?' % args.lang, r'\1', comment, flags=re.IGNORECASE) 195 | comment = re.sub(r'^\[\[WP:UNDO\|Undid\]\] ', 'Undid ', comment) 196 | 197 | if not comment: 198 | comment = '' 199 | 200 | summary = f'{comment}\n\nURL: {scheme}://{host}{path}index.php?oldid={id:d}{section_frag}\nEditor: {userlink}' 201 | 202 | if tags: 203 | summary += '\nTags: ' + ', '.join(tags) 204 | if refs and not args.git_refs: 205 | summary += '\nReferences: ' + ', '.join(f'{r} ({id2git[r]})' for r in refs) 206 | 207 | summary = summary.encode() 208 | text = text.encode() 209 | fid.write(b'commit %s\n' % head) 210 | fid.write(b'mark :%d\n' % id) 211 | fid.write(b'committer %s %d +0000\n' % (committer.encode(), ts)) 212 | fid.write(b'data %d\n%s\n' % (len(summary), summary)) 213 | fid.write(b'M 644 inline %s.mw\n' % fn.encode()) 214 | fid.write(b'data %d\n%s\n' % (len(text), text)) 215 | 216 | # Get the next revision for the page we just output 217 | next_revs[min_ii] = next(rev_iters[min_ii], None) 218 | else: 219 | fid.write(b'done\n') 220 | 221 | if args.doimport: 222 | retcode = pipe.wait() 223 | if retcode != 0: 224 | p.error(f'git fast-import returned {retcode}') 225 | if not args.bare: 226 | sp.check_call(['git', 'checkout', '-q', head.decode().removeprefix('refs/heads/')], cwd=args.out) 227 | print(f'Created git repository in {args.out}', file=stderr) 228 | 229 | if __name__=='__main__': 230 | main() 231 | --------------------------------------------------------------------------------