├── README.md └── dokuwiki2git /README.md: -------------------------------------------------------------------------------- 1 | dokuwiki2git 2 | ============ 3 | 4 | dokuwiki2git converts a dokuwiki data directory into a git repository containing 5 | the wiki pages, with proper history. Thus, migration to git-backed wiki engines 6 | (e.g. gollum) becomes easier. 7 | 8 | Usage 9 | ----- 10 | 11 | $ dokuwiki2git /path/to/dokuwiki/data 12 | 13 | This will create a git repository in `gitdir/`, containing the whole history of 14 | the dokuwiki pages, one commit per change. 15 | 16 | Details 17 | ------- 18 | 19 | Change files (`*.changes`) under `data/meta` are read for changelog information 20 | of each page. The changelog of all pages is then sorted by date, and a separate 21 | commit is created from each changelog entry, with the content taken from 22 | `data/attic/<pagename>.<timestamp>.txt.gz`. The original *author name*, *IP*, 23 | *date* and *change message* become standard parts of the created git commit. 24 | 25 | Media files are imported under `media/`. 26 | 27 | Caveats 28 | ------- 29 | 30 | NOTE: Media file history is not imported yet. Let me know if you need this. In 31 | new DokuWiki: 32 | 33 | * `media/<filename>.<ext>` contains the latest version 34 | * `media_meta/<filename>.<ext>.changes` contains the changelog 35 | * `media_attic/<filename>.<timestamp>.<ext>` contains the old versions the 36 | changelog mentions, except for the last one (which is under `media/`). 37 | 38 | License 39 | ------- 40 | 41 | dokuwiki2git is licensed under AGPLv3. 42 | 43 | Contacting 44 | ---------- 45 | 46 | Bugs? Feature requests? Mail the author! 
#!/usr/bin/env python
# Copyright (c) 2011-2014 Heikki Hokkanen
# License: AGPLv3
"""Convert a DokuWiki data directory into a git repository with history.

The conversion is done in two phases: first the DokuWiki data is read and a
list of shell commands is queued, then the queued commands are executed inside
a freshly created output directory.
"""
import fnmatch
import logging
import optparse
import os
import subprocess
import sys
import time

try:
    from shlex import quote as shell_quote  # Python 3.3+
except ImportError:
    from pipes import quote as shell_quote  # Python 2

USAGE = """
dokuwiki2git converts dokuwiki data directory into a git repository containing
the wiki pages, with proper history. Thus, migration to git-backed wiki engines
(eg. gollum) becomes easier.

$ dokuwiki2git [options] /path/to/dokuwiki/data"""

logging.basicConfig(level = logging.DEBUG, format = '%(levelname)s - %(message)s')
log = logging.getLogger()


def _native_lines(path):
    """Yield the lines of *path* as native ``str`` objects.

    The file is read in binary mode so that Python 2 keeps working on plain
    byte strings; on Python 3 every line is decoded as UTF-8 (undecodable
    bytes are replaced) so the usual ``str`` methods can be used on it.
    """
    with open(path, 'rb') as f:
        for raw in f:
            if not isinstance(raw, str):  # Python 3: bytes -> str
                raw = raw.decode('utf-8', 'replace')
            yield raw


class Converter(object):
    """Reads a DokuWiki data directory and replays it as git commits."""

    # Change types accepted in *.changes files:
    # create, delete, edit, minor edit, restore
    CHANGE_TYPES = ('C', 'D', 'E', 'e', 'R')

    # Author of the bookkeeping commits. git only accepts the
    # "Name <email>" form for --author on a repository without history.
    # NOTE(review): the address is a placeholder; adjust if a real one is wanted.
    IMPORT_AUTHOR = 'dokuwiki2git <dokuwiki2git@localhost>'

    def __init__(self):
        self.datadir = None
        self.atticdir = None
        self.mediadir = None
        self.metadir = None
        self.changelog = [] # (timestamp, ip, changetype, pagename, author, comment, ...)
        self.commands = [] # shell commands to run to create the git repository
        self.gitdir = 'gitdir'
        self.users = {} # login -> {'name': ..., 'email': ...}

    def create_git_repository(self):
        """Create ``self.gitdir`` and run every queued command inside it.

        Raises:
            OSError: if the output directory cannot be created (for example
                because it already exists).
            RuntimeError: if any queued command exits with a non-zero status.
        """
        log.info('Creating git repository')
        os.mkdir(self.gitdir)
        for c in self.commands:
            log.debug('CMD: %s' % c)
            # Run with cwd= instead of os.chdir() so that a failing command
            # never leaves this process stranded in the output directory.
            ret = subprocess.call(c, shell=True, cwd=self.gitdir)
            if ret != 0:
                raise RuntimeError('Command "%s" failed' % c)

    def get_pagepath_and_timestamp(self, filename):
        """Split an attic file name into ``(pagepath, timestamp)``.

        *filename* is expected to look like
        ``<atticdir>/<pagepath>.<timestamp>.txt.gz``. The page path is
        returned relative to the attic directory. When the name has fewer
        dot-separated parts than that, the timestamp is returned as ``''``.
        """
        relative = os.path.relpath(filename, self.atticdir)
        parts = relative.rsplit('.', 3)
        if len(parts) < 4:
            return parts[0], ''
        return parts[0], parts[1] # pagepath, timestamp

    def _changelog_index(self):
        """Return the set of ``(timestamp, pagepath)`` pairs in the changelog."""
        return set((c[0], c[3].replace(':', '/')) for c in self.changelog)

    def has_changelog_entry(self, pagepath, timestamp, known=None):
        """Tell whether the changelog references *pagepath* at *timestamp*.

        A warning is logged when no entry is found.

        Args:
            pagepath: page path with ``/`` as namespace separator.
            timestamp: revision timestamp as it appears in the changelog.
            known: optional precomputed set of ``(timestamp, pagepath)``
                pairs; pass it when checking many files to avoid rebuilding
                the index on every call.

        Returns:
            True if a matching changelog entry exists, False otherwise.
        """
        if known is None:
            known = self._changelog_index()
        if (timestamp, pagepath) in known:
            return True
        log.warning('Attic contains "%s" timestamp %s, but is not referenced by changelog, skipping. Please report this!' % (pagepath, timestamp))
        return False

    def _commit_commands(self, change, pagepath, filename):
        """Build the shell commands that replay one changelog entry.

        Every value taken from the wiki (paths, author, message) is passed
        through ``shell_quote`` because the commands are run by a shell and
        change summaries or user names may contain shell metacharacters.
        """
        timestamp, ip, changetype, _, user, summary = change[:6]
        pagefile = pagepath + '.txt'
        message = pagepath + ': ' + summary
        email = 'dokuwiki@%s' % ip
        if not user:
            user = 'dokuwiki2git'
        elif user in self.users:
            email = self.users[user]['email']
            user = self.users[user]['name']
        author = '%s <%s>' % (user, email)

        cmds = []
        if changetype in ('C', 'E', 'e', 'R'): # create, edit, minor edit, restore
            dirname = os.path.dirname(pagefile)
            if dirname:
                cmds.append('mkdir -p -- %s' % shell_quote(dirname))
            cmds.append('gunzip -c -- %s > %s' % (shell_quote(filename), shell_quote(pagefile)))
            cmds.append('git add -- %s' % shell_quote(pagefile))
        elif changetype == 'D': # delete
            cmds.append('git rm --quiet -- %s' % shell_quote(pagefile))
        cmds.append('git commit --quiet --allow-empty --allow-empty-message --author=%s --date=%s -m %s' % (
            shell_quote(author), shell_quote('%s +0000' % timestamp), shell_quote(message)))
        return cmds

    def read_attic(self):
        """Queue one commit per changelog entry that has an attic file.

        Entries whose attic file is missing are skipped with a warning.
        Afterwards every ``*.txt.gz`` file in the attic is checked against
        the changelog and unreferenced files are reported.
        """
        log.info('Reading attic')

        # Check that all referenced pages exist in attic
        # NOTE(review): this also skips delete ('D') entries that have no
        # attic file of their own - confirm that is the intended behaviour.
        for c in self.changelog:
            pagepath = c[3].replace(':', '/')
            filename = os.path.join(self.atticdir, pagepath + '.%s.txt.gz' % c[0])
            if not os.path.exists(filename):
                log.warning('File "%s" does not exist, despite being in changelog, skipping' % filename)
                continue
            self.commands.extend(self._commit_commands(c, pagepath, filename))

        # check that all pages in attic have a matching changelog entry
        known = self._changelog_index() # built once, not once per file
        for path, dirs, files in os.walk(self.atticdir):
            for f in fnmatch.filter(files, '*.txt.gz'):
                pagepath, timestamp = self.get_pagepath_and_timestamp(os.path.join(path, f))
                self.has_changelog_entry(pagepath, timestamp, known)

    def read_data(self):
        """Read users, changelogs, attic and media and queue all commands."""
        self.commands.append('git init --quiet')
        # find user Real Name and email
        self.read_user_data()
        # go through data/meta
        self.read_meta()
        # sort history by numeric timestamp (a plain sort would compare the
        # timestamps as strings); the record itself breaks ties
        self.changelog.sort(key=lambda c: (int(c[0]), c))
        # go through data/attic, importing pages referenced by .changes in meta
        self.read_attic()
        self.read_media()
        self.commands.append('git commit --quiet --allow-empty --author=%s -m "Dokuwiki data imported by dokuwiki2git"' % shell_quote(self.IMPORT_AUTHOR))

    def read_media(self):
        """Queue commands copying every media file, followed by one commit.

        Files keep their path relative to the data directory, so they end up
        under ``media/`` in the repository.
        """
        log.info('Reading media')
        for path, dirs, files in os.walk(self.mediadir):
            for f in files:
                fullfile = os.path.join(path, f)
                filename = os.path.relpath(fullfile, self.datadir)
                dirname = os.path.dirname(filename)
                self.commands.extend([
                    'mkdir -p -- %s' % shell_quote(dirname),
                    'cp -- %s %s' % (shell_quote(fullfile), shell_quote(filename)),
                    'git add -- %s' % shell_quote(filename),
                ])
        self.commands.append('git commit --quiet --allow-empty --author=%s -m "Import media files"' % shell_quote(self.IMPORT_AUTHOR))

    def read_meta(self):
        """Collect changelog entries from every ``*.changes`` file in meta."""
        log.info('Reading meta')
        pages = 0
        for path, dirs, files in os.walk(self.metadir):
            for f in fnmatch.filter(files, '*.changes'):
                fullpath = os.path.join(path, f)
                relpath = os.path.relpath(fullpath, self.metadir)
                pagepath = relpath.rsplit('.', 1)[0]
                self.read_meta_page(pagepath, fullpath)
                pages += 1
        log.info('%d changelog entries for %d pages found' % (len(self.changelog), pages))

    def read_meta_page(self, pagepath, fullpath):
        """Append the entries of one ``*.changes`` file to ``self.changelog``.

        Args:
            pagepath: page path relative to the meta directory, without the
                ``.changes`` suffix.
            fullpath: path of the changes file to read.

        Raises:
            ValueError: if a line is not a valid changelog record for this
                page (too few fields, non-numeric timestamp, different page
                name or unknown change type).
        """
        if pagepath in ('_dokuwiki', '_comments', '_media'):
            return
        pagename = pagepath.replace('/', ':')
        log.debug('Reading meta for page "%s"' % pagename)
        for lineno, line in enumerate(_native_lines(fullpath), 1):
            line = line.rstrip('\r\n')
            if not line:
                continue
            changeparts = line.split('\t')
            log.debug(changeparts)
            where = '%s:%d' % (fullpath, lineno)
            # Explicit checks instead of assert, which is stripped under -O.
            # More than 7 fields are tolerated; only the first six are used.
            if len(changeparts) < 7:
                raise ValueError('%s: expected at least 7 tab-separated fields, got %d' % (where, len(changeparts)))
            if not changeparts[0].isdigit():
                raise ValueError('%s: invalid timestamp "%s"' % (where, changeparts[0]))
            if changeparts[3] != pagename:
                raise ValueError('%s: entry is for page "%s", expected "%s"' % (where, changeparts[3], pagename))
            if changeparts[2] not in self.CHANGE_TYPES:
                raise ValueError('%s: unknown change type "%s"' % (where, changeparts[2]))
            self.changelog.append(changeparts)

    def read_user_data(self):
        """Load real names and emails from ``../conf/users.auth.php``.

        The file is looked up next to the data directory. Each non-comment
        line is split on ``:``; field 0 is used as the login, field 2 as the
        real name and field 3 as the email. A missing file or a malformed
        line only produces a warning, because user data is merely used to
        make commit authors nicer.
        """
        log.info('Reading users.auth.php')
        parentdir = os.path.abspath(os.path.join(self.datadir, os.pardir))
        users_file = os.path.join(parentdir, 'conf', 'users.auth.php')
        if not os.path.isfile(users_file):
            log.warning('User file "%s" not found, commits will use wiki login names' % users_file)
            return
        for line in _native_lines(users_file):
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            userparts = line.split(':')
            if len(userparts) != 5:
                log.warning('Ignoring malformed line in "%s": expected 5 fields, got %d' % (users_file, len(userparts)))
                continue
            log.debug(userparts)
            self.users[userparts[0]] = {'name' : userparts[2], 'email': userparts[3]}
        log.info('Read %d users' % len(self.users))

    def run(self, params):
        """Parse command line *params* and perform the whole conversion.

        Exits the process with status 1 when the data directory argument is
        missing.
        """
        parser = optparse.OptionParser(usage = USAGE)
        parser.add_option('-o', '--output', dest='outputdir', help='Create git directory at outputdir. Default is "gitdir"', default = 'gitdir')
        parser.add_option('-q', '--quiet', action='store_const', const=0, dest='verbose', help='Show only warnings and errors')
        parser.add_option('-v', '--verbose', action='store_const', const=2, dest='verbose', help='Show debug messages', default=1)
        (options, args) = parser.parse_args(params)
        # verbose is 0 (-q), 1 (default) or 2 (-v)
        levels = (logging.WARN, logging.INFO, logging.DEBUG)
        log.setLevel(levels[options.verbose])
        self.gitdir = options.outputdir

        time_start = time.time()
        if len(args) == 0:
            parser.print_help()
            log.error('Dokuwiki data directory is a required argument')
            sys.exit(1)
        self.set_datadir(args[0])
        self.read_data()
        log.info('%d commands queued to be executed' % len(self.commands))
        self.create_git_repository()
        time_took = time.time() - time_start
        log.info('Finished converting dokuwiki data dir "%s" into a git repository "%s", took %.2f seconds' % (self.datadir, self.gitdir, time_took))

    def set_datadir(self, datadir):
        """Validate *datadir* and derive the meta, attic and media paths.

        Raises:
            RuntimeError: if *datadir* does not contain a ``_dummy`` file,
                which is used here as the marker of a DokuWiki data directory.
        """
        if not os.path.isfile(os.path.join(datadir, '_dummy')):
            raise RuntimeError('Directory "%s" does not look like a dokuwiki datadir' % datadir)
        self.datadir = os.path.abspath(datadir)
        self.metadir = os.path.join(self.datadir, 'meta')
        self.atticdir = os.path.join(self.datadir, 'attic')
        self.mediadir = os.path.join(self.datadir, 'media')
        log.info('Using datadir: %s' % self.datadir)


if __name__ == '__main__':
    c = Converter()
    c.run(sys.argv[1:])