├── .gitignore ├── tests ├── unicode_support │ ├── dokuwiki.txt │ └── mediawiki.txt ├── tags_in_lists │ ├── notes.txt │ ├── dokuwiki.txt │ └── mediawiki.txt ├── references_to_footnotes │ ├── dokuwiki.txt │ └── mediawiki.txt ├── accented_links │ ├── mediawiki.txt │ └── dokuwiki.txt ├── notoc │ ├── mediawiki.txt │ └── dokuwiki.txt ├── code_tags │ ├── mediawiki.txt │ └── dokuwiki.txt ├── named_links │ ├── dokuwiki.txt │ └── mediawiki.txt ├── sections_test │ ├── mediawiki.txt │ └── dokuwiki.txt ├── tables │ ├── dokuwiki.txt │ └── mediawiki.txt ├── del_tags │ ├── mediawiki.txt │ └── dokuwiki.txt ├── math_tags │ ├── dokuwiki.txt │ └── mediawiki.txt ├── nowiki_tags │ ├── dokuwiki.txt │ └── mediawiki.txt ├── nested_lists │ ├── mediawiki.txt │ └── dokuwiki.txt └── header_anchors │ ├── mediawiki.txt │ └── dokuwiki.txt ├── .travis.yml ├── requirements.txt ├── names.py ├── LICENSE ├── wikicontent_tests.py ├── yamdwe.py ├── yamdwe_users.py ├── visitor.py ├── mediawiki.py ├── README.md ├── dokuwiki.py └── wikicontent.py /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | __pycache__ 3 | *.pyc 4 | env/ 5 | -------------------------------------------------------------------------------- /tests/unicode_support/dokuwiki.txt: -------------------------------------------------------------------------------- 1 | Abhängigkeiten 2 | -------------------------------------------------------------------------------- /tests/unicode_support/mediawiki.txt: -------------------------------------------------------------------------------- 1 | Abhängigkeiten 2 | -------------------------------------------------------------------------------- /tests/tags_in_lists/notes.txt: -------------------------------------------------------------------------------- 1 | From https://github.com/projectgus/yamdwe/issues/7 2 | -------------------------------------------------------------------------------- /tests/references_to_footnotes/dokuwiki.txt: -------------------------------------------------------------------------------- 1 | See the related search engine Duck Duck Go.((https://duckduckgo.com)) 2 | 3 | -------------------------------------------------------------------------------- /tests/accented_links/mediawiki.txt: -------------------------------------------------------------------------------- 1 | Example from [https://github.com/projectgus/yamdwe/issues/11 Issue 11] [[SSH-Schlüssel]]. 2 | -------------------------------------------------------------------------------- /tests/notoc/mediawiki.txt: -------------------------------------------------------------------------------- 1 | __NOTOC__ 2 | =Heading= 3 | Based on issue seen in https://github.com/projectgus/yamdwe/issues/30 4 | -------------------------------------------------------------------------------- /tests/references_to_footnotes/mediawiki.txt: -------------------------------------------------------------------------------- 1 | See the related search engine Duck Duck Go.https://duckduckgo.com 2 | -------------------------------------------------------------------------------- /tests/accented_links/dokuwiki.txt: -------------------------------------------------------------------------------- 1 | Example from [[https://github.com/projectgus/yamdwe/issues/11|Issue 11]] [[ssh-schlussel]]. 
2 | -------------------------------------------------------------------------------- /tests/notoc/dokuwiki.txt: -------------------------------------------------------------------------------- 1 | ~~NOTOC~~ 2 | 3 | ======= Heading ======= 4 | 5 | Based on issue seen in https://github.com/projectgus/yamdwe/issues/30 6 | -------------------------------------------------------------------------------- /tests/code_tags/mediawiki.txt: -------------------------------------------------------------------------------- 1 | Code tags should work as plain paragraphs 2 | 3 | * Code tags should work in lists 4 | * Another item 5 | -------------------------------------------------------------------------------- /tests/named_links/dokuwiki.txt: -------------------------------------------------------------------------------- 1 | [[http://example1.org/|Named Link]] 2 | https://plainlink.org/ 3 | http://example2.org/ 4 | [[http://example3.org/|OneWordLabel]] 5 | -------------------------------------------------------------------------------- /tests/named_links/mediawiki.txt: -------------------------------------------------------------------------------- 1 | [http://example1.org/ Named Link] 2 | [https://plainlink.org/] 3 | [http://example2.org/ ] 4 | [http://example3.org/ OneWordLabel] 5 | -------------------------------------------------------------------------------- /tests/code_tags/dokuwiki.txt: -------------------------------------------------------------------------------- 1 | Code tags should work as plain paragraphs 2 | 3 | * Code tags should work in lists 4 | * Another item 5 | 6 | -------------------------------------------------------------------------------- /tests/sections_test/mediawiki.txt: -------------------------------------------------------------------------------- 1 | == Section == 2 | === Subsection === 3 | ==== Sub-subsection ==== 4 | ===== Sub-Sub-subsection ===== 5 | ====== Sub-Sub-Sub-subsection ====== 6 | -------------------------------------------------------------------------------- /tests/tables/dokuwiki.txt: -------------------------------------------------------------------------------- 1 | Taken from the MediaWiki examples at https://www.mediawiki.org/wiki/Help:Tables 2 | 3 | ** Food complements ** 4 | | Orange| Apple | 5 | | Bread| Pie | 6 | | Butter| Ice cream | 7 | -------------------------------------------------------------------------------- /tests/sections_test/dokuwiki.txt: -------------------------------------------------------------------------------- 1 | ====== Section ====== 2 | 3 | 4 | ===== Subsection ===== 5 | 6 | 7 | ==== Sub-subsection ==== 8 | 9 | 10 | === Sub-Sub-subsection === 11 | 12 | 13 | == Sub-Sub-Sub-subsection == 14 | -------------------------------------------------------------------------------- /tests/tables/mediawiki.txt: -------------------------------------------------------------------------------- 1 | Taken from the MediaWiki examples at https://www.mediawiki.org/wiki/Help:Tables 2 | 3 | {| class="wikitable" 4 | |+Food complements 5 | |- 6 | |Orange 7 | |Apple 8 | |- 9 | |Bread 10 | |Pie 11 | |- 12 | |Butter 13 | |Ice cream 14 | |} 15 | -------------------------------------------------------------------------------- /tests/del_tags/mediawiki.txt: -------------------------------------------------------------------------------- 1 | del should be fine on a plain paragraph. 2 | 3 | As should s, but converted to del. 4 | 5 | Within pragraphs del tags should come through unchanged, However the MediaWiki specific s tags should be converted to del. 
6 | 7 | -------------------------------------------------------------------------------- /tests/math_tags/dokuwiki.txt: -------------------------------------------------------------------------------- 1 | The squared distance from a point on ellipse to a given point$(x,y)$ is: 2 | 3 | $$ 4 | \begin{array}{rcl} 5 | s^{2}&=&(x-a\cos t)^{2}+(y-b\sin t)^{2}\\ 6 | &=&x^{2}+y^{2}+a^{2}\cos^{2}t+b^{2}\sin^{2}t-2xa\cos t-2yb\sin t 7 | \end{array} 8 | $$ 9 | 10 | -------------------------------------------------------------------------------- /tests/del_tags/dokuwiki.txt: -------------------------------------------------------------------------------- 1 | del should be fine on a plain paragraph. 2 | 3 | As should s, but converted to del. 4 | 5 | Within pragraphs del tags should come through unchanged, However the MediaWiki specific s tags should be converted to del. 6 | 7 | -------------------------------------------------------------------------------- /tests/math_tags/mediawiki.txt: -------------------------------------------------------------------------------- 1 | The squared distance from a point on ellipse to a given point(x,y) is: 2 | 3 | 4 | \begin{array}{rcl} 5 | s^{2}&=&(x-a\cos t)^{2}+(y-b\sin t)^{2}\\ 6 | &=&x^{2}+y^{2}+a^{2}\cos^{2}t+b^{2}\sin^{2}t-2xa\cos t-2yb\sin t 7 | \end{array} 8 | 9 | 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "2.7" 4 | install: 5 | - pip install -r requirements.txt 6 | script: 7 | # Running yamdwe itself is only to confirm the main program imports/runs correctly 8 | # TODO: find a friendly Mediawiki that we can hammer part of an integration test! 9 | - ./yamdwe.py --help 10 | - ./wikicontent_tests.py 11 | -------------------------------------------------------------------------------- /tests/nowiki_tags/dokuwiki.txt: -------------------------------------------------------------------------------- 1 | Non-wiki content in plaintext 2 | 3 | * Non-wiki content in list 4 | 5 | 6 | Mediawiki tags not converted inside here 7 | Dividing paragraph. 8 | 9 | '''' and something not so trivial ''<nowiki></nowiki>'' 10 | 11 | -------------------------------------------------------------------------------- /tests/nowiki_tags/mediawiki.txt: -------------------------------------------------------------------------------- 1 | Non-wiki content in plaintext 2 | 3 | 6 | 7 | Mediawiki tags not converted inside here 8 | 9 | Dividing paragraph. 
10 | 11 | and something not so trivial <nowiki></nowiki> 12 | 13 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | setuptools 2 | mwlib==0.15.14 3 | MySQL-python==1.2.5 4 | Pillow==2.8.1 5 | apipkg==1.4 6 | argparse==1.2.1 7 | bottle==0.12.8 8 | gevent==1.0.1 9 | greenlet==0.4.6 10 | kitchen==1.2.1 11 | lxml==3.4.4 12 | odfpy==0.9.6 13 | py==1.4.27 14 | pyPdf==1.13 15 | pyparsing==1.5.6 16 | qserve==0.2.8 17 | requests==2.7.0 18 | roman==2.0.0 19 | simplejson==3.6.5 20 | simplemediawiki==1.2.0b2 21 | sqlite3dbm==0.1.4 22 | timelib==0.2.4 23 | wsgiref==0.1.2 24 | -------------------------------------------------------------------------------- /tests/tags_in_lists/dokuwiki.txt: -------------------------------------------------------------------------------- 1 | 2 | * WinDjView with wine: highlighting is acceptable (one needs to select and then press Strg+H or click on menu); but no comment function 3 | * sumatrapdf with wine: no highlighting facilities 4 | * active-documentviewer: doesn't start, error ''file:///usr/share/kde4/apps/plasma/packages/org.kde.active.documentviewer/contents/ui/main.qml:21:1: module "org.kde.okular" is not installed import org.kde.okular 0.1 as Okular'' 5 | * some other list item 6 | 7 | 8 | Some additional paragraph here. 9 | 10 | Second paragraph here. 11 | 12 | -------------------------------------------------------------------------------- /tests/nested_lists/mediawiki.txt: -------------------------------------------------------------------------------- 1 | * Top level 1 2 | * Top level 2 3 | ** Next level 2.1 4 | ** Next level 2.2 5 | *** One more level 2.2.1 6 | *** One more level 2.2.2 7 | ** Next level 2.3 8 | ** Next level 2.4 9 | * Top level 3 10 | ** Next level 3.1 11 | *** One more level 3.1.1 12 | **** Getting serious 3.1.1.1 13 | ** Next level 3.2 14 | * Top level 4 15 | * Top level 5 16 | 17 | # Numbered lists 18 | # Same deal 19 | ## but with numbers 20 | ### so many numbers 21 | ## even more numbers 22 | # OK 23 | 24 | * Mixed lists? 25 | *# Also a thing 26 | *## Can we convert this? 27 | * Hopefully 28 | 29 | -------------------------------------------------------------------------------- /tests/tags_in_lists/mediawiki.txt: -------------------------------------------------------------------------------- 1 | 7 | 8 | Some additional paragraph here. 9 | 10 | Second paragraph here. 11 | -------------------------------------------------------------------------------- /tests/nested_lists/dokuwiki.txt: -------------------------------------------------------------------------------- 1 | * Top level 1 2 | * Top level 2 3 | * Next level 2.1 4 | * Next level 2.2 5 | * One more level 2.2.1 6 | * One more level 2.2.2 7 | * Next level 2.3 8 | * Next level 2.4 9 | * Top level 3 10 | * Next level 3.1 11 | * One more level 3.1.1 12 | * Getting serious 3.1.1.1 13 | * Next level 3.2 14 | * Top level 4 15 | * Top level 5 16 | 17 | - Numbered lists 18 | - Same deal 19 | - but with numbers 20 | - so many numbers 21 | - even more numbers 22 | - OK 23 | 24 | * Mixed lists? 25 | - Also a thing 26 | - Can we convert this? 27 | * Hopefully 28 | -------------------------------------------------------------------------------- /tests/header_anchors/mediawiki.txt: -------------------------------------------------------------------------------- 1 | == Section A == 2 | 3 | this section has a plaintext anchor link. 
[[#Section C]] 4 | 5 | == Section B == 6 | 7 | this section has an anchor link formatted as Mediawiki formats them. [[#Section_A]] 8 | 9 | == Section C == 10 | 11 | this section has a page link with a plaintext anchor component [[Page#Section C]] 12 | 13 | == Section D & complex Noteworthiness == 14 | 15 | this section has a page link with a complex anchor component [[#Section D & complex Noteworthiness]] 16 | 17 | === Bork bork === 18 | 19 | this subsection has a page link with a mediawiki-formatted anchor component. [[Page#Section_C]] 20 | 21 | === SectionHeaderWithUppercaseLetters E === 22 | [[#SectionHeaderWithUppercaseLetters E]] 23 | 24 | 25 | === Section header with - minus === 26 | [[#Section header with - minus]] 27 | 28 | -------------------------------------------------------------------------------- /tests/header_anchors/dokuwiki.txt: -------------------------------------------------------------------------------- 1 | ====== Section A ====== 2 | 3 | 4 | this section has a plaintext anchor link. [[#Section_C]] 5 | 6 | 7 | ====== Section B ====== 8 | 9 | 10 | this section has an anchor link formatted as Mediawiki formats them. [[#Section_A]] 11 | 12 | 13 | ====== Section C ====== 14 | 15 | 16 | this section has a page link with a plaintext anchor component [[page#Section_C]] 17 | 18 | 19 | ====== Section D & complex Noteworthiness ====== 20 | 21 | 22 | this section has a page link with a complex anchor component [[#Section_D_complex_Noteworthiness]] 23 | 24 | 25 | ===== Bork bork ===== 26 | 27 | 28 | this subsection has a page link with a mediawiki-formatted anchor component. [[page#Section_C]] 29 | 30 | 31 | ===== SectionHeaderWithUppercaseLetters E ===== 32 | 33 | [[#SectionHeaderWithUppercaseLetters_E]] 34 | 35 | 36 | ===== Section header with - minus ===== 37 | 38 | [[#Section_header_with_-_minus]] 39 | 40 | -------------------------------------------------------------------------------- /names.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple name munging functions used by both yamdwe.py and yamdwe_users.py 3 | 4 | Copyright (C) 2014 Angus Gratton 5 | Licensed under New BSD License as described in the file LICENSE. 6 | """ 7 | import re, os.path, unicodedata 8 | 9 | def clean_id(name, preserve_case=False): 10 | """ 11 | Return a 'clean' dokuwiki-compliant name. Based on the cleanID() PHP function in inc/pageutils.php 12 | 13 | Ignores both slashes and colons as valid namespace choices (to convert slashes to colons, 14 | call make_dokuwiki_pagename) 15 | """ 16 | main,ext = os.path.splitext(name) 17 | 18 | # remove accents 19 | try: 20 | decomposed = unicodedata.normalize("NFKD", main) 21 | no_accent = ''.join(c for c in decomposed if ord(c)<0x7f) 22 | except TypeError: 23 | no_accent = main # name was plaintext to begin with 24 | 25 | # recombine without any other characters 26 | result = (re.sub(r'[^\w/:-]+', '_', no_accent) + ext) 27 | if not preserve_case: 28 | result = result.lower() 29 | while "__" in result: 30 | result = result.replace("__", "_") # this is a hack, unsure why regex doesn't catch it 31 | return result 32 | 33 | def clean_user(name): 34 | """ 35 | Return a 'clean' dokuwiki-authplain-compliant username. 
36 | Based on the cleanUser() PHP function in lib/plugins/authplain/auth.php 37 | """ 38 | return clean_id(name).replace(":","_") 39 | 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2014 Angus Gratton, All rights reserved. 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | Neither the name of Angus Gratton nor the names of this software's 15 | contributors may be used to endorse or promote products derived from 16 | this software without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 19 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 20 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 21 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 22 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 23 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 24 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 25 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 26 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 29 | -------------------------------------------------------------------------------- /wikicontent_tests.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Test suite for mediawiki->dokuwiki conversions. 3 | 4 | Goes through every subdirectory of the tests/ directory. Each subdirectory contains: 5 | 6 | * mediawiki.txt -> snippet of Mediawiki syntax to be converted. 7 | * dokuwiki.txt -> correct Dokuwiki output to be expected. 8 | * notes.txt -> (optional) file describing what's special about this test. 9 | 10 | Converts mediawiki.txt and compares output to dokuwiki.txt, prints an 11 | error (and contents of notes.txt) if the output does not match. 12 | 13 | Copyright (C) 2014 Angus Gratton 14 | Licensed under New BSD License as described in the file LICENSE. 15 | 16 | """ 17 | from __future__ import print_function, unicode_literals, absolute_import, division 18 | import sys, os, codecs, inspect, traceback, difflib, unicodedata 19 | from pprint import pprint 20 | import wikicontent, yamdwe 21 | 22 | DELIMITER="@"*40 23 | 24 | def prep_difflines(content): 25 | """ difflib takes input in this "readlines" compatible format """ 26 | return [ x+"\n" for x in content.split("\n") ] 27 | 28 | def run_test(testdir): 29 | """ 30 | Run the test contained in the directory 'testdir' 31 | 32 | Return True on success 33 | """ 34 | print("Running %s..." 
% testdir) 35 | mw = _readfile(testdir, "mediawiki.txt").strip() # ignore leading/trailing whitespace, too annoying 36 | dw = _readfile(testdir, "dokuwiki.txt").strip() # ignore leading/trailing whitespace, too annoying 37 | notes = _readfile(testdir, "notes.txt") 38 | 39 | # use directory name as the page name 40 | pagename = os.path.split(testdir)[1] 41 | 42 | if len(mw) == 0: 43 | print("WARNING: No mediawiki input!!!") 44 | 45 | try: 46 | converted = wikicontent.convert_pagecontent(pagename, mw).strip() 47 | if converted == dw: 48 | return True 49 | except: 50 | print("CONVERSION ERROR") 51 | traceback.print_exc() 52 | print(DELIMITER) 53 | if len(notes): 54 | print("Test notes:") 55 | print(notes) 56 | return False 57 | 58 | print("OUTPUT MISMATCH") 59 | if len(notes): 60 | print("Test notes:") 61 | print(notes) 62 | print(DELIMITER) 63 | print("Input Mediawiki:") 64 | print(mw) 65 | print(DELIMITER) 66 | 67 | diff = difflib.unified_diff(prep_difflines(dw), prep_difflines(converted), fromfile='Expected Dokuwiki', tofile='Actual Dokuwiki', lineterm="\n") 68 | sys.stdout.writelines(diff) 69 | sys.stdout.write("\n") 70 | print(DELIMITER) 71 | return False 72 | 73 | def tests_dirpath(): 74 | """ Return path to the test directory """ 75 | execdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe()))) 76 | return os.path.join(execdir, "tests") 77 | 78 | def run_all_tests(): 79 | """ 80 | Run all tests. Return True on success. 81 | """ 82 | successes = 0 83 | testsrun = 0 84 | testsdir = tests_dirpath() 85 | for test in os.listdir(testsdir): 86 | path = os.path.join(testsdir, test) 87 | if os.path.isdir(path): 88 | testsrun += 1 89 | if run_test(path): 90 | successes += 1 91 | print("--- %d/%d TESTS PASSED ---" % (successes, testsrun)) 92 | return successes == testsrun 93 | 94 | def _readfile(dirpath, filename): 95 | """ 96 | Read a complete file and return content as a unicode string, or 97 | empty string if file not found 98 | """ 99 | try: 100 | with codecs.open(os.path.join(dirpath, filename), "r", "utf-8") as f: 101 | return f.read() 102 | except IOError: 103 | return u"" 104 | 105 | if __name__ == "__main__": 106 | try: 107 | if sys.argv[1] in ["-h", "--help"]: 108 | print("Usage: %s " % (sys.argv[0])) 109 | print("(If test name not specified, all tests in tests/ directory will be run.)") 110 | sys.exit(0) 111 | except IndexError: 112 | pass 113 | 114 | try: 115 | res = run_test(os.path.join(tests_dirpath(), sys.argv[1])) 116 | except IndexError: # no argv[1], so run all 117 | res = run_all_tests() 118 | if res: 119 | sys.exit(0) 120 | else: 121 | sys.exit(1) 122 | -------------------------------------------------------------------------------- /yamdwe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Export all revisions of all pages, plus all images/meda, from a 4 | Mediawiki install to a Dokuwiki install. Mediawiki install can be 5 | remote (uses API, but check terms of service.) Dokuwiki install is 6 | local. 7 | 8 | Requirements: 9 | Python 2.7, mwlib, simplemediawiki, requests 10 | 11 | Copyright (C) 2014 Angus Gratton 12 | Licensed under New BSD License as described in the file LICENSE. 
13 | """ 14 | from __future__ import print_function, unicode_literals, absolute_import, division 15 | import argparse, sys, codecs, locale, getpass, datetime 16 | from pprint import pprint 17 | import mediawiki, dokuwiki, wikicontent 18 | # only needed to check for domain functionality 19 | import simplemediawiki, inspect 20 | 21 | def main(): 22 | # the wikicontent code (that uses visitor module) tends to recurse quite deeply for complex pages 23 | sys.setrecursionlimit(20000) 24 | 25 | # try not to crash if the output/console has a character we can't encode 26 | sys.stdout = codecs.getwriter(locale.getpreferredencoding())(sys.stdout, "replace") 27 | 28 | args = arguments.parse_args() 29 | 30 | if args.http_pass is not None and args.http_user is None: 31 | raise RuntimeError("ERROR: Option --http_pass requires --http_user to also be specified") 32 | if args.wiki_pass is not None and args.wiki_user is None: 33 | raise RuntimeError("ERROR: Option --wiki_pass requires --wiki_user to also be specified") 34 | 35 | if args.http_user is not None and args.http_pass is None: 36 | args.http_pass = getpass.getpass("Enter password for HTTP auth (%s):" % args.http_user) 37 | if args.wiki_user is not None and args.wiki_pass is None: 38 | args.wiki_pass = getpass.getpass("Enter password for Wiki login (%s):" % args.wiki_user) 39 | 40 | if not args.mediawiki.endswith("api.php"): 41 | print("WARNING: Mediawiki URL does not end in 'api.php'... This has to be the URL of the Mediawiki API, not just the wiki. If you can't export anything, try adding '/api.php' to the wiki URL.") 42 | 43 | if "domain" in inspect.getargspec(simplemediawiki.MediaWiki.__init__)[0]: 44 | importer = mediawiki.Importer(args.mediawiki, args.http_user, args.http_pass, args.wiki_user, args.wiki_pass, args.wiki_domain, args.verbose) 45 | else: 46 | importer = mediawiki.Importer(args.mediawiki, args.http_user, args.http_pass, args.wiki_user, args.wiki_pass, args.verbose) 47 | exporter = dokuwiki.Exporter(args.dokuwiki) 48 | 49 | # Set the wikicontent's definition of File: and Image: prefixes (varies by language settings) 50 | canonical_file, aliases = importer.get_file_namespaces() 51 | wikicontent.set_file_namespaces(canonical_file, aliases) 52 | 53 | # Read all pages and page revisions 54 | pages = importer.get_all_pages() 55 | print("Found %d pages to export..." % len(pages)) 56 | 57 | # Add a shameless "exported by yamdwe" note to the front page of the wiki 58 | mainpage = importer.get_main_pagetitle() 59 | 60 | for page in pages: 61 | if page["title"] == mainpage: 62 | latest = dict(page["revisions"][0]) 63 | latest["user"] = "yamdwe" 64 | now = datetime.datetime.utcnow().replace(microsecond=0) 65 | latest["timestamp"] = now.isoformat() + "Z" 66 | latest["comment"] = "Automated note about use of yamdwe Dokuwiki import tool" 67 | latest["*"] += "\n\n(Automatically exported to Dokuwiki from Mediawiki by [https://github.com/projectgus/yamdwe Yamdwe] on %s.)" % (datetime.date.today().strftime("%x")) 68 | page["revisions"].insert(0, latest) 69 | 70 | # Export pages to Dokuwiki format 71 | exporter.write_pages(pages) 72 | 73 | # Bring over images 74 | images = importer.get_all_images() 75 | print("Found %d images to export..." 
% len(images)) 76 | exporter.write_images(images, canonical_file, args.http_user, args.http_pass) 77 | 78 | # fix permissions on data directory if possible 79 | exporter.fixup_permissions() 80 | 81 | # touch conf file to invalidate cached pages 82 | exporter.invalidate_cache() 83 | 84 | print("Done.") 85 | 86 | # Parser for command line arguments 87 | arguments = argparse.ArgumentParser(description='Convert a Mediawiki installation to a Dokuwiki installation.') 88 | #arguments.add_argument('-y', '--yes',help="Don't pause for confirmation before exporting", action="store_true") 89 | arguments.add_argument('--http_user', help="Username for HTTP basic auth") 90 | arguments.add_argument('--http_pass', help="Password for HTTP basic auth (if --http_user is specified but not --http_pass, yamdwe will prompt for a password)") 91 | arguments.add_argument('--wiki_user', help="Mediawiki login username") 92 | arguments.add_argument('--wiki_pass', help="Mediawiki login password (if --wiki_user is specified but not --wiki_pass, yamdwe will prompt for a password)") 93 | if "domain" in inspect.getargspec(simplemediawiki.MediaWiki.__init__)[0]: 94 | arguments.add_argument('--wiki_domain', help="Mediawiki login domain (needs a non-standard simplemediawiki library)") 95 | arguments.add_argument('-v', '--verbose',help="Print verbose progress and error messages", action="store_true") 96 | arguments.add_argument('mediawiki', metavar='MEDIAWIKI_API_URL', help="URL of mediawiki's api.php file (something like http://mysite/wiki/api.php)") 97 | arguments.add_argument('dokuwiki', metavar='DOKUWIKI_ROOT', help="Root path to an existing dokuwiki installation to add the Mediawiki pages to (can be a brand new install.)") 98 | 99 | if __name__ == "__main__": 100 | try: 101 | main() 102 | except RuntimeError as e: 103 | print("ERROR: %s" % e) 104 | sys.exit(3) 105 | -------------------------------------------------------------------------------- /yamdwe_users.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """ 3 | Migrate user accounts from a Mediawiki installation to a Dokuwiki installation. 4 | 5 | Unlike yamdwe.py which can use the Mediawiki API, yamdwe_users.py requires 6 | a database connection to migrate user accounts. 7 | 8 | Assumes MySQL, sorry Postgres fans. 9 | 10 | Requirements: 11 | Python 2.7, MySQLdb 12 | 13 | On Debian/Ubuntu: 14 | sudo apt-get install python-mysqldb 15 | 16 | Copyright (C) 2014 Angus Gratton 17 | Licensed under New BSD License as described in the file LICENSE. 18 | """ 19 | from __future__ import print_function, unicode_literals, absolute_import, division 20 | import argparse, sys, os.path, collections, getpass, re, codecs, MySQLdb 21 | import names 22 | from pprint import pprint 23 | 24 | def main(): 25 | args = arguments.parse_args() 26 | 27 | userfile = os.path.join(args.dokuwiki, "conf", "users.auth.php") 28 | 29 | if not os.path.exists(userfile): 30 | print("Error: users.auth doesn't exist at %s" % userfile) 31 | if os.path.exists(userfile + ".dist"): 32 | print("users.auth.dist exists. This suggests you haven't yet run install.php to create a superuser (recommended.") 33 | sys.exit(1) 34 | 35 | commentblock, dw_users = get_dokuwiki_users(userfile) 36 | print("Found %d existing dokuwiki users..." 
% len(dw_users)) 37 | 38 | if not args.no_password: 39 | print("Enter MySQL password for user %s:" % args.user) 40 | pw = getpass.getpass() 41 | else: 42 | pw = None 43 | 44 | mw_users = get_mediawiki_users(args.host, args.user, pw, args.db, args.prefix) 45 | 46 | for mw_username in mw_users.keys(): 47 | if mw_username in dw_users: 48 | print("%s already exists in users.auth. Updating attributes..." % mw_username) 49 | dw_users[mw_username]["name"] = mw_users[mw_username]["name"] 50 | dw_users[mw_username]["email"] = mw_users[mw_username]["email"] 51 | dw_users[mw_username]["pwhash"] = mw_users[mw_username]["pwhash"] 52 | else: 53 | print("Adding new user %s..." % mw_users[mw_username]["login"]) 54 | dw_users[mw_username] = mw_users[mw_username] 55 | 56 | print("Writing %d users back to dokuwiki users.auth.php..." % len(dw_users)) 57 | write_dokuwiki_users(userfile, commentblock, dw_users) 58 | print("Done.") 59 | 60 | 61 | def get_dokuwiki_users(userfile): 62 | """ Parse the dokuwiki users.auth file and return block of comment text, dict of user info structures """ 63 | users = collections.OrderedDict() 64 | comments = "" 65 | with codecs.open(userfile, "r", "utf-8") as f: 66 | for line in f: 67 | if line.startswith("#") or line.strip() == "": 68 | comments += line 69 | elif ":" in line: 70 | login,pwhash,name,email,groups = re.split(r'(? 1 or (version[0] == 1 and version[1] >= 24) 36 | print("%s meets version requirements." % generator) 37 | except IndexError: 38 | raise RuntimeError("Failed to read Mediawiki siteinfo/generator. Is version older than 1.8? Yamdwe requires 1.13 or greater.") 39 | 40 | def verbose_print(self, msg): 41 | if self.verbose: 42 | print(msg) 43 | 44 | def get_all_pages(self): 45 | """ 46 | Slurp all pages down from the mediawiki instance, together with all revisions including content. 47 | WARNING: Hits API hard, don't do this without knowledge/permission of wiki operator!! 48 | """ 49 | query = {'list' : 'allpages'} 50 | print("Getting list of pages...") 51 | pages = self._query(query, [ 'allpages' ]) 52 | self.verbose_print("Got %d pages." % len(pages)) 53 | print("Query page revisions (this may take a while)...") 54 | for page in pages: 55 | self.verbose_print("Querying revisions for pageid %s (%s)..." % (page['pageid'], page['title'])) 56 | page["revisions"] = self._get_revisions(page) 57 | self.verbose_print("Got %d revisions." % len(page["revisions"])) 58 | return pages 59 | 60 | def _get_revisions(self, page): 61 | pageid = page['pageid'] 62 | query = { 'prop' : 'revisions', 63 | 'pageids' : pageid, 64 | 'rvprop' : 'timestamp|user|comment|content', 65 | 'rvlimit' : '5', 66 | } 67 | revisions = self._query(query, [ 'pages', str(pageid), 'revisions' ]) 68 | return revisions 69 | 70 | def get_all_images(self): 71 | """ 72 | Slurp all images down from the mediawiki instance, latest revision of each image, only. 73 | 74 | WARNING: Hits API hard, don't do this without knowledge/permission of wiki operator!! 75 | """ 76 | query = {'list' : 'allimages'} 77 | return self._query(query, [ 'allimages' ]) 78 | 79 | def get_all_users(self): 80 | """ 81 | Slurp down all usernames from the mediawiki instance. 
82 | """ 83 | query = {'list' : 'allusers'} 84 | return self._query(query, [ 'allusers' ]) 85 | 86 | def _query(self, args, path_to_result): 87 | """ 88 | Make a Mediawiki API query that results a list of results, 89 | handle the possibility of making a paginated query using query-continue 90 | """ 91 | query = { 'action' : 'query' } 92 | if self.need_rawcontinue: 93 | query["rawcontinue"] = "" 94 | query.update(args) 95 | result = [] 96 | continuations = 0 97 | while True: 98 | try: 99 | response = self.mw.call(query) 100 | except simplejson.scanner.JSONDecodeError as e: 101 | if e.pos == 0: 102 | if not self.verbose: 103 | raise RuntimeError("Mediawiki gave us back a non-JSON response. You may need to double-check the Mediawiki API URL you are providing (it usually ends in api.php), and also your Mediawiki permissions. To see the response content, pass the --verbose flag to yamdwe.") 104 | else: 105 | raise RuntimeError("Mediawiki gave us back a non-JSON response:\n\n\nInvalid response follows (%d bytes):\n%s\n\n(End of content)\nFailed to parse. You may need to double-check the Mediawiki API URL you are providing (it usually ends in api.php), and also your Mediawiki permissions." % (len(e.doc), e.doc.decode("utf-8"))) 106 | raise 107 | 108 | # fish around in the response for our actual data (location depends on query) 109 | try: 110 | inner = response['query'] 111 | for key in path_to_result: 112 | inner = inner[key] 113 | except KeyError: 114 | raise RuntimeError("Mediawiki query '%s' returned unexpected response '%s' after %d continuations" % (args, response, continuations)) 115 | result += inner 116 | 117 | # if there's a warning print it out (shouldn't need a debug flag since this is of interest to any user) 118 | if 'warnings' in response: 119 | for warnkey in response['warnings']: 120 | print("WARNING: %s function throws the warning %s" % (warnkey, response['warnings'][warnkey]['*'])) 121 | 122 | # if there's a continuation, find the new arguments and follow them 123 | try: 124 | query.update(response['query-continue'][path_to_result[-1]]) 125 | continuations += 1 126 | except KeyError: 127 | return result 128 | 129 | def get_file_namespaces(self): 130 | """ 131 | Return a tuple. First entry is the name used by default for the file namespace (which dokuwiki will also use.) 132 | Second entry is a list of all aliases used for that namespace, and aliases used for the 'media' namespace. 
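        For an English-language wiki the result would typically look like ("File", ["File", "Media", "Media", "Image"]) (illustrative values inferred from the code below, not taken from a live API response).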
133 | """ 134 | query = { 'action' : 'query', 'meta' : 'siteinfo', 'siprop' : 'namespaces|namespacealiases' } 135 | result = self.mw.call(query)['query'] 136 | namespaces = result['namespaces'].values() 137 | aliases = result.get('namespacealiases', {}) 138 | file_namespace = {'*' : 'Files', 'canonical' : 'File'} 139 | media_namespace = {'*' : 'Media', 'canonical' : 'Media'} 140 | # search for the File namespace 141 | for namespace in namespaces: 142 | if namespace.get('canonical', None) == 'File': 143 | file_namespace = namespace 144 | elif namespace.get('canonical', None) == 'Media': 145 | media_namespace = namespace 146 | # alias list starts with the file & media namespace canonical values, and the media "real" value 147 | aliases_result = [ file_namespace['canonical'], media_namespace['canonical'], media_namespace['*'] ] 148 | # look for any aliases by searching the file namespace id, add to the list 149 | ids = [ file_namespace.get('id', None), media_namespace.get('id', None) ] 150 | for alias in aliases: 151 | if alias['id'] in ids: 152 | aliases_result.append(alias['*']) 153 | return file_namespace['*'], aliases_result 154 | 155 | def get_main_pagetitle(self): 156 | """ 157 | Return the title of the main Mediawiki page 158 | """ 159 | query = { 'action' : 'query', 'meta' : 'siteinfo', 'siprop' : 'general' } 160 | result = self.mw.call(query)['query'] 161 | return result['general'].get("mainpage", "Main") 162 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Yet Another Mediawiki to DokuWiki Exporter 2 | 3 | Yamdwe is made up of two Python programs to export an existing 4 | Mediawiki install to a Dokuwiki install. 5 | 6 | [![Build Status](https://travis-ci.org/projectgus/yamdwe.svg?branch=master)](https://travis-ci.org/projectgus/yamdwe) 7 | 8 | **yamde needs a new maintainer** - I've gotten busy with other responsibilities and I'm not giving yamdwe the attention it deserves. It's mostly mature software, the only issue is occasionally content in some wikis that doesn't convert properly. Yamdwe has automated tests and continuous integration so it's not too painful to add bugfixes, the usual slow point is investigating behaviour of mediawiki installs that aren't publically available. If you're interested in helping out then please get in touch, or just browse the [Issues](https://github.com/projectgus/yamdwe/issues) list and maybe send some PRs! Any active maintainer will be gladly credited and/or I'll transfer the repo to you if you'd like that. *-- Angus* 9 | 10 | # Features 11 | 12 | * Exports and recreates full revision history of all pages, including author information for correct attribution. 13 | * Exports images and maintains modification dates (but not past revisions of an image.) 14 | * Can optionally export user accounts to the default dokuiwiki "basicauth" format (see below.) 15 | * Parses MediaWiki syntax using the [mwlib library](http://mwlib.readthedocs.org/en/latest/index.html) (as used by Wikipedia), so can convert most pages very cleanly - minimal manual cleanup. 16 | * Syntax support includes: tables, image embeds, code blocks. 17 | * Uses the MediaWiki API to export pages and images, so a MediaWiki install can be exported remotely and without admin privileges (NB: Yamdwe does hit the API quite hard, so please do not export other people's wikis for fun. Or, at minimum, please read their Terms of Service first and comply by them.) 
18 | * Supports logging in to Mediawiki to export, and also HTTP Basic Auth. 19 | 20 | # Compatible Versions 21 | 22 | * Dokuwiki 2014-09-29a "Hrun", but should work on any recent version. Exporting users only works on 2014-09-29a or newer (see below). 23 | * MediaWiki 1.13 or newer (ie any recent version, 1.13 is from *2008*!) 24 | 25 | Yamdwe has now been used successfully on many wikis of various sizes. If you've used it on a particularly large or unusual wiki, please let me know! 26 | 27 | # Requirements 28 | 29 | * Python 2.7 or newer (Python 3 not supported by all dependencies at time of writing.) 30 | * [requests module](http://docs.python-requests.org/en/latest/) 31 | * [simplemediawiki module](http://pythonhosted.org/simplemediawiki/) 32 | * [mwlib module](http://mwlib.readthedocs.org/en/latest/index.html) 33 | 34 | ## If exporting users is required 35 | 36 | * [Python MySQLDb](http://sourceforge.net/projects/mysql-python/) 37 | 38 | # Using yamdwe 39 | 40 | ## Installation "the Python way" 41 | 42 | Note: It's strongly recommended to use a [virtualenv](https://virtualenv.pypa.io/en/latest/) environment to keep yamdwe's libraries isolated from the rest of your system. yamdwe has over 20 package dependencies including some very specific versions to support mwlib. Good introductory posts about virtualenv can be found [here](http://davedash.com/tutorial/virtualenv/) and [also here](http://www.dabapps.com/blog/introduction-to-pip-and-virtualenv-python/). You may want to check out [virtualenvwrapper](http://virtualenvwrapper.readthedocs.org/), which provides handy shortcuts for common virtualenv operations. 43 | 44 | Once the virtualenv is made and activated: 45 | 46 | pip install -r requirements.txt 47 | 48 | ## Alternative Installation for Debian/Ubuntu Linux 49 | 50 | Installing everything via pip as shown above means compiling some 51 | common packages from source. Here's an alternative set of commands to 52 | set up a virtualenv on Debian/Ubuntu, but with some common packages 53 | installed into the main system: 54 | 55 | sudo apt-get install python-mysqldb python-pip python-lxml python-requests python-dev python-virtualenv 56 | virtualenv --system-site-packages -p python2.7 env 57 | source env/bin/activate 58 | pip install simplemediawiki==1.2.0b2 59 | pip install -i http://pypi.pediapress.com/simple/ mwlib 60 | 61 | (Once done working with yamdwe, run `deactivate` to leave the virtualenv, `source env/bin/activate` again to re-enter it). 62 | 63 | If the installation of mwlib fails install the packages recommended for mwlib https://mwlib.readthedocs.io/en/latest/installation.html#ubuntu-install 64 | 65 | sudo apt-get install -y gcc g++ make python python-dev python-virtualenv \ 66 | libjpeg-dev libz-dev libfreetype6-dev liblcms-dev \ 67 | libxml2-dev libxslt-dev \ 68 | ocaml-nox git-core \ 69 | python-imaging python-lxml \ 70 | texlive-latex-recommended ploticus dvipng imagemagick \ 71 | pdftk 72 | 73 | ## Set up Dokuwiki 74 | 75 | If you're creating a new DokuWiki then set up your 76 | [DokuWiki](http://dokuwiki.org) installation and perform the initial 77 | installation steps (name the wiki, set up an admin user, etc.) You can 78 | also use yamdwe with an existing wiki, but any existing content with 79 | the same name will be overwritten. 80 | 81 | ## Exporting pages & images 82 | 83 | To start an export, you will need the URL of the mediawiki API (usually http://mywiki/wiki/api.php or similar) and the local path to the Dokuwiki installation. 
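A concrete invocation, with a purely illustrative API URL and install path, might look like `./yamdwe.py http://mywiki/wiki/api.php /var/www/dokuwiki`; the general form is: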
84 | 85 |     yamdwe.py MEDIAWIKI_API_URL DOKUWIKI_ROOT_PATH 86 | 87 | If you need to log in to your Mediawiki install (either with a Mediawiki username, plus the domain name if your wiki uses domain logins, or via HTTP Basic Auth) then run `yamdwe.py -h` to view the command line options for authentication. 88 | 89 | Domain functionality is added through the "develop" branch of this [simplemediawiki fork](https://github.com/BlackLotus/python-simplemediawiki/tree/develop) and can be used as follows: 90 | 91 |     yamdwe.py --wiki_domain WIKI_DOMAIN MEDIAWIKI_API_URL DOKUWIKI_ROOT_PATH 92 | 93 | If the export goes well it should print the names of pages and images as it exports them, and finally print "Done". This process can be slow, and can put significant load on the Mediawiki server for large wikis. 94 | 95 | Yamdwe may warn you at the end that it is unable to set [correct permissions for the Dokuwiki data directories and files](https://www.dokuwiki.org/install:permissions) - regardless, you should check and correct these manually. 96 | 97 | Inevitably some content will not import cleanly, so a manual check/edit/cleanup pass is almost certainly necessary. 98 | 99 | ## Dokuwiki Plugin Features 100 | 101 | Yamdwe supports some features in Mediawiki that aren't supported by a base Dokuwiki installation. To display these elements, Dokuwiki plugins are required: 102 | 103 | * `<blockquote>
` tags in Mediawiki can use the [blockquote plugin](https://www.dokuwiki.org/plugin:blockquote) in Dokuwiki. 104 | * `` tags in Mediawiki can use the [MathJax plugin](https://www.dokuwiki.org/plugin:mathjax) (or similar) in Dokuwiki. 105 | 106 | You only need the plugins for any features that you are using in your Mediawiki and want to keep using as-is. 107 | 108 | ## Exporting users 109 | 110 | This step is optional, but it's nice as it matches the user names in the imported revision history with actual users in dokuwiki. 111 | 112 | For this step you need access to the MySQL database backing the mediawiki install, and local access to the dokuwiki root directory. 113 | 114 | An example usage looks like this: 115 | 116 | ./yamdwe_users.py -u mediawiki --prefix wiki_ /srv/www/dokuwiki/ 117 | 118 | Run yamdwe_users.py with "-h" to see all options: 119 | 120 | Any settings you're unsure about (like `--prefix` for table prefix) 121 | can be found in the LocalSettings.php file of your Mediawiki 122 | installation. 123 | 124 | yamdwe_users exports mediawiki password hashes to a dokuwiki "basicauth" text file. These imported passwords require Dokuwiki version 2014-09-29 "Hrun" or newer. On older Dokuwiki installs the password file format is not compatible and it will break user auth. The best thing to do is to update to 2014-09-29 or newer before running `yamdwe_users.py`. 125 | 126 | ## Post Import Steps 127 | 128 | * After the export please check for [correct permissions](https://www.dokuwiki.org/install:permissions) on 129 | the dokuwiki `data/conf/users.auth.php` file and other data/conf files. 130 | 131 | * The search index needs to be manually rebuilt with the contents of the new pages. The [searchindex plugin](https://www.dokuwiki.org/plugin:searchindex) can do this. 132 | 133 | ## Common Manual Cleanup Items 134 | 135 | * Page naming and namespaces will probably need some rearranging/renaming to seem "natural" in Dokuwiki. The [move Plugin](https://www.dokuwiki.org/plugin:move) makes this straightforward. 136 | 137 | * Some uncommon URL schemes, such as `file://`, are not detected by Dokuwiki as links unless you [add a scheme.local.conf file as described here](https://www.dokuwiki.org/urlschemes) 138 | 139 | 140 | # Known Issues 141 | 142 | Please check the [Issues list on github](https://github.com/projectgus/yamdwe/issues) to see what's going on. 143 | 144 | If you do find a bug or have trouble exporting a wiki then please open an issue there and I (or other yamdwe users) can try and help you out. 145 | 146 | ## Submitting Good Bug Reports 147 | 148 | If the bug is with some Mediawiki markup that doesn't provide the expected Dokuwiki markup, for a good bug report please include: 149 | 150 | * Excerpt of the Mediawiki markup causing the problem. 151 | * Desired Dokuwiki markup output. 152 | * Actual (problematic) Dokuwiki output from yamdwe. 153 | 154 | ## Better Bug Reports? 155 | 156 | Want to put a huge smile on my face and get a massive karma dose by 157 | submitting an even better bug report? Are you comfortable using git & 158 | github? 159 | 160 | * Fork the yamdwe repository on github. 161 | * Add a test case directory under tests/ and place the problematic Mediawiki markup into a file `mediawiki.txt`, and the desired correct Dokuwiki output into `dokuwiki.txt`. 162 | * Run `wikicontent_tests.py` to verify that the incorrect output you expected is printed as part of the test failure. 163 | * Add a commmit which adds the new test case directory. 
164 | * Submit a Pull Request for the test failure. Use the Pull Request description field to explain the problem. 165 | 166 | ## Best Bug Reports? 167 | 168 | If you want to outclass even that bug report, your commit could also add a fix for the conversion problem in yamdwe, so all tests pass including the new one you added! *A+++ would accept Pull Request again!* 169 | 170 | Don't worry if you don't want to perform any extra steps though, any (polite) bug report is always welcome! 171 | -------------------------------------------------------------------------------- /dokuwiki.py: -------------------------------------------------------------------------------- 1 | """ 2 | Methods for exporting mediawiki pages & images to a dokuwiki data/ directory. 3 | 4 | Tested with Dokuwiki 2014-05-05 "Ponder Stibbons". 5 | 6 | Copyright (C) 2014 Angus Gratton 7 | Licensed under New BSD License as described in the file LICENSE. 8 | """ 9 | from __future__ import print_function, unicode_literals, absolute_import, division 10 | import os, os.path, gzip, shutil, re, requests, calendar, codecs, sys 11 | from requests.auth import HTTPBasicAuth 12 | import wikicontent 13 | import simplemediawiki 14 | import names 15 | 16 | class Exporter(object): 17 | def __init__(self, rootpath): 18 | 19 | # verify the dokuwiki rootpath exists 20 | self.root = rootpath 21 | if not os.path.isdir(rootpath): 22 | raise RuntimeError("Dokuwiki root path '%s' does not point to a directory" % rootpath) 23 | 24 | # check a 'data' directory exists, establish pathes for each subdirectory 25 | self.data = os.path.join(rootpath, "data") 26 | if not os.path.isdir(self.data): 27 | raise RuntimeError("Dokuwiki root path '%s' does not contain a data directory" % rootpath) 28 | 29 | # create meta, attic, pages subdirs if they don't exist (OK to have deleted them before the import) 30 | self.meta = os.path.join(self.data, "meta") 31 | self.attic = os.path.join(self.data, "attic") 32 | self.pages = os.path.join(self.data, "pages") 33 | for subdir in [ self.meta, self.attic, self.pages]: 34 | ensure_directory_exists(subdir) 35 | 36 | def write_pages(self, pages): 37 | """ 38 | Given 'pages' as a list of mediawiki pages with revisions attached, export them to dokuwiki pages 39 | """ 40 | for page in pages: 41 | self._convert_page(page) 42 | self._aggregate_changes(self.meta, "_dokuwiki.changes") 43 | 44 | def write_images(self, images, file_namespace, http_user=None, http_pass=None): 45 | """ 46 | Given 'images' as a list of mediawiki image metadata API entries, 47 | download and write out dokuwiki images. Does not bring over revisions. 48 | 49 | Images are all written to the file_namespace specified (file: by default), to match mediawiki. 50 | """ 51 | auth=None if http_user is None else HTTPBasicAuth(http_user, http_pass) 52 | file_namespace = file_namespace.lower() 53 | filedir = os.path.join(self.data, "media", file_namespace) 54 | ensure_directory_exists(filedir) 55 | filemeta = os.path.join(self.data, "media_meta", file_namespace) 56 | ensure_directory_exists(filemeta) 57 | for image in images: 58 | # download the image from the Mediawiki server 59 | print("Downloading %s... 
(%s)" % (image['name'], image['url'])) 60 | r = requests.get(image['url'], auth=auth) 61 | # write the actual image out to the data/file directory 62 | name = make_dokuwiki_pagename(image['name']) 63 | imagepath = os.path.join(filedir, name) 64 | with open(imagepath, "wb") as f: 65 | f.write(r.content) 66 | # set modification time appropriately 67 | timestamp = get_timestamp(image) 68 | os.utime(imagepath, (timestamp,timestamp)) 69 | # write a .changes file out to the media_meta/file directory 70 | changepath = os.path.join(filemeta, "%s.changes" % name) 71 | with codecs.open(changepath, "w", "utf-8") as f: 72 | fields = (str(timestamp), "::1", "C", u"%s:%s"%(file_namespace,name), "", "created") 73 | f.write(u"\t".join(fields) + "\r\n") 74 | # aggregate all the new changes to the media_meta/_media.changes file 75 | self._aggregate_changes(os.path.join(self.data, "media_meta"), "_media.changes") 76 | 77 | def _convert_page(self, page): 78 | """ Convert the supplied mediawiki page to a Dokuwiki page """ 79 | print("Converting %d revisions of page '%s'..." % 80 | (len(page["revisions"]), page['title'])) 81 | # Sanitise the mediawiki pagename to something matching the dokuwiki pagename convention 82 | full_title = make_dokuwiki_pagename(page['title']) 83 | 84 | # Mediawiki pagenames can contain namespace :s, convert these to dokuwiki / paths on the filesystem (becoming : namespaces in dokuwiki) 85 | subdir, pagename = os.path.split(full_title.replace(':','/')) 86 | pagedir = os.path.join(self.pages, subdir) 87 | metadir = os.path.join(self.meta, subdir) 88 | atticdir = os.path.join(self.attic, subdir) 89 | for d in pagedir, metadir, atticdir: 90 | ensure_directory_exists(d) 91 | 92 | # Walk through the list of revisions 93 | revisions = list(reversed(page["revisions"])) # order as oldest first 94 | for revision in revisions: 95 | is_current = (revision == revisions[-1]) 96 | is_first = (revision == revisions[0]) 97 | content = wikicontent.convert_pagecontent(full_title, revision["*"]) 98 | timestamp = get_timestamp(revision) 99 | comment = revision.get("comment", "").replace("\t", " ").split("\n")[0] 100 | # path to the .changes metafile 101 | changespath = os.path.join(metadir, "%s.changes"%pagename) 102 | # for current revision, create 'pages' .txt 103 | if is_current: 104 | txtpath = os.path.join(pagedir, "%s.txt"%pagename) 105 | with codecs.open(txtpath, "w", "utf-8") as f: 106 | f.write(content) 107 | os.utime(txtpath, (timestamp,timestamp)) 108 | # create gzipped attic revision 109 | atticname = "%s.%s.txt.gz" % (pagename, timestamp) 110 | atticpath = os.path.join(atticdir, atticname).encode("utf-8") 111 | with gzip.open(atticpath, "wb") as f: 112 | f.write(content.encode("utf-8")) 113 | os.utime(atticpath, (timestamp,timestamp)) 114 | # append entry to page's 'changes' metadata index 115 | with codecs.open(changespath, "w" if is_first else "a", "utf-8") as f: 116 | changes_title = full_title.replace("/", ":") 117 | fields = (str(timestamp), "::1", "C" if is_first else "E", changes_title, names.clean_user(revision["user"]), comment) 118 | print(u"\t".join(fields), file=f) 119 | 120 | 121 | def _aggregate_changes(self, metadir, aggregate): 122 | """ 123 | Rebuild the wiki-wide changelong from meta/ to meta/_dokuwiki.changes or 124 | from media_meta to media_meta/_media.changes 125 | 126 | This is a Pythonified version of https://www.dokuwiki.org/tips:Recreate_Wiki_Change_Log 127 | """ 128 | lines = [] 129 | for root, dirs, files in os.walk(metadir): 130 | for changesfile in files: 131 | 
if changesfile == aggregate or not changesfile.endswith(".changes"): 132 | continue 133 | with codecs.open(os.path.join(root,changesfile), "r", "utf-8") as f: 134 | lines += f.readlines() 135 | lines = sorted(lines, key=lambda r: int(r.split("\t")[0])) 136 | with codecs.open(os.path.join(metadir, aggregate), "w", "utf-8") as f: 137 | f.writelines(lines) 138 | 139 | def fixup_permissions(self): 140 | """ Fix permissions under the data directory 141 | 142 | This means applying the data directory's permissions and ownership to all underlying parts. 143 | 144 | If this fails due to insufficient privileges then it just prints a warning and continues on. 145 | """ 146 | stat = os.stat(self.data) 147 | try: 148 | for root, dirs, files in os.walk(self.data): 149 | for name in files: 150 | path = os.path.join(root, name) 151 | os.chmod(path, stat.st_mode & 0o666) 152 | os.chown(path, stat.st_uid, stat.st_gid) 153 | for name in dirs: 154 | path = os.path.join(root, name) 155 | os.chmod(path, stat.st_mode) 156 | os.chown(path, stat.st_uid, stat.st_gid) 157 | 158 | except OSError: 159 | print("WARNING: Failed to set permissions under the data directory (not owned by process?) May need to be manually fixed.") 160 | 161 | def invalidate_cache(self): 162 | """ Invalidate cached pages by updating modification date of a config file 163 | 164 | If this fails due to insufficient privileges then it just prints a warning and continues on. 165 | """ 166 | confpath = os.path.join(self.root, "conf", "local.php") 167 | try: 168 | os.utime('myfile', None) 169 | except OSError: 170 | print(CACHE_WARNING_MSG % confpath) 171 | 172 | CACHE_WARNING_MSG = """WARNING: Failed to invalidate page cache by updating config file timestamp. 173 | If pre-existing pages exist in Dokuwiki, run the following command (with sufficient privileges): 174 | touch "%s" 175 | """ 176 | 177 | def get_timestamp(node): 178 | """ 179 | Return a dokuwiki-Compatible Unix int timestamp for a mediawiki API page/image/revision 180 | """ 181 | dt = simplemediawiki.MediaWiki.parse_date(node['timestamp']) 182 | return int(calendar.timegm(dt.utctimetuple())) 183 | 184 | def ensure_directory_exists(path): 185 | if not os.path.isdir(path): 186 | os.makedirs(path) 187 | 188 | def make_dokuwiki_pagename(mediawiki_name): 189 | """ 190 | Convert a canonical mediawiki pagename to a dokuwiki pagename 191 | 192 | Any namespacing that is in the form of a / is replaced with a : 193 | """ 194 | result = mediawiki_name.replace(" ","_") 195 | # We have pages that have ':' in them - replace with underscores 196 | result = result.replace(':', '_') 197 | result = names.clean_id(camel_to_underscore(result)).replace("/",":") 198 | # Some of our mediawiki page names begin with a '/', which results in os.path.join assuming the page is an absolute path. 
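    # e.g. a hypothetical title "/Scratch Pad" has become ":scratch_pad" by this point, so strip the leading colon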
199 | if result[0] == ':': 200 | result = result.lstrip(':') 201 | # Fix any pages that began with a space, because that breaks dokuwiki 202 | result = result.replace(":_", ":") 203 | result = codecs.encode(result, sys.getfilesystemencoding(), "replace") 204 | return result 205 | 206 | def make_dokuwiki_heading_id(mw_heading_name): 207 | """ 208 | Convert a Mediawiki internal anchor heading link to the Dokuwiki anchor heading link id 209 | 210 | Equivalent function in dokuwiki is _headerToLink in inc/parser/xhtml.php 211 | which calls sectionID in inc/pageutils.php 212 | """ 213 | result = names.clean_id(mw_heading_name, True) 214 | result = re.sub(r'[:.]', '', result) 215 | 216 | nums_stripped = result.lstrip("0123456789_-") 217 | if len(nums_stripped): 218 | return nums_stripped 219 | else: 220 | return "section"+re.sub(r"[^0-9]+", "", result) 221 | 222 | def camel_to_underscore(camelcase): 223 | """ 224 | Convert a camelcased string to underscore_delimited (tweaked from this StackOverflow answer) 225 | http://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-camel-case 226 | """ 227 | s1 = re.sub('(^/_)([A-Z][a-z]+)', r'\1_\2', camelcase) 228 | s2 = re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() 229 | return s2 230 | -------------------------------------------------------------------------------- /wikicontent.py: -------------------------------------------------------------------------------- 1 | """ 2 | Methods for converting Mediawiki content to the Dokuwiki format. 3 | 4 | Uses mwlib to parse the Mediawiki markup. 5 | 6 | Copyright (C) 2014 Angus Gratton 7 | Licensed under New BSD License as described in the file LICENSE. 8 | """ 9 | from __future__ import print_function, unicode_literals, absolute_import, division 10 | import re, string, dokuwiki, visitor 11 | from mwlib.parser import * 12 | from mwlib import uparser 13 | 14 | # Regex to match any known File: namespace (can be updated based on the mediawiki installation language) 15 | mw_file_namespace_aliases = re.compile("^(Image|File):", re.IGNORECASE) 16 | dw_file_namespace = "File:" 17 | 18 | def set_file_namespaces(canonical_alias, aliases): 19 | """ 20 | Allow the mediawiki parser to match localised namespaces for files/images 21 | 22 | Arguments: 23 | canonical_alias is the single namespace that dokuwiki will use (default File:) 24 | aliases is a list of alternative namespace names that will be converted to the canonical alias 25 | """ 26 | global mw_file_namespace_aliases 27 | global dw_file_namespace 28 | dw_file_namespace = canonical_alias + ":" 29 | mw_file_namespace_aliases = re.compile("^(%s):" % "|".join(aliases), re.IGNORECASE) 30 | 31 | def is_file_namespace(target): 32 | """ 33 | Is this target URL part of a known File or Image path? 34 | """ 35 | return re.match(mw_file_namespace_aliases, target) 36 | 37 | def canonicalise_file_namespace(target): 38 | """ 39 | Convert any known File: or Image: (or alias) namespace link to be File: 40 | ... mediawiki stores all these under a common namespace, so dokuwiki has no choice but to import 41 | them all under a single canonical 42 | """ 43 | return re.sub(mw_file_namespace_aliases, dw_file_namespace, target) 44 | 45 | def convert_pagecontent(title, content): 46 | """ 47 | Convert a string in Mediawiki content format to a string in 48 | Dokuwiki content format. 
49 | """ 50 | 51 | # this is a hack for mwlib discarding the content of <nowiki> tags 52 | # and replacing them with plaintext parsed HTML versions of the 53 | # content (pragmatic, but not what we want) 54 | nowiki_plaintext = [] 55 | 56 | # Instead we save the content here, replace it with the "magic" placeholder 57 | # tag <__yamdwe_nowiki> and the index where the content was saved, then pass 58 | # the list of nowiki content into the parser as context. 59 | def add_nowiki_block(match): 60 | nowiki_plaintext.append(match.group(0)) 61 | return "<__yamdwe_nowiki>%d" % (len(nowiki_plaintext)-1,) 62 | content = re.sub(r"<nowiki>.+?</nowiki>", add_nowiki_block, content) 63 | 64 | root = uparser.parseString(title, content) # create parse tree 65 | context = {} 66 | context["list_stack"] = [] 67 | context["nowiki_plaintext"] = nowiki_plaintext # hacky way of attaching to child nodes 68 | result = convert(root, context, False) 69 | 70 | # mwlib doesn't parse NOTOC, so check for it manually 71 | if re.match(r"^\s*__NOTOC__\s*$", content, re.MULTILINE): 72 | result = "~~NOTOC~~"+("\n" if not result.startswith("\n") else "")+result 73 | return result 74 | 75 | def convert_children(node, context): 76 | """Walk the children of this parse node and call convert() on each. 77 | """ 78 | result = "" 79 | for child in node.children: 80 | res = convert(child, context, result.endswith("\n")) 81 | if type(res) is str: 82 | res = unicode(res) 83 | if type(res) is not unicode: 84 | print("Got invalid response '%s' when processing '%s'" % (res,child)) 85 | result += res 86 | return result 87 | 88 | @visitor.when(Article) 89 | def convert(node, context, trailing_newline): 90 | return convert_children(node, context) 91 | 92 | @visitor.when(Paragraph) 93 | def convert(node, context, trailing_newline): 94 | return convert_children(node, context) + "\n" 95 | 96 | @visitor.when(Text) 97 | def convert(text, context, trailing_newline): 98 | if text._text is None: 99 | return "" 100 | m = re.match(r"<__yamdwe_nowiki>([0-9]+)", text._text) 101 | if m is not None: # nowiki content! 102 | index = int(m.group(1)) 103 | return context["nowiki_plaintext"][index] # nowiki_plaintext entry includes the <nowiki> tags 104 | else: 105 | return text.caption 106 | 107 | @visitor.when(Section) 108 | def convert(section, context, trailing_newline): 109 | result = "" 110 | if section.tagname == "p": 111 | pass 112 | elif section.tagname == "@section": 113 | level = section.level 114 | heading = convert(section.children.pop(0), context, trailing_newline).strip() 115 | heading_boundary = "="*(8-level) 116 | result = "\n%s %s %s\n" % (heading_boundary, heading, heading_boundary) 117 | else: 118 | print("Unknown tagname %s" % section.tagname) 119 | 120 | return result + convert_children(section, context) 121 | 122 | @visitor.when(Style) 123 | def convert(style, context, trailing_newline): 124 | formatter = { 125 | ";" : ("**", r"**\\"), # definition (essentially boldface) 126 | "''" : ("//", "//"), # italics 127 | "'''" :("**", "**"), # boldface 128 | ":" : ("", ""), # other end of a definition??? 129 | "sub" : ("<sub>","</sub>"), 130 | "sup" : ("<sup>","</sup>"), 131 | "big" : ("**", "**"), # not in dokuwiki so use bold 132 | "-" : ("<blockquote>", "</blockquote>"), # use dokuwiki's Blockquote Plugin for this 133 | "u" : ("", ""), # <u> already handled in TagNode @visitor 134 | "s" : ("<del>", "</del>") # According to the mediawiki docs <s>..</s> is synonymous with <del>...</del> (although one is treated as a tag and one a style in the parser??) 135 | }.get(style.caption, None) 136 | if formatter is None: 137 | print("WARNING: Ignoring unknown formatter %s" % style.caption) 138 | formatter = ("","") 139 | return formatter[0] + convert_children(style, context) + formatter[1] 140 | 141 | @visitor.when(NamedURL) 142 | def convert(url, context, trailing_newline): 143 | text = convert_children(url, context).strip(" ") 144 | url = url.caption 145 | if len(text): 146 | return u"[[%s|%s]]" % (url, text) 147 | else: 148 | return u"%s" % (url) 149 | 150 | @visitor.when(URL) 151 | def convert(url, context, trailing_newline): 152 | return url.caption 153 | 154 | @visitor.when(ImageLink) 155 | def convert(link, context, trailing_newline): 156 | suffix = "" 157 | if link.width is not None: 158 | if link.height is None: 159 | suffix = "?%s" % link.width 160 | else: 161 | suffix = "?%sx%s" % (link.width, link.height) 162 | else: 163 | try: 164 | if link.in_gallery: # see below for Tag->gallery handling 165 | suffix = "?160" # gallery images should be thumbnailed 166 | except AttributeError: 167 | pass # not in a gallery 168 | prealign = " " if link.align in [ "center", "right" ] else "" 169 | postalign = " " if link.align in [ "center", "left" ] else "" 170 | target = canonicalise_file_namespace(link.target) 171 | target = convert_internal_link(target) 172 | return "{{%s%s%s%s}}" % (prealign, target, suffix, postalign) 173 | 174 | @visitor.when(ArticleLink) 175 | def convert(link, context, trailing_newline): 176 | text = convert_children(link, context).strip(" ") 177 | pagename = convert_internal_link(link.target) 178 | if len(text): 179 | return u"[[%s|%s]]" % (pagename, text) 180 | else: 181 | return u"[[%s]]" % pagename 182 | 183 | @visitor.when(CategoryLink) 184 | def convert(link, context, trailing_newline): 185 | # Category functionality can be implemented with plugin:tag, but not used here 186 | return "" 187 | 188 | @visitor.when(NamespaceLink) 189 | def convert(link, context, trailing_newline): 190 | if is_file_namespace(link.target): # is a link to a file or image 191 | filename = dokuwiki.make_dokuwiki_pagename(canonicalise_file_namespace(link.target)) 192 | caption = convert_children(link, context).strip() 193 | if len(caption) > 0: 194 | return u"{{%s|%s}}" % (filename, caption) 195 | else: 196 | return u"{{%s}}" % filename 197 | 198 | print("WARNING: Ignoring namespace link to " + link.target) 199 | return convert_children(link, context) 200 | 201 | 202 | @visitor.when(ItemList) 203 | def convert(itemlist, context, trailing_newline): 204 | context["list_stack"].append("* " if itemlist.tagname == "ul" else "- ") 205 | converted_list = convert_children(itemlist, context) 206 | context["list_stack"].pop() 207 | return converted_list 208 | 209 | @visitor.when(Item) 210 | def convert(item, context, trailing_newline): 211 | item_content = convert_children(item, context) 212 | list_stack = context["list_stack"] 213 | return "  "*len(list_stack) + list_stack[-1] + item_content 214 | 215 | @visitor.when(Table) 216 | def convert(table, context, trailing_newline): 217 | # we ignore the actual Table tags, instead convert each Row & Cell individually 218 | return convert_children(table, context) 219 | 220 | @visitor.when(Cell) 221 | def convert(cell, context, trailing_newline): 222 | marker = "^" if cell.tagname == "th" else "|" 223 | result = u"%s %s" % (marker, convert_children(cell, context).replace('\n','').strip()) 224 | return result 225 | 226 | @visitor.when(Row) 227 | def convert(row, context, trailing_newline): 228 | return convert_children(row, context) + " |\n" 229 | 230 | @visitor.when(PreFormatted) 231 | def convert(pre, context, trailing_newline): 232 | in_list = len(context["list_stack"]) > 0 233 | if trailing_newline and not in_list: # in its own paragraph, use a two space indent 234 | return "  " + convert_children(pre, context).replace("\n","\n  ").strip(" ") 235 | else: # inline in a list or a paragraph body, use <code> tags 236 | return "<code>" + convert_children(pre, context) + "</code>" 237 | 238 | @visitor.when(TagNode) 239 | def convert(tag, context, trailing_newline): 240 | # dict maps mediawiki tag name to tuple of starting, ending dokuwiki tag 241 | simple_tagitems = { 242 | "tt" : ("''", "''"), 243 | "ref" : ("((","))"), # references converted to footnotes 244 | "code" : ("<code>","</code>"), 245 | "del": ("<del>", "</del>"), 246 | } 247 | if tag.tagname in simple_tagitems: 248 | pre,post = simple_tagitems[tag.tagname] 249 | return pre + convert_children(tag, context) + post 250 | elif tag._text is not None: 251 | if tag._text.replace(" ","").replace("/","") == "<br>": 252 | return "\n" # this is a one-off hack for one wiki page covered in <br/> tags
253 | return tag._text # may not work for non-self-closing tags 254 | elif tag.tagname == "gallery": 255 | # with a lot of cleverness we could use the gallery plugin for this, 256 | # but this should do. We flag each child image as being in the gallery, then 257 | # deal with the ImageLinks above 258 | for child in tag.children: 259 | child.in_gallery = True 260 | elif tag.tagname == "references": 261 | print("WARNING: <references/> tag has no equivalent in Dokuwiki, ignoring...") 262 | 263 | return convert_children(tag, context) 264 | 265 | @visitor.when(Math) 266 | def convert(node, context, trailing_newline): 267 | """ 268 | Convert <math> tags for rendering of math terms 269 | there are a couple of extensions to support this in dokuwiki 270 | tested successfully with the MathJax plugin 271 | """ 272 | if "\n" in node.math: 273 | # multiple lines are a formula block 274 | return "$$" + node.math + "$$" 275 | # anything else is an inline term 276 | return "$" + node.math + "$" 277 | 278 | 279 | @visitor.when(Caption) 280 | def convert(node, context, trailing_newline): 281 | """ 282 | Convert table captions to a bold paragraph preceding the table. 283 | 284 | Because we ignore the <table> tags when converting to dokuwiki, 285 | we can get away with simply converting to bold text without 286 | worrying about it being inside <table> (which <caption> should be) 287 | in the rendered HTML. 288 | """ 289 | return "** %s **\n" % convert_children(node, context) 290 | 291 | # catchall for Node, which is the parent class of everything above 292 | @visitor.when(Node) 293 | def convert(node, context, trailing_newline): 294 | if node.__class__ != Node: 295 | print("WARNING: Unsupported node type: %s" % (node.__class__)) 296 | return convert_children(node, context) 297 | 298 | def convert_internal_link(mw_target): 299 | """ 300 | Convert an internal Mediawiki link, with or without an anchor # in the middle. 301 | 302 | Same as converting a plain pagename, except that we want to preserve any #s in the target text. 303 | """ 304 | if "#" in mw_target: 305 | page,anchor = mw_target.split("#",1) 306 | else: 307 | page = mw_target 308 | anchor = None 309 | if len(page): 310 | page = dokuwiki.make_dokuwiki_pagename(page) 311 | if anchor is not None: 312 | page = page + "#" + dokuwiki.make_dokuwiki_heading_id(anchor) 313 | return page 314 | --------------------------------------------------------------------------------
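A minimal usage sketch (illustrative only, not a file in this repository), assuming mwlib and the project's other dependencies are installed, and that names.clean_id lowercases ids and strips characters Dokuwiki does not allow:

    import dokuwiki, wikicontent

    # Page names: spaces and ':' become underscores, '/' becomes a namespace separator.
    print(dokuwiki.make_dokuwiki_pagename("Some Page/SubPage"))
    # expected to be roughly: some_page:sub_page

    # Internal links keep their heading anchor, with the anchor id converted separately.
    print(wikicontent.convert_internal_link("Some Page#My Heading"))
    # expected to be roughly: some_page#my_heading

    # Whole-page conversion: a level-two Mediawiki heading becomes a six-'=' Dokuwiki heading.
    print(wikicontent.convert_pagecontent("demo", "== Section =="))
    # expected to contain roughly: ====== Section ======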