├── .gitignore
├── README.md
├── composer.json
└── convert.php

/.gitignore:
--------------------------------------------------------------------------------
composer.phar
composer.lock
vendor
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
MediaWiki to Markdown
=====================

Convert MediaWiki pages to GitHub flavored Markdown (or any other format supported by Pandoc). The conversion uses an XML export from MediaWiki and converts each wiki page to an individual Markdown file. Directory structures are preserved. The generated export can also include frontmatter for GitHub Pages.

You may also be interested in a forked version of this codebase, available at https://github.com/outofcontrol/mediawiki-to-gfm

## Requirements

* PHP
* Pandoc


## Export MediaWiki Pages

Export all of your pages as a single XML file by following these steps: http://en.wikipedia.org/wiki/Help:Export


## Installation

### Install Pandoc

http://johnmacfarlane.net/pandoc/installing.html


### Get Composer

`curl -sS https://getcomposer.org/installer | php`


### Install Composer Packages

`php composer.phar install`


## Run

#### --filename

The only required parameter is `filename`, the name of the XML file you exported from MediaWiki, e.g.:

`php convert.php --filename=mediawiki.xml`

#### --output

You can also use `output` to specify an output folder, since each wiki page in the XML file generates its own separate Markdown file.

`php convert.php --filename=mediawiki.xml --output=export`

#### --indexes

You can set `indexes` to `true` if you want pages with the same name as a directory to be renamed to `index.md` and placed inside that directory.

`php convert.php --filename=mediawiki.xml --output=export --indexes=true`

#### --frontmatter

You can specify whether you want frontmatter included. This is automatically set to `true` when the output format is `markdown_github`.

`php convert.php --filename=mediawiki.xml --output=export --format=markdown_phpextra --frontmatter=true`
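When frontmatter is enabled, each generated file starts with a short YAML block built from the wiki page title, which GitHub Pages (Jekyll) reads for the page title and permalink. For a hypothetical page named "Main Page", the top of the generated file would look like this:

```
---
title: Main Page
permalink: /Main_Page/
---
```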
#### --format

You can specify a different output format with `format`. The default is `markdown_github`. See the list of supported formats below.

`php convert.php --filename=mediawiki.xml --output=export --format=markdown_phpextra`

Supported Pandoc formats are:

* asciidoc
* beamer
* context
* docbook
* docx
* dokuwiki
* dzslides
* epub
* epub3
* fb2
* haddock
* html
* html5
* icml
* json
* latex
* man
* markdown
* markdown_github
* markdown_mmd
* markdown_phpextra
* markdown_strict
* mediawiki
* native
* odt
* opendocument
* opml
* org
* plain
* revealjs
* rst
* rtf
* s5
* slideous
* slidy
* texinfo
* textile
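## How the conversion works

The conversion itself is done by Pandoc through the `ryakad/pandoc-php` wrapper that Composer installs (see `composer.json`). As a rough standalone sketch of that call, assuming Pandoc is on your `PATH` and `php composer.phar install` has been run:

```php
<?php
require 'vendor/autoload.php';

// Convert a snippet of MediaWiki markup to GitHub flavored Markdown using the
// same runWith() call that convert.php makes for each exported page.
$pandoc = new Pandoc\Pandoc();

$wikiText = "== Heading ==\n'''Bold''' text and a [[Main Page|link]].";

$markdown = $pandoc->runWith($wikiText, array(
    'from' => 'mediawiki',
    'to'   => 'markdown_github',
));

echo $markdown . PHP_EOL;
```

`convert.php` adds the XML parsing, link rewriting, optional frontmatter, and per-page file output on top of this call.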
--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
{
    "require": {
        "ryakad/pandoc-php": "dev-master"
    }
}
--------------------------------------------------------------------------------
/convert.php:
--------------------------------------------------------------------------------
<?php
// NOTE: the setup below is a minimal reconstruction based on the README options
// and the variables used by the rest of this script.
require 'vendor/autoload.php';

// Parse --key=value command line options (see the README)
$arguments = arguments($argv);

if (empty($arguments['filename'])) {
    die('Required argument missing, e.g. --filename=mediawiki.xml' . PHP_EOL);
}
$filename = $arguments['filename'];

$output_path = empty($arguments['output']) ? '.' : $arguments['output'];
$format      = empty($arguments['format']) ? 'markdown_github' : $arguments['format'];

// Frontmatter is added automatically for markdown_github, or when --frontmatter is set
$add_meta = ($format == 'markdown_github') || !empty($arguments['frontmatter']);

// Load the MediaWiki XML export
$contents = file_get_contents($filename);
if ($contents === false) {
    die("Unable to read file: $filename" . PHP_EOL);
}

// Drop the default namespace declaration so the xpath() queries below match
$contents = preg_replace('/\sxmlns="[^"]+"/', '', $contents, 1);
$xml = new SimpleXMLElement($contents);

$result = $xml->xpath('page');
$count = 0;
$directory_list = array();

// Iterate through each exported wiki page
foreach ($result as $node) {

    $title = $node->xpath('title');
    $title = $title[0];
    $url = str_replace(' ', '_', $title);

    if ($slash = strpos($url, '/')) {
        $title = str_replace('/', ' ', $title);
        $directory = substr($url, 0, $slash);
        $filename = substr($url, $slash + 1);
        $directory_list[$directory] = true;
    } else {
        $directory = '';
        $filename = $url;
    }

    $text = $node->xpath('revision/text');
    $text = $text[0];
    $text = html_entity_decode($text); // decode inline html
    $text = preg_replace_callback('/\[\[(.+?)\]\]/', "new_link", $text); // adds leading slash to links, "absolute-path reference"

    // prepare to prepend page title frontmatter to text
    if ($add_meta) {
        $frontmatter = "---\n";
        $frontmatter .= "title: $title\n";
        $frontmatter .= "permalink: /$url/\n";
        $frontmatter .= "---\n\n";
    }

    $pandoc = new Pandoc\Pandoc();
    $options = array(
        "from" => "mediawiki",
        "to"   => $format
    );
    $text = $pandoc->runWith($text, $options);

    $text = str_replace('\_', '_', $text);

    if ($add_meta) {
        $text = $frontmatter . $text;
    }

    if (substr($output_path, -1) != '/') $output_path = $output_path . '/';

    $directory = $output_path . $directory;

    // create directory if necessary
    if (!empty($directory)) {
        if (!file_exists($directory)) {
            mkdir($directory);
        }

        $directory = $directory . '/';
    }

    // create file
    $file = fopen(normalizePath($directory . $filename . '.md'), 'w');
    fwrite($file, $text);
    fclose($file);

    $count++;

}


// Rename and move files with the same name as directories
if (!empty($directory_list) && !empty($arguments['indexes'])) {

    $directory_list = array_keys($directory_list);

    foreach ($directory_list as $directory_name) {

        if (file_exists($output_path . $directory_name . '.md')) {
            rename($output_path . $directory_name . '.md', $output_path . $directory_name . '/index.md');
        }
    }

}

if ($count > 0) {
    echo "$count files converted" . PHP_EOL . PHP_EOL;
}


// Parse command line arguments of the form --name=value (or -f for boolean flags)
function arguments($argv) {
    $_ARG = array();
    foreach ($argv as $arg) {
        if (preg_match('/--([^=]+)=(.*)/', $arg, $reg)) {
            $_ARG[$reg[1]] = $reg[2];
        } elseif (preg_match('/-([a-zA-Z0-9])/', $arg, $reg)) {
            $_ARG[$reg[1]] = 'true';
        }
    }
    return $_ARG;
}


// preg_replace_callback handler: rewrite [[wiki links]] as absolute-path references
function new_link($matches) {
    if (strpos($matches[1], '|') === false) {
        $new_link = str_replace(' ', '_', $matches[1]);
        return "[[/$new_link|{$matches[1]}]]";
    } else {
        $link = trim(substr($matches[1], 0, strpos($matches[1], '|')));
        $link = '/' . str_replace(' ', '_', $link);

        $link_text = trim(substr($matches[1], strpos($matches[1], '|') + 1));

        return "[[$link|$link_text]]";
    }
}


// Borrowed from http://php.net/manual/en/function.realpath.php
function normalizePath($path)
{
    $parts = array();                          // Array to build a new path from the good parts
    $path = str_replace('\\', '/', $path);     // Replace backslashes with forward slashes
    $path = preg_replace('/\/+/', '/', $path); // Combine multiple slashes into a single slash
    $segments = explode('/', $path);           // Collect path segments
    $test = '';                                // Initialize testing variable
    foreach ($segments as $segment)
    {
        if ($segment != '.')
        {
            $test = array_pop($parts);
            if (is_null($test))
                $parts[] = $segment;
            else if ($segment == '..')
            {
                if ($test == '..')
                    $parts[] = $test;
                if ($test == '..' || $test == '')
                    $parts[] = $segment;
            }
            else
            {
                $parts[] = $test;
                $parts[] = $segment;
            }
        }
    }
    return implode('/', $parts);
}

?>
--------------------------------------------------------------------------------
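Before each page is handed to Pandoc, `convert.php` rewrites wiki links so they become absolute-path references (see the `new_link()` callback). A self-contained sketch of that rewriting, with the callback logic inlined for illustration:

```php
<?php
// Mirrors the new_link() callback in convert.php: bare [[Page Name]] links gain
// a leading slash and an underscored target, while piped [[Target|label]] links
// keep their label and get a normalized target.
function rewrite_wiki_links($text) {
    return preg_replace_callback('/\[\[(.+?)\]\]/', function ($matches) {
        if (strpos($matches[1], '|') === false) {
            $link = str_replace(' ', '_', $matches[1]);
            return "[[/$link|{$matches[1]}]]";
        }
        $link  = '/' . str_replace(' ', '_', trim(substr($matches[1], 0, strpos($matches[1], '|'))));
        $label = trim(substr($matches[1], strpos($matches[1], '|') + 1));
        return "[[$link|$label]]";
    }, $text);
}

echo rewrite_wiki_links('See [[Main Page]] and [[Help:Export|the export help]].') . PHP_EOL;
// Prints: See [[/Main_Page|Main Page]] and [[/Help:Export|the export help]].
```

The leading slash turns every link into an absolute-path reference, so links still resolve after pages are written into subdirectories.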