├── .env.example
├── .gitignore
├── Makefile
├── README.md
├── app
│   ├── console
│   └── export_users
├── composer.json
├── composer.lock
├── data
│   └── missed.yml
├── lib
│   ├── loader.php
│   └── mediawiki.php
├── reports
│   ├── 1_nginx.map
│   ├── 2_nginx_almost_same_casing.map
│   ├── 3_nginx_almost_same_1.map
│   ├── 3_nginx_almost_same_2.map
│   ├── 4_nginx_redirects_spaces.map
│   ├── directly_on_root.txt
│   ├── hundred_revs.txt
│   ├── location_spaghetti.txt
│   ├── location_spaghetti_duplicated.txt
│   ├── nginx_redirects.map
│   ├── numbers.txt
│   ├── redirects.txt
│   ├── redirects_sanity.txt
│   ├── summary.yml
│   ├── summary_meta.yml
│   ├── summary_wpd.yml
│   ├── translations.txt
│   ├── url_all.txt
│   ├── url_parts.txt
│   └── url_parts_variants.txt
└── src
    └── WebPlatform
        └── Importer
            ├── Commands
            │   ├── AbstractImporterCommand.php
            │   ├── CacheWarmerCommand.php
            │   ├── RefreshPagesCommand.php
            │   ├── RunCommand.php
            │   └── SummaryCommand.php
            ├── Converter
            │   ├── HtmlToMarkdown.php
            │   └── MediaWikiToHtml.php
            ├── Filter
            │   └── TitleFilter.php
            ├── GitPhp
            │   ├── CommitCommandBuilder.php
            │   └── GitRepository.php
            ├── Helpers
            │   └── MediaWikiHelper.php
            └── Model
                ├── HtmlRevision.php
                ├── MarkdownRevision.php
                └── MediaWikiDocument.php

/.env.example:
--------------------------------------------------------------------------------
1 | # Set to your own, or use https://github.com/wikimedia/mediawiki-vagrant
2 | MEDIAWIKI_API_ORIGIN="https://docs.webplatform.org"
3 | COMMITER_ANONYMOUS_DOMAIN="docs.webplatform.org"
4 | 
5 | # Make sure this fits with your own MediaWiki instance
6 | MEDIAWIKI_USERID="10080"
7 | MEDIAWIKI_USERNAME="Renoirb"
8 | MEDIAWIKI_WIKINAME="wpwiki"
9 | 
10 | # Your session cookie value. With the example values shown here, the cookie
11 | # to look for is "...; wpwiki_session=foo; ...", and it is set like this:
12 | MEDIAWIKI_SESSION="foo"
13 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | data/
2 | vendor/
3 | phpunit.xml
4 | bin/
5 | src/WebPlatform/ContentConverter/
6 | out/
7 | *.sublime*
8 | errors/
9 | .env
10 | 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | SHELL := bash
2 | 
3 | dump:
4 | 	@if [[ -f ../mediawiki/maintenance/dumpBackup.php ]]; then\
5 | 	php ../mediawiki/maintenance/dumpBackup.php --full --filter=namespace:0,108 > data/dumps/main_full.xml;\
6 | 	php ../mediawiki/maintenance/dumpBackup.php --full --filter=namespace:3000 > data/dumps/wpd_full.xml;\
7 | 	php ../mediawiki/maintenance/dumpBackup.php --current --filter=namespace:0 > data/dumps/main.xml;\
8 | 	php ../mediawiki/maintenance/dumpBackup.php --current --filter=namespace:3000 > data/dumps/wpd.xml;\
9 | 	php ../mediawiki/maintenance/dumpBackup.php --current --filter=namespace:4 > data/dumps/project.xml;\
10 | 	php ../mediawiki/maintenance/dumpBackup.php --current --filter=namespace:2,200 > data/dumps/user.xml;\
11 | 	php ../mediawiki/maintenance/dumpBackup.php --current --filter=namespace:3020 > data/dumps/meta.xml;\
12 | 	app/export_users > data/users.json;\
13 | 	fi
14 | 
--------------------------------------------------------------------------------
/app/console:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env php
2 | run();
15 | 
--------------------------------------------------------------------------------
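The commands this console exposes are registered in lib/loader.php (shown further below); once Composer dependencies are installed and .env is configured, it is invoked like any Symfony Console application. A hedged usage sketch, with command names as registered in the loader:

    php app/console mediawiki:summary
    php app/console mediawiki:run 1

mediawiki:summary produces the reports/ files listed in the tree above; mediawiki:run takes the pass number documented in RunCommand.
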
/app/export_users:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env php
2 | data/users.json
21 |  *
22 |  * @author Renoir Boulanger
23 |  */
24 | 
25 | /**
26 |  * You can also adjust the path yourself.
27 |  */
28 | $cli = realpath(__DIR__ . '/../../mediawiki/maintenance/commandLine.inc');
29 | 
30 | if (!file_exists($cli)) {
31 |     throw new \Exception('Could not find MediaWiki code checkout in parent directory');
32 | }
33 | require $cli;
34 | 
35 | /**
36 |  * Export all user data into a big JSON string.
37 |  *
38 |  * Will be an array of objects keyed by user_id, looking like this:
39 |  *
40 |  * {"1": {
41 |  *    "user_email":"public-webplatform@w3.org",
42 |  *    "user_id":"1",
43 |  *    "user_name":"WikiSysop",
44 |  *    "user_real_name":"",
45 |  *    "user_email_authenticated": null
46 |  *  },
47 |  *  "21": {
48 |  *    "user_email":"foo@example.org",
49 |  *    "user_id":"21",
50 |  *    "user_name":"Foo",
51 |  *    "user_real_name":"Foo Bar-Baz",
52 |  *    "user_email_authenticated": true
53 |  *  }}
54 |  **/
55 | 
56 | // ref: https://www.mediawiki.org/wiki/Manual:Database_access
57 | $dbr = wfGetDB(DB_SLAVE);
58 | 
59 | // ref: https://www.mediawiki.org/wiki/Manual:User_table
60 | $id_list = $dbr->select('user', array('user_email', 'user_id','user_name','user_real_name','user_email_authenticated'));
61 | 
62 | $out = array();
63 | foreach ($id_list as $user_data) {
64 |     $data = (array) $user_data;
65 |     if (!array_key_exists($data['user_id'], $out)) {
66 |         $out[$data['user_id']] = $data;
67 |     } else {
68 |         throw new Exception('Duplicate user_id found!');
69 |     }
70 | }
71 | 
72 | echo json_encode($out);
73 | 
--------------------------------------------------------------------------------
/composer.json:
--------------------------------------------------------------------------------
1 | {
2 |     "name": "webplatform/mediawiki-conversion",
3 |     "description": "Convert MediaWiki XML backup into structured raw text file tree",
4 |     "type": "project",
5 |     "keywords": ["mediawiki","backupDump","exporter","migration"],
6 |     "license": "MIT",
7 |     "require": {
8 |         "webplatform/content-converter": "~1.2",
9 |         "prewk/xml-string-streamer": "^0.7.1",
10 |         "ryakad/pandoc-php": "~1.0",
11 |         "glicer/simply-html": "~1.0",
12 |         "symfony/filesystem": "~2.7",
13 |         "vlucas/phpdotenv": "~2.0",
14 |         "symfony/console": "~2.7",
15 |         "bit3/git-php": "~1.0"
16 |     },
17 |     "require-dev": {
18 |         "doctrine/annotations": "~1.2",
19 |         "fabpot/php-cs-fixer": "^1.9",
20 |         "phpunit/phpunit": "~4.7"
21 |     },
22 |     "authors": [
23 |         {
24 |             "name": "Renoir Boulanger",
25 |             "email": "hello@renoirboulanger.com"
26 |         }
27 |     ],
28 |     "repositories": [
29 |         {
30 |             "type": "git",
31 |             "url": "https://github.com/webplatform/content-converter.git"
32 |         },
33 |         {
34 |             "type": "git",
35 |             "url": "https://github.com/webplatform/mediawiki-conversion.git"
36 |         }
37 |     ],
38 |     "autoload": {
39 |         "psr-0": {
40 |             "WebPlatform\\Importer\\": "src/",
41 |             "WebPlatform\\ContentConverter\\": "src/WebPlatform/ContentConverter/lib/"
42 |         },
43 |         "files": [
44 |             "lib/mediawiki.php"
45 |         ]
46 |     },
47 |     "config": {
48 |         "bin-dir": "bin"
49 |     }
50 | }
51 | 
--------------------------------------------------------------------------------
/data/missed.yml:
--------------------------------------------------------------------------------
1 | #
2 | # missed entries to retry
3 | #
4 | # To use, pass the --missed argument (e.g. `app/console mediawiki:run 3 --missed`)
5 | #
6 | # Each entry is relative to the out/ directory. It's assumed that what is not in
7 | # MediaWiki’s main content namespace (e.g. 
Meta:Main_Page) will be in a separate git 8 | # repository but will end up accessible as a sub folder (e.g. Meta/Main_Page). 9 | # 10 | # https://docs.webplatform.org/w/index.php?action=edit&title=WPD:Getting_Started/examples 11 | # 12 | missed: 13 | - css/fr 14 | - Beginners/ja 15 | - javascript/RegExp 16 | - Main_Page/zh-hant 17 | - concepts/es 18 | - concepts/accessibility/ja 19 | - concepts/accessibility/es 20 | - tutorials/using_css_background_images/ja 21 | - Main_Page/chs 22 | - svg/attributes/clip-rule 23 | - css/cssom/CSSImportRule 24 | - css/cssom/properties 25 | - css/properties/background-position 26 | - html/elements/a 27 | - apis/indexeddb/IDBKeyRange/upperBound 28 | - tutorials/content-security-policy 29 | - css/properties/background-position 30 | - tutorials/mobile_mobifying 31 | - html/elements/table 32 | - Beginners/the_beginning 33 | - glossary/main 34 | - en 35 | - css 36 | - tutorials/css_transitions 37 | - css/properties/border-image-outset 38 | - css/cssom/properties/pixelWidth 39 | - concepts/accessibility 40 | - dom/DOMTokenList/length 41 | - dom/DomTokenList/item 42 | - dom/DataTransfer/files 43 | - apis/appcache/ApplicationCache/status 44 | - tutorials/table_styling_basics 45 | - apis/audio-video/TimeRanges/start 46 | - dom/HTMLCanvasElement 47 | - dom/HTMLDataElement 48 | - dom/HTMLMediaElement 49 | - dom/HTMLTrackElement 50 | - tutorials/eventsource_basics 51 | - tutorials/styling_xml_with_css 52 | - tutorials/debugging_css 53 | - tutorials/javascript_statements 54 | - concepts/programming/about_javascript 55 | - Accessibility_basics 56 | - Meta/HTML/Elements/spacer 57 | - Meta/Editors_Guide 58 | - Meta/Editors_Guide/content 59 | - Meta/svg_test 60 | - Meta/web_platform_wednesday/past_reports 61 | - WPD/Community/Meetings/General/Earlier 62 | - WPD/Community/Task_Force 63 | - WPD/Getting_Started/examples 64 | - apis/appcache/ApplicationCache 65 | - concepts/Internet_and_Web/mime_types 66 | - css/properties/border-radius 67 | - css/properties/font-size 68 | - css/selectors/pseudo-classes/not 69 | - dom/NamedNodeMap/removeNamedItem 70 | - dom/HTMLLabelElement 71 | - css/media_queries/media_groups/visual 72 | - dom/interface 73 | - tutorials/forms_html5forms 74 | - guides/html_forms_basics 75 | - guides/html_links 76 | - html/attributes/cellPadding 77 | - html/elements 78 | - html/elements/input/type/file 79 | - html/tutorials 80 | - svg/properties/cx_SVGRadialGradientElement 81 | - svg/tutorials/smarter_svg_filters 82 | - tutorials/html5_form_features 83 | -------------------------------------------------------------------------------- /lib/loader.php: -------------------------------------------------------------------------------- 1 | load(); 16 | $dotenv->required(['MEDIAWIKI_API_ORIGIN', 'COMMITER_ANONYMOUS_DOMAIN']); 17 | 18 | /** 19 | * Poor man project loader so we dont need 20 | * config files for such a small project 21 | **/ 22 | 23 | if ($console instanceof Application) { 24 | 25 | // Load all commands here directly 26 | $console->add(new RefreshPagesCommand()); 27 | $console->add(new CacheWarmerCommand()); 28 | $console->add(new SummaryCommand()); 29 | $console->add(new RunCommand()); 30 | 31 | } else { 32 | throw new \Exception('Did you require lib/loader.php AFTER bootstrapping the application?'); 33 | } 34 | -------------------------------------------------------------------------------- /lib/mediawiki.php: -------------------------------------------------------------------------------- 1 | 26 | */ 27 | abstract class AbstractImporterCommand extends 
Command
28 | {
29 |     /** @var WebPlatform\ContentConverter\Helpers\ApiRequestHelperInterface Conversion helper instance */
30 |     protected $apiHelper;
31 | 
32 |     /** @var WebPlatform\ContentConverter\Helpers\YamlHelper Yaml Helper instance */
33 |     protected $yaml;
34 | 
35 |     /** @var Symfony\Component\Filesystem\Filesystem Symfony Filesystem handler */
36 |     protected $filesystem;
37 | 
38 |     protected $users = [];
39 | 
40 |     protected $missed = [];
41 | 
42 |     protected function configure()
43 |     {
44 |         $helpText = 'What file to read from. Argument is relative to the data/ ';
45 |         $helpText .= 'folder of this project (e.g. dumps/wpd_full.xml would read from data/dumps/wpd_full.xml)';
46 | 
47 |         $this->addOption('xml-source', '', InputOption::VALUE_OPTIONAL, $helpText, 'dumps/main_full.xml');
48 |     }
49 | 
50 |     protected function execute(InputInterface $input, OutputInterface $output)
51 |     {
52 |         $this->yaml = new YamlHelper();
53 |         $this->filesystem = new Filesystem();
54 |         $this->initCookieString();
55 |     }
56 | 
57 |     /**
58 |      * Source XML file read stream factory.
59 |      *
60 |      * @param string $xmlSourcePath path where the XML file should be read from, relative to DATA_DIR
61 |      *
62 |      * @return Prewk\XmlStringStreamer An XML String stream
63 |      */
64 |     protected function sourceXmlStreamFactory($xmlSourcePath)
65 |     {
66 |         $file = realpath($xmlSourcePath);
67 |         if ($file === false) {
68 |             $message = 'Cannot run script, source XML file at "%s" could not be found';
69 |             throw new RuntimeException(sprintf($message, $xmlSourcePath));
70 |         }
71 | 
72 |         return XmlStringStreamer::createStringWalkerParser($file);
73 |     }
74 | 
75 |     /**
76 |      * Load Authors.
77 |      *
78 |      * Builds an array of MediaWikiContributor objects into $this->users[$uid],
79 |      * where $uid is the MediaWiki user_id.
80 |      *
81 |      * You may have to increase the memory_limit value,
82 |      * but we’ll load this only once.
83 |      **/
84 |     protected function loadUsers($usersSourcePath)
85 |     {
86 |         $file = realpath($usersSourcePath);
87 |         if ($file === false) {
88 |             $message = 'Cannot run script, source users file at "%s" could not be found';
89 |             throw new RuntimeException(sprintf($message, $usersSourcePath));
90 |         }
91 | 
92 |         $users_loop = json_decode(file_get_contents($file), 1);
93 | 
94 |         foreach ($users_loop as &$u) {
95 |             $uid = (int) $u['user_id'];
96 |             $this->users[$uid] = new MediaWikiContributor($u);
97 |             unset($u); // Don't fill up too much memory, if that helps.
98 |         }
99 |     }
100 | 
101 |     private function load($loadFilePath)
102 |     {
103 |         if (realpath($loadFilePath) === false) {
104 |             $message = 'Could not find file at %s';
105 |             throw new RuntimeException(sprintf($message, $loadFilePath));
106 |         }
107 | 
108 |         return file_get_contents($loadFilePath);
109 |     }
110 | 
111 |     protected function loadMissed($missedNormalizedTitlesSource)
112 |     {
113 |         if (realpath($missedNormalizedTitlesSource) === false) {
114 |             $message = 'Could not find missed file at %s';
115 |             throw new RuntimeException(sprintf($message, $missedNormalizedTitlesSource));
116 |         }
117 | 
118 |         $missedFileContents = file_get_contents($missedNormalizedTitlesSource);
119 | 
120 |         try {
121 |             $missed = $this->yaml->unserialize($missedFileContents);
122 |         } catch (Exception $e) {
123 |             $message = 'Could not get file %s contents to be parsed as YAML. Is it in YAML format?';
124 |             throw new Exception(sprintf($message, $missedNormalizedTitlesSource), 0, $e);
125 |         }
126 | 
127 |         if (!isset($missed['missed'])) {
128 |             throw new Exception('Please ensure missed.yml has a list of titles under a "missed:" top level key');
129 |         }
130 | 
131 |         $this->missed = $missed['missed'];
132 |     }
133 | 
134 |     protected function initMediaWikiHelper($actionName)
135 |     {
136 |         /**
137 |          * Your MediaWiki API URL
138 |          *
139 |          * https://www.mediawiki.org/wiki/API:Data_formats
140 |          * https://www.mediawiki.org/wiki/API:Parsing_wikitext
141 |          **/
142 |         $apiUrl = getenv('MEDIAWIKI_API_ORIGIN').'/w/api.php?action=';
143 | 
144 |         switch ($actionName) {
145 |             case 'parse':
146 |                 $apiUrl .= 'parse&pst=1&utf8=&prop=indicators|text|templates|categories|links|displaytitle';
147 |                 $apiUrl .= '&disabletoc=true&disablepp=true&disableeditsection=true&preview=true&format=json&page=';
148 |                 break;
149 |             case 'purge':
150 |                 $apiUrl .= 'purge&title=';
151 |                 break;
152 |         }
153 |         // Let’s use the Converter makeRequest() helper.
154 |         $this->apiHelper = new MediaWikiHelper($apiUrl);
155 |     }
156 | 
157 |     protected function apiRequest($title)
158 |     {
159 |         return $this->apiHelper->makeRequest($title, $this->cookieString);
160 |     }
161 | 
162 |     protected function documentPurge(MediaWikiDocument $wikiDocument)
163 |     {
164 |         $id = $wikiDocument->getId();
165 | 
166 |         $cacheDir = sprintf('%s/.cache', GIT_OUTPUT_DIR);
167 |         $cacheFile = sprintf('%s/%d.json', $cacheDir, $id);
168 | 
169 |         if ($this->filesystem->exists($cacheFile) === true) {
170 |             $this->filesystem->remove($cacheFile);
171 |         }
172 |     }
173 | 
174 |     protected function documentFetch(MediaWikiDocument $wikiDocument)
175 |     {
176 |         $id = $wikiDocument->getId();
177 |         $title = $wikiDocument->getTitle();
178 | 
179 |         $cacheDir = sprintf('%s/.cache', GIT_OUTPUT_DIR);
180 |         $cacheFile = sprintf('%s/%d.json', $cacheDir, $id);
181 | 
182 |         if ($this->filesystem->exists($cacheFile) === false) {
183 |             if ($this->filesystem->exists($cacheDir) === false) {
184 |                 $this->filesystem->mkdir($cacheDir);
185 |             }
186 | 
187 |             $obj = $this->apiHelper->retrieve($title, $this->cookieString);
188 |             $this->filesystem->dumpFile($cacheFile, json_encode($obj));
189 |         } else {
190 |             $contents = file_get_contents($cacheFile);
191 |             $obj = new MediaWikiApiParseActionResponse($contents);
192 |             $obj->toggleFromCache();
193 |         }
194 | 
195 |         return $obj;
196 |     }
197 | 
198 |     private function initCookieString()
199 |     {
200 |         if (
201 |             isset($_ENV['MEDIAWIKI_USERID']) &&
202 |             isset($_ENV['MEDIAWIKI_USERNAME']) &&
203 |             isset($_ENV['MEDIAWIKI_SESSION']) &&
204 |             isset($_ENV['MEDIAWIKI_WIKINAME'])
205 |         ) {
206 |             $cookies['UserID'] = getenv('MEDIAWIKI_USERID');
207 |             $cookies['UserName'] = getenv('MEDIAWIKI_USERNAME');
208 |             $cookies['_session'] = getenv('MEDIAWIKI_SESSION');
209 |             $cookieString = str_replace(
210 |                 ['":"', '","', '{"', '"}'],
211 |                 ['=', ';'.getenv('MEDIAWIKI_WIKINAME'), getenv('MEDIAWIKI_WIKINAME'), ';'],
212 |                 json_encode($cookies)
213 |             );
214 |         } else {
215 |             $cookieString = null;
216 |         }
217 | 
218 |         $this->cookieString = $cookieString;
219 |     }
220 | }
221 | 
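A short aside on initCookieString() above: it builds the Cookie header value MediaWiki expects by rewriting json_encode() output with str_replace(). A minimal sketch of what it produces, assuming the sample values from .env.example (MEDIAWIKI_WIKINAME="wpwiki", user id 10080, username "Renoirb", session "foo"):

    <?php
    // Hedged sketch; values come from .env.example, not from a live wiki.
    $cookies['UserID'] = '10080';
    $cookies['UserName'] = 'Renoirb';
    $cookies['_session'] = 'foo';
    // json_encode() yields {"UserID":"10080","UserName":"Renoirb","_session":"foo"}
    echo str_replace(
        ['":"', '","', '{"', '"}'],
        ['=', ';wpwiki', 'wpwiki', ';'],
        json_encode($cookies)
    );
    // Prints: wpwikiUserID=10080;wpwikiUserName=Renoirb;wpwiki_session=foo;
    // i.e. the "...; wpwiki_session=foo; ..." shape described in .env.example.
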
--------------------------------------------------------------------------------
/src/WebPlatform/Importer/Commands/CacheWarmerCommand.php:
--------------------------------------------------------------------------------
1 | 
19 |  */
20 | class CacheWarmerCommand extends AbstractImporterCommand
21 | {
22 |     protected function configure()
23 |     {
24 |         $description = <<setName('mediawiki:cache-warmer')
37 |             ->setDescription($description)
38 |             ->setDefinition(
39 |                 [
40 |                     new InputOption('missed', '', InputOption::VALUE_NONE, 'Give XML node indexes of missed conversion so we can run through only them'),
41 |                     new InputOption('max-pages', '', InputOption::VALUE_OPTIONAL, 'Do not make full run, limit to a maximum of pages', 0),
42 |                     new InputOption('resume-at', '', InputOption::VALUE_OPTIONAL, 'Resume run at a specific XML document index number ', 0),
43 |                 ]
44 |             );
45 | 
46 |         parent::configure();
47 |     }
48 | 
49 |     protected function execute(InputInterface $input, OutputInterface $output)
50 |     {
51 |         parent::execute($input, $output);
52 | 
53 |         $this->initMediaWikiHelper('parse');
54 | 
55 |         $xmlSource = $input->getOption('xml-source');
56 |         $listMissed = $input->getOption('missed');
57 | 
58 |         $maxHops = (int) $input->getOption('max-pages'); // Maximum number of pages we go through
59 | 
60 |         $resumeAt = (int) $input->getOption('resume-at');
61 | 
62 |         $ids = [];
63 | 
64 |         if ($listMissed === true) {
65 |             $this->loadMissed(DATA_DIR.'/missed.yml');
66 |             $total = count($this->missed);
67 |         }
68 | 
69 |         $output->writeln('Warming cache:');
70 | 
71 |         $streamer = $this->sourceXmlStreamFactory(DATA_DIR.'/'.$xmlSource);
72 |         $counter = 0;
73 |         while ($node = $streamer->getNode()) {
74 |             $pageNode = new SimpleXMLElement($node);
75 |             if (isset($pageNode->title)) {
76 |                 ++$counter;
77 |                 if ($maxHops > 0 && $maxHops === $counter - 1) {
78 |                     $output->writeln(sprintf(PHP_EOL.'Reached desired maximum of %d documents', $maxHops).PHP_EOL);
79 |                     break;
80 |                 }
81 | 
82 |                 /*
83 |                  * Handle interruption by telling where to resume work.
84 |                  *
85 |                  * This is useful if the job stopped and you want to resume work at a specific point.
86 |                  */
87 |                 if ($counter < $resumeAt) {
88 |                     continue;
89 |                 }
90 | 
91 |                 $wikiDocument = new MediaWikiDocument($pageNode);
92 |                 $previous_location = (isset($normalized_location))?$normalized_location:'';
93 |                 $normalized_location = $wikiDocument->getTitle();
94 |                 $id = $wikiDocument->getId();
95 | 
96 |                 /**
97 |                  * Do not make API requests to a wiki page that is known
98 |                  * to be deleted or to have a redirect.
99 |                  */
100 |                 if ($wikiDocument->hasRedirect() === true) {
101 |                     continue;
102 |                 }
103 | 
104 |                 /**
105 |                  * This is when we only want to pass through files described in data/missed.yml
106 |                  *
107 |                  * Most useful if you want to make slow API requests without running the full import again.
108 |                  */
109 |                 if ($listMissed === true && !in_array($normalized_location, $this->missed)) {
110 |                     continue;
111 |                 }
112 | 
113 |                 /**
114 |                  * If we got this far, we have a match. Once none are left, just quit.
115 |                  */
116 |                 if ($listMissed === true && --$total < 1) {
117 |                     break;
118 |                 }
119 | 
120 |                 if (in_array($id, array_keys($ids))) {
121 |                     $text = 'We got an unexpected situation: two wiki pages have the same id. The wiki page "%s" with id %d has the same id as "%s"';
122 |                     throw new Exception(sprintf($text, $previous_location, $id, $normalized_location));
123 |                 }
124 | 
125 |                 $ids[$id] = $normalized_location;
126 | 
127 |                 $respObj = $this->documentFetch($wikiDocument);
128 |                 $isCachedAppend = ($respObj->isFromCache())?null:' (new)';
129 |                 $output->writeln(sprintf(' - %d: %s%s', $id, $normalized_location, $isCachedAppend));
130 | 
131 | 
132 |             }
133 |         }
134 |     }
135 | }
136 | 
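Because pass 3 of mediawiki:run hits the MediaWiki API once per page, warming the cache first makes later runs cheap. A hedged usage sketch (options as defined in configure() above; the cache location comes from documentFetch() in AbstractImporterCommand):

    php app/console mediawiki:cache-warmer --xml-source=dumps/main_full.xml --max-pages=500
    php app/console mediawiki:cache-warmer --xml-source=dumps/main_full.xml --resume-at=500

Each fetched response is stored as out/.cache/<id>.json, so a later mediawiki:run 3 reads it back instead of calling the API again.
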
--------------------------------------------------------------------------------
/src/WebPlatform/Importer/Commands/RefreshPagesCommand.php:
--------------------------------------------------------------------------------
1 | 
28 |  */
29 | class RefreshPagesCommand extends AbstractImporterCommand
30 | {
31 |     /** @var WebPlatform\ContentConverter\Converter\ConverterInterface Converter instance */
32 |     protected $converter;
33 | 
34 |     protected function configure()
35 |     {
36 |         $description = <<setName('mediawiki:refresh-pages')
64 |             ->setDescription($description)
65 |             ->setDefinition(
66 |                 [
67 |                     new InputOption('missed', '', InputOption::VALUE_NONE, 'Give XML node indexes of missed conversion so we can run through only them'),
68 |                     new InputOption('max-pages', '', InputOption::VALUE_OPTIONAL, 'Do not make full run, limit to a maximum of pages', 0),
69 |                     new InputOption('resume-at', '', InputOption::VALUE_OPTIONAL, 'Resume run at a specific XML document index number ', 0),
70 |                 ]
71 |             );
72 | 
73 |         parent::configure();
74 |     }
75 | 
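    // Hedged illustration: with MEDIAWIKI_API_ORIGIN from .env.example,
    // initMediaWikiHelper('purge') (see AbstractImporterCommand) makes the
    // execute() method below issue one request per title, such as
    //   https://docs.webplatform.org/w/api.php?action=purge&title=html/elements/table
    // which forces MediaWiki to re-render that page into its parser cache.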
76 |     protected function execute(InputInterface $input, OutputInterface $output)
77 |     {
78 |         parent::execute($input, $output);
79 | 
80 |         $this->initMediaWikiHelper('purge');
81 | 
82 |         $xmlSource = $input->getOption('xml-source');
83 |         $listMissed = $input->getOption('missed');
84 | 
85 |         $maxHops = (int) $input->getOption('max-pages'); // Maximum number of pages we go through
86 | 
87 |         $resumeAt = (int) $input->getOption('resume-at');
88 | 
89 |         $this->loadMissed(DATA_DIR.'/missed.yml');
90 | 
91 |         $output->writeln(sprintf('Sending purge to %s:', $this->apiHelper->getHelperEndpoint()));
92 | 
93 |         $streamer = $this->sourceXmlStreamFactory(DATA_DIR.'/'.$xmlSource);
94 |         $counter = 0;
95 |         while ($node = $streamer->getNode()) {
96 |             $pageNode = new SimpleXMLElement($node);
97 |             if (isset($pageNode->title)) {
98 |                 ++$counter;
99 |                 if ($maxHops > 0 && $maxHops === $counter - 1) {
100 |                     $output->writeln(sprintf(PHP_EOL.'Reached desired maximum of %d documents', $maxHops).PHP_EOL);
101 |                     break;
102 |                 }
103 | 
104 |                 $wikiDocument = new MediaWikiDocument($pageNode);
105 |                 $normalized_location = $wikiDocument->getName();
106 |                 $title = $wikiDocument->getTitle();
107 |                 $id = $wikiDocument->getId();
108 | 
109 |                 /**
110 |                  * Handle interruption by telling where to resume work.
111 |                  *
112 |                  * This is useful if the job stopped and you want to resume work at a specific point.
113 |                  */
114 |                 if ($counter < $resumeAt) {
115 |                     continue;
116 |                 }
117 | 
118 |                 /**
119 |                  * This is when we only want to pass through files described in data/missed.yml
120 |                  *
121 |                  * Most useful if you want to make slow API requests without running the full import again.
122 |                  */
123 |                 if ($listMissed === true && !in_array($normalized_location, $this->missed)) {
124 |                     continue;
125 |                 }
126 | 
127 |                 $this->documentPurge($wikiDocument);
128 | 
129 |                 try {
130 |                     $purgeCall = $this->apiRequest($title);
131 |                 } catch (Exception $e) {
132 |                     $message = 'Had an issue while attempting to refresh page %s from MediaWiki';
133 |                     throw new Exception(sprintf($message, $title), 0, $e);
134 |                 }
135 | 
136 | 
137 |                 if (empty($purgeCall)) {
138 |                     $message = 'Refresh call did not work; we expected HTML and got nothing. Check what %s%s gives from a web browser';
139 |                     throw new Exception(sprintf($message, $this->apiHelper->getHelperEndpoint(), $title));
140 |                 }
141 | 
142 |                 $output->writeln(sprintf(' - %d: %s', $id, $title));
143 |             }
144 |         }
145 |     }
146 | }
147 | 
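A hedged invocation sketch, matching the options above and the retry workflow described in data/missed.yml:

    php app/console mediawiki:refresh-pages --missed
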
--------------------------------------------------------------------------------
/src/WebPlatform/Importer/Commands/RunCommand.php:
--------------------------------------------------------------------------------
1 | 
29 |  */
30 | class RunCommand extends AbstractImporterCommand
31 | {
32 |     /** @var WebPlatform\ContentConverter\Converter\ConverterInterface Converter instance */
33 |     protected $converter;
34 | 
35 |     /** @var Bit3\GitPhp\GitRepository Git Repository handler */
36 |     protected $git;
37 | 
38 |     protected function configure()
39 |     {
40 |         $description = <<setName('mediawiki:run')
72 |             ->setDescription($description)
73 |             ->setDefinition(
74 |                 [
75 |                     new InputArgument('pass', InputArgument::REQUIRED, 'The pass number: 1,2,3', null),
76 |                     new InputOption('missed', '', InputOption::VALUE_NONE, 'Give XML node indexes of missed conversion so we can run a 3rd pass only for them'),
77 |                     new InputOption('max-revs', '', InputOption::VALUE_OPTIONAL, 'Do not run full import, limit it to maximum of revisions per page ', 0),
78 |                     new InputOption('max-pages', '', InputOption::VALUE_OPTIONAL, 'Do not run full import, limit to a maximum of pages', 0),
79 |                     new InputOption('namespace-prefix', '', InputOption::VALUE_OPTIONAL, 'If not against main MediaWiki namespace, set prefix (e.g. Meta) so we can create a git repo with all contents on root so that we can use export as a submodule.', false),
80 |                     new InputOption('resume-at', '', InputOption::VALUE_OPTIONAL, 'Resume run at a specific XML document index number ', 0),
81 |                     new InputOption('only-assets', '', InputOption::VALUE_NONE, '3rd pass specific. Skip document conversion, git add only assets that are referred to in documents'),
82 |                 ]
83 |             );
84 | 
85 |         parent::configure();
86 |     }
87 | 
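    // Typical pass sequence (hedged illustration; pass semantics are described
    // in the comments of execute() below, and the --missed retry workflow in
    // data/missed.yml):
    //
    //   php app/console mediawiki:run 1            # commit history of redirected/deleted pages
    //   php app/console mediawiki:run 2            # commit history of pages with content
    //   php app/console mediawiki:run 3            # convert latest revisions through the API
    //   php app/console mediawiki:run 3 --missed   # retry entries listed in data/missed.yml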
88 |     protected function execute(InputInterface $input, OutputInterface $output)
89 |     {
90 |         parent::execute($input, $output);
91 | 
92 |         $passNbr = (int) $input->getArgument('pass');
93 | 
94 |         $xmlSource = $input->getOption('xml-source');
95 |         $listMissed = $input->getOption('missed');
96 | 
97 |         $maxHops = (int) $input->getOption('max-pages'); // Maximum number of pages we go through
98 |         $revMaxHops = (int) $input->getOption('max-revs'); // Maximum number of revisions per page we go through
99 |         $namespacePrefix = $input->getOption('namespace-prefix');
100 | 
101 |         $resumeAt = (int) $input->getOption('resume-at');
102 | 
103 |         $onlyAssets = $input->getOption('only-assets');
104 | 
105 |         $redirects = [];
106 |         $pages = [];
107 | 
108 |         if ($listMissed === true && $passNbr === 3) {
109 |             $this->loadMissed(DATA_DIR.'/missed.yml');
110 |         } elseif ($listMissed === true && $passNbr !== 3) {
111 |             throw new DomainException('Missed option is only supported at 3rd pass');
112 |         }
113 | 
114 |         if ($onlyAssets === true && $passNbr !== 3) {
115 |             throw new DomainException('only-assets option is only useful at 3rd pass');
116 |         }
117 | 
118 |         $repoInitialized = (realpath(GIT_OUTPUT_DIR.'/.git') === false) ? false : true;
119 |         if ($this->filesystem->exists(GIT_OUTPUT_DIR) === false) {
120 |             $this->filesystem->mkdir(GIT_OUTPUT_DIR);
121 |         }
122 |         $this->git = new GitRepository(realpath(GIT_OUTPUT_DIR));
123 |         if ($repoInitialized === false) {
124 |             $this->git->init()->execute();
125 |         }
126 | 
127 |         if ($passNbr === 3) {
128 |             // We are at conversion pass, instantiate our Converter!
129 |             // instanceof WebPlatform\ContentConverter\Converter\ConverterInterface
130 |             $this->converter = new HtmlToMarkdown();
131 |             $this->initMediaWikiHelper('parse');
132 |         } else {
133 |             $this->loadUsers(DATA_DIR.'/users.json');
134 |         }
135 | 
136 | 
137 |         $this->titleFilter = new TitleFilter();
138 | 
139 |         $streamer = $this->sourceXmlStreamFactory(DATA_DIR.'/'.$xmlSource);
140 |         $counter = 0;
141 |         while ($node = $streamer->getNode()) {
142 |             $pageNode = new SimpleXMLElement($node);
143 |             if (isset($pageNode->title)) {
144 |                 ++$counter;
145 |                 if ($maxHops > 0 && $maxHops === $counter - 1) {
146 |                     $output->writeln(sprintf('Reached desired maximum of %d documents', $maxHops).PHP_EOL);
147 |                     break;
148 |                 }
149 | 
150 |                 /*
151 |                  * Handle interruption by telling where to resume work.
152 |                  *
153 |                  * This is useful if the job stopped and you want to resume work at a specific point.
154 |                  */
155 |                 if ($counter < $resumeAt) {
156 |                     continue;
157 |                 }
158 | 
159 |                 $wikiDocument = new MediaWikiDocument($pageNode);
160 |                 $persistable = new GitCommitFileRevision($wikiDocument, 'out/', '.md');
161 | 
162 |                 $title = $wikiDocument->getTitle();
163 |                 $normalized_location = $wikiDocument->getName();
164 |                 $file_path = $this->titleFilter->filter($persistable->getName());
165 |                 $file_path = ($namespacePrefix === false) ? $file_path : str_replace(sprintf('%s/', $namespacePrefix), '', $file_path);
166 |                 $redirect_to = $this->titleFilter->filter($wikiDocument->getRedirect()); // False if not a redirect, string if it is
167 | 
168 |                 $language_code = $wikiDocument->getLanguageCode();
169 |                 $language_name = $wikiDocument->getLanguageName();
170 |                 $revs = $wikiDocument->getRevisions()->count();
171 |                 $revList = $wikiDocument->getRevisions();
172 |                 $revLast = $wikiDocument->getLatest();
173 | 
174 |                 /**
175 |                  * This is when we only want to pass through files described in data/missed.yml
176 |                  *
177 |                  * Most useful if you want to make slow API requests without running the full import again.
178 |                  */
179 |                 if ($listMissed === true && !in_array($normalized_location, $this->missed)) {
180 |                     continue;
181 |                 }
182 | 
183 |                 $output->writeln(sprintf('"%s":', $title));
184 |                 $output->writeln(sprintf('  id: %d', $wikiDocument->getId()));
185 |                 $output->writeln(sprintf('  index: %d', $counter));
186 |                 $output->writeln(sprintf('  normalized: %s', $normalized_location));
187 |                 $output->writeln(sprintf('  file: %s', $file_path));
188 | 
189 |                 if ($wikiDocument->isTranslation() === true) {
190 |                     $output->writeln(sprintf('  lang: %s (%s)', $language_code, $language_name));
191 |                 }
192 | 
193 |                 if ($wikiDocument->hasRedirect() === true) {
194 |                     $output->writeln(sprintf('  redirect_to: %s', $redirect_to));
195 |                 }
196 | 
197 |                 /*
198 |                  * Merge deleted content history under current content.
199 |                  *
200 |                  * 1st pass: Only those with redirects (i.e. deleted pages). Should leave an empty out/ directory!
201 |                  * 2nd pass: Only those without redirects (i.e. current content).
202 |                  * 3rd pass: Only those without redirects; they get their latest version passed through the converter
203 |                  */
204 |                 if ($wikiDocument->hasRedirect() === false && $passNbr === 1) {
205 |                     // Skip all NON redirects for pass 1
206 |                     $output->writeln(sprintf('  skip: Document %s WITHOUT redirect, at pass 1 (handling redirects)', $title).PHP_EOL.PHP_EOL);
207 |                     continue;
208 |                 } elseif ($wikiDocument->hasRedirect() && $passNbr === 2) {
209 |                     // Skip all redirects for pass 2
210 |                     $output->writeln(sprintf('  skip: Document %s WITH redirect, at pass 2 (handling non redirects)', $title).PHP_EOL.PHP_EOL);
211 |                     continue;
212 |                 } elseif ($wikiDocument->hasRedirect() && $passNbr === 3) {
213 |                     // Skip all redirects for pass 3
214 |                     $output->writeln(sprintf('  skip: Document %s WITH redirect, at pass 3', $title).PHP_EOL.PHP_EOL);
215 |                     continue;
216 |                 }
217 | 
218 |                 if ($passNbr < 1 || $passNbr > 3) {
219 |                     throw new DomainException('This command has only three passes.');
220 |                 }
221 | 
222 |                 if ($passNbr === 3) {
223 |                     // Overwrite $revList: for the last pass we'll
224 |                     // only use the latest revision for conversion.
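                    // Hedged note: the revisions loop further below relies only on
                    // SplDoublyLinkedList iteration (rewind/valid/current/next), so
                    // pushing the single $revLast lets pass 3 reuse the very same
                    // loop as passes 1 and 2:
                    //
                    //     for ($revList->rewind(); $revList->valid(); $revList->next()) {
                    //         $wikiRevision = $revList->current(); // always $revLast here
                    //     }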
225 | $revList = new SplDoublyLinkedList(); 226 | $revList->push($revLast); 227 | } else { 228 | $output->writeln(sprintf(' revisions_count: %d', $revs)); 229 | $output->writeln(sprintf(' revisions:')); 230 | } 231 | 232 | /* ----------- REVISIONS --------------- **/ 233 | $revCounter = 0; 234 | for ($revList->rewind(); $revList->valid(); $revList->next()) { 235 | ++$revCounter; 236 | 237 | if ($revMaxHops > 0 && $revMaxHops === $revCounter) { 238 | $output->writeln(sprintf(' stop: Reached maximum %d revisions', $revMaxHops).PHP_EOL.PHP_EOL); 239 | break; 240 | } 241 | 242 | $removeFile = false; 243 | 244 | $wikiRevision = $revList->current(); 245 | 246 | /* -------------------- Author -------------------- **/ 247 | // An edge case where MediaWiki may give author as user_id 0, even though we dont have it 248 | // so we’ll give the first user instead. 249 | $contributor_id = ($wikiRevision->getContributorId() === 0) ? 1 : $wikiRevision->getContributorId(); 250 | 251 | /* 252 | * Fix duplicates and merge them as only one. 253 | * 254 | * Please adjust to suit your own. 255 | * 256 | * Queried using jq; 257 | * 258 | * cat data/users.json | jq '.[]|select(.user_real_name == "Renoir Boulanger")' 259 | * 260 | * #TODO: Change the hardcoded list. 261 | */ 262 | if (in_array($contributor_id, [172943, 173060, 173278, 173275, 173252, 173135, 173133, 173087, 173086, 173079, 173059, 173058, 173057])) { 263 | $contributor_id = getenv('MEDIAWIKI_USERID'); 264 | } 265 | /* -------------------- /Author -------------------- **/ 266 | 267 | // Lets handle conversion only at 3rd pass. 268 | if ($passNbr === 3) { 269 | try { 270 | /* @var MediaWikiApiParseActionResponse object to work with */ 271 | $respObj = $this->documentFetch($wikiDocument); 272 | } catch (Exception $e) { 273 | $output->writeln(sprintf(' ERROR: %s, left a note in errors/%d.txt', $e->getMessage(), $counter)); 274 | $this->filesystem->dumpFile(sprintf('errors/%d.txt', $counter), $e->getMessage()); 275 | throw new Exception('Debugging why API call did not work.', 0, $e); // DEBUG 276 | continue; 277 | } 278 | 279 | if ($respObj->isFromCache()) { 280 | // #XXX: Make sure AbstractImporterCommand has the same path as below 281 | $output->writeln(sprintf(' cached: %s', sprintf('out/.cache/%d.json', $wikiDocument->getId()))); 282 | } else { 283 | $output->writeln(' cached: Not from cache'); 284 | } 285 | 286 | if ($respObj->isEmpty() === true) { 287 | $output->writeln(sprintf(' skip: Document %s is empty, maybe deleted or been emptied without a redirect left', $title).PHP_EOL.PHP_EOL); 288 | continue; 289 | } 290 | 291 | $newRev = new HtmlRevision($respObj, true); 292 | $newRev->enableMarkdownConversion(); 293 | $newRev->setTitle($wikiDocument->getDocumentTitle()); 294 | 295 | $assets = $newRev->getAssets(); 296 | if (count($assets) >= 1) { 297 | $output->writeln(sprintf(' assets: %d', count($assets))); 298 | } else { 299 | $output->writeln(' assets: None'); 300 | } 301 | if ($onlyAssets === true) { 302 | if (count($assets) >= 1) { 303 | $problematicAssets = []; 304 | foreach ($newRev->getAssets() as $file) { 305 | try { 306 | $this->git 307 | ->add() 308 | ->execute(preg_replace('/^\//', '', $file)); 309 | } catch (Exception $e) { 310 | $problematicAssets[] = $file; 311 | } 312 | } 313 | 314 | if (count($problematicAssets) >= 1) { 315 | $message = ' assets_status: NOT OK, %d problematic files, see errors/problematic_assets/%d.txt'; 316 | $output->writeln(sprintf($message, count($problematicAssets), $wikiDocument->getId())); 317 | 
317 |                                     $this->filesystem->dumpFile(sprintf('errors/problematic_assets/%d.txt', $wikiDocument->getId()), print_r($problematicAssets, 1));
318 |                                 } else {
319 |                                     $output->writeln('  assets_status: OK, all added.');
320 |                                 }
321 |                             }
322 | 
323 |                             continue;
324 |                         } /* End $onlyAssets */
325 | 
326 |                         // NOTE:
327 |                         //
328 |                         // In HtmlRevision, if the file is empty or only has a comment, we
329 |                         // rewrite the file to contain only a title.
330 |                         //
331 |                         // We could use `$newRev->isEmpty()` here to detect the fact that it's
332 |                         // empty, but we would need to refactor the logic on how to delete
333 |                         // revisions.
334 |                         //
335 |                         // Since there are not many empty files, it has been decided to leave
336 |                         // as is.
337 |                         //
338 |                         if ($newRev->isEmpty()) {
339 |                             //die('Manually delete file?');
340 |                             $wikiRevision = $this->converter->apply($newRev);
341 |                             $removeFile = true; // Won't work. But, it could be a start.
342 |                         } else {
343 |                             $wikiRevision = $this->converter->apply($newRev);
344 |                         }
345 | 
346 |                         // Most of the time, title is better written from the document itself than
347 |                         // from the URL. Let's only set title front matter attribute when we aren't a
348 |                         // translation. We'll then use instead the text in the first h1 we find
349 |                         // in the DOM.
350 |                         $metadata = $newRev->getMetadata();
351 |                         if (isset($metadata['first_title'])) {
352 |                             $wikiRevision->setTitle($metadata['first_title']);
353 |                         } else {
354 |                             $wikiRevision->setTitle($wikiDocument->getDocumentTitle());
355 |                         }
356 | 
357 |                         if ($wikiDocument->isTranslation() === true) {
358 |                             $wikiRevision->setFrontMatter(['lang' => $wikiDocument->getLanguageCode()]);
359 |                         }
360 | 
361 |                         $revision_id = $revLast->getId();
362 |                     } else {
363 |                         if (isset($this->users[$contributor_id])) {
364 |                             $contributor = clone $this->users[$contributor_id]; // We want a copy, because it's specific to here only anyway.
365 |                             $wikiRevision->setContributor($contributor, false);
366 |                         } else {
367 |                             // In case we didn’t find data for $this->users[$contributor_id]
368 |                             $contributor = clone $this->users[1]; // We want a copy, because it's specific to here only anyway.
369 |                             $wikiRevision->setContributor($contributor, false);
370 |                         }
371 | 
372 |                         $revision_id = $wikiRevision->getId();
373 |                         $output->writeln(sprintf('    - id: %d', $revision_id));
374 |                         $output->writeln(sprintf('      index: %d', $revCounter));
375 |                     }
376 | 
377 |                     $persistArgs = $persistable->setRevision($wikiRevision)->getArgs();
378 |                     if ($passNbr < 3) {
379 |                         foreach ($persistArgs as $argKey => $argVal) {
380 |                             if ($argKey === 'message') {
381 |                                 $argVal = mb_strimwidth($argVal, strpos($argVal, ': ') + 2, 100);
382 |                             }
383 |                             $output->writeln(sprintf('      %s: "%s"', $argKey, $argVal));
384 |                         }
385 |                     }
386 | 
387 |                     if ($passNbr < 3 && $revLast->getId() === $wikiRevision->getId() && $wikiDocument->hasRedirect()) {
388 |                         $output->writeln('      is_last_and_has_redirect: True');
389 |                         $removeFile = true;
390 |                     }
391 | 
392 |                     $persistable->setRevision($wikiRevision);
393 |                     $this->filesystem->dumpFile($file_path, (string) $persistable);
394 |                     try {
395 |                         $this->git
396 |                             ->add()
397 |                             // Make sure out/ matches what we set at GitCommitFileRevision constructor.
398 |                             ->execute(preg_replace('/^out\//', '', $file_path));
399 |                     } catch (GitException $e) {
400 |                         $message = sprintf('Could not add file "%s" with title "%s" for revision %d', $file_path, $title, $revision_id);
401 |                         throw new Exception($message, 0, $e);
402 |                     }
403 | 
404 |                     if ($passNbr < 3) {
405 | 
406 |                         // We won’t expose WebPlatform users' email addresses to the public. Instead,
407 |                         // we’ll create a bogus email alias based on their MediaWiki username.
408 |                         $real_name = $wikiRevision->getContributor()->getRealName();
409 |                         $username = $wikiRevision->getContributor()->getName();
410 |                         $email = sprintf('%s@%s', $username, getenv('COMMITER_ANONYMOUS_DOMAIN'));
411 |                         $author_overload = sprintf('%s <%s>', $real_name, $email);
412 | 
413 |                         try {
414 |                             $this->git
415 |                                 ->commit()
416 |                                 // In order to force git to use the same committer data
417 |                                 // as the author’s, we had to overload the CommitCommandBuilder
418 |                                 // class.
419 |                                 //
420 |                                 // In WebPlatform\Importer\GitPhp\CommitCommandBuilder, we
421 |                                 // overload [date, author] methods so we can inject the same
422 |                                 // matching GIT_COMMITTER_* values at commit time.
423 |                                 ->message($persistArgs['message'])
424 |                                 ->author('"'.$author_overload.'"')
425 |                                 ->date('"'.$persistArgs['date'].'"')
426 |                                 ->allowEmpty()
427 |                                 ->execute();
428 |                         } catch (GitException $e) {
429 |                             var_dump($this->git);
430 |                             $message = sprintf('Could not commit for revision %d', $revision_id);
431 |                             throw new Exception($message, 0, $e);
432 |                         }
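                        // Hedged illustration: for a made-up contributor with real name
                        // "Jane Doe" and MediaWiki username "Jdoe", the call chain above
                        // amounts to
                        //
                        //   git commit --allow-empty --author="Jane Doe <Jdoe@docs.webplatform.org>" --date="..." -m "..."
                        //
                        // with GIT_COMMITTER_NAME/EMAIL/DATE injected to the same values
                        // by the overloaded CommitCommandBuilder.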
433 | 
434 |                         if ($removeFile === true) {
435 |                             try {
436 |                                 $this->git
437 |                                     ->rm()
438 |                                     // Make sure out/ matches what we set at GitCommitFileRevision constructor.
439 |                                     ->execute(preg_replace('/^out\//', '', $file_path));
440 |                             } catch (GitException $e) {
441 |                                 $message = sprintf('Could not remove %s at revision %d', $file_path, $revision_id);
442 |                                 throw new Exception($message, 0, $e);
443 |                             }
444 | 
445 |                             $this->git
446 |                                 ->commit()
447 |                                 ->message('Remove file; '.$persistArgs['message'])
448 |                                 // ... no need to worry here. We overloaded author, date
449 |                                 // remember?
450 |                                 ->author('"'.$author_overload.'"')
451 |                                 ->date('"'.$persistArgs['date'].'"')
452 |                                 ->allowEmpty()
453 |                                 ->execute();
454 | 
455 |                             $this->filesystem->remove($file_path);
456 |                         }
457 |                     } /* End of $passNbr < 3 */
458 |                 }
459 |                 /* ----------- REVISIONS --------------- **/
460 |                 $output->writeln(PHP_EOL);
461 |             }
462 |         }
463 |     }
464 | }
465 | 
--------------------------------------------------------------------------------
/src/WebPlatform/Importer/Commands/SummaryCommand.php:
--------------------------------------------------------------------------------
1 | 
21 |  */
22 | class SummaryCommand extends AbstractImporterCommand
23 | {
24 |     protected function configure()
25 |     {
26 |         $description = <<setName('mediawiki:summary')
41 |             ->setDescription($description)
42 |             ->setDefinition(
43 |                 [
44 |                     new InputOption('missed', '', InputOption::VALUE_NONE, 'Give XML node indexes of missed conversion so we can run a 3rd pass only for them'),
45 |                     new InputOption('max-revs', '', InputOption::VALUE_OPTIONAL, 'Do not make full run, limit it to maximum of revisions per document ', 0),
46 |                     new InputOption('max-pages', '', InputOption::VALUE_OPTIONAL, 'Do not make full run, limit to a maximum of documents', 0),
47 |                     new InputOption('namespace-prefix', '', InputOption::VALUE_OPTIONAL, 'If not against main MediaWiki namespace, set prefix (e.g. 
Meta) so we can create a git repo with all contents on root so that we can use export as a submodule.', false), 48 | new InputOption('display-author', '', InputOption::VALUE_NONE, 'Display or not the author and email address (useful to hide info for public reports), defaults to false'), 49 | new InputOption('indexes', '', InputOption::VALUE_NONE, 'Whether or not we display loop indexes'), 50 | ] 51 | ); 52 | 53 | parent::configure(); 54 | } 55 | 56 | protected function execute(InputInterface $input, OutputInterface $output) 57 | { 58 | parent::execute($input, $output); 59 | 60 | $xmlSource = $input->getOption('xml-source'); 61 | $listMissed = $input->getOption('missed'); 62 | 63 | $maxHops = (int) $input->getOption('max-pages'); // Maximum number of pages we go through 64 | $revMaxHops = (int) $input->getOption('max-revs'); // Maximum number of revisions per page we go through 65 | $namespacePrefix = $input->getOption('namespace-prefix'); 66 | 67 | $displayIndex = $input->getOption('indexes'); 68 | $displayAuthor = $input->getOption('display-author'); 69 | 70 | $redirects = []; 71 | $pages = []; 72 | $urlParts = []; 73 | $urlPartsAll = []; 74 | $missedIndexes = []; 75 | 76 | $urlsWithContent = []; 77 | $moreThanHundredRevs = []; 78 | $translations = []; 79 | $sanity_redirs = []; 80 | $directlyOnRoot = []; 81 | $rev_count = []; // So we can know what’s the average 82 | 83 | // Pages we have to make sure aren’t duplicate on the CMS prior 84 | // to the final migration. 85 | $temporary_acceptable_duplicates = []; 86 | //$temporary_acceptable_duplicates[] = 'css/selectors/pseudo-classes/:lang'; // DONE 87 | 88 | if ($listMissed === true) { 89 | $this->loadMissed(DATA_DIR.'/missed.yml'); 90 | } 91 | 92 | $this->loadUsers(DATA_DIR.'/users.json'); 93 | 94 | $this->titleFilter = new TitleFilter(); 95 | 96 | $streamer = $this->sourceXmlStreamFactory(DATA_DIR.'/'.$xmlSource); 97 | $counter = 0; 98 | while ($node = $streamer->getNode()) { 99 | $pageNode = new SimpleXMLElement($node); 100 | if (isset($pageNode->title)) { 101 | ++$counter; 102 | if ($maxHops > 0 && $maxHops === $counter - 1) { 103 | $output->writeln(sprintf('Reached desired maximum of %d documents', $maxHops).PHP_EOL); 104 | break; 105 | } 106 | 107 | $wikiDocument = new MediaWikiDocument($pageNode); 108 | $persistable = new GitCommitFileRevision($wikiDocument, 'out/', '.md'); 109 | 110 | $title = $wikiDocument->getTitle(); 111 | $normalized_location = $wikiDocument->getName(); 112 | $file_path = $this->titleFilter->filter($persistable->getName()); 113 | $file_path = ($namespacePrefix === false) ? 
$file_path : str_replace(sprintf('%s/', $namespacePrefix), '', $file_path); 114 | $redirect_to = $this->titleFilter->filter($wikiDocument->getRedirect()); // False if not a redirect, string if it is 115 | 116 | $language_code = $wikiDocument->getLanguageCode(); 117 | $language_name = $wikiDocument->getLanguageName(); 118 | $revs = $wikiDocument->getRevisions()->count(); 119 | $revList = $wikiDocument->getRevisions(); 120 | $revLast = $wikiDocument->getLatest(); 121 | 122 | $output->writeln(sprintf('"%s":', $title)); 123 | $output->writeln(sprintf(' - id: %d', $wikiDocument->getId())); 124 | if ($displayIndex === true) { 125 | $output->writeln(sprintf(' - index: %d', $counter)); 126 | } 127 | $output->writeln(sprintf(' - normalized: %s', $normalized_location)); 128 | $output->writeln(sprintf(' - file: %s', $file_path)); 129 | 130 | if ($wikiDocument->isTranslation() === true) { 131 | $output->writeln(sprintf(' - lang: %s (%s)', $language_code, $language_name)); 132 | } 133 | 134 | if ($wikiDocument->hasRedirect() === true) { 135 | $output->writeln(sprintf(' - redirect_to: %s', $redirect_to)); 136 | } else { 137 | /** 138 | * Gather what we can know from the location. 139 | * 140 | * Explode all parts in two separate arrays so we’ll be able to tell 141 | * if we have conflicts (e.g. CSS/Selectors .. css/selectors). So we can 142 | * harmonize the names to have **ONLY ONE** way of writing the casing for a 143 | * given path. 144 | * 145 | * If you want to define how to write an URL part, refer to TitleFilter class. 146 | */ 147 | $urlsWithContent[] = $title; 148 | foreach (explode('/', $normalized_location) as $urlDepth => $urlPart) { 149 | $urlPartKey = strtolower($urlPart); 150 | $urlParts[$urlPartKey] = $urlPart; 151 | $urlPartsAll[$urlPartKey][] = $urlPart; 152 | } 153 | } 154 | 155 | if ($listMissed === true && in_array($normalized_location, $this->missed)) { 156 | $missedIndexes[$counter] = $title; 157 | } 158 | 159 | $output->writeln(sprintf(' - revs: %d', $revs)); 160 | $output->writeln(sprintf(' - revisions:')); 161 | 162 | /* ----------- REVISION --------------- **/ 163 | $revCounter = 0; 164 | for ($revList->rewind(); $revList->valid(); $revList->next()) { 165 | ++$revCounter; 166 | 167 | if ($revMaxHops > 0 && $revMaxHops === $revCounter) { 168 | $output->writeln(sprintf(' - stop: Reached maximum %d revisions', $revMaxHops).PHP_EOL.PHP_EOL); 169 | break; 170 | } 171 | 172 | $wikiRevision = $revList->current(); 173 | 174 | /* -------------------- Author -------------------- **/ 175 | // An edge case where MediaWiki may give author as user_id 0, even though we dont have it 176 | // so we’ll give the first user instead. 177 | $contributor_id = ($wikiRevision->getContributorId() === 0) ? 1 : $wikiRevision->getContributorId(); 178 | 179 | /** 180 | * Fix duplicates and merge them as only one. 181 | * 182 | * Please adjust to suit your own. 183 | * 184 | * Queried using jq; 185 | * 186 | * cat data/users.json | jq '.[]|select(.user_real_name == "Renoir Boulanger")' 187 | * 188 | * #TODO: Change the hardcoded list. 189 | */ 190 | if (in_array($contributor_id, [172943, 173060, 173278, 173275, 173252, 173135, 173133, 173087, 173086, 173079, 173059, 173058, 173057])) { 191 | $contributor_id = getenv('MEDIAWIKI_USERID'); 192 | } 193 | 194 | if (isset($this->users[$contributor_id])) { 195 | $contributor = clone $this->users[$contributor_id]; // We want a copy, because its specific to here only anyway. 
196 | $wikiRevision->setContributor($contributor, false); 197 | } else { 198 | // In case we didn’t find data for $this->users[$contributor_id] 199 | $contributor = clone $this->users[1]; // We want a copy, because its specific to here only anyway. 200 | $wikiRevision->setContributor($contributor, false); 201 | } 202 | /* -------------------- /Author -------------------- **/ 203 | 204 | $output->writeln(sprintf(' - id: %d', $wikiRevision->getId())); 205 | if ($displayIndex === true) { 206 | $output->writeln(sprintf(' index: %d', $revCounter)); 207 | } 208 | 209 | $persistArgs = $persistable->setRevision($wikiRevision)->getArgs(); 210 | foreach ($persistArgs as $argKey => $argVal) { 211 | if ($argKey === 'message') { 212 | $argVal = trim(mb_strimwidth($argVal, strpos($argVal, ': ') + 2, 100)); 213 | } 214 | if ($argKey === 'message' && empty($argVal)) { 215 | // Lets not pollute report with empty messages 216 | continue; 217 | } 218 | if ($displayAuthor === false && $argKey === 'author') { 219 | continue; 220 | } 221 | $output->writeln(sprintf(' %s: "%s"', $argKey, $argVal)); 222 | } 223 | 224 | if ($revLast->getId() === $wikiRevision->getId() && $wikiDocument->hasRedirect()) { 225 | $output->writeln(' is_last_and_has_redirect: True'); 226 | } 227 | } 228 | 229 | /* ----------- REVISION --------------- */ 230 | 231 | $rev_count[] = $revs; 232 | 233 | // Which pages are directly on /wiki/foo. Are there some we 234 | // should move elsewhere such as the glossary items? 235 | if (count(explode('/', $title)) == 1 && $wikiDocument->hasRedirect() === false) { 236 | $directlyOnRoot[] = $title; 237 | } 238 | 239 | if ($revs > 99) { 240 | $moreThanHundredRevs[] = sprintf('%s (%d)', $title, $revs); 241 | } 242 | 243 | if ($wikiDocument->isTranslation() === true && $wikiDocument->hasRedirect() === false) { 244 | $translations[] = $title; 245 | } 246 | 247 | // The ones with invalid URL characters that shouldn’t be part of 248 | // a page name because they may confuse with their natural use (:,(,),!,?) 249 | if ($title !== $normalized_location && $wikiDocument->hasRedirect() === false) { 250 | $sanity_redirs[$title] = $normalized_location; 251 | } 252 | 253 | // We have a number of pages, some of them had been 254 | // deleted or erased with a redirect left behind. 255 | // 256 | // Since we want to write to files all pages that currently 257 | // has content into a filesystem, we have to generate a file 258 | // name that can be stored into a filesystem. We therefore have 259 | // to normalize the names. 260 | // 261 | // We don’t want to have two entries with the same name. 262 | // 263 | // If a redirect (i.e. an empty file) exist, let’s set keep it 264 | // separate from the pages that still has content. 265 | // 266 | // Sanity check; 267 | // 1. Get list of redirects 268 | // 2. Get list of pages 269 | // 270 | // If we have a page duplicate, throw an exception! 271 | if ($wikiDocument->hasRedirect() === true) { 272 | // Pages we know are redirects within MediaWiki, we won’t 273 | // pass them within the $pages aray because they would be 274 | // empty content with only a redirect anyway. 275 | if ($normalized_location !== $redirect_to) { 276 | $redirects[str_replace('_', ' ', $normalized_location)] = $redirect_to; 277 | } 278 | } elseif (!in_array($normalized_location, array_keys($pages))) { 279 | // Pages we know has content, lets count them! 
280 |                 if ($wikiDocument->hasRedirect() === false) {
281 |                     $pages[$normalized_location] = $title;
282 |                 }
283 |             } elseif (in_array($title, $temporary_acceptable_duplicates)) {
284 |                 // Let's not throw, we got that covered.
285 |             } else {
286 |                 // Hopefully we should never encounter this.
287 |                 $previous = $pages[$normalized_location];
288 |                 $duplicatePagesExceptionText = 'We have a duplicate entry for %s; it '
289 |                     .'would be stored in %s, which would override the content of %s';
290 |                 throw new Exception(sprintf($duplicatePagesExceptionText, $title, $file_path, $previous));
291 |             }
292 | 
293 |             $output->writeln(PHP_EOL.PHP_EOL);
294 |         } /* End of if (isset($pageNode->title)) */
295 |     } /* End of while ($node = $streamer->getNode()) */
296 | 
297 |         /*
298 |          * Work out some numbers on the number of edits
299 |          *
300 |          * - Average
301 |          * - Median
302 |          */
303 |         $total_edits = 0;
304 |         sort($rev_count);
305 |         $edit_average = array_sum($rev_count) / $counter;
306 | 
307 |         // Calculate median
308 |         $value_in_middle = floor(($counter - 1) / 2);
309 |         if ($counter % 2) {
310 |             // odd number, middle is the median
311 |             $edit_median = $rev_count[$value_in_middle];
312 |         } else {
313 |             // even number, calculate avg of 2 medians
314 |             $low = $rev_count[$value_in_middle];
315 |             $high = $rev_count[$value_in_middle + 1];
316 |             $edit_median = (($low + $high) / 2);
317 |         }
318 | 
319 |         $numbers = array('Numbers:');
320 |         $numbers[] = sprintf(' - "iterations": %d', $counter);
321 |         $numbers[] = sprintf(' - "content pages": %d', count($pages));
322 |         $numbers[] = sprintf(' - "redirects": %d', count($redirects));
323 |         $numbers[] = sprintf(' - "translated": %d', count($translations));
324 |         $numbers[] = sprintf(' - "not in a directory": %d', count($directlyOnRoot));
325 |         $numbers[] = sprintf(' - "redirects for URL sanity": %d', count($sanity_redirs));
326 |         $numbers[] = sprintf(' - "edits average": %d', $edit_average);
327 |         $numbers[] = sprintf(' - "edits median": %d', $edit_median);
328 |         $this->filesystem->dumpFile('reports/numbers.txt', implode(PHP_EOL, $numbers));
329 | 
330 |         $this->filesystem->dumpFile('reports/hundred_revs.txt', implode(PHP_EOL, $moreThanHundredRevs));
331 | 
332 |         natcasesort($translations);
333 |         $this->filesystem->dumpFile('reports/translations.txt', implode(PHP_EOL, $translations));
334 |         natcasesort($directlyOnRoot);
335 |         $this->filesystem->dumpFile('reports/directly_on_root.txt', implode(PHP_EOL, $directlyOnRoot));
336 |         natcasesort($urlsWithContent);
337 |         $this->filesystem->dumpFile('reports/url_all.txt', implode(PHP_EOL, $urlsWithContent));
338 | 
339 |         natcasesort($urlParts);
340 |         $this->filesystem->dumpFile('reports/url_parts.txt', implode(PHP_EOL, $urlParts));
341 | 
342 |         // Creating list for https://github.com/webplatform/mediawiki-conversion/issues/2
343 |         ksort($urlPartsAll);
344 |         $urlPartsAllOut = array('All words that exist in a URL, and the different ways they are written (needs harmonizing!):');
345 |         foreach ($urlPartsAll as $urlPartsAllKey => $urlPartsAllRow) {
346 |             $urlPartsAllEntryUnique = array_unique($urlPartsAllRow);
347 |             if (count($urlPartsAllEntryUnique) > 1) {
348 |                 $urlPartsAllOut[] = sprintf(' - %s', implode(', ', $urlPartsAllEntryUnique));
349 |             }
350 |         }
351 |         $this->filesystem->dumpFile('reports/url_parts_variants.txt', implode(PHP_EOL, $urlPartsAllOut));
352 | 
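        // Hedged illustration of what the rewrite-map generation below emits:
        // for a made-up entry 'foo bar' => 'foo/bar', the escape table turns
        // the space into (\ |_), so the rule becomes
        //
        //   rewrite (?i)^/foo(\ |_)bar$ /foo/bar break;
        //
        // and, because the URL contains a space, it would be written out to
        // reports/4_nginx_redirects_spaces.map.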
353 |         ksort($redirects, SORT_NATURAL | SORT_FLAG_CASE);
354 |         ksort($sanity_redirs, SORT_NATURAL | SORT_FLAG_CASE);
355 | 
356 |         $nginx_almost_same_1 = ['# Most likely OK to ignore, but good enough to check whether the addresses here work'];
357 |         $nginx_almost_same_2 = ['# Most likely OK to ignore, but good enough to check whether the addresses here work'];
358 |         $nginx_almost_same_casing = [];
359 |         $nginx_redirects_spaces = [];
360 |         $nginx_redirects = [];
361 | 
362 |         $nginx_esc['Meta:'] = 'Meta/';
363 |         $nginx_esc['WPD:'] = 'WPD/';
364 |         $nginx_esc[':'] = '\\:';
365 |         $nginx_esc['('] = '\\(';
366 |         $nginx_esc[')'] = '\\)';
367 |         $nginx_esc['?'] = '\\?';
368 |         $nginx_esc[' '] = '(\ |_)'; // Ordering matters, otherwise the () would be escaped, and we want them kept here!
369 | 
370 |         $rewriteCheck[' '] = '(\ |_)'; // Ordering matters, otherwise the () would be escaped, and we want them kept here!
371 | 
372 |         $location_spaghetti = [];
373 |         $location_spaghetti_duplicated = [];
374 |         $hopefully_not_duplicate = [];
375 | 
376 |         $prepare_nginx_redirects = array_merge($sanity_redirs, $redirects);
377 |         foreach ($prepare_nginx_redirects as $url => $redirect_to) {
378 |             // NGINX case-insensitive redirect? It's done through (?i)! Should be documented!!!
379 |             $new_location = str_replace(array_keys($nginx_esc), $nginx_esc, $url);
380 |             $url_match_attempt = str_replace('(\ |_)', '_', $new_location);
381 |             $work_item = $url.':'.PHP_EOL.' - new_location: "'.$new_location.'"'.PHP_EOL.' - url_match_attempt: "'.$url_match_attempt.'"'.PHP_EOL.' - redirect_to: "'.$redirect_to.'"'.PHP_EOL;
382 |             $duplicate = false;
383 | 
384 |             if (array_key_exists(strtolower($url), $hopefully_not_duplicate)) {
385 |                 $location_spaghetti_duplicated[strtolower($url)] = $work_item;
386 |                 $duplicate = true;
387 |             } else {
388 |                 $hopefully_not_duplicate[strtolower($url)] = $work_item;
389 |             }
390 |             $location_spaghetti[] = $work_item;
391 | 
392 |             if ($duplicate === true) {
393 |                 $nginx_almost_same_1[] = sprintf('rewrite (?i)^/%s$ /%s break;', $new_location, $redirect_to);
394 |             } elseif ($url_match_attempt === $redirect_to) {
395 |                 $nginx_almost_same_2[] = sprintf('rewrite (?i)^/%s$ /%s break;', $new_location, $redirect_to);
396 |             } elseif (strtolower($url_match_attempt) === strtolower($redirect_to)) {
397 |                 $nginx_almost_same_casing[] = sprintf('rewrite (?i)^/%s$ /%s break;', $new_location, $redirect_to);
398 |             } elseif (stripos($url, ' ') > 1) {
399 |                 $nginx_redirects_spaces[] = sprintf('rewrite (?i)^/%s$ /%s break;', $new_location, $redirect_to);
400 |             } else {
401 |                 $nginx_redirects[] = sprintf('rewrite (?i)^/%s$ /%s break;', $new_location, $redirect_to);
402 |             }
403 |         }
404 |         $this->filesystem->dumpFile('reports/location_spaghetti_duplicated.txt', implode(PHP_EOL, $location_spaghetti_duplicated));
405 |         $this->filesystem->dumpFile('reports/location_spaghetti.txt', implode(PHP_EOL, $location_spaghetti));
406 |         $this->filesystem->dumpFile('reports/4_nginx_redirects_spaces.map', implode(PHP_EOL, $nginx_redirects_spaces));
407 |         $this->filesystem->dumpFile('reports/3_nginx_almost_same_1.map', implode(PHP_EOL, $nginx_almost_same_1));
408 |         $this->filesystem->dumpFile('reports/3_nginx_almost_same_2.map', implode(PHP_EOL, $nginx_almost_same_2));
409 |         $this->filesystem->dumpFile('reports/2_nginx_almost_same_casing.map', implode(PHP_EOL, $nginx_almost_same_casing));
410 |         $this->filesystem->dumpFile('reports/1_nginx.map', implode(PHP_EOL, $nginx_redirects));
411 | 
--------------------------------------------------------------------------------
/src/WebPlatform/Importer/Converter/HtmlToMarkdown.php:
--------------------------------------------------------------------------------
1 | 18 | */
19 | class HtmlToMarkdown implements ConverterInterface
20 | {
21 | 
22 |     protected $converter;
23 | 
24 |     protected $options = array(
25 |         "from" => "html",
26 |         "to" => "markdown_github+blank_before_header+blank_before_blockquote+definition_lists",
27 |         "atx-headers" => null,
28 |         "parse-raw" => null,
29 |         "no-highlight" => null,
30 |         "normalize" => null
31 |     );
32 | 
33 |     public function __construct()
34 |     {
35 |         $this->converter = new Pandoc();
36 | 
37 |         /**
38 |          * Language codes found in WebPlatform code samples
39 |          *
40 |          * - brush:
41 |          * - css
42 |          * - de1
43 |          * - glsl
44 |          * - html
45 |          * - html4strict
46 |          * - http
47 |          * - js
48 |          * - lang-css
49 |          * - lang-markup
50 |          * - other
51 |          * - php
52 |          * - prettyprint
53 |          * - python
54 |          * - script
55 |          * - xml
56 |          * - yaml
57 |          * - style="background-color:
58 |          */
59 |         $validLanguageCodes['html'] = ['markup', 'xhtml', 'html5', 'html4strict', 'lang-markup'];
60 |         $validLanguageCodes['css'] = ['lang-css'];
61 |         $validLanguageCodes['svg'] = [];
62 |         $validLanguageCodes['xml'] = [];
63 |         $validLanguageCodes['yaml'] = [];
64 |         $validLanguageCodes['js'] = ['script', 'javascript'];
65 | 
66 |         $this->languageCodeCallback = function ($matches) use ($validLanguageCodes) {
67 |             if (!is_array($matches) || !isset($matches[1])) {
68 |                 return '```';
69 |             }
70 | 
71 |             if (in_array($matches[1], array_keys($validLanguageCodes))) {
72 |                 return sprintf('``` %s', $matches[1]);
73 |             }
74 | 
75 |             // Some entries such as '``` {.script style="font-size: 16px;"}' have been found in $matches[0] :(
76 |             // ... in this case, keep only the part before the first space (strtok also handles values with no space at all).
77 |             $matches[1] = strtok($matches[1], ' ');
78 |             // ... Yup. Another input has "brush: .js" at $matches[1]. Let's trim that out too.
79 |             $matches[1] = str_replace('brush: .', '', $matches[1]);
80 | 
81 |             foreach ($validLanguageCodes as $kp => $possibilities) {
82 |                 if (in_array($matches[1], $possibilities)) {
83 |                     return sprintf('``` %s', $kp);
84 |                 }
85 |             }
86 | 
87 |             return '```';
88 |         };
89 | 
90 |         return $this;
91 |     }
92 | 
93 |     public function markdownify($html)
94 |     {
95 |         return $this->converter->runWith($html, $this->options);
96 |     }
97 | 
98 |     /**
99 |      * Convert parsed HTML into Markdown.
100 |      *
101 |      * @param AbstractRevision $revision Input we want to transform into Markdown
102 |      *
103 |      * @return AbstractRevision
104 |      */
105 |     public function apply(AbstractRevision $revision)
106 |     {
107 |         if ($revision instanceof HtmlRevision) {
108 |             $wasEmpty = $revision->isEmpty();
109 | 
110 |             // Since MediaWikiApiParseActionResponse
111 |             // implements \JsonSerializable
112 |             $dto = $revision->getApiResponseObject()->jsonSerialize();
113 |             $title = (isset($dto['parse']['displaytitle'])) ? $dto['parse']['displaytitle'] : $revision->getTitle();
114 | 
115 |             $html = $revision->getContent();
116 |             $matter_local = $revision->getFrontMatterData();
117 | 
118 |             $matter_local['uri'] = $title;
119 | 
120 |             if (isset($matter_local['broken_links']) && count($matter_local['broken_links']) >= 1) {
121 |                 $links = $matter_local['broken_links'];
122 |                 $matter_local['todo_broken_links']['note'] = 'During import MediaWiki could not find the following links,';
123 |                 $matter_local['todo_broken_links']['note'] .= ' please fix and adjust this list.';
124 |                 $matter_local['todo_broken_links']['links'] = $links;
125 |             }
126 |             unset($matter_local['broken_links']);
127 | 
128 |             if (isset($matter_local['tags']) && count($matter_local['tags']) < 1) {
129 |                 unset($matter_local['tags']);
130 |             }
131 | 
132 |             if (isset($matter_local['readiness'])) {
133 |                 $matter_local['readiness'] = str_replace('_', ' ', $matter_local['readiness']);
134 |             }
135 | 
136 |             if ($revision->isMarkdownConvertible() === true) {
137 |                 $content = $this->markdownify($html);
138 |                 $content = preg_replace_callback("/```\s?\{\.(.*)\}/muS", $this->languageCodeCallback, $content);
139 |             } else {
140 |                 $content = $html;
141 |             }
142 | 
143 |             if (isset($matter_local['tables']) && is_array($matter_local['tables'])) {
144 |                 $newTables = [];
145 |                 foreach ($matter_local['tables'] as $tableKey => $tableData) {
146 |                     $newTableData = [];
147 |                     foreach ($tableData as $subTableKey => $subtableValue) {
148 |                         $rowKeyCopy = $this->markdownify($subTableKey);
149 |                         $rowDataCopy = $this->markdownify($subtableValue);
150 |                         $newTableData[$rowKeyCopy] = $rowDataCopy;
151 |                     }
152 |                     $newTables[$tableKey] = $newTableData;
153 |                 }
154 |                 unset($matter_local['tables']);
155 |                 $matter_local = array_merge($matter_local, $newTables);
156 |             }
157 | 
158 |             if (isset($matter_local['attributions'])) {
159 |                 $newAttributions = [];
160 |                 foreach ($matter_local['attributions'] as $attributionRow) {
161 |                     $rowData = $this->markdownify($attributionRow);
162 |                     if (!empty($rowData)) {
163 |                         $newAttributions[] = $rowData;
164 |                     }
165 |                 }
166 |                 if (count($newAttributions) >= 1) {
167 |                     $matter_local['attributions'] = $newAttributions;
168 |                 } else {
169 |                     unset($matter_local['attributions']);
170 |                 }
171 |             }
172 | 
173 |             if (empty($content) && $wasEmpty === false) {
174 |                 $matter_local['notes'][] = 'Requires manual conversion! See https://github.com/webplatform/mediawiki-conversion/issues/24';
175 |                 $content = $revision->getTextContent();
176 |             }
177 | 
178 |             $newRev = new MarkdownRevision($content, $matter_local);
179 |             $newRev->setAuthor($revision->getAuthor());
180 | 
181 |             return $newRev;
182 |         }
183 | 
184 |         return $revision;
185 |     }
186 | }
187 | 
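Pandoc renders fenced code blocks with attribute syntax such as ``` {.lang-css}, which the languageCodeCallback built in the constructor above rewrites into plain fence info strings before the Markdown is saved. Below is a simplified, self-contained sketch of the same preg_replace_callback() pass; the $fenceFixer closure uses a deliberately tiny alias map for illustration, not the full mapping from the constructor:

<?php

// Sketch only: same regex as HtmlToMarkdown::apply(), reduced alias map.
$fenceFixer = function (array $matches) {
    $code = strtok($matches[1], ' ');           // drop trailing ' style="..."' junk
    $code = str_replace('brush: .', '', $code); // drop legacy 'brush: .js' prefixes
    $aliases = ['lang-css' => 'css', 'script' => 'js', 'javascript' => 'js'];

    return isset($aliases[$code]) ? '``` '.$aliases[$code] : '```';
};

$markdown = '``` {.lang-css}'.PHP_EOL.'body { color: red }'.PHP_EOL.'```';
echo preg_replace_callback("/```\s?\{\.(.*)\}/muS", $fenceFixer, $markdown);
// The opening fence becomes: ``` css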
--------------------------------------------------------------------------------
/src/WebPlatform/Importer/Converter/MediaWikiToHtml.php:
--------------------------------------------------------------------------------
1 | 27 | */
28 | class MediaWikiToHtml extends BaseConverter implements ConverterInterface
29 | {
30 |     /**
31 |      * Convert Wikitext into HTML through the MediaWiki API.
32 |      *
33 |      * Notice we purposefully do NOT extend parent::apply(), nor enforce
34 |      * $revision instanceof MediaWikiRevision, because we'll send back
35 |      * the HtmlRevision object self::factory gives us.
36 |      *
37 |      * @param AbstractRevision $revision Input we want to transform into HTML
38 |      *
39 |      * @return AbstractRevision
40 |      */
41 |     public function apply(AbstractRevision $revision)
42 |     {
43 |         if ($revision instanceof MediaWikiRevision) {
44 |             try {
45 |                 $mwparse = $this->getPageFromApi($revision->getTitle());
46 |             } catch (Exception $e) {
47 |                 $title = $revision->getTitle();
48 |                 $url = $this->apiUrl.urlencode($title);
49 |                 $message = sprintf('Could not get data from API for %s with the following URI %s', $title, $url);
50 |                 throw new Exception($message, 0, $e);
51 |             }
52 | 
53 |             if (!isset($mwparse['text']) || !isset($mwparse['text']['*'])) {
54 |                 throw new Exception('MediaWiki API did not return an HTML string from the parser');
55 |             }
56 | 
57 |             $content = $mwparse['text']['*'];
58 |             $matter_local = [];
59 | 
60 |             $matter_local['displaytitle'] = $mwparse['displaytitle'];
61 | 
62 |             if (isset($mwparse['categories']) && is_array($mwparse['categories'])) {
63 |                 foreach ($mwparse['categories'] as $catObj) {
64 |                     $matter_local['categories'][] = $catObj['*'];
65 |                 }
66 |             }
67 | 
68 |             if (isset($mwparse['links']) && is_array($mwparse['links'])) {
69 |                 foreach ($mwparse['links'] as $linkObj) {
70 |                     if (!isset($linkObj['exists'])) {
71 |                         $broken_links[] = $linkObj['*'];
72 |                     }
73 |                 }
74 |                 if (isset($broken_links) && count($broken_links) >= 1) {
75 |                     $matter_local['todo_broken_links']['note'] = 'During import MediaWiki could not find the following links,';
76 |                     $matter_local['todo_broken_links']['note'] .= ' please fix and adjust this list.';
77 |                     $matter_local['todo_broken_links']['links'] = $broken_links;
78 |                 }
79 |             }
80 | 
81 |             $pageDom = new GlHtml($content);
82 | 
83 |             $readinessMatches = $pageDom->get('.readiness-state');
84 |             if (isset($readinessMatches[0])) {
85 |                 $matter_local['readiness'] = str_replace('readiness-state ', '', $readinessMatches[0]->getAttribute('class'));
86 |                 $readinessMatches[0]->delete();
87 |             }
88 | 
89 |             $standardizationStatus = $pageDom->get('.standardization_status');
90 |             if (isset($standardizationStatus[0])) {
91 |                 $matter_local['standardisation_status'] = $standardizationStatus[0]->getText();
92 |                 $standardizationStatus[0]->delete();
93 |             }
94 | 
95 |             $contentRevisionNote = $pageDom->get('.is-revision-notes');
96 |             if (count($contentRevisionNote) >= 1) {
97 |                 if (isset($contentRevisionNote[0])) {
98 |                     foreach ($contentRevisionNote as $note) {
99 |                         $contentRevisionNoteText = $note->getText();
100 |                         $note->delete();
101 |                         if (!empty($contentRevisionNoteText) && strcmp('{{{', substr($contentRevisionNoteText, 0, 3)) !== 0) {
102 |                             $matter_local['notes'][] = $contentRevisionNoteText;
103 |                         }
104 |                     }
105 |                 }
106 |             }
107 | 
108 |             $dataMetasOut = [];
109 |             // Use data-type instead, and if data-meta exists, we know the key;
110 |             // the other one must be the value.
111 |             $tags = $pageDom->get('[data-meta]');
112 |             if (count($tags) >= 1) {
113 |                 foreach ($tags as $tag) {
114 |                     //$dataMetasKey = $tag->getDOMNode()->parentNode->getAttribute('data-meta');
115 |                     //$dataNodeObj = $tag->getDOMNode()->firstChild;
116 |                     //$dataMetasBody = '';
117 | 
118 |                     $metaName = $tag->getDOMNode()->parentNode->getAttribute('data-meta');
119 |                     $obj = ['content' => $tag->getHtml(), 'name' => $metaName];
120 |                     //var_dump($obj); // debugging leftover, commented out so the conversion output stays clean
121 | 
122 |                     /*
123 |                     if (isset($dataNodeObj->tagName) && $dataNodeObj->tagName !== 'span') {
124 |                         echo 'Is NOT a Span. Dig deeper.'.PHP_EOL;
125 |                         //$dataMetasBody = $dataNodeObj->nextSibling->textContent;
126 |                         var_dump($dataNodeObj->nextSibling->textContent);
127 |                     } else {
128 |                         echo 'Is a Span'.PHP_EOL;
129 |                         var_dump($dataNodeObj->textContent);
130 |                     }
131 | 
132 |                     if (isset($dataNodeObj->wholeText)) {
133 |                         echo 'Has wholeText';
134 |                         var_dump($dataNodeObj->wholeText);
135 |                     }
136 |                     */
137 | 
138 |                     //if (is_string($dataNodeObj->nextSibling) && $dataNodeObj->childNodes === null) {
139 |                     //    echo 'case 1'.PHP_EOL;
140 |                     /*
141 |                      * When we have text directly in the node
142 |                      *
143 |                      *
144 |                      * Returns
145 |                      *
146 |                      * Returns an object of type Object
147 |                      *
148 |                      * e.g.:
149 |                      *
150 |                      * object(DOMText)#176272 (19) {
151 |                      *   ["wholeText"]=> string(26) "Returns an object of type ",
152 |                      *   ["data"]=> string(26) "Returns an object of type ",
153 |                      *   ["length"]=> int(26),
154 |                      *   ["nodeName"]=> string(5) "#text",
155 |                      *   ["nodeValue"]=> string(26) "Returns an object of type ",
156 |                      *   ["nodeType"]=> int(3),
157 |                      *   ["parentNode"]=> string(22) "(object value omitted)",
158 |                      *   ["childNodes"]=> NULL,
159 |                      *   ["firstChild"]=> NULL,
160 |                      *   ["lastChild"]=> NULL,
161 |                      *   ["previousSibling"]=> NULL,
162 |                      *   ["nextSibling"]=> string(22) "(object value omitted)",
163 |                      *   ["attributes"]=> NULL,
164 |                      *   ["ownerDocument"]=> string(22) "(object value omitted)",
165 |                      *   ["namespaceURI"]=> NULL,
166 |                      *   ["prefix"]=> string(0) "",
167 |                      *   ["localName"]=> NULL,
168 |                      *   ["baseURI"]=> NULL,
169 |                      *   ["textContent"]=> string(26) "Returns an object of type "
170 |                      * }
171 |                      */
172 |                     // $dataMetasBody = $dataNodeObj->nextSibling->textContent;
173 |                     //} elseif ($dataNodeObj->childNodes !== null && count($dataNodeObj->childNodes) > 1) {
174 |                     //    echo 'case 2'.PHP_EOL;
175 | 
176 |                     /*
177 |                      * When we have nested italic.
178 |                      *
179 |                      * We want internal value "apis/web-storage/Storage";
180 |                      *
181 |                      * e.g.
182 |                      *
183 |                      * {{API_Object_Property
184 |                      * |Property_applies_to=apis/web-storage/Storage
185 |                      * }}
186 |                      *
187 |                      * If we dig at what API_Object_Property has, we have...
188 |                      *
189 |                      * {{#if:{{{Property_applies_to|}}}|''Property of [[{{{Property_applies_to|}}}]]''|}}
190 |                      *
191 |                      * Notice the ''property...'' between doubled single quotes.
192 |                      *
193 |                      * Generates the following HTML
194 |                      *
195 |                      *
196 |                      * Property of
197 |                      *
198 |                      * apis/web-storage/Storage
199 |                      *
200 |                      *
201 |                      *
202 |                      *
203 |                      * object(DOMElement)#176272 (17) {
204 |                      *   ["tagName"]=> string(1) "i",
205 |                      *   ["schemaTypeInfo"]=> NULL,
206 |                      *   ["nodeName"]=> string(1) "i",
207 |                      *   ["nodeValue"]=> string(36) "Property of apis/web-storage/Storage",
208 |                      *   ["nodeType"]=> int(1),
209 |                      *   ["parentNode"]=> string(22) "(object value omitted)",
210 |                      *   ["childNodes"]=> string(22) "(object value omitted)",
211 |                      *   ["firstChild"]=> string(22) "(object value omitted)",
212 |                      *   ["lastChild"]=> string(22) "(object value omitted)",
213 |                      *   ["previousSibling"]=> NULL,
214 |                      *   ["attributes"]=> string(22) "(object value omitted)",
215 |                      *   ["ownerDocument"]=> string(22) "(object value omitted)",
216 |                      *   ["namespaceURI"]=> NULL,
217 |                      *   ["prefix"]=> string(0) "",
218 |                      *   ["localName"]=> string(1) "i",
219 |                      *   ["baseURI"]=> NULL,
220 |                      *   ["textContent"]=> string(36) "Property of apis/web-storage/Storage"
221 |                      * }
222 |                      */
223 |                     // $dataMetasBody = $dataNodeObj->childNodes[1]->textContent;
224 |                     //} else {
225 |                     //    echo 'case else'.PHP_EOL;
226 |                     //}
227 | 
228 |                     //var_dump($dataNodeObj);
229 | 
230 |                     //if (!empty($dataMetasBody)) {
231 |                     //    $dataMetasOut[$dataMetasKey] = $dataMetasBody;
232 |                     //}
233 |                 }
234 |                 //$matter_local['foo'] = $dataMetasOut;
235 |             }
236 | 
237 |             $titles = $pageDom->get('h1,h2,h3,h4');
238 |             foreach ($titles as $title) {
239 |                 $title->replaceInner($title->getText());
240 |             }
241 | 
242 |             // Replacing HTML with purified version
243 |             //$configObject = [ 'safe' => 1, 'deny_attribute' => '*', 'keep_bad' => 2, 'make_tag_strict' => 1, 'balance' => 2];
244 |             //$configObject['elements'] => 'a,h1,h2,h3,h4,pre,code'
245 |             $content = $pageDom->get('body')[0]->getHtml();
246 | 
247 |             $matter_rev = $revision->getFrontMatterData();
248 | 
249 |             $newRev = new HtmlRevision($content, array_merge($matter_rev, $matter_local));
250 | 
251 |             return $newRev->setTitle($revision->getTitle());
252 |         }
253 | 
254 |         return $revision;
255 |     }
256 | }
257 | 
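Most of the front matter above is lifted out of the parsed HTML with a handful of GlHtml calls: get() with a CSS selector, getAttribute(), getText(), and delete() to strip the node from the exported body. A tiny sketch of that pattern on made-up HTML; the use statement assumes the class ships in the GlHtml namespace of the glicer package this project depends on, so verify it against composer.lock:

<?php

use GlHtml\GlHtml; // namespace assumed; check the package's autoloading

$html = '<div class="readiness-state Almost_Ready">flag</div><p>Body text</p>';
$pageDom = new GlHtml($html);

$matches = $pageDom->get('.readiness-state');
if (isset($matches[0])) {
    // The class attribute minus the marker class leaves the state value.
    $readiness = str_replace('readiness-state ', '', $matches[0]->getAttribute('class'));
    $matches[0]->delete(); // strip the flag so it does not leak into the body
}

echo $readiness;                          // Almost_Ready
echo $pageDom->get('body')[0]->getHtml(); // <p>Body text</p>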
192 | * 193 | * Generates the following HTML 194 | * 195 | * 196 | * Property of 197 | * 198 | * apis/web-storage/Storage 199 | * 200 | * 201 | * 202 | * 203 | * object(DOMElement)#176272 (17) { 204 | * ["tagName"]=> string(1) "i", 205 | * ["schemaTypeInfo"]=> NULL, 206 | * ["nodeName"]=> string(1) "i", 207 | * ["nodeValue"]=> string(36) "Property of apis/web-storage/Storage", 208 | * ["nodeType"]=> int(1), 209 | * ["parentNode"]=> string(22) "(object value omitted)", 210 | * ["childNodes"]=> string(22) "(object value omitted)", 211 | * ["firstChild"]=> string(22) "(object value omitted)", 212 | * ["lastChild"]=> string(22) "(object value omitted)", 213 | * ["previousSibling"]=> NULL, 214 | * ["attributes"]=> string(22) "(object value omitted)", 215 | * ["ownerDocument"]=> string(22) "(object value omitted)", 216 | * ["namespaceURI"]=> NULL, 217 | * ["prefix"]=> string(0) "", 218 | * ["localName"]=> string(1) "i", 219 | * ["baseURI"]=> NULL, 220 | * ["textContent"]=> string(36) "Property of apis/web-storage/Storage" 221 | * } 222 | */ 223 | // $dataMetasBody = $dataNodeObj->childNodes[1]->textContent; 224 | //} else { 225 | // echo 'case else'.PHP_EOL; 226 | //} 227 | 228 | //var_dump($dataNodeObj); 229 | 230 | //if (!empty($dataMetasBody)) { 231 | // $dataMetasOut[$dataMetasKey] = $dataMetasBody; 232 | //} 233 | } 234 | //$matter_local['foo'] = $dataMetasOut; 235 | } 236 | 237 | $titles = $pageDom->get('h1,h2,h3,h4'); 238 | foreach ($titles as $title) { 239 | $title->replaceInner($title->getText()); 240 | } 241 | 242 | // Replacing HTML with purified version 243 | //$configObject = [ 'safe' => 1, 'deny_attribute' => '*', 'keep_bad' => 2, 'make_tag_strict' => 1, 'balance' => 2]; 244 | //$configObject['elements'] => 'a,h1,h2,h3,h4,pre,code' 245 | $content = $pageDom->get('body')[0]->getHtml(); 246 | 247 | $matter_rev = $revision->getFrontMatterData(); 248 | 249 | $newRev = new HtmlRevision($content, array_merge($matter_rev, $matter_local)); 250 | 251 | return $newRev->setTitle($revision->getTitle()); 252 | } 253 | 254 | return $revision; 255 | } 256 | } 257 | -------------------------------------------------------------------------------- /src/WebPlatform/Importer/Filter/TitleFilter.php: -------------------------------------------------------------------------------- 1 | 50 | */ 51 | class TitleFilter extends AbstractFilter 52 | { 53 | /** 54 | * Rewrite only ones that would end up creating two folders with different Casing !== casing and create 55 | * an issue when we write files on a filesystem due to case sensitivity. 56 | * 57 | * List of replacements from mediawiki-conversion/data/url_parts_variants.txt, and notes on why some are commented 58 | * and other are. All should have been compared with their actual use from 2015-07-24 snapshot of our content and 59 | * all the urls in use from mediawiki-conversion/data/url_all.txt 60 | * 61 | * Don’t rewrite unless necessary. Otherwise we might lose links within the content. 
62 |      **/
63 |     public function __construct()
64 |     {
65 | 
66 |         // ones we shouldn’t impact                  // Keep commented // Why we commented
67 |         // --------------------------------------   // -------------- // --------------------
68 |         //$words[] = 'Accept';                      // X              // http/headers/Accept, html/attributes/accept, html/attributes/acceptCharset
69 |         //$words[] = 'ReadOnly';                    // X              // html/attributes/readonly, .../MediaStreamTrack/readonly
70 |         //$words[] = 'Accessibility_basics';        // X              // Accessibility_basics
71 |         //$words[] = 'Accessibility_testing';       // X              // Accessibility_testing
72 |         $words[] = 'Accessibility_article_ideas';
73 |         //$words[] = 'Animatable';                  // X
74 |         //$words[] = 'Animation';                   // X              // css/properties/animation, css/properties/animations,
75 |         $words[] = 'Canvas_tutorial';
76 |         //$words[] = 'Connection';                  // X
77 |         //$words[] = 'Cookie';                      // X              // http/headers/Cookie, dom/Document/cookie
78 |         //$words[] = 'css';                         // X
79 |         //$words[] = 'DataTransfer';                // X              // dom/DragEvent/dataTransfer, dom/DataTransfer, dom/DataTransfer/clearData
80 |         //$words[] = 'Date';                        // X
81 |         //$words[] = 'DOCTYPE';                     // X              // html/elements/DOCTYPE, dom/Document/doctype
82 |         //$words[] = 'Document';                    // X
83 |         //$words[] = 'element';                     // X
84 |         //$words[] = 'Error';                       // X
85 |         //$words[] = 'Event';                       // X
86 |         //$words[] = 'File';                        // X
87 |         //$words[] = 'FileSystem';                  // X
88 |         //$words[] = 'Floats_and_clearing';         // X              // tutorials/floats_and_clearing, Floats_and_clearing
89 |         //$words[] = 'formTarget';                  // X              // html/attributes/formtarget, dom/HTMLInputElement/formTarget, html/attributes/formtarget
90 |         //$words[] = 'Function';                    // X              // concepts/programming/javascript/functions, css/functions, javascript/Function, javascript/Function/bind
91 |         //$words[] = 'GamePad';                     // X              // tutorials/gamepad, apis/gamepad/Gamepad, apis/gamepad/GamepadEvent/gamepad
92 |         //$words[] = 'GeoLocation';                 // X              // apis/geolocation, apis/geolocation/Coordinates/accuracy, apis/geolocation/Geolocation/clearWatch
93 |         $words[] = 'Getting_Your_Content_Online';
94 |         //$words[] = 'Global';                      // X
95 |         $words[] = 'History';
96 |         $words[] = 'How_does_the_Internet_Work';
97 |         $words[] = 'Internet_and_Web';
98 |         //$words[] = 'ID';                          // X
99 |         //$words[] = 'Image';                       // X
100 |         //$words[] = 'Implementation';              // X
101 |         //$words[] = 'indexeddb';                   // X
102 |         //$words[] = 'ISO';                         // X
103 |         $words[] = 'JavaScript_for_mobile';
104 |         //$words[] = 'Link';                        // X
105 |         //$words[] = 'Location';                    // X              // apis/location/assign, apis/workers/WorkerGlobalScope/location, dom/KeyboardEvent/location, dom/Location/hash
106 |         //$words[] = 'Math';                        // X
107 |         //$words[] = 'MoveEnd';                     // X
108 |         //$words[] = 'MoveStart';                   // X
109 |         //$words[] = 'Navigator';                   // X
110 |         //$words[] = 'Node';                        // X
111 |         //$words[] = 'Number';                      // X
112 |         //$words[] = 'oauth';                       // X
113 |         //$words[] = 'Object';                      // X
114 |         //$words[] = 'onLine';                      // X
115 |         //$words[] = 'Option';                      // X
116 |         //$words[] = 'Performance';                 // X
117 |         //$words[] = 'PhotoSettingsOptions';        // X
118 |         //$words[] = 'PointerEvents';               // X
119 |         //$words[] = 'Position';                    // X
120 |         //$words[] = 'Q';                           // X
121 |         //$words[] = 'Range';                       // X
122 |         //$words[] = 'Region';                      // X
123 |         $words[] = 'removeStream';
124 |         //$words[] = 'selection';                   // X
125 |         //$words[] = 'selectors';                   // X
126 |         //$words[] = 'storage';                     // X
127 |         //$words[] = 'string';                      // X
128 |         //$words[] = 'StyleMedia';                  // X
129 |         //$words[] = 'styleSheet';                  // X
130 |         //$words[] = 'Styling_lists_and_links';     // X              // guides/Styling lists and links, tutorials/styling lists and links
131 |         //$words[] = 'Styling_tables';              // X              // guides/styling tables, Styling tables
132 |         //$words[] = 'text';                        // X
133 |         //$words[] = 'tfoot';                       // X
134 |         //$words[] = 'the_basics_of_html';          // X              // guides/the basics of html/ko, guides/the basics of html, tutorials/The basics of HTML
135 |         $words[] = 'The_History_of_the_Web';
136 |         //$words[] = 'thead';                       // X
137 |         //$words[] = 'timeStamp';                   // X
138 |         //$words[] = 'tutorials';                   // X
139 |         //$words[] = 'Unicode';                     // X
140 |         //$words[] = 'url';                         // X
141 |         //$words[] = 'websocket';                   // X
142 |         $words[] = 'What_does_a_good_web_page_need';
143 |         $words[] = 'Translations';
144 | 
145 |         // Ones that are common in a URL but that, in this
146 |         // precise context, had casing discrepancies.
147 |         $words[] = 'css\/cssom\/styleSheet';
148 |         $words[] = 'css\/selectors';
149 |         $words[] = 'dom\/DOMTokenList';
150 |         $words[] = 'tutorials\/HTML_forms';
151 | 
152 |         $matchers = [];
153 |         foreach ($words as $k => $word) {
154 |             $matchers[] = sprintf('/%s/iuS', $word);
155 |             // We need to remove the RegEx escaping for
156 |             // the replacement at addPass below.
157 |             $words[$k] = stripslashes($word);
158 |         }
159 | 
160 |         $this->addPass($matchers, $words);
161 | 
162 |         return $this;
163 |     }
164 | }
165 | 
--------------------------------------------------------------------------------
/src/WebPlatform/Importer/GitPhp/CommitCommandBuilder.php:
--------------------------------------------------------------------------------
1 | /', $author, $matches);
21 | 
22 |         if (isset($matches[1])) {
23 |             $this->processBuilder->setEnv('GIT_COMMITTER_NAME', $matches[1]);
24 |         }
25 |         if (isset($matches[2])) {
26 |             $this->processBuilder->setEnv('GIT_COMMITTER_EMAIL', $matches[2]);
27 |         }
28 | 
29 |         return parent::author($author);
30 |     }
31 | 
32 |     public function date($date)
33 |     {
34 |         $this->processBuilder->setEnv('GIT_COMMITTER_DATE', $date);
35 | 
36 |         return parent::date($date);
37 |     }
38 | }
39 | 
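Exporting GIT_COMMITTER_NAME, GIT_COMMITTER_EMAIL, and GIT_COMMITTER_DATE is what makes the committer identity match the wiki author instead of the machine running the import; git would otherwise record the local user and the current time. A sketch of how importer code can drive it, together with the GitRepository override below; the message()/execute() chain follows the upstream bit3/git-php builder API and should be treated as an assumption:

<?php

use WebPlatform\Importer\GitPhp\GitRepository;

$repository = new GitRepository('/path/to/out'); // path is illustrative

$repository
    ->commit()                                        // CommitCommandBuilder from below
    ->message('Import: latest revision of some/page') // assumed upstream builder method
    ->author('Jane Doe <jane@docs.webplatform.org>')  // also exports GIT_COMMITTER_NAME/EMAIL
    ->date('Fri, 24 Jul 2015 12:00:00 +0000')         // also exports GIT_COMMITTER_DATE
    ->execute();                                      // assumed upstream builder method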
--------------------------------------------------------------------------------
/src/WebPlatform/Importer/GitPhp/GitRepository.php:
--------------------------------------------------------------------------------
1 | 14 | */
15 | class GitRepository extends BaseGitRepository
16 | {
17 |     /**
18 |      * Create commit command.
19 |      *
20 |      * @return CommitCommandBuilder
21 |      */
22 |     public function commit()
23 |     {
24 |         return new CommitCommandBuilder($this);
25 |     }
26 | }
27 | 
--------------------------------------------------------------------------------
/src/WebPlatform/Importer/Helpers/MediaWikiHelper.php:
--------------------------------------------------------------------------------
1 | 14 | */
15 | class MediaWikiHelper extends BaseMediaWikiHelper
16 | {
17 | }
18 | 
--------------------------------------------------------------------------------
/src/WebPlatform/Importer/Model/MarkdownRevision.php:
--------------------------------------------------------------------------------
1 | 15 | */
16 | class MarkdownRevision extends BaseMarkdownRevision
17 | {
18 |     public function getFrontMatter()
19 |     {
20 |         $yaml = new Dumper();
21 |         $yaml->setIndentation(2);
22 | 
23 |         if (!empty($this->getTitle()) && !isset($this->front_matter['title'])) {
24 |             $this->front_matter['title'] = $this->getTitle();
25 |         }
26 | 
27 |         ksort($this->front_matter);
28 | 
29 |         $out[] = '---';
30 |         $titleCopy = str_replace("'", "''", $this->front_matter['title']); // YAML escapes a single quote by doubling it
31 |         unset($this->front_matter['title']);
32 |         $out[] = sprintf("title: '%s'", $titleCopy);
33 | 
34 |         if (!empty($this->front_matter)) {
35 |             $out[] = $yaml->dump($this->front_matter, 3, 0, false, false);
36 |         }
37 |         $out[] = '---';
38 | 
39 |         return implode(PHP_EOL, $out);
40 |     }
41 | }
42 | 
--------------------------------------------------------------------------------
/src/WebPlatform/Importer/Model/MediaWikiDocument.php:
--------------------------------------------------------------------------------
1 | 14 | */
15 | class MediaWikiDocument extends BaseMediaWikiDocument
16 | {
17 |     // List namespaces
18 |     public static $NAMESPACE_PREFIXES = array('10' => 'Template:', '102' => 'Property:', '15' => 'Category:', '3000' => 'WPD:', '3020' => 'Meta:');
19 | 
20 |     /** @var string page Title, but in MW it ends up being a URL too */
21 |     protected $title = null;
22 | 
23 |     /** @var mixed string representation of the possible path, or false if no redirect was specified */
24 |     protected $redirect = false;
25 | 
26 |     const LANG_ENGLISH = 0;
27 | 
28 |     const LANG_JAPANESE = 'ja';
29 | 
30 |     const LANG_GERMAN = 'de';
31 | 
32 |     const LANG_TURKISH = 'tr';
33 | 
34 |     const LANG_KOREAN = 'ko';
35 | 
36 |     const LANG_SPANISH = 'es';
37 | 
38 |     const LANG_PORTUGUESE_BRAZIL = 'pt-br';
39 | 
40 |     const LANG_PORTUGUESE = 'pt';
41 | 
42 |     const LANG_CHINESE = 'zh';
43 | 
44 |     const LANG_CHINESE_HANT = 'zh-hant';
45 | 
46 |     const LANG_CHINESE_HANS = 'zh-hans';
47 | 
48 |     const LANG_FRENCH = 'fr';
49 | 
50 |     const LANG_SWEDISH = 'sv';
51 | 
52 |     const LANG_DUTCH = 'nl';
53 | 
54 |     /**
55 |      * String RegEx to find if the page is a page translation.
56 |      *
57 |      * From https://docs.webplatform.org/wiki/Template:Languages?action=raw
58 |      *
59 |      * Removed:
60 |      *
61 |      * - id (no translations made in this language)
62 |      * - th (^)
63 |      *
64 |      * Added:
65 |      *
66 |      * - zh-hant
67 |      * - zh-hans
68 |      *
69 |      * Should reflect the list of defined translations in the [[Template:Languages]] source.
70 |      */
71 |     const REGEX_LANGUAGES = '/\/(ar|ast|az|bcc|bg|ca|cs|da|de|diq|el|eo|es|fa|fi|fr|gl|gu|he|hu|hy|it|ja|ka|kk|km|ko|ksh|kw|mk|ml|mr|ms|nl|no|oc|pl|pt|pt\-br|ro|ru|si|sk|sl|sq|sr|sv|ta|tr|uk|vi|yue|zh|zh\-hant|zh\-hans)"$/';
72 | 
73 |     /**
74 |      * Translation codes commonly used in WebPlatform Docs.
75 |      *
76 |      * Each key represents a language code generally put at the end of a page URL (e.g. Main_Page/es).
77 |      *
78 |      * Value is an array of two:
79 |      * 1. CAPITALIZED English name of the language (e.g. self::$translationCodes['zh'][0] would be 'CHINESE'), so we can map back to self::CHINESE,
80 |      * 2. Language name in its native form (e.g. self::$translationCodes['zh'][1] would be '中文')
81 |      *
82 |      * See also:
83 |      * - https://docs.webplatform.org/w/index.php?title=Special%3AWhatLinksHere&target=Template%3ALanguages&namespace=0
84 |      * - https://docs.webplatform.org/wiki/WPD:Translations
85 |      * - https://docs.webplatform.org/wiki/WPD:Multilanguage_Support
86 |      * - https://docs.webplatform.org/wiki/WPD:Implementation_Patterns
87 |      * - http://www.w3.org/International/articles/language-tags/
88 |      *
89 |      * Ideally we should use self::REGEX_LANGUAGES but, after looking at the dumpBackup XML file, only the following had contents:
90 |      *
91 |      * [de,es,fr,ja,ko,nl,pt-br,sv,tr,zh,zh-hant,zh-hans]
92 |      *
93 |      * @var array
94 |      */
95 |     public static $translationCodes = array(
96 |         'en' => ['ENGLISH', 'English'],
97 |         'ja' => ['JAPANESE', '日本語'],
98 |         'de' => ['GERMAN', 'Deutsch'],
99 |         'tr' => ['TURKISH', 'Türkçe'],
100 |         'ko' => ['KOREAN', '한국어'],
101 |         'es' => ['SPANISH', 'Español'],
102 |         'pt-br' => ['PORTUGUESE_BRAZIL', 'Português do Brasil'],
103 |         'pt' => ['PORTUGUESE', 'Português'],
104 |         'zh' => ['CHINESE', '中文'],
105 |         'zh-hant' => ['CHINESE_HANT', '中文(繁體)'],
106 |         'zh-hans' => ['CHINESE_HANS', '中文(简体)'],
107 |         'fr' => ['FRENCH', 'Français'],
108 |         'sv' => ['SWEDISH', 'Svenska'],
109 |         'nl' => ['DUTCH', 'Nederlands'],
110 |     );
111 | 
112 |     /**
113 |      * We expect this to be OK *only* when the entry *just before*
114 |      * the last one *IS* either "elements" or "attributes", because
115 |      * the current implementation used language codes that were
116 |      * conflated with valid HTML/SVG/SGML elements and attributes.
117 |      *
118 |      * e.g. [tr, id, ...]
119 |      *
120 |      * - html/elements/tr
121 |      * - html/attributes/id
122 |      * - svg/attributes/marker/tr
123 |      * - mathml/elements/menclose
124 |      *
125 |      * @return bool
126 |      */
127 |     public function isChildOfKnownPageListing()
128 |     {
129 |         $knownPageListings = ['elements', 'attributes'];
130 | 
131 |         $needles = explode('/', $this->getName());
132 |         $size = (int) count($needles);
133 | 
134 |         if ($size < 2) {
135 |             return false;
136 |         }
137 | 
138 |         return in_array($needles[$size - 2], $knownPageListings);
139 |     }
140 | 
141 |     public function isTranslation()
142 |     {
143 |         // An edge case: contents such as html/elements/tr.
144 |         if ($this->isChildOfKnownPageListing()) {
145 |             return false;
146 |         }
147 | 
148 |         return in_array($this->getLastTitleFragment(), array_keys(self::$translationCodes)) === true;
149 |     }
150 | 
151 |     public function getDocumentTitle()
152 |     {
153 |         $title = $this->title;
154 |         if ($this->isTranslation()) {
155 |             $parts = explode('/', $title);
156 |             $select = count($parts) - 2;
157 | 
158 |             if (isset($parts[$select])) {
159 |                 return $parts[$select];
160 |             }
161 |         }
162 | 
163 |         return $this->getLastTitleFragment();
164 |     }
165 | 
166 |     public function getLastTitleFragment()
167 |     {
168 |         $title = $this->getTitle();
169 | 
170 |         return (strrpos($title, '/') === false) ? $title : substr($title, (int) strrpos($title, '/') + 1);
171 |     }
172 | }
173 | 
--------------------------------------------------------------------------------
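To close, a short sketch of how the translation helpers in MediaWikiDocument behave. The constructor argument is an assumption (the importer feeds it page nodes from the dumpBackup XML), but the return values follow directly from the methods above:

<?php

use WebPlatform\Importer\Model\MediaWikiDocument;

$doc = new MediaWikiDocument($pageNode); // constructor input assumed from dumpBackup XML

// Given a title of 'tutorials/Web_Education_Intro/ja':
$doc->getLastTitleFragment(); // 'ja'
$doc->isTranslation();        // true: 'ja' is a key of self::$translationCodes
$doc->getDocumentTitle();     // 'Web_Education_Intro', the fragment before the language code

// Given a title of 'html/elements/tr':
// isChildOfKnownPageListing() returns true ('elements' precedes the last fragment),
// so isTranslation() stays false even though 'tr' is also the Turkish language code.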