$count unknown wikipedia tags found while WIWOSM-processing

$now
$sortmessage

12.01.2013: Now all wikipedia tags with an undefined language like wikipedia=article are shown, too! Sorry for the big load!

14.01.2013: Edit links are now available for Potlatch 2 and for JOSM or Merkaartor with the RemoteControl plugin.

OSM-Object
Edit links
language
article

$htmlrows
EOT;

	//write that stuff to a file
	$fh = fopen(self::PROJECT_PATH . 'public_html/wiwosmlog/broken.html','w');
	fwrite($fh, $html);
	fclose($fh);
}

/**
 * Write all broken languages with their articles into a nice JSON logfile
 **/
function logUnknownJSON() {
	// get all broken languages
	$query = 'SELECT DISTINCT ON (osm_id) osm_id,lang,article,geomtype,iso2,name FROM ( SELECT osm_id,lang,article,array_agg(ST_GeometryType(way)) AS geomtype, ST_Transform(ST_SetSRID(ST_Extent(way),900913),4326) AS extent FROM wiwosm WHERE wikidata_ref = -1 GROUP BY osm_id,lang,article ) AS w LEFT JOIN wiwosm_tm_world_borders_simple ON ST_Intersects(extent, ST_SetSRID(geom,4326))';
	$result = pg_query($this->getPgConn(),$query);
	$count = pg_num_rows($result);
	$json = '{"created":"'.date(DATE_RFC822).'","count":"'.$count.'","items":[';
	$r = array();
	while ($row = pg_fetch_assoc($result)) {
		$r['i'] = $row['osm_id'];
		$r['t'] = 'w';
		if ($row['geomtype']=='{ST_Point}') $r['t'] = 'n';
		if ($row['osm_id'] < 0) {
			$r['t'] = 'r';
			// if relation remove leading minus
			$r['i'] = substr($row['osm_id'],1);
		}
		$r['l'] = $row['lang'];
		$r['a'] = $row['article'];
		$r['c'] = ''.$row['name'];
		$r['s'] = ''.$row['iso2'];
		$json .= json_encode($r).',';
	}
	$json = rtrim($json,',');
	$json .= ']}';

	//write that stuff to a gzipped json file
	$handle = gzopen(self::PROJECT_PATH . 'public_html/wiwosmlog/broken.json.gz','w');
	gzwrite($handle,$json);
	gzclose($handle);
}
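
// One item of broken.json.gz as written above looks like this (values illustrative):
//   {"i":"123","t":"n","l":"de","a":"Some article","c":"Germany","s":"DE"}
// t is n/w/r for node/way/relation; c and s are the guessed country name and iso2 code.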

/**
 * We have to make sure that the full update process worked, so we simply test whether there are enough files in our update directory.
 * If that is the case we remove the _old dir, move the current dir to _old and the _update dir to the current dir.
 **/
function testAndRename() {
	//$countFiles = system('ls -RU1 --color=never '.$json_path.' | wc -l');
	$this->logMessage('Execution time: '.((microtime(true)-$this->start)/60)."min\n", 2);
	$this->logMessage('Counting generated files …'."\n", 2);
	$countFiles = system('find '.$this->json_path.' -type f | wc -l');
	// if there are more than 100000
	if ( $countFiles > 100000 ) {
		//exec('mv -T ' . self::JSON_PATH . '_old ' . self::JSON_PATH . '_old_remove');
		//exec('mv -T ' . self::JSON_PATH . ' ' . self::JSON_PATH . '_old');
		//exec('mv -T ' . $this->json_path . ' ' . self::JSON_PATH );
		//unlink(self::JSON_PATH);
		//symlink($this->json_path,'geojsongz');
		exec('ln -snf '.basename($this->json_path).' '.dirname($this->json_path).'/geojsongz');
		//rename(self::JSON_PATH . '_old',self::JSON_PATH . '_old_remove');
		//rename(self::JSON_PATH , self::JSON_PATH . '_old');
		//rename($this->json_path , self::JSON_PATH );
		// let the cleanup cronjob remove the old directory
		//exec('rm -rf /mnt/user-store/wiwosm/geojsongz_old_remove &');
	}

}

/**
 * Just create some indices on the wiwosm table that may help to speed things up.
 **/
function createIndices() {
	$query = <<<EOQ
…
EOQ;
	pg_query($this->getPgConn(),$query);
}
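
// The index DDL sits in the heredoc above and is not visible in this listing. Given
// the lookups this class performs, a minimal plausible version (an assumption, not
// the original statements) would be:
//   CREATE INDEX wiwosm_lang_article_idx ON wiwosm (lang, article);
//   CREATE INDEX wiwosm_wikidata_ref_idx ON wiwosm (wikidata_ref);
//   CREATE INDEX wiwosm_osm_id_idx ON wiwosm (osm_id);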

/**
 * Throw away the wiwosm table and rebuild it from the mapnik db
 **/
function createWiwosmDB() {
	$query = <<<EOQ
BEGIN;
DROP TABLE IF EXISTS wiwosm;
CREATE TABLE wiwosm AS (
SELECT osm_id, wikidata_ref, way, ( CASE WHEN position(':' in wikipedia)>0 THEN lower(split_part(wikipedia, ':', 1)) ELSE '' END ) AS lang, split_part(substring(wikipedia from position(':' in wikipedia)+1),'#', 1) AS article, split_part(wikipedia,'#', 2) AS anchor FROM (
SELECT osm_id, way,
( CASE WHEN strpos(keys_string, 'wikipedia')>0 THEN
regexp_replace(
substring(
concat(
substring(keys_string from 'wikipedia:?[^,]*'), -- this is the tag name, for example "wikipedia" or "wikipedia:de"
':',
regexp_replace(
tags->substring(keys_string from 'wikipedia:?[^,]*'), -- get the first wikipedia tag from hstore
'^(?:https?://)?(\\w*)\\.wikipedia\\.org/wiki/(.*)$', -- matches if the value is a wikipedia url (otherwise it is an article)
'\\1:\\2' -- get the domain prefix and use it as language key followed by the article name
)
) -- resulting string is for example wikipedia:de:Dresden
from 11 -- remove the "wikipedia:" prefix
),
'^(\\w*:)\\1','\\1' -- something like "de:de:Artikel" can be left over if there was a tag like "wikipedia:de=http://de.wikipedia.org/wiki/Artikel", so remove doubled language labels
) ELSE '' END) AS "wikipedia",
( CASE WHEN wikidata ~ '^Q\\d+$' THEN -- try to get the wikidata ref from osm if it is formed like wikidata=Q1234
CAST(substring(wikidata from 2) AS INTEGER) -- we strip the Q and cast to Integer
ELSE 0 END) AS "wikidata_ref"
FROM (
( SELECT osm_id, tags, array_to_string(akeys(tags),',') AS keys_string, tags->'wikidata' AS wikidata, way FROM planet_osm_point WHERE concat(',',array_to_string(akeys(tags),',')) ~ ',wiki(data|pedia)' )
UNION ( SELECT osm_id, tags, array_to_string(akeys(tags),',') AS keys_string, tags->'wikidata' AS wikidata, way FROM planet_osm_line WHERE concat(',',array_to_string(akeys(tags),',')) ~ ',wiki(data|pedia)' AND NOT EXISTS (SELECT 1 FROM planet_osm_polygon WHERE planet_osm_polygon.osm_id = planet_osm_line.osm_id) ) -- we don't want LineStrings that already exist as polygons
UNION ( SELECT osm_id, tags, array_to_string(akeys(tags),',') AS keys_string, tags->'wikidata' AS wikidata, way FROM planet_osm_polygon WHERE concat(',',array_to_string(akeys(tags),',')) ~ ',wiki(data|pedia)' )
) AS wikistaff
) AS wikiobjects
ORDER BY article,lang ASC
)
;
UPDATE wiwosm SET wikidata_ref=-1 WHERE wikidata_ref = 0 AND lang = ANY (ARRAY['','http','subject','name','operator','related','sculptor','architect','maker']); -- we know that there cannot be a language reference in Wikipedia for these lang values
COMMIT;
EOQ;

	pg_query($this->getPgConn(),$query);
	if($e = pg_last_error()) {
		trigger_error($e, E_USER_ERROR);
		exit();
	}
}
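
// Examples of the tag normalisation in createWiwosmDB() (article names illustrative):
//   wikipedia=de:Dresden                                  -> lang 'de', article 'Dresden'
//   wikipedia:de=Dresden                                  -> lang 'de', article 'Dresden'
//   wikipedia=http://de.wikipedia.org/wiki/Dresden        -> lang 'de', article 'Dresden'
//   wikipedia:de=http://de.wikipedia.org/wiki/Artikel#Ort -> lang 'de', article 'Artikel', anchor 'Ort'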

function updateWiwosmDB() {
	$this->createWiwosmDB();
	$this->logMessage('wiwosm DB basic table built in '.((microtime(true)-$this->start)/60)." min\nStarting additional relation adding …\n", 2);
	$this->addMissingRelationObjects();
	$this->logMessage('Missing relations added '.((microtime(true)-$this->start)/60)." min\nCreate indices and link article languages …\n", 2);
	$this->createIndices();
	$this->map_wikidata_languages();
	$this->logMessage('wiwosm DB updated in '.((microtime(true)-$this->start)/60)." min\n", 2);
}

/**
 * Get all members of a relation, and if there are children that are relations too, recursively traverse them
 * @param string $memberscsv This is a comma separated string of relation children to process
 * @param array $nodelist This is the list of all nodes we have traversed until now. It is passed by reference so we can add the current node members there.
 * @param array $waylist Same as with nodes but for ways.
 * @param array $rellist Same as with nodes but for relations. This list is also used to check for loops while recursively traversing the relations.
 **/
function getAllMembers($memberscsv,&$nodelist,&$waylist,&$rellist) {
	$subrellist = array();
	$members = str_getcsv($memberscsv,',','"');
	for ($i=0; $i<count($members); $i+=2) {
		// members presumably alternate id and role; the id carries its type as first
		// char, so sort "n…"/"w…" ids into the lists and collect child relations
		$id = substr($members[$i],1);
		switch ($members[$i][0]) {
			case 'n': $nodelist[] = $id; break;
			case 'w': $waylist[] = $id; break;
			case 'r': $subrellist[] = $id; break;
		}
	}
	// only recurse into relations we have not seen before (loop protection)
	$newrels = array_diff($subrellist,$rellist);
	if (count($newrels)>0) {
		$newrelscomplement = array();
		foreach ($newrels AS $rel) {
			$newrelscomplement[] = '-'.$rel;
			$rellist[] = $rel;
		}

		$newrelscsv = implode(',',$newrelscomplement);

		$result = pg_execute($this->getPgConn(),'get_existing_member_relations',array('{'.$newrelscsv.'}'));
		$existingrels = ($result) ? pg_fetch_all_columns($result, 0) : array();
		// we can simply add the existing relations with their negative id as if they were nodes or ways
		$nodelist = array_merge($nodelist,$existingrels);
		$waylist = array_merge($waylist,$existingrels);

		// all other relations we have to pick from the planet_osm_rels table
		$othersubrels = array_diff($newrelscomplement,$existingrels);
		if (count($othersubrels)>0) {
			$othersubrelscsv = '';
			// first strip off the "-" and build csv
			foreach($othersubrels AS $subrel) {
				$othersubrelscsv .= ','.substr($subrel,1);
			}
			$othersubrelscsv = substr($othersubrelscsv,1);

			$res = pg_execute($this->getPgConn(),'get_member_relations_planet_osm_rels',array('{'.$othersubrelscsv.'}'));
			if ($res) {
				// fetch all members of all subrelations and combine them to one csv string
				$allsubmembers = pg_fetch_all_columns($res, 0);
				$allsubmemberscsv = '';
				foreach($allsubmembers AS $submem) {
					// if submembers exist add them to csv
					if ($submem) $allsubmemberscsv .= ','.substr($submem,1,-1);
				}
				// call this function again to process all subrelations and add them to the arrays
				$this->getAllMembers(substr($allsubmemberscsv,1),$nodelist,$waylist,$rellist);
			}
		}
	}
}
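
// Illustrative call (ids invented): for members '"n12","","w34","outer","r56",""'
// getAllMembers() adds '12' to $nodelist, '34' to $waylist, notes '56' in $rellist
// and recursively merges the members of relation 56 into the lists.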

/**
 * Osm2pgsql doesn't load all relations into the standard mapnik database by default, because mapnik doesn't need them.
 * But we need them, because they can be tagged with wikipedia tags. So we have to look in the planet_osm_rels table, a preprocessing table of osm2pgsql that holds all the information we need, but in an ugly format.
 * So we search the tags and members arrays for something useful, get the objects from the mapnik tables and store them in wiwosm.
 **/
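// In planet_osm_rels, tags and members are Postgres text arrays serialised as strings,
// e.g. (values illustrative): tags = '{type,multipolygon,wikipedia,"de:Dresden"}',
// members = '{w41,outer,w42,inner,r7,subarea}' (alternating key/value resp. id/role).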
function addMissingRelationObjects() {
	// prepare some often used queries:
	$pgconn = $this->getPgConn();

	// search for existing relations that are built in the osm2pgsql default scheme (executed in getAllMembers function!)
	$result = pg_prepare($pgconn,'get_existing_member_relations','SELECT DISTINCT osm_id FROM (
		(SELECT osm_id FROM planet_osm_point WHERE osm_id = ANY ($1))
		UNION (SELECT osm_id FROM planet_osm_line WHERE osm_id = ANY ($1))
		UNION (SELECT osm_id FROM planet_osm_polygon WHERE osm_id = ANY ($1))
	) AS existing');
	if ($result === false) exit();

	// fetch all members of all subrelations and combine them to one csv string (executed in getAllMembers function!)
	$result = pg_prepare($pgconn,'get_member_relations_planet_osm_rels','SELECT members FROM planet_osm_rels WHERE id = ANY ($1)');
	if ($result === false) exit();

	// insert ways and polygons in wiwosm
	$result = pg_prepare($pgconn,'insert_relways_wiwosm','INSERT INTO wiwosm SELECT $1 AS osm_id, $2 AS wikidata_ref, ST_Collect(way) AS way, $3 AS lang, $4 AS article, $5 AS anchor FROM (
		(SELECT way FROM planet_osm_polygon WHERE osm_id = ANY ($6) )
		UNION ( SELECT way FROM planet_osm_line WHERE osm_id = ANY ($6) AND NOT EXISTS (SELECT 1 FROM planet_osm_polygon WHERE planet_osm_polygon.osm_id = planet_osm_line.osm_id) )
	) AS members');
	if ($result === false) exit();

	// insert nodes in wiwosm
	$result = pg_prepare($pgconn,'insert_relnodes_wiwosm','INSERT INTO wiwosm SELECT $1 AS osm_id, $2 AS wikidata_ref, ST_Collect(way) AS way, $3 AS lang, $4 AS article, $5 AS anchor FROM (
		(SELECT way FROM planet_osm_point WHERE osm_id = ANY ($6) )
	) AS members');
	if ($result === false) exit();

	$query = "SELECT id,members,tags FROM planet_osm_rels WHERE array_to_string(tags,',') ~ 'wiki(pedia|data)' AND -id NOT IN ( SELECT osm_id FROM wiwosm WHERE osm_id<0 )";
	$result = pg_query($pgconn,$query);
	if ($result === false) exit();
	while ($row = pg_fetch_assoc($result)) {
		// if the relation has no members ignore it and try the next one
		if (!$row['members']) continue;
		$wikidata_ref = 0;
		$lang = '';
		$article = '';
		$anchor = '';

		$has_wikipedia_tag = false;
		$has_wikidata_tag = false;

		$tagscsv = str_getcsv(substr($row['tags'],1,-1),',','"');
		for ($i=0; $i<count($tagscsv); $i+=2) {
			// … (the tag evaluation is not shown here: the key/value pairs are scanned
			// for wikipedia/wikidata tags and $lang, $article, $anchor, $wikidata_ref
			// and the two flags are set accordingly)
			// if we have found a wikipedia and a wikidata tag -> stop looping the tags
			if ($has_wikipedia_tag && $has_wikidata_tag) break;
		}
		// if we found a wikipedia or wikidata tag we fetch all relation members
		if ($has_wikipedia_tag || $has_wikidata_tag) {
			$nodelist = array();
			$waylist = array();
			$rellist = array($row['id']);
			$this->getAllMembers(substr($row['members'],1,-1),$nodelist,$waylist,$rellist);
			$nodelist = array_unique($nodelist);
			$waylist = array_unique($waylist);
			$nodescsv = implode(',',$nodelist);
			$wayscsv = implode(',',$waylist);
			$hasNodes = (count($nodelist)>0);
			$hasWays = (count($waylist)>0);
			if ($hasWays) {
				pg_execute($pgconn,'insert_relways_wiwosm',array('-'.$row['id'],$wikidata_ref,$lang,$article,$anchor,'{'.$wayscsv.'}'));
			}
			if ($hasNodes) {
				pg_execute($pgconn,'insert_relnodes_wiwosm',array('-'.$row['id'],$wikidata_ref,$lang,$article,$anchor,'{'.$nodescsv.'}'));
			}
		}
	}
}

/**
 * We want to work with real php arrays, so we have to process the hstore string
 * @param string $hstore This is a string returned by a postgresql hstore column that looks like: '"foo"=>"bar", "baz"=>"blub", "lang"=>"article"' …
 * @return array return a php array with languages as keys and articles as values
 **/
public static function hstoreToArray($hstore) {
	$ret_array = array();
	if (preg_match_all('/(?:^| )"((?:[^"]|(?<=\\\\)")*)"=>"((?:[^"]|(?<=\\\\)")*)"(?:$|,)/',$hstore,$matches)) {
		$count = count($matches[1]);
		if ($count == count($matches[2])) {
			for($i=0; $i<$count; $i++) {
				$lang = stripslashes($matches[1][$i]);
				$article = stripslashes($matches[2][$i]);
				$ret_array[$lang] = $article;
			}
		}
	}
	return $ret_array;
}
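
// Example: Wiwosm::hstoreToArray('"de"=>"Dresden", "en"=>"Dresden"')
// returns array('de' => 'Dresden', 'en' => 'Dresden').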

/**
 * Believe it or not - in the year 2012 string encoding still sucks!
 * @param string $str A string that is maybe not correct UTF-8 (one reason is bad urlencoding, for example)
 * @return string return a string that is definitely a valid UTF-8 string even if some characters were dropped
 **/
public static function fixUTF8($str) {
	$curenc = mb_detect_encoding($str);
	if ($curenc != 'UTF-8') {
		if ($curenc === false) {
			// if mb_detect_encoding failed we have to enforce clean UTF8 somehow
			return mb_convert_encoding(utf8_encode($str), 'UTF-8', 'UTF-8');
		}
		// if we can guess the encoding we can convert it
		return mb_convert_encoding($str,'UTF-8',$curenc);
	} elseif (!mb_check_encoding($str,'UTF-8')) {
		// if there are invalid bytes try to remove them
		return mb_convert_encoding($str, 'UTF-8', 'UTF-8');
	}
	// if it is already clean UTF-8 there should be no problems
	return $str;
}
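
// Example (assumed behaviour): urldecode('Caf%E9') yields a lone 0xE9 byte, which
// mb_detect_encoding() reports as non-UTF-8, so fixUTF8() converts it to a valid
// UTF-8 "é" instead of letting broken bytes reach the database or JSON output.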

function queryWikidataLanguagesByLangArticle($lang, $article) {
	// if no lang or article is given, we can stop here
	if (!$lang || !$article) return false;

	// if the lang is for example fiu-vro, the site is named fiu_vrowiki, so we have to replace - by _
	$lang = str_replace('-','_',$lang).'wiki';

	if (!$this->prep_wikidata_by_lang_article->bind_param('ss', $lang, $article)) {
		$this->logMessage('bind_param failed with lang="'.$lang.'" and article="'.$article.'": '.$this->prep_wikidata_by_lang_article->error."\n", 1);
		return false;
	}

	if (!$this->prep_wikidata_by_lang_article->execute()) {
		$this->logMessage('wikidata query failed with lang="'.$lang.'" and article="'.$article.'": '.$this->prep_wikidata_by_lang_article->error."\n", 1);
		return false;
	}

	if (!$this->prep_wikidata_by_lang_article->store_result()) {
		$this->logMessage('wikidata query store result failed with lang="'.$lang.'" and article="'.$article."\"\n", 1);
		return false;
	}

	if ($this->prep_wikidata_by_lang_article->num_rows == 0) return false;

	if (!$this->prep_wikidata_by_lang_article->bind_result($wd_id, $ll_lang, $ll_title)) {
		$this->logMessage('bind_result failed with lastarticle='.$this->lastarticle.': '.$this->prep_wikidata_by_lang_article->error."\n", 1);
		return false;
	}

	$langarray = array();
	while ($this->prep_wikidata_by_lang_article->fetch()) {
		$langarray[str_replace('wiki', '', $ll_lang)] = $ll_title;
	}
	return array($wd_id, $langarray);
}
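
// Illustrative result (Dresden is Wikidata item Q1731):
//   queryWikidataLanguagesByLangArticle('de','Dresden')
// returns something like array(1731, array('de' => 'Dresden', 'en' => 'Dresden', ...)).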

function queryWikidataLanguagesByWikidataref($wikidata_ref) {
	// if no $wikidata_ref is given, we can stop here
	if (!$wikidata_ref) return false;

	if (!$this->prep_wikidata_by_wikidata_ref->bind_param('s', $wikidata_ref)) {
		$this->logMessage('bind_param failed with wikidata_ref="'.$wikidata_ref.'": '.$this->prep_wikidata_by_wikidata_ref->error."\n", 1);
		return false;
	}

	if (!$this->prep_wikidata_by_wikidata_ref->execute()) {
		$this->logMessage('wikidata query failed with wikidata_ref="'.$wikidata_ref.'": '.$this->prep_wikidata_by_wikidata_ref->error."\n", 1);
		return false;
	}

	if (!$this->prep_wikidata_by_wikidata_ref->store_result()) {
		$this->logMessage('wikidata query store result failed with wikidata_ref="'.$wikidata_ref."\"\n", 1);
		return false;
	}

	if ($this->prep_wikidata_by_wikidata_ref->num_rows == 0) return false;

	if (!$this->prep_wikidata_by_wikidata_ref->bind_result($wd_id, $ll_lang, $ll_title)) {
		$this->logMessage('bind_result failed with lastarticle='.$this->lastarticle.': '.$this->prep_wikidata_by_wikidata_ref->error."\n", 1);
		return false;
	}

	$langarray = array();
	while ($this->prep_wikidata_by_wikidata_ref->fetch()) {
		$langarray[str_replace('wiki', '', $ll_lang)] = $ll_title;
	}
	return array($wd_id, $langarray);
}

function escape($str) {
	return str_replace(array('\\','"'),array('\\\\','\\"'),$str);
}
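// escape() doubles backslashes and escapes double quotes so a value can be embedded
// in a quoted hstore literal, e.g. the raw text  say "hi"  becomes  say \"hi\" .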

function insert_wiwosm_wikidata_languages($res) {
	if ($res) {
		$pgconn = $this->getPgConn();
		$wikidata_id = $res[0];
		foreach ($res[1] as $l => $a) {
			pg_execute($pgconn,'insert_wiwosm_wikidata_languages',array($wikidata_id, str_replace('_','-',$l), $a));
		}
		pg_execute($pgconn,'insert_wiwosm_wikidata_languages',array($wikidata_id,'wikidata','Q'.$wikidata_id));
	}
}

function map_wikidata_languages() {
	$mysqlconn = $this->getMysqlConn();
	$this->prep_wikidata_by_wikidata_ref = $mysqlconn->prepare('SELECT `ips_item_id`,`ips_site_id`,`ips_site_page` FROM `wb_items_per_site` WHERE `ips_item_id` = ? ');

	$pgconn = $this->getPgConn();

	// delete cached wikidata_ids before refetching them
	$result = pg_prepare($pgconn,'delete_wikidata_refs','DELETE FROM wiwosm_wikidata_languages WHERE wikidata_id = ANY ($1)');
	if ($result === false) exit();

	$result = pg_prepare($pgconn,'insert_wiwosm_wikidata_languages','INSERT INTO wiwosm_wikidata_languages (wikidata_id,lang,article) VALUES ($1,$2,$3)');
	if ($result === false) exit();

	// every wikidata_ref that is not present in wiwosm_wikidata_languages should get fetched from wikidata
	$sql = 'SELECT DISTINCT wikidata_ref FROM wiwosm WHERE wikidata_ref > 0 AND NOT EXISTS (SELECT 1 FROM wiwosm_wikidata_languages WHERE wiwosm_wikidata_languages.wikidata_id = wiwosm.wikidata_ref LIMIT 1)';

	if (!pg_query($pgconn,'BEGIN WORK') || !pg_query($pgconn,'DECLARE wikidatarefcur NO SCROLL CURSOR FOR '.$sql)) {
		$this->logMessage('Could not declare cursor wikidatarefcur'. "\n" . pg_last_error() . "\n", 1);
		exit();
	}

	$count = 0;

	// fetch from wikidatarefcur in steps of 1000 elements
	$result = pg_prepare($pgconn,'fetch_wikidatarefcur','FETCH 1000 FROM wikidatarefcur');
	if ($result === false) exit();

	$result = pg_execute($pgconn,'fetch_wikidatarefcur',array());

	$fetchcount = pg_num_rows($result);

	$this->logMessage('Get the first '.$fetchcount.' wikidatarefs: '.((microtime(true)-$this->start)/60)." min\n", 2);

	// we use a cursor loop just to be sure that memory consumption does not explode:
	while ($fetchcount > 0) {
		$wikidata_refs = pg_fetch_all_columns($result);

		pg_execute($pgconn,'delete_wikidata_refs',array('{'.implode(',',$wikidata_refs).'}'));
		foreach ($wikidata_refs as $wikidata_ref) {
			$this->insert_wiwosm_wikidata_languages($this->queryWikidataLanguagesByWikidataref($wikidata_ref));
		}
		$count += $fetchcount;
		$this->logMessage($count.' wikidatarefs processed: '.((microtime(true)-$this->start)/60)." min\n", 2);
		$result = pg_execute($pgconn,'fetch_wikidatarefcur',array());
		$fetchcount = pg_num_rows($result);
	}

	pg_query($pgconn,'CLOSE wikidatarefcur');
	pg_query($pgconn,'COMMIT WORK');

	// try to connect the obvious rows directly
	$query = "UPDATE wiwosm SET wikidata_ref=wikidata_id FROM wiwosm_wikidata_languages WHERE wikidata_ref = 0 AND wiwosm.lang=wiwosm_wikidata_languages.lang AND wiwosm.article=wiwosm_wikidata_languages.article";
	$result = pg_query($pgconn,$query);

	// try to fetch wikidata_ref by language and article
	$this->prep_wikidata_by_lang_article = $mysqlconn->prepare('SELECT `ips_item_id`,`ips_site_id`,`ips_site_page` FROM `wb_items_per_site` WHERE `ips_item_id` = (SELECT `ips_item_id` FROM `wb_items_per_site` WHERE `ips_site_id` = ? AND `ips_site_page` = ? LIMIT 1)');

	// every row in wiwosm with wikidata_ref=0 either has no entries in wiwosm_wikidata_languages yet, or is an error and should get -1
	$query = "SELECT lang,article FROM wiwosm WHERE wikidata_ref=0 ORDER BY lang,article";
	// let's update every row and use a cursor for that
	if (!pg_query($pgconn,'BEGIN WORK') || !pg_query($pgconn,'DECLARE updatelangcur NO SCROLL CURSOR FOR '.$query.' FOR UPDATE OF wiwosm')) {
		$this->logMessage('Could not declare cursor for updating language refs'. "\n" . pg_last_error() . "\n", 1);
		exit();
	}

	$langbefore = '';
	$articlebefore = '';
	$wikidata_id = '-1';

	$count = 0;

	// prepare some sql queries that are used very often:

	// this is to search for an entry in the wiwosm_wikidata_languages table by given article and language
	$result = pg_prepare($pgconn,'get_wikidata_id','SELECT wikidata_id FROM wiwosm_wikidata_languages WHERE lang=$1 AND article=$2');
	if ($result === false) exit();

	// update the wikidata_ref column in the wiwosm table using the current row of the updatelangcur cursor
	$result = pg_prepare($pgconn,'update_wiwosm_wikidata_ref','UPDATE wiwosm SET wikidata_ref=$1 WHERE CURRENT OF updatelangcur');
	if ($result === false) exit();

	$result = pg_prepare($pgconn,'fetch_next_updatelangcur','FETCH NEXT FROM updatelangcur');
	if ($result === false) exit();
	$result = pg_execute($pgconn,'fetch_next_updatelangcur',array());
	$fetchcount = pg_num_rows($result);

	while ($fetchcount == 1) {
		$row = pg_fetch_assoc($result);
		$article = str_replace('_',' ',stripcslashes(self::fixUTF8(urldecode($row['article']))));
		$lang = $row['lang'];
		if ($langbefore !== $lang || $articlebefore !== $article) {
			if ($langbefore !== $lang) {
				$this->logMessage('Lastlang was: '.$langbefore."\n".'Handled '.$count.' rows '.((microtime(true)-$this->start)/60)." min\n", 2);
			}
			$langbefore = $lang;
			$articlebefore = $article;
			$wikidata_id = '-1';
			$result = pg_execute($pgconn,'get_wikidata_id',array($lang, $article));
			if ($result && pg_num_rows($result) == 1) {
				// if we found an entry in our wiwosm_wikidata table we use that id to link
				$wikidata_id = pg_fetch_result($result,0,0);
			} else {
				// if there was no such entry we have to query the wikidata mysql db
				$this->insert_wiwosm_wikidata_languages($this->queryWikidataLanguagesByLangArticle($lang,$article));
			}
		}
		$result = pg_execute($pgconn,'update_wiwosm_wikidata_ref',array($wikidata_id));
		if ($result === false) exit();
		$count += $fetchcount;
		$result = pg_execute($pgconn,'fetch_next_updatelangcur',array());
		$fetchcount = pg_num_rows($result);
	}
	pg_query($pgconn,'CLOSE updatelangcur');
	pg_query($pgconn,'COMMIT WORK');

	if ($this->prep_wikidata_by_lang_article) $this->prep_wikidata_by_lang_article->close();
	if ($this->prep_wikidata_by_wikidata_ref) $this->prep_wikidata_by_wikidata_ref->close();
}

function createWikidataLangTable() {
	$query = <<<EOQ
…
EOQ;
	pg_query($this->getPgConn(),$query);
}
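
// The heredoc body above is not shown. Judging from the INSERT and SELECT statements
// in map_wikidata_languages(), a minimal sketch (column types are assumptions) would be:
//   DROP TABLE IF EXISTS wiwosm_wikidata_languages;
//   CREATE TABLE wiwosm_wikidata_languages (wikidata_id integer, lang text, article text);
//   CREATE INDEX wiwosm_wikidata_languages_idx ON wiwosm_wikidata_languages (lang, article);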


function createlinks($lang, $article, $geojson, $lang_hstore = '', $forceIWLLupdate = false) {
	// for every osm object with a valid wikipedia-tag print the geojson to file
	$filepath = $this->getFilePath($lang,$article);

	// We need no update of the interwiki language links if no other languages are given in the hstore,
	// or if the file already exists and no force parameter tells us to overwrite the existing links.
	// In other words: if the file does not exist yet (and an hstore is given) it is new, so we create the links.
	$neednoIWLLupdate = ($lang_hstore == '') || (file_exists($filepath) && !$forceIWLLupdate);

	$handle = gzopen($filepath,'w');
	gzwrite($handle,$geojson);
	gzclose($handle);

	// check if we need an update of the interwiki language links
	if ($neednoIWLLupdate) return true;

	// get the relative filepath
	// $filepath = $this->getFilePath($lang,$article,true);

	$langarray = self::hstoreToArray($lang_hstore);
	// for every interwiki link create a hard link to the real file written above
	foreach ($langarray as $l => $a) {
		if ($l != $lang) {
			$linkpath = $this->getFilePath($l,$a);
			@unlink($linkpath);
			//symlink('../../'.$filepath,$linkpath);
			link($filepath,$linkpath);
			unset($linkpath);
		}
	}
	// free the memory
	unset($filepath,$handle,$geojson,$lang_hstore,$langarray);
	return true;
}


function updateOneObject($lang,$article) {
	$pgconn = $this->getPgConn();
	$articlefilter = '( tags @> $1::hstore ) OR ( tags @> $2::hstore ) OR ( tags @> $3::hstore ) OR ( tags @> $4::hstore ) OR ( tags @> $5::hstore ) OR ( tags @> $6::hstore )';
	$sql = 'SELECT '.self::simplifyGeoJSON.' FROM (
		( SELECT way FROM planet_osm_polygon WHERE '.$articlefilter.' )
		UNION ( SELECT way FROM planet_osm_line WHERE ( '.$articlefilter.' ) AND NOT EXISTS (SELECT 1 FROM planet_osm_polygon WHERE planet_osm_polygon.osm_id = planet_osm_line.osm_id) )
		UNION ( SELECT way FROM planet_osm_point WHERE '.$articlefilter.' )
	) AS wikistaff
	';
	pg_prepare($pgconn,'select_wikipedia_object',$sql);

	$a = $this->escape(str_replace('_',' ',$article));
	$aurl = urlencode(str_replace(' ','_',$a));
	$l = $this->escape(str_replace('_','-',$lang));
	$lurl = str_replace('-','_',$l);
	$params = array('"wikipedia:'.$l.'"=>"'.$a.'"',
		'"wikipedia"=>"'.$l.':'.$a.'"',
		'"wikipedia"=>"http://'.$lurl.'.wikipedia.org/wiki/'.$aurl.'"',
		'"wikipedia"=>"https://'.$lurl.'.wikipedia.org/wiki/'.$aurl.'"',
		'"wikipedia:'.$l.'"=>"http://'.$lurl.'.wikipedia.org/wiki/'.$aurl.'"',
		'"wikipedia:'.$l.'"=>"https://'.$lurl.'.wikipedia.org/wiki/'.$aurl.'"');

	$result = pg_execute($pgconn,'select_wikipedia_object',$params);
	if($e = pg_last_error()) trigger_error($e, E_USER_ERROR);

	if ($result && pg_num_rows($result) == 1 ) {
		$row = pg_fetch_assoc($result);
		$this->createlinks($lang, $article, $row['geojson']);
	}
}

function processOsmItems() {
	$pgconn = $this->getPgConn();
	// to avoid problems with geometrycollections first dump all geometries and collect them again
	$sql = 'SELECT wikidata_ref, languages, geojson FROM
	( SELECT wikidata_ref, '.self::simplifyGeoJSON.' FROM (
		SELECT wikidata_ref,(ST_Dump(way)).geom AS way FROM wiwosm WHERE wikidata_ref > 0
	) AS geomdump GROUP BY wikidata_ref) AS wiwosm_geom, (
		SELECT wikidata_id, hstore(array_agg(wiwosm_wikidata_languages.lang), array_agg(wiwosm_wikidata_languages.article)) AS languages FROM wiwosm_wikidata_languages GROUP BY wikidata_id
	) AS wikidata_languages
	WHERE wikidata_ref=wikidata_id';

	// this consumes just too much memory:
	/*
	$result = pg_query($conn, $sql);
	if (!$result) {
		$this->logMessage("Fail to fetch results from postgis \n", 1);
		exit;
	}
	*/

	// so we have to use a cursor because it is too much data:
	if (!pg_query($pgconn,'BEGIN WORK') || !pg_query($pgconn,'DECLARE osmcur NO SCROLL CURSOR FOR '.$sql)) {
		$this->logMessage('Could not declare cursor'. "\n" . pg_last_error() . "\n", 1);
		exit();
	}


	$count = 0;

	// fetch from osmcur in steps of 1000 elements
	$result = pg_prepare($pgconn,'fetch_osmcur','FETCH 1000 FROM osmcur');
	if ($result === false) exit();

	$result = pg_execute($pgconn,'fetch_osmcur',array());

	$fetchcount = pg_num_rows($result);

	$this->logMessage('Get the first '.$fetchcount.' rows: '.((microtime(true)-$this->start)/60)." min\n", 2);

	//damn cursor loop:
	while ($fetchcount > 0) {
		while ($row = pg_fetch_assoc($result)) {

			$this->createlinks('wikidata', 'Q'.$row['wikidata_ref'], $row['geojson'], $row['languages']);
			// free the memory
			unset($row);
		}
		$count += $fetchcount;
		$this->logMessage($count.' results processed: '.((microtime(true)-$this->start)/60)." min\n", 2);
		$result = pg_execute($pgconn,'fetch_osmcur',array());
		$fetchcount = pg_num_rows($result);
	}

	pg_query($pgconn,'CLOSE osmcur');
	pg_query($pgconn,'COMMIT WORK');
}

}
--------------------------------------------------------------------------------
/server/cleanup.sh:
--------------------------------------------------------------------------------
#! /bin/sh
#$ -N wiwosmcleanup
#$ -l h_rt=2:00:00
#$ -l virtual_free=5M
#$ -l arch=*
#$ -m a
#$ -o $HOME/log/cleanup.out
#$ -e $HOME/log/cleanup.err

/usr/bin/rm -rf /mnt/user-store/wiwosm/geojsongz_old_remove
--------------------------------------------------------------------------------
/server/gen_json_files.php:
--------------------------------------------------------------------------------
<?php
// … (the first lines of this script are not shown: they presumably load the Wiwosm
// class and open the block that the closing brace at the bottom ends)
$fullupdate = ($argc > 1) && ($argv[1] == 'full');
$linkupdate = ($argc > 1) && ($argv[1] == 'link');

echo date(DATE_RFC822)."\n";

$wiwosm = new Wiwosm(2);

$defaultpath = $wiwosm->json_path;

if ($fullupdate) {
	$wiwosm->json_path = dirname($defaultpath).'/'.date('Y-m-d_H-i').'_geojsongz';
	echo 'doing full update'."\n";
	$wiwosm->createWikidataLangTable();
}

if (!$linkupdate) {
	$wiwosm->updateWiwosmDB();
	$wiwosm->logUnknownJSON();
} else {
	$wiwosm->json_path = dirname($defaultpath).'/'.date('Y-m-d_H-i').'_geojsongz';
	echo 'skip DB Update - doing linkupdate only'."\n";
}
$wiwosm->processOsmItems();
if ($fullupdate || $linkupdate) $wiwosm->testAndRename();
}
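
// Usage, inferred from the $argv checks at the top:
//   php gen_json_files.php        normal run (DB update + file generation)
//   php gen_json_files.php full   full update into a fresh dated directory
//   php gen_json_files.php link   skip the DB update, only regenerate files and links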
--------------------------------------------------------------------------------
/server/public_html/osmjson/getGeoJSON.php:
--------------------------------------------------------------------------------
<?php
// … (the first lines of this script are not shown: they presumably create $wiwosm
// and read $lang, $article and the action from the request; the fragment below is
// the tail of a loop that answers a multi-article check, one line per article)
	$file = $wiwosm->getFilePath($lang, $article, false);
	print "$article\t" . (file_exists($file) ? 1 : 0) . "\n";
	}
	exit();
}

if ($_GET['action']=='purge' && $article && $lang) {
	if ($lang == 'wikidata') {
		echo 'update of wikidata objects in WIWOSM is not possible at the moment. Sorry!';
		exit();
	}
	echo 'Sorry, wiwosm is under development, so the purge feature is disabled for now!';
	exit();
	$wiwosm->updateOneObject($lang,$article);
}

$file = $wiwosm->getFilePath($lang,$article,false);
if (file_exists($file)) {
	if ($_GET['action']=='check') {
		echo 1;
	} else {
		header('Content-Encoding: gzip');
		readfile($file);
	}
} else {
	if ($_GET['action']=='check') {
		echo 0;
	} else {
		header("HTTP/1.0 404 Not Found");
	}
}
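
// Typical requests (the lang/article parameter names are inferred, action is used above):
//   getGeoJSON.php?lang=de&article=Dresden               serves the gzipped GeoJSON file
//   getGeoJSON.php?lang=de&article=Dresden&action=check  prints 1 if the file exists, else 0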
--------------------------------------------------------------------------------
/server/public_html/wiwosmlog/broken.html:
--------------------------------------------------------------------------------
WIWOSM broken languages

Unknown wikipedia tags found while WIWOSM-processing

18.01.2013: Now I try to guess the country, with its iso2 country code, by geographic match against the World Borders Dataset. For performance reasons I use the simple boundaries, so there can be mistakes near borders! This matching is just a hint; you should check it manually in every case!

23.01.2013: You can use Quickfilters now, yeah!

29.01.2013: Quickfilters are case insensitive now.

03.03.2014: Now we use wikidata to get all associated languages! You see all articles here that were not found in wikidata.

Important: Most of the wikipedia=article tags point to the English Wikipedia. Please check it first to be sure you link to the correct article. It is not enough to just add a guessed language! Please verify that the article is really present in the specified wikipedia!