├── incl ├── json.php ├── blogpost.php ├── bootstrap.php ├── xml.php ├── rdf.php ├── blogfeed.php ├── utils.php └── basefeed.php ├── README.md ├── rules ├── presse.php ├── saarland.php ├── berlin.php ├── sachsen.php ├── brandenburg.php ├── zoll.php └── polizei.php ├── tasks.todo ├── parser.php └── test_rdf.php /incl/json.php: -------------------------------------------------------------------------------- 1 | feed_url))) { 8 | return array(); 9 | } 10 | return $x->channel->item; 11 | } 12 | 13 | function parse(&$post, $item) 14 | { 15 | $post->date = (string) $item->pubDate; 16 | $post->link = (string) $item->link; 17 | $post->title = (string) $item->title; 18 | $post->category = $this->category; 19 | $this->set_category_details($post); 20 | return $post; 21 | } 22 | } 23 | 24 | -------------------------------------------------------------------------------- /rules/presse.php: -------------------------------------------------------------------------------- 1 | query($this->text_cnt); 15 | $textbody = $elements->item(0)->parentNode; 16 | return $this->_get_inner_html($textbody); 17 | } 18 | 19 | function ignore_content($node) 20 | { 21 | if ($node->nodeName == 'img') { 22 | return true; 23 | } 24 | return false; 25 | } 26 | 27 | 28 | function get_missing_title($xpath, &$post) 29 | { 30 | 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /rules/saarland.php: -------------------------------------------------------------------------------- 1 | is_first = true; 15 | $elements = $xpath->query($this->text_cnt); 16 | return $this->_get_inner_html($elements->item(0)->parentNode); 17 | } 18 | 19 | function ignore_content($child) 20 | { 21 | if ($child->nodeName == 'h1' && $this->is_first) { 22 | $this->is_first = false; 23 | return true; 24 | } 25 | 26 | if ($child->nodeName == 'div' && 27 | !($divbtn_id = $this->is_elm_with_attr($child, 'id')) === false) { 28 | if ($divbtn_id == 'readspeaker_button1') { 29 | return true; 30 | } 31 | } 32 | return false; 33 | } 34 | } 35 | 36 | -------------------------------------------------------------------------------- /rules/berlin.php: -------------------------------------------------------------------------------- 1 | parentNode->getAttribute('href'); 16 | } 17 | 18 | function get_content($xpath) 19 | { 20 | $elements = $xpath->query($this->text_cnt); 21 | return $this->_get_inner_html($elements->item(0)->parentNode); 22 | } 23 | 24 | function ignore_content($node) 25 | { 26 | return false; 27 | } 28 | 29 | /** 30 | * Clean up and add formatting as you like 31 | * 32 | * @return string 33 | * @author Tarin Mahmood 34 | */ 35 | function text_formatting($text) 36 | { 37 | /// NO CHANGES BEFORE THIS 38 | // replacing extra line breaks 39 | $text = str_replace("\n", ' ', $text); 40 | 41 | /// NO CHANGES AFTER THIS 42 | return $text; 43 | } 44 | 45 | 46 | function get_missing_text($xpath, &$post) 47 | { 48 | print_r ($post); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /incl/rdf.php: -------------------------------------------------------------------------------- 1 | feed_url)) { 10 | return $this->get_items_from_feed($this->feed_url); 11 | } 12 | $feeds = array(); 13 | foreach ($this->feed_url as $url){ 14 | $feeds = array_merge($feeds, 15 | $this->get_items_from_feed($url)); 16 | } 17 | return $feeds; 18 | } 19 | 20 | function get_items_from_feed($feed_url) 21 | { 22 | $rdf = file_get_contents($feed_url); 23 | $dom = new DomDocument; 24 | $dom->loadXml($rdf); 25 | $xph = new DOMXPath($dom); 26 | $xph->registerNamespace('rdf', RDF_SPEC_URL); 27 | $items = $xph->query('//@rdf:about'); 28 | $nodes = array(); 29 | foreach($items as $node) { 30 | $nodes[] = $node; 31 | } 32 | array_shift($nodes); 33 | return $nodes; 34 | } 35 | 36 | function parse(&$post, $item) 37 | { 38 | $post->link = (string) $item->value; 39 | $post->date = null; 40 | $post->title = null; 41 | if (isset($this->category) && !is_array($this->category)) { 42 | $post->category = $this->category; 43 | } else { 44 | $post->category = null; 45 | } 46 | $this->set_category_details($post); 47 | return $post; 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /tasks.todo: -------------------------------------------------------------------------------- 1 | # vim:ft=todo 2 | 3 | DONE 2016-06-15 Image parsing @Berlin 4 | CLOSED: 2016-06-15 15:04:24 5 | :LOGBOOK: 6 | DONE: 2016-06-15 15:04:24 7 | DONE 2016-06-15 Fix new lines in texts @Berlin 8 | CLOSED: 2016-06-15 15:04:27 9 | :LOGBOOK: 10 | DONE: 2016-06-15 15:04:27 11 | DONE 2016-06-15 Image parsing @Polizei 12 | CLOSED: 2016-06-15 15:04:30 13 | :LOGBOOK: 14 | DONE: 2016-06-15 15:04:30 15 | DONE 2016-06-15 Image parsing @Prese 16 | CLOSED: 2016-06-15 15:08:50 17 | :LOGBOOK: 18 | DONE: 2016-06-15 15:08:50 19 | DONE 2016-06-15 Correct line-breaking @Prese 20 | CLOSED: 2016-06-15 15:08:47 21 | :LOGBOOK: 22 | DONE: 2016-06-15 15:08:47 23 | DONE: 2016-06-15 15:04:36 24 | DONE 2016-06-15 Image parsing @Saarland 25 | CLOSED: 2016-06-15 15:08:53 26 | :LOGBOOK: 27 | DONE: 2016-06-15 15:08:53 28 | DONE 2016-06-15 Image parsing @Zoll 29 | CLOSED: 2016-06-15 15:07:49 30 | :LOGBOOK: 31 | DONE: 2016-06-15 15:07:49 32 | DONE 2016-06-15 Image parsing @Sachsen 33 | CLOSED: 2016-06-15 15:08:22 34 | :LOGBOOK: 35 | DONE: 2016-06-15 15:08:22 36 | DONE 2016-06-15 Image parsing @Brandenburg 37 | CLOSED: 2016-06-15 15:15:42 38 | :LOGBOOK: 39 | DONE: 2016-06-15 15:15:42 40 | 41 | -------------------------------------------------------------------------------- /rules/sachsen.php: -------------------------------------------------------------------------------- 1 | 'pd-chemnitz', 17 | 'PD Leipzig' => 'pd-leipzig', 18 | 'PD Dresden' => 'pd-dresden', 19 | ); 20 | 21 | function get_content($xpath) 22 | { 23 | $content_div = $xpath->query($this->txt_selector)->item(0); 24 | if ($content_div == null) { 25 | return ''; 26 | } 27 | return $this->_get_inner_html($content_div); 28 | } 29 | 30 | function ignore_content($node) 31 | { 32 | if ($node->nodeName == 'strong') { 33 | return true; 34 | } 35 | return false; 36 | } 37 | 38 | 39 | function get_missing_category($xpath, &$post) 40 | { 41 | $title = trim($post->title); 42 | $city = explode('-', $title); 43 | $post->category = trim($city[0]); 44 | $post->title = $title; 45 | } 46 | 47 | function get_missing_text($xpath, &$post) 48 | { 49 | print_r ($post); 50 | } 51 | 52 | } 53 | 54 | -------------------------------------------------------------------------------- /incl/blogfeed.php: -------------------------------------------------------------------------------- 1 | feed = new $feed; 10 | } 11 | 12 | /** 13 | * Do parsing 14 | */ 15 | function parse_feed() 16 | { 17 | $items = $this->feed->get_items(); 18 | if (count($items) == 0) { 19 | return; 20 | } 21 | foreach ($items as $item) { 22 | $post = new BlogPost; 23 | $this->feed->parse($post, $item); 24 | if($this->parse_source_link($post) === false){ 25 | continue; 26 | } 27 | $post->text = $this->feed->text_formatting($post->text); 28 | $this->posts[] = $post; 29 | } 30 | } 31 | 32 | /** 33 | * Download webpage and apply rule to parse data 34 | */ 35 | function parse_source_link(&$post) 36 | { 37 | $xpath = $this->feed->get_page_obj($post); 38 | if (!is_object($xpath) || $xpath == null) { 39 | return false; 40 | } 41 | if ($post->picture != null) { 42 | $post->picture = $this->feed->get_image($xpath); 43 | } 44 | $post->text = (string) $this->feed->get_content($xpath); 45 | $this->fill_missing_data($xpath, $post); 46 | $this->feed->set_category_details($post); 47 | } 48 | 49 | /** 50 | * Fill up missing information, obj could be either JSON object 51 | * or XML object 52 | */ 53 | function fill_missing_data($xpath, &$post) 54 | { 55 | if ($post->title == null) { 56 | $this->feed->get_missing_title($xpath, $post); 57 | } 58 | if ($post->category == null) { 59 | $this->feed->get_missing_category($xpath, $post); 60 | } 61 | if ($post->text == null) { 62 | $this->feed->get_missing_text($xpath, $post); 63 | } 64 | } 65 | } 66 | 67 | -------------------------------------------------------------------------------- /rules/brandenburg.php: -------------------------------------------------------------------------------- 1 | "Prignitz", 9 | "2" => "Ostprignitz-Ruppin", 10 | "3" => "Oberhavel", 11 | "4" => "Uckermark", 12 | "5" => "Havelland", 13 | "6" => "Barnim", 14 | "7" => "Potsdam-Mittelmark", 15 | "8" => "Märkisch-Oderland", 16 | "9" => "Teltow-Fläming", 17 | "10" => "Dahme-Spreewald", 18 | "11" => "Oder-Spree", 19 | "12" => "Elbe-Elster", 20 | "13" => "Oberspreewald-Lausitz", 21 | "14" => "Spree-Neiße", 22 | "30" => "Berlin", 23 | "85" => "Potsdam (PDM)", 24 | "101" => "Oder-Spree/Frankfurt am Main (LOS/FFO)", 25 | "102" => "Brandenburg an der Havel (PM/BRB)", 26 | "103" => "Cottbus (CBS)", 27 | "500" => "Überregional", 28 | ); 29 | var $category_slug = array( 30 | '85'=> 'potsdam-pdm', 31 | '3'=> 'oberhavel'); 32 | var $sel = '//div[@class="pbb-article-text"]'; 33 | var $imgs_sel = array( '//div[@class="pbb-article-text"]/img', 34 | "id('pbb-slides')//img"); 35 | 36 | function parse(&$post, $item) 37 | { 38 | $post->category = $this->category[$item->district]; 39 | $post->title = $item->title; 40 | $post->link = $this->base_url . $item->url; 41 | if ($item->images != null) { 42 | $post->picture = array_map(function($img) { 43 | return $this->base_url . '/' . $img; }, $item->images); 44 | } 45 | } 46 | 47 | function get_content($xpath) 48 | { 49 | $textbody = $xpath->query($this->sel)->item(0); 50 | return $this->_get_inner_html($textbody); 51 | } 52 | 53 | function get_items() 54 | { 55 | $str = file_get_contents ($this->feed_url); 56 | $items = json_decode($str); 57 | return $items->data; 58 | } 59 | 60 | function ignore_content($node) 61 | { 62 | return false; 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /rules/zoll.php: -------------------------------------------------------------------------------- 1 | first_h1 = true; 16 | $txt = array(); 17 | $textbody = $xpath->query($this->text_cnt)->item(0)->parentNode; 18 | $this->remove_links($textbody); 19 | $txt = $this->_get_inner_html($textbody); 20 | if (trim($txt) == '') { 21 | $elm = $xpath->query('id("main")'); 22 | return trim($elm[0]->nodeValue); 23 | } 24 | return $txt; 25 | } 26 | 27 | function ignore_content($child) 28 | { 29 | if ($child->nodeName == 'h1' && $this->first_h1) { 30 | $this->first_h1 = false; 31 | return true; 32 | } 33 | if ($this->is_div_with_class($child)) { 34 | return true; 35 | } 36 | if ($this->is_elm_with_class($child, 'p')) { 37 | if($child->getAttribute('class') == 'navToTop') { 38 | return true; 39 | } 40 | } 41 | if ($child->nodeName == 'div' && $child->hasChildNodes()) { 42 | foreach($child->childNodes as $innerchild) { 43 | if ($this->is_p_with_a_picture($innerchild)) { 44 | return true; 45 | } 46 | } 47 | } 48 | if($this->is_directurl_span($child)) { 49 | return true; 50 | } 51 | return false; 52 | } 53 | 54 | private function is_p_with_a_picture($innerchild) 55 | { 56 | $s = $this->is_elm_with_class($innerchild, 'p'); 57 | if ($s) { 58 | $cls = $innerchild->getAttribute('class'); 59 | return $cls == 'picture rechts' || $cls == 'picture links'; 60 | } 61 | return false; 62 | } 63 | 64 | private function is_directurl_span($child) 65 | { 66 | if (!($child->nodeName =='span' && $child->hasAttribute('class'))) { 67 | return false; 68 | } 69 | $cls = $child->getAttribute('class'); 70 | if($cls == 'directURL') { 71 | return true; 72 | } 73 | } 74 | 75 | private function is_div_with_class($child) 76 | { 77 | if(!($child->nodeName =='div' && $child->hasAttribute('class'))) { 78 | return false; 79 | } 80 | if ($child->getAttribute('class') == 'gallery') { 81 | return true; 82 | } 83 | return false; 84 | } 85 | 86 | function get_image_custom($imgelm) 87 | { 88 | $fullurl = $imgelm->getAttribute('href'); 89 | if (!$this->is_full_url($fullurl)) { 90 | $fullurl = $this->base_url . $fullurl; 91 | } 92 | return $fullurl; 93 | } 94 | 95 | function get_missing_text($xpath, &$post) 96 | { 97 | return ''; 98 | } 99 | } 100 | 101 | 102 | -------------------------------------------------------------------------------- /parser.php: -------------------------------------------------------------------------------- 1 | 1) { 22 | $cls = $argv[1]; 23 | $parts = explode(',', $cls); 24 | if (count($parts) > 1) { 25 | foreach ($parts as $part){ 26 | $feed = new BlogFeed($part); 27 | $feed->parse_feed(); 28 | $posts = array_merge($feed->posts, $posts); 29 | } 30 | } else { 31 | $feed = new BlogFeed($cls); 32 | $feed->parse_feed(); 33 | $posts = $feed->posts; 34 | } 35 | } else { 36 | $ar = array('Berlin', 'Polizei', 'Presse', 'Saarland', 37 | 'Zoll', 'Sachsen', 'Brandenburg'); 38 | foreach ($ar as $cls){ 39 | $dt = new \DateTime; 40 | printf("[%s] - parsing: %s\n", $dt->format('d/m/Y - h:m'), $cls); 41 | $feed = new BlogFeed($cls); 42 | $feed->parse_feed(); 43 | $posts = array_merge($posts, $feed->posts); 44 | } 45 | } 46 | 47 | // unregister our autoload fuction so that 48 | // it does not conflict with Wordpresses autoloader 49 | $functions = spl_autoload_functions(); 50 | foreach($functions as $function) { 51 | spl_autoload_unregister($function); 52 | } 53 | 54 | // Load WordPress 55 | require_once WORDPRESS_PATH .'/wp-load.php'; 56 | require_once WORDPRESS_PATH .'/wp-admin/includes/post.php'; 57 | require_once WORDPRESS_PATH .'/wp-admin/includes/taxonomy.php'; 58 | require_once WORDPRESS_PATH .'/wp-admin/includes/file.php'; 59 | require_once WORDPRESS_PATH .'/wp-admin/includes/media.php'; 60 | 61 | 62 | $user_id = 9; 63 | $tmpl = '%s

Zum Originalartikel'; 64 | foreach ($posts as $post){ 65 | print_r ($post); 66 | $content = sprintf($tmpl, $post->text, $post->link); 67 | if (post_exists($post->title, $content) !== 0) { 68 | echo "x"; 69 | continue; 70 | } else { 71 | echo "."; 72 | } 73 | if(isset($post->category_slug)) { 74 | $category_id = get_category_by_slug($post->category_slug)->term_id; 75 | } elseif (isset($post->parent_category)) { 76 | $category_id = wp_create_category($post->category, $post->parent_category); 77 | } else { 78 | $category_id = wp_create_category($post->category); 79 | } 80 | $id = wp_insert_post(array( 81 | 'post_title' => $post->title, 82 | 'post_content' => $content, 83 | 'post_author' => $user_id, 84 | 'post_type' => 'post', 85 | 'post_status' => 'publish', 86 | 'tax_input' => array('polizei report'), 87 | )); 88 | if($id) { 89 | wp_set_post_terms($id, $category_id, 'category'); 90 | } else { 91 | echo "WARNING: Failed to insert post into WordPress\n"; 92 | continue; 93 | } 94 | $cnt_pictures = count($post->picture); 95 | if (!is_array($post->picture) || $cnt_pictures == 0) { 96 | continue; 97 | } 98 | $imgs = array(); 99 | foreach ($post->picture as $ky=>$pic){ 100 | $attachment_id = Utils::upload_to_wordpress($pic, $id); 101 | if ($attachment_id == null) { 102 | continue; 103 | } 104 | $wp_img = wp_get_attachment_link($attachment_id); 105 | if ($ky < $cnt_pictures - 1) { 106 | $wp_img = str_replace('class="', 'class="alignleft ', $wp_img); 107 | } 108 | $imgs[] = $wp_img; 109 | } 110 | if (count($imgs) == 0) { 111 | continue; 112 | } 113 | $imgs_txt = implode("", $imgs); 114 | $content = sprintf("%s

%s", $imgs_txt, $content); 115 | wp_update_post(array( 116 | 'ID' => $id, 117 | 'post_content' => $content, 118 | )); 119 | } 120 | -------------------------------------------------------------------------------- /test_rdf.php: -------------------------------------------------------------------------------- 1 | title = null; 22 | $post->link = $link; 23 | $post->date = null; 24 | $post->category = null; 25 | $xpath = $feed->get_page_obj($post); 26 | $blogfeed->parse_source_link($post); 27 | print_r($post); 28 | } 29 | 30 | function test_berlin() 31 | { 32 | $link = 'http://www.berlin.de/polizei/polizeimeldungen/pressemitteilung.504938.php'; 33 | $blogfeed = new BlogFeed('Berlin'); 34 | $feed = new Berlin; 35 | $post = new BlogPost; 36 | $post->title = 'sd'; 37 | $post->link = $link; 38 | $post->date = 'sd'; 39 | $post->category = "Berlin (Polizei)"; 40 | $xpath = $feed->get_page_obj($post); 41 | $blogfeed->parse_source_link($post); 42 | print ($post->text); 43 | } 44 | 45 | function test_zoll() 46 | { 47 | $link = 'http://www.zoll.de/SharedDocs/Pressemitteilungen/DE/Produktpiraterie/2016/z83_plagiate_h.html'; 48 | // $link = 'http://www.zoll.de/SharedDocs/Pressemitteilungen/DE/Sonstiges/2016/z12_horb_schroeder_stuttgart.html'; 49 | $blogfeed = new BlogFeed('Zoll'); 50 | $feed = new Zoll; 51 | $post = new BlogPost; 52 | $post->title = 'title'; 53 | $post->link = $link; 54 | $post->date = 'd'; 55 | $post->category = "Zoll Deutschland (Bundesweite Meldungen des Dienstes “Zoll im Fokus”)"; 56 | $xpath = $feed->get_page_obj($post); 57 | $blogfeed->parse_source_link($post); 58 | Utils::d($post->picture); 59 | } 60 | 61 | function test_polizei() 62 | { 63 | $link = 'http://www.polizei.bayern.de/oberfranken/fahndung/personen/tote/index.html/245358'; 64 | $blogfeed = new BlogFeed('Polizei'); 65 | $feed = new Polizei; 66 | $post = new BlogPost; 67 | $post->title = 'title'; 68 | $post->link = $link; 69 | $post->date = 'd'; 70 | $post->category = "Bayreuth (Polizeipräsidium Oberfranken)"; 71 | $xpath = $feed->get_page_obj($post); 72 | $blogfeed->parse_source_link($post); 73 | Utils::d($post->text); 74 | echo preg_replace('//', '', $post->text); 75 | } 76 | 77 | function test_brandenbur() 78 | { 79 | $link = 'https://polizei.brandenburg.de/fahndung/einbruch-in-arztpraxis-wer-erkennt-den-m/351879'; 80 | $blogfeed = new BlogFeed('Brandenburg'); 81 | $feed = new Brandenburg; 82 | $post = new BlogPost; 83 | $item = json_decode('{"district": "500", "timestamp": 1470828300, "category": "8", "url": "/fahndung/polizei-bittet-bevoelkerung-um-mithilfe/351871", "title": "Polizei bittet Bevölkerung um Mithilfe", "text": "\nAktuell sucht die Kriminalpolizei in Potsdam nach noch drei unbekannten Männern. Diese sind verdächtigt, in der Silvesternacht vom 31.12.2013 auf den 01.101.2014 einen Briefkasten eines Verwaltungsgebäudes mittels Einsatz von Pyrotechnik erheblich beschädigt zu haben.\n\n \n\n \n\n \n\nDie Polizei fragt: Wer kennt die Männer auf den abgebildeten Fotos und kann Hinweise zu deren Identität oder Aufenthaltsort machen? Ihre Hinweise richten Sie bitte unter der Telefonnummer: 0331 5508- 1224 an die Polizeiinspektion Potsdam oder jede andere Polizeidienstelle. Gerne können sie auch unser Hinweisformular im Internet nutzen. Dieses erreichen Sie unter: www.polizei.brandenburg.de\n\n \n\nTatzeit: 01.01.2014\n", "thumbnail": "/fm/24/thumbnails/PM%201874%20Potsdam%201.jpg.66864.jpg", "images": [ "/fm/24/thumbnails/PM%201874%20Potsdam%201.jpg.66863.jpg", "/fm/24/thumbnails/PM-%201874%20Potsdam.jpg.66871.jpg" ], "e_ort": "Potsdam, Nördliche Innenstadt, Friedrich-Ebert-Straße" }'); 84 | $feed->parse($post, $item); 85 | print_r ($post); 86 | } 87 | 88 | test_rdf(); 89 | 90 | -------------------------------------------------------------------------------- /incl/utils.php: -------------------------------------------------------------------------------- 1 | ', '
', '
'); 25 | $output = str_replace($replace, "\n", $output); 26 | } 27 | file_put_contents($filename, $output); 28 | } 29 | 30 | public static function download_content($url, $is_html=true) 31 | { 32 | $ch = curl_init(); 33 | curl_setopt($ch, CURLOPT_URL, $url); 34 | curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); 35 | curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); 36 | curl_setopt($ch, CURLOPT_RETURNTRANSFER, true ); 37 | $output = curl_exec($ch); 38 | $info = curl_getinfo($ch); 39 | curl_close($ch); 40 | if ($is_html) { 41 | $replace = array('
', '
', '
'); 42 | $output = str_replace($replace, "\n", $output); 43 | } 44 | if (DEBUG) { 45 | $filepath = self::get_cache_path($url); 46 | file_put_contents($filepath, $output); 47 | } 48 | return $output; 49 | } 50 | 51 | public static function get_cache_path($url) 52 | { 53 | $filename = md5($url); 54 | $uobj = parse_url($url); 55 | $fullpath = "cache/{$uobj['host']}"; 56 | if (!file_exists($fullpath)) { 57 | mkdir($fullpath, 0777, true); 58 | } 59 | return "$fullpath/$filename"; 60 | } 61 | 62 | public static function get_base_url($url) 63 | { 64 | $uobj = parse_url($url); 65 | $stpath = sprintf("%s%s/", IMG_STORE_PATH, $uobj['host']); 66 | $srpath = sprintf("%s%s/", IMG_SRC_PATH, $uobj['host']); 67 | return array($stpath, $srpath); 68 | } 69 | 70 | 71 | public static function d($elm, $vd=false) 72 | { 73 | echo '
';
 74 | 		if (is_object($elm)) {
 75 | 			var_dump($elm);
 76 | 		} elseif(is_array($elm)) {
 77 | 			print_r($elm);
 78 | 		} else {
 79 | 			print($elm);
 80 | 		}
 81 | 		echo '
'; 82 | } 83 | 84 | function upload_to_wordpress($file, $parent_post_id) 85 | { 86 | if (!function_exists('wp_upload_bits')) { 87 | print("Not available"); 88 | return false; 89 | } 90 | $filename = basename($file); 91 | $upload_file = wp_upload_bits($filename, null, file_get_contents($file)); 92 | if (!$upload_file['error']) { 93 | $wp_filetype = wp_check_filetype($filename, null ); 94 | $attachment = array( 95 | 'post_mime_type' => $wp_filetype['type'], 96 | 'post_parent' => $parent_post_id, 97 | 'post_title' => preg_replace('/\.[^.]+$/', '', $filename), 98 | 'post_content' => '', 99 | 'post_status' => 'inherit' 100 | ); 101 | $attachment_id = wp_insert_attachment($attachment, $upload_file['file'], 102 | $parent_post_id ); 103 | if (!is_wp_error($attachment_id)) { 104 | require_once(WORDPRESS_PATH . '/wp-admin/includes/image.php'); 105 | $attachment_data = wp_generate_attachment_metadata($attachment_id, 106 | $upload_file['file']); 107 | wp_update_attachment_metadata( $attachment_id, $attachment_data ); 108 | add_post_meta($parent_post_id, 'article_images', $attachment_id, true); 109 | } 110 | return $attachment_id; 111 | } 112 | } 113 | 114 | function upload_media_sideload($img, $post_id) 115 | { 116 | $tmp = download_url( $url ); 117 | if( is_wp_error( $tmp ) ){ 118 | // download failed, handle error 119 | } 120 | $desc = "The WordPress Logo"; 121 | $file_array = array(); 122 | 123 | // Set variables for storage 124 | // fix file filename for query strings 125 | preg_match('/[^\?]+\.(jpg|jpe|jpeg|gif|png)/i', $url, $matches); 126 | $file_array['name'] = basename($matches[0]); 127 | $file_array['tmp_name'] = $tmp; 128 | 129 | // If error storing temporarily, unlink 130 | if ( is_wp_error( $tmp ) ) { 131 | @unlink($file_array['tmp_name']); 132 | $file_array['tmp_name'] = ''; 133 | } 134 | 135 | // do the validation and storage stuff 136 | $id = media_handle_sideload( $file_array, $post_id, $desc ); 137 | 138 | // If error storing permanently, unlink 139 | if ( is_wp_error($id) ) { 140 | @unlink($file_array['tmp_name']); 141 | return $id; 142 | } 143 | } 144 | } 145 | 146 | 147 | -------------------------------------------------------------------------------- /rules/polizei.php: -------------------------------------------------------------------------------- 1 | "München (Bayerisches Landeskriminalamt)", 22 | "verwaltungsamt" => "München (Polizeiverwaltungsamt)", 23 | "bepo" => "München (Bayerische Bereitschaftspolizei)", 24 | "muenchen" => "München (Polizei)", 25 | "niederbayern" => "Straubing (Polizeipräsidium Niederbayern)", 26 | "oberbayern_nord" => "Ingolstadt (Polizeipräsidium Oberbayern Nord)", 27 | "oberbayern" => "Rosenheim (Polizeipräsidiums Oberbayern Süd)", 28 | "oberfranken" => "Bayreuth (Polizeipräsidium Oberfranken)", 29 | "oberpfalz" => "Regensburg (Polizeipräsidium Oberpfalz)", 30 | "schwaben" => "Augsburg (Polizeipräsidium Schwaben Nord)", 31 | "schwaben_sw" => "Kempten (Polizeipräsidium Schwaben Süd/West)", 32 | "unterfranken" => "Würzburg (Polizeipräsidiums Unterfranken)", 33 | ); 34 | var $category_slug = array( 35 | 'muenchen' => 'muenchen-polizei' 36 | ); 37 | var $text_cnt = ''; 38 | var $imgs_sel = '//div[@class="inhaltBilderZoom"]/a'; 39 | var $custom_image_src = true; 40 | 41 | var $replace_elms_child = array('imp:live-info'); 42 | var $replace_with_child = array('div'); 43 | 44 | function get_missing_text($xpath, $post) 45 | { 46 | $currentnode = $xpath->query('//h1')->item(0)->nextSibling; 47 | $parent = $currentnode->parentNode; 48 | $innerhtml = array(); 49 | while ($currentnode) { 50 | if($this->stop_adding($currentnode)) { 51 | break; 52 | } 53 | if ($this->should_ignore($currentnode)) { 54 | $currentnode = $currentnode->nextSibling; 55 | continue; 56 | } 57 | $innerhtml[] = $parent->ownerDocument->saveXML($currentnode); 58 | $currentnode = $currentnode->nextSibling; 59 | } 60 | $post->content = trim(implode("", $innerhtml)); 61 | } 62 | 63 | function get_content($xpath) 64 | { 65 | $text = array(); 66 | $textbody = $xpath->query('//h1')->item(0)->parentNode; 67 | $this->remove_links($textbody); 68 | $childNodes = $textbody->childNodes; 69 | $innerhtml = array(); 70 | $indx = 0; 71 | foreach ($childNodes as $indx => $child) { 72 | if ($child->nodeName == 'h1') { 73 | break; 74 | } 75 | } 76 | ++$indx; 77 | for (;$indx < $childNodes->length; ++$indx){ 78 | $child = $childNodes->item($indx); 79 | if($this->stop_adding($child)) { 80 | break; 81 | } 82 | if ($this->should_ignore($child)) { 83 | continue; 84 | } 85 | $innerhtml[] = $textbody->ownerDocument->saveXML($child); 86 | } 87 | $txt = implode("", $innerhtml); 88 | return preg_replace('//', '', $txt); 89 | } 90 | 91 | 92 | function should_ignore($child) 93 | { 94 | if ($child->nodeName == '#text' || 95 | $child->nodeName == '#script' || 96 | $child->nodeName == 'img'|| 97 | $child->nodeName == 'a'|| 98 | $child->nodeName == '#style') { 99 | return true; 100 | } 101 | if($child->nodeName == 'table' && trim($child->nodeValue) == '') { 102 | return true; 103 | } 104 | if (method_exists($child, 'getAttribute')) { 105 | $cls = $child->getAttribute('class'); 106 | if ($cls == 'inhaltBilderZoom') { 107 | return true; 108 | } 109 | } 110 | return false; 111 | } 112 | 113 | function stop_adding($child) 114 | { 115 | if(method_exists($child, 'getAttribute')) { 116 | $cls = $child->getAttribute('class'); 117 | if ($cls == 'inhaltFooter') { 118 | return true; 119 | } 120 | } 121 | return false; 122 | } 123 | 124 | function get_missing_title($xpath, &$post) 125 | { 126 | $post->title = $xpath->query('//h1')->item(0)->nodeValue; 127 | } 128 | 129 | function get_missing_category($xpath, &$post) 130 | { 131 | $segs = explode('/', $post->link); 132 | $keys = array_keys($this->category); 133 | foreach ($segs as $seg){ 134 | if (in_array($seg, $keys)) { 135 | $post->category = $this->category[$seg]; 136 | if (array_key_exists($seg, $this->category_slug)) { 137 | $post->category_slug = $this->category_slug[$seg]; 138 | } 139 | break; 140 | } 141 | } 142 | } 143 | 144 | function get_image_custom($elm) 145 | { 146 | $href = $elm->getAttribute('href'); 147 | preg_match("/'(.[^']*)'/", $href, $match); 148 | return $match[1]; 149 | } 150 | 151 | } 152 | -------------------------------------------------------------------------------- /incl/basefeed.php: -------------------------------------------------------------------------------- 1 | '); 5 | 6 | class Basefeed 7 | { 8 | var $img_root; 9 | 10 | /** 11 | * downloads content 12 | * @return string 13 | * @author Tarin Mahmood 14 | **/ 15 | protected function download_content($link) 16 | { 17 | $content = Utils::download_content($link); 18 | if(!$this->is_html_file($content, $link)) { 19 | return false; 20 | } 21 | // force html utf8 22 | $content = str_replace('', UTF8_TAG, $content); 23 | // check if needs to remove any nodes 24 | if (property_exists($this, 'replace_elms_child')) { 25 | $content = str_replace($this->replace_elms_child, 26 | $this->replace_with_child, 27 | $content); 28 | } 29 | return $content; 30 | } 31 | 32 | /** 33 | * get_xpath 34 | * @return xpath object 35 | * @author Tarin Mahmood 36 | **/ 37 | protected function get_xpath($content) 38 | { 39 | $doc = new DOMDocument(); 40 | @$doc->loadHTML($content); 41 | // remove scripts 42 | while (($r = $doc->getElementsByTagName("script")) && $r->length) { 43 | $r->item(0)->parentNode->removeChild($r->item(0)); 44 | } 45 | return new DOMXpath($doc); 46 | } 47 | 48 | /** 49 | * Downloads page, cleans up set 50 | * @Return DOMXPath object 51 | */ 52 | function get_page_obj($post) 53 | { 54 | // download page 55 | $content = $this->download_content($post->link); 56 | // return xpath 57 | return $this->get_xpath($content); 58 | } 59 | 60 | /** 61 | * is_html_file 62 | * @return true/false 63 | * @author Tarin Mahmood 64 | **/ 65 | public function is_html_file($content, $link) 66 | { 67 | $tmp_file_name = IMG_SRC_PATH . '/tmp/' . md5($link); 68 | file_put_contents($tmp_file_name , $content); 69 | $mtype = mime_content_type($tmp_file_name); 70 | unlink($tmp_file_name); 71 | return $mtype == 'text/html'; 72 | } 73 | 74 | /** 75 | * is_bad_url, check if the url is in list of bad URLS 76 | * @return true/false 77 | * @author Tarin Mahmood 78 | **/ 79 | function is_bad_url($imgurl) 80 | { 81 | return isset($this->bad_url) && 82 | in_array($imgurl, $this->bad_url); 83 | } 84 | 85 | function is_full_url($imgsrc) 86 | { 87 | return strpos($imgsrc, 'https://') === 0 || 88 | strpos($imgsrc, 'http://') === 0; 89 | } 90 | 91 | function call_custom_image_parsing() 92 | { 93 | return isset($this->custom_image_src) && 94 | $this->custom_image_src; 95 | } 96 | 97 | /** 98 | * Downloads and save images in page in dis 99 | * 100 | * @return list of images found 101 | * @author Tarin Mahmood 102 | */ 103 | function get_image($xpath) 104 | { 105 | $images = array(); 106 | $count_images = 0; 107 | if (is_array($this->imgs_sel)) { 108 | foreach ($this->imgs_sel as $sel){ 109 | $imglist = $xpath->query($sel); 110 | foreach ($imglist as $img){ 111 | $images[] = $img; 112 | } 113 | } 114 | $count_images = count($images); 115 | } else { 116 | $images = $xpath->query($this->imgs_sel); 117 | $count_images = $images->length; 118 | } 119 | if ($count_images == 0) { 120 | return ""; 121 | } 122 | return $this->download_store_images($images); 123 | } 124 | 125 | function download_store_images($images) 126 | { 127 | $imgs = array(); 128 | $added = array(); 129 | list($storepath, $imgsrcpath) = Utils::get_base_url($this->base_url); 130 | if (!file_exists($storepath)) { 131 | mkdir($storepath, 0777, true); 132 | } 133 | foreach ($images as $imgelm){ 134 | if ($this->call_custom_image_parsing()) { 135 | $imgsrc = $this->get_image_custom($imgelm); 136 | } else { 137 | $imgsrc = $imgelm->getAttribute('src'); 138 | } 139 | if (in_array($imgsrc, $added)) { 140 | continue; 141 | } 142 | $added[] = $imgsrc; 143 | $is_data = false; 144 | if (strpos($imgsrc, 'data:') === 0) { 145 | $is_data = true; 146 | } else { 147 | if ($this->is_bad_url($imgsrc)) { 148 | continue; 149 | } 150 | if ($this->is_full_url($imgsrc)) { 151 | $imgurl = $imgsrc; 152 | } else { 153 | $imgurl = $this->base_url . $imgsrc; 154 | } 155 | } 156 | $filename = md5($imgsrc); 157 | $filepath = $storepath . $filename; 158 | $filesrc = $imgsrcpath. $filename; 159 | if ($is_data) { 160 | $v = explode(',', $imgsrc); 161 | $imgdata = imagecreatefromstring(array_pop($v)); 162 | file_put_contents($filepath, $imgdata); 163 | } else { 164 | $imgcontent = file_get_contents($imgurl); 165 | if ($imgcontent != null) { 166 | file_put_contents($filepath, $imgcontent); 167 | } 168 | } 169 | $r = exif_imagetype($filepath); 170 | if ($r == IMAGETYPE_JPEG) { 171 | $newpath = "$filepath.jpg"; 172 | rename($filepath, $newpath); 173 | $filepath = $newpath; 174 | } elseif ($r == IMAGETYPE_PNG) { 175 | $newpath = "$filepath.png"; 176 | rename($filepath, $newpath); 177 | $filepath = $newpath; 178 | } 179 | $imgs[] = $filepath; 180 | } 181 | return $imgs; 182 | } 183 | 184 | protected function _get_inner_html($node) 185 | { 186 | $txt = array(); 187 | $childNodes = $node->childNodes; 188 | foreach ($childNodes as $ky=>$child){ 189 | if($this->ignore_content($child)) { 190 | continue; 191 | } 192 | if (trim($node->nodeValue) == '') { 193 | continue; 194 | } 195 | $txt[] = $node->ownerDocument->saveXML($child); 196 | } 197 | $txt = implode(" ", $txt); 198 | $txt = preg_replace("/[\r\n]+/", "\n", $txt); 199 | return "

$txt

"; 200 | } 201 | 202 | protected function is_elm_with_class($node, $nodename) 203 | { 204 | return $node->nodeName == $nodename && 205 | $node->hasAttribute('class'); 206 | } 207 | 208 | protected function is_elm_with_attr($elm, $attr) 209 | { 210 | if($elm->hasAttribute('id')) { 211 | return $elm->getAttribute('id'); 212 | } 213 | return false; 214 | } 215 | 216 | 217 | function remove_links($parentnode) 218 | { 219 | if ($parentnode == null) { 220 | return; 221 | } 222 | $tobe_removed = array(); 223 | foreach ($parentnode->getElementsByTagName('a') as $link){ 224 | $tobe_removed[] = $link; 225 | } 226 | foreach ($tobe_removed as $link){ 227 | $link->parentNode->removeChild($link); 228 | } 229 | } 230 | 231 | /** 232 | * set_category_details 233 | * @return void 234 | * @author Tarin Mahmood 235 | **/ 236 | public function set_category_details(&$post) 237 | { 238 | if (isset($this->parent_category)) { 239 | $post->parent_category = $this->parent_category; 240 | } 241 | if ($post->category_slug == null && $this->category_slug_set($post)) { 242 | $post->category_slug = $this->category_slug[$post->category]; 243 | } 244 | } 245 | 246 | function category_slug_set($post) 247 | { 248 | return isset($this->category_slug) 249 | && array_key_exists($post->category, $this->category_slug); 250 | } 251 | 252 | /** 253 | * text_formatting, place holder fuction 254 | * @return void 255 | * @author Tarin Mahmood 256 | **/ 257 | public function text_formatting($text) 258 | { 259 | return $text; 260 | } 261 | 262 | } 263 | 264 | --------------------------------------------------------------------------------