├── incl
├── json.php
├── blogpost.php
├── bootstrap.php
├── xml.php
├── rdf.php
├── blogfeed.php
├── utils.php
└── basefeed.php
├── README.md
├── rules
├── presse.php
├── saarland.php
├── berlin.php
├── sachsen.php
├── brandenburg.php
├── zoll.php
└── polizei.php
├── tasks.todo
├── parser.php
└── test_rdf.php
/incl/json.php:
--------------------------------------------------------------------------------
1 | feed_url))) {
8 | return array();
9 | }
10 | return $x->channel->item;
11 | }
12 |
/**
 * Copy the basic fields of one feed item onto a post.
 *
 * @param object $post post to populate
 * @param object $item feed item exposing pubDate, link and title
 * @return object the populated post
 */
function parse(&$post, $item)
{
    // post field => feed item field, filled in this order
    $field_map = array(
        'date'  => 'pubDate',
        'link'  => 'link',
        'title' => 'title',
    );
    foreach ($field_map as $post_field => $item_field) {
        $post->{$post_field} = (string) $item->{$item_field};
    }
    $post->category = $this->category;
    $this->set_category_details($post);
    return $post;
}
22 | }
23 |
24 |
--------------------------------------------------------------------------------
/rules/presse.php:
--------------------------------------------------------------------------------
1 | query($this->text_cnt);
15 | $textbody = $elements->item(0)->parentNode;
16 | return $this->_get_inner_html($textbody);
17 | }
18 |
/**
 * Decide whether a child node is left out of the extracted content.
 * Only nodes named "img" are dropped.
 *
 * @param DOMNode $node
 * @return bool
 */
function ignore_content($node)
{
    return $node->nodeName == 'img';
}
26 |
27 |
/**
 * Hook for filling in a missing title; this implementation does nothing.
 *
 * @param DOMXPath $xpath
 * @param object   $post
 * @return void
 */
function get_missing_title($xpath, &$post)
{
}
32 | }
33 |
--------------------------------------------------------------------------------
/rules/saarland.php:
--------------------------------------------------------------------------------
1 | is_first = true;
15 | $elements = $xpath->query($this->text_cnt);
16 | return $this->_get_inner_html($elements->item(0)->parentNode);
17 | }
18 |
/**
 * Decide whether a child node is left out of the extracted content.
 *
 * Skips the first <h1> seen while $this->is_first is set, and any <div>
 * whose id is "readspeaker_button1".
 *
 * @param DOMNode $child
 * @return bool
 */
function ignore_content($child)
{
    if ($this->is_first && $child->nodeName == 'h1') {
        // only the first heading is dropped; later ones are kept
        $this->is_first = false;
        return true;
    }

    if ($child->nodeName != 'div') {
        return false;
    }

    $divbtn_id = $this->is_elm_with_attr($child, 'id');
    return $divbtn_id && $divbtn_id == 'readspeaker_button1';
}
34 | }
35 |
36 |
--------------------------------------------------------------------------------
/rules/berlin.php:
--------------------------------------------------------------------------------
1 | parentNode->getAttribute('href');
16 | }
17 |
/**
 * Extract the article HTML from the parent of the first node matched by
 * $this->text_cnt.
 *
 * @param DOMXPath $xpath
 * @return string inner HTML, or '' when the selector matches nothing
 */
function get_content($xpath)
{
    $elements = $xpath->query($this->text_cnt);
    // query() yields false for a broken expression and an empty list for
    // no match; both used to end in a call on null.
    if (!$elements || $elements->length == 0) {
        return '';
    }
    return $this->_get_inner_html($elements->item(0)->parentNode);
}
23 |
/**
 * Decide whether a child node is left out of the extracted content.
 * This rule keeps every node.
 *
 * @param DOMNode $node
 * @return bool always false
 */
function ignore_content($node)
{
    return false;
}
28 |
/**
 * Post-process the extracted text: every line feed becomes a single space.
 *
 * @param string $text
 * @return string
 * @author Tarin Mahmood
 */
function text_formatting($text)
{
    return str_replace("\n", ' ', $text);
}
44 |
45 |
/**
 * Hook called when a post has no text; it only dumps the post to the
 * output and leaves it unchanged.
 *
 * @param DOMXPath $xpath
 * @param object   $post
 * @return void
 */
function get_missing_text($xpath, &$post)
{
    print_r($post);
}
50 | }
51 |
--------------------------------------------------------------------------------
/incl/rdf.php:
--------------------------------------------------------------------------------
1 | feed_url)) {
10 | return $this->get_items_from_feed($this->feed_url);
11 | }
12 | $feeds = array();
13 | foreach ($this->feed_url as $url){
14 | $feeds = array_merge($feeds,
15 | $this->get_items_from_feed($url));
16 | }
17 | return $feeds;
18 | }
19 |
/**
 * Load one RDF feed and collect its rdf:about attribute nodes.
 *
 * The first rdf:about found in document order is discarded; the rest are
 * returned as feed items.
 *
 * @param string $feed_url location of the RDF document
 * @return array list of attribute nodes; empty when the feed cannot be
 *               fetched or parsed
 */
function get_items_from_feed($feed_url)
{
    $rdf = file_get_contents($feed_url);
    // A failed or empty download must not reach loadXML().
    if ($rdf === false || trim($rdf) === '') {
        return array();
    }
    $dom = new DOMDocument;
    if (!$dom->loadXML($rdf)) {
        return array();
    }
    $xph = new DOMXPath($dom);
    $xph->registerNamespace('rdf', RDF_SPEC_URL);
    $items = $xph->query('//@rdf:about');
    if (!$items) {
        return array();
    }
    $nodes = array();
    foreach ($items as $node) {
        $nodes[] = $node;
    }
    // drop the first rdf:about
    array_shift($nodes);
    return $nodes;
}
35 |
/**
 * Initialise a post from one rdf:about attribute node.
 *
 * Only the link is known at this point; date and title are set to null.
 * The category is taken from $this->category when that is set and is not
 * an array, otherwise it is null.
 *
 * @param object $post post to populate
 * @param object $item node whose ->value holds the link
 * @return object the populated post
 */
function parse(&$post, $item)
{
    $has_single_category = isset($this->category) && !is_array($this->category);

    $post->link = (string) $item->value;
    $post->date = null;
    $post->title = null;
    $post->category = $has_single_category ? $this->category : null;
    $this->set_category_details($post);
    return $post;
}
49 | }
50 |
--------------------------------------------------------------------------------
/tasks.todo:
--------------------------------------------------------------------------------
1 | # vim:ft=todo
2 |
3 | DONE 2016-06-15 Image parsing @Berlin
4 | CLOSED: 2016-06-15 15:04:24
5 | :LOGBOOK:
6 | DONE: 2016-06-15 15:04:24
7 | DONE 2016-06-15 Fix new lines in texts @Berlin
8 | CLOSED: 2016-06-15 15:04:27
9 | :LOGBOOK:
10 | DONE: 2016-06-15 15:04:27
11 | DONE 2016-06-15 Image parsing @Polizei
12 | CLOSED: 2016-06-15 15:04:30
13 | :LOGBOOK:
14 | DONE: 2016-06-15 15:04:30
15 | DONE 2016-06-15 Image parsing @Presse
16 | CLOSED: 2016-06-15 15:08:50
17 | :LOGBOOK:
18 | DONE: 2016-06-15 15:08:50
19 | DONE 2016-06-15 Correct line-breaking @Presse
20 | CLOSED: 2016-06-15 15:08:47
21 | :LOGBOOK:
22 | DONE: 2016-06-15 15:08:47
23 | DONE: 2016-06-15 15:04:36
24 | DONE 2016-06-15 Image parsing @Saarland
25 | CLOSED: 2016-06-15 15:08:53
26 | :LOGBOOK:
27 | DONE: 2016-06-15 15:08:53
28 | DONE 2016-06-15 Image parsing @Zoll
29 | CLOSED: 2016-06-15 15:07:49
30 | :LOGBOOK:
31 | DONE: 2016-06-15 15:07:49
32 | DONE 2016-06-15 Image parsing @Sachsen
33 | CLOSED: 2016-06-15 15:08:22
34 | :LOGBOOK:
35 | DONE: 2016-06-15 15:08:22
36 | DONE 2016-06-15 Image parsing @Brandenburg
37 | CLOSED: 2016-06-15 15:15:42
38 | :LOGBOOK:
39 | DONE: 2016-06-15 15:15:42
40 |
41 |
--------------------------------------------------------------------------------
/rules/sachsen.php:
--------------------------------------------------------------------------------
1 | 'pd-chemnitz',
17 | 'PD Leipzig' => 'pd-leipzig',
18 | 'PD Dresden' => 'pd-dresden',
19 | );
20 |
/**
 * Extract the article HTML from the first node matched by
 * $this->txt_selector.
 *
 * @param DOMXPath $xpath
 * @return string inner HTML, or '' when nothing matches
 */
function get_content($xpath)
{
    $matches = $xpath->query($this->txt_selector);
    $content_div = $matches->item(0);
    return $content_div == null ? '' : $this->_get_inner_html($content_div);
}
29 |
/**
 * Decide whether a child node is left out of the extracted content.
 * Only nodes named "strong" are dropped.
 *
 * @param DOMNode $node
 * @return bool
 */
function ignore_content($node)
{
    return $node->nodeName == 'strong';
}
37 |
38 |
/**
 * Derive the category from the title: everything before the first "-",
 * trimmed. The title itself is stored back trimmed.
 *
 * @param DOMXPath $xpath unused
 * @param object   $post  post whose title is read and category is set
 * @return void
 */
function get_missing_category($xpath, &$post)
{
    $clean_title = trim($post->title);
    $segments = explode('-', $clean_title);
    $post->category = trim(reset($segments));
    $post->title = $clean_title;
}
46 |
/**
 * Hook called when a post has no text; it only dumps the post to the
 * output and leaves it unchanged.
 *
 * @param DOMXPath $xpath
 * @param object   $post
 * @return void
 */
function get_missing_text($xpath, &$post)
{
    print_r($post);
}
51 |
52 | }
53 |
54 |
--------------------------------------------------------------------------------
/incl/blogfeed.php:
--------------------------------------------------------------------------------
1 | feed = new $feed;
10 | }
11 |
/**
 * Fetch the feed items, build a BlogPost for each one and keep those
 * whose source page could be parsed in $this->posts.
 *
 * @return void
 */
function parse_feed()
{
    $items = $this->feed->get_items();
    if (count($items) != 0) {
        foreach ($items as $item) {
            $post = new BlogPost;
            $this->feed->parse($post, $item);
            // parse_source_link() signals failure with a strict false
            if ($this->parse_source_link($post) !== false) {
                $post->text = $this->feed->text_formatting($post->text);
                $this->posts[] = $post;
            }
        }
    }
}
31 |
/**
 * Download the page behind $post->link and let the feed rule fill in
 * pictures, text and any missing fields.
 *
 * @param object $post post to complete
 * @return false|null false when no page object could be obtained,
 *                    otherwise nothing
 */
function parse_source_link(&$post)
{
    $xpath = $this->feed->get_page_obj($post);
    // is_object() is false for null as well, so one check covers both
    if (!is_object($xpath)) {
        return false;
    }
    if ($post->picture != null) {
        $post->picture = $this->feed->get_image($xpath);
    }
    $content = $this->feed->get_content($xpath);
    $post->text = (string) $content;
    $this->fill_missing_data($xpath, $post);
    $this->feed->set_category_details($post);
}
48 |
/**
 * Ask the feed rule to supply title, category and text, in that order,
 * for every one of those fields that is still empty on the post.
 *
 * @param DOMXPath $xpath page the post was taken from
 * @param object   $post  post to complete
 * @return void
 */
function fill_missing_data($xpath, &$post)
{
    // post field => rule method that fills it
    $fillers = array(
        'title'    => 'get_missing_title',
        'category' => 'get_missing_category',
        'text'     => 'get_missing_text',
    );
    foreach ($fillers as $field => $method) {
        if ($post->{$field} == null) {
            $this->feed->{$method}($xpath, $post);
        }
    }
}
65 | }
66 |
67 |
--------------------------------------------------------------------------------
/rules/brandenburg.php:
--------------------------------------------------------------------------------
1 | "Prignitz",
9 | "2" => "Ostprignitz-Ruppin",
10 | "3" => "Oberhavel",
11 | "4" => "Uckermark",
12 | "5" => "Havelland",
13 | "6" => "Barnim",
14 | "7" => "Potsdam-Mittelmark",
15 | "8" => "Märkisch-Oderland",
16 | "9" => "Teltow-Fläming",
17 | "10" => "Dahme-Spreewald",
18 | "11" => "Oder-Spree",
19 | "12" => "Elbe-Elster",
20 | "13" => "Oberspreewald-Lausitz",
21 | "14" => "Spree-Neiße",
22 | "30" => "Berlin",
23 | "85" => "Potsdam (PDM)",
24 | "101" => "Oder-Spree/Frankfurt am Main (LOS/FFO)",
25 | "102" => "Brandenburg an der Havel (PM/BRB)",
26 | "103" => "Cottbus (CBS)",
27 | "500" => "Überregional",
28 | );
29 | var $category_slug = array(
30 | '85'=> 'potsdam-pdm',
31 | '3'=> 'oberhavel');
32 | var $sel = '//div[@class="pbb-article-text"]';
33 | var $imgs_sel = array( '//div[@class="pbb-article-text"]/img',
34 | "id('pbb-slides')//img");
35 |
/**
 * Fill a post from one decoded feed entry.
 *
 * The category is looked up by the entry's district, the link is
 * base_url plus the entry url, and each image path is prefixed with
 * base_url and a slash.
 *
 * @param object $post post to populate
 * @param object $item decoded feed entry (district, title, url, images)
 * @return void
 */
function parse(&$post, $item)
{
    $post->category = $this->category[$item->district];
    $post->title = $item->title;
    $post->link = $this->base_url . $item->url;
    if ($item->images == null) {
        return;
    }
    $pictures = array();
    foreach ($item->images as $ky => $img) {
        $pictures[$ky] = $this->base_url . '/' . $img;
    }
    $post->picture = $pictures;
}
46 |
/**
 * Extract the article HTML from the first node matched by $this->sel.
 *
 * @param DOMXPath $xpath
 * @return string inner HTML, or '' when the selector matches nothing
 */
function get_content($xpath)
{
    $matches = $xpath->query($this->sel);
    $textbody = $matches ? $matches->item(0) : null;
    // Without a match there is no node to serialise.
    if ($textbody == null) {
        return '';
    }
    return $this->_get_inner_html($textbody);
}
52 |
/**
 * Download the JSON feed and return its "data" member.
 *
 * @return mixed the decoded "data" value; an empty array when the feed
 *               cannot be fetched, is not a JSON object or has no "data"
 */
function get_items()
{
    $str = file_get_contents($this->feed_url);
    if ($str === false) {
        return array();
    }
    $items = json_decode($str);
    // json_decode() gives null on malformed input.
    if (!is_object($items) || !isset($items->data)) {
        return array();
    }
    return $items->data;
}
59 |
/**
 * Decide whether a child node is left out of the extracted content.
 * This rule keeps every node.
 *
 * @param DOMNode $node
 * @return bool always false
 */
function ignore_content($node)
{
    return false;
}
64 |
65 | }
66 |
--------------------------------------------------------------------------------
/rules/zoll.php:
--------------------------------------------------------------------------------
1 | first_h1 = true;
16 | $txt = array();
17 | $textbody = $xpath->query($this->text_cnt)->item(0)->parentNode;
18 | $this->remove_links($textbody);
19 | $txt = $this->_get_inner_html($textbody);
20 | if (trim($txt) == '') {
21 | $elm = $xpath->query('id("main")');
22 | return trim($elm[0]->nodeValue);
23 | }
24 | return $txt;
25 | }
26 |
/**
 * Decide whether a child node is left out of the extracted content.
 *
 * Dropped are: the first <h1> seen while $this->first_h1 is set, gallery
 * divs, <p class="navToTop">, divs that have a picture paragraph as a
 * direct child, and directURL spans.
 *
 * @param DOMNode $child
 * @return bool
 */
function ignore_content($child)
{
    if ($child->nodeName == 'h1' && $this->first_h1) {
        $this->first_h1 = false;
        return true;
    }
    if ($this->is_div_with_class($child)) {
        return true;
    }
    if ($this->is_elm_with_class($child, 'p')
        && $child->getAttribute('class') == 'navToTop') {
        return true;
    }
    if ($child->nodeName == 'div' && $child->hasChildNodes()) {
        foreach ($child->childNodes as $innerchild) {
            if ($this->is_p_with_a_picture($innerchild)) {
                return true;
            }
        }
    }
    return (bool) $this->is_directurl_span($child);
}
53 |
/**
 * True for a <p> whose class is exactly "picture rechts" or
 * "picture links".
 *
 * @param DOMNode $innerchild
 * @return bool
 */
private function is_p_with_a_picture($innerchild)
{
    if (!$this->is_elm_with_class($innerchild, 'p')) {
        return false;
    }
    $picture_classes = array('picture rechts', 'picture links');
    return in_array($innerchild->getAttribute('class'), $picture_classes);
}
63 |
/**
 * True for a <span> whose class attribute is exactly "directURL".
 *
 * Always returns a boolean; the previous version fell off the end and
 * returned null for spans with any other class.
 *
 * @param DOMNode $child
 * @return bool
 */
private function is_directurl_span($child)
{
    return $child->nodeName == 'span'
        && $child->hasAttribute('class')
        && $child->getAttribute('class') == 'directURL';
}
74 |
/**
 * True for a <div> whose class attribute is exactly "gallery".
 *
 * @param DOMNode $child
 * @return bool
 */
private function is_div_with_class($child)
{
    $is_classed_div = $child->nodeName == 'div' && $child->hasAttribute('class');
    return $is_classed_div && $child->getAttribute('class') == 'gallery';
}
85 |
/**
 * Read the image location from an element's href and make it absolute by
 * prefixing $this->base_url when it is not already a full URL.
 *
 * @param DOMElement $imgelm
 * @return string
 */
function get_image_custom($imgelm)
{
    $href = $imgelm->getAttribute('href');
    return $this->is_full_url($href) ? $href : $this->base_url . $href;
}
94 |
/**
 * Hook called when a post has no text; this rule has nothing to add and
 * leaves the post untouched.
 *
 * @param DOMXPath $xpath
 * @param object   $post
 * @return string always ''
 */
function get_missing_text($xpath, &$post)
{
    return '';
}
99 | }
100 |
101 |
102 |
--------------------------------------------------------------------------------
/parser.php:
--------------------------------------------------------------------------------
1 | 1) {
22 | $cls = $argv[1];
23 | $parts = explode(',', $cls);
24 | if (count($parts) > 1) {
25 | foreach ($parts as $part){
26 | $feed = new BlogFeed($part);
27 | $feed->parse_feed();
28 | $posts = array_merge($feed->posts, $posts);
29 | }
30 | } else {
31 | $feed = new BlogFeed($cls);
32 | $feed->parse_feed();
33 | $posts = $feed->posts;
34 | }
35 | } else {
36 | $ar = array('Berlin', 'Polizei', 'Presse', 'Saarland',
37 | 'Zoll', 'Sachsen', 'Brandenburg');
// Parse every default feed in turn, logging a timestamped progress line.
foreach ($ar as $cls) {
    $dt = new \DateTime;
    // 'H:i' is hours:minutes; the former 'h:m' printed the month number
    // in the minutes position.
    printf("[%s] - parsing: %s\n", $dt->format('d/m/Y - H:i'), $cls);
    $feed = new BlogFeed($cls);
    $feed->parse_feed();
    $posts = array_merge($posts, $feed->posts);
}
45 | }
46 |
// Unregister every autoload function registered so far so that none of
// them conflicts with WordPress' autoloader.
foreach (spl_autoload_functions() as $autoloader) {
    spl_autoload_unregister($autoloader);
}
53 |
54 | // Load WordPress
55 | require_once WORDPRESS_PATH .'/wp-load.php';
56 | require_once WORDPRESS_PATH .'/wp-admin/includes/post.php';
57 | require_once WORDPRESS_PATH .'/wp-admin/includes/taxonomy.php';
58 | require_once WORDPRESS_PATH .'/wp-admin/includes/file.php';
59 | require_once WORDPRESS_PATH .'/wp-admin/includes/media.php';
60 |
61 |
62 | $user_id = 9;
63 | $tmpl = '%s
Zum Originalartikel';
// Insert every parsed post into WordPress, skipping ones that already
// exist, then upload its pictures and prepend them to the content.
foreach ($posts as $post) {
    print_r($post);
    $content = sprintf($tmpl, $post->text, $post->link);
    if (post_exists($post->title, $content) !== 0) {
        echo "x";
        continue;
    }
    echo ".";

    // Resolve the category: an existing slug wins; otherwise (or when the
    // slug is unknown to WordPress) the category is created by name.
    $category_id = null;
    if (isset($post->category_slug)) {
        // get_category_by_slug() returns false for an unknown slug
        $term = get_category_by_slug($post->category_slug);
        if ($term) {
            $category_id = $term->term_id;
        }
    }
    if ($category_id === null) {
        if (isset($post->parent_category)) {
            $category_id = wp_create_category($post->category, $post->parent_category);
        } else {
            $category_id = wp_create_category($post->category);
        }
    }

    $id = wp_insert_post(array(
        'post_title' => $post->title,
        'post_content' => $content,
        'post_author' => $user_id,
        'post_type' => 'post',
        'post_status' => 'publish',
        'tax_input' => array('polizei report'),
    ));
    if (!$id) {
        echo "WARNING: Failed to insert post into WordPress\n";
        continue;
    }
    wp_set_post_terms($id, $category_id, 'category');

    // Check the type before counting: count() on a non-array is an error.
    if (!is_array($post->picture) || count($post->picture) == 0) {
        continue;
    }
    $cnt_pictures = count($post->picture);
    $imgs = array();
    foreach ($post->picture as $ky => $pic) {
        $attachment_id = Utils::upload_to_wordpress($pic, $id);
        if ($attachment_id == null) {
            continue;
        }
        $wp_img = wp_get_attachment_link($attachment_id);
        // every picture whose key is below the last index floats left
        if ($ky < $cnt_pictures - 1) {
            $wp_img = str_replace('class="', 'class="alignleft ', $wp_img);
        }
        $imgs[] = $wp_img;
    }
    if (count($imgs) == 0) {
        continue;
    }
    $imgs_txt = implode("", $imgs);
    $content = sprintf("%s
%s", $imgs_txt, $content);
    wp_update_post(array(
        'ID' => $id,
        'post_content' => $content,
    ));
}
120 |
--------------------------------------------------------------------------------
/test_rdf.php:
--------------------------------------------------------------------------------
1 | title = null;
22 | $post->link = $link;
23 | $post->date = null;
24 | $post->category = null;
25 | $xpath = $feed->get_page_obj($post);
26 | $blogfeed->parse_source_link($post);
27 | print_r($post);
28 | }
29 |
/**
 * Manual check of the Berlin rule: parses one live press release and
 * prints the extracted text.
 *
 * The page is fetched once, by parse_source_link(); the separate rule
 * instance and page object that were built and never used are gone.
 */
function test_berlin()
{
    $link = 'http://www.berlin.de/polizei/polizeimeldungen/pressemitteilung.504938.php';
    $blogfeed = new BlogFeed('Berlin');
    $post = new BlogPost;
    $post->title = 'sd';
    $post->link = $link;
    $post->date = 'sd';
    $post->category = "Berlin (Polizei)";
    $blogfeed->parse_source_link($post);
    print ($post->text);
}
44 |
/**
 * Manual check of the Zoll rule: parses one live press release and dumps
 * the pictures found.
 *
 * The page is fetched once, by parse_source_link(); the separate rule
 * instance and page object that were built and never used are gone.
 */
function test_zoll()
{
    $link = 'http://www.zoll.de/SharedDocs/Pressemitteilungen/DE/Produktpiraterie/2016/z83_plagiate_h.html';
    // alternative sample:
    // $link = 'http://www.zoll.de/SharedDocs/Pressemitteilungen/DE/Sonstiges/2016/z12_horb_schroeder_stuttgart.html';
    $blogfeed = new BlogFeed('Zoll');
    $post = new BlogPost;
    $post->title = 'title';
    $post->link = $link;
    $post->date = 'd';
    $post->category = "Zoll Deutschland (Bundesweite Meldungen des Dienstes “Zoll im Fokus”)";
    $blogfeed->parse_source_link($post);
    Utils::d($post->picture);
}
60 |
/**
 * Manual check of the Polizei rule: parses one live page, dumps the
 * extracted text and echoes it once more through preg_replace().
 *
 * The page is fetched once, by parse_source_link(); the separate rule
 * instance and page object that were built and never used are gone.
 */
function test_polizei()
{
    $link = 'http://www.polizei.bayern.de/oberfranken/fahndung/personen/tote/index.html/245358';
    $blogfeed = new BlogFeed('Polizei');
    $post = new BlogPost;
    $post->title = 'title';
    $post->link = $link;
    $post->date = 'd';
    $post->category = "Bayreuth (Polizeipräsidium Oberfranken)";
    $blogfeed->parse_source_link($post);
    Utils::d($post->text);
    echo preg_replace('//', '', $post->text);
}
76 |
/**
 * Manual check of the Brandenburg rule: feeds one canned JSON entry to
 * Brandenburg::parse() and prints the resulting post.
 */
function test_brandenbur()
{
    $link = 'https://polizei.brandenburg.de/fahndung/einbruch-in-arztpraxis-wer-erkennt-den-m/351879';
    $blogfeed = new BlogFeed('Brandenburg');
    $feed = new Brandenburg;
    $post = new BlogPost;
    // canned feed entry
    $item = json_decode('{"district": "500", "timestamp": 1470828300, "category": "8", "url": "/fahndung/polizei-bittet-bevoelkerung-um-mithilfe/351871", "title": "Polizei bittet Bevölkerung um Mithilfe", "text": "\nAktuell sucht die Kriminalpolizei in Potsdam nach noch drei unbekannten Männern. Diese sind verdächtigt, in der Silvesternacht vom 31.12.2013 auf den 01.101.2014 einen Briefkasten eines Verwaltungsgebäudes mittels Einsatz von Pyrotechnik erheblich beschädigt zu haben.\n\n \n\n \n\n \n\nDie Polizei fragt: Wer kennt die Männer auf den abgebildeten Fotos und kann Hinweise zu deren Identität oder Aufenthaltsort machen? Ihre Hinweise richten Sie bitte unter der Telefonnummer: 0331 5508- 1224 an die Polizeiinspektion Potsdam oder jede andere Polizeidienstelle. Gerne können sie auch unser Hinweisformular im Internet nutzen. Dieses erreichen Sie unter: www.polizei.brandenburg.de\n\n \n\nTatzeit: 01.01.2014\n", "thumbnail": "/fm/24/thumbnails/PM%201874%20Potsdam%201.jpg.66864.jpg", "images": [ "/fm/24/thumbnails/PM%201874%20Potsdam%201.jpg.66863.jpg", "/fm/24/thumbnails/PM-%201874%20Potsdam.jpg.66871.jpg" ], "e_ort": "Potsdam, Nördliche Innenstadt, Friedrich-Ebert-Straße" }');
    $feed->parse($post, $item);
    print_r($post);
}
87 |
88 | test_rdf();
89 |
90 |
--------------------------------------------------------------------------------
/incl/utils.php:
--------------------------------------------------------------------------------
1 | ', '
', '
');
25 | $output = str_replace($replace, "\n", $output);
26 | }
27 | file_put_contents($filename, $output);
28 | }
29 |
/**
 * Fetch a URL with cURL, following redirects.
 *
 * For HTML responses the strings listed in $replace are turned into line
 * feeds. When DEBUG is set the body is also written to the cache path
 * derived from the URL.
 *
 * @param string $url     address to download
 * @param bool   $is_html whether to apply the line-break replacement
 * @return string|false response body, or false when the transfer failed
 */
public static function download_content($url, $is_html=true)
{
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    $output = curl_exec($ch);
    curl_close($ch);
    // curl_exec() returns false on failure; do not post-process or cache it.
    if ($output === false) {
        return false;
    }
    if ($is_html) {
        // NOTE(review): the search strings read as bare line breaks in this
        // copy of the file - confirm against the original source.
        $replace = array('
', '
', '
');
        $output = str_replace($replace, "\n", $output);
    }
    if (DEBUG) {
        $filepath = self::get_cache_path($url);
        file_put_contents($filepath, $output);
    }
    return $output;
}
50 |
/**
 * Build the cache file path for a URL: cache/<host>/<md5 of the URL>.
 * The host directory is created when it does not exist yet.
 *
 * @param string $url
 * @return string relative cache file path
 */
public static function get_cache_path($url)
{
    $parts = parse_url($url);
    $cache_dir = "cache/{$parts['host']}";
    if (!file_exists($cache_dir)) {
        mkdir($cache_dir, 0777, true);
    }
    return $cache_dir . '/' . md5($url);
}
61 |
/**
 * Build the per-host image locations for a URL.
 *
 * @param string $url
 * @return array two strings: IMG_STORE_PATH . host . '/' and
 *               IMG_SRC_PATH . host . '/'
 */
public static function get_base_url($url)
{
    $parts = parse_url($url);
    $host_dir = $parts['host'] . '/';
    return array(IMG_STORE_PATH . $host_dir, IMG_SRC_PATH . $host_dir);
}
69 |
70 |
/**
 * Debug helper: print a value between a leading and a trailing echo.
 * Objects go through var_dump(), arrays through print_r(), anything else
 * through print.
 *
 * @param mixed $elm value to show
 * @param bool  $vd  unused
 * @return void
 */
public static function d($elm, $vd=false)
{
    echo '
';
    switch (true) {
        case is_object($elm):
            var_dump($elm);
            break;
        case is_array($elm):
            print_r($elm);
            break;
        default:
            print($elm);
    }
    echo '';
}
83 |
/**
 * Upload a local or remote file into the WordPress media library and
 * attach it to a post.
 *
 * Declared static because parser.php calls it as
 * Utils::upload_to_wordpress(); calling a non-static method that way is
 * an error on current PHP versions.
 *
 * @param string $file           path or URL of the file to upload
 * @param int    $parent_post_id post the attachment belongs to
 * @return int|null|false attachment id on success, null when reading,
 *                        uploading or inserting fails, false when the
 *                        WordPress upload API is not loaded
 */
public static function upload_to_wordpress($file, $parent_post_id)
{
    if (!function_exists('wp_upload_bits')) {
        print("Not available");
        return false;
    }
    $bits = file_get_contents($file);
    if ($bits === false) {
        return null;
    }
    $filename = basename($file);
    $upload_file = wp_upload_bits($filename, null, $bits);
    if ($upload_file['error']) {
        return null;
    }
    $wp_filetype = wp_check_filetype($filename, null);
    $attachment = array(
        'post_mime_type' => $wp_filetype['type'],
        'post_parent' => $parent_post_id,
        // title is the file name without its extension
        'post_title' => preg_replace('/\.[^.]+$/', '', $filename),
        'post_content' => '',
        'post_status' => 'inherit'
    );
    $attachment_id = wp_insert_attachment($attachment, $upload_file['file'],
        $parent_post_id);
    // Never hand an error object or 0 back as if it were an attachment id.
    if (is_wp_error($attachment_id) || !$attachment_id) {
        return null;
    }
    require_once(WORDPRESS_PATH . '/wp-admin/includes/image.php');
    $attachment_data = wp_generate_attachment_metadata($attachment_id,
        $upload_file['file']);
    wp_update_attachment_metadata($attachment_id, $attachment_data);
    add_post_meta($parent_post_id, 'article_images', $attachment_id, true);
    return $attachment_id;
}
113 |
/**
 * Download an image and sideload it into the WordPress media library,
 * attached to a post.
 *
 * Fixes over the previous version: the download now uses the $img
 * argument (it read an undefined $url), a failed download stops the
 * upload instead of continuing with an empty temp file, and the new
 * attachment id is returned on success.
 *
 * @param string $img     URL of the image
 * @param int    $post_id post to attach the image to
 * @return int|WP_Error attachment id, or the error from the download or
 *                      sideload step
 */
public static function upload_media_sideload($img, $post_id)
{
    $tmp = download_url($img);
    if (is_wp_error($tmp)) {
        return $tmp;
    }
    // NOTE(review): fixed description string - confirm it is intended.
    $desc = "The WordPress Logo";

    // Derive the file name from the URL without its query string.
    $name_source = $img;
    if (preg_match('/[^\?]+\.(jpg|jpe|jpeg|gif|png)/i', $img, $matches)) {
        $name_source = $matches[0];
    }
    $file_array = array(
        'name' => basename($name_source),
        'tmp_name' => $tmp,
    );

    $id = media_handle_sideload($file_array, $post_id, $desc);

    // If storing permanently failed, remove the temp file.
    if (is_wp_error($id)) {
        @unlink($file_array['tmp_name']);
    }
    return $id;
}
144 | }
145 |
146 |
147 |
--------------------------------------------------------------------------------
/rules/polizei.php:
--------------------------------------------------------------------------------
1 | "München (Bayerisches Landeskriminalamt)",
22 | "verwaltungsamt" => "München (Polizeiverwaltungsamt)",
23 | "bepo" => "München (Bayerische Bereitschaftspolizei)",
24 | "muenchen" => "München (Polizei)",
25 | "niederbayern" => "Straubing (Polizeipräsidium Niederbayern)",
26 | "oberbayern_nord" => "Ingolstadt (Polizeipräsidium Oberbayern Nord)",
27 | "oberbayern" => "Rosenheim (Polizeipräsidiums Oberbayern Süd)",
28 | "oberfranken" => "Bayreuth (Polizeipräsidium Oberfranken)",
29 | "oberpfalz" => "Regensburg (Polizeipräsidium Oberpfalz)",
30 | "schwaben" => "Augsburg (Polizeipräsidium Schwaben Nord)",
31 | "schwaben_sw" => "Kempten (Polizeipräsidium Schwaben Süd/West)",
32 | "unterfranken" => "Würzburg (Polizeipräsidiums Unterfranken)",
33 | );
34 | var $category_slug = array(
35 | 'muenchen' => 'muenchen-polizei'
36 | );
37 | var $text_cnt = '';
38 | var $imgs_sel = '//div[@class="inhaltBilderZoom"]/a';
39 | var $custom_image_src = true;
40 |
41 | var $replace_elms_child = array('imp:live-info');
42 | var $replace_with_child = array('div');
43 |
/**
 * Collect the markup of the siblings following the first <h1> and store
 * it, trimmed, in $post->content.
 *
 * Collection stops at the first node stop_adding() accepts; nodes
 * rejected by should_ignore() are skipped. Nothing is written when the
 * page has no <h1>.
 *
 * NOTE(review): the result goes to ->content while the caller tests
 * ->text before invoking this hook - confirm which property is meant.
 *
 * @param DOMXPath $xpath
 * @param object   $post
 * @return void
 */
function get_missing_text($xpath, $post)
{
    $heading = $xpath->query('//h1')->item(0);
    // No heading means there is nothing to walk from.
    if ($heading == null) {
        return;
    }
    $document = $heading->ownerDocument;
    $innerhtml = array();
    for ($node = $heading->nextSibling; $node; $node = $node->nextSibling) {
        if ($this->stop_adding($node)) {
            break;
        }
        if ($this->should_ignore($node)) {
            continue;
        }
        $innerhtml[] = $document->saveXML($node);
    }
    $post->content = trim(implode("", $innerhtml));
}
62 |
/**
 * Extract the article markup: the children of the first <h1>'s parent
 * that come after the heading, up to the first node stop_adding()
 * accepts, leaving out nodes rejected by should_ignore().
 *
 * Links are removed from the container before it is walked.
 *
 * @param DOMXPath $xpath
 * @return string collected markup, or '' when the page has no <h1>
 */
function get_content($xpath)
{
    $heading = $xpath->query('//h1')->item(0);
    if ($heading == null) {
        return '';
    }
    $textbody = $heading->parentNode;
    $this->remove_links($textbody);
    $childNodes = $textbody->childNodes;
    $innerhtml = array();
    // find the position of the heading among its siblings
    $indx = 0;
    foreach ($childNodes as $indx => $child) {
        if ($child->nodeName == 'h1') {
            break;
        }
    }
    // start with the node right after the heading
    ++$indx;
    for (; $indx < $childNodes->length; ++$indx) {
        $child = $childNodes->item($indx);
        if ($this->stop_adding($child)) {
            break;
        }
        if ($this->should_ignore($child)) {
            continue;
        }
        $innerhtml[] = $textbody->ownerDocument->saveXML($child);
    }
    $txt = implode("", $innerhtml);
    return preg_replace('//', '', $txt);
}
90 |
91 |
/**
 * Decide whether a node is left out of the collected markup.
 *
 * Dropped are nodes named #text, #script, img, a or #style, tables whose
 * text is blank, and elements with class "inhaltBilderZoom".
 *
 * @param DOMNode $child
 * @return bool
 */
function should_ignore($child)
{
    $skipped_names = array('#text', '#script', 'img', 'a', '#style');
    if (in_array($child->nodeName, $skipped_names, true)) {
        return true;
    }
    if ($child->nodeName == 'table' && trim($child->nodeValue) == '') {
        return true;
    }
    // only nodes that offer getAttribute() can carry a class
    return method_exists($child, 'getAttribute')
        && $child->getAttribute('class') == 'inhaltBilderZoom';
}
112 |
/**
 * True when the node marks the end of the article body, i.e. it offers
 * getAttribute() and its class is "inhaltFooter".
 *
 * @param DOMNode $child
 * @return bool
 */
function stop_adding($child)
{
    if (!method_exists($child, 'getAttribute')) {
        return false;
    }
    return $child->getAttribute('class') == 'inhaltFooter';
}
123 |
/**
 * Use the text of the page's first <h1> as the post title.
 * The title is left as it is when the page has no <h1>.
 *
 * @param DOMXPath $xpath
 * @param object   $post
 * @return void
 */
function get_missing_title($xpath, &$post)
{
    $heading = $xpath->query('//h1')->item(0);
    if ($heading != null) {
        $post->title = $heading->nodeValue;
    }
}
128 |
/**
 * Derive the category from the post link: the first path segment that is
 * a key of $this->category selects the category, and the matching
 * category slug when one is configured for that key.
 *
 * @param DOMXPath $xpath unused
 * @param object   $post
 * @return void
 */
function get_missing_category($xpath, &$post)
{
    $known = array_keys($this->category);
    foreach (explode('/', $post->link) as $seg) {
        if (!in_array($seg, $known)) {
            continue;
        }
        $post->category = $this->category[$seg];
        if (array_key_exists($seg, $this->category_slug)) {
            $post->category_slug = $this->category_slug[$seg];
        }
        // only the first matching segment counts
        return;
    }
}
143 |
/**
 * Pull the image location out of an element's href: the first
 * single-quoted part of the attribute value.
 *
 * @param DOMElement $elm
 * @return string|null quoted part without the quotes, or null when the
 *                     href contains none
 */
function get_image_custom($elm)
{
    $href = $elm->getAttribute('href');
    // Without a match $match has no index 1 to read.
    if (preg_match("/'(.[^']*)'/", $href, $match) !== 1) {
        return null;
    }
    return $match[1];
}
150 |
151 | }
152 |
--------------------------------------------------------------------------------
/incl/basefeed.php:
--------------------------------------------------------------------------------
1 | ');
5 |
6 | class Basefeed
7 | {
8 | var $img_root;
9 |
/**
 * Download a page and prepare its markup for parsing.
 *
 * Content that is_html_file() rejects yields false. Otherwise the UTF8_TAG
 * replacement is applied and, when the rule defines replace_elms_child,
 * those strings are swapped for replace_with_child.
 *
 * @param string $link
 * @return string|false prepared markup, or false for non-HTML content
 * @author Tarin Mahmood
 **/
protected function download_content($link)
{
    $html = Utils::download_content($link);
    if ($this->is_html_file($html, $link)) {
        // force html utf8
        $html = str_replace('', UTF8_TAG, $html);
        // rules may rename elements before parsing
        if (property_exists($this, 'replace_elms_child')) {
            $html = str_replace(
                $this->replace_elms_child,
                $this->replace_with_child,
                $html
            );
        }
        return $html;
    }
    return false;
}
31 |
/**
 * Parse markup into a DOM, strip every <script> element and wrap the
 * document in an XPath object.
 *
 * @param string $content HTML markup
 * @return DOMXPath
 * @author Tarin Mahmood
 **/
protected function get_xpath($content)
{
    $doc = new DOMDocument();
    @$doc->loadHTML($content);
    // remove scripts one at a time, looking the list up again after each removal
    for (;;) {
        $scripts = $doc->getElementsByTagName("script");
        if (!$scripts->length) {
            break;
        }
        $script = $scripts->item(0);
        $script->parentNode->removeChild($script);
    }
    return new DOMXpath($doc);
}
47 |
/**
 * Download the page behind $post->link and return it as an XPath object.
 *
 * @param object $post post whose ->link is fetched
 * @return DOMXPath|null null when the page could not be downloaded or is
 *                       not HTML, so callers can skip the post
 */
function get_page_obj($post)
{
    // download page
    $content = $this->download_content($post->link);
    // download_content() yields false for non-HTML; never parse that
    if (!is_string($content) || $content === '') {
        return null;
    }
    // return xpath
    return $this->get_xpath($content);
}
59 |
/**
 * Tell whether downloaded content is HTML, judged by its detected MIME
 * type.
 *
 * The type is sniffed from the in-memory string, so no temp file and no
 * writable IMG_SRC_PATH/tmp directory is needed.
 *
 * @param string|false $content downloaded body
 * @param string       $link    source URL (kept for compatibility, unused)
 * @return bool true when the MIME type is text/html
 * @author Tarin Mahmood
 **/
public function is_html_file($content, $link)
{
    // a failed download cannot be HTML
    if (!is_string($content) || $content === '') {
        return false;
    }
    $finfo = new finfo(FILEINFO_MIME_TYPE);
    return $finfo->buffer($content) == 'text/html';
}
73 |
/**
 * Check whether an image URL is on the rule's $bad_url list.
 *
 * @param string $imgurl
 * @return bool false when the rule defines no list
 * @author Tarin Mahmood
 **/
function is_bad_url($imgurl)
{
    if (!isset($this->bad_url)) {
        return false;
    }
    return in_array($imgurl, $this->bad_url);
}
84 |
/**
 * True when the string starts with "https://" or "http://".
 *
 * @param string $imgsrc
 * @return bool
 */
function is_full_url($imgsrc)
{
    foreach (array('https://', 'http://') as $scheme) {
        if (strncmp($imgsrc, $scheme, strlen($scheme)) === 0) {
            return true;
        }
    }
    return false;
}
90 |
/**
 * True when the rule sets a truthy $custom_image_src, i.e. image
 * locations are read through get_image_custom().
 *
 * @return bool
 */
function call_custom_image_parsing()
{
    return !empty($this->custom_image_src);
}
96 |
/**
 * Find the page's images with $this->imgs_sel (one XPath expression or a
 * list of them) and hand them to download_store_images().
 *
 * @param DOMXPath $xpath
 * @return array|string result of download_store_images(), or "" when no
 *                      image matched
 * @author Tarin Mahmood
 */
function get_image($xpath)
{
    if (is_array($this->imgs_sel)) {
        // several selectors: gather all matches into one list
        $images = array();
        foreach ($this->imgs_sel as $sel) {
            foreach ($xpath->query($sel) as $img) {
                $images[] = $img;
            }
        }
        $count_images = count($images);
    } else {
        $images = $xpath->query($this->imgs_sel);
        $count_images = $images->length;
    }
    return $count_images == 0 ? "" : $this->download_store_images($images);
}
124 |
/**
 * Store every distinct image of a page below the per-host store
 * directory and return the stored file paths.
 *
 * Each file is named after the md5 of its source string; JPEG and PNG
 * files get a matching extension. data: URIs are decoded to their bytes,
 * other sources are downloaded (relative ones are prefixed with
 * $this->base_url). Images that cannot be obtained or written are left
 * out of the result.
 *
 * @param iterable $images elements carrying the image location
 * @return array paths of the stored files
 */
function download_store_images($images)
{
    $imgs = array();
    $added = array();
    list($storepath) = Utils::get_base_url($this->base_url);
    if (!file_exists($storepath)) {
        mkdir($storepath, 0777, true);
    }
    foreach ($images as $imgelm) {
        $imgsrc = $this->call_custom_image_parsing()
            ? $this->get_image_custom($imgelm)
            : $imgelm->getAttribute('src');
        // each source is handled once
        if (in_array($imgsrc, $added)) {
            continue;
        }
        $added[] = $imgsrc;

        if (strpos($imgsrc, 'data:') === 0) {
            $imgcontent = $this->decode_data_uri($imgsrc);
        } else {
            if ($this->is_bad_url($imgsrc)) {
                continue;
            }
            $imgurl = $this->is_full_url($imgsrc)
                ? $imgsrc
                : $this->base_url . $imgsrc;
            $imgcontent = file_get_contents($imgurl);
        }
        // nothing usable: do not report a file that was never written
        if ($imgcontent === false || $imgcontent == null) {
            continue;
        }

        $filepath = $storepath . md5($imgsrc);
        if (file_put_contents($filepath, $imgcontent) === false) {
            continue;
        }
        $type = exif_imagetype($filepath);
        if ($type == IMAGETYPE_JPEG) {
            $newpath = "$filepath.jpg";
            rename($filepath, $newpath);
            $filepath = $newpath;
        } elseif ($type == IMAGETYPE_PNG) {
            $newpath = "$filepath.png";
            rename($filepath, $newpath);
            $filepath = $newpath;
        }
        $imgs[] = $filepath;
    }
    return $imgs;
}

/**
 * Turn a data: URI into the raw bytes it carries.
 *
 * @param string $uri
 * @return string|false decoded payload, false when the URI has no payload
 */
protected function decode_data_uri($uri)
{
    $comma = strpos($uri, ',');
    if ($comma === false) {
        return false;
    }
    $header = substr($uri, 0, $comma);
    $payload = substr($uri, $comma + 1);
    if (stripos($header, ';base64') !== false) {
        return base64_decode($payload);
    }
    return rawurldecode($payload);
}
183 |
/**
 * Serialise the children of a node that the rule does not ignore.
 *
 * The pieces are joined with single spaces, runs of line breaks are
 * collapsed to one line feed and a line feed is appended.
 *
 * @param DOMNode|null $node container node
 * @return string serialised children, or '' when $node is null
 */
protected function _get_inner_html($node)
{
    if ($node == null) {
        return '';
    }
    $txt = array();
    foreach ($node->childNodes as $child) {
        if ($this->ignore_content($child)) {
            continue;
        }
        // NOTE(review): this tests the container's text, not the child's -
        // confirm that is intended.
        if (trim($node->nodeValue) == '') {
            continue;
        }
        $txt[] = $node->ownerDocument->saveXML($child);
    }
    $txt = implode(" ", $txt);
    $txt = preg_replace("/[\r\n]+/", "\n", $txt);
    return "$txt
";
}

/**
 * True when the node has the given name and carries a class attribute.
 *
 * @param DOMNode $node
 * @param string  $nodename
 * @return bool
 */
protected function is_elm_with_class($node, $nodename)
{
    return $node->nodeName == $nodename &&
        $node->hasAttribute('class');
}

/**
 * Return the value of an attribute, or false when the element does not
 * have it.
 *
 * The requested attribute is honoured now; previously "id" was always
 * looked up regardless of $attr.
 *
 * @param DOMElement $elm
 * @param string     $attr attribute name
 * @return string|false
 */
protected function is_elm_with_attr($elm, $attr)
{
    if ($elm->hasAttribute($attr)) {
        return $elm->getAttribute($attr);
    }
    return false;
}

/**
 * Remove every <a> element below a node.
 *
 * @param DOMNode|null $parentnode nothing happens for null
 * @return void
 */
function remove_links($parentnode)
{
    if ($parentnode == null) {
        return;
    }
    // collect first: removing while iterating would skip nodes
    $tobe_removed = array();
    foreach ($parentnode->getElementsByTagName('a') as $link) {
        $tobe_removed[] = $link;
    }
    foreach ($tobe_removed as $link) {
        $link->parentNode->removeChild($link);
    }
}

/**
 * Copy the rule's parent category onto the post and, when the post has
 * no category slug yet, the slug configured for its category.
 *
 * @param object $post
 * @return void
 * @author Tarin Mahmood
 **/
public function set_category_details(&$post)
{
    if (isset($this->parent_category)) {
        $post->parent_category = $this->parent_category;
    }
    // isset() first: the property may not exist on the post at all
    $has_no_slug = !isset($post->category_slug) || $post->category_slug == null;
    if ($has_no_slug && $this->category_slug_set($post)) {
        $post->category_slug = $this->category_slug[$post->category];
    }
}

/**
 * True when the rule defines a slug for the post's category.
 *
 * @param object $post
 * @return bool
 */
function category_slug_set($post)
{
    return isset($this->category_slug)
        && (is_string($post->category) || is_int($post->category))
        && array_key_exists($post->category, $this->category_slug);
}

/**
 * text_formatting, placeholder function; returns the text unchanged.
 *
 * @param string $text
 * @return string
 * @author Tarin Mahmood
 **/
public function text_formatting($text)
{
    return $text;
}

}