├── boot.php ├── lang └── de_de.lang ├── pages ├── index.php ├── info.php ├── crawler.php └── settings.php ├── fragments └── loglink.php ├── package.yml ├── assets └── css │ └── linkchecker_css.css ├── README.md ├── LICENSE └── lib └── linkchecker.php /boot.php: -------------------------------------------------------------------------------- 1 | getAssetsUrl('css/linkchecker_css.css')); 4 | } 5 | 6 | 7 | -------------------------------------------------------------------------------- /lang/de_de.lang: -------------------------------------------------------------------------------- 1 | linkchecker_title = Linkchecker 2 | linkchecker_einstellungen = Einstellungen 3 | linkchecker_crawler = Links checken 4 | 5 | 6 | linkchecker_info = Info -------------------------------------------------------------------------------- /pages/index.php: -------------------------------------------------------------------------------- 1 | i18n('linkchecker_title')); 3 | 4 | //include rex_be_controller::getCurrentPageObject()->getSubPath(); 5 | rex_be_controller::includeCurrentPageSubPath(); 6 | -------------------------------------------------------------------------------- /fragments/loglink.php: -------------------------------------------------------------------------------- 1 |
2 | 3 |
status_code ?>
4 |
-------------------------------------------------------------------------------- /pages/info.php: -------------------------------------------------------------------------------- 1 | '.$Parsedown->text($file).''; 7 | 8 | $fragment = new rex_fragment(); 9 | $fragment->setVar('title', $this->i18n('linkchecker_info')); 10 | $fragment->setVar('body', $content, false); 11 | echo $fragment->parse('core/page/section.php'); 12 | 13 | 14 | -------------------------------------------------------------------------------- /package.yml: -------------------------------------------------------------------------------- 1 | package: linkchecker 2 | version: '0.0.beta' 3 | author: Friends Of REDAXO 4 | supportpage: https://github.com/FriendsOfREDAXO/linkchecker 5 | 6 | page: 7 | title: translate:linkchecker_title 8 | perm: linkchecker[params] 9 | icon: rex-icon fa-chain-broken 10 | pjax: false 11 | subpages: 12 | settings: { title: translate:linkchecker_einstellungen, perm: admin } 13 | crawler: { title: translate:linkchecker_crawler } 14 | info: { title: translate:linkchecker_info } 15 | 16 | requires: 17 | redaxo: '^5.5' 18 | -------------------------------------------------------------------------------- /assets/css/linkchecker_css.css: -------------------------------------------------------------------------------- 1 | .lc_log { display: flex; flex-wrap: nowrap; width: 100%; } 2 | .lc_log + .lc_log { border-top: 1px solid #bbbbbb; } 3 | .lc_log div { padding: 4px } 4 | .lc_log .code200 { background: green; color: #fff; } 5 | .lc_log .code301 { background: red; color: #fff; } 6 | .lc_log .code302 { background: red; color: #fff; } 7 | .lc_log .code404 { background: red; color: #fff; } 8 | .lc_log .code400 { background: red; color: #fff; } 9 | .lc_log .codeerr { background: red; color: #fff; } 10 | .lc_log .codemailto { background: yellow; } 11 | .lc_log .codetel { background: yellow; } 12 | .lc_log .clink { width: 100%; } 13 | .lc_log .statuscode { width: 50px; text-align: center; 
} 14 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### Linkchecker ### 2 | 3 | 4 | Hier eine kurze Anleitung schreiben. 5 | Aber erst wenn alle Grundfunktionen fertig sind :-) 6 | 7 | Infos zu Statuscodes: https://de.wikipedia.org/wiki/HTTP-Statuscode 8 | 9 | --- 10 | 11 | ### ToDo ### 12 | 13 | Während der Ausführung wird die Ausgabe noch nicht korrekt aktualisiert. 14 | 15 | --- 16 | 17 | ### Autor 18 | 19 | **Friends Of REDAXO** 20 | 21 | * http://www.redaxo.org 22 | * https://github.com/FriendsOfREDAXO 23 | 24 | **Projekt-Lead** 25 | 26 | * [Oliver Kreischer](https://github.com/olien) 27 | 28 | ___ 29 | 30 | * Idee und Realisierung der ersten Version: [Wolfgang Bund](https://github.com/dtpop) // [Oliver Kreischer](https://github.com/olien) // [Tim Filler](https://github.com/elricco) 31 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Friends Of REDAXO 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /pages/crawler.php: -------------------------------------------------------------------------------- 1 | 11 |
12 |
Ergebnis
13 |
'; 14 | $linkchecker = new linkchecker(); 15 | $linkchecker->run(); 16 | echo '
17 |
18 | '; 19 | 20 | 21 | } else { 22 | $content .= ' 23 |
24 |
25 | Linkchecker - Crawler starten 26 | '; 27 | $fragment = new rex_fragment(); 28 | $formElements = array(); 29 | $elements = array(); 30 | $elements['field'] = ' 31 | 32 | '; 33 | $formElements[] = $elements; 34 | $fragment = new rex_fragment(); 35 | $fragment->setVar('elements', $formElements, false); 36 | $content .= $fragment->parse('core/form/submit.php'); 37 | $content .= ' 38 |
39 |
40 | '; 41 | } 42 | 43 | $fragment = new rex_fragment(); 44 | if ($config['submit']) { 45 | $fragment->setVar('title', $this->i18n('linkchecker_crawler')); 46 | } 47 | $fragment->setVar('body', $content, false); 48 | echo $fragment->parse('core/page/section.php'); 49 | -------------------------------------------------------------------------------- /pages/settings.php: -------------------------------------------------------------------------------- 1 | setConfig('baselink',$config['baselink']); 22 | $this->setConfig('depth',$config['depth']); 23 | $this->setConfig('maxlinks',$config['maxlinks']); 24 | $this->setConfig('no200',$config['no200']); 25 | $form .= rex_view::info('Werte gespeichert'); 26 | } 27 | 28 | // open form 29 | $form .= ' 30 |
31 |
32 | Linkchecker - Einstellungen 33 | '; 34 | 35 | $fragment = new rex_fragment(); 36 | 37 | // Ausgangslink 38 | $formElements = []; 39 | $n = []; 40 | $n['label'] = ''; 41 | $n['field'] = ''; 42 | $n['note'] = 'Bitte unbedingt den richtigen und vollständigen Link eingeben (http(s)://(www.).'; 43 | $formElements[] = $n; 44 | $fragment->setVar('elements', $formElements, false); 45 | $form .= $fragment->parse('core/form/container.php'); 46 | 47 | 48 | // Tiefe 49 | $formElements = []; 50 | $n = []; 51 | $n['label'] = ''; 52 | $n['field'] = ''; 53 | $n['note'] = 'Tiefe (Anzahl der Ebenen), die bei der Linksuche berücksichtigt werden sollen.'; 54 | $formElements[] = $n; 55 | $fragment->setVar('elements', $formElements, false); 56 | $form .= $fragment->parse('core/form/container.php'); 57 | 58 | // Maximale Anzahl an Links 59 | $formElements = []; 60 | $n = []; 61 | $n['label'] = ''; 62 | $n['field'] = ''; 63 | $n['note'] = 'Anzahl der Links, die maximal ausgewertet werden sollen.'; 64 | $formElements[] = $n; 65 | $fragment->setVar('elements', $formElements, false); 66 | $form .= $fragment->parse('core/form/container.php'); 67 | 68 | 69 | // Maximale Anzahl an Links 70 | $formElements = []; 71 | $n = []; 72 | $n['label'] = ''; 73 | $n['field'] = 'getConfig('no200') ? ' checked="checked"' : '') . ' />'; 74 | $n['note'] = 'Funktioniert noch nicht!'; 75 | $formElements[] = $n; 76 | $fragment->setVar('elements', $formElements, false); 77 | $form .= $fragment->parse('core/form/container.php'); 78 | 79 | 80 | $form .= '
'; 81 | 82 | $form .= '
' 83 | . ''; 84 | 85 | 86 | 87 | // create submit button 88 | $formElements = array(); 89 | $elements = array(); 90 | $elements['field'] = ' 91 | 92 | '; 93 | $formElements[] = $elements; 94 | 95 | // parse submit element 96 | $fragment = new rex_fragment(); 97 | $fragment->setVar('elements', $formElements, false); 98 | $form .= $fragment->parse('core/form/submit.php'); 99 | 100 | // close form 101 | $form .= ' 102 |
103 |
104 | '; 105 | 106 | $fragment = new rex_fragment(); 107 | $fragment->setVar('class', 'edit'); 108 | $fragment->setVar('title', 'Einstellung'); 109 | $fragment->setVar('body', $form, false); 110 | echo $fragment->parse('core/page/section.php'); 111 | -------------------------------------------------------------------------------- /lib/linkchecker.php: -------------------------------------------------------------------------------- 1 | baselink = rex_config::get('linkchecker','baselink'); 19 | $this->maxlinks = rex_config::get('linkchecker','maxlinks'); 20 | $this->maxdepth = rex_config::get('linkchecker','depth'); 21 | $this->no200 = rex_config::get('linkchecker','no200'); 22 | 23 | $parsed_root = parse_url($this->baselink); 24 | $this->real_root = $parsed_root['scheme'] . '://' . $parsed_root['host'] . '/'; 25 | 26 | } 27 |
    /**
     * Crawls the site starting at the configured base link and prints a
     * completion notice once the crawl has finished.
     */
    public function run () {
        $this->crawl_page($this->baselink);
        echo rex_view::info('Fertig');
    }

    /**
     * Checks all links of one page and then recurses into the same-site pages
     * found on it.
     *
     * The page is registered in $this->pages_checked before anything else, so
     * it is never queued a second time. $this->depth is incremented for the
     * duration of the page and is always decremented again, whichever way the
     * link check ends.
     *
     * @param string $plink absolute URL of the page to crawl
     */
    private function crawl_page ($plink) {
        $this->pages_checked[$plink]['link'] = $plink;

        if ($this->depth >= $this->maxdepth) {
            $this->log_page($plink, 'Seitentiefe überschritten!');
            return;
        }

        $this->depth++;
        $this->log_page($plink);

        foreach ($this->collect_page_links($plink) as $page) {
            // A page queued here may already have been crawled through an
            // earlier sibling; do not fetch and log it a second time.
            if (!isset($this->pages_checked[$page])) {
                $this->crawl_page($page);
            }
        }

        $this->depth--;
    }

    /**
     * Fetches a page, checks and logs every <a href> on it and returns the
     * links that should be crawled next.
     *
     * @param string $plink absolute URL of the page to fetch
     * @return array same-site links with a status code below 400, keyed by
     *               URL; empty when the page could not be loaded or the link
     *               limit was reached
     */
    private function collect_page_links ($plink) {
        $html = file_get_contents($plink);
        if ($html === false || trim($html) === '') {
            // DOMDocument::loadHTML() cannot work with a failed or empty download.
            $this->log_page($plink, 'Seite konnte nicht geladen werden');
            return [];
        }

        // Real-world HTML is rarely valid; collect parser errors internally
        // instead of emitting warnings, and drop them afterwards.
        libxml_use_internal_errors(true);
        $doc = new DOMDocument();
        $doc->loadHTML($html);
        libxml_clear_errors();

        $this->base = $this->get_base_href($doc);

        $pages_to_crawl = [];
        foreach ($doc->getElementsByTagName('a') as $a) {
            // ">=" so that at most maxlinks distinct links are evaluated.
            if (count($this->links_checked) >= $this->maxlinks) {
                $this->log_page($plink, 'Maximale Linkanzahl überschritten');
                return [];
            }

            $alink = $a->getAttribute('href');
            $clink = $this->get_checklink($alink);

            if (!$clink) {
                $this->log_link($clink, $alink);
                continue;
            }

            // parse_url() yields null/false when there is no scheme or the URL is malformed.
            $scheme = parse_url($clink, PHP_URL_SCHEME);
            $scheme = is_string($scheme) ? strtolower($scheme) : '';

            if (in_array($scheme, ['tel', 'mailto'], true)) {
                $this->log_link($clink, $alink, $scheme);
                continue;
            }

            // Do not check a link twice, but log every occurrence with the
            // result that was stored for it.
            if (isset($this->links_checked[$alink])) {
                $known = $this->links_checked[$alink]['status'];
                $this->log_link($known['clink'], $alink, $known['status_code']);
                continue;
            }

            $status_code = false;
            if (strpos($scheme, 'http') === 0) {
                $status_code = $this->get_status_code($clink);
            }

            // Only follow links that answered without an error, belong to the
            // crawled site and have not been crawled yet (unique per page).
            if ($status_code && $status_code < 400 && $this->check_page_link($clink)) {
                $pages_to_crawl[$clink] = $clink;
            }
            $this->log_link($clink, $alink, $status_code);
        }

        return $pages_to_crawl;
    }

    /**
     * Determines the prefix that scheme-less links of a page are resolved against.
     *
     * @param DOMDocument $doc parsed page
     * @return string the page's <base href> without surrounding slashes plus a
     *                trailing slash, or $this->real_root when the page has none
     */
    private function get_base_href (DOMDocument $doc) {
        $base_node = $doc->getElementsByTagName('base')->item(0);
        $href = $base_node ? $base_node->getAttribute('href') : '';
        if ($href !== '') {
            return trim($href, '/') . '/';
        }
        return $this->real_root;
    }

    /**
     * Stores the result for a link and prints it through the loglink fragment.
     *
     * @param string|false $clink       link that was checked; false when
     *                                  get_checklink() rejected the href
     * @param string       $alink       href exactly as found in the page
     * @param mixed        $status_code HTTP status code, 'tel'/'mailto', false
     *                                  when no status was determined, or 'err'
     */
    private function log_link ($clink,$alink,$status_code = 'err') {
        $this->links_checked[$alink]['status'] = [
            'clink' => $clink,
            'alink' => $alink,
            'status_code' => $status_code,
        ];

        $fragment = new rex_fragment();
        $fragment->setVar('clink', $clink);
        $fragment->setVar('alink', $alink);
        $fragment->setVar('status_code', $status_code);
        echo $fragment->parse('loglink.php');
        // Output is not flushed here, so the log only appears once the whole
        // crawl has finished (see the ToDo in README.md).
    }

    /**
     * Prints the page that is being crawled, an optional message and the
     * current depth.
     *
     * $link originates from crawled pages and is therefore escaped before it
     * is written into the backend output.
     *
     * @param string $link    URL of the page
     * @param string $message optional notice, e.g. why the page is skipped
     */
    private function log_page ($link,$message = '') {
        echo '

Aktuelle Seite: ' . htmlspecialchars($link, ENT_QUOTES, 'UTF-8') . '

';
        if ($message) {
            echo '

' . htmlspecialchars($message, ENT_QUOTES, 'UTF-8') . '

';
        }
        echo '

Tiefe: ' . (int) $this->depth . '

';
    }

    /**
     * Requests the headers of a link and extracts the HTTP status code.
     *
     * @param string $link absolute http(s) URL
     * @return string|false three-digit status code of the first response, or
     *                      false when the request failed or the status line
     *                      could not be read
     */
    private function get_status_code ($link) {
        $header = get_headers($link);
        if (!$header) {
            return false;
        }
        // $header[0] is the status line of the first response, e.g. "HTTP/1.1 200 OK".
        if (!preg_match('~^HTTP/\S+\s+(\d{3})~i', $header[0], $match)) {
            return false;
        }
        return $match[1];
    }

    /**
     * Turns an href into the URL that is actually checked.
     *
     * Links with a scheme are returned unchanged. Protocol-relative links
     * ("//host/path") get the scheme of the current base. Every other link is
     * appended to $this->base (the page's base href or the start address).
     * NOTE(review): relative links are resolved against the base only, not
     * against the directory of the current page.
     *
     * @param string $alink href as found in the page
     * @return string|false URL to check, or false when the href cannot be parsed
     */
    private function get_checklink ($alink) {
        $p_link = parse_url($alink);
        if (!$p_link) {
            return false;
        }
        if (isset($p_link['scheme'])) {
            return $alink;
        }
        if (isset($p_link['host'])) {
            // Protocol-relative link: only the scheme is missing.
            $scheme = parse_url($this->base, PHP_URL_SCHEME);
            return ($scheme ? $scheme : 'http') . ':' . $alink;
        }
        return $this->base . ltrim($alink, '/');
    }

    /**
     * Tells whether a link belongs to the crawled site and has not been
     * registered as a crawled page yet.
     *
     * @param string $link absolute URL
     * @return bool true when the link should be crawled
     */
    private function check_page_link ($link) {
        return strpos($link, $this->real_root) === 0
            && !isset($this->pages_checked[$link]);
    }

}
--------------------------------------------------------------------------------