├── lang └── en │ └── local_linkchecker.php ├── README.txt ├── version.php ├── tests └── lib_test.php ├── lib.php ├── index.php └── multicheck.php /lang/en/local_linkchecker.php: -------------------------------------------------------------------------------- 1 | . 16 | 17 | $plugin->version = 2015051300; // The (date) version of this module + 2 extra digital for daily versions 18 | // This version number is displayed into /admin/forms.php 19 | $plugin->requires = 2014111005; // Requires at least Moodle 2.8.5 (Build: 20150505) 20 | // The required version was somewhat arbitrarily chosen when adding version.php 21 | // local_linkchecker may in fact work fine with earlier versions of Moodle. 22 | $plugin->component = 'local_linkchecker'; // Full name of the plugin (used for diagnostics). 23 | $plugin->maturity = MATURITY_STABLE; 24 | $plugin->cron = 0; 25 | -------------------------------------------------------------------------------- /tests/lib_test.php: -------------------------------------------------------------------------------- 1 | assertEquals('http://example1.com/', check_for_manual_redirect('')); 19 | // No trailing slash. 20 | $this->assertEquals('http://example2.com', check_for_manual_redirect('')); 21 | // uppercase URL 22 | $this->assertEquals('http://example3.com', check_for_manual_redirect('')); 23 | // Http-equiv has to be before content. 24 | $this->assertEquals(false, check_for_manual_redirect('')); 25 | // No space after redirection time. 26 | $this->assertEquals('http://example5.com', check_for_manual_redirect('')); 27 | } 28 | 29 | public function test_is_bogon() { 30 | // Valid addresses. 31 | $this->assertFalse(is_bogon('http://moodle.org')); 32 | $this->assertFalse(is_bogon('http://moodle.com/testing')); 33 | $this->assertFalse(is_bogon('http://141.101.113.179')); // Moodle.org. 
34 | $this->assertFalse(is_bogon('https://210.187.22.153')); // Google.com 35 | $this->assertFalse(is_bogon('http://doesntexitXXXXYYYYZZZZ.com')); // Not a real site but not a bogon. 36 | 37 | // Bogons. 38 | $this->assertTrue(is_bogon('http://0.0.0.0')); 39 | $this->assertTrue(is_bogon('http://127.0.0.2')); 40 | $this->assertTrue(is_bogon('http://10.255.255.10/moodle')); 41 | $this->assertTrue(is_bogon('https://172.26.0.12')); 42 | $this->assertTrue(is_bogon('http://192.168.10.190:8090')); 43 | } 44 | 45 | public function test_create_handle() { 46 | 47 | $this->assertEquals(false, create_handle('')); 48 | $this->assertEquals(false, create_handle(null)); 49 | 50 | $handle = create_handle('http://createhandle.com'); 51 | $this->assertEquals('http://createhandle.com', curl_getinfo($handle, CURLINFO_EFFECTIVE_URL)); 52 | 53 | // Trailing slash. 54 | $handle = create_handle('http://createhandle.com/'); 55 | $this->assertEquals('http://createhandle.com/', curl_getinfo($handle, CURLINFO_EFFECTIVE_URL)); 56 | 57 | // Subdomain. 58 | $handle = create_handle('http://sub.createhandle.com'); 59 | $this->assertEquals('http://sub.createhandle.com', curl_getinfo($handle, CURLINFO_EFFECTIVE_URL)); 60 | 61 | // A subdirectory. 62 | $handle = create_handle('http://createhandle.com/dir'); 63 | $this->assertEquals('http://createhandle.com/dir', curl_getinfo($handle, CURLINFO_EFFECTIVE_URL)); 64 | 65 | // No "." == invalid. 66 | $handle = create_handle('http://createhandlecom'); 67 | $this->assertEquals($handle, false); 68 | } 69 | 70 | public function test_get_head_fingerprint() { 71 | // A set cookie meta tag from a non-Moodle site. 72 | $head = ''; 73 | $this->assertEquals('000', get_head_fingerprint($head)); 74 | 75 | // None of these seem to exist in a vanilla 2.9 site. 
76 | /*Set\-Cookie: MoodleSessionTest=", 77 | "Set\-Cookie: MoodleSession=", 78 | "Set\-Cookie: MOODLEID\_="*/ 79 | } 80 | 81 | public function test_get_html_fingerprint() { 82 | $html = ''; 83 | $this->assertEquals('0000000000000000000000000000', get_html_fingerprint($html)); 84 | 85 | // The following strings were taken from a vanilla 2.9dev Moodle install (Build: 20141128). 86 | // The commented out tests are those where I could not find the relevant string on the site's home or login page. 87 | // This is presumably since the HTML has changed since the test was originally written. 88 | $html = '' . 89 | '' . 90 | //"var moodle_cfg = \{", // moodle 2 only 91 | //"function openpopup\(url,", // moodle 1.x only 92 | //"function inserttext\(text\)", // moodle 1.x only 93 | '
You are not logged in. (Log in)
' . 94 | //"type=\"hidden\" name=\"testcookies\"", 95 | //"type=\"hidden\" name=\"sesskey\" value=\"", 96 | //"method=\"get\" name=\"changepassword\"", 97 | //"lib\/cookies\.js\"><\/script>", 98 | //"class=\"headermain\">", 99 | //"function getElementsByClassName\(oElm,", 100 | //"
' . 103 | //"lib\/overlib.js", // moodle 1.x 104 | //"src=\"pix\/madewithmoodle", // moodle 1.x 105 | //"function popUpProperties\(inobj\)", // BELOW BEGINS NEW RULES 106 | //"var moodleConfigFn =", 107 | '' . 108 | '\"comboBase\":\"http:\/\/localhost\/moodle\/int\/master\/theme\/yui_combo.php?\"' . 109 | //"M.core_dock.init_genericblock", 110 | '' .//"\/theme\/javascript.php", 111 | 'Skip available courses' . 112 | //"<\/span>
", 113 | '' .//"\n\s+
\n\s+
", 115 | '' .//"'; 117 | $this->assertEquals('11000100000000100001101101011', get_html_fingerprint($html)); 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /lib.php: -------------------------------------------------------------------------------- 1 | prefix.LINKCHECKER_TABLENAME." WHERE unreachable <= %d AND timelinkchecked <= %d AND id < %d AND url <> 'https://moodle.org' ORDER BY id DESC LIMIT %d"; 18 | //sort sites randomly for more evenly distributed use of curl multi handle buffer - too many sequential wait times hog up the buffer. 19 | $GLOBALS['sitessofar'] = null; 20 | $GLOBALS['totalsites'] = $DB->count_records_select(LINKCHECKER_TABLENAME, "unreachable <= " . LINKCHECKER_MAXIMUMUNREACHABLE . " AND timelinkchecked <= {$GLOBALS['timelinkchecked']}"); 21 | } 22 | 23 | /** 24 | * Does the cURL response look like a redirection? 25 | * @return string The URL to redirect to 26 | */ 27 | function check_for_manual_redirect($sitecontent) { 28 | if (preg_match('# $r) { 51 | switch ($r['type']) { 52 | case "A": 53 | $addr = $r['ip']; 54 | $addr = explode('.', $addr); 55 | $addr = array_reverse ($addr, true); 56 | $revaddr = ''; 57 | foreach ($addr as $kk => $v) $revaddr.=$v.'.'; 58 | $a_records[] = $revaddr; 59 | break; 60 | case "AAAA": 61 | $addr = $r['ipv6']; 62 | $addr = str_replace(':', '', $addr); 63 | $addr = str_split($addr); 64 | $addr = array_reverse ($addr, true); 65 | $revaddr = ''; 66 | foreach ($addr as $kk => $v) $revaddr.=$v.'.'; 67 | $aaaa_records[] = $revaddr; 68 | break; 69 | default: 70 | break; 71 | } 72 | } 73 | foreach ($a_records as $k => $record) { 74 | $response = dns_get_record($record.'v4.fullbogons.cymru.com', DNS_A); 75 | if (empty($response)) continue; 76 | if ($response[0]['ip'] == '127.0.0.2') $failure++; 77 | } 78 | foreach ($aaaa_records as $k => $record) { 79 | $response = dns_get_record($record.'v6.fullbogons.cymru.com', DNS_A); 80 | if (empty($response)) continue; 81 | if 
($response[0]['ip'] == '127.0.0.2') $failure++; 82 | } 83 | return $failure > 0; 84 | } 85 | 86 | /** 87 | * Initializes a cURL session 88 | * @return returns a cURL handle or false if an error occured 89 | **/ 90 | function create_handle($url) { 91 | if (trim($url)=='') { 92 | return false; 93 | } 94 | $urlbits = parse_url($url); 95 | $handle = curl_init(); 96 | curl_setopt ($handle, CURLOPT_URL, $url); 97 | if (is_array($urlbits) && array_key_exists('host', $urlbits) && array_key_exists('scheme', $urlbits)) { 98 | if (strpos($urlbits['host'],'.')===false) { 99 | return false; 100 | } 101 | curl_setopt ($handle, CURLOPT_HTTPHEADER, array('Cache-Control: no-cache', 'Accept: text/plain, text/html', 'Host: '.$urlbits['host'], 'Connection: close')); 102 | } else { 103 | return false; 104 | } 105 | curl_setopt ($handle, CURLOPT_USERAGENT, 'Moodle.org Link Checker (http://moodle.org/sites/)'); 106 | curl_setopt ($handle, CURLOPT_MAXCONNECTS, 1024); 107 | curl_setopt ($handle, CURLOPT_FRESH_CONNECT, TRUE); 108 | curl_setopt ($handle, CURLOPT_RETURNTRANSFER, TRUE); 109 | curl_setopt ($handle, CURLOPT_FAILONERROR, true); 110 | curl_setopt ($handle, CURLOPT_FOLLOWLOCATION, TRUE); 111 | curl_setopt ($handle, CURLOPT_MAXREDIRS, LINKCHECKER_MAXREDIRECTS); 112 | curl_setopt ($handle, CURLOPT_COOKIEFILE, '/dev/null'); 113 | curl_setopt ($handle, CURLOPT_AUTOREFERER, true); 114 | curl_setopt ($handle, CURLOPT_DNS_USE_GLOBAL_CACHE, false); 115 | curl_setopt ($handle, CURLOPT_HEADER, true); 116 | curl_setopt ($handle, CURLOPT_SSL_VERIFYPEER, false); 117 | curl_setopt ($handle,CURLOPT_ENCODING , "gzip"); // for some curl OPTIONAL speed and perhaps more accomodating 118 | //1 to check the existence of a common name in the SSL peer certificate. 119 | //2 to check the existence of a common name and also verify that it matches the hostname provided. 120 | //In production environments the value of this option should be kept at 2 (default value). 
121 | //Support for value 1 removed in cURL 7.28.1 122 | curl_setopt ($handle, CURLOPT_SSL_VERIFYHOST, 0); 123 | curl_setopt ($handle, CURLOPT_TIMEOUT, LINKCHECKER_MAXCURLTIMEOUT); 124 | curl_setopt ($handle, CURLOPT_CONNECTTIMEOUT, LINKCHECKER_MAXCONNECTIONTIMEOUT); 125 | // curl_setopt ($handle, CURLOPT_INTERFACE, "184.172.24.2"); 126 | return $handle; 127 | } 128 | 129 | function get_head_fingerprint($head) { 130 | $fingerprint = ''; 131 | $rules = array("Set\-Cookie: MoodleSessionTest=", 132 | "Set\-Cookie: MoodleSession=", 133 | "Set\-Cookie: MOODLEID\_="); 134 | foreach ($rules as &$header_regex) { 135 | if (preg_match("/".$header_regex."/", $head)) { 136 | $fingerprint .= '1'; 137 | } else { 138 | $fingerprint .= '0'; 139 | } 140 | } 141 | return $fingerprint; 142 | } 143 | 144 | function get_html_fingerprint($html) { 145 | $fingerprint = ''; 146 | $rules = array("lib\/javascript\-static\.js\"><\/script>", 147 | "content=\"moodle,", 148 | "var moodle_cfg = \{", // moodle 2 only 149 | "function openpopup\(url,", // moodle 1.x only 150 | "function inserttext\(text\)", // moodle 1.x only 151 | "
", 152 | "type=\"hidden\" name=\"testcookies\"", 153 | "type=\"hidden\" name=\"sesskey\" value=\"", 154 | "method=\"get\" name=\"changepassword\"", 155 | "lib\/cookies\.js\"><\/script>", 156 | "class=\"headermain\">", 157 | "function getElementsByClassName\(oElm,", 158 | "
", 161 | "lib\/overlib.js", // moodle 1.x 162 | "src=\"pix\/madewithmoodle", // moodle 1.x 163 | "function popUpProperties\(inobj\)", // BELOW BEGINS NEW RULES 164 | "var moodleConfigFn =", 165 | "<\/span>
", 171 | "\n\s+
\n\s+
", 173 | // some moodles force login and we end up at login page. these are rules specific to moodle login page scoring. 174 | ""); 176 | 177 | foreach ($rules as &$body_regex) { 178 | if (preg_match("/".$body_regex."/", $html)) { 179 | $fingerprint .= '1'; 180 | } else { 181 | $fingerprint .= '0'; 182 | } 183 | } 184 | return $fingerprint; 185 | } 186 | -------------------------------------------------------------------------------- /index.php: -------------------------------------------------------------------------------- 1 | libdir.'/tablelib.php'); 6 | 7 | define('LINKCHECKER_DIR', 'local/linkchecker'); 8 | 9 | $limitnum = optional_param('limitnum', 400, PARAM_INT); //for now a fixed 400 out of 183000 for 95% (-/+5%) confidence. 10 | 11 | $PAGE->set_context(context_system::instance()); 12 | $PAGE->set_url(new moodle_url('/local/linkchecker/')); 13 | $PAGE->requires->jquery(); 14 | 15 | $isadmin = ismoodlesiteadmin(); 16 | if (!$isadmin) { 17 | redirect('http://moodle.net/sites'); //shoo 18 | } 19 | $chkurl = optional_param('url', null, PARAM_URL); 20 | $siteid = optional_param('siteid', null, PARAM_INT); 21 | 22 | $totrecs = $DB->count_records('hub_site_directory'); 23 | 24 | $img = optional_param('img', null, PARAM_URL); 25 | if (!is_null($siteid) && !is_null($img)) { 26 | $im = getcoverageimg($totrecs, $siteid, true); //get an overlay transparent png for indication. 27 | header('content-type: image/png'); 28 | // header("Content-Length: " . filesize($name)); 29 | imagepng($im); 30 | die(); 31 | } 32 | 33 | if ($chkurl !== null) { 34 | list($hdrs, $redircnt) = reachaUrl($chkurl.'/lib/womenslib.php'); 35 | if ($hdrs == false) { 36 | //return some error info. 37 | $err = error_get_last(); 38 | $result = $err['message']; 39 | } else { 40 | //also can check login/forgot_password.php ..for some stuff... 41 | // anyway the point is humans check and feedback. Devs filter that feedback and transform into rules.. barring AI ofcourse. 
42 | $result = htmlspecialchars($hdrs[0]); // ' Mod:'. $hdrs[3]); //http code and modified. 43 | } 44 | // $result = '200 OK'; //test. 45 | echo json_encode(array('result'=> 'redirections:'. $redircnt. ' final response:'. $result, 'siteid' => $siteid, 'imgsrc' => $CFG->wwwroot.'/local/linkchecker/index.php?siteid='.$siteid.'&img=1')); 46 | die(); 47 | } 48 | 49 | $PAGE->navbar->add('Registered sites', new moodle_url('/local/linkchecker/')); 50 | $PAGE->set_title(get_string('registeredmoodlesites_moodlenet', 'local_hub')); 51 | $PAGE->set_heading(get_string('registeredmoodlesites', 'local_hub')); 52 | 53 | 54 | echo $OUTPUT->header(); 55 | 56 | $limitfrom = optional_param('limitfrom', rand(1, $totrecs-$limitnum), PARAM_INT); 57 | 58 | list($where, $params) = local_hub_stats_get_confirmed_sql(); 59 | 60 | $allfailedrecids = $DB->get_fieldset_sql('Select id from {hub_site_directory} r WHERE NOT('. $where. ')' , $params); //get all failed site ids. 61 | 62 | $randomrecordids = array(); 63 | 64 | if (is_null($siteid) && $totrecs > 0 && !empty($allfailedrecids)) { 65 | $id=0; 66 | while (count($randomrecordids)<$limitnum) { 67 | while(!in_array($id=rand(0, $totrecs-1), $randomrecordids) && in_array($id,$allfailedrecids)) { //get unique random list of ids. 68 | $randomrecordids[] = $id; 69 | break; 70 | } 71 | } 72 | } else { 73 | $randomrecordids[] = $siteid; 74 | } 75 | 76 | list($in_sql, $params) = $DB->get_in_or_equal($randomrecordids, SQL_PARAMS_NAMED, 'r', true); 77 | $im = getcoverageimg($totrecs, $randomrecordids); 78 | $failedrecs = $DB->get_records_sql('Select id, url, countrycode, privacy, unreachable, score, fingerprint, errormsg ' 79 | . 
'from {hub_site_directory} r WHERE id '.$in_sql , $params); 80 | 81 | // Outputs table 82 | $table = new html_table(); 83 | // $table->attributes['class'] = 'collection'; 84 | 85 | $table->head = array( 86 | "id", 87 | 'url', 88 | "CC", 89 | 'unreachable', 90 | 'score', 91 | 'privacy', 92 | 'Previous fingerprint OR cron-linkchecker:errormsg', 93 | 'Now checking for womens liberty..' 94 | ); 95 | $table->colclasses = array(); 96 | $scorefield = ''; 97 | 98 | foreach ($failedrecs as $rec) { 99 | 100 | $cell = new html_table_cell('Checking..'); 101 | $cell->attributes = array('siteid' => $rec->id, 'class' => 'manualcheck', 'url' => $rec->url); 102 | 103 | $trackerparams = array ( 104 | 'pid'=>'10020', 'issuetype'=>'3', //tracker.moodle.org : moodle community sites project id. 105 | 'description' => 'see linkcheck test page @ '.$CFG->wwwroot.'/local/linkchecker/index.php?siteid='.$rec->id, 106 | 'components' => '12633', //tracker.moodle.org : moodle community sites project's 'moodle.net' component id. 107 | 'summary' => 'linkchecker found to have missed moodle site (id '.$rec->id.')', 108 | 'security' => '10030' //, 'schemeId' => '10000' //set atleast 'could be a security issue' as site privacy settings can be changing/changed on moodle.net 109 | ); 110 | if (!is_null($siteid)) { 111 | $scorefield = (!in_array($siteid, $allfailedrecids)) ?'(moodley)':''; 112 | } 113 | $row = array($rec->id, 114 | ' Browse it |' 115 | .' Linkcheck it |' 116 | .' Report it ' 117 | ,''.$rec->countrycode.'' 118 | , $rec->unreachable, $rec->score. 
$scorefield, $rec->privacy, (strlen($rec->errormsg)>0)?$rec->errormsg:'fingerprint:'.$rec->fingerprint, $cell); 119 | $table->data[] = $row; 120 | } 121 | $htmltable = html_writer::table($table); 122 | 123 | list($sql, $params) = local_hub_stats_get_confirmed_sql(); 124 | $sql = "SELECT count(*) as onlinesitescount FROM {hub_site_directory} r WHERE ".$sql; 125 | $totsitesonline = $DB->get_record_sql($sql, $params); 126 | 127 | echo 'Total sites: '. $totrecs. ' | Total online & moodley: '.$totsitesonline->onlinesitescount .' | Offline sites loaded in table: '. $limitnum. ' | '; 128 | echo 'Checked: 0 | not failed: 0 | Desired Fails: 0 | linkchecking fraction (desiredfails/checked): '; 129 | 130 | echo '

Test coverage of unmoodley sites:

Flags for unmoodley sites detected online:
'; 137 | 138 | echo '
'; 139 | 140 | echo '
Default sample size 400 is for a 95% (+/-5%) confidence test. Adjustable in url by appending "?limitnum=xxx". Please remember to adjust your linkchecking percentage after human checking and raise moodley sites from here to developers.'; 141 | 142 | echo $htmltable ; 143 | // $nonmoodlecnt = $totrecs-$totsitesonline->onlinesitescount; 144 | 145 | $jsscr = << 147 | function linkchecker() { 148 | var chkcnt; 149 | var failcnt; 150 | var notfailcnt; 151 | var percentage; 152 | var confidence; 153 | var Z = 1.96; //for 95% confidence in a std distribution (about Random 400 samples for over 183k sites will be good) 154 | $(".manualcheck").each(function(index, element){ 155 | element.innerHTML = 'Checking, sent to server, awaiting response..'; 156 | $.get('index.php', 157 | { url: this.getAttribute('url'), 158 | siteid: this.getAttribute('siteid') 159 | }, 160 | function(responseText) { 161 | responseText = jQuery.parseJSON(responseText); 162 | console.log(responseText); 163 | element.innerHTML = responseText.result; 164 | chkcnt = $(".chkcnt").html(); 165 | chkcnt++; 166 | $(".chkcnt").html(chkcnt); 167 | if (responseText.result.indexOf("200 OK") >= 0 || responseText.result.indexOf("303 See Other") >= 0) { 168 | element.innerHTML = '' + responseText.result + ' (Human verification preferred.)'; 169 | notfailcnt = $(".notfailcnt").html(); 170 | notfailcnt++; 171 | $(".notfailcnt").html(notfailcnt); 172 | var samplingrs = $(".samplingresult"); 173 | $(".samplingresult").append(''); 174 | } else { 175 | failcnt = $(".failcnt").html(); 176 | failcnt++; 177 | $(".failcnt").html(failcnt); 178 | console.log("desired fail count:"+failcnt); 179 | percentage = failcnt / chkcnt; 180 | console.log("linkchecker percentage(%):"+percentage); 181 | $(".perc").html(percentage); 182 | } 183 | } 184 | ); 185 | }); 186 | } 187 | linkchecker(); 188 | 189 | js; 190 | 191 | echo $jsscr; // bloody yui 192 | 193 | echo $OUTPUT->footer(); 194 | 195 | function getcoverageimg($totrecs, 
$randomrecordids, $highlightrecs=false) { 196 | core_php_time_limit::raise(300); 197 | $width = 18000; $height = 20; $padding = 5; 198 | if ($totrecs == 0) { 199 | // Make it work even on an empty hub. 200 | $totrecs = 1; 201 | } 202 | $column_width = $width / $totrecs ; 203 | $im = imagecreate($width,$height); 204 | imagesavealpha($im, true); 205 | //setting completely transparent color 206 | $transparent = imagecolorallocatealpha($im, 0, 0, 0, 127); 207 | $gray = imagecolorallocate ($im,0x4c,0x4c,0x4c); 208 | $gray_lite = imagecolorallocate ($im,0x7e,0x7e,0x7e); 209 | $gray_dark = imagecolorallocate ($im,0x05,0x05,0x05); 210 | $white = imagecolorallocate ($im,0xff,0xff,0xff); 211 | $red = imagecolorallocate ($im,0xee,0x10,0x10); 212 | 213 | $maxv = 1; 214 | if (is_array($randomrecordids)) { 215 | imagefilledrectangle($im,0,0,$width,$height,$white); 216 | //left in some column ht code (c&p) 217 | for($i=0;$i<$totrecs;$i++) 218 | { 219 | $column_height = ($height * ((int) in_array($i, $randomrecordids))); 220 | $x1 = $i*$column_width; 221 | $y1 = $height-$column_height; 222 | $x2 = (($i+1)*$column_width)-$padding; 223 | $y2 = $height; 224 | imagefilledrectangle($im,$x1,$y1,$x2,$y2,$highlightrecs?$red:$gray); 225 | } 226 | } else if (is_int ($randomrecordids)) { //highlight - transparent fill 227 | imagefilledrectangle($im,0,0,$width,$height,$transparent); 228 | $column_height = $height; 229 | $x1 = $randomrecordids*$column_width; 230 | $y1 = $height-$column_height; 231 | $x2 = (($randomrecordids+1)*$column_width)-$padding; 232 | $y2 = $height; 233 | imagefilledrectangle($im,$x1,$y1,$x2,$y2,$highlightrecs?$red:$gray); 234 | } 235 | return $im; 236 | } 237 | 238 | function reachaUrl ($url, $redirectcount=0) { 239 | stream_context_set_default(array( 240 | 'http' => array( 241 | 'method' => 'HEAD' 242 | ) 243 | )); 244 | $headers = get_headers($url, 1); 245 | if ($headers !== false && isset($headers['Location'])) { 246 | reachaUrl($headers['Location'], 
++$redirectcount); 247 | } 248 | return array($headers, $redirectcount); 249 | } 250 | -------------------------------------------------------------------------------- /multicheck.php: -------------------------------------------------------------------------------- 1 | #!/usr/bin/php 2 | \n"; exit; } 9 | //allow script to run on different set while earlier scripts wait for timeouts etc in buffer (timelinkchecked is updated upon buffer filling) 10 | $GLOBALS['lockfile'] = "./.multicheck.lock.".rand(1,3); 11 | declare(ticks = 1); 12 | 13 | // setup signal handlers 14 | pcntl_signal(SIGTERM, "cleanup"); 15 | pcntl_signal(SIGINT, "cleanup"); 16 | 17 | // signal handler function 18 | function cleanup($signal) { 19 | global $lockfile,$multihandle; 20 | echo "\nCaught signal ".$signal.", cleaning up\n"; 21 | unlink($lockfile); 22 | curl_multi_close($multihandle); 23 | exit; 24 | } 25 | 26 | if (file_exists($lockfile)) { 27 | $self = basename(__FILE__); 28 | $pids = `ps axw |grep $self |grep -v grep |awk '{print \$1}'`; 29 | $pids = trim($pids); 30 | if (!empty($pids)) { 31 | $mypid = posix_getpid(); 32 | $pidarr = explode("\n", $pids); 33 | foreach ($pidarr as $pid) { 34 | if ($pid == $mypid) { continue; } 35 | print "Another instance is already running, killing pid $pid\n"; 36 | exec("kill -9 $pid"); 37 | } 38 | touch($lockfile); 39 | } 40 | }else { 41 | touch($lockfile) or die("Unable to create $lockfile\n"); 42 | } 43 | 44 | $overallstarttime = microtime(true); 45 | require_once("../../config.php"); 46 | 47 | error_reporting(E_ALL); 48 | ini_set('display_errors','On'); 49 | 50 | echo "\n\nStarting...\n"; 51 | 52 | define_globals(); 53 | 54 | $maxsitecurls = 20; 55 | $sitecurlsrunning = 0; 56 | $GLOBALS['multihandle'] = curl_multi_init(); 57 | $curledsofar = 0; 58 | 59 | echo "$totalsites To Process\n"; 60 | 61 | $outcome = fill_site_buffer(); 62 | if ($outcome===false) die('WHOOAAA no sites'); 63 | 64 | $sitespassed = 0; 65 | $sitesfailed = 0; 66 | $siteserrored = 0; 

$moodleypage = 'login/index.php'; // This should be accessible and constant over time.

// Scheduler loop: keep up to $maxsitecurls transfers in flight, refill the site buffer
// when it runs dry, and score every transfer as soon as cURL reports it finished.
for (;;) {

    // Top up the multi handle until the concurrency cap is hit or the buffer is empty.
    while ($sitecurlsrunning < $maxsitecurls && count($sitebuffer) > 0) {
        $site = array_shift($sitebuffer);

        $skipreason = null;
        $handle = false;
        if (is_bogon($site->url)) {
            $skipreason = 'Non-routable IP found - Didnt attempt curl';
        } else {
            $handle = create_handle($site->url);
            if ($handle === false) {
                $skipreason = 'Malformed URL - Didnt attempt curl';
            }
        }

        if ($skipreason !== null) {
            update_site($site, -1, ((int)$site->unreachable + 1), $skipreason);
            // The fingerprint argument is passed empty so that the remaining values line up
            // with writeline()'s Redir / ErNum / Error Msg columns.
            writeline($site->id, $site->url, 'F', '-', '', '0', '0', $skipreason);
            $siteserrored++;
            continue;
        }

        curl_multi_add_handle($multihandle, $handle);
        $sitesrunning[linkchecker_handle_key($handle)] = $site;
        $sitecurlsrunning++;
        $curledsofar++;
    }

    // Only hit the database when the buffer is empty; false means there is nothing left to load.
    $filledbuffer = (count($sitebuffer) === 0) ? fill_site_buffer() : true;

    if ($sitecurlsrunning == 0 && $filledbuffer === false) {
        break;
    }

    curl_multi_select($multihandle);
    while (($mcRes = curl_multi_exec($multihandle, $mcActive)) == CURLM_CALL_MULTI_PERFORM);
    if ($mcRes != CURLM_OK) {
        break;
    }

    while ($done = curl_multi_info_read($multihandle)) {
        $handle = $done['handle'];
        // Computed once, while the handle is still open, and reused for the unset() below.
        $handlekey = linkchecker_handle_key($handle);

        $sitecontent = curl_multi_getcontent($handle);

        $site = $sitesrunning[$handlekey];
        $site->originalurl = $site->url;
        $site->url = curl_getinfo($handle, CURLINFO_EFFECTIVE_URL);

        $curl_error = curl_error($handle);
        if (!empty($curl_error)) {
            $curl_error_string = clean_param($curl_error, PARAM_NOTAGS);
            update_site($site, -1, ((int)$site->unreachable + 1), $curl_error_string);
            writeline($site->id, $site->url, 'F', '-', '', curl_getinfo($handle, CURLINFO_REDIRECT_COUNT), curl_errno($handle), $curl_error_string);
            $siteserrored++;
        } else {
            $manualredirect = check_for_manual_redirect($sitecontent);
            if ($manualredirect !== false) {
                // The page asked for a redirect in its content: queue the target for another pass.
                $oldurl = $site->url;
                $outcome = reinsert_site_into_buffer($site, $manualredirect);
                if ($outcome === false) {
                    update_site($site, '', ((int)$site->unreachable + 1), 'Max manual redirects exceeded');
                    writeline($site->id, $oldurl, '', '', '', $site->manualredirect, '', 'Maximum manual redirects exceeded: '.$site->manualredirect);
                } else {
                    writeline($site->id, $oldurl, '', '', '', $site->manualredirect, '', 'Manual redirect '.$site->url);
                }
            } else {
                $outcome = link_checker_test_result($site, $handle, $sitecontent);
                if ($outcome) {
                    $sitespassed++;
                } else if (!preg_match('#'.$moodleypage.'#', $site->url)) {
                    // Front pages are heavily customised, so before giving up also score
                    // the login page ($moodleypage) of the same site.
                    $oldurl = $site->url;

                    $joiner = (substr(trim($site->url), -1) == '/') ? '' : '/';
                    $newurl = $site->url . $joiner . $moodleypage;

                    $outcome = reinsert_site_into_buffer($site, $newurl);
                    if ($outcome === false) {
                        update_site($site, '', ((int)$site->unreachable + 1), 'Max manual redirects exceeded (moodleypage)');
                        writeline($site->id, $oldurl, '', '', '', $site->manualredirect, '', 'Maximum manual redirects exceeded : '.$newurl);
                    } else {
                        writeline($site->id, $oldurl, '', '', '', $site->manualredirect, '', 'Extra page check redirect for '.$site->url);
                    }
                } else {
                    $sitesfailed++;
                }
            }
        }

        curl_multi_remove_handle($multihandle, $handle);
        curl_close($handle);
        $sitecurlsrunning--;
        unset($sitesrunning[$handlekey]);
        echo " [ stat: $curledsofar/$totalsites curls:$sitecurlsrunning/$maxsitecurls buf: ".count($sitebuffer).' ]';
    }
}
curl_multi_close($multihandle);
unlink($lockfile);
echo "\n\nProcess Complete\nPassed: $sitespassed\tFailed: $sitesfailed\tErrored: $siteserrored";
echo "\nTotal time: ". (microtime(true)-$overallstarttime)."\n\n";
flush();

/**
 * Returns a scalar usable as an array key for a cURL easy handle.
 *
 * Casting the handle to a string only works while handles are resources; from PHP 8.0
 * they are CurlHandle objects and the cast is a fatal error, so objects are keyed by
 * their object id instead.
 *
 * @param resource|object $handle cURL easy handle
 * @return int
 */
function linkchecker_handle_key($handle) {
    return is_object($handle) ? spl_object_id($handle) : (int) $handle;
}

/**
 * Update the site's record in the link checker table (LINKCHECKER_TABLENAME).
 *
 * @param stdClass $site site row being checked; id, timeunreachable and (optionally) redirectto are read
 * @param int|string $score fingerprint score to store
 * @param int $unreachable consecutive-failure counter to store, 0 when the site was reached
 * @param string $errormessage error text to store
 * @param string|null $moodlerelease detected release, stored only when not empty
 * @param string|null $serverstring value of the Server response header, stored only when not empty
 * @param string|null $fingerprint head/html rule fingerprint
 * @return bool always true
 */
function update_site(&$site, $score='', $unreachable=0, $errormessage='', $moodlerelease=null, $serverstring=null, $fingerprint=null) {
    global $DB;

    $record = new stdClass;
    $record->id = $site->id;
    $record->timelinkchecked = time();

    // Connection problems and timeouts say nothing about the site itself, so the counters
    // are reset for a later re-check. Matching is done on message fragments because the
    // exact wording depends on the cURL build.
    // @todo the cURL error number is not used here.
    $transient = strpos($errormessage, "Connection") !== false
        || strpos($errormessage, "timed out") !== false
        || strpos($errormessage, "milliseconds") !== false;
    if ($transient) {
        $unreachable = 0;
        $score = 0;
        $record->override = 2;
    }

    $record->unreachable = $unreachable;
    $record->score = $score;
    $record->errormsg = $errormessage;
    $record->fingerprint = $fingerprint;

    if (isset($site->redirectto)) {
        $record->redirectto = $site->redirectto;
    }
    if ($moodlerelease != null) {
        $record->moodlerelease = $moodlerelease;
    }
    if ($serverstring != null) {
        $record->serverstring = $serverstring;
    }

    // Remember when the site first became unreachable; clear the stamp once it is back.
    if ($unreachable != 0 && $site->timeunreachable == 0) {
        $record->timeunreachable = time();
    } else if ($unreachable == 0) {
        $record->timeunreachable = 0;
    }

    $DB->update_record(LINKCHECKER_TABLENAME, $record);
    return true;
}

/**
 * Requeue the site for examination.
 * The site may have responded with a redirect or we need to check a page other than the front page.
 *
 * $newurl may be absolute, protocol-relative ("//host/path"), host-relative ("/path") or
 * relative to the directory of the site's current URL. The site's url is replaced by the
 * resolved target and its manualredirect counter is incremented even when the limit is hit.
 *
 * @param stdClass $site site being checked; url and manualredirect are modified
 * @param string $newurl redirect target
 * @return bool true when the site was queued again, false when the current URL cannot be
 *              parsed or LINKCHECKER_MAXREDIRECTS has been reached
 */
function reinsert_site_into_buffer($site, $newurl) {
    global $sitebuffer;

    $urlbits = parse_url($site->url);
    if (!is_array($urlbits) || !array_key_exists('host', $urlbits) || !array_key_exists('scheme', $urlbits)) {
        return false;
    }

    if (!preg_match('#^https?://#i', $newurl)) {
        if (strpos($newurl, '//') === 0) {
            // Protocol-relative target: only the scheme is inherited.
            $newurl = $urlbits['scheme'].':'.$newurl;
        } else {
            if (strpos($newurl, '/') !== 0) {
                // Relative target: resolve against the directory of the current path.
                $path = empty($urlbits['path']) ? '/' : $urlbits['path'];
                $lastslash = strrpos($path, '/');
                $basedir = ($lastslash === false) ? '/' : substr($path, 0, $lastslash + 1);
                // Only leading "./" segments are dropped; "../" is left for cURL to resolve.
                $newurl = $basedir.preg_replace('#^(\./)+#', '', $newurl);
            }
            // Keep an explicit port, but do not invent one: forcing ":80" breaks https sites.
            $authority = $urlbits['host'];
            if (!empty($urlbits['port'])) {
                $authority .= ':'.$urlbits['port'];
            }
            $newurl = $urlbits['scheme'].'://'.$authority.$newurl;
        }
    }

    $site->url = $newurl;
    $site->manualredirect++;
    if ($site->manualredirect >= LINKCHECKER_MAXREDIRECTS) {
        return false;
    }

    $sitebuffer[] = $site;
    return true;
}

/**
 * Load a subset of sites to examine via cURL into the global $sitebuffer.
 *
 * Sites are read in descending id order, LINKCHECKER_SITEBUFFERLIMIT at a time, starting
 * below the id of the last site loaded by the previous call.
 *
 * @return bool false when no more sites could be loaded, true otherwise
 **/
function fill_site_buffer() {
    global $sitebuffer, $DB, $siteselectorsql, $sitessofar, $timelinkchecked;

    static $lastsiteid;
    static $runhasfailed;

    // Once a query has come back empty there is nothing more to fetch in this run.
    if ($runhasfailed === true) {
        return false;
    }

    if ($lastsiteid == null) {
        $lastsiteid = 1000000000;
    }
    if ($runhasfailed == null) {
        $runhasfailed = false;
    }
    if ($sitessofar == null) {
        $sitessofar = 0;
    }

    echo "\nFilling Buffer with " . LINKCHECKER_SITEBUFFERLIMIT . " sites starting from id ".$lastsiteid."\r\n";

    $sql = sprintf($siteselectorsql, LINKCHECKER_MAXIMUMUNREACHABLE, $timelinkchecked, $lastsiteid, LINKCHECKER_SITEBUFFERLIMIT);
    $sites = $DB->get_records_sql($sql);

    if (!is_array($sites) || count($sites) == 0) {
        $runhasfailed = true;
        return false;
    }
    $sitessofar += count($sites);

    foreach ($sites as $site) {
        $site->manualredirect = 0;
        $sitebuffer[] = $site;

        // Update timelinkchecked early.
        // This is useful when running multiple linkchecker processes to go faster when testing fingerprinting.
        $site->timelinkchecked = time();
        $DB->update_record(LINKCHECKER_TABLENAME, $site);
    }

    // $site is the last row of this batch; an unchanged id means no progress was made.
    if ($lastsiteid === $site->id) {
        return false;
    }

    $lastsiteid = $site->id;
    return true;
}

/**
 * Examines the header and html of a cURL response to determine whether the response came from a Moodle site.
 *
 * The score is two points per matching header rule plus one point per matching html rule;
 * a score of 5 or more counts as a Moodle site. The result is stored with update_site()
 * and reported with writeline() in both cases.
 *
 * @param stdClass $site site being checked; redirectto is set when the effective URL differs from originalurl
 * @param resource|object $handle finished cURL easy handle
 * @param string $html raw response, headers followed by body
 * @return bool true when the response scored as a Moodle site
 */
function link_checker_test_result(&$site, $handle, $html) {

    $headersize = curl_getinfo($handle, CURLINFO_HEADER_SIZE);
    $head = trim((string) substr($html, 0, $headersize));
    $html = trim((string) substr($html, $headersize));

    // Anchored per line so the header is also found when it is the last one (the trailing
    // line break has been trimmed off above) and regardless of letter case.
    $serverstring = null;
    if (preg_match('/^Server:[ \t]*(.*)$/mi', $head, $smatches)) {
        $serverstring = trim($smatches[1]);
    }

    $fingerprint = ''; // Reflects which rules matched: "<head bits>/<html bits>".
    $headscore = 0;
    if (strlen($head) > 10) {
        $headfingerprint = get_head_fingerprint($head);
        $headscore = 2 * substr_count($headfingerprint, '1'); // 2 points for each match.
        $fingerprint .= $headfingerprint;
    }
    $fingerprint .= '/';

    $htmlscore = 0;
    $moodlerelease = null;
    if (strlen($html) > 40) {
        $htmlfingerprint = get_html_fingerprint($html);
        $htmlscore = substr_count($htmlfingerprint, '1');
        $fingerprint .= $htmlfingerprint;

        if (preg_match('/(?:title="Moodle )(.{0,20})(?: \((Build: |))(\d+)(?:\)")/i', $html, $matches)) {
            $moodlerelease = $matches[1]." (".$matches[3].")";
        }
    }

    $score = $htmlscore + $headscore;
    $passed = ($score >= 5);

    if ($passed) {
        $effectiveurl = curl_getinfo($handle, CURLINFO_EFFECTIVE_URL);
        if ($effectiveurl != $site->originalurl) {
            $site->redirectto = $effectiveurl;
        }
    }

    // The site was reached either way, so unreachable is stored as 0 for pass and fail alike.
    update_site($site, $score, 0, '', $moodlerelease, $serverstring, $fingerprint);

    if ($moodlerelease == null) {
        $moodlerelease = "Unknown";
    }
    $scorecolumn = (string)$htmlscore.'/'.(string)$headscore;
    $redirects = curl_getinfo($handle, CURLINFO_REDIRECT_COUNT);
    if ($passed) {
        writeline($site->id, $site->url, 'P', $scorecolumn, (string)$fingerprint, $redirects, '-', '', $moodlerelease);
    } else {
        writeline($site->id, $site->url, 'F', $scorecolumn, (string)$fingerprint, $redirects, '0', 'Failed Check with score '.(string)$score, $moodlerelease);
    }
    return $passed;
}

/**
 * Formats the supplied parameters into a pipe separated string and echoes it out.
/**
 * Echoes one pipe separated status row, preceded by the column header on the first call.
 *
 * Rows with an empty $outcome are continuation rows (redirect notices): their counter
 * cell is left blank and the running row counter is not advanced.
 *
 * @param int|string $id site id, fitted to 10 characters
 * @param string $url site URL, fitted to 50 characters
 * @param string $outcome 'P', 'F' or '' for a continuation row, fitted to 4 characters
 * @param string $score "body/head" score, fitted to 18 characters
 * @param string $fingerprint rule fingerprint, printed as is
 * @param int|string $redirects redirect count, fitted to 5 characters
 * @param int|string $errorno error number, printed as is
 * @param string $errormsg error message, printed as is
 * @param string $moodlerelease detected release, fitted to 24 characters
 * @return void
 */
function writeline($id, $url, $outcome='F', $score='0', $fingerprint='', $redirects='0', $errorno='', $errormsg='', $moodlerelease='') {
    static $headerprinted = false;
    static $count = 0;

    if (!$headerprinted) {
        $hdstr = "\nC |ID | URL | P/F | Score (Body/Head) | Fingerprint | Redir |ErNum | Version | Error Msg";
        echo $hdstr;
        echo "\n".str_repeat('-', strlen($hdstr));
        $headerprinted = true;
    }

    // Truncate or right-pad a value to exactly $width characters. One rule for every
    // fixed-width column, so the test and the padding can no longer disagree on the width.
    $fit = function ($value, $width) {
        return str_pad((string) substr((string) $value, 0, $width), $width);
    };

    if (trim((string) $outcome) === '') {
        // Continuation row: blank counter cell, counter untouched.
        $countstr = str_repeat(' ', 4);
    } else {
        $count++;
        $countstr = $fit($count, 4);
    }

    // Fingerprint, error number and error message are deliberately left unpadded.
    echo "\n".$countstr.'|'.$fit($id, 10).'| '.$fit($url, 50).'| '.$fit($outcome, 4)
        .'| '.$fit($score, 18).'| '.$fingerprint.'| '.$fit($redirects, 5).'| '.$errorno
        .'| '.$fit($moodlerelease, 24).'| '.$errormsg;
    flush();
}