",
115 | '' .//"';
117 | $this->assertEquals('11000100000000100001101101011', get_html_fingerprint($html));
118 | }
119 | }
120 |
--------------------------------------------------------------------------------
/lib.php:
--------------------------------------------------------------------------------
1 | prefix.LINKCHECKER_TABLENAME." WHERE unreachable <= %d AND timelinkchecked <= %d AND id < %d AND url <> 'https://moodle.org' ORDER BY id DESC LIMIT %d";
18 | //sort sites randomly for more evenly distributed use of curl multi handle buffer - too many sequential wait times hog up the buffer.
19 | $GLOBALS['sitessofar'] = null;
20 | $GLOBALS['totalsites'] = $DB->count_records_select(LINKCHECKER_TABLENAME, "unreachable <= " . LINKCHECKER_MAXIMUMUNREACHABLE . " AND timelinkchecked <= {$GLOBALS['timelinkchecked']}");
21 | }
22 |
23 | /**
24 | * Does the cURL response look like a redirection?
25 | * @return string The URL to redirect to
26 | */
27 | function check_for_manual_redirect($sitecontent) {
28 | if (preg_match('#
$r) {
51 | switch ($r['type']) {
52 | case "A":
53 | $addr = $r['ip'];
54 | $addr = explode('.', $addr);
55 | $addr = array_reverse ($addr, true);
56 | $revaddr = '';
57 | foreach ($addr as $kk => $v) $revaddr.=$v.'.';
58 | $a_records[] = $revaddr;
59 | break;
60 | case "AAAA":
61 | $addr = $r['ipv6'];
62 | $addr = str_replace(':', '', $addr);
63 | $addr = str_split($addr);
64 | $addr = array_reverse ($addr, true);
65 | $revaddr = '';
66 | foreach ($addr as $kk => $v) $revaddr.=$v.'.';
67 | $aaaa_records[] = $revaddr;
68 | break;
69 | default:
70 | break;
71 | }
72 | }
73 | foreach ($a_records as $k => $record) {
74 | $response = dns_get_record($record.'v4.fullbogons.cymru.com', DNS_A);
75 | if (empty($response)) continue;
76 | if ($response[0]['ip'] == '127.0.0.2') $failure++;
77 | }
78 | foreach ($aaaa_records as $k => $record) {
79 | $response = dns_get_record($record.'v6.fullbogons.cymru.com', DNS_A);
80 | if (empty($response)) continue;
81 | if ($response[0]['ip'] == '127.0.0.2') $failure++;
82 | }
83 | return $failure > 0;
84 | }
85 |
/**
 * Initializes a cURL session for fetching one registered site.
 *
 * The URL is validated BEFORE any cURL resource is allocated, so a rejected
 * URL never leaves an orphaned handle behind.
 *
 * @param string $url The site URL to fetch.
 * @return resource|CurlHandle|false A configured cURL handle, or false when the URL is blank,
 *         lacks a scheme or host, has a host without a dot, or cURL could not be initialised.
 **/
function create_handle($url) {
    if (trim($url) == '') {
        return false;
    }

    // Reject anything that does not parse into at least scheme + host.
    $urlbits = parse_url($url);
    if (!is_array($urlbits) || !array_key_exists('host', $urlbits) || !array_key_exists('scheme', $urlbits)) {
        return false;
    }
    // Hosts without a dot (e.g. "localhost", bare intranet names) are not checked.
    if (strpos($urlbits['host'], '.') === false) {
        return false;
    }

    $handle = curl_init();
    if ($handle === false) {
        return false;
    }

    curl_setopt($handle, CURLOPT_URL, $url);
    curl_setopt($handle, CURLOPT_HTTPHEADER, array('Cache-Control: no-cache', 'Accept: text/plain, text/html', 'Host: '.$urlbits['host'], 'Connection: close'));
    curl_setopt($handle, CURLOPT_USERAGENT, 'Moodle.org Link Checker (http://moodle.org/sites/)');
    curl_setopt($handle, CURLOPT_MAXCONNECTS, 1024);
    curl_setopt($handle, CURLOPT_FRESH_CONNECT, true);
    curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($handle, CURLOPT_FAILONERROR, true);
    curl_setopt($handle, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($handle, CURLOPT_MAXREDIRS, LINKCHECKER_MAXREDIRECTS);
    curl_setopt($handle, CURLOPT_COOKIEFILE, '/dev/null');
    curl_setopt($handle, CURLOPT_AUTOREFERER, true);
    curl_setopt($handle, CURLOPT_DNS_USE_GLOBAL_CACHE, false);
    // Response headers are included in the returned content; callers split them off by header size.
    curl_setopt($handle, CURLOPT_HEADER, true);
    curl_setopt($handle, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($handle, CURLOPT_ENCODING, "gzip"); // for some curl OPTIONAL speed and perhaps more accommodating
    // 1 to check the existence of a common name in the SSL peer certificate.
    // 2 to check the existence of a common name and also verify that it matches the hostname provided.
    // In production environments the value of this option should be kept at 2 (default value).
    // Support for value 1 removed in cURL 7.28.1
    // NOTE(review): peer and host verification are both switched off here, presumably so that
    // sites with self-signed or expired certificates can still be fingerprinted - confirm.
    curl_setopt($handle, CURLOPT_SSL_VERIFYHOST, 0);
    curl_setopt($handle, CURLOPT_TIMEOUT, LINKCHECKER_MAXCURLTIMEOUT);
    curl_setopt($handle, CURLOPT_CONNECTTIMEOUT, LINKCHECKER_MAXCONNECTIONTIMEOUT);
    // curl_setopt ($handle, CURLOPT_INTERFACE, "184.172.24.2");
    return $handle;
}
128 |
/**
 * Builds the header part of a site's fingerprint.
 *
 * One character is produced per rule, in rule order: '1' when the response
 * headers set the corresponding Moodle cookie, '0' otherwise.
 *
 * The "Set-Cookie" field name is matched case-insensitively and with optional
 * whitespace after the colon, because HTTP field names are case-insensitive
 * (HTTP/2 responses arrive lowercase). Cookie names stay case-sensitive.
 *
 * @param string $head Raw response header text.
 * @return string A string of '0'/'1' characters, one per cookie rule.
 */
function get_head_fingerprint($head) {
    $cookienames = array(
        'MoodleSessionTest=',
        'MoodleSession=',
        'MOODLEID_=',
    );

    $fingerprint = '';
    foreach ($cookienames as $cookiename) {
        $pattern = '/(?i:Set-Cookie):[ \t]*' . preg_quote($cookiename, '/') . '/';
        $fingerprint .= preg_match($pattern, $head) ? '1' : '0';
    }
    return $fingerprint;
}
143 |
144 | function get_html_fingerprint($html) {
145 | $fingerprint = '';
146 | $rules = array("lib\/javascript\-static\.js\"><\/script>",
147 | "content=\"moodle,",
148 | "var moodle_cfg = \{", // moodle 2 only
149 | "function openpopup\(url,", // moodle 1.x only
150 | "function inserttext\(text\)", // moodle 1.x only
151 | "
",
152 | "type=\"hidden\" name=\"testcookies\"",
153 | "type=\"hidden\" name=\"sesskey\" value=\"",
154 | "method=\"get\" name=\"changepassword\"",
155 | "lib\/cookies\.js\"><\/script>",
156 | "class=\"headermain\">",
157 | "function getElementsByClassName\(oElm,",
158 | "
",
161 | "lib\/overlib.js", // moodle 1.x
162 | "src=\"pix\/madewithmoodle", // moodle 1.x
163 | "function popUpProperties\(inobj\)", // BELOW BEGINS NEW RULES
164 | "var moodleConfigFn =",
165 | "<\/span>
",
171 | "\n\s+
\n\s+
",
173 | // some moodles force login and we end up at login page. these are rules specific to moodle login page scoring.
174 | "");
176 |
177 | foreach ($rules as &$body_regex) {
178 | if (preg_match("/".$body_regex."/", $html)) {
179 | $fingerprint .= '1';
180 | } else {
181 | $fingerprint .= '0';
182 | }
183 | }
184 | return $fingerprint;
185 | }
186 |
--------------------------------------------------------------------------------
/index.php:
--------------------------------------------------------------------------------
1 | libdir.'/tablelib.php');
6 |
7 | define('LINKCHECKER_DIR', 'local/linkchecker');
8 |
9 | $limitnum = optional_param('limitnum', 400, PARAM_INT); //for now a fixed 400 out of 183000 for 95% (-/+5%) confidence.
10 |
11 | $PAGE->set_context(context_system::instance());
12 | $PAGE->set_url(new moodle_url('/local/linkchecker/'));
13 | $PAGE->requires->jquery();
14 |
15 | $isadmin = ismoodlesiteadmin();
16 | if (!$isadmin) {
17 | redirect('http://moodle.net/sites'); //shoo
18 | }
19 | $chkurl = optional_param('url', null, PARAM_URL);
20 | $siteid = optional_param('siteid', null, PARAM_INT);
21 |
22 | $totrecs = $DB->count_records('hub_site_directory');
23 |
24 | $img = optional_param('img', null, PARAM_URL);
25 | if (!is_null($siteid) && !is_null($img)) {
26 | $im = getcoverageimg($totrecs, $siteid, true); //get an overlay transparent png for indication.
27 | header('content-type: image/png');
28 | // header("Content-Length: " . filesize($name));
29 | imagepng($im);
30 | die();
31 | }
32 |
33 | if ($chkurl !== null) {
34 | list($hdrs, $redircnt) = reachaUrl($chkurl.'/lib/womenslib.php');
35 | if ($hdrs == false) {
36 | //return some error info.
37 | $err = error_get_last();
38 | $result = $err['message'];
39 | } else {
40 | //also can check login/forgot_password.php ..for some stuff...
41 | // anyway the point is humans check and feedback. Devs filter that feedback and transform into rules.. barring AI ofcourse.
42 | $result = htmlspecialchars($hdrs[0]); // ' Mod:'. $hdrs[3]); //http code and modified.
43 | }
44 | // $result = '200 OK'; //test.
45 | echo json_encode(array('result'=> 'redirections:'. $redircnt. ' final response:'. $result, 'siteid' => $siteid, 'imgsrc' => $CFG->wwwroot.'/local/linkchecker/index.php?siteid='.$siteid.'&img=1'));
46 | die();
47 | }
48 |
49 | $PAGE->navbar->add('Registered sites', new moodle_url('/local/linkchecker/'));
50 | $PAGE->set_title(get_string('registeredmoodlesites_moodlenet', 'local_hub'));
51 | $PAGE->set_heading(get_string('registeredmoodlesites', 'local_hub'));
52 |
53 |
54 | echo $OUTPUT->header();
55 |
56 | $limitfrom = optional_param('limitfrom', rand(1, $totrecs-$limitnum), PARAM_INT);
57 |
58 | list($where, $params) = local_hub_stats_get_confirmed_sql();
59 |
60 | $allfailedrecids = $DB->get_fieldset_sql('Select id from {hub_site_directory} r WHERE NOT('. $where. ')' , $params); //get all failed site ids.
61 |
62 | $randomrecordids = array();
63 |
64 | if (is_null($siteid) && $totrecs > 0 && !empty($allfailedrecids)) {
65 | $id=0;
66 | while (count($randomrecordids)<$limitnum) {
67 | while(!in_array($id=rand(0, $totrecs-1), $randomrecordids) && in_array($id,$allfailedrecids)) { //get unique random list of ids.
68 | $randomrecordids[] = $id;
69 | break;
70 | }
71 | }
72 | } else {
73 | $randomrecordids[] = $siteid;
74 | }
75 |
76 | list($in_sql, $params) = $DB->get_in_or_equal($randomrecordids, SQL_PARAMS_NAMED, 'r', true);
77 | $im = getcoverageimg($totrecs, $randomrecordids);
78 | $failedrecs = $DB->get_records_sql('Select id, url, countrycode, privacy, unreachable, score, fingerprint, errormsg '
79 | . 'from {hub_site_directory} r WHERE id '.$in_sql , $params);
80 |
81 | // Outputs table
82 | $table = new html_table();
83 | // $table->attributes['class'] = 'collection';
84 |
85 | $table->head = array(
86 | "id",
87 | 'url',
88 | "CC",
89 | 'unreachable',
90 | 'score',
91 | 'privacy',
92 | 'Previous fingerprint OR cron-linkchecker:errormsg',
93 | 'Now checking for womens liberty..'
94 | );
95 | $table->colclasses = array();
96 | $scorefield = '';
97 |
98 | foreach ($failedrecs as $rec) {
99 |
100 | $cell = new html_table_cell('Checking..');
101 | $cell->attributes = array('siteid' => $rec->id, 'class' => 'manualcheck', 'url' => $rec->url);
102 |
103 | $trackerparams = array (
104 | 'pid'=>'10020', 'issuetype'=>'3', //tracker.moodle.org : moodle community sites project id.
105 | 'description' => 'see linkcheck test page @ '.$CFG->wwwroot.'/local/linkchecker/index.php?siteid='.$rec->id,
106 | 'components' => '12633', //tracker.moodle.org : moodle community sites project's 'moodle.net' component id.
107 | 'summary' => 'linkchecker found to have missed moodle site (id '.$rec->id.')',
108 | 'security' => '10030' //, 'schemeId' => '10000' //set atleast 'could be a security issue' as site privacy settings can be changing/changed on moodle.net
109 | );
110 | if (!is_null($siteid)) {
111 | $scorefield = (!in_array($siteid, $allfailedrecids)) ?'(moodley)':'';
112 | }
113 | $row = array($rec->id,
114 | '
Browse it |'
115 | .'
Linkcheck it |'
116 | .'
Report it '
117 | ,'
'.$rec->countrycode.''
118 | , $rec->unreachable, $rec->score. $scorefield, $rec->privacy, (strlen($rec->errormsg)>0)?$rec->errormsg:'fingerprint:'.$rec->fingerprint, $cell);
119 | $table->data[] = $row;
120 | }
121 | $htmltable = html_writer::table($table);
122 |
123 | list($sql, $params) = local_hub_stats_get_confirmed_sql();
124 | $sql = "SELECT count(*) as onlinesitescount FROM {hub_site_directory} r WHERE ".$sql;
125 | $totsitesonline = $DB->get_record_sql($sql, $params);
126 |
127 | echo '
Total sites: '. $totrecs. ' |
Total online & moodley: '.$totsitesonline->onlinesitescount .' |
Offline sites loaded in table: '. $limitnum. ' | ';
128 | echo '
Checked: 0 |
not failed: 0 |
Desired Fails: 0 |
linkchecking fraction (desiredfails/checked): ';
129 |
130 | echo '
Test coverage of unmoodley sites:
;
132 | imagepng($im);
133 | $im = ob_get_contents();
134 | ob_end_clean();
135 | echo base64_encode($im);
136 | echo ')
Flags for unmoodley sites detected online:
';
137 |
138 | echo '
';
139 |
140 | echo '
Default sample size 400 is for a 95% (+/-5%) confidence test. Adjustable in url by appending "?limitnum=xxx". Please remember to adjust your linkchecking percentage after human checking and raise moodley sites from here to developers.';
141 |
142 | echo $htmltable ;
143 | // $nonmoodlecnt = $totrecs-$totsitesonline->onlinesitescount;
144 |
145 | $jsscr = <<
147 | function linkchecker() {
148 | var chkcnt;
149 | var failcnt;
150 | var notfailcnt;
151 | var percentage;
152 | var confidence;
153 | var Z = 1.96; //for 95% confidence in a std distribution (about Random 400 samples for over 183k sites will be good)
154 | $(".manualcheck").each(function(index, element){
155 | element.innerHTML = 'Checking, sent to server, awaiting response..';
156 | $.get('index.php',
157 | { url: this.getAttribute('url'),
158 | siteid: this.getAttribute('siteid')
159 | },
160 | function(responseText) {
161 | responseText = jQuery.parseJSON(responseText);
162 | console.log(responseText);
163 | element.innerHTML = responseText.result;
164 | chkcnt = $(".chkcnt").html();
165 | chkcnt++;
166 | $(".chkcnt").html(chkcnt);
167 | if (responseText.result.indexOf("200 OK") >= 0 || responseText.result.indexOf("303 See Other") >= 0) {
168 | element.innerHTML = '' + responseText.result + ' (Human verification preferred.)';
169 | notfailcnt = $(".notfailcnt").html();
170 | notfailcnt++;
171 | $(".notfailcnt").html(notfailcnt);
172 | var samplingrs = $(".samplingresult");
173 | $(".samplingresult").append('
');
174 | } else {
175 | failcnt = $(".failcnt").html();
176 | failcnt++;
177 | $(".failcnt").html(failcnt);
178 | console.log("desired fail count:"+failcnt);
179 | percentage = failcnt / chkcnt;
180 | console.log("linkchecker percentage(%):"+percentage);
181 | $(".perc").html(percentage);
182 | }
183 | }
184 | );
185 | });
186 | }
187 | linkchecker();
188 |
189 | js;
190 |
191 | echo $jsscr; // bloody yui
192 |
193 | echo $OUTPUT->footer();
194 |
/**
 * Draws the "coverage" strip: a wide, short palette image with one column slot per site record.
 *
 * - When $randomrecordids is an array, the strip is filled white and a bar is drawn for
 *   every listed id that falls inside [0, $totrecs).
 * - When $randomrecordids is an integer, the strip is filled with the transparent colour and
 *   a single bar is drawn at that position (used as an overlay).
 * - For any other value no fill or bar is drawn.
 *
 * Coordinates are cast to int explicitly: passing fractional floats to GD is an implicit
 * lossy conversion, which PHP 8.1+ reports as a deprecation (and that notice would corrupt
 * an image stream sent to the browser).
 *
 * @param int $totrecs Total number of site records; 0 is treated as 1.
 * @param array|int $randomrecordids Ids to mark, or one id to highlight.
 * @param bool $highlightrecs Draw bars in red instead of gray.
 * @return resource|GdImage The image; the caller is responsible for outputting it.
 */
function getcoverageimg($totrecs, $randomrecordids, $highlightrecs=false) {
    core_php_time_limit::raise(300);

    $width = 18000;
    $height = 20;
    $padding = 5;
    if ($totrecs == 0) {
        // Make it work even on an empty hub.
        $totrecs = 1;
    }
    $column_width = $width / $totrecs;

    $im = imagecreate($width, $height);
    imagesavealpha($im, true);
    // The transparent colour must stay the first allocation: in a palette image the first
    // allocated colour is the background.
    $transparent = imagecolorallocatealpha($im, 0, 0, 0, 127);
    $gray = imagecolorallocate($im, 0x4c, 0x4c, 0x4c);
    $white = imagecolorallocate($im, 0xff, 0xff, 0xff);
    $red = imagecolorallocate($im, 0xee, 0x10, 0x10);
    $barcolour = $highlightrecs ? $red : $gray;

    // Draws one full-height bar in the slot belonging to $position.
    $drawbar = function ($position) use ($im, $column_width, $padding, $height, $barcolour) {
        $x1 = (int)($position * $column_width);
        $x2 = (int)((($position + 1) * $column_width) - $padding);
        imagefilledrectangle($im, $x1, 0, $x2, $height, $barcolour);
    };

    if (is_array($randomrecordids)) {
        imagefilledrectangle($im, 0, 0, $width, $height, $white);
        // Walk the (short) id list rather than scanning it once per column.
        // Non-numeric entries such as null are skipped instead of loosely matching column 0.
        foreach ($randomrecordids as $recordid) {
            if (!is_numeric($recordid) || (int)$recordid != $recordid) {
                continue;
            }
            $position = (int)$recordid;
            if ($position >= 0 && $position < $totrecs) {
                $drawbar($position);
            }
        }
    } else if (is_int($randomrecordids)) { // highlight - transparent fill
        imagefilledrectangle($im, 0, 0, $width, $height, $transparent);
        $drawbar($randomrecordids);
    }
    return $im;
}
237 |
/**
 * Issues a HEAD request for $url and follows Location headers until a response without one.
 *
 * The result of each followed hop is returned to the caller, so the headers handed back
 * really are those of the final response and the count reflects every hop taken.
 * Redirect chains are capped to avoid endless loops.
 *
 * Side effect: sets the process-wide default stream context to use the HEAD method.
 *
 * @param string $url Absolute URL to probe.
 * @param int $redirectcount Number of redirects already followed (used by the recursion).
 * @return array Two elements: the header array from get_headers() (or false on failure)
 *               and the number of redirects seen.
 */
function reachaUrl ($url, $redirectcount=0) {
    $maxredirects = 20;

    stream_context_set_default(array(
        'http' => array(
            'method' => 'HEAD'
        )
    ));
    $headers = get_headers($url, 1);
    if ($headers === false) {
        return array($headers, $redirectcount);
    }

    // Header names are case-insensitive, and with several responses in one chain
    // get_headers() reports a repeated header as an array - the last entry is the latest hop.
    $location = null;
    foreach ($headers as $name => $value) {
        if (is_string($name) && strcasecmp($name, 'Location') === 0) {
            $location = is_array($value) ? end($value) : $value;
        }
    }
    if (!is_string($location) || $location === '') {
        return array($headers, $redirectcount);
    }

    $redirectcount++;
    // Only absolute http(s) targets can be probed again; otherwise report what we have.
    if ($redirectcount >= $maxredirects || !preg_match('#^https?://#i', $location)) {
        return array($headers, $redirectcount);
    }
    return reachaUrl($location, $redirectcount);
}
250 |
--------------------------------------------------------------------------------
/multicheck.php:
--------------------------------------------------------------------------------
1 | #!/usr/bin/php
2 | \n"; exit; }
9 | //allow script to run on different set while earlier scripts wait for timeouts etc in buffer (timelinkchecked is updated upon buffer filling)
10 | $GLOBALS['lockfile'] = "./.multicheck.lock.".rand(1,3);
11 | declare(ticks = 1);
12 |
13 | // setup signal handlers
14 | pcntl_signal(SIGTERM, "cleanup");
15 | pcntl_signal(SIGINT, "cleanup");
16 |
/**
 * Signal handler: removes the lock file, closes the cURL multi handle and exits.
 *
 * The handler is registered before the multi handle is created, so a signal can
 * arrive while $multihandle is still unset; both resources are therefore checked
 * before being released.
 *
 * @param int $signal The signal number that was caught.
 */
function cleanup($signal) {
    global $lockfile, $multihandle;
    echo "\nCaught signal ".$signal.", cleaning up\n";
    if (!empty($lockfile) && file_exists($lockfile)) {
        unlink($lockfile);
    }
    if (!empty($multihandle)) {
        curl_multi_close($multihandle);
    }
    exit;
}
25 |
26 | if (file_exists($lockfile)) {
27 | $self = basename(__FILE__);
28 | $pids = `ps axw |grep $self |grep -v grep |awk '{print \$1}'`;
29 | $pids = trim($pids);
30 | if (!empty($pids)) {
31 | $mypid = posix_getpid();
32 | $pidarr = explode("\n", $pids);
33 | foreach ($pidarr as $pid) {
34 | if ($pid == $mypid) { continue; }
35 | print "Another instance is already running, killing pid $pid\n";
36 | exec("kill -9 $pid");
37 | }
38 | touch($lockfile);
39 | }
40 | }else {
41 | touch($lockfile) or die("Unable to create $lockfile\n");
42 | }
43 |
44 | $overallstarttime = microtime(true);
45 | require_once("../../config.php");
46 |
47 | error_reporting(E_ALL);
48 | ini_set('display_errors','On');
49 |
50 | echo "\n\nStarting...\n";
51 |
52 | define_globals();
53 |
54 | $maxsitecurls = 20;
55 | $sitecurlsrunning = 0;
56 | $GLOBALS['multihandle'] = curl_multi_init();
57 | $curledsofar = 0;
58 |
59 | echo "$totalsites To Process\n";
60 |
61 | $outcome = fill_site_buffer();
62 | if ($outcome===false) die('WHOOAAA no sites');
63 |
64 | $sitespassed = 0;
65 | $sitesfailed = 0;
66 | $siteserrored = 0;
67 |
68 | $moodleypage = 'login/index.php'; // This should be accessible and constant over time.
69 |
70 | for(;;) {
71 |
72 | while ($sitecurlsrunning<$maxsitecurls && count($sitebuffer)>0) {
73 | $site = array_shift($sitebuffer);
74 | if (is_bogon($site->url)) {
75 | update_site($site, -1, ((int)$site->unreachable+1), addslashes("Non-routable IP found - Didnt attempt curl"));
76 | writeline($site->id, $site->url, 'F', '-','0','0', "Non-routable IP found - Didnt attempt curl");
77 | $siteserrored++;
78 | continue;
79 | }
80 | $handle = create_handle($site->url);
81 | if ($handle===false) {
82 | update_site($site, -1, ((int)$site->unreachable+1), addslashes("Malformed URL - Didnt attempt curl"));
83 | writeline($site->id, $site->url, 'F', '-','0','0', "Malformed URL - Didnt attempt curl");
84 | $siteserrored++;
85 | continue;
86 | }
87 | curl_multi_add_handle($multihandle, $handle);
88 | $sitesrunning[(string)$handle] = $site;
89 | $sitecurlsrunning++;
90 | $curledsofar++;
91 | }
92 |
93 | if (count($sitebuffer)===0) {
94 | $filledbuffer = fill_site_buffer();
95 | } else {
96 | $filledbuffer = true;
97 | }
98 |
99 | if ($sitecurlsrunning == 0 && $filledbuffer===false) {
100 | break;
101 | }
102 |
103 | curl_multi_select($multihandle);
104 | while(($mcRes = curl_multi_exec($multihandle, $mcActive)) == CURLM_CALL_MULTI_PERFORM);
105 | if($mcRes != CURLM_OK) break;
106 | while($done = curl_multi_info_read($multihandle)) {
107 | $handle = $done['handle'];
108 |
109 | $sitecontent = curl_multi_getcontent($handle);
110 |
111 | $site = $sitesrunning[(string)$handle];
112 | $site->originalurl = $site->url;
113 | $site->url = curl_getinfo($handle, CURLINFO_EFFECTIVE_URL);
114 | $info = curl_getinfo($handle);
115 | $curl_error = new stdClass;
116 | $curl_error = curl_error($handle);
117 | if (!empty($curl_error)) {
118 | $curl_error_string = clean_param($curl_error, PARAM_NOTAGS);
119 | update_site($site, -1, ((int)$site->unreachable+1), $curl_error_string);
120 | writeline($site->id, $site->url, 'F', '-',curl_getinfo($handle, CURLINFO_REDIRECT_COUNT),curl_errno($handle), $curl_error_string);
121 | $siteserrored++;
122 | } else {
123 | $manualredirect = check_for_manual_redirect($sitecontent);
124 | if ($manualredirect!==false) {
125 | $oldurl = $site->url;
126 | $outcome = reinsert_site_into_buffer($site, $manualredirect);
127 | if ($outcome===false) {
128 | update_site($site, '', ((int)$site->unreachable+1), 'Max manual redirects exceeded');
129 | writeline($site->id, $oldurl,'', '', $site->manualredirect, '', 'Maximum manual redirects exceeded: '.$site->manualredirect);
130 | } else {
131 | writeline($site->id, $oldurl,'', '', $site->manualredirect, '', 'Manual redirect '.$site->url);
132 | }
133 | } else {
134 | $outcome = link_checker_test_result($site, $handle, $sitecontent);
135 | if ($outcome) {
136 | $sitespassed++;
137 | } else {
138 | // frontpages are heavily modified.. in addition, check one $timelongmoodleypgs page
139 | if (!preg_match('#'.$moodleypage.'#', $site->url)) {
140 | $oldurl = $site->url;
141 |
142 | $joiner = '/';
143 | if (substr(trim($site->url), -1) == '/') {
144 | $joiner = '';
145 | }
146 | $newurl = $site->url . $joiner . $moodleypage;
147 |
148 | $outcome = reinsert_site_into_buffer($site, $newurl);
149 | if ($outcome===false) {
150 | update_site($site, '', ((int)$site->unreachable+1), 'Max manual redirects exceeded (moodleypage)');
151 | writeline($site->id, $oldurl,'', '', $site->manualredirect, '', 'Maximum manual redirects exceeded : '.$newurl);
152 | } else {
153 | writeline($site->id, $oldurl,'', '', $site->manualredirect, '', 'Extra page check redirect for '.$site->url);
154 | }
155 | } else {
156 | $sitesfailed++;
157 | }
158 | }
159 | }
160 | }
161 | curl_multi_remove_handle($multihandle, $handle);
162 | curl_close($handle);
163 | $sitecurlsrunning--;
164 | unset($sitesrunning[(string)$handle]);
165 | echo " [ stat: $curledsofar/$totalsites curls:$sitecurlsrunning/$maxsitecurls buf: ".count($sitebuffer).' ]';
166 | }
167 | }
168 | curl_multi_close($multihandle);
169 | unlink($lockfile);
170 | echo "\n\nProcess Complete\nPassed: $sitespassed\tFailed: $sitesfailed\tErrored: $siteserrored";
171 | echo "\nTotal time: ". (microtime(true)-$overallstarttime)."\n\n";
172 | flush();
173 |
/**
 * Writes the outcome of one link check back to the site's row in the link checker table.
 *
 * When the error message looks like a connection problem or timeout (it contains
 * "Connection", "timed out" or "milliseconds"), the site is not penalised: unreachable
 * and score are reset to 0 and override is set to 2.
 *
 * @param stdClass $site Site being checked; ->id is required, ->redirectto is copied when set,
 *                       ->timeunreachable is read when the site is being marked unreachable.
 * @param mixed $score Score to store.
 * @param int $unreachable Unreachable counter to store.
 * @param string $errormessage Error text to store.
 * @param string|null $moodlerelease Stored only when not loosely equal to null.
 * @param string|null $serverstring Stored only when not loosely equal to null.
 * @param string|null $fingerprint Fingerprint to store (always written, even when null).
 * @return bool Always true.
 */
function update_site(&$site, $score='', $unreachable=0, $errormessage='', $moodlerelease=null, $serverstring=null, $fingerprint=null) {
    global $DB;

    $record = new stdClass;
    $record->id = $site->id;
    $record->timelinkchecked = time();

    // Reset for further checking if the message indicates a timeout. The text is matched
    // loosely because the exact wording may change between cURL versions.
    // @todo for some reason error number is not used here....
    $lookstransient = false;
    foreach (array("Connection", "timed out", "milliseconds") as $marker) {
        if (strpos($errormessage, $marker) !== false) {
            $lookstransient = true;
            break;
        }
    }
    if ($lookstransient) {
        $unreachable = 0;
        $score = 0;
        $record->override = 2;
    }

    $record->unreachable = $unreachable;
    $record->score = $score;
    $record->errormsg = $errormessage;
    $record->fingerprint = $fingerprint;

    if (isset($site->redirectto)) {
        $record->redirectto = $site->redirectto;
    }
    if ($moodlerelease != null) {
        $record->moodlerelease = $moodlerelease;
    }
    if ($serverstring != null) {
        $record->serverstring = $serverstring;
    }

    if ($unreachable == 0) {
        // Reachable again: clear the "first seen unreachable" timestamp.
        $record->timeunreachable = 0;
    } else if ($site->timeunreachable == 0) {
        // First time this site is seen unreachable: remember when.
        $record->timeunreachable = time();
    }

    $DB->update_record(LINKCHECKER_TABLENAME, $record);
    return true;
}
212 |
/**
 * Requeue the site for examination under a new URL.
 *
 * The site may have responded with a manual redirect, or a page other than the front
 * page needs checking. A target that is not an absolute http(s) URL is resolved against
 * the site's current URL:
 *  - "//host/path"  keeps the current scheme,
 *  - "/path"        keeps the current scheme, host and (explicit) port,
 *  - "path"         is additionally resolved against the directory of the current path.
 * A port is only written when the current URL carries one, so https URLs are not
 * rewritten to port 80.
 *
 * On success $site->url is replaced and $site->manualredirect incremented; the site is
 * appended to the global $sitebuffer unless the redirect limit has been reached.
 *
 * @param stdClass $site Site object with ->url and ->manualredirect.
 * @param string $newurl Redirect target (absolute or relative).
 * @return bool True when the site was queued again; false when the current URL cannot be
 *              parsed (site left untouched) or LINKCHECKER_MAXREDIRECTS was reached.
 */
function reinsert_site_into_buffer($site, $newurl) {
    global $sitebuffer;

    $urlbits = parse_url($site->url);
    if (!is_array($urlbits) || !array_key_exists('host', $urlbits) || !array_key_exists('scheme', $urlbits)) {
        return false;
    }

    if (!preg_match('#^http[s]?://#', $newurl)) {
        if (strpos($newurl, '//') === 0) {
            // Protocol-relative target: only the scheme is inherited.
            $newurl = $urlbits['scheme'].':'.$newurl;
        } else {
            if (strpos($newurl, '/') !== 0) {
                // Drop leading "./" segments only; anything else is left intact.
                while (strpos($newurl, './') === 0) {
                    $newurl = (string)substr($newurl, 2);
                }
                // Directory of the current path, including its trailing slash.
                $path = empty($urlbits['path']) ? '/' : $urlbits['path'];
                $lastslash = strrpos($path, '/');
                $directory = ($lastslash === false) ? '/' : substr($path, 0, $lastslash + 1);
                $newurl = $directory.$newurl;
            }
            $origin = $urlbits['scheme'].'://'.$urlbits['host'];
            if (!empty($urlbits['port'])) {
                $origin .= ':'.$urlbits['port'];
            }
            $newurl = $origin.$newurl;
        }
    }

    $site->url = $newurl;
    $site->manualredirect++;
    if ($site->manualredirect < LINKCHECKER_MAXREDIRECTS) {
        $sitebuffer[] = $site;
        return true;
    }
    return false;
}
244 |
/**
 * Loads the next batch of sites from the link checker table into the global $sitebuffer.
 *
 * Batches are selected with $siteselectorsql using ids below the last id handed out, so
 * successive calls walk downwards through the table. Each loaded site gets
 * ->manualredirect = 0 and has its timelinkchecked stamped (and saved) immediately.
 * Once a query yields nothing, every later call returns false without querying again.
 *
 * @return bool True when sites were added to the buffer, false when there are none left.
 **/
function fill_site_buffer() {
    global $sitebuffer, $CFG, $DB, $siteselectorsql, $sitessofar, $totalsites, $timelinkchecked;

    static $lastsiteid;
    static $runhasfailed;

    if ($runhasfailed === true) {
        return false;
    }

    // First-call defaults for the cursor, the failure flag and the running total.
    if ($lastsiteid == null) {
        $lastsiteid = 1000000000;
    }
    if ($runhasfailed == null) {
        $runhasfailed = false;
    }
    if ($sitessofar == null) {
        $sitessofar = 0;
    }

    echo "\nFilling Buffer with " . LINKCHECKER_SITEBUFFERLIMIT . " sites starting from id ".$lastsiteid."\r\n";

    $batch = $DB->get_records_sql(
        sprintf($siteselectorsql, LINKCHECKER_MAXIMUMUNREACHABLE, $timelinkchecked, $lastsiteid, LINKCHECKER_SITEBUFFERLIMIT)
    );

    if (!is_array($batch) || count($batch) == 0) {
        $runhasfailed = true;
        return false;
    }
    $sitessofar += count($batch);

    $tail = null;
    foreach ($batch as $record) {
        $record->manualredirect = 0;
        $sitebuffer[] = $record;

        // Update timelinkchecked early.
        // This is useful when running multiple linkchecker processes to go faster when testing fingerprinting.
        $record->timelinkchecked = time();
        $DB->update_record(LINKCHECKER_TABLENAME, $record);

        $tail = $record;
    }

    // The last record of the batch becomes the cursor for the next call.
    if ($lastsiteid === $tail->id) {
        return false;
    }
    $lastsiteid = $tail->id;
    return true;
}
288 |
/**
 * Examines the header and html of a cURL response to decide whether it came from a Moodle site.
 *
 * The response is split into headers and body using CURLINFO_HEADER_SIZE. Headers score
 * 2 points per matching rule, the body 1 point per matching rule; a total of 5 or more is
 * a pass. In both outcomes the site record is updated as reachable and a result line is
 * written.
 *
 * The Server header is matched per line, case-insensitively, and no longer requires a
 * trailing newline - the header text is trimmed first, so a Server header that came last
 * would otherwise never be captured.
 *
 * @param stdClass $site Site being checked; ->redirectto is set when the effective URL
 *                       differs from ->originalurl on a pass.
 * @param resource|CurlHandle $handle The finished cURL handle for this site.
 * @param string $html Raw response (headers followed by body).
 * @return bool True when the site scored as a Moodle site, false otherwise.
 */
function link_checker_test_result(&$site, $handle, $html) {
    $headersize = curl_getinfo($handle, CURLINFO_HEADER_SIZE);
    $head = trim(substr($html, 0, $headersize));
    $html = trim(substr($html, $headersize));

    $serverstring = null;
    if (preg_match('/^Server:[ \t]*(.*)$/mi', $head, $smatches)) {
        $serverstring = trim($smatches[1]);
    }

    $fingerprint = ''; // reflects $rules set matching.
    $headscore = 0;
    if (strlen($head) > 10) {
        $headfingerprint = get_head_fingerprint($head);
        $headscore = 2 * substr_count($headfingerprint, '1'); // 2 points for each match.
        $fingerprint .= $headfingerprint;
    }
    $fingerprint .= '/';

    $htmlscore = 0;
    $moodlerelease = null;
    if (strlen($html) > 40) {
        $htmlfingerprint = get_html_fingerprint($html);
        $htmlscore = substr_count($htmlfingerprint, '1');
        $fingerprint .= $htmlfingerprint;

        if (preg_match("/(?:title=\"Moodle )(.{0,20})(?: \((Build: |))(\d+)(?:\)\")/i", $html, $matches)) {
            $moodlerelease = $matches[1]." (".$matches[3].")";
        }
    }

    $score = $htmlscore + $headscore;
    $passed = ($score >= 5);

    if ($passed) {
        // Remember where the site really lives when cURL ended up somewhere else.
        $effectiveurl = curl_getinfo($handle, CURLINFO_EFFECTIVE_URL);
        if ($effectiveurl != $site->originalurl) {
            $site->redirectto = $effectiveurl;
        }
    }

    // Pass or fail, the site was reached: store it with unreachable = 0.
    update_site($site, $score, 0, '', $moodlerelease, $serverstring, $fingerprint);

    $releaselabel = ($moodlerelease == null) ? "Unknown" : $moodlerelease;
    $scorelabel = (string)$htmlscore.'/'.(string)$headscore;
    $redirects = curl_getinfo($handle, CURLINFO_REDIRECT_COUNT);

    if ($passed) { // Success!
        writeline($site->id, $site->url, 'P', $scorelabel, (string)$fingerprint, $redirects, '-', '', $releaselabel);
    } else { // Failure, but we did reach the site!
        writeline($site->id, $site->url, 'F', $scorelabel, (string)$fingerprint, $redirects, '0', 'Failed Check with score '.(string)$score, $releaselabel);
    }
    return $passed;
}
345 |
/**
 * Formats the supplied parameters into a pipe separated line and echoes it out.
 *
 * The column header is printed once, before the first line. Lines are numbered with a
 * running counter; a line whose outcome is blank (empty or whitespace only) is an
 * informational line - it gets no number and does not advance the counter.
 * Fixed-width columns are padded or truncated to their width; the fingerprint, error
 * number and error message are written as given.
 *
 * @param mixed $id Site id (10 characters wide).
 * @param string $url Site URL (50 wide).
 * @param string $outcome 'P', 'F' or blank for informational lines (4 wide).
 * @param string $score Score text, e.g. "body/head" (18 wide).
 * @param string $fingerprint Fingerprint, not padded.
 * @param mixed $redirects Redirect count (5 wide).
 * @param mixed $errorno Error number, not padded.
 * @param string $errormsg Error message, not padded.
 * @param string $moodlerelease Detected release (24 wide).
 **/
function writeline($id, $url, $outcome='F', $score='0', $fingerprint='', $redirects='0', $errorno='', $errormsg='', $moodlerelease='') {
    static $header;
    static $count;

    if ($header==null) {
        $hdstr = "\nC |ID | URL | P/F | Score (Body/Head) | Fingerprint | Redir |ErNum | Version | Error Msg";
        echo $hdstr;
        echo "\n".str_repeat('-', strlen($hdstr));
        $header = true;
    }

    // Pads short values and truncates long ones so the column is always $width wide.
    $fit = function ($value, $width) {
        $value = (string)$value;
        return (strlen($value) < $width) ? str_pad($value, $width) : substr($value, 0, $width);
    };

    if ($count==null) {
        $count = 1;
    } else {
        $count++;
    }
    $countstr = $fit($count, 4);
    if (trim((string)$outcome) == '') {
        // Informational line: blank the number and give the counter back.
        $countstr = ' ';
        $count--;
    }

    $id = $fit($id, 10);
    $url = $fit($url, 50);
    $outcome = $fit($outcome, 4);
    $score = $fit($score, 18);
    $redirects = $fit($redirects, 5);
    // The fingerprint is deliberately not padded.
    $moodlerelease = $fit($moodlerelease, 24);

    echo "\n$countstr|$id| $url| $outcome| $score| $fingerprint| $redirects| $errorno| $moodlerelease| $errormsg";
    flush();
}
379 |
380 | ?>
381 |
--------------------------------------------------------------------------------