├── .coveralls.yml ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── composer.json ├── phpunit.xml.dist ├── src ├── GlLinkChecker.php ├── GlLinkCheckerError.php └── GlLinkCheckerReport.php └── tests ├── GlLinkCheckerErrorTest.php ├── GlLinkCheckerTest.php ├── bootstrap.php ├── expectedReport.html ├── json └── blog.json ├── md └── example.md └── site1 ├── download └── index.html ├── img └── index.html ├── index.html ├── robots.txt ├── section └── probleme-solution │ └── compresser-css-html-js.html └── sitemap.xml /.coveralls.yml: -------------------------------------------------------------------------------- 1 | coverage_clover: tests/logs/clover.xml 2 | json_path: tests/logs/coveralls-upload.json 3 | service_name: travis-ci 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | composer.phar 2 | composer.lock 3 | composer-test.lock 4 | vendor/ 5 | 6 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: php 2 | php: 3 | - '5.5' 4 | - '5.6' 5 | - '7.0' 6 | - '7.1' 7 | - '7.2' 8 | install: 9 | - composer install 10 | script: 11 | - ./vendor/bin/phpunit --coverage-clover ./tests/logs/clover.xml 12 | after_script: 13 | - php vendor/bin/coveralls -v 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015 Glicer - Emmanuel ROECKER & Rym BOUCHAGOUR, https://github.com/emmanuelroecker 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, 
publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. 20 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # php-linkchecker 2 | 3 | [![Scrutinizer Code Quality](https://scrutinizer-ci.com/g/emmanuelroecker/php-linkchecker/badges/quality-score.png?b=master)](https://scrutinizer-ci.com/g/emmanuelroecker/php-linkchecker/?branch=master) 4 | [![Build Status](https://travis-ci.org/emmanuelroecker/php-linkchecker.svg?branch=master)](https://travis-ci.org/emmanuelroecker/php-linkchecker) 5 | [![Coverage Status](https://coveralls.io/repos/github/emmanuelroecker/php-linkchecker/badge.svg?branch=master)](https://coveralls.io/github/emmanuelroecker/php-linkchecker?branch=master) 6 | [![SensioLabsInsight](https://insight.sensiolabs.com/projects/4f63b147-1922-4527-9d0d-e369397a1c13/mini.png)](https://insight.sensiolabs.com/projects/4f63b147-1922-4527-9d0d-e369397a1c13) 7 | 8 | Check broken links in html / json files, sitemap.xml, markdown and robots.txt. 
9 | 10 | It works with: 11 | 12 | * [Guzzle](http://docs.guzzlephp.org) 13 | * [Symfony Finder Component](http://symfony.com/doc/2.3/components/finder.html) 14 | * [Glicer Simply-html Component](https://github.com/emmanuelroecker/php-simply-html) 15 | 16 | ## Installation 17 | 18 | This library can be found on [Packagist](https://packagist.org/packages/glicer/link-checker). 19 | 20 | The recommended way to install it is through [composer](http://getcomposer.org). 21 | 22 | Edit your `composer.json` and add: 23 | 24 | ```json 25 | { 26 | "require": { 27 | "glicer/link-checker": "dev-master" 28 | } 29 | } 30 | ``` 31 | 32 | Install dependencies: 33 | 34 | ```bash 35 | php composer.phar install 36 | ``` 37 | 38 | ## How to check links in html / json files ? 39 | 40 | ```php 41 | require 'vendor/autoload.php'; 42 | 43 | use GlLinkChecker\GlLinkChecker; 44 | use GlLinkChecker\GlLinkCheckerReport; 45 | use Symfony\Component\Finder\Finder; 46 | 47 | //relative urls are checked against host http://lyon.glicer.com 48 | $linkChecker = new GlLinkChecker('http://lyon.glicer.com'); 49 | 50 | //build the list of local html and json files to check 51 | $finder = new Finder(); 52 | $files = $finder->files()->in('./public')->name("*.html")->name("*.json"); 53 | 54 | //launch link checking 55 | $result = $linkChecker->checkFiles( 56 | $files, 57 | function ($nbr) { 58 | // called at the beginning - $nbr urls to check 59 | }, 60 | function ($url, $files) { 61 | // called for each $url - $files : list of filenames containing the $url link 62 | }, 63 | function () { 64 | // called at the end 65 | } 66 | ); 67 | 68 | //convert the $result array into a temp html file 69 | $filereport = GlLinkCheckerReport::toTmpHtml('lyonCheck',$result); 70 | 71 | //$filereport contains the full path to the html file 72 | print_r($filereport); 73 | ``` 74 | 75 | You can view $filereport with your browser. 76 | 77 | ## How to check links in robots.txt and sitemap files ? 
78 | 79 | ```php 80 | require 'vendor/autoload.php'; 81 | 82 | use GlLinkChecker\GlLinkChecker; 83 | 84 | $linkChecker = new GlLinkChecker('http://lyon.glicer.com'); 85 | $result = $linkChecker->checkRobotsSitemap(); 86 | 87 | print_r($result); 88 | ``` 89 | 90 | GlLinkChecker::checkRobotsSitemap() returns an array like: 91 | 92 | ```php 93 | $result = [ 94 | 'disallow' => 95 | ['error' => ['/img/', '/download/']], 96 | 'sitemap' => 97 | [ 98 | 'ok' => [ 99 | '/sitemap.xml' => 100 | [ 101 | 'ok' => 102 | [ 103 | '/index.html', 104 | '/section/probleme-solution/compresser-css-html-js.html' 105 | ] 106 | ] 107 | ] 108 | ] 109 | ]; 110 | ``` 111 | 112 | ## Running Tests 113 | 114 | Launch from the command line: 115 | 116 | ```console 117 | vendor\bin\phpunit 118 | ``` 119 | 120 | ## License MIT 121 | 122 | ## Contact 123 | 124 | Authors: Emmanuel ROECKER & Rym BOUCHAGOUR 125 | 126 | [Web Development Blog - http://dev.glicer.com](http://dev.glicer.com) 127 | -------------------------------------------------------------------------------- /composer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "glicer/link-checker", 3 | "type": "library", 4 | "description": "Check broken links in html files, sitemap.xml and robots.txt", 5 | "keywords": ["html", "broken", "links"], 6 | "homepage": "https://github.com/emmanuelroecker/php-linkchecker", 7 | "license": "MIT", 8 | "authors": [ 9 | { 10 | "name": "Emmanuel ROECKER", 11 | "homepage": "http://dev.glicer.com" 12 | }, 13 | { 14 | "name": "Rym BOUCHAGOUR", 15 | "homepage": "http://dev.glicer.com" 16 | } 17 | ], 18 | "require": { 19 | "php": ">=5.5", 20 | "guzzlehttp/guzzle": "^6.2", 21 | "glicer/simply-html": "^1.0", 22 | "symfony/finder": "^2.3 || ^3.0", 23 | "symfony/console": "^2.3 || ^3.0" 24 | }, 25 | "require-dev": { 26 | "phpunit/phpunit": "^4.8 || ^5.7 || ^6.5", 27 | "symfony/process": "^2.3 || ^3.0", 28 | "php-coveralls/php-coveralls": "^2.0" 29 | }, 30 | "autoload": 
{ 31 | "psr-4": { 32 | "GlLinkChecker\\": "src/" 33 | } 34 | }, 35 | "autoload-dev": { 36 | "psr-4": { 37 | "GlLinkChecker\\Tests\\": "tests/" 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /phpunit.xml.dist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | tests 6 | 7 | 8 | 9 | 10 | src 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /src/GlLinkChecker.php: -------------------------------------------------------------------------------- 1 | client = new Client([ 48 | 'base_uri' => $rooturl, 49 | 'verify' => false, 50 | 'defaults' => [ 51 | 'headers' => [ 52 | 'User-Agent' => 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:36.0) Gecko/20100101 Firefox/36.0', 53 | 'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 54 | 'Accept-Language' => 'fr,fr-FR;q=0.8,en-US;q=0.5,en;q=0.3', 55 | 'Accept-Encoding' => 'gzip, deflate' 56 | ] 57 | ] 58 | ]); 59 | $this->internalurls = $internalurls; 60 | } 61 | 62 | /** 63 | * get all links in an object 64 | * 65 | * @param $obj 66 | * @param array $links 67 | */ 68 | private function searchInArray($obj, array &$links) 69 | { 70 | foreach ($obj as $key => $elem) { 71 | if (is_string($elem)) { 72 | if (preg_match("/^(http|https|ftp|ftps).*$/", $elem)) { 73 | if (filter_var($elem, FILTER_VALIDATE_URL)) { 74 | $links[$elem] = $elem; 75 | } 76 | } 77 | } else { 78 | if (is_array($elem)) { 79 | $this->searchInArray($elem, $links); 80 | } 81 | } 82 | } 83 | } 84 | 85 | /** 86 | * get all links in a json 87 | * 88 | * @param string $json 89 | * 90 | * @return array 91 | */ 92 | private function getJsonLinks($json) 93 | { 94 | $obj = json_decode($json, true); 95 | $links = []; 96 | $this->searchInArray($obj, $links); 97 | 98 | return $links; 99 | } 100 | 101 | 102 | /** 103 | * check links in a sitemap 104 | * 105 | * @param string 
$sitemap 106 | * 107 | * @return array 108 | * @throws \Exception 109 | */ 110 | private function checkSitemap($sitemap) 111 | { 112 | $xml = new GlHtml($sitemap); 113 | $listloc = $xml->get("loc"); 114 | $result = []; 115 | foreach ($listloc as $loc) { 116 | $response = $this->client->get($loc->getText(), ['exceptions' => false]); 117 | if ($response->getStatusCode() != 200) { 118 | $result['error'][] = $loc->getText(); 119 | } else { 120 | $result['ok'][] = $loc->getText(); 121 | } 122 | } 123 | 124 | return $result; 125 | } 126 | 127 | /** 128 | * GET each url and record it under $result[$statuscode] as 'ok' when the response status equals $statuscode, 'error' otherwise 129 | * 130 | * @param array $result filled by reference 131 | * @param array $urls 132 | * @param int $statuscode expected http status code 133 | */ 134 | private function checkStatus(array &$result, array $urls, $statuscode) { 135 | foreach ($urls as $url) { 136 | $response = $this->client->get($url, ['exceptions' => false]); 137 | if ($response->getStatusCode() != $statuscode) { 138 | $result[$statuscode]["error"][] = $url; 139 | } else { 140 | $result[$statuscode]["ok"][] = $url; 141 | } 142 | } 143 | } 144 | 145 | /** 146 | * check that $urlerrors answer with a 404 and $urlforbiddens with a 403 147 | * 148 | * @param array $urlerrors urls expected to return 404 149 | * @param array $urlforbiddens urls expected to return 403 150 | * 151 | * @return array keyed by expected status code (404, 403), each holding 'ok' and/or 'error' url lists 152 | */ 153 | public function checkErrors(array $urlerrors, array $urlforbiddens) 154 | { 155 | $result = []; 156 | 157 | $this->checkStatus($result,$urlerrors,404); 158 | $this->checkStatus($result,$urlforbiddens, 403); 159 | 160 | return $result; 161 | } 162 | 163 | /** 164 | * fetch /robots.txt and check every url found on its "Sitemap:" and "Disallow:" lines (sitemaps are checked loc by loc) 165 | * 166 | * @return array 167 | * @throws \Exception 168 | */ 169 | public function checkRobotsSitemap() 170 | { 171 | $response = $this->client->get("/robots.txt"); 172 | if ($response->getStatusCode() != 200) { 173 | throw new \Exception("Cannot find robots.txt"); 174 | } 175 | 176 | $robotstxt = $response->getBody()->getContents(); 177 | $robotstxt = explode("\n", $robotstxt); 178 | $result = []; 179 | foreach ($robotstxt as $line) { 180 | if 
(preg_match('/^\s*Sitemap:(.*)/i', $line, $match)) { 181 | $urlsitemap = trim($match[1]); 182 | $response = $this->client->get($urlsitemap, ['exceptions' => false]); 183 | if ($response->getStatusCode() != 200) { 184 | $result['sitemap']['error'][] = $urlsitemap; 185 | } else { 186 | $result['sitemap']['ok'][$urlsitemap] = $this->checkSitemap($response->getBody()->getContents()); 187 | } 188 | } 189 | 190 | if (preg_match('/^\s*Disallow:(.*)/i', $line, $match)) { 191 | $urldisallow = trim($match[1]); 192 | $response = $this->client->get($urldisallow, ['exceptions' => false]); 193 | if (($response->getStatusCode() != 200) && ($response->getStatusCode() != 403)) { 194 | $result['disallow']['error'][] = $urldisallow; 195 | } else { 196 | $result['disallow']['ok'][] = $urldisallow; 197 | } 198 | } 199 | } 200 | 201 | return $result; 202 | } 203 | 204 | /** 205 | * extract the http(s) urls of markdown links written as [text](url) from a string 206 | * 207 | * @param string $markdownContent 208 | * @return array list of urls, empty when none is found 209 | */ 210 | public function getLinksFromMarkdown($markdownContent) 211 | { 212 | $pattern = '/\[.+\]\((https?:\/\/\S+)\)/'; 213 | 214 | if($num_found = preg_match_all($pattern, $markdownContent, $out)) return $out[1]; 215 | else return []; 216 | } 217 | 218 | 219 | /** 220 | * check links in html, json and markdown files 221 | * 222 | * @param Finder $files 223 | * @param callable $checkstart 224 | * @param callable $checking 225 | * @param callable $checkend 226 | * @param array $criterias checks applied to internal links only ('exist' is always checked) 227 | * @throws \Exception 228 | * @return GlLinkCheckerError[] 229 | */ 230 | public function checkFiles(Finder $files, callable $checkstart, callable $checking, callable $checkend, Array $criterias = ['lowercase', 'endslash', 'absolute']) 231 | { 232 | $linksByFile = []; 233 | /** 234 | * @var SplFileInfo $file 235 | */ 236 | foreach ($files as $file) { 237 | $inner = file_get_contents($file->getRealPath()); 238 | $keyname = $file->getRelativePathname(); 239 | $extension = $file->getExtension(); 240 | switch($extension){ 241 | case "html": 242 | $html = new 
GlHtml($inner); 243 | $linksByFile[$keyname] = $html->getLinks(); 244 | break; 245 | case "json": 246 | $linksByFile[$keyname] = $this->getJsonLinks($inner); 247 | break; 248 | case "md": 249 | $linksByFile[$keyname] = $this->getLinksFromMarkdown($inner); 250 | break; 251 | default: 252 | throw new \Exception("Extension unknown : " . $keyname); 253 | break; 254 | } 255 | } 256 | 257 | //reverse $linksByFile 258 | $links = []; 259 | foreach ($linksByFile as $filename => $filelinks) { 260 | foreach ($filelinks as $filelink) { 261 | $links[$filelink][] = $filename; 262 | } 263 | } 264 | 265 | $checkstart(count($links)); 266 | $result = []; 267 | foreach ($links as $link => $files) { 268 | $checking($link, $files); 269 | 270 | $gllink = new GlLinkCheckerError($this->client, $link, $files); 271 | 272 | if ($gllink->isInternal($this->internalurls)) { 273 | $gllink->check($criterias); 274 | } 275 | 276 | $gllink->check(['exist']); 277 | $result[] = $gllink; 278 | } 279 | $checkend(); 280 | 281 | return $result; 282 | } 283 | } 284 | -------------------------------------------------------------------------------- /src/GlLinkCheckerError.php: -------------------------------------------------------------------------------- 1 | client = $client; 90 | $this->url = $url; 91 | $this->link = $link; 92 | $this->files = $files; 93 | } 94 | 95 | /** 96 | * @return bool 97 | */ 98 | private function checkexisthead() 99 | { 100 | try { 101 | $response = $this->client->head($this->link); 102 | $this->statuscode = $response->getStatusCode(); 103 | $this->isExist = (($this->statuscode == 200) || ($this->statuscode == 204)); 104 | 105 | return $this->isExist; 106 | } catch (ClientException $e) { 107 | $this->statuscode = $e->getCode(); 108 | } catch (RequestException $e) { 109 | 110 | } 111 | 112 | $this->isExist = false; 113 | 114 | return false; 115 | } 116 | 117 | /** 118 | * @return bool 119 | */ 120 | private function checkexistget() 121 | { 122 | try { 123 | $response = 
$this->client->get($this->link); 124 | $this->statuscode = $response->getStatusCode(); 125 | $this->isExist = (($this->statuscode == 200) || ($this->statuscode == 204)); 126 | 127 | return $this->isExist; 128 | } catch (ClientException $e) { 129 | $this->statuscode = $e->getCode(); 130 | } catch (RequestException $e) { 131 | } 132 | 133 | $this->isExist = false; 134 | 135 | return false; 136 | } 137 | 138 | /** 139 | * 140 | */ 141 | private function checkexist() 142 | { 143 | if ($this->checkexisthead()) { 144 | return true; 145 | } 146 | if ($this->checkexistget()) { 147 | return true; 148 | } 149 | 150 | return false; 151 | } 152 | 153 | /** 154 | * @return bool 155 | */ 156 | private function checkendslash() 157 | { 158 | if (substr($this->link, -1) == '/') { 159 | $this->isNotEndSlash = true; 160 | 161 | return true; 162 | } 163 | 164 | if (isset($this->url['path']) && (strlen($this->url['path']) > 0)) { 165 | $extension = pathinfo($this->url['path'], PATHINFO_EXTENSION); 166 | if (isset($extension) && (strlen($extension) > 0)) { 167 | $this->isNotEndSlash = true; 168 | 169 | return true; 170 | } 171 | } 172 | 173 | $this->isNotEndSlash = false; 174 | 175 | return false; 176 | } 177 | 178 | /** 179 | * @return bool 180 | */ 181 | private function checkabsolute() 182 | { 183 | if (isset($this->url['host']) && (strlen($this->url['host']) > 0)) { 184 | $this->isAbsolute = true; 185 | 186 | return true; 187 | } 188 | if (isset($this->url['path']) && (strpos($this->url['path'], "/") === 0)) { 189 | $this->isAbsolute = true; 190 | 191 | return true; 192 | } 193 | $this->isAbsolute = false; 194 | 195 | return false; 196 | } 197 | 198 | /** 199 | * @return bool 200 | */ 201 | private function checklowercase() 202 | { 203 | $this->isLowerCase = ($this->link === strtolower($this->link)); 204 | 205 | return $this->isLowerCase; 206 | } 207 | 208 | 209 | /** 210 | * @param array|null $internalurls 211 | * 212 | * @return bool 213 | */ 214 | public function 
isInternal($internalurls) 215 | { 216 | if (!isset($internalurls)) { 217 | return true; 218 | } 219 | 220 | if (!isset($this->url['host']) || (strlen($this->url['host']) <= 0)) { 221 | return true; 222 | } 223 | 224 | foreach ($internalurls as $internalurl) { 225 | if (strpos($this->link, $internalurl) === 0) { 226 | return true; 227 | } 228 | } 229 | 230 | return false; 231 | } 232 | 233 | /** 234 | * @param array $list 235 | * 236 | * @return bool 237 | */ 238 | public function check(array $list) 239 | { 240 | $result = true; 241 | foreach ($list as $element) { 242 | $element = "check" . trim(strtolower($element)); 243 | $result &= $this->$element(); 244 | } 245 | 246 | return $result; 247 | } 248 | 249 | /** 250 | * @return string 251 | */ 252 | public function getLink() 253 | { 254 | return $this->link; 255 | } 256 | 257 | /** 258 | * @return array 259 | */ 260 | public function getFiles() 261 | { 262 | return $this->files; 263 | } 264 | 265 | /** 266 | * @return int 267 | */ 268 | public function getStatusCode() 269 | { 270 | return $this->statuscode; 271 | } 272 | 273 | /** 274 | * @return array 275 | */ 276 | public function getErrorMessages() 277 | { 278 | $message = []; 279 | 280 | if (!($this->isAbsolute)) { 281 | $message[] = "Must be absolute (Sample : /article/index.html)"; 282 | } 283 | 284 | if (!($this->isLowerCase)) { 285 | $message[] = "Must be in lowercase (Sample : http://www.example.com/index.html)"; 286 | } 287 | 288 | if (!($this->isExist)) { 289 | $message[] = "Must exist (Http get error)"; 290 | } 291 | 292 | if (!($this->isNotEndSlash)) { 293 | $message[] = "Must have a slash at the end (Sample : http://www.example.com/)"; 294 | } 295 | 296 | return $message; 297 | } 298 | 299 | /** 300 | * @return array 301 | */ 302 | public function getErrorArray() { 303 | $error = []; 304 | 305 | $error['absolute'] = $this->isAbsolute; 306 | $error['lowercase'] = $this->isLowerCase; 307 | $error['exist'] = $this->isExist; 308 | $error['notendslash'] = 
$this->isNotEndSlash; 309 | 310 | return $error; 311 | } 312 | } 313 | -------------------------------------------------------------------------------- /src/GlLinkCheckerReport.php: -------------------------------------------------------------------------------- 1 | write("\xEF\xBB\xBF"); //add ut8 bom to txt file 40 | $resultoutput->write(print_r($result, true)); 41 | 42 | return $resultfile; 43 | } 44 | 45 | /** 46 | * write links test result in a temp html file 47 | * 48 | * @param string $name 49 | * @param GlLinkCheckerError[] $result 50 | * 51 | * @return string 52 | */ 53 | public static function toTmpHtml($name, $result) 54 | { 55 | $resultfile = sys_get_temp_dir() . "/" . uniqid($name) . ".html"; 56 | $html = self::toHtml($name, $result); 57 | file_put_contents($resultfile, $html); 58 | 59 | return $resultfile; 60 | } 61 | 62 | /** 63 | * render report in html format 64 | * 65 | * @param string $title 66 | * @param GlLinkCheckerError[] $links 67 | * 68 | * @return string 69 | */ 70 | private static function toHtml($title,array $links) 71 | { 72 | $html = ''; 73 | $html .= ''; 74 | $html .= '' . $title . ''; 75 | $html .= ''; 102 | $html .= ''; 103 | 104 | /** 105 | * @var GlLinkCheckerError $link 106 | */ 107 | foreach ($links as $link) { 108 | $html .= ''; 116 | continue; 117 | } 118 | 119 | $tooltip = implode(' ', $errors); 120 | $html .= '' . $url . '' . $link->getStatusCode( 121 | ) . $files; 122 | $html .= ''; 123 | } 124 | $html .= '


'; 125 | 126 | return $html; 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /tests/GlLinkCheckerErrorTest.php: -------------------------------------------------------------------------------- 1 | assertEquals('http://dev.glicer.com', $linkerror->getLink()); 37 | $this->assertEquals(['file1','file2'], $linkerror->getFiles()); 38 | } 39 | 40 | public function testCheck() 41 | { 42 | $client = new Client(); 43 | $linkerror = new GlLinkCheckerError($client, 'http://dev.glicer.com',['index.html']); 44 | 45 | $linkerror->check(['exist','endslash','absolute','lowercase']); 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /tests/GlLinkCheckerTest.php: -------------------------------------------------------------------------------- 1 | checkRobotsSitemap(); 39 | $expected = [ 40 | 'disallow' => 41 | ['ok' => ['/img/', '/download/']], 42 | 'sitemap' => 43 | [ 44 | 'ok' => [ 45 | '/sitemap.xml' => 46 | [ 47 | 'ok' => 48 | [ 49 | '/index.html', 50 | '/section/probleme-solution/compresser-css-html-js.html' 51 | ] 52 | ] 53 | ] 54 | ] 55 | ]; 56 | $this->assertEquals($expected, $result); 57 | } 58 | 59 | public function testErrors() 60 | { 61 | $linkChecker = new GlLinkChecker('http://' . WEB_SERVER_HOST . ':' . WEB_SERVER_PORT); 62 | $result = $linkChecker->checkErrors(['/nothing.html'], ['/test.html']); 63 | 64 | $expected = [ 65 | '404' => 66 | ['ok' => ['/nothing.html']], 67 | '403' => 68 | ['error' => ['/test.html']] 69 | ]; 70 | $this->assertEquals($expected, $result); 71 | } 72 | 73 | private function validatelink($link, $links, $result, array $errorarray) 74 | { 75 | $key = array_search($link, $links); 76 | if ($key === false) { 77 | $this->fail($link . " - " . 
var_export($links, TRUE)); 78 | } 79 | 80 | $this->assertEquals( 81 | $errorarray,$result[$key]->getErrorArray(),$link 82 | ); 83 | } 84 | 85 | public function testJson() 86 | { 87 | $finder = new Finder(); 88 | $files = $finder->files()->in('./tests/json')->name("*.json"); 89 | 90 | $linkChecker = new GlLinkChecker(); 91 | $result = $linkChecker->checkFiles( 92 | $files, 93 | function () { 94 | }, 95 | function () { 96 | }, 97 | function () { 98 | } 99 | ); 100 | $this->assertEquals(3, count($result)); 101 | 102 | $links = []; 103 | foreach ($result as $link) { 104 | $links[] = $link->getLink(); 105 | } 106 | 107 | $this->validatelink("http://dev.glicer.com/", $links,$result, ['absolute' => true, 'lowercase' => true, 'exist' => true, 'notendslash' => true]); 108 | $this->validatelink("http://lyon.glicer.com/", $links, $result, ['absolute' => true, 'lowercase' => true, 'exist' => false, 'notendslash' => true]); 109 | $this->validatelink("http://dev.glicer.com/section/probleme-solution/prefixer-automatiquement-css.html", $links, $result, ['absolute' => true, 'lowercase' => true, 'exist' => true, 'notendslash' => true]); 110 | } 111 | 112 | public function testMarkdown() 113 | { 114 | $finder = new Finder(); 115 | $files = $finder->files()->in('./tests/md')->name("*.md"); 116 | 117 | $linkChecker = new GlLinkChecker(); 118 | $result = $linkChecker->checkFiles( 119 | $files, 120 | function () { 121 | }, 122 | function () { 123 | }, 124 | function () { 125 | } 126 | ); 127 | $this->assertEquals(3, count($result)); 128 | 129 | $links = []; 130 | foreach ($result as $link) { 131 | $links[] = $link->getLink(); 132 | } 133 | 134 | $this->validatelink("https://ucarecdn.com/aa1a5994-8de9-4d24-99ce-3a0d686c30bd/-/resize/700x/", $links,$result, ['absolute' => true, 'lowercase' => true, 'exist' => true, 'notendslash' => true]); 135 | $this->validatelink("https://projects.breatheco.de/d/landing-page-with-react#readme", $links, $result, ['absolute' => true, 'lowercase' => true, 
'exist' => true, 'notendslash' => false]); 136 | $this->validatelink("https://ucarecdn.com/8729c2f0-e4a6-4721-9ee9-3f29e6e852b5/", $links, $result, ['absolute' => true, 'lowercase' => true, 'exist' => false, 'notendslash' => true]); 137 | } 138 | 139 | public function testLinks() 140 | { 141 | $finder = new Finder(); 142 | $files = $finder->files()->in('./tests/site1')->name("*.html"); 143 | 144 | $linkChecker = new GlLinkChecker('http://' . WEB_SERVER_HOST . ':' . WEB_SERVER_PORT); 145 | $result = $linkChecker->checkFiles( 146 | $files, 147 | function () { 148 | }, 149 | function () { 150 | }, 151 | function () { 152 | } 153 | ); 154 | 155 | $this->assertEquals(6, count($result)); 156 | 157 | $links = []; 158 | foreach ($result as $link) { 159 | $links[] = $link->getLink(); 160 | } 161 | 162 | $this->validatelink("/section/probleme-solution/compresser-css-html-js.html", $links, $result,['absolute' => true, 'lowercase' => true, 'exist' => true, 'notendslash' => true]); 163 | $this->validatelink("http://dev.glicer.com/", $links,$result, ['absolute' => true, 'lowercase' => true, 'exist' => true, 'notendslash' => true]); 164 | $this->validatelink("http://stop.glicer.com/no-exist.html", $links, $result, ['absolute' => true, 'lowercase' => true, 'exist' => false, 'notendslash' => true]); 165 | $this->validatelink("/index.html", $links, $result, ['absolute' => true, 'lowercase' => true, 'exist' => true, 'notendslash' => true]); 166 | $this->validatelink("http://lyon.glicer.com/", $links, $result, ['absolute' => true, 'lowercase' => true, 'exist' => false, 'notendslash' => true]); 167 | $this->validatelink("http://dev.glicer.com/section/probleme-solution/prefixer-automatiquement-css.html", $links, $result, ['absolute' => true, 'lowercase' => true, 'exist' => true, 'notendslash' => true]); 168 | } 169 | 170 | public function testReport() 171 | { 172 | $finder = new Finder(); 173 | $files = $finder->files()->in('./tests/site1')->name("*.html"); 174 | $linkChecker = new 
GlLinkChecker('http://' . WEB_SERVER_HOST . ':' . WEB_SERVER_PORT); 175 | $result = $linkChecker->checkFiles( 176 | $files, 177 | function () { 178 | }, 179 | function () { 180 | }, 181 | function () { 182 | } 183 | ); 184 | 185 | //sort link by name 186 | usort($result,function(GlLinkCheckerError $linkA,GlLinkCheckerError $linkB) { 187 | return strcmp($linkA->getLink(), $linkB->getlink()); 188 | }); 189 | 190 | $filereport = GlLinkCheckerReport::toTmpHtml('testReport',$result); 191 | 192 | $report = file_get_contents($filereport); 193 | $reportexpected = file_get_contents(__DIR__ . '/expectedReport.html'); 194 | 195 | $this->assertEquals($reportexpected,$report); 196 | } 197 | 198 | /** 199 | * @expectedException \Exception 200 | */ 201 | public function testUnknownExtension() 202 | { 203 | $finder = new Finder(); 204 | $files = $finder->files()->in( __DIR__. '/../')->name("*.yml"); 205 | $linkChecker = new GlLinkChecker('http://' . WEB_SERVER_HOST . ':' . WEB_SERVER_PORT); 206 | $linkChecker->checkFiles( 207 | $files, 208 | function () { 209 | }, 210 | function () { 211 | }, 212 | function () { 213 | } 214 | ); 215 | } 216 | 217 | public function getLinksFromMarkdownProvider() 218 | { 219 | return [ 220 | ['', []], 221 | ['[a link](http://link.com)', ['http://link.com']], 222 | ]; 223 | } 224 | 225 | /** 226 | * @dataProvider getLinksFromMarkdownProvider 227 | */ 228 | public function testGetLinksFromMarkdown($linkString, $expected) 229 | { 230 | $linkChecker = new GlLinkChecker('http://' . WEB_SERVER_HOST . ':' . 
WEB_SERVER_PORT); 231 | $result = $linkChecker->getLinksFromMarkdown($linkString); 232 | 233 | $this->assertSame($expected, $result); 234 | } 235 | } 236 | -------------------------------------------------------------------------------- /tests/bootstrap.php: -------------------------------------------------------------------------------- 1 | start(); 16 | 17 | echo sprintf( 18 | '%s - Web server started on %s:%d', 19 | date('r'), 20 | WEB_SERVER_HOST, 21 | WEB_SERVER_PORT 22 | ) . PHP_EOL; 23 | 24 | //wait server start 25 | sleep(1); 26 | 27 | // Kill the web server when the process ends 28 | register_shutdown_function( 29 | function () use ($process) { 30 | echo 'Web server shutdown' . PHP_EOL; 31 | $process->stop(); 32 | } 33 | ); 34 | 35 | // More bootstrap code -------------------------------------------------------------------------------- /tests/expectedReport.html: -------------------------------------------------------------------------------- 1 | testReport


-------------------------------------------------------------------------------- /tests/json/blog.json: -------------------------------------------------------------------------------- 1 | {"blog": [ 2 | { 3 | "name": "url1", 4 | "title": "title url1", 5 | "link": "http://dev.glicer.com/", 6 | "date": 1470082706 7 | }, 8 | { 9 | "name": "url2", 10 | "title": "title url2", 11 | "link": "http://lyon.glicer.com/", 12 | "date": 1470065791 13 | }, 14 | { 15 | "name": "url3", 16 | "title": "title url3", 17 | "link": "http://dev.glicer.com/section/probleme-solution/prefixer-automatiquement-css.html", 18 | "date": 1470064766 19 | } 20 | ]} -------------------------------------------------------------------------------- /tests/md/example.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Learn HTML" 3 | subtitle: "HTML is to websites what columns are for buildings. 4 | 5 | Learn the basics of HTML - the foundation of the web." 6 | time: "12 minutes" 7 | date: "2018-31-10" 8 | tags: ["fale"] 9 | --- 10 | 11 | [[info]] 12 | | :point_up: Since in the previous chapter we equated houses, stores and buildings to web pages, now we have to say that HTML are the blueprints. 13 | 14 | 15 | # **HTML is the Website Skeleton** 16 | *** 17 | 18 | All web pages have HTML – it’s the structure of EVERYTHING. Think of it as building columns at a construction site. 19 | 20 | HTML makes you divide the website information into parts – similar to the basic parts of a document: header, title, content, footnote, subtitle, etc. Then, with CSS, you can make your page beautiful, and, with JavaScript, make it interactive. 21 | 22 | Originally browsers only knew how to interpret HTML. Websites were simple and neither CSS or JavaScript was used. A website was a simple plain text document with the typical elements any Word Document has: Headings, Bullet lists, Paragraphs, etc. 
23 | 24 | ![buildinghtml](https://ucarecdn.com/aa1a5994-8de9-4d24-99ce-3a0d686c30bd/-/resize/700x/) 25 | 26 | 27 | All tags must open and close. To close a tag you must place the same word but using the `/` symbol. 28 | 29 | # **The Attributes** 30 | *** 31 | Once the `` is defined, we can describe in detail its behavior by assigning attributes to those ``. For example, if we want our HTML document/page to have a link to another page, we use the `` tag, and we assign to it an attribute called **href**, which allows us to specify the URL of the page with which we want to have a connection. 32 | 33 | ```html 34 | Click here and it will take you to Google.com 35 | ``` 36 | 37 | 38 | In theory, you have to use [one of these tags](https://projects.breatheco.de/d/landing-page-with-react#readme) and don’t invent your own because the browser won’t know how to interpret them. You must learn what each tag means and does in order to put them to good use…but, please, don’t worry! There aren’t that many! 🙂 39 | 40 | For the main heading of the document, the tag that we use is `

`. For example: if an online store has an "electronics" category, the title that applies would be "Electronics", and the `

` tag would be written as follows: 41 | 42 | ```html 43 |

Electronic items

44 | ``` 45 | 46 | 47 | ##### **Nested Tags** : 48 | Finally, tags can contain one or more tags within them. For example, if we would like to give a cursive style to the word "electronic" we must wrap that word with the tag ``: 49 | 50 | ```html 51 |

Electronic Tags

52 | ``` 53 | 54 | ## Blank Spaces and Line Jumps 55 | *** 56 | The browser ignores blank spaces and end of lines. If we want to jump one line, we have to use the `
` tag. If we want more "spaces" we need to insert one ` ` per each blank space (yes, we know it’s weird, but it is what it is). 57 | 58 | **These three alternatives will look the same (spaces and jumps of line will be ignored):** 59 | ```html 60 | HelloWorld 61 | ``` 62 | ```html 63 | Hello 64 | World 65 | ``` 66 | 67 | ```html 68 | Hello World 69 | ``` 70 | 71 | 72 | # **Page Structure** 73 | *** 74 | All pages must begin with the `` statement, then the `` and the `` should follow. These tags **must** contain other tags within them (nested tags) because they will split the page in 2 main parts: the HEAD and the BODY: 75 | 76 | 77 | ```html 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | ``` 89 | 90 | Lets simulate how a browser thinks: Imagine a user on his browser (client side) that types the URL: breatheco.de 91 | 92 | + The server will open the default HTML file on that server, which will probably be: index.html. 93 | + Then, it will read its content and interpret it as HTML (because the extension of the file is index.html). 94 | + The user will not see the text content of the file, instead it will view a visual interpretation of that text. 95 | 96 | As you can see, the page in question will include AT LEAST the following tags: 97 | 98 | ![html](https://ucarecdn.com/8729c2f0-e4a6-4721-9ee9-3f29e6e852b5/) 99 | 100 | |**Name** |**Tags** |**Description** | 101 | |:----------|:----------|:-----------------| 102 | |HTML |`` |We must begin by letting the browser know that this is an HTML document. We can also specify the HTML version that we are using. | 103 | |Head |`` |Everything that is written inside of the HEAD won’t be seen by the user. It’s the part of the page where the developer specifies information about the website itself: the language being used, what the website is about, the necessary fonts, the icon that the tab will have on the browser (favicon), and many other important things. 
| 104 | |Body |`` |Here you will place all the content that will be viewed by the end user.
If this were MS Word, the body would mark the beginning of your page content (the first line of your document). | 105 | 106 | # **The \<head\> is like the Envelope of a Letter.** 107 | *** 108 | We read the envelope of a letter to find out information about the letter itself, but not about its content. Here you can find out who wrote the letter, in what language it is written, where it is from, etc. 109 | 110 | In the case of HTML, the `<head>` can contain the following tags (among less important ones): 111 | 112 | |**Name** |**Tag** |**Description** | 113 | |:----------|:---------|:-----------------| 114 | |Title |`<title>` |The title appears in the browser’s window, it’s also used when you share the page through social media: Twitter, Instagram, Facebook, etc. All those networks use the title of the page as the excerpt when a user copies the URL of your page to share on their wall. | 115 | |Meta |`<meta>` |The meta tags describe a document. They are used to specify things like: the author, title, date, keywords, descriptions, etc. Search engines love these tags because they allow an easier comprehension of the content before it is read. | 116 | |Link |`<link>` |Used for linking the page with the CSS style sheets. In the CSS chapter we will learn how to create style sheets and we will be able to import them using this tag. | 117 | |Style |`<style>` |If we can’t or don’t want to import a CSS style sheet, we may also define styles directly on the HTML document inside this tag. This is a practice that we rarely recommend and should only be used when you don’t have any other choice. | 118 | |Script |`<script>` |Used to add JavaScript code to the page. All of the JavaScript code must be contained in these tags that can be used also in the BODY, if desired. The difference is that any JavaScript code that we place in a script tag in the BODY won’t be available when the page begins to run (that’s exactly why the HEAD is so useful). 
| 119 | 120 | # **The \<body\> is Similar to any MS Word Document** 121 | 122 | Ok, now that we are familiar with the general and necessary structure of the page, let's review the tags we can and must use to define the content of the page. 123 | 124 | Remember – for the fifteenth time – that a web page is…a text document! That’s right, if you knew the answer before you read it you are getting it! And, if not, don’t worry. We’ve never known of anyone who gets HTML and CSS rather quickly ;). 125 | 126 | Let's see how a website compares to a Word document: 127 | -------------------------------------------------------------------------------- /tests/site1/download/index.html: -------------------------------------------------------------------------------- 1 | <html> 2 | 3 | </html> -------------------------------------------------------------------------------- /tests/site1/img/index.html: -------------------------------------------------------------------------------- 1 | <html> 2 | 3 | </html> -------------------------------------------------------------------------------- /tests/site1/index.html: -------------------------------------------------------------------------------- 1 | <!DOCTYPE html> 2 | <html> 3 | <head> 4 | <title>Test Home 5 | 6 | 7 | 8 | 9 |

10 |

11 | Un p'tit test dans le texte : http://stop.glicer.com/no-exist.html 12 |

13 | 14 | -------------------------------------------------------------------------------- /tests/site1/robots.txt: -------------------------------------------------------------------------------- 1 | User-agent: * 2 | Disallow: /img/ 3 | Disallow: /download/ 4 | Sitemap: /sitemap.xml 5 | -------------------------------------------------------------------------------- /tests/site1/section/probleme-solution/compresser-css-html-js.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Test Article 5 | 6 | 7 | 8 | 9 |
10 |

11 | Un p'tit test dans le texte : http://dev.glicer.com/section/probleme-solution/prefixer-automatiquement-css.html 12 |

13 |
14 | 15 | -------------------------------------------------------------------------------- /tests/site1/sitemap.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | /index.html 5 | 2015-04-04T00:45:34+02:00 6 | monthly 7 | 1.0 8 | 9 | 10 | /section/probleme-solution/compresser-css-html-js.html 11 | 2015-04-04T00:45:34+02:00 12 | monthly 13 | 1.0 14 | 15 | 16 | --------------------------------------------------------------------------------