├── README.md ├── normalize_url.php └── normalize_url_test.php /README.md: -------------------------------------------------------------------------------- 1 | PHP library for normalizing URLs as specified in 2 | [RFC 3986](http://en.wikipedia.org/wiki/URI_scheme#Generic_syntax). 3 | The library doesn't require any dependencies. 4 | The library is very simple to use: simply include it 5 | and call `normalizeURL()` with the input URL as a str param. 6 | 7 | Supports: 8 | 9 | - Removal of WWW subdomains, e.g. if www.foo.bar points to the same location as foo.bar. 10 | - Removal of default ports (HTTP and HTTPS supported by default; 11 | edit `$defaultSchemes` to support additional protocols and ports if you wish) 12 | - Removal of duplicate slashes 13 | - Decoding unreserved characters 14 | - Removal of default directory index files 15 | - Removal of dot segments in URL path 16 | - Sorting GET params alphabetically 17 | -------------------------------------------------------------------------------- /normalize_url.php: -------------------------------------------------------------------------------- 1 | 4 | * 5 | * Permission is hereby granted, free of charge, to any person obtaining a copy 6 | * of this software and associated documentation files (the "Software"), to deal 7 | * in the Software without restriction, including without limitation the rights 8 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | * copies of the Software, and to permit persons to whom the Software is 10 | * furnished to do so, subject to the following conditions: 11 | * 12 | * The above copyright notice and this permission notice shall be included in 13 | * all copies or substantial portions of the Software. 14 | * 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
/**
 * Normalize a URL along the lines of RFC 3986, section 6.2.
 *
 * Normalizations applied:
 *  - scheme and host are lower-cased;
 *  - the port is dropped when it is the default one for the scheme;
 *  - a leading "www." host label is dropped, but only when DNS resolves the
 *    host to the same address with and without it (this performs blocking
 *    gethostbyname() lookups);
 *  - percent-encoded octets in the path are upper-cased, and octets that
 *    encode unreserved characters (ALPHA / DIGIT / "-" / "." / "_" / "~")
 *    are decoded;
 *  - runs of slashes in the path are collapsed to a single slash;
 *  - a trailing directory index file name (index.html, index.php, ...) is
 *    removed;
 *  - "." and ".." path segments are resolved (RFC 3986, section 5.2.4);
 *  - query parameters are sorted alphabetically;
 *  - the fragment is discarded.
 *
 * User info ("user:pass@") is not carried over to the result.
 *
 * @param string $url The URL to normalize.
 *
 * @return string The normalized URL, or an empty string when parse_url()
 *                rejects the input.
 */
function normalizeURL($url)
{
    $parts = parse_url($url);
    if ($parts === false) {
        // Seriously malformed input: there is nothing to normalize.
        return '';
    }

    $defaultSchemes = ['http' => 80, 'https' => 443];
    $defaultIndexes = [
        'default.aspx', 'default.asp',
        'index.html', 'index.htm',
        'default.html', 'default.htm',
        'index.php', 'index.jsp',
    ];
    $newUrl = '';

    if (isset($parts['scheme'])) {
        $scheme = strtolower($parts['scheme']);
        // Strip the port when it is the scheme's default.
        if (isset($defaultSchemes[$scheme], $parts['port'])
            && $defaultSchemes[$scheme] === (int) $parts['port']
        ) {
            unset($parts['port']);
        }
        $newUrl .= "{$scheme}://";
    }

    if (isset($parts['host'])) {
        $host = strtolower($parts['host']);
        // Only hosts ending in letters are treated as domain names (this skips
        // IP literals); real validation belongs in higher layers.
        if (strpos($host, 'www.') === 0 && preg_match('/[a-z]+\z/', $host) === 1) {
            // Remove the leading label only; "www." may legitimately occur
            // again further into the host name.
            $bareHost = (string) substr($host, 4);
            if ($bareHost !== '' && gethostbyname($host) === gethostbyname($bareHost)) {
                $host = $bareHost;
            }
        }
        $newUrl .= $host;
    }

    if (isset($parts['port'])) {
        $newUrl .= ":{$parts['port']}";
    }

    if (isset($parts['path'])) {
        // One pass over every percent-encoded octet: decode it when it stands
        // for an unreserved character, otherwise upper-case its hex digits.
        // A single pass guarantees that decoded output is never decoded again.
        $path = preg_replace_callback(
            '/%([0-9A-Fa-f]{2})/',
            static function (array $match) {
                $char = chr(hexdec($match[1]));

                return preg_match('/^[A-Za-z0-9._~-]$/D', $char) === 1
                    ? $char
                    : '%' . strtoupper($match[1]);
            },
            $parts['path']
        );

        // Collapse duplicate slashes.
        $path = preg_replace('!/{2,}!', '/', $path);

        // Remove a directory index, but only when it is the complete final
        // segment ("/reindex.php" or "/index.php/foo" must stay untouched).
        $lastSlash = strrpos($path, '/');
        $lastSegment = $lastSlash === false ? $path : (string) substr($path, $lastSlash + 1);
        if (in_array($lastSegment, $defaultIndexes, true)) {
            $path = (string) substr($path, 0, strlen($path) - strlen($lastSegment));
        }

        // Remove dot segments, following the algorithm of RFC 3986, 5.2.4.
        // The loop tests against '' explicitly: empty() would also stop on
        // a remaining path of "0".
        $output = '';
        while ($path !== '') {
            if (preg_match('!^\.\.?/!', $path, $m) === 1) {
                // A: leading "../" or "./" is dropped.
                $path = (string) substr($path, strlen($m[0]));
            } elseif (strpos($path, '/./') === 0) {
                // B: "/./" becomes "/".
                $path = (string) substr($path, 2);
            } elseif ($path === '/.') {
                $path = '/';
            } elseif (strpos($path, '/../') === 0 || $path === '/..') {
                // C: "/.." as a complete segment pops the last output segment.
                // "/..foo" is an ordinary segment and must not land here.
                $path = $path === '/..' ? '/' : (string) substr($path, 3);
                $output = preg_replace('!/?[^/]*$!D', '', $output);
            } elseif ($path === '.' || $path === '..') {
                // D: a lone dot segment vanishes.
                $path = '';
            } else {
                // E: move the first segment (with its leading slash) to the output.
                preg_match('!^/?[^/]*!', $path, $m);
                $output .= $m[0];
                $path = (string) substr($path, strlen($m[0]));
            }
        }
        $newUrl .= $output;
    }

    // The fragment is intentionally left out of the result.

    // Sort query parameters alphabetically; the RFC does not require it, but
    // it makes URLs that differ only in parameter order compare equal.
    if (isset($parts['query'])) {
        $params = explode('&', $parts['query']);
        sort($params);
        $newUrl .= '?' . implode('&', $params);
    }

    return $newUrl;
}
| { 32 | $this->assertEquals("http://example.com/", normalizeURL("http://example.com/index.html")); 33 | } 34 | 35 | public function testPathSegment() 36 | { 37 | $this->assertEquals("http://example.com/a/b/c/", normalizeURL("http://example.com/a/./b/../b/%63/")); 38 | } 39 | 40 | public function testAlphabeticParams() 41 | { 42 | $this->assertEquals("http://example.com/?a=b&c=d", normalizeURL("http://example.com/?c=d&a=b")); 43 | } 44 | } --------------------------------------------------------------------------------