├── README.md
├── normalize_url.php
└── normalize_url_test.php


/README.md:
--------------------------------------------------------------------------------
 1 | PHP library for normalizing URLs as specified in
 2 | [RFC 3986](http://en.wikipedia.org/wiki/URI_scheme#Generic_syntax).
 3 | The library doesn't require any dependencies.
 4 | The library is simple to use: include `normalize_url.php`
 5 | and call `normalizeURL()` with the input URL as a string argument.
 6 | 
 7 | Supports:
 8 | 
 9 | - Removal of the `www.` subdomain, but only when www.foo.bar resolves to the same address as foo.bar.
10 | - Removal of default ports (HTTP and HTTPS supported by default;
11 |   edit `$defaultSchemes` to support additional protocols and ports if you wish)
12 | - Removal of duplicate slashes
13 | - Decoding unreserved characters
14 | - Removal of default directory index files
15 | - Removal of dot segments in URL path
16 | - Sorting GET params alphabetically
17 | 


--------------------------------------------------------------------------------
/normalize_url.php:
--------------------------------------------------------------------------------
  1 | <?php
  2 | /**
  3 |  * Copyright (c) 2011 Niklas A. Femerstrand <niklas@flattr.com>
  4 | *
  5 | * Permission is hereby granted, free of charge, to any person obtaining a copy
  6 | * of this software and associated documentation files (the "Software"), to deal
  7 | * in the Software without restriction, including without limitation the rights
  8 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9 | * copies of the Software, and to permit persons to whom the Software is
 10 | * furnished to do so, subject to the following conditions:
 11 | *
 12 | * The above copyright notice and this permission notice shall be included in
 13 | * all copies or substantial portions of the Software.
 14 | *
 15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 18 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 21 | * THE SOFTWARE.
 22 | */
 23 | 
 24 | function normalizeURL($url)
 25 | {
 26 | 	$newUrl = "";
 27 | 	$url = parse_url($url);	
 28 | 	$defaultSchemes = array("http" => 80, "https" => 443);
 29 | 
 30 | 	if(isset($url['scheme']))
 31 | 	{
 32 | 		$url['scheme'] = strtolower($url['scheme']);
 33 | 		// Strip scheme default ports
 34 | 		if(isset($defaultSchemes[$url['scheme']]) && isset($url['port']) && $defaultSchemes[$url['scheme']] == $url['port'])
 35 | 			unset($url['port']);
 36 | 		$newUrl .= "{$url['scheme']}://";
 37 | 	}
 38 | 
 39 | 	if(isset($url['host']))
 40 | 	{
 41 | 		$url['host'] = strtolower($url['host']);
 42 | 		// Seems like a valid domain, properly validation should be made in higher layers.
 43 | 		if(preg_match("/[a-z]+\Z/", $url['host']))
 44 | 		{
 45 | 			if(preg_match("/^www\./", $url['host']) && gethostbyname($url['host']) == gethostbyname(str_replace("www.", "", $url['host'])))
 46 | 				$newUrl .= str_replace("www.", "", $url['host']);
 47 | 			else
 48 | 				$newUrl .= $url['host'];
 49 | 		}
 50 | 		else
 51 | 			$newUrl .= $url['host'];
 52 | 	}
 53 | 
 54 | 	if(isset($url['port']))
 55 | 		$newUrl .= ":{$url['port']}";
 56 | 
 57 | 	if(isset($url['path']))
 58 | 	{
 59 | 		// Case normalization
 60 | 		$url['path'] = preg_replace('/(%([0-9abcdef][0-9abcdef]))/ex', "'%'.strtoupper('\\2')", $url['path']);
 61 | 		//Strip duplicate slashes
 62 | 		while(preg_match("/\/\//", $url['path']))
 63 | 			$url['path'] = preg_replace("/\/\//", "/", $url['path']);
 64 | 
 65 | 		/*
 66 | 		 * Decode unreserved characters, http://www.apps.ietf.org/rfc/rfc3986.html#sec-2.3
 67 | 		 * Heavily rewritten version of urlDecodeUnreservedChars() in Glen Scott's url-normalizer.
 68 | 		 */
 69 | 
 70 | 		$u = array();
 71 | 		for ($o = 65; $o <= 90; $o++)
 72 | 			$u[] = dechex($o);
 73 | 		for ($o = 97; $o <= 122; $o++)
 74 | 			$u[] = dechex($o);
 75 | 		for ($o = 48; $o <= 57; $o++)
 76 | 			$u[] = dechex($o);
 77 | 		$chrs = array('-', '.', '_', '~');
 78 | 		foreach($chrs as $chr)
 79 | 			$u[] = dechex(ord($chr));
 80 | 		$url['path'] = preg_replace_callback(array_map(create_function('$str', 'return "/%" . strtoupper($str) . "/x";'), $u),
 81 | 	                                                   create_function('$matches', 'return chr(hexdec($matches[0]));'), $url['path']);
 82 | 	    // Remove directory index
 83 | 		$defaultIndexes = array("/default\.aspx/" => "default.aspx", "/default\.asp/"  => "default.asp",
 84 | 	                            "/index\.html/"   => "index.html",   "/index\.htm/"    => "index.htm",
 85 | 	                            "/default\.html/" => "default.html", "/default\.htm/"  => "default.htm",
 86 | 	                            "/index\.php/"    => "index.php",    "/index\.jsp/"    => "index.jsp");
 87 | 		foreach($defaultIndexes as $index => $strip)
 88 | 		{
 89 | 			if(preg_match($index, $url['path']))
 90 | 				$url['path'] = str_replace($strip, "", $url['path']);
 91 | 		}
 92 | 	
 93 | 	    /**
 94 | 	     * Path segment normalization, http://www.apps.ietf.org/rfc/rfc3986.html#sec-5.2.4
 95 | 	     * Heavily rewritten version of removeDotSegments() in Glen Scott's url-normalizer.
 96 | 	     */
 97 | 		
 98 | 		$new_path = '';
 99 | 		while(!empty($url['path']))
100 | 		{
101 | 			if(preg_match('!^(\.\./|\./)!x', $url['path']))
102 | 				$url['path'] = preg_replace('!^(\.\./|\./)!x', '', $url['path']);
103 | 			elseif(preg_match('!^(/\./)!x', $url['path'], $matches) || preg_match('!^(/\.)$!x', $url['path'], $matches))
104 | 				$url['path'] = preg_replace("!^" . $matches[1] . "!", '/', $url['path']);
105 | 			elseif(preg_match('!^(/\.\./|/\.\.)!x', $url['path'], $matches))
106 | 			{
107 | 				$url['path'] = preg_replace( '!^' . preg_quote( $matches[1], '!' ) . '!x', '/', $url['path'] );
108 | 				$new_path = preg_replace( '!/([^/]+)$!x', '', $new_path );
109 | 			}
110 | 			elseif(preg_match('!^(\.|\.\.)$!x', $url['path']))
111 | 				$url['path'] = preg_replace('!^(\.|\.\.)$!x', '', $url['path']);
112 | 			else
113 | 			{
114 | 				if(preg_match('!(/*[^/]*)!x', $url['path'], $matches))
115 | 				{
116 | 					$first_path_segment = $matches[1];
117 | 					$url['path'] = preg_replace( '/^' . preg_quote( $first_path_segment, '/' ) . '/', '', $url['path'], 1 );
118 | 					$new_path .= $first_path_segment;
119 | 				}
120 | 			}
121 | 		}
122 | 		$newUrl .= $new_path;
123 | 	}
124 | 	
125 | 	if(isset($url['fragment']))
126 | 		unset($url['fragment']);
127 | 	
128 | 	// Sort GET params alphabetically, not because the RFC requires it but because it's cool!
129 | 	if(isset($url['query']))
130 | 	{
131 | 		if(preg_match("/&/", $url['query']))
132 | 		{
133 | 			$s = explode("&", $url['query']);
134 | 			$url['query'] = "";
135 | 			sort($s);
136 | 			foreach($s as $z)
137 | 				$url['query'] .= "{$z}&";
138 | 			$url['query'] = preg_replace("/&\Z/", "", $url['query']);
139 | 		}
140 | 		$newUrl .= "?{$url['query']}";
141 | 	}
142 | 
143 | 	return $newUrl;
144 | }


--------------------------------------------------------------------------------
/normalize_url_test.php:
--------------------------------------------------------------------------------
 1 | <?php
 2 | class NormalizeUrlTests extends PHPUnit_Framework_TestCase
 3 | {
 4 | 	public static function setUpBeforeClass()
 5 | 	{
 6 | 		require("normalize_url.php");
 7 | 	}
 8 | 	
 9 | 	public function testWwwSubdomain()
10 | 	{
11 | 		$this->assertEquals("http://example.com/", normalizeURL("http://www.example.com/"));
12 | 	}
13 | 	
14 | 	public function testDefaultPorts()
15 | 	{
16 | 		$this->assertEquals("http://example.com/", normalizeURL("http://example.com:80/"));
17 | 		$this->assertEquals("https://example.com/", normalizeURL("https://example.com:443/"));
18 | 	}
19 | 	
20 | 	public function testDuplicateSlashes()
21 | 	{
22 | 		$this->assertEquals("http://example.com/", normalizeURL("http://example.com///"));
23 | 	}
24 | 	
25 | 	public function testDecodeUnreservedChars()
26 | 	{
27 | 		$this->assertEquals("http://example.com/c/", normalizeURL("http://example.com/%63/"));
28 | 	}
29 | 	
30 | 	public function testDirectoryIndex()
31 | 	{
32 | 		$this->assertEquals("http://example.com/", normalizeURL("http://example.com/index.html"));
33 | 	}
34 | 	
35 | 	public function testPathSegment()
36 | 	{
37 | 		$this->assertEquals("http://example.com/a/b/c/", normalizeURL("http://example.com/a/./b/../b/%63/"));
38 | 	}
39 | 	
40 | 	public function testAlphabeticParams()
41 | 	{
42 | 		$this->assertEquals("http://example.com/?a=b&c=d", normalizeURL("http://example.com/?c=d&a=b"));
43 | 	}
44 | }


--------------------------------------------------------------------------------