├── gradle.properties ├── settings.gradle ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── .travis.yml ├── src ├── main │ └── java │ │ └── com │ │ └── shekhargulati │ │ └── urlcleaner │ │ ├── UrlCleanerException.java │ │ ├── Options.java │ │ ├── UrlExtractor.java │ │ └── UrlCleaner.java └── test │ └── java │ └── com │ └── shekhargulati │ └── urlcleaner │ ├── UrlExtractorTest.java │ └── UrlCleanerSpec.java ├── LICENSE.txt ├── .gitignore ├── gradlew.bat ├── README.md └── gradlew /gradle.properties: -------------------------------------------------------------------------------- 1 | version=0.5.0-SNAPSHOT -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | rootProject.name = 'urlcleaner' 2 | 3 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shekhargulati/urlcleaner/HEAD/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | install: ./gradlew assemble -x :signArchives 3 | script: ./gradlew clean build -x :signArchives 4 | jdk: 5 | - oraclejdk8 6 | after_success: 7 | - bash <(curl -s https://codecov.io/bash) -------------------------------------------------------------------------------- /src/main/java/com/shekhargulati/urlcleaner/UrlCleanerException.java: -------------------------------------------------------------------------------- 1 | package com.shekhargulati.urlcleaner; 2 | 3 | public class UrlCleanerException extends RuntimeException { 4 | 5 | public UrlCleanerException(Throwable cause) { 6 | super(cause); 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | #Sat Mar 12 19:55:29 IST 2016 2 | distributionBase=GRADLE_USER_HOME 3 | distributionPath=wrapper/dists 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | distributionUrl=https\://services.gradle.org/distributions/gradle-2.5-all.zip 7 | -------------------------------------------------------------------------------- /src/main/java/com/shekhargulati/urlcleaner/Options.java: -------------------------------------------------------------------------------- 1 | package com.shekhargulati.urlcleaner; 2 | 3 | public class Options { 4 | 5 | public static final Options DEFAULT_OPTIONS = new Options(); 6 | 7 | private boolean stripFragment = true; 8 | private boolean stripWWW = true; 9 | 10 | private Options() { 11 | } 12 | 13 | public Options(boolean stripFragment, boolean stripWWW) { 14 | this.stripFragment = stripFragment; 15 | this.stripWWW = stripWWW; 16 | } 17 | 18 | public boolean isStripFragment() { 19 | return stripFragment; 20 | } 21 | 22 | public boolean isStripWWW() { 23 | return stripWWW; 24 | } 25 | } -------------------------------------------------------------------------------- /src/main/java/com/shekhargulati/urlcleaner/UrlExtractor.java: -------------------------------------------------------------------------------- 1 | package com.shekhargulati.urlcleaner; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | import java.util.regex.Matcher; 6 | import java.util.regex.Pattern; 7 | 8 | public interface UrlExtractor { 9 | 10 | Pattern URL_PATTERN = Pattern.compile("((https?|ftp|gopher|telnet|file):((//)|(\\\\))+[\\w\\d:#@%/;$()~_?\\+-=\\\\\\.&]*)", Pattern.CASE_INSENSITIVE); 11 | 12 | static List extractUrls(final String text) { 13 | List urls = new ArrayList<>(); 14 | Matcher matcher = URL_PATTERN.matcher(text); 15 | while (matcher.find()) { 16 | urls.add(text.substring(matcher.start(0), matcher.end(0))); 17 | } 18 | return urls; 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/test/java/com/shekhargulati/urlcleaner/UrlExtractorTest.java: -------------------------------------------------------------------------------- 1 | package com.shekhargulati.urlcleaner; 2 | 3 | import org.junit.Test; 4 | 5 | import java.util.List; 6 | 7 | import static org.hamcrest.CoreMatchers.equalTo; 8 | import static org.hamcrest.core.IsCollectionContaining.hasItems; 9 | import static org.junit.Assert.assertThat; 10 | 11 | public class UrlExtractorTest { 12 | 13 | @Test 14 | public void shouldExtractAllValidUrlFromText() throws Exception { 15 | final String text = "CloudABI now available for Arch Linux https://nuxi.nl/doc/archlinux/ (cmts https://google.com )"; 16 | List urls = UrlExtractor.extractUrls(text); 17 | assertThat(urls.size(), equalTo(2)); 18 | assertThat(urls, hasItems(equalTo("https://nuxi.nl/doc/archlinux/"), equalTo("https://google.com"))); 19 | } 20 | 21 | @Test 22 | public void shouldReturnEmptyListWhenNoUrlExist() throws Exception { 23 | final String text = "CloudABI now available for Arch Linux"; 24 | List urls = UrlExtractor.extractUrls(text); 25 | assertThat(urls.size(), equalTo(0)); 26 | } 27 | } -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright 2016 Shekhar Gulati . 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by .ignore support plugin (hsz.mobi) 2 | ### JetBrains template 3 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio 4 | 5 | *.iml 6 | 7 | ## Directory-based project format: 8 | .idea/ 9 | # if you remove the above rule, at least ignore the following: 10 | 11 | # User-specific stuff: 12 | # .idea/workspace.xml 13 | # .idea/tasks.xml 14 | # .idea/dictionaries 15 | 16 | # Sensitive or high-churn files: 17 | # .idea/dataSources.ids 18 | # .idea/dataSources.xml 19 | # .idea/sqlDataSources.xml 20 | # .idea/dynamic.xml 21 | # .idea/uiDesigner.xml 22 | 23 | # Gradle: 24 | # .idea/gradle.xml 25 | # .idea/libraries 26 | 27 | # Mongo Explorer plugin: 28 | # .idea/mongoSettings.xml 29 | 30 | ## File-based project format: 31 | *.ipr 32 | *.iws 33 | 34 | ## Plugin-specific files: 35 | 36 | # IntelliJ 37 | /out/ 38 | 39 | # mpeltonen/sbt-idea plugin 40 | .idea_modules/ 41 | 42 | # JIRA plugin 43 | atlassian-ide-plugin.xml 44 | 45 | # Crashlytics plugin (for Android Studio and IntelliJ) 46 | com_crashlytics_export_strings.xml 47 | crashlytics.properties 48 | crashlytics-build.properties 49 | ### Java template 50 | *.class 51 | 52 | # Mobile Tools for Java (J2ME) 53 | .mtj.tmp/ 54 | 55 | # Package Files # 56 | *.jar 57 | *.war 58 | *.ear 59 | 60 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 61 | hs_err_pid* 62 | ### Gradle template 63 | .gradle 64 | build/ 65 | 66 | # Ignore Gradle GUI config 67 | gradle-app.setting 68 | 69 | # Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored) 70 | !gradle-wrapper.jar 71 | 72 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @if "%DEBUG%" == "" @echo off 2 | @rem ########################################################################## 3 | @rem 4 | @rem Gradle startup script for Windows 5 | @rem 6 | @rem ########################################################################## 7 | 8 | @rem Set local scope for the variables with windows NT shell 9 | if "%OS%"=="Windows_NT" setlocal 10 | 11 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 12 | set DEFAULT_JVM_OPTS= 13 | 14 | set DIRNAME=%~dp0 15 | if "%DIRNAME%" == "" set DIRNAME=. 16 | set APP_BASE_NAME=%~n0 17 | set APP_HOME=%DIRNAME% 18 | 19 | @rem Find java.exe 20 | if defined JAVA_HOME goto findJavaFromJavaHome 21 | 22 | set JAVA_EXE=java.exe 23 | %JAVA_EXE% -version >NUL 2>&1 24 | if "%ERRORLEVEL%" == "0" goto init 25 | 26 | echo. 27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 28 | echo. 29 | echo Please set the JAVA_HOME variable in your environment to match the 30 | echo location of your Java installation. 31 | 32 | goto fail 33 | 34 | :findJavaFromJavaHome 35 | set JAVA_HOME=%JAVA_HOME:"=% 36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 37 | 38 | if exist "%JAVA_EXE%" goto init 39 | 40 | echo. 41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 42 | echo. 43 | echo Please set the JAVA_HOME variable in your environment to match the 44 | echo location of your Java installation. 45 | 46 | goto fail 47 | 48 | :init 49 | @rem Get command-line arguments, handling Windowz variants 50 | 51 | if not "%OS%" == "Windows_NT" goto win9xME_args 52 | if "%@eval[2+2]" == "4" goto 4NT_args 53 | 54 | :win9xME_args 55 | @rem Slurp the command line arguments. 56 | set CMD_LINE_ARGS= 57 | set _SKIP=2 58 | 59 | :win9xME_args_slurp 60 | if "x%~1" == "x" goto execute 61 | 62 | set CMD_LINE_ARGS=%* 63 | goto execute 64 | 65 | :4NT_args 66 | @rem Get arguments from the 4NT Shell from JP Software 67 | set CMD_LINE_ARGS=%$ 68 | 69 | :execute 70 | @rem Setup the command line 71 | 72 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 73 | 74 | @rem Execute Gradle 75 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 76 | 77 | :end 78 | @rem End local scope for the variables with windows NT shell 79 | if "%ERRORLEVEL%"=="0" goto mainEnd 80 | 81 | :fail 82 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 83 | rem the _cmd.exe /c_ return code! 84 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 85 | exit /b 1 86 | 87 | :mainEnd 88 | if "%OS%"=="Windows_NT" endlocal 89 | 90 | :omega 91 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | urlcleaner [![Build Status](https://travis-ci.org/shekhargulati/urlcleaner.svg?branch=master)](https://travis-ci.org/shekhargulati/urlcleaner) [![codecov.io](https://codecov.io/github/shekhargulati/urlcleaner/coverage.svg?branch=master)](https://codecov.io/github/shekhargulati/urlcleaner?branch=master) [![License](https://img.shields.io/:license-mit-blue.svg)](./LICENSE.txt) 2 | ----- 3 | 4 | This library provides functions to normalize, unshorted, and extract URL. 5 | 6 | `urlcleaner` API uses JDK 8. 7 | 8 | > [URL normalization](https://en.wikipedia.org/wiki/URL_normalization) is the process by which URLs are modified and standardized in a consistent manner. The goal of the normalization process is to transform a URL into a normalized URL so it is possible to determine if two syntactically different URLs may be equivalent. 9 | 10 | Getting Started 11 | -------- 12 | 13 | To use `urlcleaner` in your application, you have to add `urlcleaner` in your classpath. urlcleaner is available on Maven Central so you just need to add dependency to your favorite build tool as show below. 14 | 15 | For Apache Maven users, please add following to your pom.xml. 16 | 17 | ```xml 18 | 19 | 20 | com.shekhargulati.urlcleaner 21 | urlcleaner 22 | 0.4.0 23 | jar 24 | 25 | 26 | ``` 27 | 28 | Gradle users can add following to their build.gradle file. 29 | 30 | ```groovy 31 | compile(group: 'com.shekhargulati.urlcleaner', name: 'urlcleaner', version: '0.4.0', ext: 'jar') 32 | ``` 33 | 34 | ## URL Normalization Usage 35 | 36 | ```java 37 | import com.shekhargulati.urlcleaner.UrlCleaner; 38 | 39 | UrlCleaner.normalizeUrl("shekhargulati.com") // http://shekhargulati.com 40 | 41 | UrlCleaner.normalizeUrl("https://www.shekhargulati.com:443") // https://shekhargulati.com 42 | 43 | UrlCleaner.normalizeUrl("www.shekhargulati.com") // http://shekhargulati.com 44 | 45 | UrlCleaner.normalizeUrl("http://shekhargulati.com/%7Eabout/") // http://shekhargulati.com/~about 46 | 47 | UrlCleaner.normalizeUrl("http://shekhargulati.com/hello%5Fabout/") // http://shekhargulati.com/hello_about 48 | 49 | UrlCleaner.normalizeUrl("http://shekhargulati.com?lang=en&article=fred") // http://shekhargulati.com?article=fred&lang=en 50 | 51 | UrlCleaner.normalizeUrl("http://xn--xample-hva.com") // http://êxample.com 52 | ``` 53 | 54 | ## Unshorten URL Usage 55 | 56 | ```java 57 | UrlCleaner.unshortenUrl("http://bit.ly/1Wtrl9t"); // http://shekhargulati.com/ 58 | 59 | // It can also work for multi level shortened URL. The below URL is shortened 4 times 60 | UrlCleaner.unshortenUrl("http://bit.ly/1pwuGdF"); //http://www.bloomberg.com/news/articles/2016-03-17/unmasking-startup-l-jackson-silicon-valley-s-favorite-twitter-persona 61 | ``` 62 | 63 | 64 | ## URL Extraction 65 | 66 | ```java 67 | final String text = "CloudABI now available for Arch Linux https://nuxi.nl/doc/archlinux/ (cmts https://google.com )"; 68 | List urls = UrlExtractor.extractUrls(text); 69 | // urls -> [https://nuxi.nl/doc/archlinux/,https://google.com] 70 | ``` 71 | 72 | License 73 | ------- 74 | 75 | urlcleaner is licensed under the MIT License - see the `LICENSE` file for details. 76 | 77 | 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | ############################################################################## 4 | ## 5 | ## Gradle start up script for UN*X 6 | ## 7 | ############################################################################## 8 | 9 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 10 | DEFAULT_JVM_OPTS="" 11 | 12 | APP_NAME="Gradle" 13 | APP_BASE_NAME=`basename "$0"` 14 | 15 | # Use the maximum available, or set MAX_FD != -1 to use that value. 16 | MAX_FD="maximum" 17 | 18 | warn ( ) { 19 | echo "$*" 20 | } 21 | 22 | die ( ) { 23 | echo 24 | echo "$*" 25 | echo 26 | exit 1 27 | } 28 | 29 | # OS specific support (must be 'true' or 'false'). 30 | cygwin=false 31 | msys=false 32 | darwin=false 33 | case "`uname`" in 34 | CYGWIN* ) 35 | cygwin=true 36 | ;; 37 | Darwin* ) 38 | darwin=true 39 | ;; 40 | MINGW* ) 41 | msys=true 42 | ;; 43 | esac 44 | 45 | # For Cygwin, ensure paths are in UNIX format before anything is touched. 46 | if $cygwin ; then 47 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"` 48 | fi 49 | 50 | # Attempt to set APP_HOME 51 | # Resolve links: $0 may be a link 52 | PRG="$0" 53 | # Need this for relative symlinks. 54 | while [ -h "$PRG" ] ; do 55 | ls=`ls -ld "$PRG"` 56 | link=`expr "$ls" : '.*-> \(.*\)$'` 57 | if expr "$link" : '/.*' > /dev/null; then 58 | PRG="$link" 59 | else 60 | PRG=`dirname "$PRG"`"/$link" 61 | fi 62 | done 63 | SAVED="`pwd`" 64 | cd "`dirname \"$PRG\"`/" >&- 65 | APP_HOME="`pwd -P`" 66 | cd "$SAVED" >&- 67 | 68 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 69 | 70 | # Determine the Java command to use to start the JVM. 71 | if [ -n "$JAVA_HOME" ] ; then 72 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 73 | # IBM's JDK on AIX uses strange locations for the executables 74 | JAVACMD="$JAVA_HOME/jre/sh/java" 75 | else 76 | JAVACMD="$JAVA_HOME/bin/java" 77 | fi 78 | if [ ! -x "$JAVACMD" ] ; then 79 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 80 | 81 | Please set the JAVA_HOME variable in your environment to match the 82 | location of your Java installation." 83 | fi 84 | else 85 | JAVACMD="java" 86 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 87 | 88 | Please set the JAVA_HOME variable in your environment to match the 89 | location of your Java installation." 90 | fi 91 | 92 | # Increase the maximum file descriptors if we can. 93 | if [ "$cygwin" = "false" -a "$darwin" = "false" ] ; then 94 | MAX_FD_LIMIT=`ulimit -H -n` 95 | if [ $? -eq 0 ] ; then 96 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 97 | MAX_FD="$MAX_FD_LIMIT" 98 | fi 99 | ulimit -n $MAX_FD 100 | if [ $? -ne 0 ] ; then 101 | warn "Could not set maximum file descriptor limit: $MAX_FD" 102 | fi 103 | else 104 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 105 | fi 106 | fi 107 | 108 | # For Darwin, add options to specify how the application appears in the dock 109 | if $darwin; then 110 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 111 | fi 112 | 113 | # For Cygwin, switch paths to Windows format before running java 114 | if $cygwin ; then 115 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 116 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 117 | 118 | # We build the pattern for arguments to be converted via cygpath 119 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 120 | SEP="" 121 | for dir in $ROOTDIRSRAW ; do 122 | ROOTDIRS="$ROOTDIRS$SEP$dir" 123 | SEP="|" 124 | done 125 | OURCYGPATTERN="(^($ROOTDIRS))" 126 | # Add a user-defined pattern to the cygpath arguments 127 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 128 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 129 | fi 130 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 131 | i=0 132 | for arg in "$@" ; do 133 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 134 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 135 | 136 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition 137 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 138 | else 139 | eval `echo args$i`="\"$arg\"" 140 | fi 141 | i=$((i+1)) 142 | done 143 | case $i in 144 | (0) set -- ;; 145 | (1) set -- "$args0" ;; 146 | (2) set -- "$args0" "$args1" ;; 147 | (3) set -- "$args0" "$args1" "$args2" ;; 148 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;; 149 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 150 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 151 | (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; 152 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; 153 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 154 | esac 155 | fi 156 | 157 | # Split up the JVM_OPTS And GRADLE_OPTS values into an array, following the shell quoting and substitution rules 158 | function splitJvmOpts() { 159 | JVM_OPTS=("$@") 160 | } 161 | eval splitJvmOpts $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS 162 | JVM_OPTS[${#JVM_OPTS[*]}]="-Dorg.gradle.appname=$APP_BASE_NAME" 163 | 164 | exec "$JAVACMD" "${JVM_OPTS[@]}" -classpath "$CLASSPATH" org.gradle.wrapper.GradleWrapperMain "$@" 165 | -------------------------------------------------------------------------------- /src/main/java/com/shekhargulati/urlcleaner/UrlCleaner.java: -------------------------------------------------------------------------------- 1 | package com.shekhargulati.urlcleaner; 2 | 3 | import java.io.IOException; 4 | import java.net.*; 5 | import java.nio.file.Paths; 6 | import java.util.AbstractMap.SimpleEntry; 7 | import java.util.*; 8 | import java.util.stream.Stream; 9 | 10 | import static java.util.stream.Collectors.*; 11 | 12 | /** 13 | * UrlCleaner provides API to normalize and unshorten URLs. 14 | *

15 | * URL normalization is based on normalizations described in RFC 3986. 16 | */ 17 | public abstract class UrlCleaner { 18 | 19 | private static final Map DEFAULT_PORTS = Stream.of( 20 | new SimpleEntry<>("http", 80), 21 | new SimpleEntry<>("https", 443), 22 | new SimpleEntry<>("ftp", 21)) 23 | .collect(toMap(SimpleEntry::getKey, SimpleEntry::getValue)); 24 | 25 | /** 26 | * Normalizes a URL in a standardized and consistent manner. 27 | * 28 | * @param inputUrl URL to process 29 | * @param options option 30 | * @return normalized url 31 | */ 32 | public static String normalizeUrl(final String inputUrl, final Options options) throws IllegalArgumentException { 33 | URI uri = Optional.ofNullable(inputUrl) 34 | .filter(u -> u.trim().length() > 0) 35 | .map(u -> u.replaceFirst("^//", "http://")) 36 | .map(URI::create) 37 | .map(u -> u.getScheme() == null ? URI.create("http://" + inputUrl) : u) 38 | .orElseThrow(() -> new IllegalArgumentException("url can't be empty of null.")); 39 | 40 | String scheme = uri.getScheme().toLowerCase(); 41 | String host = uri.getHost().toLowerCase(); 42 | int port = -1; // URI does not add port iff it is -1 43 | if (uri.getPort() != DEFAULT_PORTS.get(scheme)) { 44 | port = uri.getPort(); 45 | } 46 | if (options.isStripWWW()) { 47 | host = host.replaceFirst("^www\\.", ""); 48 | } 49 | String fragment = uri.getFragment(); 50 | if (options.isStripFragment()) { 51 | fragment = null; 52 | } 53 | String path = uri.getPath(); 54 | if (path != null) { 55 | path = Paths.get(path).normalize().toString(); 56 | } 57 | String newUri = null; 58 | try { 59 | newUri = new URI(scheme, uri.getUserInfo(), host, port, path, sortQueryString(uri.getQuery()), fragment).normalize().toString(); 60 | return newUri.replace(host, IDN.toUnicode(host)).replaceFirst("/$", ""); 61 | } catch (URISyntaxException e) { 62 | throw new UrlCleanerException(e); 63 | } 64 | } 65 | 66 | /** 67 | * Normalizes a URL in a standardized and consistent manner. 68 | * 69 | * @param inputUrl URL to process 70 | * @return normalized url 71 | */ 72 | public static String normalizeUrl(final String inputUrl) throws UrlCleanerException, IllegalArgumentException { 73 | return normalizeUrl(inputUrl, Options.DEFAULT_OPTIONS); 74 | } 75 | 76 | /** 77 | * Normalizes a list of urls 78 | * 79 | * @param inputUrls 80 | * @return normalized URLs 81 | * @throws UrlCleanerException 82 | * @throws IllegalArgumentException 83 | */ 84 | public static List normalizeUrl(final String... inputUrls) throws UrlCleanerException, IllegalArgumentException { 85 | return normalizeUrl(Arrays.asList(inputUrls), Options.DEFAULT_OPTIONS); 86 | } 87 | 88 | /** 89 | * Normalizes a list of urls 90 | * 91 | * @param inputUrls 92 | * @return normalized URLs 93 | * @throws UrlCleanerException 94 | * @throws IllegalArgumentException 95 | */ 96 | public static List normalizeUrl(final List inputUrls) throws UrlCleanerException, IllegalArgumentException { 97 | return normalizeUrl(inputUrls, Options.DEFAULT_OPTIONS); 98 | } 99 | 100 | /** 101 | * Normalizes a list of urls using the give options 102 | * 103 | * @param urls a list of urls 104 | * @param options option 105 | * @return list of normalized URLs 106 | * @throws UrlCleanerException 107 | * @throws IllegalArgumentException 108 | */ 109 | public static List normalizeUrl(List urls, final Options options) throws UrlCleanerException, IllegalArgumentException { 110 | return urls.stream().map(inputUrl -> normalizeUrl(inputUrl, options)).collect(toList()); 111 | } 112 | 113 | /** 114 | * Unshortens a given URL to its full form 115 | * 116 | * @param shortUrl short URL like bit.ly/abc 117 | * @return full URL 118 | */ 119 | public static String unshortenUrl(final String shortUrl) throws IOException { 120 | HttpURLConnection connection = (HttpURLConnection) new URL(shortUrl).openConnection(); 121 | connection.setInstanceFollowRedirects(false); 122 | connection.setRequestMethod("HEAD"); 123 | int responseCode = connection.getResponseCode(); 124 | String url = connection.getHeaderField("Location"); 125 | if (responseCode / 100 == 3 && url != null) { 126 | String expandedUrl = unshortenUrl(new URL(new URL(shortUrl),url).toString()); 127 | if (Objects.equals(expandedUrl, url)) 128 | return url; 129 | else { 130 | return expandedUrl; 131 | } 132 | } 133 | return shortUrl; 134 | } 135 | 136 | private static String sortQueryString(final String query) { 137 | if (query == null || query.trim().length() == 0) { 138 | return null; 139 | } 140 | return Arrays.stream(query.split("&")) 141 | .map(p -> p.split("=")) 142 | .map(p -> p.length == 2 ? new SimpleEntry<>(p[0], p[1]) : new SimpleEntry<>(p[0], "")) 143 | .sorted((e1, e2) -> e1.getKey().compareTo(e2.getKey())) 144 | .map(e -> String.format("%s=%s", e.getKey(), e.getValue())).collect(joining("&")); 145 | } 146 | 147 | } 148 | -------------------------------------------------------------------------------- /src/test/java/com/shekhargulati/urlcleaner/UrlCleanerSpec.java: -------------------------------------------------------------------------------- 1 | package com.shekhargulati.urlcleaner; 2 | 3 | import org.junit.Test; 4 | 5 | import java.util.List; 6 | 7 | import static org.hamcrest.CoreMatchers.equalTo; 8 | import static org.hamcrest.core.IsCollectionContaining.hasItems; 9 | import static org.junit.Assert.assertThat; 10 | 11 | public class UrlCleanerSpec { 12 | 13 | @Test 14 | public void shouldNormalizeSchemeAndHostToLowercase() throws Exception { 15 | final String url = "HTTP://ShekharGulati.com"; 16 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 17 | assertThat(normalizedUrl, equalTo("http://shekhargulati.com")); 18 | } 19 | 20 | @Test 21 | public void shouldPrependHttpWhenSchemeIsNotPresent() throws Exception { 22 | final String url = "shekhargulati.com"; 23 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 24 | assertThat(normalizedUrl, equalTo("http://shekhargulati.com")); 25 | } 26 | 27 | @Test 28 | public void shouldPrependHttpWhenUrlContainsSlashes() throws Exception { 29 | final String url = "//shekhargulati.com"; 30 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 31 | assertThat(normalizedUrl, equalTo("http://shekhargulati.com")); 32 | } 33 | 34 | @Test 35 | public void shouldReturnSameUrlWhenAlreadyNormalized() throws Exception { 36 | final String url = "http://shekhargulati.com"; 37 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 38 | assertThat(normalizedUrl, equalTo("http://shekhargulati.com")); 39 | } 40 | 41 | @Test 42 | public void shouldRemoveDefaultPortForHttpProtocol() throws Exception { 43 | final String url = "http://shekhargulati.com:80"; 44 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 45 | assertThat(normalizedUrl, equalTo("http://shekhargulati.com")); 46 | } 47 | 48 | @Test 49 | public void shouldRemoveDefaultPortForHttpsProtocol() throws Exception { 50 | final String url = "https://shekhargulati.com:443"; 51 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 52 | assertThat(normalizedUrl, equalTo("https://shekhargulati.com")); 53 | } 54 | 55 | @Test 56 | public void shouldRemoveDefaultPortForHttpsProtocolWithWWW() throws Exception { 57 | final String url = "https://www.shekhargulati.com:443"; 58 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 59 | assertThat(normalizedUrl, equalTo("https://shekhargulati.com")); 60 | } 61 | 62 | @Test 63 | public void shouldRemoveDefaultPortForFtpProtocol() throws Exception { 64 | final String url = "ftp://shekhargulati.com:21"; 65 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 66 | assertThat(normalizedUrl, equalTo("ftp://shekhargulati.com")); 67 | } 68 | 69 | @Test 70 | public void shouldRemoveWWWFromTheUrl() throws Exception { 71 | final String url = "http://www.shekhargulati.com"; 72 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 73 | assertThat(normalizedUrl, equalTo("http://shekhargulati.com")); 74 | } 75 | 76 | @Test 77 | public void shouldNotRemoveWWWWhenItIsPartOfHostNameFromTheUrl() throws Exception { 78 | final String url = "http://wwwshekhargulati.com"; 79 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 80 | assertThat(normalizedUrl, equalTo("http://wwwshekhargulati.com")); 81 | } 82 | 83 | @Test 84 | public void shouldRemoveWWWAndPrependProtocolWhenNotPresent() throws Exception { 85 | final String url = "www.shekhargulati.com"; 86 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 87 | assertThat(normalizedUrl, equalTo("http://shekhargulati.com")); 88 | } 89 | 90 | @Test 91 | public void shouldAddTrailingSlashes() throws Exception { 92 | final String url = "http://shekhargulati.com/about"; 93 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 94 | assertThat(normalizedUrl, equalTo("http://shekhargulati.com/about")); 95 | } 96 | 97 | @Test 98 | public void shouldDecodePercentEncodedTilde() throws Exception { 99 | final String url = "http://shekhargulati.com/%7Eabout/"; 100 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 101 | assertThat(normalizedUrl, equalTo("http://shekhargulati.com/~about")); 102 | } 103 | 104 | @Test 105 | public void shouldDecodePercentEncodedUnderscore() throws Exception { 106 | final String url = "http://shekhargulati.com/hello%5Fabout/"; 107 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 108 | assertThat(normalizedUrl, equalTo("http://shekhargulati.com/hello_about")); 109 | } 110 | 111 | @Test 112 | public void shouldDecodePercentEncodedPeriod() throws Exception { 113 | final String url = "http://shekhargulati.com/hello%2Eabout/"; 114 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 115 | assertThat(normalizedUrl, equalTo("http://shekhargulati.com/hello.about")); 116 | } 117 | 118 | @Test 119 | public void shouldDecodePercentEncodedHyphen() throws Exception { 120 | final String url = "http://shekhargulati.com/hello%2Dabout/"; 121 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 122 | assertThat(normalizedUrl, equalTo("http://shekhargulati.com/hello-about")); 123 | } 124 | 125 | @Test 126 | public void shouldDecodePercentEncodedDigit() throws Exception { 127 | final String url = "http://shekhargulati.com/hello%39about/"; 128 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 129 | assertThat(normalizedUrl, equalTo("http://shekhargulati.com/hello9about")); 130 | } 131 | 132 | @Test 133 | public void shouldDecodePercentEncodedAlpha() throws Exception { 134 | final String url = "http://shekhargulati.com/hello%41about/"; 135 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 136 | assertThat(normalizedUrl, equalTo("http://shekhargulati.com/helloAabout")); 137 | } 138 | 139 | @Test 140 | public void shouldSortQueryParameters() throws Exception { 141 | final String url = "http://shekhargulati.com?lang=en&article=fred"; 142 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 143 | assertThat(normalizedUrl, equalTo("http://shekhargulati.com?article=fred&lang=en")); 144 | } 145 | 146 | @Test 147 | public void shouldSortQueryParameters_2() throws Exception { 148 | final String url = "http://shekhargulati.com/?b=bar&a=foo"; 149 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 150 | assertThat(normalizedUrl, equalTo("http://shekhargulati.com/?a=foo&b=bar")); 151 | } 152 | 153 | @Test 154 | public void shouldRemoveQuestionMarkWhenThereIsNoQueryParameter() throws Exception { 155 | final String url = "http://shekhargulati.com?"; 156 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 157 | assertThat(normalizedUrl, equalTo("http://shekhargulati.com")); 158 | } 159 | 160 | @Test 161 | public void shouldRemoveQuestionMarkWhenThereIsNoQueryParameter_2() throws Exception { 162 | final String url = "http://shekhargulati.com/?"; 163 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 164 | assertThat(normalizedUrl, equalTo("http://shekhargulati.com")); 165 | } 166 | 167 | @Test 168 | public void shouldHandleQueryParametersWithoutValues() throws Exception { 169 | final String url = "http://shekhargulati.com/?b=&a=foo"; 170 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 171 | assertThat(normalizedUrl, equalTo("http://shekhargulati.com/?a=foo&b=")); 172 | } 173 | 174 | @Test 175 | public void shouldDecodePercentEncodedQueryParameterValue() throws Exception { 176 | final String url = "http://shekhargulati.com/?foo=bar%2Dbaz"; 177 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 178 | assertThat(normalizedUrl, equalTo("http://shekhargulati.com/?foo=bar-baz")); 179 | } 180 | 181 | @Test 182 | public void shouldNotRemoveNonDefaultPorts() throws Exception { 183 | final String url = "http://shekhargulati.com:8080"; 184 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 185 | assertThat(normalizedUrl, equalTo("http://shekhargulati.com:8080")); 186 | } 187 | 188 | @Test 189 | public void shouldConvertPunnycodeHostToUnicode() throws Exception { 190 | final String url = "http://xn--xample-hva.com"; 191 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 192 | assertThat(normalizedUrl, equalTo("http://êxample.com")); 193 | } 194 | 195 | @Test 196 | public void shouldRemoveDuplicateSlashesInPath() throws Exception { 197 | final String url = "http://shekhargulati.com/foo//bar.html"; 198 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 199 | assertThat(normalizedUrl, equalTo("http://shekhargulati.com/foo/bar.html")); 200 | } 201 | 202 | @Test 203 | public void shouldRemoveMultipleSlashesInPath() throws Exception { 204 | final String url = "http://shekhargulati.com/////foo//bar.html"; 205 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 206 | assertThat(normalizedUrl, equalTo("http://shekhargulati.com/foo/bar.html")); 207 | } 208 | 209 | @Test 210 | public void shouldRemovePathFragments() throws Exception { 211 | final String url = "http://shekhargulati.com/whoami#about"; 212 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 213 | assertThat(normalizedUrl, equalTo("http://shekhargulati.com/whoami")); 214 | } 215 | 216 | @Test 217 | public void shouldRemoveDotSegmentsFromAUrl() throws Exception { 218 | final String url = "http://shekhargulati.com/../a/b/../c/./d.html"; 219 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 220 | assertThat(normalizedUrl, equalTo("http://shekhargulati.com/a/c/d.html")); 221 | } 222 | 223 | @Test 224 | public void shouldRemoveDotSegmentsFromAUrl_2() throws Exception { 225 | final String url = "http://shekhargulati.com/foo/bar/./baz"; 226 | String normalizedUrl = UrlCleaner.normalizeUrl(url); 227 | assertThat(normalizedUrl, equalTo("http://shekhargulati.com/foo/bar/baz")); 228 | } 229 | 230 | @Test 231 | public void shouldNormalizeAListOfUrl() throws Exception { 232 | final String url1 = "http://shekhargulati.com/hello%2Dabout/"; 233 | final String url2 = "http://shekhargulati.com/?"; 234 | final String url3 = "http://shekhargulati.com/foo/bar/./baz"; 235 | 236 | List normalizedUrls = UrlCleaner.normalizeUrl(url1, url2, url3); 237 | assertThat(normalizedUrls, hasItems(equalTo("http://shekhargulati.com/hello-about"), equalTo("http://shekhargulati.com"), equalTo("http://shekhargulati.com/foo/bar/baz"))); 238 | } 239 | 240 | @Test 241 | public void shouldUnshortenABitLyUrl() throws Exception { 242 | String unshortenUrl = UrlCleaner.unshortenUrl("http://bit.ly/1Wtrl9t"); 243 | assertThat(unshortenUrl, equalTo("https://shekhargulati.com/")); 244 | } 245 | 246 | @Test 247 | public void shouldReturnSameUrlWhenItIsNotShortened() throws Exception { 248 | String unshortenUrl = UrlCleaner.unshortenUrl("https://shekhargulati.com/"); 249 | assertThat(unshortenUrl, equalTo("https://shekhargulati.com/")); 250 | } 251 | 252 | @Test 253 | public void shouldUnshortenMultiLevelShortenUrl() throws Exception { 254 | String unshortenUrl = UrlCleaner.unshortenUrl("http://bit.ly/1pquoV5"); 255 | assertThat(unshortenUrl, equalTo("https://shekhargulati.com/")); 256 | } 257 | 258 | @Test 259 | public void shouldUnshortenMultiLevelShortenUrl_2() throws Exception { 260 | String unshortenUrl = UrlCleaner.unshortenUrl("http://bit.ly/1pwuGdF"); 261 | assertThat(unshortenUrl, equalTo("https://www.bloomberg.com/news/articles/2016-03-17/unmasking-startup-l-jackson-silicon-valley-s-favorite-twitter-persona")); 262 | } 263 | 264 | @Test 265 | public void shouldReproduceIssue1() throws Exception { 266 | String rtnStr = UrlCleaner.normalizeUrl("stackoverflow.com/questions/tagged/java"); 267 | System.out.println(rtnStr); 268 | assertThat(rtnStr, equalTo("http://stackoverflow.com/questions/tagged/java")); 269 | } 270 | 271 | @Test 272 | public void shouldUnshortenRelativeURL() throws Exception { 273 | String rtnStr = UrlCleaner.unshortenUrl("https://www.technologyreview.com/s/609054/warning-this-algorithm-will-self-destruct-after-its-used"); 274 | System.out.println(rtnStr); 275 | assertThat(rtnStr, equalTo("https://www.technologyreview.com/s/609054/warning-this-algorithm-will-self-destruct-after-its-used/")); 276 | } 277 | } 278 | --------------------------------------------------------------------------------