├── .github └── workflows │ └── gradle.yml ├── .gitignore ├── .travis.yml ├── LICENSE.txt ├── README.md ├── build.gradle ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── pom.xml └── src ├── main └── java │ └── net │ └── ricecode │ └── similarity │ ├── AscendingSimilarityScoreComparator.java │ ├── DescendingSimilarityScoreComparator.java │ ├── DiceCoefficientStrategy.java │ ├── JaroStrategy.java │ ├── JaroWinklerStrategy.java │ ├── LevenshteinDistanceStrategy.java │ ├── SimilarityScore.java │ ├── SimilarityStrategy.java │ ├── StringSimilarityService.java │ └── StringSimilarityServiceImpl.java └── test └── java └── net └── ricecode └── similarity ├── AscendingComparatorTest.java ├── DescendingComparatorTest.java ├── DiceCoefficientStrategyTest.java ├── JaroStrategyTest.java ├── JaroWinklerStrategyTest.java ├── LevenshteinDistanceStrategyTest.java ├── SimilarityScoreTest.java └── StringSimilarityServiceImplTest.java /.github/workflows/gradle.yml: -------------------------------------------------------------------------------- 1 | # This workflow will build a Java project with Gradle 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/building-and-testing-java-with-gradle 3 | 4 | name: Java CI with Gradle 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v2 19 | - name: Set up JDK 1.8 20 | uses: actions/setup-java@v1 21 | with: 22 | java-version: 1.8 23 | - name: Grant execute permission for gradlew 24 | run: chmod +x gradlew 25 | - name: Build with Gradle 26 | run: ./gradlew test 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # General files 3 | *~ 4 | *.lock 5 | *.DS_Store 6 | 
*.Trashes 7 | *.swp 8 | *.out 9 | *.bak 10 | 11 | # SVN 12 | .svn 13 | 14 | # Maven 15 | .m2 16 | target/ 17 | 18 | # Eclipse 19 | .project 20 | .metadata 21 | bin/** 22 | tmp/** 23 | tmp/**/* 24 | .classpath 25 | .settings/ 26 | .loadpath 27 | local.properties 28 | *~.nib 29 | *.launch 30 | .externalToolBuilders/ 31 | 32 | # Intellij 33 | *.iml 34 | *.ipr 35 | *.iws 36 | .idea/ 37 | 38 | # Netbeans 39 | nbproject/private/ 40 | build/ 41 | nbbuild/ 42 | dist/ 43 | nbdist/ 44 | nbactions.xml 45 | nb-configuration.xml 46 | 47 | # Java 48 | *.class 49 | *.jar 50 | *.war 51 | *.ear 52 | *.db 53 | 54 | 55 | .gradle 56 | 57 | # Exceptions 58 | !/gradle/** 59 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | 2 | language: java 3 | 4 | jdk: 5 | - openjdk8 6 | - openjdk10 7 | - openjdk11 8 | 9 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010 Ralph Allan Rice 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | [![License: MIT](https://img.shields.io/github/license/rrice/java-string-similarity)](https://opensource.org/licenses/MIT) 3 | [![Issues](https://img.shields.io/github/issues/rrice/java-string-similarity)](https://github.com/rrice/java-string-similarity/issues) 4 | ![Java CI](https://github.com/rrice/java-string-similarity/workflows/Java%20CI%20with%20Gradle/badge.svg) 5 | 6 | 7 | [java-string-similarity](https://github.com/rrice/java-string-similarity) is a Java library that calculates a normalized distance or similarity score between two strings. A score of 0.0 means that the two strings are absolutely dissimilar, and 1.0 means that they are absolutely similar (or equal). Anything in between indicates how similar the two strings are. 8 | 9 | Example 10 | ------- 11 | 12 | In this simple example, we want to calculate a similarity score between the words `McDonalds` and `MacMahons`. We are selecting the [Jaro-Winkler distance](http://www.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) algorithm. 
13 | 14 | ```java 15 | SimilarityStrategy strategy = new JaroWinklerStrategy(); 16 | String target = "McDonalds"; 17 | String source = "MacMahons"; 18 | StringSimilarityService service = new StringSimilarityServiceImpl(strategy); 19 | double score = service.score(source, target); // Score is 0.90 20 | ``` 21 | 22 | Algorithms 23 | ---------- 24 | 25 | * [Jaro distance](http://www.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) 26 | * [Jaro-Winkler distance](http://www.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) 27 | * [Levenshtein distance](http://www.wikipedia.org/wiki/Levenshtein_distance) 28 | * [Sørensen–Dice coefficient](http://www.wikipedia.org/wiki/Sørensen–Dice_coefficient) 29 | 30 | 31 | Installation 32 | ------------ 33 | 34 | This project currently uses [Maven](http://maven.apache.org/) for management. You can compile, test and install the component to your local repo by calling: 35 | 36 | ``` 37 | mvn install 38 | ``` 39 | 40 | Then, you can add this component to your project by adding a dependency: 41 | 42 | ``` 43 | <dependency> 44 | <groupId>net.ricecode</groupId> 45 | <artifactId>string-similarity</artifactId> 46 | <version>1.0.0</version> 47 | </dependency> 48 | ``` 49 | 50 | TODO 51 | ---- 52 | 53 | * Ant/Ivy build scripts. 
54 | 55 | * [Jaccard index](http://www.wikipedia.org/wiki/Jaccard_index) 56 | 57 | -------------------------------------------------------------------------------- /build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'java' 2 | 3 | repositories { 4 | mavenCentral() 5 | } 6 | dependencies { 7 | testCompile 'junit:junit:4.+', 8 | 'org.mockito:mockito-all:1.9.5' 9 | } 10 | 11 | /* 12 | * Permission is hereby granted, free of charge, to any person obtaining a copy 13 | * of this software and associated documentation files (the "Software"), to deal 14 | * in the Software without restriction, including without limitation the rights 15 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 16 | * copies of the Software, and to permit persons to whom the Software is 17 | * furnished to do so, subject to the following conditions: 18 | * 19 | * The above copyright notice and this permission notice shall be included in 20 | * all copies or substantial portions of the Software. 21 | * 22 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 23 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 24 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 25 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 26 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 27 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 28 | * THE SOFTWARE. 
29 | * 30 | */ 31 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rrice/java-string-similarity/379f3c6ae55ec9d8b264637e89c6011137cf0c42/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-4.10.2-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /gradlew: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | 3 | ############################################################################## 4 | ## 5 | ## Gradle start up script for UN*X 6 | ## 7 | ############################################################################## 8 | 9 | # Attempt to set APP_HOME 10 | # Resolve links: $0 may be a link 11 | PRG="$0" 12 | # Need this for relative symlinks. 13 | while [ -h "$PRG" ] ; do 14 | ls=`ls -ld "$PRG"` 15 | link=`expr "$ls" : '.*-> \(.*\)$'` 16 | if expr "$link" : '/.*' > /dev/null; then 17 | PRG="$link" 18 | else 19 | PRG=`dirname "$PRG"`"/$link" 20 | fi 21 | done 22 | SAVED="`pwd`" 23 | cd "`dirname \"$PRG\"`/" >/dev/null 24 | APP_HOME="`pwd -P`" 25 | cd "$SAVED" >/dev/null 26 | 27 | APP_NAME="Gradle" 28 | APP_BASE_NAME=`basename "$0"` 29 | 30 | # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 31 | DEFAULT_JVM_OPTS="" 32 | 33 | # Use the maximum available, or set MAX_FD != -1 to use that value. 
34 | MAX_FD="maximum" 35 | 36 | warn () { 37 | echo "$*" 38 | } 39 | 40 | die () { 41 | echo 42 | echo "$*" 43 | echo 44 | exit 1 45 | } 46 | 47 | # OS specific support (must be 'true' or 'false'). 48 | cygwin=false 49 | msys=false 50 | darwin=false 51 | nonstop=false 52 | case "`uname`" in 53 | CYGWIN* ) 54 | cygwin=true 55 | ;; 56 | Darwin* ) 57 | darwin=true 58 | ;; 59 | MINGW* ) 60 | msys=true 61 | ;; 62 | NONSTOP* ) 63 | nonstop=true 64 | ;; 65 | esac 66 | 67 | CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar 68 | 69 | # Determine the Java command to use to start the JVM. 70 | if [ -n "$JAVA_HOME" ] ; then 71 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 72 | # IBM's JDK on AIX uses strange locations for the executables 73 | JAVACMD="$JAVA_HOME/jre/sh/java" 74 | else 75 | JAVACMD="$JAVA_HOME/bin/java" 76 | fi 77 | if [ ! -x "$JAVACMD" ] ; then 78 | die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME 79 | 80 | Please set the JAVA_HOME variable in your environment to match the 81 | location of your Java installation." 82 | fi 83 | else 84 | JAVACMD="java" 85 | which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 86 | 87 | Please set the JAVA_HOME variable in your environment to match the 88 | location of your Java installation." 89 | fi 90 | 91 | # Increase the maximum file descriptors if we can. 92 | if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then 93 | MAX_FD_LIMIT=`ulimit -H -n` 94 | if [ $? -eq 0 ] ; then 95 | if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then 96 | MAX_FD="$MAX_FD_LIMIT" 97 | fi 98 | ulimit -n $MAX_FD 99 | if [ $? 
-ne 0 ] ; then 100 | warn "Could not set maximum file descriptor limit: $MAX_FD" 101 | fi 102 | else 103 | warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" 104 | fi 105 | fi 106 | 107 | # For Darwin, add options to specify how the application appears in the dock 108 | if $darwin; then 109 | GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" 110 | fi 111 | 112 | # For Cygwin, switch paths to Windows format before running java 113 | if $cygwin ; then 114 | APP_HOME=`cygpath --path --mixed "$APP_HOME"` 115 | CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` 116 | JAVACMD=`cygpath --unix "$JAVACMD"` 117 | 118 | # We build the pattern for arguments to be converted via cygpath 119 | ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` 120 | SEP="" 121 | for dir in $ROOTDIRSRAW ; do 122 | ROOTDIRS="$ROOTDIRS$SEP$dir" 123 | SEP="|" 124 | done 125 | OURCYGPATTERN="(^($ROOTDIRS))" 126 | # Add a user-defined pattern to the cygpath arguments 127 | if [ "$GRADLE_CYGPATTERN" != "" ] ; then 128 | OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" 129 | fi 130 | # Now convert the arguments - kludge to limit ourselves to /bin/sh 131 | i=0 132 | for arg in "$@" ; do 133 | CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` 134 | CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option 135 | 136 | if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition 137 | eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` 138 | else 139 | eval `echo args$i`="\"$arg\"" 140 | fi 141 | i=$((i+1)) 142 | done 143 | case $i in 144 | (0) set -- ;; 145 | (1) set -- "$args0" ;; 146 | (2) set -- "$args0" "$args1" ;; 147 | (3) set -- "$args0" "$args1" "$args2" ;; 148 | (4) set -- "$args0" "$args1" "$args2" "$args3" ;; 149 | (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; 150 | (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; 151 | (7) set -- "$args0" "$args1" "$args2" "$args3" 
"$args4" "$args5" "$args6" ;; 152 | (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; 153 | (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 154 | esac 155 | fi 156 | 157 | # Escape application args 158 | save () { 159 | for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done 160 | echo " " 161 | } 162 | APP_ARGS=$(save "$@") 163 | 164 | # Collect all arguments for the java command, following the shell quoting and substitution rules 165 | eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" 166 | 167 | # by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong 168 | if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then 169 | cd "$(dirname "$0")" 170 | fi 171 | 172 | exec "$JAVACMD" "$@" 173 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @if "%DEBUG%" == "" @echo off 2 | @rem ########################################################################## 3 | @rem 4 | @rem Gradle startup script for Windows 5 | @rem 6 | @rem ########################################################################## 7 | 8 | @rem Set local scope for the variables with windows NT shell 9 | if "%OS%"=="Windows_NT" setlocal 10 | 11 | set DIRNAME=%~dp0 12 | if "%DIRNAME%" == "" set DIRNAME=. 13 | set APP_BASE_NAME=%~n0 14 | set APP_HOME=%DIRNAME% 15 | 16 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 
17 | set DEFAULT_JVM_OPTS= 18 | 19 | @rem Find java.exe 20 | if defined JAVA_HOME goto findJavaFromJavaHome 21 | 22 | set JAVA_EXE=java.exe 23 | %JAVA_EXE% -version >NUL 2>&1 24 | if "%ERRORLEVEL%" == "0" goto init 25 | 26 | echo. 27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 28 | echo. 29 | echo Please set the JAVA_HOME variable in your environment to match the 30 | echo location of your Java installation. 31 | 32 | goto fail 33 | 34 | :findJavaFromJavaHome 35 | set JAVA_HOME=%JAVA_HOME:"=% 36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 37 | 38 | if exist "%JAVA_EXE%" goto init 39 | 40 | echo. 41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 42 | echo. 43 | echo Please set the JAVA_HOME variable in your environment to match the 44 | echo location of your Java installation. 45 | 46 | goto fail 47 | 48 | :init 49 | @rem Get command-line arguments, handling Windows variants 50 | 51 | if not "%OS%" == "Windows_NT" goto win9xME_args 52 | 53 | :win9xME_args 54 | @rem Slurp the command line arguments. 55 | set CMD_LINE_ARGS= 56 | set _SKIP=2 57 | 58 | :win9xME_args_slurp 59 | if "x%~1" == "x" goto execute 60 | 61 | set CMD_LINE_ARGS=%* 62 | 63 | :execute 64 | @rem Setup the command line 65 | 66 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 67 | 68 | @rem Execute Gradle 69 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 70 | 71 | :end 72 | @rem End local scope for the variables with windows NT shell 73 | if "%ERRORLEVEL%"=="0" goto mainEnd 74 | 75 | :fail 76 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 77 | rem the _cmd.exe /c_ return code! 
78 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 79 | exit /b 1 80 | 81 | :mainEnd 82 | if "%OS%"=="Windows_NT" endlocal 83 | 84 | :omega 85 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | 4 | net.ricecode 5 | string-similarity 6 | 1.0.1-SNAPSHOT 7 | jar 8 | 9 | string-similarity 10 | A Java library that implements several algorithms that calculate similarity between strings. 11 | http://www.ricecode.net 12 | 13 | 14 | UTF-8 15 | 4.13.1 16 | 1.10.19 17 | 18 | 19 | 20 | 21 | MIT License 22 | http://www.opensource.org/licenses/mit-license.php 23 | repo 24 | 25 | 26 | 27 | 28 | https://github.com/rrice/java-string-similarity/issues 29 | GitHub Issues 30 | 31 | 32 | 33 | https://github.com/rrice/java-string-similarity 34 | scm:git:git://github.com/rrice/java-string-similarity.git 35 | scm:git:git@github.com:rrice/java-string-similarity.git 36 | HEAD 37 | 38 | 39 | 40 | 41 | ralph.rice@gmail.com 42 | Ralph Allan Rice 43 | https://github.com/rrice 44 | rrice 45 | 46 | 47 | 48 | 49 | 50 | doclint-java8-disable 51 | 52 | [1.8,) 53 | 54 | 55 | -Xdoclint:none 56 | 57 | 58 | 59 | release 60 | 61 | 62 | 63 | org.apache.maven.plugins 64 | maven-gpg-plugin 65 | 1.6 66 | 67 | 68 | sign-artifacts 69 | verify 70 | 71 | sign 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | org.mockito 84 | mockito-all 85 | ${mockito.version} 86 | test 87 | 88 | 89 | junit 90 | junit 91 | ${junit.version} 92 | test 93 | 94 | 95 | 96 | 97 | 98 | org.apache.maven.plugins 99 | maven-compiler-plugin 100 | 3.3 101 | 102 | 1.5 103 | 1.5 104 | false 105 | false 106 | 107 | 108 | 109 | org.apache.maven.plugins 110 | maven-source-plugin 111 | 2.4 112 | 113 | 114 | attach-source 115 | 116 | jar-no-fork 117 | test-jar-no-fork 118 | 119 | 120 | 121 | 122 | 123 | org.apache.maven.plugins 124 | maven-javadoc-plugin 125 | 2.9.1 126 | 127 | public 128 | 129 | 130 
| 131 | attach-javadocs 132 | 133 | jar 134 | 135 | 136 | ${javadoc.opts} 137 | 138 | 139 | 140 | 141 | 142 | org.apache.maven.plugins 143 | maven-release-plugin 144 | 2.5.3 145 | 146 | @{project.version} 147 | 148 | 149 | 150 | org.sonatype.plugins 151 | nexus-staging-maven-plugin 152 | 1.6.3 153 | true 154 | 155 | ossrh 156 | https://oss.sonatype.org/ 157 | true 158 | 159 | 160 | 161 | org.codehaus.mojo 162 | versions-maven-plugin 163 | 2.2 164 | 165 | 166 | 167 | 168 | -------------------------------------------------------------------------------- /src/main/java/net/ricecode/similarity/AscendingSimilarityScoreComparator.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2010 Ralph Allan Rice 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 
21 | * 22 | */ 23 | 24 | package net.ricecode.similarity; 25 | 26 | import java.util.Comparator; 27 | 28 | /** 29 | * A comparator that allows SimilarityScore to be sorted in 30 | * ascending order. 31 | * @author Ralph Allan Rice 32 | * 33 | */ 34 | public class AscendingSimilarityScoreComparator implements Comparator 35 | { 36 | /** 37 | * Compares two similarity scores. 38 | * @param x The first score to be compared. 39 | * @param y The second score to be compared. 40 | * @return a negative integer, zero, or a positive integer as the first score is less than, 41 | * equal to, or greater than the second score. 42 | */ 43 | public int compare(SimilarityScore x, SimilarityScore y) { 44 | double first = x.getScore(); 45 | double second = y.getScore(); 46 | if (first == second) { 47 | return 0; 48 | } 49 | if (first < second) { 50 | return -1; 51 | } 52 | return 1; 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /src/main/java/net/ricecode/similarity/DescendingSimilarityScoreComparator.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2010 Ralph Allan Rice 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 
13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | * 22 | */ 23 | 24 | package net.ricecode.similarity; 25 | 26 | import java.util.Comparator; 27 | 28 | /** 29 | * A comparator that allows SimilarityScore to be sorted in 30 | * descending order. 31 | * @author Ralph Allan Rice 32 | * 33 | */ 34 | public class DescendingSimilarityScoreComparator implements Comparator 35 | { 36 | /** 37 | * Compares two similarity scores. 38 | * @param x The first score to be compared. 39 | * @param y The second score to be compared. 40 | * @return a negative integer, zero, or a positive integer as the first score is greater than, 41 | * equal to, or less than the second score. 
42 | */public int compare(SimilarityScore x, SimilarityScore y) { 43 | double first = x.getScore(); 44 | double second = y.getScore(); 45 | if (first == second) { 46 | return 0; 47 | } 48 | if (first < second) { 49 | return 1; 50 | } 51 | return -1; 52 | } 53 | 54 | } 55 | 56 | -------------------------------------------------------------------------------- /src/main/java/net/ricecode/similarity/DiceCoefficientStrategy.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2010 Ralph Allan Rice 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | * 22 | */ 23 | 24 | package net.ricecode.similarity; 25 | 26 | import java.util.ArrayList; 27 | import java.util.Set; 28 | import java.util.TreeSet; 29 | 30 | /** 31 | * A strategy that uses the Dice's Coefficient to calculate the similarity of two strings. 
32 | * @author Ralph Allan Rice 33 | * @see About Dice Coefficient 34 | */ 35 | public class DiceCoefficientStrategy implements SimilarityStrategy { 36 | 37 | /** 38 | * Calculates the similarity score of objects, where 0.0 implies absolutely no similarity 39 | * and 1.0 implies absolute similarity. 40 | * 41 | * @param first The first string to compare. 42 | * @param second The second string to compare. 43 | * @return A number between 0.0 and 1.0. 44 | */ 45 | public double score(String first, String second) { 46 | 47 | // Create two sets of character bigrams, one for each string. 48 | Set s1 = splitIntoBigrams(first); 49 | Set s2 = splitIntoBigrams(second); 50 | 51 | // Get the number of elements in each set. 52 | int n1 = s1.size(); 53 | int n2 = s2.size(); 54 | 55 | // Find the intersection, and get the number of elements in that set. 56 | s1.retainAll(s2); 57 | int nt = s1.size(); 58 | 59 | 60 | // The coefficient is: 61 | // 62 | // 2 ∙ | s1 ⋂ s2 | 63 | // D = ---------------------- 64 | // | s1 | + | s2 | 65 | // 66 | return (2.0 * (double)nt) / ((double)(n1 + n2)); 67 | 68 | } 69 | 70 | 71 | private Set splitIntoBigrams(String s) { 72 | ArrayList bigrams = new ArrayList(); 73 | 74 | if (s.length() < 2) { 75 | bigrams.add(s); 76 | } 77 | else { 78 | for (int i = 1; i < s.length(); i++) { 79 | StringBuilder sb = new StringBuilder(); 80 | sb.append(s.charAt(i-1)); 81 | sb.append(s.charAt(i)); 82 | bigrams.add(sb.toString()); 83 | } 84 | } 85 | return new TreeSet(bigrams); 86 | } 87 | 88 | 89 | 90 | } 91 | -------------------------------------------------------------------------------- /src/main/java/net/ricecode/similarity/JaroStrategy.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2010 Ralph Allan Rice 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software 
without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | * 22 | */ 23 | 24 | package net.ricecode.similarity; 25 | 26 | /** 27 | * A strategy that uses the Jaro Distance to calculate the similarity of two strings. 28 | * @author Ralph Allan Rice 29 | * @see About Jaro Distance 30 | */ 31 | public class JaroStrategy implements SimilarityStrategy { 32 | 33 | /** 34 | * Calculates the similarity score of objects, where 0.0 implies absolutely no similarity 35 | * and 1.0 implies absolute similarity. 36 | * 37 | * @param first The first string to compare. 38 | * @param second The second string to compare. 39 | * @return A number between 0.0 and 1.0. 40 | */ 41 | public double score(String first, String second) { 42 | String shorter; 43 | String longer; 44 | 45 | // Determine which String is longer. 46 | if (first.length() > second.length()) 47 | { 48 | longer = first.toLowerCase(); 49 | shorter = second.toLowerCase(); 50 | } 51 | else 52 | { 53 | longer = second.toLowerCase(); 54 | shorter = first.toLowerCase(); 55 | } 56 | 57 | // Calculate the half length() distance of the shorter String. 
58 | int halflength = (shorter.length() / 2) + 1; 59 | 60 | // Find the set of matching characters between the shorter and longer strings. Note that 61 | // the set of matching characters may be different depending on the order of the strings. 62 | String m1 = getSetOfMatchingCharacterWithin(shorter, longer, halflength); 63 | String m2 = getSetOfMatchingCharacterWithin(longer, shorter, halflength); 64 | 65 | 66 | // If one or both of the sets of common characters is empty, then 67 | // there is no similarity between the two strings. 68 | if (m1.length() == 0 || m2.length() == 0) return 0.0; 69 | 70 | // If the set of common characters is not the same size, then 71 | // there is no similarity between the two strings, either. 72 | if (m1.length() != m2.length()) return 0.0; 73 | 74 | // Calculate the number of transpositions between the two sets 75 | // of common characters. 76 | int transpositions = transpositions(m1, m2); 77 | 78 | // Calculate the distance. 79 | double dist = 80 | (m1.length() / ((double)shorter.length()) + 81 | m2.length() / ((double)longer.length()) + 82 | (m1.length() - transpositions) / ((double)m1.length())) / 3.0; 83 | return dist; 84 | 85 | 86 | } 87 | 88 | /** 89 | * Gets a set of matching characters between two strings. 90 | * 91 | * @param first The first string. 92 | * @param second The second string. 93 | * @param limit The maximum distance to consider. 94 | * @return A string contain the set of common characters. 95 | * @remarks Two characters from the first string and the second string are considered matching if the character's 96 | * respective positions are no farther than the limit value. 
97 | */ 98 | private String getSetOfMatchingCharacterWithin(String first, String second, int limit) 99 | { 100 | 101 | StringBuilder common = new StringBuilder(); 102 | StringBuilder copy = new StringBuilder(second); 103 | for (int i = 0; i < first.length(); i++) 104 | { 105 | char ch = first.charAt(i); 106 | boolean found = false; 107 | 108 | // See if the character is within the limit positions away from the original position of that character. 109 | for (int j = Math.max(0, i - limit); !found && j < Math.min(i + limit, second.length()); j++) 110 | { 111 | if (copy.charAt(j) == ch) 112 | { 113 | found = true; 114 | common.append(ch); 115 | copy.setCharAt(j,'*'); 116 | } 117 | } 118 | } 119 | return common.toString(); 120 | } 121 | 122 | /** 123 | * Calculates the number of transpositions between two strings. 124 | * @param first The first string. 125 | * @param second The second string. 126 | * @return The number of transpositions between the two strings. 127 | */ 128 | private int transpositions(String first, String second) 129 | { 130 | int transpositions = 0; 131 | for (int i = 0; i < first.length(); i++) 132 | { 133 | if (first.charAt(i) != second.charAt(i)) 134 | { 135 | transpositions++; 136 | } 137 | } 138 | transpositions /= 2; 139 | return transpositions; 140 | } 141 | 142 | } 143 | -------------------------------------------------------------------------------- /src/main/java/net/ricecode/similarity/JaroWinklerStrategy.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2010 Ralph Allan Rice 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software 
is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | * 22 | */ 23 | 24 | package net.ricecode.similarity; 25 | 26 | /** 27 | * A strategy that uses the Jaro-Winkler Distance to calculate the similarity of two strings. 28 | * 29 | * @author Ralph Allan Rice 30 | * @see About Jaro-Winkler Distance 31 | */ 32 | public class JaroWinklerStrategy extends JaroStrategy implements SimilarityStrategy { 33 | final double DEFAULT_SCALING_FACTOR = 0.1; // This is the default scaling factor Winkler used. 34 | 35 | private double scalingFactor; 36 | 37 | /** 38 | * Constructs a new JaroWinklerStrategy instance. 39 | * @param scalingFactor The scaling factor between 0.00 and 0.25. If the scaling factor is greater than 0.25, the scaling factor is set to 0.25. 40 | */ 41 | public JaroWinklerStrategy(double scalingFactor) 42 | { 43 | if (scalingFactor > 0.25) 44 | { 45 | scalingFactor = 0.25; 46 | } 47 | this.scalingFactor = scalingFactor; 48 | } 49 | 50 | /** 51 | * Constructs a new JaroWinklerStrategy instance. 52 | */ 53 | public JaroWinklerStrategy() 54 | { 55 | this.scalingFactor = DEFAULT_SCALING_FACTOR; 56 | } 57 | 58 | /** 59 | * Calculates the similarity score of objects, where 0.0 implies absolutely no similarity 60 | * and 1.0 implies absolute similarity. 
61 | * 62 | * @param first The first string to compare. 63 | * @param second The second string to compare. 64 | * @return A number between 0.0 and 1.0. 65 | */ 66 | public double score(String first, String second) 67 | { 68 | double jaro = super.score(first, second); 69 | 70 | int cl = commonPrefixLength(first, second); 71 | 72 | // The Jaro–Winkler distance uses a prefix scale which gives more favorable ratings 73 | // to strings that match from the beginning for a set prefix length. 74 | double winkler = jaro + (scalingFactor * cl * (1.0 - jaro)); 75 | 76 | return winkler; 77 | 78 | } 79 | 80 | /** 81 | * Calculates the number of characters from the beginning of the strings that match exactly one-to-one, 82 | * up to a maximum of four (4) characters. 83 | * @param first The first string. 84 | * @param second The second string. 85 | * @return A number between 0 and 4. 86 | */ 87 | private int commonPrefixLength(String first, String second) 88 | { 89 | String shorter; 90 | String longer; 91 | 92 | // Determine which string is longer. 93 | if (first.length() > second.length()) 94 | { 95 | longer = first.toLowerCase(); 96 | shorter = second.toLowerCase(); 97 | } 98 | else 99 | { 100 | longer = second.toLowerCase(); 101 | shorter = first.toLowerCase(); 102 | } 103 | 104 | int result = 0; 105 | 106 | // Iterate through the shorter string. 107 | for (int i = 0; i < shorter.length(); i++) 108 | { 109 | if (shorter.charAt(i) != longer.charAt(i)) 110 | { 111 | break; 112 | } 113 | result++; 114 | } 115 | 116 | // Limit the result to 4. 117 | return result > 4? 
4: result; 118 | } 119 | 120 | 121 | } 122 | -------------------------------------------------------------------------------- /src/main/java/net/ricecode/similarity/LevenshteinDistanceStrategy.java: -------------------------------------------------------------------------------- 1 | package net.ricecode.similarity; 2 | 3 | /** 4 | * A strategy that uses the Levenshtein's Distance to calculate the edit distance of two strings. 5 | * Then it converts this to a "score" to fit the framework. 6 | * 7 | * @see About Levenshtein Distance 8 | */ 9 | public class LevenshteinDistanceStrategy implements SimilarityStrategy { 10 | /** 11 | * Calculates the similarity score of objects, where 0.0 implies absolutely no similarity 12 | * and 1.0 implies absolute similarity. 13 | * 14 | * @param first The first string to compare. 15 | * @param second The second string to compare. 16 | * @return A number between 0.0 and 1.0. 17 | * @throws NullPointerException if one or both of the strings are null 18 | */ 19 | public double score(String first, String second) { 20 | int maxLength = Math.max(first.length(), second.length()); 21 | //Can't divide by 0 22 | if (maxLength == 0) return 1.0d; 23 | return ((double) (maxLength - computeEditDistance(first, second))) / (double) maxLength; 24 | } 25 | 26 | protected int computeEditDistance(String first, String second) { 27 | first = first.toLowerCase(); 28 | second = second.toLowerCase(); 29 | 30 | int[] costs = new int[second.length() + 1]; 31 | for (int i = 0; i <= first.length(); i++) { 32 | int previousValue = i; 33 | for (int j = 0; j <= second.length(); j++) { 34 | if (i == 0) { 35 | costs[j] = j; 36 | } 37 | else if (j > 0) { 38 | int useValue = costs[j - 1]; 39 | if (first.charAt(i - 1) != second.charAt(j - 1)) { 40 | useValue = Math.min(Math.min(useValue, previousValue), costs[j]) + 1; 41 | } 42 | costs[j - 1] = previousValue; 43 | previousValue = useValue; 44 | 45 | } 46 | } 47 | if (i > 0) { 48 | costs[second.length()] = 
previousValue; 49 | } 50 | } 51 | return costs[second.length()]; 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/java/net/ricecode/similarity/SimilarityScore.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2010 Ralph Allan Rice 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | * 22 | */ 23 | 24 | package net.ricecode.similarity; 25 | 26 | /** 27 | * A value object contains a similarity score. 28 | * @author Ralph Allan Rice 29 | * 30 | */ 31 | public class SimilarityScore { 32 | 33 | private String key; 34 | private double score; 35 | 36 | /** 37 | * Constructs a similarity score. 38 | * @param key The string key. 39 | * @param score The score value. 
40 | */ 41 | 42 | public SimilarityScore(String key, double score) { 43 | this.key = key; 44 | this.score = score; 45 | } 46 | 47 | /** 48 | * Gets the key for this score. 49 | * @return A string. 50 | */ 51 | public String getKey() { 52 | return this.key; 53 | } 54 | 55 | /** 56 | * Gets the value of the score. 57 | * @return A double. 58 | */ 59 | public double getScore() { 60 | return this.score; 61 | } 62 | 63 | 64 | /** 65 | * Returns the hash code for this object. 66 | * @return An integer representing the hash code. 67 | */ 68 | public int hashCode() { 69 | int hash = 11; 70 | hash = 23 * hash + key.hashCode(); 71 | hash = 23 * hash + (int)(score * 1000000); 72 | return hash; 73 | } 74 | 75 | /** 76 | * Determines if the supplied object equals this object. 77 | * @return True if the keys and scores match between the two objects. Otherwise false. 78 | */ 79 | @Override 80 | public boolean equals(Object o) { 81 | if((o == null) || (o.getClass() != this.getClass())) { 82 | return false; 83 | } 84 | SimilarityScore other=(SimilarityScore)o; 85 | 86 | return this.key.equals(other.key) 87 | && this.score == other.score; 88 | } 89 | 90 | 91 | } 92 | -------------------------------------------------------------------------------- /src/main/java/net/ricecode/similarity/SimilarityStrategy.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2010 Ralph Allan Rice 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be 
included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | * 22 | */ 23 | 24 | package net.ricecode.similarity; 25 | 26 | /** 27 | * @author Ralph Allan Rice 28 | * An interface that defines methods to perform string similarity calculation. 29 | */ 30 | public interface SimilarityStrategy { 31 | 32 | /** 33 | * Calculates the similarity score of objects, where 0.0 implies absolutely no similarity 34 | * and 1.0 implies absolute similarity. 35 | * 36 | * @param first The first string to compare. 37 | * @param second The second string to compare. 38 | * @return A number between 0.0 and 1.0. 
39 | */ 40 | double score(String first, String second); 41 | } 42 | -------------------------------------------------------------------------------- /src/main/java/net/ricecode/similarity/StringSimilarityService.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2010 Ralph Allan Rice 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | * 22 | */ 23 | 24 | package net.ricecode.similarity; 25 | 26 | import java.util.Comparator; 27 | import java.util.List; 28 | 29 | /** 30 | * A service that performs string similarity calculations. 31 | * @author Ralph Allan Rice 32 | * 33 | */ 34 | public interface StringSimilarityService { 35 | 36 | /** 37 | * Calculates all similarity scores for a given set of features. 38 | * @param features The list of features. 39 | * @param target The target string to compare against the features. 
40 | * @return A list of similarity scores. 41 | */ 42 | List scoreAll(List features, String target); 43 | 44 | /** 45 | * Calculates the similarity score of a single feature. 46 | * @param feature The feature string to compare. 47 | * @param target The target string to compare against the feature. 48 | * @return The similarity score between the feature and target. 49 | */ 50 | double score(String feature, String target); 51 | 52 | 53 | /** 54 | * Finds the feature within a set of given features that best match the target string. 55 | * @param features A list of strings containing the features to compare. 56 | * @param target The target string to compare against the features. 57 | * @return A SimilarityScore that has the highest score value amongst the features. 58 | */ 59 | SimilarityScore findTop(List features, String target); 60 | 61 | /** 62 | * Finds the feature within a set of given features that best match the target string. 63 | * @param features A list of strings containing the features to compare. 64 | * @param target The target string to compare against the features. 65 | * @param comparator A comparator that is used sort the scores. 66 | * @return A SimilarityScore that has the top value amongst the features, according to the comparator. 
67 | */ 68 | SimilarityScore findTop(List features, String target, Comparator comparator); 69 | } 70 | -------------------------------------------------------------------------------- /src/main/java/net/ricecode/similarity/StringSimilarityServiceImpl.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2010 Ralph Allan Rice 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | * 22 | */ 23 | 24 | package net.ricecode.similarity; 25 | 26 | import java.util.ArrayList; 27 | import java.util.Comparator; 28 | import java.util.List; 29 | import java.util.Collections; 30 | 31 | 32 | /** 33 | * An implementation of StringSimilarityService. 
34 | * @author Ralph Allan Rice 35 | * @see StringSimilarityService 36 | */ 37 | public class StringSimilarityServiceImpl implements StringSimilarityService { 38 | 39 | private SimilarityStrategy strategy; 40 | 41 | 42 | /** 43 | * Creates a similarity calculator instance. 44 | * @param strategy The similarity strategy to use when calculating similarity scores. 45 | */ 46 | public StringSimilarityServiceImpl(SimilarityStrategy strategy) { 47 | this.strategy = strategy; 48 | } 49 | 50 | /** 51 | * Calculates all similarity scores for a given set of features. 52 | * @param features The list of features. 53 | * @param target The target string to compare against the features. 54 | * @return A list of similarity scores. 55 | */ 56 | public List scoreAll(List features, String target) 57 | { 58 | ArrayList scores = new ArrayList(); 59 | 60 | for(String feature: features) { 61 | double score = strategy.score(feature, target); 62 | scores.add(new SimilarityScore(feature, score)); 63 | } 64 | 65 | return scores; 66 | } 67 | 68 | 69 | /** 70 | * Calculates the similarity score of a single feature. 71 | * @param feature The feature string to compare. 72 | * @param target The target string to compare against the feature. 73 | * @return The similarity score between the feature and target. 74 | */ 75 | public double score(String feature, String target) 76 | { 77 | return strategy.score(feature, target); 78 | } 79 | 80 | /** 81 | * Finds the feature within a set of given features that best match the target string. 82 | * @param features A list of strings containing the features to compare. 83 | * @param target The target string to compare against the features. 84 | * @return The similarity score with the highest value. 
85 | */ 86 | public SimilarityScore findTop(List features, String target) 87 | { 88 | return findTop(features, target, new DescendingSimilarityScoreComparator()); 89 | } 90 | 91 | /** 92 | * Finds the feature within a set of given features that best match the target string. 93 | * @param features A list of strings containing the features to compare. 94 | * @param target The target string to compare against the features. 95 | * @param comparator A comparator that is used sort the scores. 96 | * @return A SimilarityScore that has the top value amongst the features, according to the comparator. 97 | */ 98 | public SimilarityScore findTop(List features, String target, Comparator comparator) 99 | { 100 | if (features.size() == 0) { 101 | return null; 102 | } 103 | List scores= scoreAll(features, target); 104 | Collections.sort(scores, comparator); 105 | return scores.get(0); 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/test/java/net/ricecode/similarity/AscendingComparatorTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2010 Ralph Allan Rice 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 
13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | * 22 | */ 23 | 24 | package net.ricecode.similarity; 25 | 26 | import static org.junit.Assert.*; 27 | 28 | import org.junit.Test; 29 | 30 | public class AscendingComparatorTest { 31 | 32 | @Test 33 | public void testCompareScoreFirstGreater() { 34 | SimilarityScore first = new SimilarityScore("First", 0.87); 35 | SimilarityScore second = new SimilarityScore("Second", 0.54); 36 | AscendingSimilarityScoreComparator c = new AscendingSimilarityScoreComparator(); 37 | assertTrue(c.compare(first, second)>0); 38 | assertTrue(c.compare(second, first)<0); 39 | } 40 | 41 | @Test 42 | public void testCompareScoreSecondGreater() { 43 | SimilarityScore first = new SimilarityScore("First", 0.37); 44 | SimilarityScore second = new SimilarityScore("Second", 0.65); 45 | AscendingSimilarityScoreComparator c = new AscendingSimilarityScoreComparator(); 46 | assertTrue(c.compare(first, second)<0); 47 | assertTrue(c.compare(second, first)>0); 48 | } 49 | 50 | @Test 51 | public void testCompareScoreEquality() { 52 | SimilarityScore first = new SimilarityScore("First", 0.96); 53 | SimilarityScore second = new SimilarityScore("Second", 0.96); 54 | AscendingSimilarityScoreComparator c = new AscendingSimilarityScoreComparator(); 55 | assertEquals(c.compare(first, second), 0); 56 | assertEquals(c.compare(second, first), 0); 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- 
/src/test/java/net/ricecode/similarity/DescendingComparatorTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2010 Ralph Allan Rice 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 
21 | * 22 | */ 23 | 24 | package net.ricecode.similarity; 25 | 26 | import static org.junit.Assert.*; 27 | 28 | import org.junit.Test; 29 | 30 | public class DescendingComparatorTest { 31 | 32 | @Test 33 | public void testCompareScoreFirstGreater() { 34 | SimilarityScore first = new SimilarityScore("First", 0.87); 35 | SimilarityScore second = new SimilarityScore("Second", 0.54); 36 | DescendingSimilarityScoreComparator c = new DescendingSimilarityScoreComparator(); 37 | assertTrue(c.compare(first, second)<0); 38 | assertTrue(c.compare(second, first)>0); 39 | } 40 | 41 | @Test 42 | public void testCompareScoreSecondGreater() { 43 | SimilarityScore first = new SimilarityScore("First", 0.37); 44 | SimilarityScore second = new SimilarityScore("Second", 0.65); 45 | DescendingSimilarityScoreComparator c = new DescendingSimilarityScoreComparator(); 46 | assertTrue(c.compare(first, second)>0); 47 | assertTrue(c.compare(second, first)<0); 48 | } 49 | 50 | @Test 51 | public void testCompareScoreEquality() { 52 | SimilarityScore first = new SimilarityScore("First", 0.96); 53 | SimilarityScore second = new SimilarityScore("Second", 0.96); 54 | DescendingSimilarityScoreComparator c = new DescendingSimilarityScoreComparator(); 55 | assertEquals(c.compare(first, second), 0); 56 | assertEquals(c.compare(second, first), 0); 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /src/test/java/net/ricecode/similarity/DiceCoefficientStrategyTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2010 Ralph Allan Rice 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the 
Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | * 22 | */ 23 | 24 | package net.ricecode.similarity; 25 | 26 | import static org.junit.Assert.*; 27 | 28 | import org.junit.Test; 29 | 30 | public class DiceCoefficientStrategyTest { 31 | 32 | @Test 33 | public void testOneTranspostion() { 34 | SimilarityStrategy s = new DiceCoefficientStrategy(); 35 | String first = "Martha"; 36 | String second = "Marhta"; 37 | double expected = 0.400; 38 | double delta = 0.001; 39 | double actual = s.score(first, second); 40 | assertEquals(expected, actual, delta); 41 | } 42 | 43 | @Test 44 | public void testSoundAlike() { 45 | SimilarityStrategy s = new DiceCoefficientStrategy(); 46 | String first = "Dwayne"; 47 | String second = "Duane"; 48 | double expected = 0.2222; 49 | double delta = 0.001; 50 | double actual = s.score(first, second); 51 | assertEquals(expected, actual, delta); 52 | 53 | } 54 | 55 | @Test 56 | public void testMisspelledSoundAlike() { 57 | SimilarityStrategy s = new DiceCoefficientStrategy(); 58 | String first = "Dixon"; 59 | String second = "Dicksonx"; 60 | double expected = 0.363636; 61 | double delta = 0.001; 62 | double actual = s.score(first, second); 63 | assertEquals(expected, actual, delta); 64 | 65 | } 66 | 67 | @Test 68 | 
public void testAbsoluteSimilarity() { 69 | SimilarityStrategy s = new DiceCoefficientStrategy(); 70 | String first = "Mississippi"; 71 | String second = "Mississippi"; 72 | double expected = 1.000; 73 | double delta = 0.000; 74 | double actual = s.score(first, second); 75 | assertEquals(expected, actual, delta); 76 | } 77 | 78 | @Test 79 | public void testAbsoluteDissimilarity() { 80 | SimilarityStrategy s = new DiceCoefficientStrategy(); 81 | String first = "Mississippi"; 82 | String second = "Oklahoma"; 83 | double expected = 0.000; 84 | double delta = 0.000; 85 | double actual = s.score(first, second); 86 | assertEquals(expected, actual, delta); 87 | } 88 | 89 | } 90 | -------------------------------------------------------------------------------- /src/test/java/net/ricecode/similarity/JaroStrategyTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2010 Ralph Allan Rice 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | * 22 | */ 23 | 24 | package net.ricecode.similarity; 25 | 26 | import static org.junit.Assert.*; 27 | import org.junit.Test; 28 | 29 | public class JaroStrategyTest { 30 | 31 | @Test 32 | public void testOneTranspostion() { 33 | SimilarityStrategy s = new JaroStrategy(); 34 | String first = "Martha"; 35 | String second = "Marhta"; 36 | double expected = 0.944; 37 | double delta = 0.001; 38 | double actual = s.score(first, second); 39 | assertEquals(expected, actual, delta); 40 | } 41 | 42 | @Test 43 | public void testSoundAlike() { 44 | SimilarityStrategy s = new JaroStrategy(); 45 | String first = "Dwayne"; 46 | String second = "Duane"; 47 | double expected = 0.822; 48 | double delta = 0.001; 49 | double actual = s.score(first, second); 50 | assertEquals(expected, actual, delta); 51 | 52 | } 53 | 54 | @Test 55 | public void testMisspelledSoundAlike() { 56 | SimilarityStrategy s = new JaroStrategy(); 57 | String first = "Dixon"; 58 | String second = "Dicksonx"; 59 | double expected = 0.767; 60 | double delta = 0.001; 61 | double actual = s.score(first, second); 62 | assertEquals(expected, actual, delta); 63 | 64 | } 65 | 66 | @Test 67 | public void testAbsoluteSimilarity() { 68 | SimilarityStrategy s = new JaroStrategy(); 69 | String first = "Mississippi"; 70 | String second = "Mississippi"; 71 | double expected = 1.000; 72 | double delta = 0.000; 73 | double actual = s.score(first, second); 74 | assertEquals(expected, actual, delta); 75 | } 76 | 77 | @Test 78 | public void testAbsoluteDissimilarity() { 79 | SimilarityStrategy s = new JaroStrategy(); 80 | String first = "Mississippi"; 81 | String second = "Oklahoma"; 82 | double expected = 0.000; 83 | double delta = 0.000; 84 | 
double actual = s.score(first, second); 85 | assertEquals(expected, actual, delta); 86 | } 87 | 88 | } 89 | -------------------------------------------------------------------------------- /src/test/java/net/ricecode/similarity/JaroWinklerStrategyTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2010 Ralph Allan Rice 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 
21 | * 22 | */ 23 | 24 | package net.ricecode.similarity; 25 | 26 | import static org.junit.Assert.*; 27 | 28 | import org.junit.Test; 29 | 30 | public class JaroWinklerStrategyTest { 31 | 32 | @Test 33 | public void testOneTranspostion() { 34 | SimilarityStrategy s = new JaroWinklerStrategy(); 35 | String first = "Martha"; 36 | String second = "Marhta"; 37 | double expected = 0.961; 38 | double delta = 0.001; 39 | double actual = s.score(first, second); 40 | assertEquals(expected, actual, delta); 41 | } 42 | 43 | @Test 44 | public void testSoundAlike() { 45 | SimilarityStrategy s = new JaroWinklerStrategy(); 46 | String first = "Dwayne"; 47 | String second = "Duane"; 48 | double expected = 0.840; 49 | double delta = 0.001; 50 | double actual = s.score(first, second); 51 | assertEquals(expected, actual, delta); 52 | 53 | } 54 | 55 | @Test 56 | public void testMisspelledSoundAlike() { 57 | SimilarityStrategy s = new JaroWinklerStrategy(); 58 | String first = "Dixon"; 59 | String second = "Dicksonx"; 60 | double expected = 0.813; 61 | double delta = 0.001; 62 | double actual = s.score(first, second); 63 | assertEquals(expected, actual, delta); 64 | 65 | } 66 | 67 | @Test 68 | public void testAbsoluteSimilarity() { 69 | SimilarityStrategy s = new JaroStrategy(); 70 | String first = "Mississippi"; 71 | String second = "Mississippi"; 72 | double expected = 1.000; 73 | double delta = 0.000; 74 | double actual = s.score(first, second); 75 | assertEquals(expected, actual, delta); 76 | } 77 | 78 | @Test 79 | public void testAbsoluteDissimilarity() { 80 | SimilarityStrategy s = new JaroStrategy(); 81 | String first = "Mississippi"; 82 | String second = "Oklahoma"; 83 | double expected = 0.000; 84 | double delta = 0.000; 85 | double actual = s.score(first, second); 86 | assertEquals(expected, actual, delta); 87 | } 88 | } -------------------------------------------------------------------------------- 
/src/test/java/net/ricecode/similarity/LevenshteinDistanceStrategyTest.java: -------------------------------------------------------------------------------- 1 | package net.ricecode.similarity; 2 | 3 | import org.junit.Test; 4 | import static org.junit.Assert.*; 5 | 6 | public class LevenshteinDistanceStrategyTest { 7 | 8 | @Test (expected = NullPointerException.class) 9 | public void testNullThrows() { 10 | LevenshteinDistanceStrategy lds = new LevenshteinDistanceStrategy(); 11 | lds.score(null, "kEvIn"); 12 | } 13 | 14 | @Test 15 | public void emptyStringTest() { 16 | LevenshteinDistanceStrategy lds = new LevenshteinDistanceStrategy(); 17 | double response = lds.score("", ""); 18 | assertEquals(1.0, response, 0.0); 19 | } 20 | 21 | @Test 22 | public void testExactMatchDifferentCase() { 23 | LevenshteinDistanceStrategy lds = new LevenshteinDistanceStrategy(); 24 | double response = lds.score("KeViN", "kevin"); 25 | assertEquals(1.0, response, 0.0); 26 | } 27 | 28 | @Test 29 | public void testExactMatchSameCase() { 30 | LevenshteinDistanceStrategy lds = new LevenshteinDistanceStrategy(); 31 | double response = lds.score("java", "java"); 32 | assertEquals(1.0, response, 0.0); 33 | } 34 | 35 | @Test 36 | public void testNoSimilarity() { 37 | LevenshteinDistanceStrategy lds = new LevenshteinDistanceStrategy(); 38 | double response = lds.score("abc", "def"); 39 | assertEquals(0.0, response, 0.0); 40 | } 41 | 42 | @Test 43 | public void score1() { 44 | LevenshteinDistanceStrategy lds = new LevenshteinDistanceStrategy(); 45 | double response = lds.score("he", "head"); 46 | assertEquals(0.5d, response, 0.0001d); 47 | } 48 | 49 | @Test 50 | public void score2() { 51 | LevenshteinDistanceStrategy lds = new LevenshteinDistanceStrategy(); 52 | double response = lds.score("hd", "head"); 53 | assertEquals(0.5d, response, 0.0001d); 54 | } 55 | 56 | @Test 57 | public void score3() { 58 | LevenshteinDistanceStrategy lds = new LevenshteinDistanceStrategy(); 59 | double response = 
lds.score("d", "head"); 60 | assertEquals(0.25d, response, 0.0001d); 61 | } 62 | 63 | @Test 64 | public void score4() { 65 | LevenshteinDistanceStrategy lds = new LevenshteinDistanceStrategy(); 66 | double response = lds.score("head", "he"); 67 | assertEquals(0.5d, response, 0.0001d); 68 | } 69 | 70 | @Test 71 | public void score5() { 72 | LevenshteinDistanceStrategy lds = new LevenshteinDistanceStrategy(); 73 | double response = lds.score("kitten", "sitting"); 74 | assertEquals(0.5714d, response, 0.0001d); 75 | } 76 | 77 | @Test 78 | public void score6() { 79 | LevenshteinDistanceStrategy lds = new LevenshteinDistanceStrategy(); 80 | double response = lds.score("Saturday", "Sunday"); 81 | assertEquals(0.625d, response, 0.0001d); 82 | } 83 | 84 | 85 | 86 | 87 | } 88 | -------------------------------------------------------------------------------- /src/test/java/net/ricecode/similarity/SimilarityScoreTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2010 Ralph Allan Rice 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 20 | * THE SOFTWARE. 21 | * 22 | */ 23 | 24 | package net.ricecode.similarity; 25 | 26 | import static org.junit.Assert.*; 27 | 28 | import org.junit.Test; 29 | 30 | public class SimilarityScoreTest { 31 | 32 | @Test 33 | public void testGetKey() { 34 | SimilarityScore s = new SimilarityScore("Test", 0.99); 35 | assertEquals("Test", s.getKey()); 36 | } 37 | 38 | @Test 39 | public void testGetScore() { 40 | SimilarityScore s = new SimilarityScore("Test", 0.99); 41 | assertEquals(0.99, s.getScore(), 0.000); 42 | 43 | } 44 | 45 | 46 | 47 | } 48 | -------------------------------------------------------------------------------- /src/test/java/net/ricecode/similarity/StringSimilarityServiceImplTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2010 Ralph Allan Rice 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to deal 6 | * in the Software without restriction, including without limitation the rights 7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | * copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 */

package net.ricecode.similarity;

import static org.junit.Assert.*;
import static org.mockito.Mockito.*;

import java.util.ArrayList;
import java.util.List;

import org.junit.Test;


/**
 * Tests {@link StringSimilarityServiceImpl} against a Mockito mock of
 * {@link SimilarityStrategy}. Each test stubs the mock's scores for three
 * candidate strings, calls one service method, and verifies that the service
 * delegated to the strategy with {@code (candidate, target)} arguments.
 */
public class StringSimilarityServiceImplTest {

    /** Builds a mutable, typed list holding the given candidates in order. */
    private static List<String> listOf(String... items) {
        List<String> list = new ArrayList<String>();
        for (String item : items) {
            list.add(item);
        }
        return list;
    }

    @Test
    public void testScoreAll() {
        SimilarityStrategy strategy = mock(SimilarityStrategy.class);
        String target = "McDonalds";
        String c1 = "MacMahons";
        String c2 = "McPherson";
        String c3 = "McDonalds";

        // Fix: stub with (candidate, target), the same argument order that is
        // verified below and used by every other test in this class. The
        // stubs used to be registered as (target, candidate), which never
        // matched the verified calls.
        when(strategy.score(c1, target)).thenReturn(0.90);
        when(strategy.score(c2, target)).thenReturn(0.74);
        when(strategy.score(c3, target)).thenReturn(1.000);

        StringSimilarityService service = new StringSimilarityServiceImpl(strategy);
        List<String> features = listOf(c1, c2, c3);

        List<SimilarityScore> scores = service.scoreAll(features, target);
        verify(strategy).score(c1, target);
        verify(strategy).score(c2, target);
        verify(strategy).score(c3, target);
        assertEquals(3, scores.size());
    }

    @Test
    public void testScore() {
        SimilarityStrategy strategy = mock(SimilarityStrategy.class);
        String target = "McDonalds";
        String c1 = "MacMahons";
        String c2 = "McPherson";
        String c3 = "McDonalds";

        when(strategy.score(c1, target)).thenReturn(0.90);
        when(strategy.score(c2, target)).thenReturn(0.74);
        when(strategy.score(c3, target)).thenReturn(1.000);

        StringSimilarityService service = new StringSimilarityServiceImpl(strategy);

        double score = service.score(c1, target);
        verify(strategy).score(c1, target);
        // The service is expected to hand back the stubbed value unchanged.
        assertEquals(0.90, score, 0.000);
    }

    @Test
    public void testFindTop() {
        SimilarityStrategy strategy = mock(SimilarityStrategy.class);
        String target = "McDonalds";
        String c1 = "MacMahons";
        String c2 = "McPherson";
        String c3 = "McDonalds";

        // c3 carries the highest stubbed score.
        SimilarityScore expected = new SimilarityScore(c3, 1.000);

        when(strategy.score(c1, target)).thenReturn(0.90);
        when(strategy.score(c2, target)).thenReturn(0.74);
        when(strategy.score(c3, target)).thenReturn(1.000);

        StringSimilarityService service = new StringSimilarityServiceImpl(strategy);
        List<String> features = listOf(c1, c2, c3);

        SimilarityScore top = service.findTop(features, target);
        verify(strategy).score(c1, target);
        verify(strategy).score(c2, target);
        verify(strategy).score(c3, target);
        assertEquals(expected, top);
    }

    @Test
    public void testFindTop_Ascending() {
        SimilarityStrategy strategy = mock(SimilarityStrategy.class);
        String target = "McDonalds";
        String c1 = "MacMahons";
        String c2 = "McPherson";
        String c3 = "McDonalds";

        // With the ascending comparator the lowest stubbed score (c2) is expected on top.
        SimilarityScore expected = new SimilarityScore(c2, 0.74);

        when(strategy.score(c1, target)).thenReturn(0.90);
        when(strategy.score(c2, target)).thenReturn(0.74);
        when(strategy.score(c3, target)).thenReturn(1.000);

        StringSimilarityService service = new StringSimilarityServiceImpl(strategy);
        List<String> features = listOf(c1, c2, c3);

        AscendingSimilarityScoreComparator comparator = new AscendingSimilarityScoreComparator();
        SimilarityScore top = service.findTop(features, target, comparator);
        verify(strategy).score(c1, target);
        verify(strategy).score(c2, target);
        verify(strategy).score(c3, target);
        assertEquals(expected, top);
    }

    @Test
    public void testFindTop_Descending() {
        SimilarityStrategy strategy = mock(SimilarityStrategy.class);
        String target = "McDonalds";
        String c1 = "MacMahons";
        String c2 = "McPherson";
        String c3 = "McDonalds";

        // With the descending comparator the highest stubbed score (c3) is expected on top.
        SimilarityScore expected = new SimilarityScore(c3, 1.000);

        when(strategy.score(c1, target)).thenReturn(0.90);
        when(strategy.score(c2, target)).thenReturn(0.74);
        when(strategy.score(c3, target)).thenReturn(1.000);

        StringSimilarityService service = new StringSimilarityServiceImpl(strategy);
        List<String> features = listOf(c1, c2, c3);

        DescendingSimilarityScoreComparator comparator = new DescendingSimilarityScoreComparator();
        SimilarityScore top = service.findTop(features, target, comparator);
        verify(strategy).score(c1, target);
        verify(strategy).score(c2, target);
        verify(strategy).score(c3, target);
        assertEquals(expected, top);
    }
}

--------------------------------------------------------------------------------