├── duke-core ├── src │ ├── test │ │ ├── resources │ │ │ ├── old-format.txt │ │ │ ├── sparql-empty.xml │ │ │ ├── config-empty.xml │ │ │ ├── config-database.xml │ │ │ ├── config-custom-comparator.xml │ │ │ ├── sparql-bnode.xml │ │ │ ├── sparql-onerow.xml │ │ │ ├── config-default-probs.xml │ │ │ ├── sparql-onerow2col.xml │ │ │ ├── config-no-comparator.xml │ │ │ ├── config-custom-estimator.xml │ │ │ ├── sparql-tworow2col.xml │ │ │ ├── config-no-object.xml │ │ │ ├── config-lookup.xml │ │ │ └── sparql-tworow2col-inconsistent.xml │ │ └── java │ │ │ └── no │ │ │ └── priv │ │ │ └── garshol │ │ │ └── duke │ │ │ ├── test │ │ │ ├── InMemoryClassDatabaseTest.java │ │ │ └── RecordImplTest.java │ │ │ ├── matchers │ │ │ └── InMemoryLinkDatabaseMatchListenerTest.java │ │ │ ├── databases │ │ │ ├── InMemoryDatabaseTest.java │ │ │ ├── KeyValueDatabaseTest.java │ │ │ ├── PriorityQueueTest.java │ │ │ └── InMemoryBlockingDatabaseTest.java │ │ │ ├── JDBCClassDatabaseTest.java │ │ │ ├── cleaners │ │ │ ├── PersonNameCleanerTest.java │ │ │ ├── TrimCleanerTest.java │ │ │ ├── DigitsOnlyCleanerTest.java │ │ │ ├── FamilyCommaGivenCleanerTest.java │ │ │ ├── NorwegianCompanyNameCleanerTest.java │ │ │ ├── HTMLCleanerTest.java │ │ │ ├── RegexpCleanerTest.java │ │ │ ├── PhoneNumberCleanerTest.java │ │ │ └── LowerCaseNormalizeCleanerTest.java │ │ │ ├── comparators │ │ │ ├── DifferentComparatorTest.java │ │ │ ├── JaccardIndexComparatorTest.java │ │ │ ├── DiceCoefficientComparatorTest.java │ │ │ ├── WeightedLevenshteinTest.java │ │ │ ├── NumericComparatorTest.java │ │ │ ├── SoundexComparatorTest.java │ │ │ ├── QGramComparatorTest.java │ │ │ └── GeopositionComparatorTest.java │ │ │ ├── utils │ │ │ ├── LinkDatabaseUtilsTest.java │ │ │ └── PropertyUtilsTest.java │ │ │ ├── CompactRecordTest.java │ │ │ ├── datasources │ │ │ └── InMemoryDataSourceTest.java │ │ │ └── genetic │ │ │ ├── ComparatorAspectTest.java │ │ │ └── ActiveLearningTest.java │ └── main │ │ ├── java │ │ └── no │ │ │ └── priv │ │ │ └── 
garshol │ │ │ └── duke │ │ │ ├── package.html │ │ │ ├── comparators │ │ │ ├── package.html │ │ │ ├── ExactComparator.java │ │ │ ├── DifferentComparator.java │ │ │ ├── Matcher.java │ │ │ ├── NumericComparator.java │ │ │ ├── DiceCoefficientComparator.java │ │ │ ├── JaccardIndexComparator.java │ │ │ └── SoundexComparator.java │ │ │ ├── datasources │ │ │ ├── package.html │ │ │ ├── JNDIDataSource.java │ │ │ ├── Column.java │ │ │ ├── InMemoryDataSource.java │ │ │ ├── RecordBuilder.java │ │ │ └── ColumnarDataSource.java │ │ │ ├── utils │ │ │ ├── package.html │ │ │ ├── DefaultRecordIterator.java │ │ │ ├── SparqlResult.java │ │ │ ├── YesNoConsole.java │ │ │ ├── TestFileUtils.java │ │ │ ├── PropertyUtils.java │ │ │ ├── LinkFileWriter.java │ │ │ ├── Utils.java │ │ │ ├── LinkDatabaseUtils.java │ │ │ └── StringUtils.java │ │ │ ├── matchers │ │ │ ├── package.html │ │ │ ├── AbstractMatchListener.java │ │ │ ├── ClassDatabaseMatchListener.java │ │ │ └── MatchListener.java │ │ │ ├── databases │ │ │ ├── package.html │ │ │ ├── KeyFunction.java │ │ │ ├── AbstractKeyFunction.java │ │ │ ├── Bucket.java │ │ │ ├── InMemoryBlockingDatabase.java │ │ │ ├── KeyValueStore.java │ │ │ ├── InMemoryDatabase.java │ │ │ └── InMemoryKeyValueStore.java │ │ │ ├── cleaners │ │ │ ├── package.html │ │ │ ├── TrimCleaner.java │ │ │ ├── DigitsOnlyCleaner.java │ │ │ ├── GenericValueCleaner.java │ │ │ ├── StripNontextCharacters.java │ │ │ ├── ChainedCleaner.java │ │ │ ├── Transform.java │ │ │ ├── FamilyCommaGivenCleaner.java │ │ │ ├── AbstractRuleBasedCleaner.java │ │ │ ├── MappingFileCleaner.java │ │ │ ├── NorwegianCompanyNameCleaner.java │ │ │ ├── NorwegianAddressCleaner.java │ │ │ ├── HTMLCleaner.java │ │ │ ├── PersonNameCleaner.java │ │ │ ├── LowerCaseNormalizeCleaner.java │ │ │ └── RegexpCleaner.java │ │ │ ├── genetic │ │ │ ├── package.html │ │ │ ├── Oracle.java │ │ │ ├── Aspect.java │ │ │ ├── Pair.java │ │ │ ├── ThresholdAspect.java │ │ │ ├── FloatAspect.java │ │ │ ├── LinkFileOracle.java │ │ │ ├── 
HighProbabilityAspect.java │ │ │ ├── LowProbabilityAspect.java │ │ │ ├── ComparatorAspect.java │ │ │ ├── ConsoleOracle.java │ │ │ └── ExemplarsTracker.java │ │ │ ├── examples │ │ │ ├── package.html │ │ │ ├── CountryNameCleaner.java │ │ │ └── CapitalCleaner.java │ │ │ ├── StatementHandler.java │ │ │ ├── Cleaner.java │ │ │ ├── DukeConfigException.java │ │ │ ├── LinkSource.java │ │ │ ├── DukeException.java │ │ │ ├── ModifiableRecord.java │ │ │ ├── Comparator.java │ │ │ ├── JNDILinkDatabase.java │ │ │ ├── DummyLogger.java │ │ │ ├── Logger.java │ │ │ ├── RecordIterator.java │ │ │ ├── LinkStatus.java │ │ │ ├── LinkKind.java │ │ │ ├── JDBCLinkDatabase.java │ │ │ ├── Record.java │ │ │ ├── Database.java │ │ │ ├── DataSource.java │ │ │ ├── LinkDatabase.java │ │ │ ├── EquivalenceClassDatabase.java │ │ │ ├── RecordSearch.java │ │ │ ├── RecordImpl.java │ │ │ ├── CompactRecord.java │ │ │ ├── Property.java │ │ │ └── AbstractCmdlineTool.java │ │ └── resources │ │ └── no │ │ └── priv │ │ └── garshol │ │ └── duke │ │ ├── duke.properties │ │ ├── name-mappings.txt │ │ └── config-schema.rnc └── buildNumber.properties ├── .travis.yml ├── .gitignore ├── duke-es ├── src │ ├── main │ │ └── java │ │ │ └── no │ │ │ └── priv │ │ │ └── garshol │ │ │ └── duke │ │ │ └── databases │ │ │ └── es │ │ │ └── StorageType.java │ └── test │ │ ├── resources │ │ └── config-database.xml │ │ └── java │ │ └── no │ │ └── priv │ │ └── garshol │ │ └── duke │ │ └── databases │ │ └── es │ │ └── ElasticSearchConfigLoaderTest.java └── pom.xml ├── duke-server ├── src │ └── main │ │ └── java │ │ └── no │ │ └── priv │ │ └── garshol │ │ └── duke │ │ └── server │ │ ├── package.html │ │ ├── DukeTimer.java │ │ ├── CommonJTimer.java │ │ └── BasicTimer.java └── pom.xml ├── duke-lucene ├── src │ ├── test │ │ ├── java │ │ │ └── no │ │ │ │ └── priv │ │ │ │ └── garshol │ │ │ │ └── duke │ │ │ │ └── databases │ │ │ │ ├── LuceneDatabaseTest.java │ │ │ │ ├── PersistentLuceneDatabaseTest.java │ │ │ │ ├── LuceneConfigLoaderTest.java │ 
│ │ │ └── ExtraLuceneDatabaseTest.java │ │ └── resources │ │ │ └── config-database.xml │ └── main │ │ └── java │ │ └── no │ │ └── priv │ │ └── garshol │ │ └── duke │ │ └── databases │ │ └── DocumentRecord.java └── pom.xml ├── duke-mapdb ├── src │ └── test │ │ └── java │ │ └── no │ │ └── priv │ │ └── garshol │ │ └── duke │ │ └── databases │ │ ├── MapDBBlockingDatabaseTest.java │ │ └── PersistentMapDBBlockingDatabaseTest.java └── pom.xml ├── changes.txt ├── doc ├── example-data │ ├── db-nationality.txt │ ├── namebase.xml │ ├── dogfood-sparql.xml │ └── dogfood.xml └── tutorials │ └── 2011_05_data-cleansing.textile ├── duke-json ├── pom.xml └── src │ └── test │ └── java │ └── no │ └── priv │ └── garshol │ └── duke │ └── datasources │ └── JsonDataSourceTest.java ├── duke-mongodb └── pom.xml └── duke-dist ├── src └── main │ └── assembly │ └── dep.xml └── pom.xml /duke-core/src/test/resources/old-format.txt: -------------------------------------------------------------------------------- 1 | +http://dbpedia.org/resource/Slovakia,19283 2 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: java 2 | script: "mvn verify -P fast" 3 | jdk: 4 | - oraclejdk7 5 | - oraclejdk8 -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/package.html: -------------------------------------------------------------------------------- 1 | 2 |

The main Duke API is here. 3 | 4 | -------------------------------------------------------------------------------- /duke-core/src/test/resources/sparql-empty.xml: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/comparators/package.html: -------------------------------------------------------------------------------- 1 | 2 |

Duke's built-in comparators. 3 | 4 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/datasources/package.html: -------------------------------------------------------------------------------- 1 | 2 |

Duke's built-in data sources. 3 | 4 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/utils/package.html: -------------------------------------------------------------------------------- 1 | 2 |

Various helper classes used by Duke. 3 | 4 | -------------------------------------------------------------------------------- /duke-core/src/test/resources/config-empty.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 0.4 4 | 5 | -------------------------------------------------------------------------------- /duke-core/buildNumber.properties: -------------------------------------------------------------------------------- 1 | #maven.buildNumber.plugin properties file 2 | #Wed Oct 11 12:15:08 WIB 2023 3 | buildNumber0=3666 4 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/matchers/package.html: -------------------------------------------------------------------------------- 1 | 2 |

Duke's event listeners for receiving matches between records. 3 | 4 | -------------------------------------------------------------------------------- /duke-core/src/main/resources/no/priv/garshol/duke/duke.properties: -------------------------------------------------------------------------------- 1 | duke.version=${project.version} 2 | duke.build=${buildNumber} 3 | duke.builder=${user.name} 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Eclipse 2 | .classpath 3 | .project 4 | .settings/ 5 | 6 | ### Maven 7 | target/ 8 | 9 | ### JetBrains 10 | *.iml 11 | *.ipr 12 | *.iws 13 | /.idea/ -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/databases/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |

This package contains all implementations of the Database interface. 4 | 5 | -------------------------------------------------------------------------------- /duke-es/src/main/java/no/priv/garshol/duke/databases/es/StorageType.java: -------------------------------------------------------------------------------- 1 | package no.priv.garshol.duke.databases.es; 2 | 3 | public enum StorageType { 4 | MEMORY, DISK 5 | } 6 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/cleaners/package.html: -------------------------------------------------------------------------------- 1 | 2 |

Duke's built-in cleaners, plus utility classes for building your 3 | own cleaners. 4 | 5 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/genetic/package.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |

Implementation of the genetic algorithm for automatically creating 4 | Duke configurations. 5 | 6 | -------------------------------------------------------------------------------- /duke-core/src/test/resources/config-database.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 0.4 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/examples/package.html: -------------------------------------------------------------------------------- 1 | 2 |

This package contains helper classes for the examples, like 3 | cleaners and comparators which are too specific to include as part of 4 | Duke. 5 | 6 | -------------------------------------------------------------------------------- /duke-server/src/main/java/no/priv/garshol/duke/server/package.html: -------------------------------------------------------------------------------- 1 | 2 |

Contains classes for running Duke as an app in a servlet container, 3 | allowing it to incrementally process new and changed data as it 4 | arrives from a remote service. 5 | 6 | -------------------------------------------------------------------------------- /duke-core/src/test/resources/config-custom-comparator.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 0.89 5 | 6 | -------------------------------------------------------------------------------- /duke-core/src/test/resources/sparql-bnode.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | r2 8 | 9 | 10 | -------------------------------------------------------------------------------- /duke-core/src/test/resources/sparql-onerow.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 1 8 | 9 | 10 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/StatementHandler.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke; 3 | 4 | /** 5 | * Event-handler which receives parsed statements. 6 | */ 7 | public interface StatementHandler { 8 | public void statement(String subject, String property, String object, 9 | boolean literal); 10 | } 11 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/Cleaner.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke; 3 | 4 | /** 5 | * A function which can turn a value into a normalized value suitable 6 | * for comparison. 7 | */ 8 | public interface Cleaner { 9 | 10 | /** 11 | * Returns a cleaned value. 
12 | */ 13 | public String clean(String value); 14 | 15 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/DukeConfigException.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke; 3 | 4 | /** 5 | * Thrown when there is an error in the configuration of Duke. 6 | */ 7 | public class DukeConfigException extends RuntimeException { 8 | 9 | public DukeConfigException(String message) { 10 | super(message); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/databases/KeyFunction.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.databases; 3 | 4 | import no.priv.garshol.duke.Record; 5 | 6 | /** 7 | * A key function produces a blocking key from a record. 8 | * @since 1.2 9 | */ 10 | public interface KeyFunction { 11 | 12 | public String makeKey(Record record); 13 | 14 | } 15 | -------------------------------------------------------------------------------- /duke-core/src/test/resources/config-default-probs.xml: -------------------------------------------------------------------------------- 1 | 4 | 5 | 6 | 0.89 7 | 8 | 9 | FIRSTNAME 10 | no.priv.garshol.duke.comparators.JaroWinklerTokenized 11 | 12 | 13 | -------------------------------------------------------------------------------- /duke-core/src/test/resources/sparql-onerow2col.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 1 9 | http://example.org 10 | 11 | 12 | -------------------------------------------------------------------------------- /duke-core/src/test/java/no/priv/garshol/duke/test/InMemoryClassDatabaseTest.java: -------------------------------------------------------------------------------- 1 | 2 | package 
no.priv.garshol.duke.test; 3 | 4 | import no.priv.garshol.duke.InMemoryClassDatabase; 5 | import no.priv.garshol.duke.EquivalenceClassDatabase; 6 | 7 | public class InMemoryClassDatabaseTest extends ClassDatabaseTest { 8 | 9 | public EquivalenceClassDatabase createDatabase() { 10 | return new InMemoryClassDatabase(); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /duke-core/src/test/java/no/priv/garshol/duke/matchers/InMemoryLinkDatabaseMatchListenerTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.matchers; 3 | 4 | import no.priv.garshol.duke.InMemoryLinkDatabase; 5 | import no.priv.garshol.duke.LinkDatabase; 6 | 7 | public class InMemoryLinkDatabaseMatchListenerTest 8 | extends LinkDatabaseMatchListenerTest { 9 | 10 | protected LinkDatabase makeDatabase() { 11 | return new InMemoryLinkDatabase(); 12 | } 13 | 14 | } -------------------------------------------------------------------------------- /duke-core/src/test/java/no/priv/garshol/duke/databases/InMemoryDatabaseTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.databases; 3 | 4 | import no.priv.garshol.duke.Configuration; 5 | import no.priv.garshol.duke.Database; 6 | 7 | public class InMemoryDatabaseTest extends DatabaseTest { 8 | 9 | public Database createDatabase(Configuration config) { 10 | Database db = new InMemoryDatabase(); 11 | db.setConfiguration(config); 12 | return db; 13 | } 14 | 15 | } -------------------------------------------------------------------------------- /duke-core/src/test/java/no/priv/garshol/duke/databases/KeyValueDatabaseTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.databases; 3 | 4 | import no.priv.garshol.duke.Configuration; 5 | import no.priv.garshol.duke.Database; 6 | 7 | public class 
KeyValueDatabaseTest extends DatabaseTest { 8 | 9 | public Database createDatabase(Configuration config) { 10 | Database db = new KeyValueDatabase(); 11 | db.setConfiguration(config); 12 | return db; 13 | } 14 | 15 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/LinkSource.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke; 3 | 4 | import java.util.Collection; 5 | 6 | /** 7 | * Experimental interface for retrieving link information from outside 8 | * sources for use inside the Duke processing. Intended to feed into a 9 | * LinkDatabase. 10 | */ 11 | public interface LinkSource { 12 | 13 | /** 14 | * Returns the links known by the source. 15 | */ 16 | public Collection getLinks(); 17 | 18 | } -------------------------------------------------------------------------------- /duke-core/src/main/resources/no/priv/garshol/duke/name-mappings.txt: -------------------------------------------------------------------------------- 1 | al,albert 2 | ben,benjamin 3 | dan,daniel 4 | danny,daniel 5 | dave,david 6 | deb,deborah 7 | debbie,deborah 8 | greg,gregory 9 | jim,james 10 | joe,joseph 11 | josh,joshua 12 | matt,matthew 13 | mike,michael 14 | norm,norman 15 | rich,richard 16 | richie,richard 17 | rick,richard 18 | rob,robert 19 | robbie,robert 20 | robby,robert 21 | sam,samuel 22 | sammy,samuel 23 | tim,timothy 24 | tony,anthony 25 | wes,wesley 26 | -------------------------------------------------------------------------------- /duke-core/src/test/resources/config-no-comparator.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 0.89 4 | 5 | 6 | FIRSTNAME 7 | no.priv.garshol.duke.comparators.JaroWinklerTokenized 8 | 0.48 9 | 0.6 10 | 11 | 12 | 13 | LASTNAME 14 | 0.48 15 | 0.6 16 | 17 | 18 | -------------------------------------------------------------------------------- 
/duke-lucene/src/test/java/no/priv/garshol/duke/databases/LuceneDatabaseTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.databases; 3 | 4 | import no.priv.garshol.duke.Configuration; 5 | import no.priv.garshol.duke.Database; 6 | 7 | public class LuceneDatabaseTest extends DatabaseTest { 8 | 9 | public Database createDatabase(Configuration config) { 10 | Database db = new LuceneDatabase(); 11 | db.setOverwrite(true); 12 | db.setConfiguration(config); 13 | return db; 14 | } 15 | 16 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/cleaners/TrimCleaner.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.cleaners; 3 | 4 | import no.priv.garshol.duke.Cleaner; 5 | 6 | /** 7 | * A cleaner which removes leading and trailing whitespace, without 8 | * making any other changes. 
9 | */ 10 | public class TrimCleaner implements Cleaner { 11 | 12 | public String clean(String value) { 13 | value = value.trim(); 14 | if (value.equals("")) 15 | return null; 16 | return value; 17 | } 18 | 19 | } -------------------------------------------------------------------------------- /duke-core/src/test/java/no/priv/garshol/duke/JDBCClassDatabaseTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke; 3 | 4 | import java.util.Properties; 5 | 6 | import no.priv.garshol.duke.test.ClassDatabaseTest; 7 | 8 | public class JDBCClassDatabaseTest extends ClassDatabaseTest { 9 | 10 | public EquivalenceClassDatabase createDatabase() { 11 | return new JDBCEquivalenceClassDatabase("org.h2.Driver", "jdbc:h2:mem:", 12 | "h2", new Properties()); 13 | } 14 | 15 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/DukeException.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke; 3 | 4 | /** 5 | * Used to signal that something has gone wrong during Duke 6 | * processing. 7 | */ 8 | public class DukeException extends RuntimeException { 9 | 10 | public DukeException(String msg) { 11 | super(msg); 12 | } 13 | 14 | public DukeException(String msg, Throwable e) { 15 | super(msg, e); 16 | } 17 | 18 | public DukeException(Throwable e) { 19 | super(e); 20 | } 21 | 22 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/genetic/Oracle.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.genetic; 3 | 4 | import no.priv.garshol.duke.LinkKind; 5 | 6 | /** 7 | * An oracle can say whether a given match is correct or not. 
8 | */ 9 | public interface Oracle { 10 | 11 | /** 12 | * Asks the oracle whether the two IDs represent the same thing or 13 | * not, and returns the answer. MAYBESAME means we don't know. 14 | */ 15 | public LinkKind getLinkKind(String id1, String id2); 16 | 17 | } 18 | -------------------------------------------------------------------------------- /duke-core/src/test/resources/config-custom-estimator.xml: -------------------------------------------------------------------------------- 1 | 2 | 4 | 5 | 6 | 7 | 9 | 10 | 11 | 12 | 13 | 0.89 14 | 15 | 16 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/ModifiableRecord.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke; 3 | 4 | /** 5 | * Extended Record interface with support for modification. Mainly 6 | * used by RecordBuilder. 7 | * @since 1.2 8 | */ 9 | public interface ModifiableRecord extends Record { 10 | 11 | /** 12 | * Adds a new value to the record. 13 | */ 14 | public void addValue(String property, String value); 15 | 16 | /** 17 | * Returns true iff the record has no values. 
18 | */ 19 | public boolean isEmpty(); 20 | } -------------------------------------------------------------------------------- /duke-core/src/test/resources/sparql-tworow2col.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 1 9 | http://example.org 10 | 11 | 12 | 2 13 | http://example.com 14 | 15 | 16 | -------------------------------------------------------------------------------- /duke-core/src/test/resources/config-no-object.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 0.89 6 | 7 | 8 | FIRSTNAME 9 | no.priv.garshol.duke.comparators.JaroWinklerTokenized 10 | 0.48 11 | 0.6 12 | 13 | 14 | 15 | LASTNAME 16 | 0.48 17 | 0.6 18 | 19 | 20 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/comparators/ExactComparator.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.comparators; 3 | 4 | import no.priv.garshol.duke.Comparator; 5 | 6 | /** 7 | * Comparator which compares two values exactly. It returns 1.0 if 8 | * they are equal, and 0.0 if they are different. 9 | */ 10 | public class ExactComparator implements Comparator { 11 | 12 | public boolean isTokenized() { 13 | return false; 14 | } 15 | 16 | public double compare(String v1, String v2) { 17 | return v1.equals(v2) ? 
1.0 : 0.0; 18 | } 19 | 20 | } -------------------------------------------------------------------------------- /duke-core/src/test/resources/config-lookup.xml: -------------------------------------------------------------------------------- 1 | 4 | 5 | 6 | 0.89 7 | 8 | 9 | FIRSTNAME 10 | no.priv.garshol.duke.comparators.JaroWinklerTokenized 11 | 12 | 13 | 14 | LASTNAME 15 | no.priv.garshol.duke.comparators.JaroWinklerTokenized 16 | 17 | 18 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/Comparator.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke; 3 | 4 | /** 5 | * An operator which compares two values for similarity, and returns a 6 | * number in the range 0.0 to 1.0 indicating the degree of similarity. 7 | */ 8 | public interface Comparator { 9 | 10 | /** 11 | * Returns true if the comparator breaks string values up into 12 | * tokens when comparing. Necessary because this impacts indexing of 13 | * values. 14 | */ 15 | public boolean isTokenized(); 16 | 17 | public double compare(String v1, String v2); 18 | 19 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/comparators/DifferentComparator.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.comparators; 3 | 4 | import no.priv.garshol.duke.Comparator; 5 | 6 | /** 7 | * A comparator which returns 0.0 if two values are exactly equal, and 8 | * 1.0 if they are different. The inverse of ExactComparator. 9 | */ 10 | public class DifferentComparator implements Comparator { 11 | 12 | public boolean isTokenized() { 13 | return false; 14 | } 15 | 16 | public double compare(String v1, String v2) { 17 | return v1.equals(v2) ? 
0.0 : 1.0; 18 | } 19 | 20 | } -------------------------------------------------------------------------------- /duke-lucene/src/test/java/no/priv/garshol/duke/databases/PersistentLuceneDatabaseTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.databases; 3 | 4 | import no.priv.garshol.duke.Configuration; 5 | import no.priv.garshol.duke.Database; 6 | 7 | public class PersistentLuceneDatabaseTest extends PersistentDatabaseTest { 8 | 9 | public Database createDatabase(Configuration config) { 10 | LuceneDatabase db = new LuceneDatabase(); 11 | db.setOverwrite(false); 12 | db.setConfiguration(config); 13 | db.setPath(tmpdir.getRoot().getAbsolutePath()); 14 | return db; 15 | } 16 | 17 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/utils/DefaultRecordIterator.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.utils; 3 | 4 | import java.util.Iterator; 5 | 6 | import no.priv.garshol.duke.Record; 7 | import no.priv.garshol.duke.RecordIterator; 8 | 9 | public class DefaultRecordIterator extends RecordIterator { 10 | private Iterator it; 11 | 12 | public DefaultRecordIterator(Iterator it) { 13 | this.it = it; 14 | } 15 | 16 | public boolean hasNext() { 17 | return it.hasNext(); 18 | } 19 | 20 | public Record next() { 21 | return it.next(); 22 | } 23 | } -------------------------------------------------------------------------------- /duke-es/src/test/resources/config-database.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 0.89 5 | 6 | 7 | FIRSTNAME 8 | no.priv.garshol.duke.comparators.JaroWinklerTokenized 9 | 0.48 10 | 0.6 11 | 12 | 13 | 14 | LASTNAME 15 | 0.48 16 | 0.6 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- 
/duke-lucene/src/test/resources/config-database.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 0.89 5 | 6 | 7 | FIRSTNAME 8 | no.priv.garshol.duke.comparators.JaroWinklerTokenized 9 | 0.48 10 | 0.6 11 | 12 | 13 | 14 | LASTNAME 15 | 0.48 16 | 0.6 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/JNDILinkDatabase.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke; 3 | 4 | import no.priv.garshol.duke.utils.JDBCUtils; 5 | 6 | /** 7 | * A link database that gets its connection via JNDI lookup. 8 | */ 9 | public class JNDILinkDatabase extends RDBMSLinkDatabase { 10 | private String jndipath; 11 | 12 | public JNDILinkDatabase(String jndipath, String dbtype) { 13 | super(dbtype); 14 | this.jndipath = jndipath; 15 | this.stmt = JDBCUtils.open(jndipath); 16 | } 17 | 18 | public void validateConnection() { 19 | if (stmt != null && !JDBCUtils.validate(stmt)) 20 | stmt = JDBCUtils.open(jndipath); 21 | } 22 | 23 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/cleaners/DigitsOnlyCleaner.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.cleaners; 3 | 4 | import no.priv.garshol.duke.Cleaner; 5 | 6 | /** 7 | * Cleaner which removes all characters except the digits 0-9. 
8 | */ 9 | public class DigitsOnlyCleaner implements Cleaner { 10 | 11 | public String clean(String value) { 12 | char[] tmp = new char[value.length()]; 13 | int pos = 0; 14 | for (int ix = 0; ix < tmp.length; ix++) { 15 | char ch = value.charAt(ix); 16 | if (ch >= '0' && ch <= '9') 17 | tmp[pos++] = ch; 18 | } 19 | if (pos == 0) 20 | return null; 21 | return new String(tmp, 0, pos); 22 | } 23 | 24 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/genetic/Aspect.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.genetic; 3 | 4 | /** 5 | * Represents an aspect of a Configuration that might be changed by 6 | * the genetic algorithm. 7 | */ 8 | public abstract class Aspect { 9 | 10 | /** 11 | * Randomly modify this aspect of the configuration. 12 | */ 13 | public abstract void setRandomly(GeneticConfiguration config); 14 | 15 | /** 16 | * Set this aspect of the configuration to be the same as that of 17 | * the other configuration. 
18 | */ 19 | public abstract void setFromOther(GeneticConfiguration config, 20 | GeneticConfiguration other); 21 | } 22 | -------------------------------------------------------------------------------- /duke-core/src/test/resources/sparql-tworow2col-inconsistent.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 10 | 11 | 1 12 | http://example.org 13 | 14 | 15 | http://example.com 16 | 2 17 | 18 | 19 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/examples/CountryNameCleaner.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.examples; 3 | 4 | import no.priv.garshol.duke.Cleaner; 5 | import no.priv.garshol.duke.cleaners.LowerCaseNormalizeCleaner; 6 | 7 | public class CountryNameCleaner implements Cleaner { 8 | private LowerCaseNormalizeCleaner sub; 9 | 10 | public CountryNameCleaner() { 11 | this.sub = new LowerCaseNormalizeCleaner(); 12 | } 13 | 14 | public String clean(String value) { 15 | // do basic cleaning 16 | value = sub.clean(value); 17 | if (value == null || value.equals("")) 18 | return ""; 19 | 20 | // do our stuff 21 | if (value.startsWith("the ")) 22 | value = value.substring(4); 23 | 24 | return value; 25 | } 26 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/examples/CapitalCleaner.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.examples; 3 | 4 | import no.priv.garshol.duke.Cleaner; 5 | import no.priv.garshol.duke.cleaners.LowerCaseNormalizeCleaner; 6 | 7 | public class CapitalCleaner implements Cleaner { 8 | private LowerCaseNormalizeCleaner sub; 9 | 10 | public CapitalCleaner() { 11 | this.sub = new LowerCaseNormalizeCleaner(); 12 | } 13 | 14 | public String clean(String value) 
{ 15 | // do basic cleaning 16 | value = sub.clean(value); 17 | if (value == null || value.equals("")) 18 | return ""; 19 | 20 | // do our stuff 21 | int ix = value.indexOf(','); 22 | if (ix != -1) 23 | value = value.substring(0, ix); 24 | 25 | return value; 26 | } 27 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/genetic/Pair.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.genetic; 3 | 4 | /** 5 | * Represents a pair of records. 6 | */ 7 | public class Pair { 8 | public String id1; 9 | public String id2; 10 | public int counter; 11 | public boolean[] believers; // which configurations think this pair is correct 12 | 13 | public Pair(String id1, String id2) { 14 | this.id1 = id1; 15 | this.id2 = id2; 16 | } 17 | 18 | public boolean equals(Object other) { 19 | if (!(other instanceof Pair)) 20 | return false; 21 | 22 | Pair opair = (Pair) other; 23 | return opair.id1.equals(id1) && opair.id2.equals(id2); 24 | } 25 | 26 | public int hashCode() { 27 | return id1.hashCode() + id2.hashCode(); 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /duke-core/src/test/java/no/priv/garshol/duke/cleaners/PersonNameCleanerTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.cleaners; 3 | 4 | import org.junit.Before; 5 | import org.junit.Test; 6 | 7 | import static junit.framework.Assert.assertEquals; 8 | 9 | public class PersonNameCleanerTest extends LowerCaseNormalizeCleanerTest { 10 | 11 | @Before 12 | public void setUp() { 13 | cleaner = new PersonNameCleaner(); 14 | } 15 | 16 | @Test 17 | public void testMapping() { 18 | assertEquals("joseph stalin", 19 | cleaner.clean("Joe Stalin")); 20 | } 21 | 22 | @Test 23 | public void testMappingEmpty() { 24 | assertEquals("", cleaner.clean("")); 25 | } 26 
| 27 | // @Test 28 | // public void testMappingNull() { 29 | // assertEquals(null, cleaner.clean(null)); 30 | // } 31 | } -------------------------------------------------------------------------------- /duke-core/src/test/java/no/priv/garshol/duke/cleaners/TrimCleanerTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.cleaners; 3 | 4 | import org.junit.Before; 5 | import org.junit.Test; 6 | 7 | import static junit.framework.Assert.assertEquals; 8 | 9 | public class TrimCleanerTest { 10 | private TrimCleaner cleaner; 11 | 12 | @Before 13 | public void setup() { 14 | cleaner = new TrimCleaner(); 15 | } 16 | 17 | @Test 18 | public void testEmpty() { 19 | test("", null); 20 | } 21 | 22 | @Test 23 | public void testOnlyDigits() { 24 | test("314", "314"); 25 | } 26 | 27 | @Test 28 | public void testDigitsAndSpaces() { 29 | test(" 3 1 4 ", "3 1 4"); 30 | } 31 | 32 | private void test(String value, String result) { 33 | assertEquals(result, cleaner.clean(value)); 34 | } 35 | } -------------------------------------------------------------------------------- /duke-core/src/test/java/no/priv/garshol/duke/databases/PriorityQueueTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.databases; 3 | 4 | import org.junit.Test; 5 | 6 | import static junit.framework.Assert.assertEquals; 7 | 8 | public class PriorityQueueTest { 9 | 10 | @Test 11 | public void test100() { 12 | KeyValueDatabase.Score scores[] = new KeyValueDatabase.Score[100]; 13 | for (int ix = 0; ix < scores.length; ix++) { 14 | scores[ix] = new KeyValueDatabase.Score(ix); 15 | scores[ix].score = (double) ix; 16 | } 17 | KeyValueDatabase.PriorityQueue pq = 18 | new KeyValueDatabase.PriorityQueue(scores); 19 | 20 | for (int ix = 0; ix < scores.length; ix++) { 21 | KeyValueDatabase.Score score = pq.next(); 22 | assertEquals((99 - ix), (int) score.score); 23 | } 24 | } 
25 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/genetic/ThresholdAspect.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.genetic; 3 | 4 | import no.priv.garshol.duke.Configuration; 5 | 6 | /** 7 | * Sets the threshold. 8 | */ 9 | public class ThresholdAspect extends FloatAspect { 10 | 11 | public void setRandomly(GeneticConfiguration cfg) { 12 | Configuration config = cfg.getConfiguration(); 13 | double new_value = drift(config.getThreshold(), 1.0, 0.0); 14 | config.setThreshold(new_value); 15 | } 16 | 17 | public void setFromOther(GeneticConfiguration cfg1, 18 | GeneticConfiguration cfg2) { 19 | Configuration config = cfg1.getConfiguration(); 20 | Configuration other = cfg2.getConfiguration(); 21 | 22 | config.setThreshold(other.getThreshold()); 23 | } 24 | 25 | 26 | } 27 | -------------------------------------------------------------------------------- /duke-lucene/src/test/java/no/priv/garshol/duke/databases/LuceneConfigLoaderTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.databases; 3 | 4 | import java.io.IOException; 5 | 6 | import no.priv.garshol.duke.ConfigLoader; 7 | import no.priv.garshol.duke.Configuration; 8 | import no.priv.garshol.duke.Database; 9 | import org.junit.Test; 10 | import org.xml.sax.SAXException; 11 | 12 | import static junit.framework.Assert.assertEquals; 13 | 14 | public class LuceneConfigLoaderTest { 15 | 16 | @Test 17 | public void testDatabase() throws IOException, SAXException { 18 | Configuration config = ConfigLoader.load("classpath:config-database.xml"); 19 | Database db = config.getDatabase(false); 20 | LuceneDatabase lucene = (LuceneDatabase) db; 21 | assertEquals("/tmp/ct-visma-1", lucene.getPath()); 22 | } 23 | 24 | } 25 | 
-------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/cleaners/GenericValueCleaner.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.cleaners; 3 | 4 | import no.priv.garshol.duke.Cleaner; 5 | 6 | /** 7 | * A cleaner which returns values as they are, but removes specific 8 | * values. This is useful in cases where users have entered so-called 9 | * "generic values". For example, if the unknown company number is 10 | * always set as "999999999", then you can use this cleaner to remove 11 | * that specific value. 12 | */ 13 | public class GenericValueCleaner implements Cleaner { 14 | private String generic; 15 | 16 | public String clean(String value) { 17 | if (generic.equals(value)) 18 | return null; 19 | return value; 20 | } 21 | 22 | public void setGeneric(String generic) { 23 | this.generic = generic; 24 | } 25 | 26 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/matchers/AbstractMatchListener.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.matchers; 3 | 4 | import no.priv.garshol.duke.Record; 5 | 6 | /** 7 | * Convenience implementation with dummy methods, since most 8 | * implementations will only implement matches(). 
/**
 * Represents the result of a SPARQL query: the list of variable names
 * and the rows, each row being an array of strings.
 */
public class SparqlResult {
  private List<String> variables;
  private List<String[]> rows;

  public SparqlResult() {
    this.variables = new ArrayList<String>();
    this.rows = new ArrayList<String[]>();
  }

  /**
   * Returns the variable names in the order they were added. The
   * returned list is the internal list, not a copy.
   */
  public List<String> getVariables() {
    return variables;
  }

  /**
   * Returns the rows in the order they were added. The returned list
   * is the internal list, not a copy.
   */
  public List<String[]> getRows() {
    return rows;
  }

  // public for test purposes
  public void addVariable(String variable) {
    variables.add(variable);
  }

  // public for test purposes
  public void addRow(String[] row) {
    rows.add(row);
  }
}
10 | */ 11 | public void init(Properties props); 12 | 13 | /** 14 | * Starts a background thread which calls the controller every 15 | * check_interval seconds. Returns immediately, leaving the 16 | * background thread running. 17 | */ 18 | public void spawnThread(DukeController controller, int check_interval); 19 | 20 | /** 21 | * Returns true iff the background thread is running. 22 | */ 23 | public boolean isRunning(); 24 | 25 | /** 26 | * Stops the background thread. It can be restarted with a new call 27 | * to spawnThread. 28 | */ 29 | public void stop(); 30 | 31 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/cleaners/StripNontextCharacters.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.cleaners; 3 | 4 | import no.priv.garshol.duke.Cleaner; 5 | 6 | /** 7 | * A cleaner which removes non-text characters. Specifically it strips 8 | * control characters (0-0x1F, 0x7F-0x9F) and special symbols in the 9 | * range 0xA1-0xBF. 
10 | */ 11 | public class StripNontextCharacters implements Cleaner { 12 | 13 | public String clean(String value) { 14 | char[] tmp = new char[value.length()]; 15 | int pos = 0; 16 | for (int ix = 0; ix < value.length(); ix++) { 17 | char ch = value.charAt(ix); 18 | if (ch < 0x20 || 19 | (ch >= 0x7F && ch < 0xA0) || 20 | (ch > 0xA0 && ch < 0xC0)) 21 | continue; // skip Euro symbol, soft hyphen, etc etc 22 | tmp[pos++] = ch; 23 | } 24 | return new String(tmp, 0, pos); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /duke-es/src/test/java/no/priv/garshol/duke/databases/es/ElasticSearchConfigLoaderTest.java: -------------------------------------------------------------------------------- 1 | package no.priv.garshol.duke.databases.es; 2 | 3 | import java.io.IOException; 4 | 5 | import no.priv.garshol.duke.ConfigLoader; 6 | import no.priv.garshol.duke.Configuration; 7 | import no.priv.garshol.duke.Database; 8 | import no.priv.garshol.duke.databases.es.ElasticSearchDatabase; 9 | 10 | import org.junit.Test; 11 | import org.xml.sax.SAXException; 12 | 13 | import static org.junit.Assert.assertEquals; 14 | 15 | public class ElasticSearchConfigLoaderTest { 16 | 17 | @Test 18 | public void testDatabase() throws IOException, SAXException { 19 | Configuration config = ConfigLoader 20 | .load("classpath:config-database.xml"); 21 | Database db = config.getDatabase(false); 22 | ElasticSearchDatabase es = (ElasticSearchDatabase) db; 23 | assertEquals("duke-es", es.getCluster()); 24 | } 25 | 26 | } 27 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/cleaners/ChainedCleaner.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.cleaners; 3 | 4 | import no.priv.garshol.duke.Cleaner; 5 | 6 | /** 7 | * Internal cleaner used to implement chaining of multiple cleaners. 
8 | * Basically, if you list multiple cleaners in the cleaner="" 9 | * attribute in the configuration file, it gets turned into a 10 | * ChainedCleaner that runs all the cleaners in sequence. 11 | */ 12 | public class ChainedCleaner implements Cleaner { 13 | private Cleaner[] cleaners; 14 | 15 | public ChainedCleaner(Cleaner[] cleaners) { 16 | this.cleaners = cleaners; 17 | } 18 | 19 | public String clean(String value) { 20 | for (int ix = 0; ix < cleaners.length; ix++) { 21 | if (value == null || value.equals("")) 22 | return null; 23 | 24 | value = cleaners[ix].clean(value); 25 | } 26 | return value; 27 | } 28 | } -------------------------------------------------------------------------------- /duke-core/src/test/java/no/priv/garshol/duke/comparators/DifferentComparatorTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.comparators; 3 | 4 | import org.junit.Before; 5 | import org.junit.Test; 6 | 7 | import static junit.framework.Assert.assertEquals; 8 | 9 | public class DifferentComparatorTest { 10 | private DifferentComparator comp; 11 | 12 | @Before 13 | public void setup() { 14 | this.comp = new DifferentComparator(); 15 | } 16 | 17 | @Test 18 | public void testEmpty() { 19 | assertEquals(0.0, comp.compare("", "")); 20 | } 21 | 22 | @Test 23 | public void testEmpty1() { 24 | assertEquals(1.0, comp.compare("", "1")); 25 | } 26 | 27 | @Test 28 | public void testEmpty2() { 29 | assertEquals(1.0, comp.compare("1", "")); 30 | } 31 | 32 | @Test 33 | public void testSame() { 34 | assertEquals(0.0, comp.compare("same", "same")); // but different 35 | } 36 | } -------------------------------------------------------------------------------- /duke-core/src/test/java/no/priv/garshol/duke/cleaners/DigitsOnlyCleanerTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.cleaners; 3 | 4 | import org.junit.Before; 5 | import 
org.junit.Test; 6 | 7 | import static junit.framework.Assert.assertEquals; 8 | import static junit.framework.Assert.assertTrue; 9 | 10 | public class DigitsOnlyCleanerTest { 11 | private DigitsOnlyCleaner cleaner; 12 | 13 | @Before 14 | public void setup() { 15 | cleaner = new DigitsOnlyCleaner(); 16 | } 17 | 18 | @Test 19 | public void testEmpty() { 20 | assertTrue(cleaner.clean("") == null); 21 | } 22 | 23 | @Test 24 | public void testOnlyDigits() { 25 | test("314", "314"); 26 | } 27 | 28 | @Test 29 | public void testDigitsAndSpaces() { 30 | test(" 3 1 4 ", "314"); 31 | } 32 | 33 | private void test(String value, String result) { 34 | assertEquals(result, cleaner.clean(value)); 35 | } 36 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/DummyLogger.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke; 3 | 4 | public class DummyLogger implements Logger { 5 | 6 | public void trace(String msg) { 7 | } 8 | 9 | public void debug(String msg) { 10 | } 11 | 12 | public void info(String msg) { 13 | } 14 | 15 | public void warn(String msg) { 16 | } 17 | 18 | public void warn(String msg, Throwable e) { 19 | } 20 | 21 | public void error(String msg) { 22 | } 23 | 24 | public void error(String msg, Throwable e) { 25 | } 26 | 27 | public boolean isTraceEnabled() { 28 | return false; 29 | } 30 | 31 | public boolean isDebugEnabled() { 32 | return false; 33 | } 34 | 35 | public boolean isInfoEnabled() { 36 | return false; 37 | } 38 | 39 | public boolean isWarnEnabled() { 40 | return false; 41 | } 42 | 43 | public boolean isErrorEnabled() { 44 | return false; 45 | } 46 | } -------------------------------------------------------------------------------- /duke-mapdb/src/test/java/no/priv/garshol/duke/databases/MapDBBlockingDatabaseTest.java: -------------------------------------------------------------------------------- 1 | 2 | 
package no.priv.garshol.duke.databases; 3 | 4 | import java.util.ArrayList; 5 | import java.util.Collection; 6 | 7 | import no.priv.garshol.duke.Configuration; 8 | import no.priv.garshol.duke.Database; 9 | import no.priv.garshol.duke.Record; 10 | 11 | public class MapDBBlockingDatabaseTest extends DatabaseTest { 12 | 13 | public Database createDatabase(Configuration config) { 14 | MapDBBlockingDatabase db = new MapDBBlockingDatabase(); 15 | db.setConfiguration(config); 16 | 17 | Collection functions = new ArrayList(); 18 | functions.add(new TestKeyFunction()); 19 | db.setKeyFunctions(functions); 20 | return db; 21 | } 22 | 23 | private static class TestKeyFunction implements KeyFunction { 24 | public String makeKey(Record record) { 25 | return record.getValue("NAME"); 26 | } 27 | } 28 | } -------------------------------------------------------------------------------- /duke-core/src/test/java/no/priv/garshol/duke/databases/InMemoryBlockingDatabaseTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.databases; 3 | 4 | import java.util.ArrayList; 5 | import java.util.Collection; 6 | 7 | import no.priv.garshol.duke.Configuration; 8 | import no.priv.garshol.duke.Database; 9 | import no.priv.garshol.duke.Record; 10 | 11 | public class InMemoryBlockingDatabaseTest extends DatabaseTest { 12 | 13 | public Database createDatabase(Configuration config) { 14 | InMemoryBlockingDatabase db = new InMemoryBlockingDatabase(); 15 | db.setConfiguration(config); 16 | 17 | Collection functions = new ArrayList(); 18 | functions.add(new TestKeyFunction()); 19 | db.setKeyFunctions(functions); 20 | return db; 21 | } 22 | 23 | private static class TestKeyFunction implements KeyFunction { 24 | public String makeKey(Record record) { 25 | return record.getValue("NAME"); 26 | } 27 | } 28 | } -------------------------------------------------------------------------------- 
/duke-core/src/test/java/no/priv/garshol/duke/comparators/JaccardIndexComparatorTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.comparators; 3 | 4 | import org.junit.Before; 5 | import org.junit.Test; 6 | 7 | import static junit.framework.Assert.assertEquals; 8 | 9 | public class JaccardIndexComparatorTest { 10 | private JaccardIndexComparator comp; 11 | 12 | @Before 13 | public void setup() { 14 | comp = new JaccardIndexComparator(); 15 | } 16 | 17 | @Test 18 | public void testEmpty() { 19 | assertEquals(1.0, comp.compare("", "")); 20 | } 21 | 22 | @Test 23 | public void testOneIsEmpty() { 24 | assertEquals(0.0, comp.compare("", "abc")); 25 | } 26 | 27 | @Test 28 | public void testOneIsDifferent() { 29 | assertEquals((1.0 / 3.0), comp.compare("abc def", "cba def")); 30 | } 31 | 32 | @Test 33 | public void testSameSets() { 34 | assertEquals(1.0, comp.compare("abc def", "def abc")); 35 | } 36 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/Logger.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke; 3 | 4 | /** 5 | * Experimental attempt at internal log handling which works 6 | * naturally on the command-line, doesn't introduce dependencies, and 7 | * at the same time allows integration with a full logging system. 8 | * This may go away again if I change my mind. 
/**
 * Experimental internal logging abstraction. It is meant to work
 * naturally on the command-line without introducing dependencies,
 * while still allowing integration with a full logging system. It may
 * be removed again.
 */
public interface Logger {

  // ----- trace

  /** Logs a message at trace level. */
  void trace(String msg);

  /** Returns true if trace-level messages are logged. */
  boolean isTraceEnabled();

  // ----- debug

  /** Logs a message at debug level. */
  void debug(String msg);

  /** Returns true if debug-level messages are logged. */
  boolean isDebugEnabled();

  // ----- info

  /** Logs a message at info level. */
  void info(String msg);

  /** Returns true if info-level messages are logged. */
  boolean isInfoEnabled();

  // ----- warn

  /** Logs a message at warn level. */
  void warn(String msg);

  /** Logs a message at warn level together with its cause. */
  void warn(String msg, Throwable e);

  /** Returns true if warn-level messages are logged. */
  boolean isWarnEnabled();

  // ----- error

  /** Logs a message at error level. */
  void error(String msg);

  /** Logs a message at error level together with its cause. */
  void error(String msg, Throwable e);

  /** Returns true if error-level messages are logged. */
  boolean isErrorEnabled();

}
leaving it in since it *may* 12 | // be reactivated, after more evaluation 13 | 14 | // 15 | // double upper = original + (float_drift_range / 2.0); 16 | // if (original + (float_drift_range / 2.0) > max) 17 | // upper = max; 18 | // else if (original - (float_drift_range / 2.0) < min) 19 | // upper = float_drift_range + min; 20 | 21 | // double delta = float_drift_range * Math.random(); 22 | // return upper - delta; 23 | // 24 | 25 | return Math.random() * (max - min) + min; 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /duke-core/src/test/java/no/priv/garshol/duke/cleaners/FamilyCommaGivenCleanerTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.cleaners; 3 | 4 | import org.junit.Before; 5 | import org.junit.Test; 6 | 7 | import static junit.framework.Assert.assertEquals; 8 | 9 | public class FamilyCommaGivenCleanerTest { 10 | private FamilyCommaGivenCleaner cleaner; 11 | 12 | @Before 13 | public void setup() { 14 | cleaner = new FamilyCommaGivenCleaner(); 15 | } 16 | 17 | @Test 18 | public void testEmpty() { 19 | test("", ""); 20 | } 21 | 22 | @Test 23 | public void testHenrikIbsen() { 24 | test("henrik ibsen", "Henrik Ibsen"); 25 | } 26 | 27 | @Test 28 | public void testIbsenHenrik() { 29 | test("henrik ibsen", "Ibsen, Henrik"); 30 | } 31 | 32 | @Test 33 | public void testJRAckerley() { 34 | test("j. r. 
/**
 * Iterator base class for Record collections which adds a few methods
 * for resource management. Subclasses supply hasNext() and next();
 * everything defined here has a default implementation.
 */
public abstract class RecordIterator
  implements Iterator, Closeable {

  /**
   * Removal is not supported.
   *
   * @throws UnsupportedOperationException always
   */
  public void remove() {
    throw new UnsupportedOperationException();
  }

  /**
   * Tells the iterator that the most recent batch of records taken
   * from it has been processed. Some iterators can free resources at
   * this point, but none is required to react. Does nothing by
   * default.
   */
  public void batchProcessed() {
  }

  /**
   * Releases resources held by this iterator and cleans up temporary
   * storage. Does nothing by default.
   */
  public void close() {
  }

}
/**
 * Helper class used by AbstractRuleBasedCleaner. A transform replaces
 * the text matched by one capturing group of a regular expression
 * with a fixed replacement string.
 */
public class Transform {
  private final Pattern regex;
  private final String replacement;
  private final int groupno;

  /**
   * Creates a transform which replaces capturing group 1.
   */
  public Transform(String regex, String replacement) {
    this(regex, replacement, 1);
  }

  /**
   * Creates a transform which replaces the given capturing group.
   *
   * @param regex the regular expression to search for
   * @param replacement the literal text inserted in place of the group
   * @param groupno the number of the group to replace (0 = whole match)
   */
  public Transform(String regex, String replacement, int groupno) {
    this.regex = Pattern.compile(regex);
    this.replacement = replacement;
    this.groupno = groupno;
  }

  /**
   * Replaces the group in the first match of the regex within value.
   * The value is returned unchanged if the regex does not match, or if
   * it matches but the group took no part in the match.
   *
   * @throws IndexOutOfBoundsException if the regex has no group with
   *         the configured number
   */
  public String transform(String value) {
    Matcher m = regex.matcher(value);
    if (!m.find())
      return value;

    int start = m.start(groupno);
    if (start == -1)
      return value; // group didn't participate (e.g. other alternative matched)

    return value.substring(0, start) +
      replacement +
      value.substring(m.end(groupno));
  }
}
10 | */ 11 | SAME(1), 12 | 13 | /** 14 | * Means we think it possible that the two identities refer to the 15 | * same real-world object. 16 | */ 17 | MAYBESAME(2), 18 | 19 | /** 20 | * Means we assume the two identities refer to different real-world objects. 21 | */ 22 | DIFFERENT(3); 23 | 24 | private int id; 25 | private LinkKind(int id) { 26 | this.id = id; 27 | } 28 | 29 | public int getId() { 30 | return id; 31 | } 32 | 33 | public static LinkKind getbyid(int id) { 34 | if (id == 1) 35 | return SAME; 36 | else if (id == 2) 37 | return MAYBESAME; 38 | else if (id == 3) 39 | return DIFFERENT; 40 | throw new DukeException("No kind with id " + id); 41 | } 42 | } -------------------------------------------------------------------------------- /duke-core/src/test/java/no/priv/garshol/duke/test/RecordImplTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.test; 3 | 4 | import java.util.Collection; 5 | 6 | import no.priv.garshol.duke.Record; 7 | import no.priv.garshol.duke.utils.TestUtils; 8 | import org.junit.Test; 9 | 10 | import static junit.framework.Assert.assertEquals; 11 | import static junit.framework.Assert.assertTrue; 12 | 13 | public class RecordImplTest { 14 | 15 | @Test 16 | public void testNormal() { 17 | Record r = TestUtils.makeRecord("ID", "abc", "NAME", "b"); 18 | 19 | assertEquals("abc", r.getValue("ID")); 20 | Collection values = r.getValues("ID"); 21 | assertEquals(1, values.size()); 22 | assertEquals("abc", values.iterator().next()); 23 | 24 | assertEquals("b", r.getValue("NAME")); 25 | values = r.getValues("NAME"); 26 | assertEquals(1, values.size()); 27 | assertEquals("b", values.iterator().next()); 28 | 29 | assertEquals(null, r.getValue("EMAIL")); 30 | assertTrue(r.getValues("EMAIL").isEmpty()); 31 | } 32 | 33 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/utils/YesNoConsole.java: 
-------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.utils; 3 | 4 | import java.io.IOException; 5 | import java.io.InputStreamReader; 6 | import java.io.BufferedReader; 7 | 8 | import no.priv.garshol.duke.DukeException; 9 | 10 | public class YesNoConsole { 11 | private BufferedReader console; 12 | 13 | public YesNoConsole() { 14 | this.console = new BufferedReader(new InputStreamReader(System.in)); 15 | } 16 | 17 | public boolean yesorno() { 18 | System.out.print("Correct? (Y/N) "); 19 | try { 20 | String line = console.readLine(); 21 | if (line == null) 22 | throw new DukeException("End of file on console"); 23 | line = line.trim(); 24 | 25 | if (line.equalsIgnoreCase("Y")) 26 | return true; 27 | else if (line.equalsIgnoreCase("N")) 28 | return false; 29 | else 30 | return yesorno(); 31 | } catch (IOException e) { 32 | throw new DukeException("Couldn't read input line", e); 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /duke-core/src/test/java/no/priv/garshol/duke/utils/LinkDatabaseUtilsTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.utils; 3 | 4 | import java.io.IOException; 5 | import java.io.InputStream; 6 | import java.io.InputStreamReader; 7 | 8 | import no.priv.garshol.duke.DukeException; 9 | import no.priv.garshol.duke.LinkDatabase; 10 | import org.junit.Test; 11 | 12 | import static org.junit.Assert.fail; 13 | 14 | public class LinkDatabaseUtilsTest { 15 | private LinkDatabase db; 16 | 17 | @Test 18 | public void testOldStyle() throws IOException { 19 | // tries to load a pre-1.2 format test file 20 | try { 21 | load("old-format.txt"); 22 | fail("accepted old-style test file"); 23 | } catch (DukeException e) { 24 | // this is expected 25 | } 26 | } 27 | 28 | private void load(String filename) throws IOException { 29 | ClassLoader cloader = 
Thread.currentThread().getContextClassLoader(); 30 | InputStream istream = cloader.getResourceAsStream(filename); 31 | db = LinkDatabaseUtils.loadTestFile(new InputStreamReader(istream)); 32 | } 33 | } -------------------------------------------------------------------------------- /changes.txt: -------------------------------------------------------------------------------- 1 | CHANGES SINCE 1.2 2 | ===================== 3 | 4 | Threading in the genetic algorithm is now much more efficient 5 | Set up continuous integration with Travis 6 | Added integration tests 7 | Can now run without Lucene on classpath 8 | Added HTMLCleaner 9 | Added StripNontextCharacters 10 | Genetic algorithm: 11 | mutation and recombination rates now evolve by themselves 12 | user can set both rates 13 | improved choice of questions asked under active learning 14 | Support for boosting in LuceneDatabase 15 | implemented by Fabrizio Fortino 16 | JSON data source 17 | implemented by https://github.com/dmnpignaud 18 | jar file now runnable 19 | Added --no-comparators option to genetic algorithm 20 | Added --original=N option to genetic algorithm 21 | Added ConfigLoader.loadFromString 22 | Added --incomplete-data option to genetic algorithm 23 | Added support for incremental record linkage (plus 1.3 methods) 24 | Let genetic algorithm use custom comparators from config (ztsmith) 25 | Split-on property not included in genetic output (ztsmith) 26 | MongoDB data source (antonimmo) 27 | -------------------------------------------------------------------------------- /doc/example-data/db-nationality.txt: -------------------------------------------------------------------------------- 1 | American,http://dbpedia.org/resource/United_States 2 | British,http://dbpedia.org/resource/United_Kingdom 3 | Australian,http://dbpedia.org/resource/Australia 4 | Indian,http://dbpedia.org/resource/India 5 | Norwegian,http://dbpedia.org/resource/Norway 6 | 
http://dbpedia.org/resource/Norwegians,http://dbpedia.org/resource/Norway 7 | norwegian,http://dbpedia.org/resource/Norway 8 | CAN,http://dbpedia.org/resource/Canada 9 | http://dbpedia.org/resource/British_people,http://dbpedia.org/resource/United_Kingdom 10 | USA,http://dbpedia.org/resource/United_States 11 | Japanese,http://dbpedia.org/resource/Japan 12 | United States,http://dbpedia.org/resource/United_States 13 | French,http://dbpedia.org/resource/France 14 | English,http://dbpedia.org/resource/United_Kingdom 15 | German,http://dbpedia.org/resource/Germany 16 | Canadian,http://dbpedia.org/resource/Canada 17 | http://dbpedia.org/resource/England,http://dbpedia.org/resource/United_Kingdom 18 | Italian,http://dbpedia.org/resource/Italy 19 | Polish,http://dbpedia.org/resource/Polish 20 | Canada,http://dbpedia.org/resource/Canada 21 | -------------------------------------------------------------------------------- /duke-core/src/test/java/no/priv/garshol/duke/CompactRecordTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke; 3 | 4 | import org.junit.After; 5 | import org.junit.Before; 6 | import org.junit.Test; 7 | 8 | import static junit.framework.Assert.assertEquals; 9 | import static junit.framework.Assert.assertTrue; 10 | 11 | public class CompactRecordTest { 12 | 13 | @Test 14 | public void testEmpty() { 15 | CompactRecord r = new CompactRecord(); 16 | r.toString(); 17 | 18 | assertTrue(r.isEmpty()); 19 | assertTrue(r.getProperties().isEmpty()); 20 | assertTrue(r.getValues("foo").isEmpty()); 21 | assertTrue(r.getValue("foo") == null); 22 | } 23 | 24 | @Test 25 | public void testSingle() { 26 | CompactRecord r = new CompactRecord(); 27 | r.addValue("foo", "bar"); 28 | r.toString(); 29 | 30 | assertTrue(!r.isEmpty()); 31 | assertTrue(r.getProperties().size() == 1); 32 | assertTrue(r.getProperties().iterator().next().equals("foo")); 33 | 
assertTrue(r.getValues("foo").iterator().next().equals("bar")); 34 | assertTrue(r.getValue("foo").equals("bar")); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/genetic/LinkFileOracle.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.genetic; 3 | 4 | import java.io.IOException; 5 | 6 | import no.priv.garshol.duke.Link; 7 | import no.priv.garshol.duke.LinkKind; 8 | import no.priv.garshol.duke.LinkDatabase; 9 | import no.priv.garshol.duke.InMemoryLinkDatabase; 10 | import no.priv.garshol.duke.utils.LinkDatabaseUtils; 11 | 12 | /** 13 | * This oracle looks up the answer in a link file. 14 | */ 15 | public class LinkFileOracle implements Oracle { 16 | private InMemoryLinkDatabase linkdb; 17 | 18 | public LinkFileOracle(String testfile) throws IOException { 19 | this.linkdb = new InMemoryLinkDatabase(); 20 | linkdb.setDoInference(true); 21 | LinkDatabaseUtils.loadTestFile(testfile, linkdb); 22 | } 23 | 24 | public LinkDatabase getLinkDatabase() { 25 | return linkdb; 26 | } 27 | 28 | public LinkKind getLinkKind(String id1, String id2) { 29 | Link link = linkdb.inferLink(id1, id2); 30 | if (link == null) 31 | return LinkKind.DIFFERENT; // we assume missing links are incorrect 32 | return link.getKind(); 33 | } 34 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/comparators/Matcher.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.comparators; 3 | 4 | /** 5 | * Wrapping this around the input string to simplify the code. 
/**
 * Cursor wrapped around an input string to simplify code that scans
 * the string one character at a time. The cursor starts before the
 * first character; {@link #next()} and {@link #skip()} move it
 * forward.
 */
public class Matcher {
  private final String str;
  // index of the most recently consumed character; -1 before the first
  private int ix;

  /**
   * Creates a matcher positioned before the first character.
   */
  public Matcher(String str) {
    this.str = str;
    this.ix = -1;
  }

  /**
   * Returns true iff there is a next character and it equals ch.
   */
  public boolean isNext(char ch) {
    return ix + 1 < str.length() && str.charAt(ix + 1) == ch;
  }

  /**
   * Returns true iff the current character is the first one in the
   * string (that is, exactly one character has been consumed).
   */
  public boolean atStart() {
    return ix == 0;
  }

  /**
   * Returns true iff at least one more character can be consumed.
   */
  public boolean hasNext() {
    return ix + 1 < str.length();
  }

  /**
   * Returns true iff the next character is the last in the string.
   */
  public boolean nextIsLast() {
    return ix + 2 == str.length();
  }

  /**
   * Returns true iff the current character is the last in the string.
   */
  public boolean isLast() {
    return ix + 1 == str.length();
  }

  /**
   * Advances the cursor and returns the character it now points at.
   *
   * @throws StringIndexOutOfBoundsException if there is no next character
   */
  public char next() {
    return str.charAt(++ix);
  }

  /**
   * Advances the cursor by one character without reading it.
   */
  public void skip() {
    ix++;
  }

  /**
   * Returns true iff the character before the current one occurs in
   * chars. Returns false when there is no previous character, which
   * includes the case where nothing has been consumed yet.
   */
  public boolean previousOneOf(String chars) {
    // ix == 0: current is the first character; ix == -1: nothing
    // consumed yet. Neither has a predecessor to look at.
    if (ix <= 0)
      return false;
    return chars.indexOf(str.charAt(ix - 1)) != -1;
  }
}
&& 28 | ix+1 < value.length() && 29 | value.charAt(ix + 1) != ' ') 30 | tmp[pos++] = ' '; 31 | } 32 | 33 | return sub.clean(new String(tmp, 0, pos)); 34 | } 35 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/genetic/HighProbabilityAspect.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.genetic; 3 | 4 | import no.priv.garshol.duke.Property; 5 | import no.priv.garshol.duke.Configuration; 6 | 7 | /** 8 | * Sets the high probability. 9 | */ 10 | public class HighProbabilityAspect extends FloatAspect { 11 | private Property prop; 12 | 13 | public HighProbabilityAspect(Property prop) { 14 | this.prop = prop; 15 | } 16 | 17 | public void setRandomly(GeneticConfiguration cfg) { 18 | Configuration config = cfg.getConfiguration(); 19 | Property p = config.getPropertyByName(prop.getName()); 20 | double new_value = drift(config.getThreshold(), 1.0, 0.5); 21 | p.setHighProbability(new_value); 22 | } 23 | 24 | public void setFromOther(GeneticConfiguration cfg1, 25 | GeneticConfiguration cfg2) { 26 | Configuration config = cfg1.getConfiguration(); 27 | Configuration other = cfg2.getConfiguration(); 28 | 29 | Property p1 = config.getPropertyByName(prop.getName()); 30 | Property p2 = other.getPropertyByName(prop.getName()); 31 | p1.setHighProbability(p2.getHighProbability()); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/genetic/LowProbabilityAspect.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.genetic; 3 | 4 | import no.priv.garshol.duke.Property; 5 | import no.priv.garshol.duke.Configuration; 6 | 7 | /** 8 | * Sets the low probability. 
9 | */ 10 | public class LowProbabilityAspect extends FloatAspect { 11 | private Property prop; 12 | 13 | public LowProbabilityAspect(Property prop) { 14 | this.prop = prop; 15 | } 16 | 17 | public void setRandomly(GeneticConfiguration cfg) { 18 | Configuration config = cfg.getConfiguration(); 19 | Property p = config.getPropertyByName(prop.getName()); 20 | double new_value = drift(config.getThreshold(), 0.5, 0.0); 21 | p.setLowProbability(new_value); 22 | } 23 | 24 | public void setFromOther(GeneticConfiguration cfg1, 25 | GeneticConfiguration cfg2) { 26 | Configuration config = cfg1.getConfiguration(); 27 | Configuration other = cfg2.getConfiguration(); 28 | 29 | Property p1 = config.getPropertyByName(prop.getName()); 30 | Property p2 = other.getPropertyByName(prop.getName()); 31 | p1.setLowProbability(p2.getLowProbability()); 32 | } 33 | 34 | } 35 | -------------------------------------------------------------------------------- /duke-core/src/test/java/no/priv/garshol/duke/comparators/DiceCoefficientComparatorTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.comparators; 3 | 4 | import org.junit.Before; 5 | import org.junit.Test; 6 | 7 | import static junit.framework.Assert.assertEquals; 8 | 9 | public class DiceCoefficientComparatorTest { 10 | private DiceCoefficientComparator comp; 11 | 12 | @Before 13 | public void setup() { 14 | comp = new DiceCoefficientComparator(); 15 | } 16 | 17 | @Test 18 | public void testEmpty() { 19 | assertEquals(1.0, comp.compare("", "")); 20 | } 21 | 22 | @Test 23 | public void testOneIsEmpty() { 24 | assertEquals(0.0, comp.compare("", "abc")); 25 | } 26 | 27 | @Test 28 | public void testOneIsDifferent() { 29 | assertEquals(0.5, comp.compare("abc def", "cba def")); 30 | } 31 | 32 | @Test 33 | public void testReordering() { 34 | assertEquals(1.0, comp.compare("def abc", "abc def")); 35 | } 36 | 37 | @Test 38 | public void testLengthDifference() { 39 | 
assertEquals(0.8, comp.compare("def abc ghe", "abc def")); 40 | } 41 | 42 | @Test 43 | public void testLengthDifference2() { 44 | assertEquals(0.8, comp.compare("def abc", "abc def ghe")); 45 | } 46 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/JDBCLinkDatabase.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke; 3 | 4 | import java.util.Properties; 5 | 6 | import no.priv.garshol.duke.utils.JDBCUtils; 7 | 8 | /** 9 | * A link database which can maintain a set of links in an H2 or 10 | * Oracle database over JDBC. It could be extended to work with more 11 | * database implementations. 12 | */ 13 | public class JDBCLinkDatabase extends RDBMSLinkDatabase { 14 | private String driverklass; 15 | private String jdbcuri; 16 | private Properties props; 17 | 18 | public JDBCLinkDatabase(String driverklass, 19 | String jdbcuri, 20 | String dbtype, 21 | Properties props) { 22 | super(dbtype); 23 | this.driverklass = driverklass; 24 | this.jdbcuri = jdbcuri; 25 | this.props = props; 26 | this.stmt = JDBCUtils.open(driverklass, jdbcuri, props); 27 | } 28 | 29 | public void validateConnection() { 30 | if (stmt != null && !JDBCUtils.validate(stmt)) 31 | // it failed to validate, and was closed by the validate method. 32 | // we therefore reopen so that we have a proper connection. 
33 | stmt = JDBCUtils.open(driverklass, jdbcuri, props); 34 | } 35 | 36 | } -------------------------------------------------------------------------------- /duke-core/src/test/java/no/priv/garshol/duke/utils/PropertyUtilsTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.utils; 3 | 4 | import java.util.Properties; 5 | 6 | import no.priv.garshol.duke.DukeConfigException; 7 | import org.junit.Before; 8 | import org.junit.Test; 9 | 10 | import static junit.framework.Assert.assertEquals; 11 | import static junit.framework.Assert.fail; 12 | 13 | public class PropertyUtilsTest { 14 | private Properties props; 15 | 16 | @Before 17 | public void setup() { 18 | props = new Properties(); 19 | props.setProperty("foo", "bar"); 20 | props.setProperty("baz", "2"); 21 | } 22 | 23 | @Test 24 | public void testGet1() { 25 | assertEquals(PropertyUtils.get(props, "foo"), "bar"); 26 | 27 | try { 28 | PropertyUtils.get(props, "bar"); 29 | fail("exception not thrown"); 30 | } catch (DukeConfigException e) { 31 | } 32 | } 33 | 34 | @Test 35 | public void testGet2() { 36 | assertEquals(PropertyUtils.get(props, "foo", "huhu"), "bar"); 37 | assertEquals(PropertyUtils.get(props, "quux", "huhu"), "huhu"); 38 | } 39 | 40 | @Test 41 | public void testGet3() { 42 | assertEquals(PropertyUtils.get(props, "baz", 0), 2); 43 | assertEquals(PropertyUtils.get(props, "quux", 27), 27); 44 | } 45 | } -------------------------------------------------------------------------------- /duke-json/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | no.priv.garshol.duke 6 | duke 7 | 1.4-SNAPSHOT 8 | ../ 9 | 10 | duke-json 11 | jar 12 | 13 | 14 | 15 | 16 | no.priv.garshol.duke 17 | duke-core 18 | 19 | 20 | no.priv.garshol.duke 21 | duke-core 22 | test-jar 23 | test 24 | 25 | 26 | 27 | 28 | com.fasterxml.jackson.core 29 | jackson-core 30 | 2.3.2 31 | 32 | 33 | 34 | 35 | 36 | 
-------------------------------------------------------------------------------- /duke-mapdb/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | no.priv.garshol.duke 6 | duke 7 | 1.4-SNAPSHOT 8 | ../ 9 | 10 | duke-mapdb 11 | jar 12 | 13 | 14 | 15 | 16 | no.priv.garshol.duke 17 | duke-core 18 | 19 | 20 | no.priv.garshol.duke 21 | duke-core 22 | test-jar 23 | test 24 | 25 | 26 | 27 | 28 | org.mapdb 29 | mapdb 30 | 0.9.13 31 | 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /duke-mongodb/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | no.priv.garshol.duke 6 | duke 7 | 1.4-SNAPSHOT 8 | ../ 9 | 10 | duke-mongodb 11 | jar 12 | 13 | 14 | 15 | 16 | no.priv.garshol.duke 17 | duke-core 18 | 19 | 20 | no.priv.garshol.duke 21 | duke-core 22 | test-jar 23 | test 24 | 25 | 26 | 27 | 28 | org.mongodb 29 | mongo-java-driver 30 | 3.12.14 31 | 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/Record.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke; 3 | 4 | import java.util.Collection; 5 | 6 | /** 7 | * Represents a record, which may be a single source record from a 8 | * data source, or a record created from merging data from many 9 | * records. 10 | */ 11 | public interface Record { 12 | 13 | /** 14 | * The names of the properties this record has. May be a subset of 15 | * the properties defined in the configuration if not all properties 16 | * have values. 17 | */ 18 | public Collection getProperties(); 19 | 20 | /** 21 | * All values for the named property. May be empty. May not contain 22 | * null or empty strings. Never returns null. 
23 | */ 24 | public Collection getValues(String prop); 25 | 26 | /** 27 | * Returns a value for the named property. May be null. May not be 28 | * the empty string. If the property has more than one value there is 29 | * no way to predict which value is returned. 30 | */ 31 | public String getValue(String prop); 32 | 33 | /** 34 | * Merges the other record into this one. None of the 35 | * implementations support this method yet, but it's going to be 36 | * used when we implement issue 4. 37 | */ 38 | public void merge(Record other); 39 | 40 | } 41 | -------------------------------------------------------------------------------- /duke-core/src/test/java/no/priv/garshol/duke/cleaners/NorwegianCompanyNameCleanerTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.cleaners; 3 | 4 | import org.junit.Before; 5 | import org.junit.Test; 6 | 7 | import static junit.framework.Assert.assertEquals; 8 | 9 | public class NorwegianCompanyNameCleanerTest { 10 | private NorwegianCompanyNameCleaner cleaner; 11 | 12 | @Before 13 | public void setup() { 14 | cleaner = new NorwegianCompanyNameCleaner(); 15 | } 16 | 17 | @Test 18 | public void testEmpty() { 19 | test("", ""); 20 | } 21 | 22 | @Test 23 | public void testAslashsAs() { 24 | test("sundby maskin as", "sundby maskin a/s"); 25 | } 26 | 27 | @Test 28 | public void testAbackslashAs() { 29 | test("sundby maskin as", "sundby maskin a\\s"); 30 | } 31 | 32 | @Test 33 | public void testAslashL() { 34 | test("al follestadgata sameie", "a/l follestadgata sameie"); 35 | } 36 | 37 | @Test 38 | public void testMoveALToEnd() { 39 | test("a/l follestadgata sameie", "follestadgata sameie al"); 40 | } 41 | 42 | @Test 43 | public void testMoveASToEnd() { 44 | test("a/s sundby maskin", "sundby maskin as"); 45 | } 46 | 47 | private void test(String s1, String s2) { 48 | assertEquals(cleaner.clean(s1), cleaner.clean(s2)); 49 | } 50 | 51 | } 
-------------------------------------------------------------------------------- /duke-mapdb/src/test/java/no/priv/garshol/duke/databases/PersistentMapDBBlockingDatabaseTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.databases; 3 | 4 | import java.io.IOException; 5 | import java.util.ArrayList; 6 | import java.util.Collection; 7 | 8 | import no.priv.garshol.duke.Configuration; 9 | import no.priv.garshol.duke.Database; 10 | import no.priv.garshol.duke.Record; 11 | 12 | public class PersistentMapDBBlockingDatabaseTest extends PersistentDatabaseTest { 13 | private String dbfile; 14 | 15 | public Database createDatabase(Configuration config) throws IOException { 16 | if (dbfile == null) 17 | dbfile = tmpdir.newFile().getAbsolutePath(); // ensure same every time 18 | 19 | MapDBBlockingDatabase db = new MapDBBlockingDatabase(); 20 | db.setConfiguration(config); 21 | db.setOverwrite(false); 22 | db.setFile(dbfile); 23 | db.setAsync(false); // slows down tests too much 24 | db.setWindowSize(0); // otherwise we'll find way too many candidates 25 | 26 | Collection functions = new ArrayList(); 27 | functions.add(new TestKeyFunction()); 28 | db.setKeyFunctions(functions); 29 | return db; 30 | } 31 | 32 | private static class TestKeyFunction implements KeyFunction { 33 | public String makeKey(Record record) { 34 | return record.getValue("NAME"); 35 | } 36 | } 37 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/matchers/ClassDatabaseMatchListener.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.matchers; 3 | 4 | import no.priv.garshol.duke.Configuration; 5 | import no.priv.garshol.duke.EquivalenceClassDatabase; 6 | import no.priv.garshol.duke.Property; 7 | import no.priv.garshol.duke.Record; 8 | 9 | /** 10 | * Writes recorded matches to an 
EquivalenceClassDatabase. 11 | */ 12 | public class ClassDatabaseMatchListener extends AbstractMatchListener { 13 | private Configuration config; 14 | protected EquivalenceClassDatabase classdb; 15 | 16 | public ClassDatabaseMatchListener(Configuration config, 17 | EquivalenceClassDatabase classdb) { 18 | this.config = config; 19 | this.classdb = classdb; 20 | } 21 | 22 | public void matches(Record r1, Record r2, double confidence) { 23 | String id1 = getIdentity(r1); 24 | String id2 = getIdentity(r2); 25 | classdb.addLink(id1, id2); 26 | } 27 | 28 | public void batchDone() { 29 | classdb.commit(); 30 | } 31 | 32 | private String getIdentity(Record r) { 33 | for (Property p : config.getIdentityProperties()) 34 | for (String v : r.getValues(p.getName())) 35 | return v; 36 | throw new RuntimeException("No identity found in record [" + 37 | PrintMatchListener.toString(r) + "]"); 38 | } 39 | 40 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/genetic/ComparatorAspect.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.genetic; 3 | 4 | import java.util.List; 5 | import java.util.ArrayList; 6 | 7 | import no.priv.garshol.duke.Property; 8 | import no.priv.garshol.duke.Comparator; 9 | import no.priv.garshol.duke.Configuration; 10 | import no.priv.garshol.duke.utils.ObjectUtils; 11 | 12 | /** 13 | * Sets the comparator. 
14 | */ 15 | public class ComparatorAspect extends Aspect { 16 | private Property prop; 17 | private List comparators; 18 | 19 | public ComparatorAspect(Property prop, List comparators) { 20 | this.prop = prop; 21 | this.comparators = comparators; 22 | } 23 | 24 | public void setRandomly(GeneticConfiguration cfg) { 25 | Configuration config = cfg.getConfiguration(); 26 | Property p = config.getPropertyByName(prop.getName()); 27 | p.setComparator(comparators.get((int) (comparators.size() * Math.random()))); 28 | } 29 | 30 | public void setFromOther(GeneticConfiguration cfg1, 31 | GeneticConfiguration cfg2) { 32 | Configuration config = cfg1.getConfiguration(); 33 | Configuration other = cfg2.getConfiguration(); 34 | 35 | Property p1 = config.getPropertyByName(prop.getName()); 36 | Property p2 = other.getPropertyByName(prop.getName()); 37 | p1.setComparator(p2.getComparator()); 38 | } 39 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/genetic/ConsoleOracle.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.genetic; 3 | 4 | import java.io.Writer; 5 | import java.io.FileWriter; 6 | import java.io.IOException; 7 | 8 | import no.priv.garshol.duke.LinkKind; 9 | import no.priv.garshol.duke.DukeException; 10 | import no.priv.garshol.duke.utils.YesNoConsole; 11 | import no.priv.garshol.duke.utils.LinkFileWriter; 12 | 13 | /** 14 | * This oracle asks the user via the console. 
15 | */ 16 | public class ConsoleOracle implements Oracle { 17 | private YesNoConsole console; 18 | private LinkFileWriter writer; 19 | private Writer out; 20 | 21 | public ConsoleOracle() { 22 | this.console = new YesNoConsole(); 23 | } 24 | 25 | public LinkKind getLinkKind(String id1, String id2) { 26 | boolean match = console.yesorno(); 27 | if (writer != null) 28 | try { 29 | writer.write(id1, id2, match, 1.0); 30 | out.flush(); // make sure everything's saved 31 | } catch (IOException e) { 32 | throw new DukeException(e); 33 | } 34 | return match ? LinkKind.SAME : LinkKind.DIFFERENT; 35 | } 36 | 37 | public void setLinkFile(String linkfile) throws IOException { 38 | out = new FileWriter(linkfile, true); 39 | writer = new LinkFileWriter(out); 40 | // FIXME: strictly speaking, this leaks file handles. in practice it 41 | // probably won't matter 42 | } 43 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/databases/AbstractKeyFunction.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.databases; 3 | 4 | import no.priv.garshol.duke.utils.StringUtils; 5 | 6 | /** 7 | * Helper class for writing key functions. 
8 | * @since 1.2 9 | */ 10 | public abstract class AbstractKeyFunction implements KeyFunction { 11 | 12 | public String firstLongerThan(String value, int min) { 13 | if (value == null) 14 | return "null"; 15 | 16 | String[] tokens = StringUtils.split(value); 17 | for (int ix = 0; ix < tokens.length; ix++) 18 | if (tokens[ix].length() > min) 19 | return tokens[ix]; 20 | return tokens[0]; 21 | } 22 | 23 | public String lastLongerThan(String value, int min) { 24 | if (value == null) 25 | return "null"; 26 | 27 | String[] tokens = StringUtils.split(value); 28 | for (int ix = tokens.length - 1; ix >= 0; ix--) 29 | if (tokens[ix].length() > min) 30 | return tokens[ix]; 31 | return tokens[0]; 32 | } 33 | 34 | public String allDigits(String value) { 35 | if (value == null) 36 | return "null"; 37 | 38 | char[] tmp = new char[value.length()]; 39 | int free = 0; 40 | for (int ix = 0; ix < value.length(); ix++) { 41 | char ch = value.charAt(ix); 42 | if (ch >= '0' && ch <= '9') 43 | tmp[free++] = ch; 44 | } 45 | return new String(tmp, 0, free); 46 | } 47 | 48 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/cleaners/AbstractRuleBasedCleaner.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.cleaners; 3 | 4 | import java.util.List; 5 | import java.util.ArrayList; 6 | import java.util.regex.Pattern; 7 | import java.util.regex.Matcher; 8 | 9 | import no.priv.garshol.duke.Cleaner; 10 | 11 | /** 12 | * Helper class for building regular-expression based cleaners. 13 | */ 14 | public abstract class AbstractRuleBasedCleaner implements Cleaner { 15 | private List transforms; 16 | 17 | /** 18 | * Initializes an empty cleaner. 
19 | */ 20 | public AbstractRuleBasedCleaner() { 21 | this.transforms = new ArrayList(); 22 | } 23 | 24 | public String clean(String value) { 25 | // perform pre-registered transforms 26 | for (Transform t : transforms) 27 | value = t.transform(value); 28 | 29 | return value; 30 | } 31 | 32 | /** 33 | * Adds a rule replacing all substrings matching the regular 34 | * expression with the replacement string. 35 | */ 36 | public void add(String regex, String replacement) { 37 | add(regex, replacement, 1); 38 | } 39 | 40 | /** 41 | * Adds a rule replacing all substrings matching the specified group 42 | * within the regular expression with the replacement string. 43 | */ 44 | public void add(String regex, String replacement, int groupno) { 45 | transforms.add(new Transform(regex, replacement, groupno)); 46 | } 47 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/utils/TestFileUtils.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.utils; 3 | 4 | import java.util.Map; 5 | import java.util.HashMap; 6 | import java.io.FileReader; 7 | import java.io.IOException; 8 | import java.io.BufferedReader; 9 | 10 | import no.priv.garshol.duke.Link; 11 | import no.priv.garshol.duke.LinkKind; 12 | import no.priv.garshol.duke.LinkStatus; 13 | 14 | /** 15 | * A utility class for loading link files. Deprecated: Please 16 | * don't use. Use the LinkDatabase concept instead. This class will 17 | * be removed in a future version. 
18 | * @deprecated 19 | */ 20 | public class TestFileUtils { 21 | 22 | public static Map load(String testfile) throws IOException { 23 | Map links = new HashMap(); 24 | BufferedReader reader = new BufferedReader(new FileReader(testfile)); 25 | String line = reader.readLine(); 26 | while (line != null) { 27 | int pos = line.indexOf(','); 28 | 29 | String id1 = line.substring(1, pos); 30 | String id2 = line.substring(pos + 1, line.length()); 31 | 32 | links.put(id1 + "," + id2, 33 | new Link(id1, id2, LinkStatus.ASSERTED, 34 | line.charAt(0) == '+' ? 35 | LinkKind.SAME : LinkKind.DIFFERENT, 0.0)); 36 | 37 | line = reader.readLine(); 38 | } 39 | reader.close(); 40 | 41 | return links; 42 | } 43 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/Database.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke; 3 | 4 | import java.util.Collection; 5 | 6 | /** 7 | * Used to store and index records for later matching. 8 | */ 9 | public interface Database { 10 | 11 | /** 12 | * Returns true iff the database is held entirely in memory, and 13 | * thus is not persistent. 14 | */ 15 | public boolean isInMemory(); 16 | 17 | /** 18 | * Add the record to the index. 19 | */ 20 | public void index(Record record); 21 | 22 | /** 23 | * Flushes all changes to disk. For in-memory databases this is a 24 | * no-op. 25 | */ 26 | public void commit(); 27 | 28 | /** 29 | * Look up record by identity. 30 | */ 31 | public Record findRecordById(String id); 32 | 33 | /** 34 | * Look up potentially matching records. This method must be 35 | * thread-safe. 36 | */ 37 | public Collection findCandidateMatches(Record record); 38 | 39 | /** 40 | * Stores state to disk and closes all open resources. 41 | */ 42 | public void close(); 43 | 44 | /** 45 | * Gives the database its configuration (called by Duke framework). 
46 | * @since 1.2 47 | */ 48 | public void setConfiguration(Configuration config); 49 | 50 | /** 51 | * Sets whether or not to overwrite any existing index (called by 52 | * Duke framework). 53 | * @since 1.2 54 | */ 55 | public void setOverwrite(boolean overwrite); 56 | } 57 | -------------------------------------------------------------------------------- /duke-core/src/test/java/no/priv/garshol/duke/comparators/WeightedLevenshteinTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.comparators; 3 | 4 | import org.junit.Before; 5 | import org.junit.Test; 6 | 7 | import static junit.framework.Assert.assertEquals; 8 | 9 | public class WeightedLevenshteinTest { 10 | private WeightedLevenshtein.DefaultWeightEstimator e; 11 | 12 | @Before 13 | public void setup() { 14 | e = new WeightedLevenshtein.DefaultWeightEstimator(); 15 | } 16 | 17 | @Test 18 | public void testEmpty() { 19 | assertEquals(0.0, WeightedLevenshtein.distance("", "", e)); 20 | } 21 | 22 | @Test 23 | public void testEmpty1() { 24 | e.setDigitWeight(1.0); 25 | assertEquals(1.0, WeightedLevenshtein.distance("", "1", e)); 26 | } 27 | 28 | @Test 29 | public void testEmpty2() { 30 | e.setDigitWeight(2.0); 31 | assertEquals(2.0, WeightedLevenshtein.distance("1", "", e)); 32 | } 33 | 34 | @Test 35 | public void testSubstitute1() { 36 | e.setDigitWeight(2.0); 37 | assertEquals(2.0, WeightedLevenshtein.distance("titanic 1", "titanic 2", e)); 38 | } 39 | 40 | @Test 41 | public void testSubstitute2() { 42 | e.setDigitWeight(2.0); 43 | assertEquals(3.0, WeightedLevenshtein.distance("totanic 1", "titanic 2", e)); 44 | } 45 | 46 | @Test 47 | public void testComparator() { 48 | WeightedLevenshtein comp = new WeightedLevenshtein(); 49 | assertEquals(0.0, comp.compare("1", "")); 50 | } 51 | } -------------------------------------------------------------------------------- 
/duke-core/src/main/java/no/priv/garshol/duke/cleaners/MappingFileCleaner.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.cleaners; 3 | 4 | import java.util.Map; 5 | import java.util.HashMap; 6 | import java.io.FileReader; 7 | import java.io.IOException; 8 | 9 | import no.priv.garshol.duke.Cleaner; 10 | import no.priv.garshol.duke.DukeException; 11 | import no.priv.garshol.duke.utils.CSVReader; 12 | 13 | // FIXME: we may also want an option to allow unmapped values to be 14 | // returned as is (or even via the sub-cleaner) 15 | 16 | /** 17 | * A cleaner which loads a mapping file in CSV format and maps values 18 | * according to that file. 19 | * @since 0.5 20 | */
public class MappingFileCleaner implements Cleaner {
  // key: value as found in the data (CSV column 0);
  // value: what it is replaced with (CSV column 1)
  private Map<String, String> mapping;

  /**
   * Looks the value up in the loaded mapping.
   *
   * @param value the value to clean
   * @return the mapped value, or the value itself, unchanged, if the
   *         mapping has no entry for it
   */
  public String clean(String value) {
    String newvalue = mapping.get(value);
    if (newvalue == null)
      return value;
    return newvalue;
  }

  /**
   * Loads the mapping from a CSV file whose first column is the value
   * to look for and whose second column is its replacement. The
   * current mapping is only replaced once the whole file has been
   * read successfully.
   *
   * @param filename path of the CSV mapping file
   * @throws DukeException if the file cannot be read, or if a row has
   *         fewer than two columns
   */
  public void setMappingFile(String filename) {
    Map<String, String> loaded = new HashMap<String, String>();

    // FIXME: character encoding?
    try {
      FileReader in = new FileReader(filename);
      try {
        CSVReader csv = new CSVReader(in);

        for (String[] row = csv.next(); row != null; row = csv.next()) {
          if (row.length < 2)
            throw new DukeException("Mapping file " + filename +
                                    " contains a row with fewer than" +
                                    " two columns");
          loaded.put(row[0], row[1]);
        }

        csv.close();
      } finally {
        // also reached when parsing fails half-way; closing a reader
        // that is already closed has no effect
        in.close();
      }
    } catch (IOException e) {
      throw new DukeException("Error loading mapping file " + filename, e);
    }

    mapping = loaded;
  }
}
-------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/cleaners/NorwegianCompanyNameCleaner.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.cleaners; 3 | 4 | import no.priv.garshol.duke.utils.StringUtils; 5 | 6 | public class NorwegianCompanyNameCleaner extends AbstractRuleBasedCleaner { 7 | private LowerCaseNormalizeCleaner sub; 8 | 9 | public NorwegianCompanyNameCleaner() { 10 | super(); 11 | this.sub = new LowerCaseNormalizeCleaner(); 12 | 13 | add("\\s(a/s)(\\s|$)", "as"); 14 | add("\\s(a\\\\s)(\\s|$)", "as"); 15 | add("^(a/s)\\s", "as"); 16 | add("^(a\\\\s)\\s", "as"); 17 | add("\\s(a/l)(\\s|$)", "al"); 18 | add("^(a/l)\\s", "al"); 19 | } 20 | 21 | public String clean(String value) { 22 | // get rid of commas 23 | value = StringUtils.replaceAnyOf(value, ",().-_", ' '); 24 | 25 | // do basic cleaning 26 | value = sub.clean(value); 27 | if (value == null || value.equals("")) 28 | return ""; 29 | 30 | // perform pre-registered transforms 31 | value = super.clean(value); 32 | 33 | // renormalize whitespace, since being able to replace tokens with spaces 34 | // makes writing transforms easier 35 | value = StringUtils.normalizeWS(value); 36 | 37 | // transforms: 38 | // "as foo bar" -> "foo bar as" 39 | // "al foo bar" -> "foo bar al" 40 | if (value.startsWith("as ") || value.startsWith("al ")) 41 | value = value.substring(3) + ' ' + value.substring(0, 2); 42 | 43 | return value; 44 | } 45 | } --------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/DataSource.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke; 3 | 4 | /** 5 | * Any class which implements this interface can be used as a data 6 | * source, so you can plug in your own data sources. Configuration 7 | * properties are received as bean setter calls via reflection. 8 | */ 9 | public interface DataSource { 10 | 11 | /** 12 | * Return an iterator over all the records in this data source. This 13 | * should preferably not load all records into memory, but instead 14 | * produce them lazily. 15 | */ 16 | public RecordIterator getRecords(); 17 | 18 | /** 19 | * Gives the data source a logger to report diagnostic information 20 | * to. Ignoring the logger is allowed.

21 | * 22 | *

WARN: This method is experimental. I'm far from certain 23 | * that this is how I want this to work. May go for slf4j logging 24 | * instead, or something similar. 25 | */ 26 | public void setLogger(Logger logger); 27 | 28 | /** 29 | * Each {@link no.priv.garshol.duke.DataSource} is responsible of writing 30 | * its XML configuration using provided {@link no.priv.garshol.duke.ConfigWriter} 31 | * instance. 32 | *

Each implementation should start with a specific tag (unique identifier of 33 | * DataSource implementation inside Duke) and close it before returning. 34 | *

35 | * 36 | * @param cw Handler which keep reference to an XML printer. 37 | */ 38 | void writeConfig(ConfigWriter cw); 39 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/utils/PropertyUtils.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.utils; 3 | 4 | import java.util.Properties; 5 | import no.priv.garshol.duke.DukeConfigException; 6 | 7 | /** 8 | * Utilities for making Java Properties objects easier to deal with. 9 | */ 10 | public class PropertyUtils { 11 | 12 | /** 13 | * Used for getting required properties, will throw an exception if 14 | * the property is not specified. 15 | */ 16 | public static String get(Properties props, String name) { 17 | String value = props.getProperty(name); 18 | if (value == null) 19 | throw new DukeConfigException("Required property " + name + 20 | " not specified"); 21 | return value; 22 | } 23 | 24 | /** 25 | * Returns the value of an optional property, if the property is 26 | * set. If it is not set defval is returned. 27 | */ 28 | public static String get(Properties props, String name, String defval) { 29 | String value = props.getProperty(name); 30 | if (value == null) 31 | value = defval; 32 | return value; 33 | } 34 | 35 | /** 36 | * Returns the value of an optional property, if the property is 37 | * set. If it is not set defval is returned. 
38 | */ 39 | public static int get(Properties props, String name, int defval) { 40 | String value = props.getProperty(name); 41 | if (value == null) 42 | return defval; 43 | return Integer.parseInt(value); 44 | } 45 | 46 | } -------------------------------------------------------------------------------- /doc/example-data/namebase.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 0.8 4 | /Users/larsga/tmp/duke-sdshare/tomcat-lucene-index 5 | 6 | 7 | ID 8 | 9 | 10 | 11 | NameField1 12 | no.priv.garshol.duke.JaroWinkler 13 | 0.4 14 | 0.7 15 | 16 | 17 | 18 | NameField2 19 | no.priv.garshol.duke.JaroWinkler 20 | 0.4 21 | 0.6 22 | 23 | 24 | 25 | NameField3 26 | no.priv.garshol.duke.JaroWinkler 27 | 0.4 28 | 0.55 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 41 | 44 | 47 | 48 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/comparators/NumericComparator.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.comparators; 3 | 4 | import no.priv.garshol.duke.Comparator; 5 | 6 | /** 7 | * Comparator which compares two values numerically. The similarity is 8 | * the ratio of the smaller number to the greater number, if both 9 | * numbers are either negative or positive. If one is negative and the 10 | * other positive, the similarity is 0.0. 11 | */ 12 | public class NumericComparator implements Comparator { 13 | private double minratio; 14 | 15 | public boolean isTokenized() { 16 | return false; 17 | } 18 | 19 | public void setMinRatio(double minratio) { 20 | this.minratio = minratio; 21 | } 22 | 23 | public double compare(String v1, String v2) { 24 | double d1; 25 | double d2; 26 | try { 27 | d1 = Double.parseDouble(v1); 28 | d2 = Double.parseDouble(v2); 29 | } catch (NumberFormatException e) { 30 | return 0.5; // we just ignore this. 
whether it's wise I'm not sure 31 | } 32 | 33 | // if they're both zero, they're equal 34 | if (d1 == 0.0 && d2 == 0.0) 35 | return 1.0; 36 | 37 | // if both are negative, flip the signs 38 | if (d1 < 0.0 && d2 < 0.0) { 39 | d1 *= -1.0; 40 | d2 *= -1.0; 41 | } 42 | 43 | if (d2 < d1) { 44 | double tmp = d2; 45 | d2 = d1; 46 | d1 = tmp; 47 | } 48 | 49 | double ratio = d1 / d2; 50 | if (ratio < minratio) 51 | return 0.0; 52 | else 53 | return ratio; 54 | } 55 | 56 | } 57 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/cleaners/NorwegianAddressCleaner.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.cleaners; 3 | 4 | public class NorwegianAddressCleaner extends AbstractRuleBasedCleaner { 5 | private LowerCaseNormalizeCleaner sub; 6 | 7 | public NorwegianAddressCleaner() { 8 | super(); 9 | this.sub = new LowerCaseNormalizeCleaner(); 10 | 11 | add("^(co/ ?)", "c/o "); 12 | add("^(c\\\\o)", "c/o"); 13 | add("[A-Za-z]+(g\\.) [0-9]+", "gata"); 14 | add("[A-Za-z]+ (gt?\\.?) [0-9]+", "gate"); 15 | add("[A-Za-z]+(v\\.) [0-9]+", "veien"); 16 | add("[A-Za-z]+ (v\\.?) [0-9]+", "vei"); 17 | add("[A-Za-z]+(vn\\.?)[0-9]+", "veien "); 18 | add("[A-Za-z]+(vn\\.?) [0-9]+", "veien"); 19 | add("[A-Za-z]+(gt\\.?) [0-9]+", "gata"); 20 | add("[A-Za-z]+(gaten) [0-9]+", "gata"); 21 | add("(\\s|^)(pb\\.?) 
[0-9]+", "postboks", 2); 22 | add("(\\s|^)(boks) [0-9]+", "postboks", 2); 23 | add("[A-Za-z]+ [0-9]+(\\s+)[A-Za-z](\\s|$)", ""); 24 | add("[A-Za-z]+(gata|veien)()[0-9]+[a-z]?(\\s|$)", " "); 25 | 26 | // FIXME: not sure about the following rules 27 | add("postboks\\s+[0-9]+(\\s*-\\s*)", " "); 28 | } 29 | 30 | public String clean(String value) { 31 | // get rid of commas 32 | value = value.replace(',', ' '); 33 | 34 | // do basic cleaning 35 | value = sub.clean(value); 36 | if (value == null || value.equals("")) 37 | return value; 38 | 39 | // perform pre-registered transforms 40 | value = super.clean(value); 41 | 42 | return value; 43 | } 44 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/cleaners/HTMLCleaner.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.cleaners; 3 | 4 | import java.util.Map; 5 | import java.util.HashMap; 6 | import no.priv.garshol.duke.Cleaner; 7 | 8 | /** 9 | * A cleaner that removes HTML-style entity references, such as 10 | * &#222; and &mdash;. 
11 | * @since 1.3 12 | */ 13 | public class HTMLCleaner implements Cleaner { 14 | private static Map entities; 15 | 16 | static { 17 | entities = new HashMap(); 18 | entities.put("mdash", "\u2014"); 19 | } 20 | 21 | public String clean(String value) { 22 | StringBuilder buf = new StringBuilder(value.length()); 23 | for (int ix = 0; ix < value.length(); ix++) { 24 | char ch = value.charAt(ix); 25 | if (ch != '&') { 26 | buf.append(ch); 27 | continue; 28 | } 29 | 30 | ch = value.charAt(++ix); 31 | if (ch == '#') { 32 | ix++; 33 | if (value.charAt(ix) == 'x') 34 | throw new UnsupportedOperationException("Don't support &#x...;"); 35 | int pos = ix; 36 | for (; ix < value.length() && value.charAt(ix) != ';'; ix++) 37 | ; 38 | ch = (char) Integer.parseInt(value.substring(pos, ix)); 39 | buf.append(ch); 40 | } else { 41 | int pos = ix; 42 | for (; ix < value.length() && value.charAt(ix) != ';'; ix++) 43 | ; 44 | String v = entities.get(value.substring(pos, ix)); 45 | buf.append(v); 46 | } 47 | } 48 | return buf.toString(); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/comparators/DiceCoefficientComparator.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.comparators; 3 | 4 | import no.priv.garshol.duke.Comparator; 5 | import no.priv.garshol.duke.utils.StringUtils; 6 | 7 | /** 8 | * An implementation of the Dice coefficient using exact matching by 9 | * default, but can be overridden to use any sub-comparator. 
10 | */ 11 | public class DiceCoefficientComparator implements Comparator { 12 | private Comparator subcomp; 13 | 14 | public DiceCoefficientComparator() { 15 | this.subcomp = new ExactComparator(); 16 | } 17 | 18 | public void setComparator(Comparator comp) { 19 | this.subcomp = comp; 20 | } 21 | 22 | public boolean isTokenized() { 23 | return true; 24 | } 25 | 26 | public double compare(String s1, String s2) { 27 | if (s1.equals(s2)) 28 | return 1.0; 29 | 30 | // tokenize 31 | String[] t1 = StringUtils.split(s1); 32 | String[] t2 = StringUtils.split(s2); 33 | 34 | // ensure that t1 is shorter than or same length as t2 35 | if (t1.length > t2.length) { 36 | String[] tmp = t2; 37 | t2 = t1; 38 | t1 = tmp; 39 | } 40 | 41 | // find best matches for each token in t1 42 | double sum = 0; 43 | for (int ix1 = 0; ix1 < t1.length; ix1++) { 44 | double highest = 0; 45 | for (int ix2 = 0; ix2 < t2.length; ix2++) 46 | highest = Math.max(highest, subcomp.compare(t1[ix1], t2[ix2])); 47 | sum += highest; 48 | } 49 | 50 | return (sum * 2) / (t1.length + t2.length); 51 | } 52 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/LinkDatabase.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke; 3 | 4 | import java.util.Collection; 5 | 6 | /** 7 | * A LinkDatabase is a class which can keep track of links between 8 | * entities. 9 | */ 10 | public interface LinkDatabase { 11 | 12 | /** 13 | * Returns all links modified since the given time. 14 | */ 15 | public Collection getChangesSince(long since); 16 | 17 | /** 18 | * Get all links. 19 | */ 20 | public Collection getAllLinks(); 21 | 22 | /** 23 | * Get all links for this identity. If there are no links it returns 24 | * an empty collection, never null. 25 | */ 26 | public Collection getAllLinksFor(String id); 27 | 28 | /** 29 | * Assert a link. 
30 | */ 31 | public void assertLink(Link link); 32 | 33 | /** 34 | * Can we work out, based on what we know, the relationship between 35 | * these two? Returns null if we don't know the relationship. 36 | */ 37 | public Link inferLink(String id1, String id2); 38 | 39 | /** 40 | * Verifies that we still have a connection to the database, and 41 | * reestablishes it, if not. Useful when connections live a long 42 | * time and are rarely used. 43 | */ 44 | public void validateConnection(); 45 | 46 | /** 47 | * Commit asserted links to persistent store. 48 | */ 49 | public void commit(); 50 | 51 | /** 52 | * Removes all links from the database. 53 | */ 54 | public void clear(); 55 | 56 | /** 57 | * Shuts down the database, releasing resources. 58 | */ 59 | public void close(); 60 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/EquivalenceClassDatabase.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke; 3 | 4 | import java.util.Iterator; 5 | import java.util.Collection; 6 | 7 | // FIXME: if we are going to implement retraction we will need 8 | // something like a linkdatabase as backing. probably equiv dbs need 9 | // to be aware of the linkdatabase anyway, in order to avoid known bad 10 | // links and make use of extra known links not inferred from data etc. 11 | 12 | /** 13 | * A tool for collecting matching records into groups where all 14 | * records are considered to match. Note that this means treating the 15 | * matching relation between records as transitive, which in practice 16 | * it is not. 17 | */ 18 | public interface EquivalenceClassDatabase { 19 | 20 | /** 21 | * Returns the number of equivalence classes in the database. 22 | */ 23 | public int getClassCount(); 24 | 25 | /** 26 | * Returns an iterator over all the classes in the database. 
27 | */ 28 | public Iterator> getClasses(); 29 | 30 | /** 31 | * Get all records linked to the given record (that is, all records 32 | * in the same equivalence class as the given record). 33 | * @param id the ID of a record 34 | * @return Always returns a collection, but it may be empty. 35 | */ 36 | public Collection getClass(String id); 37 | 38 | /** 39 | * Add a new link between two records. 40 | */ 41 | public void addLink(String id1, String id2); 42 | 43 | /** 44 | * Commit changes made to persistent store. 45 | */ 46 | public void commit(); 47 | 48 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/matchers/MatchListener.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.matchers; 3 | 4 | import java.util.Collection; 5 | 6 | import no.priv.garshol.duke.Record; 7 | 8 | /** 9 | * Interface implemented by code which can receive notifications that 10 | * two records are considered to match. 11 | * 12 | *

Note that when running Duke with multiple threads, the 13 | * matches(), matchesPerhaps(), and noMatchFor() methods need to be 14 | * thread-safe. 15 | */ 16 | public interface MatchListener { 17 | 18 | /** 19 | * Notification that Duke is about to process a new batch of records. 20 | */ 21 | public void batchReady(int size); 22 | 23 | /** 24 | * Notification that Duke has finished processing a batch of records. 25 | */ 26 | public void batchDone(); 27 | 28 | /** 29 | * Notification that the two records match. There will have been a 30 | * previous startRecord(r1) notification. 31 | */ 32 | public void matches(Record r1, Record r2, double confidence); 33 | 34 | /** 35 | * Notification that the two records might match. There will have 36 | * been a previous startRecord(r1) notification. 37 | */ 38 | public void matchesPerhaps(Record r1, Record r2, double confidence); 39 | 40 | /** 41 | * Called if no link is found for the record. 42 | */ 43 | public void noMatchFor(Record record); 44 | 45 | /** 46 | * Notification that the processing run is beginning. 47 | */ 48 | public void startProcessing(); 49 | 50 | /** 51 | * Notification that this processing run is over. 52 | */ 53 | public void endProcessing(); 54 | } -------------------------------------------------------------------------------- /doc/tutorials/2011_05_data-cleansing.textile: -------------------------------------------------------------------------------- 1 | h1. Duke'em - data cleansing in the Linked Data publishing process 2 | 3 | Authors: Michael Hausenblas and Lars Marius Garshol 4 | 5 | I'm going to show you how you can do data cleansing as part of the "Linked Data publishing process":http://linkeddatabook.com/editions/1.0/#htoc62, based on an open source tool called "Duke":http://code.google.com/p/duke/. 6 | 7 | h2. What is Duke? 8 | 9 | Duke is a fast and flexible deduplication engine, written in Java on top of Apache "Lucene":http://lucene.apache.org/. 
The current implementation allows a throughput of 1500 records/sec single-threaded, on a commodity machine. 10 | 11 | h2. STEP1: Prepare your data source 12 | 13 | For demonstration purposes we will use a CSV dump from "NameBase":http://www.namebase.org/csvdump.html containing some 140k records: 14 | 15 | bq. NameBase is a cumulative index of the names of individuals, corporations, and groups compiled from 800 investigative books published since 1962, and thousands of pages from periodicals since 1973. Areas covered include the international intelligence community, political elites from the Right and Left, the U.S. foreign policy establishment, assassinations and political scandals, Latin America, big business, and organized crime. 16 | 17 | Now, the structure of the NameBase data source is as follows: 18 |

19 | NameField1 | NameField2 | NameField3 | relative URL 
20 | ---------------------------------------------------
21 | 
22 | 
23 | 24 | 25 | h2. STEP2: Download and install Duke 26 | 27 | h2. STEP3: Configure Duke 28 | 29 | h2. STEP4: Run and tune Duke 30 | 31 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/utils/LinkFileWriter.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.utils; 3 | 4 | import java.util.Collection; 5 | import java.io.Writer; 6 | import java.io.IOException; 7 | 8 | import no.priv.garshol.duke.Record; 9 | import no.priv.garshol.duke.Property; 10 | import no.priv.garshol.duke.Configuration; 11 | import no.priv.garshol.duke.DukeException; 12 | 13 | /** 14 | * Utility class for writing link files. The format is 15 | * _,id,id,confidence, where the first character is either '+' or '-'. 16 | * @since 1.1 17 | */
public class LinkFileWriter {
  private final Writer out;
  // identity properties used to find the ID of a record; null when
  // the writer was created without a configuration
  private final Collection<Property> idprops;

  /**
   * Creates a writer which can only write links given as ID pairs.
   *
   * @param out where the link lines are written
   */
  public LinkFileWriter(Writer out) {
    this(out, null);
  }

  /**
   * Creates a writer which can also write links given as records.
   *
   * @param out where the link lines are written
   * @param config supplies the identity properties used to look up
   *        record IDs; may be null
   */
  public LinkFileWriter(Writer out, Configuration config) {
    this.out = out;
    this.idprops = (config == null) ? null : config.getIdentityProperties();
  }

  /**
   * Writes one link line for the two records, using the first
   * identity property for which each record has a value as its ID.
   *
   * @throws DukeException if the writer has no configuration, or if
   *         a record has no value for any identity property
   */
  public void write(Record r1, Record r2, boolean match, double confidence)
    throws IOException {
    write(getid(r1), getid(r2), match, confidence);
  }

  /**
   * Writes one link line: '+' for a match or '-' for a non-match,
   * then the two IDs and the confidence, comma-separated.
   */
  public void write(String id1, String id2, boolean match, double confidence)
    throws IOException {
    out.write("" + (match ? "+," : "-,") + id1 + ',' + id2 + ',' + confidence +
              "\n");
  }

  // returns the value of the first identity property the record has
  private String getid(Record r) {
    if (idprops == null)
      throw new DukeException("Cannot look up record identity: this " +
                              "LinkFileWriter was created without a " +
                              "Configuration");

    for (Property p : idprops) {
      String v = r.getValue(p.getName());
      if (v != null)
        return v;
    }

    throw new DukeException("No identity for record " + r);
  }
}
-------------------------------------------------------------------------------- /duke-core/src/test/java/no/priv/garshol/duke/datasources/InMemoryDataSourceTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.datasources; 3 | 4 | import java.util.ArrayList; 5 | import java.util.Collection; 6 | 7 | import no.priv.garshol.duke.Record; 8 | import no.priv.garshol.duke.RecordIterator; 9 | import no.priv.garshol.duke.utils.TestUtils; 10 | import org.junit.Test; 11 | 12 | import static junit.framework.Assert.assertEquals; 13 | import static junit.framework.Assert.assertFalse; 14 | import static junit.framework.Assert.assertTrue; 15 | 16 | public class InMemoryDataSourceTest { 17 | 18 | @Test 19 | public void testEmpty() { 20 | InMemoryDataSource src = new InMemoryDataSource(); 21 | RecordIterator it = src.getRecords(); 22 | assertFalse("empty data source contains records", 23 | it.hasNext()); 24 | } 25 | 26 | @Test 27 | public void testSimple() { 28 | Collection records = new ArrayList(); 29 | records.add(TestUtils.makeRecord("ID", "1")); 30 | records.add(TestUtils.makeRecord("ID", "2")); 31 | records.add(TestUtils.makeRecord("ID", "3")); 32 | 33 | InMemoryDataSource src = new InMemoryDataSource(records); 34 | RecordIterator it = src.getRecords(); 35 | 36 | assertTrue("record missing", it.hasNext()); 37 | assertEquals("wrong record", it.next().getValue("ID"), "1"); 38 | assertTrue("record missing", it.hasNext()); 39 | assertEquals("wrong record", it.next().getValue("ID"), "2"); 40 | assertTrue("record missing", it.hasNext()); 41 | assertEquals("wrong record", 
it.next().getValue("ID"), "3"); 42 | 43 | assertFalse("too many records", 44 | it.hasNext()); 45 | } 46 | } -------------------------------------------------------------------------------- /duke-es/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | no.priv.garshol.duke 6 | duke 7 | 1.4-SNAPSHOT 8 | ../ 9 | 10 | duke-es 11 | jar 12 | 13 | 14 | 15 | 16 | no.priv.garshol.duke 17 | duke-core 18 | 19 | 20 | no.priv.garshol.duke 21 | duke-core 22 | test-jar 23 | test 24 | 25 | 26 | 27 | org.elasticsearch 28 | elasticsearch 29 | 1.4.4 30 | 31 | 32 | 33 | 34 | 35 | 36 | fabriziofortino 37 | Fabrizio Fortino 38 | fabrizio.fortino@gmail.com 39 | http://fabriziofortino.github.io 40 | 41 | architect 42 | developer 43 | 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/datasources/JNDIDataSource.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.datasources; 3 | 4 | import java.sql.ResultSet; 5 | import java.sql.SQLException; 6 | import java.sql.Statement; 7 | 8 | import no.priv.garshol.duke.ConfigWriter; 9 | import no.priv.garshol.duke.RecordIterator; 10 | import no.priv.garshol.duke.utils.JDBCUtils; 11 | 12 | /** 13 | * Data source which retrieves a JDBC connection from JNDI. 
14 | * @since 0.4 15 | */
public class JNDIDataSource extends JDBCDataSource {
  // JNDI name under which the JDBC data source is registered
  private String jndipath;

  public JNDIDataSource() {
    super();
  }

  /**
   * Opens a statement via the configured JNDI path, runs the
   * configured query, and returns an iterator over the result. If
   * the query or the iterator setup fails, the statement and its
   * connection are released before the error is reported.
   *
   * @throws RuntimeException wrapping the SQLException if the query
   *         cannot be run
   */
  @Override
  public RecordIterator getRecords() {
    verifyProperty(jndipath, "jndi-path");

    Statement stmt = null;
    try {
      stmt = JDBCUtils.open(jndipath);
      ResultSet rs = stmt.executeQuery(this.getQuery());
      // iterator takes care of closing the connection
      return new JDBCIterator(rs);
    } catch (SQLException e) {
      // no iterator was handed out, so nobody else will clean up
      releaseQuietly(stmt);
      throw new RuntimeException(e);
    }
  }

  // best-effort close of the statement and the connection it belongs
  // to; failures are ignored so that the original error is the one
  // the caller sees
  private static void releaseQuietly(Statement stmt) {
    if (stmt == null)
      return;

    try {
      java.sql.Connection conn = stmt.getConnection();
      try {
        stmt.close();
      } finally {
        if (conn != null)
          conn.close();
      }
    } catch (SQLException ignored) {
      // nothing useful can be done about a failed close here
    }
  }

  protected String getSourceName() {
    return "JNDI";
  }

  public void setJndiPath(String path) {
    this.jndipath = path;
  }

  public String getJndiPath() {
    return jndipath;
  }

  /**
   * Writes this data source as a "jndi" element holding the
   * jndi-path and query parameters followed by the column
   * definitions.
   */
  @Override
  public void writeConfig(ConfigWriter cw) {
    final String name = "jndi";
    cw.writeStartElement(name, null);

    cw.writeParam("jndi-path", getJndiPath());
    cw.writeParam("query", getQuery());

    // Write columns
    writeColumnsConfig(cw);

    cw.writeEndElement(name);
  }

}
64 | -------------------------------------------------------------------------------- /duke-core/src/test/java/no/priv/garshol/duke/cleaners/HTMLCleanerTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.cleaners; 3 | 4 | import no.priv.garshol.duke.Cleaner; 5 | import org.junit.Before; 6 | import org.junit.Test; 7 | 8 | import static org.junit.Assert.assertEquals; 9 | 10 | public class HTMLCleanerTest { 11 | protected Cleaner cleaner; 12 | 13 | @Before 14 | public void setUp() { 15 | cleaner = new HTMLCleaner(); 16 | } 17 | 18 | @Test 19 | public void testEmpty() { 20 | assertEquals("", cleaner.clean("")); 21 | } 22 | 23 | @Test 24 | public void testSingleChar() { 25 | assertEquals("a", cleaner.clean("a")); 26 | } 27 | 28 | @Test 29 | public void testSingleEntity() { 30 | 
assertEquals("ralf hartmut g\u00FCting", 31 | cleaner.clean("ralf hartmut güting")); 32 | } 33 | 34 | @Test 35 | public void testFirst() { 36 | assertEquals("ABC", 37 | cleaner.clean("ABC")); 38 | } 39 | 40 | @Test 41 | public void testLast() { 42 | assertEquals("ABC", 43 | cleaner.clean("ABC")); 44 | } 45 | 46 | @Test 47 | public void testSingleNamedEntity() { 48 | assertEquals("the vldb journal \u2014 the international journal on very large data bases", 49 | cleaner.clean("the vldb journal — the international journal on very large data bases")); 50 | } 51 | 52 | @Test 53 | public void testThreeEntities() { 54 | assertEquals("ricardo jim\u00e9nez-peris, m. pati\u00f1o-mart\u00ednez, gustavo alonso, bettina kemme", 55 | cleaner.clean("ricardo jiménez-peris, m. patiño-martínez, gustavo alonso, bettina kemme")); 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /duke-server/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | no.priv.garshol.duke 6 | duke 7 | 1.4-SNAPSHOT 8 | ../ 9 | 10 | duke-server 11 | jar 12 | 13 | 14 | 15 | 16 | no.priv.garshol.duke 17 | duke-core 18 | 19 | 20 | no.priv.garshol.duke 21 | duke-core 22 | test-jar 23 | test 24 | 25 | 26 | 27 | 28 | 29 | javax.servlet 30 | servlet-api 31 | 2.4 32 | 33 | provided 34 | 35 | 36 | 37 | 38 | org.codehaus.fabric3.api 39 | commonj 40 | 1.1.1 41 | provided 42 | 43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/datasources/Column.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.datasources; 3 | 4 | import java.util.ArrayList; 5 | import java.util.Collection; 6 | import java.util.regex.Pattern; 7 | 8 | import no.priv.garshol.duke.Cleaner; 9 | 10 | public class Column { 11 | private String name; 12 | private String property; 13 | 
private String prefix; 14 | private Cleaner cleaner; 15 | private Pattern splitter; 16 | 17 | public Column(String name, String property, String prefix, Cleaner cleaner) { 18 | this.name = name; 19 | this.property = property; 20 | this.prefix = prefix; 21 | this.cleaner = cleaner; 22 | } 23 | 24 | public String getName() { 25 | return name; 26 | } 27 | 28 | public String getProperty() { 29 | if (property == null) 30 | return name; 31 | else 32 | return property; 33 | } 34 | 35 | public String getPrefix() { 36 | return prefix; 37 | } 38 | 39 | public Cleaner getCleaner() { 40 | return cleaner; 41 | } 42 | 43 | public void setSplitOn(String spliton) { 44 | this.splitter = Pattern.compile(spliton); 45 | } 46 | 47 | /** 48 | * Returns true iff this column needs to be split into multiple values. 49 | */ 50 | public boolean isSplit() { 51 | return splitter != null; 52 | } 53 | 54 | /** 55 | * Splits the given string into multiple values. 56 | */ 57 | public Collection split(String value) { 58 | String[] parts = splitter.split(value); 59 | Collection values = new ArrayList(parts.length); 60 | for (int ix = 0; ix < parts.length; ix++) 61 | values.add(parts[ix]); 62 | return values; 63 | } 64 | 65 | public String getSplitOn() { 66 | return splitter.toString(); 67 | } 68 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/comparators/JaccardIndexComparator.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.comparators; 3 | 4 | import no.priv.garshol.duke.Comparator; 5 | import no.priv.garshol.duke.utils.StringUtils; 6 | 7 | /** 8 | * An implementation of the Jaccard index using exact matching by 9 | * default, but can be overridden to use any sub-comparator. 
10 | */ 11 | public class JaccardIndexComparator implements Comparator { 12 | private Comparator subcomp; 13 | 14 | public JaccardIndexComparator() { 15 | this.subcomp = new ExactComparator(); 16 | } 17 | 18 | public void setComparator(Comparator comp) { 19 | this.subcomp = comp; 20 | } 21 | 22 | public boolean isTokenized() { 23 | return true; 24 | } 25 | 26 | public double compare(String s1, String s2) { 27 | if (s1.equals(s2)) 28 | return 1.0; 29 | 30 | // tokenize 31 | String[] t1 = StringUtils.split(s1); 32 | String[] t2 = StringUtils.split(s2); 33 | 34 | // FIXME: we assume t1 and t2 do not have internal duplicates 35 | 36 | // ensure that t1 is shorter than or same length as t2 37 | if (t1.length > t2.length) { 38 | String[] tmp = t2; 39 | t2 = t1; 40 | t1 = tmp; 41 | } 42 | 43 | // find best matches for each token in t1 44 | double intersection = 0; 45 | double union = t1.length + t2.length; 46 | for (int ix1 = 0; ix1 < t1.length; ix1++) { 47 | double highest = 0; 48 | for (int ix2 = 0; ix2 < t2.length; ix2++) 49 | highest = Math.max(highest, subcomp.compare(t1[ix1], t2[ix2])); 50 | 51 | // INV: the best match for t1[ix1] in t2 is has similarity highest 52 | intersection += highest; 53 | union -= highest; // we reduce the union by this similarity 54 | } 55 | 56 | return intersection / union; 57 | } 58 | } -------------------------------------------------------------------------------- /duke-core/src/test/java/no/priv/garshol/duke/cleaners/RegexpCleanerTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.cleaners; 3 | 4 | import no.priv.garshol.duke.cleaners.RegexpCleaner; 5 | import org.junit.Test; 6 | 7 | import static junit.framework.Assert.assertEquals; 8 | 9 | public class RegexpCleanerTest extends LowerCaseNormalizeCleanerTest { 10 | 11 | @Test 12 | public void testNoMatch() { 13 | test("^(\\d\\d\\d\\d)-", "gurble", null); 14 | } 15 | 16 | @Test 17 | public void testStartYear() 
{ 18 | test("^(\\d\\d\\d\\d)-", "1850-1888", "1850"); 19 | } 20 | 21 | @Test 22 | public void testEndYear() { 23 | test("-(\\d\\d\\d\\d)$", "1850-1888", "1888"); 24 | } 25 | 26 | @Test 27 | public void discardSecondGroup() { 28 | RegexpCleaner cl = new RegexpCleaner(); 29 | cl.setDiscardGroup(true); 30 | cl.setGroup(2); 31 | cl.setRegexp("([a-zA-Z])(\\d+)"); 32 | assertEquals("IDontLikeDigitsBut53inTheEndIsOk", cl.clean("ID42ontLikeDigitsBut53inTheEndIsOk")); 33 | } 34 | 35 | @Test 36 | public void discardAll() { 37 | RegexpCleaner cl = new RegexpCleaner(); 38 | cl.setDiscardGroup(false); //independent of discard flag 39 | cl.setDiscardAllGroup(true); 40 | cl.setRegexp("(\\d+)"); 41 | assertEquals("IDontLikeDigits $", cl.clean("I123Dont454Like450Di3gits 4234 0234$")); 42 | } 43 | 44 | @Test 45 | public void discardAllSecondGroup() { 46 | RegexpCleaner cl = new RegexpCleaner(); 47 | cl.setDiscardAllGroup(true); 48 | cl.setGroup(2); 49 | cl.setRegexp("([A-Z])(\\d+\\s?)"); 50 | assertEquals("This is DUKE",cl.clean("This is D1 U312 K1231 E4332")); 51 | } 52 | 53 | private void test(String regexp, String value, String result) { 54 | RegexpCleaner cl = new RegexpCleaner(); 55 | cl.setRegexp(regexp); 56 | assertEquals(result, cl.clean(value)); 57 | } 58 | } -------------------------------------------------------------------------------- /duke-core/src/test/java/no/priv/garshol/duke/cleaners/PhoneNumberCleanerTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.cleaners; 3 | 4 | import org.junit.Before; 5 | import org.junit.Test; 6 | 7 | import static junit.framework.Assert.assertEquals; 8 | import static junit.framework.Assert.assertTrue; 9 | 10 | public class PhoneNumberCleanerTest { 11 | private PhoneNumberCleaner cleaner; 12 | 13 | @Before 14 | public void setup() { 15 | cleaner = new PhoneNumberCleaner(); 16 | } 17 | 18 | @Test 19 | public void testEmpty() { 20 | assertTrue(cleaner.clean("") 
== null); 21 | } 22 | 23 | @Test 24 | public void testPathological() { 25 | test("123", "123"); 26 | } 27 | 28 | @Test 29 | public void testUKInitialZero() { 30 | test("+44 020 77921414", "+44 2077921414"); 31 | } 32 | 33 | @Test 34 | public void testNorwaySpace() { 35 | test("+47 23 155100", "+47 23155100"); 36 | } 37 | 38 | @Test 39 | public void testNorwayWithoutCode() { 40 | test("23 21 20 00", "23212000"); 41 | } 42 | 43 | @Test 44 | public void testZeroZeroSweden() { 45 | test("00 46 8 506 16100", "+46 850616100"); 46 | } 47 | 48 | @Test 49 | public void testZeroZeroGermany() { 50 | test("00 49 30 881 3001", "+49 308813001"); 51 | } 52 | 53 | @Test 54 | public void testUSNumber() { 55 | test("+ 1 212 554 6120", "+1 2125546120"); 56 | } 57 | 58 | @Test 59 | public void testSwedenInitialZero() { 60 | test("+46 (0)31 751 5300 ", "+46 317515300"); 61 | } 62 | 63 | @Test 64 | public void testFinland() { 65 | test("+358 40 7600231", "+358 407600231"); 66 | } 67 | 68 | @Test 69 | public void testParenthesis() { 70 | test("0047 (0)55551000", "+47 55551000"); 71 | } 72 | 73 | private void test(String value, String result) { 74 | assertEquals(result, cleaner.clean(value)); 75 | } 76 | } -------------------------------------------------------------------------------- /duke-core/src/test/java/no/priv/garshol/duke/comparators/NumericComparatorTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.comparators; 3 | 4 | import org.junit.Before; 5 | import org.junit.Test; 6 | 7 | import static junit.framework.Assert.assertEquals; 8 | 9 | public class NumericComparatorTest { 10 | private NumericComparator comp; 11 | 12 | @Before 13 | public void setUp() { 14 | comp = new NumericComparator(); 15 | } 16 | 17 | @Test 18 | public void testEqual() { 19 | assertEquals(1.0, comp.compare("42", "42")); 20 | } 21 | 22 | @Test 23 | public void testEqual2() { 24 | assertEquals(1.0, comp.compare("42.0", "42.0")); 25 
| } 26 | 27 | @Test 28 | public void testHalf() { 29 | assertEquals(0.5, comp.compare("21.0", "42.0")); 30 | } 31 | 32 | @Test 33 | public void testHalfInverted() { 34 | assertEquals(0.5, comp.compare("42.0", "21.0")); 35 | } 36 | 37 | @Test 38 | public void testHalfBelowMin() { 39 | comp.setMinRatio(0.75); 40 | assertEquals(0.0, comp.compare("21.0", "42.0")); 41 | } 42 | 43 | @Test 44 | public void testHalfAboveMin() { 45 | comp.setMinRatio(0.25); 46 | assertEquals(0.5, comp.compare("21.0", "42.0")); 47 | } 48 | 49 | @Test 50 | public void testZero() { 51 | assertEquals(1.0, comp.compare("0.0", "0.0")); 52 | } 53 | 54 | @Test 55 | public void testFirstIsZero() { 56 | assertEquals(0.0, comp.compare("0.0", "42.0")); 57 | } 58 | 59 | @Test 60 | public void testSecondIsZero() { 61 | assertEquals(0.0, comp.compare("42.0", "0.0")); 62 | } 63 | 64 | @Test 65 | public void testOneNegativeOnePositive() { 66 | assertEquals(0.0, comp.compare("-1", "2")); 67 | } 68 | 69 | @Test 70 | public void testNegativeNumbers() { 71 | assertEquals(0.5, comp.compare("-1", "-2")); 72 | assertEquals(0.5, comp.compare("-2", "-1")); 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /duke-core/src/test/java/no/priv/garshol/duke/comparators/SoundexComparatorTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.comparators; 3 | 4 | import org.junit.Before; 5 | import org.junit.Test; 6 | 7 | import static junit.framework.Assert.assertEquals; 8 | 9 | public class SoundexComparatorTest { 10 | private SoundexComparator comp; 11 | 12 | // ----- TEST CODE 13 | 14 | @Test 15 | public void testEmpty() { 16 | check("", ""); 17 | } 18 | 19 | @Test 20 | public void testSue() { 21 | check("S000", "Sue"); 22 | } 23 | 24 | @Test 25 | public void testSUE() { 26 | check("S000", "SUE"); 27 | } 28 | 29 | @Test 30 | public void testGarshol() { 31 | check("G624", "Garshol"); 32 | } 33 | 34 | 
@Test 35 | public void testGARSHOL() { 36 | check("G624", "GARSHOL"); 37 | } 38 | 39 | @Test 40 | public void testGarskol() { 41 | check("G624", "Garskol"); 42 | } 43 | 44 | @Test 45 | public void testGARSKOL() { 46 | check("G624", "GARSKOL"); 47 | } 48 | 49 | private void check(String key, String value) { 50 | assertEquals("wrong key for '" + value + "'", 51 | key, SoundexComparator.soundex(value)); 52 | } 53 | 54 | // ----- TEST COMPARISON 55 | 56 | @Before 57 | public void setup() { 58 | comp = new SoundexComparator(); 59 | } 60 | 61 | @Test 62 | public void testEqual() { 63 | assertEquals("wrong score for equal values", 1.0, 64 | comp.compare("LMG", "LMG")); 65 | } 66 | 67 | @Test 68 | public void testEqualCode() { 69 | assertEquals("wrong score for values with equal codes", 0.9, 70 | comp.compare("Garshol", "Garskol")); 71 | } 72 | 73 | @Test 74 | public void testDifferentCode() { 75 | assertEquals("wrong score for values with different codes", 0.0, 76 | comp.compare("Garshol", "Sue")); 77 | } 78 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/datasources/InMemoryDataSource.java: -------------------------------------------------------------------------------- 1 | package no.priv.garshol.duke.datasources; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Collection; 5 | 6 | import no.priv.garshol.duke.ConfigWriter; 7 | import no.priv.garshol.duke.DataSource; 8 | import no.priv.garshol.duke.Logger; 9 | import no.priv.garshol.duke.Record; 10 | import no.priv.garshol.duke.RecordIterator; 11 | import no.priv.garshol.duke.utils.DefaultRecordIterator; 12 | 13 | /** 14 | * Data source which can be passed Record objects, and which then 15 | * returns them. 16 | * @since 0.4 17 | */ 18 | public class InMemoryDataSource implements DataSource { 19 | /** 20 | * The records held by the data source. 21 | */ 22 | protected Collection records; 23 | 24 | /** 25 | * Creates an empty source. 
/**
 * A growable array of record IDs. Once more than MAX_BUCKET_SIZE IDs
 * have been added the bucket discards its contents for good:
 * {@link #records} becomes null while {@link #nextfree} keeps the size
 * reached, so the bucket still orders and scores as a very large one.
 */
public class Bucket implements Comparable<Bucket> {
  // the index of the next free cell in the array (== size())
  public int nextfree;
  // if the bucket has gone over size, this is 'null'
  public long[] records;
  // true iff new records have been added to the bucket since last sorting
  private boolean dirty;
  // if buckets go over this size, discard contents
  private static final int MAX_BUCKET_SIZE = 1000000;

  public Bucket() {
    this.records = new long[10];
  }

  /**
   * Appends a record ID, doubling the backing array when it is full.
   * Adding to a full bucket of MAX_BUCKET_SIZE IDs drops all contents;
   * every later call is a no-op.
   */
  public void add(long id) {
    if (records == null)
      return; // bucket went over size, now discarding all records

    if (nextfree >= records.length) {
      if (nextfree >= MAX_BUCKET_SIZE) {
        // this bucket is now oversized
        records = null;
        dirty = false;
        return;
      }

      records = Arrays.copyOf(records,
                              Math.min(records.length * 2, MAX_BUCKET_SIZE));
    }
    records[nextfree++] = id;
    dirty = true;
  }

  /**
   * Orders buckets by number of IDs added, smallest first. Only the
   * sign of the result is meaningful.
   */
  public int compareTo(Bucket other) {
    // explicit comparison instead of subtraction: nextfree is a public
    // field, so nothing guarantees the difference cannot overflow
    if (nextfree < other.nextfree)
      return -1;
    return nextfree == other.nextfree ? 0 : 1;
  }

  /**
   * Sorts the stored IDs if any were added since the last sort. Does
   * nothing for an oversized bucket.
   */
  public void sort() {
    if (!dirty || records == null)
      return;

    Arrays.sort(records, 0, nextfree);
    dirty = false;
  }

  /**
   * Returns a weight that shrinks as the bucket grows: 1.0 for an
   * empty bucket, otherwise 1 / ln(size + 1).
   */
  public double getScore() {
    if (nextfree == 0)
      return 1.0;
    else
      return 1.0 / Math.log((double) (nextfree + 1));
  }

  /**
   * Returns true iff the given ID is stored in the bucket. Uses binary
   * search, so {@link #sort()} must have been called after the last
   * {@link #add(long)}. An oversized bucket has discarded its contents
   * and therefore contains nothing.
   */
  public boolean contains(long record) {
    if (records == null)
      return false; // contents were discarded; previously this threw NPE
    return Arrays.binarySearch(records, 0, nextfree, record) >= 0;
  }
}
19 | */ 20 | public class ExemplarsTracker extends AbstractMatchListener { 21 | // we cheat in this map, and map the pair onto itself, which is why 22 | // the pair object contains a counter (saves one object, and thus 23 | // some memory) 24 | private Map exemplars; 25 | private Configuration config; 26 | private Comparator comparator; 27 | 28 | public ExemplarsTracker(Configuration config, Comparator comparator) { 29 | this.config = config; 30 | this.exemplars = new HashMap(); 31 | this.comparator = comparator; 32 | } 33 | 34 | public synchronized void matches(Record r1, Record r2, double confidence) { 35 | Pair key = new Pair(getid(r1), getid(r2)); 36 | Pair counter = exemplars.get(key); 37 | if (counter == null) { 38 | exemplars.put(key, key); 39 | counter = key; 40 | } 41 | counter.counter++; 42 | } 43 | 44 | public List getExemplars() { 45 | List sorted = new ArrayList(exemplars.size()); 46 | sorted.addAll(exemplars.keySet()); 47 | Collections.sort(sorted, comparator); 48 | return sorted; 49 | } 50 | 51 | private String getid(Record r) { 52 | for (String propname : r.getProperties()) 53 | if (config.getPropertyByName(propname).isIdProperty()) 54 | return r.getValue(propname); 55 | return null; 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/utils/Utils.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.utils; 3 | 4 | import java.io.File; 5 | import java.io.IOException; 6 | 7 | import no.priv.garshol.duke.DukeException; 8 | 9 | public class Utils { 10 | 11 | /** 12 | * Combines two probabilities using Bayes' theorem. 
This is the 13 | * approach known as "naive Bayes", very well explained here: 14 | * http://www.paulgraham.com/naivebayes.html 15 | */ 16 | public static double computeBayes(double prob1, double prob2) { 17 | return (prob1 * prob2) / 18 | ((prob1 * prob2) + ((1.0 - prob1) * (1.0 - prob2))); 19 | } 20 | 21 | /** 22 | * Returns true iff we are running on Windows. Used to detect 23 | * whether it's safe to use Lucene's NIOFSDirectory. It's slow on 24 | * Windows due to a Java bug. 25 | */ 26 | public static boolean isWindowsOS() { 27 | return System.getProperty("os.name").startsWith("Windows"); 28 | } 29 | 30 | /** 31 | * Creates a temporary folder using the given prefix to generate its name. 32 | * @param prefix the prefix string to be used in generating the directory's name; may be null 33 | * @return the File to the newly created folder 34 | * @throws IOException 35 | */ 36 | public static File createTempDirectory(String prefix) { 37 | File temp = null; 38 | 39 | try { 40 | temp = File.createTempFile(prefix != null ? 
prefix : "temp", Long.toString(System.nanoTime())); 41 | 42 | if (!(temp.delete())) { 43 | throw new IOException("Could not delete temp file: " 44 | + temp.getAbsolutePath()); 45 | } 46 | 47 | if (!(temp.mkdir())) { 48 | throw new IOException("Could not create temp directory: " 49 | + temp.getAbsolutePath()); 50 | } 51 | } catch (IOException e) { 52 | throw new DukeException("Unable to create temporary directory with prefix " + prefix, e); 53 | } 54 | 55 | return temp; 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /duke-lucene/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | no.priv.garshol.duke 6 | duke 7 | 1.4-SNAPSHOT 8 | ../ 9 | 10 | duke-lucene 11 | jar 12 | 13 | 14 | 15 | 16 | no.priv.garshol.duke 17 | duke-core 18 | 19 | 20 | no.priv.garshol.duke 21 | duke-core 22 | test-jar 23 | test 24 | 25 | 26 | 27 | org.apache.lucene 28 | lucene-core 29 | 4.0.0 30 | 31 | 32 | 33 | 34 | org.apache.lucene 35 | lucene-analyzers-common 36 | 4.0.0 37 | 38 | 39 | 42 | 43 | org.apache.lucene 44 | lucene-spatial 45 | 4.0.0 46 | 47 | 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/RecordSearch.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke; 3 | 4 | import java.util.Collection; 5 | import java.util.Collections; 6 | import java.io.IOException; 7 | 8 | import org.xml.sax.SAXException; 9 | 10 | import no.priv.garshol.duke.RecordImpl; 11 | import no.priv.garshol.duke.utils.CommandLineParser; 12 | import no.priv.garshol.duke.matchers.PrintMatchListener; 13 | 14 | /** 15 | * Search for records and display the matching ones. 
16 | */ 17 | public class RecordSearch extends AbstractCmdlineTool { 18 | 19 | public static void main(String[] argv) throws IOException, SAXException { 20 | new RecordSearch().run(argv); 21 | } 22 | 23 | public void run(String[] argv) 24 | throws IOException, SAXException { 25 | Collection options = 26 | Collections.singleton((CommandLineParser.Option) new CommandLineParser.StringOption("maxhits", 'H')); 27 | argv = init(argv, 3, 3, options); 28 | int max_hits = 10000; 29 | if (parser.getOptionValue("maxhits") != null) 30 | max_hits = Integer.parseInt(parser.getOptionValue("maxhits")); 31 | 32 | // build record 33 | RecordImpl prototype = new RecordImpl(); 34 | prototype.addValue(argv[1], argv[2]); 35 | 36 | // search 37 | Collection records = database.findCandidateMatches(prototype); 38 | int hitno = 1; 39 | for (Record record : records) { 40 | PrintMatchListener.prettyPrint(record, config.getProperties()); 41 | System.out.println(); 42 | if (hitno++ == max_hits) 43 | break; 44 | } 45 | } 46 | 47 | protected void usage() { 48 | System.out.println(""); 49 | System.out.println("java no.priv.garshol.duke.RecordSearch "); 50 | System.out.println(""); 51 | System.out.println(" --reindex: Reindex all records before comparing"); 52 | System.out.println(" --maxhits: Don't return more than this number of records"); 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /duke-dist/src/main/assembly/dep.xml: -------------------------------------------------------------------------------- 1 | 4 | 5 | bin 6 | 7 | zip 8 | 9 | 10 | 11 | 12 | ${project.basedir}/../ 13 | 14 | 15 | README* 16 | LICENSE* 17 | NOTICE* 18 | doc/example-data/dogfood* 19 | doc/example-data/countries* 20 | doc/example-data/deichmann.xml 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | true 29 | 30 | 31 | ${project.groupId}:duke-core 32 | ${project.groupId}:duke-mapdb 33 | ${project.groupId}:duke-lucene 34 | ${project.groupId}:duke-server 35 | 
${project.groupId}:duke-mongodb 36 | ${project.groupId}:duke-json 37 | ${project.groupId}:duke-es 38 | 39 | 40 | 41 | lib/ 42 | false 43 | 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /duke-core/src/main/resources/no/priv/garshol/duke/config-schema.rnc: -------------------------------------------------------------------------------- 1 | 2 | # RELAX-NG schema (compact syntax) for the Duke configuration file syntax. 3 | # Currently not being used for validation, but exists for documentation 4 | # purposes, if nothing else. 5 | 6 | start = duke 7 | 8 | duke = element duke { param*, object*, schema, database?, 9 | ((group, group) | source*) } 10 | 11 | schema = element schema { 12 | threshold, maybe-threshold?, property* 13 | } 14 | 15 | threshold = element threshold { xsd:decimal } 16 | maybe-threshold = element maybe-threshold { xsd:decimal } 17 | 18 | 19 | lookupattr = attribute lookup { "true" | "false" | "required" | "default"} 20 | property = idproperty | ignoreproperty | compareproperty 21 | idproperty = element property { 22 | attribute type { "id" }, name 23 | } 24 | ignoreproperty = element property { 25 | attribute type { "ignore" }, lookupattr?, name, comparator?, low?, high? 
26 | } 27 | compareproperty = element property { 28 | attribute type { "compare" }?, lookupattr?, name, comparator?, low, high 29 | } 30 | 31 | name = element name { text } 32 | comparator = element comparator { text } 33 | low = element low { xsd:decimal } 34 | high = element high { xsd:decimal } 35 | 36 | database = element database { attribute class { text }?, param* } 37 | 38 | source = ntriples | sparql | csv | jdbc | plugin 39 | ntriples = element ntriples { param*, column+ } 40 | sparql = element sparql { param*, column+ } 41 | csv = element csv { param*, column+ } 42 | jdbc = element jdbc { param*, column+ } 43 | plugin = element data-source { classatt, param*, column* } 44 | 45 | param = element param { nameatt, attribute value { text } } 46 | nameatt = attribute name { text } 47 | 48 | column = element column { nameatt, propertyatt, cleaner? } 49 | propertyatt = attribute property { text } 50 | cleaner = attribute cleaner { text } 51 | 52 | group = element group { source+ } 53 | 54 | object = element object { classatt, nameatt, param* } 55 | classatt = attribute class { text } 56 | -------------------------------------------------------------------------------- /doc/example-data/dogfood-sparql.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 0.85 4 | test 5 | 6 | 7 | ID 8 | 9 | 10 | 11 | NAME 12 | no.priv.garshol.duke.comparators.JaroWinkler 13 | 0.4 14 | 0.8 15 | 16 | 17 | MBOX_HASH 18 | no.priv.garshol.duke.comparators.ExactComparator 19 | 0.48 20 | 0.8 21 | 22 | 23 | AFFILIATION 24 | no.priv.garshol.duke.comparators.ExactComparator 25 | 0.45 26 | 0.6 27 | 28 | 29 | HOMEPAGE 30 | no.priv.garshol.duke.comparators.ExactComparator 31 | 0.48 32 | 0.9 33 | 34 | 35 | 36 | 37 | 38 | 39 | 47 | 48 | 50 | 53 | 55 | 57 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/databases/InMemoryBlockingDatabase.java: 
-------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.databases; 3 | 4 | import java.util.Map; 5 | import java.util.TreeMap; 6 | import java.util.HashMap; 7 | import java.util.HashSet; 8 | import java.util.ArrayList; 9 | import java.util.Collection; 10 | import java.util.NavigableMap; 11 | 12 | import no.priv.garshol.duke.Record; 13 | import no.priv.garshol.duke.Property; 14 | import no.priv.garshol.duke.Database; 15 | import no.priv.garshol.duke.Configuration; 16 | 17 | /** 18 | * A database using blocking to find candidate records. It's in-memory 19 | * so capacity is limited, but it's primarily intended as a prototype 20 | * to test the performance and recall of the blocking approach. 21 | * @since 1.2 22 | */ 23 | public class InMemoryBlockingDatabase extends AbstractBlockingDatabase { 24 | 25 | public InMemoryBlockingDatabase() { 26 | super(); 27 | this.idmap = new HashMap(); 28 | } 29 | 30 | public void index(Record record) { 31 | indexById(record); 32 | 33 | // index by key 34 | for (KeyFunction keyfunc : functions) { 35 | NavigableMap> blocks = getBlocks(keyfunc); 36 | String key = keyfunc.makeKey(record); 37 | Collection block = blocks.get(key); 38 | if (block == null) { 39 | block = new ArrayList(); 40 | blocks.put(key, block); 41 | } 42 | block.add(record); 43 | } 44 | } 45 | 46 | public boolean isInMemory() { 47 | return true; 48 | } 49 | 50 | public String toString() { 51 | return "InMemoryBlockingDatabase window_size=" + window_size + "\n " + 52 | functions; 53 | } 54 | 55 | // --- plug in extensions 56 | 57 | protected int addBlock(Collection candidates, 58 | Map.Entry block) { 59 | Collection recs = (Collection) block.getValue(); 60 | candidates.addAll(recs); 61 | return recs.size(); 62 | } 63 | 64 | protected NavigableMap makeMap(KeyFunction keyfunc) { 65 | return new TreeMap(); 66 | } 67 | } -------------------------------------------------------------------------------- 
/duke-core/src/test/java/no/priv/garshol/duke/genetic/ComparatorAspectTest.java: -------------------------------------------------------------------------------- 1 | package no.priv.garshol.duke.genetic; 2 | 3 | import static org.junit.Assert.assertTrue; 4 | 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | 8 | import no.priv.garshol.duke.Comparator; 9 | import no.priv.garshol.duke.Configuration; 10 | import no.priv.garshol.duke.ConfigurationImpl; 11 | import no.priv.garshol.duke.Property; 12 | import no.priv.garshol.duke.PropertyImpl; 13 | 14 | import org.junit.Before; 15 | import org.junit.Test; 16 | 17 | public class ComparatorAspectTest { 18 | private Configuration config1; 19 | private TestComparator comparator = new TestComparator(); 20 | private String propName = "NAME"; 21 | 22 | @Before 23 | public void setup() { 24 | 25 | config1 = new ConfigurationImpl(); 26 | 27 | List props = new ArrayList(); 28 | props.add(new PropertyImpl("ID")); 29 | props.add(new PropertyImpl(propName, null, 0.3, 0.8)); 30 | 31 | ((ConfigurationImpl) config1).setProperties(props); 32 | ((ConfigurationImpl) config1).setThreshold(0.85); 33 | } 34 | 35 | @Test 36 | public void canAddCustomComparator() { 37 | GeneticConfiguration conf = new GeneticConfiguration(config1); 38 | Property aspectProp = new PropertyImpl(propName, null, 0.5, 0.5); 39 | List compList = new ArrayList(); 40 | compList.add(comparator); 41 | ComparatorAspect aspect = new ComparatorAspect(aspectProp, compList); 42 | 43 | aspect.setRandomly(conf); 44 | 45 | Property updatedProp = config1.getPropertyByName(propName); 46 | Comparator randomComparator = updatedProp.getComparator(); 47 | assertTrue("should have custom comparator set, but has : " + randomComparator.getClass(), randomComparator.equals(comparator)); 48 | } 49 | } 50 | 51 | class TestComparator implements Comparator { 52 | 53 | @Override 54 | public boolean isTokenized() { 55 | return false; 56 | } 57 | 58 | @Override 59 | public double 
compare(String v1, String v2) { 60 | return 0; 61 | } 62 | 63 | } -------------------------------------------------------------------------------- /duke-core/src/test/java/no/priv/garshol/duke/cleaners/LowerCaseNormalizeCleanerTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.cleaners; 3 | 4 | import no.priv.garshol.duke.Cleaner; 5 | import org.junit.Before; 6 | import org.junit.Test; 7 | 8 | import static junit.framework.Assert.assertEquals; 9 | 10 | public class LowerCaseNormalizeCleanerTest { 11 | protected Cleaner cleaner; 12 | 13 | @Before 14 | public void setUp() { 15 | cleaner = new LowerCaseNormalizeCleaner(); 16 | } 17 | 18 | @Test 19 | public void testEmpty() { 20 | assertEquals("", cleaner.clean("")); 21 | } 22 | 23 | @Test 24 | public void testSingleChar() { 25 | assertEquals("a", cleaner.clean("A")); 26 | } 27 | 28 | @Test 29 | public void testSingleChar2() { 30 | assertEquals("a", cleaner.clean("a")); 31 | } 32 | 33 | @Test 34 | public void testSingleSpace() { 35 | assertEquals("", cleaner.clean(" ")); 36 | } 37 | 38 | @Test 39 | public void testManySpaces() { 40 | assertEquals("", cleaner.clean(" ")); 41 | } 42 | 43 | @Test 44 | public void testManyLeadingSpaces() { 45 | assertEquals("a", cleaner.clean(" a")); 46 | } 47 | 48 | @Test 49 | public void testManyTrailingSpaces() { 50 | assertEquals("a", cleaner.clean("a ")); 51 | } 52 | 53 | @Test 54 | public void testLarsMarius() { 55 | assertEquals("lars marius", cleaner.clean("Lars Marius")); 56 | } 57 | 58 | @Test 59 | public void testLarsMarius3Spaces() { 60 | assertEquals("lars marius", cleaner.clean("Lars Marius")); 61 | } 62 | 63 | @Test 64 | public void testLarsMariusPadded() { 65 | assertEquals("lars marius", cleaner.clean(" Lars Marius ")); 66 | } 67 | 68 | @Test 69 | public void testRealData() { 70 | assertEquals("inger elisabeth foyn havre", 71 | cleaner.clean("Inger Elisabeth Foyn Havre")); 72 | } 73 | 74 | @Test 
75 | public void testAccentStripping() { 76 | assertEquals("male", cleaner.clean("Mal\u00E9")); 77 | } 78 | 79 | @Test 80 | public void testAccentStripping2() { 81 | assertEquals("h\u00F8ybr\u00E5ten", cleaner.clean("H\u00F8ybr\u00E5ten")); 82 | } 83 | 84 | } -------------------------------------------------------------------------------- /duke-server/src/main/java/no/priv/garshol/duke/server/CommonJTimer.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.server; 3 | 4 | import java.util.Properties; 5 | import javax.naming.InitialContext; 6 | 7 | import commonj.timers.Timer; 8 | import commonj.timers.TimerManager; 9 | import commonj.timers.TimerListener; 10 | 11 | import no.priv.garshol.duke.DukeException; 12 | 13 | // makes it easier to deal with properties 14 | import static no.priv.garshol.duke.utils.PropertyUtils.get; 15 | 16 | /** 17 | * Timer implementation which uses the JSR-236 API, in order to 18 | * provide managed threads within servlet containers that support 19 | * them. 20 | */ 21 | public class CommonJTimer implements DukeTimer, TimerListener { 22 | private TimerManager mgr; 23 | private Timer timer; 24 | private DukeController controller; 25 | 26 | public CommonJTimer() { 27 | } 28 | 29 | public void init(Properties props) { 30 | String path = get(props, "duke.timer-jndipath"); 31 | try { 32 | InitialContext ctx = new InitialContext(); 33 | mgr = (TimerManager) ctx.lookup(path); 34 | } 35 | catch (Exception e) { 36 | throw new DukeException(e); 37 | } 38 | } 39 | 40 | /** 41 | * Starts a background thread which calls the controller every 42 | * check_interval milliseconds. Returns immediately, leaving the 43 | * background thread running. 
44 | */ 45 | public void spawnThread(DukeController controller, int check_interval) { 46 | this.controller = controller; 47 | timer = mgr.schedule(this, 0, check_interval * 1000); // convert to ms 48 | } 49 | 50 | /** 51 | * Returns true iff the background thread is running. 52 | */ 53 | public boolean isRunning() { 54 | return timer != null; 55 | } 56 | 57 | /** 58 | * Stops the background thread. It can be restarted with a new call 59 | * to spawnThread. 60 | */ 61 | public void stop() { 62 | timer.cancel(); 63 | timer = null; 64 | } 65 | 66 | /** 67 | * This is the callback from the timer service, letting us know it's 68 | * time do something. 69 | */ 70 | public void timerExpired(Timer timer) { 71 | controller.process(); 72 | } 73 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/databases/KeyValueStore.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.databases; 3 | 4 | import no.priv.garshol.duke.Record; 5 | 6 | /** 7 | * An interface encapsulating the way KeyValueDatabase interacts with 8 | * the underlying database. Used to allow different key/value 9 | * databases to be plugged in and tested. 10 | */ 11 | public interface KeyValueStore { 12 | 13 | /** 14 | * Returns true iff the database is held entirely in memory, and 15 | * thus is not persistent. 16 | */ 17 | public boolean isInMemory(); 18 | 19 | /** 20 | * Flushes all changes to disk. For in-memory databases this is a 21 | * no-op. 22 | */ 23 | public void commit(); 24 | 25 | /** 26 | * Stores state to disk and closes all open resources. 27 | */ 28 | public void close(); 29 | 30 | /** 31 | * Returns a new internal record ID. 32 | */ 33 | public long makeNewRecordId(); 34 | 35 | /** 36 | * Stores the entire record under the given internal record ID. 
37 | */ 38 | public void registerRecord(long id, Record record); 39 | 40 | /** 41 | * Records that this external ID refers to the given internal record 42 | * ID. 43 | * @param id the internal record ID 44 | * @param extid the external ID 45 | */ 46 | public void registerId(long id, String extid); 47 | 48 | /** 49 | * Records that the given token occurred in the given record. 50 | * @param id the ID of the record the token occurred in 51 | * @param propname the property the token occurred in 52 | * @param token the actual token 53 | */ 54 | public void registerToken(long id, String propname, String token); 55 | 56 | /** 57 | * Returns the record with the given external ID. 58 | */ 59 | public Record findRecordById(String extid); 60 | 61 | /** 62 | * Returns the record with the given internal ID. This method must 63 | * be thread-safe. 64 | */ 65 | public Record findRecordById(long id); 66 | 67 | /** 68 | * Returns the IDs of all records which have the given token in a 69 | * value for this property. This method must be thread-safe. 
70 | */ 71 | public Bucket lookupToken(String propname, String token); 72 | } 73 | -------------------------------------------------------------------------------- /duke-core/src/test/java/no/priv/garshol/duke/comparators/QGramComparatorTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.comparators; 3 | 4 | import org.junit.Before; 5 | import org.junit.Test; 6 | 7 | import static junit.framework.Assert.assertEquals; 8 | 9 | public class QGramComparatorTest { 10 | private QGramComparator comp; 11 | 12 | @Before 13 | public void setup() { 14 | comp = new QGramComparator(); 15 | } 16 | 17 | @Test 18 | public void testEmpty() { 19 | assertEquals(1.0, comp.compare("", "")); 20 | } 21 | 22 | @Test 23 | public void testOneIsEmpty() { 24 | assertEquals(0.0, comp.compare("", "abc")); 25 | } 26 | 27 | @Test 28 | public void testOneIsDifferent() { 29 | assertEquals((4.0 / 6.0), comp.compare("abc def", "cab def")); 30 | } 31 | 32 | @Test 33 | public void testGail() { 34 | assertEquals((1.0 / 3.0), comp.compare("gail", "gayle")); 35 | } 36 | 37 | @Test 38 | public void testGailJaccard() { 39 | comp.setFormula(QGramComparator.Formula.JACCARD); 40 | assertEquals((1.0 / 6.0), comp.compare("gail", "gayle")); 41 | } 42 | 43 | @Test 44 | public void testGailDice() { 45 | comp.setFormula(QGramComparator.Formula.DICE); 46 | assertEquals((2.0 / 7.0), comp.compare("gail", "gayle")); 47 | } 48 | 49 | @Test 50 | public void testGail3() { 51 | comp.setQ(3); 52 | assertEquals(0.0, comp.compare("gail", "gayle")); 53 | } 54 | 55 | @Test 56 | public void testGarshol3() { 57 | comp.setQ(3); 58 | assertEquals((4.0 / 5.0), comp.compare("garshol", "garshoel")); 59 | } 60 | 61 | @Test 62 | public void testGailPositional() { 63 | comp.setTokenizer(QGramComparator.Tokenizer.POSITIONAL); 64 | assertEquals((1.0 / 3.0), comp.compare("gail", "gayle")); 65 | } 66 | 67 | @Test 68 | public void testKakadu() { 69 | assertEquals((1.0 
/ 2.0), comp.compare("kakadu", "cacadu")); 70 | } 71 | 72 | @Test 73 | public void testKakaduPositional() { 74 | comp.setTokenizer(QGramComparator.Tokenizer.POSITIONAL); 75 | assertEquals((2.0 / 5.0), comp.compare("kakadu", "cacadu")); 76 | } 77 | 78 | @Test 79 | public void testGailEnds() { 80 | comp.setTokenizer(QGramComparator.Tokenizer.ENDS); 81 | assertEquals((2.0 / 5.0), comp.compare("gail", "gayle")); 82 | } 83 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/cleaners/PersonNameCleaner.java: -------------------------------------------------------------------------------- 1 | 2 | 3 | package no.priv.garshol.duke.cleaners; 4 | 5 | import java.util.Map; 6 | import java.util.HashMap; 7 | import java.io.InputStream; 8 | import java.io.BufferedReader; 9 | import java.io.InputStreamReader; 10 | import java.io.IOException; 11 | 12 | import no.priv.garshol.duke.Cleaner; 13 | import no.priv.garshol.duke.utils.StringUtils; 14 | 15 | /** 16 | * Experimental cleaner for person names, which understands 17 | * about abbreviations like "joe" for "joseph", etc. 18 | */ 19 | public class PersonNameCleaner implements Cleaner { 20 | private LowerCaseNormalizeCleaner sub; 21 | private Map mapping; 22 | 23 | public PersonNameCleaner() { 24 | this.sub = new LowerCaseNormalizeCleaner(); 25 | 26 | // load token translation mapping (FIXME: move to static init?) 
27 | try { 28 | this.mapping = loadMapping(); 29 | } catch (IOException e) { 30 | throw new RuntimeException(e); 31 | } 32 | } 33 | 34 | public String clean(String value) { 35 | // do basic cleaning 36 | value = sub.clean(value); 37 | if (value == null || value.equals("")) 38 | return value; 39 | 40 | // tokenize, then map tokens, then rejoin 41 | String[] tokens = StringUtils.split(value); 42 | for (int ix = 0; ix < tokens.length; ix++) { 43 | String mapsto = mapping.get(tokens[ix]); 44 | if (mapsto != null) 45 | tokens[ix] = mapsto; 46 | } 47 | 48 | return StringUtils.join(tokens); 49 | } 50 | 51 | private Map loadMapping() throws IOException { 52 | String mapfile = "no/priv/garshol/duke/name-mappings.txt"; 53 | 54 | Map mapping = new HashMap(); 55 | ClassLoader cloader = Thread.currentThread().getContextClassLoader(); 56 | InputStream istream = cloader.getResourceAsStream(mapfile); 57 | InputStreamReader reader = new InputStreamReader(istream, "utf-8"); 58 | BufferedReader in = new BufferedReader(reader); 59 | 60 | String line = in.readLine(); 61 | while (line != null) { 62 | int pos = line.indexOf(','); 63 | mapping.put(line.substring(0, pos), line.substring(pos + 1)); 64 | line = in.readLine(); 65 | } 66 | 67 | in.close(); 68 | return mapping; 69 | } 70 | } -------------------------------------------------------------------------------- /duke-lucene/src/main/java/no/priv/garshol/duke/databases/DocumentRecord.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.databases; 3 | 4 | import java.util.HashSet; 5 | import java.util.ArrayList; 6 | import java.util.Collection; 7 | import java.util.Collections; 8 | import org.apache.lucene.document.Document; 9 | import org.apache.lucene.index.IndexableField; 10 | 11 | import no.priv.garshol.duke.Record; 12 | 13 | /** 14 | * Wraps a Lucene Document to provide a representation of it as a Record. 
15 | */ 16 | public class DocumentRecord implements Record { 17 | /** 18 | * Beware: this document number will change when changes are made to 19 | * the Lucene index. So while it's safe to use right now, it is not 20 | * safe if record objects persist across batch process calls. It 21 | * might also not be safe in a multi-threaded setting. So 22 | * longer-term we may need a better solution for removing duplicate 23 | * candidates. 24 | */ 25 | private int docno; 26 | private Document doc; 27 | 28 | public DocumentRecord(int docno, Document doc) { 29 | this.docno = docno; 30 | this.doc = doc; 31 | } 32 | 33 | public Collection getProperties() { 34 | Collection props = new HashSet(); 35 | for (IndexableField f : doc.getFields()) 36 | props.add(f.name()); 37 | return props; 38 | } 39 | 40 | public String getValue(String prop) { 41 | return doc.get(prop); 42 | } 43 | 44 | public Collection getValues(String prop) { 45 | IndexableField[] fields = doc.getFields(prop); 46 | if (fields.length == 1) 47 | return Collections.singleton(fields[0].stringValue()); 48 | 49 | Collection values = new ArrayList(fields.length); 50 | for (int ix = 0; ix < fields.length; ix++) 51 | values.add(fields[ix].stringValue()); 52 | return values; 53 | } 54 | 55 | public void merge(Record other) { 56 | throw new UnsupportedOperationException(); 57 | } 58 | 59 | public String toString() { 60 | return "[DocumentRecord " + docno + " " + doc + "]"; 61 | } 62 | 63 | public int hashCode() { 64 | return docno; 65 | } 66 | 67 | public boolean equals(Object other) { 68 | if (!(other instanceof DocumentRecord)) 69 | return false; 70 | 71 | return ((DocumentRecord) other).docno == docno; 72 | } 73 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/databases/InMemoryDatabase.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.databases; 3 | 4 | import 
java.util.Map; 5 | import java.util.HashMap; 6 | import java.util.ArrayList; 7 | import java.util.Collection; 8 | 9 | import no.priv.garshol.duke.Record; 10 | import no.priv.garshol.duke.Property; 11 | import no.priv.garshol.duke.Database; 12 | import no.priv.garshol.duke.Configuration; 13 | 14 | /** 15 | * Naïve in-memory store of records. Matches all records against all 16 | * other records. 17 | */ 18 | public class InMemoryDatabase implements Database { 19 | private Configuration config; 20 | private Map idindex; 21 | private Collection records; 22 | 23 | public InMemoryDatabase() { 24 | this.idindex = new HashMap(); 25 | this.records = new ArrayList(); 26 | } 27 | 28 | public void setConfiguration(Configuration config) { 29 | this.config = config; 30 | } 31 | 32 | public void setOverwrite(boolean overwrite) { 33 | } 34 | 35 | /** 36 | * Returns true iff the database is held entirely in memory, and 37 | * thus is not persistent. 38 | */ 39 | public boolean isInMemory() { 40 | return true; 41 | } 42 | 43 | /** 44 | * Add the record to the index. 45 | */ 46 | public void index(Record record) { 47 | for (Property p : config.getIdentityProperties()) { 48 | Collection values = record.getValues(p.getName()); 49 | if (values == null) 50 | continue; 51 | 52 | for (String id : values) 53 | idindex.put(id, record); 54 | } 55 | records.add(record); 56 | } 57 | 58 | /** 59 | * Look up record by identity. 60 | */ 61 | public Record findRecordById(String id) { 62 | return idindex.get(id); 63 | } 64 | 65 | /** 66 | * Look up potentially matching records. 67 | */ 68 | public Collection findCandidateMatches(Record record) { 69 | return records; 70 | } 71 | 72 | /** 73 | * Flushes all changes to disk. For in-memory databases this is a 74 | * no-op. 75 | */ 76 | public void commit() { 77 | } 78 | 79 | /** 80 | * Stores state to disk and closes all open resources. 
81 | */ 82 | public void close() { 83 | } 84 | 85 | public String toString() { 86 | return "InMemoryDatabase"; 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /doc/example-data/dogfood.xml: -------------------------------------------------------------------------------- 1 | 17 | 18 | 19 | 0.89 20 | 21 | 22 | 23 | ID 24 | 25 | 26 | 27 | NAME 28 | no.priv.garshol.duke.comparators.JaroWinklerTokenized 29 | 0.2 30 | 0.88 31 | 32 | 33 | MBOX_HASH 34 | no.priv.garshol.duke.comparators.ExactComparator 35 | 0.48 36 | 0.6 37 | 38 | 39 | AFFILIATION 40 | no.priv.garshol.duke.comparators.ExactComparator 41 | 0.48 42 | 0.6 43 | 44 | 45 | HOMEPAGE 46 | no.priv.garshol.duke.comparators.ExactComparator 47 | 0.48 48 | 0.6 49 | 50 | 51 | 52 | 53 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 63 | 66 | 68 | 70 | 72 | 73 | 74 | -------------------------------------------------------------------------------- /duke-json/src/test/java/no/priv/garshol/duke/datasources/JsonDataSourceTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.datasources; 3 | 4 | import java.io.IOException; 5 | 6 | import no.priv.garshol.duke.Record; 7 | import no.priv.garshol.duke.RecordIterator; 8 | import org.junit.Before; 9 | import org.junit.Test; 10 | 11 | import static org.junit.Assert.assertEquals; 12 | import static org.junit.Assert.assertTrue; 13 | 14 | /** 15 | * Created by damien on 08/04/14. 
16 | */ 17 | public class JsonDataSourceTest { 18 | private JsonDataSource source; 19 | 20 | @Before 21 | public void setup() { 22 | source = new JsonDataSource(); 23 | source.addColumn(new Column("F1", null, null, null)); 24 | source.addColumn(new Column("F2", null, null, null)); 25 | source.addColumn(new Column("F3", null, null, null)); 26 | } 27 | 28 | @Test 29 | public void testEmpty() throws IOException { 30 | RecordIterator it = source.getRecordsFromString(""); 31 | assertTrue(!it.hasNext()); 32 | } 33 | 34 | @Test 35 | public void testSingleRecord() throws IOException { 36 | Record r = source.getRecordsFromString("{\"F1\":\"a\",\"F2\" : \"b\", \"F3\" : \"c\", \"F4\" : \"d\"}").next(); 37 | 38 | assertEquals("a", r.getValue("F1")); 39 | assertEquals("b", r.getValue("F2")); 40 | assertEquals("c", r.getValue("F3")); 41 | } 42 | 43 | @Test 44 | public void testArrayField() { 45 | Record r = source.getRecordsFromString("{\"F1\":[\"a\",\"b\",\"c\"]}").next(); 46 | assertEquals(3, r.getValues("F1").size()); 47 | } 48 | 49 | @Test 50 | public void testNestRecords() { 51 | Record r = source.getRecordsFromString("{\"F1\":\"a\",\"FF2\" : {\"F2\" : \"b\"}, \"FFF3\" : {\"FF3\" : {\"F3\" : \"c\",\"F4\" : \"d\"}}}").next(); 52 | assertEquals("a", r.getValue("F1")); 53 | assertEquals("b", r.getValue("F2")); 54 | assertEquals("c", r.getValue("F3")); 55 | } 56 | 57 | @Test 58 | public void multipleRecords() { 59 | RecordIterator it = source.getRecordsFromString("{\"F1\":\"a\",\"F2\" : \"b\", \"F3\" : \"c\"}{\"F1\":\"a2\",\"F2\" : \"b2\", \"F3\" : \"c2\"}"); 60 | Record r1 = it.next(); 61 | assertEquals("a", r1.getValue("F1")); 62 | assertEquals("b", r1.getValue("F2")); 63 | assertEquals("c", r1.getValue("F3")); 64 | Record r2 = it.next(); 65 | assertEquals("a2", r2.getValue("F1")); 66 | assertEquals("b2", r2.getValue("F2")); 67 | assertEquals("c2", r2.getValue("F3")); 68 | } 69 | } 70 | 71 | -------------------------------------------------------------------------------- 
/duke-lucene/src/test/java/no/priv/garshol/duke/databases/ExtraLuceneDatabaseTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.databases; 3 | 4 | import java.util.List; 5 | import java.util.ArrayList; 6 | import java.io.File; 7 | import java.io.IOException; 8 | 9 | import org.apache.lucene.analysis.Analyzer; 10 | import org.apache.lucene.analysis.standard.StandardAnalyzer; 11 | import org.apache.lucene.index.IndexWriter; 12 | import org.apache.lucene.index.IndexWriterConfig; 13 | import org.apache.lucene.store.Directory; 14 | import org.apache.lucene.store.FSDirectory; 15 | import org.apache.lucene.util.Version; 16 | 17 | import no.priv.garshol.duke.Configuration; 18 | import no.priv.garshol.duke.ConfigurationImpl; 19 | import no.priv.garshol.duke.Property; 20 | import no.priv.garshol.duke.PropertyImpl; 21 | import no.priv.garshol.duke.Record; 22 | import no.priv.garshol.duke.RecordImpl; 23 | import no.priv.garshol.duke.DukeException; 24 | 25 | import org.junit.After; 26 | import org.junit.Before; 27 | import org.junit.Test; 28 | 29 | import static org.junit.Assert.assertEquals; 30 | 31 | public class ExtraLuceneDatabaseTest { 32 | 33 | @Test 34 | public void testLockedIndex() throws IOException { 35 | // this test verifies that we don't wind up in an inconsistent state 36 | // when the index we want to work with is already locked 37 | 38 | // make a locked index 39 | Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); 40 | IndexWriterConfig cfg = 41 | new IndexWriterConfig(Version.LUCENE_CURRENT, analyzer); 42 | File tmp = new File(System.getProperty("java.io.tmpdir"), 43 | "lucene-temp-" + (Math.random() * 100000)); 44 | Directory directory = FSDirectory.open(tmp); 45 | IndexWriter writer = new IndexWriter(directory, cfg); 46 | 47 | // now try to open a LuceneDatabase in the same place 48 | List properties = new ArrayList(); 49 | properties.add(new PropertyImpl("id")); 50 
| ConfigurationImpl config = new ConfigurationImpl(); 51 | config.setProperties(properties); 52 | LuceneDatabase db = new LuceneDatabase(); 53 | db.setPath(tmp.getAbsolutePath()); 54 | db.setConfiguration(config); 55 | 56 | Record r = new RecordImpl(); 57 | try { 58 | db.index(r); 59 | } catch (DukeException e) { 60 | // this is expected 61 | } 62 | 63 | try { 64 | db.index(r); 65 | } catch (DukeException e) { 66 | // this is also expected 67 | } 68 | } 69 | 70 | } 71 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/databases/InMemoryKeyValueStore.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.databases; 3 | 4 | import java.util.Map; 5 | import java.util.HashMap; 6 | 7 | import no.priv.garshol.duke.Record; 8 | 9 | /** 10 | * A simple key value store that keeps all data in memory. 11 | * @since 1.0 12 | */ 13 | public class InMemoryKeyValueStore implements KeyValueStore { 14 | private long nextid; // next available id 15 | private Record[] records; // key into this array is the internal id 16 | private Map tokens; // lookup token -> internal ids 17 | private Map byid; // lookup extid -> internal id 18 | 19 | public InMemoryKeyValueStore() { 20 | this.records = new Record[1000]; 21 | this.tokens = new HashMap(); 22 | this.byid = new HashMap(); 23 | } 24 | 25 | public boolean isInMemory() { 26 | return true; 27 | } 28 | 29 | public void commit() { 30 | //System.out.println("Buckets: " + tokens.size()); 31 | for (Bucket b : tokens.values()) 32 | b.sort(); 33 | } 34 | 35 | public void close() { 36 | } 37 | 38 | public long makeNewRecordId() { 39 | return nextid++; 40 | } 41 | 42 | public void registerRecord(long id, Record record) { 43 | // grow array if necessary 44 | if (id >= records.length) { 45 | Record[] newbuf = new Record[records.length * 2]; 46 | System.arraycopy(records, 0, newbuf, 0, records.length); 47 | 
records = newbuf; 48 | } 49 | 50 | // register 51 | records[(int) id] = record; 52 | } 53 | 54 | public void registerId(long id, String extid) { 55 | byid.put(extid, id); 56 | } 57 | 58 | public void registerToken(long id, String propname, String token) { 59 | String key = propname + '|' + token; 60 | Bucket bucket = tokens.get(key); 61 | if (bucket == null) { 62 | bucket = new Bucket(); 63 | tokens.put(key, bucket); 64 | } 65 | bucket.add(id); 66 | } 67 | 68 | public Record findRecordById(String extid) { 69 | Long id = byid.get(extid); 70 | if (id == null) 71 | return null; 72 | return records[id.intValue()]; 73 | } 74 | 75 | public Record findRecordById(long id) { 76 | return records[(int) id]; 77 | } 78 | 79 | public Bucket lookupToken(String propname, String token) { 80 | return tokens.get(propname + '|' + token); 81 | } 82 | 83 | public String toString() { 84 | return "InMemoryKeyValueStore"; 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/datasources/RecordBuilder.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.datasources; 3 | 4 | import java.util.Collection; 5 | 6 | import no.priv.garshol.duke.Record; 7 | import no.priv.garshol.duke.Cleaner; 8 | import no.priv.garshol.duke.RecordImpl; 9 | import no.priv.garshol.duke.CompactRecord; 10 | import no.priv.garshol.duke.ModifiableRecord; 11 | 12 | /** 13 | * Helper class for building records, to avoid having to copy all the 14 | * cleaning logic etc in each single data source. 
15 | */ 16 | public class RecordBuilder { 17 | private ColumnarDataSource source; 18 | private ModifiableRecord record; 19 | 20 | public RecordBuilder(ColumnarDataSource source) { 21 | this.source = source; 22 | } 23 | 24 | public void newRecord() { 25 | record = new CompactRecord(); 26 | } 27 | 28 | public boolean isRecordEmpty() { 29 | return record.isEmpty(); 30 | } 31 | 32 | public void addValue(String column, String value) { 33 | Collection cols = source.getColumn(column); 34 | if (cols == null || cols.isEmpty()) 35 | return; 36 | Column col = cols.iterator().next(); 37 | addValue(col, value); 38 | } 39 | 40 | public void addValue(Column col, String value) { 41 | if (value == null || value.equals("")) 42 | return; 43 | 44 | String prop = col.getProperty(); 45 | Cleaner cleaner = col.getCleaner(); 46 | if (col.isSplit()) { 47 | for (String v : col.split(value)) { 48 | if (cleaner != null) 49 | v = cleaner.clean(v); 50 | if (v != null && !v.equals("")) 51 | record.addValue(prop, v); 52 | } 53 | } else { 54 | if (cleaner != null) 55 | value = cleaner.clean(value); 56 | if (value != null && !value.equals("")) 57 | record.addValue(prop, value); 58 | } 59 | } 60 | 61 | // FIXME: probably we should just get rid of these 62 | public void setValue(String column, String value) { 63 | Collection cols = source.getColumn(column); 64 | Column col = cols.iterator().next(); 65 | setValue(col, value); 66 | } 67 | 68 | public void setValue(Column col, String value) { 69 | if (col.getCleaner() != null) 70 | value = col.getCleaner().clean(value); 71 | if (value == null || value.equals("")) 72 | return; // nothing here, move on 73 | 74 | record.addValue(col.getProperty(), value); 75 | } 76 | 77 | public Record getRecord() { 78 | return record; 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/RecordImpl.java: 
-------------------------------------------------------------------------------- 1 | package no.priv.garshol.duke; 2 | 3 | import java.util.Map; 4 | import java.util.HashMap; 5 | import java.util.ArrayList; 6 | import java.util.Collection; 7 | import java.util.Collections; 8 | 9 | /** 10 | * Previously the default implementation of the Record interface; now 11 | * superceded by CompactRecord. 12 | */ 13 | public class RecordImpl implements ModifiableRecord { 14 | private Map> data; 15 | 16 | public RecordImpl(Map> data) { 17 | this.data = data; // FIXME: should we copy? 18 | } 19 | 20 | public RecordImpl() { 21 | this.data = new HashMap(); 22 | } 23 | 24 | public boolean isEmpty() { 25 | return data.isEmpty(); 26 | } 27 | 28 | public Collection getProperties() { 29 | return data.keySet(); 30 | } 31 | 32 | public String getValue(String prop) { 33 | Collection values = getValues(prop); 34 | if (values == null || values.isEmpty()) 35 | return null; 36 | else 37 | return values.iterator().next(); 38 | } 39 | 40 | public Collection getValues(String prop) { 41 | Collection values = data.get(prop); 42 | if (values == null) 43 | return Collections.EMPTY_LIST; 44 | return values; 45 | } 46 | 47 | public void addValue(String property, String value) { 48 | Collection values = data.get(property); 49 | if (values == null) { 50 | values = new ArrayList(); 51 | data.put(property, values); 52 | } 53 | values.add(value); 54 | } 55 | 56 | public void remove(String property) { 57 | data.remove(property); 58 | } 59 | 60 | public void merge(Record other) { 61 | throw new UnsupportedOperationException(); 62 | } 63 | 64 | public String toString() { 65 | return "[RecordImpl " + data + "]"; 66 | } 67 | 68 | @Override 69 | public int hashCode() { 70 | final int prime = 31; 71 | int result = 1; 72 | result = prime * result + ((data == null) ? 
0 : data.hashCode()); 73 | return result; 74 | } 75 | 76 | @Override 77 | public boolean equals(Object obj) { 78 | if (this == obj) 79 | return true; 80 | if (obj == null) 81 | return false; 82 | if (getClass() != obj.getClass()) 83 | return false; 84 | RecordImpl other = (RecordImpl) obj; 85 | if (data == null) { 86 | if (other.data != null) 87 | return false; 88 | } else if (!data.equals(other.data)) 89 | return false; 90 | return true; 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /duke-core/src/test/java/no/priv/garshol/duke/comparators/GeopositionComparatorTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.comparators; 3 | 4 | import no.priv.garshol.duke.DukeException; 5 | import org.junit.Before; 6 | import org.junit.Test; 7 | 8 | import static junit.framework.Assert.assertEquals; 9 | import static junit.framework.Assert.assertTrue; 10 | import static junit.framework.Assert.fail; 11 | 12 | public class GeopositionComparatorTest { 13 | private GeopositionComparator comp; 14 | 15 | @Before 16 | public void setup() { 17 | comp = new GeopositionComparator(); 18 | } 19 | 20 | @Test 21 | public void testEmpty() { 22 | assertEquals(0.5, comp.compare("", "")); 23 | } 24 | 25 | @Test 26 | public void testMalformed() { 27 | assertEquals(0.5, comp.compare("41.5,27.2", "41.5127.21")); 28 | } 29 | 30 | @Test 31 | public void testMalformed2() { 32 | assertEquals(0.5, comp.compare("41.5,27.2", "1231,123123")); 33 | } 34 | 35 | @Test 36 | public void testMalformedStrict() { 37 | comp.setStrict(true); 38 | try { 39 | assertEquals(0.5, comp.compare("41.5,27.2", "41.5127.21")); 40 | fail("Didn't catch bad value"); 41 | } catch (DukeException e) { 42 | // success 43 | } 44 | } 45 | 46 | @Test 47 | public void testMalformed2Strict() { 48 | comp.setStrict(true); 49 | try { 50 | assertEquals(0.5, comp.compare("41.5,27.2", "1231,123123")); 51 | 
fail("Didn't catch bad value"); 52 | } catch (DukeException e) { 53 | // success 54 | } 55 | } 56 | 57 | @Test 58 | public void testOsloKiev() { 59 | assertEquals(0.0, comp.compare("59.913869,10.752245", "50.45,30.5234")); 60 | } 61 | 62 | @Test 63 | public void testOsloKiev2() { 64 | String oslo = "59.913869,10.752245"; 65 | String kiev = "50.45,30.5234"; 66 | comp.setMaxDistance(2000 * 1000); // WolframAlpha gives distance as 1632km 67 | assertTrue(ratio(1550.0, 2000.0) > comp.compare(oslo, kiev)); 68 | assertTrue(ratio(1700.0, 2000.0) < comp.compare(oslo, kiev)); 69 | } 70 | 71 | private double ratio(double dist, double maxdist) { 72 | return ((1.0 - (dist / maxdist)) * 0.5 ) + 0.5; 73 | } 74 | 75 | @Test 76 | public void testOsloKiev3() { 77 | String oslo = "59.913869,10.752245"; 78 | String kiev = "50.45,30.5234"; 79 | comp.setMaxDistance(2000 * 1000); // WolframAlpha gives distance as 1632km 80 | assertEquals(comp.compare(oslo, kiev), comp.compare(kiev, oslo)); 81 | } 82 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/utils/LinkDatabaseUtils.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.utils; 3 | 4 | import java.io.Reader; 5 | import java.io.FileReader; 6 | import java.io.IOException; 7 | import java.io.BufferedReader; 8 | 9 | import no.priv.garshol.duke.Link; 10 | import no.priv.garshol.duke.LinkKind; 11 | import no.priv.garshol.duke.LinkStatus; 12 | import no.priv.garshol.duke.LinkDatabase; 13 | import no.priv.garshol.duke.DukeException; 14 | import no.priv.garshol.duke.InMemoryLinkDatabase; 15 | 16 | /** 17 | * Utilities for dealing with link databases. 18 | */ 19 | public class LinkDatabaseUtils { 20 | 21 | /** 22 | * Loads a test file into an in-memory link database. 
23 | */ 24 | public static LinkDatabase loadTestFile(String testfile) throws IOException { 25 | LinkDatabase linkdb = new InMemoryLinkDatabase(); 26 | loadTestFile(testfile, linkdb); 27 | return linkdb; 28 | } 29 | 30 | /** 31 | * Loads a test file into an in-memory link database. 32 | * @since 1.2 33 | */ 34 | public static LinkDatabase loadTestFile(Reader reader) throws IOException { 35 | LinkDatabase linkdb = new InMemoryLinkDatabase(); 36 | loadTestFile(reader, linkdb); 37 | return linkdb; 38 | } 39 | 40 | /** 41 | * Loads a test file into an in-memory link database. 42 | */ 43 | public static void loadTestFile(String testfile, LinkDatabase linkdb) 44 | throws IOException { 45 | loadTestFile(new FileReader(testfile), linkdb); 46 | } 47 | 48 | /** 49 | * Loads a test file into an in-memory link database. 50 | * @since 1.2 51 | */ 52 | public static void loadTestFile(Reader input, LinkDatabase linkdb) 53 | throws IOException { 54 | CSVReader reader = new CSVReader(input); 55 | String[] row = reader.next(); 56 | while (row != null) { 57 | if (row.length != 4) 58 | throw new DukeException("Wrong test file format, row had " + 59 | row.length + " values, should be 4"); 60 | 61 | LinkKind kind = row[0].equals("+") ? 
LinkKind.SAME : LinkKind.DIFFERENT; 62 | String id1 = row[1]; 63 | String id2 = row[2]; 64 | if (id1.compareTo(id2) < 0) { 65 | String tmp = id1; 66 | id1 = id2; 67 | id2 = tmp; 68 | } 69 | double conf = Double.valueOf(row[3]); 70 | 71 | linkdb.assertLink(new Link(id1, id2, LinkStatus.ASSERTED, kind, conf)); 72 | 73 | row = reader.next(); 74 | } 75 | 76 | reader.close(); 77 | } 78 | 79 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/CompactRecord.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke; 3 | 4 | import java.util.Arrays; 5 | import java.util.HashSet; 6 | import java.util.ArrayList; 7 | import java.util.Collection; 8 | import java.io.Serializable; 9 | 10 | /** 11 | * An implementation of the Record interface which uses less memory 12 | * than RecordImpl, and which seems to be a little faster. 13 | * @since 1.2 14 | */ 15 | public class CompactRecord implements ModifiableRecord, Serializable { 16 | private String[] s; // 0: prop name, 1: value, 2: prop, 3: value, ... 
17 | private int free; // index of next free prop name cell 18 | 19 | public CompactRecord() { 20 | this.s = new String[16]; 21 | } 22 | 23 | public CompactRecord(int free, String[] s) { 24 | this.free = free; 25 | this.s = s; 26 | } 27 | 28 | public Collection getProperties() { 29 | Collection props = new HashSet(); 30 | for (int ix = 0; ix < free; ix += 2) 31 | props.add(s[ix]); 32 | return props; 33 | } 34 | 35 | public Collection getValues(String prop) { 36 | Collection values = new ArrayList(); 37 | for (int ix = 0; ix < free; ix += 2) 38 | if (s[ix].equals(prop)) 39 | values.add(s[ix + 1]); 40 | return values; 41 | } 42 | 43 | public String getValue(String prop) { 44 | for (int ix = 0; ix < free; ix += 2) 45 | if (s[ix].equals(prop)) 46 | return s[ix + 1]; 47 | return null; 48 | } 49 | 50 | public void merge(Record other) { 51 | throw new UnsupportedOperationException(); 52 | } 53 | 54 | public void addValue(String property, String value) { 55 | if (free >= s.length) { 56 | String[] olds = s; 57 | s = new String[olds.length * 3]; 58 | for (int ix = 0; ix < olds.length; ix++) 59 | s[ix] = olds[ix]; 60 | } 61 | s[free++] = property; 62 | s[free++] = value; 63 | } 64 | 65 | public boolean isEmpty() { 66 | return free == 0; 67 | } 68 | 69 | public int getFree() { 70 | return free; 71 | } 72 | 73 | public String[] getArray() { 74 | return s; 75 | } 76 | 77 | public String toString() { 78 | StringBuilder builder = new StringBuilder("{"); 79 | for (int ix = 0; ix < free; ix += 2) { 80 | if (ix > 0) 81 | builder.append(", "); 82 | builder.append(s[ix]) 83 | .append("=[") 84 | .append(s[ix + 1]) 85 | .append(']'); 86 | } 87 | builder.append("}"); 88 | return "[CompactRecord " + builder + "]"; 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/cleaners/LowerCaseNormalizeCleaner.java: -------------------------------------------------------------------------------- 1 | 2 | 
package no.priv.garshol.duke.cleaners; 3 | 4 | import java.text.Normalizer; 5 | 6 | import no.priv.garshol.duke.Cleaner; 7 | 8 | /** 9 | * A cleaner which removes leading and trailing whitespace, normalized 10 | * internal whitespace, lowercases all characters, and (by default) 11 | * strips accents. This is the most commonly used cleaner for textual 12 | * data. 13 | */ 14 | public class LowerCaseNormalizeCleaner implements Cleaner { 15 | private boolean strip_accents = true; 16 | 17 | /** 18 | * Controls whether accents are stripped (that is, "é" becomes "e", 19 | * and so on). The default is true. 20 | */ 21 | public void setStripAccents(boolean strip_accents) { 22 | this.strip_accents = strip_accents; 23 | } 24 | 25 | public String clean(String value) { 26 | if (strip_accents) 27 | // after this, accents will be represented as separate combining 28 | // accent characters trailing the character they belong with. the 29 | // next step will strip them out. 30 | value = Normalizer.normalize(value, Normalizer.Form.NFD); 31 | 32 | char[] tmp = new char[value.length()]; 33 | int pos = 0; 34 | boolean prevws = false; 35 | for (int ix = 0; ix < tmp.length; ix++) { 36 | char ch = value.charAt(ix); 37 | 38 | // we make an exception for \u030A (combining ring above) when 39 | // following 'a', because this is a Scandinavian character that 40 | // should *not* be normalized 41 | if (ch == 0x030A && (value.charAt(ix - 1) == 'a' || 42 | value.charAt(ix - 1) == 'A')) { 43 | prevws = false; 44 | // this overwrites the previously written 'a' with 'aa' 45 | tmp[pos - 1] = '\u00E5'; 46 | continue; 47 | } 48 | 49 | // if character is combining diacritical mark, skip it. 
50 | if ((ch >= 0x0300 && ch <= 0x036F) || 51 | (ch >= 0x1DC0 && ch <= 0x1DFF) || 52 | (ch >= 0x20D0 && ch <= 0x20FF) || 53 | (ch >= 0xFE20 && ch <= 0xFE2F)) 54 | continue; 55 | 56 | // whitespace processing 57 | if (ch != ' ' && ch != '\t' && ch != '\n' && ch != '\r' && 58 | ch != 0xA0 /* NBSP */) { 59 | if (prevws && pos != 0) 60 | tmp[pos++] = ' '; 61 | 62 | tmp[pos++] = Character.toLowerCase(ch); 63 | 64 | prevws = false; 65 | } else 66 | prevws = true; 67 | } 68 | return new String(tmp, 0, pos); 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /duke-core/src/test/java/no/priv/garshol/duke/genetic/ActiveLearningTest.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.genetic; 3 | 4 | import java.util.List; 5 | import java.util.ArrayList; 6 | import java.io.File; 7 | import java.io.FileWriter; 8 | import java.io.IOException; 9 | 10 | import org.junit.Test; 11 | import org.junit.Rule; 12 | import org.junit.Before; 13 | import static org.junit.Assert.fail; 14 | import static org.junit.Assert.assertTrue; 15 | import static org.junit.Assert.assertEquals; 16 | import org.junit.rules.TemporaryFolder; 17 | 18 | import no.priv.garshol.duke.Comparator; 19 | import no.priv.garshol.duke.Property; 20 | import no.priv.garshol.duke.PropertyImpl; 21 | import no.priv.garshol.duke.ConfigurationImpl; 22 | import no.priv.garshol.duke.databases.InMemoryDatabase; 23 | import no.priv.garshol.duke.comparators.ExactComparator; 24 | import no.priv.garshol.duke.datasources.Column; 25 | import no.priv.garshol.duke.datasources.CSVDataSource; 26 | 27 | public class ActiveLearningTest { 28 | @Rule 29 | public TemporaryFolder tmpdir = new TemporaryFolder(); 30 | 31 | @Test 32 | public void testSmallData() throws IOException { 33 | File outfile = tmpdir.newFile("test.csv"); 34 | 35 | FileWriter out = new FileWriter(outfile); 36 | out.write("id;name;age\n"); 37 | 
out.write("1;LMG;39\n"); 38 | out.write("2;GOG;40\n"); 39 | out.write("3;GDM;29\n"); 40 | out.write("4;AB;49\n"); 41 | out.close(); 42 | 43 | File tstfile = tmpdir.newFile("testfile.csv"); 44 | out = new FileWriter(tstfile); 45 | out.close(); 46 | 47 | CSVDataSource csv = new CSVDataSource(); 48 | csv.setSeparator(';'); 49 | csv.setInputFile(outfile.getAbsolutePath()); 50 | csv.addColumn(new Column("id", null, null, null)); 51 | csv.addColumn(new Column("name", null, null, null)); 52 | csv.addColumn(new Column("age", null, null, null)); 53 | 54 | ConfigurationImpl cfg = new ConfigurationImpl(); 55 | cfg.addDatabase(new InMemoryDatabase()); 56 | cfg.addDataSource(0, csv); 57 | 58 | Comparator cmp = new ExactComparator(); 59 | 60 | List props = new ArrayList(); 61 | props.add(new PropertyImpl("id")); 62 | props.add(new PropertyImpl("name", cmp, 0.0, 1.0)); 63 | props.add(new PropertyImpl("age", cmp, 0.0, 1.0)); 64 | 65 | cfg.setProperties(props); 66 | 67 | GeneticAlgorithm gen = new GeneticAlgorithm(cfg, tstfile.getAbsolutePath(), 68 | true); 69 | gen.setQuiet(true); 70 | gen.run(); // should not crash! 71 | } 72 | 73 | } 74 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/Property.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke; 3 | 4 | /** 5 | * Represents a property. 6 | */ 7 | public interface Property { 8 | 9 | /** 10 | * Returns the name of the property. 11 | */ 12 | public String getName(); 13 | 14 | /** 15 | * Returns true iff the property is an identifying property. These 16 | * properties are never used for comparisons. 17 | */ 18 | public boolean isIdProperty(); 19 | 20 | // FIXME: should we remove this? 
21 | public boolean isAnalyzedProperty(); 22 | 23 | public Comparator getComparator(); 24 | 25 | public double getHighProbability(); 26 | 27 | public double getLowProbability(); 28 | 29 | public Lookup getLookupBehaviour(); 30 | 31 | /** 32 | * Sets the comparator used for this property. Note that changing 33 | * this while Duke is processing may have unpredictable 34 | * consequences. 35 | */ 36 | public void setComparator(Comparator comparator); 37 | 38 | /** 39 | * Sets the high probability used for this property. Note that 40 | * changing this while Duke is processing may have unpredictable 41 | * consequences. 42 | */ 43 | public void setHighProbability(double high); 44 | 45 | /** 46 | * Sets the low probability used for this property. Note that 47 | * changing this while Duke is processing may have unpredictable 48 | * consequences. 49 | */ 50 | public void setLowProbability(double low); 51 | 52 | /** 53 | * Iff true the property should not be used for comparing records. 54 | */ 55 | public boolean isIgnoreProperty(); 56 | 57 | /** 58 | * Makes Duke skip this property when comparing records. 59 | */ 60 | public void setIgnoreProperty(boolean ignore); 61 | 62 | /** 63 | * Sets the lookup behaviour of this property. 64 | */ 65 | public void setLookupBehaviour(Lookup lookup); 66 | 67 | /** 68 | * Returns the probability that the records v1 and v2 represent the 69 | * same entity, based on high and low probability settings etc. 70 | */ 71 | public double compare(String v1, String v2); 72 | 73 | /** 74 | * Returns a copy of the property. 75 | */ 76 | public Property copy(); 77 | 78 | /** 79 | * The lookup behaviour for this property. 
80 | */ 81 | public enum Lookup { 82 | // means: always look up this property, and require values to match 83 | REQUIRED, 84 | 85 | // always look up this property 86 | TRUE, 87 | 88 | // never look up this property 89 | FALSE, 90 | 91 | // default behaviour (look up if analysis says we should) 92 | DEFAULT 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /duke-server/src/main/java/no/priv/garshol/duke/server/BasicTimer.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.server; 3 | 4 | import java.util.Properties; 5 | 6 | import no.priv.garshol.duke.DukeException; 7 | 8 | /** 9 | * A basic timer implementation that will work in any context, but 10 | * which does not (unfortunately) provide managed threads in more 11 | * advanced servlet containers. For that, look at CommonJTimer. 12 | */ 13 | public class BasicTimer implements DukeTimer, Runnable { 14 | private DukeController controller; 15 | private int check_interval; 16 | private int sleep_interval; 17 | private boolean keep_running; 18 | 19 | // --- Setup 20 | 21 | public BasicTimer() { 22 | this.sleep_interval = 100; // default 23 | } 24 | 25 | // --- DukeTimer implementation 26 | 27 | public void init(Properties props) { 28 | // don't need to do anything 29 | } 30 | 31 | public void spawnThread(DukeController controller, int check_interval) { 32 | if (this.controller != null) 33 | throw new DukeException("Timer thread already running!"); 34 | 35 | this.controller = controller; 36 | this.check_interval = check_interval * 1000; // convert to ms 37 | keep_running = true; 38 | 39 | // spawn away 40 | Thread thread = new Thread(this); 41 | thread.setDaemon(true); 42 | thread.start(); 43 | } 44 | 45 | public boolean isRunning() { 46 | return keep_running; 47 | } 48 | 49 | public void stop() { 50 | controller = null; 51 | keep_running = false; 52 | } 53 | 54 | // --- Runnable implementation 55 
| 56 | public void run() { 57 | while (keep_running) { 58 | try { 59 | // tell controller to do some real work for a change 60 | controller.process(); 61 | 62 | // waiting check_interval ms, while taking sleep_interval ms 63 | // long naps so we can break off faster if the server is shut 64 | // down 65 | long wait_start = System.currentTimeMillis(); 66 | do { 67 | Thread.sleep(sleep_interval); 68 | } while (keep_running && 69 | (System.currentTimeMillis() - wait_start) < check_interval); 70 | 71 | } catch (Throwable e) { 72 | controller.reportError(e); 73 | try { 74 | Thread.sleep(getErrorWaitInteral()); // wait a good while, then retry 75 | } catch (InterruptedException e2) { 76 | } 77 | } 78 | } 79 | controller.reportStopped(); 80 | } 81 | 82 | // --- Internal methods 83 | 84 | private int getErrorWaitInteral() { 85 | return check_interval * 6; 86 | } 87 | 88 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/AbstractCmdlineTool.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke; 3 | 4 | import java.util.Collection; 5 | import java.io.IOException; 6 | 7 | import org.xml.sax.SAXException; 8 | 9 | import no.priv.garshol.duke.utils.CommandLineParser; 10 | 11 | /** 12 | * INTERNAL: Shared code between the simplest command-line tools. 13 | * @since 1.2 14 | */ 15 | public abstract class AbstractCmdlineTool { 16 | protected Database database; 17 | protected Configuration config; 18 | protected CommandLineParser parser; 19 | private static final int DEFAULT_BATCH_SIZE = 40000; 20 | 21 | /** 22 | * These exact lines are shared between three different tools, so 23 | * they have been moved here to reduce code duplication. 24 | * @return The parsed command-line, with options removed. 
25 | */ 26 | public String[] init(String[] argv, int min, int max, 27 | Collection options) 28 | throws IOException, SAXException { 29 | // parse command line 30 | parser = new CommandLineParser(); 31 | parser.setMinimumArguments(min); 32 | parser.setMaximumArguments(max); 33 | parser.registerOption(new CommandLineParser.BooleanOption("reindex", 'I')); 34 | if (options != null) 35 | for (CommandLineParser.Option option : options) 36 | parser.registerOption(option); 37 | 38 | try { 39 | argv = parser.parse(argv); 40 | } catch (CommandLineParser.CommandLineParserException e) { 41 | System.err.println("ERROR: " + e.getMessage()); 42 | usage(); 43 | System.exit(1); 44 | } 45 | 46 | // do we need to reindex? 47 | boolean reindex = parser.getOptionState("reindex"); 48 | 49 | // load configuration 50 | config = ConfigLoader.load(argv[0]); 51 | database = config.getDatabase(reindex); // overwrite iff reindex 52 | if (database.isInMemory()) 53 | reindex = true; // no other way to do it in this case 54 | 55 | // reindex, if requested 56 | if (reindex) 57 | reindex(config, database); 58 | 59 | return argv; 60 | } 61 | 62 | protected abstract void usage(); 63 | 64 | private static void reindex(Configuration config, Database database) { 65 | System.out.println("Reindexing all records..."); 66 | Processor processor = new Processor(config, database); 67 | if (config.isDeduplicationMode()) 68 | processor.index(config.getDataSources(), DEFAULT_BATCH_SIZE); 69 | else { 70 | processor.index(config.getDataSources(1), DEFAULT_BATCH_SIZE); 71 | processor.index(config.getDataSources(2), DEFAULT_BATCH_SIZE); 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/comparators/SoundexComparator.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.comparators; 3 | 4 | import no.priv.garshol.duke.Comparator; 5 | 6 | /** 7 | * 
An implementation of the Soundex algorithm, and a comparator which 8 | * considers strings to have a score of 0.9 if their Soundex values 9 | * match. 10 | */ 11 | public class SoundexComparator implements Comparator { 12 | // this table is keyed 0-25 (for 'a' to 'z') to the numeric value to put 13 | // in the key. 0 means the letter is to be omitted. 14 | private static char[] number = buildTable(); 15 | 16 | public double compare(String s1, String s2) { 17 | if (s1.equals(s2)) 18 | return 1.0; 19 | 20 | if (soundex(s1).equals(soundex(s2))) 21 | return 0.9; 22 | 23 | return 0.0; 24 | } 25 | 26 | public boolean isTokenized() { 27 | return true; // I guess? 28 | } 29 | 30 | /** 31 | * Produces the Soundex key for the given string. 32 | */ 33 | public static String soundex(String str) { 34 | if (str.length() < 1) 35 | return ""; // no soundex key for the empty string (could use 000) 36 | 37 | char[] key = new char[4]; 38 | key[0] = str.charAt(0); 39 | int pos = 1; 40 | char prev = '0'; 41 | for (int ix = 1; ix < str.length() && pos < 4; ix++) { 42 | char ch = str.charAt(ix); 43 | int charno; 44 | if (ch >= 'A' && ch <= 'Z') 45 | charno = ch - 'A'; 46 | else if (ch >= 'a' && ch <= 'z') 47 | charno = ch - 'a'; 48 | else 49 | continue; 50 | 51 | if (number[charno] != '0' && number[charno] != prev) 52 | key[pos++] = number[charno]; 53 | prev = number[charno]; 54 | } 55 | 56 | for ( ; pos < 4; pos++) 57 | key[pos] = '0'; 58 | 59 | return new String(key); 60 | } 61 | 62 | /** 63 | * Builds the mapping table. 
64 | */ 65 | private static char[] buildTable() { 66 | char[] table = new char[26]; 67 | for (int ix = 0; ix < table.length; ix++) 68 | table[ix] = '0'; 69 | table['B' - 'A'] = '1'; 70 | table['P' - 'A'] = '1'; 71 | table['F' - 'A'] = '1'; 72 | table['V' - 'A'] = '1'; 73 | table['C' - 'A'] = '2'; 74 | table['S' - 'A'] = '2'; 75 | table['K' - 'A'] = '2'; 76 | table['G' - 'A'] = '2'; 77 | table['J' - 'A'] = '2'; 78 | table['Q' - 'A'] = '2'; 79 | table['X' - 'A'] = '2'; 80 | table['Z' - 'A'] = '2'; 81 | table['D' - 'A'] = '3'; 82 | table['T' - 'A'] = '3'; 83 | table['L' - 'A'] = '4'; 84 | table['M' - 'A'] = '5'; 85 | table['N' - 'A'] = '5'; 86 | table['R' - 'A'] = '6'; 87 | return table; 88 | } 89 | } -------------------------------------------------------------------------------- /duke-core/src/main/java/no/priv/garshol/duke/datasources/ColumnarDataSource.java: -------------------------------------------------------------------------------- 1 | 2 | package no.priv.garshol.duke.datasources; 3 | 4 | import java.util.ArrayList; 5 | import java.util.Collection; 6 | import java.util.HashMap; 7 | import java.util.Map; 8 | 9 | import no.priv.garshol.duke.ConfigWriter; 10 | import no.priv.garshol.duke.DataSource; 11 | import no.priv.garshol.duke.DukeConfigException; 12 | import no.priv.garshol.duke.Logger; 13 | import org.xml.sax.helpers.AttributeListImpl; 14 | 15 | /** 16 | * Abstract class for sharing code that is common to column-based data 17 | * sources. 
18 | */ 19 | public abstract class ColumnarDataSource implements DataSource { 20 | protected Map> columns; 21 | protected Logger logger; 22 | 23 | public ColumnarDataSource() { 24 | this.columns = new HashMap(); 25 | } 26 | 27 | public void addColumn(Column column) { 28 | Collection cols = columns.get(column.getName()); 29 | if (cols == null) { 30 | cols = new ArrayList(); 31 | columns.put(column.getName(), cols); 32 | } 33 | cols.add(column); 34 | } 35 | 36 | public Collection getColumn(String name) { 37 | return columns.get(name); 38 | } 39 | 40 | public Collection getColumns() { 41 | Collection all = new ArrayList(columns.size()); 42 | for (Collection col : columns.values()) 43 | all.addAll(col); 44 | return all; 45 | } 46 | 47 | public void setLogger(Logger logger) { 48 | this.logger = logger; 49 | } 50 | 51 | protected abstract String getSourceName(); 52 | 53 | protected void verifyProperty(String value, String name) { 54 | if (value == null) 55 | throw new DukeConfigException("Missing '" + name + "' property to " + 56 | getSourceName() + " data source"); 57 | } 58 | 59 | protected void writeColumnsConfig(ConfigWriter cw) { 60 | // FIXME: this breaks the order... 61 | for (Column col : getColumns()) { 62 | AttributeListImpl atts = new AttributeListImpl(); 63 | atts.addAttribute("name", "CDATA", col.getName()); 64 | atts.addAttribute("property", "CDATA", col.getProperty()); 65 | if (col.getPrefix() != null) 66 | atts.addAttribute("prefix", "CDATA", col.getPrefix()); 67 | // FIXME: cleaner really requires object support ... 
/**
 * Small string helpers: character replacement, whitespace
 * normalization, and splitting/joining on single spaces.
 */
public class StringUtils {

  /**
   * Returns a copy of value in which every character that occurs in
   * chars has been replaced by the replacement character.
   * @param value the string to replace characters in
   * @param chars the characters to replace
   * @param replacement the character to insert as replacement
   */
  public static String replaceAnyOf(String value, String chars,
                                    char replacement) {
    char[] result = value.toCharArray();
    for (int i = 0; i < result.length; i++)
      if (chars.indexOf(result[i]) >= 0)
        result[i] = replacement;
    return new String(result);
  }

  /**
   * Strips whitespace from both ends of the value, and collapses each
   * internal run of whitespace (space, tab, newline, carriage return)
   * into one space.
   */
  public static String normalizeWS(String value) {
    StringBuilder out = new StringBuilder(value.length());
    boolean gapPending = false;
    for (int i = 0; i < value.length(); i++) {
      char c = value.charAt(i);
      if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
        gapPending = true;
        continue;
      }

      // a gap before the very first character is leading whitespace,
      // and is dropped
      if (gapPending && out.length() > 0)
        out.append(' ');
      out.append(c);
      gapPending = false;
    }
    return out.toString();
  }

  /**
   * Splits the string on space characters, returning the non-empty
   * pieces in order. Only ' ' counts as a separator.
   */
  public static String[] split(String str) {
    int length = str.length();
    // tokens need a separator between them, so there can be at most
    // this many
    String[] found = new String[length / 2 + 1];
    int count = 0;
    int pos = 0;
    while (pos < length) {
      // skip the separators in front of the next token
      while (pos < length && str.charAt(pos) == ' ')
        pos++;

      int begin = pos;
      while (pos < length && str.charAt(pos) != ' ')
        pos++;

      if (pos > begin)
        found[count++] = str.substring(begin, pos);
    }

    String[] result = new String[count];
    System.arraycopy(found, 0, result, 0, count);
    return result;
  }

  /**
   * Concatenates the pieces, with one space between neighbours.
   */
  public static String join(String[] pieces) {
    StringBuilder joined = new StringBuilder();
    String separator = "";
    for (String piece : pieces) {
      joined.append(separator).append(piece);
      separator = " ";
    }
    return joined.toString();
  }
}
12 | * The default is for it to return the contents of group number 1. 13 | * 14 | * Can also discard the matching group and return rest of string. 15 | * @since 0.5 16 | */ 17 | public class RegexpCleaner implements Cleaner { 18 | private Pattern regexp; 19 | private int groupno; 20 | // if true, discard group, otherwise keep only group. default: false 21 | private boolean discard; 22 | // if true, discard all result of group. default: false 23 | private boolean discardAllGroup; 24 | 25 | public RegexpCleaner() { 26 | this.groupno = 1; // default 27 | } 28 | 29 | public String clean(String value) { 30 | if (value == null || value.length() == 0) 31 | return null; 32 | 33 | Matcher matcher = regexp.matcher(value); 34 | if (!discard && !discardAllGroup) { 35 | if (!matcher.find()) 36 | return null; 37 | return matcher.group(groupno); 38 | } else { 39 | if (!matcher.find()) 40 | return value; 41 | else { 42 | StringBuilder discardBuilder = new StringBuilder(value); 43 | discardBuilder.delete(matcher.start(groupno), matcher.end(groupno)); 44 | if (discardAllGroup) { 45 | Matcher discardAllMatcher = regexp.matcher(discardBuilder); 46 | while(discardAllMatcher.find()) { 47 | discardBuilder.delete(discardAllMatcher.start(groupno), discardAllMatcher.end(groupno)); 48 | discardAllMatcher.reset(); 49 | } 50 | } 51 | return discardBuilder.toString(); 52 | } 53 | } 54 | } 55 | 56 | public void setRegexp(String regexp) { 57 | this.regexp = Pattern.compile(regexp); 58 | } 59 | 60 | /** 61 | * The group in the pattern to keep or discard 62 | * @param groupno 63 | */ 64 | public void setGroup(int groupno) { 65 | this.groupno = groupno; 66 | } 67 | 68 | /** 69 | * If true, discards the first occurrence of matching {@code group} instead of keeping it. 
70 | * @param discard 71 | */ 72 | public void setDiscardGroup(boolean discard) { 73 | this.discard = discard; 74 | } 75 | 76 | /** 77 | * If true, discards all results of matching {@code group} 78 | * @param discardAllGroup 79 | */ 80 | public void setDiscardAllGroup(boolean discardAllGroup) { 81 | this.discardAllGroup = discardAllGroup; 82 | } 83 | } -------------------------------------------------------------------------------- /duke-dist/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | no.priv.garshol.duke 6 | duke 7 | 1.4-SNAPSHOT 8 | ../ 9 | 10 | duke-dist 11 | pom 12 | 13 | 19 | 20 | 21 | no.priv.garshol.duke 22 | duke-core 23 | 24 | 25 | no.priv.garshol.duke 26 | duke-mapdb 27 | 28 | 29 | no.priv.garshol.duke 30 | duke-lucene 31 | 32 | 33 | no.priv.garshol.duke 34 | duke-server 35 | 36 | 37 | no.priv.garshol.duke 38 | duke-mongodb 39 | 40 | 41 | no.priv.garshol.duke 42 | duke-json 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | maven-assembly-plugin 51 | 52 | 53 | distro-assembly 54 | package 55 | 56 | single 57 | 58 | 59 | 60 | src/main/assembly/dep.xml 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | --------------------------------------------------------------------------------