The main Duke API is here.
3 |
4 |
--------------------------------------------------------------------------------
/duke-core/src/test/resources/sparql-empty.xml:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/comparators/package.html:
--------------------------------------------------------------------------------
1 |
This package contains Duke's built-in implementations of the Comparator interface.
4 |
5 |
--------------------------------------------------------------------------------
/duke-es/src/main/java/no/priv/garshol/duke/databases/es/StorageType.java:
--------------------------------------------------------------------------------
1 | package no.priv.garshol.duke.databases.es;
2 |
/**
 * The storage options known to this module: {@link #MEMORY} and
 * {@link #DISK}.
 * NOTE(review): presumably selects where the Elasticsearch-backed
 * database keeps its data; confirm against the code that reads it.
 */
public enum StorageType {
  MEMORY,
  DISK
}
6 |
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/cleaners/package.html:
--------------------------------------------------------------------------------
1 |
2 |
Duke's built-in cleaners, plus utility classes for building your
3 | own cleaners.
4 |
5 |
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/genetic/package.html:
--------------------------------------------------------------------------------
1 |
2 |
This package contains the classes supporting Duke's genetic algorithm,
3 | such as the aspects of a configuration that the algorithm may change
4 | and the oracle that says whether a given match is correct.
5 |
6 |
--------------------------------------------------------------------------------
/duke-server/src/main/java/no/priv/garshol/duke/server/package.html:
--------------------------------------------------------------------------------
1 |
2 |
Contains classes for running Duke as an app in a servlet container,
3 | allowing it to incrementally process new and changed data as it
4 | arrives from a remote service.
5 |
6 |
--------------------------------------------------------------------------------
/duke-core/src/test/resources/config-custom-comparator.xml:
--------------------------------------------------------------------------------
1 |
2 |
4 | 0.89
5 |
6 |
--------------------------------------------------------------------------------
/duke-core/src/test/resources/sparql-bnode.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | r2
8 |
9 |
10 |
--------------------------------------------------------------------------------
/duke-core/src/test/resources/sparql-onerow.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | 1
8 |
9 |
10 |
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/StatementHandler.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke;
3 |
/**
 * Callback interface which is handed each parsed statement.
 */
public interface StatementHandler {

  /**
   * Receives one parsed statement.
   *
   * @param subject the subject of the statement
   * @param property the property of the statement
   * @param object the object of the statement
   * @param literal NOTE(review): presumably true when the object is a
   *        literal value rather than a reference; confirm against the
   *        parser that calls this
   */
  void statement(String subject, String property, String object,
                 boolean literal);
}
11 |
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/Cleaner.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke;
3 |
/**
 * A function that normalizes a value so that it is suitable for
 * comparison.
 */
public interface Cleaner {

  /**
   * Cleans a single value.
   *
   * @param value the value to clean
   * @return the cleaned value
   */
  String clean(String value);

}
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/DukeConfigException.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke;
3 |
/**
 * Thrown when there is an error in the configuration of Duke.
 */
public class DukeConfigException extends RuntimeException {

  /**
   * Creates an exception carrying only a description of the problem.
   *
   * @param message description of the configuration error
   */
  public DukeConfigException(String message) {
    super(message);
  }

  /**
   * Creates an exception which also records the underlying failure,
   * so that the original stack trace is not lost when a lower-level
   * error is reported as a configuration problem.
   *
   * @param message description of the configuration error
   * @param cause the exception that triggered this one; may be null
   */
  public DukeConfigException(String message, Throwable cause) {
    super(message, cause);
  }

}
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/databases/KeyFunction.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.databases;
3 |
4 | import no.priv.garshol.duke.Record;
5 |
/**
 * Produces a blocking key from a record.
 * @since 1.2
 */
public interface KeyFunction {

  /**
   * Builds the blocking key for the given record.
   *
   * @param record the record to derive a key from
   * @return the key for the record
   */
  String makeKey(Record record);

}
15 |
--------------------------------------------------------------------------------
/duke-core/src/test/resources/config-default-probs.xml:
--------------------------------------------------------------------------------
1 |
4 |
5 |
6 | 0.89
7 |
8 |
9 | FIRSTNAME
10 | no.priv.garshol.duke.comparators.JaroWinklerTokenized
11 |
12 |
13 |
--------------------------------------------------------------------------------
/duke-core/src/test/resources/sparql-onerow2col.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 | 1
9 | http://example.org
10 |
11 |
12 |
--------------------------------------------------------------------------------
/duke-core/src/test/java/no/priv/garshol/duke/test/InMemoryClassDatabaseTest.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.test;
3 |
4 | import no.priv.garshol.duke.InMemoryClassDatabase;
5 | import no.priv.garshol.duke.EquivalenceClassDatabase;
6 |
7 | public class InMemoryClassDatabaseTest extends ClassDatabaseTest {
8 |
9 | public EquivalenceClassDatabase createDatabase() {
10 | return new InMemoryClassDatabase();
11 | }
12 |
13 | }
--------------------------------------------------------------------------------
/duke-core/src/test/java/no/priv/garshol/duke/matchers/InMemoryLinkDatabaseMatchListenerTest.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.matchers;
3 |
4 | import no.priv.garshol.duke.InMemoryLinkDatabase;
5 | import no.priv.garshol.duke.LinkDatabase;
6 |
7 | public class InMemoryLinkDatabaseMatchListenerTest
8 | extends LinkDatabaseMatchListenerTest {
9 |
10 | protected LinkDatabase makeDatabase() {
11 | return new InMemoryLinkDatabase();
12 | }
13 |
14 | }
--------------------------------------------------------------------------------
/duke-core/src/test/java/no/priv/garshol/duke/databases/InMemoryDatabaseTest.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.databases;
3 |
4 | import no.priv.garshol.duke.Configuration;
5 | import no.priv.garshol.duke.Database;
6 |
7 | public class InMemoryDatabaseTest extends DatabaseTest {
8 |
9 | public Database createDatabase(Configuration config) {
10 | Database db = new InMemoryDatabase();
11 | db.setConfiguration(config);
12 | return db;
13 | }
14 |
15 | }
--------------------------------------------------------------------------------
/duke-core/src/test/java/no/priv/garshol/duke/databases/KeyValueDatabaseTest.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.databases;
3 |
4 | import no.priv.garshol.duke.Configuration;
5 | import no.priv.garshol.duke.Database;
6 |
7 | public class KeyValueDatabaseTest extends DatabaseTest {
8 |
9 | public Database createDatabase(Configuration config) {
10 | Database db = new KeyValueDatabase();
11 | db.setConfiguration(config);
12 | return db;
13 | }
14 |
15 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/LinkSource.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke;
3 |
4 | import java.util.Collection;
5 |
/**
 * Experimental interface for pulling link information in from outside
 * sources so that it can be used inside the Duke processing. The
 * links are intended to feed into a LinkDatabase.
 */
public interface LinkSource {

  /**
   * Returns the links the source knows about. The element type of
   * the collection is not specified by this declaration.
   */
  Collection getLinks();

}
--------------------------------------------------------------------------------
/duke-core/src/main/resources/no/priv/garshol/duke/name-mappings.txt:
--------------------------------------------------------------------------------
1 | al,albert
2 | ben,benjamin
3 | dan,daniel
4 | danny,daniel
5 | dave,david
6 | deb,deborah
7 | debbie,deborah
8 | greg,gregory
9 | jim,james
10 | joe,joseph
11 | josh,joshua
12 | matt,matthew
13 | mike,michael
14 | norm,norman
15 | rich,richard
16 | richie,richard
17 | rick,richard
18 | rob,robert
19 | robbie,robert
20 | robby,robert
21 | sam,samuel
22 | sammy,samuel
23 | tim,timothy
24 | tony,anthony
25 | wes,wesley
26 |
--------------------------------------------------------------------------------
/duke-core/src/test/resources/config-no-comparator.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | 0.89
4 |
5 |
6 | FIRSTNAME
7 | no.priv.garshol.duke.comparators.JaroWinklerTokenized
8 | 0.48
9 | 0.6
10 |
11 |
12 |
13 | LASTNAME
14 | 0.48
15 | 0.6
16 |
17 |
18 |
--------------------------------------------------------------------------------
/duke-lucene/src/test/java/no/priv/garshol/duke/databases/LuceneDatabaseTest.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.databases;
3 |
4 | import no.priv.garshol.duke.Configuration;
5 | import no.priv.garshol.duke.Database;
6 |
7 | public class LuceneDatabaseTest extends DatabaseTest {
8 |
9 | public Database createDatabase(Configuration config) {
10 | Database db = new LuceneDatabase();
11 | db.setOverwrite(true);
12 | db.setConfiguration(config);
13 | return db;
14 | }
15 |
16 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/cleaners/TrimCleaner.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.cleaners;
3 |
4 | import no.priv.garshol.duke.Cleaner;
5 |
6 | /**
7 | * A cleaner which removes leading and trailing whitespace, without
8 | * making any other changes.
9 | */
10 | public class TrimCleaner implements Cleaner {
11 |
12 | public String clean(String value) {
13 | value = value.trim();
14 | if (value.equals(""))
15 | return null;
16 | return value;
17 | }
18 |
19 | }
--------------------------------------------------------------------------------
/duke-core/src/test/java/no/priv/garshol/duke/JDBCClassDatabaseTest.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke;
3 |
4 | import java.util.Properties;
5 |
6 | import no.priv.garshol.duke.test.ClassDatabaseTest;
7 |
8 | public class JDBCClassDatabaseTest extends ClassDatabaseTest {
9 |
10 | public EquivalenceClassDatabase createDatabase() {
11 | return new JDBCEquivalenceClassDatabase("org.h2.Driver", "jdbc:h2:mem:",
12 | "h2", new Properties());
13 | }
14 |
15 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/DukeException.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke;
3 |
/**
 * Signals that something has gone wrong during Duke processing.
 */
public class DukeException extends RuntimeException {

  /**
   * Wraps an underlying failure without adding a message of its own.
   */
  public DukeException(Throwable e) {
    super(e);
  }

  /**
   * Creates an exception with a message and the underlying failure.
   */
  public DukeException(String msg, Throwable e) {
    super(msg, e);
  }

  /**
   * Creates an exception carrying only a message.
   */
  public DukeException(String msg) {
    super(msg);
  }

}
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/genetic/Oracle.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.genetic;
3 |
4 | import no.priv.garshol.duke.LinkKind;
5 |
6 | /**
7 | * An oracle can say whether a given match is correct or not.
8 | */
9 | public interface Oracle {
10 |
11 | /**
12 | * Asks the oracle whether the two IDs represent the same thing or
13 | * not, and returns the answer. MAYBESAME means we don't know.
14 | */
15 | public LinkKind getLinkKind(String id1, String id2);
16 |
17 | }
18 |
--------------------------------------------------------------------------------
/duke-core/src/test/resources/config-custom-estimator.xml:
--------------------------------------------------------------------------------
1 |
2 |
6 |
7 |
11 |
12 |
13 | 0.89
14 |
15 |
16 |
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/ModifiableRecord.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke;
3 |
4 | /**
5 | * Extended Record interface with support for modification. Mainly
6 | * used by RecordBuilder.
7 | * @since 1.2
8 | */
9 | public interface ModifiableRecord extends Record {
10 |
11 | /**
12 | * Adds a new value to the record.
13 | */
14 | public void addValue(String property, String value);
15 |
16 | /**
17 | * Returns true iff the record has no values.
18 | */
19 | public boolean isEmpty();
20 | }
--------------------------------------------------------------------------------
/duke-core/src/test/resources/sparql-tworow2col.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 | 1
9 | http://example.org
10 |
11 |
12 | 2
13 | http://example.com
14 |
15 |
16 |
--------------------------------------------------------------------------------
/duke-core/src/test/resources/config-no-object.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | 0.89
6 |
7 |
8 | FIRSTNAME
9 | no.priv.garshol.duke.comparators.JaroWinklerTokenized
10 | 0.48
11 | 0.6
12 |
13 |
14 |
15 | LASTNAME
16 | 0.48
17 | 0.6
18 |
19 |
20 |
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/comparators/ExactComparator.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.comparators;
3 |
4 | import no.priv.garshol.duke.Comparator;
5 |
6 | /**
7 | * Comparator which compares two values exactly. It returns 1.0 if
8 | * they are equal, and 0.0 if they are different.
9 | */
10 | public class ExactComparator implements Comparator {
11 |
12 | public boolean isTokenized() {
13 | return false;
14 | }
15 |
16 | public double compare(String v1, String v2) {
17 | return v1.equals(v2) ? 1.0 : 0.0;
18 | }
19 |
20 | }
--------------------------------------------------------------------------------
/duke-core/src/test/resources/config-lookup.xml:
--------------------------------------------------------------------------------
1 |
4 |
5 |
6 | 0.89
7 |
8 |
9 | FIRSTNAME
10 | no.priv.garshol.duke.comparators.JaroWinklerTokenized
11 |
12 |
13 |
14 | LASTNAME
15 | no.priv.garshol.duke.comparators.JaroWinklerTokenized
16 |
17 |
18 |
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/Comparator.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke;
3 |
/**
 * An operator which measures how similar two values are, expressed
 * as a number in the range 0.0 to 1.0.
 */
public interface Comparator {

  /**
   * Returns true if the comparator breaks string values up into
   * tokens when comparing. This is needed because it impacts the
   * indexing of values.
   */
  boolean isTokenized();

  /**
   * Compares the two values.
   *
   * @param v1 the first value
   * @param v2 the second value
   * @return the degree of similarity, from 0.0 to 1.0
   */
  double compare(String v1, String v2);

}
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/comparators/DifferentComparator.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.comparators;
3 |
4 | import no.priv.garshol.duke.Comparator;
5 |
6 | /**
7 | * A comparator which returns 0.0 if two values are exactly equal, and
8 | * 1.0 if they are different. The inverse of ExactComparator.
9 | */
10 | public class DifferentComparator implements Comparator {
11 |
12 | public boolean isTokenized() {
13 | return false;
14 | }
15 |
16 | public double compare(String v1, String v2) {
17 | return v1.equals(v2) ? 0.0 : 1.0;
18 | }
19 |
20 | }
--------------------------------------------------------------------------------
/duke-lucene/src/test/java/no/priv/garshol/duke/databases/PersistentLuceneDatabaseTest.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.databases;
3 |
4 | import no.priv.garshol.duke.Configuration;
5 | import no.priv.garshol.duke.Database;
6 |
7 | public class PersistentLuceneDatabaseTest extends PersistentDatabaseTest {
8 |
9 | public Database createDatabase(Configuration config) {
10 | LuceneDatabase db = new LuceneDatabase();
11 | db.setOverwrite(false);
12 | db.setConfiguration(config);
13 | db.setPath(tmpdir.getRoot().getAbsolutePath());
14 | return db;
15 | }
16 |
17 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/utils/DefaultRecordIterator.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.utils;
3 |
4 | import java.util.Iterator;
5 |
6 | import no.priv.garshol.duke.Record;
7 | import no.priv.garshol.duke.RecordIterator;
8 |
9 | public class DefaultRecordIterator extends RecordIterator {
10 | private Iterator it;
11 |
12 | public DefaultRecordIterator(Iterator it) {
13 | this.it = it;
14 | }
15 |
16 | public boolean hasNext() {
17 | return it.hasNext();
18 | }
19 |
20 | public Record next() {
21 | return it.next();
22 | }
23 | }
--------------------------------------------------------------------------------
/duke-es/src/test/resources/config-database.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | 0.89
5 |
6 |
7 | FIRSTNAME
8 | no.priv.garshol.duke.comparators.JaroWinklerTokenized
9 | 0.48
10 | 0.6
11 |
12 |
13 |
14 | LASTNAME
15 | 0.48
16 | 0.6
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
--------------------------------------------------------------------------------
/duke-lucene/src/test/resources/config-database.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | 0.89
5 |
6 |
7 | FIRSTNAME
8 | no.priv.garshol.duke.comparators.JaroWinklerTokenized
9 | 0.48
10 | 0.6
11 |
12 |
13 |
14 | LASTNAME
15 | 0.48
16 | 0.6
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/JNDILinkDatabase.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke;
3 |
4 | import no.priv.garshol.duke.utils.JDBCUtils;
5 |
6 | /**
7 | * A link database that gets its connection via JNDI lookup.
8 | */
9 | public class JNDILinkDatabase extends RDBMSLinkDatabase {
10 | private String jndipath;
11 |
12 | public JNDILinkDatabase(String jndipath, String dbtype) {
13 | super(dbtype);
14 | this.jndipath = jndipath;
15 | this.stmt = JDBCUtils.open(jndipath);
16 | }
17 |
18 | public void validateConnection() {
19 | if (stmt != null && !JDBCUtils.validate(stmt))
20 | stmt = JDBCUtils.open(jndipath);
21 | }
22 |
23 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/cleaners/DigitsOnlyCleaner.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.cleaners;
3 |
4 | import no.priv.garshol.duke.Cleaner;
5 |
6 | /**
7 | * Cleaner which removes all characters except the digits 0-9.
8 | */
9 | public class DigitsOnlyCleaner implements Cleaner {
10 |
11 | public String clean(String value) {
12 | char[] tmp = new char[value.length()];
13 | int pos = 0;
14 | for (int ix = 0; ix < tmp.length; ix++) {
15 | char ch = value.charAt(ix);
16 | if (ch >= '0' && ch <= '9')
17 | tmp[pos++] = ch;
18 | }
19 | if (pos == 0)
20 | return null;
21 | return new String(tmp, 0, pos);
22 | }
23 |
24 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/genetic/Aspect.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.genetic;
3 |
4 | /**
5 | * Represents an aspect of a Configuration that might be changed by
6 | * the genetic algorithm.
7 | */
8 | public abstract class Aspect {
9 |
10 | /**
11 | * Randomly modify this aspect of the configuration.
12 | */
13 | public abstract void setRandomly(GeneticConfiguration config);
14 |
15 | /**
16 | * Set this aspect of the configuration to be the same as that of
17 | * the other configuration.
18 | */
19 | public abstract void setFromOther(GeneticConfiguration config,
20 | GeneticConfiguration other);
21 | }
22 |
--------------------------------------------------------------------------------
/duke-core/src/test/resources/sparql-tworow2col-inconsistent.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
10 |
11 | 1
12 | http://example.org
13 |
14 |
15 | http://example.com
16 | 2
17 |
18 |
19 |
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/examples/CountryNameCleaner.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.examples;
3 |
4 | import no.priv.garshol.duke.Cleaner;
5 | import no.priv.garshol.duke.cleaners.LowerCaseNormalizeCleaner;
6 |
7 | public class CountryNameCleaner implements Cleaner {
8 | private LowerCaseNormalizeCleaner sub;
9 |
10 | public CountryNameCleaner() {
11 | this.sub = new LowerCaseNormalizeCleaner();
12 | }
13 |
14 | public String clean(String value) {
15 | // do basic cleaning
16 | value = sub.clean(value);
17 | if (value == null || value.equals(""))
18 | return "";
19 |
20 | // do our stuff
21 | if (value.startsWith("the "))
22 | value = value.substring(4);
23 |
24 | return value;
25 | }
26 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/examples/CapitalCleaner.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.examples;
3 |
4 | import no.priv.garshol.duke.Cleaner;
5 | import no.priv.garshol.duke.cleaners.LowerCaseNormalizeCleaner;
6 |
7 | public class CapitalCleaner implements Cleaner {
8 | private LowerCaseNormalizeCleaner sub;
9 |
10 | public CapitalCleaner() {
11 | this.sub = new LowerCaseNormalizeCleaner();
12 | }
13 |
14 | public String clean(String value) {
15 | // do basic cleaning
16 | value = sub.clean(value);
17 | if (value == null || value.equals(""))
18 | return "";
19 |
20 | // do our stuff
21 | int ix = value.indexOf(',');
22 | if (ix != -1)
23 | value = value.substring(0, ix);
24 |
25 | return value;
26 | }
27 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/genetic/Pair.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.genetic;
3 |
4 | /**
5 | * Represents a pair of records.
6 | */
7 | public class Pair {
8 | public String id1;
9 | public String id2;
10 | public int counter;
11 | public boolean[] believers; // which configurations think this pair is correct
12 |
13 | public Pair(String id1, String id2) {
14 | this.id1 = id1;
15 | this.id2 = id2;
16 | }
17 |
18 | public boolean equals(Object other) {
19 | if (!(other instanceof Pair))
20 | return false;
21 |
22 | Pair opair = (Pair) other;
23 | return opair.id1.equals(id1) && opair.id2.equals(id2);
24 | }
25 |
26 | public int hashCode() {
27 | return id1.hashCode() + id2.hashCode();
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/duke-core/src/test/java/no/priv/garshol/duke/cleaners/PersonNameCleanerTest.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.cleaners;
3 |
4 | import org.junit.Before;
5 | import org.junit.Test;
6 |
7 | import static junit.framework.Assert.assertEquals;
8 |
9 | public class PersonNameCleanerTest extends LowerCaseNormalizeCleanerTest {
10 |
11 | @Before
12 | public void setUp() {
13 | cleaner = new PersonNameCleaner();
14 | }
15 |
16 | @Test
17 | public void testMapping() {
18 | assertEquals("joseph stalin",
19 | cleaner.clean("Joe Stalin"));
20 | }
21 |
22 | @Test
23 | public void testMappingEmpty() {
24 | assertEquals("", cleaner.clean(""));
25 | }
26 |
27 | // @Test
28 | // public void testMappingNull() {
29 | // assertEquals(null, cleaner.clean(null));
30 | // }
31 | }
--------------------------------------------------------------------------------
/duke-core/src/test/java/no/priv/garshol/duke/cleaners/TrimCleanerTest.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.cleaners;
3 |
4 | import org.junit.Before;
5 | import org.junit.Test;
6 |
7 | import static junit.framework.Assert.assertEquals;
8 |
9 | public class TrimCleanerTest {
10 | private TrimCleaner cleaner;
11 |
12 | @Before
13 | public void setup() {
14 | cleaner = new TrimCleaner();
15 | }
16 |
17 | @Test
18 | public void testEmpty() {
19 | test("", null);
20 | }
21 |
22 | @Test
23 | public void testOnlyDigits() {
24 | test("314", "314");
25 | }
26 |
27 | @Test
28 | public void testDigitsAndSpaces() {
29 | test(" 3 1 4 ", "3 1 4");
30 | }
31 |
32 | private void test(String value, String result) {
33 | assertEquals(result, cleaner.clean(value));
34 | }
35 | }
--------------------------------------------------------------------------------
/duke-core/src/test/java/no/priv/garshol/duke/databases/PriorityQueueTest.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.databases;
3 |
4 | import org.junit.Test;
5 |
6 | import static junit.framework.Assert.assertEquals;
7 |
8 | public class PriorityQueueTest {
9 |
10 | @Test
11 | public void test100() {
12 | KeyValueDatabase.Score scores[] = new KeyValueDatabase.Score[100];
13 | for (int ix = 0; ix < scores.length; ix++) {
14 | scores[ix] = new KeyValueDatabase.Score(ix);
15 | scores[ix].score = (double) ix;
16 | }
17 | KeyValueDatabase.PriorityQueue pq =
18 | new KeyValueDatabase.PriorityQueue(scores);
19 |
20 | for (int ix = 0; ix < scores.length; ix++) {
21 | KeyValueDatabase.Score score = pq.next();
22 | assertEquals((99 - ix), (int) score.score);
23 | }
24 | }
25 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/genetic/ThresholdAspect.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.genetic;
3 |
4 | import no.priv.garshol.duke.Configuration;
5 |
6 | /**
7 | * Sets the threshold.
8 | */
9 | public class ThresholdAspect extends FloatAspect {
10 |
11 | public void setRandomly(GeneticConfiguration cfg) {
12 | Configuration config = cfg.getConfiguration();
13 | double new_value = drift(config.getThreshold(), 1.0, 0.0);
14 | config.setThreshold(new_value);
15 | }
16 |
17 | public void setFromOther(GeneticConfiguration cfg1,
18 | GeneticConfiguration cfg2) {
19 | Configuration config = cfg1.getConfiguration();
20 | Configuration other = cfg2.getConfiguration();
21 |
22 | config.setThreshold(other.getThreshold());
23 | }
24 |
25 |
26 | }
27 |
--------------------------------------------------------------------------------
/duke-lucene/src/test/java/no/priv/garshol/duke/databases/LuceneConfigLoaderTest.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.databases;
3 |
4 | import java.io.IOException;
5 |
6 | import no.priv.garshol.duke.ConfigLoader;
7 | import no.priv.garshol.duke.Configuration;
8 | import no.priv.garshol.duke.Database;
9 | import org.junit.Test;
10 | import org.xml.sax.SAXException;
11 |
12 | import static junit.framework.Assert.assertEquals;
13 |
14 | public class LuceneConfigLoaderTest {
15 |
16 | @Test
17 | public void testDatabase() throws IOException, SAXException {
18 | Configuration config = ConfigLoader.load("classpath:config-database.xml");
19 | Database db = config.getDatabase(false);
20 | LuceneDatabase lucene = (LuceneDatabase) db;
21 | assertEquals("/tmp/ct-visma-1", lucene.getPath());
22 | }
23 |
24 | }
25 |
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/cleaners/GenericValueCleaner.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.cleaners;
3 |
4 | import no.priv.garshol.duke.Cleaner;
5 |
6 | /**
7 | * A cleaner which returns values as they are, but removes specific
8 | * values. This is useful in cases where users have entered so-called
9 | * "generic values". For example, if the unknown company number is
10 | * always set as "999999999", then you can use this cleaner to remove
11 | * that specific value.
12 | */
13 | public class GenericValueCleaner implements Cleaner {
14 | private String generic;
15 |
16 | public String clean(String value) {
17 | if (generic.equals(value))
18 | return null;
19 | return value;
20 | }
21 |
22 | public void setGeneric(String generic) {
23 | this.generic = generic;
24 | }
25 |
26 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/matchers/AbstractMatchListener.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.matchers;
3 |
4 | import no.priv.garshol.duke.Record;
5 |
6 | /**
7 | * Convenience implementation with dummy methods, since most
8 | * implementations will only implement matches().
9 | */
10 | public abstract class AbstractMatchListener implements MatchListener {
11 |
12 | public void batchReady(int size) {
13 | }
14 |
15 | public void batchDone() {
16 | }
17 |
18 | public void matches(Record r1, Record r2, double confidence) {
19 | }
20 |
21 | public void matchesPerhaps(Record r1, Record r2, double confidence) {
22 | }
23 |
24 | public void noMatchFor(Record record) {
25 | }
26 |
27 | public void startProcessing() {
28 | }
29 |
30 | public void endProcessing() {
31 | }
32 |
33 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/utils/SparqlResult.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.utils;
3 |
4 | import java.util.List;
5 | import java.util.ArrayList;
6 |
/**
 * Represents the result of a SPARQL query: the list of variable names
 * and the list of result rows, each row being an array of strings.
 */
public class SparqlResult {
  private final List<String> variables;
  private final List<String[]> rows;

  public SparqlResult() {
    this.variables = new ArrayList<String>();
    this.rows = new ArrayList<String[]>();
  }

  /**
   * Returns the variables added so far. This is the internal list,
   * not a copy.
   */
  public List<String> getVariables() {
    return variables;
  }

  /**
   * Returns the rows added so far. This is the internal list, not a
   * copy.
   */
  public List<String[]> getRows() {
    return rows;
  }

  // public for test purposes
  public void addVariable(String variable) {
    variables.add(variable);
  }

  // public for test purposes
  public void addRow(String[] row) {
    rows.add(row);
  }
}
--------------------------------------------------------------------------------
/duke-server/src/main/java/no/priv/garshol/duke/server/DukeTimer.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.server;
3 |
4 | import java.util.Properties;
5 |
6 | public interface DukeTimer {
7 |
8 | /**
9 | * Initializes the timer, giving it access to configuration settings.
10 | */
11 | public void init(Properties props);
12 |
13 | /**
14 | * Starts a background thread which calls the controller every
15 | * check_interval seconds. Returns immediately, leaving the
16 | * background thread running.
17 | */
18 | public void spawnThread(DukeController controller, int check_interval);
19 |
20 | /**
21 | * Returns true iff the background thread is running.
22 | */
23 | public boolean isRunning();
24 |
25 | /**
26 | * Stops the background thread. It can be restarted with a new call
27 | * to spawnThread.
28 | */
29 | public void stop();
30 |
31 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/cleaners/StripNontextCharacters.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.cleaners;
3 |
4 | import no.priv.garshol.duke.Cleaner;
5 |
6 | /**
7 | * A cleaner which removes non-text characters. Specifically it strips
8 | * control characters (0-0x1F, 0x7F-0x9F) and special symbols in the
9 | * range 0xA1-0xBF.
10 | */
11 | public class StripNontextCharacters implements Cleaner {
12 |
13 | public String clean(String value) {
14 | char[] tmp = new char[value.length()];
15 | int pos = 0;
16 | for (int ix = 0; ix < value.length(); ix++) {
17 | char ch = value.charAt(ix);
18 | if (ch < 0x20 ||
19 | (ch >= 0x7F && ch < 0xA0) ||
20 | (ch > 0xA0 && ch < 0xC0))
21 | continue; // skip Euro symbol, soft hyphen, etc etc
22 | tmp[pos++] = ch;
23 | }
24 | return new String(tmp, 0, pos);
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/duke-es/src/test/java/no/priv/garshol/duke/databases/es/ElasticSearchConfigLoaderTest.java:
--------------------------------------------------------------------------------
1 | package no.priv.garshol.duke.databases.es;
2 |
3 | import java.io.IOException;
4 |
5 | import no.priv.garshol.duke.ConfigLoader;
6 | import no.priv.garshol.duke.Configuration;
7 | import no.priv.garshol.duke.Database;
8 | import no.priv.garshol.duke.databases.es.ElasticSearchDatabase;
9 |
10 | import org.junit.Test;
11 | import org.xml.sax.SAXException;
12 |
13 | import static org.junit.Assert.assertEquals;
14 |
15 | public class ElasticSearchConfigLoaderTest {
16 |
17 | @Test
18 | public void testDatabase() throws IOException, SAXException {
19 | Configuration config = ConfigLoader
20 | .load("classpath:config-database.xml");
21 | Database db = config.getDatabase(false);
22 | ElasticSearchDatabase es = (ElasticSearchDatabase) db;
23 | assertEquals("duke-es", es.getCluster());
24 | }
25 |
26 | }
27 |
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/cleaners/ChainedCleaner.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.cleaners;
3 |
4 | import no.priv.garshol.duke.Cleaner;
5 |
6 | /**
7 | * Internal cleaner used to implement chaining of multiple cleaners.
8 | * Basically, if you list multiple cleaners in the cleaner=""
9 | * attribute in the configuration file, it gets turned into a
10 | * ChainedCleaner that runs all the cleaners in sequence.
11 | */
12 | public class ChainedCleaner implements Cleaner {
13 | private Cleaner[] cleaners;
14 |
15 | public ChainedCleaner(Cleaner[] cleaners) {
16 | this.cleaners = cleaners;
17 | }
18 |
19 | public String clean(String value) {
20 | for (int ix = 0; ix < cleaners.length; ix++) {
21 | if (value == null || value.equals(""))
22 | return null;
23 |
24 | value = cleaners[ix].clean(value);
25 | }
26 | return value;
27 | }
28 | }
--------------------------------------------------------------------------------
/duke-core/src/test/java/no/priv/garshol/duke/comparators/DifferentComparatorTest.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.comparators;
3 |
4 | import org.junit.Before;
5 | import org.junit.Test;
6 |
7 | import static junit.framework.Assert.assertEquals;
8 |
9 | public class DifferentComparatorTest {
10 | private DifferentComparator comp;
11 |
12 | @Before
13 | public void setup() {
14 | this.comp = new DifferentComparator();
15 | }
16 |
17 | @Test
18 | public void testEmpty() {
19 | assertEquals(0.0, comp.compare("", ""));
20 | }
21 |
22 | @Test
23 | public void testEmpty1() {
24 | assertEquals(1.0, comp.compare("", "1"));
25 | }
26 |
27 | @Test
28 | public void testEmpty2() {
29 | assertEquals(1.0, comp.compare("1", ""));
30 | }
31 |
32 | @Test
33 | public void testSame() {
34 | assertEquals(0.0, comp.compare("same", "same")); // but different
35 | }
36 | }
--------------------------------------------------------------------------------
/duke-core/src/test/java/no/priv/garshol/duke/cleaners/DigitsOnlyCleanerTest.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.cleaners;
3 |
4 | import org.junit.Before;
5 | import org.junit.Test;
6 |
7 | import static junit.framework.Assert.assertEquals;
8 | import static junit.framework.Assert.assertTrue;
9 |
10 | public class DigitsOnlyCleanerTest {
11 | private DigitsOnlyCleaner cleaner;
12 |
13 | @Before
14 | public void setup() {
15 | cleaner = new DigitsOnlyCleaner();
16 | }
17 |
18 | @Test
19 | public void testEmpty() {
20 | assertTrue(cleaner.clean("") == null);
21 | }
22 |
23 | @Test
24 | public void testOnlyDigits() {
25 | test("314", "314");
26 | }
27 |
28 | @Test
29 | public void testDigitsAndSpaces() {
30 | test(" 3 1 4 ", "314");
31 | }
32 |
33 | private void test(String value, String result) {
34 | assertEquals(result, cleaner.clean(value));
35 | }
36 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/DummyLogger.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke;
3 |
4 | public class DummyLogger implements Logger {
5 |
6 | public void trace(String msg) {
7 | }
8 |
9 | public void debug(String msg) {
10 | }
11 |
12 | public void info(String msg) {
13 | }
14 |
15 | public void warn(String msg) {
16 | }
17 |
18 | public void warn(String msg, Throwable e) {
19 | }
20 |
21 | public void error(String msg) {
22 | }
23 |
24 | public void error(String msg, Throwable e) {
25 | }
26 |
27 | public boolean isTraceEnabled() {
28 | return false;
29 | }
30 |
31 | public boolean isDebugEnabled() {
32 | return false;
33 | }
34 |
35 | public boolean isInfoEnabled() {
36 | return false;
37 | }
38 |
39 | public boolean isWarnEnabled() {
40 | return false;
41 | }
42 |
43 | public boolean isErrorEnabled() {
44 | return false;
45 | }
46 | }
--------------------------------------------------------------------------------
/duke-mapdb/src/test/java/no/priv/garshol/duke/databases/MapDBBlockingDatabaseTest.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.databases;
3 |
4 | import java.util.ArrayList;
5 | import java.util.Collection;
6 |
7 | import no.priv.garshol.duke.Configuration;
8 | import no.priv.garshol.duke.Database;
9 | import no.priv.garshol.duke.Record;
10 |
11 | public class MapDBBlockingDatabaseTest extends DatabaseTest {
12 |
13 | public Database createDatabase(Configuration config) {
14 | MapDBBlockingDatabase db = new MapDBBlockingDatabase();
15 | db.setConfiguration(config);
16 |
17 | Collection functions = new ArrayList();
18 | functions.add(new TestKeyFunction());
19 | db.setKeyFunctions(functions);
20 | return db;
21 | }
22 |
23 | private static class TestKeyFunction implements KeyFunction {
24 | public String makeKey(Record record) {
25 | return record.getValue("NAME");
26 | }
27 | }
28 | }
--------------------------------------------------------------------------------
/duke-core/src/test/java/no/priv/garshol/duke/databases/InMemoryBlockingDatabaseTest.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.databases;
3 |
4 | import java.util.ArrayList;
5 | import java.util.Collection;
6 |
7 | import no.priv.garshol.duke.Configuration;
8 | import no.priv.garshol.duke.Database;
9 | import no.priv.garshol.duke.Record;
10 |
11 | public class InMemoryBlockingDatabaseTest extends DatabaseTest {
12 |
13 | public Database createDatabase(Configuration config) {
14 | InMemoryBlockingDatabase db = new InMemoryBlockingDatabase();
15 | db.setConfiguration(config);
16 |
17 | Collection functions = new ArrayList();
18 | functions.add(new TestKeyFunction());
19 | db.setKeyFunctions(functions);
20 | return db;
21 | }
22 |
23 | private static class TestKeyFunction implements KeyFunction {
24 | public String makeKey(Record record) {
25 | return record.getValue("NAME");
26 | }
27 | }
28 | }
--------------------------------------------------------------------------------
/duke-core/src/test/java/no/priv/garshol/duke/comparators/JaccardIndexComparatorTest.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.comparators;
3 |
4 | import org.junit.Before;
5 | import org.junit.Test;
6 |
7 | import static junit.framework.Assert.assertEquals;
8 |
9 | public class JaccardIndexComparatorTest {
10 | private JaccardIndexComparator comp;
11 |
12 | @Before
13 | public void setup() {
14 | comp = new JaccardIndexComparator();
15 | }
16 |
17 | @Test
18 | public void testEmpty() {
19 | assertEquals(1.0, comp.compare("", ""));
20 | }
21 |
22 | @Test
23 | public void testOneIsEmpty() {
24 | assertEquals(0.0, comp.compare("", "abc"));
25 | }
26 |
27 | @Test
28 | public void testOneIsDifferent() {
29 | assertEquals((1.0 / 3.0), comp.compare("abc def", "cba def"));
30 | }
31 |
32 | @Test
33 | public void testSameSets() {
34 | assertEquals(1.0, comp.compare("abc def", "def abc"));
35 | }
36 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/Logger.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke;
3 |
/**
 * Experimental internal logging abstraction. The aim is something
 * which works naturally on the command-line and adds no dependencies,
 * while still allowing integration with a full logging system. It
 * may be removed again if the author changes his mind.
 */
public interface Logger {

  // ----- trace

  void trace(String msg);

  boolean isTraceEnabled();

  // ----- debug

  void debug(String msg);

  boolean isDebugEnabled();

  // ----- info

  void info(String msg);

  boolean isInfoEnabled();

  // ----- warn

  void warn(String msg);

  void warn(String msg, Throwable e);

  boolean isWarnEnabled();

  // ----- error

  void error(String msg);

  void error(String msg, Throwable e);

  boolean isErrorEnabled();

}
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/genetic/FloatAspect.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.genetic;
3 |
4 | /**
5 | * Common code for the float aspects.
6 | */
7 | public abstract class FloatAspect extends Aspect {
8 | protected double drift(double original, double max, double min) {
9 |
10 | // FIXME: the following is a chunk of experimental code that
11 | // hasn't been fully evaluated yet. leaving it in since it *may*
12 | // be reactivated, after more evaluation
13 |
14 | //
15 | // double upper = original + (float_drift_range / 2.0);
16 | // if (original + (float_drift_range / 2.0) > max)
17 | // upper = max;
18 | // else if (original - (float_drift_range / 2.0) < min)
19 | // upper = float_drift_range + min;
20 |
21 | // double delta = float_drift_range * Math.random();
22 | // return upper - delta;
23 | //
24 |
25 | return Math.random() * (max - min) + min;
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/duke-core/src/test/java/no/priv/garshol/duke/cleaners/FamilyCommaGivenCleanerTest.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.cleaners;
3 |
4 | import org.junit.Before;
5 | import org.junit.Test;
6 |
7 | import static junit.framework.Assert.assertEquals;
8 |
9 | public class FamilyCommaGivenCleanerTest {
10 | private FamilyCommaGivenCleaner cleaner;
11 |
12 | @Before
13 | public void setup() {
14 | cleaner = new FamilyCommaGivenCleaner();
15 | }
16 |
17 | @Test
18 | public void testEmpty() {
19 | test("", "");
20 | }
21 |
22 | @Test
23 | public void testHenrikIbsen() {
24 | test("henrik ibsen", "Henrik Ibsen");
25 | }
26 |
27 | @Test
28 | public void testIbsenHenrik() {
29 | test("henrik ibsen", "Ibsen, Henrik");
30 | }
31 |
32 | @Test
33 | public void testJRAckerley() {
34 | test("j. r. ackerley", "Ackerley, J.R.");
35 | }
36 |
37 | private void test(String s1, String s2) {
38 | assertEquals(s1, cleaner.clean(s2));
39 | }
40 |
41 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/RecordIterator.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke;
3 |
4 | import java.io.Closeable;
5 | import java.util.Iterator;
6 |
/**
 * Iterator base class for Record collections which adds a few extra
 * methods for resource management on top of the plain Iterator
 * contract.
 */
public abstract class RecordIterator
  implements Iterator, Closeable {

  /**
   * Removal is not supported; always throws
   * UnsupportedOperationException.
   */
  public void remove() {
    throw new UnsupportedOperationException();
  }

  /**
   * Tells the iterator that the most recent batch of records
   * retrieved from it has been processed. Some iterators may use
   * this to free resources, but no iterator is required to do
   * anything in response. This default implementation does nothing.
   */
  public void batchProcessed() {
  }

  /**
   * Releases any resources held by this iterator, and cleans up any
   * temporary storage. This default implementation does nothing.
   */
  public void close() {
  }

}
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/LinkStatus.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke;
3 |
4 | /**
5 | * Represents the status of a link between two identities. That is, do
6 | * we believe it, and why?
7 | */
8 | public enum LinkStatus {
9 | /**
10 | * Means we have outside evidence indicating this is true.
11 | */
12 | ASSERTED(2),
13 |
14 | /**
15 | * Means Duke has worked this out on its own.
16 | */
17 | INFERRED(1),
18 |
19 | /**
20 | * Means Duke used to believe this, but has since changed its mind.
21 | */
22 | RETRACTED(0);
23 |
24 | private int id;
25 | private LinkStatus(int id) {
26 | this.id = id;
27 | }
28 |
29 | public int getId() {
30 | return id;
31 | }
32 |
33 | public static LinkStatus getbyid(int id) {
34 | if (id == 2)
35 | return ASSERTED;
36 | else if (id == 1)
37 | return INFERRED;
38 | else if (id == 0)
39 | return RETRACTED;
40 | throw new DukeException("No status with id " + id);
41 | }
42 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/cleaners/Transform.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.cleaners;
3 |
4 | import java.util.regex.Pattern;
5 | import java.util.regex.Matcher;
6 |
/**
 * Helper class used by AbstractRuleBasedCleaner. A transform finds
 * the first match of a regular expression in a value and replaces
 * the text captured by one group of that match with a fixed
 * replacement string.
 */
public class Transform {
  private final Pattern regex;
  private final String replacement;
  private final int groupno;

  /**
   * Creates a transform which replaces group 1 of the match.
   */
  public Transform(String regex, String replacement) {
    this(regex, replacement, 1);
  }

  /**
   * Creates a transform which replaces the given group of the match.
   *
   * @param regex the regular expression to search for
   * @param replacement the text substituted for the matched group
   * @param groupno the number of the capturing group to replace
   */
  public Transform(String regex, String replacement, int groupno) {
    this.regex = Pattern.compile(regex);
    this.replacement = replacement;
    this.groupno = groupno;
  }

  /**
   * Applies the transform to the first match in the value. The value
   * is returned unchanged if the regex does not match, or if it
   * matches without the selected group taking part in the match.
   */
  public String transform(String value) {
    Matcher m = regex.matcher(value);
    if (!m.find())
      return value;

    // start() is -1 for a group which did not participate in the
    // match (e.g. an optional group); there is nothing to replace
    int start = m.start(groupno);
    if (start == -1)
      return value;

    return value.substring(0, start) +
      replacement +
      value.substring(m.end(groupno));
  }
}
35 |
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/LinkKind.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke;
3 |
4 | /**
5 | * Represents the meaning of a link between two identities.
6 | */
7 | public enum LinkKind {
8 | /**
9 | * Means we assume the two identities refer to the same real-world object.
10 | */
11 | SAME(1),
12 |
13 | /**
14 | * Means we think it possible that the two identities refer to the
15 | * same real-world object.
16 | */
17 | MAYBESAME(2),
18 |
19 | /**
20 | * Means we assume the two identities refer to different real-world objects.
21 | */
22 | DIFFERENT(3);
23 |
24 | private int id;
25 | private LinkKind(int id) {
26 | this.id = id;
27 | }
28 |
29 | public int getId() {
30 | return id;
31 | }
32 |
33 | public static LinkKind getbyid(int id) {
34 | if (id == 1)
35 | return SAME;
36 | else if (id == 2)
37 | return MAYBESAME;
38 | else if (id == 3)
39 | return DIFFERENT;
40 | throw new DukeException("No kind with id " + id);
41 | }
42 | }
--------------------------------------------------------------------------------
/duke-core/src/test/java/no/priv/garshol/duke/test/RecordImplTest.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.test;
3 |
4 | import java.util.Collection;
5 |
6 | import no.priv.garshol.duke.Record;
7 | import no.priv.garshol.duke.utils.TestUtils;
8 | import org.junit.Test;
9 |
10 | import static junit.framework.Assert.assertEquals;
11 | import static junit.framework.Assert.assertTrue;
12 |
13 | public class RecordImplTest {
14 |
15 | @Test
16 | public void testNormal() {
17 | Record r = TestUtils.makeRecord("ID", "abc", "NAME", "b");
18 |
19 | assertEquals("abc", r.getValue("ID"));
20 | Collection values = r.getValues("ID");
21 | assertEquals(1, values.size());
22 | assertEquals("abc", values.iterator().next());
23 |
24 | assertEquals("b", r.getValue("NAME"));
25 | values = r.getValues("NAME");
26 | assertEquals(1, values.size());
27 | assertEquals("b", values.iterator().next());
28 |
29 | assertEquals(null, r.getValue("EMAIL"));
30 | assertTrue(r.getValues("EMAIL").isEmpty());
31 | }
32 |
33 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/utils/YesNoConsole.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.utils;
3 |
4 | import java.io.IOException;
5 | import java.io.InputStreamReader;
6 | import java.io.BufferedReader;
7 |
8 | import no.priv.garshol.duke.DukeException;
9 |
10 | public class YesNoConsole {
11 | private BufferedReader console;
12 |
13 | public YesNoConsole() {
14 | this.console = new BufferedReader(new InputStreamReader(System.in));
15 | }
16 |
17 | public boolean yesorno() {
18 | System.out.print("Correct? (Y/N) ");
19 | try {
20 | String line = console.readLine();
21 | if (line == null)
22 | throw new DukeException("End of file on console");
23 | line = line.trim();
24 |
25 | if (line.equalsIgnoreCase("Y"))
26 | return true;
27 | else if (line.equalsIgnoreCase("N"))
28 | return false;
29 | else
30 | return yesorno();
31 | } catch (IOException e) {
32 | throw new DukeException("Couldn't read input line", e);
33 | }
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/duke-core/src/test/java/no/priv/garshol/duke/utils/LinkDatabaseUtilsTest.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.utils;
3 |
4 | import java.io.IOException;
5 | import java.io.InputStream;
6 | import java.io.InputStreamReader;
7 |
8 | import no.priv.garshol.duke.DukeException;
9 | import no.priv.garshol.duke.LinkDatabase;
10 | import org.junit.Test;
11 |
12 | import static org.junit.Assert.fail;
13 |
14 | public class LinkDatabaseUtilsTest {
15 | private LinkDatabase db;
16 |
17 | @Test
18 | public void testOldStyle() throws IOException {
19 | // tries to load a pre-1.2 format test file
20 | try {
21 | load("old-format.txt");
22 | fail("accepted old-style test file");
23 | } catch (DukeException e) {
24 | // this is expected
25 | }
26 | }
27 |
28 | private void load(String filename) throws IOException {
29 | ClassLoader cloader = Thread.currentThread().getContextClassLoader();
30 | InputStream istream = cloader.getResourceAsStream(filename);
31 | db = LinkDatabaseUtils.loadTestFile(new InputStreamReader(istream));
32 | }
33 | }
--------------------------------------------------------------------------------
/changes.txt:
--------------------------------------------------------------------------------
1 | CHANGES SINCE 1.2
2 | =====================
3 |
4 | Threading in the genetic algorithm is now much more efficient
5 | Set up continuous integration with Travis
6 | Added integration tests
7 | Can now run without Lucene on classpath
8 | Added HTMLCleaner
9 | Added StripNontextCharacters
10 | Genetic algorithm:
11 | mutation and recombination rates now evolve by themselves
12 | user can set both rates
13 | improved choice of questions asked under active learning
14 | Support for boosting in LuceneDatabase
15 | implemented by Fabrizio Fortino
16 | JSON data source
17 | implemented by https://github.com/dmnpignaud
18 | jar file now runnable
19 | Added --no-comparators option to genetic algorithm
20 | Added --original=N option to genetic algorithm
21 | Added ConfigLoader.loadFromString
22 | Added --incomplete-data option to genetic algorithm
23 | Added support for incremental record linkage (plus 1.3 methods)
24 | Let genetic algorithm use custom comparators from config (ztsmith)
25 | Split-on property not included in genetic output (ztsmith)
26 | MongoDB data source (antonimmo)
27 |
--------------------------------------------------------------------------------
/doc/example-data/db-nationality.txt:
--------------------------------------------------------------------------------
1 | American,http://dbpedia.org/resource/United_States
2 | British,http://dbpedia.org/resource/United_Kingdom
3 | Australian,http://dbpedia.org/resource/Australia
4 | Indian,http://dbpedia.org/resource/India
5 | Norwegian,http://dbpedia.org/resource/Norway
6 | http://dbpedia.org/resource/Norwegians,http://dbpedia.org/resource/Norway
7 | norwegian,http://dbpedia.org/resource/Norway
8 | CAN,http://dbpedia.org/resource/Canada
9 | http://dbpedia.org/resource/British_people,http://dbpedia.org/resource/United_Kingdom
10 | USA,http://dbpedia.org/resource/United_States
11 | Japanese,http://dbpedia.org/resource/Japan
12 | United States,http://dbpedia.org/resource/United_States
13 | French,http://dbpedia.org/resource/France
14 | English,http://dbpedia.org/resource/United_Kingdom
15 | German,http://dbpedia.org/resource/Germany
16 | Canadian,http://dbpedia.org/resource/Canada
17 | http://dbpedia.org/resource/England,http://dbpedia.org/resource/United_Kingdom
18 | Italian,http://dbpedia.org/resource/Italy
19 | Polish,http://dbpedia.org/resource/Polish
20 | Canada,http://dbpedia.org/resource/Canada
21 |
--------------------------------------------------------------------------------
/duke-core/src/test/java/no/priv/garshol/duke/CompactRecordTest.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke;
3 |
4 | import org.junit.After;
5 | import org.junit.Before;
6 | import org.junit.Test;
7 |
8 | import static junit.framework.Assert.assertEquals;
9 | import static junit.framework.Assert.assertTrue;
10 |
11 | public class CompactRecordTest {
12 |
13 | @Test
14 | public void testEmpty() {
15 | CompactRecord r = new CompactRecord();
16 | r.toString();
17 |
18 | assertTrue(r.isEmpty());
19 | assertTrue(r.getProperties().isEmpty());
20 | assertTrue(r.getValues("foo").isEmpty());
21 | assertTrue(r.getValue("foo") == null);
22 | }
23 |
24 | @Test
25 | public void testSingle() {
26 | CompactRecord r = new CompactRecord();
27 | r.addValue("foo", "bar");
28 | r.toString();
29 |
30 | assertTrue(!r.isEmpty());
31 | assertTrue(r.getProperties().size() == 1);
32 | assertTrue(r.getProperties().iterator().next().equals("foo"));
33 | assertTrue(r.getValues("foo").iterator().next().equals("bar"));
34 | assertTrue(r.getValue("foo").equals("bar"));
35 | }
36 | }
37 |
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/genetic/LinkFileOracle.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.genetic;
3 |
4 | import java.io.IOException;
5 |
6 | import no.priv.garshol.duke.Link;
7 | import no.priv.garshol.duke.LinkKind;
8 | import no.priv.garshol.duke.LinkDatabase;
9 | import no.priv.garshol.duke.InMemoryLinkDatabase;
10 | import no.priv.garshol.duke.utils.LinkDatabaseUtils;
11 |
12 | /**
13 | * This oracle looks up the answer in a link file.
14 | */
15 | public class LinkFileOracle implements Oracle {
16 | private InMemoryLinkDatabase linkdb;
17 |
18 | public LinkFileOracle(String testfile) throws IOException {
19 | this.linkdb = new InMemoryLinkDatabase();
20 | linkdb.setDoInference(true);
21 | LinkDatabaseUtils.loadTestFile(testfile, linkdb);
22 | }
23 |
24 | public LinkDatabase getLinkDatabase() {
25 | return linkdb;
26 | }
27 |
28 | public LinkKind getLinkKind(String id1, String id2) {
29 | Link link = linkdb.inferLink(id1, id2);
30 | if (link == null)
31 | return LinkKind.DIFFERENT; // we assume missing links are incorrect
32 | return link.getKind();
33 | }
34 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/comparators/Matcher.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.comparators;
3 |
/**
 * Cursor over an input string, wrapped around the string to simplify
 * the code that scans it. The cursor starts before the first
 * character (position -1) and is advanced by next() and skip().
 */
public class Matcher {
  private String str;
  private int ix;

  public Matcher(String str) {
    this.str = str;
    this.ix = -1;
  }

  /**
   * Returns true if there is a next character and it equals ch.
   */
  public boolean isNext(char ch) {
    return ix + 1 < str.length() && str.charAt(ix + 1) == ch;
  }

  /**
   * Returns true if the cursor is on the first character of the
   * string, that is, after exactly one character has been consumed.
   */
  public boolean atStart() {
    return ix == 0;
  }

  /**
   * Returns true if there is at least one more character after the
   * cursor.
   */
  public boolean hasNext() {
    return ix + 1 < str.length();
  }

  /**
   * Returns true if the character after the cursor is the last
   * character of the string.
   */
  public boolean nextIsLast() {
    return ix + 2 == str.length();
  }

  /**
   * Returns true if the cursor is on the last character of the
   * string.
   */
  public boolean isLast() {
    return ix + 1 == str.length();
  }

  /**
   * Advances the cursor by one and returns the character it now
   * points at.
   */
  public char next() {
    return str.charAt(++ix);
  }

  /**
   * Advances the cursor by one without returning the character.
   */
  public void skip() {
    ix++;
  }

  /**
   * Returns true if the character just before the cursor is one of
   * the given characters. Returns false when there is no previous
   * character, including before anything has been consumed.
   */
  public boolean previousOneOf(String chars) {
    // ix is -1 before the first next()/skip(), so test <= 0 rather
    // than == 0 to avoid reading before the start of the string
    if (ix <= 0)
      return false;
    return chars.indexOf(str.charAt(ix - 1)) != -1;
  }
}
50 |
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/cleaners/FamilyCommaGivenCleaner.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.cleaners;
3 |
4 | import no.priv.garshol.duke.Cleaner;
5 |
6 | /**
7 | * Experimental cleaner for person names of the form "Smith,
8 | * John". Based on the PersonNameCleaner. It also normalizes periods
9 | * in initials, so that "J.R. Ackerley" becomes "J. R. Ackerley".
10 | */
11 | public class FamilyCommaGivenCleaner implements Cleaner {
12 | private PersonNameCleaner sub;
13 |
14 | public FamilyCommaGivenCleaner() {
15 | this.sub = new PersonNameCleaner();
16 | }
17 |
18 | public String clean(String value) {
19 | int i = value.indexOf(',');
20 | if (i != -1)
21 | value = value.substring(i + 1) + " " + value.substring(0, i);
22 |
23 | char[] tmp = new char[value.length() * 2];
24 | int pos = 0;
25 | for (int ix = 0; ix < value.length(); ix++) {
26 | tmp[pos++] = value.charAt(ix);
27 | if (value.charAt(ix) == '.' &&
28 | ix+1 < value.length() &&
29 | value.charAt(ix + 1) != ' ')
30 | tmp[pos++] = ' ';
31 | }
32 |
33 | return sub.clean(new String(tmp, 0, pos));
34 | }
35 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/genetic/HighProbabilityAspect.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.genetic;
3 |
4 | import no.priv.garshol.duke.Property;
5 | import no.priv.garshol.duke.Configuration;
6 |
7 | /**
8 | * Sets the high probability.
9 | */
10 | public class HighProbabilityAspect extends FloatAspect {
11 | private Property prop;
12 |
13 | public HighProbabilityAspect(Property prop) {
14 | this.prop = prop;
15 | }
16 |
17 | public void setRandomly(GeneticConfiguration cfg) {
18 | Configuration config = cfg.getConfiguration();
19 | Property p = config.getPropertyByName(prop.getName());
20 | double new_value = drift(config.getThreshold(), 1.0, 0.5);
21 | p.setHighProbability(new_value);
22 | }
23 |
24 | public void setFromOther(GeneticConfiguration cfg1,
25 | GeneticConfiguration cfg2) {
26 | Configuration config = cfg1.getConfiguration();
27 | Configuration other = cfg2.getConfiguration();
28 |
29 | Property p1 = config.getPropertyByName(prop.getName());
30 | Property p2 = other.getPropertyByName(prop.getName());
31 | p1.setHighProbability(p2.getHighProbability());
32 | }
33 | }
34 |
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/genetic/LowProbabilityAspect.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.genetic;
3 |
4 | import no.priv.garshol.duke.Property;
5 | import no.priv.garshol.duke.Configuration;
6 |
7 | /**
8 | * Sets the low probability.
9 | */
10 | public class LowProbabilityAspect extends FloatAspect {
11 | private Property prop;
12 |
13 | public LowProbabilityAspect(Property prop) {
14 | this.prop = prop;
15 | }
16 |
17 | public void setRandomly(GeneticConfiguration cfg) {
18 | Configuration config = cfg.getConfiguration();
19 | Property p = config.getPropertyByName(prop.getName());
20 | double new_value = drift(config.getThreshold(), 0.5, 0.0);
21 | p.setLowProbability(new_value);
22 | }
23 |
24 | public void setFromOther(GeneticConfiguration cfg1,
25 | GeneticConfiguration cfg2) {
26 | Configuration config = cfg1.getConfiguration();
27 | Configuration other = cfg2.getConfiguration();
28 |
29 | Property p1 = config.getPropertyByName(prop.getName());
30 | Property p2 = other.getPropertyByName(prop.getName());
31 | p1.setLowProbability(p2.getLowProbability());
32 | }
33 |
34 | }
35 |
--------------------------------------------------------------------------------
/duke-core/src/test/java/no/priv/garshol/duke/comparators/DiceCoefficientComparatorTest.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.comparators;
3 |
4 | import org.junit.Before;
5 | import org.junit.Test;
6 |
7 | import static junit.framework.Assert.assertEquals;
8 |
9 | public class DiceCoefficientComparatorTest {
10 | private DiceCoefficientComparator comp;
11 |
12 | @Before
13 | public void setup() {
14 | comp = new DiceCoefficientComparator();
15 | }
16 |
17 | @Test
18 | public void testEmpty() {
19 | assertEquals(1.0, comp.compare("", ""));
20 | }
21 |
22 | @Test
23 | public void testOneIsEmpty() {
24 | assertEquals(0.0, comp.compare("", "abc"));
25 | }
26 |
27 | @Test
28 | public void testOneIsDifferent() {
29 | assertEquals(0.5, comp.compare("abc def", "cba def"));
30 | }
31 |
32 | @Test
33 | public void testReordering() {
34 | assertEquals(1.0, comp.compare("def abc", "abc def"));
35 | }
36 |
37 | @Test
38 | public void testLengthDifference() {
39 | assertEquals(0.8, comp.compare("def abc ghe", "abc def"));
40 | }
41 |
42 | @Test
43 | public void testLengthDifference2() {
44 | assertEquals(0.8, comp.compare("def abc", "abc def ghe"));
45 | }
46 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/JDBCLinkDatabase.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke;
3 |
4 | import java.util.Properties;
5 |
6 | import no.priv.garshol.duke.utils.JDBCUtils;
7 |
8 | /**
9 | * A link database which can maintain a set of links in an H2 or
10 | * Oracle database over JDBC. It could be extended to work with more
11 | * database implementations.
12 | */
13 | public class JDBCLinkDatabase extends RDBMSLinkDatabase {
14 | private String driverklass;
15 | private String jdbcuri;
16 | private Properties props;
17 |
18 | public JDBCLinkDatabase(String driverklass,
19 | String jdbcuri,
20 | String dbtype,
21 | Properties props) {
22 | super(dbtype);
23 | this.driverklass = driverklass;
24 | this.jdbcuri = jdbcuri;
25 | this.props = props;
26 | this.stmt = JDBCUtils.open(driverklass, jdbcuri, props);
27 | }
28 |
29 | public void validateConnection() {
30 | if (stmt != null && !JDBCUtils.validate(stmt))
31 | // it failed to validate, and was closed by the validate method.
32 | // we therefore reopen so that we have a proper connection.
33 | stmt = JDBCUtils.open(driverklass, jdbcuri, props);
34 | }
35 |
36 | }
--------------------------------------------------------------------------------
/duke-core/src/test/java/no/priv/garshol/duke/utils/PropertyUtilsTest.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.utils;
3 |
4 | import java.util.Properties;
5 |
6 | import no.priv.garshol.duke.DukeConfigException;
7 | import org.junit.Before;
8 | import org.junit.Test;
9 |
10 | import static junit.framework.Assert.assertEquals;
11 | import static junit.framework.Assert.fail;
12 |
13 | public class PropertyUtilsTest {
14 | private Properties props;
15 |
16 | @Before
17 | public void setup() {
18 | props = new Properties();
19 | props.setProperty("foo", "bar");
20 | props.setProperty("baz", "2");
21 | }
22 |
23 | @Test
24 | public void testGet1() {
25 | assertEquals(PropertyUtils.get(props, "foo"), "bar");
26 |
27 | try {
28 | PropertyUtils.get(props, "bar");
29 | fail("exception not thrown");
30 | } catch (DukeConfigException e) {
31 | }
32 | }
33 |
34 | @Test
35 | public void testGet2() {
36 | assertEquals(PropertyUtils.get(props, "foo", "huhu"), "bar");
37 | assertEquals(PropertyUtils.get(props, "quux", "huhu"), "huhu");
38 | }
39 |
40 | @Test
41 | public void testGet3() {
42 | assertEquals(PropertyUtils.get(props, "baz", 0), 2);
43 | assertEquals(PropertyUtils.get(props, "quux", 27), 27);
44 | }
45 | }
--------------------------------------------------------------------------------
/duke-json/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | no.priv.garshol.duke
6 | duke
7 | 1.4-SNAPSHOT
8 | ../
9 |
10 | duke-json
11 | jar
12 |
13 |
14 |
15 |
16 | no.priv.garshol.duke
17 | duke-core
18 |
19 |
20 | no.priv.garshol.duke
21 | duke-core
22 | test-jar
23 | test
24 |
25 |
26 |
27 |
28 | com.fasterxml.jackson.core
29 | jackson-core
30 | 2.3.2
31 |
32 |
33 |
34 |
35 |
36 |
--------------------------------------------------------------------------------
/duke-mapdb/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | no.priv.garshol.duke
6 | duke
7 | 1.4-SNAPSHOT
8 | ../
9 |
10 | duke-mapdb
11 | jar
12 |
13 |
14 |
15 |
16 | no.priv.garshol.duke
17 | duke-core
18 |
19 |
20 | no.priv.garshol.duke
21 | duke-core
22 | test-jar
23 | test
24 |
25 |
26 |
27 |
28 | org.mapdb
29 | mapdb
30 | 0.9.13
31 |
32 |
33 |
34 |
35 |
36 |
--------------------------------------------------------------------------------
/duke-mongodb/pom.xml:
--------------------------------------------------------------------------------
1 |
3 | 4.0.0
4 |
5 | no.priv.garshol.duke
6 | duke
7 | 1.4-SNAPSHOT
8 | ../
9 |
10 | duke-mongodb
11 | jar
12 |
13 |
14 |
15 |
16 | no.priv.garshol.duke
17 | duke-core
18 |
19 |
20 | no.priv.garshol.duke
21 | duke-core
22 | test-jar
23 | test
24 |
25 |
26 |
27 |
28 | org.mongodb
29 | mongo-java-driver
30 | 3.12.14
31 |
32 |
33 |
34 |
35 |
36 |
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/Record.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke;
3 |
4 | import java.util.Collection;
5 |
/**
 * Represents a record, which may be a single source record from a
 * data source, or a record created from merging data from many
 * records.
 */
public interface Record {

  /**
   * The names of the properties this record has. May be a subset of
   * the properties defined in the configuration if not all properties
   * have values.
   *
   * @return the property names
   */
  public Collection<String> getProperties();

  /**
   * All values for the named property. May be empty. May not contain
   * null or empty strings. Never returns null.
   *
   * @param prop the name of the property
   * @return the values of the property, possibly an empty collection
   */
  public Collection<String> getValues(String prop);

  /**
   * Returns a value for the named property. May be null. May not be
   * the empty string. If the property has more than one value there is
   * no way to predict which value is returned.
   *
   * @param prop the name of the property
   * @return one of the property's values, or null
   */
  public String getValue(String prop);

  /**
   * Merges the other record into this one. None of the
   * implementations support this method yet, but it's going to be
   * used when we implement issue 4.
   *
   * @param other the record to merge into this one
   */
  public void merge(Record other);

}
41 |
--------------------------------------------------------------------------------
/duke-core/src/test/java/no/priv/garshol/duke/cleaners/NorwegianCompanyNameCleanerTest.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.cleaners;
3 |
4 | import org.junit.Before;
5 | import org.junit.Test;
6 |
7 | import static junit.framework.Assert.assertEquals;
8 |
9 | public class NorwegianCompanyNameCleanerTest {
10 | private NorwegianCompanyNameCleaner cleaner;
11 |
12 | @Before
13 | public void setup() {
14 | cleaner = new NorwegianCompanyNameCleaner();
15 | }
16 |
17 | @Test
18 | public void testEmpty() {
19 | test("", "");
20 | }
21 |
22 | @Test
23 | public void testAslashsAs() {
24 | test("sundby maskin as", "sundby maskin a/s");
25 | }
26 |
27 | @Test
28 | public void testAbackslashAs() {
29 | test("sundby maskin as", "sundby maskin a\\s");
30 | }
31 |
32 | @Test
33 | public void testAslashL() {
34 | test("al follestadgata sameie", "a/l follestadgata sameie");
35 | }
36 |
37 | @Test
38 | public void testMoveALToEnd() {
39 | test("a/l follestadgata sameie", "follestadgata sameie al");
40 | }
41 |
42 | @Test
43 | public void testMoveASToEnd() {
44 | test("a/s sundby maskin", "sundby maskin as");
45 | }
46 |
47 | private void test(String s1, String s2) {
48 | assertEquals(cleaner.clean(s1), cleaner.clean(s2));
49 | }
50 |
51 | }
--------------------------------------------------------------------------------
/duke-mapdb/src/test/java/no/priv/garshol/duke/databases/PersistentMapDBBlockingDatabaseTest.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.databases;
3 |
4 | import java.io.IOException;
5 | import java.util.ArrayList;
6 | import java.util.Collection;
7 |
8 | import no.priv.garshol.duke.Configuration;
9 | import no.priv.garshol.duke.Database;
10 | import no.priv.garshol.duke.Record;
11 |
12 | public class PersistentMapDBBlockingDatabaseTest extends PersistentDatabaseTest {
13 | private String dbfile;
14 |
15 | public Database createDatabase(Configuration config) throws IOException {
16 | if (dbfile == null)
17 | dbfile = tmpdir.newFile().getAbsolutePath(); // ensure same every time
18 |
19 | MapDBBlockingDatabase db = new MapDBBlockingDatabase();
20 | db.setConfiguration(config);
21 | db.setOverwrite(false);
22 | db.setFile(dbfile);
23 | db.setAsync(false); // slows down tests too much
24 | db.setWindowSize(0); // otherwise we'll find way too many candidates
25 |
26 | Collection functions = new ArrayList();
27 | functions.add(new TestKeyFunction());
28 | db.setKeyFunctions(functions);
29 | return db;
30 | }
31 |
32 | private static class TestKeyFunction implements KeyFunction {
33 | public String makeKey(Record record) {
34 | return record.getValue("NAME");
35 | }
36 | }
37 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/matchers/ClassDatabaseMatchListener.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.matchers;
3 |
4 | import no.priv.garshol.duke.Configuration;
5 | import no.priv.garshol.duke.EquivalenceClassDatabase;
6 | import no.priv.garshol.duke.Property;
7 | import no.priv.garshol.duke.Record;
8 |
9 | /**
10 | * Writes recorded matches to an EquivalenceClassDatabase.
11 | */
12 | public class ClassDatabaseMatchListener extends AbstractMatchListener {
13 | private Configuration config;
14 | protected EquivalenceClassDatabase classdb;
15 |
16 | public ClassDatabaseMatchListener(Configuration config,
17 | EquivalenceClassDatabase classdb) {
18 | this.config = config;
19 | this.classdb = classdb;
20 | }
21 |
22 | public void matches(Record r1, Record r2, double confidence) {
23 | String id1 = getIdentity(r1);
24 | String id2 = getIdentity(r2);
25 | classdb.addLink(id1, id2);
26 | }
27 |
28 | public void batchDone() {
29 | classdb.commit();
30 | }
31 |
32 | private String getIdentity(Record r) {
33 | for (Property p : config.getIdentityProperties())
34 | for (String v : r.getValues(p.getName()))
35 | return v;
36 | throw new RuntimeException("No identity found in record [" +
37 | PrintMatchListener.toString(r) + "]");
38 | }
39 |
40 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/genetic/ComparatorAspect.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.genetic;
3 |
4 | import java.util.List;
5 | import java.util.ArrayList;
6 |
7 | import no.priv.garshol.duke.Property;
8 | import no.priv.garshol.duke.Comparator;
9 | import no.priv.garshol.duke.Configuration;
10 | import no.priv.garshol.duke.utils.ObjectUtils;
11 |
12 | /**
13 | * Sets the comparator.
14 | */
15 | public class ComparatorAspect extends Aspect {
16 | private Property prop;
17 | private List comparators;
18 |
19 | public ComparatorAspect(Property prop, List comparators) {
20 | this.prop = prop;
21 | this.comparators = comparators;
22 | }
23 |
24 | public void setRandomly(GeneticConfiguration cfg) {
25 | Configuration config = cfg.getConfiguration();
26 | Property p = config.getPropertyByName(prop.getName());
27 | p.setComparator(comparators.get((int) (comparators.size() * Math.random())));
28 | }
29 |
30 | public void setFromOther(GeneticConfiguration cfg1,
31 | GeneticConfiguration cfg2) {
32 | Configuration config = cfg1.getConfiguration();
33 | Configuration other = cfg2.getConfiguration();
34 |
35 | Property p1 = config.getPropertyByName(prop.getName());
36 | Property p2 = other.getPropertyByName(prop.getName());
37 | p1.setComparator(p2.getComparator());
38 | }
39 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/genetic/ConsoleOracle.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.genetic;
3 |
4 | import java.io.Writer;
5 | import java.io.FileWriter;
6 | import java.io.IOException;
7 |
8 | import no.priv.garshol.duke.LinkKind;
9 | import no.priv.garshol.duke.DukeException;
10 | import no.priv.garshol.duke.utils.YesNoConsole;
11 | import no.priv.garshol.duke.utils.LinkFileWriter;
12 |
13 | /**
14 | * This oracle asks the user via the console.
15 | */
16 | public class ConsoleOracle implements Oracle {
17 | private YesNoConsole console;
18 | private LinkFileWriter writer;
19 | private Writer out;
20 |
21 | public ConsoleOracle() {
22 | this.console = new YesNoConsole();
23 | }
24 |
25 | public LinkKind getLinkKind(String id1, String id2) {
26 | boolean match = console.yesorno();
27 | if (writer != null)
28 | try {
29 | writer.write(id1, id2, match, 1.0);
30 | out.flush(); // make sure everything's saved
31 | } catch (IOException e) {
32 | throw new DukeException(e);
33 | }
34 | return match ? LinkKind.SAME : LinkKind.DIFFERENT;
35 | }
36 |
37 | public void setLinkFile(String linkfile) throws IOException {
38 | out = new FileWriter(linkfile, true);
39 | writer = new LinkFileWriter(out);
40 | // FIXME: strictly speaking, this leaks file handles. in practice it
41 | // probably won't matter
42 | }
43 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/databases/AbstractKeyFunction.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.databases;
3 |
4 | import no.priv.garshol.duke.utils.StringUtils;
5 |
6 | /**
7 | * Helper class for writing key functions.
8 | * @since 1.2
9 | */
10 | public abstract class AbstractKeyFunction implements KeyFunction {
11 |
12 | public String firstLongerThan(String value, int min) {
13 | if (value == null)
14 | return "null";
15 |
16 | String[] tokens = StringUtils.split(value);
17 | for (int ix = 0; ix < tokens.length; ix++)
18 | if (tokens[ix].length() > min)
19 | return tokens[ix];
20 | return tokens[0];
21 | }
22 |
23 | public String lastLongerThan(String value, int min) {
24 | if (value == null)
25 | return "null";
26 |
27 | String[] tokens = StringUtils.split(value);
28 | for (int ix = tokens.length - 1; ix >= 0; ix--)
29 | if (tokens[ix].length() > min)
30 | return tokens[ix];
31 | return tokens[0];
32 | }
33 |
34 | public String allDigits(String value) {
35 | if (value == null)
36 | return "null";
37 |
38 | char[] tmp = new char[value.length()];
39 | int free = 0;
40 | for (int ix = 0; ix < value.length(); ix++) {
41 | char ch = value.charAt(ix);
42 | if (ch >= '0' && ch <= '9')
43 | tmp[free++] = ch;
44 | }
45 | return new String(tmp, 0, free);
46 | }
47 |
48 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/cleaners/AbstractRuleBasedCleaner.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.cleaners;
3 |
4 | import java.util.List;
5 | import java.util.ArrayList;
6 | import java.util.regex.Pattern;
7 | import java.util.regex.Matcher;
8 |
9 | import no.priv.garshol.duke.Cleaner;
10 |
11 | /**
12 | * Helper class for building regular-expression based cleaners.
13 | */
14 | public abstract class AbstractRuleBasedCleaner implements Cleaner {
15 | private List transforms;
16 |
17 | /**
18 | * Initializes an empty cleaner.
19 | */
20 | public AbstractRuleBasedCleaner() {
21 | this.transforms = new ArrayList();
22 | }
23 |
24 | public String clean(String value) {
25 | // perform pre-registered transforms
26 | for (Transform t : transforms)
27 | value = t.transform(value);
28 |
29 | return value;
30 | }
31 |
32 | /**
33 | * Adds a rule replacing all substrings matching the regular
34 | * expression with the replacement string.
35 | */
36 | public void add(String regex, String replacement) {
37 | add(regex, replacement, 1);
38 | }
39 |
40 | /**
41 | * Adds a rule replacing all substrings matching the specified group
42 | * within the regular expression with the replacement string.
43 | */
44 | public void add(String regex, String replacement, int groupno) {
45 | transforms.add(new Transform(regex, replacement, groupno));
46 | }
47 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/utils/TestFileUtils.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.utils;
3 |
4 | import java.util.Map;
5 | import java.util.HashMap;
6 | import java.io.FileReader;
7 | import java.io.IOException;
8 | import java.io.BufferedReader;
9 |
10 | import no.priv.garshol.duke.Link;
11 | import no.priv.garshol.duke.LinkKind;
12 | import no.priv.garshol.duke.LinkStatus;
13 |
14 | /**
15 | * A utility class for loading link files. Deprecated: Please
16 | * don't use. Use the LinkDatabase concept instead. This class will
17 | * be removed in a future version.
18 | * @deprecated
19 | */
20 | public class TestFileUtils {
21 |
22 | public static Map load(String testfile) throws IOException {
23 | Map links = new HashMap();
24 | BufferedReader reader = new BufferedReader(new FileReader(testfile));
25 | String line = reader.readLine();
26 | while (line != null) {
27 | int pos = line.indexOf(',');
28 |
29 | String id1 = line.substring(1, pos);
30 | String id2 = line.substring(pos + 1, line.length());
31 |
32 | links.put(id1 + "," + id2,
33 | new Link(id1, id2, LinkStatus.ASSERTED,
34 | line.charAt(0) == '+' ?
35 | LinkKind.SAME : LinkKind.DIFFERENT, 0.0));
36 |
37 | line = reader.readLine();
38 | }
39 | reader.close();
40 |
41 | return links;
42 | }
43 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/Database.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke;
3 |
4 | import java.util.Collection;
5 |
6 | /**
7 | * Used to store and index records for later matching.
8 | */
9 | public interface Database {
10 |
11 | /**
12 | * Returns true iff the database is held entirely in memory, and
13 | * thus is not persistent.
14 | */
15 | public boolean isInMemory();
16 |
17 | /**
18 | * Add the record to the index.
19 | */
20 | public void index(Record record);
21 |
22 | /**
23 | * Flushes all changes to disk. For in-memory databases this is a
24 | * no-op.
25 | */
26 | public void commit();
27 |
28 | /**
29 | * Look up record by identity.
30 | */
31 | public Record findRecordById(String id);
32 |
33 | /**
34 | * Look up potentially matching records. This method must be
35 | * thread-safe.
36 | */
37 | public Collection findCandidateMatches(Record record);
38 |
39 | /**
40 | * Stores state to disk and closes all open resources.
41 | */
42 | public void close();
43 |
44 | /**
45 | * Gives the database its configuration (called by Duke framework).
46 | * @since 1.2
47 | */
48 | public void setConfiguration(Configuration config);
49 |
50 | /**
51 | * Sets whether or not to overwrite any existing index (called by
52 | * Duke framework).
53 | * @since 1.2
54 | */
55 | public void setOverwrite(boolean overwrite);
56 | }
57 |
--------------------------------------------------------------------------------
/duke-core/src/test/java/no/priv/garshol/duke/comparators/WeightedLevenshteinTest.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.comparators;
3 |
4 | import org.junit.Before;
5 | import org.junit.Test;
6 |
7 | import static junit.framework.Assert.assertEquals;
8 |
9 | public class WeightedLevenshteinTest {
10 | private WeightedLevenshtein.DefaultWeightEstimator e;
11 |
12 | @Before
13 | public void setup() {
14 | e = new WeightedLevenshtein.DefaultWeightEstimator();
15 | }
16 |
17 | @Test
18 | public void testEmpty() {
19 | assertEquals(0.0, WeightedLevenshtein.distance("", "", e));
20 | }
21 |
22 | @Test
23 | public void testEmpty1() {
24 | e.setDigitWeight(1.0);
25 | assertEquals(1.0, WeightedLevenshtein.distance("", "1", e));
26 | }
27 |
28 | @Test
29 | public void testEmpty2() {
30 | e.setDigitWeight(2.0);
31 | assertEquals(2.0, WeightedLevenshtein.distance("1", "", e));
32 | }
33 |
34 | @Test
35 | public void testSubstitute1() {
36 | e.setDigitWeight(2.0);
37 | assertEquals(2.0, WeightedLevenshtein.distance("titanic 1", "titanic 2", e));
38 | }
39 |
40 | @Test
41 | public void testSubstitute2() {
42 | e.setDigitWeight(2.0);
43 | assertEquals(3.0, WeightedLevenshtein.distance("totanic 1", "titanic 2", e));
44 | }
45 |
46 | @Test
47 | public void testComparator() {
48 | WeightedLevenshtein comp = new WeightedLevenshtein();
49 | assertEquals(0.0, comp.compare("1", ""));
50 | }
51 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/cleaners/MappingFileCleaner.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.cleaners;
3 |
4 | import java.util.Map;
5 | import java.util.HashMap;
6 | import java.io.FileReader;
7 | import java.io.IOException;
8 |
9 | import no.priv.garshol.duke.Cleaner;
10 | import no.priv.garshol.duke.DukeException;
11 | import no.priv.garshol.duke.utils.CSVReader;
12 |
13 | // FIXME: we may also want an option to allow unmapped values to be
14 | // returned as is (or even via the sub-cleaner)
15 |
16 | /**
17 | * A cleaner which loads a mapping file in CSV format and maps values
18 | * according to that file.
19 | * @since 0.5
20 | */
21 | public class MappingFileCleaner implements Cleaner {
22 | private Map mapping;
23 |
24 | public String clean(String value) {
25 | String newvalue = mapping.get(value);
26 | if (newvalue == null)
27 | return value;
28 | return newvalue;
29 | }
30 |
31 | public void setMappingFile(String filename) {
32 | mapping = new HashMap();
33 |
34 | // FIXME: character encoding?
35 | try {
36 | CSVReader csv = new CSVReader(new FileReader(filename));
37 |
38 | String[] row = csv.next();
39 | while (row != null) {
40 | mapping.put(row[0], row[1]);
41 | row = csv.next();
42 | }
43 |
44 | csv.close();
45 | } catch (IOException e) {
46 | throw new DukeException("Error loading mapping file " + filename, e);
47 | }
48 | }
49 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/cleaners/NorwegianCompanyNameCleaner.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.cleaners;
3 |
4 | import no.priv.garshol.duke.utils.StringUtils;
5 |
6 | public class NorwegianCompanyNameCleaner extends AbstractRuleBasedCleaner {
7 | private LowerCaseNormalizeCleaner sub;
8 |
9 | public NorwegianCompanyNameCleaner() {
10 | super();
11 | this.sub = new LowerCaseNormalizeCleaner();
12 |
13 | add("\\s(a/s)(\\s|$)", "as");
14 | add("\\s(a\\\\s)(\\s|$)", "as");
15 | add("^(a/s)\\s", "as");
16 | add("^(a\\\\s)\\s", "as");
17 | add("\\s(a/l)(\\s|$)", "al");
18 | add("^(a/l)\\s", "al");
19 | }
20 |
21 | public String clean(String value) {
22 | // get rid of commas
23 | value = StringUtils.replaceAnyOf(value, ",().-_", ' ');
24 |
25 | // do basic cleaning
26 | value = sub.clean(value);
27 | if (value == null || value.equals(""))
28 | return "";
29 |
30 | // perform pre-registered transforms
31 | value = super.clean(value);
32 |
33 | // renormalize whitespace, since being able to replace tokens with spaces
34 | // makes writing transforms easier
35 | value = StringUtils.normalizeWS(value);
36 |
37 | // transforms:
38 | // "as foo bar" -> "foo bar as"
39 | // "al foo bar" -> "foo bar al"
40 | if (value.startsWith("as ") || value.startsWith("al "))
41 | value = value.substring(3) + ' ' + value.substring(0, 2);
42 |
43 | return value;
44 | }
45 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/DataSource.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke;
3 |
4 | /**
5 | * Any class which implements this interface can be used as a data
6 | * source, so you can plug in your own data sources. Configuration
7 | * properties are received as bean setter calls via reflection.
8 | */
9 | public interface DataSource {
10 |
11 | /**
12 | * Return an iterator over all the records in this data source. This
13 | * should preferably not load all records into memory, but instead
14 | * produce them lazily.
15 | */
16 | public RecordIterator getRecords();
17 |
18 | /**
19 | * Gives the data source a logger to report diagnostic information
20 | * to. Ignoring the logger is allowed.
21 | *
22 | *
WARN: This method is experimental. I'm far from certain
23 | * that this is how I want this to work. May go for slf4j logging
24 | * instead, or something similar.
25 | */
26 | public void setLogger(Logger logger);
27 |
28 | /**
29 | * Each {@link no.priv.garshol.duke.DataSource} is responsible of writing
30 | * its XML configuration using provided {@link no.priv.garshol.duke.ConfigWriter}
31 | * instance.
32 | *
Each implementation should start with a specific tag (unique identifier of
33 | * DataSource implementation inside Duke) and close it before returning.
34 | *
35 | *
36 | * @param cw Handler which keep reference to an XML printer.
37 | */
38 | void writeConfig(ConfigWriter cw);
39 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/utils/PropertyUtils.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.utils;
3 |
4 | import java.util.Properties;
5 | import no.priv.garshol.duke.DukeConfigException;
6 |
7 | /**
8 | * Utilities for making Java Properties objects easier to deal with.
9 | */
10 | public class PropertyUtils {
11 |
12 | /**
13 | * Used for getting required properties, will throw an exception if
14 | * the property is not specified.
15 | */
16 | public static String get(Properties props, String name) {
17 | String value = props.getProperty(name);
18 | if (value == null)
19 | throw new DukeConfigException("Required property " + name +
20 | " not specified");
21 | return value;
22 | }
23 |
24 | /**
25 | * Returns the value of an optional property, if the property is
26 | * set. If it is not set defval is returned.
27 | */
28 | public static String get(Properties props, String name, String defval) {
29 | String value = props.getProperty(name);
30 | if (value == null)
31 | value = defval;
32 | return value;
33 | }
34 |
35 | /**
36 | * Returns the value of an optional property, if the property is
37 | * set. If it is not set defval is returned.
38 | */
39 | public static int get(Properties props, String name, int defval) {
40 | String value = props.getProperty(name);
41 | if (value == null)
42 | return defval;
43 | return Integer.parseInt(value);
44 | }
45 |
46 | }
--------------------------------------------------------------------------------
/doc/example-data/namebase.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | 0.8
4 | /Users/larsga/tmp/duke-sdshare/tomcat-lucene-index
5 |
6 |
7 | ID
8 |
9 |
10 |
11 | NameField1
12 | no.priv.garshol.duke.JaroWinkler
13 | 0.4
14 | 0.7
15 |
16 |
17 |
18 | NameField2
19 | no.priv.garshol.duke.JaroWinkler
20 | 0.4
21 | 0.6
22 |
23 |
24 |
25 | NameField3
26 | no.priv.garshol.duke.JaroWinkler
27 | 0.4
28 | 0.55
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
41 |
44 |
47 |
48 |
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/comparators/NumericComparator.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.comparators;
3 |
4 | import no.priv.garshol.duke.Comparator;
5 |
6 | /**
7 | * Comparator which compares two values numerically. The similarity is
8 | * the ratio of the smaller number to the greater number, if both
9 | * numbers are either negative or positive. If one is negative and the
10 | * other positive, the similarity is 0.0.
11 | */
12 | public class NumericComparator implements Comparator {
13 | private double minratio;
14 |
15 | public boolean isTokenized() {
16 | return false;
17 | }
18 |
19 | public void setMinRatio(double minratio) {
20 | this.minratio = minratio;
21 | }
22 |
23 | public double compare(String v1, String v2) {
24 | double d1;
25 | double d2;
26 | try {
27 | d1 = Double.parseDouble(v1);
28 | d2 = Double.parseDouble(v2);
29 | } catch (NumberFormatException e) {
30 | return 0.5; // we just ignore this. whether it's wise I'm not sure
31 | }
32 |
33 | // if they're both zero, they're equal
34 | if (d1 == 0.0 && d2 == 0.0)
35 | return 1.0;
36 |
37 | // if both are negative, flip the signs
38 | if (d1 < 0.0 && d2 < 0.0) {
39 | d1 *= -1.0;
40 | d2 *= -1.0;
41 | }
42 |
43 | if (d2 < d1) {
44 | double tmp = d2;
45 | d2 = d1;
46 | d1 = tmp;
47 | }
48 |
49 | double ratio = d1 / d2;
50 | if (ratio < minratio)
51 | return 0.0;
52 | else
53 | return ratio;
54 | }
55 |
56 | }
57 |
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/cleaners/NorwegianAddressCleaner.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.cleaners;
3 |
4 | public class NorwegianAddressCleaner extends AbstractRuleBasedCleaner {
5 | private LowerCaseNormalizeCleaner sub;
6 |
7 | public NorwegianAddressCleaner() {
8 | super();
9 | this.sub = new LowerCaseNormalizeCleaner();
10 |
11 | add("^(co/ ?)", "c/o ");
12 | add("^(c\\\\o)", "c/o");
13 | add("[A-Za-z]+(g\\.) [0-9]+", "gata");
14 | add("[A-Za-z]+ (gt?\\.?) [0-9]+", "gate");
15 | add("[A-Za-z]+(v\\.) [0-9]+", "veien");
16 | add("[A-Za-z]+ (v\\.?) [0-9]+", "vei");
17 | add("[A-Za-z]+(vn\\.?)[0-9]+", "veien ");
18 | add("[A-Za-z]+(vn\\.?) [0-9]+", "veien");
19 | add("[A-Za-z]+(gt\\.?) [0-9]+", "gata");
20 | add("[A-Za-z]+(gaten) [0-9]+", "gata");
21 | add("(\\s|^)(pb\\.?) [0-9]+", "postboks", 2);
22 | add("(\\s|^)(boks) [0-9]+", "postboks", 2);
23 | add("[A-Za-z]+ [0-9]+(\\s+)[A-Za-z](\\s|$)", "");
24 | add("[A-Za-z]+(gata|veien)()[0-9]+[a-z]?(\\s|$)", " ");
25 |
26 | // FIXME: not sure about the following rules
27 | add("postboks\\s+[0-9]+(\\s*-\\s*)", " ");
28 | }
29 |
30 | public String clean(String value) {
31 | // get rid of commas
32 | value = value.replace(',', ' ');
33 |
34 | // do basic cleaning
35 | value = sub.clean(value);
36 | if (value == null || value.equals(""))
37 | return value;
38 |
39 | // perform pre-registered transforms
40 | value = super.clean(value);
41 |
42 | return value;
43 | }
44 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/cleaners/HTMLCleaner.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.cleaners;
3 |
4 | import java.util.Map;
5 | import java.util.HashMap;
6 | import no.priv.garshol.duke.Cleaner;
7 |
8 | /**
9 | * A cleaner that removes HTML-style entity references, such as
10 | * Þ and —.
11 | * @since 1.3
12 | */
13 | public class HTMLCleaner implements Cleaner {
14 | private static Map entities;
15 |
16 | static {
17 | entities = new HashMap();
18 | entities.put("mdash", "\u2014");
19 | }
20 |
21 | public String clean(String value) {
22 | StringBuilder buf = new StringBuilder(value.length());
23 | for (int ix = 0; ix < value.length(); ix++) {
24 | char ch = value.charAt(ix);
25 | if (ch != '&') {
26 | buf.append(ch);
27 | continue;
28 | }
29 |
30 | ch = value.charAt(++ix);
31 | if (ch == '#') {
32 | ix++;
33 | if (value.charAt(ix) == 'x')
34 | throw new UnsupportedOperationException("Don't support ...;");
35 | int pos = ix;
36 | for (; ix < value.length() && value.charAt(ix) != ';'; ix++)
37 | ;
38 | ch = (char) Integer.parseInt(value.substring(pos, ix));
39 | buf.append(ch);
40 | } else {
41 | int pos = ix;
42 | for (; ix < value.length() && value.charAt(ix) != ';'; ix++)
43 | ;
44 | String v = entities.get(value.substring(pos, ix));
45 | buf.append(v);
46 | }
47 | }
48 | return buf.toString();
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/comparators/DiceCoefficientComparator.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.comparators;
3 |
4 | import no.priv.garshol.duke.Comparator;
5 | import no.priv.garshol.duke.utils.StringUtils;
6 |
7 | /**
8 | * An implementation of the Dice coefficient using exact matching by
9 | * default, but can be overridden to use any sub-comparator.
10 | */
11 | public class DiceCoefficientComparator implements Comparator {
12 | private Comparator subcomp;
13 |
14 | public DiceCoefficientComparator() {
15 | this.subcomp = new ExactComparator();
16 | }
17 |
18 | public void setComparator(Comparator comp) {
19 | this.subcomp = comp;
20 | }
21 |
22 | public boolean isTokenized() {
23 | return true;
24 | }
25 |
26 | public double compare(String s1, String s2) {
27 | if (s1.equals(s2))
28 | return 1.0;
29 |
30 | // tokenize
31 | String[] t1 = StringUtils.split(s1);
32 | String[] t2 = StringUtils.split(s2);
33 |
34 | // ensure that t1 is shorter than or same length as t2
35 | if (t1.length > t2.length) {
36 | String[] tmp = t2;
37 | t2 = t1;
38 | t1 = tmp;
39 | }
40 |
41 | // find best matches for each token in t1
42 | double sum = 0;
43 | for (int ix1 = 0; ix1 < t1.length; ix1++) {
44 | double highest = 0;
45 | for (int ix2 = 0; ix2 < t2.length; ix2++)
46 | highest = Math.max(highest, subcomp.compare(t1[ix1], t2[ix2]));
47 | sum += highest;
48 | }
49 |
50 | return (sum * 2) / (t1.length + t2.length);
51 | }
52 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/LinkDatabase.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke;
3 |
4 | import java.util.Collection;
5 |
6 | /**
7 | * A LinkDatabase is a class which can keep track of links between
8 | * entities.
9 | */
10 | public interface LinkDatabase {
11 |
12 | /**
13 | * Returns all links modified since the given time.
14 | */
15 | public Collection getChangesSince(long since);
16 |
17 | /**
18 | * Get all links.
19 | */
20 | public Collection getAllLinks();
21 |
22 | /**
23 | * Get all links for this identity. If there are no links it returns
24 | * an empty collection, never null.
25 | */
26 | public Collection getAllLinksFor(String id);
27 |
28 | /**
29 | * Assert a link.
30 | */
31 | public void assertLink(Link link);
32 |
33 | /**
34 | * Can we work out, based on what we know, the relationship between
35 | * these two? Returns null if we don't know the relationship.
36 | */
37 | public Link inferLink(String id1, String id2);
38 |
39 | /**
40 | * Verifies that we still have a connection to the database, and
41 | * reestablishes it, if not. Useful when connections live a long
42 | * time and are rarely used.
43 | */
44 | public void validateConnection();
45 |
46 | /**
47 | * Commit asserted links to persistent store.
48 | */
49 | public void commit();
50 |
51 | /**
52 | * Removes all links from the database.
53 | */
54 | public void clear();
55 |
56 | /**
57 | * Shuts down the database, releasing resources.
58 | */
59 | public void close();
60 | }
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/EquivalenceClassDatabase.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke;
3 |
4 | import java.util.Iterator;
5 | import java.util.Collection;
6 |
7 | // FIXME: if we are going to implement retraction we will need
8 | // something like a linkdatabase as backing. probably equiv dbs need
9 | // to be aware of the linkdatabase anyway, in order to avoid known bad
10 | // links and make use of extra known links not inferred from data etc.
11 |
/**
 * A tool for collecting matching records into groups where all
 * records are considered to match. Note that this means treating the
 * matching relation between records as transitive, which in practice
 * it is not.
 */
public interface EquivalenceClassDatabase {

  /**
   * Returns the number of equivalence classes in the database.
   */
  public int getClassCount();

  /**
   * Returns an iterator over all the classes in the database. Each
   * element is one equivalence class, given as the collection of the
   * record IDs it contains.
   */
  public Iterator<Collection<String>> getClasses();

  /**
   * Get all records linked to the given record (that is, all records
   * in the same equivalence class as the given record).
   * @param id the ID of a record
   * @return Always returns a collection, but it may be empty.
   */
  public Collection<String> getClass(String id);

  /**
   * Add a new link between two records.
   * @param id1 the ID of one record
   * @param id2 the ID of the other record
   */
  public void addLink(String id1, String id2);

  /**
   * Commit changes made to persistent store.
   */
  public void commit();

}
--------------------------------------------------------------------------------
/duke-core/src/main/java/no/priv/garshol/duke/matchers/MatchListener.java:
--------------------------------------------------------------------------------
1 |
2 | package no.priv.garshol.duke.matchers;
3 |
4 | import java.util.Collection;
5 |
6 | import no.priv.garshol.duke.Record;
7 |
/**
 * Callback interface for code that wants to be told about the
 * outcome of record matching.
 *
 * <p>When Duke runs with multiple threads, implementations of
 * matches(), matchesPerhaps(), and noMatchFor() need to be
 * thread-safe.
 */
public interface MatchListener {

  /**
   * Called when Duke is about to process a new batch of records.
   *
   * @param size the size reported for the batch
   */
  void batchReady(int size);

  /**
   * Called when Duke has finished processing a batch of records.
   */
  void batchDone();

  /**
   * Called when the two records are considered to match.
   *
   * @param r1 the first record
   * @param r2 the second record
   * @param confidence the confidence reported for the match
   */
  void matches(Record r1, Record r2, double confidence);

  /**
   * Called when the two records might match.
   *
   * @param r1 the first record
   * @param r2 the second record
   * @param confidence the confidence reported for the possible match
   */
  void matchesPerhaps(Record r1, Record r2, double confidence);

  /**
   * Called if no link is found for the record.
   *
   * @param record the record without a match
   */
  void noMatchFor(Record record);

  /**
   * Called when the processing run is beginning.
   */
  void startProcessing();

  /**
   * Called when this processing run is over.
   */
  void endProcessing();
}
--------------------------------------------------------------------------------
/doc/tutorials/2011_05_data-cleansing.textile:
--------------------------------------------------------------------------------
1 | h1. Duke'em - data cleansing in the Linked Data publishing process
2 |
3 | Authors: Michael Hausenblas and Lars Marius Garshol
4 |
5 | I'm going to show you how you can do data cleansing as part of the "Linked Data publishing process":http://linkeddatabook.com/editions/1.0/#htoc62, based on an open source tool called "Duke":http://code.google.com/p/duke/.
6 |
7 | h2. What is Duke?
8 |
9 | Duke is a fast and flexible deduplication engine, written in Java on top of Apache "Lucene":http://lucene.apache.org/. The current implementation allows a throughput of 1500 records/sec single-threaded, on a commodity machine.
10 |
11 | h2. STEP1: Prepare your data source
12 |
13 | For demonstration purposes we will use a CSV dump from "NameBase":http://www.namebase.org/csvdump.html containing some 140k records:
14 |
15 | bq. NameBase is a cumulative index of the names of individuals, corporations, and groups compiled from 800 investigative books published since 1962, and thousands of pages from periodicals since 1973. Areas covered include the international intelligence community, political elites from the Right and Left, the U.S. foreign policy establishment, assassinations and political scandals, Latin America, big business, and organized crime.
16 |
17 | Now, the structure of the NameBase data source is as follows:
18 |