surroundingMentionNames;
15 |
16 | private int id = -1;
17 |
18 |
19 | public static final String NO_MATCHING_ENTITY = "--NME--";
20 |
21 | /**
22 | * Use this field to represent the mention-entity similarity computed with
23 | * some method (not the score stored in the DB). This field will not be set
24 | * in the constructor. We set it later on, when we compute the similarity
25 | */
26 | private double mentionEntitySimilarity;
27 |
/**
 * Creates an entity with the given name and internal numeric id.
 * The mention-entity similarity starts at -1.0, meaning "not yet computed"
 * (see the field comment above).
 */
public Entity(String name, int id) {
  this.name = name;
  this.mentionEntitySimilarity = -1.0;
  this.id = id;
}
33 |
/** @return the entity name identifier. */
public String getName() {
  return name;
}
37 |
38 | public String toString() {
39 | return name + " (" + id + ")";
40 | }
41 |
/**
 * Renders the entity name as an HTML-escaped table-cell fragment
 * (escaping delegated to Char.toHTML).
 * NOTE(review): method name breaks camelCase ("toHtmlString"); renaming
 * would require updating all callers.
 */
public String tohtmlString() {
  return " | | " + Char.toHTML(name) + " | | | | ";
}
45 |
/** @return internal numeric id of the entity; -1 if never assigned. */
public int getId() {
  return id;
}
49 |
/**
 * @return the runtime-computed mention-entity similarity;
 *         -1.0 until {@link #setMentionEntitySimilarity(double)} is called.
 */
public double getMentionEntitySimilarity() {
  return this.mentionEntitySimilarity;
}
53 |
/** Stores the similarity computed for this entity's mention (not the DB score). */
public void setMentionEntitySimilarity(double mes) {
  this.mentionEntitySimilarity = mes;
}
57 |
/**
 * Orders entities alphabetically by name. Consistent with equals() and
 * hashCode(), which also use only the name; the id is ignored.
 */
public int compareTo(Entity e) {
  return name.compareTo(e.getName());
}
61 |
62 | public boolean equals(Object o) {
63 | if (o instanceof Entity) {
64 | Entity e = (Entity) o;
65 | return name.equals(e.getName());
66 | } else {
67 | return false;
68 | }
69 | }
70 |
/** Hash on the name only, consistent with equals(). */
public int hashCode() {
  return name.hashCode();
}
74 |
/**
 * @return true if this entity is a "no matching entity" placeholder
 *         (delegates the name check to Entities.isNMEName).
 */
public boolean isNMEentity() {
  return Entities.isNMEName(name);
}
78 |
79 | public String getNMEnormalizedName() {
80 | String normName = name.replace("-"+NO_MATCHING_ENTITY, "").replace(' ', '_');
81 | return normName;
82 | }
83 |
/**
 * @return names of mentions occurring around this entity's mention
 *         (element type undeclared here -- presumably String; confirm).
 */
public List getSurroundingMentionNames() {
  return surroundingMentionNames;
}
87 |
/** Stores the mention names surrounding this entity's mention. */
public void setSurroundingMentionNames(List surroundingMentionNames) {
  this.surroundingMentionNames = surroundingMentionNames;
}
91 | }
92 |
--------------------------------------------------------------------------------
/src/mpi/aida/data/Keyphrases.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.data;
2 |
3 | import gnu.trove.map.hash.TIntDoubleHashMap;
4 | import gnu.trove.map.hash.TIntObjectHashMap;
5 |
6 | /**
7 | * Holds all the keyphrase data describing a set of entities.
8 | *
9 | *
10 | */
11 | public class Keyphrases {
12 |
13 | private TIntObjectHashMap entityKeyphrases;
14 | private TIntObjectHashMap keyphraseTokens;
15 | private TIntObjectHashMap entity2keyphrase2mi;
16 | private TIntObjectHashMap entity2keyword2mi;
17 |
18 | public void setEntityKeyphrases(TIntObjectHashMap entityKeyphrases) {
19 | this.entityKeyphrases = entityKeyphrases;
20 | }
21 |
22 | public void setKeyphraseTokens(TIntObjectHashMap keyphraseTokens) {
23 | this.keyphraseTokens = keyphraseTokens;
24 | }
25 |
26 | public void setEntityKeyphraseWeights(
27 | TIntObjectHashMap entity2keyphrase2mi) {
28 | this.entity2keyphrase2mi = entity2keyphrase2mi;
29 | }
30 |
31 | public void setEntityKeywordWeights(
32 | TIntObjectHashMap entity2keyword2mi) {
33 | this.entity2keyword2mi = entity2keyword2mi;
34 | }
35 |
36 | public TIntObjectHashMap getEntityKeyphrases() {
37 | return entityKeyphrases;
38 | }
39 |
40 | public TIntObjectHashMap getKeyphraseTokens() {
41 | return keyphraseTokens;
42 | }
43 |
44 | public TIntObjectHashMap getEntityKeywordWeights() {
45 | return entity2keyword2mi;
46 | }
47 |
48 | public TIntObjectHashMap getEntityKeyphraseWeights() {
49 | return entity2keyphrase2mi;
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/src/mpi/aida/data/Mention.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.data;
2 |
3 | import java.io.Serializable;
4 |
5 | public class Mention implements Serializable, Comparable {
6 |
7 | private static final long serialVersionUID = 3177945435296705498L;
8 |
9 | private String mention;
10 |
11 | /** Starting token offset of the mention. */
12 | private int startToken;
13 |
14 | /** Ending token offset of the mention (including this token). */
15 | private int endToken;
16 |
17 | private int startStanford;
18 |
19 | private int endStanford;
20 |
21 | private int sentenceId;
22 |
23 | private String groundTruthEntity = null;
24 |
25 | private double disambiguationConfidence;
26 |
27 | // Character offset
28 | private int charOffset, charLength;
29 |
30 | private Entities candidateEntities;
31 |
32 | private int id = -1;
33 |
34 | public Mention() {
35 | }
36 |
37 | public Mention(String mention, int startToken, int endToken, int startStanford, int endStanford, int sentenceId) {
38 | this.startToken = startToken;
39 | this.endToken = endToken;
40 | this.startStanford = startStanford;
41 | this.endStanford = endStanford;
42 | this.mention = mention;
43 | this.sentenceId = sentenceId;
44 | }
45 |
46 | public String getMention() {
47 | return mention;
48 | }
49 |
50 | public int getStartToken() {
51 | return startToken;
52 | }
53 |
54 | public int getEndToken() {
55 | return endToken;
56 | }
57 |
58 | public int getStartStanford() {
59 | return startStanford;
60 | }
61 |
62 | public int getEndStanford() {
63 | return endStanford;
64 | }
65 |
66 | public int getSentenceId() {
67 | return sentenceId;
68 | }
69 |
70 | public void setSentenceId(int sentenceId) {
71 | this.sentenceId = sentenceId;
72 | }
73 |
74 | public void addCandidateEntity(Entity entity) {
75 | candidateEntities.add(entity);
76 | }
77 |
78 | public Entities getCandidateEntities() {
79 | return candidateEntities;
80 | }
81 |
82 | public void setCandidateEntities(Entities candidateEntities) {
83 | this.candidateEntities = candidateEntities;
84 | }
85 |
86 | public String toString() {
87 | return mention + ", From:" + startToken + "/" + startStanford + ", To:" + endToken + "/" + endStanford + ", Offset: " + charOffset + ", Length: " + charLength;
88 | }
89 |
90 | public void setStartToken(int start) {
91 | this.startToken = start;
92 | }
93 |
94 | public void setEndToken(int end) {
95 | this.endToken = end;
96 | }
97 |
98 | public int getCharOffset() {
99 | return this.charOffset;
100 | }
101 |
102 | public int getCharLength() {
103 | return this.charLength;
104 | }
105 |
106 | public void setCharOffset(int offset) {
107 | this.charOffset = offset;
108 |
109 | }
110 |
111 | public void setCharLength(int length) {
112 | this.charLength = length;
113 | }
114 |
115 | public void setMention(String mention) {
116 | this.mention = mention;
117 | }
118 |
119 | @Override
120 | public boolean equals(Object obj) {
121 | if (obj instanceof Mention) {
122 | Mention m = (Mention) obj;
123 |
124 | return m.getMention().equals(getMention()) && m.getCharOffset() == charOffset;
125 | } else {
126 | return false;
127 | }
128 | }
129 |
130 | @Override
131 | public int hashCode() {
132 | return mention.hashCode() + charOffset;
133 | }
134 |
135 | @Override
136 | public int compareTo(Mention mention) {
137 | return this.charOffset - mention.charOffset;
138 | }
139 |
140 | public void setGroundTruthResult(String result) {
141 | this.groundTruthEntity = result;
142 | }
143 |
144 | public String getGroundTruthResult() {
145 | return groundTruthEntity;
146 | }
147 |
148 | public void setDisambiguationConfidence(double confidence) {
149 | disambiguationConfidence = confidence;
150 | }
151 |
152 | public double getDisambiguationConfidence() {
153 | return disambiguationConfidence;
154 | }
155 |
156 | public int getId() {
157 | return id;
158 | }
159 |
160 | public void setId(int id) {
161 | this.id = id;
162 | }
163 |
164 | public void setStartStanford(int startStanford) {
165 | this.startStanford = startStanford;
166 | }
167 |
168 | public void setEndStanford(int endStanford) {
169 | this.endStanford = endStanford;
170 | }
171 |
172 | public String getIdentifiedRepresentation() {
173 | return mention + ":::" + charOffset;
174 | }
175 | }
176 |
--------------------------------------------------------------------------------
/src/mpi/aida/data/Mentions.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.data;
2 |
3 | import java.io.Serializable;
4 | import java.util.ArrayList;
5 | import java.util.Collections;
6 | import java.util.HashMap;
7 | import java.util.LinkedList;
8 | import java.util.List;
9 |
10 | public class Mentions implements Serializable {
11 |
12 | private static final long serialVersionUID = -383105468450056989L;
13 |
14 | private List mentions = null;
15 |
16 | private HashMap subStrings = null;
17 |
18 | /**
19 | * The expected types for entities to which those mentions will be disambiguated
20 | */
21 | private List entitiesTypes = null;
22 |
23 | public Mentions() {
24 | mentions = new LinkedList();
25 | }
26 |
27 | public boolean containsOffset(int offset) {
28 | for (Mention mention : mentions) {
29 | if (mention.getCharOffset() == offset) {
30 | return true;
31 | }
32 | }
33 | return false;
34 | }
35 |
36 | public Mention getMentionForOffset(int offset) {
37 | for (Mention mention : mentions) {
38 | if (mention.getCharOffset() == offset) {
39 | return mention;
40 | }
41 | }
42 | return null;
43 | }
44 |
45 | public void addMention(Mention mention) {
46 | mentions.add(mention);
47 | }
48 |
49 | public List getMentions() {
50 | return mentions;
51 | }
52 |
53 | public ArrayList getMentionTokenStanfordIndices()
54 | {
55 | ArrayList mentionTokenIndices = new ArrayList();
56 | // there's just one
57 | for (Mention mention : mentions)
58 | {
59 | for (int i=mention.getStartStanford();i<=mention.getEndStanford();i++)
60 | mentionTokenIndices.add(i);
61 | }
62 | return mentionTokenIndices;
63 | }
64 |
65 | public int getMentionTokenSentenceIndex()
66 | {
67 | // there's just one
68 | return mentions.get(0).getSentenceId();
69 | }
70 |
71 | public boolean remove(Mention mention) {
72 | return mentions.remove(mention);
73 | }
74 |
75 | public String toString() {
76 | StringBuffer sb = new StringBuffer(200);
77 | for (int i = 0; i < mentions.size(); i++) {
78 | sb.append(mentions.get(i).toString()).append('\n');
79 | }
80 | return sb.toString();
81 | }
82 |
83 | public void setSubstring(HashMap subStrings) {
84 | this.subStrings = subStrings;
85 | }
86 |
87 | public HashMap getSubstrings() {
88 | return subStrings;
89 | }
90 |
91 | public void sortMentions() {
92 | Collections.sort(mentions);
93 | }
94 |
95 | public List getEntitiesTypes() {
96 | return entitiesTypes;
97 | }
98 |
99 | public void setEntitiesTypes(List entitiesTypes) {
100 | this.entitiesTypes = entitiesTypes;
101 | }
102 | }
103 |
--------------------------------------------------------------------------------
/src/mpi/aida/data/PreparedInput.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.data;
2 |
3 | import mpi.tokenizer.data.Tokens;
4 |
5 | public class PreparedInput {
6 |
7 | private String docId;
8 |
9 | private Tokens tokens;
10 |
11 | /** Used by the local similarity methods in the disambiguation. It holds
12 | * the document tokens both as strings and converted to word ids. */
13 | private Context context;
14 |
15 | private Mentions mentions;
16 |
17 | public PreparedInput(String docId) {
18 | this.docId = docId;
19 | }
20 |
21 | public PreparedInput(String docId, Tokens tokens, Mentions mentions) {
22 | this.docId = docId;
23 | this.tokens = tokens;
24 | this.mentions = mentions;
25 | context = createContextFromTokens(tokens);
26 | }
27 |
28 | public Tokens getTokens() {
29 | return tokens;
30 | }
31 |
32 | public void setTokens(Tokens tokens) {
33 | this.tokens = tokens;
34 | context = createContextFromTokens(tokens);
35 | }
36 |
37 | public Mentions getMentions() {
38 | return mentions;
39 | }
40 |
41 | public void setMentions(Mentions mentions) {
42 | this.mentions = mentions;
43 | }
44 |
45 | public Context getContext() {
46 | return context;
47 | }
48 |
49 | private Context createContextFromTokens(Tokens t) {
50 | return new Context(t);
51 | }
52 |
53 | public String getDocId() {
54 | return docId;
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/src/mpi/aida/data/ResultEntity.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.data;
2 |
3 | import java.io.Serializable;
4 | import java.text.NumberFormat;
5 | import java.util.ArrayList;
6 | import java.util.List;
7 | import java.util.Locale;
8 |
9 | /**
10 | * Entity the was assigned to a ResultMention.
11 | * The entity String is the identifier in YAGO2
12 | * (see http://www.yago-knowledge.org)
13 | *
14 | *
15 | */
16 | public class ResultEntity implements Comparable, Serializable {
17 |
18 | private static final long serialVersionUID = -7062155406718136994L;
19 |
20 | /** YAGO2 identifier of the entity (http://www.yago-knowledge.org) */
21 | private String entity;
22 |
23 | /** Score assigned to the entity */
24 | private double disambiguationScore;
25 |
26 | public ResultEntity(String entity, double disambiguationScore) {
27 | super();
28 | this.entity = entity;
29 | this.disambiguationScore = disambiguationScore;
30 | }
31 |
32 | public static ResultEntity getNoMatchingEntity() {
33 | return new ResultEntity(Entity.NO_MATCHING_ENTITY, 0.0);
34 | }
35 |
36 | public static List getResultEntityAsList(ResultEntity re) {
37 | List res = new ArrayList(1);
38 | res.add(re);
39 | return res;
40 | }
41 |
42 | /**
43 | * @return YAGO2 identifier of the entity (http://www.yago-knowledge.org)
44 | */
45 | public String getEntity() {
46 | return entity;
47 | }
48 |
49 | public void setEntity(String entity) {
50 | this.entity = entity;
51 | }
52 |
53 | public double getDisambiguationScore() {
54 | return disambiguationScore;
55 | }
56 |
57 | public void setDisambiguationScore(double disambiguationScore) {
58 | this.disambiguationScore = disambiguationScore;
59 | }
60 |
61 | public boolean isNoMatchingEntity() {
62 | return entity.equals(Entity.NO_MATCHING_ENTITY);
63 | }
64 |
65 | @Override
66 | public int compareTo(ResultEntity re) {
67 | // natural ordering for ResultEntities is descending
68 | return new Double(new Double(re.getDisambiguationScore())).compareTo(disambiguationScore);
69 | }
70 |
71 | public String toString() {
72 | NumberFormat df = NumberFormat.getInstance(Locale.ENGLISH);
73 | df.setMaximumFractionDigits(5);
74 | return entity + " (" + df.format(disambiguationScore) + ")";
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
/src/mpi/aida/data/ResultMention.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.data;
2 |
3 | import java.io.Serializable;
4 |
5 | import org.slf4j.Logger;
6 | import org.slf4j.LoggerFactory;
7 |
8 | /**
9 | * Mention detected in the input text. It is identified uniquely
10 | * by the combination of the three members docId+mention+characterOffset.
11 | *
12 | *
13 | */
14 | public class ResultMention implements Comparable, Serializable {
15 | private static final Logger logger =
16 | LoggerFactory.getLogger(ResultMention.class);
17 |
18 | private static final long serialVersionUID = -6791087404868641006L;
19 |
20 | private String docId;
21 |
22 | private String mention;
23 |
24 | private int characterOffset;
25 |
26 | private int characterLength;
27 |
28 | public ResultMention(String docId, String mention, int characterOffset, int characterLength) {
29 | super();
30 | this.docId = docId;
31 | this.mention = mention;
32 | this.characterOffset = characterOffset;
33 | this.characterLength = characterLength;
34 | }
35 |
36 | public String getDocId() {
37 | return docId;
38 | }
39 |
40 | public void setDocId(String docId) {
41 | this.docId = docId;
42 | }
43 |
44 | public String getMention() {
45 | return mention;
46 | }
47 |
48 | public void setMention(String mention) {
49 | this.mention = mention;
50 | }
51 |
52 | public int getCharacterOffset() {
53 | return characterOffset;
54 | }
55 |
56 | public void setCharacterOffset(int characterOffset) {
57 | this.characterOffset = characterOffset;
58 | }
59 |
60 | public int getCharacterLength() {
61 | return characterLength;
62 | }
63 |
64 | public void setCharacterLength(int characterLength) {
65 | this.characterLength = characterLength;
66 | }
67 |
68 | public static ResultMention getResultMentionFromMentionString(String docId, String mentionString) {
69 | String[] data = mentionString.split(":::");
70 |
71 | if (data.length < 3) {
72 | logger.error("Could not create ResultMention from mentionString: " + mentionString);
73 | return null;
74 | }
75 |
76 | String mention = data[0];
77 | int characterOffset = Integer.parseInt(data[1]);
78 | int characterLength = Integer.parseInt(data[2]);
79 |
80 | ResultMention rm = new ResultMention(docId, mention, characterOffset, characterLength);
81 | return rm;
82 | }
83 |
84 | @Override
85 | public boolean equals(Object o) {
86 | if (o instanceof ResultMention) {
87 | ResultMention rm = (ResultMention) o;
88 | return (docId.equals(rm.getDocId()) && mention.equals(rm.getMention()) && characterOffset == rm.getCharacterOffset());
89 | } else {
90 | return false;
91 | }
92 | }
93 |
94 | @Override
95 | public int hashCode() {
96 | return docId.hashCode() + mention.hashCode() + characterOffset;
97 | }
98 |
99 | @Override
100 | public int compareTo(ResultMention rm) {
101 | int result = docId.compareTo(rm.getDocId());
102 |
103 | if (result == 0) {
104 | result = new Integer(characterOffset).compareTo(new Integer(rm.getCharacterOffset()));
105 | }
106 |
107 | return result;
108 | }
109 |
110 | public String toString() {
111 | return "[" + docId + "] " + mention + " (" + characterOffset + "/" + characterLength + ")";
112 | }
113 | }
114 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/GraphNode.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph;
2 |
3 | import gnu.trove.map.hash.TIntDoubleHashMap;
4 |
5 | public class GraphNode {
6 |
7 | private int id;
8 | private GraphNodeTypes type;
9 | private Object NodeData = null;
10 | private TIntDoubleHashMap successors;
11 |
12 | public GraphNode() {
13 | successors = new TIntDoubleHashMap();
14 | }
15 |
16 | public int getId() {
17 | return id;
18 | }
19 | public void setId(int id) {
20 | this.id = id;
21 | }
22 | public GraphNodeTypes getType() {
23 | return type;
24 | }
25 | public void setType(GraphNodeTypes type) {
26 | this.type = type;
27 | }
28 | public Object getNodeData() {
29 | return NodeData;
30 | }
31 | public void setNodeData(Object nodeData) {
32 | NodeData = nodeData;
33 | }
34 | public TIntDoubleHashMap getSuccessors() {
35 | return successors;
36 | }
37 | public void setSuccessors(TIntDoubleHashMap successors) {
38 | this.successors = successors;
39 | }
40 |
41 | }
42 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/GraphNodeTypes.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph;
2 |
/** The two node categories in the disambiguation graph. */
public enum GraphNodeTypes {
  MENTION, ENTITY
}
6 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/algorithms/DisambiguationAlgorithm.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.algorithms;
2 |
3 | import java.util.List;
4 | import java.util.Map;
5 |
6 | import mpi.aida.data.ResultEntity;
7 | import mpi.aida.data.ResultMention;
8 |
9 |
10 | public abstract class DisambiguationAlgorithm {
11 |
12 | public abstract Map> disambiguate() throws Exception;
13 |
14 | }
15 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/algorithms/Node.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.algorithms;
2 |
3 | import java.util.Comparator;
4 |
5 | /**
6 | * Utility class to be used in the implemenation of the shortest-path
7 | * algorithms. We store a node together with its distance, and then we develop a
8 | * comparator that sorts nodes according to their distances
9 | */
/**
 * Utility class to be used in the implementation of the shortest-path
 * algorithms: a node key paired with its current distance, so nodes can be
 * ordered by distance via NodeComparator.
 */
public class Node {

  /** Node identifier; fixed at construction. */
  private final int key;

  /** Current distance; updated while the algorithm relaxes edges. */
  private double distance;

  public Node(int k, double d) {
    key = k;
    distance = d;
  }

  public int getKey() {
    return key;
  }

  public double getDistance() {
    return distance;
  }

  public void setDistance(double d) {
    distance = d;
  }
}
39 |
/**
 * Orders Nodes by distance for the shortest-path priority queues.
 *
 * NOTE(review): the inline comment below claims the *opposite* (descending)
 * order, but the code compares ascending by distance -- confirm which is
 * intended before relying on max-queue behavior.
 * Also note equals(Node, Node) is an overload, not an override of
 * Object.equals(Object)/Comparator.equals; it is never called by
 * PriorityQueue.
 */
class NodeComparator implements Comparator {

  public int compare(Node first, Node second) {

    // I want to use the opposite order, so that I can build a max priority
    // queue using the default
    // implementation of a min priority queue
    Double firstDistance = first.getDistance();
    Double secondDistance = second.getDistance();
    return firstDistance.compareTo(secondDistance);

  }

  /** Key-based equality; intended to keep one node per key in the queue. */
  public boolean equals(Node first, Node second) {

    // I just want only one node with a given key in the priority queue
    if (first.getKey() == second.getKey())
      return true;
    else
      return false;
  }

}
--------------------------------------------------------------------------------
/src/mpi/aida/graph/extraction/DegreeComparator.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.extraction;
2 |
3 | import java.util.Comparator;
4 |
/**
 * Compares ":::"-separated strings by the numeric degree stored in their
 * second field (index 1), in ascending order.
 *
 * NOTE(review): the original comment claimed the *opposite* (descending)
 * order so a min-priority-queue behaves like a max-priority-queue, but the
 * implementation sorts ascending; behavior is kept unchanged -- confirm
 * the intent at the call sites.
 */
public class DegreeComparator implements Comparator<String> {

  @Override
  public int compare(String arg0, String arg1) {
    // Primitive Double.compare avoids the boxed Double allocations of the
    // previous implementation; ordering is identical.
    double firstDegree = Double.parseDouble(arg0.split(":::")[1]);
    double secondDegree = Double.parseDouble(arg1.split(":::")[1]);
    return Double.compare(firstDegree, secondDegree);
  }
}
19 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/extraction/ExtractGraphAllEdges.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.extraction;
2 |
3 | import mpi.aida.data.Entities;
4 | import mpi.aida.data.Mentions;
5 | import mpi.aida.graph.similarity.EnsembleEntityEntitySimilarity;
6 |
7 |
/**
 * ExtractGraph variant that creates edges between all entity pairs instead
 * of only entities of distinct mentions.
 */
public class ExtractGraphAllEdges extends ExtractGraph {

  public ExtractGraphAllEdges(String graphName, Mentions m, Entities ue, EnsembleEntityEntitySimilarity eeSim, double alpha) {
    super(graphName, m, ue, eeSim, alpha);
  }

  /**
   * Always reports the two entities as having distinct mentions, so every
   * entity pair gets an edge.
   * NOTE(review): "haveDistinceMentions" looks like a typo of
   * "haveDistinctMentions"; presumably overrides a method in ExtractGraph
   * (not visible here), so renaming must happen in both places.
   */
  protected boolean haveDistinceMentions(String e1, String e2) {
    return true;
  }
}
18 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/EnsembleEntityEntitySimilarity.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity;
2 |
3 | import java.util.List;
4 |
5 | import mpi.aida.data.Entities;
6 | import mpi.aida.data.Entity;
7 | import mpi.aida.graph.similarity.util.SimilaritySettings;
8 | import mpi.experiment.trace.Tracer;
9 |
/**
 * Combines several entity-entity similarity measures into one weighted score.
 */
public class EnsembleEntityEntitySimilarity {

  // Element type undeclared in this source view -- presumably
  // EntityEntitySimilarity, as iterated in calcSimilarity().
  private List eeSims;

  /**
   * Builds the ensemble from the similarity measures configured in
   * the given settings for the given set of entities.
   */
  public EnsembleEntityEntitySimilarity(Entities uniqueEntities, SimilaritySettings settings, Tracer tracer) throws Exception {
    eeSims = settings.getEntityEntitySimilarities(uniqueEntities, tracer);
  }

  /**
   * Weighted sum over all configured measures: each measure's similarity for
   * (a, b) multiplied by the measure's weight.
   */
  public double calcSimilarity(Entity a, Entity b) throws Exception {
    double weightedSimilarity = 0.0;

    for (EntityEntitySimilarity eeSim : eeSims) {
      double sim = eeSim.calcSimilarity(a, b) * eeSim.getWeight();
      weightedSimilarity += sim;
    }

    return weightedSimilarity;
  }

  public List getEeSims() {
    return eeSims;
  }
}
33 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/MaterializedPriorProbability.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity;
2 |
3 | import gnu.trove.map.hash.TIntDoubleHashMap;
4 |
5 | import java.sql.SQLException;
6 | import java.util.HashMap;
7 | import java.util.Set;
8 |
9 | import mpi.aida.access.DataAccess;
10 |
11 | /**
12 | * This class calculates the prior probability of a mention
13 | * being associated with a given entity. The prior probability is based
14 | * on the occurrence count of links (and their anchor text as mention) with
15 | * a given Wikipedia/YAGO entity as target.
16 | *
17 | * It is faster than {@link PriorProbability} because it uses a table with
18 | * all the priors materialized. To get the table, run the {@link MaterializedPriorProbability}
19 | * main method, it will create another table in the YAGO2 database which can
20 | * then be used by this class.
21 | *
22 | *
23 | */
public class MaterializedPriorProbability extends PriorProbability {

  public MaterializedPriorProbability(Set mentions) throws SQLException {
    super(mentions);
  }

  /**
   * Loads the materialized priors for every (conflated) mention string
   * from the database via DataAccess.getEntityPriors and caches them in
   * the inherited priors map.
   * Note: invoked by the superclass constructor.
   */
  public void setupMentions(Set mentions) throws SQLException {
    priors = new HashMap();
    for (String mention : mentions) {
      mention = conflateMention(mention);
      TIntDoubleHashMap entityPriors = DataAccess.getEntityPriors(mention);
      priors.put(mention, entityPriors);
    }
  }
}
39 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/PriorProbability.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity;
2 |
3 | import gnu.trove.iterator.TIntDoubleIterator;
4 | import gnu.trove.map.hash.TIntDoubleHashMap;
5 |
6 | import java.sql.SQLException;
7 | import java.util.HashMap;
8 | import java.util.Locale;
9 | import java.util.NoSuchElementException;
10 | import java.util.Set;
11 |
12 | import mpi.aida.data.Entity;
13 |
14 | /**
15 | * This class calculates the prior probability of a mention
16 | * being associated with a given entity. The prior probability is based
17 | * on the occurrence count of links (and their anchor text as mention) with
18 | * a given Wikipedia/YAGO entity as target.
19 | *
20 | * The calculation is done on the fly, so it is a bit slow. For a faster implementation,
21 | * use {@link MaterializedPriorProbability}.
22 | *
23 | * It uses the 'hasInternalWikipediaLinkTo' and 'hasAnchorText' relations
24 | * in the YAGO2 database.
25 | *
26 | *
27 | */
28 | public abstract class PriorProbability {
29 |
30 | protected HashMap priors;
31 |
32 | private double weight;
33 |
34 | public PriorProbability(Set mentions) throws SQLException {
35 | setupMentions(mentions);
36 | }
37 |
38 | public double getWeight() {
39 | return weight;
40 | }
41 |
42 | public void setWeight(double weight) {
43 | this.weight = weight;
44 | }
45 |
46 | protected abstract void setupMentions(Set mentions) throws SQLException;
47 |
48 | /**
49 | * Returns the prior probability for the given mention-entity pair.
50 | * If smoothing is true, it will return the lowest prior among all entities if
51 | * there is no real prior.
52 | *
53 | * @param mention
54 | * @param entity
55 | * @param smoothing
56 | * @return
57 | */
58 | public double getPriorProbability(
59 | String mentionText, Entity entity, boolean smoothing) {
60 | mentionText = conflateMention(mentionText);
61 | TIntDoubleHashMap mentionPriors = priors.get(mentionText);
62 |
63 | if (mentionPriors == null) {
64 | throw new NoSuchElementException(
65 | "Mention " + mentionText + " must be passed to constructor!");
66 | }
67 |
68 | double entityPrior = mentionPriors.get(entity.getId());
69 | if (smoothing && entityPrior == 0.0) {
70 | double smallestPrior = 1.0;
71 |
72 | for (TIntDoubleIterator it = mentionPriors.iterator(); it.hasNext();) {
73 | it.advance();
74 | double currentPrior = it.value();
75 | if (currentPrior < smallestPrior) {
76 | smallestPrior = currentPrior;
77 | }
78 | }
79 | entityPrior = smallestPrior;
80 | }
81 |
82 | return entityPrior;
83 | }
84 |
85 | public double getBestPrior(String mentionText) {
86 | mentionText = conflateMention(mentionText);
87 | TIntDoubleHashMap mentionPriors = priors.get(mentionText);
88 |
89 | double bestPrior = 0.0;
90 | for (TIntDoubleIterator it = mentionPriors.iterator(); it.hasNext();) {
91 | it.advance();
92 | double currentPrior = it.value();
93 | if (currentPrior > bestPrior) {
94 | bestPrior = currentPrior;
95 | }
96 | }
97 |
98 | return bestPrior;
99 | }
100 |
101 | public double getPriorProbability(String mentionText, Entity entity) {
102 | return getPriorProbability(mentionText, entity, false);
103 | }
104 |
105 | public static String conflateMention(String mention) {
106 | // conflate cases for mentions of length >= 4
107 | if (mention.length() >= 4) {
108 | mention = mention.toUpperCase(Locale.ENGLISH);
109 | }
110 |
111 | return mention;
112 | }
113 | }
114 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/context/EmptyEntitiesContext.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.context;
2 |
3 | import mpi.aida.data.Entities;
4 | import mpi.aida.data.Entity;
5 |
/**
 * No-op EntitiesContext: provides no context for any entity. Useful for
 * similarity settings that do not need entity context data.
 */
public class EmptyEntitiesContext extends EntitiesContext {

  public EmptyEntitiesContext(Entities entities) throws Exception {
    super(entities, null);
  }

  /** Always null -- this context holds no data for any entity. */
  @Override
  public int[] getContext(Entity entity) {
    return null;
  }

  /** Nothing to set up. */
  @Override
  protected void setupEntities(Entities entities) throws Exception {
    // nothing
  }

  public String toString() {
    return "EmptyEntitiesContext";
  }
}
26 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/context/EntitiesContext.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.context;
2 |
3 | import java.util.LinkedList;
4 | import java.util.List;
5 |
6 | import mpi.aida.AidaManager;
7 | import mpi.aida.data.Entities;
8 | import mpi.aida.data.Entity;
9 | import mpi.tokenizer.data.Token;
10 | import mpi.tokenizer.data.Tokens;
11 |
12 | import org.slf4j.Logger;
13 | import org.slf4j.LoggerFactory;
14 |
15 | import basics.Normalize;
16 |
/**
 * Base class for entity context representations used by the similarity
 * measures. Subclasses define what "context" means (keyphrases, tokens, ...)
 * by implementing setupEntities() and getContext().
 */
public abstract class EntitiesContext {
  private static final Logger logger =
      LoggerFactory.getLogger(EntitiesContext.class);

  protected Entities entities;
  protected EntitiesContextSettings settings;

  /**
   * Stores the entities/settings and immediately builds the context,
   * logging the setup time.
   * NOTE(review): calls the overridable setupEntities() from the
   * constructor; subclasses must tolerate running before their own
   * initialization.
   */
  public EntitiesContext(Entities entities, EntitiesContextSettings settings) throws Exception {
    this.entities = entities;
    this.settings = settings;

    long beginTime = System.currentTimeMillis();

    setupEntities(entities);

    // runtime reported in whole seconds
    long runTime = (System.currentTimeMillis() - beginTime) / 1000;
    logger.debug("Done setting up " + this + ": " + runTime + "s");
  }

  /** Replaces the entity set and rebuilds the context for it. */
  public void setEntities(Entities entities) throws Exception {
    this.entities = entities;
    setupEntities(entities);
  }

  public Entities getEntities() {
    return entities;
  }

  /** @return the context of the given entity, encoded as ids. */
  public abstract int[] getContext(Entity entity);

  /** Builds the context data for all given entities. */
  protected abstract void setupEntities(Entities entities) throws Exception;

  /** Tokenizes the given string, returning the original token strings. */
  protected List getTokens(String string) {
    List tokens = new LinkedList();

    Tokens advTokens = AidaManager.tokenize("EntitiesContext", string);

    for (Token token : advTokens) {
      tokens.add(token.getOriginal());
    }

    return tokens;
  }

  /**
   * Converts a YAGO entity id to a display name: un-escapes the entity and
   * strips a trailing parenthesized disambiguation suffix, e.g. " (band)".
   */
  public static String getEntityName(String entity) {
    String norm = Normalize.unEntity(entity);
    norm = norm.replaceAll(" \\(.*?\\)$", "");

    return norm;
  }

  public String toString() {
    return getIdentifier();
  }

  /** Simple class name; used as the context's identifier in logs. */
  public String getIdentifier() {
    return this.getClass().getSimpleName();
  }
}
76 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/context/EntitiesContextSettings.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.context;
2 |
3 |
/**
 * Configuration holder for EntitiesContext implementations.
 * Plain bean: every value has a sensible default and may be overridden
 * through its setter before the context is constructed.
 */
public class EntitiesContextSettings {

  /** Default balance between keyphrase MI and IDF weights. */
  public static final double DEFAULT_KEYPHRASE_ALPHA = 0.9713705285593512;

  /** Default balance between keyword MI and IDF weights. */
  public static final double DEFAULT_KEYWORD_ALPHA = 0.9713705285593512;

  // --- keyphrase/keyword weighting ---
  private int numberOfEntityKeyphrases = Integer.MAX_VALUE;
  private boolean normalizeWeights = true; // default is to normalize
  private boolean useConfusableMIWeight = false;
  private boolean averageWeights = false;
  private double entityCoherenceKeyphraseAlpha = DEFAULT_KEYPHRASE_ALPHA;
  private double entityCoherenceKeywordAlpha = DEFAULT_KEYWORD_ALPHA;
  private String keyphraseSourceExclusion;

  // --- ngram settings ---
  private int nGramLength = 2;

  // --- locality sensitive hashing ---
  private int lshBandSize;
  private int lshBandCount;
  private String lshDatabaseTable;

  /**
   * @return balance between keyphrase MI/IDF: use alpha*mi, (1-alpha)*idf
   */
  public double getEntityCoherenceKeyphraseAlpha() {
    return this.entityCoherenceKeyphraseAlpha;
  }

  public void setEntityCoherenceKeyphraseAlpha(double entityCoherenceKeyphraseAlpha) {
    this.entityCoherenceKeyphraseAlpha = entityCoherenceKeyphraseAlpha;
  }

  /**
   * @return balance between keyword MI/IDF: use alpha*mi, (1-alpha)*idf
   */
  public double getEntityCoherenceKeywordAlpha() {
    return this.entityCoherenceKeywordAlpha;
  }

  public void setEntityCoherenceKeywordAlpha(double entityCoherenceKeywordAlpha) {
    this.entityCoherenceKeywordAlpha = entityCoherenceKeywordAlpha;
  }

  public int getNumberOfEntityKeyphrases() {
    return this.numberOfEntityKeyphrases;
  }

  public void setNumberOfEntityKeyphrases(int numberOfEntityKeyphrases) {
    this.numberOfEntityKeyphrases = numberOfEntityKeyphrases;
  }

  public String getKeyphraseSourceExclusion() {
    return this.keyphraseSourceExclusion;
  }

  public void setKeyphraseSourceExclusion(String keyphraseSourceExclusion) {
    this.keyphraseSourceExclusion = keyphraseSourceExclusion;
  }

  public boolean shouldNormalizeWeights() {
    return this.normalizeWeights;
  }

  public void setShouldNormalizeWeights(boolean flag) {
    this.normalizeWeights = flag;
  }

  public boolean shouldUseConfusableMIWeight() {
    return this.useConfusableMIWeight;
  }

  public void setUseConfusableMIWeight(boolean useConfusableMIWeight) {
    this.useConfusableMIWeight = useConfusableMIWeight;
  }

  public boolean shouldAverageWeights() {
    return this.averageWeights;
  }

  public void setShouldAverageWeights(boolean flag) {
    this.averageWeights = flag;
  }

  public int getNgramLength() {
    return this.nGramLength;
  }

  public void setNgramLength(int nGramLength) {
    this.nGramLength = nGramLength;
  }

  public int getLshBandSize() {
    return this.lshBandSize;
  }

  public void setLshBandSize(int lshBandSize) {
    this.lshBandSize = lshBandSize;
  }

  public int getLshBandCount() {
    return this.lshBandCount;
  }

  public void setLshBandCount(int lshBandCount) {
    this.lshBandCount = lshBandCount;
  }

  public String getLshDatabaseTable() {
    return this.lshDatabaseTable;
  }

  public void setLshDatabaseTable(String lshDatabaseTable) {
    this.lshDatabaseTable = lshDatabaseTable;
  }
}
125 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/context/KeyphraseReweightedKeywordContext.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.context;
2 |
3 | import gnu.trove.map.hash.TIntObjectHashMap;
4 |
5 | import java.util.HashMap;
6 | import java.util.Map;
7 |
8 | import mpi.aida.data.Entities;
9 | import mpi.aida.data.Entity;
10 | import mpi.experiment.trace.GraphTracer;
11 | import mpi.experiment.trace.NullGraphTracer;
12 |
13 |
14 | public class KeyphraseReweightedKeywordContext extends FastWeightedKeyphrasesContext {
15 |
16 | public KeyphraseReweightedKeywordContext(Entities entities) throws Exception {
17 | super(entities);
18 | }
19 |
20 | public KeyphraseReweightedKeywordContext(Entities entities, EntitiesContextSettings settings) throws Exception {
21 | super(entities, settings);
22 | }
23 |
24 | @Override
25 | protected TIntObjectHashMap fillEntityVectors() {
26 | TIntObjectHashMap vectors = new TIntObjectHashMap();
27 |
28 | for (Entity e : entities) {
29 | float[] weights = new float[allKeywords.size()];
30 |
31 | for (int kp : getEntityKeyphraseIds(e)) {
32 | for (int tokenId : getKeyphraseTokenIds(kp, true)) {
33 | double mi = entity2keyword2mi.get(e.getId()).get(tokenId);
34 |
35 | double finalTokenWeight = mi;
36 |
37 | double keyphraseWeight = getKeyphraseMiWeight(e, kp);
38 | double reweightedFinalTokenWeight = keyphraseWeight * finalTokenWeight;
39 |
40 | if (Double.isNaN(reweightedFinalTokenWeight)) {
41 | System.err.println("NAN");
42 | }
43 |
44 | weights[tokenId] = (float) reweightedFinalTokenWeight;
45 | }
46 | }
47 |
48 | if (!(GraphTracer.gTracer instanceof NullGraphTracer)) {
49 | Map entityKeywords = new HashMap();
50 |
51 | for (int i = 0; i < weights.length; i++) {
52 | if (weights[i] > 0.0) {
53 | entityKeywords.put(getKeywordForId(i), weights[i]);
54 | }
55 | }
56 | }
57 |
58 | vectors.put(e.getId(), weights);
59 | }
60 |
61 | return vectors;
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/context/TextContext.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.context;
2 |
3 | import gnu.trove.map.hash.TIntObjectHashMap;
4 | import mpi.aida.data.Entities;
5 | import mpi.aida.data.Entity;
6 |
7 | /**
8 | * Abstract class for all contexts containing solely integer ids
9 | * representing tokens.
10 | *
11 | *
12 | */
13 | public abstract class TextContext extends EntitiesContext {
14 |
15 | private TIntObjectHashMap entityTokens;
16 |
17 | public TextContext(Entities entities, EntitiesContextSettings settings) throws Exception {
18 | super(entities, settings);
19 | }
20 |
21 | @Override
22 | public int[] getContext(Entity entity) {
23 | return entityTokens.get(entity.getId());
24 | }
25 |
26 | @Override
27 | protected void setupEntities(Entities entities) throws Exception {
28 | entityTokens = new TIntObjectHashMap();
29 |
30 | for (int entity : entities.getUniqueIds()) {
31 | entityTokens.put(entity, getTextTokens(entity));
32 | }
33 | }
34 |
35 | protected abstract int[] getTextTokens(int entity);
36 | }
37 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/exception/MissingSettingException.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.exception;
2 |
3 |
/**
 * Thrown when a similarity component is configured without a setting
 * it requires.
 */
public class MissingSettingException extends Exception {

  private static final long serialVersionUID = -1610134821236307372L;

  /**
   * @param string description of the missing setting
   */
  public MissingSettingException(String string) {
    super(string);
  }
}
16 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/importance/EntityImportance.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.importance;
2 |
3 | import java.sql.SQLException;
4 |
5 | import mpi.aida.data.Entities;
6 | import mpi.aida.data.Entity;
7 |
8 | /**
9 | * This class serves as way to get the importance of an entity
10 | * with regard to the complete collection, not to a specific mention (such as prior probability)
11 | *
12 | *
13 | */
14 | public abstract class EntityImportance {
15 |
16 | private Entities entities;
17 |
18 | private double weight = 0.0;
19 |
20 | public EntityImportance(Entities entities) throws SQLException {
21 | this.entities = entities;
22 | setupEntities(entities);
23 | }
24 |
25 | public Entities getEntities() {
26 | return entities;
27 | }
28 |
29 | protected abstract void setupEntities(Entities e) throws SQLException;
30 |
31 | public abstract double getImportance(Entity entity);
32 |
33 | public double getWeight() {
34 | return weight;
35 | }
36 |
37 | public void setWeight(double weight) {
38 | this.weight = weight;
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/importance/InlinkCountImportance.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.importance;
2 |
3 | import gnu.trove.map.hash.TIntDoubleHashMap;
4 | import gnu.trove.map.hash.TIntObjectHashMap;
5 |
6 | import java.sql.SQLException;
7 |
8 | import mpi.aida.access.DataAccess;
9 | import mpi.aida.data.Entities;
10 | import mpi.aida.data.Entity;
11 | import mpi.aida.util.YagoUtil;
12 | import mpi.database.DBConnection;
13 |
14 | /**
15 | * Measures the importance of an entity by the number of
16 | * incoming links in Wikipedia/YAGO
17 | *
18 | *
19 | */
20 | public class InlinkCountImportance extends EntityImportance {
21 |
22 | private TIntDoubleHashMap inlinkImportance;
23 |
24 | DBConnection con;
25 |
26 | public InlinkCountImportance(Entities entities) throws SQLException {
27 | super(entities);
28 | }
29 |
30 | @Override
31 | protected void setupEntities(Entities e) throws SQLException {
32 | TIntObjectHashMap neighbors = DataAccess.getInlinkNeighbors(e);
33 | for (int eId : e.getUniqueIds()) {
34 | double importance =
35 | (double) neighbors.get(eId).length
36 | / (double) YagoUtil.TOTAL_YAGO_ENTITIES;
37 | inlinkImportance.put(eId, importance);
38 | }
39 | }
40 |
41 | @Override
42 | public double getImportance(Entity entity) {
43 | return inlinkImportance.get(entity.getId());
44 | }
45 |
46 | public String toString() {
47 | return "InlinkCountImportance";
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/AlwaysOneSimilarityMeasure.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import mpi.aida.data.Context;
4 | import mpi.aida.data.Entity;
5 | import mpi.aida.data.Mention;
6 | import mpi.aida.graph.similarity.context.EntitiesContext;
7 | import mpi.experiment.trace.Tracer;
8 |
9 | public class AlwaysOneSimilarityMeasure extends MentionEntitySimilarityMeasure {
10 |
11 | public AlwaysOneSimilarityMeasure(Tracer tracer) {
12 | super(tracer);
13 | }
14 |
15 | @Override
16 | public double calcSimilarity(Mention mention, Context context, Entity entity, EntitiesContext entitiesContext) {
17 | return 1.0;
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/EntityEntitySimilarityMeasure.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import mpi.aida.data.Entity;
4 | import mpi.aida.graph.similarity.context.EntitiesContext;
5 | import mpi.experiment.trace.Tracer;
6 |
/**
 * Base class for measures computing the similarity between two entities
 * over a shared EntitiesContext.
 */
public abstract class EntityEntitySimilarityMeasure extends SimilarityMeasure {

  public EntityEntitySimilarityMeasure(Tracer tracer) {
    super(tracer);
  }

  /**
   * Computes the similarity of entities a and b given their context.
   *
   * @param a first entity
   * @param b second entity
   * @param context context holding the representations of both entities
   * @return similarity score (range depends on the concrete measure)
   */
  public abstract double calcSimilarity(Entity a, Entity b, EntitiesContext context);
}
15 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/InlinkOverlapEntityEntitySimilarity.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import gnu.trove.list.array.TIntArrayList;
4 | import gnu.trove.map.hash.TIntObjectHashMap;
5 | import gnu.trove.set.hash.TIntHashSet;
6 |
7 | import java.util.BitSet;
8 |
9 | import mpi.aida.AidaManager;
10 | import mpi.aida.access.DataAccess;
11 | import mpi.aida.data.Entities;
12 | import mpi.aida.data.Entity;
13 | import mpi.aida.graph.similarity.EntityEntitySimilarity;
14 | import mpi.aida.graph.similarity.context.EntitiesContext;
15 | import mpi.database.DBConnection;
16 |
17 | import org.slf4j.Logger;
18 | import org.slf4j.LoggerFactory;
19 |
20 | /**
21 | * Similarity of two entities is the number of common inlinks
22 | *
23 | *
24 | */
25 | public class InlinkOverlapEntityEntitySimilarity extends EntityEntitySimilarity {
26 | private static final Logger logger =
27 | LoggerFactory.getLogger(InlinkOverlapEntityEntitySimilarity.class);
28 |
29 | private TIntObjectHashMap entity2inlink;
30 | private TIntObjectHashMap entity2vector;
31 |
32 | DBConnection con;
33 |
34 | public InlinkOverlapEntityEntitySimilarity(EntityEntitySimilarityMeasure similarityMeasure, EntitiesContext entityContext) throws Exception {
35 | // not needed - uses entites directly
36 | super(similarityMeasure, entityContext);
37 |
38 | setupEntities(entityContext.getEntities());
39 | }
40 |
41 | private void setupEntities(Entities entities) throws Exception {
42 | if (entities.uniqueNameSize() == 0) {
43 | logger.info("Skipping initialization of InlinkEntityEntitySimilarity for " + entities.uniqueNameSize() + " entities");
44 | return;
45 | }
46 |
47 | logger.info("Initializing InlinkEntityEntitySimilarity for " + entities.uniqueNameSize() + " entities");
48 |
49 | con = AidaManager.getConnectionForDatabase(AidaManager.DB_AIDA, "getting inlinks");
50 |
51 | entity2inlink = DataAccess.getInlinkNeighbors(entities);
52 |
53 | // get all inlinks for all entities
54 | // get all inlinks for all entities
55 | TIntHashSet allInlinks = new TIntHashSet();
56 |
57 | for (int[] neighbors : entity2inlink.valueCollection()) {
58 | allInlinks.addAll(neighbors);
59 | }
60 |
61 | TIntArrayList allInlinksList = new TIntArrayList(allInlinks.size());
62 | for (int entry : allInlinksList.toArray()) {
63 | allInlinksList.add(entry);
64 | }
65 | allInlinksList.sort();
66 |
67 | // now create the bitvectors for each entity
68 | logger.info("Creating bitvectors for entities");
69 |
70 | entity2vector = new TIntObjectHashMap();
71 |
72 | for (int entity : entities.getUniqueIds()) {
73 | int[] inlinks = entity2inlink.get(entity);
74 |
75 | BitSet bs = new BitSet(allInlinksList.size());
76 |
77 | int current = 0;
78 |
79 | for (int inlink : inlinks) {
80 | // move to position of inlink in allInlinks
81 | while (allInlinksList.get(current) != inlink) {
82 | current++;
83 | }
84 | bs.set(current);
85 | }
86 |
87 | entity2vector.put(entity, bs);
88 | }
89 |
90 | AidaManager.releaseConnection(AidaManager.DB_AIDA, con);
91 |
92 | logger.info("Done initializing InlinkEntityEntitySimilarity");
93 | }
94 |
95 | @Override
96 | public double calcSimilarity(Entity a, Entity b) throws Exception {
97 | BitSet bsA = entity2vector.get(a.getId());
98 | BitSet bsB = entity2vector.get(b.getId());
99 |
100 | BitSet intersection = (BitSet) bsA.clone();
101 | intersection.and(bsB);
102 |
103 | BitSet union = (BitSet) bsA.clone();
104 | union.or(bsB);
105 |
106 | if (intersection.cardinality() == 0 || union.cardinality() == 0) {
107 | return 0.0; // cannot calc
108 | }
109 |
110 | double sim = (double) intersection.cardinality()
111 | / (double) union.cardinality();
112 |
113 | return sim;
114 | }
115 |
116 | public String toString() {
117 | return "InlinkOverlapEntityEntitySimilarity";
118 | }
119 | }
120 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/JaccardEntityEntitySimilarityMeasure.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import gnu.trove.set.hash.TIntHashSet;
4 |
5 | import java.util.HashMap;
6 | import java.util.Map;
7 |
8 | import mpi.aida.AidaManager;
9 | import mpi.aida.data.Entity;
10 | import mpi.aida.graph.similarity.context.EntitiesContext;
11 | import mpi.aida.graph.similarity.context.FastWeightedKeyphrasesContext;
12 | import mpi.aida.util.CollectionUtils;
13 | import mpi.experiment.trace.Tracer;
14 | import mpi.experiment.trace.measures.KeytermEntityEntityMeasureTracer;
15 | import mpi.experiment.trace.measures.TermTracer;
16 |
17 | public class JaccardEntityEntitySimilarityMeasure extends EntityEntitySimilarityMeasure {
18 |
19 | public JaccardEntityEntitySimilarityMeasure(Tracer tracer) {
20 | super(tracer);
21 | }
22 |
23 | @Override
24 | public double calcSimilarity(Entity a, Entity b, EntitiesContext context) {
25 | TIntHashSet contextA = new TIntHashSet(context.getContext(a));
26 | TIntHashSet contextB = new TIntHashSet(context.getContext(b));
27 |
28 | TIntHashSet union = getUnion(contextA, contextB);
29 | TIntHashSet intersection = getIntersection(contextA, contextB);
30 |
31 | double jaccardSim = (double) intersection.size() / (double) union.size();
32 | return jaccardSim;
33 | }
34 |
35 | private TIntHashSet getIntersection(TIntHashSet contextA, TIntHashSet contextB) {
36 | TIntHashSet is = new TIntHashSet();
37 |
38 | for (int a : contextA.toArray()) {
39 | if (contextB.contains(a) || contextB.contains(AidaManager.expandTerm(a))) {
40 | is.add(a);
41 | }
42 | }
43 |
44 | return is;
45 | }
46 |
47 | private TIntHashSet getUnion(TIntHashSet contextA, TIntHashSet contextB) {
48 | TIntHashSet union = new TIntHashSet();
49 |
50 | for (int a : contextB.toArray()) {
51 | union.add(a);
52 | }
53 |
54 | for (int a : contextA.toArray()) {
55 | if (!union.contains(a) && !union.contains(AidaManager.expandTerm(a))) {
56 | union.add(a);
57 | }
58 | }
59 |
60 | return union;
61 | }
62 |
63 | @SuppressWarnings("unused")
64 | private void collectTracingInfo(Entity a, Entity b, int[] kpsA, int[] kpsB, double sim, Map matches, FastWeightedKeyphrasesContext kwc) {
65 | Map e1keyphrases = new HashMap();
66 | for (int kp : kpsA) {
67 | if (kwc.getCombinedKeyphraseMiIdfWeight(a, kp) > 0.0) {
68 | e1keyphrases.put(kwc.getKeyphraseForId(kp), kwc.getCombinedKeyphraseMiIdfWeight(a, kp));
69 | }
70 | }
71 | e1keyphrases = CollectionUtils.sortMapByValue(e1keyphrases, true);
72 |
73 | Map e2keyphrases = new HashMap();
74 | for (int kp : kpsB) {
75 | if (kwc.getCombinedKeyphraseMiIdfWeight(b, kp) > 0.0) {
76 | e2keyphrases.put(kwc.getKeyphraseForId(kp), kwc.getCombinedKeyphraseMiIdfWeight(b, kp));
77 | }
78 | }
79 | e2keyphrases = CollectionUtils.sortMapByValue(e2keyphrases, true);
80 |
81 | tracer.eeTracing().addEntityContext(a.getName(), e1keyphrases);
82 | tracer.eeTracing().addEntityContext(b.getName(), e2keyphrases);
83 |
84 | KeytermEntityEntityMeasureTracer mt = new KeytermEntityEntityMeasureTracer("PartialKeyphraseSim", 0.0, e2keyphrases, matches);
85 | mt.setScore(sim);
86 | tracer.eeTracing().addEntityEntityMeasureTracer(a.getName(), b.getName(), mt);
87 |
88 | KeytermEntityEntityMeasureTracer mt2 = new KeytermEntityEntityMeasureTracer("PartialKeyphraseSim", 0.0, e1keyphrases, matches);
89 | mt2.setScore(sim);
90 | tracer.eeTracing().addEntityEntityMeasureTracer(b.getName(), a.getName(), mt2);
91 | }
92 | }
93 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/JaccardSimilarityMeasure.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import gnu.trove.set.hash.TIntHashSet;
4 | import mpi.aida.AidaManager;
5 | import mpi.aida.data.Context;
6 | import mpi.aida.data.Entity;
7 | import mpi.aida.data.Mention;
8 | import mpi.aida.graph.similarity.context.EntitiesContext;
9 | import mpi.experiment.trace.Tracer;
10 |
11 | public class JaccardSimilarityMeasure extends MentionEntitySimilarityMeasure {
12 |
13 | public JaccardSimilarityMeasure(Tracer tracer) {
14 | super(tracer);
15 | }
16 |
17 | @Override
18 | public double calcSimilarity(Mention mention, Context context, Entity entity, EntitiesContext entitiesContext) {
19 | TIntHashSet contextA = new TIntHashSet(context.getTokenIds());
20 | TIntHashSet contextB = new TIntHashSet(entitiesContext.getContext(entity));
21 |
22 | TIntHashSet union = getUnion(contextA, contextB);
23 | TIntHashSet intersection = getIntersection(contextA, contextB);
24 |
25 | double jaccardSim = (double) intersection.size() / (double) union.size();
26 | return jaccardSim;
27 | }
28 |
29 | private TIntHashSet getIntersection(TIntHashSet contextA, TIntHashSet contextB) {
30 | TIntHashSet is = new TIntHashSet();
31 |
32 | for (int a : contextA.toArray()) {
33 | if (contextB.contains(a) || contextB.contains(AidaManager.expandTerm(a))) {
34 | is.add(a);
35 | }
36 | }
37 |
38 | return is;
39 | }
40 |
41 | private TIntHashSet getUnion(TIntHashSet contextA, TIntHashSet contextB) {
42 | TIntHashSet union = new TIntHashSet();
43 |
44 | for (int a : contextB.toArray()) {
45 | union.add(a);
46 | }
47 |
48 | for (int a : contextA.toArray()) {
49 | if (!union.contains(a) && !union.contains(AidaManager.expandTerm(a))) {
50 | union.add(a);
51 | }
52 | }
53 |
54 | return union;
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/MentionEntitySimilarityMeasure.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import mpi.aida.data.Context;
4 | import mpi.aida.data.Entity;
5 | import mpi.aida.data.Mention;
6 | import mpi.aida.graph.similarity.context.EntitiesContext;
7 | import mpi.experiment.trace.Tracer;
8 |
/**
 * Base class for measures computing the similarity between a mention
 * (with its surrounding token context) and a candidate entity.
 */
public abstract class MentionEntitySimilarityMeasure extends SimilarityMeasure {

  public MentionEntitySimilarityMeasure(Tracer tracer) {
    super(tracer);
  }

  // When true, implementations may discount context tokens by their
  // distance from the mention.
  protected boolean useDistanceDiscount = false;

  public boolean isUseDistanceDiscount() {
    return useDistanceDiscount;
  }

  public void setUseDistanceDiscount(boolean useDistanceDiscount) {
    this.useDistanceDiscount = useDistanceDiscount;
  }

  /**
   * Computes the similarity between the mention and the candidate entity.
   *
   * @param mention mention to score
   * @param context token context of the mention's document
   * @param entity candidate entity
   * @param entitiesContext context holding the entity representations
   * @return similarity score (range depends on the concrete measure)
   */
  public abstract double calcSimilarity(Mention mention, Context context, Entity entity, EntitiesContext entitiesContext);

  /**
   * This method is a place holder to enable the framework to add extra context to a specific mention
   * during the processing of the code
   * subclasses should override this method accordingly
   *
   * @param mention the mention to which this context belongs
   * @param context the context to add
   */
  public void addExtraContext(Mention mention, Object context) {
    return;
  }


  /**
   * This method is a place holder to enable the framework to announce when a mention gets assigned to an entity
   * different measures may perform different upon such event.
   * default implementation is doing nothing
   *
   * @param mention the mention that was assigned
   * @param entity the entity to which the mention got assigned
   */
  public void announceMentionAssignment(Mention mention, Entity entity) {
    return;
  }

}
60 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/MilneWittenEntityEntitySimilarity.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import gnu.trove.iterator.TIntObjectIterator;
4 | import gnu.trove.map.hash.TIntObjectHashMap;
5 | import javaewah.EWAHCompressedBitmap;
6 | import mpi.aida.access.DataAccess;
7 | import mpi.aida.data.Entities;
8 | import mpi.aida.data.Entity;
9 | import mpi.aida.graph.similarity.EntityEntitySimilarity;
10 | import mpi.aida.graph.similarity.context.EntitiesContext;
11 | import mpi.aida.util.YagoUtil;
12 |
13 | import org.slf4j.Logger;
14 | import org.slf4j.LoggerFactory;
15 |
16 | public class MilneWittenEntityEntitySimilarity extends EntityEntitySimilarity {
17 | private static final Logger logger =
18 | LoggerFactory.getLogger(MilneWittenEntityEntitySimilarity.class);
19 |
20 | private TIntObjectHashMap entity2vector;
21 |
22 |
23 | public MilneWittenEntityEntitySimilarity(EntityEntitySimilarityMeasure similarityMeasure, EntitiesContext entityContext) throws Exception {
24 | // not needed - uses entites directly
25 | super(similarityMeasure, entityContext);
26 |
27 | setupEntities(entityContext.getEntities());
28 | }
29 |
30 | private void setupEntities(Entities entities) throws Exception {
31 | logger.info("Initializing MilneWittenEntityEntitySimilarity for " +
32 | entities.uniqueNameSize() + " entities");
33 |
34 | TIntObjectHashMap entityInlinks =
35 | DataAccess.getInlinkNeighbors(entities);
36 |
37 | entity2vector = new TIntObjectHashMap();
38 |
39 | for (TIntObjectIterator itr = entityInlinks.iterator();
40 | itr.hasNext(); ) {
41 | itr.advance();
42 | int entity = itr.key();
43 | int[] inLinks = itr.value();
44 |
45 | EWAHCompressedBitmap bs = new EWAHCompressedBitmap();
46 | for (int l : inLinks) {
47 | bs.set(l);
48 | }
49 | entity2vector.put(entity, bs);
50 | }
51 |
52 | logger.info("Done initializing MilneWittenEntityEntitySimilarity for " +
53 | entities.uniqueNameSize() + " entities");
54 | }
55 |
56 | @Override
57 | public double calcSimilarity(Entity a, Entity b) throws Exception {
58 | EWAHCompressedBitmap bsA = entity2vector.get(a.getId());
59 | EWAHCompressedBitmap bsB = entity2vector.get(b.getId());
60 |
61 | double sizeA = bsA.cardinality();
62 | double sizeB = bsB.cardinality();
63 |
64 | double max = -1.0;
65 | double min = -1.0;
66 |
67 | if (sizeA >= sizeB) {
68 | max = sizeA;
69 | min = sizeB;
70 | } else {
71 | max = sizeB;
72 | min = sizeA;
73 | }
74 |
75 | double sim = 0.0; // default is no sim
76 |
77 | int overlap = bsA.andCardinality(bsB);
78 |
79 | if (overlap > 0) {
80 | // now calc the real similarity
81 | double distance = (Math.log(max) - Math.log((double) overlap)) / (Math.log(YagoUtil.TOTAL_YAGO_ENTITIES) - Math.log(min));
82 |
83 | sim = 1 - distance;
84 |
85 | if (distance > 1.0) {
86 | // really far apart ...
87 | sim = 0.0;
88 | }
89 | }
90 |
91 | return sim;
92 | }
93 | }
94 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/NGDSimilarityMeasure.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import gnu.trove.set.hash.TIntHashSet;
4 | import mpi.aida.data.Entity;
5 | import mpi.aida.graph.similarity.context.EntitiesContext;
6 | import mpi.aida.graph.similarity.context.WeightedKeyphrasesContext;
7 | import mpi.aida.util.YagoUtil;
8 | import mpi.experiment.trace.Tracer;
9 |
10 | public class NGDSimilarityMeasure extends EntityEntitySimilarityMeasure {
11 |
12 | public NGDSimilarityMeasure(Tracer tracer) {
13 | super(tracer);
14 | }
15 |
16 | protected WeightedKeyphrasesContext kwc;
17 |
18 | @Override
19 | public double calcSimilarity(Entity a, Entity b, EntitiesContext entitiesContext) {
20 | kwc = (WeightedKeyphrasesContext) entitiesContext;
21 |
22 | double max = getMax(a, b, entitiesContext);
23 | double min = getMin(a, b, entitiesContext);
24 | double intersect = getIntersect(a, b, entitiesContext);
25 | double collection = getCollection();
26 |
27 | double sim = 0.0;
28 |
29 | if (intersect > 0) {
30 | double ngd =
31 | ( Math.log(max) - Math.log(intersect) )
32 | / ( Math.log(collection) - Math.log(min) );
33 | sim = 1 - ngd;
34 | if (sim < 0) sim = 0.0;
35 | }
36 |
37 | return sim;
38 | }
39 |
40 | protected double getMax(Entity a, Entity b, EntitiesContext entitiesContext) {
41 | int[] e1context = kwc.getContext(a);
42 | int[] e2context = kwc.getContext(b);
43 |
44 | return Math.max(e1context.length, e2context.length);
45 | }
46 |
47 | protected double getMin(Entity a, Entity b, EntitiesContext entitiesContext) {
48 | int[] e1context = kwc.getContext(a);
49 | int[] e2context = kwc.getContext(b);
50 |
51 | return Math.min(e1context.length, e2context.length);
52 | }
53 |
54 | protected double getIntersect(Entity a, Entity b, EntitiesContext entitiesContext) {
55 | TIntHashSet e1context = new TIntHashSet(kwc.getContext(a));
56 | TIntHashSet e2context = new TIntHashSet(kwc.getContext(b));
57 |
58 | e1context.retainAll(e2context);
59 | int intersectSize = e1context.size();
60 | return (double) intersectSize;
61 | }
62 |
63 | protected double getCollection() {
64 | return ((double) YagoUtil.TOTAL_YAGO_ENTITIES);
65 | }
66 | }
67 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/NormalizedKeyphrasesBasedIDFSimilarity.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import mpi.experiment.trace.Tracer;
4 |
/**
 * Keyphrase-based IDF similarity with normalized scores: identical to
 * the unnormalized parent except that the inherited 'normalize' flag
 * is enabled.
 */
public class NormalizedKeyphrasesBasedIDFSimilarity extends UnnormalizedKeyphrasesBasedIDFSimilarity {

  public NormalizedKeyphrasesBasedIDFSimilarity(Tracer tracer) {
    super(tracer);
    normalize = true; // switch the parent implementation to normalized scoring
  }
}
12 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/NormalizedKeyphrasesBasedMISimilarity.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import mpi.experiment.trace.Tracer;
4 |
/**
 * Keyphrase-based MI similarity with normalized scores: identical to
 * the unnormalized parent except that the inherited 'normalize' flag
 * is enabled.
 */
public class NormalizedKeyphrasesBasedMISimilarity extends UnnormalizedKeyphrasesBasedMISimilarity {

  public NormalizedKeyphrasesBasedMISimilarity(Tracer tracer) {
    super(tracer);
    normalize = true; // switch the parent implementation to normalized scoring
  }
}
12 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/NullEntityEntitySimilarityMeasure.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import mpi.aida.data.Entity;
4 | import mpi.aida.graph.similarity.context.EntitiesContext;
5 | import mpi.experiment.trace.Tracer;
6 |
/**
 * Placeholder entity-entity similarity that always returns -1,
 * signalling that no similarity was computed.
 */
public class NullEntityEntitySimilarityMeasure extends EntityEntitySimilarityMeasure {

  public NullEntityEntitySimilarityMeasure(Tracer tracer) {
    super(tracer);
  }

  /** @return always -1 (no similarity computed) */
  @Override
  public double calcSimilarity(Entity a, Entity b, EntitiesContext context) {
    return -1;
  }

}
19 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/NullMentionEntittySimilarityMeasure.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import mpi.aida.data.Context;
4 | import mpi.aida.data.Entity;
5 | import mpi.aida.data.Mention;
6 | import mpi.aida.graph.similarity.context.EntitiesContext;
7 | import mpi.experiment.trace.Tracer;
8 |
9 |
/**
 * Null-object mention-entity similarity: ignores its arguments and always
 * returns 0.
 *
 * NOTE(review): class name contains a typo ("Entitty"); left unchanged as the
 * file name and any callers depend on it.
 */
public class NullMentionEntittySimilarityMeasure extends MentionEntitySimilarityMeasure {

  public NullMentionEntittySimilarityMeasure(Tracer tracer) {
    super(tracer);
  }

  @Override
  public double calcSimilarity(Mention mention, Context context, Entity entity, EntitiesContext entitiesContext) {
    // Constant placeholder score; arguments are not inspected.
    return 0;
  }
}
21 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/SimilarityMeasure.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import mpi.experiment.trace.Tracer;
4 |
5 | public abstract class SimilarityMeasure {
6 |
7 | protected Tracer tracer = null;
8 |
9 | public SimilarityMeasure(Tracer tracer) {
10 | this.tracer = tracer;
11 | }
12 |
13 | public String toString() {
14 | return getIdentifier();
15 | }
16 |
17 | public String getIdentifier() {
18 | String id = this.getClass().getSimpleName();
19 | return id;
20 | }
21 |
22 | public Tracer getTracer() {
23 | return tracer;
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/TfIdfCosineSimilarityMeasure.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import gnu.trove.map.hash.TIntDoubleHashMap;
4 | import gnu.trove.map.hash.TIntIntHashMap;
5 | import gnu.trove.set.hash.TIntHashSet;
6 | import mpi.aida.AidaManager;
7 | import mpi.aida.access.DataAccess;
8 | import mpi.aida.data.Context;
9 | import mpi.aida.data.Entity;
10 | import mpi.aida.data.Mention;
11 | import mpi.aida.graph.similarity.context.EntitiesContext;
12 | import mpi.aida.util.YagoUtil;
13 | import mpi.experiment.trace.Tracer;
14 |
15 | /**
16 | * Calculates the similarity of two contexts by the cosine similarity
17 | * of their tf.idf weighted term vectors.
18 | *
19 | *
20 | */
public class TfIdfCosineSimilarityMeasure extends MentionEntitySimilarityMeasure {

  public TfIdfCosineSimilarityMeasure(Tracer tracer) {
    super(tracer);
  }

  /**
   * Cosine similarity between the tf.idf vector of the mention's document
   * context and the tf.idf vector of the entity's context.
   */
  @Override
  public double calcSimilarity(Mention mention, Context context, Entity entity, EntitiesContext entitiesContext) {
    TIntDoubleHashMap contextVec = getTfIdfVector(context.getTokenIds());
    TIntDoubleHashMap entityVec = getTfIdfVector(entitiesContext.getContext(entity));

    double sim = calcCosine(entityVec, contextVec);
    return sim;
  }

  /**
   * Cosine of the two weight vectors. Entity terms additionally match their
   * "expanded" form in the context vector (see AidaManager.expandTerm).
   *
   * NOTE(review): if the context contains both the original term and its
   * expansion, the entity weight is multiplied into the dot product twice,
   * while the norms are computed over the raw vectors only - so the result
   * can exceed 1. Presumably intentional (expansion boosts matches); confirm.
   */
  protected double calcCosine(TIntDoubleHashMap entityVec, TIntDoubleHashMap contextVec) {
    double dotProduct = 0.0;

    for (int termA : entityVec.keys()) {
      int expandedA = AidaManager.expandTerm(termA);
      if (contextVec.containsKey(termA)) {
        double tempProduct = entityVec.get(termA) * contextVec.get(termA);
        dotProduct += tempProduct;
      }
      if (contextVec.containsKey(expandedA)) {
        // Expanded form present in the context also counts as a match.
        double tempProduct = entityVec.get(termA) * contextVec.get(expandedA);
        dotProduct += tempProduct;
      }
    }

    // Euclidean norm of the entity vector.
    double normA = 0.0;
    for (double weightA : entityVec.values()) {
      normA += weightA * weightA;
    }
    normA = Math.sqrt(normA);

    // Euclidean norm of the context vector.
    double normB = 0.0;
    for (double weightB : contextVec.values()) {
      normB += weightB * weightB;
    }
    normB = Math.sqrt(normB);

    double sim = 0.0;

    // Guard against division by zero when either vector is empty/all-zero.
    if (normA * normB != 0) {
      sim = dotProduct / (normA * normB);
    }

    return sim;
  }

  /**
   * Builds a term-id -> tf.idf weight vector for the given token ids.
   * Document frequencies come from the database; idf is log2(N/df) with
   * N = total number of YAGO entities.
   */
  private TIntDoubleHashMap getTfIdfVector(int[] is) {
    TIntDoubleHashMap vector = new TIntDoubleHashMap();

    // Term frequencies within the given token sequence.
    TIntIntHashMap tfs = new TIntIntHashMap();

    for (int term : is) {
      tfs.adjustOrPutValue(term, 1, 1);
    }

    TIntIntHashMap termDFs =
        DataAccess.getKeywordDocumentFrequencies(new TIntHashSet(is));

    for (int term : new TIntHashSet(is).toArray()) {
      int tf = tfs.get(term);
      int df = termDFs.get(term);
      // NOTE(review): unknown terms get df = N, making their idf log2(1) = 0,
      // i.e. they are effectively dropped rather than smoothed - confirm.
      if (df == 0) df = YagoUtil.TOTAL_YAGO_ENTITIES; // default smoothing

      double tfIdf =
          (double) tf
          * log2((double) YagoUtil.TOTAL_YAGO_ENTITIES / (double) df);

      vector.put(term, tfIdf);
    }

    return vector;
  }

  /** Base-2 logarithm. */
  public static double log2(double x) {
    return Math.log(x) / Math.log(2);
  }
}
103 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/UnnormalizedKeyphrasesBasedIDFSimilarity.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import mpi.aida.data.Entity;
4 | import mpi.experiment.trace.Tracer;
5 |
6 | public class UnnormalizedKeyphrasesBasedIDFSimilarity extends UnnormalizedKeyphrasesBasedMISimilarity {
7 |
8 | public UnnormalizedKeyphrasesBasedIDFSimilarity(Tracer tracer) {
9 | super(tracer);
10 | }
11 |
12 | protected double getKeywordScore(Entity entity, int keyword) {
13 | return keyphrasesContext.getKeywordIDFWeight(keyword);
14 | }
15 |
16 | public String getIdentifier() {
17 | String identifier = "UnnormalizedKeyphrasesBasedIDFSimilarity";
18 |
19 | if (isUseDistanceDiscount()) {
20 | identifier += ",i";
21 | }
22 |
23 | return identifier;
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/WeightedJaccardEntityEntitySimilarityMeasure.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import gnu.trove.set.hash.TIntHashSet;
4 |
5 | import java.util.HashMap;
6 | import java.util.Map;
7 |
8 | import mpi.aida.data.Entity;
9 | import mpi.aida.graph.similarity.context.EntitiesContext;
10 | import mpi.aida.graph.similarity.context.WeightedKeyphrasesContext;
11 | import mpi.experiment.trace.Tracer;
12 |
13 | public class WeightedJaccardEntityEntitySimilarityMeasure extends EntityEntitySimilarityMeasure {
14 |
15 | public WeightedJaccardEntityEntitySimilarityMeasure(Tracer tracer) {
16 | super(tracer);
17 | }
18 |
19 | @Override
20 | public double calcSimilarity(Entity a, Entity b, EntitiesContext context) {
21 | WeightedKeyphrasesContext kpc = (WeightedKeyphrasesContext) context;
22 |
23 | TIntHashSet contextA = new TIntHashSet(kpc.getEntityKeyphraseIds(a));
24 | TIntHashSet contextB = new TIntHashSet(kpc.getEntityKeyphraseIds(b));
25 |
26 | double intersection = getIntersection(a, contextA, b, contextB, kpc);
27 | double union = getUnion(a, contextA, b, contextB, kpc);
28 |
29 | double jaccardSim = intersection / union;
30 |
31 | return jaccardSim;
32 | }
33 |
34 | private double getIntersection(Entity a, TIntHashSet contextA, Entity b, TIntHashSet contextB, WeightedKeyphrasesContext kpc) {
35 | double intersectWeight = 0.0;
36 |
37 | for (int k : contextA.toArray()) {
38 | if (contextB.contains(k)) {
39 | intersectWeight += Math.min(kpc.getCombinedKeyphraseMiIdfWeight(a, k), kpc.getCombinedKeyphraseMiIdfWeight(b, k));
40 | }
41 | }
42 |
43 | return intersectWeight;
44 | }
45 |
46 | private double getUnion(Entity a, TIntHashSet contextA, Entity b, TIntHashSet contextB, WeightedKeyphrasesContext kpc) {
47 | Map weights = new HashMap();
48 |
49 | for (int k : contextA.toArray()) {
50 | weights.put(k, kpc.getCombinedKeyphraseMiIdfWeight(a, k));
51 | }
52 |
53 | for (int k : contextB.toArray()) {
54 | Double kwbWeight = kpc.getCombinedKeyphraseMiIdfWeight(b, k);
55 | Double kwaWeight = weights.get(k);
56 |
57 | if (kwaWeight != null) {
58 | weights.put(k, Math.max(kwaWeight, kwbWeight));
59 | } else {
60 | weights.put(k, kwbWeight);
61 | }
62 | }
63 |
64 | double unionWeight = 0.0;
65 |
66 | for (Double d : weights.values()) {
67 | unionWeight += d;
68 | }
69 |
70 | return unionWeight;
71 | }
72 | }
73 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/WeightedNGDSimilarityMeasure.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import gnu.trove.set.hash.TIntHashSet;
4 | import mpi.aida.data.Entity;
5 | import mpi.aida.graph.similarity.context.EntitiesContext;
6 | import mpi.aida.util.YagoUtil;
7 | import mpi.experiment.trace.Tracer;
8 |
9 |
10 | public class WeightedNGDSimilarityMeasure extends NGDSimilarityMeasure {
11 |
12 | public WeightedNGDSimilarityMeasure(Tracer tracer) {
13 | super(tracer);
14 | }
15 |
16 | @Override
17 | protected double getMax(Entity a, Entity b, EntitiesContext entitiesContext) {
18 | int[] e1context = kwc.getEntityKeyphraseIds(a);
19 | int[] e2context = kwc.getEntityKeyphraseIds(b);
20 |
21 | double e1weight = 0.0;
22 | for (int kp : e1context) {
23 | e1weight += kwc.getCombinedKeyphraseMiIdfWeight(a, kp);
24 | }
25 |
26 | double e2weight = 0.0;
27 | for (int kp : e2context) {
28 | e2weight += kwc.getCombinedKeyphraseMiIdfWeight(b, kp);
29 | }
30 |
31 | return Math.max(e1weight, e2weight);
32 | }
33 |
34 | @Override
35 | protected double getMin(Entity a, Entity b, EntitiesContext entitiesContext) {
36 | int[] e1context = kwc.getEntityKeyphraseIds(a);
37 | int[] e2context = kwc.getEntityKeyphraseIds(b);
38 |
39 | double e1weight = 0.0;
40 | for (int kp : e1context) {
41 | e1weight += kwc.getCombinedKeyphraseMiIdfWeight(a, kp);
42 | }
43 |
44 | double e2weight = 0.0;
45 | for (int kp : e2context) {
46 | e2weight += kwc.getCombinedKeyphraseMiIdfWeight(b, kp);
47 | }
48 |
49 | return Math.min(e1weight, e2weight);
50 | }
51 |
52 | @Override
53 | protected double getIntersect(Entity a, Entity b, EntitiesContext entitiesContext) {
54 | int[] e1context = kwc.getEntityKeyphraseIds(a);
55 | int[] e2context = kwc.getEntityKeyphraseIds(b);
56 |
57 | TIntHashSet e1forIntersect = new TIntHashSet(e1context);
58 | TIntHashSet e2forIntersect = new TIntHashSet(e2context);
59 | e1forIntersect.retainAll(e2forIntersect);
60 |
61 | double intersectWeight = 0.0;
62 |
63 | for (int kp : e1forIntersect.toArray()) {
64 | intersectWeight += kwc.getCombinedKeyphraseMiIdfWeight(a, kp);
65 | intersectWeight += kwc.getCombinedKeyphraseMiIdfWeight(b, kp);
66 | }
67 |
68 | // everthing was counted twice
69 | intersectWeight /= 2;
70 |
71 | return intersectWeight;
72 | }
73 |
74 | @Override
75 | protected double getCollection() {
76 | return YagoUtil.TOTAL_YAGO_ENTITIES;
77 | }
78 | }
79 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/WordCountVectorDotProductSimilarityMeasure.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import gnu.trove.iterator.TIntIntIterator;
4 | import gnu.trove.map.hash.TIntIntHashMap;
5 | import mpi.aida.AidaManager;
6 | import mpi.aida.data.Context;
7 | import mpi.aida.data.Entity;
8 | import mpi.aida.data.Mention;
9 | import mpi.aida.graph.similarity.context.EntitiesContext;
10 | import mpi.experiment.trace.Tracer;
11 |
12 | /**
13 | * This class calculates the similarity between a mention and an
14 | * entity context by a dot product between the word count vectors.
15 | *
16 | *
17 | */
18 | public class WordCountVectorDotProductSimilarityMeasure extends MentionEntitySimilarityMeasure {
19 |
20 | public WordCountVectorDotProductSimilarityMeasure(Tracer tracer) {
21 | super(tracer);
22 | }
23 |
24 | @Override
25 | public double calcSimilarity(Mention mention, Context context, Entity entity, EntitiesContext entitiesContext) {
26 | // create two Maps representing the word count vectors
27 | TIntIntHashMap contextVec = createWordCountVector(context.getTokenIds());
28 | TIntIntHashMap entityVec = createWordCountVector(entitiesContext.getContext(entity));
29 |
30 | // calc dot product between them
31 | double similarity = calcDotProduct(entityVec, contextVec);
32 | return similarity;
33 | }
34 |
35 | private TIntIntHashMap createWordCountVector(int[] is) {
36 | TIntIntHashMap wordCountVector = new TIntIntHashMap();
37 |
38 | for (int word : is) {
39 | wordCountVector.adjustOrPutValue(word, 1, 1);
40 | }
41 |
42 | return wordCountVector;
43 | }
44 |
45 | private double calcDotProduct(
46 | TIntIntHashMap entityVec, TIntIntHashMap contextVec) {
47 | int dotProduct = 0;
48 |
49 | for (TIntIntIterator it = entityVec.iterator(); it.hasNext(); ) {
50 | it.advance();
51 | int wordA = it.key();
52 |
53 | int expandedA = AidaManager.expandTerm(wordA);
54 |
55 | // get counts of word in both vectors
56 | int wordAcount = entityVec.get(wordA);
57 | int wordBcount = contextVec.get(wordA);
58 |
59 | wordBcount += contextVec.get(expandedA); // add expanded count if available
60 |
61 | int temp = wordAcount * wordBcount;
62 | dotProduct += temp;
63 | }
64 |
65 | return dotProduct;
66 | }
67 | }
68 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/util/EntitiesContextCreator.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.util;
2 |
3 | import java.util.HashMap;
4 | import java.util.LinkedList;
5 | import java.util.List;
6 | import java.util.Map;
7 | import java.util.concurrent.locks.Lock;
8 | import java.util.concurrent.locks.ReentrantLock;
9 |
10 | import mpi.aida.data.Entities;
11 | import mpi.aida.graph.similarity.context.EntitiesContext;
12 |
13 | /**
14 | * Caches entity contexts based on the context id and document id.
15 | * Assumes distinct document ids and caches up to ecc contexts.
16 | *
17 | *
18 | */
19 | public class EntitiesContextCreator {
20 | /** Has to be at least 1. */
21 | private static final int CACHE_SIZE = 10;
22 |
23 | /** Holds the cached EntityContexts. */
24 | private Map cache =
25 | new HashMap();
26 |
27 | /**
28 | * Keeps the order in which the EntityContexts were created for
29 | * discarding the least recently used on cache overflow.
30 | */
31 | private List cacheIds = new LinkedList();
32 |
33 | /**
34 | * Synchronized the creation of different contexts. Allows the parallel
35 | * creation of contexts for distinct documents but blocks for requests
36 | * of the same context.
37 | */
38 | private Map contextCreationLocks = new HashMap();
39 |
40 | private static class EntitiesContextCreatorHolder {
41 | public static EntitiesContextCreator ecc = new EntitiesContextCreator();
42 | }
43 |
44 | public static EntitiesContextCreator getEntitiesContextCache() {
45 | return EntitiesContextCreatorHolder.ecc;
46 | }
47 |
48 | public EntitiesContext getEntitiesContext(
49 | String contextClassName, String docId, Entities entities)
50 | throws Exception {
51 |
52 | String id = getCacheId(contextClassName, docId);
53 |
54 | // Allow the parallel creation of distinct contexts but only
55 | // one creation per id.
56 | Lock contextLock = getContextCreationLock(id);
57 | contextLock.lock();
58 | EntitiesContext context = null;
59 | try {
60 | context = cache.get(id);
61 |
62 | if (context == null) {
63 | // Create context.
64 | context =
65 | (EntitiesContext)
66 | Class.forName(contextClassName).
67 | getDeclaredConstructor(Entities.class).newInstance(entities);
68 |
69 | // Put it into the cache, deleting the oldest cache if the cache
70 | // size is exceeded.
71 | synchronized(cache) {
72 | cache.put(id, context);
73 | cacheIds.add(id);
74 |
75 | if (cacheIds.size() > CACHE_SIZE) {
76 | String removedId = cacheIds.get(0);
77 | cacheIds.remove(0);
78 | cache.remove(removedId);
79 | }
80 | }
81 | }
82 | } catch (Exception e) {
83 | throw e;
84 | } finally {
85 | contextLock.unlock();
86 | }
87 |
88 | // Will be null if something goes wrong in the creation process.
89 | return context;
90 | }
91 |
92 | private String getCacheId(String contextClassName, String docId) {
93 | return contextClassName + "\t" + docId;
94 | }
95 |
96 | private synchronized Lock getContextCreationLock(String id) {
97 | Lock lock = contextCreationLocks.get(id);
98 | if (lock == null) {
99 | lock = new ReentrantLock();
100 | contextCreationLocks.put(id, lock);
101 | }
102 | return lock;
103 | }
104 | }
105 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/util/MaxMinSettings.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.util;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.FileReader;
5 | import java.io.IOException;
6 | import java.io.Serializable;
7 | import java.util.HashMap;
8 | import java.util.Map;
9 |
public class MaxMinSettings implements Serializable {

  private static final long serialVersionUID = -3088993650033149824L;

  /** Maps feature name to a two-element array { min, max }. */
  Map<String, double[]> minMaxs;

  /**
   * Loads settings from a file where each line has the form:
   * featureName=min max
   *
   * @param propertiesFilePath path to the settings file
   * @throws NumberFormatException if min or max cannot be parsed
   * @throws IOException if the file cannot be read
   */
  public MaxMinSettings(String propertiesFilePath)
      throws NumberFormatException, IOException {
    minMaxs = new HashMap<String, double[]>();

    BufferedReader reader =
        new BufferedReader(new FileReader(propertiesFilePath));
    // Close in finally: the original leaked the reader when parsing threw.
    try {
      for (String line = reader.readLine(); line != null; line = reader.readLine()) {
        String[] data = line.split("=");
        String[] range = data[1].split(" ");

        double min = Double.parseDouble(range[0]);
        double max = Double.parseDouble(range[1]);

        minMaxs.put(data[0], new double[] { min, max });
      }
    } finally {
      reader.close();
    }
  }

  public MaxMinSettings(Map<String, double[]> minMaxs) {
    this.minMaxs = minMaxs;
  }

  /** @throws IllegalArgumentException if the feature is unknown */
  public double getMin(String featureName) {
    double[] range = minMaxs.get(featureName);
    if (range == null) {
      throw new IllegalArgumentException("No min for '"+featureName+"'");
    }
    return range[0];
  }

  /** @throws IllegalArgumentException if the feature is unknown */
  public double getMax(String featureName) {
    double[] range = minMaxs.get(featureName);
    if (range == null) {
      throw new IllegalArgumentException("No max for '"+featureName+"'");
    }
    return range[1];
  }
}
51 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/util/ParallelEntityEntityRelatednessComputation.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.util;
2 |
3 | import java.util.ArrayList;
4 | import java.util.Collections;
5 | import java.util.HashMap;
6 | import java.util.HashSet;
7 | import java.util.LinkedList;
8 | import java.util.List;
9 | import java.util.Map;
10 | import java.util.Set;
11 | import java.util.concurrent.CountDownLatch;
12 |
13 | import mpi.aida.config.AidaConfig;
14 | import mpi.aida.data.Entities;
15 | import mpi.aida.data.Entity;
16 | import mpi.aida.data.Mention;
17 | import mpi.aida.data.Mentions;
18 | import mpi.aida.graph.similarity.EnsembleEntityEntitySimilarity;
19 |
20 |
21 | public class ParallelEntityEntityRelatednessComputation {
22 | private int numThreads = 4; // default.
23 | private long totalNumCalcs = 0; // this is only valid if the object is created anew for each entitiy set - used for timing experiments
24 |
25 | public ParallelEntityEntityRelatednessComputation() {
26 | this(Integer.parseInt(AidaConfig.get(AidaConfig.EE_NUM_THREADS)));
27 | }
28 |
29 | public ParallelEntityEntityRelatednessComputation(int numThreads) {
30 | this.numThreads = numThreads;
31 | }
32 |
33 | public Map> computeRelatedness(EnsembleEntityEntitySimilarity entitySimilarity, Entities entities) throws InterruptedException {
34 | return computeRelatedness(entitySimilarity, entities, null);
35 | }
36 |
37 | public Map> computeRelatedness(EnsembleEntityEntitySimilarity entitySimilarity, Entities entities, Mentions mentions) throws InterruptedException {
38 | Map> entityEntitySimilarities = Collections.synchronizedMap(new HashMap>());
39 |
40 | Map> entityMentionsMap = null;
41 | if (mentions != null) {
42 | entityMentionsMap = prepareEntityMentionsMap(mentions);
43 | }
44 |
45 | List> entityPartitions = new LinkedList>();
46 | List allEntities = new ArrayList(entities.getEntities());
47 |
48 | int overall = 0;
49 | Set part = null;
50 | int partSize = entities.uniqueNameSize() / numThreads;
51 |
52 | for (int currentPart = 0; currentPart < numThreads; currentPart++) {
53 | part = new HashSet();
54 | entityPartitions.add(part);
55 |
56 | for (int j = 0; j < partSize; j++) {
57 | int total = (currentPart * partSize) + j;
58 | part.add(allEntities.get(total));
59 |
60 | overall++;
61 | }
62 | }
63 |
64 | // add rest to last part
65 | for (; overall < allEntities.size(); overall++) {
66 | part.add(allEntities.get(overall));
67 | }
68 |
69 | // create threads and run
70 | CountDownLatch cdl = new CountDownLatch(numThreads);
71 |
72 | List scs = new LinkedList();
73 |
74 | for (int i = 0; i < numThreads; i++) {
75 | ParallelEntityEntityRelatednessComputationThread sc = new ParallelEntityEntityRelatednessComputationThread(entityPartitions.get(i), entities, entitySimilarity, entityEntitySimilarities, entityMentionsMap, cdl);
76 | scs.add(sc);
77 | sc.start();
78 | }
79 |
80 | // wait for calculation to finish
81 | cdl.await();
82 |
83 | // sum up total number of calculations
84 | for (ParallelEntityEntityRelatednessComputationThread sc : scs) {
85 | totalNumCalcs += sc.getNumCalcs();
86 | }
87 |
88 | return entityEntitySimilarities;
89 | }
90 |
91 | private Map> prepareEntityMentionsMap(Mentions mentions) {
92 | Map> entityMentionsMap = new HashMap>();
93 |
94 | for (int i = 0; i < mentions.getMentions().size(); i++) {
95 | Mention mention = mentions.getMentions().get(i);
96 | Entities entities = mention.getCandidateEntities();
97 | for (Entity entity : entities) {
98 | List entityMentions = entityMentionsMap.get(entity);
99 | if (entityMentions == null) {
100 | entityMentions = new LinkedList();
101 | entityMentionsMap.put(entity, entityMentions);
102 | }
103 | entityMentions.add(mention);
104 | }
105 | }
106 |
107 | return entityMentionsMap;
108 | }
109 |
110 | public long getTotalNumCalcs() {
111 | return totalNumCalcs;
112 | }
113 | }
114 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/util/ParallelEntityEntityRelatednessComputationThread.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.util;
2 |
3 | import java.util.HashMap;
4 | import java.util.HashSet;
5 | import java.util.List;
6 | import java.util.Map;
7 | import java.util.Set;
8 | import java.util.concurrent.CountDownLatch;
9 |
10 | import mpi.aida.data.Entities;
11 | import mpi.aida.data.Entity;
12 | import mpi.aida.data.Mention;
13 | import mpi.aida.graph.similarity.EnsembleEntityEntitySimilarity;
14 |
15 | import org.slf4j.Logger;
16 | import org.slf4j.LoggerFactory;
17 |
18 |
19 | public class ParallelEntityEntityRelatednessComputationThread extends Thread {
20 | private static final Logger logger =
21 | LoggerFactory.getLogger(ParallelEntityEntityRelatednessComputationThread.class);
22 |
23 | private Set partition;
24 | private Entities allEntities;
25 | private EnsembleEntityEntitySimilarity eeSimMeasure;
26 | private Map> entityEntitySimilarities;
27 | private Map> entityMentionsMap;
28 | private CountDownLatch cdl;
29 | private int numCalcs = 0;
30 |
31 | public ParallelEntityEntityRelatednessComputationThread(Set partition, Entities allEntities, EnsembleEntityEntitySimilarity eeSim, Map> entityEntitySimilarities, Map> entityMentionsMap, CountDownLatch cdl) {
32 | this.partition = partition;
33 | this.allEntities = allEntities;
34 | this.eeSimMeasure = eeSim;
35 | this.entityEntitySimilarities = entityEntitySimilarities;
36 | this.entityMentionsMap = entityMentionsMap;
37 | this.cdl = cdl;
38 | }
39 |
40 | @Override
41 | public void run() {
42 | for (Entity e1 : partition) {
43 | for (Entity e2 : allEntities) {
44 | // only calculate and add if e1 < e2 (similarities are
45 | // symmetric, calculate in lexicographic order)
46 | if (e1.compareTo(e2) < 0) {
47 | double sim = 0.0;
48 | // calculate only if they belong to different mentions
49 | if (shouldCalculate(e1,e2)) {
50 | try {
51 | sim = eeSimMeasure.calcSimilarity(e1, e2);
52 | numCalcs++;
53 | // negative is not allowed
54 | if (sim < 0) {
55 | logger.warn("Coherence of '"+e1+"' and '"+e2+"' was < 0, set to 0");
56 | sim = 0.0;
57 | }
58 | } catch (Exception e) {
59 | e.printStackTrace();
60 | }
61 | } else {
62 | continue;
63 | }
64 |
65 | Map sims = entityEntitySimilarities.get(e1);
66 | if (sims == null) {
67 | sims = new HashMap();
68 | entityEntitySimilarities.put(e1, sims);
69 | }
70 | sims.put(e2, sim);
71 | }
72 | }
73 | }
74 | cdl.countDown();
75 | }
76 |
77 | public int getNumCalcs() {
78 | return numCalcs;
79 | }
80 |
81 | protected boolean shouldCalculate(Entity e1, Entity e2) {
82 | if (entityMentionsMap != null) {
83 | Set mentions1 = new HashSet();
84 |
85 | for (Mention m : entityMentionsMap.get(e1)) {
86 | mentions1.add(m);
87 | }
88 |
89 | Set mentions2 = new HashSet();
90 |
91 | for (Mention m : entityMentionsMap.get(e2)) {
92 | mentions2.add(m);
93 | }
94 |
95 | if (mentions1.size() != mentions2.size()) return true;
96 |
97 | for (Mention mention : mentions1) {
98 | if (!mentions2.contains(mention)) return true;
99 | }
100 | return false;
101 | } else {
102 | return true;
103 | }
104 | }
105 | }
106 |
--------------------------------------------------------------------------------
/src/mpi/aida/preparation/AidaTokenizerManager.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.preparation;
2 |
3 | import mpi.tokenizer.data.Tokenizer;
4 | import mpi.tokenizer.data.TokenizerManager;
5 | import mpi.tokenizer.data.Tokens;
6 |
/**
 * Thin static facade over {@link TokenizerManager} for initializing and
 * running tokenization.
 */
public class AidaTokenizerManager {
  /** Initializes the underlying TokenizerManager. */
  public static void init() {
    TokenizerManager.init();
  }

  /** Tokenizes the text of a document by delegating to TokenizerManager.parse(). */
  public static Tokens tokenize(String docId, String text, Tokenizer.type type, boolean lemmatize) {
    Tokens tokens = TokenizerManager.parse(docId, text, type, lemmatize);
    return tokens;
  }
}
17 |
--------------------------------------------------------------------------------
/src/mpi/aida/preparation/mentionrecognition/FilterMentions.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.preparation.mentionrecognition;
2 |
3 | import java.io.Serializable;
4 | import java.util.LinkedList;
5 | import java.util.List;
6 |
7 | import javatools.datatypes.Pair;
8 | import mpi.aida.data.Mentions;
9 | import mpi.aida.data.PreparedInput;
10 | import mpi.tokenizer.data.Token;
11 | import mpi.tokenizer.data.Tokens;
12 |
13 | public class FilterMentions implements Serializable {
14 |
15 | private static final long serialVersionUID = 6260499966421708963L;
16 |
17 | private NamedEntityFilter namedEntityFilter = null;
18 |
19 | private ManualFilter manualFilter = null;
20 |
21 | private HybridFilter hybridFilter = null;
22 |
23 | public FilterMentions() {
24 | namedEntityFilter = new NamedEntityFilter();
25 | manualFilter = new ManualFilter();
26 | hybridFilter = new HybridFilter();
27 | }
28 |
29 | /** which type of tokens to get*/
30 | public static enum FilterType {
31 | STANFORD_NER, Manual, ManualPOS, Manual_NER, Hybrid, None;
32 | };
33 |
34 | public PreparedInput filter(String text, String docId, Tokens tokens, FilterType by) {
35 | Mentions mentions = null;
36 | Tokens returnTokens = null;
37 | if (by.equals(FilterType.STANFORD_NER)) {
38 | mentions = namedEntityFilter.filter(tokens);
39 | returnTokens = tokens;
40 | } else if (by.equals(FilterType.Manual) || by.equals(FilterType.ManualPOS) || by.equals(FilterType.Manual_NER)) {
41 | Pair tokensMentions = manualFilter.filter(text, docId, by);
42 | mentions = tokensMentions.second();
43 | returnTokens = tokensMentions.first();
44 | } else if (by.equals(FilterType.Hybrid)) {
45 | Pair tokensMentions = manualFilter.filter(text, docId, by);
46 | Mentions manualMentions = tokensMentions.second();
47 | Mentions NERmentions = namedEntityFilter.filter(tokensMentions.first());
48 | mentions = hybridFilter.parse(manualMentions, NERmentions);
49 | returnTokens = tokensMentions.first();
50 | } else if (by.equals(FilterType.None)) {
51 | mentions = new Mentions();
52 | List tokenlist = new LinkedList();
53 | for (int p = 0; p < tokens.size(); p++) {
54 | Token token = tokens.getToken(p);
55 | tokenlist.add(token.getOriginal());
56 | }
57 | returnTokens = tokens;
58 | }
59 | PreparedInput preparedInput = new PreparedInput(docId, returnTokens, mentions);
60 | return preparedInput;
61 | }
62 | }
--------------------------------------------------------------------------------
/src/mpi/aida/preparation/mentionrecognition/HybridFilter.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.preparation.mentionrecognition;
2 |
3 | import java.util.Collections;
4 | import java.util.LinkedList;
5 | import java.util.List;
6 |
7 | import mpi.aida.data.Mention;
8 | import mpi.aida.data.Mentions;
9 |
10 | public class HybridFilter {
11 |
12 | public Mentions parse(Mentions manual, Mentions ner) {
13 | int from = 0;
14 | List toAdd = new LinkedList();
15 | for (int i = 0; i < ner.getMentions().size(); i++) {
16 | Mention nerMention = ner.getMentions().get(i);
17 | boolean ok = true;
18 | int nerStart = nerMention.getStartToken();
19 | int nerEnd = nerMention.getEndToken();
20 | for (int m = from; m < manual.getMentions().size(); m++) {
21 | Mention manMention = manual.getMentions().get(m);
22 | int manStart = manMention.getStartToken();
23 | int manEnd = manMention.getEndToken();
24 | if (nerEnd >= manStart && nerEnd <= manEnd) {
25 | ok = false;
26 | } else if (nerStart >= manStart && nerStart <= manEnd) {
27 | ok = false;
28 | } else if (nerStart <= manStart && nerEnd >= manEnd) {
29 | ok = false;
30 | }
31 | }
32 | if (ok) {
33 | toAdd.add(nerMention);
34 | }
35 | }
36 | for (int i = 0; i < toAdd.size(); i++) {
37 | manual.addMention(toAdd.get(i));
38 | }
39 | Collections.sort(manual.getMentions());
40 | return manual;
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/src/mpi/aida/preparation/mentionrecognition/NamedEntityFilter.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.preparation.mentionrecognition;
2 |
3 | import java.util.HashMap;
4 | import java.util.LinkedList;
5 | import java.util.List;
6 |
7 | import mpi.aida.data.Mention;
8 | import mpi.aida.data.Mentions;
9 | import mpi.tokenizer.data.Token;
10 | import mpi.tokenizer.data.Tokens;
11 |
12 | public class NamedEntityFilter {
13 |
14 | private HashMap tags = null;
15 |
16 | public NamedEntityFilter() {
17 | tags = new HashMap();
18 | tags.put("LOCATION", "LOCATION");
19 | tags.put("I-LOC", "I-LOC");
20 | tags.put("B-LOC", "I-LOC");
21 | tags.put("PERSON", "PERSON");
22 | tags.put("I-PER", "I-PER");
23 | tags.put("B-PER", "I-PER");
24 | tags.put("ORGANIZATION", "ORGANIZATION");
25 | tags.put("I-ORG", "I-ORG");
26 | tags.put("B-ORG", "I-ORG");
27 | tags.put("MISC", "MISC");
28 | tags.put("I-MISC", "I-MISC");
29 | tags.put("B-MISC", "I-MISC");
30 | }
31 |
32 | public Mentions filter(Tokens tokens) {
33 | Mentions mentions = new Mentions();
34 | HashMap subStrings = new HashMap();
35 | List content = new LinkedList();
36 | for (int p = 0; p < tokens.size(); p++) {
37 | Token token = tokens.getToken(p);
38 | content.add(token.getOriginal());
39 | }
40 | String previous = null;
41 | int start = -1;
42 | int end = -1;
43 | for (int p = 0; p < tokens.size(); p++) {
44 | Token token = tokens.getToken(p);
45 | if (previous == null) {
46 | if (tags.containsKey(token.getNE())) {
47 | previous = tags.get(token.getNE());
48 | start = token.getId();
49 | end = token.getId();
50 | }
51 | } else if (previous.equals(token.getNE())) {
52 | end = token.getId();
53 | } else {
54 | Mention newMentions = getPossibleMentions(start, end, tokens);
55 | mentions.addMention(newMentions);
56 | subStrings.put(start, end);
57 | previous = null;
58 | if (tags.containsKey(token.getNE())) {
59 | previous = tags.get(token.getNE());
60 | start = token.getId();
61 | end = token.getId();
62 | }
63 | }
64 | }
65 | if (previous != null) {
66 | Mention newMentions = getPossibleMentions(start, end, tokens);
67 | mentions.addMention(newMentions);
68 | subStrings.put(start, end);
69 | previous = null;
70 | }
71 | mentions.setSubstring(subStrings);
72 | return mentions;
73 | }
74 |
75 | private Mention getPossibleMentions(int start, int end, Tokens advTokens) {
76 | String meansArg = advTokens.toText(start, end);
77 | int startStanford = advTokens.getToken(start).getStandfordId();
78 | int sentenceId = advTokens.getToken(start).getSentence();
79 | int endStanford = advTokens.getToken(end).getStandfordId();
80 | Mention mention = new Mention(meansArg, start, end, startStanford, endStanford, sentenceId);
81 | int firstChar = advTokens.getToken(mention.getStartToken()).getBeginIndex();
82 | int lastChar = advTokens.getToken(mention.getEndToken()).getEndIndex();
83 | int charLength = lastChar - firstChar;
84 | mention.setCharOffset(firstChar);
85 | mention.setCharLength(charLength);
86 | return mention;
87 | }
88 | }
89 |
--------------------------------------------------------------------------------
/src/mpi/aida/util/CollectionUtils.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.util;
2 |
3 | import java.util.Collections;
4 | import java.util.Comparator;
5 | import java.util.LinkedHashMap;
6 | import java.util.LinkedList;
7 | import java.util.List;
8 | import java.util.Map;
9 |
10 |
public class CollectionUtils {

  /**
   * Sorts the given map by value in ascending order.
   *
   * @param map map to sort; not modified
   * @return a new LinkedHashMap whose iteration order is ascending by value
   */
  public static <K, V extends Comparable<? super V>> LinkedHashMap<K, V> sortMapByValue(Map<K, V> map) {
    return sortMapByValue(map, false);
  }

  /**
   * Sorts the given map by value.
   *
   * @param map map to sort; not modified
   * @param descending if true, sort descending by value
   * @return a new LinkedHashMap whose iteration order follows the sorted values
   */
  public static <K, V extends Comparable<? super V>> LinkedHashMap<K, V> sortMapByValue(Map<K, V> map, final boolean descending) {
    List<Map.Entry<K, V>> list = new LinkedList<Map.Entry<K, V>>(map.entrySet());
    Collections.sort(list, new Comparator<Map.Entry<K, V>>() {

      @Override
      public int compare(Map.Entry<K, V> o1, Map.Entry<K, V> o2) {
        int comp = o1.getValue().compareTo(o2.getValue());

        if (descending) {
          comp = comp * (-1);
        }

        return comp;
      }
    });

    // LinkedHashMap preserves the insertion (i.e. sorted) order.
    LinkedHashMap<K, V> result = new LinkedHashMap<K, V>();
    for (Map.Entry<K, V> entry : list) {
      result.put(entry.getKey(), entry.getValue());
    }
    return result;
  }
}
38 |
--------------------------------------------------------------------------------
/src/mpi/aida/util/DocumentCounter.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.util;
2 |
3 | import java.util.Map;
4 | import java.util.Observable;
5 |
6 | import mpi.aida.data.DisambiguationResults;
7 |
8 | import org.slf4j.Logger;
9 | import org.slf4j.LoggerFactory;
10 |
11 |
12 | public class DocumentCounter extends Observable {
13 | private static final Logger logger =
14 | LoggerFactory.getLogger(DocumentCounter.class);
15 |
16 | private int completed;
17 | private int total;
18 | private long startTime;
19 |
20 | private Map resultsMap;
21 |
22 | public DocumentCounter(int total) {
23 | completed = 0;
24 | this.total = total;
25 | startTime = System.currentTimeMillis();
26 | }
27 |
28 | public synchronized void oneDone() {
29 | setChanged();
30 | completed++;
31 | notifyObservers(resultsMap);
32 |
33 | long runtime = (System.currentTimeMillis() - startTime) / 1000;
34 | logger.info(completed+"/"+total+" DONE ("+runtime+"s total)");
35 | }
36 |
37 | public Map getResultsMap() {
38 | return resultsMap;
39 | }
40 |
41 | public void setResultsMap(Map resultsMap) {
42 | this.resultsMap = resultsMap;
43 | }
44 | }
--------------------------------------------------------------------------------
/src/mpi/aida/util/InputTextInvertedIndex.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.util;
2 |
3 | import gnu.trove.iterator.TIntIterator;
4 | import gnu.trove.list.linked.TIntLinkedList;
5 | import gnu.trove.map.hash.TIntIntHashMap;
6 | import gnu.trove.map.hash.TIntObjectHashMap;
7 |
8 | import java.util.LinkedList;
9 | import java.util.List;
10 |
11 | import mpi.aida.data.Mention;
12 |
/**
 * Inverted index over the token ids of an input text. Two parallel indexes
 * are maintained: one over the full token sequence (stop words included) and
 * one over the stopword-free sequence, whose positions are counted after
 * removing stop words.
 */
public class InputTextInvertedIndex {
  // token id -> positions in the full token sequence (stop words included)
  private TIntObjectHashMap indexIncludingStopWords;
  // token id -> positions in the stopword-free token sequence
  private TIntObjectHashMap indexWithoutStopWords;

  /** Creates an empty index; fill it via addToIndex(). */
  public InputTextInvertedIndex() {
    indexIncludingStopWords = new TIntObjectHashMap();
    indexWithoutStopWords = new TIntObjectHashMap();
  }

  /**
   * Builds both indexes from the token ids in document order.
   *
   * @param tokens token ids in document order
   * @param isRemoveStopWords if true, stop words are left out of the
   *        stopword-free index (they always remain in the full index)
   */
  public InputTextInvertedIndex(int[] tokens, boolean isRemoveStopWords) {
    indexIncludingStopWords = new TIntObjectHashMap();
    indexWithoutStopWords = new TIntObjectHashMap();
    int noStopwordsPosition = 0;
    for (int position = 0; position < tokens.length; ++position) {
      int token = tokens[position];
      TIntLinkedList positions = indexIncludingStopWords.get(token);
      if (positions == null) {
        positions = new TIntLinkedList();
        indexIncludingStopWords.put(token, positions);
      }
      positions.add(position);

      if(!isRemoveStopWords || !StopWord.is(token)) {
        positions = indexWithoutStopWords.get(token);
        if (positions == null) {
          positions = new TIntLinkedList();
          indexWithoutStopWords.put(token, positions);
        }
        // Positions in this index are relative to the stopword-free sequence.
        positions.add(noStopwordsPosition);
        noStopwordsPosition++;
      }
    }
  }

  /**
   * Checks whether the word occurs in the text outside the given mention's
   * token span (occurrences inside the mention do not count).
   */
  public boolean containsWord(int word, Mention mention) {
    if(!indexWithoutStopWords.containsKey(word))
      return false;
    // NOTE(review): presence is checked on indexWithoutStopWords but the
    // positions are read from indexIncludingStopWords — presumably intended
    // (skip stop words, but compare against real text positions); confirm.
    TIntLinkedList positions = indexIncludingStopWords.get(word);
    int mentionStart = mention.getStartToken();
    int mentionEnd = mention.getEndToken();
    for(TIntIterator itr = positions.iterator(); itr.hasNext(); ) {
      int position = itr.next();
      if(position < mentionStart || position > mentionEnd)
        return true;
    }
    return false;
  }

  /**
   * Returns the stopword-free positions of the word relative to the mention:
   * positions after the mention are shifted left by the mention's length so
   * distances are measured as if the mention were removed.
   *
   * NOTE(review): index i is used against both the full and the
   * stopword-free position lists — this assumes the word is not a stop word,
   * so both lists have equal length. Confirm callers only pass such words.
   */
  public List getPositions(int word, Mention mention) {
    int mentionStart = mention.getStartToken();
    int mentionEnd = mention.getEndToken();
    int mentionLength = mentionEnd - mentionStart + 1;

    List positions = new LinkedList();
    //we need to subtract the mention length if the keyword is after the mention
    for(int i = 0; i < indexIncludingStopWords.get(word).size(); i++) {
      //get the keyword position from the full index (including stopwords)
      int position = indexIncludingStopWords.get(word).get(i);
      //compare to know the position of the keyword relative to the mention
      if(position < mentionStart) //before the mention, return the actual position from the stopwords free index
        positions.add(indexWithoutStopWords.get(word).get(i));
      else if((position > mentionEnd)) //if after the mention, get the actual position and subtract mention length
        positions.add(indexWithoutStopWords.get(word).get(i) - mentionLength);
    }

    return positions;
  }

  /**
   * Appends one position per word to both indexes.
   *
   * @param newIndexEntries word id -> position; the same offset is added to
   *        the full and the stopword-free index, so offsets are assumed to
   *        be pre-adjusted by the caller — TODO confirm.
   */
  public void addToIndex(TIntIntHashMap newIndexEntries) {
    for(int word: newIndexEntries.keys()) {
      int offset = newIndexEntries.get(word);

      TIntLinkedList positions;
      positions = indexIncludingStopWords.get(word);
      if (positions == null) {
        positions = new TIntLinkedList();
        indexIncludingStopWords.put(word, positions);
      }
      positions.add(offset);

      positions = indexWithoutStopWords.get(word);
      if (positions == null) {
        positions = new TIntLinkedList();
        indexWithoutStopWords.put(word, positions);
      }
      positions.add(offset);


    }
  }

}
105 |
--------------------------------------------------------------------------------
/src/mpi/aida/util/MinCover.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.util;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 |
/**
 * Plain data holder for the result of a minimal-cover computation: the cover
 * length plus the start/end positions of the covering intervals.
 */
public class MinCover {
  /** Length of the minimal cover. */
  public int length;
  /** Start positions of the covering intervals. */
  public List<Integer> startPositions = new ArrayList<Integer>();
  /** End positions of the covering intervals, parallel to startPositions. */
  public List<Integer> endPositions = new ArrayList<Integer>();
}
11 |
--------------------------------------------------------------------------------
/src/mpi/aida/util/NiceTime.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.util;
2 |
3 | /**
4 | * Contains a method that will create a String from any long
5 | * saying how many days, hours, minutes ... the time value
6 | * represents.
7 | */
/**
 * Contains a method that will create a String from any long
 * saying how many days, hours, minutes ... the time value
 * represents.
 */
public class NiceTime {

  /**
   * Converts a duration in milliseconds into a readable time string,
   * e.g. 1234 becomes "1s, 234ms". Negative input yields "0ms".
   *
   * @param time duration in milliseconds
   * @return human-readable duration string
   */
  public static String convert(long time) {
    // -1 marks a unit that was never reached, so it is omitted from output.
    long seconds = -1;
    long minutes = -1;
    long hours = -1;
    // StringBuilder instead of StringBuffer: local, single-threaded use.
    StringBuilder sb = new StringBuilder(100);
    if (time < 0) {
      return "0ms";
    }
    long milliseconds = time % 1000;
    time = time / 1000;
    if (time > 0) {
      seconds = time % 60;
      time = time / 60;
    }
    if (time > 0) {
      minutes = time % 60;
      time = time / 60;
    }
    if (time > 0) {
      hours = time % 24;
      time = time / 24;
    }
    // Whatever remains after the divisions above is whole days.
    if (time > 0) {
      sb.append(time + "d, ");
    }
    if (hours != -1) {
      sb.append(hours + "h, ");
    }
    if (minutes != -1) {
      sb.append(minutes + "m, ");
    }
    if (seconds != -1) {
      sb.append(seconds + "s, ");
    }
    sb.append(milliseconds + "ms");
    return sb.toString();
  }

  /** Convenience overload; truncates the double to a long. */
  public static String convert(double time) {
    return convert((long) time);
  }
}
59 |
--------------------------------------------------------------------------------
/src/mpi/aida/util/Result.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.util;
2 |
3 | import java.util.Collections;
4 | import java.util.HashMap;
5 | import java.util.LinkedList;
6 | import java.util.List;
7 |
8 | import mpi.aida.util.htmloutput.ResultMention;
9 | import mpi.tokenizer.data.Tokens;
10 |
11 | public class Result {
12 |
13 | private String text;
14 |
15 | private String docId;
16 |
17 | private List dataSetIds;
18 |
19 | private String groundTruthId = null;
20 |
21 | private Tokens tokens;
22 |
23 | private HashMap> finalEntities = null;
24 |
25 | public Result(String docId, String text, Tokens tokens, String groundTruthId) {
26 | this.docId = docId;
27 | this.text = text;
28 | this.dataSetIds = new LinkedList();
29 | this.tokens = tokens;
30 | finalEntities = new HashMap>();
31 | this.groundTruthId = groundTruthId;
32 | }
33 |
34 | public void addFinalentity(ResultMention entity) {
35 | registerDataSet(entity.getDataSetId());
36 | HashMap entry = null;
37 | if (finalEntities.containsKey(entity.getOffset())) {
38 | entry = finalEntities.get(entity.getOffset());
39 | } else {
40 | entry = new HashMap();
41 | finalEntities.put(entity.getOffset(), entry);
42 | }
43 | if (!entry.containsKey(entity.getOffset())) {
44 | entry.put(entity.getDataSetId(), entity);
45 | }
46 | }
47 |
48 | private void registerDataSet(String dataSetId) {
49 | if (!dataSetIds.contains(dataSetId)) {
50 | if (dataSetId.equals(groundTruthId)) {
51 | dataSetIds.add(0, dataSetId);
52 | } else {
53 | dataSetIds.add(dataSetId);
54 | }
55 | }
56 | }
57 |
58 | public String getDocId() {
59 | return docId;
60 | }
61 |
62 | public String getText() {
63 | return text;
64 | }
65 |
66 | public boolean containsMention(int offset) {
67 | return finalEntities.containsKey(offset);
68 | }
69 |
70 | public boolean containsMention(int offset, String id) {
71 | if (!finalEntities.containsKey(offset)) {
72 | return false;
73 | }
74 | return finalEntities.get(offset).containsKey(id);
75 | }
76 |
77 | public HashMap getMention(int offset) {
78 | return finalEntities.get(offset);
79 | }
80 |
81 | public int size() {
82 | return finalEntities.size();
83 | }
84 |
85 | public Tokens getTokens() {
86 | return tokens;
87 | }
88 |
89 | public List getDataSetIds() {
90 | return dataSetIds;
91 | }
92 |
93 | public void sortDataSetIds(HashMap idsAvgPrec){
94 | Collections.sort(dataSetIds, new SortByAvgPre(idsAvgPrec));
95 | dataSetIds.remove(groundTruthId);
96 | dataSetIds.add(0,groundTruthId);
97 | }
98 |
99 | public String getGroundTruthId() {
100 | return groundTruthId;
101 | }
102 |
103 | }
104 |
--------------------------------------------------------------------------------
/src/mpi/aida/util/SortByAvgPre.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.util;
2 |
3 | import java.util.Comparator;
4 | import java.util.HashMap;
5 |
/**
 * Orders data set ids by their average precision, descending. Ids with no
 * precision value (null or "none") sort last; two such ids compare equal.
 */
public class SortByAvgPre implements Comparator<String> {

  /** data set id -> average precision rendered as a string (or "none"). */
  private HashMap<String, String> idsAvgPrec = null;

  public SortByAvgPre(HashMap<String, String> idsAvgPrec) {
    this.idsAvgPrec = idsAvgPrec;
  }

  @Override
  public int compare(String o1, String o2) {
    String p1 = idsAvgPrec.get(o1);
    String p2 = idsAvgPrec.get(o2);
    if (p1 == null && p2 == null) {
      return 0;
    } else if (p1 == null || p1.equals("none")) {
      return 1;
    } else if (p2 == null || p2.equals("none")) {
      return -1;
    }
    double first = Double.parseDouble(p1);
    double second = Double.parseDouble(p2);
    // Swapped arguments give descending order (higher precision first).
    return Double.compare(second, first);
  }

}
34 |
--------------------------------------------------------------------------------
/src/mpi/aida/util/WikipediaDumpArticleIdExtractor.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.util;
2 |
3 | import java.io.Reader;
4 |
5 | import javatools.filehandlers.FileLines;
6 | import javatools.parsers.Char;
7 | import javatools.util.FileUtils;
8 |
9 | /**
10 | * Extracts all article ids from a Wikipedia pages-articles dump.
11 | * Output format is:
12 | * article_titleid
13 | *
14 | *
15 | */
16 | public class WikipediaDumpArticleIdExtractor {
17 |
18 | public static void main(String[] args) throws Exception {
19 | if (args.length != 1) {
20 | printUsage();
21 | System.exit(1);
22 | }
23 |
24 | final Reader reader = FileUtils.getBufferedUTF8Reader(args[0]);
25 | String page = FileLines.readBetween(reader, "", "");
26 |
27 | int pagesDone = 0;
28 |
29 | while (page != null) {
30 | if (++pagesDone % 100000 == 0) {
31 | System.err.println(pagesDone + " pages done.");
32 | }
33 |
34 | page = Char.decodeAmpersand(page.replace("&", "&"));
35 | String title = FileLines.readBetween(page, "", "");
36 | String id = FileLines.readBetween(page, "", "");
37 | String wpUrl = "http://en.wikipedia.org/wiki/" + title.replace(' ', '_');
38 | System.out.println(wpUrl + "\t" + id);
39 |
40 | page = FileLines.readBetween(reader, "", "");
41 | }
42 | }
43 |
44 | public static void printUsage() {
45 | System.out.println("Usage:");
46 | System.out.println("\tWikipediaDumpArticleIdExtractor ");
47 | }
48 | }
--------------------------------------------------------------------------------
/src/mpi/aida/util/WikipediaUtil.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.util;
2 |
3 | import java.util.regex.Pattern;
4 |
public class WikipediaUtil {

  // Total number of documents in the Wikipedia snapshot this was built for.
  public static final int TOTAL_DOCS = 2628265;

  /**
   * Returns ONLY text (minus headlines, links, etc.) for a Wikipedia article source
   *
   * NOTE(review): several regex literals in this method appear mangled —
   * angle-bracket markup (e.g. &lt;ref&gt;, &lt;gallery&gt;, &lt;table&gt;, &lt;!-- --&gt;) seems
   * to have been stripped from the source. "[" and ").*" are not even valid
   * regexes (PatternSyntaxException at runtime), and "(?s).*" would wipe the
   * entire text. The original patterns must be restored before this method
   * is usable; the broken lines are flagged individually below.
   *
   * @param text
   * @return
   */
  public static String cleanWikipediaArticle(String text) {
    // replace newlines
    text = text.replace('\n', ' ');

    // remove external links
    text = text.replaceAll("(\\[https?:.+)\\[\\[[^\\[\\]]+\\]\\]", "$1");
    text = text.replaceAll("\\[https?:[^\\[\\]]+\\]", " ");

    // remove references
    // NOTE(review): "[" is an invalid regex (unclosed character class) — the
    // original reference-tag pattern was lost; restore it.
    text = text.replaceAll("[", "");
    text = text.replaceAll("]", "");

    // remove galleries
    // NOTE(review): "(?s).*" matches the whole remaining text and would erase
    // everything — the gallery-tag delimiters were lost; restore them.
    text = text.replaceAll("(?s).*", "");

    // remove xml tags
    text = text.replaceAll("<[^/t! ][^>]+>", " ");
    text = text.replaceAll("[^t][^>]+>", " ");

    // remove tables
    // NOTE(review): ").*" is an invalid regex (unmatched ')') — the original
    // table pattern was lost; restore it.
    text = Pattern.compile(").*", Pattern.DOTALL).matcher(text).replaceAll("");

    // remove xml comments
    // NOTE(review): empty pattern — the comment delimiters were lost.
    text = Pattern.compile("", Pattern.DOTALL).matcher(text).replaceAll("");

    // remove all templates/macros
    text = text.replaceAll("'{2,}", "");
    text = text.replaceAll("\\[\\[[^\\[\\]]+:[^\\[\\]]+\\]\\]", "");

    // workaround for mal-formed tables
    text = Pattern.compile("\\{\\{Standard table\\|0\\}\\}.*\\{\\{close table\\}\\}", Pattern.DOTALL).matcher(text).replaceAll("");
    text = text.replaceAll("\\{\\{[sS]tart [bB]ox\\}\\}", "{|");
    text = text.replaceAll("\\{\\{[eE]nd [bB]ox\\}\\}", "|}");
    text = Pattern.compile("(?s)\\{\\|((?!\\|\\}).)*\n\\|\\}\n", Pattern.DOTALL).matcher(text).replaceAll("");

    // remove templates/infoboxes
    text = text.replaceAll("\\{\\{[[^\\{\\}]]+\\}\\}", " ");

    // workaround for some non-standard texts
    text = text.replaceAll("(?s)\\{\\|.*\n\\|\\}\u2020Denotes wild-card team \\(since 1995\\)\\.\n", "");
    text = Pattern.compile("^\\*{1,2}.*$", Pattern.MULTILINE).matcher(text).replaceAll("");
    text = Pattern.compile("^\\;.*$", Pattern.MULTILINE).matcher(text).replaceAll("");
    text = Pattern.compile("^:+.*$", Pattern.MULTILINE).matcher(text).replaceAll("");

    // remove [[ ... : ... ]]
    text = text.replaceAll("\\[\\[[^\\[\\]]+:[^\\[\\]]+\\]\\]", " ");

    // remove headlines
    text = text.replaceAll("={2,}.*?={2,}"," ");

    // replace links: keep the display text, drop the target/brackets
    text = text.replaceAll("\\[\\[[^\\]]+?\\|([^\\]\\n]+?)\\]\\]", "$1");
    text = text.replaceAll("\\[\\[([^\\]]+?)\\]\\]", "$1");

    // normalize whitespaces
    text = text.replaceAll("[\\s\\x00-\\x1F]+", " ");

    // normalize other characters
    // NOTE(review): replacing "<" with "<" is a no-op; the patterns were
    // presumably "&lt;"/"&gt;" entity replacements before markup stripping.
    text = text.replaceAll("<", "<").replaceAll(">", ">");

    return text;
  }
}
78 |
--------------------------------------------------------------------------------
/src/mpi/aida/util/YagoUtil.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.util;
2 |
3 | import gnu.trove.map.hash.TIntObjectHashMap;
4 | import gnu.trove.set.hash.TIntHashSet;
5 |
6 | import java.sql.SQLException;
7 | import java.util.Collection;
8 | import java.util.LinkedList;
9 | import java.util.List;
10 |
11 | import mpi.aida.access.DataAccess;
12 | import mpi.aida.data.Entities;
13 | import mpi.aida.data.Entity;
14 |
15 | import org.apache.commons.lang.StringUtils;
16 |
17 | import basics.Normalize;
18 |
19 | /**
20 | * This class contains some convenience wrappers for accessing YAGO data.
21 | * It has to use DataAccess and MUST NOT access the DB directly!
22 | *
23 | *
24 | */
25 | public class YagoUtil {
26 |
27 | public static final int TOTAL_YAGO_ENTITIES = 2651987;
28 |
29 | public enum Gender {
30 | FEMALE, MALE;
31 | }
32 |
33 | /**
34 | * Checks whether the given String is an entity in YAGO
35 | *
36 | * @param entity Entity to check.
37 | * @return true if the entity is in YAGO
38 | * @throws SQLException
39 | */
40 | public static boolean isYagoEntity(Entity entity) throws SQLException {
41 | return DataAccess.isYagoEntity(entity);
42 | }
43 |
44 | public static Entity getEntityForId(int id) {
45 | return new Entity(DataAccess.getYagoEntityIdForId(id), id);
46 | }
47 |
48 | public static Entities getEntitiesForIds(int[] ids) {
49 | TIntObjectHashMap yagoEntityIds =
50 | DataAccess.getYagoEntityIdsForIds(ids);
51 | Entities entities = new Entities();
52 | for (int i = 0; i < ids.length; ++i) {
53 | entities.add(new Entity(yagoEntityIds.get(ids[i]), ids[i]));
54 | }
55 | return entities;
56 | }
57 |
58 | public static Entity getEntityForYagoId(String id) {
59 | return new Entity(id, DataAccess.getIdForYagoEntityId(id));
60 | }
61 |
62 | public static Entities getEntitiesForYagoEntityIds(Collection names) {
63 | Entities entities = new Entities();
64 | for (String name : names) {
65 | entities.add(new Entity(name, DataAccess.getIdForYagoEntityId(name)));
66 | }
67 | return entities;
68 | }
69 |
70 | /**
71 | * Formats a given mention string properly to query a yago database.
72 | *
73 | * It will first transform the string into a YAGO string (with "" and
74 | * UTF-8 with backslash encoding), and then escape the string properly
75 | * for a Postgres query.
76 | *
77 | * @param mention Mention to format
78 | * @return Mention in YAGO2/Postgres format
79 | */
80 | public static String getYagoMentionStringPostgresEscaped(String mention) {
81 | return getPostgresEscapedString(Normalize.string(mention));
82 | }
83 |
84 | public static String getPostgresEscapedString(String input) {
85 | return input.replace("'", "''").replace("\\", "\\\\");
86 | }
87 |
88 | public static String getPostgresEscapedConcatenatedQuery(Collection entities) {
89 | List queryTerms = new LinkedList();
90 |
91 | for (String term : entities) {
92 | StringBuilder sb = new StringBuilder();
93 | sb.append("E'").append(YagoUtil.getPostgresEscapedString(term)).append("'");
94 | queryTerms.add(sb.toString());
95 | }
96 |
97 | return StringUtils.join(queryTerms, ",");
98 | }
99 |
100 | public static String getIdQuery(TIntHashSet ids) {
101 | int[] conv = ids.toArray();
102 | return getIdQuery(conv);
103 | }
104 |
105 | public static String getIdQuery(int[] ids) {
106 | StringBuilder sb = new StringBuilder();
107 | for (int i = 0; i < ids.length; ++i) {
108 | sb.append(ids[i]);
109 | if (i < ids.length - 1) {
110 | sb.append(",");
111 | }
112 | }
113 | return sb.toString();
114 | }
115 | }
--------------------------------------------------------------------------------
/src/mpi/aida/util/htmloutput/ResultMention.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.util.htmloutput;
2 |
/**
 * One disambiguated mention produced by a particular data set run: its
 * position in the text, the surface string, the mapped entity, and the
 * confidence of the mapping.
 */
public class ResultMention {

  private final String dataSetId;

  private final int offset;

  private final int length;

  private final String mention;

  private final String entity;

  private final double confidence;

  private final boolean isYagoEntity;

  public ResultMention(String dataSetId, int offset, int length, String mention, String entity, double confidence, boolean isYagoEntity) {
    this.dataSetId = dataSetId;
    this.offset = offset;
    this.length = length;
    this.mention = mention;
    this.entity = entity;
    this.confidence = confidence;
    this.isYagoEntity = isYagoEntity;
  }

  public String getDataSetId() {
    return dataSetId;
  }

  public int getOffset() {
    return offset;
  }

  public int getLength() {
    return length;
  }

  public String getMention() {
    return mention;
  }

  public String getEntity() {
    return entity;
  }

  public double getConfidence() {
    return confidence;
  }

  public boolean isYagoEntity() {
    return isYagoEntity;
  }

  /** Tab-separated: offset, length, mention, entity. */
  public String toString() {
    StringBuilder sb = new StringBuilder();
    sb.append(offset).append('\t').append(length).append('\t');
    sb.append(mention).append('\t').append(entity);
    return sb.toString();
  }

}
62 |
--------------------------------------------------------------------------------
/src/mpi/experiment/measure/EvaluationMeasures.java:
--------------------------------------------------------------------------------
1 | package mpi.experiment.measure;
2 |
3 | import java.util.HashMap;
4 | import java.util.List;
5 | import java.util.Map;
6 |
7 |
8 | public class EvaluationMeasures {
9 | public static Map convertToAverageRanks(List> list) {
10 | Map rankedList = new HashMap