├── COPYING ├── README.html ├── README.md ├── build.xml ├── lib ├── JavaEWAH-0.4.2.jar ├── basics2_20100910.jar ├── commons-cli-1.2.jar ├── commons-collections-3.2.1.jar ├── commons-io-1.4.jar ├── commons-lang-2.3.jar ├── javatools_20120619.jar ├── jgrapht.jar ├── joda-time.jar ├── jollyday.jar ├── junit-4.11.jar ├── log4j-1.2.17.jar ├── mpi-DBManager-20121219.jar ├── mpi-TokenizerService2-20130124.jar ├── postgresql-9.2-1002.jdbc4.jar ├── slf4j-api-1.7.2.jar ├── slf4j-log4j12-1.7.2.jar ├── stanford-corenlp-1.3.4-models.jar ├── stanford-corenlp-1.3.4.jar ├── trove-3.0.3.jar └── xom.jar ├── licenses ├── apache-license-2.txt ├── cc-by-nc-sa.txt ├── cern.txt ├── cpl-1.0.txt ├── gnu-gpl-v2.txt ├── gnu-gpl-v3.txt ├── gnu-lgpl-v2.1.txt └── mit.txt ├── resources └── log4j.properties ├── sample_settings ├── aida.properties └── database_aida.properties ├── settings └── tokens │ ├── stopwords6.txt │ └── symbols.txt ├── src └── mpi │ ├── aida │ ├── AidaManager.java │ ├── CommandLineDisambiguator.java │ ├── Disambiguator.java │ ├── Preparator.java │ ├── access │ │ ├── DataAccess.java │ │ ├── DataAccessForTesting.java │ │ ├── DataAccessInterface.java │ │ └── DataAccessSQL.java │ ├── config │ │ ├── AidaConfig.java │ │ └── settings │ │ │ ├── DisambiguationSettings.java │ │ │ ├── PreparationSettings.java │ │ │ ├── Settings.java │ │ │ ├── disambiguation │ │ │ ├── CocktailPartyDisambiguationSettings.java │ │ │ ├── CocktailPartyKOREDisambiguationSettings.java │ │ │ ├── LocalDisambiguationSettings.java │ │ │ └── PriorOnlyDisambiguationSettings.java │ │ │ └── preparation │ │ │ ├── StanfordHybridPreparationSettings.java │ │ │ └── StanfordManualPreparationSettings.java │ ├── data │ │ ├── Context.java │ │ ├── DisambiguationResults.java │ │ ├── Entities.java │ │ ├── Entity.java │ │ ├── Keyphrases.java │ │ ├── Mention.java │ │ ├── Mentions.java │ │ ├── PreparedInput.java │ │ ├── ResultEntity.java │ │ └── ResultMention.java │ ├── disambiguationtechnique │ │ └── LocalDisambiguation.java │ ├── 
graph │ │ ├── Graph.java │ │ ├── GraphGenerator.java │ │ ├── GraphNode.java │ │ ├── GraphNodeTypes.java │ │ ├── algorithms │ │ │ ├── CocktailParty.java │ │ │ ├── CocktailPartySizeConstrained.java │ │ │ ├── DisambiguationAlgorithm.java │ │ │ ├── GreedyHillClimbing.java │ │ │ ├── Node.java │ │ │ └── ShortestPath.java │ │ ├── extraction │ │ │ ├── DegreeComparator.java │ │ │ ├── ExtractGraph.java │ │ │ └── ExtractGraphAllEdges.java │ │ └── similarity │ │ │ ├── EnsembleEntityEntitySimilarity.java │ │ │ ├── EnsembleMentionEntitySimilarity.java │ │ │ ├── EntityEntitySimilarity.java │ │ │ ├── MaterializedPriorProbability.java │ │ │ ├── MentionEntitySimilarity.java │ │ │ ├── PriorProbability.java │ │ │ ├── context │ │ │ ├── EmptyEntitiesContext.java │ │ │ ├── EntitiesContext.java │ │ │ ├── EntitiesContextSettings.java │ │ │ ├── FastWeightedKeyphrasesContext.java │ │ │ ├── KeyphraseReweightedKeywordContext.java │ │ │ ├── KeyphrasesContext.java │ │ │ ├── TextContext.java │ │ │ └── WeightedKeyphrasesContext.java │ │ │ ├── exception │ │ │ └── MissingSettingException.java │ │ │ ├── importance │ │ │ ├── EntityImportance.java │ │ │ └── InlinkCountImportance.java │ │ │ ├── measure │ │ │ ├── AlwaysOneSimilarityMeasure.java │ │ │ ├── EntityEntitySimilarityMeasure.java │ │ │ ├── InlinkOverlapEntityEntitySimilarity.java │ │ │ ├── JaccardEntityEntitySimilarityMeasure.java │ │ │ ├── JaccardSimilarityMeasure.java │ │ │ ├── KOREEntityEntitySimilarityMeasure.java │ │ │ ├── KeyphraseCosineSimilarityMeasure.java │ │ │ ├── MentionEntitySimilarityMeasure.java │ │ │ ├── MilneWittenEntityEntitySimilarity.java │ │ │ ├── NGDSimilarityMeasure.java │ │ │ ├── NormalizedKeyphrasesBasedIDFSimilarity.java │ │ │ ├── NormalizedKeyphrasesBasedMISimilarity.java │ │ │ ├── NullEntityEntitySimilarityMeasure.java │ │ │ ├── NullMentionEntittySimilarityMeasure.java │ │ │ ├── SimilarityMeasure.java │ │ │ ├── TfIdfCosineSimilarityMeasure.java │ │ │ ├── UnnormalizedKeyphrasesBasedIDFSimilarity.java │ │ │ ├── 
UnnormalizedKeyphrasesBasedMISimilarity.java │ │ │ ├── WeightComputation.java │ │ │ ├── WeightedJaccardEntityEntitySimilarityMeasure.java │ │ │ ├── WeightedNGDSimilarityMeasure.java │ │ │ └── WordCountVectorDotProductSimilarityMeasure.java │ │ │ └── util │ │ │ ├── EntitiesContextCreator.java │ │ │ ├── MaxMinSettings.java │ │ │ ├── ParallelEntityEntityRelatednessComputation.java │ │ │ ├── ParallelEntityEntityRelatednessComputationThread.java │ │ │ └── SimilaritySettings.java │ ├── preparation │ │ ├── AidaTokenizerManager.java │ │ └── mentionrecognition │ │ │ ├── FilterMentions.java │ │ │ ├── HybridFilter.java │ │ │ ├── ManualFilter.java │ │ │ └── NamedEntityFilter.java │ └── util │ │ ├── CollectionUtils.java │ │ ├── DocumentCounter.java │ │ ├── InputTextInvertedIndex.java │ │ ├── Measures.java │ │ ├── MinCover.java │ │ ├── MinCoverCalculator.java │ │ ├── NiceTime.java │ │ ├── Result.java │ │ ├── RunningTimer.java │ │ ├── SortByAvgPre.java │ │ ├── StopWord.java │ │ ├── WikipediaDumpArticleIdExtractor.java │ │ ├── WikipediaUtil.java │ │ ├── YagoUtil.java │ │ └── htmloutput │ │ ├── GenerateWebHtml.java │ │ └── ResultMention.java │ └── experiment │ ├── measure │ └── EvaluationMeasures.java │ ├── reader │ ├── AidaFormatCollectionReader.java │ ├── CoNLLReader.java │ ├── CollectionReader.java │ └── KORE50Reader.java │ └── trace │ ├── EntityEntityTracing.java │ ├── GraphTracer.java │ ├── NullEntityEntityTracing.java │ ├── NullGraphTracer.java │ ├── NullTracer.java │ ├── Tracer.java │ ├── data │ ├── EntityTracer.java │ └── MentionTracer.java │ └── measures │ ├── GenericEntityEntitySimilarityMeasureTracer.java │ ├── KeyphrasesMeasureTracer.java │ ├── KeytermEntityEntityMeasureTracer.java │ ├── KeywordContextEntityTracer.java │ ├── MeasureTracer.java │ ├── PriorMeasureTracer.java │ ├── TermTracer.java │ └── TracerPart.java └── test └── mpi └── aida ├── DisambiguatorTest.java ├── data └── ContextTest.java ├── graph ├── algorithms │ ├── CocktailPartySizeConstrainedTest.java │ 
└── CocktailPartyTest.java └── similarity │ ├── EnsembleMentionEntitySimilarityTest.java │ ├── PriorProbabilityTest.java │ ├── context │ └── EntitiesContextTest.java │ └── measure │ ├── KORETest.java │ ├── KeyphrasesBasedSimilarityTest.java │ ├── MilneWittenEntityEntitySimilarityTest.java │ └── WeightComputationTest.java └── util └── WikipediaUtilTest.java /build.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Compile, test, generate a jar or run the rmi service for AIDA. 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 44 | 45 | 46 | 47 | 49 | 50 | 51 | 52 | 53 | 54 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 66 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 86 | 87 | 88 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | -------------------------------------------------------------------------------- /lib/JavaEWAH-0.4.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codepie/aida/213397710e4cf9a3cf3cfdd94c7f908190b14b83/lib/JavaEWAH-0.4.2.jar -------------------------------------------------------------------------------- /lib/basics2_20100910.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codepie/aida/213397710e4cf9a3cf3cfdd94c7f908190b14b83/lib/basics2_20100910.jar -------------------------------------------------------------------------------- /lib/commons-cli-1.2.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/codepie/aida/213397710e4cf9a3cf3cfdd94c7f908190b14b83/lib/commons-cli-1.2.jar -------------------------------------------------------------------------------- /lib/commons-collections-3.2.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codepie/aida/213397710e4cf9a3cf3cfdd94c7f908190b14b83/lib/commons-collections-3.2.1.jar -------------------------------------------------------------------------------- /lib/commons-io-1.4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codepie/aida/213397710e4cf9a3cf3cfdd94c7f908190b14b83/lib/commons-io-1.4.jar -------------------------------------------------------------------------------- /lib/commons-lang-2.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codepie/aida/213397710e4cf9a3cf3cfdd94c7f908190b14b83/lib/commons-lang-2.3.jar -------------------------------------------------------------------------------- /lib/javatools_20120619.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codepie/aida/213397710e4cf9a3cf3cfdd94c7f908190b14b83/lib/javatools_20120619.jar -------------------------------------------------------------------------------- /lib/jgrapht.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codepie/aida/213397710e4cf9a3cf3cfdd94c7f908190b14b83/lib/jgrapht.jar -------------------------------------------------------------------------------- /lib/joda-time.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codepie/aida/213397710e4cf9a3cf3cfdd94c7f908190b14b83/lib/joda-time.jar -------------------------------------------------------------------------------- 
/lib/jollyday.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codepie/aida/213397710e4cf9a3cf3cfdd94c7f908190b14b83/lib/jollyday.jar -------------------------------------------------------------------------------- /lib/junit-4.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codepie/aida/213397710e4cf9a3cf3cfdd94c7f908190b14b83/lib/junit-4.11.jar -------------------------------------------------------------------------------- /lib/log4j-1.2.17.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codepie/aida/213397710e4cf9a3cf3cfdd94c7f908190b14b83/lib/log4j-1.2.17.jar -------------------------------------------------------------------------------- /lib/mpi-DBManager-20121219.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codepie/aida/213397710e4cf9a3cf3cfdd94c7f908190b14b83/lib/mpi-DBManager-20121219.jar -------------------------------------------------------------------------------- /lib/mpi-TokenizerService2-20130124.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codepie/aida/213397710e4cf9a3cf3cfdd94c7f908190b14b83/lib/mpi-TokenizerService2-20130124.jar -------------------------------------------------------------------------------- /lib/postgresql-9.2-1002.jdbc4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codepie/aida/213397710e4cf9a3cf3cfdd94c7f908190b14b83/lib/postgresql-9.2-1002.jdbc4.jar -------------------------------------------------------------------------------- /lib/slf4j-api-1.7.2.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/codepie/aida/213397710e4cf9a3cf3cfdd94c7f908190b14b83/lib/slf4j-api-1.7.2.jar -------------------------------------------------------------------------------- /lib/slf4j-log4j12-1.7.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codepie/aida/213397710e4cf9a3cf3cfdd94c7f908190b14b83/lib/slf4j-log4j12-1.7.2.jar -------------------------------------------------------------------------------- /lib/stanford-corenlp-1.3.4-models.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codepie/aida/213397710e4cf9a3cf3cfdd94c7f908190b14b83/lib/stanford-corenlp-1.3.4-models.jar -------------------------------------------------------------------------------- /lib/stanford-corenlp-1.3.4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codepie/aida/213397710e4cf9a3cf3cfdd94c7f908190b14b83/lib/stanford-corenlp-1.3.4.jar -------------------------------------------------------------------------------- /lib/trove-3.0.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codepie/aida/213397710e4cf9a3cf3cfdd94c7f908190b14b83/lib/trove-3.0.3.jar -------------------------------------------------------------------------------- /lib/xom.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codepie/aida/213397710e4cf9a3cf3cfdd94c7f908190b14b83/lib/xom.jar -------------------------------------------------------------------------------- /licenses/cern.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 1999 CERN - European Organization for Nuclear Research. 
Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose is hereby granted without fee, provided that the above copyright notice appear in all copies and that both that copyright notice and this permission notice appear in supporting documentation. CERN makes no representations about the suitability of this software for any purpose. It is provided "as is" without expressed or implied warranty. -------------------------------------------------------------------------------- /licenses/mit.txt: -------------------------------------------------------------------------------- 1 | Permission is hereby granted, free of charge, to any person obtaining 2 | a copy of this software and associated documentation files (the 3 | "Software"), to deal in the Software without restriction, including 4 | without limitation the rights to use, copy, modify, merge, publish, 5 | distribute, sublicense, and/or sell copies of the Software, and to 6 | permit persons to whom the Software is furnished to do so, subject to 7 | the following conditions: 8 | 9 | The above copyright notice and this permission notice shall be 10 | included in all copies or substantial portions of the Software. 11 | 12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 13 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 14 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 15 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 16 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 17 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 18 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set root logger level to DEBUG and its only appender to A1. 
2 | log4j.rootLogger=INFO, A1 3 | 4 | # A1 is set to be a ConsoleAppender. 5 | log4j.appender.A1=org.apache.log4j.ConsoleAppender 6 | 7 | # A1 uses PatternLayout. 8 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 9 | log4j.appender.A1.layout.ConversionPattern=%d{HH:mm:ss,SSS} [%t] %-5p %C{1} %x - %m%n -------------------------------------------------------------------------------- /sample_settings/aida.properties: -------------------------------------------------------------------------------- 1 | # Type of data backend. For now only 'sql' is used 2 | dataAccess = sql 3 | # Number of parallel threads to use in the computation of the entity-entity 4 | # similarity. Use as many as you have CPU cores. 5 | eeNumThreads = 4 -------------------------------------------------------------------------------- /sample_settings/database_aida.properties: -------------------------------------------------------------------------------- 1 | type = postgres 2 | hostname = 3 | port = 5432 4 | schema = 5 | username = 6 | password = 7 | maxConnection = 50 -------------------------------------------------------------------------------- /settings/tokens/symbols.txt: -------------------------------------------------------------------------------- 1 | " 2 | % 3 | ' 4 | '' 5 | ( 6 | ) 7 | + 8 | , 9 | - 10 | . 11 | : 12 | ; 13 | _ 14 | — 15 | – 16 | | 17 | \ 18 | / 19 | * 20 | { 21 | } 22 | ! 
23 | § 24 | $ 25 | & 26 | = 27 | * 28 | ~ 29 | # 30 | , 31 | > 32 | < 33 | [ 34 | ] -------------------------------------------------------------------------------- /src/mpi/aida/Preparator.java: -------------------------------------------------------------------------------- 1 | package mpi.aida; 2 | 3 | import java.util.Arrays; 4 | import java.util.List; 5 | 6 | import mpi.aida.config.settings.PreparationSettings; 7 | import mpi.aida.data.PreparedInput; 8 | 9 | public class Preparator { 10 | 11 | public PreparedInput prepare(String docId, String text, PreparationSettings settings) { 12 | PreparedInput preparedInput = AidaManager.prepareInputData(text, docId, settings.getMentionsFilter()); 13 | String[] types = settings.getFilteringTypes(); 14 | if (types != null) { 15 | List filteringTypes = Arrays.asList(settings.getFilteringTypes()); 16 | preparedInput.getMentions().setEntitiesTypes(filteringTypes); 17 | } 18 | return preparedInput; 19 | } 20 | 21 | } 22 | -------------------------------------------------------------------------------- /src/mpi/aida/access/DataAccessInterface.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.access; 2 | 3 | import gnu.trove.map.hash.TIntDoubleHashMap; 4 | import gnu.trove.map.hash.TIntIntHashMap; 5 | import gnu.trove.map.hash.TIntObjectHashMap; 6 | import gnu.trove.map.hash.TObjectIntHashMap; 7 | import gnu.trove.set.hash.TIntHashSet; 8 | 9 | import java.util.Collection; 10 | import java.util.List; 11 | import java.util.Map; 12 | import java.util.Set; 13 | 14 | import mpi.aida.data.Entities; 15 | import mpi.aida.data.Entity; 16 | import mpi.aida.data.Keyphrases; 17 | import mpi.aida.util.YagoUtil.Gender; 18 | 19 | public interface DataAccessInterface { 20 | 21 | public DataAccess.type getAccessType(); 22 | 23 | public Entities getEntitiesForMention(String mention); 24 | 25 | public int[] getInlinkNeighbors(Entity e); 26 | 27 | public TIntObjectHashMap 
getInlinkNeighbors(Entities entities); 28 | 29 | public Map getGenderForEntities(Entities entities); 30 | 31 | public Map> getTypes(Set entities); 32 | 33 | public List getTypes(String Entity); 34 | 35 | public TIntIntHashMap getKeyphraseDocumentFrequencies(TIntHashSet keyphrases); 36 | 37 | public List getParentTypes(String queryType); 38 | 39 | public String getKeyphraseSource(String entityName, String keyphrase); 40 | 41 | public Map> getEntityKeyphrases(Set entities); 42 | 43 | public Map> getKeyphraseEntities(Set keyphrases); 44 | 45 | public Map getEntityLSHSignatures(Entities entities); 46 | 47 | public Map getEntityLSHSignatures(Entities entities, String table); 48 | 49 | public String getFamilyName(String entity); 50 | 51 | public String getGivenName(String entity); 52 | 53 | public TIntDoubleHashMap getEntityPriors(String mention); 54 | 55 | public void getEntityKeyphraseTokens( 56 | Entities entities, String keyphraseSourceExclusion, 57 | TIntObjectHashMap entityKeyphrases, 58 | TIntObjectHashMap kpTokens); 59 | 60 | public TIntIntHashMap getKeywordDocumentFrequencies(TIntHashSet keywords); 61 | 62 | public TIntIntHashMap getEntitySuperdocSize(Entities entities); 63 | 64 | public TIntObjectHashMap getEntityKeywordIntersectionCount(Entities entities); 65 | 66 | public TIntObjectHashMap getYagoEntityIdsForIds(int[] ids); 67 | 68 | public TObjectIntHashMap getIdsForYagoEntityIds(Collection entityIds); 69 | 70 | public TIntObjectHashMap getWordsForIds(int[] keywordIds); 71 | 72 | public TObjectIntHashMap getIdsForWords(Collection keywords); 73 | 74 | public TObjectIntHashMap getAllEntityIds(); 75 | 76 | public Entities getAllEntities(); 77 | 78 | public int[] getAllWordExpansions(); 79 | 80 | public boolean checkEntityNameExists(String entity); 81 | 82 | public boolean isYagoEntity(Entity entity); 83 | 84 | public Keyphrases getEntityKeyphrases(Entities entities, 85 | String keyphraseSourceExclusion); 86 | } 87 | 
-------------------------------------------------------------------------------- /src/mpi/aida/config/AidaConfig.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.config; 2 | 3 | import java.io.File; 4 | import java.io.FileReader; 5 | import java.util.Properties; 6 | 7 | import org.slf4j.Logger; 8 | import org.slf4j.LoggerFactory; 9 | 10 | /** 11 | * Main configuration path for global settings. 12 | */ 13 | public class AidaConfig { 14 | private static final Logger logger = 15 | LoggerFactory.getLogger(AidaConfig.class); 16 | 17 | public static final String SERVICENAME = "serviceName"; 18 | 19 | public static final String SERVERPORT = "serverport"; 20 | 21 | public static final String CLIENTPORT = "clientport"; 22 | 23 | public static final String STORAGEPATH = "storagePath"; 24 | 25 | public static final String DATAACCESS = "dataAccess"; 26 | 27 | public static final String DATAACCESS_IP = "dataAccessIP"; 28 | 29 | public static final String DATAACCESS_PORT = "dataAccessPort"; 30 | 31 | public static final String DATAACCESS_SERVICENAME = "dataAccessServiceName"; 32 | 33 | public static final String DATAACCESS_DIRECT_PATH = "dataAccessDirectPath"; 34 | 35 | public static final String RMI_TOKENIZER_LANGUAGE = "tokenizerLanguage"; 36 | 37 | public static final String LOG_TO_DB = "logToDB"; 38 | 39 | public static final String LOG_TABLENAME = "logTableName"; 40 | 41 | public static final String LOG_STATS_TABLENAME = "logStatsTableName"; 42 | 43 | public static final String MAX_NUM_CANDIDATE_ENTITIES_FOR_GRAPH = "maxNumCandidateEntitiesForGraph"; 44 | 45 | public static final String EE_NUM_THREADS = "eeNumThreads"; 46 | 47 | private Properties properties; 48 | 49 | private String path = "./settings/aida.properties"; 50 | 51 | private static AidaConfig config = null; 52 | 53 | private AidaConfig() { 54 | properties = new Properties(); 55 | try { 56 | properties.load(new FileReader(new File(path))); 57 | } catch 
(Exception e) { 58 | properties = new Properties(); 59 | logger.error("Main settings file missing. " + 60 | "Copy 'sample_settings/aida.properties' to the 'settings/' " + 61 | "directory and adjust it."); 62 | } 63 | } 64 | 65 | private static AidaConfig getInstance() { 66 | if (config == null) { 67 | config = new AidaConfig(); 68 | } 69 | return config; 70 | } 71 | 72 | private String getValue(String key) { 73 | return (String) properties.get(key); 74 | } 75 | 76 | private void setValue(String key, String value) { 77 | properties.setProperty(key, value); 78 | } 79 | 80 | private boolean hasKey(String key) { 81 | return properties.containsKey(key); 82 | } 83 | 84 | public static String get(String key) { 85 | String value = null; 86 | if (AidaConfig.getInstance().hasKey(key)) { 87 | value = AidaConfig.getInstance().getValue(key); 88 | } else { 89 | // Some default values. 90 | if (key.equals(EE_NUM_THREADS)) { 91 | value = "8"; 92 | } else if (key.equals(MAX_NUM_CANDIDATE_ENTITIES_FOR_GRAPH)) { 93 | // 0 means no limit. 94 | value = "0"; 95 | } else { 96 | logger.error("" + 97 | "Missing key in properties file with no default value: " + key); 98 | } 99 | } 100 | return value; 101 | } 102 | 103 | public static void set(String key, String value) { 104 | AidaConfig.getInstance().setValue(key, value); 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /src/mpi/aida/config/settings/PreparationSettings.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.config.settings; 2 | 3 | import java.io.Serializable; 4 | 5 | import mpi.aida.preparation.mentionrecognition.FilterMentions.FilterType; 6 | 7 | /** 8 | * Settings for the preparator. Predefined settings are available in 9 | * {@see mpi.aida.config.settings.preparation}. 
10 | */ 11 | public class PreparationSettings implements Serializable { 12 | 13 | private static final long serialVersionUID = -2825720730925914648L; 14 | 15 | private FilterType mentionsFilter = FilterType.Hybrid; 16 | 17 | private String[] filteringTypes = null; 18 | 19 | private LANGUAGE language = LANGUAGE.ENGLISH; 20 | 21 | public static enum LANGUAGE { 22 | ENGLISH, GERMAN 23 | } 24 | 25 | public FilterType getMentionsFilter() { 26 | return mentionsFilter; 27 | } 28 | 29 | public void setMentionsFilter(FilterType mentionsFilter) { 30 | this.mentionsFilter = mentionsFilter; 31 | } 32 | 33 | public String[] getFilteringTypes() { 34 | return filteringTypes; 35 | } 36 | 37 | public void setFilteringTypes(String[] filteringTypes) { 38 | this.filteringTypes = filteringTypes; 39 | } 40 | 41 | public LANGUAGE getLanguage() { 42 | return language; 43 | } 44 | 45 | public void setLanguage(LANGUAGE language) { 46 | this.language = language; 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/mpi/aida/config/settings/Settings.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.config.settings; 2 | 3 | import java.io.Serializable; 4 | 5 | /** 6 | * Combined PreparationSettings and DisambiguationSettings, useful for 7 | * a calling API. 
8 | */ 9 | public class Settings implements Serializable { 10 | 11 | public enum TECHNIQUE { 12 | LOCAL, LOCAL_ITERATIVE, GRAPH, CHAKRABARTI 13 | } 14 | 15 | public enum ALGORITHM { 16 | COCKTAIL_PARTY, COCKTAIL_PARTY_SIZE_CONSTRAINED, RANDOM_WALK 17 | } 18 | 19 | private static final long serialVersionUID = -6602287193597852191L; 20 | 21 | private PreparationSettings preparationSettings = null; 22 | 23 | private DisambiguationSettings disambiguationSettings = null; 24 | 25 | public Settings(PreparationSettings preparationSettings, 26 | DisambiguationSettings disambiguationSettings) { 27 | this.preparationSettings = preparationSettings; 28 | this.disambiguationSettings = disambiguationSettings; 29 | } 30 | 31 | public PreparationSettings getPreparationSettings() { 32 | return preparationSettings; 33 | } 34 | 35 | public void setPreparationSettings(PreparationSettings preparationSettings) { 36 | this.preparationSettings = preparationSettings; 37 | } 38 | 39 | public DisambiguationSettings getDisambiguationSettings() { 40 | return disambiguationSettings; 41 | } 42 | 43 | public void setDisambiguationSettings(DisambiguationSettings disambiguationSettings) { 44 | this.disambiguationSettings = disambiguationSettings; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/mpi/aida/config/settings/disambiguation/CocktailPartyDisambiguationSettings.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.config.settings.disambiguation; 2 | 3 | import java.util.HashMap; 4 | import java.util.LinkedList; 5 | import java.util.List; 6 | import java.util.Map; 7 | 8 | import mpi.aida.config.settings.DisambiguationSettings; 9 | import mpi.aida.config.settings.Settings.ALGORITHM; 10 | import mpi.aida.config.settings.Settings.TECHNIQUE; 11 | import mpi.aida.graph.similarity.exception.MissingSettingException; 12 | import mpi.aida.graph.similarity.util.SimilaritySettings; 13 | import 
mpi.experiment.trace.GraphTracer.TracingTarget; 14 | 15 | /** 16 | * Preconfigured settings for the {@see Disambiguator} using the mention-entity 17 | * prior, the keyphrase based similarity, and the MilneWitten Wikipedia link 18 | * based entity coherence. 19 | * 20 | * This gives the best quality and should be used in comparing results against 21 | * AIDA. 22 | */ 23 | public class CocktailPartyDisambiguationSettings extends DisambiguationSettings { 24 | 25 | private static final long serialVersionUID = 5867674989478781057L; 26 | 27 | public CocktailPartyDisambiguationSettings() throws MissingSettingException { 28 | setAlpha(0.6); 29 | setTracingTarget(TracingTarget.WEB_INTERFACE); 30 | 31 | setDisambiguationTechnique(TECHNIQUE.GRAPH); 32 | setDisambiguationAlgorithm(ALGORITHM.COCKTAIL_PARTY_SIZE_CONSTRAINED); 33 | setUseExhaustiveSearch(true); 34 | setUseNormalizedObjective(true); 35 | setEntitiesPerMentionConstraint(5); 36 | setUseCoherenceRobustnessTest(true); 37 | setCohRobustnessThreshold(0.9); 38 | 39 | Map minMaxs = new HashMap(); 40 | minMaxs.put("prior", new double[] { 0.0, 1.0} ); 41 | minMaxs.put("UnnormalizedKeyphrasesBasedMISimilarity:KeyphrasesContext", new double[] { 0.0, 840.1373501651881}); 42 | minMaxs.put("UnnormalizedKeyphrasesBasedIDFSimilarity:KeyphrasesContext", new double[] { 0.0, 63207.231647131}); 43 | 44 | List simConfigs = new LinkedList(); 45 | simConfigs.add(new String[] { "UnnormalizedKeyphrasesBasedMISimilarity", "KeyphrasesContext", "1.4616111666431395E-5" }); 46 | simConfigs.add(new String[] { "UnnormalizedKeyphrasesBasedIDFSimilarity", "KeyphrasesContext", "4.291375037765039E-5" }); 47 | simConfigs.add(new String[] { "UnnormalizedKeyphrasesBasedMISimilarity", "KeyphrasesContext", "0.15586170799823845" }); 48 | simConfigs.add(new String[] { "UnnormalizedKeyphrasesBasedIDFSimilarity", "KeyphrasesContext", "0.645200419577534" }); 49 | List cohConfigs = new LinkedList(); 50 | cohConfigs.add(new String[] { 
"MilneWittenEntityEntitySimilarity", "1.0" }); 51 | 52 | SimilaritySettings switchedKPsettings = new SimilaritySettings(simConfigs, cohConfigs, 0.19888034256218348, minMaxs); 53 | switchedKPsettings.setIdentifier("SwitchedKP"); 54 | switchedKPsettings.setPriorThreshold(0.9); 55 | setSimilaritySettings(switchedKPsettings); 56 | 57 | simConfigs = new LinkedList(); 58 | simConfigs.add(new String[] { "UnnormalizedKeyphrasesBasedMISimilarity", "KeyphrasesContext", "0.971742997195044" }); 59 | simConfigs.add(new String[] { "UnnormalizedKeyphrasesBasedIDFSimilarity", "KeyphrasesContext", "0.028257002804955994" }); 60 | SimilaritySettings unnormalizedKPsettings = new SimilaritySettings(simConfigs, null, 0.0, minMaxs); 61 | switchedKPsettings.setIdentifier("CoherenceRobustnessTest"); 62 | setCoherenceSimilaritySetting(unnormalizedKPsettings); 63 | } 64 | } -------------------------------------------------------------------------------- /src/mpi/aida/config/settings/disambiguation/CocktailPartyKOREDisambiguationSettings.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.config.settings.disambiguation; 2 | 3 | import java.util.HashMap; 4 | import java.util.LinkedList; 5 | import java.util.List; 6 | import java.util.Map; 7 | 8 | import mpi.aida.access.DataAccess; 9 | import mpi.aida.config.settings.DisambiguationSettings; 10 | import mpi.aida.config.settings.Settings.ALGORITHM; 11 | import mpi.aida.config.settings.Settings.TECHNIQUE; 12 | import mpi.aida.graph.similarity.exception.MissingSettingException; 13 | import mpi.aida.graph.similarity.util.SimilaritySettings; 14 | import mpi.experiment.trace.GraphTracer.TracingTarget; 15 | 16 | /** 17 | * Preconfigured settings for the {@see Disambiguator} using the mention-entity 18 | * prior, the keyphrase based similarity, and the KORE keyphrase based 19 | * entity coherence. 
20 | */ 21 | public class CocktailPartyKOREDisambiguationSettings extends DisambiguationSettings { 22 | 23 | private static final long serialVersionUID = 5867674989478781057L; 24 | 25 | public CocktailPartyKOREDisambiguationSettings() throws MissingSettingException { 26 | setAlpha(0.6); 27 | setTracingTarget(TracingTarget.WEB_INTERFACE); 28 | 29 | setDisambiguationTechnique(TECHNIQUE.GRAPH); 30 | setDisambiguationAlgorithm(ALGORITHM.COCKTAIL_PARTY_SIZE_CONSTRAINED); 31 | setUseExhaustiveSearch(true); 32 | setUseNormalizedObjective(true); 33 | setEntitiesPerMentionConstraint(5); 34 | setUseCoherenceRobustnessTest(true); 35 | setCohRobustnessThreshold(0.9); 36 | 37 | Map minMaxs = new HashMap(); 38 | minMaxs.put("prior", new double[] { 0.0, 1.0} ); 39 | minMaxs.put("UnnormalizedKeyphrasesBasedMISimilarity:KeyphrasesContext", new double[] { 0.0, 840.1373501651881}); 40 | minMaxs.put("UnnormalizedKeyphrasesBasedIDFSimilarity:KeyphrasesContext", new double[] { 0.0, 63207.231647131}); 41 | 42 | List simConfigs = new LinkedList(); 43 | simConfigs.add(new String[] { "UnnormalizedKeyphrasesBasedMISimilarity", "KeyphrasesContext", "1.4616111666431395E-5" }); 44 | simConfigs.add(new String[] { "UnnormalizedKeyphrasesBasedIDFSimilarity", "KeyphrasesContext", "4.291375037765039E-5" }); 45 | simConfigs.add(new String[] { "UnnormalizedKeyphrasesBasedMISimilarity", "KeyphrasesContext", "0.15586170799823845" }); 46 | simConfigs.add(new String[] { "UnnormalizedKeyphrasesBasedIDFSimilarity", "KeyphrasesContext", "0.645200419577534" }); 47 | List cohConfigs = new LinkedList(); 48 | cohConfigs.add(new String[] { "KOREEntityEntitySimilarity", "1.0" }); 49 | 50 | SimilaritySettings switchedKPsettings = new SimilaritySettings(simConfigs, cohConfigs, 0.19888034256218348, minMaxs); 51 | switchedKPsettings.setIdentifier("SwitchedKP"); 52 | switchedKPsettings.setPriorThreshold(0.9); 53 | switchedKPsettings.setEntityCohKeyphraseAlpha(1.0); 54 | switchedKPsettings.setEntityCohKeywordAlpha(0.0); 
55 | switchedKPsettings.setShouldNormalizeCoherenceWeights(true); 56 | switchedKPsettings.setKeyphraseSourceExclusion(DataAccess.KPSOURCE_INLINKTITLE); 57 | setSimilaritySettings(switchedKPsettings); 58 | 59 | simConfigs = new LinkedList(); 60 | simConfigs.add(new String[] { "UnnormalizedKeyphrasesBasedMISimilarity", "KeyphrasesContext", "0.971742997195044" }); 61 | simConfigs.add(new String[] { "UnnormalizedKeyphrasesBasedIDFSimilarity", "KeyphrasesContext", "0.028257002804955994" }); 62 | SimilaritySettings unnormalizedKPsettings = new SimilaritySettings(simConfigs, null, 0.0, minMaxs); 63 | switchedKPsettings.setIdentifier("CoherenceRobustnessTest"); 64 | setCoherenceSimilaritySetting(unnormalizedKPsettings); 65 | } 66 | } -------------------------------------------------------------------------------- /src/mpi/aida/config/settings/disambiguation/LocalDisambiguationSettings.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.config.settings.disambiguation; 2 | 3 | import java.util.HashMap; 4 | import java.util.LinkedList; 5 | import java.util.List; 6 | import java.util.Map; 7 | 8 | import mpi.aida.config.settings.DisambiguationSettings; 9 | import mpi.aida.config.settings.Settings.TECHNIQUE; 10 | import mpi.aida.graph.similarity.exception.MissingSettingException; 11 | import mpi.aida.graph.similarity.util.SimilaritySettings; 12 | 13 | /** 14 | * Preconfigured settings for the {@see Disambiguator} using only the 15 | * mention-entity prior and the keyphrase based similarity. 
16 | */ 17 | public class LocalDisambiguationSettings extends DisambiguationSettings { 18 | 19 | private static final long serialVersionUID = -1943862223862927646L; 20 | 21 | public LocalDisambiguationSettings() throws MissingSettingException { 22 | setDisambiguationTechnique(TECHNIQUE.LOCAL); 23 | 24 | List simConfigs = new LinkedList(); 25 | simConfigs.add(new String[] { "UnnormalizedKeyphrasesBasedMISimilarity", "KeyphrasesContext", "1.4616111666431395E-5" }); 26 | simConfigs.add(new String[] { "UnnormalizedKeyphrasesBasedIDFSimilarity", "KeyphrasesContext", "4.291375037765039E-5" }); 27 | simConfigs.add(new String[] { "UnnormalizedKeyphrasesBasedMISimilarity", "KeyphrasesContext", "0.15586170799823845" }); 28 | simConfigs.add(new String[] { "UnnormalizedKeyphrasesBasedIDFSimilarity", "KeyphrasesContext", "0.645200419577534" }); 29 | 30 | Map minMaxs = new HashMap(); 31 | minMaxs.put("prior", new double[] { 0.0, 1.0} ); 32 | minMaxs.put("UnnormalizedKeyphrasesBasedMISimilarity:KeyphrasesContext", new double[] { 0.0, 840.1373501651881}); 33 | minMaxs.put("UnnormalizedKeyphrasesBasedIDFSimilarity:KeyphrasesContext", new double[] { 0.0, 63207.231647131}); 34 | 35 | SimilaritySettings switchedKPsettings = new SimilaritySettings(simConfigs, null, 0.19888034256218348, minMaxs); 36 | switchedKPsettings.setIdentifier("SwitchedKP"); 37 | switchedKPsettings.setPriorThreshold(0.9); 38 | setSimilaritySettings(switchedKPsettings); 39 | 40 | setIncludeNullAsEntityCandidate(false); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/mpi/aida/config/settings/disambiguation/PriorOnlyDisambiguationSettings.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.config.settings.disambiguation; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | 6 | import mpi.aida.config.settings.DisambiguationSettings; 7 | import mpi.aida.config.settings.Settings.TECHNIQUE; 8 | 
import mpi.aida.graph.similarity.exception.MissingSettingException; 9 | import mpi.aida.graph.similarity.util.SimilaritySettings; 10 | 11 | /** 12 | * Preconfigured settings for the {@see Disambiguator} using only the 13 | * mention-entity prior. 14 | */ 15 | public class PriorOnlyDisambiguationSettings extends DisambiguationSettings { 16 | 17 | private static final long serialVersionUID = 2212272023159361340L; 18 | 19 | public PriorOnlyDisambiguationSettings() throws MissingSettingException { 20 | setDisambiguationTechnique(TECHNIQUE.LOCAL); 21 | 22 | Map minMaxs = new HashMap(); 23 | minMaxs.put("prior", new double[] { 0.0, 1.0} ); 24 | 25 | SimilaritySettings priorSettings = new SimilaritySettings(null, null, 1.0, minMaxs); 26 | priorSettings.setIdentifier("Prior"); 27 | setSimilaritySettings(priorSettings); 28 | 29 | setIncludeNullAsEntityCandidate(false); 30 | } 31 | } 32 | 33 | 34 | -------------------------------------------------------------------------------- /src/mpi/aida/config/settings/preparation/StanfordHybridPreparationSettings.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.config.settings.preparation; 2 | 3 | import mpi.aida.config.settings.PreparationSettings; 4 | import mpi.aida.preparation.mentionrecognition.FilterMentions.FilterType; 5 | 6 | /** 7 | * Preparator setting that tokenizes the input text using the 8 | * Stanford CoreNLP tokenizer. Mentions are recognized using the 'ner' 9 | * stage of the CoreNLP pipeline. In additon, they can be marked up 10 | * explicitly by square brackets, e.g.: 11 | * [[Einstein]] was born in [[Ulm]]. 
12 | */ 13 | public class StanfordHybridPreparationSettings extends PreparationSettings { 14 | 15 | private static final long serialVersionUID = 3743560957961384100L; 16 | 17 | public StanfordHybridPreparationSettings() { 18 | this.setMentionsFilter(FilterType.Hybrid); 19 | this.setFilteringTypes(null); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /src/mpi/aida/config/settings/preparation/StanfordManualPreparationSettings.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.config.settings.preparation; 2 | 3 | import mpi.aida.config.settings.PreparationSettings; 4 | import mpi.aida.preparation.mentionrecognition.FilterMentions.FilterType; 5 | 6 | /** 7 | * Preparator setting that tokenizes the input text using the 8 | * Stanford CoreNLP tokenizer. Mentions need to be marked up with square 9 | * bracktets. E.g.: 10 | * [[Einstein]] was born in [[Ulm]]. 11 | */ 12 | public class StanfordManualPreparationSettings extends PreparationSettings { 13 | 14 | private static final long serialVersionUID = 3743560957961384100L; 15 | 16 | public StanfordManualPreparationSettings() { 17 | this.setMentionsFilter(FilterType.Manual); 18 | this.setFilteringTypes(null); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/mpi/aida/data/Context.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.data; 2 | 3 | import gnu.trove.map.hash.TObjectIntHashMap; 4 | 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | 8 | import mpi.aida.access.DataAccess; 9 | import mpi.tokenizer.data.Token; 10 | import mpi.tokenizer.data.Tokens; 11 | 12 | /** 13 | * Holds the input document as context representation. 
14 | * 15 | * 16 | */ 17 | public class Context { 18 | 19 | private List tokenStrings; 20 | private int[] tokenIds; 21 | 22 | public Context(Tokens tokens) { 23 | List ts = new ArrayList(tokens.size()); 24 | for (Token token : tokens) { 25 | ts.add(token.getOriginal()); 26 | } 27 | init(ts); 28 | } 29 | 30 | public Context(List tokens) { 31 | init(tokens); 32 | } 33 | 34 | public void init(List tokens) { 35 | tokenStrings = new ArrayList(tokens); 36 | TObjectIntHashMap token2ids = 37 | DataAccess.getIdsForWords(tokenStrings); 38 | tokenIds = new int[tokens.size()]; 39 | for (int i = 0; i < tokens.size(); ++i) { 40 | tokenIds[i] = token2ids.get(tokenStrings.get(i)); 41 | } 42 | } 43 | 44 | public List getTokens() { 45 | return tokenStrings; 46 | } 47 | 48 | public int[] getTokenIds() { 49 | return tokenIds; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/mpi/aida/data/DisambiguationResults.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.data; 2 | 3 | import java.io.Serializable; 4 | import java.util.ArrayList; 5 | import java.util.Collections; 6 | import java.util.HashMap; 7 | import java.util.List; 8 | import java.util.Map; 9 | 10 | import mpi.experiment.trace.Tracer; 11 | 12 | public class DisambiguationResults implements Serializable { 13 | 14 | private static final long serialVersionUID = 8366493180300359941L; 15 | 16 | private Map> mentionMappings; 17 | 18 | private String gTracerHtml; 19 | 20 | private Tracer tracer = null; 21 | 22 | public DisambiguationResults(Map> mentionMappings, String gTracerHtml) { 23 | super(); 24 | this.mentionMappings = mentionMappings; 25 | this.gTracerHtml = gTracerHtml; 26 | } 27 | 28 | public List getResultMentions() { 29 | List mentions = new ArrayList(mentionMappings.keySet()); 30 | Collections.sort(mentions); 31 | return mentions; 32 | } 33 | 34 | public List getResultEntities(ResultMention rm) { 35 | return 
mentionMappings.get(rm); 36 | } 37 | 38 | public void setResultEntities(ResultMention rm, List res) { 39 | mentionMappings.put(rm, res); 40 | } 41 | 42 | public ResultEntity getBestEntity(ResultMention rm) { 43 | List res = getResultEntities(rm); 44 | 45 | if (res.size() == 0) { 46 | return null; 47 | } else { 48 | return res.get(0); 49 | } 50 | } 51 | 52 | /** 53 | * THIS METHOD IS DEPRECATED! 54 | * Please use getResultMentions() and getResultEntities()/getBestEntity() 55 | * 56 | * Return a map from all mentions found in the input document 57 | * to the best entity it could be disambiguated to. 58 | * 59 | * Mentions are in the format: mention name:::character-offset:::character-length:::score 60 | * Entities are a String identifying the YAGO2 entity (see http://www.yago-knowledge.org) 61 | * 62 | * @return Map of mentions to the best entity 63 | */ 64 | @Deprecated 65 | public Map getMentionMappings() { 66 | Map mappings = new HashMap(); 67 | 68 | for (ResultMention rm : getResultMentions()) { 69 | String entityId = null; 70 | ResultEntity re = getBestEntity(rm); 71 | if (re != null) { 72 | entityId = re.getEntity(); 73 | } 74 | 75 | mappings.put(rm.getMention() + ":::" + rm.getCharacterOffset() + ":::" + rm.getCharacterLength() + ":::" + re.getDisambiguationScore(), entityId); 76 | } 77 | 78 | return mappings; 79 | } 80 | 81 | public String getgTracerHtml() { 82 | return gTracerHtml; 83 | } 84 | 85 | public void setgTracerHtml(String gTracerHtml) { 86 | this.gTracerHtml = gTracerHtml; 87 | } 88 | 89 | public Tracer getTracer() { 90 | return tracer; 91 | } 92 | 93 | public void setTracer(Tracer tracer) { 94 | this.tracer = tracer; 95 | } 96 | 97 | public String toString() { 98 | return mentionMappings.toString(); 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /src/mpi/aida/data/Entities.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.data; 2 | 3 | 
import java.io.Serializable; 4 | import java.util.Collection; 5 | import java.util.HashMap; 6 | import java.util.HashSet; 7 | import java.util.Iterator; 8 | import java.util.Set; 9 | 10 | public class Entities implements Serializable, Iterable { 11 | 12 | private static final long serialVersionUID = -5405018666688695438L; 13 | 14 | private boolean includesNmeEntities; 15 | 16 | private HashMap entitiesNames; 17 | 18 | private Set entities = null; 19 | 20 | public Entities() { 21 | this.entitiesNames = new HashMap(); 22 | entities = new HashSet(); 23 | } 24 | 25 | public Entities(Set entities) { 26 | this.entities = entities; 27 | this.entitiesNames = new HashMap(); 28 | for (Entity entity : entities) { 29 | this.entitiesNames.put(entity.getName(), entity.getId()); 30 | } 31 | } 32 | 33 | public int getId(String entity) { 34 | return entitiesNames.get(entity); 35 | } 36 | 37 | public boolean contains(String entity) { 38 | return entitiesNames.containsKey(entity); 39 | } 40 | 41 | public Set getUniqueNames() { 42 | return entitiesNames.keySet(); 43 | } 44 | 45 | public Set getUniqueNamesNormalizingNME() { 46 | Set names = new HashSet(); 47 | 48 | for (Entity e : entities) { 49 | if (e.isNMEentity()) { 50 | names.add(e.getNMEnormalizedName()); 51 | } else { 52 | names.add(e.getName()); 53 | } 54 | } 55 | 56 | return names; 57 | } 58 | 59 | public Collection getUniqueIds() { 60 | return entitiesNames.values(); 61 | } 62 | 63 | public Set getEntities() { 64 | return entities; 65 | } 66 | 67 | /** 68 | * Should only be used for testing or if you know the exact id for each entity 69 | * @param entity 70 | * @param id 71 | */ 72 | public void add(Entity entity) { 73 | entities.add(entity); 74 | entitiesNames.put(entity.getName(), entity.getId()); 75 | } 76 | 77 | public void addAll(Entities entities) { 78 | this.entities.addAll(entities.entities); 79 | this.entitiesNames.putAll(entities.entitiesNames); 80 | } 81 | 82 | public int uniqueNameSize() { 83 | return 
entitiesNames.size(); 84 | } 85 | 86 | public int size() { 87 | return entities.size(); 88 | } 89 | 90 | @Override 91 | public Iterator iterator() { 92 | return entities.iterator(); 93 | } 94 | 95 | public boolean isEmpty() { 96 | return entities.isEmpty(); 97 | } 98 | 99 | public boolean isIncludesNmeEntities() { 100 | return includesNmeEntities; 101 | } 102 | 103 | public void setIncludesNmeEntities(boolean includesNmeEntities) { 104 | this.includesNmeEntities = includesNmeEntities; 105 | } 106 | 107 | public static String getMentionNMEKey(String mentionName) { 108 | return mentionName+"-"+Entity.NO_MATCHING_ENTITY; 109 | } 110 | 111 | public static boolean isNMEName(String name) { 112 | return name.endsWith("-"+Entity.NO_MATCHING_ENTITY); 113 | } 114 | 115 | public static String getNameForNME(String nmeName) { 116 | String name = nmeName.replace("-" + Entity.NO_MATCHING_ENTITY, ""); 117 | return name; 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /src/mpi/aida/data/Entity.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.data; 2 | 3 | import java.io.Serializable; 4 | import java.util.List; 5 | 6 | import javatools.parsers.Char; 7 | 8 | public class Entity implements Serializable, Comparable, Cloneable { 9 | 10 | private static final long serialVersionUID = 131444964369556633L; 11 | 12 | private String name; 13 | 14 | private List surroundingMentionNames; 15 | 16 | private int id = -1; 17 | 18 | 19 | public static final String NO_MATCHING_ENTITY = "--NME--"; 20 | 21 | /** 22 | * Use this field to represent the mention-entity similarity computed with 23 | * some method (not the score stored in the DB). This field will not be set 24 | * in the constructor. 
We set it later on, when we compute the similarity 25 | */ 26 | private double mentionEntitySimilarity; 27 | 28 | public Entity(String name, int id) { 29 | this.name = name; 30 | this.mentionEntitySimilarity = -1.0; 31 | this.id = id; 32 | } 33 | 34 | public String getName() { 35 | return name; 36 | } 37 | 38 | public String toString() { 39 | return name + " (" + id + ")"; 40 | } 41 | 42 | public String tohtmlString() { 43 | return "" + Char.toHTML(name) + ""; 44 | } 45 | 46 | public int getId() { 47 | return id; 48 | } 49 | 50 | public double getMentionEntitySimilarity() { 51 | return this.mentionEntitySimilarity; 52 | } 53 | 54 | public void setMentionEntitySimilarity(double mes) { 55 | this.mentionEntitySimilarity = mes; 56 | } 57 | 58 | public int compareTo(Entity e) { 59 | return name.compareTo(e.getName()); 60 | } 61 | 62 | public boolean equals(Object o) { 63 | if (o instanceof Entity) { 64 | Entity e = (Entity) o; 65 | return name.equals(e.getName()); 66 | } else { 67 | return false; 68 | } 69 | } 70 | 71 | public int hashCode() { 72 | return name.hashCode(); 73 | } 74 | 75 | public boolean isNMEentity() { 76 | return Entities.isNMEName(name); 77 | } 78 | 79 | public String getNMEnormalizedName() { 80 | String normName = name.replace("-"+NO_MATCHING_ENTITY, "").replace(' ', '_'); 81 | return normName; 82 | } 83 | 84 | public List getSurroundingMentionNames() { 85 | return surroundingMentionNames; 86 | } 87 | 88 | public void setSurroundingMentionNames(List surroundingMentionNames) { 89 | this.surroundingMentionNames = surroundingMentionNames; 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/mpi/aida/data/Keyphrases.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.data; 2 | 3 | import gnu.trove.map.hash.TIntDoubleHashMap; 4 | import gnu.trove.map.hash.TIntObjectHashMap; 5 | 6 | /** 7 | * Holds all the keyphrase data describing a set of entities. 
8 | * 9 | * 10 | */ 11 | public class Keyphrases { 12 | 13 | private TIntObjectHashMap entityKeyphrases; 14 | private TIntObjectHashMap keyphraseTokens; 15 | private TIntObjectHashMap entity2keyphrase2mi; 16 | private TIntObjectHashMap entity2keyword2mi; 17 | 18 | public void setEntityKeyphrases(TIntObjectHashMap entityKeyphrases) { 19 | this.entityKeyphrases = entityKeyphrases; 20 | } 21 | 22 | public void setKeyphraseTokens(TIntObjectHashMap keyphraseTokens) { 23 | this.keyphraseTokens = keyphraseTokens; 24 | } 25 | 26 | public void setEntityKeyphraseWeights( 27 | TIntObjectHashMap entity2keyphrase2mi) { 28 | this.entity2keyphrase2mi = entity2keyphrase2mi; 29 | } 30 | 31 | public void setEntityKeywordWeights( 32 | TIntObjectHashMap entity2keyword2mi) { 33 | this.entity2keyword2mi = entity2keyword2mi; 34 | } 35 | 36 | public TIntObjectHashMap getEntityKeyphrases() { 37 | return entityKeyphrases; 38 | } 39 | 40 | public TIntObjectHashMap getKeyphraseTokens() { 41 | return keyphraseTokens; 42 | } 43 | 44 | public TIntObjectHashMap getEntityKeywordWeights() { 45 | return entity2keyword2mi; 46 | } 47 | 48 | public TIntObjectHashMap getEntityKeyphraseWeights() { 49 | return entity2keyphrase2mi; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/mpi/aida/data/Mention.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.data; 2 | 3 | import java.io.Serializable; 4 | 5 | public class Mention implements Serializable, Comparable { 6 | 7 | private static final long serialVersionUID = 3177945435296705498L; 8 | 9 | private String mention; 10 | 11 | /** Starting token offset of the mention. */ 12 | private int startToken; 13 | 14 | /** Ending token offset of the mention (including this token). 
*/ 15 | private int endToken; 16 | 17 | private int startStanford; 18 | 19 | private int endStanford; 20 | 21 | private int sentenceId; 22 | 23 | private String groundTruthEntity = null; 24 | 25 | private double disambiguationConfidence; 26 | 27 | // Character offset 28 | private int charOffset, charLength; 29 | 30 | private Entities candidateEntities; 31 | 32 | private int id = -1; 33 | 34 | public Mention() { 35 | } 36 | 37 | public Mention(String mention, int startToken, int endToken, int startStanford, int endStanford, int sentenceId) { 38 | this.startToken = startToken; 39 | this.endToken = endToken; 40 | this.startStanford = startStanford; 41 | this.endStanford = endStanford; 42 | this.mention = mention; 43 | this.sentenceId = sentenceId; 44 | } 45 | 46 | public String getMention() { 47 | return mention; 48 | } 49 | 50 | public int getStartToken() { 51 | return startToken; 52 | } 53 | 54 | public int getEndToken() { 55 | return endToken; 56 | } 57 | 58 | public int getStartStanford() { 59 | return startStanford; 60 | } 61 | 62 | public int getEndStanford() { 63 | return endStanford; 64 | } 65 | 66 | public int getSentenceId() { 67 | return sentenceId; 68 | } 69 | 70 | public void setSentenceId(int sentenceId) { 71 | this.sentenceId = sentenceId; 72 | } 73 | 74 | public void addCandidateEntity(Entity entity) { 75 | candidateEntities.add(entity); 76 | } 77 | 78 | public Entities getCandidateEntities() { 79 | return candidateEntities; 80 | } 81 | 82 | public void setCandidateEntities(Entities candidateEntities) { 83 | this.candidateEntities = candidateEntities; 84 | } 85 | 86 | public String toString() { 87 | return mention + ", From:" + startToken + "/" + startStanford + ", To:" + endToken + "/" + endStanford + ", Offset: " + charOffset + ", Length: " + charLength; 88 | } 89 | 90 | public void setStartToken(int start) { 91 | this.startToken = start; 92 | } 93 | 94 | public void setEndToken(int end) { 95 | this.endToken = end; 96 | } 97 | 98 | public int 
getCharOffset() { 99 | return this.charOffset; 100 | } 101 | 102 | public int getCharLength() { 103 | return this.charLength; 104 | } 105 | 106 | public void setCharOffset(int offset) { 107 | this.charOffset = offset; 108 | 109 | } 110 | 111 | public void setCharLength(int length) { 112 | this.charLength = length; 113 | } 114 | 115 | public void setMention(String mention) { 116 | this.mention = mention; 117 | } 118 | 119 | @Override 120 | public boolean equals(Object obj) { 121 | if (obj instanceof Mention) { 122 | Mention m = (Mention) obj; 123 | 124 | return m.getMention().equals(getMention()) && m.getCharOffset() == charOffset; 125 | } else { 126 | return false; 127 | } 128 | } 129 | 130 | @Override 131 | public int hashCode() { 132 | return mention.hashCode() + charOffset; 133 | } 134 | 135 | @Override 136 | public int compareTo(Mention mention) { 137 | return this.charOffset - mention.charOffset; 138 | } 139 | 140 | public void setGroundTruthResult(String result) { 141 | this.groundTruthEntity = result; 142 | } 143 | 144 | public String getGroundTruthResult() { 145 | return groundTruthEntity; 146 | } 147 | 148 | public void setDisambiguationConfidence(double confidence) { 149 | disambiguationConfidence = confidence; 150 | } 151 | 152 | public double getDisambiguationConfidence() { 153 | return disambiguationConfidence; 154 | } 155 | 156 | public int getId() { 157 | return id; 158 | } 159 | 160 | public void setId(int id) { 161 | this.id = id; 162 | } 163 | 164 | public void setStartStanford(int startStanford) { 165 | this.startStanford = startStanford; 166 | } 167 | 168 | public void setEndStanford(int endStanford) { 169 | this.endStanford = endStanford; 170 | } 171 | 172 | public String getIdentifiedRepresentation() { 173 | return mention + ":::" + charOffset; 174 | } 175 | } 176 | -------------------------------------------------------------------------------- /src/mpi/aida/data/Mentions.java: 
-------------------------------------------------------------------------------- 1 | package mpi.aida.data; 2 | 3 | import java.io.Serializable; 4 | import java.util.ArrayList; 5 | import java.util.Collections; 6 | import java.util.HashMap; 7 | import java.util.LinkedList; 8 | import java.util.List; 9 | 10 | public class Mentions implements Serializable { 11 | 12 | private static final long serialVersionUID = -383105468450056989L; 13 | 14 | private List mentions = null; 15 | 16 | private HashMap subStrings = null; 17 | 18 | /** 19 | * The expected types for entities to which those mentions will be disambiguated 20 | */ 21 | private List entitiesTypes = null; 22 | 23 | public Mentions() { 24 | mentions = new LinkedList(); 25 | } 26 | 27 | public boolean containsOffset(int offset) { 28 | for (Mention mention : mentions) { 29 | if (mention.getCharOffset() == offset) { 30 | return true; 31 | } 32 | } 33 | return false; 34 | } 35 | 36 | public Mention getMentionForOffset(int offset) { 37 | for (Mention mention : mentions) { 38 | if (mention.getCharOffset() == offset) { 39 | return mention; 40 | } 41 | } 42 | return null; 43 | } 44 | 45 | public void addMention(Mention mention) { 46 | mentions.add(mention); 47 | } 48 | 49 | public List getMentions() { 50 | return mentions; 51 | } 52 | 53 | public ArrayList getMentionTokenStanfordIndices() 54 | { 55 | ArrayList mentionTokenIndices = new ArrayList(); 56 | // there's just one 57 | for (Mention mention : mentions) 58 | { 59 | for (int i=mention.getStartStanford();i<=mention.getEndStanford();i++) 60 | mentionTokenIndices.add(i); 61 | } 62 | return mentionTokenIndices; 63 | } 64 | 65 | public int getMentionTokenSentenceIndex() 66 | { 67 | // there's just one 68 | return mentions.get(0).getSentenceId(); 69 | } 70 | 71 | public boolean remove(Mention mention) { 72 | return mentions.remove(mention); 73 | } 74 | 75 | public String toString() { 76 | StringBuffer sb = new StringBuffer(200); 77 | for (int i = 0; i < mentions.size(); 
i++) { 78 | sb.append(mentions.get(i).toString()).append('\n'); 79 | } 80 | return sb.toString(); 81 | } 82 | 83 | public void setSubstring(HashMap subStrings) { 84 | this.subStrings = subStrings; 85 | } 86 | 87 | public HashMap getSubstrings() { 88 | return subStrings; 89 | } 90 | 91 | public void sortMentions() { 92 | Collections.sort(mentions); 93 | } 94 | 95 | public List getEntitiesTypes() { 96 | return entitiesTypes; 97 | } 98 | 99 | public void setEntitiesTypes(List entitiesTypes) { 100 | this.entitiesTypes = entitiesTypes; 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /src/mpi/aida/data/PreparedInput.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.data; 2 | 3 | import mpi.tokenizer.data.Tokens; 4 | 5 | public class PreparedInput { 6 | 7 | private String docId; 8 | 9 | private Tokens tokens; 10 | 11 | /** Used by the local similarity methods in the disambiguation. It holds 12 | * the document tokens both as strings and converted to word ids. 
*/ 13 | private Context context; 14 | 15 | private Mentions mentions; 16 | 17 | public PreparedInput(String docId) { 18 | this.docId = docId; 19 | } 20 | 21 | public PreparedInput(String docId, Tokens tokens, Mentions mentions) { 22 | this.docId = docId; 23 | this.tokens = tokens; 24 | this.mentions = mentions; 25 | context = createContextFromTokens(tokens); 26 | } 27 | 28 | public Tokens getTokens() { 29 | return tokens; 30 | } 31 | 32 | public void setTokens(Tokens tokens) { 33 | this.tokens = tokens; 34 | context = createContextFromTokens(tokens); 35 | } 36 | 37 | public Mentions getMentions() { 38 | return mentions; 39 | } 40 | 41 | public void setMentions(Mentions mentions) { 42 | this.mentions = mentions; 43 | } 44 | 45 | public Context getContext() { 46 | return context; 47 | } 48 | 49 | private Context createContextFromTokens(Tokens t) { 50 | return new Context(t); 51 | } 52 | 53 | public String getDocId() { 54 | return docId; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/mpi/aida/data/ResultEntity.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.data; 2 | 3 | import java.io.Serializable; 4 | import java.text.NumberFormat; 5 | import java.util.ArrayList; 6 | import java.util.List; 7 | import java.util.Locale; 8 | 9 | /** 10 | * Entity the was assigned to a ResultMention. 
11 | * The entity String is the identifier in YAGO2 12 | * (see http://www.yago-knowledge.org) 13 | * 14 | * 15 | */ 16 | public class ResultEntity implements Comparable, Serializable { 17 | 18 | private static final long serialVersionUID = -7062155406718136994L; 19 | 20 | /** YAGO2 identifier of the entity (http://www.yago-knowledge.org) */ 21 | private String entity; 22 | 23 | /** Score assigned to the entity */ 24 | private double disambiguationScore; 25 | 26 | public ResultEntity(String entity, double disambiguationScore) { 27 | super(); 28 | this.entity = entity; 29 | this.disambiguationScore = disambiguationScore; 30 | } 31 | 32 | public static ResultEntity getNoMatchingEntity() { 33 | return new ResultEntity(Entity.NO_MATCHING_ENTITY, 0.0); 34 | } 35 | 36 | public static List getResultEntityAsList(ResultEntity re) { 37 | List res = new ArrayList(1); 38 | res.add(re); 39 | return res; 40 | } 41 | 42 | /** 43 | * @return YAGO2 identifier of the entity (http://www.yago-knowledge.org) 44 | */ 45 | public String getEntity() { 46 | return entity; 47 | } 48 | 49 | public void setEntity(String entity) { 50 | this.entity = entity; 51 | } 52 | 53 | public double getDisambiguationScore() { 54 | return disambiguationScore; 55 | } 56 | 57 | public void setDisambiguationScore(double disambiguationScore) { 58 | this.disambiguationScore = disambiguationScore; 59 | } 60 | 61 | public boolean isNoMatchingEntity() { 62 | return entity.equals(Entity.NO_MATCHING_ENTITY); 63 | } 64 | 65 | @Override 66 | public int compareTo(ResultEntity re) { 67 | // natural ordering for ResultEntities is descending 68 | return new Double(new Double(re.getDisambiguationScore())).compareTo(disambiguationScore); 69 | } 70 | 71 | public String toString() { 72 | NumberFormat df = NumberFormat.getInstance(Locale.ENGLISH); 73 | df.setMaximumFractionDigits(5); 74 | return entity + " (" + df.format(disambiguationScore) + ")"; 75 | } 76 | } 77 | 
-------------------------------------------------------------------------------- /src/mpi/aida/data/ResultMention.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.data; 2 | 3 | import java.io.Serializable; 4 | 5 | import org.slf4j.Logger; 6 | import org.slf4j.LoggerFactory; 7 | 8 | /** 9 | * Mention detected in the input text. It is identified uniquely 10 | * by the combination of the three members docId+mention+characterOffset. 11 | * 12 | * 13 | */ 14 | public class ResultMention implements Comparable, Serializable { 15 | private static final Logger logger = 16 | LoggerFactory.getLogger(ResultMention.class); 17 | 18 | private static final long serialVersionUID = -6791087404868641006L; 19 | 20 | private String docId; 21 | 22 | private String mention; 23 | 24 | private int characterOffset; 25 | 26 | private int characterLength; 27 | 28 | public ResultMention(String docId, String mention, int characterOffset, int characterLength) { 29 | super(); 30 | this.docId = docId; 31 | this.mention = mention; 32 | this.characterOffset = characterOffset; 33 | this.characterLength = characterLength; 34 | } 35 | 36 | public String getDocId() { 37 | return docId; 38 | } 39 | 40 | public void setDocId(String docId) { 41 | this.docId = docId; 42 | } 43 | 44 | public String getMention() { 45 | return mention; 46 | } 47 | 48 | public void setMention(String mention) { 49 | this.mention = mention; 50 | } 51 | 52 | public int getCharacterOffset() { 53 | return characterOffset; 54 | } 55 | 56 | public void setCharacterOffset(int characterOffset) { 57 | this.characterOffset = characterOffset; 58 | } 59 | 60 | public int getCharacterLength() { 61 | return characterLength; 62 | } 63 | 64 | public void setCharacterLength(int characterLength) { 65 | this.characterLength = characterLength; 66 | } 67 | 68 | public static ResultMention getResultMentionFromMentionString(String docId, String mentionString) { 69 | String[] data = 
mentionString.split(":::"); 70 | 71 | if (data.length < 3) { 72 | logger.error("Could not create ResultMention from mentionString: " + mentionString); 73 | return null; 74 | } 75 | 76 | String mention = data[0]; 77 | int characterOffset = Integer.parseInt(data[1]); 78 | int characterLength = Integer.parseInt(data[2]); 79 | 80 | ResultMention rm = new ResultMention(docId, mention, characterOffset, characterLength); 81 | return rm; 82 | } 83 | 84 | @Override 85 | public boolean equals(Object o) { 86 | if (o instanceof ResultMention) { 87 | ResultMention rm = (ResultMention) o; 88 | return (docId.equals(rm.getDocId()) && mention.equals(rm.getMention()) && characterOffset == rm.getCharacterOffset()); 89 | } else { 90 | return false; 91 | } 92 | } 93 | 94 | @Override 95 | public int hashCode() { 96 | return docId.hashCode() + mention.hashCode() + characterOffset; 97 | } 98 | 99 | @Override 100 | public int compareTo(ResultMention rm) { 101 | int result = docId.compareTo(rm.getDocId()); 102 | 103 | if (result == 0) { 104 | result = new Integer(characterOffset).compareTo(new Integer(rm.getCharacterOffset())); 105 | } 106 | 107 | return result; 108 | } 109 | 110 | public String toString() { 111 | return "[" + docId + "] " + mention + " (" + characterOffset + "/" + characterLength + ")"; 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/GraphNode.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph; 2 | 3 | import gnu.trove.map.hash.TIntDoubleHashMap; 4 | 5 | public class GraphNode { 6 | 7 | private int id; 8 | private GraphNodeTypes type; 9 | private Object NodeData = null; 10 | private TIntDoubleHashMap successors; 11 | 12 | public GraphNode() { 13 | successors = new TIntDoubleHashMap(); 14 | } 15 | 16 | public int getId() { 17 | return id; 18 | } 19 | public void setId(int id) { 20 | this.id = id; 21 | } 22 | public GraphNodeTypes 
getType() { 23 | return type; 24 | } 25 | public void setType(GraphNodeTypes type) { 26 | this.type = type; 27 | } 28 | public Object getNodeData() { 29 | return NodeData; 30 | } 31 | public void setNodeData(Object nodeData) { 32 | NodeData = nodeData; 33 | } 34 | public TIntDoubleHashMap getSuccessors() { 35 | return successors; 36 | } 37 | public void setSuccessors(TIntDoubleHashMap successors) { 38 | this.successors = successors; 39 | } 40 | 41 | } 42 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/GraphNodeTypes.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph; 2 | 3 | public enum GraphNodeTypes { 4 | MENTION, ENTITY 5 | } 6 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/algorithms/DisambiguationAlgorithm.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.algorithms; 2 | 3 | import java.util.List; 4 | import java.util.Map; 5 | 6 | import mpi.aida.data.ResultEntity; 7 | import mpi.aida.data.ResultMention; 8 | 9 | 10 | public abstract class DisambiguationAlgorithm { 11 | 12 | public abstract Map> disambiguate() throws Exception; 13 | 14 | } 15 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/algorithms/Node.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.algorithms; 2 | 3 | import java.util.Comparator; 4 | 5 | /** 6 | * Utility class to be used in the implemenation of the shortest-path 7 | * algorithms. 
We store a node together with its distance, and then we develop a 8 | * comparator that sorts nodes according to their distances 9 | */ 10 | public class Node { 11 | 12 | private int key; 13 | 14 | private double distance; 15 | 16 | public Node(int k, double d) { 17 | 18 | key = k; 19 | distance = d; 20 | 21 | } 22 | 23 | public int getKey() { 24 | 25 | return key; 26 | } 27 | 28 | public double getDistance() { 29 | 30 | return distance; 31 | } 32 | 33 | public void setDistance(double d) { 34 | 35 | distance = d; 36 | 37 | } 38 | } 39 | 40 | class NodeComparator implements Comparator { 41 | 42 | public int compare(Node first, Node second) { 43 | 44 | // I want to use the opposite order, so that I can build a max priority 45 | // queue using the default 46 | // implementation of a min priority queue 47 | Double firstDistance = first.getDistance(); 48 | Double secondDistance = second.getDistance(); 49 | return firstDistance.compareTo(secondDistance); 50 | 51 | } 52 | 53 | public boolean equals(Node first, Node second) { 54 | 55 | // I just want only one node with a given key in the priority queue 56 | if (first.getKey() == second.getKey()) 57 | return true; 58 | else 59 | return false; 60 | } 61 | 62 | } -------------------------------------------------------------------------------- /src/mpi/aida/graph/extraction/DegreeComparator.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.extraction; 2 | 3 | import java.util.Comparator; 4 | 5 | public class DegreeComparator implements Comparator { 6 | 7 | 8 | @Override 9 | public int compare(String arg0, String arg1) { 10 | // I want to use the opposite order, so that I can build a max priority queue using the default 11 | // implementation of a min priority queue 12 | String first = arg0; 13 | String second = arg1; 14 | Double firstDegree = Double.parseDouble(first.split(":::")[1]); 15 | Double secondDegree = Double.parseDouble(second.split(":::")[1]); 16 | 
return firstDegree.compareTo(secondDegree); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/extraction/ExtractGraphAllEdges.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.extraction; 2 | 3 | import mpi.aida.data.Entities; 4 | import mpi.aida.data.Mentions; 5 | import mpi.aida.graph.similarity.EnsembleEntityEntitySimilarity; 6 | 7 | 8 | public class ExtractGraphAllEdges extends ExtractGraph { 9 | 10 | public ExtractGraphAllEdges(String graphName, Mentions m, Entities ue, EnsembleEntityEntitySimilarity eeSim, double alpha) { 11 | super(graphName, m, ue, eeSim, alpha); 12 | } 13 | 14 | protected boolean haveDistinceMentions(String e1, String e2) { 15 | return true; 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/EnsembleEntityEntitySimilarity.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity; 2 | 3 | import java.util.List; 4 | 5 | import mpi.aida.data.Entities; 6 | import mpi.aida.data.Entity; 7 | import mpi.aida.graph.similarity.util.SimilaritySettings; 8 | import mpi.experiment.trace.Tracer; 9 | 10 | public class EnsembleEntityEntitySimilarity { 11 | 12 | private List eeSims; 13 | 14 | public EnsembleEntityEntitySimilarity(Entities uniqueEntities, SimilaritySettings settings, Tracer tracer) throws Exception { 15 | eeSims = settings.getEntityEntitySimilarities(uniqueEntities, tracer); 16 | } 17 | 18 | public double calcSimilarity(Entity a, Entity b) throws Exception { 19 | double weightedSimilarity = 0.0; 20 | 21 | for (EntityEntitySimilarity eeSim : eeSims) { 22 | double sim = eeSim.calcSimilarity(a, b) * eeSim.getWeight(); 23 | weightedSimilarity += sim; 24 | } 25 | 26 | return weightedSimilarity; 27 | } 28 | 29 | public List getEeSims() { 30 | return eeSims; 31 | } 32 | } 
33 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/MaterializedPriorProbability.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity; 2 | 3 | import gnu.trove.map.hash.TIntDoubleHashMap; 4 | 5 | import java.sql.SQLException; 6 | import java.util.HashMap; 7 | import java.util.Set; 8 | 9 | import mpi.aida.access.DataAccess; 10 | 11 | /** 12 | * This class calculates the prior probability of a mention 13 | * being associated with a given entity. The prior probability is based 14 | * on the occurrence count of links (and their anchor text as mention) with 15 | * a given Wikipedia/YAGO entity as target. 16 | * 17 | * It is faster than {@link PriorProbability} because it uses a table with 18 | * all the priors materialized. To get the table, run the {@link MaterializedPriorProbability} 19 | * main method, it will create another table in the YAGO2 database which can 20 | * then be used by this class. 
21 | * 22 | * 23 | */ 24 | public class MaterializedPriorProbability extends PriorProbability { 25 | 26 | public MaterializedPriorProbability(Set mentions) throws SQLException { 27 | super(mentions); 28 | } 29 | 30 | public void setupMentions(Set mentions) throws SQLException { 31 | priors = new HashMap(); 32 | for (String mention : mentions) { 33 | mention = conflateMention(mention); 34 | TIntDoubleHashMap entityPriors = DataAccess.getEntityPriors(mention); 35 | priors.put(mention, entityPriors); 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/PriorProbability.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity; 2 | 3 | import gnu.trove.iterator.TIntDoubleIterator; 4 | import gnu.trove.map.hash.TIntDoubleHashMap; 5 | 6 | import java.sql.SQLException; 7 | import java.util.HashMap; 8 | import java.util.Locale; 9 | import java.util.NoSuchElementException; 10 | import java.util.Set; 11 | 12 | import mpi.aida.data.Entity; 13 | 14 | /** 15 | * This class calculates the prior probability of a mention 16 | * being associated with a given entity. The prior probability is based 17 | * on the occurrence count of links (and their anchor text as mention) with 18 | * a given Wikipedia/YAGO entity as target. 19 | * 20 | * The calculation is done on the fly, so it is a bit slow. For a faster implementation, 21 | * use {@link MaterializedPriorProbability}. 22 | * 23 | * It uses the 'hasInternalWikipediaLinkTo' and 'hasAnchorText' relations 24 | * in the YAGO2 database. 
25 | * 26 | * 27 | */ 28 | public abstract class PriorProbability { 29 | 30 | protected HashMap priors; 31 | 32 | private double weight; 33 | 34 | public PriorProbability(Set mentions) throws SQLException { 35 | setupMentions(mentions); 36 | } 37 | 38 | public double getWeight() { 39 | return weight; 40 | } 41 | 42 | public void setWeight(double weight) { 43 | this.weight = weight; 44 | } 45 | 46 | protected abstract void setupMentions(Set mentions) throws SQLException; 47 | 48 | /** 49 | * Returns the prior probability for the given mention-entity pair. 50 | * If smoothing is true, it will return the lowest prior among all entities if 51 | * there is no real prior. 52 | * 53 | * @param mention 54 | * @param entity 55 | * @param smoothing 56 | * @return 57 | */ 58 | public double getPriorProbability( 59 | String mentionText, Entity entity, boolean smoothing) { 60 | mentionText = conflateMention(mentionText); 61 | TIntDoubleHashMap mentionPriors = priors.get(mentionText); 62 | 63 | if (mentionPriors == null) { 64 | throw new NoSuchElementException( 65 | "Mention " + mentionText + " must be passed to constructor!"); 66 | } 67 | 68 | double entityPrior = mentionPriors.get(entity.getId()); 69 | if (smoothing && entityPrior == 0.0) { 70 | double smallestPrior = 1.0; 71 | 72 | for (TIntDoubleIterator it = mentionPriors.iterator(); it.hasNext();) { 73 | it.advance(); 74 | double currentPrior = it.value(); 75 | if (currentPrior < smallestPrior) { 76 | smallestPrior = currentPrior; 77 | } 78 | } 79 | entityPrior = smallestPrior; 80 | } 81 | 82 | return entityPrior; 83 | } 84 | 85 | public double getBestPrior(String mentionText) { 86 | mentionText = conflateMention(mentionText); 87 | TIntDoubleHashMap mentionPriors = priors.get(mentionText); 88 | 89 | double bestPrior = 0.0; 90 | for (TIntDoubleIterator it = mentionPriors.iterator(); it.hasNext();) { 91 | it.advance(); 92 | double currentPrior = it.value(); 93 | if (currentPrior > bestPrior) { 94 | bestPrior = currentPrior; 
95 | } 96 | } 97 | 98 | return bestPrior; 99 | } 100 | 101 | public double getPriorProbability(String mentionText, Entity entity) { 102 | return getPriorProbability(mentionText, entity, false); 103 | } 104 | 105 | public static String conflateMention(String mention) { 106 | // conflate cases for mentions of length >= 4 107 | if (mention.length() >= 4) { 108 | mention = mention.toUpperCase(Locale.ENGLISH); 109 | } 110 | 111 | return mention; 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/context/EmptyEntitiesContext.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.context; 2 | 3 | import mpi.aida.data.Entities; 4 | import mpi.aida.data.Entity; 5 | 6 | public class EmptyEntitiesContext extends EntitiesContext { 7 | 8 | public EmptyEntitiesContext(Entities entities) throws Exception { 9 | super(entities, null); 10 | } 11 | 12 | @Override 13 | public int[] getContext(Entity entity) { 14 | return null; 15 | } 16 | 17 | @Override 18 | protected void setupEntities(Entities entities) throws Exception { 19 | // nothing 20 | } 21 | 22 | public String toString() { 23 | return "EmptyEntitiesContext"; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/context/EntitiesContext.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.context; 2 | 3 | import java.util.LinkedList; 4 | import java.util.List; 5 | 6 | import mpi.aida.AidaManager; 7 | import mpi.aida.data.Entities; 8 | import mpi.aida.data.Entity; 9 | import mpi.tokenizer.data.Token; 10 | import mpi.tokenizer.data.Tokens; 11 | 12 | import org.slf4j.Logger; 13 | import org.slf4j.LoggerFactory; 14 | 15 | import basics.Normalize; 16 | 17 | public abstract class EntitiesContext { 18 | private static final Logger logger 
= 19 | LoggerFactory.getLogger(EntitiesContext.class); 20 | 21 | protected Entities entities; 22 | protected EntitiesContextSettings settings; 23 | 24 | public EntitiesContext(Entities entities, EntitiesContextSettings settings) throws Exception { 25 | this.entities = entities; 26 | this.settings = settings; 27 | 28 | long beginTime = System.currentTimeMillis(); 29 | 30 | setupEntities(entities); 31 | 32 | long runTime = (System.currentTimeMillis() - beginTime) / 1000; 33 | logger.debug("Done setting up " + this + ": " + runTime + "s"); 34 | } 35 | 36 | public void setEntities(Entities entities) throws Exception { 37 | this.entities = entities; 38 | setupEntities(entities); 39 | } 40 | 41 | public Entities getEntities() { 42 | return entities; 43 | } 44 | 45 | public abstract int[] getContext(Entity entity); 46 | 47 | protected abstract void setupEntities(Entities entities) throws Exception; 48 | 49 | protected List getTokens(String string) { 50 | List tokens = new LinkedList(); 51 | 52 | Tokens advTokens = AidaManager.tokenize("EntitiesContext", string); 53 | 54 | for (Token token : advTokens) { 55 | tokens.add(token.getOriginal()); 56 | } 57 | 58 | return tokens; 59 | } 60 | 61 | public static String getEntityName(String entity) { 62 | String norm = Normalize.unEntity(entity); 63 | norm = norm.replaceAll(" \\(.*?\\)$", ""); 64 | 65 | return norm; 66 | } 67 | 68 | public String toString() { 69 | return getIdentifier(); 70 | } 71 | 72 | public String getIdentifier() { 73 | return this.getClass().getSimpleName(); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/context/EntitiesContextSettings.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.context; 2 | 3 | 4 | public class EntitiesContextSettings { 5 | private int numberOfEntityKeyphrases = Integer.MAX_VALUE; 6 | 7 | private boolean normalizeWeights = true; // 
default is to normalize 8 | private boolean useConfusableMIWeight = false; 9 | private boolean averageWeights = false; 10 | 11 | private int nGramLength = 2; 12 | 13 | public static final double DEFAULT_KEYPHRASE_ALPHA = 0.9713705285593512; 14 | public static final double DEFAULT_KEYWORD_ALPHA = 0.9713705285593512; 15 | 16 | private double entityCoherenceKeyphraseAlpha = DEFAULT_KEYPHRASE_ALPHA; 17 | private double entityCoherenceKeywordAlpha = DEFAULT_KEYWORD_ALPHA; 18 | 19 | private String keyphraseSourceExclusion; 20 | 21 | // LSH 22 | private int lshBandSize; 23 | private int lshBandCount; 24 | private String lshDatabaseTable; 25 | 26 | /** 27 | * 28 | * @return Balance between Keyphrase MI/IDF. Use alpha*mi, (1-alpha)*idf 29 | */ 30 | public double getEntityCoherenceKeyphraseAlpha() { 31 | return entityCoherenceKeyphraseAlpha; 32 | } 33 | 34 | public void setEntityCoherenceKeyphraseAlpha(double entityCoherenceKeyphraseAlpha) { 35 | this.entityCoherenceKeyphraseAlpha = entityCoherenceKeyphraseAlpha; 36 | } 37 | 38 | /** 39 | * 40 | * @return Balance between Keyword MI/IDF. 
Use alpha*mi, (1-alpha)*idf 41 | */ 42 | public double getEntityCoherenceKeywordAlpha() { 43 | return entityCoherenceKeywordAlpha; 44 | } 45 | 46 | public void setEntityCoherenceKeywordAlpha(double entityCoherenceKeywordAlpha) { 47 | this.entityCoherenceKeywordAlpha = entityCoherenceKeywordAlpha; 48 | } 49 | 50 | public int getNumberOfEntityKeyphrases() { 51 | return numberOfEntityKeyphrases; 52 | } 53 | 54 | public void setNumberOfEntityKeyphrases(int numberOfEntityKeyphrases) { 55 | this.numberOfEntityKeyphrases = numberOfEntityKeyphrases; 56 | } 57 | 58 | 59 | public String getKeyphraseSourceExclusion() { 60 | return keyphraseSourceExclusion; 61 | } 62 | 63 | 64 | public void setKeyphraseSourceExclusion(String keyphraseSourceExclusion) { 65 | this.keyphraseSourceExclusion = keyphraseSourceExclusion; 66 | } 67 | 68 | public boolean shouldNormalizeWeights() { 69 | return normalizeWeights; 70 | } 71 | 72 | public void setShouldNormalizeWeights(boolean flag) { 73 | normalizeWeights = flag; 74 | } 75 | 76 | 77 | public boolean shouldUseConfusableMIWeight() { 78 | return useConfusableMIWeight; 79 | } 80 | 81 | public void setUseConfusableMIWeight(boolean useConfusableMIWeight) { 82 | this.useConfusableMIWeight = useConfusableMIWeight; 83 | } 84 | 85 | public boolean shouldAverageWeights() { 86 | return averageWeights; 87 | } 88 | 89 | public void setShouldAverageWeights(boolean flag) { 90 | this.averageWeights = flag; 91 | } 92 | 93 | public void setNgramLength(int nGramLength) { 94 | this.nGramLength = nGramLength; 95 | } 96 | 97 | public int getNgramLength() { 98 | return nGramLength; 99 | } 100 | 101 | public int getLshBandSize() { 102 | return lshBandSize; 103 | } 104 | 105 | public void setLshBandSize(int lshBandSize) { 106 | this.lshBandSize = lshBandSize; 107 | } 108 | 109 | public int getLshBandCount() { 110 | return lshBandCount; 111 | } 112 | 113 | public void setLshBandCount(int lshBandCount) { 114 | this.lshBandCount = lshBandCount; 115 | } 116 | 117 | 
public String getLshDatabaseTable() { 118 | return lshDatabaseTable; 119 | } 120 | 121 | public void setLshDatabaseTable(String lshDatabaseTable) { 122 | this.lshDatabaseTable = lshDatabaseTable; 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/context/KeyphraseReweightedKeywordContext.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.context; 2 | 3 | import gnu.trove.map.hash.TIntObjectHashMap; 4 | 5 | import java.util.HashMap; 6 | import java.util.Map; 7 | 8 | import mpi.aida.data.Entities; 9 | import mpi.aida.data.Entity; 10 | import mpi.experiment.trace.GraphTracer; 11 | import mpi.experiment.trace.NullGraphTracer; 12 | 13 | 14 | public class KeyphraseReweightedKeywordContext extends FastWeightedKeyphrasesContext { 15 | 16 | public KeyphraseReweightedKeywordContext(Entities entities) throws Exception { 17 | super(entities); 18 | } 19 | 20 | public KeyphraseReweightedKeywordContext(Entities entities, EntitiesContextSettings settings) throws Exception { 21 | super(entities, settings); 22 | } 23 | 24 | @Override 25 | protected TIntObjectHashMap fillEntityVectors() { 26 | TIntObjectHashMap vectors = new TIntObjectHashMap(); 27 | 28 | for (Entity e : entities) { 29 | float[] weights = new float[allKeywords.size()]; 30 | 31 | for (int kp : getEntityKeyphraseIds(e)) { 32 | for (int tokenId : getKeyphraseTokenIds(kp, true)) { 33 | double mi = entity2keyword2mi.get(e.getId()).get(tokenId); 34 | 35 | double finalTokenWeight = mi; 36 | 37 | double keyphraseWeight = getKeyphraseMiWeight(e, kp); 38 | double reweightedFinalTokenWeight = keyphraseWeight * finalTokenWeight; 39 | 40 | if (Double.isNaN(reweightedFinalTokenWeight)) { 41 | System.err.println("NAN"); 42 | } 43 | 44 | weights[tokenId] = (float) reweightedFinalTokenWeight; 45 | } 46 | } 47 | 48 | if (!(GraphTracer.gTracer instanceof NullGraphTracer)) { 49 | 
Map entityKeywords = new HashMap(); 50 | 51 | for (int i = 0; i < weights.length; i++) { 52 | if (weights[i] > 0.0) { 53 | entityKeywords.put(getKeywordForId(i), weights[i]); 54 | } 55 | } 56 | } 57 | 58 | vectors.put(e.getId(), weights); 59 | } 60 | 61 | return vectors; 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/context/TextContext.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.context; 2 | 3 | import gnu.trove.map.hash.TIntObjectHashMap; 4 | import mpi.aida.data.Entities; 5 | import mpi.aida.data.Entity; 6 | 7 | /** 8 | * Abstract class for all contexts containing solely integer ids 9 | * representing tokens. 10 | * 11 | * 12 | */ 13 | public abstract class TextContext extends EntitiesContext { 14 | 15 | private TIntObjectHashMap entityTokens; 16 | 17 | public TextContext(Entities entities, EntitiesContextSettings settings) throws Exception { 18 | super(entities, settings); 19 | } 20 | 21 | @Override 22 | public int[] getContext(Entity entity) { 23 | return entityTokens.get(entity.getId()); 24 | } 25 | 26 | @Override 27 | protected void setupEntities(Entities entities) throws Exception { 28 | entityTokens = new TIntObjectHashMap(); 29 | 30 | for (int entity : entities.getUniqueIds()) { 31 | entityTokens.put(entity, getTextTokens(entity)); 32 | } 33 | } 34 | 35 | protected abstract int[] getTextTokens(int entity); 36 | } 37 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/exception/MissingSettingException.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.exception; 2 | 3 | 4 | public class MissingSettingException extends Exception { 5 | 6 | public MissingSettingException(String string) { 7 | super(string); 8 | } 9 | 10 | /** 11 | * 12 | */ 13 | private static 
final long serialVersionUID = -1610134821236307372L; 14 | 15 | } 16 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/importance/EntityImportance.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.importance; 2 | 3 | import java.sql.SQLException; 4 | 5 | import mpi.aida.data.Entities; 6 | import mpi.aida.data.Entity; 7 | 8 | /** 9 | * This class serves as way to get the importance of an entity 10 | * with regard to the complete collection, not to a specific mention (such as prior probability) 11 | * 12 | * 13 | */ 14 | public abstract class EntityImportance { 15 | 16 | private Entities entities; 17 | 18 | private double weight = 0.0; 19 | 20 | public EntityImportance(Entities entities) throws SQLException { 21 | this.entities = entities; 22 | setupEntities(entities); 23 | } 24 | 25 | public Entities getEntities() { 26 | return entities; 27 | } 28 | 29 | protected abstract void setupEntities(Entities e) throws SQLException; 30 | 31 | public abstract double getImportance(Entity entity); 32 | 33 | public double getWeight() { 34 | return weight; 35 | } 36 | 37 | public void setWeight(double weight) { 38 | this.weight = weight; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/importance/InlinkCountImportance.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.importance; 2 | 3 | import gnu.trove.map.hash.TIntDoubleHashMap; 4 | import gnu.trove.map.hash.TIntObjectHashMap; 5 | 6 | import java.sql.SQLException; 7 | 8 | import mpi.aida.access.DataAccess; 9 | import mpi.aida.data.Entities; 10 | import mpi.aida.data.Entity; 11 | import mpi.aida.util.YagoUtil; 12 | import mpi.database.DBConnection; 13 | 14 | /** 15 | * Measures the importance of an entity by the number of 16 | * incoming 
links in Wikipedia/YAGO 17 | * 18 | * 19 | */ 20 | public class InlinkCountImportance extends EntityImportance { 21 | 22 | private TIntDoubleHashMap inlinkImportance; 23 | 24 | DBConnection con; 25 | 26 | public InlinkCountImportance(Entities entities) throws SQLException { 27 | super(entities); 28 | } 29 | 30 | @Override 31 | protected void setupEntities(Entities e) throws SQLException { 32 | TIntObjectHashMap neighbors = DataAccess.getInlinkNeighbors(e); 33 | for (int eId : e.getUniqueIds()) { 34 | double importance = 35 | (double) neighbors.get(eId).length 36 | / (double) YagoUtil.TOTAL_YAGO_ENTITIES; 37 | inlinkImportance.put(eId, importance); 38 | } 39 | } 40 | 41 | @Override 42 | public double getImportance(Entity entity) { 43 | return inlinkImportance.get(entity.getId()); 44 | } 45 | 46 | public String toString() { 47 | return "InlinkCountImportance"; 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/measure/AlwaysOneSimilarityMeasure.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.measure; 2 | 3 | import mpi.aida.data.Context; 4 | import mpi.aida.data.Entity; 5 | import mpi.aida.data.Mention; 6 | import mpi.aida.graph.similarity.context.EntitiesContext; 7 | import mpi.experiment.trace.Tracer; 8 | 9 | public class AlwaysOneSimilarityMeasure extends MentionEntitySimilarityMeasure { 10 | 11 | public AlwaysOneSimilarityMeasure(Tracer tracer) { 12 | super(tracer); 13 | } 14 | 15 | @Override 16 | public double calcSimilarity(Mention mention, Context context, Entity entity, EntitiesContext entitiesContext) { 17 | return 1.0; 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/measure/EntityEntitySimilarityMeasure.java: -------------------------------------------------------------------------------- 1 | package 
mpi.aida.graph.similarity.measure; 2 | 3 | import mpi.aida.data.Entity; 4 | import mpi.aida.graph.similarity.context.EntitiesContext; 5 | import mpi.experiment.trace.Tracer; 6 | 7 | public abstract class EntityEntitySimilarityMeasure extends SimilarityMeasure { 8 | 9 | public EntityEntitySimilarityMeasure(Tracer tracer) { 10 | super(tracer); 11 | } 12 | 13 | public abstract double calcSimilarity(Entity a, Entity b, EntitiesContext context); 14 | } 15 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/measure/InlinkOverlapEntityEntitySimilarity.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.measure; 2 | 3 | import gnu.trove.list.array.TIntArrayList; 4 | import gnu.trove.map.hash.TIntObjectHashMap; 5 | import gnu.trove.set.hash.TIntHashSet; 6 | 7 | import java.util.BitSet; 8 | 9 | import mpi.aida.AidaManager; 10 | import mpi.aida.access.DataAccess; 11 | import mpi.aida.data.Entities; 12 | import mpi.aida.data.Entity; 13 | import mpi.aida.graph.similarity.EntityEntitySimilarity; 14 | import mpi.aida.graph.similarity.context.EntitiesContext; 15 | import mpi.database.DBConnection; 16 | 17 | import org.slf4j.Logger; 18 | import org.slf4j.LoggerFactory; 19 | 20 | /** 21 | * Similarity of two entities is the number of common inlinks 22 | * 23 | * 24 | */ 25 | public class InlinkOverlapEntityEntitySimilarity extends EntityEntitySimilarity { 26 | private static final Logger logger = 27 | LoggerFactory.getLogger(InlinkOverlapEntityEntitySimilarity.class); 28 | 29 | private TIntObjectHashMap entity2inlink; 30 | private TIntObjectHashMap entity2vector; 31 | 32 | DBConnection con; 33 | 34 | public InlinkOverlapEntityEntitySimilarity(EntityEntitySimilarityMeasure similarityMeasure, EntitiesContext entityContext) throws Exception { 35 | // not needed - uses entites directly 36 | super(similarityMeasure, entityContext); 37 | 38 | 
setupEntities(entityContext.getEntities()); 39 | } 40 | 41 | private void setupEntities(Entities entities) throws Exception { 42 | if (entities.uniqueNameSize() == 0) { 43 | logger.info("Skipping initialization of InlinkEntityEntitySimilarity for " + entities.uniqueNameSize() + " entities"); 44 | return; 45 | } 46 | 47 | logger.info("Initializing InlinkEntityEntitySimilarity for " + entities.uniqueNameSize() + " entities"); 48 | 49 | con = AidaManager.getConnectionForDatabase(AidaManager.DB_AIDA, "getting inlinks"); 50 | 51 | entity2inlink = DataAccess.getInlinkNeighbors(entities); 52 | 53 | // get all inlinks for all entities 54 | // get all inlinks for all entities 55 | TIntHashSet allInlinks = new TIntHashSet(); 56 | 57 | for (int[] neighbors : entity2inlink.valueCollection()) { 58 | allInlinks.addAll(neighbors); 59 | } 60 | 61 | TIntArrayList allInlinksList = new TIntArrayList(allInlinks.size()); 62 | for (int entry : allInlinksList.toArray()) { 63 | allInlinksList.add(entry); 64 | } 65 | allInlinksList.sort(); 66 | 67 | // now create the bitvectors for each entity 68 | logger.info("Creating bitvectors for entities"); 69 | 70 | entity2vector = new TIntObjectHashMap(); 71 | 72 | for (int entity : entities.getUniqueIds()) { 73 | int[] inlinks = entity2inlink.get(entity); 74 | 75 | BitSet bs = new BitSet(allInlinksList.size()); 76 | 77 | int current = 0; 78 | 79 | for (int inlink : inlinks) { 80 | // move to position of inlink in allInlinks 81 | while (allInlinksList.get(current) != inlink) { 82 | current++; 83 | } 84 | bs.set(current); 85 | } 86 | 87 | entity2vector.put(entity, bs); 88 | } 89 | 90 | AidaManager.releaseConnection(AidaManager.DB_AIDA, con); 91 | 92 | logger.info("Done initializing InlinkEntityEntitySimilarity"); 93 | } 94 | 95 | @Override 96 | public double calcSimilarity(Entity a, Entity b) throws Exception { 97 | BitSet bsA = entity2vector.get(a.getId()); 98 | BitSet bsB = entity2vector.get(b.getId()); 99 | 100 | BitSet intersection = (BitSet) 
bsA.clone(); 101 | intersection.and(bsB); 102 | 103 | BitSet union = (BitSet) bsA.clone(); 104 | union.or(bsB); 105 | 106 | if (intersection.cardinality() == 0 || union.cardinality() == 0) { 107 | return 0.0; // cannot calc 108 | } 109 | 110 | double sim = (double) intersection.cardinality() 111 | / (double) union.cardinality(); 112 | 113 | return sim; 114 | } 115 | 116 | public String toString() { 117 | return "InlinkOverlapEntityEntitySimilarity"; 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/measure/JaccardEntityEntitySimilarityMeasure.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.measure; 2 | 3 | import gnu.trove.set.hash.TIntHashSet; 4 | 5 | import java.util.HashMap; 6 | import java.util.Map; 7 | 8 | import mpi.aida.AidaManager; 9 | import mpi.aida.data.Entity; 10 | import mpi.aida.graph.similarity.context.EntitiesContext; 11 | import mpi.aida.graph.similarity.context.FastWeightedKeyphrasesContext; 12 | import mpi.aida.util.CollectionUtils; 13 | import mpi.experiment.trace.Tracer; 14 | import mpi.experiment.trace.measures.KeytermEntityEntityMeasureTracer; 15 | import mpi.experiment.trace.measures.TermTracer; 16 | 17 | public class JaccardEntityEntitySimilarityMeasure extends EntityEntitySimilarityMeasure { 18 | 19 | public JaccardEntityEntitySimilarityMeasure(Tracer tracer) { 20 | super(tracer); 21 | } 22 | 23 | @Override 24 | public double calcSimilarity(Entity a, Entity b, EntitiesContext context) { 25 | TIntHashSet contextA = new TIntHashSet(context.getContext(a)); 26 | TIntHashSet contextB = new TIntHashSet(context.getContext(b)); 27 | 28 | TIntHashSet union = getUnion(contextA, contextB); 29 | TIntHashSet intersection = getIntersection(contextA, contextB); 30 | 31 | double jaccardSim = (double) intersection.size() / (double) union.size(); 32 | return jaccardSim; 33 | } 34 | 35 | private 
TIntHashSet getIntersection(TIntHashSet contextA, TIntHashSet contextB) { 36 | TIntHashSet is = new TIntHashSet(); 37 | 38 | for (int a : contextA.toArray()) { 39 | if (contextB.contains(a) || contextB.contains(AidaManager.expandTerm(a))) { 40 | is.add(a); 41 | } 42 | } 43 | 44 | return is; 45 | } 46 | 47 | private TIntHashSet getUnion(TIntHashSet contextA, TIntHashSet contextB) { 48 | TIntHashSet union = new TIntHashSet(); 49 | 50 | for (int a : contextB.toArray()) { 51 | union.add(a); 52 | } 53 | 54 | for (int a : contextA.toArray()) { 55 | if (!union.contains(a) && !union.contains(AidaManager.expandTerm(a))) { 56 | union.add(a); 57 | } 58 | } 59 | 60 | return union; 61 | } 62 | 63 | @SuppressWarnings("unused") 64 | private void collectTracingInfo(Entity a, Entity b, int[] kpsA, int[] kpsB, double sim, Map matches, FastWeightedKeyphrasesContext kwc) { 65 | Map e1keyphrases = new HashMap(); 66 | for (int kp : kpsA) { 67 | if (kwc.getCombinedKeyphraseMiIdfWeight(a, kp) > 0.0) { 68 | e1keyphrases.put(kwc.getKeyphraseForId(kp), kwc.getCombinedKeyphraseMiIdfWeight(a, kp)); 69 | } 70 | } 71 | e1keyphrases = CollectionUtils.sortMapByValue(e1keyphrases, true); 72 | 73 | Map e2keyphrases = new HashMap(); 74 | for (int kp : kpsB) { 75 | if (kwc.getCombinedKeyphraseMiIdfWeight(b, kp) > 0.0) { 76 | e2keyphrases.put(kwc.getKeyphraseForId(kp), kwc.getCombinedKeyphraseMiIdfWeight(b, kp)); 77 | } 78 | } 79 | e2keyphrases = CollectionUtils.sortMapByValue(e2keyphrases, true); 80 | 81 | tracer.eeTracing().addEntityContext(a.getName(), e1keyphrases); 82 | tracer.eeTracing().addEntityContext(b.getName(), e2keyphrases); 83 | 84 | KeytermEntityEntityMeasureTracer mt = new KeytermEntityEntityMeasureTracer("PartialKeyphraseSim", 0.0, e2keyphrases, matches); 85 | mt.setScore(sim); 86 | tracer.eeTracing().addEntityEntityMeasureTracer(a.getName(), b.getName(), mt); 87 | 88 | KeytermEntityEntityMeasureTracer mt2 = new KeytermEntityEntityMeasureTracer("PartialKeyphraseSim", 0.0, e1keyphrases, 
matches); 89 | mt2.setScore(sim); 90 | tracer.eeTracing().addEntityEntityMeasureTracer(b.getName(), a.getName(), mt2); 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/measure/JaccardSimilarityMeasure.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.measure; 2 | 3 | import gnu.trove.set.hash.TIntHashSet; 4 | import mpi.aida.AidaManager; 5 | import mpi.aida.data.Context; 6 | import mpi.aida.data.Entity; 7 | import mpi.aida.data.Mention; 8 | import mpi.aida.graph.similarity.context.EntitiesContext; 9 | import mpi.experiment.trace.Tracer; 10 | 11 | public class JaccardSimilarityMeasure extends MentionEntitySimilarityMeasure { 12 | 13 | public JaccardSimilarityMeasure(Tracer tracer) { 14 | super(tracer); 15 | } 16 | 17 | @Override 18 | public double calcSimilarity(Mention mention, Context context, Entity entity, EntitiesContext entitiesContext) { 19 | TIntHashSet contextA = new TIntHashSet(context.getTokenIds()); 20 | TIntHashSet contextB = new TIntHashSet(entitiesContext.getContext(entity)); 21 | 22 | TIntHashSet union = getUnion(contextA, contextB); 23 | TIntHashSet intersection = getIntersection(contextA, contextB); 24 | 25 | double jaccardSim = (double) intersection.size() / (double) union.size(); 26 | return jaccardSim; 27 | } 28 | 29 | private TIntHashSet getIntersection(TIntHashSet contextA, TIntHashSet contextB) { 30 | TIntHashSet is = new TIntHashSet(); 31 | 32 | for (int a : contextA.toArray()) { 33 | if (contextB.contains(a) || contextB.contains(AidaManager.expandTerm(a))) { 34 | is.add(a); 35 | } 36 | } 37 | 38 | return is; 39 | } 40 | 41 | private TIntHashSet getUnion(TIntHashSet contextA, TIntHashSet contextB) { 42 | TIntHashSet union = new TIntHashSet(); 43 | 44 | for (int a : contextB.toArray()) { 45 | union.add(a); 46 | } 47 | 48 | for (int a : contextA.toArray()) { 49 | if (!union.contains(a) && 
!union.contains(AidaManager.expandTerm(a))) { 50 | union.add(a); 51 | } 52 | } 53 | 54 | return union; 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/measure/MentionEntitySimilarityMeasure.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.measure; 2 | 3 | import mpi.aida.data.Context; 4 | import mpi.aida.data.Entity; 5 | import mpi.aida.data.Mention; 6 | import mpi.aida.graph.similarity.context.EntitiesContext; 7 | import mpi.experiment.trace.Tracer; 8 | 9 | public abstract class MentionEntitySimilarityMeasure extends SimilarityMeasure { 10 | 11 | public MentionEntitySimilarityMeasure(Tracer tracer) { 12 | super(tracer); 13 | } 14 | 15 | protected boolean useDistanceDiscount = false; 16 | 17 | public boolean isUseDistanceDiscount() { 18 | return useDistanceDiscount; 19 | } 20 | 21 | public void setUseDistanceDiscount(boolean useDistanceDiscount) { 22 | this.useDistanceDiscount = useDistanceDiscount; 23 | } 24 | 25 | public abstract double calcSimilarity(Mention mention, Context context, Entity entity, EntitiesContext entitiesContext); 26 | 27 | /** 28 | * This method is a place holder to enable the framework to add extra context to a specific mention 29 | * during the processing of the code 30 | * subclasses should override this method accordingly 31 | * @param context the context to add 32 | */ 33 | 34 | /** 35 | * This method is a place holder to enable the framework to add extra context to a specific mention 36 | * during the processing of the code 37 | * subclasses should override this method accordingly 38 | * 39 | * @param mention the mention to which this context belongs 40 | * @param context the context to add 41 | */ 42 | public void addExtraContext(Mention mention, Object context) { 43 | return; 44 | } 45 | 46 | 47 | /** 48 | * This method is a place holder to enable the framework to announce when a 
mention gets assigned to an entity 49 | * different measures may perform different upon such event. 50 | * default implementation is doing nothing 51 | * 52 | * @param mention the mention that was assigned 53 | * @param entity the entity to which the mention got assigned 54 | */ 55 | public void announceMentionAssignment(Mention mention, Entity entity) { 56 | return; 57 | } 58 | 59 | } 60 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/measure/MilneWittenEntityEntitySimilarity.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.measure; 2 | 3 | import gnu.trove.iterator.TIntObjectIterator; 4 | import gnu.trove.map.hash.TIntObjectHashMap; 5 | import javaewah.EWAHCompressedBitmap; 6 | import mpi.aida.access.DataAccess; 7 | import mpi.aida.data.Entities; 8 | import mpi.aida.data.Entity; 9 | import mpi.aida.graph.similarity.EntityEntitySimilarity; 10 | import mpi.aida.graph.similarity.context.EntitiesContext; 11 | import mpi.aida.util.YagoUtil; 12 | 13 | import org.slf4j.Logger; 14 | import org.slf4j.LoggerFactory; 15 | 16 | public class MilneWittenEntityEntitySimilarity extends EntityEntitySimilarity { 17 | private static final Logger logger = 18 | LoggerFactory.getLogger(MilneWittenEntityEntitySimilarity.class); 19 | 20 | private TIntObjectHashMap entity2vector; 21 | 22 | 23 | public MilneWittenEntityEntitySimilarity(EntityEntitySimilarityMeasure similarityMeasure, EntitiesContext entityContext) throws Exception { 24 | // not needed - uses entites directly 25 | super(similarityMeasure, entityContext); 26 | 27 | setupEntities(entityContext.getEntities()); 28 | } 29 | 30 | private void setupEntities(Entities entities) throws Exception { 31 | logger.info("Initializing MilneWittenEntityEntitySimilarity for " + 32 | entities.uniqueNameSize() + " entities"); 33 | 34 | TIntObjectHashMap entityInlinks = 35 | 
DataAccess.getInlinkNeighbors(entities); 36 | 37 | entity2vector = new TIntObjectHashMap(); 38 | 39 | for (TIntObjectIterator itr = entityInlinks.iterator(); 40 | itr.hasNext(); ) { 41 | itr.advance(); 42 | int entity = itr.key(); 43 | int[] inLinks = itr.value(); 44 | 45 | EWAHCompressedBitmap bs = new EWAHCompressedBitmap(); 46 | for (int l : inLinks) { 47 | bs.set(l); 48 | } 49 | entity2vector.put(entity, bs); 50 | } 51 | 52 | logger.info("Done initializing MilneWittenEntityEntitySimilarity for " + 53 | entities.uniqueNameSize() + " entities"); 54 | } 55 | 56 | @Override 57 | public double calcSimilarity(Entity a, Entity b) throws Exception { 58 | EWAHCompressedBitmap bsA = entity2vector.get(a.getId()); 59 | EWAHCompressedBitmap bsB = entity2vector.get(b.getId()); 60 | 61 | double sizeA = bsA.cardinality(); 62 | double sizeB = bsB.cardinality(); 63 | 64 | double max = -1.0; 65 | double min = -1.0; 66 | 67 | if (sizeA >= sizeB) { 68 | max = sizeA; 69 | min = sizeB; 70 | } else { 71 | max = sizeB; 72 | min = sizeA; 73 | } 74 | 75 | double sim = 0.0; // default is no sim 76 | 77 | int overlap = bsA.andCardinality(bsB); 78 | 79 | if (overlap > 0) { 80 | // now calc the real similarity 81 | double distance = (Math.log(max) - Math.log((double) overlap)) / (Math.log(YagoUtil.TOTAL_YAGO_ENTITIES) - Math.log(min)); 82 | 83 | sim = 1 - distance; 84 | 85 | if (distance > 1.0) { 86 | // really far apart ... 
87 | sim = 0.0; 88 | } 89 | } 90 | 91 | return sim; 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/measure/NGDSimilarityMeasure.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.measure; 2 | 3 | import gnu.trove.set.hash.TIntHashSet; 4 | import mpi.aida.data.Entity; 5 | import mpi.aida.graph.similarity.context.EntitiesContext; 6 | import mpi.aida.graph.similarity.context.WeightedKeyphrasesContext; 7 | import mpi.aida.util.YagoUtil; 8 | import mpi.experiment.trace.Tracer; 9 | 10 | public class NGDSimilarityMeasure extends EntityEntitySimilarityMeasure { 11 | 12 | public NGDSimilarityMeasure(Tracer tracer) { 13 | super(tracer); 14 | } 15 | 16 | protected WeightedKeyphrasesContext kwc; 17 | 18 | @Override 19 | public double calcSimilarity(Entity a, Entity b, EntitiesContext entitiesContext) { 20 | kwc = (WeightedKeyphrasesContext) entitiesContext; 21 | 22 | double max = getMax(a, b, entitiesContext); 23 | double min = getMin(a, b, entitiesContext); 24 | double intersect = getIntersect(a, b, entitiesContext); 25 | double collection = getCollection(); 26 | 27 | double sim = 0.0; 28 | 29 | if (intersect > 0) { 30 | double ngd = 31 | ( Math.log(max) - Math.log(intersect) ) 32 | / ( Math.log(collection) - Math.log(min) ); 33 | sim = 1 - ngd; 34 | if (sim < 0) sim = 0.0; 35 | } 36 | 37 | return sim; 38 | } 39 | 40 | protected double getMax(Entity a, Entity b, EntitiesContext entitiesContext) { 41 | int[] e1context = kwc.getContext(a); 42 | int[] e2context = kwc.getContext(b); 43 | 44 | return Math.max(e1context.length, e2context.length); 45 | } 46 | 47 | protected double getMin(Entity a, Entity b, EntitiesContext entitiesContext) { 48 | int[] e1context = kwc.getContext(a); 49 | int[] e2context = kwc.getContext(b); 50 | 51 | return Math.min(e1context.length, e2context.length); 52 | } 53 | 54 | protected double 
getIntersect(Entity a, Entity b, EntitiesContext entitiesContext) { 55 | TIntHashSet e1context = new TIntHashSet(kwc.getContext(a)); 56 | TIntHashSet e2context = new TIntHashSet(kwc.getContext(b)); 57 | 58 | e1context.retainAll(e2context); 59 | int intersectSize = e1context.size(); 60 | return (double) intersectSize; 61 | } 62 | 63 | protected double getCollection() { 64 | return ((double) YagoUtil.TOTAL_YAGO_ENTITIES); 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/measure/NormalizedKeyphrasesBasedIDFSimilarity.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.measure; 2 | 3 | import mpi.experiment.trace.Tracer; 4 | 5 | public class NormalizedKeyphrasesBasedIDFSimilarity extends UnnormalizedKeyphrasesBasedIDFSimilarity { 6 | 7 | public NormalizedKeyphrasesBasedIDFSimilarity(Tracer tracer) { 8 | super(tracer); 9 | normalize = true; 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/measure/NormalizedKeyphrasesBasedMISimilarity.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.measure; 2 | 3 | import mpi.experiment.trace.Tracer; 4 | 5 | public class NormalizedKeyphrasesBasedMISimilarity extends UnnormalizedKeyphrasesBasedMISimilarity { 6 | 7 | public NormalizedKeyphrasesBasedMISimilarity(Tracer tracer) { 8 | super(tracer); 9 | normalize = true; 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/measure/NullEntityEntitySimilarityMeasure.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.measure; 2 | 3 | import mpi.aida.data.Entity; 4 | import mpi.aida.graph.similarity.context.EntitiesContext; 5 | import 
mpi.experiment.trace.Tracer; 6 | 7 | public class NullEntityEntitySimilarityMeasure extends EntityEntitySimilarityMeasure { 8 | 9 | public NullEntityEntitySimilarityMeasure(Tracer tracer) { 10 | super(tracer); 11 | } 12 | 13 | @Override 14 | public double calcSimilarity(Entity a, Entity b, EntitiesContext context) { 15 | return -1; 16 | } 17 | 18 | } 19 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/measure/NullMentionEntittySimilarityMeasure.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.measure; 2 | 3 | import mpi.aida.data.Context; 4 | import mpi.aida.data.Entity; 5 | import mpi.aida.data.Mention; 6 | import mpi.aida.graph.similarity.context.EntitiesContext; 7 | import mpi.experiment.trace.Tracer; 8 | 9 | 10 | public class NullMentionEntittySimilarityMeasure extends MentionEntitySimilarityMeasure { 11 | 12 | public NullMentionEntittySimilarityMeasure(Tracer tracer) { 13 | super(tracer); 14 | } 15 | 16 | @Override 17 | public double calcSimilarity(Mention mention, Context context, Entity entity, EntitiesContext entitiesContext) { 18 | return 0; 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/measure/SimilarityMeasure.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.measure; 2 | 3 | import mpi.experiment.trace.Tracer; 4 | 5 | public abstract class SimilarityMeasure { 6 | 7 | protected Tracer tracer = null; 8 | 9 | public SimilarityMeasure(Tracer tracer) { 10 | this.tracer = tracer; 11 | } 12 | 13 | public String toString() { 14 | return getIdentifier(); 15 | } 16 | 17 | public String getIdentifier() { 18 | String id = this.getClass().getSimpleName(); 19 | return id; 20 | } 21 | 22 | public Tracer getTracer() { 23 | return tracer; 24 | } 25 | } 26 | 
-------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/measure/TfIdfCosineSimilarityMeasure.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.measure; 2 | 3 | import gnu.trove.map.hash.TIntDoubleHashMap; 4 | import gnu.trove.map.hash.TIntIntHashMap; 5 | import gnu.trove.set.hash.TIntHashSet; 6 | import mpi.aida.AidaManager; 7 | import mpi.aida.access.DataAccess; 8 | import mpi.aida.data.Context; 9 | import mpi.aida.data.Entity; 10 | import mpi.aida.data.Mention; 11 | import mpi.aida.graph.similarity.context.EntitiesContext; 12 | import mpi.aida.util.YagoUtil; 13 | import mpi.experiment.trace.Tracer; 14 | 15 | /** 16 | * Calculates the similarity of two contexts by the cosine similarity 17 | * of their tf.idf weighted term vectors. 18 | * 19 | * 20 | */ 21 | public class TfIdfCosineSimilarityMeasure extends MentionEntitySimilarityMeasure { 22 | 23 | public TfIdfCosineSimilarityMeasure(Tracer tracer) { 24 | super(tracer); 25 | } 26 | 27 | @Override 28 | public double calcSimilarity(Mention mention, Context context, Entity entity, EntitiesContext entitiesContext) { 29 | TIntDoubleHashMap contextVec = getTfIdfVector(context.getTokenIds()); 30 | TIntDoubleHashMap entityVec = getTfIdfVector(entitiesContext.getContext(entity)); 31 | 32 | double sim = calcCosine(entityVec, contextVec); 33 | return sim; 34 | } 35 | 36 | protected double calcCosine(TIntDoubleHashMap entityVec, TIntDoubleHashMap contextVec) { 37 | double dotProduct = 0.0; 38 | 39 | for (int termA : entityVec.keys()) { 40 | int expandedA = AidaManager.expandTerm(termA); 41 | if (contextVec.containsKey(termA)) { 42 | double tempProduct = entityVec.get(termA) * contextVec.get(termA); 43 | dotProduct += tempProduct; 44 | } 45 | if (contextVec.containsKey(expandedA)) { 46 | double tempProduct = entityVec.get(termA) * contextVec.get(expandedA); 47 | dotProduct += tempProduct; 48 | } 49 
| } 50 | 51 | double normA = 0.0; 52 | for (double weightA : entityVec.values()) { 53 | normA += weightA * weightA; 54 | } 55 | normA = Math.sqrt(normA); 56 | 57 | double normB = 0.0; 58 | for (double weightB : contextVec.values()) { 59 | normB += weightB * weightB; 60 | } 61 | normB = Math.sqrt(normB); 62 | 63 | double sim = 0.0; 64 | 65 | if (normA * normB != 0) { 66 | sim = dotProduct / (normA * normB); 67 | } 68 | 69 | return sim; 70 | } 71 | 72 | private TIntDoubleHashMap getTfIdfVector(int[] is) { 73 | TIntDoubleHashMap vector = new TIntDoubleHashMap(); 74 | 75 | TIntIntHashMap tfs = new TIntIntHashMap(); 76 | 77 | for (int term : is) { 78 | tfs.adjustOrPutValue(term, 1, 1); 79 | } 80 | 81 | TIntIntHashMap termDFs = 82 | DataAccess.getKeywordDocumentFrequencies(new TIntHashSet(is)); 83 | 84 | for (int term : new TIntHashSet(is).toArray()) { 85 | int tf = tfs.get(term); 86 | int df = termDFs.get(term); 87 | if (df == 0) df = YagoUtil.TOTAL_YAGO_ENTITIES; // default smoothing 88 | 89 | double tfIdf = 90 | (double) tf 91 | * log2((double) YagoUtil.TOTAL_YAGO_ENTITIES / (double) df); 92 | 93 | vector.put(term, tfIdf); 94 | } 95 | 96 | return vector; 97 | } 98 | 99 | public static double log2(double x) { 100 | return Math.log(x) / Math.log(2); 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/measure/UnnormalizedKeyphrasesBasedIDFSimilarity.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.measure; 2 | 3 | import mpi.aida.data.Entity; 4 | import mpi.experiment.trace.Tracer; 5 | 6 | public class UnnormalizedKeyphrasesBasedIDFSimilarity extends UnnormalizedKeyphrasesBasedMISimilarity { 7 | 8 | public UnnormalizedKeyphrasesBasedIDFSimilarity(Tracer tracer) { 9 | super(tracer); 10 | } 11 | 12 | protected double getKeywordScore(Entity entity, int keyword) { 13 | return 
keyphrasesContext.getKeywordIDFWeight(keyword); 14 | } 15 | 16 | public String getIdentifier() { 17 | String identifier = "UnnormalizedKeyphrasesBasedIDFSimilarity"; 18 | 19 | if (isUseDistanceDiscount()) { 20 | identifier += ",i"; 21 | } 22 | 23 | return identifier; 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/measure/WeightedJaccardEntityEntitySimilarityMeasure.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.measure; 2 | 3 | import gnu.trove.set.hash.TIntHashSet; 4 | 5 | import java.util.HashMap; 6 | import java.util.Map; 7 | 8 | import mpi.aida.data.Entity; 9 | import mpi.aida.graph.similarity.context.EntitiesContext; 10 | import mpi.aida.graph.similarity.context.WeightedKeyphrasesContext; 11 | import mpi.experiment.trace.Tracer; 12 | 13 | public class WeightedJaccardEntityEntitySimilarityMeasure extends EntityEntitySimilarityMeasure { 14 | 15 | public WeightedJaccardEntityEntitySimilarityMeasure(Tracer tracer) { 16 | super(tracer); 17 | } 18 | 19 | @Override 20 | public double calcSimilarity(Entity a, Entity b, EntitiesContext context) { 21 | WeightedKeyphrasesContext kpc = (WeightedKeyphrasesContext) context; 22 | 23 | TIntHashSet contextA = new TIntHashSet(kpc.getEntityKeyphraseIds(a)); 24 | TIntHashSet contextB = new TIntHashSet(kpc.getEntityKeyphraseIds(b)); 25 | 26 | double intersection = getIntersection(a, contextA, b, contextB, kpc); 27 | double union = getUnion(a, contextA, b, contextB, kpc); 28 | 29 | double jaccardSim = intersection / union; 30 | 31 | return jaccardSim; 32 | } 33 | 34 | private double getIntersection(Entity a, TIntHashSet contextA, Entity b, TIntHashSet contextB, WeightedKeyphrasesContext kpc) { 35 | double intersectWeight = 0.0; 36 | 37 | for (int k : contextA.toArray()) { 38 | if (contextB.contains(k)) { 39 | intersectWeight += 
Math.min(kpc.getCombinedKeyphraseMiIdfWeight(a, k), kpc.getCombinedKeyphraseMiIdfWeight(b, k)); 40 | } 41 | } 42 | 43 | return intersectWeight; 44 | } 45 | 46 | private double getUnion(Entity a, TIntHashSet contextA, Entity b, TIntHashSet contextB, WeightedKeyphrasesContext kpc) { 47 | Map weights = new HashMap(); 48 | 49 | for (int k : contextA.toArray()) { 50 | weights.put(k, kpc.getCombinedKeyphraseMiIdfWeight(a, k)); 51 | } 52 | 53 | for (int k : contextB.toArray()) { 54 | Double kwbWeight = kpc.getCombinedKeyphraseMiIdfWeight(b, k); 55 | Double kwaWeight = weights.get(k); 56 | 57 | if (kwaWeight != null) { 58 | weights.put(k, Math.max(kwaWeight, kwbWeight)); 59 | } else { 60 | weights.put(k, kwbWeight); 61 | } 62 | } 63 | 64 | double unionWeight = 0.0; 65 | 66 | for (Double d : weights.values()) { 67 | unionWeight += d; 68 | } 69 | 70 | return unionWeight; 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/measure/WeightedNGDSimilarityMeasure.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.measure; 2 | 3 | import gnu.trove.set.hash.TIntHashSet; 4 | import mpi.aida.data.Entity; 5 | import mpi.aida.graph.similarity.context.EntitiesContext; 6 | import mpi.aida.util.YagoUtil; 7 | import mpi.experiment.trace.Tracer; 8 | 9 | 10 | public class WeightedNGDSimilarityMeasure extends NGDSimilarityMeasure { 11 | 12 | public WeightedNGDSimilarityMeasure(Tracer tracer) { 13 | super(tracer); 14 | } 15 | 16 | @Override 17 | protected double getMax(Entity a, Entity b, EntitiesContext entitiesContext) { 18 | int[] e1context = kwc.getEntityKeyphraseIds(a); 19 | int[] e2context = kwc.getEntityKeyphraseIds(b); 20 | 21 | double e1weight = 0.0; 22 | for (int kp : e1context) { 23 | e1weight += kwc.getCombinedKeyphraseMiIdfWeight(a, kp); 24 | } 25 | 26 | double e2weight = 0.0; 27 | for (int kp : e2context) { 28 | e2weight += 
kwc.getCombinedKeyphraseMiIdfWeight(b, kp); 29 | } 30 | 31 | return Math.max(e1weight, e2weight); 32 | } 33 | 34 | @Override 35 | protected double getMin(Entity a, Entity b, EntitiesContext entitiesContext) { 36 | int[] e1context = kwc.getEntityKeyphraseIds(a); 37 | int[] e2context = kwc.getEntityKeyphraseIds(b); 38 | 39 | double e1weight = 0.0; 40 | for (int kp : e1context) { 41 | e1weight += kwc.getCombinedKeyphraseMiIdfWeight(a, kp); 42 | } 43 | 44 | double e2weight = 0.0; 45 | for (int kp : e2context) { 46 | e2weight += kwc.getCombinedKeyphraseMiIdfWeight(b, kp); 47 | } 48 | 49 | return Math.min(e1weight, e2weight); 50 | } 51 | 52 | @Override 53 | protected double getIntersect(Entity a, Entity b, EntitiesContext entitiesContext) { 54 | int[] e1context = kwc.getEntityKeyphraseIds(a); 55 | int[] e2context = kwc.getEntityKeyphraseIds(b); 56 | 57 | TIntHashSet e1forIntersect = new TIntHashSet(e1context); 58 | TIntHashSet e2forIntersect = new TIntHashSet(e2context); 59 | e1forIntersect.retainAll(e2forIntersect); 60 | 61 | double intersectWeight = 0.0; 62 | 63 | for (int kp : e1forIntersect.toArray()) { 64 | intersectWeight += kwc.getCombinedKeyphraseMiIdfWeight(a, kp); 65 | intersectWeight += kwc.getCombinedKeyphraseMiIdfWeight(b, kp); 66 | } 67 | 68 | // everthing was counted twice 69 | intersectWeight /= 2; 70 | 71 | return intersectWeight; 72 | } 73 | 74 | @Override 75 | protected double getCollection() { 76 | return YagoUtil.TOTAL_YAGO_ENTITIES; 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/measure/WordCountVectorDotProductSimilarityMeasure.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.measure; 2 | 3 | import gnu.trove.iterator.TIntIntIterator; 4 | import gnu.trove.map.hash.TIntIntHashMap; 5 | import mpi.aida.AidaManager; 6 | import mpi.aida.data.Context; 7 | import mpi.aida.data.Entity; 8 | import 
mpi.aida.data.Mention; 9 | import mpi.aida.graph.similarity.context.EntitiesContext; 10 | import mpi.experiment.trace.Tracer; 11 | 12 | /** 13 | * This class calculates the similarity between a mention and an 14 | * entity context by a dot product between the word count vectors. 15 | * 16 | * 17 | */ 18 | public class WordCountVectorDotProductSimilarityMeasure extends MentionEntitySimilarityMeasure { 19 | 20 | public WordCountVectorDotProductSimilarityMeasure(Tracer tracer) { 21 | super(tracer); 22 | } 23 | 24 | @Override 25 | public double calcSimilarity(Mention mention, Context context, Entity entity, EntitiesContext entitiesContext) { 26 | // create two Maps representing the word count vectors 27 | TIntIntHashMap contextVec = createWordCountVector(context.getTokenIds()); 28 | TIntIntHashMap entityVec = createWordCountVector(entitiesContext.getContext(entity)); 29 | 30 | // calc dot product between them 31 | double similarity = calcDotProduct(entityVec, contextVec); 32 | return similarity; 33 | } 34 | 35 | private TIntIntHashMap createWordCountVector(int[] is) { 36 | TIntIntHashMap wordCountVector = new TIntIntHashMap(); 37 | 38 | for (int word : is) { 39 | wordCountVector.adjustOrPutValue(word, 1, 1); 40 | } 41 | 42 | return wordCountVector; 43 | } 44 | 45 | private double calcDotProduct( 46 | TIntIntHashMap entityVec, TIntIntHashMap contextVec) { 47 | int dotProduct = 0; 48 | 49 | for (TIntIntIterator it = entityVec.iterator(); it.hasNext(); ) { 50 | it.advance(); 51 | int wordA = it.key(); 52 | 53 | int expandedA = AidaManager.expandTerm(wordA); 54 | 55 | // get counts of word in both vectors 56 | int wordAcount = entityVec.get(wordA); 57 | int wordBcount = contextVec.get(wordA); 58 | 59 | wordBcount += contextVec.get(expandedA); // add expanded count if available 60 | 61 | int temp = wordAcount * wordBcount; 62 | dotProduct += temp; 63 | } 64 | 65 | return dotProduct; 66 | } 67 | } 68 | 
-------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/util/EntitiesContextCreator.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.util; 2 | 3 | import java.util.HashMap; 4 | import java.util.LinkedList; 5 | import java.util.List; 6 | import java.util.Map; 7 | import java.util.concurrent.locks.Lock; 8 | import java.util.concurrent.locks.ReentrantLock; 9 | 10 | import mpi.aida.data.Entities; 11 | import mpi.aida.graph.similarity.context.EntitiesContext; 12 | 13 | /** 14 | * Caches entity contexts based on the context id and document id. 15 | * Assumes distinct document ids and caches up to ecc contexts. 16 | * 17 | * 18 | */ 19 | public class EntitiesContextCreator { 20 | /** Has to be at least 1. */ 21 | private static final int CACHE_SIZE = 10; 22 | 23 | /** Holds the cached EntityContexts. */ 24 | private Map cache = 25 | new HashMap(); 26 | 27 | /** 28 | * Keeps the order in which the EntityContexts were created for 29 | * discarding the least recently used on cache overflow. 30 | */ 31 | private List cacheIds = new LinkedList(); 32 | 33 | /** 34 | * Synchronized the creation of different contexts. Allows the parallel 35 | * creation of contexts for distinct documents but blocks for requests 36 | * of the same context. 
37 | */ 38 | private Map contextCreationLocks = new HashMap(); 39 | 40 | private static class EntitiesContextCreatorHolder { 41 | public static EntitiesContextCreator ecc = new EntitiesContextCreator(); 42 | } 43 | 44 | public static EntitiesContextCreator getEntitiesContextCache() { 45 | return EntitiesContextCreatorHolder.ecc; 46 | } 47 | 48 | public EntitiesContext getEntitiesContext( 49 | String contextClassName, String docId, Entities entities) 50 | throws Exception { 51 | 52 | String id = getCacheId(contextClassName, docId); 53 | 54 | // Allow the parallel creation of distinct contexts but only 55 | // one creation per id. 56 | Lock contextLock = getContextCreationLock(id); 57 | contextLock.lock(); 58 | EntitiesContext context = null; 59 | try { 60 | context = cache.get(id); 61 | 62 | if (context == null) { 63 | // Create context. 64 | context = 65 | (EntitiesContext) 66 | Class.forName(contextClassName). 67 | getDeclaredConstructor(Entities.class).newInstance(entities); 68 | 69 | // Put it into the cache, deleting the oldest cache if the cache 70 | // size is exceeded. 71 | synchronized(cache) { 72 | cache.put(id, context); 73 | cacheIds.add(id); 74 | 75 | if (cacheIds.size() > CACHE_SIZE) { 76 | String removedId = cacheIds.get(0); 77 | cacheIds.remove(0); 78 | cache.remove(removedId); 79 | } 80 | } 81 | } 82 | } catch (Exception e) { 83 | throw e; 84 | } finally { 85 | contextLock.unlock(); 86 | } 87 | 88 | // Will be null if something goes wrong in the creation process. 
89 | return context; 90 | } 91 | 92 | private String getCacheId(String contextClassName, String docId) { 93 | return contextClassName + "\t" + docId; 94 | } 95 | 96 | private synchronized Lock getContextCreationLock(String id) { 97 | Lock lock = contextCreationLocks.get(id); 98 | if (lock == null) { 99 | lock = new ReentrantLock(); 100 | contextCreationLocks.put(id, lock); 101 | } 102 | return lock; 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/util/MaxMinSettings.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.util; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.FileReader; 5 | import java.io.IOException; 6 | import java.io.Serializable; 7 | import java.util.HashMap; 8 | import java.util.Map; 9 | 10 | public class MaxMinSettings implements Serializable { 11 | 12 | private static final long serialVersionUID = -3088993650033149824L; 13 | 14 | Map minMaxs; 15 | 16 | public MaxMinSettings(String propertiesFilePath) 17 | throws NumberFormatException, IOException { 18 | minMaxs = new HashMap(); 19 | 20 | BufferedReader reader = 21 | new BufferedReader(new FileReader(propertiesFilePath)); 22 | try { for (String line = reader.readLine(); line != null; line = reader.readLine()) { // each line: feature=min max 23 | String[] data = line.split("="); 24 | 25 | double min = Double.parseDouble(data[1].split(" ")[0]); 26 | double max = Double.parseDouble(data[1].split(" ")[1]); 27 | 28 | minMaxs.put(data[0], new double[] { min, max }); 29 | } 30 | } finally { reader.close(); } // release the file handle even when a line fails to parse 31 | } 32 | 33 | public MaxMinSettings(Map minMaxs) { 34 | this.minMaxs = minMaxs; 35 | } 36 | 37 | public double getMin(String featureName) { 38 | if (!minMaxs.containsKey(featureName)) { 39 | throw new IllegalArgumentException("No min for '"+featureName+"'"); 40 | } 41 | return minMaxs.get(featureName)[0]; 42 | } 43 | 44 | public double getMax(String featureName) { 45 | if
(!minMaxs.containsKey(featureName)) { 46 | throw new IllegalArgumentException("No max for '"+featureName+"'"); 47 | } 48 | return minMaxs.get(featureName)[1]; 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/util/ParallelEntityEntityRelatednessComputation.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.util; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Collections; 5 | import java.util.HashMap; 6 | import java.util.HashSet; 7 | import java.util.LinkedList; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.Set; 11 | import java.util.concurrent.CountDownLatch; 12 | 13 | import mpi.aida.config.AidaConfig; 14 | import mpi.aida.data.Entities; 15 | import mpi.aida.data.Entity; 16 | import mpi.aida.data.Mention; 17 | import mpi.aida.data.Mentions; 18 | import mpi.aida.graph.similarity.EnsembleEntityEntitySimilarity; 19 | 20 | 21 | public class ParallelEntityEntityRelatednessComputation { 22 | private int numThreads = 4; // default. 
23 | private long totalNumCalcs = 0; // this is only valid if the object is created anew for each entitiy set - used for timing experiments 24 | 25 | public ParallelEntityEntityRelatednessComputation() { 26 | this(Integer.parseInt(AidaConfig.get(AidaConfig.EE_NUM_THREADS))); 27 | } 28 | 29 | public ParallelEntityEntityRelatednessComputation(int numThreads) { 30 | this.numThreads = numThreads; 31 | } 32 | 33 | public Map> computeRelatedness(EnsembleEntityEntitySimilarity entitySimilarity, Entities entities) throws InterruptedException { 34 | return computeRelatedness(entitySimilarity, entities, null); 35 | } 36 | 37 | public Map> computeRelatedness(EnsembleEntityEntitySimilarity entitySimilarity, Entities entities, Mentions mentions) throws InterruptedException { 38 | Map> entityEntitySimilarities = Collections.synchronizedMap(new HashMap>()); 39 | 40 | Map> entityMentionsMap = null; 41 | if (mentions != null) { 42 | entityMentionsMap = prepareEntityMentionsMap(mentions); 43 | } 44 | 45 | List> entityPartitions = new LinkedList>(); 46 | List allEntities = new ArrayList(entities.getEntities()); 47 | 48 | int overall = 0; 49 | Set part = null; 50 | int partSize = entities.uniqueNameSize() / numThreads; 51 | 52 | for (int currentPart = 0; currentPart < numThreads; currentPart++) { 53 | part = new HashSet(); 54 | entityPartitions.add(part); 55 | 56 | for (int j = 0; j < partSize; j++) { 57 | int total = (currentPart * partSize) + j; 58 | part.add(allEntities.get(total)); 59 | 60 | overall++; 61 | } 62 | } 63 | 64 | // add rest to last part 65 | for (; overall < allEntities.size(); overall++) { 66 | part.add(allEntities.get(overall)); 67 | } 68 | 69 | // create threads and run 70 | CountDownLatch cdl = new CountDownLatch(numThreads); 71 | 72 | List scs = new LinkedList(); 73 | 74 | for (int i = 0; i < numThreads; i++) { 75 | ParallelEntityEntityRelatednessComputationThread sc = new ParallelEntityEntityRelatednessComputationThread(entityPartitions.get(i), entities, 
entitySimilarity, entityEntitySimilarities, entityMentionsMap, cdl); 76 | scs.add(sc); 77 | sc.start(); 78 | } 79 | 80 | // wait for calculation to finish 81 | cdl.await(); 82 | 83 | // sum up total number of calculations 84 | for (ParallelEntityEntityRelatednessComputationThread sc : scs) { 85 | totalNumCalcs += sc.getNumCalcs(); 86 | } 87 | 88 | return entityEntitySimilarities; 89 | } 90 | 91 | private Map> prepareEntityMentionsMap(Mentions mentions) { 92 | Map> entityMentionsMap = new HashMap>(); 93 | 94 | for (int i = 0; i < mentions.getMentions().size(); i++) { 95 | Mention mention = mentions.getMentions().get(i); 96 | Entities entities = mention.getCandidateEntities(); 97 | for (Entity entity : entities) { 98 | List entityMentions = entityMentionsMap.get(entity); 99 | if (entityMentions == null) { 100 | entityMentions = new LinkedList(); 101 | entityMentionsMap.put(entity, entityMentions); 102 | } 103 | entityMentions.add(mention); 104 | } 105 | } 106 | 107 | return entityMentionsMap; 108 | } 109 | 110 | public long getTotalNumCalcs() { 111 | return totalNumCalcs; 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /src/mpi/aida/graph/similarity/util/ParallelEntityEntityRelatednessComputationThread.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.util; 2 | 3 | import java.util.HashMap; 4 | import java.util.HashSet; 5 | import java.util.List; 6 | import java.util.Map; 7 | import java.util.Set; 8 | import java.util.concurrent.CountDownLatch; 9 | 10 | import mpi.aida.data.Entities; 11 | import mpi.aida.data.Entity; 12 | import mpi.aida.data.Mention; 13 | import mpi.aida.graph.similarity.EnsembleEntityEntitySimilarity; 14 | 15 | import org.slf4j.Logger; 16 | import org.slf4j.LoggerFactory; 17 | 18 | 19 | public class ParallelEntityEntityRelatednessComputationThread extends Thread { 20 | private static final Logger logger = 21 | 
LoggerFactory.getLogger(ParallelEntityEntityRelatednessComputationThread.class); 22 | 23 | private Set partition; 24 | private Entities allEntities; 25 | private EnsembleEntityEntitySimilarity eeSimMeasure; 26 | private Map> entityEntitySimilarities; 27 | private Map> entityMentionsMap; 28 | private CountDownLatch cdl; 29 | private int numCalcs = 0; 30 | 31 | public ParallelEntityEntityRelatednessComputationThread(Set partition, Entities allEntities, EnsembleEntityEntitySimilarity eeSim, Map> entityEntitySimilarities, Map> entityMentionsMap, CountDownLatch cdl) { 32 | this.partition = partition; 33 | this.allEntities = allEntities; 34 | this.eeSimMeasure = eeSim; 35 | this.entityEntitySimilarities = entityEntitySimilarities; 36 | this.entityMentionsMap = entityMentionsMap; 37 | this.cdl = cdl; 38 | } 39 | 40 | @Override 41 | public void run() { 42 | for (Entity e1 : partition) { 43 | for (Entity e2 : allEntities) { 44 | // only calculate and add if e1 < e2 (similarities are 45 | // symmetric, calculate in lexicographic order) 46 | if (e1.compareTo(e2) < 0) { 47 | double sim = 0.0; 48 | // calculate only if they belong to different mentions 49 | if (shouldCalculate(e1,e2)) { 50 | try { 51 | sim = eeSimMeasure.calcSimilarity(e1, e2); 52 | numCalcs++; 53 | // negative is not allowed 54 | if (sim < 0) { 55 | logger.warn("Coherence of '"+e1+"' and '"+e2+"' was < 0, set to 0"); 56 | sim = 0.0; 57 | } 58 | } catch (Exception e) { 59 | e.printStackTrace(); 60 | } 61 | } else { 62 | continue; 63 | } 64 | 65 | Map sims = entityEntitySimilarities.get(e1); 66 | if (sims == null) { 67 | sims = new HashMap(); 68 | entityEntitySimilarities.put(e1, sims); 69 | } 70 | sims.put(e2, sim); 71 | } 72 | } 73 | } 74 | cdl.countDown(); 75 | } 76 | 77 | public int getNumCalcs() { 78 | return numCalcs; 79 | } 80 | 81 | protected boolean shouldCalculate(Entity e1, Entity e2) { 82 | if (entityMentionsMap != null) { 83 | Set mentions1 = new HashSet(); 84 | 85 | for (Mention m : 
entityMentionsMap.get(e1)) { 86 | mentions1.add(m); 87 | } 88 | 89 | Set mentions2 = new HashSet(); 90 | 91 | for (Mention m : entityMentionsMap.get(e2)) { 92 | mentions2.add(m); 93 | } 94 | 95 | if (mentions1.size() != mentions2.size()) return true; 96 | 97 | for (Mention mention : mentions1) { 98 | if (!mentions2.contains(mention)) return true; 99 | } 100 | return false; 101 | } else { 102 | return true; 103 | } 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /src/mpi/aida/preparation/AidaTokenizerManager.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.preparation; 2 | 3 | import mpi.tokenizer.data.Tokenizer; 4 | import mpi.tokenizer.data.TokenizerManager; 5 | import mpi.tokenizer.data.Tokens; 6 | 7 | public class AidaTokenizerManager { 8 | public static void init() { 9 | TokenizerManager.init(); 10 | } 11 | 12 | public static Tokens tokenize(String docId, String text, Tokenizer.type type, boolean lemmatize) { 13 | Tokens tokens = TokenizerManager.parse(docId, text, type, lemmatize); 14 | return tokens; 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/mpi/aida/preparation/mentionrecognition/FilterMentions.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.preparation.mentionrecognition; 2 | 3 | import java.io.Serializable; 4 | import java.util.LinkedList; 5 | import java.util.List; 6 | 7 | import javatools.datatypes.Pair; 8 | import mpi.aida.data.Mentions; 9 | import mpi.aida.data.PreparedInput; 10 | import mpi.tokenizer.data.Token; 11 | import mpi.tokenizer.data.Tokens; 12 | 13 | public class FilterMentions implements Serializable { 14 | 15 | private static final long serialVersionUID = 6260499966421708963L; 16 | 17 | private NamedEntityFilter namedEntityFilter = null; 18 | 19 | private ManualFilter manualFilter = null; 20 | 21 | private 
HybridFilter hybridFilter = null; 22 | 23 | public FilterMentions() { 24 | namedEntityFilter = new NamedEntityFilter(); 25 | manualFilter = new ManualFilter(); 26 | hybridFilter = new HybridFilter(); 27 | } 28 | 29 | /** which type of tokens to get*/ 30 | public static enum FilterType { 31 | STANFORD_NER, Manual, ManualPOS, Manual_NER, Hybrid, None; 32 | }; 33 | 34 | public PreparedInput filter(String text, String docId, Tokens tokens, FilterType by) { 35 | Mentions mentions = null; 36 | Tokens returnTokens = null; 37 | if (by.equals(FilterType.STANFORD_NER)) { 38 | mentions = namedEntityFilter.filter(tokens); 39 | returnTokens = tokens; 40 | } else if (by.equals(FilterType.Manual) || by.equals(FilterType.ManualPOS) || by.equals(FilterType.Manual_NER)) { 41 | Pair tokensMentions = manualFilter.filter(text, docId, by); 42 | mentions = tokensMentions.second(); 43 | returnTokens = tokensMentions.first(); 44 | } else if (by.equals(FilterType.Hybrid)) { 45 | Pair tokensMentions = manualFilter.filter(text, docId, by); 46 | Mentions manualMentions = tokensMentions.second(); 47 | Mentions NERmentions = namedEntityFilter.filter(tokensMentions.first()); 48 | mentions = hybridFilter.parse(manualMentions, NERmentions); 49 | returnTokens = tokensMentions.first(); 50 | } else if (by.equals(FilterType.None)) { 51 | mentions = new Mentions(); 52 | List tokenlist = new LinkedList(); 53 | for (int p = 0; p < tokens.size(); p++) { 54 | Token token = tokens.getToken(p); 55 | tokenlist.add(token.getOriginal()); 56 | } 57 | returnTokens = tokens; 58 | } 59 | PreparedInput preparedInput = new PreparedInput(docId, returnTokens, mentions); 60 | return preparedInput; 61 | } 62 | } -------------------------------------------------------------------------------- /src/mpi/aida/preparation/mentionrecognition/HybridFilter.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.preparation.mentionrecognition; 2 | 3 | import 
java.util.Collections; 4 | import java.util.LinkedList; 5 | import java.util.List; 6 | 7 | import mpi.aida.data.Mention; 8 | import mpi.aida.data.Mentions; 9 | 10 | public class HybridFilter { 11 | 12 | public Mentions parse(Mentions manual, Mentions ner) { 13 | int from = 0; 14 | List toAdd = new LinkedList(); 15 | for (int i = 0; i < ner.getMentions().size(); i++) { 16 | Mention nerMention = ner.getMentions().get(i); 17 | boolean ok = true; 18 | int nerStart = nerMention.getStartToken(); 19 | int nerEnd = nerMention.getEndToken(); 20 | for (int m = from; m < manual.getMentions().size(); m++) { 21 | Mention manMention = manual.getMentions().get(m); 22 | int manStart = manMention.getStartToken(); 23 | int manEnd = manMention.getEndToken(); 24 | if (nerEnd >= manStart && nerEnd <= manEnd) { 25 | ok = false; 26 | } else if (nerStart >= manStart && nerStart <= manEnd) { 27 | ok = false; 28 | } else if (nerStart <= manStart && nerEnd >= manEnd) { 29 | ok = false; 30 | } 31 | } 32 | if (ok) { 33 | toAdd.add(nerMention); 34 | } 35 | } 36 | for (int i = 0; i < toAdd.size(); i++) { 37 | manual.addMention(toAdd.get(i)); 38 | } 39 | Collections.sort(manual.getMentions()); 40 | return manual; 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/mpi/aida/preparation/mentionrecognition/NamedEntityFilter.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.preparation.mentionrecognition; 2 | 3 | import java.util.HashMap; 4 | import java.util.LinkedList; 5 | import java.util.List; 6 | 7 | import mpi.aida.data.Mention; 8 | import mpi.aida.data.Mentions; 9 | import mpi.tokenizer.data.Token; 10 | import mpi.tokenizer.data.Tokens; 11 | 12 | public class NamedEntityFilter { 13 | 14 | private HashMap tags = null; 15 | 16 | public NamedEntityFilter() { 17 | tags = new HashMap(); 18 | tags.put("LOCATION", "LOCATION"); 19 | tags.put("I-LOC", "I-LOC"); 20 | tags.put("B-LOC", "I-LOC"); 
21 | tags.put("PERSON", "PERSON"); 22 | tags.put("I-PER", "I-PER"); 23 | tags.put("B-PER", "I-PER"); 24 | tags.put("ORGANIZATION", "ORGANIZATION"); 25 | tags.put("I-ORG", "I-ORG"); 26 | tags.put("B-ORG", "I-ORG"); 27 | tags.put("MISC", "MISC"); 28 | tags.put("I-MISC", "I-MISC"); 29 | tags.put("B-MISC", "I-MISC"); 30 | } 31 | 32 | public Mentions filter(Tokens tokens) { 33 | Mentions mentions = new Mentions(); 34 | HashMap subStrings = new HashMap(); 35 | List content = new LinkedList(); 36 | for (int p = 0; p < tokens.size(); p++) { 37 | Token token = tokens.getToken(p); 38 | content.add(token.getOriginal()); 39 | } 40 | String previous = null; 41 | int start = -1; 42 | int end = -1; 43 | for (int p = 0; p < tokens.size(); p++) { 44 | Token token = tokens.getToken(p); 45 | if (previous == null) { 46 | if (tags.containsKey(token.getNE())) { 47 | previous = tags.get(token.getNE()); 48 | start = token.getId(); 49 | end = token.getId(); 50 | } 51 | } else if (previous.equals(token.getNE())) { 52 | end = token.getId(); 53 | } else { 54 | Mention newMentions = getPossibleMentions(start, end, tokens); 55 | mentions.addMention(newMentions); 56 | subStrings.put(start, end); 57 | previous = null; 58 | if (tags.containsKey(token.getNE())) { 59 | previous = tags.get(token.getNE()); 60 | start = token.getId(); 61 | end = token.getId(); 62 | } 63 | } 64 | } 65 | if (previous != null) { 66 | Mention newMentions = getPossibleMentions(start, end, tokens); 67 | mentions.addMention(newMentions); 68 | subStrings.put(start, end); 69 | previous = null; 70 | } 71 | mentions.setSubstring(subStrings); 72 | return mentions; 73 | } 74 | 75 | private Mention getPossibleMentions(int start, int end, Tokens advTokens) { 76 | String meansArg = advTokens.toText(start, end); 77 | int startStanford = advTokens.getToken(start).getStandfordId(); 78 | int sentenceId = advTokens.getToken(start).getSentence(); 79 | int endStanford = advTokens.getToken(end).getStandfordId(); 80 | Mention mention = new 
Mention(meansArg, start, end, startStanford, endStanford, sentenceId); 81 | int firstChar = advTokens.getToken(mention.getStartToken()).getBeginIndex(); 82 | int lastChar = advTokens.getToken(mention.getEndToken()).getEndIndex(); 83 | int charLength = lastChar - firstChar; 84 | mention.setCharOffset(firstChar); 85 | mention.setCharLength(charLength); 86 | return mention; 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /src/mpi/aida/util/CollectionUtils.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.util; 2 | 3 | import java.util.Collections; 4 | import java.util.Comparator; 5 | import java.util.LinkedHashMap; 6 | import java.util.LinkedList; 7 | import java.util.List; 8 | import java.util.Map; 9 | 10 | 11 | public class CollectionUtils { 12 | public static > LinkedHashMap sortMapByValue(Map map) { 13 | return sortMapByValue(map, false); 14 | } 15 | 16 | public static > LinkedHashMap sortMapByValue(Map map, final boolean descending) { 17 | List> list = new LinkedList>(map.entrySet()); 18 | Collections.sort(list, new Comparator>() { 19 | 20 | public int compare(Map.Entry o1, Map.Entry o2) { 21 | int comp = (o1.getValue()).compareTo(o2.getValue()); 22 | 23 | if (descending) { 24 | comp = comp * (-1); 25 | } 26 | 27 | return comp; 28 | } 29 | }); 30 | 31 | LinkedHashMap result = new LinkedHashMap(); 32 | for (Map.Entry entry : list) { 33 | result.put(entry.getKey(), entry.getValue()); 34 | } 35 | return result; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/mpi/aida/util/DocumentCounter.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.util; 2 | 3 | import java.util.Map; 4 | import java.util.Observable; 5 | 6 | import mpi.aida.data.DisambiguationResults; 7 | 8 | import org.slf4j.Logger; 9 | import org.slf4j.LoggerFactory; 10 | 11 | 12 | public 
/**
 * Inverted index over the token ids of one input text. Two parallel indexes
 * are kept: one over all tokens, and one with stop words removed (positions
 * in the latter are counted over the stopword-free token sequence).
 */
public class InputTextInvertedIndex {
  // token id -> positions in the full token sequence
  private TIntObjectHashMap<TIntLinkedList> indexIncludingStopWords;
  // token id -> positions in the stopword-free token sequence
  private TIntObjectHashMap<TIntLinkedList> indexWithoutStopWords;

  public InputTextInvertedIndex() {
    indexIncludingStopWords = new TIntObjectHashMap<TIntLinkedList>();
    indexWithoutStopWords = new TIntObjectHashMap<TIntLinkedList>();
  }

  /**
   * Builds both indexes in one pass over the token ids.
   *
   * @param tokens token ids of the document, in order
   * @param isRemoveStopWords if true, stop words are left out of the
   *        stopword-free index (they always enter the full index)
   */
  public InputTextInvertedIndex(int[] tokens, boolean isRemoveStopWords) {
    indexIncludingStopWords = new TIntObjectHashMap<TIntLinkedList>();
    indexWithoutStopWords = new TIntObjectHashMap<TIntLinkedList>();
    // Position counter over the stopword-free sequence only.
    int noStopwordsPosition = 0;
    for (int position = 0; position < tokens.length; ++position) {
      int token = tokens[position];
      TIntLinkedList positions = indexIncludingStopWords.get(token);
      if (positions == null) {
        positions = new TIntLinkedList();
        indexIncludingStopWords.put(token, positions);
      }
      positions.add(position);

      if(!isRemoveStopWords || !StopWord.is(token)) {
        positions = indexWithoutStopWords.get(token);
        if (positions == null) {
          positions = new TIntLinkedList();
          indexWithoutStopWords.put(token, positions);
        }
        positions.add(noStopwordsPosition);
        noStopwordsPosition++;
      }
    }
  }

  /**
   * True iff the (non-stopword) word occurs somewhere OUTSIDE the token
   * span of the given mention.
   *
   * NOTE(review): presence is checked against the stopword-free index but
   * positions are read from the full index — presumably because the mention's
   * start/end are positions in the full token sequence; confirm.
   */
  public boolean containsWord(int word, Mention mention) {
    if(!indexWithoutStopWords.containsKey(word))
      return false;
    TIntLinkedList positions = indexIncludingStopWords.get(word);
    int mentionStart = mention.getStartToken();
    int mentionEnd = mention.getEndToken();
    for(TIntIterator itr = positions.iterator(); itr.hasNext(); ) {
      int position = itr.next();
      if(position < mentionStart || position > mentionEnd)
        return true;
    }
    return false;
  }

  /**
   * Returns the word's positions in the stopword-free sequence, made
   * relative to a text with the mention removed: occurrences after the
   * mention are shifted left by the mention's token length; occurrences
   * inside the mention span are dropped.
   *
   * NOTE(review): this pairs index i of the full-index position list with
   * index i of the stopword-free list — that alignment only holds if the
   * word is never itself a stop word; verify against the construction above.
   */
  public List getPositions(int word, Mention mention) {
    int mentionStart = mention.getStartToken();
    int mentionEnd = mention.getEndToken();
    int mentionLength = mentionEnd - mentionStart + 1;

    List positions = new LinkedList();
    //we need to subtract the mention length if the keyword is after the mention
    for(int i = 0; i < indexIncludingStopWords.get(word).size(); i++) {
      //get the keyword position from the full index (including stopwords)
      int position = indexIncludingStopWords.get(word).get(i);
      //compare to know the position of the keyword relative to the mention
      if(position < mentionStart) //before the mention, return the actual position from the stopwords free index
        positions.add(indexWithoutStopWords.get(word).get(i));
      else if((position > mentionEnd)) //if after the mention, get the actual position and subtract mention length
        positions.add(indexWithoutStopWords.get(word).get(i) - mentionLength);
    }

    return positions;
  }

  /**
   * Appends extra word -> position entries to BOTH indexes. Each word in
   * newIndexEntries carries a single position (offset) to append.
   */
  public void addToIndex(TIntIntHashMap newIndexEntries) {
    for(int word: newIndexEntries.keys()) {
      int offset = newIndexEntries.get(word);

      TIntLinkedList positions;
      positions = indexIncludingStopWords.get(word);
      if (positions == null) {
        positions = new TIntLinkedList();
        indexIncludingStopWords.put(word, positions);
      }
      positions.add(offset);

      positions = indexWithoutStopWords.get(word);
      if (positions == null) {
        positions = new TIntLinkedList();
        indexWithoutStopWords.put(word, positions);
      }
      positions.add(offset);
    }
  }

}
if time eq 1234 would return the String: 1s, 234ms 13 | * @param time 14 | * @return 15 | */ 16 | public static String convert(long time) { 17 | long seconds = -1; 18 | long minutes = -1; 19 | long hours = -1; 20 | StringBuffer sb = new StringBuffer(100); 21 | if (time < 0) { 22 | return "0ms"; 23 | } 24 | long milliseconds = time % 1000; 25 | time = time / 1000; 26 | if (time > 0) { 27 | seconds = time % 60; 28 | time = time / 60; 29 | } 30 | if (time > 0) { 31 | minutes = time % 60; 32 | time = time / 60; 33 | } 34 | if (time > 0) { 35 | hours = time % 24; 36 | time = time / 24; 37 | } 38 | if (time > 0) { 39 | sb.append(time + "d, "); 40 | } 41 | if (hours != -1) { 42 | sb.append(hours + "h, "); 43 | } 44 | if (minutes != -1) { 45 | sb.append(minutes + "m, "); 46 | } 47 | if (seconds != -1) { 48 | sb.append(seconds + "s, "); 49 | } 50 | sb.append(milliseconds + "ms"); 51 | return sb.toString(); 52 | } 53 | 54 | public static String convert(double time) { 55 | return convert((long) time); 56 | } 57 | 58 | } 59 | -------------------------------------------------------------------------------- /src/mpi/aida/util/Result.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.util; 2 | 3 | import java.util.Collections; 4 | import java.util.HashMap; 5 | import java.util.LinkedList; 6 | import java.util.List; 7 | 8 | import mpi.aida.util.htmloutput.ResultMention; 9 | import mpi.tokenizer.data.Tokens; 10 | 11 | public class Result { 12 | 13 | private String text; 14 | 15 | private String docId; 16 | 17 | private List dataSetIds; 18 | 19 | private String groundTruthId = null; 20 | 21 | private Tokens tokens; 22 | 23 | private HashMap> finalEntities = null; 24 | 25 | public Result(String docId, String text, Tokens tokens, String groundTruthId) { 26 | this.docId = docId; 27 | this.text = text; 28 | this.dataSetIds = new LinkedList(); 29 | this.tokens = tokens; 30 | finalEntities = new HashMap>(); 31 | this.groundTruthId = 
groundTruthId; 32 | } 33 | 34 | public void addFinalentity(ResultMention entity) { 35 | registerDataSet(entity.getDataSetId()); 36 | HashMap entry = null; 37 | if (finalEntities.containsKey(entity.getOffset())) { 38 | entry = finalEntities.get(entity.getOffset()); 39 | } else { 40 | entry = new HashMap(); 41 | finalEntities.put(entity.getOffset(), entry); 42 | } 43 | if (!entry.containsKey(entity.getOffset())) { 44 | entry.put(entity.getDataSetId(), entity); 45 | } 46 | } 47 | 48 | private void registerDataSet(String dataSetId) { 49 | if (!dataSetIds.contains(dataSetId)) { 50 | if (dataSetId.equals(groundTruthId)) { 51 | dataSetIds.add(0, dataSetId); 52 | } else { 53 | dataSetIds.add(dataSetId); 54 | } 55 | } 56 | } 57 | 58 | public String getDocId() { 59 | return docId; 60 | } 61 | 62 | public String getText() { 63 | return text; 64 | } 65 | 66 | public boolean containsMention(int offset) { 67 | return finalEntities.containsKey(offset); 68 | } 69 | 70 | public boolean containsMention(int offset, String id) { 71 | if (!finalEntities.containsKey(offset)) { 72 | return false; 73 | } 74 | return finalEntities.get(offset).containsKey(id); 75 | } 76 | 77 | public HashMap getMention(int offset) { 78 | return finalEntities.get(offset); 79 | } 80 | 81 | public int size() { 82 | return finalEntities.size(); 83 | } 84 | 85 | public Tokens getTokens() { 86 | return tokens; 87 | } 88 | 89 | public List getDataSetIds() { 90 | return dataSetIds; 91 | } 92 | 93 | public void sortDataSetIds(HashMap idsAvgPrec){ 94 | Collections.sort(dataSetIds, new SortByAvgPre(idsAvgPrec)); 95 | dataSetIds.remove(groundTruthId); 96 | dataSetIds.add(0,groundTruthId); 97 | } 98 | 99 | public String getGroundTruthId() { 100 | return groundTruthId; 101 | } 102 | 103 | } 104 | -------------------------------------------------------------------------------- /src/mpi/aida/util/SortByAvgPre.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.util; 2 
| 3 | import java.util.Comparator; 4 | import java.util.HashMap; 5 | 6 | public class SortByAvgPre implements Comparator { 7 | 8 | private HashMap idsAvgPrec = null; 9 | 10 | public SortByAvgPre(HashMap idsAvgPrec) { 11 | this.idsAvgPrec = idsAvgPrec; 12 | } 13 | 14 | @Override 15 | public int compare(String o1, String o2) { 16 | if (idsAvgPrec.get(o1) == null && idsAvgPrec.get(o2) == null) { 17 | return 0; 18 | } else if (idsAvgPrec.get(o1) == null || idsAvgPrec.get(o1).equals("none")) { 19 | return 1; 20 | } else if (idsAvgPrec.get(o2) == null || idsAvgPrec.get(o2).equals("none")) { 21 | return -1; 22 | } 23 | double first = Double.parseDouble(idsAvgPrec.get(o1)); 24 | double second = Double.parseDouble(idsAvgPrec.get(o2)); 25 | if (first > second) { 26 | return -1; 27 | } else if (first < second) { 28 | return 1; 29 | } 30 | return 0; 31 | } 32 | 33 | } 34 | -------------------------------------------------------------------------------- /src/mpi/aida/util/WikipediaDumpArticleIdExtractor.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.util; 2 | 3 | import java.io.Reader; 4 | 5 | import javatools.filehandlers.FileLines; 6 | import javatools.parsers.Char; 7 | import javatools.util.FileUtils; 8 | 9 | /** 10 | * Extracts all article ids from a Wikipedia pages-articles dump. 
11 | * Output format is: 12 | * article_titleid 13 | * 14 | * 15 | */ 16 | public class WikipediaDumpArticleIdExtractor { 17 | 18 | public static void main(String[] args) throws Exception { 19 | if (args.length != 1) { 20 | printUsage(); 21 | System.exit(1); 22 | } 23 | 24 | final Reader reader = FileUtils.getBufferedUTF8Reader(args[0]); 25 | String page = FileLines.readBetween(reader, "", ""); 26 | 27 | int pagesDone = 0; 28 | 29 | while (page != null) { 30 | if (++pagesDone % 100000 == 0) { 31 | System.err.println(pagesDone + " pages done."); 32 | } 33 | 34 | page = Char.decodeAmpersand(page.replace("&", "&")); 35 | String title = FileLines.readBetween(page, "", ""); 36 | String id = FileLines.readBetween(page, "", ""); 37 | String wpUrl = "http://en.wikipedia.org/wiki/" + title.replace(' ', '_'); 38 | System.out.println(wpUrl + "\t" + id); 39 | 40 | page = FileLines.readBetween(reader, "", ""); 41 | } 42 | } 43 | 44 | public static void printUsage() { 45 | System.out.println("Usage:"); 46 | System.out.println("\tWikipediaDumpArticleIdExtractor "); 47 | } 48 | } -------------------------------------------------------------------------------- /src/mpi/aida/util/WikipediaUtil.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.util; 2 | 3 | import java.util.regex.Pattern; 4 | 5 | public class WikipediaUtil { 6 | 7 | public static final int TOTAL_DOCS = 2628265; 8 | 9 | /** 10 | * Returns ONLY text (minus headlines, links, etc.) 
for a Wikipedia article source 11 | * 12 | * @param text 13 | * @return 14 | */ 15 | public static String cleanWikipediaArticle(String text) { 16 | // replace newlines 17 | text = text.replace('\n', ' '); 18 | 19 | // remove external links 20 | text = text.replaceAll("(\\[https?:.+)\\[\\[[^\\[\\]]+\\]\\]", "$1"); 21 | text = text.replaceAll("\\[https?:[^\\[\\]]+\\]", " "); 22 | 23 | // remove references 24 | text = text.replaceAll("", ""); 25 | text = text.replaceAll("", ""); 26 | 27 | // remove galleries 28 | text = text.replaceAll("(?s).*", ""); 29 | 30 | // remove xml tags 31 | text = text.replaceAll("<[^/t! ][^>]+>", " "); 32 | text = text.replaceAll("]+>", " "); 33 | 34 | // remove tables 35 | text = Pattern.compile("]+>(?!).*", Pattern.DOTALL).matcher(text).replaceAll(""); 36 | 37 | // remove xml comments 38 | text = Pattern.compile("", Pattern.DOTALL).matcher(text).replaceAll(""); 39 | 40 | // remove all templates/macros 41 | text = text.replaceAll("'{2,}", ""); 42 | text = text.replaceAll("\\[\\[[^\\[\\]]+:[^\\[\\]]+\\]\\]", ""); 43 | 44 | // workaround for mal-formed tables 45 | text = Pattern.compile("\\{\\{Standard table\\|0\\}\\}.*\\{\\{close table\\}\\}", Pattern.DOTALL).matcher(text).replaceAll(""); 46 | text = text.replaceAll("\\{\\{[sS]tart [bB]ox\\}\\}", "{|"); 47 | text = text.replaceAll("\\{\\{[eE]nd [bB]ox\\}\\}", "|}"); 48 | text = Pattern.compile("(?s)\\{\\|((?!\\|\\}).)*\n\\|\\}\n", Pattern.DOTALL).matcher(text).replaceAll(""); 49 | 50 | // remove templates/infoboxes 51 | text = text.replaceAll("\\{\\{[[^\\{\\}]]+\\}\\}", " "); 52 | 53 | // workaround for some non-standard texts 54 | text = text.replaceAll("(?s)\\{\\|.*\n\\|\\}\u2020Denotes wild-card team \\(since 1995\\)\\.\n", ""); 55 | text = Pattern.compile("^\\*{1,2}.*$", Pattern.MULTILINE).matcher(text).replaceAll(""); 56 | text = Pattern.compile("^\\;.*$", Pattern.MULTILINE).matcher(text).replaceAll(""); 57 | text = Pattern.compile("^:+.*$", 
Pattern.MULTILINE).matcher(text).replaceAll(""); 58 | 59 | // remove [[ ... : ... ]] 60 | text = text.replaceAll("\\[\\[[^\\[\\]]+:[^\\[\\]]+\\]\\]", " "); 61 | 62 | // remove headlines 63 | text = text.replaceAll("={2,}.*?={2,}"," "); 64 | 65 | // replace links 66 | text = text.replaceAll("\\[\\[[^\\]]+?\\|([^\\]\\n]+?)\\]\\]", "$1"); 67 | text = text.replaceAll("\\[\\[([^\\]]+?)\\]\\]", "$1"); 68 | 69 | // normalize whitespaces 70 | text = text.replaceAll("[\\s\\x00-\\x1F]+", " "); 71 | 72 | // normalize other characters 73 | text = text.replaceAll("<", "<").replaceAll(">", ">"); 74 | 75 | return text; 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/mpi/aida/util/YagoUtil.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.util; 2 | 3 | import gnu.trove.map.hash.TIntObjectHashMap; 4 | import gnu.trove.set.hash.TIntHashSet; 5 | 6 | import java.sql.SQLException; 7 | import java.util.Collection; 8 | import java.util.LinkedList; 9 | import java.util.List; 10 | 11 | import mpi.aida.access.DataAccess; 12 | import mpi.aida.data.Entities; 13 | import mpi.aida.data.Entity; 14 | 15 | import org.apache.commons.lang.StringUtils; 16 | 17 | import basics.Normalize; 18 | 19 | /** 20 | * This class contains some convenience wrappers for accessing YAGO data. 21 | * It has to use DataAccess and MUST NOT access the DB directly! 22 | * 23 | * 24 | */ 25 | public class YagoUtil { 26 | 27 | public static final int TOTAL_YAGO_ENTITIES = 2651987; 28 | 29 | public enum Gender { 30 | FEMALE, MALE; 31 | } 32 | 33 | /** 34 | * Checks whether the given String is an entity in YAGO 35 | * 36 | * @param entity Entity to check. 
37 | * @return true if the entity is in YAGO 38 | * @throws SQLException 39 | */ 40 | public static boolean isYagoEntity(Entity entity) throws SQLException { 41 | return DataAccess.isYagoEntity(entity); 42 | } 43 | 44 | public static Entity getEntityForId(int id) { 45 | return new Entity(DataAccess.getYagoEntityIdForId(id), id); 46 | } 47 | 48 | public static Entities getEntitiesForIds(int[] ids) { 49 | TIntObjectHashMap yagoEntityIds = 50 | DataAccess.getYagoEntityIdsForIds(ids); 51 | Entities entities = new Entities(); 52 | for (int i = 0; i < ids.length; ++i) { 53 | entities.add(new Entity(yagoEntityIds.get(ids[i]), ids[i])); 54 | } 55 | return entities; 56 | } 57 | 58 | public static Entity getEntityForYagoId(String id) { 59 | return new Entity(id, DataAccess.getIdForYagoEntityId(id)); 60 | } 61 | 62 | public static Entities getEntitiesForYagoEntityIds(Collection names) { 63 | Entities entities = new Entities(); 64 | for (String name : names) { 65 | entities.add(new Entity(name, DataAccess.getIdForYagoEntityId(name))); 66 | } 67 | return entities; 68 | } 69 | 70 | /** 71 | * Formats a given mention string properly to query a yago database. 72 | * 73 | * It will first transform the string into a YAGO string (with "" and 74 | * UTF-8 with backslash encoding), and then escape the string properly 75 | * for a Postgres query. 
76 | * 77 | * @param mention Mention to format 78 | * @return Mention in YAGO2/Postgres format 79 | */ 80 | public static String getYagoMentionStringPostgresEscaped(String mention) { 81 | return getPostgresEscapedString(Normalize.string(mention)); 82 | } 83 | 84 | public static String getPostgresEscapedString(String input) { 85 | return input.replace("'", "''").replace("\\", "\\\\"); 86 | } 87 | 88 | public static String getPostgresEscapedConcatenatedQuery(Collection entities) { 89 | List queryTerms = new LinkedList(); 90 | 91 | for (String term : entities) { 92 | StringBuilder sb = new StringBuilder(); 93 | sb.append("E'").append(YagoUtil.getPostgresEscapedString(term)).append("'"); 94 | queryTerms.add(sb.toString()); 95 | } 96 | 97 | return StringUtils.join(queryTerms, ","); 98 | } 99 | 100 | public static String getIdQuery(TIntHashSet ids) { 101 | int[] conv = ids.toArray(); 102 | return getIdQuery(conv); 103 | } 104 | 105 | public static String getIdQuery(int[] ids) { 106 | StringBuilder sb = new StringBuilder(); 107 | for (int i = 0; i < ids.length; ++i) { 108 | sb.append(ids[i]); 109 | if (i < ids.length - 1) { 110 | sb.append(","); 111 | } 112 | } 113 | return sb.toString(); 114 | } 115 | } -------------------------------------------------------------------------------- /src/mpi/aida/util/htmloutput/ResultMention.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.util.htmloutput; 2 | 3 | public class ResultMention { 4 | 5 | private String dataSetId = null; 6 | 7 | private int offset; 8 | 9 | private int length; 10 | 11 | private String mention; 12 | 13 | private String entity; 14 | 15 | private double confidence; 16 | 17 | private boolean isYagoEntity; 18 | 19 | public ResultMention(String dataSetId, int offset, int length, String mention, String entity, double confidence, boolean isYagoEntity) { 20 | this.dataSetId = dataSetId; 21 | this.offset = offset; 22 | this.length = length; 23 | this.mention = 
mention; 24 | this.entity = entity; 25 | this.confidence = confidence; 26 | this.isYagoEntity = isYagoEntity; 27 | } 28 | 29 | public int getOffset() { 30 | return offset; 31 | } 32 | 33 | public int getLength() { 34 | return length; 35 | } 36 | 37 | public String getMention() { 38 | return mention; 39 | } 40 | 41 | public String getEntity() { 42 | return entity; 43 | } 44 | 45 | public double getConfidence() { 46 | return confidence; 47 | } 48 | 49 | public String getDataSetId() { 50 | return dataSetId; 51 | } 52 | 53 | public String toString() { 54 | return offset + "\t" + length + "\t" + mention + "\t" + entity; 55 | } 56 | 57 | public boolean isYagoEntity() { 58 | return isYagoEntity; 59 | } 60 | 61 | } 62 | -------------------------------------------------------------------------------- /src/mpi/experiment/measure/EvaluationMeasures.java: -------------------------------------------------------------------------------- 1 | package mpi.experiment.measure; 2 | 3 | import java.util.HashMap; 4 | import java.util.List; 5 | import java.util.Map; 6 | 7 | 8 | public class EvaluationMeasures { 9 | public static Map convertToAverageRanks(List> list) { 10 | Map rankedList = new HashMap(); 11 | 12 | int i=0; 13 | for (List entityPartition : list) { 14 | double avgRank = 0.0; 15 | 16 | for (@SuppressWarnings("unused") String entity : entityPartition) { 17 | i++; 18 | avgRank += i; 19 | } 20 | 21 | avgRank /= (double) entityPartition.size(); 22 | 23 | for (String entity : entityPartition) { 24 | rankedList.put(entity, avgRank); 25 | } 26 | } 27 | 28 | return rankedList; 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/mpi/experiment/reader/CoNLLReader.java: -------------------------------------------------------------------------------- 1 | package mpi.experiment.reader; 2 | 3 | import java.io.File; 4 | 5 | public class CoNLLReader extends AidaFormatCollectionReader { 6 | 7 | public final static String finalFileName = 
File.separator + "CoNLL-YAGO.tsv"; 8 | 9 | public final static String finalCollectionPath = "./data/experiment/CONLL"; 10 | 11 | public CoNLLReader() { 12 | super(finalCollectionPath, finalFileName); 13 | } 14 | 15 | public CoNLLReader(CollectionPart cp) { 16 | super(finalCollectionPath, finalFileName, cp); 17 | } 18 | 19 | 20 | public CoNLLReader(String collectionPath) { 21 | super(collectionPath, finalFileName); 22 | } 23 | 24 | public CoNLLReader(String collectionPath, int from, int to) { 25 | super(collectionPath, finalFileName, from, to); 26 | } 27 | 28 | public CoNLLReader(String collectionPath, CollectionPart cp) { 29 | super(collectionPath, finalFileName, cp); 30 | } 31 | 32 | public CoNLLReader(String collectionPath, String docNums) { 33 | super(collectionPath, finalFileName, docNums); 34 | } 35 | 36 | @Override 37 | protected int[] getCollectionPartFromTo(CollectionPart cp) { 38 | int[] ft = new int[] { 1, 1393 }; 39 | switch (cp) { 40 | case TRAIN: 41 | ft = new int[] { 1, 946 }; 42 | break; 43 | case DEV: 44 | ft = new int[] { 947, 1162 }; 45 | break; 46 | case DEV_SMALL: 47 | ft = new int[] { 947, 1046 }; 48 | break; 49 | case TEST: 50 | ft = new int[] { 1163, 1393 }; 51 | break; 52 | default: 53 | break; 54 | } 55 | return ft; 56 | } 57 | 58 | 59 | public static void main(String[] args) { 60 | CoNLLReader reader = new CoNLLReader(); 61 | String key = reader.getAllDocIds().get(0); 62 | System.out.println(reader.getTokensMap().get(key)); 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/mpi/experiment/reader/CollectionReader.java: -------------------------------------------------------------------------------- 1 | package mpi.experiment.reader; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.IOException; 6 | import java.util.HashMap; 7 | import java.util.HashSet; 8 | import java.util.List; 9 | import java.util.Map; 10 | 11 | import javatools.util.FileUtils; 12 | 
import mpi.aida.data.Context; 13 | import mpi.aida.data.Mentions; 14 | import mpi.aida.data.PreparedInput; 15 | 16 | import org.slf4j.Logger; 17 | import org.slf4j.LoggerFactory; 18 | 19 | public abstract class CollectionReader implements Iterable { 20 | private static final Logger logger = 21 | LoggerFactory.getLogger(CollectionReader.class); 22 | 23 | public String filePath = null; 24 | 25 | public String collectionPath; 26 | 27 | public int from; 28 | 29 | public int to; 30 | 31 | protected HashSet allDocNumbers = null; 32 | 33 | protected List preparedInputs; 34 | 35 | protected boolean includeNMEMentions = true; 36 | 37 | 38 | public void setIncludeNMEMentions(boolean includeNMEMentions) { 39 | this.includeNMEMentions = includeNMEMentions; 40 | } 41 | 42 | public static enum DataSource { 43 | CONLL, WIKIPEDIA_YAGO2, AIDA, NONE 44 | } 45 | 46 | public static final String CONLL = "CONLL"; 47 | 48 | public static final String WIKIPEDIA_YAGO2 = "WIKIPEDIA_YAGO2"; 49 | 50 | public static final String AIDA = "AIDA"; 51 | 52 | public static final String NONE = "NONE"; 53 | 54 | public static enum CollectionPart { 55 | TRAIN, DEV, DEV_SMALL, TEST 56 | } 57 | 58 | public static final String TRAIN = "TRAIN"; 59 | 60 | public static final String DEV = "DEV"; 61 | 62 | public static final String DEV_SMALL = "DEV_SMALL"; 63 | 64 | public static final String TEST = "TEST"; 65 | 66 | public CollectionReader(String collectionPath) { 67 | this(collectionPath, 0, Integer.MAX_VALUE); 68 | } 69 | 70 | public CollectionReader(String collectionPath, CollectionPart cp) { 71 | int[] ft = getCollectionPartFromTo(cp); 72 | this.collectionPath = collectionPath; 73 | this.from = ft[0]; 74 | this.to = ft[1]; 75 | } 76 | 77 | public CollectionReader(String collectionPath, int from, int to) { 78 | this.collectionPath = collectionPath; 79 | this.from = from; 80 | this.to = to; 81 | } 82 | 83 | public CollectionReader(String collectionPath, String docIds) { 84 | this.collectionPath = 
collectionPath; 85 | if (docIds == null) { 86 | this.from = 0; 87 | this.to = Integer.MAX_VALUE; 88 | } else { 89 | allDocNumbers = new HashSet(); 90 | String[] data = docIds.split(","); 91 | int i = -1; 92 | for (i = 0; i < data.length; i++) { 93 | try { 94 | allDocNumbers.add(Integer.parseInt(data[i].trim())); 95 | } catch (NumberFormatException e) { 96 | logger.warn(data[i] + " is not an integer"); 97 | } 98 | } 99 | } 100 | } 101 | 102 | public abstract Mentions getDocumentMentions(String docId); 103 | 104 | public abstract Context getDocumentContext(String docId); 105 | 106 | public abstract int collectionSize(); 107 | 108 | public abstract String getText(String docId) ; 109 | 110 | protected abstract int[] getCollectionPartFromTo(CollectionPart cp); 111 | 112 | public static CollectionPart getCollectionPart(String collectionPart) { 113 | if (collectionPart == null) { 114 | return null; 115 | } 116 | 117 | if (collectionPart.equals(TRAIN)) { 118 | return CollectionPart.TRAIN; 119 | } else if (collectionPart.equals(DEV)) { 120 | return CollectionPart.DEV; 121 | } else if (collectionPart.equals(DEV_SMALL)) { 122 | return CollectionPart.DEV_SMALL; 123 | } else if (collectionPart.equals(TEST)) { 124 | return CollectionPart.TEST; 125 | } else { 126 | return null; 127 | } 128 | } 129 | 130 | public Map getAllDocuments() { 131 | Map docsWithText = new HashMap(); 132 | 133 | for (PreparedInput inputDoc : this) { 134 | docsWithText.put(inputDoc.getDocId(), getText(inputDoc.getDocId())); 135 | } 136 | 137 | return docsWithText; 138 | } 139 | 140 | public String readStringFromFile(File f) throws IOException { 141 | BufferedReader reader = FileUtils.getBufferedUTF8Reader(f); 142 | 143 | StringBuilder sb = new StringBuilder(); 144 | 145 | for (String line = reader.readLine(); line != null; line = reader.readLine()) { 146 | sb.append(line + "\n"); 147 | } 148 | 149 | return sb.toString(); 150 | } 151 | } 152 | 
-------------------------------------------------------------------------------- /src/mpi/experiment/reader/KORE50Reader.java: -------------------------------------------------------------------------------- 1 | package mpi.experiment.reader; 2 | 3 | import java.io.File; 4 | 5 | public class KORE50Reader extends AidaFormatCollectionReader { 6 | 7 | public final static String finalFileName = File.separator + "AIDA.tsv"; 8 | 9 | public final static String finalCollectionPath = "./data/experiment/KORE50"; 10 | 11 | public KORE50Reader(String collectionPath, String fileName) { 12 | super(collectionPath, fileName); 13 | } 14 | 15 | public KORE50Reader() { 16 | super(finalCollectionPath, finalFileName); 17 | } 18 | 19 | @Override 20 | protected int[] getCollectionPartFromTo(CollectionPart cp) { 21 | return new int[] { 1, 50 }; 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/mpi/experiment/trace/NullEntityEntityTracing.java: -------------------------------------------------------------------------------- 1 | package mpi.experiment.trace; 2 | 3 | import java.util.Collection; 4 | 5 | import mpi.experiment.trace.measures.MeasureTracer; 6 | 7 | 8 | public class NullEntityEntityTracing extends EntityEntityTracing { 9 | 10 | @Override 11 | public String generateOutput() { 12 | return ""; 13 | } 14 | 15 | @Override 16 | public void addEntityEntityMeasureTracer(String e1, String e2, MeasureTracer mt) { 17 | } 18 | 19 | @Override 20 | public void setCorrectEntities(Collection correctEntities) { 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/mpi/experiment/trace/NullGraphTracer.java: -------------------------------------------------------------------------------- 1 | package mpi.experiment.trace; 2 | 3 | import java.util.List; 4 | import java.util.Map; 5 | 6 | public class NullGraphTracer extends GraphTracer { 7 | 8 | @Override 9 | public void 
addCandidateEntityToOriginalGraph(String docId, String mention, 10 | String candidateEntity, double entityWeightedDegree, double MESimilairty, Map connectedEntities) { 11 | } 12 | 13 | @Override 14 | public void addCandidateEntityToCleanedGraph(String docId, String mention, 15 | String candidateEntity, double entityWeightedDegree, double MESimilairty) { 16 | } 17 | 18 | @Override 19 | public void addCandidateEntityToFinalGraph(String docId, String mention, 20 | String candidateEntity, double entityWeightedDegree, double MESimilairty) { 21 | } 22 | 23 | @Override 24 | public void addEntityRemovalStep(String docId, String entity, double entityWeightedDegree, List connectedMentions) { 25 | } 26 | 27 | @Override 28 | public void writeOutput(String outputPath) { 29 | } 30 | 31 | public void addStat(String docId, String description, String value) { 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/mpi/experiment/trace/NullTracer.java: -------------------------------------------------------------------------------- 1 | package mpi.experiment.trace; 2 | 3 | import mpi.aida.data.Mention; 4 | import mpi.experiment.trace.data.EntityTracer; 5 | import mpi.experiment.trace.data.MentionTracer; 6 | import mpi.experiment.trace.measures.MeasureTracer; 7 | 8 | public class NullTracer extends Tracer { 9 | 10 | EntityEntityTracing nullEETracing = new NullEntityEntityTracing(); 11 | 12 | public NullTracer() { 13 | super(null, null); 14 | } 15 | 16 | public void addMentionForDocId(String docId, Mention m, MentionTracer mt) { 17 | } 18 | 19 | public void addEntityForMention(Mention mention, String entity, EntityTracer entityTracer) { 20 | } 21 | 22 | public void addMeasureForMentionEntity(Mention mention, String entity, MeasureTracer measure) { 23 | } 24 | 25 | public void setMentionEntityTotalSimilarityScore(Mention mention, String entity, double score) { 26 | } 27 | 28 | public void writeOutput(String resultFileName, boolean 
withYago) { 29 | } 30 | 31 | public EntityEntityTracing eeTracing() { 32 | return nullEETracing; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/mpi/experiment/trace/data/EntityTracer.java: -------------------------------------------------------------------------------- 1 | package mpi.experiment.trace.data; 2 | 3 | import java.util.LinkedList; 4 | import java.util.List; 5 | 6 | import mpi.experiment.trace.measures.MeasureTracer; 7 | 8 | public class EntityTracer implements Comparable { 9 | 10 | private String entity; 11 | 12 | private double score; 13 | 14 | private List measureTracers = new LinkedList(); 15 | 16 | public EntityTracer(String entity) { 17 | this.entity = entity; 18 | } 19 | 20 | public void addMeasureTracer(MeasureTracer mt) { 21 | measureTracers.add(mt); 22 | } 23 | 24 | public int compareTo(EntityTracer e) { 25 | return Double.compare(e.getTotalScore(), this.getTotalScore()); 26 | } 27 | 28 | public String getName() { 29 | return entity; 30 | } 31 | 32 | public List getMeasureTracers() { 33 | return measureTracers; 34 | } 35 | 36 | public double getTotalScore() { 37 | return score; 38 | } 39 | 40 | public void setTotalScore(double score) { 41 | this.score = score; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/mpi/experiment/trace/data/MentionTracer.java: -------------------------------------------------------------------------------- 1 | package mpi.experiment.trace.data; 2 | 3 | import java.util.Collection; 4 | import java.util.HashMap; 5 | import java.util.Map; 6 | 7 | import mpi.aida.data.Mention; 8 | 9 | public class MentionTracer { 10 | 11 | private Map entities = new HashMap(); 12 | 13 | private Mention mention; 14 | 15 | public MentionTracer(Mention mention) { 16 | this.mention = mention; 17 | } 18 | 19 | public String getName() { 20 | return mention.getMention(); 21 | } 22 | 23 | public EntityTracer getEntityTracer(String 
entity) { 24 | return entities.get(entity); 25 | } 26 | 27 | public int getOffset() { 28 | return mention.getCharOffset(); 29 | } 30 | 31 | public void addEntityTracer(String entity, EntityTracer entityTracer) { 32 | entities.put(entity, entityTracer); 33 | } 34 | 35 | public Collection getEntityTracers() { 36 | return entities.values(); 37 | } 38 | 39 | public int getLength() { 40 | return mention.getCharLength(); 41 | } 42 | 43 | public String getMentionStr() { 44 | return mention.getMention() + ":" + mention.getStartToken(); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/mpi/experiment/trace/measures/GenericEntityEntitySimilarityMeasureTracer.java: -------------------------------------------------------------------------------- 1 | package mpi.experiment.trace.measures; 2 | 3 | 4 | public class GenericEntityEntitySimilarityMeasureTracer extends MeasureTracer { 5 | 6 | public GenericEntityEntitySimilarityMeasureTracer(String name, double weight) { 7 | super(name, weight); 8 | } 9 | 10 | @Override 11 | public String getOutput() { 12 | return "    eesim: " + weight + "
"; 13 | } 14 | 15 | } 16 | -------------------------------------------------------------------------------- /src/mpi/experiment/trace/measures/KeyphrasesMeasureTracer.java: -------------------------------------------------------------------------------- 1 | package mpi.experiment.trace.measures; 2 | 3 | import gnu.trove.list.linked.TIntLinkedList; 4 | import gnu.trove.map.hash.TIntDoubleHashMap; 5 | import gnu.trove.map.hash.TIntObjectHashMap; 6 | 7 | import java.text.DecimalFormat; 8 | import java.util.Collections; 9 | import java.util.LinkedList; 10 | import java.util.List; 11 | 12 | import mpi.aida.access.DataAccess; 13 | 14 | public class KeyphrasesMeasureTracer extends MeasureTracer { 15 | public static int countForUI = 0; 16 | 17 | private List keyphrases = null; 18 | private DecimalFormat formatter = new DecimalFormat("#0.00000"); 19 | 20 | public KeyphrasesMeasureTracer(String name, double weight) { 21 | super(name, weight); 22 | keyphrases = new LinkedList(); 23 | } 24 | 25 | @Override 26 | public String getOutput() { 27 | Collections.sort(keyphrases); 28 | 29 | TIntLinkedList wordIds = new TIntLinkedList(); 30 | for (keyphraseTracingObject kto : keyphrases) { 31 | for (int keyword : kto.keyphraseTokens) { 32 | wordIds.add(keyword); 33 | } 34 | } 35 | TIntObjectHashMap id2word = 36 | DataAccess.getWordsForIds(wordIds.toArray()); 37 | 38 | StringBuilder sb = new StringBuilder(); 39 | sb.append(" score = " + formatter.format(score) + "
"); 40 | int keyphraseCount = 0; 41 | for(keyphraseTracingObject keyphrase : keyphrases) { 42 | if(keyphraseCount == 5) { 43 | countForUI++; 44 | sb.append("More ...      Less ..."); 48 | sb.append(""); 58 | } 59 | return sb.toString(); 60 | } 61 | 62 | private String buildKeyhraseHTMLEntry(int[] keyphraseTokens, 63 | TIntDoubleHashMap matchedKeywords, TIntObjectHashMap id2word) { 64 | StringBuilder sb = new StringBuilder(); 65 | for (int token : keyphraseTokens) { 66 | if (matchedKeywords.containsKey(token)) { 67 | sb.append("" + 68 | id2word.get(token) + " (" + 69 | matchedKeywords.get(token) + 70 | ") "); 71 | } else { 72 | sb.append("" + id2word.get(token) + " "); 73 | } 74 | } 75 | 76 | return sb.toString(); 77 | } 78 | 79 | /** 80 | * @param keyphrase the keyphrase to add 81 | * @param weight the average weight of the keyphrase 82 | * @param score how much score this keyphrase contributes to the total similarity 83 | * @param matchedKeywords the keywords within this keyphrase and their weights 84 | */ 85 | public void addKeyphraseTraceInfo(int[] keyphraseTokens, double weight, double score, TIntDoubleHashMap matchedKeywords) { 86 | keyphrases.add(new keyphraseTracingObject(keyphraseTokens, score, matchedKeywords)); 87 | } 88 | 89 | private class keyphraseTracingObject implements Comparable{ 90 | private int[] keyphraseTokens; 91 | private double score; 92 | private TIntDoubleHashMap matchedKeywords; 93 | 94 | public keyphraseTracingObject( 95 | int[] keyphraseTokens, double score, TIntDoubleHashMap matchedKeywords) { 96 | this.keyphraseTokens = keyphraseTokens; 97 | this.score = score; 98 | this.matchedKeywords = matchedKeywords; 99 | } 100 | 101 | @Override 102 | public int compareTo(keyphraseTracingObject o) { 103 | if(score < o.score) 104 | return 1; 105 | else if (score == o.score) 106 | return 0; 107 | else 108 | return -1; 109 | } 110 | } 111 | } 112 | -------------------------------------------------------------------------------- 
/src/mpi/experiment/trace/measures/KeytermEntityEntityMeasureTracer.java: -------------------------------------------------------------------------------- 1 | package mpi.experiment.trace.measures; 2 | 3 | import java.text.DecimalFormat; 4 | import java.util.Map; 5 | import java.util.Map.Entry; 6 | 7 | import mpi.aida.util.CollectionUtils; 8 | 9 | import org.slf4j.Logger; 10 | import org.slf4j.LoggerFactory; 11 | 12 | 13 | public class KeytermEntityEntityMeasureTracer extends MeasureTracer { 14 | private static final Logger logger = 15 | LoggerFactory.getLogger(KeytermEntityEntityMeasureTracer.class); 16 | 17 | Map terms; 18 | Map matchedTerms; 19 | 20 | private DecimalFormat sFormatter = new DecimalFormat("0.0E0"); 21 | private DecimalFormat percentFormatter = new DecimalFormat("#0.0"); 22 | 23 | public static final String UI_PREFIX = "KWCSEEMT"; 24 | public static int countForUI = 0; 25 | 26 | public KeytermEntityEntityMeasureTracer(String name, double weight, Map terms, Map matchedTerms) { 27 | super(name, weight); 28 | 29 | this.terms = terms; 30 | this.matchedTerms = matchedTerms; 31 | } 32 | 33 | @Override 34 | public String getOutput() { 35 | int keywordCount=0; 36 | 37 | StringBuilder sb = new StringBuilder(); 38 | 39 | // sb.append("    eesim: " + weight + "
"); 40 | 41 | Map sortedMatches = CollectionUtils.sortMapByValue(matchedTerms, true); 42 | 43 | double totalWeight = 0.0; 44 | for (TermTracer tt : matchedTerms.values()) { 45 | totalWeight += tt.getTermWeight(); 46 | } 47 | 48 | double currentWeight = 0.0; 49 | 50 | for (Entry k : sortedMatches.entrySet()) { 51 | String term = k.getKey(); 52 | keywordCount++; 53 | 54 | if(keywordCount == 1) { 55 | countForUI++; 56 | sb.append(" More ...    Less ..."); 60 | sb.append(""); 93 | } 94 | 95 | sb.append("
Matches: " + keywordCount + "/" + terms.size() + "
"); 96 | 97 | return sb.toString(); 98 | } 99 | 100 | public Map getMatchedKeywords() { 101 | return matchedTerms; 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /src/mpi/experiment/trace/measures/KeywordContextEntityTracer.java: -------------------------------------------------------------------------------- 1 | package mpi.experiment.trace.measures; 2 | 3 | import java.text.DecimalFormat; 4 | import java.util.Map; 5 | import java.util.Map.Entry; 6 | 7 | import mpi.aida.util.CollectionUtils; 8 | 9 | 10 | public class KeywordContextEntityTracer extends TracerPart { 11 | 12 | private Map keywords; 13 | 14 | private DecimalFormat formatter = new DecimalFormat("#0.00000"); 15 | 16 | private static int countForUI; 17 | 18 | public static final String UI_PREFIX = "KCET"; 19 | 20 | public KeywordContextEntityTracer(Map keywords) { 21 | this.keywords = CollectionUtils.sortMapByValue(keywords, true); 22 | } 23 | 24 | @Override 25 | public String getOutput() { 26 | StringBuilder sb = new StringBuilder(); 27 | 28 | int keywordCount = 0; 29 | 30 | for (Entry e : keywords.entrySet()) { 31 | sb.append(e.getKey()).append(" (").append(formatter.format(e.getValue())).append("), "); 32 | 33 | keywordCount++; 34 | 35 | if(keywordCount == 5) { 36 | countForUI++; 37 | sb.append(" More ...    
Less ..."); 41 | sb.append(""); 47 | } 48 | 49 | return sb.toString(); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/mpi/experiment/trace/measures/MeasureTracer.java: -------------------------------------------------------------------------------- 1 | package mpi.experiment.trace.measures; 2 | 3 | public abstract class MeasureTracer extends TracerPart { 4 | // holds the name of the measure 5 | protected String name; 6 | // the value of the similarity of this measure 7 | protected double score; 8 | 9 | protected double weight; 10 | 11 | public MeasureTracer(String name, double weight) { 12 | super(); 13 | this.name = name; 14 | this.weight = weight; 15 | } 16 | public String getName() { 17 | return name + " - " + weight; 18 | } 19 | 20 | public double getScore() { 21 | return score; 22 | } 23 | 24 | public void setScore(double score) { 25 | this.score = score; 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/mpi/experiment/trace/measures/PriorMeasureTracer.java: -------------------------------------------------------------------------------- 1 | package mpi.experiment.trace.measures; 2 | 3 | import java.text.DecimalFormat; 4 | 5 | public class PriorMeasureTracer extends MeasureTracer { 6 | 7 | 8 | public PriorMeasureTracer(String name, double weight) { 9 | super(name, weight); 10 | } 11 | 12 | @Override 13 | public String getOutput() { 14 | DecimalFormat formatter = new DecimalFormat("#0.000"); 15 | return "score = " + formatter.format(score) + " { 7 | 8 | double termWeight; 9 | 10 | Map innerMatches = new HashMap(); 11 | 12 | public double getTermWeight() { 13 | return termWeight; 14 | } 15 | 16 | public void setTermWeight(double termWeight) { 17 | this.termWeight = termWeight; 18 | } 19 | 20 | public Map getInnerMatches() { 21 | return innerMatches; 22 | } 23 | 24 | public void addInnerMatch(String inner, Double weight) { 25 | innerMatches.put(inner, 
weight); 26 | } 27 | 28 | @Override 29 | public int compareTo(TermTracer o) { 30 | return Double.compare(termWeight, o.getTermWeight()); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/mpi/experiment/trace/measures/TracerPart.java: -------------------------------------------------------------------------------- 1 | package mpi.experiment.trace.measures; 2 | 3 | 4 | public abstract class TracerPart { 5 | public abstract String getOutput(); 6 | } 7 | -------------------------------------------------------------------------------- /test/mpi/aida/DisambiguatorTest.java: -------------------------------------------------------------------------------- 1 | package mpi.aida; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | 5 | import java.util.HashMap; 6 | import java.util.Map; 7 | 8 | import mpi.aida.config.AidaConfig; 9 | import mpi.aida.config.settings.DisambiguationSettings; 10 | import mpi.aida.config.settings.PreparationSettings; 11 | import mpi.aida.config.settings.disambiguation.CocktailPartyDisambiguationSettings; 12 | import mpi.aida.data.DisambiguationResults; 13 | import mpi.aida.data.PreparedInput; 14 | import mpi.aida.data.ResultMention; 15 | import mpi.aida.preparation.mentionrecognition.FilterMentions.FilterType; 16 | 17 | import org.junit.Test; 18 | 19 | /** 20 | * Testing against the predefined DataAccessForTesting. 
21 | * 22 | */ 23 | public class DisambiguatorTest { 24 | public static final double DEFAULT_ALPHA = 0.6; 25 | public static final double DEFAULT_COH_ROBUSTNESS = 0.9; 26 | public static final int DEFAULT_SIZE = 5; 27 | 28 | public DisambiguatorTest() { 29 | AidaConfig.set("dataAccess", "testing"); 30 | } 31 | 32 | @Test 33 | public void testPageKashmir() throws Exception { 34 | Preparator p = new Preparator(); 35 | 36 | String docId = "testPageKashmir"; 37 | String content = "When [[Page]] played Kashmir at Knebworth, his Les Paul was uniquely tuned."; 38 | PreparationSettings prepSettings = new PreparationSettings(); 39 | prepSettings.setMentionsFilter(FilterType.Hybrid); 40 | 41 | PreparedInput preparedInput = p.prepare(docId, content, new PreparationSettings()); 42 | 43 | DisambiguationSettings settings = new CocktailPartyDisambiguationSettings(); 44 | settings.setAlpha(DEFAULT_ALPHA); 45 | settings.setCohRobustnessThreshold(DEFAULT_COH_ROBUSTNESS); 46 | settings.setEntitiesPerMentionConstraint(DEFAULT_SIZE); 47 | 48 | Disambiguator d = new Disambiguator(preparedInput, settings); 49 | 50 | DisambiguationResults results = d.disambiguate(); 51 | 52 | Map mappings = repackageMappings(results); 53 | 54 | String mapped = mappings.get("Page"); 55 | assertEquals("Jimmy_Page", mapped); 56 | 57 | mapped = mappings.get("Kashmir"); 58 | assertEquals("Kashmir_(song)", mapped); 59 | 60 | mapped = mappings.get("Knebworth"); 61 | assertEquals("Knebworth_Festival", mapped); 62 | 63 | mapped = mappings.get("Les Paul"); 64 | assertEquals("--NME--", mapped); 65 | } 66 | 67 | private Map repackageMappings(DisambiguationResults results) { 68 | Map repack = new HashMap(); 69 | 70 | for (ResultMention rm : results.getResultMentions()) { 71 | repack.put(rm.getMention(), results.getBestEntity(rm).getEntity()); 72 | } 73 | 74 | return repack; 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /test/mpi/aida/data/ContextTest.java: 
-------------------------------------------------------------------------------- 1 | package mpi.aida.data; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | import gnu.trove.map.hash.TIntObjectHashMap; 5 | 6 | import java.util.LinkedList; 7 | import java.util.List; 8 | 9 | import mpi.aida.access.DataAccess; 10 | import mpi.aida.config.AidaConfig; 11 | 12 | import org.junit.Test; 13 | 14 | 15 | public class ContextTest { 16 | 17 | public ContextTest() { 18 | AidaConfig.set("dataAccess", "testing"); 19 | } 20 | 21 | @Test 22 | public void test() { 23 | List text = new LinkedList(); 24 | 25 | text.add("Jimmy"); 26 | text.add("played"); 27 | text.add("Les"); 28 | text.add("Paul"); 29 | text.add("played"); 30 | 31 | Context context = new Context(text); 32 | assertEquals(text, context.getTokens()); 33 | TIntObjectHashMap id2word = 34 | DataAccess.getWordsForIds(context.getTokenIds()); 35 | 36 | for (int i = 0; i < text.size(); ++i) { 37 | assertEquals(text.get(i), id2word.get(context.getTokenIds()[i])); 38 | } 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /test/mpi/aida/graph/algorithms/CocktailPartySizeConstrainedTest.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.algorithms; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | 5 | import java.util.HashMap; 6 | import java.util.List; 7 | import java.util.Map; 8 | import java.util.Map.Entry; 9 | 10 | import mpi.aida.Preparator; 11 | import mpi.aida.config.AidaConfig; 12 | import mpi.aida.config.settings.DisambiguationSettings; 13 | import mpi.aida.config.settings.PreparationSettings; 14 | import mpi.aida.config.settings.disambiguation.CocktailPartyDisambiguationSettings; 15 | import mpi.aida.config.settings.preparation.StanfordHybridPreparationSettings; 16 | import mpi.aida.data.Entities; 17 | import mpi.aida.data.Entity; 18 | import mpi.aida.data.PreparedInput; 19 | import 
mpi.aida.data.ResultEntity;
import mpi.aida.data.ResultMention;
import mpi.aida.graph.Graph;
import mpi.aida.graph.GraphGenerator;
import mpi.experiment.trace.NullTracer;
import mpi.experiment.trace.Tracer;

import org.junit.Test;

public class CocktailPartySizeConstrainedTest {

  public CocktailPartySizeConstrainedTest() {
    // Run against the in-memory test data access, never a live database.
    AidaConfig.set("dataAccess", "testing");
  }

  /**
   * Runs the size-constrained cocktail-party algorithm on a small test
   * document and checks the chosen entity and score for every mention.
   */
  @Test
  public void testCocktailParty() throws Exception {
    String text = "When [[Page]] played Kashmir at Knebworth, his Les Paul was uniquely tuned.";

    PreparationSettings prepSettings = new StanfordHybridPreparationSettings();

    Tracer tracer = new NullTracer();

    Preparator p = new Preparator();
    PreparedInput input = p.prepare("test", text, prepSettings);

    DisambiguationSettings disSettings = new CocktailPartyDisambiguationSettings();

    GraphGenerator gg = new GraphGenerator(input, disSettings, tracer);
    Graph gData = gg.run();

    // FIX: removed a locally built Entities collection (e1/e2/e3) that was
    // never passed to any component, and the redundant null-then-assign of
    // the algorithm variable. Generics restored (lost in flattening).
    DisambiguationAlgorithm da = new CocktailPartySizeConstrained(
        gData,
        disSettings.shouldUseExhaustiveSearch(),
        disSettings.shouldUseNormalizedObjective(),
        disSettings.getEntitiesPerMentionConstraint());
    Map<ResultMention, List<ResultEntity>> results = da.disambiguate();
    Map<String, ResultEntity> mappings = repackageMappings(results);

    String mapped = mappings.get("Page").getEntity();
    double score = mappings.get("Page").getDisambiguationScore();
    assertEquals("Jimmy_Page", mapped);
    assertEquals(0.002198, score, 0.00001);

    mapped = mappings.get("Kashmir").getEntity();
    score = mappings.get("Kashmir").getDisambiguationScore();
    assertEquals("Kashmir_(song)", mapped);
    assertEquals(0.00029, score, 0.00001);

    mapped = mappings.get("Knebworth").getEntity();
    score = mappings.get("Knebworth").getDisambiguationScore();
    assertEquals("Knebworth_Festival", mapped);
    assertEquals(0.6, score, 0.00001);

    // "Les Paul" is expected to map to the out-of-knowledge-base marker.
    mapped = mappings.get("Les Paul").getEntity();
    score = mappings.get("Les Paul").getDisambiguationScore();
    assertEquals("--NME--", mapped);
    assertEquals(0.0, score, 0.00001);
  }

  /** Keeps only the top-ranked entity per mention, keyed by mention string. */
  private Map<String, ResultEntity> repackageMappings(Map<ResultMention, List<ResultEntity>> results) {
    Map<String, ResultEntity> repack = new HashMap<String, ResultEntity>();

    // FIX: dropped a leftover System.out.println debug statement here.
    for (Entry<ResultMention, List<ResultEntity>> entry : results.entrySet()) {
      repack.put(entry.getKey().getMention(), entry.getValue().get(0));
    }
    return repack;
  }
}
--------------------------------------------------------------------------------
/test/mpi/aida/graph/algorithms/CocktailPartyTest.java:
--------------------------------------------------------------------------------
package mpi.aida.graph.algorithms;

import static org.junit.Assert.assertEquals;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import mpi.aida.Preparator;
import mpi.aida.config.AidaConfig;
import mpi.aida.config.settings.DisambiguationSettings;
import mpi.aida.config.settings.PreparationSettings;
import mpi.aida.config.settings.disambiguation.CocktailPartyDisambiguationSettings;
import mpi.aida.config.settings.preparation.StanfordHybridPreparationSettings;
import mpi.aida.data.Entities;
import mpi.aida.data.Entity;
import mpi.aida.data.PreparedInput;
import mpi.aida.data.ResultEntity;
import mpi.aida.data.ResultMention;
import mpi.aida.graph.Graph;
import mpi.aida.graph.GraphGenerator;
import mpi.aida.graph.algorithms.CocktailParty;
import mpi.aida.graph.algorithms.DisambiguationAlgorithm; 25 | import mpi.experiment.trace.NullTracer; 26 | import mpi.experiment.trace.Tracer; 27 | 28 | import org.junit.Test; 29 | 30 | public class CocktailPartyTest { 31 | 32 | public CocktailPartyTest() { 33 | AidaConfig.set("dataAccess", "testing"); 34 | } 35 | 36 | @Test 37 | public void testCocktailParty() throws Exception { 38 | 39 | String text = "When [[Page]] played Kashmir at Knebworth, his Les Paul was uniquely tuned."; 40 | 41 | String e1 = "Kashmir"; 42 | String e2 = "Kashmir_(song)"; 43 | String e3 = "Jimmy_Page"; 44 | 45 | Entities entities = new Entities(); 46 | entities.add(new Entity(e1, 1)); 47 | entities.add(new Entity(e2, 2)); 48 | entities.add(new Entity(e3, 2)); 49 | 50 | PreparationSettings prepSettings = new StanfordHybridPreparationSettings(); 51 | 52 | Tracer tracer = new NullTracer(); 53 | 54 | Preparator p = new Preparator(); 55 | PreparedInput input = p.prepare("test", text, prepSettings); 56 | 57 | DisambiguationSettings disSettings = new CocktailPartyDisambiguationSettings(); 58 | 59 | GraphGenerator gg = new GraphGenerator(input, disSettings, tracer); 60 | Graph gData = gg.run(); 61 | 62 | //KeyphrasesContext kpContext = new KeyphrasesContext(entities); 63 | 64 | DisambiguationAlgorithm da = null; 65 | da = new CocktailParty(gData, disSettings.shouldUseExhaustiveSearch(), disSettings.shouldUseNormalizedObjective()); 66 | Map> results = da.disambiguate(); 67 | Map mappings = repackageMappings(results); 68 | 69 | String mapped = mappings.get("Page").getEntity(); 70 | double score = mappings.get("Page").getDisambiguationScore(); 71 | assertEquals("Jimmy_Page", mapped); 72 | assertEquals(0.29169, score, 0.00001); 73 | 74 | mapped = mappings.get("Kashmir").getEntity(); 75 | score = mappings.get("Kashmir").getDisambiguationScore(); 76 | assertEquals("Kashmir_(song)", mapped); 77 | assertEquals(0.29143, score, 0.00001); 78 | 79 | mapped = mappings.get("Knebworth").getEntity(); 80 | score 
= mappings.get("Knebworth").getDisambiguationScore(); 81 | assertEquals("Knebworth_Festival", mapped); 82 | assertEquals(0.68879, score, 0.00001); 83 | 84 | mapped = mappings.get("Les Paul").getEntity(); 85 | score = mappings.get("Les Paul").getDisambiguationScore(); 86 | assertEquals("--NME--", mapped); 87 | assertEquals(0.0, score, 0.00001); 88 | 89 | } 90 | 91 | private Map repackageMappings(Map> results) { 92 | Map repack = new HashMap(); 93 | 94 | for(Entry> entry: results.entrySet()) { 95 | repack.put(entry.getKey().getMention(), entry.getValue().get(0)); 96 | } 97 | return repack; 98 | } 99 | 100 | } 101 | -------------------------------------------------------------------------------- /test/mpi/aida/graph/similarity/EnsembleMentionEntitySimilarityTest.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | 5 | import java.util.Arrays; 6 | import java.util.HashMap; 7 | import java.util.LinkedList; 8 | import java.util.List; 9 | import java.util.Map; 10 | 11 | import mpi.aida.AidaManager; 12 | import mpi.aida.access.DataAccess; 13 | import mpi.aida.config.AidaConfig; 14 | import mpi.aida.data.Context; 15 | import mpi.aida.data.Entities; 16 | import mpi.aida.data.Entity; 17 | import mpi.aida.data.Mention; 18 | import mpi.aida.data.Mentions; 19 | import mpi.aida.graph.similarity.EnsembleMentionEntitySimilarity; 20 | import mpi.aida.graph.similarity.util.SimilaritySettings; 21 | import mpi.experiment.trace.NullTracer; 22 | import mpi.experiment.trace.Tracer; 23 | 24 | import org.junit.Test; 25 | 26 | 27 | public class EnsembleMentionEntitySimilarityTest { 28 | 29 | public EnsembleMentionEntitySimilarityTest() { 30 | AidaConfig.set("dataAccess", "testing"); 31 | AidaManager.init(); 32 | } 33 | 34 | @Test 35 | public void test() throws Exception { 36 | // All caps PLAYED to check if term expansion is working. 
37 | String text = 38 | "When Page PLAYED Kashmir at Knebworth , his Les Paul was uniquely tuned ."; 39 | 40 | Context context = new Context(Arrays.asList(text.split(" "))); 41 | 42 | String n1 = "Kashmir"; 43 | String n2 = "Kashmir_(song)"; 44 | String n3 = "Jimmy_Page"; 45 | 46 | Entity e1 = new Entity(n1, DataAccess.getIdForYagoEntityId(n1)); 47 | Entity e2 = new Entity(n2, DataAccess.getIdForYagoEntityId(n2)); 48 | Entity e3 = new Entity(n3, DataAccess.getIdForYagoEntityId(n3)); 49 | 50 | Entities entities = new Entities(); 51 | entities.add(e1); 52 | entities.add(e2); 53 | entities.add(e3); 54 | 55 | Tracer tracer = new NullTracer(); 56 | 57 | List simConfigs = new LinkedList(); 58 | simConfigs.add(new String[] { "UnnormalizedKeyphrasesBasedMISimilarity", "KeyphrasesContext", "0.95" }); 59 | simConfigs.add(new String[] { "UnnormalizedKeyphrasesBasedIDFSimilarity", "KeyphrasesContext", "0.05" }); 60 | simConfigs.add(new String[] { "UnnormalizedKeyphrasesBasedMISimilarity", "KeyphrasesContext", "0.475" }); 61 | simConfigs.add(new String[] { "UnnormalizedKeyphrasesBasedIDFSimilarity", "KeyphrasesContext", "0.025" }); 62 | 63 | List eeSimConfigs = new LinkedList(); 64 | 65 | eeSimConfigs.add(new String[] { "MilneWittenEntityEntitySimilarity", "1.0" }); 66 | 67 | double priorWeight = 0.5; 68 | 69 | Map maxMinSettings = new HashMap(); 70 | maxMinSettings.put( 71 | "UnnormalizedKeyphrasesBasedIDFSimilarity:KeyphrasesContext", 72 | new double[] { 0, 70000 }); 73 | maxMinSettings.put( 74 | "UnnormalizedKeyphrasesBasedMISimilarity:KeyphrasesContext", 75 | new double[] { 0, 1000 }); 76 | maxMinSettings.put("prior", new double[] { 0.0, 1.0 }); 77 | 78 | SimilaritySettings settings = 79 | new SimilaritySettings( 80 | simConfigs, eeSimConfigs, priorWeight, maxMinSettings); 81 | settings.setPriorThreshold(0.8); 82 | 83 | Mentions ms = new Mentions(); 84 | Mention m1 = new Mention(); 85 | m1.setMention("Page"); 86 | m1.setStartToken(1); 87 | m1.setEndToken(1); 88 | 
ms.addMention(m1); 89 | Mention m2 = new Mention(); 90 | m2.setMention("Kashmir"); 91 | m2.setStartToken(3); 92 | m2.setEndToken(3); 93 | ms.addMention(m2); 94 | AidaManager.fillInCandidateEntities(ms); 95 | 96 | EnsembleMentionEntitySimilarity emes = new EnsembleMentionEntitySimilarity(ms, entities, settings, tracer); 97 | 98 | double simPage = emes.calcSimilarity(m1, context, e3); 99 | double simKashmir = emes.calcSimilarity(m2, context, e2); 100 | 101 | assertEquals(0.000044195, simPage, 0.000000001); 102 | assertEquals(0.050000, simKashmir, 0.00001); 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /test/mpi/aida/graph/similarity/PriorProbabilityTest.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | import static org.junit.Assert.assertTrue; 5 | 6 | import java.util.HashSet; 7 | import java.util.Set; 8 | 9 | import mpi.aida.AidaManager; 10 | import mpi.aida.config.AidaConfig; 11 | import mpi.aida.data.Entity; 12 | import mpi.aida.graph.similarity.MaterializedPriorProbability; 13 | import mpi.aida.graph.similarity.PriorProbability; 14 | 15 | import org.junit.Test; 16 | 17 | public class PriorProbabilityTest { 18 | 19 | public PriorProbabilityTest() { 20 | AidaConfig.set("dataAccess", "testing"); 21 | } 22 | 23 | @Test 24 | public void test() throws Exception { 25 | Set mentions = new HashSet(); 26 | mentions.add("Kashmir"); 27 | mentions.add("Page"); 28 | 29 | Entity kashmir = AidaManager.getEntity("Kashmir"); 30 | Entity kashmirSong = AidaManager.getEntity("Kashmir_(song)"); 31 | Entity jimmy = AidaManager.getEntity("Jimmy_Page"); 32 | Entity larry = AidaManager.getEntity("Larry_Page"); 33 | 34 | PriorProbability pp = new MaterializedPriorProbability(mentions); 35 | 36 | double ppKashmirKashmir = pp.getPriorProbability("Kashmir", kashmir); 37 | double ppKashmirKashmirSong = 
pp.getPriorProbability("Kashmir", kashmirSong); 38 | 39 | assertTrue(ppKashmirKashmir > ppKashmirKashmirSong); 40 | assertEquals(0.9, ppKashmirKashmir, 0.001); 41 | assertEquals(1.0, ppKashmirKashmir + ppKashmirKashmirSong, 0.001); 42 | 43 | double ppPageJimmy = pp.getPriorProbability("Page", jimmy); 44 | double ppPageLarry = pp.getPriorProbability("Page", larry); 45 | 46 | assertTrue(ppPageJimmy < ppPageLarry); 47 | assertEquals(0.3, ppPageJimmy, 0.001); 48 | assertEquals(1.0, ppPageJimmy + ppPageLarry, 0.001); 49 | } 50 | } -------------------------------------------------------------------------------- /test/mpi/aida/graph/similarity/context/EntitiesContextTest.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.context; 2 | 3 | import static org.junit.Assert.assertTrue; 4 | import mpi.aida.graph.similarity.context.EntitiesContext; 5 | 6 | import org.junit.Test; 7 | 8 | 9 | public class EntitiesContextTest { 10 | 11 | @Test 12 | public void testGetEntityName() { 13 | assertTrue(EntitiesContext.getEntityName("Riazuddin_\u0028physicist\u0029").equals("Riazuddin")); 14 | assertTrue(EntitiesContext.getEntityName("\u0028physicist\u0029_Riazuddin").equals("(physicist) Riazuddin")); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /test/mpi/aida/graph/similarity/measure/KORETest.java: -------------------------------------------------------------------------------- 1 | package mpi.aida.graph.similarity.measure; 2 | 3 | import static org.junit.Assert.assertEquals; 4 | import static org.junit.Assert.assertTrue; 5 | import mpi.aida.access.DataAccess; 6 | import mpi.aida.config.AidaConfig; 7 | import mpi.aida.data.Entities; 8 | import mpi.aida.data.Entity; 9 | import mpi.aida.graph.similarity.EntityEntitySimilarity; 10 | import mpi.aida.graph.similarity.context.EntitiesContextSettings; 11 | import mpi.experiment.trace.NullTracer; 12 | 13 | import 
org.junit.Test;


public class KORETest {

  public KORETest() {
    // Run against the in-memory test data access, never a live database.
    AidaConfig.set("dataAccess", "testing");
  }

  /**
   * Checks KORE entity-entity similarities on the test data: related
   * entities must score higher than unrelated ones, and the absolute
   * scores must stay stable.
   */
  @Test
  public void koreTest() throws Exception {
    Entity song = new Entity("Kashmir_(song)", DataAccess.getIdForYagoEntityId("Kashmir_(song)"));
    Entity jimmy = new Entity("Jimmy_Page", DataAccess.getIdForYagoEntityId("Jimmy_Page"));
    Entity larry = new Entity("Larry_Page", DataAccess.getIdForYagoEntityId("Larry_Page"));
    Entity festival = new Entity("Knebworth_Festival", DataAccess.getIdForYagoEntityId("Knebworth_Festival"));

    Entities entities = new Entities();
    entities.add(song);
    entities.add(jimmy);
    entities.add(larry);
    entities.add(festival);

    // Keyphrase-only coherence (keyword alpha 0), normalized weights,
    // inlink-title keyphrase source excluded.
    EntitiesContextSettings ecs = new EntitiesContextSettings();
    ecs.setEntityCoherenceKeyphraseAlpha(1.0);
    ecs.setEntityCoherenceKeywordAlpha(0.0);
    ecs.setShouldNormalizeWeights(true);
    ecs.setKeyphraseSourceExclusion(DataAccess.KPSOURCE_INLINKTITLE);
    EntityEntitySimilarity kore =
        EntityEntitySimilarity.getKOREEntityEntitySimilarity(
            entities, ecs, new NullTracer());

    double simSongJimmy = kore.calcSimilarity(song, jimmy);
    double simSongLarry = kore.calcSimilarity(song, larry);
    double simJimmyFestival = kore.calcSimilarity(jimmy, festival);
    double simLarryFestival = kore.calcSimilarity(larry, festival);
    double simSongFestival = kore.calcSimilarity(song, festival);

    // Relative ordering: the song is closer to Jimmy Page than to Larry Page.
    assertTrue(simSongJimmy > simSongLarry);
    assertTrue(simSongFestival < simSongJimmy);
    assertTrue(simJimmyFestival > simLarryFestival);
    // Absolute values pinned against the test data set.
    assertEquals(0.2091, simSongJimmy, 0.0001);
    assertEquals(0.1125, simJimmyFestival, 0.0001);
    assertEquals(0.1613, simSongFestival, 0.0001);
    assertEquals(0.0, simLarryFestival, 0.001);
  }
}
--------------------------------------------------------------------------------
/test/mpi/aida/graph/similarity/measure/MilneWittenEntityEntitySimilarityTest.java:
--------------------------------------------------------------------------------
package mpi.aida.graph.similarity.measure;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue; 5 | import mpi.aida.access.DataAccess; 6 | import mpi.aida.config.AidaConfig; 7 | import mpi.aida.data.Entities; 8 | import mpi.aida.data.Entity; 9 | import mpi.aida.graph.similarity.EntityEntitySimilarity; 10 | import mpi.experiment.trace.NullTracer; 11 | 12 | import org.junit.Test; 13 | 14 | public class MilneWittenEntityEntitySimilarityTest { 15 | 16 | public MilneWittenEntityEntitySimilarityTest() { 17 | AidaConfig.set("dataAccess", "testing"); 18 | } 19 | 20 | @Test 21 | public void mwTest() throws Exception { 22 | Entity a = new Entity("Kashmir_(song)", DataAccess.getIdForYagoEntityId("Kashmir_(song)")); 23 | Entity b = new Entity("Jimmy_Page", DataAccess.getIdForYagoEntityId("Jimmy_Page")); 24 | Entity c = new Entity("Larry_Page", DataAccess.getIdForYagoEntityId("Larry_Page")); 25 | Entity d = new Entity("Knebworth_Festival", DataAccess.getIdForYagoEntityId("Knebworth_Festival")); 26 | 27 | Entities entities = new Entities(); 28 | entities.add(a); 29 | entities.add(b); 30 | entities.add(c); 31 | entities.add(d); 32 | 33 | EntityEntitySimilarity mwSim = 34 | EntityEntitySimilarity.getMilneWittenSimilarity( 35 | entities, new NullTracer()); 36 | 37 | double simAB = mwSim.calcSimilarity(a, b); 38 | double simAC = mwSim.calcSimilarity(a, c); 39 | double simBD = mwSim.calcSimilarity(b, d); 40 | double simCD = mwSim.calcSimilarity(c, d); 41 | double simAD = mwSim.calcSimilarity(a, d); 42 | 43 | assertTrue(simAB > simAC); 44 | assertTrue(simAD < simAB); 45 | assertTrue(simBD > simCD); 46 | assertEquals(0.9493, simAB, 0.0001); 47 | assertEquals(0.8987, simBD, 0.0001); 48 | assertEquals(0.9197, simAD, 0.0001); 49 | assertEquals(0.0, simCD, 0.001); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /test/mpi/aida/graph/similarity/measure/WeightComputationTest.java: -------------------------------------------------------------------------------- 1 | package 
mpi.aida.graph.similarity.measure;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import mpi.aida.graph.similarity.measure.WeightComputation;

import org.junit.Test;


public class WeightComputationTest {

  /**
   * Checks normalized pointwise mutual information at its defining boundary
   * points and its monotonicity in the co-occurrence count.
   */
  @Test
  public void testComputeNPMI() {
    double npmi;
    // Perfect co-occurrence yields NPMI = 1.
    npmi = WeightComputation.computeNPMI(1, 1, 1, 10);
    assertEquals(1.0, npmi, 0.001);

    // Never co-occurring yields NPMI = -1.
    npmi = WeightComputation.computeNPMI(1, 1, 0, 10);
    assertEquals(-1.0, npmi, 0.001);

    // More co-occurrences give a higher score, everything else being equal.
    assertTrue(WeightComputation.computeNPMI(3, 3, 2, 10)
        > WeightComputation.computeNPMI(3, 3, 1, 10));
  }
}
--------------------------------------------------------------------------------
/test/mpi/aida/util/WikipediaUtilTest.java:
--------------------------------------------------------------------------------
package mpi.aida.util;

import static org.junit.Assert.assertEquals;

import org.junit.Test;


public class WikipediaUtilTest {

  /**
   * Cleans a complete Wikipedia article (infobox, wiki markup, external
   * links, Persondata, categories) and compares the result against the
   * expected plain text.
   */
  @Test
  public void test() {
    String content = "{{Infobox scientist\n" +
        "| name = Raghu Ramakrishnan\n" +
        "| image = \n" +
        "| image_size = 150px\n" +
        "| caption =\n" +
        "| birth_date =\n" +
        "| birth_place = \n" +
        "| death_date =\n" +
        "| death_place =\n" +
        "| residence =\n" +
        "| citizenship =\n" +
        "| nationality =\n" +
        "| ethnicity =\n" +
        "| field = [[Computer Science]]\n" +
        "| work_institution = [[University of Wisconsin–Madison]], [[Yahoo! Research]]\n" +
        "| alma_mater = [[University of Texas]]\n" +
        "| doctoral_advisor = \n" +
        "| doctoral_students =\n" +
        "| known_for = \n" +
        "| author_abbreviation_bot =\n" +
        "| author_abbreviation_zoo =\n" +
        "| prizes =\n" +
        "| religion =\n" +
        "| footnotes =\n" +
        "}}\n" +
        "'''Raghu Ramakrishnan''' is a renowned researcher in the areas of database and information management. He is currently a Vice President and Research Fellow for [[Yahoo! Inc.]] Previously, he was a Professor of [http://www.cs.wisc.edu Computer Sciences] at the [[University of Wisconsin–Madison]].\n" +
        "\n" +
        "Ramakrishnan received a bachelor's degree from IIT Madras in 1983, and a Ph.D. from the University of Texas at Austin in 1987. He has been selected as a Fellow of the ACM and a Packard fellow, and has done pioneering research in the areas of deductive databases, data mining, exploratory data analysis, data privacy, and web-scale data integration. The focus of his current work (2007) is community-based information management.\n" +
        "\n" +
        "With [[Johannes Gehrke]], he authored the popular textbook [http://www.cs.wisc.edu/~dbbook Database Management Systems], also known as the \"Cow Book\".\n" +
        "\n" +
        "==External links==\n" +
        "*[http://www.cs.wisc.edu/~raghu Raghu's Wisconsin homepage]\n" +
        "*[http://research.yahoo.com/~ramakris Raghu's Yahoo! homepage]\n" +
        "\n" +
        "{{Persondata \n" +
        "| NAME = Ramakrishnan, Raghu\n" +
        "| ALTERNATIVE NAMES =\n" +
        "| SHORT DESCRIPTION =\n" +
        "| DATE OF BIRTH =\n" +
        "| PLACE OF BIRTH =\n" +
        "| DATE OF DEATH =\n" +
        "| PLACE OF DEATH =\n" +
        "}}\n" +
        "{{DEFAULTSORT:Ramakrishnan, Raghu}}\n" +
        "[[Category:Fellows of the Association for Computing Machinery]]\n" +
        "[[Category:Database researchers]]\n" +
        "[[Category:Living people]]\n" +
        "[[Category:Data miners]]\n" +
        "[[Category:Yahoo! employees]]\n";

    String clean = WikipediaUtil.cleanWikipediaArticle(content);
    // FIX: renamed the misspelled local "expectecClean" -> "expectedClean".
    String expectedClean = " Raghu Ramakrishnan is a renowned researcher in the areas of database and information management. He is currently a Vice President and Research Fellow for Yahoo! Inc. Previously, he was a Professor of at the University of Wisconsin–Madison. Ramakrishnan received a bachelor's degree from IIT Madras in 1983, and a Ph.D. from the University of Texas at Austin in 1987. He has been selected as a Fellow of the ACM and a Packard fellow, and has done pioneering research in the areas of deductive databases, data mining, exploratory data analysis, data privacy, and web-scale data integration. The focus of his current work (2007) is community-based information management. With Johannes Gehrke, he authored the popular textbook , also known as the \"Cow Book\". * * ";
    assertEquals(expectedClean, clean);
  }
}