├── doser-dis-core ├── .gitignore ├── .settings │ ├── org.eclipse.wst.jsdt.ui.superType.name │ ├── org.eclipse.wst.validation.prefs │ ├── org.eclipse.wst.jsdt.ui.superType.container │ ├── org.eclipse.m2e.wtp.prefs │ ├── org.eclipse.m2e.core.prefs │ ├── org.eclipse.wst.ws.service.policy.prefs │ ├── org.eclipse.wst.common.project.facet.core.prefs.xml │ ├── org.eclipse.wst.common.project.facet.core.xml │ ├── .jsdtscope │ ├── org.eclipse.jdt.core.prefs │ └── org.eclipse.wst.common.component ├── src │ ├── main │ │ ├── java │ │ │ └── doser │ │ │ │ ├── language │ │ │ │ └── Languages.java │ │ │ │ ├── entitydisambiguation │ │ │ │ ├── dpo │ │ │ │ │ ├── package-info.java │ │ │ │ │ ├── DisambiguatedEntity.java │ │ │ │ │ ├── DisambiguationResponse.java │ │ │ │ │ ├── Response.java │ │ │ │ │ ├── DisambiguationRequest.java │ │ │ │ │ └── EntityDisambiguationDPO.java │ │ │ │ ├── knowledgebases │ │ │ │ │ ├── KnowledgeBaseIdentifiers.java │ │ │ │ │ ├── DocumentCentricKnowledgeBaseDefault.java │ │ │ │ │ ├── EntityCentricKBDBpedia.java │ │ │ │ │ ├── AbstractKnowledgeBase.java │ │ │ │ │ └── EntityCentricKBBiomed.java │ │ │ │ ├── algorithms │ │ │ │ │ ├── IllegalDisambiguationAlgorithmInputException.java │ │ │ │ │ ├── rules │ │ │ │ │ │ ├── AbstractRule.java │ │ │ │ │ │ ├── RuleAdapation.java │ │ │ │ │ │ ├── NoCandidatesCheckPlural.java │ │ │ │ │ │ ├── CheckGeneralEntities.java │ │ │ │ │ │ ├── UnambiguousToAmbiguousRule.java │ │ │ │ │ │ ├── NoCandidatesExpansionRules.java │ │ │ │ │ │ └── ContextRule.java │ │ │ │ │ ├── Candidate.java │ │ │ │ │ ├── collective │ │ │ │ │ │ ├── Edge.java │ │ │ │ │ │ ├── dbpedia │ │ │ │ │ │ │ ├── CandidateReductionDBpediaW2V.java │ │ │ │ │ │ │ ├── TableColumnFilter.java │ │ │ │ │ │ │ ├── CollectiveAndContextDriver.java │ │ │ │ │ │ │ └── Word2VecDisambiguator.java │ │ │ │ │ │ ├── general │ │ │ │ │ │ │ ├── CandidateReductionGeneralW2V.java │ │ │ │ │ │ │ ├── CollectiveContextDriverGeneral.java │ │ │ │ │ │ │ └── CollectiveDisambiguationGeneralEntities.java │ │ │ │ │ 
│ ├── CandidateReduction.java │ │ │ │ │ │ ├── CandidatePruning.java │ │ │ │ │ │ └── Vertex.java │ │ │ │ │ ├── AbstractDisambiguationAlgorithm.java │ │ │ │ │ ├── DisambiguationHandler.java │ │ │ │ │ └── SurfaceForm.java │ │ │ │ ├── backend │ │ │ │ │ ├── AbstractDisambiguationTask.java │ │ │ │ │ ├── DisambiguationTaskSingle.java │ │ │ │ │ └── DisambiguationTaskCollective.java │ │ │ │ └── properties │ │ │ │ │ └── Properties.java │ │ │ │ ├── word2vec │ │ │ │ ├── Doc2VecJsonFormat.java │ │ │ │ ├── Data.java │ │ │ │ └── Word2VecJsonFormat.java │ │ │ │ └── tools │ │ │ │ ├── ServiceQueries.java │ │ │ │ └── NTToDbPediaUrlEncoding.java │ │ └── resources │ │ │ ├── application.properties │ │ │ └── disambiguation.properties │ └── test │ │ └── java │ │ └── doser │ │ └── test │ │ └── breakdetection │ │ └── BreakDetection.java ├── .classpath ├── .project └── pom.xml ├── doser-dis-extensions ├── .gitignore ├── .settings │ ├── org.eclipse.wst.jsdt.ui.superType.name │ ├── org.eclipse.wst.jsdt.ui.superType.container │ ├── org.eclipse.wst.validation.prefs │ ├── org.eclipse.m2e.wtp.prefs │ ├── org.eclipse.m2e.core.prefs │ ├── org.eclipse.wst.ws.service.policy.prefs │ ├── org.eclipse.wst.common.project.facet.core.prefs.xml │ ├── org.eclipse.wst.common.project.facet.core.xml │ ├── .jsdtscope │ ├── org.eclipse.jdt.core.prefs │ ├── org.eclipse.wst.common.component │ └── org.eclipse.jdt.ui.prefs ├── src │ └── main │ │ ├── resources │ │ └── application.properties │ │ └── java │ │ └── doser │ │ ├── lucene │ │ ├── features │ │ │ ├── DocCenExtFeatures.java │ │ │ ├── IEntityCentricExtFeatures.java │ │ │ └── LuceneFeatures.java │ │ ├── analysis │ │ │ ├── DoserIDFilter.java │ │ │ ├── DoserIDTokenizer.java │ │ │ ├── DoserStandardTokenizer.java │ │ │ ├── DoserIDAnalyzer.java │ │ │ └── DoserStandardAnalyzer.java │ │ └── query │ │ │ ├── LTRBooleanQuery.java │ │ │ ├── LearnToRankFeatureDefaultValueManager.java │ │ │ ├── LearnToRankClause.java │ │ │ ├── LearnToRankTermScorer.java │ │ │ ├── 
PriorQuery.java │ │ │ ├── SensePriorQuery.java │ │ │ └── ConjunctionScorer.java │ │ ├── algorithms │ │ └── MajorityVoteAlgorithm.java │ │ ├── general │ │ ├── HelpfulMethods.java │ │ └── Test.java │ │ └── nlp │ │ └── NLPTools.java ├── pom.xml ├── .project └── .classpath ├── doser-dis-disambiguationserver ├── .gitignore ├── .settings │ ├── org.eclipse.wst.jsdt.ui.superType.name │ ├── org.eclipse.wst.jsdt.ui.superType.container │ ├── org.eclipse.wst.validation.prefs │ ├── org.eclipse.m2e.wtp.prefs │ ├── org.eclipse.m2e.core.prefs │ ├── org.eclipse.wst.ws.service.policy.prefs │ ├── org.eclipse.wst.common.project.facet.core.prefs.xml │ ├── org.eclipse.wst.common.project.facet.core.xml │ ├── .jsdtscope │ ├── org.eclipse.jdt.core.prefs │ └── org.eclipse.wst.common.component ├── src │ └── main │ │ ├── resources │ │ ├── application.properties │ │ └── log4j.xml │ │ ├── java │ │ └── doser │ │ │ └── server │ │ │ └── actions │ │ │ ├── package-info.java │ │ │ ├── FrameworkInitialization.java │ │ │ └── disambiguation │ │ │ └── DisambiguationService.java │ │ └── webapp │ │ └── WEB-INF │ │ ├── applicationContext.xml │ │ ├── web.xml │ │ └── dispatcher-servlet.xml ├── .classpath ├── .project └── pom.xml ├── Word2VecRestInterface ├── .idea │ ├── .name │ ├── scopes │ │ └── scope_settings.xml │ ├── encodings.xml │ ├── vcs.xml │ ├── Word2VecRestInterface.iml │ ├── modules.xml │ └── misc.xml ├── startserver └── config.ini ├── .settings ├── org.eclipse.m2e.core.prefs ├── org.eclipse.jdt.core.prefs └── org.eclipse.jst.jsp.core.prefs ├── .classpath ├── yes.pub ├── pom.xml ├── .project ├── README.md └── yes /doser-dis-core/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | -------------------------------------------------------------------------------- /doser-dis-extensions/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | 
-------------------------------------------------------------------------------- /doser-dis-disambiguationserver/.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | -------------------------------------------------------------------------------- /Word2VecRestInterface/.idea/.name: -------------------------------------------------------------------------------- 1 | Word2VecRestInterface -------------------------------------------------------------------------------- /doser-dis-core/.settings/org.eclipse.wst.jsdt.ui.superType.name: -------------------------------------------------------------------------------- 1 | Window -------------------------------------------------------------------------------- /doser-dis-extensions/.settings/org.eclipse.wst.jsdt.ui.superType.name: -------------------------------------------------------------------------------- 1 | Window -------------------------------------------------------------------------------- /Word2VecRestInterface/startserver: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | nohup python Word2VecRest.py & 3 | -------------------------------------------------------------------------------- /doser-dis-disambiguationserver/.settings/org.eclipse.wst.jsdt.ui.superType.name: -------------------------------------------------------------------------------- 1 | Window -------------------------------------------------------------------------------- /doser-dis-core/.settings/org.eclipse.wst.validation.prefs: -------------------------------------------------------------------------------- 1 | disabled=06target 2 | eclipse.preferences.version=1 3 | -------------------------------------------------------------------------------- /doser-dis-core/.settings/org.eclipse.wst.jsdt.ui.superType.container: -------------------------------------------------------------------------------- 1 | 
org.eclipse.wst.jsdt.launching.baseBrowserLibrary -------------------------------------------------------------------------------- /doser-dis-extensions/.settings/org.eclipse.wst.jsdt.ui.superType.container: -------------------------------------------------------------------------------- 1 | org.eclipse.wst.jsdt.launching.baseBrowserLibrary -------------------------------------------------------------------------------- /doser-dis-extensions/.settings/org.eclipse.wst.validation.prefs: -------------------------------------------------------------------------------- 1 | disabled=06target 2 | eclipse.preferences.version=1 3 | -------------------------------------------------------------------------------- /doser-dis-disambiguationserver/.settings/org.eclipse.wst.jsdt.ui.superType.container: -------------------------------------------------------------------------------- 1 | org.eclipse.wst.jsdt.launching.baseBrowserLibrary -------------------------------------------------------------------------------- /doser-dis-disambiguationserver/.settings/org.eclipse.wst.validation.prefs: -------------------------------------------------------------------------------- 1 | disabled=06target 2 | eclipse.preferences.version=1 3 | -------------------------------------------------------------------------------- /.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /doser-dis-core/.settings/org.eclipse.m2e.wtp.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.m2e.wtp.enabledProjectSpecificPrefs=false 3 | -------------------------------------------------------------------------------- /doser-dis-extensions/.settings/org.eclipse.m2e.wtp.prefs: 
-------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.m2e.wtp.enabledProjectSpecificPrefs=false 3 | -------------------------------------------------------------------------------- /doser-dis-core/.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/language/Languages.java: -------------------------------------------------------------------------------- 1 | package doser.language; 2 | 3 | public enum Languages { 4 | english, german, other 5 | } 6 | -------------------------------------------------------------------------------- /doser-dis-core/.settings/org.eclipse.wst.ws.service.policy.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.wst.ws.service.policy.projectEnabled=false 3 | -------------------------------------------------------------------------------- /doser-dis-disambiguationserver/.settings/org.eclipse.m2e.wtp.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.m2e.wtp.enabledProjectSpecificPrefs=false 3 | -------------------------------------------------------------------------------- /doser-dis-extensions/.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /doser-dis-extensions/.settings/org.eclipse.wst.ws.service.policy.prefs: 
-------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.wst.ws.service.policy.projectEnabled=false 3 | -------------------------------------------------------------------------------- /doser-dis-disambiguationserver/.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /doser-dis-disambiguationserver/.settings/org.eclipse.wst.ws.service.policy.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.wst.ws.service.policy.projectEnabled=false 3 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/resources/application.properties: -------------------------------------------------------------------------------- 1 | application.name = ${project.name} 2 | application.artifactId = ${project.artifactId} 3 | application.version = ${project.version} -------------------------------------------------------------------------------- /doser-dis-extensions/src/main/resources/application.properties: -------------------------------------------------------------------------------- 1 | application.name = ${project.name} 2 | application.artifactId = ${project.artifactId} 3 | application.version = ${project.version} -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/dpo/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Data Presentation Objects (DPO) for model input and output 3 | */ 4 | package doser.entitydisambiguation.dpo; 
-------------------------------------------------------------------------------- /doser-dis-disambiguationserver/src/main/resources/application.properties: -------------------------------------------------------------------------------- 1 | application.name = ${project.name} 2 | application.artifactId = ${project.artifactId} 3 | application.version = ${project.version} -------------------------------------------------------------------------------- /Word2VecRestInterface/.idea/scopes/scope_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 5 | -------------------------------------------------------------------------------- /Word2VecRestInterface/.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /Word2VecRestInterface/.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /doser-dis-disambiguationserver/src/main/java/doser/server/actions/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Package for all server actions. Use the "@Controller" class annotation to add a new action class. 
3 | */ 4 | package doser.server.actions; -------------------------------------------------------------------------------- /doser-dis-core/.settings/org.eclipse.wst.common.project.facet.core.prefs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/knowledgebases/KnowledgeBaseIdentifiers.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.knowledgebases; 2 | 3 | public enum KnowledgeBaseIdentifiers { 4 | Standard, CSTable, Biomed, DocumentCentricDefault; 5 | } 6 | -------------------------------------------------------------------------------- /doser-dis-extensions/.settings/org.eclipse.wst.common.project.facet.core.prefs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /doser-dis-disambiguationserver/.settings/org.eclipse.wst.common.project.facet.core.prefs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | -------------------------------------------------------------------------------- /Word2VecRestInterface/.idea/Word2VecRestInterface.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /doser-dis-core/.settings/org.eclipse.wst.common.project.facet.core.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /doser-dis-extensions/.settings/org.eclipse.wst.common.project.facet.core.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /Word2VecRestInterface/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /doser-dis-disambiguationserver/.settings/org.eclipse.wst.common.project.facet.core.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7 4 | org.eclipse.jdt.core.compiler.compliance=1.7 5 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 6 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 7 | org.eclipse.jdt.core.compiler.source=1.7 8 | -------------------------------------------------------------------------------- /doser-dis-extensions/src/main/java/doser/lucene/features/DocCenExtFeatures.java: -------------------------------------------------------------------------------- 1 | package doser.lucene.features; 2 | 3 | /** 4 | * Interface to specify an external Lucene feature set for a document-centric 5 | * knowledge base. External features are features not integrated in Apache 6 | * Lucene. 
7 | * 8 | * @author Stefan Zwicklbauer 9 | * 10 | */ 11 | 12 | public interface DocCenExtFeatures { 13 | 14 | } 15 | -------------------------------------------------------------------------------- /Word2VecRestInterface/config.ini: -------------------------------------------------------------------------------- 1 | [Word2VecRest] 2 | embeddings_w2v_wikipedia = /mnt/ssd1/disambiguation/word2vec/WikiEntityModel_400_neg10_iter5.seq 3 | embeddings_w2v_calbc = /mnt/ssd1/disambiguation/word2vec/calbcsmall_model_sg_500.bin 4 | embeddings_d2v_wikipedia = /mnt/ssd1/disambiguation/word2vec/doc2vec/Wiki_Standard_Model/doc2vec_wiki_model.d2v 5 | embeddings_d2v_wikipedia_german = /mnt/ssd1/disambiguation/word2vec/doc2vec/Wikipedia_Standard_German/doc2vec_model_german.d2v 6 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/IllegalDisambiguationAlgorithmInputException.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.algorithms; 2 | 3 | public class IllegalDisambiguationAlgorithmInputException extends 4 | IllegalArgumentException { 5 | 6 | private static final long serialVersionUID = 1L; 7 | 8 | IllegalDisambiguationAlgorithmInputException() { 9 | super("Wrong Knowledge base!"); 10 | } 11 | 12 | IllegalDisambiguationAlgorithmInputException(String text) { 13 | super(text); 14 | } 15 | 16 | } 17 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/resources/disambiguation.properties: -------------------------------------------------------------------------------- 1 | application.name = ${project.name} 2 | application.artifactId = ${project.artifactId} 3 | application.version = ${project.version} 4 | luceneversion = 4.7.0 5 | disambiguation.entityCentricKBWikipedia = /mnt/ssd1/disambiguation/LuceneIndex/Wikipedia_Default_Aida_Sigir/ 6 | 
disambiguation.entityCentricBiomedCalbC= /mnt/ssd1/disambiguation/LuceneIndex/Biomed_CalbCSmall/ 7 | disambiguation.returnSize = 10 8 | disambiguation.Word2VecService = http://theseus.dimis.fim.uni-passau.de:80/Word2VecRest/ 9 | candidateExpansion = false -------------------------------------------------------------------------------- /doser-dis-extensions/src/main/java/doser/lucene/analysis/DoserIDFilter.java: -------------------------------------------------------------------------------- 1 | package doser.lucene.analysis; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.lucene.analysis.TokenFilter; 6 | import org.apache.lucene.analysis.TokenStream; 7 | 8 | public class DoserIDFilter extends TokenFilter { 9 | 10 | public DoserIDFilter(TokenStream in) { 11 | super(in); 12 | } 13 | 14 | @Override 15 | public boolean incrementToken() throws IOException { 16 | if (!input.incrementToken()) { 17 | return false; 18 | } 19 | return true; 20 | } 21 | 22 | } 23 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/word2vec/Doc2VecJsonFormat.java: -------------------------------------------------------------------------------- 1 | package doser.word2vec; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | public class Doc2VecJsonFormat { 7 | 8 | private List data; 9 | 10 | public Doc2VecJsonFormat() { 11 | super(); 12 | this.data = new ArrayList(); 13 | } 14 | 15 | public List getData() { 16 | return data; 17 | } 18 | 19 | public void setData(List data) { 20 | this.data = data; 21 | } 22 | 23 | public void addData(Data doc) { 24 | this.data.add(doc); 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/rules/AbstractRule.java: -------------------------------------------------------------------------------- 1 | package 
doser.entitydisambiguation.algorithms.rules; 2 | 3 | import java.util.List; 4 | 5 | import doser.entitydisambiguation.algorithms.SurfaceForm; 6 | import doser.entitydisambiguation.knowledgebases.AbstractKnowledgeBase; 7 | 8 | abstract class AbstractRule { 9 | 10 | protected AbstractKnowledgeBase eckb; 11 | 12 | AbstractRule(AbstractKnowledgeBase eckb) { 13 | super(); 14 | this.eckb = eckb; 15 | } 16 | 17 | abstract boolean applyRule(List rep); 18 | 19 | } 20 | -------------------------------------------------------------------------------- /doser-dis-disambiguationserver/src/main/webapp/WEB-INF/applicationContext.xml: -------------------------------------------------------------------------------- 1 | 2 | 9 | 10 | -------------------------------------------------------------------------------- /doser-dis-core/.settings/.jsdtscope: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/knowledgebases/DocumentCentricKnowledgeBaseDefault.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.knowledgebases; 2 | 3 | import org.apache.lucene.search.similarities.Similarity; 4 | 5 | public class DocumentCentricKnowledgeBaseDefault extends AbstractKnowledgeBase { 6 | 7 | public DocumentCentricKnowledgeBaseDefault(String uri, boolean dynamic, 8 | Similarity sim) { 9 | super(uri, dynamic, sim); 10 | } 11 | 12 | public DocumentCentricKnowledgeBaseDefault(String uri, boolean dynamic) { 13 | super(uri, dynamic); 14 | } 15 | 16 | @Override 17 | public void initialize() { 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /doser-dis-extensions/.settings/.jsdtscope: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /doser-dis-disambiguationserver/.settings/.jsdtscope: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /doser-dis-disambiguationserver/src/main/resources/log4j.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /doser-dis-extensions/src/main/java/doser/lucene/features/IEntityCentricExtFeatures.java: -------------------------------------------------------------------------------- 1 | package doser.lucene.features; 2 | 3 | import java.util.Set; 4 | 5 | /** 6 | * Interface to specify an external Lucene feature set for an entity-centric 7 | * knowledge base. External features are features not integrated in Apache 8 | * Lucene. 
9 | * 10 | * @author Stefan Zwicklbauer 11 | * 12 | */ 13 | public interface IEntityCentricExtFeatures { 14 | 15 | public float getPriorOfDocument(final int docId); 16 | 17 | public float getSensePriorOfDocument(final String keyword, final int docId); 18 | 19 | public Set getRelations(final String url); 20 | 21 | public int getOccurrences(String sf, String uri); 22 | } 23 | -------------------------------------------------------------------------------- /doser-dis-core/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7 4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 5 | org.eclipse.jdt.core.compiler.compliance=1.7 6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 11 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 12 | org.eclipse.jdt.core.compiler.source=1.7 13 | -------------------------------------------------------------------------------- /doser-dis-extensions/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7 4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 5 | org.eclipse.jdt.core.compiler.compliance=1.7 6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 9 | 
org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 11 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 12 | org.eclipse.jdt.core.compiler.source=1.7 13 | -------------------------------------------------------------------------------- /yes.pub: -------------------------------------------------------------------------------- 1 | ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQDFiuuAKuK8WhRCBVZpjlXIs7TWNKwtpGYqhrbF+hOkstu26QXsPYz6ywZDfQzHS3ey6mi1a/nBx9IYwwgPERu56M1OEUXvHQogEmowCMMVGCDkDgkfkCsMeChIsvCqabTugX6sT/6HHR26QXD1xzkVMhlyF7AuK+XxHNriu7SaVjYwBfVyQc4Mf8usoigKJgBRu5vj4BXzH5oslIAlCZTcFR3tT7Iy4G7IpFwjoBZufQeQiS7k8JLfgKjB9Mcc3H9/gZNvau7RsuAo24SQ4y9Jjt3BahqVdxJgKZMdYyQeRresX7oiXqrsrwBAKHyFUZZAxYZJT2Y0PaK7IrZfXRikmSN+W2Gf9dTxRI5LfYW94JvTIeT5anUhOYtAf71wSmAimQrXbMS4JKlbbZSQB/U/GY3XX+mEyoG/qqgJUNjBTF5NPtOzKbprgTkubu6VNduokKLAJP+z0ZfDoZwZaPvXR9qmFu8E5qaAIfXM/oXd9DPcSuyAh1HvXnkCHJ0z1oGusmc/Cpk6Agt5IvL4khb/HtQpvdbr8DDM963Zy8VEHaq1Uq1SKEpAcw678EtbEymbEieL0BSq8wbBn6fQRXWiCDdiqRbAkIK3Q1kyMKxmovPmYtzykYgWmb0feQpVpROVvL1JyOCKRKEK2xEWsVidcBZJtTb+JW9OkThdun8q5w== quhfus@stefan.zwicklbauer@uni-passau.de 2 | -------------------------------------------------------------------------------- /doser-dis-disambiguationserver/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7 4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 5 | org.eclipse.jdt.core.compiler.compliance=1.7 6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 10 | 
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 11 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 12 | org.eclipse.jdt.core.compiler.source=1.7 13 | -------------------------------------------------------------------------------- /doser-dis-extensions/.settings/org.eclipse.wst.common.component: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/Candidate.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.algorithms; 2 | 3 | class Candidate implements Comparable { 4 | 5 | private String candidate; 6 | private double score; 7 | 8 | Candidate(String candidate, double score) { 9 | super(); 10 | this.candidate = candidate; 11 | this.score = score; 12 | } 13 | 14 | @Override 15 | public int compareTo(Candidate o) { 16 | if (this.score < o.score) { 17 | return -1; 18 | } else if (this.score > o.score) { 19 | return 1; 20 | } else { 21 | return 0; 22 | } 23 | } 24 | 25 | String getCandidate() { 26 | return candidate; 27 | } 28 | 29 | double getScore() { 30 | return score; 31 | } 32 | 33 | @Override 34 | public String toString() { 35 | return candidate; 36 | } 37 | } -------------------------------------------------------------------------------- /doser-dis-core/.settings/org.eclipse.wst.common.component: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | uses 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/dpo/DisambiguatedEntity.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.dpo; 2 | 3 | 4 | 
/**
 * Bean holding the URI of a disambiguated entity.
 *
 * <p>
 * Only the entity URI is stored. The four-argument constructor also accepts a
 * mention text, a confidence value and a description, but those values are
 * not retained by this class.
 *
 * @author zwicklbauer
 */
public class DisambiguatedEntity {

	private String entityUri;

	/** Creates an entity whose URI is the empty string. */
	public DisambiguatedEntity() {
		this(null, "", 0.0, null);
	}

	/**
	 * Creates an entity for the given URI.
	 *
	 * @param text
	 *            accepted but not stored
	 * @param entityUri
	 *            the URI to store
	 * @param confidence
	 *            accepted but not stored
	 * @param description
	 *            accepted but not stored
	 */
	public DisambiguatedEntity(final String text, final String entityUri,
			final double confidence, final String description) {
		this.entityUri = entityUri;
	}

	/**
	 * Replaces the stored URI.
	 *
	 * @param entityUri
	 *            the new URI
	 */
	public void setEntityUri(final String entityUri) {
		this.entityUri = entityUri;
	}

	/** @return the stored entity URI */
	public String getEntityUri() {
		return this.entityUri;
	}
}
1F : similarity.coord(overlap, maxOverlap); 22 | } 23 | } 24 | 25 | public LTRBooleanQuery() { 26 | super(); 27 | } 28 | 29 | public LTRBooleanQuery(final boolean bool) { 30 | super(bool); 31 | } 32 | 33 | @Override 34 | public Weight createWeight(final IndexSearcher searcher) throws IOException { 35 | return new LTRBooleanWeight(searcher, isCoordDisabled()); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 4 | 4.0.0 5 | 6 | doser-dis 7 | doser-dis-parent 8 | pom 9 | 1.0 10 | DoSer 11 | 12 | 13 | doser-dis-extensions 14 | doser-dis-core 15 | doser-dis-disambiguationserver 16 | 17 | 18 | 19 | 20 | 21 | 22 | org.apache.maven.plugins 23 | maven-compiler-plugin 24 | 3.1 25 | 26 | 1.7 27 | 1.7 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /doser-dis-extensions/src/main/java/doser/algorithms/MajorityVoteAlgorithm.java: -------------------------------------------------------------------------------- 1 | package doser.algorithms; 2 | 3 | import java.util.HashMap; 4 | import java.util.List; 5 | import java.util.Map; 6 | 7 | import doser.general.HelpfulMethods; 8 | 9 | 10 | /** 11 | * Majority vote methods for arbitrary types 12 | * 13 | * @author Stefan Zwicklbauer 14 | * 15 | */ 16 | public final class MajorityVoteAlgorithm> { 17 | 18 | public MajorityVoteAlgorithm() { 19 | super(); 20 | } 21 | 22 | public Map.Entry getMajorityType(final List typeList) { 23 | final List> list = this 24 | .getMajorityTypes(typeList); 25 | Map.Entry res = null; 26 | if (!list.isEmpty()) { 27 | res = list.get(0); 28 | } 29 | return res; 30 | } 31 | 32 | public List> getMajorityTypes(final List list) { 33 | final Map hash = new HashMap(); 34 | for (final K k : list) { 35 | if (hash.containsKey(k)) { 36 | Integer number = hash.get(k); 37 | hash.put(k, 
++number); 38 | } else { 39 | hash.put(k, 1); 40 | } 41 | } 42 | return HelpfulMethods.sortByValue(hash); 43 | } 44 | 45 | } 46 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/Edge.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.algorithms.collective; 2 | 3 | 4 | public class Edge { 5 | 6 | private Integer edgeNr; 7 | 8 | private Vertex target; 9 | 10 | private double transition; 11 | 12 | private Double edgeProbability; 13 | 14 | public Edge(Integer edgeNr, Vertex target, double transition) { 15 | super(); 16 | this.transition = transition; 17 | this.edgeNr = edgeNr; 18 | this.target = target; 19 | } 20 | 21 | public double getTransition() { 22 | return transition; 23 | } 24 | public void setTransition(double transition) { 25 | this.transition = transition; 26 | } 27 | 28 | public void setProbability(double p) { 29 | this.edgeProbability = new Double(p); 30 | } 31 | 32 | public Double getProbability() { 33 | return this.edgeProbability; 34 | } 35 | 36 | public Vertex getTarget() { 37 | return this.target; 38 | } 39 | 40 | @Override 41 | public boolean equals(Object obj) { 42 | if(this.edgeNr == ((Edge) obj).edgeNr) { 43 | return true; 44 | } 45 | return false; 46 | } 47 | 48 | @Override 49 | public int hashCode() { 50 | return edgeNr.hashCode(); 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /Word2VecRestInterface/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /doser-dis-disambiguationserver/.settings/org.eclipse.wst.common.component: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | uses 9 | 10 | 11 | uses 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /doser-dis-core/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /doser-dis-extensions/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4 | doser-dis 5 | doser-dis-parent 6 | 1.0 7 | 8 | 4.0.0 9 | doser.sub 10 | doser-dis-extensions 11 | 0.6 12 | doser-dis-extensions 13 | 14 | 15 | doser-dis-extensions 16 | 17 | 18 | 19 | 20 | 21 | org.apache.lucene 22 | lucene-core 23 | 4.10.4 24 | 25 | 26 | org.apache.lucene 27 | lucene-analyzers-common 28 | 4.10.4 29 | 30 | 31 | org.apache.lucene 32 | lucene-queryparser 33 | 4.10.4 34 | 35 | 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | de.uop.code-disambiguationserver 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.wst.jsdt.core.javascriptValidator 10 | 11 | 12 | 13 | 14 | org.eclipse.jdt.core.javabuilder 15 | 16 | 17 | 18 | 19 | org.eclipse.wst.common.project.facet.core.builder 20 | 21 | 22 | 23 | 24 | org.eclipse.wst.validation.validationbuilder 25 | 26 | 27 | 28 | 29 | org.eclipse.m2e.core.maven2Builder 30 | 31 | 32 | 33 | 34 | 35 | org.eclipse.jem.workbench.JavaEMFNature 36 | org.eclipse.wst.common.modulecore.ModuleCoreNature 37 | org.eclipse.jdt.core.javanature 38 | org.eclipse.m2e.core.maven2Nature 39 | org.eclipse.wst.common.project.facet.core.nature 40 | org.eclipse.wst.jsdt.core.jsNature 41 | 42 | 43 | 
-------------------------------------------------------------------------------- /doser-dis-core/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | doser-extensions 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.wst.jsdt.core.javascriptValidator 10 | 11 | 12 | 13 | 14 | org.eclipse.jdt.core.javabuilder 15 | 16 | 17 | 18 | 19 | org.eclipse.wst.common.project.facet.core.builder 20 | 21 | 22 | 23 | 24 | org.eclipse.wst.validation.validationbuilder 25 | 26 | 27 | 28 | 29 | org.eclipse.m2e.core.maven2Builder 30 | 31 | 32 | 33 | 34 | 35 | org.eclipse.jem.workbench.JavaEMFNature 36 | org.eclipse.wst.common.modulecore.ModuleCoreNature 37 | org.eclipse.jdt.core.javanature 38 | org.eclipse.m2e.core.maven2Nature 39 | org.eclipse.wst.common.project.facet.core.nature 40 | org.eclipse.wst.jsdt.core.jsNature 41 | 42 | 43 | -------------------------------------------------------------------------------- /doser-dis-extensions/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | doser-extensions 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.wst.jsdt.core.javascriptValidator 10 | 11 | 12 | 13 | 14 | org.eclipse.jdt.core.javabuilder 15 | 16 | 17 | 18 | 19 | org.eclipse.wst.common.project.facet.core.builder 20 | 21 | 22 | 23 | 24 | org.eclipse.wst.validation.validationbuilder 25 | 26 | 27 | 28 | 29 | org.eclipse.m2e.core.maven2Builder 30 | 31 | 32 | 33 | 34 | 35 | org.eclipse.jem.workbench.JavaEMFNature 36 | org.eclipse.wst.common.modulecore.ModuleCoreNature 37 | org.eclipse.jdt.core.javanature 38 | org.eclipse.m2e.core.maven2Nature 39 | org.eclipse.wst.common.project.facet.core.nature 40 | org.eclipse.wst.jsdt.core.jsNature 41 | 42 | 43 | -------------------------------------------------------------------------------- /doser-dis-disambiguationserver/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 
/**
 * Result bean for one disambiguated document: the document's URI plus the
 * list of per-task results.
 *
 * <p>
 * Both properties start out as {@code null} and are populated through the
 * setters.
 *
 * @author Stefan Zwicklbauer
 */
public class DisambiguationResponse {

	private String documentUri;

	// NOTE(review): the element type is not visible in this view - presumably
	// Response; confirm against the callers before parameterizing.
	private List tasks; // NOPMD by quh on 18.02.14 09:34

	/** @return the URI identifying the document, or {@code null} if unset */
	public String getDocumentUri() {
		return this.documentUri;
	}

	/**
	 * @param documentUri
	 *            the URI identifying the document
	 */
	public void setDocumentUri(final String documentUri) {
		this.documentUri = documentUri;
	}

	/** @return the task results (the stored list itself, not a copy) */
	public List getTasks() {
		return tasks;
	}

	/**
	 * @param tasks
	 *            the task results; the reference is stored as given
	 */
	public void setTasks(List tasks) {
		this.tasks = tasks;
	}
}
/**
 * Result for a single surface form: the selected text, the id of the document
 * it belongs to, and the list of disambiguated entities found for it.
 *
 * @author Stefan Zwicklbauer
 */
public class Response {

	private int documentId;
	private String selectedText;

	// Starts as an empty linked list so getDisEntities() never returns null
	// on a fresh instance.
	// NOTE(review): the element type is not visible in this view - presumably
	// DisambiguatedEntity; confirm before parameterizing.
	private List disEntities = new LinkedList();

	/** Creates a response with an empty entity list. */
	public Response() {
		super();
	}

	/** @return the id of the document this response belongs to */
	public int getDocumentId() {
		return documentId;
	}

	/**
	 * @param documentId
	 *            the id of the document this response belongs to
	 */
	public void setDocumentId(int documentId) {
		this.documentId = documentId;
	}

	/** @return the selected text, or {@code null} if unset */
	public String getSelectedText() {
		return this.selectedText;
	}

	/**
	 * @param selectedText
	 *            the text of the surface form
	 */
	public void setSelectedText(final String selectedText) {
		this.selectedText = selectedText;
	}

	/** @return the disambiguated entities (the stored list itself) */
	public List getDisEntities() {
		return this.disEntities;
	}

	/**
	 * @param disEntities
	 *            the disambiguated entities; the reference is stored as given
	 */
	public void setDisEntities(final List disEntities) {
		this.disEntities = disEntities;
	}
}
/**
 * Tracks, per feature position, the largest value seen so far and the running
 * sum of those maxima over all closed queries.
 *
 * <p>
 * All instance methods synchronize on the manager, so the maxima, the sums and
 * the query counter are always read and written under the same lock.
 *
 * <p>
 * Marked in the original source as not in use so far.
 *
 * @author quh
 */
public class LearnToRankFeatureDefaultValueManager {

	private static volatile LearnToRankFeatureDefaultValueManager man;

	/** @return the shared instance, or {@code null} if none was installed */
	public static LearnToRankFeatureDefaultValueManager getInstance() {
		return man;
	}

	/**
	 * Installs the shared instance returned by {@link #getInstance()}.
	 *
	 * @param manager
	 *            the instance to share; may be {@code null}
	 */
	public static void setInstance(
			final LearnToRankFeatureDefaultValueManager manager) {
		man = manager;
	}

	private int amountQueries;

	private final float[] maxVals;

	private final float[] sums;

	/**
	 * @param pos
	 *            the number of feature positions to track
	 */
	public LearnToRankFeatureDefaultValueManager(final int pos) {
		// New float arrays are already zero-filled; no explicit reset needed.
		maxVals = new float[pos];
		sums = new float[pos];
		amountQueries = 0;
	}

	/**
	 * Computes, per position, the accumulated sum divided by the number of
	 * closed queries.
	 *
	 * @return a new array of averages; every entry is {@code NaN} while no
	 *         query has been closed with {@link #newQuery()}
	 */
	public synchronized float[] getAverageResults() {
		final float[] results = new float[maxVals.length];
		for (int i = 0; i < sums.length; i++) {
			results[i] = sums[i] / amountQueries;
		}
		return results;
	}

	/**
	 * Closes the current query: adds the current maximum of every position to
	 * its sum and increments the query counter. The maxima are not reset, so
	 * they carry over into the next query.
	 */
	public synchronized void newQuery() {
		for (int i = 0; i < maxVals.length; i++) {
			sums[i] += maxVals[i];
		}
		amountQueries++;
	}

	/**
	 * Records a value for a position, keeping it only if it exceeds the
	 * maximum stored so far.
	 *
	 * @param position
	 *            the feature position
	 * @param value
	 *            the observed value
	 */
	public synchronized void setValue(final int position, final float value) {
		if (maxVals[position] < value) {
			maxVals[position] = value;
		}
	}
}
CandidateReduction { 11 | 12 | private int iterations; 13 | private boolean disambiguate; 14 | private EntityCentricKBDBpedia eckb; 15 | private int reduceTo; 16 | 17 | CandidateReductionDBpediaW2V(EntityCentricKBDBpedia eckb, List rep, int maxsurfaceformsperquery, 18 | int reduceTo, int iterations, boolean disambiguate, boolean alwaysAction) { 19 | super(rep, maxsurfaceformsperquery, alwaysAction); 20 | this.iterations = iterations; 21 | this.disambiguate = disambiguate; 22 | this.eckb = eckb; 23 | this.reduceTo = reduceTo; 24 | } 25 | 26 | @Override 27 | public List miniSolve(List rep) { 28 | List sol = new LinkedList(); 29 | Word2VecDisambiguator disambiguator = new Word2VecDisambiguator(eckb, rep, disambiguate, reduceTo, iterations); 30 | disambiguator.setup(); 31 | disambiguator.solve(); 32 | sol.addAll(disambiguator.getRepresentation()); 33 | return sol; 34 | 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /.settings/org.eclipse.jst.jsp.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | validateFragments=false 3 | validation.actions-missing-required-attribute=1 4 | validation.actions-non-empty-inline-tag=2 5 | validation.actions-unexpected-rtexprvalue=2 6 | validation.actions-unknown-attribute=2 7 | validation.directive-attribute-duplicate=2 8 | validation.directive-include-fragment-file-not-found=2 9 | validation.directive-include-fragment-file-not-specified=2 10 | validation.directive-taglib-duplicate-prefixes-different-uris=2 11 | validation.directive-taglib-duplicate-prefixes-same-uris=-1 12 | validation.directive-taglib-missing-prefix=2 13 | validation.directive-taglib-missing-uri-or-tagdir=2 14 | validation.directive-taglib-unresolvable-uri-or-tagdir=2 15 | validation.el-function-undefined=1 16 | validation.el-general-syntax=1 17 | validation.el-lexical-failure=-1 18 | validation.java-=-1 19 | 
validation.java-local-variable-is-never-used=-1 20 | validation.java-null-local-variable-reference=-1 21 | validation.java-potential-null-local-variable-reference=-1 22 | validation.java-unused-import=-1 23 | validation.translation-tag-class-not-found=2 24 | validation.translation-tei-class-not-found=2 25 | validation.translation-tei-class-not-instantiated=2 26 | validation.translation-tei-class-runtime-exception=2 27 | validation.translation-tei-message=1 28 | validation.translation-usebean-ambiguous-type-info=2 29 | validation.translation-usebean-invalid-id=1 30 | validation.translation-usebean-missing-type-info=1 31 | validation.use-project-settings=true 32 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/general/CandidateReductionGeneralW2V.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.algorithms.collective.general; 2 | 3 | import java.util.LinkedList; 4 | import java.util.List; 5 | 6 | import doser.entitydisambiguation.algorithms.SurfaceForm; 7 | import doser.entitydisambiguation.algorithms.collective.CandidateReduction; 8 | import doser.entitydisambiguation.knowledgebases.AbstractEntityCentricKBGeneral; 9 | 10 | public class CandidateReductionGeneralW2V extends CandidateReduction { 11 | 12 | private int iterations; 13 | private boolean disambiguate; 14 | private AbstractEntityCentricKBGeneral eckb; 15 | private int reduceTo; 16 | 17 | public CandidateReductionGeneralW2V(AbstractEntityCentricKBGeneral eckb, List rep, int maxsurfaceformsperquery, 18 | int reduceTo, int iterations, boolean disambiguate, boolean alwaysAction) { 19 | super(rep, maxsurfaceformsperquery, alwaysAction); 20 | this.iterations = iterations; 21 | this.disambiguate = disambiguate; 22 | this.eckb = eckb; 23 | this.reduceTo = reduceTo; 24 | } 25 | 26 | @Override 27 | public List miniSolve(List rep) { 
28 | List sol = new LinkedList(); 29 | Word2VecDisambiguatorGeneral disambiguator = new Word2VecDisambiguatorGeneral(eckb, rep, disambiguate, reduceTo, 30 | iterations); 31 | disambiguator.setup(); 32 | disambiguator.solve(); 33 | sol.addAll(disambiguator.getRepresentation()); 34 | return sol; 35 | 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /doser-dis-extensions/src/main/java/doser/lucene/analysis/DoserIDTokenizer.java: -------------------------------------------------------------------------------- 1 | package doser.lucene.analysis; 2 | 3 | import java.io.Reader; 4 | 5 | import org.apache.lucene.analysis.Tokenizer; 6 | import org.apache.lucene.analysis.util.CharTokenizer; 7 | import org.apache.lucene.util.AttributeFactory; 8 | 9 | public final class DoserIDTokenizer extends CharTokenizer { 10 | 11 | /** 12 | * Construct a new WhitespaceTokenizer using a given 13 | * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}. 14 | * 15 | * @param matchVersion 16 | * Lucene version to match See 17 | * {@link above} 18 | * @param factory 19 | * the attribute factory to use for this {@link Tokenizer} 20 | * @param in 21 | * the input to split up into tokens 22 | */ 23 | public DoserIDTokenizer(AttributeFactory factory, Reader in) { 24 | super(factory, in); 25 | } 26 | 27 | /** 28 | * Construct a new WhitespaceTokenizer. * @param matchVersion Lucene version 29 | * to match See {@link above} 30 | * 31 | * @param in 32 | * the input to split up into tokens 33 | */ 34 | public DoserIDTokenizer(Reader in) { 35 | super(in); 36 | } 37 | 38 | /** 39 | * Collects only characters which do not satisfy 40 | * {@link Character#isWhitespace(int)}. 
41 | */ 42 | @Override 43 | protected boolean isTokenChar(int c) { 44 | boolean check = true; 45 | if (Character.isWhitespace(c)) { 46 | check = false; 47 | } 48 | return check; 49 | } 50 | } -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/rules/RuleAdapation.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.algorithms.rules; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | import doser.entitydisambiguation.knowledgebases.EntityCentricKBDBpedia; 7 | import doser.entitydisambiguation.algorithms.SurfaceForm; 8 | import doser.entitydisambiguation.knowledgebases.AbstractKnowledgeBase; 9 | 10 | public class RuleAdapation { 11 | 12 | private List ruleChain; 13 | 14 | public RuleAdapation() { 15 | super(); 16 | this.ruleChain = new ArrayList(); 17 | } 18 | 19 | public void addNoCandidatesCheckPluralRule(AbstractKnowledgeBase eckb) { 20 | this.ruleChain.add(new NoCandidatesCheckPlural(eckb)); 21 | } 22 | 23 | public void addNoCandidatesExpansionRule(AbstractKnowledgeBase eckb) { 24 | this.ruleChain.add(new NoCandidatesExpansionRules(eckb)); 25 | } 26 | 27 | public void addUnambiguousToAmbiguousRule(EntityCentricKBDBpedia eckb) { 28 | this.ruleChain.add(new UnambiguousToAmbiguousRule(eckb)); 29 | } 30 | 31 | public void addPatternRule(EntityCentricKBDBpedia eckb, String topic) { 32 | if (topic != null) { 33 | this.ruleChain.add(new PatternRule(eckb)); 34 | } 35 | } 36 | 37 | public void addContextRule(EntityCentricKBDBpedia eckb) { 38 | this.ruleChain.add(new ContextRule(eckb)); 39 | } 40 | 41 | public void performRuleChainBeforeCandidateSelection(List rep) { 42 | for (AbstractRule r : ruleChain) { 43 | r.applyRule(rep); 44 | } 45 | } 46 | } 47 | -------------------------------------------------------------------------------- 
/doser-dis-disambiguationserver/src/main/webapp/WEB-INF/dispatcher-servlet.xml: -------------------------------------------------------------------------------- 1 | 2 | 11 | 12 | 13 | 14 | 15 | 16 | 18 | 19 | 20 | 21 | 22 | 23 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/tools/ServiceQueries.java: -------------------------------------------------------------------------------- 1 | package doser.tools; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.http.Header; 6 | import org.apache.http.HttpEntity; 7 | import org.apache.http.HttpResponse; 8 | import org.apache.http.client.ClientProtocolException; 9 | import org.apache.http.client.methods.HttpPost; 10 | import org.apache.http.entity.AbstractHttpEntity; 11 | import org.apache.http.impl.client.DefaultHttpClient; 12 | import org.apache.http.util.EntityUtils; 13 | import org.apache.log4j.Logger; 14 | 15 | /** 16 | * Class providing queries for different services. 
Integrated so far: DbPedia 17 | * Spotlight 18 | * 19 | * @author Stefan Zwicklbauer 20 | * 21 | */ 22 | public class ServiceQueries { 23 | 24 | public static String httpPostRequest(String uri, AbstractHttpEntity entity, 25 | Header[] header) { 26 | DefaultHttpClient httpclient = new DefaultHttpClient(); 27 | HttpPost httppost = new HttpPost(uri); 28 | httppost.setHeaders(header); 29 | httppost.setEntity(entity); 30 | 31 | HttpResponse response; 32 | StringBuffer buffer = new StringBuffer(); 33 | try { 34 | response = httpclient.execute(httppost); 35 | HttpEntity ent = response.getEntity(); 36 | 37 | buffer.append(EntityUtils.toString(ent)); 38 | httpclient.getConnectionManager().shutdown(); 39 | 40 | } catch (ClientProtocolException e) { 41 | Logger.getRootLogger().error("HTTPClient error", e); 42 | } catch (IOException e) { 43 | Logger.getRootLogger().error("HTTPClient error", e); 44 | } 45 | return buffer.toString(); 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/backend/AbstractDisambiguationTask.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.backend; 2 | 3 | import java.util.List; 4 | 5 | import doser.entitydisambiguation.dpo.Response; 6 | import doser.entitydisambiguation.knowledgebases.AbstractKnowledgeBase; 7 | import doser.entitydisambiguation.knowledgebases.KnowledgeBaseIdentifiers; 8 | 9 | public abstract class AbstractDisambiguationTask { 10 | 11 | protected int returnNr; 12 | 13 | protected AbstractKnowledgeBase kb; 14 | 15 | protected KnowledgeBaseIdentifiers kbIdentifier; 16 | 17 | protected boolean retrieveDocClasses; 18 | 19 | protected List responses; 20 | 21 | public int getReturnNr() { 22 | return returnNr; 23 | } 24 | 25 | public void setReturnNr(int returnNr) { 26 | this.returnNr = returnNr; 27 | } 28 | 29 | public AbstractKnowledgeBase getKb() { 30 | return 
kb; 31 | } 32 | 33 | public void setKb(AbstractKnowledgeBase kb) { 34 | this.kb = kb; 35 | } 36 | 37 | public KnowledgeBaseIdentifiers getKbIdentifier() { 38 | return this.kbIdentifier; 39 | } 40 | 41 | public boolean isRetrieveDocClasses() { 42 | return retrieveDocClasses; 43 | } 44 | 45 | public void setRetrieveDocClasses(boolean retrieveDocClasses) { 46 | this.retrieveDocClasses = retrieveDocClasses; 47 | } 48 | 49 | public List getResponse() { 50 | return responses; 51 | } 52 | 53 | public void setResponse(List responses) { 54 | this.responses = responses; 55 | } 56 | 57 | public abstract void setKbIdentifier(String kbversion, String setting); 58 | } 59 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/word2vec/Data.java: -------------------------------------------------------------------------------- 1 | package doser.word2vec; 2 | 3 | public class Data { 4 | 5 | private String surfaceForm; 6 | private String qryNr; 7 | private String[] candidates; 8 | private String context; 9 | // private String entity; 10 | 11 | public String getSurfaceForm() { 12 | return surfaceForm; 13 | } 14 | 15 | public void setSurfaceForm(String surfaceForm) { 16 | this.surfaceForm = surfaceForm; 17 | } 18 | 19 | public String getQryNr() { 20 | return qryNr; 21 | } 22 | 23 | public void setQryNr(String qryNr) { 24 | this.qryNr = qryNr; 25 | } 26 | 27 | public String[] getCandidates() { 28 | return candidates; 29 | } 30 | 31 | public void setCandidates(String[] candidates) { 32 | this.candidates = candidates; 33 | } 34 | 35 | public String getContext() { 36 | return context; 37 | } 38 | 39 | public void setContext(String context) { 40 | this.context = context; 41 | } 42 | 43 | // public String getEntity() { 44 | // return entity; 45 | // } 46 | // 47 | // public void setEntity(String entity) { 48 | // this.entity = entity; 49 | // } 50 | // 51 | // @Override 52 | // public int hashCode() { 53 | // return 
surfaceForm.hashCode() + qryNr.hashCode() + context.hashCode() 54 | // + entity.hashCode(); 55 | // 56 | // } 57 | // 58 | // @Override 59 | // public boolean equals(Object obj) { 60 | // Data data = (Data) obj; 61 | // if (this.surfaceForm.equals(data.getSurfaceForm()) 62 | // && this.context.equals(data.getSurfaceForm()) 63 | // && this.qryNr.equals(data.getQryNr()) 64 | // && this.entity.equals(data.getEntity())) { 65 | // return true; 66 | // } 67 | // return false; 68 | // } 69 | } 70 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/AbstractDisambiguationAlgorithm.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.algorithms; 2 | 3 | import doser.entitydisambiguation.backend.AbstractDisambiguationTask; 4 | 5 | public abstract class AbstractDisambiguationAlgorithm { 6 | 7 | protected AbstractDisambiguationTask task; 8 | 9 | public void disambiguate(AbstractDisambiguationTask task) 10 | throws IllegalDisambiguationAlgorithmInputException { 11 | if (checkAndSetInputParameter(task)) { 12 | if (preDisambiguation()) { 13 | processAlgorithm(); 14 | } 15 | } else { 16 | throw new IllegalDisambiguationAlgorithmInputException( 17 | "Check your input knowledge base and disambiguation task"); 18 | } 19 | } 20 | 21 | public static String extractContext(int position, String text, 22 | int contextarea) { 23 | if(text == null || text.length() == 0) { 24 | return ""; 25 | } 26 | 27 | long startArea = position - contextarea; 28 | long endArea = position + contextarea; 29 | if (startArea < 0) { 30 | startArea = 0; 31 | } 32 | if (endArea > text.length() - 1) { 33 | endArea = text.length() - 1; 34 | } 35 | String tempText = text.substring((int) startArea, (int) endArea); 36 | String[] splitter = tempText.split(" "); 37 | String result = ""; 38 | for (int i = 1; i < splitter.length - 1; i++) { 39 | result 
+= splitter[i] + " "; 40 | } 41 | return result; 42 | } 43 | 44 | protected abstract boolean checkAndSetInputParameter(AbstractDisambiguationTask task); 45 | 46 | protected abstract void processAlgorithm() 47 | throws IllegalDisambiguationAlgorithmInputException; 48 | 49 | protected abstract boolean preDisambiguation(); 50 | } -------------------------------------------------------------------------------- /doser-dis-extensions/src/main/java/doser/general/HelpfulMethods.java: -------------------------------------------------------------------------------- 1 | package doser.general; 2 | 3 | import java.util.Collections; 4 | import java.util.Comparator; 5 | import java.util.LinkedList; 6 | import java.util.List; 7 | import java.util.Map; 8 | 9 | public final class HelpfulMethods { 10 | 11 | /** 12 | * Sorts a Map by value 13 | * 14 | * Partially buggy due to 15 | * http://stackoverflow.com/questions/109383/how-to-sort 16 | * -a-mapkey-value-on-the-values-in-java/1283722#1283722 17 | * 18 | * @param map 19 | * @return SortedMap by Value 20 | */ 21 | @Deprecated 22 | public static > List> sortByValue( 23 | final Map map) { 24 | final List> list = new LinkedList>( 25 | map.entrySet()); 26 | Collections.sort(list, new Comparator>() { 27 | @Override 28 | public int compare(final Map.Entry op1, 29 | final Map.Entry op2) { 30 | return (op2.getValue()).compareTo(op1.getValue()); 31 | } 32 | }); 33 | return list; 34 | } 35 | 36 | /** 37 | * Correct Map Sorting with Guava 38 | * 39 | */ 40 | // public static > List> sortByValueGuava( 41 | // Map map) { 42 | // // final List sortedKeys = 43 | // // Ordering.natural().onResultOf(Functions.forMap(map)).immutableSortedCopy(map.keySet()); 44 | // 45 | // Comparator> byMapValues = new Ordering>() { 46 | // @Override 47 | // public int compare(Map.Entry left, Map.Entry right) { 48 | // return left.getValue().compareTo(right.getValue()); 49 | // } 50 | // }; 51 | // 52 | // List> entryList = Lists.newArrayList(map.entrySet()); 53 | // 
Collections.sort(entryList, byMapValues); 54 | // return entryList; 55 | // } 56 | } 57 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/backend/DisambiguationTaskSingle.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.backend; 2 | 3 | import doser.entitydisambiguation.dpo.EntityDisambiguationDPO; 4 | import doser.entitydisambiguation.knowledgebases.KnowledgeBaseIdentifiers; 5 |
/**
 * Disambiguation task for exactly one surface form.
 */
public class DisambiguationTaskSingle extends AbstractDisambiguationTask {

	private EntityDisambiguationDPO entityToDis;

	/**
	 * @param entityToDis
	 *            the surface form this task should disambiguate
	 */
	public DisambiguationTaskSingle(final EntityDisambiguationDPO entityToDis) {
		super();
		this.entityToDis = entityToDis;
		this.retrieveDocClasses = false;
	}

	public EntityDisambiguationDPO getEntityToDisambiguate() {
		return this.entityToDis;
	}

	public void setSurfaceForm(final EntityDisambiguationDPO surfaceForm) {
		this.entityToDis = surfaceForm;
	}

	/**
	 * Assignment function to determine the used knowledge base.
	 *
	 * Both arguments may be null. Comparisons are written constant-first so a
	 * missing value falls through to the default of its branch instead of
	 * raising a NullPointerException.
	 *
	 * @param kbversion
	 *            knowledge base version; only consulted for the "EntityCentric"
	 *            setting ("cstable" and "biomedcopy" select dedicated knowledge
	 *            bases, anything else the standard one)
	 * @param setting
	 *            "DocumentCentric" or "EntityCentric" (case-insensitive); any
	 *            other value, including null, selects the standard knowledge
	 *            base
	 */
	@Override
	public void setKbIdentifier(String kbversion, String setting) {
		if ("DocumentCentric".equalsIgnoreCase(setting)) {
			// Every kbversion maps to the same document-centric knowledge base.
			this.kbIdentifier = KnowledgeBaseIdentifiers.DocumentCentricDefault;
		} else if ("EntityCentric".equalsIgnoreCase(setting)) {
			if ("cstable".equalsIgnoreCase(kbversion)) {
				this.kbIdentifier = KnowledgeBaseIdentifiers.CSTable;
			} else if ("biomedcopy".equalsIgnoreCase(kbversion)) {
				// NOTE(review): the collective task matches "biomed" here -
				// confirm that "biomedcopy" is intended for single tasks.
				this.kbIdentifier = KnowledgeBaseIdentifiers.Biomed;
			} else {
				this.kbIdentifier = KnowledgeBaseIdentifiers.Standard;
			}
		} else {
			this.kbIdentifier = KnowledgeBaseIdentifiers.Standard;
		}
	}
}
55 | -------------------------------------------------------------------------------- /doser-dis-extensions/src/main/java/doser/nlp/NLPTools.java: -------------------------------------------------------------------------------- 1 | package doser.nlp; 2 | 3 | import java.util.List; 4 | import java.util.Properties; 5 | 6 | //import edu.stanford.nlp.ling.CoreAnnotations; 7 | //import edu.stanford.nlp.ling.CoreLabel; 8 | //import edu.stanford.nlp.pipeline.Annotation; 9 | //import edu.stanford.nlp.pipeline.StanfordCoreNLP; 10 | //import edu.stanford.nlp.util.Pair; 11 | // 12 | // 13 | //public class NLPTools { 14 | // 15 | // private static volatile NLPTools instance; 16 | // 17 | // private StanfordCoreNLP pipeline; 18 | // 19 | // private NLPTools() { 20 | // super(); 21 | // Properties props = new Properties(); 22 | // props.put("annotators", "tokenize, ssplit, pos, lemma, stopword"); 23 | // props.setProperty("customAnnotatorClass.stopword", 24 | // "doser.nlp.StopWordAnnotator"); 25 | // props.setProperty(StopWordAnnotator.STOPWORDS_LIST, StopWordAnnotator.customStopWordList); 26 | // props.setProperty(StopWordAnnotator.CHECK_LEMMA, "true"); 27 | // 28 | // this.pipeline = new StanfordCoreNLP(props); 29 | // } 30 | // 31 | // public static NLPTools getInstance() { 32 | // if (instance == null ) { 33 | // synchronized (NLPTools.class) { 34 | // if (instance == null) { 35 | // instance = new NLPTools(); 36 | // } 37 | // } 38 | // } 39 | // return instance; 40 | // } 41 | // 42 | // public String performLemmatizationAndStopWordRemoval(String str) { 43 | // Annotation document = new Annotation(str); 44 | // this.pipeline.annotate(document); 45 | // List tokens = document 46 | //
.get(CoreAnnotations.TokensAnnotation.class); 47 | // StringBuilder builder = new StringBuilder(); 48 | // for (CoreLabel token : tokens) { 49 | // Pair stopword = token.get(StopWordAnnotator.class); 50 | // String lemma = token.lemma().toLowerCase(); 51 | // if(!stopword.first()) { 52 | // builder.append(lemma); 53 | // builder.append(" "); 54 | // } 55 | // } 56 | // return builder.toString().trim(); 57 | // } 58 | //} 59 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/dpo/DisambiguationRequest.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.dpo; 2 | 3 | import java.util.List; 4 |
/**
 * Request bean for a disambiguation call. Example payload:
 *
 * <pre>
 * { "documentUri":"unique document id", "surfaceFormsToDisambiguate": [ {
 * "selectedText":"influenza", "context":
 * "Typically, influenza is transmitted through the air by coughs or sneezes, creating aerosols containing the virus."
 * , "position": { "pageId":0, "offsets":[1,2,3,5,6,7],
 * "boundingBox":{"minx":0.1, "miny":0.3, "maxx":0.01, "maxy":0.03} } } ],
 * "alreadyDisambiguatedEntities":[ { "text":"Illness",
 * "entityUri":"http://en.dbpedia.org/page/Illness", "confidence": 0.90,
 * "distance": 300 }, { "text":"Desease",
 * "entityUri":"http://en.dbpedia.org/page/Desease", "confidence": 0.65,
 * "distance": 500 } ] }
 * </pre>
 *
 * Version 2.0 is used for additional testing. The current version offers the
 * usage of a position array in surfaceFormsToDisambiguate.
 *
 * Note that this class only carries documentUri, surfaceFormsToDisambiguate,
 * docsToReturn and mainTopic; other keys shown in the example have no field
 * here.
 *
 * @author Stefan Zwicklbauer
 *
 */
public class DisambiguationRequest {
	private String documentUri;
	private List surfaceFormsToDisambiguate;
	private Integer docsToReturn;
	private String mainTopic;

	// --- documentUri ---

	public String getDocumentUri() {
		return documentUri;
	}

	public void setDocumentUri(final String documentUri) {
		this.documentUri = documentUri;
	}

	// --- surfaceFormsToDisambiguate ---

	public List getSurfaceFormsToDisambiguate() {
		return surfaceFormsToDisambiguate;
	}

	public void setSurfaceFormsToDisambiguate(
			final List surfaceFormsToDisambiguate) {
		this.surfaceFormsToDisambiguate = surfaceFormsToDisambiguate;
	}

	// --- docsToReturn ---

	public Integer getDocsToReturn() {
		return this.docsToReturn;
	}

	public void setDocsToReturn(Integer docsToReturn) {
		this.docsToReturn = docsToReturn;
	}

	// --- mainTopic ---

	public String getMainTopic() {
		return this.mainTopic;
	}

	public void setMainTopic(String mainTopic) {
		this.mainTopic = mainTopic;
	}
}
62 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/DisambiguationHandler.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.algorithms; 2 | 3 | import doser.entitydisambiguation.algorithms.collective.dbpedia.CollectiveDisambiguationDBpediaEntities; 4 | import doser.entitydisambiguation.algorithms.collective.general.CollectiveDisambiguationGeneralEntities; 5 | import doser.entitydisambiguation.backend.AbstractDisambiguationTask; 6 | import doser.entitydisambiguation.backend.DisambiguationTaskSingle; 7 | import doser.entitydisambiguation.dpo.EntityDisambiguationDPO; 8 | import doser.entitydisambiguation.knowledgebases.KnowledgeBaseIdentifiers; 9 | 10 |
public class DisambiguationHandler { 11 | 12 | private static final DisambiguationHandler instance; 13 | 14 | static { 15 | try { 16 | instance = new DisambiguationHandler(); 17 | } catch (Exception e) { 18 | throw new RuntimeException("An error occurred!", e); 19 | } 20 | } 21 | 22 | private DisambiguationHandler() { 23 | super(); 24 | } 25 | 26 | public static DisambiguationHandler getInstance() { 27 | return instance; 28 | } 29 | 30 | public AbstractDisambiguationAlgorithm getAlgorithm(AbstractDisambiguationTask task) { 31 | AbstractDisambiguationAlgorithm algorithm = null; 32 | if (task instanceof DisambiguationTaskSingle) { 33 | DisambiguationTaskSingle t = (DisambiguationTaskSingle) task; 34 | EntityDisambiguationDPO dpo = t.getEntityToDisambiguate(); 35 | if ((dpo.getSetting() != null 36 | && (dpo.getSetting().equalsIgnoreCase("NoContext")) 37 | || dpo.getContext() == null || dpo.getContext().equals("") || dpo 38 | .getContext().equals(" "))) { 39 | algorithm = new EntityCentricAlgorithmTableDefault(); 40 | } else if ((dpo.getSetting() != null) 41 | && (dpo.getSetting().equalsIgnoreCase("DocumentCentric"))) { 42 | algorithm = new DocumentCentricAlgorithmDefault(); 43 | } else { 44 | algorithm = new EntityCentricAlgorithmDefault(); 45 | } 46 | } else { 47 | if (task.getKbIdentifier().equals(KnowledgeBaseIdentifiers.Biomed)) { 48 | algorithm = new CollectiveDisambiguationGeneralEntities(); 49 | } else { 50 | algorithm = new CollectiveDisambiguationDBpediaEntities(); 51 | } 52 | } 53 | return algorithm; 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/word2vec/Word2VecJsonFormat.java: -------------------------------------------------------------------------------- 1 | package doser.word2vec; 2 | 3 | import java.io.IOException; 4 | import java.util.Set; 5 | 6 | import org.apache.http.Header; 7 | import org.apache.http.entity.ByteArrayEntity; 8 | import 
org.apache.http.entity.ContentType; 9 | import org.apache.http.message.BasicHeader; 10 | import org.codehaus.jackson.map.ObjectMapper; 11 | import org.codehaus.jettison.json.JSONArray; 12 | import org.codehaus.jettison.json.JSONException; 13 | import org.codehaus.jettison.json.JSONObject; 14 | import org.slf4j.Logger; 15 | import org.slf4j.LoggerFactory; 16 | 17 | import doser.entitydisambiguation.properties.Properties; 18 | import doser.tools.ServiceQueries; 19 |
/**
 * Request payload for the word2vec service (a domain plus a set of data
 * entries) together with a helper that posts such a payload.
 */
public class Word2VecJsonFormat {

	private final static Logger logger = LoggerFactory.getLogger(Word2VecJsonFormat.class);

	private String domain;
	private Set data;

	public Set getData() {
		return data;
	}

	public void setData(Set data) {
		this.data = data;
	}

	public String getDomain() {
		return domain;
	}

	public void setDomain(String domain) {
		this.domain = domain;
	}

	/**
	 * Serializes {@code json} with Jackson, posts it to the configured
	 * word2vec service and returns the "data" array of the JSON answer.
	 *
	 * @param json
	 *            object to serialize as the request body
	 * @param serviceEndpoint
	 *            path appended to the configured word2vec service URL
	 * @return the "data" array of the response, or null if serialization, the
	 *         request, or parsing of the response failed (failures are logged)
	 */
	public static JSONArray performquery(Object json, String serviceEndpoint) {
		final ObjectMapper mapper = new ObjectMapper();
		JSONArray result = null;
		try {
			final String jsonString = mapper.writeValueAsString(json);
			final Header[] headers = { new BasicHeader("Accept", "application/json"),
					new BasicHeader("content-type", "application/json") };
			// Encode explicitly as UTF-8; the no-arg getBytes() depends on the
			// platform default charset and can corrupt non-ASCII entity names.
			final ByteArrayEntity ent = new ByteArrayEntity(
					jsonString.getBytes(java.nio.charset.StandardCharsets.UTF_8),
					ContentType.create("application/json"));
			final String resStr = ServiceQueries.httpPostRequest(
					(Properties.getInstance().getWord2VecService() + serviceEndpoint), ent, headers);
			if (resStr == null) {
				// NOTE(review): assumes httpPostRequest may return null on
				// failure - confirm against ServiceQueries.
				logger.error("Empty response in " + Word2VecJsonFormat.class.getName());
				return null;
			}
			try {
				result = new JSONObject(resStr).getJSONArray("data");
			} catch (JSONException e) {
				logger.error("JsonException in "+Word2VecJsonFormat.class.getName(), e);
			}
		} catch (IOException e) {
			logger.error("IOException in "+Word2VecJsonFormat.class.getName(), e);
		}
		return result;
	}
}
68 |
-------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/backend/DisambiguationTaskCollective.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.backend; 2 | 3 | import java.util.List; 4 | 5 | import doser.entitydisambiguation.dpo.EntityDisambiguationDPO; 6 | import doser.entitydisambiguation.knowledgebases.KnowledgeBaseIdentifiers; 7 |
/**
 * Disambiguation task covering several surface forms that are resolved
 * together.
 */
public class DisambiguationTaskCollective extends AbstractDisambiguationTask {

	private List<EntityDisambiguationDPO> entitiesToDis;

	/* A maintopic e.g. the column identifier in a table */
	private String mainTopic;

	/**
	 * @param entityToDis
	 *            the surface forms to disambiguate collectively
	 * @param mainTopic
	 *            a main topic, e.g. the column identifier in a table
	 */
	public DisambiguationTaskCollective(final List<EntityDisambiguationDPO> entityToDis, String mainTopic) {
		super();
		this.entitiesToDis = entityToDis;
		this.mainTopic = mainTopic;
	}

	public List<EntityDisambiguationDPO> getEntityToDisambiguate() {
		return this.entitiesToDis;
	}

	public String getMainTopic() {
		return this.mainTopic;
	}

	public void setSurfaceForm(final List<EntityDisambiguationDPO> surfaceForm) {
		this.entitiesToDis = surfaceForm;
	}

	/**
	 * Assignment function to determine the used knowledge base.
	 *
	 * Both arguments may be null. Comparisons are written constant-first so a
	 * missing value falls through to the default of its branch instead of
	 * raising a NullPointerException.
	 *
	 * @param kbversion
	 *            knowledge base version; only consulted for the "EntityCentric"
	 *            setting ("cstable" and "biomed" select dedicated knowledge
	 *            bases, anything else the standard one)
	 * @param setting
	 *            "DocumentCentric" or "EntityCentric" (case-insensitive); any
	 *            other value, including null, selects the standard knowledge
	 *            base
	 */
	@Override
	public void setKbIdentifier(String kbversion, String setting) {
		if ("DocumentCentric".equalsIgnoreCase(setting)) {
			// Every kbversion maps to the same document-centric knowledge base.
			this.kbIdentifier = KnowledgeBaseIdentifiers.DocumentCentricDefault;
		} else if ("EntityCentric".equalsIgnoreCase(setting)) {
			if ("cstable".equalsIgnoreCase(kbversion)) {
				this.kbIdentifier = KnowledgeBaseIdentifiers.CSTable;
			} else if ("biomed".equalsIgnoreCase(kbversion)) {
				this.kbIdentifier = KnowledgeBaseIdentifiers.Biomed;
			} else {
				this.kbIdentifier = KnowledgeBaseIdentifiers.Standard;
			}
		} else {
			this.kbIdentifier = KnowledgeBaseIdentifiers.Standard;
		}
	}
}
64 | -------------------------------------------------------------------------------- /doser-dis-extensions/src/main/java/doser/lucene/analysis/DoserStandardTokenizer.java: -------------------------------------------------------------------------------- 1 | package doser.lucene.analysis; 2 | 3 | /* 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | import java.io.Reader; 21 | 22 | import org.apache.lucene.analysis.Tokenizer; 23 | import org.apache.lucene.analysis.util.CharTokenizer; 24 | import org.apache.lucene.util.AttributeFactory; 25 | 26 | public final class DoserStandardTokenizer extends CharTokenizer { 27 | 28 | /** 29 | * Construct a new WhitespaceTokenizer using a given 30 | * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}.
31 | * 32 | * @param factory 33 | * the attribute factory to use for this {@link Tokenizer} 34 | * @param in 35 | * the input to split up into tokens 36 | */ 37 | public DoserStandardTokenizer(AttributeFactory factory, Reader in) { 38 | super(factory, in); 39 | } 40 | 41 | /** 42 | * Construct a new WhitespaceTokenizer. * @param matchVersion Lucene version 43 | * to match See {@link above} 44 | * 45 | * @param in 46 | * the input to split up into tokens 47 | */ 48 | public DoserStandardTokenizer(Reader in) { 49 | super(in); 50 | } 51 | 52 | /** 53 | * Collects only characters which do not satisfy 54 | * {@link Character#isWhitespace(int)}. 55 | */ 56 | @Override 57 | protected boolean isTokenChar(int c) { 58 | boolean check = true; 59 | if (Character.isWhitespace(c) || c == 46) { 60 | check = false; 61 | } 62 | return check; 63 | } 64 | } -------------------------------------------------------------------------------- /doser-dis-extensions/src/main/java/doser/lucene/features/LuceneFeatures.java: -------------------------------------------------------------------------------- 1 | package doser.lucene.features; 2 | 3 | import java.util.Locale; 4 | 5 | import org.apache.lucene.index.Term; 6 | import org.apache.lucene.search.Query; 7 | import org.apache.lucene.search.BooleanClause.Occur; 8 | import org.apache.lucene.search.similarities.Similarity; 9 | 10 | import doser.lucene.query.LTRBooleanQuery; 11 | import doser.lucene.query.LearnToRankFuzzyQuery; 12 | import doser.lucene.query.LearnToRankTermQuery; 13 | import doser.lucene.query.PriorQuery; 14 | import doser.lucene.query.SensePriorQuery; 15 | 16 | public class LuceneFeatures { 17 | 18 | public static Query queryLabelTerm(String keyword, String field, 19 | Similarity sim) { 20 | final LearnToRankTermQuery q = new LearnToRankTermQuery(new Term(field, 21 | keyword.toLowerCase(Locale.US)), sim); 22 | return q; 23 | } 24 | 25 | public static Query queryLabelFuzzy(String keyword, String field, 26 | Similarity sim) { 27 
| final LearnToRankFuzzyQuery q = new LearnToRankFuzzyQuery(new Term( 28 | field, keyword.toLowerCase(Locale.US)), sim); 29 | return q; 30 | } 31 | 32 | 33 | public static Query queryStringTerm(String str, String field, 34 | Similarity sim, Occur occ, int maxclause) { 35 | 36 | final String[] split = str.split(" "); 37 | final LTRBooleanQuery bquery = new LTRBooleanQuery(); 38 | for (final String element : split) { 39 | final LearnToRankTermQuery tquery = new LearnToRankTermQuery( 40 | new Term(field, element.toLowerCase(Locale.US)), sim); 41 | bquery.add(tquery, occ); 42 | } 43 | return bquery; 44 | } 45 | 46 | public static Query queryStringFuzzy(String str, String field, 47 | Similarity sim, Occur occ, int maxclause) { 48 | 49 | final String[] split = str.split(" "); 50 | final LTRBooleanQuery bquery = new LTRBooleanQuery(); 51 | for (final String element : split) { 52 | final LearnToRankFuzzyQuery tquery = new LearnToRankFuzzyQuery( 53 | new Term(field, element.toLowerCase(Locale.US)), sim); 54 | bquery.add(tquery, occ); 55 | 56 | } 57 | return bquery; 58 | } 59 | 60 | 61 | public static Query queryPrior(IEntityCentricExtFeatures kb) { 62 | return new PriorQuery(kb); 63 | } 64 | 65 | public static Query querySensePrior(String str, IEntityCentricExtFeatures kb) { 66 | return new SensePriorQuery(str, kb); 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/dbpedia/TableColumnFilter.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.algorithms.collective.dbpedia; 2 | 3 | import java.io.IOException; 4 | import java.util.List; 5 | 6 | import org.apache.lucene.index.IndexReader; 7 | import org.apache.lucene.index.Term; 8 | import org.apache.lucene.search.BooleanQuery; 9 | import org.apache.lucene.search.IndexSearcher; 10 | import org.apache.lucene.search.ScoreDoc; 11 | 
import org.apache.lucene.search.TopDocs; 12 | import org.apache.lucene.search.BooleanClause.Occur; 13 | 14 | import doser.entitydisambiguation.algorithms.SurfaceForm; 15 | import doser.entitydisambiguation.knowledgebases.EntityCentricKBDBpedia; 16 | import doser.lucene.query.TermQuery; 17 |
/**
 * Resolves surface forms by keeping the single candidate whose
 * "LongDescription" field matches a given topic.
 */
public class TableColumnFilter {

	private EntityCentricKBDBpedia eckb;
	private String topic;

	TableColumnFilter(EntityCentricKBDBpedia eckb, String topic) {
		super();
		this.eckb = eckb;
		this.topic = topic;
	}

	/**
	 * For every surface form that has candidates, queries the index and, if
	 * exactly one candidate matches the topic, stores it as the disambiguated
	 * entity. Surface forms without a unique match are left untouched.
	 *
	 * @param reps
	 *            the surface forms to filter
	 */
	public void filter(List<SurfaceForm> reps) {
		for (SurfaceForm sf : reps) {
			List<String> candidates = sf.getCandidates();
			if (candidates.isEmpty()) {
				continue;
			}
			String match = performLuceneQuery(candidates, topic);
			if (match != null) {
				sf.setDisambiguatedEntity(match);
			}
		}
	}

	/**
	 * Searches for documents whose "Mainlink" is one of the candidates and
	 * whose "LongDescription" contains the topic term.
	 *
	 * @return the "Mainlink" of the hit when there is exactly one hit;
	 *         otherwise null (also when the search or document load fails)
	 */
	private String performLuceneQuery(List<String> candidates, String topic) {
		IndexSearcher searcher = eckb.getSearcher();
		IndexReader reader = searcher.getIndexReader();

		BooleanQuery candidateq = new BooleanQuery();
		for (String can : candidates) {
			candidateq.add(new TermQuery(new Term("Mainlink", can)), Occur.SHOULD);
		}
		BooleanQuery q = new BooleanQuery();
		q.add(candidateq, Occur.MUST);
		q.add(new TermQuery(new Term("LongDescription", topic)), Occur.MUST);

		try {
			ScoreDoc[] scoredocs = searcher.search(q, candidates.size()).scoreDocs;
			// Only an unambiguous match counts.
			if (scoredocs.length == 1) {
				return reader.document(scoredocs[0].doc).get("Mainlink");
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
		return null;
	}
}
71 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/dpo/EntityDisambiguationDPO.java:
-------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.dpo; 2 | 3 | 4 | /** 5 | * Represents surfaceform which should be disambiguated. Positions is used as an 6 | * intern id, which is necessary during feedback processing later. 7 | * 8 | * Version 2.0 One position of a surface form might be not enough. Version 2 9 | * offers the possibility to send an array of position. 10 | * 11 | * Version 3.0 A new field InterDisambiguationSetting flags the kind of 12 | * Disambiguation. This can be one of the following Types: - Standard Entity 13 | * Disambiguation with context - Standard Entity Disambiguation without context 14 | * - Entity Disambiguation without context on specialized domain (i.e. tables) 15 | * 16 | * Version 4.0 KnowledgeBaseIdentifier allows to select a specific knowledge 17 | * base for each disambiguation algorithm. This option should only be used if 18 | * the user is aware of what he is doing. Additionally the user is able to get 19 | * the lucene documents of disambiguated entities. 
20 | * 21 | * 22 | * @author Stefan Zwicklbauer 23 | * 24 | */
public class EntityDisambiguationDPO {

	private String documentId;
	private String context;
	private String selectedText;
	private String setting;
	private String kbversion;
	private int startPosition;

	public EntityDisambiguationDPO() {
	}

	// --- context ---

	public String getContext() {
		return context;
	}

	public void setContext(final String context) {
		this.context = context;
	}

	// --- selectedText ---

	public String getSelectedText() {
		return selectedText;
	}

	public void setSelectedText(final String selectedText) {
		this.selectedText = selectedText;
	}

	// --- setting ---

	public String getSetting() {
		return this.setting;
	}

	public void setSetting(final String setting) {
		this.setting = setting;
	}

	/** Writes the same field as {@link #setSetting(String)}. */
	public void setInternSetting(final String setting) {
		this.setting = setting;
	}

	// --- documentId ---

	public String getDocumentId() {
		return documentId;
	}

	public void setDocumentId(final String documentId) {
		this.documentId = documentId;
	}

	// --- kbversion ---

	public String getKbversion() {
		return this.kbversion;
	}

	public void setKbversion(String kbversion) {
		this.kbversion = kbversion;
	}

	// --- startPosition ---

	public int getStartPosition() {
		return this.startPosition;
	}

	public void setStartPosition(int startPosition) {
		this.startPosition = startPosition;
	}
}
-------------------------------------------------------------------------------- /doser-dis-extensions/.settings/org.eclipse.jdt.ui.prefs: -------------------------------------------------------------------------------- 1 | cleanup.add_default_serial_version_id=true 2 | cleanup.add_generated_serial_version_id=false 3 | cleanup.add_missing_annotations=true 4 | cleanup.add_missing_deprecated_annotations=true 5 | cleanup.add_missing_methods=false 6 |
cleanup.add_missing_nls_tags=false 7 | cleanup.add_missing_override_annotations=true 8 | cleanup.add_missing_override_annotations_interface_methods=true 9 | cleanup.add_serial_version_id=false 10 | cleanup.always_use_blocks=true 11 | cleanup.always_use_parentheses_in_expressions=false 12 | cleanup.always_use_this_for_non_static_field_access=false 13 | cleanup.always_use_this_for_non_static_method_access=false 14 | cleanup.convert_to_enhanced_for_loop=true 15 | cleanup.correct_indentation=true 16 | cleanup.format_source_code=true 17 | cleanup.format_source_code_changes_only=false 18 | cleanup.make_local_variable_final=true 19 | cleanup.make_parameters_final=false 20 | cleanup.make_private_fields_final=true 21 | cleanup.make_type_abstract_if_missing_method=false 22 | cleanup.make_variable_declarations_final=false 23 | cleanup.never_use_blocks=false 24 | cleanup.never_use_parentheses_in_expressions=true 25 | cleanup.organize_imports=true 26 | cleanup.qualify_static_field_accesses_with_declaring_class=false 27 | cleanup.qualify_static_member_accesses_through_instances_with_declaring_class=true 28 | cleanup.qualify_static_member_accesses_through_subtypes_with_declaring_class=true 29 | cleanup.qualify_static_member_accesses_with_declaring_class=true 30 | cleanup.qualify_static_method_accesses_with_declaring_class=false 31 | cleanup.remove_private_constructors=true 32 | cleanup.remove_trailing_whitespaces=true 33 | cleanup.remove_trailing_whitespaces_all=true 34 | cleanup.remove_trailing_whitespaces_ignore_empty=false 35 | cleanup.remove_unnecessary_casts=true 36 | cleanup.remove_unnecessary_nls_tags=true 37 | cleanup.remove_unused_imports=true 38 | cleanup.remove_unused_local_variables=false 39 | cleanup.remove_unused_private_fields=true 40 | cleanup.remove_unused_private_members=false 41 | cleanup.remove_unused_private_methods=true 42 | cleanup.remove_unused_private_types=true 43 | cleanup.sort_members=true 44 | cleanup.sort_members_all=true 45 | cleanup.use_blocks=true 
46 | cleanup.use_blocks_only_for_return_and_throw=false 47 | cleanup.use_parentheses_in_expressions=false 48 | cleanup.use_this_for_non_static_field_access=true 49 | cleanup.use_this_for_non_static_field_access_only_if_necessary=true 50 | cleanup.use_this_for_non_static_method_access=true 51 | cleanup.use_this_for_non_static_method_access_only_if_necessary=true 52 | cleanup_profile=_Doser Code Profile 53 | cleanup_settings_version=2 54 | eclipse.preferences.version=1 55 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/knowledgebases/EntityCentricKBDBpedia.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.knowledgebases; 2 | 3 | import java.util.List; 4 | 5 | import org.apache.lucene.search.similarities.Similarity; 6 | 7 | public class EntityCentricKBDBpedia extends AbstractEntityCentricKBGeneral { 8 | 9 | public EntityCentricKBDBpedia(String uri, boolean dynamic) { 10 | super(uri, dynamic); 11 | } 12 | 13 | public EntityCentricKBDBpedia(String uri, boolean dynamic, Similarity sim) { 14 | super(uri, dynamic, sim); 15 | } 16 | 17 | /** 18 | * Takes a set of dbpedia entities as well as a target entity and generates 19 | * one string that fits into the word2vec query format used in this class. 20 | * The source entities are concatenated and should be compared with the 21 | * target entity. 22 | * 23 | * @param source 24 | * a set of source entities 25 | * @param target 26 | * the target entity. 
27 | * @return String in appropriate word2vec query format 28 | */ 29 | @Override 30 | public String generateWord2VecFormatString(String source, String target) { 31 | String s = source.replaceAll("http://dbpedia.org/resource/", ""); 32 | String t = target.replaceAll("http://dbpedia.org/resource/", ""); 33 | int c = s.compareToIgnoreCase(target); 34 | String res = ""; 35 | if (c < 0) { 36 | res = s + "|" + t; 37 | } else if (c == 0) { 38 | res = s + "|" + t; 39 | } else { 40 | res = t + "|" + s; 41 | } 42 | return res; 43 | } 44 | 45 | /** 46 | * Takes a set of dbpedia entities as well as a target entity and generates 47 | * one string that fits into the word2vec query format used in this class. 48 | * The source entities are concatenated and should be compared with the 49 | * target entity. 50 | * 51 | * @param source 52 | * a set of source entities 53 | * @param target 54 | * the target entity. 55 | * @return String in appropriate word2vec query format 56 | */ 57 | @Override 58 | public String generateWord2VecFormatString(List source, String target) { 59 | StringBuilder builder = new StringBuilder(); 60 | for (String s : source) { 61 | s = s.replaceAll("http://dbpedia.org/resource/", ""); 62 | builder.append(s); 63 | builder.append("|"); 64 | } 65 | String src = builder.toString(); 66 | src = src.substring(0, src.length() - 1); 67 | String t = target.replaceAll("http://dbpedia.org/resource/", ""); 68 | return src + "|" + t; 69 | } 70 | 71 | @Override 72 | protected String generateDomainName() { 73 | return "DBpedia"; 74 | } 75 | 76 | @Override 77 | protected String kbName() { 78 | return "DBpedia KB"; 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/rules/NoCandidatesCheckPlural.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.algorithms.rules; 2 | 3 | import 
java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.LinkedList; 6 | import java.util.List; 7 | 8 | import org.apache.lucene.document.Document; 9 | import org.apache.lucene.index.IndexReader; 10 | import org.apache.lucene.search.IndexSearcher; 11 | import org.apache.lucene.search.ScoreDoc; 12 | import org.apache.lucene.search.TopDocs; 13 | import org.apache.lucene.search.similarities.DefaultSimilarity; 14 | 15 | import doser.entitydisambiguation.algorithms.SurfaceForm; 16 | import doser.entitydisambiguation.knowledgebases.AbstractKnowledgeBase; 17 | import doser.lucene.features.LuceneFeatures; 18 | import doser.lucene.query.LearnToRankClause; 19 | import doser.lucene.query.LearnToRankQuery; 20 | import doser.tools.Inflector; 21 |
/**
 * Checks whether a surface form without candidates is given in plural and, if
 * so, retries the candidate lookup with its singular form.
 *
 * @author stefan
 *
 */
class NoCandidatesCheckPlural extends AbstractRule {

	/** Number of hits requested from the index per lookup. */
	private static final int MAX_HITS = 150;

	/** A lookup is only accepted if it yields at most this many hits. */
	private static final int MAX_ACCEPTED_HITS = 5;

	NoCandidatesCheckPlural(AbstractKnowledgeBase eckb) {
		super(eckb);
	}

	/**
	 * Assigns singular-form candidates to every surface form that has none and
	 * whose singular differs from the surface form itself.
	 *
	 * @param rep
	 *            the surface forms to inspect; updated in place
	 * @return always false
	 */
	@Override
	public boolean applyRule(List<SurfaceForm> rep) {
		for (SurfaceForm r : rep) {
			if (!r.getCandidates().isEmpty()) {
				continue;
			}
			String sf = r.getSurfaceForm();
			String singular = Inflector.getInstance().singularize(sf);
			if (sf.equalsIgnoreCase(singular)) {
				continue;
			}
			// Try singular search
			ArrayList<String> lst = queryLucene(singular);
			if (!lst.isEmpty()) {
				r.setCandidates(lst);
			}
		}
		return false;
	}

	/**
	 * Looks the surface form up in the "UniqueLabel" field.
	 *
	 * @return the "Mainlink" values of the hits, or an empty list when there
	 *         are more than {@link #MAX_ACCEPTED_HITS} hits or the search fails
	 */
	private ArrayList<String> queryLucene(String surfaceForm) {
		ArrayList<String> list = new ArrayList<String>();
		final IndexSearcher searcher = eckb.getSearcher();
		final IndexReader reader = searcher.getIndexReader();
		LearnToRankQuery query = new LearnToRankQuery();
		DefaultSimilarity defaultSim = new DefaultSimilarity();
		query.add(LuceneFeatures.queryLabelTerm(surfaceForm,
				"UniqueLabel", defaultSim), "Feature1", true);
		try {
			final TopDocs top = searcher.search(query, MAX_HITS);
			final ScoreDoc[] score = top.scoreDocs;
			if (score.length <= MAX_ACCEPTED_HITS) {
				for (int i = 0; i < score.length; ++i) {
					final Document doc = reader.document(score[i].doc);
					list.add(doc.get("Mainlink"));
				}
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
		return list;
	}

}
78 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/general/CollectiveContextDriverGeneral.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.algorithms.collective.general; 2 | 3 | import java.util.LinkedList; 4 | import java.util.List; 5 | 6 | import doser.entitydisambiguation.algorithms.SurfaceForm; 7 | import doser.entitydisambiguation.algorithms.collective.CandidatePruning; 8 | import doser.entitydisambiguation.algorithms.rules.RuleAdapation; 9 | import doser.entitydisambiguation.dpo.DisambiguatedEntity; 10 | import doser.entitydisambiguation.dpo.Response; 11 | import doser.entitydisambiguation.knowledgebases.AbstractEntityCentricKBGeneral; 12 | 13 | class CollectiveContextDriverGeneral { 14 | 15 | static final int PREPROCESSINGCONTEXTSIZE = 200; 16 | 17 | private Response[] currentResponse; 18 | private List rep; 19 | private AbstractEntityCentricKBGeneral eckb; 20 | 21 | CollectiveContextDriverGeneral(Response[] res, List rep, AbstractEntityCentricKBGeneral eckb) { 22 | super(); 23 | this.currentResponse = res; 24 | this.rep = rep; 25 | this.eckb = eckb; 26 | } 27 | 28 | void solve() { 29 | // First candidate pruning 30 | CandidatePruning pruning = new CandidatePruning(eckb); 31 | pruning.prune(rep); 32 | 33 | RuleAdapation rules = new RuleAdapation(); 34 | rules.addNoCandidatesCheckPluralRule(eckb); 35 | rules.addNoCandidatesExpansionRule(eckb); 36 |
rules.performRuleChainBeforeCandidateSelection(rep); 37 | 38 | CandidateReductionGeneralW2V w2vreduction = new CandidateReductionGeneralW2V(eckb, rep, 20, 5, 125, false, false); 39 | w2vreduction.solve(); 40 | rep = w2vreduction.getRep(); 41 | 42 | w2vreduction = new CandidateReductionGeneralW2V(eckb, rep, 45, 5, 250, true, true); 43 | w2vreduction.solve(); 44 | rep = w2vreduction.getRep(); 45 | FinalEntityDisambiguation finalDis = new FinalEntityDisambiguation(eckb, rep); 46 | finalDis.setup(); 47 | finalDis.solve(); 48 | } 49 | 50 | void generateResult() { 51 | for (int i = 0; i < currentResponse.length; i++) { 52 | SurfaceForm r = search(i); 53 | if (currentResponse[i] == null && r != null && r.getCandidates().size() == 1) { 54 | Response res = new Response(); 55 | List entList = new LinkedList(); 56 | DisambiguatedEntity ent = new DisambiguatedEntity(); 57 | ent.setEntityUri(r.getCandidates().get(0)); 58 | entList.add(ent); 59 | res.setDisEntities(entList); 60 | res.setSelectedText(r.getSurfaceForm()); 61 | currentResponse[i] = res; 62 | } 63 | } 64 | } 65 | 66 | private SurfaceForm search(int qryNr) { 67 | for (SurfaceForm r : rep) { 68 | if (r.getQueryNr() == qryNr) { 69 | return r; 70 | } 71 | } 72 | return null; 73 | } 74 | 75 | } 76 | -------------------------------------------------------------------------------- /doser-dis-extensions/src/main/java/doser/lucene/analysis/DoserIDAnalyzer.java: -------------------------------------------------------------------------------- 1 | package doser.lucene.analysis; 2 | 3 | import java.io.IOException; 4 | import java.io.Reader; 5 | 6 | import org.apache.lucene.analysis.TokenStream; 7 | import org.apache.lucene.analysis.core.StopAnalyzer; 8 | import org.apache.lucene.analysis.util.CharArraySet; 9 | import org.apache.lucene.analysis.util.StopwordAnalyzerBase; 10 | import org.apache.lucene.analysis.util.WordlistLoader; 11 | import org.apache.lucene.util.Version; 12 | 13 | /** 14 | * This analyzer is a special analyzer 
for id queries in our knowledge bases 15 | * 16 | * @author Stefan Zwicklbauer 17 | * 18 | */ 19 | public final class DoserIDAnalyzer extends StopwordAnalyzerBase { 20 | 21 | /** Default maximum allowed token length */ 22 | public static final int DEFAULT_MAX_TOKEN_LENGTH = 255; 23 | 24 | /** 25 | * An unmodifiable set containing some common English words that are usually 26 | * not useful for searching. 27 | */ 28 | public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET; 29 | 30 | private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH; 31 | 32 | /** 33 | * Builds an analyzer with the default stop words ({@link #STOP_WORDS_SET}). 34 | * 35 | * 36 | */ 37 | public DoserIDAnalyzer() { 38 | this(STOP_WORDS_SET); 39 | } 40 | 41 | /** 42 | * Builds an analyzer with the given stop words. 43 | * 44 | * @param stopWords 45 | * stop words 46 | */ 47 | public DoserIDAnalyzer(CharArraySet stopWords) { 48 | super(stopWords); 49 | } 50 | 51 | /** 52 | * Builds an analyzer with the stop words from the given reader. 53 | * 54 | * @see WordlistLoader#getWordSet(Reader, Version) 55 | * @param stopwords 56 | * Reader to read stop words from 57 | */ 58 | public DoserIDAnalyzer(Reader stopwords) 59 | throws IOException { 60 | this(loadStopwordSet(stopwords)); 61 | } 62 | 63 | @Override 64 | protected TokenStreamComponents createComponents(final String fieldName, 65 | final Reader reader) { 66 | final DoserIDTokenizer src = new DoserIDTokenizer(reader); 67 | TokenStream tok = new DoserIDFilter(src); 68 | return new TokenStreamComponents(src, tok) { 69 | @Override 70 | protected void setReader(final Reader reader) throws IOException { 71 | super.setReader(reader); 72 | } 73 | }; 74 | } 75 | 76 | /** 77 | * @see #setMaxTokenLength 78 | */ 79 | public int getMaxTokenLength() { 80 | return maxTokenLength; 81 | } 82 | 83 | /** 84 | * Set maximum allowed token length. If a token is seen that exceeds this 85 | * length then it is discarded. 
This setting only takes effect the next time 86 | * tokenStream or tokenStream is called. 87 | */ 88 | public void setMaxTokenLength(int length) { 89 | maxTokenLength = length; 90 | } 91 | } -------------------------------------------------------------------------------- /doser-dis-extensions/src/main/java/doser/lucene/query/LearnToRankClause.java: -------------------------------------------------------------------------------- 1 | package doser.lucene.query; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | 6 | import org.apache.lucene.search.Query; 7 | import org.apache.lucene.search.Weight; 8 | 9 | /** 10 | * LearnToRank clause representing an arbitrary feature query. Additional 11 | * criterias may be defined later but are not necessary so far. 12 | * 13 | * HashMap featuresValues contains all calculated featuresValues. The HashMap 14 | * key stores the document number. The Pair integer stores the featureNumber. 15 | * 16 | * The HashMap has to be resetted after each query. 
17 | * 18 | */ 19 | public class LearnToRankClause { 20 | 21 | class Pair { 22 | 23 | private final int featureNr; 24 | 25 | private final float featureValue; 26 | 27 | Pair(final int docNr, final float featureValue) { 28 | featureNr = docNr; 29 | this.featureValue = featureValue; 30 | } 31 | 32 | public int getDocNr() { 33 | return featureNr; 34 | } 35 | 36 | public float getFeatureValue() { 37 | return featureValue; 38 | } 39 | 40 | } 41 | 42 | private Weight cweight; 43 | 44 | private final Map featureValues; 45 | 46 | private final boolean mustOccur; 47 | 48 | private final String name; 49 | 50 | private Query query; 51 | 52 | private float weight; 53 | 54 | public LearnToRankClause(final Query query, final String name, 55 | final boolean mustOccur) { 56 | this.query = query; 57 | this.name = name; 58 | weight = 1.0f; 59 | this.mustOccur = mustOccur; 60 | featureValues = new HashMap(); 61 | } 62 | 63 | public void addFeatureValue(final int docBase, final int docNr, 64 | final float value) { 65 | featureValues.put((docBase + docNr), value); 66 | } 67 | 68 | public void clear() { 69 | featureValues.clear(); 70 | } 71 | 72 | public double getFeatureValue(final int docId) { 73 | double val = 0f; 74 | try { 75 | val = featureValues.get(docId); 76 | } catch (final NullPointerException e) { 77 | val = 0f; 78 | } 79 | return val; 80 | } 81 | 82 | public String getName() { 83 | return name; 84 | } 85 | 86 | public Query getQuery() { 87 | return query; 88 | } 89 | 90 | public Weight getW() { 91 | return cweight; 92 | } 93 | 94 | public float getWeight() { 95 | return weight; 96 | } 97 | 98 | public boolean isMustOccur() { 99 | return mustOccur; 100 | } 101 | 102 | public void setQuery(final Query query) { 103 | this.query = query; 104 | } 105 | 106 | public void setW(final Weight cweight) { 107 | this.cweight = cweight; 108 | } 109 | 110 | public void setWeight(final float weight) { 111 | this.weight = weight; 112 | } 113 | } 114 | 
-------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/properties/Properties.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.properties; 2 | 3 | import org.apache.commons.configuration.ConfigurationException; 4 | import org.apache.commons.configuration.PropertiesConfiguration; 5 | import org.apache.log4j.Logger; 6 | 7 | public final class Properties { 8 | private static Properties instance; 9 | private static final String RESOURCE_NAME = "disambiguation.properties"; 10 | // private static final String RESOURCE_NAME = "./disambiguation.properties"; 11 | 12 | public synchronized static Properties getInstance() { 13 | if (instance == null) { 14 | instance = new Properties(); 15 | } 16 | 17 | return instance; 18 | } 19 | 20 | /** 21 | * Provides easy access to property files (e.g. config.getInt()) 22 | */ 23 | PropertiesConfiguration config; 24 | 25 | private Properties() { 26 | try { 27 | this.config = new PropertiesConfiguration(RESOURCE_NAME); 28 | } catch (final ConfigurationException e) { 29 | Logger.getRootLogger().error("Failed to load properties file: " + RESOURCE_NAME, e); 30 | } 31 | } 32 | 33 | /** 34 | * ArtifactId of the application (from maven pom.xml) 35 | * 36 | * @return artifact id 37 | */ 38 | public String getApplicationArtifactId() { 39 | return this.config.getString("application.artifactId"); 40 | } 41 | 42 | /** 43 | * Name of the application (from maven pom.xml) 44 | * 45 | * @return application name 46 | */ 47 | public String getApplicationName() { 48 | return this.config.getString("application.name"); 49 | } 50 | 51 | /** 52 | * Version of the application (from maven pom.xml) 53 | * 54 | * @return application version 55 | */ 56 | public String getApplicationVersion() { 57 | return this.config.getString("application.version"); 58 | } 59 | 60 | public int getDisambiguationResultSize() { 
61 | final String size = this.config.getString("disambiguation.returnSize"); 62 | return Integer.valueOf(size); 63 | } 64 | 65 | /** 66 | * Get location of entity-centric knowledge base 67 | */ 68 | public String getEntityCentricKBWikipedia() { 69 | return this.config.getString("disambiguation.entityCentricKBWikipedia"); 70 | } 71 | 72 | public String getEntityCentricKBBiomed() { 73 | return this.config.getString("disambiguation.entityCentricBiomedCalbC"); 74 | } 75 | 76 | public String getWord2VecService() { 77 | return this.config.getString("disambiguation.Word2VecService"); 78 | } 79 | 80 | public String getWord2VecModel() { 81 | return this.config.getString("word2vecmodel"); 82 | } 83 | 84 | public boolean getCandidateExpansion() { 85 | boolean bool = false; 86 | String s = this.config.getString("candidateExpansion"); 87 | if(s.equalsIgnoreCase("true")) { 88 | bool = true; 89 | } 90 | return bool; 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /doser-dis-disambiguationserver/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4 | doser-dis 5 | doser-dis-parent 6 | 1.0 7 | 8 | 4.0.0 9 | doser.sub 10 | doser-dis-disambiguationserver 11 | 0.6 12 | doser-dis-disambiguationserver 13 | 14 | 15 | doser-dis-disambiguationserver 16 | 17 | 18 | maven-war-plugin 19 | 2.1.1 20 | 21 | 22 | org.apache.maven.plugins 23 | 2.9 24 | maven-eclipse-plugin 25 | 26 | true 27 | 2.0 28 | 29 | 30 | 31 | org.apache.tomcat.maven 32 | tomcat7-maven-plugin 33 | 2.0 34 | 35 | 36 | tomcat-run 37 | 38 | exec-war-only 39 | 40 | package 41 | 42 | /doser 43 | false 44 | DoSer-disambiguation-only.jar 45 | utf-8 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | doser.sub 55 | doser-dis-core 56 | ${project.version} 57 | 58 | 59 | org.springframework 60 | spring-webmvc 61 | 4.0.6.RELEASE 62 | 63 | 64 | commons-fileupload 65 | commons-fileupload 66 | 1.3.1 67 | 68 | 69 | javax.servlet 70 | javax.servlet-api 
71 | provided 72 | 3.0.1 73 | 74 | 75 | 76 | 77 | 78 | xml-apis 79 | xml-apis 80 | 1.4.01 81 | 82 | 83 | 84 | war 85 | 86 | -------------------------------------------------------------------------------- /doser-dis-extensions/src/main/java/doser/lucene/analysis/DoserStandardAnalyzer.java: -------------------------------------------------------------------------------- 1 | package doser.lucene.analysis; 2 | 3 | import java.io.IOException; 4 | import java.io.Reader; 5 | 6 | import org.apache.lucene.analysis.TokenStream; 7 | import org.apache.lucene.analysis.core.LowerCaseFilter; 8 | import org.apache.lucene.analysis.core.StopAnalyzer; 9 | import org.apache.lucene.analysis.standard.StandardFilter; 10 | import org.apache.lucene.analysis.util.CharArraySet; 11 | import org.apache.lucene.analysis.util.StopwordAnalyzerBase; 12 | import org.apache.lucene.analysis.util.WordlistLoader; 13 | import org.apache.lucene.util.Version; 14 | 15 | public final class DoserStandardAnalyzer extends StopwordAnalyzerBase { 16 | 17 | /** Default maximum allowed token length */ 18 | public static final int DEFAULT_MAX_TOKEN_LENGTH = 255; 19 | 20 | /** 21 | * An unmodifiable set containing some common English words that are usually 22 | * not useful for searching. 23 | */ 24 | public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET; 25 | 26 | private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH; 27 | 28 | /** 29 | * Builds an analyzer with the default stop words ({@link #STOP_WORDS_SET}). 30 | * 31 | */ 32 | public DoserStandardAnalyzer() { 33 | this(STOP_WORDS_SET); 34 | } 35 | 36 | /** 37 | * Builds an analyzer with the given stop words. 38 | * 39 | * @param stopWords 40 | * stop words 41 | */ 42 | public DoserStandardAnalyzer(CharArraySet stopWords) { 43 | super(stopWords); 44 | } 45 | 46 | /** 47 | * Builds an analyzer with the stop words from the given reader. 
48 | * 49 | * @see WordlistLoader#getWordSet(Reader, Version) 50 | * @param stopwords 51 | * Reader to read stop words from 52 | */ 53 | public DoserStandardAnalyzer(Reader stopwords) 54 | throws IOException { 55 | this(loadStopwordSet(stopwords)); 56 | } 57 | 58 | @Override 59 | protected TokenStreamComponents createComponents(final String fieldName, 60 | final Reader reader) { 61 | final DoserStandardTokenizer src = new DoserStandardTokenizer(reader); 62 | TokenStream tok = new StandardFilter(src); 63 | tok = new LowerCaseFilter(tok); 64 | return new TokenStreamComponents(src, tok) { 65 | @Override 66 | protected void setReader(final Reader reader) throws IOException { 67 | super.setReader(reader); 68 | } 69 | }; 70 | } 71 | 72 | /** 73 | * @see #setMaxTokenLength 74 | */ 75 | public int getMaxTokenLength() { 76 | return maxTokenLength; 77 | } 78 | 79 | /** 80 | * Set maximum allowed token length. If a token is seen that exceeds this 81 | * length then it is discarded. This setting only takes effect the next time 82 | * tokenStream or tokenStream is called. 
83 | */ 84 | public void setMaxTokenLength(int length) { 85 | maxTokenLength = length; 86 | } 87 | } -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/knowledgebases/AbstractKnowledgeBase.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.knowledgebases; 2 | 3 | import java.io.File; 4 | import java.io.IOException; 5 | import java.util.TimerTask; 6 | 7 | import org.apache.lucene.search.IndexSearcher; 8 | import org.apache.lucene.search.SearcherFactory; 9 | import org.apache.lucene.search.SearcherManager; 10 | import org.apache.lucene.search.similarities.DefaultSimilarity; 11 | import org.apache.lucene.search.similarities.Similarity; 12 | import org.apache.lucene.store.Directory; 13 | import org.apache.lucene.store.FSDirectory; 14 | import org.slf4j.Logger; 15 | import org.slf4j.LoggerFactory; 16 | 17 | /** 18 | * Each knowledge base provides its own class with its respective properties. 19 | * These are the knowledge base index uri. IndexSearcher, IndexReader objects 20 | * and the dynamic property. 
21 | * 22 | * @author stefan zwicklbauer 23 | */ 24 | public abstract class AbstractKnowledgeBase extends TimerTask { 25 | 26 | private final static Logger logger = LoggerFactory.getLogger(AbstractKnowledgeBase.class); 27 | 28 | private String indexUri; 29 | 30 | private boolean dynamic; 31 | 32 | private SearcherManager manager; 33 | 34 | private IndexSearcher searcher; 35 | 36 | AbstractKnowledgeBase(String uri, boolean dynamic) { 37 | this(uri, dynamic, new DefaultSimilarity()); 38 | } 39 | 40 | AbstractKnowledgeBase(String uri, boolean dynamic, Similarity sim) { 41 | super(); 42 | this.indexUri = uri; 43 | this.dynamic = dynamic; 44 | 45 | File indexDir = new File(indexUri); 46 | Directory dir; 47 | try { 48 | dir = FSDirectory.open(indexDir); 49 | this.manager = new SearcherManager(dir, new SearcherFactory()); 50 | } catch (IOException e) { 51 | logger.error("IOException in "+AbstractKnowledgeBase.class.getName(), e); 52 | } 53 | } 54 | 55 | public String getIndexUri() { 56 | return indexUri; 57 | } 58 | 59 | 60 | public IndexSearcher getSearcher() { 61 | try { 62 | this.searcher = manager.acquire(); 63 | } catch (IOException e) { 64 | logger.error("IOException in "+AbstractKnowledgeBase.class.getName(), e); 65 | } 66 | return this.searcher; 67 | } 68 | 69 | public void release() { 70 | try { 71 | manager.release(searcher); 72 | } catch (IOException e) { 73 | logger.error("IOException in "+AbstractKnowledgeBase.class.getName(), e); 74 | } 75 | } 76 | 77 | /** 78 | * Periodically reopens the Indexreader, if and only if this is an dynamic 79 | * knowledge base. The changed knowledge base will be live within a few moments. 
80 | */ 81 | @Override 82 | public void run() { 83 | if (dynamic) { 84 | try { 85 | manager.maybeRefresh(); 86 | } catch (IOException e) { 87 | logger.error("IOException in "+AbstractKnowledgeBase.class.getName(), e); 88 | } 89 | } 90 | } 91 | 92 | public abstract void initialize(); 93 | } 94 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | #DoSeR-Disambiguation 2 | This package exclusively contains the disambiguation system of DoSeR. Compilation results in a Stand-alone jar file which starts an Apache Tomcat Server. More infos about the full DoSeR systems can be found here: [Github Wiki](https://github.com/quhfus/DoSeR/wiki) 3 | 4 | If your system does not have enough system memory (25GB Ram), you can use the the rest service of the current DoSeR version which is applicable for GERBIL. **Coming soon** 5 | 6 | 7 | We note that this service is limited to 5 queries in parallel. 8 | 9 | ##Requirements 10 | To install and run the DoSeR disambiguation systems, the following components must be installed: 11 | 12 | 1. Java Version 1.7 or higher 13 | 14 | 2. Python 2.5 or higher 15 | 16 | 3. Disambiguation Lucene Index: [Dropbox Link](https://www.dropbox.com/s/7ihkw5gzqc3afjo/DBpedia_DisambiguationIndex.tar.gz?dl=0) 17 | 18 | 4. Semantic Embeddings: [Dropbox Link](https://www.dropbox.com/s/4e2g72yud1muv5a/Semantic_Embeddings.tar.gz?dl=0) 19 | 20 | ##Installation 21 | 1. Checkout the DoSeR-Disambiguation Github repository and install the system with **mvn compile**. If no maven is installed or if you are not interested in the source code you can download the doser-dis-disambiguationserver.jar file and disambiguation.properties file from here (coming very soon). 22 | 23 | 2. Put the resulting or downloaded **doser-dis-disambiguationserver.jar** file and the properties file into a newly created directory **foo**. 
Unzip the Disambiguation Index and put the index folder into the **foo** directory. 24 | 25 | 3. Unzip and extract the Semantic Embeddings zip file into any folder. 26 | 27 | 4. Install and start the Word2Vec Rest Server (Installation guide can be found [here](https://github.com/quhfus/DoSeR-Disambiguation/wiki/Word2Vec-RestServer)) 28 | 29 | 5. Open and adapt the disambiguation.properties file 30 | 31 | 6. Start the doser-dis-disambiguationserver.jar 32 | 33 | ##Citation 34 | If you use DoSeR in your research, please cite the following paper: 35 | 36 | @inproceedings{DBLP:conf/esws/ZwicklbauerSG16, 37 | author = {Stefan Zwicklbauer and Christin Seifert and Michael Granitzer}, 38 | title = {DoSeR - A Knowledge-Base-Agnostic Framework for Entity Disambiguation Using Semantic Embeddings}, 39 | booktitle = {The Semantic Web. Latest Advances and New Domains - 13th International 40 | Conference, {ESWC} 2016, Heraklion, Crete, Greece, May 29 - June 2, 41 | 2016, Proceedings}, 42 | pages = {182--198}, 43 | year = {2016}, 44 | crossref = {DBLP:conf/esws/2016}, 45 | url = {http://dx.doi.org/10.1007/978-3-319-34129-3_12}, 46 | doi = {10.1007/978-3-319-34129-3_12}, 47 | timestamp = {Mon, 23 May 2016 13:46:28 +0200}, 48 | biburl = {http://dblp.uni-trier.de/rec/bib/conf/esws/ZwicklbauerSG16}, 49 | bibsource = {dblp computer science bibliography, http://dblp.org} 50 | } 51 | -------------------------------------------------------------------------------- /doser-dis-core/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4 | doser-dis 5 | doser-dis-parent 6 | 1.0 7 | 8 | 4.0.0 9 | doser.sub 10 | doser-dis-core 11 | 0.6 12 | doser-dis-core 13 | 14 | 15 | doser-dis-core 16 | 17 | 18 | 19 | 20 | 21 | 22 | com.google.guava 23 | guava 24 | 18.0 25 | 26 | 27 | 28 | 29 | doser.sub 30 | doser-dis-extensions 31 | ${project.version} 32 | 33 | 34 | org.rdfhdt 35 | hdt-java-core 36 | 1.1 37 | 38 | 39 | org.rdfhdt 40 | hdt-jena 41 | 1.1 42 | 43 | 
44 | net.sf.jgrapht 45 | jgrapht 46 | 0.8.3 47 | 48 | 49 | com.googlecode.aima-java 50 | aima-core 51 | 0.10.5 52 | 53 | 54 | commons-configuration 55 | commons-configuration 56 | 1.10 57 | 58 | 59 | org.codehaus.jettison 60 | jettison 61 | 1.3.5 62 | 63 | 64 | org.codehaus.jackson 65 | jackson-mapper-asl 66 | 1.9.13 67 | 68 | 69 | net.sf.jung 70 | jung2 71 | 2.0.1 72 | pom 73 | 74 | 75 | net.sf.jung 76 | jung-graph-impl 77 | 2.0.1 78 | 79 | 80 | net.sf.jung 81 | jung-algorithms 82 | 2.0.1 83 | 84 | 85 | org.apache.commons 86 | commons-math 87 | 2.2 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | -------------------------------------------------------------------------------- /doser-dis-disambiguationserver/src/main/java/doser/server/actions/FrameworkInitialization.java: -------------------------------------------------------------------------------- 1 | package doser.server.actions; 2 | 3 | import java.util.Enumeration; 4 | 5 | import javax.servlet.ServletContext; 6 | import javax.servlet.ServletContextEvent; 7 | import javax.servlet.ServletContextListener; 8 | 9 | import org.apache.log4j.Logger; 10 | import org.springframework.beans.factory.DisposableBean; 11 | import org.springframework.web.context.ContextLoader; 12 | import org.springframework.web.context.WebApplicationContext; 13 | 14 | import doser.entitydisambiguation.backend.DisambiguationMainService; 15 | 16 | public class FrameworkInitialization extends ContextLoader implements 17 | ServletContextListener { 18 | 19 | private ContextLoader contextLoader; 20 | 21 | public FrameworkInitialization() { 22 | } 23 | 24 | public FrameworkInitialization(WebApplicationContext context) { 25 | super(context); 26 | } 27 | 28 | /** 29 | * Initialize the root web application context. 
30 | */ 31 | @Override 32 | public void contextInitialized(ServletContextEvent event) { 33 | DisambiguationMainService.initialize(); 34 | this.contextLoader = createContextLoader(); 35 | if (this.contextLoader == null) { 36 | this.contextLoader = this; 37 | } 38 | this.contextLoader.initWebApplicationContext(event.getServletContext()); 39 | } 40 | 41 | /** 42 | * Create the ContextLoader to use. Can be overridden in subclasses. 43 | * 44 | * @return the new ContextLoader 45 | * @deprecated in favor of simply subclassing ContextLoaderListener itself 46 | * (which extends ContextLoader, as of Spring 3.0) 47 | */ 48 | @Deprecated 49 | protected ContextLoader createContextLoader() { 50 | return null; 51 | } 52 | 53 | /** 54 | * Return the ContextLoader used by this listener. 55 | * 56 | * @return the current ContextLoader 57 | * @deprecated in favor of simply subclassing ContextLoaderListener itself 58 | * (which extends ContextLoader, as of Spring 3.0) 59 | */ 60 | @Deprecated 61 | public ContextLoader getContextLoader() { 62 | return this.contextLoader; 63 | } 64 | 65 | /** 66 | * Close the root web application context. 
67 | */ 68 | @Override 69 | public void contextDestroyed(ServletContextEvent event) { 70 | DisambiguationMainService.getInstance().shutDownDisambiguationService(); 71 | if (this.contextLoader != null) { 72 | this.contextLoader.closeWebApplicationContext(event 73 | .getServletContext()); 74 | } 75 | ServletContext sc = event.getServletContext(); 76 | Enumeration attrNames = sc.getAttributeNames(); 77 | while (attrNames.hasMoreElements()) { 78 | String attrName = attrNames.nextElement(); 79 | if (attrName.startsWith("org.springframework.")) { 80 | Object attrValue = sc.getAttribute(attrName); 81 | if (attrValue instanceof DisposableBean) { 82 | try { 83 | ((DisposableBean) attrValue).destroy(); 84 | } catch (Throwable ex) { 85 | Logger.getRootLogger().fatal(ex.getMessage()); 86 | } 87 | } 88 | } 89 | } 90 | } 91 | 92 | } 93 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/CandidateReduction.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.algorithms.collective; 2 | 3 | import java.util.LinkedList; 4 | import java.util.List; 5 | import java.util.concurrent.TimeUnit; 6 | 7 | import doser.entitydisambiguation.algorithms.SurfaceForm; 8 | 9 | public abstract class CandidateReduction { 10 | 11 | // public static final int MAXSURFACEFORMSPERQUERY = 20; 12 | // public static final int REDUCETO = 5; 13 | private List rep; 14 | private boolean alwaysAction; 15 | private int maxsurfaceformsperquery; 16 | 17 | public CandidateReduction(List rep, 18 | int maxsurfaceformsperquery, boolean alwaysAction) { 19 | super(); 20 | this.rep = rep; 21 | this.maxsurfaceformsperquery = maxsurfaceformsperquery; 22 | this.alwaysAction = alwaysAction; 23 | } 24 | 25 | public void solve() { 26 | List finalList = new LinkedList(); 27 | if (this.rep.size() > maxsurfaceformsperquery) { 28 | int counter = 0; 29 | 
while (true) { 30 | long time = System.currentTimeMillis(); 31 | if ((counter + maxsurfaceformsperquery) < this.rep.size()) { 32 | List subList = this.rep.subList(counter, (counter + maxsurfaceformsperquery)); 33 | finalList.addAll(miniSolve(subList)); 34 | counter += maxsurfaceformsperquery; 35 | } else { 36 | List subList = this.rep.subList(counter, this.rep.size()); 37 | List cloneList = new LinkedList(); 38 | for (SurfaceForm sf : subList) { 39 | SurfaceForm clone = (SurfaceForm) sf.clone(); 40 | cloneList.add(clone); 41 | } 42 | 43 | int prevcounter = 0; 44 | List prevList = this.rep.subList(counter - maxsurfaceformsperquery, counter); 45 | while (cloneList.size() < maxsurfaceformsperquery) { 46 | SurfaceForm clone = (SurfaceForm) prevList.get(prevcounter).clone(); 47 | clone.setRelevant(false); 48 | cloneList.add(clone); 49 | prevcounter++; 50 | } 51 | List workedList = miniSolve(cloneList); 52 | List sfs = new LinkedList(); 53 | for (SurfaceForm sf : workedList) { 54 | if (sf.isRelevant()) { 55 | sfs.add(sf); 56 | } 57 | } 58 | finalList.addAll(sfs); 59 | break; 60 | } 61 | long millis = System.currentTimeMillis() - time; 62 | String formatedTime = String.format("%d min, %d sec", 63 | TimeUnit.MILLISECONDS.toMinutes(millis), 64 | TimeUnit.MILLISECONDS.toSeconds(millis) - 65 | TimeUnit.MINUTES.toSeconds(TimeUnit.MILLISECONDS.toMinutes(millis)) 66 | ); 67 | System.out.println(formatedTime); 68 | } 69 | this.rep = finalList; 70 | } else { 71 | if(alwaysAction) { 72 | finalList.addAll(miniSolve(rep)); 73 | this.rep = finalList; 74 | } 75 | } 76 | } 77 | 78 | public List getRep() { 79 | return rep; 80 | } 81 | 82 | public abstract List miniSolve(List rep); 83 | } 84 | -------------------------------------------------------------------------------- /doser-dis-extensions/src/main/java/doser/lucene/query/LearnToRankTermScorer.java: -------------------------------------------------------------------------------- 1 | package doser.lucene.query; 2 | 3 | import 
java.io.IOException; 4 | 5 | import org.apache.lucene.index.DocsEnum; 6 | import org.apache.lucene.search.Scorer; 7 | import org.apache.lucene.search.Weight; 8 | import org.apache.lucene.search.similarities.Similarity; 9 | 10 | /** 11 | * Expert: A Scorer for documents matching a Term. 12 | */ 13 | final class LearnToRankTermScorer extends Scorer { 14 | private final Similarity.SimScorer docScorer; 15 | private final DocsEnum docsEnum; 16 | 17 | /** 18 | * Construct a TermScorer. 19 | * 20 | * @param weight 21 | * The weight of the Term in the query. 22 | * @param docsEnum 23 | * An iterator over the documents matching the Term. 24 | * @param docScorer 25 | * The Similarity.ExactSimScorer implementation to 26 | * be used for score computations. 27 | * @param docFreq 28 | * per-segment docFreq of this term 29 | */ 30 | LearnToRankTermScorer(final Weight weight, final DocsEnum docsEnum, 31 | final Similarity.SimScorer docScorer) { 32 | super(weight); 33 | this.docScorer = docScorer; 34 | this.docsEnum = docsEnum; 35 | } 36 | 37 | /** 38 | * Advances to the first match beyond the current whose document number is 39 | * greater than or equal to a given target.
40 | * The implementation uses {@link DocsEnum#advance(int)}. 41 | * 42 | * @param target 43 | * The target document number. 44 | * @return the matching document or NO_MORE_DOCS if none exist. 45 | */ 46 | @Override 47 | public int advance(final int target) throws IOException { 48 | return docsEnum.advance(target); 49 | } 50 | 51 | @Override 52 | public long cost() { 53 | return docsEnum.cost(); 54 | } 55 | 56 | @Override 57 | public int docID() { 58 | return docsEnum.docID(); 59 | } 60 | 61 | @Override 62 | public int freq() throws IOException { 63 | return docsEnum.freq(); 64 | } 65 | 66 | DocsEnum getDocsEnum() { 67 | return docsEnum; 68 | } 69 | 70 | /** 71 | * Advances to the next document matching the query.
72 | * 73 | * @return the document matching the query or NO_MORE_DOCS if there are no 74 | * more documents. 75 | */ 76 | @Override 77 | public int nextDoc() throws IOException { 78 | return docsEnum.nextDoc(); 79 | } 80 | 81 | // TODO: benchmark if the specialized conjunction really benefits 82 | // from this, or if instead its from sorting by docFreq, or both 83 | 84 | @Override 85 | public float score() throws IOException { 86 | assert docID() != NO_MORE_DOCS; 87 | return docScorer.score(docsEnum.docID(), docsEnum.freq()); 88 | } 89 | 90 | // TODO: generalize something like this for scorers? 91 | // even this is just an estimation... 92 | 93 | // int getDocFreq() { 94 | // return docFreq; 95 | // } 96 | 97 | /** Returns a string representation of this TermScorer. */ 98 | @Override 99 | public String toString() { 100 | return "scorer(" + weight + ")"; 101 | } 102 | } -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/knowledgebases/EntityCentricKBBiomed.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.knowledgebases; 2 | 3 | import java.util.List; 4 | 5 | import org.apache.lucene.search.similarities.Similarity; 6 | 7 | public class EntityCentricKBBiomed extends AbstractEntityCentricKBGeneral { 8 | 9 | public EntityCentricKBBiomed(String uri, boolean dynamic, Similarity sim) { 10 | super(uri, dynamic, sim); 11 | } 12 | 13 | public EntityCentricKBBiomed(String uri, boolean dynamic) { 14 | super(uri, dynamic); 15 | } 16 | 17 | /** 18 | * Takes a set of entities as well as a target entity and generates one 19 | * string that fits into the word2vec query format used in this class. The 20 | * source entities are concatenated and should be compared with the target 21 | * entity. 22 | * 23 | * @param source 24 | * a set of source entities 25 | * @param target 26 | * the target entity. 
27 | * @return String in appropriate word2vec query format 28 | */ 29 | public String generateWord2VecFormatString(String source, String target) { 30 | source = convertUrlToBiomedEntityIdentifier(source); 31 | target = convertUrlToBiomedEntityIdentifier(target); 32 | int c = source.compareToIgnoreCase(target); 33 | String res = ""; 34 | if (c < 0) { 35 | res = source + "|" + target; 36 | } else if (c == 0) { 37 | res = source + "|" + target; 38 | } else { 39 | res = target + "|" + source; 40 | } 41 | return res; 42 | } 43 | 44 | /** 45 | * Takes a set of entities as well as a target entity and generates one 46 | * string that fits into the word2vec query format used in this class. The 47 | * source entities are concatenated and should be compared wit the target 48 | * entity. 49 | * 50 | * @param source 51 | * a set of source entities 52 | * @param target 53 | * the target entity. 54 | * @return String in appropriate word2vec query format 55 | */ 56 | public String generateWord2VecFormatString(List source, String target) { 57 | StringBuilder builder = new StringBuilder(); 58 | for (String s : source) { 59 | s = convertUrlToBiomedEntityIdentifier(s); 60 | builder.append(s); 61 | builder.append("|"); 62 | } 63 | String src = builder.toString(); 64 | src = src.substring(0, src.length() - 1); 65 | target = convertUrlToBiomedEntityIdentifier(target); 66 | return src + "|" + target; 67 | } 68 | 69 | private String convertUrlToBiomedEntityIdentifier(String url) { 70 | String res = ""; 71 | if (url.startsWith("http://www.uniprot.org/uniprot/")) { 72 | res = "UNIPROT_" + url.replaceAll("http://www.uniprot.org/uniprot/", ""); 73 | } else if (url.startsWith("http://www.ncbi.nlm.nih.gov/gene/")) { 74 | res = "NCBI_" + url.replaceAll("http://www.ncbi.nlm.nih.gov/gene/", ""); 75 | } else if (url.startsWith("http://linkedlifedata.com/resource/umls-concept/")) { 76 | res = "UMLS_" + url.replaceAll("http://linkedlifedata.com/resource/umls-concept/", ""); 77 | } 78 | return res; 79 
| } 80 | 81 | @Override 82 | protected String generateDomainName() { 83 | return "Biomed"; 84 | } 85 | 86 | @Override 87 | protected String kbName() { 88 | return "CalbC Biomedical KB"; 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/rules/CheckGeneralEntities.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.algorithms.rules; 2 | 3 | import java.io.IOException; 4 | import java.util.LinkedList; 5 | import java.util.List; 6 | 7 | import org.apache.lucene.document.Document; 8 | import org.apache.lucene.index.IndexReader; 9 | import org.apache.lucene.index.Term; 10 | import org.apache.lucene.search.IndexSearcher; 11 | import org.apache.lucene.search.ScoreDoc; 12 | import org.apache.lucene.search.TopDocs; 13 | 14 | import doser.entitydisambiguation.algorithms.SurfaceForm; 15 | import doser.entitydisambiguation.knowledgebases.EntityCentricKBDBpedia; 16 | import doser.lucene.query.TermQuery; 17 | 18 | class CheckGeneralEntities extends AbstractRule { 19 | 20 | CheckGeneralEntities(EntityCentricKBDBpedia eckb) { 21 | super(eckb); 22 | } 23 | 24 | @Override 25 | public boolean applyRule(List rep) { 26 | for (SurfaceForm c : rep) { 27 | String sf = c.getSurfaceForm().toLowerCase(); 28 | List candidates = c.getCandidates(); 29 | String checked = null; 30 | // Surface Form - Candidate Match i.e. 
Saturday - 31 | // http://dbpedia.org/resource/Saturday 32 | for (String s : candidates) { 33 | String ent = s.replaceAll("http://dbpedia.org/resource/", "") 34 | .toLowerCase(); 35 | if (sf.equalsIgnoreCase(ent)) { 36 | checked = s; 37 | break; 38 | } 39 | } 40 | 41 | if (checked != null && !checkSurfaceFormSubset(sf, rep)) { 42 | List keepCandidates = new LinkedList(); 43 | for (String can : candidates) { 44 | String[] labels = null; 45 | IndexSearcher searcher = eckb.getSearcher(); 46 | IndexReader reader = searcher.getIndexReader(); 47 | TermQuery query = new TermQuery(new Term("Mainlink", can)); 48 | try { 49 | final TopDocs top = searcher.search(query, 1); 50 | final ScoreDoc[] score = top.scoreDocs; 51 | final Document doc = reader.document(score[0].doc); 52 | labels = doc.getValues("Label"); 53 | } catch (IOException e) { 54 | e.printStackTrace(); 55 | } 56 | // Check whether the candidate has label of the original 57 | // surface form 58 | if (labels != null) { 59 | boolean isIn = false; 60 | for (int i = 0; i < labels.length; ++i) { 61 | if (labels[i].toLowerCase().equalsIgnoreCase(sf)) { 62 | isIn = true; 63 | break; 64 | } 65 | } 66 | // If IN, keep this candidate 67 | if (isIn) { 68 | keepCandidates.add(can); 69 | } 70 | } 71 | } 72 | if (!keepCandidates.isEmpty()) { 73 | c.setCandidates(keepCandidates); 74 | if(keepCandidates.size() == 1) { 75 | System.out.println("**********************************************************************"); 76 | System.out.println(keepCandidates.toString()); 77 | System.out.println("**********************************************************************"); 78 | } 79 | } 80 | } 81 | } 82 | return false; 83 | } 84 | 85 | private boolean checkSurfaceFormSubset(String sf, 86 | List reps) { 87 | boolean isIn = false; 88 | for (SurfaceForm c : reps) { 89 | String toCheck = c.getSurfaceForm().toLowerCase(); 90 | if (!toCheck.equalsIgnoreCase(sf) && toCheck.contains(sf)) { 91 | isIn = true; 92 | break; 93 | } 94 | } 95 | return 
isIn; 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/dbpedia/CollectiveAndContextDriver.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.algorithms.collective.dbpedia; 2 | 3 | import java.util.LinkedList; 4 | import java.util.List; 5 | 6 | import doser.entitydisambiguation.algorithms.SurfaceForm; 7 | import doser.entitydisambiguation.algorithms.collective.CandidatePruning; 8 | import doser.entitydisambiguation.algorithms.rules.RuleAdapation; 9 | import doser.entitydisambiguation.dpo.DisambiguatedEntity; 10 | import doser.entitydisambiguation.dpo.Response; 11 | import doser.entitydisambiguation.knowledgebases.EntityCentricKBDBpedia; 12 | 13 | class CollectiveAndContextDriver { 14 | 15 | static final int PREPROCESSINGCONTEXTSIZE = 200; 16 | 17 | private String topic; 18 | private Response[] currentResponse; 19 | private List rep; 20 | private EntityCentricKBDBpedia eckb; 21 | 22 | CollectiveAndContextDriver(Response[] res, List rep, EntityCentricKBDBpedia eckb, String topic) { 23 | super(); 24 | this.topic = topic; 25 | if (res.length != rep.size()) { 26 | throw new IllegalArgumentException(); 27 | } 28 | this.currentResponse = res; 29 | this.rep = rep; 30 | this.eckb = eckb; 31 | this.eckb.precomputeDoc2VecSimilarities(rep, PREPROCESSINGCONTEXTSIZE); 32 | } 33 | 34 | void solve() { 35 | // First candidate pruning 36 | CandidatePruning pruning = new CandidatePruning(eckb); 37 | pruning.prune(rep); 38 | if (topic != null) { 39 | TableColumnFilter cf = new TableColumnFilter(eckb, topic); 40 | cf.filter(rep); 41 | } 42 | TimeNumberDisambiguation timenumberdis = new TimeNumberDisambiguation(eckb); 43 | timenumberdis.solve(rep); 44 | LocationDisambiguation locationDis = new LocationDisambiguation(eckb); 45 | locationDis.solve(rep); 46 | 47 | RuleAdapation rules = new 
RuleAdapation(); 48 | rules.addNoCandidatesCheckPluralRule(eckb); 49 | rules.addNoCandidatesExpansionRule(eckb); 50 | rules.addUnambiguousToAmbiguousRule(eckb); 51 | rules.addPatternRule(eckb, topic); 52 | rules.addContextRule(eckb); 53 | rules.performRuleChainBeforeCandidateSelection(rep); 54 | 55 | CandidateReductionDBpediaW2V w2vreduction = new CandidateReductionDBpediaW2V(eckb, rep, 20, 5, 150, false, false); 56 | w2vreduction.solve(); 57 | rep = w2vreduction.getRep(); 58 | 59 | w2vreduction = new CandidateReductionDBpediaW2V(eckb, rep, 45, 5, 250, true, true); 60 | w2vreduction.solve(); 61 | rep = w2vreduction.getRep(); 62 | FinalEntityDisambiguation finalDis = new FinalEntityDisambiguation(eckb, rep); 63 | finalDis.setup(); 64 | finalDis.solve(); 65 | } 66 | 67 | void generateResult() { 68 | for (int i = 0; i < currentResponse.length; i++) { 69 | SurfaceForm r = search(i); 70 | if (currentResponse[i] == null && r != null && r.getCandidates().size() == 1) { 71 | Response res = new Response(); 72 | List entList = new LinkedList(); 73 | DisambiguatedEntity ent = new DisambiguatedEntity(); 74 | ent.setEntityUri(r.getCandidates().get(0)); 75 | entList.add(ent); 76 | res.setDisEntities(entList); 77 | res.setSelectedText(r.getSurfaceForm()); 78 | currentResponse[i] = res; 79 | } 80 | } 81 | } 82 | 83 | private SurfaceForm search(int qryNr) { 84 | for (SurfaceForm r : rep) { 85 | if (r.getQueryNr() == qryNr) { 86 | return r; 87 | } 88 | } 89 | return null; 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /doser-dis-extensions/src/main/java/doser/lucene/query/PriorQuery.java: -------------------------------------------------------------------------------- 1 | package doser.lucene.query; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.lucene.index.AtomicReaderContext; 6 | import org.apache.lucene.search.Explanation; 7 | import org.apache.lucene.search.IndexSearcher; 8 | import 
org.apache.lucene.search.Query; 9 | import org.apache.lucene.search.Scorer; 10 | import org.apache.lucene.search.Weight; 11 | import org.apache.lucene.util.Bits; 12 | 13 | import doser.lucene.features.IEntityCentricExtFeatures; 14 | 15 | /** 16 | * Due to major performance problems if we use an IndexReader request for every 17 | * single document, we create a Hashmap to improve the 18 | * overall performance. 19 | * 20 | * Our StartupInformationLoader provides these necessary information much 21 | * faster. 22 | * 23 | * @author Stefan Zwicklbauer 24 | */ 25 | public class PriorQuery extends Query { 26 | 27 | class PriorWeight extends Weight { 28 | 29 | class PriorScorer extends Scorer { 30 | 31 | private final AtomicReaderContext context; 32 | 33 | private int lastDoc = -1; 34 | 35 | PriorScorer(final Weight weight, final AtomicReaderContext context) { 36 | super(weight); 37 | this.context = context; 38 | } 39 | 40 | @Override 41 | public int advance(final int target) throws IOException { 42 | final int maxdoc = context.reader().numDocs(); 43 | if (target > (maxdoc - 1)) { 44 | return NO_MORE_DOCS; 45 | } 46 | return lastDoc = target; 47 | } 48 | 49 | @Override 50 | public long cost() { 51 | return 0; 52 | } 53 | 54 | @Override 55 | public int docID() { 56 | return lastDoc; 57 | } 58 | 59 | @Override 60 | public int freq() throws IOException { 61 | return 1; 62 | } 63 | 64 | @Override 65 | public int nextDoc() throws IOException { 66 | if ((context.reader().numDocs() - 1) > lastDoc) { 67 | return ++lastDoc; 68 | } else { 69 | return NO_MORE_DOCS; 70 | } 71 | } 72 | 73 | @Override 74 | public float score() throws IOException { 75 | return kb.getPriorOfDocument(context.docBase + lastDoc); 76 | } 77 | 78 | @Override 79 | public String toString() { 80 | return "Prior"; 81 | } 82 | } 83 | 84 | @Override 85 | public Explanation explain(final AtomicReaderContext context, 86 | final int doc) throws IOException { 87 | return null; 88 | } 89 | 90 | @Override 91 | public 
Query getQuery() { 92 | return PriorQuery.this; 93 | } 94 | 95 | @Override 96 | public float getValueForNormalization() throws IOException { 97 | return 0; 98 | } 99 | 100 | @Override 101 | public void normalize(final float norm, final float topLevelBoost) { 102 | // Do nothing here! 103 | } 104 | 105 | @Override 106 | public Scorer scorer(AtomicReaderContext context, Bits acceptDocs) 107 | throws IOException { 108 | return new PriorScorer(this, context); 109 | } 110 | } 111 | 112 | private IEntityCentricExtFeatures kb; 113 | 114 | public PriorQuery(IEntityCentricExtFeatures kb) { 115 | super(); 116 | this.kb = kb; 117 | } 118 | 119 | @Override 120 | public Weight createWeight(final IndexSearcher searcher) throws IOException { 121 | return new PriorWeight(); 122 | } 123 | 124 | @Override 125 | public String toString(final String field) { 126 | return "PriorQuery"; 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /yes: -------------------------------------------------------------------------------- 1 | -----BEGIN RSA PRIVATE KEY----- 2 | Proc-Type: 4,ENCRYPTED 3 | DEK-Info: AES-128-CBC,9F33236E2FD99EACABA4D7F529D0E8A5 4 | 5 | 8PISpGsmdq0QuL/NcFlGOZznZdyibB1/A6nI5bfiDljT5hzQ7xWFBM3S2IHeKUVK 6 | wdJdA+c3Y4dXgRllMczUMJBXX3UfObsm3/5TWCKPwczxLJ0tgxCgYVX9KNVt0Ngv 7 | b2ayQqkNBvBHq5ooKr8glkjvZ0Wl6QZ+W4pz8KndfzSiUri/WTryEmjzYbgyBXyG 8 | 8L+wG8mGOiCYKOFlVM+ViE8f3d+i0lsxX7PgXkdyWOvlgx2Iy3MhLQNXw0LztU6I 9 | QFpa1DtjcBewpvYtPJn6fma1nqhc8bTSaM0/1a8aLeCJWqCzQ5vD1wkkZ1eLEMDn 10 | Jg0D1fT2mm3XtNAMwOHcd+j3IG7aTofhU+XRBPk1YRdbOJjNuMzgV+P3dxXUhGLV 11 | N9vb2hUm/wIXngiKTeigsYGj59nvhyda6DfLhsNfizH1M/Foq3ZaNdWCvwtfJzAS 12 | sw2tW+PnPJiKpSXE1O7DQ3fduv5gBrrxZ906kHVKzPPa0T+0HWN+Z3MyM8IbuYKf 13 | zUVo0IogdobK+vm6HcKTWCdV0v5BPG6cTWHbTUi2kdJLc9j1lnnzEAOMIHYexsg2 14 | 8PmD2uncDNvUvS5DDILVSFj40zG57c2pVgBWcN1U211env8eb9jD4oJr+rOH4gvm 15 | pDLnB72eYZmQ9oUnnLsPo8c8cFfuJiTYIqmPW5crpzjUQlDlMlc8Kh5A3XJ/XHqh 16 | 
sq7M+Kn54l51SH+FvpS6u/s6dpwjCa+UbuFzdbJlE/RRLZaoTh0gov6k0n/48XSU 17 | 0XBJGuKyL8hmGmwAyMwdNb9vuH0Gah30ZeIpb8Iiw6aUNeCnpDrZ+b1M7VAC2Q/M 18 | UvuNe+datxI4FNyqPOnmi8o+vkWl3W8+M71qkGGsi+qnSUwnR9uUFg6VBt1WEdHw 19 | QpkPeQtnS53kadKSqLZEnPTnLsEYZfByCexgdXzJr32+IiUzkM8PoNuMzhVR+LgK 20 | Q55EJyFj736f8pwzC4k8Iz5WwAqnabXJH3eEW+o94a75xCM/32QW3ZJS8+yVh3Jb 21 | R622Tu9S6VxPzrS/HRbAmLCsWwy8svobKVTMN5vOzx3bZ5DrtjdyY8eBfQgBLQQW 22 | HxKGXYygz7M93e03K4VQbc0Gt1igBXgOH/W6MZXAzMk+WfXVRml2BzUWnh5pGvVt 23 | x0+vlbWESWKdIYY16R20R2594Elh9j1kgzRE3c3f0Aq86S5VhR4wvwcjF2GpHHuH 24 | 1ILCKvmWehfl+DJ1kyYfAXemsHxkkAHNCpJQ9TeKQiVUWDIjEBvuEEn6lgEu3vWG 25 | LgCV+AkWRKRRExssPK/Cj/VUqa4mhFLOy61JKi5XCj991MwXXJPaSmTp9j2hofcR 26 | yZWkaqwhe3kkZfVCETl4wTAPs+uB+7vW1zO70me959D4qoZVmu/Lr/VnGDw+7WIg 27 | NuDyIM7cSE/8va7r19b0uDJdwLrcmir8WwmxJOHCOQv+hY73RR2Hcmr1EtOp2BEw 28 | dwuc3+ewMcvNSQLnBUe/6OMRz9Z0kc620f1H6X4WJHu1BA0FDCbh9HeEpx3zsECN 29 | YPCrlZUS68kkGxscE3QgtTnKDsjArPrLxxFueBAlpYVUW+jzhqnd1w0xCXBbB2tV 30 | xi0kIpigCobhS35xig5nR6nkoSjc7nr6ybiEVA5x6Fbd41fVwtFop+4W8kmZL4/I 31 | 8lwHBp8SLRv/bjN7q72rYn1HH+JIKAskuLhpG00nK9gzDDhWYGVEuFhy3Jw6OZtZ 32 | tro3PRgAW83yKIjGvpGb1iZEg4YQhldZGq4/bxOU0FYKTniGlA+sZFmSYrKCPW4D 33 | 9J8isFexm0P6Dv8vjzIV/WSbTn9Z9bF3bcN1Eg91a2X/82iVlTh4Lgea8NMhLtUN 34 | nyKETpUQRoT126mHuaVbvD/OG2PUwLMt5vahQoTaYmazTk+Uevjgi9PfWBSLzsqB 35 | QKzCNoQjzcibYuAv4zU7hbjEXjtLXXkVyzVhTTiTKzIXEd8c8f4XUSEHo82UXjMa 36 | gzsXx8VsTEzfBEPSruBaKxf82LInpgGwNPlVTsW+g1T0nGE0qC7W/BYfSYEVCZY/ 37 | PmaIwMzC5akuZnqiLTOMIwEdPe8iYzntcvCfUZB1rL75Xx6Y7YFLIt96fKFb5Nn1 38 | Yni80JAtvoFlCiZaUSoWAnHecXNewd5xwJjaJdgFh38cCZmvjTxupk9rU3lsLLoX 39 | tzZEFJv7Qt/axbqkIT/zdJr87zeScAgLU7PcpO05LPQR9pU3mm3z0jylgMUzU1Qz 40 | IVJHQ8CnaCTg2S7fwjZjHVlHIrPPiZgVhTN7Rt8vZ3CB7Wf8sXpGBIXADYoNiOVG 41 | lvtrXJYGZ5uoyeJLHerGNyMu4B3iCoY50kilNCcQ3cfX7G6SMwMgH1oJDHCMdOr+ 42 | PLWf45FcwQrhkj56DqytV389OKaADsJoNuEpgbLmnUBCJQHzq4/Lfoqvqj7z2PE/ 43 | F2kgb9JN7eBfbw/a6Sa7A0Qe8yCOVd9HWqSt0sQDqITcybF/gfU5IAjaFDWm6xKk 44 | 
FMFKTigj6Y4UfDfffZfVFAJ0AqNfkHTAI92ShGU/hrDAHmcgiio3m93IsnSjqYWs 45 | McFgcvsaqQpb4LfkdckXBDrZCVXNbeOe7JdxLcZxlI1hHeve2spz7zY7N3MTZzNm 46 | xZ0wcndfcmfVv/KXGvjPGh9+rrZyWXfeT5bwE6wLwg+CJmCI2AJDvoGdx7hkL8FL 47 | FKjbOrnTCai+Q4/vOdVpQz7/X7nyIX5DgqthqI8PTF4qAmoKM8htATK96CfW/Mw5 48 | PEQbU25nRHSE/TxVWoeoPJ5YQLnlh6Voey9Sk5vSzBNwyZXde9/1okZPvnZjmcvu 49 | 9TxOpoETYnNyfZEJ4g4FvHWSpN7YiDnNiwvD4nCRIq9oQTWhjK3w76Drv92MjaqJ 50 | bzaNMt909qVjLaio1sT5tDtqXT9Me5R7bL1qoEPXAePzYD7Bc1kZs1FD3emCCjh+ 51 | TL/sLv64fPrpEH026AKfNqUWd9A0EexJqnVH6J6TgE6LrYe7Wq8PHlc+3DiEdroT 52 | qyMnP71BTu/UrUcm/rQ/+FDvduVncD0mDuUaw3Vr3Lf0DgYr/7nd5IFMP+5bpPZo 53 | KU5dNyRfOYOZTJ4vdTYpjeOU1IkjP+fBrbZ8wacHEqju68v4XViIJNaZrAJmq5t/ 54 | -----END RSA PRIVATE KEY----- 55 | -------------------------------------------------------------------------------- /doser-dis-extensions/src/main/java/doser/lucene/query/SensePriorQuery.java: -------------------------------------------------------------------------------- 1 | package doser.lucene.query; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.lucene.index.AtomicReaderContext; 6 | import org.apache.lucene.search.Explanation; 7 | import org.apache.lucene.search.IndexSearcher; 8 | import org.apache.lucene.search.Query; 9 | import org.apache.lucene.search.Scorer; 10 | import org.apache.lucene.search.Weight; 11 | import org.apache.lucene.util.Bits; 12 | 13 | import doser.lucene.features.IEntityCentricExtFeatures; 14 | 15 | /** 16 | * Due to major performance problems if we use an IndexReader request for every 17 | * single document, we create a Hashmap to improve the 18 | * overall performance. 19 | * 20 | * Our StartupInformationLoader provides these necessary information much 21 | * faster. 
22 | * 23 | * @author Stefan Zwicklbauer 24 | * 25 | */ 26 | public class SensePriorQuery extends Query { 27 | 28 | class PriorWeight extends Weight { 29 | 30 | class SensePriorScorer extends Scorer { 31 | 32 | private final AtomicReaderContext context; 33 | 34 | private int lastDoc = -1; 35 | 36 | SensePriorScorer(final Weight weight, 37 | final AtomicReaderContext context) { 38 | super(weight); 39 | this.context = context; 40 | } 41 | 42 | @Override 43 | public int advance(final int target) throws IOException { 44 | final int maxdoc = context.reader().numDocs(); 45 | if (target > (maxdoc - 1)) { 46 | return NO_MORE_DOCS; 47 | } 48 | return lastDoc = target; 49 | } 50 | 51 | @Override 52 | public long cost() { 53 | return 0; 54 | } 55 | 56 | @Override 57 | public int docID() { 58 | return lastDoc; 59 | } 60 | 61 | @Override 62 | public int freq() throws IOException { 63 | return 1; 64 | } 65 | 66 | @Override 67 | public int nextDoc() throws IOException { 68 | if ((context.reader().numDocs() - 1) > lastDoc) { 69 | return ++lastDoc; 70 | } else { 71 | return NO_MORE_DOCS; 72 | } 73 | } 74 | 75 | @Override 76 | public float score() throws IOException { 77 | float res = 0.0f; 78 | res = kb.getSensePriorOfDocument(keyword, context.docBase 79 | + lastDoc); 80 | return res; 81 | } 82 | 83 | @Override 84 | public String toString() { 85 | return "SensePrior"; 86 | } 87 | 88 | } 89 | 90 | @Override 91 | public Explanation explain(final AtomicReaderContext context, 92 | final int doc) throws IOException { 93 | return null; 94 | } 95 | 96 | @Override 97 | public Query getQuery() { 98 | return SensePriorQuery.this; 99 | } 100 | 101 | @Override 102 | public float getValueForNormalization() throws IOException { 103 | return 0; 104 | } 105 | 106 | @Override 107 | public void normalize(final float norm, final float topLevelBoost) { 108 | // Do nothing here 109 | } 110 | 111 | @Override 112 | public Scorer scorer(AtomicReaderContext context, Bits acceptDocs) 113 | throws 
IOException { 114 | return new SensePriorScorer(this, context); 115 | } 116 | 117 | } 118 | 119 | private final IEntityCentricExtFeatures kb; 120 | 121 | private final String keyword; 122 | 123 | public SensePriorQuery(final String keyword, final IEntityCentricExtFeatures kb) { 124 | super(); 125 | this.keyword = keyword; 126 | this.kb = kb; 127 | } 128 | 129 | @Override 130 | public Weight createWeight(final IndexSearcher searcher) throws IOException { 131 | return new PriorWeight(); 132 | } 133 | 134 | @Override 135 | public String toString(final String field) { 136 | return "SensePriorQuery"; 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/SurfaceForm.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.algorithms; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | public class SurfaceForm implements Comparable, Cloneable { 7 | 8 | private int queryNr; 9 | private String surfaceForm; 10 | private String context; 11 | private List candidates; 12 | private Integer ambiguity; 13 | private boolean isACandidate; 14 | private double difference; 15 | private int position; 16 | private boolean matchesInitial; 17 | private boolean initial; 18 | private boolean isRelevant; 19 | 20 | public SurfaceForm(String surfaceForm, String context, List candidates, int qryNr, int position) { 21 | super(); 22 | this.ambiguity = candidates.size(); 23 | this.surfaceForm = surfaceForm; 24 | this.context = context; 25 | this.candidates = candidates; 26 | this.queryNr = qryNr; 27 | this.isACandidate = true; 28 | this.difference = 0; 29 | this.position = position; 30 | this.initial = false; 31 | this.isRelevant = true; 32 | } 33 | 34 | public boolean isRelevant() { 35 | return isRelevant; 36 | } 37 | 38 | public void setRelevant(boolean isRelevant) { 39 | this.isRelevant = 
isRelevant; 40 | } 41 | 42 | public boolean isMatchesInitial() { 43 | return matchesInitial; 44 | } 45 | 46 | public void setMatchesInitial(boolean matchesInitial) { 47 | this.matchesInitial = matchesInitial; 48 | } 49 | 50 | public boolean isInitial() { 51 | return initial; 52 | } 53 | 54 | public void setInitial(boolean initial) { 55 | this.initial = initial; 56 | } 57 | 58 | public void setCandidates(List candidates) { 59 | this.candidates = candidates; 60 | } 61 | 62 | public List getCandidates() { 63 | return candidates; 64 | } 65 | 66 | public void setACandidate(boolean can) { 67 | this.isACandidate = can; 68 | } 69 | 70 | public String getSurfaceForm() { 71 | return surfaceForm; 72 | } 73 | 74 | public boolean isACandidate() { 75 | return isACandidate; 76 | } 77 | 78 | public String getContext() { 79 | return context; 80 | } 81 | 82 | public int getQueryNr() { 83 | return queryNr; 84 | } 85 | 86 | public int getAmbiguity() { 87 | return this.ambiguity; 88 | } 89 | 90 | public void setDisambiguatedEntity(String url) { 91 | candidates.clear(); 92 | candidates.add(url); 93 | } 94 | 95 | public void clearList() { 96 | candidates.clear(); 97 | } 98 | 99 | public void addCandidate(String can) { 100 | candidates.add(can); 101 | } 102 | 103 | public double getDifference() { 104 | return difference; 105 | } 106 | 107 | public void setDifference(double difference) { 108 | this.difference = difference; 109 | } 110 | 111 | public int getPosition() { 112 | return position; 113 | } 114 | 115 | public void setPosition(int position) { 116 | this.position = position; 117 | } 118 | 119 | @Override 120 | public int compareTo(SurfaceForm o) { 121 | if (this.difference < o.getDifference()) { 122 | return 1; 123 | } else if (this.difference > o.getDifference()) { 124 | return -1; 125 | } else { 126 | return 0; 127 | } 128 | } 129 | 130 | public Object clone() { 131 | ArrayList newCandidates = new ArrayList(); 132 | for (String s : candidates) { 133 | newCandidates.add(s); 134 | } 
135 | 136 | SurfaceForm n = new SurfaceForm(new String(this.surfaceForm), new String(this.context), newCandidates, 137 | this.queryNr, this.position); 138 | n.setACandidate(this.isACandidate); 139 | n.setInitial(this.initial); 140 | n.setMatchesInitial(this.matchesInitial); 141 | n.setRelevant(this.isRelevant); 142 | return n; 143 | } 144 | } -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/rules/UnambiguousToAmbiguousRule.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.algorithms.rules; 2 | 3 | import java.io.IOException; 4 | import java.util.HashMap; 5 | import java.util.LinkedList; 6 | import java.util.List; 7 | import java.util.Map; 8 | 9 | import org.apache.lucene.document.Document; 10 | import org.apache.lucene.index.Term; 11 | import org.apache.lucene.search.IndexSearcher; 12 | import org.apache.lucene.search.Query; 13 | import org.apache.lucene.search.ScoreDoc; 14 | import org.apache.lucene.search.TopDocs; 15 | 16 | import doser.entitydisambiguation.algorithms.SurfaceForm; 17 | import doser.entitydisambiguation.knowledgebases.EntityCentricKBDBpedia; 18 | import doser.lucene.query.TermQuery; 19 | 20 | /** 21 | * If a surface form is unambiguous and further surface forms are an 22 | * abbreviation of it but are themselves ambiguous, they are resolved 23 | * immediately. 24 | * 25 | * Example: surface form 1: Burlington Industries Inc (unambiguous), surface 26 | * form 2: Burlington (ambiguous) ...
27 | * 28 | * 29 | * @author quh 30 | * 31 | */ 32 | 33 | class UnambiguousToAmbiguousRule extends AbstractRule { 34 | 35 | UnambiguousToAmbiguousRule(EntityCentricKBDBpedia eckb) { 36 | super(eckb); 37 | } 38 | 39 | @Override 40 | public boolean applyRule(List rep) { 41 | List unambiguous = new LinkedList(); 42 | for (SurfaceForm c : rep) { 43 | if (c.getCandidates().size() == 1) { 44 | String candidate = c.getCandidates().get(0); 45 | String type = queryType(candidate); 46 | if (type.equalsIgnoreCase("Person") || type.equalsIgnoreCase("Organisation")) { 47 | unambiguous.add(c); 48 | } 49 | } 50 | } 51 | for (SurfaceForm c : rep) { 52 | if (c.getCandidates().size() > 1) { 53 | HashMap map = new HashMap(); 54 | for (SurfaceForm un : unambiguous) { 55 | String type = queryType(un.getCandidates().get(0)); 56 | if ((isSubString(un.getSurfaceForm(), c.getSurfaceForm()) 57 | && c.getCandidates().contains(un.getCandidates().get(0)) 58 | && un.getPosition() < c.getPosition()) 59 | || (type.equalsIgnoreCase("Person") && isSubString(un.getSurfaceForm(), c.getSurfaceForm()) 60 | && un.getPosition() < c.getPosition())) { 61 | map.put(un.getCandidates().get(0), c.getPosition() - un.getPosition()); 62 | // c.setDisambiguatedEntity(un.getCandidates().get(0)); 63 | } 64 | } 65 | if (!map.isEmpty()) { 66 | int distance = Integer.MAX_VALUE; 67 | String can = ""; 68 | for (Map.Entry entry : map.entrySet()) { 69 | if (entry.getValue() < distance) { 70 | distance = entry.getValue(); 71 | can = entry.getKey(); 72 | } 73 | } 74 | c.setDisambiguatedEntity(can); 75 | } 76 | } 77 | } 78 | return false; 79 | } 80 | 81 | private boolean isSubString(String s1, String s2) { 82 | if (s1.toLowerCase().contains(s2.toLowerCase())) { 83 | return true; 84 | } else 85 | return false; 86 | } 87 | 88 | private String queryType(String url) { 89 | String type = ""; 90 | IndexSearcher searcher = eckb.getSearcher(); 91 | Query q = new TermQuery(new Term("Mainlink", url)); 92 | try { 93 | TopDocs docs = 
searcher.search(q, 1); 94 | ScoreDoc[] scoredocs = docs.scoreDocs; 95 | if(scoredocs.length == 0) { 96 | type = "Misc"; 97 | } else { 98 | int nr = scoredocs[0].doc; 99 | Document doc = searcher.getIndexReader().document(nr); 100 | type = doc.get("Type"); 101 | } 102 | } catch (IOException e) { 103 | e.printStackTrace(); 104 | } 105 | return type; 106 | } 107 | 108 | } 109 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/rules/NoCandidatesExpansionRules.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.algorithms.rules; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.LinkedList; 6 | import java.util.List; 7 | 8 | import org.apache.lucene.document.Document; 9 | import org.apache.lucene.index.IndexReader; 10 | import org.apache.lucene.search.IndexSearcher; 11 | import org.apache.lucene.search.ScoreDoc; 12 | import org.apache.lucene.search.TopDocs; 13 | import org.apache.lucene.search.similarities.DefaultSimilarity; 14 | 15 | import doser.entitydisambiguation.algorithms.SurfaceForm; 16 | import doser.entitydisambiguation.knowledgebases.AbstractKnowledgeBase; 17 | import doser.lucene.features.LuceneFeatures; 18 | import doser.lucene.query.LearnToRankClause; 19 | import doser.lucene.query.LearnToRankQuery; 20 | import doser.tools.Inflector; 21 | 22 | /** 23 | * If a surface form has no candidates but consists of at least 3 words, all 24 | * words with 3 or fewer letters are removed and the query is repeated. The 25 | * same is done again after removing special characters. The candidates are 26 | * set accordingly.
27 | * 28 | * @author quh 29 | */ 30 | 31 | class NoCandidatesExpansionRules extends AbstractRule { 32 | 33 | NoCandidatesExpansionRules(AbstractKnowledgeBase eckb) { 34 | super(eckb); 35 | } 36 | 37 | @Override 38 | public boolean applyRule(List rep) { 39 | for (SurfaceForm c : rep) { 40 | if (c.getCandidates().size() == 0) { 41 | c.setCandidates(queryCandidates(c.getSurfaceForm())); 42 | } 43 | } 44 | return false; 45 | } 46 | 47 | private ArrayList queryCandidates(String surfaceForm) { 48 | ArrayList lst = new ArrayList(); 49 | String[] splitter = surfaceForm.split(" "); 50 | if (splitter.length > 2) { 51 | StringBuilder builder = new StringBuilder(); 52 | for (int i = 0; i < splitter.length; i++) { 53 | if (splitter[i].length() > 3) { 54 | builder.append(splitter[i] + " "); 55 | 56 | } 57 | } 58 | String builderstring = builder.toString(); 59 | if (builderstring.length() > 0) { 60 | String newSf = builderstring.substring(0, 61 | builderstring.length() - 1); 62 | lst = queryLucene(surfaceForm); /* NOTE(review): newSf (short words removed) is built just above, yet this first query still passes the unfiltered surfaceForm - confirm whether newSf was intended here. */ 63 | if (lst.size() == 0) { 64 | // Try again without special chars 65 | newSf = newSf.replaceAll("[^a-zA-Z ]", ""); 66 | lst = queryLucene(newSf); 67 | // If size is 0 anyway, still check Plural to singular 68 | if (lst.size() == 0) { 69 | String singular = Inflector.getInstance().singularize( 70 | newSf); 71 | if (!newSf.equalsIgnoreCase(singular)) { 72 | // Try singular search 73 | lst = queryCandidates(singular); 74 | } 75 | } 76 | } 77 | } 78 | } 79 | return lst; 80 | } 81 | 82 | private ArrayList queryLucene(String surfaceForm) { 83 | ArrayList list = new ArrayList(); 84 | final IndexSearcher searcher = eckb.getSearcher(); 85 | final IndexReader reader = searcher.getIndexReader(); 86 | LearnToRankQuery query = new LearnToRankQuery(); 87 | List features = new LinkedList(); 88 | DefaultSimilarity defaultSim = new DefaultSimilarity(); 89 | features.add(query.add(LuceneFeatures.queryLabelTerm(surfaceForm, 90 | "UniqueLabel", defaultSim), "Feature1", true)); 91 | try {
92 | final TopDocs top = searcher.search(query, 150); 93 | final ScoreDoc[] score = top.scoreDocs; 94 | if (score.length <= 5) { 95 | for (int i = 0; i < score.length; ++i) { 96 | final Document doc = reader.document(score[i].doc); 97 | list.add(doc.get("Mainlink")); 98 | } 99 | } 100 | } catch (IOException e) { 101 | e.printStackTrace(); 102 | } 103 | return list; 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/tools/NTToDbPediaUrlEncoding.java: -------------------------------------------------------------------------------- 1 | package doser.tools; 2 | 3 | import java.io.BufferedReader; 4 | import java.io.File; 5 | import java.io.FileNotFoundException; 6 | import java.io.FileReader; 7 | import java.io.FileWriter; 8 | import java.io.IOException; 9 | import java.io.UnsupportedEncodingException; 10 | import java.io.Writer; 11 | import java.net.URLEncoder; 12 | 13 | import org.apache.commons.lang.StringEscapeUtils; 14 | import org.apache.log4j.Logger; 15 | 16 | public final class NTToDbPediaUrlEncoding { 17 | 18 | private NTToDbPediaUrlEncoding() { 19 | super(); 20 | } 21 | 22 | public static String dbpediaEncoding(final String url) { 23 | final StringBuffer buffer = new StringBuffer(); 24 | for (int i = 0; i < url.length(); i++) { 25 | final String str = String.valueOf(url.charAt(i)); 26 | if (str.equalsIgnoreCase("!")) { 27 | buffer.append('!'); 28 | } else if (str.equalsIgnoreCase("$")) { 29 | buffer.append('$'); 30 | } else if (str.equalsIgnoreCase("&")) { 31 | buffer.append('&'); 32 | } else if (str.equalsIgnoreCase("'")) { 33 | buffer.append('\''); 34 | } else if (str.equalsIgnoreCase("(")) { 35 | buffer.append('('); 36 | } else if (str.equalsIgnoreCase(")")) { 37 | buffer.append(')'); 38 | } else if (str.equalsIgnoreCase("*")) { 39 | buffer.append('*'); 40 | } else if (str.equalsIgnoreCase("+")) { 41 | buffer.append('+'); 42 | } else if (str.equalsIgnoreCase(",")) { 43 | 
buffer.append(','); 44 | } else if (str.equalsIgnoreCase("-")) { 45 | buffer.append('-'); 46 | } else if (str.equalsIgnoreCase("/")) { 47 | buffer.append('/'); 48 | } else if (str.equalsIgnoreCase(":")) { 49 | buffer.append(':'); 50 | } else if (str.equalsIgnoreCase(";")) { 51 | buffer.append(';'); 52 | } else if (str.equalsIgnoreCase("=")) { 53 | buffer.append('='); 54 | } else if (str.equalsIgnoreCase("@")) { 55 | buffer.append('@'); 56 | } else if (str.equalsIgnoreCase("_")) { 57 | buffer.append('_'); 58 | } else if (str.equalsIgnoreCase("~")) { 59 | buffer.append('~'); 60 | } else { 61 | try { 62 | buffer.append(URLEncoder.encode(str, "UTF-8")); 63 | } catch (final UnsupportedEncodingException e) { 64 | Logger.getRootLogger().error(e.getStackTrace()); 65 | } 66 | } 67 | } 68 | return buffer.toString(); 69 | } 70 | 71 | public static void main(final String[] args) throws IOException { 72 | final String fileInput = args[0]; 73 | final String fileOutput = args[1]; 74 | final File fileIn = new File(fileInput); 75 | final File fileOut = new File(fileOutput); 76 | final Writer writer = new FileWriter(fileOut); 77 | BufferedReader reader = null; 78 | try { 79 | reader = new BufferedReader(new FileReader(fileIn)); 80 | } catch (final FileNotFoundException e) { 81 | Logger.getRootLogger().error(e.getStackTrace()); 82 | } 83 | String line = null; 84 | while ((line = reader.readLine()) != null) { 85 | line = line.replaceAll("[ ]+", " "); 86 | final String splitter[] = line.split(" "); 87 | final StringBuffer buffer = new StringBuffer(); 88 | 89 | // Subject 90 | String url = splitter[0].substring(1, splitter[0].length() - 1); 91 | String sLine = StringEscapeUtils.unescapeJava(url); 92 | buffer.append("<" + dbpediaEncoding(sLine) + "> "); 93 | 94 | // Predicate 95 | buffer.append(splitter[1] + " "); 96 | 97 | // Object 98 | if (splitter[2].startsWith("<")) { 99 | url = splitter[2].substring(1, splitter[2].length() - 1); 100 | sLine = StringEscapeUtils.unescapeJava(url); 
101 | buffer.append("<" + dbpediaEncoding(sLine) + ">"); 102 | } else { 103 | buffer.append(splitter[2]); 104 | } 105 | writer.write(buffer.toString()); 106 | writer.write(System.getProperty("line.separator")); 107 | writer.flush(); 108 | } 109 | writer.close(); 110 | reader.close(); 111 | } 112 | 113 | } 114 | -------------------------------------------------------------------------------- /doser-dis-extensions/src/main/java/doser/general/Test.java: -------------------------------------------------------------------------------- 1 | package doser.general; 2 | 3 | import java.io.IOException; 4 | import java.text.ParseException; 5 | 6 | import org.apache.lucene.analysis.standard.StandardAnalyzer; 7 | import org.apache.lucene.document.Document; 8 | import org.apache.lucene.document.Field.Store; 9 | import org.apache.lucene.document.StringField; 10 | import org.apache.lucene.document.TextField; 11 | import org.apache.lucene.index.DirectoryReader; 12 | import org.apache.lucene.index.IndexReader; 13 | import org.apache.lucene.index.IndexWriter; 14 | import org.apache.lucene.index.IndexWriterConfig; 15 | import org.apache.lucene.index.Term; 16 | import org.apache.lucene.search.BooleanClause.Occur; 17 | import org.apache.lucene.search.BooleanQuery; 18 | import org.apache.lucene.search.IndexSearcher; 19 | import org.apache.lucene.search.PhraseQuery; 20 | import org.apache.lucene.search.Query; 21 | import org.apache.lucene.search.ScoreDoc; 22 | import org.apache.lucene.search.TermQuery; 23 | import org.apache.lucene.search.TopScoreDocCollector; 24 | import org.apache.lucene.search.spans.SpanNearQuery; 25 | import org.apache.lucene.search.spans.SpanQuery; 26 | import org.apache.lucene.search.spans.SpanTermQuery; 27 | import org.apache.lucene.store.Directory; 28 | import org.apache.lucene.store.RAMDirectory; 29 | import org.apache.lucene.util.Version; 30 | 31 | public class Test { 32 | private IndexWriter writer; 33 | 34 | public void lucene() throws IOException, 
ParseException { 35 | // Build the index 36 | StandardAnalyzer analyzer = new StandardAnalyzer(); 37 | Directory index = new RAMDirectory(); 38 | IndexWriterConfig config = new IndexWriterConfig(Version.LATEST, 39 | analyzer); 40 | this.writer = new IndexWriter(index, config); 41 | 42 | // Add documents to the index 43 | addDoc("Spring", new String[] { "Java", "JSP", "DBPEDIA_56testdoc" }); 44 | addDoc("Java", new String[] { "Oracle", "Annotation is cool too" }); 45 | 46 | writer.close(); 47 | 48 | // Search the index 49 | IndexReader reader = DirectoryReader.open(index); 50 | IndexSearcher searcher = new IndexSearcher(reader); 51 | 52 | TermQuery q = new TermQuery(new Term("keyword", "DBPEDIA_56testdoc")); 53 | // SpanQuery q = new SpanNearQuery(new SpanQuery[] { 54 | // new SpanTermQuery(new Term("keyword", "too")), 55 | // new SpanTermQuery(new Term("keyword", "cool"))}, 56 | // 3, 57 | // true); 58 | 59 | // String[] s = {"cool", "too"}; 60 | // for (int i = 0; i < s.length; i++) { 61 | // q.add(new Term("keyword", s[i])); 62 | // } 63 | 64 | // q.add(new PhraseQuery(new Term("keyword", "Annotation is cool")), 65 | // Occur.MUST); 66 | 67 | System.out.println(q.toString()); 68 | 69 | int hitsPerPage = 10; 70 | TopScoreDocCollector collector = TopScoreDocCollector.create( 71 | hitsPerPage, true); 72 | 73 | searcher.search(q, collector); 74 | 75 | ScoreDoc[] hits = collector.topDocs().scoreDocs; 76 | 77 | for (int i = 0; i < hits.length; ++i) { 78 | int docId = hits[i].doc; 79 | Document doc = searcher.doc(docId); 80 | System.out.println(hits[i].toString()); 81 | System.out.println((i + 1) + ". 
\t" + doc.get("title")); 82 | } 83 | 84 | reader.close(); 85 | } 86 | 87 | private void addDoc(String title, String[] keywords) throws IOException { 88 | // Create new document 89 | Document doc = new Document(); 90 | 91 | // Add title 92 | doc.add(new TextField("title", title, Store.YES)); 93 | 94 | // Add keywords 95 | for (int i = 0; i < keywords.length; i++) { 96 | doc.add(new StringField("keyword", keywords[i], Store.YES)); 97 | } 98 | 99 | // Add document to index 100 | this.writer.addDocument(doc); 101 | } 102 | 103 | public static void main(String[] args) { 104 | Test test = new Test(); 105 | try { 106 | test.lucene(); 107 | } catch (IOException e) { 108 | // TODO Auto-generated catch block 109 | e.printStackTrace(); 110 | } catch (ParseException e) { 111 | // TODO Auto-generated catch block 112 | e.printStackTrace(); 113 | } 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /doser-dis-core/src/test/java/doser/test/breakdetection/BreakDetection.java: -------------------------------------------------------------------------------- 1 | package doser.test.breakdetection; 2 | 3 | public class BreakDetection { 4 | 5 | // @SuppressWarnings("deprecation") 6 | // public static void main(String[] args) { 7 | // 8 | // List shotList = new LinkedList(); 9 | // WebSite shot1 = new WebSite(); 10 | // shot1.setName("1"); 11 | // shot1.setText("Text1"); 12 | // shot1.setObjectId(0); 13 | // WebSite shot2 = new WebSite(); 14 | // shot2.setObjectId(1); 15 | // shot2.setName("2"); 16 | // shot2.setText("Text2"); 17 | // WebSite shot3 = new WebSite(); 18 | // shot3.setObjectId(2); 19 | // shot3.setName("3"); 20 | // shot3.setText("Text3"); 21 | // WebSite shot4 = new WebSite(); 22 | // shot4.setObjectId(3); 23 | // shot4.setName("4"); 24 | // shot4.setText("Text4"); 25 | // WebSite shot5 = new WebSite(); 26 | // shot5.setObjectId(4); 27 | // shot5.setName("5"); 28 | // shot5.setText("Text5"); 29 | // WebSite shot6 = new 
/**
 * Placeholder for a manual break-detection / clustering experiment.
 *
 * <p>
 * The class currently has no members: the whole experiment below is commented
 * out. The disabled code builds six {@code WebSite} objects, fills a 6x6
 * similarity matrix, runs a {@code Decomposition} thread and prints the object
 * ids of the clusters returned by {@code ConcurrentNCutAlgorithm}.
 *
 * <p>
 * NOTE(review): the generic type arguments of the disabled code appear to be
 * missing in this copy (e.g. {@code List> clusterList}). Also, the matrix is
 * not symmetric ([0][4] = 0.4 vs. [4][0] = 0.5, [2][4] = 0.5 vs. [4][2] = 1)
 * and [4][4] is 0.5 while every other diagonal entry is 1 - confirm whether
 * that is intended before re-enabling the code.
 */
public class BreakDetection {

    // @SuppressWarnings("deprecation")
    // public static void main(String[] args) {
    //
    // List shotList = new LinkedList();
    // WebSite shot1 = new WebSite();
    // shot1.setName("1");
    // shot1.setText("Text1");
    // shot1.setObjectId(0);
    // WebSite shot2 = new WebSite();
    // shot2.setObjectId(1);
    // shot2.setName("2");
    // shot2.setText("Text2");
    // WebSite shot3 = new WebSite();
    // shot3.setObjectId(2);
    // shot3.setName("3");
    // shot3.setText("Text3");
    // WebSite shot4 = new WebSite();
    // shot4.setObjectId(3);
    // shot4.setName("4");
    // shot4.setText("Text4");
    // WebSite shot5 = new WebSite();
    // shot5.setObjectId(4);
    // shot5.setName("5");
    // shot5.setText("Text5");
    // WebSite shot6 = new WebSite();
    // shot6.setObjectId(5);
    // shot6.setName("6");
    // shot6.setText("Text6");
    //
    // shotList.add(shot1);
    // shotList.add(shot2);
    // shotList.add(shot3);
    // shotList.add(shot4);
    // shotList.add(shot5);
    // shotList.add(shot6);
    // Decomposition decomp = new Decomposition(shotList);
    //
    // double[][] similarityMatrix = new double[6][6];
    // similarityMatrix[0][0] = 1;
    // similarityMatrix[0][1] = 0.5;
    // similarityMatrix[0][2] = 0.5;
    // similarityMatrix[0][3] = 0.8;
    // similarityMatrix[0][4] = 0.4;
    // similarityMatrix[0][5] = 0.8;
    //
    // similarityMatrix[1][0] = 0.5;
    // similarityMatrix[1][1] = 1.0;
    // similarityMatrix[1][2] = 0.5;
    // similarityMatrix[1][3] = 0.5;
    // similarityMatrix[1][4] = 0.5;
    // similarityMatrix[1][5] = 0.5;
    //
    // similarityMatrix[2][0] = 0.5;
    // similarityMatrix[2][1] = 0.5;
    // similarityMatrix[2][2] = 1;
    // similarityMatrix[2][3] = 0.5;
    // similarityMatrix[2][4] = 0.5;
    // similarityMatrix[2][5] = 0.5;
    //
    // similarityMatrix[3][0] = 0.8;
    // similarityMatrix[3][1] = 0.5;
    // similarityMatrix[3][2] = 0.5;
    // similarityMatrix[3][3] = 1;
    // similarityMatrix[3][4] = 0.5;
    // similarityMatrix[3][5] = 0.8;
    //
    // similarityMatrix[4][0] = 0.5;
    // similarityMatrix[4][1] = 0.5;
    // similarityMatrix[4][2] = 1;
    // similarityMatrix[4][3] = 0.5;
    // similarityMatrix[4][4] = 0.5;
    // similarityMatrix[4][5] = 0.5;
    //
    // similarityMatrix[5][0] = 0.8;
    // similarityMatrix[5][1] = 0.5;
    // similarityMatrix[5][2] = 0.5;
    // similarityMatrix[5][3] = 0.8;
    // similarityMatrix[5][4] = 0.5;
    // similarityMatrix[5][5] = 1;
    //
    // decomp.setSimilarityMatrix(similarityMatrix);
    // decomp.start();
    // try {
    // decomp.join();
    // } catch (InterruptedException e) {
    // e.printStackTrace();
    // }
    //
    // ConcurrentNCutAlgorithm nCutAlgorithm = new ConcurrentNCutAlgorithm(decomp.getMainCluster());
    //
    // List> clusterList = nCutAlgorithm.startClustering();
    // for (Cluster cluster : clusterList) {
    // List list = cluster.getObjectList();
    // for (WebSite site : list) {
    // System.out.println("Site id: " + site.getObjectId());
    // }
    // }
    //
    //
    //// decomp.createUndirectedWeightedGraph();
    // // Third Step: VideoDecomposition
    //// List> clusterLst = doVideoDecomposition(decomp, shotList);
    //
    //// // Step Four: Temporal Graph Creation
    //// TemporalGraph> tempGraph = doTemporalGraphGeneration(clusterLst);
    ////
    //// // Step Five: Shortest Path
    //// List shortestPath = doShortestPath(tempGraph);
    ////
    //// // Step Six: Scene Extraction
    //// doSceneExtraction(tempGraph, shortestPath);
    // }


}
/**
 * Reduces the candidate list of every surface form that has more than
 * {@link #ENTITYTHRESHOLD} candidates.
 *
 * <p>
 * NOTE(review): the generic type arguments in this class appear to have been
 * lost in this copy of the file (e.g. {@code List> l}); the code is kept as
 * found and has to be checked against the original source.
 */
public class CandidatePruning {

    /** Upper bound for the candidates added in the Word2Vec step of {@link #prune(List)}. */
    private static final int NUMBEROFADDITIONALW2VENTITIES = 6;

    /**
     * Surface forms with more candidates than this are pruned; it is also the
     * number of entries taken from the occurrence-sorted list.
     */
    private static final int ENTITYTHRESHOLD = 6;

    /** Minimum number of collected single-candidate entities required for the Word2Vec step. */
    private static final int MINIMUMSURFACEFORMS = 3;

    /** A candidate enters the Word2Vec step only if its similarity value exceeds this. */
    private static final float WORD2VECTHRESHOLD = 1.60f;

    /** Knowledge base that supplies occurrence counts and the Doc2Vec / Word2Vec similarities. */
    private AbstractEntityCentricKBGeneral eckb;

    public CandidatePruning(AbstractEntityCentricKBGeneral eckb) {
        super();
        this.eckb = eckb;
    }

    /**
     * Prunes, in place, the candidates of every surface form in {@code rep}
     * that has more than {@link #ENTITYTHRESHOLD} candidates. The new
     * candidate set is the union of:
     * <ol>
     * <li>the first {@link #ENTITYTHRESHOLD} entries of the candidates sorted
     * by their occurrence value for the surface form,</li>
     * <li>up to four further candidates taken in order from the list sorted
     * by Doc2Vec similarity,</li>
     * <li>if at least {@link #MINIMUMSURFACEFORMS} reference entities exist:
     * up to {@link #NUMBEROFADDITIONALW2VENTITIES} remaining candidates whose
     * Word2Vec value exceeds {@link #WORD2VECTHRESHOLD}, taken in order from
     * the list sorted by occurrence value.</li>
     * </ol>
     * Surface forms with fewer candidates are left untouched.
     *
     * @param rep
     *            the surface forms of one request; modified in place
     */
    public void prune(List rep) {
        // NOTE(review): "unambiguous" is filled here but never read afterwards
        // in this method - looks like a leftover; confirm and remove.
        List unambiguous = new LinkedList();
        for (SurfaceForm c : rep) {
            if (c.getCandidates().size() == 1) {
                unambiguous.add(c);
            }
        }

        // Reference entities: the single candidate of every surface form that
        // has exactly one candidate and is flagged as initial.
        List list = new LinkedList();
        for (SurfaceForm sf : rep) {
            if (rep.size() > 1 && sf.getCandidates().size() == 1 && sf.isInitial()) {
                list.add(sf.getCandidates().get(0));
            }
        }

        for (SurfaceForm c : rep) {
            List candidates = c.getCandidates();
            if (candidates.size() > ENTITYTHRESHOLD) {
                Set prunedCandidates = new HashSet();

                // Sense Prior
                Map map = new HashMap();
                for (String candidate : candidates) {
                    map.put(candidate, eckb.getFeatureDefinition().getOccurrences(c.getSurfaceForm(), candidate));
                }
                // presumably sortByValue orders by descending value - verify
                // against HelpfulMethods.
                // NOTE(review): if "candidates" contains duplicates the map has
                // fewer than ENTITYTHRESHOLD + 1 entries and l.get(i) below may
                // run out of bounds - confirm candidates are unique.
                @SuppressWarnings("deprecation")
                List> l = HelpfulMethods.sortByValue(map);
                for (int i = 0; i < ENTITYTHRESHOLD; ++i) {
                    prunedCandidates.add(l.get(i).getKey());
                    // System.out.println("SensePrior ADd: "+l.get(i).getKey()+"
                    // "+l.get(i).getValue());
                }

                // Doc2Vec ContextSimilarity
                Map map_doc2vec = new HashMap();
                for (String candidate : candidates) {

                    map_doc2vec.put(candidate, eckb.getDoc2VecSimilarity(c.getSurfaceForm(), c.getContext(), candidate));
                }
                @SuppressWarnings("deprecation")
                List> l_doc2vec = HelpfulMethods.sortByValue(map_doc2vec);
                // Walk the Doc2Vec ranking and add the first four candidates
                // that the sense prior step did not already select.
                int added = 0;
                int counter = 0;
                while (counter < l_doc2vec.size() && added < 4) {
                    String key = l_doc2vec.get(counter).getKey();
                    if (!prunedCandidates.contains(key)) {
                        prunedCandidates.add(key);
                        added++;
                    }
                    counter++;
                }
                // for (int i = 0; i < ENTITYTHRESHOLD; ++i) {
                // prunedCandidates.add(l_doc2vec.get(i).getKey());
                // }

                // Check for very relevant Candidates via given Word2Vec
                // similarities
                if (list.size() >= MINIMUMSURFACEFORMS) {
                    // First pass: collect the query strings of all candidates
                    // not selected so far, so the similarities are requested
                    // in a single call.
                    Set w2vFormatStrings = new HashSet();
                    for (String can : candidates) {
                        if (!prunedCandidates.contains(can)) {
                            String query = this.eckb.generateWord2VecFormatString(list, can);
                            w2vFormatStrings.add(query);
                        }
                    }

                    Map similarityMap = this.eckb.getWord2VecSimilarities(w2vFormatStrings);
                    Map occmap = new HashMap();
                    for (String can : candidates) {
                        if (!prunedCandidates.contains(can)) {
                            String query = this.eckb.generateWord2VecFormatString(list, can);
                            // NOTE(review): unboxing throws a
                            // NullPointerException if the map has no entry for
                            // the query - confirm the map is always complete.
                            float val = similarityMap.get(query);
                            if (val > WORD2VECTHRESHOLD) {
                                occmap.put(can, eckb.getFeatureDefinition().getOccurrences(c.getSurfaceForm(), can));
                                // prunedCandidates.add(can);
                            }
                        }
                    }
                    @SuppressWarnings("deprecation")
                    List> sortedl = HelpfulMethods.sortByValue(occmap);
                    for (int i = 0; i < NUMBEROFADDITIONALW2VENTITIES; ++i) {
                        if (i < sortedl.size()) {
                            prunedCandidates.add(sortedl.get(i).getKey());
                        }
                    }
                }

                // The result is built from a HashSet, so the order of the new
                // candidate list is unspecified.
                c.setCandidates(new ArrayList(prunedCandidates));
            }
        }
    }
}
*/ 13 | class ConjunctionScorer extends Scorer { 14 | static final class DocsAndFreqs { 15 | final long cost; 16 | int doc = -1; 17 | final Scorer scorer; 18 | 19 | DocsAndFreqs(final Scorer scorer) { 20 | this.scorer = scorer; 21 | cost = scorer.cost(); 22 | } 23 | } 24 | 25 | private final LearnToRankClause[] clauses; 26 | private final float coord; 27 | private final int docBase; 28 | protected final DocsAndFreqs[] docsAndFreqs; 29 | protected int lastDoc = -1; 30 | 31 | private final DocsAndFreqs lead; 32 | 33 | ConjunctionScorer(final Weight weight, final Scorer[] scorers, 34 | final float coord, final LearnToRankClause[] ltrclauses, 35 | final int docBase) { 36 | super(weight); 37 | this.coord = coord; 38 | this.docBase = docBase; 39 | clauses = ltrclauses; 40 | docsAndFreqs = new DocsAndFreqs[scorers.length]; 41 | for (int i = 0; i < scorers.length; i++) { 42 | docsAndFreqs[i] = new DocsAndFreqs(scorers[i]); 43 | } 44 | // Sort the array the first time to allow the least frequent DocsEnum to 45 | // lead the matching. 
46 | ArrayUtil.timSort(docsAndFreqs, new Comparator() { 47 | @Override 48 | public int compare(final DocsAndFreqs obj1, final DocsAndFreqs obj2) { 49 | return Long.signum(obj1.cost - obj2.cost); 50 | } 51 | }); 52 | 53 | lead = docsAndFreqs[0]; // least frequent DocsEnum leads the 54 | // intersection 55 | } 56 | 57 | ConjunctionScorer(final Weight weight, final Scorer[] scorers, 58 | final LearnToRankClause[] ltrclauses, final int docBase) { 59 | this(weight, scorers, 1f, ltrclauses, docBase); 60 | } 61 | 62 | @Override 63 | public int advance(final int target) throws IOException { 64 | lead.doc = lead.scorer.advance(target); 65 | return lastDoc = doNext(lead.doc); 66 | } 67 | 68 | @Override 69 | public long cost() { 70 | return lead.scorer.cost(); 71 | } 72 | 73 | @Override 74 | public int docID() { 75 | return lastDoc; 76 | } 77 | 78 | private int doNext(int doc) throws IOException { // NOPMD by quh on 28.02.14 79 | // 10:45 80 | for (;;) { 81 | // doc may already be NO_MORE_DOCS here, but we don't check 82 | // explicitly 83 | // since all scorers should advance to NO_MORE_DOCS, match, then 84 | // return that value. 85 | advanceHead: for (;;) { 86 | for (int i = 1; i < docsAndFreqs.length; i++) { 87 | // invariant: docsAndFreqs[i].doc <= doc at this point. 88 | 89 | // docsAndFreqs[i].doc may already be equal to doc if we 90 | // "broke advanceHead" 91 | // on the previous iteration and the advance on the lead 92 | // scorer exactly matched. 93 | if (docsAndFreqs[i].doc < doc) { 94 | docsAndFreqs[i].doc = docsAndFreqs[i].scorer 95 | .advance(doc); 96 | 97 | if (docsAndFreqs[i].doc > doc) { 98 | // DocsEnum beyond the current doc - break and 99 | // advance lead to the new highest doc. 
100 | doc = docsAndFreqs[i].doc; 101 | break advanceHead; 102 | } 103 | } 104 | } 105 | // success - all DocsEnums are on the same doc 106 | return doc; 107 | } 108 | // advance head for next iteration 109 | doc = lead.doc = lead.scorer.advance(doc); 110 | } 111 | } 112 | 113 | @Override 114 | public int freq() { 115 | return docsAndFreqs.length; 116 | } 117 | 118 | @Override 119 | public Collection getChildren() { 120 | final ArrayList children = new ArrayList( 121 | docsAndFreqs.length); 122 | for (final DocsAndFreqs docs : docsAndFreqs) { 123 | children.add(new ChildScorer(docs.scorer, "MUST")); 124 | } 125 | return children; 126 | } 127 | 128 | @Override 129 | public int nextDoc() throws IOException { 130 | lead.doc = lead.scorer.nextDoc(); 131 | return lastDoc = doNext(lead.doc); 132 | } 133 | 134 | @Override 135 | public float score() throws IOException { 136 | // TODO: sum into a double and cast to float if we ever send required 137 | // clauses to BS1 138 | float sum = 0.0f; 139 | for (int i = 0; i < docsAndFreqs.length; i++) { 140 | final float val = docsAndFreqs[i].scorer.score() 141 | * clauses[i].getWeight(); 142 | sum += val; 143 | clauses[i].addFeatureValue(docBase, lastDoc, val); 144 | } 145 | return sum * coord; 146 | } 147 | } -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/Vertex.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.algorithms.collective; 2 | 3 | import java.util.ArrayList; 4 | import java.util.HashSet; 5 | import java.util.List; 6 | import java.util.Map; 7 | import java.util.Set; 8 | 9 | public class Vertex implements Comparable { 10 | private List uris; 11 | private int entityQuery; 12 | private double score; 13 | private boolean isCandidate; 14 | private String description; 15 | private String text; 16 | private String context; 17 | private double 
occurrences; 18 | 19 | private Set outgoingEdges; 20 | 21 | private double sumOutGoing; 22 | 23 | public Vertex() { 24 | super(); 25 | this.uris = new ArrayList(); 26 | this.outgoingEdges = new HashSet(); 27 | this.entityQuery = -1; 28 | this.isCandidate = false; 29 | this.sumOutGoing = 0; 30 | this.text = ""; 31 | this.context = ""; 32 | } 33 | 34 | public void addOutGoingEdge(Edge e) { 35 | outgoingEdges.add(e); 36 | this.sumOutGoing += e.getTransition(); 37 | for(Edge out : outgoingEdges) { 38 | out.setProbability(out.getTransition() / sumOutGoing); 39 | } 40 | } 41 | 42 | public void removeAllOutgoingEdges() { 43 | this.outgoingEdges.clear(); 44 | } 45 | 46 | public Edge removeOutgoingEdge(Vertex v, Map edgeWeight) { 47 | Edge toRemove = null; 48 | for (Edge e : outgoingEdges) { 49 | if (e.getTarget().equals(v)) { 50 | toRemove = e; 51 | break; 52 | } 53 | } 54 | if (toRemove != null) { 55 | outgoingEdges.remove(toRemove); 56 | sumOutGoing -= toRemove.getTransition(); 57 | } 58 | 59 | // Update Transition Probability 60 | for(Edge out : outgoingEdges) { 61 | out.setProbability(out.getTransition() / sumOutGoing); 62 | edgeWeight.put(out, out.getProbability()); 63 | } 64 | 65 | return toRemove; 66 | } 67 | 68 | public String getContext() { 69 | return context; 70 | } 71 | 72 | public void setContext(String context) { 73 | this.context = context; 74 | } 75 | 76 | public double getSumOutGoingEdges() { 77 | return sumOutGoing; 78 | } 79 | 80 | public Set getOutgoingEdges() { 81 | return this.outgoingEdges; 82 | } 83 | 84 | public List getUris() { 85 | return uris; 86 | } 87 | 88 | public void addUri(String uri) { 89 | this.uris.add(uri); 90 | } 91 | 92 | public boolean isCandidate() { 93 | return isCandidate; 94 | } 95 | 96 | public void setCandidate(boolean isCandidate) { 97 | this.isCandidate = isCandidate; 98 | } 99 | 100 | public int getEntityQuery() { 101 | return entityQuery; 102 | } 103 | 104 | public void setEntityQuery(int entityQuery) { 105 | 
this.entityQuery = entityQuery; 106 | } 107 | 108 | public void setGraphValue(double val) { 109 | this.score = val; 110 | } 111 | 112 | public double getScore() { 113 | return this.score; 114 | } 115 | 116 | public void setScore(double score) { 117 | this.score = score; 118 | } 119 | 120 | public String getDescription() { 121 | return description; 122 | } 123 | 124 | void setDescription(String description) { 125 | this.description = description; 126 | } 127 | 128 | public String getText() { 129 | return text; 130 | } 131 | 132 | public void setText(String text) { 133 | this.text = text; 134 | } 135 | 136 | public double getOccurrences() { 137 | return occurrences; 138 | } 139 | 140 | public void setOccurrences(int occurrences) { 141 | this.occurrences = Math.log10(occurrences + 1); 142 | } 143 | 144 | @Override 145 | public boolean equals(Object obj) { 146 | Vertex comp = (Vertex) obj; 147 | boolean isEqual = true; 148 | if (this.uris.size() != comp.getUris().size() 149 | || this.entityQuery != comp.getEntityQuery()) { 150 | return false; 151 | } 152 | for (int i = 0; i < uris.size(); ++i) { 153 | if (!uris.get(i).equalsIgnoreCase(comp.getUris().get(i))) { 154 | isEqual = false; 155 | break; 156 | } 157 | } 158 | return isEqual; 159 | } 160 | 161 | @Override 162 | public int hashCode() { 163 | return (generateUriHash(this.uris) + ((Integer) this.getEntityQuery()) 164 | .hashCode()); 165 | } 166 | 167 | private int generateUriHash(List uris) { 168 | int hash = 0; 169 | for (String uri : uris) { 170 | hash += uri.hashCode(); 171 | } 172 | return hash; 173 | } 174 | 175 | /** 176 | * The return values are switched to provide a descending order when using 177 | * Collections.sort(), which generally provides an ascending sort order. 
178 | * 179 | */ 180 | @Override 181 | public int compareTo(Vertex o) { 182 | if (this.getOccurrences() < o.getOccurrences()) { 183 | return 1; 184 | } else if (this.getOccurrences() > o.getOccurrences()) { 185 | return 1; 186 | } else { 187 | return 0; 188 | } 189 | } 190 | } -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/general/CollectiveDisambiguationGeneralEntities.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.algorithms.collective.general; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.Arrays; 6 | import java.util.LinkedList; 7 | import java.util.List; 8 | 9 | import org.apache.lucene.document.Document; 10 | import org.apache.lucene.index.IndexReader; 11 | import org.apache.lucene.index.Term; 12 | import org.apache.lucene.search.IndexSearcher; 13 | import org.apache.lucene.search.Query; 14 | import org.apache.lucene.search.ScoreDoc; 15 | import org.apache.lucene.search.TopDocs; 16 | import org.slf4j.Logger; 17 | import org.slf4j.LoggerFactory; 18 | 19 | import doser.entitydisambiguation.algorithms.AbstractDisambiguationAlgorithm; 20 | import doser.entitydisambiguation.algorithms.IllegalDisambiguationAlgorithmInputException; 21 | import doser.entitydisambiguation.algorithms.SurfaceForm; 22 | import doser.entitydisambiguation.backend.AbstractDisambiguationTask; 23 | import doser.entitydisambiguation.backend.DisambiguationTaskCollective; 24 | import doser.entitydisambiguation.dpo.EntityDisambiguationDPO; 25 | import doser.entitydisambiguation.dpo.Response; 26 | import doser.entitydisambiguation.knowledgebases.EntityCentricKnowledgeBase; 27 | import doser.lucene.query.TermQuery; 28 | import doser.entitydisambiguation.knowledgebases.AbstractKnowledgeBase; 29 | import 
doser.entitydisambiguation.knowledgebases.AbstractEntityCentricKBGeneral; 30 | 31 | public class CollectiveDisambiguationGeneralEntities extends AbstractDisambiguationAlgorithm { 32 | 33 | private final static Logger logger = LoggerFactory.getLogger(CollectiveDisambiguationGeneralEntities.class); 34 | 35 | private AbstractEntityCentricKBGeneral eckb; 36 | 37 | private DisambiguationTaskCollective task; 38 | 39 | @Override 40 | protected boolean checkAndSetInputParameter(AbstractDisambiguationTask task) { 41 | AbstractKnowledgeBase kb = task.getKb(); 42 | if (!(task instanceof DisambiguationTaskCollective)) { 43 | return false; 44 | } 45 | 46 | this.eckb = (AbstractEntityCentricKBGeneral) kb; 47 | this.task = (DisambiguationTaskCollective) task; 48 | return true; 49 | } 50 | 51 | @Override 52 | protected void processAlgorithm() throws IllegalDisambiguationAlgorithmInputException { 53 | // AdditionalCandidateQuery aq = new AdditionalCandidateQuery(eckb); 54 | List entityList = task.getEntityToDisambiguate(); 55 | Response[] responseArray = new Response[entityList.size()]; 56 | 57 | List collectiveRep = new LinkedList(); 58 | for (int i = 0; i < entityList.size(); i++) { 59 | EntityDisambiguationDPO dpo = entityList.get(i); 60 | // Dieser Fix sollte irgendwo anders passieren. 
TODO Auslagern 61 | dpo.setSelectedText(dpo.getSelectedText().replaceAll("’", "'")); 62 | Query query = createQuery(dpo.getSelectedText(), eckb); 63 | final IndexSearcher searcher = eckb.getSearcher(); 64 | final IndexReader reader = searcher.getIndexReader(); 65 | try { 66 | final TopDocs top = searcher.search(query, task.getReturnNr()); 67 | final ScoreDoc[] score = top.scoreDocs; 68 | if (dpo.getSelectedText().equalsIgnoreCase("") || dpo.getSelectedText() == null) { 69 | ArrayList l = new ArrayList(); 70 | l.add(""); 71 | SurfaceForm col = new SurfaceForm(dpo.getSelectedText(), dpo.getContext(), l, i, 72 | dpo.getStartPosition()); 73 | collectiveRep.add(col); 74 | } else if (score.length == 1) { 75 | final Document doc = reader.document(score[0].doc); 76 | ArrayList l = new ArrayList(); 77 | l.add(doc.get("Mainlink")); 78 | SurfaceForm col = new SurfaceForm(dpo.getSelectedText(), dpo.getContext(), l, i, 79 | dpo.getStartPosition()); 80 | col.setInitial(true); 81 | collectiveRep.add(col); 82 | 83 | } else if (score.length > 1) { 84 | ArrayList l = new ArrayList(); 85 | for (int j = 0; j < score.length; j++) { 86 | final Document doc = reader.document(score[j].doc); 87 | l.add(doc.get("Mainlink")); 88 | } 89 | SurfaceForm col = new SurfaceForm(dpo.getSelectedText(), dpo.getContext(), l, i, 90 | dpo.getStartPosition()); 91 | collectiveRep.add(col); 92 | 93 | } else { 94 | // SurfaceForm sf = aq.checkAdditionalSurfaceForms(dpo, i); 95 | // collectiveRep.add(sf); 96 | } 97 | 98 | } catch (final IOException e) { 99 | logger.error("JsonException in "+CollectiveDisambiguationGeneralEntities.class.getName(), e); 100 | } 101 | } 102 | 103 | CollectiveContextDriverGeneral solver = new CollectiveContextDriverGeneral(responseArray, collectiveRep, eckb); 104 | solver.solve(); 105 | 106 | solver.generateResult(); 107 | List res = Arrays.asList(responseArray); 108 | task.setResponse(res); 109 | 110 | eckb.release(); 111 | } 112 | 113 | @Override 114 | protected boolean 
preDisambiguation() { 115 | return true; 116 | } 117 | 118 | private Query createQuery(String sf, EntityCentricKnowledgeBase kb) { 119 | String surfaceform = sf.toLowerCase(); 120 | TermQuery query = new TermQuery(new Term("UniqueLabel", surfaceform)); 121 | 122 | return query; 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/rules/ContextRule.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.algorithms.rules; 2 | 3 | import java.io.IOException; 4 | import java.util.HashSet; 5 | import java.util.LinkedList; 6 | import java.util.List; 7 | import java.util.Map; 8 | import java.util.Set; 9 | 10 | import org.apache.lucene.document.Document; 11 | import org.apache.lucene.index.Term; 12 | import org.apache.lucene.search.IndexSearcher; 13 | import org.apache.lucene.search.Query; 14 | import org.apache.lucene.search.ScoreDoc; 15 | import org.apache.lucene.search.TopDocs; 16 | 17 | import doser.entitydisambiguation.algorithms.SurfaceForm; 18 | import doser.entitydisambiguation.knowledgebases.EntityCentricKBDBpedia; 19 | import doser.lucene.query.TermQuery; 20 | 21 | class ContextRule extends AbstractRule { 22 | 23 | private static final int MINDISAMBIGUATEDSURFACEFORMS = 2; 24 | 25 | private static final int MINIMUMSURFACEFORMS = 10; 26 | 27 | private static final float SIMILARITYTHRESHOLD = 1.57f; 28 | private static final float SIMILARITYTHRESHOLDMISC = 1.53f; 29 | 30 | private EntityCentricKBDBpedia eckb; 31 | 32 | ContextRule(EntityCentricKBDBpedia eckb) { 33 | super(eckb); 34 | this.eckb = eckb; 35 | } 36 | 37 | @Override 38 | public boolean applyRule(List rep) { 39 | if (rep.size() > MINIMUMSURFACEFORMS) { 40 | List list = new LinkedList(); 41 | for (SurfaceForm sf : rep) { 42 | if (rep.size() > 1 && sf.getCandidates().size() == 1 && sf.isInitial()) { 43 | 
list.add(sf.getCandidates().get(0)); 44 | } 45 | } 46 | if (list.size() >= MINDISAMBIGUATEDSURFACEFORMS) { 47 | Set w2vFormatStrings = new HashSet(); 48 | for (SurfaceForm sf : rep) { 49 | if (rep.size() > 1 && sf.getCandidates().size() > 1) { 50 | List l = sf.getCandidates(); 51 | List bestCandidate = new LinkedList(); 52 | Set levenshteinAdded = new HashSet(); 53 | for (String s : l) { 54 | String query = this.eckb.generateWord2VecFormatString(list, s); 55 | w2vFormatStrings.add(query); 56 | Map similarityMap = this.eckb.getWord2VecSimilarities(w2vFormatStrings); 57 | float simValue = similarityMap.get(query); 58 | // Check for Appropriate entities 59 | String candidateWithoutUrl = s.replaceAll("http://dbpedia.org/resource/", "").toLowerCase(); 60 | if (levenshteinDistance(candidateWithoutUrl, sf.getSurfaceForm().toLowerCase()) <= 2) { 61 | System.out.println("LEVENSHTEIN DISTANCE ENTITY: " + s); 62 | } 63 | if (simValue > SIMILARITYTHRESHOLD 64 | || (queryType(s).equalsIgnoreCase("Misc") && simValue > SIMILARITYTHRESHOLDMISC)) { 65 | bestCandidate.add(s); 66 | } else if (levenshteinDistance(candidateWithoutUrl, 67 | sf.getSurfaceForm().toLowerCase()) <= 2) { 68 | bestCandidate.add(s); 69 | levenshteinAdded.add(s); 70 | } 71 | } 72 | // Disambiguate and assign entity 73 | if (!bestCandidate.isEmpty()) { 74 | boolean notOnlyLevenshtein = false; 75 | for (String s : bestCandidate) { 76 | if (!levenshteinAdded.contains(s)) { 77 | notOnlyLevenshtein = true; 78 | } 79 | } 80 | if (notOnlyLevenshtein) { 81 | sf.setCandidates(bestCandidate); 82 | System.out.println("Es bleibt übrig SurfaceForm: " + sf.getSurfaceForm() + " +" 83 | + bestCandidate.toString()); 84 | } 85 | } 86 | } 87 | } 88 | } 89 | } 90 | return false; 91 | } 92 | 93 | private String queryType(String url) { 94 | String type = ""; 95 | IndexSearcher searcher = eckb.getSearcher(); 96 | Query q = new TermQuery(new Term("Mainlink", url)); 97 | try { 98 | TopDocs docs = searcher.search(q, 1); 99 | ScoreDoc[] 
scoredocs = docs.scoreDocs; 100 | if (scoredocs.length == 0) { 101 | type = "Misc"; 102 | } else { 103 | int nr = scoredocs[0].doc; 104 | Document doc = searcher.getIndexReader().document(nr); 105 | type = doc.get("Type"); 106 | } 107 | } catch (IOException e) { 108 | e.printStackTrace(); 109 | } 110 | return type; 111 | } 112 | 113 | int levenshteinDistance(CharSequence lhs, CharSequence rhs) { 114 | int len0 = lhs.length() + 1; 115 | int len1 = rhs.length() + 1; 116 | 117 | // the array of distances 118 | int[] cost = new int[len0]; 119 | int[] newcost = new int[len0]; 120 | 121 | // initial cost of skipping prefix in String s0 122 | for (int i = 0; i < len0; i++) 123 | cost[i] = i; 124 | 125 | // dynamically computing the array of distances 126 | 127 | // transformation cost for each letter in s1 128 | for (int j = 1; j < len1; j++) { 129 | // initial cost of skipping prefix in String s1 130 | newcost[0] = j; 131 | 132 | // transformation cost for each letter in s0 133 | for (int i = 1; i < len0; i++) { 134 | // matching current letters in both strings 135 | int match = (lhs.charAt(i - 1) == rhs.charAt(j - 1)) ? 
0 : 1; 136 | 137 | // computing cost for each transformation 138 | int cost_replace = cost[i - 1] + match; 139 | int cost_insert = cost[i] + 1; 140 | int cost_delete = newcost[i - 1] + 1; 141 | 142 | // keep minimum cost 143 | newcost[i] = Math.min(Math.min(cost_insert, cost_delete), cost_replace); 144 | } 145 | 146 | // swap cost/newcost arrays 147 | int[] swap = cost; 148 | cost = newcost; 149 | newcost = swap; 150 | } 151 | 152 | // the distance is the cost for transforming all letters in both strings 153 | return cost[len0 - 1]; 154 | } 155 | } 156 | -------------------------------------------------------------------------------- /doser-dis-disambiguationserver/src/main/java/doser/server/actions/disambiguation/DisambiguationService.java: -------------------------------------------------------------------------------- 1 | package doser.server.actions.disambiguation; 2 | 3 | import java.util.LinkedList; 4 | import java.util.List; 5 | 6 | import org.springframework.stereotype.Controller; 7 | import org.springframework.web.bind.annotation.RequestBody; 8 | import org.springframework.web.bind.annotation.RequestMapping; 9 | import org.springframework.web.bind.annotation.RequestMethod; 10 | import org.springframework.web.bind.annotation.ResponseBody; 11 | 12 | import doser.entitydisambiguation.backend.DisambiguationMainService; 13 | import doser.entitydisambiguation.backend.AbstractDisambiguationTask; 14 | import doser.entitydisambiguation.backend.DisambiguationTaskCollective; 15 | import doser.entitydisambiguation.backend.DisambiguationTaskSingle; 16 | import doser.entitydisambiguation.dpo.DisambiguationRequest; 17 | import doser.entitydisambiguation.dpo.DisambiguationResponse; 18 | import doser.entitydisambiguation.dpo.EntityDisambiguationDPO; 19 | import doser.entitydisambiguation.dpo.Response; 20 | import doser.entitydisambiguation.properties.Properties; 21 | 22 | @Controller 23 | @RequestMapping("/disambiguation") 24 | public class DisambiguationService { 25 | 26 
| public DisambiguationService() { 27 | super(); 28 | } 29 | 30 | /** 31 | * Testing 32 | * 33 | * @param request 34 | * @return 35 | */ 36 | @RequestMapping(value = "/disambiguateWithoutCategories-single", method = RequestMethod.POST, headers = "Accept=application/json") 37 | public @ResponseBody DisambiguationResponse annotateSingle(@RequestBody final DisambiguationRequest request) { 38 | DisambiguationResponse annotationResponse = disambiguateSingle(request); 39 | return annotationResponse; 40 | } 41 | 42 | @RequestMapping(value = "/disambiguationWithoutCategories-collective", method = RequestMethod.POST, headers = "Accept=application/json") 43 | public @ResponseBody DisambiguationResponse annotateCollectiveWithoutCategories( 44 | @RequestBody final DisambiguationRequest request) { 45 | final DisambiguationResponse response = new DisambiguationResponse(); 46 | final DisambiguationMainService mainService = DisambiguationMainService.getInstance(); 47 | final List listToDis = request.getSurfaceFormsToDisambiguate(); 48 | 49 | if (mainService != null) { 50 | final List tasks = new LinkedList(); 51 | DisambiguationTaskCollective collectiveTask = new DisambiguationTaskCollective(listToDis, 52 | request.getMainTopic()); 53 | collectiveTask.setKbIdentifier("default", "EntityCentric"); 54 | collectiveTask.setReturnNr(1000); 55 | tasks.add(collectiveTask); 56 | mainService.disambiguate(tasks); 57 | 58 | List responses = collectiveTask.getResponse(); 59 | response.setTasks(responses); 60 | response.setDocumentUri(request.getDocumentUri()); 61 | } 62 | return response; 63 | } 64 | 65 | @RequestMapping(value = "/disambiguationWithoutCategoriesBiomed-collective", method = RequestMethod.POST, headers = "Accept=application/json") 66 | public @ResponseBody DisambiguationResponse annotateCollectiveWithoutCategoriesBiomed( 67 | @RequestBody final DisambiguationRequest request) { 68 | final DisambiguationResponse response = new DisambiguationResponse(); 69 | final 
DisambiguationMainService mainService = DisambiguationMainService.getInstance(); 70 | final List listToDis = request.getSurfaceFormsToDisambiguate(); 71 | 72 | if (mainService != null) { 73 | final List tasks = new LinkedList(); 74 | DisambiguationTaskCollective collectiveTask = new DisambiguationTaskCollective(listToDis, 75 | request.getMainTopic()); 76 | collectiveTask.setKbIdentifier("biomed", "EntityCentric"); 77 | collectiveTask.setReturnNr(1000); 78 | tasks.add(collectiveTask); 79 | mainService.disambiguate(tasks); 80 | 81 | List responses = collectiveTask.getResponse(); 82 | response.setTasks(responses); 83 | response.setDocumentUri(request.getDocumentUri()); 84 | } 85 | return response; 86 | } 87 | 88 | private DisambiguationResponse disambiguateSingle(DisambiguationRequest request) { 89 | final DisambiguationResponse response = new DisambiguationResponse(); 90 | final List listToDis = request.getSurfaceFormsToDisambiguate(); 91 | List responseList = new LinkedList(); 92 | response.setDocumentUri(request.getDocumentUri()); 93 | final List tasks = new LinkedList(); 94 | final DisambiguationMainService mainService = DisambiguationMainService.getInstance(); 95 | if (mainService != null) { 96 | int docsToReturn = 0; 97 | if (request.getDocsToReturn() == null) { 98 | docsToReturn = Properties.getInstance().getDisambiguationResultSize(); 99 | } else { 100 | docsToReturn = request.getDocsToReturn(); 101 | } 102 | for (int i = 0; i < listToDis.size(); i++) { 103 | EntityDisambiguationDPO dpo = listToDis.get(i); 104 | DisambiguationTaskSingle task = new DisambiguationTaskSingle(dpo); 105 | task.setReturnNr(docsToReturn); 106 | task.setKbIdentifier(listToDis.get(i).getKbversion(), listToDis.get(i).getSetting()); 107 | // Bugfix! Selected text may not be null. 
Should be "" 108 | // String instead; 109 | if (dpo.getSelectedText() != null) { 110 | tasks.add(task); 111 | } 112 | } 113 | mainService.disambiguate(tasks); 114 | } 115 | 116 | for (AbstractDisambiguationTask task : tasks) { 117 | responseList.add(task.getResponse().get(0)); 118 | } 119 | response.setTasks(responseList); 120 | return response; 121 | } 122 | } -------------------------------------------------------------------------------- /doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/dbpedia/Word2VecDisambiguator.java: -------------------------------------------------------------------------------- 1 | package doser.entitydisambiguation.algorithms.collective.dbpedia; 2 | 3 | import java.util.ArrayList; 4 | import java.util.BitSet; 5 | import java.util.Collection; 6 | import java.util.Collections; 7 | import java.util.HashMap; 8 | import java.util.List; 9 | 10 | import org.apache.commons.collections15.Factory; 11 | import org.apache.commons.collections15.functors.MapTransformer; 12 | import org.apache.commons.math.stat.descriptive.SummaryStatistics; 13 | 14 | import doser.entitydisambiguation.algorithms.SurfaceForm; 15 | import doser.entitydisambiguation.algorithms.collective.AbstractWord2VecPageRank; 16 | import doser.entitydisambiguation.algorithms.collective.Edge; 17 | import doser.entitydisambiguation.algorithms.collective.Vertex; 18 | import doser.entitydisambiguation.knowledgebases.EntityCentricKBDBpedia; 19 | import edu.uci.ics.jung.algorithms.scoring.PageRankWithPriors; 20 | import edu.uci.ics.jung.graph.DirectedSparseMultigraph; 21 | 22 | class Word2VecDisambiguator extends AbstractWord2VecPageRank { 23 | 24 | // private static final int MAXIMUMCANDIDATESPERSF = 8; 25 | 26 | private List origList; 27 | 28 | private boolean disambiguate; 29 | 30 | private int maximumcandidatespersf; 31 | 32 | private int iterations; 33 | 34 | 35 | Word2VecDisambiguator(EntityCentricKBDBpedia eckb, 36 | List rep, boolean disambiguate, int 
maximumcandidatespersf, int iterations) { 37 | super(eckb, rep); 38 | this.origList = new ArrayList(); 39 | this.disambiguate = disambiguate; 40 | this.maximumcandidatespersf = maximumcandidatespersf; 41 | this.iterations = iterations; 42 | } 43 | 44 | @Override 45 | public void setup() { 46 | this.graph = new DirectedSparseMultigraph(); 47 | this.edgeWeights = new HashMap(); 48 | this.edgeFactory = new Factory() { 49 | int i = 0; 50 | 51 | public Integer create() { 52 | return i++; 53 | } 54 | }; 55 | 56 | for (SurfaceForm sf : repList) { 57 | SurfaceForm clone = (SurfaceForm) sf.clone(); 58 | this.origList.add(clone); 59 | } 60 | 61 | this.disambiguatedSurfaceForms = new BitSet(repList.size()); 62 | for (int i = 0; i < repList.size(); i++) { 63 | if (repList.get(i).getCandidates().size() <= 1) { 64 | this.disambiguatedSurfaceForms.set(i); 65 | } 66 | } 67 | buildMainGraph(); 68 | } 69 | 70 | @Override 71 | protected PageRankWithPriors performPageRank() { 72 | PageRankWithPriors pr = new PageRankWithPriors( 73 | graph, MapTransformer.getInstance(edgeWeights), 74 | getRootPrior(graph.getVertices()), 0.09); 75 | pr.setMaxIterations(iterations); 76 | pr.evaluate(); 77 | return pr; 78 | } 79 | 80 | @Override 81 | public boolean analyzeResults(PageRankWithPriors pr) { 82 | boolean disambiguationStop = true; 83 | Collection vertexCol = graph.getVertices(); 84 | for (int i = 0; i < repList.size(); i++) { 85 | if (!disambiguatedSurfaceForms.get(i) && repList.get(i).isRelevant()) { 86 | int qryNr = repList.get(i).getQueryNr(); 87 | double maxScore = 0; 88 | SummaryStatistics stats = new SummaryStatistics(); 89 | String tempSolution = ""; 90 | List scores = new ArrayList(); 91 | for (Vertex v : vertexCol) { 92 | if (v.getEntityQuery() == qryNr && v.isCandidate()) { 93 | scores.add(new Candidate(v.getUris().get(0), pr 94 | .getVertexScore(v))); 95 | double score = Math.abs(pr.getVertexScore(v)); 96 | stats.addValue(score); 97 | if (score > maxScore) { 98 | tempSolution = 
v.getUris().get(0); 99 | maxScore = score; 100 | } 101 | } 102 | } 103 | SurfaceForm rep = repList.get(i); 104 | SurfaceForm clone = origList.get(i); 105 | Collections.sort(scores, Collections.reverseOrder()); 106 | double secondMax = scores.get(1).score; 107 | 108 | List newCandidates = new ArrayList(); 109 | for(int j = 0; j < maximumcandidatespersf; j++) { 110 | if(scores.size() > j) { 111 | newCandidates.add(scores.get(j).can); 112 | } else { 113 | break; 114 | } 115 | } 116 | 117 | if (!Double.isInfinite(maxScore)) { 118 | double avg = stats.getMean(); 119 | double threshold = computeThreshold(avg, maxScore); 120 | if (secondMax < threshold && disambiguate) { 121 | updateGraph(rep.getCandidates(), tempSolution, 122 | rep.getQueryNr()); 123 | rep.setDisambiguatedEntity(tempSolution); 124 | clone.setDisambiguatedEntity(tempSolution); 125 | disambiguatedSurfaceForms.set(i); 126 | disambiguationStop = false; 127 | break; 128 | } else { 129 | clone.setCandidates(newCandidates); 130 | } 131 | } 132 | } 133 | } 134 | return disambiguationStop; 135 | } 136 | 137 | /** 138 | * Threshold Computation // IMPORTANT DISAMBIGUATION PARAMETER 139 | * 140 | * @param avg 141 | * @param highest 142 | * @return 143 | */ 144 | private double computeThreshold(double avg, double highest) { 145 | double diff = highest - avg; 146 | double min = diff * 0.5; 147 | return highest - min; 148 | } 149 | 150 | @Override 151 | public List getRepresentation() { 152 | return this.origList; 153 | } 154 | 155 | class Candidate implements Comparable { 156 | private double score; 157 | private String can; 158 | 159 | Candidate(String can, double score) { 160 | super(); 161 | this.score = score; 162 | this.can = can; 163 | } 164 | 165 | @Override 166 | public int compareTo(Candidate o) { 167 | if (score < o.score) { 168 | return -1; 169 | } else if (score > o.score) { 170 | return 1; 171 | } else { 172 | return 0; 173 | } 174 | } 175 | } 176 | } 177 | 
--------------------------------------------------------------------------------