├── doser-dis-core
├── .gitignore
├── .settings
│ ├── org.eclipse.wst.jsdt.ui.superType.name
│ ├── org.eclipse.wst.validation.prefs
│ ├── org.eclipse.wst.jsdt.ui.superType.container
│ ├── org.eclipse.m2e.wtp.prefs
│ ├── org.eclipse.m2e.core.prefs
│ ├── org.eclipse.wst.ws.service.policy.prefs
│ ├── org.eclipse.wst.common.project.facet.core.prefs.xml
│ ├── org.eclipse.wst.common.project.facet.core.xml
│ ├── .jsdtscope
│ ├── org.eclipse.jdt.core.prefs
│ └── org.eclipse.wst.common.component
├── src
│ ├── main
│ │ ├── java
│ │ │ └── doser
│ │ │ │ ├── language
│ │ │ │ └── Languages.java
│ │ │ │ ├── entitydisambiguation
│ │ │ │ ├── dpo
│ │ │ │ │ ├── package-info.java
│ │ │ │ │ ├── DisambiguatedEntity.java
│ │ │ │ │ ├── DisambiguationResponse.java
│ │ │ │ │ ├── Response.java
│ │ │ │ │ ├── DisambiguationRequest.java
│ │ │ │ │ └── EntityDisambiguationDPO.java
│ │ │ │ ├── knowledgebases
│ │ │ │ │ ├── KnowledgeBaseIdentifiers.java
│ │ │ │ │ ├── DocumentCentricKnowledgeBaseDefault.java
│ │ │ │ │ ├── EntityCentricKBDBpedia.java
│ │ │ │ │ ├── AbstractKnowledgeBase.java
│ │ │ │ │ └── EntityCentricKBBiomed.java
│ │ │ │ ├── algorithms
│ │ │ │ │ ├── IllegalDisambiguationAlgorithmInputException.java
│ │ │ │ │ ├── rules
│ │ │ │ │ │ ├── AbstractRule.java
│ │ │ │ │ │ ├── RuleAdapation.java
│ │ │ │ │ │ ├── NoCandidatesCheckPlural.java
│ │ │ │ │ │ ├── CheckGeneralEntities.java
│ │ │ │ │ │ ├── UnambiguousToAmbiguousRule.java
│ │ │ │ │ │ ├── NoCandidatesExpansionRules.java
│ │ │ │ │ │ └── ContextRule.java
│ │ │ │ │ ├── Candidate.java
│ │ │ │ │ ├── collective
│ │ │ │ │ │ ├── Edge.java
│ │ │ │ │ │ ├── dbpedia
│ │ │ │ │ │ │ ├── CandidateReductionDBpediaW2V.java
│ │ │ │ │ │ │ ├── TableColumnFilter.java
│ │ │ │ │ │ │ ├── CollectiveAndContextDriver.java
│ │ │ │ │ │ │ └── Word2VecDisambiguator.java
│ │ │ │ │ │ ├── general
│ │ │ │ │ │ │ ├── CandidateReductionGeneralW2V.java
│ │ │ │ │ │ │ ├── CollectiveContextDriverGeneral.java
│ │ │ │ │ │ │ └── CollectiveDisambiguationGeneralEntities.java
│ │ │ │ │ │ ├── CandidateReduction.java
│ │ │ │ │ │ ├── CandidatePruning.java
│ │ │ │ │ │ └── Vertex.java
│ │ │ │ │ ├── AbstractDisambiguationAlgorithm.java
│ │ │ │ │ ├── DisambiguationHandler.java
│ │ │ │ │ └── SurfaceForm.java
│ │ │ │ ├── backend
│ │ │ │ │ ├── AbstractDisambiguationTask.java
│ │ │ │ │ ├── DisambiguationTaskSingle.java
│ │ │ │ │ └── DisambiguationTaskCollective.java
│ │ │ │ └── properties
│ │ │ │ │ └── Properties.java
│ │ │ │ ├── word2vec
│ │ │ │ ├── Doc2VecJsonFormat.java
│ │ │ │ ├── Data.java
│ │ │ │ └── Word2VecJsonFormat.java
│ │ │ │ └── tools
│ │ │ │ ├── ServiceQueries.java
│ │ │ │ └── NTToDbPediaUrlEncoding.java
│ │ └── resources
│ │ │ ├── application.properties
│ │ │ └── disambiguation.properties
│ └── test
│ │ └── java
│ │ └── doser
│ │ └── test
│ │ └── breakdetection
│ │ └── BreakDetection.java
├── .classpath
├── .project
└── pom.xml
├── doser-dis-extensions
├── .gitignore
├── .settings
│ ├── org.eclipse.wst.jsdt.ui.superType.name
│ ├── org.eclipse.wst.jsdt.ui.superType.container
│ ├── org.eclipse.wst.validation.prefs
│ ├── org.eclipse.m2e.wtp.prefs
│ ├── org.eclipse.m2e.core.prefs
│ ├── org.eclipse.wst.ws.service.policy.prefs
│ ├── org.eclipse.wst.common.project.facet.core.prefs.xml
│ ├── org.eclipse.wst.common.project.facet.core.xml
│ ├── .jsdtscope
│ ├── org.eclipse.jdt.core.prefs
│ ├── org.eclipse.wst.common.component
│ └── org.eclipse.jdt.ui.prefs
├── src
│ └── main
│ │ ├── resources
│ │ └── application.properties
│ │ └── java
│ │ └── doser
│ │ ├── lucene
│ │ ├── features
│ │ │ ├── DocCenExtFeatures.java
│ │ │ ├── IEntityCentricExtFeatures.java
│ │ │ └── LuceneFeatures.java
│ │ ├── analysis
│ │ │ ├── DoserIDFilter.java
│ │ │ ├── DoserIDTokenizer.java
│ │ │ ├── DoserStandardTokenizer.java
│ │ │ ├── DoserIDAnalyzer.java
│ │ │ └── DoserStandardAnalyzer.java
│ │ └── query
│ │ │ ├── LTRBooleanQuery.java
│ │ │ ├── LearnToRankFeatureDefaultValueManager.java
│ │ │ ├── LearnToRankClause.java
│ │ │ ├── LearnToRankTermScorer.java
│ │ │ ├── PriorQuery.java
│ │ │ ├── SensePriorQuery.java
│ │ │ └── ConjunctionScorer.java
│ │ ├── algorithms
│ │ └── MajorityVoteAlgorithm.java
│ │ ├── general
│ │ ├── HelpfulMethods.java
│ │ └── Test.java
│ │ └── nlp
│ │ └── NLPTools.java
├── pom.xml
├── .project
└── .classpath
├── doser-dis-disambiguationserver
├── .gitignore
├── .settings
│ ├── org.eclipse.wst.jsdt.ui.superType.name
│ ├── org.eclipse.wst.jsdt.ui.superType.container
│ ├── org.eclipse.wst.validation.prefs
│ ├── org.eclipse.m2e.wtp.prefs
│ ├── org.eclipse.m2e.core.prefs
│ ├── org.eclipse.wst.ws.service.policy.prefs
│ ├── org.eclipse.wst.common.project.facet.core.prefs.xml
│ ├── org.eclipse.wst.common.project.facet.core.xml
│ ├── .jsdtscope
│ ├── org.eclipse.jdt.core.prefs
│ └── org.eclipse.wst.common.component
├── src
│ └── main
│ │ ├── resources
│ │ ├── application.properties
│ │ └── log4j.xml
│ │ ├── java
│ │ └── doser
│ │ │ └── server
│ │ │ └── actions
│ │ │ ├── package-info.java
│ │ │ ├── FrameworkInitialization.java
│ │ │ └── disambiguation
│ │ │ └── DisambiguationService.java
│ │ └── webapp
│ │ └── WEB-INF
│ │ ├── applicationContext.xml
│ │ ├── web.xml
│ │ └── dispatcher-servlet.xml
├── .classpath
├── .project
└── pom.xml
├── Word2VecRestInterface
├── .idea
│ ├── .name
│ ├── scopes
│ │ └── scope_settings.xml
│ ├── encodings.xml
│ ├── vcs.xml
│ ├── Word2VecRestInterface.iml
│ ├── modules.xml
│ └── misc.xml
├── startserver
└── config.ini
├── .settings
├── org.eclipse.m2e.core.prefs
├── org.eclipse.jdt.core.prefs
└── org.eclipse.jst.jsp.core.prefs
├── .classpath
├── yes.pub
├── pom.xml
├── .project
├── README.md
└── yes
/doser-dis-core/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 |
--------------------------------------------------------------------------------
/doser-dis-extensions/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 |
--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 |
--------------------------------------------------------------------------------
/Word2VecRestInterface/.idea/.name:
--------------------------------------------------------------------------------
1 | Word2VecRestInterface
--------------------------------------------------------------------------------
/doser-dis-core/.settings/org.eclipse.wst.jsdt.ui.superType.name:
--------------------------------------------------------------------------------
1 | Window
--------------------------------------------------------------------------------
/doser-dis-extensions/.settings/org.eclipse.wst.jsdt.ui.superType.name:
--------------------------------------------------------------------------------
1 | Window
--------------------------------------------------------------------------------
/Word2VecRestInterface/startserver:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | nohup python Word2VecRest.py &  # run Word2VecRest.py in the background, immune to hangups; the path is relative to the current directory
3 |
--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/.settings/org.eclipse.wst.jsdt.ui.superType.name:
--------------------------------------------------------------------------------
1 | Window
--------------------------------------------------------------------------------
/doser-dis-core/.settings/org.eclipse.wst.validation.prefs:
--------------------------------------------------------------------------------
1 | disabled=06target
2 | eclipse.preferences.version=1
3 |
--------------------------------------------------------------------------------
/doser-dis-core/.settings/org.eclipse.wst.jsdt.ui.superType.container:
--------------------------------------------------------------------------------
1 | org.eclipse.wst.jsdt.launching.baseBrowserLibrary
--------------------------------------------------------------------------------
/doser-dis-extensions/.settings/org.eclipse.wst.jsdt.ui.superType.container:
--------------------------------------------------------------------------------
1 | org.eclipse.wst.jsdt.launching.baseBrowserLibrary
--------------------------------------------------------------------------------
/doser-dis-extensions/.settings/org.eclipse.wst.validation.prefs:
--------------------------------------------------------------------------------
1 | disabled=06target
2 | eclipse.preferences.version=1
3 |
--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/.settings/org.eclipse.wst.jsdt.ui.superType.container:
--------------------------------------------------------------------------------
1 | org.eclipse.wst.jsdt.launching.baseBrowserLibrary
--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/.settings/org.eclipse.wst.validation.prefs:
--------------------------------------------------------------------------------
1 | disabled=06target
2 | eclipse.preferences.version=1
3 |
--------------------------------------------------------------------------------
/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles=
2 | eclipse.preferences.version=1
3 | resolveWorkspaceProjects=true
4 | version=1
5 |
--------------------------------------------------------------------------------
/doser-dis-core/.settings/org.eclipse.m2e.wtp.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.m2e.wtp.enabledProjectSpecificPrefs=false
3 |
--------------------------------------------------------------------------------
/doser-dis-extensions/.settings/org.eclipse.m2e.wtp.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.m2e.wtp.enabledProjectSpecificPrefs=false
3 |
--------------------------------------------------------------------------------
/doser-dis-core/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles=
2 | eclipse.preferences.version=1
3 | resolveWorkspaceProjects=true
4 | version=1
5 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/language/Languages.java:
--------------------------------------------------------------------------------
1 | package doser.language;
2 |
/**
 * Language identifiers.
 *
 * <p>NOTE(review): {@code other} presumably stands for any language that is
 * neither English nor German; usage is not visible here, confirm against
 * callers.
 */
public enum Languages {
    english,
    german,
    other
}
6 |
--------------------------------------------------------------------------------
/doser-dis-core/.settings/org.eclipse.wst.ws.service.policy.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.wst.ws.service.policy.projectEnabled=false
3 |
--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/.settings/org.eclipse.m2e.wtp.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.m2e.wtp.enabledProjectSpecificPrefs=false
3 |
--------------------------------------------------------------------------------
/doser-dis-extensions/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles=
2 | eclipse.preferences.version=1
3 | resolveWorkspaceProjects=true
4 | version=1
5 |
--------------------------------------------------------------------------------
/doser-dis-extensions/.settings/org.eclipse.wst.ws.service.policy.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.wst.ws.service.policy.projectEnabled=false
3 |
--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles=
2 | eclipse.preferences.version=1
3 | resolveWorkspaceProjects=true
4 | version=1
5 |
--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/.settings/org.eclipse.wst.ws.service.policy.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.wst.ws.service.policy.projectEnabled=false
3 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/resources/application.properties:
--------------------------------------------------------------------------------
1 | application.name = ${project.name}
2 | application.artifactId = ${project.artifactId}
3 | application.version = ${project.version}
--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/resources/application.properties:
--------------------------------------------------------------------------------
1 | application.name = ${project.name}
2 | application.artifactId = ${project.artifactId}
3 | application.version = ${project.version}
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/dpo/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Data Presentation Objects (DPO) for model input and output
3 | */
4 | package doser.entitydisambiguation.dpo;
--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/src/main/resources/application.properties:
--------------------------------------------------------------------------------
1 | application.name = ${project.name}
2 | application.artifactId = ${project.artifactId}
3 | application.version = ${project.version}
--------------------------------------------------------------------------------
/Word2VecRestInterface/.idea/scopes/scope_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
--------------------------------------------------------------------------------
/Word2VecRestInterface/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/Word2VecRestInterface/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/src/main/java/doser/server/actions/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 | * Package for all server actions. Use the "@Controller" class annotation to add a new action class.
3 | */
4 | package doser.server.actions;
--------------------------------------------------------------------------------
/doser-dis-core/.settings/org.eclipse.wst.common.project.facet.core.prefs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/knowledgebases/KnowledgeBaseIdentifiers.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.knowledgebases;
2 |
/**
 * Identifiers naming the available knowledge bases.
 */
public enum KnowledgeBaseIdentifiers {
    Standard,
    CSTable,
    Biomed,
    DocumentCentricDefault
}
6 |
--------------------------------------------------------------------------------
/doser-dis-extensions/.settings/org.eclipse.wst.common.project.facet.core.prefs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/.settings/org.eclipse.wst.common.project.facet.core.prefs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.classpath:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
--------------------------------------------------------------------------------
/Word2VecRestInterface/.idea/Word2VecRestInterface.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/doser-dis-core/.settings/org.eclipse.wst.common.project.facet.core.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
--------------------------------------------------------------------------------
/doser-dis-extensions/.settings/org.eclipse.wst.common.project.facet.core.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
--------------------------------------------------------------------------------
/Word2VecRestInterface/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/.settings/org.eclipse.wst.common.project.facet.core.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
--------------------------------------------------------------------------------
/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7
4 | org.eclipse.jdt.core.compiler.compliance=1.7
5 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
6 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
7 | org.eclipse.jdt.core.compiler.source=1.7
8 |
--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/lucene/features/DocCenExtFeatures.java:
--------------------------------------------------------------------------------
1 | package doser.lucene.features;
2 |
/**
 * Describes an external Lucene feature set for a document-centric knowledge
 * base. "External" features are those that are not integrated in Apache
 * Lucene.
 *
 * <p>The interface currently declares no methods.
 *
 * @author Stefan Zwicklbauer
 */
public interface DocCenExtFeatures {
}
15 |
--------------------------------------------------------------------------------
/Word2VecRestInterface/config.ini:
--------------------------------------------------------------------------------
1 | [Word2VecRest]
2 | embeddings_w2v_wikipedia = /mnt/ssd1/disambiguation/word2vec/WikiEntityModel_400_neg10_iter5.seq
3 | embeddings_w2v_calbc = /mnt/ssd1/disambiguation/word2vec/calbcsmall_model_sg_500.bin
4 | embeddings_d2v_wikipedia = /mnt/ssd1/disambiguation/word2vec/doc2vec/Wiki_Standard_Model/doc2vec_wiki_model.d2v
5 | embeddings_d2v_wikipedia_german = /mnt/ssd1/disambiguation/word2vec/doc2vec/Wikipedia_Standard_German/doc2vec_model_german.d2v
6 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/IllegalDisambiguationAlgorithmInputException.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.algorithms;
2 |
/**
 * Unchecked exception raised for illegal disambiguation algorithm input.
 * Both constructors are package-private, so only classes in this package can
 * create instances.
 */
public class IllegalDisambiguationAlgorithmInputException
        extends IllegalArgumentException {

    private static final long serialVersionUID = 1L;

    /**
     * Creates the exception with the default message
     * {@code "Wrong Knowledge base!"}.
     */
    IllegalDisambiguationAlgorithmInputException() {
        this("Wrong Knowledge base!");
    }

    /**
     * Creates the exception with a caller-supplied message.
     *
     * @param text the detail message
     */
    IllegalDisambiguationAlgorithmInputException(String text) {
        super(text);
    }
}
17 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/resources/disambiguation.properties:
--------------------------------------------------------------------------------
1 | application.name = ${project.name}
2 | application.artifactId = ${project.artifactId}
3 | application.version = ${project.version}
4 | luceneversion = 4.7.0
5 | disambiguation.entityCentricKBWikipedia = /mnt/ssd1/disambiguation/LuceneIndex/Wikipedia_Default_Aida_Sigir/
6 | disambiguation.entityCentricBiomedCalbC= /mnt/ssd1/disambiguation/LuceneIndex/Biomed_CalbCSmall/
7 | disambiguation.returnSize = 10
8 | disambiguation.Word2VecService = http://theseus.dimis.fim.uni-passau.de:80/Word2VecRest/
9 | candidateExpansion = false
--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/lucene/analysis/DoserIDFilter.java:
--------------------------------------------------------------------------------
1 | package doser.lucene.analysis;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.lucene.analysis.TokenFilter;
6 | import org.apache.lucene.analysis.TokenStream;
7 |
8 | public class DoserIDFilter extends TokenFilter {
9 |
10 | public DoserIDFilter(TokenStream in) {
11 | super(in);
12 | }
13 |
14 | @Override
15 | public boolean incrementToken() throws IOException {
16 | if (!input.incrementToken()) {
17 | return false;
18 | }
19 | return true;
20 | }
21 |
22 | }
23 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/word2vec/Doc2VecJsonFormat.java:
--------------------------------------------------------------------------------
1 | package doser.word2vec;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 |
6 | public class Doc2VecJsonFormat {
7 |
8 | private List data;
9 |
10 | public Doc2VecJsonFormat() {
11 | super();
12 | this.data = new ArrayList();
13 | }
14 |
15 | public List getData() {
16 | return data;
17 | }
18 |
19 | public void setData(List data) {
20 | this.data = data;
21 | }
22 |
23 | public void addData(Data doc) {
24 | this.data.add(doc);
25 | }
26 |
27 | }
28 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/rules/AbstractRule.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.algorithms.rules;
2 |
3 | import java.util.List;
4 |
5 | import doser.entitydisambiguation.algorithms.SurfaceForm;
6 | import doser.entitydisambiguation.knowledgebases.AbstractKnowledgeBase;
7 |
8 | abstract class AbstractRule {
9 |
10 | protected AbstractKnowledgeBase eckb;
11 |
12 | AbstractRule(AbstractKnowledgeBase eckb) {
13 | super();
14 | this.eckb = eckb;
15 | }
16 |
17 | abstract boolean applyRule(List rep);
18 |
19 | }
20 |
--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/src/main/webapp/WEB-INF/applicationContext.xml:
--------------------------------------------------------------------------------
1 |
2 |
9 |
10 |
--------------------------------------------------------------------------------
/doser-dis-core/.settings/.jsdtscope:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/knowledgebases/DocumentCentricKnowledgeBaseDefault.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.knowledgebases;
2 |
3 | import org.apache.lucene.search.similarities.Similarity;
4 |
5 | public class DocumentCentricKnowledgeBaseDefault extends AbstractKnowledgeBase {
6 |
7 | public DocumentCentricKnowledgeBaseDefault(String uri, boolean dynamic,
8 | Similarity sim) {
9 | super(uri, dynamic, sim);
10 | }
11 |
12 | public DocumentCentricKnowledgeBaseDefault(String uri, boolean dynamic) {
13 | super(uri, dynamic);
14 | }
15 |
16 | @Override
17 | public void initialize() {
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/doser-dis-extensions/.settings/.jsdtscope:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/.settings/.jsdtscope:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/src/main/resources/log4j.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/lucene/features/IEntityCentricExtFeatures.java:
--------------------------------------------------------------------------------
1 | package doser.lucene.features;
2 |
3 | import java.util.Set;
4 |
/**
 * Describes an external Lucene feature set for an entity-centric knowledge
 * base. "External" features are those that are not integrated in Apache
 * Lucene.
 *
 * @author Stefan Zwicklbauer
 */
public interface IEntityCentricExtFeatures {

    /**
     * @param docId document id to look up
     * @return the prior value for that document
     */
    float getPriorOfDocument(int docId);

    /**
     * @param keyword keyword the sense prior refers to
     * @param docId   document id to look up
     * @return the sense prior value for the keyword/document pair
     */
    float getSensePriorOfDocument(String keyword, int docId);

    /**
     * @param url key to look up relations for
     * @return the relations for {@code url}; NOTE(review): the element type is
     *         not visible in this declaration, confirm against implementations
     */
    Set getRelations(String url);

    /**
     * @param sf  first lookup key; presumably a surface form, verify against callers
     * @param uri second lookup key
     * @return the occurrence count for the pair
     */
    int getOccurrences(String sf, String uri);
}
23 |
--------------------------------------------------------------------------------
/doser-dis-core/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7
4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
5 | org.eclipse.jdt.core.compiler.compliance=1.7
6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate
7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate
8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate
9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
11 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
12 | org.eclipse.jdt.core.compiler.source=1.7
13 |
--------------------------------------------------------------------------------
/doser-dis-extensions/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7
4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
5 | org.eclipse.jdt.core.compiler.compliance=1.7
6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate
7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate
8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate
9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
11 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
12 | org.eclipse.jdt.core.compiler.source=1.7
13 |
--------------------------------------------------------------------------------
/yes.pub:
--------------------------------------------------------------------------------
1 | ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQDFiuuAKuK8WhRCBVZpjlXIs7TWNKwtpGYqhrbF+hOkstu26QXsPYz6ywZDfQzHS3ey6mi1a/nBx9IYwwgPERu56M1OEUXvHQogEmowCMMVGCDkDgkfkCsMeChIsvCqabTugX6sT/6HHR26QXD1xzkVMhlyF7AuK+XxHNriu7SaVjYwBfVyQc4Mf8usoigKJgBRu5vj4BXzH5oslIAlCZTcFR3tT7Iy4G7IpFwjoBZufQeQiS7k8JLfgKjB9Mcc3H9/gZNvau7RsuAo24SQ4y9Jjt3BahqVdxJgKZMdYyQeRresX7oiXqrsrwBAKHyFUZZAxYZJT2Y0PaK7IrZfXRikmSN+W2Gf9dTxRI5LfYW94JvTIeT5anUhOYtAf71wSmAimQrXbMS4JKlbbZSQB/U/GY3XX+mEyoG/qqgJUNjBTF5NPtOzKbprgTkubu6VNduokKLAJP+z0ZfDoZwZaPvXR9qmFu8E5qaAIfXM/oXd9DPcSuyAh1HvXnkCHJ0z1oGusmc/Cpk6Agt5IvL4khb/HtQpvdbr8DDM963Zy8VEHaq1Uq1SKEpAcw678EtbEymbEieL0BSq8wbBn6fQRXWiCDdiqRbAkIK3Q1kyMKxmovPmYtzykYgWmb0feQpVpROVvL1JyOCKRKEK2xEWsVidcBZJtTb+JW9OkThdun8q5w== quhfus@stefan.zwicklbauer@uni-passau.de
2 |
--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7
4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
5 | org.eclipse.jdt.core.compiler.compliance=1.7
6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate
7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate
8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate
9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
11 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
12 | org.eclipse.jdt.core.compiler.source=1.7
13 |
--------------------------------------------------------------------------------
/doser-dis-extensions/.settings/org.eclipse.wst.common.component:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/Candidate.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.algorithms;
2 |
/**
 * An immutable pairing of a candidate string with a score.
 *
 * <p>The natural ordering is ascending by score, using
 * {@link Double#compare(double, double)} so that it is a total order even
 * when a score is {@code NaN} (NaN sorts after every other value). The
 * ordering looks at the score only and is therefore not consistent with
 * {@code equals}, which this class does not override.
 */
class Candidate implements Comparable<Candidate> {

    private final String candidate;
    private final double score;

    /**
     * @param candidate the candidate string, returned by {@link #getCandidate()}
     *                  and {@link #toString()}
     * @param score     the score used for ordering
     */
    Candidate(String candidate, double score) {
        this.candidate = candidate;
        this.score = score;
    }

    /**
     * Orders candidates by ascending score.
     *
     * @param o the candidate to compare with
     * @return a negative value, zero, or a positive value if this score is
     *         less than, equal to, or greater than the other score
     */
    @Override
    public int compareTo(Candidate o) {
        // Double.compare keeps the comparison transitive for NaN scores; a
        // plain </> chain would report NaN as "equal" to everything.
        return Double.compare(this.score, o.score);
    }

    /** @return the candidate string */
    String getCandidate() {
        return candidate;
    }

    /** @return the score */
    double getScore() {
        return score;
    }

    /** @return the candidate string, without the score */
    @Override
    public String toString() {
        return candidate;
    }
}
--------------------------------------------------------------------------------
/doser-dis-core/.settings/org.eclipse.wst.common.component:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 | uses
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/dpo/DisambiguatedEntity.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.dpo;
2 |
3 |
/**
 * POJO describing a disambiguated entity, intended for automatic
 * (de-)serialization.
 *
 * <p>Only the entity URI is stored at the moment. The four-argument
 * constructor also accepts the entity mention (the text), a confidence value
 * and an entity description, but those arguments are currently ignored.
 * TODO may not be complete (e.g. relevant terms may be added)
 *
 * @author zwicklbauer
 *
 */
public class DisambiguatedEntity {

    /** URI of the entity; starts out as the empty string for the no-argument constructor. */
    private String entityUri = "";

    /** Creates an entity whose URI is the empty string. */
    public DisambiguatedEntity() {
        super();
    }

    /**
     * Creates an entity with the given URI.
     *
     * @param text        entity mention; not stored
     * @param entityUri   URI of the entity
     * @param confidence  confidence of the decision; not stored
     * @param description entity description; not stored
     */
    public DisambiguatedEntity(final String text, final String entityUri,
            final double confidence, final String description) {
        setEntityUri(entityUri);
    }

    /** @return the URI of the entity */
    public String getEntityUri() {
        return entityUri;
    }

    /** @param entityUri the new URI of the entity */
    public final void setEntityUri(final String entityUri) {
        this.entityUri = entityUri;
    }
}
36 |
--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/lucene/query/LTRBooleanQuery.java:
--------------------------------------------------------------------------------
1 | package doser.lucene.query;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.lucene.search.BooleanQuery;
6 | import org.apache.lucene.search.IndexSearcher;
7 | import org.apache.lucene.search.Weight;
8 |
9 | public class LTRBooleanQuery extends BooleanQuery {
10 |
11 | public class LTRBooleanWeight extends BooleanWeight {
12 |
13 | public LTRBooleanWeight(final IndexSearcher searcher,
14 | final boolean disableCoord) throws IOException {
15 | super(searcher, disableCoord);
16 | }
17 |
18 | @Override
19 | public float coord(final int overlap, final int maxOverlap) {
20 | // return 1.0f;
21 | return maxOverlap == 1 ? 1F : similarity.coord(overlap, maxOverlap);
22 | }
23 | }
24 |
25 | public LTRBooleanQuery() {
26 | super();
27 | }
28 |
29 | public LTRBooleanQuery(final boolean bool) {
30 | super(bool);
31 | }
32 |
33 | @Override
34 | public Weight createWeight(final IndexSearcher searcher) throws IOException {
35 | return new LTRBooleanWeight(searcher, isCoordDisabled());
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
4 | 4.0.0
5 |
6 | doser-dis
7 | doser-dis-parent
8 | pom
9 | 1.0
10 | DoSer
11 |
12 |
13 | doser-dis-extensions
14 | doser-dis-core
15 | doser-dis-disambiguationserver
16 |
17 |
18 |
19 |
20 |
21 |
22 | org.apache.maven.plugins
23 | maven-compiler-plugin
24 | 3.1
25 |
26 | 1.7
27 | 1.7
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/algorithms/MajorityVoteAlgorithm.java:
--------------------------------------------------------------------------------
1 | package doser.algorithms;
2 |
3 | import java.util.HashMap;
4 | import java.util.List;
5 | import java.util.Map;
6 |
7 | import doser.general.HelpfulMethods;
8 |
9 |
10 | /**
11 | * Majority vote methods for arbitrary types
12 | *
13 | * @author Stefan Zwicklbauer
14 | *
15 | */
16 | public final class MajorityVoteAlgorithm> {
17 |
18 | public MajorityVoteAlgorithm() {
19 | super();
20 | }
21 |
22 | public Map.Entry getMajorityType(final List typeList) {
23 | final List> list = this
24 | .getMajorityTypes(typeList);
25 | Map.Entry res = null;
26 | if (!list.isEmpty()) {
27 | res = list.get(0);
28 | }
29 | return res;
30 | }
31 |
32 | public List> getMajorityTypes(final List list) {
33 | final Map hash = new HashMap();
34 | for (final K k : list) {
35 | if (hash.containsKey(k)) {
36 | Integer number = hash.get(k);
37 | hash.put(k, ++number);
38 | } else {
39 | hash.put(k, 1);
40 | }
41 | }
42 | return HelpfulMethods.sortByValue(hash);
43 | }
44 |
45 | }
46 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/Edge.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.algorithms.collective;
2 |
3 |
4 | public class Edge {
5 |
6 | private Integer edgeNr;
7 |
8 | private Vertex target;
9 |
10 | private double transition;
11 |
12 | private Double edgeProbability;
13 |
14 | public Edge(Integer edgeNr, Vertex target, double transition) {
15 | super();
16 | this.transition = transition;
17 | this.edgeNr = edgeNr;
18 | this.target = target;
19 | }
20 |
21 | public double getTransition() {
22 | return transition;
23 | }
24 | public void setTransition(double transition) {
25 | this.transition = transition;
26 | }
27 |
28 | public void setProbability(double p) {
29 | this.edgeProbability = new Double(p);
30 | }
31 |
32 | public Double getProbability() {
33 | return this.edgeProbability;
34 | }
35 |
36 | public Vertex getTarget() {
37 | return this.target;
38 | }
39 |
40 | @Override
41 | public boolean equals(Object obj) {
42 | if(this.edgeNr == ((Edge) obj).edgeNr) {
43 | return true;
44 | }
45 | return false;
46 | }
47 |
48 | @Override
49 | public int hashCode() {
50 | return edgeNr.hashCode();
51 | }
52 |
53 | }
54 |
--------------------------------------------------------------------------------
/Word2VecRestInterface/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/.settings/org.eclipse.wst.common.component:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 | uses
9 |
10 |
11 | uses
12 |
13 |
14 |
15 |
16 |
17 |
--------------------------------------------------------------------------------
/doser-dis-core/.classpath:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
--------------------------------------------------------------------------------
/doser-dis-extensions/pom.xml:
--------------------------------------------------------------------------------
1 |
3 |
4 | doser-dis
5 | doser-dis-parent
6 | 1.0
7 |
8 | 4.0.0
9 | doser.sub
10 | doser-dis-extensions
11 | 0.6
12 | doser-dis-extensions
13 |
14 |
15 | doser-dis-extensions
16 |
17 |
18 |
19 |
20 |
21 | org.apache.lucene
22 | lucene-core
23 | 4.10.4
24 |
25 |
26 | org.apache.lucene
27 | lucene-analyzers-common
28 | 4.10.4
29 |
30 |
31 | org.apache.lucene
32 | lucene-queryparser
33 | 4.10.4
34 |
35 |
36 |
37 |
38 |
39 |
40 |
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
1 |
2 |
3 | de.uop.code-disambiguationserver
4 |
5 |
6 |
7 |
8 |
9 | org.eclipse.wst.jsdt.core.javascriptValidator
10 |
11 |
12 |
13 |
14 | org.eclipse.jdt.core.javabuilder
15 |
16 |
17 |
18 |
19 | org.eclipse.wst.common.project.facet.core.builder
20 |
21 |
22 |
23 |
24 | org.eclipse.wst.validation.validationbuilder
25 |
26 |
27 |
28 |
29 | org.eclipse.m2e.core.maven2Builder
30 |
31 |
32 |
33 |
34 |
35 | org.eclipse.jem.workbench.JavaEMFNature
36 | org.eclipse.wst.common.modulecore.ModuleCoreNature
37 | org.eclipse.jdt.core.javanature
38 | org.eclipse.m2e.core.maven2Nature
39 | org.eclipse.wst.common.project.facet.core.nature
40 | org.eclipse.wst.jsdt.core.jsNature
41 |
42 |
43 |
--------------------------------------------------------------------------------
/doser-dis-core/.project:
--------------------------------------------------------------------------------
1 |
2 |
3 | doser-extensions
4 |
5 |
6 |
7 |
8 |
9 | org.eclipse.wst.jsdt.core.javascriptValidator
10 |
11 |
12 |
13 |
14 | org.eclipse.jdt.core.javabuilder
15 |
16 |
17 |
18 |
19 | org.eclipse.wst.common.project.facet.core.builder
20 |
21 |
22 |
23 |
24 | org.eclipse.wst.validation.validationbuilder
25 |
26 |
27 |
28 |
29 | org.eclipse.m2e.core.maven2Builder
30 |
31 |
32 |
33 |
34 |
35 | org.eclipse.jem.workbench.JavaEMFNature
36 | org.eclipse.wst.common.modulecore.ModuleCoreNature
37 | org.eclipse.jdt.core.javanature
38 | org.eclipse.m2e.core.maven2Nature
39 | org.eclipse.wst.common.project.facet.core.nature
40 | org.eclipse.wst.jsdt.core.jsNature
41 |
42 |
43 |
--------------------------------------------------------------------------------
/doser-dis-extensions/.project:
--------------------------------------------------------------------------------
1 |
2 |
3 | doser-extensions
4 |
5 |
6 |
7 |
8 |
9 | org.eclipse.wst.jsdt.core.javascriptValidator
10 |
11 |
12 |
13 |
14 | org.eclipse.jdt.core.javabuilder
15 |
16 |
17 |
18 |
19 | org.eclipse.wst.common.project.facet.core.builder
20 |
21 |
22 |
23 |
24 | org.eclipse.wst.validation.validationbuilder
25 |
26 |
27 |
28 |
29 | org.eclipse.m2e.core.maven2Builder
30 |
31 |
32 |
33 |
34 |
35 | org.eclipse.jem.workbench.JavaEMFNature
36 | org.eclipse.wst.common.modulecore.ModuleCoreNature
37 | org.eclipse.jdt.core.javanature
38 | org.eclipse.m2e.core.maven2Nature
39 | org.eclipse.wst.common.project.facet.core.nature
40 | org.eclipse.wst.jsdt.core.jsNature
41 |
42 |
43 |
--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/.classpath:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/.project:
--------------------------------------------------------------------------------
1 |
2 |
3 | de.uop.code-disambiguationserver
4 |
5 |
6 |
7 |
8 |
9 | org.eclipse.wst.jsdt.core.javascriptValidator
10 |
11 |
12 |
13 |
14 | org.eclipse.jdt.core.javabuilder
15 |
16 |
17 |
18 |
19 | org.eclipse.wst.common.project.facet.core.builder
20 |
21 |
22 |
23 |
24 | org.eclipse.wst.validation.validationbuilder
25 |
26 |
27 |
28 |
29 | org.eclipse.m2e.core.maven2Builder
30 |
31 |
32 |
33 |
34 |
35 | org.eclipse.jem.workbench.JavaEMFNature
36 | org.eclipse.wst.common.modulecore.ModuleCoreNature
37 | org.eclipse.jdt.core.javanature
38 | org.eclipse.m2e.core.maven2Nature
39 | org.eclipse.wst.common.project.facet.core.nature
40 | org.eclipse.wst.jsdt.core.jsNature
41 |
42 |
43 |
--------------------------------------------------------------------------------
/doser-dis-extensions/.classpath:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/dpo/DisambiguationResponse.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.dpo;
2 |
3 | import java.util.List;
4 |
5 | /**
6 | * {
7 | *
8 | * "documentUri":"unique document id", "disambiguatedSurfaceforms": [ {
9 | * "selectedText":"influenza", "position": { "pageId":0,
10 | * "offsets":[1,2,3,5,6,7], "boundingBox":{"minx":0.1, "miny":0.3, "maxx":0.01,
11 | * "maxy":0.03} }, "disEntities": [ { "text":"Influenza (Illness)"
12 | * "entityUri":"http://en.dbpedia.org/pages/..." "confidence":"0.80"
13 | * "description":"some additional description"
14 | *
15 | * ---a list of synonyms (for a later stage)--- "synonyms": [ { "term":"..." } ]
16 | * } // more Items ] }
17 | *
18 | * Version 2.0 is used for additional testing. Current version offers the usage
19 | * of a position array in surfaceFormsToDisambiguate
20 | *
21 | * @author Stefan Zwicklbauer
22 | *
23 | */
public class DisambiguationResponse {

    /** URI of the document this response refers to. */
    private String documentUri;

    // NOTE(review): raw list; elements are presumably Response objects - confirm before adding generics.
    private List tasks; // NOPMD by quh on 18.02.14 09:34

    /** @return the document URI, or {@code null} if none has been set */
    public String getDocumentUri() {
        return documentUri;
    }

    /** @param documentUri the document URI to store */
    public void setDocumentUri(final String documentUri) {
        this.documentUri = documentUri;
    }

    /** @return the task list, or {@code null} if none has been set */
    public List getTasks() {
        return this.tasks;
    }

    /** @param tasks the task list to store (kept by reference) */
    public void setTasks(List tasks) {
        this.tasks = tasks;
    }
}
46 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/dpo/Response.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.dpo;
2 |
3 | import java.util.LinkedList;
4 | import java.util.List;
5 |
6 | /**
7 | * This class represents a disambiguated surface form and contains all necessary
8 | * information about the disambiguation. Position is required because a
9 | * ColumnResponseItem has no unique primary key and assures the correct
10 | * assignment to the original item.
11 | *
12 | * Version 2.0 offers a list of positions
13 | *
14 | * @author Stefan Zwicklbauer
15 | *
16 | */
public class Response {

    // NOTE(review): raw list; elements are presumably DisambiguatedEntity objects - confirm
    // before adding generics.
    private List disEntities = new LinkedList();
    private String selectedText;
    private int documentId;

    /** Creates a response with an empty entity list. */
    public Response() {
        super();
    }

    /** @return the list of disambiguated entities (the internal list, not a copy) */
    public List getDisEntities() {
        return disEntities;
    }

    /** @param disEntities the entity list to store (kept by reference) */
    public void setDisEntities(final List disEntities) {
        this.disEntities = disEntities;
    }

    /** @return the selected text, or {@code null} if none has been set */
    public String getSelectedText() {
        return selectedText;
    }

    /** @param selectedText the selected text to store */
    public void setSelectedText(final String selectedText) {
        this.selectedText = selectedText;
    }

    /** @return the document id (0 until set) */
    public int getDocumentId() {
        return this.documentId;
    }

    /** @param documentId the document id to store */
    public void setDocumentId(int documentId) {
        this.documentId = documentId;
    }
}
52 |
--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/src/main/webapp/WEB-INF/web.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | Disambiguation Server
4 |
5 | contextConfigLocation
6 | /WEB-INF/applicationContext.xml
7 |
8 |
9 | encoding-filter
10 | org.springframework.web.filter.CharacterEncodingFilter
11 |
12 | encoding
13 | UTF-8
14 |
15 |
16 |
17 | encoding-filter
18 | /*
19 |
20 |
21 | dispatcher
22 | org.springframework.web.servlet.DispatcherServlet
23 | 1
24 |
25 |
26 | dispatcher
27 | /
28 |
29 |
30 | doser.server.actions.FrameworkInitialization
31 |
32 |
--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/lucene/query/LearnToRankFeatureDefaultValueManager.java:
--------------------------------------------------------------------------------
1 | package doser.lucene.query;
2 |
3 | /**
4 | * Not in use so far.
5 | *
6 | * @author quh
7 | */
public class LearnToRankFeatureDefaultValueManager {

    // NOTE(review): the shared instance is read and written without synchronization.
    private static LearnToRankFeatureDefaultValueManager man;

    /** @return the instance registered via {@link #setInstance}, or {@code null} if none */
    public static LearnToRankFeatureDefaultValueManager getInstance() {
        return man;
    }

    /** @param manager the instance to register as the shared one */
    public static void setInstance(
            final LearnToRankFeatureDefaultValueManager manager) {
        man = manager;
    }

    /** Number of times {@link #newQuery()} has been called. */
    private int amountQueries;

    /** Largest value seen so far per position. */
    private final float[] maxVals;

    /** Per-position sum of the maxima, accumulated on every {@link #newQuery()}. */
    private final float[] sums;

    /**
     * @param pos number of positions to track
     */
    public LearnToRankFeatureDefaultValueManager(final int pos) {
        // Java zero-initializes new arrays, so no explicit fill is needed.
        maxVals = new float[pos];
        sums = new float[pos];
        amountQueries = 0;
    }

    /**
     * Returns, per position, the accumulated sum divided by the number of queries.
     *
     * <p>Before the first {@link #newQuery()} call the result is all zeros rather than the
     * NaN values a division by zero would produce.
     *
     * @return a new array holding the per-position averages
     */
    public synchronized float[] getAverageResults() {
        final float[] results = new float[sums.length];
        if (amountQueries == 0) {
            return results;
        }
        for (int i = 0; i < sums.length; i++) {
            results[i] = sums[i] / amountQueries;
        }
        return results;
    }

    /**
     * Adds the current maxima to the sums and increments the query counter. The maxima
     * themselves are not reset.
     *
     * <p>Synchronized on the same lock as {@link #setValue(int, float)}, so a concurrent
     * update cannot interleave with the accumulation.
     */
    public synchronized void newQuery() {
        for (int i = 0; i < maxVals.length; i++) {
            sums[i] += maxVals[i];
        }
        amountQueries++;
    }

    /**
     * Records {@code value} for {@code position} if it exceeds the maximum stored there.
     *
     * @param position index of the position to update
     * @param value    candidate maximum
     */
    public synchronized void setValue(final int position, final float value) {
        if (maxVals[position] < value) {
            maxVals[position] = value;
        }
    }
}
57 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/dbpedia/CandidateReductionDBpediaW2V.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.algorithms.collective.dbpedia;
2 |
3 | import java.util.LinkedList;
4 | import java.util.List;
5 |
6 | import doser.entitydisambiguation.algorithms.SurfaceForm;
7 | import doser.entitydisambiguation.algorithms.collective.CandidateReduction;
8 | import doser.entitydisambiguation.knowledgebases.EntityCentricKBDBpedia;
9 |
10 | public class CandidateReductionDBpediaW2V extends CandidateReduction {
11 |
12 | private int iterations;
13 | private boolean disambiguate;
14 | private EntityCentricKBDBpedia eckb;
15 | private int reduceTo;
16 |
17 | CandidateReductionDBpediaW2V(EntityCentricKBDBpedia eckb, List rep, int maxsurfaceformsperquery,
18 | int reduceTo, int iterations, boolean disambiguate, boolean alwaysAction) {
19 | super(rep, maxsurfaceformsperquery, alwaysAction);
20 | this.iterations = iterations;
21 | this.disambiguate = disambiguate;
22 | this.eckb = eckb;
23 | this.reduceTo = reduceTo;
24 | }
25 |
26 | @Override
27 | public List miniSolve(List rep) {
28 | List sol = new LinkedList();
29 | Word2VecDisambiguator disambiguator = new Word2VecDisambiguator(eckb, rep, disambiguate, reduceTo, iterations);
30 | disambiguator.setup();
31 | disambiguator.solve();
32 | sol.addAll(disambiguator.getRepresentation());
33 | return sol;
34 |
35 | }
36 |
37 | }
38 |
--------------------------------------------------------------------------------
/.settings/org.eclipse.jst.jsp.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | validateFragments=false
3 | validation.actions-missing-required-attribute=1
4 | validation.actions-non-empty-inline-tag=2
5 | validation.actions-unexpected-rtexprvalue=2
6 | validation.actions-unknown-attribute=2
7 | validation.directive-attribute-duplicate=2
8 | validation.directive-include-fragment-file-not-found=2
9 | validation.directive-include-fragment-file-not-specified=2
10 | validation.directive-taglib-duplicate-prefixes-different-uris=2
11 | validation.directive-taglib-duplicate-prefixes-same-uris=-1
12 | validation.directive-taglib-missing-prefix=2
13 | validation.directive-taglib-missing-uri-or-tagdir=2
14 | validation.directive-taglib-unresolvable-uri-or-tagdir=2
15 | validation.el-function-undefined=1
16 | validation.el-general-syntax=1
17 | validation.el-lexical-failure=-1
18 | validation.java-=-1
19 | validation.java-local-variable-is-never-used=-1
20 | validation.java-null-local-variable-reference=-1
21 | validation.java-potential-null-local-variable-reference=-1
22 | validation.java-unused-import=-1
23 | validation.translation-tag-class-not-found=2
24 | validation.translation-tei-class-not-found=2
25 | validation.translation-tei-class-not-instantiated=2
26 | validation.translation-tei-class-runtime-exception=2
27 | validation.translation-tei-message=1
28 | validation.translation-usebean-ambiguous-type-info=2
29 | validation.translation-usebean-invalid-id=1
30 | validation.translation-usebean-missing-type-info=1
31 | validation.use-project-settings=true
32 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/general/CandidateReductionGeneralW2V.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.algorithms.collective.general;
2 |
3 | import java.util.LinkedList;
4 | import java.util.List;
5 |
6 | import doser.entitydisambiguation.algorithms.SurfaceForm;
7 | import doser.entitydisambiguation.algorithms.collective.CandidateReduction;
8 | import doser.entitydisambiguation.knowledgebases.AbstractEntityCentricKBGeneral;
9 |
10 | public class CandidateReductionGeneralW2V extends CandidateReduction {
11 |
12 | private int iterations;
13 | private boolean disambiguate;
14 | private AbstractEntityCentricKBGeneral eckb;
15 | private int reduceTo;
16 |
17 | public CandidateReductionGeneralW2V(AbstractEntityCentricKBGeneral eckb, List rep, int maxsurfaceformsperquery,
18 | int reduceTo, int iterations, boolean disambiguate, boolean alwaysAction) {
19 | super(rep, maxsurfaceformsperquery, alwaysAction);
20 | this.iterations = iterations;
21 | this.disambiguate = disambiguate;
22 | this.eckb = eckb;
23 | this.reduceTo = reduceTo;
24 | }
25 |
26 | @Override
27 | public List miniSolve(List rep) {
28 | List sol = new LinkedList();
29 | Word2VecDisambiguatorGeneral disambiguator = new Word2VecDisambiguatorGeneral(eckb, rep, disambiguate, reduceTo,
30 | iterations);
31 | disambiguator.setup();
32 | disambiguator.solve();
33 | sol.addAll(disambiguator.getRepresentation());
34 | return sol;
35 |
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/lucene/analysis/DoserIDTokenizer.java:
--------------------------------------------------------------------------------
1 | package doser.lucene.analysis;
2 |
3 | import java.io.Reader;
4 |
5 | import org.apache.lucene.analysis.Tokenizer;
6 | import org.apache.lucene.analysis.util.CharTokenizer;
7 | import org.apache.lucene.util.AttributeFactory;
8 |
9 | public final class DoserIDTokenizer extends CharTokenizer {
10 |
11 | /**
12 | * Construct a new WhitespaceTokenizer using a given
13 | * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}.
14 | *
15 | * @param matchVersion
16 | * Lucene version to match See
17 | * {@link above}
18 | * @param factory
19 | * the attribute factory to use for this {@link Tokenizer}
20 | * @param in
21 | * the input to split up into tokens
22 | */
23 | public DoserIDTokenizer(AttributeFactory factory, Reader in) {
24 | super(factory, in);
25 | }
26 |
27 | /**
28 | * Construct a new WhitespaceTokenizer. * @param matchVersion Lucene version
29 | * to match See {@link above}
30 | *
31 | * @param in
32 | * the input to split up into tokens
33 | */
34 | public DoserIDTokenizer(Reader in) {
35 | super(in);
36 | }
37 |
38 | /**
39 | * Collects only characters which do not satisfy
40 | * {@link Character#isWhitespace(int)}.
41 | */
42 | @Override
43 | protected boolean isTokenChar(int c) {
44 | boolean check = true;
45 | if (Character.isWhitespace(c)) {
46 | check = false;
47 | }
48 | return check;
49 | }
50 | }
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/rules/RuleAdapation.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.algorithms.rules;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 |
6 | import doser.entitydisambiguation.knowledgebases.EntityCentricKBDBpedia;
7 | import doser.entitydisambiguation.algorithms.SurfaceForm;
8 | import doser.entitydisambiguation.knowledgebases.AbstractKnowledgeBase;
9 |
10 | public class RuleAdapation {
11 |
12 | private List ruleChain;
13 |
14 | public RuleAdapation() {
15 | super();
16 | this.ruleChain = new ArrayList();
17 | }
18 |
19 | public void addNoCandidatesCheckPluralRule(AbstractKnowledgeBase eckb) {
20 | this.ruleChain.add(new NoCandidatesCheckPlural(eckb));
21 | }
22 |
23 | public void addNoCandidatesExpansionRule(AbstractKnowledgeBase eckb) {
24 | this.ruleChain.add(new NoCandidatesExpansionRules(eckb));
25 | }
26 |
27 | public void addUnambiguousToAmbiguousRule(EntityCentricKBDBpedia eckb) {
28 | this.ruleChain.add(new UnambiguousToAmbiguousRule(eckb));
29 | }
30 |
31 | public void addPatternRule(EntityCentricKBDBpedia eckb, String topic) {
32 | if (topic != null) {
33 | this.ruleChain.add(new PatternRule(eckb));
34 | }
35 | }
36 |
37 | public void addContextRule(EntityCentricKBDBpedia eckb) {
38 | this.ruleChain.add(new ContextRule(eckb));
39 | }
40 |
41 | public void performRuleChainBeforeCandidateSelection(List rep) {
42 | for (AbstractRule r : ruleChain) {
43 | r.applyRule(rep);
44 | }
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/src/main/webapp/WEB-INF/dispatcher-servlet.xml:
--------------------------------------------------------------------------------
1 |
2 |
11 |
12 |
13 |
14 |
15 |
16 |
18 |
19 |
20 |
21 |
22 |
23 |
25 |
26 |
27 |
28 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/tools/ServiceQueries.java:
--------------------------------------------------------------------------------
1 | package doser.tools;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.http.Header;
6 | import org.apache.http.HttpEntity;
7 | import org.apache.http.HttpResponse;
8 | import org.apache.http.client.ClientProtocolException;
9 | import org.apache.http.client.methods.HttpPost;
10 | import org.apache.http.entity.AbstractHttpEntity;
11 | import org.apache.http.impl.client.DefaultHttpClient;
12 | import org.apache.http.util.EntityUtils;
13 | import org.apache.log4j.Logger;
14 |
15 | /**
16 | * Class providing queries for different services. Integrated so far: DbPedia
17 | * Spotlight
18 | *
19 | * @author Stefan Zwicklbauer
20 | *
21 | */
22 | public class ServiceQueries {
23 |
24 | public static String httpPostRequest(String uri, AbstractHttpEntity entity,
25 | Header[] header) {
26 | DefaultHttpClient httpclient = new DefaultHttpClient();
27 | HttpPost httppost = new HttpPost(uri);
28 | httppost.setHeaders(header);
29 | httppost.setEntity(entity);
30 |
31 | HttpResponse response;
32 | StringBuffer buffer = new StringBuffer();
33 | try {
34 | response = httpclient.execute(httppost);
35 | HttpEntity ent = response.getEntity();
36 |
37 | buffer.append(EntityUtils.toString(ent));
38 | httpclient.getConnectionManager().shutdown();
39 |
40 | } catch (ClientProtocolException e) {
41 | Logger.getRootLogger().error("HTTPClient error", e);
42 | } catch (IOException e) {
43 | Logger.getRootLogger().error("HTTPClient error", e);
44 | }
45 | return buffer.toString();
46 | }
47 | }
48 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/backend/AbstractDisambiguationTask.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.backend;
2 |
3 | import java.util.List;
4 |
5 | import doser.entitydisambiguation.dpo.Response;
6 | import doser.entitydisambiguation.knowledgebases.AbstractKnowledgeBase;
7 | import doser.entitydisambiguation.knowledgebases.KnowledgeBaseIdentifiers;
8 |
9 | public abstract class AbstractDisambiguationTask {
10 |
11 | protected int returnNr;
12 |
13 | protected AbstractKnowledgeBase kb;
14 |
15 | protected KnowledgeBaseIdentifiers kbIdentifier;
16 |
17 | protected boolean retrieveDocClasses;
18 |
19 | protected List responses;
20 |
21 | public int getReturnNr() {
22 | return returnNr;
23 | }
24 |
25 | public void setReturnNr(int returnNr) {
26 | this.returnNr = returnNr;
27 | }
28 |
29 | public AbstractKnowledgeBase getKb() {
30 | return kb;
31 | }
32 |
33 | public void setKb(AbstractKnowledgeBase kb) {
34 | this.kb = kb;
35 | }
36 |
37 | public KnowledgeBaseIdentifiers getKbIdentifier() {
38 | return this.kbIdentifier;
39 | }
40 |
41 | public boolean isRetrieveDocClasses() {
42 | return retrieveDocClasses;
43 | }
44 |
45 | public void setRetrieveDocClasses(boolean retrieveDocClasses) {
46 | this.retrieveDocClasses = retrieveDocClasses;
47 | }
48 |
49 | public List getResponse() {
50 | return responses;
51 | }
52 |
53 | public void setResponse(List responses) {
54 | this.responses = responses;
55 | }
56 |
57 | public abstract void setKbIdentifier(String kbversion, String setting);
58 | }
59 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/word2vec/Data.java:
--------------------------------------------------------------------------------
1 | package doser.word2vec;
2 |
/**
 * Plain data holder with a surface form, a query number, candidate strings
 * and a context string. All properties are {@code null} until set.
 */
public class Data {

    private String qryNr;
    private String surfaceForm;
    private String context;
    private String[] candidates;

    /** @return the surface form */
    public String getSurfaceForm() {
        return this.surfaceForm;
    }

    /** @param surfaceForm the surface form to store */
    public void setSurfaceForm(String surfaceForm) {
        this.surfaceForm = surfaceForm;
    }

    /** @return the query number */
    public String getQryNr() {
        return this.qryNr;
    }

    /** @param qryNr the query number to store */
    public void setQryNr(String qryNr) {
        this.qryNr = qryNr;
    }

    /** @return the candidate array itself (not a copy) */
    public String[] getCandidates() {
        return this.candidates;
    }

    /** @param candidates the candidate array to store (kept by reference) */
    public void setCandidates(String[] candidates) {
        this.candidates = candidates;
    }

    /** @return the context */
    public String getContext() {
        return this.context;
    }

    /** @param context the context to store */
    public void setContext(String context) {
        this.context = context;
    }
}
70 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/AbstractDisambiguationAlgorithm.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.algorithms;
2 |
3 | import doser.entitydisambiguation.backend.AbstractDisambiguationTask;
4 |
5 | public abstract class AbstractDisambiguationAlgorithm {
6 |
7 | protected AbstractDisambiguationTask task;
8 |
9 | public void disambiguate(AbstractDisambiguationTask task)
10 | throws IllegalDisambiguationAlgorithmInputException {
11 | if (checkAndSetInputParameter(task)) {
12 | if (preDisambiguation()) {
13 | processAlgorithm();
14 | }
15 | } else {
16 | throw new IllegalDisambiguationAlgorithmInputException(
17 | "Check your input knowledge base and disambiguation task");
18 | }
19 | }
20 |
21 | public static String extractContext(int position, String text,
22 | int contextarea) {
23 | if(text == null || text.length() == 0) {
24 | return "";
25 | }
26 |
27 | long startArea = position - contextarea;
28 | long endArea = position + contextarea;
29 | if (startArea < 0) {
30 | startArea = 0;
31 | }
32 | if (endArea > text.length() - 1) {
33 | endArea = text.length() - 1;
34 | }
35 | String tempText = text.substring((int) startArea, (int) endArea);
36 | String[] splitter = tempText.split(" ");
37 | String result = "";
38 | for (int i = 1; i < splitter.length - 1; i++) {
39 | result += splitter[i] + " ";
40 | }
41 | return result;
42 | }
43 |
44 | protected abstract boolean checkAndSetInputParameter(AbstractDisambiguationTask task);
45 |
46 | protected abstract void processAlgorithm()
47 | throws IllegalDisambiguationAlgorithmInputException;
48 |
49 | protected abstract boolean preDisambiguation();
50 | }
--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/general/HelpfulMethods.java:
--------------------------------------------------------------------------------
1 | package doser.general;
2 |
3 | import java.util.Collections;
4 | import java.util.Comparator;
5 | import java.util.LinkedList;
6 | import java.util.List;
7 | import java.util.Map;
8 |
/**
 * Static helper methods.
 */
public final class HelpfulMethods {

    /**
     * Returns the entries of a map ordered by value, largest value first.
     *
     * The returned list holds the entry objects taken from
     * {@code map.entrySet()}; the map itself is not modified.
     *
     * Partially buggy due to
     * http://stackoverflow.com/questions/109383/how-to-sort-a-mapkey-value-on-the-values-in-java/1283722#1283722
     *
     * @param <K>
     *            key type
     * @param <V>
     *            value type; must be comparable to itself
     * @param map
     *            the map whose entries are sorted
     * @return a new list of the map's entries in descending value order
     */
    @Deprecated
    public static <K, V extends Comparable<? super V>> List<Map.Entry<K, V>> sortByValue(
            final Map<K, V> map) {
        final List<Map.Entry<K, V>> list = new LinkedList<Map.Entry<K, V>>(
                map.entrySet());
        Collections.sort(list, new Comparator<Map.Entry<K, V>>() {
            @Override
            public int compare(final Map.Entry<K, V> op1,
                    final Map.Entry<K, V> op2) {
                // Operands are swapped on purpose: descending order.
                return (op2.getValue()).compareTo(op1.getValue());
            }
        });
        return list;
    }

    /*
     * Correct Map Sorting with Guava (disabled; kept for reference).
     */
    // public static <K, V extends Comparable<? super V>> List<Map.Entry<K, V>> sortByValueGuava(
    // Map<K, V> map) {
    // // final List<K> sortedKeys =
    // // Ordering.natural().onResultOf(Functions.forMap(map)).immutableSortedCopy(map.keySet());
    //
    // Comparator<Map.Entry<K, V>> byMapValues = new Ordering<Map.Entry<K, V>>() {
    // @Override
    // public int compare(Map.Entry<K, V> left, Map.Entry<K, V> right) {
    // return left.getValue().compareTo(right.getValue());
    // }
    // };
    //
    // List<Map.Entry<K, V>> entryList = Lists.newArrayList(map.entrySet());
    // Collections.sort(entryList, byMapValues);
    // return entryList;
    // }
}
57 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/backend/DisambiguationTaskSingle.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.backend;
2 |
3 | import doser.entitydisambiguation.dpo.EntityDisambiguationDPO;
4 | import doser.entitydisambiguation.knowledgebases.KnowledgeBaseIdentifiers;
5 |
6 | public class DisambiguationTaskSingle extends AbstractDisambiguationTask {
7 |
8 | private EntityDisambiguationDPO entityToDis;
9 |
10 | public DisambiguationTaskSingle(final EntityDisambiguationDPO entityToDis) {
11 | super();
12 | this.entityToDis = entityToDis;
13 | this.retrieveDocClasses = false;
14 | }
15 |
16 | public EntityDisambiguationDPO getEntityToDisambiguate() {
17 | return this.entityToDis;
18 | }
19 |
20 | public void setSurfaceForm(final EntityDisambiguationDPO surfaceForm) {
21 | this.entityToDis = surfaceForm;
22 | }
23 |
24 | /**
25 | * Assignment function to determine the used knowledge base
26 | *
27 | * @param kbversion
28 | * @param setting
29 | */
30 | @Override
31 | public void setKbIdentifier(String kbversion, String setting) {
32 | if(setting == null) {
33 | this.kbIdentifier = KnowledgeBaseIdentifiers.Standard;
34 | } else if(setting.equalsIgnoreCase("DocumentCentric")) {
35 | if(kbversion.equalsIgnoreCase("default")) {
36 | this.kbIdentifier = KnowledgeBaseIdentifiers.DocumentCentricDefault;
37 | } else {
38 | this.kbIdentifier = KnowledgeBaseIdentifiers.DocumentCentricDefault;
39 | }
40 | } else if(setting.equalsIgnoreCase("EntityCentric")) {
41 | if(kbversion.equalsIgnoreCase("default")) {
42 | this.kbIdentifier = KnowledgeBaseIdentifiers.Standard;
43 | } else if(kbversion.equalsIgnoreCase("cstable")) {
44 | this.kbIdentifier = KnowledgeBaseIdentifiers.CSTable;
45 | } else if(kbversion.equalsIgnoreCase("biomedcopy")) {
46 | this.kbIdentifier = KnowledgeBaseIdentifiers.Biomed;
47 | } else {
48 | this.kbIdentifier = KnowledgeBaseIdentifiers.Standard;
49 | }
50 | } else {
51 | this.kbIdentifier = KnowledgeBaseIdentifiers.Standard;
52 | }
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/nlp/NLPTools.java:
--------------------------------------------------------------------------------
1 | package doser.nlp;
2 |
3 | import java.util.List;
4 | import java.util.Properties;
5 |
6 | //import edu.stanford.nlp.ling.CoreAnnotations;
7 | //import edu.stanford.nlp.ling.CoreLabel;
8 | //import edu.stanford.nlp.pipeline.Annotation;
9 | //import edu.stanford.nlp.pipeline.StanfordCoreNLP;
10 | //import edu.stanford.nlp.util.Pair;
11 | //
12 | //
13 | //public class NLPTools {
14 | //
15 | // private static volatile NLPTools instance;
16 | //
17 | // private StanfordCoreNLP pipeline;
18 | //
19 | // private NLPTools() {
20 | // super();
21 | // Properties props = new Properties();
22 | // props.put("annotators", "tokenize, ssplit, pos, lemma, stopword");
23 | // props.setProperty("customAnnotatorClass.stopword",
24 | // "doser.nlp.StopWordAnnotator");
25 | // props.setProperty(StopWordAnnotator.STOPWORDS_LIST, StopWordAnnotator.customStopWordList);
26 | // props.setProperty(StopWordAnnotator.CHECK_LEMMA, "true");
27 | //
28 | // this.pipeline = new StanfordCoreNLP(props);
29 | // }
30 | //
31 | // public static NLPTools getInstance() {
32 | // if (instance == null ) {
33 | // synchronized (NLPTools.class) {
34 | // if (instance == null) {
35 | // instance = new NLPTools();
36 | // }
37 | // }
38 | // }
39 | // return instance;
40 | // }
41 | //
42 | // public String performLemmatizationAndStopWordRemoval(String str) {
43 | // Annotation document = new Annotation(str);
44 | // this.pipeline.annotate(document);
45 | // List tokens = document
46 | // .get(CoreAnnotations.TokensAnnotation.class);
47 | // StringBuilder builder = new StringBuilder();
48 | // for (CoreLabel token : tokens) {
49 | // Pair stopword = token.get(StopWordAnnotator.class);
50 | // String lemma = token.lemma().toLowerCase();
51 | // if(!stopword.first()) {
52 | // builder.append(lemma);
53 | // builder.append(" ");
54 | // }
55 | // }
56 | // return builder.toString().trim();
57 | // }
58 | //}
59 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/dpo/DisambiguationRequest.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.dpo;
2 |
3 | import java.util.List;
4 |
/**
 * Data object for a disambiguation request. Example of the JSON format:
 *
 * <pre>
 * { "documentUri":"unique document id", "surfaceFormsToDisambiguate": [ {
 * "selectedText":"influenza", "context":
 * "Typically, influenza is transmitted through the air by coughs or sneezes, creating aerosols containing the virus."
 * , "position": { "pageId":0, "offsets":[1,2,3,5,6,7],
 * "boundingBox":{"minx":0.1, "miny":0.3, "maxx":0.01, "maxy":0.03} } } ],
 * "alreadyDisambiguatedEntities":[ { "text":"Illness",
 * "entityUri":"http://en.dbpedia.org/page/Illness", "confidence": 0.90,
 * "distance": 300 }, { "text":"Disease",
 * "entityUri":"http://en.dbpedia.org/page/Disease", "confidence": 0.65,
 * "distance": 500 } ] }
 * </pre>
 *
 * Version 2.0 is used for additional testing. The current version offers the
 * usage of a position array in surfaceFormsToDisambiguate.
 *
 * @author Stefan Zwicklbauer
 *
 */
public class DisambiguationRequest {
    private String documentUri;
    // Element type is presumably EntityDisambiguationDPO - TODO confirm.
    private List surfaceFormsToDisambiguate;
    private Integer docsToReturn;
    private String mainTopic;

    /** @return the document identifier, or {@code null} if none was set */
    public String getDocumentUri() {
        return documentUri;
    }

    /** @param documentUri the document identifier to store */
    public void setDocumentUri(final String documentUri) {
        this.documentUri = documentUri;
    }

    /** @return the stored list of surface forms, exactly as it was set */
    public List getSurfaceFormsToDisambiguate() {
        return surfaceFormsToDisambiguate;
    }

    /** @param surfaceFormsToDisambiguate the list of surface forms to store */
    public void setSurfaceFormsToDisambiguate(
            final List surfaceFormsToDisambiguate) {
        this.surfaceFormsToDisambiguate = surfaceFormsToDisambiguate;
    }

    /** @return the stored docsToReturn value, or {@code null} if unset */
    public Integer getDocsToReturn() {
        return this.docsToReturn;
    }

    /** @param docsToReturn the docsToReturn value to store */
    public void setDocsToReturn(Integer docsToReturn) {
        this.docsToReturn = docsToReturn;
    }

    /** @return the stored main topic, or {@code null} if unset */
    public String getMainTopic() {
        return this.mainTopic;
    }

    /** @param mainTopic the main topic to store */
    public void setMainTopic(String mainTopic) {
        this.mainTopic = mainTopic;
    }
}
62 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/DisambiguationHandler.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.algorithms;
2 |
3 | import doser.entitydisambiguation.algorithms.collective.dbpedia.CollectiveDisambiguationDBpediaEntities;
4 | import doser.entitydisambiguation.algorithms.collective.general.CollectiveDisambiguationGeneralEntities;
5 | import doser.entitydisambiguation.backend.AbstractDisambiguationTask;
6 | import doser.entitydisambiguation.backend.DisambiguationTaskSingle;
7 | import doser.entitydisambiguation.dpo.EntityDisambiguationDPO;
8 | import doser.entitydisambiguation.knowledgebases.KnowledgeBaseIdentifiers;
9 |
10 | public class DisambiguationHandler {
11 |
12 | private static final DisambiguationHandler instance;
13 |
14 | static {
15 | try {
16 | instance = new DisambiguationHandler();
17 | } catch (Exception e) {
18 | throw new RuntimeException("An error occurred!", e);
19 | }
20 | }
21 |
22 | private DisambiguationHandler() {
23 | super();
24 | }
25 |
26 | public static DisambiguationHandler getInstance() {
27 | return instance;
28 | }
29 |
30 | public AbstractDisambiguationAlgorithm getAlgorithm(AbstractDisambiguationTask task) {
31 | AbstractDisambiguationAlgorithm algorithm = null;
32 | if (task instanceof DisambiguationTaskSingle) {
33 | DisambiguationTaskSingle t = (DisambiguationTaskSingle) task;
34 | EntityDisambiguationDPO dpo = t.getEntityToDisambiguate();
35 | if ((dpo.getSetting() != null
36 | && (dpo.getSetting().equalsIgnoreCase("NoContext"))
37 | || dpo.getContext() == null || dpo.getContext().equals("") || dpo
38 | .getContext().equals(" "))) {
39 | algorithm = new EntityCentricAlgorithmTableDefault();
40 | } else if ((dpo.getSetting() != null)
41 | && (dpo.getSetting().equalsIgnoreCase("DocumentCentric"))) {
42 | algorithm = new DocumentCentricAlgorithmDefault();
43 | } else {
44 | algorithm = new EntityCentricAlgorithmDefault();
45 | }
46 | } else {
47 | if (task.getKbIdentifier().equals(KnowledgeBaseIdentifiers.Biomed)) {
48 | algorithm = new CollectiveDisambiguationGeneralEntities();
49 | } else {
50 | algorithm = new CollectiveDisambiguationDBpediaEntities();
51 | }
52 | }
53 | return algorithm;
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/word2vec/Word2VecJsonFormat.java:
--------------------------------------------------------------------------------
1 | package doser.word2vec;
2 |
3 | import java.io.IOException;
4 | import java.util.Set;
5 |
6 | import org.apache.http.Header;
7 | import org.apache.http.entity.ByteArrayEntity;
8 | import org.apache.http.entity.ContentType;
9 | import org.apache.http.message.BasicHeader;
10 | import org.codehaus.jackson.map.ObjectMapper;
11 | import org.codehaus.jettison.json.JSONArray;
12 | import org.codehaus.jettison.json.JSONException;
13 | import org.codehaus.jettison.json.JSONObject;
14 | import org.slf4j.Logger;
15 | import org.slf4j.LoggerFactory;
16 |
17 | import doser.entitydisambiguation.properties.Properties;
18 | import doser.tools.ServiceQueries;
19 |
20 | public class Word2VecJsonFormat {
21 |
22 | private final static Logger logger = LoggerFactory.getLogger(Word2VecJsonFormat.class);
23 |
24 | private String domain;
25 | private Set data;
26 |
27 | public Set getData() {
28 | return data;
29 | }
30 |
31 | public void setData(Set data) {
32 | this.data = data;
33 | }
34 |
35 | public String getDomain() {
36 | return domain;
37 | }
38 |
39 | public void setDomain(String domain) {
40 | this.domain = domain;
41 | }
42 |
43 | public static JSONArray performquery(Object json, String serviceEndpoint) {
44 | final ObjectMapper mapper = new ObjectMapper();
45 | String jsonString = null;
46 | JSONArray result = null;
47 | try {
48 | jsonString = mapper.writeValueAsString(json);
49 | Header[] headers = { new BasicHeader("Accept", "application/json"),
50 | new BasicHeader("content-type", "application/json") };
51 | ByteArrayEntity ent = new ByteArrayEntity(jsonString.getBytes(),
52 | ContentType.create("application/json"));
53 | String resStr = ServiceQueries.httpPostRequest(
54 | (Properties.getInstance().getWord2VecService() + serviceEndpoint), ent, headers);
55 | JSONObject resultJSON = null;
56 | try {
57 | resultJSON = new JSONObject(resStr);
58 | result = resultJSON.getJSONArray("data");
59 | } catch (JSONException e) {
60 | logger.error("JsonException in "+Word2VecJsonFormat.class.getName(), e);
61 | }
62 | } catch (IOException e) {
63 | logger.error("JsonException in "+Word2VecJsonFormat.class.getName(), e);
64 | }
65 | return result;
66 | }
67 | }
68 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/backend/DisambiguationTaskCollective.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.backend;
2 |
3 | import java.util.List;
4 |
5 | import doser.entitydisambiguation.dpo.EntityDisambiguationDPO;
6 | import doser.entitydisambiguation.knowledgebases.KnowledgeBaseIdentifiers;
7 |
8 | public class DisambiguationTaskCollective extends AbstractDisambiguationTask {
9 |
10 | private List entitiesToDis;
11 |
12 | /* A maintopic e.g. the column identifier in a table */
13 | private String mainTopic;
14 |
15 | public DisambiguationTaskCollective(final List entityToDis, String mainTopic) {
16 | super();
17 | this.entitiesToDis = entityToDis;
18 | this.mainTopic = mainTopic;
19 | }
20 |
21 | public List getEntityToDisambiguate() {
22 | return this.entitiesToDis;
23 | }
24 |
25 | public String getMainTopic() {
26 | return this.mainTopic;
27 | }
28 |
29 | public void setSurfaceForm(final List surfaceForm) {
30 | this.entitiesToDis = surfaceForm;
31 | }
32 |
33 | /**
34 | * Assignment function to determine the used knowledge base
35 | *
36 | * @param kbversion
37 | * @param setting
38 | */
39 | @Override
40 | public void setKbIdentifier(String kbversion, String setting) {
41 | if(setting == null) {
42 | this.kbIdentifier = KnowledgeBaseIdentifiers.Standard;
43 | } else if(setting.equalsIgnoreCase("DocumentCentric")) {
44 | if(kbversion.equalsIgnoreCase("default")) {
45 | this.kbIdentifier = KnowledgeBaseIdentifiers.DocumentCentricDefault;
46 | } else {
47 | this.kbIdentifier = KnowledgeBaseIdentifiers.DocumentCentricDefault;
48 | }
49 | } else if(setting.equalsIgnoreCase("EntityCentric")) {
50 | if(kbversion.equalsIgnoreCase("default")) {
51 | this.kbIdentifier = KnowledgeBaseIdentifiers.Standard;
52 | } else if(kbversion.equalsIgnoreCase("cstable")) {
53 | this.kbIdentifier = KnowledgeBaseIdentifiers.CSTable;
54 | } else if(kbversion.equalsIgnoreCase("biomed")) {
55 | this.kbIdentifier = KnowledgeBaseIdentifiers.Biomed;
56 | } else {
57 | this.kbIdentifier = KnowledgeBaseIdentifiers.Standard;
58 | }
59 | } else {
60 | this.kbIdentifier = KnowledgeBaseIdentifiers.Standard;
61 | }
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/lucene/analysis/DoserStandardTokenizer.java:
--------------------------------------------------------------------------------
1 | package doser.lucene.analysis;
2 |
3 | /*
4 | * Licensed to the Apache Software Foundation (ASF) under one or more
5 | * contributor license agreements. See the NOTICE file distributed with
6 | * this work for additional information regarding copyright ownership.
7 | * The ASF licenses this file to You under the Apache License, Version 2.0
8 | * (the "License"); you may not use this file except in compliance with
9 | * the License. You may obtain a copy of the License at
10 | *
11 | * http://www.apache.org/licenses/LICENSE-2.0
12 | *
13 | * Unless required by applicable law or agreed to in writing, software
14 | * distributed under the License is distributed on an "AS IS" BASIS,
15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | * See the License for the specific language governing permissions and
17 | * limitations under the License.
18 | */
19 |
20 | import java.io.Reader;
21 |
22 | import org.apache.lucene.analysis.Tokenizer;
23 | import org.apache.lucene.analysis.util.CharTokenizer;
24 | import org.apache.lucene.util.AttributeFactory;
25 |
26 | public final class DoserStandardTokenizer extends CharTokenizer {
27 |
28 | /**
29 | * Construct a new WhitespaceTokenizer using a given
30 | * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}.
31 | *
32 | * @param factory
33 | * the attribute factory to use for this {@link Tokenizer}
34 | * @param in
35 | * the input to split up into tokens
36 | */
37 | public DoserStandardTokenizer(AttributeFactory factory, Reader in) {
38 | super(factory, in);
39 | }
40 |
41 | /**
42 | * Construct a new WhitespaceTokenizer. * @param matchVersion Lucene version
43 | * to match See {@link above}
44 | *
45 | * @param in
46 | * the input to split up into tokens
47 | */
48 | public DoserStandardTokenizer(Reader in) {
49 | super(in);
50 | }
51 |
52 | /**
53 | * Collects only characters which do not satisfy
54 | * {@link Character#isWhitespace(int)}.
55 | */
56 | @Override
57 | protected boolean isTokenChar(int c) {
58 | boolean check = true;
59 | if (Character.isWhitespace(c) || c == 46) {
60 | check = false;
61 | }
62 | return check;
63 | }
64 | }
--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/lucene/features/LuceneFeatures.java:
--------------------------------------------------------------------------------
1 | package doser.lucene.features;
2 |
3 | import java.util.Locale;
4 |
5 | import org.apache.lucene.index.Term;
6 | import org.apache.lucene.search.Query;
7 | import org.apache.lucene.search.BooleanClause.Occur;
8 | import org.apache.lucene.search.similarities.Similarity;
9 |
10 | import doser.lucene.query.LTRBooleanQuery;
11 | import doser.lucene.query.LearnToRankFuzzyQuery;
12 | import doser.lucene.query.LearnToRankTermQuery;
13 | import doser.lucene.query.PriorQuery;
14 | import doser.lucene.query.SensePriorQuery;
15 |
16 | public class LuceneFeatures {
17 |
18 | public static Query queryLabelTerm(String keyword, String field,
19 | Similarity sim) {
20 | final LearnToRankTermQuery q = new LearnToRankTermQuery(new Term(field,
21 | keyword.toLowerCase(Locale.US)), sim);
22 | return q;
23 | }
24 |
25 | public static Query queryLabelFuzzy(String keyword, String field,
26 | Similarity sim) {
27 | final LearnToRankFuzzyQuery q = new LearnToRankFuzzyQuery(new Term(
28 | field, keyword.toLowerCase(Locale.US)), sim);
29 | return q;
30 | }
31 |
32 |
33 | public static Query queryStringTerm(String str, String field,
34 | Similarity sim, Occur occ, int maxclause) {
35 |
36 | final String[] split = str.split(" ");
37 | final LTRBooleanQuery bquery = new LTRBooleanQuery();
38 | for (final String element : split) {
39 | final LearnToRankTermQuery tquery = new LearnToRankTermQuery(
40 | new Term(field, element.toLowerCase(Locale.US)), sim);
41 | bquery.add(tquery, occ);
42 | }
43 | return bquery;
44 | }
45 |
46 | public static Query queryStringFuzzy(String str, String field,
47 | Similarity sim, Occur occ, int maxclause) {
48 |
49 | final String[] split = str.split(" ");
50 | final LTRBooleanQuery bquery = new LTRBooleanQuery();
51 | for (final String element : split) {
52 | final LearnToRankFuzzyQuery tquery = new LearnToRankFuzzyQuery(
53 | new Term(field, element.toLowerCase(Locale.US)), sim);
54 | bquery.add(tquery, occ);
55 |
56 | }
57 | return bquery;
58 | }
59 |
60 |
61 | public static Query queryPrior(IEntityCentricExtFeatures kb) {
62 | return new PriorQuery(kb);
63 | }
64 |
65 | public static Query querySensePrior(String str, IEntityCentricExtFeatures kb) {
66 | return new SensePriorQuery(str, kb);
67 | }
68 | }
69 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/dbpedia/TableColumnFilter.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.algorithms.collective.dbpedia;
2 |
3 | import java.io.IOException;
4 | import java.util.List;
5 |
6 | import org.apache.lucene.index.IndexReader;
7 | import org.apache.lucene.index.Term;
8 | import org.apache.lucene.search.BooleanQuery;
9 | import org.apache.lucene.search.IndexSearcher;
10 | import org.apache.lucene.search.ScoreDoc;
11 | import org.apache.lucene.search.TopDocs;
12 | import org.apache.lucene.search.BooleanClause.Occur;
13 |
14 | import doser.entitydisambiguation.algorithms.SurfaceForm;
15 | import doser.entitydisambiguation.knowledgebases.EntityCentricKBDBpedia;
16 | import doser.lucene.query.TermQuery;
17 |
18 | public class TableColumnFilter {
19 |
20 | private EntityCentricKBDBpedia eckb;
21 | private String topic;
22 |
23 | TableColumnFilter(EntityCentricKBDBpedia eckb, String topic) {
24 | super();
25 | this.eckb = eckb;
26 | this.topic = topic;
27 | }
28 |
29 | public void filter(List reps) {
30 | for (SurfaceForm sf : reps) {
31 | List candidates = sf.getCandidates();
32 | if (candidates.size() > 0) {
33 | String s = performLuceneQuery(candidates, topic);
34 | if (s != null) {
35 | sf.setDisambiguatedEntity(s);
36 | }
37 | }
38 | }
39 | }
40 |
41 | private String performLuceneQuery(List candidates, String topic) {
42 | String result = null;
43 | IndexSearcher searcher = eckb.getSearcher();
44 | IndexReader reader = searcher.getIndexReader();
45 | BooleanQuery candidateq = new BooleanQuery();
46 | for (String can : candidates) {
47 | candidateq.add(new TermQuery(new Term("Mainlink", can)), Occur.SHOULD);
48 | }
49 | BooleanQuery q = new BooleanQuery();
50 | q.add(candidateq, Occur.MUST);
51 | q.add(new TermQuery(new Term("LongDescription", topic)), Occur.MUST);
52 | TopDocs t = null;
53 | try {
54 | t = searcher.search(q, candidates.size());
55 | } catch (IOException e) {
56 | e.printStackTrace();
57 | }
58 | if (t != null) {
59 | ScoreDoc[] scoredocs = t.scoreDocs;
60 | if (scoredocs.length == 1) {
61 | try {
62 | result = reader.document(scoredocs[0].doc).get("Mainlink");
63 | } catch (IOException e) {
64 | e.printStackTrace();
65 | }
66 | }
67 | }
68 | return result;
69 | }
70 | }
71 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/dpo/EntityDisambiguationDPO.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.dpo;
2 |
3 |
/**
 * Represents a surface form which should be disambiguated. Positions are used
 * as an internal id, which is necessary during feedback processing later.
 *
 * Version 2.0: One position of a surface form might not be enough. Version 2
 * offers the possibility to send an array of positions.
 *
 * Version 3.0: A new field InterDisambiguationSetting flags the kind of
 * disambiguation. This can be one of the following types:
 * <ul>
 * <li>Standard Entity Disambiguation with context</li>
 * <li>Standard Entity Disambiguation without context</li>
 * <li>Entity Disambiguation without context on a specialized domain (i.e.
 * tables)</li>
 * </ul>
 *
 * Version 4.0: KnowledgeBaseIdentifier allows selecting a specific knowledge
 * base for each disambiguation algorithm. This option should only be used if
 * the user is aware of what he is doing. Additionally the user is able to get
 * the lucene documents of disambiguated entities.
 *
 * @author Stefan Zwicklbauer
 *
 */
public class EntityDisambiguationDPO {

    private String documentId;
    private String context;
    private String selectedText;
    private String setting;
    private String kbversion;
    private int startPosition;

    public EntityDisambiguationDPO() {
        super();
    }

    /** @return the stored document id, or {@code null} if unset */
    public String getDocumentId() {
        return documentId;
    }

    /** @param documentId the document id to store */
    public void setDocumentId(final String documentId) {
        this.documentId = documentId;
    }

    /** @return the stored context, or {@code null} if unset */
    public String getContext() {
        return context;
    }

    /** @param context the context to store */
    public void setContext(final String context) {
        this.context = context;
    }

    /** @return the stored selected text, or {@code null} if unset */
    public String getSelectedText() {
        return selectedText;
    }

    /** @param selectedText the selected text to store */
    public void setSelectedText(final String selectedText) {
        this.selectedText = selectedText;
    }

    /** @return the stored setting, or {@code null} if unset */
    public String getSetting() {
        return this.setting;
    }

    /** @param setting the setting to store */
    public void setSetting(final String setting) {
        this.setting = setting;
    }

    /**
     * Writes the same field as {@link #setSetting(String)}.
     *
     * @param setting the setting to store
     */
    public void setInternSetting(final String setting) {
        this.setting = setting;
    }

    /** @return the stored knowledge base version, or {@code null} if unset */
    public String getKbversion() {
        return this.kbversion;
    }

    /** @param kbversion the knowledge base version to store */
    public void setKbversion(String kbversion) {
        this.kbversion = kbversion;
    }

    /** @return the stored start position; 0 if unset */
    public int getStartPosition() {
        return this.startPosition;
    }

    /** @param startPosition the start position to store */
    public void setStartPosition(int startPosition) {
        this.startPosition = startPosition;
    }
}
--------------------------------------------------------------------------------
/doser-dis-extensions/.settings/org.eclipse.jdt.ui.prefs:
--------------------------------------------------------------------------------
1 | cleanup.add_default_serial_version_id=true
2 | cleanup.add_generated_serial_version_id=false
3 | cleanup.add_missing_annotations=true
4 | cleanup.add_missing_deprecated_annotations=true
5 | cleanup.add_missing_methods=false
6 | cleanup.add_missing_nls_tags=false
7 | cleanup.add_missing_override_annotations=true
8 | cleanup.add_missing_override_annotations_interface_methods=true
9 | cleanup.add_serial_version_id=false
10 | cleanup.always_use_blocks=true
11 | cleanup.always_use_parentheses_in_expressions=false
12 | cleanup.always_use_this_for_non_static_field_access=false
13 | cleanup.always_use_this_for_non_static_method_access=false
14 | cleanup.convert_to_enhanced_for_loop=true
15 | cleanup.correct_indentation=true
16 | cleanup.format_source_code=true
17 | cleanup.format_source_code_changes_only=false
18 | cleanup.make_local_variable_final=true
19 | cleanup.make_parameters_final=false
20 | cleanup.make_private_fields_final=true
21 | cleanup.make_type_abstract_if_missing_method=false
22 | cleanup.make_variable_declarations_final=false
23 | cleanup.never_use_blocks=false
24 | cleanup.never_use_parentheses_in_expressions=true
25 | cleanup.organize_imports=true
26 | cleanup.qualify_static_field_accesses_with_declaring_class=false
27 | cleanup.qualify_static_member_accesses_through_instances_with_declaring_class=true
28 | cleanup.qualify_static_member_accesses_through_subtypes_with_declaring_class=true
29 | cleanup.qualify_static_member_accesses_with_declaring_class=true
30 | cleanup.qualify_static_method_accesses_with_declaring_class=false
31 | cleanup.remove_private_constructors=true
32 | cleanup.remove_trailing_whitespaces=true
33 | cleanup.remove_trailing_whitespaces_all=true
34 | cleanup.remove_trailing_whitespaces_ignore_empty=false
35 | cleanup.remove_unnecessary_casts=true
36 | cleanup.remove_unnecessary_nls_tags=true
37 | cleanup.remove_unused_imports=true
38 | cleanup.remove_unused_local_variables=false
39 | cleanup.remove_unused_private_fields=true
40 | cleanup.remove_unused_private_members=false
41 | cleanup.remove_unused_private_methods=true
42 | cleanup.remove_unused_private_types=true
43 | cleanup.sort_members=true
44 | cleanup.sort_members_all=true
45 | cleanup.use_blocks=true
46 | cleanup.use_blocks_only_for_return_and_throw=false
47 | cleanup.use_parentheses_in_expressions=false
48 | cleanup.use_this_for_non_static_field_access=true
49 | cleanup.use_this_for_non_static_field_access_only_if_necessary=true
50 | cleanup.use_this_for_non_static_method_access=true
51 | cleanup.use_this_for_non_static_method_access_only_if_necessary=true
52 | cleanup_profile=_Doser Code Profile
53 | cleanup_settings_version=2
54 | eclipse.preferences.version=1
55 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/knowledgebases/EntityCentricKBDBpedia.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.knowledgebases;
2 |
3 | import java.util.List;
4 |
5 | import org.apache.lucene.search.similarities.Similarity;
6 |
7 | public class EntityCentricKBDBpedia extends AbstractEntityCentricKBGeneral {
8 |
9 | public EntityCentricKBDBpedia(String uri, boolean dynamic) {
10 | super(uri, dynamic);
11 | }
12 |
13 | public EntityCentricKBDBpedia(String uri, boolean dynamic, Similarity sim) {
14 | super(uri, dynamic, sim);
15 | }
16 |
17 | /**
18 | * Takes a set of dbpedia entities as well as a target entity and generates
19 | * one string that fits into the word2vec query format used in this class.
20 | * The source entities are concatenated and should be compared with the
21 | * target entity.
22 | *
23 | * @param source
24 | * a set of source entities
25 | * @param target
26 | * the target entity.
27 | * @return String in appropriate word2vec query format
28 | */
29 | @Override
30 | public String generateWord2VecFormatString(String source, String target) {
31 | String s = source.replaceAll("http://dbpedia.org/resource/", "");
32 | String t = target.replaceAll("http://dbpedia.org/resource/", "");
33 | int c = s.compareToIgnoreCase(target);
34 | String res = "";
35 | if (c < 0) {
36 | res = s + "|" + t;
37 | } else if (c == 0) {
38 | res = s + "|" + t;
39 | } else {
40 | res = t + "|" + s;
41 | }
42 | return res;
43 | }
44 |
45 | /**
46 | * Takes a set of dbpedia entities as well as a target entity and generates
47 | * one string that fits into the word2vec query format used in this class.
48 | * The source entities are concatenated and should be compared with the
49 | * target entity.
50 | *
51 | * @param source
52 | * a set of source entities
53 | * @param target
54 | * the target entity.
55 | * @return String in appropriate word2vec query format
56 | */
57 | @Override
58 | public String generateWord2VecFormatString(List source, String target) {
59 | StringBuilder builder = new StringBuilder();
60 | for (String s : source) {
61 | s = s.replaceAll("http://dbpedia.org/resource/", "");
62 | builder.append(s);
63 | builder.append("|");
64 | }
65 | String src = builder.toString();
66 | src = src.substring(0, src.length() - 1);
67 | String t = target.replaceAll("http://dbpedia.org/resource/", "");
68 | return src + "|" + t;
69 | }
70 |
71 | @Override
72 | protected String generateDomainName() {
73 | return "DBpedia";
74 | }
75 |
76 | @Override
77 | protected String kbName() {
78 | return "DBpedia KB";
79 | }
80 | }
81 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/rules/NoCandidatesCheckPlural.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.algorithms.rules;
2 |
3 | import java.io.IOException;
4 | import java.util.ArrayList;
5 | import java.util.LinkedList;
6 | import java.util.List;
7 |
8 | import org.apache.lucene.document.Document;
9 | import org.apache.lucene.index.IndexReader;
10 | import org.apache.lucene.search.IndexSearcher;
11 | import org.apache.lucene.search.ScoreDoc;
12 | import org.apache.lucene.search.TopDocs;
13 | import org.apache.lucene.search.similarities.DefaultSimilarity;
14 |
15 | import doser.entitydisambiguation.algorithms.SurfaceForm;
16 | import doser.entitydisambiguation.knowledgebases.AbstractKnowledgeBase;
17 | import doser.lucene.features.LuceneFeatures;
18 | import doser.lucene.query.LearnToRankClause;
19 | import doser.lucene.query.LearnToRankQuery;
20 | import doser.tools.Inflector;
21 |
22 | /**
23 | * Überprüft ob eine surface form im plural angegeben ist und falls ja überprüfe
24 | * den singular
25 | *
26 | * @author stefan
27 | *
28 | */
29 | class NoCandidatesCheckPlural extends AbstractRule {
30 |
31 | NoCandidatesCheckPlural(AbstractKnowledgeBase eckb) {
32 | super(eckb);
33 | }
34 |
35 | @Override
36 | public boolean applyRule(List rep) {
37 | for (SurfaceForm r : rep) {
38 | if (r.getCandidates().size() == 0) {
39 | String sf = r.getSurfaceForm();
40 | String singular = Inflector.getInstance().singularize(sf);
41 | if (!sf.equalsIgnoreCase(singular)) {
42 | // Try singular search
43 | ArrayList lst = queryLucene(singular);
44 | if (lst.size() != 0) {
45 | r.setCandidates(lst);
46 | }
47 | }
48 | }
49 | }
50 | return false;
51 | }
52 |
53 | private ArrayList queryLucene(String surfaceForm) {
54 | ArrayList list = new ArrayList();
55 | final IndexSearcher searcher = eckb.getSearcher();
56 | final IndexReader reader = searcher.getIndexReader();
57 | LearnToRankQuery query = new LearnToRankQuery();
58 | List features = new LinkedList();
59 | DefaultSimilarity defaultSim = new DefaultSimilarity();
60 | features.add(query.add(LuceneFeatures.queryLabelTerm(surfaceForm,
61 | "UniqueLabel", defaultSim), "Feature1", true));
62 | try {
63 | final TopDocs top = searcher.search(query, 150);
64 | final ScoreDoc[] score = top.scoreDocs;
65 | if (score.length <= 5) {
66 | for (int i = 0; i < score.length; ++i) {
67 | final Document doc = reader.document(score[i].doc);
68 | list.add(doc.get("Mainlink"));
69 | }
70 | }
71 | } catch (IOException e) {
72 | e.printStackTrace();
73 | }
74 | return list;
75 | }
76 |
77 | }
78 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/general/CollectiveContextDriverGeneral.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.algorithms.collective.general;
2 |
3 | import java.util.LinkedList;
4 | import java.util.List;
5 |
6 | import doser.entitydisambiguation.algorithms.SurfaceForm;
7 | import doser.entitydisambiguation.algorithms.collective.CandidatePruning;
8 | import doser.entitydisambiguation.algorithms.rules.RuleAdapation;
9 | import doser.entitydisambiguation.dpo.DisambiguatedEntity;
10 | import doser.entitydisambiguation.dpo.Response;
11 | import doser.entitydisambiguation.knowledgebases.AbstractEntityCentricKBGeneral;
12 |
13 | class CollectiveContextDriverGeneral {
14 |
15 | static final int PREPROCESSINGCONTEXTSIZE = 200;
16 |
17 | private Response[] currentResponse;
18 | private List rep;
19 | private AbstractEntityCentricKBGeneral eckb;
20 |
21 | CollectiveContextDriverGeneral(Response[] res, List rep, AbstractEntityCentricKBGeneral eckb) {
22 | super();
23 | this.currentResponse = res;
24 | this.rep = rep;
25 | this.eckb = eckb;
26 | }
27 |
28 | void solve() {
29 | // First candidate pruning
30 | CandidatePruning pruning = new CandidatePruning(eckb);
31 | pruning.prune(rep);
32 |
33 | RuleAdapation rules = new RuleAdapation();
34 | rules.addNoCandidatesCheckPluralRule(eckb);
35 | rules.addNoCandidatesExpansionRule(eckb);
36 | rules.performRuleChainBeforeCandidateSelection(rep);
37 |
38 | CandidateReductionGeneralW2V w2vreduction = new CandidateReductionGeneralW2V(eckb, rep, 20, 5, 125, false, false);
39 | w2vreduction.solve();
40 | rep = w2vreduction.getRep();
41 |
42 | w2vreduction = new CandidateReductionGeneralW2V(eckb, rep, 45, 5, 250, true, true);
43 | w2vreduction.solve();
44 | rep = w2vreduction.getRep();
45 | FinalEntityDisambiguation finalDis = new FinalEntityDisambiguation(eckb, rep);
46 | finalDis.setup();
47 | finalDis.solve();
48 | }
49 |
50 | void generateResult() {
51 | for (int i = 0; i < currentResponse.length; i++) {
52 | SurfaceForm r = search(i);
53 | if (currentResponse[i] == null && r != null && r.getCandidates().size() == 1) {
54 | Response res = new Response();
55 | List entList = new LinkedList();
56 | DisambiguatedEntity ent = new DisambiguatedEntity();
57 | ent.setEntityUri(r.getCandidates().get(0));
58 | entList.add(ent);
59 | res.setDisEntities(entList);
60 | res.setSelectedText(r.getSurfaceForm());
61 | currentResponse[i] = res;
62 | }
63 | }
64 | }
65 |
66 | private SurfaceForm search(int qryNr) {
67 | for (SurfaceForm r : rep) {
68 | if (r.getQueryNr() == qryNr) {
69 | return r;
70 | }
71 | }
72 | return null;
73 | }
74 |
75 | }
76 |
--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/lucene/analysis/DoserIDAnalyzer.java:
--------------------------------------------------------------------------------
1 | package doser.lucene.analysis;
2 |
3 | import java.io.IOException;
4 | import java.io.Reader;
5 |
6 | import org.apache.lucene.analysis.TokenStream;
7 | import org.apache.lucene.analysis.core.StopAnalyzer;
8 | import org.apache.lucene.analysis.util.CharArraySet;
9 | import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
10 | import org.apache.lucene.analysis.util.WordlistLoader;
11 | import org.apache.lucene.util.Version;
12 |
13 | /**
14 | * This analyzer is a special analyzer for id queries in our knowledge bases
15 | *
16 | * @author Stefan Zwicklbauer
17 | *
18 | */
19 | public final class DoserIDAnalyzer extends StopwordAnalyzerBase {
20 |
21 | /** Default maximum allowed token length */
22 | public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
23 |
24 | /**
25 | * An unmodifiable set containing some common English words that are usually
26 | * not useful for searching.
27 | */
28 | public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
29 |
30 | private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
31 |
32 | /**
33 | * Builds an analyzer with the default stop words ({@link #STOP_WORDS_SET}).
34 | *
35 | *
36 | */
37 | public DoserIDAnalyzer() {
38 | this(STOP_WORDS_SET);
39 | }
40 |
41 | /**
42 | * Builds an analyzer with the given stop words.
43 | *
44 | * @param stopWords
45 | * stop words
46 | */
47 | public DoserIDAnalyzer(CharArraySet stopWords) {
48 | super(stopWords);
49 | }
50 |
51 | /**
52 | * Builds an analyzer with the stop words from the given reader.
53 | *
54 | * @see WordlistLoader#getWordSet(Reader, Version)
55 | * @param stopwords
56 | * Reader to read stop words from
57 | */
58 | public DoserIDAnalyzer(Reader stopwords)
59 | throws IOException {
60 | this(loadStopwordSet(stopwords));
61 | }
62 |
63 | @Override
64 | protected TokenStreamComponents createComponents(final String fieldName,
65 | final Reader reader) {
66 | final DoserIDTokenizer src = new DoserIDTokenizer(reader);
67 | TokenStream tok = new DoserIDFilter(src);
68 | return new TokenStreamComponents(src, tok) {
69 | @Override
70 | protected void setReader(final Reader reader) throws IOException {
71 | super.setReader(reader);
72 | }
73 | };
74 | }
75 |
76 | /**
77 | * @see #setMaxTokenLength
78 | */
79 | public int getMaxTokenLength() {
80 | return maxTokenLength;
81 | }
82 |
83 | /**
84 | * Set maximum allowed token length. If a token is seen that exceeds this
85 | * length then it is discarded. This setting only takes effect the next time
86 | * tokenStream or tokenStream is called.
87 | */
88 | public void setMaxTokenLength(int length) {
89 | maxTokenLength = length;
90 | }
91 | }
--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/lucene/query/LearnToRankClause.java:
--------------------------------------------------------------------------------
1 | package doser.lucene.query;
2 |
3 | import java.util.HashMap;
4 | import java.util.Map;
5 |
6 | import org.apache.lucene.search.Query;
7 | import org.apache.lucene.search.Weight;
8 |
9 | /**
10 | * LearnToRank clause representing an arbitrary feature query. Additional
11 | * criterias may be defined later but are not necessary so far.
12 | *
13 | * HashMap featuresValues contains all calculated featuresValues. The HashMap
14 | * key stores the document number. The Pair integer stores the featureNumber.
15 | *
16 | * The HashMap has to be resetted after each query.
17 | *
18 | */
19 | public class LearnToRankClause {
20 |
21 | class Pair {
22 |
23 | private final int featureNr;
24 |
25 | private final float featureValue;
26 |
27 | Pair(final int docNr, final float featureValue) {
28 | featureNr = docNr;
29 | this.featureValue = featureValue;
30 | }
31 |
32 | public int getDocNr() {
33 | return featureNr;
34 | }
35 |
36 | public float getFeatureValue() {
37 | return featureValue;
38 | }
39 |
40 | }
41 |
42 | private Weight cweight;
43 |
44 | private final Map featureValues;
45 |
46 | private final boolean mustOccur;
47 |
48 | private final String name;
49 |
50 | private Query query;
51 |
52 | private float weight;
53 |
54 | public LearnToRankClause(final Query query, final String name,
55 | final boolean mustOccur) {
56 | this.query = query;
57 | this.name = name;
58 | weight = 1.0f;
59 | this.mustOccur = mustOccur;
60 | featureValues = new HashMap();
61 | }
62 |
63 | public void addFeatureValue(final int docBase, final int docNr,
64 | final float value) {
65 | featureValues.put((docBase + docNr), value);
66 | }
67 |
68 | public void clear() {
69 | featureValues.clear();
70 | }
71 |
72 | public double getFeatureValue(final int docId) {
73 | double val = 0f;
74 | try {
75 | val = featureValues.get(docId);
76 | } catch (final NullPointerException e) {
77 | val = 0f;
78 | }
79 | return val;
80 | }
81 |
82 | public String getName() {
83 | return name;
84 | }
85 |
86 | public Query getQuery() {
87 | return query;
88 | }
89 |
90 | public Weight getW() {
91 | return cweight;
92 | }
93 |
94 | public float getWeight() {
95 | return weight;
96 | }
97 |
98 | public boolean isMustOccur() {
99 | return mustOccur;
100 | }
101 |
102 | public void setQuery(final Query query) {
103 | this.query = query;
104 | }
105 |
106 | public void setW(final Weight cweight) {
107 | this.cweight = cweight;
108 | }
109 |
110 | public void setWeight(final float weight) {
111 | this.weight = weight;
112 | }
113 | }
114 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/properties/Properties.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.properties;
2 |
3 | import org.apache.commons.configuration.ConfigurationException;
4 | import org.apache.commons.configuration.PropertiesConfiguration;
5 | import org.apache.log4j.Logger;
6 |
7 | public final class Properties {
8 | private static Properties instance;
9 | private static final String RESOURCE_NAME = "disambiguation.properties";
10 | // private static final String RESOURCE_NAME = "./disambiguation.properties";
11 |
12 | public synchronized static Properties getInstance() {
13 | if (instance == null) {
14 | instance = new Properties();
15 | }
16 |
17 | return instance;
18 | }
19 |
20 | /**
21 | * Provides easy access to property files (e.g. config.getInt())
22 | */
23 | PropertiesConfiguration config;
24 |
25 | private Properties() {
26 | try {
27 | this.config = new PropertiesConfiguration(RESOURCE_NAME);
28 | } catch (final ConfigurationException e) {
29 | Logger.getRootLogger().error("Failed to load properties file: " + RESOURCE_NAME, e);
30 | }
31 | }
32 |
33 | /**
34 | * ArtifactId of the application (from maven pom.xml)
35 | *
36 | * @return artifact id
37 | */
38 | public String getApplicationArtifactId() {
39 | return this.config.getString("application.artifactId");
40 | }
41 |
42 | /**
43 | * Name of the application (from maven pom.xml)
44 | *
45 | * @return application name
46 | */
47 | public String getApplicationName() {
48 | return this.config.getString("application.name");
49 | }
50 |
51 | /**
52 | * Version of the application (from maven pom.xml)
53 | *
54 | * @return application version
55 | */
56 | public String getApplicationVersion() {
57 | return this.config.getString("application.version");
58 | }
59 |
60 | public int getDisambiguationResultSize() {
61 | final String size = this.config.getString("disambiguation.returnSize");
62 | return Integer.valueOf(size);
63 | }
64 |
65 | /**
66 | * Get location of entity-centric knowledge base
67 | */
68 | public String getEntityCentricKBWikipedia() {
69 | return this.config.getString("disambiguation.entityCentricKBWikipedia");
70 | }
71 |
72 | public String getEntityCentricKBBiomed() {
73 | return this.config.getString("disambiguation.entityCentricBiomedCalbC");
74 | }
75 |
76 | public String getWord2VecService() {
77 | return this.config.getString("disambiguation.Word2VecService");
78 | }
79 |
80 | public String getWord2VecModel() {
81 | return this.config.getString("word2vecmodel");
82 | }
83 |
84 | public boolean getCandidateExpansion() {
85 | boolean bool = false;
86 | String s = this.config.getString("candidateExpansion");
87 | if(s.equalsIgnoreCase("true")) {
88 | bool = true;
89 | }
90 | return bool;
91 | }
92 | }
93 |
--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/pom.xml:
--------------------------------------------------------------------------------
1 |
3 |
4 | doser-dis
5 | doser-dis-parent
6 | 1.0
7 |
8 | 4.0.0
9 | doser.sub
10 | doser-dis-disambiguationserver
11 | 0.6
12 | doser-dis-disambiguationserver
13 |
14 |
15 | doser-dis-disambiguationserver
16 |
17 |
18 | maven-war-plugin
19 | 2.1.1
20 |
21 |
22 | org.apache.maven.plugins
23 | 2.9
24 | maven-eclipse-plugin
25 |
26 | true
27 | 2.0
28 |
29 |
30 |
31 | org.apache.tomcat.maven
32 | tomcat7-maven-plugin
33 | 2.0
34 |
35 |
36 | tomcat-run
37 |
38 | exec-war-only
39 |
40 | package
41 |
42 | /doser
43 | false
44 | DoSer-disambiguation-only.jar
45 | utf-8
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 | doser.sub
55 | doser-dis-core
56 | ${project.version}
57 |
58 |
59 | org.springframework
60 | spring-webmvc
61 | 4.0.6.RELEASE
62 |
63 |
64 | commons-fileupload
65 | commons-fileupload
66 | 1.3.1
67 |
68 |
69 | javax.servlet
70 | javax.servlet-api
71 | provided
72 | 3.0.1
73 |
74 |
75 |
76 |
77 |
78 | xml-apis
79 | xml-apis
80 | 1.4.01
81 |
82 |
83 |
84 | war
85 |
86 |
--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/lucene/analysis/DoserStandardAnalyzer.java:
--------------------------------------------------------------------------------
1 | package doser.lucene.analysis;
2 |
3 | import java.io.IOException;
4 | import java.io.Reader;
5 |
6 | import org.apache.lucene.analysis.TokenStream;
7 | import org.apache.lucene.analysis.core.LowerCaseFilter;
8 | import org.apache.lucene.analysis.core.StopAnalyzer;
9 | import org.apache.lucene.analysis.standard.StandardFilter;
10 | import org.apache.lucene.analysis.util.CharArraySet;
11 | import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
12 | import org.apache.lucene.analysis.util.WordlistLoader;
13 | import org.apache.lucene.util.Version;
14 |
15 | public final class DoserStandardAnalyzer extends StopwordAnalyzerBase {
16 |
17 | /** Default maximum allowed token length */
18 | public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
19 |
20 | /**
21 | * An unmodifiable set containing some common English words that are usually
22 | * not useful for searching.
23 | */
24 | public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
25 |
26 | private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
27 |
28 | /**
29 | * Builds an analyzer with the default stop words ({@link #STOP_WORDS_SET}).
30 | *
31 | */
32 | public DoserStandardAnalyzer() {
33 | this(STOP_WORDS_SET);
34 | }
35 |
36 | /**
37 | * Builds an analyzer with the given stop words.
38 | *
39 | * @param stopWords
40 | * stop words
41 | */
42 | public DoserStandardAnalyzer(CharArraySet stopWords) {
43 | super(stopWords);
44 | }
45 |
46 | /**
47 | * Builds an analyzer with the stop words from the given reader.
48 | *
49 | * @see WordlistLoader#getWordSet(Reader, Version)
50 | * @param stopwords
51 | * Reader to read stop words from
52 | */
53 | public DoserStandardAnalyzer(Reader stopwords)
54 | throws IOException {
55 | this(loadStopwordSet(stopwords));
56 | }
57 |
58 | @Override
59 | protected TokenStreamComponents createComponents(final String fieldName,
60 | final Reader reader) {
61 | final DoserStandardTokenizer src = new DoserStandardTokenizer(reader);
62 | TokenStream tok = new StandardFilter(src);
63 | tok = new LowerCaseFilter(tok);
64 | return new TokenStreamComponents(src, tok) {
65 | @Override
66 | protected void setReader(final Reader reader) throws IOException {
67 | super.setReader(reader);
68 | }
69 | };
70 | }
71 |
72 | /**
73 | * @see #setMaxTokenLength
74 | */
75 | public int getMaxTokenLength() {
76 | return maxTokenLength;
77 | }
78 |
79 | /**
80 | * Set maximum allowed token length. If a token is seen that exceeds this
81 | * length then it is discarded. This setting only takes effect the next time
82 | * tokenStream or tokenStream is called.
83 | */
84 | public void setMaxTokenLength(int length) {
85 | maxTokenLength = length;
86 | }
87 | }
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/knowledgebases/AbstractKnowledgeBase.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.knowledgebases;
2 |
3 | import java.io.File;
4 | import java.io.IOException;
5 | import java.util.TimerTask;
6 |
7 | import org.apache.lucene.search.IndexSearcher;
8 | import org.apache.lucene.search.SearcherFactory;
9 | import org.apache.lucene.search.SearcherManager;
10 | import org.apache.lucene.search.similarities.DefaultSimilarity;
11 | import org.apache.lucene.search.similarities.Similarity;
12 | import org.apache.lucene.store.Directory;
13 | import org.apache.lucene.store.FSDirectory;
14 | import org.slf4j.Logger;
15 | import org.slf4j.LoggerFactory;
16 |
17 | /**
18 | * Each knowledge base provides its own class with its respective properties.
19 | * These are the knowledge base index uri. IndexSearcher, IndexReader objects
20 | * and the dynamic property.
21 | *
22 | * @author stefan zwicklbauer
23 | */
24 | public abstract class AbstractKnowledgeBase extends TimerTask {
25 |
26 | private final static Logger logger = LoggerFactory.getLogger(AbstractKnowledgeBase.class);
27 |
28 | private String indexUri;
29 |
30 | private boolean dynamic;
31 |
32 | private SearcherManager manager;
33 |
34 | private IndexSearcher searcher;
35 |
36 | AbstractKnowledgeBase(String uri, boolean dynamic) {
37 | this(uri, dynamic, new DefaultSimilarity());
38 | }
39 |
40 | AbstractKnowledgeBase(String uri, boolean dynamic, Similarity sim) {
41 | super();
42 | this.indexUri = uri;
43 | this.dynamic = dynamic;
44 |
45 | File indexDir = new File(indexUri);
46 | Directory dir;
47 | try {
48 | dir = FSDirectory.open(indexDir);
49 | this.manager = new SearcherManager(dir, new SearcherFactory());
50 | } catch (IOException e) {
51 | logger.error("IOException in "+AbstractKnowledgeBase.class.getName(), e);
52 | }
53 | }
54 |
55 | public String getIndexUri() {
56 | return indexUri;
57 | }
58 |
59 |
60 | public IndexSearcher getSearcher() {
61 | try {
62 | this.searcher = manager.acquire();
63 | } catch (IOException e) {
64 | logger.error("IOException in "+AbstractKnowledgeBase.class.getName(), e);
65 | }
66 | return this.searcher;
67 | }
68 |
69 | public void release() {
70 | try {
71 | manager.release(searcher);
72 | } catch (IOException e) {
73 | logger.error("IOException in "+AbstractKnowledgeBase.class.getName(), e);
74 | }
75 | }
76 |
77 | /**
78 | * Periodically reopens the Indexreader, if and only if this is an dynamic
79 | * knowledge base. The changed knowledge base will be live within a few moments.
80 | */
81 | @Override
82 | public void run() {
83 | if (dynamic) {
84 | try {
85 | manager.maybeRefresh();
86 | } catch (IOException e) {
87 | logger.error("IOException in "+AbstractKnowledgeBase.class.getName(), e);
88 | }
89 | }
90 | }
91 |
92 | public abstract void initialize();
93 | }
94 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DoSeR-Disambiguation
2 | This package exclusively contains the disambiguation system of DoSeR. Compilation results in a stand-alone jar file which starts an Apache Tomcat server. More information about the full DoSeR system can be found here: [Github Wiki](https://github.com/quhfus/DoSeR/wiki)
3 |
4 | If your system does not have enough memory (25 GB RAM), you can use the REST service of the current DoSeR version, which is applicable for GERBIL. **Coming soon**
5 |
6 |
7 | We note that this service is limited to 5 queries in parallel.
8 |
9 | ## Requirements
10 | To install and run the DoSeR disambiguation system, the following components must be installed:
11 |
12 | 1. Java Version 1.7 or higher
13 |
14 | 2. Python 2.5 or higher
15 |
16 | 3. Disambiguation Lucene Index: [Dropbox Link](https://www.dropbox.com/s/7ihkw5gzqc3afjo/DBpedia_DisambiguationIndex.tar.gz?dl=0)
17 |
18 | 4. Semantic Embeddings: [Dropbox Link](https://www.dropbox.com/s/4e2g72yud1muv5a/Semantic_Embeddings.tar.gz?dl=0)
19 |
20 | ## Installation
21 | 1. Check out the DoSeR-Disambiguation GitHub repository and install the system with **mvn compile**. If Maven is not installed, or if you are not interested in the source code, you can download the doser-dis-disambiguationserver.jar file and the disambiguation.properties file from here (coming very soon).
22 |
23 | 2. Put the resulting or downloaded **doser-dis-disambiguationserver.jar** file and the properties file into a newly created directory **foo**. Unzip the Disambiguation Index and put the index folder into the **foo** directory.
24 |
25 | 3. Unzip and extract the Semantic Embeddings zip file into any folder.
26 |
27 | 4. Install and start the Word2Vec Rest Server (Installation guide can be found [here](https://github.com/quhfus/DoSeR-Disambiguation/wiki/Word2Vec-RestServer))
28 |
29 | 5. Open and adapt the disambiguation.properties file
30 |
31 | 6. Start the doser-dis-disambiguationserver.jar
32 |
33 | ## Citation
34 | If you use DoSeR in your research, please cite the following paper:
35 |
36 | @inproceedings{DBLP:conf/esws/ZwicklbauerSG16,
37 | author = {Stefan Zwicklbauer and Christin Seifert and Michael Granitzer},
38 | title = {DoSeR - A Knowledge-Base-Agnostic Framework for Entity Disambiguation Using Semantic Embeddings},
39 | booktitle = {The Semantic Web. Latest Advances and New Domains - 13th International
40 | Conference, {ESWC} 2016, Heraklion, Crete, Greece, May 29 - June 2,
41 | 2016, Proceedings},
42 | pages = {182--198},
43 | year = {2016},
44 | crossref = {DBLP:conf/esws/2016},
45 | url = {http://dx.doi.org/10.1007/978-3-319-34129-3_12},
46 | doi = {10.1007/978-3-319-34129-3_12},
47 | timestamp = {Mon, 23 May 2016 13:46:28 +0200},
48 | biburl = {http://dblp.uni-trier.de/rec/bib/conf/esws/ZwicklbauerSG16},
49 | bibsource = {dblp computer science bibliography, http://dblp.org}
50 | }
51 |
--------------------------------------------------------------------------------
/doser-dis-core/pom.xml:
--------------------------------------------------------------------------------
1 |
3 |
4 | doser-dis
5 | doser-dis-parent
6 | 1.0
7 |
8 | 4.0.0
9 | doser.sub
10 | doser-dis-core
11 | 0.6
12 | doser-dis-core
13 |
14 |
15 | doser-dis-core
16 |
17 |
18 |
19 |
20 |
21 |
22 | com.google.guava
23 | guava
24 | 18.0
25 |
26 |
27 |
28 |
29 | doser.sub
30 | doser-dis-extensions
31 | ${project.version}
32 |
33 |
34 | org.rdfhdt
35 | hdt-java-core
36 | 1.1
37 |
38 |
39 | org.rdfhdt
40 | hdt-jena
41 | 1.1
42 |
43 |
44 | net.sf.jgrapht
45 | jgrapht
46 | 0.8.3
47 |
48 |
49 | com.googlecode.aima-java
50 | aima-core
51 | 0.10.5
52 |
53 |
54 | commons-configuration
55 | commons-configuration
56 | 1.10
57 |
58 |
59 | org.codehaus.jettison
60 | jettison
61 | 1.3.5
62 |
63 |
64 | org.codehaus.jackson
65 | jackson-mapper-asl
66 | 1.9.13
67 |
68 |
69 | net.sf.jung
70 | jung2
71 | 2.0.1
72 | pom
73 |
74 |
75 | net.sf.jung
76 | jung-graph-impl
77 | 2.0.1
78 |
79 |
80 | net.sf.jung
81 | jung-algorithms
82 | 2.0.1
83 |
84 |
85 | org.apache.commons
86 | commons-math
87 | 2.2
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/src/main/java/doser/server/actions/FrameworkInitialization.java:
--------------------------------------------------------------------------------
1 | package doser.server.actions;
2 |
3 | import java.util.Enumeration;
4 |
5 | import javax.servlet.ServletContext;
6 | import javax.servlet.ServletContextEvent;
7 | import javax.servlet.ServletContextListener;
8 |
9 | import org.apache.log4j.Logger;
10 | import org.springframework.beans.factory.DisposableBean;
11 | import org.springframework.web.context.ContextLoader;
12 | import org.springframework.web.context.WebApplicationContext;
13 |
14 | import doser.entitydisambiguation.backend.DisambiguationMainService;
15 |
16 | public class FrameworkInitialization extends ContextLoader implements
17 | ServletContextListener {
18 |
19 | private ContextLoader contextLoader;
20 |
21 | public FrameworkInitialization() {
22 | }
23 |
24 | public FrameworkInitialization(WebApplicationContext context) {
25 | super(context);
26 | }
27 |
28 | /**
29 | * Initialize the root web application context.
30 | */
31 | @Override
32 | public void contextInitialized(ServletContextEvent event) {
33 | DisambiguationMainService.initialize();
34 | this.contextLoader = createContextLoader();
35 | if (this.contextLoader == null) {
36 | this.contextLoader = this;
37 | }
38 | this.contextLoader.initWebApplicationContext(event.getServletContext());
39 | }
40 |
41 | /**
42 | * Create the ContextLoader to use. Can be overridden in subclasses.
43 | *
44 | * @return the new ContextLoader
45 | * @deprecated in favor of simply subclassing ContextLoaderListener itself
46 | * (which extends ContextLoader, as of Spring 3.0)
47 | */
48 | @Deprecated
49 | protected ContextLoader createContextLoader() {
50 | return null;
51 | }
52 |
53 | /**
54 | * Return the ContextLoader used by this listener.
55 | *
56 | * @return the current ContextLoader
57 | * @deprecated in favor of simply subclassing ContextLoaderListener itself
58 | * (which extends ContextLoader, as of Spring 3.0)
59 | */
60 | @Deprecated
61 | public ContextLoader getContextLoader() {
62 | return this.contextLoader;
63 | }
64 |
65 | /**
66 | * Close the root web application context.
67 | */
68 | @Override
69 | public void contextDestroyed(ServletContextEvent event) {
70 | DisambiguationMainService.getInstance().shutDownDisambiguationService();
71 | if (this.contextLoader != null) {
72 | this.contextLoader.closeWebApplicationContext(event
73 | .getServletContext());
74 | }
75 | ServletContext sc = event.getServletContext();
76 | Enumeration attrNames = sc.getAttributeNames();
77 | while (attrNames.hasMoreElements()) {
78 | String attrName = attrNames.nextElement();
79 | if (attrName.startsWith("org.springframework.")) {
80 | Object attrValue = sc.getAttribute(attrName);
81 | if (attrValue instanceof DisposableBean) {
82 | try {
83 | ((DisposableBean) attrValue).destroy();
84 | } catch (Throwable ex) {
85 | Logger.getRootLogger().fatal(ex.getMessage());
86 | }
87 | }
88 | }
89 | }
90 | }
91 |
92 | }
93 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/CandidateReduction.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.algorithms.collective;
2 |
3 | import java.util.LinkedList;
4 | import java.util.List;
5 | import java.util.concurrent.TimeUnit;
6 |
7 | import doser.entitydisambiguation.algorithms.SurfaceForm;
8 |
9 | public abstract class CandidateReduction {
10 |
11 | // public static final int MAXSURFACEFORMSPERQUERY = 20;
12 | // public static final int REDUCETO = 5;
13 | private List rep;
14 | private boolean alwaysAction;
15 | private int maxsurfaceformsperquery;
16 |
17 | public CandidateReduction(List rep,
18 | int maxsurfaceformsperquery, boolean alwaysAction) {
19 | super();
20 | this.rep = rep;
21 | this.maxsurfaceformsperquery = maxsurfaceformsperquery;
22 | this.alwaysAction = alwaysAction;
23 | }
24 |
25 | public void solve() {
26 | List finalList = new LinkedList();
27 | if (this.rep.size() > maxsurfaceformsperquery) {
28 | int counter = 0;
29 | while (true) {
30 | long time = System.currentTimeMillis();
31 | if ((counter + maxsurfaceformsperquery) < this.rep.size()) {
32 | List subList = this.rep.subList(counter, (counter + maxsurfaceformsperquery));
33 | finalList.addAll(miniSolve(subList));
34 | counter += maxsurfaceformsperquery;
35 | } else {
36 | List subList = this.rep.subList(counter, this.rep.size());
37 | List cloneList = new LinkedList();
38 | for (SurfaceForm sf : subList) {
39 | SurfaceForm clone = (SurfaceForm) sf.clone();
40 | cloneList.add(clone);
41 | }
42 |
43 | int prevcounter = 0;
44 | List prevList = this.rep.subList(counter - maxsurfaceformsperquery, counter);
45 | while (cloneList.size() < maxsurfaceformsperquery) {
46 | SurfaceForm clone = (SurfaceForm) prevList.get(prevcounter).clone();
47 | clone.setRelevant(false);
48 | cloneList.add(clone);
49 | prevcounter++;
50 | }
51 | List workedList = miniSolve(cloneList);
52 | List sfs = new LinkedList();
53 | for (SurfaceForm sf : workedList) {
54 | if (sf.isRelevant()) {
55 | sfs.add(sf);
56 | }
57 | }
58 | finalList.addAll(sfs);
59 | break;
60 | }
61 | long millis = System.currentTimeMillis() - time;
62 | String formatedTime = String.format("%d min, %d sec",
63 | TimeUnit.MILLISECONDS.toMinutes(millis),
64 | TimeUnit.MILLISECONDS.toSeconds(millis) -
65 | TimeUnit.MINUTES.toSeconds(TimeUnit.MILLISECONDS.toMinutes(millis))
66 | );
67 | System.out.println(formatedTime);
68 | }
69 | this.rep = finalList;
70 | } else {
71 | if(alwaysAction) {
72 | finalList.addAll(miniSolve(rep));
73 | this.rep = finalList;
74 | }
75 | }
76 | }
77 |
78 | public List getRep() {
79 | return rep;
80 | }
81 |
82 | public abstract List miniSolve(List rep);
83 | }
84 |
--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/lucene/query/LearnToRankTermScorer.java:
--------------------------------------------------------------------------------
1 | package doser.lucene.query;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.lucene.index.DocsEnum;
6 | import org.apache.lucene.search.Scorer;
7 | import org.apache.lucene.search.Weight;
8 | import org.apache.lucene.search.similarities.Similarity;
9 |
10 | /**
11 | * Expert: A Scorer for documents matching a Term.
12 | */
13 | final class LearnToRankTermScorer extends Scorer {
14 | private final Similarity.SimScorer docScorer;
15 | private final DocsEnum docsEnum;
16 |
17 | /**
18 | * Construct a TermScorer.
19 | *
20 | * @param weight
21 | * The weight of the Term in the query.
22 | * @param docsEnum
23 | * An iterator over the documents matching the Term.
24 | * @param docScorer
25 | * The Similarity.ExactSimScorer implementation to
26 | * be used for score computations.
27 | * @param docFreq
28 | * per-segment docFreq of this term
29 | */
30 | LearnToRankTermScorer(final Weight weight, final DocsEnum docsEnum,
31 | final Similarity.SimScorer docScorer) {
32 | super(weight);
33 | this.docScorer = docScorer;
34 | this.docsEnum = docsEnum;
35 | }
36 |
37 | /**
38 | * Advances to the first match beyond the current whose document number is
39 | * greater than or equal to a given target.
40 | * The implementation uses {@link DocsEnum#advance(int)}.
41 | *
42 | * @param target
43 | * The target document number.
44 | * @return the matching document or NO_MORE_DOCS if none exist.
45 | */
46 | @Override
47 | public int advance(final int target) throws IOException {
48 | return docsEnum.advance(target);
49 | }
50 |
51 | @Override
52 | public long cost() {
53 | return docsEnum.cost();
54 | }
55 |
56 | @Override
57 | public int docID() {
58 | return docsEnum.docID();
59 | }
60 |
61 | @Override
62 | public int freq() throws IOException {
63 | return docsEnum.freq();
64 | }
65 |
66 | DocsEnum getDocsEnum() {
67 | return docsEnum;
68 | }
69 |
70 | /**
71 | * Advances to the next document matching the query.
72 | *
73 | * @return the document matching the query or NO_MORE_DOCS if there are no
74 | * more documents.
75 | */
76 | @Override
77 | public int nextDoc() throws IOException {
78 | return docsEnum.nextDoc();
79 | }
80 |
81 | // TODO: benchmark if the specialized conjunction really benefits
82 | // from this, or if instead its from sorting by docFreq, or both
83 |
84 | @Override
85 | public float score() throws IOException {
86 | assert docID() != NO_MORE_DOCS;
87 | return docScorer.score(docsEnum.docID(), docsEnum.freq());
88 | }
89 |
90 | // TODO: generalize something like this for scorers?
91 | // even this is just an estimation...
92 |
93 | // int getDocFreq() {
94 | // return docFreq;
95 | // }
96 |
97 | /** Returns a string representation of this TermScorer. */
98 | @Override
99 | public String toString() {
100 | return "scorer(" + weight + ")";
101 | }
102 | }
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/knowledgebases/EntityCentricKBBiomed.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.knowledgebases;
2 |
3 | import java.util.List;
4 |
5 | import org.apache.lucene.search.similarities.Similarity;
6 |
7 | public class EntityCentricKBBiomed extends AbstractEntityCentricKBGeneral {
8 |
9 | public EntityCentricKBBiomed(String uri, boolean dynamic, Similarity sim) {
10 | super(uri, dynamic, sim);
11 | }
12 |
13 | public EntityCentricKBBiomed(String uri, boolean dynamic) {
14 | super(uri, dynamic);
15 | }
16 |
17 | /**
18 | * Takes a set of entities as well as a target entity and generates one
19 | * string that fits into the word2vec query format used in this class. The
20 | * source entities are concatenated and should be compared with the target
21 | * entity.
22 | *
23 | * @param source
24 | * a set of source entities
25 | * @param target
26 | * the target entity.
27 | * @return String in appropriate word2vec query format
28 | */
29 | public String generateWord2VecFormatString(String source, String target) {
30 | source = convertUrlToBiomedEntityIdentifier(source);
31 | target = convertUrlToBiomedEntityIdentifier(target);
32 | int c = source.compareToIgnoreCase(target);
33 | String res = "";
34 | if (c < 0) {
35 | res = source + "|" + target;
36 | } else if (c == 0) {
37 | res = source + "|" + target;
38 | } else {
39 | res = target + "|" + source;
40 | }
41 | return res;
42 | }
43 |
44 | /**
45 | * Takes a set of entities as well as a target entity and generates one
46 | * string that fits into the word2vec query format used in this class. The
47 | * source entities are concatenated and should be compared wit the target
48 | * entity.
49 | *
50 | * @param source
51 | * a set of source entities
52 | * @param target
53 | * the target entity.
54 | * @return String in appropriate word2vec query format
55 | */
56 | public String generateWord2VecFormatString(List source, String target) {
57 | StringBuilder builder = new StringBuilder();
58 | for (String s : source) {
59 | s = convertUrlToBiomedEntityIdentifier(s);
60 | builder.append(s);
61 | builder.append("|");
62 | }
63 | String src = builder.toString();
64 | src = src.substring(0, src.length() - 1);
65 | target = convertUrlToBiomedEntityIdentifier(target);
66 | return src + "|" + target;
67 | }
68 |
69 | private String convertUrlToBiomedEntityIdentifier(String url) {
70 | String res = "";
71 | if (url.startsWith("http://www.uniprot.org/uniprot/")) {
72 | res = "UNIPROT_" + url.replaceAll("http://www.uniprot.org/uniprot/", "");
73 | } else if (url.startsWith("http://www.ncbi.nlm.nih.gov/gene/")) {
74 | res = "NCBI_" + url.replaceAll("http://www.ncbi.nlm.nih.gov/gene/", "");
75 | } else if (url.startsWith("http://linkedlifedata.com/resource/umls-concept/")) {
76 | res = "UMLS_" + url.replaceAll("http://linkedlifedata.com/resource/umls-concept/", "");
77 | }
78 | return res;
79 | }
80 |
81 | @Override
82 | protected String generateDomainName() {
83 | return "Biomed";
84 | }
85 |
86 | @Override
87 | protected String kbName() {
88 | return "CalbC Biomedical KB";
89 | }
90 | }
91 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/rules/CheckGeneralEntities.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.algorithms.rules;
2 |
3 | import java.io.IOException;
4 | import java.util.LinkedList;
5 | import java.util.List;
6 |
7 | import org.apache.lucene.document.Document;
8 | import org.apache.lucene.index.IndexReader;
9 | import org.apache.lucene.index.Term;
10 | import org.apache.lucene.search.IndexSearcher;
11 | import org.apache.lucene.search.ScoreDoc;
12 | import org.apache.lucene.search.TopDocs;
13 |
14 | import doser.entitydisambiguation.algorithms.SurfaceForm;
15 | import doser.entitydisambiguation.knowledgebases.EntityCentricKBDBpedia;
16 | import doser.lucene.query.TermQuery;
17 |
18 | class CheckGeneralEntities extends AbstractRule {
19 |
20 | CheckGeneralEntities(EntityCentricKBDBpedia eckb) {
21 | super(eckb);
22 | }
23 |
24 | @Override
25 | public boolean applyRule(List rep) {
26 | for (SurfaceForm c : rep) {
27 | String sf = c.getSurfaceForm().toLowerCase();
28 | List candidates = c.getCandidates();
29 | String checked = null;
30 | // Surface Form - Candidate Match i.e. Saturday -
31 | // http://dbpedia.org/resource/Saturday
32 | for (String s : candidates) {
33 | String ent = s.replaceAll("http://dbpedia.org/resource/", "")
34 | .toLowerCase();
35 | if (sf.equalsIgnoreCase(ent)) {
36 | checked = s;
37 | break;
38 | }
39 | }
40 |
41 | if (checked != null && !checkSurfaceFormSubset(sf, rep)) {
42 | List keepCandidates = new LinkedList();
43 | for (String can : candidates) {
44 | String[] labels = null;
45 | IndexSearcher searcher = eckb.getSearcher();
46 | IndexReader reader = searcher.getIndexReader();
47 | TermQuery query = new TermQuery(new Term("Mainlink", can));
48 | try {
49 | final TopDocs top = searcher.search(query, 1);
50 | final ScoreDoc[] score = top.scoreDocs;
51 | final Document doc = reader.document(score[0].doc);
52 | labels = doc.getValues("Label");
53 | } catch (IOException e) {
54 | e.printStackTrace();
55 | }
56 | // Check whether the candidate has label of the original
57 | // surface form
58 | if (labels != null) {
59 | boolean isIn = false;
60 | for (int i = 0; i < labels.length; ++i) {
61 | if (labels[i].toLowerCase().equalsIgnoreCase(sf)) {
62 | isIn = true;
63 | break;
64 | }
65 | }
66 | // If IN, keep this candidate
67 | if (isIn) {
68 | keepCandidates.add(can);
69 | }
70 | }
71 | }
72 | if (!keepCandidates.isEmpty()) {
73 | c.setCandidates(keepCandidates);
74 | if(keepCandidates.size() == 1) {
75 | System.out.println("**********************************************************************");
76 | System.out.println(keepCandidates.toString());
77 | System.out.println("**********************************************************************");
78 | }
79 | }
80 | }
81 | }
82 | return false;
83 | }
84 |
85 | private boolean checkSurfaceFormSubset(String sf,
86 | List reps) {
87 | boolean isIn = false;
88 | for (SurfaceForm c : reps) {
89 | String toCheck = c.getSurfaceForm().toLowerCase();
90 | if (!toCheck.equalsIgnoreCase(sf) && toCheck.contains(sf)) {
91 | isIn = true;
92 | break;
93 | }
94 | }
95 | return isIn;
96 | }
97 | }
98 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/dbpedia/CollectiveAndContextDriver.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.algorithms.collective.dbpedia;
2 |
3 | import java.util.LinkedList;
4 | import java.util.List;
5 |
6 | import doser.entitydisambiguation.algorithms.SurfaceForm;
7 | import doser.entitydisambiguation.algorithms.collective.CandidatePruning;
8 | import doser.entitydisambiguation.algorithms.rules.RuleAdapation;
9 | import doser.entitydisambiguation.dpo.DisambiguatedEntity;
10 | import doser.entitydisambiguation.dpo.Response;
11 | import doser.entitydisambiguation.knowledgebases.EntityCentricKBDBpedia;
12 |
13 | class CollectiveAndContextDriver {
14 |
15 | static final int PREPROCESSINGCONTEXTSIZE = 200;
16 |
17 | private String topic;
18 | private Response[] currentResponse;
19 | private List rep;
20 | private EntityCentricKBDBpedia eckb;
21 |
22 | CollectiveAndContextDriver(Response[] res, List rep, EntityCentricKBDBpedia eckb, String topic) {
23 | super();
24 | this.topic = topic;
25 | if (res.length != rep.size()) {
26 | throw new IllegalArgumentException();
27 | }
28 | this.currentResponse = res;
29 | this.rep = rep;
30 | this.eckb = eckb;
31 | this.eckb.precomputeDoc2VecSimilarities(rep, PREPROCESSINGCONTEXTSIZE);
32 | }
33 |
34 | void solve() {
35 | // First candidate pruning
36 | CandidatePruning pruning = new CandidatePruning(eckb);
37 | pruning.prune(rep);
38 | if (topic != null) {
39 | TableColumnFilter cf = new TableColumnFilter(eckb, topic);
40 | cf.filter(rep);
41 | }
42 | TimeNumberDisambiguation timenumberdis = new TimeNumberDisambiguation(eckb);
43 | timenumberdis.solve(rep);
44 | LocationDisambiguation locationDis = new LocationDisambiguation(eckb);
45 | locationDis.solve(rep);
46 |
47 | RuleAdapation rules = new RuleAdapation();
48 | rules.addNoCandidatesCheckPluralRule(eckb);
49 | rules.addNoCandidatesExpansionRule(eckb);
50 | rules.addUnambiguousToAmbiguousRule(eckb);
51 | rules.addPatternRule(eckb, topic);
52 | rules.addContextRule(eckb);
53 | rules.performRuleChainBeforeCandidateSelection(rep);
54 |
55 | CandidateReductionDBpediaW2V w2vreduction = new CandidateReductionDBpediaW2V(eckb, rep, 20, 5, 150, false, false);
56 | w2vreduction.solve();
57 | rep = w2vreduction.getRep();
58 |
59 | w2vreduction = new CandidateReductionDBpediaW2V(eckb, rep, 45, 5, 250, true, true);
60 | w2vreduction.solve();
61 | rep = w2vreduction.getRep();
62 | FinalEntityDisambiguation finalDis = new FinalEntityDisambiguation(eckb, rep);
63 | finalDis.setup();
64 | finalDis.solve();
65 | }
66 |
67 | void generateResult() {
68 | for (int i = 0; i < currentResponse.length; i++) {
69 | SurfaceForm r = search(i);
70 | if (currentResponse[i] == null && r != null && r.getCandidates().size() == 1) {
71 | Response res = new Response();
72 | List entList = new LinkedList();
73 | DisambiguatedEntity ent = new DisambiguatedEntity();
74 | ent.setEntityUri(r.getCandidates().get(0));
75 | entList.add(ent);
76 | res.setDisEntities(entList);
77 | res.setSelectedText(r.getSurfaceForm());
78 | currentResponse[i] = res;
79 | }
80 | }
81 | }
82 |
83 | private SurfaceForm search(int qryNr) {
84 | for (SurfaceForm r : rep) {
85 | if (r.getQueryNr() == qryNr) {
86 | return r;
87 | }
88 | }
89 | return null;
90 | }
91 | }
92 |
--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/lucene/query/PriorQuery.java:
--------------------------------------------------------------------------------
1 | package doser.lucene.query;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.lucene.index.AtomicReaderContext;
6 | import org.apache.lucene.search.Explanation;
7 | import org.apache.lucene.search.IndexSearcher;
8 | import org.apache.lucene.search.Query;
9 | import org.apache.lucene.search.Scorer;
10 | import org.apache.lucene.search.Weight;
11 | import org.apache.lucene.util.Bits;
12 |
13 | import doser.lucene.features.IEntityCentricExtFeatures;
14 |
15 | /**
16 | * Due to major performance problems if we use an IndexReader request for every
17 | * single document, we create a Hashmap to improve the
18 | * overall performance.
19 | *
20 | * Our StartupInformationLoader provides these necessary information much
21 | * faster.
22 | *
23 | * @author Stefan Zwicklbauer
24 | */
25 | public class PriorQuery extends Query {
26 |
27 | class PriorWeight extends Weight {
28 |
29 | class PriorScorer extends Scorer {
30 |
31 | private final AtomicReaderContext context;
32 |
33 | private int lastDoc = -1;
34 |
35 | PriorScorer(final Weight weight, final AtomicReaderContext context) {
36 | super(weight);
37 | this.context = context;
38 | }
39 |
40 | @Override
41 | public int advance(final int target) throws IOException {
42 | final int maxdoc = context.reader().numDocs();
43 | if (target > (maxdoc - 1)) {
44 | return NO_MORE_DOCS;
45 | }
46 | return lastDoc = target;
47 | }
48 |
49 | @Override
50 | public long cost() {
51 | return 0;
52 | }
53 |
54 | @Override
55 | public int docID() {
56 | return lastDoc;
57 | }
58 |
59 | @Override
60 | public int freq() throws IOException {
61 | return 1;
62 | }
63 |
64 | @Override
65 | public int nextDoc() throws IOException {
66 | if ((context.reader().numDocs() - 1) > lastDoc) {
67 | return ++lastDoc;
68 | } else {
69 | return NO_MORE_DOCS;
70 | }
71 | }
72 |
73 | @Override
74 | public float score() throws IOException {
75 | return kb.getPriorOfDocument(context.docBase + lastDoc);
76 | }
77 |
78 | @Override
79 | public String toString() {
80 | return "Prior";
81 | }
82 | }
83 |
84 | @Override
85 | public Explanation explain(final AtomicReaderContext context,
86 | final int doc) throws IOException {
87 | return null;
88 | }
89 |
90 | @Override
91 | public Query getQuery() {
92 | return PriorQuery.this;
93 | }
94 |
95 | @Override
96 | public float getValueForNormalization() throws IOException {
97 | return 0;
98 | }
99 |
100 | @Override
101 | public void normalize(final float norm, final float topLevelBoost) {
102 | // Do nothing here!
103 | }
104 |
105 | @Override
106 | public Scorer scorer(AtomicReaderContext context, Bits acceptDocs)
107 | throws IOException {
108 | return new PriorScorer(this, context);
109 | }
110 | }
111 |
112 | private IEntityCentricExtFeatures kb;
113 |
114 | public PriorQuery(IEntityCentricExtFeatures kb) {
115 | super();
116 | this.kb = kb;
117 | }
118 |
119 | @Override
120 | public Weight createWeight(final IndexSearcher searcher) throws IOException {
121 | return new PriorWeight();
122 | }
123 |
124 | @Override
125 | public String toString(final String field) {
126 | return "PriorQuery";
127 | }
128 | }
129 |
--------------------------------------------------------------------------------
/yes:
--------------------------------------------------------------------------------
1 | -----BEGIN RSA PRIVATE KEY-----
2 | Proc-Type: 4,ENCRYPTED
3 | DEK-Info: AES-128-CBC,9F33236E2FD99EACABA4D7F529D0E8A5
4 |
5 | 8PISpGsmdq0QuL/NcFlGOZznZdyibB1/A6nI5bfiDljT5hzQ7xWFBM3S2IHeKUVK
6 | wdJdA+c3Y4dXgRllMczUMJBXX3UfObsm3/5TWCKPwczxLJ0tgxCgYVX9KNVt0Ngv
7 | b2ayQqkNBvBHq5ooKr8glkjvZ0Wl6QZ+W4pz8KndfzSiUri/WTryEmjzYbgyBXyG
8 | 8L+wG8mGOiCYKOFlVM+ViE8f3d+i0lsxX7PgXkdyWOvlgx2Iy3MhLQNXw0LztU6I
9 | QFpa1DtjcBewpvYtPJn6fma1nqhc8bTSaM0/1a8aLeCJWqCzQ5vD1wkkZ1eLEMDn
10 | Jg0D1fT2mm3XtNAMwOHcd+j3IG7aTofhU+XRBPk1YRdbOJjNuMzgV+P3dxXUhGLV
11 | N9vb2hUm/wIXngiKTeigsYGj59nvhyda6DfLhsNfizH1M/Foq3ZaNdWCvwtfJzAS
12 | sw2tW+PnPJiKpSXE1O7DQ3fduv5gBrrxZ906kHVKzPPa0T+0HWN+Z3MyM8IbuYKf
13 | zUVo0IogdobK+vm6HcKTWCdV0v5BPG6cTWHbTUi2kdJLc9j1lnnzEAOMIHYexsg2
14 | 8PmD2uncDNvUvS5DDILVSFj40zG57c2pVgBWcN1U211env8eb9jD4oJr+rOH4gvm
15 | pDLnB72eYZmQ9oUnnLsPo8c8cFfuJiTYIqmPW5crpzjUQlDlMlc8Kh5A3XJ/XHqh
16 | sq7M+Kn54l51SH+FvpS6u/s6dpwjCa+UbuFzdbJlE/RRLZaoTh0gov6k0n/48XSU
17 | 0XBJGuKyL8hmGmwAyMwdNb9vuH0Gah30ZeIpb8Iiw6aUNeCnpDrZ+b1M7VAC2Q/M
18 | UvuNe+datxI4FNyqPOnmi8o+vkWl3W8+M71qkGGsi+qnSUwnR9uUFg6VBt1WEdHw
19 | QpkPeQtnS53kadKSqLZEnPTnLsEYZfByCexgdXzJr32+IiUzkM8PoNuMzhVR+LgK
20 | Q55EJyFj736f8pwzC4k8Iz5WwAqnabXJH3eEW+o94a75xCM/32QW3ZJS8+yVh3Jb
21 | R622Tu9S6VxPzrS/HRbAmLCsWwy8svobKVTMN5vOzx3bZ5DrtjdyY8eBfQgBLQQW
22 | HxKGXYygz7M93e03K4VQbc0Gt1igBXgOH/W6MZXAzMk+WfXVRml2BzUWnh5pGvVt
23 | x0+vlbWESWKdIYY16R20R2594Elh9j1kgzRE3c3f0Aq86S5VhR4wvwcjF2GpHHuH
24 | 1ILCKvmWehfl+DJ1kyYfAXemsHxkkAHNCpJQ9TeKQiVUWDIjEBvuEEn6lgEu3vWG
25 | LgCV+AkWRKRRExssPK/Cj/VUqa4mhFLOy61JKi5XCj991MwXXJPaSmTp9j2hofcR
26 | yZWkaqwhe3kkZfVCETl4wTAPs+uB+7vW1zO70me959D4qoZVmu/Lr/VnGDw+7WIg
27 | NuDyIM7cSE/8va7r19b0uDJdwLrcmir8WwmxJOHCOQv+hY73RR2Hcmr1EtOp2BEw
28 | dwuc3+ewMcvNSQLnBUe/6OMRz9Z0kc620f1H6X4WJHu1BA0FDCbh9HeEpx3zsECN
29 | YPCrlZUS68kkGxscE3QgtTnKDsjArPrLxxFueBAlpYVUW+jzhqnd1w0xCXBbB2tV
30 | xi0kIpigCobhS35xig5nR6nkoSjc7nr6ybiEVA5x6Fbd41fVwtFop+4W8kmZL4/I
31 | 8lwHBp8SLRv/bjN7q72rYn1HH+JIKAskuLhpG00nK9gzDDhWYGVEuFhy3Jw6OZtZ
32 | tro3PRgAW83yKIjGvpGb1iZEg4YQhldZGq4/bxOU0FYKTniGlA+sZFmSYrKCPW4D
33 | 9J8isFexm0P6Dv8vjzIV/WSbTn9Z9bF3bcN1Eg91a2X/82iVlTh4Lgea8NMhLtUN
34 | nyKETpUQRoT126mHuaVbvD/OG2PUwLMt5vahQoTaYmazTk+Uevjgi9PfWBSLzsqB
35 | QKzCNoQjzcibYuAv4zU7hbjEXjtLXXkVyzVhTTiTKzIXEd8c8f4XUSEHo82UXjMa
36 | gzsXx8VsTEzfBEPSruBaKxf82LInpgGwNPlVTsW+g1T0nGE0qC7W/BYfSYEVCZY/
37 | PmaIwMzC5akuZnqiLTOMIwEdPe8iYzntcvCfUZB1rL75Xx6Y7YFLIt96fKFb5Nn1
38 | Yni80JAtvoFlCiZaUSoWAnHecXNewd5xwJjaJdgFh38cCZmvjTxupk9rU3lsLLoX
39 | tzZEFJv7Qt/axbqkIT/zdJr87zeScAgLU7PcpO05LPQR9pU3mm3z0jylgMUzU1Qz
40 | IVJHQ8CnaCTg2S7fwjZjHVlHIrPPiZgVhTN7Rt8vZ3CB7Wf8sXpGBIXADYoNiOVG
41 | lvtrXJYGZ5uoyeJLHerGNyMu4B3iCoY50kilNCcQ3cfX7G6SMwMgH1oJDHCMdOr+
42 | PLWf45FcwQrhkj56DqytV389OKaADsJoNuEpgbLmnUBCJQHzq4/Lfoqvqj7z2PE/
43 | F2kgb9JN7eBfbw/a6Sa7A0Qe8yCOVd9HWqSt0sQDqITcybF/gfU5IAjaFDWm6xKk
44 | FMFKTigj6Y4UfDfffZfVFAJ0AqNfkHTAI92ShGU/hrDAHmcgiio3m93IsnSjqYWs
45 | McFgcvsaqQpb4LfkdckXBDrZCVXNbeOe7JdxLcZxlI1hHeve2spz7zY7N3MTZzNm
46 | xZ0wcndfcmfVv/KXGvjPGh9+rrZyWXfeT5bwE6wLwg+CJmCI2AJDvoGdx7hkL8FL
47 | FKjbOrnTCai+Q4/vOdVpQz7/X7nyIX5DgqthqI8PTF4qAmoKM8htATK96CfW/Mw5
48 | PEQbU25nRHSE/TxVWoeoPJ5YQLnlh6Voey9Sk5vSzBNwyZXde9/1okZPvnZjmcvu
49 | 9TxOpoETYnNyfZEJ4g4FvHWSpN7YiDnNiwvD4nCRIq9oQTWhjK3w76Drv92MjaqJ
50 | bzaNMt909qVjLaio1sT5tDtqXT9Me5R7bL1qoEPXAePzYD7Bc1kZs1FD3emCCjh+
51 | TL/sLv64fPrpEH026AKfNqUWd9A0EexJqnVH6J6TgE6LrYe7Wq8PHlc+3DiEdroT
52 | qyMnP71BTu/UrUcm/rQ/+FDvduVncD0mDuUaw3Vr3Lf0DgYr/7nd5IFMP+5bpPZo
53 | KU5dNyRfOYOZTJ4vdTYpjeOU1IkjP+fBrbZ8wacHEqju68v4XViIJNaZrAJmq5t/
54 | -----END RSA PRIVATE KEY-----
55 |
--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/lucene/query/SensePriorQuery.java:
--------------------------------------------------------------------------------
1 | package doser.lucene.query;
2 |
3 | import java.io.IOException;
4 |
5 | import org.apache.lucene.index.AtomicReaderContext;
6 | import org.apache.lucene.search.Explanation;
7 | import org.apache.lucene.search.IndexSearcher;
8 | import org.apache.lucene.search.Query;
9 | import org.apache.lucene.search.Scorer;
10 | import org.apache.lucene.search.Weight;
11 | import org.apache.lucene.util.Bits;
12 |
13 | import doser.lucene.features.IEntityCentricExtFeatures;
14 |
15 | /**
16 | * Due to major performance problems if we use an IndexReader request for every
17 | * single document, we create a Hashmap to improve the
18 | * overall performance.
19 | *
20 | * Our StartupInformationLoader provides these necessary information much
21 | * faster.
22 | *
23 | * @author Stefan Zwicklbauer
24 | *
25 | */
26 | public class SensePriorQuery extends Query {
27 |
28 | class PriorWeight extends Weight {
29 |
30 | class SensePriorScorer extends Scorer {
31 |
32 | private final AtomicReaderContext context;
33 |
34 | private int lastDoc = -1;
35 |
36 | SensePriorScorer(final Weight weight,
37 | final AtomicReaderContext context) {
38 | super(weight);
39 | this.context = context;
40 | }
41 |
42 | @Override
43 | public int advance(final int target) throws IOException {
44 | final int maxdoc = context.reader().numDocs();
45 | if (target > (maxdoc - 1)) {
46 | return NO_MORE_DOCS;
47 | }
48 | return lastDoc = target;
49 | }
50 |
51 | @Override
52 | public long cost() {
53 | return 0;
54 | }
55 |
56 | @Override
57 | public int docID() {
58 | return lastDoc;
59 | }
60 |
61 | @Override
62 | public int freq() throws IOException {
63 | return 1;
64 | }
65 |
66 | @Override
67 | public int nextDoc() throws IOException {
68 | if ((context.reader().numDocs() - 1) > lastDoc) {
69 | return ++lastDoc;
70 | } else {
71 | return NO_MORE_DOCS;
72 | }
73 | }
74 |
75 | @Override
76 | public float score() throws IOException {
77 | float res = 0.0f;
78 | res = kb.getSensePriorOfDocument(keyword, context.docBase
79 | + lastDoc);
80 | return res;
81 | }
82 |
83 | @Override
84 | public String toString() {
85 | return "SensePrior";
86 | }
87 |
88 | }
89 |
90 | @Override
91 | public Explanation explain(final AtomicReaderContext context,
92 | final int doc) throws IOException {
93 | return null;
94 | }
95 |
96 | @Override
97 | public Query getQuery() {
98 | return SensePriorQuery.this;
99 | }
100 |
101 | @Override
102 | public float getValueForNormalization() throws IOException {
103 | return 0;
104 | }
105 |
106 | @Override
107 | public void normalize(final float norm, final float topLevelBoost) {
108 | // Do nothing here
109 | }
110 |
111 | @Override
112 | public Scorer scorer(AtomicReaderContext context, Bits acceptDocs)
113 | throws IOException {
114 | return new SensePriorScorer(this, context);
115 | }
116 |
117 | }
118 |
119 | private final IEntityCentricExtFeatures kb;
120 |
121 | private final String keyword;
122 |
123 | public SensePriorQuery(final String keyword, final IEntityCentricExtFeatures kb) {
124 | super();
125 | this.keyword = keyword;
126 | this.kb = kb;
127 | }
128 |
129 | @Override
130 | public Weight createWeight(final IndexSearcher searcher) throws IOException {
131 | return new PriorWeight();
132 | }
133 |
134 | @Override
135 | public String toString(final String field) {
136 | return "SensePriorQuery";
137 | }
138 | }
139 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/SurfaceForm.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.algorithms;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 |
/**
 * A surface form (mention) of the input together with its context, its
 * candidate entities and the bookkeeping flags used during disambiguation.
 *
 * Surface forms order by {@code difference}, largest first. The candidate
 * list handed to the constructor or to {@link #setCandidates(List)} is stored
 * as is (not copied) and is modified by the mutating methods of this class.
 */
public class SurfaceForm implements Comparable<SurfaceForm>, Cloneable {

    private int queryNr;
    private String surfaceForm;
    private String context;
    private List<String> candidates;
    /** Number of candidates at construction time. */
    private Integer ambiguity;
    private boolean isACandidate;
    private double difference;
    private int position;
    private boolean matchesInitial;
    private boolean initial;
    private boolean isRelevant;

    /**
     * Creates a surface form that is a candidate, relevant, not initial and
     * has a difference of 0.
     *
     * @param surfaceForm
     *            the mention text
     * @param context
     *            the context of the mention
     * @param candidates
     *            the candidate entities; its size becomes the ambiguity
     * @param qryNr
     *            the number of the query this surface form belongs to
     * @param position
     *            the position of the mention
     */
    public SurfaceForm(String surfaceForm, String context, List<String> candidates, int qryNr, int position) {
        super();
        this.ambiguity = candidates.size();
        this.surfaceForm = surfaceForm;
        this.context = context;
        this.candidates = candidates;
        this.queryNr = qryNr;
        this.isACandidate = true;
        this.difference = 0;
        this.position = position;
        this.initial = false;
        this.isRelevant = true;
    }

    public boolean isRelevant() {
        return isRelevant;
    }

    public void setRelevant(boolean isRelevant) {
        this.isRelevant = isRelevant;
    }

    public boolean isMatchesInitial() {
        return matchesInitial;
    }

    public void setMatchesInitial(boolean matchesInitial) {
        this.matchesInitial = matchesInitial;
    }

    public boolean isInitial() {
        return initial;
    }

    public void setInitial(boolean initial) {
        this.initial = initial;
    }

    public void setCandidates(List<String> candidates) {
        this.candidates = candidates;
    }

    public List<String> getCandidates() {
        return candidates;
    }

    public void setACandidate(boolean can) {
        this.isACandidate = can;
    }

    public String getSurfaceForm() {
        return surfaceForm;
    }

    public boolean isACandidate() {
        return isACandidate;
    }

    public String getContext() {
        return context;
    }

    public int getQueryNr() {
        return queryNr;
    }

    /** Returns the number of candidates this surface form was created with. */
    public int getAmbiguity() {
        return this.ambiguity;
    }

    /** Replaces all candidates by the single given entity. */
    public void setDisambiguatedEntity(String url) {
        candidates.clear();
        candidates.add(url);
    }

    /** Removes all candidates. */
    public void clearList() {
        candidates.clear();
    }

    public void addCandidate(String can) {
        candidates.add(can);
    }

    public double getDifference() {
        return difference;
    }

    public void setDifference(double difference) {
        this.difference = difference;
    }

    public int getPosition() {
        return position;
    }

    public void setPosition(int position) {
        this.position = position;
    }

    /**
     * Orders surface forms by descending {@code difference}.
     *
     * @return a negative value if this difference is larger than the other
     *         one, a positive value if it is smaller, 0 if both are equal
     */
    @Override
    public int compareTo(SurfaceForm o) {
        // Arguments are swapped to get descending order. Double.compare
        // keeps the ordering total even if a difference is NaN.
        return Double.compare(o.getDifference(), this.difference);
    }

    /**
     * Creates an independent copy of this surface form. The candidate list
     * is copied, so changes to the candidates of the copy do not affect the
     * original; all other state, including difference and ambiguity, is
     * carried over.
     *
     * @return the copy, a {@code SurfaceForm}
     */
    @Override
    public Object clone() {
        // Strings are immutable and can be shared; only the list needs a copy.
        SurfaceForm n = new SurfaceForm(this.surfaceForm, this.context,
                new ArrayList<String>(this.candidates), this.queryNr, this.position);
        // The constructor derives these two from its arguments; restore the
        // values of the original.
        n.ambiguity = this.ambiguity;
        n.difference = this.difference;
        n.setACandidate(this.isACandidate);
        n.setInitial(this.initial);
        n.setMatchesInitial(this.matchesInitial);
        n.setRelevant(this.isRelevant);
        return n;
    }
}
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/rules/UnambiguousToAmbiguousRule.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.algorithms.rules;
2 |
3 | import java.io.IOException;
4 | import java.util.HashMap;
5 | import java.util.LinkedList;
6 | import java.util.List;
7 | import java.util.Map;
8 |
9 | import org.apache.lucene.document.Document;
10 | import org.apache.lucene.index.Term;
11 | import org.apache.lucene.search.IndexSearcher;
12 | import org.apache.lucene.search.Query;
13 | import org.apache.lucene.search.ScoreDoc;
14 | import org.apache.lucene.search.TopDocs;
15 |
16 | import doser.entitydisambiguation.algorithms.SurfaceForm;
17 | import doser.entitydisambiguation.knowledgebases.EntityCentricKBDBpedia;
18 | import doser.lucene.query.TermQuery;
19 |
20 | /**
21 | * Falls eine Surface Form eindeutig ist und weitere Surface Forms eine
22 | * Abkürzung darstellen, diese allerdings nicht eindeutig sind, wird dies sofort
23 | * aufgelöst.
24 | *
25 | * Beispiel: 1 Surface Form: Burlington Industries Inc (eindeutig) 2 Surface
26 | * Form: Burlington (ambiguous) ...
27 | *
28 | *
29 | * @author quh
30 | *
31 | */
32 |
33 | class UnambiguousToAmbiguousRule extends AbstractRule {
34 |
35 | UnambiguousToAmbiguousRule(EntityCentricKBDBpedia eckb) {
36 | super(eckb);
37 | }
38 |
39 | @Override
40 | public boolean applyRule(List rep) {
41 | List unambiguous = new LinkedList();
42 | for (SurfaceForm c : rep) {
43 | if (c.getCandidates().size() == 1) {
44 | String candidate = c.getCandidates().get(0);
45 | String type = queryType(candidate);
46 | if (type.equalsIgnoreCase("Person") || type.equalsIgnoreCase("Organisation")) {
47 | unambiguous.add(c);
48 | }
49 | }
50 | }
51 | for (SurfaceForm c : rep) {
52 | if (c.getCandidates().size() > 1) {
53 | HashMap map = new HashMap();
54 | for (SurfaceForm un : unambiguous) {
55 | String type = queryType(un.getCandidates().get(0));
56 | if ((isSubString(un.getSurfaceForm(), c.getSurfaceForm())
57 | && c.getCandidates().contains(un.getCandidates().get(0))
58 | && un.getPosition() < c.getPosition())
59 | || (type.equalsIgnoreCase("Person") && isSubString(un.getSurfaceForm(), c.getSurfaceForm())
60 | && un.getPosition() < c.getPosition())) {
61 | map.put(un.getCandidates().get(0), c.getPosition() - un.getPosition());
62 | // c.setDisambiguatedEntity(un.getCandidates().get(0));
63 | }
64 | }
65 | if (!map.isEmpty()) {
66 | int distance = Integer.MAX_VALUE;
67 | String can = "";
68 | for (Map.Entry entry : map.entrySet()) {
69 | if (entry.getValue() < distance) {
70 | distance = entry.getValue();
71 | can = entry.getKey();
72 | }
73 | }
74 | c.setDisambiguatedEntity(can);
75 | }
76 | }
77 | }
78 | return false;
79 | }
80 |
81 | private boolean isSubString(String s1, String s2) {
82 | if (s1.toLowerCase().contains(s2.toLowerCase())) {
83 | return true;
84 | } else
85 | return false;
86 | }
87 |
88 | private String queryType(String url) {
89 | String type = "";
90 | IndexSearcher searcher = eckb.getSearcher();
91 | Query q = new TermQuery(new Term("Mainlink", url));
92 | try {
93 | TopDocs docs = searcher.search(q, 1);
94 | ScoreDoc[] scoredocs = docs.scoreDocs;
95 | if(scoredocs.length == 0) {
96 | type = "Misc";
97 | } else {
98 | int nr = scoredocs[0].doc;
99 | Document doc = searcher.getIndexReader().document(nr);
100 | type = doc.get("Type");
101 | }
102 | } catch (IOException e) {
103 | e.printStackTrace();
104 | }
105 | return type;
106 | }
107 |
108 | }
109 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/rules/NoCandidatesExpansionRules.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.algorithms.rules;
2 |
3 | import java.io.IOException;
4 | import java.util.ArrayList;
5 | import java.util.LinkedList;
6 | import java.util.List;
7 |
8 | import org.apache.lucene.document.Document;
9 | import org.apache.lucene.index.IndexReader;
10 | import org.apache.lucene.search.IndexSearcher;
11 | import org.apache.lucene.search.ScoreDoc;
12 | import org.apache.lucene.search.TopDocs;
13 | import org.apache.lucene.search.similarities.DefaultSimilarity;
14 |
15 | import doser.entitydisambiguation.algorithms.SurfaceForm;
16 | import doser.entitydisambiguation.knowledgebases.AbstractKnowledgeBase;
17 | import doser.lucene.features.LuceneFeatures;
18 | import doser.lucene.query.LearnToRankClause;
19 | import doser.lucene.query.LearnToRankQuery;
20 | import doser.tools.Inflector;
21 |
22 | /**
23 | * Falls eine Surface Form keine Kandidaten hat, allerdings aus mindestens 3
24 | * Wörtern besteht, werden alle Wörter mit kleinergleich 3 Buchstaben entfernt
25 | * und erneut angefragt. Dies geschieht ebenfalls nach der Entfernung von
26 | * Sonderzeichen. Entsprechend werden die Kandidaten gesetzt.
27 | *
28 | * @author quh
29 | */
30 |
31 | class NoCandidatesExpansionRules extends AbstractRule {
32 |
33 | NoCandidatesExpansionRules(AbstractKnowledgeBase eckb) {
34 | super(eckb);
35 | }
36 |
37 | @Override
38 | public boolean applyRule(List rep) {
39 | for (SurfaceForm c : rep) {
40 | if (c.getCandidates().size() == 0) {
41 | c.setCandidates(queryCandidates(c.getSurfaceForm()));
42 | }
43 | }
44 | return false;
45 | }
46 |
47 | private ArrayList queryCandidates(String surfaceForm) {
48 | ArrayList lst = new ArrayList();
49 | String[] splitter = surfaceForm.split(" ");
50 | if (splitter.length > 2) {
51 | StringBuilder builder = new StringBuilder();
52 | for (int i = 0; i < splitter.length; i++) {
53 | if (splitter[i].length() > 3) {
54 | builder.append(splitter[i] + " ");
55 |
56 | }
57 | }
58 | String builderstring = builder.toString();
59 | if (builderstring.length() > 0) {
60 | String newSf = builderstring.substring(0,
61 | builderstring.length() - 1);
62 | lst = queryLucene(surfaceForm);
63 | if (lst.size() == 0) {
64 | // Try again without special chars
65 | newSf = newSf.replaceAll("[^a-zA-Z ]", "");
66 | lst = queryLucene(newSf);
67 | // If size is 0 anyway, still check Plural to singular
68 | if (lst.size() == 0) {
69 | String singular = Inflector.getInstance().singularize(
70 | newSf);
71 | if (!newSf.equalsIgnoreCase(singular)) {
72 | // Try singular search
73 | lst = queryCandidates(singular);
74 | }
75 | }
76 | }
77 | }
78 | }
79 | return lst;
80 | }
81 |
82 | private ArrayList queryLucene(String surfaceForm) {
83 | ArrayList list = new ArrayList();
84 | final IndexSearcher searcher = eckb.getSearcher();
85 | final IndexReader reader = searcher.getIndexReader();
86 | LearnToRankQuery query = new LearnToRankQuery();
87 | List features = new LinkedList();
88 | DefaultSimilarity defaultSim = new DefaultSimilarity();
89 | features.add(query.add(LuceneFeatures.queryLabelTerm(surfaceForm,
90 | "UniqueLabel", defaultSim), "Feature1", true));
91 | try {
92 | final TopDocs top = searcher.search(query, 150);
93 | final ScoreDoc[] score = top.scoreDocs;
94 | if (score.length <= 5) {
95 | for (int i = 0; i < score.length; ++i) {
96 | final Document doc = reader.document(score[i].doc);
97 | list.add(doc.get("Mainlink"));
98 | }
99 | }
100 | } catch (IOException e) {
101 | e.printStackTrace();
102 | }
103 | return list;
104 | }
105 | }
106 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/tools/NTToDbPediaUrlEncoding.java:
--------------------------------------------------------------------------------
1 | package doser.tools;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.File;
5 | import java.io.FileNotFoundException;
6 | import java.io.FileReader;
7 | import java.io.FileWriter;
8 | import java.io.IOException;
9 | import java.io.UnsupportedEncodingException;
10 | import java.io.Writer;
11 | import java.net.URLEncoder;
12 |
13 | import org.apache.commons.lang.StringEscapeUtils;
14 | import org.apache.log4j.Logger;
15 |
16 | public final class NTToDbPediaUrlEncoding {
17 |
18 | private NTToDbPediaUrlEncoding() {
19 | super();
20 | }
21 |
22 | public static String dbpediaEncoding(final String url) {
23 | final StringBuffer buffer = new StringBuffer();
24 | for (int i = 0; i < url.length(); i++) {
25 | final String str = String.valueOf(url.charAt(i));
26 | if (str.equalsIgnoreCase("!")) {
27 | buffer.append('!');
28 | } else if (str.equalsIgnoreCase("$")) {
29 | buffer.append('$');
30 | } else if (str.equalsIgnoreCase("&")) {
31 | buffer.append('&');
32 | } else if (str.equalsIgnoreCase("'")) {
33 | buffer.append('\'');
34 | } else if (str.equalsIgnoreCase("(")) {
35 | buffer.append('(');
36 | } else if (str.equalsIgnoreCase(")")) {
37 | buffer.append(')');
38 | } else if (str.equalsIgnoreCase("*")) {
39 | buffer.append('*');
40 | } else if (str.equalsIgnoreCase("+")) {
41 | buffer.append('+');
42 | } else if (str.equalsIgnoreCase(",")) {
43 | buffer.append(',');
44 | } else if (str.equalsIgnoreCase("-")) {
45 | buffer.append('-');
46 | } else if (str.equalsIgnoreCase("/")) {
47 | buffer.append('/');
48 | } else if (str.equalsIgnoreCase(":")) {
49 | buffer.append(':');
50 | } else if (str.equalsIgnoreCase(";")) {
51 | buffer.append(';');
52 | } else if (str.equalsIgnoreCase("=")) {
53 | buffer.append('=');
54 | } else if (str.equalsIgnoreCase("@")) {
55 | buffer.append('@');
56 | } else if (str.equalsIgnoreCase("_")) {
57 | buffer.append('_');
58 | } else if (str.equalsIgnoreCase("~")) {
59 | buffer.append('~');
60 | } else {
61 | try {
62 | buffer.append(URLEncoder.encode(str, "UTF-8"));
63 | } catch (final UnsupportedEncodingException e) {
64 | Logger.getRootLogger().error(e.getStackTrace());
65 | }
66 | }
67 | }
68 | return buffer.toString();
69 | }
70 |
71 | public static void main(final String[] args) throws IOException {
72 | final String fileInput = args[0];
73 | final String fileOutput = args[1];
74 | final File fileIn = new File(fileInput);
75 | final File fileOut = new File(fileOutput);
76 | final Writer writer = new FileWriter(fileOut);
77 | BufferedReader reader = null;
78 | try {
79 | reader = new BufferedReader(new FileReader(fileIn));
80 | } catch (final FileNotFoundException e) {
81 | Logger.getRootLogger().error(e.getStackTrace());
82 | }
83 | String line = null;
84 | while ((line = reader.readLine()) != null) {
85 | line = line.replaceAll("[ ]+", " ");
86 | final String splitter[] = line.split(" ");
87 | final StringBuffer buffer = new StringBuffer();
88 |
89 | // Subject
90 | String url = splitter[0].substring(1, splitter[0].length() - 1);
91 | String sLine = StringEscapeUtils.unescapeJava(url);
92 | buffer.append("<" + dbpediaEncoding(sLine) + "> ");
93 |
94 | // Predicate
95 | buffer.append(splitter[1] + " ");
96 |
97 | // Object
98 | if (splitter[2].startsWith("<")) {
99 | url = splitter[2].substring(1, splitter[2].length() - 1);
100 | sLine = StringEscapeUtils.unescapeJava(url);
101 | buffer.append("<" + dbpediaEncoding(sLine) + ">");
102 | } else {
103 | buffer.append(splitter[2]);
104 | }
105 | writer.write(buffer.toString());
106 | writer.write(System.getProperty("line.separator"));
107 | writer.flush();
108 | }
109 | writer.close();
110 | reader.close();
111 | }
112 |
113 | }
114 |
--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/general/Test.java:
--------------------------------------------------------------------------------
1 | package doser.general;
2 |
3 | import java.io.IOException;
4 | import java.text.ParseException;
5 |
6 | import org.apache.lucene.analysis.standard.StandardAnalyzer;
7 | import org.apache.lucene.document.Document;
8 | import org.apache.lucene.document.Field.Store;
9 | import org.apache.lucene.document.StringField;
10 | import org.apache.lucene.document.TextField;
11 | import org.apache.lucene.index.DirectoryReader;
12 | import org.apache.lucene.index.IndexReader;
13 | import org.apache.lucene.index.IndexWriter;
14 | import org.apache.lucene.index.IndexWriterConfig;
15 | import org.apache.lucene.index.Term;
16 | import org.apache.lucene.search.BooleanClause.Occur;
17 | import org.apache.lucene.search.BooleanQuery;
18 | import org.apache.lucene.search.IndexSearcher;
19 | import org.apache.lucene.search.PhraseQuery;
20 | import org.apache.lucene.search.Query;
21 | import org.apache.lucene.search.ScoreDoc;
22 | import org.apache.lucene.search.TermQuery;
23 | import org.apache.lucene.search.TopScoreDocCollector;
24 | import org.apache.lucene.search.spans.SpanNearQuery;
25 | import org.apache.lucene.search.spans.SpanQuery;
26 | import org.apache.lucene.search.spans.SpanTermQuery;
27 | import org.apache.lucene.store.Directory;
28 | import org.apache.lucene.store.RAMDirectory;
29 | import org.apache.lucene.util.Version;
30 |
31 | public class Test {
32 | private IndexWriter writer;
33 |
34 | public void lucene() throws IOException, ParseException {
35 | // Build the index
36 | StandardAnalyzer analyzer = new StandardAnalyzer();
37 | Directory index = new RAMDirectory();
38 | IndexWriterConfig config = new IndexWriterConfig(Version.LATEST,
39 | analyzer);
40 | this.writer = new IndexWriter(index, config);
41 |
42 | // Add documents to the index
43 | addDoc("Spring", new String[] { "Java", "JSP", "DBPEDIA_56testdoc" });
44 | addDoc("Java", new String[] { "Oracle", "Annotation is cool too" });
45 |
46 | writer.close();
47 |
48 | // Search the index
49 | IndexReader reader = DirectoryReader.open(index);
50 | IndexSearcher searcher = new IndexSearcher(reader);
51 |
52 | TermQuery q = new TermQuery(new Term("keyword", "DBPEDIA_56testdoc"));
53 | // SpanQuery q = new SpanNearQuery(new SpanQuery[] {
54 | // new SpanTermQuery(new Term("keyword", "too")),
55 | // new SpanTermQuery(new Term("keyword", "cool"))},
56 | // 3,
57 | // true);
58 |
59 | // String[] s = {"cool", "too"};
60 | // for (int i = 0; i < s.length; i++) {
61 | // q.add(new Term("keyword", s[i]));
62 | // }
63 |
64 | // q.add(new PhraseQuery(new Term("keyword", "Annotation is cool")),
65 | // Occur.MUST);
66 |
67 | System.out.println(q.toString());
68 |
69 | int hitsPerPage = 10;
70 | TopScoreDocCollector collector = TopScoreDocCollector.create(
71 | hitsPerPage, true);
72 |
73 | searcher.search(q, collector);
74 |
75 | ScoreDoc[] hits = collector.topDocs().scoreDocs;
76 |
77 | for (int i = 0; i < hits.length; ++i) {
78 | int docId = hits[i].doc;
79 | Document doc = searcher.doc(docId);
80 | System.out.println(hits[i].toString());
81 | System.out.println((i + 1) + ". \t" + doc.get("title"));
82 | }
83 |
84 | reader.close();
85 | }
86 |
87 | private void addDoc(String title, String[] keywords) throws IOException {
88 | // Create new document
89 | Document doc = new Document();
90 |
91 | // Add title
92 | doc.add(new TextField("title", title, Store.YES));
93 |
94 | // Add keywords
95 | for (int i = 0; i < keywords.length; i++) {
96 | doc.add(new StringField("keyword", keywords[i], Store.YES));
97 | }
98 |
99 | // Add document to index
100 | this.writer.addDocument(doc);
101 | }
102 |
103 | public static void main(String[] args) {
104 | Test test = new Test();
105 | try {
106 | test.lucene();
107 | } catch (IOException e) {
108 | // TODO Auto-generated catch block
109 | e.printStackTrace();
110 | } catch (ParseException e) {
111 | // TODO Auto-generated catch block
112 | e.printStackTrace();
113 | }
114 | }
115 | }
116 |
--------------------------------------------------------------------------------
/doser-dis-core/src/test/java/doser/test/breakdetection/BreakDetection.java:
--------------------------------------------------------------------------------
1 | package doser.test.breakdetection;
2 |
3 | public class BreakDetection {
4 | // NOTE: this class currently has no members. The main() below is disabled; it builds six WebSite fixtures and a 6x6 similarity matrix, runs Decomposition and ConcurrentNCutAlgorithm on them and prints the object id of every clustered site.
5 | // @SuppressWarnings("deprecation")
6 | // public static void main(String[] args) {
7 | //
8 | // List shotList = new LinkedList();
9 | // WebSite shot1 = new WebSite();
10 | // shot1.setName("1");
11 | // shot1.setText("Text1");
12 | // shot1.setObjectId(0);
13 | // WebSite shot2 = new WebSite();
14 | // shot2.setObjectId(1);
15 | // shot2.setName("2");
16 | // shot2.setText("Text2");
17 | // WebSite shot3 = new WebSite();
18 | // shot3.setObjectId(2);
19 | // shot3.setName("3");
20 | // shot3.setText("Text3");
21 | // WebSite shot4 = new WebSite();
22 | // shot4.setObjectId(3);
23 | // shot4.setName("4");
24 | // shot4.setText("Text4");
25 | // WebSite shot5 = new WebSite();
26 | // shot5.setObjectId(4);
27 | // shot5.setName("5");
28 | // shot5.setText("Text5");
29 | // WebSite shot6 = new WebSite();
30 | // shot6.setObjectId(5);
31 | // shot6.setName("6");
32 | // shot6.setText("Text6");
33 | //
34 | // shotList.add(shot1);
35 | // shotList.add(shot2);
36 | // shotList.add(shot3);
37 | // shotList.add(shot4);
38 | // shotList.add(shot5);
39 | // shotList.add(shot6);
40 | // Decomposition decomp = new Decomposition(shotList);
41 | //
42 | // double[][] similarityMatrix = new double[6][6];
43 | // similarityMatrix[0][0] = 1;
44 | // similarityMatrix[0][1] = 0.5;
45 | // similarityMatrix[0][2] = 0.5;
46 | // similarityMatrix[0][3] = 0.8;
47 | // similarityMatrix[0][4] = 0.4;
48 | // similarityMatrix[0][5] = 0.8;
49 | //
50 | // similarityMatrix[1][0] = 0.5;
51 | // similarityMatrix[1][1] = 1.0;
52 | // similarityMatrix[1][2] = 0.5;
53 | // similarityMatrix[1][3] = 0.5;
54 | // similarityMatrix[1][4] = 0.5;
55 | // similarityMatrix[1][5] = 0.5;
56 | //
57 | // similarityMatrix[2][0] = 0.5;
58 | // similarityMatrix[2][1] = 0.5;
59 | // similarityMatrix[2][2] = 1;
60 | // similarityMatrix[2][3] = 0.5;
61 | // similarityMatrix[2][4] = 0.5;
62 | // similarityMatrix[2][5] = 0.5;
63 | //
64 | // similarityMatrix[3][0] = 0.8;
65 | // similarityMatrix[3][1] = 0.5;
66 | // similarityMatrix[3][2] = 0.5;
67 | // similarityMatrix[3][3] = 1;
68 | // similarityMatrix[3][4] = 0.5;
69 | // similarityMatrix[3][5] = 0.8;
70 | //
71 | // similarityMatrix[4][0] = 0.5;
72 | // similarityMatrix[4][1] = 0.5;
73 | // similarityMatrix[4][2] = 1;
74 | // similarityMatrix[4][3] = 0.5;
75 | // similarityMatrix[4][4] = 0.5;
76 | // similarityMatrix[4][5] = 0.5;
77 | //
78 | // similarityMatrix[5][0] = 0.8;
79 | // similarityMatrix[5][1] = 0.5;
80 | // similarityMatrix[5][2] = 0.5;
81 | // similarityMatrix[5][3] = 0.8;
82 | // similarityMatrix[5][4] = 0.5;
83 | // similarityMatrix[5][5] = 1;
84 | //
85 | // decomp.setSimilarityMatrix(similarityMatrix);
86 | // decomp.start();
87 | // try {
88 | // decomp.join();
89 | // } catch (InterruptedException e) {
90 | // e.printStackTrace();
91 | // }
92 | //
93 | // ConcurrentNCutAlgorithm nCutAlgorithm = new ConcurrentNCutAlgorithm(decomp.getMainCluster());
94 | //
95 | // List> clusterList = nCutAlgorithm.startClustering();
96 | // for (Cluster cluster : clusterList) {
97 | // List list = cluster.getObjectList();
98 | // for (WebSite site : list) {
99 | // System.out.println("Site id: " + site.getObjectId());
100 | // }
101 | // }
102 | //
103 | //
104 | //// decomp.createUndirectedWeightedGraph();
105 | // // Third Step: VideoDecomposition
106 | //// List> clusterLst = doVideoDecomposition(decomp, shotList);
107 | //
108 | //// // Step Four: Temporal Graph Creation
109 | //// TemporalGraph> tempGraph = doTemporalGraphGeneration(clusterLst);
110 | ////
111 | //// // Step Five: Shortest Path
112 | //// List shortestPath = doShortestPath(tempGraph);
113 | ////
114 | //// // Step Six: Scene Extraction
115 | //// doSceneExtraction(tempGraph, shortestPath);
116 | // }
117 | // NOTE(review): row 4 of the disabled matrix sets [4][2] = 1 and [4][4] = 0.5, and [4][0] = 0.5 differs from [0][4] = 0.4 — looks like a typo in the fixture; confirm before re-enabling.
118 |
119 | }
120 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/CandidatePruning.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.algorithms.collective;
2 |
3 | import java.util.ArrayList;
4 | import java.util.HashMap;
5 | import java.util.HashSet;
6 | import java.util.LinkedList;
7 | import java.util.List;
8 | import java.util.Map;
9 | import java.util.Set;
10 |
11 | import doser.entitydisambiguation.algorithms.SurfaceForm;
12 | import doser.entitydisambiguation.knowledgebases.AbstractEntityCentricKBGeneral;
13 | import doser.general.HelpfulMethods;
14 |
15 | public class CandidatePruning { // Shrinks over-long candidate lists of surface forms; see prune().
16 | // Upper bound on the candidates re-admitted by the Word2Vec relevance check in prune().
17 | private static final int NUMBEROFADDITIONALW2VENTITIES = 6;
18 | // Candidate lists with more entries than this are pruned; also the number of entries taken from the occurrence ranking.
19 | private static final int ENTITYTHRESHOLD = 6;
20 | // Minimum number of entities from initial, single-candidate surface forms required before the Word2Vec check runs.
21 | private static final int MINIMUMSURFACEFORMS = 3;
22 | // A Word2Vec similarity must be greater than this value for a candidate to be considered for re-admission.
23 | private static final float WORD2VECTHRESHOLD = 1.60f;
24 | // Knowledge base used for occurrence counts and Doc2Vec/Word2Vec similarities.
25 | private AbstractEntityCentricKBGeneral eckb;
26 |
27 | public CandidatePruning(AbstractEntityCentricKBGeneral eckb) {
28 | super();
29 | this.eckb = eckb;
30 | }
31 | // Replaces, in place, the candidate list of every surface form in rep that has more than ENTITYTHRESHOLD candidates. Kept are: ENTITYTHRESHOLD entries of the occurrence ranking, up to 4 further entries of the Doc2Vec ranking, and up to NUMBEROFADDITIONALW2VENTITIES candidates that pass the Word2Vec check.
32 | public void prune(List rep) {
33 | List unambiguous = new LinkedList(); // NOTE(review): filled below but never read in this method
34 | for (SurfaceForm c : rep) {
35 | if (c.getCandidates().size() == 1) {
36 | unambiguous.add(c);
37 | }
38 | }
39 | // Collect the single candidate of every surface form that is flagged initial; nothing is collected when rep holds only one surface form.
40 | List list = new LinkedList();
41 | for (SurfaceForm sf : rep) {
42 | if (rep.size() > 1 && sf.getCandidates().size() == 1 && sf.isInitial()) {
43 | list.add(sf.getCandidates().get(0));
44 | }
45 | }
46 | // Prune every surface form whose candidate list is longer than ENTITYTHRESHOLD.
47 | for (SurfaceForm c : rep) {
48 | List candidates = c.getCandidates();
49 | if (candidates.size() > ENTITYTHRESHOLD) {
50 | Set prunedCandidates = new HashSet();
51 |
52 | // Sense prior: map every candidate to its occurrence value for this surface form and keep the first ENTITYTHRESHOLD entries of the sorted list.
53 | Map map = new HashMap();
54 | for (String candidate : candidates) {
55 | map.put(candidate, eckb.getFeatureDefinition().getOccurrences(c.getSurfaceForm(), candidate));
56 | }
57 | @SuppressWarnings("deprecation")
58 | List> l = HelpfulMethods.sortByValue(map); // NOTE(review): presumably sorted best-first — confirm in HelpfulMethods
59 | for (int i = 0; i < ENTITYTHRESHOLD; ++i) { // NOTE(review): l.get(i) fails if l has fewer than ENTITYTHRESHOLD entries, e.g. if duplicate candidates collapse in the map — confirm candidates are unique
60 | prunedCandidates.add(l.get(i).getKey());
61 | // System.out.println("SensePrior ADd: "+l.get(i).getKey()+"
62 | // "+l.get(i).getValue());
63 | }
64 |
65 | // Doc2Vec context similarity: walk the sorted list and add at most 4 candidates that are not kept yet.
66 | Map map_doc2vec = new HashMap();
67 | for (String candidate : candidates) {
68 |
69 | map_doc2vec.put(candidate, eckb.getDoc2VecSimilarity(c.getSurfaceForm(), c.getContext(), candidate));
70 | }
71 | @SuppressWarnings("deprecation")
72 | List> l_doc2vec = HelpfulMethods.sortByValue(map_doc2vec);
73 | int added = 0;
74 | int counter = 0;
75 | while (counter < l_doc2vec.size() && added < 4) {
76 | String key = l_doc2vec.get(counter).getKey();
77 | if (!prunedCandidates.contains(key)) {
78 | prunedCandidates.add(key);
79 | added++;
80 | }
81 | counter++;
82 | }
83 | // for (int i = 0; i < ENTITYTHRESHOLD; ++i) {
84 | // prunedCandidates.add(l_doc2vec.get(i).getKey());
85 | // }
86 |
87 | // Check for very relevant Candidates via given Word2Vec
88 | // similarities
89 | if (list.size() >= MINIMUMSURFACEFORMS) {
90 | Set w2vFormatStrings = new HashSet();
91 | for (String can : candidates) {
92 | if (!prunedCandidates.contains(can)) {
93 | String query = this.eckb.generateWord2VecFormatString(list, can);
94 | w2vFormatStrings.add(query);
95 | }
96 | }
97 | // Fetch the similarities for all query strings in one call, then rank the candidates above the threshold by their occurrence value.
98 | Map similarityMap = this.eckb.getWord2VecSimilarities(w2vFormatStrings);
99 | Map occmap = new HashMap();
100 | for (String can : candidates) {
101 | if (!prunedCandidates.contains(can)) {
102 | String query = this.eckb.generateWord2VecFormatString(list, can);
103 | float val = similarityMap.get(query); // NOTE(review): unboxing throws a NullPointerException if the map has no entry for query — confirm every query is answered
104 | if (val > WORD2VECTHRESHOLD) {
105 | occmap.put(can, eckb.getFeatureDefinition().getOccurrences(c.getSurfaceForm(), can));
106 | // prunedCandidates.add(can);
107 | }
108 | }
109 | }
110 | @SuppressWarnings("deprecation")
111 | List> sortedl = HelpfulMethods.sortByValue(occmap);
112 | for (int i = 0; i < NUMBEROFADDITIONALW2VENTITIES; ++i) {
113 | if (i < sortedl.size()) {
114 | prunedCandidates.add(sortedl.get(i).getKey());
115 | }
116 | }
117 | }
118 | // The kept candidates come out of a HashSet, so the new list has no defined order.
119 | c.setCandidates(new ArrayList(prunedCandidates));
120 | }
121 | }
122 | }
123 | }
124 |
--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/lucene/query/ConjunctionScorer.java:
--------------------------------------------------------------------------------
1 | package doser.lucene.query;
2 |
3 | import java.io.IOException;
4 | import java.util.ArrayList;
5 | import java.util.Collection;
6 | import java.util.Comparator;
7 |
8 | import org.apache.lucene.search.Scorer;
9 | import org.apache.lucene.search.Weight;
10 | import org.apache.lucene.util.ArrayUtil;
11 |
12 | /** Scorer for conjunctions, sets of queries, all of which are required. */
13 | class ConjunctionScorer extends Scorer {
14 | static final class DocsAndFreqs {
15 | final long cost;
16 | int doc = -1;
17 | final Scorer scorer;
18 |
19 | DocsAndFreqs(final Scorer scorer) {
20 | this.scorer = scorer;
21 | cost = scorer.cost();
22 | }
23 | }
24 |
25 | private final LearnToRankClause[] clauses;
26 | private final float coord;
27 | private final int docBase;
28 | protected final DocsAndFreqs[] docsAndFreqs;
29 | protected int lastDoc = -1;
30 |
31 | private final DocsAndFreqs lead;
32 |
33 | ConjunctionScorer(final Weight weight, final Scorer[] scorers,
34 | final float coord, final LearnToRankClause[] ltrclauses,
35 | final int docBase) {
36 | super(weight);
37 | this.coord = coord;
38 | this.docBase = docBase;
39 | clauses = ltrclauses;
40 | docsAndFreqs = new DocsAndFreqs[scorers.length];
41 | for (int i = 0; i < scorers.length; i++) {
42 | docsAndFreqs[i] = new DocsAndFreqs(scorers[i]);
43 | }
44 | // Sort the array the first time to allow the least frequent DocsEnum to
45 | // lead the matching.
46 | ArrayUtil.timSort(docsAndFreqs, new Comparator() {
47 | @Override
48 | public int compare(final DocsAndFreqs obj1, final DocsAndFreqs obj2) {
49 | return Long.signum(obj1.cost - obj2.cost);
50 | }
51 | });
52 |
53 | lead = docsAndFreqs[0]; // least frequent DocsEnum leads the
54 | // intersection
55 | }
56 |
57 | ConjunctionScorer(final Weight weight, final Scorer[] scorers,
58 | final LearnToRankClause[] ltrclauses, final int docBase) {
59 | this(weight, scorers, 1f, ltrclauses, docBase);
60 | }
61 |
62 | @Override
63 | public int advance(final int target) throws IOException {
64 | lead.doc = lead.scorer.advance(target);
65 | return lastDoc = doNext(lead.doc);
66 | }
67 |
68 | @Override
69 | public long cost() {
70 | return lead.scorer.cost();
71 | }
72 |
73 | @Override
74 | public int docID() {
75 | return lastDoc;
76 | }
77 |
78 | private int doNext(int doc) throws IOException { // NOPMD by quh on 28.02.14
79 | // 10:45
80 | for (;;) {
81 | // doc may already be NO_MORE_DOCS here, but we don't check
82 | // explicitly
83 | // since all scorers should advance to NO_MORE_DOCS, match, then
84 | // return that value.
85 | advanceHead: for (;;) {
86 | for (int i = 1; i < docsAndFreqs.length; i++) {
87 | // invariant: docsAndFreqs[i].doc <= doc at this point.
88 |
89 | // docsAndFreqs[i].doc may already be equal to doc if we
90 | // "broke advanceHead"
91 | // on the previous iteration and the advance on the lead
92 | // scorer exactly matched.
93 | if (docsAndFreqs[i].doc < doc) {
94 | docsAndFreqs[i].doc = docsAndFreqs[i].scorer
95 | .advance(doc);
96 |
97 | if (docsAndFreqs[i].doc > doc) {
98 | // DocsEnum beyond the current doc - break and
99 | // advance lead to the new highest doc.
100 | doc = docsAndFreqs[i].doc;
101 | break advanceHead;
102 | }
103 | }
104 | }
105 | // success - all DocsEnums are on the same doc
106 | return doc;
107 | }
108 | // advance head for next iteration
109 | doc = lead.doc = lead.scorer.advance(doc);
110 | }
111 | }
112 |
113 | @Override
114 | public int freq() {
115 | return docsAndFreqs.length;
116 | }
117 |
118 | @Override
119 | public Collection getChildren() {
120 | final ArrayList children = new ArrayList(
121 | docsAndFreqs.length);
122 | for (final DocsAndFreqs docs : docsAndFreqs) {
123 | children.add(new ChildScorer(docs.scorer, "MUST"));
124 | }
125 | return children;
126 | }
127 |
128 | @Override
129 | public int nextDoc() throws IOException {
130 | lead.doc = lead.scorer.nextDoc();
131 | return lastDoc = doNext(lead.doc);
132 | }
133 |
134 | @Override
135 | public float score() throws IOException {
136 | // TODO: sum into a double and cast to float if we ever send required
137 | // clauses to BS1
138 | float sum = 0.0f;
139 | for (int i = 0; i < docsAndFreqs.length; i++) {
140 | final float val = docsAndFreqs[i].scorer.score()
141 | * clauses[i].getWeight();
142 | sum += val;
143 | clauses[i].addFeatureValue(docBase, lastDoc, val);
144 | }
145 | return sum * coord;
146 | }
147 | }
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/Vertex.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.algorithms.collective;
2 |
3 | import java.util.ArrayList;
4 | import java.util.HashSet;
5 | import java.util.List;
6 | import java.util.Map;
7 | import java.util.Set;
8 |
9 | public class Vertex implements Comparable {
10 | private List uris;
11 | private int entityQuery;
12 | private double score;
13 | private boolean isCandidate;
14 | private String description;
15 | private String text;
16 | private String context;
17 | private double occurrences;
18 |
19 | private Set outgoingEdges;
20 |
21 | private double sumOutGoing;
22 |
23 | public Vertex() {
24 | super();
25 | this.uris = new ArrayList();
26 | this.outgoingEdges = new HashSet();
27 | this.entityQuery = -1;
28 | this.isCandidate = false;
29 | this.sumOutGoing = 0;
30 | this.text = "";
31 | this.context = "";
32 | }
33 |
34 | public void addOutGoingEdge(Edge e) {
35 | outgoingEdges.add(e);
36 | this.sumOutGoing += e.getTransition();
37 | for(Edge out : outgoingEdges) {
38 | out.setProbability(out.getTransition() / sumOutGoing);
39 | }
40 | }
41 |
42 | public void removeAllOutgoingEdges() {
43 | this.outgoingEdges.clear();
44 | }
45 |
46 | public Edge removeOutgoingEdge(Vertex v, Map edgeWeight) {
47 | Edge toRemove = null;
48 | for (Edge e : outgoingEdges) {
49 | if (e.getTarget().equals(v)) {
50 | toRemove = e;
51 | break;
52 | }
53 | }
54 | if (toRemove != null) {
55 | outgoingEdges.remove(toRemove);
56 | sumOutGoing -= toRemove.getTransition();
57 | }
58 |
59 | // Update Transition Probability
60 | for(Edge out : outgoingEdges) {
61 | out.setProbability(out.getTransition() / sumOutGoing);
62 | edgeWeight.put(out, out.getProbability());
63 | }
64 |
65 | return toRemove;
66 | }
67 |
68 | public String getContext() {
69 | return context;
70 | }
71 |
72 | public void setContext(String context) {
73 | this.context = context;
74 | }
75 |
76 | public double getSumOutGoingEdges() {
77 | return sumOutGoing;
78 | }
79 |
80 | public Set getOutgoingEdges() {
81 | return this.outgoingEdges;
82 | }
83 |
84 | public List getUris() {
85 | return uris;
86 | }
87 |
88 | public void addUri(String uri) {
89 | this.uris.add(uri);
90 | }
91 |
92 | public boolean isCandidate() {
93 | return isCandidate;
94 | }
95 |
96 | public void setCandidate(boolean isCandidate) {
97 | this.isCandidate = isCandidate;
98 | }
99 |
100 | public int getEntityQuery() {
101 | return entityQuery;
102 | }
103 |
104 | public void setEntityQuery(int entityQuery) {
105 | this.entityQuery = entityQuery;
106 | }
107 |
108 | public void setGraphValue(double val) {
109 | this.score = val;
110 | }
111 |
112 | public double getScore() {
113 | return this.score;
114 | }
115 |
116 | public void setScore(double score) {
117 | this.score = score;
118 | }
119 |
120 | public String getDescription() {
121 | return description;
122 | }
123 |
124 | void setDescription(String description) {
125 | this.description = description;
126 | }
127 |
128 | public String getText() {
129 | return text;
130 | }
131 |
132 | public void setText(String text) {
133 | this.text = text;
134 | }
135 |
136 | public double getOccurrences() {
137 | return occurrences;
138 | }
139 |
140 | public void setOccurrences(int occurrences) {
141 | this.occurrences = Math.log10(occurrences + 1);
142 | }
143 |
144 | @Override
145 | public boolean equals(Object obj) {
146 | Vertex comp = (Vertex) obj;
147 | boolean isEqual = true;
148 | if (this.uris.size() != comp.getUris().size()
149 | || this.entityQuery != comp.getEntityQuery()) {
150 | return false;
151 | }
152 | for (int i = 0; i < uris.size(); ++i) {
153 | if (!uris.get(i).equalsIgnoreCase(comp.getUris().get(i))) {
154 | isEqual = false;
155 | break;
156 | }
157 | }
158 | return isEqual;
159 | }
160 |
161 | @Override
162 | public int hashCode() {
163 | return (generateUriHash(this.uris) + ((Integer) this.getEntityQuery())
164 | .hashCode());
165 | }
166 |
167 | private int generateUriHash(List uris) {
168 | int hash = 0;
169 | for (String uri : uris) {
170 | hash += uri.hashCode();
171 | }
172 | return hash;
173 | }
174 |
175 | /**
176 | * The return values are switched to provide a descending order when using
177 | * Collections.sort(), which generally provides an ascending sort order.
178 | *
179 | */
180 | @Override
181 | public int compareTo(Vertex o) {
182 | if (this.getOccurrences() < o.getOccurrences()) {
183 | return 1;
184 | } else if (this.getOccurrences() > o.getOccurrences()) {
185 | return 1;
186 | } else {
187 | return 0;
188 | }
189 | }
190 | }
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/general/CollectiveDisambiguationGeneralEntities.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.algorithms.collective.general;
2 |
3 | import java.io.IOException;
4 | import java.util.ArrayList;
5 | import java.util.Arrays;
6 | import java.util.LinkedList;
7 | import java.util.List;
8 |
9 | import org.apache.lucene.document.Document;
10 | import org.apache.lucene.index.IndexReader;
11 | import org.apache.lucene.index.Term;
12 | import org.apache.lucene.search.IndexSearcher;
13 | import org.apache.lucene.search.Query;
14 | import org.apache.lucene.search.ScoreDoc;
15 | import org.apache.lucene.search.TopDocs;
16 | import org.slf4j.Logger;
17 | import org.slf4j.LoggerFactory;
18 |
19 | import doser.entitydisambiguation.algorithms.AbstractDisambiguationAlgorithm;
20 | import doser.entitydisambiguation.algorithms.IllegalDisambiguationAlgorithmInputException;
21 | import doser.entitydisambiguation.algorithms.SurfaceForm;
22 | import doser.entitydisambiguation.backend.AbstractDisambiguationTask;
23 | import doser.entitydisambiguation.backend.DisambiguationTaskCollective;
24 | import doser.entitydisambiguation.dpo.EntityDisambiguationDPO;
25 | import doser.entitydisambiguation.dpo.Response;
26 | import doser.entitydisambiguation.knowledgebases.EntityCentricKnowledgeBase;
27 | import doser.lucene.query.TermQuery;
28 | import doser.entitydisambiguation.knowledgebases.AbstractKnowledgeBase;
29 | import doser.entitydisambiguation.knowledgebases.AbstractEntityCentricKBGeneral;
30 |
31 | public class CollectiveDisambiguationGeneralEntities extends AbstractDisambiguationAlgorithm {
32 |
33 | private final static Logger logger = LoggerFactory.getLogger(CollectiveDisambiguationGeneralEntities.class);
34 |
35 | private AbstractEntityCentricKBGeneral eckb;
36 |
37 | private DisambiguationTaskCollective task;
38 |
39 | @Override
40 | protected boolean checkAndSetInputParameter(AbstractDisambiguationTask task) {
41 | AbstractKnowledgeBase kb = task.getKb();
42 | if (!(task instanceof DisambiguationTaskCollective)) {
43 | return false;
44 | }
45 |
46 | this.eckb = (AbstractEntityCentricKBGeneral) kb;
47 | this.task = (DisambiguationTaskCollective) task;
48 | return true;
49 | }
50 |
51 | @Override
52 | protected void processAlgorithm() throws IllegalDisambiguationAlgorithmInputException {
53 | // AdditionalCandidateQuery aq = new AdditionalCandidateQuery(eckb);
54 | List entityList = task.getEntityToDisambiguate();
55 | Response[] responseArray = new Response[entityList.size()];
56 |
57 | List collectiveRep = new LinkedList();
58 | for (int i = 0; i < entityList.size(); i++) {
59 | EntityDisambiguationDPO dpo = entityList.get(i);
60 | // Dieser Fix sollte irgendwo anders passieren. TODO Auslagern
61 | dpo.setSelectedText(dpo.getSelectedText().replaceAll("’", "'"));
62 | Query query = createQuery(dpo.getSelectedText(), eckb);
63 | final IndexSearcher searcher = eckb.getSearcher();
64 | final IndexReader reader = searcher.getIndexReader();
65 | try {
66 | final TopDocs top = searcher.search(query, task.getReturnNr());
67 | final ScoreDoc[] score = top.scoreDocs;
68 | if (dpo.getSelectedText().equalsIgnoreCase("") || dpo.getSelectedText() == null) {
69 | ArrayList l = new ArrayList();
70 | l.add("");
71 | SurfaceForm col = new SurfaceForm(dpo.getSelectedText(), dpo.getContext(), l, i,
72 | dpo.getStartPosition());
73 | collectiveRep.add(col);
74 | } else if (score.length == 1) {
75 | final Document doc = reader.document(score[0].doc);
76 | ArrayList l = new ArrayList();
77 | l.add(doc.get("Mainlink"));
78 | SurfaceForm col = new SurfaceForm(dpo.getSelectedText(), dpo.getContext(), l, i,
79 | dpo.getStartPosition());
80 | col.setInitial(true);
81 | collectiveRep.add(col);
82 |
83 | } else if (score.length > 1) {
84 | ArrayList l = new ArrayList();
85 | for (int j = 0; j < score.length; j++) {
86 | final Document doc = reader.document(score[j].doc);
87 | l.add(doc.get("Mainlink"));
88 | }
89 | SurfaceForm col = new SurfaceForm(dpo.getSelectedText(), dpo.getContext(), l, i,
90 | dpo.getStartPosition());
91 | collectiveRep.add(col);
92 |
93 | } else {
94 | // SurfaceForm sf = aq.checkAdditionalSurfaceForms(dpo, i);
95 | // collectiveRep.add(sf);
96 | }
97 |
98 | } catch (final IOException e) {
99 | logger.error("JsonException in "+CollectiveDisambiguationGeneralEntities.class.getName(), e);
100 | }
101 | }
102 |
103 | CollectiveContextDriverGeneral solver = new CollectiveContextDriverGeneral(responseArray, collectiveRep, eckb);
104 | solver.solve();
105 |
106 | solver.generateResult();
107 | List res = Arrays.asList(responseArray);
108 | task.setResponse(res);
109 |
110 | eckb.release();
111 | }
112 |
113 | @Override
114 | protected boolean preDisambiguation() {
115 | return true;
116 | }
117 |
118 | private Query createQuery(String sf, EntityCentricKnowledgeBase kb) {
119 | String surfaceform = sf.toLowerCase();
120 | TermQuery query = new TermQuery(new Term("UniqueLabel", surfaceform));
121 |
122 | return query;
123 | }
124 | }
125 |
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/rules/ContextRule.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.algorithms.rules;
2 |
3 | import java.io.IOException;
4 | import java.util.HashSet;
5 | import java.util.LinkedList;
6 | import java.util.List;
7 | import java.util.Map;
8 | import java.util.Set;
9 |
10 | import org.apache.lucene.document.Document;
11 | import org.apache.lucene.index.Term;
12 | import org.apache.lucene.search.IndexSearcher;
13 | import org.apache.lucene.search.Query;
14 | import org.apache.lucene.search.ScoreDoc;
15 | import org.apache.lucene.search.TopDocs;
16 |
17 | import doser.entitydisambiguation.algorithms.SurfaceForm;
18 | import doser.entitydisambiguation.knowledgebases.EntityCentricKBDBpedia;
19 | import doser.lucene.query.TermQuery;
20 |
21 | class ContextRule extends AbstractRule {
22 |
23 | private static final int MINDISAMBIGUATEDSURFACEFORMS = 2;
24 |
25 | private static final int MINIMUMSURFACEFORMS = 10;
26 |
27 | private static final float SIMILARITYTHRESHOLD = 1.57f;
28 | private static final float SIMILARITYTHRESHOLDMISC = 1.53f;
29 |
30 | private EntityCentricKBDBpedia eckb;
31 |
32 | ContextRule(EntityCentricKBDBpedia eckb) {
33 | super(eckb);
34 | this.eckb = eckb;
35 | }
36 |
37 | @Override
38 | public boolean applyRule(List rep) {
39 | if (rep.size() > MINIMUMSURFACEFORMS) {
40 | List list = new LinkedList();
41 | for (SurfaceForm sf : rep) {
42 | if (rep.size() > 1 && sf.getCandidates().size() == 1 && sf.isInitial()) {
43 | list.add(sf.getCandidates().get(0));
44 | }
45 | }
46 | if (list.size() >= MINDISAMBIGUATEDSURFACEFORMS) {
47 | Set w2vFormatStrings = new HashSet();
48 | for (SurfaceForm sf : rep) {
49 | if (rep.size() > 1 && sf.getCandidates().size() > 1) {
50 | List l = sf.getCandidates();
51 | List bestCandidate = new LinkedList();
52 | Set levenshteinAdded = new HashSet();
53 | for (String s : l) {
54 | String query = this.eckb.generateWord2VecFormatString(list, s);
55 | w2vFormatStrings.add(query);
56 | Map similarityMap = this.eckb.getWord2VecSimilarities(w2vFormatStrings);
57 | float simValue = similarityMap.get(query);
58 | // Check for Appropriate entities
59 | String candidateWithoutUrl = s.replaceAll("http://dbpedia.org/resource/", "").toLowerCase();
60 | if (levenshteinDistance(candidateWithoutUrl, sf.getSurfaceForm().toLowerCase()) <= 2) {
61 | System.out.println("LEVENSHTEIN DISTANCE ENTITY: " + s);
62 | }
63 | if (simValue > SIMILARITYTHRESHOLD
64 | || (queryType(s).equalsIgnoreCase("Misc") && simValue > SIMILARITYTHRESHOLDMISC)) {
65 | bestCandidate.add(s);
66 | } else if (levenshteinDistance(candidateWithoutUrl,
67 | sf.getSurfaceForm().toLowerCase()) <= 2) {
68 | bestCandidate.add(s);
69 | levenshteinAdded.add(s);
70 | }
71 | }
72 | // Disambiguate and assign entity
73 | if (!bestCandidate.isEmpty()) {
74 | boolean notOnlyLevenshtein = false;
75 | for (String s : bestCandidate) {
76 | if (!levenshteinAdded.contains(s)) {
77 | notOnlyLevenshtein = true;
78 | }
79 | }
80 | if (notOnlyLevenshtein) {
81 | sf.setCandidates(bestCandidate);
82 | System.out.println("Es bleibt übrig SurfaceForm: " + sf.getSurfaceForm() + " +"
83 | + bestCandidate.toString());
84 | }
85 | }
86 | }
87 | }
88 | }
89 | }
90 | return false;
91 | }
92 |
93 | private String queryType(String url) {
94 | String type = "";
95 | IndexSearcher searcher = eckb.getSearcher();
96 | Query q = new TermQuery(new Term("Mainlink", url));
97 | try {
98 | TopDocs docs = searcher.search(q, 1);
99 | ScoreDoc[] scoredocs = docs.scoreDocs;
100 | if (scoredocs.length == 0) {
101 | type = "Misc";
102 | } else {
103 | int nr = scoredocs[0].doc;
104 | Document doc = searcher.getIndexReader().document(nr);
105 | type = doc.get("Type");
106 | }
107 | } catch (IOException e) {
108 | e.printStackTrace();
109 | }
110 | return type;
111 | }
112 |
113 | int levenshteinDistance(CharSequence lhs, CharSequence rhs) {
114 | int len0 = lhs.length() + 1;
115 | int len1 = rhs.length() + 1;
116 |
117 | // the array of distances
118 | int[] cost = new int[len0];
119 | int[] newcost = new int[len0];
120 |
121 | // initial cost of skipping prefix in String s0
122 | for (int i = 0; i < len0; i++)
123 | cost[i] = i;
124 |
125 | // dynamically computing the array of distances
126 |
127 | // transformation cost for each letter in s1
128 | for (int j = 1; j < len1; j++) {
129 | // initial cost of skipping prefix in String s1
130 | newcost[0] = j;
131 |
132 | // transformation cost for each letter in s0
133 | for (int i = 1; i < len0; i++) {
134 | // matching current letters in both strings
135 | int match = (lhs.charAt(i - 1) == rhs.charAt(j - 1)) ? 0 : 1;
136 |
137 | // computing cost for each transformation
138 | int cost_replace = cost[i - 1] + match;
139 | int cost_insert = cost[i] + 1;
140 | int cost_delete = newcost[i - 1] + 1;
141 |
142 | // keep minimum cost
143 | newcost[i] = Math.min(Math.min(cost_insert, cost_delete), cost_replace);
144 | }
145 |
146 | // swap cost/newcost arrays
147 | int[] swap = cost;
148 | cost = newcost;
149 | newcost = swap;
150 | }
151 |
152 | // the distance is the cost for transforming all letters in both strings
153 | return cost[len0 - 1];
154 | }
155 | }
156 |
--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/src/main/java/doser/server/actions/disambiguation/DisambiguationService.java:
--------------------------------------------------------------------------------
1 | package doser.server.actions.disambiguation;
2 |
3 | import java.util.LinkedList;
4 | import java.util.List;
5 |
6 | import org.springframework.stereotype.Controller;
7 | import org.springframework.web.bind.annotation.RequestBody;
8 | import org.springframework.web.bind.annotation.RequestMapping;
9 | import org.springframework.web.bind.annotation.RequestMethod;
10 | import org.springframework.web.bind.annotation.ResponseBody;
11 |
12 | import doser.entitydisambiguation.backend.DisambiguationMainService;
13 | import doser.entitydisambiguation.backend.AbstractDisambiguationTask;
14 | import doser.entitydisambiguation.backend.DisambiguationTaskCollective;
15 | import doser.entitydisambiguation.backend.DisambiguationTaskSingle;
16 | import doser.entitydisambiguation.dpo.DisambiguationRequest;
17 | import doser.entitydisambiguation.dpo.DisambiguationResponse;
18 | import doser.entitydisambiguation.dpo.EntityDisambiguationDPO;
19 | import doser.entitydisambiguation.dpo.Response;
20 | import doser.entitydisambiguation.properties.Properties;
21 |
22 | @Controller
23 | @RequestMapping("/disambiguation")
24 | public class DisambiguationService {
25 |
26 | public DisambiguationService() {
27 | super();
28 | }
29 |
30 | /**
31 | * Testing
32 | *
33 | * @param request
34 | * @return
35 | */
36 | @RequestMapping(value = "/disambiguateWithoutCategories-single", method = RequestMethod.POST, headers = "Accept=application/json")
37 | public @ResponseBody DisambiguationResponse annotateSingle(@RequestBody final DisambiguationRequest request) {
38 | DisambiguationResponse annotationResponse = disambiguateSingle(request);
39 | return annotationResponse;
40 | }
41 |
42 | @RequestMapping(value = "/disambiguationWithoutCategories-collective", method = RequestMethod.POST, headers = "Accept=application/json")
43 | public @ResponseBody DisambiguationResponse annotateCollectiveWithoutCategories(
44 | @RequestBody final DisambiguationRequest request) {
45 | final DisambiguationResponse response = new DisambiguationResponse();
46 | final DisambiguationMainService mainService = DisambiguationMainService.getInstance();
47 | final List listToDis = request.getSurfaceFormsToDisambiguate();
48 |
49 | if (mainService != null) {
50 | final List tasks = new LinkedList();
51 | DisambiguationTaskCollective collectiveTask = new DisambiguationTaskCollective(listToDis,
52 | request.getMainTopic());
53 | collectiveTask.setKbIdentifier("default", "EntityCentric");
54 | collectiveTask.setReturnNr(1000);
55 | tasks.add(collectiveTask);
56 | mainService.disambiguate(tasks);
57 |
58 | List responses = collectiveTask.getResponse();
59 | response.setTasks(responses);
60 | response.setDocumentUri(request.getDocumentUri());
61 | }
62 | return response;
63 | }
64 |
65 | @RequestMapping(value = "/disambiguationWithoutCategoriesBiomed-collective", method = RequestMethod.POST, headers = "Accept=application/json")
66 | public @ResponseBody DisambiguationResponse annotateCollectiveWithoutCategoriesBiomed(
67 | @RequestBody final DisambiguationRequest request) {
68 | final DisambiguationResponse response = new DisambiguationResponse();
69 | final DisambiguationMainService mainService = DisambiguationMainService.getInstance();
70 | final List listToDis = request.getSurfaceFormsToDisambiguate();
71 |
72 | if (mainService != null) {
73 | final List tasks = new LinkedList();
74 | DisambiguationTaskCollective collectiveTask = new DisambiguationTaskCollective(listToDis,
75 | request.getMainTopic());
76 | collectiveTask.setKbIdentifier("biomed", "EntityCentric");
77 | collectiveTask.setReturnNr(1000);
78 | tasks.add(collectiveTask);
79 | mainService.disambiguate(tasks);
80 |
81 | List responses = collectiveTask.getResponse();
82 | response.setTasks(responses);
83 | response.setDocumentUri(request.getDocumentUri());
84 | }
85 | return response;
86 | }
87 |
88 | private DisambiguationResponse disambiguateSingle(DisambiguationRequest request) {
89 | final DisambiguationResponse response = new DisambiguationResponse();
90 | final List listToDis = request.getSurfaceFormsToDisambiguate();
91 | List responseList = new LinkedList();
92 | response.setDocumentUri(request.getDocumentUri());
93 | final List tasks = new LinkedList();
94 | final DisambiguationMainService mainService = DisambiguationMainService.getInstance();
95 | if (mainService != null) {
96 | int docsToReturn = 0;
97 | if (request.getDocsToReturn() == null) {
98 | docsToReturn = Properties.getInstance().getDisambiguationResultSize();
99 | } else {
100 | docsToReturn = request.getDocsToReturn();
101 | }
102 | for (int i = 0; i < listToDis.size(); i++) {
103 | EntityDisambiguationDPO dpo = listToDis.get(i);
104 | DisambiguationTaskSingle task = new DisambiguationTaskSingle(dpo);
105 | task.setReturnNr(docsToReturn);
106 | task.setKbIdentifier(listToDis.get(i).getKbversion(), listToDis.get(i).getSetting());
107 | // Bugfix! Selected text may not be null. Should be ""
108 | // String instead;
109 | if (dpo.getSelectedText() != null) {
110 | tasks.add(task);
111 | }
112 | }
113 | mainService.disambiguate(tasks);
114 | }
115 |
116 | for (AbstractDisambiguationTask task : tasks) {
117 | responseList.add(task.getResponse().get(0));
118 | }
119 | response.setTasks(responseList);
120 | return response;
121 | }
122 | }
--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/dbpedia/Word2VecDisambiguator.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.algorithms.collective.dbpedia;
2 |
3 | import java.util.ArrayList;
4 | import java.util.BitSet;
5 | import java.util.Collection;
6 | import java.util.Collections;
7 | import java.util.HashMap;
8 | import java.util.List;
9 |
10 | import org.apache.commons.collections15.Factory;
11 | import org.apache.commons.collections15.functors.MapTransformer;
12 | import org.apache.commons.math.stat.descriptive.SummaryStatistics;
13 |
14 | import doser.entitydisambiguation.algorithms.SurfaceForm;
15 | import doser.entitydisambiguation.algorithms.collective.AbstractWord2VecPageRank;
16 | import doser.entitydisambiguation.algorithms.collective.Edge;
17 | import doser.entitydisambiguation.algorithms.collective.Vertex;
18 | import doser.entitydisambiguation.knowledgebases.EntityCentricKBDBpedia;
19 | import edu.uci.ics.jung.algorithms.scoring.PageRankWithPriors;
20 | import edu.uci.ics.jung.graph.DirectedSparseMultigraph;
21 |
22 | class Word2VecDisambiguator extends AbstractWord2VecPageRank {
23 |
24 | // private static final int MAXIMUMCANDIDATESPERSF = 8;
25 |
26 | private List origList;
27 |
28 | private boolean disambiguate;
29 |
30 | private int maximumcandidatespersf;
31 |
32 | private int iterations;
33 |
34 |
35 | Word2VecDisambiguator(EntityCentricKBDBpedia eckb,
36 | List rep, boolean disambiguate, int maximumcandidatespersf, int iterations) {
37 | super(eckb, rep);
38 | this.origList = new ArrayList();
39 | this.disambiguate = disambiguate;
40 | this.maximumcandidatespersf = maximumcandidatespersf;
41 | this.iterations = iterations;
42 | }
43 |
44 | @Override
45 | public void setup() {
46 | this.graph = new DirectedSparseMultigraph();
47 | this.edgeWeights = new HashMap();
48 | this.edgeFactory = new Factory() {
49 | int i = 0;
50 |
51 | public Integer create() {
52 | return i++;
53 | }
54 | };
55 |
56 | for (SurfaceForm sf : repList) {
57 | SurfaceForm clone = (SurfaceForm) sf.clone();
58 | this.origList.add(clone);
59 | }
60 |
61 | this.disambiguatedSurfaceForms = new BitSet(repList.size());
62 | for (int i = 0; i < repList.size(); i++) {
63 | if (repList.get(i).getCandidates().size() <= 1) {
64 | this.disambiguatedSurfaceForms.set(i);
65 | }
66 | }
67 | buildMainGraph();
68 | }
69 |
70 | @Override
71 | protected PageRankWithPriors performPageRank() {
72 | PageRankWithPriors pr = new PageRankWithPriors(
73 | graph, MapTransformer.getInstance(edgeWeights),
74 | getRootPrior(graph.getVertices()), 0.09);
75 | pr.setMaxIterations(iterations);
76 | pr.evaluate();
77 | return pr;
78 | }
79 |
80 | @Override
81 | public boolean analyzeResults(PageRankWithPriors pr) {
82 | boolean disambiguationStop = true;
83 | Collection vertexCol = graph.getVertices();
84 | for (int i = 0; i < repList.size(); i++) {
85 | if (!disambiguatedSurfaceForms.get(i) && repList.get(i).isRelevant()) {
86 | int qryNr = repList.get(i).getQueryNr();
87 | double maxScore = 0;
88 | SummaryStatistics stats = new SummaryStatistics();
89 | String tempSolution = "";
90 | List scores = new ArrayList();
91 | for (Vertex v : vertexCol) {
92 | if (v.getEntityQuery() == qryNr && v.isCandidate()) {
93 | scores.add(new Candidate(v.getUris().get(0), pr
94 | .getVertexScore(v)));
95 | double score = Math.abs(pr.getVertexScore(v));
96 | stats.addValue(score);
97 | if (score > maxScore) {
98 | tempSolution = v.getUris().get(0);
99 | maxScore = score;
100 | }
101 | }
102 | }
103 | SurfaceForm rep = repList.get(i);
104 | SurfaceForm clone = origList.get(i);
105 | Collections.sort(scores, Collections.reverseOrder());
106 | double secondMax = scores.get(1).score;
107 |
108 | List newCandidates = new ArrayList();
109 | for(int j = 0; j < maximumcandidatespersf; j++) {
110 | if(scores.size() > j) {
111 | newCandidates.add(scores.get(j).can);
112 | } else {
113 | break;
114 | }
115 | }
116 |
117 | if (!Double.isInfinite(maxScore)) {
118 | double avg = stats.getMean();
119 | double threshold = computeThreshold(avg, maxScore);
120 | if (secondMax < threshold && disambiguate) {
121 | updateGraph(rep.getCandidates(), tempSolution,
122 | rep.getQueryNr());
123 | rep.setDisambiguatedEntity(tempSolution);
124 | clone.setDisambiguatedEntity(tempSolution);
125 | disambiguatedSurfaceForms.set(i);
126 | disambiguationStop = false;
127 | break;
128 | } else {
129 | clone.setCandidates(newCandidates);
130 | }
131 | }
132 | }
133 | }
134 | return disambiguationStop;
135 | }
136 |
137 | /**
138 | * Threshold Computation // IMPORTANT DISAMBIGUATION PARAMETER
139 | *
140 | * @param avg
141 | * @param highest
142 | * @return
143 | */
144 | private double computeThreshold(double avg, double highest) {
145 | double diff = highest - avg;
146 | double min = diff * 0.5;
147 | return highest - min;
148 | }
149 |
150 | @Override
151 | public List getRepresentation() {
152 | return this.origList;
153 | }
154 |
155 | class Candidate implements Comparable {
156 | private double score;
157 | private String can;
158 |
159 | Candidate(String can, double score) {
160 | super();
161 | this.score = score;
162 | this.can = can;
163 | }
164 |
165 | @Override
166 | public int compareTo(Candidate o) {
167 | if (score < o.score) {
168 | return -1;
169 | } else if (score > o.score) {
170 | return 1;
171 | } else {
172 | return 0;
173 | }
174 | }
175 | }
176 | }
177 |
--------------------------------------------------------------------------------