├── doser-dis-core
    ├── .gitignore
    ├── .settings
    │   ├── org.eclipse.wst.jsdt.ui.superType.name
    │   ├── org.eclipse.wst.validation.prefs
    │   ├── org.eclipse.wst.jsdt.ui.superType.container
    │   ├── org.eclipse.m2e.wtp.prefs
    │   ├── org.eclipse.m2e.core.prefs
    │   ├── org.eclipse.wst.ws.service.policy.prefs
    │   ├── org.eclipse.wst.common.project.facet.core.prefs.xml
    │   ├── org.eclipse.wst.common.project.facet.core.xml
    │   ├── .jsdtscope
    │   ├── org.eclipse.jdt.core.prefs
    │   └── org.eclipse.wst.common.component
    ├── src
    │   ├── main
    │   │   ├── java
    │   │   │   └── doser
    │   │   │   │   ├── language
    │   │   │   │       └── Languages.java
    │   │   │   │   ├── entitydisambiguation
    │   │   │   │       ├── dpo
    │   │   │   │       │   ├── package-info.java
    │   │   │   │       │   ├── DisambiguatedEntity.java
    │   │   │   │       │   ├── DisambiguationResponse.java
    │   │   │   │       │   ├── Response.java
    │   │   │   │       │   ├── DisambiguationRequest.java
    │   │   │   │       │   └── EntityDisambiguationDPO.java
    │   │   │   │       ├── knowledgebases
    │   │   │   │       │   ├── KnowledgeBaseIdentifiers.java
    │   │   │   │       │   ├── DocumentCentricKnowledgeBaseDefault.java
    │   │   │   │       │   ├── EntityCentricKBDBpedia.java
    │   │   │   │       │   ├── AbstractKnowledgeBase.java
    │   │   │   │       │   └── EntityCentricKBBiomed.java
    │   │   │   │       ├── algorithms
    │   │   │   │       │   ├── IllegalDisambiguationAlgorithmInputException.java
    │   │   │   │       │   ├── rules
    │   │   │   │       │   │   ├── AbstractRule.java
    │   │   │   │       │   │   ├── RuleAdapation.java
    │   │   │   │       │   │   ├── NoCandidatesCheckPlural.java
    │   │   │   │       │   │   ├── CheckGeneralEntities.java
    │   │   │   │       │   │   ├── UnambiguousToAmbiguousRule.java
    │   │   │   │       │   │   ├── NoCandidatesExpansionRules.java
    │   │   │   │       │   │   └── ContextRule.java
    │   │   │   │       │   ├── Candidate.java
    │   │   │   │       │   ├── collective
    │   │   │   │       │   │   ├── Edge.java
    │   │   │   │       │   │   ├── dbpedia
    │   │   │   │       │   │   │   ├── CandidateReductionDBpediaW2V.java
    │   │   │   │       │   │   │   ├── TableColumnFilter.java
    │   │   │   │       │   │   │   ├── CollectiveAndContextDriver.java
    │   │   │   │       │   │   │   └── Word2VecDisambiguator.java
    │   │   │   │       │   │   ├── general
    │   │   │   │       │   │   │   ├── CandidateReductionGeneralW2V.java
    │   │   │   │       │   │   │   ├── CollectiveContextDriverGeneral.java
    │   │   │   │       │   │   │   └── CollectiveDisambiguationGeneralEntities.java
    │   │   │   │       │   │   ├── CandidateReduction.java
    │   │   │   │       │   │   ├── CandidatePruning.java
    │   │   │   │       │   │   └── Vertex.java
    │   │   │   │       │   ├── AbstractDisambiguationAlgorithm.java
    │   │   │   │       │   ├── DisambiguationHandler.java
    │   │   │   │       │   └── SurfaceForm.java
    │   │   │   │       ├── backend
    │   │   │   │       │   ├── AbstractDisambiguationTask.java
    │   │   │   │       │   ├── DisambiguationTaskSingle.java
    │   │   │   │       │   └── DisambiguationTaskCollective.java
    │   │   │   │       └── properties
    │   │   │   │       │   └── Properties.java
    │   │   │   │   ├── word2vec
    │   │   │   │       ├── Doc2VecJsonFormat.java
    │   │   │   │       ├── Data.java
    │   │   │   │       └── Word2VecJsonFormat.java
    │   │   │   │   └── tools
    │   │   │   │       ├── ServiceQueries.java
    │   │   │   │       └── NTToDbPediaUrlEncoding.java
    │   │   └── resources
    │   │   │   ├── application.properties
    │   │   │   └── disambiguation.properties
    │   └── test
    │   │   └── java
    │   │       └── doser
    │   │           └── test
    │   │               └── breakdetection
    │   │                   └── BreakDetection.java
    ├── .classpath
    ├── .project
    └── pom.xml
├── doser-dis-extensions
    ├── .gitignore
    ├── .settings
    │   ├── org.eclipse.wst.jsdt.ui.superType.name
    │   ├── org.eclipse.wst.jsdt.ui.superType.container
    │   ├── org.eclipse.wst.validation.prefs
    │   ├── org.eclipse.m2e.wtp.prefs
    │   ├── org.eclipse.m2e.core.prefs
    │   ├── org.eclipse.wst.ws.service.policy.prefs
    │   ├── org.eclipse.wst.common.project.facet.core.prefs.xml
    │   ├── org.eclipse.wst.common.project.facet.core.xml
    │   ├── .jsdtscope
    │   ├── org.eclipse.jdt.core.prefs
    │   ├── org.eclipse.wst.common.component
    │   └── org.eclipse.jdt.ui.prefs
    ├── src
    │   └── main
    │   │   ├── resources
    │   │       └── application.properties
    │   │   └── java
    │   │       └── doser
    │   │           ├── lucene
    │   │               ├── features
    │   │               │   ├── DocCenExtFeatures.java
    │   │               │   ├── IEntityCentricExtFeatures.java
    │   │               │   └── LuceneFeatures.java
    │   │               ├── analysis
    │   │               │   ├── DoserIDFilter.java
    │   │               │   ├── DoserIDTokenizer.java
    │   │               │   ├── DoserStandardTokenizer.java
    │   │               │   ├── DoserIDAnalyzer.java
    │   │               │   └── DoserStandardAnalyzer.java
    │   │               └── query
    │   │               │   ├── LTRBooleanQuery.java
    │   │               │   ├── LearnToRankFeatureDefaultValueManager.java
    │   │               │   ├── LearnToRankClause.java
    │   │               │   ├── LearnToRankTermScorer.java
    │   │               │   ├── PriorQuery.java
    │   │               │   ├── SensePriorQuery.java
    │   │               │   └── ConjunctionScorer.java
    │   │           ├── algorithms
    │   │               └── MajorityVoteAlgorithm.java
    │   │           ├── general
    │   │               ├── HelpfulMethods.java
    │   │               └── Test.java
    │   │           └── nlp
    │   │               └── NLPTools.java
    ├── pom.xml
    ├── .project
    └── .classpath
├── doser-dis-disambiguationserver
    ├── .gitignore
    ├── .settings
    │   ├── org.eclipse.wst.jsdt.ui.superType.name
    │   ├── org.eclipse.wst.jsdt.ui.superType.container
    │   ├── org.eclipse.wst.validation.prefs
    │   ├── org.eclipse.m2e.wtp.prefs
    │   ├── org.eclipse.m2e.core.prefs
    │   ├── org.eclipse.wst.ws.service.policy.prefs
    │   ├── org.eclipse.wst.common.project.facet.core.prefs.xml
    │   ├── org.eclipse.wst.common.project.facet.core.xml
    │   ├── .jsdtscope
    │   ├── org.eclipse.jdt.core.prefs
    │   └── org.eclipse.wst.common.component
    ├── src
    │   └── main
    │   │   ├── resources
    │   │       ├── application.properties
    │   │       └── log4j.xml
    │   │   ├── java
    │   │       └── doser
    │   │       │   └── server
    │   │       │       └── actions
    │   │       │           ├── package-info.java
    │   │       │           ├── FrameworkInitialization.java
    │   │       │           └── disambiguation
    │   │       │               └── DisambiguationService.java
    │   │   └── webapp
    │   │       └── WEB-INF
    │   │           ├── applicationContext.xml
    │   │           ├── web.xml
    │   │           └── dispatcher-servlet.xml
    ├── .classpath
    ├── .project
    └── pom.xml
├── Word2VecRestInterface
    ├── .idea
    │   ├── .name
    │   ├── scopes
    │   │   └── scope_settings.xml
    │   ├── encodings.xml
    │   ├── vcs.xml
    │   ├── Word2VecRestInterface.iml
    │   ├── modules.xml
    │   └── misc.xml
    ├── startserver
    └── config.ini
├── .settings
    ├── org.eclipse.m2e.core.prefs
    ├── org.eclipse.jdt.core.prefs
    └── org.eclipse.jst.jsp.core.prefs
├── .classpath
├── yes.pub
├── pom.xml
├── .project
├── README.md
└── yes


/doser-dis-core/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | 


--------------------------------------------------------------------------------
/doser-dis-extensions/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | 


--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | 


--------------------------------------------------------------------------------
/Word2VecRestInterface/.idea/.name:
--------------------------------------------------------------------------------
1 | Word2VecRestInterface


--------------------------------------------------------------------------------
/doser-dis-core/.settings/org.eclipse.wst.jsdt.ui.superType.name:
--------------------------------------------------------------------------------
1 | Window


--------------------------------------------------------------------------------
/doser-dis-extensions/.settings/org.eclipse.wst.jsdt.ui.superType.name:
--------------------------------------------------------------------------------
1 | Window


--------------------------------------------------------------------------------
/Word2VecRestInterface/startserver:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | nohup python Word2VecRest.py &
3 | 


--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/.settings/org.eclipse.wst.jsdt.ui.superType.name:
--------------------------------------------------------------------------------
1 | Window


--------------------------------------------------------------------------------
/doser-dis-core/.settings/org.eclipse.wst.validation.prefs:
--------------------------------------------------------------------------------
1 | disabled=06target
2 | eclipse.preferences.version=1
3 | 


--------------------------------------------------------------------------------
/doser-dis-core/.settings/org.eclipse.wst.jsdt.ui.superType.container:
--------------------------------------------------------------------------------
1 | org.eclipse.wst.jsdt.launching.baseBrowserLibrary


--------------------------------------------------------------------------------
/doser-dis-extensions/.settings/org.eclipse.wst.jsdt.ui.superType.container:
--------------------------------------------------------------------------------
1 | org.eclipse.wst.jsdt.launching.baseBrowserLibrary


--------------------------------------------------------------------------------
/doser-dis-extensions/.settings/org.eclipse.wst.validation.prefs:
--------------------------------------------------------------------------------
1 | disabled=06target
2 | eclipse.preferences.version=1
3 | 


--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/.settings/org.eclipse.wst.jsdt.ui.superType.container:
--------------------------------------------------------------------------------
1 | org.eclipse.wst.jsdt.launching.baseBrowserLibrary


--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/.settings/org.eclipse.wst.validation.prefs:
--------------------------------------------------------------------------------
1 | disabled=06target
2 | eclipse.preferences.version=1
3 | 


--------------------------------------------------------------------------------
/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles=
2 | eclipse.preferences.version=1
3 | resolveWorkspaceProjects=true
4 | version=1
5 | 


--------------------------------------------------------------------------------
/doser-dis-core/.settings/org.eclipse.m2e.wtp.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.m2e.wtp.enabledProjectSpecificPrefs=false
3 | 


--------------------------------------------------------------------------------
/doser-dis-extensions/.settings/org.eclipse.m2e.wtp.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.m2e.wtp.enabledProjectSpecificPrefs=false
3 | 


--------------------------------------------------------------------------------
/doser-dis-core/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles=
2 | eclipse.preferences.version=1
3 | resolveWorkspaceProjects=true
4 | version=1
5 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/language/Languages.java:
--------------------------------------------------------------------------------
1 | package doser.language;
2 | 
/**
 * Language identifiers: {@code english}, {@code german} and {@code other}.
 * The constant names are lower-case and their declaration order is fixed.
 */
public enum Languages {
	english,
	german,
	other
}
6 | 


--------------------------------------------------------------------------------
/doser-dis-core/.settings/org.eclipse.wst.ws.service.policy.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.wst.ws.service.policy.projectEnabled=false
3 | 


--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/.settings/org.eclipse.m2e.wtp.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.m2e.wtp.enabledProjectSpecificPrefs=false
3 | 


--------------------------------------------------------------------------------
/doser-dis-extensions/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles=
2 | eclipse.preferences.version=1
3 | resolveWorkspaceProjects=true
4 | version=1
5 | 


--------------------------------------------------------------------------------
/doser-dis-extensions/.settings/org.eclipse.wst.ws.service.policy.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.wst.ws.service.policy.projectEnabled=false
3 | 


--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles=
2 | eclipse.preferences.version=1
3 | resolveWorkspaceProjects=true
4 | version=1
5 | 


--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/.settings/org.eclipse.wst.ws.service.policy.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.wst.ws.service.policy.projectEnabled=false
3 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/resources/application.properties:
--------------------------------------------------------------------------------
1 | application.name = ${project.name}
2 | application.artifactId = ${project.artifactId}
3 | application.version = ${project.version}


--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/resources/application.properties:
--------------------------------------------------------------------------------
1 | application.name = ${project.name}
2 | application.artifactId = ${project.artifactId}
3 | application.version = ${project.version}


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/dpo/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * Data Presentation Objects (DPO) for model input and output
3 |  */
4 | package doser.entitydisambiguation.dpo;


--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/src/main/resources/application.properties:
--------------------------------------------------------------------------------
1 | application.name = ${project.name}
2 | application.artifactId = ${project.artifactId}
3 | application.version = ${project.version}


--------------------------------------------------------------------------------
/Word2VecRestInterface/.idea/scopes/scope_settings.xml:
--------------------------------------------------------------------------------
1 | <component name="DependencyValidationManager">
2 |   <state>
3 |     <option name="SKIP_IMPORT_STATEMENTS" value="false" />
4 |   </state>
5 | </component>


--------------------------------------------------------------------------------
/Word2VecRestInterface/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="Encoding" useUTFGuessing="true" native2AsciiForPropertiesFiles="false" />
4 | </project>


--------------------------------------------------------------------------------
/Word2VecRestInterface/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="" vcs="" />
5 |   </component>
6 | </project>


--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/src/main/java/doser/server/actions/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * Package for all server actions. Use the "@Controller" class annotation to add a new action class.
3 |  */
4 | package doser.server.actions;


--------------------------------------------------------------------------------
/doser-dis-core/.settings/org.eclipse.wst.common.project.facet.core.prefs.xml:
--------------------------------------------------------------------------------
1 | <root>
2 |   <facet id="jst.jaxrs">
3 |     <node name="libprov">
4 |       <attribute name="provider-id" value="jaxrs-no-op-library-provider"/>
5 |     </node>
6 |   </facet>
7 | </root>
8 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/knowledgebases/KnowledgeBaseIdentifiers.java:
--------------------------------------------------------------------------------
1 | package doser.entitydisambiguation.knowledgebases;
2 | 
/**
 * Identifiers used to refer to a knowledge base: {@code Standard},
 * {@code CSTable}, {@code Biomed} and {@code DocumentCentricDefault}.
 */
public enum KnowledgeBaseIdentifiers {
	Standard,
	CSTable,
	Biomed,
	DocumentCentricDefault
}
6 | 


--------------------------------------------------------------------------------
/doser-dis-extensions/.settings/org.eclipse.wst.common.project.facet.core.prefs.xml:
--------------------------------------------------------------------------------
1 | <root>
2 |   <facet id="jst.jaxrs">
3 |     <node name="libprov">
4 |       <attribute name="provider-id" value="jaxrs-no-op-library-provider"/>
5 |     </node>
6 |   </facet>
7 | </root>
8 | 


--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/.settings/org.eclipse.wst.common.project.facet.core.prefs.xml:
--------------------------------------------------------------------------------
1 | <root>
2 |   <facet id="jst.jaxrs">
3 |     <node name="libprov">
4 |       <attribute name="provider-id" value="jaxrs-no-op-library-provider"/>
5 |     </node>
6 |   </facet>
7 | </root>
8 | 


--------------------------------------------------------------------------------
/.classpath:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <classpath>
3 | 	<classpathentry kind="src" path="src"/>
4 | 	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.7"/>
5 | 	<classpathentry kind="output" path="bin"/>
6 | </classpath>
7 | 


--------------------------------------------------------------------------------
/Word2VecRestInterface/.idea/Word2VecRestInterface.iml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <module type="PYTHON_MODULE" version="4">
3 |   <component name="NewModuleRootManager">
4 |     <content url="file://$MODULE_DIR$" />
5 |     <orderEntry type="inheritedJdk" />
6 |     <orderEntry type="sourceFolder" forTests="false" />
7 |   </component>
8 | </module>


--------------------------------------------------------------------------------
/doser-dis-core/.settings/org.eclipse.wst.common.project.facet.core.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <faceted-project>
3 |   <fixed facet="wst.jsdt.web"/>
4 |   <installed facet="wst.jsdt.web" version="1.0"/>
5 |   <installed facet="jst.jaxrs" version="1.1"/>
6 |   <installed facet="java" version="1.7"/>
7 |   <installed facet="jst.web" version="3.0"/>
8 | </faceted-project>
9 | 


--------------------------------------------------------------------------------
/doser-dis-extensions/.settings/org.eclipse.wst.common.project.facet.core.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <faceted-project>
3 |   <fixed facet="wst.jsdt.web"/>
4 |   <installed facet="jst.web" version="2.5"/>
5 |   <installed facet="wst.jsdt.web" version="1.0"/>
6 |   <installed facet="jst.jaxrs" version="1.1"/>
7 |   <installed facet="java" version="1.7"/>
8 | </faceted-project>
9 | 


--------------------------------------------------------------------------------
/Word2VecRestInterface/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectModuleManager">
4 |     <modules>
5 |       <module fileurl="file://$PROJECT_DIR$/../Word2VecRestInterface/.idea/Word2VecRestInterface.iml" filepath="$PROJECT_DIR$/../Word2VecRestInterface/.idea/Word2VecRestInterface.iml" />
6 |     </modules>
7 |   </component>
8 | </project>


--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/.settings/org.eclipse.wst.common.project.facet.core.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <faceted-project>
3 |   <fixed facet="wst.jsdt.web"/>
4 |   <installed facet="jst.web" version="2.5"/>
5 |   <installed facet="wst.jsdt.web" version="1.0"/>
6 |   <installed facet="jst.jaxrs" version="1.1"/>
7 |   <installed facet="java" version="1.7"/>
8 | </faceted-project>
9 | 


--------------------------------------------------------------------------------
/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7
4 | org.eclipse.jdt.core.compiler.compliance=1.7
5 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
6 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
7 | org.eclipse.jdt.core.compiler.source=1.7
8 | 


--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/lucene/features/DocCenExtFeatures.java:
--------------------------------------------------------------------------------
 1 | package doser.lucene.features;
 2 | 
 3 | /**
 4 |  * Interface to specify an external Lucene feature set for a document-centric
 5 |  * knowledge base. External features are features not integrated in Apache
 6 |  * Lucene.
 7 |  * 
 8 |  * @author Stefan Zwicklbauer
 9 |  * 
10 |  */
11 | 
12 | public interface DocCenExtFeatures {
13 | 
14 | }
15 | 


--------------------------------------------------------------------------------
/Word2VecRestInterface/config.ini:
--------------------------------------------------------------------------------
1 | [Word2VecRest]
2 | embeddings_w2v_wikipedia = /mnt/ssd1/disambiguation/word2vec/WikiEntityModel_400_neg10_iter5.seq
3 | embeddings_w2v_calbc = /mnt/ssd1/disambiguation/word2vec/calbcsmall_model_sg_500.bin
4 | embeddings_d2v_wikipedia = /mnt/ssd1/disambiguation/word2vec/doc2vec/Wiki_Standard_Model/doc2vec_wiki_model.d2v
5 | embeddings_d2v_wikipedia_german = /mnt/ssd1/disambiguation/word2vec/doc2vec/Wikipedia_Standard_German/doc2vec_model_german.d2v
6 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/IllegalDisambiguationAlgorithmInputException.java:
--------------------------------------------------------------------------------
 1 | package doser.entitydisambiguation.algorithms;
 2 | 
 3 | public class IllegalDisambiguationAlgorithmInputException extends
 4 | 		IllegalArgumentException {
 5 | 
 6 | 	private static final long serialVersionUID = 1L;
 7 | 
 8 | 	IllegalDisambiguationAlgorithmInputException() {
 9 | 		super("Wrong Knowledge base!");
10 | 	}
11 | 
12 | 	IllegalDisambiguationAlgorithmInputException(String text) {
13 | 		super(text);
14 | 	}
15 | 
16 | }
17 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/resources/disambiguation.properties:
--------------------------------------------------------------------------------
1 | application.name = ${project.name}
2 | application.artifactId = ${project.artifactId}
3 | application.version = ${project.version}
4 | luceneversion = 4.7.0
5 | disambiguation.entityCentricKBWikipedia = /mnt/ssd1/disambiguation/LuceneIndex/Wikipedia_Default_Aida_Sigir/
6 | disambiguation.entityCentricBiomedCalbC= /mnt/ssd1/disambiguation/LuceneIndex/Biomed_CalbCSmall/
7 | disambiguation.returnSize = 10
8 | disambiguation.Word2VecService = http://theseus.dimis.fim.uni-passau.de:80/Word2VecRest/
9 | candidateExpansion = false


--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/lucene/analysis/DoserIDFilter.java:
--------------------------------------------------------------------------------
 1 | package doser.lucene.analysis;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | import org.apache.lucene.analysis.TokenFilter;
 6 | import org.apache.lucene.analysis.TokenStream;
 7 | 
 8 | public class DoserIDFilter extends TokenFilter {
 9 | 
10 | 	public DoserIDFilter(TokenStream in) {
11 | 		super(in);
12 | 	}
13 | 
14 | 	@Override
15 | 	public boolean incrementToken() throws IOException {
16 | 		if (!input.incrementToken()) {
17 | 			return false;
18 | 		}
19 | 		return true;
20 | 	}
21 | 
22 | }
23 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/word2vec/Doc2VecJsonFormat.java:
--------------------------------------------------------------------------------
 1 | package doser.word2vec;
 2 | 
 3 | import java.util.ArrayList;
 4 | import java.util.List;
 5 | 
 6 | public class Doc2VecJsonFormat {
 7 | 
 8 | 	private List<Data> data;
 9 | 
10 | 	public Doc2VecJsonFormat() {
11 | 		super();
12 | 		this.data = new ArrayList<Data>();
13 | 	}
14 | 
15 | 	public List<Data> getData() {
16 | 		return data;
17 | 	}
18 | 
19 | 	public void setData(List<Data> data) {
20 | 		this.data = data;
21 | 	}
22 | 
23 | 	public void addData(Data doc) {
24 | 		this.data.add(doc);
25 | 	}
26 | 	
27 | }
28 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/rules/AbstractRule.java:
--------------------------------------------------------------------------------
 1 | package doser.entitydisambiguation.algorithms.rules;
 2 | 
 3 | import java.util.List;
 4 | 
 5 | import doser.entitydisambiguation.algorithms.SurfaceForm;
 6 | import doser.entitydisambiguation.knowledgebases.AbstractKnowledgeBase;
 7 | 
 8 | abstract class AbstractRule {
 9 | 
10 | 	protected AbstractKnowledgeBase eckb;
11 | 	
12 | 	AbstractRule(AbstractKnowledgeBase eckb) {
13 | 		super();
14 | 		this.eckb = eckb;
15 | 	}
16 | 	
17 | 	abstract boolean applyRule(List<SurfaceForm> rep);
18 | 	
19 | }
20 | 


--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/src/main/webapp/WEB-INF/applicationContext.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <beans xmlns="http://www.springframework.org/schema/beans"
 3 | 	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
 4 | 	xmlns:context="http://www.springframework.org/schema/context"
 5 | 	xsi:schemaLocation="http://www.springframework.org/schema/beans 
 6 |            http://www.springframework.org/schema/beans/spring-beans-3.0.xsd
 7 |            http://www.springframework.org/schema/context
 8 |            http://www.springframework.org/schema/context/spring-context-3.0.xsd">
 9 | 
10 | </beans>


--------------------------------------------------------------------------------
/doser-dis-core/.settings/.jsdtscope:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <classpath>
 3 | 	<classpathentry kind="src" path="src/main/webapp"/>
 4 | 	<classpathentry kind="src" path="target/m2e-wtp/web-resources"/>
 5 | 	<classpathentry kind="con" path="org.eclipse.wst.jsdt.launching.JRE_CONTAINER"/>
 6 | 	<classpathentry kind="con" path="org.eclipse.wst.jsdt.launching.WebProject">
 7 | 		<attributes>
 8 | 			<attribute name="hide" value="true"/>
 9 | 		</attributes>
10 | 	</classpathentry>
11 | 	<classpathentry kind="con" path="org.eclipse.wst.jsdt.launching.baseBrowserLibrary"/>
12 | 	<classpathentry kind="output" path=""/>
13 | </classpath>
14 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/knowledgebases/DocumentCentricKnowledgeBaseDefault.java:
--------------------------------------------------------------------------------
 1 | package doser.entitydisambiguation.knowledgebases;
 2 | 
 3 | import org.apache.lucene.search.similarities.Similarity;
 4 | 
 5 | public class DocumentCentricKnowledgeBaseDefault extends AbstractKnowledgeBase  {
 6 | 
 7 | 	public DocumentCentricKnowledgeBaseDefault(String uri, boolean dynamic,
 8 | 			Similarity sim) {
 9 | 		super(uri, dynamic, sim);
10 | 	}
11 | 
12 | 	public DocumentCentricKnowledgeBaseDefault(String uri, boolean dynamic) {
13 | 		super(uri, dynamic);
14 | 	}
15 | 
16 | 	@Override
17 | 	public void initialize() {
18 | 	}
19 | }
20 | 


--------------------------------------------------------------------------------
/doser-dis-extensions/.settings/.jsdtscope:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <classpath>
 3 | 	<classpathentry kind="src" path="src/main/webapp"/>
 4 | 	<classpathentry kind="src" path="target/m2e-wtp/web-resources"/>
 5 | 	<classpathentry kind="con" path="org.eclipse.wst.jsdt.launching.JRE_CONTAINER"/>
 6 | 	<classpathentry kind="con" path="org.eclipse.wst.jsdt.launching.WebProject">
 7 | 		<attributes>
 8 | 			<attribute name="hide" value="true"/>
 9 | 		</attributes>
10 | 	</classpathentry>
11 | 	<classpathentry kind="con" path="org.eclipse.wst.jsdt.launching.baseBrowserLibrary"/>
12 | 	<classpathentry kind="output" path=""/>
13 | </classpath>
14 | 


--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/.settings/.jsdtscope:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <classpath>
 3 | 	<classpathentry kind="src" path="src/main/webapp"/>
 4 | 	<classpathentry kind="src" path="target/m2e-wtp/web-resources"/>
 5 | 	<classpathentry kind="con" path="org.eclipse.wst.jsdt.launching.JRE_CONTAINER"/>
 6 | 	<classpathentry kind="con" path="org.eclipse.wst.jsdt.launching.WebProject">
 7 | 		<attributes>
 8 | 			<attribute name="hide" value="true"/>
 9 | 		</attributes>
10 | 	</classpathentry>
11 | 	<classpathentry kind="con" path="org.eclipse.wst.jsdt.launching.baseBrowserLibrary"/>
12 | 	<classpathentry kind="output" path=""/>
13 | </classpath>
14 | 


--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/src/main/resources/log4j.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8" ?>
 2 | <!DOCTYPE log4j:configuration SYSTEM "http://logging.apache.org/log4j/1.2/apidocs/org/apache/log4j/xml/doc-files/log4j.dtd">
 3 | 
 4 | <log4j:configuration xmlns:log4j="http://jakarta.apache.org/log4j/">
 5 | 	<appender name="console" class="org.apache.log4j.ConsoleAppender">
 6 | 		<layout class="org.apache.log4j.PatternLayout">
 7 | 			<param name="ConversionPattern" value="%5p [%t] (%F:%M:%L) - %m%n" />
 8 | 		</layout>
 9 | 	</appender>
10 |     
11 | 	<root>
12 | 		<priority value="INFO" />
13 | 		<appender-ref ref="console"/>
14 | 	</root>
15 | </log4j:configuration>


--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/lucene/features/IEntityCentricExtFeatures.java:
--------------------------------------------------------------------------------
 1 | package doser.lucene.features;
 2 | 
 3 | import java.util.Set;
 4 | 
 5 | /**
 6 |  * Interface to specify an external Lucene feature set for an entity-centric
 7 |  * knowledge base. External features are features not integrated in Apache
 8 |  * Lucene.
 9 |  * 
10 |  * @author Stefan Zwicklbauer
11 |  * 
12 |  */
13 | public interface IEntityCentricExtFeatures {
14 | 
15 | 	public float getPriorOfDocument(final int docId);
16 | 
17 | 	public float getSensePriorOfDocument(final String keyword, final int docId);
18 | 	
19 | 	public Set<String> getRelations(final String url);
20 | 	
21 | 	public int getOccurrences(String sf, String uri);
22 | }
23 | 


--------------------------------------------------------------------------------
/doser-dis-core/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
 1 | eclipse.preferences.version=1
 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7
 4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
 5 | org.eclipse.jdt.core.compiler.compliance=1.7
 6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate
 7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate
 8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate
 9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
11 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
12 | org.eclipse.jdt.core.compiler.source=1.7
13 | 


--------------------------------------------------------------------------------
/doser-dis-extensions/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
 1 | eclipse.preferences.version=1
 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7
 4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
 5 | org.eclipse.jdt.core.compiler.compliance=1.7
 6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate
 7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate
 8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate
 9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
11 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
12 | org.eclipse.jdt.core.compiler.source=1.7
13 | 


--------------------------------------------------------------------------------
/yes.pub:
--------------------------------------------------------------------------------
1 | ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQDFiuuAKuK8WhRCBVZpjlXIs7TWNKwtpGYqhrbF+hOkstu26QXsPYz6ywZDfQzHS3ey6mi1a/nBx9IYwwgPERu56M1OEUXvHQogEmowCMMVGCDkDgkfkCsMeChIsvCqabTugX6sT/6HHR26QXD1xzkVMhlyF7AuK+XxHNriu7SaVjYwBfVyQc4Mf8usoigKJgBRu5vj4BXzH5oslIAlCZTcFR3tT7Iy4G7IpFwjoBZufQeQiS7k8JLfgKjB9Mcc3H9/gZNvau7RsuAo24SQ4y9Jjt3BahqVdxJgKZMdYyQeRresX7oiXqrsrwBAKHyFUZZAxYZJT2Y0PaK7IrZfXRikmSN+W2Gf9dTxRI5LfYW94JvTIeT5anUhOYtAf71wSmAimQrXbMS4JKlbbZSQB/U/GY3XX+mEyoG/qqgJUNjBTF5NPtOzKbprgTkubu6VNduokKLAJP+z0ZfDoZwZaPvXR9qmFu8E5qaAIfXM/oXd9DPcSuyAh1HvXnkCHJ0z1oGusmc/Cpk6Agt5IvL4khb/HtQpvdbr8DDM963Zy8VEHaq1Uq1SKEpAcw678EtbEymbEieL0BSq8wbBn6fQRXWiCDdiqRbAkIK3Q1kyMKxmovPmYtzykYgWmb0feQpVpROVvL1JyOCKRKEK2xEWsVidcBZJtTb+JW9OkThdun8q5w== quhfus@stefan.zwicklbauer@uni-passau.de
2 | 


--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
 1 | eclipse.preferences.version=1
 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7
 4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
 5 | org.eclipse.jdt.core.compiler.compliance=1.7
 6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate
 7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate
 8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate
 9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
11 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
12 | org.eclipse.jdt.core.compiler.source=1.7
13 | 


--------------------------------------------------------------------------------
/doser-dis-extensions/.settings/org.eclipse.wst.common.component:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?><project-modules id="moduleCoreId" project-version="1.5.0">
 2 |     <wb-module deploy-name="de.uop.code-disambiguationserver">
 3 |         <wb-resource deploy-path="/" source-path="/target/m2e-wtp/web-resources"/>
 4 |         <wb-resource deploy-path="/" source-path="/src/main/webapp" tag="defaultRootSource"/>
 5 |         <wb-resource deploy-path="/WEB-INF/classes" source-path="/src/main/java"/>
 6 |         <wb-resource deploy-path="/WEB-INF/classes" source-path="/src/main/resources"/>
 7 |         <property name="java-output-path" value="/de.uop.code-disambiguationserver/target/classes"/>
 8 |         <property name="context-root" value="doser-extensions"/>
 9 |     </wb-module>
10 | </project-modules>
11 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/Candidate.java:
--------------------------------------------------------------------------------
 1 | package doser.entitydisambiguation.algorithms;
 2 | 
 3 | class Candidate implements Comparable<Candidate> {
 4 | 
 5 | 	private String candidate;
 6 | 	private double score;
 7 | 
 8 | 	Candidate(String candidate, double score) {
 9 | 		super();
10 | 		this.candidate = candidate;
11 | 		this.score = score;
12 | 	}
13 | 
14 | 	@Override
15 | 	public int compareTo(Candidate o) {
16 | 		if (this.score < o.score) {
17 | 			return -1;
18 | 		} else if (this.score > o.score) {
19 | 			return 1;
20 | 		} else {
21 | 			return 0;
22 | 		}
23 | 	}
24 | 
25 | 	String getCandidate() {
26 | 		return candidate;
27 | 	}
28 | 
29 | 	double getScore() {
30 | 		return score;
31 | 	}
32 | 	
33 | 	@Override
34 | 	public String toString() {
35 | 		return candidate;
36 | 	}
37 | }


--------------------------------------------------------------------------------
/doser-dis-core/.settings/org.eclipse.wst.common.component:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?><project-modules id="moduleCoreId" project-version="1.5.0">
 2 |     <wb-module deploy-name="doser-dis-disambiguationserver">
 3 |         <wb-resource deploy-path="/" source-path="/target/m2e-wtp/web-resources"/>
 4 |         <wb-resource deploy-path="/" source-path="/src/main/webapp" tag="defaultRootSource"/>
 5 |         <wb-resource deploy-path="/WEB-INF/classes" source-path="/src/main/java"/>
 6 |         <wb-resource deploy-path="/WEB-INF/classes" source-path="/src/main/resources"/>
 7 |         <dependent-module archiveName="doser-dis-extensions-0.6.jar" deploy-path="/WEB-INF/lib" handle="module:/resource/doser-dis-extensions/doser-dis-extensions">
 8 |             <dependency-type>uses</dependency-type>
 9 |         </dependent-module>
10 |         <property name="java-output-path" value="/de.uop.code-disambiguationserver/target/classes"/>
11 |         <property name="context-root" value="doser-dis-core"/>
12 |     </wb-module>
13 | </project-modules>
14 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/dpo/DisambiguatedEntity.java:
--------------------------------------------------------------------------------
 1 | package doser.entitydisambiguation.dpo;
 2 | 
 3 | 
 4 | /**
 5 |  * Class representing a disambiguated entity consisting of the entity mention
 6 |  * (the text), the identified URI, a value representing the confidence for the
 7 |  * decision, and a entity description. Class is a POJO for automatic
 8 |  * (de-)serialization. TODO may not be complete (e.g. relevant terms may be
 9 |  * added)
10 |  * 
11 |  * @author zwicklbauer
12 |  * 
13 |  */
14 | public class DisambiguatedEntity {
15 | 
16 | 	private String entityUri;
17 | 
18 | 	public DisambiguatedEntity() {
19 | 		super();
20 | 		this.entityUri = "";
21 | 	}
22 | 
23 | 	public DisambiguatedEntity(final String text, final String entityUri,
24 | 			final double confidence, final String description) {
25 | 		this.entityUri = entityUri;
26 | 	}
27 | 
28 | 	public String getEntityUri() {
29 | 		return this.entityUri;
30 | 	}
31 | 
32 | 	public void setEntityUri(final String entityUri) {
33 | 		this.entityUri = entityUri;
34 | 	}
35 | }
36 | 


--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/lucene/query/LTRBooleanQuery.java:
--------------------------------------------------------------------------------
 1 | package doser.lucene.query;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | import org.apache.lucene.search.BooleanQuery;
 6 | import org.apache.lucene.search.IndexSearcher;
 7 | import org.apache.lucene.search.Weight;
 8 | 
 9 | public class LTRBooleanQuery extends BooleanQuery {
10 | 
11 | 	public class LTRBooleanWeight extends BooleanWeight {
12 | 
13 | 		public LTRBooleanWeight(final IndexSearcher searcher,
14 | 				final boolean disableCoord) throws IOException {
15 | 			super(searcher, disableCoord);
16 | 		}
17 | 
18 | 		@Override
19 | 		public float coord(final int overlap, final int maxOverlap) {
20 | 			// return 1.0f;
21 | 			return maxOverlap == 1 ? 1F : similarity.coord(overlap, maxOverlap);
22 | 		}
23 | 	}
24 | 
25 | 	public LTRBooleanQuery() {
26 | 		super();
27 | 	}
28 | 
29 | 	public LTRBooleanQuery(final boolean bool) {
30 | 		super(bool);
31 | 	}
32 | 
33 | 	@Override
34 | 	public Weight createWeight(final IndexSearcher searcher) throws IOException {
35 | 		return new LTRBooleanWeight(searcher, isCoordDisabled());
36 | 	}
37 | }
38 | 


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
 1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 2 | 	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
 3 |                              http://maven.apache.org/maven-v4_0_0.xsd">
 4 | 	<modelVersion>4.0.0</modelVersion>
 5 | 
 6 | 	<groupId>doser-dis</groupId>
 7 | 	<artifactId>doser-dis-parent</artifactId>
 8 | 	<packaging>pom</packaging>
 9 | 	<version>1.0</version>
10 | 	<name>DoSer</name>
11 | 
12 | 	<modules>
13 | 		<module>doser-dis-extensions</module>
14 | 		<module>doser-dis-core</module>
15 | 		<module>doser-dis-disambiguationserver</module>
16 | 	</modules>
17 | 
18 | 	<build>
19 | 		<pluginManagement>
20 | 			<plugins>
21 | 				<plugin>
22 | 					<groupId>org.apache.maven.plugins</groupId>
23 | 					<artifactId>maven-compiler-plugin</artifactId>
24 | 					<version>3.1</version>
25 | 					<configuration>
26 | 						<source>1.7</source>
27 | 						<target>1.7</target>
28 | 					</configuration>
29 | 				</plugin>
30 | 			</plugins>
31 | 		</pluginManagement>
32 | 	</build>
33 | 
34 | 	<dependencies>
35 | 	</dependencies>
36 | 	<dependencyManagement>
37 | 	</dependencyManagement>
38 | </project>
39 | 


--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/algorithms/MajorityVoteAlgorithm.java:
--------------------------------------------------------------------------------
 1 | package doser.algorithms;
 2 | 
 3 | import java.util.HashMap;
 4 | import java.util.List;
 5 | import java.util.Map;
 6 | 
 7 | import doser.general.HelpfulMethods;
 8 | 
 9 | 
10 | /**
11 |  * Majority vote methods for arbitrary types
12 |  * 
13 |  * @author Stefan Zwicklbauer
14 |  * 
15 |  */
16 | public final class MajorityVoteAlgorithm<K extends Comparable<? super K>> {
17 | 
18 | 	public MajorityVoteAlgorithm() {
19 | 		super();
20 | 	}
21 | 
22 | 	public Map.Entry<K, Integer> getMajorityType(final List<K> typeList) {
23 | 		final List<Map.Entry<K, Integer>> list = this
24 | 				.getMajorityTypes(typeList);
25 | 		Map.Entry<K, Integer> res = null;
26 | 		if (!list.isEmpty()) {
27 | 			res = list.get(0);
28 | 		}
29 | 		return res;
30 | 	}
31 | 
32 | 	public List<Map.Entry<K, Integer>> getMajorityTypes(final List<K> list) {
33 | 		final Map<K, Integer> hash = new HashMap<K, Integer>();
34 | 		for (final K k : list) {
35 | 			if (hash.containsKey(k)) {
36 | 				Integer number = hash.get(k);
37 | 				hash.put(k, ++number);
38 | 			} else {
39 | 				hash.put(k, 1);
40 | 			}
41 | 		}
42 | 		return HelpfulMethods.sortByValue(hash);
43 | 	}
44 | 
45 | }
46 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/Edge.java:
--------------------------------------------------------------------------------
 1 | package doser.entitydisambiguation.algorithms.collective;
 2 | 
 3 | 
 4 | public class Edge {
 5 | 	
 6 | 	private Integer edgeNr;
 7 | 	
 8 | 	private Vertex target;
 9 | 	
10 | 	private double transition;
11 | 	
12 | 	private Double edgeProbability;
13 | 	
14 | 	public Edge(Integer edgeNr, Vertex target, double transition) {
15 | 		super();
16 | 		this.transition = transition;
17 | 		this.edgeNr = edgeNr;
18 | 		this.target = target;
19 | 	}
20 | 	
21 | 	public double getTransition() {
22 | 		return transition;
23 | 	}
24 | 	public void setTransition(double transition) {
25 | 		this.transition = transition;
26 | 	}
27 | 	
28 | 	public void setProbability(double p) {
29 | 		this.edgeProbability = new Double(p);
30 | 	}
31 | 	
32 | 	public Double getProbability() {
33 | 		return this.edgeProbability;
34 | 	}
35 | 	
36 | 	public Vertex getTarget() {
37 | 		return this.target;
38 | 	}
39 | 	
40 | 	@Override
41 | 	public boolean equals(Object obj) {
42 | 		if(this.edgeNr == ((Edge) obj).edgeNr) {
43 | 			return true;
44 | 		}
45 | 		return false;
46 | 	}
47 | 
48 | 	@Override
49 | 	public int hashCode() {
50 | 		return edgeNr.hashCode();
51 | 	}
52 | 
53 | }
54 | 


--------------------------------------------------------------------------------
/Word2VecRestInterface/.idea/misc.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project version="4">
 3 |   <component name="DaemonCodeAnalyzer">
 4 |     <disable_hints />
 5 |   </component>
 6 |   <component name="DependencyValidationManager">
 7 |     <option name="SKIP_IMPORT_STATEMENTS" value="false" />
 8 |   </component>
 9 |   <component name="Encoding" useUTFGuessing="true" native2AsciiForPropertiesFiles="false" />
10 |   <component name="ProjectLevelVcsManager" settingsEditedManually="false">
11 |     <OptionsSetting value="true" id="Add" />
12 |     <OptionsSetting value="true" id="Remove" />
13 |     <OptionsSetting value="true" id="Checkout" />
14 |     <OptionsSetting value="true" id="Update" />
15 |     <OptionsSetting value="true" id="Status" />
16 |     <OptionsSetting value="true" id="Edit" />
17 |     <ConfirmationsSetting value="0" id="Add" />
18 |     <ConfirmationsSetting value="0" id="Remove" />
19 |   </component>
20 |   <component name="ProjectModuleManager">
21 |     <modules />
22 |   </component>
23 |   <component name="ProjectRootManager" version="2" project-jdk-name="Python 2.7.3 (/usr/bin/python2.7)" project-jdk-type="Python SDK" />
24 |   <component name="RunManager">
25 |     <list size="0" />
26 |   </component>
27 | </project>


--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/.settings/org.eclipse.wst.common.component:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?><project-modules id="moduleCoreId" project-version="1.5.0">
 2 |     <wb-module deploy-name="de.uop.code-disambiguationserver">
 3 |         <wb-resource deploy-path="/" source-path="/target/m2e-wtp/web-resources"/>
 4 |         <wb-resource deploy-path="/" source-path="/src/main/webapp" tag="defaultRootSource"/>
 5 |         <wb-resource deploy-path="/WEB-INF/classes" source-path="/src/main/java"/>
 6 |         <wb-resource deploy-path="/WEB-INF/classes" source-path="/src/main/resources"/>
 7 |         <dependent-module archiveName="doser-dis-core-0.6.jar" deploy-path="/WEB-INF/lib" handle="module:/resource/doser-dis-core/doser-dis-core">
 8 |             <dependency-type>uses</dependency-type>
 9 |         </dependent-module>
10 |         <dependent-module archiveName="doser-dis-extensions-0.6.jar" deploy-path="/WEB-INF/lib" handle="module:/resource/doser-dis-extensions/doser-dis-extensions">
11 |             <dependency-type>uses</dependency-type>
12 |         </dependent-module>
13 |         <property name="java-output-path" value="/de.uop.code-disambiguationserver/target/classes"/>
14 |         <property name="context-root" value="doser-dis-disambiguationserver"/>
15 |     </wb-module>
16 | </project-modules>
17 | 


--------------------------------------------------------------------------------
/doser-dis-core/.classpath:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <classpath>
 3 | 	<classpathentry kind="src" output="target/classes" path="src/main/java">
 4 | 		<attributes>
 5 | 			<attribute name="optional" value="true"/>
 6 | 			<attribute name="maven.pomderived" value="true"/>
 7 | 		</attributes>
 8 | 	</classpathentry>
 9 | 	<classpathentry excluding="**" kind="src" output="target/classes" path="src/main/resources">
10 | 		<attributes>
11 | 			<attribute name="maven.pomderived" value="true"/>
12 | 		</attributes>
13 | 	</classpathentry>
14 | 	<classpathentry kind="src" output="target/test-classes" path="src/test/java">
15 | 		<attributes>
16 | 			<attribute name="optional" value="true"/>
17 | 			<attribute name="maven.pomderived" value="true"/>
18 | 		</attributes>
19 | 	</classpathentry>
20 | 	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.7">
21 | 		<attributes>
22 | 			<attribute name="maven.pomderived" value="true"/>
23 | 		</attributes>
24 | 	</classpathentry>
25 | 	<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
26 | 		<attributes>
27 | 			<attribute name="maven.pomderived" value="true"/>
28 | 		</attributes>
29 | 	</classpathentry>
30 | 	<classpathentry kind="output" path="target/classes"/>
31 | </classpath>
32 | 


--------------------------------------------------------------------------------
/doser-dis-extensions/pom.xml:
--------------------------------------------------------------------------------
 1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 2 | 	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 3 | 	<parent>
 4 | 		<groupId>doser-dis</groupId>
 5 | 		<artifactId>doser-dis-parent</artifactId>
 6 | 		<version>1.0</version>
 7 | 	</parent>
 8 | 	<modelVersion>4.0.0</modelVersion>
 9 | 	<groupId>doser.sub</groupId>
10 | 	<artifactId>doser-dis-extensions</artifactId>
11 | 	<version>0.6</version>
12 | 	<name>doser-dis-extensions</name>
13 | 	<description />
14 | 	<build>
15 | 		<finalName>doser-dis-extensions</finalName>
16 | 		<plugins>
17 | 		</plugins>
18 | 	</build>
19 | 	<dependencies>
20 | 		<dependency>
21 | 			<groupId>org.apache.lucene</groupId>
22 | 			<artifactId>lucene-core</artifactId>
23 | 			<version>4.10.4</version>
24 | 		</dependency>
25 | 		<dependency>
26 | 			<groupId>org.apache.lucene</groupId>
27 | 			<artifactId>lucene-analyzers-common</artifactId>
28 | 			<version>4.10.4</version>
29 | 		</dependency>
30 | 		<dependency>
31 | 			<groupId>org.apache.lucene</groupId>
32 | 			<artifactId>lucene-queryparser</artifactId>
33 | 			<version>4.10.4</version>
34 | 		</dependency>
35 | 	</dependencies>
36 | 	<dependencyManagement>
37 | 		<dependencies>
38 | 		</dependencies>
39 | 	</dependencyManagement>
40 | </project>


--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <projectDescription>
 3 | 	<name>de.uop.code-disambiguationserver</name>
 4 | 	<comment></comment>
 5 | 	<projects>
 6 | 	</projects>
 7 | 	<buildSpec>
 8 | 		<buildCommand>
 9 | 			<name>org.eclipse.wst.jsdt.core.javascriptValidator</name>
10 | 			<arguments>
11 | 			</arguments>
12 | 		</buildCommand>
13 | 		<buildCommand>
14 | 			<name>org.eclipse.jdt.core.javabuilder</name>
15 | 			<arguments>
16 | 			</arguments>
17 | 		</buildCommand>
18 | 		<buildCommand>
19 | 			<name>org.eclipse.wst.common.project.facet.core.builder</name>
20 | 			<arguments>
21 | 			</arguments>
22 | 		</buildCommand>
23 | 		<buildCommand>
24 | 			<name>org.eclipse.wst.validation.validationbuilder</name>
25 | 			<arguments>
26 | 			</arguments>
27 | 		</buildCommand>
28 | 		<buildCommand>
29 | 			<name>org.eclipse.m2e.core.maven2Builder</name>
30 | 			<arguments>
31 | 			</arguments>
32 | 		</buildCommand>
33 | 	</buildSpec>
34 | 	<natures>
35 | 		<nature>org.eclipse.jem.workbench.JavaEMFNature</nature>
36 | 		<nature>org.eclipse.wst.common.modulecore.ModuleCoreNature</nature>
37 | 		<nature>org.eclipse.jdt.core.javanature</nature>
38 | 		<nature>org.eclipse.m2e.core.maven2Nature</nature>
39 | 		<nature>org.eclipse.wst.common.project.facet.core.nature</nature>
40 | 		<nature>org.eclipse.wst.jsdt.core.jsNature</nature>
41 | 	</natures>
42 | </projectDescription>
43 | 


--------------------------------------------------------------------------------
/doser-dis-core/.project:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <projectDescription>
 3 | 	<name>doser-extensions</name>
 4 | 	<comment></comment>
 5 | 	<projects>
 6 | 	</projects>
 7 | 	<buildSpec>
 8 | 		<buildCommand>
 9 | 			<name>org.eclipse.wst.jsdt.core.javascriptValidator</name>
10 | 			<arguments>
11 | 			</arguments>
12 | 		</buildCommand>
13 | 		<buildCommand>
14 | 			<name>org.eclipse.jdt.core.javabuilder</name>
15 | 			<arguments>
16 | 			</arguments>
17 | 		</buildCommand>
18 | 		<buildCommand>
19 | 			<name>org.eclipse.wst.common.project.facet.core.builder</name>
20 | 			<arguments>
21 | 			</arguments>
22 | 		</buildCommand>
23 | 		<buildCommand>
24 | 			<name>org.eclipse.wst.validation.validationbuilder</name>
25 | 			<arguments>
26 | 			</arguments>
27 | 		</buildCommand>
28 | 		<buildCommand>
29 | 			<name>org.eclipse.m2e.core.maven2Builder</name>
30 | 			<arguments>
31 | 			</arguments>
32 | 		</buildCommand>
33 | 	</buildSpec>
34 | 	<natures>
35 | 		<nature>org.eclipse.jem.workbench.JavaEMFNature</nature>
36 | 		<nature>org.eclipse.wst.common.modulecore.ModuleCoreNature</nature>
37 | 		<nature>org.eclipse.jdt.core.javanature</nature>
38 | 		<nature>org.eclipse.m2e.core.maven2Nature</nature>
39 | 		<nature>org.eclipse.wst.common.project.facet.core.nature</nature>
40 | 		<nature>org.eclipse.wst.jsdt.core.jsNature</nature>
41 | 	</natures>
42 | </projectDescription>
43 | 


--------------------------------------------------------------------------------
/doser-dis-extensions/.project:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <projectDescription>
 3 | 	<name>doser-extensions</name>
 4 | 	<comment></comment>
 5 | 	<projects>
 6 | 	</projects>
 7 | 	<buildSpec>
 8 | 		<buildCommand>
 9 | 			<name>org.eclipse.wst.jsdt.core.javascriptValidator</name>
10 | 			<arguments>
11 | 			</arguments>
12 | 		</buildCommand>
13 | 		<buildCommand>
14 | 			<name>org.eclipse.jdt.core.javabuilder</name>
15 | 			<arguments>
16 | 			</arguments>
17 | 		</buildCommand>
18 | 		<buildCommand>
19 | 			<name>org.eclipse.wst.common.project.facet.core.builder</name>
20 | 			<arguments>
21 | 			</arguments>
22 | 		</buildCommand>
23 | 		<buildCommand>
24 | 			<name>org.eclipse.wst.validation.validationbuilder</name>
25 | 			<arguments>
26 | 			</arguments>
27 | 		</buildCommand>
28 | 		<buildCommand>
29 | 			<name>org.eclipse.m2e.core.maven2Builder</name>
30 | 			<arguments>
31 | 			</arguments>
32 | 		</buildCommand>
33 | 	</buildSpec>
34 | 	<natures>
35 | 		<nature>org.eclipse.jem.workbench.JavaEMFNature</nature>
36 | 		<nature>org.eclipse.wst.common.modulecore.ModuleCoreNature</nature>
37 | 		<nature>org.eclipse.jdt.core.javanature</nature>
38 | 		<nature>org.eclipse.m2e.core.maven2Nature</nature>
39 | 		<nature>org.eclipse.wst.common.project.facet.core.nature</nature>
40 | 		<nature>org.eclipse.wst.jsdt.core.jsNature</nature>
41 | 	</natures>
42 | </projectDescription>
43 | 


--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/.classpath:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <classpath>
 3 | 	<classpathentry kind="src" output="target/classes" path="src/main/java">
 4 | 		<attributes>
 5 | 			<attribute name="optional" value="true"/>
 6 | 			<attribute name="maven.pomderived" value="true"/>
 7 | 		</attributes>
 8 | 	</classpathentry>
 9 | 	<classpathentry excluding="**" kind="src" output="target/classes" path="src/main/resources">
10 | 		<attributes>
11 | 			<attribute name="maven.pomderived" value="true"/>
12 | 		</attributes>
13 | 	</classpathentry>
14 | 	<classpathentry kind="src" output="target/test-classes" path="src/test/java">
15 | 		<attributes>
16 | 			<attribute name="optional" value="true"/>
17 | 			<attribute name="maven.pomderived" value="true"/>
18 | 		</attributes>
19 | 	</classpathentry>
20 | 	<classpathentry kind="src" path="/doser-dis-core"/>
21 | 	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.7">
22 | 		<attributes>
23 | 			<attribute name="maven.pomderived" value="true"/>
24 | 		</attributes>
25 | 	</classpathentry>
26 | 	<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
27 | 		<attributes>
28 | 			<attribute name="maven.pomderived" value="true"/>
29 | 		</attributes>
30 | 	</classpathentry>
31 | 	<classpathentry kind="output" path="target/classes"/>
32 | </classpath>
33 | 


--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/.project:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <projectDescription>
 3 | 	<name>de.uop.code-disambiguationserver</name>
 4 | 	<comment></comment>
 5 | 	<projects>
 6 | 	</projects>
 7 | 	<buildSpec>
 8 | 		<buildCommand>
 9 | 			<name>org.eclipse.wst.jsdt.core.javascriptValidator</name>
10 | 			<arguments>
11 | 			</arguments>
12 | 		</buildCommand>
13 | 		<buildCommand>
14 | 			<name>org.eclipse.jdt.core.javabuilder</name>
15 | 			<arguments>
16 | 			</arguments>
17 | 		</buildCommand>
18 | 		<buildCommand>
19 | 			<name>org.eclipse.wst.common.project.facet.core.builder</name>
20 | 			<arguments>
21 | 			</arguments>
22 | 		</buildCommand>
23 | 		<buildCommand>
24 | 			<name>org.eclipse.wst.validation.validationbuilder</name>
25 | 			<arguments>
26 | 			</arguments>
27 | 		</buildCommand>
28 | 		<buildCommand>
29 | 			<name>org.eclipse.m2e.core.maven2Builder</name>
30 | 			<arguments>
31 | 			</arguments>
32 | 		</buildCommand>
33 | 	</buildSpec>
34 | 	<natures>
35 | 		<nature>org.eclipse.jem.workbench.JavaEMFNature</nature>
36 | 		<nature>org.eclipse.wst.common.modulecore.ModuleCoreNature</nature>
37 | 		<nature>org.eclipse.jdt.core.javanature</nature>
38 | 		<nature>org.eclipse.m2e.core.maven2Nature</nature>
39 | 		<nature>org.eclipse.wst.common.project.facet.core.nature</nature>
40 | 		<nature>org.eclipse.wst.jsdt.core.jsNature</nature>
41 | 	</natures>
42 | </projectDescription>
43 | 


--------------------------------------------------------------------------------
/doser-dis-extensions/.classpath:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <classpath>
 3 | 	<classpathentry kind="src" output="target/classes" path="src/main/java">
 4 | 		<attributes>
 5 | 			<attribute name="optional" value="true"/>
 6 | 			<attribute name="maven.pomderived" value="true"/>
 7 | 		</attributes>
 8 | 	</classpathentry>
 9 | 	<classpathentry excluding="**" kind="src" output="target/classes" path="src/main/resources">
10 | 		<attributes>
11 | 			<attribute name="maven.pomderived" value="true"/>
12 | 		</attributes>
13 | 	</classpathentry>
14 | 	<classpathentry kind="src" output="target/test-classes" path="src/test/java">
15 | 		<attributes>
16 | 			<attribute name="optional" value="true"/>
17 | 			<attribute name="maven.pomderived" value="true"/>
18 | 		</attributes>
19 | 	</classpathentry>
20 | 	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.7">
21 | 		<attributes>
22 | 			<attribute name="maven.pomderived" value="true"/>
23 | 		</attributes>
24 | 	</classpathentry>
25 | 	<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
26 | 		<attributes>
27 | 			<attribute name="maven.pomderived" value="true"/>
28 | 		</attributes>
29 | 	</classpathentry>
30 | 	<classpathentry kind="con" path="org.eclipse.jdt.junit.JUNIT_CONTAINER/4"/>
31 | 	<classpathentry kind="output" path="target/classes"/>
32 | </classpath>
33 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/dpo/DisambiguationResponse.java:
--------------------------------------------------------------------------------
 1 | package doser.entitydisambiguation.dpo;
 2 | 
 3 | import java.util.List;
 4 | 
 5 | /**
 6 |  * {
 7 |  * 
 8 |  * "documentUri":"unique document id", "disambiguatedSurfaceforms": [ {
 9 |  * "selectedText":"influenza", "position": { "pageId":0,
10 |  * "offsets":[1,2,3,5,6,7], "boundingBox":{"minx":0.1, "miny":0.3, "maxx":0.01,
11 |  * "maxy":0.03} }, "disEntities": [ { "text":"Influenza (Illness)"
12 |  * "entityUri":"http://en.dbpedia.org/pages/..." "confidence":"0.80"
13 |  * "description":"some additional description"
14 |  * 
15 |  * ---a list of synonyms (for a later stage)--- "synonyms": [ { "term":"..." } ]
16 |  * } // more Items ] }
17 |  * 
18 |  * Version 2.0 is used for additional testing. Current version offers the usage
19 |  * of a position array in surfaceFormsToDisambiguate
20 |  * 
21 |  * @author Stefan Zwicklbauer
22 |  * 
23 |  */
24 | public class DisambiguationResponse {
25 | 	
26 | 	private List<Response> tasks; // NOPMD by quh on 18.02.14 09:34
27 | 
28 | 	private String documentUri;
29 | 	
30 | 	public List<Response> getTasks() {
31 | 		return tasks;
32 | 	}
33 | 
34 | 	public void setTasks(List<Response> tasks) {
35 | 		this.tasks = tasks;
36 | 	}
37 | 
38 | 	public String getDocumentUri() {
39 | 		return this.documentUri;
40 | 	}
41 | 
42 | 	public void setDocumentUri(final String documentUri) {
43 | 		this.documentUri = documentUri;
44 | 	}
45 | }
46 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/dpo/Response.java:
--------------------------------------------------------------------------------
 1 | package doser.entitydisambiguation.dpo;
 2 | 
 3 | import java.util.LinkedList;
 4 | import java.util.List;
 5 | 
 6 | /**
 7 |  * This class represents a disambiguated surface form and contains all necessary
 8 |  * information about the disambiguation. Position is required because a
 9 |  * ColumnResponseItem has no unique primary key and assures the correct
10 |  * assignment to the original item.
11 |  * 
12 |  * Version 2.0 offers a list of positions
13 |  * 
14 |  * @author Stefan Zwicklbauer
15 |  * 
16 |  */
17 | public class Response {
18 | 
19 | 	private List<DisambiguatedEntity> disEntities;
20 | 	private String selectedText;
21 | 	private int documentId;
22 | 
23 | 	public Response() {
24 | 		super();
25 | 		this.disEntities = new LinkedList<DisambiguatedEntity>();
26 | 	}
27 | 
28 | 	public List<DisambiguatedEntity> getDisEntities() {
29 | 		return this.disEntities;
30 | 	}
31 | 
32 | 	public String getSelectedText() {
33 | 		return this.selectedText;
34 | 	}
35 | 
36 | 	public void setDisEntities(final List<DisambiguatedEntity> disEntities) {
37 | 		this.disEntities = disEntities;
38 | 	}
39 | 
40 | 	public void setSelectedText(final String selectedText) {
41 | 		this.selectedText = selectedText;
42 | 	}
43 | 
44 | 	public int getDocumentId() {
45 | 		return documentId;
46 | 	}
47 | 
48 | 	public void setDocumentId(int documentId) {
49 | 		this.documentId = documentId;
50 | 	}
51 | }
52 | 


--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/src/main/webapp/WEB-INF/web.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <web-app xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://java.sun.com/xml/ns/javaee" xsi:schemaLocation="http://java.sun.com/xml/ns/javaee http://java.sun.com/xml/ns/javaee/web-app_2_5.xsd" version="2.5">
 3 |   <display-name>Disambiguation Server</display-name>
 4 |   <context-param>
 5 |     <param-name>contextConfigLocation</param-name>
 6 |     <param-value>/WEB-INF/applicationContext.xml</param-value>
 7 |   </context-param>
 8 |   <filter>
 9 |     <filter-name>encoding-filter</filter-name>
10 |     <filter-class>org.springframework.web.filter.CharacterEncodingFilter</filter-class>
11 |     <init-param>
12 |       <param-name>encoding</param-name>
13 |       <param-value>UTF-8</param-value>
14 |     </init-param>
15 |   </filter>
16 |   <filter-mapping>
17 |     <filter-name>encoding-filter</filter-name>
18 |     <url-pattern>/*</url-pattern>
19 |   </filter-mapping>
20 |   <servlet>
21 |     <servlet-name>dispatcher</servlet-name>
22 |     <servlet-class>org.springframework.web.servlet.DispatcherServlet</servlet-class>
23 |     <load-on-startup>1</load-on-startup>
24 |   </servlet>
25 |   <servlet-mapping>
26 |     <servlet-name>dispatcher</servlet-name>
27 |     <url-pattern>/</url-pattern>
28 |   </servlet-mapping>
29 |   <listener>
30 |     <listener-class>doser.server.actions.FrameworkInitialization</listener-class>
31 |   </listener>
32 | </web-app>


--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/lucene/query/LearnToRankFeatureDefaultValueManager.java:
--------------------------------------------------------------------------------
 1 | package doser.lucene.query;
 2 | 
 3 | /**
 4 |  * Not in use so far.
 5 |  * 
 6 |  * @author quh
 7 |  */
 8 | public class LearnToRankFeatureDefaultValueManager {
 9 | 
10 | 	private static LearnToRankFeatureDefaultValueManager man;
11 | 
12 | 	public static LearnToRankFeatureDefaultValueManager getInstance() {
13 | 		return man;
14 | 	}
15 | 
16 | 	public static void setInstance(
17 | 			final LearnToRankFeatureDefaultValueManager manager) {
18 | 		man = manager;
19 | 	}
20 | 
21 | 	private int amountQueries;
22 | 
23 | 	private final float[] maxVals;
24 | 
25 | 	private final float[] sums;
26 | 
27 | 	public LearnToRankFeatureDefaultValueManager(final int pos) {
28 | 		maxVals = new float[pos];
29 | 		sums = new float[pos];
30 | 		for (int j = 0; j < sums.length; j++) {
31 | 			sums[j] = 0;
32 | 		}
33 | 		amountQueries = 0;
34 | 	}
35 | 
36 | 	public float[] getAverageResults() {
37 | 		final float[] results = new float[maxVals.length];
38 | 		for (int i = 0; i < sums.length; i++) {
39 | 			results[i] = sums[i] / amountQueries;
40 | 		}
41 | 		return results;
42 | 	}
43 | 
44 | 	public void newQuery() {
45 | 		for (int i = 0; i < maxVals.length; i++) {
46 | 			sums[i] += maxVals[i];
47 | 		}
48 | 		amountQueries++;
49 | 	}
50 | 
51 | 	public synchronized void setValue(final int position, final float value) {
52 | 		if (maxVals[position] < value) {
53 | 			maxVals[position] = value;
54 | 		}
55 | 	}
56 | }
57 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/dbpedia/CandidateReductionDBpediaW2V.java:
--------------------------------------------------------------------------------
 1 | package doser.entitydisambiguation.algorithms.collective.dbpedia;
 2 | 
 3 | import java.util.LinkedList;
 4 | import java.util.List;
 5 | 
 6 | import doser.entitydisambiguation.algorithms.SurfaceForm;
 7 | import doser.entitydisambiguation.algorithms.collective.CandidateReduction;
 8 | import doser.entitydisambiguation.knowledgebases.EntityCentricKBDBpedia;
 9 | 
10 | public class CandidateReductionDBpediaW2V extends CandidateReduction {
11 | 
12 | 	private int iterations;
13 | 	private boolean disambiguate;
14 | 	private EntityCentricKBDBpedia eckb;
15 | 	private int reduceTo;
16 | 	
17 | 	CandidateReductionDBpediaW2V(EntityCentricKBDBpedia eckb, List<SurfaceForm> rep, int maxsurfaceformsperquery,
18 | 			int reduceTo, int iterations, boolean disambiguate, boolean alwaysAction) {
19 | 		super(rep, maxsurfaceformsperquery, alwaysAction);
20 | 		this.iterations = iterations;
21 | 		this.disambiguate = disambiguate;
22 | 		this.eckb = eckb;
23 | 		this.reduceTo = reduceTo;
24 | 	}
25 | 
26 | 	@Override
27 | 	public List<SurfaceForm> miniSolve(List<SurfaceForm> rep) {
28 | 		List<SurfaceForm> sol = new LinkedList<SurfaceForm>();
29 | 		Word2VecDisambiguator disambiguator = new Word2VecDisambiguator(eckb, rep, disambiguate, reduceTo, iterations);
30 | 		disambiguator.setup();
31 | 		disambiguator.solve();
32 | 		sol.addAll(disambiguator.getRepresentation());
33 | 		return sol;
34 | 
35 | 	}
36 | 
37 | }
38 | 


--------------------------------------------------------------------------------
/.settings/org.eclipse.jst.jsp.core.prefs:
--------------------------------------------------------------------------------
 1 | eclipse.preferences.version=1
 2 | validateFragments=false
 3 | validation.actions-missing-required-attribute=1
 4 | validation.actions-non-empty-inline-tag=2
 5 | validation.actions-unexpected-rtexprvalue=2
 6 | validation.actions-unknown-attribute=2
 7 | validation.directive-attribute-duplicate=2
 8 | validation.directive-include-fragment-file-not-found=2
 9 | validation.directive-include-fragment-file-not-specified=2
10 | validation.directive-taglib-duplicate-prefixes-different-uris=2
11 | validation.directive-taglib-duplicate-prefixes-same-uris=-1
12 | validation.directive-taglib-missing-prefix=2
13 | validation.directive-taglib-missing-uri-or-tagdir=2
14 | validation.directive-taglib-unresolvable-uri-or-tagdir=2
15 | validation.el-function-undefined=1
16 | validation.el-general-syntax=1
17 | validation.el-lexical-failure=-1
18 | validation.java-=-1
19 | validation.java-local-variable-is-never-used=-1
20 | validation.java-null-local-variable-reference=-1
21 | validation.java-potential-null-local-variable-reference=-1
22 | validation.java-unused-import=-1
23 | validation.translation-tag-class-not-found=2
24 | validation.translation-tei-class-not-found=2
25 | validation.translation-tei-class-not-instantiated=2
26 | validation.translation-tei-class-runtime-exception=2
27 | validation.translation-tei-message=1
28 | validation.translation-usebean-ambiguous-type-info=2
29 | validation.translation-usebean-invalid-id=1
30 | validation.translation-usebean-missing-type-info=1
31 | validation.use-project-settings=true
32 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/general/CandidateReductionGeneralW2V.java:
--------------------------------------------------------------------------------
 1 | package doser.entitydisambiguation.algorithms.collective.general;
 2 | 
 3 | import java.util.LinkedList;
 4 | import java.util.List;
 5 | 
 6 | import doser.entitydisambiguation.algorithms.SurfaceForm;
 7 | import doser.entitydisambiguation.algorithms.collective.CandidateReduction;
 8 | import doser.entitydisambiguation.knowledgebases.AbstractEntityCentricKBGeneral;
 9 | 
10 | public class CandidateReductionGeneralW2V extends CandidateReduction {
11 | 
12 | 	private int iterations;
13 | 	private boolean disambiguate;
14 | 	private AbstractEntityCentricKBGeneral eckb;
15 | 	private int reduceTo;
16 | 	
17 | 	public CandidateReductionGeneralW2V(AbstractEntityCentricKBGeneral eckb, List<SurfaceForm> rep, int maxsurfaceformsperquery,
18 | 			int reduceTo, int iterations, boolean disambiguate, boolean alwaysAction) {
19 | 		super(rep, maxsurfaceformsperquery, alwaysAction);
20 | 		this.iterations = iterations;
21 | 		this.disambiguate = disambiguate;
22 | 		this.eckb = eckb;
23 | 		this.reduceTo = reduceTo;
24 | 	}
25 | 
26 | 	@Override
27 | 	public List<SurfaceForm> miniSolve(List<SurfaceForm> rep) {
28 | 		List<SurfaceForm> sol = new LinkedList<SurfaceForm>();
29 | 		Word2VecDisambiguatorGeneral disambiguator = new Word2VecDisambiguatorGeneral(eckb, rep, disambiguate, reduceTo,
30 | 				iterations);
31 | 		disambiguator.setup();
32 | 		disambiguator.solve();
33 | 		sol.addAll(disambiguator.getRepresentation());
34 | 		return sol;
35 | 
36 | 	}
37 | }
38 | 


--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/lucene/analysis/DoserIDTokenizer.java:
--------------------------------------------------------------------------------
 1 | package doser.lucene.analysis;
 2 | 
 3 | import java.io.Reader;
 4 | 
 5 | import org.apache.lucene.analysis.Tokenizer;
 6 | import org.apache.lucene.analysis.util.CharTokenizer;
 7 | import org.apache.lucene.util.AttributeFactory;
 8 | 
 9 | public final class DoserIDTokenizer extends CharTokenizer {
10 | 
11 | 	/**
12 | 	 * Construct a new WhitespaceTokenizer using a given
13 | 	 * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}.
14 | 	 * 
15 | 	 * @param matchVersion
16 | 	 *            Lucene version to match See
17 | 	 *            {@link <a href="#version">above</a>}
18 | 	 * @param factory
19 | 	 *            the attribute factory to use for this {@link Tokenizer}
20 | 	 * @param in
21 | 	 *            the input to split up into tokens
22 | 	 */
23 | 	public DoserIDTokenizer(AttributeFactory factory, Reader in) {
24 | 		super(factory, in);
25 | 	}
26 | 
27 | 	/**
28 | 	 * Construct a new WhitespaceTokenizer. * @param matchVersion Lucene version
29 | 	 * to match See {@link <a href="#version">above</a>}
30 | 	 * 
31 | 	 * @param in
32 | 	 *            the input to split up into tokens
33 | 	 */
34 | 	public DoserIDTokenizer(Reader in) {
35 | 		super(in);
36 | 	}
37 | 
38 | 	/**
39 | 	 * Collects only characters which do not satisfy
40 | 	 * {@link Character#isWhitespace(int)}.
41 | 	 */
42 | 	@Override
43 | 	protected boolean isTokenChar(int c) {
44 | 		boolean check = true;
45 | 		if (Character.isWhitespace(c)) {
46 | 			check = false;
47 | 		}
48 | 		return check;
49 | 	}
50 | }


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/rules/RuleAdapation.java:
--------------------------------------------------------------------------------
 1 | package doser.entitydisambiguation.algorithms.rules;
 2 | 
 3 | import java.util.ArrayList;
 4 | import java.util.List;
 5 | 
 6 | import doser.entitydisambiguation.knowledgebases.EntityCentricKBDBpedia;
 7 | import doser.entitydisambiguation.algorithms.SurfaceForm;
 8 | import doser.entitydisambiguation.knowledgebases.AbstractKnowledgeBase;
 9 | 
10 | public class RuleAdapation {
11 | 
12 | 	private List<AbstractRule> ruleChain;
13 | 	
14 | 	public RuleAdapation() {
15 | 		super();
16 | 		this.ruleChain = new ArrayList<AbstractRule>();
17 | 	}
18 | 	
19 | 	public void addNoCandidatesCheckPluralRule(AbstractKnowledgeBase eckb) {
20 | 		this.ruleChain.add(new NoCandidatesCheckPlural(eckb));
21 | 	}
22 | 	
23 | 	public void addNoCandidatesExpansionRule(AbstractKnowledgeBase eckb) {
24 | 		this.ruleChain.add(new NoCandidatesExpansionRules(eckb));
25 | 	}
26 | 	
27 | 	public void addUnambiguousToAmbiguousRule(EntityCentricKBDBpedia eckb) {
28 | 		this.ruleChain.add(new UnambiguousToAmbiguousRule(eckb));
29 | 	}
30 | 	
31 | 	public void addPatternRule(EntityCentricKBDBpedia eckb, String topic) {
32 | 		if (topic != null) {
33 | 			this.ruleChain.add(new PatternRule(eckb));
34 | 		}
35 | 	}
36 | 	
37 | 	public void addContextRule(EntityCentricKBDBpedia eckb) {
38 | 		this.ruleChain.add(new ContextRule(eckb));
39 | 	}
40 | 
41 | 	public void performRuleChainBeforeCandidateSelection(List<SurfaceForm> rep) {
42 | 		for (AbstractRule r : ruleChain) {
43 | 			r.applyRule(rep);
44 | 		}
45 | 	}
46 | }
47 | 


--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/src/main/webapp/WEB-INF/dispatcher-servlet.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <beans xmlns="http://www.springframework.org/schema/beans"
 3 | 	xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:context="http://www.springframework.org/schema/context"
 4 | 	xmlns:mvc="http://www.springframework.org/schema/mvc"
 5 | 	xsi:schemaLocation="http://www.springframework.org/schema/beans 
 6 |            				http://www.springframework.org/schema/beans/spring-beans-3.0.xsd
 7 |            				http://www.springframework.org/schema/context 
 8 |            				http://www.springframework.org/schema/context/spring-context-3.0.xsd 
 9 |            				http://www.springframework.org/schema/mvc
10 |            				http://www.springframework.org/schema/mvc/spring-mvc-3.0.xsd">
11 | 
12 | 	<context:component-scan base-package="doser.server.actions" />
13 | 	<mvc:annotation-driven />
14 | 	<mvc:resources mapping="/demo/resources/**" location="/resources/demo/" />
15 | 
16 | 	<bean id="viewResolver"
17 | 		class="org.springframework.web.servlet.view.InternalResourceViewResolver">
18 | 		<property name="viewClass" value="org.springframework.web.servlet.view.JstlView" />
19 | 		<property name="prefix" value="/WEB-INF/jsp/" />
20 | 		<property name="suffix" value=".jsp" />
21 | 	</bean>
22 | 
23 | 	<bean id="multipartResolver"
24 | 		class="org.springframework.web.multipart.commons.CommonsMultipartResolver">
25 | 		<!-- one of the properties available; the maximum file size in bytes -->
26 | 		<property name="maxUploadSize" value="22024969" />
27 | 	</bean>
28 | </beans>


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/tools/ServiceQueries.java:
--------------------------------------------------------------------------------
 1 | package doser.tools;
 2 | 
 3 | import java.io.IOException;
 4 | 
 5 | import org.apache.http.Header;
 6 | import org.apache.http.HttpEntity;
 7 | import org.apache.http.HttpResponse;
 8 | import org.apache.http.client.ClientProtocolException;
 9 | import org.apache.http.client.methods.HttpPost;
10 | import org.apache.http.entity.AbstractHttpEntity;
11 | import org.apache.http.impl.client.DefaultHttpClient;
12 | import org.apache.http.util.EntityUtils;
13 | import org.apache.log4j.Logger;
14 | 
15 | /**
16 |  * Class providing queries for different services. Integrated so far: DbPedia
17 |  * Spotlight
18 |  * 
19 |  * @author Stefan Zwicklbauer
20 |  * 
21 |  */
22 | public class ServiceQueries {
23 | 
24 | 	public static String httpPostRequest(String uri, AbstractHttpEntity entity,
25 | 			Header[] header) {
26 | 		DefaultHttpClient httpclient = new DefaultHttpClient();
27 | 		HttpPost httppost = new HttpPost(uri);
28 | 		httppost.setHeaders(header);
29 | 		httppost.setEntity(entity);
30 | 
31 | 		HttpResponse response;
32 | 		StringBuffer buffer = new StringBuffer();
33 | 		try {
34 | 			response = httpclient.execute(httppost);
35 | 			HttpEntity ent = response.getEntity();
36 | 
37 | 			buffer.append(EntityUtils.toString(ent));
38 | 			httpclient.getConnectionManager().shutdown();
39 | 
40 | 		} catch (ClientProtocolException e) {
41 | 			Logger.getRootLogger().error("HTTPClient error", e);
42 | 		} catch (IOException e) {
43 | 			Logger.getRootLogger().error("HTTPClient error", e);
44 | 		}
45 | 		return buffer.toString();
46 | 	}
47 | }
48 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/backend/AbstractDisambiguationTask.java:
--------------------------------------------------------------------------------
 1 | package doser.entitydisambiguation.backend;
 2 | 
 3 | import java.util.List;
 4 | 
 5 | import doser.entitydisambiguation.dpo.Response;
 6 | import doser.entitydisambiguation.knowledgebases.AbstractKnowledgeBase;
 7 | import doser.entitydisambiguation.knowledgebases.KnowledgeBaseIdentifiers;
 8 | 
 9 | public abstract class AbstractDisambiguationTask {
10 | 
11 | 	protected int returnNr;
12 | 
13 | 	protected AbstractKnowledgeBase kb;
14 | 
15 | 	protected KnowledgeBaseIdentifiers kbIdentifier;
16 | 	
17 | 	protected boolean retrieveDocClasses;
18 | 	
19 | 	protected List<Response> responses;
20 | 
21 | 	public int getReturnNr() {
22 | 		return returnNr;
23 | 	}
24 | 
25 | 	public void setReturnNr(int returnNr) {
26 | 		this.returnNr = returnNr;
27 | 	}
28 | 
29 | 	public AbstractKnowledgeBase getKb() {
30 | 		return kb;
31 | 	}
32 | 
33 | 	public void setKb(AbstractKnowledgeBase kb) {
34 | 		this.kb = kb;
35 | 	}
36 | 
37 | 	public KnowledgeBaseIdentifiers getKbIdentifier() {
38 | 		return this.kbIdentifier;
39 | 	}
40 | 	
41 | 	public boolean isRetrieveDocClasses() {
42 | 		return retrieveDocClasses;
43 | 	}
44 | 
45 | 	public void setRetrieveDocClasses(boolean retrieveDocClasses) {
46 | 		this.retrieveDocClasses = retrieveDocClasses;
47 | 	}
48 | 	
49 | 	public List<Response> getResponse() {
50 | 		return responses;
51 | 	}
52 | 
53 | 	public void setResponse(List<Response> responses) {
54 | 		this.responses = responses;
55 | 	}
56 | 	
57 | 	public abstract void setKbIdentifier(String kbversion, String setting);
58 | }
59 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/word2vec/Data.java:
--------------------------------------------------------------------------------
 1 | package doser.word2vec;
 2 | 
 3 | public class Data {
 4 | 
 5 | 	private String surfaceForm;
 6 | 	private String qryNr;
 7 | 	private String[] candidates;
 8 | 	private String context;
 9 | //	private String entity;
10 | 
11 | 	public String getSurfaceForm() {
12 | 		return surfaceForm;
13 | 	}
14 | 
15 | 	public void setSurfaceForm(String surfaceForm) {
16 | 		this.surfaceForm = surfaceForm;
17 | 	}
18 | 
19 | 	public String getQryNr() {
20 | 		return qryNr;
21 | 	}
22 | 
23 | 	public void setQryNr(String qryNr) {
24 | 		this.qryNr = qryNr;
25 | 	}
26 | 
27 | 	public String[] getCandidates() {
28 | 		return candidates;
29 | 	}
30 | 
31 | 	public void setCandidates(String[] candidates) {
32 | 		this.candidates = candidates;
33 | 	}
34 | 
35 | 	public String getContext() {
36 | 		return context;
37 | 	}
38 | 
39 | 	public void setContext(String context) {
40 | 		this.context = context;
41 | 	}
42 | 
43 | //	public String getEntity() {
44 | //		return entity;
45 | //	}
46 | //
47 | //	public void setEntity(String entity) {
48 | //		this.entity = entity;
49 | //	}
50 | //
51 | //	@Override
52 | //	public int hashCode() {
53 | //		return surfaceForm.hashCode() + qryNr.hashCode() + context.hashCode()
54 | //				+ entity.hashCode();
55 | //
56 | //	}
57 | //
58 | //	@Override
59 | //	public boolean equals(Object obj) {
60 | //		Data data = (Data) obj;
61 | //		if (this.surfaceForm.equals(data.getSurfaceForm())
62 | //				&& this.context.equals(data.getSurfaceForm())
63 | //				&& this.qryNr.equals(data.getQryNr())
64 | //				&& this.entity.equals(data.getEntity())) {
65 | //			return true;
66 | //		}
67 | //		return false;
68 | //	}
69 | }
70 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/AbstractDisambiguationAlgorithm.java:
--------------------------------------------------------------------------------
 1 | package doser.entitydisambiguation.algorithms;
 2 | 
 3 | import doser.entitydisambiguation.backend.AbstractDisambiguationTask;
 4 | 
 5 | public abstract class AbstractDisambiguationAlgorithm {
 6 | 
 7 | 	protected AbstractDisambiguationTask task;
 8 | 
 9 | 	public void disambiguate(AbstractDisambiguationTask task)
10 | 			throws IllegalDisambiguationAlgorithmInputException {
11 | 		if (checkAndSetInputParameter(task)) {
12 | 			if (preDisambiguation()) {
13 | 				processAlgorithm();
14 | 			}
15 | 		} else {
16 | 			throw new IllegalDisambiguationAlgorithmInputException(
17 | 					"Check your input knowledge base and disambiguation task");
18 | 		}
19 | 	}
20 | 
21 | 	public static String extractContext(int position, String text,
22 | 			int contextarea) {
23 | 		if(text == null || text.length() == 0) {
24 | 			return "";
25 | 		}
26 | 		
27 | 		long startArea = position - contextarea;
28 | 		long endArea = position + contextarea;
29 | 		if (startArea < 0) {
30 | 			startArea = 0;
31 | 		}
32 | 		if (endArea > text.length() - 1) {
33 | 			endArea = text.length() - 1;
34 | 		}
35 | 		String tempText = text.substring((int) startArea, (int) endArea);
36 | 		String[] splitter = tempText.split(" ");
37 | 		String result = "";
38 | 		for (int i = 1; i < splitter.length - 1; i++) {
39 | 			result += splitter[i] + " ";
40 | 		}
41 | 		return result;
42 | 	}
43 | 
44 | 	protected abstract boolean checkAndSetInputParameter(AbstractDisambiguationTask task);
45 | 
46 | 	protected abstract void processAlgorithm()
47 | 			throws IllegalDisambiguationAlgorithmInputException;
48 | 
49 | 	protected abstract boolean preDisambiguation();
50 | }


--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/general/HelpfulMethods.java:
--------------------------------------------------------------------------------
 1 | package doser.general;
 2 | 
 3 | import java.util.Collections;
 4 | import java.util.Comparator;
 5 | import java.util.LinkedList;
 6 | import java.util.List;
 7 | import java.util.Map;
 8 | 
 9 | public final class HelpfulMethods {
10 | 
11 | 	/**
12 | 	 * Sorts a Map by value
13 | 	 * 
14 | 	 * Partially buggy due to
15 | 	 * http://stackoverflow.com/questions/109383/how-to-sort
16 | 	 * -a-mapkey-value-on-the-values-in-java/1283722#1283722
17 | 	 * 
18 | 	 * @param map
19 | 	 * @return SortedMap by Value
20 | 	 */
21 | 	@Deprecated
22 | 	public static <K, V extends Comparable<? super V>> List<Map.Entry<K, V>> sortByValue(
23 | 			final Map<K, V> map) {
24 | 		final List<Map.Entry<K, V>> list = new LinkedList<Map.Entry<K, V>>(
25 | 				map.entrySet());
26 | 		Collections.sort(list, new Comparator<Map.Entry<K, V>>() {
27 | 			@Override
28 | 			public int compare(final Map.Entry<K, V> op1,
29 | 					final Map.Entry<K, V> op2) {
30 | 				return (op2.getValue()).compareTo(op1.getValue());
31 | 			}
32 | 		});
33 | 		return list;
34 | 	}
35 | 
36 | 	/**
37 | 	 * Correct Map Sorting with Guava
38 | 	 * 
39 | 	 */
40 | //	public static <K, V extends Comparable<? super V>> List<Map.Entry<K, V>> sortByValueGuava(
41 | //			Map<K, V> map) {
42 | //		// final List<K> sortedKeys =
43 | //		// Ordering.natural().onResultOf(Functions.forMap(map)).immutableSortedCopy(map.keySet());
44 | //
45 | //		Comparator<Map.Entry<K, V>> byMapValues = new Ordering<Map.Entry<K, V>>() {
46 | //			@Override
47 | //			public int compare(Map.Entry<K, V> left, Map.Entry<K, V> right) {
48 | //				return left.getValue().compareTo(right.getValue());
49 | //			}
50 | //		};
51 | //		
52 | //		List<Map.Entry<K, V>> entryList = Lists.newArrayList(map.entrySet());
53 | //	    Collections.sort(entryList, byMapValues);
54 | //	    return entryList;
55 | //	}
56 | }
57 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/backend/DisambiguationTaskSingle.java:
--------------------------------------------------------------------------------
 1 | package doser.entitydisambiguation.backend;
 2 | 
 3 | import doser.entitydisambiguation.dpo.EntityDisambiguationDPO;
 4 | import doser.entitydisambiguation.knowledgebases.KnowledgeBaseIdentifiers;
 5 | 
 6 | public class DisambiguationTaskSingle extends AbstractDisambiguationTask {
 7 | 
 8 | 	private EntityDisambiguationDPO entityToDis;
 9 | 
10 | 	public DisambiguationTaskSingle(final EntityDisambiguationDPO entityToDis) {
11 | 		super();
12 | 		this.entityToDis = entityToDis;
13 | 		this.retrieveDocClasses = false;
14 | 	}
15 | 
16 | 	public EntityDisambiguationDPO getEntityToDisambiguate() {
17 | 		return this.entityToDis;
18 | 	}
19 | 
20 | 	public void setSurfaceForm(final EntityDisambiguationDPO surfaceForm) {
21 | 		this.entityToDis = surfaceForm;
22 | 	}
23 | 
24 | 	/**
25 | 	 * Assignment function to determine the used knowledge base
26 | 	 * 
27 | 	 * @param kbversion
28 | 	 * @param setting
29 | 	 */
30 | 	@Override
31 | 	public void setKbIdentifier(String kbversion, String setting) {
32 | 		if(setting == null) {
33 | 			this.kbIdentifier = KnowledgeBaseIdentifiers.Standard;
34 | 		} else if(setting.equalsIgnoreCase("DocumentCentric")) {
35 | 			if(kbversion.equalsIgnoreCase("default")) {
36 | 				this.kbIdentifier = KnowledgeBaseIdentifiers.DocumentCentricDefault;
37 | 			} else {
38 | 				this.kbIdentifier = KnowledgeBaseIdentifiers.DocumentCentricDefault;
39 | 			}
40 | 		} else if(setting.equalsIgnoreCase("EntityCentric")) {
41 | 			if(kbversion.equalsIgnoreCase("default")) {
42 | 				this.kbIdentifier = KnowledgeBaseIdentifiers.Standard;
43 | 			} else if(kbversion.equalsIgnoreCase("cstable")) {
44 | 				this.kbIdentifier = KnowledgeBaseIdentifiers.CSTable;
45 | 			} else if(kbversion.equalsIgnoreCase("biomedcopy")) {
46 | 				this.kbIdentifier = KnowledgeBaseIdentifiers.Biomed;
47 | 			} else {
48 | 				this.kbIdentifier = KnowledgeBaseIdentifiers.Standard;
49 | 			}
50 | 		} else {
51 | 			this.kbIdentifier = KnowledgeBaseIdentifiers.Standard;
52 | 		}
53 | 	}
54 | }
55 | 


--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/nlp/NLPTools.java:
--------------------------------------------------------------------------------
 1 | package doser.nlp;
 2 | 
 3 | import java.util.List;
 4 | import java.util.Properties;
 5 | 
 6 | //import edu.stanford.nlp.ling.CoreAnnotations;
 7 | //import edu.stanford.nlp.ling.CoreLabel;
 8 | //import edu.stanford.nlp.pipeline.Annotation;
 9 | //import edu.stanford.nlp.pipeline.StanfordCoreNLP;
10 | //import edu.stanford.nlp.util.Pair;
11 | //
12 | //
13 | //public class NLPTools {
14 | //
15 | //	private static volatile NLPTools instance;
16 | //	
17 | //	private StanfordCoreNLP pipeline;
18 | //	
19 | //	private NLPTools() { 
20 | //		super();
21 | //		Properties props = new Properties();
22 | //		props.put("annotators", "tokenize, ssplit, pos, lemma, stopword");
23 | //		props.setProperty("customAnnotatorClass.stopword",
24 | //				"doser.nlp.StopWordAnnotator");
25 | //		props.setProperty(StopWordAnnotator.STOPWORDS_LIST, StopWordAnnotator.customStopWordList);
26 | //		props.setProperty(StopWordAnnotator.CHECK_LEMMA, "true");
27 | //		
28 | //		this.pipeline = new StanfordCoreNLP(props);
29 | //	}
30 | //
31 | //    public static NLPTools getInstance() {
32 | //        if (instance == null ) {
33 | //            synchronized (NLPTools.class) {
34 | //                if (instance == null) {
35 | //                    instance = new NLPTools();
36 | //                }
37 | //            }
38 | //        }
39 | //        return instance;
40 | //    }
41 | //	
42 | //	public String performLemmatizationAndStopWordRemoval(String str) {
43 | //		Annotation document = new Annotation(str);
44 | //		this.pipeline.annotate(document);
45 | //		List<CoreLabel> tokens = document
46 | //				.get(CoreAnnotations.TokensAnnotation.class);
47 | //		StringBuilder builder = new StringBuilder();
48 | //		for (CoreLabel token : tokens) {
49 | //			 Pair<Boolean, Boolean> stopword = token.get(StopWordAnnotator.class);
50 | //			 String lemma = token.lemma().toLowerCase();
51 | //			 if(!stopword.first()) {
52 | //				 builder.append(lemma);
53 | //				 builder.append(" ");
54 | //			 }
55 | //		}
56 | //		return builder.toString().trim();
57 | //	}
58 | //}
59 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/dpo/DisambiguationRequest.java:
--------------------------------------------------------------------------------
 1 | package doser.entitydisambiguation.dpo;
 2 | 
 3 | import java.util.List;
 4 | 
 5 | /**
 6 |  * { "documentUri":"unique document id", "surfaceFormsToDisambiguate": [ {
 7 |  * "selectedText":"influenza", "context":
 8 |  * "Typically, influenza is transmitted through the air by coughs or sneezes, creating aerosols containing the virus."
 9 |  * , "position": { "pageId":0, "offsets":[1,2,3,5,6,7],
10 |  * "boundingBox":{"minx":0.1, "miny":0.3, "maxx":0.01, "maxy":0.03} } } ],
11 |  * "alreadyDisambiguatedEntities":[ { "text":"Illness",
12 |  * "entityUri":"http://en.dbpedia.org/page/Illness", "confidence": 0.90,
13 |  * "distance": 300 }, { "text":"Desease",
14 |  * "entityUri":"http://en.dbpedia.org/page/Desease", "confidence": 0.65,
15 |  * "distance": 500 } ] }
16 |  * 
17 |  * Version 2.0 is used for additional testing. Current version
18 |  * offers the usage of a position array in surfaceFormsToDisambiguate
19 |  * 
20 |  * @author Stefan Zwicklbauer
21 |  * 
22 |  */
23 | public class DisambiguationRequest {
24 | 	private String documentUri;
25 | 	private List<EntityDisambiguationDPO> surfaceFormsToDisambiguate;
26 | 	private Integer docsToReturn;
27 | 	private String mainTopic;
28 | 
29 | 	public String getDocumentUri() {
30 | 		return this.documentUri;
31 | 	}
32 | 
33 | 	public List<EntityDisambiguationDPO> getSurfaceFormsToDisambiguate() {
34 | 		return this.surfaceFormsToDisambiguate;
35 | 	}
36 | 
37 | 	public void setDocumentUri(final String documentUri) {
38 | 		this.documentUri = documentUri;
39 | 	}
40 | 
41 | 	public void setSurfaceFormsToDisambiguate(
42 | 			final List<EntityDisambiguationDPO> surfaceFormsToDisambiguate) {
43 | 		this.surfaceFormsToDisambiguate = surfaceFormsToDisambiguate;
44 | 	}
45 | 
46 | 	public Integer getDocsToReturn() {
47 | 		return docsToReturn;
48 | 	}
49 | 
50 | 	public void setDocsToReturn(Integer docsToReturn) {
51 | 		this.docsToReturn = docsToReturn;
52 | 	}
53 | 
54 | 	public String getMainTopic() {
55 | 		return mainTopic;
56 | 	}
57 | 
58 | 	public void setMainTopic(String mainTopic) {
59 | 		this.mainTopic = mainTopic;
60 | 	}
61 | }
62 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/DisambiguationHandler.java:
--------------------------------------------------------------------------------
 1 | package doser.entitydisambiguation.algorithms;
 2 | 
 3 | import doser.entitydisambiguation.algorithms.collective.dbpedia.CollectiveDisambiguationDBpediaEntities;
 4 | import doser.entitydisambiguation.algorithms.collective.general.CollectiveDisambiguationGeneralEntities;
 5 | import doser.entitydisambiguation.backend.AbstractDisambiguationTask;
 6 | import doser.entitydisambiguation.backend.DisambiguationTaskSingle;
 7 | import doser.entitydisambiguation.dpo.EntityDisambiguationDPO;
 8 | import doser.entitydisambiguation.knowledgebases.KnowledgeBaseIdentifiers;
 9 | 
10 | public class DisambiguationHandler {
11 | 
12 | 	private static final DisambiguationHandler instance;
13 | 
14 | 	static {
15 | 		try {
16 | 			instance = new DisambiguationHandler();
17 | 		} catch (Exception e) {
18 | 			throw new RuntimeException("An error occurred!", e);
19 | 		}
20 | 	}
21 | 
22 | 	private DisambiguationHandler() {
23 | 		super();
24 | 	}
25 | 
26 | 	public static DisambiguationHandler getInstance() {
27 | 		return instance;
28 | 	}
29 | 
30 | 	public AbstractDisambiguationAlgorithm getAlgorithm(AbstractDisambiguationTask task) {
31 | 		AbstractDisambiguationAlgorithm algorithm = null;
32 | 		if (task instanceof DisambiguationTaskSingle) {
33 | 			DisambiguationTaskSingle t = (DisambiguationTaskSingle) task;
34 | 			EntityDisambiguationDPO dpo = t.getEntityToDisambiguate();
35 | 			if ((dpo.getSetting() != null
36 | 					&& (dpo.getSetting().equalsIgnoreCase("NoContext"))
37 | 					|| dpo.getContext() == null || dpo.getContext().equals("") || dpo
38 | 					.getContext().equals(" "))) {
39 | 				algorithm = new EntityCentricAlgorithmTableDefault();
40 | 			} else if ((dpo.getSetting() != null)
41 | 					&& (dpo.getSetting().equalsIgnoreCase("DocumentCentric"))) {
42 | 				algorithm = new DocumentCentricAlgorithmDefault();
43 | 			} else {
44 | 				algorithm = new EntityCentricAlgorithmDefault();
45 | 			}
46 | 		} else {
47 | 			if (task.getKbIdentifier().equals(KnowledgeBaseIdentifiers.Biomed)) {
48 | 				algorithm = new CollectiveDisambiguationGeneralEntities();
49 | 			} else {
50 | 				algorithm = new CollectiveDisambiguationDBpediaEntities();
51 | 			}
52 | 		}
53 | 		return algorithm;
54 | 	}
55 | }
56 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/word2vec/Word2VecJsonFormat.java:
--------------------------------------------------------------------------------
 1 | package doser.word2vec;
 2 | 
 3 | import java.io.IOException;
 4 | import java.util.Set;
 5 | 
 6 | import org.apache.http.Header;
 7 | import org.apache.http.entity.ByteArrayEntity;
 8 | import org.apache.http.entity.ContentType;
 9 | import org.apache.http.message.BasicHeader;
10 | import org.codehaus.jackson.map.ObjectMapper;
11 | import org.codehaus.jettison.json.JSONArray;
12 | import org.codehaus.jettison.json.JSONException;
13 | import org.codehaus.jettison.json.JSONObject;
14 | import org.slf4j.Logger;
15 | import org.slf4j.LoggerFactory;
16 | 
17 | import doser.entitydisambiguation.properties.Properties;
18 | import doser.tools.ServiceQueries;
19 | 
20 | public class Word2VecJsonFormat {
21 | 
22 | 	private final static Logger logger = LoggerFactory.getLogger(Word2VecJsonFormat.class);
23 | 	
24 | 	private String domain;
25 | 	private Set<String> data;
26 | 
27 | 	public Set<String> getData() {
28 | 		return data;
29 | 	}
30 | 
31 | 	public void setData(Set<String> data) {
32 | 		this.data = data;
33 | 	}
34 | 	
35 | 	public String getDomain() {
36 | 		return domain;
37 | 	}
38 | 
39 | 	public void setDomain(String domain) {
40 | 		this.domain = domain;
41 | 	}
42 | 
43 | 	public static JSONArray performquery(Object json, String serviceEndpoint) {
44 | 		final ObjectMapper mapper = new ObjectMapper();
45 | 		String jsonString = null;
46 | 		JSONArray result = null;
47 | 		try {
48 | 			jsonString = mapper.writeValueAsString(json);
49 | 			Header[] headers = { new BasicHeader("Accept", "application/json"),
50 | 					new BasicHeader("content-type", "application/json") };
51 | 			ByteArrayEntity ent = new ByteArrayEntity(jsonString.getBytes(),
52 | 					ContentType.create("application/json"));
53 | 			String resStr = ServiceQueries.httpPostRequest(
54 | 					(Properties.getInstance().getWord2VecService() + serviceEndpoint), ent, headers);
55 | 			JSONObject resultJSON = null;
56 | 			try {
57 | 				resultJSON = new JSONObject(resStr);
58 | 				result = resultJSON.getJSONArray("data");
59 | 			} catch (JSONException e) {
60 | 				logger.error("JsonException in "+Word2VecJsonFormat.class.getName(), e);
61 | 			}
62 | 		} catch (IOException e) {
63 | 			logger.error("JsonException in "+Word2VecJsonFormat.class.getName(), e);
64 | 		}
65 | 		return result;
66 | 	}
67 | }
68 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/backend/DisambiguationTaskCollective.java:
--------------------------------------------------------------------------------
 1 | package doser.entitydisambiguation.backend;
 2 | 
 3 | import java.util.List;
 4 | 
 5 | import doser.entitydisambiguation.dpo.EntityDisambiguationDPO;
 6 | import doser.entitydisambiguation.knowledgebases.KnowledgeBaseIdentifiers;
 7 | 
 8 | public class DisambiguationTaskCollective extends AbstractDisambiguationTask {
 9 | 
10 | 	private List<EntityDisambiguationDPO> entitiesToDis;
11 | 	
12 | 	/* A maintopic e.g. the column identifier in a table */
13 | 	private String mainTopic;
14 | 
15 | 	public DisambiguationTaskCollective(final List<EntityDisambiguationDPO> entityToDis, String mainTopic) {
16 | 		super();
17 | 		this.entitiesToDis = entityToDis;
18 | 		this.mainTopic = mainTopic;
19 | 	}
20 | 
21 | 	public List<EntityDisambiguationDPO> getEntityToDisambiguate() {
22 | 		return this.entitiesToDis;
23 | 	}
24 | 	
25 | 	public String getMainTopic() {
26 | 		return this.mainTopic;
27 | 	}
28 | 
29 | 	public void setSurfaceForm(final List<EntityDisambiguationDPO> surfaceForm) {
30 | 		this.entitiesToDis = surfaceForm;
31 | 	}
32 | 
33 | 	/**
34 | 	 * Assignment function to determine the used knowledge base
35 | 	 * 
36 | 	 * @param kbversion
37 | 	 * @param setting
38 | 	 */
39 | 	@Override
40 | 	public void setKbIdentifier(String kbversion, String setting) {
41 | 		if(setting == null) {
42 | 			this.kbIdentifier = KnowledgeBaseIdentifiers.Standard;
43 | 		} else if(setting.equalsIgnoreCase("DocumentCentric")) {
44 | 			if(kbversion.equalsIgnoreCase("default")) {
45 | 				this.kbIdentifier = KnowledgeBaseIdentifiers.DocumentCentricDefault;
46 | 			} else {
47 | 				this.kbIdentifier = KnowledgeBaseIdentifiers.DocumentCentricDefault;
48 | 			}
49 | 		} else if(setting.equalsIgnoreCase("EntityCentric")) {
50 | 			if(kbversion.equalsIgnoreCase("default")) {
51 | 				this.kbIdentifier = KnowledgeBaseIdentifiers.Standard;
52 | 			} else if(kbversion.equalsIgnoreCase("cstable")) {
53 | 				this.kbIdentifier = KnowledgeBaseIdentifiers.CSTable;
54 | 			} else if(kbversion.equalsIgnoreCase("biomed")) {
55 | 				this.kbIdentifier = KnowledgeBaseIdentifiers.Biomed;
56 | 			} else {
57 | 				this.kbIdentifier = KnowledgeBaseIdentifiers.Standard;
58 | 			}
59 | 		} else {
60 | 			this.kbIdentifier = KnowledgeBaseIdentifiers.Standard;
61 | 		}
62 | 	}	
63 | }
64 | 


--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/lucene/analysis/DoserStandardTokenizer.java:
--------------------------------------------------------------------------------
 1 | package doser.lucene.analysis;
 2 | 
 3 | /*
 4 |  * Licensed to the Apache Software Foundation (ASF) under one or more
 5 |  * contributor license agreements.  See the NOTICE file distributed with
 6 |  * this work for additional information regarding copyright ownership.
 7 |  * The ASF licenses this file to You under the Apache License, Version 2.0
 8 |  * (the "License"); you may not use this file except in compliance with
 9 |  * the License.  You may obtain a copy of the License at
10 |  *
11 |  *     http://www.apache.org/licenses/LICENSE-2.0
12 |  *
13 |  * Unless required by applicable law or agreed to in writing, software
14 |  * distributed under the License is distributed on an "AS IS" BASIS,
15 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 |  * See the License for the specific language governing permissions and
17 |  * limitations under the License.
18 |  */
19 | 
20 | import java.io.Reader;
21 | 
22 | import org.apache.lucene.analysis.Tokenizer;
23 | import org.apache.lucene.analysis.util.CharTokenizer;
24 | import org.apache.lucene.util.AttributeFactory;
25 | 
26 | public final class DoserStandardTokenizer extends CharTokenizer {
27 | 
28 | 	/**
29 | 	 * Construct a new WhitespaceTokenizer using a given
30 | 	 * {@link org.apache.lucene.util.AttributeSource.AttributeFactory}.
31 | 	 * 
32 | 	 * @param factory
33 | 	 *            the attribute factory to use for this {@link Tokenizer}
34 | 	 * @param in
35 | 	 *            the input to split up into tokens
36 | 	 */
37 | 	public DoserStandardTokenizer(AttributeFactory factory, Reader in) {
38 | 		super(factory, in);
39 | 	}
40 | 
41 | 	/**
42 | 	 * Construct a new WhitespaceTokenizer. * @param matchVersion Lucene version
43 | 	 * to match See {@link <a href="#version">above</a>}
44 | 	 * 
45 | 	 * @param in
46 | 	 *            the input to split up into tokens
47 | 	 */
48 | 	public DoserStandardTokenizer(Reader in) {
49 | 		super(in);
50 | 	}
51 | 
52 | 	/**
53 | 	 * Collects only characters which do not satisfy
54 | 	 * {@link Character#isWhitespace(int)}.
55 | 	 */
56 | 	@Override
57 | 	protected boolean isTokenChar(int c) {
58 | 		boolean check = true;
59 | 		if (Character.isWhitespace(c) || c == 46) {
60 | 			check = false;
61 | 		}
62 | 		return check;
63 | 	}
64 | }


--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/lucene/features/LuceneFeatures.java:
--------------------------------------------------------------------------------
 1 | package doser.lucene.features;
 2 | 
 3 | import java.util.Locale;
 4 | 
 5 | import org.apache.lucene.index.Term;
 6 | import org.apache.lucene.search.Query;
 7 | import org.apache.lucene.search.BooleanClause.Occur;
 8 | import org.apache.lucene.search.similarities.Similarity;
 9 | 
10 | import doser.lucene.query.LTRBooleanQuery;
11 | import doser.lucene.query.LearnToRankFuzzyQuery;
12 | import doser.lucene.query.LearnToRankTermQuery;
13 | import doser.lucene.query.PriorQuery;
14 | import doser.lucene.query.SensePriorQuery;
15 | 
16 | public class LuceneFeatures {
17 | 
18 | 	public static Query queryLabelTerm(String keyword, String field,
19 | 			Similarity sim) {
20 | 		final LearnToRankTermQuery q = new LearnToRankTermQuery(new Term(field,
21 | 				keyword.toLowerCase(Locale.US)), sim);
22 | 		return q;
23 | 	}
24 | 
25 | 	public static Query queryLabelFuzzy(String keyword, String field,
26 | 			Similarity sim) {
27 | 		final LearnToRankFuzzyQuery q = new LearnToRankFuzzyQuery(new Term(
28 | 				field, keyword.toLowerCase(Locale.US)), sim);
29 | 		return q;
30 | 	}
31 | 
32 | 
33 | 	public static Query queryStringTerm(String str, String field,
34 | 			Similarity sim, Occur occ, int maxclause) {
35 | 
36 | 		final String[] split = str.split(" ");
37 | 		final LTRBooleanQuery bquery = new LTRBooleanQuery();
38 | 		for (final String element : split) {
39 | 			final LearnToRankTermQuery tquery = new LearnToRankTermQuery(
40 | 					new Term(field, element.toLowerCase(Locale.US)), sim);
41 | 			bquery.add(tquery, occ);
42 | 		}
43 | 		return bquery;
44 | 	}
45 | 
46 | 	public static Query queryStringFuzzy(String str, String field,
47 | 			Similarity sim, Occur occ, int maxclause) {
48 | 
49 | 		final String[] split = str.split(" ");
50 | 		final LTRBooleanQuery bquery = new LTRBooleanQuery();
51 | 		for (final String element : split) {
52 | 			final LearnToRankFuzzyQuery tquery = new LearnToRankFuzzyQuery(
53 | 					new Term(field, element.toLowerCase(Locale.US)), sim);
54 | 			bquery.add(tquery, occ);
55 | 
56 | 		}
57 | 		return bquery;
58 | 	}
59 | 
60 | 
61 | 	public static Query queryPrior(IEntityCentricExtFeatures kb) {
62 | 		return new PriorQuery(kb);
63 | 	}
64 | 
65 | 	public static Query querySensePrior(String str, IEntityCentricExtFeatures kb) {
66 | 		return new SensePriorQuery(str, kb);
67 | 	}
68 | }
69 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/dbpedia/TableColumnFilter.java:
--------------------------------------------------------------------------------
 1 | package doser.entitydisambiguation.algorithms.collective.dbpedia;
 2 | 
 3 | import java.io.IOException;
 4 | import java.util.List;
 5 | 
 6 | import org.apache.lucene.index.IndexReader;
 7 | import org.apache.lucene.index.Term;
 8 | import org.apache.lucene.search.BooleanQuery;
 9 | import org.apache.lucene.search.IndexSearcher;
10 | import org.apache.lucene.search.ScoreDoc;
11 | import org.apache.lucene.search.TopDocs;
12 | import org.apache.lucene.search.BooleanClause.Occur;
13 | 
14 | import doser.entitydisambiguation.algorithms.SurfaceForm;
15 | import doser.entitydisambiguation.knowledgebases.EntityCentricKBDBpedia;
16 | import doser.lucene.query.TermQuery;
17 | 
18 | public class TableColumnFilter {
19 | 
20 | 	private EntityCentricKBDBpedia eckb;
21 | 	private String topic;
22 | 
23 | 	TableColumnFilter(EntityCentricKBDBpedia eckb, String topic) {
24 | 		super();
25 | 		this.eckb = eckb;
26 | 		this.topic = topic;
27 | 	}
28 | 
29 | 	public void filter(List<SurfaceForm> reps) {
30 | 		for (SurfaceForm sf : reps) {
31 | 			List<String> candidates = sf.getCandidates();
32 | 			if (candidates.size() > 0) {
33 | 				String s = performLuceneQuery(candidates, topic);
34 | 				if (s != null) {
35 | 					sf.setDisambiguatedEntity(s);
36 | 				}
37 | 			}
38 | 		}
39 | 	}
40 | 
41 | 	private String performLuceneQuery(List<String> candidates, String topic) {
42 | 		String result = null;
43 | 		IndexSearcher searcher = eckb.getSearcher();
44 | 		IndexReader reader = searcher.getIndexReader();
45 | 		BooleanQuery candidateq = new BooleanQuery();
46 | 		for (String can : candidates) {
47 | 			candidateq.add(new TermQuery(new Term("Mainlink", can)), Occur.SHOULD);
48 | 		}
49 | 		BooleanQuery q = new BooleanQuery();
50 | 		q.add(candidateq, Occur.MUST);
51 | 		q.add(new TermQuery(new Term("LongDescription", topic)), Occur.MUST);
52 | 		TopDocs t = null;
53 | 		try {
54 | 			t = searcher.search(q, candidates.size());
55 | 		} catch (IOException e) {
56 | 			e.printStackTrace();
57 | 		}
58 | 		if (t != null) {
59 | 			ScoreDoc[] scoredocs = t.scoreDocs;
60 | 			if (scoredocs.length == 1) {
61 | 				try {
62 | 					result = reader.document(scoredocs[0].doc).get("Mainlink");
63 | 				} catch (IOException e) {
64 | 					e.printStackTrace();
65 | 				}
66 | 			}
67 | 		}
68 | 		return result;
69 | 	}
70 | }
71 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/dpo/EntityDisambiguationDPO.java:
--------------------------------------------------------------------------------
 1 | package doser.entitydisambiguation.dpo;
 2 | 
 3 | 
 4 | /**
 5 |  * Represents surfaceform which should be disambiguated. Positions is used as an
 6 |  * intern id, which is necessary during feedback processing later.
 7 |  * 
 8 |  * Version 2.0 One position of a surface form might be not enough. Version 2
 9 |  * offers the possibility to send an array of position.
10 |  * 
11 |  * Version 3.0 A new field InterDisambiguationSetting flags the kind of
12 |  * Disambiguation. This can be one of the following Types: - Standard Entity
13 |  * Disambiguation with context - Standard Entity Disambiguation without context
14 |  * - Entity Disambiguation without context on specialized domain (i.e. tables)
15 |  * 
16 |  * Version 4.0 KnowledgeBaseIdentifier allows to select a specific knowledge
17 |  * base for each disambiguation algorithm. This option should only be used if
18 |  * the user is aware of what he is doing. Additionally the user is able to get
19 |  * the lucene documents of disambiguated entities.
20 |  * 
21 |  * 
22 |  * @author Stefan Zwicklbauer
23 |  * 
24 |  */
25 | public class EntityDisambiguationDPO {
26 | 
27 | 	private String documentId;
28 | 	private String context;
29 | 	private String selectedText;
30 | 	private String setting;
31 | 	private String kbversion;
32 | 	private int startPosition;
33 | 
34 | 	public EntityDisambiguationDPO() {
35 | 		super();
36 | 	}
37 | 
38 | 	public String getContext() {
39 | 		return this.context;
40 | 	}
41 | 
42 | 	public String getSelectedText() {
43 | 		return this.selectedText;
44 | 	}
45 | 
46 | 	public void setContext(final String context) {
47 | 		this.context = context;
48 | 	}
49 | 
50 | 	public void setSelectedText(final String selectedText) {
51 | 		this.selectedText = selectedText;
52 | 	}
53 | 
54 | 	public void setSetting(final String setting) {
55 | 		this.setting = setting;
56 | 	}
57 | 
58 | 	public String getSetting() {
59 | 		return setting;
60 | 	}
61 | 	
62 | 	public void setDocumentId(final String documentId) {
63 | 		this.documentId = documentId;
64 | 	}
65 | 
66 | 	public String getDocumentId() {
67 | 		return this.documentId;
68 | 	}
69 | 
70 | 	public void setInternSetting(final String setting) {
71 | 		this.setting = setting;
72 | 	}
73 | 	
74 | 	public String getKbversion() {
75 | 		return kbversion;
76 | 	}
77 | 
78 | 	public void setKbversion(String kbversion) {
79 | 		this.kbversion = kbversion;
80 | 	}
81 | 
82 | 	public int getStartPosition() {
83 | 		return startPosition;
84 | 	}
85 | 
86 | 	public void setStartPosition(int startPosition) {
87 | 		this.startPosition = startPosition;
88 | 	}
89 | }


--------------------------------------------------------------------------------
/doser-dis-extensions/.settings/org.eclipse.jdt.ui.prefs:
--------------------------------------------------------------------------------
 1 | cleanup.add_default_serial_version_id=true
 2 | cleanup.add_generated_serial_version_id=false
 3 | cleanup.add_missing_annotations=true
 4 | cleanup.add_missing_deprecated_annotations=true
 5 | cleanup.add_missing_methods=false
 6 | cleanup.add_missing_nls_tags=false
 7 | cleanup.add_missing_override_annotations=true
 8 | cleanup.add_missing_override_annotations_interface_methods=true
 9 | cleanup.add_serial_version_id=false
10 | cleanup.always_use_blocks=true
11 | cleanup.always_use_parentheses_in_expressions=false
12 | cleanup.always_use_this_for_non_static_field_access=false
13 | cleanup.always_use_this_for_non_static_method_access=false
14 | cleanup.convert_to_enhanced_for_loop=true
15 | cleanup.correct_indentation=true
16 | cleanup.format_source_code=true
17 | cleanup.format_source_code_changes_only=false
18 | cleanup.make_local_variable_final=true
19 | cleanup.make_parameters_final=false
20 | cleanup.make_private_fields_final=true
21 | cleanup.make_type_abstract_if_missing_method=false
22 | cleanup.make_variable_declarations_final=false
23 | cleanup.never_use_blocks=false
24 | cleanup.never_use_parentheses_in_expressions=true
25 | cleanup.organize_imports=true
26 | cleanup.qualify_static_field_accesses_with_declaring_class=false
27 | cleanup.qualify_static_member_accesses_through_instances_with_declaring_class=true
28 | cleanup.qualify_static_member_accesses_through_subtypes_with_declaring_class=true
29 | cleanup.qualify_static_member_accesses_with_declaring_class=true
30 | cleanup.qualify_static_method_accesses_with_declaring_class=false
31 | cleanup.remove_private_constructors=true
32 | cleanup.remove_trailing_whitespaces=true
33 | cleanup.remove_trailing_whitespaces_all=true
34 | cleanup.remove_trailing_whitespaces_ignore_empty=false
35 | cleanup.remove_unnecessary_casts=true
36 | cleanup.remove_unnecessary_nls_tags=true
37 | cleanup.remove_unused_imports=true
38 | cleanup.remove_unused_local_variables=false
39 | cleanup.remove_unused_private_fields=true
40 | cleanup.remove_unused_private_members=false
41 | cleanup.remove_unused_private_methods=true
42 | cleanup.remove_unused_private_types=true
43 | cleanup.sort_members=true
44 | cleanup.sort_members_all=true
45 | cleanup.use_blocks=true
46 | cleanup.use_blocks_only_for_return_and_throw=false
47 | cleanup.use_parentheses_in_expressions=false
48 | cleanup.use_this_for_non_static_field_access=true
49 | cleanup.use_this_for_non_static_field_access_only_if_necessary=true
50 | cleanup.use_this_for_non_static_method_access=true
51 | cleanup.use_this_for_non_static_method_access_only_if_necessary=true
52 | cleanup_profile=_Doser Code Profile
53 | cleanup_settings_version=2
54 | eclipse.preferences.version=1
55 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/knowledgebases/EntityCentricKBDBpedia.java:
--------------------------------------------------------------------------------
 1 | package doser.entitydisambiguation.knowledgebases;
 2 | 
 3 | import java.util.List;
 4 | 
 5 | import org.apache.lucene.search.similarities.Similarity;
 6 | 
 7 | public class EntityCentricKBDBpedia extends AbstractEntityCentricKBGeneral {
 8 | 
 9 | 	public EntityCentricKBDBpedia(String uri, boolean dynamic) {
10 | 		super(uri, dynamic);
11 | 	}
12 | 
13 | 	public EntityCentricKBDBpedia(String uri, boolean dynamic, Similarity sim) {
14 | 		super(uri, dynamic, sim);
15 | 	}
16 | 
17 | 	/**
18 | 	 * Takes a set of dbpedia entities as well as a target entity and generates
19 | 	 * one string that fits into the word2vec query format used in this class.
20 | 	 * The source entities are concatenated and should be compared with the
21 | 	 * target entity.
22 | 	 *
23 | 	 * @param source
24 | 	 *            a set of source entities
25 | 	 * @param target
26 | 	 *            the target entity.
27 | 	 * @return String in appropriate word2vec query format
28 | 	 */
29 | 	@Override
30 | 	public String generateWord2VecFormatString(String source, String target) {
31 | 		String s = source.replaceAll("http://dbpedia.org/resource/", "");
32 | 		String t = target.replaceAll("http://dbpedia.org/resource/", "");
33 | 		int c = s.compareToIgnoreCase(target);
34 | 		String res = "";
35 | 		if (c < 0) {
36 | 			res = s + "|" + t;
37 | 		} else if (c == 0) {
38 | 			res = s + "|" + t;
39 | 		} else {
40 | 			res = t + "|" + s;
41 | 		}
42 | 		return res;
43 | 	}
44 | 
45 | 	/**
46 | 	 * Takes a set of dbpedia entities as well as a target entity and generates
47 | 	 * one string that fits into the word2vec query format used in this class.
48 | 	 * The source entities are concatenated and should be compared with the
49 | 	 * target entity.
50 | 	 *
51 | 	 * @param source
52 | 	 *            a set of source entities
53 | 	 * @param target
54 | 	 *            the target entity.
55 | 	 * @return String in appropriate word2vec query format
56 | 	 */
57 | 	@Override
58 | 	public String generateWord2VecFormatString(List<String> source, String target) {
59 | 		StringBuilder builder = new StringBuilder();
60 | 		for (String s : source) {
61 | 			s = s.replaceAll("http://dbpedia.org/resource/", "");
62 | 			builder.append(s);
63 | 			builder.append("|");
64 | 		}
65 | 		String src = builder.toString();
66 | 		src = src.substring(0, src.length() - 1);
67 | 		String t = target.replaceAll("http://dbpedia.org/resource/", "");
68 | 		return src + "|" + t;
69 | 	}
70 | 	
71 | 	@Override
72 | 	protected String generateDomainName() {
73 | 		return "DBpedia";
74 | 	}
75 | 	
76 | 	@Override
77 | 	protected String kbName() {
78 | 		return "DBpedia KB";
79 | 	}
80 | }
81 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/rules/NoCandidatesCheckPlural.java:
--------------------------------------------------------------------------------
 1 | package doser.entitydisambiguation.algorithms.rules;
 2 | 
 3 | import java.io.IOException;
 4 | import java.util.ArrayList;
 5 | import java.util.LinkedList;
 6 | import java.util.List;
 7 | 
 8 | import org.apache.lucene.document.Document;
 9 | import org.apache.lucene.index.IndexReader;
10 | import org.apache.lucene.search.IndexSearcher;
11 | import org.apache.lucene.search.ScoreDoc;
12 | import org.apache.lucene.search.TopDocs;
13 | import org.apache.lucene.search.similarities.DefaultSimilarity;
14 | 
15 | import doser.entitydisambiguation.algorithms.SurfaceForm;
16 | import doser.entitydisambiguation.knowledgebases.AbstractKnowledgeBase;
17 | import doser.lucene.features.LuceneFeatures;
18 | import doser.lucene.query.LearnToRankClause;
19 | import doser.lucene.query.LearnToRankQuery;
20 | import doser.tools.Inflector;
21 | 
22 | /**
23 |  * Überprüft ob eine surface form im plural angegeben ist und falls ja überprüfe
24 |  * den singular
25 |  * 
26 |  * @author stefan
27 |  *
28 |  */
29 | class NoCandidatesCheckPlural extends AbstractRule {
30 | 
31 | 	NoCandidatesCheckPlural(AbstractKnowledgeBase eckb) {
32 | 		super(eckb);
33 | 	}
34 | 
35 | 	@Override
36 | 	public boolean applyRule(List<SurfaceForm> rep) {
37 | 		for (SurfaceForm r : rep) {
38 | 			if (r.getCandidates().size() == 0) {
39 | 				String sf = r.getSurfaceForm();
40 | 				String singular = Inflector.getInstance().singularize(sf);
41 | 				if (!sf.equalsIgnoreCase(singular)) {
42 | 					// Try singular search
43 | 					ArrayList<String> lst = queryLucene(singular);
44 | 					if (lst.size() != 0) {
45 | 						r.setCandidates(lst);
46 | 					}
47 | 				}
48 | 			}
49 | 		}
50 | 		return false;
51 | 	}
52 | 
53 | 	private ArrayList<String> queryLucene(String surfaceForm) {
54 | 		ArrayList<String> list = new ArrayList<String>();
55 | 		final IndexSearcher searcher = eckb.getSearcher();
56 | 		final IndexReader reader = searcher.getIndexReader();
57 | 		LearnToRankQuery query = new LearnToRankQuery();
58 | 		List<LearnToRankClause> features = new LinkedList<LearnToRankClause>();
59 | 		DefaultSimilarity defaultSim = new DefaultSimilarity();
60 | 		features.add(query.add(LuceneFeatures.queryLabelTerm(surfaceForm,
61 | 				"UniqueLabel", defaultSim), "Feature1", true));
62 | 		try {
63 | 			final TopDocs top = searcher.search(query, 150);
64 | 			final ScoreDoc[] score = top.scoreDocs;
65 | 			if (score.length <= 5) {
66 | 				for (int i = 0; i < score.length; ++i) {
67 | 					final Document doc = reader.document(score[i].doc);
68 | 					list.add(doc.get("Mainlink"));
69 | 				}
70 | 			}
71 | 		} catch (IOException e) {
72 | 			e.printStackTrace();
73 | 		}
74 | 		return list;
75 | 	}
76 | 
77 | }
78 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/general/CollectiveContextDriverGeneral.java:
--------------------------------------------------------------------------------
 1 | package doser.entitydisambiguation.algorithms.collective.general;
 2 | 
 3 | import java.util.LinkedList;
 4 | import java.util.List;
 5 | 
 6 | import doser.entitydisambiguation.algorithms.SurfaceForm;
 7 | import doser.entitydisambiguation.algorithms.collective.CandidatePruning;
 8 | import doser.entitydisambiguation.algorithms.rules.RuleAdapation;
 9 | import doser.entitydisambiguation.dpo.DisambiguatedEntity;
10 | import doser.entitydisambiguation.dpo.Response;
11 | import doser.entitydisambiguation.knowledgebases.AbstractEntityCentricKBGeneral;
12 | 
13 | class CollectiveContextDriverGeneral {
14 | 
15 | 	static final int PREPROCESSINGCONTEXTSIZE = 200;
16 | 	
17 | 	private Response[] currentResponse;
18 | 	private List<SurfaceForm> rep;
19 | 	private AbstractEntityCentricKBGeneral eckb;
20 | 	
21 | 	CollectiveContextDriverGeneral(Response[] res, List<SurfaceForm> rep, AbstractEntityCentricKBGeneral eckb) {
22 | 		super();
23 | 		this.currentResponse = res;
24 | 		this.rep = rep;
25 | 		this.eckb = eckb;
26 | 	}
27 | 	
28 | 	void solve() {
29 | 		// First candidate pruning
30 | 		CandidatePruning pruning = new CandidatePruning(eckb);
31 | 		pruning.prune(rep);
32 | 
33 | 		RuleAdapation rules = new RuleAdapation();
34 | 		rules.addNoCandidatesCheckPluralRule(eckb);
35 | 		rules.addNoCandidatesExpansionRule(eckb);
36 | 		rules.performRuleChainBeforeCandidateSelection(rep);
37 | 
38 | 		CandidateReductionGeneralW2V w2vreduction = new CandidateReductionGeneralW2V(eckb, rep, 20, 5, 125, false, false);
39 | 		w2vreduction.solve();
40 | 		rep = w2vreduction.getRep();
41 | 
42 | 		w2vreduction = new CandidateReductionGeneralW2V(eckb, rep, 45, 5, 250, true, true);
43 | 		w2vreduction.solve();
44 | 		rep = w2vreduction.getRep();
45 | 		FinalEntityDisambiguation finalDis = new FinalEntityDisambiguation(eckb, rep);
46 | 		finalDis.setup();
47 | 		finalDis.solve();
48 | 	}
49 | 	
50 | 	void generateResult() {
51 | 		for (int i = 0; i < currentResponse.length; i++) {
52 | 			SurfaceForm r = search(i);
53 | 			if (currentResponse[i] == null && r != null && r.getCandidates().size() == 1) {
54 | 				Response res = new Response();
55 | 				List<DisambiguatedEntity> entList = new LinkedList<DisambiguatedEntity>();
56 | 				DisambiguatedEntity ent = new DisambiguatedEntity();
57 | 				ent.setEntityUri(r.getCandidates().get(0));
58 | 				entList.add(ent);
59 | 				res.setDisEntities(entList);
60 | 				res.setSelectedText(r.getSurfaceForm());
61 | 				currentResponse[i] = res;
62 | 			}
63 | 		}
64 | 	}
65 | 	
66 | 	private SurfaceForm search(int qryNr) {
67 | 		for (SurfaceForm r : rep) {
68 | 			if (r.getQueryNr() == qryNr) {
69 | 				return r;
70 | 			}
71 | 		}
72 | 		return null;
73 | 	}
74 | 	
75 | }
76 | 


--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/lucene/analysis/DoserIDAnalyzer.java:
--------------------------------------------------------------------------------
 1 | package doser.lucene.analysis;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.Reader;
 5 | 
 6 | import org.apache.lucene.analysis.TokenStream;
 7 | import org.apache.lucene.analysis.core.StopAnalyzer;
 8 | import org.apache.lucene.analysis.util.CharArraySet;
 9 | import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
10 | import org.apache.lucene.analysis.util.WordlistLoader;
11 | import org.apache.lucene.util.Version;
12 | 
13 | /**
14 |  * This analyzer is a special analyzer for id queries in our knowledge bases
15 |  * 
16 |  * @author Stefan Zwicklbauer
17 |  * 
18 |  */
19 | public final class DoserIDAnalyzer extends StopwordAnalyzerBase {
20 | 
21 | 	/** Default maximum allowed token length */
22 | 	public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
23 | 
24 | 	/**
25 | 	 * An unmodifiable set containing some common English words that are usually
26 | 	 * not useful for searching.
27 | 	 */
28 | 	public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
29 | 
30 | 	private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
31 | 
32 | 	/**
33 | 	 * Builds an analyzer with the default stop words ({@link #STOP_WORDS_SET}).
34 | 	 * 
35 | 	 * 
36 | 	 */
37 | 	public DoserIDAnalyzer() {
38 | 		this(STOP_WORDS_SET);
39 | 	}
40 | 
41 | 	/**
42 | 	 * Builds an analyzer with the given stop words.
43 | 	 * 
44 | 	 * @param stopWords
45 | 	 *            stop words
46 | 	 */
47 | 	public DoserIDAnalyzer(CharArraySet stopWords) {
48 | 		super(stopWords);
49 | 	}
50 | 
51 | 	/**
52 | 	 * Builds an analyzer with the stop words from the given reader.
53 | 	 * 
54 | 	 * @see WordlistLoader#getWordSet(Reader, Version)
55 | 	 * @param stopwords
56 | 	 *            Reader to read stop words from
57 | 	 */
58 | 	public DoserIDAnalyzer(Reader stopwords)
59 | 			throws IOException {
60 | 		this(loadStopwordSet(stopwords));
61 | 	}
62 | 
63 | 	@Override
64 | 	protected TokenStreamComponents createComponents(final String fieldName,
65 | 			final Reader reader) {
66 | 		final DoserIDTokenizer src = new DoserIDTokenizer(reader);
67 | 		TokenStream tok = new DoserIDFilter(src);
68 | 		return new TokenStreamComponents(src, tok) {
69 | 			@Override
70 | 			protected void setReader(final Reader reader) throws IOException {
71 | 				super.setReader(reader);
72 | 			}
73 | 		};
74 | 	}
75 | 
76 | 	/**
77 | 	 * @see #setMaxTokenLength
78 | 	 */
79 | 	public int getMaxTokenLength() {
80 | 		return maxTokenLength;
81 | 	}
82 | 
83 | 	/**
84 | 	 * Set maximum allowed token length. If a token is seen that exceeds this
85 | 	 * length then it is discarded. This setting only takes effect the next time
86 | 	 * tokenStream or tokenStream is called.
87 | 	 */
88 | 	public void setMaxTokenLength(int length) {
89 | 		maxTokenLength = length;
90 | 	}
91 | }


--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/lucene/query/LearnToRankClause.java:
--------------------------------------------------------------------------------
  1 | package doser.lucene.query;
  2 | 
  3 | import java.util.HashMap;
  4 | import java.util.Map;
  5 | 
  6 | import org.apache.lucene.search.Query;
  7 | import org.apache.lucene.search.Weight;
  8 | 
  9 | /**
 10 |  * LearnToRank clause representing an arbitrary feature query. Additional
 11 |  * criterias may be defined later but are not necessary so far.
 12 |  * 
 13 |  * HashMap featuresValues contains all calculated featuresValues. The HashMap
 14 |  * key stores the document number. The Pair integer stores the featureNumber.
 15 |  * 
 16 |  * The HashMap has to be resetted after each query.
 17 |  * 
 18 |  */
 19 | public class LearnToRankClause {
 20 | 
 21 | 	class Pair {
 22 | 
 23 | 		private final int featureNr;
 24 | 
 25 | 		private final float featureValue;
 26 | 
 27 | 		Pair(final int docNr, final float featureValue) {
 28 | 			featureNr = docNr;
 29 | 			this.featureValue = featureValue;
 30 | 		}
 31 | 
 32 | 		public int getDocNr() {
 33 | 			return featureNr;
 34 | 		}
 35 | 
 36 | 		public float getFeatureValue() {
 37 | 			return featureValue;
 38 | 		}
 39 | 
 40 | 	}
 41 | 
 42 | 	private Weight cweight;
 43 | 
 44 | 	private final Map<Integer, Float> featureValues;
 45 | 
 46 | 	private final boolean mustOccur;
 47 | 
 48 | 	private final String name;
 49 | 
 50 | 	private Query query;
 51 | 
 52 | 	private float weight;
 53 | 
 54 | 	public LearnToRankClause(final Query query, final String name,
 55 | 			final boolean mustOccur) {
 56 | 		this.query = query;
 57 | 		this.name = name;
 58 | 		weight = 1.0f;
 59 | 		this.mustOccur = mustOccur;
 60 | 		featureValues = new HashMap<Integer, Float>();
 61 | 	}
 62 | 
 63 | 	public void addFeatureValue(final int docBase, final int docNr,
 64 | 			final float value) {
 65 | 		featureValues.put((docBase + docNr), value);
 66 | 	}
 67 | 
 68 | 	public void clear() {
 69 | 		featureValues.clear();
 70 | 	}
 71 | 
 72 | 	public double getFeatureValue(final int docId) {
 73 | 		double val = 0f;
 74 | 		try {
 75 | 			val = featureValues.get(docId);
 76 | 		} catch (final NullPointerException e) {
 77 | 			val = 0f;
 78 | 		}
 79 | 		return val;
 80 | 	}
 81 | 
 82 | 	public String getName() {
 83 | 		return name;
 84 | 	}
 85 | 
 86 | 	public Query getQuery() {
 87 | 		return query;
 88 | 	}
 89 | 
 90 | 	public Weight getW() {
 91 | 		return cweight;
 92 | 	}
 93 | 
 94 | 	public float getWeight() {
 95 | 		return weight;
 96 | 	}
 97 | 
 98 | 	public boolean isMustOccur() {
 99 | 		return mustOccur;
100 | 	}
101 | 
102 | 	public void setQuery(final Query query) {
103 | 		this.query = query;
104 | 	}
105 | 
106 | 	public void setW(final Weight cweight) {
107 | 		this.cweight = cweight;
108 | 	}
109 | 
110 | 	public void setWeight(final float weight) {
111 | 		this.weight = weight;
112 | 	}
113 | }
114 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/properties/Properties.java:
--------------------------------------------------------------------------------
 1 | package doser.entitydisambiguation.properties;
 2 | 
 3 | import org.apache.commons.configuration.ConfigurationException;
 4 | import org.apache.commons.configuration.PropertiesConfiguration;
 5 | import org.apache.log4j.Logger;
 6 | 
 7 | public final class Properties {
 8 | 	private static Properties instance;
 9 | 	private static final String RESOURCE_NAME = "disambiguation.properties";
10 | //	private static final String RESOURCE_NAME = "./disambiguation.properties";
11 | 	
12 | 	public synchronized static Properties getInstance() {
13 | 		if (instance == null) {
14 | 			instance = new Properties();
15 | 		}
16 | 
17 | 		return instance;
18 | 	}
19 | 
20 | 	/**
21 | 	 * Provides easy access to property files (e.g. config.getInt())
22 | 	 */
23 | 	PropertiesConfiguration config;
24 | 
25 | 	private Properties() {
26 | 		try {
27 | 			this.config = new PropertiesConfiguration(RESOURCE_NAME);
28 | 		} catch (final ConfigurationException e) {
29 | 			Logger.getRootLogger().error("Failed to load properties file: "	+ RESOURCE_NAME, e);
30 | 		}
31 | 	}
32 | 
33 | 	/**
34 | 	 * ArtifactId of the application (from maven pom.xml)
35 | 	 * 
36 | 	 * @return artifact id
37 | 	 */
38 | 	public String getApplicationArtifactId() {
39 | 		return this.config.getString("application.artifactId");
40 | 	}
41 | 
42 | 	/**
43 | 	 * Name of the application (from maven pom.xml)
44 | 	 * 
45 | 	 * @return application name
46 | 	 */
47 | 	public String getApplicationName() {
48 | 		return this.config.getString("application.name");
49 | 	}
50 | 
51 | 	/**
52 | 	 * Version of the application (from maven pom.xml)
53 | 	 * 
54 | 	 * @return application version
55 | 	 */
56 | 	public String getApplicationVersion() {
57 | 		return this.config.getString("application.version");
58 | 	}
59 | 	
60 | 	public int getDisambiguationResultSize() {
61 | 		final String size = this.config.getString("disambiguation.returnSize");
62 | 		return Integer.valueOf(size);
63 | 	}
64 | 
65 | 	/**
66 | 	 * Get location of entity-centric knowledge base
67 | 	 */
68 | 	public String getEntityCentricKBWikipedia() {
69 | 		return this.config.getString("disambiguation.entityCentricKBWikipedia");
70 | 	}
71 | 	
72 | 	public String getEntityCentricKBBiomed() {
73 | 		return this.config.getString("disambiguation.entityCentricBiomedCalbC");
74 | 	}
75 | 	
76 | 	public String getWord2VecService() {
77 | 		return this.config.getString("disambiguation.Word2VecService");
78 | 	}
79 | 
80 | 	public String getWord2VecModel() {
81 | 		return this.config.getString("word2vecmodel");
82 | 	}
83 | 	
84 | 	public boolean getCandidateExpansion() {
85 | 		boolean bool = false;
86 | 		String s = this.config.getString("candidateExpansion");
87 | 		if(s.equalsIgnoreCase("true")) {
88 | 			bool = true;
89 | 		}
90 | 		return bool;
91 | 	}	
92 | }
93 | 


--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/pom.xml:
--------------------------------------------------------------------------------
 1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 2 | 	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 3 | 	<parent>
 4 | 		<groupId>doser-dis</groupId>
 5 | 		<artifactId>doser-dis-parent</artifactId>
 6 | 		<version>1.0</version>
 7 | 	</parent>
 8 | 	<modelVersion>4.0.0</modelVersion>
 9 | 	<groupId>doser.sub</groupId>
10 | 	<artifactId>doser-dis-disambiguationserver</artifactId>
11 | 	<version>0.6</version>
12 | 	<name>doser-dis-disambiguationserver</name>
13 | 	<description />
14 | 	<build>
15 | 		<finalName>doser-dis-disambiguationserver</finalName>
16 | 		<plugins>
17 | 			<plugin>
18 | 				<artifactId>maven-war-plugin</artifactId>
19 | 				<version>2.1.1</version>
20 | 			</plugin>
21 | 			<plugin>
22 | 				<groupId>org.apache.maven.plugins</groupId>
23 | 				<version>2.9</version>
24 | 				<artifactId>maven-eclipse-plugin</artifactId>
25 | 				<configuration>
26 | 					<wtpapplicationxml>true</wtpapplicationxml>
27 | 					<wtpversion>2.0</wtpversion>
28 | 				</configuration>
29 | 			</plugin>
30 | 			<plugin>
31 | 				<groupId>org.apache.tomcat.maven</groupId>
32 | 				<artifactId>tomcat7-maven-plugin</artifactId>
33 | 				<version>2.0</version>
34 | 				<executions>
35 | 					<execution>
36 | 						<id>tomcat-run</id>
37 | 						<goals>
38 | 							<goal>exec-war-only</goal>
39 | 						</goals>
40 | 						<phase>package</phase>
41 | 						<configuration>
42 | 							<path>/doser</path>
43 | 							<enableNaming>false</enableNaming>
44 | 							<finalName>DoSer-disambiguation-only.jar</finalName>
45 | 							<charset>utf-8</charset>
46 | 						</configuration>
47 | 					</execution>
48 | 				</executions>
49 | 			</plugin>
50 | 		</plugins>
51 | 	</build>
52 | 	<dependencies>
53 | 		<dependency>
54 | 			<groupId>doser.sub</groupId>
55 | 			<artifactId>doser-dis-core</artifactId>
56 | 			<version>${project.version}</version>
57 | 		</dependency>
58 | 		<dependency>
59 | 			<groupId>org.springframework</groupId>
60 | 			<artifactId>spring-webmvc</artifactId>
61 | 			<version>4.0.6.RELEASE</version>
62 | 		</dependency>
63 | 		<dependency>
64 | 			<groupId>commons-fileupload</groupId>
65 | 			<artifactId>commons-fileupload</artifactId>
66 | 			<version>1.3.1</version>
67 | 		</dependency>
68 | 		<dependency>
69 | 			<groupId>javax.servlet</groupId>
70 | 			<artifactId>javax.servlet-api</artifactId>
71 | 			<scope>provided</scope>
72 | 			<version>3.0.1</version>
73 | 		</dependency>
74 | 	</dependencies>
75 | 	<dependencyManagement>
76 | 		<dependencies>
77 | 			<dependency>
78 | 				<groupId>xml-apis</groupId>
79 | 				<artifactId>xml-apis</artifactId>
80 | 				<version>1.4.01</version>
81 | 			</dependency>
82 | 		</dependencies>
83 | 	</dependencyManagement>
84 | 	<packaging>war</packaging>
85 | </project>
86 | 


--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/lucene/analysis/DoserStandardAnalyzer.java:
--------------------------------------------------------------------------------
 1 | package doser.lucene.analysis;
 2 | 
 3 | import java.io.IOException;
 4 | import java.io.Reader;
 5 | 
 6 | import org.apache.lucene.analysis.TokenStream;
 7 | import org.apache.lucene.analysis.core.LowerCaseFilter;
 8 | import org.apache.lucene.analysis.core.StopAnalyzer;
 9 | import org.apache.lucene.analysis.standard.StandardFilter;
10 | import org.apache.lucene.analysis.util.CharArraySet;
11 | import org.apache.lucene.analysis.util.StopwordAnalyzerBase;
12 | import org.apache.lucene.analysis.util.WordlistLoader;
13 | import org.apache.lucene.util.Version;
14 | 
15 | public final class DoserStandardAnalyzer extends StopwordAnalyzerBase {
16 | 
17 | 	/** Default maximum allowed token length */
18 | 	public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
19 | 
20 | 	/**
21 | 	 * An unmodifiable set containing some common English words that are usually
22 | 	 * not useful for searching.
23 | 	 */
24 | 	public static final CharArraySet STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
25 | 
26 | 	private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
27 | 
28 | 	/**
29 | 	 * Builds an analyzer with the default stop words ({@link #STOP_WORDS_SET}).
30 | 	 * 
31 | 	 */
32 | 	public DoserStandardAnalyzer() {
33 | 		this(STOP_WORDS_SET);
34 | 	}
35 | 
36 | 	/**
37 | 	 * Builds an analyzer with the given stop words.
38 | 	 * 
39 | 	 * @param stopWords
40 | 	 *            stop words
41 | 	 */
42 | 	public DoserStandardAnalyzer(CharArraySet stopWords) {
43 | 		super(stopWords);
44 | 	}
45 | 
46 | 	/**
47 | 	 * Builds an analyzer with the stop words from the given reader.
48 | 	 * 
49 | 	 * @see WordlistLoader#getWordSet(Reader, Version)
50 | 	 * @param stopwords
51 | 	 *            Reader to read stop words from
52 | 	 */
53 | 	public DoserStandardAnalyzer(Reader stopwords)
54 | 			throws IOException {
55 | 		this(loadStopwordSet(stopwords));
56 | 	}
57 | 
58 | 	@Override
59 | 	protected TokenStreamComponents createComponents(final String fieldName,
60 | 			final Reader reader) {
61 | 		final DoserStandardTokenizer src = new DoserStandardTokenizer(reader);
62 | 		TokenStream tok = new StandardFilter(src);
63 | 		tok = new LowerCaseFilter(tok);
64 | 		return new TokenStreamComponents(src, tok) {
65 | 			@Override
66 | 			protected void setReader(final Reader reader) throws IOException {
67 | 				super.setReader(reader);
68 | 			}
69 | 		};
70 | 	}
71 | 
72 | 	/**
73 | 	 * @see #setMaxTokenLength
74 | 	 */
75 | 	public int getMaxTokenLength() {
76 | 		return maxTokenLength;
77 | 	}
78 | 
79 | 	/**
80 | 	 * Set maximum allowed token length. If a token is seen that exceeds this
81 | 	 * length then it is discarded. This setting only takes effect the next time
82 | 	 * tokenStream or tokenStream is called.
83 | 	 */
84 | 	public void setMaxTokenLength(int length) {
85 | 		maxTokenLength = length;
86 | 	}
87 | }


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/knowledgebases/AbstractKnowledgeBase.java:
--------------------------------------------------------------------------------
 1 | package doser.entitydisambiguation.knowledgebases;
 2 | 
 3 | import java.io.File;
 4 | import java.io.IOException;
 5 | import java.util.TimerTask;
 6 | 
 7 | import org.apache.lucene.search.IndexSearcher;
 8 | import org.apache.lucene.search.SearcherFactory;
 9 | import org.apache.lucene.search.SearcherManager;
10 | import org.apache.lucene.search.similarities.DefaultSimilarity;
11 | import org.apache.lucene.search.similarities.Similarity;
12 | import org.apache.lucene.store.Directory;
13 | import org.apache.lucene.store.FSDirectory;
14 | import org.slf4j.Logger;
15 | import org.slf4j.LoggerFactory;
16 | 
17 | /**
18 |  * Each knowledge base provides its own class with its respective properties.
19 |  * These are the knowledge base index uri. IndexSearcher, IndexReader objects
20 |  * and the dynamic property.
21 |  * 
22 |  * @author stefan zwicklbauer
23 |  */
24 | public abstract class AbstractKnowledgeBase extends TimerTask {
25 | 
26 | 	private final static Logger logger = LoggerFactory.getLogger(AbstractKnowledgeBase.class);
27 | 	
28 | 	private String indexUri;
29 | 	
30 | 	private boolean dynamic;
31 | 
32 | 	private SearcherManager manager;
33 | 
34 | 	private IndexSearcher searcher;
35 | 	
36 | 	AbstractKnowledgeBase(String uri, boolean dynamic) {
37 | 		this(uri, dynamic, new DefaultSimilarity());
38 | 	}
39 | 
40 | 	AbstractKnowledgeBase(String uri, boolean dynamic, Similarity sim) {
41 | 		super();
42 | 		this.indexUri = uri;
43 | 		this.dynamic = dynamic;
44 | 
45 | 		File indexDir = new File(indexUri);
46 | 		Directory dir;
47 | 		try {
48 | 			dir = FSDirectory.open(indexDir);
49 | 			this.manager = new SearcherManager(dir, new SearcherFactory());
50 | 		} catch (IOException e) {
51 | 			logger.error("IOException in "+AbstractKnowledgeBase.class.getName(), e);
52 | 		}
53 | 	}
54 | 
55 | 	public String getIndexUri() {
56 | 		return indexUri;
57 | 	}
58 | 
59 | 
60 | 	public IndexSearcher getSearcher() {
61 | 		try {
62 | 			this.searcher = manager.acquire();
63 | 		} catch (IOException e) {
64 | 			logger.error("IOException in "+AbstractKnowledgeBase.class.getName(), e);
65 | 		}
66 | 		return this.searcher;
67 | 	}
68 | 
69 | 	public void release() {
70 | 		try {
71 | 			manager.release(searcher);
72 | 		} catch (IOException e) {
73 | 			logger.error("IOException in "+AbstractKnowledgeBase.class.getName(), e);
74 | 		}
75 | 	}
76 | 
77 | 	/**
78 | 	 * Periodically reopens the Indexreader, if and only if this is an dynamic
79 | 	 * knowledge base. The changed knowledge base will be live within a few moments.
80 | 	 */
81 | 	@Override
82 | 	public void run() {
83 | 		if (dynamic) {
84 | 			try {
85 | 				manager.maybeRefresh();
86 | 			} catch (IOException e) {
87 | 				logger.error("IOException in "+AbstractKnowledgeBase.class.getName(), e);
88 | 			}
89 | 		}
90 | 	}
91 | 	
92 | 	public abstract void initialize();
93 | }
94 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # DoSeR-Disambiguation
  2 | This package contains only the disambiguation system of DoSeR. Compilation results in a stand-alone jar file which starts an Apache Tomcat server. More information about the full DoSeR system can be found here: [Github Wiki](https://github.com/quhfus/DoSeR/wiki) 
 3 | 
  4 | If your system does not have enough memory (25 GB RAM), you can use the REST service of the current DoSeR version, which can be used with GERBIL. **Coming soon**
 5 | 
 6 | 
 7 | We note that this service is limited to 5 queries in parallel.
 8 | 
  9 | ## Requirements
 10 | To install and run the DoSeR disambiguation system, the following components must be installed:
11 | 
12 | 1. Java Version 1.7 or higher
13 | 
14 | 2. Python 2.5 or higher
15 | 
16 | 3. Disambiguation Lucene Index: [Dropbox Link](https://www.dropbox.com/s/7ihkw5gzqc3afjo/DBpedia_DisambiguationIndex.tar.gz?dl=0) 
17 | 
18 | 4. Semantic Embeddings: [Dropbox Link](https://www.dropbox.com/s/4e2g72yud1muv5a/Semantic_Embeddings.tar.gz?dl=0)
19 | 
 20 | ## Installation
 21 | 1. Check out the DoSeR-Disambiguation GitHub repository and install the system with **mvn compile**. If Maven is not installed or you are not interested in the source code, you can download the doser-dis-disambiguationserver.jar file and the disambiguation.properties file from here (coming very soon).  
22 | 
23 | 2. Put the resulting or downloaded **doser-dis-disambiguationserver.jar** file and the properties file into a newly created directory **foo**. Unzip the Disambiguation Index and put the index folder into the **foo** directory.
24 | 
25 | 3. Unzip and extract the Semantic Embeddings zip file into any folder.
26 | 
27 | 4. Install and start the Word2Vec Rest Server (Installation guide can be found [here](https://github.com/quhfus/DoSeR-Disambiguation/wiki/Word2Vec-RestServer))
28 | 
 29 | 5. Open and adapt the disambiguation.properties file
30 | 
 31 | 6. Start the doser-dis-disambiguationserver.jar 
32 | 
 33 | ## Citation
34 | If you use DoSeR in your research, please cite the following paper:
35 | 
36 |     @inproceedings{DBLP:conf/esws/ZwicklbauerSG16,
37 |     author    = {Stefan Zwicklbauer and Christin Seifert and Michael Granitzer},
38 |     title     = {DoSeR - A Knowledge-Base-Agnostic Framework for Entity Disambiguation Using Semantic Embeddings},
39 |     booktitle = {The Semantic Web. Latest Advances and New Domains - 13th International
40 |                Conference, {ESWC} 2016, Heraklion, Crete, Greece, May 29 - June 2,
41 |                2016, Proceedings},
42 |     pages     = {182--198},
43 |     year      = {2016},
44 |     crossref  = {DBLP:conf/esws/2016},
45 |     url       = {http://dx.doi.org/10.1007/978-3-319-34129-3_12},
46 |     doi       = {10.1007/978-3-319-34129-3_12},
47 |     timestamp = {Mon, 23 May 2016 13:46:28 +0200},
48 |     biburl    = {http://dblp.uni-trier.de/rec/bib/conf/esws/ZwicklbauerSG16},
49 |     bibsource = {dblp computer science bibliography, http://dblp.org}
50 |     }
51 | 


--------------------------------------------------------------------------------
/doser-dis-core/pom.xml:
--------------------------------------------------------------------------------
 1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
 2 | 	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
 3 | 	<parent>
 4 | 		<groupId>doser-dis</groupId>
 5 | 		<artifactId>doser-dis-parent</artifactId>
 6 | 		<version>1.0</version>
 7 | 	</parent>
 8 | 	<modelVersion>4.0.0</modelVersion>
 9 | 	<groupId>doser.sub</groupId>
10 | 	<artifactId>doser-dis-core</artifactId>
11 | 	<version>0.6</version>
12 | 	<name>doser-dis-core</name>
13 | 	<description />
14 | 	<build>
15 | 		<finalName>doser-dis-core</finalName>
16 | 		<plugins>
17 | 		</plugins>
18 | 	</build>
19 | 
20 | 	<dependencies>
21 | 		<dependency>
22 | 			<groupId>com.google.guava</groupId>
23 | 			<artifactId>guava</artifactId>
24 | 			<version>18.0</version>
25 | 		</dependency>
26 | 
27 | 
28 | 		<dependency>
29 | 			<groupId>doser.sub</groupId>
30 | 			<artifactId>doser-dis-extensions</artifactId>
31 | 			<version>${project.version}</version>
32 | 		</dependency>
33 | 		<dependency>
34 | 			<groupId>org.rdfhdt</groupId>
35 | 			<artifactId>hdt-java-core</artifactId>
36 | 			<version>1.1</version>
37 | 		</dependency>
38 | 		<dependency>
39 | 			<groupId>org.rdfhdt</groupId>
40 | 			<artifactId>hdt-jena</artifactId>
41 | 			<version>1.1</version>
42 | 		</dependency>
43 | 		<dependency>
44 | 			<groupId>net.sf.jgrapht</groupId>
45 | 			<artifactId>jgrapht</artifactId>
46 | 			<version>0.8.3</version>
47 | 		</dependency>
48 | 		<dependency>
49 | 			<groupId>com.googlecode.aima-java</groupId>
50 | 			<artifactId>aima-core</artifactId>
51 | 			<version>0.10.5</version>
52 | 		</dependency>
53 | 		<dependency>
54 | 			<groupId>commons-configuration</groupId>
55 | 			<artifactId>commons-configuration</artifactId>
56 | 			<version>1.10</version>
57 | 		</dependency>
58 | 		<dependency>
59 | 			<groupId>org.codehaus.jettison</groupId>
60 | 			<artifactId>jettison</artifactId>
61 | 			<version>1.3.5</version>
62 | 		</dependency>
63 | 		<dependency>
64 | 			<groupId>org.codehaus.jackson</groupId>
65 | 			<artifactId>jackson-mapper-asl</artifactId>
66 | 			<version>1.9.13</version>
67 | 		</dependency>
68 | 		<dependency>
69 | 			<groupId>net.sf.jung</groupId>
70 | 			<artifactId>jung2</artifactId>
71 | 			<version>2.0.1</version>
72 | 			<type>pom</type>
73 | 		</dependency>
74 | 		<dependency>
75 | 			<groupId>net.sf.jung</groupId>
76 | 			<artifactId>jung-graph-impl</artifactId>
77 | 			<version>2.0.1</version>
78 | 		</dependency>
79 | 		<dependency>
80 | 			<groupId>net.sf.jung</groupId>
81 | 			<artifactId>jung-algorithms</artifactId>
82 | 			<version>2.0.1</version>
83 | 		</dependency>
84 | 		<dependency>
85 | 			<groupId>org.apache.commons</groupId>
86 | 			<artifactId>commons-math</artifactId>
87 | 			<version>2.2</version>
88 | 		</dependency>
89 | 	</dependencies>
90 | 
91 | 	<dependencyManagement>
92 | 		<dependencies>
93 | 		</dependencies>
94 | 	</dependencyManagement>
95 | </project>
96 | 


--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/src/main/java/doser/server/actions/FrameworkInitialization.java:
--------------------------------------------------------------------------------
 1 | package doser.server.actions;
 2 | 
 3 | import java.util.Enumeration;
 4 | 
 5 | import javax.servlet.ServletContext;
 6 | import javax.servlet.ServletContextEvent;
 7 | import javax.servlet.ServletContextListener;
 8 | 
 9 | import org.apache.log4j.Logger;
10 | import org.springframework.beans.factory.DisposableBean;
11 | import org.springframework.web.context.ContextLoader;
12 | import org.springframework.web.context.WebApplicationContext;
13 | 
14 | import doser.entitydisambiguation.backend.DisambiguationMainService;
15 | 
16 | public class FrameworkInitialization extends ContextLoader implements
17 | 		ServletContextListener {
18 | 
19 | 	private ContextLoader contextLoader;
20 | 
21 | 	public FrameworkInitialization() {
22 | 	}
23 | 
24 | 	public FrameworkInitialization(WebApplicationContext context) {
25 | 		super(context);
26 | 	}
27 | 
28 | 	/**
29 | 	 * Initialize the root web application context.
30 | 	 */
31 | 	@Override
32 | 	public void contextInitialized(ServletContextEvent event) {
33 | 		DisambiguationMainService.initialize();
34 | 		this.contextLoader = createContextLoader();
35 | 		if (this.contextLoader == null) {
36 | 			this.contextLoader = this;
37 | 		}
38 | 		this.contextLoader.initWebApplicationContext(event.getServletContext());
39 | 	}
40 | 
41 | 	/**
42 | 	 * Create the ContextLoader to use. Can be overridden in subclasses.
43 | 	 * 
44 | 	 * @return the new ContextLoader
45 | 	 * @deprecated in favor of simply subclassing ContextLoaderListener itself
46 | 	 *             (which extends ContextLoader, as of Spring 3.0)
47 | 	 */
48 | 	@Deprecated
49 | 	protected ContextLoader createContextLoader() {
50 | 		return null;
51 | 	}
52 | 
53 | 	/**
54 | 	 * Return the ContextLoader used by this listener.
55 | 	 * 
56 | 	 * @return the current ContextLoader
57 | 	 * @deprecated in favor of simply subclassing ContextLoaderListener itself
58 | 	 *             (which extends ContextLoader, as of Spring 3.0)
59 | 	 */
60 | 	@Deprecated
61 | 	public ContextLoader getContextLoader() {
62 | 		return this.contextLoader;
63 | 	}
64 | 
65 | 	/**
66 | 	 * Close the root web application context.
67 | 	 */
68 | 	@Override
69 | 	public void contextDestroyed(ServletContextEvent event) {
70 | 		DisambiguationMainService.getInstance().shutDownDisambiguationService();
71 | 		if (this.contextLoader != null) {
72 | 			this.contextLoader.closeWebApplicationContext(event
73 | 					.getServletContext());
74 | 		}
75 | 		ServletContext sc = event.getServletContext();
76 | 		Enumeration<String> attrNames = sc.getAttributeNames();
77 | 		while (attrNames.hasMoreElements()) {
78 | 			String attrName = attrNames.nextElement();
79 | 			if (attrName.startsWith("org.springframework.")) {
80 | 				Object attrValue = sc.getAttribute(attrName);
81 | 				if (attrValue instanceof DisposableBean) {
82 | 					try {
83 | 						((DisposableBean) attrValue).destroy();
84 | 					} catch (Throwable ex) {
85 | 						Logger.getRootLogger().fatal(ex.getMessage());
86 | 					}
87 | 				}
88 | 			}
89 | 		}
90 | 	}
91 | 
92 | }
93 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/CandidateReduction.java:
--------------------------------------------------------------------------------
 1 | package doser.entitydisambiguation.algorithms.collective;
 2 | 
 3 | import java.util.LinkedList;
 4 | import java.util.List;
 5 | import java.util.concurrent.TimeUnit;
 6 | 
 7 | import doser.entitydisambiguation.algorithms.SurfaceForm;
 8 | 
 9 | public abstract class CandidateReduction {
10 | 
11 | 	// public static final int MAXSURFACEFORMSPERQUERY = 20;
12 | 	// public static final int REDUCETO = 5;
13 | 	private List<SurfaceForm> rep;
14 | 	private boolean alwaysAction;
15 | 	private int maxsurfaceformsperquery;
16 | 
17 | 	public CandidateReduction(List<SurfaceForm> rep,
18 | 			int maxsurfaceformsperquery, boolean alwaysAction) {
19 | 		super();
20 | 		this.rep = rep;
21 | 		this.maxsurfaceformsperquery = maxsurfaceformsperquery;
22 | 		this.alwaysAction = alwaysAction;
23 | 	}
24 | 
25 | 	public void solve() {
26 | 		List<SurfaceForm> finalList = new LinkedList<SurfaceForm>();
27 | 		if (this.rep.size() > maxsurfaceformsperquery) {
28 | 			int counter = 0;
29 | 			while (true) {
30 | 				long time = System.currentTimeMillis();
31 | 				if ((counter + maxsurfaceformsperquery) < this.rep.size()) {
32 | 					List<SurfaceForm> subList = this.rep.subList(counter, (counter + maxsurfaceformsperquery));
33 | 					finalList.addAll(miniSolve(subList));
34 | 					counter += maxsurfaceformsperquery;
35 | 				} else {
36 | 					List<SurfaceForm> subList = this.rep.subList(counter, this.rep.size());
37 | 					List<SurfaceForm> cloneList = new LinkedList<SurfaceForm>();
38 | 					for (SurfaceForm sf : subList) {
39 | 						SurfaceForm clone = (SurfaceForm) sf.clone();
40 | 						cloneList.add(clone);
41 | 					}
42 | 
43 | 					int prevcounter = 0;
44 | 					List<SurfaceForm> prevList = this.rep.subList(counter - maxsurfaceformsperquery, counter);
45 | 					while (cloneList.size() < maxsurfaceformsperquery) {
46 | 						SurfaceForm clone = (SurfaceForm) prevList.get(prevcounter).clone();
47 | 						clone.setRelevant(false);
48 | 						cloneList.add(clone);
49 | 						prevcounter++;
50 | 					}
51 | 					List<SurfaceForm> workedList = miniSolve(cloneList);
52 | 					List<SurfaceForm> sfs = new LinkedList<SurfaceForm>();
53 | 					for (SurfaceForm sf : workedList) {
54 | 						if (sf.isRelevant()) {
55 | 							sfs.add(sf);
56 | 						}
57 | 					}
58 | 					finalList.addAll(sfs);
59 | 					break;
60 | 				}
61 | 				long millis = System.currentTimeMillis() - time;
62 | 				String formatedTime = String.format("%d min, %d sec", 
63 | 					    TimeUnit.MILLISECONDS.toMinutes(millis),
64 | 					    TimeUnit.MILLISECONDS.toSeconds(millis) - 
65 | 					    TimeUnit.MINUTES.toSeconds(TimeUnit.MILLISECONDS.toMinutes(millis))
66 | 					);
67 | 				System.out.println(formatedTime);
68 | 			}
69 | 			this.rep = finalList;
70 | 		} else {
71 | 			if(alwaysAction) {
72 | 				finalList.addAll(miniSolve(rep));
73 | 				this.rep = finalList;
74 | 			}
75 | 		}
76 | 	}
77 | 
78 | 	public List<SurfaceForm> getRep() {
79 | 		return rep;
80 | 	}
81 | 
	/**
	 * Processes one chunk of surface forms.
	 * <p>
	 * The chunking loop of this class hands in sub lists of the overall
	 * surface form list and appends the returned elements to the final
	 * result. For the last, shorter chunk the input is padded with clones of
	 * surface forms from the preceding chunk that are flagged as not relevant;
	 * of the returned elements only those whose {@code isRelevant()} is true
	 * are kept.
	 *
	 * @param rep
	 *            the surface forms of the current chunk
	 * @return the processed surface forms for this chunk
	 */
	public abstract List<SurfaceForm> miniSolve(List<SurfaceForm> rep);
83 | }
84 | 


--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/lucene/query/LearnToRankTermScorer.java:
--------------------------------------------------------------------------------
  1 | package doser.lucene.query;
  2 | 
  3 | import java.io.IOException;
  4 | 
  5 | import org.apache.lucene.index.DocsEnum;
  6 | import org.apache.lucene.search.Scorer;
  7 | import org.apache.lucene.search.Weight;
  8 | import org.apache.lucene.search.similarities.Similarity;
  9 | 
 10 | /**
 11 |  * Expert: A <code>Scorer</code> for documents matching a <code>Term</code>.
 12 |  */
 13 | final class LearnToRankTermScorer extends Scorer {
 14 | 	private final Similarity.SimScorer docScorer;
 15 | 	private final DocsEnum docsEnum;
 16 | 
 17 | 	/**
 18 | 	 * Construct a <code>TermScorer</code>.
 19 | 	 * 
 20 | 	 * @param weight
 21 | 	 *            The weight of the <code>Term</code> in the query.
 22 | 	 * @param docsEnum
 23 | 	 *            An iterator over the documents matching the <code>Term</code>.
 24 | 	 * @param docScorer
 25 | 	 *            The </code>Similarity.ExactSimScorer</code> implementation to
 26 | 	 *            be used for score computations.
 27 | 	 * @param docFreq
 28 | 	 *            per-segment docFreq of this term
 29 | 	 */
 30 | 	LearnToRankTermScorer(final Weight weight, final DocsEnum docsEnum,
 31 | 			final Similarity.SimScorer docScorer) {
 32 | 		super(weight);
 33 | 		this.docScorer = docScorer;
 34 | 		this.docsEnum = docsEnum;
 35 | 	}
 36 | 
 37 | 	/**
 38 | 	 * Advances to the first match beyond the current whose document number is
 39 | 	 * greater than or equal to a given target. <br>
 40 | 	 * The implementation uses {@link DocsEnum#advance(int)}.
 41 | 	 * 
 42 | 	 * @param target
 43 | 	 *            The target document number.
 44 | 	 * @return the matching document or NO_MORE_DOCS if none exist.
 45 | 	 */
 46 | 	@Override
 47 | 	public int advance(final int target) throws IOException {
 48 | 		return docsEnum.advance(target);
 49 | 	}
 50 | 
 51 | 	@Override
 52 | 	public long cost() {
 53 | 		return docsEnum.cost();
 54 | 	}
 55 | 
 56 | 	@Override
 57 | 	public int docID() {
 58 | 		return docsEnum.docID();
 59 | 	}
 60 | 
 61 | 	@Override
 62 | 	public int freq() throws IOException {
 63 | 		return docsEnum.freq();
 64 | 	}
 65 | 
 66 | 	DocsEnum getDocsEnum() {
 67 | 		return docsEnum;
 68 | 	}
 69 | 
 70 | 	/**
 71 | 	 * Advances to the next document matching the query. <br>
 72 | 	 * 
 73 | 	 * @return the document matching the query or NO_MORE_DOCS if there are no
 74 | 	 *         more documents.
 75 | 	 */
 76 | 	@Override
 77 | 	public int nextDoc() throws IOException {
 78 | 		return docsEnum.nextDoc();
 79 | 	}
 80 | 
 81 | 	// TODO: benchmark if the specialized conjunction really benefits
 82 | 	// from this, or if instead its from sorting by docFreq, or both
 83 | 
 84 | 	@Override
 85 | 	public float score() throws IOException {
 86 | 		assert docID() != NO_MORE_DOCS;
 87 | 		return docScorer.score(docsEnum.docID(), docsEnum.freq());
 88 | 	}
 89 | 
 90 | 	// TODO: generalize something like this for scorers?
 91 | 	// even this is just an estimation...
 92 | 
 93 | 	// int getDocFreq() {
 94 | 	// return docFreq;
 95 | 	// }
 96 | 
 97 | 	/** Returns a string representation of this <code>TermScorer</code>. */
 98 | 	@Override
 99 | 	public String toString() {
100 | 		return "scorer(" + weight + ")";
101 | 	}
102 | }


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/knowledgebases/EntityCentricKBBiomed.java:
--------------------------------------------------------------------------------
 1 | package doser.entitydisambiguation.knowledgebases;
 2 | 
 3 | import java.util.List;
 4 | 
 5 | import org.apache.lucene.search.similarities.Similarity;
 6 | 
 7 | public class EntityCentricKBBiomed extends AbstractEntityCentricKBGeneral {
 8 | 
 9 | 	public EntityCentricKBBiomed(String uri, boolean dynamic, Similarity sim) {
10 | 		super(uri, dynamic, sim);
11 | 	}
12 | 
13 | 	public EntityCentricKBBiomed(String uri, boolean dynamic) {
14 | 		super(uri, dynamic);
15 | 	}
16 | 
17 | 	/**
18 | 	 * Takes a set of entities as well as a target entity and generates one
19 | 	 * string that fits into the word2vec query format used in this class. The
20 | 	 * source entities are concatenated and should be compared with the target
21 | 	 * entity.
22 | 	 *
23 | 	 * @param source
24 | 	 *            a set of source entities
25 | 	 * @param target
26 | 	 *            the target entity.
27 | 	 * @return String in appropriate word2vec query format
28 | 	 */
29 | 	public String generateWord2VecFormatString(String source, String target) {
30 | 		source = convertUrlToBiomedEntityIdentifier(source);
31 | 		target = convertUrlToBiomedEntityIdentifier(target);
32 | 		int c = source.compareToIgnoreCase(target);
33 | 		String res = "";
34 | 		if (c < 0) {
35 | 			res = source + "|" + target;
36 | 		} else if (c == 0) {
37 | 			res = source + "|" + target;
38 | 		} else {
39 | 			res = target + "|" + source;
40 | 		}
41 | 		return res;
42 | 	}
43 | 
44 | 	/**
45 | 	 * Takes a set of entities as well as a target entity and generates one
46 | 	 * string that fits into the word2vec query format used in this class. The
47 | 	 * source entities are concatenated and should be compared wit the target
48 | 	 * entity.
49 | 	 *
50 | 	 * @param source
51 | 	 *            a set of source entities
52 | 	 * @param target
53 | 	 *            the target entity.
54 | 	 * @return String in appropriate word2vec query format
55 | 	 */
56 | 	public String generateWord2VecFormatString(List<String> source, String target) {
57 | 		StringBuilder builder = new StringBuilder();
58 | 		for (String s : source) {
59 | 			s = convertUrlToBiomedEntityIdentifier(s);
60 | 			builder.append(s);
61 | 			builder.append("|");
62 | 		}
63 | 		String src = builder.toString();
64 | 		src = src.substring(0, src.length() - 1);
65 | 		target = convertUrlToBiomedEntityIdentifier(target);
66 | 		return src + "|" + target;
67 | 	}
68 | 
69 | 	private String convertUrlToBiomedEntityIdentifier(String url) {
70 | 		String res = "";
71 | 		if (url.startsWith("http://www.uniprot.org/uniprot/")) {
72 | 			res = "UNIPROT_" + url.replaceAll("http://www.uniprot.org/uniprot/", "");
73 | 		} else if (url.startsWith("http://www.ncbi.nlm.nih.gov/gene/")) {
74 | 			res = "NCBI_" + url.replaceAll("http://www.ncbi.nlm.nih.gov/gene/", "");
75 | 		} else if (url.startsWith("http://linkedlifedata.com/resource/umls-concept/")) {
76 | 			res = "UMLS_" + url.replaceAll("http://linkedlifedata.com/resource/umls-concept/", "");
77 | 		}
78 | 		return res;
79 | 	}
80 | 
81 | 	@Override
82 | 	protected String generateDomainName() {
83 | 		return "Biomed";
84 | 	}
85 | 	
86 | 	@Override
87 | 	protected String kbName() {
88 | 		return "CalbC Biomedical KB";
89 | 	}
90 | }
91 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/rules/CheckGeneralEntities.java:
--------------------------------------------------------------------------------
 1 | package doser.entitydisambiguation.algorithms.rules;
 2 | 
 3 | import java.io.IOException;
 4 | import java.util.LinkedList;
 5 | import java.util.List;
 6 | 
 7 | import org.apache.lucene.document.Document;
 8 | import org.apache.lucene.index.IndexReader;
 9 | import org.apache.lucene.index.Term;
10 | import org.apache.lucene.search.IndexSearcher;
11 | import org.apache.lucene.search.ScoreDoc;
12 | import org.apache.lucene.search.TopDocs;
13 | 
14 | import doser.entitydisambiguation.algorithms.SurfaceForm;
15 | import doser.entitydisambiguation.knowledgebases.EntityCentricKBDBpedia;
16 | import doser.lucene.query.TermQuery;
17 | 
18 | class CheckGeneralEntities extends AbstractRule {
19 | 
20 | 	CheckGeneralEntities(EntityCentricKBDBpedia eckb) {
21 | 		super(eckb);
22 | 	}
23 | 
24 | 	@Override
25 | 	public boolean applyRule(List<SurfaceForm> rep) {
26 | 		for (SurfaceForm c : rep) {
27 | 			String sf = c.getSurfaceForm().toLowerCase();
28 | 			List<String> candidates = c.getCandidates();
29 | 			String checked = null;
30 | 			// Surface Form - Candidate Match i.e. Saturday -
31 | 			// http://dbpedia.org/resource/Saturday
32 | 			for (String s : candidates) {
33 | 				String ent = s.replaceAll("http://dbpedia.org/resource/", "")
34 | 						.toLowerCase();
35 | 				if (sf.equalsIgnoreCase(ent)) {
36 | 					checked = s;
37 | 					break;
38 | 				}
39 | 			}
40 | 
41 | 			if (checked != null && !checkSurfaceFormSubset(sf, rep)) {
42 | 				List<String> keepCandidates = new LinkedList<String>();
43 | 				for (String can : candidates) {
44 | 					String[] labels = null;
45 | 					IndexSearcher searcher = eckb.getSearcher();
46 | 					IndexReader reader = searcher.getIndexReader();
47 | 					TermQuery query = new TermQuery(new Term("Mainlink", can));
48 | 					try {
49 | 						final TopDocs top = searcher.search(query, 1);
50 | 						final ScoreDoc[] score = top.scoreDocs;
51 | 						final Document doc = reader.document(score[0].doc);
52 | 						labels = doc.getValues("Label");
53 | 					} catch (IOException e) {
54 | 						e.printStackTrace();
55 | 					}
56 | 					// Check whether the candidate has label of the original
57 | 					// surface form
58 | 					if (labels != null) {
59 | 						boolean isIn = false;
60 | 						for (int i = 0; i < labels.length; ++i) {
61 | 							if (labels[i].toLowerCase().equalsIgnoreCase(sf)) {
62 | 								isIn = true;
63 | 								break;
64 | 							}
65 | 						}
66 | 						// If IN, keep this candidate
67 | 						if (isIn) {
68 | 							keepCandidates.add(can);
69 | 						}
70 | 					}
71 | 				}
72 | 				if (!keepCandidates.isEmpty()) {
73 | 					c.setCandidates(keepCandidates);
74 | 					if(keepCandidates.size() == 1) {
75 | 						System.out.println("**********************************************************************");
76 | 						System.out.println(keepCandidates.toString());
77 | 						System.out.println("**********************************************************************");
78 | 					}
79 | 				}
80 | 			}
81 | 		}
82 | 		return false;
83 | 	}
84 | 
85 | 	private boolean checkSurfaceFormSubset(String sf,
86 | 			List<SurfaceForm> reps) {
87 | 		boolean isIn = false;
88 | 		for (SurfaceForm c : reps) {
89 | 			String toCheck = c.getSurfaceForm().toLowerCase();
90 | 			if (!toCheck.equalsIgnoreCase(sf) && toCheck.contains(sf)) {
91 | 				isIn = true;
92 | 				break;
93 | 			}
94 | 		}
95 | 		return isIn;
96 | 	}
97 | }
98 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/dbpedia/CollectiveAndContextDriver.java:
--------------------------------------------------------------------------------
 1 | package doser.entitydisambiguation.algorithms.collective.dbpedia;
 2 | 
 3 | import java.util.LinkedList;
 4 | import java.util.List;
 5 | 
 6 | import doser.entitydisambiguation.algorithms.SurfaceForm;
 7 | import doser.entitydisambiguation.algorithms.collective.CandidatePruning;
 8 | import doser.entitydisambiguation.algorithms.rules.RuleAdapation;
 9 | import doser.entitydisambiguation.dpo.DisambiguatedEntity;
10 | import doser.entitydisambiguation.dpo.Response;
11 | import doser.entitydisambiguation.knowledgebases.EntityCentricKBDBpedia;
12 | 
13 | class CollectiveAndContextDriver {
14 | 
15 | 	static final int PREPROCESSINGCONTEXTSIZE = 200;
16 | 
17 | 	private String topic;
18 | 	private Response[] currentResponse;
19 | 	private List<SurfaceForm> rep;
20 | 	private EntityCentricKBDBpedia eckb;
21 | 
22 | 	CollectiveAndContextDriver(Response[] res, List<SurfaceForm> rep, EntityCentricKBDBpedia eckb, String topic) {
23 | 		super();
24 | 		this.topic = topic;
25 | 		if (res.length != rep.size()) {
26 | 			throw new IllegalArgumentException();
27 | 		}
28 | 		this.currentResponse = res;
29 | 		this.rep = rep;
30 | 		this.eckb = eckb;
31 | 		this.eckb.precomputeDoc2VecSimilarities(rep, PREPROCESSINGCONTEXTSIZE);
32 | 	}
33 | 
34 | 	void solve() {
35 | 		// First candidate pruning
36 | 		CandidatePruning pruning = new CandidatePruning(eckb);
37 | 		pruning.prune(rep);
38 | 		if (topic != null) {
39 | 			TableColumnFilter cf = new TableColumnFilter(eckb, topic);
40 | 			cf.filter(rep);
41 | 		}
42 | 		TimeNumberDisambiguation timenumberdis = new TimeNumberDisambiguation(eckb);
43 | 		timenumberdis.solve(rep);
44 | 		LocationDisambiguation locationDis = new LocationDisambiguation(eckb);
45 | 		locationDis.solve(rep);
46 | 
47 | 		RuleAdapation rules = new RuleAdapation();
48 | 		rules.addNoCandidatesCheckPluralRule(eckb);
49 | 		rules.addNoCandidatesExpansionRule(eckb);
50 | 		rules.addUnambiguousToAmbiguousRule(eckb);
51 | 		rules.addPatternRule(eckb, topic);
52 | 		rules.addContextRule(eckb);
53 | 		rules.performRuleChainBeforeCandidateSelection(rep);
54 | 
55 | 		CandidateReductionDBpediaW2V w2vreduction = new CandidateReductionDBpediaW2V(eckb, rep, 20, 5, 150, false, false);
56 | 		w2vreduction.solve();
57 | 		rep = w2vreduction.getRep();
58 | 
59 | 		w2vreduction = new CandidateReductionDBpediaW2V(eckb, rep, 45, 5, 250, true, true);
60 | 		w2vreduction.solve();
61 | 		rep = w2vreduction.getRep();
62 | 		FinalEntityDisambiguation finalDis = new FinalEntityDisambiguation(eckb, rep);
63 | 		finalDis.setup();
64 | 		finalDis.solve();
65 | 	}
66 | 
67 | 	void generateResult() {
68 | 		for (int i = 0; i < currentResponse.length; i++) {
69 | 			SurfaceForm r = search(i);
70 | 			if (currentResponse[i] == null && r != null && r.getCandidates().size() == 1) {
71 | 				Response res = new Response();
72 | 				List<DisambiguatedEntity> entList = new LinkedList<DisambiguatedEntity>();
73 | 				DisambiguatedEntity ent = new DisambiguatedEntity();
74 | 				ent.setEntityUri(r.getCandidates().get(0));
75 | 				entList.add(ent);
76 | 				res.setDisEntities(entList);
77 | 				res.setSelectedText(r.getSurfaceForm());
78 | 				currentResponse[i] = res;
79 | 			}
80 | 		}
81 | 	}
82 | 
83 | 	private SurfaceForm search(int qryNr) {
84 | 		for (SurfaceForm r : rep) {
85 | 			if (r.getQueryNr() == qryNr) {
86 | 				return r;
87 | 			}
88 | 		}
89 | 		return null;
90 | 	}
91 | }
92 | 


--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/lucene/query/PriorQuery.java:
--------------------------------------------------------------------------------
  1 | package doser.lucene.query;
  2 | 
  3 | import java.io.IOException;
  4 | 
  5 | import org.apache.lucene.index.AtomicReaderContext;
  6 | import org.apache.lucene.search.Explanation;
  7 | import org.apache.lucene.search.IndexSearcher;
  8 | import org.apache.lucene.search.Query;
  9 | import org.apache.lucene.search.Scorer;
 10 | import org.apache.lucene.search.Weight;
 11 | import org.apache.lucene.util.Bits;
 12 | 
 13 | import doser.lucene.features.IEntityCentricExtFeatures;
 14 | 
 15 | /**
 16 |  * Due to major performance problems if we use an IndexReader request for every
 17 |  * single document, we create a <Concept, Occurence> Hashmap to improve the
 18 |  * overall performance.
 19 |  * 
 20 |  * Our StartupInformationLoader provides these necessary information much
 21 |  * faster.
 22 |  * 
 23 |  * @author Stefan Zwicklbauer
 24 |  */
 25 | public class PriorQuery extends Query {
 26 | 
 27 | 	class PriorWeight extends Weight {
 28 | 
 29 | 		class PriorScorer extends Scorer {
 30 | 
 31 | 			private final AtomicReaderContext context;
 32 | 
 33 | 			private int lastDoc = -1;
 34 | 
 35 | 			PriorScorer(final Weight weight, final AtomicReaderContext context) {
 36 | 				super(weight);
 37 | 				this.context = context;
 38 | 			}
 39 | 
 40 | 			@Override
 41 | 			public int advance(final int target) throws IOException {
 42 | 				final int maxdoc = context.reader().numDocs();
 43 | 				if (target > (maxdoc - 1)) {
 44 | 					return NO_MORE_DOCS;
 45 | 				}
 46 | 				return lastDoc = target;
 47 | 			}
 48 | 
 49 | 			@Override
 50 | 			public long cost() {
 51 | 				return 0;
 52 | 			}
 53 | 
 54 | 			@Override
 55 | 			public int docID() {
 56 | 				return lastDoc;
 57 | 			}
 58 | 
 59 | 			@Override
 60 | 			public int freq() throws IOException {
 61 | 				return 1;
 62 | 			}
 63 | 
 64 | 			@Override
 65 | 			public int nextDoc() throws IOException {
 66 | 				if ((context.reader().numDocs() - 1) > lastDoc) {
 67 | 					return ++lastDoc;
 68 | 				} else {
 69 | 					return NO_MORE_DOCS;
 70 | 				}
 71 | 			}
 72 | 
 73 | 			@Override
 74 | 			public float score() throws IOException {
 75 | 				return kb.getPriorOfDocument(context.docBase + lastDoc);
 76 | 			}
 77 | 
 78 | 			@Override
 79 | 			public String toString() {
 80 | 				return "Prior";
 81 | 			}
 82 | 		}
 83 | 
 84 | 		@Override
 85 | 		public Explanation explain(final AtomicReaderContext context,
 86 | 				final int doc) throws IOException {
 87 | 			return null;
 88 | 		}
 89 | 
 90 | 		@Override
 91 | 		public Query getQuery() {
 92 | 			return PriorQuery.this;
 93 | 		}
 94 | 
 95 | 		@Override
 96 | 		public float getValueForNormalization() throws IOException {
 97 | 			return 0;
 98 | 		}
 99 | 
100 | 		@Override
101 | 		public void normalize(final float norm, final float topLevelBoost) {
102 | 			// Do nothing here!
103 | 		}
104 | 
105 | 		@Override
106 | 		public Scorer scorer(AtomicReaderContext context, Bits acceptDocs)
107 | 				throws IOException {
108 | 			return new PriorScorer(this, context);
109 | 		}
110 | 	}
111 | 
112 | 	private IEntityCentricExtFeatures kb;
113 | 
114 | 	public PriorQuery(IEntityCentricExtFeatures kb) {
115 | 		super();
116 | 		this.kb = kb;
117 | 	}
118 | 
119 | 	@Override
120 | 	public Weight createWeight(final IndexSearcher searcher) throws IOException {
121 | 		return new PriorWeight();
122 | 	}
123 | 
124 | 	@Override
125 | 	public String toString(final String field) {
126 | 		return "PriorQuery";
127 | 	}
128 | }
129 | 


--------------------------------------------------------------------------------
/yes:
--------------------------------------------------------------------------------
 1 | -----BEGIN RSA PRIVATE KEY-----
 2 | Proc-Type: 4,ENCRYPTED
 3 | DEK-Info: AES-128-CBC,9F33236E2FD99EACABA4D7F529D0E8A5
 4 | 
 5 | 8PISpGsmdq0QuL/NcFlGOZznZdyibB1/A6nI5bfiDljT5hzQ7xWFBM3S2IHeKUVK
 6 | wdJdA+c3Y4dXgRllMczUMJBXX3UfObsm3/5TWCKPwczxLJ0tgxCgYVX9KNVt0Ngv
 7 | b2ayQqkNBvBHq5ooKr8glkjvZ0Wl6QZ+W4pz8KndfzSiUri/WTryEmjzYbgyBXyG
 8 | 8L+wG8mGOiCYKOFlVM+ViE8f3d+i0lsxX7PgXkdyWOvlgx2Iy3MhLQNXw0LztU6I
 9 | QFpa1DtjcBewpvYtPJn6fma1nqhc8bTSaM0/1a8aLeCJWqCzQ5vD1wkkZ1eLEMDn
10 | Jg0D1fT2mm3XtNAMwOHcd+j3IG7aTofhU+XRBPk1YRdbOJjNuMzgV+P3dxXUhGLV
11 | N9vb2hUm/wIXngiKTeigsYGj59nvhyda6DfLhsNfizH1M/Foq3ZaNdWCvwtfJzAS
12 | sw2tW+PnPJiKpSXE1O7DQ3fduv5gBrrxZ906kHVKzPPa0T+0HWN+Z3MyM8IbuYKf
13 | zUVo0IogdobK+vm6HcKTWCdV0v5BPG6cTWHbTUi2kdJLc9j1lnnzEAOMIHYexsg2
14 | 8PmD2uncDNvUvS5DDILVSFj40zG57c2pVgBWcN1U211env8eb9jD4oJr+rOH4gvm
15 | pDLnB72eYZmQ9oUnnLsPo8c8cFfuJiTYIqmPW5crpzjUQlDlMlc8Kh5A3XJ/XHqh
16 | sq7M+Kn54l51SH+FvpS6u/s6dpwjCa+UbuFzdbJlE/RRLZaoTh0gov6k0n/48XSU
17 | 0XBJGuKyL8hmGmwAyMwdNb9vuH0Gah30ZeIpb8Iiw6aUNeCnpDrZ+b1M7VAC2Q/M
18 | UvuNe+datxI4FNyqPOnmi8o+vkWl3W8+M71qkGGsi+qnSUwnR9uUFg6VBt1WEdHw
19 | QpkPeQtnS53kadKSqLZEnPTnLsEYZfByCexgdXzJr32+IiUzkM8PoNuMzhVR+LgK
20 | Q55EJyFj736f8pwzC4k8Iz5WwAqnabXJH3eEW+o94a75xCM/32QW3ZJS8+yVh3Jb
21 | R622Tu9S6VxPzrS/HRbAmLCsWwy8svobKVTMN5vOzx3bZ5DrtjdyY8eBfQgBLQQW
22 | HxKGXYygz7M93e03K4VQbc0Gt1igBXgOH/W6MZXAzMk+WfXVRml2BzUWnh5pGvVt
23 | x0+vlbWESWKdIYY16R20R2594Elh9j1kgzRE3c3f0Aq86S5VhR4wvwcjF2GpHHuH
24 | 1ILCKvmWehfl+DJ1kyYfAXemsHxkkAHNCpJQ9TeKQiVUWDIjEBvuEEn6lgEu3vWG
25 | LgCV+AkWRKRRExssPK/Cj/VUqa4mhFLOy61JKi5XCj991MwXXJPaSmTp9j2hofcR
26 | yZWkaqwhe3kkZfVCETl4wTAPs+uB+7vW1zO70me959D4qoZVmu/Lr/VnGDw+7WIg
27 | NuDyIM7cSE/8va7r19b0uDJdwLrcmir8WwmxJOHCOQv+hY73RR2Hcmr1EtOp2BEw
28 | dwuc3+ewMcvNSQLnBUe/6OMRz9Z0kc620f1H6X4WJHu1BA0FDCbh9HeEpx3zsECN
29 | YPCrlZUS68kkGxscE3QgtTnKDsjArPrLxxFueBAlpYVUW+jzhqnd1w0xCXBbB2tV
30 | xi0kIpigCobhS35xig5nR6nkoSjc7nr6ybiEVA5x6Fbd41fVwtFop+4W8kmZL4/I
31 | 8lwHBp8SLRv/bjN7q72rYn1HH+JIKAskuLhpG00nK9gzDDhWYGVEuFhy3Jw6OZtZ
32 | tro3PRgAW83yKIjGvpGb1iZEg4YQhldZGq4/bxOU0FYKTniGlA+sZFmSYrKCPW4D
33 | 9J8isFexm0P6Dv8vjzIV/WSbTn9Z9bF3bcN1Eg91a2X/82iVlTh4Lgea8NMhLtUN
34 | nyKETpUQRoT126mHuaVbvD/OG2PUwLMt5vahQoTaYmazTk+Uevjgi9PfWBSLzsqB
35 | QKzCNoQjzcibYuAv4zU7hbjEXjtLXXkVyzVhTTiTKzIXEd8c8f4XUSEHo82UXjMa
36 | gzsXx8VsTEzfBEPSruBaKxf82LInpgGwNPlVTsW+g1T0nGE0qC7W/BYfSYEVCZY/
37 | PmaIwMzC5akuZnqiLTOMIwEdPe8iYzntcvCfUZB1rL75Xx6Y7YFLIt96fKFb5Nn1
38 | Yni80JAtvoFlCiZaUSoWAnHecXNewd5xwJjaJdgFh38cCZmvjTxupk9rU3lsLLoX
39 | tzZEFJv7Qt/axbqkIT/zdJr87zeScAgLU7PcpO05LPQR9pU3mm3z0jylgMUzU1Qz
40 | IVJHQ8CnaCTg2S7fwjZjHVlHIrPPiZgVhTN7Rt8vZ3CB7Wf8sXpGBIXADYoNiOVG
41 | lvtrXJYGZ5uoyeJLHerGNyMu4B3iCoY50kilNCcQ3cfX7G6SMwMgH1oJDHCMdOr+
42 | PLWf45FcwQrhkj56DqytV389OKaADsJoNuEpgbLmnUBCJQHzq4/Lfoqvqj7z2PE/
43 | F2kgb9JN7eBfbw/a6Sa7A0Qe8yCOVd9HWqSt0sQDqITcybF/gfU5IAjaFDWm6xKk
44 | FMFKTigj6Y4UfDfffZfVFAJ0AqNfkHTAI92ShGU/hrDAHmcgiio3m93IsnSjqYWs
45 | McFgcvsaqQpb4LfkdckXBDrZCVXNbeOe7JdxLcZxlI1hHeve2spz7zY7N3MTZzNm
46 | xZ0wcndfcmfVv/KXGvjPGh9+rrZyWXfeT5bwE6wLwg+CJmCI2AJDvoGdx7hkL8FL
47 | FKjbOrnTCai+Q4/vOdVpQz7/X7nyIX5DgqthqI8PTF4qAmoKM8htATK96CfW/Mw5
48 | PEQbU25nRHSE/TxVWoeoPJ5YQLnlh6Voey9Sk5vSzBNwyZXde9/1okZPvnZjmcvu
49 | 9TxOpoETYnNyfZEJ4g4FvHWSpN7YiDnNiwvD4nCRIq9oQTWhjK3w76Drv92MjaqJ
50 | bzaNMt909qVjLaio1sT5tDtqXT9Me5R7bL1qoEPXAePzYD7Bc1kZs1FD3emCCjh+
51 | TL/sLv64fPrpEH026AKfNqUWd9A0EexJqnVH6J6TgE6LrYe7Wq8PHlc+3DiEdroT
52 | qyMnP71BTu/UrUcm/rQ/+FDvduVncD0mDuUaw3Vr3Lf0DgYr/7nd5IFMP+5bpPZo
53 | KU5dNyRfOYOZTJ4vdTYpjeOU1IkjP+fBrbZ8wacHEqju68v4XViIJNaZrAJmq5t/
54 | -----END RSA PRIVATE KEY-----
55 | 


--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/lucene/query/SensePriorQuery.java:
--------------------------------------------------------------------------------
  1 | package doser.lucene.query;
  2 | 
  3 | import java.io.IOException;
  4 | 
  5 | import org.apache.lucene.index.AtomicReaderContext;
  6 | import org.apache.lucene.search.Explanation;
  7 | import org.apache.lucene.search.IndexSearcher;
  8 | import org.apache.lucene.search.Query;
  9 | import org.apache.lucene.search.Scorer;
 10 | import org.apache.lucene.search.Weight;
 11 | import org.apache.lucene.util.Bits;
 12 | 
 13 | import doser.lucene.features.IEntityCentricExtFeatures;
 14 | 
 15 | /**
 16 |  * Due to major performance problems if we use an IndexReader request for every
 17 |  * single document, we create a <Concept, Occurence> Hashmap to improve the
 18 |  * overall performance.
 19 |  * 
 20 |  * Our StartupInformationLoader provides these necessary information much
 21 |  * faster.
 22 |  * 
 23 |  * @author Stefan Zwicklbauer
 24 |  * 
 25 |  */
 26 | public class SensePriorQuery extends Query {
 27 | 
 28 | 	class PriorWeight extends Weight {
 29 | 
 30 | 		class SensePriorScorer extends Scorer {
 31 | 
 32 | 			private final AtomicReaderContext context;
 33 | 
 34 | 			private int lastDoc = -1;
 35 | 
 36 | 			SensePriorScorer(final Weight weight,
 37 | 					final AtomicReaderContext context) {
 38 | 				super(weight);
 39 | 				this.context = context;
 40 | 			}
 41 | 
 42 | 			@Override
 43 | 			public int advance(final int target) throws IOException {
 44 | 				final int maxdoc = context.reader().numDocs();
 45 | 				if (target > (maxdoc - 1)) {
 46 | 					return NO_MORE_DOCS;
 47 | 				}
 48 | 				return lastDoc = target;
 49 | 			}
 50 | 
 51 | 			@Override
 52 | 			public long cost() {
 53 | 				return 0;
 54 | 			}
 55 | 
 56 | 			@Override
 57 | 			public int docID() {
 58 | 				return lastDoc;
 59 | 			}
 60 | 
 61 | 			@Override
 62 | 			public int freq() throws IOException {
 63 | 				return 1;
 64 | 			}
 65 | 
 66 | 			@Override
 67 | 			public int nextDoc() throws IOException {
 68 | 				if ((context.reader().numDocs() - 1) > lastDoc) {
 69 | 					return ++lastDoc;
 70 | 				} else {
 71 | 					return NO_MORE_DOCS;
 72 | 				}
 73 | 			}
 74 | 
 75 | 			@Override
 76 | 			public float score() throws IOException {
 77 | 				float res = 0.0f;
 78 | 				res = kb.getSensePriorOfDocument(keyword, context.docBase
 79 | 						+ lastDoc);
 80 | 				return res;
 81 | 			}
 82 | 
 83 | 			@Override
 84 | 			public String toString() {
 85 | 				return "SensePrior";
 86 | 			}
 87 | 
 88 | 		}
 89 | 
 90 | 		@Override
 91 | 		public Explanation explain(final AtomicReaderContext context,
 92 | 				final int doc) throws IOException {
 93 | 			return null;
 94 | 		}
 95 | 
 96 | 		@Override
 97 | 		public Query getQuery() {
 98 | 			return SensePriorQuery.this;
 99 | 		}
100 | 
101 | 		@Override
102 | 		public float getValueForNormalization() throws IOException {
103 | 			return 0;
104 | 		}
105 | 
106 | 		@Override
107 | 		public void normalize(final float norm, final float topLevelBoost) {
108 | 			// Do nothing here
109 | 		}
110 | 
111 | 		@Override
112 | 		public Scorer scorer(AtomicReaderContext context, Bits acceptDocs)
113 | 				throws IOException {
114 | 			return new SensePriorScorer(this, context);
115 | 		}
116 | 
117 | 	}
118 | 
119 | 	private final IEntityCentricExtFeatures kb;
120 | 
121 | 	private final String keyword;
122 | 
123 | 	public SensePriorQuery(final String keyword, final IEntityCentricExtFeatures kb) {
124 | 		super();
125 | 		this.keyword = keyword;
126 | 		this.kb = kb;
127 | 	}
128 | 
129 | 	@Override
130 | 	public Weight createWeight(final IndexSearcher searcher) throws IOException {
131 | 		return new PriorWeight();
132 | 	}
133 | 
134 | 	@Override
135 | 	public String toString(final String field) {
136 | 		return "SensePriorQuery";
137 | 	}
138 | }
139 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/SurfaceForm.java:
--------------------------------------------------------------------------------
  1 | package doser.entitydisambiguation.algorithms;
  2 | 
  3 | import java.util.ArrayList;
  4 | import java.util.List;
  5 | 
  6 | public class SurfaceForm implements Comparable<SurfaceForm>, Cloneable {
  7 | 
  8 | 	private int queryNr;
  9 | 	private String surfaceForm;
 10 | 	private String context;
 11 | 	private List<String> candidates;
 12 | 	private Integer ambiguity;
 13 | 	private boolean isACandidate;
 14 | 	private double difference;
 15 | 	private int position;
 16 | 	private boolean matchesInitial;
 17 | 	private boolean initial;
 18 | 	private boolean isRelevant;
 19 | 
 20 | 	public SurfaceForm(String surfaceForm, String context, List<String> candidates, int qryNr, int position) {
 21 | 		super();
 22 | 		this.ambiguity = candidates.size();
 23 | 		this.surfaceForm = surfaceForm;
 24 | 		this.context = context;
 25 | 		this.candidates = candidates;
 26 | 		this.queryNr = qryNr;
 27 | 		this.isACandidate = true;
 28 | 		this.difference = 0;
 29 | 		this.position = position;
 30 | 		this.initial = false;
 31 | 		this.isRelevant = true;
 32 | 	}
 33 | 
 34 | 	public boolean isRelevant() {
 35 | 		return isRelevant;
 36 | 	}
 37 | 
 38 | 	public void setRelevant(boolean isRelevant) {
 39 | 		this.isRelevant = isRelevant;
 40 | 	}
 41 | 
 42 | 	public boolean isMatchesInitial() {
 43 | 		return matchesInitial;
 44 | 	}
 45 | 
 46 | 	public void setMatchesInitial(boolean matchesInitial) {
 47 | 		this.matchesInitial = matchesInitial;
 48 | 	}
 49 | 
 50 | 	public boolean isInitial() {
 51 | 		return initial;
 52 | 	}
 53 | 
 54 | 	public void setInitial(boolean initial) {
 55 | 		this.initial = initial;
 56 | 	}
 57 | 
 58 | 	public void setCandidates(List<String> candidates) {
 59 | 		this.candidates = candidates;
 60 | 	}
 61 | 
 62 | 	public List<String> getCandidates() {
 63 | 		return candidates;
 64 | 	}
 65 | 
 66 | 	public void setACandidate(boolean can) {
 67 | 		this.isACandidate = can;
 68 | 	}
 69 | 
 70 | 	public String getSurfaceForm() {
 71 | 		return surfaceForm;
 72 | 	}
 73 | 
 74 | 	public boolean isACandidate() {
 75 | 		return isACandidate;
 76 | 	}
 77 | 
 78 | 	public String getContext() {
 79 | 		return context;
 80 | 	}
 81 | 
 82 | 	public int getQueryNr() {
 83 | 		return queryNr;
 84 | 	}
 85 | 
 86 | 	public int getAmbiguity() {
 87 | 		return this.ambiguity;
 88 | 	}
 89 | 
 90 | 	public void setDisambiguatedEntity(String url) {
 91 | 		candidates.clear();
 92 | 		candidates.add(url);
 93 | 	}
 94 | 
 95 | 	public void clearList() {
 96 | 		candidates.clear();
 97 | 	}
 98 | 
 99 | 	public void addCandidate(String can) {
100 | 		candidates.add(can);
101 | 	}
102 | 
103 | 	public double getDifference() {
104 | 		return difference;
105 | 	}
106 | 
107 | 	public void setDifference(double difference) {
108 | 		this.difference = difference;
109 | 	}
110 | 
111 | 	public int getPosition() {
112 | 		return position;
113 | 	}
114 | 
115 | 	public void setPosition(int position) {
116 | 		this.position = position;
117 | 	}
118 | 
119 | 	@Override
120 | 	public int compareTo(SurfaceForm o) {
121 | 		if (this.difference < o.getDifference()) {
122 | 			return 1;
123 | 		} else if (this.difference > o.getDifference()) {
124 | 			return -1;
125 | 		} else {
126 | 			return 0;
127 | 		}
128 | 	}
129 | 
130 | 	public Object clone() {
131 | 		ArrayList<String> newCandidates = new ArrayList<String>();
132 | 		for (String s : candidates) {
133 | 			newCandidates.add(s);
134 | 		}
135 | 
136 | 		SurfaceForm n = new SurfaceForm(new String(this.surfaceForm), new String(this.context), newCandidates,
137 | 				this.queryNr, this.position);
138 | 		n.setACandidate(this.isACandidate);
139 | 		n.setInitial(this.initial);
140 | 		n.setMatchesInitial(this.matchesInitial);
141 | 		n.setRelevant(this.isRelevant);
142 | 		return n;
143 | 	}
144 | }


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/rules/UnambiguousToAmbiguousRule.java:
--------------------------------------------------------------------------------
  1 | package doser.entitydisambiguation.algorithms.rules;
  2 | 
  3 | import java.io.IOException;
  4 | import java.util.HashMap;
  5 | import java.util.LinkedList;
  6 | import java.util.List;
  7 | import java.util.Map;
  8 | 
  9 | import org.apache.lucene.document.Document;
 10 | import org.apache.lucene.index.Term;
 11 | import org.apache.lucene.search.IndexSearcher;
 12 | import org.apache.lucene.search.Query;
 13 | import org.apache.lucene.search.ScoreDoc;
 14 | import org.apache.lucene.search.TopDocs;
 15 | 
 16 | import doser.entitydisambiguation.algorithms.SurfaceForm;
 17 | import doser.entitydisambiguation.knowledgebases.EntityCentricKBDBpedia;
 18 | import doser.lucene.query.TermQuery;
 19 | 
/**
 * If a surface form is unambiguous and other surface forms are abbreviations
 * of it but are themselves ambiguous, those ambiguous surface forms are
 * resolved immediately.
 * 
 * Example: surface form 1: Burlington Industries Inc (unambiguous); surface
 * form 2: Burlington (ambiguous) ...
 * 
 * 
 * @author quh
 *
 */
 32 | 
 33 | class UnambiguousToAmbiguousRule extends AbstractRule {
 34 | 
 35 | 	UnambiguousToAmbiguousRule(EntityCentricKBDBpedia eckb) {
 36 | 		super(eckb);
 37 | 	}
 38 | 
 39 | 	@Override
 40 | 	public boolean applyRule(List<SurfaceForm> rep) {
 41 | 		List<SurfaceForm> unambiguous = new LinkedList<SurfaceForm>();
 42 | 		for (SurfaceForm c : rep) {
 43 | 			if (c.getCandidates().size() == 1) {
 44 | 				String candidate = c.getCandidates().get(0);
 45 | 				String type = queryType(candidate);
 46 | 				if (type.equalsIgnoreCase("Person") || type.equalsIgnoreCase("Organisation")) {
 47 | 					unambiguous.add(c);
 48 | 				}
 49 | 			}
 50 | 		}
 51 | 		for (SurfaceForm c : rep) {
 52 | 			if (c.getCandidates().size() > 1) {
 53 | 				HashMap<String, Integer> map = new HashMap<String, Integer>();
 54 | 				for (SurfaceForm un : unambiguous) {
 55 | 					String type = queryType(un.getCandidates().get(0));
 56 | 					if ((isSubString(un.getSurfaceForm(), c.getSurfaceForm())
 57 | 							&& c.getCandidates().contains(un.getCandidates().get(0))
 58 | 							&& un.getPosition() < c.getPosition())
 59 | 							|| (type.equalsIgnoreCase("Person") && isSubString(un.getSurfaceForm(), c.getSurfaceForm())
 60 | 									&& un.getPosition() < c.getPosition())) {
 61 | 						map.put(un.getCandidates().get(0), c.getPosition() - un.getPosition());
 62 | 						// c.setDisambiguatedEntity(un.getCandidates().get(0));
 63 | 					}
 64 | 				}
 65 | 				if (!map.isEmpty()) {
 66 | 					int distance = Integer.MAX_VALUE;
 67 | 					String can = "";
 68 | 					for (Map.Entry<String, Integer> entry : map.entrySet()) {
 69 | 						if (entry.getValue() < distance) {
 70 | 							distance = entry.getValue();
 71 | 							can = entry.getKey();
 72 | 						}
 73 | 					}
 74 | 					c.setDisambiguatedEntity(can);
 75 | 				}
 76 | 			}
 77 | 		}
 78 | 		return false;
 79 | 	}
 80 | 
 81 | 	private boolean isSubString(String s1, String s2) {
 82 | 		if (s1.toLowerCase().contains(s2.toLowerCase())) {
 83 | 			return true;
 84 | 		} else
 85 | 			return false;
 86 | 	}
 87 | 
 88 | 	private String queryType(String url) {
 89 | 		String type = "";
 90 | 		IndexSearcher searcher = eckb.getSearcher();
 91 | 		Query q = new TermQuery(new Term("Mainlink", url));
 92 | 		try {
 93 | 			TopDocs docs = searcher.search(q, 1);
 94 | 			ScoreDoc[] scoredocs = docs.scoreDocs;
 95 | 			if(scoredocs.length == 0) {
 96 | 				type = "Misc";
 97 | 			} else {
 98 | 				int nr = scoredocs[0].doc;
 99 | 				Document doc = searcher.getIndexReader().document(nr);
100 | 				type = doc.get("Type");
101 | 			}
102 | 		} catch (IOException e) {
103 | 			e.printStackTrace();
104 | 		}
105 | 		return type;
106 | 	}
107 | 
108 | }
109 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/rules/NoCandidatesExpansionRules.java:
--------------------------------------------------------------------------------
  1 | package doser.entitydisambiguation.algorithms.rules;
  2 | 
  3 | import java.io.IOException;
  4 | import java.util.ArrayList;
  5 | import java.util.LinkedList;
  6 | import java.util.List;
  7 | 
  8 | import org.apache.lucene.document.Document;
  9 | import org.apache.lucene.index.IndexReader;
 10 | import org.apache.lucene.search.IndexSearcher;
 11 | import org.apache.lucene.search.ScoreDoc;
 12 | import org.apache.lucene.search.TopDocs;
 13 | import org.apache.lucene.search.similarities.DefaultSimilarity;
 14 | 
 15 | import doser.entitydisambiguation.algorithms.SurfaceForm;
 16 | import doser.entitydisambiguation.knowledgebases.AbstractKnowledgeBase;
 17 | import doser.lucene.features.LuceneFeatures;
 18 | import doser.lucene.query.LearnToRankClause;
 19 | import doser.lucene.query.LearnToRankQuery;
 20 | import doser.tools.Inflector;
 21 | 
 22 | /**
 23 |  * Falls eine Surface Form keine Kandidaten hat, allerdings aus mindestens 3
 24 |  * Wörtern besteht, werden alle Wörter mit kleinergleich 3 Buchstaben entfernt
 25 |  * und erneut angefragt. Dies geschieht ebenfalls nach der Entfernung von
 26 |  * Sonderzeichen. Entsprechend werden die Kandidaten gesetzt.
 27 |  * 
 28 |  * @author quh
 29 |  */
 30 | 
 31 | class NoCandidatesExpansionRules extends AbstractRule {
 32 | 
 33 | 	NoCandidatesExpansionRules(AbstractKnowledgeBase eckb) {
 34 | 		super(eckb);
 35 | 	}
 36 | 
 37 | 	@Override
 38 | 	public boolean applyRule(List<SurfaceForm> rep) {
 39 | 		for (SurfaceForm c : rep) {
 40 | 			if (c.getCandidates().size() == 0) {
 41 | 				c.setCandidates(queryCandidates(c.getSurfaceForm()));
 42 | 			}
 43 | 		}
 44 | 		return false;
 45 | 	}
 46 | 
 47 | 	private ArrayList<String> queryCandidates(String surfaceForm) {
 48 | 		ArrayList<String> lst = new ArrayList<String>();
 49 | 		String[] splitter = surfaceForm.split(" ");
 50 | 		if (splitter.length > 2) {
 51 | 			StringBuilder builder = new StringBuilder();
 52 | 			for (int i = 0; i < splitter.length; i++) {
 53 | 				if (splitter[i].length() > 3) {
 54 | 					builder.append(splitter[i] + " ");
 55 | 
 56 | 				}
 57 | 			}
 58 | 			String builderstring = builder.toString();
 59 | 			if (builderstring.length() > 0) {
 60 | 				String newSf = builderstring.substring(0,
 61 | 						builderstring.length() - 1);
 62 | 				lst = queryLucene(surfaceForm);
 63 | 				if (lst.size() == 0) {
 64 | 					// Try again without special chars
 65 | 					newSf = newSf.replaceAll("[^a-zA-Z ]", "");
 66 | 					lst = queryLucene(newSf);
 67 | 					// If size is 0 anyway, still check Plural to singular
 68 | 					if (lst.size() == 0) {
 69 | 						String singular = Inflector.getInstance().singularize(
 70 | 								newSf);
 71 | 						if (!newSf.equalsIgnoreCase(singular)) {
 72 | 							// Try singular search
 73 | 							lst = queryCandidates(singular);
 74 | 						}
 75 | 					}
 76 | 				}
 77 | 			}
 78 | 		}
 79 | 		return lst;
 80 | 	}
 81 | 
 82 | 	private ArrayList<String> queryLucene(String surfaceForm) {
 83 | 		ArrayList<String> list = new ArrayList<String>();
 84 | 		final IndexSearcher searcher = eckb.getSearcher();
 85 | 		final IndexReader reader = searcher.getIndexReader();
 86 | 		LearnToRankQuery query = new LearnToRankQuery();
 87 | 		List<LearnToRankClause> features = new LinkedList<LearnToRankClause>();
 88 | 		DefaultSimilarity defaultSim = new DefaultSimilarity();
 89 | 		features.add(query.add(LuceneFeatures.queryLabelTerm(surfaceForm,
 90 | 				"UniqueLabel", defaultSim), "Feature1", true));
 91 | 		try {
 92 | 			final TopDocs top = searcher.search(query, 150);
 93 | 			final ScoreDoc[] score = top.scoreDocs;
 94 | 			if (score.length <= 5) {
 95 | 				for (int i = 0; i < score.length; ++i) {
 96 | 					final Document doc = reader.document(score[i].doc);
 97 | 					list.add(doc.get("Mainlink"));
 98 | 				}
 99 | 			}
100 | 		} catch (IOException e) {
101 | 			e.printStackTrace();
102 | 		}
103 | 		return list;
104 | 	}
105 | }
106 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/tools/NTToDbPediaUrlEncoding.java:
--------------------------------------------------------------------------------
  1 | package doser.tools;
  2 | 
  3 | import java.io.BufferedReader;
  4 | import java.io.File;
  5 | import java.io.FileNotFoundException;
  6 | import java.io.FileReader;
  7 | import java.io.FileWriter;
  8 | import java.io.IOException;
  9 | import java.io.UnsupportedEncodingException;
 10 | import java.io.Writer;
 11 | import java.net.URLEncoder;
 12 | 
 13 | import org.apache.commons.lang.StringEscapeUtils;
 14 | import org.apache.log4j.Logger;
 15 | 
 16 | public final class NTToDbPediaUrlEncoding {
 17 | 
 18 | 	private NTToDbPediaUrlEncoding() {
 19 | 		super();
 20 | 	}
 21 | 	
 22 | 	public static String dbpediaEncoding(final String url) {
 23 | 		final StringBuffer buffer = new StringBuffer();
 24 | 		for (int i = 0; i < url.length(); i++) {
 25 | 			final String str = String.valueOf(url.charAt(i));
 26 | 			if (str.equalsIgnoreCase("!")) {
 27 | 				buffer.append('!');
 28 | 			} else if (str.equalsIgnoreCase("$")) {
 29 | 				buffer.append('$');
 30 | 			} else if (str.equalsIgnoreCase("&")) {
 31 | 				buffer.append('&');
 32 | 			} else if (str.equalsIgnoreCase("'")) {
 33 | 				buffer.append('\'');
 34 | 			} else if (str.equalsIgnoreCase("(")) {
 35 | 				buffer.append('(');
 36 | 			} else if (str.equalsIgnoreCase(")")) {
 37 | 				buffer.append(')');
 38 | 			} else if (str.equalsIgnoreCase("*")) {
 39 | 				buffer.append('*');
 40 | 			} else if (str.equalsIgnoreCase("+")) {
 41 | 				buffer.append('+');
 42 | 			} else if (str.equalsIgnoreCase(",")) {
 43 | 				buffer.append(',');
 44 | 			} else if (str.equalsIgnoreCase("-")) {
 45 | 				buffer.append('-');
 46 | 			} else if (str.equalsIgnoreCase("/")) {
 47 | 				buffer.append('/');
 48 | 			} else if (str.equalsIgnoreCase(":")) {
 49 | 				buffer.append(':');
 50 | 			} else if (str.equalsIgnoreCase(";")) {
 51 | 				buffer.append(';');
 52 | 			} else if (str.equalsIgnoreCase("=")) {
 53 | 				buffer.append('=');
 54 | 			} else if (str.equalsIgnoreCase("@")) {
 55 | 				buffer.append('@');
 56 | 			} else if (str.equalsIgnoreCase("_")) {
 57 | 				buffer.append('_');
 58 | 			} else if (str.equalsIgnoreCase("~")) {
 59 | 				buffer.append('~');
 60 | 			} else {
 61 | 				try {
 62 | 					buffer.append(URLEncoder.encode(str, "UTF-8"));
 63 | 				} catch (final UnsupportedEncodingException e) {
 64 | 					Logger.getRootLogger().error(e.getStackTrace());
 65 | 				}
 66 | 			}
 67 | 		}
 68 | 		return buffer.toString();
 69 | 	}
 70 | 
 71 | 	public static void main(final String[] args) throws IOException {
 72 | 		final String fileInput = args[0];
 73 | 		final String fileOutput = args[1];
 74 | 		final File fileIn = new File(fileInput);
 75 | 		final File fileOut = new File(fileOutput);
 76 | 		final Writer writer = new FileWriter(fileOut);
 77 | 		BufferedReader reader = null;
 78 | 		try {
 79 | 			reader = new BufferedReader(new FileReader(fileIn));
 80 | 		} catch (final FileNotFoundException e) {
 81 | 			Logger.getRootLogger().error(e.getStackTrace());
 82 | 		}
 83 | 		String line = null;
 84 | 		while ((line = reader.readLine()) != null) {
 85 | 			line = line.replaceAll("[ ]+", " ");
 86 | 			final String splitter[] = line.split(" ");
 87 | 			final StringBuffer buffer = new StringBuffer();
 88 | 
 89 | 			// Subject
 90 | 			String url = splitter[0].substring(1, splitter[0].length() - 1);
 91 | 			String sLine = StringEscapeUtils.unescapeJava(url);
 92 | 			buffer.append("<" + dbpediaEncoding(sLine) + "> ");
 93 | 
 94 | 			// Predicate
 95 | 			buffer.append(splitter[1] + " ");
 96 | 
 97 | 			// Object
 98 | 			if (splitter[2].startsWith("<")) {
 99 | 				url = splitter[2].substring(1, splitter[2].length() - 1);
100 | 				sLine = StringEscapeUtils.unescapeJava(url);
101 | 				buffer.append("<" + dbpediaEncoding(sLine) + ">");
102 | 			} else {
103 | 				buffer.append(splitter[2]);
104 | 			}
105 | 			writer.write(buffer.toString());
106 | 			writer.write(System.getProperty("line.separator"));
107 | 			writer.flush();
108 | 		}
109 | 		writer.close();
110 | 		reader.close();
111 | 	}
112 | 
113 | }
114 | 


--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/general/Test.java:
--------------------------------------------------------------------------------
  1 | package doser.general;
  2 | 
  3 | import java.io.IOException;
  4 | import java.text.ParseException;
  5 | 
  6 | import org.apache.lucene.analysis.standard.StandardAnalyzer;
  7 | import org.apache.lucene.document.Document;
  8 | import org.apache.lucene.document.Field.Store;
  9 | import org.apache.lucene.document.StringField;
 10 | import org.apache.lucene.document.TextField;
 11 | import org.apache.lucene.index.DirectoryReader;
 12 | import org.apache.lucene.index.IndexReader;
 13 | import org.apache.lucene.index.IndexWriter;
 14 | import org.apache.lucene.index.IndexWriterConfig;
 15 | import org.apache.lucene.index.Term;
 16 | import org.apache.lucene.search.BooleanClause.Occur;
 17 | import org.apache.lucene.search.BooleanQuery;
 18 | import org.apache.lucene.search.IndexSearcher;
 19 | import org.apache.lucene.search.PhraseQuery;
 20 | import org.apache.lucene.search.Query;
 21 | import org.apache.lucene.search.ScoreDoc;
 22 | import org.apache.lucene.search.TermQuery;
 23 | import org.apache.lucene.search.TopScoreDocCollector;
 24 | import org.apache.lucene.search.spans.SpanNearQuery;
 25 | import org.apache.lucene.search.spans.SpanQuery;
 26 | import org.apache.lucene.search.spans.SpanTermQuery;
 27 | import org.apache.lucene.store.Directory;
 28 | import org.apache.lucene.store.RAMDirectory;
 29 | import org.apache.lucene.util.Version;
 30 | 
 31 | public class Test {
 32 | 	private IndexWriter writer;
 33 | 
 34 | 	public void lucene() throws IOException, ParseException {
 35 | 		// Build the index
 36 | 		StandardAnalyzer analyzer = new StandardAnalyzer();
 37 | 		Directory index = new RAMDirectory();
 38 | 		IndexWriterConfig config = new IndexWriterConfig(Version.LATEST,
 39 | 				analyzer);
 40 | 		this.writer = new IndexWriter(index, config);
 41 | 
 42 | 		// Add documents to the index
 43 | 		addDoc("Spring", new String[] { "Java", "JSP", "DBPEDIA_56testdoc" });
 44 | 		addDoc("Java", new String[] { "Oracle", "Annotation is cool too" });
 45 | 
 46 | 		writer.close();
 47 | 
 48 | 		// Search the index
 49 | 		IndexReader reader = DirectoryReader.open(index);
 50 | 		IndexSearcher searcher = new IndexSearcher(reader);
 51 | 
 52 | 		TermQuery q = new TermQuery(new Term("keyword", "DBPEDIA_56testdoc"));
 53 | 		// SpanQuery q = new SpanNearQuery(new SpanQuery[] {
 54 | 		// new SpanTermQuery(new Term("keyword", "too")),
 55 | 		// new SpanTermQuery(new Term("keyword", "cool"))},
 56 | 		// 3,
 57 | 		// true);
 58 | 
 59 | 		// String[] s = {"cool", "too"};
 60 | 		// for (int i = 0; i < s.length; i++) {
 61 | 		// q.add(new Term("keyword", s[i]));
 62 | 		// }
 63 | 
 64 | 		// q.add(new PhraseQuery(new Term("keyword", "Annotation is cool")),
 65 | 		// Occur.MUST);
 66 | 
 67 | 		System.out.println(q.toString());
 68 | 
 69 | 		int hitsPerPage = 10;
 70 | 		TopScoreDocCollector collector = TopScoreDocCollector.create(
 71 | 				hitsPerPage, true);
 72 | 
 73 | 		searcher.search(q, collector);
 74 | 
 75 | 		ScoreDoc[] hits = collector.topDocs().scoreDocs;
 76 | 
 77 | 		for (int i = 0; i < hits.length; ++i) {
 78 | 			int docId = hits[i].doc;
 79 | 			Document doc = searcher.doc(docId);
 80 | 			System.out.println(hits[i].toString());
 81 | 			System.out.println((i + 1) + ". \t" + doc.get("title"));
 82 | 		}
 83 | 
 84 | 		reader.close();
 85 | 	}
 86 | 
 87 | 	private void addDoc(String title, String[] keywords) throws IOException {
 88 | 		// Create new document
 89 | 		Document doc = new Document();
 90 | 
 91 | 		// Add title
 92 | 		doc.add(new TextField("title", title, Store.YES));
 93 | 
 94 | 		// Add keywords
 95 | 		for (int i = 0; i < keywords.length; i++) {
 96 | 			doc.add(new StringField("keyword", keywords[i], Store.YES));
 97 | 		}
 98 | 
 99 | 		// Add document to index
100 | 		this.writer.addDocument(doc);
101 | 	}
102 | 
103 | 	public static void main(String[] args) {
104 | 		Test test = new Test();
105 | 		try {
106 | 			test.lucene();
107 | 		} catch (IOException e) {
108 | 			// TODO Auto-generated catch block
109 | 			e.printStackTrace();
110 | 		} catch (ParseException e) {
111 | 			// TODO Auto-generated catch block
112 | 			e.printStackTrace();
113 | 		}
114 | 	}
115 | }
116 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/test/java/doser/test/breakdetection/BreakDetection.java:
--------------------------------------------------------------------------------
  1 | package doser.test.breakdetection;
  2 | 
/**
 * Placeholder for a manual break-detection / clustering experiment.
 * 
 * The class currently has no members: the only content is the disabled
 * {@code main} method below, kept as a comment.
 * 
 * NOTE(review): the disabled similarity matrix is not symmetric in row 4
 * ([4][2] = 1 and [4][4] = 0.5, while [2][4] = 0.5; [0][4] = 0.4 vs
 * [4][0] = 0.5) - looks like a typo if a symmetric matrix with a unit
 * diagonal is intended; confirm before re-enabling.
 */
public class BreakDetection {

//	@SuppressWarnings("deprecation")
//	public static void main(String[] args) {
//
//		List<WebSite> shotList = new LinkedList<WebSite>();
//		WebSite shot1 = new WebSite();
//		shot1.setName("1");
//		shot1.setText("Text1");
//		shot1.setObjectId(0);
//		WebSite shot2 = new WebSite();
//		shot2.setObjectId(1);
//		shot2.setName("2");
//		shot2.setText("Text2");
//		WebSite shot3 = new WebSite();
//		shot3.setObjectId(2);
//		shot3.setName("3");
//		shot3.setText("Text3");
//		WebSite shot4 = new WebSite();
//		shot4.setObjectId(3);
//		shot4.setName("4");
//		shot4.setText("Text4");
//		WebSite shot5 = new WebSite();
//		shot5.setObjectId(4);
//		shot5.setName("5");
//		shot5.setText("Text5");
//		WebSite shot6 = new WebSite();
//		shot6.setObjectId(5);
//		shot6.setName("6");
//		shot6.setText("Text6");
//
//		shotList.add(shot1);
//		shotList.add(shot2);
//		shotList.add(shot3);
//		shotList.add(shot4);
//		shotList.add(shot5);
//		shotList.add(shot6);
//		Decomposition<WebSite> decomp = new Decomposition<WebSite>(shotList);
//		
//		double[][] similarityMatrix = new double[6][6];
//		similarityMatrix[0][0] = 1;
//		similarityMatrix[0][1] = 0.5;
//		similarityMatrix[0][2] = 0.5;
//		similarityMatrix[0][3] = 0.8;
//		similarityMatrix[0][4] = 0.4;
//		similarityMatrix[0][5] = 0.8;
//
//		similarityMatrix[1][0] = 0.5;
//		similarityMatrix[1][1] = 1.0;
//		similarityMatrix[1][2] = 0.5;
//		similarityMatrix[1][3] = 0.5;
//		similarityMatrix[1][4] = 0.5;
//		similarityMatrix[1][5] = 0.5;
//
//		similarityMatrix[2][0] = 0.5;
//		similarityMatrix[2][1] = 0.5;
//		similarityMatrix[2][2] = 1;
//		similarityMatrix[2][3] = 0.5;
//		similarityMatrix[2][4] = 0.5;
//		similarityMatrix[2][5] = 0.5;
//
//		similarityMatrix[3][0] = 0.8;
//		similarityMatrix[3][1] = 0.5;
//		similarityMatrix[3][2] = 0.5;
//		similarityMatrix[3][3] = 1;
//		similarityMatrix[3][4] = 0.5;
//		similarityMatrix[3][5] = 0.8;
//
//		similarityMatrix[4][0] = 0.5;
//		similarityMatrix[4][1] = 0.5;
//		similarityMatrix[4][2] = 1;
//		similarityMatrix[4][3] = 0.5;
//		similarityMatrix[4][4] = 0.5;
//		similarityMatrix[4][5] = 0.5;
//
//		similarityMatrix[5][0] = 0.8;
//		similarityMatrix[5][1] = 0.5;
//		similarityMatrix[5][2] = 0.5;
//		similarityMatrix[5][3] = 0.8;
//		similarityMatrix[5][4] = 0.5;
//		similarityMatrix[5][5] = 1;
//
//		decomp.setSimilarityMatrix(similarityMatrix);
//		decomp.start();
//		try {
//			decomp.join();
//		} catch (InterruptedException e) {
//			e.printStackTrace();
//		}
//		
//		ConcurrentNCutAlgorithm<WebSite> nCutAlgorithm = new ConcurrentNCutAlgorithm<WebSite>(decomp.getMainCluster());
//
//		List<Cluster<WebSite>> clusterList = nCutAlgorithm.startClustering();
//		for (Cluster<WebSite> cluster : clusterList) {
//			List<WebSite> list = cluster.getObjectList();
//			for (WebSite site : list) {
//				System.out.println("Site id: " + site.getObjectId());
//			}
//		}
//		
//		
////		decomp.createUndirectedWeightedGraph();
//		// Third Step: VideoDecomposition
////		List<Cluster<WebSite>> clusterLst = doVideoDecomposition(decomp, shotList);
//
////		// Step Four: Temporal Graph Creation
////		TemporalGraph<Cluster<WebSite>> tempGraph = doTemporalGraphGeneration(clusterLst);
////
////		// Step Five: Shortest Path
////		List<Cluster> shortestPath = doShortestPath(tempGraph);
////
////		// Step Six: Scene Extraction
////		doSceneExtraction(tempGraph, shortestPath);
//	}

	
}
120 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/CandidatePruning.java:
--------------------------------------------------------------------------------
  1 | package doser.entitydisambiguation.algorithms.collective;
  2 | 
  3 | import java.util.ArrayList;
  4 | import java.util.HashMap;
  5 | import java.util.HashSet;
  6 | import java.util.LinkedList;
  7 | import java.util.List;
  8 | import java.util.Map;
  9 | import java.util.Set;
 10 | 
 11 | import doser.entitydisambiguation.algorithms.SurfaceForm;
 12 | import doser.entitydisambiguation.knowledgebases.AbstractEntityCentricKBGeneral;
 13 | import doser.general.HelpfulMethods;
 14 | 
 15 | public class CandidatePruning {
 16 | 
 17 | 	private static final int NUMBEROFADDITIONALW2VENTITIES = 6;
 18 | 
 19 | 	private static final int ENTITYTHRESHOLD = 6;
 20 | 
 21 | 	private static final int MINIMUMSURFACEFORMS = 3;
 22 | 
 23 | 	private static final float WORD2VECTHRESHOLD = 1.60f;
 24 | 
 25 | 	private AbstractEntityCentricKBGeneral eckb;
 26 | 
 27 | 	public CandidatePruning(AbstractEntityCentricKBGeneral eckb) {
 28 | 		super();
 29 | 		this.eckb = eckb;
 30 | 	}
 31 | 
 32 | 	public void prune(List<SurfaceForm> rep) {
 33 | 		List<SurfaceForm> unambiguous = new LinkedList<SurfaceForm>();
 34 | 		for (SurfaceForm c : rep) {
 35 | 			if (c.getCandidates().size() == 1) {
 36 | 				unambiguous.add(c);
 37 | 			}
 38 | 		}
 39 | 
 40 | 		List<String> list = new LinkedList<String>();
 41 | 		for (SurfaceForm sf : rep) {
 42 | 			if (rep.size() > 1 && sf.getCandidates().size() == 1 && sf.isInitial()) {
 43 | 				list.add(sf.getCandidates().get(0));
 44 | 			}
 45 | 		}
 46 | 
 47 | 		for (SurfaceForm c : rep) {
 48 | 			List<String> candidates = c.getCandidates();
 49 | 			if (candidates.size() > ENTITYTHRESHOLD) {
 50 | 				Set<String> prunedCandidates = new HashSet<String>();
 51 | 
 52 | 				// Sense Prior
 53 | 				Map<String, Integer> map = new HashMap<String, Integer>();
 54 | 				for (String candidate : candidates) {
 55 | 					map.put(candidate, eckb.getFeatureDefinition().getOccurrences(c.getSurfaceForm(), candidate));
 56 | 				}
 57 | 				@SuppressWarnings("deprecation")
 58 | 				List<Map.Entry<String, Integer>> l = HelpfulMethods.sortByValue(map);
 59 | 				for (int i = 0; i < ENTITYTHRESHOLD; ++i) {
 60 | 					prunedCandidates.add(l.get(i).getKey());
 61 | 					// System.out.println("SensePrior ADd: "+l.get(i).getKey()+"
 62 | 					// "+l.get(i).getValue());
 63 | 				}
 64 | 
 65 | 				// Doc2Vec ContextSimilarity
 66 | 				Map<String, Float> map_doc2vec = new HashMap<String, Float>();
 67 | 				for (String candidate : candidates) {
 68 | 
 69 | 					map_doc2vec.put(candidate, eckb.getDoc2VecSimilarity(c.getSurfaceForm(), c.getContext(), candidate));
 70 | 				}
 71 | 				@SuppressWarnings("deprecation")
 72 | 				List<Map.Entry<String, Float>> l_doc2vec = HelpfulMethods.sortByValue(map_doc2vec);
 73 | 				int added = 0;
 74 | 				int counter = 0;
 75 | 				while (counter < l_doc2vec.size() && added < 4) {
 76 | 					String key = l_doc2vec.get(counter).getKey();
 77 | 					if (!prunedCandidates.contains(key)) {
 78 | 						prunedCandidates.add(key);
 79 | 						added++;
 80 | 					}
 81 | 					counter++;
 82 | 				}
 83 | //				for (int i = 0; i < ENTITYTHRESHOLD; ++i) {
 84 | //					prunedCandidates.add(l_doc2vec.get(i).getKey());
 85 | //				}
 86 | 
 87 | 				// Check for very relevant Candidates via given Word2Vec
 88 | 				// similarities
 89 | 				if (list.size() >= MINIMUMSURFACEFORMS) {
 90 | 					Set<String> w2vFormatStrings = new HashSet<String>();
 91 | 					for (String can : candidates) {
 92 | 						if (!prunedCandidates.contains(can)) {
 93 | 							String query = this.eckb.generateWord2VecFormatString(list, can);
 94 | 							w2vFormatStrings.add(query);
 95 | 						}
 96 | 					}
 97 | 
 98 | 					Map<String, Float> similarityMap = this.eckb.getWord2VecSimilarities(w2vFormatStrings);
 99 | 					Map<String, Integer> occmap = new HashMap<String, Integer>();
100 | 					for (String can : candidates) {
101 | 						if (!prunedCandidates.contains(can)) {
102 | 							String query = this.eckb.generateWord2VecFormatString(list, can);
103 | 							float val = similarityMap.get(query);
104 | 							if (val > WORD2VECTHRESHOLD) {
105 | 								occmap.put(can, eckb.getFeatureDefinition().getOccurrences(c.getSurfaceForm(), can));
106 | //								prunedCandidates.add(can);
107 | 							}
108 | 						}
109 | 					}
110 | 					@SuppressWarnings("deprecation")
111 | 					List<Map.Entry<String, Integer>> sortedl = HelpfulMethods.sortByValue(occmap);
112 | 					for (int i = 0; i < NUMBEROFADDITIONALW2VENTITIES; ++i) {
113 | 						if (i < sortedl.size()) {
114 | 							prunedCandidates.add(sortedl.get(i).getKey());
115 | 						}
116 | 					}
117 | 				}
118 | 
119 | 				c.setCandidates(new ArrayList<String>(prunedCandidates));
120 | 			}
121 | 		}
122 | 	}
123 | }
124 | 


--------------------------------------------------------------------------------
/doser-dis-extensions/src/main/java/doser/lucene/query/ConjunctionScorer.java:
--------------------------------------------------------------------------------
  1 | package doser.lucene.query;
  2 | 
  3 | import java.io.IOException;
  4 | import java.util.ArrayList;
  5 | import java.util.Collection;
  6 | import java.util.Comparator;
  7 | 
  8 | import org.apache.lucene.search.Scorer;
  9 | import org.apache.lucene.search.Weight;
 10 | import org.apache.lucene.util.ArrayUtil;
 11 | 
 12 | /** Scorer for conjunctions, sets of queries, all of which are required. */
 13 | class ConjunctionScorer extends Scorer {
 14 | 	static final class DocsAndFreqs {
 15 | 		final long cost;
 16 | 		int doc = -1;
 17 | 		final Scorer scorer;
 18 | 
 19 | 		DocsAndFreqs(final Scorer scorer) {
 20 | 			this.scorer = scorer;
 21 | 			cost = scorer.cost();
 22 | 		}
 23 | 	}
 24 | 
 25 | 	private final LearnToRankClause[] clauses;
 26 | 	private final float coord;
 27 | 	private final int docBase;
 28 | 	protected final DocsAndFreqs[] docsAndFreqs;
 29 | 	protected int lastDoc = -1;
 30 | 
 31 | 	private final DocsAndFreqs lead;
 32 | 
 33 | 	ConjunctionScorer(final Weight weight, final Scorer[] scorers,
 34 | 			final float coord, final LearnToRankClause[] ltrclauses,
 35 | 			final int docBase) {
 36 | 		super(weight);
 37 | 		this.coord = coord;
 38 | 		this.docBase = docBase;
 39 | 		clauses = ltrclauses;
 40 | 		docsAndFreqs = new DocsAndFreqs[scorers.length];
 41 | 		for (int i = 0; i < scorers.length; i++) {
 42 | 			docsAndFreqs[i] = new DocsAndFreqs(scorers[i]);
 43 | 		}
 44 | 		// Sort the array the first time to allow the least frequent DocsEnum to
 45 | 		// lead the matching.
 46 | 		ArrayUtil.timSort(docsAndFreqs, new Comparator<DocsAndFreqs>() {
 47 | 			@Override
 48 | 			public int compare(final DocsAndFreqs obj1, final DocsAndFreqs obj2) {
 49 | 				return Long.signum(obj1.cost - obj2.cost);
 50 | 			}
 51 | 		});
 52 | 
 53 | 		lead = docsAndFreqs[0]; // least frequent DocsEnum leads the
 54 | 								// intersection
 55 | 	}
 56 | 
 57 | 	ConjunctionScorer(final Weight weight, final Scorer[] scorers,
 58 | 			final LearnToRankClause[] ltrclauses, final int docBase) {
 59 | 		this(weight, scorers, 1f, ltrclauses, docBase);
 60 | 	}
 61 | 
 62 | 	@Override
 63 | 	public int advance(final int target) throws IOException {
 64 | 		lead.doc = lead.scorer.advance(target);
 65 | 		return lastDoc = doNext(lead.doc);
 66 | 	}
 67 | 
 68 | 	@Override
 69 | 	public long cost() {
 70 | 		return lead.scorer.cost();
 71 | 	}
 72 | 
 73 | 	@Override
 74 | 	public int docID() {
 75 | 		return lastDoc;
 76 | 	}
 77 | 
 78 | 	private int doNext(int doc) throws IOException { // NOPMD by quh on 28.02.14
 79 | 														// 10:45
 80 | 		for (;;) {
 81 | 			// doc may already be NO_MORE_DOCS here, but we don't check
 82 | 			// explicitly
 83 | 			// since all scorers should advance to NO_MORE_DOCS, match, then
 84 | 			// return that value.
 85 | 			advanceHead: for (;;) {
 86 | 				for (int i = 1; i < docsAndFreqs.length; i++) {
 87 | 					// invariant: docsAndFreqs[i].doc <= doc at this point.
 88 | 
 89 | 					// docsAndFreqs[i].doc may already be equal to doc if we
 90 | 					// "broke advanceHead"
 91 | 					// on the previous iteration and the advance on the lead
 92 | 					// scorer exactly matched.
 93 | 					if (docsAndFreqs[i].doc < doc) {
 94 | 						docsAndFreqs[i].doc = docsAndFreqs[i].scorer
 95 | 								.advance(doc);
 96 | 
 97 | 						if (docsAndFreqs[i].doc > doc) {
 98 | 							// DocsEnum beyond the current doc - break and
 99 | 							// advance lead to the new highest doc.
100 | 							doc = docsAndFreqs[i].doc;
101 | 							break advanceHead;
102 | 						}
103 | 					}
104 | 				}
105 | 				// success - all DocsEnums are on the same doc
106 | 				return doc;
107 | 			}
108 | 			// advance head for next iteration
109 | 			doc = lead.doc = lead.scorer.advance(doc);
110 | 		}
111 | 	}
112 | 
113 | 	@Override
114 | 	public int freq() {
115 | 		return docsAndFreqs.length;
116 | 	}
117 | 
118 | 	@Override
119 | 	public Collection<ChildScorer> getChildren() {
120 | 		final ArrayList<ChildScorer> children = new ArrayList<ChildScorer>(
121 | 				docsAndFreqs.length);
122 | 		for (final DocsAndFreqs docs : docsAndFreqs) {
123 | 			children.add(new ChildScorer(docs.scorer, "MUST"));
124 | 		}
125 | 		return children;
126 | 	}
127 | 
128 | 	@Override
129 | 	public int nextDoc() throws IOException {
130 | 		lead.doc = lead.scorer.nextDoc();
131 | 		return lastDoc = doNext(lead.doc);
132 | 	}
133 | 
134 | 	@Override
135 | 	public float score() throws IOException {
136 | 		// TODO: sum into a double and cast to float if we ever send required
137 | 		// clauses to BS1
138 | 		float sum = 0.0f;
139 | 		for (int i = 0; i < docsAndFreqs.length; i++) {
140 | 			final float val = docsAndFreqs[i].scorer.score()
141 | 					* clauses[i].getWeight();
142 | 			sum += val;
143 | 			clauses[i].addFeatureValue(docBase, lastDoc, val);
144 | 		}
145 | 		return sum * coord;
146 | 	}
147 | }


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/Vertex.java:
--------------------------------------------------------------------------------
  1 | package doser.entitydisambiguation.algorithms.collective;
  2 | 
  3 | import java.util.ArrayList;
  4 | import java.util.HashSet;
  5 | import java.util.List;
  6 | import java.util.Map;
  7 | import java.util.Set;
  8 | 
  9 | public class Vertex implements Comparable<Vertex> {
 10 | 	private List<String> uris;
 11 | 	private int entityQuery;
 12 | 	private double score;
 13 | 	private boolean isCandidate;
 14 | 	private String description;
 15 | 	private String text;
 16 | 	private String context;
 17 | 	private double occurrences;
 18 | 
 19 | 	private Set<Edge> outgoingEdges;
 20 | 
 21 | 	private double sumOutGoing;
 22 | 
 23 | 	public Vertex() {
 24 | 		super();
 25 | 		this.uris = new ArrayList<String>();
 26 | 		this.outgoingEdges = new HashSet<Edge>();
 27 | 		this.entityQuery = -1;
 28 | 		this.isCandidate = false;
 29 | 		this.sumOutGoing = 0;
 30 | 		this.text = "";
 31 | 		this.context = "";
 32 | 	}
 33 | 
 34 | 	public void addOutGoingEdge(Edge e) {
 35 | 		outgoingEdges.add(e);
 36 | 		this.sumOutGoing += e.getTransition();
 37 | 		for(Edge out : outgoingEdges) {
 38 | 			out.setProbability(out.getTransition() / sumOutGoing);
 39 | 		}
 40 | 	}
 41 | 
 42 | 	public void removeAllOutgoingEdges() {
 43 | 		this.outgoingEdges.clear();
 44 | 	}
 45 | 
 46 | 	public Edge removeOutgoingEdge(Vertex v, Map<Edge, Number> edgeWeight) {
 47 | 		Edge toRemove = null;
 48 | 		for (Edge e : outgoingEdges) {
 49 | 			if (e.getTarget().equals(v)) {
 50 | 				toRemove = e;
 51 | 				break;
 52 | 			}
 53 | 		}
 54 | 		if (toRemove != null) {
 55 | 			outgoingEdges.remove(toRemove);
 56 | 			sumOutGoing -= toRemove.getTransition();
 57 | 		}
 58 | 		
 59 | 		// Update Transition Probability
 60 | 		for(Edge out : outgoingEdges) {
 61 | 			out.setProbability(out.getTransition() / sumOutGoing);
 62 | 			edgeWeight.put(out, out.getProbability());
 63 | 		}
 64 | 		
 65 | 		return toRemove;
 66 | 	}
 67 | 
 68 | 	public String getContext() {
 69 | 		return context;
 70 | 	}
 71 | 
 72 | 	public void setContext(String context) {
 73 | 		this.context = context;
 74 | 	}
 75 | 
 76 | 	public double getSumOutGoingEdges() {
 77 | 		return sumOutGoing;
 78 | 	}
 79 | 
 80 | 	public Set<Edge> getOutgoingEdges() {
 81 | 		return this.outgoingEdges;
 82 | 	}
 83 | 
 84 | 	public List<String> getUris() {
 85 | 		return uris;
 86 | 	}
 87 | 
 88 | 	public void addUri(String uri) {
 89 | 		this.uris.add(uri);
 90 | 	}
 91 | 
 92 | 	public boolean isCandidate() {
 93 | 		return isCandidate;
 94 | 	}
 95 | 
 96 | 	public void setCandidate(boolean isCandidate) {
 97 | 		this.isCandidate = isCandidate;
 98 | 	}
 99 | 
100 | 	public int getEntityQuery() {
101 | 		return entityQuery;
102 | 	}
103 | 
104 | 	public void setEntityQuery(int entityQuery) {
105 | 		this.entityQuery = entityQuery;
106 | 	}
107 | 
108 | 	public void setGraphValue(double val) {
109 | 		this.score = val;
110 | 	}
111 | 
112 | 	public double getScore() {
113 | 		return this.score;
114 | 	}
115 | 
116 | 	public void setScore(double score) {
117 | 		this.score = score;
118 | 	}
119 | 
120 | 	public String getDescription() {
121 | 		return description;
122 | 	}
123 | 
124 | 	void setDescription(String description) {
125 | 		this.description = description;
126 | 	}
127 | 
128 | 	public String getText() {
129 | 		return text;
130 | 	}
131 | 
132 | 	public void setText(String text) {
133 | 		this.text = text;
134 | 	}
135 | 
136 | 	public double getOccurrences() {
137 | 		return occurrences;
138 | 	}
139 | 
140 | 	public void setOccurrences(int occurrences) {
141 | 		this.occurrences = Math.log10(occurrences + 1);
142 | 	}
143 | 
144 | 	@Override
145 | 	public boolean equals(Object obj) {
146 | 		Vertex comp = (Vertex) obj;
147 | 		boolean isEqual = true;
148 | 		if (this.uris.size() != comp.getUris().size()
149 | 				|| this.entityQuery != comp.getEntityQuery()) {
150 | 			return false;
151 | 		}
152 | 		for (int i = 0; i < uris.size(); ++i) {
153 | 			if (!uris.get(i).equalsIgnoreCase(comp.getUris().get(i))) {
154 | 				isEqual = false;
155 | 				break;
156 | 			}
157 | 		}
158 | 		return isEqual;
159 | 	}
160 | 
161 | 	@Override
162 | 	public int hashCode() {
163 | 		return (generateUriHash(this.uris) + ((Integer) this.getEntityQuery())
164 | 				.hashCode());
165 | 	}
166 | 
167 | 	private int generateUriHash(List<String> uris) {
168 | 		int hash = 0;
169 | 		for (String uri : uris) {
170 | 			hash += uri.hashCode();
171 | 		}
172 | 		return hash;
173 | 	}
174 | 
175 | 	/**
176 | 	 * The return values are switched to provide a descending order when using
177 | 	 * Collections.sort(), which generally provides an ascending sort order.
178 | 	 * 
179 | 	 */
180 | 	@Override
181 | 	public int compareTo(Vertex o) {
182 | 		if (this.getOccurrences() < o.getOccurrences()) {
183 | 			return 1;
184 | 		} else if (this.getOccurrences() > o.getOccurrences()) {
185 | 			return 1;
186 | 		} else {
187 | 			return 0;
188 | 		}
189 | 	}
190 | }


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/general/CollectiveDisambiguationGeneralEntities.java:
--------------------------------------------------------------------------------
  1 | package doser.entitydisambiguation.algorithms.collective.general;
  2 | 
  3 | import java.io.IOException;
  4 | import java.util.ArrayList;
  5 | import java.util.Arrays;
  6 | import java.util.LinkedList;
  7 | import java.util.List;
  8 | 
  9 | import org.apache.lucene.document.Document;
 10 | import org.apache.lucene.index.IndexReader;
 11 | import org.apache.lucene.index.Term;
 12 | import org.apache.lucene.search.IndexSearcher;
 13 | import org.apache.lucene.search.Query;
 14 | import org.apache.lucene.search.ScoreDoc;
 15 | import org.apache.lucene.search.TopDocs;
 16 | import org.slf4j.Logger;
 17 | import org.slf4j.LoggerFactory;
 18 | 
 19 | import doser.entitydisambiguation.algorithms.AbstractDisambiguationAlgorithm;
 20 | import doser.entitydisambiguation.algorithms.IllegalDisambiguationAlgorithmInputException;
 21 | import doser.entitydisambiguation.algorithms.SurfaceForm;
 22 | import doser.entitydisambiguation.backend.AbstractDisambiguationTask;
 23 | import doser.entitydisambiguation.backend.DisambiguationTaskCollective;
 24 | import doser.entitydisambiguation.dpo.EntityDisambiguationDPO;
 25 | import doser.entitydisambiguation.dpo.Response;
 26 | import doser.entitydisambiguation.knowledgebases.EntityCentricKnowledgeBase;
 27 | import doser.lucene.query.TermQuery;
 28 | import doser.entitydisambiguation.knowledgebases.AbstractKnowledgeBase;
 29 | import doser.entitydisambiguation.knowledgebases.AbstractEntityCentricKBGeneral;
 30 | 
 31 | public class CollectiveDisambiguationGeneralEntities extends AbstractDisambiguationAlgorithm {
 32 | 
 33 | 	private final static Logger logger = LoggerFactory.getLogger(CollectiveDisambiguationGeneralEntities.class);
 34 | 	
 35 | 	private AbstractEntityCentricKBGeneral eckb;
 36 | 	
 37 | 	private DisambiguationTaskCollective task;
 38 | 	
 39 | 	@Override
 40 | 	protected boolean checkAndSetInputParameter(AbstractDisambiguationTask task) {
 41 | 		AbstractKnowledgeBase kb = task.getKb();
 42 | 		if (!(task instanceof DisambiguationTaskCollective)) {
 43 | 			return false;
 44 | 		}
 45 | 		
 46 | 		this.eckb = (AbstractEntityCentricKBGeneral) kb;
 47 | 		this.task = (DisambiguationTaskCollective) task;
 48 | 		return true;
 49 | 	}
 50 | 
 51 | 	@Override
 52 | 	protected void processAlgorithm() throws IllegalDisambiguationAlgorithmInputException {
 53 | //		AdditionalCandidateQuery aq = new AdditionalCandidateQuery(eckb);
 54 | 		List<EntityDisambiguationDPO> entityList = task.getEntityToDisambiguate();
 55 | 		Response[] responseArray = new Response[entityList.size()];
 56 | 
 57 | 		List<SurfaceForm> collectiveRep = new LinkedList<SurfaceForm>();
 58 | 		for (int i = 0; i < entityList.size(); i++) {
 59 | 			EntityDisambiguationDPO dpo = entityList.get(i);
 60 | 			// Dieser Fix sollte irgendwo anders passieren. TODO Auslagern
 61 | 			dpo.setSelectedText(dpo.getSelectedText().replaceAll("’", "'"));
 62 | 			Query query = createQuery(dpo.getSelectedText(), eckb);
 63 | 			final IndexSearcher searcher = eckb.getSearcher();
 64 | 			final IndexReader reader = searcher.getIndexReader();
 65 | 			try {
 66 | 				final TopDocs top = searcher.search(query, task.getReturnNr());
 67 | 				final ScoreDoc[] score = top.scoreDocs;
 68 | 				if (dpo.getSelectedText().equalsIgnoreCase("") || dpo.getSelectedText() == null) {
 69 | 					ArrayList<String> l = new ArrayList<String>();
 70 | 					l.add("");
 71 | 					SurfaceForm col = new SurfaceForm(dpo.getSelectedText(), dpo.getContext(), l, i,
 72 | 							dpo.getStartPosition());
 73 | 					collectiveRep.add(col);
 74 | 				} else if (score.length == 1) {
 75 | 					final Document doc = reader.document(score[0].doc);
 76 | 					ArrayList<String> l = new ArrayList<String>();
 77 | 					l.add(doc.get("Mainlink"));
 78 | 					SurfaceForm col = new SurfaceForm(dpo.getSelectedText(), dpo.getContext(), l, i,
 79 | 							dpo.getStartPosition());
 80 | 					col.setInitial(true);
 81 | 					collectiveRep.add(col);
 82 | 
 83 | 				} else if (score.length > 1) {
 84 | 					ArrayList<String> l = new ArrayList<String>();
 85 | 					for (int j = 0; j < score.length; j++) {
 86 | 						final Document doc = reader.document(score[j].doc);
 87 | 						l.add(doc.get("Mainlink"));
 88 | 					}
 89 | 					SurfaceForm col = new SurfaceForm(dpo.getSelectedText(), dpo.getContext(), l, i,
 90 | 							dpo.getStartPosition());
 91 | 					collectiveRep.add(col);
 92 | 
 93 | 				} else {
 94 | //					SurfaceForm sf = aq.checkAdditionalSurfaceForms(dpo, i);
 95 | //					collectiveRep.add(sf);
 96 | 				}
 97 | 
 98 | 			} catch (final IOException e) {
 99 | 				logger.error("JsonException in "+CollectiveDisambiguationGeneralEntities.class.getName(), e);
100 | 			}
101 | 		}
102 | 
103 | 		CollectiveContextDriverGeneral solver = new CollectiveContextDriverGeneral(responseArray, collectiveRep, eckb);
104 | 		solver.solve();
105 | 
106 | 		solver.generateResult();
107 | 		List<Response> res = Arrays.asList(responseArray);
108 | 		task.setResponse(res);
109 | 
110 | 		eckb.release();
111 | 	}
112 | 
113 | 	@Override
114 | 	protected boolean preDisambiguation() {
115 | 		return true;
116 | 	}
117 | 	
118 | 	private Query createQuery(String sf, EntityCentricKnowledgeBase kb) {
119 | 		String surfaceform = sf.toLowerCase();
120 | 		TermQuery query = new TermQuery(new Term("UniqueLabel", surfaceform));
121 | 
122 | 		return query;
123 | 	}
124 | }
125 | 


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/rules/ContextRule.java:
--------------------------------------------------------------------------------
  1 | package doser.entitydisambiguation.algorithms.rules;
  2 | 
  3 | import java.io.IOException;
  4 | import java.util.HashSet;
  5 | import java.util.LinkedList;
  6 | import java.util.List;
  7 | import java.util.Map;
  8 | import java.util.Set;
  9 | 
 10 | import org.apache.lucene.document.Document;
 11 | import org.apache.lucene.index.Term;
 12 | import org.apache.lucene.search.IndexSearcher;
 13 | import org.apache.lucene.search.Query;
 14 | import org.apache.lucene.search.ScoreDoc;
 15 | import org.apache.lucene.search.TopDocs;
 16 | 
 17 | import doser.entitydisambiguation.algorithms.SurfaceForm;
 18 | import doser.entitydisambiguation.knowledgebases.EntityCentricKBDBpedia;
 19 | import doser.lucene.query.TermQuery;
 20 | 
 21 | class ContextRule extends AbstractRule {
 22 | 
 23 | 	private static final int MINDISAMBIGUATEDSURFACEFORMS = 2;
 24 | 
 25 | 	private static final int MINIMUMSURFACEFORMS = 10;
 26 | 
 27 | 	private static final float SIMILARITYTHRESHOLD = 1.57f;
 28 | 	private static final float SIMILARITYTHRESHOLDMISC = 1.53f;
 29 | 
 30 | 	private EntityCentricKBDBpedia eckb;
 31 | 	
 32 | 	ContextRule(EntityCentricKBDBpedia eckb) {
 33 | 		super(eckb);
 34 | 		this.eckb = eckb;
 35 | 	}
 36 | 
 37 | 	@Override
 38 | 	public boolean applyRule(List<SurfaceForm> rep) {
 39 | 		if (rep.size() > MINIMUMSURFACEFORMS) {
 40 | 			List<String> list = new LinkedList<String>();
 41 | 			for (SurfaceForm sf : rep) {
 42 | 				if (rep.size() > 1 && sf.getCandidates().size() == 1 && sf.isInitial()) {
 43 | 					list.add(sf.getCandidates().get(0));
 44 | 				}
 45 | 			}
 46 | 			if (list.size() >= MINDISAMBIGUATEDSURFACEFORMS) {
 47 | 				Set<String> w2vFormatStrings = new HashSet<String>();
 48 | 				for (SurfaceForm sf : rep) {
 49 | 					if (rep.size() > 1 && sf.getCandidates().size() > 1) {
 50 | 						List<String> l = sf.getCandidates();
 51 | 						List<String> bestCandidate = new LinkedList<String>();
 52 | 						Set<String> levenshteinAdded = new HashSet<String>();
 53 | 						for (String s : l) {
 54 | 							String query = this.eckb.generateWord2VecFormatString(list, s);
 55 | 							w2vFormatStrings.add(query);
 56 | 							Map<String, Float> similarityMap = this.eckb.getWord2VecSimilarities(w2vFormatStrings);
 57 | 							float simValue = similarityMap.get(query);
 58 | 							// Check for Appropriate entities
 59 | 							String candidateWithoutUrl = s.replaceAll("http://dbpedia.org/resource/", "").toLowerCase();
 60 | 							if (levenshteinDistance(candidateWithoutUrl, sf.getSurfaceForm().toLowerCase()) <= 2) {
 61 | 								System.out.println("LEVENSHTEIN DISTANCE ENTITY: " + s);
 62 | 							}
 63 | 							if (simValue > SIMILARITYTHRESHOLD
 64 | 									|| (queryType(s).equalsIgnoreCase("Misc") && simValue > SIMILARITYTHRESHOLDMISC)) {
 65 | 								bestCandidate.add(s);
 66 | 							} else if (levenshteinDistance(candidateWithoutUrl,
 67 | 									sf.getSurfaceForm().toLowerCase()) <= 2) {
 68 | 								bestCandidate.add(s);
 69 | 								levenshteinAdded.add(s);
 70 | 							}
 71 | 						}
 72 | 						// Disambiguate and assign entity
 73 | 						if (!bestCandidate.isEmpty()) {
 74 | 							boolean notOnlyLevenshtein = false;
 75 | 							for (String s : bestCandidate) {
 76 | 								if (!levenshteinAdded.contains(s)) {
 77 | 									notOnlyLevenshtein = true;
 78 | 								}
 79 | 							}
 80 | 							if (notOnlyLevenshtein) {
 81 | 								sf.setCandidates(bestCandidate);
 82 | 								System.out.println("Es bleibt übrig SurfaceForm: " + sf.getSurfaceForm() + "   +"
 83 | 										+ bestCandidate.toString());
 84 | 							}
 85 | 						}
 86 | 					}
 87 | 				}
 88 | 			}
 89 | 		}
 90 | 		return false;
 91 | 	}
 92 | 
 93 | 	private String queryType(String url) {
 94 | 		String type = "";
 95 | 		IndexSearcher searcher = eckb.getSearcher();
 96 | 		Query q = new TermQuery(new Term("Mainlink", url));
 97 | 		try {
 98 | 			TopDocs docs = searcher.search(q, 1);
 99 | 			ScoreDoc[] scoredocs = docs.scoreDocs;
100 | 			if (scoredocs.length == 0) {
101 | 				type = "Misc";
102 | 			} else {
103 | 				int nr = scoredocs[0].doc;
104 | 				Document doc = searcher.getIndexReader().document(nr);
105 | 				type = doc.get("Type");
106 | 			}
107 | 		} catch (IOException e) {
108 | 			e.printStackTrace();
109 | 		}
110 | 		return type;
111 | 	}
112 | 
113 | 	int levenshteinDistance(CharSequence lhs, CharSequence rhs) {
114 | 		int len0 = lhs.length() + 1;
115 | 		int len1 = rhs.length() + 1;
116 | 
117 | 		// the array of distances
118 | 		int[] cost = new int[len0];
119 | 		int[] newcost = new int[len0];
120 | 
121 | 		// initial cost of skipping prefix in String s0
122 | 		for (int i = 0; i < len0; i++)
123 | 			cost[i] = i;
124 | 
125 | 		// dynamically computing the array of distances
126 | 
127 | 		// transformation cost for each letter in s1
128 | 		for (int j = 1; j < len1; j++) {
129 | 			// initial cost of skipping prefix in String s1
130 | 			newcost[0] = j;
131 | 
132 | 			// transformation cost for each letter in s0
133 | 			for (int i = 1; i < len0; i++) {
134 | 				// matching current letters in both strings
135 | 				int match = (lhs.charAt(i - 1) == rhs.charAt(j - 1)) ? 0 : 1;
136 | 
137 | 				// computing cost for each transformation
138 | 				int cost_replace = cost[i - 1] + match;
139 | 				int cost_insert = cost[i] + 1;
140 | 				int cost_delete = newcost[i - 1] + 1;
141 | 
142 | 				// keep minimum cost
143 | 				newcost[i] = Math.min(Math.min(cost_insert, cost_delete), cost_replace);
144 | 			}
145 | 
146 | 			// swap cost/newcost arrays
147 | 			int[] swap = cost;
148 | 			cost = newcost;
149 | 			newcost = swap;
150 | 		}
151 | 
152 | 		// the distance is the cost for transforming all letters in both strings
153 | 		return cost[len0 - 1];
154 | 	}
155 | }
156 | 


--------------------------------------------------------------------------------
/doser-dis-disambiguationserver/src/main/java/doser/server/actions/disambiguation/DisambiguationService.java:
--------------------------------------------------------------------------------
  1 | package doser.server.actions.disambiguation;
  2 | 
  3 | import java.util.LinkedList;
  4 | import java.util.List;
  5 | 
  6 | import org.springframework.stereotype.Controller;
  7 | import org.springframework.web.bind.annotation.RequestBody;
  8 | import org.springframework.web.bind.annotation.RequestMapping;
  9 | import org.springframework.web.bind.annotation.RequestMethod;
 10 | import org.springframework.web.bind.annotation.ResponseBody;
 11 | 
 12 | import doser.entitydisambiguation.backend.DisambiguationMainService;
 13 | import doser.entitydisambiguation.backend.AbstractDisambiguationTask;
 14 | import doser.entitydisambiguation.backend.DisambiguationTaskCollective;
 15 | import doser.entitydisambiguation.backend.DisambiguationTaskSingle;
 16 | import doser.entitydisambiguation.dpo.DisambiguationRequest;
 17 | import doser.entitydisambiguation.dpo.DisambiguationResponse;
 18 | import doser.entitydisambiguation.dpo.EntityDisambiguationDPO;
 19 | import doser.entitydisambiguation.dpo.Response;
 20 | import doser.entitydisambiguation.properties.Properties;
 21 | 
 22 | @Controller
 23 | @RequestMapping("/disambiguation")
 24 | public class DisambiguationService {
 25 | 
 26 | 	public DisambiguationService() {
 27 | 		super();
 28 | 	}
 29 | 
 30 | 	/**
 31 | 	 * Testing
 32 | 	 * 
 33 | 	 * @param request
 34 | 	 * @return
 35 | 	 */
 36 | 	@RequestMapping(value = "/disambiguateWithoutCategories-single", method = RequestMethod.POST, headers = "Accept=application/json")
 37 | 	public @ResponseBody DisambiguationResponse annotateSingle(@RequestBody final DisambiguationRequest request) {
 38 | 		DisambiguationResponse annotationResponse = disambiguateSingle(request);
 39 | 		return annotationResponse;
 40 | 	}
 41 | 
 42 | 	@RequestMapping(value = "/disambiguationWithoutCategories-collective", method = RequestMethod.POST, headers = "Accept=application/json")
 43 | 	public @ResponseBody DisambiguationResponse annotateCollectiveWithoutCategories(
 44 | 			@RequestBody final DisambiguationRequest request) {
 45 | 		final DisambiguationResponse response = new DisambiguationResponse();
 46 | 		final DisambiguationMainService mainService = DisambiguationMainService.getInstance();
 47 | 		final List<EntityDisambiguationDPO> listToDis = request.getSurfaceFormsToDisambiguate();
 48 | 
 49 | 		if (mainService != null) {
 50 | 			final List<AbstractDisambiguationTask> tasks = new LinkedList<AbstractDisambiguationTask>();
 51 | 			DisambiguationTaskCollective collectiveTask = new DisambiguationTaskCollective(listToDis,
 52 | 					request.getMainTopic());
 53 | 			collectiveTask.setKbIdentifier("default", "EntityCentric");
 54 | 			collectiveTask.setReturnNr(1000);
 55 | 			tasks.add(collectiveTask);
 56 | 			mainService.disambiguate(tasks);
 57 | 
 58 | 			List<Response> responses = collectiveTask.getResponse();
 59 | 			response.setTasks(responses);
 60 | 			response.setDocumentUri(request.getDocumentUri());
 61 | 		}
 62 | 		return response;
 63 | 	}
 64 | 
 65 | 	@RequestMapping(value = "/disambiguationWithoutCategoriesBiomed-collective", method = RequestMethod.POST, headers = "Accept=application/json")
 66 | 	public @ResponseBody DisambiguationResponse annotateCollectiveWithoutCategoriesBiomed(
 67 | 			@RequestBody final DisambiguationRequest request) {
 68 | 		final DisambiguationResponse response = new DisambiguationResponse();
 69 | 		final DisambiguationMainService mainService = DisambiguationMainService.getInstance();
 70 | 		final List<EntityDisambiguationDPO> listToDis = request.getSurfaceFormsToDisambiguate();
 71 | 
 72 | 		if (mainService != null) {
 73 | 			final List<AbstractDisambiguationTask> tasks = new LinkedList<AbstractDisambiguationTask>();
 74 | 			DisambiguationTaskCollective collectiveTask = new DisambiguationTaskCollective(listToDis,
 75 | 					request.getMainTopic());
 76 | 			collectiveTask.setKbIdentifier("biomed", "EntityCentric");
 77 | 			collectiveTask.setReturnNr(1000);
 78 | 			tasks.add(collectiveTask);
 79 | 			mainService.disambiguate(tasks);
 80 | 
 81 | 			List<Response> responses = collectiveTask.getResponse();
 82 | 			response.setTasks(responses);
 83 | 			response.setDocumentUri(request.getDocumentUri());
 84 | 		}
 85 | 		return response;
 86 | 	}
 87 | 
 88 | 	private DisambiguationResponse disambiguateSingle(DisambiguationRequest request) {
 89 | 		final DisambiguationResponse response = new DisambiguationResponse();
 90 | 		final List<EntityDisambiguationDPO> listToDis = request.getSurfaceFormsToDisambiguate();
 91 | 		List<Response> responseList = new LinkedList<Response>();
 92 | 		response.setDocumentUri(request.getDocumentUri());
 93 | 		final List<AbstractDisambiguationTask> tasks = new LinkedList<AbstractDisambiguationTask>();
 94 | 		final DisambiguationMainService mainService = DisambiguationMainService.getInstance();
 95 | 		if (mainService != null) {
 96 | 			int docsToReturn = 0;
 97 | 			if (request.getDocsToReturn() == null) {
 98 | 				docsToReturn = Properties.getInstance().getDisambiguationResultSize();
 99 | 			} else {
100 | 				docsToReturn = request.getDocsToReturn();
101 | 			}
102 | 			for (int i = 0; i < listToDis.size(); i++) {
103 | 				EntityDisambiguationDPO dpo = listToDis.get(i);
104 | 				DisambiguationTaskSingle task = new DisambiguationTaskSingle(dpo);
105 | 				task.setReturnNr(docsToReturn);
106 | 				task.setKbIdentifier(listToDis.get(i).getKbversion(), listToDis.get(i).getSetting());
107 | 				// Bugfix! Selected text may not be null. Should be ""
108 | 				// String instead;
109 | 				if (dpo.getSelectedText() != null) {
110 | 					tasks.add(task);
111 | 				}
112 | 			}
113 | 			mainService.disambiguate(tasks);
114 | 		}
115 | 
116 | 		for (AbstractDisambiguationTask task : tasks) {
117 | 			responseList.add(task.getResponse().get(0));
118 | 		}
119 | 		response.setTasks(responseList);
120 | 		return response;
121 | 	}
122 | }


--------------------------------------------------------------------------------
/doser-dis-core/src/main/java/doser/entitydisambiguation/algorithms/collective/dbpedia/Word2VecDisambiguator.java:
--------------------------------------------------------------------------------
  1 | package doser.entitydisambiguation.algorithms.collective.dbpedia;
  2 | 
  3 | import java.util.ArrayList;
  4 | import java.util.BitSet;
  5 | import java.util.Collection;
  6 | import java.util.Collections;
  7 | import java.util.HashMap;
  8 | import java.util.List;
  9 | 
 10 | import org.apache.commons.collections15.Factory;
 11 | import org.apache.commons.collections15.functors.MapTransformer;
 12 | import org.apache.commons.math.stat.descriptive.SummaryStatistics;
 13 | 
 14 | import doser.entitydisambiguation.algorithms.SurfaceForm;
 15 | import doser.entitydisambiguation.algorithms.collective.AbstractWord2VecPageRank;
 16 | import doser.entitydisambiguation.algorithms.collective.Edge;
 17 | import doser.entitydisambiguation.algorithms.collective.Vertex;
 18 | import doser.entitydisambiguation.knowledgebases.EntityCentricKBDBpedia;
 19 | import edu.uci.ics.jung.algorithms.scoring.PageRankWithPriors;
 20 | import edu.uci.ics.jung.graph.DirectedSparseMultigraph;
 21 | 
 22 | class Word2VecDisambiguator extends AbstractWord2VecPageRank {
 23 | 
 24 | //	private static final int MAXIMUMCANDIDATESPERSF = 8;
 25 | 
 26 | 	private List<SurfaceForm> origList;
 27 | 	
 28 | 	private boolean disambiguate;
 29 | 	
 30 | 	private int maximumcandidatespersf;
 31 | 	
 32 | 	private int iterations;
 33 | 	
 34 | 
 35 | 	Word2VecDisambiguator(EntityCentricKBDBpedia eckb,
 36 | 			List<SurfaceForm> rep, boolean disambiguate, int maximumcandidatespersf, int iterations) {
 37 | 		super(eckb, rep);
 38 | 		this.origList = new ArrayList<SurfaceForm>();
 39 | 		this.disambiguate = disambiguate;
 40 | 		this.maximumcandidatespersf = maximumcandidatespersf;
 41 | 		this.iterations = iterations;
 42 | 	}
 43 | 
 44 | 	@Override
 45 | 	public void setup() {
 46 | 		this.graph = new DirectedSparseMultigraph<Vertex, Edge>();
 47 | 		this.edgeWeights = new HashMap<Edge, Number>();
 48 | 		this.edgeFactory = new Factory<Integer>() {
 49 | 			int i = 0;
 50 | 
 51 | 			public Integer create() {
 52 | 				return i++;
 53 | 			}
 54 | 		};
 55 | 
 56 | 		for (SurfaceForm sf : repList) {
 57 | 			SurfaceForm clone = (SurfaceForm) sf.clone();
 58 | 			this.origList.add(clone);
 59 | 		}
 60 | 
 61 | 		this.disambiguatedSurfaceForms = new BitSet(repList.size());
 62 | 		for (int i = 0; i < repList.size(); i++) {
 63 | 			if (repList.get(i).getCandidates().size() <= 1) {
 64 | 				this.disambiguatedSurfaceForms.set(i);
 65 | 			}
 66 | 		}
 67 | 		buildMainGraph();
 68 | 	}
 69 | 
 70 | 	@Override
 71 | 	protected PageRankWithPriors<Vertex, Edge> performPageRank() {
 72 | 		PageRankWithPriors<Vertex, Edge> pr = new PageRankWithPriors<Vertex, Edge>(
 73 | 				graph, MapTransformer.getInstance(edgeWeights),
 74 | 				getRootPrior(graph.getVertices()), 0.09);
 75 | 		pr.setMaxIterations(iterations);
 76 | 		pr.evaluate();
 77 | 		return pr;
 78 | 	}
 79 | 
 80 | 	@Override
 81 | 	public boolean analyzeResults(PageRankWithPriors<Vertex, Edge> pr) {
 82 | 		boolean disambiguationStop = true;
 83 | 		Collection<Vertex> vertexCol = graph.getVertices();
 84 | 		for (int i = 0; i < repList.size(); i++) {
 85 | 			if (!disambiguatedSurfaceForms.get(i) && repList.get(i).isRelevant()) {
 86 | 				int qryNr = repList.get(i).getQueryNr();
 87 | 				double maxScore = 0;
 88 | 				SummaryStatistics stats = new SummaryStatistics();
 89 | 				String tempSolution = "";
 90 | 				List<Candidate> scores = new ArrayList<Candidate>();
 91 | 				for (Vertex v : vertexCol) {
 92 | 					if (v.getEntityQuery() == qryNr && v.isCandidate()) {
 93 | 						scores.add(new Candidate(v.getUris().get(0), pr
 94 | 								.getVertexScore(v)));
 95 | 						double score = Math.abs(pr.getVertexScore(v));
 96 | 						stats.addValue(score);
 97 | 						if (score > maxScore) {
 98 | 							tempSolution = v.getUris().get(0);
 99 | 							maxScore = score;
100 | 						}
101 | 					}
102 | 				}
103 | 				SurfaceForm rep = repList.get(i);
104 | 				SurfaceForm clone = origList.get(i);
105 | 				Collections.sort(scores, Collections.reverseOrder());
106 | 				double secondMax = scores.get(1).score;
107 | 				
108 | 				List<String> newCandidates = new ArrayList<String>();
109 | 				for(int j = 0; j < maximumcandidatespersf; j++) {
110 | 					if(scores.size() > j) {
111 | 						newCandidates.add(scores.get(j).can);
112 | 					} else {
113 | 						break;
114 | 					}
115 | 				}
116 | 
117 | 				if (!Double.isInfinite(maxScore)) {
118 | 					double avg = stats.getMean();
119 | 					double threshold = computeThreshold(avg, maxScore);
120 | 					if (secondMax < threshold && disambiguate) {
121 | 						updateGraph(rep.getCandidates(), tempSolution,
122 | 								rep.getQueryNr());
123 | 						rep.setDisambiguatedEntity(tempSolution);
124 | 						clone.setDisambiguatedEntity(tempSolution);
125 | 						disambiguatedSurfaceForms.set(i);
126 | 						disambiguationStop = false;
127 | 						break;
128 | 					} else {
129 | 						clone.setCandidates(newCandidates);
130 | 					}
131 | 				}
132 | 			}
133 | 		}
134 | 		return disambiguationStop;
135 | 	}
136 | 
137 | 	/**
138 | 	 * Threshold Computation // IMPORTANT DISAMBIGUATION PARAMETER
139 | 	 * 
140 | 	 * @param avg
141 | 	 * @param highest
142 | 	 * @return
143 | 	 */
144 | 	private double computeThreshold(double avg, double highest) {
145 | 		double diff = highest - avg;
146 | 		double min = diff * 0.5;
147 | 		return highest - min;
148 | 	}
149 | 
150 | 	@Override
151 | 	public List<SurfaceForm> getRepresentation() {
152 | 		return this.origList;
153 | 	}
154 | 
155 | 	class Candidate implements Comparable<Candidate> {
156 | 		private double score;
157 | 		private String can;
158 | 
159 | 		Candidate(String can, double score) {
160 | 			super();
161 | 			this.score = score;
162 | 			this.can = can;
163 | 		}
164 | 
165 | 		@Override
166 | 		public int compareTo(Candidate o) {
167 | 			if (score < o.score) {
168 | 				return -1;
169 | 			} else if (score > o.score) {
170 | 				return 1;
171 | 			} else {
172 | 				return 0;
173 | 			}
174 | 		}
175 | 	}
176 | }
177 | 


--------------------------------------------------------------------------------