surroundingMentionNames;
15 |
16 | private int id = -1;
17 |
18 |
19 | public static final String NO_MATCHING_ENTITY = "--NME--";
20 |
21 | /**
22 | * Use this field to represent the mention-entity similarity computed with
23 | * some method (not the score stored in the DB). This field will not be set
24 | * in the constructor. We set it later on, when we compute the similarity
25 | */
26 | private double mentionEntitySimilarity;
27 |
/**
 * Creates an entity with the given name and internal numeric id.
 * The mention-entity similarity starts at -1.0, meaning "not yet computed"
 * (see the field comment above).
 */
public Entity(String name, int id) {
  this.name = name;
  this.mentionEntitySimilarity = -1.0;
  this.id = id;
}
33 |
/** @return the entity name identifier. */
public String getName() {
  return name;
}
37 |
38 | public String toString() {
39 | return name + " (" + id + ")";
40 | }
41 |
/**
 * Renders the entity name as an HTML-escaped table-cell fragment
 * (escaping delegated to Char.toHTML).
 * NOTE(review): method name breaks camelCase ("toHtmlString"); renaming
 * would require updating all callers.
 */
public String tohtmlString() {
  return " | | " + Char.toHTML(name) + " | | | | ";
}
45 |
/** @return internal numeric id of the entity; -1 if never assigned. */
public int getId() {
  return id;
}
49 |
/**
 * @return the runtime-computed mention-entity similarity;
 *         -1.0 until {@link #setMentionEntitySimilarity(double)} is called.
 */
public double getMentionEntitySimilarity() {
  return this.mentionEntitySimilarity;
}
53 |
/** Stores the similarity computed for this entity's mention (not the DB score). */
public void setMentionEntitySimilarity(double mes) {
  this.mentionEntitySimilarity = mes;
}
57 |
/**
 * Orders entities alphabetically by name. Consistent with equals() and
 * hashCode(), which also use only the name; the id is ignored.
 */
public int compareTo(Entity e) {
  return name.compareTo(e.getName());
}
61 |
62 | public boolean equals(Object o) {
63 | if (o instanceof Entity) {
64 | Entity e = (Entity) o;
65 | return name.equals(e.getName());
66 | } else {
67 | return false;
68 | }
69 | }
70 |
/** Hash on the name only, consistent with equals(). */
public int hashCode() {
  return name.hashCode();
}
74 |
/**
 * @return true if this entity is a "no matching entity" placeholder
 *         (delegates the name check to Entities.isNMEName).
 */
public boolean isNMEentity() {
  return Entities.isNMEName(name);
}
78 |
79 | public String getNMEnormalizedName() {
80 | String normName = name.replace("-"+NO_MATCHING_ENTITY, "").replace(' ', '_');
81 | return normName;
82 | }
83 |
/**
 * @return names of mentions occurring around this entity's mention
 *         (element type undeclared here -- presumably String; confirm).
 */
public List getSurroundingMentionNames() {
  return surroundingMentionNames;
}
87 |
/** Stores the mention names surrounding this entity's mention. */
public void setSurroundingMentionNames(List surroundingMentionNames) {
  this.surroundingMentionNames = surroundingMentionNames;
}
91 | }
92 |
--------------------------------------------------------------------------------
/src/mpi/aida/data/Keyphrases.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.data;
2 |
3 | import gnu.trove.map.hash.TIntDoubleHashMap;
4 | import gnu.trove.map.hash.TIntObjectHashMap;
5 |
6 | /**
7 | * Holds all the keyphrase data describing a set of entities.
8 | *
9 | *
10 | */
11 | public class Keyphrases {
12 |
13 | private TIntObjectHashMap entityKeyphrases;
14 | private TIntObjectHashMap keyphraseTokens;
15 | private TIntObjectHashMap entity2keyphrase2mi;
16 | private TIntObjectHashMap entity2keyword2mi;
17 |
18 | public void setEntityKeyphrases(TIntObjectHashMap entityKeyphrases) {
19 | this.entityKeyphrases = entityKeyphrases;
20 | }
21 |
22 | public void setKeyphraseTokens(TIntObjectHashMap keyphraseTokens) {
23 | this.keyphraseTokens = keyphraseTokens;
24 | }
25 |
26 | public void setEntityKeyphraseWeights(
27 | TIntObjectHashMap entity2keyphrase2mi) {
28 | this.entity2keyphrase2mi = entity2keyphrase2mi;
29 | }
30 |
31 | public void setEntityKeywordWeights(
32 | TIntObjectHashMap entity2keyword2mi) {
33 | this.entity2keyword2mi = entity2keyword2mi;
34 | }
35 |
36 | public TIntObjectHashMap getEntityKeyphrases() {
37 | return entityKeyphrases;
38 | }
39 |
40 | public TIntObjectHashMap getKeyphraseTokens() {
41 | return keyphraseTokens;
42 | }
43 |
44 | public TIntObjectHashMap getEntityKeywordWeights() {
45 | return entity2keyword2mi;
46 | }
47 |
48 | public TIntObjectHashMap getEntityKeyphraseWeights() {
49 | return entity2keyphrase2mi;
50 | }
51 | }
52 |
--------------------------------------------------------------------------------
/src/mpi/aida/data/Mention.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.data;
2 |
3 | import java.io.Serializable;
4 |
5 | public class Mention implements Serializable, Comparable {
6 |
7 | private static final long serialVersionUID = 3177945435296705498L;
8 |
9 | private String mention;
10 |
11 | /** Starting token offset of the mention. */
12 | private int startToken;
13 |
14 | /** Ending token offset of the mention (including this token). */
15 | private int endToken;
16 |
17 | private int startStanford;
18 |
19 | private int endStanford;
20 |
21 | private int sentenceId;
22 |
23 | private String groundTruthEntity = null;
24 |
25 | private double disambiguationConfidence;
26 |
27 | // Character offset
28 | private int charOffset, charLength;
29 |
30 | private Entities candidateEntities;
31 |
32 | private int id = -1;
33 |
34 | public Mention() {
35 | }
36 |
37 | public Mention(String mention, int startToken, int endToken, int startStanford, int endStanford, int sentenceId) {
38 | this.startToken = startToken;
39 | this.endToken = endToken;
40 | this.startStanford = startStanford;
41 | this.endStanford = endStanford;
42 | this.mention = mention;
43 | this.sentenceId = sentenceId;
44 | }
45 |
46 | public String getMention() {
47 | return mention;
48 | }
49 |
50 | public int getStartToken() {
51 | return startToken;
52 | }
53 |
54 | public int getEndToken() {
55 | return endToken;
56 | }
57 |
58 | public int getStartStanford() {
59 | return startStanford;
60 | }
61 |
62 | public int getEndStanford() {
63 | return endStanford;
64 | }
65 |
66 | public int getSentenceId() {
67 | return sentenceId;
68 | }
69 |
70 | public void setSentenceId(int sentenceId) {
71 | this.sentenceId = sentenceId;
72 | }
73 |
74 | public void addCandidateEntity(Entity entity) {
75 | candidateEntities.add(entity);
76 | }
77 |
78 | public Entities getCandidateEntities() {
79 | return candidateEntities;
80 | }
81 |
82 | public void setCandidateEntities(Entities candidateEntities) {
83 | this.candidateEntities = candidateEntities;
84 | }
85 |
86 | public String toString() {
87 | return mention + ", From:" + startToken + "/" + startStanford + ", To:" + endToken + "/" + endStanford + ", Offset: " + charOffset + ", Length: " + charLength;
88 | }
89 |
90 | public void setStartToken(int start) {
91 | this.startToken = start;
92 | }
93 |
94 | public void setEndToken(int end) {
95 | this.endToken = end;
96 | }
97 |
98 | public int getCharOffset() {
99 | return this.charOffset;
100 | }
101 |
102 | public int getCharLength() {
103 | return this.charLength;
104 | }
105 |
106 | public void setCharOffset(int offset) {
107 | this.charOffset = offset;
108 |
109 | }
110 |
111 | public void setCharLength(int length) {
112 | this.charLength = length;
113 | }
114 |
115 | public void setMention(String mention) {
116 | this.mention = mention;
117 | }
118 |
119 | @Override
120 | public boolean equals(Object obj) {
121 | if (obj instanceof Mention) {
122 | Mention m = (Mention) obj;
123 |
124 | return m.getMention().equals(getMention()) && m.getCharOffset() == charOffset;
125 | } else {
126 | return false;
127 | }
128 | }
129 |
130 | @Override
131 | public int hashCode() {
132 | return mention.hashCode() + charOffset;
133 | }
134 |
135 | @Override
136 | public int compareTo(Mention mention) {
137 | return this.charOffset - mention.charOffset;
138 | }
139 |
140 | public void setGroundTruthResult(String result) {
141 | this.groundTruthEntity = result;
142 | }
143 |
144 | public String getGroundTruthResult() {
145 | return groundTruthEntity;
146 | }
147 |
148 | public void setDisambiguationConfidence(double confidence) {
149 | disambiguationConfidence = confidence;
150 | }
151 |
152 | public double getDisambiguationConfidence() {
153 | return disambiguationConfidence;
154 | }
155 |
156 | public int getId() {
157 | return id;
158 | }
159 |
160 | public void setId(int id) {
161 | this.id = id;
162 | }
163 |
164 | public void setStartStanford(int startStanford) {
165 | this.startStanford = startStanford;
166 | }
167 |
168 | public void setEndStanford(int endStanford) {
169 | this.endStanford = endStanford;
170 | }
171 |
172 | public String getIdentifiedRepresentation() {
173 | return mention + ":::" + charOffset;
174 | }
175 | }
176 |
--------------------------------------------------------------------------------
/src/mpi/aida/data/Mentions.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.data;
2 |
3 | import java.io.Serializable;
4 | import java.util.ArrayList;
5 | import java.util.Collections;
6 | import java.util.HashMap;
7 | import java.util.LinkedList;
8 | import java.util.List;
9 |
10 | public class Mentions implements Serializable {
11 |
12 | private static final long serialVersionUID = -383105468450056989L;
13 |
14 | private List mentions = null;
15 |
16 | private HashMap subStrings = null;
17 |
18 | /**
19 | * The expected types for entities to which those mentions will be disambiguated
20 | */
21 | private List entitiesTypes = null;
22 |
23 | public Mentions() {
24 | mentions = new LinkedList();
25 | }
26 |
27 | public boolean containsOffset(int offset) {
28 | for (Mention mention : mentions) {
29 | if (mention.getCharOffset() == offset) {
30 | return true;
31 | }
32 | }
33 | return false;
34 | }
35 |
36 | public Mention getMentionForOffset(int offset) {
37 | for (Mention mention : mentions) {
38 | if (mention.getCharOffset() == offset) {
39 | return mention;
40 | }
41 | }
42 | return null;
43 | }
44 |
45 | public void addMention(Mention mention) {
46 | mentions.add(mention);
47 | }
48 |
49 | public List getMentions() {
50 | return mentions;
51 | }
52 |
53 | public ArrayList getMentionTokenStanfordIndices()
54 | {
55 | ArrayList mentionTokenIndices = new ArrayList();
56 | // there's just one
57 | for (Mention mention : mentions)
58 | {
59 | for (int i=mention.getStartStanford();i<=mention.getEndStanford();i++)
60 | mentionTokenIndices.add(i);
61 | }
62 | return mentionTokenIndices;
63 | }
64 |
65 | public int getMentionTokenSentenceIndex()
66 | {
67 | // there's just one
68 | return mentions.get(0).getSentenceId();
69 | }
70 |
71 | public boolean remove(Mention mention) {
72 | return mentions.remove(mention);
73 | }
74 |
75 | public String toString() {
76 | StringBuffer sb = new StringBuffer(200);
77 | for (int i = 0; i < mentions.size(); i++) {
78 | sb.append(mentions.get(i).toString()).append('\n');
79 | }
80 | return sb.toString();
81 | }
82 |
83 | public void setSubstring(HashMap subStrings) {
84 | this.subStrings = subStrings;
85 | }
86 |
87 | public HashMap getSubstrings() {
88 | return subStrings;
89 | }
90 |
91 | public void sortMentions() {
92 | Collections.sort(mentions);
93 | }
94 |
95 | public List getEntitiesTypes() {
96 | return entitiesTypes;
97 | }
98 |
99 | public void setEntitiesTypes(List entitiesTypes) {
100 | this.entitiesTypes = entitiesTypes;
101 | }
102 | }
103 |
--------------------------------------------------------------------------------
/src/mpi/aida/data/PreparedInput.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.data;
2 |
3 | import mpi.tokenizer.data.Tokens;
4 |
5 | public class PreparedInput {
6 |
7 | private String docId;
8 |
9 | private Tokens tokens;
10 |
11 | /** Used by the local similarity methods in the disambiguation. It holds
12 | * the document tokens both as strings and converted to word ids. */
13 | private Context context;
14 |
15 | private Mentions mentions;
16 |
17 | public PreparedInput(String docId) {
18 | this.docId = docId;
19 | }
20 |
21 | public PreparedInput(String docId, Tokens tokens, Mentions mentions) {
22 | this.docId = docId;
23 | this.tokens = tokens;
24 | this.mentions = mentions;
25 | context = createContextFromTokens(tokens);
26 | }
27 |
28 | public Tokens getTokens() {
29 | return tokens;
30 | }
31 |
32 | public void setTokens(Tokens tokens) {
33 | this.tokens = tokens;
34 | context = createContextFromTokens(tokens);
35 | }
36 |
37 | public Mentions getMentions() {
38 | return mentions;
39 | }
40 |
41 | public void setMentions(Mentions mentions) {
42 | this.mentions = mentions;
43 | }
44 |
45 | public Context getContext() {
46 | return context;
47 | }
48 |
49 | private Context createContextFromTokens(Tokens t) {
50 | return new Context(t);
51 | }
52 |
53 | public String getDocId() {
54 | return docId;
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/src/mpi/aida/data/ResultEntity.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.data;
2 |
3 | import java.io.Serializable;
4 | import java.text.NumberFormat;
5 | import java.util.ArrayList;
6 | import java.util.List;
7 | import java.util.Locale;
8 |
9 | /**
10 | * Entity the was assigned to a ResultMention.
11 | * The entity String is the identifier in YAGO2
12 | * (see http://www.yago-knowledge.org)
13 | *
14 | *
15 | */
16 | public class ResultEntity implements Comparable, Serializable {
17 |
18 | private static final long serialVersionUID = -7062155406718136994L;
19 |
20 | /** YAGO2 identifier of the entity (http://www.yago-knowledge.org) */
21 | private String entity;
22 |
23 | /** Score assigned to the entity */
24 | private double disambiguationScore;
25 |
26 | public ResultEntity(String entity, double disambiguationScore) {
27 | super();
28 | this.entity = entity;
29 | this.disambiguationScore = disambiguationScore;
30 | }
31 |
32 | public static ResultEntity getNoMatchingEntity() {
33 | return new ResultEntity(Entity.NO_MATCHING_ENTITY, 0.0);
34 | }
35 |
36 | public static List getResultEntityAsList(ResultEntity re) {
37 | List res = new ArrayList(1);
38 | res.add(re);
39 | return res;
40 | }
41 |
42 | /**
43 | * @return YAGO2 identifier of the entity (http://www.yago-knowledge.org)
44 | */
45 | public String getEntity() {
46 | return entity;
47 | }
48 |
49 | public void setEntity(String entity) {
50 | this.entity = entity;
51 | }
52 |
53 | public double getDisambiguationScore() {
54 | return disambiguationScore;
55 | }
56 |
57 | public void setDisambiguationScore(double disambiguationScore) {
58 | this.disambiguationScore = disambiguationScore;
59 | }
60 |
61 | public boolean isNoMatchingEntity() {
62 | return entity.equals(Entity.NO_MATCHING_ENTITY);
63 | }
64 |
65 | @Override
66 | public int compareTo(ResultEntity re) {
67 | // natural ordering for ResultEntities is descending
68 | return new Double(new Double(re.getDisambiguationScore())).compareTo(disambiguationScore);
69 | }
70 |
71 | public String toString() {
72 | NumberFormat df = NumberFormat.getInstance(Locale.ENGLISH);
73 | df.setMaximumFractionDigits(5);
74 | return entity + " (" + df.format(disambiguationScore) + ")";
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
/src/mpi/aida/data/ResultMention.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.data;
2 |
3 | import java.io.Serializable;
4 |
5 | import org.slf4j.Logger;
6 | import org.slf4j.LoggerFactory;
7 |
8 | /**
9 | * Mention detected in the input text. It is identified uniquely
10 | * by the combination of the three members docId+mention+characterOffset.
11 | *
12 | *
13 | */
14 | public class ResultMention implements Comparable, Serializable {
15 | private static final Logger logger =
16 | LoggerFactory.getLogger(ResultMention.class);
17 |
18 | private static final long serialVersionUID = -6791087404868641006L;
19 |
20 | private String docId;
21 |
22 | private String mention;
23 |
24 | private int characterOffset;
25 |
26 | private int characterLength;
27 |
28 | public ResultMention(String docId, String mention, int characterOffset, int characterLength) {
29 | super();
30 | this.docId = docId;
31 | this.mention = mention;
32 | this.characterOffset = characterOffset;
33 | this.characterLength = characterLength;
34 | }
35 |
36 | public String getDocId() {
37 | return docId;
38 | }
39 |
40 | public void setDocId(String docId) {
41 | this.docId = docId;
42 | }
43 |
44 | public String getMention() {
45 | return mention;
46 | }
47 |
48 | public void setMention(String mention) {
49 | this.mention = mention;
50 | }
51 |
52 | public int getCharacterOffset() {
53 | return characterOffset;
54 | }
55 |
56 | public void setCharacterOffset(int characterOffset) {
57 | this.characterOffset = characterOffset;
58 | }
59 |
60 | public int getCharacterLength() {
61 | return characterLength;
62 | }
63 |
64 | public void setCharacterLength(int characterLength) {
65 | this.characterLength = characterLength;
66 | }
67 |
68 | public static ResultMention getResultMentionFromMentionString(String docId, String mentionString) {
69 | String[] data = mentionString.split(":::");
70 |
71 | if (data.length < 3) {
72 | logger.error("Could not create ResultMention from mentionString: " + mentionString);
73 | return null;
74 | }
75 |
76 | String mention = data[0];
77 | int characterOffset = Integer.parseInt(data[1]);
78 | int characterLength = Integer.parseInt(data[2]);
79 |
80 | ResultMention rm = new ResultMention(docId, mention, characterOffset, characterLength);
81 | return rm;
82 | }
83 |
84 | @Override
85 | public boolean equals(Object o) {
86 | if (o instanceof ResultMention) {
87 | ResultMention rm = (ResultMention) o;
88 | return (docId.equals(rm.getDocId()) && mention.equals(rm.getMention()) && characterOffset == rm.getCharacterOffset());
89 | } else {
90 | return false;
91 | }
92 | }
93 |
94 | @Override
95 | public int hashCode() {
96 | return docId.hashCode() + mention.hashCode() + characterOffset;
97 | }
98 |
99 | @Override
100 | public int compareTo(ResultMention rm) {
101 | int result = docId.compareTo(rm.getDocId());
102 |
103 | if (result == 0) {
104 | result = new Integer(characterOffset).compareTo(new Integer(rm.getCharacterOffset()));
105 | }
106 |
107 | return result;
108 | }
109 |
110 | public String toString() {
111 | return "[" + docId + "] " + mention + " (" + characterOffset + "/" + characterLength + ")";
112 | }
113 | }
114 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/GraphNode.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph;
2 |
3 | import gnu.trove.map.hash.TIntDoubleHashMap;
4 |
5 | public class GraphNode {
6 |
7 | private int id;
8 | private GraphNodeTypes type;
9 | private Object NodeData = null;
10 | private TIntDoubleHashMap successors;
11 |
12 | public GraphNode() {
13 | successors = new TIntDoubleHashMap();
14 | }
15 |
16 | public int getId() {
17 | return id;
18 | }
19 | public void setId(int id) {
20 | this.id = id;
21 | }
22 | public GraphNodeTypes getType() {
23 | return type;
24 | }
25 | public void setType(GraphNodeTypes type) {
26 | this.type = type;
27 | }
28 | public Object getNodeData() {
29 | return NodeData;
30 | }
31 | public void setNodeData(Object nodeData) {
32 | NodeData = nodeData;
33 | }
34 | public TIntDoubleHashMap getSuccessors() {
35 | return successors;
36 | }
37 | public void setSuccessors(TIntDoubleHashMap successors) {
38 | this.successors = successors;
39 | }
40 |
41 | }
42 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/GraphNodeTypes.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph;
2 |
/** The two node categories in the disambiguation graph. */
public enum GraphNodeTypes {
  MENTION, ENTITY
}
6 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/algorithms/DisambiguationAlgorithm.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.algorithms;
2 |
3 | import java.util.List;
4 | import java.util.Map;
5 |
6 | import mpi.aida.data.ResultEntity;
7 | import mpi.aida.data.ResultMention;
8 |
9 |
10 | public abstract class DisambiguationAlgorithm {
11 |
12 | public abstract Map> disambiguate() throws Exception;
13 |
14 | }
15 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/algorithms/Node.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.algorithms;
2 |
3 | import java.util.Comparator;
4 |
5 | /**
6 | * Utility class to be used in the implemenation of the shortest-path
7 | * algorithms. We store a node together with its distance, and then we develop a
8 | * comparator that sorts nodes according to their distances
9 | */
/**
 * Utility class to be used in the implementation of the shortest-path
 * algorithms: a node key paired with its current distance, so nodes can be
 * ordered by distance via NodeComparator.
 */
public class Node {

  /** Node identifier; fixed at construction. */
  private final int key;

  /** Current distance; updated while the algorithm relaxes edges. */
  private double distance;

  public Node(int k, double d) {
    key = k;
    distance = d;
  }

  public int getKey() {
    return key;
  }

  public double getDistance() {
    return distance;
  }

  public void setDistance(double d) {
    distance = d;
  }
}
39 |
/**
 * Orders Nodes by distance for the shortest-path priority queues.
 *
 * NOTE(review): the inline comment below claims the *opposite* (descending)
 * order, but the code compares ascending by distance -- confirm which is
 * intended before relying on max-queue behavior.
 * Also note equals(Node, Node) is an overload, not an override of
 * Object.equals(Object)/Comparator.equals; it is never called by
 * PriorityQueue.
 */
class NodeComparator implements Comparator {

  public int compare(Node first, Node second) {

    // I want to use the opposite order, so that I can build a max priority
    // queue using the default
    // implementation of a min priority queue
    Double firstDistance = first.getDistance();
    Double secondDistance = second.getDistance();
    return firstDistance.compareTo(secondDistance);

  }

  /** Key-based equality; intended to keep one node per key in the queue. */
  public boolean equals(Node first, Node second) {

    // I just want only one node with a given key in the priority queue
    if (first.getKey() == second.getKey())
      return true;
    else
      return false;
  }

}
--------------------------------------------------------------------------------
/src/mpi/aida/graph/extraction/DegreeComparator.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.extraction;
2 |
3 | import java.util.Comparator;
4 |
/**
 * Compares ":::"-separated strings by the numeric degree stored in their
 * second field (index 1), in ascending order.
 *
 * NOTE(review): the original comment claimed the *opposite* (descending)
 * order so a min-priority-queue behaves like a max-priority-queue, but the
 * implementation sorts ascending; behavior is kept unchanged -- confirm
 * the intent at the call sites.
 */
public class DegreeComparator implements Comparator<String> {

  @Override
  public int compare(String arg0, String arg1) {
    // Primitive Double.compare avoids the boxed Double allocations of the
    // previous implementation; ordering is identical.
    double firstDegree = Double.parseDouble(arg0.split(":::")[1]);
    double secondDegree = Double.parseDouble(arg1.split(":::")[1]);
    return Double.compare(firstDegree, secondDegree);
  }
}
19 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/extraction/ExtractGraphAllEdges.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.extraction;
2 |
3 | import mpi.aida.data.Entities;
4 | import mpi.aida.data.Mentions;
5 | import mpi.aida.graph.similarity.EnsembleEntityEntitySimilarity;
6 |
7 |
/**
 * ExtractGraph variant that creates edges between all entity pairs instead
 * of only entities of distinct mentions.
 */
public class ExtractGraphAllEdges extends ExtractGraph {

  public ExtractGraphAllEdges(String graphName, Mentions m, Entities ue, EnsembleEntityEntitySimilarity eeSim, double alpha) {
    super(graphName, m, ue, eeSim, alpha);
  }

  /**
   * Always reports the two entities as having distinct mentions, so every
   * entity pair gets an edge.
   * NOTE(review): "haveDistinceMentions" looks like a typo of
   * "haveDistinctMentions"; presumably overrides a method in ExtractGraph
   * (not visible here), so renaming must happen in both places.
   */
  protected boolean haveDistinceMentions(String e1, String e2) {
    return true;
  }
}
18 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/EnsembleEntityEntitySimilarity.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity;
2 |
3 | import java.util.List;
4 |
5 | import mpi.aida.data.Entities;
6 | import mpi.aida.data.Entity;
7 | import mpi.aida.graph.similarity.util.SimilaritySettings;
8 | import mpi.experiment.trace.Tracer;
9 |
/**
 * Combines several entity-entity similarity measures into one weighted score.
 */
public class EnsembleEntityEntitySimilarity {

  // Element type undeclared in this source view -- presumably
  // EntityEntitySimilarity, as iterated in calcSimilarity().
  private List eeSims;

  /**
   * Builds the ensemble from the similarity measures configured in
   * the given settings for the given set of entities.
   */
  public EnsembleEntityEntitySimilarity(Entities uniqueEntities, SimilaritySettings settings, Tracer tracer) throws Exception {
    eeSims = settings.getEntityEntitySimilarities(uniqueEntities, tracer);
  }

  /**
   * Weighted sum over all configured measures: each measure's similarity for
   * (a, b) multiplied by the measure's weight.
   */
  public double calcSimilarity(Entity a, Entity b) throws Exception {
    double weightedSimilarity = 0.0;

    for (EntityEntitySimilarity eeSim : eeSims) {
      double sim = eeSim.calcSimilarity(a, b) * eeSim.getWeight();
      weightedSimilarity += sim;
    }

    return weightedSimilarity;
  }

  public List getEeSims() {
    return eeSims;
  }
}
33 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/MaterializedPriorProbability.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity;
2 |
3 | import gnu.trove.map.hash.TIntDoubleHashMap;
4 |
5 | import java.sql.SQLException;
6 | import java.util.HashMap;
7 | import java.util.Set;
8 |
9 | import mpi.aida.access.DataAccess;
10 |
11 | /**
12 | * This class calculates the prior probability of a mention
13 | * being associated with a given entity. The prior probability is based
14 | * on the occurrence count of links (and their anchor text as mention) with
15 | * a given Wikipedia/YAGO entity as target.
16 | *
17 | * It is faster than {@link PriorProbability} because it uses a table with
18 | * all the priors materialized. To get the table, run the {@link MaterializedPriorProbability}
19 | * main method, it will create another table in the YAGO2 database which can
20 | * then be used by this class.
21 | *
22 | *
23 | */
public class MaterializedPriorProbability extends PriorProbability {

  public MaterializedPriorProbability(Set mentions) throws SQLException {
    super(mentions);
  }

  /**
   * Loads the materialized priors for every (conflated) mention string
   * from the database via DataAccess.getEntityPriors and caches them in
   * the inherited priors map.
   * Note: invoked by the superclass constructor.
   */
  public void setupMentions(Set mentions) throws SQLException {
    priors = new HashMap();
    for (String mention : mentions) {
      mention = conflateMention(mention);
      TIntDoubleHashMap entityPriors = DataAccess.getEntityPriors(mention);
      priors.put(mention, entityPriors);
    }
  }
}
39 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/PriorProbability.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity;
2 |
3 | import gnu.trove.iterator.TIntDoubleIterator;
4 | import gnu.trove.map.hash.TIntDoubleHashMap;
5 |
6 | import java.sql.SQLException;
7 | import java.util.HashMap;
8 | import java.util.Locale;
9 | import java.util.NoSuchElementException;
10 | import java.util.Set;
11 |
12 | import mpi.aida.data.Entity;
13 |
14 | /**
15 | * This class calculates the prior probability of a mention
16 | * being associated with a given entity. The prior probability is based
17 | * on the occurrence count of links (and their anchor text as mention) with
18 | * a given Wikipedia/YAGO entity as target.
19 | *
20 | * The calculation is done on the fly, so it is a bit slow. For a faster implementation,
21 | * use {@link MaterializedPriorProbability}.
22 | *
23 | * It uses the 'hasInternalWikipediaLinkTo' and 'hasAnchorText' relations
24 | * in the YAGO2 database.
25 | *
26 | *
27 | */
28 | public abstract class PriorProbability {
29 |
30 | protected HashMap priors;
31 |
32 | private double weight;
33 |
34 | public PriorProbability(Set mentions) throws SQLException {
35 | setupMentions(mentions);
36 | }
37 |
38 | public double getWeight() {
39 | return weight;
40 | }
41 |
42 | public void setWeight(double weight) {
43 | this.weight = weight;
44 | }
45 |
46 | protected abstract void setupMentions(Set mentions) throws SQLException;
47 |
48 | /**
49 | * Returns the prior probability for the given mention-entity pair.
50 | * If smoothing is true, it will return the lowest prior among all entities if
51 | * there is no real prior.
52 | *
53 | * @param mention
54 | * @param entity
55 | * @param smoothing
56 | * @return
57 | */
58 | public double getPriorProbability(
59 | String mentionText, Entity entity, boolean smoothing) {
60 | mentionText = conflateMention(mentionText);
61 | TIntDoubleHashMap mentionPriors = priors.get(mentionText);
62 |
63 | if (mentionPriors == null) {
64 | throw new NoSuchElementException(
65 | "Mention " + mentionText + " must be passed to constructor!");
66 | }
67 |
68 | double entityPrior = mentionPriors.get(entity.getId());
69 | if (smoothing && entityPrior == 0.0) {
70 | double smallestPrior = 1.0;
71 |
72 | for (TIntDoubleIterator it = mentionPriors.iterator(); it.hasNext();) {
73 | it.advance();
74 | double currentPrior = it.value();
75 | if (currentPrior < smallestPrior) {
76 | smallestPrior = currentPrior;
77 | }
78 | }
79 | entityPrior = smallestPrior;
80 | }
81 |
82 | return entityPrior;
83 | }
84 |
85 | public double getBestPrior(String mentionText) {
86 | mentionText = conflateMention(mentionText);
87 | TIntDoubleHashMap mentionPriors = priors.get(mentionText);
88 |
89 | double bestPrior = 0.0;
90 | for (TIntDoubleIterator it = mentionPriors.iterator(); it.hasNext();) {
91 | it.advance();
92 | double currentPrior = it.value();
93 | if (currentPrior > bestPrior) {
94 | bestPrior = currentPrior;
95 | }
96 | }
97 |
98 | return bestPrior;
99 | }
100 |
101 | public double getPriorProbability(String mentionText, Entity entity) {
102 | return getPriorProbability(mentionText, entity, false);
103 | }
104 |
105 | public static String conflateMention(String mention) {
106 | // conflate cases for mentions of length >= 4
107 | if (mention.length() >= 4) {
108 | mention = mention.toUpperCase(Locale.ENGLISH);
109 | }
110 |
111 | return mention;
112 | }
113 | }
114 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/context/EmptyEntitiesContext.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.context;
2 |
3 | import mpi.aida.data.Entities;
4 | import mpi.aida.data.Entity;
5 |
/**
 * No-op EntitiesContext: provides no context for any entity. Useful for
 * similarity settings that do not need entity context data.
 */
public class EmptyEntitiesContext extends EntitiesContext {

  public EmptyEntitiesContext(Entities entities) throws Exception {
    super(entities, null);
  }

  /** Always null -- this context holds no data for any entity. */
  @Override
  public int[] getContext(Entity entity) {
    return null;
  }

  /** Nothing to set up. */
  @Override
  protected void setupEntities(Entities entities) throws Exception {
    // nothing
  }

  public String toString() {
    return "EmptyEntitiesContext";
  }
}
26 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/context/EntitiesContext.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.context;
2 |
3 | import java.util.LinkedList;
4 | import java.util.List;
5 |
6 | import mpi.aida.AidaManager;
7 | import mpi.aida.data.Entities;
8 | import mpi.aida.data.Entity;
9 | import mpi.tokenizer.data.Token;
10 | import mpi.tokenizer.data.Tokens;
11 |
12 | import org.slf4j.Logger;
13 | import org.slf4j.LoggerFactory;
14 |
15 | import basics.Normalize;
16 |
/**
 * Base class for entity context representations used by the similarity
 * measures. Subclasses define what "context" means (keyphrases, tokens, ...)
 * by implementing setupEntities() and getContext().
 */
public abstract class EntitiesContext {
  private static final Logger logger =
      LoggerFactory.getLogger(EntitiesContext.class);

  protected Entities entities;
  protected EntitiesContextSettings settings;

  /**
   * Stores the entities/settings and immediately builds the context,
   * logging the setup time.
   * NOTE(review): calls the overridable setupEntities() from the
   * constructor; subclasses must tolerate running before their own
   * initialization.
   */
  public EntitiesContext(Entities entities, EntitiesContextSettings settings) throws Exception {
    this.entities = entities;
    this.settings = settings;

    long beginTime = System.currentTimeMillis();

    setupEntities(entities);

    // runtime reported in whole seconds
    long runTime = (System.currentTimeMillis() - beginTime) / 1000;
    logger.debug("Done setting up " + this + ": " + runTime + "s");
  }

  /** Replaces the entity set and rebuilds the context for it. */
  public void setEntities(Entities entities) throws Exception {
    this.entities = entities;
    setupEntities(entities);
  }

  public Entities getEntities() {
    return entities;
  }

  /** @return the context of the given entity, encoded as ids. */
  public abstract int[] getContext(Entity entity);

  /** Builds the context data for all given entities. */
  protected abstract void setupEntities(Entities entities) throws Exception;

  /** Tokenizes the given string, returning the original token strings. */
  protected List getTokens(String string) {
    List tokens = new LinkedList();

    Tokens advTokens = AidaManager.tokenize("EntitiesContext", string);

    for (Token token : advTokens) {
      tokens.add(token.getOriginal());
    }

    return tokens;
  }

  /**
   * Converts a YAGO entity id to a display name: un-escapes the entity and
   * strips a trailing parenthesized disambiguation suffix, e.g. " (band)".
   */
  public static String getEntityName(String entity) {
    String norm = Normalize.unEntity(entity);
    norm = norm.replaceAll(" \\(.*?\\)$", "");

    return norm;
  }

  public String toString() {
    return getIdentifier();
  }

  /** Simple class name; used as the context's identifier in logs. */
  public String getIdentifier() {
    return this.getClass().getSimpleName();
  }
}
76 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/context/EntitiesContextSettings.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.context;
2 |
3 |
/**
 * Configuration holder for EntitiesContext implementations.
 * Plain bean: every value has a sensible default and may be overridden
 * through its setter before the context is constructed.
 */
public class EntitiesContextSettings {

  /** Default balance between keyphrase MI and IDF weights. */
  public static final double DEFAULT_KEYPHRASE_ALPHA = 0.9713705285593512;

  /** Default balance between keyword MI and IDF weights. */
  public static final double DEFAULT_KEYWORD_ALPHA = 0.9713705285593512;

  // --- keyphrase/keyword weighting ---
  private int numberOfEntityKeyphrases = Integer.MAX_VALUE;
  private boolean normalizeWeights = true; // default is to normalize
  private boolean useConfusableMIWeight = false;
  private boolean averageWeights = false;
  private double entityCoherenceKeyphraseAlpha = DEFAULT_KEYPHRASE_ALPHA;
  private double entityCoherenceKeywordAlpha = DEFAULT_KEYWORD_ALPHA;
  private String keyphraseSourceExclusion;

  // --- ngram settings ---
  private int nGramLength = 2;

  // --- locality sensitive hashing ---
  private int lshBandSize;
  private int lshBandCount;
  private String lshDatabaseTable;

  /**
   * @return balance between keyphrase MI/IDF: use alpha*mi, (1-alpha)*idf
   */
  public double getEntityCoherenceKeyphraseAlpha() {
    return this.entityCoherenceKeyphraseAlpha;
  }

  public void setEntityCoherenceKeyphraseAlpha(double entityCoherenceKeyphraseAlpha) {
    this.entityCoherenceKeyphraseAlpha = entityCoherenceKeyphraseAlpha;
  }

  /**
   * @return balance between keyword MI/IDF: use alpha*mi, (1-alpha)*idf
   */
  public double getEntityCoherenceKeywordAlpha() {
    return this.entityCoherenceKeywordAlpha;
  }

  public void setEntityCoherenceKeywordAlpha(double entityCoherenceKeywordAlpha) {
    this.entityCoherenceKeywordAlpha = entityCoherenceKeywordAlpha;
  }

  public int getNumberOfEntityKeyphrases() {
    return this.numberOfEntityKeyphrases;
  }

  public void setNumberOfEntityKeyphrases(int numberOfEntityKeyphrases) {
    this.numberOfEntityKeyphrases = numberOfEntityKeyphrases;
  }

  public String getKeyphraseSourceExclusion() {
    return this.keyphraseSourceExclusion;
  }

  public void setKeyphraseSourceExclusion(String keyphraseSourceExclusion) {
    this.keyphraseSourceExclusion = keyphraseSourceExclusion;
  }

  public boolean shouldNormalizeWeights() {
    return this.normalizeWeights;
  }

  public void setShouldNormalizeWeights(boolean flag) {
    this.normalizeWeights = flag;
  }

  public boolean shouldUseConfusableMIWeight() {
    return this.useConfusableMIWeight;
  }

  public void setUseConfusableMIWeight(boolean useConfusableMIWeight) {
    this.useConfusableMIWeight = useConfusableMIWeight;
  }

  public boolean shouldAverageWeights() {
    return this.averageWeights;
  }

  public void setShouldAverageWeights(boolean flag) {
    this.averageWeights = flag;
  }

  public int getNgramLength() {
    return this.nGramLength;
  }

  public void setNgramLength(int nGramLength) {
    this.nGramLength = nGramLength;
  }

  public int getLshBandSize() {
    return this.lshBandSize;
  }

  public void setLshBandSize(int lshBandSize) {
    this.lshBandSize = lshBandSize;
  }

  public int getLshBandCount() {
    return this.lshBandCount;
  }

  public void setLshBandCount(int lshBandCount) {
    this.lshBandCount = lshBandCount;
  }

  public String getLshDatabaseTable() {
    return this.lshDatabaseTable;
  }

  public void setLshDatabaseTable(String lshDatabaseTable) {
    this.lshDatabaseTable = lshDatabaseTable;
  }
}
125 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/context/KeyphraseReweightedKeywordContext.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.context;
2 |
3 | import gnu.trove.map.hash.TIntObjectHashMap;
4 |
5 | import java.util.HashMap;
6 | import java.util.Map;
7 |
8 | import mpi.aida.data.Entities;
9 | import mpi.aida.data.Entity;
10 | import mpi.experiment.trace.GraphTracer;
11 | import mpi.experiment.trace.NullGraphTracer;
12 |
13 |
14 | public class KeyphraseReweightedKeywordContext extends FastWeightedKeyphrasesContext {
15 |
16 | public KeyphraseReweightedKeywordContext(Entities entities) throws Exception {
17 | super(entities);
18 | }
19 |
20 | public KeyphraseReweightedKeywordContext(Entities entities, EntitiesContextSettings settings) throws Exception {
21 | super(entities, settings);
22 | }
23 |
24 | @Override
25 | protected TIntObjectHashMap fillEntityVectors() {
26 | TIntObjectHashMap vectors = new TIntObjectHashMap();
27 |
28 | for (Entity e : entities) {
29 | float[] weights = new float[allKeywords.size()];
30 |
31 | for (int kp : getEntityKeyphraseIds(e)) {
32 | for (int tokenId : getKeyphraseTokenIds(kp, true)) {
33 | double mi = entity2keyword2mi.get(e.getId()).get(tokenId);
34 |
35 | double finalTokenWeight = mi;
36 |
37 | double keyphraseWeight = getKeyphraseMiWeight(e, kp);
38 | double reweightedFinalTokenWeight = keyphraseWeight * finalTokenWeight;
39 |
40 | if (Double.isNaN(reweightedFinalTokenWeight)) {
41 | System.err.println("NAN");
42 | }
43 |
44 | weights[tokenId] = (float) reweightedFinalTokenWeight;
45 | }
46 | }
47 |
48 | if (!(GraphTracer.gTracer instanceof NullGraphTracer)) {
49 | Map entityKeywords = new HashMap();
50 |
51 | for (int i = 0; i < weights.length; i++) {
52 | if (weights[i] > 0.0) {
53 | entityKeywords.put(getKeywordForId(i), weights[i]);
54 | }
55 | }
56 | }
57 |
58 | vectors.put(e.getId(), weights);
59 | }
60 |
61 | return vectors;
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/context/TextContext.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.context;
2 |
3 | import gnu.trove.map.hash.TIntObjectHashMap;
4 | import mpi.aida.data.Entities;
5 | import mpi.aida.data.Entity;
6 |
7 | /**
8 | * Abstract class for all contexts containing solely integer ids
9 | * representing tokens.
10 | *
11 | *
12 | */
13 | public abstract class TextContext extends EntitiesContext {
14 |
15 | private TIntObjectHashMap entityTokens;
16 |
17 | public TextContext(Entities entities, EntitiesContextSettings settings) throws Exception {
18 | super(entities, settings);
19 | }
20 |
21 | @Override
22 | public int[] getContext(Entity entity) {
23 | return entityTokens.get(entity.getId());
24 | }
25 |
26 | @Override
27 | protected void setupEntities(Entities entities) throws Exception {
28 | entityTokens = new TIntObjectHashMap();
29 |
30 | for (int entity : entities.getUniqueIds()) {
31 | entityTokens.put(entity, getTextTokens(entity));
32 | }
33 | }
34 |
35 | protected abstract int[] getTextTokens(int entity);
36 | }
37 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/exception/MissingSettingException.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.exception;
2 |
3 |
/**
 * Thrown when a similarity component is configured without a setting
 * it requires.
 */
public class MissingSettingException extends Exception {

  private static final long serialVersionUID = -1610134821236307372L;

  /**
   * @param string description of the missing setting
   */
  public MissingSettingException(String string) {
    super(string);
  }
}
16 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/importance/EntityImportance.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.importance;
2 |
3 | import java.sql.SQLException;
4 |
5 | import mpi.aida.data.Entities;
6 | import mpi.aida.data.Entity;
7 |
8 | /**
9 | * This class serves as way to get the importance of an entity
10 | * with regard to the complete collection, not to a specific mention (such as prior probability)
11 | *
12 | *
13 | */
14 | public abstract class EntityImportance {
15 |
16 | private Entities entities;
17 |
18 | private double weight = 0.0;
19 |
20 | public EntityImportance(Entities entities) throws SQLException {
21 | this.entities = entities;
22 | setupEntities(entities);
23 | }
24 |
25 | public Entities getEntities() {
26 | return entities;
27 | }
28 |
29 | protected abstract void setupEntities(Entities e) throws SQLException;
30 |
31 | public abstract double getImportance(Entity entity);
32 |
33 | public double getWeight() {
34 | return weight;
35 | }
36 |
37 | public void setWeight(double weight) {
38 | this.weight = weight;
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/importance/InlinkCountImportance.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.importance;
2 |
3 | import gnu.trove.map.hash.TIntDoubleHashMap;
4 | import gnu.trove.map.hash.TIntObjectHashMap;
5 |
6 | import java.sql.SQLException;
7 |
8 | import mpi.aida.access.DataAccess;
9 | import mpi.aida.data.Entities;
10 | import mpi.aida.data.Entity;
11 | import mpi.aida.util.YagoUtil;
12 | import mpi.database.DBConnection;
13 |
14 | /**
15 | * Measures the importance of an entity by the number of
16 | * incoming links in Wikipedia/YAGO
17 | *
18 | *
19 | */
20 | public class InlinkCountImportance extends EntityImportance {
21 |
22 | private TIntDoubleHashMap inlinkImportance;
23 |
24 | DBConnection con;
25 |
26 | public InlinkCountImportance(Entities entities) throws SQLException {
27 | super(entities);
28 | }
29 |
30 | @Override
31 | protected void setupEntities(Entities e) throws SQLException {
32 | TIntObjectHashMap neighbors = DataAccess.getInlinkNeighbors(e);
33 | for (int eId : e.getUniqueIds()) {
34 | double importance =
35 | (double) neighbors.get(eId).length
36 | / (double) YagoUtil.TOTAL_YAGO_ENTITIES;
37 | inlinkImportance.put(eId, importance);
38 | }
39 | }
40 |
41 | @Override
42 | public double getImportance(Entity entity) {
43 | return inlinkImportance.get(entity.getId());
44 | }
45 |
46 | public String toString() {
47 | return "InlinkCountImportance";
48 | }
49 | }
50 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/AlwaysOneSimilarityMeasure.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import mpi.aida.data.Context;
4 | import mpi.aida.data.Entity;
5 | import mpi.aida.data.Mention;
6 | import mpi.aida.graph.similarity.context.EntitiesContext;
7 | import mpi.experiment.trace.Tracer;
8 |
9 | public class AlwaysOneSimilarityMeasure extends MentionEntitySimilarityMeasure {
10 |
11 | public AlwaysOneSimilarityMeasure(Tracer tracer) {
12 | super(tracer);
13 | }
14 |
15 | @Override
16 | public double calcSimilarity(Mention mention, Context context, Entity entity, EntitiesContext entitiesContext) {
17 | return 1.0;
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/EntityEntitySimilarityMeasure.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import mpi.aida.data.Entity;
4 | import mpi.aida.graph.similarity.context.EntitiesContext;
5 | import mpi.experiment.trace.Tracer;
6 |
/**
 * Base class for measures computing the similarity between two entities
 * over a shared EntitiesContext.
 */
public abstract class EntityEntitySimilarityMeasure extends SimilarityMeasure {

  public EntityEntitySimilarityMeasure(Tracer tracer) {
    super(tracer);
  }

  /**
   * Computes the similarity of entities a and b given their context.
   *
   * @param a first entity
   * @param b second entity
   * @param context context holding the representations of both entities
   * @return similarity score (range depends on the concrete measure)
   */
  public abstract double calcSimilarity(Entity a, Entity b, EntitiesContext context);
}
15 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/InlinkOverlapEntityEntitySimilarity.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import gnu.trove.list.array.TIntArrayList;
4 | import gnu.trove.map.hash.TIntObjectHashMap;
5 | import gnu.trove.set.hash.TIntHashSet;
6 |
7 | import java.util.BitSet;
8 |
9 | import mpi.aida.AidaManager;
10 | import mpi.aida.access.DataAccess;
11 | import mpi.aida.data.Entities;
12 | import mpi.aida.data.Entity;
13 | import mpi.aida.graph.similarity.EntityEntitySimilarity;
14 | import mpi.aida.graph.similarity.context.EntitiesContext;
15 | import mpi.database.DBConnection;
16 |
17 | import org.slf4j.Logger;
18 | import org.slf4j.LoggerFactory;
19 |
20 | /**
21 | * Similarity of two entities is the number of common inlinks
22 | *
23 | *
24 | */
25 | public class InlinkOverlapEntityEntitySimilarity extends EntityEntitySimilarity {
26 | private static final Logger logger =
27 | LoggerFactory.getLogger(InlinkOverlapEntityEntitySimilarity.class);
28 |
29 | private TIntObjectHashMap entity2inlink;
30 | private TIntObjectHashMap entity2vector;
31 |
32 | DBConnection con;
33 |
34 | public InlinkOverlapEntityEntitySimilarity(EntityEntitySimilarityMeasure similarityMeasure, EntitiesContext entityContext) throws Exception {
35 | // not needed - uses entites directly
36 | super(similarityMeasure, entityContext);
37 |
38 | setupEntities(entityContext.getEntities());
39 | }
40 |
41 | private void setupEntities(Entities entities) throws Exception {
42 | if (entities.uniqueNameSize() == 0) {
43 | logger.info("Skipping initialization of InlinkEntityEntitySimilarity for " + entities.uniqueNameSize() + " entities");
44 | return;
45 | }
46 |
47 | logger.info("Initializing InlinkEntityEntitySimilarity for " + entities.uniqueNameSize() + " entities");
48 |
49 | con = AidaManager.getConnectionForDatabase(AidaManager.DB_AIDA, "getting inlinks");
50 |
51 | entity2inlink = DataAccess.getInlinkNeighbors(entities);
52 |
53 | // get all inlinks for all entities
54 | // get all inlinks for all entities
55 | TIntHashSet allInlinks = new TIntHashSet();
56 |
57 | for (int[] neighbors : entity2inlink.valueCollection()) {
58 | allInlinks.addAll(neighbors);
59 | }
60 |
61 | TIntArrayList allInlinksList = new TIntArrayList(allInlinks.size());
62 | for (int entry : allInlinksList.toArray()) {
63 | allInlinksList.add(entry);
64 | }
65 | allInlinksList.sort();
66 |
67 | // now create the bitvectors for each entity
68 | logger.info("Creating bitvectors for entities");
69 |
70 | entity2vector = new TIntObjectHashMap();
71 |
72 | for (int entity : entities.getUniqueIds()) {
73 | int[] inlinks = entity2inlink.get(entity);
74 |
75 | BitSet bs = new BitSet(allInlinksList.size());
76 |
77 | int current = 0;
78 |
79 | for (int inlink : inlinks) {
80 | // move to position of inlink in allInlinks
81 | while (allInlinksList.get(current) != inlink) {
82 | current++;
83 | }
84 | bs.set(current);
85 | }
86 |
87 | entity2vector.put(entity, bs);
88 | }
89 |
90 | AidaManager.releaseConnection(AidaManager.DB_AIDA, con);
91 |
92 | logger.info("Done initializing InlinkEntityEntitySimilarity");
93 | }
94 |
95 | @Override
96 | public double calcSimilarity(Entity a, Entity b) throws Exception {
97 | BitSet bsA = entity2vector.get(a.getId());
98 | BitSet bsB = entity2vector.get(b.getId());
99 |
100 | BitSet intersection = (BitSet) bsA.clone();
101 | intersection.and(bsB);
102 |
103 | BitSet union = (BitSet) bsA.clone();
104 | union.or(bsB);
105 |
106 | if (intersection.cardinality() == 0 || union.cardinality() == 0) {
107 | return 0.0; // cannot calc
108 | }
109 |
110 | double sim = (double) intersection.cardinality()
111 | / (double) union.cardinality();
112 |
113 | return sim;
114 | }
115 |
116 | public String toString() {
117 | return "InlinkOverlapEntityEntitySimilarity";
118 | }
119 | }
120 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/JaccardEntityEntitySimilarityMeasure.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import gnu.trove.set.hash.TIntHashSet;
4 |
5 | import java.util.HashMap;
6 | import java.util.Map;
7 |
8 | import mpi.aida.AidaManager;
9 | import mpi.aida.data.Entity;
10 | import mpi.aida.graph.similarity.context.EntitiesContext;
11 | import mpi.aida.graph.similarity.context.FastWeightedKeyphrasesContext;
12 | import mpi.aida.util.CollectionUtils;
13 | import mpi.experiment.trace.Tracer;
14 | import mpi.experiment.trace.measures.KeytermEntityEntityMeasureTracer;
15 | import mpi.experiment.trace.measures.TermTracer;
16 |
17 | public class JaccardEntityEntitySimilarityMeasure extends EntityEntitySimilarityMeasure {
18 |
19 | public JaccardEntityEntitySimilarityMeasure(Tracer tracer) {
20 | super(tracer);
21 | }
22 |
23 | @Override
24 | public double calcSimilarity(Entity a, Entity b, EntitiesContext context) {
25 | TIntHashSet contextA = new TIntHashSet(context.getContext(a));
26 | TIntHashSet contextB = new TIntHashSet(context.getContext(b));
27 |
28 | TIntHashSet union = getUnion(contextA, contextB);
29 | TIntHashSet intersection = getIntersection(contextA, contextB);
30 |
31 | double jaccardSim = (double) intersection.size() / (double) union.size();
32 | return jaccardSim;
33 | }
34 |
35 | private TIntHashSet getIntersection(TIntHashSet contextA, TIntHashSet contextB) {
36 | TIntHashSet is = new TIntHashSet();
37 |
38 | for (int a : contextA.toArray()) {
39 | if (contextB.contains(a) || contextB.contains(AidaManager.expandTerm(a))) {
40 | is.add(a);
41 | }
42 | }
43 |
44 | return is;
45 | }
46 |
47 | private TIntHashSet getUnion(TIntHashSet contextA, TIntHashSet contextB) {
48 | TIntHashSet union = new TIntHashSet();
49 |
50 | for (int a : contextB.toArray()) {
51 | union.add(a);
52 | }
53 |
54 | for (int a : contextA.toArray()) {
55 | if (!union.contains(a) && !union.contains(AidaManager.expandTerm(a))) {
56 | union.add(a);
57 | }
58 | }
59 |
60 | return union;
61 | }
62 |
63 | @SuppressWarnings("unused")
64 | private void collectTracingInfo(Entity a, Entity b, int[] kpsA, int[] kpsB, double sim, Map matches, FastWeightedKeyphrasesContext kwc) {
65 | Map e1keyphrases = new HashMap();
66 | for (int kp : kpsA) {
67 | if (kwc.getCombinedKeyphraseMiIdfWeight(a, kp) > 0.0) {
68 | e1keyphrases.put(kwc.getKeyphraseForId(kp), kwc.getCombinedKeyphraseMiIdfWeight(a, kp));
69 | }
70 | }
71 | e1keyphrases = CollectionUtils.sortMapByValue(e1keyphrases, true);
72 |
73 | Map e2keyphrases = new HashMap();
74 | for (int kp : kpsB) {
75 | if (kwc.getCombinedKeyphraseMiIdfWeight(b, kp) > 0.0) {
76 | e2keyphrases.put(kwc.getKeyphraseForId(kp), kwc.getCombinedKeyphraseMiIdfWeight(b, kp));
77 | }
78 | }
79 | e2keyphrases = CollectionUtils.sortMapByValue(e2keyphrases, true);
80 |
81 | tracer.eeTracing().addEntityContext(a.getName(), e1keyphrases);
82 | tracer.eeTracing().addEntityContext(b.getName(), e2keyphrases);
83 |
84 | KeytermEntityEntityMeasureTracer mt = new KeytermEntityEntityMeasureTracer("PartialKeyphraseSim", 0.0, e2keyphrases, matches);
85 | mt.setScore(sim);
86 | tracer.eeTracing().addEntityEntityMeasureTracer(a.getName(), b.getName(), mt);
87 |
88 | KeytermEntityEntityMeasureTracer mt2 = new KeytermEntityEntityMeasureTracer("PartialKeyphraseSim", 0.0, e1keyphrases, matches);
89 | mt2.setScore(sim);
90 | tracer.eeTracing().addEntityEntityMeasureTracer(b.getName(), a.getName(), mt2);
91 | }
92 | }
93 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/JaccardSimilarityMeasure.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import gnu.trove.set.hash.TIntHashSet;
4 | import mpi.aida.AidaManager;
5 | import mpi.aida.data.Context;
6 | import mpi.aida.data.Entity;
7 | import mpi.aida.data.Mention;
8 | import mpi.aida.graph.similarity.context.EntitiesContext;
9 | import mpi.experiment.trace.Tracer;
10 |
11 | public class JaccardSimilarityMeasure extends MentionEntitySimilarityMeasure {
12 |
13 | public JaccardSimilarityMeasure(Tracer tracer) {
14 | super(tracer);
15 | }
16 |
17 | @Override
18 | public double calcSimilarity(Mention mention, Context context, Entity entity, EntitiesContext entitiesContext) {
19 | TIntHashSet contextA = new TIntHashSet(context.getTokenIds());
20 | TIntHashSet contextB = new TIntHashSet(entitiesContext.getContext(entity));
21 |
22 | TIntHashSet union = getUnion(contextA, contextB);
23 | TIntHashSet intersection = getIntersection(contextA, contextB);
24 |
25 | double jaccardSim = (double) intersection.size() / (double) union.size();
26 | return jaccardSim;
27 | }
28 |
29 | private TIntHashSet getIntersection(TIntHashSet contextA, TIntHashSet contextB) {
30 | TIntHashSet is = new TIntHashSet();
31 |
32 | for (int a : contextA.toArray()) {
33 | if (contextB.contains(a) || contextB.contains(AidaManager.expandTerm(a))) {
34 | is.add(a);
35 | }
36 | }
37 |
38 | return is;
39 | }
40 |
41 | private TIntHashSet getUnion(TIntHashSet contextA, TIntHashSet contextB) {
42 | TIntHashSet union = new TIntHashSet();
43 |
44 | for (int a : contextB.toArray()) {
45 | union.add(a);
46 | }
47 |
48 | for (int a : contextA.toArray()) {
49 | if (!union.contains(a) && !union.contains(AidaManager.expandTerm(a))) {
50 | union.add(a);
51 | }
52 | }
53 |
54 | return union;
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/MentionEntitySimilarityMeasure.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import mpi.aida.data.Context;
4 | import mpi.aida.data.Entity;
5 | import mpi.aida.data.Mention;
6 | import mpi.aida.graph.similarity.context.EntitiesContext;
7 | import mpi.experiment.trace.Tracer;
8 |
/**
 * Base class for measures computing the similarity between a mention
 * (with its surrounding token context) and a candidate entity.
 */
public abstract class MentionEntitySimilarityMeasure extends SimilarityMeasure {

  public MentionEntitySimilarityMeasure(Tracer tracer) {
    super(tracer);
  }

  // When true, implementations may discount context tokens by their
  // distance from the mention.
  protected boolean useDistanceDiscount = false;

  public boolean isUseDistanceDiscount() {
    return useDistanceDiscount;
  }

  public void setUseDistanceDiscount(boolean useDistanceDiscount) {
    this.useDistanceDiscount = useDistanceDiscount;
  }

  /**
   * Computes the similarity between the mention and the candidate entity.
   *
   * @param mention mention to score
   * @param context token context of the mention's document
   * @param entity candidate entity
   * @param entitiesContext context holding the entity representations
   * @return similarity score (range depends on the concrete measure)
   */
  public abstract double calcSimilarity(Mention mention, Context context, Entity entity, EntitiesContext entitiesContext);

  /**
   * This method is a place holder to enable the framework to add extra context to a specific mention
   * during the processing of the code
   * subclasses should override this method accordingly
   *
   * @param mention the mention to which this context belongs
   * @param context the context to add
   */
  public void addExtraContext(Mention mention, Object context) {
    return;
  }


  /**
   * This method is a place holder to enable the framework to announce when a mention gets assigned to an entity
   * different measures may perform different upon such event.
   * default implementation is doing nothing
   *
   * @param mention the mention that was assigned
   * @param entity the entity to which the mention got assigned
   */
  public void announceMentionAssignment(Mention mention, Entity entity) {
    return;
  }

}
60 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/MilneWittenEntityEntitySimilarity.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import gnu.trove.iterator.TIntObjectIterator;
4 | import gnu.trove.map.hash.TIntObjectHashMap;
5 | import javaewah.EWAHCompressedBitmap;
6 | import mpi.aida.access.DataAccess;
7 | import mpi.aida.data.Entities;
8 | import mpi.aida.data.Entity;
9 | import mpi.aida.graph.similarity.EntityEntitySimilarity;
10 | import mpi.aida.graph.similarity.context.EntitiesContext;
11 | import mpi.aida.util.YagoUtil;
12 |
13 | import org.slf4j.Logger;
14 | import org.slf4j.LoggerFactory;
15 |
16 | public class MilneWittenEntityEntitySimilarity extends EntityEntitySimilarity {
17 | private static final Logger logger =
18 | LoggerFactory.getLogger(MilneWittenEntityEntitySimilarity.class);
19 |
20 | private TIntObjectHashMap entity2vector;
21 |
22 |
23 | public MilneWittenEntityEntitySimilarity(EntityEntitySimilarityMeasure similarityMeasure, EntitiesContext entityContext) throws Exception {
24 | // not needed - uses entites directly
25 | super(similarityMeasure, entityContext);
26 |
27 | setupEntities(entityContext.getEntities());
28 | }
29 |
30 | private void setupEntities(Entities entities) throws Exception {
31 | logger.info("Initializing MilneWittenEntityEntitySimilarity for " +
32 | entities.uniqueNameSize() + " entities");
33 |
34 | TIntObjectHashMap entityInlinks =
35 | DataAccess.getInlinkNeighbors(entities);
36 |
37 | entity2vector = new TIntObjectHashMap();
38 |
39 | for (TIntObjectIterator itr = entityInlinks.iterator();
40 | itr.hasNext(); ) {
41 | itr.advance();
42 | int entity = itr.key();
43 | int[] inLinks = itr.value();
44 |
45 | EWAHCompressedBitmap bs = new EWAHCompressedBitmap();
46 | for (int l : inLinks) {
47 | bs.set(l);
48 | }
49 | entity2vector.put(entity, bs);
50 | }
51 |
52 | logger.info("Done initializing MilneWittenEntityEntitySimilarity for " +
53 | entities.uniqueNameSize() + " entities");
54 | }
55 |
56 | @Override
57 | public double calcSimilarity(Entity a, Entity b) throws Exception {
58 | EWAHCompressedBitmap bsA = entity2vector.get(a.getId());
59 | EWAHCompressedBitmap bsB = entity2vector.get(b.getId());
60 |
61 | double sizeA = bsA.cardinality();
62 | double sizeB = bsB.cardinality();
63 |
64 | double max = -1.0;
65 | double min = -1.0;
66 |
67 | if (sizeA >= sizeB) {
68 | max = sizeA;
69 | min = sizeB;
70 | } else {
71 | max = sizeB;
72 | min = sizeA;
73 | }
74 |
75 | double sim = 0.0; // default is no sim
76 |
77 | int overlap = bsA.andCardinality(bsB);
78 |
79 | if (overlap > 0) {
80 | // now calc the real similarity
81 | double distance = (Math.log(max) - Math.log((double) overlap)) / (Math.log(YagoUtil.TOTAL_YAGO_ENTITIES) - Math.log(min));
82 |
83 | sim = 1 - distance;
84 |
85 | if (distance > 1.0) {
86 | // really far apart ...
87 | sim = 0.0;
88 | }
89 | }
90 |
91 | return sim;
92 | }
93 | }
94 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/NGDSimilarityMeasure.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import gnu.trove.set.hash.TIntHashSet;
4 | import mpi.aida.data.Entity;
5 | import mpi.aida.graph.similarity.context.EntitiesContext;
6 | import mpi.aida.graph.similarity.context.WeightedKeyphrasesContext;
7 | import mpi.aida.util.YagoUtil;
8 | import mpi.experiment.trace.Tracer;
9 |
10 | public class NGDSimilarityMeasure extends EntityEntitySimilarityMeasure {
11 |
12 | public NGDSimilarityMeasure(Tracer tracer) {
13 | super(tracer);
14 | }
15 |
16 | protected WeightedKeyphrasesContext kwc;
17 |
18 | @Override
19 | public double calcSimilarity(Entity a, Entity b, EntitiesContext entitiesContext) {
20 | kwc = (WeightedKeyphrasesContext) entitiesContext;
21 |
22 | double max = getMax(a, b, entitiesContext);
23 | double min = getMin(a, b, entitiesContext);
24 | double intersect = getIntersect(a, b, entitiesContext);
25 | double collection = getCollection();
26 |
27 | double sim = 0.0;
28 |
29 | if (intersect > 0) {
30 | double ngd =
31 | ( Math.log(max) - Math.log(intersect) )
32 | / ( Math.log(collection) - Math.log(min) );
33 | sim = 1 - ngd;
34 | if (sim < 0) sim = 0.0;
35 | }
36 |
37 | return sim;
38 | }
39 |
40 | protected double getMax(Entity a, Entity b, EntitiesContext entitiesContext) {
41 | int[] e1context = kwc.getContext(a);
42 | int[] e2context = kwc.getContext(b);
43 |
44 | return Math.max(e1context.length, e2context.length);
45 | }
46 |
47 | protected double getMin(Entity a, Entity b, EntitiesContext entitiesContext) {
48 | int[] e1context = kwc.getContext(a);
49 | int[] e2context = kwc.getContext(b);
50 |
51 | return Math.min(e1context.length, e2context.length);
52 | }
53 |
54 | protected double getIntersect(Entity a, Entity b, EntitiesContext entitiesContext) {
55 | TIntHashSet e1context = new TIntHashSet(kwc.getContext(a));
56 | TIntHashSet e2context = new TIntHashSet(kwc.getContext(b));
57 |
58 | e1context.retainAll(e2context);
59 | int intersectSize = e1context.size();
60 | return (double) intersectSize;
61 | }
62 |
63 | protected double getCollection() {
64 | return ((double) YagoUtil.TOTAL_YAGO_ENTITIES);
65 | }
66 | }
67 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/NormalizedKeyphrasesBasedIDFSimilarity.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import mpi.experiment.trace.Tracer;
4 |
/**
 * Keyphrase-based IDF similarity with normalized scores: identical to
 * the unnormalized parent except that the inherited 'normalize' flag
 * is enabled.
 */
public class NormalizedKeyphrasesBasedIDFSimilarity extends UnnormalizedKeyphrasesBasedIDFSimilarity {

  public NormalizedKeyphrasesBasedIDFSimilarity(Tracer tracer) {
    super(tracer);
    normalize = true; // switch the parent implementation to normalized scoring
  }
}
12 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/NormalizedKeyphrasesBasedMISimilarity.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import mpi.experiment.trace.Tracer;
4 |
/**
 * Keyphrase-based MI similarity with normalized scores: identical to
 * the unnormalized parent except that the inherited 'normalize' flag
 * is enabled.
 */
public class NormalizedKeyphrasesBasedMISimilarity extends UnnormalizedKeyphrasesBasedMISimilarity {

  public NormalizedKeyphrasesBasedMISimilarity(Tracer tracer) {
    super(tracer);
    normalize = true; // switch the parent implementation to normalized scoring
  }
}
12 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/NullEntityEntitySimilarityMeasure.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import mpi.aida.data.Entity;
4 | import mpi.aida.graph.similarity.context.EntitiesContext;
5 | import mpi.experiment.trace.Tracer;
6 |
/**
 * Placeholder entity-entity similarity that always returns -1,
 * signalling that no similarity was computed.
 */
public class NullEntityEntitySimilarityMeasure extends EntityEntitySimilarityMeasure {

  public NullEntityEntitySimilarityMeasure(Tracer tracer) {
    super(tracer);
  }

  /** @return always -1 (no similarity computed) */
  @Override
  public double calcSimilarity(Entity a, Entity b, EntitiesContext context) {
    return -1;
  }

}
19 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/NullMentionEntittySimilarityMeasure.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import mpi.aida.data.Context;
4 | import mpi.aida.data.Entity;
5 | import mpi.aida.data.Mention;
6 | import mpi.aida.graph.similarity.context.EntitiesContext;
7 | import mpi.experiment.trace.Tracer;
8 |
9 |
/**
 * Null-object mention-entity similarity: ignores its arguments and always
 * returns 0.
 *
 * NOTE(review): class name contains a typo ("Entitty"); left unchanged as the
 * file name and any callers depend on it.
 */
public class NullMentionEntittySimilarityMeasure extends MentionEntitySimilarityMeasure {

  public NullMentionEntittySimilarityMeasure(Tracer tracer) {
    super(tracer);
  }

  @Override
  public double calcSimilarity(Mention mention, Context context, Entity entity, EntitiesContext entitiesContext) {
    // Constant placeholder score; arguments are not inspected.
    return 0;
  }
}
21 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/SimilarityMeasure.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import mpi.experiment.trace.Tracer;
4 |
5 | public abstract class SimilarityMeasure {
6 |
7 | protected Tracer tracer = null;
8 |
9 | public SimilarityMeasure(Tracer tracer) {
10 | this.tracer = tracer;
11 | }
12 |
13 | public String toString() {
14 | return getIdentifier();
15 | }
16 |
17 | public String getIdentifier() {
18 | String id = this.getClass().getSimpleName();
19 | return id;
20 | }
21 |
22 | public Tracer getTracer() {
23 | return tracer;
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/TfIdfCosineSimilarityMeasure.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import gnu.trove.map.hash.TIntDoubleHashMap;
4 | import gnu.trove.map.hash.TIntIntHashMap;
5 | import gnu.trove.set.hash.TIntHashSet;
6 | import mpi.aida.AidaManager;
7 | import mpi.aida.access.DataAccess;
8 | import mpi.aida.data.Context;
9 | import mpi.aida.data.Entity;
10 | import mpi.aida.data.Mention;
11 | import mpi.aida.graph.similarity.context.EntitiesContext;
12 | import mpi.aida.util.YagoUtil;
13 | import mpi.experiment.trace.Tracer;
14 |
15 | /**
16 | * Calculates the similarity of two contexts by the cosine similarity
17 | * of their tf.idf weighted term vectors.
18 | *
19 | *
20 | */
public class TfIdfCosineSimilarityMeasure extends MentionEntitySimilarityMeasure {

  public TfIdfCosineSimilarityMeasure(Tracer tracer) {
    super(tracer);
  }

  /**
   * Cosine similarity between the tf.idf vector of the mention's document
   * context and the tf.idf vector of the entity's context.
   */
  @Override
  public double calcSimilarity(Mention mention, Context context, Entity entity, EntitiesContext entitiesContext) {
    TIntDoubleHashMap contextVec = getTfIdfVector(context.getTokenIds());
    TIntDoubleHashMap entityVec = getTfIdfVector(entitiesContext.getContext(entity));

    double sim = calcCosine(entityVec, contextVec);
    return sim;
  }

  /**
   * Cosine of the two weight vectors. Entity terms additionally match their
   * "expanded" form in the context vector (see AidaManager.expandTerm).
   *
   * NOTE(review): if the context contains both the original term and its
   * expansion, the entity weight is multiplied into the dot product twice,
   * while the norms are computed over the raw vectors only - so the result
   * can exceed 1. Presumably intentional (expansion boosts matches); confirm.
   */
  protected double calcCosine(TIntDoubleHashMap entityVec, TIntDoubleHashMap contextVec) {
    double dotProduct = 0.0;

    for (int termA : entityVec.keys()) {
      int expandedA = AidaManager.expandTerm(termA);
      if (contextVec.containsKey(termA)) {
        double tempProduct = entityVec.get(termA) * contextVec.get(termA);
        dotProduct += tempProduct;
      }
      if (contextVec.containsKey(expandedA)) {
        // Expanded form present in the context also counts as a match.
        double tempProduct = entityVec.get(termA) * contextVec.get(expandedA);
        dotProduct += tempProduct;
      }
    }

    // Euclidean norm of the entity vector.
    double normA = 0.0;
    for (double weightA : entityVec.values()) {
      normA += weightA * weightA;
    }
    normA = Math.sqrt(normA);

    // Euclidean norm of the context vector.
    double normB = 0.0;
    for (double weightB : contextVec.values()) {
      normB += weightB * weightB;
    }
    normB = Math.sqrt(normB);

    double sim = 0.0;

    // Guard against division by zero when either vector is empty/all-zero.
    if (normA * normB != 0) {
      sim = dotProduct / (normA * normB);
    }

    return sim;
  }

  /**
   * Builds a term-id -> tf.idf weight vector for the given token ids.
   * Document frequencies come from the database; idf is log2(N/df) with
   * N = total number of YAGO entities.
   */
  private TIntDoubleHashMap getTfIdfVector(int[] is) {
    TIntDoubleHashMap vector = new TIntDoubleHashMap();

    // Term frequencies within the given token sequence.
    TIntIntHashMap tfs = new TIntIntHashMap();

    for (int term : is) {
      tfs.adjustOrPutValue(term, 1, 1);
    }

    TIntIntHashMap termDFs =
        DataAccess.getKeywordDocumentFrequencies(new TIntHashSet(is));

    for (int term : new TIntHashSet(is).toArray()) {
      int tf = tfs.get(term);
      int df = termDFs.get(term);
      // NOTE(review): unknown terms get df = N, making their idf log2(1) = 0,
      // i.e. they are effectively dropped rather than smoothed - confirm.
      if (df == 0) df = YagoUtil.TOTAL_YAGO_ENTITIES; // default smoothing

      double tfIdf =
          (double) tf
          * log2((double) YagoUtil.TOTAL_YAGO_ENTITIES / (double) df);

      vector.put(term, tfIdf);
    }

    return vector;
  }

  /** Base-2 logarithm. */
  public static double log2(double x) {
    return Math.log(x) / Math.log(2);
  }
}
103 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/UnnormalizedKeyphrasesBasedIDFSimilarity.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import mpi.aida.data.Entity;
4 | import mpi.experiment.trace.Tracer;
5 |
6 | public class UnnormalizedKeyphrasesBasedIDFSimilarity extends UnnormalizedKeyphrasesBasedMISimilarity {
7 |
8 | public UnnormalizedKeyphrasesBasedIDFSimilarity(Tracer tracer) {
9 | super(tracer);
10 | }
11 |
12 | protected double getKeywordScore(Entity entity, int keyword) {
13 | return keyphrasesContext.getKeywordIDFWeight(keyword);
14 | }
15 |
16 | public String getIdentifier() {
17 | String identifier = "UnnormalizedKeyphrasesBasedIDFSimilarity";
18 |
19 | if (isUseDistanceDiscount()) {
20 | identifier += ",i";
21 | }
22 |
23 | return identifier;
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/WeightedJaccardEntityEntitySimilarityMeasure.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import gnu.trove.set.hash.TIntHashSet;
4 |
5 | import java.util.HashMap;
6 | import java.util.Map;
7 |
8 | import mpi.aida.data.Entity;
9 | import mpi.aida.graph.similarity.context.EntitiesContext;
10 | import mpi.aida.graph.similarity.context.WeightedKeyphrasesContext;
11 | import mpi.experiment.trace.Tracer;
12 |
13 | public class WeightedJaccardEntityEntitySimilarityMeasure extends EntityEntitySimilarityMeasure {
14 |
15 | public WeightedJaccardEntityEntitySimilarityMeasure(Tracer tracer) {
16 | super(tracer);
17 | }
18 |
19 | @Override
20 | public double calcSimilarity(Entity a, Entity b, EntitiesContext context) {
21 | WeightedKeyphrasesContext kpc = (WeightedKeyphrasesContext) context;
22 |
23 | TIntHashSet contextA = new TIntHashSet(kpc.getEntityKeyphraseIds(a));
24 | TIntHashSet contextB = new TIntHashSet(kpc.getEntityKeyphraseIds(b));
25 |
26 | double intersection = getIntersection(a, contextA, b, contextB, kpc);
27 | double union = getUnion(a, contextA, b, contextB, kpc);
28 |
29 | double jaccardSim = intersection / union;
30 |
31 | return jaccardSim;
32 | }
33 |
34 | private double getIntersection(Entity a, TIntHashSet contextA, Entity b, TIntHashSet contextB, WeightedKeyphrasesContext kpc) {
35 | double intersectWeight = 0.0;
36 |
37 | for (int k : contextA.toArray()) {
38 | if (contextB.contains(k)) {
39 | intersectWeight += Math.min(kpc.getCombinedKeyphraseMiIdfWeight(a, k), kpc.getCombinedKeyphraseMiIdfWeight(b, k));
40 | }
41 | }
42 |
43 | return intersectWeight;
44 | }
45 |
46 | private double getUnion(Entity a, TIntHashSet contextA, Entity b, TIntHashSet contextB, WeightedKeyphrasesContext kpc) {
47 | Map weights = new HashMap();
48 |
49 | for (int k : contextA.toArray()) {
50 | weights.put(k, kpc.getCombinedKeyphraseMiIdfWeight(a, k));
51 | }
52 |
53 | for (int k : contextB.toArray()) {
54 | Double kwbWeight = kpc.getCombinedKeyphraseMiIdfWeight(b, k);
55 | Double kwaWeight = weights.get(k);
56 |
57 | if (kwaWeight != null) {
58 | weights.put(k, Math.max(kwaWeight, kwbWeight));
59 | } else {
60 | weights.put(k, kwbWeight);
61 | }
62 | }
63 |
64 | double unionWeight = 0.0;
65 |
66 | for (Double d : weights.values()) {
67 | unionWeight += d;
68 | }
69 |
70 | return unionWeight;
71 | }
72 | }
73 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/WeightedNGDSimilarityMeasure.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import gnu.trove.set.hash.TIntHashSet;
4 | import mpi.aida.data.Entity;
5 | import mpi.aida.graph.similarity.context.EntitiesContext;
6 | import mpi.aida.util.YagoUtil;
7 | import mpi.experiment.trace.Tracer;
8 |
9 |
10 | public class WeightedNGDSimilarityMeasure extends NGDSimilarityMeasure {
11 |
12 | public WeightedNGDSimilarityMeasure(Tracer tracer) {
13 | super(tracer);
14 | }
15 |
16 | @Override
17 | protected double getMax(Entity a, Entity b, EntitiesContext entitiesContext) {
18 | int[] e1context = kwc.getEntityKeyphraseIds(a);
19 | int[] e2context = kwc.getEntityKeyphraseIds(b);
20 |
21 | double e1weight = 0.0;
22 | for (int kp : e1context) {
23 | e1weight += kwc.getCombinedKeyphraseMiIdfWeight(a, kp);
24 | }
25 |
26 | double e2weight = 0.0;
27 | for (int kp : e2context) {
28 | e2weight += kwc.getCombinedKeyphraseMiIdfWeight(b, kp);
29 | }
30 |
31 | return Math.max(e1weight, e2weight);
32 | }
33 |
34 | @Override
35 | protected double getMin(Entity a, Entity b, EntitiesContext entitiesContext) {
36 | int[] e1context = kwc.getEntityKeyphraseIds(a);
37 | int[] e2context = kwc.getEntityKeyphraseIds(b);
38 |
39 | double e1weight = 0.0;
40 | for (int kp : e1context) {
41 | e1weight += kwc.getCombinedKeyphraseMiIdfWeight(a, kp);
42 | }
43 |
44 | double e2weight = 0.0;
45 | for (int kp : e2context) {
46 | e2weight += kwc.getCombinedKeyphraseMiIdfWeight(b, kp);
47 | }
48 |
49 | return Math.min(e1weight, e2weight);
50 | }
51 |
52 | @Override
53 | protected double getIntersect(Entity a, Entity b, EntitiesContext entitiesContext) {
54 | int[] e1context = kwc.getEntityKeyphraseIds(a);
55 | int[] e2context = kwc.getEntityKeyphraseIds(b);
56 |
57 | TIntHashSet e1forIntersect = new TIntHashSet(e1context);
58 | TIntHashSet e2forIntersect = new TIntHashSet(e2context);
59 | e1forIntersect.retainAll(e2forIntersect);
60 |
61 | double intersectWeight = 0.0;
62 |
63 | for (int kp : e1forIntersect.toArray()) {
64 | intersectWeight += kwc.getCombinedKeyphraseMiIdfWeight(a, kp);
65 | intersectWeight += kwc.getCombinedKeyphraseMiIdfWeight(b, kp);
66 | }
67 |
68 | // everthing was counted twice
69 | intersectWeight /= 2;
70 |
71 | return intersectWeight;
72 | }
73 |
74 | @Override
75 | protected double getCollection() {
76 | return YagoUtil.TOTAL_YAGO_ENTITIES;
77 | }
78 | }
79 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/measure/WordCountVectorDotProductSimilarityMeasure.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.measure;
2 |
3 | import gnu.trove.iterator.TIntIntIterator;
4 | import gnu.trove.map.hash.TIntIntHashMap;
5 | import mpi.aida.AidaManager;
6 | import mpi.aida.data.Context;
7 | import mpi.aida.data.Entity;
8 | import mpi.aida.data.Mention;
9 | import mpi.aida.graph.similarity.context.EntitiesContext;
10 | import mpi.experiment.trace.Tracer;
11 |
12 | /**
13 | * This class calculates the similarity between a mention and an
14 | * entity context by a dot product between the word count vectors.
15 | *
16 | *
17 | */
18 | public class WordCountVectorDotProductSimilarityMeasure extends MentionEntitySimilarityMeasure {
19 |
20 | public WordCountVectorDotProductSimilarityMeasure(Tracer tracer) {
21 | super(tracer);
22 | }
23 |
24 | @Override
25 | public double calcSimilarity(Mention mention, Context context, Entity entity, EntitiesContext entitiesContext) {
26 | // create two Maps representing the word count vectors
27 | TIntIntHashMap contextVec = createWordCountVector(context.getTokenIds());
28 | TIntIntHashMap entityVec = createWordCountVector(entitiesContext.getContext(entity));
29 |
30 | // calc dot product between them
31 | double similarity = calcDotProduct(entityVec, contextVec);
32 | return similarity;
33 | }
34 |
35 | private TIntIntHashMap createWordCountVector(int[] is) {
36 | TIntIntHashMap wordCountVector = new TIntIntHashMap();
37 |
38 | for (int word : is) {
39 | wordCountVector.adjustOrPutValue(word, 1, 1);
40 | }
41 |
42 | return wordCountVector;
43 | }
44 |
45 | private double calcDotProduct(
46 | TIntIntHashMap entityVec, TIntIntHashMap contextVec) {
47 | int dotProduct = 0;
48 |
49 | for (TIntIntIterator it = entityVec.iterator(); it.hasNext(); ) {
50 | it.advance();
51 | int wordA = it.key();
52 |
53 | int expandedA = AidaManager.expandTerm(wordA);
54 |
55 | // get counts of word in both vectors
56 | int wordAcount = entityVec.get(wordA);
57 | int wordBcount = contextVec.get(wordA);
58 |
59 | wordBcount += contextVec.get(expandedA); // add expanded count if available
60 |
61 | int temp = wordAcount * wordBcount;
62 | dotProduct += temp;
63 | }
64 |
65 | return dotProduct;
66 | }
67 | }
68 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/util/EntitiesContextCreator.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.util;
2 |
3 | import java.util.HashMap;
4 | import java.util.LinkedList;
5 | import java.util.List;
6 | import java.util.Map;
7 | import java.util.concurrent.locks.Lock;
8 | import java.util.concurrent.locks.ReentrantLock;
9 |
10 | import mpi.aida.data.Entities;
11 | import mpi.aida.graph.similarity.context.EntitiesContext;
12 |
13 | /**
14 | * Caches entity contexts based on the context id and document id.
15 | * Assumes distinct document ids and caches up to ecc contexts.
16 | *
17 | *
18 | */
19 | public class EntitiesContextCreator {
20 | /** Has to be at least 1. */
21 | private static final int CACHE_SIZE = 10;
22 |
23 | /** Holds the cached EntityContexts. */
24 | private Map cache =
25 | new HashMap();
26 |
27 | /**
28 | * Keeps the order in which the EntityContexts were created for
29 | * discarding the least recently used on cache overflow.
30 | */
31 | private List cacheIds = new LinkedList();
32 |
33 | /**
34 | * Synchronized the creation of different contexts. Allows the parallel
35 | * creation of contexts for distinct documents but blocks for requests
36 | * of the same context.
37 | */
38 | private Map contextCreationLocks = new HashMap();
39 |
40 | private static class EntitiesContextCreatorHolder {
41 | public static EntitiesContextCreator ecc = new EntitiesContextCreator();
42 | }
43 |
44 | public static EntitiesContextCreator getEntitiesContextCache() {
45 | return EntitiesContextCreatorHolder.ecc;
46 | }
47 |
48 | public EntitiesContext getEntitiesContext(
49 | String contextClassName, String docId, Entities entities)
50 | throws Exception {
51 |
52 | String id = getCacheId(contextClassName, docId);
53 |
54 | // Allow the parallel creation of distinct contexts but only
55 | // one creation per id.
56 | Lock contextLock = getContextCreationLock(id);
57 | contextLock.lock();
58 | EntitiesContext context = null;
59 | try {
60 | context = cache.get(id);
61 |
62 | if (context == null) {
63 | // Create context.
64 | context =
65 | (EntitiesContext)
66 | Class.forName(contextClassName).
67 | getDeclaredConstructor(Entities.class).newInstance(entities);
68 |
69 | // Put it into the cache, deleting the oldest cache if the cache
70 | // size is exceeded.
71 | synchronized(cache) {
72 | cache.put(id, context);
73 | cacheIds.add(id);
74 |
75 | if (cacheIds.size() > CACHE_SIZE) {
76 | String removedId = cacheIds.get(0);
77 | cacheIds.remove(0);
78 | cache.remove(removedId);
79 | }
80 | }
81 | }
82 | } catch (Exception e) {
83 | throw e;
84 | } finally {
85 | contextLock.unlock();
86 | }
87 |
88 | // Will be null if something goes wrong in the creation process.
89 | return context;
90 | }
91 |
92 | private String getCacheId(String contextClassName, String docId) {
93 | return contextClassName + "\t" + docId;
94 | }
95 |
96 | private synchronized Lock getContextCreationLock(String id) {
97 | Lock lock = contextCreationLocks.get(id);
98 | if (lock == null) {
99 | lock = new ReentrantLock();
100 | contextCreationLocks.put(id, lock);
101 | }
102 | return lock;
103 | }
104 | }
105 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/util/MaxMinSettings.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.util;
2 |
3 | import java.io.BufferedReader;
4 | import java.io.FileReader;
5 | import java.io.IOException;
6 | import java.io.Serializable;
7 | import java.util.HashMap;
8 | import java.util.Map;
9 |
public class MaxMinSettings implements Serializable {

  private static final long serialVersionUID = -3088993650033149824L;

  /** Maps feature name to a two-element array { min, max }. */
  Map<String, double[]> minMaxs;

  /**
   * Loads settings from a file where each line has the form:
   * featureName=min max
   *
   * @param propertiesFilePath path to the settings file
   * @throws NumberFormatException if min or max cannot be parsed
   * @throws IOException if the file cannot be read
   */
  public MaxMinSettings(String propertiesFilePath)
      throws NumberFormatException, IOException {
    minMaxs = new HashMap<String, double[]>();

    BufferedReader reader =
        new BufferedReader(new FileReader(propertiesFilePath));
    // Close in finally: the original leaked the reader when parsing threw.
    try {
      for (String line = reader.readLine(); line != null; line = reader.readLine()) {
        String[] data = line.split("=");
        String[] range = data[1].split(" ");

        double min = Double.parseDouble(range[0]);
        double max = Double.parseDouble(range[1]);

        minMaxs.put(data[0], new double[] { min, max });
      }
    } finally {
      reader.close();
    }
  }

  public MaxMinSettings(Map<String, double[]> minMaxs) {
    this.minMaxs = minMaxs;
  }

  /** @throws IllegalArgumentException if the feature is unknown */
  public double getMin(String featureName) {
    double[] range = minMaxs.get(featureName);
    if (range == null) {
      throw new IllegalArgumentException("No min for '"+featureName+"'");
    }
    return range[0];
  }

  /** @throws IllegalArgumentException if the feature is unknown */
  public double getMax(String featureName) {
    double[] range = minMaxs.get(featureName);
    if (range == null) {
      throw new IllegalArgumentException("No max for '"+featureName+"'");
    }
    return range[1];
  }
}
51 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/util/ParallelEntityEntityRelatednessComputation.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.util;
2 |
3 | import java.util.ArrayList;
4 | import java.util.Collections;
5 | import java.util.HashMap;
6 | import java.util.HashSet;
7 | import java.util.LinkedList;
8 | import java.util.List;
9 | import java.util.Map;
10 | import java.util.Set;
11 | import java.util.concurrent.CountDownLatch;
12 |
13 | import mpi.aida.config.AidaConfig;
14 | import mpi.aida.data.Entities;
15 | import mpi.aida.data.Entity;
16 | import mpi.aida.data.Mention;
17 | import mpi.aida.data.Mentions;
18 | import mpi.aida.graph.similarity.EnsembleEntityEntitySimilarity;
19 |
20 |
21 | public class ParallelEntityEntityRelatednessComputation {
22 | private int numThreads = 4; // default.
23 | private long totalNumCalcs = 0; // this is only valid if the object is created anew for each entitiy set - used for timing experiments
24 |
25 | public ParallelEntityEntityRelatednessComputation() {
26 | this(Integer.parseInt(AidaConfig.get(AidaConfig.EE_NUM_THREADS)));
27 | }
28 |
29 | public ParallelEntityEntityRelatednessComputation(int numThreads) {
30 | this.numThreads = numThreads;
31 | }
32 |
33 | public Map> computeRelatedness(EnsembleEntityEntitySimilarity entitySimilarity, Entities entities) throws InterruptedException {
34 | return computeRelatedness(entitySimilarity, entities, null);
35 | }
36 |
37 | public Map> computeRelatedness(EnsembleEntityEntitySimilarity entitySimilarity, Entities entities, Mentions mentions) throws InterruptedException {
38 | Map> entityEntitySimilarities = Collections.synchronizedMap(new HashMap>());
39 |
40 | Map> entityMentionsMap = null;
41 | if (mentions != null) {
42 | entityMentionsMap = prepareEntityMentionsMap(mentions);
43 | }
44 |
45 | List> entityPartitions = new LinkedList>();
46 | List allEntities = new ArrayList(entities.getEntities());
47 |
48 | int overall = 0;
49 | Set part = null;
50 | int partSize = entities.uniqueNameSize() / numThreads;
51 |
52 | for (int currentPart = 0; currentPart < numThreads; currentPart++) {
53 | part = new HashSet();
54 | entityPartitions.add(part);
55 |
56 | for (int j = 0; j < partSize; j++) {
57 | int total = (currentPart * partSize) + j;
58 | part.add(allEntities.get(total));
59 |
60 | overall++;
61 | }
62 | }
63 |
64 | // add rest to last part
65 | for (; overall < allEntities.size(); overall++) {
66 | part.add(allEntities.get(overall));
67 | }
68 |
69 | // create threads and run
70 | CountDownLatch cdl = new CountDownLatch(numThreads);
71 |
72 | List scs = new LinkedList();
73 |
74 | for (int i = 0; i < numThreads; i++) {
75 | ParallelEntityEntityRelatednessComputationThread sc = new ParallelEntityEntityRelatednessComputationThread(entityPartitions.get(i), entities, entitySimilarity, entityEntitySimilarities, entityMentionsMap, cdl);
76 | scs.add(sc);
77 | sc.start();
78 | }
79 |
80 | // wait for calculation to finish
81 | cdl.await();
82 |
83 | // sum up total number of calculations
84 | for (ParallelEntityEntityRelatednessComputationThread sc : scs) {
85 | totalNumCalcs += sc.getNumCalcs();
86 | }
87 |
88 | return entityEntitySimilarities;
89 | }
90 |
91 | private Map> prepareEntityMentionsMap(Mentions mentions) {
92 | Map> entityMentionsMap = new HashMap>();
93 |
94 | for (int i = 0; i < mentions.getMentions().size(); i++) {
95 | Mention mention = mentions.getMentions().get(i);
96 | Entities entities = mention.getCandidateEntities();
97 | for (Entity entity : entities) {
98 | List entityMentions = entityMentionsMap.get(entity);
99 | if (entityMentions == null) {
100 | entityMentions = new LinkedList();
101 | entityMentionsMap.put(entity, entityMentions);
102 | }
103 | entityMentions.add(mention);
104 | }
105 | }
106 |
107 | return entityMentionsMap;
108 | }
109 |
110 | public long getTotalNumCalcs() {
111 | return totalNumCalcs;
112 | }
113 | }
114 |
--------------------------------------------------------------------------------
/src/mpi/aida/graph/similarity/util/ParallelEntityEntityRelatednessComputationThread.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.graph.similarity.util;
2 |
3 | import java.util.HashMap;
4 | import java.util.HashSet;
5 | import java.util.List;
6 | import java.util.Map;
7 | import java.util.Set;
8 | import java.util.concurrent.CountDownLatch;
9 |
10 | import mpi.aida.data.Entities;
11 | import mpi.aida.data.Entity;
12 | import mpi.aida.data.Mention;
13 | import mpi.aida.graph.similarity.EnsembleEntityEntitySimilarity;
14 |
15 | import org.slf4j.Logger;
16 | import org.slf4j.LoggerFactory;
17 |
18 |
19 | public class ParallelEntityEntityRelatednessComputationThread extends Thread {
20 | private static final Logger logger =
21 | LoggerFactory.getLogger(ParallelEntityEntityRelatednessComputationThread.class);
22 |
23 | private Set partition;
24 | private Entities allEntities;
25 | private EnsembleEntityEntitySimilarity eeSimMeasure;
26 | private Map> entityEntitySimilarities;
27 | private Map> entityMentionsMap;
28 | private CountDownLatch cdl;
29 | private int numCalcs = 0;
30 |
31 | public ParallelEntityEntityRelatednessComputationThread(Set partition, Entities allEntities, EnsembleEntityEntitySimilarity eeSim, Map> entityEntitySimilarities, Map> entityMentionsMap, CountDownLatch cdl) {
32 | this.partition = partition;
33 | this.allEntities = allEntities;
34 | this.eeSimMeasure = eeSim;
35 | this.entityEntitySimilarities = entityEntitySimilarities;
36 | this.entityMentionsMap = entityMentionsMap;
37 | this.cdl = cdl;
38 | }
39 |
40 | @Override
41 | public void run() {
42 | for (Entity e1 : partition) {
43 | for (Entity e2 : allEntities) {
44 | // only calculate and add if e1 < e2 (similarities are
45 | // symmetric, calculate in lexicographic order)
46 | if (e1.compareTo(e2) < 0) {
47 | double sim = 0.0;
48 | // calculate only if they belong to different mentions
49 | if (shouldCalculate(e1,e2)) {
50 | try {
51 | sim = eeSimMeasure.calcSimilarity(e1, e2);
52 | numCalcs++;
53 | // negative is not allowed
54 | if (sim < 0) {
55 | logger.warn("Coherence of '"+e1+"' and '"+e2+"' was < 0, set to 0");
56 | sim = 0.0;
57 | }
58 | } catch (Exception e) {
59 | e.printStackTrace();
60 | }
61 | } else {
62 | continue;
63 | }
64 |
65 | Map sims = entityEntitySimilarities.get(e1);
66 | if (sims == null) {
67 | sims = new HashMap();
68 | entityEntitySimilarities.put(e1, sims);
69 | }
70 | sims.put(e2, sim);
71 | }
72 | }
73 | }
74 | cdl.countDown();
75 | }
76 |
77 | public int getNumCalcs() {
78 | return numCalcs;
79 | }
80 |
81 | protected boolean shouldCalculate(Entity e1, Entity e2) {
82 | if (entityMentionsMap != null) {
83 | Set mentions1 = new HashSet();
84 |
85 | for (Mention m : entityMentionsMap.get(e1)) {
86 | mentions1.add(m);
87 | }
88 |
89 | Set mentions2 = new HashSet();
90 |
91 | for (Mention m : entityMentionsMap.get(e2)) {
92 | mentions2.add(m);
93 | }
94 |
95 | if (mentions1.size() != mentions2.size()) return true;
96 |
97 | for (Mention mention : mentions1) {
98 | if (!mentions2.contains(mention)) return true;
99 | }
100 | return false;
101 | } else {
102 | return true;
103 | }
104 | }
105 | }
106 |
--------------------------------------------------------------------------------
/src/mpi/aida/preparation/AidaTokenizerManager.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.preparation;
2 |
3 | import mpi.tokenizer.data.Tokenizer;
4 | import mpi.tokenizer.data.TokenizerManager;
5 | import mpi.tokenizer.data.Tokens;
6 |
/**
 * Thin static facade over {@link TokenizerManager} for initializing and
 * running tokenization.
 */
public class AidaTokenizerManager {
  /** Initializes the underlying TokenizerManager. */
  public static void init() {
    TokenizerManager.init();
  }

  /** Tokenizes the text of a document by delegating to TokenizerManager.parse(). */
  public static Tokens tokenize(String docId, String text, Tokenizer.type type, boolean lemmatize) {
    Tokens tokens = TokenizerManager.parse(docId, text, type, lemmatize);
    return tokens;
  }
}
17 |
--------------------------------------------------------------------------------
/src/mpi/aida/preparation/mentionrecognition/FilterMentions.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.preparation.mentionrecognition;
2 |
3 | import java.io.Serializable;
4 | import java.util.LinkedList;
5 | import java.util.List;
6 |
7 | import javatools.datatypes.Pair;
8 | import mpi.aida.data.Mentions;
9 | import mpi.aida.data.PreparedInput;
10 | import mpi.tokenizer.data.Token;
11 | import mpi.tokenizer.data.Tokens;
12 |
13 | public class FilterMentions implements Serializable {
14 |
15 | private static final long serialVersionUID = 6260499966421708963L;
16 |
17 | private NamedEntityFilter namedEntityFilter = null;
18 |
19 | private ManualFilter manualFilter = null;
20 |
21 | private HybridFilter hybridFilter = null;
22 |
23 | public FilterMentions() {
24 | namedEntityFilter = new NamedEntityFilter();
25 | manualFilter = new ManualFilter();
26 | hybridFilter = new HybridFilter();
27 | }
28 |
29 | /** which type of tokens to get*/
30 | public static enum FilterType {
31 | STANFORD_NER, Manual, ManualPOS, Manual_NER, Hybrid, None;
32 | };
33 |
34 | public PreparedInput filter(String text, String docId, Tokens tokens, FilterType by) {
35 | Mentions mentions = null;
36 | Tokens returnTokens = null;
37 | if (by.equals(FilterType.STANFORD_NER)) {
38 | mentions = namedEntityFilter.filter(tokens);
39 | returnTokens = tokens;
40 | } else if (by.equals(FilterType.Manual) || by.equals(FilterType.ManualPOS) || by.equals(FilterType.Manual_NER)) {
41 | Pair tokensMentions = manualFilter.filter(text, docId, by);
42 | mentions = tokensMentions.second();
43 | returnTokens = tokensMentions.first();
44 | } else if (by.equals(FilterType.Hybrid)) {
45 | Pair tokensMentions = manualFilter.filter(text, docId, by);
46 | Mentions manualMentions = tokensMentions.second();
47 | Mentions NERmentions = namedEntityFilter.filter(tokensMentions.first());
48 | mentions = hybridFilter.parse(manualMentions, NERmentions);
49 | returnTokens = tokensMentions.first();
50 | } else if (by.equals(FilterType.None)) {
51 | mentions = new Mentions();
52 | List tokenlist = new LinkedList();
53 | for (int p = 0; p < tokens.size(); p++) {
54 | Token token = tokens.getToken(p);
55 | tokenlist.add(token.getOriginal());
56 | }
57 | returnTokens = tokens;
58 | }
59 | PreparedInput preparedInput = new PreparedInput(docId, returnTokens, mentions);
60 | return preparedInput;
61 | }
62 | }
--------------------------------------------------------------------------------
/src/mpi/aida/preparation/mentionrecognition/HybridFilter.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.preparation.mentionrecognition;
2 |
3 | import java.util.Collections;
4 | import java.util.LinkedList;
5 | import java.util.List;
6 |
7 | import mpi.aida.data.Mention;
8 | import mpi.aida.data.Mentions;
9 |
10 | public class HybridFilter {
11 |
12 | public Mentions parse(Mentions manual, Mentions ner) {
13 | int from = 0;
14 | List toAdd = new LinkedList();
15 | for (int i = 0; i < ner.getMentions().size(); i++) {
16 | Mention nerMention = ner.getMentions().get(i);
17 | boolean ok = true;
18 | int nerStart = nerMention.getStartToken();
19 | int nerEnd = nerMention.getEndToken();
20 | for (int m = from; m < manual.getMentions().size(); m++) {
21 | Mention manMention = manual.getMentions().get(m);
22 | int manStart = manMention.getStartToken();
23 | int manEnd = manMention.getEndToken();
24 | if (nerEnd >= manStart && nerEnd <= manEnd) {
25 | ok = false;
26 | } else if (nerStart >= manStart && nerStart <= manEnd) {
27 | ok = false;
28 | } else if (nerStart <= manStart && nerEnd >= manEnd) {
29 | ok = false;
30 | }
31 | }
32 | if (ok) {
33 | toAdd.add(nerMention);
34 | }
35 | }
36 | for (int i = 0; i < toAdd.size(); i++) {
37 | manual.addMention(toAdd.get(i));
38 | }
39 | Collections.sort(manual.getMentions());
40 | return manual;
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/src/mpi/aida/preparation/mentionrecognition/NamedEntityFilter.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.preparation.mentionrecognition;
2 |
3 | import java.util.HashMap;
4 | import java.util.LinkedList;
5 | import java.util.List;
6 |
7 | import mpi.aida.data.Mention;
8 | import mpi.aida.data.Mentions;
9 | import mpi.tokenizer.data.Token;
10 | import mpi.tokenizer.data.Tokens;
11 |
12 | public class NamedEntityFilter {
13 |
14 | private HashMap tags = null;
15 |
16 | public NamedEntityFilter() {
17 | tags = new HashMap();
18 | tags.put("LOCATION", "LOCATION");
19 | tags.put("I-LOC", "I-LOC");
20 | tags.put("B-LOC", "I-LOC");
21 | tags.put("PERSON", "PERSON");
22 | tags.put("I-PER", "I-PER");
23 | tags.put("B-PER", "I-PER");
24 | tags.put("ORGANIZATION", "ORGANIZATION");
25 | tags.put("I-ORG", "I-ORG");
26 | tags.put("B-ORG", "I-ORG");
27 | tags.put("MISC", "MISC");
28 | tags.put("I-MISC", "I-MISC");
29 | tags.put("B-MISC", "I-MISC");
30 | }
31 |
32 | public Mentions filter(Tokens tokens) {
33 | Mentions mentions = new Mentions();
34 | HashMap subStrings = new HashMap();
35 | List content = new LinkedList();
36 | for (int p = 0; p < tokens.size(); p++) {
37 | Token token = tokens.getToken(p);
38 | content.add(token.getOriginal());
39 | }
40 | String previous = null;
41 | int start = -1;
42 | int end = -1;
43 | for (int p = 0; p < tokens.size(); p++) {
44 | Token token = tokens.getToken(p);
45 | if (previous == null) {
46 | if (tags.containsKey(token.getNE())) {
47 | previous = tags.get(token.getNE());
48 | start = token.getId();
49 | end = token.getId();
50 | }
51 | } else if (previous.equals(token.getNE())) {
52 | end = token.getId();
53 | } else {
54 | Mention newMentions = getPossibleMentions(start, end, tokens);
55 | mentions.addMention(newMentions);
56 | subStrings.put(start, end);
57 | previous = null;
58 | if (tags.containsKey(token.getNE())) {
59 | previous = tags.get(token.getNE());
60 | start = token.getId();
61 | end = token.getId();
62 | }
63 | }
64 | }
65 | if (previous != null) {
66 | Mention newMentions = getPossibleMentions(start, end, tokens);
67 | mentions.addMention(newMentions);
68 | subStrings.put(start, end);
69 | previous = null;
70 | }
71 | mentions.setSubstring(subStrings);
72 | return mentions;
73 | }
74 |
75 | private Mention getPossibleMentions(int start, int end, Tokens advTokens) {
76 | String meansArg = advTokens.toText(start, end);
77 | int startStanford = advTokens.getToken(start).getStandfordId();
78 | int sentenceId = advTokens.getToken(start).getSentence();
79 | int endStanford = advTokens.getToken(end).getStandfordId();
80 | Mention mention = new Mention(meansArg, start, end, startStanford, endStanford, sentenceId);
81 | int firstChar = advTokens.getToken(mention.getStartToken()).getBeginIndex();
82 | int lastChar = advTokens.getToken(mention.getEndToken()).getEndIndex();
83 | int charLength = lastChar - firstChar;
84 | mention.setCharOffset(firstChar);
85 | mention.setCharLength(charLength);
86 | return mention;
87 | }
88 | }
89 |
--------------------------------------------------------------------------------
/src/mpi/aida/util/CollectionUtils.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.util;
2 |
3 | import java.util.Collections;
4 | import java.util.Comparator;
5 | import java.util.LinkedHashMap;
6 | import java.util.LinkedList;
7 | import java.util.List;
8 | import java.util.Map;
9 |
10 |
public class CollectionUtils {

  /**
   * Sorts the given map by value in ascending order.
   *
   * @param map map to sort; not modified
   * @return a new LinkedHashMap whose iteration order is ascending by value
   */
  public static <K, V extends Comparable<? super V>> LinkedHashMap<K, V> sortMapByValue(Map<K, V> map) {
    return sortMapByValue(map, false);
  }

  /**
   * Sorts the given map by value.
   *
   * @param map map to sort; not modified
   * @param descending if true, sort descending by value
   * @return a new LinkedHashMap whose iteration order follows the sorted values
   */
  public static <K, V extends Comparable<? super V>> LinkedHashMap<K, V> sortMapByValue(Map<K, V> map, final boolean descending) {
    List<Map.Entry<K, V>> list = new LinkedList<Map.Entry<K, V>>(map.entrySet());
    Collections.sort(list, new Comparator<Map.Entry<K, V>>() {

      @Override
      public int compare(Map.Entry<K, V> o1, Map.Entry<K, V> o2) {
        int comp = o1.getValue().compareTo(o2.getValue());

        if (descending) {
          comp = comp * (-1);
        }

        return comp;
      }
    });

    // LinkedHashMap preserves the insertion (i.e. sorted) order.
    LinkedHashMap<K, V> result = new LinkedHashMap<K, V>();
    for (Map.Entry<K, V> entry : list) {
      result.put(entry.getKey(), entry.getValue());
    }
    return result;
  }
}
38 |
--------------------------------------------------------------------------------
/src/mpi/aida/util/DocumentCounter.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.util;
2 |
3 | import java.util.Map;
4 | import java.util.Observable;
5 |
6 | import mpi.aida.data.DisambiguationResults;
7 |
8 | import org.slf4j.Logger;
9 | import org.slf4j.LoggerFactory;
10 |
11 |
12 | public class DocumentCounter extends Observable {
13 | private static final Logger logger =
14 | LoggerFactory.getLogger(DocumentCounter.class);
15 |
16 | private int completed;
17 | private int total;
18 | private long startTime;
19 |
20 | private Map resultsMap;
21 |
22 | public DocumentCounter(int total) {
23 | completed = 0;
24 | this.total = total;
25 | startTime = System.currentTimeMillis();
26 | }
27 |
28 | public synchronized void oneDone() {
29 | setChanged();
30 | completed++;
31 | notifyObservers(resultsMap);
32 |
33 | long runtime = (System.currentTimeMillis() - startTime) / 1000;
34 | logger.info(completed+"/"+total+" DONE ("+runtime+"s total)");
35 | }
36 |
37 | public Map getResultsMap() {
38 | return resultsMap;
39 | }
40 |
41 | public void setResultsMap(Map resultsMap) {
42 | this.resultsMap = resultsMap;
43 | }
44 | }
--------------------------------------------------------------------------------
/src/mpi/aida/util/InputTextInvertedIndex.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.util;
2 |
3 | import gnu.trove.iterator.TIntIterator;
4 | import gnu.trove.list.linked.TIntLinkedList;
5 | import gnu.trove.map.hash.TIntIntHashMap;
6 | import gnu.trove.map.hash.TIntObjectHashMap;
7 |
8 | import java.util.LinkedList;
9 | import java.util.List;
10 |
11 | import mpi.aida.data.Mention;
12 |
/**
 * Inverted index over the token ids of an input text. Two parallel indexes
 * are maintained: one over the full token sequence (stop words included) and
 * one over the stopword-free sequence, whose positions are counted after
 * removing stop words.
 */
public class InputTextInvertedIndex {
  // token id -> positions in the full token sequence (stop words included)
  private TIntObjectHashMap indexIncludingStopWords;
  // token id -> positions in the stopword-free token sequence
  private TIntObjectHashMap indexWithoutStopWords;

  /** Creates an empty index; fill it via addToIndex(). */
  public InputTextInvertedIndex() {
    indexIncludingStopWords = new TIntObjectHashMap();
    indexWithoutStopWords = new TIntObjectHashMap();
  }

  /**
   * Builds both indexes from the token ids in document order.
   *
   * @param tokens token ids in document order
   * @param isRemoveStopWords if true, stop words are left out of the
   *        stopword-free index (they always remain in the full index)
   */
  public InputTextInvertedIndex(int[] tokens, boolean isRemoveStopWords) {
    indexIncludingStopWords = new TIntObjectHashMap();
    indexWithoutStopWords = new TIntObjectHashMap();
    int noStopwordsPosition = 0;
    for (int position = 0; position < tokens.length; ++position) {
      int token = tokens[position];
      TIntLinkedList positions = indexIncludingStopWords.get(token);
      if (positions == null) {
        positions = new TIntLinkedList();
        indexIncludingStopWords.put(token, positions);
      }
      positions.add(position);

      if(!isRemoveStopWords || !StopWord.is(token)) {
        positions = indexWithoutStopWords.get(token);
        if (positions == null) {
          positions = new TIntLinkedList();
          indexWithoutStopWords.put(token, positions);
        }
        // Positions in this index are relative to the stopword-free sequence.
        positions.add(noStopwordsPosition);
        noStopwordsPosition++;
      }
    }
  }

  /**
   * Checks whether the word occurs in the text outside the given mention's
   * token span (occurrences inside the mention do not count).
   */
  public boolean containsWord(int word, Mention mention) {
    if(!indexWithoutStopWords.containsKey(word))
      return false;
    // NOTE(review): presence is checked on indexWithoutStopWords but the
    // positions are read from indexIncludingStopWords — presumably intended
    // (skip stop words, but compare against real text positions); confirm.
    TIntLinkedList positions = indexIncludingStopWords.get(word);
    int mentionStart = mention.getStartToken();
    int mentionEnd = mention.getEndToken();
    for(TIntIterator itr = positions.iterator(); itr.hasNext(); ) {
      int position = itr.next();
      if(position < mentionStart || position > mentionEnd)
        return true;
    }
    return false;
  }

  /**
   * Returns the stopword-free positions of the word relative to the mention:
   * positions after the mention are shifted left by the mention's length so
   * distances are measured as if the mention were removed.
   *
   * NOTE(review): index i is used against both the full and the
   * stopword-free position lists — this assumes the word is not a stop word,
   * so both lists have equal length. Confirm callers only pass such words.
   */
  public List getPositions(int word, Mention mention) {
    int mentionStart = mention.getStartToken();
    int mentionEnd = mention.getEndToken();
    int mentionLength = mentionEnd - mentionStart + 1;

    List positions = new LinkedList();
    //we need to subtract the mention length if the keyword is after the mention
    for(int i = 0; i < indexIncludingStopWords.get(word).size(); i++) {
      //get the keyword position from the full index (including stopwords)
      int position = indexIncludingStopWords.get(word).get(i);
      //compare to know the position of the keyword relative to the mention
      if(position < mentionStart) //before the mention, return the actual position from the stopwords free index
        positions.add(indexWithoutStopWords.get(word).get(i));
      else if((position > mentionEnd)) //if after the mention, get the actual position and subtract mention length
        positions.add(indexWithoutStopWords.get(word).get(i) - mentionLength);
    }

    return positions;
  }

  /**
   * Appends one position per word to both indexes.
   *
   * @param newIndexEntries word id -> position; the same offset is added to
   *        the full and the stopword-free index, so offsets are assumed to
   *        be pre-adjusted by the caller — TODO confirm.
   */
  public void addToIndex(TIntIntHashMap newIndexEntries) {
    for(int word: newIndexEntries.keys()) {
      int offset = newIndexEntries.get(word);

      TIntLinkedList positions;
      positions = indexIncludingStopWords.get(word);
      if (positions == null) {
        positions = new TIntLinkedList();
        indexIncludingStopWords.put(word, positions);
      }
      positions.add(offset);

      positions = indexWithoutStopWords.get(word);
      if (positions == null) {
        positions = new TIntLinkedList();
        indexWithoutStopWords.put(word, positions);
      }
      positions.add(offset);


    }
  }

}
105 |
--------------------------------------------------------------------------------
/src/mpi/aida/util/MinCover.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.util;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 |
/**
 * Plain data holder for the result of a minimal-cover computation: the cover
 * length plus the start/end positions of the covering intervals.
 */
public class MinCover {
  /** Length of the minimal cover. */
  public int length;
  /** Start positions of the covering intervals. */
  public List<Integer> startPositions = new ArrayList<Integer>();
  /** End positions of the covering intervals, parallel to startPositions. */
  public List<Integer> endPositions = new ArrayList<Integer>();
}
11 |
--------------------------------------------------------------------------------
/src/mpi/aida/util/NiceTime.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.util;
2 |
3 | /**
4 | * Contains a method that will create a String from any long
5 | * saying how many days, hours, minutes ... the time value
6 | * represents.
7 | */
/**
 * Contains a method that will create a String from any long
 * saying how many days, hours, minutes ... the time value
 * represents.
 */
public class NiceTime {

  /**
   * Converts a duration in milliseconds into a readable time string,
   * e.g. 1234 becomes "1s, 234ms". Negative input yields "0ms".
   *
   * @param time duration in milliseconds
   * @return human-readable duration string
   */
  public static String convert(long time) {
    // -1 marks a unit that was never reached, so it is omitted from output.
    long seconds = -1;
    long minutes = -1;
    long hours = -1;
    // StringBuilder instead of StringBuffer: local, single-threaded use.
    StringBuilder sb = new StringBuilder(100);
    if (time < 0) {
      return "0ms";
    }
    long milliseconds = time % 1000;
    time = time / 1000;
    if (time > 0) {
      seconds = time % 60;
      time = time / 60;
    }
    if (time > 0) {
      minutes = time % 60;
      time = time / 60;
    }
    if (time > 0) {
      hours = time % 24;
      time = time / 24;
    }
    // Whatever remains after the divisions above is whole days.
    if (time > 0) {
      sb.append(time + "d, ");
    }
    if (hours != -1) {
      sb.append(hours + "h, ");
    }
    if (minutes != -1) {
      sb.append(minutes + "m, ");
    }
    if (seconds != -1) {
      sb.append(seconds + "s, ");
    }
    sb.append(milliseconds + "ms");
    return sb.toString();
  }

  /** Convenience overload; truncates the double to a long. */
  public static String convert(double time) {
    return convert((long) time);
  }
}
59 |
--------------------------------------------------------------------------------
/src/mpi/aida/util/Result.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.util;
2 |
3 | import java.util.Collections;
4 | import java.util.HashMap;
5 | import java.util.LinkedList;
6 | import java.util.List;
7 |
8 | import mpi.aida.util.htmloutput.ResultMention;
9 | import mpi.tokenizer.data.Tokens;
10 |
11 | public class Result {
12 |
13 | private String text;
14 |
15 | private String docId;
16 |
17 | private List dataSetIds;
18 |
19 | private String groundTruthId = null;
20 |
21 | private Tokens tokens;
22 |
23 | private HashMap> finalEntities = null;
24 |
25 | public Result(String docId, String text, Tokens tokens, String groundTruthId) {
26 | this.docId = docId;
27 | this.text = text;
28 | this.dataSetIds = new LinkedList();
29 | this.tokens = tokens;
30 | finalEntities = new HashMap>();
31 | this.groundTruthId = groundTruthId;
32 | }
33 |
34 | public void addFinalentity(ResultMention entity) {
35 | registerDataSet(entity.getDataSetId());
36 | HashMap entry = null;
37 | if (finalEntities.containsKey(entity.getOffset())) {
38 | entry = finalEntities.get(entity.getOffset());
39 | } else {
40 | entry = new HashMap();
41 | finalEntities.put(entity.getOffset(), entry);
42 | }
43 | if (!entry.containsKey(entity.getOffset())) {
44 | entry.put(entity.getDataSetId(), entity);
45 | }
46 | }
47 |
48 | private void registerDataSet(String dataSetId) {
49 | if (!dataSetIds.contains(dataSetId)) {
50 | if (dataSetId.equals(groundTruthId)) {
51 | dataSetIds.add(0, dataSetId);
52 | } else {
53 | dataSetIds.add(dataSetId);
54 | }
55 | }
56 | }
57 |
58 | public String getDocId() {
59 | return docId;
60 | }
61 |
62 | public String getText() {
63 | return text;
64 | }
65 |
66 | public boolean containsMention(int offset) {
67 | return finalEntities.containsKey(offset);
68 | }
69 |
70 | public boolean containsMention(int offset, String id) {
71 | if (!finalEntities.containsKey(offset)) {
72 | return false;
73 | }
74 | return finalEntities.get(offset).containsKey(id);
75 | }
76 |
77 | public HashMap getMention(int offset) {
78 | return finalEntities.get(offset);
79 | }
80 |
81 | public int size() {
82 | return finalEntities.size();
83 | }
84 |
85 | public Tokens getTokens() {
86 | return tokens;
87 | }
88 |
89 | public List getDataSetIds() {
90 | return dataSetIds;
91 | }
92 |
93 | public void sortDataSetIds(HashMap idsAvgPrec){
94 | Collections.sort(dataSetIds, new SortByAvgPre(idsAvgPrec));
95 | dataSetIds.remove(groundTruthId);
96 | dataSetIds.add(0,groundTruthId);
97 | }
98 |
99 | public String getGroundTruthId() {
100 | return groundTruthId;
101 | }
102 |
103 | }
104 |
--------------------------------------------------------------------------------
/src/mpi/aida/util/SortByAvgPre.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.util;
2 |
3 | import java.util.Comparator;
4 | import java.util.HashMap;
5 |
/**
 * Orders data set ids by their average precision, descending. Ids with no
 * precision value (null or "none") sort last; two such ids compare equal.
 */
public class SortByAvgPre implements Comparator<String> {

  /** data set id -> average precision rendered as a string (or "none"). */
  private HashMap<String, String> idsAvgPrec = null;

  public SortByAvgPre(HashMap<String, String> idsAvgPrec) {
    this.idsAvgPrec = idsAvgPrec;
  }

  @Override
  public int compare(String o1, String o2) {
    String p1 = idsAvgPrec.get(o1);
    String p2 = idsAvgPrec.get(o2);
    if (p1 == null && p2 == null) {
      return 0;
    } else if (p1 == null || p1.equals("none")) {
      return 1;
    } else if (p2 == null || p2.equals("none")) {
      return -1;
    }
    double first = Double.parseDouble(p1);
    double second = Double.parseDouble(p2);
    // Swapped arguments give descending order (higher precision first).
    return Double.compare(second, first);
  }

}
34 |
--------------------------------------------------------------------------------
/src/mpi/aida/util/WikipediaDumpArticleIdExtractor.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.util;
2 |
3 | import java.io.Reader;
4 |
5 | import javatools.filehandlers.FileLines;
6 | import javatools.parsers.Char;
7 | import javatools.util.FileUtils;
8 |
9 | /**
10 | * Extracts all article ids from a Wikipedia pages-articles dump.
11 | * Output format is:
12 | * article_titleid
13 | *
14 | *
15 | */
16 | public class WikipediaDumpArticleIdExtractor {
17 |
18 | public static void main(String[] args) throws Exception {
19 | if (args.length != 1) {
20 | printUsage();
21 | System.exit(1);
22 | }
23 |
24 | final Reader reader = FileUtils.getBufferedUTF8Reader(args[0]);
25 | String page = FileLines.readBetween(reader, "", "");
26 |
27 | int pagesDone = 0;
28 |
29 | while (page != null) {
30 | if (++pagesDone % 100000 == 0) {
31 | System.err.println(pagesDone + " pages done.");
32 | }
33 |
34 | page = Char.decodeAmpersand(page.replace("&", "&"));
35 | String title = FileLines.readBetween(page, "", "");
36 | String id = FileLines.readBetween(page, "", "");
37 | String wpUrl = "http://en.wikipedia.org/wiki/" + title.replace(' ', '_');
38 | System.out.println(wpUrl + "\t" + id);
39 |
40 | page = FileLines.readBetween(reader, "", "");
41 | }
42 | }
43 |
44 | public static void printUsage() {
45 | System.out.println("Usage:");
46 | System.out.println("\tWikipediaDumpArticleIdExtractor ");
47 | }
48 | }
--------------------------------------------------------------------------------
/src/mpi/aida/util/WikipediaUtil.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.util;
2 |
3 | import java.util.regex.Pattern;
4 |
public class WikipediaUtil {

  // Total number of documents in the Wikipedia snapshot this was built for.
  public static final int TOTAL_DOCS = 2628265;

  /**
   * Returns ONLY text (minus headlines, links, etc.) for a Wikipedia article source
   *
   * NOTE(review): several regex literals in this method appear mangled —
   * angle-bracket markup (e.g. &lt;ref&gt;, &lt;gallery&gt;, &lt;table&gt;, &lt;!-- --&gt;) seems
   * to have been stripped from the source. "[" and ").*" are not even valid
   * regexes (PatternSyntaxException at runtime), and "(?s).*" would wipe the
   * entire text. The original patterns must be restored before this method
   * is usable; the broken lines are flagged individually below.
   *
   * @param text
   * @return
   */
  public static String cleanWikipediaArticle(String text) {
    // replace newlines
    text = text.replace('\n', ' ');

    // remove external links
    text = text.replaceAll("(\\[https?:.+)\\[\\[[^\\[\\]]+\\]\\]", "$1");
    text = text.replaceAll("\\[https?:[^\\[\\]]+\\]", " ");

    // remove references
    // NOTE(review): "[" is an invalid regex (unclosed character class) — the
    // original reference-tag pattern was lost; restore it.
    text = text.replaceAll("[", "");
    text = text.replaceAll("]", "");

    // remove galleries
    // NOTE(review): "(?s).*" matches the whole remaining text and would erase
    // everything — the gallery-tag delimiters were lost; restore them.
    text = text.replaceAll("(?s).*", "");

    // remove xml tags
    text = text.replaceAll("<[^/t! ][^>]+>", " ");
    text = text.replaceAll("[^t][^>]+>", " ");

    // remove tables
    // NOTE(review): ").*" is an invalid regex (unmatched ')') — the original
    // table pattern was lost; restore it.
    text = Pattern.compile(").*", Pattern.DOTALL).matcher(text).replaceAll("");

    // remove xml comments
    // NOTE(review): empty pattern — the comment delimiters were lost.
    text = Pattern.compile("", Pattern.DOTALL).matcher(text).replaceAll("");

    // remove all templates/macros
    text = text.replaceAll("'{2,}", "");
    text = text.replaceAll("\\[\\[[^\\[\\]]+:[^\\[\\]]+\\]\\]", "");

    // workaround for mal-formed tables
    text = Pattern.compile("\\{\\{Standard table\\|0\\}\\}.*\\{\\{close table\\}\\}", Pattern.DOTALL).matcher(text).replaceAll("");
    text = text.replaceAll("\\{\\{[sS]tart [bB]ox\\}\\}", "{|");
    text = text.replaceAll("\\{\\{[eE]nd [bB]ox\\}\\}", "|}");
    text = Pattern.compile("(?s)\\{\\|((?!\\|\\}).)*\n\\|\\}\n", Pattern.DOTALL).matcher(text).replaceAll("");

    // remove templates/infoboxes
    text = text.replaceAll("\\{\\{[[^\\{\\}]]+\\}\\}", " ");

    // workaround for some non-standard texts
    text = text.replaceAll("(?s)\\{\\|.*\n\\|\\}\u2020Denotes wild-card team \\(since 1995\\)\\.\n", "");
    text = Pattern.compile("^\\*{1,2}.*$", Pattern.MULTILINE).matcher(text).replaceAll("");
    text = Pattern.compile("^\\;.*$", Pattern.MULTILINE).matcher(text).replaceAll("");
    text = Pattern.compile("^:+.*$", Pattern.MULTILINE).matcher(text).replaceAll("");

    // remove [[ ... : ... ]]
    text = text.replaceAll("\\[\\[[^\\[\\]]+:[^\\[\\]]+\\]\\]", " ");

    // remove headlines
    text = text.replaceAll("={2,}.*?={2,}"," ");

    // replace links: keep the display text, drop the target/brackets
    text = text.replaceAll("\\[\\[[^\\]]+?\\|([^\\]\\n]+?)\\]\\]", "$1");
    text = text.replaceAll("\\[\\[([^\\]]+?)\\]\\]", "$1");

    // normalize whitespaces
    text = text.replaceAll("[\\s\\x00-\\x1F]+", " ");

    // normalize other characters
    // NOTE(review): replacing "<" with "<" is a no-op; the patterns were
    // presumably "&lt;"/"&gt;" entity replacements before markup stripping.
    text = text.replaceAll("<", "<").replaceAll(">", ">");

    return text;
  }
}
78 |
--------------------------------------------------------------------------------
/src/mpi/aida/util/YagoUtil.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.util;
2 |
3 | import gnu.trove.map.hash.TIntObjectHashMap;
4 | import gnu.trove.set.hash.TIntHashSet;
5 |
6 | import java.sql.SQLException;
7 | import java.util.Collection;
8 | import java.util.LinkedList;
9 | import java.util.List;
10 |
11 | import mpi.aida.access.DataAccess;
12 | import mpi.aida.data.Entities;
13 | import mpi.aida.data.Entity;
14 |
15 | import org.apache.commons.lang.StringUtils;
16 |
17 | import basics.Normalize;
18 |
19 | /**
20 | * This class contains some convenience wrappers for accessing YAGO data.
21 | * It has to use DataAccess and MUST NOT access the DB directly!
22 | *
23 | *
24 | */
25 | public class YagoUtil {
26 |
27 | public static final int TOTAL_YAGO_ENTITIES = 2651987;
28 |
29 | public enum Gender {
30 | FEMALE, MALE;
31 | }
32 |
33 | /**
34 | * Checks whether the given String is an entity in YAGO
35 | *
36 | * @param entity Entity to check.
37 | * @return true if the entity is in YAGO
38 | * @throws SQLException
39 | */
40 | public static boolean isYagoEntity(Entity entity) throws SQLException {
41 | return DataAccess.isYagoEntity(entity);
42 | }
43 |
44 | public static Entity getEntityForId(int id) {
45 | return new Entity(DataAccess.getYagoEntityIdForId(id), id);
46 | }
47 |
48 | public static Entities getEntitiesForIds(int[] ids) {
49 | TIntObjectHashMap yagoEntityIds =
50 | DataAccess.getYagoEntityIdsForIds(ids);
51 | Entities entities = new Entities();
52 | for (int i = 0; i < ids.length; ++i) {
53 | entities.add(new Entity(yagoEntityIds.get(ids[i]), ids[i]));
54 | }
55 | return entities;
56 | }
57 |
58 | public static Entity getEntityForYagoId(String id) {
59 | return new Entity(id, DataAccess.getIdForYagoEntityId(id));
60 | }
61 |
62 | public static Entities getEntitiesForYagoEntityIds(Collection names) {
63 | Entities entities = new Entities();
64 | for (String name : names) {
65 | entities.add(new Entity(name, DataAccess.getIdForYagoEntityId(name)));
66 | }
67 | return entities;
68 | }
69 |
70 | /**
71 | * Formats a given mention string properly to query a yago database.
72 | *
73 | * It will first transform the string into a YAGO string (with "" and
74 | * UTF-8 with backslash encoding), and then escape the string properly
75 | * for a Postgres query.
76 | *
77 | * @param mention Mention to format
78 | * @return Mention in YAGO2/Postgres format
79 | */
80 | public static String getYagoMentionStringPostgresEscaped(String mention) {
81 | return getPostgresEscapedString(Normalize.string(mention));
82 | }
83 |
84 | public static String getPostgresEscapedString(String input) {
85 | return input.replace("'", "''").replace("\\", "\\\\");
86 | }
87 |
88 | public static String getPostgresEscapedConcatenatedQuery(Collection entities) {
89 | List queryTerms = new LinkedList();
90 |
91 | for (String term : entities) {
92 | StringBuilder sb = new StringBuilder();
93 | sb.append("E'").append(YagoUtil.getPostgresEscapedString(term)).append("'");
94 | queryTerms.add(sb.toString());
95 | }
96 |
97 | return StringUtils.join(queryTerms, ",");
98 | }
99 |
100 | public static String getIdQuery(TIntHashSet ids) {
101 | int[] conv = ids.toArray();
102 | return getIdQuery(conv);
103 | }
104 |
105 | public static String getIdQuery(int[] ids) {
106 | StringBuilder sb = new StringBuilder();
107 | for (int i = 0; i < ids.length; ++i) {
108 | sb.append(ids[i]);
109 | if (i < ids.length - 1) {
110 | sb.append(",");
111 | }
112 | }
113 | return sb.toString();
114 | }
115 | }
--------------------------------------------------------------------------------
/src/mpi/aida/util/htmloutput/ResultMention.java:
--------------------------------------------------------------------------------
1 | package mpi.aida.util.htmloutput;
2 |
/**
 * One disambiguated mention produced by a particular data set run: its
 * position in the text, the surface string, the mapped entity, and the
 * confidence of the mapping.
 */
public class ResultMention {

  private final String dataSetId;

  private final int offset;

  private final int length;

  private final String mention;

  private final String entity;

  private final double confidence;

  private final boolean isYagoEntity;

  public ResultMention(String dataSetId, int offset, int length, String mention, String entity, double confidence, boolean isYagoEntity) {
    this.dataSetId = dataSetId;
    this.offset = offset;
    this.length = length;
    this.mention = mention;
    this.entity = entity;
    this.confidence = confidence;
    this.isYagoEntity = isYagoEntity;
  }

  public String getDataSetId() {
    return dataSetId;
  }

  public int getOffset() {
    return offset;
  }

  public int getLength() {
    return length;
  }

  public String getMention() {
    return mention;
  }

  public String getEntity() {
    return entity;
  }

  public double getConfidence() {
    return confidence;
  }

  public boolean isYagoEntity() {
    return isYagoEntity;
  }

  /** Tab-separated: offset, length, mention, entity. */
  public String toString() {
    StringBuilder sb = new StringBuilder();
    sb.append(offset).append('\t').append(length).append('\t');
    sb.append(mention).append('\t').append(entity);
    return sb.toString();
  }

}
62 |
--------------------------------------------------------------------------------
/src/mpi/experiment/measure/EvaluationMeasures.java:
--------------------------------------------------------------------------------
1 | package mpi.experiment.measure;
2 |
3 | import java.util.HashMap;
4 | import java.util.List;
5 | import java.util.Map;
6 |
7 |
8 | public class EvaluationMeasures {
9 | public static Map convertToAverageRanks(List> list) {
10 | Map rankedList = new HashMap