";
87 | }
88 |
89 | @Override
90 | public String toString() {
91 | return renderedTerms + "[" + conceptID + "]⟶" + conceptURI + " (" + score + ":" + textLabels + ")";
92 | }
93 | }
94 |
--------------------------------------------------------------------------------
/ConceptFinder/src/main/java/com/cyc/tool/conceptfinder/ConceptFinderConfig.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.conceptfinder;
2 |
3 | /*
4 | * #%L
5 | * ConceptFinder
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import com.cyc.tool.MapDBConfiguration;
24 |
25 | /**
26 | *
27 | * ConceptFinderConfig is designed to set paths for caching and data access for this package.
28 | */
29 | public class ConceptFinderConfig extends MapDBConfiguration {
30 |
31 | private static final String fallBackLocation = "/cyc/projects/kbTaxonomy/Experiments/ConceptFinder/";
32 | private static final String missingConceptDBFile = "/missingConcept";
33 |
34 | private static final String w2vDBFile = "/w2vdb";
35 | private static final String w2vVectorFile = "/cyc/projects/kbTaxonomy/Experiments/ConceptFinder/GoogleNews-vectors-negative300.bin.gz";
36 | private static final String word2VecVectorsMapName = "word2Vec";
37 |
38 | /**
39 | *
40 | * @return the missingConceptDBFile location
41 | */
42 | protected static String getMissingConceptDBFile() {
43 | return getMapDBBase(fallBackLocation) + missingConceptDBFile;
44 | }
45 |
46 | /**
47 | *
48 | * @return the w2vVectorFile
49 | */
50 | protected static String getW2VVectorfile() {
51 | return w2vVectorFile;
52 | }
53 |
54 | /**
55 | *
56 | * @return the w2vDBFile location
57 | */
58 | protected static String getW2vDBFile() {
59 | return getMapDBBase(fallBackLocation) + w2vDBFile;
60 | }
61 |
62 | /**
63 | *
64 | * @return the word2VecVectorsMapName
65 | */
66 | protected static String getWord2VecVectorsMapName() {
67 | return word2VecVectorsMapName;
68 | }
69 |
70 | }
71 |
--------------------------------------------------------------------------------
/ConceptFinder/src/main/java/com/cyc/tool/conceptfinder/ConceptMatch.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.conceptfinder;
2 |
3 | /*
4 | * #%L
5 | * ConceptFinder
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import com.cyc.tool.distributedrepresentations.Word2VecSpace;
24 | import java.io.Serializable;
25 | import java.util.function.Function;
26 |
27 | /**
28 | * A ConceptMatch relates a concept to a term.
29 | */
30 | public class ConceptMatch implements Serializable {
31 |
32 | final String concept;
33 |
34 | final double similarity;
35 | final String term;
36 |
37 | /**
38 | * ConceptMatch constructor
39 | *
40 | * @param w2v
41 | * @param search
42 | * @param term
43 | * @param noter
44 | */
45 | public ConceptMatch(Word2VecSpace w2v, float[] search, String term,
46 | Function noter) {
47 | this.term = term;
48 | if (noter == null) {
49 | this.concept = "---";
50 | } else {
51 | this.concept = noter.apply(term);
52 | }
53 | similarity = w2v.googleSimilarity(search, w2v.getVector(term));
54 | }
55 |
56 | /**
57 | * ConceptMatch constructor
58 | *
59 | * @param w2v
60 | * @param search
61 | * @param term
62 | */
63 | public ConceptMatch(Word2VecSpace w2v, float[] search, String term) {
64 | this(w2v, search, term, null);
65 | }
66 |
67 | /**
68 | *
69 | * @return the concept
70 | */
71 | public String getConcept() {
72 | return concept;
73 | }
74 |
75 | /**
76 | *
77 | * @return the similarity
78 | */
79 | public double getSimilarity() {
80 | return similarity;
81 | }
82 |
83 | /**
84 | *
85 | * @return the term
86 | */
87 | public String getTerm() {
88 | return term;
89 | }
90 |
91 | @Override
92 | public String toString() {
93 | return term + ": " + similarity + ": " + (concept == null ? "--" : concept);
94 | }
95 | }
96 |
--------------------------------------------------------------------------------
/ConceptFinder/src/main/java/com/cyc/tool/conceptfinder/ConceptSpace.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.conceptfinder;
2 |
3 | /*
4 | * #%L
5 | * ConceptFinder
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import com.cyc.tool.distributedrepresentations.Word2VecSpace;
24 | import com.cyc.tool.distributedrepresentations.Word2VecSpace.NoWordToVecVectorForTerm;
25 | import com.cyc.tool.distributedrepresentations.Word2VecSubspace;
26 | import com.cyc.tool.owltools.OpenCycOwl;
27 | import java.io.IOException;
28 | import java.util.Comparator;
29 | import java.util.List;
30 | import java.util.function.Function;
31 | import java.util.function.Predicate;
32 | import java.util.stream.Collectors;
33 |
34 | /**
35 | *
36 | * ConceptSpace provides access to a Word2VecSpace and methods for finding ConceptMatches.
37 | */
38 | public class ConceptSpace {
39 |
40 | Word2VecSpace w2vSpace;
41 |
42 | /**
43 | * Creates a new instance of ConceptSpace.
44 | *
45 | * @param w2v
46 | * @throws java.io.IOException
47 | */
48 | public ConceptSpace(Word2VecSpace w2v) throws IOException {
49 | w2vSpace = w2v;
50 | }
51 |
52 | /**
53 | *
54 | * @param terms
55 | * @param n
56 | * @return a List of ConceptMatches
57 | * @throws NoWordToVecVectorForTerm
58 | */
59 | public List findNearestNFor(List terms, Integer n) throws NoWordToVecVectorForTerm {
60 | return findNearest(w2vSpace.getMaximalNormedVector(terms))
61 | .stream()
62 | .collect(Collectors.toList())
63 | .subList(0, n);
64 | }
65 |
66 | /**
67 | *
68 | * @param terms
69 | * @param n
70 | * @return a List of ConceptMatches
71 | * @throws NoWordToVecVectorForTerm
72 | */
73 | public List findNearestNFor(String terms, Integer n) throws NoWordToVecVectorForTerm {
74 | return findNearestNFor(w2vSpace.stringToList(terms), n);
75 |
76 | }
77 |
78 | /**
79 | *
80 | * @param terms
81 | * @param n
82 | * @param ocyc
83 | * @return a List of ConceptMatches
84 | * @throws NoWordToVecVectorForTerm
85 | */
86 | public List findNearestNForIn(List terms, Integer n, OpenCycOwl ocyc) throws NoWordToVecVectorForTerm {
87 | float[] norm = w2vSpace.getMaximalNormedVector(terms);
88 | return findNearestWhere(norm, m -> ocyc.knownTerm(m), t -> String.join(" | ", ocyc.conceptsFor(t)))
89 | .stream()
90 | .collect(Collectors.toList())
91 | .subList(0, n);
92 | }
93 |
94 | /**
95 | *
96 | * @param terms
97 | * @param n
98 | * @param ocyc
99 | * @return a List of ConceptMatches
100 | * @throws NoWordToVecVectorForTerm
101 | */
102 | public List findNearestNForIn(String terms, Integer n, OpenCycOwl ocyc) throws NoWordToVecVectorForTerm {
103 |
104 | return findNearestNForIn(w2vSpace.stringToList(terms), n, ocyc);
105 |
106 | }
107 |
108 | /**
109 | *
110 | * @param terms
111 | * @param n
112 | * @param ocyc
113 | * @return a List of ConceptMatches
114 | * @throws NoWordToVecVectorForTerm
115 | */
116 | public List findNearestNForInStrictW2V(List terms, Integer n, OpenCycOwl ocyc) throws NoWordToVecVectorForTerm {
117 | float[] norm = w2vSpace.getGoogleNormedVector(terms);
118 | return findNearestWhere(norm, m -> ocyc.knownTerm(m), t -> String.join(" | ", ocyc.conceptsFor(t)))
119 | .stream()
120 | .collect(Collectors.toList())
121 | .subList(0, n);
122 | }
123 |
124 | /**
125 | *
126 | * @param terms
127 | * @param n
128 | * @param ocyc
129 | * @return a List of ConceptMatches
130 | * @throws NoWordToVecVectorForTerm
131 | */
132 | public List findNearestNForInStrictW2V(String terms, Integer n, OpenCycOwl ocyc) throws NoWordToVecVectorForTerm {
133 | float[] norm = w2vSpace.getGoogleNormedVector(w2vSpace.stringToList(terms));
134 | return findNearestWhere(norm, m -> ocyc.knownTerm(m), t -> String.join(" | ", ocyc.conceptsFor(t)))
135 | .stream()
136 | .collect(Collectors.toList())
137 | .subList(0, n);
138 | }
139 |
140 | /**
141 | * Find the position of terms in the larger space from which this is derived a larger space, and
142 | * then search around them in a this space that spans fewer terms, but is otherwise the same
143 | *
144 | * Will fail if the space for this concept space is not a SubSpace
145 | *
146 | * @param terms The string containing a set of terms to search around
147 | * @param n How many things to find in this space
148 | * @param note
149 | * @return a List of ConceptMatches
150 | * @throws com.cyc.tool.distributedrepresentations.Word2VecSpace.NoWordToVecVectorForTerm
151 | */
152 | public List findNearestNForPosition(String terms, Integer n, Function note) throws NoWordToVecVectorForTerm {
153 | return findNearestNForPosition(w2vSpace.stringToList(terms),
154 | n, note);
155 | }
156 |
157 | /**
158 | * Find the position of terms in the larger space from which this is derived a larger space, and
159 | * then search around them in a this space that spans fewer terms, but is otherwise the same
160 | *
161 | * Will fail if the space for this concept space is not a SubSpace
162 | *
163 | * @param terms The string containing a set of terms to search around
164 | * @param n How many things to find in this space
165 | * @param note
166 | * @return a List of ConceptMatches
167 | * @throws com.cyc.tool.distributedrepresentations.Word2VecSpace.NoWordToVecVectorForTerm
168 | */
169 | public List findNearestNForPosition(List terms, Integer n, Function note) throws NoWordToVecVectorForTerm {
170 | Word2VecSpace posSpace = ((Word2VecSubspace) w2vSpace).getSuperSpace();
171 | return findNearestNForPosition(terms,
172 | posSpace, n, note);
173 | }
174 |
175 | /**
176 | * Find the position of terms in a larger space, and then search around them in a space that spans
177 | * fewer terms, but is otherwise the same
178 | *
179 | * @param terms The string containing a set of terms to search around
180 | * @param posSpace The other larger space in which to search for those terms.
181 | * @param n How many things to find in this space
182 | * @param note
183 | * @return a List of ConceptMatches
184 | * @throws com.cyc.tool.distributedrepresentations.Word2VecSpace.NoWordToVecVectorForTerm
185 | */
186 | public List findNearestNForPosition(String terms, Word2VecSpace posSpace, Integer n, Function note) throws NoWordToVecVectorForTerm {
187 | return findNearestNForPosition(w2vSpace.stringToList(terms),
188 | posSpace, n, note);
189 |
190 | }
191 |
192 | /**
193 | * Find the position of terms in a larger space, and then search around them in a space that spans
194 | * fewer terms, but is otherwise the same
195 | *
196 | * @param terms The list of terms to search around
197 | * @param posSpace The other larger space in which to search for those terms.
198 | * @param n How many things to find in this space
199 | * @param note
200 | * @return a List of ConceptMatches
201 | * @throws com.cyc.tool.distributedrepresentations.Word2VecSpace.NoWordToVecVectorForTerm
202 | */
203 | public List findNearestNForPosition(List terms, Word2VecSpace posSpace, Integer n, Function note) throws NoWordToVecVectorForTerm {
204 | return findNearest(posSpace.getMaximalNormedVector(terms), note)
205 | .stream()
206 | .collect(Collectors.toList())
207 | .subList(0, n);
208 | }
209 |
210 | /**
211 | *
212 | * @param terms
213 | * @param n
214 | * @return a List of ConceptMatches
215 | * @throws NoWordToVecVectorForTerm
216 | */
217 | public List findNearestNForStrictW2V(List terms, Integer n) throws NoWordToVecVectorForTerm {
218 | return findNearest(w2vSpace.getGoogleNormedVector(terms))
219 | .stream()
220 | .collect(Collectors.toList())
221 | .subList(0, n);
222 | }
223 |
224 | /**
225 | *
226 | * @param terms
227 | * @param n
228 | * @return a List of ConceptMatches
229 | * @throws NoWordToVecVectorForTerm
230 | */
231 | public List findNearestNForWithInputTermFiltering(List terms, Integer n) throws NoWordToVecVectorForTerm {
232 | return findNearest(w2vSpace.getMaximalNormedVector(terms))
233 | .stream()
234 | .filter(m -> !terms.contains(m.getTerm())) // the google code removes any search term
235 | .collect(Collectors.toList())
236 | .subList(0, n);
237 | }
238 |
239 | /**
240 | *
241 | * @param terms
242 | * @param n
243 | * @return a List of ConceptMatches
244 | * @throws NoWordToVecVectorForTerm
245 | */
246 | public List findNearestNForWithInputTermFilteringStrictW2V(List terms, Integer n) throws NoWordToVecVectorForTerm {
247 | return findNearest(w2vSpace.getGoogleNormedVector(terms))
248 | .stream()
249 | .filter(m -> !terms.contains(m.getTerm())) // the google code removes any search term
250 | .collect(Collectors.toList())
251 | .subList(0, n);
252 | }
253 |
254 | /**
255 | *
256 | * @return the w2vSpace
257 | */
258 | public Word2VecSpace getW2VSpace() {
259 | return w2vSpace;
260 | }
261 |
262 | private List findNearest(float[] searchVector, Function note) {
263 | Comparator compareDouble
264 | = (Double m1, Double m2) -> Double.compare(m2, m1);
265 |
266 | Comparator compareMatches
267 | = (ConceptMatch m1, ConceptMatch m2) -> Double.compare(m2.getSimilarity(), m1.getSimilarity());
268 |
269 | // This is a massive sort (3m elements) so it might be better to optimise
270 | // for top N
271 | return w2vSpace.getVectors().keySet().stream()
272 | .map(s -> new ConceptMatch(w2vSpace, searchVector, s, note))
273 | .sorted(compareMatches).collect(Collectors.toList());
274 | }
275 |
276 | private List findNearest(float[] searchVector) {
277 | return findNearest(searchVector, null);
278 | }
279 |
280 | private List findNearestWhere(float[] searchVector, Predicate pred, Function note) {
281 | Comparator compareMatches
282 | = (ConceptMatch m1, ConceptMatch m2) -> Double.compare(m2.getSimilarity(), m1.getSimilarity());
283 | // This is a massive sort (3m elements) so it might be better to optimise
284 | // for top N
285 | return w2vSpace.getVectors().keySet().parallelStream()
286 | .filter(pred)
287 | .map(s -> new ConceptMatch(w2vSpace, searchVector, s, note))
288 | .sorted(compareMatches).collect(Collectors.toList());
289 | }
290 |
291 | }
292 |
--------------------------------------------------------------------------------
/ConceptFinder/src/main/java/com/cyc/tool/conceptfinder/DefaultConceptFinderConfig.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.conceptfinder;
2 |
3 | /*
4 | * #%L
5 | * ConceptFinder
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | /**
24 | * Default configuration for ConceptFinder.
25 | */
26 | public class DefaultConceptFinderConfig extends ConceptFinderConfig {
27 |
28 | private static final String conceptsForMissingTermsNameDefault = "missingTermConceptsDefault";
29 | private static final String missingTermMapNameDefault = "missingTermsDefault";
30 |
31 | /**
32 | *
33 | * @return the conceptsForMissingTermsNameDefault
34 | */
35 | protected static String getConceptsForMissingTermsName() {
36 | return conceptsForMissingTermsNameDefault;
37 | }
38 |
39 | /**
40 | *
41 | * @return the missingTermMapNameDefault
42 | */
43 | protected static String getMissingTermMapName() {
44 | return missingTermMapNameDefault;
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/ConceptFinder/src/main/java/com/cyc/tool/conceptfinder/MissingConceptFinder.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.conceptfinder;
2 |
3 | /*
4 | * #%L
5 | * ConceptFinder
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import com.cyc.tool.distributedrepresentations.Word2VecSpace;
24 | import com.cyc.tool.owltools.OpenCycOwl;
25 | import java.io.File;
26 | import java.io.IOException;
27 | import java.util.ArrayList;
28 | import java.util.Arrays;
29 | import java.util.HashMap;
30 | import java.util.HashSet;
31 | import java.util.List;
32 | import java.util.Map;
33 | import java.util.Set;
34 | import java.util.concurrent.ConcurrentNavigableMap;
35 | import java.util.function.Predicate;
36 | import java.util.stream.Collectors;
37 | import java.util.stream.IntStream;
38 | import java.util.stream.Stream;
39 | import org.mapdb.DB;
40 | import org.mapdb.DBMaker;
41 | import org.semanticweb.owlapi.model.OWLOntologyCreationException;
42 |
43 | /**
44 | * Methods for finding missing concepts with a ConceptSpace, a Word2VecSpace, and OpenCyc.
45 | */
46 | abstract public class MissingConceptFinder {
47 |
48 | final private ConceptSpace cSpace;
49 | final private OpenCycOwl ocyc;
50 | private final Word2VecSpace w2vs;
51 | ConcurrentNavigableMap> conceptsForMissingTerms;
52 | DB db;
53 | List missingConceptNames;
54 | List missingMappingNames;
55 | ConcurrentNavigableMap missingTerms;
56 |
57 | /**
58 | * MissingConceptFinder constructor.
59 | *
60 | * @param w2v
61 | * @param oco
62 | * @throws IOException
63 | * @throws OWLOntologyCreationException
64 | */
65 | public MissingConceptFinder(Word2VecSpace w2v, OpenCycOwl oco) throws IOException, OWLOntologyCreationException {
66 | this(w2v, oco, null);
67 | }
68 |
69 | /**
70 | * MissingConceptFinder constructor.
71 | *
72 | * @param w2v
73 | * @param oco
74 | * @param cSpace
75 | * @throws IOException
76 | * @throws OWLOntologyCreationException
77 | */
78 | public MissingConceptFinder(Word2VecSpace w2v, OpenCycOwl oco, ConceptSpace cSpace) throws IOException, OWLOntologyCreationException {
79 | w2vs = w2v;
80 | ocyc = oco;
81 | this.cSpace = cSpace;
82 | db = DBMaker.newFileDB(new File(ConceptFinderConfig.getMissingConceptDBFile()))
83 | .closeOnJvmShutdown()
84 | // .encryptionEnable("password")
85 | .make();
86 |
87 | //Use this to reset
88 | // missingTerms.clear(); db.commit();
89 | }
90 |
91 | /**
92 | *
93 | * @return a List of Strings
94 | */
95 | public List conceptsWithTerms() {
96 | return this.getConceptsForMissingTerms().keySet().stream()
97 | .map(i -> Arrays.asList(getMissingTerms().get(i))
98 | .stream()
99 | .collect(Collectors.joining("|")))
100 | .collect(Collectors.toList());
101 | }
102 |
103 | /**
104 | * @return the conceptsForMissingTerms
105 | */
106 | public ConcurrentNavigableMap> getConceptsForMissingTerms() {
107 | return conceptsForMissingTerms;
108 | }
109 |
110 | /**
111 | * @param conceptsForMissingTerms the conceptsForMissingTerms to set
112 | */
113 | public void setConceptsForMissingTerms(ConcurrentNavigableMap> conceptsForMissingTerms) {
114 | this.conceptsForMissingTerms = conceptsForMissingTerms;
115 | }
116 |
117 | /**
118 | * @return the db
119 | */
120 | public DB getDb() {
121 | return db;
122 | }
123 |
124 | /**
125 | * @return the missingConceptNames
126 | */
127 | public List getMissingConceptNames() {
128 | return missingConceptNames;
129 | }
130 |
131 | /**
132 | * @param missingConceptNames the missingConceptNames to set
133 | */
134 | public void setMissingConceptNames(List missingConceptNames) {
135 | this.missingConceptNames = missingConceptNames;
136 | }
137 |
138 | /**
139 | * @return the missingMappingNames
140 | */
141 | public List getMissingMappingNames() {
142 | return missingMappingNames;
143 | }
144 |
145 | /**
146 | * @param missingMappingNames the missingMappingNames to set
147 | */
148 | public void setMissingMappingNames(List missingMappingNames) {
149 | this.missingMappingNames = missingMappingNames;
150 | }
151 |
152 | /**
153 | *
154 | * @return the missingTerms
155 | */
156 | public ConcurrentNavigableMap getMissingTerms() {
157 | return missingTerms;
158 | }
159 |
160 | /**
161 | * @param missingTerms the missingTerms to set
162 | */
163 | public void setMissingTerms(ConcurrentNavigableMap missingTerms) {
164 | this.missingTerms = missingTerms;
165 | }
166 |
167 | /**
168 | *
169 | * @return the number of missing concepts
170 | */
171 | public int missingConceptCount() {
172 | return getMissingConceptNames().size();
173 | }
174 |
175 | /**
176 | *
177 | * @param testCase
178 | * @return a Set of AttachmentHypotheses
179 | */
180 | protected Set findNearbyTermsWithGraphCore(String testCase) {
181 | return findNearbyTermsWithGraphCore(testCase, -1);
182 | }
183 |
184 | /**
185 | *
186 | * @param termStrings
187 | * @param n
188 | * @return a Set of AttachmentHypotheses
189 | */
190 | protected Set
191 | findNearbyTermsWithGraphCore(List termStrings, int n) {
192 | long t1 = System.currentTimeMillis();
193 | Set hypotheses = new HashSet<>();
194 |
195 | Set allTypes = new HashSet<>();
196 | Map typeWeights = new HashMap<>();
197 |
198 | Map conceptEvidence = new HashMap<>();
199 | System.out.print("====" + String.join("/", termStrings) + "====" + (n < 0 ? "" : " " + n) + " \t");
200 | List matches = new ArrayList<>();
201 | for (String term : termStrings) {
202 | try {
203 | matches.addAll(cSpace.findNearestNForIn(term, 40, ocyc));
204 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
205 | }
206 | }
207 | if (matches.size() == 0) {
208 | // assertEquals("common_eiders", matches.get(10).term);
209 | System.out.println("Terms [" + termStrings + "] have no words in Word2Vec");
210 | return hypotheses; // which is empty at this point
211 | // fail("took unexpected exception:" + ex);
212 | }
213 | IntStream.range(0, matches.size())
214 | .forEach(i -> {
215 | ConceptMatch m = matches.get(i);
216 | //System.out.println(i + " " + m.toString());
217 | if (m.concept != null) {
218 | allTypes.add(m.concept);
219 | typeWeights.put(m.concept,
220 | (typeWeights.containsKey(m.concept) ? typeWeights.get(m.concept) : 0.0d)
221 | + m.similarity);
222 | }
223 | });
224 | allTypes.forEach(s -> {
225 | Double weight = typeWeights.get(s);
226 | Set transTypes = ocyc.getTypesTransitiveURL(s);
227 | Set immedTypes = ocyc.getTypesURL(s);
228 |
229 | Set ret
230 | = Stream.concat(
231 | transTypes
232 | .stream()
233 | .filter(type -> allTypes.contains(type)),
234 | immedTypes.stream()
235 | ).collect(Collectors.toSet());
236 |
237 | if (!ret.isEmpty()) {
238 | ret.forEach(t -> {
239 | if (!conceptEvidence.containsKey(t)) {
240 | conceptEvidence.put(t, weight);
241 | } else {
242 | conceptEvidence.put(t, conceptEvidence.get(t) + weight);
243 | }
244 | });
245 |
246 | }
247 | });
248 |
249 | final double max = conceptEvidence.entrySet().stream()
250 | .mapToDouble(e -> e.getValue()).max().orElse(0);
251 |
252 | Set maxc = conceptEvidence.entrySet().stream()
253 | .filter(e -> e.getValue() == max)
254 | .map(e -> e.getKey()).collect(Collectors.toSet());
255 | System.out.println("Maximum parent count:" + max);
256 | System.out.println("Maximal parents:"
257 | + maxc.stream().map(s -> ocyc.labelsForConcept(s) + ": " + s)
258 | .collect(Collectors.joining("\n\t")));
259 | maxc.forEach(c -> hypotheses.add(new AttachmentHypothesis(n, termStrings,
260 | c, max, ocyc.labelsForConcept(c))));
261 | System.out.println("-----" + (System.currentTimeMillis() - t1) + "ms -----");
262 | return hypotheses; // Since we take the max of a double, there should be only one
263 | }
264 |
265 | /**
266 | *
267 | * @param testCase
268 | * @param n
269 | * @return a Set of AttachmentHypotheses
270 | * @deprecated
271 | */
272 | @Deprecated
273 | protected Set findNearbyTermsWithGraphCore(String testCase, int n) {
274 | List termStrings = new ArrayList<>();
275 | termStrings.add(testCase);
276 | return findNearbyTermsWithGraphCore(termStrings, n);
277 |
278 | }
279 |
280 | /**
281 | *
282 | * @return a List of names in the W2V space
283 | * @deprecated
284 | */
285 | @Deprecated //Depends on a variable that is only set in an initialisation phase
286 | protected List namesInW2V() {
287 | if (getMissingMappingNames() == null) {
288 | return null;
289 | }
290 | return getMissingMappingNames().stream()
291 | .filter(hasElementInW2V())
292 | .map(a -> a[0])
293 | .collect(Collectors.toList());
294 | }
295 |
296 | Predicate hasElementInW2V() {
297 | return a -> Arrays.stream(a)
298 | .anyMatch(w2vs::knownTerm);
299 | }
300 |
301 | }
302 |
--------------------------------------------------------------------------------
/ConceptFinder/src/main/java/com/cyc/tool/conceptfinder/MissingConceptFinderDefault.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.conceptfinder;
2 |
3 | /*
4 | * #%L
5 | * ConceptFinder
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import com.cyc.tool.distributedrepresentations.Word2VecSpace;
24 | import com.cyc.tool.owltools.OpenCycOwl;
25 | import java.io.IOException;
26 | import java.util.ArrayList;
27 | import java.util.Arrays;
28 | import java.util.List;
29 | import java.util.stream.Collectors;
30 | import java.util.stream.IntStream;
31 | import org.semanticweb.owlapi.model.OWLOntologyCreationException;
32 |
33 | /**
34 | * The default implementation for MissingConceptFinder.
35 | */
36 | public class MissingConceptFinderDefault extends MissingConceptFinder {
37 |
38 | static final boolean reset = true;
39 | String[][] conceptStrings = {{"Facebook", "the Facebook"},
40 | {"telephone microphone"},
41 | {"telephone speaker"},
42 | {"backhoe"},
43 | {"facial scar", "scar on face"},
44 | {"blue eyes"},
45 | {"saluting the flag"},
46 | {"muddy paws"},
47 | {"strong muscles"},
48 | {"pan balance"},
49 | {"graduated cylinder"},
50 | {"tape measure"},
51 | {"hand lens"},
52 | {"measuring cup"}
53 | };
54 | List conceptsToLookFor = Arrays.asList(conceptStrings);
55 |
56 | /**
57 | * MissingConceptFinderDefault constructor
58 | *
59 | * @param w2v
60 | * @param oco
61 | * @throws IOException
62 | * @throws OWLOntologyCreationException
63 | */
64 | public MissingConceptFinderDefault(Word2VecSpace w2v, OpenCycOwl oco) throws IOException, OWLOntologyCreationException {
65 | this(w2v, oco, null);
66 | }
67 |
68 | /**
69 | * MissingConceptFinderDefault constructor
70 | *
71 | * @param w2v
72 | * @param oco
73 | * @param cs
74 | * @throws IOException
75 | * @throws OWLOntologyCreationException
76 | */
77 | public MissingConceptFinderDefault(Word2VecSpace w2v, OpenCycOwl oco, ConceptSpace cs) throws IOException, OWLOntologyCreationException {
78 | super(w2v, oco, cs);
79 | missingTerms = db.getTreeMap(DefaultConceptFinderConfig.getMissingTermMapName());
80 | conceptsForMissingTerms = db.getTreeMap(DefaultConceptFinderConfig.getConceptsForMissingTermsName());
81 | if (reset) {
82 | missingTerms.clear();
83 | }
84 | if (missingTerms.isEmpty()) {
85 | conceptsForMissingTerms.clear();
86 | OpenCycOwl oc = new OpenCycOwl();
87 |
88 | missingMappingNames = conceptsToLookFor;
89 | missingConceptNames = missingMappingNames.stream()
90 | .filter(oc.noConcept())
91 | .collect(Collectors.toList());
92 | IntStream.range(0, missingConceptNames.size())
93 | .forEach(i -> missingTerms.put(i, missingConceptNames.get(i)));
94 | db.commit();
95 | db.compact();
96 | oc.close();
97 |
98 | } else {
99 | missingConceptNames = new ArrayList<>();
100 | missingTerms.keySet().forEach(k -> missingConceptNames.add(missingTerms.get(k)));
101 | }
102 | }
103 |
104 | }
105 |
--------------------------------------------------------------------------------
/ConceptFinder/src/test/java/com/cyc/tool/conceptfinder/ConceptSpaceIT.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.conceptfinder;
2 |
3 | /*
4 | * #%L
5 | * ConceptFinder
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import com.cyc.tool.distributedrepresentations.GoogleNewsW2VSpace;
24 | import com.cyc.tool.distributedrepresentations.Word2VecSpace;
25 | import java.io.IOException;
26 | import java.util.Arrays;
27 | import java.util.List;
28 | import java.util.stream.IntStream;
29 | import org.junit.After;
30 | import org.junit.AfterClass;
31 | import static org.junit.Assert.assertEquals;
32 | import static org.junit.Assert.fail;
33 | import org.junit.Before;
34 | import org.junit.BeforeClass;
35 | import org.junit.Test;
36 | import org.semanticweb.owlapi.model.OWLOntologyCreationException;
37 |
38 | /**
39 | * ConceptSpace tests.
40 | */
41 | public class ConceptSpaceIT {
42 |
43 | static List cr = Arrays.asList("Chinese", "river");
44 | static ConceptSpace mySpace;
45 |
46 | public ConceptSpaceIT() {
47 | }
48 |
49 | @BeforeClass
50 |
51 | public static void setUpClass() throws IOException, OWLOntologyCreationException {
52 | mySpace = new ConceptSpace(GoogleNewsW2VSpace.get());
53 |
54 | }
55 |
56 | @AfterClass
57 |
58 | public static void tearDownClass() {
59 | mySpace = null;
60 | }
61 |
62 | @Test
63 | public void findNearbyTerms1() {
64 | try {
65 | long t1 = System.currentTimeMillis();
66 | List matches = mySpace.findNearestNForWithInputTermFiltering(cr, 40);
67 | IntStream.range(0, matches.size())
68 | .forEach(i -> {
69 | System.out.println(i + " " + matches.get(i).toString());
70 | });
71 | System.out.println("Took " + (System.currentTimeMillis() - t1) + "ms");
72 | assertEquals(matches.get(0).getTerm(), "Yangtze_River");
73 | assertEquals(0.6047259562339493, matches.get(5).getSimilarity(), 0.000001);
74 |
75 | assertEquals(matches.get(23).getTerm(), "rivers");
76 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
77 | fail("took unexpected exception:" + ex);
78 | }
79 | }
80 |
81 | @Test
82 | public void findNearbyTerms2() {
83 | try {
84 | long t1 = System.currentTimeMillis();
85 | List matches = mySpace.findNearestNForWithInputTermFiltering(Arrays.asList("gangplank"), 40);
86 | IntStream.range(0, matches.size())
87 | .forEach(i -> {
88 | System.out.println(i + " " + matches.get(i).toString());
89 | });
90 | System.out.println("Took " + (System.currentTimeMillis() - t1) + "ms");
91 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
92 | fail("took unexpected exception:" + ex);
93 | }
94 | }
95 |
96 | @Before
97 | public void setUp() {
98 | }
99 |
100 | @After
101 | public void tearDown() {
102 | }
103 |
104 | }
105 |
--------------------------------------------------------------------------------
/ConceptFinder/src/test/java/com/cyc/tool/conceptfinder/MissingConceptFinderIT.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.conceptfinder;
2 |
3 | /*
4 | * #%L
5 | * ConceptFinder
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import com.cyc.tool.distributedrepresentations.GoogleNewsW2VSpace;
24 | import com.cyc.tool.distributedrepresentations.Word2VecSpace;
25 | import com.cyc.tool.owltools.OpenCycOwl;
26 | import java.io.IOException;
27 | import java.util.Arrays;
28 | import java.util.HashSet;
29 | import java.util.List;
30 | import java.util.Set;
31 | import java.util.stream.Collectors;
32 | import java.util.stream.IntStream;
33 | import org.junit.AfterClass;
34 | import static org.junit.Assert.assertEquals;
35 | import static org.junit.Assert.assertTrue;
36 | import static org.junit.Assert.fail;
37 | import org.junit.BeforeClass;
38 | import org.junit.Test;
39 | import org.semanticweb.owlapi.model.OWLOntologyCreationException;
40 |
41 | /**
42 | * MissingConceptFinder tests.
43 | */
44 | public class MissingConceptFinderIT {
45 |
46 | static ConceptSpace cSpace;
47 | static List cr = Arrays.asList("Chinese", "river");
48 | static MissingConceptFinder mcf;
49 | static Word2VecSpace mySpace;
50 | static OpenCycOwl ocyc;
51 | static List pelagicBird = Arrays.asList("pelagic", "bird");
52 |
53 | public MissingConceptFinderIT() {
54 | }
55 |
56 | @BeforeClass
57 | public static void setUpClass() throws IOException, OWLOntologyCreationException {
58 | mySpace = GoogleNewsW2VSpace.get();
59 | cSpace = new ConceptSpace(mySpace);
60 | ocyc = new OpenCycOwl();
61 | mcf = new MissingConceptFinderDefault(mySpace, ocyc, cSpace);
62 | }
63 |
64 | @AfterClass
65 | public static void tearDownClass() {
66 | mySpace = null;
67 | ocyc.close();
68 | }
69 | private static String set2String(Set s) {
70 | if (s.size()>10) return "";
71 | return s.stream()
72 | .map(i->{return String.join(",", mcf.getMissingTerms().get(i));})
73 | .collect(Collectors.joining(";"));
74 |
75 | }
76 |
77 | @Test
78 | public void conceptsWithTermsTest() {
79 | List res = mcf.conceptsWithTerms();
80 | System.out.println("There are " + res.size() + " missing concepts with associated KB terms: " + res);
81 | assertTrue(res.size() + "elements expected none", res.size() == 0);
82 | // assertTrue(res.containsAll(Arrays.asList("start", "rust", "blueberry")));
83 | }
84 |
85 | @Test
86 | public void findNearbyTerms1() {
87 | long t1 = System.currentTimeMillis();
88 | System.out.println("FNT1");
89 | List matches;
90 | try {
91 | matches = cSpace.findNearestNForIn(cr, 40, ocyc);
92 | IntStream.range(0, matches.size())
93 | .forEach(i -> {
94 | System.out.println(i + " " + matches.get(i).toString());
95 | });
96 | System.out.println("Took " + (System.currentTimeMillis() - t1) + "ms");
97 | assertEquals("Chinese", matches.get(0).term);
98 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
99 | fail("took unexpected exception:" + ex);
100 | }
101 | }
102 |
103 | @Test
104 | public void findNearbyTerms2() {
105 | try {
106 | long t1 = System.currentTimeMillis();
107 | System.out.println("FNT2");
108 | List matches = cSpace.findNearestNForIn(cr, 40, ocyc);
109 | IntStream.range(0, matches.size())
110 | .forEach(i -> {
111 | System.out.println(i + " " + matches.get(i).toString());
112 | });
113 | System.out.println("Took " + (System.currentTimeMillis() - t1) + "ms");
114 |
115 | assertEquals(0.5539201713461387, matches.get(13).similarity, 0.000001);
116 |
117 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
118 | fail("took unexpected exception:" + ex);
119 |
120 | }
121 |
122 | }
123 |
124 | @Test
125 | public void findNearbyTerms3() {
126 | try {
127 | long t1 = System.currentTimeMillis();
128 | System.out.println("FNT3");
129 | List matches = cSpace.findNearestNForIn(cr, 40, ocyc);
130 | IntStream.range(0, matches.size())
131 | .forEach(i -> {
132 | System.out.println(i + " " + matches.get(i).toString());
133 | });
134 | System.out.println("Took " + (System.currentTimeMillis() - t1) + "ms");
135 |
136 | assertEquals("creek", matches.get(7).term);
137 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
138 | fail("took unexpected exception:" + ex);
139 | }
140 | }
141 |
142 | @Test
143 | public void findNearbyTerms4() {
144 | try {
145 | long t1 = System.currentTimeMillis();
146 | System.out.println("FNT4");
147 | List matches = cSpace.findNearestNForIn(cr, 40, ocyc);
148 | IntStream.range(0, matches.size())
149 | .forEach(i -> {
150 | System.out.println(i + " " + matches.get(i).toString());
151 | });
152 | System.out.println("Took " + (System.currentTimeMillis() - t1) + "ms");
153 |
154 | assertEquals("riverbank", matches.get(12).term);
155 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
156 | fail("took unexpected exception:" + ex);
157 | }
158 | }
159 |
160 | @Test
161 | public void findNearbyTermsWithGraphListTest() {
162 | System.out.println("FNT WG 3");
163 | IntStream.rangeClosed(3, 6)
164 | .forEach(ti -> {
165 | Arrays.asList(mcf.getMissingTerms().get(ti))
166 | .forEach((String ss) -> {
167 | mcf.findNearbyTermsWithGraphCore(ss, ti);
168 | });
169 | });
170 | assertTrue(true);
171 | }
172 |
173 | @Test
174 | public void findNearbyTermsWithGraphTest1() {
175 | System.out.println("FNT WG 1");
176 | mcf.findNearbyTermsWithGraphCore("pelagic bird");
177 | assertTrue(true);
178 | }
179 |
180 | @Test
181 | public void findNearbyTermsWithGraphTest2(){
182 | System.out.println("FNT WG 2");
183 | mcf.findNearbyTermsWithGraphCore("tobacco shop");
184 | assertTrue(true);
185 | }
186 |
187 | @Test
188 | public void findNearbyTermsWithGraphTest3() {
189 | System.out.println("FNT WG 3");
190 | mcf.findNearbyTermsWithGraphCore("pelagic bird");
191 | mcf.findNearbyTermsWithGraphCore("tobacco shop");
192 | mcf.findNearbyTermsWithGraphCore("net melon");
193 | mcf.findNearbyTermsWithGraphCore("glowworm");
194 | mcf.findNearbyTermsWithGraphCore("tightrope walking");
195 | mcf.findNearbyTermsWithGraphCore("Adelie penguin");
196 | assertTrue(true);
197 | }
198 |
199 | @Test
200 | public void findNearbyTermsWithGraphTest4() {
201 | System.out.println("FNT WG 4");
202 |
203 | Set hyp = mcf.findNearbyTermsWithGraphCore("Adelie penguin");
204 | System.out.println("HYP" + hyp);
205 | assertEquals(1, hyp.size());
206 | }
207 |
208 | @Test
209 | public void findSomeMissingTerms1() {
210 | IntStream.rangeClosed(0, 3)
211 | .forEach(ti -> {
212 | Arrays.asList(mcf.getMissingTerms().get(ti))
213 | .forEach((String ss) -> {
214 | lookItUpWithOcyc(ss);
215 | });
216 | });
217 | assertTrue(true);
218 | }
219 |
220 | @Test
221 | public void findSomeMissingTerms2() {
222 | IntStream.of(1, 5, 7)
223 | //See https://docs.google.com/a/cyc.com/document/d/1Lwi21-yxcC0DGKJMcc4GFN3M_DBzEDAcSNjYufCRIfE/edit
224 | .forEach(ti -> {
225 | Arrays.asList(mcf.getMissingTerms().get(ti))
226 | .forEach((String ss) -> {
227 | lookItUpWithOcyc(ss);
228 | });
229 | });
230 | assertTrue(true);
231 | }
232 |
233 | @Test
234 | public void findSomeMissingTerms3() {
235 | IntStream.of(2, 3, 6)
236 | //See https://docs.google.com/a/cyc.com/document/d/1Lwi21-yxcC0DGKJMcc4GFN3M_DBzEDAcSNjYufCRIfE/edit
237 | .forEach(ti -> {
238 | Arrays.asList(mcf.getMissingTerms().get(ti))
239 | .forEach((String ss) -> {
240 | lookItUpAllW2V(ss);
241 | });
242 | });
243 | assertTrue(true);
244 | }
245 |
246 | @Test
247 | public void howManyMissingTermsInW2V() throws IOException {
248 | final Set found = new HashSet<>();
249 | final Set foundSpace = new HashSet<>();
250 | final Set unfound = new HashSet<>();
251 |
252 | mcf.getMissingTerms().keySet().forEach(i -> {
253 | Arrays.asList(mcf.getMissingTerms().get(i))
254 | .forEach((String ss) -> {
255 | if (mySpace.knownTerm(ss)) {
256 | found.add(i);
257 | if (ss.contains(" ")) {
258 | foundSpace.add(i);
259 | }
260 | } else {
261 | unfound.add(i);
262 | }
263 | });
264 | });
265 | System.out.println("Found directly in W2V : " + found.size()+" "+set2String(found));
266 | System.out.println("Found directly in W2V with space: " + foundSpace.size()+" "+set2String(foundSpace));
267 | System.out.println("Not found in W2V : " + unfound.size()+" "+set2String(unfound));
268 | assertEquals(2, foundSpace.size());
269 | assertEquals(8, unfound.size());
270 | }
271 |
272 | @Test
273 | public void listSomeTest() {
274 | IntStream.rangeClosed(0, 8)
275 | .forEach(i -> {
276 | System.out.println(i + ":\t" + String.join(", ",
277 | Arrays.asList(mcf.getMissingTerms().get(i))));
278 | });
279 | assertTrue(true);
280 | }
281 |
282 | // @Test
283 | // public void namesInW2VTest() {
284 | // List res;
285 | // res = mcf.namesInW2V();
286 | // assertEquals(12343, res.size());
287 | // }
288 | @Test
289 | public void missingConceptCountTest() {
290 | assertEquals(9, mcf.missingConceptCount());
291 | }
292 |
293 | private void lookItUpAllW2V(String ss) {
294 | try {
295 | System.out.println("=======[" + ss + "]=======");
296 | long t1 = System.currentTimeMillis();
297 | List matches
298 | = cSpace.findNearestNFor(Arrays.asList(ss.split("\\s+")), 40);
299 |
300 | System.out.println("Matches:" + (matches == null ? "null" : matches.size()));
301 | IntStream.range(0, matches.size())
302 | .forEach(i -> {
303 | String matchTerm = matches.get(i).term;
304 | String mat = matches.get(i).toString();
305 | if (ocyc.knownTerm(matchTerm)) {
306 | // System.out.println("Known:" +matchTerm);
307 | // System.out.println("Match is: "+ocyc.conceptsFor(matchTerm));
308 | mat = mat.replace("---",
309 | String.join(" | ", ocyc.conceptsFor(matchTerm)));
310 | }
311 | System.out.println(i + " " + mat);
312 | });
313 | System.out.println("Took " + (System.currentTimeMillis() - t1) + "ms");
314 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
315 | System.out.println("--- position not known in word to vec space:[" + ss + "]");
316 | // Logger.getLogger(MissingVideoConceptFinderTest.class.getName()).log(Level.INFO, null, ex);
317 | }
318 | }
319 |
320 | private void lookItUpWithOcyc(String ss) {
321 | try {
322 | System.out.println("=======[" + ss + "]=======");
323 | long t1 = System.currentTimeMillis();
324 | List matches
325 | = cSpace.findNearestNForIn(Arrays.asList(ss.split("\\s+")), 40, ocyc);
326 |
327 | System.out.println("Matches:" + (matches == null ? "null" : matches.size()));
328 | IntStream.range(0, matches.size())
329 | .forEach(i -> {
330 | System.out.println(i + " " + matches.get(i).toString());
331 | });
332 | System.out.println("Took " + (System.currentTimeMillis() - t1) + "ms");
333 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
334 | System.out.println("--- position not known in word to vec space:[" + ss + "]");
335 | // Logger.getLogger(MissingVideoConceptFinderTest.class.getName()).log(Level.INFO, null, ex);
336 | }
337 | }
338 | }
339 |
--------------------------------------------------------------------------------
/CycMapDBTools/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | 4.0.0
4 | com.cyc.tool
5 | CycMapDBTools
6 | jar
7 |
8 | UTF-8
9 | 1.8
10 | 1.8
11 |
12 |
13 |
14 | com.cyc.tool
15 | distributedRepresentationsParent
16 | 1.0
17 | ../distributedRepresentationsParent
18 |
19 |
20 |
21 |
22 |
23 | org.codehaus.mojo
24 | license-maven-plugin
25 |
26 |
27 | org.apache.maven.plugins
28 | maven-javadoc-plugin
29 |
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/CycMapDBTools/pom.xml~:
--------------------------------------------------------------------------------
1 |
2 |
3 | 4.0.0
4 | com.cyc.tool
5 | CycMapDBTools
6 | 0.0.1-SNAPSHOT
7 | jar
8 |
9 | UTF-8
10 | 1.8
11 | 1.8
12 |
13 |
14 |
15 | com.cyc.project.kbtaxonomy
16 | KBTaxonomyParent
17 | 0.0.1-SNAPSHOT
18 | ../KBTaxonomyParent
19 |
20 |
21 |
22 |
23 |
24 | org.codehaus.mojo
25 | license-maven-plugin
26 |
27 |
28 | org.apache.maven.plugins
29 | maven-javadoc-plugin
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/CycMapDBTools/src/main/java/com/cyc/tool/MapDBConfiguration.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool;
2 |
3 | /*
4 | * #%L
5 | * CycMapDBTools
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import java.io.File;
24 | import java.io.FileNotFoundException;
25 | import java.io.IOException;
26 |
27 | /**
28 | *
31 | * BiologyW2VSpace filtered to only contain terms in Open Cyc.
32 | */
33 | public class BiologyW2VOpenCycSubspace extends Word2VecSubspace {
34 |
35 | static BiologyW2VOpenCycSubspace singleton;
36 |
37 | private BiologyW2VOpenCycSubspace(OpenCycOwl ocyc) throws IOException {
38 | super(BiologyW2VSpace.get(),
39 | m -> ocyc.knownTerm(m), getWord2VecVectorsMapName());
40 | }
41 |
42 | /**
43 | *
44 | * @return a WordToVecSubspace limited only to terms in OpenCyc
45 | */
46 | public static BiologyW2VOpenCycSubspace get() {
47 | if (singleton == null) {
48 | try {
49 | OpenCycOwl ocyc = new OpenCycOwl();
50 | singleton = new BiologyW2VOpenCycSubspace(ocyc);
51 | } catch (IOException ex) {
52 | Logger.getLogger(BiologyW2VSpace.class.getName()).log(Level.SEVERE, null, ex);
53 | throw new RuntimeException("Can't create the Biology W2VSpace object " + ex);
54 | } catch (OWLOntologyCreationException ex) {
55 | Logger.getLogger(BiologyW2VOpenCycSubspace.class.getName()).log(Level.SEVERE, null, ex);
56 | }
57 | }
58 | return singleton;
59 | }
60 |
61 | static String getWord2VecVectorsMapName() {
62 | return BiologyW2VOpenCycSubspace.class.getCanonicalName();
63 | }
64 |
65 | }
66 |
--------------------------------------------------------------------------------
/DistributedRepresentations/src/main/java/com/cyc/tool/distributedrepresentations/BiologyW2VSpace.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.distributedrepresentations;
2 |
3 | /*
4 | * #%L
5 | * DistributedRepresentations
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import java.io.BufferedReader;
24 | import java.io.File;
25 | import java.io.FileReader;
26 | import java.io.IOException;
27 | import java.util.Arrays;
28 | import java.util.logging.Level;
29 | import java.util.logging.Logger;
30 | import java.util.stream.Collectors;
31 | import org.mapdb.DBMaker;
32 |
33 | /**
34 | * The word2vec space produced by BioASQ by training on pubmed.
35 | *
36 | *
37 | * See:
38 | * http://bioasq.org/news/bioasq-releases-continuous-space-word-vectors-obtained-applying-word2vec-pubmed-abstracts
39 | */
40 | public class BiologyW2VSpace extends Word2VecSpace {
41 |
42 | private static final String fileBase = "/cyc/projects/kbTaxonomy/ConceptFinder/BioASQ/word2vecTools/";
43 | private static BiologyW2VSpace singleton;
44 | private static final String w2vlabelfile = fileBase + "types.txt";
45 | private static final String w2vvectorfile = fileBase + "vectors.txt";
46 |
47 | private BiologyW2VSpace() throws IOException {
48 | db = DBMaker.newFileDB(new File(Config.getW2vDBFile()))
49 | .closeOnJvmShutdown()
50 | // .encryptionEnable("password")
51 | .make();
52 | vectors = db.getTreeMap(getWord2VecVectorsMapName());
53 | // vectors.clear();
54 | if (!vectors.isEmpty()) {
55 | assert (getVector("anti-mib-1") != null);
56 | setSize(getVector("hgh-b").length);
57 | return;
58 | }
59 | int i = 0;
60 | try (BufferedReader labelReader = new BufferedReader(new FileReader(w2vlabelfile))) {
61 | try (BufferedReader vectorReader = new BufferedReader(new FileReader(w2vvectorfile))) {
62 | for (String label; (label = labelReader.readLine()) != null;) {
63 | String vec = vectorReader.readLine();
64 | float[] d
65 | = normVector(
66 | Arrays.asList(vec.split("\\s+"))
67 | .stream()
68 | .map(s -> Float.valueOf(s))
69 | .collect(Collectors.toList())
70 | );
71 | if (getSize() != 0) {
72 | assert d.length == getSize() : "Line without " + getSize() + " floats";
73 | } else {
74 | setSize(d.length);
75 | }
76 | if (i++ % 100000 == 0) {
77 | db.commit();
78 | System.out.println(i + ": " + label);
79 | }
80 |
81 | vectors.put(label, d);
82 | // process the line.
83 | }
84 | // line is not visible here.
85 | }
86 | }
87 | System.out.println("Read " + i + " term positions for " + BiologyW2VSpace.class.getSimpleName());
88 | db.commit();
89 | db.compact();
90 | }
91 |
92 | /**
93 | * Factory get method for BiologyW2VSpace.
94 | *
95 | * @return a BiologyW2VSpace
96 | */
97 | public static BiologyW2VSpace get() {
98 | if (singleton == null) {
99 | try {
100 | singleton = new BiologyW2VSpace();
101 | } catch (IOException ex) {
102 | Logger.getLogger(BiologyW2VSpace.class.getName()).log(Level.SEVERE, null, ex);
103 | throw new RuntimeException("Can't create the Biology W2VSpace object\n " + ex);
104 | }
105 | }
106 | return singleton;
107 | }
108 |
109 | /*
110 | @ToDo: change this to use the class name, so that it's automatically correct
111 | */
112 | private static String getWord2VecVectorsMapName() {
113 | return BiologyW2VSpace.class.getCanonicalName();
114 | }
115 |
116 | }
117 |
--------------------------------------------------------------------------------
/DistributedRepresentations/src/main/java/com/cyc/tool/distributedrepresentations/Config.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.distributedrepresentations;
2 |
3 | /*
4 | * #%L
5 | * DistributedRepresentations
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import com.cyc.tool.MapDBConfiguration;
24 |
25 | /**
26 | *
27 | * Config provides default locations for the DistributedRepresentations project.
28 | */
29 | public class Config extends MapDBConfiguration {
30 |
31 | private static final String fallBackDBLocation = "/cyc/projects/kbTaxonomy/Experiments/ConceptFinder/";
32 |
33 | private static final String w2vDBFile = "/w2vdb";
34 |
35 | /**
36 | *
37 | * @return W2VDB file location
38 | */
39 | protected static String getW2vDBFile() {
40 | return getMapDBBase(fallBackDBLocation) + w2vDBFile;
41 | }
42 |
43 | }
44 |
--------------------------------------------------------------------------------
/DistributedRepresentations/src/main/java/com/cyc/tool/distributedrepresentations/GoogleNewsW2VOpenCycSubspace.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.distributedrepresentations;
2 |
3 | /*
4 | * #%L
5 | * DistributedRepresentations
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import com.cyc.tool.owltools.OpenCycOwl;
24 | import java.io.IOException;
25 | import java.util.logging.Level;
26 | import java.util.logging.Logger;
27 | import org.semanticweb.owlapi.model.OWLOntologyCreationException;
28 |
29 | /**
30 | * News Word2Vec Distributed representation filtered to only contain terms in Open Cyc.
31 | *
32 | *
33 | * Used for rapid searches of the space for open cyc terms
34 | */
35 | public class GoogleNewsW2VOpenCycSubspace extends Word2VecSubspace {
36 |
37 | static GoogleNewsW2VOpenCycSubspace singleton;
38 |
39 | private GoogleNewsW2VOpenCycSubspace(OpenCycOwl ocyc) throws IOException {
40 | super(GoogleNewsW2VSpace.get(),
41 | m -> ocyc.knownTerm(m), getWord2VecVectorsMapName());
42 | }
43 |
44 | /**
45 | *
46 | * @return a WordToVecSubspace limited only to terms in OpenCyc
47 | */
48 | public static GoogleNewsW2VOpenCycSubspace get() {
49 | if (singleton == null) {
50 | try {
51 | OpenCycOwl ocyc = new OpenCycOwl();
52 | singleton = new GoogleNewsW2VOpenCycSubspace(ocyc);
53 | } catch (IOException ex) {
54 | Logger.getLogger(GoogleNewsW2VSpace.class.getName()).log(Level.SEVERE, null, ex);
55 | throw new RuntimeException("Can't create the Google News W2VSpace object " + ex);
56 | } catch (OWLOntologyCreationException ex) {
57 | Logger.getLogger(GoogleNewsW2VOpenCycSubspace.class.getName()).log(Level.SEVERE, null, ex);
58 | }
59 | }
60 | return singleton;
61 | }
62 |
63 | static String getWord2VecVectorsMapName() {
64 | return GoogleNewsW2VOpenCycSubspace.class.getCanonicalName();
65 | }
66 |
67 | }
68 |
--------------------------------------------------------------------------------
/DistributedRepresentations/src/main/java/com/cyc/tool/distributedrepresentations/GoogleNewsW2VSpace.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.distributedrepresentations;
2 |
3 | /*
4 | * #%L
5 | * DistributedRepresentations
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import java.io.IOException;
24 | import java.util.logging.Level;
25 | import java.util.logging.Logger;
26 |
27 | /**
28 | * The word2vec space produced by Google by training on 10^11 words of news.
29 | *
30 | *
31 | * See: https://code.google.com/p/word2vec/
32 | */
33 | public class GoogleNewsW2VSpace extends Word2VecSpaceFromFile {
34 |
35 | private static GoogleNewsW2VSpace singleton;
36 | private static final String w2vfile = "/cyc/projects/kbTaxonomy/Experiments/ConceptFinder/GoogleNews-vectors-negative300.bin.gz";
37 |
38 | private GoogleNewsW2VSpace() throws IOException {
39 | super();
40 | vectors = db.getTreeMap(getWord2VecVectorsMapName());
41 | if (!vectors.isEmpty()) {
42 | assert (getVector("snowcapped_Caucasus") != null);
43 | setSize(getVector("dog").length);
44 | return;
45 | }
46 | createW2VinDB(getW2vfile());
47 | }
48 |
49 | /**
50 | * Factory get method for GoogleNewsW2VSpace.
51 | *
52 | * @return a GoogleNewsW2VSpace
53 | */
54 | public static GoogleNewsW2VSpace get() {
55 | if (singleton == null) {
56 | try {
57 | singleton = new GoogleNewsW2VSpace();
58 | } catch (IOException ex) {
59 | Logger.getLogger(GoogleNewsW2VSpace.class.getName()).log(Level.SEVERE, null, ex);
60 | throw new RuntimeException("Can't create the Google News W2VSpace object " + ex);
61 | }
62 | }
63 | return singleton;
64 | }
65 |
66 | private static String getW2vfile() {
67 | return w2vfile;
68 | }
69 |
70 | private static String getWord2VecVectorsMapName() {
71 | /*
72 | @ToDo: change this to use the class name, so that it's automatically correct
73 | */
74 | return GoogleNewsW2VSpace.class.getCanonicalName();
75 | //return word2VecVectorsMapName;
76 | }
77 |
78 | }
79 |
--------------------------------------------------------------------------------
/DistributedRepresentations/src/main/java/com/cyc/tool/distributedrepresentations/Word2VecSpace.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.distributedrepresentations;
2 |
3 | /*
4 | * #%L
5 | * DistributedRepresentations
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import java.util.ArrayList;
24 | import java.util.Arrays;
25 | import java.util.List;
26 | import java.util.Map;
27 | import java.util.Map.Entry;
28 | import java.util.concurrent.ConcurrentNavigableMap;
29 | import java.util.function.Predicate;
30 | import java.util.stream.Collectors;
31 | import java.util.stream.IntStream;
32 | import org.mapdb.DB;
33 |
34 | /**
35 | * A space of words from Google Word2Vec
36 | *
37 | */
38 | public abstract class Word2VecSpace {
39 |
40 | private int size;
41 | DB db;
42 | Map vectors;
43 | long words;
44 |
45 | /**
46 | *
47 | * @param terms
48 | * @return a List of Strings containing nGrams for terms
49 | */
50 | public static List nGramsFor(List terms) {
51 | final List grams = new ArrayList();
52 | IntStream.rangeClosed(1, terms.size()).forEach(length -> {
53 | IntStream.rangeClosed(0, terms.size() - length).forEach(start -> {
54 | List l = terms.subList(start, start + length);
55 | grams.add(String.join(" ", l));
56 | });
57 |
58 | });
59 | return grams;
60 | }
61 |
62 | private static String norm(String term) {
63 | return term.replaceAll("\\s+", "_");
64 | }
65 |
66 | private double cosineSimilarity(float[] v1, float[] v2) {
67 | return dotProduct(v1, v2) / (magnitude(v1) * magnitude(v2));
68 | }
69 |
70 | /**
71 | *
72 | * @param t1
73 | * @param t2
74 | * @return the cosine similarity
75 | */
76 | public double cosineSimilarity(String t1, String t2) {
77 | return cosineSimilarity(getVector(t1), getVector(t2));
78 | }
79 |
80 | private double dotProduct(float[] v1, float[] v2) {
81 | return IntStream.range(0, v1.length)
82 | .mapToDouble(i -> (double) v1[i] * (double) v2[i])
83 | .sum();
84 | }
85 |
86 | private double euclidianDistance(float[] v1, float[] v2) {
87 | double dist = Math.sqrt(IntStream.range(0, v1.length)
88 | .mapToDouble(i -> Math.pow((double) v1[i] - (double) v2[i], 2))
89 | .sum());
90 | return dist;
91 | }
92 |
93 | private double euclidianDistance(String t1, String t2) {
94 | return euclidianDistance(getVector(t1), getVector(t2));
95 | }
96 |
97 | private float[] getAverageVector(List terms) {
98 | final float sum[] = new float[size];
99 | final double mult = 1.0 / terms.size();
100 | terms.forEach(s -> {
101 | float v[] = getVector(s);
102 | IntStream.range(0, size)
103 | .forEach(i -> {
104 | sum[i] += mult * v[i];
105 | });
106 | });
107 | return sum;
108 | }
109 |
110 | /**
111 | *
112 | * @return the db
113 | */
114 | public DB getDb() {
115 | return db;
116 | }
117 |
118 | /**
119 | * Set up the DB.
120 | *
121 | * @param db
122 | */
123 | public void setDb(DB db) {
124 | this.db = db;
125 | }
126 |
127 | /**
128 | *
129 | * @param terms
130 | * @return the sum of term vectors divided by vector length
131 | * @throws NoWordToVecVectorForTerm
132 | */
133 | public float[] getGoogleNormedVector(List terms) throws NoWordToVecVectorForTerm {
134 | // Sum of term vectors divided by vector length
135 | // Note that this will miss multi-word exact matches, so prefer getMaximalNormedVector
136 | //except for exact code comparison tests
137 | final float sum[] = new float[size];
138 | if (terms.stream().allMatch(s -> !knownTerm(s))) {
139 | throw new NoWordToVecVectorForTerm("Can't find vector for:" + String.join(", ", terms));
140 | }
141 | terms.stream()
142 | .filter(s -> knownTerm(s))
143 | .forEach(s -> {
144 | float v[] = getVector(s);
145 | IntStream.range(0, size)
146 | .forEach(i -> {
147 | sum[i] += v[i];
148 | });
149 | });
150 | return normVector(sum);
151 | }
152 |
153 | /**
154 | *
155 | * @param interms
156 | * @return the maximal normed vector
157 | * @throws NoWordToVecVectorForTerm
158 | */
159 | public float[]
160 | getMaximalNormedVector(List interms) throws NoWordToVecVectorForTerm {
161 | // Sum of term ngram vectors divided by vector length
162 | List terms = nGramsFor(interms);
163 | final float sum[] = new float[size];
164 | if (terms.stream().allMatch(s -> !knownTerm(s))) {
165 | throw new NoWordToVecVectorForTerm("Can't find vector for:" + String.join(", ", terms));
166 | }
167 | terms.stream()
168 | .filter(s -> knownTerm(s))
169 | .forEach(s -> {
170 | float v[] = getVector(s);
171 | IntStream.range(0, size)
172 | .forEach(i -> {
173 | sum[i] += v[i];
174 | });
175 | });
176 | return normVector(sum);
177 | }
178 |
179 | /**
180 | *
181 | * @return size of vectors
182 | */
183 | public int getNVectors() {
184 | return vectors.size();
185 | }
186 |
187 | /**
188 | *
189 | * @return size of the Word2VecSpace
190 | */
191 | public int getSize() {
192 | return size;
193 | }
194 |
195 | /**
196 | *
197 | * @param size
198 | */
199 | public void setSize(int size) {
200 | this.size = size;
201 | }
202 |
203 | /**
204 | *
205 | * @param term
206 | * @return the vector for term
207 | */
208 | public float[] getVector(String term) {
209 | return vectors.get(norm(term));
210 | }
211 |
212 | /**
213 | *
214 | * @return the vectors
215 | */
216 | public Map getVectors() {
217 | return vectors;
218 | }
219 |
220 | /**
221 | *
222 | * @param vectors
223 | */
224 | public void setVectors(ConcurrentNavigableMap vectors) {
225 | this.vectors = vectors;
226 | }
227 |
228 | /**
229 | *
230 | * @return the words
231 | */
232 | public long getWords() {
233 | return words;
234 | }
235 |
236 | /**
237 | *
238 | * @param words
239 | */
240 | public void setWords(long words) {
241 | this.words = words;
242 | }
243 |
244 | /**
245 | *
246 | * @param v1
247 | * @param v2
248 | * @return the similarity between v1 and v2
249 | */
250 | public double googleSimilarity(float[] v1, float[] v2) {
251 | return dotProduct(v1, v2);
252 | }
253 |
254 | private double googleSimilarity(String t1, String t2) {
255 | return googleSimilarity(getVector(t1), getVector(t2));
256 | }
257 |
258 | /**
259 | *
260 | * @param terms
261 | * @param term
262 | * @return the similarity
263 | * @throws NoWordToVecVectorForTerm
264 | */
265 | public double googleSimilarity(List terms, String term) throws NoWordToVecVectorForTerm {
266 | return googleSimilarity(getGoogleNormedVector(terms), getVector(term));
267 | }
268 |
269 | /**
270 | *
271 | * @param term
272 | * @return true if term is in vectors
273 | */
274 | public boolean knownTerm(String term) {
275 | return vectors.containsKey(norm(term));
276 | }
277 |
278 | private double magnitude(float[] v) {
279 | return Math.sqrt(IntStream.range(0, v.length).mapToDouble(i -> v[i] * v[i]).sum());
280 | }
281 |
282 | private double magnitude(List v) {
283 | return Math.sqrt(v.stream().mapToDouble(i -> i * i).sum());
284 | }
285 |
286 | /**
287 | *
288 | * @param v
289 | * @return normalized vector for v
290 | */
291 | public float[] normVector(float[] v) {
292 | final float normed[] = new float[size];
293 | double len = magnitude(v);
294 |
295 | IntStream.range(0, size)
296 | .forEach(i -> {
297 | normed[i] = v[i] / (float) len;
298 | });
299 | return normed;
300 | }
301 |
302 | /**
303 | *
304 | * @param v
305 | * @return normalized vector for v
306 | */
307 | public float[] normVector(List v) {
308 | final float normed[] = new float[v.size()];
309 | double len = magnitude(v);
310 |
311 | IntStream.range(0, v.size())
312 | .forEach(i -> {
313 | normed[i] = v.get(i) / (float) len;
314 | });
315 | return normed;
316 | }
317 |
318 | /**
319 | *
320 | * @param s
321 | * @return List of Strings
322 | */
323 | public List stringToList(String s) {
324 | return Arrays.asList(s.split("\\s+"));
325 | }
326 |
327 | /**
328 | *
329 | * @param includeIf the predicate that is applied to the strings (the keys or embedded strings)
330 | * of the word to vec space to determine whether they should be retained in the output vector list
331 | * @return filtered vectors Map
332 | */
333 | protected Map filterVectors(Predicate includeIf) {
334 | return vectors.entrySet().stream().filter(entry -> {
335 | return includeIf.test(entry.getKey());
336 | }).collect(Collectors.toMap(Entry::getKey, Entry::getValue));
337 | }
338 |
339 | /**
340 | * No Vector for Term
341 | *
342 | * Exception to use check when a term looked up in the space has no known position
343 | */
344 | public static class NoWordToVecVectorForTerm extends Exception {
345 |
346 | /**
347 | *
348 | * @param message
349 | */
350 | public NoWordToVecVectorForTerm(String message) {
351 | super(message);
352 | }
353 | }
354 | }
355 |
--------------------------------------------------------------------------------
/DistributedRepresentations/src/main/java/com/cyc/tool/distributedrepresentations/Word2VecSpaceFromFile.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.distributedrepresentations;
2 |
3 | /*
4 | * #%L
5 | * DistributedRepresentations
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import java.io.DataInputStream;
24 | import java.io.File;
25 | import java.io.FileInputStream;
26 | import java.io.FileNotFoundException;
27 | import java.io.IOException;
28 | import java.util.logging.Level;
29 | import java.util.logging.Logger;
30 | import java.util.stream.IntStream;
31 | import java.util.zip.GZIPInputStream;
32 | import org.apache.commons.io.EndianUtils;
33 | import org.mapdb.DBMaker;
34 |
35 | /**
36 | * Word2Vec distributed representation space from Google Format file.
37 | *
38 | *
39 | * This class represents any distributed represenation computed using word2vec and initially loaded
40 | * from a Google word2vec formatted file
41 | */
42 | public abstract class Word2VecSpaceFromFile extends Word2VecSpace {
43 |
44 | final StringBuilder sb = new StringBuilder();
45 |
46 | /**
47 | * Constructor for Word2VecSpaceFromFile
48 | *
49 | * @throws IOException
50 | */
51 | public Word2VecSpaceFromFile() throws IOException {
52 | db = DBMaker.newFileDB(new File(Config.getW2vDBFile()))
53 | .closeOnJvmShutdown()
54 | // .encryptionEnable("password")
55 | .make();
56 |
57 | }
58 |
59 | /**
60 | * Create a W2V space in a DB.
61 | *
62 | * @param w2vZipFile
63 | * @throws FileNotFoundException
64 | * @throws IOException
65 | */
66 | protected final void createW2VinDB(String w2vZipFile) throws FileNotFoundException, IOException {
67 | try (DataInputStream data_in
68 | = new DataInputStream(
69 | new GZIPInputStream(new FileInputStream(
70 | new File(w2vZipFile))))) {
71 | getWordsAndSize(data_in);
72 | if (vectors.size() == words) {
73 | System.out.println("Word2Vec is in DB");
74 | } else {
75 | System.out.println("DB Size:" + vectors.size());
76 |
77 | System.out.println("Want to read Word Count: " + words);
78 | System.out.println("Size:" + getSize());
79 | for (int w = 0; w < words; w++) {
80 | float[] v = new float[getSize()];
81 | String key = getVocabString(data_in);
82 | System.out.println(w + ":\t" + key);
83 |
84 | IntStream.range(0, getSize()).forEach(i -> v[i]
85 | = getFloat(data_in));
86 | vectors.put(key, normVector(v));
87 | if (w % 100000 == 1) {
88 | db.commit();
89 | }
90 | }
91 | db.commit();
92 | db.compact();
93 | }
94 | }
95 | }
96 |
97 | private float getFloat(DataInputStream s) {
98 | try {
99 | float v = EndianUtils.readSwappedFloat(s);
100 | //System.out.println(st+"["+i+"]: "+v);
101 | return v;
102 | } catch (IOException ex) {
103 | Logger.getLogger(Word2VecSpace.class.getName()).log(Level.SEVERE, null, ex);
104 | return 0.0f;
105 | }
106 | }
107 |
108 | private String getVocabString(DataInputStream s) throws IOException {
109 | sb.setLength(0);
110 | for (char ch = (char) s.read();
111 | (!Character.isWhitespace(ch) && ch >= 0 && ch <= 256);
112 | ch = (char) s.read()) {
113 | sb.append((char) ch);
114 | }
115 | return sb.toString();
116 | }
117 |
118 | private void getWordsAndSize(DataInputStream s) throws IOException {
119 | sb.setLength(0);
120 | for (char ch = (char) s.read(); ch != '\n'; ch = (char) s.read()) {
121 | sb.append(ch);
122 | }
123 | String[] parts = sb.toString().split("\\s+");
124 | words = Long.parseLong(parts[0]);
125 | setSize((int) Long.parseLong(parts[1]));
126 | }
127 |
128 | }
129 |
--------------------------------------------------------------------------------
/DistributedRepresentations/src/main/java/com/cyc/tool/distributedrepresentations/Word2VecSubspace.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.distributedrepresentations;
2 |
3 | /*
4 | * #%L
5 | * DistributedRepresentations
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import java.io.File;
24 | import java.io.IOException;
25 | import java.util.Map;
26 | import java.util.function.Predicate;
27 | import org.mapdb.DBMaker;
28 |
29 | /**
30 | * A space of words from Google Word2Vec.
31 | *
32 | */
33 | public abstract class Word2VecSubspace extends Word2VecSpace {
34 |
35 | final Word2VecSpace mySuperSpace;
36 |
37 | /**
38 | * Word2VecSubspace constructor.
39 | *
40 | * @param ofSpace
41 | * @param includeIf
42 | * @param persistLoc
43 | * @throws IOException
44 | */
45 | protected Word2VecSubspace(Word2VecSpace ofSpace, Predicate includeIf, String persistLoc) throws IOException {
46 |
47 | mySuperSpace = ofSpace;
48 | if (db == null) {
49 | db = DBMaker.newFileDB(new File(Config.getW2vDBFile()))
50 | .closeOnJvmShutdown()
51 | // .encryptionEnable("password")
52 | .make();
53 | }
54 | vectors = db.getTreeMap(persistLoc);
55 | // vectors.clear();
56 | if (!vectors.isEmpty()) {
57 | setSize(vectors.values().iterator().next().length);
58 | System.out.println("Got cached w2vspace for " + persistLoc + " of dimensionality " + getSize() + " and with " + vectors.size() + " entries.");
59 | return;
60 | }
61 | // assert(vectors == null) :"Subspaces msut be completely empty when created";
62 | System.out.println("Filtering vectors for:" + persistLoc);
63 | Map newvectors = ofSpace.filterVectors(includeIf);
64 | newvectors.entrySet().forEach(e -> {
65 | vectors.put(e.getKey(), e.getValue());
66 | });
67 | db.commit();
68 | db.compact();
69 | db.commit();
70 | System.out.println("Vectors filtered and persisted.");
71 | }
72 |
73 | /**
74 | *
75 | * @return the mySuperSpace
76 | */
77 | public Word2VecSpace getSuperSpace() {
78 | return mySuperSpace;
79 | }
80 |
81 | }
82 |
--------------------------------------------------------------------------------
/DistributedRepresentations/src/test/java/com/cyc/tool/distributedrepresentations/BiologyW2VSpaceIT.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.distributedrepresentations;
2 |
3 | /*
4 | * #%L
5 | * DistributedRepresentations
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import static org.junit.Assert.assertEquals;
24 | import static org.junit.Assert.assertTrue;
25 | import org.junit.Test;
26 |
27 | /**
28 | * Tests for BiologyW2VSpace.
29 | */
30 | public class BiologyW2VSpaceIT {
31 |
32 | public BiologyW2VSpaceIT() {
33 | }
34 |
35 | @Test
36 | public void testGet() {
37 | System.out.println("get");
38 |
39 | BiologyW2VSpace result = BiologyW2VSpace.get();
40 | assertTrue(result != null);
41 | }
42 |
43 | @Test
44 | public void testNumberOfVectors() {
45 | System.out.println("getNVectors");
46 |
47 | int result = BiologyW2VSpace.get().getNVectors();
48 |
49 | assertEquals(result, 1701632);
50 | }
51 |
52 | }
53 |
--------------------------------------------------------------------------------
/DistributedRepresentations/src/test/java/com/cyc/tool/distributedrepresentations/Word2VecSpaceIT.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.distributedrepresentations;
2 |
3 | /*
4 | * #%L
5 | * DistributedRepresentations
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import java.io.IOException;
24 | import java.util.Arrays;
25 | import java.util.List;
26 | import org.junit.AfterClass;
27 | import static org.junit.Assert.assertEquals;
28 | import static org.junit.Assert.assertTrue;
29 | import static org.junit.Assert.fail;
30 | import org.junit.BeforeClass;
31 | import org.junit.Test;
32 |
33 | /**
34 | * Tests for Word2VecSpace.
35 | */
36 | public class Word2VecSpaceIT {
37 |
38 | static List cr = Arrays.asList("Chinese", "river");
39 | static Word2VecSpace mySpace;
40 |
41 | public Word2VecSpaceIT() {
42 | }
43 |
44 | @BeforeClass
45 |
46 | public static void setUpClass() throws IOException {
47 | mySpace = GoogleNewsW2VSpace.get();
48 | }
49 |
50 | @AfterClass
51 |
52 | public static void tearDownClass() {
53 | mySpace = null;
54 | }
55 | //
56 |
57 | @Test
58 | public void distanceTest() {
59 | assertEquals(1.0, mySpace.cosineSimilarity("skimpy bathing suits", "skimpy_bathing_suits"), 0.00000001);
60 | assertEquals(0.24279, mySpace.cosineSimilarity("skimpy bathing suits", "Giant Octopus"), 0.0001);
61 | assertEquals(0.54801, mySpace.cosineSimilarity("skimpy bathing suits", "bathing suits"), 0.0001);
62 | assertEquals(0.645069, mySpace.cosineSimilarity("apple", "pear"), 0.0001);
63 | assertEquals(0.20749, mySpace.cosineSimilarity("apple", "cat"), 0.0001);
64 |
65 | assertTrue(mySpace.cosineSimilarity("apple", "pear")
66 | > mySpace.cosineSimilarity("apple", "cat"));
67 | }
68 |
69 | @Test
70 | public void getVectorTest1() {
71 | assertEquals(-0.05338118f, (mySpace.getVector("skimpy bathing suits")[5]), 0.000001);
72 | assertEquals(0.047296f, (mySpace.getVector("skimpy bathing suits")[105]), 0.000001);
73 | }
74 |
75 | @Test
76 | public void getVectorTest2a() {
77 | assertEquals(-0.049851f, (mySpace.getVector("Chinese")[0]), 0.000001);
78 | assertEquals(-0.090444f, (mySpace.getVector("Chinese")[5]), 0.000001);
79 | }
80 |
81 | @Test
82 | public void getVectorTest2b() {
83 | assertEquals(0.002663f, (mySpace.getVector("river")[0]), 0.000001);
84 | assertEquals(-0.029231f, (mySpace.getVector("river")[5]), 0.000001);
85 | }
86 |
87 | @Test
88 | public void googleDistanceTest1() {
89 | try {
90 | assertEquals(0.667376,
91 | mySpace.googleSimilarity(cr, "Yangtze_River"), 0.0001);
92 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
93 | fail("took unexpected exception:" + ex);
94 | }
95 | }
96 |
97 | @Test
98 | public void googleDistanceTest2() {
99 | try {
100 | assertEquals(0.594108,
101 | mySpace.googleSimilarity(cr, "Hongze_Lake"), 0.0001);
102 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
103 | fail("took unexpected exception:" + ex);
104 | }
105 | }
106 |
107 | @Test
108 | public void googleDistanceTest3() {
109 | try {
110 | assertEquals(0.604726,
111 | mySpace.googleSimilarity(cr, "Huangpu_River"), 0.0001);
112 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
113 | fail("took unexpected exception:" + ex);
114 | }
115 | }
116 |
117 | @Test
118 | public void googleNormVectorTest0() {
119 | try {
120 | float[] norm = mySpace.getGoogleNormedVector(cr);
121 | assertEquals(-0.032075, norm[0], 0.000001);
122 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
123 | fail("took unexpected exception:" + ex);
124 | }
125 | }
126 |
127 | @Test
128 | public void googleNormVectorTest100() {
129 | float[] norm;
130 | try {
131 | norm = mySpace.getGoogleNormedVector(cr);
132 | assertEquals(-0.095236, norm[100], 0.000001);
133 |
134 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
135 | fail("took unexpected exception:" + ex);
136 | }
137 | }
138 |
139 | @Test
140 | public void googleNormVectorTest5() {
141 | try {
142 | float[] norm = mySpace.getGoogleNormedVector(cr);
143 | assertEquals(-0.081347, norm[5], 0.000001);
144 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
145 | fail("took unexpected exception:" + ex);
146 | }
147 | }
148 |
149 | @Test
150 | public void googleNormVectorTest50() {
151 | try {
152 | float[] norm = mySpace.getGoogleNormedVector(cr);
153 | assertEquals(0.080537, norm[50], 0.000001);
154 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
155 | fail("took unexpected exception:" + ex);
156 | }
157 | }
158 |
159 | /**
160 | * Test if known terms have been loaded from the Word2Vec file or DB
161 | */
162 | @Test
163 | public void knownTermTest() {
164 | // System.out.println("DB Size:" + vectors.size());
165 |
166 | assertTrue(mySpace.knownTerm("Yathra"));
167 | assertTrue(mySpace.knownTerm("skimpy bathing suits"));
168 | assertTrue(mySpace.knownTerm("Giant_Octopus"));
169 | assertTrue(mySpace.knownTerm("Yangtze_River"));
170 | assertTrue(mySpace.knownTerm("Chinese"));
171 | // assertTrue(mySpace.knownTerm("Chinese River"));
172 |
173 | }
174 |
175 | // @Test
176 | // public void findNearbyTerms1() {
177 | // try {
178 | // long t1 = System.currentTimeMillis();
179 | // List matches = mySpace.findNearestNForWithInputTermFiltering(cr, 40);
180 | // IntStream.range(0, matches.size())
181 | // .forEach(i -> {
182 | // System.out.println(i + " " + matches.get(i).toString());
183 | // });
184 | // System.out.println("Took " + (System.currentTimeMillis() - t1) + "ms");
185 | // assertEquals(matches.get(0).getTerm(), "Yangtze_River");
186 | // assertEquals(0.604726, matches.get(5).getSimilarity(), 0.000001);
187 | //
188 | // assertEquals(matches.get(23).getTerm(), "rivers");
189 | // } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
190 | // fail("took unexpected exception:" + ex);
191 | // }
192 | // }
193 | //
194 | // @Test
195 | //
196 | // public void findNearbyTerms2() {
197 | // try {
198 | // long t1 = System.currentTimeMillis();
199 | // List matches = mySpace.findNearestNForWithInputTermFiltering(Arrays.asList("gangplank"), 40);
200 | // IntStream.range(0, matches.size())
201 | // .forEach(i -> {
202 | // System.out.println(i + " " + matches.get(i).toString());
203 | // });
204 | // System.out.println("Took " + (System.currentTimeMillis() - t1) + "ms");
205 | // } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
206 | // fail("took unexpected exception:" + ex);
207 | // }
208 | // }
209 | @Test
210 | public void testNGramsFor() {
211 | List res = Word2VecSpace.nGramsFor(Arrays.asList("this", "is", "a", "test"));
212 | // System.out.println("test: "+res+" len:"+res.size());
213 |
214 | assertEquals(10, res.size());
215 | }
216 |
217 | @Test
218 | public void testNGramsForCR() {
219 | List res = Word2VecSpace.nGramsFor(cr);
220 | System.out.println("test: " + res + " len:" + res.size());
221 | assertEquals(3, res.size());
222 | }
223 |
224 | }
225 |
--------------------------------------------------------------------------------
/OwlTools/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | 4.0.0
4 | com.cyc.tool
5 | OwlTools
6 | jar
7 |
8 |
9 | com.cyc.tool
10 | distributedRepresentationsParent
11 | 1.0
12 | ../distributedRepresentationsParent
13 |
14 |
15 |
16 |
17 |
18 | org.apache.maven.plugins
19 | maven-compiler-plugin
20 | 2.3.2
21 |
22 | 1.8
23 | 1.8
24 |
25 |
26 |
27 | org.codehaus.mojo
28 | license-maven-plugin
29 |
30 |
31 | org.apache.maven.plugins
32 | maven-javadoc-plugin
33 |
34 |
35 |
36 |
37 |
38 | net.sourceforge.owlapi
39 | owlapi-distribution
40 | 4.0.1
41 | jar
42 |
43 |
44 | org.mapdb
45 | mapdb
46 | 1.0.6
47 | jar
48 |
49 |
50 | com.cyc.tool
51 | CycMapDBTools
52 | jar
53 |
54 |
55 |
56 | UTF-8
57 | 1.8
58 | 1.8
59 |
60 |
61 |
62 |
--------------------------------------------------------------------------------
/OwlTools/pom.xml~:
--------------------------------------------------------------------------------
1 |
2 |
3 | 4.0.0
4 | com.cyc.tool
5 | OwlTools
6 | 0.0.1-SNAPSHOT
7 | jar
8 |
9 |
10 | com.cyc.project.kbtaxonomy
11 | KBTaxonomyParent
12 | 0.0.1-SNAPSHOT
13 | ../KBTaxonomyParent
14 |
15 |
16 |
17 |
18 |
19 | org.apache.maven.plugins
20 | maven-compiler-plugin
21 | 2.3.2
22 |
23 | 1.8
24 | 1.8
25 |
26 |
27 |
28 | org.codehaus.mojo
29 | license-maven-plugin
30 |
31 |
32 | org.apache.maven.plugins
33 | maven-javadoc-plugin
34 |
35 |
36 |
37 |
38 |
39 | net.sourceforge.owlapi
40 | owlapi-distribution
41 | 4.0.1
42 | jar
43 |
44 |
45 | org.mapdb
46 | mapdb
47 | 1.0.6
48 | jar
49 |
50 |
51 | com.cyc.tool
52 | DistributedRepresentations
53 | 0.0.1-SNAPSHOT
54 | jar
55 |
56 |
57 | com.cyc.tool
58 | CycMapDBTools
59 | 0.0.1-SNAPSHOT
60 | jar
61 |
62 |
63 |
64 | UTF-8
65 | 1.8
66 | 1.8
67 |
68 |
69 |
--------------------------------------------------------------------------------
/OwlTools/src/main/java/com/cyc/tool/owltools/OpenCycContent.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.owltools;
2 |
3 | /*
4 | * #%L
5 | * OwlTools
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import java.util.ArrayList;
24 | import java.util.Collection;
25 | import java.util.HashSet;
26 | import java.util.List;
27 | import java.util.Set;
28 | import org.semanticweb.owlapi.model.IRI;
29 | import org.semanticweb.owlapi.model.OWLAnnotation;
30 | import org.semanticweb.owlapi.model.OWLClass;
31 | import org.semanticweb.owlapi.model.OWLOntologyCreationException;
32 | import org.semanticweb.owlapi.reasoner.NodeSet;
33 | import org.semanticweb.owlapi.search.EntitySearcher;
34 |
35 | /**
36 | *
37 | * OpenCycContent is designed to hold information about a given OpenCyc concept that can be found in
38 | * the OWL export of OpenCyc.
39 | *
40 | * This software is the proprietary information of Cycorp, Inc.
41 | *
42 | * Use is subject to license terms.
43 | *
44 | * Created on : Feb 25, 2015, 2:47:47 PM
45 | */
46 | public class OpenCycContent {
47 |
48 | Set commentsForConcept;
49 | String conceptURI;
50 | String labelForConcept;
51 | Set prettyStringsForConcept;
52 | Set subTypesForConcept;
53 |
54 | Set typesForConcept;
55 |
56 | //// Constructors
57 | /**
58 | * Creates a new instance of OpenCycContent.
59 | *
60 | * @param hlid
61 | * @throws org.semanticweb.owlapi.model.OWLOntologyCreationException
62 | */
63 | public OpenCycContent(String hlid) throws OWLOntologyCreationException {
64 | conceptURI = hlid;
65 | prettyStringsForConcept = null;
66 | commentsForConcept = null;
67 | labelForConcept = null;
68 | typesForConcept = null;
69 | }
70 |
71 | /**
72 | *
73 | * @return HTML String with information about the concept
74 | * @throws OWLOntologyCreationException
75 | */
76 | public String generateHtmlForConcept() throws OWLOntologyCreationException {
77 | String html = "";
78 | String constantName = getLabelForConcept();
79 | Set commentStr = getCommentsForConcept();
80 | Set prettyStr = getPrettyStringsForConcept();
81 | html += "