├── .gitignore ├── ConceptFinder ├── nbactions.xml ├── pom.xml └── src │ ├── main │ └── java │ │ └── com │ │ └── cyc │ │ └── tool │ │ └── conceptfinder │ │ ├── AttachmentHypothesis.java │ │ ├── ConceptFinderConfig.java │ │ ├── ConceptMatch.java │ │ ├── ConceptSpace.java │ │ ├── DefaultConceptFinderConfig.java │ │ ├── MissingConceptFinder.java │ │ └── MissingConceptFinderDefault.java │ └── test │ └── java │ └── com │ └── cyc │ └── tool │ └── conceptfinder │ ├── ConceptSpaceIT.java │ └── MissingConceptFinderIT.java ├── CycMapDBTools ├── pom.xml ├── pom.xml~ └── src │ └── main │ └── java │ └── com │ └── cyc │ └── tool │ └── MapDBConfiguration.java ├── DistributedRepresentations ├── nbactions.xml ├── pom.xml ├── pom.xml~ └── src │ ├── main │ └── java │ │ └── com │ │ └── cyc │ │ └── tool │ │ └── distributedrepresentations │ │ ├── BiologyW2VOpenCycSubspace.java │ │ ├── BiologyW2VSpace.java │ │ ├── Config.java │ │ ├── GoogleNewsW2VOpenCycSubspace.java │ │ ├── GoogleNewsW2VSpace.java │ │ ├── Word2VecSpace.java │ │ ├── Word2VecSpaceFromFile.java │ │ └── Word2VecSubspace.java │ └── test │ └── java │ └── com │ └── cyc │ └── tool │ └── distributedrepresentations │ ├── BiologyW2VSpaceIT.java │ └── Word2VecSpaceIT.java ├── OwlTools ├── pom.xml ├── pom.xml~ └── src │ ├── main │ └── java │ │ └── com │ │ └── cyc │ │ └── tool │ │ └── owltools │ │ ├── OpenCycContent.java │ │ ├── OpenCycOwl.java │ │ ├── OpenCycReasoner.java │ │ └── OwlToolsConfig.java │ └── test │ └── java │ └── com │ └── cyc │ └── tool │ └── owltools │ ├── OpenCycContentIT.java │ ├── OpenCycOwlIT.java │ ├── OpenCycOwlIT.java~ │ └── OpenCycReasonerIT.java ├── README.md └── distributedRepresentationsParent ├── pom.xml └── pom.xml~ /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | pom.xml.tag 3 | pom.xml.releaseBackup 4 | pom.xml.versionsBackup 5 | pom.xml.next 6 | release.properties 7 | dependency-reduced-pom.xml 8 | buildNumber.properties 9 | 
-------------------------------------------------------------------------------- /ConceptFinder/nbactions.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | CUSTOM-skiptests 5 | skiptests 6 | 7 | clean 8 | install 9 | 10 | 11 | true 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /ConceptFinder/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | com.cyc.tool 5 | ConceptFinder 6 | jar 7 | 8 | 9 | com.cyc.tool 10 | distributedRepresentationsParent 11 | 1.0 12 | ../distributedRepresentationsParent 13 | 14 | 15 | 16 | 17 | 18 | org.codehaus.mojo 19 | license-maven-plugin 20 | 21 | 22 | org.apache.maven.plugins 23 | maven-javadoc-plugin 24 | 25 | 26 | 27 | 28 | 29 | 30 | org.mapdb 31 | mapdb 32 | 1.0.6 33 | jar 34 | 35 | 36 | net.sourceforge.owlapi 37 | owlapi-distribution 38 | 4.0.1 39 | jar 40 | 41 | 42 | com.cyc.tool 43 | OwlTools 44 | jar 45 | 46 | 47 | junit 48 | junit 49 | test 50 | jar 51 | 52 | 53 | com.cyc.tool 54 | DistributedRepresentations 55 | jar 56 | 57 | 58 | com.cyc.tool 59 | CycMapDBTools 60 | jar 61 | 62 | 63 | 64 | 65 | UTF-8 66 | 1.8 67 | 1.8 68 | 69 | 70 | -------------------------------------------------------------------------------- /ConceptFinder/src/main/java/com/cyc/tool/conceptfinder/AttachmentHypothesis.java: -------------------------------------------------------------------------------- 1 | package com.cyc.tool.conceptfinder; 2 | 3 | /* 4 | * #%L 5 | * ConceptFinder 6 | * %% 7 | * Copyright (C) 2015 Cycorp, Inc 8 | * %% 9 | * Licensed under the Apache License, Version 2.0 (the "License"); 10 | * you may not use this file except in compliance with the License. 
/**
 * An AttachmentHypothesis relates an OpenCyc concept to terms in a W2V Space.
 *
 * <p>It records the terms that were searched for, the concept URI they are proposed to attach
 * to, the score supporting that attachment and the text labels of the concept, and can render
 * itself as a CSV row or as an HTML table row.
 */
public class AttachmentHypothesis {

  int conceptID;
  String conceptURI;
  String renderedTerms;
  Double score;
  List<String> targetTerms;
  String textLabels;

  /**
   * AttachmentHypothesis constructor
   *
   * @param id identifier stored as the concept ID of this hypothesis
   * @param targetTerms the terms being attached; joined with "/" for display
   * @param conceptURI URI of the concept the terms are hypothesised to attach to
   * @param score score supporting the hypothesis
   * @param textLabels text labels of the concept
   */
  public AttachmentHypothesis(int id, List<String> targetTerms, String conceptURI, Double score, String textLabels) {
    this.conceptURI = conceptURI;
    this.score = score;
    this.textLabels = textLabels;
    this.targetTerms = targetTerms;
    this.conceptID = id;
    this.renderedTerms = String.join("/", this.targetTerms);
  }

  /**
   *
   * @return the headings for the CSV file
   */
  public static String headCSV() {
    return "ConceptID,Name,URI,Score,Strings";
  }

  /**
   * NOTE(review): the table markup in this literal was lost from the reviewed copy of the file
   * (only the concatenated cell texts remained); the th/tr tags here are a reconstruction and
   * should be confirmed against the original.
   *
   * @return the headings for the HTML table
   */
  public static String headHTMLTable() {
    return "<tr><th>ConceptID</th><th>Name</th><th>URI</th><th>Score</th><th>Strings</th></tr>";
  }

  /**
   * Commas are removed from the rendered terms and from the labels so that they cannot split a
   * column; a missing (null) value is written as an empty column.
   *
   * @return a CSV representation of the AttachmentHypothesis
   */
  public String toCSV() {
    return conceptID + "," + withoutCommas(renderedTerms) + "," + conceptURI + "," + score + ","
            + withoutCommas(textLabels);
  }

  /**
   * NOTE(review): the td/tr tags are a reconstruction, see {@link #headHTMLTable()}.
   *
   * @return an HTML representation of the AttachmentHypothesis
   */
  public String toHTMLTableTR() {
    return "<tr><td>" + conceptID + "</td><td>" + renderedTerms + "</td><td>" + conceptURI + "</td><td>" + score + "</td><td>"
            + textLabels + "</td></tr>";
  }

  @Override
  public String toString() {
    // \u27F6 is the long rightwards arrow used by the original literal.
    return renderedTerms + "[" + conceptID + "]\u27F6" + conceptURI + " (" + score + ":" + textLabels + ")";
  }

  /** Drops every comma from {@code value}; null becomes the empty string. */
  private static String withoutCommas(String value) {
    // Literal replacement: no regular expression is needed for a single character.
    return value == null ? "" : value.replace(",", "");
  }
}

27 | * ConceptFinderConfig is designed to set paths for caching and data access for this package. 28 | */ 29 | public class ConceptFinderConfig extends MapDBConfiguration { 30 | 31 | private static final String fallBackLocation = "/cyc/projects/kbTaxonomy/Experiments/ConceptFinder/"; 32 | private static final String missingConceptDBFile = "/missingConcept"; 33 | 34 | private static final String w2vDBFile = "/w2vdb"; 35 | private static final String w2vVectorFile = "/cyc/projects/kbTaxonomy/Experiments/ConceptFinder/GoogleNews-vectors-negative300.bin.gz"; 36 | private static final String word2VecVectorsMapName = "word2Vec"; 37 | 38 | /** 39 | * 40 | * @return the missingConceptDBFile location 41 | */ 42 | protected static String getMissingConceptDBFile() { 43 | return getMapDBBase(fallBackLocation) + missingConceptDBFile; 44 | } 45 | 46 | /** 47 | * 48 | * @return the w2vVectorFile 49 | */ 50 | protected static String getW2VVectorfile() { 51 | return w2vVectorFile; 52 | } 53 | 54 | /** 55 | * 56 | * @return the w2vDBFile location 57 | */ 58 | protected static String getW2vDBFile() { 59 | return getMapDBBase(fallBackLocation) + w2vDBFile; 60 | } 61 | 62 | /** 63 | * 64 | * @return the word2VecVectorsMapName 65 | */ 66 | protected static String getWord2VecVectorsMapName() { 67 | return word2VecVectorsMapName; 68 | } 69 | 70 | } 71 | -------------------------------------------------------------------------------- /ConceptFinder/src/main/java/com/cyc/tool/conceptfinder/ConceptMatch.java: -------------------------------------------------------------------------------- 1 | package com.cyc.tool.conceptfinder; 2 | 3 | /* 4 | * #%L 5 | * ConceptFinder 6 | * %% 7 | * Copyright (C) 2015 Cycorp, Inc 8 | * %% 9 | * Licensed under the Apache License, Version 2.0 (the "License"); 10 | * you may not use this file except in compliance with the License. 
11 | * You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * #L% 21 | */ 22 | 23 | import com.cyc.tool.distributedrepresentations.Word2VecSpace; 24 | import java.io.Serializable; 25 | import java.util.function.Function; 26 | 27 | /** 28 | * A ConceptMatch relates a concept to a term. 29 | */ 30 | public class ConceptMatch implements Serializable { 31 | 32 | final String concept; 33 | 34 | final double similarity; 35 | final String term; 36 | 37 | /** 38 | * ConceptMatch constructor 39 | * 40 | * @param w2v 41 | * @param search 42 | * @param term 43 | * @param noter 44 | */ 45 | public ConceptMatch(Word2VecSpace w2v, float[] search, String term, 46 | Function noter) { 47 | this.term = term; 48 | if (noter == null) { 49 | this.concept = "---"; 50 | } else { 51 | this.concept = noter.apply(term); 52 | } 53 | similarity = w2v.googleSimilarity(search, w2v.getVector(term)); 54 | } 55 | 56 | /** 57 | * ConceptMatch constructor 58 | * 59 | * @param w2v 60 | * @param search 61 | * @param term 62 | */ 63 | public ConceptMatch(Word2VecSpace w2v, float[] search, String term) { 64 | this(w2v, search, term, null); 65 | } 66 | 67 | /** 68 | * 69 | * @return the concept 70 | */ 71 | public String getConcept() { 72 | return concept; 73 | } 74 | 75 | /** 76 | * 77 | * @return the similarity 78 | */ 79 | public double getSimilarity() { 80 | return similarity; 81 | } 82 | 83 | /** 84 | * 85 | * @return the term 86 | */ 87 | public String getTerm() { 88 | return term; 89 | } 90 | 91 | @Override 92 | public String toString() { 93 | return term + ": " + similarity + ": " + (concept == null ? 
"--" : concept); 94 | } 95 | } 96 | -------------------------------------------------------------------------------- /ConceptFinder/src/main/java/com/cyc/tool/conceptfinder/ConceptSpace.java: -------------------------------------------------------------------------------- 1 | package com.cyc.tool.conceptfinder; 2 | 3 | /* 4 | * #%L 5 | * ConceptFinder 6 | * %% 7 | * Copyright (C) 2015 Cycorp, Inc 8 | * %% 9 | * Licensed under the Apache License, Version 2.0 (the "License"); 10 | * you may not use this file except in compliance with the License. 11 | * You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * #L% 21 | */ 22 | 23 | import com.cyc.tool.distributedrepresentations.Word2VecSpace; 24 | import com.cyc.tool.distributedrepresentations.Word2VecSpace.NoWordToVecVectorForTerm; 25 | import com.cyc.tool.distributedrepresentations.Word2VecSubspace; 26 | import com.cyc.tool.owltools.OpenCycOwl; 27 | import java.io.IOException; 28 | import java.util.Comparator; 29 | import java.util.List; 30 | import java.util.function.Function; 31 | import java.util.function.Predicate; 32 | import java.util.stream.Collectors; 33 | 34 | /** 35 | *

36 | * ConceptSpace provides access to a Word2VecSpace and methods for finding ConceptMatches. 37 | */ 38 | public class ConceptSpace { 39 | 40 | Word2VecSpace w2vSpace; 41 | 42 | /** 43 | * Creates a new instance of ConceptSpace. 44 | * 45 | * @param w2v 46 | * @throws java.io.IOException 47 | */ 48 | public ConceptSpace(Word2VecSpace w2v) throws IOException { 49 | w2vSpace = w2v; 50 | } 51 | 52 | /** 53 | * 54 | * @param terms 55 | * @param n 56 | * @return a List of ConceptMatches 57 | * @throws NoWordToVecVectorForTerm 58 | */ 59 | public List findNearestNFor(List terms, Integer n) throws NoWordToVecVectorForTerm { 60 | return findNearest(w2vSpace.getMaximalNormedVector(terms)) 61 | .stream() 62 | .collect(Collectors.toList()) 63 | .subList(0, n); 64 | } 65 | 66 | /** 67 | * 68 | * @param terms 69 | * @param n 70 | * @return a List of ConceptMatches 71 | * @throws NoWordToVecVectorForTerm 72 | */ 73 | public List findNearestNFor(String terms, Integer n) throws NoWordToVecVectorForTerm { 74 | return findNearestNFor(w2vSpace.stringToList(terms), n); 75 | 76 | } 77 | 78 | /** 79 | * 80 | * @param terms 81 | * @param n 82 | * @param ocyc 83 | * @return a List of ConceptMatches 84 | * @throws NoWordToVecVectorForTerm 85 | */ 86 | public List findNearestNForIn(List terms, Integer n, OpenCycOwl ocyc) throws NoWordToVecVectorForTerm { 87 | float[] norm = w2vSpace.getMaximalNormedVector(terms); 88 | return findNearestWhere(norm, m -> ocyc.knownTerm(m), t -> String.join(" | ", ocyc.conceptsFor(t))) 89 | .stream() 90 | .collect(Collectors.toList()) 91 | .subList(0, n); 92 | } 93 | 94 | /** 95 | * 96 | * @param terms 97 | * @param n 98 | * @param ocyc 99 | * @return a List of ConceptMatches 100 | * @throws NoWordToVecVectorForTerm 101 | */ 102 | public List findNearestNForIn(String terms, Integer n, OpenCycOwl ocyc) throws NoWordToVecVectorForTerm { 103 | 104 | return findNearestNForIn(w2vSpace.stringToList(terms), n, ocyc); 105 | 106 | } 107 | 108 | /** 109 | * 110 | * 
@param terms 111 | * @param n 112 | * @param ocyc 113 | * @return a List of ConceptMatches 114 | * @throws NoWordToVecVectorForTerm 115 | */ 116 | public List findNearestNForInStrictW2V(List terms, Integer n, OpenCycOwl ocyc) throws NoWordToVecVectorForTerm { 117 | float[] norm = w2vSpace.getGoogleNormedVector(terms); 118 | return findNearestWhere(norm, m -> ocyc.knownTerm(m), t -> String.join(" | ", ocyc.conceptsFor(t))) 119 | .stream() 120 | .collect(Collectors.toList()) 121 | .subList(0, n); 122 | } 123 | 124 | /** 125 | * 126 | * @param terms 127 | * @param n 128 | * @param ocyc 129 | * @return a List of ConceptMatches 130 | * @throws NoWordToVecVectorForTerm 131 | */ 132 | public List findNearestNForInStrictW2V(String terms, Integer n, OpenCycOwl ocyc) throws NoWordToVecVectorForTerm { 133 | float[] norm = w2vSpace.getGoogleNormedVector(w2vSpace.stringToList(terms)); 134 | return findNearestWhere(norm, m -> ocyc.knownTerm(m), t -> String.join(" | ", ocyc.conceptsFor(t))) 135 | .stream() 136 | .collect(Collectors.toList()) 137 | .subList(0, n); 138 | } 139 | 140 | /** 141 | * Find the position of terms in the larger space from which this is derived a larger space, and 142 | * then search around them in a this space that spans fewer terms, but is otherwise the same 143 | * 144 | * Will fail if the space for this concept space is not a SubSpace 145 | * 146 | * @param terms The string containing a set of terms to search around 147 | * @param n How many things to find in this space 148 | * @param note 149 | * @return a List of ConceptMatches 150 | * @throws com.cyc.tool.distributedrepresentations.Word2VecSpace.NoWordToVecVectorForTerm 151 | */ 152 | public List findNearestNForPosition(String terms, Integer n, Function note) throws NoWordToVecVectorForTerm { 153 | return findNearestNForPosition(w2vSpace.stringToList(terms), 154 | n, note); 155 | } 156 | 157 | /** 158 | * Find the position of terms in the larger space from which this is derived a larger space, and 
159 | * then search around them in a this space that spans fewer terms, but is otherwise the same 160 | * 161 | * Will fail if the space for this concept space is not a SubSpace 162 | * 163 | * @param terms The string containing a set of terms to search around 164 | * @param n How many things to find in this space 165 | * @param note 166 | * @return a List of ConceptMatches 167 | * @throws com.cyc.tool.distributedrepresentations.Word2VecSpace.NoWordToVecVectorForTerm 168 | */ 169 | public List findNearestNForPosition(List terms, Integer n, Function note) throws NoWordToVecVectorForTerm { 170 | Word2VecSpace posSpace = ((Word2VecSubspace) w2vSpace).getSuperSpace(); 171 | return findNearestNForPosition(terms, 172 | posSpace, n, note); 173 | } 174 | 175 | /** 176 | * Find the position of terms in a larger space, and then search around them in a space that spans 177 | * fewer terms, but is otherwise the same 178 | * 179 | * @param terms The string containing a set of terms to search around 180 | * @param posSpace The other larger space in which to search for those terms. 181 | * @param n How many things to find in this space 182 | * @param note 183 | * @return a List of ConceptMatches 184 | * @throws com.cyc.tool.distributedrepresentations.Word2VecSpace.NoWordToVecVectorForTerm 185 | */ 186 | public List findNearestNForPosition(String terms, Word2VecSpace posSpace, Integer n, Function note) throws NoWordToVecVectorForTerm { 187 | return findNearestNForPosition(w2vSpace.stringToList(terms), 188 | posSpace, n, note); 189 | 190 | } 191 | 192 | /** 193 | * Find the position of terms in a larger space, and then search around them in a space that spans 194 | * fewer terms, but is otherwise the same 195 | * 196 | * @param terms The list of terms to search around 197 | * @param posSpace The other larger space in which to search for those terms. 
198 | * @param n How many things to find in this space 199 | * @param note 200 | * @return a List of ConceptMatches 201 | * @throws com.cyc.tool.distributedrepresentations.Word2VecSpace.NoWordToVecVectorForTerm 202 | */ 203 | public List findNearestNForPosition(List terms, Word2VecSpace posSpace, Integer n, Function note) throws NoWordToVecVectorForTerm { 204 | return findNearest(posSpace.getMaximalNormedVector(terms), note) 205 | .stream() 206 | .collect(Collectors.toList()) 207 | .subList(0, n); 208 | } 209 | 210 | /** 211 | * 212 | * @param terms 213 | * @param n 214 | * @return a List of ConceptMatches 215 | * @throws NoWordToVecVectorForTerm 216 | */ 217 | public List findNearestNForStrictW2V(List terms, Integer n) throws NoWordToVecVectorForTerm { 218 | return findNearest(w2vSpace.getGoogleNormedVector(terms)) 219 | .stream() 220 | .collect(Collectors.toList()) 221 | .subList(0, n); 222 | } 223 | 224 | /** 225 | * 226 | * @param terms 227 | * @param n 228 | * @return a List of ConceptMatches 229 | * @throws NoWordToVecVectorForTerm 230 | */ 231 | public List findNearestNForWithInputTermFiltering(List terms, Integer n) throws NoWordToVecVectorForTerm { 232 | return findNearest(w2vSpace.getMaximalNormedVector(terms)) 233 | .stream() 234 | .filter(m -> !terms.contains(m.getTerm())) // the google code removes any search term 235 | .collect(Collectors.toList()) 236 | .subList(0, n); 237 | } 238 | 239 | /** 240 | * 241 | * @param terms 242 | * @param n 243 | * @return a List of ConceptMatches 244 | * @throws NoWordToVecVectorForTerm 245 | */ 246 | public List findNearestNForWithInputTermFilteringStrictW2V(List terms, Integer n) throws NoWordToVecVectorForTerm { 247 | return findNearest(w2vSpace.getGoogleNormedVector(terms)) 248 | .stream() 249 | .filter(m -> !terms.contains(m.getTerm())) // the google code removes any search term 250 | .collect(Collectors.toList()) 251 | .subList(0, n); 252 | } 253 | 254 | /** 255 | * 256 | * @return the w2vSpace 257 | */ 258 | 
public Word2VecSpace getW2VSpace() { 259 | return w2vSpace; 260 | } 261 | 262 | private List findNearest(float[] searchVector, Function note) { 263 | Comparator compareDouble 264 | = (Double m1, Double m2) -> Double.compare(m2, m1); 265 | 266 | Comparator compareMatches 267 | = (ConceptMatch m1, ConceptMatch m2) -> Double.compare(m2.getSimilarity(), m1.getSimilarity()); 268 | 269 | // This is a massive sort (3m elements) so it might be better to optimise 270 | // for top N 271 | return w2vSpace.getVectors().keySet().stream() 272 | .map(s -> new ConceptMatch(w2vSpace, searchVector, s, note)) 273 | .sorted(compareMatches).collect(Collectors.toList()); 274 | } 275 | 276 | private List findNearest(float[] searchVector) { 277 | return findNearest(searchVector, null); 278 | } 279 | 280 | private List findNearestWhere(float[] searchVector, Predicate pred, Function note) { 281 | Comparator compareMatches 282 | = (ConceptMatch m1, ConceptMatch m2) -> Double.compare(m2.getSimilarity(), m1.getSimilarity()); 283 | // This is a massive sort (3m elements) so it might be better to optimise 284 | // for top N 285 | return w2vSpace.getVectors().keySet().parallelStream() 286 | .filter(pred) 287 | .map(s -> new ConceptMatch(w2vSpace, searchVector, s, note)) 288 | .sorted(compareMatches).collect(Collectors.toList()); 289 | } 290 | 291 | } 292 | -------------------------------------------------------------------------------- /ConceptFinder/src/main/java/com/cyc/tool/conceptfinder/DefaultConceptFinderConfig.java: -------------------------------------------------------------------------------- 1 | package com.cyc.tool.conceptfinder; 2 | 3 | /* 4 | * #%L 5 | * ConceptFinder 6 | * %% 7 | * Copyright (C) 2015 Cycorp, Inc 8 | * %% 9 | * Licensed under the Apache License, Version 2.0 (the "License"); 10 | * you may not use this file except in compliance with the License. 
11 | * You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * #L% 21 | */ 22 | 23 | /** 24 | * Default configuration for ConceptFinder. 25 | */ 26 | public class DefaultConceptFinderConfig extends ConceptFinderConfig { 27 | 28 | private static final String conceptsForMissingTermsNameDefault = "missingTermConceptsDefault"; 29 | private static final String missingTermMapNameDefault = "missingTermsDefault"; 30 | 31 | /** 32 | * 33 | * @return the conceptsForMissingTermsNameDefault 34 | */ 35 | protected static String getConceptsForMissingTermsName() { 36 | return conceptsForMissingTermsNameDefault; 37 | } 38 | 39 | /** 40 | * 41 | * @return the missingTermMapNameDefault 42 | */ 43 | protected static String getMissingTermMapName() { 44 | return missingTermMapNameDefault; 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /ConceptFinder/src/main/java/com/cyc/tool/conceptfinder/MissingConceptFinder.java: -------------------------------------------------------------------------------- 1 | package com.cyc.tool.conceptfinder; 2 | 3 | /* 4 | * #%L 5 | * ConceptFinder 6 | * %% 7 | * Copyright (C) 2015 Cycorp, Inc 8 | * %% 9 | * Licensed under the Apache License, Version 2.0 (the "License"); 10 | * you may not use this file except in compliance with the License. 
11 | * You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * #L% 21 | */ 22 | 23 | import com.cyc.tool.distributedrepresentations.Word2VecSpace; 24 | import com.cyc.tool.owltools.OpenCycOwl; 25 | import java.io.File; 26 | import java.io.IOException; 27 | import java.util.ArrayList; 28 | import java.util.Arrays; 29 | import java.util.HashMap; 30 | import java.util.HashSet; 31 | import java.util.List; 32 | import java.util.Map; 33 | import java.util.Set; 34 | import java.util.concurrent.ConcurrentNavigableMap; 35 | import java.util.function.Predicate; 36 | import java.util.stream.Collectors; 37 | import java.util.stream.IntStream; 38 | import java.util.stream.Stream; 39 | import org.mapdb.DB; 40 | import org.mapdb.DBMaker; 41 | import org.semanticweb.owlapi.model.OWLOntologyCreationException; 42 | 43 | /** 44 | * Methods for finding missing concepts with a ConceptSpace, a Word2VecSpace, and OpenCyc. 45 | */ 46 | abstract public class MissingConceptFinder { 47 | 48 | final private ConceptSpace cSpace; 49 | final private OpenCycOwl ocyc; 50 | private final Word2VecSpace w2vs; 51 | ConcurrentNavigableMap> conceptsForMissingTerms; 52 | DB db; 53 | List missingConceptNames; 54 | List missingMappingNames; 55 | ConcurrentNavigableMap missingTerms; 56 | 57 | /** 58 | * MissingConceptFinder constructor. 
59 | * 60 | * @param w2v 61 | * @param oco 62 | * @throws IOException 63 | * @throws OWLOntologyCreationException 64 | */ 65 | public MissingConceptFinder(Word2VecSpace w2v, OpenCycOwl oco) throws IOException, OWLOntologyCreationException { 66 | this(w2v, oco, null); 67 | } 68 | 69 | /** 70 | * MissingConceptFinder constructor. 71 | * 72 | * @param w2v 73 | * @param oco 74 | * @param cSpace 75 | * @throws IOException 76 | * @throws OWLOntologyCreationException 77 | */ 78 | public MissingConceptFinder(Word2VecSpace w2v, OpenCycOwl oco, ConceptSpace cSpace) throws IOException, OWLOntologyCreationException { 79 | w2vs = w2v; 80 | ocyc = oco; 81 | this.cSpace = cSpace; 82 | db = DBMaker.newFileDB(new File(ConceptFinderConfig.getMissingConceptDBFile())) 83 | .closeOnJvmShutdown() 84 | // .encryptionEnable("password") 85 | .make(); 86 | 87 | //Use this to reset 88 | // missingTerms.clear(); db.commit(); 89 | } 90 | 91 | /** 92 | * 93 | * @return a List of Strings 94 | */ 95 | public List conceptsWithTerms() { 96 | return this.getConceptsForMissingTerms().keySet().stream() 97 | .map(i -> Arrays.asList(getMissingTerms().get(i)) 98 | .stream() 99 | .collect(Collectors.joining("|"))) 100 | .collect(Collectors.toList()); 101 | } 102 | 103 | /** 104 | * @return the conceptsForMissingTerms 105 | */ 106 | public ConcurrentNavigableMap> getConceptsForMissingTerms() { 107 | return conceptsForMissingTerms; 108 | } 109 | 110 | /** 111 | * @param conceptsForMissingTerms the conceptsForMissingTerms to set 112 | */ 113 | public void setConceptsForMissingTerms(ConcurrentNavigableMap> conceptsForMissingTerms) { 114 | this.conceptsForMissingTerms = conceptsForMissingTerms; 115 | } 116 | 117 | /** 118 | * @return the db 119 | */ 120 | public DB getDb() { 121 | return db; 122 | } 123 | 124 | /** 125 | * @return the missingConceptNames 126 | */ 127 | public List getMissingConceptNames() { 128 | return missingConceptNames; 129 | } 130 | 131 | /** 132 | * @param missingConceptNames the 
missingConceptNames to set 133 | */ 134 | public void setMissingConceptNames(List missingConceptNames) { 135 | this.missingConceptNames = missingConceptNames; 136 | } 137 | 138 | /** 139 | * @return the missingMappingNames 140 | */ 141 | public List getMissingMappingNames() { 142 | return missingMappingNames; 143 | } 144 | 145 | /** 146 | * @param missingMappingNames the missingMappingNames to set 147 | */ 148 | public void setMissingMappingNames(List missingMappingNames) { 149 | this.missingMappingNames = missingMappingNames; 150 | } 151 | 152 | /** 153 | * 154 | * @return the missingTerms 155 | */ 156 | public ConcurrentNavigableMap getMissingTerms() { 157 | return missingTerms; 158 | } 159 | 160 | /** 161 | * @param missingTerms the missingTerms to set 162 | */ 163 | public void setMissingTerms(ConcurrentNavigableMap missingTerms) { 164 | this.missingTerms = missingTerms; 165 | } 166 | 167 | /** 168 | * 169 | * @return the number of missing concepts 170 | */ 171 | public int missingConceptCount() { 172 | return getMissingConceptNames().size(); 173 | } 174 | 175 | /** 176 | * 177 | * @param testCase 178 | * @return a Set of AttachmentHypotheses 179 | */ 180 | protected Set findNearbyTermsWithGraphCore(String testCase) { 181 | return findNearbyTermsWithGraphCore(testCase, -1); 182 | } 183 | 184 | /** 185 | * 186 | * @param termStrings 187 | * @param n 188 | * @return a Set of AttachmentHypotheses 189 | */ 190 | protected Set 191 | findNearbyTermsWithGraphCore(List termStrings, int n) { 192 | long t1 = System.currentTimeMillis(); 193 | Set hypotheses = new HashSet<>(); 194 | 195 | Set allTypes = new HashSet<>(); 196 | Map typeWeights = new HashMap<>(); 197 | 198 | Map conceptEvidence = new HashMap<>(); 199 | System.out.print("====" + String.join("/", termStrings) + "====" + (n < 0 ? 
"" : " " + n) + " \t"); 200 | List matches = new ArrayList<>(); 201 | for (String term : termStrings) { 202 | try { 203 | matches.addAll(cSpace.findNearestNForIn(term, 40, ocyc)); 204 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) { 205 | } 206 | } 207 | if (matches.size() == 0) { 208 | // assertEquals("common_eiders", matches.get(10).term); 209 | System.out.println("Terms [" + termStrings + "] have no words in Word2Vec"); 210 | return hypotheses; // which is empty at this point 211 | // fail("took unexpected exception:" + ex); 212 | } 213 | IntStream.range(0, matches.size()) 214 | .forEach(i -> { 215 | ConceptMatch m = matches.get(i); 216 | //System.out.println(i + " " + m.toString()); 217 | if (m.concept != null) { 218 | allTypes.add(m.concept); 219 | typeWeights.put(m.concept, 220 | (typeWeights.containsKey(m.concept) ? typeWeights.get(m.concept) : 0.0d) 221 | + m.similarity); 222 | } 223 | }); 224 | allTypes.forEach(s -> { 225 | Double weight = typeWeights.get(s); 226 | Set transTypes = ocyc.getTypesTransitiveURL(s); 227 | Set immedTypes = ocyc.getTypesURL(s); 228 | 229 | Set ret 230 | = Stream.concat( 231 | transTypes 232 | .stream() 233 | .filter(type -> allTypes.contains(type)), 234 | immedTypes.stream() 235 | ).collect(Collectors.toSet()); 236 | 237 | if (!ret.isEmpty()) { 238 | ret.forEach(t -> { 239 | if (!conceptEvidence.containsKey(t)) { 240 | conceptEvidence.put(t, weight); 241 | } else { 242 | conceptEvidence.put(t, conceptEvidence.get(t) + weight); 243 | } 244 | }); 245 | 246 | } 247 | }); 248 | 249 | final double max = conceptEvidence.entrySet().stream() 250 | .mapToDouble(e -> e.getValue()).max().orElse(0); 251 | 252 | Set maxc = conceptEvidence.entrySet().stream() 253 | .filter(e -> e.getValue() == max) 254 | .map(e -> e.getKey()).collect(Collectors.toSet()); 255 | System.out.println("Maximum parent count:" + max); 256 | System.out.println("Maximal parents:" 257 | + maxc.stream().map(s -> ocyc.labelsForConcept(s) + ": " + s) 258 | 
.collect(Collectors.joining("\n\t"))); 259 | maxc.forEach(c -> hypotheses.add(new AttachmentHypothesis(n, termStrings, 260 | c, max, ocyc.labelsForConcept(c)))); 261 | System.out.println("-----" + (System.currentTimeMillis() - t1) + "ms -----"); 262 | return hypotheses; // Since we take the max of a double, there should be only one 263 | } 264 | 265 | /** 266 | * 267 | * @param testCase 268 | * @param n 269 | * @return a Set of AttachmentHypotheses 270 | * @deprecated 271 | */ 272 | @Deprecated 273 | protected Set findNearbyTermsWithGraphCore(String testCase, int n) { 274 | List termStrings = new ArrayList<>(); 275 | termStrings.add(testCase); 276 | return findNearbyTermsWithGraphCore(termStrings, n); 277 | 278 | } 279 | 280 | /** 281 | * 282 | * @return a List of names in the W2V space 283 | * @deprecated 284 | */ 285 | @Deprecated //Depends on a variable that is only set in an initialisation phase 286 | protected List namesInW2V() { 287 | if (getMissingMappingNames() == null) { 288 | return null; 289 | } 290 | return getMissingMappingNames().stream() 291 | .filter(hasElementInW2V()) 292 | .map(a -> a[0]) 293 | .collect(Collectors.toList()); 294 | } 295 | 296 | Predicate hasElementInW2V() { 297 | return a -> Arrays.stream(a) 298 | .anyMatch(w2vs::knownTerm); 299 | } 300 | 301 | } 302 | -------------------------------------------------------------------------------- /ConceptFinder/src/main/java/com/cyc/tool/conceptfinder/MissingConceptFinderDefault.java: -------------------------------------------------------------------------------- 1 | package com.cyc.tool.conceptfinder; 2 | 3 | /* 4 | * #%L 5 | * ConceptFinder 6 | * %% 7 | * Copyright (C) 2015 Cycorp, Inc 8 | * %% 9 | * Licensed under the Apache License, Version 2.0 (the "License"); 10 | * you may not use this file except in compliance with the License. 
11 | * You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * #L% 21 | */ 22 | 23 | import com.cyc.tool.distributedrepresentations.Word2VecSpace; 24 | import com.cyc.tool.owltools.OpenCycOwl; 25 | import java.io.IOException; 26 | import java.util.ArrayList; 27 | import java.util.Arrays; 28 | import java.util.List; 29 | import java.util.stream.Collectors; 30 | import java.util.stream.IntStream; 31 | import org.semanticweb.owlapi.model.OWLOntologyCreationException; 32 | 33 | /** 34 | * The default implementation for MissingConceptFinder. 35 | */ 36 | public class MissingConceptFinderDefault extends MissingConceptFinder { 37 | 38 | static final boolean reset = true; 39 | String[][] conceptStrings = {{"Facebook", "the Facebook"}, 40 | {"telephone microphone"}, 41 | {"telephone speaker"}, 42 | {"backhoe"}, 43 | {"facial scar", "scar on face"}, 44 | {"blue eyes"}, 45 | {"saluting the flag"}, 46 | {"muddy paws"}, 47 | {"strong muscles"}, 48 | {"pan balance"}, 49 | {"graduated cylinder"}, 50 | {"tape measure"}, 51 | {"hand lens"}, 52 | {"measuring cup"} 53 | }; 54 | List conceptsToLookFor = Arrays.asList(conceptStrings); 55 | 56 | /** 57 | * MissingConceptFinderDefault constructor 58 | * 59 | * @param w2v 60 | * @param oco 61 | * @throws IOException 62 | * @throws OWLOntologyCreationException 63 | */ 64 | public MissingConceptFinderDefault(Word2VecSpace w2v, OpenCycOwl oco) throws IOException, OWLOntologyCreationException { 65 | this(w2v, oco, null); 66 | } 67 | 68 | /** 69 | * MissingConceptFinderDefault constructor 70 | * 71 | * @param w2v 72 | * @param oco 73 | * @param cs 74 | * 
@throws IOException 75 | * @throws OWLOntologyCreationException 76 | */ 77 | public MissingConceptFinderDefault(Word2VecSpace w2v, OpenCycOwl oco, ConceptSpace cs) throws IOException, OWLOntologyCreationException { 78 | super(w2v, oco, cs); 79 | missingTerms = db.getTreeMap(DefaultConceptFinderConfig.getMissingTermMapName()); 80 | conceptsForMissingTerms = db.getTreeMap(DefaultConceptFinderConfig.getConceptsForMissingTermsName()); 81 | if (reset) { 82 | missingTerms.clear(); 83 | } 84 | if (missingTerms.isEmpty()) { 85 | conceptsForMissingTerms.clear(); 86 | OpenCycOwl oc = new OpenCycOwl(); 87 | 88 | missingMappingNames = conceptsToLookFor; 89 | missingConceptNames = missingMappingNames.stream() 90 | .filter(oc.noConcept()) 91 | .collect(Collectors.toList()); 92 | IntStream.range(0, missingConceptNames.size()) 93 | .forEach(i -> missingTerms.put(i, missingConceptNames.get(i))); 94 | db.commit(); 95 | db.compact(); 96 | oc.close(); 97 | 98 | } else { 99 | missingConceptNames = new ArrayList<>(); 100 | missingTerms.keySet().forEach(k -> missingConceptNames.add(missingTerms.get(k))); 101 | } 102 | } 103 | 104 | } 105 | -------------------------------------------------------------------------------- /ConceptFinder/src/test/java/com/cyc/tool/conceptfinder/ConceptSpaceIT.java: -------------------------------------------------------------------------------- 1 | package com.cyc.tool.conceptfinder; 2 | 3 | /* 4 | * #%L 5 | * ConceptFinder 6 | * %% 7 | * Copyright (C) 2015 Cycorp, Inc 8 | * %% 9 | * Licensed under the Apache License, Version 2.0 (the "License"); 10 | * you may not use this file except in compliance with the License. 11 | * You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * #L% 21 | */ 22 | 23 | import com.cyc.tool.distributedrepresentations.GoogleNewsW2VSpace; 24 | import com.cyc.tool.distributedrepresentations.Word2VecSpace; 25 | import java.io.IOException; 26 | import java.util.Arrays; 27 | import java.util.List; 28 | import java.util.stream.IntStream; 29 | import org.junit.After; 30 | import org.junit.AfterClass; 31 | import static org.junit.Assert.assertEquals; 32 | import static org.junit.Assert.fail; 33 | import org.junit.Before; 34 | import org.junit.BeforeClass; 35 | import org.junit.Test; 36 | import org.semanticweb.owlapi.model.OWLOntologyCreationException; 37 | 38 | /** 39 | * ConceptSpace tests. 40 | */ 41 | public class ConceptSpaceIT { 42 | 43 | static List cr = Arrays.asList("Chinese", "river"); 44 | static ConceptSpace mySpace; 45 | 46 | public ConceptSpaceIT() { 47 | } 48 | 49 | @BeforeClass 50 | 51 | public static void setUpClass() throws IOException, OWLOntologyCreationException { 52 | mySpace = new ConceptSpace(GoogleNewsW2VSpace.get()); 53 | 54 | } 55 | 56 | @AfterClass 57 | 58 | public static void tearDownClass() { 59 | mySpace = null; 60 | } 61 | 62 | @Test 63 | public void findNearbyTerms1() { 64 | try { 65 | long t1 = System.currentTimeMillis(); 66 | List matches = mySpace.findNearestNForWithInputTermFiltering(cr, 40); 67 | IntStream.range(0, matches.size()) 68 | .forEach(i -> { 69 | System.out.println(i + " " + matches.get(i).toString()); 70 | }); 71 | System.out.println("Took " + (System.currentTimeMillis() - t1) + "ms"); 72 | assertEquals(matches.get(0).getTerm(), "Yangtze_River"); 73 | assertEquals(0.6047259562339493, matches.get(5).getSimilarity(), 0.000001); 74 | 75 | assertEquals(matches.get(23).getTerm(), "rivers"); 76 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) { 77 | fail("took unexpected exception:" + ex); 78 | } 79 | } 80 | 81 | @Test 82 | public void findNearbyTerms2() 
{ 83 | try { 84 | long t1 = System.currentTimeMillis(); 85 | List matches = mySpace.findNearestNForWithInputTermFiltering(Arrays.asList("gangplank"), 40); 86 | IntStream.range(0, matches.size()) 87 | .forEach(i -> { 88 | System.out.println(i + " " + matches.get(i).toString()); 89 | }); 90 | System.out.println("Took " + (System.currentTimeMillis() - t1) + "ms"); 91 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) { 92 | fail("took unexpected exception:" + ex); 93 | } 94 | } 95 | 96 | @Before 97 | public void setUp() { 98 | } 99 | 100 | @After 101 | public void tearDown() { 102 | } 103 | 104 | } 105 | -------------------------------------------------------------------------------- /ConceptFinder/src/test/java/com/cyc/tool/conceptfinder/MissingConceptFinderIT.java: -------------------------------------------------------------------------------- 1 | package com.cyc.tool.conceptfinder; 2 | 3 | /* 4 | * #%L 5 | * ConceptFinder 6 | * %% 7 | * Copyright (C) 2015 Cycorp, Inc 8 | * %% 9 | * Licensed under the Apache License, Version 2.0 (the "License"); 10 | * you may not use this file except in compliance with the License. 11 | * You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 
20 | * #L% 21 | */ 22 | 23 | import com.cyc.tool.distributedrepresentations.GoogleNewsW2VSpace; 24 | import com.cyc.tool.distributedrepresentations.Word2VecSpace; 25 | import com.cyc.tool.owltools.OpenCycOwl; 26 | import java.io.IOException; 27 | import java.util.Arrays; 28 | import java.util.HashSet; 29 | import java.util.List; 30 | import java.util.Set; 31 | import java.util.stream.Collectors; 32 | import java.util.stream.IntStream; 33 | import org.junit.AfterClass; 34 | import static org.junit.Assert.assertEquals; 35 | import static org.junit.Assert.assertTrue; 36 | import static org.junit.Assert.fail; 37 | import org.junit.BeforeClass; 38 | import org.junit.Test; 39 | import org.semanticweb.owlapi.model.OWLOntologyCreationException; 40 | 41 | /** 42 | * MissingConceptFinder tests. 43 | */ 44 | public class MissingConceptFinderIT { 45 | 46 | static ConceptSpace cSpace; 47 | static List cr = Arrays.asList("Chinese", "river"); 48 | static MissingConceptFinder mcf; 49 | static Word2VecSpace mySpace; 50 | static OpenCycOwl ocyc; 51 | static List pelagicBird = Arrays.asList("pelagic", "bird"); 52 | 53 | public MissingConceptFinderIT() { 54 | } 55 | 56 | @BeforeClass 57 | public static void setUpClass() throws IOException, OWLOntologyCreationException { 58 | mySpace = GoogleNewsW2VSpace.get(); 59 | cSpace = new ConceptSpace(mySpace); 60 | ocyc = new OpenCycOwl(); 61 | mcf = new MissingConceptFinderDefault(mySpace, ocyc, cSpace); 62 | } 63 | 64 | @AfterClass 65 | public static void tearDownClass() { 66 | mySpace = null; 67 | ocyc.close(); 68 | } 69 | private static String set2String(Set s) { 70 | if (s.size()>10) return ""; 71 | return s.stream() 72 | .map(i->{return String.join(",", mcf.getMissingTerms().get(i));}) 73 | .collect(Collectors.joining(";")); 74 | 75 | } 76 | 77 | @Test 78 | public void conceptsWithTermsTest() { 79 | List res = mcf.conceptsWithTerms(); 80 | System.out.println("There are " + res.size() + " missing concepts with associated KB terms: " + 
res); 81 | assertTrue(res.size() + "elements expected none", res.size() == 0); 82 | // assertTrue(res.containsAll(Arrays.asList("start", "rust", "blueberry"))); 83 | } 84 | 85 | @Test 86 | public void findNearbyTerms1() { 87 | long t1 = System.currentTimeMillis(); 88 | System.out.println("FNT1"); 89 | List matches; 90 | try { 91 | matches = cSpace.findNearestNForIn(cr, 40, ocyc); 92 | IntStream.range(0, matches.size()) 93 | .forEach(i -> { 94 | System.out.println(i + " " + matches.get(i).toString()); 95 | }); 96 | System.out.println("Took " + (System.currentTimeMillis() - t1) + "ms"); 97 | assertEquals("Chinese", matches.get(0).term); 98 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) { 99 | fail("took unexpected exception:" + ex); 100 | } 101 | } 102 | 103 | @Test 104 | public void findNearbyTerms2() { 105 | try { 106 | long t1 = System.currentTimeMillis(); 107 | System.out.println("FNT2"); 108 | List matches = cSpace.findNearestNForIn(cr, 40, ocyc); 109 | IntStream.range(0, matches.size()) 110 | .forEach(i -> { 111 | System.out.println(i + " " + matches.get(i).toString()); 112 | }); 113 | System.out.println("Took " + (System.currentTimeMillis() - t1) + "ms"); 114 | 115 | assertEquals(0.5539201713461387, matches.get(13).similarity, 0.000001); 116 | 117 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) { 118 | fail("took unexpected exception:" + ex); 119 | 120 | } 121 | 122 | } 123 | 124 | @Test 125 | public void findNearbyTerms3() { 126 | try { 127 | long t1 = System.currentTimeMillis(); 128 | System.out.println("FNT3"); 129 | List matches = cSpace.findNearestNForIn(cr, 40, ocyc); 130 | IntStream.range(0, matches.size()) 131 | .forEach(i -> { 132 | System.out.println(i + " " + matches.get(i).toString()); 133 | }); 134 | System.out.println("Took " + (System.currentTimeMillis() - t1) + "ms"); 135 | 136 | assertEquals("creek", matches.get(7).term); 137 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) { 138 | fail("took unexpected exception:" + ex); 139 
| } 140 | } 141 | 142 | @Test 143 | public void findNearbyTerms4() { 144 | try { 145 | long t1 = System.currentTimeMillis(); 146 | System.out.println("FNT4"); 147 | List matches = cSpace.findNearestNForIn(cr, 40, ocyc); 148 | IntStream.range(0, matches.size()) 149 | .forEach(i -> { 150 | System.out.println(i + " " + matches.get(i).toString()); 151 | }); 152 | System.out.println("Took " + (System.currentTimeMillis() - t1) + "ms"); 153 | 154 | assertEquals("riverbank", matches.get(12).term); 155 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) { 156 | fail("took unexpected exception:" + ex); 157 | } 158 | } 159 | 160 | @Test 161 | public void findNearbyTermsWithGraphListTest() { 162 | System.out.println("FNT WG 3"); 163 | IntStream.rangeClosed(3, 6) 164 | .forEach(ti -> { 165 | Arrays.asList(mcf.getMissingTerms().get(ti)) 166 | .forEach((String ss) -> { 167 | mcf.findNearbyTermsWithGraphCore(ss, ti); 168 | }); 169 | }); 170 | assertTrue(true); 171 | } 172 | 173 | @Test 174 | public void findNearbyTermsWithGraphTest1() { 175 | System.out.println("FNT WG 1"); 176 | mcf.findNearbyTermsWithGraphCore("pelagic bird"); 177 | assertTrue(true); 178 | } 179 | 180 | @Test 181 | public void findNearbyTermsWithGraphTest2(){ 182 | System.out.println("FNT WG 2"); 183 | mcf.findNearbyTermsWithGraphCore("tobacco shop"); 184 | assertTrue(true); 185 | } 186 | 187 | @Test 188 | public void findNearbyTermsWithGraphTest3() { 189 | System.out.println("FNT WG 3"); 190 | mcf.findNearbyTermsWithGraphCore("pelagic bird"); 191 | mcf.findNearbyTermsWithGraphCore("tobacco shop"); 192 | mcf.findNearbyTermsWithGraphCore("net melon"); 193 | mcf.findNearbyTermsWithGraphCore("glowworm"); 194 | mcf.findNearbyTermsWithGraphCore("tightrope walking"); 195 | mcf.findNearbyTermsWithGraphCore("Adelie penguin"); 196 | assertTrue(true); 197 | } 198 | 199 | @Test 200 | public void findNearbyTermsWithGraphTest4() { 201 | System.out.println("FNT WG 4"); 202 | 203 | Set hyp = 
mcf.findNearbyTermsWithGraphCore("Adelie penguin"); 204 | System.out.println("HYP" + hyp); 205 | assertEquals(1, hyp.size()); 206 | } 207 | 208 | @Test 209 | public void findSomeMissingTerms1() { 210 | IntStream.rangeClosed(0, 3) 211 | .forEach(ti -> { 212 | Arrays.asList(mcf.getMissingTerms().get(ti)) 213 | .forEach((String ss) -> { 214 | lookItUpWithOcyc(ss); 215 | }); 216 | }); 217 | assertTrue(true); 218 | } 219 | 220 | @Test 221 | public void findSomeMissingTerms2() { 222 | IntStream.of(1, 5, 7) 223 | //See https://docs.google.com/a/cyc.com/document/d/1Lwi21-yxcC0DGKJMcc4GFN3M_DBzEDAcSNjYufCRIfE/edit 224 | .forEach(ti -> { 225 | Arrays.asList(mcf.getMissingTerms().get(ti)) 226 | .forEach((String ss) -> { 227 | lookItUpWithOcyc(ss); 228 | }); 229 | }); 230 | assertTrue(true); 231 | } 232 | 233 | @Test 234 | public void findSomeMissingTerms3() { 235 | IntStream.of(2, 3, 6) 236 | //See https://docs.google.com/a/cyc.com/document/d/1Lwi21-yxcC0DGKJMcc4GFN3M_DBzEDAcSNjYufCRIfE/edit 237 | .forEach(ti -> { 238 | Arrays.asList(mcf.getMissingTerms().get(ti)) 239 | .forEach((String ss) -> { 240 | lookItUpAllW2V(ss); 241 | }); 242 | }); 243 | assertTrue(true); 244 | } 245 | 246 | @Test 247 | public void howManyMissingTermsInW2V() throws IOException { 248 | final Set found = new HashSet<>(); 249 | final Set foundSpace = new HashSet<>(); 250 | final Set unfound = new HashSet<>(); 251 | 252 | mcf.getMissingTerms().keySet().forEach(i -> { 253 | Arrays.asList(mcf.getMissingTerms().get(i)) 254 | .forEach((String ss) -> { 255 | if (mySpace.knownTerm(ss)) { 256 | found.add(i); 257 | if (ss.contains(" ")) { 258 | foundSpace.add(i); 259 | } 260 | } else { 261 | unfound.add(i); 262 | } 263 | }); 264 | }); 265 | System.out.println("Found directly in W2V : " + found.size()+" "+set2String(found)); 266 | System.out.println("Found directly in W2V with space: " + foundSpace.size()+" "+set2String(foundSpace)); 267 | System.out.println("Not found in W2V : " + unfound.size()+" 
"+set2String(unfound)); 268 | assertEquals(2, foundSpace.size()); 269 | assertEquals(8, unfound.size()); 270 | } 271 | 272 | @Test 273 | public void listSomeTest() { 274 | IntStream.rangeClosed(0, 8) 275 | .forEach(i -> { 276 | System.out.println(i + ":\t" + String.join(", ", 277 | Arrays.asList(mcf.getMissingTerms().get(i)))); 278 | }); 279 | assertTrue(true); 280 | } 281 | 282 | // @Test 283 | // public void namesInW2VTest() { 284 | // List res; 285 | // res = mcf.namesInW2V(); 286 | // assertEquals(12343, res.size()); 287 | // } 288 | @Test 289 | public void missingConceptCountTest() { 290 | assertEquals(9, mcf.missingConceptCount()); 291 | } 292 | 293 | private void lookItUpAllW2V(String ss) { 294 | try { 295 | System.out.println("=======[" + ss + "]======="); 296 | long t1 = System.currentTimeMillis(); 297 | List matches 298 | = cSpace.findNearestNFor(Arrays.asList(ss.split("\\s+")), 40); 299 | 300 | System.out.println("Matches:" + (matches == null ? "null" : matches.size())); 301 | IntStream.range(0, matches.size()) 302 | .forEach(i -> { 303 | String matchTerm = matches.get(i).term; 304 | String mat = matches.get(i).toString(); 305 | if (ocyc.knownTerm(matchTerm)) { 306 | // System.out.println("Known:" +matchTerm); 307 | // System.out.println("Match is: "+ocyc.conceptsFor(matchTerm)); 308 | mat = mat.replace("---", 309 | String.join(" | ", ocyc.conceptsFor(matchTerm))); 310 | } 311 | System.out.println(i + " " + mat); 312 | }); 313 | System.out.println("Took " + (System.currentTimeMillis() - t1) + "ms"); 314 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) { 315 | System.out.println("--- position not known in word to vec space:[" + ss + "]"); 316 | // Logger.getLogger(MissingVideoConceptFinderTest.class.getName()).log(Level.INFO, null, ex); 317 | } 318 | } 319 | 320 | private void lookItUpWithOcyc(String ss) { 321 | try { 322 | System.out.println("=======[" + ss + "]======="); 323 | long t1 = System.currentTimeMillis(); 324 | List matches 325 | = 
cSpace.findNearestNForIn(Arrays.asList(ss.split("\\s+")), 40, ocyc); 326 | 327 | System.out.println("Matches:" + (matches == null ? "null" : matches.size())); 328 | IntStream.range(0, matches.size()) 329 | .forEach(i -> { 330 | System.out.println(i + " " + matches.get(i).toString()); 331 | }); 332 | System.out.println("Took " + (System.currentTimeMillis() - t1) + "ms"); 333 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) { 334 | System.out.println("--- position not known in word to vec space:[" + ss + "]"); 335 | // Logger.getLogger(MissingVideoConceptFinderTest.class.getName()).log(Level.INFO, null, ex); 336 | } 337 | } 338 | } 339 | -------------------------------------------------------------------------------- /CycMapDBTools/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | com.cyc.tool 5 | CycMapDBTools 6 | jar 7 | 8 | UTF-8 9 | 1.8 10 | 1.8 11 | 12 | 13 | 14 | com.cyc.tool 15 | distributedRepresentationsParent 16 | 1.0 17 | ../distributedRepresentationsParent 18 | 19 | 20 | 21 | 22 | 23 | org.codehaus.mojo 24 | license-maven-plugin 25 | 26 | 27 | org.apache.maven.plugins 28 | maven-javadoc-plugin 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /CycMapDBTools/pom.xml~: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | com.cyc.tool 5 | CycMapDBTools 6 | 0.0.1-SNAPSHOT 7 | jar 8 | 9 | UTF-8 10 | 1.8 11 | 1.8 12 | 13 | 14 | 15 | com.cyc.project.kbtaxonomy 16 | KBTaxonomyParent 17 | 0.0.1-SNAPSHOT 18 | ../KBTaxonomyParent 19 | 20 | 21 | 22 | 23 | 24 | org.codehaus.mojo 25 | license-maven-plugin 26 | 27 | 28 | org.apache.maven.plugins 29 | maven-javadoc-plugin 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /CycMapDBTools/src/main/java/com/cyc/tool/MapDBConfiguration.java: 
-------------------------------------------------------------------------------- 1 | package com.cyc.tool; 2 | 3 | /* 4 | * #%L 5 | * CycMapDBTools 6 | * %% 7 | * Copyright (C) 2015 Cycorp, Inc 8 | * %% 9 | * Licensed under the Apache License, Version 2.0 (the "License"); 10 | * you may not use this file except in compliance with the License. 11 | * You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * #L% 21 | */ 22 | 23 | import java.io.File; 24 | import java.io.FileNotFoundException; 25 | import java.io.IOException; 26 | 27 | /** 28 | *

/**
 * MapDBConfiguration defines some defaults to use when accessing MapDB locations.
 *
 * The preferred location is {@code /fastscratch/MapDB}. When that cannot be used, a
 * {@code MapDB} directory below a caller-supplied fallback directory is used instead. The
 * location found by the first successful lookup is cached for all later calls.
 */
public class MapDBConfiguration {

  /** Canonical path of the resolved MapDB directory; null until first resolved. */
  private static String baseString = null;
  static final String dirName = "MapDB";
  static final String goodBase = "/fastscratch";
  static final String goodLocation = goodBase + "/" + dirName;

  /**
   * Returns the directory in which MapDB files should be kept.
   *
   * @param fb directory to fall back to when the preferred location is unusable; only
   * consulted by the first call, since the result is cached afterwards
   * @return canonical path of the base location for MapDB
   * @throws RuntimeException wrapping the IOException raised when neither location can be
   * used
   */
  public static final String getMapDBBase(String fb) {
    if (null == baseString) {
      try {
        baseString
                = getMapDBDirectoryWithFallbackTo(new File(fb)).getCanonicalPath();
      } catch (IOException ex) {
        throw new RuntimeException(ex);
      }
    }
    return baseString;
  }

  /**
   * Resolves the MapDB directory, creating it when necessary.
   *
   * @param fallback parent directory to use when the preferred location is unusable
   * @return an existing MapDB directory
   * @throws FileNotFoundException if the directory can be created in neither location
   * @throws IOException if a canonical path cannot be computed
   */
  private static File getMapDBDirectoryWithFallbackTo(File fallback) throws FileNotFoundException, IOException {
    File base = new File(goodBase);
    if (base.exists() && base.canWrite()) {
      File mdb = new File(goodLocation);
      if (mdb.isDirectory() || mdb.mkdirs()) {
        System.out.println("INFO: " + " using " + mdb.getCanonicalPath());
        return mdb;
      }
      // Preferred base is there but the MapDB directory could not be created: do not give
      // up yet, try the fallback below.
    }

    System.out.println("WARN: " + goodBase + " not available, backing off to "
            + fallback.getCanonicalPath());
    File completeFallBack = new File(fallback.getCanonicalFile(), dirName);
    if (completeFallBack.isDirectory() || completeFallBack.mkdirs()) {
      return completeFallBack;
    }
    throw new FileNotFoundException(goodBase + " is not available for " + dirName
            + " and neither is " + fallback);
  }

}
/DistributedRepresentations/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | com.cyc.tool 5 | DistributedRepresentations 6 | jar 7 | 8 | 9 | com.cyc.tool 10 | distributedRepresentationsParent 11 | 1.0 12 | ../distributedRepresentationsParent 13 | 14 | 15 | 16 | 17 | 18 | org.codehaus.mojo 19 | license-maven-plugin 20 | 21 | 22 | org.apache.maven.plugins 23 | maven-javadoc-plugin 24 | 25 | 26 | 27 | 28 | 29 | 30 | net.sourceforge.owlapi 31 | owlapi-distribution 32 | 4.0.1 33 | jar 34 | 35 | 36 | org.mapdb 37 | mapdb 38 | 1.0.6 39 | jar 40 | 41 | 42 | junit 43 | junit 44 | test 45 | 46 | 47 | com.cyc.tool 48 | CycMapDBTools 49 | jar 50 | 51 | 52 | 53 | com.cyc.tool 54 | OwlTools 55 | 56 | 57 | 58 | 59 | UTF-8 60 | 1.8 61 | 1.8 62 | 63 | 64 | -------------------------------------------------------------------------------- /DistributedRepresentations/pom.xml~: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | com.cyc.tool 5 | DistributedRepresentations 6 | 0.0.1-SNAPSHOT 7 | jar 8 | 9 | 10 | com.cyc.project.kbtaxonomy 11 | KBTaxonomyParent 12 | 0.0.1-SNAPSHOT 13 | ../KBTaxonomyParent 14 | 15 | 16 | 17 | 18 | 19 | org.codehaus.mojo 20 | license-maven-plugin 21 | 22 | 23 | org.apache.maven.plugins 24 | maven-javadoc-plugin 25 | 26 | 27 | 28 | 29 | 30 | 31 | net.sourceforge.owlapi 32 | owlapi-distribution 33 | 4.0.1 34 | jar 35 | 36 | 37 | org.mapdb 38 | mapdb 39 | 1.0.6 40 | jar 41 | 42 | 43 | junit 44 | junit 45 | 4.10 46 | test 47 | 48 | 49 | com.cyc.tool 50 | CycMapDBTools 51 | 0.0.1-SNAPSHOT 52 | jar 53 | 54 | 55 | 56 | com.cyc.tool 57 | OwlTools 58 | 0.0.1-SNAPSHOT 59 | 60 | 61 | 62 | 63 | 64 | UTF-8 65 | 1.8 66 | 1.8 67 | 68 | -------------------------------------------------------------------------------- /DistributedRepresentations/src/main/java/com/cyc/tool/distributedrepresentations/BiologyW2VOpenCycSubspace.java: 
-------------------------------------------------------------------------------- 1 | package com.cyc.tool.distributedrepresentations; 2 | 3 | /* 4 | * #%L 5 | * DistributedRepresentations 6 | * %% 7 | * Copyright (C) 2015 Cycorp, Inc 8 | * %% 9 | * Licensed under the Apache License, Version 2.0 (the "License"); 10 | * you may not use this file except in compliance with the License. 11 | * You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * #L% 21 | */ 22 | 23 | import com.cyc.tool.owltools.OpenCycOwl; 24 | import java.io.IOException; 25 | import java.util.logging.Level; 26 | import java.util.logging.Logger; 27 | import org.semanticweb.owlapi.model.OWLOntologyCreationException; 28 | 29 | /** 30 | *

31 | * BiologyW2VSpace filtered to only contain terms in Open Cyc. 32 | */ 33 | public class BiologyW2VOpenCycSubspace extends Word2VecSubspace { 34 | 35 | static BiologyW2VOpenCycSubspace singleton; 36 | 37 | private BiologyW2VOpenCycSubspace(OpenCycOwl ocyc) throws IOException { 38 | super(BiologyW2VSpace.get(), 39 | m -> ocyc.knownTerm(m), getWord2VecVectorsMapName()); 40 | } 41 | 42 | /** 43 | * 44 | * @return a WordToVecSubspace limited only to terms in OpenCyc 45 | */ 46 | public static BiologyW2VOpenCycSubspace get() { 47 | if (singleton == null) { 48 | try { 49 | OpenCycOwl ocyc = new OpenCycOwl(); 50 | singleton = new BiologyW2VOpenCycSubspace(ocyc); 51 | } catch (IOException ex) { 52 | Logger.getLogger(BiologyW2VSpace.class.getName()).log(Level.SEVERE, null, ex); 53 | throw new RuntimeException("Can't create the Biology W2VSpace object " + ex); 54 | } catch (OWLOntologyCreationException ex) { 55 | Logger.getLogger(BiologyW2VOpenCycSubspace.class.getName()).log(Level.SEVERE, null, ex); 56 | } 57 | } 58 | return singleton; 59 | } 60 | 61 | static String getWord2VecVectorsMapName() { 62 | return BiologyW2VOpenCycSubspace.class.getCanonicalName(); 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /DistributedRepresentations/src/main/java/com/cyc/tool/distributedrepresentations/BiologyW2VSpace.java: -------------------------------------------------------------------------------- 1 | package com.cyc.tool.distributedrepresentations; 2 | 3 | /* 4 | * #%L 5 | * DistributedRepresentations 6 | * %% 7 | * Copyright (C) 2015 Cycorp, Inc 8 | * %% 9 | * Licensed under the Apache License, Version 2.0 (the "License"); 10 | * you may not use this file except in compliance with the License. 
11 | * You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * #L% 21 | */ 22 | 23 | import java.io.BufferedReader; 24 | import java.io.File; 25 | import java.io.FileReader; 26 | import java.io.IOException; 27 | import java.util.Arrays; 28 | import java.util.logging.Level; 29 | import java.util.logging.Logger; 30 | import java.util.stream.Collectors; 31 | import org.mapdb.DBMaker; 32 | 33 | /** 34 | * The word2vec space produced by BioASQ by training on pubmed. 35 | * 36 | *

37 | * See: 38 | * http://bioasq.org/news/bioasq-releases-continuous-space-word-vectors-obtained-applying-word2vec-pubmed-abstracts 39 | */ 40 | public class BiologyW2VSpace extends Word2VecSpace { 41 | 42 | private static final String fileBase = "/cyc/projects/kbTaxonomy/ConceptFinder/BioASQ/word2vecTools/"; 43 | private static BiologyW2VSpace singleton; 44 | private static final String w2vlabelfile = fileBase + "types.txt"; 45 | private static final String w2vvectorfile = fileBase + "vectors.txt"; 46 | 47 | private BiologyW2VSpace() throws IOException { 48 | db = DBMaker.newFileDB(new File(Config.getW2vDBFile())) 49 | .closeOnJvmShutdown() 50 | // .encryptionEnable("password") 51 | .make(); 52 | vectors = db.getTreeMap(getWord2VecVectorsMapName()); 53 | // vectors.clear(); 54 | if (!vectors.isEmpty()) { 55 | assert (getVector("anti-mib-1") != null); 56 | setSize(getVector("hgh-b").length); 57 | return; 58 | } 59 | int i = 0; 60 | try (BufferedReader labelReader = new BufferedReader(new FileReader(w2vlabelfile))) { 61 | try (BufferedReader vectorReader = new BufferedReader(new FileReader(w2vvectorfile))) { 62 | for (String label; (label = labelReader.readLine()) != null;) { 63 | String vec = vectorReader.readLine(); 64 | float[] d 65 | = normVector( 66 | Arrays.asList(vec.split("\\s+")) 67 | .stream() 68 | .map(s -> Float.valueOf(s)) 69 | .collect(Collectors.toList()) 70 | ); 71 | if (getSize() != 0) { 72 | assert d.length == getSize() : "Line without " + getSize() + " floats"; 73 | } else { 74 | setSize(d.length); 75 | } 76 | if (i++ % 100000 == 0) { 77 | db.commit(); 78 | System.out.println(i + ": " + label); 79 | } 80 | 81 | vectors.put(label, d); 82 | // process the line. 83 | } 84 | // line is not visible here. 85 | } 86 | } 87 | System.out.println("Read " + i + " term positions for " + BiologyW2VSpace.class.getSimpleName()); 88 | db.commit(); 89 | db.compact(); 90 | } 91 | 92 | /** 93 | * Factory get method for BiologyW2VSpace. 
94 | * 95 | * @return a BiologyW2VSpace 96 | */ 97 | public static BiologyW2VSpace get() { 98 | if (singleton == null) { 99 | try { 100 | singleton = new BiologyW2VSpace(); 101 | } catch (IOException ex) { 102 | Logger.getLogger(BiologyW2VSpace.class.getName()).log(Level.SEVERE, null, ex); 103 | throw new RuntimeException("Can't create the Biology W2VSpace object\n " + ex); 104 | } 105 | } 106 | return singleton; 107 | } 108 | 109 | /* 110 | @ToDo: change this to use the class name, so that it's automatically correct 111 | */ 112 | private static String getWord2VecVectorsMapName() { 113 | return BiologyW2VSpace.class.getCanonicalName(); 114 | } 115 | 116 | } 117 | -------------------------------------------------------------------------------- /DistributedRepresentations/src/main/java/com/cyc/tool/distributedrepresentations/Config.java: -------------------------------------------------------------------------------- 1 | package com.cyc.tool.distributedrepresentations; 2 | 3 | /* 4 | * #%L 5 | * DistributedRepresentations 6 | * %% 7 | * Copyright (C) 2015 Cycorp, Inc 8 | * %% 9 | * Licensed under the Apache License, Version 2.0 (the "License"); 10 | * you may not use this file except in compliance with the License. 11 | * You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * #L% 21 | */ 22 | 23 | import com.cyc.tool.MapDBConfiguration; 24 | 25 | /** 26 | *

27 | * Config provides default locations for the DistributedRepresentations project. 28 | */ 29 | public class Config extends MapDBConfiguration { 30 | 31 | private static final String fallBackDBLocation = "/cyc/projects/kbTaxonomy/Experiments/ConceptFinder/"; 32 | 33 | private static final String w2vDBFile = "/w2vdb"; 34 | 35 | /** 36 | * 37 | * @return W2VDB file location 38 | */ 39 | protected static String getW2vDBFile() { 40 | return getMapDBBase(fallBackDBLocation) + w2vDBFile; 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /DistributedRepresentations/src/main/java/com/cyc/tool/distributedrepresentations/GoogleNewsW2VOpenCycSubspace.java: -------------------------------------------------------------------------------- 1 | package com.cyc.tool.distributedrepresentations; 2 | 3 | /* 4 | * #%L 5 | * DistributedRepresentations 6 | * %% 7 | * Copyright (C) 2015 Cycorp, Inc 8 | * %% 9 | * Licensed under the Apache License, Version 2.0 (the "License"); 10 | * you may not use this file except in compliance with the License. 11 | * You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * #L% 21 | */ 22 | 23 | import com.cyc.tool.owltools.OpenCycOwl; 24 | import java.io.IOException; 25 | import java.util.logging.Level; 26 | import java.util.logging.Logger; 27 | import org.semanticweb.owlapi.model.OWLOntologyCreationException; 28 | 29 | /** 30 | * News Word2Vec Distributed representation filtered to only contain terms in Open Cyc. 31 | * 32 | *

33 | * Used for rapid searches of the space for open cyc terms 34 | */ 35 | public class GoogleNewsW2VOpenCycSubspace extends Word2VecSubspace { 36 | 37 | static GoogleNewsW2VOpenCycSubspace singleton; 38 | 39 | private GoogleNewsW2VOpenCycSubspace(OpenCycOwl ocyc) throws IOException { 40 | super(GoogleNewsW2VSpace.get(), 41 | m -> ocyc.knownTerm(m), getWord2VecVectorsMapName()); 42 | } 43 | 44 | /** 45 | * 46 | * @return a WordToVecSubspace limited only to terms in OpenCyc 47 | */ 48 | public static GoogleNewsW2VOpenCycSubspace get() { 49 | if (singleton == null) { 50 | try { 51 | OpenCycOwl ocyc = new OpenCycOwl(); 52 | singleton = new GoogleNewsW2VOpenCycSubspace(ocyc); 53 | } catch (IOException ex) { 54 | Logger.getLogger(GoogleNewsW2VSpace.class.getName()).log(Level.SEVERE, null, ex); 55 | throw new RuntimeException("Can't create the Google News W2VSpace object " + ex); 56 | } catch (OWLOntologyCreationException ex) { 57 | Logger.getLogger(GoogleNewsW2VOpenCycSubspace.class.getName()).log(Level.SEVERE, null, ex); 58 | } 59 | } 60 | return singleton; 61 | } 62 | 63 | static String getWord2VecVectorsMapName() { 64 | return GoogleNewsW2VOpenCycSubspace.class.getCanonicalName(); 65 | } 66 | 67 | } 68 | -------------------------------------------------------------------------------- /DistributedRepresentations/src/main/java/com/cyc/tool/distributedrepresentations/GoogleNewsW2VSpace.java: -------------------------------------------------------------------------------- 1 | package com.cyc.tool.distributedrepresentations; 2 | 3 | /* 4 | * #%L 5 | * DistributedRepresentations 6 | * %% 7 | * Copyright (C) 2015 Cycorp, Inc 8 | * %% 9 | * Licensed under the Apache License, Version 2.0 (the "License"); 10 | * you may not use this file except in compliance with the License. 
11 | * You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * #L% 21 | */ 22 | 23 | import java.io.IOException; 24 | import java.util.logging.Level; 25 | import java.util.logging.Logger; 26 | 27 | /** 28 | * The word2vec space produced by Google by training on 10^11 words of news. 29 | * 30 | *

31 | * See: https://code.google.com/p/word2vec/ 32 | */ 33 | public class GoogleNewsW2VSpace extends Word2VecSpaceFromFile { 34 | 35 | private static GoogleNewsW2VSpace singleton; 36 | private static final String w2vfile = "/cyc/projects/kbTaxonomy/Experiments/ConceptFinder/GoogleNews-vectors-negative300.bin.gz"; 37 | 38 | private GoogleNewsW2VSpace() throws IOException { 39 | super(); 40 | vectors = db.getTreeMap(getWord2VecVectorsMapName()); 41 | if (!vectors.isEmpty()) { 42 | assert (getVector("snowcapped_Caucasus") != null); 43 | setSize(getVector("dog").length); 44 | return; 45 | } 46 | createW2VinDB(getW2vfile()); 47 | } 48 | 49 | /** 50 | * Factory get method for GoogleNewsW2VSpace. 51 | * 52 | * @return a GoogleNewsW2VSpace 53 | */ 54 | public static GoogleNewsW2VSpace get() { 55 | if (singleton == null) { 56 | try { 57 | singleton = new GoogleNewsW2VSpace(); 58 | } catch (IOException ex) { 59 | Logger.getLogger(GoogleNewsW2VSpace.class.getName()).log(Level.SEVERE, null, ex); 60 | throw new RuntimeException("Can't create the Google News W2VSpace object " + ex); 61 | } 62 | } 63 | return singleton; 64 | } 65 | 66 | private static String getW2vfile() { 67 | return w2vfile; 68 | } 69 | 70 | private static String getWord2VecVectorsMapName() { 71 | /* 72 | @ToDo: change this to use the class name, so that it's automatically correct 73 | */ 74 | return GoogleNewsW2VSpace.class.getCanonicalName(); 75 | //return word2VecVectorsMapName; 76 | } 77 | 78 | } 79 | -------------------------------------------------------------------------------- /DistributedRepresentations/src/main/java/com/cyc/tool/distributedrepresentations/Word2VecSpace.java: -------------------------------------------------------------------------------- 1 | package com.cyc.tool.distributedrepresentations; 2 | 3 | /* 4 | * #%L 5 | * DistributedRepresentations 6 | * %% 7 | * Copyright (C) 2015 Cycorp, Inc 8 | * %% 9 | * Licensed under the Apache License, Version 2.0 (the "License"); 10 | * you may not 
use this file except in compliance with the License. 11 | * You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * #L% 21 | */ 22 | 23 | import java.util.ArrayList; 24 | import java.util.Arrays; 25 | import java.util.List; 26 | import java.util.Map; 27 | import java.util.Map.Entry; 28 | import java.util.concurrent.ConcurrentNavigableMap; 29 | import java.util.function.Predicate; 30 | import java.util.stream.Collectors; 31 | import java.util.stream.IntStream; 32 | import org.mapdb.DB; 33 | 34 |
/**
 * A space of words from Google Word2Vec
 *
 * Terms are looked up after replacing every run of whitespace with an underscore, so
 * {@code "bathing suits"} and {@code "bathing_suits"} address the same vector.
 */
public abstract class Word2VecSpace {

  // Dimensionality of the vectors; set by subclasses through setSize().
  private int size;
  // Backing database; assigned by subclasses.
  DB db;
  // Normalised term -> vector.
  Map<String, float[]> vectors;
  // Word count; set through setWords() or directly by subclasses.
  long words;

  /**
   * Lists every contiguous sub-sequence of {@code terms}, joined with single spaces, shortest
   * first and, within one length, from left to right.
   *
   * @param terms
   * @return a List of Strings containing nGrams for terms
   */
  public static List<String> nGramsFor(List<String> terms) {
    final List<String> grams = new ArrayList<>();
    for (int length = 1; length <= terms.size(); length++) {
      for (int start = 0; start + length <= terms.size(); start++) {
        grams.add(String.join(" ", terms.subList(start, start + length)));
      }
    }
    return grams;
  }

  // Key normalisation: whitespace runs become a single underscore.
  private static String norm(String term) {
    return term.replaceAll("\\s+", "_");
  }

  private double cosineSimilarity(float[] v1, float[] v2) {
    return dotProduct(v1, v2) / (magnitude(v1) * magnitude(v2));
  }

  /**
   * Cosine similarity of the vectors of two terms. Both terms must be known (see
   * {@link #knownTerm(String)}); an unknown term has no vector and causes a
   * NullPointerException.
   *
   * @param t1
   * @param t2
   * @return the cosine similarity
   */
  public double cosineSimilarity(String t1, String t2) {
    return cosineSimilarity(getVector(t1), getVector(t2));
  }

  private double dotProduct(float[] v1, float[] v2) {
    return IntStream.range(0, v1.length)
            .mapToDouble(i -> (double) v1[i] * (double) v2[i])
            .sum();
  }

  private double euclidianDistance(float[] v1, float[] v2) {
    double dist = Math.sqrt(IntStream.range(0, v1.length)
            .mapToDouble(i -> Math.pow((double) v1[i] - (double) v2[i], 2))
            .sum());
    return dist;
  }

  private double euclidianDistance(String t1, String t2) {
    return euclidianDistance(getVector(t1), getVector(t2));
  }

  private float[] getAverageVector(List<String> terms) {
    final float sum[] = new float[size];
    final double mult = 1.0 / terms.size();
    terms.forEach(s -> {
      float v[] = getVector(s);
      IntStream.range(0, size)
              .forEach(i -> {
                sum[i] += mult * v[i];
              });
    });
    return sum;
  }

  /**
   *
   * @return the db
   */
  public DB getDb() {
    return db;
  }

  /**
   * Set up the DB.
   *
   * @param db
   */
  public void setDb(DB db) {
    this.db = db;
  }

  /**
   * Sums the vectors of the known terms and scales the sum to unit length.
   * Note that this will miss multi-word exact matches, so prefer getMaximalNormedVector
   * except for exact code comparison tests.
   *
   * @param terms
   * @return the sum of term vectors divided by vector length
   * @throws NoWordToVecVectorForTerm if none of the terms is known
   */
  public float[] getGoogleNormedVector(List<String> terms) throws NoWordToVecVectorForTerm {
    return normedSumOfKnownVectors(terms);
  }

  /**
   * Like {@link #getGoogleNormedVector(List)}, but sums over every n-gram of the input terms
   * (see {@link #nGramsFor(List)}) so that known multi-word entries contribute as well.
   *
   * @param interms
   * @return the maximal normed vector
   * @throws NoWordToVecVectorForTerm if none of the n-grams is known
   */
  public float[]
          getMaximalNormedVector(List<String> interms) throws NoWordToVecVectorForTerm {
    return normedSumOfKnownVectors(nGramsFor(interms));
  }

  // Shared implementation of the two normed-vector getters: add up the vectors of all known
  // terms, fail if there was none, and scale the sum to unit length.
  private float[] normedSumOfKnownVectors(List<String> terms) throws NoWordToVecVectorForTerm {
    final float[] sum = new float[size];
    boolean anyKnown = false;
    for (String term : terms) {
      final float[] v = getVector(term);
      if (v == null) {
        continue;
      }
      anyKnown = true;
      for (int i = 0; i < size; i++) {
        sum[i] += v[i];
      }
    }
    if (!anyKnown) {
      throw new NoWordToVecVectorForTerm("Can't find vector for:" + String.join(", ", terms));
    }
    return normVector(sum);
  }

  /**
   *
   * @return size of vectors
   */
  public int getNVectors() {
    return vectors.size();
  }

  /**
   *
   * @return size of the Word2VecSpace
   */
  public int getSize() {
    return size;
  }

  /**
   *
   * @param size
   */
  public void setSize(int size) {
    this.size = size;
  }

  /**
   *
   * @param term
   * @return the vector for term, or null if the term is not in the space
   */
  public float[] getVector(String term) {
    return vectors.get(norm(term));
  }

  /**
   *
   * @return the vectors
   */
  public Map<String, float[]> getVectors() {
    return vectors;
  }

  /**
   *
   * @param vectors
   */
  public void setVectors(ConcurrentNavigableMap<String, float[]> vectors) {
    this.vectors = vectors;
  }

  /**
   *
   * @return the words
   */
  public long getWords() {
    return words;
  }

  /**
   *
   * @param words
   */
  public void setWords(long words) {
    this.words = words;
  }

  /**
   * Dot product of the two vectors; this equals their cosine similarity when both have unit
   * length.
   *
   * @param v1
   * @param v2
   * @return the similarity between v1 and v2
   */
  public double googleSimilarity(float[] v1, float[] v2) {
    return dotProduct(v1, v2);
  }

  private double googleSimilarity(String t1, String t2) {
    return googleSimilarity(getVector(t1), getVector(t2));
  }

  /**
   *
   * @param terms
   * @param term
   * @return the similarity
   * @throws NoWordToVecVectorForTerm if none of {@code terms} is known, or {@code term} is not
   * known
   */
  public double googleSimilarity(List<String> terms, String term) throws NoWordToVecVectorForTerm {
    final float[] normed = getGoogleNormedVector(terms);
    final float[] termVector = getVector(term);
    if (termVector == null) {
      // Report the missing term through the declared exception rather than a NullPointerException.
      throw new NoWordToVecVectorForTerm("Can't find vector for:" + term);
    }
    return googleSimilarity(normed, termVector);
  }

  /**
   *
   * @param term
   * @return true if term is in vectors
   */
  public boolean knownTerm(String term) {
    return vectors.containsKey(norm(term));
  }

  private double magnitude(float[] v) {
    return Math.sqrt(IntStream.range(0, v.length).mapToDouble(i -> v[i] * v[i]).sum());
  }

  private double magnitude(List<Float> v) {
    return Math.sqrt(v.stream().mapToDouble(i -> i * i).sum());
  }

  /**
   * Scales {@code v} to unit length. The result has the same length as {@code v}; a vector of
   * magnitude zero yields an all-zero result instead of NaN entries.
   *
   * @param v
   * @return normalized vector for v
   */
  public float[] normVector(float[] v) {
    final float[] normed = new float[v.length];
    final float len = (float) magnitude(v);
    if (len == 0.0f) {
      return normed;
    }
    for (int i = 0; i < v.length; i++) {
      normed[i] = v[i] / len;
    }
    return normed;
  }

  /**
   * List variant of {@link #normVector(float[])}, with the same handling of zero magnitude.
   *
   * @param v
   * @return normalized vector for v
   */
  public float[] normVector(List<Float> v) {
    final float[] normed = new float[v.size()];
    final float len = (float) magnitude(v);
    if (len == 0.0f) {
      return normed;
    }
    for (int i = 0; i < normed.length; i++) {
      normed[i] = v.get(i) / len;
    }
    return normed;
  }

  /**
   * Splits {@code s} on runs of whitespace.
   *
   * @param s
   * @return List of Strings
   */
  public List<String> stringToList(String s) {
    return Arrays.asList(s.split("\\s+"));
  }

  /**
   *
   * @param includeIf the predicate that is applied to the strings (the keys or embedded strings)
   * of the word to vec space to determine whether they should be retained in the output vector list
   * @return filtered vectors Map
   */
  protected Map<String, float[]> filterVectors(Predicate<String> includeIf) {
    return vectors.entrySet().stream()
            .filter(entry -> includeIf.test(entry.getKey()))
            .collect(Collectors.toMap(Entry::getKey, Entry::getValue));
  }

  /**
   * No Vector for Term
   *
   * Exception to use when a term looked up in the space has no known position
   */
  public static class NoWordToVecVectorForTerm extends Exception {

    /**
     *
     * @param message
     */
    public NoWordToVecVectorForTerm(String message) {
      super(message);
    }
  }
}
355 | -------------------------------------------------------------------------------- /DistributedRepresentations/src/main/java/com/cyc/tool/distributedrepresentations/Word2VecSpaceFromFile.java: -------------------------------------------------------------------------------- 1 | package com.cyc.tool.distributedrepresentations; 2 | 3 | /* 4 | * #%L 5 | * DistributedRepresentations 6 | * %% 7 | * Copyright (C) 2015 Cycorp, Inc 8 | * %% 9 | * Licensed under the Apache License, Version 2.0 (the "License"); 10 | * you may not use this file except in compliance with the License. 11 | * You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * #L% 21 | */ 22 | 23 | import java.io.DataInputStream; 24 | import java.io.File; 25 | import java.io.FileInputStream; 26 | import java.io.FileNotFoundException; 27 | import java.io.IOException; 28 | import java.util.logging.Level; 29 | import java.util.logging.Logger; 30 | import java.util.stream.IntStream; 31 | import java.util.zip.GZIPInputStream; 32 | import org.apache.commons.io.EndianUtils; 33 | import org.mapdb.DBMaker; 34 | 35 | /** 36 | * Word2Vec distributed representation space from Google Format file. 37 | * 38 | *

39 | * This class represents any distributed represenation computed using word2vec and initially loaded 40 | * from a Google word2vec formatted file 41 | */ 42 | public abstract class Word2VecSpaceFromFile extends Word2VecSpace { 43 | 44 | final StringBuilder sb = new StringBuilder(); 45 | 46 | /** 47 | * Constructor for Word2VecSpaceFromFile 48 | * 49 | * @throws IOException 50 | */ 51 | public Word2VecSpaceFromFile() throws IOException { 52 | db = DBMaker.newFileDB(new File(Config.getW2vDBFile())) 53 | .closeOnJvmShutdown() 54 | // .encryptionEnable("password") 55 | .make(); 56 | 57 | } 58 | 59 | /** 60 | * Create a W2V space in a DB. 61 | * 62 | * @param w2vZipFile 63 | * @throws FileNotFoundException 64 | * @throws IOException 65 | */ 66 | protected final void createW2VinDB(String w2vZipFile) throws FileNotFoundException, IOException { 67 | try (DataInputStream data_in 68 | = new DataInputStream( 69 | new GZIPInputStream(new FileInputStream( 70 | new File(w2vZipFile))))) { 71 | getWordsAndSize(data_in); 72 | if (vectors.size() == words) { 73 | System.out.println("Word2Vec is in DB"); 74 | } else { 75 | System.out.println("DB Size:" + vectors.size()); 76 | 77 | System.out.println("Want to read Word Count: " + words); 78 | System.out.println("Size:" + getSize()); 79 | for (int w = 0; w < words; w++) { 80 | float[] v = new float[getSize()]; 81 | String key = getVocabString(data_in); 82 | System.out.println(w + ":\t" + key); 83 | 84 | IntStream.range(0, getSize()).forEach(i -> v[i] 85 | = getFloat(data_in)); 86 | vectors.put(key, normVector(v)); 87 | if (w % 100000 == 1) { 88 | db.commit(); 89 | } 90 | } 91 | db.commit(); 92 | db.compact(); 93 | } 94 | } 95 | } 96 | 97 | private float getFloat(DataInputStream s) { 98 | try { 99 | float v = EndianUtils.readSwappedFloat(s); 100 | //System.out.println(st+"["+i+"]: "+v); 101 | return v; 102 | } catch (IOException ex) { 103 | Logger.getLogger(Word2VecSpace.class.getName()).log(Level.SEVERE, null, ex); 104 | return 0.0f; 
105 | } 106 | } 107 | 108 | private String getVocabString(DataInputStream s) throws IOException { 109 | sb.setLength(0); 110 | for (char ch = (char) s.read(); 111 | (!Character.isWhitespace(ch) && ch >= 0 && ch <= 256); 112 | ch = (char) s.read()) { 113 | sb.append((char) ch); 114 | } 115 | return sb.toString(); 116 | } 117 | 118 | private void getWordsAndSize(DataInputStream s) throws IOException { 119 | sb.setLength(0); 120 | for (char ch = (char) s.read(); ch != '\n'; ch = (char) s.read()) { 121 | sb.append(ch); 122 | } 123 | String[] parts = sb.toString().split("\\s+"); 124 | words = Long.parseLong(parts[0]); 125 | setSize((int) Long.parseLong(parts[1])); 126 | } 127 | 128 | } 129 | -------------------------------------------------------------------------------- /DistributedRepresentations/src/main/java/com/cyc/tool/distributedrepresentations/Word2VecSubspace.java: -------------------------------------------------------------------------------- 1 | package com.cyc.tool.distributedrepresentations; 2 | 3 | /* 4 | * #%L 5 | * DistributedRepresentations 6 | * %% 7 | * Copyright (C) 2015 Cycorp, Inc 8 | * %% 9 | * Licensed under the Apache License, Version 2.0 (the "License"); 10 | * you may not use this file except in compliance with the License. 11 | * You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * #L% 21 | */ 22 | 23 | import java.io.File; 24 | import java.io.IOException; 25 | import java.util.Map; 26 | import java.util.function.Predicate; 27 | import org.mapdb.DBMaker; 28 | 29 | /** 30 | * A space of words from Google Word2Vec. 
31 | * 32 | */ 33 | public abstract class Word2VecSubspace extends Word2VecSpace { 34 | 35 | final Word2VecSpace mySuperSpace; 36 | 37 | /** 38 | * Word2VecSubspace constructor. 39 | * 40 | * @param ofSpace 41 | * @param includeIf 42 | * @param persistLoc 43 | * @throws IOException 44 | */ 45 | protected Word2VecSubspace(Word2VecSpace ofSpace, Predicate includeIf, String persistLoc) throws IOException { 46 | 47 | mySuperSpace = ofSpace; 48 | if (db == null) { 49 | db = DBMaker.newFileDB(new File(Config.getW2vDBFile())) 50 | .closeOnJvmShutdown() 51 | // .encryptionEnable("password") 52 | .make(); 53 | } 54 | vectors = db.getTreeMap(persistLoc); 55 | // vectors.clear(); 56 | if (!vectors.isEmpty()) { 57 | setSize(vectors.values().iterator().next().length); 58 | System.out.println("Got cached w2vspace for " + persistLoc + " of dimensionality " + getSize() + " and with " + vectors.size() + " entries."); 59 | return; 60 | } 61 | // assert(vectors == null) :"Subspaces msut be completely empty when created"; 62 | System.out.println("Filtering vectors for:" + persistLoc); 63 | Map newvectors = ofSpace.filterVectors(includeIf); 64 | newvectors.entrySet().forEach(e -> { 65 | vectors.put(e.getKey(), e.getValue()); 66 | }); 67 | db.commit(); 68 | db.compact(); 69 | db.commit(); 70 | System.out.println("Vectors filtered and persisted."); 71 | } 72 | 73 | /** 74 | * 75 | * @return the mySuperSpace 76 | */ 77 | public Word2VecSpace getSuperSpace() { 78 | return mySuperSpace; 79 | } 80 | 81 | } 82 | -------------------------------------------------------------------------------- /DistributedRepresentations/src/test/java/com/cyc/tool/distributedrepresentations/BiologyW2VSpaceIT.java: -------------------------------------------------------------------------------- 1 | package com.cyc.tool.distributedrepresentations; 2 | 3 | /* 4 | * #%L 5 | * DistributedRepresentations 6 | * %% 7 | * Copyright (C) 2015 Cycorp, Inc 8 | * %% 9 | * Licensed under the Apache License, Version 2.0 (the 
"License"); 10 | * you may not use this file except in compliance with the License. 11 | * You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * #L% 21 | */ 22 | 23 | import static org.junit.Assert.assertEquals; 24 | import static org.junit.Assert.assertTrue; 25 | import org.junit.Test; 26 | 27 | /** 28 | * Tests for BiologyW2VSpace. 29 | */ 30 | public class BiologyW2VSpaceIT { 31 | 32 | public BiologyW2VSpaceIT() { 33 | } 34 | 35 | @Test 36 | public void testGet() { 37 | System.out.println("get"); 38 | 39 | BiologyW2VSpace result = BiologyW2VSpace.get(); 40 | assertTrue(result != null); 41 | } 42 | 43 | @Test 44 | public void testNumberOfVectors() { 45 | System.out.println("getNVectors"); 46 | 47 | int result = BiologyW2VSpace.get().getNVectors(); 48 | 49 | assertEquals(result, 1701632); 50 | } 51 | 52 | } 53 | -------------------------------------------------------------------------------- /DistributedRepresentations/src/test/java/com/cyc/tool/distributedrepresentations/Word2VecSpaceIT.java: -------------------------------------------------------------------------------- 1 | package com.cyc.tool.distributedrepresentations; 2 | 3 | /* 4 | * #%L 5 | * DistributedRepresentations 6 | * %% 7 | * Copyright (C) 2015 Cycorp, Inc 8 | * %% 9 | * Licensed under the Apache License, Version 2.0 (the "License"); 10 | * you may not use this file except in compliance with the License. 
11 | * You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * #L% 21 | */ 22 | 23 | import java.io.IOException; 24 | import java.util.Arrays; 25 | import java.util.List; 26 | import org.junit.AfterClass; 27 | import static org.junit.Assert.assertEquals; 28 | import static org.junit.Assert.assertTrue; 29 | import static org.junit.Assert.fail; 30 | import org.junit.BeforeClass; 31 | import org.junit.Test; 32 | 33 | /** 34 | * Tests for Word2VecSpace. 35 | */ 36 | public class Word2VecSpaceIT { 37 | 38 | static List cr = Arrays.asList("Chinese", "river"); 39 | static Word2VecSpace mySpace; 40 | 41 | public Word2VecSpaceIT() { 42 | } 43 | 44 | @BeforeClass 45 | 46 | public static void setUpClass() throws IOException { 47 | mySpace = GoogleNewsW2VSpace.get(); 48 | } 49 | 50 | @AfterClass 51 | 52 | public static void tearDownClass() { 53 | mySpace = null; 54 | } 55 | // 56 | 57 | @Test 58 | public void distanceTest() { 59 | assertEquals(1.0, mySpace.cosineSimilarity("skimpy bathing suits", "skimpy_bathing_suits"), 0.00000001); 60 | assertEquals(0.24279, mySpace.cosineSimilarity("skimpy bathing suits", "Giant Octopus"), 0.0001); 61 | assertEquals(0.54801, mySpace.cosineSimilarity("skimpy bathing suits", "bathing suits"), 0.0001); 62 | assertEquals(0.645069, mySpace.cosineSimilarity("apple", "pear"), 0.0001); 63 | assertEquals(0.20749, mySpace.cosineSimilarity("apple", "cat"), 0.0001); 64 | 65 | assertTrue(mySpace.cosineSimilarity("apple", "pear") 66 | > mySpace.cosineSimilarity("apple", "cat")); 67 | } 68 | 69 | @Test 70 | public void getVectorTest1() { 71 | 
assertEquals(-0.05338118f, (mySpace.getVector("skimpy bathing suits")[5]), 0.000001); 72 | assertEquals(0.047296f, (mySpace.getVector("skimpy bathing suits")[105]), 0.000001); 73 | } 74 | 75 | @Test 76 | public void getVectorTest2a() { 77 | assertEquals(-0.049851f, (mySpace.getVector("Chinese")[0]), 0.000001); 78 | assertEquals(-0.090444f, (mySpace.getVector("Chinese")[5]), 0.000001); 79 | } 80 | 81 | @Test 82 | public void getVectorTest2b() { 83 | assertEquals(0.002663f, (mySpace.getVector("river")[0]), 0.000001); 84 | assertEquals(-0.029231f, (mySpace.getVector("river")[5]), 0.000001); 85 | } 86 | 87 | @Test 88 | public void googleDistanceTest1() { 89 | try { 90 | assertEquals(0.667376, 91 | mySpace.googleSimilarity(cr, "Yangtze_River"), 0.0001); 92 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) { 93 | fail("took unexpected exception:" + ex); 94 | } 95 | } 96 | 97 | @Test 98 | public void googleDistanceTest2() { 99 | try { 100 | assertEquals(0.594108, 101 | mySpace.googleSimilarity(cr, "Hongze_Lake"), 0.0001); 102 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) { 103 | fail("took unexpected exception:" + ex); 104 | } 105 | } 106 | 107 | @Test 108 | public void googleDistanceTest3() { 109 | try { 110 | assertEquals(0.604726, 111 | mySpace.googleSimilarity(cr, "Huangpu_River"), 0.0001); 112 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) { 113 | fail("took unexpected exception:" + ex); 114 | } 115 | } 116 | 117 | @Test 118 | public void googleNormVectorTest0() { 119 | try { 120 | float[] norm = mySpace.getGoogleNormedVector(cr); 121 | assertEquals(-0.032075, norm[0], 0.000001); 122 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) { 123 | fail("took unexpected exception:" + ex); 124 | } 125 | } 126 | 127 | @Test 128 | public void googleNormVectorTest100() { 129 | float[] norm; 130 | try { 131 | norm = mySpace.getGoogleNormedVector(cr); 132 | assertEquals(-0.095236, norm[100], 0.000001); 133 | 134 | } catch 
(Word2VecSpace.NoWordToVecVectorForTerm ex) { 135 | fail("took unexpected exception:" + ex); 136 | } 137 | } 138 | 139 | @Test 140 | public void googleNormVectorTest5() { 141 | try { 142 | float[] norm = mySpace.getGoogleNormedVector(cr); 143 | assertEquals(-0.081347, norm[5], 0.000001); 144 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) { 145 | fail("took unexpected exception:" + ex); 146 | } 147 | } 148 | 149 | @Test 150 | public void googleNormVectorTest50() { 151 | try { 152 | float[] norm = mySpace.getGoogleNormedVector(cr); 153 | assertEquals(0.080537, norm[50], 0.000001); 154 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) { 155 | fail("took unexpected exception:" + ex); 156 | } 157 | } 158 | 159 | /** 160 | * Test if known terms have been loaded from the Word2Vec file or DB 161 | */ 162 | @Test 163 | public void knownTermTest() { 164 | // System.out.println("DB Size:" + vectors.size()); 165 | 166 | assertTrue(mySpace.knownTerm("Yathra")); 167 | assertTrue(mySpace.knownTerm("skimpy bathing suits")); 168 | assertTrue(mySpace.knownTerm("Giant_Octopus")); 169 | assertTrue(mySpace.knownTerm("Yangtze_River")); 170 | assertTrue(mySpace.knownTerm("Chinese")); 171 | // assertTrue(mySpace.knownTerm("Chinese River")); 172 | 173 | } 174 | 175 | // @Test 176 | // public void findNearbyTerms1() { 177 | // try { 178 | // long t1 = System.currentTimeMillis(); 179 | // List matches = mySpace.findNearestNForWithInputTermFiltering(cr, 40); 180 | // IntStream.range(0, matches.size()) 181 | // .forEach(i -> { 182 | // System.out.println(i + " " + matches.get(i).toString()); 183 | // }); 184 | // System.out.println("Took " + (System.currentTimeMillis() - t1) + "ms"); 185 | // assertEquals(matches.get(0).getTerm(), "Yangtze_River"); 186 | // assertEquals(0.604726, matches.get(5).getSimilarity(), 0.000001); 187 | // 188 | // assertEquals(matches.get(23).getTerm(), "rivers"); 189 | // } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) { 190 | // fail("took unexpected 
exception:" + ex); 191 | // } 192 | // } 193 | // 194 | // @Test 195 | // 196 | // public void findNearbyTerms2() { 197 | // try { 198 | // long t1 = System.currentTimeMillis(); 199 | // List matches = mySpace.findNearestNForWithInputTermFiltering(Arrays.asList("gangplank"), 40); 200 | // IntStream.range(0, matches.size()) 201 | // .forEach(i -> { 202 | // System.out.println(i + " " + matches.get(i).toString()); 203 | // }); 204 | // System.out.println("Took " + (System.currentTimeMillis() - t1) + "ms"); 205 | // } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) { 206 | // fail("took unexpected exception:" + ex); 207 | // } 208 | // } 209 | @Test 210 | public void testNGramsFor() { 211 | List res = Word2VecSpace.nGramsFor(Arrays.asList("this", "is", "a", "test")); 212 | // System.out.println("test: "+res+" len:"+res.size()); 213 | 214 | assertEquals(10, res.size()); 215 | } 216 | 217 | @Test 218 | public void testNGramsForCR() { 219 | List res = Word2VecSpace.nGramsFor(cr); 220 | System.out.println("test: " + res + " len:" + res.size()); 221 | assertEquals(3, res.size()); 222 | } 223 | 224 | } 225 | -------------------------------------------------------------------------------- /OwlTools/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | com.cyc.tool 5 | OwlTools 6 | jar 7 | 8 | 9 | com.cyc.tool 10 | distributedRepresentationsParent 11 | 1.0 12 | ../distributedRepresentationsParent 13 | 14 | 15 | 16 | 17 | 18 | org.apache.maven.plugins 19 | maven-compiler-plugin 20 | 2.3.2 21 | 22 | 1.8 23 | 1.8 24 | 25 | 26 | 27 | org.codehaus.mojo 28 | license-maven-plugin 29 | 30 | 31 | org.apache.maven.plugins 32 | maven-javadoc-plugin 33 | 34 | 35 | 36 | 37 | 38 | net.sourceforge.owlapi 39 | owlapi-distribution 40 | 4.0.1 41 | jar 42 | 43 | 44 | org.mapdb 45 | mapdb 46 | 1.0.6 47 | jar 48 | 49 | 50 | com.cyc.tool 51 | CycMapDBTools 52 | jar 53 | 54 | 55 | 56 | UTF-8 57 | 1.8 58 | 1.8 59 | 60 | 61 | 62 | 
-------------------------------------------------------------------------------- /OwlTools/pom.xml~: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | com.cyc.tool 5 | OwlTools 6 | 0.0.1-SNAPSHOT 7 | jar 8 | 9 | 10 | com.cyc.project.kbtaxonomy 11 | KBTaxonomyParent 12 | 0.0.1-SNAPSHOT 13 | ../KBTaxonomyParent 14 | 15 | 16 | 17 | 18 | 19 | org.apache.maven.plugins 20 | maven-compiler-plugin 21 | 2.3.2 22 | 23 | 1.8 24 | 1.8 25 | 26 | 27 | 28 | org.codehaus.mojo 29 | license-maven-plugin 30 | 31 | 32 | org.apache.maven.plugins 33 | maven-javadoc-plugin 34 | 35 | 36 | 37 | 38 | 39 | net.sourceforge.owlapi 40 | owlapi-distribution 41 | 4.0.1 42 | jar 43 | 44 | 45 | org.mapdb 46 | mapdb 47 | 1.0.6 48 | jar 49 | 50 | 51 | com.cyc.tool 52 | DistributedRepresentations 53 | 0.0.1-SNAPSHOT 54 | jar 55 | 56 | 57 | com.cyc.tool 58 | CycMapDBTools 59 | 0.0.1-SNAPSHOT 60 | jar 61 | 62 | 63 | 64 | UTF-8 65 | 1.8 66 | 1.8 67 | 68 | 69 | -------------------------------------------------------------------------------- /OwlTools/src/main/java/com/cyc/tool/owltools/OpenCycContent.java: -------------------------------------------------------------------------------- 1 | package com.cyc.tool.owltools; 2 | 3 | /* 4 | * #%L 5 | * OwlTools 6 | * %% 7 | * Copyright (C) 2015 Cycorp, Inc 8 | * %% 9 | * Licensed under the Apache License, Version 2.0 (the "License"); 10 | * you may not use this file except in compliance with the License. 11 | * You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 
20 | * #L% 21 | */ 22 | 23 | import java.util.ArrayList; 24 | import java.util.Collection; 25 | import java.util.HashSet; 26 | import java.util.List; 27 | import java.util.Set; 28 | import org.semanticweb.owlapi.model.IRI; 29 | import org.semanticweb.owlapi.model.OWLAnnotation; 30 | import org.semanticweb.owlapi.model.OWLClass; 31 | import org.semanticweb.owlapi.model.OWLOntologyCreationException; 32 | import org.semanticweb.owlapi.reasoner.NodeSet; 33 | import org.semanticweb.owlapi.search.EntitySearcher; 34 | 35 | /** 36 | *

37 | * OpenCycContent is designed to hold information about a given OpenCyc concept that can be found in 38 | * the OWL export of OpenCyc. 39 | * 40 | *
This software is the proprietary information of Cycorp, Inc. 41 | *

42 | * Use is subject to license terms. 43 | * 44 | * Created on : Feb 25, 2015, 2:47:47 PM 45 | */ 46 | public class OpenCycContent { 47 | 48 | Set commentsForConcept; 49 | String conceptURI; 50 | String labelForConcept; 51 | Set prettyStringsForConcept; 52 | Set subTypesForConcept; 53 | 54 | Set typesForConcept; 55 | 56 | //// Constructors 57 | /** 58 | * Creates a new instance of OpenCycContent. 59 | * 60 | * @param hlid 61 | * @throws org.semanticweb.owlapi.model.OWLOntologyCreationException 62 | */ 63 | public OpenCycContent(String hlid) throws OWLOntologyCreationException { 64 | conceptURI = hlid; 65 | prettyStringsForConcept = null; 66 | commentsForConcept = null; 67 | labelForConcept = null; 68 | typesForConcept = null; 69 | } 70 | 71 | /** 72 | * 73 | * @return HTML String with information about the concept 74 | * @throws OWLOntologyCreationException 75 | */ 76 | public String generateHtmlForConcept() throws OWLOntologyCreationException { 77 | String html = ""; 78 | String constantName = getLabelForConcept(); 79 | Set commentStr = getCommentsForConcept(); 80 | Set prettyStr = getPrettyStringsForConcept(); 81 | html += "

" + constantName + "

\n\n" 82 | + selectPicForConcept(getTypesForConcept()) 83 | + "

" + commentStr.toArray(new String[0])[0] + "

\n" 84 | + "

English Phrases:

\n" 85 | + "
    \n"; 86 | for (String s : prettyStr) { 87 | html += "
  • " + s + "
  • \n"; 88 | } 89 | html += "
\n"; 90 | 91 | return html; 92 | } 93 | 94 | /** 95 | * Returns the cached comments, reading them from the OWL ontology on the first call. 96 | * @return Set of String comments 97 | * @throws OWLOntologyCreationException 98 | */ 99 | public Set getCommentsForConcept() throws OWLOntologyCreationException { 100 | if (commentsForConcept == null) { 101 | commentsForConcept = getCommentsForConceptFromOWL(); 102 | } 103 | return commentsForConcept; 104 | } 105 | 106 | /** 107 | * Returns the cached label, reading it from the OWL ontology on the first call. 108 | * @return The CycL constant name 109 | * @throws OWLOntologyCreationException 110 | */ 111 | public String getLabelForConcept() throws OWLOntologyCreationException { 112 | if (labelForConcept == null) { 113 | labelForConcept = getLabelForConceptFromOWL(); 114 | } 115 | return labelForConcept; 116 | } 117 | 118 | /** 119 | * Returns the cached pretty strings, reading them from the OWL ontology on the first call. 120 | * @return Set of Strings with NL for the concept 121 | * @throws OWLOntologyCreationException 122 | */ 123 | public Set getPrettyStringsForConcept() throws OWLOntologyCreationException { 124 | if (prettyStringsForConcept == null) { 125 | prettyStringsForConcept = getPrettyStringsForConceptFromOWL(); 126 | } 127 | return prettyStringsForConcept; 128 | } 129 | 130 | /** 131 | * Returns the cached subtypes, computed through getSubTypesForConceptFromOWL on the first call. 132 | * @return Set of Strings with names for specializations (subclasses) of the concept 133 | * @throws OWLOntologyCreationException 134 | */ 135 | public Set getSubTypesForConcept() throws OWLOntologyCreationException { 136 | if (subTypesForConcept == null) { 137 | subTypesForConcept = getSubTypesForConceptFromOWL(); 138 | } 139 | return subTypesForConcept; 140 | } 141 | 142 | /** 143 | * Returns the cached types, computed through getTypesForConceptFromOWL on the first call. 144 | * @return Set of Strings with names for generalizations (superclasses) of the concept 145 | * @throws OWLOntologyCreationException 146 | */ 147 | public Set getTypesForConcept() throws OWLOntologyCreationException { 148 | if (typesForConcept == null) { 149 | typesForConcept = getTypesForConceptFromOWL(); 150 | } 151 | return typesForConcept; 152 | } 153 | 154 | private Set getCommentsForConceptFromOWL() throws OWLOntologyCreationException { 155 | OpenCycReasoner reasoner = OpenCycReasoner.get(); 156 | Set
comments = new HashSet<>(); 157 | OWLClass concept = reasoner.getDataFactory().getOWLClass(IRI.create("http://sw.opencyc.org/concept/" + conceptURI)); 158 | Collection anns = EntitySearcher.getAnnotations(concept, reasoner.getOpenCyc(), reasoner.getComment()); 159 | anns.forEach(ann -> { 160 | comments.add(ann.getValue().asLiteral().get().getLiteral()); 161 | }); 162 | 163 | return comments; 164 | } 165 | /** Returns the first label annotation literal found for the concept when conceptURI contains Mx; FakeName when reading that first label throws (for example because no label annotation was found); the empty string when conceptURI does not contain Mx. */ 166 | private String getLabelForConceptFromOWL() throws OWLOntologyCreationException { 167 | OpenCycReasoner reasoner = OpenCycReasoner.get(); 168 | String label = ""; 169 | List labels = new ArrayList<>(); 170 | OWLClass concept = reasoner.getDataFactory().getOWLClass(IRI.create("http://sw.opencyc.org/concept/" + conceptURI)); 171 | Collection anns = EntitySearcher.getAnnotations(concept, reasoner.getOpenCyc(), reasoner.getLabel()); 172 | anns.forEach(ann -> { 173 | labels.add(ann.getValue().asLiteral().get().getLiteral()); 174 | }); 175 | if (conceptURI.contains("Mx")) { 176 | try { 177 | label = labels.get(0); 178 | } catch (Exception e) { 179 | System.out.println("Something went wrong getting the label from OWL"); 180 | label = "FakeName"; 181 | } 182 | } 183 | return label; 184 | } 185 | /** Collects the literal of every pretty-string annotation found for the concept. */ 186 | private Set getPrettyStringsForConceptFromOWL() throws OWLOntologyCreationException { 187 | OpenCycReasoner reasoner = OpenCycReasoner.get(); 188 | Set prettyStrings = new HashSet<>(); 189 | OWLClass concept = reasoner.getDataFactory().getOWLClass(IRI.create("http://sw.opencyc.org/concept/" + conceptURI)); 190 | Collection anns = EntitySearcher.getAnnotations(concept, reasoner.getOpenCyc(), reasoner.getPrettyString()); 191 | anns.forEach(ann -> { 192 | prettyStrings.add(ann.getValue().asLiteral().get().getLiteral()); 193 | }); 194 | 195 | return prettyStrings; 196 | } 197 | /** Collects the IRI short form of every entity in the subclass nodes the reasoner returns for the concept. */ 198 | private Set getSubTypesForConceptFromOWL() throws OWLOntologyCreationException { 199 | OpenCycReasoner reasoner = OpenCycReasoner.get(); 200 | Set types = new HashSet<>(); 201 | OWLClass
concept = reasoner.getDataFactory().getOWLClass(IRI.create("http://sw.opencyc.org/concept/" + conceptURI)); 202 | NodeSet subClasses = reasoner.getReasoner().getSubClasses(concept, true); 203 | subClasses.forEach(node -> { 204 | Set ents = node.getEntities(); 205 | ents.forEach(ent -> { 206 | types.add(ent.getIRI().getShortForm()); 207 | }); 208 | }); 209 | return types; 210 | } 211 | /** Collects the IRI short form of every entity in the superclass nodes the reasoner returns for the concept (getSuperClasses); the local named subClasses therefore holds superclasses here. */ 212 | private Set getTypesForConceptFromOWL() throws OWLOntologyCreationException { 213 | OpenCycReasoner reasoner = OpenCycReasoner.get(); 214 | Set types = new HashSet<>(); 215 | OWLClass concept = reasoner.getDataFactory().getOWLClass(IRI.create("http://sw.opencyc.org/concept/" + conceptURI)); 216 | NodeSet subClasses = reasoner.getReasoner().getSuperClasses(concept, true); 217 | subClasses.forEach(node -> { 218 | Set ents = node.getEntities(); 219 | ents.forEach(ent -> { 220 | types.add(ent.getIRI().getShortForm()); 221 | }); 222 | }); 223 | return types; 224 | } 225 | 226 | //// Private helpers: picks the picture markup for the first recognised type id (compared ignoring case); returns the empty string when no type is recognised 227 | private String selectPicForConcept(Set types) { 228 | String picHTML = ""; 229 | for (String type : types) { 230 | if (type.equalsIgnoreCase("Mx4rvViADZwpEbGdrcN5Y29ycA")) { 231 | // Event 232 | picHTML = ""; 233 | return picHTML; 234 | } else if (type.equalsIgnoreCase("Mx4rIcwFloGUQdeMlsOWYLFB2w")) { 235 | // Human 236 | picHTML = ""; 237 | return picHTML; 238 | } else if (type.equalsIgnoreCase("Mx4rv-6HepwpEbGdrcN5Y29ycA")) { 239 | // Transportation; NOTE(review): unlike the Event and Human branches this one does not return at once, so a later matching type can still replace the picture - confirm this is intended 240 | picHTML = ""; 241 | } 242 | } 243 | 244 | return picHTML; 245 | } 246 | } 247 | -------------------------------------------------------------------------------- /OwlTools/src/main/java/com/cyc/tool/owltools/OpenCycOwl.java: -------------------------------------------------------------------------------- 1 | package com.cyc.tool.owltools; 2 | 3 | /* 4 | * #%L 5 | * OwlTools 6 | * %% 7 | * Copyright (C) 2015 Cycorp, Inc 8 | * %% 9 | * Licensed under the Apache License, Version 2.0 (the "License"); 10 | * you may not use this file
except in compliance with the License. 11 | * You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * #L% 21 | */ 22 | 23 | //import com.cyc.tool.distributedrepresentations.GoogleNewsW2VSpace; 24 | //import com.cyc.tool.distributedrepresentations.Word2VecSpace; 25 | import com.google.common.collect.Iterables; 26 | import java.io.File; 27 | import java.io.IOException; 28 | import java.util.Arrays; 29 | import java.util.Collection; 30 | import java.util.HashMap; 31 | import java.util.HashSet; 32 | import java.util.Locale; 33 | import java.util.Map; 34 | import java.util.Set; 35 | import java.util.concurrent.ConcurrentNavigableMap; 36 | import java.util.function.Predicate; 37 | import java.util.logging.Level; 38 | import java.util.logging.Logger; 39 | import java.util.stream.Collectors; 40 | import java.util.stream.Stream; 41 | import org.mapdb.DB; 42 | import org.mapdb.DBMaker; 43 | import org.semanticweb.owlapi.apibinding.OWLManager; 44 | import org.semanticweb.owlapi.io.FileDocumentSource; 45 | import org.semanticweb.owlapi.model.IRI; 46 | import org.semanticweb.owlapi.model.OWLAnnotation; 47 | import org.semanticweb.owlapi.model.OWLAnnotationProperty; 48 | import org.semanticweb.owlapi.model.OWLClass; 49 | import org.semanticweb.owlapi.model.OWLDataFactory; 50 | import org.semanticweb.owlapi.model.OWLLogicalEntity; 51 | import org.semanticweb.owlapi.model.OWLOntology; 52 | import org.semanticweb.owlapi.model.OWLOntologyCreationException; 53 | import org.semanticweb.owlapi.model.OWLOntologyManager; 54 | import org.semanticweb.owlapi.reasoner.Node; 55 | import 
org.semanticweb.owlapi.reasoner.NodeSet; 56 | import org.semanticweb.owlapi.reasoner.OWLReasoner; 57 | import org.semanticweb.owlapi.reasoner.OWLReasonerFactory; 58 | import org.semanticweb.owlapi.reasoner.structural.StructuralReasonerFactory; 59 | import org.semanticweb.owlapi.search.EntitySearcher; 60 | import org.semanticweb.owlapi.vocab.OWLRDFVocabulary; 61 | 62 | /** 63 | *

64 | * OpenCycOwl has methods for accessing information in an OpenCyc OWL file. 65 | * There is some known overlap with this class, {@link OpenCycReasoner}, 66 | * and {@link OpenCycContent}. 67 | * 68 | */ 69 | public class OpenCycOwl { 70 | 71 | 72 | static final String ocycLocation = OwlToolsConfig.ocycLocation; 73 | 74 | /** 75 | * HLID for testing purposes. 76 | */ 77 | public String pizzaGUID = "Mx4rvVibapwpEbGdrcN5Y29ycA"; 78 | private final boolean clearLabels = false; 79 | private final OWLDataFactory dataFactory; 80 | private final OWLOntologyManager manager; 81 | private OWLOntology openCyc; 82 | private final OWLAnnotationProperty prettyString; 83 | private final OWLAnnotationProperty rdfsLabel; 84 | private OWLReasoner reasoner; 85 | private final OWLReasonerFactory reasonerFactory; 86 | 87 | private long t; // time keeper 88 | Set allConcepts; 89 | final Map> conceptLabels; 90 | Set conceptsWithTerms; 91 | DB db; 92 | ConcurrentNavigableMap> ocycConceptForTermLabel; 93 | ConcurrentNavigableMap> ocycConceptForTermLower; 94 | ConcurrentNavigableMap> ocycConceptForTermPrettyString; 95 | ConcurrentNavigableMap> typeGraph; 96 | 97 | /** 98 | * Creates a new instance of OpenCycOwl: opens the MapDB file named by OwlToolsConfig.getOcycTermDBFile(), then loads or builds the term maps, the type graph and the concept-to-label index. 99 | * @throws java.io.IOException 100 | * @throws org.semanticweb.owlapi.model.OWLOntologyCreationException 101 | */ 102 | public OpenCycOwl() throws IOException, OWLOntologyCreationException { 103 | 104 | // A simple example of how to load and save an ontology We first need to 105 | // obtain a copy of an OWLOntologyManager, which, as the name suggests, 106 | // manages a set of ontologies. An ontology is unique within an ontology 107 | // manager. Each ontology knows its ontology manager. To load multiple 108 | // copies of an ontology, multiple managers would have to be used. 109 | manager = OWLManager.createOWLOntologyManager(); 110 | // We load an ontology from a document IRI - in this case we'll load the 111 | // pizza ontology.
112 | // IRI documentIRI = IRI.create(PIZZA_IRI); 113 | // Now ask the manager to load the ontology 114 | // OWLOntology ontology = manager 115 | // .loadOntologyFromOntologyDocument(documentIRI); 116 | // but in this test we don't rely on a remote ontology and load it from 117 | // a string 118 | //play with mapr 119 | // System.out.println(Arrays.asList(1,2,3,4,5,6,7,8).stream().map(x->x*x).reduce((x,y)->x+y).get()); 120 | /* open the MapDB file that stores the term maps, the concept set and the type graph */ 121 | db = DBMaker.newFileDB(new File(OwlToolsConfig.getOcycTermDBFile())) 122 | .closeOnJvmShutdown() 123 | // .encryptionEnable("password") 124 | .make(); 125 | 126 | reasonerFactory = new StructuralReasonerFactory(); 127 | dataFactory = manager.getOWLDataFactory(); 128 | prettyString = dataFactory.getOWLAnnotationProperty( 129 | guidToIRI("Mx4rwLSVCpwpEbGdrcN5Y29ycA")); 130 | rdfsLabel = dataFactory.getOWLAnnotationProperty(OWLRDFVocabulary.RDFS_LABEL.getIRI()); 131 | this.getPrettyStringToConceptMap(); 132 | this.getRDFSLabelConceptMap(); 133 | this.getLowerCaseConceptMap(); 134 | this.createTypeGraph(); 135 | conceptLabels = new HashMap<>(); 136 | this.fillConceptLabels(); 137 | } 138 | 139 | /** 140 | * Creates an OpenCycOwl, prints the number of classes, runs pizzaTest and removes the ontology from the manager. 141 | * @param args not used 142 | * @throws Exception 143 | */ 144 | public static void main(String[] args) throws Exception { 145 | 146 | OpenCycOwl my = new OpenCycOwl(); 147 | System.out.println("N Classes:" + my.getOpenCyc().getClassesInSignature().size()); 148 | my.pizzaTest(); 149 | // Remove the ontology from the manager 150 | my.manager.removeOntology(my.getOpenCyc()); 151 | } 152 | 153 | /** 154 | * Returns the set stored in the DB under OwlToolsConfig.getAllConceptsName(); when that set is empty it is first filled with the GUID of every class in the ontology signature and committed. 155 | * @return the allConcepts Set 156 | * @throws IOException 157 | */ 158 | public Set allConcepts() throws IOException { 159 | 160 | allConcepts = db.getHashSet(OwlToolsConfig.getAllConceptsName()); 161 | if (allConcepts.isEmpty()) { 162 | Set res 163 | = getOpenCyc().
164 | getClassesInSignature() 165 | .stream() 166 | .map(clss -> { 167 | String csid = clss.toStringID(); 168 | String s = guidFromURLString(csid); 169 | System.out.println("AC:" + csid + " " + s); 170 | return s; 171 | }) 172 | .collect(Collectors.toSet()); 173 | allConcepts.addAll(res); 174 | db.commit(); 175 | } 176 | return allConcepts; 177 | } 178 | 179 | /** 180 | * Removes the loaded ontology, if any, from the manager; the MapDB handle is not closed here. 181 | */ 182 | public void close() { 183 | if (openCyc != null) { 184 | manager.removeOntology(openCyc); 185 | } 186 | } 187 | 188 | /** 189 | * Looks the term up in the pretty-string and label maps and its lower-cased form in the lower-case map; when the term contains an underscore the lookup is repeated with underscores replaced by spaces. 190 | * @param term the term to look up 191 | * @return all concepts for a given term String (an empty set when nothing matches) 192 | */ 193 | public Set conceptsFor(String term) { 194 | Set ret = new HashSet<>(); 195 | if (ocycConceptForTermPrettyString.containsKey(term)) { 196 | ret.addAll(ocycConceptForTermPrettyString.get(term)); 197 | } 198 | if (ocycConceptForTermLabel.containsKey(term)) { 199 | ret.addAll(ocycConceptForTermLabel.get(term)); 200 | } 201 | String l = term.toLowerCase(Locale.ENGLISH); 202 | if (ocycConceptForTermLower.containsKey(l)) { 203 | ret.addAll(ocycConceptForTermLower.get(l)); 204 | } 205 | if (term.contains("_")) { 206 | ret.addAll(conceptsFor(term.replace("_", " "))); 207 | } 208 | return ret; 209 | } 210 | 211 | 212 | 213 | /** 214 | * 215 | * @return Set of concepts with terms in the W2V space 216 | * @throws IOException 217 | */ 218 | // public Set conceptsWithW2VTerms() throws IOException { 219 | // /* @Todo: Consider making this more independent of the particular W2V space */ 220 | // Word2VecSpace w2v = GoogleNewsW2VSpace.get(); 221 | // conceptsWithTerms = db.getHashSet(OwlToolsConfig.getConceptsWithTermsName()); 222 | // if (conceptsWithTerms.isEmpty()) { 223 | // Set res 224 | // = Stream.concat( 225 | // Stream.concat( 226 | // ocycConceptForTermPrettyString.entrySet().stream(), 227 | // ocycConceptForTermLabel.entrySet().stream()), 228 | // ocycConceptForTermLower.entrySet().stream()) 229 | // .filter(s -> w2v.knownTerm(s.getKey())) 230 | //
.map(s -> s.getValue()) 231 | // .flatMap(conceptSet -> conceptSet.stream()) 232 | // .collect(Collectors.toSet()); 233 | // conceptsWithTerms.addAll(res); 234 | // db.commit(); 235 | // } 236 | // return conceptsWithTerms; 237 | // } 238 | 239 | /** 240 | * Reads the entry for forT from typeGraph. 241 | * @param forT 242 | * @return the set stored in typeGraph for forT, or a new empty set when there is no entry 243 | */ 244 | public Set getTypes(String forT) { 245 | Set ret = new HashSet<>(); 246 | if (typeGraph.containsKey(forT)) { 247 | return typeGraph.get(forT); 248 | } 249 | if (forT.equals("Thing")) { 250 | return ret; 251 | } 252 | // System.out.println("No types for :" + guidToURLString(forT)); 253 | return ret; 254 | } 255 | 256 | /** 257 | * Asks the reasoner for the superclasses of the concept (getSuperClasses) and collects their IRI short forms. 258 | * @param conceptGUID 259 | * @return Set of types for a concept 260 | * @throws OWLOntologyCreationException 261 | */ 262 | public Set getTypesForConceptFromOWL(String conceptGUID) throws OWLOntologyCreationException { 263 | 264 | Set types = new HashSet<>(); 265 | OWLClass concept 266 | = dataFactory.getOWLClass(guidToIRI(conceptGUID)); 267 | NodeSet subClasses = getReasoner() 268 | .getSuperClasses(concept, true); 269 | subClasses.forEach(node -> { 270 | Set ents = node.getEntities(); 271 | ents.forEach(ent -> { 272 | types.add(ent.getIRI().getShortForm()); 273 | }); 274 | }); 275 | return types; 276 | } 277 | 278 | /** 279 | * Follows typeGraph upwards: starts from the types stored for forT and recursively adds the types of each type found. 280 | * @param forT 281 | * @return the collected types; empty when typeGraph has no entry for forT 282 | */ 283 | public Set getTypesTransitive(String forT) { 284 | Set ret = new HashSet<>(); 285 | if (typeGraph.containsKey(forT)) { 286 | 287 | typeGraph 288 | .get(forT) 289 | .forEach(t -> { 290 | getTypesTransitive(t, ret); 291 | }); 292 | return ret; 293 | } 294 | // System.out.println("PROBLEM: " + forT); 295 | return ret; 296 | } 297 | 298 | /** 299 | * URL variant of getTypesTransitive: strips the concept URL prefix from forT and turns every result back into a URL. 300 | * @param forT 301 | * @return Set of types for a term 302 | */ 303 | public Set getTypesTransitiveURL(String forT) { 304 | return getTypesTransitive(guidFromURLString(forT)) 305 | .stream() 306 | .map(t -> guidToURLString(t)) 307 |
.collect(Collectors.toSet()); 308 | } 309 | 310 | /** 311 | * URL variant of getTypes: strips the concept URL prefix from forT and turns every result back into a URL. 312 | * @param forT 313 | * @return Set of types of a term 314 | */ 315 | public Set getTypesURL(String forT) { 316 | return getTypes(guidFromURLString(forT)) 317 | .stream() 318 | .map(t -> guidToURLString(t)) 319 | .collect(Collectors.toSet()); 320 | } 321 | 322 | /** 323 | * NOTE(review): replaceFirst interprets its first argument as a regular expression, so the dots in the prefix match any character. 324 | * @param url 325 | * @return GUID from a URL 326 | */ 327 | public String guidFromURLString(String url) { 328 | return url.replaceFirst("http://sw.opencyc.org/concept/", ""); 329 | } 330 | 331 | /** 332 | * 333 | * @param conceptGuid 334 | * @return URL from a GUID 335 | */ 336 | public String guidToURLString(String conceptGuid) { 337 | return "http://sw.opencyc.org/concept/" + conceptGuid; 338 | } 339 | 340 | /** 341 | * Checks the pretty-string and label maps for the term and the lower-case map for its lower-cased form; a term containing an underscore is retried with underscores replaced by spaces. 342 | * @param term 343 | * @return true if term is in the ontology 344 | */ 345 | public boolean knownTerm(String term) { 346 | if (ocycConceptForTermPrettyString.containsKey(term)) { 347 | return true; 348 | } 349 | if (ocycConceptForTermLabel.containsKey(term)) { 350 | return true; 351 | } 352 | if (ocycConceptForTermLower.containsKey(term.toLowerCase(Locale.ENGLISH))) { 353 | return true; 354 | } 355 | if (term.contains("_")) { 356 | return knownTerm(term.replace("_", " ")); 357 | } 358 | return false; 359 | } 360 | 361 | /** 362 | * 363 | * @param concept 364 | * @return the labels recorded for the concept joined with |, or the concept itself when none are recorded 365 | */ 366 | public String labelsForConcept(String concept) { 367 | if (conceptLabels.containsKey(concept)) { 368 | return String.join("|", conceptLabels.get(concept)); 369 | } 370 | return concept; 371 | } 372 | 373 | /** 374 | * 375 | * @return a Predicate over term arrays that holds when none of the terms in the array is a known term 376 | */ 377 | public Predicate noConcept() { 378 | return a -> !Arrays.stream(a) 379 | .anyMatch(hasConcept()); 380 | } 381 | 382 | /** 383 | * 384 | * @return Number of classes in the ontology 385 | */ 386 | public int size() { 387 | return getOpenCyc().getClassesInSignature().size(); 388 | } 389 | 390 | /**
391 | * 392 | * @return an OWLOntology for OpenCyc 393 | */ 394 | protected OWLOntology getOpenCyc() { /* loads the ontology from ocycLocation on first use and reuses it afterwards */ 395 | if (openCyc == null) { 396 | try { 397 | t = System.currentTimeMillis(); 398 | openCyc = manager 399 | .loadOntologyFromOntologyDocument( 400 | new FileDocumentSource( 401 | new File(ocycLocation))); 402 | System.out.println("Open Cyc Load time:" 403 | + (System.currentTimeMillis() - t) + "ms"); 404 | } catch (OWLOntologyCreationException ex) { /* a load failure is only logged, so the method then returns null */ 405 | Logger.getLogger(OpenCycOwl.class.getName()).log(Level.SEVERE, null, ex); 406 | } 407 | } 408 | 409 | return openCyc; 410 | } 411 | 412 | /** 413 | * Creates the reasoner over getOpenCyc() on first use and reuses it afterwards. 414 | * @return an OWLReasoner 415 | */ 416 | protected OWLReasoner getReasoner() { 417 | if (reasoner == null) { 418 | reasoner = reasonerFactory.createReasoner(getOpenCyc()); 419 | } 420 | return reasoner; 421 | } 422 | /** Loads the type graph from the DB; when it is empty, computes the types of every concept from allConcepts() through getTypesForConceptFromOWL and stores them. */ 423 | private void createTypeGraph() throws IOException { 424 | typeGraph = db.getTreeMap(OwlToolsConfig.getTypeGraphName()); 425 | if (typeGraph.isEmpty()) { 426 | allConcepts(). 427 | stream().
428 | map(c -> guidFromURLString(c)) 429 | .forEach(s -> { 430 | try { 431 | Set types = getTypesForConceptFromOWL(s); 432 | System.out.println("Types for " + s + ": " + types.size()); 433 | typeGraph.put(s, types); 434 | } catch (OWLOntologyCreationException ex) { 435 | Logger.getLogger(OpenCycOwl.class.getName()).log(Level.SEVERE, null, ex); 436 | } 437 | }); 438 | db.commit(); 439 | db.compact(); 440 | } 441 | 442 | } 443 | /** Builds the in-memory reverse index conceptLabels (concept to the terms it is stored under) from the label and pretty-string term maps and prints the elapsed time. */ 444 | private void fillConceptLabels() { 445 | 446 | t = System.currentTimeMillis(); 447 | Iterables.concat(ocycConceptForTermLabel.entrySet(), 448 | /* the label map used to be listed twice here; the redundant second pass is dropped. NOTE(review): ocycConceptForTermLower may have been intended instead - confirm */ 449 | ocycConceptForTermPrettyString.entrySet()).forEach(entry -> { 450 | Set concepts = entry.getValue(); 451 | concepts.forEach(concept -> { 452 | if (!conceptLabels.containsKey(concept)) { 453 | conceptLabels.put(concept, new HashSet<>()); 454 | } 455 | conceptLabels.get(concept).add(entry.getKey()); 456 | }); 457 | }); 458 | System.out.println("Concept to term map creation:" 459 | + (System.currentTimeMillis() - t) + "ms"); 460 | } 461 | /** Loads the lower-case term map from the DB; when it is empty, fills it from the keys of the pretty-string and label maps via storeDownCaseLabel. */ 462 | private void getLowerCaseConceptMap() { 463 | 464 | ocycConceptForTermLower = db.getTreeMap(OwlToolsConfig.getOcycTermMapName() + "_Lower"); 465 | if (clearLabels) { 466 | ocycConceptForTermLower.clear(); 467 | } 468 | if (ocycConceptForTermLower.isEmpty()) { 469 | ocycConceptForTermPrettyString.keySet().forEach(s -> { 470 | storeDownCaseLabel(s, ocycConceptForTermPrettyString); 471 | }); 472 | 473 | ocycConceptForTermLabel.keySet().forEach(s -> { 474 | storeDownCaseLabel(s, ocycConceptForTermLabel); 475 | }); 476 | db.commit(); 477 | db.compact(); 478 | } 479 | 480 | } 481 | 482 | private void getPrettyStringToConceptMap() { 483 | // Build, or reuse from the DB, the map from prettyString literal to the ids 484 | // of the classes and individuals carrying it; the ontology is only scanned 485 | // when the stored map is empty.
486 | /* load the map persisted in the DB under getOcycTermMapName(); it is rebuilt from the ontology below only when empty */ 487 | ocycConceptForTermPrettyString = db.getTreeMap(OwlToolsConfig.getOcycTermMapName()); 488 | if (clearLabels) { 489 | ocycConceptForTermPrettyString.clear(); 490 | } 491 | if (ocycConceptForTermPrettyString.isEmpty()) { 492 | Iterables.concat( 493 | getOpenCyc().getClassesInSignature(), 494 | getOpenCyc().getIndividualsInSignature()).forEach(owlObj -> { 495 | System.out.println("Loading PrettyStrings for " 496 | + (owlObj instanceof OWLClass ? "Class" : "Individual") + ": " + owlObj); 497 | Collection annotations 498 | = EntitySearcher.getAnnotations(owlObj, getOpenCyc(), prettyString); 499 | annotations.forEach(ann -> { 500 | storeConceptLabel(ann, owlObj, ocycConceptForTermPrettyString); 501 | }); 502 | }); 503 | db.commit(); 504 | db.compact(); 505 | 506 | } 507 | } 508 | 509 | private void getRDFSLabelConceptMap() { 510 | // Build, or reuse from the DB, the map from rdfs:label literal to the ids of 511 | // the classes and individuals carrying that label; the ontology is only 512 | // scanned when the stored map is empty. 513 | 514 | ocycConceptForTermLabel = db.getTreeMap(OwlToolsConfig.getOcycTermMapName() + "_Label"); 515 | if (clearLabels) { 516 | ocycConceptForTermLabel.clear(); 517 | } 518 | if (ocycConceptForTermLabel.isEmpty()) { 519 | // Get the terms for collections and individuals 520 | Iterables.concat( 521 | getOpenCyc().getClassesInSignature(), 522 | getOpenCyc().getIndividualsInSignature()).forEach(owlObj -> { 523 | System.out.println("Loading RDFS Labels for " 524 | + (owlObj instanceof OWLClass ?
"Class" : "Individual") + ": " + owlObj); 525 | Collection annotations 526 | = EntitySearcher.getAnnotations(owlObj, getOpenCyc(), rdfsLabel); 527 | annotations.forEach(ann -> { 528 | storeConceptLabel(ann, owlObj, ocycConceptForTermLabel); 529 | }); 530 | }); 531 | 532 | db.commit(); 533 | db.compact(); 534 | } 535 | } 536 | /** Adds forT and, recursively, everything reachable from it through getTypes to soFar; soFar doubles as the visited set and the recursion does not continue past Thing. */ 537 | private void getTypesTransitive(String forT, Set soFar) { 538 | if (!soFar.contains(forT)) { 539 | soFar.add(forT); 540 | if (forT.equals("Thing")) { 541 | return; 542 | } 543 | getTypes(forT) 544 | .forEach(st -> { 545 | getTypesTransitive(st, soFar); 546 | }); 547 | 548 | } 549 | } 550 | 551 | private IRI guidToIRI(String conceptGuid) { 552 | return IRI.create(guidToURLString(conceptGuid)); 553 | } 554 | 555 | private Predicate hasConcept() { 556 | return a -> knownTerm(a); 557 | } 558 | /** Manual smoke test called from main: prints the pretty strings of the pizzaGUID concept and of each subclass the reasoner reports for it. */ 559 | private void pizzaTest() { 560 | // Now save a copy to another location in OWL/XML format (i.e. disregard 561 | // the format that the ontology was loaded in). 562 | //File f = folder.newFile("owlapiexample_example1.xml"); 563 | //IRI documentIRI2 = IRI.create(f); 564 | //manager.saveOntology(ontology, new OWLXMLDocumentFormat(), documentIRI2); 565 | 566 | OWLClass pizza 567 | = dataFactory.getOWLClass(guidToIRI(pizzaGUID)); 568 | 569 | NodeSet subClses = getReasoner().getSubClasses(pizza, true); 570 | // Setop=pizza.getObjectPropertiesInSignature(); 571 | t = System.currentTimeMillis(); 572 | Collection anns 573 | = EntitySearcher.getAnnotations(pizza, getOpenCyc(), prettyString); 574 | 575 | System.out.println("Search time:" + (System.currentTimeMillis() - t) + "ms"); 576 | anns.forEach(ann 577 | -> System.out.println(ann.getValue().asLiteral().get().getLiteral() 578 | )); 579 | 580 | subClses.forEach((Node node) -> { 581 | Set em = node.getEntities(); 582 | em.forEach(clss -> { 583 | System.out.println("SubType:" + clss); 584 | Collection annotations = EntitySearcher.getAnnotations(clss, getOpenCyc(), prettyString); 585 |
annotations.forEach(ann -> { 586 | String lit = ann.getValue().asLiteral().get().getLiteral(); 587 | System.out.println("\t:" + lit); 588 | }); 589 | }); 590 | }); 591 | } 592 | /** Indexes the id of owlObj under the literal of the annotation in labelMap, merging with any ids already stored there. For a literal that starts with the article "the " the id is additionally indexed under the literal with only that leading article removed. */ 593 | private void storeConceptLabel(OWLAnnotation ann, OWLLogicalEntity owlObj, ConcurrentNavigableMap> labelMap) { 594 | String lit = ann.getValue().asLiteral().get().getLiteral(); 595 | final Set newLabels = new HashSet<>(); 596 | if (labelMap.containsKey(lit)) { 597 | newLabels.addAll(labelMap.get(lit)); 598 | } 599 | newLabels.add(owlObj.toStringID()); 600 | labelMap.put(lit, newLabels); 601 | if (lit.startsWith("the ")) { //hack to artificially extend reach 602 | final Set newLabelsThe = new HashSet<>(); 603 | String key = lit.substring("the ".length()); /* strip only the leading article; replace() also deleted every later occurrence of "the ", including inside words such as "Lathe " */ 604 | if (labelMap.containsKey(key)) { 605 | newLabelsThe.addAll(labelMap.get(key)); 606 | } 607 | newLabelsThe.add(owlObj.toStringID()); 608 | labelMap.put(key, newLabelsThe); 609 | } 610 | // System.out.println((sp.knownTerm(lit) ? "+" : "-") + lit); 611 | } 612 | /** Adds the concepts stored under s in labelMap to the entry for the lower-cased s (Locale.ENGLISH) in ocycConceptForTermLower, keeping concepts already stored there. */ 613 | private void storeDownCaseLabel(String s, ConcurrentNavigableMap> labelMap) { 614 | final Set newLabels = new HashSet<>(); 615 | String l = s.toLowerCase(Locale.ENGLISH); 616 | 617 | if (ocycConceptForTermLower.containsKey(l)) { 618 | newLabels.addAll(ocycConceptForTermLower.get(l)); 619 | } 620 | newLabels.addAll(labelMap.get(s)); 621 | 622 | ocycConceptForTermLower.put(l, newLabels); 623 | } 624 | 625 | } 626 | -------------------------------------------------------------------------------- /OwlTools/src/main/java/com/cyc/tool/owltools/OpenCycReasoner.java: -------------------------------------------------------------------------------- 1 | package com.cyc.tool.owltools; 2 | 3 | /* 4 | * #%L 5 | * OwlTools 6 | * %% 7 | * Copyright (C) 2015 Cycorp, Inc 8 | * %% 9 | * Licensed under the Apache License, Version 2.0 (the "License"); 10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * #L% 21 | */ 22 | 23 | import java.io.File; 24 | import java.util.ArrayList; 25 | import java.util.List; 26 | import java.util.Set; 27 | import org.semanticweb.owlapi.apibinding.OWLManager; 28 | import org.semanticweb.owlapi.io.FileDocumentSource; 29 | import org.semanticweb.owlapi.model.IRI; 30 | import org.semanticweb.owlapi.model.OWLAnnotationProperty; 31 | import org.semanticweb.owlapi.model.OWLClass; 32 | import org.semanticweb.owlapi.model.OWLDataFactory; 33 | import org.semanticweb.owlapi.model.OWLOntology; 34 | import org.semanticweb.owlapi.model.OWLOntologyCreationException; 35 | import org.semanticweb.owlapi.model.OWLOntologyManager; 36 | import org.semanticweb.owlapi.reasoner.OWLReasoner; 37 | import org.semanticweb.owlapi.reasoner.OWLReasonerFactory; 38 | import org.semanticweb.owlapi.reasoner.structural.StructuralReasonerFactory; 39 | import org.semanticweb.owlapi.util.DefaultPrefixManager; 40 | 41 | /** 42 | *

43 | * OpenCycReasoner provides access to methods that {@link OpenCycContent} uses to get information 44 | * out of the OpenCyc OWL file. 45 | * 46 | */ 47 | public class OpenCycReasoner { 48 | 49 | private static List allClassIRIs = null; 50 | private static Set allClasses = null; 51 | private static OWLAnnotationProperty comment = null; 52 | private static OWLDataFactory dataFactory = null; 53 | private static OWLAnnotationProperty label = null; 54 | private static OWLOntologyManager manager = null; 55 | 56 | private static OpenCycReasoner me = null; 57 | 58 | private static OWLOntology openCyc = null; 59 | 60 | private static OWLAnnotationProperty prettyString = null; 61 | 62 | private static OWLReasoner reasoner = null; 63 | private static OWLReasonerFactory reasonerFactory = null; 64 | static final String ocycLocation = OwlToolsConfig.ocycLocation; 65 | 66 | /** 67 | * Creates a new instance of OpenCycReasoner: loads the ontology from getOcycLocation(), creates a structural reasoner over it and fills the shared static fields. Private; instances are obtained through get(). 68 | */ 69 | private OpenCycReasoner() throws OWLOntologyCreationException { 70 | manager = OWLManager.createOWLOntologyManager(); 71 | openCyc = getManager() 72 | .loadOntologyFromOntologyDocument(new FileDocumentSource(new File(getOcycLocation()))); 73 | reasonerFactory = new StructuralReasonerFactory(); 74 | reasoner = getReasonerFactory().createReasoner(getOpenCyc()); 75 | dataFactory = getManager().getOWLDataFactory(); 76 | prettyString = getDataFactory().getOWLAnnotationProperty(IRI.create("http://sw.opencyc.org/concept/Mx4rwLSVCpwpEbGdrcN5Y29ycA")); 77 | comment = getDataFactory().getRDFSComment(); 78 | label = getDataFactory().getOWLAnnotationProperty("label", new DefaultPrefixManager("http://sw.cyc.com/CycAnnotations_v1#")); 79 | 80 | allClasses = openCyc.getClassesInSignature(); 81 | allClassIRIs = getIRIs(allClasses); 82 | } 83 | 84 | /** 85 | * Factory method to get an OpenCycReasoner instance.
86 | * The instance is created on the first call and reused afterwards; the null check is not synchronized. 87 | * @return an OpenCycReasoner 88 | * @throws OWLOntologyCreationException 89 | */ 90 | public static OpenCycReasoner get() throws OWLOntologyCreationException { 91 | if (me == null) { 92 | me = new OpenCycReasoner(); 93 | } 94 | return me; 95 | } 96 | 97 | /** 98 | * 99 | * @return allClasses, the classes in the ontology signature captured by the constructor 100 | */ 101 | public Set getAllClasses() { 102 | return allClasses; 103 | } 104 | 105 | /** 106 | * 107 | * @return allClassIRIs, the IRI fragment of each class in allClasses 108 | */ 109 | public List getAllIRIs() { 110 | return allClassIRIs; 111 | } 112 | 113 | /** 114 | * @return the comment annotation property (RDFS comment) 115 | */ 116 | public OWLAnnotationProperty getComment() { 117 | return comment; 118 | } 119 | 120 | /** 121 | * @return the dataFactory 122 | */ 123 | public OWLDataFactory getDataFactory() { 124 | return dataFactory; 125 | } 126 | 127 | /** 128 | * 129 | * @return the label annotation property, named label under the CycAnnotations_v1 prefix 130 | */ 131 | public OWLAnnotationProperty getLabel() { 132 | return label; 133 | } 134 | 135 | /** 136 | * @return the manager 137 | */ 138 | public OWLOntologyManager getManager() { 139 | return manager; 140 | } 141 | 142 | /** 143 | * @return the ocycLocation 144 | */ 145 | public String getOcycLocation() { 146 | return ocycLocation; 147 | } 148 | 149 | /** 150 | * @return the openCyc 151 | */ 152 | public OWLOntology getOpenCyc() { 153 | return openCyc; 154 | } 155 | 156 | /** 157 | * @return the prettyString 158 | */ 159 | public OWLAnnotationProperty getPrettyString() { 160 | return prettyString; 161 | } 162 | 163 | /** 164 | * @return the reasoner 165 | */ 166 | public OWLReasoner getReasoner() { 167 | return reasoner; 168 | } 169 | 170 | /** 171 | * @return the reasonerFactory 172 | */ 173 | public OWLReasonerFactory getReasonerFactory() { 174 | return reasonerFactory; 175 | } 176 | /** Collects getIRI().getFragment() of every class in the given set, in the iteration order of the set. */ 177 | private List getIRIs(Set allClasses) { 178 | List allIRIs = new ArrayList<>(); 179 | allClasses.forEach(c -> { 180 | String iri = c.getIRI().getFragment(); 181 | allIRIs.add(iri); 182 | }); 183 | return allIRIs; 184 | } 185 | 186 | } 187 |
-------------------------------------------------------------------------------- /OwlTools/src/main/java/com/cyc/tool/owltools/OwlToolsConfig.java: -------------------------------------------------------------------------------- 1 | package com.cyc.tool.owltools; 2 | 3 | /* 4 | * #%L 5 | * OwlTools 6 | * %% 7 | * Copyright (C) 2015 Cycorp, Inc 8 | * %% 9 | * Licensed under the Apache License, Version 2.0 (the "License"); 10 | * you may not use this file except in compliance with the License. 11 | * You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * #L% 21 | */ 22 | 23 | import com.cyc.tool.MapDBConfiguration; 24 | 25 | /** 26 | *

OwlToolsConfig provides some locations to use for classes in the OwlTools project. 27 | */ 28 | public class OwlToolsConfig extends MapDBConfiguration{ 29 | 30 | /** 31 | * The location of the OpenCyc OWL export file. 32 | */ 33 | final public static String ocycLocation = "/cyc/projects/kbTaxonomy/owl-export-unversioned.owl"; 34 | private static final String allConceptsName = "allConcepts"; 35 | private static final String conceptsWithTermsName = "termsWithConcepts"; 36 | private static final String fallBackDBLocation = "/cyc/projects/kbTaxonomy/Experiments/ConceptFinder/"; 37 | 38 | // From OwlToolsConfig.java in W2VOCyc 39 | private static final String ocycTermDBFile = "/ocycTerm"; 40 | private static final String ocycTermMapName = "owlTerms"; 41 | 42 | 43 | private static final String typeGraphName = "typeGraph"; 44 | 45 | /** 46 | * 47 | * @return the allConceptsName 48 | */ 49 | protected static String getAllConceptsName() { 50 | return allConceptsName; 51 | } 52 | 53 | /** 54 | * 55 | * @return the conceptsWithTermsName 56 | */ 57 | protected static String getConceptsWithTermsName() { 58 | return conceptsWithTermsName; 59 | } 60 | 61 | /** 62 | * 63 | * @return the location of the ocycTermDBFile 64 | */ 65 | protected static String getOcycTermDBFile() { 66 | return getMapDBBase(fallBackDBLocation) + 67 | ocycTermDBFile; 68 | } 69 | 70 | /** 71 | * 72 | * @return the ocycTermMapName 73 | */ 74 | protected static String getOcycTermMapName() { 75 | return ocycTermMapName; 76 | } 77 | 78 | /** 79 | * 80 | * @return the typeGraphName 81 | */ 82 | protected static String getTypeGraphName() { 83 | return typeGraphName; 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /OwlTools/src/test/java/com/cyc/tool/owltools/OpenCycContentIT.java: -------------------------------------------------------------------------------- 1 | package com.cyc.tool.owltools; 2 | 3 | /* 4 | * #%L 5 | * OwlTools 6 | * %% 7 | * Copyright (C) 
2015 Cycorp, Inc 8 | * %% 9 | * Licensed under the Apache License, Version 2.0 (the "License"); 10 | * you may not use this file except in compliance with the License. 11 | * You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * #L% 21 | */ 22 | 23 | import java.util.HashSet; 24 | import java.util.Set; 25 | import org.junit.After; 26 | import org.junit.AfterClass; 27 | import static org.junit.Assert.assertEquals; 28 | import static org.junit.Assert.assertTrue; 29 | import org.junit.Before; 30 | import org.junit.BeforeClass; 31 | import org.junit.Test; 32 | import org.semanticweb.owlapi.model.OWLOntologyCreationException; 33 | 34 | /** 35 | * Tests for OpenCycContent. 36 | * 37 | */ 38 | public class OpenCycContentIT { 39 | 40 | public OpenCycContentIT() { 41 | } 42 | 43 | @BeforeClass 44 | public static void setUpClass() { 45 | } 46 | 47 | @AfterClass 48 | public static void tearDownClass() { 49 | } 50 | 51 | @Before 52 | public void setUp() { 53 | } 54 | 55 | @After 56 | public void tearDown() { 57 | } 58 | 59 | /** 60 | * Test of generateHtmlForConcept method, of class OpenCycContent. 61 | * @throws java.lang.Exception 62 | */ 63 | @Test 64 | public void testGenerateHtmlForConcept() throws Exception { 65 | System.out.println("generateHtmlForConcept"); 66 | OpenCycContent instance = new OpenCycContent("Mx4rKJAessNqRP6Yzb7lBhCrwQ"); // #$DogBreedShow;; 67 | String result = instance.generateHtmlForConcept(); 68 | System.out.println(result); 69 | assertTrue(result.contains("

English Phrases:

")); 70 | 71 | } 72 | 73 | /** 74 | * Test of getCommentsForConceptFromOWL method, of class OpenCycContent. 75 | * @throws org.semanticweb.owlapi.model.OWLOntologyCreationException 76 | */ 77 | @Test 78 | public void testGetCommentsForConcept() throws OWLOntologyCreationException { 79 | System.out.println("getCommentsForConcept"); 80 | OpenCycContent instance = new OpenCycContent("Mx4rKJAessNqRP6Yzb7lBhCrwQ"); // #$DogBreedShow 81 | int expResultSize = 1; 82 | Set result = instance.getCommentsForConcept(); 83 | System.out.println("Comments: " + result); 84 | assertEquals(expResultSize, result.size()); 85 | 86 | } 87 | 88 | /** 89 | * Test of getLabelForConcept method, of class OpenCycContent. 90 | * @throws java.lang.Exception 91 | */ 92 | @Test 93 | public void testGetLabelForConcept() throws Exception { 94 | System.out.println("getLabelForConcept"); 95 | OpenCycContent instance = new OpenCycContent("Mx4rKJAessNqRP6Yzb7lBhCrwQ"); // #$DogBreedShow; 96 | String expResult = "DogBreedShow"; 97 | String result = instance.getLabelForConcept(); 98 | System.out.println("Label: " + result); 99 | assertEquals(expResult, result); 100 | } 101 | 102 | /** 103 | * Test of getPrettyStringsForConceptFromOWL method, of class OpenCycContent. 104 | * @throws org.semanticweb.owlapi.model.OWLOntologyCreationException 105 | */ 106 | @Test 107 | public void testGetPrettyStringsForConcept() throws OWLOntologyCreationException { 108 | System.out.println("getPrettyStringsForConcept"); 109 | OpenCycContent instance = new OpenCycContent("Mx4rKJAessNqRP6Yzb7lBhCrwQ"); // #$DogBreedShow 110 | int expResultSize = 7; 111 | Set result = instance.getPrettyStringsForConcept(); 112 | System.out.println("Pretty Strings: " + result); 113 | assertEquals(expResultSize, result.size()); 114 | 115 | } 116 | 117 | /** 118 | * Test of getTypesForConcept method, of class OpenCycContent. 
119 | * @throws java.lang.Exception 120 | */ 121 | @Test 122 | public void testGetTypesForConcept() throws Exception { 123 | System.out.println("getTypesForConcept"); 124 | OpenCycContent instance = new OpenCycContent("Mx4rKJAessNqRP6Yzb7lBhCrwQ"); // #$DogBreedShow;; 125 | Set expResult = new HashSet<>(); 126 | expResult.add("Mx4r7LaSPmtpQfiSSf5yKM70tg"); 127 | Set result = instance.getTypesForConcept(); 128 | System.out.println("Types: " + result); 129 | assertEquals(expResult, result); 130 | 131 | } 132 | 133 | } 134 | -------------------------------------------------------------------------------- /OwlTools/src/test/java/com/cyc/tool/owltools/OpenCycOwlIT.java: -------------------------------------------------------------------------------- 1 | package com.cyc.tool.owltools; 2 | 3 | /* 4 | * #%L 5 | * OwlTools 6 | * %% 7 | * Copyright (C) 2015 Cycorp, Inc 8 | * %% 9 | * Licensed under the Apache License, Version 2.0 (the "License"); 10 | * you may not use this file except in compliance with the License. 11 | * You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 
20 | * #L% 21 | */ 22 | 23 | //import com.cyc.tool.distributedrepresentations.GoogleNewsW2VSpace; 24 | //import com.cyc.tool.distributedrepresentations.Word2VecSpace; 25 | import com.google.common.collect.Iterables; 26 | import java.io.IOException; 27 | import java.util.HashSet; 28 | import java.util.Set; 29 | import org.junit.AfterClass; 30 | import static org.junit.Assert.assertEquals; 31 | import static org.junit.Assert.assertTrue; 32 | import org.junit.BeforeClass; 33 | import org.junit.Test; 34 | import org.semanticweb.owlapi.model.OWLOntologyCreationException; 35 | 36 | /** 37 | * Tests for OpenCycOwl. 38 | */ 39 | public class OpenCycOwlIT { 40 | 41 | static OpenCycOwl ocyc; 42 | 43 | public OpenCycOwlIT() { 44 | } 45 | 46 | @BeforeClass 47 | public static void setUpClass() throws IOException, OWLOntologyCreationException { 48 | ocyc = new OpenCycOwl(); 49 | 50 | } 51 | 52 | @AfterClass 53 | public static void tearDownClass() { 54 | // Remove the ontology from the manager 55 | ocyc.close(); 56 | } 57 | 58 | @Test 59 | public void conceptForTest1() { 60 | Set res = ocyc.conceptsFor("the Yangtze"); 61 | assertEquals(1, res.size()); 62 | assertTrue(res.contains("http://sw.opencyc.org/concept/Mx4rvVj5qJwpEbGdrcN5Y29ycA")); 63 | } 64 | 65 | @Test 66 | public void conceptsForBirdTest1() { 67 | Set res = ocyc.conceptsFor("Bird"); 68 | System.out.println("HEY Bird "+res); 69 | assertEquals(2,res.size()); 70 | assertTrue(res.contains("http://sw.opencyc.org/concept/Mx4rvVi8SJwpEbGdrcN5Y29ycA")); 71 | } 72 | 73 | @Test 74 | public void conceptsForBirdTest2() { 75 | Set res = ocyc.conceptsFor("bird"); 76 | System.out.println("HEY bird "+res); 77 | assertEquals(2, res.size()); 78 | assertTrue(res.contains("http://sw.opencyc.org/concept/Mx4rvVi8SJwpEbGdrcN5Y29ycA")); 79 | } 80 | 81 | // @Test 82 | // public void conceptsWithTermsTest() throws IOException { 83 | // Set res = ocyc.conceptsWithW2VTerms(); 84 | // assertEquals(49616, res.size()); 85 | // } 86 | 87 | @Test 88 
| public void getLabelsTest() { 89 | int res = ocyc.ocycConceptForTermLabel.size(); 90 | //ocyc.ocycConceptForTermLabel.keySet().forEach(s->{ 91 | // System.out.println(s+"\t"+ocyc.ocycConceptForTermLabel.get(s)); 92 | // }); 93 | System.out.println("N RDFS Labels with concepts:" + res); 94 | assertEquals(240258, res); 95 | } 96 | 97 | @Test 98 | public void getLowerCaseStringTest() { 99 | int res = ocyc.ocycConceptForTermLower.size(); 100 | System.out.println("N downcased pretty strings or labels with concepts:" + res); 101 | assertEquals(576678, res); 102 | } 103 | 104 | @Test 105 | public void getNConceptsTest() { 106 | int res = ocyc.size(); 107 | System.out.println("N Classes:" + res); 108 | assertEquals(116842, res); 109 | } 110 | 111 | @Test 112 | public void getPrettyStringTest() { 113 | int res = ocyc.ocycConceptForTermPrettyString.size(); 114 | System.out.println("N pretty Strings with concepts:" + res); 115 | assertEquals(345298, res); 116 | } 117 | 118 | @Test 119 | public void getTypesTest() throws IOException { 120 | Set res = ocyc.getTypes(ocyc.pizzaGUID); 121 | res.forEach(s -> { 122 | System.out.println("Pizza: " + ocyc.guidToURLString(s)); 123 | }); 124 | assertEquals(4, res.size()); 125 | } 126 | 127 | @Test 128 | public void getTypesTransitiveTest() throws IOException { 129 | Set res = ocyc.getTypesTransitive(ocyc.pizzaGUID); 130 | res.forEach(s -> { 131 | System.out.println("Pizza: " + ocyc.guidToURLString(s)); 132 | }); 133 | assertEquals(62, res.size()); 134 | } 135 | 136 | @Test 137 | public void guidFromURLStringTest() { 138 | String res = ocyc.guidFromURLString(ocyc.guidToURLString(ocyc.pizzaGUID)); 139 | assertEquals(ocyc.pizzaGUID, res); 140 | } 141 | 142 | @Test 143 | public void knownTermTest1() { 144 | boolean res = ocyc.knownTerm("Yangtze_River"); 145 | assertTrue(res); 146 | } 147 | 148 | @Test 149 | public void knownTermTest1b() { 150 | // Tests whether terms starting with "the " like "the Yangtze River" are 151 | // also being 
added without the "the " 152 | boolean res = ocyc.knownTerm("Yangtze River"); 153 | assertTrue(res); 154 | } 155 | 156 | @Test 157 | public void knownTermTest2() { 158 | boolean res = ocyc.knownTerm("the Yangtze"); 159 | assertTrue(res); 160 | } 161 | 162 | @Test 163 | public void knownTermTest3() { 164 | boolean res = ocyc.knownTerm("rivers"); 165 | assertTrue(res); 166 | 167 | } 168 | 169 | @Test 170 | public void knownTermTest4() { 171 | boolean res = ocyc.knownTerm("Hubble_Space_Telescope"); 172 | assertTrue(res); 173 | 174 | } 175 | @Test 176 | public void stringsForBirdConceptTest() { 177 | String res = ocyc.labelsForConcept("http://sw.opencyc.org/concept/Mx4rvVi8SJwpEbGdrcN5Y29ycA"); 178 | assertEquals("Birding|bird|Birds|Birder|Aves|birds|fowl", res); 179 | } 180 | 181 | @Test 182 | public void stringsForConceptTest1() { 183 | String res = ocyc.labelsForConcept("http://sw.opencyc.org/concept/Mx4rvVj5qJwpEbGdrcN5Y29ycA"); 184 | assertEquals("Chang Jiang|the Yangtze River|Yangtze|Chang Jiang River|the Yangtze|the Chang Jiang|Yangtze River|the Chang Jiang River", res); 185 | } 186 | 187 | // @Test 188 | // public void testConceptMap() throws IOException { 189 | // Word2VecSpace sp = GoogleNewsW2VSpace.get(); 190 | // Set yesses = new HashSet<>(); 191 | // Set allTerms = new HashSet<>(); 192 | // Iterables.concat( 193 | // ocyc.ocycConceptForTermPrettyString.keySet(), 194 | // ocyc.ocycConceptForTermLabel.keySet(), 195 | // ocyc.ocycConceptForTermLower.keySet()).forEach(lit -> { 196 | // if (sp.knownTerm(lit)) { 197 | // yesses.add(lit); 198 | // } 199 | // allTerms.add(lit); 200 | // 201 | // }); 202 | // System.out.println("Term strings for ocyc contained in W2V knownterm test:"); 203 | // System.out.println("\tYes:" + yesses.size()); 204 | // System.out.println("\t No:" + (allTerms.size() - yesses.size())); 205 | // System.out.println("\tAll:" + allTerms.size()); 206 | // // System.out.println("Yesses: \n" + String.join(", ", yesses)); 207 | // // 
System.out.println("Nos: \n" + String.join("; ", allTerms)); 208 | // assertEquals(67532, yesses.size()); 209 | // assertEquals(886523, allTerms.size()); 210 | // } 211 | } 212 | 213 | -------------------------------------------------------------------------------- /OwlTools/src/test/java/com/cyc/tool/owltools/OpenCycOwlIT.java~: -------------------------------------------------------------------------------- 1 | package com.cyc.tool.owltools; 2 | 3 | /* 4 | * #%L 5 | * OwlTools 6 | * %% 7 | * Copyright (C) 2015 Cycorp, Inc 8 | * %% 9 | * Licensed under the Apache License, Version 2.0 (the "License"); 10 | * you may not use this file except in compliance with the License. 11 | * You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * #L% 21 | */ 22 | 23 | import com.cyc.tool.distributedrepresentations.GoogleNewsW2VSpace; 24 | import com.cyc.tool.distributedrepresentations.Word2VecSpace; 25 | import com.google.common.collect.Iterables; 26 | import java.io.IOException; 27 | import java.util.HashSet; 28 | import java.util.Set; 29 | import org.junit.AfterClass; 30 | import static org.junit.Assert.assertEquals; 31 | import static org.junit.Assert.assertTrue; 32 | import org.junit.BeforeClass; 33 | import org.junit.Test; 34 | import org.semanticweb.owlapi.model.OWLOntologyCreationException; 35 | 36 | /** 37 | * Tests for OpenCycOwl. 
38 | */ 39 | public class OpenCycOwlIT { 40 | 41 | static OpenCycOwl ocyc; 42 | 43 | public OpenCycOwlIT() { 44 | } 45 | 46 | @BeforeClass 47 | public static void setUpClass() throws IOException, OWLOntologyCreationException { 48 | ocyc = new OpenCycOwl(); 49 | 50 | } 51 | 52 | @AfterClass 53 | public static void tearDownClass() { 54 | // Remove the ontology from the manager 55 | ocyc.close(); 56 | } 57 | 58 | @Test 59 | public void conceptForTest1() { 60 | Set res = ocyc.conceptsFor("the Yangtze"); 61 | assertEquals(1, res.size()); 62 | assertTrue(res.contains("http://sw.opencyc.org/concept/Mx4rvVj5qJwpEbGdrcN5Y29ycA")); 63 | } 64 | 65 | @Test 66 | public void conceptsForBirdTest1() { 67 | Set res = ocyc.conceptsFor("Bird"); 68 | System.out.println("HEY Bird "+res); 69 | assertEquals(2,res.size()); 70 | assertTrue(res.contains("http://sw.opencyc.org/concept/Mx4rvVi8SJwpEbGdrcN5Y29ycA")); 71 | } 72 | 73 | @Test 74 | public void conceptsForBirdTest2() { 75 | Set res = ocyc.conceptsFor("bird"); 76 | System.out.println("HEY bird "+res); 77 | assertEquals(2, res.size()); 78 | assertTrue(res.contains("http://sw.opencyc.org/concept/Mx4rvVi8SJwpEbGdrcN5Y29ycA")); 79 | } 80 | 81 | @Test 82 | public void conceptsWithTermsTest() throws IOException { 83 | Set res = ocyc.conceptsWithW2VTerms(); 84 | assertEquals(49616, res.size()); 85 | } 86 | 87 | @Test 88 | public void getLabelsTest() { 89 | int res = ocyc.ocycConceptForTermLabel.size(); 90 | //ocyc.ocycConceptForTermLabel.keySet().forEach(s->{ 91 | // System.out.println(s+"\t"+ocyc.ocycConceptForTermLabel.get(s)); 92 | // }); 93 | System.out.println("N RDFS Labels with concepts:" + res); 94 | assertEquals(240258, res); 95 | } 96 | 97 | @Test 98 | public void getLowerCaseStringTest() { 99 | int res = ocyc.ocycConceptForTermLower.size(); 100 | System.out.println("N downcased pretty strings or labels with concepts:" + res); 101 | assertEquals(576678, res); 102 | } 103 | 104 | @Test 105 | public void getNConceptsTest() { 106 | 
int res = ocyc.size(); 107 | System.out.println("N Classes:" + res); 108 | assertEquals(116842, res); 109 | } 110 | 111 | @Test 112 | public void getPrettyStringTest() { 113 | int res = ocyc.ocycConceptForTermPrettyString.size(); 114 | System.out.println("N pretty Strings with concepts:" + res); 115 | assertEquals(345298, res); 116 | } 117 | 118 | @Test 119 | public void getTypesTest() throws IOException { 120 | Set res = ocyc.getTypes(ocyc.pizzaGUID); 121 | res.forEach(s -> { 122 | System.out.println("Pizza: " + ocyc.guidToURLString(s)); 123 | }); 124 | assertEquals(4, res.size()); 125 | } 126 | 127 | @Test 128 | public void getTypesTransitiveTest() throws IOException { 129 | Set res = ocyc.getTypesTransitive(ocyc.pizzaGUID); 130 | res.forEach(s -> { 131 | System.out.println("Pizza: " + ocyc.guidToURLString(s)); 132 | }); 133 | assertEquals(62, res.size()); 134 | } 135 | 136 | @Test 137 | public void guidFromURLStringTest() { 138 | String res = ocyc.guidFromURLString(ocyc.guidToURLString(ocyc.pizzaGUID)); 139 | assertEquals(ocyc.pizzaGUID, res); 140 | } 141 | 142 | @Test 143 | public void knownTermTest1() { 144 | boolean res = ocyc.knownTerm("Yangtze_River"); 145 | assertTrue(res); 146 | } 147 | 148 | @Test 149 | public void knownTermTest1b() { 150 | // Tests whether terms starting with "the " like "the Yangtze River" are 151 | // also being added without the "the " 152 | boolean res = ocyc.knownTerm("Yangtze River"); 153 | assertTrue(res); 154 | } 155 | 156 | @Test 157 | public void knownTermTest2() { 158 | boolean res = ocyc.knownTerm("the Yangtze"); 159 | assertTrue(res); 160 | } 161 | 162 | @Test 163 | public void knownTermTest3() { 164 | boolean res = ocyc.knownTerm("rivers"); 165 | assertTrue(res); 166 | 167 | } 168 | 169 | @Test 170 | public void knownTermTest4() { 171 | boolean res = ocyc.knownTerm("Hubble_Space_Telescope"); 172 | assertTrue(res); 173 | 174 | } 175 | @Test 176 | public void stringsForBirdConceptTest() { 177 | String res = 
ocyc.labelsForConcept("http://sw.opencyc.org/concept/Mx4rvVi8SJwpEbGdrcN5Y29ycA"); 178 | assertEquals("Birding|bird|Birds|Birder|Aves|birds|fowl", res); 179 | } 180 | 181 | @Test 182 | public void stringsForConceptTest1() { 183 | String res = ocyc.labelsForConcept("http://sw.opencyc.org/concept/Mx4rvVj5qJwpEbGdrcN5Y29ycA"); 184 | assertEquals("Chang Jiang|the Yangtze River|Yangtze|Chang Jiang River|the Yangtze|the Chang Jiang|Yangtze River|the Chang Jiang River", res); 185 | } 186 | 187 | @Test 188 | public void testConceptMap() throws IOException { 189 | Word2VecSpace sp = GoogleNewsW2VSpace.get(); 190 | Set yesses = new HashSet<>(); 191 | Set allTerms = new HashSet<>(); 192 | Iterables.concat( 193 | ocyc.ocycConceptForTermPrettyString.keySet(), 194 | ocyc.ocycConceptForTermLabel.keySet(), 195 | ocyc.ocycConceptForTermLower.keySet()).forEach(lit -> { 196 | if (sp.knownTerm(lit)) { 197 | yesses.add(lit); 198 | } 199 | allTerms.add(lit); 200 | 201 | }); 202 | System.out.println("Term strings for ocyc contained in W2V knownterm test:"); 203 | System.out.println("\tYes:" + yesses.size()); 204 | System.out.println("\t No:" + (allTerms.size() - yesses.size())); 205 | System.out.println("\tAll:" + allTerms.size()); 206 | // System.out.println("Yesses: \n" + String.join(", ", yesses)); 207 | // System.out.println("Nos: \n" + String.join("; ", allTerms)); 208 | assertEquals(67532, yesses.size()); 209 | assertEquals(886523, allTerms.size()); 210 | } 211 | } 212 | -------------------------------------------------------------------------------- /OwlTools/src/test/java/com/cyc/tool/owltools/OpenCycReasonerIT.java: -------------------------------------------------------------------------------- 1 | package com.cyc.tool.owltools; 2 | 3 | /* 4 | * #%L 5 | * OwlTools 6 | * %% 7 | * Copyright (C) 2015 Cycorp, Inc 8 | * %% 9 | * Licensed under the Apache License, Version 2.0 (the "License"); 10 | * you may not use this file except in compliance with the License. 
11 | * You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | * #L% 21 | */ 22 | 23 | import java.util.List; 24 | import java.util.Set; 25 | import org.junit.After; 26 | import org.junit.AfterClass; 27 | import static org.junit.Assert.assertEquals; 28 | import static org.junit.Assert.assertTrue; 29 | import org.junit.Before; 30 | import org.junit.BeforeClass; 31 | import org.junit.Test; 32 | import org.semanticweb.owlapi.model.OWLClass; 33 | import org.semanticweb.owlapi.model.OWLOntologyCreationException; 34 | 35 | /** 36 | * Tests for OpenCycReasoner. 37 | */ 38 | public class OpenCycReasonerIT { 39 | 40 | public OpenCycReasonerIT() { 41 | } 42 | 43 | @BeforeClass 44 | public static void setUpClass() { 45 | } 46 | 47 | @AfterClass 48 | public static void tearDownClass() { 49 | } 50 | 51 | @Before 52 | public void setUp() { 53 | } 54 | 55 | @After 56 | public void tearDown() { 57 | } 58 | 59 | /** 60 | * Test of get method, of class OpenCycReasoner. 61 | * @throws java.lang.Exception 62 | */ 63 | @Test 64 | public void testGet() throws Exception { 65 | System.out.println("get"); 66 | OpenCycReasoner result = OpenCycReasoner.get(); 67 | assertTrue(result != null); 68 | } 69 | 70 | /** 71 | * Test of getAllClasses method, of class OpenCycReasoner. 
72 | * @throws org.semanticweb.owlapi.model.OWLOntologyCreationException 73 | */ 74 | @Test 75 | public void testGetAllClasses() throws OWLOntologyCreationException { 76 | System.out.println("getAllClasses"); 77 | OpenCycReasoner instance = OpenCycReasoner.get(); 78 | int expResultSize = 116842; 79 | Set result = instance.getAllClasses(); 80 | assertEquals(expResultSize, result.size()); 81 | } 82 | 83 | /** 84 | * Test of getAllIRIs method, of class OpenCycReasoner. 85 | * @throws org.semanticweb.owlapi.model.OWLOntologyCreationException 86 | */ 87 | @Test 88 | public void testGetAllIRIs() throws OWLOntologyCreationException { 89 | System.out.println("getAllIRIs"); 90 | OpenCycReasoner instance = OpenCycReasoner.get(); 91 | int expResult = 116842; 92 | List result = instance.getAllIRIs(); 93 | assertEquals(expResult, result.size()); 94 | } 95 | 96 | } 97 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | DistributedRepresentations and ConceptFinder 3 | ============================================ 4 | 5 | Version 1.0 6 | 7 | Included 8 | -------- 9 | 10 | Projects: 11 | * DistributedRepresentationsParent - The parent pom for the other projects 12 | * OwlTools - Classes for accessing the OpenCyc OWL export 13 | * CycMapDBTools - Some configuration defaults for the other projects 14 | * DistributedRepresentations - Project to access Word2Vec sources 15 | * ConceptFinder - Methods to find nearby concepts in the Word2Vec space 16 | 17 | Other Files: 18 | * Download the distributedRepresentationsDownloads file [here](https://drive.google.com/file/d/0B95A6Z6CLEXibm1DYnBzN3NkZEU/view?usp=sharing) and then extract the following: 19 | * GoogleNews-vectors-negative300.bin.gz - The GoogleNews Word2Vec Space 20 | * BioASQ - The Word2Vec space developed by BioASQ and trained on Pubmed sources 21 | * owl-export-unversioned.owl - The OpenCyc 
export 22 | * This README file 23 | 24 | Requirements 25 | ------------ 26 | 27 | * These projects require Java 1.8. 28 | * **_This code has not yet been tested on Windows._** 29 | 30 | Description and Usage 31 | --------------------- 32 | 33 | The projects in this repository constitute a library for accessing Word2Vec content and searching in that space. 34 | The OwlTools project provides access to OpenCyc concepts that can be 35 | mapped into the space. These mapped OpenCyc concepts can be viewed using the Taxonomy Viewer, located in the KBTaxonomy repository, which uses the Distributed Representations libraries to allow users to find OpenCyc concepts by way of nearest term search in the Word2Vec space. 36 | 37 | At present, the library supports two sources: 38 | 39 | 1. The word2vec space produced by Google by training on 10^11 words of news. - (https://code.google.com/p/word2vec/) 40 | 2. The word2vec space produced by BioASQ by training on pubmed. - (http://bioasq.org/news/bioasq-releases-continuous-space-word-vectors-obtained-applying-word2vec-pubmed-abstracts) 41 | 42 | To use these libraries, you will need to update some file paths to your local system as follows: 43 | 44 | In DistributedRepresentations: 45 | 46 | 1. `GoogleNewsW2VSpace.java` 47 | * Modify the `w2vfile` variable so that it points to where you save the GoogleNews-vectors archive included in this repository 48 | 2. `BiologyW2VSpace.java` 49 | * Modify the `filebase` variable to where you save the BioASQ directory word2vecTools subdirectory 50 | 3. `Config.java` 51 | * Modify the `fallBackLocation` variable to a directory in your file system, ideally, where you saved the GoogleNews archive 52 | 53 | In ConceptFinder: 54 | 55 | 1. `ConceptFinderConfig.java` 56 | * Modify the `fallBackLocation` variable to a directory in your file system, ideally, where you saved the GoogleNews archive 57 | * Modify the `w2vVectorFile` variable accordingly 58 | 59 | In OwlTools: 60 | 61 | 1. 
`OwlToolsConfig.java` 62 | * Modify the `ocycLocation` variable to match the location where you save the OpenCyc export file, `owl-export-unversioned.owl` 63 | * Modify the `fallBackDBLocation` variable to match the location you gave in `ConceptFinderConfig.java` 64 | 65 | To install the libraries to your local Maven repository, simply install the DistributedRepresentationsParent project. This will install all four of its children to your local Maven repository. To confirm that everything is working properly, run the integration tests in each of the projects. Note that some tests may take a long time (on the order of several hours) to run the first time, but should be faster in subsequent runs. The `Word2VecSpaceIT.java` test, in particular, will be setting up the Google News space on your local system, so it needs to run through all of the concepts in the space. This is a one-time operation though, so you should not have to perform this set up step again. 66 | 67 | IMPORTANT: If something goes wrong during the MapDB set-up operations, which get kicked off by running the integration tests in these projects, you may need to remove the MapDB directory and start again. This sometimes happens if the set-up process is interrupted before it has completed. 
68 | -------------------------------------------------------------------------------- /distributedRepresentationsParent/pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | com.cyc.tool 5 | distributedRepresentationsParent 6 | 1.0 7 | pom 8 | 9 | UTF-8 10 | 1.7 11 | 1.7 12 | 13 | 14 | 15 | Cycorp, Inc 16 | http://www.cyc.com 17 | 18 | 19 | 2015 20 | 21 | 22 | ../CycMapDBTools 23 | ../OwlTools 24 | ../DistributedRepresentations 25 | ../ConceptFinder 26 | 27 | 28 | 29 | 30 | cyc.com 31 | file:///cyc/java/maven-repo/ 32 | 33 | true 34 | always 35 | 36 | 37 | true 38 | always 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | org.codehaus.mojo 48 | license-maven-plugin 49 | 1.7 50 | 51 | false 52 | false 53 | 54 | 55 | 56 | first 57 | 58 | update-file-header 59 | 60 | process-sources 61 | 62 | apache_v2 63 | 64 | src/main/java 65 | src/test 66 | 67 | 68 | 69 | 70 | 71 | 72 | org.apache.maven.plugins 73 | maven-javadoc-plugin 74 | 2.9.1 75 | 76 | 82 | -Xdoclint:none 83 | 84 | 85 | todo 86 | X 87 | 88 | 89 | true 90 | 91 | 92 | 93 | attach-javadocs 94 | 95 | jar 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | junit 108 | junit 109 | 4.11 110 | test 111 | 112 | 113 | 114 | 115 | com.google.code.gson 116 | gson 117 | 2.1 118 | jar 119 | 120 | 121 | com.googlecode.json-simple 122 | json-simple 123 | 1.1.1 124 | 125 | 126 | 127 | 128 | org.apache.commons 129 | commons-lang3 130 | 3.1 131 | 132 | 133 | commons-io 134 | commons-io 135 | 2.4 136 | 137 | 138 | commons-cli 139 | commons-cli 140 | 1.2 141 | 142 | 143 | 144 | org.apache.opennlp 145 | opennlp-tools 146 | 1.5.3 147 | 148 | 149 | 150 | org.slf4j 151 | slf4j-api 152 | 1.7.5 153 | 154 | 155 | org.slf4j 156 | slf4j-simple 157 | 1.7.5 158 | test 159 | 160 | 161 | 162 | com.cyc.tool 163 | CycMapDBTools 164 | 1.0 165 | jar 166 | 167 | 168 | com.cyc.tool 169 | OwlTools 170 | 1.0 171 | 172 | 173 | com.cyc.tool 174 | DistributedRepresentations 175 | 1.0 
176 | jar 177 | 178 | 179 | 180 | 181 | 182 | 183 | junit 184 | junit 185 | test 186 | 187 | 188 | 189 | 190 | 191 | -------------------------------------------------------------------------------- /distributedRepresentationsParent/pom.xml~: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | com.cyc.tool 5 | distributedRepresentationsParent 6 | 1.0 7 | pom 8 | 9 | UTF-8 10 | 1.7 11 | 1.7 12 | 13 | 14 | 15 | Cycorp, Inc 16 | http://www.cyc.com 17 | 18 | 19 | 2015 20 | 21 | 22 | ../CycMapDBTools 23 | ../DistributedRepresentations 24 | ../ConceptFinder 25 | 26 | 27 | 28 | 29 | cyc.com 30 | file:///cyc/java/maven-repo/ 31 | 32 | true 33 | always 34 | 35 | 36 | true 37 | always 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | org.codehaus.mojo 47 | license-maven-plugin 48 | 1.7 49 | 50 | false 51 | false 52 | 53 | 54 | 55 | first 56 | 57 | update-file-header 58 | 59 | process-sources 60 | 61 | apache_v2 62 | 63 | src/main/java 64 | src/test 65 | 66 | 67 | 68 | 69 | 70 | 71 | org.apache.maven.plugins 72 | maven-javadoc-plugin 73 | 2.9.1 74 | 75 | 81 | -Xdoclint:none 82 | 83 | 84 | todo 85 | X 86 | 87 | 88 | true 89 | 90 | 91 | 92 | attach-javadocs 93 | 94 | jar 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | junit 107 | junit 108 | 4.11 109 | test 110 | 111 | 112 | 113 | 114 | com.google.code.gson 115 | gson 116 | 2.1 117 | jar 118 | 119 | 120 | com.googlecode.json-simple 121 | json-simple 122 | 1.1.1 123 | 124 | 125 | 126 | 127 | org.apache.commons 128 | commons-lang3 129 | 3.1 130 | 131 | 132 | commons-io 133 | commons-io 134 | 2.4 135 | 136 | 137 | commons-cli 138 | commons-cli 139 | 1.2 140 | 141 | 142 | 143 | org.apache.opennlp 144 | opennlp-tools 145 | 1.5.3 146 | 147 | 148 | 149 | org.slf4j 150 | slf4j-api 151 | 1.7.5 152 | 153 | 154 | org.slf4j 155 | slf4j-simple 156 | 1.7.5 157 | test 158 | 159 | 160 | 161 | com.cyc.tool 162 | CycMapDBTools 163 | 1.0 164 | jar 165 | 166 | 167 | com.cyc.tool 168 | 
OwlTools 169 | 1.0 170 | 171 | 172 | com.cyc.tool 173 | DistributedRepresentations 174 | 1.0 175 | jar 176 | 177 | 178 | 179 | 180 | 181 | 182 | junit 183 | junit 184 | test 185 | 186 | 187 | 188 | 189 | --------------------------------------------------------------------------------