├── .gitignore
├── ConceptFinder
├── nbactions.xml
├── src
│ ├── main
│ │ └── java
│ │ │ └── com
│ │ │ └── cyc
│ │ │ └── tool
│ │ │ └── conceptfinder
│ │ │ ├── DefaultConceptFinderConfig.java
│ │ │ ├── ConceptFinderConfig.java
│ │ │ ├── ConceptMatch.java
│ │ │ ├── AttachmentHypothesis.java
│ │ │ ├── MissingConceptFinderDefault.java
│ │ │ ├── MissingConceptFinder.java
│ │ │ └── ConceptSpace.java
│ └── test
│ │ └── java
│ │ └── com
│ │ └── cyc
│ │ └── tool
│ │ └── conceptfinder
│ │ ├── ConceptSpaceIT.java
│ │ └── MissingConceptFinderIT.java
└── pom.xml
├── DistributedRepresentations
├── nbactions.xml
├── src
│ ├── main
│ │ └── java
│ │ │ └── com
│ │ │ └── cyc
│ │ │ └── tool
│ │ │ └── distributedrepresentations
│ │ │ ├── Config.java
│ │ │ ├── BiologyW2VOpenCycSubspace.java
│ │ │ ├── GoogleNewsW2VOpenCycSubspace.java
│ │ │ ├── GoogleNewsW2VSpace.java
│ │ │ ├── Word2VecSubspace.java
│ │ │ ├── BiologyW2VSpace.java
│ │ │ ├── Word2VecSpaceFromFile.java
│ │ │ └── Word2VecSpace.java
│ └── test
│ │ └── java
│ │ └── com
│ │ └── cyc
│ │ └── tool
│ │ └── distributedrepresentations
│ │ ├── BiologyW2VSpaceIT.java
│ │ └── Word2VecSpaceIT.java
├── pom.xml
└── pom.xml~
├── CycMapDBTools
├── pom.xml
├── pom.xml~
└── src
│ └── main
│ └── java
│ └── com
│ └── cyc
│ └── tool
│ └── MapDBConfiguration.java
├── OwlTools
├── pom.xml
├── pom.xml~
└── src
│ ├── main
│ └── java
│ │ └── com
│ │ └── cyc
│ │ └── tool
│ │ └── owltools
│ │ ├── OwlToolsConfig.java
│ │ ├── OpenCycReasoner.java
│ │ ├── OpenCycContent.java
│ │ └── OpenCycOwl.java
│ └── test
│ └── java
│ └── com
│ └── cyc
│ └── tool
│ └── owltools
│ ├── OpenCycReasonerIT.java
│ ├── OpenCycContentIT.java
│ ├── OpenCycOwlIT.java~
│ └── OpenCycOwlIT.java
├── README.md
└── distributedRepresentationsParent
├── pom.xml~
└── pom.xml
/.gitignore:
--------------------------------------------------------------------------------
1 | target/
2 | pom.xml.tag
3 | pom.xml.releaseBackup
4 | pom.xml.versionsBackup
5 | pom.xml.next
6 | release.properties
7 | dependency-reduced-pom.xml
8 | buildNumber.properties
9 |
--------------------------------------------------------------------------------
/ConceptFinder/nbactions.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | CUSTOM-skiptests
5 | skiptests
6 |
7 | clean
8 | install
9 |
10 |
11 | true
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/DistributedRepresentations/nbactions.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | CUSTOM-skiptests
5 | skiptests
6 |
7 | clean
8 | install
9 |
10 |
11 | true
12 |
13 |
14 |
15 |
--------------------------------------------------------------------------------
/CycMapDBTools/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | 4.0.0
4 | com.cyc.tool
5 | CycMapDBTools
6 | jar
7 |
8 | UTF-8
9 | 1.8
10 | 1.8
11 |
12 |
13 |
14 | com.cyc.tool
15 | distributedRepresentationsParent
16 | 1.0
17 | ../distributedRepresentationsParent
18 |
19 |
20 |
21 |
22 |
23 | org.codehaus.mojo
24 | license-maven-plugin
25 |
26 |
27 | org.apache.maven.plugins
28 | maven-javadoc-plugin
29 |
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/CycMapDBTools/pom.xml~:
--------------------------------------------------------------------------------
1 |
2 |
3 | 4.0.0
4 | com.cyc.tool
5 | CycMapDBTools
6 | 0.0.1-SNAPSHOT
7 | jar
8 |
9 | UTF-8
10 | 1.8
11 | 1.8
12 |
13 |
14 |
15 | com.cyc.project.kbtaxonomy
16 | KBTaxonomyParent
17 | 0.0.1-SNAPSHOT
18 | ../KBTaxonomyParent
19 |
20 |
21 |
22 |
23 |
24 | org.codehaus.mojo
25 | license-maven-plugin
26 |
27 |
28 | org.apache.maven.plugins
29 | maven-javadoc-plugin
30 |
31 |
32 |
33 |
--------------------------------------------------------------------------------
/DistributedRepresentations/src/main/java/com/cyc/tool/distributedrepresentations/Config.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.distributedrepresentations;
2 |
3 | /*
4 | * #%L
5 | * DistributedRepresentations
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import com.cyc.tool.MapDBConfiguration;
24 |
25 | /**
26 | *
27 | * Config provides default locations for the DistributedRepresentations project.
28 | */
29 | public class Config extends MapDBConfiguration {
30 |
31 | private static final String fallBackDBLocation = "/cyc/projects/kbTaxonomy/Experiments/ConceptFinder/";
32 |
33 | private static final String w2vDBFile = "/w2vdb";
34 |
35 | /**
36 | *
37 | * @return W2VDB file location
38 | */
39 | protected static String getW2vDBFile() {
40 | return getMapDBBase(fallBackDBLocation) + w2vDBFile;
41 | }
42 |
43 | }
44 |
--------------------------------------------------------------------------------
/DistributedRepresentations/src/test/java/com/cyc/tool/distributedrepresentations/BiologyW2VSpaceIT.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.distributedrepresentations;
2 |
3 | /*
4 | * #%L
5 | * DistributedRepresentations
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import static org.junit.Assert.assertEquals;
24 | import static org.junit.Assert.assertTrue;
25 | import org.junit.Test;
26 |
27 | /**
28 | * Tests for BiologyW2VSpace.
29 | */
30 | public class BiologyW2VSpaceIT {
31 |
32 | public BiologyW2VSpaceIT() {
33 | }
34 |
35 | @Test
36 | public void testGet() {
37 | System.out.println("get");
38 |
39 | BiologyW2VSpace result = BiologyW2VSpace.get();
40 | assertTrue(result != null);
41 | }
42 |
43 | @Test
44 | public void testNumberOfVectors() {
45 | System.out.println("getNVectors");
46 |
47 | int result = BiologyW2VSpace.get().getNVectors();
48 |
49 | assertEquals(result, 1701632);
50 | }
51 |
52 | }
53 |
--------------------------------------------------------------------------------
/ConceptFinder/src/main/java/com/cyc/tool/conceptfinder/DefaultConceptFinderConfig.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.conceptfinder;
2 |
3 | /*
4 | * #%L
5 | * ConceptFinder
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | /**
24 | * Default configuration for ConceptFinder.
25 | */
26 | public class DefaultConceptFinderConfig extends ConceptFinderConfig {
27 |
28 | private static final String conceptsForMissingTermsNameDefault = "missingTermConceptsDefault";
29 | private static final String missingTermMapNameDefault = "missingTermsDefault";
30 |
31 | /**
32 | *
33 | * @return the conceptsForMissingTermsNameDefault
34 | */
35 | protected static String getConceptsForMissingTermsName() {
36 | return conceptsForMissingTermsNameDefault;
37 | }
38 |
39 | /**
40 | *
41 | * @return the missingTermMapNameDefault
42 | */
43 | protected static String getMissingTermMapName() {
44 | return missingTermMapNameDefault;
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/OwlTools/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | 4.0.0
4 | com.cyc.tool
5 | OwlTools
6 | jar
7 |
8 |
9 | com.cyc.tool
10 | distributedRepresentationsParent
11 | 1.0
12 | ../distributedRepresentationsParent
13 |
14 |
15 |
16 |
17 |
18 | org.apache.maven.plugins
19 | maven-compiler-plugin
20 | 2.3.2
21 |
22 | 1.8
23 | 1.8
24 |
25 |
26 |
27 | org.codehaus.mojo
28 | license-maven-plugin
29 |
30 |
31 | org.apache.maven.plugins
32 | maven-javadoc-plugin
33 |
34 |
35 |
36 |
37 |
38 | net.sourceforge.owlapi
39 | owlapi-distribution
40 | 4.0.1
41 | jar
42 |
43 |
44 | org.mapdb
45 | mapdb
46 | 1.0.6
47 | jar
48 |
49 |
50 | com.cyc.tool
51 | CycMapDBTools
52 | jar
53 |
54 |
55 |
56 | UTF-8
57 | 1.8
58 | 1.8
59 |
60 |
61 |
62 |
--------------------------------------------------------------------------------
/DistributedRepresentations/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | 4.0.0
4 | com.cyc.tool
5 | DistributedRepresentations
6 | jar
7 |
8 |
9 | com.cyc.tool
10 | distributedRepresentationsParent
11 | 1.0
12 | ../distributedRepresentationsParent
13 |
14 |
15 |
16 |
17 |
18 | org.codehaus.mojo
19 | license-maven-plugin
20 |
21 |
22 | org.apache.maven.plugins
23 | maven-javadoc-plugin
24 |
25 |
26 |
27 |
28 |
29 |
30 | net.sourceforge.owlapi
31 | owlapi-distribution
32 | 4.0.1
33 | jar
34 |
35 |
36 | org.mapdb
37 | mapdb
38 | 1.0.6
39 | jar
40 |
41 |
42 | junit
43 | junit
44 | test
45 |
46 |
47 | com.cyc.tool
48 | CycMapDBTools
49 | jar
50 |
51 |
52 |
53 | com.cyc.tool
54 | OwlTools
55 |
56 |
57 |
58 |
59 | UTF-8
60 | 1.8
61 | 1.8
62 |
63 |
64 |
--------------------------------------------------------------------------------
/ConceptFinder/src/main/java/com/cyc/tool/conceptfinder/ConceptFinderConfig.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.conceptfinder;
2 |
3 | /*
4 | * #%L
5 | * ConceptFinder
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import com.cyc.tool.MapDBConfiguration;
24 |
25 | /**
26 | *
27 | * ConceptFinderConfig is designed to set paths for caching and data access for this package.
28 | */
29 | public class ConceptFinderConfig extends MapDBConfiguration {
30 |
31 | private static final String fallBackLocation = "/cyc/projects/kbTaxonomy/Experiments/ConceptFinder/";
32 | private static final String missingConceptDBFile = "/missingConcept";
33 |
34 | private static final String w2vDBFile = "/w2vdb";
35 | private static final String w2vVectorFile = "/cyc/projects/kbTaxonomy/Experiments/ConceptFinder/GoogleNews-vectors-negative300.bin.gz";
36 | private static final String word2VecVectorsMapName = "word2Vec";
37 |
38 | /**
39 | *
40 | * @return the missingConceptDBFile location
41 | */
42 | protected static String getMissingConceptDBFile() {
43 | return getMapDBBase(fallBackLocation) + missingConceptDBFile;
44 | }
45 |
46 | /**
47 | *
48 | * @return the w2vVectorFile
49 | */
50 | protected static String getW2VVectorfile() {
51 | return w2vVectorFile;
52 | }
53 |
54 | /**
55 | *
56 | * @return the w2vDBFile location
57 | */
58 | protected static String getW2vDBFile() {
59 | return getMapDBBase(fallBackLocation) + w2vDBFile;
60 | }
61 |
62 | /**
63 | *
64 | * @return the word2VecVectorsMapName
65 | */
66 | protected static String getWord2VecVectorsMapName() {
67 | return word2VecVectorsMapName;
68 | }
69 |
70 | }
71 |
--------------------------------------------------------------------------------
/DistributedRepresentations/pom.xml~:
--------------------------------------------------------------------------------
1 |
2 |
3 | 4.0.0
4 | com.cyc.tool
5 | DistributedRepresentations
6 | 0.0.1-SNAPSHOT
7 | jar
8 |
9 |
10 | com.cyc.project.kbtaxonomy
11 | KBTaxonomyParent
12 | 0.0.1-SNAPSHOT
13 | ../KBTaxonomyParent
14 |
15 |
16 |
17 |
18 |
19 | org.codehaus.mojo
20 | license-maven-plugin
21 |
22 |
23 | org.apache.maven.plugins
24 | maven-javadoc-plugin
25 |
26 |
27 |
28 |
29 |
30 |
31 | net.sourceforge.owlapi
32 | owlapi-distribution
33 | 4.0.1
34 | jar
35 |
36 |
37 | org.mapdb
38 | mapdb
39 | 1.0.6
40 | jar
41 |
42 |
43 | junit
44 | junit
45 | 4.10
46 | test
47 |
48 |
49 | com.cyc.tool
50 | CycMapDBTools
51 | 0.0.1-SNAPSHOT
52 | jar
53 |
54 |
55 |
56 | com.cyc.tool
57 | OwlTools
58 | 0.0.1-SNAPSHOT
59 |
60 |
61 |
62 |
63 |
64 | UTF-8
65 | 1.8
66 | 1.8
67 |
68 |
--------------------------------------------------------------------------------
/ConceptFinder/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | 4.0.0
4 | com.cyc.tool
5 | ConceptFinder
6 | jar
7 |
8 |
9 | com.cyc.tool
10 | distributedRepresentationsParent
11 | 1.0
12 | ../distributedRepresentationsParent
13 |
14 |
15 |
16 |
17 |
18 | org.codehaus.mojo
19 | license-maven-plugin
20 |
21 |
22 | org.apache.maven.plugins
23 | maven-javadoc-plugin
24 |
25 |
26 |
27 |
28 |
29 |
30 | org.mapdb
31 | mapdb
32 | 1.0.6
33 | jar
34 |
35 |
36 | net.sourceforge.owlapi
37 | owlapi-distribution
38 | 4.0.1
39 | jar
40 |
41 |
42 | com.cyc.tool
43 | OwlTools
44 | jar
45 |
46 |
47 | junit
48 | junit
49 | test
50 | jar
51 |
52 |
53 | com.cyc.tool
54 | DistributedRepresentations
55 | jar
56 |
57 |
58 | com.cyc.tool
59 | CycMapDBTools
60 | jar
61 |
62 |
63 |
64 |
65 | UTF-8
66 | 1.8
67 | 1.8
68 |
69 |
70 |
--------------------------------------------------------------------------------
/DistributedRepresentations/src/main/java/com/cyc/tool/distributedrepresentations/BiologyW2VOpenCycSubspace.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.distributedrepresentations;
2 |
3 | /*
4 | * #%L
5 | * DistributedRepresentations
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import com.cyc.tool.owltools.OpenCycOwl;
24 | import java.io.IOException;
25 | import java.util.logging.Level;
26 | import java.util.logging.Logger;
27 | import org.semanticweb.owlapi.model.OWLOntologyCreationException;
28 |
29 | /**
30 | *
31 | * BiologyW2VSpace filtered to only contain terms in Open Cyc.
32 | */
33 | public class BiologyW2VOpenCycSubspace extends Word2VecSubspace {
34 |
35 | static BiologyW2VOpenCycSubspace singleton;
36 |
37 | private BiologyW2VOpenCycSubspace(OpenCycOwl ocyc) throws IOException {
38 | super(BiologyW2VSpace.get(),
39 | m -> ocyc.knownTerm(m), getWord2VecVectorsMapName());
40 | }
41 |
42 | /**
43 | *
44 | * @return a WordToVecSubspace limited only to terms in OpenCyc
45 | */
46 | public static BiologyW2VOpenCycSubspace get() {
47 | if (singleton == null) {
48 | try {
49 | OpenCycOwl ocyc = new OpenCycOwl();
50 | singleton = new BiologyW2VOpenCycSubspace(ocyc);
51 | } catch (IOException ex) {
52 | Logger.getLogger(BiologyW2VSpace.class.getName()).log(Level.SEVERE, null, ex);
53 | throw new RuntimeException("Can't create the Biology W2VSpace object " + ex);
54 | } catch (OWLOntologyCreationException ex) {
55 | Logger.getLogger(BiologyW2VOpenCycSubspace.class.getName()).log(Level.SEVERE, null, ex);
56 | }
57 | }
58 | return singleton;
59 | }
60 |
61 | static String getWord2VecVectorsMapName() {
62 | return BiologyW2VOpenCycSubspace.class.getCanonicalName();
63 | }
64 |
65 | }
66 |
--------------------------------------------------------------------------------
/OwlTools/pom.xml~:
--------------------------------------------------------------------------------
1 |
2 |
3 | 4.0.0
4 | com.cyc.tool
5 | OwlTools
6 | 0.0.1-SNAPSHOT
7 | jar
8 |
9 |
10 | com.cyc.project.kbtaxonomy
11 | KBTaxonomyParent
12 | 0.0.1-SNAPSHOT
13 | ../KBTaxonomyParent
14 |
15 |
16 |
17 |
18 |
19 | org.apache.maven.plugins
20 | maven-compiler-plugin
21 | 2.3.2
22 |
23 | 1.8
24 | 1.8
25 |
26 |
27 |
28 | org.codehaus.mojo
29 | license-maven-plugin
30 |
31 |
32 | org.apache.maven.plugins
33 | maven-javadoc-plugin
34 |
35 |
36 |
37 |
38 |
39 | net.sourceforge.owlapi
40 | owlapi-distribution
41 | 4.0.1
42 | jar
43 |
44 |
45 | org.mapdb
46 | mapdb
47 | 1.0.6
48 | jar
49 |
50 |
51 | com.cyc.tool
52 | DistributedRepresentations
53 | 0.0.1-SNAPSHOT
54 | jar
55 |
56 |
57 | com.cyc.tool
58 | CycMapDBTools
59 | 0.0.1-SNAPSHOT
60 | jar
61 |
62 |
63 |
64 | UTF-8
65 | 1.8
66 | 1.8
67 |
68 |
69 |
--------------------------------------------------------------------------------
/ConceptFinder/src/main/java/com/cyc/tool/conceptfinder/ConceptMatch.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.conceptfinder;
2 |
3 | /*
4 | * #%L
5 | * ConceptFinder
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import com.cyc.tool.distributedrepresentations.Word2VecSpace;
24 | import java.io.Serializable;
25 | import java.util.function.Function;
26 |
27 | /**
28 | * A ConceptMatch relates a concept to a term.
29 | */
30 | public class ConceptMatch implements Serializable {
31 |
32 | final String concept;
33 |
34 | final double similarity;
35 | final String term;
36 |
37 | /**
38 | * ConceptMatch constructor
39 | *
40 | * @param w2v
41 | * @param search
42 | * @param term
43 | * @param noter
44 | */
45 | public ConceptMatch(Word2VecSpace w2v, float[] search, String term,
46 | Function noter) {
47 | this.term = term;
48 | if (noter == null) {
49 | this.concept = "---";
50 | } else {
51 | this.concept = noter.apply(term);
52 | }
53 | similarity = w2v.googleSimilarity(search, w2v.getVector(term));
54 | }
55 |
56 | /**
57 | * ConceptMatch constructor
58 | *
59 | * @param w2v
60 | * @param search
61 | * @param term
62 | */
63 | public ConceptMatch(Word2VecSpace w2v, float[] search, String term) {
64 | this(w2v, search, term, null);
65 | }
66 |
67 | /**
68 | *
69 | * @return the concept
70 | */
71 | public String getConcept() {
72 | return concept;
73 | }
74 |
75 | /**
76 | *
77 | * @return the similarity
78 | */
79 | public double getSimilarity() {
80 | return similarity;
81 | }
82 |
83 | /**
84 | *
85 | * @return the term
86 | */
87 | public String getTerm() {
88 | return term;
89 | }
90 |
91 | @Override
92 | public String toString() {
93 | return term + ": " + similarity + ": " + (concept == null ? "--" : concept);
94 | }
95 | }
96 |
--------------------------------------------------------------------------------
/DistributedRepresentations/src/main/java/com/cyc/tool/distributedrepresentations/GoogleNewsW2VOpenCycSubspace.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.distributedrepresentations;
2 |
3 | /*
4 | * #%L
5 | * DistributedRepresentations
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import com.cyc.tool.owltools.OpenCycOwl;
24 | import java.io.IOException;
25 | import java.util.logging.Level;
26 | import java.util.logging.Logger;
27 | import org.semanticweb.owlapi.model.OWLOntologyCreationException;
28 |
29 | /**
30 | * News Word2Vec Distributed representation filtered to only contain terms in Open Cyc.
31 | *
32 | *
33 | * Used for rapid searches of the space for open cyc terms
34 | */
35 | public class GoogleNewsW2VOpenCycSubspace extends Word2VecSubspace {
36 |
37 | static GoogleNewsW2VOpenCycSubspace singleton;
38 |
39 | private GoogleNewsW2VOpenCycSubspace(OpenCycOwl ocyc) throws IOException {
40 | super(GoogleNewsW2VSpace.get(),
41 | m -> ocyc.knownTerm(m), getWord2VecVectorsMapName());
42 | }
43 |
44 | /**
45 | *
46 | * @return a WordToVecSubspace limited only to terms in OpenCyc
47 | */
48 | public static GoogleNewsW2VOpenCycSubspace get() {
49 | if (singleton == null) {
50 | try {
51 | OpenCycOwl ocyc = new OpenCycOwl();
52 | singleton = new GoogleNewsW2VOpenCycSubspace(ocyc);
53 | } catch (IOException ex) {
54 | Logger.getLogger(GoogleNewsW2VSpace.class.getName()).log(Level.SEVERE, null, ex);
55 | throw new RuntimeException("Can't create the Google News W2VSpace object " + ex);
56 | } catch (OWLOntologyCreationException ex) {
57 | Logger.getLogger(GoogleNewsW2VOpenCycSubspace.class.getName()).log(Level.SEVERE, null, ex);
58 | }
59 | }
60 | return singleton;
61 | }
62 |
63 | static String getWord2VecVectorsMapName() {
64 | return GoogleNewsW2VOpenCycSubspace.class.getCanonicalName();
65 | }
66 |
67 | }
68 |
--------------------------------------------------------------------------------
/OwlTools/src/main/java/com/cyc/tool/owltools/OwlToolsConfig.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.owltools;
2 |
3 | /*
4 | * #%L
5 | * OwlTools
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import com.cyc.tool.MapDBConfiguration;
24 |
25 | /**
26 | *
OwlToolsConfig provides some locations to use for classes in the OwlTools project.
27 | */
28 | public class OwlToolsConfig extends MapDBConfiguration{
29 |
30 | /**
31 | * The location of the OpenCyc OWL export file.
32 | */
33 | final public static String ocycLocation = "/cyc/projects/kbTaxonomy/owl-export-unversioned.owl";
34 | private static final String allConceptsName = "allConcepts";
35 | private static final String conceptsWithTermsName = "termsWithConcepts";
36 | private static final String fallBackDBLocation = "/cyc/projects/kbTaxonomy/Experiments/ConceptFinder/";
37 |
38 | // From OwlToolsConfig.java in W2VOCyc
39 | private static final String ocycTermDBFile = "/ocycTerm";
40 | private static final String ocycTermMapName = "owlTerms";
41 |
42 |
43 | private static final String typeGraphName = "typeGraph";
44 |
45 | /**
46 | *
47 | * @return the allConceptsName
48 | */
49 | protected static String getAllConceptsName() {
50 | return allConceptsName;
51 | }
52 |
53 | /**
54 | *
55 | * @return the conceptsWithTermsName
56 | */
57 | protected static String getConceptsWithTermsName() {
58 | return conceptsWithTermsName;
59 | }
60 |
61 | /**
62 | *
63 | * @return the location of the ocycTermDBFile
64 | */
65 | protected static String getOcycTermDBFile() {
66 | return getMapDBBase(fallBackDBLocation) +
67 | ocycTermDBFile;
68 | }
69 |
70 | /**
71 | *
72 | * @return the ocycTermMapName
73 | */
74 | protected static String getOcycTermMapName() {
75 | return ocycTermMapName;
76 | }
77 |
78 | /**
79 | *
80 | * @return the typeGraphName
81 | */
82 | protected static String getTypeGraphName() {
83 | return typeGraphName;
84 | }
85 | }
86 |
--------------------------------------------------------------------------------
/CycMapDBTools/src/main/java/com/cyc/tool/MapDBConfiguration.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool;
2 |
3 | /*
4 | * #%L
5 | * CycMapDBTools
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import java.io.File;
24 | import java.io.FileNotFoundException;
25 | import java.io.IOException;
26 |
/**
 * MapDBConfiguration defines some defaults to use when accessing MapDB locations.
 *
 * The preferred location is {@code /fastscratch/MapDB}. When that cannot be used, a
 * {@code MapDB} directory beneath a caller-supplied fallback directory is used instead.
 */
public class MapDBConfiguration {

  /** Canonical path of the resolved MapDB directory; null until the first successful lookup. */
  private static String baseString = null;
  static final String dirName = "MapDB";
  static final String goodBase = "/fastscratch";
  static final String goodLocation = goodBase + "/" + dirName;

  /**
   * Returns the canonical path of the MapDB base directory, resolving it on the first call.
   *
   * The result is cached in a static field, so the fallback passed by the first caller decides
   * the location; {@code fb} is ignored on every later call.
   *
   * @param fb fallback directory under which a {@code MapDB} directory is created when the
   *     preferred location is unusable
   * @return base location for MapDB
   * @throws RuntimeException wrapping the IOException raised when neither location can be used
   */
  public static final String getMapDBBase(String fb) {
    if (null == baseString) {
      try {
        baseString
                = getMapDBDirectoryWithFallbackTo(new File(fb)).getCanonicalPath();
      } catch (IOException ex) {
        throw new RuntimeException(ex);
      }
    }
    return baseString;
  }

  /**
   * Picks the MapDB directory: the preferred location when it exists or can be created,
   * otherwise {@code <fallback>/MapDB}.
   *
   * @param fallback directory to fall back to
   * @return an existing directory to hold MapDB files
   * @throws FileNotFoundException if neither directory exists nor can be created
   * @throws IOException if a canonical path cannot be computed
   */
  private static File getMapDBDirectoryWithFallbackTo(File fallback) throws FileNotFoundException, IOException {
    File base = new File(goodBase);
    if (base.exists() && base.canWrite()) {
      File mdb = new File(goodLocation);
      // isDirectory rather than exists: a plain file with this name is not a usable location.
      if (mdb.isDirectory() || mdb.mkdirs()) {
        System.out.println("INFO: " + " using " + mdb.getCanonicalPath());
        return mdb;
      }
    }
    // Reached both when the preferred base is missing or read-only and when the MapDB directory
    // could not be created inside it, so the fallback is always tried before giving up.
    String fallbackPath = fallback.getCanonicalPath();
    System.out.println("WARN: " + goodBase + " not available, backing off to "
            + fallbackPath);
    File completeFallBack = new File(fallbackPath + "/" + dirName);
    if (completeFallBack.isDirectory() || completeFallBack.mkdirs()) {
      return completeFallBack;
    }
    throw new FileNotFoundException(goodBase + " is not available for " + dirName
            + " and neither is " + fallback);

  }

}
77 |
--------------------------------------------------------------------------------
/DistributedRepresentations/src/main/java/com/cyc/tool/distributedrepresentations/GoogleNewsW2VSpace.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.distributedrepresentations;
2 |
3 | /*
4 | * #%L
5 | * DistributedRepresentations
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import java.io.IOException;
24 | import java.util.logging.Level;
25 | import java.util.logging.Logger;
26 |
27 | /**
28 | * The word2vec space produced by Google by training on 10^11 words of news.
29 | *
30 | *
31 | * See: https://code.google.com/p/word2vec/
32 | */
33 | public class GoogleNewsW2VSpace extends Word2VecSpaceFromFile {
34 |
35 | private static GoogleNewsW2VSpace singleton;
36 | private static final String w2vfile = "/cyc/projects/kbTaxonomy/Experiments/ConceptFinder/GoogleNews-vectors-negative300.bin.gz";
37 |
38 | private GoogleNewsW2VSpace() throws IOException {
39 | super();
40 | vectors = db.getTreeMap(getWord2VecVectorsMapName());
41 | if (!vectors.isEmpty()) {
42 | assert (getVector("snowcapped_Caucasus") != null);
43 | setSize(getVector("dog").length);
44 | return;
45 | }
46 | createW2VinDB(getW2vfile());
47 | }
48 |
49 | /**
50 | * Factory get method for GoogleNewsW2VSpace.
51 | *
52 | * @return a GoogleNewsW2VSpace
53 | */
54 | public static GoogleNewsW2VSpace get() {
55 | if (singleton == null) {
56 | try {
57 | singleton = new GoogleNewsW2VSpace();
58 | } catch (IOException ex) {
59 | Logger.getLogger(GoogleNewsW2VSpace.class.getName()).log(Level.SEVERE, null, ex);
60 | throw new RuntimeException("Can't create the Google News W2VSpace object " + ex);
61 | }
62 | }
63 | return singleton;
64 | }
65 |
66 | private static String getW2vfile() {
67 | return w2vfile;
68 | }
69 |
70 | private static String getWord2VecVectorsMapName() {
71 | /*
72 | @ToDo: change this to use the class name, so that it's automatically correct
73 | */
74 | return GoogleNewsW2VSpace.class.getCanonicalName();
75 | //return word2VecVectorsMapName;
76 | }
77 |
78 | }
79 |
--------------------------------------------------------------------------------
/DistributedRepresentations/src/main/java/com/cyc/tool/distributedrepresentations/Word2VecSubspace.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.distributedrepresentations;
2 |
3 | /*
4 | * #%L
5 | * DistributedRepresentations
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import java.io.File;
24 | import java.io.IOException;
25 | import java.util.Map;
26 | import java.util.function.Predicate;
27 | import org.mapdb.DBMaker;
28 |
29 | /**
30 | * A space of words from Google Word2Vec.
31 | *
32 | */
33 | public abstract class Word2VecSubspace extends Word2VecSpace {
34 |
35 | final Word2VecSpace mySuperSpace;
36 |
37 | /**
38 | * Word2VecSubspace constructor.
39 | *
40 | * @param ofSpace
41 | * @param includeIf
42 | * @param persistLoc
43 | * @throws IOException
44 | */
45 | protected Word2VecSubspace(Word2VecSpace ofSpace, Predicate includeIf, String persistLoc) throws IOException {
46 |
47 | mySuperSpace = ofSpace;
48 | if (db == null) {
49 | db = DBMaker.newFileDB(new File(Config.getW2vDBFile()))
50 | .closeOnJvmShutdown()
51 | // .encryptionEnable("password")
52 | .make();
53 | }
54 | vectors = db.getTreeMap(persistLoc);
55 | // vectors.clear();
56 | if (!vectors.isEmpty()) {
57 | setSize(vectors.values().iterator().next().length);
58 | System.out.println("Got cached w2vspace for " + persistLoc + " of dimensionality " + getSize() + " and with " + vectors.size() + " entries.");
59 | return;
60 | }
61 | // assert(vectors == null) :"Subspaces msut be completely empty when created";
62 | System.out.println("Filtering vectors for:" + persistLoc);
63 | Map newvectors = ofSpace.filterVectors(includeIf);
64 | newvectors.entrySet().forEach(e -> {
65 | vectors.put(e.getKey(), e.getValue());
66 | });
67 | db.commit();
68 | db.compact();
69 | db.commit();
70 | System.out.println("Vectors filtered and persisted.");
71 | }
72 |
73 | /**
74 | *
75 | * @return the mySuperSpace
76 | */
77 | public Word2VecSpace getSuperSpace() {
78 | return mySuperSpace;
79 | }
80 |
81 | }
82 |
--------------------------------------------------------------------------------
/OwlTools/src/test/java/com/cyc/tool/owltools/OpenCycReasonerIT.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.owltools;
2 |
3 | /*
4 | * #%L
5 | * OwlTools
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import java.util.List;
24 | import java.util.Set;
25 | import org.junit.After;
26 | import org.junit.AfterClass;
27 | import static org.junit.Assert.assertEquals;
28 | import static org.junit.Assert.assertTrue;
29 | import org.junit.Before;
30 | import org.junit.BeforeClass;
31 | import org.junit.Test;
32 | import org.semanticweb.owlapi.model.OWLClass;
33 | import org.semanticweb.owlapi.model.OWLOntologyCreationException;
34 |
35 | /**
36 | * Tests for OpenCycReasoner.
37 | */
38 | public class OpenCycReasonerIT {
39 |
40 | public OpenCycReasonerIT() {
41 | }
42 |
43 | @BeforeClass
44 | public static void setUpClass() {
45 | }
46 |
47 | @AfterClass
48 | public static void tearDownClass() {
49 | }
50 |
51 | @Before
52 | public void setUp() {
53 | }
54 |
55 | @After
56 | public void tearDown() {
57 | }
58 |
59 | /**
60 | * Test of get method, of class OpenCycReasoner.
61 | * @throws java.lang.Exception
62 | */
63 | @Test
64 | public void testGet() throws Exception {
65 | System.out.println("get");
66 | OpenCycReasoner result = OpenCycReasoner.get();
67 | assertTrue(result != null);
68 | }
69 |
70 | /**
71 | * Test of getAllClasses method, of class OpenCycReasoner.
72 | * @throws org.semanticweb.owlapi.model.OWLOntologyCreationException
73 | */
74 | @Test
75 | public void testGetAllClasses() throws OWLOntologyCreationException {
76 | System.out.println("getAllClasses");
77 | OpenCycReasoner instance = OpenCycReasoner.get();
78 | int expResultSize = 116842;
79 | Set result = instance.getAllClasses();
80 | assertEquals(expResultSize, result.size());
81 | }
82 |
83 | /**
84 | * Test of getAllIRIs method, of class OpenCycReasoner.
85 | * @throws org.semanticweb.owlapi.model.OWLOntologyCreationException
86 | */
87 | @Test
88 | public void testGetAllIRIs() throws OWLOntologyCreationException {
89 | System.out.println("getAllIRIs");
90 | OpenCycReasoner instance = OpenCycReasoner.get();
91 | int expResult = 116842;
92 | List result = instance.getAllIRIs();
93 | assertEquals(expResult, result.size());
94 | }
95 |
96 | }
97 |
--------------------------------------------------------------------------------
/ConceptFinder/src/main/java/com/cyc/tool/conceptfinder/AttachmentHypothesis.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.conceptfinder;
2 |
3 | /*
4 | * #%L
5 | * ConceptFinder
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import java.util.List;
24 |
25 | /**
26 | * An AttachmentHypothesis relates an OpenCyc concept to terms in a W2V Space.
27 | */
public class AttachmentHypothesis {

  int conceptID;
  String conceptURI;
  /** The target terms joined with "/"; computed once in the constructor. */
  String renderedTerms;
  Double score;
  List<String> targetTerms;
  String textLabels;

  /**
   * AttachmentHypothesis constructor
   *
   * @param id the concept ID
   * @param targetTerms the terms being attached; must not be null
   * @param conceptURI the URI of the concept
   * @param score the score of this hypothesis
   * @param textLabels the text labels reported alongside the concept
   */
  public AttachmentHypothesis(int id, List<String> targetTerms, String conceptURI, Double score, String textLabels) {
    this.conceptURI = conceptURI;
    this.score = score;
    this.textLabels = textLabels;
    this.targetTerms = targetTerms;
    this.conceptID = id;
    this.renderedTerms = String.join("/", this.targetTerms);
  }

  /**
   *
   * @return the headings for the CSV file
   */
  public static String headCSV() {
    return "ConceptID,Name,URI,Score,Strings";
  }

  /**
   *
   * @return the headings for the HTML table, as one header row
   */
  public static String headHTMLTable() {
    return "<tr><th>ConceptID</th><th>Name</th><th>URI</th><th>Score</th><th>Strings</th></tr>";
  }

  /**
   * Commas inside the rendered terms and the text labels are dropped so that they cannot split a
   * column; the URI and score are written as they are.
   *
   * @return a CSV representation of the AttachmentHypothesis
   */
  public String toCSV() {
    return conceptID + "," + stripCommas(renderedTerms) + "," + conceptURI + "," + score + ","
            + stripCommas(textLabels);
  }

  /**
   * The columns match {@link #headHTMLTable()}. Values are inserted as they are, without HTML
   * escaping.
   *
   * @return an HTML representation of the AttachmentHypothesis, as one table row
   */
  public String toHTMLTableTR() {
    return "<tr><td>" + conceptID + "</td><td>" + renderedTerms + "</td><td>" + conceptURI
            + "</td><td>" + score + "</td><td>" + textLabels + "</td></tr>";
  }

  @Override
  public String toString() {
    // \u27F6 is the long rightwards arrow; the escape keeps this file independent of the
    // encoding the compiler is run with.
    return renderedTerms + "[" + conceptID + "]\u27F6" + conceptURI + " (" + score + ":" + textLabels + ")";
  }

  /** Removes every comma from a CSV cell; a null cell becomes the empty string. */
  private static String stripCommas(String cell) {
    return cell == null ? "" : cell.replace(",", "");
  }
}
94 |
--------------------------------------------------------------------------------
/ConceptFinder/src/test/java/com/cyc/tool/conceptfinder/ConceptSpaceIT.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.conceptfinder;
2 |
3 | /*
4 | * #%L
5 | * ConceptFinder
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import com.cyc.tool.distributedrepresentations.GoogleNewsW2VSpace;
24 | import com.cyc.tool.distributedrepresentations.Word2VecSpace;
25 | import java.io.IOException;
26 | import java.util.Arrays;
27 | import java.util.List;
28 | import java.util.stream.IntStream;
29 | import org.junit.After;
30 | import org.junit.AfterClass;
31 | import static org.junit.Assert.assertEquals;
32 | import static org.junit.Assert.fail;
33 | import org.junit.Before;
34 | import org.junit.BeforeClass;
35 | import org.junit.Test;
36 | import org.semanticweb.owlapi.model.OWLOntologyCreationException;
37 |
38 | /**
39 | * ConceptSpace tests.
40 | */
41 | public class ConceptSpaceIT {
42 |
43 | static List cr = Arrays.asList("Chinese", "river");
44 | static ConceptSpace mySpace;
45 |
46 | public ConceptSpaceIT() {
47 | }
48 |
49 | @BeforeClass
50 |
51 | public static void setUpClass() throws IOException, OWLOntologyCreationException {
52 | mySpace = new ConceptSpace(GoogleNewsW2VSpace.get());
53 |
54 | }
55 |
56 | @AfterClass
57 |
58 | public static void tearDownClass() {
59 | mySpace = null;
60 | }
61 |
62 | @Test
63 | public void findNearbyTerms1() {
64 | try {
65 | long t1 = System.currentTimeMillis();
66 | List matches = mySpace.findNearestNForWithInputTermFiltering(cr, 40);
67 | IntStream.range(0, matches.size())
68 | .forEach(i -> {
69 | System.out.println(i + " " + matches.get(i).toString());
70 | });
71 | System.out.println("Took " + (System.currentTimeMillis() - t1) + "ms");
72 | assertEquals(matches.get(0).getTerm(), "Yangtze_River");
73 | assertEquals(0.6047259562339493, matches.get(5).getSimilarity(), 0.000001);
74 |
75 | assertEquals(matches.get(23).getTerm(), "rivers");
76 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
77 | fail("took unexpected exception:" + ex);
78 | }
79 | }
80 |
81 | @Test
82 | public void findNearbyTerms2() {
83 | try {
84 | long t1 = System.currentTimeMillis();
85 | List matches = mySpace.findNearestNForWithInputTermFiltering(Arrays.asList("gangplank"), 40);
86 | IntStream.range(0, matches.size())
87 | .forEach(i -> {
88 | System.out.println(i + " " + matches.get(i).toString());
89 | });
90 | System.out.println("Took " + (System.currentTimeMillis() - t1) + "ms");
91 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
92 | fail("took unexpected exception:" + ex);
93 | }
94 | }
95 |
96 | @Before
97 | public void setUp() {
98 | }
99 |
100 | @After
101 | public void tearDown() {
102 | }
103 |
104 | }
105 |
--------------------------------------------------------------------------------
/ConceptFinder/src/main/java/com/cyc/tool/conceptfinder/MissingConceptFinderDefault.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.conceptfinder;
2 |
3 | /*
4 | * #%L
5 | * ConceptFinder
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import com.cyc.tool.distributedrepresentations.Word2VecSpace;
24 | import com.cyc.tool.owltools.OpenCycOwl;
25 | import java.io.IOException;
26 | import java.util.ArrayList;
27 | import java.util.Arrays;
28 | import java.util.List;
29 | import java.util.stream.Collectors;
30 | import java.util.stream.IntStream;
31 | import org.semanticweb.owlapi.model.OWLOntologyCreationException;
32 |
33 | /**
34 | * The default implementation for MissingConceptFinder.
35 | */
36 | public class MissingConceptFinderDefault extends MissingConceptFinder {
37 |
38 | static final boolean reset = true;
39 | String[][] conceptStrings = {{"Facebook", "the Facebook"},
40 | {"telephone microphone"},
41 | {"telephone speaker"},
42 | {"backhoe"},
43 | {"facial scar", "scar on face"},
44 | {"blue eyes"},
45 | {"saluting the flag"},
46 | {"muddy paws"},
47 | {"strong muscles"},
48 | {"pan balance"},
49 | {"graduated cylinder"},
50 | {"tape measure"},
51 | {"hand lens"},
52 | {"measuring cup"}
53 | };
54 | List conceptsToLookFor = Arrays.asList(conceptStrings);
55 |
56 | /**
57 | * MissingConceptFinderDefault constructor
58 | *
59 | * @param w2v
60 | * @param oco
61 | * @throws IOException
62 | * @throws OWLOntologyCreationException
63 | */
64 | public MissingConceptFinderDefault(Word2VecSpace w2v, OpenCycOwl oco) throws IOException, OWLOntologyCreationException {
65 | this(w2v, oco, null);
66 | }
67 |
68 | /**
69 | * MissingConceptFinderDefault constructor
70 | *
71 | * @param w2v
72 | * @param oco
73 | * @param cs
74 | * @throws IOException
75 | * @throws OWLOntologyCreationException
76 | */
77 | public MissingConceptFinderDefault(Word2VecSpace w2v, OpenCycOwl oco, ConceptSpace cs) throws IOException, OWLOntologyCreationException {
78 | super(w2v, oco, cs);
79 | missingTerms = db.getTreeMap(DefaultConceptFinderConfig.getMissingTermMapName());
80 | conceptsForMissingTerms = db.getTreeMap(DefaultConceptFinderConfig.getConceptsForMissingTermsName());
81 | if (reset) {
82 | missingTerms.clear();
83 | }
84 | if (missingTerms.isEmpty()) {
85 | conceptsForMissingTerms.clear();
86 | OpenCycOwl oc = new OpenCycOwl();
87 |
88 | missingMappingNames = conceptsToLookFor;
89 | missingConceptNames = missingMappingNames.stream()
90 | .filter(oc.noConcept())
91 | .collect(Collectors.toList());
92 | IntStream.range(0, missingConceptNames.size())
93 | .forEach(i -> missingTerms.put(i, missingConceptNames.get(i)));
94 | db.commit();
95 | db.compact();
96 | oc.close();
97 |
98 | } else {
99 | missingConceptNames = new ArrayList<>();
100 | missingTerms.keySet().forEach(k -> missingConceptNames.add(missingTerms.get(k)));
101 | }
102 | }
103 |
104 | }
105 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | DistributedRepresentations and ConceptFinder
3 | ============================================
4 |
5 | Version 1.0
6 |
7 | Included
8 | --------
9 |
10 | Projects:
11 | * DistributedRepresentationsParent - The parent pom for the other projects
12 | * OwlTools - Classes for accessing the OpenCyc OWL export
13 | * CycMapDBTools - Some configuration defaults for the other projects
14 | * DistributedRepresentations - Project to access Word2Vec sources
15 | * ConceptFinder - Methods to find nearby concepts in the Word2Vec space
16 |
17 | Other Files:
18 | * Download the distributedRepresentationsDownloads file [here](https://drive.google.com/file/d/0B95A6Z6CLEXibm1DYnBzN3NkZEU/view?usp=sharing) and then extract the following:
19 | * GoogleNews-vectors-negative300.bin.gz - The GoogleNews Word2Vec Space
20 | * BioASQ - The Word2Vec space developed by BioASQ and trained on Pubmed sources
21 | * owl-export-unversioned.owl - The OpenCyc export
22 | * This README file
23 |
24 | Requirements
25 | ------------
26 |
27 | * These projects require Java 1.8.
28 | * **_This code has not yet been tested on Windows._**
29 |
30 | Description and Usage
31 | ---------------------
32 |
33 | The projects in this repository constitute a library for accessing Word2Vec content and searching in that space.
34 | The OwlTools project provides access to OpenCyc concepts that can be
35 | mapped into the space. These mapped OpenCyc concepts can be viewed using the Taxonomy Viewer, located in the KBTaxonomy repository, which uses the Distributed Representations libraries to allow users to find OpenCyc concepts by way of nearest term search in the Word2Vec space.
36 |
37 | At present, the library supports two sources:
38 |
39 | 1. The word2vec space produced by Google by training on 10^11 words of news. - (https://code.google.com/p/word2vec/)
40 | 2. The word2vec space produced by BioASQ by training on pubmed. - (http://bioasq.org/news/bioasq-releases-continuous-space-word-vectors-obtained-applying-word2vec-pubmed-abstracts)
41 |
42 | To use these libraries, you will need to update some file paths to your local system as follows:
43 |
44 | In DistributedRepresentations:
45 |
46 | 1. `GoogleNewsW2VSpace.java`
47 | * Modify the `w2vfile` variable so that it points to where you saved the GoogleNews-vectors archive extracted from the download listed above
48 | 2. `BiologyW2VSpace.java`
49 | * Modify the `fileBase` variable so that it points to the `word2vecTools` subdirectory of the BioASQ directory you saved
50 | 3. `Config.java`
51 | * Modify the `fallBackLocation` variable to a directory in your file system, ideally, where you saved the GoogleNews archive
52 |
53 | In ConceptFinder:
54 |
55 | 1. `ConceptFinderConfig.java`
56 | * Modify the `fallBackLocation` variable to a directory in your file system, ideally, where you saved the GoogleNews archive
57 | * Modify the `w2vVectorFile` variable accordingly
58 |
59 | In OwlTools:
60 |
61 | 1. `OwlToolsConfig.java`
62 | * Modify the `ocyclocation` variable to match the location of where you save the OpenCyc export file, `owl-export-unversioned.owl`
63 | * Modify the `fallBackLocation` variable to match the location you gave in `ConceptFinderConfig.java`
64 |
65 | To install the libraries to your local Maven repository, simply install the DistributedRepresentationsParent project. This will install all four of its children to your local Maven repository. To confirm that everything is working properly, run the integration tests in each of the projects. Note that some tests may take a long time (on the order of several hours) to run the first time, but should be faster in subsequent runs. The `Word2VecSpaceIT.java` test, in particular, will be setting up the Google News space on your local system, so it needs to run through all of the concepts in the space. This is a one-time operation though, so you should not have to perform this set up step again.
66 |
67 | IMPORTANT: If something goes wrong during the MapDB set-up operations, which get kicked off by running the integration tests in these projects, you may need to remove the MapDB directory and start again. This sometimes happens if the set-up process is interrupted before it has completed.
68 |
--------------------------------------------------------------------------------
/DistributedRepresentations/src/main/java/com/cyc/tool/distributedrepresentations/BiologyW2VSpace.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.distributedrepresentations;
2 |
3 | /*
4 | * #%L
5 | * DistributedRepresentations
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import java.io.BufferedReader;
24 | import java.io.File;
25 | import java.io.FileReader;
26 | import java.io.IOException;
27 | import java.util.Arrays;
28 | import java.util.logging.Level;
29 | import java.util.logging.Logger;
30 | import java.util.stream.Collectors;
31 | import org.mapdb.DBMaker;
32 |
33 | /**
34 | * The word2vec space produced by BioASQ by training on pubmed.
35 | *
36 | *
37 | * See:
38 | * http://bioasq.org/news/bioasq-releases-continuous-space-word-vectors-obtained-applying-word2vec-pubmed-abstracts
39 | */
40 | public class BiologyW2VSpace extends Word2VecSpace {
41 |
42 | private static final String fileBase = "/cyc/projects/kbTaxonomy/ConceptFinder/BioASQ/word2vecTools/";
43 | private static BiologyW2VSpace singleton;
44 | private static final String w2vlabelfile = fileBase + "types.txt";
45 | private static final String w2vvectorfile = fileBase + "vectors.txt";
46 |
47 | private BiologyW2VSpace() throws IOException {
48 | db = DBMaker.newFileDB(new File(Config.getW2vDBFile()))
49 | .closeOnJvmShutdown()
50 | // .encryptionEnable("password")
51 | .make();
52 | vectors = db.getTreeMap(getWord2VecVectorsMapName());
53 | // vectors.clear();
54 | if (!vectors.isEmpty()) {
55 | assert (getVector("anti-mib-1") != null);
56 | setSize(getVector("hgh-b").length);
57 | return;
58 | }
59 | int i = 0;
60 | try (BufferedReader labelReader = new BufferedReader(new FileReader(w2vlabelfile))) {
61 | try (BufferedReader vectorReader = new BufferedReader(new FileReader(w2vvectorfile))) {
62 | for (String label; (label = labelReader.readLine()) != null;) {
63 | String vec = vectorReader.readLine();
64 | float[] d
65 | = normVector(
66 | Arrays.asList(vec.split("\\s+"))
67 | .stream()
68 | .map(s -> Float.valueOf(s))
69 | .collect(Collectors.toList())
70 | );
71 | if (getSize() != 0) {
72 | assert d.length == getSize() : "Line without " + getSize() + " floats";
73 | } else {
74 | setSize(d.length);
75 | }
76 | if (i++ % 100000 == 0) {
77 | db.commit();
78 | System.out.println(i + ": " + label);
79 | }
80 |
81 | vectors.put(label, d);
82 | // process the line.
83 | }
84 | // line is not visible here.
85 | }
86 | }
87 | System.out.println("Read " + i + " term positions for " + BiologyW2VSpace.class.getSimpleName());
88 | db.commit();
89 | db.compact();
90 | }
91 |
92 | /**
93 | * Factory get method for BiologyW2VSpace.
94 | *
95 | * @return a BiologyW2VSpace
96 | */
97 | public static BiologyW2VSpace get() {
98 | if (singleton == null) {
99 | try {
100 | singleton = new BiologyW2VSpace();
101 | } catch (IOException ex) {
102 | Logger.getLogger(BiologyW2VSpace.class.getName()).log(Level.SEVERE, null, ex);
103 | throw new RuntimeException("Can't create the Biology W2VSpace object\n " + ex);
104 | }
105 | }
106 | return singleton;
107 | }
108 |
109 | /*
110 | @ToDo: change this to use the class name, so that it's automatically correct
111 | */
112 | private static String getWord2VecVectorsMapName() {
113 | return BiologyW2VSpace.class.getCanonicalName();
114 | }
115 |
116 | }
117 |
--------------------------------------------------------------------------------
/OwlTools/src/test/java/com/cyc/tool/owltools/OpenCycContentIT.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.owltools;
2 |
3 | /*
4 | * #%L
5 | * OwlTools
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import java.util.HashSet;
24 | import java.util.Set;
25 | import org.junit.After;
26 | import org.junit.AfterClass;
27 | import static org.junit.Assert.assertEquals;
28 | import static org.junit.Assert.assertTrue;
29 | import org.junit.Before;
30 | import org.junit.BeforeClass;
31 | import org.junit.Test;
32 | import org.semanticweb.owlapi.model.OWLOntologyCreationException;
33 |
34 | /**
35 | * Tests for OpenCycContent.
36 | *
37 | */
38 | public class OpenCycContentIT {
39 |
40 | public OpenCycContentIT() {
41 | }
42 |
43 | @BeforeClass
44 | public static void setUpClass() {
45 | }
46 |
47 | @AfterClass
48 | public static void tearDownClass() {
49 | }
50 |
51 | @Before
52 | public void setUp() {
53 | }
54 |
55 | @After
56 | public void tearDown() {
57 | }
58 |
59 | /**
60 | * Test of generateHtmlForConcept method, of class OpenCycContent.
61 | * @throws java.lang.Exception
62 | */
63 | @Test
64 | public void testGenerateHtmlForConcept() throws Exception {
65 | System.out.println("generateHtmlForConcept");
66 | OpenCycContent instance = new OpenCycContent("Mx4rKJAessNqRP6Yzb7lBhCrwQ"); // #$DogBreedShow;;
67 | String result = instance.generateHtmlForConcept();
68 | System.out.println(result);
69 | assertTrue(result.contains("
English Phrases:
"));
70 |
71 | }
72 |
73 | /**
74 | * Test of getCommentsForConceptFromOWL method, of class OpenCycContent.
75 | * @throws org.semanticweb.owlapi.model.OWLOntologyCreationException
76 | */
77 | @Test
78 | public void testGetCommentsForConcept() throws OWLOntologyCreationException {
79 | System.out.println("getCommentsForConcept");
80 | OpenCycContent instance = new OpenCycContent("Mx4rKJAessNqRP6Yzb7lBhCrwQ"); // #$DogBreedShow
81 | int expResultSize = 1;
82 | Set result = instance.getCommentsForConcept();
83 | System.out.println("Comments: " + result);
84 | assertEquals(expResultSize, result.size());
85 |
86 | }
87 |
88 | /**
89 | * Test of getLabelForConcept method, of class OpenCycContent.
90 | * @throws java.lang.Exception
91 | */
92 | @Test
93 | public void testGetLabelForConcept() throws Exception {
94 | System.out.println("getLabelForConcept");
95 | OpenCycContent instance = new OpenCycContent("Mx4rKJAessNqRP6Yzb7lBhCrwQ"); // #$DogBreedShow;
96 | String expResult = "DogBreedShow";
97 | String result = instance.getLabelForConcept();
98 | System.out.println("Label: " + result);
99 | assertEquals(expResult, result);
100 | }
101 |
102 | /**
103 | * Test of getPrettyStringsForConceptFromOWL method, of class OpenCycContent.
104 | * @throws org.semanticweb.owlapi.model.OWLOntologyCreationException
105 | */
106 | @Test
107 | public void testGetPrettyStringsForConcept() throws OWLOntologyCreationException {
108 | System.out.println("getPrettyStringsForConcept");
109 | OpenCycContent instance = new OpenCycContent("Mx4rKJAessNqRP6Yzb7lBhCrwQ"); // #$DogBreedShow
110 | int expResultSize = 7;
111 | Set result = instance.getPrettyStringsForConcept();
112 | System.out.println("Pretty Strings: " + result);
113 | assertEquals(expResultSize, result.size());
114 |
115 | }
116 |
117 | /**
118 | * Test of getTypesForConcept method, of class OpenCycContent.
119 | * @throws java.lang.Exception
120 | */
121 | @Test
122 | public void testGetTypesForConcept() throws Exception {
123 | System.out.println("getTypesForConcept");
124 | OpenCycContent instance = new OpenCycContent("Mx4rKJAessNqRP6Yzb7lBhCrwQ"); // #$DogBreedShow;;
125 | Set expResult = new HashSet<>();
126 | expResult.add("Mx4r7LaSPmtpQfiSSf5yKM70tg");
127 | Set result = instance.getTypesForConcept();
128 | System.out.println("Types: " + result);
129 | assertEquals(expResult, result);
130 |
131 | }
132 |
133 | }
134 |
--------------------------------------------------------------------------------
/DistributedRepresentations/src/main/java/com/cyc/tool/distributedrepresentations/Word2VecSpaceFromFile.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.distributedrepresentations;
2 |
3 | /*
4 | * #%L
5 | * DistributedRepresentations
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import java.io.DataInputStream;
24 | import java.io.File;
25 | import java.io.FileInputStream;
26 | import java.io.FileNotFoundException;
27 | import java.io.IOException;
28 | import java.util.logging.Level;
29 | import java.util.logging.Logger;
30 | import java.util.stream.IntStream;
31 | import java.util.zip.GZIPInputStream;
32 | import org.apache.commons.io.EndianUtils;
33 | import org.mapdb.DBMaker;
34 |
35 | /**
36 | * Word2Vec distributed representation space from Google Format file.
37 | *
38 | *
39 | * This class represents any distributed representation computed using word2vec and initially loaded
40 | * from a Google word2vec formatted file
41 | */
42 | public abstract class Word2VecSpaceFromFile extends Word2VecSpace {
43 |
44 | final StringBuilder sb = new StringBuilder();
45 |
46 | /**
47 | * Constructor for Word2VecSpaceFromFile
48 | *
49 | * @throws IOException
50 | */
51 | public Word2VecSpaceFromFile() throws IOException {
52 | db = DBMaker.newFileDB(new File(Config.getW2vDBFile()))
53 | .closeOnJvmShutdown()
54 | // .encryptionEnable("password")
55 | .make();
56 |
57 | }
58 |
59 | /**
60 | * Create a W2V space in a DB.
61 | *
62 | * @param w2vZipFile
63 | * @throws FileNotFoundException
64 | * @throws IOException
65 | */
66 | protected final void createW2VinDB(String w2vZipFile) throws FileNotFoundException, IOException {
67 | try (DataInputStream data_in
68 | = new DataInputStream(
69 | new GZIPInputStream(new FileInputStream(
70 | new File(w2vZipFile))))) {
71 | getWordsAndSize(data_in);
72 | if (vectors.size() == words) {
73 | System.out.println("Word2Vec is in DB");
74 | } else {
75 | System.out.println("DB Size:" + vectors.size());
76 |
77 | System.out.println("Want to read Word Count: " + words);
78 | System.out.println("Size:" + getSize());
79 | for (int w = 0; w < words; w++) {
80 | float[] v = new float[getSize()];
81 | String key = getVocabString(data_in);
82 | System.out.println(w + ":\t" + key);
83 |
84 | IntStream.range(0, getSize()).forEach(i -> v[i]
85 | = getFloat(data_in));
86 | vectors.put(key, normVector(v));
87 | if (w % 100000 == 1) {
88 | db.commit();
89 | }
90 | }
91 | db.commit();
92 | db.compact();
93 | }
94 | }
95 | }
96 |
97 | private float getFloat(DataInputStream s) {
98 | try {
99 | float v = EndianUtils.readSwappedFloat(s);
100 | //System.out.println(st+"["+i+"]: "+v);
101 | return v;
102 | } catch (IOException ex) {
103 | Logger.getLogger(Word2VecSpace.class.getName()).log(Level.SEVERE, null, ex);
104 | return 0.0f;
105 | }
106 | }
107 |
108 | private String getVocabString(DataInputStream s) throws IOException {
109 | sb.setLength(0);
110 | for (char ch = (char) s.read();
111 | (!Character.isWhitespace(ch) && ch >= 0 && ch <= 256);
112 | ch = (char) s.read()) {
113 | sb.append((char) ch);
114 | }
115 | return sb.toString();
116 | }
117 |
118 | private void getWordsAndSize(DataInputStream s) throws IOException {
119 | sb.setLength(0);
120 | for (char ch = (char) s.read(); ch != '\n'; ch = (char) s.read()) {
121 | sb.append(ch);
122 | }
123 | String[] parts = sb.toString().split("\\s+");
124 | words = Long.parseLong(parts[0]);
125 | setSize((int) Long.parseLong(parts[1]));
126 | }
127 |
128 | }
129 |
--------------------------------------------------------------------------------
/OwlTools/src/main/java/com/cyc/tool/owltools/OpenCycReasoner.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.owltools;
2 |
3 | /*
4 | * #%L
5 | * OwlTools
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import java.io.File;
24 | import java.util.ArrayList;
25 | import java.util.List;
26 | import java.util.Set;
27 | import org.semanticweb.owlapi.apibinding.OWLManager;
28 | import org.semanticweb.owlapi.io.FileDocumentSource;
29 | import org.semanticweb.owlapi.model.IRI;
30 | import org.semanticweb.owlapi.model.OWLAnnotationProperty;
31 | import org.semanticweb.owlapi.model.OWLClass;
32 | import org.semanticweb.owlapi.model.OWLDataFactory;
33 | import org.semanticweb.owlapi.model.OWLOntology;
34 | import org.semanticweb.owlapi.model.OWLOntologyCreationException;
35 | import org.semanticweb.owlapi.model.OWLOntologyManager;
36 | import org.semanticweb.owlapi.reasoner.OWLReasoner;
37 | import org.semanticweb.owlapi.reasoner.OWLReasonerFactory;
38 | import org.semanticweb.owlapi.reasoner.structural.StructuralReasonerFactory;
39 | import org.semanticweb.owlapi.util.DefaultPrefixManager;
40 |
41 | /**
42 | *
43 | * OpenCycReasoner provides access to methods that {@link OpenCycContent} uses to get information
44 | * out of the OpenCyc OWL file.
45 | *
46 | */
47 | public class OpenCycReasoner {
48 |
49 | private static List allClassIRIs = null;
50 | private static Set allClasses = null;
51 | private static OWLAnnotationProperty comment = null;
52 | private static OWLDataFactory dataFactory = null;
53 | private static OWLAnnotationProperty label = null;
54 | private static OWLOntologyManager manager = null;
55 |
56 | private static OpenCycReasoner me = null;
57 |
58 | private static OWLOntology openCyc = null;
59 |
60 | private static OWLAnnotationProperty prettyString = null;
61 |
62 | private static OWLReasoner reasoner = null;
63 | private static OWLReasonerFactory reasonerFactory = null;
64 | static final String ocycLocation = OwlToolsConfig.ocycLocation;
65 |
66 | /**
67 | * Creates a new instance of OpenCycReasoner.
68 | */
69 | private OpenCycReasoner() throws OWLOntologyCreationException {
70 | manager = OWLManager.createOWLOntologyManager();
71 | openCyc = getManager()
72 | .loadOntologyFromOntologyDocument(new FileDocumentSource(new File(getOcycLocation())));
73 | reasonerFactory = new StructuralReasonerFactory();
74 | reasoner = getReasonerFactory().createReasoner(getOpenCyc());
75 | dataFactory = getManager().getOWLDataFactory();
76 | prettyString = getDataFactory().getOWLAnnotationProperty(IRI.create("http://sw.opencyc.org/concept/Mx4rwLSVCpwpEbGdrcN5Y29ycA"));
77 | comment = getDataFactory().getRDFSComment();
78 | label = getDataFactory().getOWLAnnotationProperty("label", new DefaultPrefixManager("http://sw.cyc.com/CycAnnotations_v1#"));
79 |
80 | allClasses = openCyc.getClassesInSignature();
81 | allClassIRIs = getIRIs(allClasses);
82 | }
83 |
84 | /**
85 | * Factory method to get an OpenCycReasoner instance.
86 | *
87 | * @return an OpenCycReasoner
88 | * @throws OWLOntologyCreationException
89 | */
90 | public static OpenCycReasoner get() throws OWLOntologyCreationException {
91 | if (me == null) {
92 | me = new OpenCycReasoner();
93 | }
94 | return me;
95 | }
96 |
97 | /**
98 | *
99 | * @return allClasses
100 | */
101 | public Set getAllClasses() {
102 | return allClasses;
103 | }
104 |
105 | /**
106 | *
107 | * @return allClassIRIs
108 | */
109 | public List getAllIRIs() {
110 | return allClassIRIs;
111 | }
112 |
113 | /**
114 | * @return the comment
115 | */
116 | public OWLAnnotationProperty getComment() {
117 | return comment;
118 | }
119 |
120 | /**
121 | * @return the dataFactory
122 | */
123 | public OWLDataFactory getDataFactory() {
124 | return dataFactory;
125 | }
126 |
127 | /**
128 | *
129 | * @return the label
130 | */
131 | public OWLAnnotationProperty getLabel() {
132 | return label;
133 | }
134 |
135 | /**
136 | * @return the manager
137 | */
138 | public OWLOntologyManager getManager() {
139 | return manager;
140 | }
141 |
142 | /**
143 | * @return the ocycLocation
144 | */
145 | public String getOcycLocation() {
146 | return ocycLocation;
147 | }
148 |
149 | /**
150 | * @return the openCyc
151 | */
152 | public OWLOntology getOpenCyc() {
153 | return openCyc;
154 | }
155 |
156 | /**
157 | * @return the prettyString
158 | */
159 | public OWLAnnotationProperty getPrettyString() {
160 | return prettyString;
161 | }
162 |
163 | /**
164 | * @return the reasoner
165 | */
166 | public OWLReasoner getReasoner() {
167 | return reasoner;
168 | }
169 |
170 | /**
171 | * @return the reasonerFactory
172 | */
173 | public OWLReasonerFactory getReasonerFactory() {
174 | return reasonerFactory;
175 | }
176 |
177 | private List getIRIs(Set allClasses) {
178 | List allIRIs = new ArrayList<>();
179 | allClasses.forEach(c -> {
180 | String iri = c.getIRI().getFragment();
181 | allIRIs.add(iri);
182 | });
183 | return allIRIs;
184 | }
185 |
186 | }
187 |
--------------------------------------------------------------------------------
/distributedRepresentationsParent/pom.xml~:
--------------------------------------------------------------------------------
1 |
2 |
3 | 4.0.0
4 | com.cyc.tool
5 | distributedRepresentationsParent
6 | 1.0
7 | pom
8 |
9 | UTF-8
10 | 1.7
11 | 1.7
12 |
13 |
14 |
15 | Cycorp, Inc
16 | http://www.cyc.com
17 |
18 |
19 | 2015
20 |
21 |
22 | ../CycMapDBTools
23 | ../DistributedRepresentations
24 | ../ConceptFinder
25 |
26 |
27 |
28 |
29 | cyc.com
30 | file:///cyc/java/maven-repo/
31 |
32 | true
33 | always
34 |
35 |
36 | true
37 | always
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 | org.codehaus.mojo
47 | license-maven-plugin
48 | 1.7
49 |
50 | false
51 | false
52 |
53 |
54 |
55 | first
56 |
57 | update-file-header
58 |
59 | process-sources
60 |
61 | apache_v2
62 |
63 | src/main/java
64 | src/test
65 |
66 |
67 |
68 |
69 |
70 |
71 | org.apache.maven.plugins
72 | maven-javadoc-plugin
73 | 2.9.1
74 |
75 |
81 | -Xdoclint:none
82 |
83 |
84 | todo
85 | X
86 |
87 |
88 | true
89 |
90 |
91 |
92 | attach-javadocs
93 |
94 | jar
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 | junit
107 | junit
108 | 4.11
109 | test
110 |
111 |
112 |
113 |
114 | com.google.code.gson
115 | gson
116 | 2.1
117 | jar
118 |
119 |
120 | com.googlecode.json-simple
121 | json-simple
122 | 1.1.1
123 |
124 |
125 |
126 |
127 | org.apache.commons
128 | commons-lang3
129 | 3.1
130 |
131 |
132 | commons-io
133 | commons-io
134 | 2.4
135 |
136 |
137 | commons-cli
138 | commons-cli
139 | 1.2
140 |
141 |
142 |
143 | org.apache.opennlp
144 | opennlp-tools
145 | 1.5.3
146 |
147 |
148 |
149 | org.slf4j
150 | slf4j-api
151 | 1.7.5
152 |
153 |
154 | org.slf4j
155 | slf4j-simple
156 | 1.7.5
157 | test
158 |
159 |
160 |
161 | com.cyc.tool
162 | CycMapDBTools
163 | 1.0
164 | jar
165 |
166 |
167 | com.cyc.tool
168 | OwlTools
169 | 1.0
170 |
171 |
172 | com.cyc.tool
173 | DistributedRepresentations
174 | 1.0
175 | jar
176 |
177 |
178 |
179 |
180 |
181 |
182 | junit
183 | junit
184 | test
185 |
186 |
187 |
188 |
189 |
--------------------------------------------------------------------------------
/distributedRepresentationsParent/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | 4.0.0
4 | com.cyc.tool
5 | distributedRepresentationsParent
6 | 1.0
7 | pom
8 |
9 | UTF-8
10 | 1.7
11 | 1.7
12 |
13 |
14 |
15 | Cycorp, Inc
16 | http://www.cyc.com
17 |
18 |
19 | 2015
20 |
21 |
22 | ../CycMapDBTools
23 | ../OwlTools
24 | ../DistributedRepresentations
25 | ../ConceptFinder
26 |
27 |
28 |
29 |
30 | cyc.com
31 | file:///cyc/java/maven-repo/
32 |
33 | true
34 | always
35 |
36 |
37 | true
38 | always
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 | org.codehaus.mojo
48 | license-maven-plugin
49 | 1.7
50 |
51 | false
52 | false
53 |
54 |
55 |
56 | first
57 |
58 | update-file-header
59 |
60 | process-sources
61 |
62 | apache_v2
63 |
64 | src/main/java
65 | src/test
66 |
67 |
68 |
69 |
70 |
71 |
72 | org.apache.maven.plugins
73 | maven-javadoc-plugin
74 | 2.9.1
75 |
76 |
82 | -Xdoclint:none
83 |
84 |
85 | todo
86 | X
87 |
88 |
89 | true
90 |
91 |
92 |
93 | attach-javadocs
94 |
95 | jar
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 | junit
108 | junit
109 | 4.11
110 | test
111 |
112 |
113 |
114 |
115 | com.google.code.gson
116 | gson
117 | 2.1
118 | jar
119 |
120 |
121 | com.googlecode.json-simple
122 | json-simple
123 | 1.1.1
124 |
125 |
126 |
127 |
128 | org.apache.commons
129 | commons-lang3
130 | 3.1
131 |
132 |
133 | commons-io
134 | commons-io
135 | 2.4
136 |
137 |
138 | commons-cli
139 | commons-cli
140 | 1.2
141 |
142 |
143 |
144 | org.apache.opennlp
145 | opennlp-tools
146 | 1.5.3
147 |
148 |
149 |
150 | org.slf4j
151 | slf4j-api
152 | 1.7.5
153 |
154 |
155 | org.slf4j
156 | slf4j-simple
157 | 1.7.5
158 | test
159 |
160 |
161 |
162 | com.cyc.tool
163 | CycMapDBTools
164 | 1.0
165 | jar
166 |
167 |
168 | com.cyc.tool
169 | OwlTools
170 | 1.0
171 |
172 |
173 | com.cyc.tool
174 | DistributedRepresentations
175 | 1.0
176 | jar
177 |
178 |
179 |
180 |
181 |
182 |
183 | junit
184 | junit
185 | test
186 |
187 |
188 |
189 |
190 |
191 |
--------------------------------------------------------------------------------
/OwlTools/src/test/java/com/cyc/tool/owltools/OpenCycOwlIT.java~:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.owltools;
2 |
3 | /*
4 | * #%L
5 | * OwlTools
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import com.cyc.tool.distributedrepresentations.GoogleNewsW2VSpace;
24 | import com.cyc.tool.distributedrepresentations.Word2VecSpace;
25 | import com.google.common.collect.Iterables;
26 | import java.io.IOException;
27 | import java.util.HashSet;
28 | import java.util.Set;
29 | import org.junit.AfterClass;
30 | import static org.junit.Assert.assertEquals;
31 | import static org.junit.Assert.assertTrue;
32 | import org.junit.BeforeClass;
33 | import org.junit.Test;
34 | import org.semanticweb.owlapi.model.OWLOntologyCreationException;
35 |
36 | /**
37 | * Tests for OpenCycOwl.
38 | */
39 | public class OpenCycOwlIT {
40 |
41 | static OpenCycOwl ocyc;
42 |
43 | public OpenCycOwlIT() {
44 | }
45 |
46 | @BeforeClass
47 | public static void setUpClass() throws IOException, OWLOntologyCreationException {
48 | ocyc = new OpenCycOwl();
49 |
50 | }
51 |
52 | @AfterClass
53 | public static void tearDownClass() {
54 | // Remove the ontology from the manager
55 | ocyc.close();
56 | }
57 |
58 | @Test
59 | public void conceptForTest1() {
60 | Set res = ocyc.conceptsFor("the Yangtze");
61 | assertEquals(1, res.size());
62 | assertTrue(res.contains("http://sw.opencyc.org/concept/Mx4rvVj5qJwpEbGdrcN5Y29ycA"));
63 | }
64 |
65 | @Test
66 | public void conceptsForBirdTest1() {
67 | Set res = ocyc.conceptsFor("Bird");
68 | System.out.println("HEY Bird "+res);
69 | assertEquals(2,res.size());
70 | assertTrue(res.contains("http://sw.opencyc.org/concept/Mx4rvVi8SJwpEbGdrcN5Y29ycA"));
71 | }
72 |
73 | @Test
74 | public void conceptsForBirdTest2() {
75 | Set res = ocyc.conceptsFor("bird");
76 | System.out.println("HEY bird "+res);
77 | assertEquals(2, res.size());
78 | assertTrue(res.contains("http://sw.opencyc.org/concept/Mx4rvVi8SJwpEbGdrcN5Y29ycA"));
79 | }
80 |
81 | @Test
82 | public void conceptsWithTermsTest() throws IOException {
83 | Set res = ocyc.conceptsWithW2VTerms();
84 | assertEquals(49616, res.size());
85 | }
86 |
87 | @Test
88 | public void getLabelsTest() {
89 | int res = ocyc.ocycConceptForTermLabel.size();
90 | //ocyc.ocycConceptForTermLabel.keySet().forEach(s->{
91 | // System.out.println(s+"\t"+ocyc.ocycConceptForTermLabel.get(s));
92 | // });
93 | System.out.println("N RDFS Labels with concepts:" + res);
94 | assertEquals(240258, res);
95 | }
96 |
97 | @Test
98 | public void getLowerCaseStringTest() {
99 | int res = ocyc.ocycConceptForTermLower.size();
100 | System.out.println("N downcased pretty strings or labels with concepts:" + res);
101 | assertEquals(576678, res);
102 | }
103 |
104 | @Test
105 | public void getNConceptsTest() {
106 | int res = ocyc.size();
107 | System.out.println("N Classes:" + res);
108 | assertEquals(116842, res);
109 | }
110 |
111 | @Test
112 | public void getPrettyStringTest() {
113 | int res = ocyc.ocycConceptForTermPrettyString.size();
114 | System.out.println("N pretty Strings with concepts:" + res);
115 | assertEquals(345298, res);
116 | }
117 |
118 | @Test
119 | public void getTypesTest() throws IOException {
120 | Set res = ocyc.getTypes(ocyc.pizzaGUID);
121 | res.forEach(s -> {
122 | System.out.println("Pizza: " + ocyc.guidToURLString(s));
123 | });
124 | assertEquals(4, res.size());
125 | }
126 |
127 | @Test
128 | public void getTypesTransitiveTest() throws IOException {
129 | Set res = ocyc.getTypesTransitive(ocyc.pizzaGUID);
130 | res.forEach(s -> {
131 | System.out.println("Pizza: " + ocyc.guidToURLString(s));
132 | });
133 | assertEquals(62, res.size());
134 | }
135 |
136 | @Test
137 | public void guidFromURLStringTest() {
138 | String res = ocyc.guidFromURLString(ocyc.guidToURLString(ocyc.pizzaGUID));
139 | assertEquals(ocyc.pizzaGUID, res);
140 | }
141 |
142 | @Test
143 | public void knownTermTest1() {
144 | boolean res = ocyc.knownTerm("Yangtze_River");
145 | assertTrue(res);
146 | }
147 |
148 | @Test
149 | public void knownTermTest1b() {
150 | // Tests whether terms starting with "the " like "the Yangtze River" are
151 | // also being added without the "the "
152 | boolean res = ocyc.knownTerm("Yangtze River");
153 | assertTrue(res);
154 | }
155 |
156 | @Test
157 | public void knownTermTest2() {
158 | boolean res = ocyc.knownTerm("the Yangtze");
159 | assertTrue(res);
160 | }
161 |
162 | @Test
163 | public void knownTermTest3() {
164 | boolean res = ocyc.knownTerm("rivers");
165 | assertTrue(res);
166 |
167 | }
168 |
169 | @Test
170 | public void knownTermTest4() {
171 | boolean res = ocyc.knownTerm("Hubble_Space_Telescope");
172 | assertTrue(res);
173 |
174 | }
175 | @Test
176 | public void stringsForBirdConceptTest() {
177 | String res = ocyc.labelsForConcept("http://sw.opencyc.org/concept/Mx4rvVi8SJwpEbGdrcN5Y29ycA");
178 | assertEquals("Birding|bird|Birds|Birder|Aves|birds|fowl", res);
179 | }
180 |
181 | @Test
182 | public void stringsForConceptTest1() {
183 | String res = ocyc.labelsForConcept("http://sw.opencyc.org/concept/Mx4rvVj5qJwpEbGdrcN5Y29ycA");
184 | assertEquals("Chang Jiang|the Yangtze River|Yangtze|Chang Jiang River|the Yangtze|the Chang Jiang|Yangtze River|the Chang Jiang River", res);
185 | }
186 |
187 | @Test
188 | public void testConceptMap() throws IOException {
189 | Word2VecSpace sp = GoogleNewsW2VSpace.get();
190 | Set yesses = new HashSet<>();
191 | Set allTerms = new HashSet<>();
192 | Iterables.concat(
193 | ocyc.ocycConceptForTermPrettyString.keySet(),
194 | ocyc.ocycConceptForTermLabel.keySet(),
195 | ocyc.ocycConceptForTermLower.keySet()).forEach(lit -> {
196 | if (sp.knownTerm(lit)) {
197 | yesses.add(lit);
198 | }
199 | allTerms.add(lit);
200 |
201 | });
202 | System.out.println("Term strings for ocyc contained in W2V knownterm test:");
203 | System.out.println("\tYes:" + yesses.size());
204 | System.out.println("\t No:" + (allTerms.size() - yesses.size()));
205 | System.out.println("\tAll:" + allTerms.size());
206 | // System.out.println("Yesses: \n" + String.join(", ", yesses));
207 | // System.out.println("Nos: \n" + String.join("; ", allTerms));
208 | assertEquals(67532, yesses.size());
209 | assertEquals(886523, allTerms.size());
210 | }
211 | }
212 |
--------------------------------------------------------------------------------
/OwlTools/src/test/java/com/cyc/tool/owltools/OpenCycOwlIT.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.owltools;
2 |
3 | /*
4 | * #%L
5 | * OwlTools
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | //import com.cyc.tool.distributedrepresentations.GoogleNewsW2VSpace;
24 | //import com.cyc.tool.distributedrepresentations.Word2VecSpace;
25 | import com.google.common.collect.Iterables;
26 | import java.io.IOException;
27 | import java.util.HashSet;
28 | import java.util.Set;
29 | import org.junit.AfterClass;
30 | import static org.junit.Assert.assertEquals;
31 | import static org.junit.Assert.assertTrue;
32 | import org.junit.BeforeClass;
33 | import org.junit.Test;
34 | import org.semanticweb.owlapi.model.OWLOntologyCreationException;
35 |
36 | /**
37 | * Tests for OpenCycOwl.
38 | */
39 | public class OpenCycOwlIT {
40 |
41 | static OpenCycOwl ocyc;
42 |
43 | public OpenCycOwlIT() {
44 | }
45 |
46 | @BeforeClass
47 | public static void setUpClass() throws IOException, OWLOntologyCreationException {
48 | ocyc = new OpenCycOwl();
49 |
50 | }
51 |
52 | @AfterClass
53 | public static void tearDownClass() {
54 | // Remove the ontology from the manager
55 | ocyc.close();
56 | }
57 |
58 | @Test
59 | public void conceptForTest1() {
60 | Set res = ocyc.conceptsFor("the Yangtze");
61 | assertEquals(1, res.size());
62 | assertTrue(res.contains("http://sw.opencyc.org/concept/Mx4rvVj5qJwpEbGdrcN5Y29ycA"));
63 | }
64 |
65 | @Test
66 | public void conceptsForBirdTest1() {
67 | Set res = ocyc.conceptsFor("Bird");
68 | System.out.println("HEY Bird "+res);
69 | assertEquals(2,res.size());
70 | assertTrue(res.contains("http://sw.opencyc.org/concept/Mx4rvVi8SJwpEbGdrcN5Y29ycA"));
71 | }
72 |
73 | @Test
74 | public void conceptsForBirdTest2() {
75 | Set res = ocyc.conceptsFor("bird");
76 | System.out.println("HEY bird "+res);
77 | assertEquals(2, res.size());
78 | assertTrue(res.contains("http://sw.opencyc.org/concept/Mx4rvVi8SJwpEbGdrcN5Y29ycA"));
79 | }
80 |
81 | // @Test
82 | // public void conceptsWithTermsTest() throws IOException {
83 | // Set res = ocyc.conceptsWithW2VTerms();
84 | // assertEquals(49616, res.size());
85 | // }
86 |
87 | @Test
88 | public void getLabelsTest() {
89 | int res = ocyc.ocycConceptForTermLabel.size();
90 | //ocyc.ocycConceptForTermLabel.keySet().forEach(s->{
91 | // System.out.println(s+"\t"+ocyc.ocycConceptForTermLabel.get(s));
92 | // });
93 | System.out.println("N RDFS Labels with concepts:" + res);
94 | assertEquals(240258, res);
95 | }
96 |
97 | @Test
98 | public void getLowerCaseStringTest() {
99 | int res = ocyc.ocycConceptForTermLower.size();
100 | System.out.println("N downcased pretty strings or labels with concepts:" + res);
101 | assertEquals(576678, res);
102 | }
103 |
104 | @Test
105 | public void getNConceptsTest() {
106 | int res = ocyc.size();
107 | System.out.println("N Classes:" + res);
108 | assertEquals(116842, res);
109 | }
110 |
111 | @Test
112 | public void getPrettyStringTest() {
113 | int res = ocyc.ocycConceptForTermPrettyString.size();
114 | System.out.println("N pretty Strings with concepts:" + res);
115 | assertEquals(345298, res);
116 | }
117 |
118 | @Test
119 | public void getTypesTest() throws IOException {
120 | Set res = ocyc.getTypes(ocyc.pizzaGUID);
121 | res.forEach(s -> {
122 | System.out.println("Pizza: " + ocyc.guidToURLString(s));
123 | });
124 | assertEquals(4, res.size());
125 | }
126 |
127 | @Test
128 | public void getTypesTransitiveTest() throws IOException {
129 | Set res = ocyc.getTypesTransitive(ocyc.pizzaGUID);
130 | res.forEach(s -> {
131 | System.out.println("Pizza: " + ocyc.guidToURLString(s));
132 | });
133 | assertEquals(62, res.size());
134 | }
135 |
136 | @Test
137 | public void guidFromURLStringTest() {
138 | String res = ocyc.guidFromURLString(ocyc.guidToURLString(ocyc.pizzaGUID));
139 | assertEquals(ocyc.pizzaGUID, res);
140 | }
141 |
142 | @Test
143 | public void knownTermTest1() {
144 | boolean res = ocyc.knownTerm("Yangtze_River");
145 | assertTrue(res);
146 | }
147 |
148 | @Test
149 | public void knownTermTest1b() {
150 | // Tests whether terms starting with "the " like "the Yangtze River" are
151 | // also being added without the "the "
152 | boolean res = ocyc.knownTerm("Yangtze River");
153 | assertTrue(res);
154 | }
155 |
156 | @Test
157 | public void knownTermTest2() {
158 | boolean res = ocyc.knownTerm("the Yangtze");
159 | assertTrue(res);
160 | }
161 |
162 | @Test
163 | public void knownTermTest3() {
164 | boolean res = ocyc.knownTerm("rivers");
165 | assertTrue(res);
166 |
167 | }
168 |
169 | @Test
170 | public void knownTermTest4() {
171 | boolean res = ocyc.knownTerm("Hubble_Space_Telescope");
172 | assertTrue(res);
173 |
174 | }
175 | @Test
176 | public void stringsForBirdConceptTest() {
177 | String res = ocyc.labelsForConcept("http://sw.opencyc.org/concept/Mx4rvVi8SJwpEbGdrcN5Y29ycA");
178 | assertEquals("Birding|bird|Birds|Birder|Aves|birds|fowl", res);
179 | }
180 |
181 | @Test
182 | public void stringsForConceptTest1() {
183 | String res = ocyc.labelsForConcept("http://sw.opencyc.org/concept/Mx4rvVj5qJwpEbGdrcN5Y29ycA");
184 | assertEquals("Chang Jiang|the Yangtze River|Yangtze|Chang Jiang River|the Yangtze|the Chang Jiang|Yangtze River|the Chang Jiang River", res);
185 | }
186 |
187 | // @Test
188 | // public void testConceptMap() throws IOException {
189 | // Word2VecSpace sp = GoogleNewsW2VSpace.get();
190 | // Set yesses = new HashSet<>();
191 | // Set allTerms = new HashSet<>();
192 | // Iterables.concat(
193 | // ocyc.ocycConceptForTermPrettyString.keySet(),
194 | // ocyc.ocycConceptForTermLabel.keySet(),
195 | // ocyc.ocycConceptForTermLower.keySet()).forEach(lit -> {
196 | // if (sp.knownTerm(lit)) {
197 | // yesses.add(lit);
198 | // }
199 | // allTerms.add(lit);
200 | //
201 | // });
202 | // System.out.println("Term strings for ocyc contained in W2V knownterm test:");
203 | // System.out.println("\tYes:" + yesses.size());
204 | // System.out.println("\t No:" + (allTerms.size() - yesses.size()));
205 | // System.out.println("\tAll:" + allTerms.size());
206 | // // System.out.println("Yesses: \n" + String.join(", ", yesses));
207 | // // System.out.println("Nos: \n" + String.join("; ", allTerms));
208 | // assertEquals(67532, yesses.size());
209 | // assertEquals(886523, allTerms.size());
210 | // }
211 | }
212 |
213 |
--------------------------------------------------------------------------------
/DistributedRepresentations/src/test/java/com/cyc/tool/distributedrepresentations/Word2VecSpaceIT.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.distributedrepresentations;
2 |
3 | /*
4 | * #%L
5 | * DistributedRepresentations
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import java.io.IOException;
24 | import java.util.Arrays;
25 | import java.util.List;
26 | import org.junit.AfterClass;
27 | import static org.junit.Assert.assertEquals;
28 | import static org.junit.Assert.assertTrue;
29 | import static org.junit.Assert.fail;
30 | import org.junit.BeforeClass;
31 | import org.junit.Test;
32 |
33 | /**
34 | * Tests for Word2VecSpace.
35 | */
36 | public class Word2VecSpaceIT {
37 |
38 | static List cr = Arrays.asList("Chinese", "river");
39 | static Word2VecSpace mySpace;
40 |
41 | public Word2VecSpaceIT() {
42 | }
43 |
44 | @BeforeClass
45 |
46 | public static void setUpClass() throws IOException {
47 | mySpace = GoogleNewsW2VSpace.get();
48 | }
49 |
50 | @AfterClass
51 |
52 | public static void tearDownClass() {
53 | mySpace = null;
54 | }
55 | //
56 |
57 | @Test
58 | public void distanceTest() {
59 | assertEquals(1.0, mySpace.cosineSimilarity("skimpy bathing suits", "skimpy_bathing_suits"), 0.00000001);
60 | assertEquals(0.24279, mySpace.cosineSimilarity("skimpy bathing suits", "Giant Octopus"), 0.0001);
61 | assertEquals(0.54801, mySpace.cosineSimilarity("skimpy bathing suits", "bathing suits"), 0.0001);
62 | assertEquals(0.645069, mySpace.cosineSimilarity("apple", "pear"), 0.0001);
63 | assertEquals(0.20749, mySpace.cosineSimilarity("apple", "cat"), 0.0001);
64 |
65 | assertTrue(mySpace.cosineSimilarity("apple", "pear")
66 | > mySpace.cosineSimilarity("apple", "cat"));
67 | }
68 |
69 | @Test
70 | public void getVectorTest1() {
71 | assertEquals(-0.05338118f, (mySpace.getVector("skimpy bathing suits")[5]), 0.000001);
72 | assertEquals(0.047296f, (mySpace.getVector("skimpy bathing suits")[105]), 0.000001);
73 | }
74 |
75 | @Test
76 | public void getVectorTest2a() {
77 | assertEquals(-0.049851f, (mySpace.getVector("Chinese")[0]), 0.000001);
78 | assertEquals(-0.090444f, (mySpace.getVector("Chinese")[5]), 0.000001);
79 | }
80 |
81 | @Test
82 | public void getVectorTest2b() {
83 | assertEquals(0.002663f, (mySpace.getVector("river")[0]), 0.000001);
84 | assertEquals(-0.029231f, (mySpace.getVector("river")[5]), 0.000001);
85 | }
86 |
87 | @Test
88 | public void googleDistanceTest1() {
89 | try {
90 | assertEquals(0.667376,
91 | mySpace.googleSimilarity(cr, "Yangtze_River"), 0.0001);
92 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
93 | fail("took unexpected exception:" + ex);
94 | }
95 | }
96 |
97 | @Test
98 | public void googleDistanceTest2() {
99 | try {
100 | assertEquals(0.594108,
101 | mySpace.googleSimilarity(cr, "Hongze_Lake"), 0.0001);
102 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
103 | fail("took unexpected exception:" + ex);
104 | }
105 | }
106 |
107 | @Test
108 | public void googleDistanceTest3() {
109 | try {
110 | assertEquals(0.604726,
111 | mySpace.googleSimilarity(cr, "Huangpu_River"), 0.0001);
112 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
113 | fail("took unexpected exception:" + ex);
114 | }
115 | }
116 |
117 | @Test
118 | public void googleNormVectorTest0() {
119 | try {
120 | float[] norm = mySpace.getGoogleNormedVector(cr);
121 | assertEquals(-0.032075, norm[0], 0.000001);
122 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
123 | fail("took unexpected exception:" + ex);
124 | }
125 | }
126 |
127 | @Test
128 | public void googleNormVectorTest100() {
129 | float[] norm;
130 | try {
131 | norm = mySpace.getGoogleNormedVector(cr);
132 | assertEquals(-0.095236, norm[100], 0.000001);
133 |
134 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
135 | fail("took unexpected exception:" + ex);
136 | }
137 | }
138 |
139 | @Test
140 | public void googleNormVectorTest5() {
141 | try {
142 | float[] norm = mySpace.getGoogleNormedVector(cr);
143 | assertEquals(-0.081347, norm[5], 0.000001);
144 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
145 | fail("took unexpected exception:" + ex);
146 | }
147 | }
148 |
149 | @Test
150 | public void googleNormVectorTest50() {
151 | try {
152 | float[] norm = mySpace.getGoogleNormedVector(cr);
153 | assertEquals(0.080537, norm[50], 0.000001);
154 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
155 | fail("took unexpected exception:" + ex);
156 | }
157 | }
158 |
159 | /**
160 | * Test if known terms have been loaded from the Word2Vec file or DB
161 | */
162 | @Test
163 | public void knownTermTest() {
164 | // System.out.println("DB Size:" + vectors.size());
165 |
166 | assertTrue(mySpace.knownTerm("Yathra"));
167 | assertTrue(mySpace.knownTerm("skimpy bathing suits"));
168 | assertTrue(mySpace.knownTerm("Giant_Octopus"));
169 | assertTrue(mySpace.knownTerm("Yangtze_River"));
170 | assertTrue(mySpace.knownTerm("Chinese"));
171 | // assertTrue(mySpace.knownTerm("Chinese River"));
172 |
173 | }
174 |
175 | // @Test
176 | // public void findNearbyTerms1() {
177 | // try {
178 | // long t1 = System.currentTimeMillis();
179 | // List matches = mySpace.findNearestNForWithInputTermFiltering(cr, 40);
180 | // IntStream.range(0, matches.size())
181 | // .forEach(i -> {
182 | // System.out.println(i + " " + matches.get(i).toString());
183 | // });
184 | // System.out.println("Took " + (System.currentTimeMillis() - t1) + "ms");
185 | // assertEquals(matches.get(0).getTerm(), "Yangtze_River");
186 | // assertEquals(0.604726, matches.get(5).getSimilarity(), 0.000001);
187 | //
188 | // assertEquals(matches.get(23).getTerm(), "rivers");
189 | // } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
190 | // fail("took unexpected exception:" + ex);
191 | // }
192 | // }
193 | //
194 | // @Test
195 | //
196 | // public void findNearbyTerms2() {
197 | // try {
198 | // long t1 = System.currentTimeMillis();
199 | // List matches = mySpace.findNearestNForWithInputTermFiltering(Arrays.asList("gangplank"), 40);
200 | // IntStream.range(0, matches.size())
201 | // .forEach(i -> {
202 | // System.out.println(i + " " + matches.get(i).toString());
203 | // });
204 | // System.out.println("Took " + (System.currentTimeMillis() - t1) + "ms");
205 | // } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
206 | // fail("took unexpected exception:" + ex);
207 | // }
208 | // }
209 | @Test
210 | public void testNGramsFor() {
211 | List res = Word2VecSpace.nGramsFor(Arrays.asList("this", "is", "a", "test"));
212 | // System.out.println("test: "+res+" len:"+res.size());
213 |
214 | assertEquals(10, res.size());
215 | }
216 |
217 | @Test
218 | public void testNGramsForCR() {
219 | List res = Word2VecSpace.nGramsFor(cr);
220 | System.out.println("test: " + res + " len:" + res.size());
221 | assertEquals(3, res.size());
222 | }
223 |
224 | }
225 |
--------------------------------------------------------------------------------
/OwlTools/src/main/java/com/cyc/tool/owltools/OpenCycContent.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.owltools;
2 |
3 | /*
4 | * #%L
5 | * OwlTools
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import java.util.ArrayList;
24 | import java.util.Collection;
25 | import java.util.HashSet;
26 | import java.util.List;
27 | import java.util.Set;
28 | import org.semanticweb.owlapi.model.IRI;
29 | import org.semanticweb.owlapi.model.OWLAnnotation;
30 | import org.semanticweb.owlapi.model.OWLClass;
31 | import org.semanticweb.owlapi.model.OWLOntologyCreationException;
32 | import org.semanticweb.owlapi.reasoner.NodeSet;
33 | import org.semanticweb.owlapi.search.EntitySearcher;
34 |
35 | /**
36 | *
37 | * OpenCycContent is designed to hold information about a given OpenCyc concept that can be found in
38 | * the OWL export of OpenCyc.
39 | *
40 | * This software is the proprietary information of Cycorp, Inc.
41 | *
42 | * Use is subject to license terms.
43 | *
44 | * Created on : Feb 25, 2015, 2:47:47 PM
45 | */
46 | public class OpenCycContent {
47 |
48 | Set commentsForConcept;
49 | String conceptURI;
50 | String labelForConcept;
51 | Set prettyStringsForConcept;
52 | Set subTypesForConcept;
53 |
54 | Set typesForConcept;
55 |
56 | //// Constructors
57 | /**
58 | * Creates a new instance of OpenCycContent.
59 | *
60 | * @param hlid
61 | * @throws org.semanticweb.owlapi.model.OWLOntologyCreationException
62 | */
63 | public OpenCycContent(String hlid) throws OWLOntologyCreationException {
64 | conceptURI = hlid;
65 | prettyStringsForConcept = null;
66 | commentsForConcept = null;
67 | labelForConcept = null;
68 | typesForConcept = null;
69 | }
70 |
71 | /**
72 | *
73 | * @return HTML String with information about the concept
74 | * @throws OWLOntologyCreationException
75 | */
76 | public String generateHtmlForConcept() throws OWLOntologyCreationException {
77 | String html = "";
78 | String constantName = getLabelForConcept();
79 | Set commentStr = getCommentsForConcept();
80 | Set prettyStr = getPrettyStringsForConcept();
81 | html += "" + constantName + " \n\n"
82 | + selectPicForConcept(getTypesForConcept())
83 | + "" + commentStr.toArray(new String[0])[0] + "
\n"
84 | + "English Phrases:
\n"
85 | + "\n";
86 | for (String s : prettyStr) {
87 | html += " " + s + " \n";
88 | }
89 | html += " \n";
90 |
91 | return html;
92 | }
93 |
94 | /**
95 | *
96 | * @return Set of String comments
97 | * @throws OWLOntologyCreationException
98 | */
99 | public Set getCommentsForConcept() throws OWLOntologyCreationException {
100 | if (commentsForConcept == null) {
101 | commentsForConcept = getCommentsForConceptFromOWL();
102 | }
103 | return commentsForConcept;
104 | }
105 |
106 | /**
107 | *
108 | * @return The CycL constant name
109 | * @throws OWLOntologyCreationException
110 | */
111 | public String getLabelForConcept() throws OWLOntologyCreationException {
112 | if (labelForConcept == null) {
113 | labelForConcept = getLabelForConceptFromOWL();
114 | }
115 | return labelForConcept;
116 | }
117 |
118 | /**
119 | *
120 | * @return Set of Strings with NL for the concept
121 | * @throws OWLOntologyCreationException
122 | */
123 | public Set getPrettyStringsForConcept() throws OWLOntologyCreationException {
124 | if (prettyStringsForConcept == null) {
125 | prettyStringsForConcept = getPrettyStringsForConceptFromOWL();
126 | }
127 | return prettyStringsForConcept;
128 | }
129 |
130 | /**
131 | *
132 | * @return Set of Strings with names for generalizations of the concept
133 | * @throws OWLOntologyCreationException
134 | */
135 | public Set getSubTypesForConcept() throws OWLOntologyCreationException {
136 | if (subTypesForConcept == null) {
137 | subTypesForConcept = getSubTypesForConceptFromOWL();
138 | }
139 | return subTypesForConcept;
140 | }
141 |
142 | /**
143 | *
144 | * @return Set of Strings with names for specializations of the concept
145 | * @throws OWLOntologyCreationException
146 | */
147 | public Set getTypesForConcept() throws OWLOntologyCreationException {
148 | if (typesForConcept == null) {
149 | typesForConcept = getTypesForConceptFromOWL();
150 | }
151 | return typesForConcept;
152 | }
153 |
154 | private Set getCommentsForConceptFromOWL() throws OWLOntologyCreationException {
155 | OpenCycReasoner reasoner = OpenCycReasoner.get();
156 | Set comments = new HashSet<>();
157 | OWLClass concept = reasoner.getDataFactory().getOWLClass(IRI.create("http://sw.opencyc.org/concept/" + conceptURI));
158 | Collection anns = EntitySearcher.getAnnotations(concept, reasoner.getOpenCyc(), reasoner.getComment());
159 | anns.forEach(ann -> {
160 | comments.add(ann.getValue().asLiteral().get().getLiteral());
161 | });
162 |
163 | return comments;
164 | }
165 |
166 | private String getLabelForConceptFromOWL() throws OWLOntologyCreationException {
167 | OpenCycReasoner reasoner = OpenCycReasoner.get();
168 | String label = "";
169 | List labels = new ArrayList<>();
170 | OWLClass concept = reasoner.getDataFactory().getOWLClass(IRI.create("http://sw.opencyc.org/concept/" + conceptURI));
171 | Collection anns = EntitySearcher.getAnnotations(concept, reasoner.getOpenCyc(), reasoner.getLabel());
172 | anns.forEach(ann -> {
173 | labels.add(ann.getValue().asLiteral().get().getLiteral());
174 | });
175 | if (conceptURI.contains("Mx")) {
176 | try {
177 | label = labels.get(0);
178 | } catch (Exception e) {
179 | System.out.println("Something went wrong getting the label from OWL");
180 | label = "FakeName";
181 | }
182 | }
183 | return label;
184 | }
185 |
186 | private Set getPrettyStringsForConceptFromOWL() throws OWLOntologyCreationException {
187 | OpenCycReasoner reasoner = OpenCycReasoner.get();
188 | Set prettyStrings = new HashSet<>();
189 | OWLClass concept = reasoner.getDataFactory().getOWLClass(IRI.create("http://sw.opencyc.org/concept/" + conceptURI));
190 | Collection anns = EntitySearcher.getAnnotations(concept, reasoner.getOpenCyc(), reasoner.getPrettyString());
191 | anns.forEach(ann -> {
192 | prettyStrings.add(ann.getValue().asLiteral().get().getLiteral());
193 | });
194 |
195 | return prettyStrings;
196 | }
197 |
198 | private Set getSubTypesForConceptFromOWL() throws OWLOntologyCreationException {
199 | OpenCycReasoner reasoner = OpenCycReasoner.get();
200 | Set types = new HashSet<>();
201 | OWLClass concept = reasoner.getDataFactory().getOWLClass(IRI.create("http://sw.opencyc.org/concept/" + conceptURI));
202 | NodeSet subClasses = reasoner.getReasoner().getSubClasses(concept, true);
203 | subClasses.forEach(node -> {
204 | Set ents = node.getEntities();
205 | ents.forEach(ent -> {
206 | types.add(ent.getIRI().getShortForm());
207 | });
208 | });
209 | return types;
210 | }
211 |
212 | private Set getTypesForConceptFromOWL() throws OWLOntologyCreationException {
213 | OpenCycReasoner reasoner = OpenCycReasoner.get();
214 | Set types = new HashSet<>();
215 | OWLClass concept = reasoner.getDataFactory().getOWLClass(IRI.create("http://sw.opencyc.org/concept/" + conceptURI));
216 | NodeSet subClasses = reasoner.getReasoner().getSuperClasses(concept, true);
217 | subClasses.forEach(node -> {
218 | Set ents = node.getEntities();
219 | ents.forEach(ent -> {
220 | types.add(ent.getIRI().getShortForm());
221 | });
222 | });
223 | return types;
224 | }
225 |
226 | //// Protected Area
227 | private String selectPicForConcept(Set types) {
228 | String picHTML = " ";
229 | for (String type : types) {
230 | if (type.equalsIgnoreCase("Mx4rvViADZwpEbGdrcN5Y29ycA")) {
231 | // Event
232 | picHTML = " ";
233 | return picHTML;
234 | } else if (type.equalsIgnoreCase("Mx4rIcwFloGUQdeMlsOWYLFB2w")) {
235 | // Human
236 | picHTML = " ";
237 | return picHTML;
238 | } else if (type.equalsIgnoreCase("Mx4rv-6HepwpEbGdrcN5Y29ycA")) {
239 | // Transportation
240 | picHTML = " ";
241 | }
242 | }
243 |
244 | return picHTML;
245 | }
246 | }
247 |
--------------------------------------------------------------------------------
/DistributedRepresentations/src/main/java/com/cyc/tool/distributedrepresentations/Word2VecSpace.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.distributedrepresentations;
2 |
3 | /*
4 | * #%L
5 | * DistributedRepresentations
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import java.util.ArrayList;
24 | import java.util.Arrays;
25 | import java.util.List;
26 | import java.util.Map;
27 | import java.util.Map.Entry;
28 | import java.util.concurrent.ConcurrentNavigableMap;
29 | import java.util.function.Predicate;
30 | import java.util.stream.Collectors;
31 | import java.util.stream.IntStream;
32 | import org.mapdb.DB;
33 |
34 | /**
35 | * A space of words from Google Word2Vec
36 | *
37 | */
38 | public abstract class Word2VecSpace {
39 |
40 | private int size;
41 | DB db;
42 | Map vectors;
43 | long words;
44 |
45 | /**
46 | *
47 | * @param terms
48 | * @return a List of Strings containing nGrams for terms
49 | */
50 | public static List nGramsFor(List terms) {
51 | final List grams = new ArrayList();
52 | IntStream.rangeClosed(1, terms.size()).forEach(length -> {
53 | IntStream.rangeClosed(0, terms.size() - length).forEach(start -> {
54 | List l = terms.subList(start, start + length);
55 | grams.add(String.join(" ", l));
56 | });
57 |
58 | });
59 | return grams;
60 | }
61 |
62 | private static String norm(String term) {
63 | return term.replaceAll("\\s+", "_");
64 | }
65 |
66 | private double cosineSimilarity(float[] v1, float[] v2) {
67 | return dotProduct(v1, v2) / (magnitude(v1) * magnitude(v2));
68 | }
69 |
70 | /**
71 | *
72 | * @param t1
73 | * @param t2
74 | * @return the cosine similarity
75 | */
76 | public double cosineSimilarity(String t1, String t2) {
77 | return cosineSimilarity(getVector(t1), getVector(t2));
78 | }
79 |
80 | private double dotProduct(float[] v1, float[] v2) {
81 | return IntStream.range(0, v1.length)
82 | .mapToDouble(i -> (double) v1[i] * (double) v2[i])
83 | .sum();
84 | }
85 |
86 | private double euclidianDistance(float[] v1, float[] v2) {
87 | double dist = Math.sqrt(IntStream.range(0, v1.length)
88 | .mapToDouble(i -> Math.pow((double) v1[i] - (double) v2[i], 2))
89 | .sum());
90 | return dist;
91 | }
92 |
93 | private double euclidianDistance(String t1, String t2) {
94 | return euclidianDistance(getVector(t1), getVector(t2));
95 | }
96 |
97 | private float[] getAverageVector(List terms) {
98 | final float sum[] = new float[size];
99 | final double mult = 1.0 / terms.size();
100 | terms.forEach(s -> {
101 | float v[] = getVector(s);
102 | IntStream.range(0, size)
103 | .forEach(i -> {
104 | sum[i] += mult * v[i];
105 | });
106 | });
107 | return sum;
108 | }
109 |
110 | /**
111 | *
112 | * @return the db
113 | */
114 | public DB getDb() {
115 | return db;
116 | }
117 |
118 | /**
119 | * Set up the DB.
120 | *
121 | * @param db
122 | */
123 | public void setDb(DB db) {
124 | this.db = db;
125 | }
126 |
127 | /**
128 | *
129 | * @param terms
130 | * @return the sum of term vectors divided by vector length
131 | * @throws NoWordToVecVectorForTerm
132 | */
133 | public float[] getGoogleNormedVector(List terms) throws NoWordToVecVectorForTerm {
134 | // Sum of term vectors divided by vector length
135 | // Note that this will miss multi-word exact matches, so prefer getMaximalNormedVector
136 | //except for exact code comparison tests
137 | final float sum[] = new float[size];
138 | if (terms.stream().allMatch(s -> !knownTerm(s))) {
139 | throw new NoWordToVecVectorForTerm("Can't find vector for:" + String.join(", ", terms));
140 | }
141 | terms.stream()
142 | .filter(s -> knownTerm(s))
143 | .forEach(s -> {
144 | float v[] = getVector(s);
145 | IntStream.range(0, size)
146 | .forEach(i -> {
147 | sum[i] += v[i];
148 | });
149 | });
150 | return normVector(sum);
151 | }
152 |
153 | /**
154 | *
155 | * @param interms
156 | * @return the maximal normed vector
157 | * @throws NoWordToVecVectorForTerm
158 | */
159 | public float[]
160 | getMaximalNormedVector(List interms) throws NoWordToVecVectorForTerm {
161 | // Sum of term ngram vectors divided by vector length
162 | List terms = nGramsFor(interms);
163 | final float sum[] = new float[size];
164 | if (terms.stream().allMatch(s -> !knownTerm(s))) {
165 | throw new NoWordToVecVectorForTerm("Can't find vector for:" + String.join(", ", terms));
166 | }
167 | terms.stream()
168 | .filter(s -> knownTerm(s))
169 | .forEach(s -> {
170 | float v[] = getVector(s);
171 | IntStream.range(0, size)
172 | .forEach(i -> {
173 | sum[i] += v[i];
174 | });
175 | });
176 | return normVector(sum);
177 | }
178 |
179 | /**
180 | *
181 | * @return size of vectors
182 | */
183 | public int getNVectors() {
184 | return vectors.size();
185 | }
186 |
187 | /**
188 | *
189 | * @return size of the Word2VecSpace
190 | */
191 | public int getSize() {
192 | return size;
193 | }
194 |
195 | /**
196 | *
197 | * @param size
198 | */
199 | public void setSize(int size) {
200 | this.size = size;
201 | }
202 |
203 | /**
204 | *
205 | * @param term
206 | * @return the vector for term
207 | */
208 | public float[] getVector(String term) {
209 | return vectors.get(norm(term));
210 | }
211 |
212 | /**
213 | *
214 | * @return the vectors
215 | */
216 | public Map getVectors() {
217 | return vectors;
218 | }
219 |
220 | /**
221 | *
222 | * @param vectors
223 | */
224 | public void setVectors(ConcurrentNavigableMap vectors) {
225 | this.vectors = vectors;
226 | }
227 |
228 | /**
229 | *
230 | * @return the words
231 | */
232 | public long getWords() {
233 | return words;
234 | }
235 |
236 | /**
237 | *
238 | * @param words
239 | */
240 | public void setWords(long words) {
241 | this.words = words;
242 | }
243 |
244 | /**
245 | *
246 | * @param v1
247 | * @param v2
248 | * @return the similarity between v1 and v2
249 | */
250 | public double googleSimilarity(float[] v1, float[] v2) {
251 | return dotProduct(v1, v2);
252 | }
253 |
254 | private double googleSimilarity(String t1, String t2) {
255 | return googleSimilarity(getVector(t1), getVector(t2));
256 | }
257 |
258 | /**
259 | *
260 | * @param terms
261 | * @param term
262 | * @return the similarity
263 | * @throws NoWordToVecVectorForTerm
264 | */
265 | public double googleSimilarity(List terms, String term) throws NoWordToVecVectorForTerm {
266 | return googleSimilarity(getGoogleNormedVector(terms), getVector(term));
267 | }
268 |
269 | /**
270 | *
271 | * @param term
272 | * @return true if term is in vectors
273 | */
274 | public boolean knownTerm(String term) {
275 | return vectors.containsKey(norm(term));
276 | }
277 |
278 | private double magnitude(float[] v) {
279 | return Math.sqrt(IntStream.range(0, v.length).mapToDouble(i -> v[i] * v[i]).sum());
280 | }
281 |
282 | private double magnitude(List v) {
283 | return Math.sqrt(v.stream().mapToDouble(i -> i * i).sum());
284 | }
285 |
286 | /**
287 | *
288 | * @param v
289 | * @return normalized vector for v
290 | */
291 | public float[] normVector(float[] v) {
292 | final float normed[] = new float[size];
293 | double len = magnitude(v);
294 |
295 | IntStream.range(0, size)
296 | .forEach(i -> {
297 | normed[i] = v[i] / (float) len;
298 | });
299 | return normed;
300 | }
301 |
302 | /**
303 | *
304 | * @param v
305 | * @return normalized vector for v
306 | */
307 | public float[] normVector(List v) {
308 | final float normed[] = new float[v.size()];
309 | double len = magnitude(v);
310 |
311 | IntStream.range(0, v.size())
312 | .forEach(i -> {
313 | normed[i] = v.get(i) / (float) len;
314 | });
315 | return normed;
316 | }
317 |
318 | /**
319 | *
320 | * @param s
321 | * @return List of Strings
322 | */
323 | public List stringToList(String s) {
324 | return Arrays.asList(s.split("\\s+"));
325 | }
326 |
327 | /**
328 | *
329 | * @param includeIf the predicate that is applied to the strings (the keys or embedded strings)
330 | * of the word to vec space to determine whether they should be retained in the output vector list
331 | * @return filtered vectors Map
332 | */
333 | protected Map filterVectors(Predicate includeIf) {
334 | return vectors.entrySet().stream().filter(entry -> {
335 | return includeIf.test(entry.getKey());
336 | }).collect(Collectors.toMap(Entry::getKey, Entry::getValue));
337 | }
338 |
339 | /**
340 | * No Vector for Term
341 | *
342 | * Exception to use check when a term looked up in the space has no known position
343 | */
344 | public static class NoWordToVecVectorForTerm extends Exception {
345 |
346 | /**
347 | *
348 | * @param message
349 | */
350 | public NoWordToVecVectorForTerm(String message) {
351 | super(message);
352 | }
353 | }
354 | }
355 |
--------------------------------------------------------------------------------
/ConceptFinder/src/main/java/com/cyc/tool/conceptfinder/MissingConceptFinder.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.conceptfinder;
2 |
3 | /*
4 | * #%L
5 | * ConceptFinder
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import com.cyc.tool.distributedrepresentations.Word2VecSpace;
24 | import com.cyc.tool.owltools.OpenCycOwl;
25 | import java.io.File;
26 | import java.io.IOException;
27 | import java.util.ArrayList;
28 | import java.util.Arrays;
29 | import java.util.HashMap;
30 | import java.util.HashSet;
31 | import java.util.List;
32 | import java.util.Map;
33 | import java.util.Set;
34 | import java.util.concurrent.ConcurrentNavigableMap;
35 | import java.util.function.Predicate;
36 | import java.util.stream.Collectors;
37 | import java.util.stream.IntStream;
38 | import java.util.stream.Stream;
39 | import org.mapdb.DB;
40 | import org.mapdb.DBMaker;
41 | import org.semanticweb.owlapi.model.OWLOntologyCreationException;
42 |
43 | /**
44 | * Methods for finding missing concepts with a ConceptSpace, a Word2VecSpace, and OpenCyc.
45 | */
46 | abstract public class MissingConceptFinder {
47 |
48 | final private ConceptSpace cSpace;
49 | final private OpenCycOwl ocyc;
50 | private final Word2VecSpace w2vs;
51 | ConcurrentNavigableMap> conceptsForMissingTerms;
52 | DB db;
53 | List missingConceptNames;
54 | List missingMappingNames;
55 | ConcurrentNavigableMap missingTerms;
56 |
57 | /**
58 | * MissingConceptFinder constructor.
59 | *
60 | * @param w2v
61 | * @param oco
62 | * @throws IOException
63 | * @throws OWLOntologyCreationException
64 | */
65 | public MissingConceptFinder(Word2VecSpace w2v, OpenCycOwl oco) throws IOException, OWLOntologyCreationException {
66 | this(w2v, oco, null);
67 | }
68 |
69 | /**
70 | * MissingConceptFinder constructor.
71 | *
72 | * @param w2v
73 | * @param oco
74 | * @param cSpace
75 | * @throws IOException
76 | * @throws OWLOntologyCreationException
77 | */
78 | public MissingConceptFinder(Word2VecSpace w2v, OpenCycOwl oco, ConceptSpace cSpace) throws IOException, OWLOntologyCreationException {
79 | w2vs = w2v;
80 | ocyc = oco;
81 | this.cSpace = cSpace;
82 | db = DBMaker.newFileDB(new File(ConceptFinderConfig.getMissingConceptDBFile()))
83 | .closeOnJvmShutdown()
84 | // .encryptionEnable("password")
85 | .make();
86 |
87 | //Use this to reset
88 | // missingTerms.clear(); db.commit();
89 | }
90 |
91 | /**
92 | *
93 | * @return a List of Strings
94 | */
95 | public List conceptsWithTerms() {
96 | return this.getConceptsForMissingTerms().keySet().stream()
97 | .map(i -> Arrays.asList(getMissingTerms().get(i))
98 | .stream()
99 | .collect(Collectors.joining("|")))
100 | .collect(Collectors.toList());
101 | }
102 |
103 | /**
104 | * @return the conceptsForMissingTerms
105 | */
106 | public ConcurrentNavigableMap> getConceptsForMissingTerms() {
107 | return conceptsForMissingTerms;
108 | }
109 |
110 | /**
111 | * @param conceptsForMissingTerms the conceptsForMissingTerms to set
112 | */
113 | public void setConceptsForMissingTerms(ConcurrentNavigableMap> conceptsForMissingTerms) {
114 | this.conceptsForMissingTerms = conceptsForMissingTerms;
115 | }
116 |
117 | /**
118 | * @return the db
119 | */
120 | public DB getDb() {
121 | return db;
122 | }
123 |
124 | /**
125 | * @return the missingConceptNames
126 | */
127 | public List getMissingConceptNames() {
128 | return missingConceptNames;
129 | }
130 |
131 | /**
132 | * @param missingConceptNames the missingConceptNames to set
133 | */
134 | public void setMissingConceptNames(List missingConceptNames) {
135 | this.missingConceptNames = missingConceptNames;
136 | }
137 |
138 | /**
139 | * @return the missingMappingNames
140 | */
141 | public List getMissingMappingNames() {
142 | return missingMappingNames;
143 | }
144 |
145 | /**
146 | * @param missingMappingNames the missingMappingNames to set
147 | */
148 | public void setMissingMappingNames(List missingMappingNames) {
149 | this.missingMappingNames = missingMappingNames;
150 | }
151 |
152 | /**
153 | *
154 | * @return the missingTerms
155 | */
156 | public ConcurrentNavigableMap getMissingTerms() {
157 | return missingTerms;
158 | }
159 |
160 | /**
161 | * @param missingTerms the missingTerms to set
162 | */
163 | public void setMissingTerms(ConcurrentNavigableMap missingTerms) {
164 | this.missingTerms = missingTerms;
165 | }
166 |
167 | /**
168 | *
169 | * @return the number of missing concepts
170 | */
171 | public int missingConceptCount() {
172 | return getMissingConceptNames().size();
173 | }
174 |
175 | /**
176 | *
177 | * @param testCase
178 | * @return a Set of AttachmentHypotheses
179 | */
180 | protected Set findNearbyTermsWithGraphCore(String testCase) {
181 | return findNearbyTermsWithGraphCore(testCase, -1);
182 | }
183 |
184 | /**
185 | *
186 | * @param termStrings
187 | * @param n
188 | * @return a Set of AttachmentHypotheses
189 | */
190 | protected Set
191 | findNearbyTermsWithGraphCore(List termStrings, int n) {
192 | long t1 = System.currentTimeMillis();
193 | Set hypotheses = new HashSet<>();
194 |
195 | Set allTypes = new HashSet<>();
196 | Map typeWeights = new HashMap<>();
197 |
198 | Map conceptEvidence = new HashMap<>();
199 | System.out.print("====" + String.join("/", termStrings) + "====" + (n < 0 ? "" : " " + n) + " \t");
200 | List matches = new ArrayList<>();
201 | for (String term : termStrings) {
202 | try {
203 | matches.addAll(cSpace.findNearestNForIn(term, 40, ocyc));
204 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
205 | }
206 | }
207 | if (matches.size() == 0) {
208 | // assertEquals("common_eiders", matches.get(10).term);
209 | System.out.println("Terms [" + termStrings + "] have no words in Word2Vec");
210 | return hypotheses; // which is empty at this point
211 | // fail("took unexpected exception:" + ex);
212 | }
213 | IntStream.range(0, matches.size())
214 | .forEach(i -> {
215 | ConceptMatch m = matches.get(i);
216 | //System.out.println(i + " " + m.toString());
217 | if (m.concept != null) {
218 | allTypes.add(m.concept);
219 | typeWeights.put(m.concept,
220 | (typeWeights.containsKey(m.concept) ? typeWeights.get(m.concept) : 0.0d)
221 | + m.similarity);
222 | }
223 | });
224 | allTypes.forEach(s -> {
225 | Double weight = typeWeights.get(s);
226 | Set transTypes = ocyc.getTypesTransitiveURL(s);
227 | Set immedTypes = ocyc.getTypesURL(s);
228 |
229 | Set ret
230 | = Stream.concat(
231 | transTypes
232 | .stream()
233 | .filter(type -> allTypes.contains(type)),
234 | immedTypes.stream()
235 | ).collect(Collectors.toSet());
236 |
237 | if (!ret.isEmpty()) {
238 | ret.forEach(t -> {
239 | if (!conceptEvidence.containsKey(t)) {
240 | conceptEvidence.put(t, weight);
241 | } else {
242 | conceptEvidence.put(t, conceptEvidence.get(t) + weight);
243 | }
244 | });
245 |
246 | }
247 | });
248 |
249 | final double max = conceptEvidence.entrySet().stream()
250 | .mapToDouble(e -> e.getValue()).max().orElse(0);
251 |
252 | Set maxc = conceptEvidence.entrySet().stream()
253 | .filter(e -> e.getValue() == max)
254 | .map(e -> e.getKey()).collect(Collectors.toSet());
255 | System.out.println("Maximum parent count:" + max);
256 | System.out.println("Maximal parents:"
257 | + maxc.stream().map(s -> ocyc.labelsForConcept(s) + ": " + s)
258 | .collect(Collectors.joining("\n\t")));
259 | maxc.forEach(c -> hypotheses.add(new AttachmentHypothesis(n, termStrings,
260 | c, max, ocyc.labelsForConcept(c))));
261 | System.out.println("-----" + (System.currentTimeMillis() - t1) + "ms -----");
262 | return hypotheses; // Since we take the max of a double, there should be only one
263 | }
264 |
265 | /**
266 | *
267 | * @param testCase
268 | * @param n
269 | * @return a Set of AttachmentHypotheses
270 | * @deprecated
271 | */
272 | @Deprecated
273 | protected Set findNearbyTermsWithGraphCore(String testCase, int n) {
274 | List termStrings = new ArrayList<>();
275 | termStrings.add(testCase);
276 | return findNearbyTermsWithGraphCore(termStrings, n);
277 |
278 | }
279 |
280 | /**
281 | *
282 | * @return a List of names in the W2V space
283 | * @deprecated
284 | */
285 | @Deprecated //Depends on a variable that is only set in an initialisation phase
286 | protected List namesInW2V() {
287 | if (getMissingMappingNames() == null) {
288 | return null;
289 | }
290 | return getMissingMappingNames().stream()
291 | .filter(hasElementInW2V())
292 | .map(a -> a[0])
293 | .collect(Collectors.toList());
294 | }
295 |
296 | Predicate hasElementInW2V() {
297 | return a -> Arrays.stream(a)
298 | .anyMatch(w2vs::knownTerm);
299 | }
300 |
301 | }
302 |
--------------------------------------------------------------------------------
/ConceptFinder/src/main/java/com/cyc/tool/conceptfinder/ConceptSpace.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.conceptfinder;
2 |
3 | /*
4 | * #%L
5 | * ConceptFinder
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import com.cyc.tool.distributedrepresentations.Word2VecSpace;
24 | import com.cyc.tool.distributedrepresentations.Word2VecSpace.NoWordToVecVectorForTerm;
25 | import com.cyc.tool.distributedrepresentations.Word2VecSubspace;
26 | import com.cyc.tool.owltools.OpenCycOwl;
27 | import java.io.IOException;
28 | import java.util.Comparator;
29 | import java.util.List;
30 | import java.util.function.Function;
31 | import java.util.function.Predicate;
32 | import java.util.stream.Collectors;
33 |
34 | /**
35 | *
36 | * ConceptSpace provides access to a Word2VecSpace and methods for finding ConceptMatches.
37 | */
38 | public class ConceptSpace {
39 |
40 | Word2VecSpace w2vSpace;
41 |
/**
 * Wraps the given Word2Vec space; all searches of this ConceptSpace run against it.
 *
 * @param w2v the space to search
 * @throws java.io.IOException declared for callers; nothing in the constructor body performs I/O
 */
public ConceptSpace(Word2VecSpace w2v) throws IOException {
  this.w2vSpace = w2v;
}
51 |
52 | /**
53 | *
54 | * @param terms
55 | * @param n
56 | * @return a List of ConceptMatches
57 | * @throws NoWordToVecVectorForTerm
58 | */
59 | public List findNearestNFor(List terms, Integer n) throws NoWordToVecVectorForTerm {
60 | return findNearest(w2vSpace.getMaximalNormedVector(terms))
61 | .stream()
62 | .collect(Collectors.toList())
63 | .subList(0, n);
64 | }
65 |
66 | /**
67 | *
68 | * @param terms
69 | * @param n
70 | * @return a List of ConceptMatches
71 | * @throws NoWordToVecVectorForTerm
72 | */
73 | public List findNearestNFor(String terms, Integer n) throws NoWordToVecVectorForTerm {
74 | return findNearestNFor(w2vSpace.stringToList(terms), n);
75 |
76 | }
77 |
78 | /**
79 | *
80 | * @param terms
81 | * @param n
82 | * @param ocyc
83 | * @return a List of ConceptMatches
84 | * @throws NoWordToVecVectorForTerm
85 | */
86 | public List findNearestNForIn(List terms, Integer n, OpenCycOwl ocyc) throws NoWordToVecVectorForTerm {
87 | float[] norm = w2vSpace.getMaximalNormedVector(terms);
88 | return findNearestWhere(norm, m -> ocyc.knownTerm(m), t -> String.join(" | ", ocyc.conceptsFor(t)))
89 | .stream()
90 | .collect(Collectors.toList())
91 | .subList(0, n);
92 | }
93 |
94 | /**
95 | *
96 | * @param terms
97 | * @param n
98 | * @param ocyc
99 | * @return a List of ConceptMatches
100 | * @throws NoWordToVecVectorForTerm
101 | */
102 | public List findNearestNForIn(String terms, Integer n, OpenCycOwl ocyc) throws NoWordToVecVectorForTerm {
103 |
104 | return findNearestNForIn(w2vSpace.stringToList(terms), n, ocyc);
105 |
106 | }
107 |
108 | /**
109 | *
110 | * @param terms
111 | * @param n
112 | * @param ocyc
113 | * @return a List of ConceptMatches
114 | * @throws NoWordToVecVectorForTerm
115 | */
116 | public List findNearestNForInStrictW2V(List terms, Integer n, OpenCycOwl ocyc) throws NoWordToVecVectorForTerm {
117 | float[] norm = w2vSpace.getGoogleNormedVector(terms);
118 | return findNearestWhere(norm, m -> ocyc.knownTerm(m), t -> String.join(" | ", ocyc.conceptsFor(t)))
119 | .stream()
120 | .collect(Collectors.toList())
121 | .subList(0, n);
122 | }
123 |
124 | /**
125 | *
126 | * @param terms
127 | * @param n
128 | * @param ocyc
129 | * @return a List of ConceptMatches
130 | * @throws NoWordToVecVectorForTerm
131 | */
132 | public List findNearestNForInStrictW2V(String terms, Integer n, OpenCycOwl ocyc) throws NoWordToVecVectorForTerm {
133 | float[] norm = w2vSpace.getGoogleNormedVector(w2vSpace.stringToList(terms));
134 | return findNearestWhere(norm, m -> ocyc.knownTerm(m), t -> String.join(" | ", ocyc.conceptsFor(t)))
135 | .stream()
136 | .collect(Collectors.toList())
137 | .subList(0, n);
138 | }
139 |
140 | /**
141 | * Find the position of terms in the larger space from which this is derived a larger space, and
142 | * then search around them in a this space that spans fewer terms, but is otherwise the same
143 | *
144 | * Will fail if the space for this concept space is not a SubSpace
145 | *
146 | * @param terms The string containing a set of terms to search around
147 | * @param n How many things to find in this space
148 | * @param note
149 | * @return a List of ConceptMatches
150 | * @throws com.cyc.tool.distributedrepresentations.Word2VecSpace.NoWordToVecVectorForTerm
151 | */
152 | public List findNearestNForPosition(String terms, Integer n, Function note) throws NoWordToVecVectorForTerm {
153 | return findNearestNForPosition(w2vSpace.stringToList(terms),
154 | n, note);
155 | }
156 |
157 | /**
158 | * Find the position of terms in the larger space from which this is derived a larger space, and
159 | * then search around them in a this space that spans fewer terms, but is otherwise the same
160 | *
161 | * Will fail if the space for this concept space is not a SubSpace
162 | *
163 | * @param terms The string containing a set of terms to search around
164 | * @param n How many things to find in this space
165 | * @param note
166 | * @return a List of ConceptMatches
167 | * @throws com.cyc.tool.distributedrepresentations.Word2VecSpace.NoWordToVecVectorForTerm
168 | */
169 | public List findNearestNForPosition(List terms, Integer n, Function note) throws NoWordToVecVectorForTerm {
170 | Word2VecSpace posSpace = ((Word2VecSubspace) w2vSpace).getSuperSpace();
171 | return findNearestNForPosition(terms,
172 | posSpace, n, note);
173 | }
174 |
175 | /**
176 | * Find the position of terms in a larger space, and then search around them in a space that spans
177 | * fewer terms, but is otherwise the same
178 | *
179 | * @param terms The string containing a set of terms to search around
180 | * @param posSpace The other larger space in which to search for those terms.
181 | * @param n How many things to find in this space
182 | * @param note
183 | * @return a List of ConceptMatches
184 | * @throws com.cyc.tool.distributedrepresentations.Word2VecSpace.NoWordToVecVectorForTerm
185 | */
186 | public List findNearestNForPosition(String terms, Word2VecSpace posSpace, Integer n, Function note) throws NoWordToVecVectorForTerm {
187 | return findNearestNForPosition(w2vSpace.stringToList(terms),
188 | posSpace, n, note);
189 |
190 | }
191 |
192 | /**
193 | * Find the position of terms in a larger space, and then search around them in a space that spans
194 | * fewer terms, but is otherwise the same
195 | *
196 | * @param terms The list of terms to search around
197 | * @param posSpace The other larger space in which to search for those terms.
198 | * @param n How many things to find in this space
199 | * @param note
200 | * @return a List of ConceptMatches
201 | * @throws com.cyc.tool.distributedrepresentations.Word2VecSpace.NoWordToVecVectorForTerm
202 | */
203 | public List findNearestNForPosition(List terms, Word2VecSpace posSpace, Integer n, Function note) throws NoWordToVecVectorForTerm {
204 | return findNearest(posSpace.getMaximalNormedVector(terms), note)
205 | .stream()
206 | .collect(Collectors.toList())
207 | .subList(0, n);
208 | }
209 |
210 | /**
211 | *
212 | * @param terms
213 | * @param n
214 | * @return a List of ConceptMatches
215 | * @throws NoWordToVecVectorForTerm
216 | */
217 | public List findNearestNForStrictW2V(List terms, Integer n) throws NoWordToVecVectorForTerm {
218 | return findNearest(w2vSpace.getGoogleNormedVector(terms))
219 | .stream()
220 | .collect(Collectors.toList())
221 | .subList(0, n);
222 | }
223 |
224 | /**
225 | *
226 | * @param terms
227 | * @param n
228 | * @return a List of ConceptMatches
229 | * @throws NoWordToVecVectorForTerm
230 | */
231 | public List findNearestNForWithInputTermFiltering(List terms, Integer n) throws NoWordToVecVectorForTerm {
232 | return findNearest(w2vSpace.getMaximalNormedVector(terms))
233 | .stream()
234 | .filter(m -> !terms.contains(m.getTerm())) // the google code removes any search term
235 | .collect(Collectors.toList())
236 | .subList(0, n);
237 | }
238 |
239 | /**
240 | *
241 | * @param terms
242 | * @param n
243 | * @return a List of ConceptMatches
244 | * @throws NoWordToVecVectorForTerm
245 | */
246 | public List findNearestNForWithInputTermFilteringStrictW2V(List terms, Integer n) throws NoWordToVecVectorForTerm {
247 | return findNearest(w2vSpace.getGoogleNormedVector(terms))
248 | .stream()
249 | .filter(m -> !terms.contains(m.getTerm())) // the google code removes any search term
250 | .collect(Collectors.toList())
251 | .subList(0, n);
252 | }
253 |
254 | /**
255 | *
256 | * @return the w2vSpace
257 | */
258 | public Word2VecSpace getW2VSpace() {
259 | return w2vSpace;
260 | }
261 |
262 | private List findNearest(float[] searchVector, Function note) {
263 | Comparator compareDouble
264 | = (Double m1, Double m2) -> Double.compare(m2, m1);
265 |
266 | Comparator compareMatches
267 | = (ConceptMatch m1, ConceptMatch m2) -> Double.compare(m2.getSimilarity(), m1.getSimilarity());
268 |
269 | // This is a massive sort (3m elements) so it might be better to optimise
270 | // for top N
271 | return w2vSpace.getVectors().keySet().stream()
272 | .map(s -> new ConceptMatch(w2vSpace, searchVector, s, note))
273 | .sorted(compareMatches).collect(Collectors.toList());
274 | }
275 |
276 | private List findNearest(float[] searchVector) {
277 | return findNearest(searchVector, null);
278 | }
279 |
280 | private List findNearestWhere(float[] searchVector, Predicate pred, Function note) {
281 | Comparator compareMatches
282 | = (ConceptMatch m1, ConceptMatch m2) -> Double.compare(m2.getSimilarity(), m1.getSimilarity());
283 | // This is a massive sort (3m elements) so it might be better to optimise
284 | // for top N
285 | return w2vSpace.getVectors().keySet().parallelStream()
286 | .filter(pred)
287 | .map(s -> new ConceptMatch(w2vSpace, searchVector, s, note))
288 | .sorted(compareMatches).collect(Collectors.toList());
289 | }
290 |
291 | }
292 |
--------------------------------------------------------------------------------
/ConceptFinder/src/test/java/com/cyc/tool/conceptfinder/MissingConceptFinderIT.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.conceptfinder;
2 |
3 | /*
4 | * #%L
5 | * ConceptFinder
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | import com.cyc.tool.distributedrepresentations.GoogleNewsW2VSpace;
24 | import com.cyc.tool.distributedrepresentations.Word2VecSpace;
25 | import com.cyc.tool.owltools.OpenCycOwl;
26 | import java.io.IOException;
27 | import java.util.Arrays;
28 | import java.util.HashSet;
29 | import java.util.List;
30 | import java.util.Set;
31 | import java.util.stream.Collectors;
32 | import java.util.stream.IntStream;
33 | import org.junit.AfterClass;
34 | import static org.junit.Assert.assertEquals;
35 | import static org.junit.Assert.assertTrue;
36 | import static org.junit.Assert.fail;
37 | import org.junit.BeforeClass;
38 | import org.junit.Test;
39 | import org.semanticweb.owlapi.model.OWLOntologyCreationException;
40 |
41 | /**
42 | * MissingConceptFinder tests.
43 | */
44 | public class MissingConceptFinderIT {
45 |
46 | static ConceptSpace cSpace;
47 | static List cr = Arrays.asList("Chinese", "river");
48 | static MissingConceptFinder mcf;
49 | static Word2VecSpace mySpace;
50 | static OpenCycOwl ocyc;
51 | static List pelagicBird = Arrays.asList("pelagic", "bird");
52 |
53 | public MissingConceptFinderIT() {
54 | }
55 |
56 | @BeforeClass
57 | public static void setUpClass() throws IOException, OWLOntologyCreationException {
58 | mySpace = GoogleNewsW2VSpace.get();
59 | cSpace = new ConceptSpace(mySpace);
60 | ocyc = new OpenCycOwl();
61 | mcf = new MissingConceptFinderDefault(mySpace, ocyc, cSpace);
62 | }
63 |
64 | @AfterClass
65 | public static void tearDownClass() {
66 | mySpace = null;
67 | ocyc.close();
68 | }
69 | private static String set2String(Set s) {
70 | if (s.size()>10) return "";
71 | return s.stream()
72 | .map(i->{return String.join(",", mcf.getMissingTerms().get(i));})
73 | .collect(Collectors.joining(";"));
74 |
75 | }
76 |
77 | @Test
78 | public void conceptsWithTermsTest() {
79 | List res = mcf.conceptsWithTerms();
80 | System.out.println("There are " + res.size() + " missing concepts with associated KB terms: " + res);
81 | assertTrue(res.size() + "elements expected none", res.size() == 0);
82 | // assertTrue(res.containsAll(Arrays.asList("start", "rust", "blueberry")));
83 | }
84 |
85 | @Test
86 | public void findNearbyTerms1() {
87 | long t1 = System.currentTimeMillis();
88 | System.out.println("FNT1");
89 | List matches;
90 | try {
91 | matches = cSpace.findNearestNForIn(cr, 40, ocyc);
92 | IntStream.range(0, matches.size())
93 | .forEach(i -> {
94 | System.out.println(i + " " + matches.get(i).toString());
95 | });
96 | System.out.println("Took " + (System.currentTimeMillis() - t1) + "ms");
97 | assertEquals("Chinese", matches.get(0).term);
98 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
99 | fail("took unexpected exception:" + ex);
100 | }
101 | }
102 |
103 | @Test
104 | public void findNearbyTerms2() {
105 | try {
106 | long t1 = System.currentTimeMillis();
107 | System.out.println("FNT2");
108 | List matches = cSpace.findNearestNForIn(cr, 40, ocyc);
109 | IntStream.range(0, matches.size())
110 | .forEach(i -> {
111 | System.out.println(i + " " + matches.get(i).toString());
112 | });
113 | System.out.println("Took " + (System.currentTimeMillis() - t1) + "ms");
114 |
115 | assertEquals(0.5539201713461387, matches.get(13).similarity, 0.000001);
116 |
117 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
118 | fail("took unexpected exception:" + ex);
119 |
120 | }
121 |
122 | }
123 |
124 | @Test
125 | public void findNearbyTerms3() {
126 | try {
127 | long t1 = System.currentTimeMillis();
128 | System.out.println("FNT3");
129 | List matches = cSpace.findNearestNForIn(cr, 40, ocyc);
130 | IntStream.range(0, matches.size())
131 | .forEach(i -> {
132 | System.out.println(i + " " + matches.get(i).toString());
133 | });
134 | System.out.println("Took " + (System.currentTimeMillis() - t1) + "ms");
135 |
136 | assertEquals("creek", matches.get(7).term);
137 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
138 | fail("took unexpected exception:" + ex);
139 | }
140 | }
141 |
142 | @Test
143 | public void findNearbyTerms4() {
144 | try {
145 | long t1 = System.currentTimeMillis();
146 | System.out.println("FNT4");
147 | List matches = cSpace.findNearestNForIn(cr, 40, ocyc);
148 | IntStream.range(0, matches.size())
149 | .forEach(i -> {
150 | System.out.println(i + " " + matches.get(i).toString());
151 | });
152 | System.out.println("Took " + (System.currentTimeMillis() - t1) + "ms");
153 |
154 | assertEquals("riverbank", matches.get(12).term);
155 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
156 | fail("took unexpected exception:" + ex);
157 | }
158 | }
159 |
160 | @Test
161 | public void findNearbyTermsWithGraphListTest() {
162 | System.out.println("FNT WG 3");
163 | IntStream.rangeClosed(3, 6)
164 | .forEach(ti -> {
165 | Arrays.asList(mcf.getMissingTerms().get(ti))
166 | .forEach((String ss) -> {
167 | mcf.findNearbyTermsWithGraphCore(ss, ti);
168 | });
169 | });
170 | assertTrue(true);
171 | }
172 |
173 | @Test
174 | public void findNearbyTermsWithGraphTest1() {
175 | System.out.println("FNT WG 1");
176 | mcf.findNearbyTermsWithGraphCore("pelagic bird");
177 | assertTrue(true);
178 | }
179 |
180 | @Test
181 | public void findNearbyTermsWithGraphTest2(){
182 | System.out.println("FNT WG 2");
183 | mcf.findNearbyTermsWithGraphCore("tobacco shop");
184 | assertTrue(true);
185 | }
186 |
187 | @Test
188 | public void findNearbyTermsWithGraphTest3() {
189 | System.out.println("FNT WG 3");
190 | mcf.findNearbyTermsWithGraphCore("pelagic bird");
191 | mcf.findNearbyTermsWithGraphCore("tobacco shop");
192 | mcf.findNearbyTermsWithGraphCore("net melon");
193 | mcf.findNearbyTermsWithGraphCore("glowworm");
194 | mcf.findNearbyTermsWithGraphCore("tightrope walking");
195 | mcf.findNearbyTermsWithGraphCore("Adelie penguin");
196 | assertTrue(true);
197 | }
198 |
199 | @Test
200 | public void findNearbyTermsWithGraphTest4() {
201 | System.out.println("FNT WG 4");
202 |
203 | Set hyp = mcf.findNearbyTermsWithGraphCore("Adelie penguin");
204 | System.out.println("HYP" + hyp);
205 | assertEquals(1, hyp.size());
206 | }
207 |
208 | @Test
209 | public void findSomeMissingTerms1() {
210 | IntStream.rangeClosed(0, 3)
211 | .forEach(ti -> {
212 | Arrays.asList(mcf.getMissingTerms().get(ti))
213 | .forEach((String ss) -> {
214 | lookItUpWithOcyc(ss);
215 | });
216 | });
217 | assertTrue(true);
218 | }
219 |
220 | @Test
221 | public void findSomeMissingTerms2() {
222 | IntStream.of(1, 5, 7)
223 | //See https://docs.google.com/a/cyc.com/document/d/1Lwi21-yxcC0DGKJMcc4GFN3M_DBzEDAcSNjYufCRIfE/edit
224 | .forEach(ti -> {
225 | Arrays.asList(mcf.getMissingTerms().get(ti))
226 | .forEach((String ss) -> {
227 | lookItUpWithOcyc(ss);
228 | });
229 | });
230 | assertTrue(true);
231 | }
232 |
233 | @Test
234 | public void findSomeMissingTerms3() {
235 | IntStream.of(2, 3, 6)
236 | //See https://docs.google.com/a/cyc.com/document/d/1Lwi21-yxcC0DGKJMcc4GFN3M_DBzEDAcSNjYufCRIfE/edit
237 | .forEach(ti -> {
238 | Arrays.asList(mcf.getMissingTerms().get(ti))
239 | .forEach((String ss) -> {
240 | lookItUpAllW2V(ss);
241 | });
242 | });
243 | assertTrue(true);
244 | }
245 |
246 | @Test
247 | public void howManyMissingTermsInW2V() throws IOException {
248 | final Set found = new HashSet<>();
249 | final Set foundSpace = new HashSet<>();
250 | final Set unfound = new HashSet<>();
251 |
252 | mcf.getMissingTerms().keySet().forEach(i -> {
253 | Arrays.asList(mcf.getMissingTerms().get(i))
254 | .forEach((String ss) -> {
255 | if (mySpace.knownTerm(ss)) {
256 | found.add(i);
257 | if (ss.contains(" ")) {
258 | foundSpace.add(i);
259 | }
260 | } else {
261 | unfound.add(i);
262 | }
263 | });
264 | });
265 | System.out.println("Found directly in W2V : " + found.size()+" "+set2String(found));
266 | System.out.println("Found directly in W2V with space: " + foundSpace.size()+" "+set2String(foundSpace));
267 | System.out.println("Not found in W2V : " + unfound.size()+" "+set2String(unfound));
268 | assertEquals(2, foundSpace.size());
269 | assertEquals(8, unfound.size());
270 | }
271 |
272 | @Test
273 | public void listSomeTest() {
274 | IntStream.rangeClosed(0, 8)
275 | .forEach(i -> {
276 | System.out.println(i + ":\t" + String.join(", ",
277 | Arrays.asList(mcf.getMissingTerms().get(i))));
278 | });
279 | assertTrue(true);
280 | }
281 |
282 | // @Test
283 | // public void namesInW2VTest() {
284 | // List res;
285 | // res = mcf.namesInW2V();
286 | // assertEquals(12343, res.size());
287 | // }
288 | @Test
289 | public void missingConceptCountTest() {
290 | assertEquals(9, mcf.missingConceptCount());
291 | }
292 |
293 | private void lookItUpAllW2V(String ss) {
294 | try {
295 | System.out.println("=======[" + ss + "]=======");
296 | long t1 = System.currentTimeMillis();
297 | List matches
298 | = cSpace.findNearestNFor(Arrays.asList(ss.split("\\s+")), 40);
299 |
300 | System.out.println("Matches:" + (matches == null ? "null" : matches.size()));
301 | IntStream.range(0, matches.size())
302 | .forEach(i -> {
303 | String matchTerm = matches.get(i).term;
304 | String mat = matches.get(i).toString();
305 | if (ocyc.knownTerm(matchTerm)) {
306 | // System.out.println("Known:" +matchTerm);
307 | // System.out.println("Match is: "+ocyc.conceptsFor(matchTerm));
308 | mat = mat.replace("---",
309 | String.join(" | ", ocyc.conceptsFor(matchTerm)));
310 | }
311 | System.out.println(i + " " + mat);
312 | });
313 | System.out.println("Took " + (System.currentTimeMillis() - t1) + "ms");
314 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
315 | System.out.println("--- position not known in word to vec space:[" + ss + "]");
316 | // Logger.getLogger(MissingVideoConceptFinderTest.class.getName()).log(Level.INFO, null, ex);
317 | }
318 | }
319 |
320 | private void lookItUpWithOcyc(String ss) {
321 | try {
322 | System.out.println("=======[" + ss + "]=======");
323 | long t1 = System.currentTimeMillis();
324 | List matches
325 | = cSpace.findNearestNForIn(Arrays.asList(ss.split("\\s+")), 40, ocyc);
326 |
327 | System.out.println("Matches:" + (matches == null ? "null" : matches.size()));
328 | IntStream.range(0, matches.size())
329 | .forEach(i -> {
330 | System.out.println(i + " " + matches.get(i).toString());
331 | });
332 | System.out.println("Took " + (System.currentTimeMillis() - t1) + "ms");
333 | } catch (Word2VecSpace.NoWordToVecVectorForTerm ex) {
334 | System.out.println("--- position not known in word to vec space:[" + ss + "]");
335 | // Logger.getLogger(MissingVideoConceptFinderTest.class.getName()).log(Level.INFO, null, ex);
336 | }
337 | }
338 | }
339 |
--------------------------------------------------------------------------------
/OwlTools/src/main/java/com/cyc/tool/owltools/OpenCycOwl.java:
--------------------------------------------------------------------------------
1 | package com.cyc.tool.owltools;
2 |
3 | /*
4 | * #%L
5 | * OwlTools
6 | * %%
7 | * Copyright (C) 2015 Cycorp, Inc
8 | * %%
9 | * Licensed under the Apache License, Version 2.0 (the "License");
10 | * you may not use this file except in compliance with the License.
11 | * You may obtain a copy of the License at
12 | *
13 | * http://www.apache.org/licenses/LICENSE-2.0
14 | *
15 | * Unless required by applicable law or agreed to in writing, software
16 | * distributed under the License is distributed on an "AS IS" BASIS,
17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18 | * See the License for the specific language governing permissions and
19 | * limitations under the License.
20 | * #L%
21 | */
22 |
23 | //import com.cyc.tool.distributedrepresentations.GoogleNewsW2VSpace;
24 | //import com.cyc.tool.distributedrepresentations.Word2VecSpace;
25 | import com.google.common.collect.Iterables;
26 | import java.io.File;
27 | import java.io.IOException;
28 | import java.util.Arrays;
29 | import java.util.Collection;
30 | import java.util.HashMap;
31 | import java.util.HashSet;
32 | import java.util.Locale;
33 | import java.util.Map;
34 | import java.util.Set;
35 | import java.util.concurrent.ConcurrentNavigableMap;
36 | import java.util.function.Predicate;
37 | import java.util.logging.Level;
38 | import java.util.logging.Logger;
39 | import java.util.stream.Collectors;
40 | import java.util.stream.Stream;
41 | import org.mapdb.DB;
42 | import org.mapdb.DBMaker;
43 | import org.semanticweb.owlapi.apibinding.OWLManager;
44 | import org.semanticweb.owlapi.io.FileDocumentSource;
45 | import org.semanticweb.owlapi.model.IRI;
46 | import org.semanticweb.owlapi.model.OWLAnnotation;
47 | import org.semanticweb.owlapi.model.OWLAnnotationProperty;
48 | import org.semanticweb.owlapi.model.OWLClass;
49 | import org.semanticweb.owlapi.model.OWLDataFactory;
50 | import org.semanticweb.owlapi.model.OWLLogicalEntity;
51 | import org.semanticweb.owlapi.model.OWLOntology;
52 | import org.semanticweb.owlapi.model.OWLOntologyCreationException;
53 | import org.semanticweb.owlapi.model.OWLOntologyManager;
54 | import org.semanticweb.owlapi.reasoner.Node;
55 | import org.semanticweb.owlapi.reasoner.NodeSet;
56 | import org.semanticweb.owlapi.reasoner.OWLReasoner;
57 | import org.semanticweb.owlapi.reasoner.OWLReasonerFactory;
58 | import org.semanticweb.owlapi.reasoner.structural.StructuralReasonerFactory;
59 | import org.semanticweb.owlapi.search.EntitySearcher;
60 | import org.semanticweb.owlapi.vocab.OWLRDFVocabulary;
61 |
62 | /**
63 | *
64 | * OpenCycOwl has methods for accessing information in an OpenCyc OWL file.
65 | * There is some known overlap with this class, {@link OpenCycReasoner},
66 | * and {@link OpenCycContent}.
67 | *
68 | */
69 | public class OpenCycOwl {
70 |
71 |
72 | static final String ocycLocation = OwlToolsConfig.ocycLocation;
73 |
74 | /**
75 | * HLID for testing puproses.
76 | */
77 | public String pizzaGUID = "Mx4rvVibapwpEbGdrcN5Y29ycA";
78 | private final boolean clearLabels = false;
79 | private final OWLDataFactory dataFactory;
80 | private final OWLOntologyManager manager;
81 | private OWLOntology openCyc;
82 | private final OWLAnnotationProperty prettyString;
83 | private final OWLAnnotationProperty rdfsLabel;
84 | private OWLReasoner reasoner;
85 | private final OWLReasonerFactory reasonerFactory;
86 |
87 | private long t; // time keeper
88 | Set allConcepts;
89 | final Map> conceptLabels;
90 | Set conceptsWithTerms;
91 | DB db;
92 | ConcurrentNavigableMap> ocycConceptForTermLabel;
93 | ConcurrentNavigableMap> ocycConceptForTermLower;
94 | ConcurrentNavigableMap> ocycConceptForTermPrettyString;
95 | ConcurrentNavigableMap> typeGraph;
96 |
97 | /**
98 | * Creates a new instance of OwlTest.
99 | * @throws java.io.IOException
100 | * @throws org.semanticweb.owlapi.model.OWLOntologyCreationException
101 | */
102 | public OpenCycOwl() throws IOException, OWLOntologyCreationException {
103 |
104 | // A simple example of how to load and save an ontology We first need to
105 | // obtain a copy of an OWLOntologyManager, which, as the name suggests,
106 | // manages a set of ontologies. An ontology is unique within an ontology
107 | // manager. Each ontology knows its ontology manager. To load multiple
108 | // copies of an ontology, multiple managers would have to be used.
109 | manager = OWLManager.createOWLOntologyManager();
110 | // We load an ontology from a document IRI - in this case we'll load the
111 | // pizza ontology.
112 | // IRI documentIRI = IRI.create(PIZZA_IRI);
113 | // Now ask the manager to load the ontology
114 | // OWLOntology ontology = manager
115 | // .loadOntologyFromOntologyDocument(documentIRI);
116 | // but in this test we don't rely on a remote ontology and load it from
117 | // a string
118 | //play with mapr
119 | // System.out.println(Arrays.asList(1,2,3,4,5,6,7,8).stream().map(x->x*x).reduce((x,y)->x+y).get());
120 |
121 | db = DBMaker.newFileDB(new File(OwlToolsConfig.getOcycTermDBFile()))
122 | .closeOnJvmShutdown()
123 | // .encryptionEnable("password")
124 | .make();
125 |
126 | reasonerFactory = new StructuralReasonerFactory();
127 | dataFactory = manager.getOWLDataFactory();
128 | prettyString = dataFactory.getOWLAnnotationProperty(
129 | guidToIRI("Mx4rwLSVCpwpEbGdrcN5Y29ycA"));
130 | rdfsLabel = dataFactory.getOWLAnnotationProperty(OWLRDFVocabulary.RDFS_LABEL.getIRI());
131 | this.getPrettyStringToConceptMap();
132 | this.getRDFSLabelConceptMap();
133 | this.getLowerCaseConceptMap();
134 | this.createTypeGraph();
135 | conceptLabels = new HashMap<>();
136 | this.fillConceptLabels();
137 | }
138 |
139 | /**
140 | *
141 | * @param args
142 | * @throws Exception
143 | */
144 | public static void main(String[] args) throws Exception {
145 |
146 | OpenCycOwl my = new OpenCycOwl();
147 | System.out.println("N Classes:" + my.getOpenCyc().getClassesInSignature().size());
148 | my.pizzaTest();
149 | // Remove the ontology from the manager
150 | my.manager.removeOntology(my.getOpenCyc());
151 | }
152 |
153 | /**
154 | *
155 | * @return the allConcepts Set
156 | * @throws IOException
157 | */
158 | public Set allConcepts() throws IOException {
159 |
160 | allConcepts = db.getHashSet(OwlToolsConfig.getAllConceptsName());
161 | if (allConcepts.isEmpty()) {
162 | Set res
163 | = getOpenCyc().
164 | getClassesInSignature()
165 | .stream()
166 | .map(clss -> {
167 | String csid = clss.toStringID();
168 | String s = guidFromURLString(csid);
169 | System.out.println("AC:" + csid + " " + s);
170 | return s;
171 | })
172 | .collect(Collectors.toSet());
173 | allConcepts.addAll(res);
174 | db.commit();
175 | }
176 | return allConcepts;
177 | }
178 |
179 | /**
180 | * Close the ontology access
181 | */
182 | public void close() {
183 | if (openCyc != null) {
184 | manager.removeOntology(openCyc);
185 | }
186 | }
187 |
188 | /**
189 | *
190 | * @param term
191 | * @return all concepts for a given term String
192 | */
193 | public Set conceptsFor(String term) {
194 | Set ret = new HashSet<>();
195 | if (ocycConceptForTermPrettyString.containsKey(term)) {
196 | ret.addAll(ocycConceptForTermPrettyString.get(term));
197 | }
198 | if (ocycConceptForTermLabel.containsKey(term)) {
199 | ret.addAll(ocycConceptForTermLabel.get(term));
200 | }
201 | String l = term.toLowerCase(Locale.ENGLISH);
202 | if (ocycConceptForTermLower.containsKey(l)) {
203 | ret.addAll(ocycConceptForTermLower.get(l));
204 | }
205 | if (term.contains("_")) {
206 | ret.addAll(conceptsFor(term.replace("_", " ")));
207 | }
208 | return ret;
209 | }
210 |
211 |
212 |
213 | /**
214 | *
215 | * @return Set of concepts with terms in the W2V space
216 | * @throws IOException
217 | */
218 | // public Set conceptsWithW2VTerms() throws IOException {
219 | // /* @Todo: Consider making this more independent of the particular W2V space */
220 | // Word2VecSpace w2v = GoogleNewsW2VSpace.get();
221 | // conceptsWithTerms = db.getHashSet(OwlToolsConfig.getConceptsWithTermsName());
222 | // if (conceptsWithTerms.isEmpty()) {
223 | // Set res
224 | // = Stream.concat(
225 | // Stream.concat(
226 | // ocycConceptForTermPrettyString.entrySet().stream(),
227 | // ocycConceptForTermLabel.entrySet().stream()),
228 | // ocycConceptForTermLower.entrySet().stream())
229 | // .filter(s -> w2v.knownTerm(s.getKey()))
230 | // .map(s -> s.getValue())
231 | // .flatMap(conceptSet -> conceptSet.stream())
232 | // .collect(Collectors.toSet());
233 | // conceptsWithTerms.addAll(res);
234 | // db.commit();
235 | // }
236 | // return conceptsWithTerms;
237 | // }
238 |
239 | /**
240 | *
241 | * @param forT
242 | * @return Set of types for a term
243 | */
244 | public Set getTypes(String forT) {
245 | Set ret = new HashSet<>();
246 | if (typeGraph.containsKey(forT)) {
247 | return typeGraph.get(forT);
248 | }
249 | if (forT.equals("Thing")) {
250 | return ret;
251 | }
252 | // System.out.println("No types for :" + guidToURLString(forT));
253 | return ret;
254 | }
255 |
256 | /**
257 | *
258 | * @param conceptGUID
259 | * @return Set of types for a concept
260 | * @throws OWLOntologyCreationException
261 | */
262 | public Set getTypesForConceptFromOWL(String conceptGUID) throws OWLOntologyCreationException {
263 |
264 | Set types = new HashSet<>();
265 | OWLClass concept
266 | = dataFactory.getOWLClass(guidToIRI(conceptGUID));
267 | NodeSet subClasses = getReasoner()
268 | .getSuperClasses(concept, true);
269 | subClasses.forEach(node -> {
270 | Set ents = node.getEntities();
271 | ents.forEach(ent -> {
272 | types.add(ent.getIRI().getShortForm());
273 | });
274 | });
275 | return types;
276 | }
277 |
278 | /**
279 | *
280 | * @param forT
281 | * @return Set of types for a term
282 | */
283 | public Set getTypesTransitive(String forT) {
284 | Set ret = new HashSet<>();
285 | if (typeGraph.containsKey(forT)) {
286 |
287 | typeGraph
288 | .get(forT)
289 | .forEach(t -> {
290 | getTypesTransitive(t, ret);
291 | });
292 | return ret;
293 | }
294 | // System.out.println("PROBLEM: " + forT);
295 | return ret;
296 | }
297 |
298 | /**
299 | *
300 | * @param forT
301 | * @return Set of types for a term
302 | */
303 | public Set getTypesTransitiveURL(String forT) {
304 | return getTypesTransitive(guidFromURLString(forT))
305 | .stream()
306 | .map(t -> guidToURLString(t))
307 | .collect(Collectors.toSet());
308 | }
309 |
310 | /**
311 | *
312 | * @param forT
313 | * @return Set of types of a term
314 | */
315 | public Set getTypesURL(String forT) {
316 | return getTypes(guidFromURLString(forT))
317 | .stream()
318 | .map(t -> guidToURLString(t))
319 | .collect(Collectors.toSet());
320 | }
321 |
322 | /**
323 | *
324 | * @param url
325 | * @return GUID from a URL
326 | */
327 | public String guidFromURLString(String url) {
328 | return url.replaceFirst("http://sw.opencyc.org/concept/", "");
329 | }
330 |
331 | /**
332 | *
333 | * @param conceptGuid
334 | * @return URL from a GUID
335 | */
336 | public String guidToURLString(String conceptGuid) {
337 | return "http://sw.opencyc.org/concept/" + conceptGuid;
338 | }
339 |
340 | /**
341 | *
342 | * @param term
343 | * @return true if term is in the ontology
344 | */
345 | public boolean knownTerm(String term) {
346 | if (ocycConceptForTermPrettyString.containsKey(term)) {
347 | return true;
348 | }
349 | if (ocycConceptForTermLabel.containsKey(term)) {
350 | return true;
351 | }
352 | if (ocycConceptForTermLower.containsKey(term.toLowerCase(Locale.ENGLISH))) {
353 | return true;
354 | }
355 | if (term.contains("_")) {
356 | return knownTerm(term.replace("_", " "));
357 | }
358 | return false;
359 | }
360 |
361 | /**
362 | *
363 | * @param concept
364 | * @return a String with labels for the concept
365 | */
366 | public String labelsForConcept(String concept) {
367 | if (conceptLabels.containsKey(concept)) {
368 | return String.join("|", conceptLabels.get(concept));
369 | }
370 | return concept;
371 | }
372 |
373 | /**
374 | *
375 | * @return a Predicate to test if a concept is present
376 | */
377 | public Predicate noConcept() {
378 | return a -> !Arrays.stream(a)
379 | .anyMatch(hasConcept());
380 | }
381 |
382 | /**
383 | *
384 | * @return Number of classes in the ontology
385 | */
386 | public int size() {
387 | return getOpenCyc().getClassesInSignature().size();
388 | }
389 |
390 | /**
391 | *
392 | * @return an OWLOntology for OpenCyc
393 | */
394 | protected OWLOntology getOpenCyc() {
395 | if (openCyc == null) {
396 | try {
397 | t = System.currentTimeMillis();
398 | openCyc = manager
399 | .loadOntologyFromOntologyDocument(
400 | new FileDocumentSource(
401 | new File(ocycLocation)));
402 | System.out.println("Open Cyc Load time:"
403 | + (System.currentTimeMillis() - t) + "ms");
404 | } catch (OWLOntologyCreationException ex) {
405 | Logger.getLogger(OpenCycOwl.class.getName()).log(Level.SEVERE, null, ex);
406 | }
407 | }
408 |
409 | return openCyc;
410 | }
411 |
412 | /**
413 | *
414 | * @return an OWLReasoner
415 | */
416 | protected OWLReasoner getReasoner() {
417 | if (reasoner == null) {
418 | reasoner = reasonerFactory.createReasoner(getOpenCyc());
419 | }
420 | return reasoner;
421 | }
422 |
423 | private void createTypeGraph() throws IOException {
424 | typeGraph = db.getTreeMap(OwlToolsConfig.getTypeGraphName());
425 | if (typeGraph.isEmpty()) {
426 | allConcepts().
427 | stream().
428 | map(c -> guidFromURLString(c))
429 | .forEach(s -> {
430 | try {
431 | Set types = getTypesForConceptFromOWL(s);
432 | System.out.println("Types for " + s + ": " + types.size());
433 | typeGraph.put(s, types);
434 | } catch (OWLOntologyCreationException ex) {
435 | Logger.getLogger(OpenCycOwl.class.getName()).log(Level.SEVERE, null, ex);
436 | }
437 | });
438 | db.commit();
439 | db.compact();
440 | }
441 |
442 | }
443 |
444 | private void fillConceptLabels() {
445 |
446 | t = System.currentTimeMillis();
447 | Iterables.concat(ocycConceptForTermLabel.entrySet(),
448 | ocycConceptForTermLabel.entrySet(),
449 | ocycConceptForTermPrettyString.entrySet()).forEach(entry -> {
450 | Set concepts = entry.getValue();
451 | concepts.forEach(concept -> {
452 | if (!conceptLabels.containsKey(concept)) {
453 | conceptLabels.put(concept, new HashSet<>());
454 | }
455 | conceptLabels.get(concept).add(entry.getKey());
456 | });
457 | });
458 | System.out.println("Concept to term map creation:"
459 | + (System.currentTimeMillis() - t) + "ms");
460 | }
461 |
462 | private void getLowerCaseConceptMap() {
463 |
464 | ocycConceptForTermLower = db.getTreeMap(OwlToolsConfig.getOcycTermMapName() + "_Lower");
465 | if (clearLabels) {
466 | ocycConceptForTermLower.clear();
467 | }
468 | if (ocycConceptForTermLower.isEmpty()) {
469 | ocycConceptForTermPrettyString.keySet().forEach(s -> {
470 | storeDownCaseLabel(s, ocycConceptForTermPrettyString);
471 | });
472 |
473 | ocycConceptForTermLabel.keySet().forEach(s -> {
474 | storeDownCaseLabel(s, ocycConceptForTermLabel);
475 | });
476 | db.commit();
477 | db.compact();
478 | }
479 |
480 | }
481 |
482 | private void getPrettyStringToConceptMap() {
483 | // Print out all of the classes which are contained in the signature of
484 | // the ontology. These are the classes that are referenced by axioms in
485 | // the ontology.
486 |
487 | ocycConceptForTermPrettyString = db.getTreeMap(OwlToolsConfig.getOcycTermMapName());
488 | if (clearLabels) {
489 | ocycConceptForTermPrettyString.clear();
490 | }
491 | if (ocycConceptForTermPrettyString.isEmpty()) {
492 | Iterables.concat(
493 | getOpenCyc().getClassesInSignature(),
494 | getOpenCyc().getIndividualsInSignature()).forEach(owlObj -> {
495 | System.out.println("Loading PrettyStrings for "
496 | + (owlObj instanceof OWLClass ? "Class" : "Individual") + ": " + owlObj);
497 | Collection annotations
498 | = EntitySearcher.getAnnotations(owlObj, getOpenCyc(), prettyString);
499 | annotations.forEach(ann -> {
500 | storeConceptLabel(ann, owlObj, ocycConceptForTermPrettyString);
501 | });
502 | });
503 | db.commit();
504 | db.compact();
505 |
506 | }
507 | }
508 |
509 | private void getRDFSLabelConceptMap() {
510 | // Print out all of the classes which are contained in the signature of
511 | // the ontology. These are the classes that are referenced by axioms in
512 | // the ontology.
513 |
514 | ocycConceptForTermLabel = db.getTreeMap(OwlToolsConfig.getOcycTermMapName() + "_Label");
515 | if (clearLabels) {
516 | ocycConceptForTermLabel.clear();
517 | }
518 | if (ocycConceptForTermLabel.isEmpty()) {
519 | // Get the terms for collections and individuals
520 | Iterables.concat(
521 | getOpenCyc().getClassesInSignature(),
522 | getOpenCyc().getIndividualsInSignature()).forEach(owlObj -> {
523 | System.out.println("Loading RDFS Labels for "
524 | + (owlObj instanceof OWLClass ? "Class" : "Individual") + ": " + owlObj);
525 | Collection annotations
526 | = EntitySearcher.getAnnotations(owlObj, getOpenCyc(), rdfsLabel);
527 | annotations.forEach(ann -> {
528 | storeConceptLabel(ann, owlObj, ocycConceptForTermLabel);
529 | });
530 | });
531 |
532 | db.commit();
533 | db.compact();
534 | }
535 | }
536 |
537 | private void getTypesTransitive(String forT, Set soFar) {
538 | if (!soFar.contains(forT)) {
539 | soFar.add(forT);
540 | if (forT.equals("Thing")) {
541 | return;
542 | }
543 | getTypes(forT)
544 | .forEach(st -> {
545 | getTypesTransitive(st, soFar);
546 | });
547 |
548 | }
549 | }
550 |
551 | private IRI guidToIRI(String conceptGuid) {
552 | return IRI.create(guidToURLString(conceptGuid));
553 | }
554 |
555 | private Predicate hasConcept() {
556 | return a -> knownTerm(a);
557 | }
558 |
559 | private void pizzaTest() {
560 | // Now save a copy to another location in OWL/XML format (i.e. disregard
561 | // the format that the ontology was loaded in).
562 | //File f = folder.newFile("owlapiexample_example1.xml");
563 | //IRI documentIRI2 = IRI.create(f);
564 | //manager.saveOntology(ontology, new OWLXMLDocumentFormat(), documentIRI2);
565 |
566 | OWLClass pizza
567 | = dataFactory.getOWLClass(guidToIRI(pizzaGUID));
568 |
569 | NodeSet subClses = getReasoner().getSubClasses(pizza, true);
570 | // Setop=pizza.getObjectPropertiesInSignature();
571 | t = System.currentTimeMillis();
572 | Collection anns
573 | = EntitySearcher.getAnnotations(pizza, getOpenCyc(), prettyString);
574 |
575 | System.out.println("Search time:" + (System.currentTimeMillis() - t) + "ms");
576 | anns.forEach(ann
577 | -> System.out.println(ann.getValue().asLiteral().get().getLiteral()
578 | ));
579 |
580 | subClses.forEach((Node node) -> {
581 | Set em = node.getEntities();
582 | em.forEach(clss -> {
583 | System.out.println("SubType:" + clss);
584 | Collection annotations = EntitySearcher.getAnnotations(clss, getOpenCyc(), prettyString);
585 | annotations.forEach(ann -> {
586 | String lit = ann.getValue().asLiteral().get().getLiteral();
587 | System.out.println("\t:" + lit);
588 | });
589 | });
590 | });
591 | }
592 |
593 | private void storeConceptLabel(OWLAnnotation ann, OWLLogicalEntity owlObj, ConcurrentNavigableMap> labelMap) {
594 | String lit = ann.getValue().asLiteral().get().getLiteral();
595 | final Set newLabels = new HashSet<>();
596 | if (labelMap.containsKey(lit)) {
597 | newLabels.addAll(labelMap.get(lit));
598 | }
599 | newLabels.add(owlObj.toStringID());
600 | labelMap.put(lit, newLabels);
601 | if (lit.startsWith("the ")) { //hack to artificially extend reach
602 | final Set newLabelsThe = new HashSet<>();
603 | String key = lit.replace("the ", "");
604 | if (labelMap.containsKey(key)) {
605 | newLabelsThe.addAll(labelMap.get(key));
606 | }
607 | newLabelsThe.add(owlObj.toStringID());
608 | labelMap.put(key, newLabelsThe);
609 | }
610 | // System.out.println((sp.knownTerm(lit) ? "+" : "-") + lit);
611 | }
612 |
613 | private void storeDownCaseLabel(String s, ConcurrentNavigableMap> labelMap) {
614 | final Set newLabels = new HashSet<>();
615 | String l = s.toLowerCase(Locale.ENGLISH);
616 |
617 | if (ocycConceptForTermLower.containsKey(l)) {
618 | newLabels.addAll(ocycConceptForTermLower.get(l));
619 | }
620 | newLabels.addAll(labelMap.get(s));
621 |
622 | ocycConceptForTermLower.put(l, newLabels);
623 | }
624 |
625 | }
626 |
--------------------------------------------------------------------------------